import concurrent
|
|
import datetime
|
|
import traceback
|
|
from concurrent.futures import ProcessPoolExecutor
|
|
from pathlib import Path
|
|
from queue import Empty
|
|
|
|
from server import opts
|
|
from server.api.jobs.queue import job_queue, job_status, queued_jobs
|
|
from server.mysql import db_logger
|
|
from server.process.funcs import remove_duplicates_from_playlist
|
|
from server.process.mysql import insert_video
|
|
from server.process.threads import download_video
|
|
from server.process.ytlogging import YtdlLogger
|
|
from ydl.yt_dlp import YDL
|
|
|
|
# TODO: https://github.com/TheFrenchGhosty/TheFrenchGhostys-Ultimate-YouTube-DL-Scripts-Collection/blob/master/docs/Scripts-Type.md#archivist-scripts

# https://github.com/yt-dlp/yt-dlp#embedding-examples
# Base yt-dlp option set shared by every download job.  do_download() layers a
# per-job logger on top with dict(ydl_opts, **{...}) and copies it per playlist,
# so this module-level dict itself must never be mutated in place.
ydl_opts = {
    # TODO: https://github.com/TheFrenchGhosty/TheFrenchGhostys-Ultimate-YouTube-DL-Scripts-Collection/blob/master/docs/Details.md
    # https://old.reddit.com/r/DataHoarder/comments/c6fh4x/after_hoarding_over_50k_youtube_videos_here_is/
    # Format ladder: prefer AV1, then VP9.2, then VP9; try >=1080p with >30fps
    # first, then >=1080p, then the same codec ladder at >=720p, every rung
    # capped at opts.max_size MB; pair with Opus audio when available, and fall
    # back to plain 'best' as a last resort.
    'format': f'(bestvideo[filesize<{opts.max_size}M][vcodec^=av01][height>=1080][fps>30]/bestvideo[filesize<{opts.max_size}M][vcodec=vp9.2][height>=1080][fps>30]/bestvideo[filesize<{opts.max_size}M][vcodec=vp9][height>=1080][fps>30]/bestvideo[filesize<{opts.max_size}M][vcodec^=av01][height>=1080]/bestvideo[filesize<{opts.max_size}M][vcodec=vp9.2][height>=1080]/bestvideo[filesize<{opts.max_size}M][vcodec=vp9][height>=1080]/bestvideo[filesize<{opts.max_size}M][height>=1080]/bestvideo[filesize<{opts.max_size}M][vcodec^=av01][height>=720][fps>30]/bestvideo[filesize<{opts.max_size}M][vcodec=vp9.2][height>=720][fps>30]/bestvideo[filesize<{opts.max_size}M][vcodec=vp9][height>=720][fps>30]/bestvideo[filesize<{opts.max_size}M][vcodec^=av01][height>=720]/bestvideo[filesize<{opts.max_size}M][vcodec=vp9.2][height>=720]/bestvideo[filesize<{opts.max_size}M][vcodec=vp9][height>=720]/bestvideo[filesize<{opts.max_size}M][height>=720]/bestvideo[filesize<{opts.max_size}M])+(bestaudio[acodec=opus]/bestaudio)/best',
    # Merge video+audio streams into an MKV container.
    'merge_output_format': 'mkv',
    'logtostderr': True,
    'embedchapters': True,
    'writethumbnail': True,
    # Save the thumbnail to a file. Embedding seems to be broken right now so this is an alternative.
    'embedthumbnail': True,
    'embeddescription': True,
    'writesubtitles': True,
    # 'allsubtitles': True, # Download every language.
    'subtitlesformat': 'vtt',
    'subtitleslangs': ['en'],
    # Also grab YouTube's auto-generated captions when no manual subs exist.
    'writeautomaticsub': True,
    'writedescription': True,
    # Keep going past per-video failures; errors surface via the job logger.
    'ignoreerrors': True,
    # Restart partial downloads from scratch rather than resuming .part files.
    'continuedl': False,
    'addmetadata': True,
    # Sidecar .info.json per video, for archival completeness.
    'writeinfojson': True,
    'verbose': opts.ydlp_verbose,
    'postprocessors': [
        {'key': 'FFmpegEmbedSubtitle'},
        {'key': 'FFmpegMetadata', 'add_metadata': True},
        # 'already_have_thumbnail' keeps the on-disk thumbnail after embedding.
        {'key': 'EmbedThumbnail', 'already_have_thumbnail': True},
        # Convert the thumbnail before download so EmbedThumbnail gets a jpg.
        {'key': 'FFmpegThumbnailsConvertor', 'format': 'jpg', 'when': 'before_dl'},
        # {'key': 'FFmpegSubtitlesConvertor', 'format': 'srt'}
    ],
    # 'external_downloader': 'aria2c',
    # 'external_downloader_args': ['-j 32', '-s 32', '-x 16', '--file-allocation=none', '--optimize-concurrent-downloads=true', '--http-accept-gzip=true', '--continue=true'],
}
def do_download():
    """Worker loop: pull download jobs off ``job_queue`` and process them.

    For each job this resolves the playlist behind the job's URL, fans the
    individual videos out to a ``ProcessPoolExecutor`` with ``opts.threads``
    workers, logs per-video results, records each successfully downloaded
    video via ``insert_video()``, and finally marks the job finished.  The
    loop exits when the queue has been empty for 5 seconds, or on an
    unexpected error.
    """
    while True:
        try:
            # Block up to 5 seconds for the next job; queue.Empty ends the loop.
            job, l_id, url, base_output, ignore_downloaded = job_queue.get(timeout=5)

            # The job is no longer merely queued — it is now running.
            queued_jobs.remove(job.id())
            job_status[job.id()] = 'running'

            encountered_errors = 0
            logger = db_logger('DOWNLOADER', 'jobs', job_id=job.id())
            logger.info('Starting job')

            # Attach a per-job yt-dlp logger without mutating the shared module-level opts.
            ydl = YDL(ydl_opts=dict(ydl_opts, **{'logger': YtdlLogger('DOWNLOADER', 'jobs', job.id())}))
            playlist = ydl.playlist_contents(str(url))

            if not playlist:
                # Bug fix: this used to call quit(1), which killed the entire
                # worker process.  Fail just this job and move on to the next.
                logger.fatal('URL is not a playlist!')
                job.success(False)
                job.finish()
                job_status[job.id()] = 'finished'
                continue

            playlist['entries'] = remove_duplicates_from_playlist(playlist['entries'])

            logger.info(f'Downloading item: "{playlist["title"]}" ({playlist["id"]}) {url}')

            # The old enumerate/append loop was just a list copy with an unused index.
            download_queue = list(playlist['entries'])

            # Copy so per-playlist tweaks can never leak into the module-level dict.
            playlist_ydl_opts = ydl_opts.copy()

            if download_queue:
                with ProcessPoolExecutor(max_workers=opts.threads) as executor:
                    futures = {
                        executor.submit(
                            download_video,
                            video,
                            ydl_opts=playlist_ydl_opts,
                            output_dir=Path(base_output),
                            ignore_downloaded=ignore_downloaded,
                            job=job,
                        )
                        for video in download_queue
                    }
                    for future in concurrent.futures.as_completed(futures):
                        try:
                            result = future.result()
                            error = False

                            if result['downloaded_video_id']:
                                logger.info(result['downloaded_video_id'])

                            # Either critical-error channel marks this video (and the job) failed.
                            for line in result['video_critical_err_msg_short']:
                                encountered_errors += 1
                                error = True
                                logger.error(f"{result['video_id']} - {line}")

                            for line in result['video_critical_err_msg']:
                                encountered_errors += 1
                                error = True
                                logger.error(f"{result['video_id']} - {line}")

                            for line in result['logger_msg']:
                                logger.info(f"{result['video_id']} - {line}")

                            # Only record videos that downloaded without a critical error.
                            if not error:
                                insert_video(l_id, result['video_id'], result['url'])
                        except Exception as exc:
                            # Bug fix: a crashed worker must count as an error,
                            # otherwise the job could still be marked successful.
                            encountered_errors += 1
                            logger.error(f'Video download generated an exception: {exc}')

            job.success(encountered_errors == 0)
            job.finish()  # bug fix: finish() was previously called twice

            # Update the job status
            job_status[job.id()] = 'finished'

            print('======================================================= finished =============')
        except Empty:
            # Queue drained — this worker is done.
            break
        except Exception as e:
            logger = db_logger(name='DOWNLOADER', table='logs', console=True)
            logger.fatal(f'failed with {e.__class__.__name__}: {e}. {traceback.format_exc()}')
            break