#!/usr/bin/env python3 import argparse import logging.config import math import os import re import shutil import subprocess import sys import tempfile import time from multiprocessing import Manager, Pool, cpu_count from pathlib import Path from threading import Thread import yaml from appdirs import user_data_dir from tqdm.auto import tqdm from process.funcs import get_silent_logger, remove_duplicates_from_playlist, restart_program, setup_file_logger from process.threads import bar_eraser, download_video from ydl.files import create_directories, resolve_path from ydl.yt_dlp import YDL, update_ytdlp def signal_handler(sig, frame): # TODO: https://www.g-loaded.eu/2016/11/24/how-to-terminate-running-python-threads-using-signals/ # raise ServiceExit sys.exit(0) # signal.signal(signal.SIGTERM, signal_handler) # signal.signal(signal.SIGINT, signal_handler) url_regex = re.compile(r'^(?:http|ftp)s?://' # http:// or https:// r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # domain... r'localhost|' # localhost... r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip r'(?::\d+)?' # optional port r'(?:/?|[/?]\S+)$', re.IGNORECASE) ansi_escape_regex = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])') parser = argparse.ArgumentParser() parser.add_argument('file', help='URL to download or path of a file containing the URLs of the videos to download.') parser.add_argument('--output', required=False, help='Output directory. Ignored paths specified in a YAML file.') parser.add_argument('--no-update', '-n', action='store_true', help='Don\'t update yt-dlp at launch.') parser.add_argument('--max-size', type=int, default=1100, help='Max allowed size of a video in MB.') parser.add_argument('--rm-cache', '-r', action='store_true', help='Delete the yt-dlp cache on start.') parser.add_argument('--threads', type=int, default=(cpu_count() - 1), help=f'How many download processes to use. Default: number of CPU cores (for your machine: {cpu_count()}) - 1 = {cpu_count() - 1}') parser.add_argument('--daemon', '-d', action='store_true', help="Run in daemon mode. Disables progress bars sleeps for the amount of time specified in --sleep.") parser.add_argument('--sleep', type=float, default=60, help='How many minutes to sleep when in daemon mode.') parser.add_argument('--download-cache-file-directory', default=user_data_dir('automated-youtube-dl', 'cyberes'), help='The path to the directory to track downloaded videos. Defaults to your appdata path.') parser.add_argument('--silence-errors', '-s', action='store_true', help="Don't print any error messages to the console.") parser.add_argument('--ignore-downloaded', '-i', action='store_true', help='Ignore videos that have been already downloaded and disable checks. Let youtube-dl handle everything.') parser.add_argument('--erase-downloaded-tracker', '-e', action='store_true', help='Erase the tracked video file.') parser.add_argument('--ratelimit-sleep', type=int, default=5, help='How many seconds to sleep between items to prevent rate-limiting. Does not affect time between videos as you should be fine since it takes a few seconds to merge everything and clean up.') parser.add_argument('--input-datatype', choices=['auto', 'txt', 'yaml'], default='auto', help='The datatype of the input file. If set to auto, the file will be scanned for a URL on the first line.' 'If is a URL, the filetype will be set to txt. If it is a key: value pair then the filetype will be set to yaml.') parser.add_argument('--log-dir', default=None, help='Where to store the logs. Must be set when --output is not.') parser.add_argument('--verbose', '-v', action='store_true') parser.add_argument('--verify', '-z', action='store_true', help='Run ffprobe on the downloaded files.') args = parser.parse_args() if args.threads <= 0: print("Can't have 0 threads!") sys.exit(1) if args.output: args.output = resolve_path(args.output) if args.log_dir: args.log_dir = resolve_path(args.log_dir) elif not args.output and not args.log_dir: args.log_dir = resolve_path(Path(os.getcwd(), 'automated-youtube-dl_logs')) # print('Must set --log-dir when --output is not.') # sys.exit(1) else: args.log_dir = args.output / 'logs' args.download_cache_file_directory = resolve_path(args.download_cache_file_directory) # TODO: use logging for this if args.verbose: print('Cache directory:', args.download_cache_file_directory) log_time = time.time() def load_input_file(): """ Get the URLs of the videos to download. Is the input a URL or file? """ url_list = {} if not re.match(url_regex, str(args.file)) or args.input_datatype in ('txt', 'yaml'): args.file = resolve_path(args.file) if not args.file.exists(): print('Input file does not exist:', args.file) sys.exit(1) input_file = [x.strip().strip('\n') for x in list(args.file.open())] if args.input_datatype == 'yaml' or (re.match(r'^.*?:\w*', input_file[0]) and args.input_datatype == 'auto'): with open(args.file, 'r') as file: try: url_list = yaml.safe_load(file) except yaml.YAMLError as e: print('Failed to load config file, error:', e) sys.exit(1) elif args.input_datatype == 'txt' or (re.match(url_regex, input_file[0]) and args.input_datatype == 'auto'): if not args.output: args.output = resolve_path(Path(os.getcwd(), 'automated-youtube-dl_output')) # print('You must specify an output path with --output when the input datatype is a text file.') # sys.exit(1) url_list[str(args.output)] = input_file else: print('Unknown file type:', args.input_datatype) print(input_file) sys.exit(1) del input_file # release file object # Verify each line in the file is a valid URL. # Also resolve the paths resolved_paths = {} for directory, urls in url_list.items(): for item in urls: if not re.match(url_regex, str(item)): print(f'Not a url:', item) sys.exit(1) resolved_paths[resolve_path(directory)] = urls url_list = resolved_paths else: # They gave us just a URL if not args.output: # Set a default path args.output = resolve_path(Path(os.getcwd(), 'automated-youtube-dl_output')) # print('You must specify an output path with --output when the input is a URL.') # sys.exit(1) url_list[str(args.output)] = [args.file] return url_list url_list = load_input_file() # Create directories AFTER loading the file create_directories(*url_list.keys(), args.download_cache_file_directory) def do_update(): if not args.no_update: print('Updating yt-dlp...') updated = update_ytdlp() if updated: print('Restarting program...') restart_program() else: print('Up to date.') if args.rm_cache: subprocess.run('yt-dlp --rm-cache-dir', shell=True) # TODO: compress old log files if args.daemon: print('Running in daemon mode.') create_directories(args.log_dir) # TODO: log file rotation https://www.blog.pythonlibrary.org/2014/02/11/python-how-to-create-rotating-logs/ # TODO: log to one file instead of one for each run file_logger = setup_file_logger('youtube_dl', args.log_dir / f'{str(int(log_time))}.log', level=logging.INFO) video_error_logger = setup_file_logger('video_errors', args.log_dir / f'{int(log_time)}-errors.log', level=logging.INFO) logger = get_silent_logger('yt-dl', silent=not args.daemon) def log_info_twice(msg): logger.info(msg) file_logger.info(ansi_escape_regex.sub('', msg)) log_info_twice('Starting process.') start_time = time.time() manager = Manager() def load_existing_videos(): # Find existing videos. output = set() if not download_archive_file.exists(): download_archive_file.touch() with open(download_archive_file, 'r') as file: output.update(([line.rstrip() for line in file])) # Remove duplicate lines. # Something may have gone wrong in the past so we want to make sure everything is cleaned up. with open(download_archive_file) as file: uniqlines = set(file.readlines()) fd, path = tempfile.mkstemp() with os.fdopen(fd, 'w') as tmp: tmp.writelines(set(uniqlines)) shutil.move(path, download_archive_file) return output status_bar = tqdm(position=2, bar_format='{desc}', disable=args.daemon, leave=False) def log_bar(log_msg, level): status_bar.write(f'[{level}] {log_msg}') if level == 'warning': logger.warning(log_msg) elif level == 'error': logger.error(log_msg) else: logger.info(log_msg) # def log_with_video_id(log_msg, video_id, level, logger_obj): # log_msg = f'{video_id} - {log_msg}' # if level == 'warning': # logger_obj.warning(log_msg) # elif level == 'error': # logger_obj.error(log_msg) # else: # logger_obj.info(log_msg) def print_without_paths(msg): """ Remove any filepaths or other stuff we don't want in the message. """ m = re.match(r'(^[^\/]+(?:\\.[^\/]*)*)', msg) if m: msg = m.group(1) m1 = re.match(r'^(.*?): ', msg) msg = msg.strip('to "').strip('to: ').strip() if args.daemon: log_info_twice(msg) else: status_bar.set_description_str(msg) class ytdl_logger(object): def debug(self, msg): file_logger.debug(self.__clean_msg(msg)) # if msg.startswith('[debug] '): # pass if '[download]' not in msg: print_without_paths(msg) def info(self, msg): file_logger.info(self.__clean_msg(msg)) print_without_paths(msg) def warning(self, msg): file_logger.warning(self.__clean_msg(msg)) if args.daemon: logger.warning(msg) else: status_bar.write(msg) def error(self, msg): file_logger.error(self.__clean_msg(msg)) if args.daemon: logger.error(msg) else: status_bar.write(msg) def __clean_msg(self, msg): return ansi_escape_regex.sub('', msg) # TODO: https://github.com/TheFrenchGhosty/TheFrenchGhostys-Ultimate-YouTube-DL-Scripts-Collection/blob/master/docs/Scripts-Type.md#archivist-scripts # https://github.com/yt-dlp/yt-dlp#embedding-examples ydl_opts = { # TODO: https://github.com/TheFrenchGhosty/TheFrenchGhostys-Ultimate-YouTube-DL-Scripts-Collection/blob/master/docs/Details.md # https://old.reddit.com/r/DataHoarder/comments/c6fh4x/after_hoarding_over_50k_youtube_videos_here_is/ 'format': f'(bestvideo[filesize<{args.max_size}M][vcodec^=av01][height>=1080][fps>30]/bestvideo[filesize<{args.max_size}M][vcodec=vp9.2][height>=1080][fps>30]/bestvideo[filesize<{args.max_size}M][vcodec=vp9][height>=1080][fps>30]/bestvideo[filesize<{args.max_size}M][vcodec^=av01][height>=1080]/bestvideo[filesize<{args.max_size}M][vcodec=vp9.2][height>=1080]/bestvideo[filesize<{args.max_size}M][vcodec=vp9][height>=1080]/bestvideo[filesize<{args.max_size}M][height>=1080]/bestvideo[filesize<{args.max_size}M][vcodec^=av01][height>=720][fps>30]/bestvideo[filesize<{args.max_size}M][vcodec=vp9.2][height>=720][fps>30]/bestvideo[filesize<{args.max_size}M][vcodec=vp9][height>=720][fps>30]/bestvideo[filesize<{args.max_size}M][vcodec^=av01][height>=720]/bestvideo[filesize<{args.max_size}M][vcodec=vp9.2][height>=720]/bestvideo[filesize<{args.max_size}M][vcodec=vp9][height>=720]/bestvideo[filesize<{args.max_size}M][height>=720]/bestvideo[filesize<{args.max_size}M])+(bestaudio[acodec=opus]/bestaudio)/best', 'merge_output_format': 'mkv', 'logtostderr': True, 'embedchapters': True, 'writethumbnail': True, # Save the thumbnail to a file. Embedding seems to be broken right now so this is an alternative. 'embedthumbnail': True, 'embeddescription': True, 'writesubtitles': True, # 'allsubtitles': True, # Download every language. 'subtitlesformat': 'vtt', 'subtitleslangs': ['en'], 'writeautomaticsub': True, 'writedescription': True, 'ignoreerrors': True, 'continuedl': False, 'addmetadata': True, 'writeinfojson': True, 'verbose': args.verbose, 'postprocessors': [ {'key': 'FFmpegEmbedSubtitle'}, {'key': 'FFmpegMetadata', 'add_metadata': True}, {'key': 'EmbedThumbnail', 'already_have_thumbnail': True}, {'key': 'FFmpegThumbnailsConvertor', 'format': 'jpg', 'when': 'before_dl'}, # {'key': 'FFmpegSubtitlesConvertor', 'format': 'srt'} ], # 'external_downloader': 'aria2c', # 'external_downloader_args': ['-j 32', '-s 32', '-x 16', '--file-allocation=none', '--optimize-concurrent-downloads=true', '--http-accept-gzip=true', '--continue=true'], } yt_dlp = YDL(dict(ydl_opts, **{'logger': ytdl_logger()})) url_count = 0 for k, v in url_list.items(): for item in v: url_count += 1 # Init bars video_bars = manager.list() if not args.daemon: for i in range(args.threads): video_bars.append([3 + i, manager.Lock()]) encountered_errors = 0 errored_videos = 0 # The video progress bars have an issue where when a bar is closed it # will shift its position back 1 then return to the correct position. # This thread will clear empty spots. if not args.daemon: eraser_exit = manager.Value(bool, False) Thread(target=bar_eraser, args=(video_bars, eraser_exit,)).start() already_erased_downloaded_tracker = False while True: # do_update() # this doesn't work very well. freezes progress_bar = tqdm(total=url_count, position=0, desc='Inputs', disable=args.daemon, bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt}') for output_path, urls in url_list.items(): for target_url in urls: logger.info('Fetching playlist...') playlist = yt_dlp.playlist_contents(str(target_url)) if not playlist: progress_bar.update() continue url_list = load_input_file() download_archive_file = args.download_cache_file_directory / (str(playlist['id']) + '.log') if args.erase_downloaded_tracker and not already_erased_downloaded_tracker: if download_archive_file.exists(): os.remove(download_archive_file) already_erased_downloaded_tracker = True downloaded_videos = load_existing_videos() msg = f'Found {len(downloaded_videos)} downloaded videos for playlist "{playlist["title"]}" ({playlist["id"]}). {"Ignoring." if args.ignore_downloaded else ""}' if args.daemon: logger.info(msg) else: progress_bar.write(msg) download_archive_logger = setup_file_logger('download_archive', download_archive_file, format_str='%(message)s') playlist['entries'] = remove_duplicates_from_playlist(playlist['entries']) log_info_twice(f'Downloading item: "{playlist["title"]}" ({playlist["id"]}) {target_url}') # Remove already downloaded files from the to-do list. download_queue = [] for p, video in enumerate(playlist['entries']): if video['id'] not in download_queue: if not args.ignore_downloaded and video['id'] not in downloaded_videos: download_queue.append(video) # downloaded_videos.add(video['id']) elif args.ignore_downloaded: download_queue.append(video) playlist_bar = tqdm(total=len(playlist['entries']), position=1, desc=f'"{playlist["title"]}" ({playlist["id"]})', disable=args.daemon, leave=False) if not args.ignore_downloaded: playlist_bar.update(len(downloaded_videos)) playlist_ydl_opts = ydl_opts.copy() # playlist_ydl_opts['outtmpl'] = f'{output_path}/{get_output_templ()}' if len(download_queue): # Don't mess with multiprocessing if all videos are already downloaded with Pool(processes=args.threads) as pool: if sys.stdout.isatty(): # Doesn't work if not connected to a terminal: # OSError: [Errno 25] Inappropriate ioctl for device status_bar.set_description_str('=' * os.get_terminal_size()[0]) logger.info('Starting downloads...') for result in pool.imap_unordered(download_video, ((video, { 'bars': video_bars, 'ydl_opts': playlist_ydl_opts, 'output_dir': Path(output_path), 'ignore_downloaded': args.ignore_downloaded, 'verify': args.verify }) for video in download_queue)): # Save the video ID to the file if result['downloaded_video_id']: download_archive_logger.info(result['downloaded_video_id']) # Print short error messages. # An error should never be added to both video_critical_err_msg_short and video_critical_err_msg. for line in result['video_critical_err_msg_short']: # file_msg = f"{result['video_id']} - {ansi_escape_regex.sub('', line)}" # term_msg = f"{result['video_id']} - {line}" msg = f"{result['video_id']} - {line}" video_error_logger.error(msg) file_logger.error(msg) encountered_errors += 1 if args.daemon: logger.error(msg) else: status_bar.write(msg) # Print longer error messages. # Won't print anything to console if the silence_errors arg is set. for line in result['video_critical_err_msg']: # file_msg = f"{result['video_id']} - {ansi_escape_regex.sub('', line)}" # term_msg = f"{result['video_id']} - {line}" msg = f"{result['video_id']} - {line}" video_error_logger.error(msg) file_logger.error(msg) encountered_errors += 1 if not args.silence_errors: if args.daemon: logger.error(msg) else: status_bar.write(msg) # if len(result['video_critical_err_msg']): # errored_videos += 1 # if args.silence_errors and args.daemon: # logger.error(f"{result['video_id']} - Failed due to error.") for line in result['logger_msg']: log_info_twice(f"{result['video_id']} - {line}") # TODO: if no error launch a verify multiprocess # if kwargs['verify']: # try: # info = yt_dlp.extract_info(video['url']) # except Exception as e: # output_dict['video_critical_err_msg'].append(f'Failed to verify video, extract_info failed: {e}') # file_path = base_path + info['ext'] # result = ffprobe(file_path) # if not result[0]: # output_dict['video_critical_err_msg'].append(f'Failed to verify video: {result[4]}') playlist_bar.update() else: msg = f"All videos already downloaded for \"{playlist['title']}\"." if args.daemon: logger.info(msg) else: status_bar.write(msg) log_info_twice(f"Finished item: '{playlist['title']}' {target_url}") # Sleep a bit to prevent rate-limiting if progress_bar.n < len(url_list.keys()) - 1: status_bar.set_description_str(f'Sleeping {args.ratelimit_sleep}s...') time.sleep(args.ratelimit_sleep) progress_bar.update() error_msg = f'Encountered {encountered_errors} errors on {errored_videos} videos.' if args.daemon: logger.info(error_msg) else: status_bar.write(error_msg) log_info_twice(f"Finished process in {round(math.ceil(time.time() - start_time) / 60, 2)} min.") if not args.daemon: break else: logger.info(f'Sleeping for {args.sleep} min.') try: time.sleep(args.sleep * 60) except KeyboardInterrupt: sys.exit(0) # downloaded_videos = load_existing_videos() # reload the videos that have already been downloaded # Clean up the remaining bars. Have to close them in order. # These variables may be undefined so we will just ignore any errors. # Not in one try/catch because we don't want to skip anything. try: eraser_exit.value = True except NameError: pass except AttributeError: pass try: playlist_bar.close() except NameError: pass except AttributeError: pass try: status_bar.close() except NameError: pass except AttributeError: pass