diff --git a/.gitignore b/.gitignore index 2629746..0c25724 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ .idea +targets.* +!targets.sample.* # ---> Python # Byte-compiled / optimized / DLL files diff --git a/README.md b/README.md index f9a511a..ca5bba7 100644 --- a/README.md +++ b/README.md @@ -53,17 +53,18 @@ Output Directory/ Videos will be saved using this name format: ``` -%(title)s --- %(uploader)s --- %(uploader_id)s --- %(id)s +[%(id)s] [%(title)s] [%(uploader)s] [%(uploader_id)s] ``` #### Arguments -| Argument | Flag | Help | -| ------------- | ---- | ------------------------------------------------------------ | -| `--no-update` | `-n` | Don\'t update yt-dlp at launch. | -| `--max-size` | | Max allowed size of a video in MB. Default: 1100. | -| `--rm-cache` | `-r` | Delete the yt-dlp cache on start. | -| `--threads` | | How many download processes to use (threads). Default is how many CPU cores you have. You will want to find a good value that doesn't overload your connection. | -| `--daemon` | `-d` | Run in daemon mode. Disables progress bars sleeps for the amount of time specified in --sleep. | -| `--sleep` | | How many minutes to sleep when in daemon mode. | -| `--silent` | `-s` | Don't print any error messages to the console. | \ No newline at end of file +| Argument | Flag | Help | +| --------------------- | ---- | ------------------------------------------------------------ | +| `--no-update` | `-n` | Don\'t update yt-dlp at launch. | +| `--max-size` | | Max allowed size of a video in MB. Default: 1100. | +| `--rm-cache` | `-r` | Delete the yt-dlp cache on start. | +| `--threads` | | How many download processes to use (threads). Default is how many CPU cores you have. You will want to find a good value that doesn't overload your connection. | +| `--daemon` | `-d` | Run in daemon mode. Disables progress bars sleeps for the amount of time specified in --sleep. | +| `--sleep` | | How many minutes to sleep when in daemon mode. | +| `--silent` | `-s` | Don't print any error messages to the console. | +| `--ignore-downloaded` | `-i` | Ignore videos that have been already downloaded and let youtube-dl handle everything. Videos will not be re-downloaded, but metadata will be updated. | \ No newline at end of file diff --git a/downloader.py b/downloader.py index 60d80dd..2757d27 100755 --- a/downloader.py +++ b/downloader.py @@ -8,36 +8,44 @@ import subprocess import sys import time from multiprocessing import Manager, Pool, cpu_count +from pathlib import Path +from threading import Thread +import yaml +from appdirs import user_data_dir from tqdm.auto import tqdm import ydl.yt_dlp as ydl from process.funcs import get_silent_logger, remove_duplicates_from_playlist, restart_program, setup_file_logger -from process.threads import download_video +from process.threads import bar_eraser, download_video from ydl.files import create_directories, resolve_path # logging.basicConfig(level=1000) # logging.getLogger().setLevel(1000) -urlRegex = re.compile( - r'^(?:http|ftp)s?://' # http:// or https:// - r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # domain... - r'localhost|' # localhost... - r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip - r'(?::\d+)?' # optional port - r'(?:/?|[/?]\S+)$', re.IGNORECASE) +urlRegex = re.compile(r'^(?:http|ftp)s?://' # http:// or https:// + r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # domain... + r'localhost|' # localhost... + r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip + r'(?::\d+)?' # optional port + r'(?:/?|[/?]\S+)$', re.IGNORECASE) parser = argparse.ArgumentParser() parser.add_argument('file', help='URL to download or path of a file containing the URLs of the videos to download.') -parser.add_argument('output', help='Output directory.') +parser.add_argument('output', help='Output directory. Ignored paths specified in a YAML file.') parser.add_argument('--no-update', '-n', action='store_true', help='Don\'t update yt-dlp at launch.') parser.add_argument('--max-size', type=int, default=1100, help='Max allowed size of a video in MB.') parser.add_argument('--rm-cache', '-r', action='store_true', help='Delete the yt-dlp cache on start.') parser.add_argument('--threads', type=int, default=cpu_count(), help='How many download processes to use.') parser.add_argument('--daemon', '-d', action='store_true', help="Run in daemon mode. Disables progress bars sleeps for the amount of time specified in --sleep.") parser.add_argument('--sleep', type=float, default=60, help='How many minutes to sleep when in daemon mode.') +parser.add_argument('--download-cache-file-directory', default=user_data_dir('automated-youtube-dl', 'cyberes'), help='The path to the directory to track downloaded videos. Defaults to your appdata path.') parser.add_argument('--silence-errors', '-s', action='store_true', help="Don't print any error messages to the console.") -parser.add_argument('--ignore-downloaded', '-i', action='store_true', help='Ignore videos that have been already downloaded and let YouTubeDL handle everything.') +parser.add_argument('--ignore-downloaded', '-i', action='store_true', help='Ignore videos that have been already downloaded and let youtube-dl handle everything.') +parser.add_argument('--erase-downloaded-tracker', '-e', action='store_true', help='Erase the tracked video file.') +parser.add_argument('--ratelimit-sleep', type=int, default=5, help='How many seconds to sleep to prevent rate-limiting.') +parser.add_argument('--input-datatype', choices=['auto', 'txt', 'yaml'], default='auto', help='The datatype of the input file. If set to auto, the file will be scanned for a URL on the firstline.' + 'If is a URL, the filetype will be set to txt. If it is a key: value pair then the filetype will be set to yaml.') args = parser.parse_args() if args.threads <= 0: @@ -45,22 +53,42 @@ if args.threads <= 0: sys.exit(1) args.output = resolve_path(args.output) +args.download_cache_file_directory = resolve_path(args.download_cache_file_directory) log_time = time.time() # Get the URLs of the videos to download. Is the input a URL or file? -if not re.match(urlRegex, str(args.file)): +url_list = {} +if not re.match(urlRegex, str(args.file)) or args.input_datatype in ('txt', 'yaml'): args.file = resolve_path(args.file) if not args.file.exists(): print('Input file does not exist:', args.file) sys.exit(1) - url_list = [x.strip().strip('\n') for x in list(args.file.open())] + input_file = [x.strip().strip('\n') for x in list(args.file.open())] + if args.input_datatype == 'yaml' or (re.match(r'^.*?:\w*', input_file[0]) and args.input_datatype == 'auto'): + with open(args.file, 'r') as file: + try: + url_list = yaml.safe_load(file) + except yaml.YAMLError as e: + print('Failed to load config file, error:', e) + sys.exit(1) + elif args.input_datatype == 'txt' or (re.match(urlRegex, input_file[0]) and args.input_datatype == 'auto'): + url_list[str(args.output)] = input_file + else: + print('Unknown file type:', args.input_datatype) + print(input_file) + sys.exit(1) + del input_file # release file object # Verify each line in the file is a valid URL. - for i, line in enumerate(url_list): - if not re.match(urlRegex, line): - print(f'Line {i} not a url:', line) - sys.exit(1) + for directory, urls in url_list.items(): + for item in urls: + if not re.match(urlRegex, str(item)): + print(f'Not a url:', item) + sys.exit(1) else: - url_list = [args.file] + url_list[str(args.output)] = [args.file] + +# Create directories AFTER loading the file +create_directories(*url_list.keys(), args.download_cache_file_directory) def do_update(): @@ -98,8 +126,6 @@ start_time = time.time() manager = Manager() -download_archive_file = args.output / 'download-archive.log' - def load_existing_videos(): # Find existing videos. @@ -111,13 +137,7 @@ def load_existing_videos(): return output -downloaded_videos = load_existing_videos() -print('Found', len(downloaded_videos), 'downloaded videos.') - -# Create this object AFTER reading in the download_archive. -download_archive_logger = setup_file_logger('download_archive', download_archive_file, format_str='%(message)s') - -status_bar = tqdm(position=2, bar_format='{desc}', disable=args.daemon) +status_bar = tqdm(position=2, bar_format='{desc}', disable=args.daemon, leave=False) def log_bar(msg, level): @@ -195,94 +215,118 @@ ydl_opts = { } main_opts = dict(ydl_opts, **{'logger': ytdl_logger()}) -# thread_opts = dict(ydl_opts, **{'logger': ydl.ytdl_no_logger()}) yt_dlp = ydl.YDL(main_opts) +url_count = 0 +for k, v in url_list.items(): + for item in v: + url_count += 1 + # Init bars -playlist_bar = tqdm(position=1, desc='Playlist', disable=args.daemon) +progress_bar = tqdm(total=url_count, position=0, desc='Inputs', disable=args.daemon) video_bars = manager.list() if not args.daemon: for i in range(args.threads): - video_bars.append([ - 3 + i, - manager.Lock() - ]) + video_bars.append([3 + i, manager.Lock()]) + +encountered_errors = 0 +errored_videos = 0 + +# The video progress bars have an issue where when a bar is closed it will shift its position back 1 then return to the correct position. +# This thread will clear empty spots. +if not args.daemon: + eraser_exit = manager.Value(bool, False) + Thread(target=bar_eraser, args=(video_bars, eraser_exit,)).start() while True: do_update() - for i, target_url in tqdm(enumerate(url_list), total=len(url_list), position=0, desc='Inputs', disable=args.daemon): - logger.info('Fetching playlist...') - playlist = yt_dlp.playlist_contents(target_url) - playlist['entries'] = remove_duplicates_from_playlist(playlist['entries']) - encountered_errors = 0 - errored_videos = 0 + for output_path, urls in url_list.items(): + for target_url in urls: + logger.info('Fetching playlist...') + playlist = yt_dlp.playlist_contents(str(target_url)) + if not playlist: + progress_bar.update() + continue - log_info_twice(f"Downloading item: '{playlist['title']}' {target_url}") + download_archive_file = args.download_cache_file_directory / (str(playlist['id']) + '.log') + if args.erase_downloaded_tracker: + if download_archive_file.exists(): + os.remove(download_archive_file) + downloaded_videos = load_existing_videos() - playlist_bar.total = len(playlist['entries']) - playlist_bar.set_description(playlist['title']) + msg = f'Found {len(downloaded_videos)} downloaded videos for playlist "{playlist["title"]}" ({playlist["id"]}). {"Ignoring." if args.ignore_downloaded else ""}' + if args.daemon: + print(msg) + else: + status_bar.write(msg) + download_archive_logger = setup_file_logger('download_archive', download_archive_file, format_str='%(message)s') - # print(playlist['entries'][0]) - # sys.exit() + playlist['entries'] = remove_duplicates_from_playlist(playlist['entries']) - # Remove already downloaded files from the to-do list. - if not args.ignore_downloaded: + log_info_twice(f'Downloading item: "{playlist["title"]}" ({playlist["id"]}) {target_url}') + + # Remove already downloaded files from the to-do list. download_queue = [] - s = set() for p, video in enumerate(playlist['entries']): - if video['id'] not in downloaded_videos and video['id'] not in s: - download_queue.append(video) - s.add(video['id']) - playlist_bar.update(len(downloaded_videos)) + if video['id'] not in download_queue: + if not args.ignore_downloaded and video['id'] not in downloaded_videos: + download_queue.append(video) + # downloaded_videos.add(video['id']) + elif args.ignore_downloaded: + download_queue.append(video) - if len(download_queue): # Don't mess with multiprocessing if all videos are already downloaded - with Pool(processes=args.threads) as pool: - if sys.stdout.isatty(): - # Doesn't work if not connected to a terminal: - # OSError: [Errno 25] Inappropriate ioctl for device - status_bar.set_description_str('=' * os.get_terminal_size()[0]) - logger.info('Starting downloads...') - for result in pool.imap_unordered(download_video, - ((video, { - 'bars': video_bars, - 'ydl_opts': ydl_opts, - 'output_dir': args.output, - }) for video in download_queue)): - # Save the video ID to the file - if result['downloaded_video_id']: - download_archive_logger.info(result['downloaded_video_id']) + playlist_bar = tqdm(total=len(playlist['entries']), position=1, desc=f'"{playlist["title"]}" ({playlist["id"]})', disable=args.daemon, leave=False) + if not args.ignore_downloaded: + playlist_bar.update(len(downloaded_videos)) - # Print stuff - for line in result['video_error_logger_msg']: - video_error_logger.info(line) - file_logger.error(line) - encountered_errors += 1 - if not args.silence_errors: - if args.daemon: - logger.error(line) - else: - playlist_bar.write(line) + if len(download_queue): # Don't mess with multiprocessing if all videos are already downloaded + with Pool(processes=args.threads) as pool: + if sys.stdout.isatty(): + # Doesn't work if not connected to a terminal: + # OSError: [Errno 25] Inappropriate ioctl for device + status_bar.set_description_str('=' * os.get_terminal_size()[0]) + logger.info('Starting downloads...') + for result in pool.imap_unordered(download_video, ((video, {'bars': video_bars, 'ydl_opts': ydl_opts, 'output_dir': Path(output_path), }) for video in download_queue)): + # Save the video ID to the file + if result['downloaded_video_id']: + download_archive_logger.info(result['downloaded_video_id']) - if len(result['video_error_logger_msg']): - errored_videos += 1 - if args.silence_errors and args.daemon: - logger.error(f"{result['video_id']} failed due to error.") + # Print stuff + for line in result['video_error_logger_msg']: + video_error_logger.info(line) + file_logger.error(line) + encountered_errors += 1 + if not args.silence_errors: + if args.daemon: + logger.error(line) + else: + status_bar.write(line) - # for line in result['status_msg']: - # playlist_bar.write(line) - for line in result['logger_msg']: - log_info_twice(line) - playlist_bar.update() - else: - playlist_bar.write(f"All videos already downloaded for '{playlist['title']}'.") + if len(result['video_error_logger_msg']): + errored_videos += 1 + if args.silence_errors and args.daemon: + logger.error(f"{result['video_id']} failed due to error.") - error_msg = f'Encountered {encountered_errors} errors on {errored_videos} videos.' - if args.daemon: - logger.info(error_msg) - else: - playlist_bar.write(error_msg) + # for line in result['status_msg']: + # playlist_bar.write(line) + for line in result['logger_msg']: + log_info_twice(line) + playlist_bar.update() + else: + status_bar.write(f"All videos already downloaded for '{playlist['title']}'.") + log_info_twice(f"Finished item: '{playlist['title']}' {target_url}") - log_info_twice(f"Finished item: '{playlist['title']}' {target_url}") + # Sleep a bit to prevent rate-limiting + if progress_bar.n < len(url_list.keys()) - 1: + status_bar.set_description_str(f'Sleeping {args.ratelimit_sleep}s...') + time.sleep(args.ratelimit_sleep) + + progress_bar.update() + error_msg = f'Encountered {encountered_errors} errors on {errored_videos} videos.' + if args.daemon: + logger.info(error_msg) + else: + status_bar.write(error_msg) log_info_twice(f"Finished process in {round(math.ceil(time.time() - start_time) / 60, 2)} min.") if not args.daemon: break @@ -292,12 +336,9 @@ while True: time.sleep(args.sleep * 60) except KeyboardInterrupt: sys.exit() - downloaded_videos = load_existing_videos() # reload the videos that have already been downloaded - -# Erase the status bar. -status_bar.set_description_str('\x1b[2KDone!') -status_bar.refresh() + # downloaded_videos = load_existing_videos() # reload the videos that have already been downloaded # Clean up the remaining bars. Have to close them in order. +eraser_exit.value = True playlist_bar.close() status_bar.close() diff --git a/process/funcs.py b/process/funcs.py index 045338e..6144f63 100644 --- a/process/funcs.py +++ b/process/funcs.py @@ -22,7 +22,7 @@ def restart_program(): os.execl(python, python, *sys.argv) -def setup_file_logger(name, log_file, level=logging.INFO, format_str: str = '%(asctime)s - %(name)s - %(levelname)s - %(message)s', filemode='a', no_console: bool = True): +def setup_file_logger(name, log_file, level=logging.INFO, format_str: str = '%(asctime)s - %(name)s - %(levelname)s - %(message)s', filemode='a'): formatter = logging.Formatter(format_str) logger = logging.getLogger(name) diff --git a/process/threads.py b/process/threads.py index 1866cca..1b3754e 100644 --- a/process/threads.py +++ b/process/threads.py @@ -1,6 +1,8 @@ import math import os import time +from multiprocessing import Manager +from threading import Thread import numpy as np from tqdm.auto import tqdm @@ -12,21 +14,25 @@ from process.funcs import setup_file_logger class ytdl_logger(object): errors = [] - def __init__(self, logger): + def __init__(self, logger=None): self.logger = logger def debug(self, msg): - self.logger.info(msg) + if self.logger: + self.logger.info(msg) def info(self, msg): - self.logger.info(msg) + if self.logger: + self.logger.info(msg) def warning(self, msg): - self.logger.warning(msg) + if self.logger: + self.logger.warning(msg) def error(self, msg): - self.logger.error(msg) - self.errors.append(msg) + if self.logger: + self.logger.error(msg) + self.errors.append(msg) def is_manager_lock_locked(lock) -> bool: @@ -57,6 +63,7 @@ def download_video(args) -> dict: bar.set_postfix({ 'speed': d['_speed_str'], 'size': f"{d['_downloaded_bytes_str'].strip()}/{d['_total_bytes_str'].strip()}", + 'offset': offset }) video = args[0] @@ -69,21 +76,28 @@ def download_video(args) -> dict: while not locked: for item in kwargs['bars']: if not is_manager_lock_locked(item[1]): - locked = item[1].acquire(timeout=0.1) # get the lock ASAP and don't wait if we didn't get it. + locked = item[1].acquire(timeout=0.2) # get the lock ASAP and don't wait if we didn't get it. offset = item[0] bar_lock = item[1] break kwargs['ydl_opts']['progress_hooks'] = [progress_hook] desc_with = int(np.round(os.get_terminal_size()[0] * (1 / 4))) - bar = tqdm(total=100, position=(offset if locked else None), desc=f"{video['id']} - {video['title']}".ljust(desc_with)[:desc_with], bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}{postfix}]', leave=False) + bar = tqdm(total=100, position=offset, desc=f"{video['id']} - {video['title']}".ljust(desc_with)[:desc_with], bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}{postfix}]', leave=False) - ylogger = ytdl_logger(setup_file_logger(video['id'], kwargs['output_dir'] / f"[{video['id']}].log")) - kwargs['ydl_opts']['logger'] = ylogger - yt_dlp = ydl.YDL(kwargs['ydl_opts']) output_dict = {'downloaded_video_id': None, 'video_id': video['id'], 'video_error_logger_msg': [], 'status_msg': [], 'logger_msg': []} # empty object start_time = time.time() try: + kwargs['ydl_opts']['logger'] = ytdl_logger() # dummy silent logger + yt_dlp = ydl.YDL(kwargs['ydl_opts']) + try: + base_path = os.path.splitext(yt_dlp.prepare_filename(yt_dlp.extract_info(video['url'], download=False)))[0] + except AttributeError: + # Sometimes we won't be able to pull the video info so just use the video's ID + base_path = kwargs['output_dir'] / video['id'] + ylogger = ytdl_logger(setup_file_logger(video['id'], str(base_path) + '.log')) + kwargs['ydl_opts']['logger'] = ylogger + yt_dlp = ydl.YDL(kwargs['ydl_opts']) # recreate the object with the correct logging path error_code = yt_dlp(video['url']) # Do the download if not error_code: elapsed = round(math.ceil(time.time() - start_time) / 60, 2) @@ -96,7 +110,43 @@ def download_video(args) -> dict: output_dict['video_error_logger_msg'] = output_dict['video_error_logger_msg'] + ylogger.errors except Exception as e: output_dict['video_error_logger_msg'].append(f"EXCEPTION -> {e}") + bar.update(100 - bar.n) if locked: bar.close() bar_lock.release() return output_dict + + +def bar_eraser(video_bars, eraser_exit): + manager = Manager() + queue = manager.dict() + queue_lock = manager.Lock() + + def eraser(): + nonlocal queue + while not eraser_exit.value: + for i in queue.keys(): + if eraser_exit.value: + return + i = int(i) + lock = video_bars[i][1].acquire(timeout=0.1) + bar_lock = video_bars[i][1] + if lock: + bar = tqdm(position=video_bars[i][0], leave=False, bar_format='\x1b[2K') + bar.close() + with queue_lock: + del queue_dict[i] + queue = queue_dict + bar_lock.release() + + Thread(target=eraser).start() + + while not eraser_exit.value: + for i, item in enumerate(video_bars): + if eraser_exit.value: + return + if is_manager_lock_locked(item[1]): + with queue_lock: + queue_dict = queue + queue_dict[i] = True + queue = queue_dict diff --git a/requirements.txt b/requirements.txt index 926ae0a..297a27f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,6 @@ yt-dlp psutil tqdm mergedeep -numpy \ No newline at end of file +numpy +pyyaml +appdirs \ No newline at end of file diff --git a/targets.sample.txt b/targets.sample.txt new file mode 100644 index 0000000..2b93fb6 --- /dev/null +++ b/targets.sample.txt @@ -0,0 +1 @@ +https://www.youtube.com/playlist?list=example1234 \ No newline at end of file diff --git a/targets.sample.yaml b/targets.sample.yaml new file mode 100644 index 0000000..0e1252c --- /dev/null +++ b/targets.sample.yaml @@ -0,0 +1,5 @@ +/path/to/storage/Example Playlist: + - https://www.youtube.com/playlist?list=ExamplePlaylist1234 + +/path/to/storage/Music: + - https://www.youtube.com/MyMusicPlaylist1234 \ No newline at end of file diff --git a/ydl/yt_dlp.py b/ydl/yt_dlp.py index 640bf58..d7a9ec4 100644 --- a/ydl/yt_dlp.py +++ b/ydl/yt_dlp.py @@ -29,13 +29,15 @@ class YDL: sizes.append(d) return tuple(sizes) - def playlist_contents(self, url: str) -> dict: + def playlist_contents(self, url: str) -> dict | bool: ydl_opts = merge({ 'extract_flat': True, 'skip_download': True }, self.ydl_opts) with yt_dlp.YoutubeDL(ydl_opts) as ydl: info = ydl.sanitize_info(ydl.extract_info(url, download=False)) + if not info: + return False entries = [] if info['_type'] == 'playlist': if 'entries' in info.keys(): @@ -53,14 +55,23 @@ class YDL: 'entries': entries, } - def __call__(self, *args, **kwargs): - return self.yt_dlp.download(*args, **kwargs) - # def filter_filesize(self, info, *, incomplete): # duration = info.get('duration') # if duration and duration < 60: # return 'The video is too short' + def extract_info(self, *args, **kwargs): + return self.yt_dlp.extract_info(*args, **kwargs) + + def prepare_filename(self, *args, **kwargs): + return self.yt_dlp.prepare_filename(*args, **kwargs) + + def process_info(self, *args, **kwargs): + return self.yt_dlp.process_info(*args, **kwargs) + + def __call__(self, *args, **kwargs): + return self.yt_dlp.download(*args, **kwargs) + def update_ytdlp(): old = subprocess.check_output('pip freeze | grep yt-dlp', shell=True).decode().strip('\n')