This repository has been archived on 2023-11-11. You can view files and clone it, but cannot push or open issues or pull requests.
automated-youtube-dl/downloader.py

509 lines
22 KiB
Python
Executable File

#!/usr/bin/env python3
import argparse
import logging.config
import math
import os
import re
import shutil
import subprocess
import sys
import tempfile
import time
from multiprocessing import Manager, Pool, cpu_count
from pathlib import Path
from threading import Thread
import yaml
from appdirs import user_data_dir
from tqdm.auto import tqdm
from process.funcs import get_silent_logger, remove_duplicates_from_playlist, restart_program, setup_file_logger
from process.threads import bar_eraser, download_video
from ydl.files import create_directories, resolve_path
from ydl.yt_dlp import YDL, update_ytdlp
def signal_handler(sig, frame):
    """
    Signal callback that ends the process with exit status 0.

    :param sig: signal number handed over by the ``signal`` module (unused).
    :param frame: interrupted stack frame (unused).
    :raises SystemExit: always, with code 0.
    """
    # TODO: https://www.g-loaded.eu/2016/11/24/how-to-terminate-running-python-threads-using-signals/
    # raise ServiceExit
    raise SystemExit(0)
# signal.signal(signal.SIGTERM, signal_handler)
# signal.signal(signal.SIGINT, signal_handler)
# Pattern used to decide whether a string is a URL (as opposed to a file path).
# Kept as adjacent raw literals so every component can carry its own label.
_URL_PATTERN = (
    r'^(?:http|ftp)s?://'  # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$'
)
url_regex = re.compile(_URL_PATTERN, re.IGNORECASE)

# Matches ANSI terminal escape sequences so they can be removed before text is written to log files.
_ANSI_ESCAPE_PATTERN = r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])'
ansi_escape_regex = re.compile(_ANSI_ESCAPE_PATTERN)
# Command line definition. `args` is read as a module-level global by the rest of the script.

# cpu_count() - 1 is 0 on a single-core machine, which would make the default configuration
# fail the "Can't have 0 threads!" check, so the default is clamped to at least one worker.
_default_threads = max(cpu_count() - 1, 1)

parser = argparse.ArgumentParser()
parser.add_argument('file', help='URL to download or path of a file containing the URLs of the videos to download.')
parser.add_argument('--output', required=False,
                    help='Output directory. Ignored when paths are specified in a YAML file.')
parser.add_argument('--no-update', '-n', action='store_true', help='Don\'t update yt-dlp at launch.')
parser.add_argument('--max-size', type=int, default=1100, help='Max allowed size of a video in MB.')
parser.add_argument('--rm-cache', '-r', action='store_true', help='Delete the yt-dlp cache on start.')
parser.add_argument('--threads', type=int, default=_default_threads,
                    help=f'How many download processes to use. Default: number of CPU cores (for your machine: {cpu_count()}) - 1, but at least 1 = {_default_threads}')
parser.add_argument('--daemon', '-d', action='store_true',
                    help="Run in daemon mode. Disables progress bars and sleeps for the amount of time specified in --sleep.")
parser.add_argument('--sleep', type=float, default=60, help='How many minutes to sleep when in daemon mode.')
parser.add_argument('--download-cache-file-directory', default=user_data_dir('automated-youtube-dl', 'cyberes'),
                    help='The path to the directory to track downloaded videos. Defaults to your appdata path.')
parser.add_argument('--silence-errors', '-s', action='store_true',
                    help="Don't print any error messages to the console.")
parser.add_argument('--ignore-downloaded', '-i', action='store_true',
                    help='Ignore videos that have been already downloaded and disable checks. Let youtube-dl handle everything.')
parser.add_argument('--erase-downloaded-tracker', '-e', action='store_true', help='Erase the tracked video file.')
parser.add_argument('--ratelimit-sleep', type=int, default=5,
                    help='How many seconds to sleep between items to prevent rate-limiting. Does not affect time between videos as you should be fine since it takes a few seconds to merge everything and clean up.')
# The two help literals below are concatenated, so the first one must end with a space.
parser.add_argument('--input-datatype', choices=['auto', 'txt', 'yaml'], default='auto',
                    help='The datatype of the input file. If set to auto, the file will be scanned for a URL on the first line. '
                         'If it is a URL, the filetype will be set to txt. If it is a key: value pair then the filetype will be set to yaml.')
parser.add_argument('--log-dir', default=None, help='Where to store the logs. Must be set when --output is not.')
parser.add_argument('--verbose', '-v', action='store_true')
parser.add_argument('--verify', '-z', action='store_true', help='Run ffprobe on the downloaded files.')
args = parser.parse_args()
# A worker pool needs at least one process.
if args.threads < 1:
    print("Can't have 0 threads!")
    sys.exit(1)

# Resolve --output first: the log directory fallback below is derived from it.
if args.output:
    args.output = resolve_path(args.output)

# Log directory precedence: explicit --log-dir, then "<output>/logs",
# then a folder in the current working directory when neither option was given.
if args.log_dir:
    args.log_dir = resolve_path(args.log_dir)
elif args.output:
    args.log_dir = args.output / 'logs'
else:
    args.log_dir = resolve_path(Path(os.getcwd(), 'automated-youtube-dl_logs'))

args.download_cache_file_directory = resolve_path(args.download_cache_file_directory)

# TODO: use logging for this
if args.verbose:
    print('Cache directory:', args.download_cache_file_directory)

# Timestamp of this run; the per-run log file names are built from it.
log_time = time.time()
def load_input_file():
    """
    Get the URLs of the videos to download. Is the input a URL or file?

    Reads the module-level ``args`` and returns a dict mapping an output directory to the
    list of URLs that should be downloaded into it.

    * If ``args.file`` is itself a URL (and no explicit ``--input-datatype`` of txt/yaml was
      requested) the result has a single entry keyed by ``str(args.output)``.
    * Otherwise ``args.file`` is treated as a path. A text file contributes one URL per
      non-blank line under ``args.output``; a YAML file must contain a mapping of
      directory -> URL list.

    Side effects: ``args.file`` is replaced by its resolved path and ``args.output`` gets a
    default (``./automated-youtube-dl_output``) when it is needed but unset.
    Prints a message and exits with status 1 on a missing/empty/unparseable file or when
    any entry is not a URL.
    """
    url_list = {}

    explicit_file_type = args.input_datatype in ('txt', 'yaml')
    if re.match(url_regex, str(args.file)) and not explicit_file_type:
        # They gave us just a URL
        if not args.output:
            # Set a default path
            args.output = resolve_path(Path(os.getcwd(), 'automated-youtube-dl_output'))
        url_list[str(args.output)] = [args.file]
        return url_list

    args.file = resolve_path(args.file)
    if not args.file.exists():
        print('Input file does not exist:', args.file)
        sys.exit(1)

    # Read inside a context manager so the handle is closed, and drop blank lines:
    # they carry no URL and would otherwise fail validation below.
    with args.file.open() as handle:
        input_file = [stripped for stripped in (line.strip() for line in handle) if stripped]
    if not input_file:
        print('Input file is empty:', args.file)
        sys.exit(1)

    datatype = args.input_datatype
    if datatype == 'auto':
        # The URL test must come first: the "key: value" pattern also matches the
        # "https:" at the start of every URL, which made URL lists look like YAML.
        if re.match(url_regex, input_file[0]):
            datatype = 'txt'
        elif re.match(r'^.*?:\w*', input_file[0]):
            datatype = 'yaml'

    if datatype == 'yaml':
        with open(args.file, 'r') as file:
            try:
                url_list = yaml.safe_load(file)
            except yaml.YAMLError as e:
                print('Failed to load config file, error:', e)
                sys.exit(1)
        if not isinstance(url_list, dict):
            # e.g. an empty document (None) or a bare list: there is no directory to save to.
            print('Failed to load config file, error:',
                  'expected a mapping of output directory to a list of URLs.')
            sys.exit(1)
    elif datatype == 'txt':
        if not args.output:
            args.output = resolve_path(Path(os.getcwd(), 'automated-youtube-dl_output'))
        url_list[str(args.output)] = input_file
    else:
        print('Unknown file type:', args.input_datatype)
        print(input_file)
        sys.exit(1)

    # Verify each line in the file is a valid URL.
    # Also resolve the paths
    resolved_paths = {}
    for directory, urls in url_list.items():
        if urls is None:
            # YAML key without a value.
            urls = []
        elif isinstance(urls, str):
            # YAML "dir: url" shorthand; iterating the string would yield single characters.
            urls = [urls]
        for item in urls:
            if not re.match(url_regex, str(item)):
                print('Not a url:', item)
                sys.exit(1)
        resolved_paths[resolve_path(directory)] = urls
    return resolved_paths
url_list = load_input_file()

# Directories are only created once the input has been loaded and validated,
# so a bad input file does not leave empty folders behind.
_required_directories = [*url_list, args.download_cache_file_directory]
create_directories(*_required_directories)
def do_update():
    """
    Update yt-dlp unless --no-update was given.

    When ``update_ytdlp()`` reports that an update happened, the program is restarted via
    ``restart_program()``; otherwise an "Up to date." notice is printed.
    """
    if args.no_update:
        return
    print('Updating yt-dlp...')
    if update_ytdlp():
        print('Restarting program...')
        restart_program()
    else:
        print('Up to date.')
# Optionally purge yt-dlp's own cache before doing anything else.
if args.rm_cache:
    subprocess.run('yt-dlp --rm-cache-dir', shell=True)

# TODO: compress old log files

if args.daemon:
    print('Running in daemon mode.')

create_directories(args.log_dir)

# TODO: log file rotation https://www.blog.pythonlibrary.org/2014/02/11/python-how-to-create-rotating-logs/
# TODO: log to one file instead of one for each run

# Two files per run, both named after the run's start timestamp:
# "<timestamp>.log" for general output and "<timestamp>-errors.log" for per-video errors.
file_logger = setup_file_logger('youtube_dl', args.log_dir / f'{int(log_time)}.log', level=logging.INFO)
video_error_logger = setup_file_logger('video_errors', args.log_dir / f'{int(log_time)}-errors.log',
                                       level=logging.INFO)

# Console logger; it is created silent unless running in daemon mode.
logger = get_silent_logger('yt-dl', silent=not args.daemon)
def log_info_twice(msg):
    """
    Log ``msg`` at INFO level to both the console logger and the run's log file.

    The console copy is passed through unchanged; the file copy has ANSI escape
    sequences removed first.
    """
    logger.info(msg)
    plain_text = ansi_escape_regex.sub('', msg)
    file_logger.info(plain_text)
# Mark the beginning of the run in both the console and the file log.
log_info_twice('Starting process.')
# Reference point for the "Finished process in ... min." message emitted by the main loop.
start_time = time.time()
# multiprocessing manager; the shared progress-bar list, its locks and the
# bar-eraser exit flag are all created from it further down.
manager = Manager()
def load_existing_videos():
    """
    Load the IDs recorded in the current playlist's download archive and tidy the file.

    Reads the module-level ``download_archive_file`` (created empty when missing), which
    holds one video ID per line. The file is then rewritten without duplicate or blank
    lines, because something may have gone wrong in the past and left it in a messy state.

    :return: set of video IDs found in the archive.
    """
    if not download_archive_file.exists():
        download_archive_file.touch()

    with open(download_archive_file, 'r') as file:
        stripped_lines = (line.rstrip() for line in file)
        # dict.fromkeys() removes duplicates but, unlike a set, keeps first-seen order,
        # so the archive stays stable between runs. Blank lines are not IDs and are dropped.
        video_ids = list(dict.fromkeys(entry for entry in stripped_lines if entry))

    # Write the cleaned list to a temp file and move it over the archive.
    # Every ID gets its own newline; writing the raw lines back could glue a final,
    # newline-less line onto whichever ID happened to follow it.
    fd, path = tempfile.mkstemp()
    with os.fdopen(fd, 'w') as tmp:
        tmp.writelines(f'{video_id}\n' for video_id in video_ids)
    shutil.move(path, download_archive_file)

    return set(video_ids)
# Text-only status line: the bar format is just '{desc}', so it shows whatever description
# is set on it. It sits at terminal position 2 and is disabled entirely in daemon mode.
status_bar = tqdm(position=2, bar_format='{desc}', disable=args.daemon, leave=False)
def log_bar(log_msg, level):
    """
    Write a message above the progress bars and forward it to the console logger.

    :param log_msg: text to report.
    :param level: 'warning' or 'error' select the matching logger method;
                  any other value is logged at INFO level.
    """
    status_bar.write(f'[{level}] {log_msg}')
    if level == 'warning':
        emit = logger.warning
    elif level == 'error':
        emit = logger.error
    else:
        emit = logger.info
    emit(log_msg)
# def log_with_video_id(log_msg, video_id, level, logger_obj):
# log_msg = f'{video_id} - {log_msg}'
# if level == 'warning':
# logger_obj.warning(log_msg)
# elif level == 'error':
# logger_obj.error(log_msg)
# else:
# logger_obj.info(log_msg)
def print_without_paths(msg):
    """
    Remove any filepaths or other stuff we don't want in the message, then display it.

    Everything from the first '/' onwards is dropped (unless the message starts with '/'),
    followed by a dangling opening quote and a dangling "to" / "to:" that used to introduce
    the removed path. In daemon mode the result is logged via ``log_info_twice``; otherwise
    it becomes the description of the status bar.

    :param msg: message emitted by yt-dlp.
    """
    m = re.match(r'(^[^\/]+(?:\\.[^\/]*)*)', msg)
    if m:
        msg = m.group(1)
    msg = msg.strip()

    # str.strip('to "') removes *characters*, not a suffix, and would eat real text
    # ("into" -> "in", leading "total" -> "al"), so the leftovers are removed explicitly.
    if msg.endswith('"'):
        msg = msg[:-1].rstrip()
    for dangling in ('to:', 'to'):
        if msg == dangling or msg.endswith(' ' + dangling):
            msg = msg[:len(msg) - len(dangling)].rstrip()
            break

    if args.daemon:
        log_info_twice(msg)
    else:
        status_bar.set_description_str(msg)
class ytdl_logger:
    """
    Logger object handed to yt-dlp through its 'logger' option.

    Every message is written to the run's log file with ANSI escape sequences removed.
    Debug/info messages additionally go through ``print_without_paths``; warnings and
    errors go to the console logger in daemon mode or above the progress bars otherwise.
    """

    def debug(self, msg):
        file_logger.debug(self.__clean_msg(msg))
        # if msg.startswith('[debug] '):
        #     pass
        # Download progress lines are left to the progress bars.
        if '[download]' in msg:
            return
        print_without_paths(msg)

    def info(self, msg):
        file_logger.info(self.__clean_msg(msg))
        print_without_paths(msg)

    def warning(self, msg):
        self.__relay('warning', msg)

    def error(self, msg):
        self.__relay('error', msg)

    def __relay(self, level, msg):
        """Record ``msg`` in the log file at ``level`` and surface it to the user unmodified."""
        getattr(file_logger, level)(self.__clean_msg(msg))
        if args.daemon:
            getattr(logger, level)(msg)
        else:
            status_bar.write(msg)

    def __clean_msg(self, msg):
        """Return ``msg`` without ANSI escape sequences."""
        return ansi_escape_regex.sub('', msg)
# TODO: https://github.com/TheFrenchGhosty/TheFrenchGhostys-Ultimate-YouTube-DL-Scripts-Collection/blob/master/docs/Scripts-Type.md#archivist-scripts
# https://github.com/yt-dlp/yt-dlp#embedding-examples
ydl_opts = {
    # TODO: https://github.com/TheFrenchGhosty/TheFrenchGhostys-Ultimate-YouTube-DL-Scripts-Collection/blob/master/docs/Details.md
    # https://old.reddit.com/r/DataHoarder/comments/c6fh4x/after_hoarding_over_50k_youtube_videos_here_is/
    #
    # The selector is one string, split here into one alternative per line.
    # Every video alternative is capped at --max-size MB. Listed order: AV1, VP9.2, VP9, then
    # any codec, first for >=1080p (fps>30 variants first) and then the same ladder for >=720p,
    # ending with any video under the cap.
    'format': (
        '('
        f'bestvideo[filesize<{args.max_size}M][vcodec^=av01][height>=1080][fps>30]/'
        f'bestvideo[filesize<{args.max_size}M][vcodec=vp9.2][height>=1080][fps>30]/'
        f'bestvideo[filesize<{args.max_size}M][vcodec=vp9][height>=1080][fps>30]/'
        f'bestvideo[filesize<{args.max_size}M][vcodec^=av01][height>=1080]/'
        f'bestvideo[filesize<{args.max_size}M][vcodec=vp9.2][height>=1080]/'
        f'bestvideo[filesize<{args.max_size}M][vcodec=vp9][height>=1080]/'
        f'bestvideo[filesize<{args.max_size}M][height>=1080]/'
        f'bestvideo[filesize<{args.max_size}M][vcodec^=av01][height>=720][fps>30]/'
        f'bestvideo[filesize<{args.max_size}M][vcodec=vp9.2][height>=720][fps>30]/'
        f'bestvideo[filesize<{args.max_size}M][vcodec=vp9][height>=720][fps>30]/'
        f'bestvideo[filesize<{args.max_size}M][vcodec^=av01][height>=720]/'
        f'bestvideo[filesize<{args.max_size}M][vcodec=vp9.2][height>=720]/'
        f'bestvideo[filesize<{args.max_size}M][vcodec=vp9][height>=720]/'
        f'bestvideo[filesize<{args.max_size}M][height>=720]/'
        f'bestvideo[filesize<{args.max_size}M]'
        ')+(bestaudio[acodec=opus]/bestaudio)/best'
    ),
    'merge_output_format': 'mkv',
    'logtostderr': True,
    'embedchapters': True,
    'writethumbnail': True,
    # Save the thumbnail to a file. Embedding seems to be broken right now so this is an alternative.
    'embedthumbnail': True,
    'embeddescription': True,
    'writesubtitles': True,
    # 'allsubtitles': True,  # Download every language.
    'subtitlesformat': 'vtt',
    'subtitleslangs': ['en'],
    'writeautomaticsub': True,
    'writedescription': True,
    'ignoreerrors': True,
    'continuedl': False,
    'addmetadata': True,
    'writeinfojson': True,
    'verbose': args.verbose,
    'postprocessors': [
        {'key': 'FFmpegEmbedSubtitle'},
        {'key': 'FFmpegMetadata', 'add_metadata': True},
        {'key': 'EmbedThumbnail', 'already_have_thumbnail': True},
        {'key': 'FFmpegThumbnailsConvertor', 'format': 'jpg', 'when': 'before_dl'},
        # {'key': 'FFmpegSubtitlesConvertor', 'format': 'srt'}
    ],
    # 'external_downloader': 'aria2c',
    # 'external_downloader_args': ['-j 32', '-s 32', '-x 16', '--file-allocation=none', '--optimize-concurrent-downloads=true', '--http-accept-gzip=true', '--continue=true'],
}
# yt-dlp wrapper used in the main process; it gets the shared options plus the console/file logger.
yt_dlp = YDL({**ydl_opts, 'logger': ytdl_logger()})

# Total number of input URLs across all output directories (total of the 'Inputs' bar).
url_count = sum(len(directory_urls) for directory_urls in url_list.values())

# Running totals reported after every pass of the main loop.
encountered_errors = 0
errored_videos = 0

# Init bars
video_bars = manager.list()
if not args.daemon:
    # One [position, lock] slot per worker; slots start at position 3, below the
    # inputs (0), playlist (1) and status (2) bars.
    for slot in range(args.threads):
        video_bars.append([3 + slot, manager.Lock()])

    # The video progress bars have an issue where when a bar is closed it
    # will shift its position back 1 then return to the correct position.
    # This thread will clear empty spots.
    eraser_exit = manager.Value(bool, False)
    Thread(target=bar_eraser, args=(video_bars, eraser_exit,)).start()

# Guards --erase-downloaded-tracker so the erase is attempted only once per run.
already_erased_downloaded_tracker = False
# Main loop: one pass downloads every input URL; in daemon mode the pass is repeated
# after sleeping --sleep minutes, otherwise the loop ends after the first pass.
while True:
    # do_update()  # this doesn't work very well. freezes
    progress_bar = tqdm(total=url_count, position=0, desc='Inputs', disable=args.daemon,
                        bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt}')

    # Own bookkeeping for the rate-limit sleep. It counts URLs (not output directories)
    # and does not rely on progress_bar.n, so it also works while the bars are disabled.
    urls_in_pass = sum(len(directory_urls) for directory_urls in url_list.values())
    urls_done = 0

    for output_path, urls in url_list.items():
        for target_url in urls:
            logger.info('Fetching playlist...')
            playlist = yt_dlp.playlist_contents(str(target_url))
            if not playlist:
                progress_bar.update()
                urls_done += 1
                continue

            # Re-read the input. The running `for` keeps iterating the mapping it started
            # with, so changes to the input file take effect on the next pass.
            url_list = load_input_file()

            # One archive file per playlist, named after the playlist ID.
            download_archive_file = args.download_cache_file_directory / (str(playlist['id']) + '.log')
            # NOTE(review): the flag is set after the first attempt, so only the first
            # playlist processed in a run has its tracker erased -- confirm this is intended.
            if args.erase_downloaded_tracker and not already_erased_downloaded_tracker:
                if download_archive_file.exists():
                    os.remove(download_archive_file)
                already_erased_downloaded_tracker = True
            downloaded_videos = load_existing_videos()

            msg = f'Found {len(downloaded_videos)} downloaded videos for playlist "{playlist["title"]}" ({playlist["id"]}). {"Ignoring." if args.ignore_downloaded else ""}'
            if args.daemon:
                logger.info(msg)
            else:
                progress_bar.write(msg)

            download_archive_logger = setup_file_logger('download_archive', download_archive_file,
                                                        format_str='%(message)s')

            playlist['entries'] = remove_duplicates_from_playlist(playlist['entries'])

            log_info_twice(f'Downloading item: "{playlist["title"]}" ({playlist["id"]}) {target_url}')

            # Remove already downloaded files from the to-do list.
            if args.ignore_downloaded:
                download_queue = list(playlist['entries'])
            else:
                download_queue = [video for video in playlist['entries']
                                  if video['id'] not in downloaded_videos]

            playlist_bar = tqdm(total=len(playlist['entries']), position=1,
                                desc=f'"{playlist["title"]}" ({playlist["id"]})', disable=args.daemon, leave=False)
            if not args.ignore_downloaded:
                # Advance by the entries of THIS playlist that are being skipped. The archive
                # may list videos that are no longer in the playlist, so its size can be larger.
                playlist_bar.update(len(playlist['entries']) - len(download_queue))

            playlist_ydl_opts = ydl_opts.copy()
            # playlist_ydl_opts['outtmpl'] = f'{output_path}/{get_output_templ()}'

            if len(download_queue):  # Don't mess with multiprocessing if all videos are already downloaded
                # Settings handed to every download_video() call alongside the video itself.
                job_options = {
                    'bars': video_bars,
                    'ydl_opts': playlist_ydl_opts,
                    'output_dir': Path(output_path),
                    'ignore_downloaded': args.ignore_downloaded,
                    'verify': args.verify
                }
                with Pool(processes=args.threads) as pool:
                    if sys.stdout.isatty():
                        # Doesn't work if not connected to a terminal:
                        # OSError: [Errno 25] Inappropriate ioctl for device
                        status_bar.set_description_str('=' * os.get_terminal_size()[0])
                    logger.info('Starting downloads...')
                    for result in pool.imap_unordered(download_video,
                                                      ((video, job_options) for video in download_queue)):
                        # Save the video ID to the file
                        if result['downloaded_video_id']:
                            download_archive_logger.info(result['downloaded_video_id'])

                        # Count each video that reported any critical error once, so the
                        # summary line no longer always says "on 0 videos".
                        if result['video_critical_err_msg_short'] or result['video_critical_err_msg']:
                            errored_videos += 1

                        # Print short error messages.
                        # An error should never be added to both video_critical_err_msg_short and video_critical_err_msg.
                        for line in result['video_critical_err_msg_short']:
                            msg = f"{result['video_id']} - {line}"
                            video_error_logger.error(msg)
                            file_logger.error(msg)
                            encountered_errors += 1
                            if args.daemon:
                                logger.error(msg)
                            else:
                                status_bar.write(msg)

                        # Print longer error messages.
                        # Won't print anything to console if the silence_errors arg is set.
                        for line in result['video_critical_err_msg']:
                            msg = f"{result['video_id']} - {line}"
                            video_error_logger.error(msg)
                            file_logger.error(msg)
                            encountered_errors += 1
                            if not args.silence_errors:
                                if args.daemon:
                                    logger.error(msg)
                                else:
                                    status_bar.write(msg)

                        for line in result['logger_msg']:
                            log_info_twice(f"{result['video_id']} - {line}")

                        # TODO: if no error launch a verify multiprocess
                        playlist_bar.update()
            else:
                msg = f"All videos already downloaded for \"{playlist['title']}\"."
                if args.daemon:
                    logger.info(msg)
                else:
                    status_bar.write(msg)
            log_info_twice(f"Finished item: '{playlist['title']}' {target_url}")

            # Sleep a bit to prevent rate-limiting, except after the last URL of the pass.
            urls_done += 1
            if urls_done < urls_in_pass:
                status_bar.set_description_str(f'Sleeping {args.ratelimit_sleep}s...')
                time.sleep(args.ratelimit_sleep)
            progress_bar.update()

    error_msg = f'Encountered {encountered_errors} errors on {errored_videos} videos.'
    if args.daemon:
        logger.info(error_msg)
    else:
        status_bar.write(error_msg)
    log_info_twice(f"Finished process in {round(math.ceil(time.time() - start_time) / 60, 2)} min.")

    if not args.daemon:
        break
    logger.info(f'Sleeping for {args.sleep} min.')
    try:
        time.sleep(args.sleep * 60)
    except KeyboardInterrupt:
        # Ctrl+C while idle is the normal way to stop the daemon.
        sys.exit(0)
# Clean up the remaining bars. Have to close them in order.
# These variables may be undefined so we will just ignore any errors.
# Not in one try/catch because we don't want to skip anything.
# Each step has its own try block so that one missing object does not skip the others.

# Tell the bar-eraser thread to stop; the flag is only created when progress bars are enabled.
try:
    eraser_exit.value = True
except (NameError, AttributeError):
    pass

# The playlist bar only exists if at least one playlist was fetched.
try:
    playlist_bar.close()
except (NameError, AttributeError):
    pass

try:
    status_bar.close()
except (NameError, AttributeError):
    pass