2023-01-20 21:42:36 -07:00
#!/usr/bin/env python3
import argparse
import logging . config
import math
import os
import re
import subprocess
import sys
import time
from multiprocessing import Manager , Pool , cpu_count
2023-02-02 20:35:37 -07:00
from pathlib import Path
from threading import Thread
2023-01-20 22:47:18 -07:00
2023-02-02 20:35:37 -07:00
import yaml
from appdirs import user_data_dir
2023-01-20 22:47:18 -07:00
from tqdm . auto import tqdm
2023-01-21 18:19:03 -07:00
import ydl . yt_dlp as ydl
from process . funcs import get_silent_logger , remove_duplicates_from_playlist , restart_program , setup_file_logger
2023-02-02 20:35:37 -07:00
from process . threads import bar_eraser , download_video
2023-01-21 18:19:03 -07:00
from ydl . files import create_directories , resolve_path
# logging.basicConfig(level=1000)
# logging.getLogger().setLevel(1000)
2023-01-20 21:42:36 -07:00
2023-02-02 20:35:37 -07:00
# Pre-compiled validator for http(s)/ftp(s) URLs: scheme, then a domain name,
# "localhost", or a dotted-quad IP, then an optional port and path.
urlRegex = re.compile(
    r'^(?:http|ftp)s?://'  # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$',
    re.IGNORECASE,
)
2023-01-20 21:42:36 -07:00
# ---------------------------------------------------------------------------
# Command-line interface.
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser()
parser.add_argument('file', help='URL to download or path of a file containing the URLs of the videos to download.')
parser.add_argument('output', help='Output directory. Ignored paths specified in a YAML file.')
parser.add_argument('--no-update', '-n', action='store_true', help="Don't update yt-dlp at launch.")
parser.add_argument('--max-size', type=int, default=1100, help='Max allowed size of a video in MB.')
parser.add_argument('--rm-cache', '-r', action='store_true', help='Delete the yt-dlp cache on start.')
parser.add_argument('--threads', type=int, default=cpu_count(), help='How many download processes to use.')
parser.add_argument('--daemon', '-d', action='store_true',
                    help="Run in daemon mode. Disables progress bars sleeps for the amount of time specified in --sleep.")
parser.add_argument('--sleep', type=float, default=60, help='How many minutes to sleep when in daemon mode.')
parser.add_argument('--download-cache-file-directory', default=user_data_dir('automated-youtube-dl', 'cyberes'),
                    help='The path to the directory to track downloaded videos. Defaults to your appdata path.')
parser.add_argument('--silence-errors', '-s', action='store_true', help="Don't print any error messages to the console.")
parser.add_argument('--ignore-downloaded', '-i', action='store_true',
                    help='Ignore videos that have been already downloaded and let youtube-dl handle everything.')
parser.add_argument('--erase-downloaded-tracker', '-e', action='store_true', help='Erase the tracked video file.')
parser.add_argument('--ratelimit-sleep', type=int, default=5, help='How many seconds to sleep to prevent rate-limiting.')
parser.add_argument('--input-datatype', choices=['auto', 'txt', 'yaml'], default='auto',
                    help='The datatype of the input file. If set to auto, the file will be scanned for a URL on the firstline. '
                         'If is a URL, the filetype will be set to txt. If it is a key: value pair then the filetype will be set to yaml.')
args = parser.parse_args()

# A zero/negative worker count makes the process pool unusable; bail out early.
if args.threads <= 0:
    print("Can't have 0 threads!")
    sys.exit(1)

# Normalize user-supplied paths to absolute, resolved paths.
args.output = resolve_path(args.output)
args.download_cache_file_directory = resolve_path(args.download_cache_file_directory)
2023-01-20 21:42:36 -07:00
log_time = time.time()

# Get the URLs of the videos to download. Is the input a URL or file?
url_list = {}
if not re.match(urlRegex, str(args.file)) or args.input_datatype in ('txt', 'yaml'):
    # Input is a file path (or the datatype was forced): read the URLs from it.
    args.file = resolve_path(args.file)
    if not args.file.exists():
        print('Input file does not exist:', args.file)
        sys.exit(1)
    input_file = [x.strip().strip('\n') for x in list(args.file.open())]

    # Decide the file format: an explicit flag wins; otherwise sniff the first line.
    if args.input_datatype == 'yaml' or (re.match(r'^.*?:\w*', input_file[0]) and args.input_datatype == 'auto'):
        # YAML maps output directories -> lists of URLs.
        with open(args.file, 'r') as file:
            try:
                url_list = yaml.safe_load(file)
            except yaml.YAMLError as e:
                print('Failed to load config file, error:', e)
                sys.exit(1)
    elif args.input_datatype == 'txt' or (re.match(urlRegex, input_file[0]) and args.input_datatype == 'auto'):
        # Plain text: every line is a URL, all going to the single output directory.
        url_list[str(args.output)] = input_file
    else:
        print('Unknown file type:', args.input_datatype)
        print(input_file)
        sys.exit(1)
    del input_file  # the raw line list is no longer needed

    # Verify each line in the file is a valid URL.
    for directory, urls in url_list.items():
        for item in urls:
            if not re.match(urlRegex, str(item)):
                print(f'Not a url:', item)
                sys.exit(1)
else:
    # Input is a single URL given directly on the command line.
    url_list[str(args.output)] = [args.file]

# Create directories AFTER loading the file so we don't litter on bad input.
create_directories(*url_list.keys(), args.download_cache_file_directory)
2023-01-20 21:42:36 -07:00
2023-02-01 13:00:48 -07:00
def do_update():
    """Update yt-dlp (unless --no-update was given) and restart if a new version was installed."""
    if args.no_update:
        return
    print('Checking if yt-dlp needs to be updated...')
    if ydl.update_ytdlp():
        print('Restarting program...')
        restart_program()
2023-01-20 21:42:36 -07:00
if args.rm_cache:
    # Let yt-dlp delete its own cache directory.
    subprocess.run('yt-dlp --rm-cache-dir', shell=True)

if args.daemon:
    print('Running in daemon mode.')

# Per-run log files live under the output directory.
log_dir = args.output / 'logs'
create_directories(args.output, log_dir)

# TODO: log file rotation https://www.blog.pythonlibrary.org/2014/02/11/python-how-to-create-rotating-logs/
# TODO: log to one file instead of one for each run
file_logger = setup_file_logger('youtube_dl', log_dir / f'youtube_dl-{str(int(log_time))}.log', level=logging.INFO)
video_error_logger = setup_file_logger('youtube_dl_video_errors', log_dir / f'youtube_dl-errors-{int(log_time)}.log', level=logging.INFO)

# Console logger is muted when progress bars are active (non-daemon mode).
logger = get_silent_logger('yt-dl', silent=not args.daemon)
def log_info_twice(msg):
    """Log *msg* at INFO level to both the console logger and the run's log file."""
    logger.info(msg)
    file_logger.info(msg)


log_info_twice('Starting process.')
start_time = time.time()
manager = Manager()
2023-01-21 18:19:03 -07:00
def load_existing_videos():
    """Return the set of already-downloaded video IDs read from ``download_archive_file``.

    Creates the archive file first if it does not exist yet.
    """
    if not download_archive_file.exists():
        download_archive_file.touch()
    with open(download_archive_file, 'r') as file:
        return {line.rstrip() for line in file}
2023-02-02 20:35:37 -07:00
# Single-line status display pinned below the progress bars (hidden in daemon mode).
status_bar = tqdm(position=2, bar_format='{desc}', disable=args.daemon, leave=False)


def log_bar(msg, level):
    """Write *msg* above the status bar and mirror it to the console logger at *level*."""
    status_bar.write(f'[{level}] {msg}')
    if level == 'warning':
        logger.warning(msg)
    elif level == 'error':
        logger.error(msg)
    else:
        logger.info(msg)
2023-01-20 21:42:36 -07:00
def print_without_paths(msg):
    """
    Remove any filepaths or other stuff we don't want in the message,
    then show it on the status bar (or log it in daemon mode).
    """
    # Keep only the leading portion of the message before any path-like content.
    m = re.match(r'(^[^\/]+(?:\\.[^\/]*)*)', msg)
    if m:
        msg = m.group(1)
    # NOTE(review): str.strip() treats its argument as a character SET, not a
    # suffix — 'to "' strips any of {t, o, space, "} from both ends. Preserved
    # as-is to keep behavior; confirm whether removesuffix() was intended.
    msg = msg.strip('to "').strip('to:').strip()
    if args.daemon:
        log_info_twice(msg)
    else:
        status_bar.set_description_str(msg)
2023-01-20 21:42:36 -07:00
class ytdl_logger(object):
    """Logger object handed to yt-dlp.

    Every message is written to the run's log file; non-download chatter is
    also surfaced on the status bar, and warnings/errors go through log_bar.
    """

    def debug(self, msg):
        file_logger.debug(msg)
        # if msg.startswith('[debug] '):
        #     pass
        if '[download]' not in msg:
            print_without_paths(msg)

    def info(self, msg):
        file_logger.info(msg)
        print_without_paths(msg)

    def warning(self, msg):
        file_logger.warning(msg)
        log_bar(msg, 'warning')

    def error(self, msg):
        file_logger.error(msg)
        log_bar(msg, 'error')
# https://github.com/yt-dlp/yt-dlp#embedding-examples
# Format selector prefers AV1 > VP9.2 > VP9 at 1080p+, then 720p+, always
# capped at --max-size, with opus audio when available.
ydl_opts = {
    'format': f'(bestvideo[filesize<{args.max_size}M][vcodec^=av01][height>=1080][fps>30]/bestvideo[filesize<{args.max_size}M][vcodec=vp9.2][height>=1080][fps>30]/bestvideo[filesize<{args.max_size}M][vcodec=vp9][height>=1080][fps>30]/bestvideo[filesize<{args.max_size}M][vcodec^=av01][height>=1080]/bestvideo[filesize<{args.max_size}M][vcodec=vp9.2][height>=1080]/bestvideo[filesize<{args.max_size}M][vcodec=vp9][height>=1080]/bestvideo[filesize<{args.max_size}M][height>=1080]/bestvideo[filesize<{args.max_size}M][vcodec^=av01][height>=720][fps>30]/bestvideo[filesize<{args.max_size}M][vcodec=vp9.2][height>=720][fps>30]/bestvideo[filesize<{args.max_size}M][vcodec=vp9][height>=720][fps>30]/bestvideo[filesize<{args.max_size}M][vcodec^=av01][height>=720]/bestvideo[filesize<{args.max_size}M][vcodec=vp9.2][height>=720]/bestvideo[filesize<{args.max_size}M][vcodec=vp9][height>=720]/bestvideo[filesize<{args.max_size}M][height>=720]/bestvideo[filesize<{args.max_size}M])+(bestaudio[acodec=opus]/bestaudio)/best',
    'outtmpl': f'{args.output}/[%(id)s] [%(title)s] [%(uploader)s] [%(uploader_id)s].%(ext)s',  # leading dash can cause issues due to bash args so we surround the variables in brackets
    'merge_output_format': 'mkv',
    'logtostderr': True,
    'embedchapters': True,
    'writethumbnail': True,  # Save the thumbnail to a file. Embedding seems to be broken right now so this is an alternative.
    'embedthumbnail': True,
    'embeddescription': True,
    'writesubtitles': True,
    # 'allsubtitles': True,  # Download every language.
    'subtitlesformat': 'vtt',
    'subtitleslangs': ['en'],
    'writeautomaticsub': True,
    'writedescription': True,
    'ignoreerrors': True,
    'continuedl': False,
    'addmetadata': True,
    'writeinfojson': True,
    'postprocessors': [
        {'key': 'FFmpegEmbedSubtitle'},
        {'key': 'FFmpegMetadata', 'add_metadata': True},
        {'key': 'EmbedThumbnail', 'already_have_thumbnail': True},
        # {'key': 'FFmpegSubtitlesConvertor', 'format': 'srt'}
    ],
}

# The main (playlist-fetching) instance gets our custom logger; worker
# processes receive the bare ydl_opts and attach their own.
main_opts = {**ydl_opts, 'logger': ytdl_logger()}
yt_dlp = ydl.YDL(main_opts)
2023-02-02 20:35:37 -07:00
# Total number of input URLs across every output directory.
url_count = sum(len(urls) for urls in url_list.values())

# Init bars
progress_bar = tqdm(total=url_count, position=0, desc='Inputs', disable=args.daemon)
video_bars = manager.list()
if not args.daemon:
    # Reserve one terminal row (plus a lock) per worker, below the three fixed bars.
    for i in range(args.threads):
        video_bars.append([3 + i, manager.Lock()])

encountered_errors = 0
errored_videos = 0

# The video progress bars have an issue where when a bar is closed it will shift its position back 1 then return to the correct position.
# This thread will clear empty spots.
if not args.daemon:
    eraser_exit = manager.Value(bool, False)
    Thread(target=bar_eraser, args=(video_bars, eraser_exit,)).start()
2023-01-21 18:19:03 -07:00
2023-02-02 20:35:37 -07:00
# ---------------------------------------------------------------------------
# Main loop: one pass over every playlist, repeated forever in daemon mode.
# ---------------------------------------------------------------------------
while True:
    do_update()
    for output_path, urls in url_list.items():
        for target_url in urls:
            logger.info('Fetching playlist...')
            playlist = yt_dlp.playlist_contents(str(target_url))
            if not playlist:
                progress_bar.update()
                continue

            # One archive file per playlist ID tracks which videos are done.
            download_archive_file = args.download_cache_file_directory / (str(playlist['id']) + '.log')
            if args.erase_downloaded_tracker:
                if download_archive_file.exists():
                    os.remove(download_archive_file)
            downloaded_videos = load_existing_videos()

            msg = f'Found {len(downloaded_videos)} downloaded videos for playlist "{playlist["title"]}" ({playlist["id"]}). {"Ignoring." if args.ignore_downloaded else ""}'
            if args.daemon:
                print(msg)
            else:
                status_bar.write(msg)

            download_archive_logger = setup_file_logger('download_archive', download_archive_file, format_str='%(message)s')
            playlist['entries'] = remove_duplicates_from_playlist(playlist['entries'])
            log_info_twice(f'Downloading item: "{playlist["title"]}" ({playlist["id"]}) {target_url}')

            # Remove already downloaded files from the to-do list.
            # NOTE(review): download_queue holds video dicts, so the membership
            # test against video['id'] is always True; dedup is effectively
            # handled by remove_duplicates_from_playlist above. Preserved as-is.
            download_queue = []
            for p, video in enumerate(playlist['entries']):
                if video['id'] not in download_queue:
                    if not args.ignore_downloaded and video['id'] not in downloaded_videos:
                        download_queue.append(video)
                        # downloaded_videos.add(video['id'])
                    elif args.ignore_downloaded:
                        download_queue.append(video)

            playlist_bar = tqdm(total=len(playlist['entries']), position=1, desc=f'"{playlist["title"]}" ({playlist["id"]})', disable=args.daemon, leave=False)
            if not args.ignore_downloaded:
                playlist_bar.update(len(downloaded_videos))

            if len(download_queue):  # Don't mess with multiprocessing if all videos are already downloaded
                with Pool(processes=args.threads) as pool:
                    if sys.stdout.isatty():
                        # Doesn't work if not connected to a terminal:
                        # OSError: [Errno 25] Inappropriate ioctl for device
                        status_bar.set_description_str('=' * os.get_terminal_size()[0])
                    logger.info('Starting downloads...')
                    for result in pool.imap_unordered(download_video, ((video, {'bars': video_bars, 'ydl_opts': ydl_opts, 'output_dir': Path(output_path), }) for video in download_queue)):
                        # Save the video ID to the file
                        if result['downloaded_video_id']:
                            download_archive_logger.info(result['downloaded_video_id'])
                        # Print stuff
                        for line in result['video_error_logger_msg']:
                            video_error_logger.info(line)
                            file_logger.error(line)
                            encountered_errors += 1
                            if not args.silence_errors:
                                if args.daemon:
                                    logger.error(line)
                                else:
                                    status_bar.write(line)
                        if len(result['video_error_logger_msg']):
                            errored_videos += 1
                            if args.silence_errors and args.daemon:
                                logger.error(f"{result['video_id']} failed due to error.")
                        # for line in result['status_msg']:
                        #     playlist_bar.write(line)
                        for line in result['logger_msg']:
                            log_info_twice(line)
                        playlist_bar.update()
            else:
                status_bar.write(f"All videos already downloaded for '{playlist['title']}'.")

            log_info_twice(f"Finished item: '{playlist['title']}' {target_url}")
            # Sleep a bit to prevent rate-limiting
            if progress_bar.n < len(url_list.keys()) - 1:
                status_bar.set_description_str(f'Sleeping {args.ratelimit_sleep}s...')
                time.sleep(args.ratelimit_sleep)
            progress_bar.update()

    error_msg = f'Encountered {encountered_errors} errors on {errored_videos} videos.'
    if args.daemon:
        logger.info(error_msg)
    else:
        status_bar.write(error_msg)
    log_info_twice(f"Finished process in {round(math.ceil(time.time() - start_time) / 60, 2)} min.")

    if not args.daemon:
        break
    else:
        logger.info(f'Sleeping for {args.sleep} min.')
        try:
            time.sleep(args.sleep * 60)
        except KeyboardInterrupt:
            sys.exit()
        # downloaded_videos = load_existing_videos()  # reload the videos that have already been downloaded
2023-01-20 22:47:18 -07:00
# Clean up the remaining bars. Have to close them in order.
# Only reached in non-daemon mode (the while-loop breaks), where the eraser
# thread and bars exist.
eraser_exit.value = True
playlist_bar.close()
status_bar.close()