better config file, better printing

Cyberes 2023-02-02 20:35:37 -07:00
parent 915d7f89f0
commit cc29f29ae9
9 changed files with 238 additions and 125 deletions

2
.gitignore vendored
View File

@@ -1,4 +1,6 @@
.idea
targets.*
!targets.sample.*
# ---> Python
# Byte-compiled / optimized / DLL files

View File

@@ -53,17 +53,18 @@ Output Directory/
Videos will be saved using this name format:
```
%(title)s --- %(uploader)s --- %(uploader_id)s --- %(id)s
[%(id)s] [%(title)s] [%(uploader)s] [%(uploader_id)s]
```
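For a video with hypothetical metadata (example values only, not taken from this repository), the new template would produce a filename along the lines of:
```
[dQw4w9WgXcQ] [An Example Video] [Example Uploader] [UCexampleuploader123].mkv
```
The extension depends on the format yt-dlp actually downloads.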
#### Arguments
| Argument | Flag | Help |
| ------------- | ---- | ------------------------------------------------------------ |
| `--no-update` | `-n` | Don't update yt-dlp at launch. |
| `--max-size` | | Max allowed size of a video in MB. Default: 1100. |
| `--rm-cache` | `-r` | Delete the yt-dlp cache on start. |
| `--threads` | | How many download processes to use (threads). Default is how many CPU cores you have. You will want to find a good value that doesn't overload your connection. |
| `--daemon` | `-d` | Run in daemon mode. Disables progress bars and sleeps for the amount of time specified in --sleep. |
| `--sleep` | | How many minutes to sleep when in daemon mode. |
| `--silence-errors` | `-s` | Don't print any error messages to the console. |
| Argument | Flag | Help |
| --------------------- | ---- | ------------------------------------------------------------ |
| `--no-update` | `-n` | Don't update yt-dlp at launch. |
| `--max-size` | | Max allowed size of a video in MB. Default: 1100. |
| `--rm-cache` | `-r` | Delete the yt-dlp cache on start. |
| `--threads` | | How many download processes to use (threads). Default is how many CPU cores you have. You will want to find a good value that doesn't overload your connection. |
| `--daemon` | `-d` | Run in daemon mode. Disables progress bars and sleeps for the amount of time specified in --sleep. |
| `--sleep` | | How many minutes to sleep when in daemon mode. |
| `--silence-errors` | `-s` | Don't print any error messages to the console. |
| `--ignore-downloaded` | `-i` | Ignore videos that have already been downloaded and let youtube-dl handle everything. Videos will not be re-downloaded, but metadata will be updated. |
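As a rough usage example (the script name `downloader.py` and the paths here are assumptions, not taken from this commit), a daemon-mode run against a YAML target file might look like:
```
python3 downloader.py targets.yaml /mnt/media/youtube --daemon --sleep 30 --threads 4
```
When the input is a YAML file, each playlist's output directory comes from the file itself, so the positional output directory is effectively ignored.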

View File

@@ -8,36 +8,44 @@ import subprocess
import sys
import time
from multiprocessing import Manager, Pool, cpu_count
from pathlib import Path
from threading import Thread
import yaml
from appdirs import user_data_dir
from tqdm.auto import tqdm
import ydl.yt_dlp as ydl
from process.funcs import get_silent_logger, remove_duplicates_from_playlist, restart_program, setup_file_logger
from process.threads import download_video
from process.threads import bar_eraser, download_video
from ydl.files import create_directories, resolve_path
# logging.basicConfig(level=1000)
# logging.getLogger().setLevel(1000)
urlRegex = re.compile(
r'^(?:http|ftp)s?://' # http:// or https://
r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # domain...
r'localhost|' # localhost...
r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
r'(?::\d+)?' # optional port
r'(?:/?|[/?]\S+)$', re.IGNORECASE)
urlRegex = re.compile(r'^(?:http|ftp)s?://' # http:// or https://
r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # domain...
r'localhost|' # localhost...
r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
r'(?::\d+)?' # optional port
r'(?:/?|[/?]\S+)$', re.IGNORECASE)
parser = argparse.ArgumentParser()
parser.add_argument('file', help='URL to download or path of a file containing the URLs of the videos to download.')
parser.add_argument('output', help='Output directory.')
parser.add_argument('output', help='Output directory. Ignored if the output paths are specified in a YAML input file.')
parser.add_argument('--no-update', '-n', action='store_true', help='Don\'t update yt-dlp at launch.')
parser.add_argument('--max-size', type=int, default=1100, help='Max allowed size of a video in MB.')
parser.add_argument('--rm-cache', '-r', action='store_true', help='Delete the yt-dlp cache on start.')
parser.add_argument('--threads', type=int, default=cpu_count(), help='How many download processes to use.')
parser.add_argument('--daemon', '-d', action='store_true', help="Run in daemon mode. Disables progress bars and sleeps for the amount of time specified in --sleep.")
parser.add_argument('--sleep', type=float, default=60, help='How many minutes to sleep when in daemon mode.')
parser.add_argument('--download-cache-file-directory', default=user_data_dir('automated-youtube-dl', 'cyberes'), help='The path to the directory to track downloaded videos. Defaults to your appdata path.')
parser.add_argument('--silence-errors', '-s', action='store_true', help="Don't print any error messages to the console.")
parser.add_argument('--ignore-downloaded', '-i', action='store_true', help='Ignore videos that have been already downloaded and let YouTubeDL handle everything.')
parser.add_argument('--ignore-downloaded', '-i', action='store_true', help='Ignore videos that have already been downloaded and let youtube-dl handle everything.')
parser.add_argument('--erase-downloaded-tracker', '-e', action='store_true', help='Erase the tracked video file.')
parser.add_argument('--ratelimit-sleep', type=int, default=5, help='How many seconds to sleep to prevent rate-limiting.')
parser.add_argument('--input-datatype', choices=['auto', 'txt', 'yaml'], default='auto', help='The datatype of the input file. If set to auto, the file will be scanned for a URL on the first line. '
'If it is a URL, the filetype will be set to txt. If it is a key: value pair then the filetype will be set to yaml.')
args = parser.parse_args()
if args.threads <= 0:
@@ -45,22 +53,42 @@ if args.threads <= 0:
sys.exit(1)
args.output = resolve_path(args.output)
args.download_cache_file_directory = resolve_path(args.download_cache_file_directory)
log_time = time.time()
# Get the URLs of the videos to download. Is the input a URL or file?
if not re.match(urlRegex, str(args.file)):
url_list = {}
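# url_list maps an output directory (as a string) to the list of URLs to download into it; a plain-text input produces a single entry keyed by args.output.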
if not re.match(urlRegex, str(args.file)) or args.input_datatype in ('txt', 'yaml'):
args.file = resolve_path(args.file)
if not args.file.exists():
print('Input file does not exist:', args.file)
sys.exit(1)
url_list = [x.strip().strip('\n') for x in list(args.file.open())]
input_file = [x.strip().strip('\n') for x in list(args.file.open())]
if args.input_datatype == 'yaml' or (re.match(r'^.*?:\w*', input_file[0]) and args.input_datatype == 'auto'):
with open(args.file, 'r') as file:
try:
url_list = yaml.safe_load(file)
except yaml.YAMLError as e:
print('Failed to load config file, error:', e)
sys.exit(1)
elif args.input_datatype == 'txt' or (re.match(urlRegex, input_file[0]) and args.input_datatype == 'auto'):
url_list[str(args.output)] = input_file
else:
print('Unknown file type:', args.input_datatype)
print(input_file)
sys.exit(1)
del input_file  # the raw line list is no longer needed
# Verify each line in the file is a valid URL.
for i, line in enumerate(url_list):
if not re.match(urlRegex, line):
print(f'Line {i} not a url:', line)
sys.exit(1)
for directory, urls in url_list.items():
for item in urls:
if not re.match(urlRegex, str(item)):
print(f'Not a url:', item)
sys.exit(1)
else:
url_list = [args.file]
url_list[str(args.output)] = [args.file]
# Create directories AFTER loading the file
create_directories(*url_list.keys(), args.download_cache_file_directory)
def do_update():
@@ -98,8 +126,6 @@ start_time = time.time()
manager = Manager()
download_archive_file = args.output / 'download-archive.log'
def load_existing_videos():
# Find existing videos.
@@ -111,13 +137,7 @@ def load_existing_videos():
return output
downloaded_videos = load_existing_videos()
print('Found', len(downloaded_videos), 'downloaded videos.')
# Create this object AFTER reading in the download_archive.
download_archive_logger = setup_file_logger('download_archive', download_archive_file, format_str='%(message)s')
status_bar = tqdm(position=2, bar_format='{desc}', disable=args.daemon)
status_bar = tqdm(position=2, bar_format='{desc}', disable=args.daemon, leave=False)
def log_bar(msg, level):
@@ -195,94 +215,118 @@ ydl_opts = {
}
main_opts = dict(ydl_opts, **{'logger': ytdl_logger()})
# thread_opts = dict(ydl_opts, **{'logger': ydl.ytdl_no_logger()})
yt_dlp = ydl.YDL(main_opts)
url_count = 0
for k, v in url_list.items():
for item in v:
url_count += 1
# Init bars
playlist_bar = tqdm(position=1, desc='Playlist', disable=args.daemon)
progress_bar = tqdm(total=url_count, position=0, desc='Inputs', disable=args.daemon)
video_bars = manager.list()
if not args.daemon:
for i in range(args.threads):
video_bars.append([
3 + i,
manager.Lock()
])
video_bars.append([3 + i, manager.Lock()])
encountered_errors = 0
errored_videos = 0
# The video progress bars have an issue: when a bar is closed it shifts its position back by 1 and then returns to the correct position.
# This thread clears the empty rows left behind.
if not args.daemon:
eraser_exit = manager.Value(bool, False)
Thread(target=bar_eraser, args=(video_bars, eraser_exit,)).start()
while True:
do_update()
for i, target_url in tqdm(enumerate(url_list), total=len(url_list), position=0, desc='Inputs', disable=args.daemon):
logger.info('Fetching playlist...')
playlist = yt_dlp.playlist_contents(target_url)
playlist['entries'] = remove_duplicates_from_playlist(playlist['entries'])
encountered_errors = 0
errored_videos = 0
for output_path, urls in url_list.items():
for target_url in urls:
logger.info('Fetching playlist...')
playlist = yt_dlp.playlist_contents(str(target_url))
if not playlist:
progress_bar.update()
continue
log_info_twice(f"Downloading item: '{playlist['title']}' {target_url}")
download_archive_file = args.download_cache_file_directory / (str(playlist['id']) + '.log')
if args.erase_downloaded_tracker:
if download_archive_file.exists():
os.remove(download_archive_file)
downloaded_videos = load_existing_videos()
playlist_bar.total = len(playlist['entries'])
playlist_bar.set_description(playlist['title'])
msg = f'Found {len(downloaded_videos)} downloaded videos for playlist "{playlist["title"]}" ({playlist["id"]}). {"Ignoring." if args.ignore_downloaded else ""}'
if args.daemon:
print(msg)
else:
status_bar.write(msg)
download_archive_logger = setup_file_logger('download_archive', download_archive_file, format_str='%(message)s')
# print(playlist['entries'][0])
# sys.exit()
playlist['entries'] = remove_duplicates_from_playlist(playlist['entries'])
# Remove already downloaded files from the to-do list.
if not args.ignore_downloaded:
log_info_twice(f'Downloading item: "{playlist["title"]}" ({playlist["id"]}) {target_url}')
# Remove already downloaded files from the to-do list.
download_queue = []
s = set()
for p, video in enumerate(playlist['entries']):
if video['id'] not in downloaded_videos and video['id'] not in s:
download_queue.append(video)
s.add(video['id'])
playlist_bar.update(len(downloaded_videos))
if video['id'] not in (v['id'] for v in download_queue):
if not args.ignore_downloaded and video['id'] not in downloaded_videos:
download_queue.append(video)
# downloaded_videos.add(video['id'])
elif args.ignore_downloaded:
download_queue.append(video)
if len(download_queue): # Don't mess with multiprocessing if all videos are already downloaded
with Pool(processes=args.threads) as pool:
if sys.stdout.isatty():
# Doesn't work if not connected to a terminal:
# OSError: [Errno 25] Inappropriate ioctl for device
status_bar.set_description_str('=' * os.get_terminal_size()[0])
logger.info('Starting downloads...')
for result in pool.imap_unordered(download_video,
((video, {
'bars': video_bars,
'ydl_opts': ydl_opts,
'output_dir': args.output,
}) for video in download_queue)):
# Save the video ID to the file
if result['downloaded_video_id']:
download_archive_logger.info(result['downloaded_video_id'])
playlist_bar = tqdm(total=len(playlist['entries']), position=1, desc=f'"{playlist["title"]}" ({playlist["id"]})', disable=args.daemon, leave=False)
if not args.ignore_downloaded:
playlist_bar.update(len(downloaded_videos))
# Print stuff
for line in result['video_error_logger_msg']:
video_error_logger.info(line)
file_logger.error(line)
encountered_errors += 1
if not args.silence_errors:
if args.daemon:
logger.error(line)
else:
playlist_bar.write(line)
if len(download_queue): # Don't mess with multiprocessing if all videos are already downloaded
with Pool(processes=args.threads) as pool:
if sys.stdout.isatty():
# Doesn't work if not connected to a terminal:
# OSError: [Errno 25] Inappropriate ioctl for device
status_bar.set_description_str('=' * os.get_terminal_size()[0])
logger.info('Starting downloads...')
for result in pool.imap_unordered(download_video, ((video, {'bars': video_bars, 'ydl_opts': ydl_opts, 'output_dir': Path(output_path), }) for video in download_queue)):
# Save the video ID to the file
if result['downloaded_video_id']:
download_archive_logger.info(result['downloaded_video_id'])
if len(result['video_error_logger_msg']):
errored_videos += 1
if args.silence_errors and args.daemon:
logger.error(f"{result['video_id']} failed due to error.")
# Print stuff
for line in result['video_error_logger_msg']:
video_error_logger.info(line)
file_logger.error(line)
encountered_errors += 1
if not args.silence_errors:
if args.daemon:
logger.error(line)
else:
status_bar.write(line)
# for line in result['status_msg']:
# playlist_bar.write(line)
for line in result['logger_msg']:
log_info_twice(line)
playlist_bar.update()
else:
playlist_bar.write(f"All videos already downloaded for '{playlist['title']}'.")
if len(result['video_error_logger_msg']):
errored_videos += 1
if args.silence_errors and args.daemon:
logger.error(f"{result['video_id']} failed due to error.")
error_msg = f'Encountered {encountered_errors} errors on {errored_videos} videos.'
if args.daemon:
logger.info(error_msg)
else:
playlist_bar.write(error_msg)
# for line in result['status_msg']:
# playlist_bar.write(line)
for line in result['logger_msg']:
log_info_twice(line)
playlist_bar.update()
else:
status_bar.write(f"All videos already downloaded for '{playlist['title']}'.")
log_info_twice(f"Finished item: '{playlist['title']}' {target_url}")
log_info_twice(f"Finished item: '{playlist['title']}' {target_url}")
# Sleep a bit to prevent rate-limiting
if progress_bar.n < len(url_list.keys()) - 1:
status_bar.set_description_str(f'Sleeping {args.ratelimit_sleep}s...')
time.sleep(args.ratelimit_sleep)
progress_bar.update()
error_msg = f'Encountered {encountered_errors} errors on {errored_videos} videos.'
if args.daemon:
logger.info(error_msg)
else:
status_bar.write(error_msg)
log_info_twice(f"Finished process in {round(math.ceil(time.time() - start_time) / 60, 2)} min.")
if not args.daemon:
break
@@ -292,12 +336,9 @@ while True:
time.sleep(args.sleep * 60)
except KeyboardInterrupt:
sys.exit()
downloaded_videos = load_existing_videos() # reload the videos that have already been downloaded
# Erase the status bar.
status_bar.set_description_str('\x1b[2KDone!')
status_bar.refresh()
# downloaded_videos = load_existing_videos() # reload the videos that have already been downloaded
# Clean up the remaining bars. Have to close them in order.
eraser_exit.value = True
playlist_bar.close()
status_bar.close()

View File

@@ -22,7 +22,7 @@ def restart_program():
os.execl(python, python, *sys.argv)
def setup_file_logger(name, log_file, level=logging.INFO, format_str: str = '%(asctime)s - %(name)s - %(levelname)s - %(message)s', filemode='a', no_console: bool = True):
def setup_file_logger(name, log_file, level=logging.INFO, format_str: str = '%(asctime)s - %(name)s - %(levelname)s - %(message)s', filemode='a'):
formatter = logging.Formatter(format_str)
logger = logging.getLogger(name)

View File

@@ -1,6 +1,8 @@
import math
import os
import time
from multiprocessing import Manager
from threading import Thread
import numpy as np
from tqdm.auto import tqdm
@@ -12,21 +14,25 @@ from process.funcs import setup_file_logger
class ytdl_logger(object):
errors = []
def __init__(self, logger):
def __init__(self, logger=None):
self.logger = logger
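# Passing logger=None makes this a silent, do-nothing logger (used as a dummy for the metadata-only pre-pass below); messages are dropped and no errors are collected.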
def debug(self, msg):
self.logger.info(msg)
if self.logger:
self.logger.info(msg)
def info(self, msg):
self.logger.info(msg)
if self.logger:
self.logger.info(msg)
def warning(self, msg):
self.logger.warning(msg)
if self.logger:
self.logger.warning(msg)
def error(self, msg):
self.logger.error(msg)
self.errors.append(msg)
if self.logger:
self.logger.error(msg)
self.errors.append(msg)
def is_manager_lock_locked(lock) -> bool:
@@ -57,6 +63,7 @@ def download_video(args) -> dict:
bar.set_postfix({
'speed': d['_speed_str'],
'size': f"{d['_downloaded_bytes_str'].strip()}/{d['_total_bytes_str'].strip()}",
'offset': offset
})
video = args[0]
@@ -69,21 +76,28 @@ def download_video(args) -> dict:
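# Claim the first free progress-bar row: each entry in kwargs['bars'] is a [position, Lock] pair, and holding the lock reserves that terminal row for this worker's bar.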
while not locked:
for item in kwargs['bars']:
if not is_manager_lock_locked(item[1]):
locked = item[1].acquire(timeout=0.1) # get the lock ASAP and don't wait if we didn't get it.
locked = item[1].acquire(timeout=0.2) # get the lock ASAP and don't wait if we didn't get it.
offset = item[0]
bar_lock = item[1]
break
kwargs['ydl_opts']['progress_hooks'] = [progress_hook]
desc_width = int(np.round(os.get_terminal_size()[0] * (1 / 4)))
bar = tqdm(total=100, position=(offset if locked else None), desc=f"{video['id']} - {video['title']}".ljust(desc_width)[:desc_width], bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}{postfix}]', leave=False)
bar = tqdm(total=100, position=offset, desc=f"{video['id']} - {video['title']}".ljust(desc_width)[:desc_width], bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}{postfix}]', leave=False)
ylogger = ytdl_logger(setup_file_logger(video['id'], kwargs['output_dir'] / f"[{video['id']}].log"))
kwargs['ydl_opts']['logger'] = ylogger
yt_dlp = ydl.YDL(kwargs['ydl_opts'])
output_dict = {'downloaded_video_id': None, 'video_id': video['id'], 'video_error_logger_msg': [], 'status_msg': [], 'logger_msg': []} # empty object
start_time = time.time()
try:
kwargs['ydl_opts']['logger'] = ytdl_logger() # dummy silent logger
yt_dlp = ydl.YDL(kwargs['ydl_opts'])
try:
base_path = os.path.splitext(yt_dlp.prepare_filename(yt_dlp.extract_info(video['url'], download=False)))[0]
except AttributeError:
# Sometimes we won't be able to pull the video info so just use the video's ID
base_path = kwargs['output_dir'] / video['id']
ylogger = ytdl_logger(setup_file_logger(video['id'], str(base_path) + '.log'))
kwargs['ydl_opts']['logger'] = ylogger
yt_dlp = ydl.YDL(kwargs['ydl_opts']) # recreate the object with the correct logging path
error_code = yt_dlp(video['url']) # Do the download
if not error_code:
elapsed = round(math.ceil(time.time() - start_time) / 60, 2)
@@ -96,7 +110,43 @@ def download_video(args) -> dict:
output_dict['video_error_logger_msg'] = output_dict['video_error_logger_msg'] + ylogger.errors
except Exception as e:
output_dict['video_error_logger_msg'].append(f"EXCEPTION -> {e}")
bar.update(100 - bar.n)
if locked:
bar.close()
bar_lock.release()
return output_dict
def bar_eraser(video_bars, eraser_exit):
manager = Manager()
queue = manager.dict()
queue_lock = manager.Lock()
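# The loop at the bottom of bar_eraser flags the index of any bar row whose lock is currently held (a download is drawing there).
# The eraser thread waits for each flagged row's lock to be released, then draws a throwaway bar with an ANSI clear-line format ('\x1b[2K') at that position to wipe leftover output.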
def eraser():
nonlocal queue
while not eraser_exit.value:
for i in queue.keys():
if eraser_exit.value:
return
i = int(i)
lock = video_bars[i][1].acquire(timeout=0.1)
bar_lock = video_bars[i][1]
if lock:
bar = tqdm(position=video_bars[i][0], leave=False, bar_format='\x1b[2K')
bar.close()
with queue_lock:
queue_dict = queue
del queue_dict[i]
queue = queue_dict
bar_lock.release()
Thread(target=eraser).start()
while not eraser_exit.value:
for i, item in enumerate(video_bars):
if eraser_exit.value:
return
if is_manager_lock_locked(item[1]):
with queue_lock:
queue_dict = queue
queue_dict[i] = True
queue = queue_dict

View File

@@ -2,4 +2,6 @@ yt-dlp
psutil
tqdm
mergedeep
numpy
numpy
pyyaml
appdirs

1
targets.sample.txt Normal file
View File

@@ -0,0 +1 @@
https://www.youtube.com/playlist?list=example1234

5
targets.sample.yaml Normal file
View File

@@ -0,0 +1,5 @@
/path/to/storage/Example Playlist:
- https://www.youtube.com/playlist?list=ExamplePlaylist1234
/path/to/storage/Music:
- https://www.youtube.com/MyMusicPlaylist1234
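For reference, here is a minimal sketch (the file name `targets.yaml` is an assumption) of how a target file shaped like this sample loads into the mapping of output directories to URL lists that the downloader iterates over; it mirrors, but is not copied from, the loading logic in the main script:
```
import yaml  # requires pyyaml

# Minimal sketch, assuming a targets.yaml shaped like the sample above.
with open('targets.yaml', 'r') as f:
    url_list = yaml.safe_load(f)

# url_list is now a dict mapping each output directory to its list of URLs, e.g.
# {'/path/to/storage/Example Playlist': ['https://www.youtube.com/playlist?list=ExamplePlaylist1234'],
#  '/path/to/storage/Music': ['https://www.youtube.com/MyMusicPlaylist1234']}
for output_dir, urls in url_list.items():
    print(output_dir, '->', urls)
```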

View File

@@ -29,13 +29,15 @@ class YDL:
sizes.append(d)
return tuple(sizes)
def playlist_contents(self, url: str) -> dict:
def playlist_contents(self, url: str) -> dict | bool:
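# Returns False when yt-dlp cannot extract any info for the URL; otherwise a dict with the playlist's id, title, and flattened entries.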
ydl_opts = merge({
'extract_flat': True,
'skip_download': True
}, self.ydl_opts)
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
info = ydl.sanitize_info(ydl.extract_info(url, download=False))
if not info:
return False
entries = []
if info['_type'] == 'playlist':
if 'entries' in info.keys():
@@ -53,14 +55,23 @@
'entries': entries,
}
def __call__(self, *args, **kwargs):
return self.yt_dlp.download(*args, **kwargs)
# def filter_filesize(self, info, *, incomplete):
# duration = info.get('duration')
# if duration and duration < 60:
# return 'The video is too short'
def extract_info(self, *args, **kwargs):
return self.yt_dlp.extract_info(*args, **kwargs)
def prepare_filename(self, *args, **kwargs):
return self.yt_dlp.prepare_filename(*args, **kwargs)
def process_info(self, *args, **kwargs):
return self.yt_dlp.process_info(*args, **kwargs)
def __call__(self, *args, **kwargs):
return self.yt_dlp.download(*args, **kwargs)
def update_ytdlp():
old = subprocess.check_output('pip freeze | grep yt-dlp', shell=True).decode().strip('\n')