diff --git a/downloader.py b/downloader.py index 27e0930..29ed197 100755 --- a/downloader.py +++ b/downloader.py @@ -4,9 +4,11 @@ import logging.config import math import os import re +import shutil import signal import subprocess import sys +import tempfile import time from multiprocessing import Manager, Pool, cpu_count from pathlib import Path @@ -171,6 +173,15 @@ def load_existing_videos(): download_archive_file.touch() with open(download_archive_file, 'r') as file: output.update(([line.rstrip() for line in file])) + + # Remove duplicate lines. + # Something may have gone wrong in the past so we want to make sure everything is cleaned up. + with open(download_archive_file) as file: + uniqlines = set(file.readlines()) + fd, path = tempfile.mkstemp() + with os.fdopen(fd, 'w') as tmp: + tmp.writelines(set(uniqlines)) + shutil.move(path, download_archive_file) return output diff --git a/process/threads.py b/process/threads.py index 91eef37..9af74fa 100644 --- a/process/threads.py +++ b/process/threads.py @@ -70,15 +70,23 @@ def download_video(args) -> dict: video = args[0] kwargs = args[1] + output_dict = {'downloaded_video_id': None, 'video_id': video['id'], 'video_error_logger_msg': [], 'status_msg': [], 'logger_msg': []} # empty object + + if not video['channel_id'] or not video['channel'] or not video['channel_url']: + if video['duration'] or isinstance(video['view_count'], int): + # Sometimes videos don't have channel_id, channel, or channel_url but are actually valid. Like shorts. + pass + else: + output_dict['video_error_logger_msg'].append(f"{video['id']} unavailable.") + return output_dict # Get a bar locked = False if len(kwargs['bars']): - # We're going to wait until a bar is available for us to use. - while not locked: + while not locked: # We're going to wait until a bar is available for us to use. for item in kwargs['bars']: if not is_manager_lock_locked(item[1]): - locked = item[1].acquire(timeout=0.1) # get the lock ASAP and don't wait if we didn't get it. + locked = item[1].acquire(timeout=0.01) # get the lock ASAP and don't wait if we didn't get it. offset = item[0] bar_lock = item[1] break @@ -86,16 +94,19 @@ def download_video(args) -> dict: desc_with = int(np.round(os.get_terminal_size()[0] * (1 / 4))) bar = tqdm(total=100, position=offset, desc=f"{video['id']} - {video['title']}".ljust(desc_with)[:desc_with], bar_format='{l_bar}{bar}| {elapsed}<{remaining}{postfix}', leave=False) - output_dict = {'downloaded_video_id': None, 'video_id': video['id'], 'video_error_logger_msg': [], 'status_msg': [], 'logger_msg': []} # empty object start_time = time.time() try: kwargs['ydl_opts']['logger'] = ytdl_logger() # dummy silent logger yt_dlp = ydl.YDL(kwargs['ydl_opts']) - url = video['url'] - video = yt_dlp.sanitize_info(yt_dlp.extract_info(video['url'], download=False)) - video['url'] = url - del url + video_n = yt_dlp.get_info(video['url']) + if not video_n: + output_dict['video_error_logger_msg'].append(f"{video['id']} failed to get info.") + return output_dict + video_n['url'] = video['url'] + video = video_n + del video_n + try: base_path = os.path.splitext(yt_dlp.prepare_filename(video))[0] except AttributeError: @@ -111,7 +122,7 @@ def download_video(args) -> dict: output_dict['downloaded_video_id'] = video['id'] else: output_dict['video_error_logger_msg'] = output_dict['video_error_logger_msg'] + ylogger.errors - except Exception as e: + except Exception: output_dict['video_error_logger_msg'].append(f"EXCEPTION -> {traceback.format_exc()}") if locked: bar.update(100 - bar.n) diff --git a/ydl/yt_dlp.py b/ydl/yt_dlp.py index 3d3ea45..fef17a3 100644 --- a/ydl/yt_dlp.py +++ b/ydl/yt_dlp.py @@ -40,19 +40,19 @@ class YDL: 'logger': self.ydl_opts['logger'], } with yt_dlp.YoutubeDL(ydl_opts) as ydl: - info = ydl.sanitize_info(ydl.extract_info(url, download=False)) + info = self.get_info(url) if not info: return False entries = [] if info['_type'] == 'playlist': if 'entries' in info.keys(): # When downloading a channel youtube-dl returns a playlist for videos and another for shorts. - # We need to combine all the videos into one list + # We need to combine all the videos into one list. for item in info['entries']: if item['_type'] in ('video', 'url'): entries.append(item) elif item['_type'] == 'playlist': - for video in ydl.sanitize_info(ydl.extract_info(item['webpage_url'], download=False))['entries']: + for video in self.get_info(item['webpage_url'])['entries']: entries.append(video) else: raise ValueError(f"Unknown sub-media type: {item['_type']}") @@ -83,8 +83,15 @@ class YDL: def process_info(self, *args, **kwargs): return self.yt_dlp.process_info(*args, **kwargs) - def sanitize_info(self, *args, **kwargs): - return self.yt_dlp.sanitize_info(*args, **kwargs) + def get_info(self, url): + ydl_opts = { + 'extract_flat': True, + 'skip_download': True, + 'ignoreerrors': True, + 'logger': self.ydl_opts['logger'], + } + ydl = yt_dlp.YoutubeDL(ydl_opts) + return ydl.sanitize_info(ydl.extract_info(url, download=False)) def __call__(self, *args, **kwargs): return self.yt_dlp.download(*args, **kwargs)