[youtube:feed] Implement lazy playlist extraction (closes #10184)

2018-04-22 06:07:32 +07:00 · 2018-04-22 06:07:32 +07:00 · 3853309fe2
parent 6cdaaf7031
commit 3853309fe2
1 changed files with 10 additions and 6 deletions
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@ -2699,10 +2699,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    def _real_initialize(self):
        self._login()

-    def _real_extract(self, url):
-        page = self._download_webpage(
-            'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)
-
+    def _entries(self, page):
        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
@ -2713,12 +2710,15 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos
-            new_ids = filter(lambda video_id: video_id not in ids, orderedSet(matches))
+            new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches)))
            if not new_ids:
                break

            ids.extend(new_ids)

+            for entry in self._ids_to_results(new_ids):
+                yield entry
+
            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break
@ -2730,8 +2730,12 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

+    def _real_extract(self, url):
+        page = self._download_webpage(
+            'https://www.youtube.com/feed/%s' % self._FEED_NAME,
+            self._PLAYLIST_TITLE)
        return self.playlist_result(
-            self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
+            self._entries(page), playlist_title=self._PLAYLIST_TITLE)


 class YoutubeWatchLaterIE(YoutubePlaylistIE):