From b2741f2654e6ddfebc1771b5d5fadb5fd6fe3863 Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 5 May 2023 19:25:42 +0100 Subject: [PATCH] [InfoExtractor] Add search methods for Next/Nuxt.js from yt-dlp * add _search_nextjs_data(), from https://github.com/yt-dlp/yt-dlp/pull/1386, thanks selfisekai * add _search_nuxt_data(), from https://github.com/yt-dlp/yt-dlp/pull/1921, thanks Lesmiscore, pukkandan * add tests for the above * also fix HTML5 type recognition and tests, from https://github.com/yt-dlp/yt-dlp/commit/222a230871fe4fe63f35c49590379c9a77116819, thanks Lesmiscore * update extractors in PR using above, fix tests. --- test/test_InfoExtractor.py | 111 +++++++++++++++++++++++++-- youtube_dl/extractor/clipchamp.py | 7 -- youtube_dl/extractor/common.py | 51 +++++++++++- youtube_dl/extractor/globalplayer.py | 32 ++++---- youtube_dl/extractor/whyp.py | 25 +----- 5 files changed, 168 insertions(+), 58 deletions(-) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 6d25441db..34773fbd0 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -7,15 +7,33 @@ import io import os import sys import unittest + sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import FakeYDL, expect_dict, expect_value, http_server_port -from youtube_dl.compat import compat_etree_fromstring, compat_http_server -from youtube_dl.extractor.common import InfoExtractor -from youtube_dl.extractor import YoutubeIE, get_info_extractor -from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError import threading +from test.helper import ( + expect_dict, + expect_value, + FakeYDL, + http_server_port, +) +from youtube_dl.compat import ( + compat_etree_fromstring, + compat_http_server, +) +from youtube_dl.extractor.common import InfoExtractor +from youtube_dl.extractor import ( + get_info_extractor, + YoutubeIE, +) +from youtube_dl.utils import ( + encode_data_uri, + 
ExtractorError, + RegexNotFoundError, + strip_jsonp, +) + TEAPOT_RESPONSE_STATUS = 418 TEAPOT_RESPONSE_BODY = "

418 I'm a teapot

" @@ -100,6 +118,71 @@ class TestInfoExtractor(unittest.TestCase): self.assertRaises(RegexNotFoundError, ie._html_search_meta, 'z', html, None, fatal=True) self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True) + def test_search_nextjs_data(self): + html = ''' + + + + + + Test _search_nextjs_data() + + +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ + + +''' + search = self.ie._search_nextjs_data(html, 'testID') + self.assertEqual(search['props']['pageProps']['video']['id'], 'testid') + + def test_search_nuxt_data(self): + html = ''' + + + + + Nuxt.js Test Page + + + + +
+

Example heading

+
+

Decoy text

+
+
+ + + + +''' + search = self.ie._search_nuxt_data(html, 'testID') + self.assertEqual(search['track']['id'], 'testid') + def test_search_json_ld_realworld(self): # https://github.com/ytdl-org/youtube-dl/issues/23306 expect_dict( @@ -348,6 +431,24 @@ class TestInfoExtractor(unittest.TestCase): }], }) + # from https://0000.studio/ + # with type attribute but without extension in URL + expect_dict( + self, + self.ie._parse_html5_media_entries( + 'https://0000.studio', + r''' + + ''', None)[0], + { + 'formats': [{ + 'url': 'https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92', + 'ext': 'mp4', + }], + }) + def test_extract_jwplayer_data_realworld(self): # from http://www.suffolk.edu/sjc/ expect_dict( diff --git a/youtube_dl/extractor/clipchamp.py b/youtube_dl/extractor/clipchamp.py index 5a732e808..3b485eaab 100644 --- a/youtube_dl/extractor/clipchamp.py +++ b/youtube_dl/extractor/clipchamp.py @@ -35,13 +35,6 @@ class ClipchampIE(InfoExtractor): _STREAM_URL_TMPL = 'https://%s.cloudflarestream.com/%s/manifest/video.%s' _STREAM_URL_QUERY = {'parentOrigin': 'https://clipchamp.com'} - def _search_nextjs_data(self, webpage, video_id, **kw): - return self._parse_json( - self._search_regex( - r'(?s)]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)', - webpage, 'next.js data', **kw), - video_id, **kw) - def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index dbdf456f5..549781186 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import base64 import datetime +import functools import hashlib import json import netrc @@ -23,6 +24,7 @@ from ..compat import ( compat_getpass, compat_integer_types, compat_http_client, + compat_map as map, compat_os_name, compat_str, 
compat_urllib_error, @@ -31,6 +33,7 @@ from ..compat import ( compat_urllib_request, compat_urlparse, compat_xml_parse_error, + compat_zip as zip, ) from ..downloader.f4m import ( get_base_url, @@ -70,6 +73,7 @@ from ..utils import ( str_or_none, str_to_int, strip_or_none, + traverse_obj, try_get, unescapeHTML, unified_strdate, @@ -1349,6 +1353,44 @@ class InfoExtractor(object): break return dict((k, v) for k, v in info.items() if v is not None) + def _search_nextjs_data(self, webpage, video_id, **kw): + nkw = dict((k, v) for k, v in kw.items() if k in ('transform_source', 'fatal')) + kw.pop('transform_source', None) + next_data = self._search_regex( + r''']+\bid\s*=\s*('|")__NEXT_DATA__\1[^>]*>(?P[^<]+)''', + webpage, 'next.js data', group='nd', **kw) + if not next_data: + return {} + return self._parse_json(next_data, video_id, **nkw) + + def _search_nuxt_data(self, webpage, video_id, *args, **kwargs): + """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function""" + + # self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0) + context_name = args[0] if len(args) > 0 else kwargs.get('context_name', '__NUXT__') + fatal = kwargs.get('fatal', True) + traverse = kwargs.get('traverse', ('data', 0)) + + re_ctx = re.escape(context_name) + + FUNCTION_RE = (r'\(\s*function\s*\((?P[\s\S]*?)\)\s*\{\s*' + r'return\s+(?P\{[\s\S]*?})\s*;?\s*}\s*\((?P[\s\S]*?)\)') + + js, arg_keys, arg_vals = self._search_regex( + (p.format(re_ctx, FUNCTION_RE) for p in + (r'', + r'{0}\s*\([\s\S]*?{1}')), + webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), + default=NO_DEFAULT if fatal else (None, None, None)) + if js is None: + return {} + + args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json( + '[{0}]'.format(arg_vals), video_id, transform_source=js_to_json, fatal=fatal) or ()))) + + ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal) + 
return traverse_obj(ret, traverse) or {} + @staticmethod def _hidden_inputs(html): html = re.sub(r'', '', html) @@ -2496,7 +2538,8 @@ class InfoExtractor(object): return f return {} - def _media_formats(src, cur_media_type, type_info={}): + def _media_formats(src, cur_media_type, type_info=None): + type_info = type_info or {} full_url = absolute_url(src) ext = type_info.get('ext') or determine_ext(full_url) if ext == 'm3u8': @@ -2514,6 +2557,7 @@ class InfoExtractor(object): formats = [{ 'url': full_url, 'vcodec': 'none' if cur_media_type == 'audio' else None, + 'ext': ext, }] return is_plain_url, formats @@ -2522,7 +2566,7 @@ class InfoExtractor(object): # so we wll include them right here (see # https://www.ampproject.org/docs/reference/components/amp-video) # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/ - _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)' + _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video(?:-js)?|audio)' media_tags = [(media_tag, media_tag_name, media_type, '') for media_tag, media_tag_name, media_type in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)] @@ -2540,7 +2584,8 @@ class InfoExtractor(object): media_attributes = extract_attributes(media_tag) src = strip_or_none(media_attributes.get('src')) if src: - _, formats = _media_formats(src, media_type) + f = parse_content_type(media_attributes.get('type')) + _, formats = _media_formats(src, media_type, f) media_info['formats'].extend(formats) media_info['thumbnail'] = absolute_url(media_attributes.get('poster')) if media_content: diff --git a/youtube_dl/extractor/globalplayer.py b/youtube_dl/extractor/globalplayer.py index cceab9e6a..db490b141 100644 --- a/youtube_dl/extractor/globalplayer.py +++ b/youtube_dl/extractor/globalplayer.py @@ -24,13 +24,6 @@ class GlobalPlayerBaseIE(InfoExtractor): def _match_valid_url(cls, url): return cls.re.match(cls._VALID_URL, url) - def _search_nextjs_data(self, webpage, video_id, **kw): - return 
self._parse_json( - self._search_regex( - r'(?s)]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)', - webpage, 'next.js data', **kw), - video_id, **kw) - def _get_page_props(self, url, video_id): webpage = self._download_webpage(url, video_id) return self._search_nextjs_data(webpage, video_id)['props']['pageProps'] @@ -39,13 +32,14 @@ class GlobalPlayerBaseIE(InfoExtractor): return urlhandle_detect_ext(self._request_webpage( # Server rejects HEAD requests url, video_id, note='Determining source extension')) - def _extract_audio(self, episode, series): + @staticmethod + def _clean_desc(x): + x = clean_html(x) + if x: + x = x.replace('\xa0', ' ') + return x - def clean_desc(x): - x = clean_html(x) - if x: - x = x.replace('\xa0', ' ') - return x + def _extract_audio(self, episode, series): return merge_dicts({ 'vcodec': 'none', @@ -56,7 +50,7 @@ class GlobalPlayerBaseIE(InfoExtractor): 'uploader': 'itunesAuthor', # podcasts only }), traverse_obj(episode, { 'id': 'id', - 'description': ('description', T(clean_desc)), + 'description': ('description', T(self._clean_desc)), 'duration': ('duration', T(parse_duration)), 'thumbnail': 'imageUrl', 'url': 'streamUrl', @@ -141,9 +135,9 @@ class GlobalPlayerLivePlaylistIE(GlobalPlayerBaseIE): 'ext': 'aac', # 'live_status': 'is_live', 'is_live': True, - 'description': 'md5:e10f5e10b01a7f2c14ba815509fbb38d', + 'description': r're:(?s).+\bclassical\b.+\bClassic FM Hall [oO]f Fame\b', 'thumbnail': 'https://images.globalplayer.com/images/551379?width=450&signature=oMLPZIoi5_dBSHnTMREW0Xg76mA=', - 'title': 're:^Classic FM Hall of Fame.+$' + 'title': 're:Classic FM Hall of Fame.+$' }, }] @@ -160,7 +154,7 @@ class GlobalPlayerLivePlaylistIE(GlobalPlayerBaseIE): 'is_live': True, }, traverse_obj(station, { 'title': 'title', - 'description': 'description', + 'description': ('description', T(self._clean_desc)), 'thumbnail': 'image', }), rev=True) @@ -177,7 +171,7 @@ class GlobalPlayerAudioIE(GlobalPlayerBaseIE): 'thumbnail': 
'md5:60286e7d12d795bd1bbc9efc6cee643e', 'categories': ['Society & Culture', 'True Crime'], 'uploader': 'Global', - 'description': 'md5:da5b918eac9ae319454a10a563afacf9', + 'description': r're:(?s).+\bscam\b.+?\bseries available now\b', }, }, { # radio catchup @@ -203,7 +197,7 @@ class GlobalPlayerAudioIE(GlobalPlayerBaseIE): series, ('episodes', lambda _, v: v['id'] and v['streamUrl']))], 'categories': traverse_obj(series, ('categories', Ellipsis, 'name')) or None, }, traverse_obj(series, { - 'description': 'description', + 'description': ('description', T(self._clean_desc)), 'thumbnail': 'imageUrl', 'title': 'title', 'uploader': 'itunesAuthor', # podcasts only diff --git a/youtube_dl/extractor/whyp.py b/youtube_dl/extractor/whyp.py index 16f9154ad..644eb4617 100644 --- a/youtube_dl/extractor/whyp.py +++ b/youtube_dl/extractor/whyp.py @@ -21,7 +21,7 @@ class WhypIE(InfoExtractor): 'url': 'https://cdn.whyp.it/50eb17cc-e9ff-4e18-b89b-dc9206a95cb1.mp3', 'id': '18337', 'title': 'Home Page Example Track', - 'description': 'md5:bd758000fb93f3159339c852b5b9133c', + 'description': r're:(?s).+\bexample track\b', 'ext': 'mp3', 'duration': 52.82, 'uploader': 'Brad', @@ -33,29 +33,6 @@ class WhypIE(InfoExtractor): 'only_matching': True, }] - def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', fatal=True, traverse=('data', 0)): - """Parses Nuxt.js metadata. 
This works as long as the function __NUXT__ invokes is a pure function""" - - import functools - import json - import re - from ..utils import (js_to_json, NO_DEFAULT) - - re_ctx = re.escape(context_name) - FUNCTION_RE = r'\(function\((?P.*?)\){return\s+(?P{.*?})\s*;?\s*}\((?P.*?)\)' - js, arg_keys, arg_vals = self._search_regex( - (p.format(re_ctx, FUNCTION_RE) for p in (r'', r'{0}\(.*?{1}')), - webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), - default=NO_DEFAULT if fatal else (None, None, None)) - if js is None: - return {} - - args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json( - '[{0}]'.format(arg_vals), video_id, transform_source=js_to_json, fatal=fatal) or ()))) - - ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal) - return traverse_obj(ret, traverse) or {} - def _real_extract(self, url): unique_id = self._match_id(url) webpage = self._download_webpage(url, unique_id)