[extractor/BiliIntl] Fix metadata extraction

Closes #4116
This commit is contained in:
pukkandan 2022-06-20 03:03:19 +05:30
parent 40268a7974
commit 8072ef2bbd
No known key found for this signature in database
GPG Key ID: 7EEE9E1E817D0A39
4 changed files with 20 additions and 24 deletions

View File

@ -947,12 +947,11 @@ class BiliIntlIE(BiliIntlBaseIE):
video_id = ep_id or aid video_id = ep_id or aid
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
# Bstation layout # Bstation layout
initial_data = self._parse_json(self._search_regex( initial_data = (
r'window\.__INITIAL_(?:DATA|STATE)__\s*=\s*({.+?});', webpage, self._search_json(r'window\.__INITIAL_(?:DATA|STATE)__\s*=', webpage, 'preload state', video_id, default={})
'preload state', default='{}'), video_id, fatal=False) or {} or self._search_nuxt_data(webpage, video_id, '__initialState', fatal=False, traverse=None))
video_data = ( video_data = traverse_obj(
traverse_obj(initial_data, ('OgvVideo', 'epDetail'), expected_type=dict) initial_data, ('OgvVideo', 'epDetail'), ('UgcVideo', 'videoData'), ('ugc', 'archive'), expected_type=dict)
or traverse_obj(initial_data, ('UgcVideo', 'videoData'), expected_type=dict) or {})
if season_id and not video_data: if season_id and not video_data:
# Non-Bstation layout, read through episode list # Non-Bstation layout, read through episode list
@ -960,7 +959,7 @@ class BiliIntlIE(BiliIntlBaseIE):
video_data = traverse_obj(season_json, video_data = traverse_obj(season_json,
('sections', ..., 'episodes', lambda _, v: str(v['episode_id']) == ep_id), ('sections', ..., 'episodes', lambda _, v: str(v['episode_id']) == ep_id),
expected_type=dict, get_all=False) expected_type=dict, get_all=False)
return self._extract_video_info(video_data, ep_id=ep_id, aid=aid) return self._extract_video_info(video_data or {}, ep_id=ep_id, aid=aid)
class BiliIntlSeriesIE(BiliIntlBaseIE): class BiliIntlSeriesIE(BiliIntlBaseIE):

View File

@ -1588,15 +1588,13 @@ class InfoExtractor:
webpage, 'next.js data', fatal=fatal, **kw), webpage, 'next.js data', fatal=fatal, **kw),
video_id, transform_source=transform_source, fatal=fatal) video_id, transform_source=transform_source, fatal=fatal)
def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', return_full_data=False): def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. ''' """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
# not all website do this, but it can be changed
# https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
rectx = re.escape(context_name) rectx = re.escape(context_name)
FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
js, arg_keys, arg_vals = self._search_regex( js, arg_keys, arg_vals = self._search_regex(
(r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx, (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx), webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), fatal=fatal)
webpage, context_name, group=['js', 'arg_keys', 'arg_vals'])
args = dict(zip(arg_keys.split(','), arg_vals.split(','))) args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
@ -1604,10 +1602,8 @@ class InfoExtractor:
if val in ('undefined', 'void 0'): if val in ('undefined', 'void 0'):
args[key] = 'null' args[key] = 'null'
ret = self._parse_json(js_to_json(js, args), video_id) ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
if return_full_data: return traverse_obj(ret, traverse) or {}
return ret
return ret['data'][0]
@staticmethod @staticmethod
def _hidden_inputs(html): def _hidden_inputs(html):

View File

@ -1,8 +1,5 @@
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import traverse_obj, unified_timestamp
traverse_obj,
unified_timestamp,
)
class FourZeroStudioArchiveIE(InfoExtractor): class FourZeroStudioArchiveIE(InfoExtractor):
@ -25,7 +22,7 @@ class FourZeroStudioArchiveIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id') video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id')
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
nuxt_data = self._search_nuxt_data(webpage, video_id, return_full_data=True) nuxt_data = self._search_nuxt_data(webpage, video_id, traverse=None)
pcb = traverse_obj(nuxt_data, ('ssrRefs', lambda _, v: v['__typename'] == 'PublicCreatorBroadcast'), get_all=False) pcb = traverse_obj(nuxt_data, ('ssrRefs', lambda _, v: v['__typename'] == 'PublicCreatorBroadcast'), get_all=False)
uploader_internal_id = traverse_obj(nuxt_data, ( uploader_internal_id = traverse_obj(nuxt_data, (
@ -82,7 +79,7 @@ class FourZeroStudioClipIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id') video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id')
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
nuxt_data = self._search_nuxt_data(webpage, video_id, return_full_data=True) nuxt_data = self._search_nuxt_data(webpage, video_id, traverse=None)
clip_info = traverse_obj(nuxt_data, ('ssrRefs', lambda _, v: v['__typename'] == 'PublicCreatorArchivedClip'), get_all=False) clip_info = traverse_obj(nuxt_data, ('ssrRefs', lambda _, v: v['__typename'] == 'PublicCreatorArchivedClip'), get_all=False)

View File

@ -3216,7 +3216,11 @@ def js_to_json(code, vars={}):
return '"%s"' % v return '"%s"' % v
def create_map(mobj):
return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
code = re.sub(r'new Date\((".+")\)', r'\g<1>', code) code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
return re.sub(r'''(?sx) return re.sub(r'''(?sx)
"(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|