[youtube] improve subtitle extraction

This commit is contained in:
Remita Amine 2021-02-01 18:12:35 +01:00
parent efef4ddf51
commit 65eee5a745
1 changed file with 18 additions and 25 deletions

View File

@@ -1664,7 +1664,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
player_response, player_response,
lambda x: x['captions']['playerCaptionsTracklistRenderer'], dict) lambda x: x['captions']['playerCaptionsTracklistRenderer'], dict)
if pctr: if pctr:
def process_language(container, base_url, caption, query): def process_language(container, base_url, lang_code, query):
lang_subs = [] lang_subs = []
for fmt in self._SUBTITLE_FORMATS: for fmt in self._SUBTITLE_FORMATS:
query.update({ query.update({
@@ -1674,35 +1674,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': fmt, 'ext': fmt,
'url': update_url_query(base_url, query), 'url': update_url_query(base_url, query),
}) })
subtitles[caption['languageCode']] = lang_subs container[lang_code] = lang_subs
subtitles = {} subtitles = {}
for caption_track in pctr['captionTracks']: for caption_track in (pctr.get('captionTracks') or []):
base_url = caption_track['baseUrl'] base_url = caption_track.get('baseUrl')
if not base_url:
continue
if caption_track.get('kind') != 'asr': if caption_track.get('kind') != 'asr':
lang_subs = [] lang_code = caption_track.get('languageCode')
for fmt in self._SUBTITLE_FORMATS: if not lang_code:
lang_subs.append({ continue
'ext': fmt, process_language(
'url': update_url_query(base_url, { subtitles, base_url, lang_code, {})
'fmt': fmt,
}),
})
subtitles[caption_track['languageCode']] = lang_subs
continue continue
automatic_captions = {} automatic_captions = {}
for translation_language in pctr['translationLanguages']: for translation_language in (pctr.get('translationLanguages') or []):
translation_language_code = translation_language['languageCode'] translation_language_code = translation_language.get('languageCode')
lang_subs = [] if not translation_language_code:
for fmt in self._SUBTITLE_FORMATS: continue
lang_subs.append({ process_language(
'ext': fmt, automatic_captions, base_url, translation_language_code,
'url': update_url_query(base_url, { {'tlang': translation_language_code})
'fmt': fmt,
'tlang': translation_language_code,
}),
})
automatic_captions[translation_language_code] = lang_subs
info['automatic_captions'] = automatic_captions info['automatic_captions'] = automatic_captions
info['subtitles'] = subtitles info['subtitles'] = subtitles