From 550e65410a7a1b105923494ac44460a4dc1a15d9 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 23 Jul 2023 19:09:52 -0500 Subject: [PATCH] [ie] Extract subtitles from SMIL manifests (#7667) Authored by: bashonly, pukkandan --- yt_dlp/extractor/common.py | 46 +++++++++++++++++++-------------- yt_dlp/extractor/livestream.py | 5 ++-- yt_dlp/extractor/mediaset.py | 6 +++-- yt_dlp/extractor/nbc.py | 1 - yt_dlp/extractor/theplatform.py | 4 +-- 5 files changed, 34 insertions(+), 28 deletions(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 64a280dc0..b69ac1d65 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -2248,18 +2248,10 @@ class InfoExtractor: if res is False: assert not fatal return [], {} - smil, urlh = res - smil_url = urlh.url - namespace = self._parse_smil_namespace(smil) - - fmts = self._parse_smil_formats( - smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) - subs = self._parse_smil_subtitles( - smil, namespace=namespace) - - return fmts, subs + return self._parse_smil_formats_and_subtitles(smil, urlh.url, video_id, f4m_params=f4m_params, + namespace=self._parse_smil_namespace(smil)) def _extract_smil_formats(self, *args, **kwargs): fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs) @@ -2285,9 +2277,8 @@ class InfoExtractor: def _parse_smil(self, smil, smil_url, video_id, f4m_params=None): namespace = self._parse_smil_namespace(smil) - formats = self._parse_smil_formats( + formats, subtitles = self._parse_smil_formats_and_subtitles( smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) - subtitles = self._parse_smil_subtitles(smil, namespace=namespace) video_id = os.path.splitext(url_basename(smil_url))[0] title = None @@ -2326,7 +2317,14 @@ class InfoExtractor: return self._search_regex( r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None) - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): + def _parse_smil_formats(self, *args, **kwargs): + fmts, subs = self._parse_smil_formats_and_subtitles(*args, **kwargs) + if subs: + self._report_ignoring_subs('SMIL') + return fmts + + def _parse_smil_formats_and_subtitles( + self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): base = smil_url for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): b = meta.get('base') or meta.get('httpBase') @@ -2334,7 +2332,7 @@ class InfoExtractor: base = b break - formats = [] + formats, subtitles = [], {} rtmp_count = 0 http_count = 0 m3u8_count = 0 @@ -2382,8 +2380,9 @@ class InfoExtractor: src_url = src_url.strip() if proto == 'm3u8' or src_ext == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( + m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles( src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False) + self._merge_subtitles(m3u8_subs, target=subtitles) if len(m3u8_formats) == 1: m3u8_count += 1 m3u8_formats[0].update({ @@ -2404,11 +2403,15 @@ class InfoExtractor: f4m_url += urllib.parse.urlencode(f4m_params) formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) elif src_ext == 'mpd': - formats.extend(self._extract_mpd_formats( - src_url, video_id, mpd_id='dash', fatal=False)) + mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles( + src_url, video_id, mpd_id='dash', fatal=False) + formats.extend(mpd_formats) + self._merge_subtitles(mpd_subs, target=subtitles) elif re.search(r'\.ism/[Mm]anifest', src_url): - formats.extend(self._extract_ism_formats( - src_url, video_id, ism_id='mss', fatal=False)) + ism_formats, ism_subs = self._extract_ism_formats_and_subtitles( + src_url, video_id, ism_id='mss', fatal=False) + formats.extend(ism_formats) + self._merge_subtitles(ism_subs, target=subtitles) elif src_url.startswith('http') and self._is_valid_url(src, video_id): http_count += 1 formats.append({ @@ -2439,7 +2442,10 @@ class InfoExtractor: 'format_note': 'SMIL storyboards', }) - return formats + smil_subs = self._parse_smil_subtitles(smil, namespace=namespace) + self._merge_subtitles(smil_subs, target=subtitles) + + return formats, subtitles def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): urls = [] diff --git a/yt_dlp/extractor/livestream.py b/yt_dlp/extractor/livestream.py index 692d6ab3a..a05a0fa9e 100644 --- a/yt_dlp/extractor/livestream.py +++ b/yt_dlp/extractor/livestream.py @@ -80,7 +80,8 @@ class LivestreamIE(InfoExtractor): }] _API_URL_TEMPLATE = 'http://livestream.com/api/accounts/%s/events/%s' - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): + def _parse_smil_formats_and_subtitles( + self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): base_ele = find_xpath_attr( smil, self._xpath_ns('.//meta', namespace), 'name', 'httpBase') base = base_ele.get('content') if base_ele is not None else 'http://livestreamvod-f.akamaihd.net/' @@ -104,7 +105,7 @@ class LivestreamIE(InfoExtractor): 'tbr': tbr, 'preference': -1000, # Strictly inferior than all other formats? }) - return formats + return formats, {} def _extract_video_info(self, video_data): video_id = compat_str(video_data['id']) diff --git a/yt_dlp/extractor/mediaset.py b/yt_dlp/extractor/mediaset.py index 1fa529914..e3b728dca 100644 --- a/yt_dlp/extractor/mediaset.py +++ b/yt_dlp/extractor/mediaset.py @@ -154,10 +154,12 @@ class MediasetIE(ThePlatformBaseIE): } }] - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): + def _parse_smil_formats_and_subtitles( + self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): for video in smil.findall(self._xpath_ns('.//video', namespace)): video.attrib['src'] = re.sub(r'(https?://vod05)t(-mediaset-it\.akamaized\.net/.+?.mpd)\?.+', r'\1\2', video.attrib['src']) - return super(MediasetIE, self)._parse_smil_formats(smil, smil_url, video_id, namespace, f4m_params, transform_rtmp_url) + return super(MediasetIE, self)._parse_smil_formats_and_subtitles( + smil, smil_url, video_id, namespace, f4m_params, transform_rtmp_url) def _check_drm_formats(self, tp_formats, video_id): has_nondrm, drm_manifest = False, '' diff --git a/yt_dlp/extractor/nbc.py b/yt_dlp/extractor/nbc.py index 299b05174..b3c28ab55 100644 --- a/yt_dlp/extractor/nbc.py +++ b/yt_dlp/extractor/nbc.py @@ -131,7 +131,6 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'age_limit': 0, 'thumbnail': r're:https?://.+\.jpg', }, - 'expected_warnings': ['Ignoring subtitle tracks'], 'params': { 'skip_download': 'm3u8', }, diff --git a/yt_dlp/extractor/theplatform.py b/yt_dlp/extractor/theplatform.py index 537f6f6cd..8307b912d 100644 --- a/yt_dlp/extractor/theplatform.py +++ b/yt_dlp/extractor/theplatform.py @@ -45,7 +45,7 @@ class ThePlatformBaseIE(OnceIE): raise ExtractorError( error_element.attrib['abstract'], expected=True) - smil_formats = self._parse_smil_formats( + smil_formats, subtitles = self._parse_smil_formats_and_subtitles( meta, smil_url, video_id, namespace=default_ns, # the parameters are from syfy.com, other sites may use others, # they also work for nbc.com @@ -65,8 +65,6 @@ class ThePlatformBaseIE(OnceIE): formats.append(_format) - subtitles = self._parse_smil_subtitles(meta, default_ns) - return formats, subtitles def _download_theplatform_metadata(self, path, video_id):