mirror of https://github.com/yt-dlp/yt-dlp.git
[ie] Extract subtitles from SMIL manifests (#7667)
Authored by: bashonly, pukkandan
This commit is contained in:
parent
39837ae319
commit
550e65410a
|
@ -2248,18 +2248,10 @@ class InfoExtractor:
|
||||||
if res is False:
|
if res is False:
|
||||||
assert not fatal
|
assert not fatal
|
||||||
return [], {}
|
return [], {}
|
||||||
|
|
||||||
smil, urlh = res
|
smil, urlh = res
|
||||||
smil_url = urlh.url
|
|
||||||
|
|
||||||
namespace = self._parse_smil_namespace(smil)
|
return self._parse_smil_formats_and_subtitles(smil, urlh.url, video_id, f4m_params=f4m_params,
|
||||||
|
namespace=self._parse_smil_namespace(smil))
|
||||||
fmts = self._parse_smil_formats(
|
|
||||||
smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
|
|
||||||
subs = self._parse_smil_subtitles(
|
|
||||||
smil, namespace=namespace)
|
|
||||||
|
|
||||||
return fmts, subs
|
|
||||||
|
|
||||||
def _extract_smil_formats(self, *args, **kwargs):
|
def _extract_smil_formats(self, *args, **kwargs):
|
||||||
fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
|
fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
|
||||||
|
@ -2285,9 +2277,8 @@ class InfoExtractor:
|
||||||
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
|
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
|
||||||
namespace = self._parse_smil_namespace(smil)
|
namespace = self._parse_smil_namespace(smil)
|
||||||
|
|
||||||
formats = self._parse_smil_formats(
|
formats, subtitles = self._parse_smil_formats_and_subtitles(
|
||||||
smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
|
smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
|
||||||
subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
|
|
||||||
|
|
||||||
video_id = os.path.splitext(url_basename(smil_url))[0]
|
video_id = os.path.splitext(url_basename(smil_url))[0]
|
||||||
title = None
|
title = None
|
||||||
|
@ -2326,7 +2317,14 @@ class InfoExtractor:
|
||||||
return self._search_regex(
|
return self._search_regex(
|
||||||
r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
|
r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
|
||||||
|
|
||||||
def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
|
def _parse_smil_formats(self, *args, **kwargs):
|
||||||
|
fmts, subs = self._parse_smil_formats_and_subtitles(*args, **kwargs)
|
||||||
|
if subs:
|
||||||
|
self._report_ignoring_subs('SMIL')
|
||||||
|
return fmts
|
||||||
|
|
||||||
|
def _parse_smil_formats_and_subtitles(
|
||||||
|
self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
|
||||||
base = smil_url
|
base = smil_url
|
||||||
for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
|
for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
|
||||||
b = meta.get('base') or meta.get('httpBase')
|
b = meta.get('base') or meta.get('httpBase')
|
||||||
|
@ -2334,7 +2332,7 @@ class InfoExtractor:
|
||||||
base = b
|
base = b
|
||||||
break
|
break
|
||||||
|
|
||||||
formats = []
|
formats, subtitles = [], {}
|
||||||
rtmp_count = 0
|
rtmp_count = 0
|
||||||
http_count = 0
|
http_count = 0
|
||||||
m3u8_count = 0
|
m3u8_count = 0
|
||||||
|
@ -2382,8 +2380,9 @@ class InfoExtractor:
|
||||||
src_url = src_url.strip()
|
src_url = src_url.strip()
|
||||||
|
|
||||||
if proto == 'm3u8' or src_ext == 'm3u8':
|
if proto == 'm3u8' or src_ext == 'm3u8':
|
||||||
m3u8_formats = self._extract_m3u8_formats(
|
m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
|
||||||
src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
|
src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
|
||||||
|
self._merge_subtitles(m3u8_subs, target=subtitles)
|
||||||
if len(m3u8_formats) == 1:
|
if len(m3u8_formats) == 1:
|
||||||
m3u8_count += 1
|
m3u8_count += 1
|
||||||
m3u8_formats[0].update({
|
m3u8_formats[0].update({
|
||||||
|
@ -2404,11 +2403,15 @@ class InfoExtractor:
|
||||||
f4m_url += urllib.parse.urlencode(f4m_params)
|
f4m_url += urllib.parse.urlencode(f4m_params)
|
||||||
formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
|
formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
|
||||||
elif src_ext == 'mpd':
|
elif src_ext == 'mpd':
|
||||||
formats.extend(self._extract_mpd_formats(
|
mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
|
||||||
src_url, video_id, mpd_id='dash', fatal=False))
|
src_url, video_id, mpd_id='dash', fatal=False)
|
||||||
|
formats.extend(mpd_formats)
|
||||||
|
self._merge_subtitles(mpd_subs, target=subtitles)
|
||||||
elif re.search(r'\.ism/[Mm]anifest', src_url):
|
elif re.search(r'\.ism/[Mm]anifest', src_url):
|
||||||
formats.extend(self._extract_ism_formats(
|
ism_formats, ism_subs = self._extract_ism_formats_and_subtitles(
|
||||||
src_url, video_id, ism_id='mss', fatal=False))
|
src_url, video_id, ism_id='mss', fatal=False)
|
||||||
|
formats.extend(ism_formats)
|
||||||
|
self._merge_subtitles(ism_subs, target=subtitles)
|
||||||
elif src_url.startswith('http') and self._is_valid_url(src, video_id):
|
elif src_url.startswith('http') and self._is_valid_url(src, video_id):
|
||||||
http_count += 1
|
http_count += 1
|
||||||
formats.append({
|
formats.append({
|
||||||
|
@ -2439,7 +2442,10 @@ class InfoExtractor:
|
||||||
'format_note': 'SMIL storyboards',
|
'format_note': 'SMIL storyboards',
|
||||||
})
|
})
|
||||||
|
|
||||||
return formats
|
smil_subs = self._parse_smil_subtitles(smil, namespace=namespace)
|
||||||
|
self._merge_subtitles(smil_subs, target=subtitles)
|
||||||
|
|
||||||
|
return formats, subtitles
|
||||||
|
|
||||||
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
|
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
|
||||||
urls = []
|
urls = []
|
||||||
|
|
|
@ -80,7 +80,8 @@ class LivestreamIE(InfoExtractor):
|
||||||
}]
|
}]
|
||||||
_API_URL_TEMPLATE = 'http://livestream.com/api/accounts/%s/events/%s'
|
_API_URL_TEMPLATE = 'http://livestream.com/api/accounts/%s/events/%s'
|
||||||
|
|
||||||
def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
|
def _parse_smil_formats_and_subtitles(
|
||||||
|
self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
|
||||||
base_ele = find_xpath_attr(
|
base_ele = find_xpath_attr(
|
||||||
smil, self._xpath_ns('.//meta', namespace), 'name', 'httpBase')
|
smil, self._xpath_ns('.//meta', namespace), 'name', 'httpBase')
|
||||||
base = base_ele.get('content') if base_ele is not None else 'http://livestreamvod-f.akamaihd.net/'
|
base = base_ele.get('content') if base_ele is not None else 'http://livestreamvod-f.akamaihd.net/'
|
||||||
|
@ -104,7 +105,7 @@ class LivestreamIE(InfoExtractor):
|
||||||
'tbr': tbr,
|
'tbr': tbr,
|
||||||
'preference': -1000, # Strictly inferior than all other formats?
|
'preference': -1000, # Strictly inferior than all other formats?
|
||||||
})
|
})
|
||||||
return formats
|
return formats, {}
|
||||||
|
|
||||||
def _extract_video_info(self, video_data):
|
def _extract_video_info(self, video_data):
|
||||||
video_id = compat_str(video_data['id'])
|
video_id = compat_str(video_data['id'])
|
||||||
|
|
|
@ -154,10 +154,12 @@ class MediasetIE(ThePlatformBaseIE):
|
||||||
}
|
}
|
||||||
}]
|
}]
|
||||||
|
|
||||||
def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
|
def _parse_smil_formats_and_subtitles(
|
||||||
|
self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
|
||||||
for video in smil.findall(self._xpath_ns('.//video', namespace)):
|
for video in smil.findall(self._xpath_ns('.//video', namespace)):
|
||||||
video.attrib['src'] = re.sub(r'(https?://vod05)t(-mediaset-it\.akamaized\.net/.+?.mpd)\?.+', r'\1\2', video.attrib['src'])
|
video.attrib['src'] = re.sub(r'(https?://vod05)t(-mediaset-it\.akamaized\.net/.+?.mpd)\?.+', r'\1\2', video.attrib['src'])
|
||||||
return super(MediasetIE, self)._parse_smil_formats(smil, smil_url, video_id, namespace, f4m_params, transform_rtmp_url)
|
return super(MediasetIE, self)._parse_smil_formats_and_subtitles(
|
||||||
|
smil, smil_url, video_id, namespace, f4m_params, transform_rtmp_url)
|
||||||
|
|
||||||
def _check_drm_formats(self, tp_formats, video_id):
|
def _check_drm_formats(self, tp_formats, video_id):
|
||||||
has_nondrm, drm_manifest = False, ''
|
has_nondrm, drm_manifest = False, ''
|
||||||
|
|
|
@ -131,7 +131,6 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE
|
||||||
'age_limit': 0,
|
'age_limit': 0,
|
||||||
'thumbnail': r're:https?://.+\.jpg',
|
'thumbnail': r're:https?://.+\.jpg',
|
||||||
},
|
},
|
||||||
'expected_warnings': ['Ignoring subtitle tracks'],
|
|
||||||
'params': {
|
'params': {
|
||||||
'skip_download': 'm3u8',
|
'skip_download': 'm3u8',
|
||||||
},
|
},
|
||||||
|
|
|
@ -45,7 +45,7 @@ class ThePlatformBaseIE(OnceIE):
|
||||||
raise ExtractorError(
|
raise ExtractorError(
|
||||||
error_element.attrib['abstract'], expected=True)
|
error_element.attrib['abstract'], expected=True)
|
||||||
|
|
||||||
smil_formats = self._parse_smil_formats(
|
smil_formats, subtitles = self._parse_smil_formats_and_subtitles(
|
||||||
meta, smil_url, video_id, namespace=default_ns,
|
meta, smil_url, video_id, namespace=default_ns,
|
||||||
# the parameters are from syfy.com, other sites may use others,
|
# the parameters are from syfy.com, other sites may use others,
|
||||||
# they also work for nbc.com
|
# they also work for nbc.com
|
||||||
|
@ -65,8 +65,6 @@ class ThePlatformBaseIE(OnceIE):
|
||||||
|
|
||||||
formats.append(_format)
|
formats.append(_format)
|
||||||
|
|
||||||
subtitles = self._parse_smil_subtitles(meta, default_ns)
|
|
||||||
|
|
||||||
return formats, subtitles
|
return formats, subtitles
|
||||||
|
|
||||||
def _download_theplatform_metadata(self, path, video_id):
|
def _download_theplatform_metadata(self, path, video_id):
|
||||||
|
|
Loading…
Reference in New Issue