[extractor/common] Recognize m3u8 manifests in HTML5 multimedia tags
This commit is contained in:
parent
55af45fcab
commit
520251c093
|
@ -1,6 +1,7 @@
|
|||
version <unreleased>
|
||||
|
||||
Core
|
||||
* Support m3u8 manifests in HTML5 multimedia tags
|
||||
* Fix js_to_json(): correct octal or hexadecimal number detection
|
||||
|
||||
Extractors
|
||||
|
|
|
@ -1695,7 +1695,7 @@ class InfoExtractor(object):
|
|||
self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
|
||||
return formats
|
||||
|
||||
def _parse_html5_media_entries(self, base_url, webpage):
|
||||
def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None):
|
||||
def absolute_url(video_url):
|
||||
return compat_urlparse.urljoin(base_url, video_url)
|
||||
|
||||
|
@ -1710,6 +1710,21 @@ class InfoExtractor(object):
|
|||
return f
|
||||
return {}
|
||||
|
||||
def _media_formats(src, cur_media_type):
|
||||
full_url = absolute_url(src)
|
||||
if determine_ext(full_url) == 'm3u8':
|
||||
is_plain_url = False
|
||||
formats = self._extract_m3u8_formats(
|
||||
full_url, video_id, ext='mp4', entry_protocol='m3u8_native',
|
||||
m3u8_id=m3u8_id)
|
||||
else:
|
||||
is_plain_url = True
|
||||
formats = [{
|
||||
'url': full_url,
|
||||
'vcodec': 'none' if cur_media_type == 'audio' else None,
|
||||
}]
|
||||
return is_plain_url, formats
|
||||
|
||||
entries = []
|
||||
for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage):
|
||||
media_info = {
|
||||
|
@ -1719,10 +1734,8 @@ class InfoExtractor(object):
|
|||
media_attributes = extract_attributes(media_tag)
|
||||
src = media_attributes.get('src')
|
||||
if src:
|
||||
media_info['formats'].append({
|
||||
'url': absolute_url(src),
|
||||
'vcodec': 'none' if media_type == 'audio' else None,
|
||||
})
|
||||
_, formats = _media_formats(src)
|
||||
media_info['formats'].extend(formats)
|
||||
media_info['thumbnail'] = media_attributes.get('poster')
|
||||
if media_content:
|
||||
for source_tag in re.findall(r'<source[^>]+>', media_content):
|
||||
|
@ -1730,12 +1743,13 @@ class InfoExtractor(object):
|
|||
src = source_attributes.get('src')
|
||||
if not src:
|
||||
continue
|
||||
f = parse_content_type(source_attributes.get('type'))
|
||||
f.update({
|
||||
'url': absolute_url(src),
|
||||
'vcodec': 'none' if media_type == 'audio' else None,
|
||||
})
|
||||
media_info['formats'].append(f)
|
||||
is_plain_url, formats = _media_formats(src, media_type)
|
||||
if is_plain_url:
|
||||
f = parse_content_type(source_attributes.get('type'))
|
||||
f.update(formats[0])
|
||||
media_info['formats'].append(f)
|
||||
else:
|
||||
media_info['formats'].extend(formats)
|
||||
for track_tag in re.findall(r'<track[^>]+>', media_content):
|
||||
track_attributes = extract_attributes(track_tag)
|
||||
kind = track_attributes.get('kind')
|
||||
|
|
Loading…
Reference in New Issue