[tvp] Fix embeds (#1401)

Authored by: selfisekai
This commit is contained in:
Lauren Liberda 2021-10-31 11:02:50 +05:30 committed by pukkandan
parent c0599d4fe4
commit 56bb56f3cf
No known key found for this signature in database
GPG Key ID: 0F00D95A001F4698
2 changed files with 129 additions and 76 deletions

View File

@ -135,6 +135,7 @@ from .arcpublishing import ArcPublishingIE
from .medialaan import MedialaanIE from .medialaan import MedialaanIE
from .simplecast import SimplecastIE from .simplecast import SimplecastIE
from .wimtv import WimTVIE from .wimtv import WimTVIE
from .tvp import TVPEmbedIE
class GenericIE(InfoExtractor): class GenericIE(InfoExtractor):
@ -3508,6 +3509,10 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches( return self.playlist_from_matches(
rumble_urls, video_id, video_title, ie=RumbleEmbedIE.ie_key()) rumble_urls, video_id, video_title, ie=RumbleEmbedIE.ie_key())
tvp_urls = TVPEmbedIE._extract_urls(webpage)
if tvp_urls:
return self.playlist_from_matches(tvp_urls, video_id, video_title, ie=TVPEmbedIE.ie_key())
# Look for HTML5 media # Look for HTML5 media
entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
if entries: if entries:

View File

@ -89,114 +89,162 @@ class TVPIE(InfoExtractor):
class TVPEmbedIE(InfoExtractor): class TVPEmbedIE(InfoExtractor):
IE_NAME = 'tvp:embed' IE_NAME = 'tvp:embed'
IE_DESC = 'Telewizja Polska' IE_DESC = 'Telewizja Polska'
_VALID_URL = r'(?:tvp:|https?://[^/]+\.tvp\.(?:pl|info)/sess/tvplayer\.php\?.*?object_id=)(?P<id>\d+)' _VALID_URL = r'''(?x)
(?:
tvp:
|https?://
(?:[^/]+\.)?
(?:tvp(?:parlament)?\.pl|tvp\.info|polandin\.com)/
(?:sess/
(?:tvplayer\.php\?.*?object_id
|TVPlayer2/(?:embed|api)\.php\?.*[Ii][Dd])
|shared/details\.php\?.*?object_id)
=)
(?P<id>\d+)
'''
_TESTS = [{ _TESTS = [{
'url': 'tvp:194536', 'url': 'tvp:194536',
'md5': 'a21eb0aa862f25414430f15fdfb9e76c',
'info_dict': { 'info_dict': {
'id': '194536', 'id': '194536',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Czas honoru, odc. 13 Władek', 'title': 'Czas honoru, odc. 13 Władek',
'description': 'md5:76649d2014f65c99477be17f23a4dead',
'age_limit': 12,
}, },
}, { }, {
# not available 'url': 'https://www.tvp.pl/sess/tvplayer.php?object_id=51247504&amp;autoplay=false',
'url': 'http://www.tvp.pl/sess/tvplayer.php?object_id=22670268',
'md5': '8c9cd59d16edabf39331f93bf8a766c7',
'info_dict': { 'info_dict': {
'id': '22670268', 'id': '51247504',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Panorama, 07.12.2015, 15:40', 'title': 'Razmova 091220',
}, },
'skip': 'Transmisja została zakończona lub materiał niedostępny',
}, { }, {
'url': 'tvp:22670268', # TVPlayer2 embed URL
'url': 'https://tvp.info/sess/TVPlayer2/embed.php?ID=50595757',
'only_matching': True,
}, {
'url': 'https://wiadomosci.tvp.pl/sess/TVPlayer2/api.php?id=51233452',
'only_matching': True,
}, {
# pulsembed on dziennik.pl
'url': 'https://www.tvp.pl/shared/details.php?copy_id=52205981&object_id=52204505&autoplay=false&is_muted=false&allowfullscreen=true&template=external-embed/video/iframe-video.html',
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod
def _extract_urls(webpage, **kw):
return [m.group('embed') for m in re.finditer(
r'(?x)<iframe[^>]+?src=(["\'])(?P<embed>%s)' % TVPEmbedIE._VALID_URL[4:],
webpage)]
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
# it could be anything that is a valid JS function name
callback = random.choice((
'jebac_pis',
'jebacpis',
'ziobro',
'sasin70',
'sasin_przejebal_70_milionow_PLN',
'tvp_is_a_state_propaganda_service',
))
webpage = self._download_webpage( webpage = self._download_webpage(
'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id) ('https://www.tvp.pl/sess/TVPlayer2/api.php?id=%s'
+ '&@method=getTvpConfig&@callback=%s') % (video_id, callback), video_id)
error = self._html_search_regex( # stripping JSONP padding
r'(?s)<p[^>]+\bclass=["\']notAvailable__text["\'][^>]*>(.+?)</p>', datastr = webpage[15 + len(callback):-3]
webpage, 'error', default=None) or clean_html( if datastr.startswith('null,'):
get_element_by_attribute('class', 'msg error', webpage)) error = self._parse_json(datastr[5:], video_id)
if error: raise ExtractorError(error[0]['desc'])
raise ExtractorError('%s said: %s' % (
self.IE_NAME, clean_html(error)), expected=True)
title = self._search_regex( content = self._parse_json(datastr, video_id)['content']
r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1', info = content['info']
webpage, 'title', group='title') is_live = try_get(info, lambda x: x['isLive'], bool)
series_title = self._search_regex(
r'name\s*:\s*([\'"])SeriesTitle\1\s*,\s*value\s*:\s*\1(?P<series>.+?)\1',
webpage, 'series', group='series', default=None)
if series_title:
title = '%s, %s' % (series_title, title)
thumbnail = self._search_regex(
r"poster\s*:\s*'([^']+)'", webpage, 'thumbnail', default=None)
video_url = self._search_regex(
r'0:{src:([\'"])(?P<url>.*?)\1', webpage,
'formats', group='url', default=None)
if not video_url or 'material_niedostepny.mp4' in video_url:
video_url = self._download_json(
'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id,
video_id)['video_url']
formats = [] formats = []
video_url_base = self._search_regex( for file in content['files']:
r'(https?://.+?/video)(?:\.(?:ism|f4m|m3u8)|-\d+\.mp4)', video_url = file.get('url')
video_url, 'video base url', default=None) if not video_url:
if video_url_base: continue
# TODO: <Group> found instead of <AdaptationSet> in MPD manifest. if video_url.endswith('.m3u8'):
# It's not mentioned in MPEG-DASH standard. Figure that out. formats.extend(self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls', fatal=False, live=is_live))
# formats.extend(self._extract_mpd_formats( elif video_url.endswith('.mpd'):
# video_url_base + '.ism/video.mpd', if is_live:
# video_id, mpd_id='dash', fatal=False)) # doesn't work with either ffmpeg or native downloader
formats.extend(self._extract_ism_formats( continue
video_url_base + '.ism/Manifest', formats.extend(self._extract_mpd_formats(video_url, video_id, mpd_id='dash', fatal=False))
video_id, 'mss', fatal=False)) elif video_url.endswith('.f4m'):
formats.extend(self._extract_f4m_formats( formats.extend(self._extract_f4m_formats(video_url, video_id, f4m_id='hds', fatal=False))
video_url_base + '.ism/video.f4m', elif video_url.endswith('.ism/manifest'):
video_id, f4m_id='hds', fatal=False)) formats.extend(self._extract_ism_formats(video_url, video_id, ism_id='mss', fatal=False))
m3u8_formats = self._extract_m3u8_formats(
video_url_base + '.ism/video.m3u8', video_id,
'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
self._sort_formats(m3u8_formats)
m3u8_formats = list(filter(
lambda f: f.get('vcodec') != 'none', m3u8_formats))
formats.extend(m3u8_formats)
for i, m3u8_format in enumerate(m3u8_formats, 2):
http_url = '%s-%d.mp4' % (video_url_base, i)
if self._is_valid_url(http_url, video_id):
f = m3u8_format.copy()
f.update({
'url': http_url,
'format_id': f['format_id'].replace('hls', 'http'),
'protocol': 'http',
})
formats.append(f)
else: else:
formats = [{ # mp4, wmv or something
quality = file.get('quality', {})
formats.append({
'format_id': 'direct', 'format_id': 'direct',
'url': video_url, 'url': video_url,
'ext': determine_ext(video_url, 'mp4'), 'ext': determine_ext(video_url, file['type']),
}] 'fps': int_or_none(quality.get('fps')),
'tbr': int_or_none(quality.get('bitrate')),
'width': int_or_none(quality.get('width')),
'height': int_or_none(quality.get('height')),
})
self._sort_formats(formats) self._sort_formats(formats)
return { title = dict_get(info, ('subtitle', 'title', 'seoTitle'))
description = dict_get(info, ('description', 'seoDescription'))
thumbnails = []
for thumb in content.get('posters') or ():
thumb_url = thumb.get('src')
if not thumb_url or '{width}' in thumb_url or '{height}' in thumb_url:
continue
thumbnails.append({
'url': thumb.get('src'),
'width': thumb.get('width'),
'height': thumb.get('height'),
})
age_limit = try_get(info, lambda x: x['ageGroup']['minAge'], int)
if age_limit == 1:
age_limit = 0
duration = try_get(info, lambda x: x['duration'], int) if not is_live else None
subtitles = {}
for sub in content.get('subtitles') or []:
if not sub.get('url'):
continue
subtitles.setdefault(sub['lang'], []).append({
'url': sub['url'],
'ext': sub.get('type'),
})
info_dict = {
'id': video_id, 'id': video_id,
'title': title, 'title': title,
'thumbnail': thumbnail, 'description': description,
'thumbnails': thumbnails,
'age_limit': age_limit,
'is_live': is_live,
'duration': duration,
'formats': formats, 'formats': formats,
'subtitles': subtitles,
} }
# vod.tvp.pl
if info.get('vortalName') == 'vod':
info_dict.update({
'title': '%s, %s' % (info.get('title'), info.get('subtitle')),
'series': info.get('title'),
'season': info.get('season'),
'episode_number': info.get('episode'),
})
return info_dict
class TVPWebsiteIE(InfoExtractor): class TVPWebsiteIE(InfoExtractor):
IE_NAME = 'tvp:series' IE_NAME = 'tvp:series'