[sportbox:embed] Add extractor

This commit is contained in:
Sergey M․ 2015-05-15 22:50:44 +06:00
parent 6181864290
commit 3a7382950b
2 changed files with 82 additions and 61 deletions

View File

@ -502,7 +502,10 @@ from .spiegel import SpiegelIE, SpiegelArticleIE
from .spiegeltv import SpiegeltvIE from .spiegeltv import SpiegeltvIE
from .spike import SpikeIE from .spike import SpikeIE
from .sport5 import Sport5IE from .sport5 import Sport5IE
from .sportbox import SportBoxIE from .sportbox import (
SportBoxIE,
SportBoxEmbedIE,
)
from .sportdeutschland import SportDeutschlandIE from .sportdeutschland import SportDeutschlandIE
from .srf import SrfIE from .srf import SrfIE
from .srmediathek import SRMediathekIE from .srmediathek import SRMediathekIE

View File

@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import ( from ..utils import (
parse_duration, parse_duration,
parse_iso8601, parse_iso8601,
@ -12,8 +13,7 @@ from ..utils import (
class SportBoxIE(InfoExtractor): class SportBoxIE(InfoExtractor):
_VALID_URL = r'https?://news\.sportbox\.ru/(?:[^/]+/)+spbvideo_NI\d+_(?P<display_id>.+)' _VALID_URL = r'https?://news\.sportbox\.ru/(?:[^/]+/)+spbvideo_NI\d+_(?P<display_id>.+)'
_TESTS = [ _TESTS = [{
{
'url': 'http://news.sportbox.ru/Vidy_sporta/Avtosport/Rossijskij/spbvideo_NI483529_Gonka-2-zaezd-Obyedinenniy-2000-klassi-Turing-i-S', 'url': 'http://news.sportbox.ru/Vidy_sporta/Avtosport/Rossijskij/spbvideo_NI483529_Gonka-2-zaezd-Obyedinenniy-2000-klassi-Turing-i-S',
'md5': 'ff56a598c2cf411a9a38a69709e97079', 'md5': 'ff56a598c2cf411a9a38a69709e97079',
'info_dict': { 'info_dict': {
@ -30,30 +30,13 @@ class SportBoxIE(InfoExtractor):
# m3u8 download # m3u8 download
'skip_download': True, 'skip_download': True,
}, },
}, }, {
{
'url': 'http://news.sportbox.ru/video/no_ads/spbvideo_NI536574_V_Novorossijske_proshel_detskij_turnir_Pole_slavy_bojevoj?ci=211355',
'md5': 'ff56a598c2cf411a9a38a69709e97079',
'info_dict': {
'id': '211355',
'ext': 'mp4',
'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»',
'description': '16 детских коллективов приняли участие в суперфинале турнира «Поле славы боевой».',
'thumbnail': 're:^https?://.*\.jpg$',
'timestamp': 1426237001,
'upload_date': '20150313',
'duration': 292,
},
'params': {
# m3u8 download
'skip_download': True,
},
},
{
'url': 'http://news.sportbox.ru/Vidy_sporta/billiard/spbvideo_NI486287_CHempionat-mira-po-dinamichnoy-piramide-4', 'url': 'http://news.sportbox.ru/Vidy_sporta/billiard/spbvideo_NI486287_CHempionat-mira-po-dinamichnoy-piramide-4',
'only_matching': True, 'only_matching': True,
}, }, {
] 'url': 'http://news.sportbox.ru/video/no_ads/spbvideo_NI536574_V_Novorossijske_proshel_detskij_turnir_Pole_slavy_bojevoj?ci=211355',
'only_matching': True,
}]
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
@ -61,39 +44,74 @@ class SportBoxIE(InfoExtractor):
webpage = self._download_webpage(url, display_id) webpage = self._download_webpage(url, display_id)
sobj = re.search(r'src="/vdl/player/(?P<media_type>\w+)/(?P<video_id>\d+)"', webpage) player = self._search_regex(
if (sobj): r'src="/?(vdl/player/[^"]+)"', webpage, 'player')
video_id = sobj.group('video_id')
media_type = sobj.group('media_type')
else:
raise RegexNotFoundError('Unable to extract video_id')
player = self._download_webpage(
'http://news.sportbox.ru/vdl/player/%s/%s' % (media_type, video_id),
display_id, 'Downloading player webpage')
hls = self._search_regex(
r"sportboxPlayer\.jwplayer_common_params\.file\s*=\s*['\"]+([^\"]+)['\"]+", player, 'hls file')
formats = self._extract_m3u8_formats(hls, display_id, 'mp4')
title = self._html_search_regex( title = self._html_search_regex(
r'<h1 itemprop="name">([^<]+)</h1>', webpage, 'title') r'<h1 itemprop="name">([^<]+)</h1>', webpage, 'title')
description = self._html_search_regex( description = self._html_search_regex(
r'(?s)<div itemprop="description">(.+?)</div>', webpage, 'description', fatal=False) r'(?s)<div itemprop="description">(.+?)</div>',
webpage, 'description', fatal=False)
thumbnail = self._og_search_thumbnail(webpage) thumbnail = self._og_search_thumbnail(webpage)
timestamp = parse_iso8601(self._search_regex( timestamp = parse_iso8601(self._search_regex(
r'<span itemprop="uploadDate">([^<]+)</span>', webpage, 'timestamp', fatal=False)) r'<span itemprop="uploadDate">([^<]+)</span>',
webpage, 'timestamp', fatal=False))
duration = parse_duration(self._html_search_regex( duration = parse_duration(self._html_search_regex(
r'<meta itemprop="duration" content="PT([^"]+)">', webpage, 'duration', fatal=False)) r'<meta itemprop="duration" content="PT([^"]+)">',
webpage, 'duration', fatal=False))
return { return {
'id': video_id, '_type': 'url_transparent',
'url': compat_urlparse.urljoin(url, '/%s' % player),
'display_id': display_id, 'display_id': display_id,
'title': title, 'title': title,
'description': description, 'description': description,
'thumbnail': thumbnail, 'thumbnail': thumbnail,
'timestamp': timestamp, 'timestamp': timestamp,
'duration': duration, 'duration': duration,
}
class SportBoxEmbedIE(InfoExtractor):
    """Extractor for the sportbox.ru player page served under /vdl/player.

    _VALID_URL accepts two URL shapes and captures the numeric id from either:
    a path form (``/vdl/player/<segment>/<id>``) and a query-string form
    (``/vdl/player?...id=<id>`` or ``...nid=<id>``).
    """

    _VALID_URL = r'https?://news\.sportbox\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://news.sportbox.ru/vdl/player/ci/211355',
        'info_dict': {
            'id': '211355',
            'ext': 'mp4',
            'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»',
            # Raw string: '\.' is a regex escape here, not a Python string
            # escape; a non-raw literal triggers an invalid-escape warning.
            'thumbnail': r're:^https?://.*\.jpg$',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        'url': 'http://news.sportbox.ru/vdl/player?nid=370908&only_player=1&autostart=false&playeri=2&height=340&width=580',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict for a player page URL.

        Downloads the page at ``url`` and reads the values assigned to the
        ``sportboxPlayer`` JavaScript object in it: the playlist URL
        (``jwplayer_common_params.file``), the title (``node_title``) and the
        thumbnail (``jwplayer_common_params.image``).

        The playlist and title lookups use the default (mandatory) mode of
        ``_search_regex``; only the thumbnail lookup passes ``fatal=False``.

        Returns a dict with the keys ``id``, ``title``, ``thumbnail`` and
        ``formats``.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        # The playlist URL may be wrapped in either single or double quotes.
        hls = self._search_regex(
            r"sportboxPlayer\.jwplayer_common_params\.file\s*=\s*['\"]([^'\"]+)['\"]",
            webpage, 'hls file')

        formats = self._extract_m3u8_formats(hls, video_id, 'mp4')

        title = self._search_regex(
            r'sportboxPlayer\.node_title\s*=\s*"([^"]+)"', webpage, 'title')

        # Thumbnail is optional metadata, so a miss must not abort extraction.
        thumbnail = self._search_regex(
            r'sportboxPlayer\.jwplayer_common_params\.image\s*=\s*"([^"]+)"',
            webpage, 'thumbnail', fatal=False)

        return {
            'id': video_id,
            'title': title,
            'thumbnail': thumbnail,
            'formats': formats,
        }