[tagesschau] Add support for sendung (Fixes #4378)

This commit is contained in:
Philipp Hagemeister 2014-12-06 00:42:39 +01:00
parent 90644a6843
commit 045c48847a
1 changed file with 79 additions and 22 deletions

View File

@ -4,10 +4,11 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import parse_filesize
class TagesschauIE(InfoExtractor): class TagesschauIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/video/video(?P<id>-?[0-9]+)\.html' _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:sendung/ts|video/video)(?P<id>-?[0-9]+)\.html'
_TESTS = [{ _TESTS = [{
'url': 'http://www.tagesschau.de/multimedia/video/video1399128.html', 'url': 'http://www.tagesschau.de/multimedia/video/video1399128.html',
@ -19,6 +20,16 @@ class TagesschauIE(InfoExtractor):
'description': 'md5:69da3c61275b426426d711bde96463ab', 'description': 'md5:69da3c61275b426426d711bde96463ab',
'thumbnail': 're:^http:.*\.jpg$', 'thumbnail': 're:^http:.*\.jpg$',
}, },
}, {
'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html',
'md5': '3c54c1f6243d279b706bde660ceec633',
'info_dict': {
'id': '5727',
'ext': 'mp4',
'description': 'md5:695c01bfd98b7e313c501386327aea59',
'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr',
'thumbnail': 're:^http:.*\.jpg$',
}
}] }]
_FORMATS = { _FORMATS = {
@ -32,32 +43,78 @@ class TagesschauIE(InfoExtractor):
display_id = video_id.lstrip('-') display_id = video_id.lstrip('-')
webpage = self._download_webpage(url, display_id) webpage = self._download_webpage(url, display_id)
playerpage = self._download_webpage( player_url = self._html_search_meta(
'http://www.tagesschau.de/multimedia/video/video%s~player_autoplay-true.html' % video_id, 'twitter:player', webpage, 'player URL', default=None)
display_id, 'Downloading player page') if player_url:
playerpage = self._download_webpage(
player_url, display_id, 'Downloading player page')
medias = re.findall( medias = re.findall(
r'"(http://media.+?)", type:"video/(.+?)", quality:"(.+?)"', r'"(http://media.+?)", type:"video/(.+?)", quality:"(.+?)"',
playerpage) playerpage)
formats = []
formats = [] for url, ext, res in medias:
for url, ext, res in medias: f = {
f = { 'format_id': res + '_' + ext,
'format_id': res + '_' + ext, 'url': url,
'url': url, 'ext': ext,
'ext': ext, }
} f.update(self._FORMATS.get(res, {}))
f.update(self._FORMATS.get(res, {})) formats.append(f)
formats.append(f) thumbnail_fn = re.findall(r'"(/multimedia/.+?\.jpg)"', playerpage)[-1]
title = self._og_search_title(webpage).strip()
description = self._og_search_description(webpage).strip()
else:
download_text = self._search_regex(
r'(?s)<p>Wir bieten dieses Video in folgenden Formaten zum Download an:</p>\s*<div class="controls">(.*?)</div>\s*<p>',
webpage, 'download links')
links = re.finditer(
r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>',
webpage)
formats = []
for l in links:
format_id = self._search_regex(
r'.*/[^/.]+\.([^/]+)\.[^/.]+', l.group('url'), 'format ID')
format = {
'format_id': format_id,
'url': l.group('url'),
'format_name': l.group('name'),
}
m = re.match(
r'''(?x)
Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10;
(?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10;
(?P<vbr>[0-9]+)kbps&\#10;
Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10;
Gr&ouml;&szlig;e:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''',
l.group('title'))
if m:
format.update({
'format_note': m.group('audio_desc'),
'vcodec': m.group('vcodec'),
'width': int(m.group('width')),
'height': int(m.group('height')),
'abr': int(m.group('abr')),
'vbr': int(m.group('vbr')),
'filesize_approx': parse_filesize(m.group('filesize_approx')),
})
formats.append(format)
thumbnail_fn = self._search_regex(
r'(?s)<img alt="Sendungsbild".*?src="([^"]+)"',
webpage, 'thumbnail', fatal=False)
description = self._html_search_regex(
r'(?s)<p class="teasertext">(.*?)</p>',
webpage, 'description', fatal=False)
title = self._html_search_regex(
r'<span class="headline".*?>(.*?)</span>', webpage, 'title')
self._sort_formats(formats) self._sort_formats(formats)
thumbnail = 'http://www.tagesschau.de' + thumbnail_fn
thumbnail = re.findall(r'"(/multimedia/.+?\.jpg)"', playerpage)[-1]
return { return {
'id': display_id, 'id': display_id,
'title': self._og_search_title(webpage).strip(), 'title': title,
'thumbnail': 'http://www.tagesschau.de' + thumbnail, 'thumbnail': thumbnail,
'formats': formats, 'formats': formats,
'description': self._og_search_description(webpage).strip(), 'description': description,
} }