Merge pull request #1705 from iemejia/master
[ted] support for subtitles
This commit is contained in:
commit
231516b6c9
|
@ -0,0 +1,69 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import unittest
|
||||||
|
import hashlib
|
||||||
|
|
||||||
|
# Allow direct execution
|
||||||
|
import os
|
||||||
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
|
||||||
|
from youtube_dl.extractor import TEDIE
|
||||||
|
from youtube_dl.utils import *
|
||||||
|
from helper import FakeYDL
|
||||||
|
|
||||||
|
def md5(s):
    """Return the hexadecimal MD5 digest of the text *s*, encoded as UTF-8."""
    return hashlib.md5(s.encode('utf-8')).hexdigest()
|
||||||
|
|
||||||
|
class TestTedSubtitles(unittest.TestCase):
    """Subtitle-extraction tests for the TED extractor (``TEDIE``).

    Each test runs ``TEDIE`` against one fixed talk URL through a ``FakeYDL``
    downloader whose ``params`` select the subtitle behaviour being checked.
    NOTE(review): extraction presumably needs network access to ted.com.
    """

    def setUp(self):
        # unittest calls setUp before every test, so each test starts from a
        # new FakeYDL and params set by one test cannot leak into another.
        self.DL = FakeYDL()
        self.url = 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html'

    def getInfoDict(self):
        """Run the extractor on ``self.url`` and return its raw result."""
        IE = TEDIE(self.DL)
        info_dict = IE.extract(self.url)
        return info_dict

    def getSubtitles(self):
        """Return the ``'subtitles'`` entry of the first extracted result."""
        info_dict = self.getInfoDict()
        return info_dict[0]['subtitles']

    def test_no_writesubtitles(self):
        """Without any subtitle option the extractor reports ``None``."""
        subtitles = self.getSubtitles()
        self.assertEqual(subtitles, None)

    def test_subtitles(self):
        """``writesubtitles`` alone yields the English track."""
        self.DL.params['writesubtitles'] = True
        subtitles = self.getSubtitles()
        self.assertEqual(md5(subtitles['en']), '2154f31ff9b9f89a0aa671537559c21d')

    def test_subtitles_lang(self):
        """A single requested language is returned under its own key."""
        self.DL.params['writesubtitles'] = True
        self.DL.params['subtitleslangs'] = ['fr']
        subtitles = self.getSubtitles()
        self.assertEqual(md5(subtitles['fr']), '7616cbc6df20ec2c1204083c83871cf6')

    def test_allsubtitles(self):
        """``allsubtitles`` returns every language offered for the talk."""
        self.DL.params['writesubtitles'] = True
        self.DL.params['allsubtitles'] = True
        subtitles = self.getSubtitles()
        # assertEqual on the count reports the actual number on failure.
        self.assertEqual(len(subtitles), 28)

    def test_list_subtitles(self):
        """``listsubtitles`` only lists languages; no info dict is produced."""
        self.DL.params['listsubtitles'] = True
        info_dict = self.getInfoDict()
        self.assertEqual(info_dict, [None])

    def test_automatic_captions(self):
        """Requesting only automatic captions yields an empty mapping."""
        self.DL.params['writeautomaticsub'] = True
        # Was 'subtitleslang' (missing trailing "s"): a key no other test in
        # this class uses, so the language filter was silently ignored.
        self.DL.params['subtitleslangs'] = ['en']
        subtitles = self.getSubtitles()
        self.assertEqual(len(subtitles), 0)

    # NOTE(review): disabled in the original; the reason is not recorded here.
    # def test_nosubtitles(self):
    #     self.DL.expect_warning(u'video doesn\'t have subtitles')
    #     self.url = 'http://www.ted.com/talks/rodrigo_canales_the_deadly_genius_of_drug_cartels.html'
    #     self.DL.params['writesubtitles'] = True
    #     self.DL.params['allsubtitles'] = True
    #     subtitles = self.getSubtitles()

    def test_multiple_langs(self):
        """Every language in a multi-language request must be extracted."""
        self.DL.params['writesubtitles'] = True
        langs = ['es', 'fr', 'de']
        self.DL.params['subtitleslangs'] = langs
        subtitles = self.getSubtitles()
        for lang in langs:
            self.assertTrue(subtitles.get(lang) is not None, u'Subtitles for \'%s\' not extracted' % lang)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
unittest.main()
|
|
@ -141,9 +141,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
|
||||||
raise ExtractorError(u'Unable to extract video URL')
|
raise ExtractorError(u'Unable to extract video URL')
|
||||||
|
|
||||||
# subtitles
|
# subtitles
|
||||||
video_subtitles = self.extract_subtitles(video_id)
|
video_subtitles = self.extract_subtitles(video_id, webpage)
|
||||||
if self._downloader.params.get('listsubtitles', False):
|
if self._downloader.params.get('listsubtitles', False):
|
||||||
self._list_available_subtitles(video_id)
|
self._list_available_subtitles(video_id, webpage)
|
||||||
return
|
return
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
@ -157,7 +157,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
|
||||||
'age_limit': age_limit,
|
'age_limit': age_limit,
|
||||||
}
|
}
|
||||||
|
|
||||||
def _get_available_subtitles(self, video_id):
|
def _get_available_subtitles(self, video_id, webpage):
|
||||||
try:
|
try:
|
||||||
sub_list = self._download_webpage(
|
sub_list = self._download_webpage(
|
||||||
'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id,
|
'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id,
|
||||||
|
|
|
@ -12,9 +12,9 @@ class SubtitlesInfoExtractor(InfoExtractor):
|
||||||
return any([self._downloader.params.get('writesubtitles', False),
|
return any([self._downloader.params.get('writesubtitles', False),
|
||||||
self._downloader.params.get('writeautomaticsub')])
|
self._downloader.params.get('writeautomaticsub')])
|
||||||
|
|
||||||
def _list_available_subtitles(self, video_id, webpage=None):
|
def _list_available_subtitles(self, video_id, webpage):
|
||||||
""" outputs the available subtitles for the video """
|
""" outputs the available subtitles for the video """
|
||||||
sub_lang_list = self._get_available_subtitles(video_id)
|
sub_lang_list = self._get_available_subtitles(video_id, webpage)
|
||||||
auto_captions_list = self._get_available_automatic_caption(video_id, webpage)
|
auto_captions_list = self._get_available_automatic_caption(video_id, webpage)
|
||||||
sub_lang = ",".join(list(sub_lang_list.keys()))
|
sub_lang = ",".join(list(sub_lang_list.keys()))
|
||||||
self.to_screen(u'%s: Available subtitles for video: %s' %
|
self.to_screen(u'%s: Available subtitles for video: %s' %
|
||||||
|
@ -23,7 +23,7 @@ class SubtitlesInfoExtractor(InfoExtractor):
|
||||||
self.to_screen(u'%s: Available automatic captions for video: %s' %
|
self.to_screen(u'%s: Available automatic captions for video: %s' %
|
||||||
(video_id, auto_lang))
|
(video_id, auto_lang))
|
||||||
|
|
||||||
def extract_subtitles(self, video_id, video_webpage=None):
|
def extract_subtitles(self, video_id, webpage):
|
||||||
"""
|
"""
|
||||||
returns {sub_lang: sub} ,{} if subtitles not found or None if the
|
returns {sub_lang: sub} ,{} if subtitles not found or None if the
|
||||||
subtitles aren't requested.
|
subtitles aren't requested.
|
||||||
|
@ -32,9 +32,9 @@ class SubtitlesInfoExtractor(InfoExtractor):
|
||||||
return None
|
return None
|
||||||
available_subs_list = {}
|
available_subs_list = {}
|
||||||
if self._downloader.params.get('writeautomaticsub', False):
|
if self._downloader.params.get('writeautomaticsub', False):
|
||||||
available_subs_list.update(self._get_available_automatic_caption(video_id, video_webpage))
|
available_subs_list.update(self._get_available_automatic_caption(video_id, webpage))
|
||||||
if self._downloader.params.get('writesubtitles', False):
|
if self._downloader.params.get('writesubtitles', False):
|
||||||
available_subs_list.update(self._get_available_subtitles(video_id))
|
available_subs_list.update(self._get_available_subtitles(video_id, webpage))
|
||||||
|
|
||||||
if not available_subs_list: # error, it didn't get the available subtitles
|
if not available_subs_list: # error, it didn't get the available subtitles
|
||||||
return {}
|
return {}
|
||||||
|
@ -74,7 +74,7 @@ class SubtitlesInfoExtractor(InfoExtractor):
|
||||||
return
|
return
|
||||||
return sub
|
return sub
|
||||||
|
|
||||||
def _get_available_subtitles(self, video_id):
|
def _get_available_subtitles(self, video_id, webpage):
|
||||||
"""
|
"""
|
||||||
returns {sub_lang: url} or {} if not available
|
returns {sub_lang: url} or {} if not available
|
||||||
Must be redefined by the subclasses
|
Must be redefined by the subclasses
|
||||||
|
|
|
@ -1,10 +1,14 @@
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from .common import InfoExtractor
|
from .subtitles import SubtitlesInfoExtractor
|
||||||
|
|
||||||
|
from ..utils import (
|
||||||
|
compat_str,
|
||||||
|
RegexNotFoundError,
|
||||||
|
)
|
||||||
|
|
||||||
class TEDIE(InfoExtractor):
|
class TEDIE(SubtitlesInfoExtractor):
|
||||||
_VALID_URL=r'''http://www\.ted\.com/
|
_VALID_URL=r'''http://www\.ted\.com/
|
||||||
(
|
(
|
||||||
((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
|
((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
|
||||||
|
@ -82,11 +86,21 @@ class TEDIE(InfoExtractor):
|
||||||
'url': stream['file'],
|
'url': stream['file'],
|
||||||
'format': stream['id']
|
'format': stream['id']
|
||||||
} for stream in info['htmlStreams']]
|
} for stream in info['htmlStreams']]
|
||||||
|
|
||||||
|
video_id = info['id']
|
||||||
|
|
||||||
|
# subtitles
|
||||||
|
video_subtitles = self.extract_subtitles(video_id, webpage)
|
||||||
|
if self._downloader.params.get('listsubtitles', False):
|
||||||
|
self._list_available_subtitles(video_id, webpage)
|
||||||
|
return
|
||||||
|
|
||||||
info = {
|
info = {
|
||||||
'id': info['id'],
|
'id': video_id,
|
||||||
'title': title,
|
'title': title,
|
||||||
'thumbnail': thumbnail,
|
'thumbnail': thumbnail,
|
||||||
'description': desc,
|
'description': desc,
|
||||||
|
'subtitles': video_subtitles,
|
||||||
'formats': formats,
|
'formats': formats,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -94,3 +108,17 @@ class TEDIE(InfoExtractor):
|
||||||
info.update(info['formats'][-1])
|
info.update(info['formats'][-1])
|
||||||
|
|
||||||
return info
|
return info
|
||||||
|
|
||||||
|
def _get_available_subtitles(self, video_id, webpage):
    """Map each subtitle language offered on the talk page to its SRT URL.

    The languages are scraped from the page's
    ``subtitles_language_select`` drop-down.

    Args:
        video_id: talk id interpolated into every subtitle URL.
        webpage: HTML of the talk page.

    Returns:
        ``{lang: url}``; ``{}`` when the drop-down is missing (a warning is
        reported first) or when it lists no ``<option>`` values.
    """
    # Only the drop-down lookup can raise RegexNotFoundError, so the try
    # block is limited to it.
    try:
        options = self._search_regex(r'(?:<select name="subtitles_language_select" id="subtitles_language_select">)(.*?)(?:</select>)', webpage, 'subtitles_language_select', flags=re.DOTALL)
    except RegexNotFoundError:
        self._downloader.report_warning(u'video doesn\'t have subtitles')
        return {}

    languages = re.findall(r'(?:<option value=")(\S+)"', options)
    # dict() over a generator rather than a dict comprehension, so no newer
    # syntax is required; an empty language list yields {}.
    return dict(
        (lang, 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, lang))
        for lang in languages)
|
||||||
|
|
|
@ -1082,7 +1082,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
|
||||||
else:
|
else:
|
||||||
raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
|
raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
|
||||||
|
|
||||||
def _get_available_subtitles(self, video_id):
|
def _get_available_subtitles(self, video_id, webpage):
|
||||||
try:
|
try:
|
||||||
sub_list = self._download_webpage(
|
sub_list = self._download_webpage(
|
||||||
'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
|
'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
|
||||||
|
|
Loading…
Reference in New Issue