2015-07-01 23:13:27 -06:00
# coding: utf-8
2014-02-04 08:31:00 -07:00
from __future__ import unicode_literals
2013-08-22 03:57:21 -06:00
import re
from . common import InfoExtractor
2014-03-20 17:59:51 -06:00
from . . utils import (
2014-12-31 09:24:14 -07:00
ExtractorError ,
2015-05-01 05:43:06 -06:00
determine_ext ,
int_or_none ,
2014-09-28 20:48:50 -06:00
unified_strdate ,
2014-03-20 17:59:51 -06:00
US_RATINGS ,
)
2013-08-22 03:57:21 -06:00
class PBSIE(InfoExtractor):
    """Information extractor for PBS videos.

    Three URL shapes are accepted (see ``_VALID_URL``): direct
    ``video.pbs.org`` video URLs, ``pbs.org`` article pages that embed a
    player, and ``video.pbs.org`` partner player pages.
    """

    _VALID_URL = r'''(?x)https?://
        (?:
           # Direct video URL
           video\.pbs\.org/(?:viralplayer|video)/(?P<id>[0-9]+)/? |
           # Article with embedded player (or direct video)
           (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
           # Player
           video\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/
        )
    '''

    _TESTS = [
        {
            'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/',
            'md5': 'ce1888486f0908d555a8093cac9a7362',
            'info_dict': {
                'id': '2365006249',
                'ext': 'mp4',
                'title': 'Constitution USA with Peter Sagal - A More Perfect Union',
                'description': 'md5:ba0c207295339c8d6eced00b7c363c6a',
                'duration': 3190,
            },
            'params': {
                'skip_download': True,  # requires ffmpeg
            },
        },
        {
            'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/',
            'md5': '143c98aa54a346738a3d78f54c925321',
            'info_dict': {
                'id': '2365297690',
                'ext': 'mp4',
                'title': 'FRONTLINE - Losing Iraq',
                'description': 'md5:f5bfbefadf421e8bb8647602011caf8e',
                'duration': 5050,
            },
            'params': {
                'skip_download': True,  # requires ffmpeg
            }
        },
        {
            'url': 'http://www.pbs.org/newshour/bb/education-jan-june12-cyberschools_02-23/',
            'md5': 'b19856d7f5351b17a5ab1dc6a64be633',
            'info_dict': {
                'id': '2201174722',
                'ext': 'mp4',
                'title': 'PBS NewsHour - Cyber Schools Gain Popularity, but Quality Questions Persist',
                'description': 'md5:5871c15cba347c1b3d28ac47a73c7c28',
                'duration': 801,
            },
        },
        {
            'url': 'http://www.pbs.org/wnet/gperf/dudamel-conducts-verdi-requiem-hollywood-bowl-full-episode/3374/',
            'md5': 'c62859342be2a0358d6c9eb306595978',
            'info_dict': {
                'id': '2365297708',
                'ext': 'mp4',
                'description': 'md5:68d87ef760660eb564455eb30ca464fe',
                'title': 'Great Performances - Dudamel Conducts Verdi Requiem at the Hollywood Bowl - Full',
                'duration': 6559,
                'thumbnail': r're:^https?://.*\.jpg$',
            },
            'params': {
                'skip_download': True,  # requires ffmpeg
            },
        },
        {
            'url': 'http://www.pbs.org/wgbh/nova/earth/killer-typhoon.html',
            'md5': '908f3e5473a693b266b84e25e1cf9703',
            'info_dict': {
                'id': '2365160389',
                'display_id': 'killer-typhoon',
                'ext': 'mp4',
                'description': 'md5:c741d14e979fc53228c575894094f157',
                'title': 'NOVA - Killer Typhoon',
                'duration': 3172,
                'thumbnail': r're:^https?://.*\.jpg$',
                'upload_date': '20140122',
            },
            'params': {
                'skip_download': True,  # requires ffmpeg
            },
        },
        {
            'url': 'http://www.pbs.org/wgbh/pages/frontline/united-states-of-secrets/',
            'info_dict': {
                'id': 'united-states-of-secrets',
            },
            'playlist_count': 2,
        },
        {
            'url': 'http://www.pbs.org/wgbh/americanexperience/films/death/player/',
            'info_dict': {
                'id': '2280706814',
                'display_id': 'player',
                'ext': 'mp4',
                'title': 'American Experience - Death and the Civil War',
                'description': 'American Experience, TV’s most-watched history series, brings to life the compelling stories from our past that inform our understanding of the world today.',
                'duration': 6705,
                'thumbnail': r're:^https?://.*\.jpg$',
            },
            'params': {
                'skip_download': True,  # requires ffmpeg
            },
        },
        {
            'url': 'http://video.pbs.org/video/2365367186/',
            'info_dict': {
                'id': '2365367186',
                'display_id': '2365367186',
                'ext': 'mp4',
                'title': 'To Catch A Comet - Full Episode',
                'description': 'On November 12, 2014, billions of kilometers from Earth, spacecraft orbiter Rosetta and lander Philae did what no other had dared to attempt \u2014 land on the volatile surface of a comet as it zooms around the sun at 67,000 km/hr. The European Space Agency hopes this mission can help peer into our past and unlock secrets of our origins.',
                'duration': 3342,
                'thumbnail': r're:^https?://.*\.jpg$',
            },
            'params': {
                'skip_download': True,  # requires ffmpeg
            },
        }
    ]

    def _extract_webpage(self, url):
        """Resolve *url* to the PBS video ID(s) it refers to.

        Returns a ``(video_id, display_id, upload_date)`` tuple.
        ``video_id`` is a list of ID strings when the article page holds
        tabbed videos, and a single ID string otherwise. ``upload_date`` is
        only populated when the ID is read straight from an article page;
        the player-page and direct-URL paths return ``None`` for it.

        Raises ExtractorError when the player URL scraped from an article
        page is not one this extractor recognises.
        """
        mobj = re.match(self._VALID_URL, url)

        presumptive_id = mobj.group('presumptive_id')
        display_id = presumptive_id
        if presumptive_id:
            webpage = self._download_webpage(url, display_id)

            upload_date = unified_strdate(self._search_regex(
                r'<input type="hidden" id="air_date_[0-9]+" value="([^"]+)"',
                webpage, 'upload date', default=None))

            # tabbed frontline videos
            tabbed_videos = re.findall(
                r'<div[^>]+class="videotab[^"]*"[^>]+vid="(\d+)"', webpage)
            if tabbed_videos:
                return tabbed_videos, presumptive_id, upload_date

            MEDIA_ID_REGEXES = [
                r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'",  # frontline video embed
                r'class="coveplayerid">([^<]+)<',  # coveplayer
                r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>',  # jwplayer
            ]

            media_id = self._search_regex(
                MEDIA_ID_REGEXES, webpage, 'media ID', fatal=False, default=None)
            if media_id:
                return media_id, presumptive_id, upload_date

            # No inline media ID: fall back to the embedded partner player
            # iframe and continue with its URL instead of the article URL.
            url = self._search_regex(
                r'<iframe\s+[^>]*\s+src=["\']([^\'"]+partnerplayer[^\'"]+)["\']',
                webpage, 'player URL')
            mobj = re.match(self._VALID_URL, url)
            if mobj is None:
                # The iframe src comes from the page, so it is not guaranteed
                # to have a shape _VALID_URL accepts; fail with a clear
                # message rather than an AttributeError on mobj.group().
                raise ExtractorError('Unsupported player URL: %s' % url)

        player_id = mobj.group('player_id')
        if not display_id:
            display_id = player_id
        if player_id:
            player_page = self._download_webpage(
                url, display_id, note='Downloading player page',
                errnote='Could not download player page')
            video_id = self._search_regex(
                r'<div\s+id="video_([0-9]+)"', player_page, 'video ID')
        else:
            # Direct video URL: the numeric ID is part of the URL itself.
            video_id = mobj.group('id')
            display_id = video_id

        return video_id, display_id, None

    def _real_extract(self, url):
        """Extract metadata and formats for *url*.

        Returns a playlist result when the page resolves to several video
        IDs, otherwise an info dict for the single video.

        Raises ExtractorError (expected) when the PBS redirect endpoint
        reports an error for an encoding, e.g. a region restriction.
        """
        video_id, display_id, upload_date = self._extract_webpage(url)

        if isinstance(video_id, list):
            entries = [
                self.url_result(
                    'http://video.pbs.org/video/%s' % vid_id, 'PBS', vid_id)
                for vid_id in video_id]
            return self.playlist_result(entries, display_id)

        info = self._download_json(
            'http://video.pbs.org/videoInfo/%s?format=json&type=partner' % video_id,
            display_id)

        formats = []
        for encoding_name in ('recommended_encoding', 'alternate_encoding'):
            redirect = info.get(encoding_name)
            if not redirect:
                continue
            redirect_url = redirect.get('url')
            if not redirect_url:
                continue

            redirect_info = self._download_json(
                redirect_url + '?format=json', display_id,
                'Downloading %s video url info' % encoding_name)

            if redirect_info.get('status') == 'error':
                if redirect_info.get('http_code') == 403:
                    message = (
                        'The video is not available in your region due to '
                        'right restrictions')
                else:
                    message = redirect_info['message']
                raise ExtractorError(message, expected=True)

            format_url = redirect_info.get('url')
            if not format_url:
                continue

            if determine_ext(format_url) == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    format_url, display_id, 'mp4', preference=1, m3u8_id='hls'))
            else:
                formats.append({
                    'url': format_url,
                    'format_id': redirect.get('eeid'),
                })
        self._sort_formats(formats)

        rating_str = info.get('rating')
        if rating_str is not None:
            # Keep only the part after the last dash before the lookup.
            rating_str = rating_str.rpartition('-')[2]
        age_limit = US_RATINGS.get(rating_str)

        subtitles = {}
        closed_captions_url = info.get('closed_captions_url')
        if closed_captions_url:
            subtitles['en'] = [{
                'ext': 'ttml',
                'url': closed_captions_url,
            }]

        # 'program' is optional (and may be null); read it once so the title
        # and description lookups below agree on how to treat its absence.
        program = info.get('program') or {}

        # video.pbs.org/videoInfo/... frequently provides an obscure 'title'
        # value, like 'Full Episode', 'Episode 5', etc. Prepend the program
        # title, first stripping it from the start of the video title so it
        # is not duplicated.
        title = info['title']
        alt_title = program.get('title')
        if alt_title:
            # The program title is data, not a pattern: escape it so that
            # regex metacharacters in it are matched literally.
            title = alt_title + ' - ' + re.sub(
                r'^' + re.escape(alt_title) + r'[\s\-:]+', '', title)

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': program.get('description'),
            'thumbnail': info.get('image_url'),
            'duration': int_or_none(info.get('duration')),
            'age_limit': age_limit,
            'upload_date': upload_date,
            'formats': formats,
            'subtitles': subtitles,
        }