2014-03-22 16:05:31 -06:00
from __future__ import unicode_literals
2013-06-23 12:07:51 -06:00
import re
from . common import InfoExtractor
2014-12-13 04:24:42 -07:00
from . . compat import (
2013-06-23 12:07:51 -06:00
compat_parse_qs ,
2015-07-17 11:41:47 -06:00
compat_urllib_parse_unquote ,
2014-12-13 04:24:42 -07:00
)
from . . utils import (
2013-07-16 17:14:30 -06:00
determine_ext ,
2013-06-23 12:07:51 -06:00
ExtractorError ,
2014-08-21 17:36:07 -06:00
int_or_none ,
2016-03-25 14:19:24 -06:00
urlencode_postdata ,
2016-07-05 19:19:55 -06:00
get_element_by_attribute ,
mimetype2ext ,
2013-06-23 12:07:51 -06:00
)
2014-03-22 16:05:31 -06:00
class MetacafeIE(InfoExtractor):
    """Information extractor for metacafe.com watch pages.

    Videos whose id carries a ``yt-`` or ``cb-`` prefix are delegated to the
    Youtube and ThePlatform extractors respectively.  For everything else the
    watch page is downloaded and several page layouts are probed in turn until
    a media URL (or a list of formats) is found.
    """

    _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/(?P<video_id>[^/]+)/(?P<display_id>[^/?#]+)'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = 'metacafe'
    _TESTS = [
        # Youtube video
        {
            'add_ie': ['Youtube'],
            'url': 'http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/',
            'info_dict': {
                'id': '_aUehQsCQtM',
                'ext': 'mp4',
                'upload_date': '20090102',
                'title': 'The Electric Company | "Short I" | PBS KIDS GO!',
                'description': 'md5:2439a8ef6d5a70e380c22f5ad323e5a8',
                'uploader': 'PBS',
                'uploader_id': 'PBS'
            }
        },
        # Normal metacafe video
        {
            'url': 'http://www.metacafe.com/watch/11121940/news_stuff_you_wont_do_with_your_playstation_4/',
            'md5': '6e0bca200eaad2552e6915ed6fd4d9ad',
            'info_dict': {
                'id': '11121940',
                'ext': 'mp4',
                'title': 'News: Stuff You Won\'t Do with Your PlayStation 4',
                'uploader': 'ign',
                'description': 'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.',
            },
            'skip': 'Page is temporarily unavailable.',
        },
        # AnyClip video
        {
            'url': 'http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/',
            'info_dict': {
                'id': 'an-dVVXnuY7Jh77J',
                'ext': 'mp4',
                'title': 'The Andromeda Strain (1971): Stop the Bomb Part 3',
                'uploader': 'AnyClip',
                'description': 'md5:cbef0460d31e3807f6feb4e7a5952e5b',
            },
        },
        # age-restricted video
        {
            'url': 'http://www.metacafe.com/watch/5186653/bbc_internal_christmas_tape_79_uncensored_outtakes_etc/',
            'md5': '98dde7c1a35d02178e8ab7560fe8bd09',
            'info_dict': {
                'id': '5186653',
                'ext': 'mp4',
                'title': 'BBC INTERNAL Christmas Tape \'79 - UNCENSORED Outtakes, Etc.',
                'uploader': 'Dwayne Pipe',
                'description': 'md5:950bf4c581e2c059911fa3ffbe377e4b',
                'age_limit': 18,
            },
        },
        # cbs video
        {
            'url': 'http://www.metacafe.com/watch/cb-8VD4r_Zws8VP/open_this_is_face_the_nation_february_9/',
            'info_dict': {
                'id': '8VD4r_Zws8VP',
                'ext': 'flv',
                'title': 'Open: This is Face the Nation, February 9',
                'description': 'md5:8a9ceec26d1f7ed6eab610834cc1a476',
                'duration': 96,
                'uploader': 'CBSI-NEW',
                'upload_date': '20140209',
                'timestamp': 1391959800,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
        },
        # Movieclips.com video
        {
            'url': 'http://www.metacafe.com/watch/mv-Wy7ZU/my_week_with_marilyn_do_you_love_me/',
            'info_dict': {
                'id': 'mv-Wy7ZU',
                'ext': 'mp4',
                'title': 'My Week with Marilyn - Do You Love Me?',
                'description': 'From the movie My Week with Marilyn - Colin (Eddie Redmayne) professes his love to Marilyn (Michelle Williams) and gets her to promise to return to set and finish the movie.',
                'uploader': 'movie_trailers',
                'duration': 176,
            },
            'params': {
                'skip_download': 'requires rtmpdump',
            }
        }
    ]

    def report_disclaimer(self):
        """Print a status line announcing the disclaimer download."""
        self.to_screen('Retrieving disclaimer')

    def _confirm_age(self):
        """Fetch the family-filter disclaimer and submit the over-18 form.

        Both requests are issued only for their effect on the session; the
        downloaded pages are discarded.  Note that ``_real_extract`` does not
        currently call this method (the call there is commented out).
        """
        # Retrieve disclaimer
        self.report_disclaimer()
        self._download_webpage(self._DISCLAIMER, None, False, 'Unable to retrieve disclaimer')

        # Confirm age
        self.report_age_confirmation()
        self._download_webpage(
            self._FILTER_POST, None, False, 'Unable to confirm age',
            data=urlencode_postdata({
                'filters': '0',
                'submit': "Continue - I'm over 18",
            }), headers={
                'Content-Type': 'application/x-www-form-urlencoded',
            })

    def _real_extract(self, url):
        """Extract the info dict for a metacafe watch URL.

        Returns either a ``url_result`` pointing at another extractor (for
        ``yt-`` and ``cb-`` prefixed ids) or a dict with the keys ``id``,
        ``display_id``, ``description``, ``uploader``, ``title``,
        ``thumbnail``, ``age_limit``, ``formats`` and ``duration``.

        Raises ExtractorError when the page reports a "not found" title, when
        the ``flashvars`` form field lacks usable media data, or when none of
        the probed layouts yields a media URL.
        """
        # Extract id and simplified title from URL
        video_id, display_id = re.match(self._VALID_URL, url).groups()

        # the video may come from an external site
        m_external = re.match(r'^(\w{2})-(.*)$', video_id)
        if m_external is not None:
            prefix, ext_id = m_external.groups()
            # Check if video comes from YouTube
            if prefix == 'yt':
                return self.url_result('http://www.youtube.com/watch?v=%s' % ext_id, 'Youtube')
            # CBS videos use theplatform.com
            if prefix == 'cb':
                return self.url_result('theplatform:%s' % ext_id, 'ThePlatform')

        # self._confirm_age()

        # AnyClip videos require the flashversion cookie so that we get the link
        # to the mp4 file
        headers = {}
        if video_id.startswith('an-'):
            headers['Cookie'] = 'flashVersion=0;'

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id, headers=headers)

        error = get_element_by_attribute(
            'class', 'notfound-page-title', webpage)
        if error:
            raise ExtractorError(error, expected=True)

        # Prefer the meta tags; fall back to the first <h1> on the page.
        video_title = self._html_search_meta(
            ['og:title', 'twitter:title'], webpage, 'title',
            default=None) or self._search_regex(
                r'<h1>(.*?)</h1>', webpage, 'title')

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)

        # The layouts below are tried in order; each one runs only while
        # video_url is still None.  video_url ends up as either a single URL
        # string (paired with video_ext) or a list of format dicts.
        video_url = None

        # Layout 1: URL-encoded "&mediaURL=" / "&videoURL=" parameter.
        mobj = re.search(r'(?m)&(?:media|video)URL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse_unquote(mobj.group(1))
            video_ext = determine_ext(mediaURL)

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        # Layout 2: plain HTML5 <video src="..."> tag.
        if video_url is None:
            mobj = re.search(r'<video src="([^"]+)"', webpage)
            if mobj:
                video_url = mobj.group(1)
                video_ext = 'mp4'

        # Layout 3: query-string encoded "flashvars" form value carrying a
        # mediaData entry with the URL and its access key.
        if video_url is None:
            flashvars = self._search_regex(
                r'name="flashvars" value="(.*?)"', webpage, 'flashvars',
                default=None)
            if flashvars:
                vardict = compat_parse_qs(flashvars)
                if 'mediaData' not in vardict:
                    raise ExtractorError('Unable to extract media URL')
                mobj = re.search(
                    r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"',
                    vardict['mediaData'][0])
                if mobj is None:
                    raise ExtractorError('Unable to extract media URL')
                # Undo the escaped slashes in the embedded URL.
                mediaURL = mobj.group('mediaURL').replace('\\/', '/')
                video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
                video_ext = determine_ext(video_url)

        # Layout 4: SWF player whose config XML points at a SMIL document.
        if video_url is None:
            player_url = self._search_regex(
                r"swfobject\.embedSWF\('([^']+)'",
                webpage, 'config URL', default=None)
            if player_url:
                config_url = self._search_regex(
                    r'config=(.+)$', player_url, 'config URL')
                config_doc = self._download_xml(
                    config_url, video_id,
                    note='Downloading video config')
                smil_url = config_doc.find('.//properties').attrib['smil_file']
                smil_doc = self._download_xml(
                    smil_url, video_id,
                    note='Downloading SMIL document')
                base_url = smil_doc.find('./head/meta').attrib['base']
                video_url = []
                for vn in smil_doc.findall('.//video'):
                    br = int(vn.attrib['system-bitrate'])
                    play_path = vn.attrib['src']
                    video_url.append({
                        'format_id': 'smil-%d' % br,
                        'url': base_url,
                        'play_path': play_path,
                        'page_url': url,
                        'player_url': player_url,
                        # the part of src before the first ':' is used as
                        # the extension
                        'ext': play_path.partition(':')[0],
                    })

        # Layout 5: JSON "flashvars = {...};" assignment listing sources.
        if video_url is None:
            flashvars_json = self._search_regex(
                r'flashvars\s*=\s*({.*});', webpage, 'flashvars',
                default=None)
            # Parse only when the pattern matched, so that a page without
            # this assignment falls through to the "Unsupported video type"
            # error below instead of handing None to the JSON parser.
            flashvars = None
            if flashvars_json:
                flashvars = self._parse_json(
                    flashvars_json, video_id, fatal=False)
            if flashvars:
                video_url = []
                # 'sources' may be missing; treat that as an empty list.
                for source in flashvars.get('sources') or []:
                    source_url = source.get('src')
                    if not source_url:
                        continue
                    ext = mimetype2ext(source.get('type')) or determine_ext(source_url)
                    if ext == 'm3u8':
                        video_url.extend(self._extract_m3u8_formats(
                            source_url, video_id, 'mp4',
                            'm3u8_native', m3u8_id='hls', fatal=False))
                    else:
                        video_url.append({
                            'url': source_url,
                            'ext': ext,
                        })

        if video_url is None:
            raise ExtractorError('Unsupported video type')

        description = self._html_search_meta(
            ['og:description', 'twitter:description', 'description'],
            webpage, 'description', fatal=False)
        thumbnail = self._html_search_meta(
            ['og:image', 'twitter:image'], webpage, 'thumbnail', fatal=False)
        video_uploader = self._html_search_regex(
            r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);',
            webpage, 'uploader nickname', fatal=False)
        duration = int_or_none(
            self._html_search_meta('video:duration', webpage, default=None))
        age_limit = (
            18
            if re.search(r'(?:"contentRating":|"rating",)"restricted"', webpage)
            else 0)

        if isinstance(video_url, list):
            formats = video_url
        else:
            formats = [{
                'url': video_url,
                'ext': video_ext,
            }]
        self._sort_formats(formats)

        return {
            'id': video_id,
            'display_id': display_id,
            'description': description,
            'uploader': video_uploader,
            'title': video_title,
            'thumbnail': thumbnail,
            'age_limit': age_limit,
            'formats': formats,
            'duration': duration,
        }