2013-09-10 03:19:58 -06:00
import json
2024-06-11 17:09:58 -06:00
import urllib . parse
2013-09-10 03:19:58 -06:00
from . common import InfoExtractor
2014-12-13 04:24:42 -07:00
from . . utils import (
2013-09-10 03:19:58 -06:00
ExtractorError ,
2016-07-05 05:01:04 -06:00
get_element_by_id ,
2013-09-10 03:19:58 -06:00
)
class SlideshareIE(InfoExtractor):
    """Extractor for video presentations hosted on slideshare.net.

    Only pages whose embedded ``slideshare_object`` reports a slideshow of
    type ``video`` are supported; any other type is rejected with an
    expected ``ExtractorError``.
    """

    # The ``title`` group captures the URL slug that follows the first path
    # segment; it is passed to ``_download_webpage`` alongside the URL.
    _VALID_URL = r'https?://(?:www\.)?slideshare\.net/[^/]+?/(?P<title>.+?)($|\?)'

    _TEST = {
        'url': 'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity',
        'info_dict': {
            'id': '25665706',
            'ext': 'mp4',
            'title': 'Managing Scale and Complexity',
            'description': 'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). This presentation was given by Adrian Cockcroft from Netflix.',
        },
    }

    def _real_extract(self, url):
        """Build the info dict for a Slideshare video page.

        Downloads the page, parses the JSON object handed to ``$.extend``
        next to ``slideshare_object`` and derives the media URL from the
        player's bucket, document name and file extension.

        Raises:
            ExtractorError: (expected) when the slideshow type is not
                ``video``.
        """
        mobj = self._match_valid_url(url)
        page_title = mobj.group('title')
        webpage = self._download_webpage(url, page_title)

        slideshare_obj = self._search_regex(
            r'\$\.extend\(.*?slideshare_object,\s*(\{.*?\})\);',
            webpage, 'slideshare object')
        info = json.loads(slideshare_obj)

        # Bind the sub-dict once; it is consulted for type, id, title and
        # thumbnail below.
        slideshow = info['slideshow']
        if slideshow['type'] != 'video':
            raise ExtractorError(
                'Webpage type is "{}": only video extraction is supported for Slideshare'.format(
                    slideshow['type']),
                expected=True)

        doc = info['doc']
        bucket = info['jsplayer']['video_bucket']
        ext = info['jsplayer']['video_extension']
        # The media file is addressed relative to the bucket URL as
        # "<doc>-SD.<ext>".
        video_url = urllib.parse.urljoin(bucket, doc + '-SD.' + ext)

        # Prefer the dedicated description element; fall back to the
        # itemprop="description" paragraph without failing if neither exists.
        description = get_element_by_id(
            'slideshow-description-paragraph', webpage) or self._html_search_regex(
            r'(?s)<p[^>]+itemprop="description"[^>]*>(.+?)</p>', webpage,
            'description', fatal=False)

        return {
            '_type': 'video',
            'id': slideshow['id'],
            'title': slideshow['title'],
            'ext': ext,
            'url': video_url,
            # The thumbnail is optional metadata: a page without
            # 'pin_image_url' must not abort an otherwise valid extraction.
            'thumbnail': slideshow.get('pin_image_url'),
            'description': description.strip() if description else None,
        }