diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 768d45fc1..e5405c235 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.01.08** +- [ ] I've verified that I'm running youtube-dl version **2021.12.17** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.01.08 + [debug] youtube-dl version 2021.12.17 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 2bd90da57..33b01ce7f 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.01.08** +- [ ] I've verified that I'm running youtube-dl version **2021.12.17** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 272895b47..285610cc7 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.01.08** +- [ ] I've verified that I'm running youtube-dl version **2021.12.17** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 608fcfba4..af73525fb 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.01.08** +- [ ] I've verified that I'm running youtube-dl version **2021.12.17** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.01.08 + [debug] youtube-dl version 2021.12.17 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index d085ab1ef..42c878b83 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.01.08** +- [ ] I've verified that I'm running youtube-dl version **2021.12.17** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ac34196cb..90bd63c32 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,5 +1,5 @@ name: CI -on: [push] +on: [push, pull_request] jobs: tests: name: Tests @@ -7,7 +7,7 @@ jobs: strategy: fail-fast: true matrix: - os: [ubuntu-latest] + os: [ubuntu-18.04] # TODO: python 2.6 python-version: [2.7, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, pypy-2.7, pypy-3.6, pypy-3.7] python-impl: [cpython] @@ -26,11 +26,11 @@ jobs: ytdl-test-set: download run-tests-ext: bat # jython - - os: ubuntu-latest + - os: ubuntu-18.04 python-impl: jython ytdl-test-set: core run-tests-ext: sh - - os: ubuntu-latest + - os: ubuntu-18.04 python-impl: jython ytdl-test-set: download run-tests-ext: sh @@ -49,11 +49,18 @@ jobs: - name: Install Jython if: ${{ matrix.python-impl == 'jython' }} run: | - wget http://search.maven.org/remotecontent?filepath=org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar -O jython-installer.jar + wget https://repo1.maven.org/maven2/org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar -O jython-installer.jar java -jar jython-installer.jar -s -d "$HOME/jython" echo "$HOME/jython/bin" >> $GITHUB_PATH - name: Install nose + if: ${{ matrix.python-impl != 'jython' }} run: pip install nose + - name: Install nose (Jython) + if: ${{ matrix.python-impl == 'jython' }} + # Working around deprecation of support for non-SNI clients at PyPI CDN (see https://status.python.org/incidents/hzmjhqsdjqgb) + run: | + wget https://files.pythonhosted.org/packages/99/4f/13fb671119e65c4dce97c60e67d3fd9e6f7f809f2b307e2611f4701205cb/nose-1.3.7-py2-none-any.whl + pip install nose-1.3.7-py2-none-any.whl - name: Run tests continue-on-error: ${{ matrix.ytdl-test-set == 'download' || matrix.python-impl == 'jython' }} env: diff --git a/AUTHORS b/AUTHORS index b507cb8df..4a6d7dacd 100644 --- a/AUTHORS +++ b/AUTHORS @@ -246,3 +246,4 @@ Enes Solak Nathan Rossi Thomas van der Berg Luca Cherubin +Adrian Heine \ No newline at end of file diff --git a/ChangeLog b/ChangeLog index 3629c4fb8..658864282 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,397 @@ +version 2021.12.17 + +Core +* [postprocessor/ffmpeg] Show ffmpeg output on error (#22680, #29336) + +Extractors +* [youtube] Update signature function patterns (#30363, #30366) +* [peertube] Only call description endpoint if necessary (#29383) +* [periscope] Pass referer to HLS requests (#29419) +- [liveleak] Remove extractor (#17625, #24222, #29331) ++ [pornhub] Add support for pornhubthbh7ap3u.onion +* [pornhub] Detect geo restriction +* [pornhub] Dismiss tbr extracted from download URLs (#28927) +* [curiositystream:collection] Extend _VALID_URL (#26326, #29117) +* [youtube] Make get_video_info processing more robust (#29333) +* [youtube] Workaround for get_video_info request (#29333) +* [bilibili] Strip uploader name (#29202) +* [youtube] Update invidious instance list (#29281) +* [umg:de] Update GraphQL API URL (#29304) +* [nrk] Switch psapi URL to https (#29344) ++ [egghead] Add support for app.egghead.io (#28404, #29303) +* [appleconnect] Fix extraction (#29208) ++ [orf:tvthek] Add support for MPD formats (#28672, #29236) + + +version 2021.06.06 + +Extractors +* [facebook] Improve login required detection +* [youporn] Fix formats and view count extraction (#29216) +* [orf:tvthek] Fix thumbnails extraction (#29217) +* [formula1] Fix extraction (#29206) +* [ard] Relax URL regular expression and fix video ids (#22724, #29091) ++ [ustream] Detect https embeds (#29133) +* [ted] Prefer own formats over external sources (#29142) +* [twitch:clips] Improve extraction (#29149) ++ [twitch:clips] Add access token query to download URLs (#29136) +* [youtube] Fix get_video_info request (#29086, #29165) +* [vimeo] Fix vimeo pro embed extraction (#29126) +* [redbulltv] Fix embed data extraction (#28770) +* [shahid] Relax URL regular expression (#28772, #28930) + + +version 2021.05.16 + +Core +* [options] Fix thumbnail option group name (#29042) +* [YoutubeDL] Improve extract_info doc (#28946) + +Extractors ++ [playstuff] Add support for play.stuff.co.nz (#28901, #28931) +* [eroprofile] Fix extraction (#23200, #23626, #29008) ++ [vivo] Add support for vivo.st (#29009) ++ [generic] Add support for og:audio (#28311, #29015) +* [phoenix] Fix extraction (#29057) ++ [generic] Add support for sibnet embeds ++ [vk] Add support for sibnet embeds (#9500) ++ [generic] Add Referer header for direct videojs download URLs (#2879, + #20217, #29053) +* [orf:radio] Switch download URLs to HTTPS (#29012, #29046) +- [blinkx] Remove extractor (#28941) +* [medaltv] Relax URL regular expression (#28884) ++ [funimation] Add support for optional lang code in URLs (#28950) ++ [gdcvault] Add support for HTML5 videos +* [dispeak] Improve FLV extraction (#13513, #28970) +* [kaltura] Improve iframe extraction (#28969) +* [kaltura] Make embed code alternatives actually work +* [cda] Improve extraction (#28709, #28937) +* [twitter] Improve formats extraction from vmap URL (#28909) +* [xtube] Fix formats extraction (#28870) +* [svtplay] Improve extraction (#28507, #28876) +* [tv2dk] Fix extraction (#28888) + + +version 2021.04.26 + +Extractors ++ [xfileshare] Add support for wolfstream.tv (#28858) +* [francetvinfo] Improve video id extraction (#28792) +* [medaltv] Fix extraction (#28807) +* [tver] Redirect all downloads to Brightcove (#28849) +* [go] Improve video id extraction (#25207, #25216, #26058) +* [youtube] Fix lazy extractors (#28780) ++ [bbc] Extract description and timestamp from __INITIAL_DATA__ (#28774) +* [cbsnews] Fix extraction for python <3.6 (#23359) + + +version 2021.04.17 + +Core ++ [utils] Add support for experimental HTTP response status code + 308 Permanent Redirect (#27877, #28768) + +Extractors ++ [lbry] Add support for HLS videos (#27877, #28768) +* [youtube] Fix stretched ratio calculation +* [youtube] Improve stretch extraction (#28769) +* [youtube:tab] Improve grid extraction (#28725) ++ [youtube:tab] Detect series playlist on playlists page (#28723) ++ [youtube] Add more invidious instances (#28706) +* [pluralsight] Extend anti-throttling timeout (#28712) +* [youtube] Improve URL to extractor routing (#27572, #28335, #28742) ++ [maoritv] Add support for maoritelevision.com (#24552) ++ [youtube:tab] Pass innertube context and x-goog-visitor-id header along with + continuation requests (#28702) +* [mtv] Fix Viacom A/B Testing Video Player extraction (#28703) ++ [pornhub] Extract DASH and HLS formats from get_media end point (#28698) +* [cbssports] Fix extraction (#28682) +* [jamendo] Fix track extraction (#28686) +* [curiositystream] Fix format extraction (#26845, #28668) + + +version 2021.04.07 + +Core +* [extractor/common] Use compat_cookies_SimpleCookie for _get_cookies ++ [compat] Introduce compat_cookies_SimpleCookie +* [extractor/common] Improve JSON-LD author extraction +* [extractor/common] Fix _get_cookies on python 2 (#20673, #23256, #20326, + #28640) + +Extractors +* [youtube] Fix extraction of videos with restricted location (#28685) ++ [line] Add support for live.line.me (#17205, #28658) +* [vimeo] Improve extraction (#28591) +* [youku] Update ccode (#17852, #28447, #28460, #28648) +* [youtube] Prefer direct entry metadata over entry metadata from playlist + (#28619, #28636) +* [screencastomatic] Fix extraction (#11976, #24489) ++ [palcomp3] Add support for palcomp3.com (#13120) ++ [arnes] Add support for video.arnes.si (#28483) ++ [youtube:tab] Add support for hashtags (#28308) + + +version 2021.04.01 + +Extractors +* [youtube] Setup CONSENT cookie when needed (#28604) +* [vimeo] Fix password protected review extraction (#27591) +* [youtube] Improve age-restricted video extraction (#28578) + + +version 2021.03.31 + +Extractors +* [vlive] Fix inkey request (#28589) +* [francetvinfo] Improve video id extraction (#28584) ++ [instagram] Extract duration (#28469) +* [instagram] Improve title extraction (#28469) ++ [sbs] Add support for ondemand watch URLs (#28566) +* [youtube] Fix video's channel extraction (#28562) +* [picarto] Fix live stream extraction (#28532) +* [vimeo] Fix unlisted video extraction (#28414) +* [youtube:tab] Fix playlist/community continuation items extraction (#28266) +* [ard] Improve clip id extraction (#22724, #28528) + + +version 2021.03.25 + +Extractors ++ [zoom] Add support for zoom.us (#16597, #27002, #28531) +* [bbc] Fix BBC IPlayer Episodes/Group extraction (#28360) +* [youtube] Fix default value for youtube_include_dash_manifest (#28523) +* [zingmp3] Fix extraction (#11589, #16409, #16968, #27205) ++ [vgtv] Add support for new tv.aftonbladet.se URL schema (#28514) ++ [tiktok] Detect private videos (#28453) +* [vimeo:album] Fix extraction for albums with number of videos multiple + to page size (#28486) +* [vvvvid] Fix kenc format extraction (#28473) +* [mlb] Fix video extraction (#21241) +* [svtplay] Improve extraction (#28448) +* [applepodcasts] Fix extraction (#28445) +* [rtve] Improve extraction + + Extract all formats + * Fix RTVE Infantil extraction (#24851) + + Extract is_live and series + + +version 2021.03.14 + +Core ++ Introduce release_timestamp meta field (#28386) + +Extractors ++ [southpark] Add support for southparkstudios.com (#28413) +* [southpark] Fix extraction (#26763, #28413) +* [sportdeutschland] Fix extraction (#21856, #28425) +* [pinterest] Reduce the number of HLS format requests +* [peertube] Improve thumbnail extraction (#28419) +* [tver] Improve title extraction (#28418) +* [fujitv] Fix HLS formats extension (#28416) +* [shahid] Fix format extraction (#28383) ++ [lbry] Add support for channel filters (#28385) ++ [bandcamp] Extract release timestamp ++ [lbry] Extract release timestamp (#28386) +* [pornhub] Detect flagged videos ++ [pornhub] Extract formats from get_media end point (#28395) +* [bilibili] Fix video info extraction (#28341) ++ [cbs] Add support for Paramount+ (#28342) ++ [trovo] Add Origin header to VOD formats (#28346) +* [voxmedia] Fix volume embed extraction (#28338) + + +version 2021.03.03 + +Extractors +* [youtube:tab] Switch continuation to browse API (#28289, #28327) +* [9c9media] Fix extraction for videos with multiple ContentPackages (#28309) ++ [bbc] Add support for BBC Reel videos (#21870, #23660, #28268) + + +version 2021.03.02 + +Extractors +* [zdf] Rework extractors (#11606, #13473, #17354, #21185, #26711, #27068, + #27930, #28198, #28199, #28274) + * Generalize cross-extractor video ids for zdf based extractors + * Improve extraction + * Fix 3sat and phoenix +* [stretchinternet] Fix extraction (#28297) +* [urplay] Fix episode data extraction (#28292) ++ [bandaichannel] Add support for b-ch.com (#21404) +* [srgssr] Improve extraction (#14717, #14725, #27231, #28238) + + Extract subtitle + * Fix extraction for new videos + * Update srf download domains +* [vvvvid] Reduce season request payload size ++ [vvvvid] Extract series sublists playlist title (#27601, #27618) ++ [dplay] Extract Ad-Free uplynk URLs (#28160) ++ [wat] Detect DRM protected videos (#27958) +* [tf1] Improve extraction (#27980, #28040) +* [tmz] Fix and improve extraction (#24603, #24687, 28211) ++ [gedidigital] Add support for Gedi group sites (#7347, #26946) +* [youtube] Fix get_video_info request + + +version 2021.02.22 + +Core ++ [postprocessor/embedthumbnail] Recognize atomicparsley binary in lowercase + (#28112) + +Extractors +* [apa] Fix and improve extraction (#27750) ++ [youporn] Extract duration (#28019) ++ [peertube] Add support for canard.tube (#28190) +* [youtube] Fixup m4a_dash formats (#28165) ++ [samplefocus] Add support for samplefocus.com (#27763) ++ [vimeo] Add support for unlisted video source format extraction +* [viki] Improve extraction (#26522, #28203) + * Extract uploader URL and episode number + * Report login required error + + Extract 480p formats + * Fix API v4 calls +* [ninegag] Unescape title (#28201) +* [youtube] Improve URL regular expression (#28193) ++ [youtube] Add support for redirect.invidious.io (#28193) ++ [dplay] Add support for de.hgtv.com (#28182) ++ [dplay] Add support for discoveryplus.com (#24698) ++ [simplecast] Add support for simplecast.com (#24107) +* [youtube] Fix uploader extraction in flat playlist mode (#28045) +* [yandexmusic:playlist] Request missing tracks in chunks (#27355, #28184) ++ [storyfire] Add support for storyfire.com (#25628, #26349) ++ [zhihu] Add support for zhihu.com (#28177) +* [youtube] Fix controversial videos when authenticated with cookies (#28174) +* [ccma] Fix timestamp parsing in python 2 ++ [videopress] Add support for video.wordpress.com +* [kakao] Improve info extraction and detect geo restriction (#26577) +* [xboxclips] Fix extraction (#27151) +* [ard] Improve formats extraction (#28155) ++ [canvas] Add support for dagelijksekost.een.be (#28119) + + +version 2021.02.10 + +Extractors +* [youtube:tab] Improve grid continuation extraction (#28130) +* [ign] Fix extraction (#24771) ++ [xhamster] Extract format filesize ++ [xhamster] Extract formats from xplayer settings (#28114) ++ [youtube] Add support phone/tablet JS player (#26424) +* [archiveorg] Fix and improve extraction (#21330, #23586, #25277, #26780, + #27109, #27236, #28063) ++ [cda] Detect geo restricted videos (#28106) +* [urplay] Fix extraction (#28073, #28074) +* [youtube] Fix release date extraction (#28094) ++ [youtube] Extract abr and vbr (#28100) +* [youtube] Skip OTF formats (#28070) + + +version 2021.02.04.1 + +Extractors +* [youtube] Prefer DASH formats (#28070) +* [azmedien] Fix extraction (#28064) + + +version 2021.02.04 + +Extractors +* [pornhub] Implement lazy playlist extraction +* [svtplay] Fix video id extraction (#28058) ++ [pornhub] Add support for authentication (#18797, #21416, #24294) +* [pornhub:user] Improve paging ++ [pornhub:user] Add support for URLs unavailable via /videos page (#27853) ++ [bravotv] Add support for oxygen.com (#13357, #22500) ++ [youtube] Pass embed URL to get_video_info request +* [ccma] Improve metadata extraction (#27994) + + Extract age limit, alt title, categories, series and episode number + * Fix timestamp multiple subtitles extraction +* [egghead] Update API domain (#28038) +- [vidzi] Remove extractor (#12629) +* [vidio] Improve metadata extraction +* [youtube] Improve subtitles extraction +* [youtube] Fix chapter extraction fallback +* [youtube] Rewrite extractor + * Improve format sorting + * Remove unused code + * Fix series metadata extraction + * Fix trailer video extraction + * Improve error reporting + + Extract video location ++ [vvvvid] Add support for youtube embeds (#27825) +* [googledrive] Report download page errors (#28005) +* [vlive] Fix error message decoding for python 2 (#28004) +* [youtube] Improve DASH formats file size extraction +* [cda] Improve birth validation detection (#14022, #27929) ++ [awaan] Extract uploader id (#27963) ++ [medialaan] Add support DPG Media MyChannels based websites (#14871, #15597, + #16106, #16489) +* [abcnews] Fix extraction (#12394, #27920) +* [AMP] Fix upload date and timestamp extraction (#27970) +* [tv4] Relax URL regular expression (#27964) ++ [tv2] Add support for mtvuutiset.fi (#27744) +* [adn] Improve login warning reporting +* [zype] Fix uplynk id extraction (#27956) ++ [adn] Add support for authentication (#17091, #27841, #27937) + + +version 2021.01.24.1 + +Core +* Introduce --output-na-placeholder (#27896) + +Extractors +* [franceculture] Make thumbnail optional (#18807) +* [franceculture] Fix extraction (#27891, #27903) +* [njpwworld] Fix extraction (#27890) +* [comedycentral] Fix extraction (#27905) +* [wat] Fix format extraction (#27901) ++ [americastestkitchen:season] Add support for seasons (#27861) ++ [trovo] Add support for trovo.live (#26125) ++ [aol] Add support for yahoo videos (#26650) +* [yahoo] Fix single video extraction +* [lbry] Unescape lbry URI (#27872) +* [9gag] Fix and improve extraction (#23022) +* [americastestkitchen] Improve metadata extraction for ATK episodes (#27860) +* [aljazeera] Fix extraction (#20911, #27779) ++ [minds] Add support for minds.com (#17934) +* [ard] Fix title and description extraction (#27761) ++ [spotify] Add support for Spotify Podcasts (#27443) + + +version 2021.01.16 + +Core +* [YoutubeDL] Protect from infinite recursion due to recursively nested + playlists (#27833) +* [YoutubeDL] Ignore failure to create existing directory (#27811) +* [YoutubeDL] Raise syntax error for format selection expressions with multiple + + operators (#27803) + +Extractors ++ [animeondemand] Add support for lazy playlist extraction (#27829) +* [youporn] Restrict fallback download URL (#27822) +* [youporn] Improve height and tbr extraction (#20425, #23659) +* [youporn] Fix extraction (#27822) ++ [twitter] Add support for unified cards (#27826) ++ [twitch] Add Authorization header with OAuth token for GraphQL requests + (#27790) +* [mixcloud:playlist:base] Extract video id in flat playlist mode (#27787) +* [cspan] Improve info extraction (#27791) +* [adn] Improve info extraction +* [adn] Fix extraction (#26963, #27732) +* [youtube:search] Extract from all sections (#27604) +* [youtube:search] fix viewcount and try to extract all video sections (#27604) +* [twitch] Improve login error extraction +* [twitch] Fix authentication (#27743) +* [3qsdn] Improve extraction (#21058) +* [peertube] Extract formats from streamingPlaylists (#26002, #27586, #27728) +* [khanacademy] Fix extraction (#2887, #26803) +* [spike] Update Paramount Network feed URL (#27715) + + version 2021.01.08 Core diff --git a/README.md b/README.md index 85fed6d3a..2841ed68f 100644 --- a/README.md +++ b/README.md @@ -52,394 +52,431 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo youtube-dl [OPTIONS] URL [URL...] # OPTIONS - -h, --help Print this help text and exit - --version Print program version and exit - -U, --update Update this program to latest version. Make - sure that you have sufficient permissions - (run with sudo if needed) - -i, --ignore-errors Continue on download errors, for example to - skip unavailable videos in a playlist - --abort-on-error Abort downloading of further videos (in the - playlist or the command line) if an error - occurs - --dump-user-agent Display the current browser identification - --list-extractors List all supported extractors - --extractor-descriptions Output descriptions of all supported - extractors - --force-generic-extractor Force extraction to use the generic - extractor - --default-search PREFIX Use this prefix for unqualified URLs. For - example "gvsearch2:" downloads two videos - from google videos for youtube-dl "large - apple". Use the value "auto" to let - youtube-dl guess ("auto_warning" to emit a - warning when guessing). "error" just throws - an error. The default value "fixup_error" - repairs broken URLs, but emits an error if - this is not possible instead of searching. - --ignore-config Do not read configuration files. When given - in the global configuration file - /etc/youtube-dl.conf: Do not read the user - configuration in ~/.config/youtube- - dl/config (%APPDATA%/youtube-dl/config.txt - on Windows) - --config-location PATH Location of the configuration file; either - the path to the config or its containing - directory. - --flat-playlist Do not extract the videos of a playlist, - only list them. - --mark-watched Mark videos watched (YouTube only) - --no-mark-watched Do not mark videos watched (YouTube only) - --no-color Do not emit color codes in output + -h, --help Print this help text and exit + --version Print program version and exit + -U, --update Update this program to latest version. + Make sure that you have sufficient + permissions (run with sudo if needed) + -i, --ignore-errors Continue on download errors, for + example to skip unavailable videos in a + playlist + --abort-on-error Abort downloading of further videos (in + the playlist or the command line) if an + error occurs + --dump-user-agent Display the current browser + identification + --list-extractors List all supported extractors + --extractor-descriptions Output descriptions of all supported + extractors + --force-generic-extractor Force extraction to use the generic + extractor + --default-search PREFIX Use this prefix for unqualified URLs. + For example "gvsearch2:" downloads two + videos from google videos for youtube- + dl "large apple". Use the value "auto" + to let youtube-dl guess ("auto_warning" + to emit a warning when guessing). + "error" just throws an error. The + default value "fixup_error" repairs + broken URLs, but emits an error if this + is not possible instead of searching. + --ignore-config Do not read configuration files. When + given in the global configuration file + /etc/youtube-dl.conf: Do not read the + user configuration in + ~/.config/youtube-dl/config + (%APPDATA%/youtube-dl/config.txt on + Windows) + --config-location PATH Location of the configuration file; + either the path to the config or its + containing directory. + --flat-playlist Do not extract the videos of a + playlist, only list them. + --mark-watched Mark videos watched (YouTube only) + --no-mark-watched Do not mark videos watched (YouTube + only) + --no-color Do not emit color codes in output ## Network Options: - --proxy URL Use the specified HTTP/HTTPS/SOCKS proxy. - To enable SOCKS proxy, specify a proper - scheme. For example - socks5://127.0.0.1:1080/. Pass in an empty - string (--proxy "") for direct connection - --socket-timeout SECONDS Time to wait before giving up, in seconds - --source-address IP Client-side IP address to bind to - -4, --force-ipv4 Make all connections via IPv4 - -6, --force-ipv6 Make all connections via IPv6 + --proxy URL Use the specified HTTP/HTTPS/SOCKS + proxy. To enable SOCKS proxy, specify a + proper scheme. For example + socks5://127.0.0.1:1080/. Pass in an + empty string (--proxy "") for direct + connection + --socket-timeout SECONDS Time to wait before giving up, in + seconds + --source-address IP Client-side IP address to bind to + -4, --force-ipv4 Make all connections via IPv4 + -6, --force-ipv6 Make all connections via IPv6 ## Geo Restriction: - --geo-verification-proxy URL Use this proxy to verify the IP address for - some geo-restricted sites. The default - proxy specified by --proxy (or none, if the - option is not present) is used for the - actual downloading. - --geo-bypass Bypass geographic restriction via faking - X-Forwarded-For HTTP header - --no-geo-bypass Do not bypass geographic restriction via - faking X-Forwarded-For HTTP header - --geo-bypass-country CODE Force bypass geographic restriction with - explicitly provided two-letter ISO 3166-2 - country code - --geo-bypass-ip-block IP_BLOCK Force bypass geographic restriction with - explicitly provided IP block in CIDR - notation + --geo-verification-proxy URL Use this proxy to verify the IP address + for some geo-restricted sites. The + default proxy specified by --proxy (or + none, if the option is not present) is + used for the actual downloading. + --geo-bypass Bypass geographic restriction via + faking X-Forwarded-For HTTP header + --no-geo-bypass Do not bypass geographic restriction + via faking X-Forwarded-For HTTP header + --geo-bypass-country CODE Force bypass geographic restriction + with explicitly provided two-letter ISO + 3166-2 country code + --geo-bypass-ip-block IP_BLOCK Force bypass geographic restriction + with explicitly provided IP block in + CIDR notation ## Video Selection: - --playlist-start NUMBER Playlist video to start at (default is 1) - --playlist-end NUMBER Playlist video to end at (default is last) - --playlist-items ITEM_SPEC Playlist video items to download. Specify - indices of the videos in the playlist - separated by commas like: "--playlist-items - 1,2,5,8" if you want to download videos - indexed 1, 2, 5, 8 in the playlist. You can - specify range: "--playlist-items - 1-3,7,10-13", it will download the videos - at index 1, 2, 3, 7, 10, 11, 12 and 13. - --match-title REGEX Download only matching titles (regex or - caseless sub-string) - --reject-title REGEX Skip download for matching titles (regex or - caseless sub-string) - --max-downloads NUMBER Abort after downloading NUMBER files - --min-filesize SIZE Do not download any videos smaller than - SIZE (e.g. 50k or 44.6m) - --max-filesize SIZE Do not download any videos larger than SIZE - (e.g. 50k or 44.6m) - --date DATE Download only videos uploaded in this date - --datebefore DATE Download only videos uploaded on or before - this date (i.e. inclusive) - --dateafter DATE Download only videos uploaded on or after - this date (i.e. inclusive) - --min-views COUNT Do not download any videos with less than - COUNT views - --max-views COUNT Do not download any videos with more than - COUNT views - --match-filter FILTER Generic video filter. Specify any key (see - the "OUTPUT TEMPLATE" for a list of - available keys) to match if the key is - present, !key to check if the key is not - present, key > NUMBER (like "comment_count - > 12", also works with >=, <, <=, !=, =) to - compare against a number, key = 'LITERAL' - (like "uploader = 'Mike Smith'", also works - with !=) to match against a string literal - and & to require multiple matches. Values - which are not known are excluded unless you - put a question mark (?) after the operator. - For example, to only match videos that have - been liked more than 100 times and disliked - less than 50 times (or the dislike - functionality is not available at the given - service), but who also have a description, - use --match-filter "like_count > 100 & - dislike_count NUMBER (like + "comment_count > 12", also works with + >=, <, <=, !=, =) to compare against a + number, key = 'LITERAL' (like "uploader + = 'Mike Smith'", also works with !=) to + match against a string literal and & to + require multiple matches. Values which + are not known are excluded unless you + put a question mark (?) after the + operator. For example, to only match + videos that have been liked more than + 100 times and disliked less than 50 + times (or the dislike functionality is + not available at the given service), + but who also have a description, use + --match-filter "like_count > 100 & + dislike_count .+?) - (?P.+)" - --xattrs Write metadata to the video file's xattrs - (using dublin core and xdg standards) - --fixup POLICY Automatically correct known faults of the - file. One of never (do nothing), warn (only - emit a warning), detect_or_warn (the - default; fix file if we can, warn - otherwise) - --prefer-avconv Prefer avconv over ffmpeg for running the - postprocessors - --prefer-ffmpeg Prefer ffmpeg over avconv for running the - postprocessors (default) - --ffmpeg-location PATH Location of the ffmpeg/avconv binary; - either the path to the binary or its - containing directory. - --exec CMD Execute a command on the file after - downloading and post-processing, similar to - find's -exec syntax. Example: --exec 'adb - push {} /sdcard/Music/ && rm {}' - --convert-subs FORMAT Convert the subtitles to other format - (currently supported: srt|ass|vtt|lrc) + -x, --extract-audio Convert video files to audio-only files + (requires ffmpeg/avconv and + ffprobe/avprobe) + --audio-format FORMAT Specify audio format: "best", "aac", + "flac", "mp3", "m4a", "opus", "vorbis", + or "wav"; "best" by default; No effect + without -x + --audio-quality QUALITY Specify ffmpeg/avconv audio quality, + insert a value between 0 (better) and 9 + (worse) for VBR or a specific bitrate + like 128K (default 5) + --recode-video FORMAT Encode the video to another format if + necessary (currently supported: + mp4|flv|ogg|webm|mkv|avi) + --postprocessor-args ARGS Give these arguments to the + postprocessor + -k, --keep-video Keep the video file on disk after the + post-processing; the video is erased by + default + --no-post-overwrites Do not overwrite post-processed files; + the post-processed files are + overwritten by default + --embed-subs Embed subtitles in the video (only for + mp4, webm and mkv videos) + --embed-thumbnail Embed thumbnail in the audio as cover + art + --add-metadata Write metadata to the video file + --metadata-from-title FORMAT Parse additional metadata like song + title / artist from the video title. + The format syntax is the same as + --output. Regular expression with named + capture groups may also be used. The + parsed parameters replace existing + values. Example: --metadata-from-title + "%(artist)s - %(title)s" matches a + title like "Coldplay - Paradise". + Example (regex): --metadata-from-title + "(?P<artist>.+?) - (?P<title>.+)" + --xattrs Write metadata to the video file's + xattrs (using dublin core and xdg + standards) + --fixup POLICY Automatically correct known faults of + the file. One of never (do nothing), + warn (only emit a warning), + detect_or_warn (the default; fix file + if we can, warn otherwise) + --prefer-avconv Prefer avconv over ffmpeg for running + the postprocessors + --prefer-ffmpeg Prefer ffmpeg over avconv for running + the postprocessors (default) + --ffmpeg-location PATH Location of the ffmpeg/avconv binary; + either the path to the binary or its + containing directory. + --exec CMD Execute a command on the file after + downloading and post-processing, + similar to find's -exec syntax. + Example: --exec 'adb push {} + /sdcard/Music/ && rm {}' + --convert-subs FORMAT Convert the subtitles to other format + (currently supported: srt|ass|vtt|lrc) # CONFIGURATION @@ -583,7 +620,7 @@ Available for the media that is a track or a part of a music album: - `disc_number` (numeric): Number of the disc or other physical medium the track belongs to - `release_year` (numeric): Year (YYYY) when the album was released -Each aforementioned sequence when referenced in an output template will be replaced by the actual value corresponding to the sequence name. Note that some of the sequences are not guaranteed to be present since they depend on the metadata obtained by a particular extractor. Such sequences will be replaced with `NA`. +Each aforementioned sequence when referenced in an output template will be replaced by the actual value corresponding to the sequence name. Note that some of the sequences are not guaranteed to be present since they depend on the metadata obtained by a particular extractor. Such sequences will be replaced with placeholder value provided with `--output-na-placeholder` (`NA` by default). For example for `-o %(title)s-%(id)s.%(ext)s` and an mp4 video with title `youtube-dl test video` and id `BaW_jenozKcj`, this will result in a `youtube-dl test video-BaW_jenozKcj.mp4` file created in the current directory. @@ -856,7 +893,7 @@ Since June 2012 ([#342](https://github.com/ytdl-org/youtube-dl/issues/342)) yout ### The exe throws an error due to missing `MSVCR100.dll` -To run the exe you need to install first the [Microsoft Visual C++ 2010 Redistributable Package (x86)](https://www.microsoft.com/en-US/download/details.aspx?id=5555). +To run the exe you need to install first the [Microsoft Visual C++ 2010 Service Pack 1 Redistributable Package (x86)](https://download.microsoft.com/download/1/6/5/165255E7-1014-4D0A-B094-B6A430A6BFFC/vcredist_x86.exe). ### On Windows, how should I set up ffmpeg and youtube-dl? Where should I put the exe files? diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 3a49043fa..ae2a6b8b0 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1,9 +1,9 @@ # Supported sites - **1tv**: Первый канал - - **1up.com** - **20min** - **220.ro** - **23video** + - **247sports** - **24video** - **3qsdn**: 3Q SDN - **3sat** @@ -46,10 +46,11 @@ - **Amara** - **AMCNetworks** - **AmericasTestKitchen** + - **AmericasTestKitchenSeason** - **anderetijden**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **AnimeOnDemand** - **Anvato** - - **aol.com** + - **aol.com**: Yahoo screen and movies - **APA** - **Aparat** - **AppleConnect** @@ -82,6 +83,7 @@ - **awaan:video** - **AZMedien**: AZ Medien videos - **BaiduVideo**: 百度视频 + - **bandaichannel** - **Bandcamp** - **Bandcamp:album** - **Bandcamp:weekly** @@ -89,7 +91,8 @@ - **bbc**: BBC - **bbc.co.uk**: BBC iPlayer - **bbc.co.uk:article**: BBC articles - - **bbc.co.uk:iplayer:playlist** + - **bbc.co.uk:iplayer:episodes** + - **bbc.co.uk:iplayer:group** - **bbc.co.uk:playlist** - **BBVTV** - **Beatport** @@ -116,7 +119,6 @@ - **BitChuteChannel** - **BleacherReport** - **BleacherReportCMS** - - **blinkx** - **Bloomberg** - **BokeCC** - **BongaCams** @@ -158,7 +160,8 @@ - **cbsnews**: CBS News - **cbsnews:embed** - **cbsnews:livevideo**: CBS News Live Videos - - **CBSSports** + - **cbssports** + - **cbssports:embed** - **CCMA** - **CCTV**: 央视网 - **CDA** @@ -192,8 +195,6 @@ - **CNNArticle** - **CNNBlogs** - **ComedyCentral** - - **ComedyCentralFullEpisodes** - - **ComedyCentralShortname** - **ComedyCentralTV** - **CondeNast**: Condé Nast media group: Allure, Architectural Digest, Ars Technica, Bon Appétit, Brides, Condé Nast, Condé Nast Traveler, Details, Epicurious, GQ, Glamour, Golf Digest, SELF, Teen Vogue, The New Yorker, Vanity Fair, Vogue, W Magazine, WIRED - **CONtv** @@ -214,6 +215,7 @@ - **curiositystream** - **curiositystream:collection** - **CWTV** + - **DagelijkseKost**: dagelijksekost.een.be - **DailyMail** - **dailymotion** - **dailymotion:playlist** @@ -235,6 +237,7 @@ - **DiscoveryGo** - **DiscoveryGoPlaylist** - **DiscoveryNetworksDe** + - **DiscoveryPlus** - **DiscoveryVR** - **Disney** - **dlive:stream** @@ -330,6 +333,7 @@ - **Gaskrank** - **Gazeta** - **GDCVault** + - **GediDigital** - **generic**: Generic downloader that works on some sites - **Gfycat** - **GiantBomb** @@ -355,6 +359,7 @@ - **HentaiStigma** - **hetklokhuis** - **hgtv.com:show** + - **HGTVDe** - **HiDive** - **HistoricFilms** - **history:player** @@ -377,6 +382,8 @@ - **HungamaSong** - **Hypem** - **ign.com** + - **IGNArticle** + - **IGNVideo** - **IHeartRadio** - **iheartradio:podcast** - **imdb**: Internet Movie Database trailers @@ -418,7 +425,8 @@ - **Katsomo** - **KeezMovies** - **Ketnet** - - **KhanAcademy** + - **khanacademy** + - **khanacademy:unit** - **KickStarter** - **KinjaEmbed** - **KinoPoisk** @@ -456,14 +464,14 @@ - **limelight** - **limelight:channel** - **limelight:channel_list** + - **LineLive** + - **LineLiveChannel** - **LineTV** - **linkedin:learning** - **linkedin:learning:course** - **LinuxAcademy** - **LiTV** - **LiveJournal** - - **LiveLeak** - - **LiveLeakEmbed** - **livestream** - **livestream:original** - **LnkGo** @@ -481,6 +489,7 @@ - **mangomolo:live** - **mangomolo:video** - **ManyVids** + - **MaoriTV** - **Markiza** - **MarkizaPage** - **massengeschmack.tv** @@ -505,6 +514,9 @@ - **Mgoon** - **MGTV**: 芒果TV - **MiaoPai** + - **minds** + - **minds:channel** + - **minds:group** - **MinistryGrid** - **Minoto** - **miomio.tv** @@ -513,6 +525,7 @@ - **mixcloud:playlist** - **mixcloud:user** - **MLB** + - **MLBVideo** - **Mnet** - **MNetTV** - **MoeVideo**: LetitBit video services: moevideo.net, playreplay.net and videochart.net @@ -534,6 +547,7 @@ - **mtv:video** - **mtvjapan** - **mtvservices:embedded** + - **MTVUutisetArticle** - **MuenchenTV**: münchen.tv - **mva**: Microsoft Virtual Academy videos - **mva:course**: Microsoft Virtual Academy courses @@ -667,12 +681,14 @@ - **OutsideTV** - **PacktPub** - **PacktPubCourse** + - **PalcoMP3:artist** + - **PalcoMP3:song** + - **PalcoMP3:video** - **pandora.tv**: 판도라TV - **ParamountNetwork** - **parliamentlive.tv**: UK parliament videos - **Patreon** - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! (WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC) - - **pcmag** - **PearVideo** - **PeerTube** - **People** @@ -694,6 +710,7 @@ - **play.fm** - **player.sky.it** - **PlayPlusTV** + - **PlayStuff** - **PlaysTV** - **Playtvak**: Playtvak.cz, iDNES.cz and Lidovky.cz - **Playvid** @@ -799,6 +816,7 @@ - **safari:course**: safaribooksonline.com online courses - **SAKTV** - **SaltTV** + - **SampleFocus** - **Sapo**: SAPO Vídeos - **savefrom.net** - **SBS**: sbs.com.au @@ -821,6 +839,9 @@ - **ShahidShow** - **Shared**: shared.sx - **ShowRoomLive** + - **simplecast** + - **simplecast:episode** + - **simplecast:podcast** - **Sina** - **sky.it** - **sky:news** @@ -858,6 +879,8 @@ - **Sport5** - **SportBox** - **SportDeutschland** + - **spotify** + - **spotify:show** - **Spreaker** - **SpreakerPage** - **SpreakerShow** @@ -871,6 +894,9 @@ - **Steam** - **Stitcher** - **StitcherShow** + - **StoryFire** + - **StoryFireSeries** + - **StoryFireUser** - **Streamable** - **streamcloud.eu** - **StreamCZ** @@ -939,12 +965,13 @@ - **TNAFlixNetworkEmbed** - **toggle** - **ToonGoggles** - - **Tosh**: Tosh.0 - **tou.tv** - **Toypics**: Toypics video - **ToypicsUser**: Toypics user profile - **TrailerAddict** (Currently broken) - **Trilulilu** + - **Trovo** + - **TrovoVod** - **TruNews** - **TruTV** - **Tube8** @@ -1038,6 +1065,7 @@ - **Vidbit** - **Viddler** - **Videa** + - **video.arnes.si**: Arnes Video - **video.google:search**: Google Video search - **video.sky.it** - **video.sky.it:live** @@ -1052,7 +1080,6 @@ - **vidme** - **vidme:user** - **vidme:user:likes** - - **Vidzi** - **vier**: vier.be and vijf.be - **vier:videos** - **viewlift** @@ -1097,6 +1124,7 @@ - **vrv** - **vrv:series** - **VShare** + - **VTM** - **VTXTV** - **vube**: Vube.com - **VuClip** @@ -1132,7 +1160,7 @@ - **WWE** - **XBef** - **XboxClips** - - **XFileShare**: XFileShare based sites: Aparat, ClipWatching, GoUnlimited, GoVid, HolaVid, Streamty, TheVideoBee, Uqload, VidBom, vidlo, VidLocker, VidShare, VUp, XVideoSharing + - **XFileShare**: XFileShare based sites: Aparat, ClipWatching, GoUnlimited, GoVid, HolaVid, Streamty, TheVideoBee, Uqload, VidBom, vidlo, VidLocker, VidShare, VUp, WolfStream, XVideoSharing - **XHamster** - **XHamsterEmbed** - **XHamsterUser** @@ -1191,5 +1219,8 @@ - **ZattooLive** - **ZDF** - **ZDFChannel** + - **Zhihu** - **zingmp3**: mp3.zing.vn + - **zingmp3:album** + - **zoom** - **Zype** diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 62f916d11..a35effe0e 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -464,6 +464,7 @@ class TestFormatSelection(unittest.TestCase): assert_syntax_error('+bestaudio') assert_syntax_error('bestvideo+') assert_syntax_error('/') + assert_syntax_error('bestvideo+bestvideo+bestaudio') def test_format_filtering(self): formats = [ @@ -632,13 +633,20 @@ class TestYoutubeDL(unittest.TestCase): 'title2': '%PATH%', } - def fname(templ): - ydl = YoutubeDL({'outtmpl': templ}) + def fname(templ, na_placeholder='NA'): + params = {'outtmpl': templ} + if na_placeholder != 'NA': + params['outtmpl_na_placeholder'] = na_placeholder + ydl = YoutubeDL(params) return ydl.prepare_filename(info) self.assertEqual(fname('%(id)s.%(ext)s'), '1234.mp4') self.assertEqual(fname('%(id)s-%(width)s.%(ext)s'), '1234-NA.mp4') - # Replace missing fields with 'NA' - self.assertEqual(fname('%(uploader_date)s-%(id)s.%(ext)s'), 'NA-1234.mp4') + NA_TEST_OUTTMPL = '%(uploader_date)s-%(width)d-%(id)s.%(ext)s' + # Replace missing fields with 'NA' by default + self.assertEqual(fname(NA_TEST_OUTTMPL), 'NA-NA-1234.mp4') + # Or by provided placeholder + self.assertEqual(fname(NA_TEST_OUTTMPL, na_placeholder='none'), 'none-none-1234.mp4') + self.assertEqual(fname(NA_TEST_OUTTMPL, na_placeholder=''), '--1234.mp4') self.assertEqual(fname('%(height)d.%(ext)s'), '1080.mp4') self.assertEqual(fname('%(height)6d.%(ext)s'), ' 1080.mp4') self.assertEqual(fname('%(height)-6d.%(ext)s'), '1080 .mp4') diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 0e1328ede..26df356b4 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -70,15 +70,6 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) - def test_youtube_extract(self): - assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id) - assertExtractId('http://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') - assertExtractId('https://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') - assertExtractId('https://www.youtube.com/watch?feature=player_embedded&v=BaW_jenozKc', 'BaW_jenozKc') - assertExtractId('https://www.youtube.com/watch_popup?v=BaW_jenozKc', 'BaW_jenozKc') - assertExtractId('http://www.youtube.com/watch?v=BaW_jenozKcsharePLED17F32AD9753930', 'BaW_jenozKc') - assertExtractId('BaW_jenozKc', 'BaW_jenozKc') - def test_facebook_matching(self): self.assertTrue(FacebookIE.suitable('https://www.facebook.com/Shiniknoh#!/photo.php?v=10153317450565268')) self.assertTrue(FacebookIE.suitable('https://www.facebook.com/cindyweather?fref=ts#!/photo.php?v=10152183998945793')) diff --git a/test/test_execution.py b/test/test_execution.py index 11661bb68..32948d93e 100644 --- a/test/test_execution.py +++ b/test/test_execution.py @@ -39,6 +39,16 @@ class TestExecution(unittest.TestCase): _, stderr = p.communicate() self.assertFalse(stderr) + def test_lazy_extractors(self): + try: + subprocess.check_call([sys.executable, 'devscripts/make_lazy_extractors.py', 'youtube_dl/extractor/lazy_extractors.py'], cwd=rootDir, stdout=_DEV_NULL) + subprocess.check_call([sys.executable, 'test/test_all_urls.py'], cwd=rootDir, stdout=_DEV_NULL) + finally: + try: + os.remove('youtube_dl/extractor/lazy_extractors.py') + except (IOError, OSError): + pass + if __name__ == '__main__': unittest.main() diff --git a/test/test_youtube_chapters.py b/test/test_youtube_chapters.py deleted file mode 100644 index e69c57377..000000000 --- a/test/test_youtube_chapters.py +++ /dev/null @@ -1,275 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 -from __future__ import unicode_literals - -# Allow direct execution -import os -import sys -import unittest -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from test.helper import expect_value -from youtube_dl.extractor import YoutubeIE - - -class TestYoutubeChapters(unittest.TestCase): - - _TEST_CASES = [ - ( - # https://www.youtube.com/watch?v=A22oy8dFjqc - # pattern: 00:00 - <title> - '''This is the absolute ULTIMATE experience of Queen's set at LIVE AID, this is the best video mixed to the absolutely superior stereo radio broadcast. This vastly superior audio mix takes a huge dump on all of the official mixes. Best viewed in 1080p. ENJOY! ***MAKE SURE TO READ THE DESCRIPTION***<br /><a href="#" onclick="yt.www.watch.player.seekTo(00*60+36);return false;">00:36</a> - Bohemian Rhapsody<br /><a href="#" onclick="yt.www.watch.player.seekTo(02*60+42);return false;">02:42</a> - Radio Ga Ga<br /><a href="#" onclick="yt.www.watch.player.seekTo(06*60+53);return false;">06:53</a> - Ay Oh!<br /><a href="#" onclick="yt.www.watch.player.seekTo(07*60+34);return false;">07:34</a> - Hammer To Fall<br /><a href="#" onclick="yt.www.watch.player.seekTo(12*60+08);return false;">12:08</a> - Crazy Little Thing Called Love<br /><a href="#" onclick="yt.www.watch.player.seekTo(16*60+03);return false;">16:03</a> - We Will Rock You<br /><a href="#" onclick="yt.www.watch.player.seekTo(17*60+18);return false;">17:18</a> - We Are The Champions<br /><a href="#" onclick="yt.www.watch.player.seekTo(21*60+12);return false;">21:12</a> - Is This The World We Created...?<br /><br />Short song analysis:<br /><br />- "Bohemian Rhapsody": Although it's a short medley version, it's one of the best performances of the ballad section, with Freddie nailing the Bb4s with the correct studio phrasing (for the first time ever!).<br /><br />- "Radio Ga Ga": Although it's missing one chorus, this is one of - if not the best - the best versions ever, Freddie nails all the Bb4s and sounds very clean! Spike Edney's Roland Jupiter 8 also really shines through on this mix, compared to the DVD releases!<br /><br />- "Audience Improv": A great improv, Freddie sounds strong and confident. You gotta love when he sustains that A4 for 4 seconds!<br /><br />- "Hammer To Fall": Despite missing a verse and a chorus, it's a strong version (possibly the best ever). Freddie sings the song amazingly, and even ad-libs a C#5 and a C5! Also notice how heavy Brian's guitar sounds compared to the thin DVD mixes - it roars!<br /><br />- "Crazy Little Thing Called Love": A great version, the crowd loves the song, the jam is great as well! Only downside to this is the slight feedback issues.<br /><br />- "We Will Rock You": Although cut down to the 1st verse and chorus, Freddie sounds strong. He nails the A4, and the solo from Dr. May is brilliant!<br /><br />- "We Are the Champions": Perhaps the high-light of the performance - Freddie is very daring on this version, he sustains the pre-chorus Bb4s, nails the 1st C5, belts great A4s, but most importantly: He nails the chorus Bb4s, in all 3 choruses! This is the only time he has ever done so! It has to be said though, the last one sounds a bit rough, but that's a side effect of belting high notes for the past 18 minutes, with nodules AND laryngitis!<br /><br />- "Is This The World We Created... ?": Freddie and Brian perform a beautiful version of this, and it is one of the best versions ever. It's both sad and hilarious that a couple of BBC engineers are talking over the song, one of them being completely oblivious of the fact that he is interrupting the performance, on live television... Which was being televised to almost 2 billion homes.<br /><br /><br />All rights go to their respective owners!<br />-----Copyright Disclaimer Under Section 107 of the Copyright Act 1976, allowance is made for fair use for purposes such as criticism, comment, news reporting, teaching, scholarship, and research. Fair use is a use permitted by copyright statute that might otherwise be infringing. Non-profit, educational or personal use tips the balance in favor of fair use''', - 1477, - [{ - 'start_time': 36, - 'end_time': 162, - 'title': 'Bohemian Rhapsody', - }, { - 'start_time': 162, - 'end_time': 413, - 'title': 'Radio Ga Ga', - }, { - 'start_time': 413, - 'end_time': 454, - 'title': 'Ay Oh!', - }, { - 'start_time': 454, - 'end_time': 728, - 'title': 'Hammer To Fall', - }, { - 'start_time': 728, - 'end_time': 963, - 'title': 'Crazy Little Thing Called Love', - }, { - 'start_time': 963, - 'end_time': 1038, - 'title': 'We Will Rock You', - }, { - 'start_time': 1038, - 'end_time': 1272, - 'title': 'We Are The Champions', - }, { - 'start_time': 1272, - 'end_time': 1477, - 'title': 'Is This The World We Created...?', - }] - ), - ( - # https://www.youtube.com/watch?v=ekYlRhALiRQ - # pattern: <num>. <title> 0:00 - '1. Those Beaten Paths of Confusion <a href="#" onclick="yt.www.watch.player.seekTo(0*60+00);return false;">0:00</a><br />2. Beyond the Shadows of Emptiness & Nothingness <a href="#" onclick="yt.www.watch.player.seekTo(11*60+47);return false;">11:47</a><br />3. Poison Yourself...With Thought <a href="#" onclick="yt.www.watch.player.seekTo(26*60+30);return false;">26:30</a><br />4. The Agents of Transformation <a href="#" onclick="yt.www.watch.player.seekTo(35*60+57);return false;">35:57</a><br />5. Drowning in the Pain of Consciousness <a href="#" onclick="yt.www.watch.player.seekTo(44*60+32);return false;">44:32</a><br />6. Deny the Disease of Life <a href="#" onclick="yt.www.watch.player.seekTo(53*60+07);return false;">53:07</a><br /><br />More info/Buy: http://crepusculonegro.storenvy.com/products/257645-cn-03-arizmenda-within-the-vacuum-of-infinity<br /><br />No copyright is intended. The rights to this video are assumed by the owner and its affiliates.', - 4009, - [{ - 'start_time': 0, - 'end_time': 707, - 'title': '1. Those Beaten Paths of Confusion', - }, { - 'start_time': 707, - 'end_time': 1590, - 'title': '2. Beyond the Shadows of Emptiness & Nothingness', - }, { - 'start_time': 1590, - 'end_time': 2157, - 'title': '3. Poison Yourself...With Thought', - }, { - 'start_time': 2157, - 'end_time': 2672, - 'title': '4. The Agents of Transformation', - }, { - 'start_time': 2672, - 'end_time': 3187, - 'title': '5. Drowning in the Pain of Consciousness', - }, { - 'start_time': 3187, - 'end_time': 4009, - 'title': '6. Deny the Disease of Life', - }] - ), - ( - # https://www.youtube.com/watch?v=WjL4pSzog9w - # pattern: 00:00 <title> - '<a href="https://arizmenda.bandcamp.com/merch/despairs-depths-descended-cd" class="yt-uix-servicelink " data-target-new-window="True" data-servicelink="CDAQ6TgYACITCNf1raqT2dMCFdRjGAod_o0CBSj4HQ" data-url="https://arizmenda.bandcamp.com/merch/despairs-depths-descended-cd" rel="nofollow noopener" target="_blank">https://arizmenda.bandcamp.com/merch/...</a><br /><br /><a href="#" onclick="yt.www.watch.player.seekTo(00*60+00);return false;">00:00</a> Christening Unborn Deformities <br /><a href="#" onclick="yt.www.watch.player.seekTo(07*60+08);return false;">07:08</a> Taste of Purity<br /><a href="#" onclick="yt.www.watch.player.seekTo(16*60+16);return false;">16:16</a> Sculpting Sins of a Universal Tongue<br /><a href="#" onclick="yt.www.watch.player.seekTo(24*60+45);return false;">24:45</a> Birth<br /><a href="#" onclick="yt.www.watch.player.seekTo(31*60+24);return false;">31:24</a> Neves<br /><a href="#" onclick="yt.www.watch.player.seekTo(37*60+55);return false;">37:55</a> Libations in Limbo', - 2705, - [{ - 'start_time': 0, - 'end_time': 428, - 'title': 'Christening Unborn Deformities', - }, { - 'start_time': 428, - 'end_time': 976, - 'title': 'Taste of Purity', - }, { - 'start_time': 976, - 'end_time': 1485, - 'title': 'Sculpting Sins of a Universal Tongue', - }, { - 'start_time': 1485, - 'end_time': 1884, - 'title': 'Birth', - }, { - 'start_time': 1884, - 'end_time': 2275, - 'title': 'Neves', - }, { - 'start_time': 2275, - 'end_time': 2705, - 'title': 'Libations in Limbo', - }] - ), - ( - # https://www.youtube.com/watch?v=o3r1sn-t3is - # pattern: <title> 00:00 <note> - 'Download this show in MP3: <a href="http://sh.st/njZKK" class="yt-uix-servicelink " data-url="http://sh.st/njZKK" data-target-new-window="True" data-servicelink="CDAQ6TgYACITCK3j8_6o2dMCFVDCGAoduVAKKij4HQ" rel="nofollow noopener" target="_blank">http://sh.st/njZKK</a><br /><br />Setlist:<br />I-E-A-I-A-I-O <a href="#" onclick="yt.www.watch.player.seekTo(00*60+45);return false;">00:45</a><br />Suite-Pee <a href="#" onclick="yt.www.watch.player.seekTo(4*60+26);return false;">4:26</a> (Incomplete)<br />Attack <a href="#" onclick="yt.www.watch.player.seekTo(5*60+31);return false;">5:31</a> (First live performance since 2011)<br />Prison Song <a href="#" onclick="yt.www.watch.player.seekTo(8*60+42);return false;">8:42</a><br />Know <a href="#" onclick="yt.www.watch.player.seekTo(12*60+32);return false;">12:32</a> (First live performance since 2011)<br />Aerials <a href="#" onclick="yt.www.watch.player.seekTo(15*60+32);return false;">15:32</a><br />Soldier Side - Intro <a href="#" onclick="yt.www.watch.player.seekTo(19*60+13);return false;">19:13</a><br />B.Y.O.B. <a href="#" onclick="yt.www.watch.player.seekTo(20*60+09);return false;">20:09</a><br />Soil <a href="#" onclick="yt.www.watch.player.seekTo(24*60+32);return false;">24:32</a><br />Darts <a href="#" onclick="yt.www.watch.player.seekTo(27*60+48);return false;">27:48</a><br />Radio/Video <a href="#" onclick="yt.www.watch.player.seekTo(30*60+38);return false;">30:38</a><br />Hypnotize <a href="#" onclick="yt.www.watch.player.seekTo(35*60+05);return false;">35:05</a><br />Temper <a href="#" onclick="yt.www.watch.player.seekTo(38*60+08);return false;">38:08</a> (First live performance since 1999)<br />CUBErt <a href="#" onclick="yt.www.watch.player.seekTo(41*60+00);return false;">41:00</a><br />Needles <a href="#" onclick="yt.www.watch.player.seekTo(42*60+57);return false;">42:57</a><br />Deer Dance <a href="#" onclick="yt.www.watch.player.seekTo(46*60+27);return false;">46:27</a><br />Bounce <a href="#" onclick="yt.www.watch.player.seekTo(49*60+38);return false;">49:38</a><br />Suggestions <a href="#" onclick="yt.www.watch.player.seekTo(51*60+25);return false;">51:25</a><br />Psycho <a href="#" onclick="yt.www.watch.player.seekTo(53*60+52);return false;">53:52</a><br />Chop Suey! <a href="#" onclick="yt.www.watch.player.seekTo(58*60+13);return false;">58:13</a><br />Lonely Day <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+01*60+15);return false;">1:01:15</a><br />Question! <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+04*60+14);return false;">1:04:14</a><br />Lost in Hollywood <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+08*60+10);return false;">1:08:10</a><br />Vicinity of Obscenity <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+13*60+40);return false;">1:13:40</a>(First live performance since 2012)<br />Forest <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+16*60+17);return false;">1:16:17</a><br />Cigaro <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+20*60+02);return false;">1:20:02</a><br />Toxicity <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+23*60+57);return false;">1:23:57</a>(with Chino Moreno)<br />Sugar <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+27*60+53);return false;">1:27:53</a>', - 5640, - [{ - 'start_time': 45, - 'end_time': 266, - 'title': 'I-E-A-I-A-I-O', - }, { - 'start_time': 266, - 'end_time': 331, - 'title': 'Suite-Pee (Incomplete)', - }, { - 'start_time': 331, - 'end_time': 522, - 'title': 'Attack (First live performance since 2011)', - }, { - 'start_time': 522, - 'end_time': 752, - 'title': 'Prison Song', - }, { - 'start_time': 752, - 'end_time': 932, - 'title': 'Know (First live performance since 2011)', - }, { - 'start_time': 932, - 'end_time': 1153, - 'title': 'Aerials', - }, { - 'start_time': 1153, - 'end_time': 1209, - 'title': 'Soldier Side - Intro', - }, { - 'start_time': 1209, - 'end_time': 1472, - 'title': 'B.Y.O.B.', - }, { - 'start_time': 1472, - 'end_time': 1668, - 'title': 'Soil', - }, { - 'start_time': 1668, - 'end_time': 1838, - 'title': 'Darts', - }, { - 'start_time': 1838, - 'end_time': 2105, - 'title': 'Radio/Video', - }, { - 'start_time': 2105, - 'end_time': 2288, - 'title': 'Hypnotize', - }, { - 'start_time': 2288, - 'end_time': 2460, - 'title': 'Temper (First live performance since 1999)', - }, { - 'start_time': 2460, - 'end_time': 2577, - 'title': 'CUBErt', - }, { - 'start_time': 2577, - 'end_time': 2787, - 'title': 'Needles', - }, { - 'start_time': 2787, - 'end_time': 2978, - 'title': 'Deer Dance', - }, { - 'start_time': 2978, - 'end_time': 3085, - 'title': 'Bounce', - }, { - 'start_time': 3085, - 'end_time': 3232, - 'title': 'Suggestions', - }, { - 'start_time': 3232, - 'end_time': 3493, - 'title': 'Psycho', - }, { - 'start_time': 3493, - 'end_time': 3675, - 'title': 'Chop Suey!', - }, { - 'start_time': 3675, - 'end_time': 3854, - 'title': 'Lonely Day', - }, { - 'start_time': 3854, - 'end_time': 4090, - 'title': 'Question!', - }, { - 'start_time': 4090, - 'end_time': 4420, - 'title': 'Lost in Hollywood', - }, { - 'start_time': 4420, - 'end_time': 4577, - 'title': 'Vicinity of Obscenity (First live performance since 2012)', - }, { - 'start_time': 4577, - 'end_time': 4802, - 'title': 'Forest', - }, { - 'start_time': 4802, - 'end_time': 5037, - 'title': 'Cigaro', - }, { - 'start_time': 5037, - 'end_time': 5273, - 'title': 'Toxicity (with Chino Moreno)', - }, { - 'start_time': 5273, - 'end_time': 5640, - 'title': 'Sugar', - }] - ), - ( - # https://www.youtube.com/watch?v=PkYLQbsqCE8 - # pattern: <num> - <title> [<latinized title>] 0:00:00 - '''Затемно (Zatemno) is an Obscure Black Metal Band from Russia.<br /><br />"Во прах (Vo prakh)'' Into The Ashes", Debut mini-album released may 6, 2016, by Death Knell Productions<br />Released on 6 panel digipak CD, limited to 100 copies only<br />And digital format on Bandcamp<br /><br />Tracklist<br /><br />1 - Во прах [Vo prakh] <a href="#" onclick="yt.www.watch.player.seekTo(0*3600+00*60+00);return false;">0:00:00</a><br />2 - Искупление [Iskupleniye] <a href="#" onclick="yt.www.watch.player.seekTo(0*3600+08*60+10);return false;">0:08:10</a><br />3 - Из серпов луны...[Iz serpov luny] <a href="#" onclick="yt.www.watch.player.seekTo(0*3600+14*60+30);return false;">0:14:30</a><br /><br />Links:<br /><a href="https://deathknellprod.bandcamp.com/album/--2" class="yt-uix-servicelink " data-target-new-window="True" data-url="https://deathknellprod.bandcamp.com/album/--2" data-servicelink="CC8Q6TgYACITCNP234Kr2dMCFcNxGAodQqsIwSj4HQ" target="_blank" rel="nofollow noopener">https://deathknellprod.bandcamp.com/a...</a><br /><a href="https://www.facebook.com/DeathKnellProd/" class="yt-uix-servicelink " data-target-new-window="True" data-url="https://www.facebook.com/DeathKnellProd/" data-servicelink="CC8Q6TgYACITCNP234Kr2dMCFcNxGAodQqsIwSj4HQ" target="_blank" rel="nofollow noopener">https://www.facebook.com/DeathKnellProd/</a><br /><br /><br />I don't have any right about this artifact, my only intention is to spread the music of the band, all rights are reserved to the Затемно (Zatemno) and his producers, Death Knell Productions.<br /><br />------------------------------------------------------------------<br /><br />Subscribe for more videos like this.<br />My link: <a href="https://web.facebook.com/AttackOfTheDragons" class="yt-uix-servicelink " data-target-new-window="True" data-url="https://web.facebook.com/AttackOfTheDragons" data-servicelink="CC8Q6TgYACITCNP234Kr2dMCFcNxGAodQqsIwSj4HQ" target="_blank" rel="nofollow noopener">https://web.facebook.com/AttackOfTheD...</a>''', - 1138, - [{ - 'start_time': 0, - 'end_time': 490, - 'title': '1 - Во прах [Vo prakh]', - }, { - 'start_time': 490, - 'end_time': 870, - 'title': '2 - Искупление [Iskupleniye]', - }, { - 'start_time': 870, - 'end_time': 1138, - 'title': '3 - Из серпов луны...[Iz serpov luny]', - }] - ), - ( - # https://www.youtube.com/watch?v=xZW70zEasOk - # time point more than duration - '''● LCS Spring finals: Saturday and Sunday from <a href="#" onclick="yt.www.watch.player.seekTo(13*60+30);return false;">13:30</a> outside the venue! <br />● PAX East: Fri, Sat & Sun - more info in tomorrows video on the main channel!''', - 283, - [] - ), - ] - - def test_youtube_chapters(self): - for description, duration, expected_chapters in self._TEST_CASES: - ie = YoutubeIE() - expect_value( - self, ie._extract_chapters_from_description(description, duration), - expected_chapters, None) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index c4f0abbea..cf2fdf14f 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -12,6 +12,7 @@ from test.helper import FakeYDL from youtube_dl.extractor import ( YoutubePlaylistIE, + YoutubeTabIE, YoutubeIE, ) @@ -57,14 +58,22 @@ class TestYoutubeLists(unittest.TestCase): entries = result['entries'] self.assertEqual(len(entries), 100) - def test_youtube_flat_playlist_titles(self): + def test_youtube_flat_playlist_extraction(self): dl = FakeYDL() dl.params['extract_flat'] = True - ie = YoutubePlaylistIE(dl) - result = ie.extract('https://www.youtube.com/playlist?list=PL-KKIb8rvtMSrAO9YFbeM6UQrAqoFTUWv') + ie = YoutubeTabIE(dl) + result = ie.extract('https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc') self.assertIsPlaylist(result) - for entry in result['entries']: - self.assertTrue(entry.get('title')) + entries = list(result['entries']) + self.assertTrue(len(entries) == 1) + video = entries[0] + self.assertEqual(video['_type'], 'url_transparent') + self.assertEqual(video['ie_key'], 'Youtube') + self.assertEqual(video['id'], 'BaW_jenozKc') + self.assertEqual(video['url'], 'BaW_jenozKc') + self.assertEqual(video['title'], 'youtube-dl test video "\'/\\ä↭𝕐') + self.assertEqual(video['duration'], 10) + self.assertEqual(video['uploader'], 'Philipp Hagemeister') if __name__ == '__main__': diff --git a/test/test_youtube_misc.py b/test/test_youtube_misc.py new file mode 100644 index 000000000..e18e71101 --- /dev/null +++ b/test/test_youtube_misc.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python +from __future__ import unicode_literals + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +from youtube_dl.extractor import YoutubeIE + + +class TestYoutubeMisc(unittest.TestCase): + def test_youtube_extract(self): + assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id) + assertExtractId('http://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') + assertExtractId('https://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') + assertExtractId('https://www.youtube.com/watch?feature=player_embedded&v=BaW_jenozKc', 'BaW_jenozKc') + assertExtractId('https://www.youtube.com/watch_popup?v=BaW_jenozKc', 'BaW_jenozKc') + assertExtractId('http://www.youtube.com/watch?v=BaW_jenozKcsharePLED17F32AD9753930', 'BaW_jenozKc') + assertExtractId('BaW_jenozKc', 'BaW_jenozKc') + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 69df30eda..627d4cb92 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -19,55 +19,46 @@ from youtube_dl.compat import compat_str, compat_urlretrieve _TESTS = [ ( 'https://s.ytimg.com/yts/jsbin/html5player-vflHOr_nV.js', - 'js', 86, '>=<;:/.-[+*)(\'&%$#"!ZYX0VUTSRQPONMLKJIHGFEDCBA\\yxwvutsrqponmlkjihgfedcba987654321', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-vfldJ8xgI.js', - 'js', 85, '3456789a0cdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRS[UVWXYZ!"#$%&\'()*+,-./:;<=>?@', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-vfle-mVwz.js', - 'js', 90, ']\\[@?>=<;:/.-,+*)(\'&%$#"hZYXWVUTSRQPONMLKJIHGFEDCBAzyxwvutsrqponmlkjiagfedcb39876', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vfl0Cbn9e.js', - 'js', 84, 'O1I3456789abcde0ghijklmnopqrstuvwxyzABCDEFGHfJKLMN2PQRSTUVW@YZ!"#$%&\'()*+,-./:;<=', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflXGBaUN.js', - 'js', '2ACFC7A61CA478CD21425E5A57EBD73DDC78E22A.2094302436B2D377D14A3BBA23022D023B8BC25AA', 'A52CB8B320D22032ABB3A41D773D2B6342034902.A22E87CDD37DBE75A5E52412DC874AC16A7CFCA2', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflBb0OQx.js', - 'js', 84, '123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQ0STUVWXYZ!"#$%&\'()*+,@./:;<=>' ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vfl9FYC6l.js', - 'js', 83, '123456789abcdefghijklmnopqr0tuvwxyzABCDETGHIJKLMNOPQRS>UVWXYZ!"#$%&\'()*+,-./:;<=F' ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflCGk6yw/html5player.js', - 'js', '4646B5181C6C3020DF1D9C7FCFEA.AD80ABF70C39BD369CCCAE780AFBB98FA6B6CB42766249D9488C288', '82C8849D94266724DC6B6AF89BBFA087EACCD963.B93C07FBA084ACAEFCF7C9D1FD0203C6C1815B6B' ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflKjOTVq/html5player.js', - 'js', '312AA52209E3623129A412D56A40F11CB0AF14AE.3EE09501CB14E3BCDC3B2AE808BF3F1D14E7FBF12', '112AA5220913623229A412D56A40F11CB0AF14AE.3EE0950FCB14EEBCDC3B2AE808BF331D14E7FBF3', ) @@ -78,6 +69,10 @@ class TestPlayerInfo(unittest.TestCase): def test_youtube_extract_player_info(self): PLAYER_URLS = ( ('https://www.youtube.com/s/player/64dddad9/player_ias.vflset/en_US/base.js', '64dddad9'), + ('https://www.youtube.com/s/player/64dddad9/player_ias.vflset/fr_FR/base.js', '64dddad9'), + ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-en_US.vflset/base.js', '64dddad9'), + ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-de_DE.vflset/base.js', '64dddad9'), + ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-tablet-en_US.vflset/base.js', '64dddad9'), # obsolete ('https://www.youtube.com/yts/jsbin/player_ias-vfle4-e03/en_US/base.js', 'vfle4-e03'), ('https://www.youtube.com/yts/jsbin/player_ias-vfl49f_g4/en_US/base.js', 'vfl49f_g4'), @@ -86,13 +81,9 @@ class TestPlayerInfo(unittest.TestCase): ('https://www.youtube.com/yts/jsbin/player-en_US-vflaxXRn1/base.js', 'vflaxXRn1'), ('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflXGBaUN.js', 'vflXGBaUN'), ('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflKjOTVq/html5player.js', 'vflKjOTVq'), - ('http://s.ytimg.com/yt/swfbin/watch_as3-vflrEm9Nq.swf', 'vflrEm9Nq'), - ('https://s.ytimg.com/yts/swfbin/player-vflenCdZL/watch_as3.swf', 'vflenCdZL'), ) for player_url, expected_player_id in PLAYER_URLS: - expected_player_type = player_url.split('.')[-1] - player_type, player_id = YoutubeIE._extract_player_info(player_url) - self.assertEqual(player_type, expected_player_type) + player_id = YoutubeIE._extract_player_info(player_url) self.assertEqual(player_id, expected_player_id) @@ -104,13 +95,13 @@ class TestSignature(unittest.TestCase): os.mkdir(self.TESTDATA_DIR) -def make_tfunc(url, stype, sig_input, expected_sig): +def make_tfunc(url, sig_input, expected_sig): m = re.match(r'.*-([a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.[a-z]+$', url) assert m, '%r should follow URL format' % url test_id = m.group(1) def test_func(self): - basename = 'player-%s.%s' % (test_id, stype) + basename = 'player-%s.js' % test_id fn = os.path.join(self.TESTDATA_DIR, basename) if not os.path.exists(fn): @@ -118,22 +109,16 @@ def make_tfunc(url, stype, sig_input, expected_sig): ydl = FakeYDL() ie = YoutubeIE(ydl) - if stype == 'js': - with io.open(fn, encoding='utf-8') as testf: - jscode = testf.read() - func = ie._parse_sig_js(jscode) - else: - assert stype == 'swf' - with open(fn, 'rb') as testf: - swfcode = testf.read() - func = ie._parse_sig_swf(swfcode) + with io.open(fn, encoding='utf-8') as testf: + jscode = testf.read() + func = ie._parse_sig_js(jscode) src_sig = ( compat_str(string.printable[:sig_input]) if isinstance(sig_input, int) else sig_input) got_sig = func(src_sig) self.assertEqual(got_sig, expected_sig) - test_func.__name__ = str('test_signature_' + stype + '_' + test_id) + test_func.__name__ = str('test_signature_js_' + test_id) setattr(TestSignature, test_func.__name__, test_func) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index aaac149e9..fe30758ef 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -163,6 +163,7 @@ class YoutubeDL(object): simulate: Do not download the video files. format: Video format code. See options.py for more information. outtmpl: Template for output names. + outtmpl_na_placeholder: Placeholder for unavailable meta fields. restrictfilenames: Do not allow "&" and spaces in file names ignoreerrors: Do not stop on download errors. force_generic_extractor: Force downloader to use the generic extractor @@ -338,6 +339,8 @@ class YoutubeDL(object): _pps = [] _download_retcode = None _num_downloads = None + _playlist_level = 0 + _playlist_urls = set() _screen_file = None def __init__(self, params=None, auto_init=True): @@ -656,7 +659,7 @@ class YoutubeDL(object): template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v)) for k, v in template_dict.items() if v is not None and not isinstance(v, (list, tuple, dict))) - template_dict = collections.defaultdict(lambda: 'NA', template_dict) + template_dict = collections.defaultdict(lambda: self.params.get('outtmpl_na_placeholder', 'NA'), template_dict) outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) @@ -676,8 +679,8 @@ class YoutubeDL(object): # Missing numeric fields used together with integer presentation types # in format specification will break the argument substitution since - # string 'NA' is returned for missing fields. We will patch output - # template for missing fields to meet string presentation type. + # string NA placeholder is returned for missing fields. We will patch + # output template for missing fields to meet string presentation type. for numeric_field in self._NUMERIC_FIELDS: if numeric_field not in template_dict: # As of [1] format syntax is: @@ -770,11 +773,20 @@ class YoutubeDL(object): def extract_info(self, url, download=True, ie_key=None, extra_info={}, process=True, force_generic_extractor=False): - ''' - Returns a list with a dictionary for each video we find. - If 'download', also downloads the videos. - extra_info is a dict containing the extra values to add to each result - ''' + """ + Return a list with a dictionary for each video extracted. + + Arguments: + url -- URL to extract + + Keyword arguments: + download -- whether to download videos during extraction + ie_key -- extractor key hint + extra_info -- dictionary containing the extra values to add to each result + process -- whether to resolve all unresolved references (URLs, playlist items), + must be True for download to work. + force_generic_extractor -- force using the generic extractor + """ if not ie_key and force_generic_extractor: ie_key = 'Generic' @@ -906,115 +918,23 @@ class YoutubeDL(object): return self.process_ie_result( new_result, download=download, extra_info=extra_info) elif result_type in ('playlist', 'multi_video'): - # We process each entry in the playlist - playlist = ie_result.get('title') or ie_result.get('id') - self.to_screen('[download] Downloading playlist: %s' % playlist) - - playlist_results = [] - - playliststart = self.params.get('playliststart', 1) - 1 - playlistend = self.params.get('playlistend') - # For backwards compatibility, interpret -1 as whole list - if playlistend == -1: - playlistend = None - - playlistitems_str = self.params.get('playlist_items') - playlistitems = None - if playlistitems_str is not None: - def iter_playlistitems(format): - for string_segment in format.split(','): - if '-' in string_segment: - start, end = string_segment.split('-') - for item in range(int(start), int(end) + 1): - yield int(item) - else: - yield int(string_segment) - playlistitems = orderedSet(iter_playlistitems(playlistitems_str)) - - ie_entries = ie_result['entries'] - - def make_playlistitems_entries(list_ie_entries): - num_entries = len(list_ie_entries) - return [ - list_ie_entries[i - 1] for i in playlistitems - if -num_entries <= i - 1 < num_entries] - - def report_download(num_entries): + # Protect from infinite recursion due to recursively nested playlists + # (see https://github.com/ytdl-org/youtube-dl/issues/27833) + webpage_url = ie_result['webpage_url'] + if webpage_url in self._playlist_urls: self.to_screen( - '[%s] playlist %s: Downloading %d videos' % - (ie_result['extractor'], playlist, num_entries)) + '[download] Skipping already downloaded playlist: %s' + % ie_result.get('title') or ie_result.get('id')) + return - if isinstance(ie_entries, list): - n_all_entries = len(ie_entries) - if playlistitems: - entries = make_playlistitems_entries(ie_entries) - else: - entries = ie_entries[playliststart:playlistend] - n_entries = len(entries) - self.to_screen( - '[%s] playlist %s: Collected %d video ids (downloading %d of them)' % - (ie_result['extractor'], playlist, n_all_entries, n_entries)) - elif isinstance(ie_entries, PagedList): - if playlistitems: - entries = [] - for item in playlistitems: - entries.extend(ie_entries.getslice( - item - 1, item - )) - else: - entries = ie_entries.getslice( - playliststart, playlistend) - n_entries = len(entries) - report_download(n_entries) - else: # iterable - if playlistitems: - entries = make_playlistitems_entries(list(itertools.islice( - ie_entries, 0, max(playlistitems)))) - else: - entries = list(itertools.islice( - ie_entries, playliststart, playlistend)) - n_entries = len(entries) - report_download(n_entries) - - if self.params.get('playlistreverse', False): - entries = entries[::-1] - - if self.params.get('playlistrandom', False): - random.shuffle(entries) - - x_forwarded_for = ie_result.get('__x_forwarded_for_ip') - - for i, entry in enumerate(entries, 1): - self.to_screen('[download] Downloading video %s of %s' % (i, n_entries)) - # This __x_forwarded_for_ip thing is a bit ugly but requires - # minimal changes - if x_forwarded_for: - entry['__x_forwarded_for_ip'] = x_forwarded_for - extra = { - 'n_entries': n_entries, - 'playlist': playlist, - 'playlist_id': ie_result.get('id'), - 'playlist_title': ie_result.get('title'), - 'playlist_uploader': ie_result.get('uploader'), - 'playlist_uploader_id': ie_result.get('uploader_id'), - 'playlist_index': playlistitems[i - 1] if playlistitems else i + playliststart, - 'extractor': ie_result['extractor'], - 'webpage_url': ie_result['webpage_url'], - 'webpage_url_basename': url_basename(ie_result['webpage_url']), - 'extractor_key': ie_result['extractor_key'], - } - - reason = self._match_entry(entry, incomplete=True) - if reason is not None: - self.to_screen('[download] ' + reason) - continue - - entry_result = self.__process_iterable_entry(entry, download, extra) - # TODO: skip failed (empty) entries? - playlist_results.append(entry_result) - ie_result['entries'] = playlist_results - self.to_screen('[download] Finished downloading playlist: %s' % playlist) - return ie_result + self._playlist_level += 1 + self._playlist_urls.add(webpage_url) + try: + return self.__process_playlist(ie_result, download) + finally: + self._playlist_level -= 1 + if not self._playlist_level: + self._playlist_urls.clear() elif result_type == 'compat_list': self.report_warning( 'Extractor %s returned a compat_list result. ' @@ -1039,6 +959,118 @@ class YoutubeDL(object): else: raise Exception('Invalid result type: %s' % result_type) + def __process_playlist(self, ie_result, download): + # We process each entry in the playlist + playlist = ie_result.get('title') or ie_result.get('id') + + self.to_screen('[download] Downloading playlist: %s' % playlist) + + playlist_results = [] + + playliststart = self.params.get('playliststart', 1) - 1 + playlistend = self.params.get('playlistend') + # For backwards compatibility, interpret -1 as whole list + if playlistend == -1: + playlistend = None + + playlistitems_str = self.params.get('playlist_items') + playlistitems = None + if playlistitems_str is not None: + def iter_playlistitems(format): + for string_segment in format.split(','): + if '-' in string_segment: + start, end = string_segment.split('-') + for item in range(int(start), int(end) + 1): + yield int(item) + else: + yield int(string_segment) + playlistitems = orderedSet(iter_playlistitems(playlistitems_str)) + + ie_entries = ie_result['entries'] + + def make_playlistitems_entries(list_ie_entries): + num_entries = len(list_ie_entries) + return [ + list_ie_entries[i - 1] for i in playlistitems + if -num_entries <= i - 1 < num_entries] + + def report_download(num_entries): + self.to_screen( + '[%s] playlist %s: Downloading %d videos' % + (ie_result['extractor'], playlist, num_entries)) + + if isinstance(ie_entries, list): + n_all_entries = len(ie_entries) + if playlistitems: + entries = make_playlistitems_entries(ie_entries) + else: + entries = ie_entries[playliststart:playlistend] + n_entries = len(entries) + self.to_screen( + '[%s] playlist %s: Collected %d video ids (downloading %d of them)' % + (ie_result['extractor'], playlist, n_all_entries, n_entries)) + elif isinstance(ie_entries, PagedList): + if playlistitems: + entries = [] + for item in playlistitems: + entries.extend(ie_entries.getslice( + item - 1, item + )) + else: + entries = ie_entries.getslice( + playliststart, playlistend) + n_entries = len(entries) + report_download(n_entries) + else: # iterable + if playlistitems: + entries = make_playlistitems_entries(list(itertools.islice( + ie_entries, 0, max(playlistitems)))) + else: + entries = list(itertools.islice( + ie_entries, playliststart, playlistend)) + n_entries = len(entries) + report_download(n_entries) + + if self.params.get('playlistreverse', False): + entries = entries[::-1] + + if self.params.get('playlistrandom', False): + random.shuffle(entries) + + x_forwarded_for = ie_result.get('__x_forwarded_for_ip') + + for i, entry in enumerate(entries, 1): + self.to_screen('[download] Downloading video %s of %s' % (i, n_entries)) + # This __x_forwarded_for_ip thing is a bit ugly but requires + # minimal changes + if x_forwarded_for: + entry['__x_forwarded_for_ip'] = x_forwarded_for + extra = { + 'n_entries': n_entries, + 'playlist': playlist, + 'playlist_id': ie_result.get('id'), + 'playlist_title': ie_result.get('title'), + 'playlist_uploader': ie_result.get('uploader'), + 'playlist_uploader_id': ie_result.get('uploader_id'), + 'playlist_index': playlistitems[i - 1] if playlistitems else i + playliststart, + 'extractor': ie_result['extractor'], + 'webpage_url': ie_result['webpage_url'], + 'webpage_url_basename': url_basename(ie_result['webpage_url']), + 'extractor_key': ie_result['extractor_key'], + } + + reason = self._match_entry(entry, incomplete=True) + if reason is not None: + self.to_screen('[download] ' + reason) + continue + + entry_result = self.__process_iterable_entry(entry, download, extra) + # TODO: skip failed (empty) entries? + playlist_results.append(entry_result) + ie_result['entries'] = playlist_results + self.to_screen('[download] Finished downloading playlist: %s' % playlist) + return ie_result + @__handle_extraction_exceptions def __process_iterable_entry(self, entry, download, extra_info): return self.process_ie_result( @@ -1226,6 +1258,8 @@ class YoutubeDL(object): group = _parse_format_selection(tokens, inside_group=True) current_selector = FormatSelector(GROUP, group, []) elif string == '+': + if inside_merge: + raise syntax_error('Unexpected "+"', start) video_selector = current_selector audio_selector = _parse_format_selection(tokens, inside_merge=True) if not video_selector or not audio_selector: @@ -1486,14 +1520,18 @@ class YoutubeDL(object): if 'display_id' not in info_dict and 'id' in info_dict: info_dict['display_id'] = info_dict['id'] - if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None: - # Working around out-of-range timestamp values (e.g. negative ones on Windows, - # see http://bugs.python.org/issue1646728) - try: - upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp']) - info_dict['upload_date'] = upload_date.strftime('%Y%m%d') - except (ValueError, OverflowError, OSError): - pass + for ts_key, date_key in ( + ('timestamp', 'upload_date'), + ('release_timestamp', 'release_date'), + ): + if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None: + # Working around out-of-range timestamp values (e.g. negative ones on Windows, + # see http://bugs.python.org/issue1646728) + try: + upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key]) + info_dict[date_key] = upload_date.strftime('%Y%m%d') + except (ValueError, OverflowError, OSError): + pass # Auto generate title fields corresponding to the *_number fields when missing # in order to always have clean titles. This is very common for TV series. @@ -1777,6 +1815,8 @@ class YoutubeDL(object): os.makedirs(dn) return True except (OSError, IOError) as err: + if isinstance(err, OSError) and err.errno == errno.EEXIST: + return True self.report_error('unable to create directory ' + error_to_compat_str(err)) return False diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 9a659fc65..e1bd67919 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -340,6 +340,7 @@ def _real_main(argv=None): 'format': opts.format, 'listformats': opts.listformats, 'outtmpl': outtmpl, + 'outtmpl_na_placeholder': opts.outtmpl_na_placeholder, 'autonumber_size': opts.autonumber_size, 'autonumber_start': opts.autonumber_start, 'restrictfilenames': opts.restrictfilenames, diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 6c3d49d45..9e45c454b 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -73,6 +73,15 @@ try: except ImportError: # Python 2 import Cookie as compat_cookies +if sys.version_info[0] == 2: + class compat_cookies_SimpleCookie(compat_cookies.SimpleCookie): + def load(self, rawdata): + if isinstance(rawdata, compat_str): + rawdata = str(rawdata) + return super(compat_cookies_SimpleCookie, self).load(rawdata) +else: + compat_cookies_SimpleCookie = compat_cookies.SimpleCookie + try: import html.entities as compat_html_entities except ImportError: # Python 2 @@ -3000,6 +3009,7 @@ __all__ = [ 'compat_cookiejar', 'compat_cookiejar_Cookie', 'compat_cookies', + 'compat_cookies_SimpleCookie', 'compat_ctypes_WINFUNCTYPE', 'compat_etree_Element', 'compat_etree_fromstring', diff --git a/youtube_dl/extractor/abcnews.py b/youtube_dl/extractor/abcnews.py index 8b407bf9c..908c83377 100644 --- a/youtube_dl/extractor/abcnews.py +++ b/youtube_dl/extractor/abcnews.py @@ -1,14 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals -import calendar import re -import time from .amp import AMPIE from .common import InfoExtractor -from .youtube import YoutubeIE -from ..compat import compat_urlparse +from ..utils import ( + parse_duration, + parse_iso8601, + try_get, +) class AbcNewsVideoIE(AMPIE): @@ -18,8 +19,8 @@ class AbcNewsVideoIE(AMPIE): (?: abcnews\.go\.com/ (?: - [^/]+/video/(?P<display_id>[0-9a-z-]+)-| - video/embed\?.*?\bid= + (?:[^/]+/)*video/(?P<display_id>[0-9a-z-]+)-| + video/(?:embed|itemfeed)\?.*?\bid= )| fivethirtyeight\.abcnews\.go\.com/video/embed/\d+/ ) @@ -36,6 +37,8 @@ class AbcNewsVideoIE(AMPIE): 'description': 'George Stephanopoulos goes one-on-one with Iranian Foreign Minister Dr. Javad Zarif.', 'duration': 180, 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1380454200, + 'upload_date': '20130929', }, 'params': { # m3u8 download @@ -47,6 +50,12 @@ class AbcNewsVideoIE(AMPIE): }, { 'url': 'http://abcnews.go.com/2020/video/2020-husband-stands-teacher-jail-student-affairs-26119478', 'only_matching': True, + }, { + 'url': 'http://abcnews.go.com/video/itemfeed?id=46979033', + 'only_matching': True, + }, { + 'url': 'https://abcnews.go.com/GMA/News/video/history-christmas-story-67894761', + 'only_matching': True, }] def _real_extract(self, url): @@ -67,28 +76,23 @@ class AbcNewsIE(InfoExtractor): _VALID_URL = r'https?://abcnews\.go\.com/(?:[^/]+/)+(?P<display_id>[0-9a-z-]+)/story\?id=(?P<id>\d+)' _TESTS = [{ - 'url': 'http://abcnews.go.com/Blotter/News/dramatic-video-rare-death-job-america/story?id=10498713#.UIhwosWHLjY', + # Youtube Embeds + 'url': 'https://abcnews.go.com/Entertainment/peter-billingsley-child-actor-christmas-story-hollywood-power/story?id=51286501', 'info_dict': { - 'id': '10505354', - 'ext': 'flv', - 'display_id': 'dramatic-video-rare-death-job-america', - 'title': 'Occupational Hazards', - 'description': 'Nightline investigates the dangers that lurk at various jobs.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20100428', - 'timestamp': 1272412800, + 'id': '51286501', + 'title': "Peter Billingsley: From child actor in 'A Christmas Story' to Hollywood power player", + 'description': 'Billingsley went from a child actor to Hollywood power player.', }, - 'add_ie': ['AbcNewsVideo'], + 'playlist_count': 5, }, { 'url': 'http://abcnews.go.com/Entertainment/justin-timberlake-performs-stop-feeling-eurovision-2016/story?id=39125818', 'info_dict': { 'id': '38897857', 'ext': 'mp4', - 'display_id': 'justin-timberlake-performs-stop-feeling-eurovision-2016', 'title': 'Justin Timberlake Drops Hints For Secret Single', 'description': 'Lara Spencer reports the buzziest stories of the day in "GMA" Pop News.', - 'upload_date': '20160515', - 'timestamp': 1463329500, + 'upload_date': '20160505', + 'timestamp': 1462442280, }, 'params': { # m3u8 download @@ -100,49 +104,55 @@ class AbcNewsIE(InfoExtractor): }, { 'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343', 'only_matching': True, + }, { + # inline.type == 'video' + 'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') - video_id = mobj.group('id') + story_id = self._match_id(url) + webpage = self._download_webpage(url, story_id) + story = self._parse_json(self._search_regex( + r"window\['__abcnews__'\]\s*=\s*({.+?});", + webpage, 'data'), story_id)['page']['content']['story']['everscroll'][0] + article_contents = story.get('articleContents') or {} - webpage = self._download_webpage(url, video_id) - video_url = self._search_regex( - r'window\.abcnvideo\.url\s*=\s*"([^"]+)"', webpage, 'video URL') - full_video_url = compat_urlparse.urljoin(url, video_url) + def entries(): + featured_video = story.get('featuredVideo') or {} + feed = try_get(featured_video, lambda x: x['video']['feed']) + if feed: + yield { + '_type': 'url', + 'id': featured_video.get('id'), + 'title': featured_video.get('name'), + 'url': feed, + 'thumbnail': featured_video.get('images'), + 'description': featured_video.get('description'), + 'timestamp': parse_iso8601(featured_video.get('uploadDate')), + 'duration': parse_duration(featured_video.get('duration')), + 'ie_key': AbcNewsVideoIE.ie_key(), + } - youtube_url = YoutubeIE._extract_url(webpage) + for inline in (article_contents.get('inlines') or []): + inline_type = inline.get('type') + if inline_type == 'iframe': + iframe_url = try_get(inline, lambda x: x['attrs']['src']) + if iframe_url: + yield self.url_result(iframe_url) + elif inline_type == 'video': + video_id = inline.get('id') + if video_id: + yield { + '_type': 'url', + 'id': video_id, + 'url': 'http://abcnews.go.com/video/embed?id=' + video_id, + 'thumbnail': inline.get('imgSrc') or inline.get('imgDefault'), + 'description': inline.get('description'), + 'duration': parse_duration(inline.get('duration')), + 'ie_key': AbcNewsVideoIE.ie_key(), + } - timestamp = None - date_str = self._html_search_regex( - r'<span[^>]+class="timestamp">([^<]+)</span>', - webpage, 'timestamp', fatal=False) - if date_str: - tz_offset = 0 - if date_str.endswith(' ET'): # Eastern Time - tz_offset = -5 - date_str = date_str[:-3] - date_formats = ['%b. %d, %Y', '%b %d, %Y, %I:%M %p'] - for date_format in date_formats: - try: - timestamp = calendar.timegm(time.strptime(date_str.strip(), date_format)) - except ValueError: - continue - if timestamp is not None: - timestamp -= tz_offset * 3600 - - entry = { - '_type': 'url_transparent', - 'ie_key': AbcNewsVideoIE.ie_key(), - 'url': full_video_url, - 'id': video_id, - 'display_id': display_id, - 'timestamp': timestamp, - } - - if youtube_url: - entries = [entry, self.url_result(youtube_url, ie=YoutubeIE.ie_key())] - return self.playlist_result(entries) - - return entry + return self.playlist_result( + entries(), story_id, article_contents.get('headline'), + article_contents.get('subHead')) diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py index c95ad2173..a55ebbcbd 100644 --- a/youtube_dl/extractor/adn.py +++ b/youtube_dl/extractor/adn.py @@ -10,6 +10,7 @@ import random from .common import InfoExtractor from ..aes import aes_cbc_decrypt from ..compat import ( + compat_HTTPError, compat_b64decode, compat_ord, ) @@ -18,11 +19,14 @@ from ..utils import ( bytes_to_long, ExtractorError, float_or_none, + int_or_none, intlist_to_bytes, long_to_bytes, pkcs1pad, strip_or_none, - urljoin, + try_get, + unified_strdate, + urlencode_postdata, ) @@ -31,16 +35,30 @@ class ADNIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?animedigitalnetwork\.fr/video/[^/]+/(?P<id>\d+)' _TEST = { 'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites', - 'md5': 'e497370d847fd79d9d4c74be55575c7a', + 'md5': '0319c99885ff5547565cacb4f3f9348d', 'info_dict': { 'id': '7778', 'ext': 'mp4', - 'title': 'Blue Exorcist - Kyôto Saga - Épisode 1', + 'title': 'Blue Exorcist - Kyôto Saga - Episode 1', 'description': 'md5:2f7b5aa76edbc1a7a92cedcda8a528d5', + 'series': 'Blue Exorcist - Kyôto Saga', + 'duration': 1467, + 'release_date': '20170106', + 'comment_count': int, + 'average_rating': float, + 'season_number': 2, + 'episode': 'Début des hostilités', + 'episode_number': 1, } } + + _NETRC_MACHINE = 'animedigitalnetwork' _BASE_URL = 'http://animedigitalnetwork.fr' - _RSA_KEY = (0xc35ae1e4356b65a73b551493da94b8cb443491c0aa092a357a5aee57ffc14dda85326f42d716e539a34542a0d3f363adf16c5ec222d713d5997194030ee2e4f0d1fb328c01a81cf6868c090d50de8e169c6b13d1675b9eeed1cbc51e1fffca9b38af07f37abd790924cd3bee59d0257cfda4fe5f3f0534877e21ce5821447d1b, 65537) + _API_BASE_URL = 'https://gw.api.animedigitalnetwork.fr/' + _PLAYER_BASE_URL = _API_BASE_URL + 'player/' + _HEADERS = {} + _LOGIN_ERR_MESSAGE = 'Unable to log in' + _RSA_KEY = (0x9B42B08905199A5CCE2026274399CA560ECB209EE9878A708B1C0812E1BB8CB5D1FB7441861147C1A1F2F3A0476DD63A9CAC20D3E983613346850AA6CB38F16DC7D720FD7D86FC6E5B3D5BBC72E14CD0BF9E869F2CEA2CCAD648F1DCE38F1FF916CEFB2D339B64AA0264372344BC775E265E8A852F88144AB0BD9AA06C1A4ABB, 65537) _POS_ALIGN_MAP = { 'start': 1, 'end': 3, @@ -54,26 +72,24 @@ class ADNIE(InfoExtractor): def _ass_subtitles_timecode(seconds): return '%01d:%02d:%02d.%02d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 100) - def _get_subtitles(self, sub_path, video_id): - if not sub_path: + def _get_subtitles(self, sub_url, video_id): + if not sub_url: return None enc_subtitles = self._download_webpage( - urljoin(self._BASE_URL, sub_path), - video_id, 'Downloading subtitles location', fatal=False) or '{}' + sub_url, video_id, 'Downloading subtitles location', fatal=False) or '{}' subtitle_location = (self._parse_json(enc_subtitles, video_id, fatal=False) or {}).get('location') if subtitle_location: enc_subtitles = self._download_webpage( - urljoin(self._BASE_URL, subtitle_location), - video_id, 'Downloading subtitles data', fatal=False, - headers={'Origin': 'https://animedigitalnetwork.fr'}) + subtitle_location, video_id, 'Downloading subtitles data', + fatal=False, headers={'Origin': 'https://animedigitalnetwork.fr'}) if not enc_subtitles: return None # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js dec_subtitles = intlist_to_bytes(aes_cbc_decrypt( bytes_to_intlist(compat_b64decode(enc_subtitles[24:])), - bytes_to_intlist(binascii.unhexlify(self._K + '4b8ef13ec1872730')), + bytes_to_intlist(binascii.unhexlify(self._K + 'ab9f52f5baae7c72')), bytes_to_intlist(compat_b64decode(enc_subtitles[:24])) )) subtitles_json = self._parse_json( @@ -117,61 +133,100 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' }]) return subtitles + def _real_initialize(self): + username, password = self._get_login_info() + if not username: + return + try: + access_token = (self._download_json( + self._API_BASE_URL + 'authentication/login', None, + 'Logging in', self._LOGIN_ERR_MESSAGE, fatal=False, + data=urlencode_postdata({ + 'password': password, + 'rememberMe': False, + 'source': 'Web', + 'username': username, + })) or {}).get('accessToken') + if access_token: + self._HEADERS = {'authorization': 'Bearer ' + access_token} + except ExtractorError as e: + message = None + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + resp = self._parse_json( + e.cause.read().decode(), None, fatal=False) or {} + message = resp.get('message') or resp.get('code') + self.report_warning(message or self._LOGIN_ERR_MESSAGE) + def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - player_config = self._parse_json(self._search_regex( - r'playerConfig\s*=\s*({.+});', webpage, - 'player config', default='{}'), video_id, fatal=False) - if not player_config: - config_url = urljoin(self._BASE_URL, self._search_regex( - r'(?:id="player"|class="[^"]*adn-player-container[^"]*")[^>]+data-url="([^"]+)"', - webpage, 'config url')) - player_config = self._download_json( - config_url, video_id, - 'Downloading player config JSON metadata')['player'] + video_base_url = self._PLAYER_BASE_URL + 'video/%s/' % video_id + player = self._download_json( + video_base_url + 'configuration', video_id, + 'Downloading player config JSON metadata', + headers=self._HEADERS)['player'] + options = player['options'] - video_info = {} - video_info_str = self._search_regex( - r'videoInfo\s*=\s*({.+});', webpage, - 'video info', fatal=False) - if video_info_str: - video_info = self._parse_json( - video_info_str, video_id, fatal=False) or {} + user = options['user'] + if not user.get('hasAccess'): + self.raise_login_required() - options = player_config.get('options') or {} - metas = options.get('metas') or {} - links = player_config.get('links') or {} - sub_path = player_config.get('subtitles') - error = None - if not links: - links_url = player_config.get('linksurl') or options['videoUrl'] - token = options['token'] - self._K = ''.join([random.choice('0123456789abcdef') for _ in range(16)]) - message = bytes_to_intlist(json.dumps({ - 'k': self._K, - 'e': 60, - 't': token, - })) + token = self._download_json( + user.get('refreshTokenUrl') or (self._PLAYER_BASE_URL + 'refresh/token'), + video_id, 'Downloading access token', headers={ + 'x-player-refresh-token': user['refreshToken'] + }, data=b'')['token'] + + links_url = try_get(options, lambda x: x['video']['url']) or (video_base_url + 'link') + self._K = ''.join([random.choice('0123456789abcdef') for _ in range(16)]) + message = bytes_to_intlist(json.dumps({ + 'k': self._K, + 't': token, + })) + + # Sometimes authentication fails for no good reason, retry with + # a different random padding + links_data = None + for _ in range(3): padded_message = intlist_to_bytes(pkcs1pad(message, 128)) n, e = self._RSA_KEY encrypted_message = long_to_bytes(pow(bytes_to_long(padded_message), e, n)) authorization = base64.b64encode(encrypted_message).decode() - links_data = self._download_json( - urljoin(self._BASE_URL, links_url), video_id, - 'Downloading links JSON metadata', headers={ - 'Authorization': 'Bearer ' + authorization, - }) - links = links_data.get('links') or {} - metas = metas or links_data.get('meta') or {} - sub_path = sub_path or links_data.get('subtitles') or \ - 'index.php?option=com_vodapi&task=subtitles.getJSON&format=json&id=' + video_id - sub_path += '&token=' + token - error = links_data.get('error') - title = metas.get('title') or video_info['title'] + + try: + links_data = self._download_json( + links_url, video_id, 'Downloading links JSON metadata', headers={ + 'X-Player-Token': authorization + }, query={ + 'freeWithAds': 'true', + 'adaptive': 'false', + 'withMetadata': 'true', + 'source': 'Web' + }) + break + except ExtractorError as e: + if not isinstance(e.cause, compat_HTTPError): + raise e + + if e.cause.code == 401: + # This usually goes away with a different random pkcs1pad, so retry + continue + + error = self._parse_json(e.cause.read(), video_id) + message = error.get('message') + if e.cause.code == 403 and error.get('code') == 'player-bad-geolocation-country': + self.raise_geo_restricted(msg=message) + raise ExtractorError(message) + else: + raise ExtractorError('Giving up retrying') + + links = links_data.get('links') or {} + metas = links_data.get('metadata') or {} + sub_url = (links.get('subtitles') or {}).get('all') + video_info = links_data.get('video') or {} + title = metas['title'] formats = [] - for format_id, qualities in links.items(): + for format_id, qualities in (links.get('streaming') or {}).items(): if not isinstance(qualities, dict): continue for quality, load_balancer_url in qualities.items(): @@ -189,19 +244,26 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' for f in m3u8_formats: f['language'] = 'fr' formats.extend(m3u8_formats) - if not error: - error = options.get('error') - if not formats and error: - raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) self._sort_formats(formats) + video = (self._download_json( + self._API_BASE_URL + 'video/%s' % video_id, video_id, + 'Downloading additional video metadata', fatal=False) or {}).get('video') or {} + show = video.get('show') or {} + return { 'id': video_id, 'title': title, - 'description': strip_or_none(metas.get('summary') or video_info.get('resume')), - 'thumbnail': video_info.get('image'), + 'description': strip_or_none(metas.get('summary') or video.get('summary')), + 'thumbnail': video_info.get('image') or player.get('image'), 'formats': formats, - 'subtitles': self.extract_subtitles(sub_path, video_id), - 'episode': metas.get('subtitle') or video_info.get('videoTitle'), - 'series': video_info.get('playlistTitle'), + 'subtitles': self.extract_subtitles(sub_url, video_id), + 'episode': metas.get('subtitle') or video.get('name'), + 'episode_number': int_or_none(video.get('shortNumber')), + 'series': show.get('title'), + 'season_number': int_or_none(video.get('season')), + 'duration': int_or_none(video_info.get('duration') or video.get('duration')), + 'release_date': unified_strdate(video.get('releaseDate')), + 'average_rating': float_or_none(video.get('rating') or metas.get('rating')), + 'comment_count': int_or_none(video.get('commentsCount')), } diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 8e4963131..e55c03fd7 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -252,11 +252,11 @@ class AENetworksShowIE(AENetworksListBaseIE): _TESTS = [{ 'url': 'http://www.history.com/shows/ancient-aliens', 'info_dict': { - 'id': 'SH012427480000', + 'id': 'SERIES1574', 'title': 'Ancient Aliens', 'description': 'md5:3f6d74daf2672ff3ae29ed732e37ea7f', }, - 'playlist_mincount': 168, + 'playlist_mincount': 150, }] _RESOURCE = 'series' _ITEMS_KEY = 'episodes' diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py index c68be3134..c4f915a3c 100644 --- a/youtube_dl/extractor/aljazeera.py +++ b/youtube_dl/extractor/aljazeera.py @@ -1,13 +1,16 @@ from __future__ import unicode_literals +import json +import re + from .common import InfoExtractor class AlJazeeraIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?:programmes|video)/.*?/(?P<id>[^/]+)\.html' + _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?P<type>program/[^/]+|(?:feature|video)s)/\d{4}/\d{1,2}/\d{1,2}/(?P<id>[^/?&#]+)' _TESTS = [{ - 'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html', + 'url': 'https://www.aljazeera.com/program/episode/2014/9/19/deliverance', 'info_dict': { 'id': '3792260579001', 'ext': 'mp4', @@ -20,14 +23,34 @@ class AlJazeeraIE(InfoExtractor): 'add_ie': ['BrightcoveNew'], 'skip': 'Not accessible from Travis CI server', }, { - 'url': 'http://www.aljazeera.com/video/news/2017/05/sierra-leone-709-carat-diamond-auctioned-170511100111930.html', + 'url': 'https://www.aljazeera.com/videos/2017/5/11/sierra-leone-709-carat-diamond-to-be-auctioned-off', + 'only_matching': True, + }, { + 'url': 'https://www.aljazeera.com/features/2017/8/21/transforming-pakistans-buses-into-art', 'only_matching': True, }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/665003303001/default_default/index.html?videoId=%s' + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' def _real_extract(self, url): - program_name = self._match_id(url) - webpage = self._download_webpage(url, program_name) - brightcove_id = self._search_regex( - r'RenderPagesVideo\(\'(.+?)\'', webpage, 'brightcove id') - return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) + post_type, name = re.match(self._VALID_URL, url).groups() + post_type = { + 'features': 'post', + 'program': 'episode', + 'videos': 'video', + }[post_type.split('/')[0]] + video = self._download_json( + 'https://www.aljazeera.com/graphql', name, query={ + 'operationName': 'SingleArticleQuery', + 'variables': json.dumps({ + 'name': name, + 'postType': post_type, + }), + }, headers={ + 'wp-site': 'aje', + })['data']['article']['video'] + video_id = video['id'] + account_id = video.get('accountId') or '665003303001' + player_id = video.get('playerId') or 'BkeSH5BDb' + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id), + 'BrightcoveNew', video_id) diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py index e20f00fc3..be960c0f9 100644 --- a/youtube_dl/extractor/americastestkitchen.py +++ b/youtube_dl/extractor/americastestkitchen.py @@ -1,13 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re from .common import InfoExtractor from ..utils import ( clean_html, + int_or_none, try_get, unified_strdate, + unified_timestamp, ) @@ -22,8 +25,8 @@ class AmericasTestKitchenIE(InfoExtractor): 'ext': 'mp4', 'description': 'md5:64e606bfee910627efc4b5f050de92b3', 'thumbnail': r're:^https?://', - 'timestamp': 1523664000, - 'upload_date': '20180414', + 'timestamp': 1523318400, + 'upload_date': '20180410', 'release_date': '20180410', 'series': "America's Test Kitchen", 'season_number': 18, @@ -33,6 +36,27 @@ class AmericasTestKitchenIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # Metadata parsing behaves differently for newer episodes (705) as opposed to older episodes (582 above) + 'url': 'https://www.americastestkitchen.com/episode/705-simple-chicken-dinner', + 'md5': '06451608c57651e985a498e69cec17e5', + 'info_dict': { + 'id': '5fbe8c61bda2010001c6763b', + 'title': 'Simple Chicken Dinner', + 'ext': 'mp4', + 'description': 'md5:eb68737cc2fd4c26ca7db30139d109e7', + 'thumbnail': r're:^https?://', + 'timestamp': 1610755200, + 'upload_date': '20210116', + 'release_date': '20210116', + 'series': "America's Test Kitchen", + 'season_number': 21, + 'episode': 'Simple Chicken Dinner', + 'episode_number': 3, + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon', 'only_matching': True, @@ -60,7 +84,76 @@ class AmericasTestKitchenIE(InfoExtractor): 'url': 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % video['zypeId'], 'ie_key': 'Zype', 'description': clean_html(video.get('description')), + 'timestamp': unified_timestamp(video.get('publishDate')), 'release_date': unified_strdate(video.get('publishDate')), + 'episode_number': int_or_none(episode.get('number')), + 'season_number': int_or_none(episode.get('season')), 'series': try_get(episode, lambda x: x['show']['title']), 'episode': episode.get('title'), } + + +class AmericasTestKitchenSeasonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?P<show>americastestkitchen|cookscountry)\.com/episodes/browse/season_(?P<id>\d+)' + _TESTS = [{ + # ATK Season + 'url': 'https://www.americastestkitchen.com/episodes/browse/season_1', + 'info_dict': { + 'id': 'season_1', + 'title': 'Season 1', + }, + 'playlist_count': 13, + }, { + # Cooks Country Season + 'url': 'https://www.cookscountry.com/episodes/browse/season_12', + 'info_dict': { + 'id': 'season_12', + 'title': 'Season 12', + }, + 'playlist_count': 13, + }] + + def _real_extract(self, url): + show_name, season_number = re.match(self._VALID_URL, url).groups() + season_number = int(season_number) + + slug = 'atk' if show_name == 'americastestkitchen' else 'cco' + + season = 'Season %d' % season_number + + season_search = self._download_json( + 'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_%s_season_desc_production' % slug, + season, headers={ + 'Origin': 'https://www.%s.com' % show_name, + 'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805', + 'X-Algolia-Application-Id': 'Y1FNZXUI30', + }, query={ + 'facetFilters': json.dumps([ + 'search_season_list:' + season, + 'search_document_klass:episode', + 'search_show_slug:' + slug, + ]), + 'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title' % slug, + 'attributesToHighlight': '', + 'hitsPerPage': 1000, + }) + + def entries(): + for episode in (season_search.get('hits') or []): + search_url = episode.get('search_url') + if not search_url: + continue + yield { + '_type': 'url', + 'url': 'https://www.%s.com%s' % (show_name, search_url), + 'id': try_get(episode, lambda e: e['objectID'].split('_')[-1]), + 'title': episode.get('title'), + 'description': episode.get('description'), + 'timestamp': unified_timestamp(episode.get('search_document_date')), + 'season_number': season_number, + 'episode_number': int_or_none(episode.get('search_%s_episode_number' % slug)), + 'ie_key': AmericasTestKitchenIE.ie_key(), + } + + return self.playlist_result( + entries(), 'season_%d' % season_number, season) diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py index 7ff098cfa..24c684cad 100644 --- a/youtube_dl/extractor/amp.py +++ b/youtube_dl/extractor/amp.py @@ -8,6 +8,7 @@ from ..utils import ( int_or_none, mimetype2ext, parse_iso8601, + unified_timestamp, url_or_none, ) @@ -88,7 +89,7 @@ class AMPIE(InfoExtractor): self._sort_formats(formats) - timestamp = parse_iso8601(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date')) + timestamp = unified_timestamp(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date')) return { 'id': video_id, diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 00ce684d1..54e097d2f 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -116,8 +116,6 @@ class AnimeOnDemandIE(InfoExtractor): r'(?s)<div[^>]+itemprop="description"[^>]*>(.+?)</div>', webpage, 'anime description', default=None) - entries = [] - def extract_info(html, video_id, num=None): title, description = [None] * 2 formats = [] @@ -233,7 +231,7 @@ class AnimeOnDemandIE(InfoExtractor): self._sort_formats(info['formats']) f = common_info.copy() f.update(info) - entries.append(f) + yield f # Extract teaser/trailer only when full episode is not available if not info['formats']: @@ -247,7 +245,7 @@ class AnimeOnDemandIE(InfoExtractor): 'title': m.group('title'), 'url': urljoin(url, m.group('href')), }) - entries.append(f) + yield f def extract_episodes(html): for num, episode_html in enumerate(re.findall( @@ -275,7 +273,8 @@ class AnimeOnDemandIE(InfoExtractor): 'episode_number': episode_number, } - extract_entries(episode_html, video_id, common_info) + for e in extract_entries(episode_html, video_id, common_info): + yield e def extract_film(html, video_id): common_info = { @@ -283,11 +282,18 @@ class AnimeOnDemandIE(InfoExtractor): 'title': anime_title, 'description': anime_description, } - extract_entries(html, video_id, common_info) + for e in extract_entries(html, video_id, common_info): + yield e - extract_episodes(webpage) + def entries(): + has_episodes = False + for e in extract_episodes(webpage): + has_episodes = True + yield e - if not entries: - extract_film(webpage, anime_id) + if not has_episodes: + for e in extract_film(webpage, anime_id): + yield e - return self.playlist_result(entries, anime_id, anime_title, anime_description) + return self.playlist_result( + entries(), anime_id, anime_title, anime_description) diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py index e87994a6a..f6ecb8438 100644 --- a/youtube_dl/extractor/aol.py +++ b/youtube_dl/extractor/aol.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor +from .yahoo import YahooIE from ..compat import ( compat_parse_qs, compat_urllib_parse_urlparse, @@ -15,9 +15,9 @@ from ..utils import ( ) -class AolIE(InfoExtractor): +class AolIE(YahooIE): IE_NAME = 'aol.com' - _VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P<id>[0-9a-f]+)' + _VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P<id>\d{9}|[0-9a-f]{24}|[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12})' _TESTS = [{ # video with 5min ID @@ -76,10 +76,16 @@ class AolIE(InfoExtractor): }, { 'url': 'https://www.aol.jp/video/playlist/5a28e936a1334d000137da0c/5a28f3151e642219fde19831/', 'only_matching': True, + }, { + # Yahoo video + 'url': 'https://www.aol.com/video/play/991e6700-ac02-11ea-99ff-357400036f61/24bbc846-3e30-3c46-915e-fe8ccd7fcc46/', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) + if '-' in video_id: + return self._extract_yahoo_video(video_id, 'us') response = self._download_json( 'https://feedapi.b2c.on.aol.com/v1.0/app/videos/aolon/%s/details' % video_id, diff --git a/youtube_dl/extractor/apa.py b/youtube_dl/extractor/apa.py index 98ccdaa4a..cbc1c0ecb 100644 --- a/youtube_dl/extractor/apa.py +++ b/youtube_dl/extractor/apa.py @@ -6,25 +6,21 @@ import re from .common import InfoExtractor from ..utils import ( determine_ext, - js_to_json, + int_or_none, url_or_none, ) class APAIE(InfoExtractor): - _VALID_URL = r'https?://[^/]+\.apa\.at/embed/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _VALID_URL = r'(?P<base_url>https?://[^/]+\.apa\.at)/embed/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' _TESTS = [{ 'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029', 'md5': '2b12292faeb0a7d930c778c7a5b4759b', 'info_dict': { - 'id': 'jjv85FdZ', + 'id': '293f6d17-692a-44e3-9fd5-7b178f3a1029', 'ext': 'mp4', - 'title': '"Blau ist mysteriös": Die Blue Man Group im Interview', - 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'title': '293f6d17-692a-44e3-9fd5-7b178f3a1029', 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 254, - 'timestamp': 1519211149, - 'upload_date': '20180221', }, }, { 'url': 'https://uvp-apapublisher.sf.apa.at/embed/2f94e9e6-d945-4db2-9548-f9a41ebf7b78', @@ -46,9 +42,11 @@ class APAIE(InfoExtractor): webpage)] def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id, base_url = mobj.group('id', 'base_url') - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage( + '%s/player/%s' % (base_url, video_id), video_id) jwplatform_id = self._search_regex( r'media[iI]d\s*:\s*["\'](?P<id>[a-zA-Z0-9]{8})', webpage, @@ -59,16 +57,18 @@ class APAIE(InfoExtractor): 'jwplatform:' + jwplatform_id, ie='JWPlatform', video_id=video_id) - sources = self._parse_json( - self._search_regex( - r'sources\s*=\s*(\[.+?\])\s*;', webpage, 'sources'), - video_id, transform_source=js_to_json) + def extract(field, name=None): + return self._search_regex( + r'\b%s["\']\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % field, + webpage, name or field, default=None, group='value') + + title = extract('title') or video_id + description = extract('description') + thumbnail = extract('poster', 'thumbnail') formats = [] - for source in sources: - if not isinstance(source, dict): - continue - source_url = url_or_none(source.get('file')) + for format_id in ('hls', 'progressive'): + source_url = url_or_none(extract(format_id)) if not source_url: continue ext = determine_ext(source_url) @@ -77,18 +77,19 @@ class APAIE(InfoExtractor): source_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) else: + height = int_or_none(self._search_regex( + r'(\d+)\.mp4', source_url, 'height', default=None)) formats.append({ 'url': source_url, + 'format_id': format_id, + 'height': height, }) self._sort_formats(formats) - thumbnail = self._search_regex( - r'image\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, - 'thumbnail', fatal=False, group='url') - return { 'id': video_id, - 'title': video_id, + 'title': title, + 'description': description, 'thumbnail': thumbnail, 'formats': formats, } diff --git a/youtube_dl/extractor/appleconnect.py b/youtube_dl/extractor/appleconnect.py index a84b8b1eb..494f8330c 100644 --- a/youtube_dl/extractor/appleconnect.py +++ b/youtube_dl/extractor/appleconnect.py @@ -9,10 +9,10 @@ from ..utils import ( class AppleConnectIE(InfoExtractor): - _VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/idsa\.(?P<id>[\w-]+)' - _TEST = { + _VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/(?:id)?sa\.(?P<id>[\w-]+)' + _TESTS = [{ 'url': 'https://itunes.apple.com/us/post/idsa.4ab17a39-2720-11e5-96c5-a5b38f6c42d3', - 'md5': 'e7c38568a01ea45402570e6029206723', + 'md5': 'c1d41f72c8bcaf222e089434619316e4', 'info_dict': { 'id': '4ab17a39-2720-11e5-96c5-a5b38f6c42d3', 'ext': 'm4v', @@ -22,7 +22,10 @@ class AppleConnectIE(InfoExtractor): 'upload_date': '20150710', 'timestamp': 1436545535, }, - } + }, { + 'url': 'https://itunes.apple.com/us/post/sa.0fe0229f-2457-11e5-9f40-1bb645f2d5d9', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -36,7 +39,7 @@ class AppleConnectIE(InfoExtractor): video_data = self._parse_json(video_json, video_id) timestamp = str_to_int(self._html_search_regex(r'data-timestamp="(\d+)"', webpage, 'timestamp')) - like_count = str_to_int(self._html_search_regex(r'(\d+) Loves', webpage, 'like count')) + like_count = str_to_int(self._html_search_regex(r'(\d+) Loves', webpage, 'like count', default=None)) return { 'id': video_id, diff --git a/youtube_dl/extractor/applepodcasts.py b/youtube_dl/extractor/applepodcasts.py index 95758fece..6a74de758 100644 --- a/youtube_dl/extractor/applepodcasts.py +++ b/youtube_dl/extractor/applepodcasts.py @@ -42,6 +42,7 @@ class ApplePodcastsIE(InfoExtractor): ember_data = self._parse_json(self._search_regex( r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', webpage, 'ember data'), episode_id) + ember_data = ember_data.get(episode_id) or ember_data episode = ember_data['data']['attributes'] description = episode.get('description') or {} diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index c79c58e82..e42ed5e79 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -2,15 +2,17 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - unified_strdate, clean_html, + extract_attributes, + unified_strdate, + unified_timestamp, ) class ArchiveOrgIE(InfoExtractor): IE_NAME = 'archive.org' IE_DESC = 'archive.org videos' - _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^/?#]+)(?:[?].*)?$' + _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect', 'md5': '8af1d4cf447933ed3c7f4871162602db', @@ -19,8 +21,11 @@ class ArchiveOrgIE(InfoExtractor): 'ext': 'ogg', 'title': '1968 Demo - FJCC Conference Presentation Reel #1', 'description': 'md5:da45c349df039f1cc8075268eb1b5c25', - 'upload_date': '19681210', - 'uploader': 'SRI International' + 'creator': 'SRI International', + 'release_date': '19681210', + 'uploader': 'SRI International', + 'timestamp': 1268695290, + 'upload_date': '20100315', } }, { 'url': 'https://archive.org/details/Cops1922', @@ -29,22 +34,43 @@ class ArchiveOrgIE(InfoExtractor): 'id': 'Cops1922', 'ext': 'mp4', 'title': 'Buster Keaton\'s "Cops" (1922)', - 'description': 'md5:89e7c77bf5d965dd5c0372cfb49470f6', + 'description': 'md5:43a603fd6c5b4b90d12a96b921212b9c', + 'timestamp': 1387699629, + 'upload_date': '20131222', } }, { 'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect', 'only_matching': True, + }, { + 'url': 'https://archive.org/details/MSNBCW_20131125_040000_To_Catch_a_Predator/', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( 'http://archive.org/embed/' + video_id, video_id) - jwplayer_playlist = self._parse_json(self._search_regex( - r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\)", - webpage, 'jwplayer playlist'), video_id) - info = self._parse_jwplayer_data( - {'playlist': jwplayer_playlist}, video_id, base_url=url) + + playlist = None + play8 = self._search_regex( + r'(<[^>]+\bclass=["\']js-play8-playlist[^>]+>)', webpage, + 'playlist', default=None) + if play8: + attrs = extract_attributes(play8) + playlist = attrs.get('value') + if not playlist: + # Old jwplayer fallback + playlist = self._search_regex( + r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\)", + webpage, 'jwplayer playlist', default='[]') + jwplayer_playlist = self._parse_json(playlist, video_id, fatal=False) + if jwplayer_playlist: + info = self._parse_jwplayer_data( + {'playlist': jwplayer_playlist}, video_id, base_url=url) + else: + # HTML5 media fallback + info = self._parse_html5_media_entries(url, webpage, video_id)[0] + info['id'] = video_id def get_optional(metadata, field): return metadata.get(field, [None])[0] @@ -58,8 +84,12 @@ class ArchiveOrgIE(InfoExtractor): 'description': clean_html(get_optional(metadata, 'description')), }) if info.get('_type') != 'playlist': + creator = get_optional(metadata, 'creator') info.update({ - 'uploader': get_optional(metadata, 'creator'), - 'upload_date': unified_strdate(get_optional(metadata, 'date')), + 'creator': creator, + 'release_date': unified_strdate(get_optional(metadata, 'date')), + 'uploader': get_optional(metadata, 'publisher') or creator, + 'timestamp': unified_timestamp(get_optional(metadata, 'publicdate')), + 'language': get_optional(metadata, 'language'), }) return info diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 5b7b2dd6d..d45a9fe52 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -187,13 +187,13 @@ class ARDMediathekIE(ARDMediathekBaseIE): if doc.tag == 'rss': return GenericIE()._extract_rss(url, video_id, doc) - title = self._html_search_regex( + title = self._og_search_title(webpage, default=None) or self._html_search_regex( [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>', r'<meta name="dcterms\.title" content="(.*?)"/>', r'<h4 class="headline">(.*?)</h4>', r'<title[^>]*>(.*?)'], webpage, 'title') - description = self._html_search_meta( + description = self._og_search_description(webpage, default=None) or self._html_search_meta( 'dcterms.abstract', webpage, 'description', default=None) if description is None: description = self._html_search_meta( @@ -249,31 +249,40 @@ class ARDMediathekIE(ARDMediathekBaseIE): class ARDIE(InfoExtractor): - _VALID_URL = r'(?Phttps?://(www\.)?daserste\.de/[^?#]+/videos(?:extern)?/(?P[^/?#]+)-(?P[0-9]+))\.html' + _VALID_URL = r'(?Phttps?://(?:www\.)?daserste\.de/(?:[^/?#&]+/)+(?P[^/?#&]+))\.html' _TESTS = [{ - # available till 14.02.2019 - 'url': 'http://www.daserste.de/information/talk/maischberger/videos/das-groko-drama-zerlegen-sich-die-volksparteien-video-102.html', - 'md5': '8e4ec85f31be7c7fc08a26cdbc5a1f49', + # available till 7.01.2022 + 'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-die-woche-video100.html', + 'md5': '867d8aa39eeaf6d76407c5ad1bb0d4c1', 'info_dict': { - 'display_id': 'das-groko-drama-zerlegen-sich-die-volksparteien-video', - 'id': '102', + 'id': 'maischberger-die-woche-video100', + 'display_id': 'maischberger-die-woche-video100', 'ext': 'mp4', - 'duration': 4435.0, - 'title': 'Das GroKo-Drama: Zerlegen sich die Volksparteien?', - 'upload_date': '20180214', + 'duration': 3687.0, + 'title': 'maischberger. die woche vom 7. Januar 2021', + 'upload_date': '20210107', 'thumbnail': r're:^https?://.*\.jpg$', }, }, { - 'url': 'https://www.daserste.de/information/reportage-dokumentation/erlebnis-erde/videosextern/woelfe-und-herdenschutzhunde-ungleiche-brueder-102.html', + 'url': 'https://www.daserste.de/information/politik-weltgeschehen/morgenmagazin/videosextern/dominik-kahun-aus-der-nhl-direkt-zur-weltmeisterschaft-100.html', + 'only_matching': True, + }, { + 'url': 'https://www.daserste.de/information/nachrichten-wetter/tagesthemen/videosextern/tagesthemen-17736.html', 'only_matching': True, }, { 'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html', 'only_matching': True, + }, { + 'url': 'https://www.daserste.de/unterhaltung/serie/in-aller-freundschaft-die-jungen-aerzte/Drehpause-100.html', + 'only_matching': True, + }, { + 'url': 'https://www.daserste.de/unterhaltung/film/filmmittwoch-im-ersten/videos/making-ofwendezeit-video-100.html', + 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') + display_id = mobj.group('id') player_url = mobj.group('mainurl') + '~playerXml.xml' doc = self._download_xml(player_url, display_id) @@ -284,25 +293,47 @@ class ARDIE(InfoExtractor): formats = [] for a in video_node.findall('.//asset'): + file_name = xpath_text(a, './fileName', default=None) + if not file_name: + continue + format_type = a.attrib.get('type') + format_url = url_or_none(file_name) + if format_url: + ext = determine_ext(file_name) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_type or 'hls', fatal=False)) + continue + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + update_url_query(format_url, {'hdcore': '3.7.0'}), + display_id, f4m_id=format_type or 'hds', fatal=False)) + continue f = { - 'format_id': a.attrib['type'], - 'width': int_or_none(a.find('./frameWidth').text), - 'height': int_or_none(a.find('./frameHeight').text), - 'vbr': int_or_none(a.find('./bitrateVideo').text), - 'abr': int_or_none(a.find('./bitrateAudio').text), - 'vcodec': a.find('./codecVideo').text, - 'tbr': int_or_none(a.find('./totalBitrate').text), + 'format_id': format_type, + 'width': int_or_none(xpath_text(a, './frameWidth')), + 'height': int_or_none(xpath_text(a, './frameHeight')), + 'vbr': int_or_none(xpath_text(a, './bitrateVideo')), + 'abr': int_or_none(xpath_text(a, './bitrateAudio')), + 'vcodec': xpath_text(a, './codecVideo'), + 'tbr': int_or_none(xpath_text(a, './totalBitrate')), } - if a.find('./serverPrefix').text: - f['url'] = a.find('./serverPrefix').text - f['playpath'] = a.find('./fileName').text + server_prefix = xpath_text(a, './serverPrefix', default=None) + if server_prefix: + f.update({ + 'url': server_prefix, + 'playpath': file_name, + }) else: - f['url'] = a.find('./fileName').text + if not format_url: + continue + f['url'] = format_url formats.append(f) self._sort_formats(formats) return { - 'id': mobj.group('id'), + 'id': xpath_text(video_node, './videoId', default=display_id), 'formats': formats, 'display_id': display_id, 'title': video_node.find('./title').text, @@ -313,19 +344,19 @@ class ARDIE(InfoExtractor): class ARDBetaMediathekIE(ARDMediathekBaseIE): - _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P[^/]+)/(?:player|live|video)/(?P(?:[^/]+/)*)(?P[a-zA-Z0-9]+)' + _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?:[^/]+/)?(?:player|live|video)/(?:[^/]+/)*(?PY3JpZDovL[a-zA-Z0-9]+)' _TESTS = [{ - 'url': 'https://ardmediathek.de/ard/video/die-robuste-roswita/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', - 'md5': 'dfdc87d2e7e09d073d5a80770a9ce88f', + 'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/', + 'md5': 'a1dc75a39c61601b980648f7c9f9f71d', 'info_dict': { 'display_id': 'die-robuste-roswita', - 'id': '70153354', + 'id': '78566716', 'title': 'Die robuste Roswita', - 'description': r're:^Der Mord.*trüber ist als die Ilm.', + 'description': r're:^Der Mord.*totgeglaubte Ehefrau Roswita', 'duration': 5316, - 'thumbnail': 'https://img.ardmediathek.de/standard/00/70/15/33/90/-1852531467/16x9/960?mandant=ard', - 'timestamp': 1577047500, - 'upload_date': '20191222', + 'thumbnail': 'https://img.ardmediathek.de/standard/00/78/56/67/84/575672121/16x9/960?mandant=ard', + 'timestamp': 1596658200, + 'upload_date': '20200805', 'ext': 'mp4', }, }, { @@ -343,22 +374,22 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): }, { 'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg', 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/', + 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('video_id') - display_id = mobj.group('display_id') - if display_id: - display_id = display_id.rstrip('/') - if not display_id: - display_id = video_id + video_id = self._match_id(url) player_page = self._download_json( 'https://api.ardmediathek.de/public-gateway', - display_id, data=json.dumps({ + video_id, data=json.dumps({ 'query': '''{ - playerPage(client:"%s", clipId: "%s") { + playerPage(client: "ard", clipId: "%s") { blockedByFsk broadcastedOn maturityContentRating @@ -388,7 +419,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): } } } -}''' % (mobj.group('client'), video_id), +}''' % video_id, }).encode(), headers={ 'Content-Type': 'application/json' })['data']['playerPage'] @@ -413,7 +444,6 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None)) info.update({ 'age_limit': age_limit, - 'display_id': display_id, 'title': title, 'description': description, 'timestamp': unified_timestamp(player_page.get('broadcastedOn')), diff --git a/youtube_dl/extractor/arnes.py b/youtube_dl/extractor/arnes.py new file mode 100644 index 000000000..c0032fcab --- /dev/null +++ b/youtube_dl/extractor/arnes.py @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) +from ..utils import ( + float_or_none, + int_or_none, + parse_iso8601, + remove_start, +) + + +class ArnesIE(InfoExtractor): + IE_NAME = 'video.arnes.si' + IE_DESC = 'Arnes Video' + _VALID_URL = r'https?://video\.arnes\.si/(?:[a-z]{2}/)?(?:watch|embed|api/(?:asset|public/video))/(?P[0-9a-zA-Z]{12})' + _TESTS = [{ + 'url': 'https://video.arnes.si/watch/a1qrWTOQfVoU?t=10', + 'md5': '4d0f4d0a03571b33e1efac25fd4a065d', + 'info_dict': { + 'id': 'a1qrWTOQfVoU', + 'ext': 'mp4', + 'title': 'Linearna neodvisnost, definicija', + 'description': 'Linearna neodvisnost, definicija', + 'license': 'PRIVATE', + 'creator': 'Polona Oblak', + 'timestamp': 1585063725, + 'upload_date': '20200324', + 'channel': 'Polona Oblak', + 'channel_id': 'q6pc04hw24cj', + 'channel_url': 'https://video.arnes.si/?channel=q6pc04hw24cj', + 'duration': 596.75, + 'view_count': int, + 'tags': ['linearna_algebra'], + 'start_time': 10, + } + }, { + 'url': 'https://video.arnes.si/api/asset/s1YjnV7hadlC/play.mp4', + 'only_matching': True, + }, { + 'url': 'https://video.arnes.si/embed/s1YjnV7hadlC', + 'only_matching': True, + }, { + 'url': 'https://video.arnes.si/en/watch/s1YjnV7hadlC', + 'only_matching': True, + }, { + 'url': 'https://video.arnes.si/embed/s1YjnV7hadlC?t=123&hideRelated=1', + 'only_matching': True, + }, { + 'url': 'https://video.arnes.si/api/public/video/s1YjnV7hadlC', + 'only_matching': True, + }] + _BASE_URL = 'https://video.arnes.si' + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + self._BASE_URL + '/api/public/video/' + video_id, video_id)['data'] + title = video['title'] + + formats = [] + for media in (video.get('media') or []): + media_url = media.get('url') + if not media_url: + continue + formats.append({ + 'url': self._BASE_URL + media_url, + 'format_id': remove_start(media.get('format'), 'FORMAT_'), + 'format_note': media.get('formatTranslation'), + 'width': int_or_none(media.get('width')), + 'height': int_or_none(media.get('height')), + }) + self._sort_formats(formats) + + channel = video.get('channel') or {} + channel_id = channel.get('url') + thumbnail = video.get('thumbnailUrl') + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': self._BASE_URL + thumbnail, + 'description': video.get('description'), + 'license': video.get('license'), + 'creator': video.get('author'), + 'timestamp': parse_iso8601(video.get('creationTime')), + 'channel': channel.get('name'), + 'channel_id': channel_id, + 'channel_url': self._BASE_URL + '/?channel=' + channel_id if channel_id else None, + 'duration': float_or_none(video.get('duration'), 1000), + 'view_count': int_or_none(video.get('views')), + 'tags': video.get('hashtags'), + 'start_time': int_or_none(compat_parse_qs( + compat_urllib_parse_urlparse(url).query).get('t', [None])[0]), + } diff --git a/youtube_dl/extractor/awaan.py b/youtube_dl/extractor/awaan.py index a2603bbff..3a7700cd4 100644 --- a/youtube_dl/extractor/awaan.py +++ b/youtube_dl/extractor/awaan.py @@ -48,6 +48,7 @@ class AWAANBaseIE(InfoExtractor): 'duration': int_or_none(video_data.get('duration')), 'timestamp': parse_iso8601(video_data.get('create_time'), ' '), 'is_live': is_live, + 'uploader_id': video_data.get('user_id'), } @@ -107,6 +108,7 @@ class AWAANLiveIE(AWAANBaseIE): 'title': 're:Dubai Al Oula [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'upload_date': '20150107', 'timestamp': 1420588800, + 'uploader_id': '71', }, 'params': { # m3u8 download diff --git a/youtube_dl/extractor/azmedien.py b/youtube_dl/extractor/azmedien.py index b1e20def5..930266990 100644 --- a/youtube_dl/extractor/azmedien.py +++ b/youtube_dl/extractor/azmedien.py @@ -47,7 +47,7 @@ class AZMedienIE(InfoExtractor): 'url': 'https://www.telebaern.tv/telebaern-news/montag-1-oktober-2018-ganze-sendung-133531189#video=0_7xjo9lf1', 'only_matching': True }] - _API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/cb9f2f81ed22e9b47f4ca64ea3cc5a5d13e88d1d' + _API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/a4016f65fe62b81dc6664dd9f4910e4ab40383be' _PARTNER_ID = '1719221' def _real_extract(self, url): diff --git a/youtube_dl/extractor/bandaichannel.py b/youtube_dl/extractor/bandaichannel.py new file mode 100644 index 000000000..d67285913 --- /dev/null +++ b/youtube_dl/extractor/bandaichannel.py @@ -0,0 +1,37 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .brightcove import BrightcoveNewIE +from ..utils import extract_attributes + + +class BandaiChannelIE(BrightcoveNewIE): + IE_NAME = 'bandaichannel' + _VALID_URL = r'https?://(?:www\.)?b-ch\.com/titles/(?P\d+/\d+)' + _TESTS = [{ + 'url': 'https://www.b-ch.com/titles/514/001', + 'md5': 'a0f2d787baa5729bed71108257f613a4', + 'info_dict': { + 'id': '6128044564001', + 'ext': 'mp4', + 'title': 'メタルファイターMIKU 第1話', + 'timestamp': 1580354056, + 'uploader_id': '5797077852001', + 'upload_date': '20200130', + 'duration': 1387.733, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + attrs = extract_attributes(self._search_regex( + r'(]+\bid="bcplayer"[^>]*>)', webpage, 'player')) + bc = self._download_json( + 'https://pbifcd.b-ch.com/v1/playbackinfo/ST/70/' + attrs['data-info'], + video_id, headers={'X-API-KEY': attrs['data-auth'].strip()})['bc'] + return self._parse_brightcove_metadata(bc, bc['id']) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 69e673a26..006aab3b4 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -49,6 +49,7 @@ class BandcampIE(InfoExtractor): 'uploader': 'Ben Prunty', 'timestamp': 1396508491, 'upload_date': '20140403', + 'release_timestamp': 1396483200, 'release_date': '20140403', 'duration': 260.877, 'track': 'Lanius (Battle)', @@ -69,6 +70,7 @@ class BandcampIE(InfoExtractor): 'uploader': 'Mastodon', 'timestamp': 1322005399, 'upload_date': '20111122', + 'release_timestamp': 1076112000, 'release_date': '20040207', 'duration': 120.79, 'track': 'Hail to Fire', @@ -197,7 +199,7 @@ class BandcampIE(InfoExtractor): 'thumbnail': thumbnail, 'uploader': artist, 'timestamp': timestamp, - 'release_date': unified_strdate(tralbum.get('album_release_date')), + 'release_timestamp': unified_timestamp(tralbum.get('album_release_date')), 'duration': duration, 'track': track, 'track_number': track_number, diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index b4daee54e..247d982ce 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -1,31 +1,39 @@ # coding: utf-8 from __future__ import unicode_literals +import functools import itertools +import json import re from .common import InfoExtractor +from ..compat import ( + compat_etree_Element, + compat_HTTPError, + compat_parse_qs, + compat_str, + compat_urllib_parse_urlparse, + compat_urlparse, +) from ..utils import ( + ExtractorError, + OnDemandPagedList, clean_html, dict_get, - ExtractorError, float_or_none, get_element_by_class, int_or_none, js_to_json, parse_duration, parse_iso8601, + strip_or_none, try_get, unescapeHTML, + unified_timestamp, url_or_none, urlencode_postdata, urljoin, ) -from ..compat import ( - compat_etree_Element, - compat_HTTPError, - compat_urlparse, -) class BBCCoUkIE(InfoExtractor): @@ -756,8 +764,17 @@ class BBCIE(BBCCoUkIE): 'only_matching': True, }, { # custom redirection to www.bbc.com + # also, video with window.__INITIAL_DATA__ 'url': 'http://www.bbc.co.uk/news/science-environment-33661876', - 'only_matching': True, + 'info_dict': { + 'id': 'p02xzws1', + 'ext': 'mp4', + 'title': "Pluto may have 'nitrogen glaciers'", + 'description': 'md5:6a95b593f528d7a5f2605221bc56912f', + 'thumbnail': r're:https?://.+/.+\.jpg', + 'timestamp': 1437785037, + 'upload_date': '20150725', + }, }, { # single video article embedded with data-media-vpid 'url': 'http://www.bbc.co.uk/sport/rowing/35908187', @@ -793,11 +810,25 @@ class BBCIE(BBCCoUkIE): 'description': 'Learn English words and phrases from this story', }, 'add_ie': [BBCCoUkIE.ie_key()], + }, { + # BBC Reel + 'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness', + 'info_dict': { + 'id': 'p07c6sb9', + 'ext': 'mp4', + 'title': 'How positive thinking is harming your happiness', + 'alt_title': 'The downsides of positive thinking', + 'description': 'md5:fad74b31da60d83b8265954ee42d85b4', + 'duration': 235, + 'thumbnail': r're:https?://.+/p07c9dsr.jpg', + 'upload_date': '20190604', + 'categories': ['Psychology'], + }, }] @classmethod def suitable(cls, url): - EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE) + EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE, BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE) return (False if any(ie.suitable(url) for ie in EXCLUDE_IE) else super(BBCIE, cls).suitable(url)) @@ -929,7 +960,7 @@ class BBCIE(BBCCoUkIE): else: entry['title'] = info['title'] entry['formats'].extend(info['formats']) - except Exception as e: + except ExtractorError as e: # Some playlist URL may fail with 500, at the same time # the other one may work fine (e.g. # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu) @@ -980,6 +1011,37 @@ class BBCIE(BBCCoUkIE): 'subtitles': subtitles, } + # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness) + initial_data = self._parse_json(self._html_search_regex( + r']+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P(?:(?!\2).)+)', + webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False) + if initial_data: + init_data = try_get( + initial_data, lambda x: x['initData']['items'][0], dict) or {} + smp_data = init_data.get('smpData') or {} + clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {} + version_id = clip_data.get('versionID') + if version_id: + title = smp_data['title'] + formats, subtitles = self._download_media_selector(version_id) + self._sort_formats(formats) + image_url = smp_data.get('holdingImageURL') + display_date = init_data.get('displayDate') + topic_title = init_data.get('topicTitle') + + return { + 'id': version_id, + 'title': title, + 'formats': formats, + 'alt_title': init_data.get('shortTitle'), + 'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None, + 'description': smp_data.get('summary') or init_data.get('shortSummary'), + 'upload_date': display_date.replace('-', '') if display_date else None, + 'subtitles': subtitles, + 'duration': int_or_none(clip_data.get('duration')), + 'categories': [topic_title] if topic_title else None, + } + # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975) # There are several setPayload calls may be present but the video # seems to be always related to the first one @@ -1041,7 +1103,7 @@ class BBCIE(BBCCoUkIE): thumbnail = None image_url = current_programme.get('image_url') if image_url: - thumbnail = image_url.replace('{recipe}', '1920x1920') + thumbnail = image_url.replace('{recipe}', 'raw') return { 'id': programme_id, 'title': title, @@ -1114,12 +1176,29 @@ class BBCIE(BBCCoUkIE): continue formats, subtitles = self._download_media_selector(item_id) self._sort_formats(formats) + item_desc = None + blocks = try_get(media, lambda x: x['summary']['blocks'], list) + if blocks: + summary = [] + for block in blocks: + text = try_get(block, lambda x: x['model']['text'], compat_str) + if text: + summary.append(text) + if summary: + item_desc = '\n\n'.join(summary) + item_time = None + for meta in try_get(media, lambda x: x['metadata']['items'], list) or []: + if try_get(meta, lambda x: x['label']) == 'Published': + item_time = unified_timestamp(meta.get('timestamp')) + break entries.append({ 'id': item_id, 'title': item_title, 'thumbnail': item.get('holdingImageUrl'), 'formats': formats, 'subtitles': subtitles, + 'timestamp': item_time, + 'description': strip_or_none(item_desc), }) for resp in (initial_data.get('data') or {}).values(): name = resp.get('name') @@ -1293,21 +1372,149 @@ class BBCCoUkPlaylistBaseIE(InfoExtractor): playlist_id, title, description) -class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): - IE_NAME = 'bbc.co.uk:iplayer:playlist' - _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P%s)' % BBCCoUkIE._ID_REGEX - _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s' - _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)' +class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor): + _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P%s)' % BBCCoUkIE._ID_REGEX + + @staticmethod + def _get_default(episode, key, default_key='default'): + return try_get(episode, lambda x: x[key][default_key]) + + def _get_description(self, data): + synopsis = data.get(self._DESCRIPTION_KEY) or {} + return dict_get(synopsis, ('large', 'medium', 'small')) + + def _fetch_page(self, programme_id, per_page, series_id, page): + elements = self._get_elements(self._call_api( + programme_id, per_page, page + 1, series_id)) + for element in elements: + episode = self._get_episode(element) + episode_id = episode.get('id') + if not episode_id: + continue + thumbnail = None + image = self._get_episode_image(episode) + if image: + thumbnail = image.replace('{recipe}', 'raw') + category = self._get_default(episode, 'labels', 'category') + yield { + '_type': 'url', + 'id': episode_id, + 'title': self._get_episode_field(episode, 'subtitle'), + 'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id, + 'thumbnail': thumbnail, + 'description': self._get_description(episode), + 'categories': [category] if category else None, + 'series': self._get_episode_field(episode, 'title'), + 'ie_key': BBCCoUkIE.ie_key(), + } + + def _real_extract(self, url): + pid = self._match_id(url) + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + series_id = qs.get('seriesId', [None])[0] + page = qs.get('page', [None])[0] + per_page = 36 if page else self._PAGE_SIZE + fetch_page = functools.partial(self._fetch_page, pid, per_page, series_id) + entries = fetch_page(int(page) - 1) if page else OnDemandPagedList(fetch_page, self._PAGE_SIZE) + playlist_data = self._get_playlist_data(self._call_api(pid, 1)) + return self.playlist_result( + entries, pid, self._get_playlist_title(playlist_data), + self._get_description(playlist_data)) + + +class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE): + IE_NAME = 'bbc.co.uk:iplayer:episodes' + _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes' _TESTS = [{ 'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v', 'info_dict': { 'id': 'b05rcz9v', 'title': 'The Disappearance', - 'description': 'French thriller serial about a missing teenager.', + 'description': 'md5:58eb101aee3116bad4da05f91179c0cb', }, - 'playlist_mincount': 6, - 'skip': 'This programme is not currently available on BBC iPlayer', + 'playlist_mincount': 8, }, { + # all seasons + 'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster', + 'info_dict': { + 'id': 'b094m5t9', + 'title': 'Doctor Foster', + 'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6', + }, + 'playlist_mincount': 10, + }, { + # explicit season + 'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv', + 'info_dict': { + 'id': 'b094m5t9', + 'title': 'Doctor Foster', + 'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6', + }, + 'playlist_mincount': 5, + }, { + # all pages + 'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove', + 'info_dict': { + 'id': 'm0004c4v', + 'title': 'Beechgrove', + 'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.', + }, + 'playlist_mincount': 37, + }, { + # explicit page + 'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2', + 'info_dict': { + 'id': 'm0004c4v', + 'title': 'Beechgrove', + 'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.', + }, + 'playlist_mincount': 1, + }] + _PAGE_SIZE = 100 + _DESCRIPTION_KEY = 'synopsis' + + def _get_episode_image(self, episode): + return self._get_default(episode, 'image') + + def _get_episode_field(self, episode, field): + return self._get_default(episode, field) + + @staticmethod + def _get_elements(data): + return data['entities']['results'] + + @staticmethod + def _get_episode(element): + return element.get('episode') or {} + + def _call_api(self, pid, per_page, page=1, series_id=None): + variables = { + 'id': pid, + 'page': page, + 'perPage': per_page, + } + if series_id: + variables['sliceId'] = series_id + return self._download_json( + 'https://graph.ibl.api.bbc.co.uk/', pid, headers={ + 'Content-Type': 'application/json' + }, data=json.dumps({ + 'id': '5692d93d5aac8d796a0305e895e61551', + 'variables': variables, + }).encode('utf-8'))['data']['programme'] + + @staticmethod + def _get_playlist_data(data): + return data + + def _get_playlist_title(self, data): + return self._get_default(data, 'title') + + +class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE): + IE_NAME = 'bbc.co.uk:iplayer:group' + _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group' + _TESTS = [{ # Available for over a year unlike 30 days for most other programmes 'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32', 'info_dict': { @@ -1316,14 +1523,56 @@ class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): 'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7', }, 'playlist_mincount': 10, + }, { + # all pages + 'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7', + 'info_dict': { + 'id': 'p081d7j7', + 'title': 'Music in Scotland', + 'description': 'Perfomances in Scotland and programmes featuring Scottish acts.', + }, + 'playlist_mincount': 47, + }, { + # explicit page + 'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2', + 'info_dict': { + 'id': 'p081d7j7', + 'title': 'Music in Scotland', + 'description': 'Perfomances in Scotland and programmes featuring Scottish acts.', + }, + 'playlist_mincount': 11, }] + _PAGE_SIZE = 200 + _DESCRIPTION_KEY = 'synopses' - def _extract_title_and_description(self, webpage): - title = self._search_regex(r'

([^<]+)

', webpage, 'title', fatal=False) - description = self._search_regex( - r']+class=(["\'])subtitle\1[^>]*>(?P[^<]+)

', - webpage, 'description', fatal=False, group='value') - return title, description + def _get_episode_image(self, episode): + return self._get_default(episode, 'images', 'standard') + + def _get_episode_field(self, episode, field): + return episode.get(field) + + @staticmethod + def _get_elements(data): + return data['elements'] + + @staticmethod + def _get_episode(element): + return element + + def _call_api(self, pid, per_page, page=1, series_id=None): + return self._download_json( + 'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid, + pid, query={ + 'page': page, + 'per_page': per_page, + })['group_episodes'] + + @staticmethod + def _get_playlist_data(data): + return data['group'] + + def _get_playlist_title(self, data): + return data.get('title') class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE): diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 4dc597e16..bff6ea194 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -156,6 +156,7 @@ class BiliBiliIE(InfoExtractor): cid = js['result']['cid'] headers = { + 'Accept': 'application/json', 'Referer': url } headers.update(self.geo_verification_headers()) @@ -232,7 +233,7 @@ class BiliBiliIE(InfoExtractor): webpage) if uploader_mobj: info.update({ - 'uploader': uploader_mobj.group('name'), + 'uploader': uploader_mobj.group('name').strip(), 'uploader_id': uploader_mobj.group('id'), }) if not info.get('uploader'): diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py index dc60224d0..d1bf8e829 100644 --- a/youtube_dl/extractor/bleacherreport.py +++ b/youtube_dl/extractor/bleacherreport.py @@ -90,13 +90,19 @@ class BleacherReportCMSIE(AMPIE): _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P[0-9a-f-]{36}|\d{5})' _TESTS = [{ 'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1&library=video-cms', - 'md5': '2e4b0a997f9228ffa31fada5c53d1ed1', + 'md5': '670b2d73f48549da032861130488c681', 'info_dict': { 'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Cena vs. Rollins Would Expose the Heavyweight Division', 'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e', + 'upload_date': '20150723', + 'timestamp': 1437679032, + }, + 'expected_warnings': [ + 'Unable to download f4m manifest' + ] }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/blinkx.py b/youtube_dl/extractor/blinkx.py deleted file mode 100644 index db5e12b21..000000000 --- a/youtube_dl/extractor/blinkx.py +++ /dev/null @@ -1,86 +0,0 @@ -from __future__ import unicode_literals - -import json - -from .common import InfoExtractor -from ..utils import ( - remove_start, - int_or_none, -) - - -class BlinkxIE(InfoExtractor): - _VALID_URL = r'(?:https?://(?:www\.)blinkx\.com/#?ce/|blinkx:)(?P[^?]+)' - IE_NAME = 'blinkx' - - _TEST = { - 'url': 'http://www.blinkx.com/ce/Da0Gw3xc5ucpNduzLuDDlv4WC9PuI4fDi1-t6Y3LyfdY2SZS5Urbvn-UPJvrvbo8LTKTc67Wu2rPKSQDJyZeeORCR8bYkhs8lI7eqddznH2ofh5WEEdjYXnoRtj7ByQwt7atMErmXIeYKPsSDuMAAqJDlQZ-3Ff4HJVeH_s3Gh8oQ', - 'md5': '337cf7a344663ec79bf93a526a2e06c7', - 'info_dict': { - 'id': 'Da0Gw3xc', - 'ext': 'mp4', - 'title': 'No Daily Show for John Oliver; HBO Show Renewed - IGN News', - 'uploader': 'IGN News', - 'upload_date': '20150217', - 'timestamp': 1424215740, - 'description': 'HBO has renewed Last Week Tonight With John Oliver for two more seasons.', - 'duration': 47.743333, - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - display_id = video_id[:8] - - api_url = ('https://apib4.blinkx.com/api.php?action=play_video&' - + 'video=%s' % video_id) - data_json = self._download_webpage(api_url, display_id) - data = json.loads(data_json)['api']['results'][0] - duration = None - thumbnails = [] - formats = [] - for m in data['media']: - if m['type'] == 'jpg': - thumbnails.append({ - 'url': m['link'], - 'width': int(m['w']), - 'height': int(m['h']), - }) - elif m['type'] == 'original': - duration = float(m['d']) - elif m['type'] == 'youtube': - yt_id = m['link'] - self.to_screen('Youtube video detected: %s' % yt_id) - return self.url_result(yt_id, 'Youtube', video_id=yt_id) - elif m['type'] in ('flv', 'mp4'): - vcodec = remove_start(m['vcodec'], 'ff') - acodec = remove_start(m['acodec'], 'ff') - vbr = int_or_none(m.get('vbr') or m.get('vbitrate'), 1000) - abr = int_or_none(m.get('abr') or m.get('abitrate'), 1000) - tbr = vbr + abr if vbr and abr else None - format_id = '%s-%sk-%s' % (vcodec, tbr, m['w']) - formats.append({ - 'format_id': format_id, - 'url': m['link'], - 'vcodec': vcodec, - 'acodec': acodec, - 'abr': abr, - 'vbr': vbr, - 'tbr': tbr, - 'width': int_or_none(m.get('w')), - 'height': int_or_none(m.get('h')), - }) - - self._sort_formats(formats) - - return { - 'id': display_id, - 'fullid': video_id, - 'title': data['title'], - 'formats': formats, - 'uploader': data['channel_name'], - 'timestamp': data['pubdate_epoch'], - 'description': data.get('description'), - 'thumbnails': thumbnails, - 'duration': duration, - } diff --git a/youtube_dl/extractor/bravotv.py b/youtube_dl/extractor/bravotv.py index b9715df00..bae2aedce 100644 --- a/youtube_dl/extractor/bravotv.py +++ b/youtube_dl/extractor/bravotv.py @@ -12,7 +12,7 @@ from ..utils import ( class BravoTVIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?bravotv\.com/(?:[^/]+/)+(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?(?Pbravotv|oxygen)\.com/(?:[^/]+/)+(?P[^/?#]+)' _TESTS = [{ 'url': 'https://www.bravotv.com/top-chef/season-16/episode-15/videos/the-top-chef-season-16-winner-is', 'md5': 'e34684cfea2a96cd2ee1ef3a60909de9', @@ -28,10 +28,13 @@ class BravoTVIE(AdobePassIE): }, { 'url': 'http://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1', 'only_matching': True, + }, { + 'url': 'https://www.oxygen.com/in-ice-cold-blood/season-2/episode-16/videos/handling-the-horwitz-house-after-the-murder-season-2', + 'only_matching': True, }] def _real_extract(self, url): - display_id = self._match_id(url) + site, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) settings = self._parse_json(self._search_regex( r']+data-drupal-selector="drupal-settings-json"[^>]*>({.+?})', webpage, 'drupal settings'), @@ -53,11 +56,14 @@ class BravoTVIE(AdobePassIE): tp_path = release_pid = tve['release_pid'] if tve.get('entitlement') == 'auth': adobe_pass = settings.get('tve_adobe_auth', {}) + if site == 'bravotv': + site = 'bravo' resource = self._get_mvpd_resource( - adobe_pass.get('adobePassResourceId', 'bravo'), + adobe_pass.get('adobePassResourceId') or site, tve['title'], release_pid, tve.get('rating')) query['auth'] = self._extract_mvpd_auth( - url, release_pid, adobe_pass.get('adobePassRequestorId', 'bravo'), resource) + url, release_pid, + adobe_pass.get('adobePassRequestorId') or site, resource) else: shared_playlist = settings['ls_playlist'] account_pid = shared_playlist['account_pid'] diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py index 8b76a0200..eefbab241 100644 --- a/youtube_dl/extractor/canvas.py +++ b/youtube_dl/extractor/canvas.py @@ -7,19 +7,21 @@ from .common import InfoExtractor from .gigya import GigyaBaseIE from ..compat import compat_HTTPError from ..utils import ( - extract_attributes, ExtractorError, - strip_or_none, + clean_html, + extract_attributes, float_or_none, + get_element_by_class, int_or_none, merge_dicts, str_or_none, + strip_or_none, url_or_none, ) class CanvasIE(InfoExtractor): - _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?Pcanvas|een|ketnet|vrt(?:video|nieuws)|sporza)/assets/(?P[^/?#&]+)' + _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?Pcanvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', 'md5': '68993eda72ef62386a15ea2cf3c93107', @@ -332,3 +334,51 @@ class VrtNUIE(GigyaBaseIE): 'display_id': display_id, 'season_number': int_or_none(page.get('episode_season')), }) + + +class DagelijkseKostIE(InfoExtractor): + IE_DESC = 'dagelijksekost.een.be' + _VALID_URL = r'https?://dagelijksekost\.een\.be/gerechten/(?P[^/?#&]+)' + _TEST = { + 'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof', + 'md5': '30bfffc323009a3e5f689bef6efa2365', + 'info_dict': { + 'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa', + 'display_id': 'hachis-parmentier-met-witloof', + 'ext': 'mp4', + 'title': 'Hachis parmentier met witloof', + 'description': 'md5:9960478392d87f63567b5b117688cdc5', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 283.02, + }, + 'expected_warnings': ['is not a supported codec'], + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + title = strip_or_none(get_element_by_class( + 'dish-metadata__title', webpage + ) or self._html_search_meta( + 'twitter:title', webpage)) + + description = clean_html(get_element_by_class( + 'dish-description', webpage) + ) or self._html_search_meta( + ('description', 'twitter:description', 'og:description'), + webpage) + + video_id = self._html_search_regex( + r'data-url=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'video id', + group='id') + + return { + '_type': 'url_transparent', + 'url': 'https://mediazone.vrt.be/api/v1/dako/assets/%s' % video_id, + 'ie_key': CanvasIE.ie_key(), + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + } diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 4a19a73d2..c79e55a75 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -27,7 +27,7 @@ class CBSBaseIE(ThePlatformFeedIE): class CBSIE(CBSBaseIE): - _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P[\w-]+)' + _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:(?:cbs|paramountplus)\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P[\w-]+)' _TESTS = [{ 'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', @@ -52,6 +52,9 @@ class CBSIE(CBSBaseIE): }, { 'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/', 'only_matching': True, + }, { + 'url': 'https://www.paramountplus.com/shows/all-rise/video/QmR1WhNkh1a_IrdHZrbcRklm176X_rVc/all-rise-space/', + 'only_matching': True, }] def _extract_video_info(self, content_id, site='cbs', mpx_acc=2198311517): diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 345debcf0..1285ed65e 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -26,7 +26,7 @@ class CBSNewsEmbedIE(CBSIE): def _real_extract(self, url): item = self._parse_json(zlib.decompress(compat_b64decode( compat_urllib_parse_unquote(self._match_id(url))), - -zlib.MAX_WBITS), None)['video']['items'][0] + -zlib.MAX_WBITS).decode('utf-8'), None)['video']['items'][0] return self._extract_video_info(item['mpxRefId'], 'cbsnews') diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py index 83b764762..a891c9a55 100644 --- a/youtube_dl/extractor/cbssports.py +++ b/youtube_dl/extractor/cbssports.py @@ -1,38 +1,113 @@ from __future__ import unicode_literals -from .cbs import CBSBaseIE +import re + +# from .cbs import CBSBaseIE +from .common import InfoExtractor +from ..utils import ( + int_or_none, + try_get, +) -class CBSSportsIE(CBSBaseIE): - _VALID_URL = r'https?://(?:www\.)?cbssports\.com/[^/]+/(?:video|news)/(?P[^/?#&]+)' - +# class CBSSportsEmbedIE(CBSBaseIE): +class CBSSportsEmbedIE(InfoExtractor): + IE_NAME = 'cbssports:embed' + _VALID_URL = r'''(?ix)https?://(?:(?:www\.)?cbs|embed\.247)sports\.com/player/embed.+? + (?: + ids%3D(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})| + pcid%3D(?P\d+) + )''' _TESTS = [{ - 'url': 'https://www.cbssports.com/nba/video/donovan-mitchell-flashes-star-potential-in-game-2-victory-over-thunder/', - 'info_dict': { - 'id': '1214315075735', - 'ext': 'mp4', - 'title': 'Donovan Mitchell flashes star potential in Game 2 victory over Thunder', - 'description': 'md5:df6f48622612c2d6bd2e295ddef58def', - 'timestamp': 1524111457, - 'upload_date': '20180419', - 'uploader': 'CBSI-NEW', - }, - 'params': { - # m3u8 download - 'skip_download': True, - } + 'url': 'https://www.cbssports.com/player/embed/?args=player_id%3Db56c03a6-231a-4bbe-9c55-af3c8a8e9636%26ids%3Db56c03a6-231a-4bbe-9c55-af3c8a8e9636%26resizable%3D1%26autoplay%3Dtrue%26domain%3Dcbssports.com%26comp_ads_enabled%3Dfalse%26watchAndRead%3D0%26startTime%3D0%26env%3Dprod', + 'only_matching': True, }, { - 'url': 'https://www.cbssports.com/nba/news/nba-playoffs-2018-watch-76ers-vs-heat-game-3-series-schedule-tv-channel-online-stream/', + 'url': 'https://embed.247sports.com/player/embed/?args=%3fplayer_id%3d1827823171591%26channel%3dcollege-football-recruiting%26pcid%3d1827823171591%26width%3d640%26height%3d360%26autoplay%3dTrue%26comp_ads_enabled%3dFalse%26uvpc%3dhttps%253a%252f%252fwww.cbssports.com%252fapi%252fcontent%252fvideo%252fconfig%252f%253fcfg%253duvp_247sports_v4%2526partner%253d247%26uvpc_m%3dhttps%253a%252f%252fwww.cbssports.com%252fapi%252fcontent%252fvideo%252fconfig%252f%253fcfg%253duvp_247sports_m_v4%2526partner_m%253d247_mobile%26utag%3d247sportssite%26resizable%3dTrue', 'only_matching': True, }] - def _extract_video_info(self, filter_query, video_id): - return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id) + # def _extract_video_info(self, filter_query, video_id): + # return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id) + def _real_extract(self, url): + uuid, pcid = re.match(self._VALID_URL, url).groups() + query = {'id': uuid} if uuid else {'pcid': pcid} + video = self._download_json( + 'https://www.cbssports.com/api/content/video/', + uuid or pcid, query=query)[0] + video_id = video['id'] + title = video['title'] + metadata = video.get('metaData') or {} + # return self._extract_video_info('byId=%d' % metadata['mpxOutletId'], video_id) + # return self._extract_video_info('byGuid=' + metadata['mpxRefId'], video_id) + + formats = self._extract_m3u8_formats( + metadata['files'][0]['url'], video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + self._sort_formats(formats) + + image = video.get('image') + thumbnails = None + if image: + image_path = image.get('path') + if image_path: + thumbnails = [{ + 'url': image_path, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + 'filesize': int_or_none(image.get('size')), + }] + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': video.get('description'), + 'timestamp': int_or_none(try_get(video, lambda x: x['dateCreated']['epoch'])), + 'duration': int_or_none(metadata.get('duration')), + } + + +class CBSSportsBaseIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - [r'(?:=|%26)pcid%3D(\d+)', r'embedVideo(?:Container)?_(\d+)'], - webpage, 'video id') - return self._extract_video_info('byId=%s' % video_id, video_id) + iframe_url = self._search_regex( + r']+(?:data-)?src="(https?://[^/]+/player/embed[^"]+)"', + webpage, 'embed url') + return self.url_result(iframe_url, CBSSportsEmbedIE.ie_key()) + + +class CBSSportsIE(CBSSportsBaseIE): + IE_NAME = 'cbssports' + _VALID_URL = r'https?://(?:www\.)?cbssports\.com/[^/]+/video/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.cbssports.com/college-football/video/cover-3-stanford-spring-gleaning/', + 'info_dict': { + 'id': 'b56c03a6-231a-4bbe-9c55-af3c8a8e9636', + 'ext': 'mp4', + 'title': 'Cover 3: Stanford Spring Gleaning', + 'description': 'The Cover 3 crew break down everything you need to know about the Stanford Cardinal this spring.', + 'timestamp': 1617218398, + 'upload_date': '20210331', + 'duration': 502, + }, + }] + + +class TwentyFourSevenSportsIE(CBSSportsBaseIE): + IE_NAME = '247sports' + _VALID_URL = r'https?://(?:www\.)?247sports\.com/Video/(?:[^/?#&]+-)?(?P\d+)' + _TESTS = [{ + 'url': 'https://247sports.com/Video/2021-QB-Jake-Garcia-senior-highlights-through-five-games-10084854/', + 'info_dict': { + 'id': '4f1265cb-c3b5-44a8-bb1d-1914119a0ccc', + 'ext': 'mp4', + 'title': '2021 QB Jake Garcia senior highlights through five games', + 'description': 'md5:8cb67ebed48e2e6adac1701e0ff6e45b', + 'timestamp': 1607114223, + 'upload_date': '20201204', + 'duration': 208, + }, + }] diff --git a/youtube_dl/extractor/ccma.py b/youtube_dl/extractor/ccma.py index 544647f92..e6ae49352 100644 --- a/youtube_dl/extractor/ccma.py +++ b/youtube_dl/extractor/ccma.py @@ -1,15 +1,18 @@ # coding: utf-8 from __future__ import unicode_literals +import calendar +import datetime import re from .common import InfoExtractor from ..utils import ( clean_html, + extract_timezone, int_or_none, parse_duration, - parse_iso8601, parse_resolution, + try_get, url_or_none, ) @@ -24,8 +27,9 @@ class CCMAIE(InfoExtractor): 'ext': 'mp4', 'title': 'L\'espot de La Marató de TV3', 'description': 'md5:f12987f320e2f6e988e9908e4fe97765', - 'timestamp': 1470918540, - 'upload_date': '20160811', + 'timestamp': 1478608140, + 'upload_date': '20161108', + 'age_limit': 0, } }, { 'url': 'http://www.ccma.cat/catradio/alacarta/programa/el-consell-de-savis-analitza-el-derbi/audio/943685/', @@ -35,8 +39,24 @@ class CCMAIE(InfoExtractor): 'ext': 'mp3', 'title': 'El Consell de Savis analitza el derbi', 'description': 'md5:e2a3648145f3241cb9c6b4b624033e53', - 'upload_date': '20171205', - 'timestamp': 1512507300, + 'upload_date': '20170512', + 'timestamp': 1494622500, + 'vcodec': 'none', + 'categories': ['Esports'], + } + }, { + 'url': 'http://www.ccma.cat/tv3/alacarta/crims/crims-josep-tallada-lespereu-me-capitol-1/video/6031387/', + 'md5': 'b43c3d3486f430f3032b5b160d80cbc3', + 'info_dict': { + 'id': '6031387', + 'ext': 'mp4', + 'title': 'Crims - Josep Talleda, l\'"Espereu-me" (capítol 1)', + 'description': 'md5:7cbdafb640da9d0d2c0f62bad1e74e60', + 'timestamp': 1582577700, + 'upload_date': '20200224', + 'subtitles': 'mincount:4', + 'age_limit': 16, + 'series': 'Crims', } }] @@ -72,17 +92,28 @@ class CCMAIE(InfoExtractor): informacio = media['informacio'] title = informacio['titol'] - durada = informacio.get('durada', {}) + durada = informacio.get('durada') or {} duration = int_or_none(durada.get('milisegons'), 1000) or parse_duration(durada.get('text')) - timestamp = parse_iso8601(informacio.get('data_emissio', {}).get('utc')) + tematica = try_get(informacio, lambda x: x['tematica']['text']) + + timestamp = None + data_utc = try_get(informacio, lambda x: x['data_emissio']['utc']) + try: + timezone, data_utc = extract_timezone(data_utc) + timestamp = calendar.timegm((datetime.datetime.strptime( + data_utc, '%Y-%d-%mT%H:%M:%S') - timezone).timetuple()) + except TypeError: + pass subtitles = {} - subtitols = media.get('subtitols', {}) - if subtitols: - sub_url = subtitols.get('url') + subtitols = media.get('subtitols') or [] + if isinstance(subtitols, dict): + subtitols = [subtitols] + for st in subtitols: + sub_url = st.get('url') if sub_url: subtitles.setdefault( - subtitols.get('iso') or subtitols.get('text') or 'ca', []).append({ + st.get('iso') or st.get('text') or 'ca', []).append({ 'url': sub_url, }) @@ -97,6 +128,16 @@ class CCMAIE(InfoExtractor): 'height': int_or_none(imatges.get('alcada')), }] + age_limit = None + codi_etic = try_get(informacio, lambda x: x['codi_etic']['id']) + if codi_etic: + codi_etic_s = codi_etic.split('_') + if len(codi_etic_s) == 2: + if codi_etic_s[1] == 'TP': + age_limit = 0 + else: + age_limit = int_or_none(codi_etic_s[1]) + return { 'id': media_id, 'title': title, @@ -106,4 +147,9 @@ class CCMAIE(InfoExtractor): 'thumbnails': thumbnails, 'subtitles': subtitles, 'formats': formats, + 'age_limit': age_limit, + 'alt_title': informacio.get('titol_complet'), + 'episode_number': int_or_none(informacio.get('capitol')), + 'categories': [tematica] if tematica else None, + 'series': informacio.get('programa'), } diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py index d67900e62..e1b391937 100644 --- a/youtube_dl/extractor/cda.py +++ b/youtube_dl/extractor/cda.py @@ -95,8 +95,11 @@ class CDAIE(InfoExtractor): if 'Ten film jest dostępny dla użytkowników premium' in webpage: raise ExtractorError('This video is only available for premium users.', expected=True) + if re.search(r'niedostępn[ey] w(?: |\s+)Twoim kraju\s*<', webpage): + self.raise_geo_restricted() + need_confirm_age = False - if self._html_search_regex(r'(]+action="/a/validatebirth")', + if self._html_search_regex(r'(]+action="[^"]*/a/validatebirth[^"]*")', webpage, 'birthday validate form', default=None): webpage = self._download_age_confirm_page( url, video_id, note='Confirming age') @@ -130,6 +133,8 @@ class CDAIE(InfoExtractor): 'age_limit': 18 if need_confirm_age else 0, } + info = self._search_json_ld(webpage, video_id, default={}) + # Source: https://www.cda.pl/js/player.js?t=1606154898 def decrypt_file(a): for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'): @@ -194,7 +199,7 @@ class CDAIE(InfoExtractor): handler = self._download_webpage webpage = handler( - self._BASE_URL + href, video_id, + urljoin(self._BASE_URL, href), video_id, 'Downloading %s version information' % resolution, fatal=False) if not webpage: # Manually report warning because empty page is returned when @@ -206,6 +211,4 @@ class CDAIE(InfoExtractor): self._sort_formats(formats) - info = self._search_json_ld(webpage, video_id, default={}) - return merge_dicts(info_dict, info) diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index d08b909a6..1bfa912be 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -1,142 +1,51 @@ from __future__ import unicode_literals from .mtv import MTVServicesInfoExtractor -from .common import InfoExtractor class ComedyCentralIE(MTVServicesInfoExtractor): - _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/ - (video-clips|episodes|cc-studios|video-collections|shows(?=/[^/]+/(?!full-episodes))) - /(?P.*)''' + _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?)/(?P<id>[0-9a-z]{6})' _FEED_URL = 'http://comedycentral.com/feeds/mrss/' _TESTS = [{ - 'url': 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother', - 'md5': 'c4f48e9eda1b16dd10add0744344b6d8', + 'url': 'http://www.cc.com/video-clips/5ke9v2/the-daily-show-with-trevor-noah-doc-rivers-and-steve-ballmer---the-nba-player-strike', + 'md5': 'b8acb347177c680ff18a292aa2166f80', 'info_dict': { - 'id': 'cef0cbb3-e776-4bc9-b62e-8016deccb354', + 'id': '89ccc86e-1b02-4f83-b0c9-1d9592ecd025', 'ext': 'mp4', - 'title': 'CC:Stand-Up|August 18, 2013|1|0101|Uncensored - Too Good of a Mother', - 'description': 'After a certain point, breastfeeding becomes c**kblocking.', - 'timestamp': 1376798400, - 'upload_date': '20130818', + 'title': 'The Daily Show with Trevor Noah|August 28, 2020|25|25149|Doc Rivers and Steve Ballmer - The NBA Player Strike', + 'description': 'md5:5334307c433892b85f4f5e5ac9ef7498', + 'timestamp': 1598670000, + 'upload_date': '20200829', }, }, { - 'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/interviews/6yx39d/exclusive-rand-paul-extended-interview', + 'url': 'http://www.cc.com/episodes/pnzzci/drawn-together--american-idol--parody-clip-show-season-3-ep-314', 'only_matching': True, - }] - - -class ComedyCentralFullEpisodesIE(MTVServicesInfoExtractor): - _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/ - (?:full-episodes|shows(?=/[^/]+/full-episodes)) - /(?P<id>[^?]+)''' - _FEED_URL = 'http://comedycentral.com/feeds/mrss/' - - _TESTS = [{ - 'url': 'http://www.cc.com/full-episodes/pv391a/the-daily-show-with-trevor-noah-november-28--2016---ryan-speedo-green-season-22-ep-22028', - 'info_dict': { - 'description': 'Donald Trump is accused of exploiting his president-elect status for personal gain, Cuban leader Fidel Castro dies, and Ryan Speedo Green discusses "Sing for Your Life."', - 'title': 'November 28, 2016 - Ryan Speedo Green', - }, - 'playlist_count': 4, }, { - 'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', - 'only_matching': True, - }] - - def _real_extract(self, url): - playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) - mgid = self._extract_triforce_mgid(webpage, data_zone='t2_lc_promo1') - videos_info = self._get_videos_info(mgid) - return videos_info - - -class ToshIE(MTVServicesInfoExtractor): - IE_DESC = 'Tosh.0' - _VALID_URL = r'^https?://tosh\.cc\.com/video-(?:clips|collections)/[^/]+/(?P<videotitle>[^/?#]+)' - _FEED_URL = 'http://tosh.cc.com/feeds/mrss' - - _TESTS = [{ - 'url': 'http://tosh.cc.com/video-clips/68g93d/twitter-users-share-summer-plans', - 'info_dict': { - 'description': 'Tosh asked fans to share their summer plans.', - 'title': 'Twitter Users Share Summer Plans', - }, - 'playlist': [{ - 'md5': 'f269e88114c1805bb6d7653fecea9e06', - 'info_dict': { - 'id': '90498ec2-ed00-11e0-aca6-0026b9414f30', - 'ext': 'mp4', - 'title': 'Tosh.0|June 9, 2077|2|211|Twitter Users Share Summer Plans', - 'description': 'Tosh asked fans to share their summer plans.', - 'thumbnail': r're:^https?://.*\.jpg', - # It's really reported to be published on year 2077 - 'upload_date': '20770610', - 'timestamp': 3390510600, - 'subtitles': { - 'en': 'mincount:3', - }, - }, - }] - }, { - 'url': 'http://tosh.cc.com/video-collections/x2iz7k/just-plain-foul/m5q4fp', + 'url': 'https://www.cc.com/video/k3sdvm/the-daily-show-with-jon-stewart-exclusive-the-fourth-estate', 'only_matching': True, }] class ComedyCentralTVIE(MTVServicesInfoExtractor): - _VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/(?:staffeln|shows)/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/folgen/(?P<id>[0-9a-z]{6})' _TESTS = [{ - 'url': 'http://www.comedycentral.tv/staffeln/7436-the-mindy-project-staffel-4', + 'url': 'https://www.comedycentral.tv/folgen/pxdpec/josh-investigates-klimawandel-staffel-1-ep-1', 'info_dict': { - 'id': 'local_playlist-f99b626bdfe13568579a', - 'ext': 'flv', - 'title': 'Episode_the-mindy-project_shows_season-4_episode-3_full-episode_part1', + 'id': '15907dc3-ec3c-11e8-a442-0e40cf2fc285', + 'ext': 'mp4', + 'title': 'Josh Investigates', + 'description': 'Steht uns das Ende der Welt bevor?', }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, { - 'url': 'http://www.comedycentral.tv/shows/1074-workaholics', - 'only_matching': True, - }, { - 'url': 'http://www.comedycentral.tv/shows/1727-the-mindy-project/bonus', - 'only_matching': True, }] + _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' + _GEO_COUNTRIES = ['DE'] - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - mrss_url = self._search_regex( - r'data-mrss=(["\'])(?P<url>(?:(?!\1).)+)\1', - webpage, 'mrss url', group='url') - - return self._get_videos_info_from_url(mrss_url, video_id) - - -class ComedyCentralShortnameIE(InfoExtractor): - _VALID_URL = r'^:(?P<id>tds|thedailyshow|theopposition)$' - _TESTS = [{ - 'url': ':tds', - 'only_matching': True, - }, { - 'url': ':thedailyshow', - 'only_matching': True, - }, { - 'url': ':theopposition', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - shortcut_map = { - 'tds': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', - 'thedailyshow': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', - 'theopposition': 'http://www.cc.com/shows/the-opposition-with-jordan-klepper/full-episodes', + def _get_feed_query(self, uri): + return { + 'accountOverride': 'intl.mtvi.com', + 'arcEp': 'web.cc.tv', + 'ep': 'b9032c3a', + 'imageEp': 'web.cc.tv', + 'mgid': uri, } - return self.url_result(shortcut_map[video_id]) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index d5faa0eb7..797c35fd5 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -17,7 +17,7 @@ import math from ..compat import ( compat_cookiejar_Cookie, - compat_cookies, + compat_cookies_SimpleCookie, compat_etree_Element, compat_etree_fromstring, compat_getpass, @@ -230,8 +230,10 @@ class InfoExtractor(object): uploader: Full name of the video uploader. license: License name the video is licensed under. creator: The creator of the video. + release_timestamp: UNIX timestamp of the moment the video was released. release_date: The date (YYYYMMDD) when the video was released. - timestamp: UNIX timestamp of the moment the video became available. + timestamp: UNIX timestamp of the moment the video became available + (uploaded). upload_date: Video upload date (YYYYMMDD). If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. @@ -1273,6 +1275,7 @@ class InfoExtractor(object): def extract_video_object(e): assert e['@type'] == 'VideoObject' + author = e.get('author') info.update({ 'url': url_or_none(e.get('contentUrl')), 'title': unescapeHTML(e.get('name')), @@ -1280,7 +1283,11 @@ class InfoExtractor(object): 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')), 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('uploadDate')), - 'uploader': str_or_none(e.get('author')), + # author can be an instance of 'Organization' or 'Person' types. + # both types can have 'name' property(inherited from 'Thing' type). [1] + # however some websites are using 'Text' type instead. + # 1. https://schema.org/VideoObject + 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None, 'filesize': float_or_none(e.get('contentSize')), 'tbr': int_or_none(e.get('bitrate')), 'width': int_or_none(e.get('width')), @@ -2064,7 +2071,7 @@ class InfoExtractor(object): }) return entries - def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}, data=None, headers={}, query={}): + def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): res = self._download_xml_handle( mpd_url, video_id, note=note or 'Downloading MPD manifest', @@ -2078,10 +2085,9 @@ class InfoExtractor(object): mpd_base_url = base_url(urlh.geturl()) return self._parse_mpd_formats( - mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url, - formats_dict=formats_dict, mpd_url=mpd_url) + mpd_doc, mpd_id, mpd_base_url, mpd_url) - def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None): + def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None): """ Parse formats from MPD manifest. References: @@ -2359,15 +2365,7 @@ class InfoExtractor(object): else: # Assuming direct URL to unfragmented media. f['url'] = base_url - - # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation - # is not necessarily unique within a Period thus formats with - # the same `format_id` are quite possible. There are numerous examples - # of such manifests (see https://github.com/ytdl-org/youtube-dl/issues/15111, - # https://github.com/ytdl-org/youtube-dl/issues/13919) - full_info = formats_dict.get(representation_id, {}).copy() - full_info.update(f) - formats.append(full_info) + formats.append(f) else: self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) return formats @@ -2903,10 +2901,10 @@ class InfoExtractor(object): self._downloader.cookiejar.set_cookie(cookie) def _get_cookies(self, url): - """ Return a compat_cookies.SimpleCookie with the cookies for the url """ + """ Return a compat_cookies_SimpleCookie with the cookies for the url """ req = sanitized_Request(url) self._downloader.cookiejar.add_cookie_header(req) - return compat_cookies.SimpleCookie(req.get_header('Cookie')) + return compat_cookies_SimpleCookie(req.get_header('Cookie')) def _apply_first_set_cookie_header(self, url_handle, cookie): """ diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index 766942146..2e01aff48 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -8,11 +8,14 @@ from ..utils import ( ExtractorError, extract_attributes, find_xpath_attr, + get_element_by_attribute, get_element_by_class, int_or_none, js_to_json, merge_dicts, + parse_iso8601, smuggle_url, + str_to_int, unescapeHTML, ) from .senateisvp import SenateISVPIE @@ -116,8 +119,30 @@ class CSpanIE(InfoExtractor): jwsetup, video_id, require_title=False, m3u8_id='hls', base_url=url) add_referer(info['formats']) + for subtitles in info['subtitles'].values(): + for subtitle in subtitles: + ext = determine_ext(subtitle['url']) + if ext == 'php': + ext = 'vtt' + subtitle['ext'] = ext ld_info = self._search_json_ld(webpage, video_id, default={}) - return merge_dicts(info, ld_info) + title = get_element_by_class('video-page-title', webpage) or \ + self._og_search_title(webpage) + description = get_element_by_attribute('itemprop', 'description', webpage) or \ + self._html_search_meta(['og:description', 'description'], webpage) + return merge_dicts(info, ld_info, { + 'title': title, + 'thumbnail': get_element_by_attribute('itemprop', 'thumbnailUrl', webpage), + 'description': description, + 'timestamp': parse_iso8601(get_element_by_attribute('itemprop', 'uploadDate', webpage)), + 'location': get_element_by_attribute('itemprop', 'contentLocation', webpage), + 'duration': int_or_none(self._search_regex( + r'jwsetup\.seclength\s*=\s*(\d+);', + webpage, 'duration', fatal=False)), + 'view_count': str_to_int(self._search_regex( + r"<span[^>]+class='views'[^>]*>([\d,]+)\s+Views</span>", + webpage, 'views', fatal=False)), + }) # Obsolete # We first look for clipid, because clipprog always appears before diff --git a/youtube_dl/extractor/curiositystream.py b/youtube_dl/extractor/curiositystream.py index e4a7fca6c..48ff30432 100644 --- a/youtube_dl/extractor/curiositystream.py +++ b/youtube_dl/extractor/curiositystream.py @@ -25,12 +25,12 @@ class CuriosityStreamBaseIE(InfoExtractor): raise ExtractorError( '%s said: %s' % (self.IE_NAME, error), expected=True) - def _call_api(self, path, video_id): + def _call_api(self, path, video_id, query=None): headers = {} if self._auth_token: headers['X-Auth-Token'] = self._auth_token result = self._download_json( - self._API_BASE_URL + path, video_id, headers=headers) + self._API_BASE_URL + path, video_id, headers=headers, query=query) self._handle_errors(result) return result['data'] @@ -52,62 +52,75 @@ class CuriosityStreamIE(CuriosityStreamBaseIE): _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/video/(?P<id>\d+)' _TEST = { 'url': 'https://app.curiositystream.com/video/2', - 'md5': '262bb2f257ff301115f1973540de8983', 'info_dict': { 'id': '2', 'ext': 'mp4', 'title': 'How Did You Develop The Internet?', 'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.', - } + }, + 'params': { + 'format': 'bestvideo', + # m3u8 download + 'skip_download': True, + }, } def _real_extract(self, url): video_id = self._match_id(url) - media = self._call_api('media/' + video_id, video_id) - title = media['title'] formats = [] - for encoding in media.get('encodings', []): - m3u8_url = encoding.get('master_playlist_url') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - encoding_url = encoding.get('url') - file_url = encoding.get('file_url') - if not encoding_url and not file_url: - continue - f = { - 'width': int_or_none(encoding.get('width')), - 'height': int_or_none(encoding.get('height')), - 'vbr': int_or_none(encoding.get('video_bitrate')), - 'abr': int_or_none(encoding.get('audio_bitrate')), - 'filesize': int_or_none(encoding.get('size_in_bytes')), - 'vcodec': encoding.get('video_codec'), - 'acodec': encoding.get('audio_codec'), - 'container': encoding.get('container_type'), - } - for f_url in (encoding_url, file_url): - if not f_url: + for encoding_format in ('m3u8', 'mpd'): + media = self._call_api('media/' + video_id, video_id, query={ + 'encodingsNew': 'true', + 'encodingsFormat': encoding_format, + }) + for encoding in media.get('encodings', []): + playlist_url = encoding.get('master_playlist_url') + if encoding_format == 'm3u8': + # use `m3u8` entry_protocol until EXT-X-MAP is properly supported by `m3u8_native` entry_protocol + formats.extend(self._extract_m3u8_formats( + playlist_url, video_id, 'mp4', + m3u8_id='hls', fatal=False)) + elif encoding_format == 'mpd': + formats.extend(self._extract_mpd_formats( + playlist_url, video_id, mpd_id='dash', fatal=False)) + encoding_url = encoding.get('url') + file_url = encoding.get('file_url') + if not encoding_url and not file_url: continue - fmt = f.copy() - rtmp = re.search(r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+))/(?P<playpath>mp[34]:.+)$', f_url) - if rtmp: - fmt.update({ - 'url': rtmp.group('url'), - 'play_path': rtmp.group('playpath'), - 'app': rtmp.group('app'), - 'ext': 'flv', - 'format_id': 'rtmp', - }) - else: - fmt.update({ - 'url': f_url, - 'format_id': 'http', - }) - formats.append(fmt) + f = { + 'width': int_or_none(encoding.get('width')), + 'height': int_or_none(encoding.get('height')), + 'vbr': int_or_none(encoding.get('video_bitrate')), + 'abr': int_or_none(encoding.get('audio_bitrate')), + 'filesize': int_or_none(encoding.get('size_in_bytes')), + 'vcodec': encoding.get('video_codec'), + 'acodec': encoding.get('audio_codec'), + 'container': encoding.get('container_type'), + } + for f_url in (encoding_url, file_url): + if not f_url: + continue + fmt = f.copy() + rtmp = re.search(r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+))/(?P<playpath>mp[34]:.+)$', f_url) + if rtmp: + fmt.update({ + 'url': rtmp.group('url'), + 'play_path': rtmp.group('playpath'), + 'app': rtmp.group('app'), + 'ext': 'flv', + 'format_id': 'rtmp', + }) + else: + fmt.update({ + 'url': f_url, + 'format_id': 'http', + }) + formats.append(fmt) self._sort_formats(formats) + title = media['title'] + subtitles = {} for closed_caption in media.get('closed_captions', []): sub_url = closed_caption.get('file') @@ -132,7 +145,7 @@ class CuriosityStreamIE(CuriosityStreamBaseIE): class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): IE_NAME = 'curiositystream:collection' - _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:collection|series)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:collections?|series)/(?P<id>\d+)' _TESTS = [{ 'url': 'https://app.curiositystream.com/collection/2', 'info_dict': { @@ -140,10 +153,13 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): 'title': 'Curious Minds: The Internet', 'description': 'How is the internet shaping our lives in the 21st Century?', }, - 'playlist_mincount': 17, + 'playlist_mincount': 16, }, { 'url': 'https://curiositystream.com/series/2', 'only_matching': True, + }, { + 'url': 'https://curiositystream.com/collections/36', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/dispeak.py b/youtube_dl/extractor/dispeak.py index c345e0274..276fd4b09 100644 --- a/youtube_dl/extractor/dispeak.py +++ b/youtube_dl/extractor/dispeak.py @@ -32,6 +32,18 @@ class DigitallySpeakingIE(InfoExtractor): # From http://www.gdcvault.com/play/1013700/Advanced-Material 'url': 'http://sevt.dispeak.com/ubm/gdc/eur10/xml/11256_1282118587281VNIT.xml', 'only_matching': True, + }, { + # From https://gdcvault.com/play/1016624, empty speakerVideo + 'url': 'https://sevt.dispeak.com/ubm/gdc/online12/xml/201210-822101_1349794556671DDDD.xml', + 'info_dict': { + 'id': '201210-822101_1349794556671DDDD', + 'ext': 'flv', + 'title': 'Pre-launch - Preparing to Take the Plunge', + }, + }, { + # From http://www.gdcvault.com/play/1014846/Conference-Keynote-Shigeru, empty slideVideo + 'url': 'http://events.digitallyspeaking.com/gdc/project25/xml/p25-miyamoto1999_1282467389849HSVB.xml', + 'only_matching': True, }] def _parse_mp4(self, metadata): @@ -84,26 +96,20 @@ class DigitallySpeakingIE(InfoExtractor): 'vcodec': 'none', 'format_id': audio.get('code'), }) - slide_video_path = xpath_text(metadata, './slideVideo', fatal=True) - formats.append({ - 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, - 'play_path': remove_end(slide_video_path, '.flv'), - 'ext': 'flv', - 'format_note': 'slide deck video', - 'quality': -2, - 'preference': -2, - 'format_id': 'slides', - }) - speaker_video_path = xpath_text(metadata, './speakerVideo', fatal=True) - formats.append({ - 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, - 'play_path': remove_end(speaker_video_path, '.flv'), - 'ext': 'flv', - 'format_note': 'speaker video', - 'quality': -1, - 'preference': -1, - 'format_id': 'speaker', - }) + for video_key, format_id, preference in ( + ('slide', 'slides', -2), ('speaker', 'speaker', -1)): + video_path = xpath_text(metadata, './%sVideo' % video_key) + if not video_path: + continue + formats.append({ + 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, + 'play_path': remove_end(video_path, '.flv'), + 'ext': 'flv', + 'format_note': '%s video' % video_key, + 'quality': preference, + 'preference': preference, + 'format_id': format_id, + }) return formats def _real_extract(self, url): diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index 47501dbe6..bbb199094 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re from .common import InfoExtractor @@ -10,11 +11,13 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + strip_or_none, unified_timestamp, ) class DPlayIE(InfoExtractor): + _PATH_REGEX = r'/(?P<id>[^/]+/[^/?#]+)' _VALID_URL = r'''(?x)https?:// (?P<domain> (?:www\.)?(?P<host>d @@ -24,7 +27,7 @@ class DPlayIE(InfoExtractor): ) )| (?P<subdomain_country>es|it)\.dplay\.com - )/[^/]+/(?P<id>[^/]+/[^/?#]+)''' + )/[^/]+''' + _PATH_REGEX _TESTS = [{ # non geo restricted, via secure api, unsigned download hls URL @@ -151,56 +154,79 @@ class DPlayIE(InfoExtractor): 'only_matching': True, }] + def _process_errors(self, e, geo_countries): + info = self._parse_json(e.cause.read().decode('utf-8'), None) + error = info['errors'][0] + error_code = error.get('code') + if error_code == 'access.denied.geoblocked': + self.raise_geo_restricted(countries=geo_countries) + elif error_code in ('access.denied.missingpackage', 'invalid.token'): + raise ExtractorError( + 'This video is only available for registered users. You may want to use --cookies.', expected=True) + raise ExtractorError(info['errors'][0]['detail'], expected=True) + + def _update_disco_api_headers(self, headers, disco_base, display_id, realm): + headers['Authorization'] = 'Bearer ' + self._download_json( + disco_base + 'token', display_id, 'Downloading token', + query={ + 'realm': realm, + })['data']['attributes']['token'] + + def _download_video_playback_info(self, disco_base, video_id, headers): + streaming = self._download_json( + disco_base + 'playback/videoPlaybackInfo/' + video_id, + video_id, headers=headers)['data']['attributes']['streaming'] + streaming_list = [] + for format_id, format_dict in streaming.items(): + streaming_list.append({ + 'type': format_id, + 'url': format_dict.get('url'), + }) + return streaming_list + def _get_disco_api_info(self, url, display_id, disco_host, realm, country): geo_countries = [country.upper()] self._initialize_geo_bypass({ 'countries': geo_countries, }) disco_base = 'https://%s/' % disco_host - token = self._download_json( - disco_base + 'token', display_id, 'Downloading token', - query={ - 'realm': realm, - })['data']['attributes']['token'] headers = { 'Referer': url, - 'Authorization': 'Bearer ' + token, } - video = self._download_json( - disco_base + 'content/videos/' + display_id, display_id, - headers=headers, query={ - 'fields[channel]': 'name', - 'fields[image]': 'height,src,width', - 'fields[show]': 'name', - 'fields[tag]': 'name', - 'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration', - 'include': 'images,primaryChannel,show,tags' - }) + self._update_disco_api_headers(headers, disco_base, display_id, realm) + try: + video = self._download_json( + disco_base + 'content/videos/' + display_id, display_id, + headers=headers, query={ + 'fields[channel]': 'name', + 'fields[image]': 'height,src,width', + 'fields[show]': 'name', + 'fields[tag]': 'name', + 'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration', + 'include': 'images,primaryChannel,show,tags' + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + self._process_errors(e, geo_countries) + raise video_id = video['data']['id'] info = video['data']['attributes'] title = info['name'].strip() formats = [] try: - streaming = self._download_json( - disco_base + 'playback/videoPlaybackInfo/' + video_id, - display_id, headers=headers)['data']['attributes']['streaming'] + streaming = self._download_video_playback_info( + disco_base, video_id, headers) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - info = self._parse_json(e.cause.read().decode('utf-8'), display_id) - error = info['errors'][0] - error_code = error.get('code') - if error_code == 'access.denied.geoblocked': - self.raise_geo_restricted(countries=geo_countries) - elif error_code == 'access.denied.missingpackage': - self.raise_login_required() - raise ExtractorError(info['errors'][0]['detail'], expected=True) + self._process_errors(e, geo_countries) raise - for format_id, format_dict in streaming.items(): + for format_dict in streaming: if not isinstance(format_dict, dict): continue format_url = format_dict.get('url') if not format_url: continue + format_id = format_dict.get('type') ext = determine_ext(format_url) if format_id == 'dash' or ext == 'mpd': formats.extend(self._extract_mpd_formats( @@ -248,7 +274,7 @@ class DPlayIE(InfoExtractor): 'id': video_id, 'display_id': display_id, 'title': title, - 'description': info.get('description'), + 'description': strip_or_none(info.get('description')), 'duration': float_or_none(info.get('videoDuration'), 1000), 'timestamp': unified_timestamp(info.get('publishStart')), 'series': series, @@ -268,3 +294,76 @@ class DPlayIE(InfoExtractor): host = 'disco-api.' + domain if domain[0] == 'd' else 'eu2-prod.disco-api.com' return self._get_disco_api_info( url, display_id, host, 'dplay' + country, country) + + +class DiscoveryPlusIE(DPlayIE): + _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/video' + DPlayIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://www.discoveryplus.com/video/property-brothers-forever-home/food-and-family', + 'info_dict': { + 'id': '1140794', + 'display_id': 'property-brothers-forever-home/food-and-family', + 'ext': 'mp4', + 'title': 'Food and Family', + 'description': 'The brothers help a Richmond family expand their single-level home.', + 'duration': 2583.113, + 'timestamp': 1609304400, + 'upload_date': '20201230', + 'creator': 'HGTV', + 'series': 'Property Brothers: Forever Home', + 'season_number': 1, + 'episode_number': 1, + }, + 'skip': 'Available for Premium users', + }] + + def _update_disco_api_headers(self, headers, disco_base, display_id, realm): + headers['x-disco-client'] = 'WEB:UNKNOWN:dplus_us:15.0.0' + + def _download_video_playback_info(self, disco_base, video_id, headers): + return self._download_json( + disco_base + 'playback/v3/videoPlaybackInfo', + video_id, headers=headers, data=json.dumps({ + 'deviceInfo': { + 'adBlocker': False, + }, + 'videoId': video_id, + 'wisteriaProperties': { + 'platform': 'desktop', + 'product': 'dplus_us', + }, + }).encode('utf-8'))['data']['attributes']['streaming'] + + def _real_extract(self, url): + display_id = self._match_id(url) + return self._get_disco_api_info( + url, display_id, 'us1-prod-direct.discoveryplus.com', 'go', 'us') + + +class HGTVDeIE(DPlayIE): + _VALID_URL = r'https?://de\.hgtv\.com/sendungen' + DPlayIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://de.hgtv.com/sendungen/tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette/', + 'info_dict': { + 'id': '151205', + 'display_id': 'tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette', + 'ext': 'mp4', + 'title': 'Wer braucht schon eine Toilette', + 'description': 'md5:05b40a27e7aed2c9172de34d459134e2', + 'duration': 1177.024, + 'timestamp': 1595705400, + 'upload_date': '20200725', + 'creator': 'HGTV', + 'series': 'Tiny House - klein, aber oho', + 'season_number': 3, + 'episode_number': 3, + }, + 'params': { + 'format': 'bestvideo', + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + return self._get_disco_api_info( + url, display_id, 'eu1-prod.disco-api.com', 'hgtv', 'de') diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 848d387d1..5a07c18f4 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -1,193 +1,43 @@ from __future__ import unicode_literals -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - unified_strdate, - xpath_text, - determine_ext, - float_or_none, - ExtractorError, -) +from .zdf import ZDFIE -class DreiSatIE(InfoExtractor): +class DreiSatIE(ZDFIE): IE_NAME = '3sat' - _GEO_COUNTRIES = ['DE'] - _VALID_URL = r'https?://(?:www\.)?3sat\.de/mediathek/(?:(?:index|mediathek)\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)' - _TESTS = [ - { - 'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918', - 'md5': 'be37228896d30a88f315b638900a026e', - 'info_dict': { - 'id': '45918', - 'ext': 'mp4', - 'title': 'Waidmannsheil', - 'description': 'md5:cce00ca1d70e21425e72c86a98a56817', - 'uploader': 'SCHWEIZWEIT', - 'uploader_id': '100000210', - 'upload_date': '20140913' - }, - 'params': { - 'skip_download': True, # m3u8 downloads - } + _VALID_URL = r'https?://(?:www\.)?3sat\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html' + _TESTS = [{ + # Same as https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html + 'url': 'https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html', + 'md5': '0aff3e7bc72c8813f5e0fae333316a1d', + 'info_dict': { + 'id': '141007_ab18_10wochensommer_film', + 'ext': 'mp4', + 'title': 'Ab 18! - 10 Wochen Sommer', + 'description': 'md5:8253f41dc99ce2c3ff892dac2d65fe26', + 'duration': 2660, + 'timestamp': 1608604200, + 'upload_date': '20201222', }, - { - 'url': 'http://www.3sat.de/mediathek/mediathek.php?mode=play&obj=51066', - 'only_matching': True, + }, { + 'url': 'https://www.3sat.de/gesellschaft/schweizweit/waidmannsheil-100.html', + 'info_dict': { + 'id': '140913_sendung_schweizweit', + 'ext': 'mp4', + 'title': 'Waidmannsheil', + 'description': 'md5:cce00ca1d70e21425e72c86a98a56817', + 'timestamp': 1410623100, + 'upload_date': '20140913' }, - ] - - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): - param_groups = {} - for param_group in smil.findall(self._xpath_ns('./head/paramGroup', namespace)): - group_id = param_group.get(self._xpath_ns( - 'id', 'http://www.w3.org/XML/1998/namespace')) - params = {} - for param in param_group: - params[param.get('name')] = param.get('value') - param_groups[group_id] = params - - formats = [] - for video in smil.findall(self._xpath_ns('.//video', namespace)): - src = video.get('src') - if not src: - continue - bitrate = int_or_none(self._search_regex(r'_(\d+)k', src, 'bitrate', None)) or float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) - group_id = video.get('paramGroup') - param_group = param_groups[group_id] - for proto in param_group['protocols'].split(','): - formats.append({ - 'url': '%s://%s' % (proto, param_group['host']), - 'app': param_group['app'], - 'play_path': src, - 'ext': 'flv', - 'format_id': '%s-%d' % (proto, bitrate), - 'tbr': bitrate, - }) - self._sort_formats(formats) - return formats - - def extract_from_xml_url(self, video_id, xml_url): - doc = self._download_xml( - xml_url, video_id, - note='Downloading video info', - errnote='Failed to download video info') - - status_code = xpath_text(doc, './status/statuscode') - if status_code and status_code != 'ok': - if status_code == 'notVisibleAnymore': - message = 'Video %s is not available' % video_id - else: - message = '%s returned error: %s' % (self.IE_NAME, status_code) - raise ExtractorError(message, expected=True) - - title = xpath_text(doc, './/information/title', 'title', True) - - urls = [] - formats = [] - for fnode in doc.findall('.//formitaeten/formitaet'): - video_url = xpath_text(fnode, 'url') - if not video_url or video_url in urls: - continue - urls.append(video_url) - - is_available = 'http://www.metafilegenerator' not in video_url - geoloced = 'static_geoloced_online' in video_url - if not is_available or geoloced: - continue - - format_id = fnode.attrib['basetype'] - format_m = re.match(r'''(?x) - (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_ - (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+) - ''', format_id) - - ext = determine_ext(video_url, None) or format_m.group('container') - - if ext == 'meta': - continue - elif ext == 'smil': - formats.extend(self._extract_smil_formats( - video_url, video_id, fatal=False)) - elif ext == 'm3u8': - # the certificates are misconfigured (see - # https://github.com/ytdl-org/youtube-dl/issues/8665) - if video_url.startswith('https://'): - continue - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', 'm3u8_native', - m3u8_id=format_id, fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - video_url, video_id, f4m_id=format_id, fatal=False)) - else: - quality = xpath_text(fnode, './quality') - if quality: - format_id += '-' + quality - - abr = int_or_none(xpath_text(fnode, './audioBitrate'), 1000) - vbr = int_or_none(xpath_text(fnode, './videoBitrate'), 1000) - - tbr = int_or_none(self._search_regex( - r'_(\d+)k', video_url, 'bitrate', None)) - if tbr and vbr and not abr: - abr = tbr - vbr - - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'ext': ext, - 'acodec': format_m.group('acodec'), - 'vcodec': format_m.group('vcodec'), - 'abr': abr, - 'vbr': vbr, - 'tbr': tbr, - 'width': int_or_none(xpath_text(fnode, './width')), - 'height': int_or_none(xpath_text(fnode, './height')), - 'filesize': int_or_none(xpath_text(fnode, './filesize')), - 'protocol': format_m.group('proto').lower(), - }) - - geolocation = xpath_text(doc, './/details/geolocation') - if not formats and geolocation and geolocation != 'none': - self.raise_geo_restricted(countries=self._GEO_COUNTRIES) - - self._sort_formats(formats) - - thumbnails = [] - for node in doc.findall('.//teaserimages/teaserimage'): - thumbnail_url = node.text - if not thumbnail_url: - continue - thumbnail = { - 'url': thumbnail_url, - } - thumbnail_key = node.get('key') - if thumbnail_key: - m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key) - if m: - thumbnail['width'] = int(m.group(1)) - thumbnail['height'] = int(m.group(2)) - thumbnails.append(thumbnail) - - upload_date = unified_strdate(xpath_text(doc, './/details/airtime')) - - return { - 'id': video_id, - 'title': title, - 'description': xpath_text(doc, './/information/detail'), - 'duration': int_or_none(xpath_text(doc, './/details/lengthSec')), - 'thumbnails': thumbnails, - 'uploader': xpath_text(doc, './/details/originChannelTitle'), - 'uploader_id': xpath_text(doc, './/details/originChannelId'), - 'upload_date': upload_date, - 'formats': formats, + 'params': { + 'skip_download': True, } - - def _real_extract(self, url): - video_id = self._match_id(url) - details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?id=%s' % video_id - return self.extract_from_xml_url(video_id, details_url) + }, { + # Same as https://www.zdf.de/filme/filme-sonstige/der-hauptmann-112.html + 'url': 'https://www.3sat.de/film/spielfilm/der-hauptmann-100.html', + 'only_matching': True, + }, { + # Same as https://www.zdf.de/wissen/nano/nano-21-mai-2019-102.html, equal media ids + 'url': 'https://www.3sat.de/wissen/nano/nano-21-mai-2019-102.html', + 'only_matching': True, + }] diff --git a/youtube_dl/extractor/egghead.py b/youtube_dl/extractor/egghead.py index df11dc206..9bbd703e0 100644 --- a/youtube_dl/extractor/egghead.py +++ b/youtube_dl/extractor/egghead.py @@ -12,26 +12,35 @@ from ..utils import ( ) -class EggheadCourseIE(InfoExtractor): +class EggheadBaseIE(InfoExtractor): + def _call_api(self, path, video_id, resource, fatal=True): + return self._download_json( + 'https://app.egghead.io/api/v1/' + path, + video_id, 'Downloading %s JSON' % resource, fatal=fatal) + + +class EggheadCourseIE(EggheadBaseIE): IE_DESC = 'egghead.io course' IE_NAME = 'egghead:course' - _VALID_URL = r'https://egghead\.io/courses/(?P<id>[^/?#&]+)' - _TEST = { + _VALID_URL = r'https://(?:app\.)?egghead\.io/(?:course|playlist)s/(?P<id>[^/?#&]+)' + _TESTS = [{ 'url': 'https://egghead.io/courses/professor-frisby-introduces-composable-functional-javascript', 'playlist_count': 29, 'info_dict': { - 'id': '72', + 'id': '432655', 'title': 'Professor Frisby Introduces Composable Functional JavaScript', 'description': 're:(?s)^This course teaches the ubiquitous.*You\'ll start composing functionality before you know it.$', }, - } + }, { + 'url': 'https://app.egghead.io/playlists/professor-frisby-introduces-composable-functional-javascript', + 'only_matching': True, + }] def _real_extract(self, url): playlist_id = self._match_id(url) - - lessons = self._download_json( - 'https://egghead.io/api/v1/series/%s/lessons' % playlist_id, - playlist_id, 'Downloading course lessons JSON') + series_path = 'series/' + playlist_id + lessons = self._call_api( + series_path + '/lessons', playlist_id, 'course lessons') entries = [] for lesson in lessons: @@ -44,9 +53,8 @@ class EggheadCourseIE(InfoExtractor): entries.append(self.url_result( lesson_url, ie=EggheadLessonIE.ie_key(), video_id=lesson_id)) - course = self._download_json( - 'https://egghead.io/api/v1/series/%s' % playlist_id, - playlist_id, 'Downloading course JSON', fatal=False) or {} + course = self._call_api( + series_path, playlist_id, 'course', False) or {} playlist_id = course.get('id') if playlist_id: @@ -57,10 +65,10 @@ class EggheadCourseIE(InfoExtractor): course.get('description')) -class EggheadLessonIE(InfoExtractor): +class EggheadLessonIE(EggheadBaseIE): IE_DESC = 'egghead.io lesson' IE_NAME = 'egghead:lesson' - _VALID_URL = r'https://egghead\.io/(?:api/v1/)?lessons/(?P<id>[^/?#&]+)' + _VALID_URL = r'https://(?:app\.)?egghead\.io/(?:api/v1/)?lessons/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box', 'info_dict': { @@ -74,7 +82,7 @@ class EggheadLessonIE(InfoExtractor): 'upload_date': '20161209', 'duration': 304, 'view_count': 0, - 'tags': ['javascript', 'free'], + 'tags': 'count:2', }, 'params': { 'skip_download': True, @@ -83,13 +91,16 @@ class EggheadLessonIE(InfoExtractor): }, { 'url': 'https://egghead.io/api/v1/lessons/react-add-redux-to-a-react-application', 'only_matching': True, + }, { + 'url': 'https://app.egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box', + 'only_matching': True, }] def _real_extract(self, url): display_id = self._match_id(url) - lesson = self._download_json( - 'https://egghead.io/api/v1/lessons/%s' % display_id, display_id) + lesson = self._call_api( + 'lessons/' + display_id, display_id, 'lesson') lesson_id = compat_str(lesson['id']) title = lesson['title'] diff --git a/youtube_dl/extractor/eroprofile.py b/youtube_dl/extractor/eroprofile.py index c08643a17..c460dc7f9 100644 --- a/youtube_dl/extractor/eroprofile.py +++ b/youtube_dl/extractor/eroprofile.py @@ -6,7 +6,7 @@ from .common import InfoExtractor from ..compat import compat_urllib_parse_urlencode from ..utils import ( ExtractorError, - unescapeHTML + merge_dicts, ) @@ -24,7 +24,8 @@ class EroProfileIE(InfoExtractor): 'title': 'sexy babe softcore', 'thumbnail': r're:https?://.*\.jpg', 'age_limit': 18, - } + }, + 'skip': 'Video not found', }, { 'url': 'http://www.eroprofile.com/m/videos/view/Try-It-On-Pee_cut_2-wmv-4shared-com-file-sharing-download-movie-file', 'md5': '1baa9602ede46ce904c431f5418d8916', @@ -77,19 +78,15 @@ class EroProfileIE(InfoExtractor): [r"glbUpdViews\s*\('\d*','(\d+)'", r'p/report/video/(\d+)'], webpage, 'video id', default=None) - video_url = unescapeHTML(self._search_regex( - r'<source src="([^"]+)', webpage, 'video url')) title = self._html_search_regex( - r'Title:</th><td>([^<]+)</td>', webpage, 'title') - thumbnail = self._search_regex( - r'onclick="showVideoPlayer\(\)"><img src="([^"]+)', - webpage, 'thumbnail', fatal=False) + (r'Title:</th><td>([^<]+)</td>', r'<h1[^>]*>(.+?)</h1>'), + webpage, 'title') - return { + info = self._parse_html5_media_entries(url, webpage, video_id)[0] + + return merge_dicts(info, { 'id': video_id, 'display_id': display_id, - 'url': video_url, 'title': title, - 'thumbnail': thumbnail, 'age_limit': 18, - } + }) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d403a2dbe..4e9954c6a 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -42,7 +42,10 @@ from .aljazeera import AlJazeeraIE from .alphaporno import AlphaPornoIE from .amara import AmaraIE from .amcnetworks import AMCNetworksIE -from .americastestkitchen import AmericasTestKitchenIE +from .americastestkitchen import ( + AmericasTestKitchenIE, + AmericasTestKitchenSeasonIE, +) from .animeondemand import AnimeOnDemandIE from .anvato import AnvatoIE from .aol import AolIE @@ -69,6 +72,7 @@ from .arte import ( ArteTVEmbedIE, ArteTVPlaylistIE, ) +from .arnes import ArnesIE from .asiancrush import ( AsianCrushIE, AsianCrushPlaylistIE, @@ -87,11 +91,13 @@ from .awaan import ( ) from .azmedien import AZMedienIE from .baidu import BaiduVideoIE +from .bandaichannel import BandaiChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE, BandcampWeeklyIE from .bbc import ( BBCCoUkIE, BBCCoUkArticleIE, - BBCCoUkIPlayerPlaylistIE, + BBCCoUkIPlayerEpisodesIE, + BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE, BBCIE, ) @@ -126,7 +132,6 @@ from .bleacherreport import ( BleacherReportIE, BleacherReportCMSIE, ) -from .blinkx import BlinkxIE from .bloomberg import BloombergIE from .bokecc import BokeCCIE from .bongacams import BongaCamsIE @@ -160,6 +165,7 @@ from .canvas import ( CanvasIE, CanvasEenIE, VrtNUIE, + DagelijkseKostIE, ) from .carambatv import ( CarambaTVIE, @@ -184,7 +190,11 @@ from .cbsnews import ( CBSNewsIE, CBSNewsLiveVideoIE, ) -from .cbssports import CBSSportsIE +from .cbssports import ( + CBSSportsEmbedIE, + CBSSportsIE, + TwentyFourSevenSportsIE, +) from .ccc import ( CCCIE, CCCPlaylistIE, @@ -232,11 +242,8 @@ from .cnn import ( ) from .coub import CoubIE from .comedycentral import ( - ComedyCentralFullEpisodesIE, ComedyCentralIE, - ComedyCentralShortnameIE, ComedyCentralTVIE, - ToshIE, ) from .commonmistakes import CommonMistakesIE, UnicodeBOMIE from .commonprotocols import ( @@ -287,7 +294,11 @@ from .douyutv import ( DouyuShowIE, DouyuTVIE, ) -from .dplay import DPlayIE +from .dplay import ( + DPlayIE, + DiscoveryPlusIE, + HGTVDeIE, +) from .dreisat import DreiSatIE from .drbonanza import DRBonanzaIE from .drtuber import DrTuberIE @@ -416,6 +427,7 @@ from .gamestar import GameStarIE from .gaskrank import GaskrankIE from .gazeta import GazetaIE from .gdcvault import GDCVaultIE +from .gedidigital import GediDigitalIE from .generic import GenericIE from .gfycat import GfycatIE from .giantbomb import GiantBombIE @@ -470,8 +482,8 @@ from .hungama import ( from .hypem import HypemIE from .ign import ( IGNIE, - OneUPIE, - PCMagIE, + IGNVideoIE, + IGNArticleIE, ) from .iheart import ( IHeartRadioIE, @@ -526,7 +538,10 @@ from .karaoketv import KaraoketvIE from .karrierevideos import KarriereVideosIE from .keezmovies import KeezMoviesIE from .ketnet import KetnetIE -from .khanacademy import KhanAcademyIE +from .khanacademy import ( + KhanAcademyIE, + KhanAcademyUnitIE, +) from .kickstarter import KickStarterIE from .kinja import KinjaEmbedIE from .kinopoisk import KinoPoiskIE @@ -583,7 +598,11 @@ from .limelight import ( LimelightChannelIE, LimelightChannelListIE, ) -from .line import LineTVIE +from .line import ( + LineTVIE, + LineLiveIE, + LineLiveChannelIE, +) from .linkedin import ( LinkedInLearningIE, LinkedInLearningCourseIE, @@ -591,10 +610,6 @@ from .linkedin import ( from .linuxacademy import LinuxAcademyIE from .litv import LiTVIE from .livejournal import LiveJournalIE -from .liveleak import ( - LiveLeakIE, - LiveLeakEmbedIE, -) from .livestream import ( LivestreamIE, LivestreamOriginalIE, @@ -620,6 +635,7 @@ from .mangomolo import ( MangomoloLiveIE, ) from .manyvids import ManyVidsIE +from .maoritv import MaoriTVIE from .markiza import ( MarkizaIE, MarkizaPageIE, @@ -648,6 +664,11 @@ from .microsoftvirtualacademy import ( MicrosoftVirtualAcademyIE, MicrosoftVirtualAcademyCourseIE, ) +from .minds import ( + MindsIE, + MindsChannelIE, + MindsGroupIE, +) from .ministrygrid import MinistryGridIE from .minoto import MinotoIE from .miomio import MioMioIE @@ -658,7 +679,10 @@ from .mixcloud import ( MixcloudUserIE, MixcloudPlaylistIE, ) -from .mlb import MLBIE +from .mlb import ( + MLBIE, + MLBVideoIE, +) from .mnet import MnetIE from .moevideo import MoeVideoIE from .mofosex import ( @@ -859,6 +883,11 @@ from .packtpub import ( PacktPubIE, PacktPubCourseIE, ) +from .palcomp3 import ( + PalcoMP3IE, + PalcoMP3ArtistIE, + PalcoMP3VideoIE, +) from .pandoratv import PandoraTVIE from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE @@ -892,6 +921,7 @@ from .platzi import ( from .playfm import PlayFMIE from .playplustv import PlayPlusTVIE from .plays import PlaysTVIE +from .playstuff import PlayStuffIE from .playtvak import PlaytvakIE from .playvid import PlayvidIE from .playwire import PlaywireIE @@ -1016,6 +1046,7 @@ from .safari import ( SafariApiIE, SafariCourseIE, ) +from .samplefocus import SampleFocusIE from .sapo import SapoIE from .savefrom import SaveFromIE from .sbs import SBSIE @@ -1048,6 +1079,11 @@ from .shared import ( VivoIE, ) from .showroomlive import ShowRoomLiveIE +from .simplecast import ( + SimplecastIE, + SimplecastEpisodeIE, + SimplecastPodcastIE, +) from .sina import SinaIE from .sixplay import SixPlayIE from .skyit import ( @@ -1113,6 +1149,10 @@ from .stitcher import ( from .sport5 import Sport5IE from .sportbox import SportBoxIE from .sportdeutschland import SportDeutschlandIE +from .spotify import ( + SpotifyIE, + SpotifyShowIE, +) from .spreaker import ( SpreakerIE, SpreakerPageIE, @@ -1128,6 +1168,11 @@ from .srgssr import ( from .srmediathek import SRMediathekIE from .stanfordoc import StanfordOpenClassroomIE from .steam import SteamIE +from .storyfire import ( + StoryFireIE, + StoryFireUserIE, + StoryFireSeriesIE, +) from .streamable import StreamableIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE @@ -1226,6 +1271,10 @@ from .toutv import TouTvIE from .toypics import ToypicsUserIE, ToypicsIE from .traileraddict import TrailerAddictIE from .trilulilu import TriluliluIE +from .trovo import ( + TrovoIE, + TrovoVodIE, +) from .trunews import TruNewsIE from .trutv import TruTVIE from .tube8 import Tube8IE @@ -1244,6 +1293,7 @@ from .tv2 import ( TV2IE, TV2ArticleIE, KatsomoIE, + MTVUutisetArticleIE, ) from .tv2dk import ( TV2DKIE, @@ -1382,7 +1432,6 @@ from .vidme import ( VidmeUserIE, VidmeUserLikesIE, ) -from .vidzi import VidziIE from .vier import VierIE, VierVideosIE from .viewlift import ( ViewLiftIE, @@ -1442,6 +1491,7 @@ from .vrv import ( VRVSeriesIE, ) from .vshare import VShareIE +from .vtm import VTMIE from .medialaan import MedialaanIE from .vube import VubeIE from .vuclip import VuClipIE @@ -1585,5 +1635,10 @@ from .zattoo import ( ZattooLiveIE, ) from .zdf import ZDFIE, ZDFChannelIE -from .zingmp3 import ZingMp3IE +from .zhihu import ZhihuIE +from .zingmp3 import ( + ZingMp3IE, + ZingMp3AlbumIE, +) +from .zoom import ZoomIE from .zype import ZypeIE diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index cb34c59f5..04650af39 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -521,7 +521,10 @@ class FacebookIE(InfoExtractor): raise ExtractorError( 'The video is not available, Facebook said: "%s"' % m_msg.group(1), expected=True) - elif '>You must log in to continue' in webpage: + elif any(p in webpage for p in ( + '>You must log in to continue', + 'id="login_form"', + 'id="loginbutton"')): self.raise_login_required() if not video_data and '/watchparty/' in url: diff --git a/youtube_dl/extractor/formula1.py b/youtube_dl/extractor/formula1.py index fecfc28ae..67662e6de 100644 --- a/youtube_dl/extractor/formula1.py +++ b/youtube_dl/extractor/formula1.py @@ -5,29 +5,23 @@ from .common import InfoExtractor class Formula1IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?formula1\.com/(?:content/fom-website/)?en/video/\d{4}/\d{1,2}/(?P<id>.+?)\.html' - _TESTS = [{ - 'url': 'http://www.formula1.com/content/fom-website/en/video/2016/5/Race_highlights_-_Spain_2016.html', - 'md5': '8c79e54be72078b26b89e0e111c0502b', + _VALID_URL = r'https?://(?:www\.)?formula1\.com/en/latest/video\.[^.]+\.(?P<id>\d+)\.html' + _TEST = { + 'url': 'https://www.formula1.com/en/latest/video.race-highlights-spain-2016.6060988138001.html', + 'md5': 'be7d3a8c2f804eb2ab2aa5d941c359f8', 'info_dict': { - 'id': 'JvYXJpMzE6pArfHWm5ARp5AiUmD-gibV', + 'id': '6060988138001', 'ext': 'mp4', 'title': 'Race highlights - Spain 2016', + 'timestamp': 1463332814, + 'upload_date': '20160515', + 'uploader_id': '6057949432001', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'add_ie': ['Ooyala'], - }, { - 'url': 'http://www.formula1.com/en/video/2016/5/Race_highlights_-_Spain_2016.html', - 'only_matching': True, - }] + 'add_ie': ['BrightcoveNew'], + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/6057949432001/S1WMrhjlh_default/index.html?videoId=%s' def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - ooyala_embed_code = self._search_regex( - r'data-videoid="([^"]+)"', webpage, 'ooyala embed code') + bc_id = self._match_id(url) return self.url_result( - 'ooyala:%s' % ooyala_embed_code, 'Ooyala', ooyala_embed_code) + self.BRIGHTCOVE_URL_TEMPLATE % bc_id, 'BrightcoveNew', bc_id) diff --git a/youtube_dl/extractor/franceculture.py b/youtube_dl/extractor/franceculture.py index 306b45fc9..14f4cb489 100644 --- a/youtube_dl/extractor/franceculture.py +++ b/youtube_dl/extractor/franceculture.py @@ -11,7 +11,7 @@ from ..utils import ( class FranceCultureIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emissions/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks', 'info_dict': { 'id': 'rendez-vous-au-pays-des-geeks', @@ -20,10 +20,14 @@ class FranceCultureIE(InfoExtractor): 'title': 'Rendez-vous au pays des geeks', 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20140301', - 'timestamp': 1393642916, + 'timestamp': 1393700400, 'vcodec': 'none', } - } + }, { + # no thumbnail + 'url': 'https://www.franceculture.fr/emissions/la-recherche-montre-en-main/la-recherche-montre-en-main-du-mercredi-10-octobre-2018', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -36,19 +40,19 @@ class FranceCultureIE(InfoExtractor): </h1>| <div[^>]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*> ).*? - (<button[^>]+data-asset-source="[^"]+"[^>]+>) + (<button[^>]+data-(?:url|asset-source)="[^"]+"[^>]+>) ''', webpage, 'video data')) - video_url = video_data['data-asset-source'] - title = video_data.get('data-asset-title') or self._og_search_title(webpage) + video_url = video_data.get('data-url') or video_data['data-asset-source'] + title = video_data.get('data-asset-title') or video_data.get('data-diffusion-title') or self._og_search_title(webpage) description = self._html_search_regex( r'(?s)<div[^>]+class="intro"[^>]*>.*?<h2>(.+?)</h2>', webpage, 'description', default=None) thumbnail = self._search_regex( r'(?s)<figure[^>]+itemtype="https://schema.org/ImageObject"[^>]*>.*?<img[^>]+(?:data-dejavu-)?src="([^"]+)"', - webpage, 'thumbnail', fatal=False) + webpage, 'thumbnail', default=None) uploader = self._html_search_regex( r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None) @@ -64,6 +68,6 @@ class FranceCultureIE(InfoExtractor): 'ext': ext, 'vcodec': 'none' if ext == 'mp3' else None, 'uploader': uploader, - 'timestamp': int_or_none(video_data.get('data-asset-created-date')), + 'timestamp': int_or_none(video_data.get('data-start-time')) or int_or_none(video_data.get('data-asset-created-date')), 'duration': int_or_none(video_data.get('data-duration')), } diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 3ca415077..e4ec2e200 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -383,6 +383,10 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): }, { 'url': 'http://france3-regions.francetvinfo.fr/limousin/emissions/jt-1213-limousin', 'only_matching': True, + }, { + # "<figure id=" pattern (#28792) + 'url': 'https://www.francetvinfo.fr/culture/patrimoine/incendie-de-notre-dame-de-paris/notre-dame-de-paris-de-l-incendie-de-la-cathedrale-a-sa-reconstruction_4372291.html', + 'only_matching': True, }] def _real_extract(self, url): @@ -399,7 +403,8 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): video_id = self._search_regex( (r'player\.load[^;]+src:\s*["\']([^"\']+)', r'id-video=([^@]+@[^"]+)', - r'<a[^>]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"'), + r'<a[^>]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"', + r'(?:data-id|<figure[^<]+\bid)=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'), webpage, 'video id') return self._make_url_result(video_id) diff --git a/youtube_dl/extractor/fujitv.py b/youtube_dl/extractor/fujitv.py index 39685e075..a02a94374 100644 --- a/youtube_dl/extractor/fujitv.py +++ b/youtube_dl/extractor/fujitv.py @@ -17,7 +17,7 @@ class FujiTVFODPlus7IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) formats = self._extract_m3u8_formats( - self._BASE_URL + 'abr/pc_html5/%s.m3u8' % video_id, video_id) + self._BASE_URL + 'abr/pc_html5/%s.m3u8' % video_id, video_id, 'mp4') for f in formats: wh = self._BITRATE_MAP.get(f.get('tbr')) if wh: diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py index 8bbedca26..d8f1e169a 100644 --- a/youtube_dl/extractor/funimation.py +++ b/youtube_dl/extractor/funimation.py @@ -16,7 +16,7 @@ from ..utils import ( class FunimationIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?funimation(?:\.com|now\.uk)/shows/[^/]+/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?funimation(?:\.com|now\.uk)/(?:[^/]+/)?shows/[^/]+/(?P<id>[^/?#&]+)' _NETRC_MACHINE = 'funimation' _TOKEN = None @@ -51,6 +51,10 @@ class FunimationIE(InfoExtractor): }, { 'url': 'https://www.funimationnow.uk/shows/puzzle-dragons-x/drop-impact/simulcast/', 'only_matching': True, + }, { + # with lang code + 'url': 'https://www.funimation.com/en/shows/hacksign/role-play/', + 'only_matching': True, }] def _login(self): diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index 2f555c1d4..acc6478b8 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from .kaltura import KalturaIE from ..utils import ( HEADRequest, + remove_start, sanitized_Request, smuggle_url, urlencode_postdata, @@ -102,6 +103,26 @@ class GDCVaultIE(InfoExtractor): 'format': 'mp4-408', }, }, + { + # Kaltura embed, whitespace between quote and embedded URL in iframe's src + 'url': 'https://www.gdcvault.com/play/1025699', + 'info_dict': { + 'id': '0_zagynv0a', + 'ext': 'mp4', + 'title': 'Tech Toolbox', + 'upload_date': '20190408', + 'uploader_id': 'joe@blazestreaming.com', + 'timestamp': 1554764629, + }, + 'params': { + 'skip_download': True, + }, + }, + { + # HTML5 video + 'url': 'http://www.gdcvault.com/play/1014846/Conference-Keynote-Shigeru', + 'only_matching': True, + }, ] def _login(self, webpage_url, display_id): @@ -175,7 +196,18 @@ class GDCVaultIE(InfoExtractor): xml_name = self._html_search_regex( r'<iframe src=".*?\?xml(?:=|URL=xml/)(.+?\.xml).*?".*?</iframe>', - start_page, 'xml filename') + start_page, 'xml filename', default=None) + if not xml_name: + info = self._parse_html5_media_entries(url, start_page, video_id)[0] + info.update({ + 'title': remove_start(self._search_regex( + r'>Session Name:\s*<.*?>\s*<td>(.+?)</td>', start_page, + 'title', default=None) or self._og_search_title( + start_page, default=None), 'GDC Vault - '), + 'id': video_id, + 'display_id': display_id, + }) + return info embed_url = '%s/xml/%s' % (xml_root, xml_name) ie_key = 'DigitallySpeaking' diff --git a/youtube_dl/extractor/gedidigital.py b/youtube_dl/extractor/gedidigital.py new file mode 100644 index 000000000..6c4153b40 --- /dev/null +++ b/youtube_dl/extractor/gedidigital.py @@ -0,0 +1,161 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, +) + + +class GediDigitalIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://video\. + (?: + (?: + (?:espresso\.)?repubblica + |lastampa + |ilsecoloxix + )| + (?: + iltirreno + |messaggeroveneto + |ilpiccolo + |gazzettadimantova + |mattinopadova + |laprovinciapavese + |tribunatreviso + |nuovavenezia + |gazzettadimodena + |lanuovaferrara + |corrierealpi + |lasentinella + )\.gelocal + )\.it(?:/[^/]+){2,3}?/(?P<id>\d+)(?:[/?&#]|$)''' + _TESTS = [{ + 'url': 'https://video.lastampa.it/politica/il-paradosso-delle-regionali-la-lega-vince-ma-sembra-aver-perso/121559/121683', + 'md5': '84658d7fb9e55a6e57ecc77b73137494', + 'info_dict': { + 'id': '121559', + 'ext': 'mp4', + 'title': 'Il paradosso delle Regionali: ecco perché la Lega vince ma sembra aver perso', + 'description': 'md5:de7f4d6eaaaf36c153b599b10f8ce7ca', + 'thumbnail': r're:^https://www\.repstatic\.it/video/photo/.+?-thumb-full-.+?\.jpg$', + 'duration': 125, + }, + }, { + 'url': 'https://video.espresso.repubblica.it/embed/tutti-i-video/01-ted-villa/14772/14870&width=640&height=360', + 'only_matching': True, + }, { + 'url': 'https://video.repubblica.it/motori/record-della-pista-a-spa-francorchamps-la-pagani-huayra-roadster-bc-stupisce/367415/367963', + 'only_matching': True, + }, { + 'url': 'https://video.ilsecoloxix.it/sport/cassani-e-i-brividi-azzurri-ai-mondiali-di-imola-qui-mi-sono-innamorato-del-ciclismo-da-ragazzino-incredibile-tornarci-da-ct/66184/66267', + 'only_matching': True, + }, { + 'url': 'https://video.iltirreno.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/141059/142723', + 'only_matching': True, + }, { + 'url': 'https://video.messaggeroveneto.gelocal.it/locale/maria-giovanna-elmi-covid-vaccino/138155/139268', + 'only_matching': True, + }, { + 'url': 'https://video.ilpiccolo.gelocal.it/dossier/big-john/dinosauro-big-john-al-via-le-visite-guidate-a-trieste/135226/135751', + 'only_matching': True, + }, { + 'url': 'https://video.gazzettadimantova.gelocal.it/locale/dal-ponte-visconteo-di-valeggio-l-and-8217sos-dei-ristoratori-aprire-anche-a-cena/137310/137818', + 'only_matching': True, + }, { + 'url': 'https://video.mattinopadova.gelocal.it/dossier/coronavirus-in-veneto/covid-a-vo-un-anno-dopo-un-cuore-tricolore-per-non-dimenticare/138402/138964', + 'only_matching': True, + }, { + 'url': 'https://video.laprovinciapavese.gelocal.it/locale/mede-zona-rossa-via-alle-vaccinazioni-per-gli-over-80/137545/138120', + 'only_matching': True, + }, { + 'url': 'https://video.tribunatreviso.gelocal.it/dossier/coronavirus-in-veneto/ecco-le-prima-vaccinazioni-di-massa-nella-marca/134485/135024', + 'only_matching': True, + }, { + 'url': 'https://video.nuovavenezia.gelocal.it/locale/camion-troppo-alto-per-il-ponte-ferroviario-perde-il-carico/135734/136266', + 'only_matching': True, + }, { + 'url': 'https://video.gazzettadimodena.gelocal.it/locale/modena-scoperta-la-proteina-che-predice-il-livello-di-gravita-del-covid/139109/139796', + 'only_matching': True, + }, { + 'url': 'https://video.lanuovaferrara.gelocal.it/locale/due-bombole-di-gpl-aperte-e-abbandonate-i-vigili-bruciano-il-gas/134391/134957', + 'only_matching': True, + }, { + 'url': 'https://video.corrierealpi.gelocal.it/dossier/cortina-2021-i-mondiali-di-sci-alpino/mondiali-di-sci-il-timelapse-sulla-splendida-olympia/133760/134331', + 'only_matching': True, + }, { + 'url': 'https://video.lasentinella.gelocal.it/locale/vestigne-centra-un-auto-e-si-ribalta/138931/139466', + 'only_matching': True, + }, { + 'url': 'https://video.espresso.repubblica.it/tutti-i-video/01-ted-villa/14772', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + title = self._html_search_meta( + ['twitter:title', 'og:title'], webpage, fatal=True) + player_data = re.findall( + r"PlayerFactory\.setParam\('(?P<type>format|param)',\s*'(?P<name>[^']+)',\s*'(?P<val>[^']+)'\);", + webpage) + + formats = [] + duration = thumb = None + for t, n, v in player_data: + if t == 'format': + if n in ('video-hds-vod-ec', 'video-hls-vod-ec', 'video-viralize', 'video-youtube-pfp'): + continue + elif n.endswith('-vod-ak'): + formats.extend(self._extract_akamai_formats( + v, video_id, {'http': 'media.gedidigital.it'})) + else: + ext = determine_ext(v) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + v, video_id, 'mp4', 'm3u8_native', m3u8_id=n, fatal=False)) + continue + f = { + 'format_id': n, + 'url': v, + } + if ext == 'mp3': + abr = int_or_none(self._search_regex( + r'-mp3-audio-(\d+)', v, 'abr', default=None)) + f.update({ + 'abr': abr, + 'tbr': abr, + 'vcodec': 'none' + }) + else: + mobj = re.match(r'^video-rrtv-(\d+)(?:-(\d+))?$', n) + if mobj: + f.update({ + 'height': int(mobj.group(1)), + 'vbr': int_or_none(mobj.group(2)), + }) + if not f.get('vbr'): + f['vbr'] = int_or_none(self._search_regex( + r'-video-rrtv-(\d+)', v, 'abr', default=None)) + formats.append(f) + elif t == 'param': + if n in ['image_full', 'image']: + thumb = v + elif n == 'videoDuration': + duration = int_or_none(v) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': self._html_search_meta( + ['twitter:description', 'og:description', 'description'], webpage), + 'thumbnail': thumb or self._og_search_thumbnail(webpage), + 'formats': formats, + 'duration': duration, + } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 780971a92..a9c064105 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -84,7 +84,6 @@ from .jwplatform import JWPlatformIE from .digiteka import DigitekaIE from .arkena import ArkenaIE from .instagram import InstagramIE -from .liveleak import LiveLeakIE from .threeqsdn import ThreeQSDNIE from .theplatform import ThePlatformIE from .kaltura import KalturaIE @@ -126,8 +125,11 @@ from .viqeo import ViqeoIE from .expressen import ExpressenIE from .zype import ZypeIE from .odnoklassniki import OdnoklassnikiIE +from .vk import VKIE from .kinja import KinjaEmbedIE from .arcpublishing import ArcPublishingIE +from .medialaan import MedialaanIE +from .simplecast import SimplecastIE class GenericIE(InfoExtractor): @@ -1626,31 +1628,6 @@ class GenericIE(InfoExtractor): 'upload_date': '20160409', }, }, - # LiveLeak embed - { - 'url': 'http://www.wykop.pl/link/3088787/', - 'md5': '7619da8c820e835bef21a1efa2a0fc71', - 'info_dict': { - 'id': '874_1459135191', - 'ext': 'mp4', - 'title': 'Man shows poor quality of new apartment building', - 'description': 'The wall is like a sand pile.', - 'uploader': 'Lake8737', - }, - 'add_ie': [LiveLeakIE.ie_key()], - }, - # Another LiveLeak embed pattern (#13336) - { - 'url': 'https://milo.yiannopoulos.net/2017/06/concealed-carry-robbery/', - 'info_dict': { - 'id': '2eb_1496309988', - 'ext': 'mp4', - 'title': 'Thief robs place where everyone was armed', - 'description': 'md5:694d73ee79e535953cf2488562288eee', - 'uploader': 'brazilwtf', - }, - 'add_ie': [LiveLeakIE.ie_key()], - }, # Duplicated embedded video URLs { 'url': 'http://www.hudl.com/athlete/2538180/highlights/149298443', @@ -2223,6 +2200,34 @@ class GenericIE(InfoExtractor): 'duration': 1581, }, }, + { + # MyChannels SDK embed + # https://www.24kitchen.nl/populair/deskundige-dit-waarom-sommigen-gevoelig-zijn-voor-voedselallergieen + 'url': 'https://www.demorgen.be/nieuws/burgemeester-rotterdam-richt-zich-in-videoboodschap-tot-relschoppers-voelt-het-goed~b0bcfd741/', + 'md5': '90c0699c37006ef18e198c032d81739c', + 'info_dict': { + 'id': '194165', + 'ext': 'mp4', + 'title': 'Burgemeester Aboutaleb spreekt relschoppers toe', + 'timestamp': 1611740340, + 'upload_date': '20210127', + 'duration': 159, + }, + }, + { + # Simplecast player embed + 'url': 'https://www.bio.org/podcast', + 'info_dict': { + 'id': 'podcast', + 'title': 'I AM BIO Podcast | BIO', + }, + 'playlist_mincount': 52, + }, + { + # Sibnet embed (https://help.sibnet.ru/?sibnet_video_embed) + 'url': 'https://phpbb3.x-tk.ru/bbcode-video-sibnet-t24.html', + 'only_matching': True, + }, ] def report_following_redirect(self, new_url): @@ -2462,6 +2467,9 @@ class GenericIE(InfoExtractor): webpage = self._webpage_read_content( full_response, url, video_id, prefix=first_bytes) + if '<title>DPG Media Privacy Gate' in webpage: + webpage = self._download_webpage(url, video_id) + self.report_extraction(video_id) # Is it an RSS feed, a SMIL file, an XSPF playlist or a MPD manifest? @@ -2593,6 +2601,11 @@ class GenericIE(InfoExtractor): if arc_urls: return self.playlist_from_matches(arc_urls, video_id, video_title, ie=ArcPublishingIE.ie_key()) + mychannels_urls = MedialaanIE._extract_urls(webpage) + if mychannels_urls: + return self.playlist_from_matches( + mychannels_urls, video_id, video_title, ie=MedialaanIE.ie_key()) + # Look for embedded rtl.nl player matches = re.findall( r']+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"', @@ -2744,6 +2757,11 @@ class GenericIE(InfoExtractor): if odnoklassniki_url: return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key()) + # Look for sibnet embedded player + sibnet_urls = VKIE._extract_sibnet_urls(webpage) + if sibnet_urls: + return self.playlist_from_matches(sibnet_urls, video_id, video_title) + # Look for embedded ivi player mobj = re.search(r']+?src=(["\'])(?Phttps?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage) if mobj is not None: @@ -2769,6 +2787,12 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie') + # Look for Simplecast embeds + simplecast_urls = SimplecastIE._extract_urls(webpage) + if simplecast_urls: + return self.playlist_from_matches( + simplecast_urls, video_id, video_title) + # Look for BBC iPlayer embed matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage) if matches: @@ -2914,7 +2938,7 @@ class GenericIE(InfoExtractor): webpage) if not mobj: mobj = re.search( - r'data-video-link=["\'](?Phttp://m.mlb.com/video/[^"\']+)', + r'data-video-link=["\'](?Phttp://m\.mlb\.com/video/[^"\']+)', webpage) if mobj is not None: return self.url_result(mobj.group('url'), 'MLB') @@ -3129,11 +3153,6 @@ class GenericIE(InfoExtractor): return self.url_result( self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key()) - # Look for LiveLeak embeds - liveleak_urls = LiveLeakIE._extract_urls(webpage) - if liveleak_urls: - return self.playlist_from_matches(liveleak_urls, video_id, video_title) - # Look for 3Q SDN embeds threeqsdn_url = ThreeQSDNIE._extract_url(webpage) if threeqsdn_url: @@ -3361,6 +3380,9 @@ class GenericIE(InfoExtractor): 'url': src, 'ext': (mimetype2ext(src_type) or ext if ext in KNOWN_EXTENSIONS else 'mp4'), + 'http_headers': { + 'Referer': full_response.geturl(), + }, }) if formats: self._sort_formats(formats) @@ -3429,7 +3451,7 @@ class GenericIE(InfoExtractor): m_video_type = re.findall(r'videos|show_videos|articles|feature|(?:[^/]+/\d+/video))(/.+)?/(?P.+)' + _VALID_URL = r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos/(?:\d{4}/\d{2}/\d{2}/)?(?P[^/?&#]+)' IE_NAME = 'ign.com' + _PAGE_TYPE = 'video' - _API_URL_TEMPLATE = 'http://apis.ign.com/video/v3/videos/%s' - _EMBED_RE = r']+?["\']((?:https?:)?//.+?\.ign\.com.+?/embed.+?)["\']' - - _TESTS = [ - { - 'url': 'http://www.ign.com/videos/2013/06/05/the-last-of-us-review', - 'md5': 'febda82c4bafecd2d44b6e1a18a595f8', - 'info_dict': { - 'id': '8f862beef863986b2785559b9e1aa599', - 'ext': 'mp4', - 'title': 'The Last of Us Review', - 'description': 'md5:c8946d4260a4d43a00d5ae8ed998870c', - 'timestamp': 1370440800, - 'upload_date': '20130605', - 'uploader_id': 'cberidon@ign.com', - } - }, - { - 'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind', - 'info_dict': { - 'id': '100-little-things-in-gta-5-that-will-blow-your-mind', - }, - 'playlist': [ - { - 'info_dict': { - 'id': '5ebbd138523268b93c9141af17bec937', - 'ext': 'mp4', - 'title': 'GTA 5 Video Review', - 'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.', - 'timestamp': 1379339880, - 'upload_date': '20130916', - 'uploader_id': 'danieljkrupa@gmail.com', - }, - }, - { - 'info_dict': { - 'id': '638672ee848ae4ff108df2a296418ee2', - 'ext': 'mp4', - 'title': '26 Twisted Moments from GTA 5 in Slow Motion', - 'description': 'The twisted beauty of GTA 5 in stunning slow motion.', - 'timestamp': 1386878820, - 'upload_date': '20131212', - 'uploader_id': 'togilvie@ign.com', - }, - }, - ], - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch', - 'md5': '618fedb9c901fd086f6f093564ef8558', - 'info_dict': { - 'id': '078fdd005f6d3c02f63d795faa1b984f', - 'ext': 'mp4', - 'title': 'Rewind Theater - Wild Trailer Gamescom 2014', - 'description': 'Brian and Jared explore Michel Ancel\'s captivating new preview.', - 'timestamp': 1408047180, - 'upload_date': '20140814', - 'uploader_id': 'jamesduggan1990@gmail.com', - }, - }, - { - 'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s', - 'only_matching': True, - }, - { - 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds', - 'only_matching': True, - }, - { - # videoId pattern - 'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned', - 'only_matching': True, - }, - ] - - def _find_video_id(self, webpage): - res_id = [ - r'"video_id"\s*:\s*"(.*?)"', - r'class="hero-poster[^"]*?"[^>]*id="(.+?)"', - r'data-video-id="(.+?)"', - r']*value="[^"]*?url=(https?://www\.ign\.com/videos/.*?)["&]', - webpage) - if multiple_urls: - entries = [self.url_result(u, ie='IGN') for u in multiple_urls] - return { - '_type': 'playlist', - 'id': name_or_id, - 'entries': entries, - } - - video_id = self._find_video_id(webpage) - if not video_id: - return self.url_result(self._search_regex( - self._EMBED_RE, webpage, 'embed url')) - return self._get_video_info(video_id) - - def _get_video_info(self, video_id): - api_data = self._download_json( - self._API_URL_TEMPLATE % video_id, video_id) + display_id = self._match_id(url) + video = self._call_api(display_id) + video_id = video['videoId'] + metadata = video['metadata'] + title = metadata.get('longTitle') or metadata.get('title') or metadata['name'] formats = [] - m3u8_url = api_data['refs'].get('m3uUrl') + refs = video.get('refs') or {} + + m3u8_url = refs.get('m3uUrl') if m3u8_url: formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - f4m_url = api_data['refs'].get('f4mUrl') + + f4m_url = refs.get('f4mUrl') if f4m_url: formats.extend(self._extract_f4m_formats( f4m_url, video_id, f4m_id='hds', fatal=False)) - for asset in api_data['assets']: + + for asset in (video.get('assets') or []): + asset_url = asset.get('url') + if not asset_url: + continue formats.append({ - 'url': asset['url'], - 'tbr': asset.get('actual_bitrate_kbps'), - 'fps': asset.get('frame_rate'), + 'url': asset_url, + 'tbr': int_or_none(asset.get('bitrate'), 1000), + 'fps': int_or_none(asset.get('frame_rate')), 'height': int_or_none(asset.get('height')), 'width': int_or_none(asset.get('width')), }) + + mezzanine_url = try_get(video, lambda x: x['system']['mezzanineUrl']) + if mezzanine_url: + formats.append({ + 'ext': determine_ext(mezzanine_url, 'mp4'), + 'format_id': 'mezzanine', + 'preference': 1, + 'url': mezzanine_url, + }) + self._sort_formats(formats) - thumbnails = [{ - 'url': thumbnail['url'] - } for thumbnail in api_data.get('thumbnails', [])] + thumbnails = [] + for thumbnail in (video.get('thumbnails') or []): + thumbnail_url = thumbnail.get('url') + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + }) - metadata = api_data['metadata'] + tags = [] + for tag in (video.get('tags') or []): + display_name = tag.get('displayName') + if not display_name: + continue + tags.append(display_name) return { - 'id': api_data.get('videoId') or video_id, - 'title': metadata.get('longTitle') or metadata.get('name') or metadata.get['title'], - 'description': metadata.get('description'), + 'id': video_id, + 'title': title, + 'description': strip_or_none(metadata.get('description')), 'timestamp': parse_iso8601(metadata.get('publishDate')), 'duration': int_or_none(metadata.get('duration')), - 'display_id': metadata.get('slug') or video_id, - 'uploader_id': metadata.get('creator'), + 'display_id': display_id, 'thumbnails': thumbnails, 'formats': formats, + 'tags': tags, } -class OneUPIE(IGNIE): - _VALID_URL = r'https?://gamevideos\.1up\.com/(?Pvideo)/id/(?P.+)\.html' - IE_NAME = '1up.com' - +class IGNVideoIE(InfoExtractor): + _VALID_URL = r'https?://.+?\.ign\.com/(?:[a-z]{2}/)?[^/]+/(?P\d+)/(?:video|trailer)/' _TESTS = [{ - 'url': 'http://gamevideos.1up.com/video/id/34976.html', - 'md5': 'c9cc69e07acb675c31a16719f909e347', + 'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s', + 'md5': 'dd9aca7ed2657c4e118d8b261e5e9de1', 'info_dict': { - 'id': '34976', + 'id': 'e9be7ea899a9bbfc0674accc22a36cc8', 'ext': 'mp4', - 'title': 'Sniper Elite V2 - Trailer', - 'description': 'md5:bf0516c5ee32a3217aa703e9b1bc7826', - 'timestamp': 1313099220, - 'upload_date': '20110811', - 'uploader_id': 'IGN', + 'title': 'How Hitman Aims to Be Different Than Every Other Stealth Game - NYCC 2015', + 'description': 'Taking out assassination targets in Hitman has never been more stylish.', + 'timestamp': 1444665600, + 'upload_date': '20151012', } + }, { + 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds', + 'only_matching': True, + }, { + # Youtube embed + 'url': 'https://me.ign.com/ar/ratchet-clank-rift-apart/144327/trailer/embed', + 'only_matching': True, + }, { + # Twitter embed + 'url': 'http://adria.ign.com/sherlock-season-4/9687/trailer/embed', + 'only_matching': True, + }, { + # Vimeo embed + 'url': 'https://kr.ign.com/bic-2018/3307/trailer/embed', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - result = super(OneUPIE, self)._real_extract(url) - result['id'] = mobj.group('name_or_id') - return result + video_id = self._match_id(url) + req = HEADRequest(url.rsplit('/', 1)[0] + '/embed') + url = self._request_webpage(req, video_id).geturl() + ign_url = compat_parse_qs( + compat_urllib_parse_urlparse(url).query).get('url', [None])[0] + if ign_url: + return self.url_result(ign_url, IGNIE.ie_key()) + return self.url_result(url) -class PCMagIE(IGNIE): - _VALID_URL = r'https?://(?:www\.)?pcmag\.com/(?Pvideos|article2)(/.+)?/(?P.+)' - IE_NAME = 'pcmag' - - _EMBED_RE = r'iframe\.setAttribute\("src",\s*__util.objToUrlString\("http://widgets\.ign\.com/video/embed/content\.html?[^"]*url=([^"]+)["&]' - +class IGNArticleIE(IGNBaseIE): + _VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?feature/\d+)/(?P[^/?&#]+)' + _PAGE_TYPE = 'article' _TESTS = [{ - 'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data', - 'md5': '212d6154fd0361a2781075f1febbe9ad', + 'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind', 'info_dict': { - 'id': 'ee10d774b508c9b8ec07e763b9125b91', - 'ext': 'mp4', - 'title': '010615_What\'s New Now: Is GoGo Snooping on Your Data?', - 'description': 'md5:a7071ae64d2f68cc821c729d4ded6bb3', - 'timestamp': 1420571160, - 'upload_date': '20150106', - 'uploader_id': 'cozzipix@gmail.com', - } + 'id': '524497489e4e8ff5848ece34', + 'title': '100 Little Things in GTA 5 That Will Blow Your Mind', + }, + 'playlist': [ + { + 'info_dict': { + 'id': '5ebbd138523268b93c9141af17bec937', + 'ext': 'mp4', + 'title': 'GTA 5 Video Review', + 'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.', + 'timestamp': 1379339880, + 'upload_date': '20130916', + }, + }, + { + 'info_dict': { + 'id': '638672ee848ae4ff108df2a296418ee2', + 'ext': 'mp4', + 'title': '26 Twisted Moments from GTA 5 in Slow Motion', + 'description': 'The twisted beauty of GTA 5 in stunning slow motion.', + 'timestamp': 1386878820, + 'upload_date': '20131212', + }, + }, + ], + 'params': { + 'playlist_items': '2-3', + 'skip_download': True, + }, }, { - 'url': 'http://www.pcmag.com/article2/0,2817,2470156,00.asp', - 'md5': '94130c1ca07ba0adb6088350681f16c1', + 'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch', 'info_dict': { - 'id': '042e560ba94823d43afcb12ddf7142ca', - 'ext': 'mp4', - 'title': 'HTC\'s Weird New Re Camera - What\'s New Now', - 'description': 'md5:53433c45df96d2ea5d0fda18be2ca908', - 'timestamp': 1412953920, - 'upload_date': '20141010', - 'uploader_id': 'chris_snyder@pcmag.com', - } + 'id': '53ee806780a81ec46e0790f8', + 'title': 'Rewind Theater - Wild Trailer Gamescom 2014', + }, + 'playlist_count': 2, + }, { + # videoId pattern + 'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned', + 'only_matching': True, + }, { + # Youtube embed + 'url': 'https://www.ign.com/articles/2021-mvp-named-in-puppy-bowl-xvii', + 'only_matching': True, + }, { + # IMDB embed + 'url': 'https://www.ign.com/articles/2014/08/07/sons-of-anarchy-final-season-trailer', + 'only_matching': True, + }, { + # Facebook embed + 'url': 'https://www.ign.com/articles/2017/09/20/marvels-the-punisher-watch-the-new-trailer-for-the-netflix-series', + 'only_matching': True, + }, { + # Brightcove embed + 'url': 'https://www.ign.com/articles/2016/01/16/supergirl-goes-flying-with-martian-manhunter-in-new-clip', + 'only_matching': True, }] + + def _real_extract(self, url): + display_id = self._match_id(url) + article = self._call_api(display_id) + + def entries(): + media_url = try_get(article, lambda x: x['mediaRelations'][0]['media']['metadata']['url']) + if media_url: + yield self.url_result(media_url, IGNIE.ie_key()) + for content in (article.get('content') or []): + for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|]+src)="([^"]+)"', content): + yield self.url_result(video_url) + + return self.playlist_result( + entries(), article.get('articleId'), + strip_or_none(try_get(article, lambda x: x['metadata']['headline']))) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 1eeddc3b6..12e10143c 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -12,6 +12,7 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + float_or_none, get_element_by_attribute, int_or_none, lowercase_escape, @@ -32,6 +33,7 @@ class InstagramIE(InfoExtractor): 'title': 'Video by naomipq', 'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8', 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 0, 'timestamp': 1371748545, 'upload_date': '20130620', 'uploader_id': 'naomipq', @@ -48,6 +50,7 @@ class InstagramIE(InfoExtractor): 'ext': 'mp4', 'title': 'Video by britneyspears', 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 0, 'timestamp': 1453760977, 'upload_date': '20160125', 'uploader_id': 'britneyspears', @@ -86,6 +89,24 @@ class InstagramIE(InfoExtractor): 'title': 'Post by instagram', 'description': 'md5:0f9203fc6a2ce4d228da5754bcf54957', }, + }, { + # IGTV + 'url': 'https://www.instagram.com/tv/BkfuX9UB-eK/', + 'info_dict': { + 'id': 'BkfuX9UB-eK', + 'ext': 'mp4', + 'title': 'Fingerboarding Tricks with @cass.fb', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 53.83, + 'timestamp': 1530032919, + 'upload_date': '20180626', + 'uploader_id': 'instagram', + 'uploader': 'Instagram', + 'like_count': int, + 'comment_count': int, + 'comments': list, + 'description': 'Meet Cass Hirst (@cass.fb), a fingerboarding pro who can perform tiny ollies and kickflips while blindfolded.', + } }, { 'url': 'https://instagram.com/p/-Cmh1cukG2/', 'only_matching': True, @@ -159,7 +180,9 @@ class InstagramIE(InfoExtractor): description = try_get( media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], compat_str) or media.get('caption') + title = media.get('title') thumbnail = media.get('display_src') or media.get('display_url') + duration = float_or_none(media.get('video_duration')) timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date')) uploader = media.get('owner', {}).get('full_name') uploader_id = media.get('owner', {}).get('username') @@ -200,9 +223,10 @@ class InstagramIE(InfoExtractor): continue entries.append({ 'id': node.get('shortcode') or node['id'], - 'title': 'Video %d' % edge_num, + 'title': node.get('title') or 'Video %d' % edge_num, 'url': node_video_url, 'thumbnail': node.get('display_url'), + 'duration': float_or_none(node.get('video_duration')), 'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])), 'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])), 'view_count': int_or_none(node.get('video_view_count')), @@ -239,8 +263,9 @@ class InstagramIE(InfoExtractor): 'id': video_id, 'formats': formats, 'ext': 'mp4', - 'title': 'Video by %s' % uploader_id, + 'title': title or 'Video by %s' % uploader_id, 'description': description, + 'duration': duration, 'thumbnail': thumbnail, 'timestamp': timestamp, 'uploader_id': uploader_id, diff --git a/youtube_dl/extractor/jamendo.py b/youtube_dl/extractor/jamendo.py index 490efa8fb..1db7c64af 100644 --- a/youtube_dl/extractor/jamendo.py +++ b/youtube_dl/extractor/jamendo.py @@ -29,34 +29,51 @@ class JamendoIE(InfoExtractor): 'id': '196219', 'display_id': 'stories-from-emona-i', 'ext': 'flac', - 'title': 'Maya Filipič - Stories from Emona I', - 'artist': 'Maya Filipič', + # 'title': 'Maya Filipič - Stories from Emona I', + 'title': 'Stories from Emona I', + # 'artist': 'Maya Filipič', 'track': 'Stories from Emona I', 'duration': 210, 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1217438117, 'upload_date': '20080730', + 'license': 'by-nc-nd', + 'view_count': int, + 'like_count': int, + 'average_rating': int, + 'tags': ['piano', 'peaceful', 'newage', 'strings', 'upbeat'], } }, { 'url': 'https://licensing.jamendo.com/en/track/1496667/energetic-rock', 'only_matching': True, }] + def _call_api(self, resource, resource_id): + path = '/api/%ss' % resource + rand = compat_str(random.random()) + return self._download_json( + 'https://www.jamendo.com' + path, resource_id, query={ + 'id[]': resource_id, + }, headers={ + 'X-Jam-Call': '$%s*%s~' % (hashlib.sha1((path + rand).encode()).hexdigest(), rand) + })[0] + def _real_extract(self, url): track_id, display_id = self._VALID_URL_RE.match(url).groups() - webpage = self._download_webpage( - 'https://www.jamendo.com/track/' + track_id, track_id) - models = self._parse_json(self._html_search_regex( - r"data-bundled-models='([^']+)", - webpage, 'bundled models'), track_id) - track = models['track']['models'][0] + # webpage = self._download_webpage( + # 'https://www.jamendo.com/track/' + track_id, track_id) + # models = self._parse_json(self._html_search_regex( + # r"data-bundled-models='([^']+)", + # webpage, 'bundled models'), track_id) + # track = models['track']['models'][0] + track = self._call_api('track', track_id) title = track_name = track['name'] - get_model = lambda x: try_get(models, lambda y: y[x]['models'][0], dict) or {} - artist = get_model('artist') - artist_name = artist.get('name') - if artist_name: - title = '%s - %s' % (artist_name, title) - album = get_model('album') + # get_model = lambda x: try_get(models, lambda y: y[x]['models'][0], dict) or {} + # artist = get_model('artist') + # artist_name = artist.get('name') + # if artist_name: + # title = '%s - %s' % (artist_name, title) + # album = get_model('album') formats = [{ 'url': 'https://%s.jamendo.com/?trackid=%s&format=%s&from=app-97dab294' @@ -74,7 +91,7 @@ class JamendoIE(InfoExtractor): urls = [] thumbnails = [] - for _, covers in track.get('cover', {}).items(): + for covers in (track.get('cover') or {}).values(): for cover_id, cover_url in covers.items(): if not cover_url or cover_url in urls: continue @@ -88,13 +105,14 @@ class JamendoIE(InfoExtractor): }) tags = [] - for tag in track.get('tags', []): + for tag in (track.get('tags') or []): tag_name = tag.get('name') if not tag_name: continue tags.append(tag_name) stats = track.get('stats') or {} + license = track.get('licenseCC') or [] return { 'id': track_id, @@ -103,11 +121,11 @@ class JamendoIE(InfoExtractor): 'title': title, 'description': track.get('description'), 'duration': int_or_none(track.get('duration')), - 'artist': artist_name, + # 'artist': artist_name, 'track': track_name, - 'album': album.get('name'), + # 'album': album.get('name'), 'formats': formats, - 'license': '-'.join(track.get('licenseCC', [])) or None, + 'license': '-'.join(license) if license else None, 'timestamp': int_or_none(track.get('dateCreated')), 'view_count': int_or_none(stats.get('listenedAll')), 'like_count': int_or_none(stats.get('favorited')), @@ -116,9 +134,9 @@ class JamendoIE(InfoExtractor): } -class JamendoAlbumIE(InfoExtractor): +class JamendoAlbumIE(JamendoIE): _VALID_URL = r'https?://(?:www\.)?jamendo\.com/album/(?P[0-9]+)' - _TEST = { + _TESTS = [{ 'url': 'https://www.jamendo.com/album/121486/duck-on-cover', 'info_dict': { 'id': '121486', @@ -151,17 +169,7 @@ class JamendoAlbumIE(InfoExtractor): 'params': { 'playlistend': 2 } - } - - def _call_api(self, resource, resource_id): - path = '/api/%ss' % resource - rand = compat_str(random.random()) - return self._download_json( - 'https://www.jamendo.com' + path, resource_id, query={ - 'id[]': resource_id, - }, headers={ - 'X-Jam-Call': '$%s*%s~' % (hashlib.sha1((path + rand).encode()).hexdigest(), rand) - })[0] + }] def _real_extract(self, url): album_id = self._match_id(url) @@ -169,7 +177,7 @@ class JamendoAlbumIE(InfoExtractor): album_name = album.get('name') entries = [] - for track in album.get('tracks', []): + for track in (album.get('tracks') or []): track_id = track.get('id') if not track_id: continue diff --git a/youtube_dl/extractor/kakao.py b/youtube_dl/extractor/kakao.py index 32935bb28..31ce7a85c 100644 --- a/youtube_dl/extractor/kakao.py +++ b/youtube_dl/extractor/kakao.py @@ -3,10 +3,13 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_str +from ..compat import compat_HTTPError from ..utils import ( + ExtractorError, int_or_none, + str_or_none, strip_or_none, + try_get, unified_timestamp, update_url_query, ) @@ -23,7 +26,7 @@ class KakaoIE(InfoExtractor): 'id': '301965083', 'ext': 'mp4', 'title': '乃木坂46 バナナマン 「3期生紹介コーナーが始動!顔高低差GPも!」 『乃木坂工事中』', - 'uploader_id': 2671005, + 'uploader_id': '2671005', 'uploader': '그랑그랑이', 'timestamp': 1488160199, 'upload_date': '20170227', @@ -36,11 +39,15 @@ class KakaoIE(InfoExtractor): 'ext': 'mp4', 'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\r\n\r\n[쇼! 음악중심] 20160611, 507회', 'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)', - 'uploader_id': 2653210, + 'uploader_id': '2653210', 'uploader': '쇼! 음악중심', 'timestamp': 1485684628, 'upload_date': '20170129', } + }, { + # geo restricted + 'url': 'https://tv.kakao.com/channel/3643855/cliplink/412069491', + 'only_matching': True, }] def _real_extract(self, url): @@ -68,8 +75,7 @@ class KakaoIE(InfoExtractor): 'fields': ','.join([ '-*', 'tid', 'clipLink', 'displayTitle', 'clip', 'title', 'description', 'channelId', 'createTime', 'duration', 'playCount', - 'likeCount', 'commentCount', 'tagList', 'channel', 'name', - 'clipChapterThumbnailList', 'thumbnailUrl', 'timeInSec', 'isDefault', + 'likeCount', 'commentCount', 'tagList', 'channel', 'name', 'thumbnailUrl', 'videoOutputList', 'width', 'height', 'kbps', 'profile', 'label']) } @@ -82,24 +88,28 @@ class KakaoIE(InfoExtractor): title = clip.get('title') or clip_link.get('displayTitle') - query['tid'] = impress.get('tid', '') + query.update({ + 'fields': '-*,code,message,url', + 'tid': impress.get('tid') or '', + }) formats = [] - for fmt in clip.get('videoOutputList', []): + for fmt in (clip.get('videoOutputList') or []): try: profile_name = fmt['profile'] if profile_name == 'AUDIO': continue - query.update({ - 'profile': profile_name, - 'fields': '-*,url', - }) - fmt_url_json = self._download_json( - api_base + 'raw/videolocation', display_id, - 'Downloading video URL for profile %s' % profile_name, - query=query, headers=player_header, fatal=False) - - if fmt_url_json is None: + query['profile'] = profile_name + try: + fmt_url_json = self._download_json( + api_base + 'raw/videolocation', display_id, + 'Downloading video URL for profile %s' % profile_name, + query=query, headers=player_header) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + resp = self._parse_json(e.cause.read().decode(), video_id) + if resp.get('code') == 'GeoBlocked': + self.raise_geo_restricted() continue fmt_url = fmt_url_json['url'] @@ -116,27 +126,13 @@ class KakaoIE(InfoExtractor): pass self._sort_formats(formats) - thumbs = [] - for thumb in clip.get('clipChapterThumbnailList', []): - thumbs.append({ - 'url': thumb.get('thumbnailUrl'), - 'id': compat_str(thumb.get('timeInSec')), - 'preference': -1 if thumb.get('isDefault') else 0 - }) - top_thumbnail = clip.get('thumbnailUrl') - if top_thumbnail: - thumbs.append({ - 'url': top_thumbnail, - 'preference': 10, - }) - return { 'id': display_id, 'title': title, 'description': strip_or_none(clip.get('description')), - 'uploader': clip_link.get('channel', {}).get('name'), - 'uploader_id': clip_link.get('channelId'), - 'thumbnails': thumbs, + 'uploader': try_get(clip_link, lambda x: x['channel']['name']), + 'uploader_id': str_or_none(clip_link.get('channelId')), + 'thumbnail': clip.get('thumbnailUrl'), 'timestamp': unified_timestamp(clip_link.get('createTime')), 'duration': int_or_none(clip.get('duration')), 'view_count': int_or_none(clip.get('playCount')), diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 49d13460d..c731612c4 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -120,7 +120,7 @@ class KalturaIE(InfoExtractor): def _extract_urls(webpage): # Embed codes: https://knowledge.kaltura.com/embedding-kaltura-media-players-your-site finditer = ( - re.finditer( + list(re.finditer( r"""(?xs) kWidget\.(?:thumb)?[Ee]mbed\( \{.*? @@ -128,8 +128,8 @@ class KalturaIE(InfoExtractor): (?P['"])_?(?P(?:(?!(?P=q2)).)+)(?P=q2),.*? (?P['"])entry_?[Ii]d(?P=q3)\s*:\s* (?P['"])(?P(?:(?!(?P=q4)).)+)(?P=q4)(?:,|\s*\}) - """, webpage) - or re.finditer( + """, webpage)) + or list(re.finditer( r'''(?xs) (?P["']) (?:https?:)?//cdnapi(?:sec)?\.kaltura\.com(?::\d+)?/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P\d+)(?:(?!(?P=q1)).)* @@ -142,16 +142,16 @@ class KalturaIE(InfoExtractor): \[\s*(?P["'])entry_?[Ii]d(?P=q2_1)\s*\]\s*=\s* ) (?P["'])(?P(?:(?!(?P=q3)).)+)(?P=q3) - ''', webpage) - or re.finditer( + ''', webpage)) + or list(re.finditer( r'''(?xs) - <(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P["']) + <(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P["'])\s* (?:https?:)?//(?:(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P\d+) (?:(?!(?P=q1)).)* [?&;]entry_id=(?P(?:(?!(?P=q1))[^&])+) (?:(?!(?P=q1)).)* (?P=q1) - ''', webpage) + ''', webpage)) ) urls = [] for mobj in finditer: diff --git a/youtube_dl/extractor/khanacademy.py b/youtube_dl/extractor/khanacademy.py index 61739efa7..87e520378 100644 --- a/youtube_dl/extractor/khanacademy.py +++ b/youtube_dl/extractor/khanacademy.py @@ -1,82 +1,107 @@ from __future__ import unicode_literals -import re +import json from .common import InfoExtractor from ..utils import ( - unified_strdate, + int_or_none, + parse_iso8601, + try_get, ) -class KhanAcademyIE(InfoExtractor): - _VALID_URL = r'^https?://(?:(?:www|api)\.)?khanacademy\.org/(?P[^/]+)/(?:[^/]+/){,2}(?P[^?#/]+)(?:$|[?#])' - IE_NAME = 'KhanAcademy' +class KhanAcademyBaseIE(InfoExtractor): + _VALID_URL_TEMPL = r'https?://(?:www\.)?khanacademy\.org/(?P(?:[^/]+/){%s}%s[^?#/&]+)' - _TESTS = [{ - 'url': 'http://www.khanacademy.org/video/one-time-pad', - 'md5': '7b391cce85e758fb94f763ddc1bbb979', + def _parse_video(self, video): + return { + '_type': 'url_transparent', + 'url': video['youtubeId'], + 'id': video.get('slug'), + 'title': video.get('title'), + 'thumbnail': video.get('imageUrl') or video.get('thumbnailUrl'), + 'duration': int_or_none(video.get('duration')), + 'description': video.get('description'), + 'ie_key': 'Youtube', + } + + def _real_extract(self, url): + display_id = self._match_id(url) + component_props = self._parse_json(self._download_json( + 'https://www.khanacademy.org/api/internal/graphql', + display_id, query={ + 'hash': 1604303425, + 'variables': json.dumps({ + 'path': display_id, + 'queryParams': '', + }), + })['data']['contentJson'], display_id)['componentProps'] + return self._parse_component_props(component_props) + + +class KhanAcademyIE(KhanAcademyBaseIE): + IE_NAME = 'khanacademy' + _VALID_URL = KhanAcademyBaseIE._VALID_URL_TEMPL % ('4', 'v/') + _TEST = { + 'url': 'https://www.khanacademy.org/computing/computer-science/cryptography/crypt/v/one-time-pad', + 'md5': '9c84b7b06f9ebb80d22a5c8dedefb9a0', 'info_dict': { - 'id': 'one-time-pad', - 'ext': 'webm', + 'id': 'FlIG3TvQCBQ', + 'ext': 'mp4', 'title': 'The one-time pad', 'description': 'The perfect cipher', 'duration': 176, 'uploader': 'Brit Cruise', 'uploader_id': 'khanacademy', 'upload_date': '20120411', + 'timestamp': 1334170113, + 'license': 'cc-by-nc-sa', }, 'add_ie': ['Youtube'], - }, { - 'url': 'https://www.khanacademy.org/math/applied-math/cryptography', + } + + def _parse_component_props(self, component_props): + video = component_props['tutorialPageData']['contentModel'] + info = self._parse_video(video) + author_names = video.get('authorNames') + info.update({ + 'uploader': ', '.join(author_names) if author_names else None, + 'timestamp': parse_iso8601(video.get('dateAdded')), + 'license': video.get('kaUserLicense'), + }) + return info + + +class KhanAcademyUnitIE(KhanAcademyBaseIE): + IE_NAME = 'khanacademy:unit' + _VALID_URL = (KhanAcademyBaseIE._VALID_URL_TEMPL % ('2', '')) + '/?(?:[?#&]|$)' + _TEST = { + 'url': 'https://www.khanacademy.org/computing/computer-science/cryptography', 'info_dict': { 'id': 'cryptography', - 'title': 'Journey into cryptography', + 'title': 'Cryptography', 'description': 'How have humans protected their secret messages through history? What has changed today?', }, - 'playlist_mincount': 3, - }] + 'playlist_mincount': 31, + } - def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('id') + def _parse_component_props(self, component_props): + curation = component_props['curation'] - if m.group('key') == 'video': - data = self._download_json( - 'http://api.khanacademy.org/api/v1/videos/' + video_id, - video_id, 'Downloading video info') - - upload_date = unified_strdate(data['date_added']) - uploader = ', '.join(data['author_names']) - return { - '_type': 'url_transparent', - 'url': data['url'], - 'id': video_id, - 'title': data['title'], - 'thumbnail': data['image_url'], - 'duration': data['duration'], - 'description': data['description'], - 'uploader': uploader, - 'upload_date': upload_date, + entries = [] + tutorials = try_get(curation, lambda x: x['tabs'][0]['modules'][0]['tutorials'], list) or [] + for tutorial_number, tutorial in enumerate(tutorials, 1): + chapter_info = { + 'chapter': tutorial.get('title'), + 'chapter_number': tutorial_number, + 'chapter_id': tutorial.get('id'), } - else: - # topic - data = self._download_json( - 'http://api.khanacademy.org/api/v1/topic/' + video_id, - video_id, 'Downloading topic info') + for content_item in (tutorial.get('contentItems') or []): + if content_item.get('kind') == 'Video': + info = self._parse_video(content_item) + info.update(chapter_info) + entries.append(info) - entries = [ - { - '_type': 'url', - 'url': c['url'], - 'id': c['id'], - 'title': c['title'], - } - for c in data['children'] if c['kind'] in ('Video', 'Topic')] - - return { - '_type': 'playlist', - 'id': video_id, - 'title': data['title'], - 'description': data['description'], - 'entries': entries, - } + return self.playlist_result( + entries, curation.get('unit'), curation.get('title'), + curation.get('description')) diff --git a/youtube_dl/extractor/lbry.py b/youtube_dl/extractor/lbry.py index 41cc245eb..cfd6b8393 100644 --- a/youtube_dl/extractor/lbry.py +++ b/youtube_dl/extractor/lbry.py @@ -5,7 +5,12 @@ import functools import json from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_parse_qs, + compat_str, + compat_urllib_parse_unquote, + compat_urllib_parse_urlparse, +) from ..utils import ( determine_ext, ExtractorError, @@ -57,6 +62,7 @@ class LBRYBaseIE(InfoExtractor): 'description': stream_value.get('description'), 'license': stream_value.get('license'), 'timestamp': int_or_none(stream.get('timestamp')), + 'release_timestamp': int_or_none(stream_value.get('release_time')), 'tags': stream_value.get('tags'), 'duration': int_or_none(media.get('duration')), 'channel': try_get(signing_channel, lambda x: x['value']['title']), @@ -89,6 +95,8 @@ class LBRYIE(LBRYBaseIE): 'description': 'md5:f6cb5c704b332d37f5119313c2c98f51', 'timestamp': 1595694354, 'upload_date': '20200725', + 'release_timestamp': 1595340697, + 'release_date': '20200721', 'width': 1280, 'height': 720, } @@ -103,6 +111,8 @@ class LBRYIE(LBRYBaseIE): 'description': 'md5:661ac4f1db09f31728931d7b88807a61', 'timestamp': 1591312601, 'upload_date': '20200604', + 'release_timestamp': 1591312421, + 'release_date': '20200604', 'tags': list, 'duration': 2570, 'channel': 'The LBRY Foundation', @@ -110,6 +120,26 @@ class LBRYIE(LBRYBaseIE): 'channel_url': 'https://lbry.tv/@LBRYFoundation:0ed629d2b9c601300cacf7eabe9da0be79010212', 'vcodec': 'none', } + }, { + # HLS + 'url': 'https://odysee.com/@gardeningincanada:b/plants-i-will-never-grow-again.-the:e', + 'md5': 'fc82f45ea54915b1495dd7cb5cc1289f', + 'info_dict': { + 'id': 'e51671357333fe22ae88aad320bde2f6f96b1410', + 'ext': 'mp4', + 'title': 'PLANTS I WILL NEVER GROW AGAIN. THE BLACK LIST PLANTS FOR A CANADIAN GARDEN | Gardening in Canada 🍁', + 'description': 'md5:9c539c6a03fb843956de61a4d5288d5e', + 'timestamp': 1618254123, + 'upload_date': '20210412', + 'release_timestamp': 1618254002, + 'release_date': '20210412', + 'tags': list, + 'duration': 554, + 'channel': 'Gardening In Canada', + 'channel_id': 'b8be0e93b423dad221abe29545fbe8ec36e806bc', + 'channel_url': 'https://odysee.com/@gardeningincanada:b8be0e93b423dad221abe29545fbe8ec36e806bc', + 'formats': 'mincount:3', + } }, { 'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e', 'only_matching': True, @@ -131,6 +161,9 @@ class LBRYIE(LBRYBaseIE): }, { 'url': 'https://lbry.tv/$/download/Episode-1/e7d93d772bd87e2b62d5ab993c1c3ced86ebb396', 'only_matching': True, + }, { + 'url': 'https://lbry.tv/@lacajadepandora:a/TRUMP-EST%C3%81-BIEN-PUESTO-con-Pilar-Baselga,-Carlos-Senra,-Luis-Palacios-(720p_30fps_H264-192kbit_AAC):1', + 'only_matching': True, }] def _real_extract(self, url): @@ -139,6 +172,7 @@ class LBRYIE(LBRYBaseIE): display_id = display_id.split('/', 2)[-1].replace('/', ':') else: display_id = display_id.replace(':', '#') + display_id = compat_urllib_parse_unquote(display_id) uri = 'lbry://' + display_id result = self._resolve_url(uri, display_id, 'stream') result_value = result['value'] @@ -149,10 +183,18 @@ class LBRYIE(LBRYBaseIE): streaming_url = self._call_api_proxy( 'get', claim_id, {'uri': uri}, 'streaming url')['streaming_url'] info = self._parse_stream(result, url) + urlh = self._request_webpage( + streaming_url, display_id, note='Downloading streaming redirect url info') + if determine_ext(urlh.geturl()) == 'm3u8': + info['formats'] = self._extract_m3u8_formats( + urlh.geturl(), display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(info['formats']) + else: + info['url'] = streaming_url info.update({ 'id': claim_id, 'title': title, - 'url': streaming_url, }) return info @@ -174,17 +216,18 @@ class LBRYChannelIE(LBRYBaseIE): }] _PAGE_SIZE = 50 - def _fetch_page(self, claim_id, url, page): + def _fetch_page(self, claim_id, url, params, page): page += 1 + page_params = { + 'channel_ids': [claim_id], + 'claim_type': 'stream', + 'no_totals': True, + 'page': page, + 'page_size': self._PAGE_SIZE, + } + page_params.update(params) result = self._call_api_proxy( - 'claim_search', claim_id, { - 'channel_ids': [claim_id], - 'claim_type': 'stream', - 'no_totals': True, - 'page': page, - 'page_size': self._PAGE_SIZE, - 'stream_types': self._SUPPORTED_STREAM_TYPES, - }, 'page %d' % page) + 'claim_search', claim_id, page_params, 'page %d' % page) for item in (result.get('items') or []): stream_claim_name = item.get('name') stream_claim_id = item.get('claim_id') @@ -205,8 +248,31 @@ class LBRYChannelIE(LBRYBaseIE): result = self._resolve_url( 'lbry://' + display_id, display_id, 'channel') claim_id = result['claim_id'] + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + content = qs.get('content', [None])[0] + params = { + 'fee_amount': qs.get('fee_amount', ['>=0'])[0], + 'order_by': { + 'new': ['release_time'], + 'top': ['effective_amount'], + 'trending': ['trending_group', 'trending_mixed'], + }[qs.get('order', ['new'])[0]], + 'stream_types': [content] if content in ['audio', 'video'] else self._SUPPORTED_STREAM_TYPES, + } + duration = qs.get('duration', [None])[0] + if duration: + params['duration'] = { + 'long': '>=1200', + 'short': '<=240', + }[duration] + language = qs.get('language', ['all'])[0] + if language != 'all': + languages = [language] + if language == 'en': + languages.append('none') + params['any_languages'] = languages entries = OnDemandPagedList( - functools.partial(self._fetch_page, claim_id, url), + functools.partial(self._fetch_page, claim_id, url, params), self._PAGE_SIZE) result_value = result.get('value') or {} return self.playlist_result( diff --git a/youtube_dl/extractor/line.py b/youtube_dl/extractor/line.py index 7f5fa446e..2526daa77 100644 --- a/youtube_dl/extractor/line.py +++ b/youtube_dl/extractor/line.py @@ -4,7 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import js_to_json +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + js_to_json, + str_or_none, +) class LineTVIE(InfoExtractor): @@ -88,3 +94,137 @@ class LineTVIE(InfoExtractor): for thumbnail in video_info.get('thumbnails', {}).get('list', [])], 'view_count': video_info.get('meta', {}).get('count'), } + + +class LineLiveBaseIE(InfoExtractor): + _API_BASE_URL = 'https://live-api.line-apps.com/web/v4.0/channel/' + + def _parse_broadcast_item(self, item): + broadcast_id = compat_str(item['id']) + title = item['title'] + is_live = item.get('isBroadcastingNow') + + thumbnails = [] + for thumbnail_id, thumbnail_url in (item.get('thumbnailURLs') or {}).items(): + if not thumbnail_url: + continue + thumbnails.append({ + 'id': thumbnail_id, + 'url': thumbnail_url, + }) + + channel = item.get('channel') or {} + channel_id = str_or_none(channel.get('id')) + + return { + 'id': broadcast_id, + 'title': self._live_title(title) if is_live else title, + 'thumbnails': thumbnails, + 'timestamp': int_or_none(item.get('createdAt')), + 'channel': channel.get('name'), + 'channel_id': channel_id, + 'channel_url': 'https://live.line.me/channels/' + channel_id if channel_id else None, + 'duration': int_or_none(item.get('archiveDuration')), + 'view_count': int_or_none(item.get('viewerCount')), + 'comment_count': int_or_none(item.get('chatCount')), + 'is_live': is_live, + } + + +class LineLiveIE(LineLiveBaseIE): + _VALID_URL = r'https?://live\.line\.me/channels/(?P\d+)/broadcast/(?P\d+)' + _TESTS = [{ + 'url': 'https://live.line.me/channels/4867368/broadcast/16331360', + 'md5': 'bc931f26bf1d4f971e3b0982b3fab4a3', + 'info_dict': { + 'id': '16331360', + 'title': '振りコピ講座😙😙😙', + 'ext': 'mp4', + 'timestamp': 1617095132, + 'upload_date': '20210330', + 'channel': '白川ゆめか', + 'channel_id': '4867368', + 'view_count': int, + 'comment_count': int, + 'is_live': False, + } + }, { + # archiveStatus == 'DELETED' + 'url': 'https://live.line.me/channels/4778159/broadcast/16378488', + 'only_matching': True, + }] + + def _real_extract(self, url): + channel_id, broadcast_id = re.match(self._VALID_URL, url).groups() + broadcast = self._download_json( + self._API_BASE_URL + '%s/broadcast/%s' % (channel_id, broadcast_id), + broadcast_id) + item = broadcast['item'] + info = self._parse_broadcast_item(item) + protocol = 'm3u8' if info['is_live'] else 'm3u8_native' + formats = [] + for k, v in (broadcast.get(('live' if info['is_live'] else 'archived') + 'HLSURLs') or {}).items(): + if not v: + continue + if k == 'abr': + formats.extend(self._extract_m3u8_formats( + v, broadcast_id, 'mp4', protocol, + m3u8_id='hls', fatal=False)) + continue + f = { + 'ext': 'mp4', + 'format_id': 'hls-' + k, + 'protocol': protocol, + 'url': v, + } + if not k.isdigit(): + f['vcodec'] = 'none' + formats.append(f) + if not formats: + archive_status = item.get('archiveStatus') + if archive_status != 'ARCHIVED': + raise ExtractorError('this video has been ' + archive_status.lower(), expected=True) + self._sort_formats(formats) + info['formats'] = formats + return info + + +class LineLiveChannelIE(LineLiveBaseIE): + _VALID_URL = r'https?://live\.line\.me/channels/(?P\d+)(?!/broadcast/\d+)(?:[/?&#]|$)' + _TEST = { + 'url': 'https://live.line.me/channels/5893542', + 'info_dict': { + 'id': '5893542', + 'title': 'いくらちゃん', + 'description': 'md5:c3a4af801f43b2fac0b02294976580be', + }, + 'playlist_mincount': 29 + } + + def _archived_broadcasts_entries(self, archived_broadcasts, channel_id): + while True: + for row in (archived_broadcasts.get('rows') or []): + share_url = str_or_none(row.get('shareURL')) + if not share_url: + continue + info = self._parse_broadcast_item(row) + info.update({ + '_type': 'url', + 'url': share_url, + 'ie_key': LineLiveIE.ie_key(), + }) + yield info + if not archived_broadcasts.get('hasNextPage'): + return + archived_broadcasts = self._download_json( + self._API_BASE_URL + channel_id + '/archived_broadcasts', + channel_id, query={ + 'lastId': info['id'], + }) + + def _real_extract(self, url): + channel_id = self._match_id(url) + channel = self._download_json(self._API_BASE_URL + channel_id, channel_id) + return self.playlist_result( + self._archived_broadcasts_entries(channel.get('archivedBroadcasts') or {}, channel_id), + channel_id, channel.get('title'), channel.get('information')) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py deleted file mode 100644 index 4ac437c8b..000000000 --- a/youtube_dl/extractor/liveleak.py +++ /dev/null @@ -1,191 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import int_or_none - - -class LiveLeakIE(InfoExtractor): - _VALID_URL = r'https?://(?:\w+\.)?liveleak\.com/view\?.*?\b[it]=(?P[\w_]+)' - _TESTS = [{ - 'url': 'http://www.liveleak.com/view?i=757_1364311680', - 'md5': '0813c2430bea7a46bf13acf3406992f4', - 'info_dict': { - 'id': '757_1364311680', - 'ext': 'mp4', - 'description': 'extremely bad day for this guy..!', - 'uploader': 'ljfriel2', - 'title': 'Most unlucky car accident', - 'thumbnail': r're:^https?://.*\.jpg$' - } - }, { - 'url': 'http://www.liveleak.com/view?i=f93_1390833151', - 'md5': 'd3f1367d14cc3c15bf24fbfbe04b9abf', - 'info_dict': { - 'id': 'f93_1390833151', - 'ext': 'mp4', - 'description': 'German Television Channel NDR does an exclusive interview with Edward Snowden.\r\nUploaded on LiveLeak cause German Television thinks the rest of the world isn\'t intereseted in Edward Snowden.', - 'uploader': 'ARD_Stinkt', - 'title': 'German Television does first Edward Snowden Interview (ENGLISH)', - 'thumbnail': r're:^https?://.*\.jpg$' - } - }, { - # Prochan embed - 'url': 'http://www.liveleak.com/view?i=4f7_1392687779', - 'md5': '42c6d97d54f1db107958760788c5f48f', - 'info_dict': { - 'id': '4f7_1392687779', - 'ext': 'mp4', - 'description': "The guy with the cigarette seems amazingly nonchalant about the whole thing... I really hope my friends' reactions would be a bit stronger.\r\n\r\nAction-go to 0:55.", - 'uploader': 'CapObveus', - 'title': 'Man is Fatally Struck by Reckless Car While Packing up a Moving Truck', - 'age_limit': 18, - }, - 'skip': 'Video is dead', - }, { - # Covers https://github.com/ytdl-org/youtube-dl/pull/5983 - # Multiple resolutions - 'url': 'http://www.liveleak.com/view?i=801_1409392012', - 'md5': 'c3a449dbaca5c0d1825caecd52a57d7b', - 'info_dict': { - 'id': '801_1409392012', - 'ext': 'mp4', - 'description': 'Happened on 27.7.2014. \r\nAt 0:53 you can see people still swimming at near beach.', - 'uploader': 'bony333', - 'title': 'Crazy Hungarian tourist films close call waterspout in Croatia', - 'thumbnail': r're:^https?://.*\.jpg$' - } - }, { - # Covers https://github.com/ytdl-org/youtube-dl/pull/10664#issuecomment-247439521 - 'url': 'http://m.liveleak.com/view?i=763_1473349649', - 'add_ie': ['Youtube'], - 'info_dict': { - 'id': '763_1473349649', - 'ext': 'mp4', - 'title': 'Reporters and public officials ignore epidemic of black on asian violence in Sacramento | Colin Flaherty', - 'description': 'Colin being the warrior he is and showing the injustice Asians in Sacramento are being subjected to.', - 'uploader': 'Ziz', - 'upload_date': '20160908', - 'uploader_id': 'UCEbta5E_jqlZmEJsriTEtnw' - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.liveleak.com/view?i=677_1439397581', - 'info_dict': { - 'id': '677_1439397581', - 'title': 'Fuel Depot in China Explosion caught on video', - }, - 'playlist_count': 3, - }, { - 'url': 'https://www.liveleak.com/view?t=HvHi_1523016227', - 'only_matching': True, - }, { - # No original video - 'url': 'https://www.liveleak.com/view?t=C26ZZ_1558612804', - 'only_matching': True, - }] - - @staticmethod - def _extract_urls(webpage): - return re.findall( - r']+src="(https?://(?:\w+\.)?liveleak\.com/ll_embed\?[^"]*[ift]=[\w_]+[^"]+)"', - webpage) - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip() - video_description = self._og_search_description(webpage) - video_uploader = self._html_search_regex( - r'By:.*?(\w+)', webpage, 'uploader', fatal=False) - age_limit = int_or_none(self._search_regex( - r'you confirm that you are ([0-9]+) years and over.', - webpage, 'age limit', default=None)) - video_thumbnail = self._og_search_thumbnail(webpage) - - entries = self._parse_html5_media_entries(url, webpage, video_id) - if not entries: - # Maybe an embed? - embed_url = self._search_regex( - r']+src="((?:https?:)?//(?:www\.)?(?:prochan|youtube)\.com/embed[^"]+)"', - webpage, 'embed URL') - return { - '_type': 'url_transparent', - 'url': embed_url, - 'id': video_id, - 'title': video_title, - 'description': video_description, - 'uploader': video_uploader, - 'age_limit': age_limit, - } - - for idx, info_dict in enumerate(entries): - formats = [] - for a_format in info_dict['formats']: - if not a_format.get('height'): - a_format['height'] = int_or_none(self._search_regex( - r'([0-9]+)p\.mp4', a_format['url'], 'height label', - default=None)) - formats.append(a_format) - - # Removing '.*.mp4' gives the raw video, which is essentially - # the same video without the LiveLeak logo at the top (see - # https://github.com/ytdl-org/youtube-dl/pull/4768) - orig_url = re.sub(r'\.mp4\.[^.]+', '', a_format['url']) - if a_format['url'] != orig_url: - format_id = a_format.get('format_id') - format_id = 'original' + ('-' + format_id if format_id else '') - if self._is_valid_url(orig_url, video_id, format_id): - formats.append({ - 'format_id': format_id, - 'url': orig_url, - 'preference': 1, - }) - self._sort_formats(formats) - info_dict['formats'] = formats - - # Don't append entry ID for one-video pages to keep backward compatibility - if len(entries) > 1: - info_dict['id'] = '%s_%s' % (video_id, idx + 1) - else: - info_dict['id'] = video_id - - info_dict.update({ - 'title': video_title, - 'description': video_description, - 'uploader': video_uploader, - 'age_limit': age_limit, - 'thumbnail': video_thumbnail, - }) - - return self.playlist_result(entries, video_id, video_title) - - -class LiveLeakEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?liveleak\.com/ll_embed\?.*?\b(?P[ift])=(?P[\w_]+)' - - # See generic.py for actual test cases - _TESTS = [{ - 'url': 'https://www.liveleak.com/ll_embed?i=874_1459135191', - 'only_matching': True, - }, { - 'url': 'https://www.liveleak.com/ll_embed?f=ab065df993c1', - 'only_matching': True, - }] - - def _real_extract(self, url): - kind, video_id = re.match(self._VALID_URL, url).groups() - - if kind == 'f': - webpage = self._download_webpage(url, video_id) - liveleak_url = self._search_regex( - r'(?:logourl\s*:\s*|window\.open\()(?P[\'"])(?P%s)(?P=q1)' % LiveLeakIE._VALID_URL, - webpage, 'LiveLeak URL', group='url') - else: - liveleak_url = 'http://www.liveleak.com/view?%s=%s' % (kind, video_id) - - return self.url_result(liveleak_url, ie=LiveLeakIE.ie_key()) diff --git a/youtube_dl/extractor/maoritv.py b/youtube_dl/extractor/maoritv.py new file mode 100644 index 000000000..0d23fec75 --- /dev/null +++ b/youtube_dl/extractor/maoritv.py @@ -0,0 +1,31 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class MaoriTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?maoritelevision\.com/shows/(?:[^/]+/)+(?P[^/?&#]+)' + _TEST = { + 'url': 'https://www.maoritelevision.com/shows/korero-mai/S01E054/korero-mai-series-1-episode-54', + 'md5': '5ade8ef53851b6a132c051b1cd858899', + 'info_dict': { + 'id': '4774724855001', + 'ext': 'mp4', + 'title': 'Kōrero Mai, Series 1 Episode 54', + 'upload_date': '20160226', + 'timestamp': 1456455018, + 'description': 'md5:59bde32fd066d637a1a55794c56d8dcb', + 'uploader_id': '1614493167001', + }, + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1614493167001/HJlhIQhQf_default/index.html?videoId=%s' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + brightcove_id = self._search_regex( + r'data-main-video-id=["\'](\d+)', webpage, 'brightcove id') + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/medaltv.py b/youtube_dl/extractor/medaltv.py index 1603b55f6..67bb4debb 100644 --- a/youtube_dl/extractor/medaltv.py +++ b/youtube_dl/extractor/medaltv.py @@ -15,33 +15,39 @@ from ..utils import ( class MedalTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?medal\.tv/clips/(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?medal\.tv/clips/(?P[^/?#&]+)' _TESTS = [{ - 'url': 'https://medal.tv/clips/34934644/3Is9zyGMoBMr', + 'url': 'https://medal.tv/clips/2mA60jWAGQCBH', 'md5': '7b07b064331b1cf9e8e5c52a06ae68fa', 'info_dict': { - 'id': '34934644', + 'id': '2mA60jWAGQCBH', 'ext': 'mp4', 'title': 'Quad Cold', 'description': 'Medal,https://medal.tv/desktop/', 'uploader': 'MowgliSB', 'timestamp': 1603165266, 'upload_date': '20201020', - 'uploader_id': 10619174, + 'uploader_id': '10619174', } }, { - 'url': 'https://medal.tv/clips/36787208', + 'url': 'https://medal.tv/clips/2um24TWdty0NA', 'md5': 'b6dc76b78195fff0b4f8bf4a33ec2148', 'info_dict': { - 'id': '36787208', + 'id': '2um24TWdty0NA', 'ext': 'mp4', 'title': 'u tk me i tk u bigger', 'description': 'Medal,https://medal.tv/desktop/', 'uploader': 'Mimicc', 'timestamp': 1605580939, 'upload_date': '20201117', - 'uploader_id': 5156321, + 'uploader_id': '5156321', } + }, { + 'url': 'https://medal.tv/clips/37rMeFpryCC-9', + 'only_matching': True, + }, { + 'url': 'https://medal.tv/clips/2WRj40tpY_EU9', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/medialaan.py b/youtube_dl/extractor/medialaan.py index 50d5db802..788acf7fb 100644 --- a/youtube_dl/extractor/medialaan.py +++ b/youtube_dl/extractor/medialaan.py @@ -2,268 +2,113 @@ from __future__ import unicode_literals import re -from .gigya import GigyaBaseIE - -from ..compat import compat_str +from .common import InfoExtractor from ..utils import ( + extract_attributes, int_or_none, - parse_duration, - try_get, - unified_timestamp, + mimetype2ext, + parse_iso8601, ) -class MedialaanIE(GigyaBaseIE): +class MedialaanIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// - (?:www\.|nieuws\.)? (?: - (?Pvtm|q2|vtmkzoom)\.be/ - (?: - video(?:/[^/]+/id/|/?\?.*?\baid=)| - (?:[^/]+/)* - ) + (?:embed\.)?mychannels.video/embed/| + embed\.mychannels\.video/(?:s(?:dk|cript)/)?production/| + (?:www\.)?(?: + (?: + 7sur7| + demorgen| + hln| + joe| + qmusic + )\.be| + (?: + [abe]d| + bndestem| + destentor| + gelderlander| + pzc| + tubantia| + volkskrant + )\.nl + )/video/(?:[^/]+/)*[^/?&#]+~p ) - (?P[^/?#&]+) + (?P\d+) ''' - _NETRC_MACHINE = 'medialaan' - _APIKEY = '3_HZ0FtkMW_gOyKlqQzW5_0FHRC7Nd5XpXJZcDdXY4pk5eES2ZWmejRW5egwVm4ug-' - _SITE_TO_APP_ID = { - 'vtm': 'vtm_watch', - 'q2': 'q2', - 'vtmkzoom': 'vtmkzoom', - } _TESTS = [{ - # vod - 'url': 'http://vtm.be/video/volledige-afleveringen/id/vtm_20170219_VM0678361_vtmwatch', + 'url': 'https://www.bndestem.nl/video/de-terugkeer-van-ally-de-aap-en-wie-vertrekt-er-nog-bij-nac~p193993', 'info_dict': { - 'id': 'vtm_20170219_VM0678361_vtmwatch', + 'id': '193993', 'ext': 'mp4', - 'title': 'Allemaal Chris afl. 6', - 'description': 'md5:4be86427521e7b07e0adb0c9c554ddb2', - 'timestamp': 1487533280, - 'upload_date': '20170219', - 'duration': 2562, - 'series': 'Allemaal Chris', - 'season': 'Allemaal Chris', - 'season_number': 1, - 'season_id': '256936078124527', - 'episode': 'Allemaal Chris afl. 6', - 'episode_number': 6, - 'episode_id': '256936078591527', + 'title': 'De terugkeer van Ally de Aap en wie vertrekt er nog bij NAC?', + 'timestamp': 1611663540, + 'upload_date': '20210126', + 'duration': 238, }, 'params': { 'skip_download': True, }, - 'skip': 'Requires account credentials', }, { - # clip - 'url': 'http://vtm.be/video?aid=168332', - 'info_dict': { - 'id': '168332', - 'ext': 'mp4', - 'title': '"Veronique liegt!"', - 'description': 'md5:1385e2b743923afe54ba4adc38476155', - 'timestamp': 1489002029, - 'upload_date': '20170308', - 'duration': 96, - }, - }, { - # vod - 'url': 'http://vtm.be/video/volledige-afleveringen/id/257107153551000', + 'url': 'https://www.gelderlander.nl/video/kanalen/degelderlander~c320/series/snel-nieuws~s984/noodbevel-in-doetinchem-politie-stuurt-mensen-centrum-uit~p194093', 'only_matching': True, }, { - # vod - 'url': 'http://vtm.be/video?aid=163157', + 'url': 'https://embed.mychannels.video/sdk/production/193993?options=TFTFF_default', 'only_matching': True, }, { - # vod - 'url': 'http://www.q2.be/video/volledige-afleveringen/id/2be_20170301_VM0684442_q2', + 'url': 'https://embed.mychannels.video/script/production/193993', 'only_matching': True, }, { - # clip - 'url': 'http://vtmkzoom.be/k3-dansstudio/een-nieuw-seizoen-van-k3-dansstudio', + 'url': 'https://embed.mychannels.video/production/193993', 'only_matching': True, }, { - # http/s redirect - 'url': 'https://vtmkzoom.be/video?aid=45724', - 'info_dict': { - 'id': '257136373657000', - 'ext': 'mp4', - 'title': 'K3 Dansstudio Ushuaia afl.6', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Requires account credentials', + 'url': 'https://mychannels.video/embed/193993', + 'only_matching': True, }, { - # nieuws.vtm.be - 'url': 'https://nieuws.vtm.be/stadion/stadion/genk-nog-moeilijk-programma', + 'url': 'https://embed.mychannels.video/embed/193993', 'only_matching': True, }] - def _real_initialize(self): - self._logged_in = False - - def _login(self): - username, password = self._get_login_info() - if username is None: - self.raise_login_required() - - auth_data = { - 'APIKey': self._APIKEY, - 'sdk': 'js_6.1', - 'format': 'json', - 'loginID': username, - 'password': password, - } - - auth_info = self._gigya_login(auth_data) - - self._uid = auth_info['UID'] - self._uid_signature = auth_info['UIDSignature'] - self._signature_timestamp = auth_info['signatureTimestamp'] - - self._logged_in = True + @staticmethod + def _extract_urls(webpage): + entries = [] + for element in re.findall(r'(]+data-mychannels-type="video"[^>]*>)', webpage): + mychannels_id = extract_attributes(element).get('data-mychannels-id') + if mychannels_id: + entries.append('https://mychannels.video/embed/' + mychannels_id) + return entries def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id, site_id = mobj.group('id', 'site_id') + production_id = self._match_id(url) + production = self._download_json( + 'https://embed.mychannels.video/sdk/production/' + production_id, + production_id, query={'options': 'UUUU_default'})['productions'][0] + title = production['title'] - webpage = self._download_webpage(url, video_id) - - config = self._parse_json( - self._search_regex( - r'videoJSConfig\s*=\s*JSON\.parse\(\'({.+?})\'\);', - webpage, 'config', default='{}'), video_id, - transform_source=lambda s: s.replace( - '\\\\', '\\').replace(r'\"', '"').replace(r"\'", "'")) - - vod_id = config.get('vodId') or self._search_regex( - (r'\\"vodId\\"\s*:\s*\\"(.+?)\\"', - r'"vodId"\s*:\s*"(.+?)"', - r'<[^>]+id=["\']vod-(\d+)'), - webpage, 'video_id', default=None) - - # clip, no authentication required - if not vod_id: - player = self._parse_json( - self._search_regex( - r'vmmaplayer\(({.+?})\);', webpage, 'vmma player', - default=''), - video_id, transform_source=lambda s: '[%s]' % s, fatal=False) - if player: - video = player[-1] - if video['videoUrl'] in ('http', 'https'): - return self.url_result(video['url'], MedialaanIE.ie_key()) - info = { - 'id': video_id, - 'url': video['videoUrl'], - 'title': video['title'], - 'thumbnail': video.get('imageUrl'), - 'timestamp': int_or_none(video.get('createdDate')), - 'duration': int_or_none(video.get('duration')), - } + formats = [] + for source in (production.get('sources') or []): + src = source.get('src') + if not src: + continue + ext = mimetype2ext(source.get('type')) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, production_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) else: - info = self._parse_html5_media_entries( - url, webpage, video_id, m3u8_id='hls')[0] - info.update({ - 'id': video_id, - 'title': self._html_search_meta('description', webpage), - 'duration': parse_duration(self._html_search_meta('duration', webpage)), + formats.append({ + 'ext': ext, + 'url': src, }) - # vod, authentication required - else: - if not self._logged_in: - self._login() + self._sort_formats(formats) - settings = self._parse_json( - self._search_regex( - r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', - webpage, 'drupal settings', default='{}'), - video_id) - - def get(container, item): - return try_get( - settings, lambda x: x[container][item], - compat_str) or self._search_regex( - r'"%s"\s*:\s*"([^"]+)' % item, webpage, item, - default=None) - - app_id = get('vod', 'app_id') or self._SITE_TO_APP_ID.get(site_id, 'vtm_watch') - sso = get('vod', 'gigyaDatabase') or 'vtm-sso' - - data = self._download_json( - 'http://vod.medialaan.io/api/1.0/item/%s/video' % vod_id, - video_id, query={ - 'app_id': app_id, - 'user_network': sso, - 'UID': self._uid, - 'UIDSignature': self._uid_signature, - 'signatureTimestamp': self._signature_timestamp, - }) - - formats = self._extract_m3u8_formats( - data['response']['uri'], video_id, entry_protocol='m3u8_native', - ext='mp4', m3u8_id='hls') - - self._sort_formats(formats) - - info = { - 'id': vod_id, - 'formats': formats, - } - - api_key = get('vod', 'apiKey') - channel = get('medialaanGigya', 'channel') - - if api_key: - videos = self._download_json( - 'http://vod.medialaan.io/vod/v2/videos', video_id, fatal=False, - query={ - 'channels': channel, - 'ids': vod_id, - 'limit': 1, - 'apikey': api_key, - }) - if videos: - video = try_get( - videos, lambda x: x['response']['videos'][0], dict) - if video: - def get(container, item, expected_type=None): - return try_get( - video, lambda x: x[container][item], expected_type) - - def get_string(container, item): - return get(container, item, compat_str) - - info.update({ - 'series': get_string('program', 'title'), - 'season': get_string('season', 'title'), - 'season_number': int_or_none(get('season', 'number')), - 'season_id': get_string('season', 'id'), - 'episode': get_string('episode', 'title'), - 'episode_number': int_or_none(get('episode', 'number')), - 'episode_id': get_string('episode', 'id'), - 'duration': int_or_none( - video.get('duration')) or int_or_none( - video.get('durationMillis'), scale=1000), - 'title': get_string('episode', 'title'), - 'description': get_string('episode', 'text'), - 'timestamp': unified_timestamp(get_string( - 'publication', 'begin')), - }) - - if not info.get('title'): - info['title'] = try_get( - config, lambda x: x['videoConfig']['title'], - compat_str) or self._html_search_regex( - r'\\"title\\"\s*:\s*\\"(.+?)\\"', webpage, 'title', - default=None) or self._og_search_title(webpage) - - if not info.get('description'): - info['description'] = self._html_search_regex( - r']+class="field-item\s+even">\s*

(.+?)

', - webpage, 'description', default=None) - - return info + return { + 'id': production_id, + 'title': title, + 'formats': formats, + 'thumbnail': production.get('posterUrl'), + 'timestamp': parse_iso8601(production.get('publicationDate'), ' '), + 'duration': int_or_none(production.get('duration')) or None, + } diff --git a/youtube_dl/extractor/minds.py b/youtube_dl/extractor/minds.py new file mode 100644 index 000000000..8e9f0f825 --- /dev/null +++ b/youtube_dl/extractor/minds.py @@ -0,0 +1,196 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + clean_html, + int_or_none, + str_or_none, + strip_or_none, +) + + +class MindsBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?minds\.com/' + + def _call_api(self, path, video_id, resource, query=None): + api_url = 'https://www.minds.com/api/' + path + token = self._get_cookies(api_url).get('XSRF-TOKEN') + return self._download_json( + api_url, video_id, 'Downloading %s JSON metadata' % resource, headers={ + 'Referer': 'https://www.minds.com/', + 'X-XSRF-TOKEN': token.value if token else '', + }, query=query) + + +class MindsIE(MindsBaseIE): + IE_NAME = 'minds' + _VALID_URL = MindsBaseIE._VALID_URL_BASE + r'(?:media|newsfeed|archive/view)/(?P[0-9]+)' + _TESTS = [{ + 'url': 'https://www.minds.com/media/100000000000086822', + 'md5': '215a658184a419764852239d4970b045', + 'info_dict': { + 'id': '100000000000086822', + 'ext': 'mp4', + 'title': 'Minds intro sequence', + 'thumbnail': r're:https?://.+\.png', + 'uploader_id': 'ottman', + 'upload_date': '20130524', + 'timestamp': 1369404826, + 'uploader': 'Bill Ottman', + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'tags': ['animation'], + 'comment_count': int, + 'license': 'attribution-cc', + }, + }, { + # entity.type == 'activity' and empty title + 'url': 'https://www.minds.com/newsfeed/798025111988506624', + 'md5': 'b2733a74af78d7fd3f541c4cbbaa5950', + 'info_dict': { + 'id': '798022190320226304', + 'ext': 'mp4', + 'title': '798022190320226304', + 'uploader': 'ColinFlaherty', + 'upload_date': '20180111', + 'timestamp': 1515639316, + 'uploader_id': 'ColinFlaherty', + }, + }, { + 'url': 'https://www.minds.com/archive/view/715172106794442752', + 'only_matching': True, + }, { + # youtube perma_url + 'url': 'https://www.minds.com/newsfeed/1197131838022602752', + 'only_matching': True, + }] + + def _real_extract(self, url): + entity_id = self._match_id(url) + entity = self._call_api( + 'v1/entities/entity/' + entity_id, entity_id, 'entity')['entity'] + if entity.get('type') == 'activity': + if entity.get('custom_type') == 'video': + video_id = entity['entity_guid'] + else: + return self.url_result(entity['perma_url']) + else: + assert(entity['subtype'] == 'video') + video_id = entity_id + # 1080p and webm formats available only on the sources array + video = self._call_api( + 'v2/media/video/' + video_id, video_id, 'video') + + formats = [] + for source in (video.get('sources') or []): + src = source.get('src') + if not src: + continue + formats.append({ + 'format_id': source.get('label'), + 'height': int_or_none(source.get('size')), + 'url': src, + }) + self._sort_formats(formats) + + entity = video.get('entity') or entity + owner = entity.get('ownerObj') or {} + uploader_id = owner.get('username') + + tags = entity.get('tags') + if tags and isinstance(tags, compat_str): + tags = [tags] + + thumbnail = None + poster = video.get('poster') or entity.get('thumbnail_src') + if poster: + urlh = self._request_webpage(poster, video_id, fatal=False) + if urlh: + thumbnail = urlh.geturl() + + return { + 'id': video_id, + 'title': entity.get('title') or video_id, + 'formats': formats, + 'description': clean_html(entity.get('description')) or None, + 'license': str_or_none(entity.get('license')), + 'timestamp': int_or_none(entity.get('time_created')), + 'uploader': strip_or_none(owner.get('name')), + 'uploader_id': uploader_id, + 'uploader_url': 'https://www.minds.com/' + uploader_id if uploader_id else None, + 'view_count': int_or_none(entity.get('play:count')), + 'like_count': int_or_none(entity.get('thumbs:up:count')), + 'dislike_count': int_or_none(entity.get('thumbs:down:count')), + 'tags': tags, + 'comment_count': int_or_none(entity.get('comments:count')), + 'thumbnail': thumbnail, + } + + +class MindsFeedBaseIE(MindsBaseIE): + _PAGE_SIZE = 150 + + def _entries(self, feed_id): + query = {'limit': self._PAGE_SIZE, 'sync': 1} + i = 1 + while True: + data = self._call_api( + 'v2/feeds/container/%s/videos' % feed_id, + feed_id, 'page %s' % i, query) + entities = data.get('entities') or [] + for entity in entities: + guid = entity.get('guid') + if not guid: + continue + yield self.url_result( + 'https://www.minds.com/newsfeed/' + guid, + MindsIE.ie_key(), guid) + query['from_timestamp'] = data['load-next'] + if not (query['from_timestamp'] and len(entities) == self._PAGE_SIZE): + break + i += 1 + + def _real_extract(self, url): + feed_id = self._match_id(url) + feed = self._call_api( + 'v1/%s/%s' % (self._FEED_PATH, feed_id), + feed_id, self._FEED_TYPE)[self._FEED_TYPE] + + return self.playlist_result( + self._entries(feed['guid']), feed_id, + strip_or_none(feed.get('name')), + feed.get('briefdescription')) + + +class MindsChannelIE(MindsFeedBaseIE): + _FEED_TYPE = 'channel' + IE_NAME = 'minds:' + _FEED_TYPE + _VALID_URL = MindsBaseIE._VALID_URL_BASE + r'(?!(?:newsfeed|media|api|archive|groups)/)(?P[^/?&#]+)' + _FEED_PATH = 'channel' + _TEST = { + 'url': 'https://www.minds.com/ottman', + 'info_dict': { + 'id': 'ottman', + 'title': 'Bill Ottman', + 'description': 'Co-creator & CEO @minds', + }, + 'playlist_mincount': 54, + } + + +class MindsGroupIE(MindsFeedBaseIE): + _FEED_TYPE = 'group' + IE_NAME = 'minds:' + _FEED_TYPE + _VALID_URL = MindsBaseIE._VALID_URL_BASE + r'groups/profile/(?P[0-9]+)' + _FEED_PATH = 'groups/group' + _TEST = { + 'url': 'https://www.minds.com/groups/profile/785582576369672204/feed/videos', + 'info_dict': { + 'id': '785582576369672204', + 'title': 'Cooking Videos', + }, + 'playlist_mincount': 1, + } diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 9759560f1..69319857d 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -251,8 +251,11 @@ class MixcloudPlaylistBaseIE(MixcloudBaseIE): cloudcast_url = cloudcast.get('url') if not cloudcast_url: continue + slug = try_get(cloudcast, lambda x: x['slug'], compat_str) + owner_username = try_get(cloudcast, lambda x: x['owner']['username'], compat_str) + video_id = '%s_%s' % (owner_username, slug) if slug and owner_username else None entries.append(self.url_result( - cloudcast_url, MixcloudIE.ie_key(), cloudcast.get('slug'))) + cloudcast_url, MixcloudIE.ie_key(), video_id)) page_info = items['pageInfo'] has_next_page = page_info['hasNextPage'] @@ -321,7 +324,8 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): _DESCRIPTION_KEY = 'biog' _ROOT_TYPE = 'user' _NODE_TEMPLATE = '''slug - url''' + url + owner { username }''' def _get_playlist_title(self, title, slug): return '%s (%s)' % (title, slug) @@ -345,6 +349,7 @@ class MixcloudPlaylistIE(MixcloudPlaylistBaseIE): _NODE_TEMPLATE = '''cloudcast { slug url + owner { username } }''' def _get_cloudcast(self, node): diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py index b907f6b49..b69301d97 100644 --- a/youtube_dl/extractor/mlb.py +++ b/youtube_dl/extractor/mlb.py @@ -1,15 +1,91 @@ from __future__ import unicode_literals -from .nhl import NHLBaseIE +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + parse_duration, + parse_iso8601, + try_get, +) -class MLBIE(NHLBaseIE): +class MLBBaseIE(InfoExtractor): + def _real_extract(self, url): + display_id = self._match_id(url) + video = self._download_video_data(display_id) + video_id = video['id'] + title = video['title'] + feed = self._get_feed(video) + + formats = [] + for playback in (feed.get('playbacks') or []): + playback_url = playback.get('url') + if not playback_url: + continue + name = playback.get('name') + ext = determine_ext(playback_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + playback_url, video_id, 'mp4', + 'm3u8_native', m3u8_id=name, fatal=False)) + else: + f = { + 'format_id': name, + 'url': playback_url, + } + mobj = re.search(r'_(\d+)K_(\d+)X(\d+)', name) + if mobj: + f.update({ + 'height': int(mobj.group(3)), + 'tbr': int(mobj.group(1)), + 'width': int(mobj.group(2)), + }) + mobj = re.search(r'_(\d+)x(\d+)_(\d+)_(\d+)K\.mp4', playback_url) + if mobj: + f.update({ + 'fps': int(mobj.group(3)), + 'height': int(mobj.group(2)), + 'tbr': int(mobj.group(4)), + 'width': int(mobj.group(1)), + }) + formats.append(f) + self._sort_formats(formats) + + thumbnails = [] + for cut in (try_get(feed, lambda x: x['image']['cuts'], list) or []): + src = cut.get('src') + if not src: + continue + thumbnails.append({ + 'height': int_or_none(cut.get('height')), + 'url': src, + 'width': int_or_none(cut.get('width')), + }) + + language = (video.get('language') or 'EN').lower() + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': video.get('description'), + 'duration': parse_duration(feed.get('duration')), + 'thumbnails': thumbnails, + 'timestamp': parse_iso8601(video.get(self._TIMESTAMP_KEY)), + 'subtitles': self._extract_mlb_subtitles(feed, language), + } + + +class MLBIE(MLBBaseIE): _VALID_URL = r'''(?x) https?:// - (?:[\da-z_-]+\.)*(?Pmlb)\.com/ + (?:[\da-z_-]+\.)*mlb\.com/ (?: (?: - (?:[^/]+/)*c-| + (?:[^/]+/)*video/[^/]+/c-| (?: shared/video/embed/(?:embed|m-internal-embed)\.html| (?:[^/]+/)+(?:play|index)\.jsp| @@ -18,7 +94,6 @@ class MLBIE(NHLBaseIE): (?P\d+) ) ''' - _CONTENT_DOMAIN = 'content.mlb.com' _TESTS = [ { 'url': 'https://www.mlb.com/mariners/video/ackleys-spectacular-catch/c-34698933', @@ -76,18 +151,6 @@ class MLBIE(NHLBaseIE): 'thumbnail': r're:^https?://.*\.jpg$', }, }, - { - 'url': 'https://www.mlb.com/news/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer/c-118550098', - 'md5': 'e09e37b552351fddbf4d9e699c924d68', - 'info_dict': { - 'id': '75609783', - 'ext': 'mp4', - 'title': 'Must C: Pillar climbs for catch', - 'description': '4/15/15: Blue Jays outfielder Kevin Pillar continues his defensive dominance by climbing the wall in left to rob Tim Beckham of a home run', - 'timestamp': 1429139220, - 'upload_date': '20150415', - } - }, { 'url': 'https://www.mlb.com/video/hargrove-homers-off-caldwell/c-1352023483?tid=67793694', 'only_matching': True, @@ -113,8 +176,92 @@ class MLBIE(NHLBaseIE): 'url': 'http://mlb.mlb.com/shared/video/embed/m-internal-embed.html?content_id=75609783&property=mlb&autoplay=true&hashmode=false&siteSection=mlb/multimedia/article_118550098/article_embed&club=mlb', 'only_matching': True, }, - { - 'url': 'https://www.mlb.com/cut4/carlos-gomez-borrowed-sunglasses-from-an-as-fan/c-278912842', - 'only_matching': True, - } ] + _TIMESTAMP_KEY = 'date' + + @staticmethod + def _get_feed(video): + return video + + @staticmethod + def _extract_mlb_subtitles(feed, language): + subtitles = {} + for keyword in (feed.get('keywordsAll') or []): + keyword_type = keyword.get('type') + if keyword_type and keyword_type.startswith('closed_captions_location_'): + cc_location = keyword.get('value') + if cc_location: + subtitles.setdefault(language, []).append({ + 'url': cc_location, + }) + return subtitles + + def _download_video_data(self, display_id): + return self._download_json( + 'http://content.mlb.com/mlb/item/id/v1/%s/details/web-v1.json' % display_id, + display_id) + + +class MLBVideoIE(MLBBaseIE): + _VALID_URL = r'https?://(?:www\.)?mlb\.com/(?:[^/]+/)*video/(?P[^/?&#]+)' + _TEST = { + 'url': 'https://www.mlb.com/mariners/video/ackley-s-spectacular-catch-c34698933', + 'md5': '632358dacfceec06bad823b83d21df2d', + 'info_dict': { + 'id': 'c04a8863-f569-42e6-9f87-992393657614', + 'ext': 'mp4', + 'title': "Ackley's spectacular catch", + 'description': 'md5:7f5a981eb4f3cbc8daf2aeffa2215bf0', + 'duration': 66, + 'timestamp': 1405995000, + 'upload_date': '20140722', + 'thumbnail': r're:^https?://.+', + }, + } + _TIMESTAMP_KEY = 'timestamp' + + @classmethod + def suitable(cls, url): + return False if MLBIE.suitable(url) else super(MLBVideoIE, cls).suitable(url) + + @staticmethod + def _get_feed(video): + return video['feeds'][0] + + @staticmethod + def _extract_mlb_subtitles(feed, language): + subtitles = {} + for cc_location in (feed.get('closedCaptions') or []): + subtitles.setdefault(language, []).append({ + 'url': cc_location, + }) + + def _download_video_data(self, display_id): + # https://www.mlb.com/data-service/en/videos/[SLUG] + return self._download_json( + 'https://fastball-gateway.mlb.com/graphql', + display_id, query={ + 'query': '''{ + mediaPlayback(ids: "%s") { + description + feeds(types: CMS) { + closedCaptions + duration + image { + cuts { + width + height + src + } + } + playbacks { + name + url + } + } + id + timestamp + title + } +}''' % display_id, + })['data']['mediaPlayback'][0] diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index df1034fc5..5a5205c0e 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -253,6 +253,12 @@ class MTVServicesInfoExtractor(InfoExtractor): return try_get(feed, lambda x: x['result']['data']['id'], compat_str) + @staticmethod + def _extract_child_with_type(parent, t): + for c in parent['children']: + if c.get('type') == t: + return c + def _extract_mgid(self, webpage): try: # the url can be http://media.mtvnservices.com/fb/{mgid}.swf @@ -278,6 +284,14 @@ class MTVServicesInfoExtractor(InfoExtractor): if not mgid: mgid = self._extract_triforce_mgid(webpage) + if not mgid: + data = self._parse_json(self._search_regex( + r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None) + main_container = self._extract_child_with_type(data, 'MainContainer') + ab_testing = self._extract_child_with_type(main_container, 'ABTesting') + video_player = self._extract_child_with_type(ab_testing or main_container, 'VideoPlayer') + mgid = video_player['props']['media']['video']['config']['uri'] + return mgid def _real_extract(self, url): @@ -309,7 +323,7 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor): @staticmethod def _extract_url(webpage): mobj = re.search( - r']+?src=(["\'])(?P(?:https?:)?//media.mtvnservices.com/embed/.+?)\1', webpage) + r']+?src=(["\'])(?P(?:https?:)?//media\.mtvnservices\.com/embed/.+?)\1', webpage) if mobj: return mobj.group('url') @@ -349,18 +363,6 @@ class MTVIE(MTVServicesInfoExtractor): 'only_matching': True, }] - @staticmethod - def extract_child_with_type(parent, t): - children = parent['children'] - return next(c for c in children if c.get('type') == t) - - def _extract_mgid(self, webpage): - data = self._parse_json(self._search_regex( - r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None) - main_container = self.extract_child_with_type(data, 'MainContainer') - video_player = self.extract_child_with_type(main_container, 'VideoPlayer') - return video_player['props']['media']['video']['config']['uri'] - class MTVJapanIE(MTVServicesInfoExtractor): IE_NAME = 'mtvjapan' diff --git a/youtube_dl/extractor/ninecninemedia.py b/youtube_dl/extractor/ninecninemedia.py index a569c889e..cfc220314 100644 --- a/youtube_dl/extractor/ninecninemedia.py +++ b/youtube_dl/extractor/ninecninemedia.py @@ -23,11 +23,9 @@ class NineCNineMediaIE(InfoExtractor): destination_code, content_id = re.match(self._VALID_URL, url).groups() api_base_url = self._API_BASE_TEMPLATE % (destination_code, content_id) content = self._download_json(api_base_url, content_id, query={ - '$include': '[Media,Season,ContentPackages]', + '$include': '[Media.Name,Season,ContentPackages.Duration,ContentPackages.Id]', }) title = content['Name'] - if len(content['ContentPackages']) > 1: - raise ExtractorError('multiple content packages') content_package = content['ContentPackages'][0] package_id = content_package['Id'] content_package_url = api_base_url + 'contentpackages/%s/' % package_id diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py index dc6a27d36..14390823b 100644 --- a/youtube_dl/extractor/ninegag.py +++ b/youtube_dl/extractor/ninegag.py @@ -1,104 +1,130 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import str_to_int +from ..utils import ( + ExtractorError, + determine_ext, + int_or_none, + try_get, + unescapeHTML, + url_or_none, +) class NineGagIE(InfoExtractor): IE_NAME = '9gag' - _VALID_URL = r'https?://(?:www\.)?9gag(?:\.com/tv|\.tv)/(?:p|embed)/(?P[a-zA-Z0-9]+)(?:/(?P[^?#/]+))?' + _VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P[^/?&#]+)' _TESTS = [{ - 'url': 'http://9gag.com/tv/p/Kk2X5/people-are-awesome-2013-is-absolutely-awesome', + 'url': 'https://9gag.com/gag/ae5Ag7B', 'info_dict': { - 'id': 'kXzwOKyGlSA', + 'id': 'ae5Ag7B', 'ext': 'mp4', - 'description': 'This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)', - 'title': '\"People Are Awesome 2013\" Is Absolutely Awesome', - 'uploader_id': 'UCdEH6EjDKwtTe-sO2f0_1XA', - 'uploader': 'CompilationChannel', - 'upload_date': '20131110', - 'view_count': int, - }, - 'add_ie': ['Youtube'], + 'title': 'Capybara Agility Training', + 'upload_date': '20191108', + 'timestamp': 1573237208, + 'categories': ['Awesome'], + 'tags': ['Weimaraner', 'American Pit Bull Terrier'], + 'duration': 44, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + } }, { - 'url': 'http://9gag.com/tv/p/aKolP3', - 'info_dict': { - 'id': 'aKolP3', - 'ext': 'mp4', - 'title': 'This Guy Travelled 11 countries In 44 days Just To Make This Amazing Video', - 'description': "I just saw more in 1 minute than I've seen in 1 year. This guy's video is epic!!", - 'uploader_id': 'rickmereki', - 'uploader': 'Rick Mereki', - 'upload_date': '20110803', - 'view_count': int, - }, - 'add_ie': ['Vimeo'], - }, { - 'url': 'http://9gag.com/tv/p/KklwM', - 'only_matching': True, - }, { - 'url': 'http://9gag.tv/p/Kk2X5', - 'only_matching': True, - }, { - 'url': 'http://9gag.com/tv/embed/a5Dmvl', + # HTML escaped title + 'url': 'https://9gag.com/gag/av5nvyb', 'only_matching': True, }] - _EXTERNAL_VIDEO_PROVIDER = { - '1': { - 'url': '%s', - 'ie_key': 'Youtube', - }, - '2': { - 'url': 'http://player.vimeo.com/video/%s', - 'ie_key': 'Vimeo', - }, - '3': { - 'url': 'http://instagram.com/p/%s', - 'ie_key': 'Instagram', - }, - '4': { - 'url': 'http://vine.co/v/%s', - 'ie_key': 'Vine', - }, - } - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id + post_id = self._match_id(url) + post = self._download_json( + 'https://9gag.com/v1/post', post_id, query={ + 'id': post_id + })['data']['post'] - webpage = self._download_webpage(url, display_id) + if post.get('type') != 'Animated': + raise ExtractorError( + 'The given url does not contain a video', + expected=True) - post_view = self._parse_json( - self._search_regex( - r'var\s+postView\s*=\s*new\s+app\.PostView\({\s*post:\s*({.+?})\s*,\s*posts:\s*prefetchedCurrentPost', - webpage, 'post view'), - display_id) + title = unescapeHTML(post['title']) - ie_key = None - source_url = post_view.get('sourceUrl') - if not source_url: - external_video_id = post_view['videoExternalId'] - external_video_provider = post_view['videoExternalProvider'] - source_url = self._EXTERNAL_VIDEO_PROVIDER[external_video_provider]['url'] % external_video_id - ie_key = self._EXTERNAL_VIDEO_PROVIDER[external_video_provider]['ie_key'] - title = post_view['title'] - description = post_view.get('description') - view_count = str_to_int(post_view.get('externalView')) - thumbnail = post_view.get('thumbnail_700w') or post_view.get('ogImageUrl') or post_view.get('thumbnail_300w') + duration = None + formats = [] + thumbnails = [] + for key, image in (post.get('images') or {}).items(): + image_url = url_or_none(image.get('url')) + if not image_url: + continue + ext = determine_ext(image_url) + image_id = key.strip('image') + common = { + 'url': image_url, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + } + if ext in ('jpg', 'png'): + webp_url = image.get('webpUrl') + if webp_url: + t = common.copy() + t.update({ + 'id': image_id + '-webp', + 'url': webp_url, + }) + thumbnails.append(t) + common.update({ + 'id': image_id, + 'ext': ext, + }) + thumbnails.append(common) + elif ext in ('webm', 'mp4'): + if not duration: + duration = int_or_none(image.get('duration')) + common['acodec'] = 'none' if image.get('hasAudio') == 0 else None + for vcodec in ('vp8', 'vp9', 'h265'): + c_url = image.get(vcodec + 'Url') + if not c_url: + continue + c_f = common.copy() + c_f.update({ + 'format_id': image_id + '-' + vcodec, + 'url': c_url, + 'vcodec': vcodec, + }) + formats.append(c_f) + common.update({ + 'ext': ext, + 'format_id': image_id, + }) + formats.append(common) + self._sort_formats(formats) + + section = try_get(post, lambda x: x['postSection']['name']) + + tags = None + post_tags = post.get('tags') + if post_tags: + tags = [] + for tag in post_tags: + tag_key = tag.get('key') + if not tag_key: + continue + tags.append(tag_key) + + get_count = lambda x: int_or_none(post.get(x + 'Count')) return { - '_type': 'url_transparent', - 'url': source_url, - 'ie_key': ie_key, - 'id': video_id, - 'display_id': display_id, + 'id': post_id, 'title': title, - 'description': description, - 'view_count': view_count, - 'thumbnail': thumbnail, + 'timestamp': int_or_none(post.get('creationTs')), + 'duration': duration, + 'formats': formats, + 'thumbnails': thumbnails, + 'like_count': get_count('upVote'), + 'dislike_count': get_count('downVote'), + 'comment_count': get_count('comments'), + 'age_limit': 18 if post.get('nsfw') == 1 else None, + 'categories': [section] if section else None, + 'tags': tags, } diff --git a/youtube_dl/extractor/njpwworld.py b/youtube_dl/extractor/njpwworld.py index 025c5d249..3639d142f 100644 --- a/youtube_dl/extractor/njpwworld.py +++ b/youtube_dl/extractor/njpwworld.py @@ -6,30 +6,40 @@ import re from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( - extract_attributes, get_element_by_class, urlencode_postdata, ) class NJPWWorldIE(InfoExtractor): - _VALID_URL = r'https?://njpwworld\.com/p/(?P[a-z0-9_]+)' + _VALID_URL = r'https?://(front\.)?njpwworld\.com/p/(?P[a-z0-9_]+)' IE_DESC = '新日本プロレスワールド' _NETRC_MACHINE = 'njpwworld' - _TEST = { + _TESTS = [{ 'url': 'http://njpwworld.com/p/s_series_00155_1_9/', 'info_dict': { 'id': 's_series_00155_1_9', 'ext': 'mp4', - 'title': '第9試合 ランディ・サベージ vs リック・スタイナー', + 'title': '闘強導夢2000 2000年1月4日 東京ドーム 第9試合 ランディ・サベージ VS リック・スタイナー', 'tags': list, }, 'params': { 'skip_download': True, # AES-encrypted m3u8 }, 'skip': 'Requires login', - } + }, { + 'url': 'https://front.njpwworld.com/p/s_series_00563_16_bs', + 'info_dict': { + 'id': 's_series_00563_16_bs', + 'ext': 'mp4', + 'title': 'WORLD TAG LEAGUE 2020 & BEST OF THE SUPER Jr.27 2020年12月6日 福岡・福岡国際センター バックステージコメント(字幕あり)', + 'tags': ["福岡・福岡国際センター", "バックステージコメント", "2020", "20年代"], + }, + 'params': { + 'skip_download': True, + }, + }] _LOGIN_URL = 'https://front.njpwworld.com/auth/login' @@ -64,35 +74,27 @@ class NJPWWorldIE(InfoExtractor): webpage = self._download_webpage(url, video_id) formats = [] - for mobj in re.finditer(r']+\bhref=(["\'])/player.+?[^>]*>', webpage): - player = extract_attributes(mobj.group(0)) - player_path = player.get('href') - if not player_path: - continue - kind = self._search_regex( - r'(low|high)$', player.get('class') or '', 'kind', - default='low') + for kind, vid in re.findall(r'if\s+\(\s*imageQualityType\s*==\s*\'([^\']+)\'\s*\)\s*{\s*video_id\s*=\s*"(\d+)"', webpage): + player_path = '/intent?id=%s&type=url' % vid player_url = compat_urlparse.urljoin(url, player_path) - player_page = self._download_webpage( - player_url, video_id, note='Downloading player page') - entries = self._parse_html5_media_entries( - player_url, player_page, video_id, m3u8_id='hls-%s' % kind, - m3u8_entry_protocol='m3u8_native') - kind_formats = entries[0]['formats'] - for f in kind_formats: - f['quality'] = 2 if kind == 'high' else 1 - formats.extend(kind_formats) + formats.append({ + 'url': player_url, + 'format_id': kind, + 'ext': 'mp4', + 'protocol': 'm3u8', + 'quality': 2 if kind == 'high' else 1, + }) self._sort_formats(formats) - post_content = get_element_by_class('post-content', webpage) + tag_block = get_element_by_class('tag-block', webpage) tags = re.findall( - r']+class="tag-[^"]+">]*>([^<]+)', post_content - ) if post_content else None + r']+class="tag-[^"]+"[^>]*>([^<]+)', tag_block + ) if tag_block else None return { 'id': video_id, - 'title': self._og_search_title(webpage), + 'title': get_element_by_class('article-title', webpage) or self._og_search_title(webpage), 'formats': formats, 'tags': tags, } diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 40dee2162..6d01a25c3 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -58,7 +58,7 @@ class NRKBaseIE(InfoExtractor): def _call_api(self, path, video_id, item=None, note=None, fatal=True, query=None): return self._download_json( - urljoin('http://psapi.nrk.no/', path), + urljoin('https://psapi.nrk.no/', path), video_id, note or 'Downloading %s JSON' % item, fatal=fatal, query=query, headers={'Accept-Encoding': 'gzip, deflate, br'}) diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 700ce448c..8d537d7ae 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -98,6 +98,9 @@ class ORFTVthekIE(InfoExtractor): elif ext == 'f4m': formats.extend(self._extract_f4m_formats( src, video_id, f4m_id=format_id, fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + src, video_id, mpd_id=format_id, fatal=False)) else: formats.append({ 'format_id': format_id, @@ -140,6 +143,25 @@ class ORFTVthekIE(InfoExtractor): }) upload_date = unified_strdate(sd.get('created_date')) + + thumbnails = [] + preview = sd.get('preview_image_url') + if preview: + thumbnails.append({ + 'id': 'preview', + 'url': preview, + 'preference': 0, + }) + image = sd.get('image_full_url') + if not image and len(data_jsb) == 1: + image = self._og_search_thumbnail(webpage) + if image: + thumbnails.append({ + 'id': 'full', + 'url': image, + 'preference': 1, + }) + entries.append({ '_type': 'video', 'id': video_id, @@ -149,7 +171,7 @@ class ORFTVthekIE(InfoExtractor): 'description': sd.get('description'), 'duration': int_or_none(sd.get('duration_in_seconds')), 'upload_date': upload_date, - 'thumbnail': sd.get('image_full_url'), + 'thumbnails': thumbnails, }) return { @@ -182,7 +204,7 @@ class ORFRadioIE(InfoExtractor): duration = end - start if end and start else None entries.append({ 'id': loop_stream_id.replace('.mp3', ''), - 'url': 'http://loopstream01.apa.at/?channel=%s&id=%s' % (self._LOOP_STATION, loop_stream_id), + 'url': 'https://loopstream01.apa.at/?channel=%s&id=%s' % (self._LOOP_STATION, loop_stream_id), 'title': title, 'description': clean_html(data.get('subtitle')), 'duration': duration, diff --git a/youtube_dl/extractor/palcomp3.py b/youtube_dl/extractor/palcomp3.py new file mode 100644 index 000000000..fb29d83f9 --- /dev/null +++ b/youtube_dl/extractor/palcomp3.py @@ -0,0 +1,148 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + str_or_none, + try_get, +) + + +class PalcoMP3BaseIE(InfoExtractor): + _GQL_QUERY_TMPL = '''{ + artist(slug: "%s") { + %s + } +}''' + _ARTIST_FIELDS_TMPL = '''music(slug: "%%s") { + %s + }''' + _MUSIC_FIELDS = '''duration + hls + mp3File + musicID + plays + title''' + + def _call_api(self, artist_slug, artist_fields): + return self._download_json( + 'https://www.palcomp3.com.br/graphql/', artist_slug, query={ + 'query': self._GQL_QUERY_TMPL % (artist_slug, artist_fields), + })['data'] + + def _parse_music(self, music): + music_id = compat_str(music['musicID']) + title = music['title'] + + formats = [] + hls_url = music.get('hls') + if hls_url: + formats.append({ + 'url': hls_url, + 'protocol': 'm3u8_native', + 'ext': 'mp4', + }) + mp3_file = music.get('mp3File') + if mp3_file: + formats.append({ + 'url': mp3_file, + }) + + return { + 'id': music_id, + 'title': title, + 'formats': formats, + 'duration': int_or_none(music.get('duration')), + 'view_count': int_or_none(music.get('plays')), + } + + def _real_initialize(self): + self._ARTIST_FIELDS_TMPL = self._ARTIST_FIELDS_TMPL % self._MUSIC_FIELDS + + def _real_extract(self, url): + artist_slug, music_slug = re.match(self._VALID_URL, url).groups() + artist_fields = self._ARTIST_FIELDS_TMPL % music_slug + music = self._call_api(artist_slug, artist_fields)['artist']['music'] + return self._parse_music(music) + + +class PalcoMP3IE(PalcoMP3BaseIE): + IE_NAME = 'PalcoMP3:song' + _VALID_URL = r'https?://(?:www\.)?palcomp3\.com(?:\.br)?/(?P[^/]+)/(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://www.palcomp3.com/maiaraemaraisaoficial/nossas-composicoes-cuida-bem-dela/', + 'md5': '99fd6405b2d8fd589670f6db1ba3b358', + 'info_dict': { + 'id': '3162927', + 'ext': 'mp3', + 'title': 'Nossas Composições - CUIDA BEM DELA', + 'duration': 210, + 'view_count': int, + } + }] + + @classmethod + def suitable(cls, url): + return False if PalcoMP3VideoIE.suitable(url) else super(PalcoMP3IE, cls).suitable(url) + + +class PalcoMP3ArtistIE(PalcoMP3BaseIE): + IE_NAME = 'PalcoMP3:artist' + _VALID_URL = r'https?://(?:www\.)?palcomp3\.com(?:\.br)?/(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://www.palcomp3.com.br/condedoforro/', + 'info_dict': { + 'id': '358396', + 'title': 'Conde do Forró', + }, + 'playlist_mincount': 188, + }] + _ARTIST_FIELDS_TMPL = '''artistID + musics { + nodes { + %s + } + } + name''' + + @ classmethod + def suitable(cls, url): + return False if re.match(PalcoMP3IE._VALID_URL, url) else super(PalcoMP3ArtistIE, cls).suitable(url) + + def _real_extract(self, url): + artist_slug = self._match_id(url) + artist = self._call_api(artist_slug, self._ARTIST_FIELDS_TMPL)['artist'] + + def entries(): + for music in (try_get(artist, lambda x: x['musics']['nodes'], list) or []): + yield self._parse_music(music) + + return self.playlist_result( + entries(), str_or_none(artist.get('artistID')), artist.get('name')) + + +class PalcoMP3VideoIE(PalcoMP3BaseIE): + IE_NAME = 'PalcoMP3:video' + _VALID_URL = r'https?://(?:www\.)?palcomp3\.com(?:\.br)?/(?P[^/]+)/(?P[^/?&#]+)/?#clipe' + _TESTS = [{ + 'url': 'https://www.palcomp3.com/maiaraemaraisaoficial/maiara-e-maraisa-voce-faz-falta-aqui-ao-vivo-em-vicosa-mg/#clipe', + 'add_ie': ['Youtube'], + 'info_dict': { + 'id': '_pD1nR2qqPg', + 'ext': 'mp4', + 'title': 'Maiara e Maraisa - Você Faz Falta Aqui - DVD Ao Vivo Em Campo Grande', + 'description': 'md5:7043342c09a224598e93546e98e49282', + 'upload_date': '20161107', + 'uploader_id': 'maiaramaraisaoficial', + 'uploader': 'Maiara e Maraisa', + } + }] + _MUSIC_FIELDS = 'youtubeID' + + def _parse_music(self, music): + youtube_id = music['youtubeID'] + return self.url_result(youtube_id, 'Youtube', youtube_id) diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py index c39d12728..3af533925 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -413,7 +413,8 @@ class PeerTubeIE(InfoExtractor): peertube3\.cpy\.re| peertube2\.cpy\.re| videos\.tcit\.fr| - peertube\.cpy\.re + peertube\.cpy\.re| + canard\.tube )''' _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}' _API_BASE = 'https://%s/api/v1/videos/%s/%s' @@ -450,6 +451,18 @@ class PeerTubeIE(InfoExtractor): 'tags': ['framasoft', 'peertube'], 'categories': ['Science & Technology'], } + }, { + # Issue #26002 + 'url': 'peertube:spacepub.space:d8943b2d-8280-497b-85ec-bc282ec2afdc', + 'info_dict': { + 'id': 'd8943b2d-8280-497b-85ec-bc282ec2afdc', + 'ext': 'mp4', + 'title': 'Dot matrix printer shell demo', + 'uploader_id': '3', + 'timestamp': 1587401293, + 'upload_date': '20200420', + 'uploader': 'Drew DeVault', + } }, { 'url': 'https://peertube.tamanoir.foucry.net/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44', 'only_matching': True, @@ -526,7 +539,15 @@ class PeerTubeIE(InfoExtractor): title = video['name'] formats = [] - for file_ in video['files']: + files = video.get('files') or [] + for playlist in (video.get('streamingPlaylists') or []): + if not isinstance(playlist, dict): + continue + playlist_files = playlist.get('files') + if not (playlist_files and isinstance(playlist_files, list)): + continue + files.extend(playlist_files) + for file_ in files: if not isinstance(file_, dict): continue file_url = url_or_none(file_.get('fileUrl')) @@ -548,15 +569,15 @@ class PeerTubeIE(InfoExtractor): formats.append(f) self._sort_formats(formats) - full_description = self._call_api( - host, video_id, 'description', note='Downloading description JSON', - fatal=False) + description = video.get('description') + if len(description) >= 250: + # description is shortened + full_description = self._call_api( + host, video_id, 'description', note='Downloading description JSON', + fatal=False) - description = None - if isinstance(full_description, dict): - description = str_or_none(full_description.get('description')) - if not description: - description = video.get('description') + if isinstance(full_description, dict): + description = str_or_none(full_description.get('description')) or description subtitles = self.extract_subtitles(host, video_id) @@ -578,11 +599,13 @@ class PeerTubeIE(InfoExtractor): else: age_limit = None + webpage_url = 'https://%s/videos/watch/%s' % (host, video_id) + return { 'id': video_id, 'title': title, 'description': description, - 'thumbnail': urljoin(url, video.get('thumbnailPath')), + 'thumbnail': urljoin(webpage_url, video.get('thumbnailPath')), 'timestamp': unified_timestamp(video.get('publishedAt')), 'uploader': account_data('displayName', compat_str), 'uploader_id': str_or_none(account_data('id', int)), @@ -600,5 +623,6 @@ class PeerTubeIE(InfoExtractor): 'tags': try_get(video, lambda x: x['tags'], list), 'categories': categories, 'formats': formats, - 'subtitles': subtitles + 'subtitles': subtitles, + 'webpage_url': webpage_url, } diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index b15906390..b93a02b7d 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -12,6 +12,10 @@ from ..utils import ( class PeriscopeBaseIE(InfoExtractor): + _M3U8_HEADERS = { + 'Referer': 'https://www.periscope.tv/' + } + def _call_api(self, method, query, item_id): return self._download_json( 'https://api.periscope.tv/api/v2/%s' % method, @@ -54,9 +58,11 @@ class PeriscopeBaseIE(InfoExtractor): m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native' if state in ('ended', 'timed_out') else 'm3u8', - m3u8_id=format_id, fatal=fatal) + m3u8_id=format_id, fatal=fatal, headers=self._M3U8_HEADERS) if len(m3u8_formats) == 1: self._add_width_and_height(m3u8_formats[0], width, height) + for f in m3u8_formats: + f.setdefault('http_headers', {}).update(self._M3U8_HEADERS) return m3u8_formats diff --git a/youtube_dl/extractor/phoenix.py b/youtube_dl/extractor/phoenix.py index e435c28e1..e3ea01443 100644 --- a/youtube_dl/extractor/phoenix.py +++ b/youtube_dl/extractor/phoenix.py @@ -1,45 +1,133 @@ +# coding: utf-8 from __future__ import unicode_literals -from .dreisat import DreiSatIE +import re + +from .youtube import YoutubeIE +from .zdf import ZDFBaseIE +from ..compat import compat_str +from ..utils import ( + int_or_none, + merge_dicts, + try_get, + unified_timestamp, + urljoin, +) -class PhoenixIE(DreiSatIE): +class PhoenixIE(ZDFBaseIE): IE_NAME = 'phoenix.de' - _VALID_URL = r'''(?x)https?://(?:www\.)?phoenix\.de/content/ - (?: - phoenix/die_sendungen/(?:[^/]+/)? - )? - (?P[0-9]+)''' - _TESTS = [ - { - 'url': 'http://www.phoenix.de/content/884301', - 'md5': 'ed249f045256150c92e72dbb70eadec6', - 'info_dict': { - 'id': '884301', - 'ext': 'mp4', - 'title': 'Michael Krons mit Hans-Werner Sinn', - 'description': 'Im Dialog - Sa. 25.10.14, 00.00 - 00.35 Uhr', - 'upload_date': '20141025', - 'uploader': 'Im Dialog', - } + _VALID_URL = r'https?://(?:www\.)?phoenix\.de/(?:[^/]+/)*[^/?#&]*-a-(?P\d+)\.html' + _TESTS = [{ + # Same as https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html + 'url': 'https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html', + 'md5': '34ec321e7eb34231fd88616c65c92db0', + 'info_dict': { + 'id': '210222_phx_nachgehakt_corona_protest', + 'ext': 'mp4', + 'title': 'Wohin führt der Protest in der Pandemie?', + 'description': 'md5:7d643fe7f565e53a24aac036b2122fbd', + 'duration': 1691, + 'timestamp': 1613902500, + 'upload_date': '20210221', + 'uploader': 'Phoenix', + 'series': 'corona nachgehakt', + 'episode': 'Wohin führt der Protest in der Pandemie?', }, - { - 'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/869815', - 'only_matching': True, + }, { + # Youtube embed + 'url': 'https://www.phoenix.de/sendungen/gespraeche/phoenix-streitgut-brennglas-corona-a-1965505.html', + 'info_dict': { + 'id': 'hMQtqFYjomk', + 'ext': 'mp4', + 'title': 'phoenix streitgut: Brennglas Corona - Wie gerecht ist unsere Gesellschaft?', + 'description': 'md5:ac7a02e2eb3cb17600bc372e4ab28fdd', + 'duration': 3509, + 'upload_date': '20201219', + 'uploader': 'phoenix', + 'uploader_id': 'phoenix', }, - { - 'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/diskussionen/928234', - 'only_matching': True, + 'params': { + 'skip_download': True, }, - ] + }, { + 'url': 'https://www.phoenix.de/entwicklungen-in-russland-a-2044720.html', + 'only_matching': True, + }, { + # no media + 'url': 'https://www.phoenix.de/sendungen/dokumentationen/mit-dem-jumbo-durch-die-nacht-a-89625.html', + 'only_matching': True, + }, { + # Same as https://www.zdf.de/politik/phoenix-sendungen/die-gesten-der-maechtigen-100.html + 'url': 'https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche', + 'only_matching': True, + }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + article_id = self._match_id(url) - internal_id = self._search_regex( - r'
[a-zA-Z0-9]+)(?:/(?P[a-zA-Z0-9]+))?' + _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P[a-zA-Z0-9]+)' _TEST = { 'url': 'https://picarto.tv/Setz', 'info_dict': { @@ -34,65 +27,46 @@ class PicartoIE(InfoExtractor): return False if PicartoVodIE.suitable(url) else super(PicartoIE, cls).suitable(url) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - channel_id = mobj.group('id') + channel_id = self._match_id(url) - metadata = self._download_json( - 'https://api.picarto.tv/v1/channel/name/' + channel_id, - channel_id) + data = self._download_json( + 'https://ptvintern.picarto.tv/ptvapi', channel_id, query={ + 'query': '''{ + channel(name: "%s") { + adult + id + online + stream_name + title + } + getLoadBalancerUrl(channel_name: "%s") { + url + } +}''' % (channel_id, channel_id), + })['data'] + metadata = data['channel'] - if metadata.get('online') is False: + if metadata.get('online') == 0: raise ExtractorError('Stream is offline', expected=True) + title = metadata['title'] cdn_data = self._download_json( - 'https://picarto.tv/process/channel', channel_id, - data=urlencode_postdata({'loadbalancinginfo': channel_id}), - note='Downloading load balancing info') + data['getLoadBalancerUrl']['url'] + '/stream/json_' + metadata['stream_name'] + '.js', + channel_id, 'Downloading load balancing info') - token = mobj.group('token') or 'public' - params = { - 'con': int(time.time() * 1000), - 'token': token, - } - - prefered_edge = cdn_data.get('preferedEdge') formats = [] - - for edge in cdn_data['edges']: - edge_ep = edge.get('ep') - if not edge_ep or not isinstance(edge_ep, compat_str): + for source in (cdn_data.get('source') or []): + source_url = source.get('url') + if not source_url: continue - edge_id = edge.get('id') - for tech in cdn_data['techs']: - tech_label = tech.get('label') - tech_type = tech.get('type') - preference = 0 - if edge_id == prefered_edge: - preference += 1 - format_id = [] - if edge_id: - format_id.append(edge_id) - if tech_type == 'application/x-mpegurl' or tech_label == 'HLS': - format_id.append('hls') - formats.extend(self._extract_m3u8_formats( - update_url_query( - 'https://%s/hls/%s/index.m3u8' - % (edge_ep, channel_id), params), - channel_id, 'mp4', preference=preference, - m3u8_id='-'.join(format_id), fatal=False)) - continue - elif tech_type == 'video/mp4' or tech_label == 'MP4': - format_id.append('mp4') - formats.append({ - 'url': update_url_query( - 'https://%s/mp4/%s.mp4' % (edge_ep, channel_id), - params), - 'format_id': '-'.join(format_id), - 'preference': preference, - }) - else: - # rtmp format does not seem to work - continue + source_type = source.get('type') + if source_type == 'html5/application/vnd.apple.mpegurl': + formats.extend(self._extract_m3u8_formats( + source_url, channel_id, 'mp4', m3u8_id='hls', fatal=False)) + elif source_type == 'html5/video/mp4': + formats.append({ + 'url': source_url, + }) self._sort_formats(formats) mature = metadata.get('adult') @@ -103,10 +77,10 @@ class PicartoIE(InfoExtractor): return { 'id': channel_id, - 'title': self._live_title(metadata.get('title') or channel_id), + 'title': self._live_title(title.strip()), 'is_live': True, - 'thumbnail': try_get(metadata, lambda x: x['thumbnails']['web']), 'channel': channel_id, + 'channel_id': metadata.get('id'), 'channel_url': 'https://picarto.tv/%s' % channel_id, 'age_limit': age_limit, 'formats': formats, diff --git a/youtube_dl/extractor/pinterest.py b/youtube_dl/extractor/pinterest.py index b249c9eda..42528d746 100644 --- a/youtube_dl/extractor/pinterest.py +++ b/youtube_dl/extractor/pinterest.py @@ -31,6 +31,7 @@ class PinterestBaseIE(InfoExtractor): title = (data.get('title') or data.get('grid_title') or video_id).strip() + urls = [] formats = [] duration = None if extract_formats: @@ -38,8 +39,9 @@ class PinterestBaseIE(InfoExtractor): if not isinstance(format_dict, dict): continue format_url = url_or_none(format_dict.get('url')) - if not format_url: + if not format_url or format_url in urls: continue + urls.append(format_url) duration = float_or_none(format_dict.get('duration'), scale=1000) ext = determine_ext(format_url) if 'hls' in format_id.lower() or ext == 'm3u8': diff --git a/youtube_dl/extractor/playstuff.py b/youtube_dl/extractor/playstuff.py new file mode 100644 index 000000000..5a329957f --- /dev/null +++ b/youtube_dl/extractor/playstuff.py @@ -0,0 +1,65 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + smuggle_url, + try_get, +) + + +class PlayStuffIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?play\.stuff\.co\.nz/details/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://play.stuff.co.nz/details/608778ac1de1c4001a3fa09a', + 'md5': 'c82d3669e5247c64bc382577843e5bd0', + 'info_dict': { + 'id': '6250584958001', + 'ext': 'mp4', + 'title': 'Episode 1: Rotorua/Mt Maunganui/Tauranga', + 'description': 'md5:c154bafb9f0dd02d01fd4100fb1c1913', + 'uploader_id': '6005208634001', + 'timestamp': 1619491027, + 'upload_date': '20210427', + }, + 'add_ie': ['BrightcoveNew'], + }, { + # geo restricted, bypassable + 'url': 'https://play.stuff.co.nz/details/_6155660351001', + 'only_matching': True, + }] + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + state = self._parse_json( + self._search_regex( + r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'state'), + video_id) + + account_id = try_get( + state, lambda x: x['configurations']['accountId'], + compat_str) or '6005208634001' + player_id = try_get( + state, lambda x: x['configurations']['playerId'], + compat_str) or 'default' + + entries = [] + for item_id, video in state['items'].items(): + if not isinstance(video, dict): + continue + asset_id = try_get( + video, lambda x: x['content']['attributes']['assetId'], + compat_str) + if not asset_id: + continue + entries.append(self.url_result( + smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, asset_id), + {'geo_countries': ['NZ']}), + 'BrightcoveNew', video_id)) + + return self.playlist_result(entries, video_id) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index abd08bc28..2d63855df 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -393,7 +393,7 @@ query viewClip { # To somewhat reduce the probability of these consequences # we will sleep random amount of time before each call to ViewClip. self._sleep( - random.randint(2, 5), display_id, + random.randint(5, 10), display_id, '%(video_id)s: Waiting for %(timeout)s seconds to avoid throttling') if not viewclip: diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 2fcbd186f..e2e1500ff 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -22,11 +22,16 @@ from ..utils import ( orderedSet, remove_quotes, str_to_int, + update_url_query, + urlencode_postdata, url_or_none, ) class PornHubBaseIE(InfoExtractor): + _NETRC_MACHINE = 'pornhub' + _PORNHUB_HOST_RE = r'(?:(?Ppornhub(?:premium)?\.(?:com|net|org))|pornhubthbh7ap3u\.onion)' + def _download_webpage_handle(self, *args, **kwargs): def dl(*args, **kwargs): return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs) @@ -52,17 +57,79 @@ class PornHubBaseIE(InfoExtractor): return webpage, urlh + def _real_initialize(self): + self._logged_in = False + + def _login(self, host): + if self._logged_in: + return + + site = host.split('.')[0] + + # Both sites pornhub and pornhubpremium have separate accounts + # so there should be an option to provide credentials for both. + # At the same time some videos are available under the same video id + # on both sites so that we have to identify them as the same video. + # For that purpose we have to keep both in the same extractor + # but under different netrc machines. + username, password = self._get_login_info(netrc_machine=site) + if username is None: + return + + login_url = 'https://www.%s/%slogin' % (host, 'premium/' if 'premium' in host else '') + login_page = self._download_webpage( + login_url, None, 'Downloading %s login page' % site) + + def is_logged(webpage): + return any(re.search(p, webpage) for p in ( + r'class=["\']signOut', + r'>Sign\s+[Oo]ut\s*<')) + + if is_logged(login_page): + self._logged_in = True + return + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'username': username, + 'password': password, + }) + + response = self._download_json( + 'https://www.%s/front/authenticate' % host, None, + 'Logging in to %s' % site, + data=urlencode_postdata(login_form), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Referer': login_url, + 'X-Requested-With': 'XMLHttpRequest', + }) + + if response.get('success') == '1': + self._logged_in = True + return + + message = response.get('message') + if message is not None: + raise ExtractorError( + 'Unable to login: %s' % message, expected=True) + + raise ExtractorError('Unable to log in') + class PornHubIE(PornHubBaseIE): IE_DESC = 'PornHub and Thumbzilla' _VALID_URL = r'''(?x) https?:// (?: - (?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net|org))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| + (?:[^/]+\.)? + %s + /(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| (?:www\.)?thumbzilla\.com/video/ ) (?P[\da-z]+) - ''' + ''' % PornHubBaseIE._PORNHUB_HOST_RE _TESTS = [{ 'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015', 'md5': 'a6391306d050e4547f62b3f485dd9ba9', @@ -103,6 +170,7 @@ class PornHubIE(PornHubBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'Video has been flagged for verification in accordance with our trust and safety policy', }, { # subtitles 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7', @@ -163,12 +231,27 @@ class PornHubIE(PornHubBaseIE): }, { 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82', 'only_matching': True, + }, { + # Some videos are available with the same id on both premium + # and non-premium sites (e.g. this and the following test) + 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5f75b0f4b18e3', + 'only_matching': True, + }, { + 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5f75b0f4b18e3', + 'only_matching': True, + }, { + # geo restricted + 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5a9813bfa7156', + 'only_matching': True, + }, { + 'url': 'http://pornhubthbh7ap3u.onion/view_video.php?viewkey=ph5a9813bfa7156', + 'only_matching': True, }] @staticmethod def _extract_urls(webpage): return re.findall( - r']+?src=["\'](?P(?:https?:)?//(?:www\.)?pornhub\.(?:com|net|org)/embed/[\da-z]+)', + r']+?src=["\'](?P(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)', webpage) def _extract_count(self, pattern, webpage, name): @@ -180,12 +263,7 @@ class PornHubIE(PornHubBaseIE): host = mobj.group('host') or 'pornhub.com' video_id = mobj.group('id') - if 'premium' in host: - if not self._downloader.params.get('cookiefile'): - raise ExtractorError( - 'PornHub Premium requires authentication.' - ' You may want to use --cookies.', - expected=True) + self._login(host) self._set_cookie(host, 'age_verified', '1') @@ -198,7 +276,8 @@ class PornHubIE(PornHubBaseIE): webpage = dl_webpage('pc') error_msg = self._html_search_regex( - r'(?s)]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P.+?)
', + (r'(?s)]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P.+?)', + r'(?s)]+class=["\']noVideo["\'][^>]*>(?P.+?)'), webpage, 'error message', default=None, group='error') if error_msg: error_msg = re.sub(r'\s+', ' ', error_msg) @@ -206,6 +285,11 @@ class PornHubIE(PornHubBaseIE): 'PornHub said: %s' % error_msg, expected=True, video_id=video_id) + if any(re.search(p, webpage) for p in ( + r'class=["\']geoBlocked["\']', + r'>\s*This content is unavailable in your country')): + self.raise_geo_restricted() + # video_title from flashvars contains whitespace instead of non-ASCII (see # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying # on that anymore. @@ -327,35 +411,49 @@ class PornHubIE(PornHubBaseIE): upload_date = None formats = [] + + def add_format(format_url, height=None): + ext = determine_ext(format_url) + if ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + return + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + return + if not height: + height = int_or_none(self._search_regex( + r'(?P\d+)[pP]?_\d+[kK]', format_url, 'height', + default=None)) + formats.append({ + 'url': format_url, + 'format_id': '%dp' % height if height else None, + 'height': height, + }) + for video_url, height in video_urls: if not upload_date: upload_date = self._search_regex( r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None) if upload_date: upload_date = upload_date.replace('/', '') - ext = determine_ext(video_url) - if ext == 'mpd': - formats.extend(self._extract_mpd_formats( - video_url, video_id, mpd_id='dash', fatal=False)) + if '/video/get_media' in video_url: + medias = self._download_json(video_url, video_id, fatal=False) + if isinstance(medias, list): + for media in medias: + if not isinstance(media, dict): + continue + video_url = url_or_none(media.get('videoUrl')) + if not video_url: + continue + height = int_or_none(media.get('quality')) + add_format(video_url, height) continue - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - continue - tbr = None - mobj = re.search(r'(?P\d+)[pP]?_(?P\d+)[kK]', video_url) - if mobj: - if not height: - height = int(mobj.group('height')) - tbr = int(mobj.group('tbr')) - formats.append({ - 'url': video_url, - 'format_id': '%dp' % height if height else None, - 'height': height, - 'tbr': tbr, - }) - self._sort_formats(formats) + add_format(video_url) + self._sort_formats( + formats, field_preference=('height', 'width', 'fps', 'format_id')) video_uploader = self._html_search_regex( r'(?s)From: .+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<', @@ -405,6 +503,10 @@ class PornHubIE(PornHubBaseIE): class PornHubPlaylistBaseIE(PornHubBaseIE): + def _extract_page(self, url): + return int_or_none(self._search_regex( + r'\bpage=(\d+)', url, 'page', default=None)) + def _extract_entries(self, webpage, host): # Only process container div with main playlist content skipping # drop-down menu that uses similar pattern for videos (see @@ -422,29 +524,9 @@ class PornHubPlaylistBaseIE(PornHubBaseIE): container)) ] - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - host = mobj.group('host') - playlist_id = mobj.group('id') - - webpage = self._download_webpage(url, playlist_id) - - entries = self._extract_entries(webpage, host) - - playlist = self._parse_json( - self._search_regex( - r'(?:playlistObject|PLAYLIST_VIEW)\s*=\s*({.+?});', webpage, - 'playlist', default='{}'), - playlist_id, fatal=False) - title = playlist.get('title') or self._search_regex( - r'>Videos\s+in\s+(.+?)\s+[Pp]laylist<', webpage, 'title', fatal=False) - - return self.playlist_result( - entries, playlist_id, title, playlist.get('description')) - class PornHubUserIE(PornHubPlaylistBaseIE): - _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P[^/?#&]+))(?:[?#&]|/(?!videos)|$)' + _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?%s/(?:(?:user|channel)s|model|pornstar)/(?P[^/?#&]+))(?:[?#&]|/(?!videos)|$)' % PornHubBaseIE._PORNHUB_HOST_RE _TESTS = [{ 'url': 'https://www.pornhub.com/model/zoe_ph', 'playlist_mincount': 118, @@ -463,14 +545,30 @@ class PornHubUserIE(PornHubPlaylistBaseIE): }, { 'url': 'https://www.pornhub.com/model/zoe_ph?abc=1', 'only_matching': True, + }, { + # Unavailable via /videos page, but available with direct pagination + # on pornstar page (see [1]), requires premium + # 1. https://github.com/ytdl-org/youtube-dl/issues/27853 + 'url': 'https://www.pornhubpremium.com/pornstar/sienna-west', + 'only_matching': True, + }, { + # Same as before, multi page + 'url': 'https://www.pornhubpremium.com/pornstar/lily-labeau', + 'only_matching': True, + }, { + 'url': 'https://pornhubthbh7ap3u.onion/model/zoe_ph', + 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) user_id = mobj.group('id') + videos_url = '%s/videos' % mobj.group('url') + page = self._extract_page(url) + if page: + videos_url = update_url_query(videos_url, {'page': page}) return self.url_result( - '%s/videos' % mobj.group('url'), ie=PornHubPagedVideoListIE.ie_key(), - video_id=user_id) + videos_url, ie=PornHubPagedVideoListIE.ie_key(), video_id=user_id) class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): @@ -483,36 +581,59 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): ]+\bid=["\']moreDataBtn ''', webpage) is not None - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - host = mobj.group('host') - item_id = mobj.group('id') + def _entries(self, url, host, item_id): + page = self._extract_page(url) - page = int_or_none(self._search_regex( - r'\bpage=(\d+)', url, 'page', default=None)) + VIDEOS = '/videos' - entries = [] - for page_num in (page, ) if page is not None else itertools.count(1): + def download_page(base_url, num, fallback=False): + note = 'Downloading page %d%s' % (num, ' (switch to fallback)' if fallback else '') + return self._download_webpage( + base_url, item_id, note, query={'page': num}) + + def is_404(e): + return isinstance(e.cause, compat_HTTPError) and e.cause.code == 404 + + base_url = url + has_page = page is not None + first_page = page if has_page else 1 + for page_num in (first_page, ) if has_page else itertools.count(first_page): try: - webpage = self._download_webpage( - url, item_id, 'Downloading page %d' % page_num, - query={'page': page_num}) + try: + webpage = download_page(base_url, page_num) + except ExtractorError as e: + # Some sources may not be available via /videos page, + # trying to fallback to main page pagination (see [1]) + # 1. https://github.com/ytdl-org/youtube-dl/issues/27853 + if is_404(e) and page_num == first_page and VIDEOS in base_url: + base_url = base_url.replace(VIDEOS, '') + webpage = download_page(base_url, page_num, fallback=True) + else: + raise except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + if is_404(e) and page_num != first_page: break raise page_entries = self._extract_entries(webpage, host) if not page_entries: break - entries.extend(page_entries) + for e in page_entries: + yield e if not self._has_more(webpage): break - return self.playlist_result(orderedSet(entries), item_id) + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + item_id = mobj.group('id') + + self._login(host) + + return self.playlist_result(self._entries(url, host, item_id), item_id) class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): - _VALID_URL = r'https?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net|org))/(?P(?:[^/]+/)*[^/?#&]+)' + _VALID_URL = r'https?://(?:[^/]+\.)?%s/(?P(?:[^/]+/)*[^/?#&]+)' % PornHubBaseIE._PORNHUB_HOST_RE _TESTS = [{ 'url': 'https://www.pornhub.com/model/zoe_ph/videos', 'only_matching': True, @@ -617,6 +738,9 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): }, { 'url': 'https://de.pornhub.com/playlist/4667351', 'only_matching': True, + }, { + 'url': 'https://pornhubthbh7ap3u.onion/model/zoe_ph/videos', + 'only_matching': True, }] @classmethod @@ -627,7 +751,7 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE): - _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P[^/]+)/videos/upload)' + _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?%s/(?:(?:user|channel)s|model|pornstar)/(?P[^/]+)/videos/upload)' % PornHubBaseIE._PORNHUB_HOST_RE _TESTS = [{ 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload', 'info_dict': { @@ -637,4 +761,7 @@ class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE): }, { 'url': 'https://www.pornhub.com/model/zoe_ph/videos/upload', 'only_matching': True, + }, { + 'url': 'http://pornhubthbh7ap3u.onion/pornstar/jenny-blighe/videos/upload', + 'only_matching': True, }] diff --git a/youtube_dl/extractor/rds.py b/youtube_dl/extractor/rds.py index 8c016a77d..0c497856e 100644 --- a/youtube_dl/extractor/rds.py +++ b/youtube_dl/extractor/rds.py @@ -15,17 +15,17 @@ class RDSIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P[^/]+)-\d+\.\d+' _TESTS = [{ - 'url': 'http://www.rds.ca/videos/football/nfl/fowler-jr-prend-la-direction-de-jacksonville-3.1132799', + # has two 9c9media ContentPackages, the web player selects the first ContentPackage + 'url': 'https://www.rds.ca/videos/Hockey/NationalHockeyLeague/teams/9/forum-du-5-a-7-jesperi-kotkaniemi-de-retour-de-finlande-3.1377606', 'info_dict': { - 'id': '604333', - 'display_id': 'fowler-jr-prend-la-direction-de-jacksonville', + 'id': '2083309', + 'display_id': 'forum-du-5-a-7-jesperi-kotkaniemi-de-retour-de-finlande', 'ext': 'flv', - 'title': 'Fowler Jr. prend la direction de Jacksonville', - 'description': 'Dante Fowler Jr. est le troisième choix du repêchage 2015 de la NFL. ', - 'timestamp': 1430397346, - 'upload_date': '20150430', - 'duration': 154.354, - 'age_limit': 0, + 'title': 'Forum du 5 à 7 : Kotkaniemi de retour de Finlande', + 'description': 'md5:83fa38ecc4a79b19e433433254077f25', + 'timestamp': 1606129030, + 'upload_date': '20201123', + 'duration': 773.039, } }, { 'url': 'http://www.rds.ca/vid%C3%A9os/un-voyage-positif-3.877934', diff --git a/youtube_dl/extractor/redbulltv.py b/youtube_dl/extractor/redbulltv.py index 3aae79f5d..6d000b372 100644 --- a/youtube_dl/extractor/redbulltv.py +++ b/youtube_dl/extractor/redbulltv.py @@ -133,8 +133,10 @@ class RedBullEmbedIE(RedBullTVIE): rrn_id = self._match_id(url) asset_id = self._download_json( 'https://edge-graphql.crepo-production.redbullaws.com/v1/graphql', - rrn_id, headers={'API-KEY': 'e90a1ff11335423998b100c929ecc866'}, - query={ + rrn_id, headers={ + 'Accept': 'application/json', + 'API-KEY': 'e90a1ff11335423998b100c929ecc866', + }, query={ 'query': '''{ resource(id: "%s", enforceGeoBlocking: false) { %s diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py index 48f17b828..aed35f8a9 100644 --- a/youtube_dl/extractor/rts.py +++ b/youtube_dl/extractor/rts.py @@ -6,11 +6,12 @@ import re from .srgssr import SRGSSRIE from ..compat import compat_str from ..utils import ( + determine_ext, int_or_none, parse_duration, parse_iso8601, unescapeHTML, - determine_ext, + urljoin, ) @@ -21,7 +22,7 @@ class RTSIE(SRGSSRIE): _TESTS = [ { 'url': 'http://www.rts.ch/archives/tv/divers/3449373-les-enfants-terribles.html', - 'md5': 'ff7f8450a90cf58dacb64e29707b4a8e', + 'md5': '753b877968ad8afaeddccc374d4256a5', 'info_dict': { 'id': '3449373', 'display_id': 'les-enfants-terribles', @@ -35,6 +36,7 @@ class RTSIE(SRGSSRIE): 'thumbnail': r're:^https?://.*\.image', 'view_count': int, }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], }, { 'url': 'http://www.rts.ch/emissions/passe-moi-les-jumelles/5624067-entre-ciel-et-mer.html', @@ -63,11 +65,12 @@ class RTSIE(SRGSSRIE): # m3u8 download 'skip_download': True, }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], 'skip': 'Blocked outside Switzerland', }, { 'url': 'http://www.rts.ch/video/info/journal-continu/5745356-londres-cachee-par-un-epais-smog.html', - 'md5': '1bae984fe7b1f78e94abc74e802ed99f', + 'md5': '9bb06503773c07ce83d3cbd793cebb91', 'info_dict': { 'id': '5745356', 'display_id': 'londres-cachee-par-un-epais-smog', @@ -81,6 +84,7 @@ class RTSIE(SRGSSRIE): 'thumbnail': r're:^https?://.*\.image', 'view_count': int, }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], }, { 'url': 'http://www.rts.ch/audio/couleur3/programmes/la-belle-video-de-stephane-laurenceau/5706148-urban-hippie-de-damien-krisl-03-04-2014.html', @@ -160,7 +164,7 @@ class RTSIE(SRGSSRIE): media_type = 'video' if 'video' in all_info else 'audio' # check for errors - self.get_media_data('rts', media_type, media_id) + self._get_media_data('rts', media_type, media_id) info = all_info['video']['JSONinfo'] if 'video' in all_info else all_info['audio'] @@ -194,6 +198,7 @@ class RTSIE(SRGSSRIE): 'tbr': extract_bitrate(format_url), }) + download_base = 'http://rtsww%s-d.rts.ch/' % ('-a' if media_type == 'audio' else '') for media in info.get('media', []): media_url = media.get('url') if not media_url or re.match(r'https?://', media_url): @@ -205,7 +210,7 @@ class RTSIE(SRGSSRIE): format_id += '-%dk' % rate formats.append({ 'format_id': format_id, - 'url': 'http://download-video.rts.ch/' + media_url, + 'url': urljoin(download_base, media_url), 'tbr': rate or extract_bitrate(media_url), }) diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index ce9db0629..d2fb754cf 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -2,8 +2,9 @@ from __future__ import unicode_literals import base64 +import io import re -import time +import sys from .common import InfoExtractor from ..compat import ( @@ -14,56 +15,13 @@ from ..utils import ( determine_ext, ExtractorError, float_or_none, + qualities, remove_end, remove_start, - sanitized_Request, std_headers, ) - -def _decrypt_url(png): - encrypted_data = compat_b64decode(png) - text_index = encrypted_data.find(b'tEXt') - text_chunk = encrypted_data[text_index - 4:] - length = compat_struct_unpack('!I', text_chunk[:4])[0] - # Use bytearray to get integers when iterating in both python 2.x and 3.x - data = bytearray(text_chunk[8:8 + length]) - data = [chr(b) for b in data if b != 0] - hash_index = data.index('#') - alphabet_data = data[:hash_index] - url_data = data[hash_index + 1:] - if url_data[0] == 'H' and url_data[3] == '%': - # remove useless HQ%% at the start - url_data = url_data[4:] - - alphabet = [] - e = 0 - d = 0 - for l in alphabet_data: - if d == 0: - alphabet.append(l) - d = e = (e + 1) % 4 - else: - d -= 1 - url = '' - f = 0 - e = 3 - b = 1 - for letter in url_data: - if f == 0: - l = int(letter) * 10 - f = 1 - else: - if e == 0: - l += int(letter) - url += alphabet[l] - e = (b + 3) % 4 - f = 0 - b += 1 - else: - e -= 1 - - return url +_bytes_to_chr = (lambda x: x) if sys.version_info[0] == 2 else (lambda x: map(chr, x)) class RTVEALaCartaIE(InfoExtractor): @@ -79,28 +37,31 @@ class RTVEALaCartaIE(InfoExtractor): 'ext': 'mp4', 'title': 'Balonmano - Swiss Cup masculina. Final: España-Suecia', 'duration': 5024.566, + 'series': 'Balonmano', }, + 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], }, { 'note': 'Live stream', 'url': 'http://www.rtve.es/alacarta/videos/television/24h-live/1694255/', 'info_dict': { 'id': '1694255', - 'ext': 'flv', - 'title': 'TODO', + 'ext': 'mp4', + 'title': 're:^24H LIVE [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'is_live': True, + }, + 'params': { + 'skip_download': 'live stream', }, - 'skip': 'The f4m manifest can\'t be used yet', }, { 'url': 'http://www.rtve.es/alacarta/videos/servir-y-proteger/servir-proteger-capitulo-104/4236788/', - 'md5': 'e55e162379ad587e9640eda4f7353c0f', + 'md5': 'd850f3c8731ea53952ebab489cf81cbf', 'info_dict': { 'id': '4236788', 'ext': 'mp4', - 'title': 'Servir y proteger - Capítulo 104 ', + 'title': 'Servir y proteger - Capítulo 104', 'duration': 3222.0, }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, + 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], }, { 'url': 'http://www.rtve.es/m/alacarta/videos/cuentame-como-paso/cuentame-como-paso-t16-ultimo-minuto-nuestra-vida-capitulo-276/2969138/?media=tve', 'only_matching': True, @@ -111,58 +72,102 @@ class RTVEALaCartaIE(InfoExtractor): def _real_initialize(self): user_agent_b64 = base64.b64encode(std_headers['User-Agent'].encode('utf-8')).decode('utf-8') - manager_info = self._download_json( + self._manager = self._download_json( 'http://www.rtve.es/odin/loki/' + user_agent_b64, - None, 'Fetching manager info') - self._manager = manager_info['manager'] + None, 'Fetching manager info')['manager'] + + @staticmethod + def _decrypt_url(png): + encrypted_data = io.BytesIO(compat_b64decode(png)[8:]) + while True: + length = compat_struct_unpack('!I', encrypted_data.read(4))[0] + chunk_type = encrypted_data.read(4) + if chunk_type == b'IEND': + break + data = encrypted_data.read(length) + if chunk_type == b'tEXt': + alphabet_data, text = data.split(b'\0') + quality, url_data = text.split(b'%%') + alphabet = [] + e = 0 + d = 0 + for l in _bytes_to_chr(alphabet_data): + if d == 0: + alphabet.append(l) + d = e = (e + 1) % 4 + else: + d -= 1 + url = '' + f = 0 + e = 3 + b = 1 + for letter in _bytes_to_chr(url_data): + if f == 0: + l = int(letter) * 10 + f = 1 + else: + if e == 0: + l += int(letter) + url += alphabet[l] + e = (b + 3) % 4 + f = 0 + b += 1 + else: + e -= 1 + + yield quality.decode(), url + encrypted_data.read(4) # CRC + + def _extract_png_formats(self, video_id): + png = self._download_webpage( + 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id), + video_id, 'Downloading url information', query={'q': 'v2'}) + q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL']) + formats = [] + for quality, video_url in self._decrypt_url(png): + ext = determine_ext(video_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + video_url, video_id, 'dash', fatal=False)) + else: + formats.append({ + 'format_id': quality, + 'quality': q(quality), + 'url': video_url, + }) + self._sort_formats(formats) + return formats def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) info = self._download_json( 'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id, video_id)['page']['items'][0] if info['state'] == 'DESPU': raise ExtractorError('The video is no longer available', expected=True) - title = info['title'] - png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id) - png_request = sanitized_Request(png_url) - png_request.add_header('Referer', url) - png = self._download_webpage(png_request, video_id, 'Downloading url information') - video_url = _decrypt_url(png) - ext = determine_ext(video_url) - - formats = [] - if not video_url.endswith('.f4m') and ext != 'm3u8': - if '?' not in video_url: - video_url = video_url.replace('resources/', 'auth/resources/') - video_url = video_url.replace('.net.rtve', '.multimedia.cdn.rtve') - - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - video_url, video_id, f4m_id='hds', fatal=False)) - else: - formats.append({ - 'url': video_url, - }) - self._sort_formats(formats) + title = info['title'].strip() + formats = self._extract_png_formats(video_id) subtitles = None - if info.get('sbtFile') is not None: - subtitles = self.extract_subtitles(video_id, info['sbtFile']) + sbt_file = info.get('sbtFile') + if sbt_file: + subtitles = self.extract_subtitles(video_id, sbt_file) + + is_live = info.get('live') is True return { 'id': video_id, - 'title': title, + 'title': self._live_title(title) if is_live else title, 'formats': formats, 'thumbnail': info.get('image'), - 'page_url': url, 'subtitles': subtitles, - 'duration': float_or_none(info.get('duration'), scale=1000), + 'duration': float_or_none(info.get('duration'), 1000), + 'is_live': is_live, + 'series': info.get('programTitle'), } def _get_subtitles(self, video_id, sub_file): @@ -174,48 +179,26 @@ class RTVEALaCartaIE(InfoExtractor): for s in subs) -class RTVEInfantilIE(InfoExtractor): +class RTVEInfantilIE(RTVEALaCartaIE): IE_NAME = 'rtve.es:infantil' IE_DESC = 'RTVE infantil' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/(?P[^/]*)/video/(?P[^/]*)/(?P[0-9]+)/' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/[^/]+/video/[^/]+/(?P[0-9]+)/' _TESTS = [{ 'url': 'http://www.rtve.es/infantil/serie/cleo/video/maneras-vivir/3040283/', - 'md5': '915319587b33720b8e0357caaa6617e6', + 'md5': '5747454717aedf9f9fdf212d1bcfc48d', 'info_dict': { 'id': '3040283', 'ext': 'mp4', 'title': 'Maneras de vivir', - 'thumbnail': 'http://www.rtve.es/resources/jpg/6/5/1426182947956.JPG', + 'thumbnail': r're:https?://.+/1426182947956\.JPG', 'duration': 357.958, }, + 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], }] - def _real_extract(self, url): - video_id = self._match_id(url) - info = self._download_json( - 'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id, - video_id)['page']['items'][0] - webpage = self._download_webpage(url, video_id) - vidplayer_id = self._search_regex( - r' id="vidplayer([0-9]+)"', webpage, 'internal video ID') - - png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % vidplayer_id - png = self._download_webpage(png_url, video_id, 'Downloading url information') - video_url = _decrypt_url(png) - - return { - 'id': video_id, - 'ext': 'mp4', - 'title': info['title'], - 'url': video_url, - 'thumbnail': info.get('image'), - 'duration': float_or_none(info.get('duration'), scale=1000), - } - - -class RTVELiveIE(InfoExtractor): +class RTVELiveIE(RTVEALaCartaIE): IE_NAME = 'rtve.es:live' IE_DESC = 'RTVE.es live streams' _VALID_URL = r'https?://(?:www\.)?rtve\.es/directo/(?P[a-zA-Z0-9-]+)' @@ -225,7 +208,7 @@ class RTVELiveIE(InfoExtractor): 'info_dict': { 'id': 'la-1', 'ext': 'mp4', - 'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2}Z[0-9]{6}$', + 'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', }, 'params': { 'skip_download': 'live stream', @@ -234,29 +217,22 @@ class RTVELiveIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - start_time = time.gmtime() video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) title = remove_end(self._og_search_title(webpage), ' en directo en RTVE.es') title = remove_start(title, 'Estoy viendo ') - title += ' ' + time.strftime('%Y-%m-%dZ%H%M%S', start_time) vidplayer_id = self._search_regex( (r'playerId=player([0-9]+)', r'class=["\'].*?\blive_mod\b.*?["\'][^>]+data-assetid=["\'](\d+)', r'data-id=["\'](\d+)'), webpage, 'internal video ID') - png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/amonet/videos/%s.png' % vidplayer_id - png = self._download_webpage(png_url, video_id, 'Downloading url information') - m3u8_url = _decrypt_url(png) - formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') - self._sort_formats(formats) return { 'id': video_id, - 'title': title, - 'formats': formats, + 'title': self._live_title(title), + 'formats': self._extract_png_formats(vidplayer_id), 'is_live': True, } diff --git a/youtube_dl/extractor/samplefocus.py b/youtube_dl/extractor/samplefocus.py new file mode 100644 index 000000000..806c3c354 --- /dev/null +++ b/youtube_dl/extractor/samplefocus.py @@ -0,0 +1,100 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + get_element_by_attribute, + int_or_none, +) + + +class SampleFocusIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?samplefocus\.com/samples/(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://samplefocus.com/samples/lil-peep-sad-emo-guitar', + 'md5': '48c8d62d60be467293912e0e619a5120', + 'info_dict': { + 'id': '40316', + 'display_id': 'lil-peep-sad-emo-guitar', + 'ext': 'mp3', + 'title': 'Lil Peep Sad Emo Guitar', + 'thumbnail': r're:^https?://.+\.png', + 'license': 'Standard License', + 'uploader': 'CapsCtrl', + 'uploader_id': 'capsctrl', + 'like_count': int, + 'comment_count': int, + 'categories': ['Samples', 'Guitar', 'Electric guitar'], + }, + }, { + 'url': 'https://samplefocus.com/samples/dababy-style-bass-808', + 'only_matching': True + }, { + 'url': 'https://samplefocus.com/samples/young-chop-kick', + 'only_matching': True + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + sample_id = self._search_regex( + r']+id=(["\'])sample_id\1[^>]+value=(?:["\'])(?P\d+)', + webpage, 'sample id', group='id') + + title = self._og_search_title(webpage, fatal=False) or self._html_search_regex( + r'

(.+?)

', webpage, 'title') + + mp3_url = self._search_regex( + r']+id=(["\'])sample_mp3\1[^>]+value=(["\'])(?P(?:(?!\2).)+)', + webpage, 'mp3', fatal=False, group='url') or extract_attributes(self._search_regex( + r']+itemprop=(["\'])contentUrl\1[^>]*>', + webpage, 'mp3 url', group=0))['content'] + + thumbnail = self._og_search_thumbnail(webpage) or self._html_search_regex( + r']+class=(?:["\'])waveform responsive-img[^>]+src=(["\'])(?P(?:(?!\1).)+)', + webpage, 'mp3', fatal=False, group='url') + + comments = [] + for author_id, author, body in re.findall(r'(?s)]+class="comment-author">]+href="/users/([^"]+)">([^"]+).+?]+class="comment-body">([^>]+)

', webpage): + comments.append({ + 'author': author, + 'author_id': author_id, + 'text': body, + }) + + uploader_id = uploader = None + mobj = re.search(r'>By ]+href="/users/([^"]+)"[^>]*>([^<]+)', webpage) + if mobj: + uploader_id, uploader = mobj.groups() + + breadcrumb = get_element_by_attribute('typeof', 'BreadcrumbList', webpage) + categories = [] + if breadcrumb: + for _, name in re.findall(r']+property=(["\'])name\1[^>]*>([^<]+)', breadcrumb): + categories.append(name) + + def extract_count(klass): + return int_or_none(self._html_search_regex( + r']+class=(?:["\'])?%s-count[^>]*>(\d+)' % klass, + webpage, klass, fatal=False)) + + return { + 'id': sample_id, + 'title': title, + 'url': mp3_url, + 'display_id': display_id, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'license': self._html_search_regex( + r']+href=(["\'])/license\1[^>]*>(?P[^<]+)<', + webpage, 'license', fatal=False, group='license'), + 'uploader_id': uploader_id, + 'like_count': extract_count('sample-%s-favorites' % sample_id), + 'comment_count': extract_count('comments'), + 'comments': comments, + 'categories': categories, + } diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py index f722528cd..0a806ee4e 100644 --- a/youtube_dl/extractor/sbs.py +++ b/youtube_dl/extractor/sbs.py @@ -10,7 +10,7 @@ from ..utils import ( class SBSIE(InfoExtractor): IE_DESC = 'sbs.com.au' - _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand(?:/video/(?:single/)?|.*?\bplay=)|news/(?:embeds/)?video/)(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand(?:/video/(?:single/)?|.*?\bplay=|/watch/)|news/(?:embeds/)?video/)(?P[0-9]+)' _TESTS = [{ # Original URL is handled by the generic IE which finds the iframe: @@ -43,6 +43,9 @@ class SBSIE(InfoExtractor): }, { 'url': 'https://www.sbs.com.au/news/embeds/video/1840778819866', 'only_matching': True, + }, { + 'url': 'https://www.sbs.com.au/ondemand/watch/1698704451971', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/screencastomatic.py b/youtube_dl/extractor/screencastomatic.py index b5e76c9af..0afdc1715 100644 --- a/youtube_dl/extractor/screencastomatic.py +++ b/youtube_dl/extractor/screencastomatic.py @@ -2,12 +2,18 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import js_to_json +from ..utils import ( + get_element_by_class, + int_or_none, + remove_start, + strip_or_none, + unified_strdate, +) class ScreencastOMaticIE(InfoExtractor): - _VALID_URL = r'https?://screencast-o-matic\.com/watch/(?P[0-9a-zA-Z]+)' - _TEST = { + _VALID_URL = r'https?://screencast-o-matic\.com/(?:(?:watch|player)/|embed\?.*?\bsc=)(?P[0-9a-zA-Z]+)' + _TESTS = [{ 'url': 'http://screencast-o-matic.com/watch/c2lD3BeOPl', 'md5': '483583cb80d92588f15ccbedd90f0c18', 'info_dict': { @@ -16,22 +22,30 @@ class ScreencastOMaticIE(InfoExtractor): 'title': 'Welcome to 3-4 Philosophy @ DECV!', 'thumbnail': r're:^https?://.*\.jpg$', 'description': 'as the title says! also: some general info re 1) VCE philosophy and 2) distance learning.', - 'duration': 369.163, + 'duration': 369, + 'upload_date': '20141216', } - } + }, { + 'url': 'http://screencast-o-matic.com/player/c2lD3BeOPl', + 'only_matching': True, + }, { + 'url': 'http://screencast-o-matic.com/embed?ff=true&sc=cbV2r4Q5TL&fromPH=true&a=1', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - jwplayer_data = self._parse_json( - self._search_regex( - r"(?s)jwplayer\('mp4Player'\).setup\((\{.*?\})\);", webpage, 'setup code'), - video_id, transform_source=js_to_json) - - info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False) - info_dict.update({ - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage), + webpage = self._download_webpage( + 'https://screencast-o-matic.com/player/' + video_id, video_id) + info = self._parse_html5_media_entries(url, webpage, video_id)[0] + info.update({ + 'id': video_id, + 'title': get_element_by_class('overlayTitle', webpage), + 'description': strip_or_none(get_element_by_class('overlayDescription', webpage)) or None, + 'duration': int_or_none(self._search_regex( + r'player\.duration\s*=\s*function\(\)\s*{\s*return\s+(\d+);\s*};', + webpage, 'duration', default=None)), + 'upload_date': unified_strdate(remove_start( + get_element_by_class('overlayPublished', webpage), 'Published: ')), }) - return info_dict + return info diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py index 5c2a6206b..88b938e05 100644 --- a/youtube_dl/extractor/shahid.py +++ b/youtube_dl/extractor/shahid.py @@ -21,6 +21,7 @@ from ..utils import ( class ShahidBaseIE(AWSIE): _AWS_PROXY_HOST = 'api2.shahid.net' _AWS_API_KEY = '2RRtuMHx95aNI1Kvtn2rChEuwsCogUd4samGPjLh' + _VALID_URL_BASE = r'https?://shahid\.mbc\.net/[a-z]{2}/' def _handle_error(self, e): fail_data = self._parse_json( @@ -49,15 +50,18 @@ class ShahidBaseIE(AWSIE): class ShahidIE(ShahidBaseIE): _NETRC_MACHINE = 'shahid' - _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:serie|show|movie)s/[^/]+/(?Pepisode|clip|movie)-(?P\d+)' + _VALID_URL = ShahidBaseIE._VALID_URL_BASE + r'(?:serie|show|movie)s/[^/]+/(?Pepisode|clip|movie)-(?P\d+)' _TESTS = [{ - 'url': 'https://shahid.mbc.net/ar/shows/%D9%85%D8%AC%D9%84%D8%B3-%D8%A7%D9%84%D8%B4%D8%A8%D8%A7%D8%A8-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-1/clip-275286', + 'url': 'https://shahid.mbc.net/ar/shows/%D9%85%D8%AA%D8%AD%D9%81-%D8%A7%D9%84%D8%AF%D8%AD%D9%8A%D8%AD-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-1/clip-816924', 'info_dict': { - 'id': '275286', + 'id': '816924', 'ext': 'mp4', - 'title': 'مجلس الشباب الموسم 1 كليب 1', - 'timestamp': 1506988800, - 'upload_date': '20171003', + 'title': 'متحف الدحيح الموسم 1 كليب 1', + 'timestamp': 1602806400, + 'upload_date': '20201016', + 'description': 'برومو', + 'duration': 22, + 'categories': ['كوميديا'], }, 'params': { # m3u8 download @@ -70,6 +74,9 @@ class ShahidIE(ShahidBaseIE): # shahid plus subscriber only 'url': 'https://shahid.mbc.net/ar/series/%D9%85%D8%B1%D8%A7%D9%8A%D8%A7-2011-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/episode-90511', 'only_matching': True + }, { + 'url': 'https://shahid.mbc.net/en/shows/Ramez-Fi-Al-Shallal-season-1-episode-1/episode-359319', + 'only_matching': True }] def _real_initialize(self): @@ -109,12 +116,15 @@ class ShahidIE(ShahidBaseIE): page_type = 'episode' playout = self._call_api( - 'playout/url/' + video_id, video_id)['playout'] + 'playout/new/url/' + video_id, video_id)['playout'] if playout.get('drm'): raise ExtractorError('This video is DRM protected.', expected=True) - formats = self._extract_m3u8_formats(playout['url'], video_id, 'mp4') + formats = self._extract_m3u8_formats(re.sub( + # https://docs.aws.amazon.com/mediapackage/latest/ug/manifest-filtering.html + r'aws\.manifestfilter=[\w:;,-]+&?', + '', playout['url']), video_id, 'mp4') self._sort_formats(formats) # video = self._call_api( @@ -162,7 +172,7 @@ class ShahidIE(ShahidBaseIE): class ShahidShowIE(ShahidBaseIE): - _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:show|serie)s/[^/]+/(?:show|series)-(?P\d+)' + _VALID_URL = ShahidBaseIE._VALID_URL_BASE + r'(?:show|serie)s/[^/]+/(?:show|series)-(?P\d+)' _TESTS = [{ 'url': 'https://shahid.mbc.net/ar/shows/%D8%B1%D8%A7%D9%85%D8%B2-%D9%82%D8%B1%D8%B4-%D8%A7%D9%84%D8%A8%D8%AD%D8%B1/show-79187', 'info_dict': { diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index 02295d1a4..93ab2a167 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -86,10 +86,10 @@ class SharedIE(SharedBaseIE): class VivoIE(SharedBaseIE): IE_DESC = 'vivo.sx' - _VALID_URL = r'https?://vivo\.sx/(?P[\da-z]{10})' + _VALID_URL = r'https?://vivo\.s[xt]/(?P[\da-z]{10})' _FILE_NOT_FOUND = '>The file you have requested does not exists or has been removed' - _TEST = { + _TESTS = [{ 'url': 'http://vivo.sx/d7ddda0e78', 'md5': '15b3af41be0b4fe01f4df075c2678b2c', 'info_dict': { @@ -98,7 +98,10 @@ class VivoIE(SharedBaseIE): 'title': 'Chicken', 'filesize': 515659, }, - } + }, { + 'url': 'http://vivo.st/d7ddda0e78', + 'only_matching': True, + }] def _extract_title(self, webpage): title = self._html_search_regex( diff --git a/youtube_dl/extractor/simplecast.py b/youtube_dl/extractor/simplecast.py new file mode 100644 index 000000000..2d0b3c06d --- /dev/null +++ b/youtube_dl/extractor/simplecast.py @@ -0,0 +1,160 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + clean_podcast_url, + int_or_none, + parse_iso8601, + strip_or_none, + try_get, + urlencode_postdata, +) + + +class SimplecastBaseIE(InfoExtractor): + _UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}' + _API_BASE = 'https://api.simplecast.com/' + + def _call_api(self, path_tmpl, video_id): + return self._download_json( + self._API_BASE + path_tmpl % video_id, video_id) + + def _call_search_api(self, resource, resource_id, resource_url): + return self._download_json( + 'https://api.simplecast.com/%ss/search' % resource, resource_id, + data=urlencode_postdata({'url': resource_url})) + + def _parse_episode(self, episode): + episode_id = episode['id'] + title = episode['title'].strip() + audio_file = episode.get('audio_file') or {} + audio_file_url = audio_file.get('url') or episode.get('audio_file_url') or episode['enclosure_url'] + + season = episode.get('season') or {} + season_href = season.get('href') + season_id = None + if season_href: + season_id = self._search_regex( + r'https?://api.simplecast.com/seasons/(%s)' % self._UUID_REGEX, + season_href, 'season id', default=None) + + webpage_url = episode.get('episode_url') + channel_url = None + if webpage_url: + channel_url = self._search_regex( + r'(https?://[^/]+\.simplecast\.com)', + webpage_url, 'channel url', default=None) + + return { + 'id': episode_id, + 'display_id': episode.get('slug'), + 'title': title, + 'url': clean_podcast_url(audio_file_url), + 'webpage_url': webpage_url, + 'channel_url': channel_url, + 'series': try_get(episode, lambda x: x['podcast']['title']), + 'season_number': int_or_none(season.get('number')), + 'season_id': season_id, + 'thumbnail': episode.get('image_url'), + 'episode_id': episode_id, + 'episode_number': int_or_none(episode.get('number')), + 'description': strip_or_none(episode.get('description')), + 'timestamp': parse_iso8601(episode.get('published_at')), + 'duration': int_or_none(episode.get('duration')), + 'filesize': int_or_none(audio_file.get('size') or episode.get('audio_file_size')), + } + + +class SimplecastIE(SimplecastBaseIE): + IE_NAME = 'simplecast' + _VALID_URL = r'https?://(?:api\.simplecast\.com/episodes|player\.simplecast\.com)/(?P%s)' % SimplecastBaseIE._UUID_REGEX + _COMMON_TEST_INFO = { + 'display_id': 'errant-signal-chris-franklin-new-wave-video-essays', + 'id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876', + 'ext': 'mp3', + 'title': 'Errant Signal - Chris Franklin & New Wave Video Essays', + 'episode_number': 1, + 'episode_id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876', + 'description': 'md5:34752789d3d2702e2d2c975fbd14f357', + 'season_number': 1, + 'season_id': 'e23df0da-bae4-4531-8bbf-71364a88dc13', + 'series': 'The RE:BIND.io Podcast', + 'duration': 5343, + 'timestamp': 1580979475, + 'upload_date': '20200206', + 'webpage_url': r're:^https?://the-re-bind-io-podcast\.simplecast\.com/episodes/errant-signal-chris-franklin-new-wave-video-essays', + 'channel_url': r're:^https?://the-re-bind-io-podcast\.simplecast\.com$', + } + _TESTS = [{ + 'url': 'https://api.simplecast.com/episodes/b6dc49a2-9404-4853-9aa9-9cfc097be876', + 'md5': '8c93be7be54251bf29ee97464eabe61c', + 'info_dict': _COMMON_TEST_INFO, + }, { + 'url': 'https://player.simplecast.com/b6dc49a2-9404-4853-9aa9-9cfc097be876', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'''(?x)]+src=["\'] + ( + https?://(?:embed\.simplecast\.com/[0-9a-f]{8}| + player\.simplecast\.com/%s + ))''' % SimplecastBaseIE._UUID_REGEX, webpage) + + def _real_extract(self, url): + episode_id = self._match_id(url) + episode = self._call_api('episodes/%s', episode_id) + return self._parse_episode(episode) + + +class SimplecastEpisodeIE(SimplecastBaseIE): + IE_NAME = 'simplecast:episode' + _VALID_URL = r'https?://(?!api\.)[^/]+\.simplecast\.com/episodes/(?P[^/?&#]+)' + _TEST = { + 'url': 'https://the-re-bind-io-podcast.simplecast.com/episodes/errant-signal-chris-franklin-new-wave-video-essays', + 'md5': '8c93be7be54251bf29ee97464eabe61c', + 'info_dict': SimplecastIE._COMMON_TEST_INFO, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + episode = self._call_search_api( + 'episode', mobj.group(1), mobj.group(0)) + return self._parse_episode(episode) + + +class SimplecastPodcastIE(SimplecastBaseIE): + IE_NAME = 'simplecast:podcast' + _VALID_URL = r'https?://(?!(?:api|cdn|embed|feeds|player)\.)(?P[^/]+)\.simplecast\.com(?!/episodes/[^/?&#]+)' + _TESTS = [{ + 'url': 'https://the-re-bind-io-podcast.simplecast.com', + 'playlist_mincount': 33, + 'info_dict': { + 'id': '07d28d26-7522-42eb-8c53-2bdcfc81c43c', + 'title': 'The RE:BIND.io Podcast', + }, + }, { + 'url': 'https://the-re-bind-io-podcast.simplecast.com/episodes', + 'only_matching': True, + }] + + def _real_extract(self, url): + subdomain = self._match_id(url) + site = self._call_search_api('site', subdomain, url) + podcast = site['podcast'] + podcast_id = podcast['id'] + podcast_title = podcast.get('title') + + def entries(): + episodes = self._call_api('podcasts/%s/episodes', podcast_id) + for episode in (episodes.get('collection') or []): + info = self._parse_episode(episode) + info['series'] = podcast_title + yield info + + return self.playlist_result(entries(), podcast_id, podcast_title) diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py index da75a43a7..0774da06e 100644 --- a/youtube_dl/extractor/southpark.py +++ b/youtube_dl/extractor/southpark.py @@ -6,9 +6,9 @@ from .mtv import MTVServicesInfoExtractor class SouthParkIE(MTVServicesInfoExtractor): IE_NAME = 'southpark.cc.com' - _VALID_URL = r'https?://(?:www\.)?(?Psouthpark\.cc\.com/(?:clips|(?:full-)?episodes|collections)/(?P.+?)(\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?Psouthpark(?:\.cc|studios)\.com/(?:clips|(?:full-)?episodes|collections)/(?P.+?)(\?|#|$))' - _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss' + _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' _TESTS = [{ 'url': 'http://southpark.cc.com/clips/104437/bat-daded#tab=featured', @@ -23,8 +23,20 @@ class SouthParkIE(MTVServicesInfoExtractor): }, { 'url': 'http://southpark.cc.com/collections/7758/fan-favorites/1', 'only_matching': True, + }, { + 'url': 'https://www.southparkstudios.com/episodes/h4o269/south-park-stunning-and-brave-season-19-ep-1', + 'only_matching': True, }] + def _get_feed_query(self, uri): + return { + 'accountOverride': 'intl.mtvi.com', + 'arcEp': 'shared.southpark.global', + 'ep': '90877963', + 'imageEp': 'shared.southpark.global', + 'mgid': uri, + } + class SouthParkEsIE(SouthParkIE): IE_NAME = 'southpark.cc.com:español' diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py index 4c5e3f7c2..5805f3d44 100644 --- a/youtube_dl/extractor/spike.py +++ b/youtube_dl/extractor/spike.py @@ -20,9 +20,6 @@ class BellatorIE(MTVServicesInfoExtractor): _FEED_URL = 'http://www.bellator.com/feeds/mrss/' _GEO_COUNTRIES = ['US'] - def _extract_mgid(self, webpage): - return self._extract_triforce_mgid(webpage) - class ParamountNetworkIE(MTVServicesInfoExtractor): _VALID_URL = r'https?://(?:www\.)?paramountnetwork\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)' @@ -46,16 +43,6 @@ class ParamountNetworkIE(MTVServicesInfoExtractor): def _get_feed_query(self, uri): return { 'arcEp': 'paramountnetwork.com', + 'imageEp': 'paramountnetwork.com', 'mgid': uri, } - - def _extract_mgid(self, webpage): - root_data = self._parse_json(self._search_regex( - r'window\.__DATA__\s*=\s*({.+})', - webpage, 'data'), None) - - def find_sub_data(data, data_type): - return next(c for c in data['children'] if c.get('type') == data_type) - - c = find_sub_data(find_sub_data(root_data, 'MainContainer'), 'VideoPlayer') - return c['props']['media']['video']['config']['uri'] diff --git a/youtube_dl/extractor/sportdeutschland.py b/youtube_dl/extractor/sportdeutschland.py index 378fc7568..3e497a939 100644 --- a/youtube_dl/extractor/sportdeutschland.py +++ b/youtube_dl/extractor/sportdeutschland.py @@ -1,82 +1,105 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( + clean_html, + float_or_none, + int_or_none, parse_iso8601, - sanitized_Request, + strip_or_none, + try_get, ) class SportDeutschlandIE(InfoExtractor): - _VALID_URL = r'https?://sportdeutschland\.tv/(?P[^/?#]+)/(?P[^?#/]+)(?:$|[?#])' + _VALID_URL = r'https?://sportdeutschland\.tv/(?P(?:[^/]+/)?[^?#/&]+)' _TESTS = [{ 'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0', 'info_dict': { - 'id': 're-live-deutsche-meisterschaften-2020-halbfinals', + 'id': '5318cac0275701382770543d7edaf0a0', 'ext': 'mp4', - 'title': 're:Re-live: Deutsche Meisterschaften 2020.*Halbfinals', - 'categories': ['Badminton-Deutschland'], - 'view_count': int, - 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', - 'timestamp': int, - 'upload_date': '20200201', - 'description': 're:.*', # meaningless description for THIS video + 'title': 'Re-live: Deutsche Meisterschaften 2020 - Halbfinals - Teil 1', + 'duration': 16106.36, }, + 'params': { + 'noplaylist': True, + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0', + 'info_dict': { + 'id': 'c6e2fdd01f63013854c47054d2ab776f', + 'title': 'Re-live: Deutsche Meisterschaften 2020 - Halbfinals', + 'description': 'md5:5263ff4c31c04bb780c9f91130b48530', + 'duration': 31397, + }, + 'playlist_count': 2, + }, { + 'url': 'https://sportdeutschland.tv/freeride-world-tour-2021-fieberbrunn-oesterreich', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - sport_id = mobj.group('sport') - - api_url = 'https://proxy.vidibusdynamic.net/ssl/backend.sportdeutschland.tv/api/permalinks/%s/%s?access_token=true' % ( - sport_id, video_id) - req = sanitized_Request(api_url, headers={ - 'Accept': 'application/vnd.vidibus.v2.html+json', - 'Referer': url, - }) - data = self._download_json(req, video_id) - + display_id = self._match_id(url) + data = self._download_json( + 'https://backend.sportdeutschland.tv/api/permalinks/' + display_id, + display_id, query={'access_token': 'true'}) asset = data['asset'] - categories = [data['section']['title']] - - formats = [] - smil_url = asset['video'] - if '.smil' in smil_url: - m3u8_url = smil_url.replace('.smil', '.m3u8') - formats.extend( - self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')) - - smil_doc = self._download_xml( - smil_url, video_id, note='Downloading SMIL metadata') - base_url_el = smil_doc.find('./head/meta') - if base_url_el: - base_url = base_url_el.attrib['base'] - formats.extend([{ - 'format_id': 'rmtp', - 'url': base_url if base_url_el else n.attrib['src'], - 'play_path': n.attrib['src'], - 'ext': 'flv', - 'preference': -100, - 'format_note': 'Seems to fail at example stream', - } for n in smil_doc.findall('./body/video')]) - else: - formats.append({'url': smil_url}) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'formats': formats, - 'title': asset['title'], - 'thumbnail': asset.get('image'), - 'description': asset.get('teaser'), - 'duration': asset.get('duration'), - 'categories': categories, - 'view_count': asset.get('views'), - 'rtmp_live': asset.get('live'), - 'timestamp': parse_iso8601(asset.get('date')), + title = (asset.get('title') or asset['label']).strip() + asset_id = asset.get('id') or asset.get('uuid') + info = { + 'id': asset_id, + 'title': title, + 'description': clean_html(asset.get('body') or asset.get('description')) or asset.get('teaser'), + 'duration': int_or_none(asset.get('seconds')), } + videos = asset.get('videos') or [] + if len(videos) > 1: + playlist_id = compat_parse_qs(compat_urllib_parse_urlparse(url).query).get('playlistId', [None])[0] + if playlist_id: + if self._downloader.params.get('noplaylist'): + videos = [videos[int(playlist_id)]] + self.to_screen('Downloading just a single video because of --no-playlist') + else: + self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % asset_id) + + def entries(): + for i, video in enumerate(videos, 1): + video_id = video.get('uuid') + video_url = video.get('url') + if not (video_id and video_url): + continue + formats = self._extract_m3u8_formats( + video_url.replace('.smil', '.m3u8'), video_id, 'mp4', fatal=False) + if not formats: + continue + yield { + 'id': video_id, + 'formats': formats, + 'title': title + ' - ' + (video.get('label') or 'Teil %d' % i), + 'duration': float_or_none(video.get('duration')), + } + info.update({ + '_type': 'multi_video', + 'entries': entries(), + }) + else: + formats = self._extract_m3u8_formats( + videos[0]['url'].replace('.smil', '.m3u8'), asset_id, 'mp4') + section_title = strip_or_none(try_get(data, lambda x: x['section']['title'])) + info.update({ + 'formats': formats, + 'display_id': asset.get('permalink'), + 'thumbnail': try_get(asset, lambda x: x['images'][0]), + 'categories': [section_title] if section_title else None, + 'view_count': int_or_none(asset.get('views')), + 'is_live': asset.get('is_live') is True, + 'timestamp': parse_iso8601(asset.get('date') or asset.get('published_at')), + }) + return info diff --git a/youtube_dl/extractor/spotify.py b/youtube_dl/extractor/spotify.py new file mode 100644 index 000000000..826f98cff --- /dev/null +++ b/youtube_dl/extractor/spotify.py @@ -0,0 +1,156 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + clean_podcast_url, + float_or_none, + int_or_none, + strip_or_none, + try_get, + unified_strdate, +) + + +class SpotifyBaseIE(InfoExtractor): + _ACCESS_TOKEN = None + _OPERATION_HASHES = { + 'Episode': '8276d4423d709ae9b68ec1b74cc047ba0f7479059a37820be730f125189ac2bf', + 'MinimalShow': '13ee079672fad3f858ea45a55eb109553b4fb0969ed793185b2e34cbb6ee7cc0', + 'ShowEpisodes': 'e0e5ce27bd7748d2c59b4d44ba245a8992a05be75d6fabc3b20753fc8857444d', + } + _VALID_URL_TEMPL = r'https?://open\.spotify\.com/%s/(?P[^/?&#]+)' + + def _real_initialize(self): + self._ACCESS_TOKEN = self._download_json( + 'https://open.spotify.com/get_access_token', None)['accessToken'] + + def _call_api(self, operation, video_id, variables): + return self._download_json( + 'https://api-partner.spotify.com/pathfinder/v1/query', video_id, query={ + 'operationName': 'query' + operation, + 'variables': json.dumps(variables), + 'extensions': json.dumps({ + 'persistedQuery': { + 'sha256Hash': self._OPERATION_HASHES[operation], + }, + }) + }, headers={'authorization': 'Bearer ' + self._ACCESS_TOKEN})['data'] + + def _extract_episode(self, episode, series): + episode_id = episode['id'] + title = episode['name'].strip() + + formats = [] + audio_preview = episode.get('audioPreview') or {} + audio_preview_url = audio_preview.get('url') + if audio_preview_url: + f = { + 'url': audio_preview_url.replace('://p.scdn.co/mp3-preview/', '://anon-podcast.scdn.co/'), + 'vcodec': 'none', + } + audio_preview_format = audio_preview.get('format') + if audio_preview_format: + f['format_id'] = audio_preview_format + mobj = re.match(r'([0-9A-Z]{3})_(?:[A-Z]+_)?(\d+)', audio_preview_format) + if mobj: + f.update({ + 'abr': int(mobj.group(2)), + 'ext': mobj.group(1).lower(), + }) + formats.append(f) + + for item in (try_get(episode, lambda x: x['audio']['items']) or []): + item_url = item.get('url') + if not (item_url and item.get('externallyHosted')): + continue + formats.append({ + 'url': clean_podcast_url(item_url), + 'vcodec': 'none', + }) + + thumbnails = [] + for source in (try_get(episode, lambda x: x['coverArt']['sources']) or []): + source_url = source.get('url') + if not source_url: + continue + thumbnails.append({ + 'url': source_url, + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + }) + + return { + 'id': episode_id, + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': strip_or_none(episode.get('description')), + 'duration': float_or_none(try_get( + episode, lambda x: x['duration']['totalMilliseconds']), 1000), + 'release_date': unified_strdate(try_get( + episode, lambda x: x['releaseDate']['isoString'])), + 'series': series, + } + + +class SpotifyIE(SpotifyBaseIE): + IE_NAME = 'spotify' + _VALID_URL = SpotifyBaseIE._VALID_URL_TEMPL % 'episode' + _TEST = { + 'url': 'https://open.spotify.com/episode/4Z7GAJ50bgctf6uclHlWKo', + 'md5': '74010a1e3fa4d9e1ab3aa7ad14e42d3b', + 'info_dict': { + 'id': '4Z7GAJ50bgctf6uclHlWKo', + 'ext': 'mp3', + 'title': 'From the archive: Why time management is ruining our lives', + 'description': 'md5:b120d9c4ff4135b42aa9b6d9cde86935', + 'duration': 2083.605, + 'release_date': '20201217', + 'series': "The Guardian's Audio Long Reads", + } + } + + def _real_extract(self, url): + episode_id = self._match_id(url) + episode = self._call_api('Episode', episode_id, { + 'uri': 'spotify:episode:' + episode_id + })['episode'] + return self._extract_episode( + episode, try_get(episode, lambda x: x['podcast']['name'])) + + +class SpotifyShowIE(SpotifyBaseIE): + IE_NAME = 'spotify:show' + _VALID_URL = SpotifyBaseIE._VALID_URL_TEMPL % 'show' + _TEST = { + 'url': 'https://open.spotify.com/show/4PM9Ke6l66IRNpottHKV9M', + 'info_dict': { + 'id': '4PM9Ke6l66IRNpottHKV9M', + 'title': 'The Story from the Guardian', + 'description': 'The Story podcast is dedicated to our finest audio documentaries, investigations and long form stories', + }, + 'playlist_mincount': 36, + } + + def _real_extract(self, url): + show_id = self._match_id(url) + podcast = self._call_api('ShowEpisodes', show_id, { + 'limit': 1000000000, + 'offset': 0, + 'uri': 'spotify:show:' + show_id, + })['podcast'] + podcast_name = podcast.get('name') + + entries = [] + for item in (try_get(podcast, lambda x: x['episodes']['items']) or []): + episode = item.get('episode') + if not episode: + continue + entries.append(self._extract_episode(episode, podcast_name)) + + return self.playlist_result( + entries, show_id, podcast_name, podcast.get('description')) diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py index f63a1359a..ac018e740 100644 --- a/youtube_dl/extractor/srgssr.py +++ b/youtube_dl/extractor/srgssr.py @@ -4,16 +4,32 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlparse from ..utils import ( ExtractorError, + float_or_none, + int_or_none, parse_iso8601, qualities, + try_get, ) class SRGSSRIE(InfoExtractor): - _VALID_URL = r'(?:https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|srgssr):(?Psrf|rts|rsi|rtr|swi):(?:[^:]+:)?(?Pvideo|audio):(?P[0-9a-f\-]{36}|\d+)' + _VALID_URL = r'''(?x) + (?: + https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn| + srgssr + ): + (?P + srf|rts|rsi|rtr|swi + ):(?:[^:]+:)? + (?P + video|audio + ): + (?P + [0-9a-f\-]{36}|\d+ + ) + ''' _GEO_BYPASS = False _GEO_COUNTRIES = ['CH'] @@ -25,25 +41,39 @@ class SRGSSRIE(InfoExtractor): 'LEGAL': 'The video cannot be transmitted for legal reasons.', 'STARTDATE': 'This video is not yet available. Please try again later.', } + _DEFAULT_LANGUAGE_CODES = { + 'srf': 'de', + 'rts': 'fr', + 'rsi': 'it', + 'rtr': 'rm', + 'swi': 'en', + } def _get_tokenized_src(self, url, video_id, format_id): - sp = compat_urllib_parse_urlparse(url).path.split('/') token = self._download_json( - 'http://tp.srgssr.ch/akahd/token?acl=/%s/%s/*' % (sp[1], sp[2]), + 'http://tp.srgssr.ch/akahd/token?acl=*', video_id, 'Downloading %s token' % format_id, fatal=False) or {} - auth_params = token.get('token', {}).get('authparams') + auth_params = try_get(token, lambda x: x['token']['authparams']) if auth_params: - url += '?' + auth_params + url += ('?' if '?' not in url else '&') + auth_params return url - def get_media_data(self, bu, media_type, media_id): - media_data = self._download_json( - 'http://il.srgssr.ch/integrationlayer/1.0/ue/%s/%s/play/%s.json' % (bu, media_type, media_id), - media_id)[media_type.capitalize()] + def _get_media_data(self, bu, media_type, media_id): + query = {'onlyChapters': True} if media_type == 'video' else {} + full_media_data = self._download_json( + 'https://il.srgssr.ch/integrationlayer/2.0/%s/mediaComposition/%s/%s.json' + % (bu, media_type, media_id), + media_id, query=query)['chapterList'] + try: + media_data = next( + x for x in full_media_data if x.get('id') == media_id) + except StopIteration: + raise ExtractorError('No media information found') - if media_data.get('block') and media_data['block'] in self._ERRORS: - message = self._ERRORS[media_data['block']] - if media_data['block'] == 'GEOBLOCK': + block_reason = media_data.get('blockReason') + if block_reason and block_reason in self._ERRORS: + message = self._ERRORS[block_reason] + if block_reason == 'GEOBLOCK': self.raise_geo_restricted( msg=message, countries=self._GEO_COUNTRIES) raise ExtractorError( @@ -53,53 +83,75 @@ class SRGSSRIE(InfoExtractor): def _real_extract(self, url): bu, media_type, media_id = re.match(self._VALID_URL, url).groups() + media_data = self._get_media_data(bu, media_type, media_id) + title = media_data['title'] - media_data = self.get_media_data(bu, media_type, media_id) - - metadata = media_data['AssetMetadatas']['AssetMetadata'][0] - title = metadata['title'] - description = metadata.get('description') - created_date = media_data.get('createdDate') or metadata.get('createdDate') - timestamp = parse_iso8601(created_date) - - thumbnails = [{ - 'id': image.get('id'), - 'url': image['url'], - } for image in media_data.get('Image', {}).get('ImageRepresentations', {}).get('ImageRepresentation', [])] - - preference = qualities(['LQ', 'MQ', 'SD', 'HQ', 'HD']) formats = [] - for source in media_data.get('Playlists', {}).get('Playlist', []) + media_data.get('Downloads', {}).get('Download', []): - protocol = source.get('@protocol') - for asset in source['url']: - asset_url = asset['text'] - quality = asset['@quality'] - format_id = '%s-%s' % (protocol, quality) - if protocol.startswith('HTTP-HDS') or protocol.startswith('HTTP-HLS'): - asset_url = self._get_tokenized_src(asset_url, media_id, format_id) - if protocol.startswith('HTTP-HDS'): - formats.extend(self._extract_f4m_formats( - asset_url + ('?' if '?' not in asset_url else '&') + 'hdcore=3.4.0', - media_id, f4m_id=format_id, fatal=False)) - elif protocol.startswith('HTTP-HLS'): - formats.extend(self._extract_m3u8_formats( - asset_url, media_id, 'mp4', 'm3u8_native', - m3u8_id=format_id, fatal=False)) - else: - formats.append({ - 'format_id': format_id, - 'url': asset_url, - 'preference': preference(quality), - 'ext': 'flv' if protocol == 'RTMP' else None, - }) + q = qualities(['SD', 'HD']) + for source in (media_data.get('resourceList') or []): + format_url = source.get('url') + if not format_url: + continue + protocol = source.get('protocol') + quality = source.get('quality') + format_id = [] + for e in (protocol, source.get('encoding'), quality): + if e: + format_id.append(e) + format_id = '-'.join(format_id) + + if protocol in ('HDS', 'HLS'): + if source.get('tokenType') == 'AKAMAI': + format_url = self._get_tokenized_src( + format_url, media_id, format_id) + formats.extend(self._extract_akamai_formats( + format_url, media_id)) + elif protocol == 'HLS': + formats.extend(self._extract_m3u8_formats( + format_url, media_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False)) + elif protocol in ('HTTP', 'HTTPS'): + formats.append({ + 'format_id': format_id, + 'url': format_url, + 'quality': q(quality), + }) + + # This is needed because for audio medias the podcast url is usually + # always included, even if is only an audio segment and not the + # whole episode. + if int_or_none(media_data.get('position')) == 0: + for p in ('S', 'H'): + podcast_url = media_data.get('podcast%sdUrl' % p) + if not podcast_url: + continue + quality = p + 'D' + formats.append({ + 'format_id': 'PODCAST-' + quality, + 'url': podcast_url, + 'quality': q(quality), + }) self._sort_formats(formats) + subtitles = {} + if media_type == 'video': + for sub in (media_data.get('subtitleList') or []): + sub_url = sub.get('url') + if not sub_url: + continue + lang = sub.get('locale') or self._DEFAULT_LANGUAGE_CODES[bu] + subtitles.setdefault(lang, []).append({ + 'url': sub_url, + }) + return { 'id': media_id, 'title': title, - 'description': description, - 'timestamp': timestamp, - 'thumbnails': thumbnails, + 'description': media_data.get('description'), + 'timestamp': parse_iso8601(media_data.get('date')), + 'thumbnail': media_data.get('imageUrl'), + 'duration': float_or_none(media_data.get('duration'), 1000), + 'subtitles': subtitles, 'formats': formats, } @@ -119,26 +171,17 @@ class SRGSSRPlayIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', - 'md5': 'da6b5b3ac9fa4761a942331cef20fcb3', + 'md5': '6db2226ba97f62ad42ce09783680046c', 'info_dict': { 'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5', 'ext': 'mp4', 'upload_date': '20130701', 'title': 'Snowden beantragt Asyl in Russland', - 'timestamp': 1372713995, - } - }, { - # No Speichern (Save) button - 'url': 'http://www.srf.ch/play/tv/top-gear/video/jaguar-xk120-shadow-und-tornado-dampflokomotive?id=677f5829-e473-4823-ac83-a1087fe97faa', - 'md5': '0a274ce38fda48c53c01890651985bc6', - 'info_dict': { - 'id': '677f5829-e473-4823-ac83-a1087fe97faa', - 'ext': 'flv', - 'upload_date': '20130710', - 'title': 'Jaguar XK120, Shadow und Tornado-Dampflokomotive', - 'description': 'md5:88604432b60d5a38787f152dec89cd56', - 'timestamp': 1373493600, + 'timestamp': 1372708215, + 'duration': 113.827, + 'thumbnail': r're:^https?://.*1383719781\.png$', }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { 'url': 'http://www.rtr.ch/play/radio/actualitad/audio/saira-tujetsch-tuttina-cuntinuar-cun-sedrun-muster-turissem?id=63cb0778-27f8-49af-9284-8c7a8c6d15fc', 'info_dict': { @@ -146,7 +189,8 @@ class SRGSSRPlayIE(InfoExtractor): 'ext': 'mp3', 'upload_date': '20151013', 'title': 'Saira: Tujetsch - tuttina cuntinuar cun Sedrun Mustér Turissem', - 'timestamp': 1444750398, + 'timestamp': 1444709160, + 'duration': 336.816, }, 'params': { # rtmp download @@ -159,19 +203,32 @@ class SRGSSRPlayIE(InfoExtractor): 'id': '6348260', 'display_id': '6348260', 'ext': 'mp4', - 'duration': 1796, + 'duration': 1796.76, 'title': 'Le 19h30', - 'description': '', - 'uploader': '19h30', 'upload_date': '20141201', 'timestamp': 1417458600, 'thumbnail': r're:^https?://.*\.image', - 'view_count': int, }, 'params': { # m3u8 download 'skip_download': True, } + }, { + 'url': 'http://play.swissinfo.ch/play/tv/business/video/why-people-were-against-tax-reforms?id=42960270', + 'info_dict': { + 'id': '42960270', + 'ext': 'mp4', + 'title': 'Why people were against tax reforms', + 'description': 'md5:7ac442c558e9630e947427469c4b824d', + 'duration': 94.0, + 'upload_date': '20170215', + 'timestamp': 1487173560, + 'thumbnail': r're:https?://www\.swissinfo\.ch/srgscalableimage/42961964', + 'subtitles': 'count:9', + }, + 'params': { + 'skip_download': True, + } }, { 'url': 'https://www.srf.ch/play/tv/popupvideoplayer?id=c4dba0ca-e75b-43b2-a34f-f708a4932e01', 'only_matching': True, @@ -181,6 +238,10 @@ class SRGSSRPlayIE(InfoExtractor): }, { 'url': 'https://www.rts.ch/play/tv/19h30/video/le-19h30?urn=urn:rts:video:6348260', 'only_matching': True, + }, { + # audio segment, has podcastSdUrl of the full episode + 'url': 'https://www.srf.ch/play/radio/popupaudioplayer?id=50b20dc8-f05b-4972-bf03-e438ff2833eb', + 'only_matching': True, }] def _real_extract(self, url): @@ -188,5 +249,4 @@ class SRGSSRPlayIE(InfoExtractor): bu = mobj.group('bu') media_type = mobj.group('type') or mobj.group('type_2') media_id = mobj.group('id') - # other info can be extracted from url + '&layout=json' return self.url_result('srgssr:%s:%s:%s' % (bu[:3], media_type, media_id), 'SRGSSR') diff --git a/youtube_dl/extractor/storyfire.py b/youtube_dl/extractor/storyfire.py new file mode 100644 index 000000000..9c698626f --- /dev/null +++ b/youtube_dl/extractor/storyfire.py @@ -0,0 +1,151 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import functools + +from .common import InfoExtractor +from ..utils import ( + # HEADRequest, + int_or_none, + OnDemandPagedList, + smuggle_url, +) + + +class StoryFireBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?storyfire\.com/' + + def _call_api(self, path, video_id, resource, query=None): + return self._download_json( + 'https://storyfire.com/app/%s/%s' % (path, video_id), video_id, + 'Downloading %s JSON metadata' % resource, query=query) + + def _parse_video(self, video): + title = video['title'] + vimeo_id = self._search_regex( + r'https?://player\.vimeo\.com/external/(\d+)', + video['vimeoVideoURL'], 'vimeo id') + + # video_url = self._request_webpage( + # HEADRequest(video['vimeoVideoURL']), video_id).geturl() + # formats = [] + # for v_url, suffix in [(video_url, '_sep'), (video_url.replace('/sep/video/', '/video/'), '')]: + # formats.extend(self._extract_m3u8_formats( + # v_url, video_id, 'mp4', 'm3u8_native', + # m3u8_id='hls' + suffix, fatal=False)) + # formats.extend(self._extract_mpd_formats( + # v_url.replace('.m3u8', '.mpd'), video_id, + # mpd_id='dash' + suffix, fatal=False)) + # self._sort_formats(formats) + + uploader_id = video.get('hostID') + + return { + '_type': 'url_transparent', + 'id': vimeo_id, + 'title': title, + 'description': video.get('description'), + 'url': smuggle_url( + 'https://player.vimeo.com/video/' + vimeo_id, { + 'http_headers': { + 'Referer': 'https://storyfire.com/', + } + }), + # 'formats': formats, + 'thumbnail': video.get('storyImage'), + 'view_count': int_or_none(video.get('views')), + 'like_count': int_or_none(video.get('likesCount')), + 'comment_count': int_or_none(video.get('commentsCount')), + 'duration': int_or_none(video.get('videoDuration')), + 'timestamp': int_or_none(video.get('publishDate')), + 'uploader': video.get('username'), + 'uploader_id': uploader_id, + 'uploader_url': 'https://storyfire.com/user/%s/video' % uploader_id if uploader_id else None, + 'episode_number': int_or_none(video.get('episodeNumber') or video.get('episode_number')), + } + + +class StoryFireIE(StoryFireBaseIE): + _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'video-details/(?P[0-9a-f]{24})' + _TEST = { + 'url': 'https://storyfire.com/video-details/5df1d132b6378700117f9181', + 'md5': 'caec54b9e4621186d6079c7ec100c1eb', + 'info_dict': { + 'id': '378954662', + 'ext': 'mp4', + 'title': 'Buzzfeed Teaches You About Memes', + 'uploader_id': 'ntZAJFECERSgqHSxzonV5K2E89s1', + 'timestamp': 1576129028, + 'description': 'md5:0b4e28021548e144bed69bb7539e62ea', + 'uploader': 'whang!', + 'upload_date': '20191212', + 'duration': 418, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download JSON metadata'] + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video = self._call_api( + 'generic/video-detail', video_id, 'video')['video'] + return self._parse_video(video) + + +class StoryFireUserIE(StoryFireBaseIE): + _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'user/(?P[^/]+)/video' + _TEST = { + 'url': 'https://storyfire.com/user/UQ986nFxmAWIgnkZQ0ftVhq4nOk2/video', + 'info_dict': { + 'id': 'UQ986nFxmAWIgnkZQ0ftVhq4nOk2', + }, + 'playlist_mincount': 151, + } + _PAGE_SIZE = 20 + + def _fetch_page(self, user_id, page): + videos = self._call_api( + 'publicVideos', user_id, 'page %d' % (page + 1), { + 'skip': page * self._PAGE_SIZE, + })['videos'] + for video in videos: + yield self._parse_video(video) + + def _real_extract(self, url): + user_id = self._match_id(url) + entries = OnDemandPagedList(functools.partial( + self._fetch_page, user_id), self._PAGE_SIZE) + return self.playlist_result(entries, user_id) + + +class StoryFireSeriesIE(StoryFireBaseIE): + _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'write/series/stories/(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://storyfire.com/write/series/stories/-Lq6MsuIHLODO6d2dDkr/', + 'info_dict': { + 'id': '-Lq6MsuIHLODO6d2dDkr', + }, + 'playlist_mincount': 13, + }, { + 'url': 'https://storyfire.com/write/series/stories/the_mortal_one/', + 'info_dict': { + 'id': 'the_mortal_one', + }, + 'playlist_count': 0, + }] + + def _extract_videos(self, stories): + for story in stories.values(): + if story.get('hasVideo'): + yield self._parse_video(story) + + def _real_extract(self, url): + series_id = self._match_id(url) + stories = self._call_api( + 'seriesStories', series_id, 'series stories') + return self.playlist_result(self._extract_videos(stories), series_id) diff --git a/youtube_dl/extractor/stretchinternet.py b/youtube_dl/extractor/stretchinternet.py index 4dbead2ba..ec08eae55 100644 --- a/youtube_dl/extractor/stretchinternet.py +++ b/youtube_dl/extractor/stretchinternet.py @@ -1,7 +1,6 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import int_or_none class StretchInternetIE(InfoExtractor): @@ -11,22 +10,28 @@ class StretchInternetIE(InfoExtractor): 'info_dict': { 'id': '573272', 'ext': 'mp4', - 'title': 'University of Mary Wrestling vs. Upper Iowa', - 'timestamp': 1575668361, - 'upload_date': '20191206', + 'title': 'UNIVERSITY OF MARY WRESTLING VS UPPER IOWA', + # 'timestamp': 1575668361, + # 'upload_date': '20191206', + 'uploader_id': '99997', } } def _real_extract(self, url): video_id = self._match_id(url) + media_url = self._download_json( + 'https://core.stretchlive.com/trinity/event/tcg/' + video_id, + video_id)[0]['media'][0]['url'] event = self._download_json( - 'https://api.stretchinternet.com/trinity/event/tcg/' + video_id, - video_id)[0] + 'https://neo-client.stretchinternet.com/portal-ws/getEvent.json', + video_id, query={'eventID': video_id, 'token': 'asdf'})['event'] return { 'id': video_id, 'title': event['title'], - 'timestamp': int_or_none(event.get('dateCreated'), 1000), - 'url': 'https://' + event['media'][0]['url'], + # TODO: parse US timezone abbreviations + # 'timestamp': event.get('dateTimeString'), + 'url': 'https://' + media_url, + 'uploader_id': event.get('ownerID'), } diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index a0b6ef4db..a5bb6daa7 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -146,18 +146,19 @@ class SVTPlayIE(SVTPlayBaseIE): ) (?P[^/?#&]+)| https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P[^/?#&]+) + (?:.*?(?:modalId|id)=(?P[\da-zA-Z-]+))? ) ''' _TESTS = [{ - 'url': 'https://www.svtplay.se/video/26194546/det-har-ar-himlen', + 'url': 'https://www.svtplay.se/video/30479064', 'md5': '2382036fd6f8c994856c323fe51c426e', 'info_dict': { - 'id': 'jNwpV9P', + 'id': '8zVbDPA', 'ext': 'mp4', - 'title': 'Det här är himlen', - 'timestamp': 1586044800, - 'upload_date': '20200405', - 'duration': 3515, + 'title': 'Designdrömmar i Stenungsund', + 'timestamp': 1615770000, + 'upload_date': '20210315', + 'duration': 3519, 'thumbnail': r're:^https?://(?:.*[\.-]jpg|www.svtstatic.se/image/.*)$', 'age_limit': 0, 'subtitles': { @@ -173,6 +174,12 @@ class SVTPlayIE(SVTPlayBaseIE): # AssertionError: Expected test_SVTPlay_jNwpV9P.mp4 to be at least 9.77KiB, but it's only 864.00B 'skip_download': True, }, + }, { + 'url': 'https://www.svtplay.se/video/30479064/husdrommar/husdrommar-sasong-8-designdrommar-i-stenungsund?modalId=8zVbDPA', + 'only_matching': True, + }, { + 'url': 'https://www.svtplay.se/video/30684086/rapport/rapport-24-apr-18-00-7?id=e72gVpa', + 'only_matching': True, }, { # geo restricted to Sweden 'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten', @@ -219,7 +226,8 @@ class SVTPlayIE(SVTPlayBaseIE): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id, svt_id = mobj.group('id', 'svt_id') + video_id = mobj.group('id') + svt_id = mobj.group('svt_id') or mobj.group('modal_id') if svt_id: return self._extract_by_video_id(svt_id) @@ -254,9 +262,12 @@ class SVTPlayIE(SVTPlayBaseIE): if not svt_id: svt_id = self._search_regex( (r']+data-video-id=["\']([\da-zA-Z-]+)', + r'<[^>]+\bdata-rt=["\']top-area-play-button["\'][^>]+\bhref=["\'][^"\']*video/%s/[^"\']*\b(?:modalId|id)=([\da-zA-Z-]+)' % re.escape(video_id), r'["\']videoSvtId["\']\s*:\s*["\']([\da-zA-Z-]+)', + r'["\']videoSvtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)', r'"content"\s*:\s*{.*?"id"\s*:\s*"([\da-zA-Z-]+)"', - r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)'), + r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)', + r'["\']svtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)'), webpage, 'video id') info_dict = self._extract_by_video_id(svt_id, webpage) diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 63e2455b2..f09f1a3f9 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -123,6 +123,10 @@ class TEDIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # with own formats and private Youtube external + 'url': 'https://www.ted.com/talks/spencer_wells_a_family_tree_for_humanity', + 'only_matching': True, }] _NATIVE_FORMATS = { @@ -210,16 +214,6 @@ class TEDIE(InfoExtractor): player_talk = talk_info['player_talks'][0] - external = player_talk.get('external') - if isinstance(external, dict): - service = external.get('service') - if isinstance(service, compat_str): - ext_url = None - if service.lower() == 'youtube': - ext_url = external.get('code') - - return self.url_result(ext_url or external['uri']) - resources_ = player_talk.get('resources') or talk_info.get('resources') http_url = None @@ -294,6 +288,16 @@ class TEDIE(InfoExtractor): 'vcodec': 'none', }) + if not formats: + external = player_talk.get('external') + if isinstance(external, dict): + service = external.get('service') + if isinstance(service, compat_str): + ext_url = None + if service.lower() == 'youtube': + ext_url = external.get('code') + return self.url_result(ext_url or external['uri']) + self._sort_formats(formats) video_id = compat_str(talk_info['id']) diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 55e2a0721..23c2808a1 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -1,92 +1,87 @@ # coding: utf-8 from __future__ import unicode_literals +import json +import re + from .common import InfoExtractor -from ..compat import compat_str +from ..utils import ( + int_or_none, + parse_iso8601, + try_get, +) class TF1IE(InfoExtractor): - """TF1 uses the wat.tv player.""" - _VALID_URL = r'https?://(?:(?:videos|www|lci)\.tf1|(?:www\.)?(?:tfou|ushuaiatv|histoire|tvbreizh))\.fr/(?:[^/]+/)*(?P[^/?#.]+)' + _VALID_URL = r'https?://(?:www\.)?tf1\.fr/[^/]+/(?P[^/]+)/videos/(?P[^/?&#]+)\.html' _TESTS = [{ - 'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html', - 'info_dict': { - 'id': '10635995', - 'ext': 'mp4', - 'title': 'Citroën Grand C4 Picasso 2013 : présentation officielle', - 'description': 'Vidéo officielle du nouveau Citroën Grand C4 Picasso, lancé à l\'automne 2013.', - }, - 'params': { - # Sometimes wat serves the whole file with the --test option - 'skip_download': True, - }, - 'expected_warnings': ['HTTP Error 404'], - }, { - 'url': 'http://www.tfou.fr/chuggington/videos/le-grand-mysterioso-chuggington-7085291-739.html', - 'info_dict': { - 'id': 'le-grand-mysterioso-chuggington-7085291-739', - 'ext': 'mp4', - 'title': 'Le grand Mystérioso - Chuggington', - 'description': 'Le grand Mystérioso - Emery rêve qu\'un article lui soit consacré dans le journal.', - 'upload_date': '20150103', - }, - 'params': { - # Sometimes wat serves the whole file with the --test option - 'skip_download': True, - }, - 'skip': 'HTTP Error 410: Gone', - }, { - 'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html', - 'only_matching': True, - }, { - 'url': 'http://lci.tf1.fr/sept-a-huit/videos/sept-a-huit-du-24-mai-2015-8611550.html', - 'only_matching': True, - }, { - 'url': 'http://www.tf1.fr/hd1/documentaire/videos/mylene-farmer-d-une-icone.html', - 'only_matching': True, - }, { 'url': 'https://www.tf1.fr/tmc/quotidien-avec-yann-barthes/videos/quotidien-premiere-partie-11-juin-2019.html', 'info_dict': { 'id': '13641379', 'ext': 'mp4', 'title': 'md5:f392bc52245dc5ad43771650c96fb620', - 'description': 'md5:44bc54f0a21322f5b91d68e76a544eae', + 'description': 'md5:a02cdb217141fb2d469d6216339b052f', 'upload_date': '20190611', + 'timestamp': 1560273989, + 'duration': 1738, + 'series': 'Quotidien avec Yann Barthès', + 'tags': ['intégrale', 'quotidien', 'Replay'], }, 'params': { # Sometimes wat serves the whole file with the --test option 'skip_download': True, + 'format': 'bestvideo', }, + }, { + 'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html', + 'only_matching': True, + }, { + 'url': 'http://www.tf1.fr/hd1/documentaire/videos/mylene-farmer-d-une-icone.html', + 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + program_slug, slug = re.match(self._VALID_URL, url).groups() + video = self._download_json( + 'https://www.tf1.fr/graphql/web', slug, query={ + 'id': '9b80783950b85247541dd1d851f9cc7fa36574af015621f853ab111a679ce26f', + 'variables': json.dumps({ + 'programSlug': program_slug, + 'slug': slug, + }) + })['data']['videoBySlug'] + wat_id = video['streamId'] - webpage = self._download_webpage(url, video_id) + tags = [] + for tag in (video.get('tags') or []): + label = tag.get('label') + if not label: + continue + tags.append(label) - wat_id = None + decoration = video.get('decoration') or {} - data = self._parse_json( - self._search_regex( - r'__APOLLO_STATE__\s*=\s*({.+?})\s*(?:;|)', webpage, - 'data', default='{}'), video_id, fatal=False) + thumbnails = [] + for source in (try_get(decoration, lambda x: x['image']['sources'], list) or []): + source_url = source.get('url') + if not source_url: + continue + thumbnails.append({ + 'url': source_url, + 'width': int_or_none(source.get('width')), + }) - if data: - try: - wat_id = next( - video.get('streamId') - for key, video in data.items() - if isinstance(video, dict) - and video.get('slug') == video_id) - if not isinstance(wat_id, compat_str) or not wat_id.isdigit(): - wat_id = None - except StopIteration: - pass - - if not wat_id: - wat_id = self._html_search_regex( - (r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P\d{8})\1', - r'(["\']?)streamId\1\s*:\s*(["\']?)(?P\d+)\2'), - webpage, 'wat id', group='id') - - return self.url_result('wat:%s' % wat_id, 'Wat') + return { + '_type': 'url_transparent', + 'id': wat_id, + 'url': 'wat:' + wat_id, + 'title': video.get('title'), + 'thumbnails': thumbnails, + 'description': decoration.get('description'), + 'timestamp': parse_iso8601(video.get('date')), + 'duration': int_or_none(try_get(video, lambda x: x['publicPlayingInfos']['duration'])), + 'tags': tags, + 'series': decoration.get('programLabel'), + 'season_number': int_or_none(video.get('season')), + 'episode_number': int_or_none(video.get('episode')), + } diff --git a/youtube_dl/extractor/threeqsdn.py b/youtube_dl/extractor/threeqsdn.py index f26937da1..f6d37bb9e 100644 --- a/youtube_dl/extractor/threeqsdn.py +++ b/youtube_dl/extractor/threeqsdn.py @@ -3,10 +3,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_HTTPError from ..utils import ( determine_ext, - js_to_json, - mimetype2ext, + ExtractorError, + float_or_none, + int_or_none, + parse_iso8601, ) @@ -15,29 +18,35 @@ class ThreeQSDNIE(InfoExtractor): IE_DESC = '3Q SDN' _VALID_URL = r'https?://playout\.3qsdn\.com/(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' _TESTS = [{ - # ondemand from http://www.philharmonie.tv/veranstaltung/26/ - 'url': 'http://playout.3qsdn.com/0280d6b9-1215-11e6-b427-0cc47a188158?protocol=http', - 'md5': 'ab040e37bcfa2e0c079f92cb1dd7f6cd', + # https://player.3qsdn.com/demo.html + 'url': 'https://playout.3qsdn.com/7201c779-6b3c-11e7-a40e-002590c750be', + 'md5': '64a57396b16fa011b15e0ea60edce918', 'info_dict': { - 'id': '0280d6b9-1215-11e6-b427-0cc47a188158', + 'id': '7201c779-6b3c-11e7-a40e-002590c750be', 'ext': 'mp4', - 'title': '0280d6b9-1215-11e6-b427-0cc47a188158', + 'title': 'Video Ads', 'is_live': False, + 'description': 'Video Ads Demo', + 'timestamp': 1500334803, + 'upload_date': '20170717', + 'duration': 888.032, + 'subtitles': { + 'eng': 'count:1', + }, }, - 'expected_warnings': ['Failed to download MPD manifest', 'Failed to parse JSON'], + 'expected_warnings': ['Unknown MIME type application/mp4 in DASH manifest'], }, { # live video stream - 'url': 'https://playout.3qsdn.com/d755d94b-4ab9-11e3-9162-0025907ad44f?js=true', + 'url': 'https://playout.3qsdn.com/66e68995-11ca-11e8-9273-002590c750be', 'info_dict': { - 'id': 'd755d94b-4ab9-11e3-9162-0025907ad44f', + 'id': '66e68995-11ca-11e8-9273-002590c750be', 'ext': 'mp4', - 'title': 're:^d755d94b-4ab9-11e3-9162-0025907ad44f [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': 're:^66e68995-11ca-11e8-9273-002590c750be [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'is_live': True, }, 'params': { 'skip_download': True, # m3u8 downloads }, - 'expected_warnings': ['Failed to download MPD manifest'], }, { # live audio stream 'url': 'http://playout.3qsdn.com/9edf36e0-6bf2-11e2-a16a-9acf09e2db48', @@ -58,6 +67,14 @@ class ThreeQSDNIE(InfoExtractor): # live video with rtmp link 'url': 'https://playout.3qsdn.com/6092bb9e-8f72-11e4-a173-002590c750be', 'only_matching': True, + }, { + # ondemand from http://www.philharmonie.tv/veranstaltung/26/ + 'url': 'http://playout.3qsdn.com/0280d6b9-1215-11e6-b427-0cc47a188158?protocol=http', + 'only_matching': True, + }, { + # live video stream + 'url': 'https://playout.3qsdn.com/d755d94b-4ab9-11e3-9162-0025907ad44f?js=true', + 'only_matching': True, }] @staticmethod @@ -70,73 +87,78 @@ class ThreeQSDNIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - js = self._download_webpage( - 'http://playout.3qsdn.com/%s' % video_id, video_id, - query={'js': 'true'}) + try: + config = self._download_json( + url.replace('://playout.3qsdn.com/', '://playout.3qsdn.com/config/'), video_id) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + self.raise_geo_restricted() + raise - if any(p in js for p in ( - '>This content is not available in your country', - 'playout.3qsdn.com/forbidden')): - self.raise_geo_restricted() - - stream_content = self._search_regex( - r'streamContent\s*:\s*(["\'])(?P.+?)\1', js, - 'stream content', default='demand', group='content') - - live = stream_content == 'live' - - stream_type = self._search_regex( - r'streamType\s*:\s*(["\'])(?Paudio|video)\1', js, - 'stream type', default='video', group='type') + live = config.get('streamContent') == 'live' + aspect = float_or_none(config.get('aspect')) formats = [] - urls = set() - - def extract_formats(item_url, item={}): - if not item_url or item_url in urls: - return - urls.add(item_url) - ext = mimetype2ext(item.get('type')) or determine_ext(item_url, default_ext=None) - if ext == 'mpd': - formats.extend(self._extract_mpd_formats( - item_url, video_id, mpd_id='mpd', fatal=False)) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - item_url, video_id, 'mp4', - entry_protocol='m3u8' if live else 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - item_url, video_id, f4m_id='hds', fatal=False)) - else: - if not self._is_valid_url(item_url, video_id): - return - formats.append({ - 'url': item_url, - 'format_id': item.get('quality'), - 'ext': 'mp4' if item_url.startswith('rtsp') else ext, - 'vcodec': 'none' if stream_type == 'audio' else None, - }) - - for item_js in re.findall(r'({[^{]*?\b(?:src|source)\s*:\s*["\'].+?})', js): - f = self._parse_json( - item_js, video_id, transform_source=js_to_json, fatal=False) - if not f: + for source_type, source in (config.get('sources') or {}).items(): + if not source: continue - extract_formats(f.get('src'), f) + if source_type == 'dash': + formats.extend(self._extract_mpd_formats( + source, video_id, mpd_id='mpd', fatal=False)) + elif source_type == 'hls': + formats.extend(self._extract_m3u8_formats( + source, video_id, 'mp4', 'm3u8' if live else 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif source_type == 'progressive': + for s in source: + src = s.get('src') + if not (src and self._is_valid_url(src, video_id)): + continue + width = None + format_id = ['http'] + ext = determine_ext(src) + if ext: + format_id.append(ext) + height = int_or_none(s.get('height')) + if height: + format_id.append('%dp' % height) + if aspect: + width = int(height * aspect) + formats.append({ + 'ext': ext, + 'format_id': '-'.join(format_id), + 'height': height, + 'source_preference': 0, + 'url': src, + 'vcodec': 'none' if height == 0 else None, + 'width': width, + }) + for f in formats: + if f.get('acodec') == 'none': + f['preference'] = -40 + elif f.get('vcodec') == 'none': + f['preference'] = -50 + self._sort_formats(formats, ('preference', 'width', 'height', 'source_preference', 'tbr', 'vbr', 'abr', 'ext', 'format_id')) - # More relaxed version to collect additional URLs and acting - # as a future-proof fallback - for _, src in re.findall(r'\b(?:src|source)\s*:\s*(["\'])((?:https?|rtsp)://.+?)\1', js): - extract_formats(src) + subtitles = {} + for subtitle in (config.get('subtitles') or []): + src = subtitle.get('src') + if not src: + continue + subtitles.setdefault(subtitle.get('label') or 'eng', []).append({ + 'url': src, + }) - self._sort_formats(formats) - - title = self._live_title(video_id) if live else video_id + title = config.get('title') or video_id return { 'id': video_id, - 'title': title, + 'title': self._live_title(title) if live else title, + 'thumbnail': config.get('poster') or None, + 'description': config.get('description') or None, + 'timestamp': parse_iso8601(config.get('upload_date')), + 'duration': float_or_none(config.get('vlength')) or None, 'is_live': live, 'formats': formats, + 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index ea1beb8af..4faa6de54 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -107,9 +107,12 @@ class TikTokIE(TikTokBaseIE): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - data = self._parse_json(self._search_regex( + page_props = self._parse_json(self._search_regex( r']+\bid=["\']__NEXT_DATA__[^>]+>\s*({.+?})\s*[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?tmz\.com/videos/(?P[^/?#&]+)' _TESTS = [{ - 'url': 'http://www.tmz.com/videos/0_okj015ty/', - 'md5': '4d22a51ef205b6c06395d8394f72d560', - 'info_dict': { - 'id': '0_okj015ty', - 'ext': 'mp4', - 'title': 'Kim Kardashian\'s Boobs Unlock a Mystery!', - 'description': 'Did Kim Kardasain try to one-up Khloe by one-upping Kylie??? Or is she just showing off her amazing boobs?', - 'timestamp': 1394747163, - 'uploader_id': 'batchUser', - 'upload_date': '20140313', - } - }, { 'url': 'http://www.tmz.com/videos/0-cegprt2p/', + 'md5': '31f9223e20eef55954973359afa61a20', + 'info_dict': { + 'id': 'P6YjLBLk', + 'ext': 'mp4', + 'title': "No Charges Against Hillary Clinton? Harvey Says It Ain't Over Yet", + 'description': 'md5:b714359fc18607715ebccbd2da8ff488', + 'timestamp': 1467831837, + 'upload_date': '20160706', + }, + 'add_ie': [JWPlatformIE.ie_key()], + }, { + 'url': 'http://www.tmz.com/videos/0_okj015ty/', + 'only_matching': True, + }, { + 'url': 'https://www.tmz.com/videos/071119-chris-morgan-women-4590005-0-zcsejvcr/', + 'only_matching': True, + }, { + 'url': 'https://www.tmz.com/videos/2021-02-19-021921-floyd-mayweather-1043872/', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url).replace('-', '_') - return self.url_result('kaltura:591531:%s' % video_id, 'Kaltura', video_id) + + webpage = self._download_webpage(url, video_id, fatal=False) + if webpage: + tmz_video_id = self._search_regex( + r'nodeRef\s*:\s*["\']tmz:video:([\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12})', + webpage, 'video id', default=None) + video = self._download_json( + 'https://www.tmz.com/_/video/%s' % tmz_video_id, video_id, + fatal=False) + if video: + message = video['message'] + info = { + '_type': 'url_transparent', + 'title': message.get('title'), + 'description': message.get('description'), + 'timestamp': unified_timestamp(message.get('published_at')), + 'duration': int_or_none(message.get('duration')), + } + jwplatform_id = message.get('jwplayer_media_id') + if jwplatform_id: + info.update({ + 'url': 'jwplatform:%s' % jwplatform_id, + 'ie_key': JWPlatformIE.ie_key(), + }) + else: + kaltura_entry_id = message.get('kaltura_entry_id') or video_id + kaltura_partner_id = message.get('kaltura_partner_id') or '591531' + info.update({ + 'url': 'kaltura:%s:%s' % (kaltura_partner_id, kaltura_entry_id), + 'ie_key': KalturaIE.ie_key(), + }) + return info + + return self.url_result( + 'kaltura:591531:%s' % video_id, KalturaIE.ie_key(), video_id) class TMZArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tmz\.com/\d{4}/\d{2}/\d{2}/(?P[^/]+)/?' + _VALID_URL = r'https?://(?:www\.)?tmz\.com/\d{4}/\d{2}/\d{2}/(?P[^/?#&]+)' _TEST = { 'url': 'http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert', - 'md5': '3316ff838ae5bb7f642537825e1e90d2', 'info_dict': { - 'id': '0_6snoelag', - 'ext': 'mov', + 'id': 'PAKZa97W', + 'ext': 'mp4', 'title': 'Bobby Brown Tells Crowd ... Bobbi Kristina is Awake', 'description': 'Bobby Brown stunned his audience during a concert Saturday night, when he told the crowd, "Bobbi is awake. She\'s watching me."', - 'timestamp': 1429467813, + 'timestamp': 1429466400, 'upload_date': '20150419', - 'uploader_id': 'batchUser', - } + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [JWPlatformIE.ie_key()], } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + + tmz_url = self._search_regex( + r'clickLink\s*\(\s*["\'](?P%s)' % TMZIE._VALID_URL, webpage, + 'video id', default=None, group='url') + if tmz_url: + return self.url_result(tmz_url, ie=TMZIE.ie_key()) + embedded_video_info = self._parse_json(self._html_search_regex( r'tmzVideoEmbed\(({.+?})\);', webpage, 'embedded video info'), video_id) - return self.url_result( - 'http://www.tmz.com/videos/%s/' % embedded_video_info['id']) + 'http://www.tmz.com/videos/%s/' % embedded_video_info['id'], + ie=TMZIE.ie_key()) diff --git a/youtube_dl/extractor/trovo.py b/youtube_dl/extractor/trovo.py new file mode 100644 index 000000000..de0107aa9 --- /dev/null +++ b/youtube_dl/extractor/trovo.py @@ -0,0 +1,194 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + str_or_none, + try_get, +) + + +class TrovoBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?trovo\.live/' + + def _extract_streamer_info(self, data): + streamer_info = data.get('streamerInfo') or {} + username = streamer_info.get('userName') + return { + 'uploader': streamer_info.get('nickName'), + 'uploader_id': str_or_none(streamer_info.get('uid')), + 'uploader_url': 'https://trovo.live/' + username if username else None, + } + + +class TrovoIE(TrovoBaseIE): + _VALID_URL = TrovoBaseIE._VALID_URL_BASE + r'(?!(?:clip|video)/)(?P[^/?&#]+)' + + def _real_extract(self, url): + username = self._match_id(url) + live_info = self._download_json( + 'https://gql.trovo.live/', username, query={ + 'query': '''{ + getLiveInfo(params: {userName: "%s"}) { + isLive + programInfo { + coverUrl + id + streamInfo { + desc + playUrl + } + title + } + streamerInfo { + nickName + uid + userName + } + } +}''' % username, + })['data']['getLiveInfo'] + if live_info.get('isLive') == 0: + raise ExtractorError('%s is offline' % username, expected=True) + program_info = live_info['programInfo'] + program_id = program_info['id'] + title = self._live_title(program_info['title']) + + formats = [] + for stream_info in (program_info.get('streamInfo') or []): + play_url = stream_info.get('playUrl') + if not play_url: + continue + format_id = stream_info.get('desc') + formats.append({ + 'format_id': format_id, + 'height': int_or_none(format_id[:-1]) if format_id else None, + 'url': play_url, + }) + self._sort_formats(formats) + + info = { + 'id': program_id, + 'title': title, + 'formats': formats, + 'thumbnail': program_info.get('coverUrl'), + 'is_live': True, + } + info.update(self._extract_streamer_info(live_info)) + return info + + +class TrovoVodIE(TrovoBaseIE): + _VALID_URL = TrovoBaseIE._VALID_URL_BASE + r'(?:clip|video)/(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://trovo.live/video/ltv-100095501_100095501_1609596043', + 'info_dict': { + 'id': 'ltv-100095501_100095501_1609596043', + 'ext': 'mp4', + 'title': 'Spontaner 12 Stunden Stream! - Ok Boomer!', + 'uploader': 'Exsl', + 'timestamp': 1609640305, + 'upload_date': '20210103', + 'uploader_id': '100095501', + 'duration': 43977, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'comments': 'mincount:8', + 'categories': ['Grand Theft Auto V'], + }, + }, { + 'url': 'https://trovo.live/clip/lc-5285890810184026005', + 'only_matching': True, + }] + + def _real_extract(self, url): + vid = self._match_id(url) + resp = self._download_json( + 'https://gql.trovo.live/', vid, data=json.dumps([{ + 'query': '''{ + batchGetVodDetailInfo(params: {vids: ["%s"]}) { + VodDetailInfos + } +}''' % vid, + }, { + 'query': '''{ + getCommentList(params: {appInfo: {postID: "%s"}, pageSize: 1000000000, preview: {}}) { + commentList { + author { + nickName + uid + } + commentID + content + createdAt + parentID + } + } +}''' % vid, + }]).encode(), headers={ + 'Content-Type': 'application/json', + }) + vod_detail_info = resp[0]['data']['batchGetVodDetailInfo']['VodDetailInfos'][vid] + vod_info = vod_detail_info['vodInfo'] + title = vod_info['title'] + + language = vod_info.get('languageName') + formats = [] + for play_info in (vod_info.get('playInfos') or []): + play_url = play_info.get('playUrl') + if not play_url: + continue + format_id = play_info.get('desc') + formats.append({ + 'ext': 'mp4', + 'filesize': int_or_none(play_info.get('fileSize')), + 'format_id': format_id, + 'height': int_or_none(format_id[:-1]) if format_id else None, + 'language': language, + 'protocol': 'm3u8_native', + 'tbr': int_or_none(play_info.get('bitrate')), + 'url': play_url, + 'http_headers': {'Origin': 'https://trovo.live'}, + }) + self._sort_formats(formats) + + category = vod_info.get('categoryName') + get_count = lambda x: int_or_none(vod_info.get(x + 'Num')) + + comment_list = try_get(resp, lambda x: x[1]['data']['getCommentList']['commentList'], list) or [] + comments = [] + for comment in comment_list: + content = comment.get('content') + if not content: + continue + author = comment.get('author') or {} + parent = comment.get('parentID') + comments.append({ + 'author': author.get('nickName'), + 'author_id': str_or_none(author.get('uid')), + 'id': str_or_none(comment.get('commentID')), + 'text': content, + 'timestamp': int_or_none(comment.get('createdAt')), + 'parent': 'root' if parent == 0 else str_or_none(parent), + }) + + info = { + 'id': vid, + 'title': title, + 'formats': formats, + 'thumbnail': vod_info.get('coverUrl'), + 'timestamp': int_or_none(vod_info.get('publishTs')), + 'duration': int_or_none(vod_info.get('duration')), + 'view_count': get_count('watch'), + 'like_count': get_count('like'), + 'comment_count': get_count('comment'), + 'comments': comments, + 'categories': [category] if category else None, + } + info.update(self._extract_streamer_info(vod_detail_info)) + return info diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py index 4a19b9be6..334b7d540 100644 --- a/youtube_dl/extractor/tv2.py +++ b/youtube_dl/extractor/tv2.py @@ -20,7 +20,7 @@ from ..utils import ( class TV2IE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tv2\.no/v/(?P\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.tv2.no/v/916509/', 'info_dict': { 'id': '916509', @@ -33,7 +33,7 @@ class TV2IE(InfoExtractor): 'view_count': int, 'categories': list, }, - } + }] _API_DOMAIN = 'sumo.tv2.no' _PROTOCOLS = ('HDS', 'HLS', 'DASH') _GEO_COUNTRIES = ['NO'] @@ -42,6 +42,12 @@ class TV2IE(InfoExtractor): video_id = self._match_id(url) api_base = 'http://%s/api/web/asset/%s' % (self._API_DOMAIN, video_id) + asset = self._download_json( + api_base + '.json', video_id, + 'Downloading metadata JSON')['asset'] + title = asset.get('subtitle') or asset['title'] + is_live = asset.get('live') is True + formats = [] format_urls = [] for protocol in self._PROTOCOLS: @@ -81,7 +87,8 @@ class TV2IE(InfoExtractor): elif ext == 'm3u8': if not data.get('drmProtected'): formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', + video_url, video_id, 'mp4', + 'm3u8' if is_live else 'm3u8_native', m3u8_id=format_id, fatal=False)) elif ext == 'mpd': formats.extend(self._extract_mpd_formats( @@ -99,11 +106,6 @@ class TV2IE(InfoExtractor): raise ExtractorError('This video is DRM protected.', expected=True) self._sort_formats(formats) - asset = self._download_json( - api_base + '.json', video_id, - 'Downloading metadata JSON')['asset'] - title = asset['title'] - thumbnails = [{ 'id': thumbnail.get('@type'), 'url': thumbnail.get('url'), @@ -112,7 +114,7 @@ class TV2IE(InfoExtractor): return { 'id': video_id, 'url': video_url, - 'title': title, + 'title': self._live_title(title) if is_live else title, 'description': strip_or_none(asset.get('description')), 'thumbnails': thumbnails, 'timestamp': parse_iso8601(asset.get('createTime')), @@ -120,6 +122,7 @@ class TV2IE(InfoExtractor): 'view_count': int_or_none(asset.get('views')), 'categories': asset.get('keywords', '').split(','), 'formats': formats, + 'is_live': is_live, } @@ -168,13 +171,13 @@ class TV2ArticleIE(InfoExtractor): class KatsomoIE(TV2IE): - _VALID_URL = r'https?://(?:www\.)?(?:katsomo|mtv)\.fi/(?:#!/)?(?:[^/]+/[0-9a-z-]+-\d+/[0-9a-z-]+-|[^/]+/\d+/[^/]+/)(?P\d+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?(?:katsomo|mtv(uutiset)?)\.fi/(?:sarja/[0-9a-z-]+-\d+/[0-9a-z-]+-|(?:#!/)?jakso/(?:\d+/[^/]+/)?|video/prog)(?P\d+)' + _TESTS = [{ 'url': 'https://www.mtv.fi/sarja/mtv-uutiset-live-33001002003/lahden-pelicans-teki-kovan-ratkaisun-ville-nieminen-pihalle-1181321', 'info_dict': { 'id': '1181321', 'ext': 'mp4', - 'title': 'MTV Uutiset Live', + 'title': 'Lahden Pelicans teki kovan ratkaisun – Ville Nieminen pihalle', 'description': 'Päätöksen teki Pelicansin hallitus.', 'timestamp': 1575116484, 'upload_date': '20191130', @@ -186,7 +189,60 @@ class KatsomoIE(TV2IE): # m3u8 download 'skip_download': True, }, - } + }, { + 'url': 'http://www.katsomo.fi/#!/jakso/33001005/studio55-fi/658521/jukka-kuoppamaki-tekee-yha-lauluja-vaikka-lentokoneessa', + 'only_matching': True, + }, { + 'url': 'https://www.mtvuutiset.fi/video/prog1311159', + 'only_matching': True, + }, { + 'url': 'https://www.katsomo.fi/#!/jakso/1311159', + 'only_matching': True, + }] _API_DOMAIN = 'api.katsomo.fi' _PROTOCOLS = ('HLS', 'MPD') _GEO_COUNTRIES = ['FI'] + + +class MTVUutisetArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)mtvuutiset\.fi/artikkeli/[^/]+/(?P\d+)' + _TESTS = [{ + 'url': 'https://www.mtvuutiset.fi/artikkeli/tallaisia-vaurioita-viking-amorellassa-on-useamman-osaston-alla-vetta/7931384', + 'info_dict': { + 'id': '1311159', + 'ext': 'mp4', + 'title': 'Viking Amorellan matkustajien evakuointi on alkanut – tältä operaatio näyttää laivalla', + 'description': 'Viking Amorellan matkustajien evakuointi on alkanut – tältä operaatio näyttää laivalla', + 'timestamp': 1600608966, + 'upload_date': '20200920', + 'duration': 153.7886666, + 'view_count': int, + 'categories': list, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # multiple Youtube embeds + 'url': 'https://www.mtvuutiset.fi/artikkeli/50-vuotta-subarun-vastaiskua/6070962', + 'only_matching': True, + }] + + def _real_extract(self, url): + article_id = self._match_id(url) + article = self._download_json( + 'http://api.mtvuutiset.fi/mtvuutiset/api/json/' + article_id, + article_id) + + def entries(): + for video in (article.get('videos') or []): + video_type = video.get('videotype') + video_url = video.get('url') + if not (video_url and video_type in ('katsomo', 'youtube')): + continue + yield self.url_result( + video_url, video_type.capitalize(), video.get('video_id')) + + return self.playlist_result( + entries(), article_id, article.get('title'), article.get('description')) diff --git a/youtube_dl/extractor/tv2dk.py b/youtube_dl/extractor/tv2dk.py index 8bda9348d..8bd5fd640 100644 --- a/youtube_dl/extractor/tv2dk.py +++ b/youtube_dl/extractor/tv2dk.py @@ -74,6 +74,12 @@ class TV2DKIE(InfoExtractor): webpage = self._download_webpage(url, video_id) entries = [] + + def add_entry(partner_id, kaltura_id): + entries.append(self.url_result( + 'kaltura:%s:%s' % (partner_id, kaltura_id), 'Kaltura', + video_id=kaltura_id)) + for video_el in re.findall(r'(?s)<[^>]+\bdata-entryid\s*=[^>]*>', webpage): video = extract_attributes(video_el) kaltura_id = video.get('data-entryid') @@ -82,9 +88,14 @@ class TV2DKIE(InfoExtractor): partner_id = video.get('data-partnerid') if not partner_id: continue - entries.append(self.url_result( - 'kaltura:%s:%s' % (partner_id, kaltura_id), 'Kaltura', - video_id=kaltura_id)) + add_entry(partner_id, kaltura_id) + if not entries: + kaltura_id = self._search_regex( + r'entry_id\s*:\s*["\']([0-9a-z_]+)', webpage, 'kaltura id') + partner_id = self._search_regex( + (r'\\u002Fp\\u002F(\d+)\\u002F', r'/p/(\d+)/'), webpage, + 'partner id') + add_entry(partner_id, kaltura_id) return self.playlist_result(entries) diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index c498b0191..b73bab9a8 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -17,7 +17,7 @@ class TV4IE(InfoExtractor): tv4\.se/(?:[^/]+)/klipp/(?:.*)-| tv4play\.se/ (?: - (?:program|barn)/(?:[^/]+/|(?:[^\?]+)\?video_id=)| + (?:program|barn)/(?:(?:[^/]+/){1,2}|(?:[^\?]+)\?video_id=)| iframe/video/| film/| sport/| @@ -65,6 +65,10 @@ class TV4IE(InfoExtractor): { 'url': 'http://www.tv4play.se/program/farang/3922081', 'only_matching': True, + }, + { + 'url': 'https://www.tv4play.se/program/nyheterna/avsnitt/13315940', + 'only_matching': True, } ] diff --git a/youtube_dl/extractor/tver.py b/youtube_dl/extractor/tver.py index 931d4d650..a4a30b1e6 100644 --- a/youtube_dl/extractor/tver.py +++ b/youtube_dl/extractor/tver.py @@ -25,6 +25,10 @@ class TVerIE(InfoExtractor): }, { 'url': 'https://tver.jp/episode/79622438', 'only_matching': True, + }, { + # subtitle = ' ' + 'url': 'https://tver.jp/corner/f0068870', + 'only_matching': True, }] _TOKEN = None BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' @@ -40,28 +44,18 @@ class TVerIE(InfoExtractor): query={'token': self._TOKEN})['main'] p_id = main['publisher_id'] service = remove_start(main['service'], 'ts_') - info = { + + r_id = main['reference_id'] + if service not in ('tx', 'russia2018', 'sebare2018live', 'gorin'): + r_id = 'ref:' + r_id + bc_url = smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id), + {'geo_countries': ['JP']}) + + return { '_type': 'url_transparent', 'description': try_get(main, lambda x: x['note'][0]['text'], compat_str), 'episode_number': int_or_none(try_get(main, lambda x: x['ext']['episode_number'])), + 'url': bc_url, + 'ie_key': 'BrightcoveNew', } - - if service == 'cx': - info.update({ - 'title': main.get('subtitle') or main['title'], - 'url': 'https://i.fod.fujitv.co.jp/plus7/web/%s/%s.html' % (p_id[:4], p_id), - 'ie_key': 'FujiTVFODPlus7', - }) - else: - r_id = main['reference_id'] - if service not in ('tx', 'russia2018', 'sebare2018live', 'gorin'): - r_id = 'ref:' + r_id - bc_url = smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id), - {'geo_countries': ['JP']}) - info.update({ - 'url': bc_url, - 'ie_key': 'BrightcoveNew', - }) - - return info diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index db264e8a1..a378bd6dc 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -17,6 +17,7 @@ from ..compat import ( ) from ..utils import ( clean_html, + dict_get, ExtractorError, float_or_none, int_or_none, @@ -48,6 +49,7 @@ class TwitchBaseIE(InfoExtractor): 'ChannelCollectionsContent': '07e3691a1bad77a36aba590c351180439a40baefc1c275356f40fc7082419a84', 'StreamMetadata': '1c719a40e481453e5c48d9bb585d971b8b372f8ebb105b17076722264dfa5b3e', 'ComscoreStreamingQuery': 'e1edae8122517d013405f237ffcc124515dc6ded82480a88daef69c83b53ac01', + 'VideoAccessToken_Clip': '36b89d2507fce29e5ca551df756d27c1cfe079e2609642b4390aa4c35796eb11', 'VideoPreviewOverlay': '3006e77e51b128d838fa4e835723ca4dc9a05c5efd4466c1085215c6e437e65c', 'VideoMetadata': '226edb3e692509f727fd56821f5653c05740242c82b0388883e0c0e75dcbf687', } @@ -76,14 +78,14 @@ class TwitchBaseIE(InfoExtractor): headers = { 'Referer': page_url, - 'Origin': page_url, + 'Origin': 'https://www.twitch.tv', 'Content-Type': 'text/plain;charset=UTF-8', } response = self._download_json( post_url, None, note, data=json.dumps(form).encode(), headers=headers, expected_status=400) - error = response.get('error_description') or response.get('error_code') + error = dict_get(response, ('error', 'error_description', 'error_code')) if error: fail(error) @@ -137,13 +139,17 @@ class TwitchBaseIE(InfoExtractor): self._sort_formats(formats) def _download_base_gql(self, video_id, ops, note, fatal=True): + headers = { + 'Content-Type': 'text/plain;charset=UTF-8', + 'Client-ID': self._CLIENT_ID, + } + gql_auth = self._get_cookies('https://gql.twitch.tv').get('auth-token') + if gql_auth: + headers['Authorization'] = 'OAuth ' + gql_auth.value return self._download_json( 'https://gql.twitch.tv/gql', video_id, note, data=json.dumps(ops).encode(), - headers={ - 'Content-Type': 'text/plain;charset=UTF-8', - 'Client-ID': self._CLIENT_ID, - }, fatal=fatal) + headers=headers, fatal=fatal) def _download_gql(self, video_id, ops, note, fatal=True): for op in ops: @@ -888,7 +894,25 @@ class TwitchClipsIE(TwitchBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - clip = self._download_base_gql( + clip = self._download_gql( + video_id, [{ + 'operationName': 'VideoAccessToken_Clip', + 'variables': { + 'slug': video_id, + }, + }], + 'Downloading clip access token GraphQL')[0]['data']['clip'] + + if not clip: + raise ExtractorError( + 'This clip is no longer available', expected=True) + + access_query = { + 'sig': clip['playbackAccessToken']['signature'], + 'token': clip['playbackAccessToken']['value'], + } + + data = self._download_base_gql( video_id, { 'query': '''{ clip(slug: "%s") { @@ -913,11 +937,10 @@ class TwitchClipsIE(TwitchBaseIE): } viewCount } -}''' % video_id}, 'Downloading clip GraphQL')['data']['clip'] +}''' % video_id}, 'Downloading clip GraphQL', fatal=False) - if not clip: - raise ExtractorError( - 'This clip is no longer available', expected=True) + if data: + clip = try_get(data, lambda x: x['data']['clip'], dict) or clip formats = [] for option in clip.get('videoQualities', []): @@ -927,7 +950,7 @@ class TwitchClipsIE(TwitchBaseIE): if not source: continue formats.append({ - 'url': source, + 'url': update_url_query(source, access_query), 'format_id': option.get('quality'), 'height': int_or_none(option.get('quality')), 'fps': int_or_none(option.get('frameRate')), diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 1190d721e..cfa7a7326 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -19,6 +19,7 @@ from ..utils import ( strip_or_none, unified_timestamp, update_url_query, + url_or_none, xpath_text, ) @@ -52,6 +53,9 @@ class TwitterBaseIE(InfoExtractor): return [f] def _extract_formats_from_vmap_url(self, vmap_url, video_id): + vmap_url = url_or_none(vmap_url) + if not vmap_url: + return [] vmap_data = self._download_xml(vmap_url, video_id) formats = [] urls = [] @@ -373,6 +377,24 @@ class TwitterIE(TwitterBaseIE): 'uploader_id': '1eVjYOLGkGrQL', }, 'add_ie': ['TwitterBroadcast'], + }, { + # unified card + 'url': 'https://twitter.com/BrooklynNets/status/1349794411333394432?s=20', + 'info_dict': { + 'id': '1349794411333394432', + 'ext': 'mp4', + 'title': 'md5:d1c4941658e4caaa6cb579260d85dcba', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': 'md5:71ead15ec44cee55071547d6447c6a3e', + 'uploader': 'Brooklyn Nets', + 'uploader_id': 'BrooklynNets', + 'duration': 324.484, + 'timestamp': 1610651040, + 'upload_date': '20210114', + }, + 'params': { + 'skip_download': True, + }, }, { # Twitch Clip Embed 'url': 'https://twitter.com/GunB1g/status/1163218564784017422', @@ -389,6 +411,22 @@ class TwitterIE(TwitterBaseIE): # appplayer card 'url': 'https://twitter.com/poco_dandy/status/1150646424461176832', 'only_matching': True, + }, { + # video_direct_message card + 'url': 'https://twitter.com/qarev001/status/1348948114569269251', + 'only_matching': True, + }, { + # poll2choice_video card + 'url': 'https://twitter.com/CAF_Online/status/1349365911120195585', + 'only_matching': True, + }, { + # poll3choice_video card + 'url': 'https://twitter.com/SamsungMobileSA/status/1348609186725289984', + 'only_matching': True, + }, { + # poll4choice_video card + 'url': 'https://twitter.com/SouthamptonFC/status/1347577658079641604', + 'only_matching': True, }] def _real_extract(self, url): @@ -433,8 +471,7 @@ class TwitterIE(TwitterBaseIE): 'tags': tags, } - media = try_get(status, lambda x: x['extended_entities']['media'][0]) - if media and media.get('type') != 'photo': + def extract_from_video_info(media): video_info = media.get('video_info') or {} formats = [] @@ -461,6 +498,10 @@ class TwitterIE(TwitterBaseIE): 'thumbnails': thumbnails, 'duration': float_or_none(video_info.get('duration_millis'), 1000), }) + + media = try_get(status, lambda x: x['extended_entities']['media'][0]) + if media and media.get('type') != 'photo': + extract_from_video_info(media) else: card = status.get('card') if card: @@ -493,7 +534,12 @@ class TwitterIE(TwitterBaseIE): '_type': 'url', 'url': get_binding_value('card_url'), }) - # amplify, promo_video_website, promo_video_convo, appplayer, ... + elif card_name == 'unified_card': + media_entities = self._parse_json(get_binding_value('unified_card'), twid)['media_entities'] + extract_from_video_info(next(iter(media_entities.values()))) + # amplify, promo_video_website, promo_video_convo, appplayer, + # video_direct_message, poll2choice_video, poll3choice_video, + # poll4choice_video, ... else: is_amplify = card_name == 'amplify' vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url') diff --git a/youtube_dl/extractor/umg.py b/youtube_dl/extractor/umg.py index d815cd9a6..47948b6ce 100644 --- a/youtube_dl/extractor/umg.py +++ b/youtube_dl/extractor/umg.py @@ -28,7 +28,7 @@ class UMGDeIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) video_data = self._download_json( - 'https://api.universal-music.de/graphql', + 'https://graphql.universal-music.de/', video_id, query={ 'query': '''{ universalMusic(channel:16) { @@ -56,11 +56,9 @@ class UMGDeIE(InfoExtractor): formats = [] def add_m3u8_format(format_id): - m3u8_formats = self._extract_m3u8_formats( + formats.extend(self._extract_m3u8_formats( hls_url_template % format_id, video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal='False') - if m3u8_formats and m3u8_formats[0].get('height'): - formats.extend(m3u8_formats) + 'm3u8_native', m3u8_id='hls', fatal=False)) for f in video_data.get('formats', []): f_url = f.get('url') diff --git a/youtube_dl/extractor/urplay.py b/youtube_dl/extractor/urplay.py index 10b817760..d6c79147e 100644 --- a/youtube_dl/extractor/urplay.py +++ b/youtube_dl/extractor/urplay.py @@ -21,6 +21,11 @@ class URPlayIE(InfoExtractor): 'description': 'md5:5344508a52aa78c1ced6c1b8b9e44e9a', 'timestamp': 1513292400, 'upload_date': '20171214', + 'series': 'UR Samtiden - Livet, universum och rymdens märkliga musik', + 'duration': 2269, + 'categories': ['Kultur & historia'], + 'tags': ['Kritiskt tänkande', 'Vetenskap', 'Vetenskaplig verksamhet'], + 'episode': 'Om vetenskap, kritiskt tänkande och motstånd', }, }, { 'url': 'https://urskola.se/Produkter/190031-Tripp-Trapp-Trad-Sovkudde', @@ -31,6 +36,10 @@ class URPlayIE(InfoExtractor): 'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1', 'timestamp': 1440086400, 'upload_date': '20150820', + 'series': 'Tripp, Trapp, Träd', + 'duration': 865, + 'tags': ['Sova'], + 'episode': 'Sovkudde', }, }, { 'url': 'http://urskola.se/Produkter/155794-Smasagor-meankieli-Grodan-i-vida-varlden', @@ -41,9 +50,11 @@ class URPlayIE(InfoExtractor): video_id = self._match_id(url) url = url.replace('skola.se/Produkter', 'play.se/program') webpage = self._download_webpage(url, video_id) - urplayer_data = self._parse_json(self._html_search_regex( - r'data-react-class="components/Player/Player"[^>]+data-react-props="({.+?})"', - webpage, 'urplayer data'), video_id)['currentProduct'] + vid = int(video_id) + accessible_episodes = self._parse_json(self._html_search_regex( + r'data-react-class="routes/Product/components/ProgramContainer/ProgramContainer"[^>]+data-react-props="({.+?})"', + webpage, 'urplayer data'), video_id)['accessibleEpisodes'] + urplayer_data = next(e for e in accessible_episodes if e.get('id') == vid) episode = urplayer_data['title'] raw_streaming_info = urplayer_data['streamingInfo']['raw'] host = self._download_json( diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py index 9e860aeb7..1e29cbe22 100644 --- a/youtube_dl/extractor/ustream.py +++ b/youtube_dl/extractor/ustream.py @@ -75,7 +75,7 @@ class UstreamIE(InfoExtractor): @staticmethod def _extract_url(webpage): mobj = re.search( - r']+?src=(["\'])(?Phttp://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/embed/.+?)\1', webpage) + r']+?src=(["\'])(?Phttps?://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/embed/.+?)\1', webpage) if mobj is not None: return mobj.group('url') diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index fe7a26b62..22e99e8f0 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -23,6 +23,8 @@ class VGTVIE(XstreamIE): 'fvn.no/fvntv': 'fvntv', 'aftenposten.no/webtv': 'aptv', 'ap.vgtv.no/webtv': 'aptv', + 'tv.aftonbladet.se': 'abtv', + # obsolete URL schemas, kept in order to save one HTTP redirect 'tv.aftonbladet.se/abtv': 'abtv', 'www.aftonbladet.se/tv': 'abtv', } @@ -140,6 +142,10 @@ class VGTVIE(XstreamIE): 'url': 'http://www.vgtv.no/#!/video/127205/inside-the-mind-of-favela-funk', 'only_matching': True, }, + { + 'url': 'https://tv.aftonbladet.se/video/36015/vulkanutbrott-i-rymden-nu-slapper-nasa-bilderna', + 'only_matching': True, + }, { 'url': 'http://tv.aftonbladet.se/abtv/articles/36015', 'only_matching': True, diff --git a/youtube_dl/extractor/videopress.py b/youtube_dl/extractor/videopress.py index e5f964d39..6376ff096 100644 --- a/youtube_dl/extractor/videopress.py +++ b/youtube_dl/extractor/videopress.py @@ -4,21 +4,22 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( determine_ext, float_or_none, + int_or_none, parse_age_limit, qualities, random_birthday, - try_get, unified_timestamp, urljoin, ) class VideoPressIE(InfoExtractor): - _VALID_URL = r'https?://videopress\.com/embed/(?P[\da-zA-Z]+)' + _ID_REGEX = r'[\da-zA-Z]{8}' + _PATH_REGEX = r'video(?:\.word)?press\.com/embed/' + _VALID_URL = r'https?://%s(?P%s)' % (_PATH_REGEX, _ID_REGEX) _TESTS = [{ 'url': 'https://videopress.com/embed/kUJmAcSf', 'md5': '706956a6c875873d51010921310e4bc6', @@ -36,35 +37,36 @@ class VideoPressIE(InfoExtractor): # 17+, requires birth_* params 'url': 'https://videopress.com/embed/iH3gstfZ', 'only_matching': True, + }, { + 'url': 'https://video.wordpress.com/embed/kUJmAcSf', + 'only_matching': True, }] @staticmethod def _extract_urls(webpage): return re.findall( - r']+src=["\']((?:https?://)?videopress\.com/embed/[\da-zA-Z]+)', + r']+src=["\']((?:https?://)?%s%s)' % (VideoPressIE._PATH_REGEX, VideoPressIE._ID_REGEX), webpage) def _real_extract(self, url): video_id = self._match_id(url) query = random_birthday('birth_year', 'birth_month', 'birth_day') + query['fields'] = 'description,duration,file_url_base,files,height,original,poster,rating,title,upload_date,width' video = self._download_json( 'https://public-api.wordpress.com/rest/v1.1/videos/%s' % video_id, video_id, query=query) title = video['title'] - def base_url(scheme): - return try_get( - video, lambda x: x['file_url_base'][scheme], compat_str) - - base_url = base_url('https') or base_url('http') + file_url_base = video.get('file_url_base') or {} + base_url = file_url_base.get('https') or file_url_base.get('http') QUALITIES = ('std', 'dvd', 'hd') quality = qualities(QUALITIES) formats = [] - for format_id, f in video['files'].items(): + for format_id, f in (video.get('files') or {}).items(): if not isinstance(f, dict): continue for ext, path in f.items(): @@ -75,12 +77,14 @@ class VideoPressIE(InfoExtractor): 'ext': determine_ext(path, ext), 'quality': quality(format_id), }) - original_url = try_get(video, lambda x: x['original'], compat_str) + original_url = video.get('original') if original_url: formats.append({ 'url': original_url, 'format_id': 'original', 'quality': len(QUALITIES), + 'width': int_or_none(video.get('width')), + 'height': int_or_none(video.get('height')), }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/vidio.py b/youtube_dl/extractor/vidio.py index b48baf00b..b1243e847 100644 --- a/youtube_dl/extractor/vidio.py +++ b/youtube_dl/extractor/vidio.py @@ -4,7 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + int_or_none, + parse_iso8601, + str_or_none, + strip_or_none, + try_get, +) class VidioIE(InfoExtractor): @@ -21,57 +27,63 @@ class VidioIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 149, 'like_count': int, + 'uploader': 'TWELVE Pic', + 'timestamp': 1444902800, + 'upload_date': '20151015', + 'uploader_id': 'twelvepictures', + 'channel': 'Cover Music Video', + 'channel_id': '280236', + 'view_count': int, + 'dislike_count': int, + 'comment_count': int, + 'tags': 'count:4', }, }, { 'url': 'https://www.vidio.com/watch/77949-south-korea-test-fires-missile-that-can-strike-all-of-the-north', 'only_matching': True, }] + def _real_initialize(self): + self._api_key = self._download_json( + 'https://www.vidio.com/auth', None, data=b'')['api_key'] + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id, display_id = mobj.group('id', 'display_id') + video_id, display_id = re.match(self._VALID_URL, url).groups() + data = self._download_json( + 'https://api.vidio.com/videos/' + video_id, display_id, headers={ + 'Content-Type': 'application/vnd.api+json', + 'X-API-KEY': self._api_key, + }) + video = data['videos'][0] + title = video['title'].strip() - webpage = self._download_webpage(url, display_id) - - title = self._og_search_title(webpage) - - m3u8_url, duration, thumbnail = [None] * 3 - - clips = self._parse_json( - self._html_search_regex( - r'data-json-clips\s*=\s*(["\'])(?P\[.+?\])\1', - webpage, 'video data', default='[]', group='data'), - display_id, fatal=False) - if clips: - clip = clips[0] - m3u8_url = clip.get('sources', [{}])[0].get('file') - duration = clip.get('clip_duration') - thumbnail = clip.get('image') - - m3u8_url = m3u8_url or self._search_regex( - r'data(?:-vjs)?-clip-hls-url=(["\'])(?P(?:(?!\1).)+)\1', - webpage, 'hls url', group='url') formats = self._extract_m3u8_formats( - m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native') + data['clips'][0]['hls_url'], display_id, 'mp4', 'm3u8_native') self._sort_formats(formats) - duration = int_or_none(duration or self._search_regex( - r'data-video-duration=(["\'])(?P\d+)\1', webpage, - 'duration', fatal=False, group='duration')) - thumbnail = thumbnail or self._og_search_thumbnail(webpage) - - like_count = int_or_none(self._search_regex( - (r']+data-comment-vote-count=["\'](\d+)', - r']+class=["\'].*?\blike(?:__|-)count\b.*?["\'][^>]*>\s*(\d+)'), - webpage, 'like count', fatal=False)) + get_first = lambda x: try_get(data, lambda y: y[x + 's'][0], dict) or {} + channel = get_first('channel') + user = get_first('user') + username = user.get('username') + get_count = lambda x: int_or_none(video.get('total_' + x)) return { 'id': video_id, 'display_id': display_id, 'title': title, - 'description': self._og_search_description(webpage), - 'thumbnail': thumbnail, - 'duration': duration, - 'like_count': like_count, + 'description': strip_or_none(video.get('description')), + 'thumbnail': video.get('image_url_medium'), + 'duration': int_or_none(video.get('duration')), + 'like_count': get_count('likes'), 'formats': formats, + 'uploader': user.get('name'), + 'timestamp': parse_iso8601(video.get('created_at')), + 'uploader_id': username, + 'uploader_url': 'https://www.vidio.com/@' + username if username else None, + 'channel': channel.get('name'), + 'channel_id': str_or_none(channel.get('id')), + 'view_count': get_count('view_count'), + 'dislike_count': get_count('dislikes'), + 'comment_count': get_count('comments'), + 'tags': video.get('tag_list'), } diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py deleted file mode 100644 index 42ea4952c..000000000 --- a/youtube_dl/extractor/vidzi.py +++ /dev/null @@ -1,68 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - decode_packed_codes, - js_to_json, - NO_DEFAULT, - PACKED_CODES_RE, -) - - -class VidziIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vidzi\.(?:tv|cc|si|nu)/(?:embed-)?(?P[0-9a-zA-Z]+)' - _TESTS = [{ - 'url': 'http://vidzi.tv/cghql9yq6emu.html', - 'md5': '4f16c71ca0c8c8635ab6932b5f3f1660', - 'info_dict': { - 'id': 'cghql9yq6emu', - 'ext': 'mp4', - 'title': 'youtube-dl test video 1\\\\2\'3/4<5\\\\6ä7↭', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://vidzi.tv/embed-4z2yb0rzphe9-600x338.html', - 'only_matching': True, - }, { - 'url': 'http://vidzi.cc/cghql9yq6emu.html', - 'only_matching': True, - }, { - 'url': 'https://vidzi.si/rph9gztxj1et.html', - 'only_matching': True, - }, { - 'url': 'http://vidzi.nu/cghql9yq6emu.html', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage( - 'http://vidzi.tv/%s' % video_id, video_id) - title = self._html_search_regex( - r'(?s)

(.*?)

', webpage, 'title') - - codes = [webpage] - codes.extend([ - decode_packed_codes(mobj.group(0)).replace('\\\'', '\'') - for mobj in re.finditer(PACKED_CODES_RE, webpage)]) - for num, code in enumerate(codes, 1): - jwplayer_data = self._parse_json( - self._search_regex( - r'setup\(([^)]+)\)', code, 'jwplayer data', - default=NO_DEFAULT if num == len(codes) else '{}'), - video_id, transform_source=lambda s: js_to_json( - re.sub(r'\s*\+\s*window\[.+?\]', '', s))) - if jwplayer_data: - break - - info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False) - info_dict['title'] = title - - return info_dict diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index a311f21ef..2e9cbf148 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -21,6 +21,7 @@ from ..utils import ( parse_iso8601, sanitized_Request, std_headers, + try_get, ) @@ -30,7 +31,7 @@ class VikiBaseIE(InfoExtractor): _API_URL_TEMPLATE = 'https://api.viki.io%s&sig=%s' _APP = '100005a' - _APP_VERSION = '2.2.5.1428709186' + _APP_VERSION = '6.0.0' _APP_SECRET = 'MM_d*yP@`&1@]@!AVrXf_o-HVEnoTnm$O-ti4[G~$JDI/Dc-&piU&z&5.;:}95=Iad' _GEO_BYPASS = False @@ -41,7 +42,7 @@ class VikiBaseIE(InfoExtractor): _ERRORS = { 'geo': 'Sorry, this content is not available in your region.', 'upcoming': 'Sorry, this content is not yet available.', - # 'paywall': 'paywall', + 'paywall': 'Sorry, this content is only available to Viki Pass Plus subscribers', } def _prepare_call(self, path, timestamp=None, post_data=None): @@ -62,7 +63,8 @@ class VikiBaseIE(InfoExtractor): def _call_api(self, path, video_id, note, timestamp=None, post_data=None): resp = self._download_json( - self._prepare_call(path, timestamp, post_data), video_id, note) + self._prepare_call(path, timestamp, post_data), video_id, note, + headers={'x-viki-app-ver': self._APP_VERSION}) error = resp.get('error') if error: @@ -82,11 +84,13 @@ class VikiBaseIE(InfoExtractor): expected=True) def _check_errors(self, data): - for reason, status in data.get('blocking', {}).items(): + for reason, status in (data.get('blocking') or {}).items(): if status and reason in self._ERRORS: message = self._ERRORS[reason] if reason == 'geo': self.raise_geo_restricted(msg=message) + elif reason == 'paywall': + self.raise_login_required(message) raise ExtractorError('%s said: %s' % ( self.IE_NAME, message), expected=True) @@ -131,13 +135,19 @@ class VikiIE(VikiBaseIE): 'info_dict': { 'id': '1023585v', 'ext': 'mp4', - 'title': 'Heirs Episode 14', - 'uploader': 'SBS', - 'description': 'md5:c4b17b9626dd4b143dcc4d855ba3474e', + 'title': 'Heirs - Episode 14', + 'uploader': 'SBS Contents Hub', + 'timestamp': 1385047627, 'upload_date': '20131121', 'age_limit': 13, + 'duration': 3570, + 'episode_number': 14, + }, + 'params': { + 'format': 'bestvideo', }, 'skip': 'Blocked in the US', + 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], }, { # clip 'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference', @@ -153,7 +163,8 @@ class VikiIE(VikiBaseIE): 'uploader': 'Arirang TV', 'like_count': int, 'age_limit': 0, - } + }, + 'skip': 'Sorry. There was an error loading this video', }, { 'url': 'http://www.viki.com/videos/1048879v-ankhon-dekhi', 'info_dict': { @@ -171,7 +182,7 @@ class VikiIE(VikiBaseIE): }, { # episode 'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1', - 'md5': '94e0e34fd58f169f40c184f232356cfe', + 'md5': '0a53dc252e6e690feccd756861495a8c', 'info_dict': { 'id': '44699v', 'ext': 'mp4', @@ -183,6 +194,10 @@ class VikiIE(VikiBaseIE): 'uploader': 'group8', 'like_count': int, 'age_limit': 13, + 'episode_number': 1, + }, + 'params': { + 'format': 'bestvideo', }, 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], }, { @@ -209,7 +224,7 @@ class VikiIE(VikiBaseIE): }, { # non-English description 'url': 'http://www.viki.com/videos/158036v-love-in-magic', - 'md5': 'adf9e321a0ae5d0aace349efaaff7691', + 'md5': '41faaba0de90483fb4848952af7c7d0d', 'info_dict': { 'id': '158036v', 'ext': 'mp4', @@ -220,6 +235,10 @@ class VikiIE(VikiBaseIE): 'title': 'Love In Magic', 'age_limit': 13, }, + 'params': { + 'format': 'bestvideo', + }, + 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], }] def _real_extract(self, url): @@ -229,36 +248,33 @@ class VikiIE(VikiBaseIE): 'https://www.viki.com/api/videos/' + video_id, video_id, 'Downloading video JSON', headers={ 'x-client-user-agent': std_headers['User-Agent'], - 'x-viki-app-ver': '4.0.57', + 'x-viki-app-ver': '3.0.0', }) video = resp['video'] self._check_errors(video) title = self.dict_selection(video.get('titles', {}), 'en', allow_fallback=False) + episode_number = int_or_none(video.get('number')) if not title: - title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id - container_titles = video.get('container', {}).get('titles', {}) + title = 'Episode %d' % episode_number if video.get('type') == 'episode' else video.get('id') or video_id + container_titles = try_get(video, lambda x: x['container']['titles'], dict) or {} container_title = self.dict_selection(container_titles, 'en') title = '%s - %s' % (container_title, title) description = self.dict_selection(video.get('descriptions', {}), 'en') - duration = int_or_none(video.get('duration')) - timestamp = parse_iso8601(video.get('created_at')) - uploader = video.get('author') - like_count = int_or_none(video.get('likes', {}).get('count')) - age_limit = parse_age_limit(video.get('rating')) + like_count = int_or_none(try_get(video, lambda x: x['likes']['count'])) thumbnails = [] - for thumbnail_id, thumbnail in video.get('images', {}).items(): + for thumbnail_id, thumbnail in (video.get('images') or {}).items(): thumbnails.append({ 'id': thumbnail_id, 'url': thumbnail.get('url'), }) subtitles = {} - for subtitle_lang, _ in video.get('subtitle_completions', {}).items(): + for subtitle_lang, _ in (video.get('subtitle_completions') or {}).items(): subtitles[subtitle_lang] = [{ 'ext': subtitles_format, 'url': self._prepare_call( @@ -269,13 +285,15 @@ class VikiIE(VikiBaseIE): 'id': video_id, 'title': title, 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'uploader': uploader, + 'duration': int_or_none(video.get('duration')), + 'timestamp': parse_iso8601(video.get('created_at')), + 'uploader': video.get('author'), + 'uploader_url': video.get('author_url'), 'like_count': like_count, - 'age_limit': age_limit, + 'age_limit': parse_age_limit(video.get('rating')), 'thumbnails': thumbnails, 'subtitles': subtitles, + 'episode_number': episode_number, } formats = [] @@ -360,7 +378,7 @@ class VikiChannelIE(VikiBaseIE): 'info_dict': { 'id': '50c', 'title': 'Boys Over Flowers', - 'description': 'md5:ecd3cff47967fe193cff37c0bec52790', + 'description': 'md5:804ce6e7837e1fd527ad2f25420f4d59', }, 'playlist_mincount': 71, }, { @@ -371,6 +389,7 @@ class VikiChannelIE(VikiBaseIE): 'description': 'md5:05bf5471385aa8b21c18ad450e350525', }, 'playlist_count': 127, + 'skip': 'Page not found', }, { 'url': 'http://www.viki.com/news/24569c-showbiz-korea', 'only_matching': True, diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 15cd06268..0b386f450 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import base64 import functools -import json import re import itertools @@ -17,14 +16,14 @@ from ..compat import ( from ..utils import ( clean_html, determine_ext, - dict_get, ExtractorError, + get_element_by_class, js_to_json, int_or_none, merge_dicts, OnDemandPagedList, parse_filesize, - RegexNotFoundError, + parse_iso8601, sanitized_Request, smuggle_url, std_headers, @@ -74,25 +73,28 @@ class VimeoBaseInfoExtractor(InfoExtractor): expected=True) raise ExtractorError('Unable to log in') - def _verify_video_password(self, url, video_id, webpage): + def _get_video_password(self): password = self._downloader.params.get('videopassword') if password is None: - raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) - token, vuid = self._extract_xsrft_and_vuid(webpage) - data = urlencode_postdata({ - 'password': password, - 'token': token, - }) + raise ExtractorError( + 'This video is protected by a password, use the --video-password option', + expected=True) + return password + + def _verify_video_password(self, url, video_id, password, token, vuid): if url.startswith('http://'): # vimeo only supports https now, but the user can give an http url url = url.replace('http://', 'https://') - password_request = sanitized_Request(url + '/password', data) - password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - password_request.add_header('Referer', url) self._set_vimeo_cookie('vuid', vuid) return self._download_webpage( - password_request, video_id, - 'Verifying the password', 'Wrong password') + url + '/password', video_id, 'Verifying the password', + 'Wrong password', data=urlencode_postdata({ + 'password': password, + 'token': token, + }), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': url, + }) def _extract_xsrft_and_vuid(self, webpage): xsrft = self._search_regex( @@ -123,10 +125,11 @@ class VimeoBaseInfoExtractor(InfoExtractor): video_title = video_data['title'] live_event = video_data.get('live_event') or {} is_live = live_event.get('status') == 'started' + request = config.get('request') or {} formats = [] - config_files = video_data.get('files') or config['request'].get('files', {}) - for f in config_files.get('progressive', []): + config_files = video_data.get('files') or request.get('files') or {} + for f in (config_files.get('progressive') or []): video_url = f.get('url') if not video_url: continue @@ -142,7 +145,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): # TODO: fix handling of 308 status code returned for live archive manifest requests sep_pattern = r'/sep/video/' for files_type in ('hls', 'dash'): - for cdn_name, cdn_data in config_files.get(files_type, {}).get('cdns', {}).items(): + for cdn_name, cdn_data in (try_get(config_files, lambda x: x[files_type]['cdns']) or {}).items(): manifest_url = cdn_data.get('url') if not manifest_url: continue @@ -188,17 +191,15 @@ class VimeoBaseInfoExtractor(InfoExtractor): f['preference'] = -40 subtitles = {} - text_tracks = config['request'].get('text_tracks') - if text_tracks: - for tt in text_tracks: - subtitles[tt['lang']] = [{ - 'ext': 'vtt', - 'url': urljoin('https://vimeo.com', tt['url']), - }] + for tt in (request.get('text_tracks') or []): + subtitles[tt['lang']] = [{ + 'ext': 'vtt', + 'url': urljoin('https://vimeo.com', tt['url']), + }] thumbnails = [] if not is_live: - for key, thumb in video_data.get('thumbs', {}).items(): + for key, thumb in (video_data.get('thumbs') or {}).items(): thumbnails.append({ 'id': key, 'width': int_or_none(key), @@ -226,10 +227,12 @@ class VimeoBaseInfoExtractor(InfoExtractor): 'is_live': is_live, } - def _extract_original_format(self, url, video_id): + def _extract_original_format(self, url, video_id, unlisted_hash=None): + query = {'action': 'load_download_config'} + if unlisted_hash: + query['unlisted_hash'] = unlisted_hash download_data = self._download_json( - url, video_id, fatal=False, - query={'action': 'load_download_config'}, + url, video_id, fatal=False, query=query, headers={'X-Requested-With': 'XMLHttpRequest'}) if download_data: source_file = download_data.get('source_file') @@ -276,7 +279,7 @@ class VimeoIE(VimeoBaseInfoExtractor): )? (?:videos?/)? (?P[0-9]+) - (?:/[\da-f]+)? + (?:/(?P[\da-f]{10}))? /?(?:[?&].*)?(?:[#].*)?$ ''' IE_NAME = 'vimeo' @@ -316,6 +319,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'duration': 1595, 'upload_date': '20130610', 'timestamp': 1370893156, + 'license': 'by', }, 'params': { 'format': 'best[protocol=https]', @@ -329,9 +333,9 @@ class VimeoIE(VimeoBaseInfoExtractor): 'id': '54469442', 'ext': 'mp4', 'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software 2012', - 'uploader': 'The BLN & Business of Software', - 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/theblnbusinessofsoftware', - 'uploader_id': 'theblnbusinessofsoftware', + 'uploader': 'Business of Software', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/businessofsoftware', + 'uploader_id': 'businessofsoftware', 'duration': 3610, 'description': None, }, @@ -394,6 +398,12 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader_id': 'staff', 'uploader': 'Vimeo Staff', 'duration': 62, + 'subtitles': { + 'de': [{'ext': 'vtt'}], + 'en': [{'ext': 'vtt'}], + 'es': [{'ext': 'vtt'}], + 'fr': [{'ext': 'vtt'}], + }, } }, { @@ -466,6 +476,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'skip_download': True, }, 'expected_warnings': ['Unable to download JSON metadata'], + 'skip': 'this page is no longer available.', }, { 'url': 'http://player.vimeo.com/video/68375962', @@ -509,6 +520,11 @@ class VimeoIE(VimeoBaseInfoExtractor): { 'url': 'https://vimeo.com/160743502/abd0e13fb4', 'only_matching': True, + }, + { + # requires passing unlisted_hash(a52724358e) to load_download_config request + 'url': 'https://vimeo.com/392479337/a52724358e', + 'only_matching': True, } # https://gettingthingsdone.com/workflowmap/ # vimeo embed with check-password page protected by Referer header @@ -543,9 +559,7 @@ class VimeoIE(VimeoBaseInfoExtractor): return urls[0] if urls else None def _verify_player_video_password(self, url, video_id, headers): - password = self._downloader.params.get('videopassword') - if password is None: - raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) + password = self._get_video_password() data = urlencode_postdata({ 'password': base64.b64encode(password.encode()), }) @@ -562,6 +576,37 @@ class VimeoIE(VimeoBaseInfoExtractor): def _real_initialize(self): self._login() + def _extract_from_api(self, video_id, unlisted_hash=None): + token = self._download_json( + 'https://vimeo.com/_rv/jwt', video_id, headers={ + 'X-Requested-With': 'XMLHttpRequest' + })['token'] + api_url = 'https://api.vimeo.com/videos/' + video_id + if unlisted_hash: + api_url += ':' + unlisted_hash + video = self._download_json( + api_url, video_id, headers={ + 'Authorization': 'jwt ' + token, + }, query={ + 'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays', + }) + info = self._parse_config(self._download_json( + video['config_url'], video_id), video_id) + self._vimeo_sort_formats(info['formats']) + get_timestamp = lambda x: parse_iso8601(video.get(x + '_time')) + info.update({ + 'description': video.get('description'), + 'license': video.get('license'), + 'release_timestamp': get_timestamp('release'), + 'timestamp': get_timestamp('created'), + 'view_count': int_or_none(try_get(video, lambda x: x['stats']['plays'])), + }) + connections = try_get( + video, lambda x: x['metadata']['connections'], dict) or {} + for k in ('comment', 'like'): + info[k + '_count'] = int_or_none(try_get(connections, lambda x: x[k + 's']['total'])) + return info + def _real_extract(self, url): url, data = unsmuggle_url(url, {}) headers = std_headers.copy() @@ -570,22 +615,19 @@ class VimeoIE(VimeoBaseInfoExtractor): if 'Referer' not in headers: headers['Referer'] = url - channel_id = self._search_regex( - r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None) + mobj = re.match(self._VALID_URL, url).groupdict() + video_id, unlisted_hash = mobj['id'], mobj.get('unlisted_hash') + if unlisted_hash: + return self._extract_from_api(video_id, unlisted_hash) - # Extract ID from URL - video_id = self._match_id(url) orig_url = url is_pro = 'vimeopro.com/' in url - is_player = '://player.vimeo.com/video/' in url if is_pro: # some videos require portfolio_id to be present in player url # https://github.com/ytdl-org/youtube-dl/issues/20070 url = self._extract_url(url, self._download_webpage(url, video_id)) if not url: url = 'https://vimeo.com/' + video_id - elif is_player: - url = 'https://player.vimeo.com/video/' + video_id elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')): url = 'https://vimeo.com/' + video_id @@ -605,14 +647,25 @@ class VimeoIE(VimeoBaseInfoExtractor): expected=True) raise - # Now we begin extracting as much information as we can from what we - # retrieved. First we extract the information common to all extractors, - # and latter we extract those that are Vimeo specific. - self.report_extraction(video_id) + if '//player.vimeo.com/video/' in url: + config = self._parse_json(self._search_regex( + r'\bconfig\s*=\s*({.+?})\s*;', webpage, 'info section'), video_id) + if config.get('view') == 4: + config = self._verify_player_video_password( + redirect_url, video_id, headers) + info = self._parse_config(config, video_id) + self._vimeo_sort_formats(info['formats']) + return info + + if re.search(r']+?id="pw_form"', webpage): + video_password = self._get_video_password() + token, vuid = self._extract_xsrft_and_vuid(webpage) + webpage = self._verify_video_password( + redirect_url, video_id, video_password, token, vuid) vimeo_config = self._extract_vimeo_config(webpage, video_id, default=None) if vimeo_config: - seed_status = vimeo_config.get('seed_status', {}) + seed_status = vimeo_config.get('seed_status') or {} if seed_status.get('state') == 'failed': raise ExtractorError( '%s said: %s' % (self.IE_NAME, seed_status['title']), @@ -621,66 +674,40 @@ class VimeoIE(VimeoBaseInfoExtractor): cc_license = None timestamp = None video_description = None + info_dict = {} - # Extract the config JSON - try: - try: - config_url = self._html_search_regex( - r' data-config-url="(.+?)"', webpage, - 'config URL', default=None) - if not config_url: - # Sometimes new react-based page is served instead of old one that require - # different config URL extraction approach (see - # https://github.com/ytdl-org/youtube-dl/pull/7209) - page_config = self._parse_json(self._search_regex( - r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});', - webpage, 'page config'), video_id) - config_url = page_config['player']['config_url'] - cc_license = page_config.get('cc_license') - timestamp = try_get( - page_config, lambda x: x['clip']['uploaded_on'], - compat_str) - video_description = clean_html(dict_get( - page_config, ('description', 'description_html_escaped'))) - config = self._download_json(config_url, video_id) - except RegexNotFoundError: - # For pro videos or player.vimeo.com urls - # We try to find out to which variable is assigned the config dic - m_variable_name = re.search(r'(\w)\.video\.id', webpage) - if m_variable_name is not None: - config_re = [r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1))] - else: - config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});'] - config_re.append(r'\bvar\s+r\s*=\s*({.+?})\s*;') - config_re.append(r'\bconfig\s*=\s*({.+?})\s*;') - config = self._search_regex(config_re, webpage, 'info section', - flags=re.DOTALL) - config = json.loads(config) - except Exception as e: - if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage): - raise ExtractorError('The author has restricted the access to this video, try with the "--referer" option') - - if re.search(r']+?id="pw_form"', webpage) is not None: - if '_video_password_verified' in data: - raise ExtractorError('video password verification failed!') - self._verify_video_password(redirect_url, video_id, webpage) - return self._real_extract( - smuggle_url(redirect_url, {'_video_password_verified': 'verified'})) - else: - raise ExtractorError('Unable to extract info section', - cause=e) + channel_id = self._search_regex( + r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None) + if channel_id: + config_url = self._html_search_regex( + r'\bdata-config-url="([^"]+)"', webpage, 'config URL') + video_description = clean_html(get_element_by_class('description', webpage)) + info_dict.update({ + 'channel_id': channel_id, + 'channel_url': 'https://vimeo.com/channels/' + channel_id, + }) else: - if config.get('view') == 4: - config = self._verify_player_video_password(redirect_url, video_id, headers) - - vod = config.get('video', {}).get('vod', {}) + page_config = self._parse_json(self._search_regex( + r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});', + webpage, 'page config', default='{}'), video_id, fatal=False) + if not page_config: + return self._extract_from_api(video_id) + config_url = page_config['player']['config_url'] + cc_license = page_config.get('cc_license') + clip = page_config.get('clip') or {} + timestamp = clip.get('uploaded_on') + video_description = clean_html( + clip.get('description') or page_config.get('description_html_escaped')) + config = self._download_json(config_url, video_id) + video = config.get('video') or {} + vod = video.get('vod') or {} def is_rented(): if '>You rented this title.<' in webpage: return True - if config.get('user', {}).get('purchased'): + if try_get(config, lambda x: x['user']['purchased']): return True - for purchase_option in vod.get('purchase_options', []): + for purchase_option in (vod.get('purchase_options') or []): if purchase_option.get('purchased'): return True label = purchase_option.get('label_string') @@ -695,14 +722,10 @@ class VimeoIE(VimeoBaseInfoExtractor): 'https://player.vimeo.com/player/%s' % feature_id, {'force_feature_id': True}), 'Vimeo') - # Extract video description - if not video_description: - video_description = self._html_search_regex( - r'(?s)]*>(.*?)', - webpage, 'description', default=None) if not video_description: video_description = self._html_search_meta( - 'description', webpage, default=None) + ['description', 'og:description', 'twitter:description'], + webpage, default=None) if not video_description and is_pro: orig_webpage = self._download_webpage( orig_url, video_id, @@ -711,29 +734,18 @@ class VimeoIE(VimeoBaseInfoExtractor): if orig_webpage: video_description = self._html_search_meta( 'description', orig_webpage, default=None) - if not video_description and not is_player: + if not video_description: self._downloader.report_warning('Cannot find video description') - # Extract upload date if not timestamp: timestamp = self._search_regex( r']+datetime="([^"]+)"', webpage, 'timestamp', default=None) - try: - view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count')) - like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, 'like count')) - comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, 'comment count')) - except RegexNotFoundError: - # This info is only available in vimeo.com/{id} urls - view_count = None - like_count = None - comment_count = None - formats = [] source_format = self._extract_original_format( - 'https://vimeo.com/' + video_id, video_id) + 'https://vimeo.com/' + video_id, video_id, video.get('unlisted_hash')) if source_format: formats.append(source_format) @@ -748,29 +760,20 @@ class VimeoIE(VimeoBaseInfoExtractor): r']+rel=["\']license["\'][^>]+href=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'license', default=None, group='license') - channel_url = 'https://vimeo.com/channels/%s' % channel_id if channel_id else None - - info_dict = { + info_dict.update({ 'formats': formats, 'timestamp': unified_timestamp(timestamp), 'description': video_description, 'webpage_url': url, - 'view_count': view_count, - 'like_count': like_count, - 'comment_count': comment_count, 'license': cc_license, - 'channel_id': channel_id, - 'channel_url': channel_url, - } + }) - info_dict = merge_dicts(info_dict, info_dict_config, json_ld) - - return info_dict + return merge_dicts(info_dict, info_dict_config, json_ld) class VimeoOndemandIE(VimeoIE): IE_NAME = 'vimeo:ondemand' - _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/([^/]+/)?(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/(?:[^/]+/)?(?P[^/?#&]+)' _TESTS = [{ # ondemand video not available via https://vimeo.com/id 'url': 'https://vimeo.com/ondemand/20704', @@ -931,11 +934,15 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor): } if hashed_pass: query['_hashed_pass'] = hashed_pass - videos = self._download_json( - 'https://api.vimeo.com/albums/%s/videos' % album_id, - album_id, 'Downloading page %d' % api_page, query=query, headers={ - 'Authorization': 'jwt ' + authorization, - })['data'] + try: + videos = self._download_json( + 'https://api.vimeo.com/albums/%s/videos' % album_id, + album_id, 'Downloading page %d' % api_page, query=query, headers={ + 'Authorization': 'jwt ' + authorization, + })['data'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + return for video in videos: link = video.get('link') if not link: @@ -1050,10 +1057,23 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): def _real_extract(self, url): page_url, video_id = re.match(self._VALID_URL, url).groups() - clip_data = self._download_json( - page_url.replace('/review/', '/review/data/'), - video_id)['clipData'] - config_url = clip_data['configUrl'] + data = self._download_json( + page_url.replace('/review/', '/review/data/'), video_id) + if data.get('isLocked') is True: + video_password = self._get_video_password() + viewer = self._download_json( + 'https://vimeo.com/_rv/viewer', video_id) + webpage = self._verify_video_password( + 'https://vimeo.com/' + video_id, video_id, + video_password, viewer['xsrft'], viewer['vuid']) + clip_page_config = self._parse_json(self._search_regex( + r'window\.vimeo\.clip_page_config\s*=\s*({.+?});', + webpage, 'clip page config'), video_id) + config_url = clip_page_config['player']['config_url'] + clip_data = clip_page_config.get('clip') or {} + else: + clip_data = data['clipData'] + config_url = clip_data['configUrl'] config = self._download_json(config_url, video_id) info_dict = self._parse_config(config, video_id) source_format = self._extract_original_format( diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 00ec006c4..6b3513ee0 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -300,6 +300,13 @@ class VKIE(VKBaseIE): 'only_matching': True, }] + @staticmethod + def _extract_sibnet_urls(webpage): + # https://help.sibnet.ru/?sibnet_video_embed + return [unescapeHTML(mobj.group('url')) for mobj in re.finditer( + r']+\bsrc=(["\'])(?P(?:https?:)?//video\.sibnet\.ru/shell\.php\?.*?\bvideoid=\d+.*?)\1', + webpage)] + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') @@ -408,6 +415,10 @@ class VKIE(VKBaseIE): if odnoklassniki_url: return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key()) + sibnet_urls = self._extract_sibnet_urls(info_page) + if sibnet_urls: + return self.url_result(sibnet_urls[0]) + m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page) if m_opts: m_opts_url = re.search(r"url\s*:\s*'((?!/\b)[^']+)", m_opts.group(1)) diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index 6224e6200..42da34d44 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -106,7 +106,7 @@ class VLiveIE(VLiveBaseIE): raise ExtractorError('Unable to log in', expected=True) def _call_api(self, path_template, video_id, fields=None): - query = {'appId': self._APP_ID, 'gcc': 'KR'} + query = {'appId': self._APP_ID, 'gcc': 'KR', 'platformType': 'PC'} if fields: query['fields'] = fields try: @@ -116,7 +116,7 @@ class VLiveIE(VLiveBaseIE): headers={'Referer': 'https://www.vlive.tv/'}, query=query) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - self.raise_login_required(json.loads(e.cause.read().decode())['message']) + self.raise_login_required(json.loads(e.cause.read().decode('utf-8'))['message']) raise def _real_extract(self, url): diff --git a/youtube_dl/extractor/voxmedia.py b/youtube_dl/extractor/voxmedia.py index b318e15d4..661208125 100644 --- a/youtube_dl/extractor/voxmedia.py +++ b/youtube_dl/extractor/voxmedia.py @@ -7,6 +7,8 @@ from ..compat import compat_urllib_parse_unquote from ..utils import ( ExtractorError, int_or_none, + try_get, + unified_timestamp, ) @@ -19,14 +21,17 @@ class VoxMediaVolumeIE(OnceIE): setup = self._parse_json(self._search_regex( r'setup\s*=\s*({.+});', webpage, 'setup'), video_id) - video_data = setup.get('video') or {} + player_setup = setup.get('player_setup') or setup + video_data = player_setup.get('video') or {} + formatted_metadata = video_data.get('formatted_metadata') or {} info = { 'id': video_id, - 'title': video_data.get('title_short'), + 'title': player_setup.get('title') or video_data.get('title_short'), 'description': video_data.get('description_long') or video_data.get('description_short'), - 'thumbnail': video_data.get('brightcove_thumbnail') + 'thumbnail': formatted_metadata.get('thumbnail') or video_data.get('brightcove_thumbnail'), + 'timestamp': unified_timestamp(formatted_metadata.get('video_publish_date')), } - asset = setup.get('asset') or setup.get('params') or {} + asset = try_get(setup, lambda x: x['embed_assets']['chorus'], dict) or {} formats = [] hls_url = asset.get('hls_url') @@ -47,6 +52,7 @@ class VoxMediaVolumeIE(OnceIE): if formats: self._sort_formats(formats) info['formats'] = formats + info['duration'] = int_or_none(asset.get('duration')) return info for provider_video_type in ('ooyala', 'youtube', 'brightcove'): @@ -84,7 +90,7 @@ class VoxMediaIE(InfoExtractor): }, { # Volume embed, Youtube 'url': 'http://www.theverge.com/2014/10/21/7025853/google-nexus-6-hands-on-photos-video-android-phablet', - 'md5': '4c8f4a0937752b437c3ebc0ed24802b5', + 'md5': 'fd19aa0cf3a0eea515d4fd5c8c0e9d68', 'info_dict': { 'id': 'Gy8Md3Eky38', 'ext': 'mp4', @@ -93,6 +99,7 @@ class VoxMediaIE(InfoExtractor): 'uploader_id': 'TheVerge', 'upload_date': '20141021', 'uploader': 'The Verge', + 'timestamp': 1413907200, }, 'add_ie': ['Youtube'], 'skip': 'similar to the previous test', @@ -100,13 +107,13 @@ class VoxMediaIE(InfoExtractor): # Volume embed, Youtube 'url': 'http://www.vox.com/2016/3/31/11336640/mississippi-lgbt-religious-freedom-bill', 'info_dict': { - 'id': 'YCjDnX-Xzhg', + 'id': '22986359b', 'ext': 'mp4', 'title': "Mississippi's laws are so bad that its anti-LGBTQ law isn't needed to allow discrimination", 'description': 'md5:fc1317922057de31cd74bce91eb1c66c', - 'uploader_id': 'voxdotcom', 'upload_date': '20150915', - 'uploader': 'Vox', + 'timestamp': 1442332800, + 'duration': 285, }, 'add_ie': ['Youtube'], 'skip': 'similar to the previous test', @@ -160,6 +167,9 @@ class VoxMediaIE(InfoExtractor): 'ext': 'mp4', 'title': 'Post-Post-PC CEO: The Full Code Conference Video of Microsoft\'s Satya Nadella', 'description': 'The longtime veteran was chosen earlier this year as the software giant\'s third leader in its history.', + 'timestamp': 1402938000, + 'upload_date': '20140616', + 'duration': 4114, }, 'add_ie': ['VoxMediaVolume'], }] diff --git a/youtube_dl/extractor/vtm.py b/youtube_dl/extractor/vtm.py new file mode 100644 index 000000000..093f1aa69 --- /dev/null +++ b/youtube_dl/extractor/vtm.py @@ -0,0 +1,62 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, + try_get, +) + + +class VTMIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vtm\.be/([^/?&#]+)~v(?P[0-9a-f]{8}(?:-[0-9a-f]{4}){3}-[0-9a-f]{12})' + _TEST = { + 'url': 'https://vtm.be/gast-vernielt-genkse-hotelkamer~ve7534523-279f-4b4d-a5c9-a33ffdbe23e1', + 'md5': '37dca85fbc3a33f2de28ceb834b071f8', + 'info_dict': { + 'id': '192445', + 'ext': 'mp4', + 'title': 'Gast vernielt Genkse hotelkamer', + 'timestamp': 1611060180, + 'upload_date': '20210119', + 'duration': 74, + # TODO: fix url _type result processing + # 'series': 'Op Interventie', + } + } + + def _real_extract(self, url): + uuid = self._match_id(url) + video = self._download_json( + 'https://omc4vm23offuhaxx6hekxtzspi.appsync-api.eu-west-1.amazonaws.com/graphql', + uuid, query={ + 'query': '''{ + getComponent(type: Video, uuid: "%s") { + ... on Video { + description + duration + myChannelsVideo + program { + title + } + publishedAt + title + } + } +}''' % uuid, + }, headers={ + 'x-api-key': 'da2-lz2cab4tfnah3mve6wiye4n77e', + })['data']['getComponent'] + + return { + '_type': 'url', + 'id': uuid, + 'title': video.get('title'), + 'url': 'http://mychannels.video/embed/%d' % video['myChannelsVideo'], + 'description': video.get('description'), + 'timestamp': parse_iso8601(video.get('publishedAt')), + 'duration': int_or_none(video.get('duration')), + 'series': try_get(video, lambda x: x['program']['title']), + 'ie_key': 'Medialaan', + } diff --git a/youtube_dl/extractor/vvvvid.py b/youtube_dl/extractor/vvvvid.py index f4cae7fe9..bc196f8a0 100644 --- a/youtube_dl/extractor/vvvvid.py +++ b/youtube_dl/extractor/vvvvid.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from .youtube import YoutubeIE from ..utils import ( ExtractorError, int_or_none, @@ -47,6 +48,22 @@ class VVVVIDIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # video_type == 'video/youtube' + 'url': 'https://www.vvvvid.it/show/404/one-punch-man/406/486683/trailer', + 'md5': '33e0edfba720ad73a8782157fdebc648', + 'info_dict': { + 'id': 'RzmFKUDOUgw', + 'ext': 'mp4', + 'title': 'Trailer', + 'upload_date': '20150906', + 'description': 'md5:a5e802558d35247fee285875328c0b80', + 'uploader_id': 'BandaiVisual', + 'uploader': 'BANDAI NAMCO Arts Channel', + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'https://www.vvvvid.it/show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048', 'only_matching': True @@ -58,12 +75,15 @@ class VVVVIDIE(InfoExtractor): 'https://www.vvvvid.it/user/login', None, headers=self.geo_verification_headers())['data']['conn_id'] - def _download_info(self, show_id, path, video_id, fatal=True): + def _download_info(self, show_id, path, video_id, fatal=True, query=None): + q = { + 'conn_id': self._conn_id, + } + if query: + q.update(query) response = self._download_json( 'https://www.vvvvid.it/vvvvid/ondemand/%s/%s' % (show_id, path), - video_id, headers=self.geo_verification_headers(), query={ - 'conn_id': self._conn_id, - }, fatal=fatal) + video_id, headers=self.geo_verification_headers(), query=q, fatal=fatal) if not (response or fatal): return if response.get('result') == 'error': @@ -81,7 +101,8 @@ class VVVVIDIE(InfoExtractor): show_id, season_id, video_id = re.match(self._VALID_URL, url).groups() response = self._download_info( - show_id, 'season/%s' % season_id, video_id) + show_id, 'season/%s' % season_id, + video_id, query={'video_id': video_id}) vid = int(video_id) video_data = list(filter( @@ -154,37 +175,50 @@ class VVVVIDIE(InfoExtractor): if season_number: info['season_number'] = int(season_number) - for quality in ('_sd', ''): + video_type = video_data.get('video_type') + is_youtube = False + for quality in ('', '_sd'): embed_code = video_data.get('embed_info' + quality) if not embed_code: continue embed_code = ds(embed_code) - video_type = video_data.get('video_type') - if video_type in ('video/rcs', 'video/kenc'): - if video_type == 'video/kenc': - kenc = self._download_json( - 'https://www.vvvvid.it/kenc', video_id, query={ - 'action': 'kt', - 'conn_id': self._conn_id, - 'url': embed_code, - }, fatal=False) or {} - kenc_message = kenc.get('message') - if kenc_message: - embed_code += '?' + ds(kenc_message) + if video_type == 'video/kenc': + embed_code = re.sub(r'https?(://[^/]+)/z/', r'https\1/i/', embed_code).replace('/manifest.f4m', '/master.m3u8') + kenc = self._download_json( + 'https://www.vvvvid.it/kenc', video_id, query={ + 'action': 'kt', + 'conn_id': self._conn_id, + 'url': embed_code, + }, fatal=False) or {} + kenc_message = kenc.get('message') + if kenc_message: + embed_code += '?' + ds(kenc_message) + formats.extend(self._extract_m3u8_formats( + embed_code, video_id, 'mp4', m3u8_id='hls', fatal=False)) + elif video_type == 'video/rcs': formats.extend(self._extract_akamai_formats(embed_code, video_id)) + elif video_type == 'video/youtube': + info.update({ + '_type': 'url_transparent', + 'ie_key': YoutubeIE.ie_key(), + 'url': embed_code, + }) + is_youtube = True + break else: formats.extend(self._extract_wowza_formats( 'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id)) metadata_from_url(embed_code) - self._sort_formats(formats) + if not is_youtube: + self._sort_formats(formats) + info['formats'] = formats metadata_from_url(video_data.get('thumbnail')) info.update(self._extract_common_video_info(video_data)) info.update({ 'id': video_id, 'title': title, - 'formats': formats, 'duration': int_or_none(video_data.get('length')), 'series': video_data.get('show_title'), 'season_id': season_id, @@ -220,9 +254,13 @@ class VVVVIDShowIE(VVVVIDIE): show_info = self._download_info( show_id, 'info/', show_title, fatal=False) + if not show_title: + base_url += "/title" + entries = [] for season in (seasons or []): episodes = season.get('episodes') or [] + playlist_title = season.get('name') or show_info.get('title') for episode in episodes: if episode.get('playable') is False: continue @@ -232,12 +270,13 @@ class VVVVIDShowIE(VVVVIDIE): continue info = self._extract_common_video_info(episode) info.update({ - '_type': 'url', + '_type': 'url_transparent', 'ie_key': VVVVIDIE.ie_key(), 'url': '/'.join([base_url, season_id, video_id]), 'title': episode.get('title'), 'description': episode.get('description'), 'season_id': season_id, + 'playlist_title': playlist_title, }) entries.append(info) diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 8ef3e0906..f1bccc2d6 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -1,15 +1,13 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( ExtractorError, - unified_strdate, - HEADRequest, int_or_none, + try_get, + unified_strdate, ) @@ -32,6 +30,7 @@ class WatIE(InfoExtractor): 'skip_download': True, }, 'expected_warnings': ['HTTP Error 404'], + 'skip': 'This content is no longer available', }, { 'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html', @@ -43,17 +42,10 @@ class WatIE(InfoExtractor): 'upload_date': '20140816', }, 'expected_warnings': ["Ce contenu n'est pas disponible pour l'instant."], + 'skip': 'This content is no longer available', }, ] - - _FORMATS = ( - (200, 416, 234), - (400, 480, 270), - (600, 640, 360), - (1200, 640, 360), - (1800, 960, 540), - (2500, 1280, 720), - ) + _GEO_BYPASS = False def _real_extract(self, url): video_id = self._match_id(url) @@ -61,97 +53,54 @@ class WatIE(InfoExtractor): # 'contentv4' is used in the website, but it also returns the related # videos, we don't need them + # video_data = self._download_json( + # 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id) video_data = self._download_json( - 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id) + 'https://mediainfo.tf1.fr/mediainfocombo/' + video_id, + video_id, query={'context': 'MYTF1'}) video_info = video_data['media'] error_desc = video_info.get('error_desc') if error_desc: - self.report_warning( - '%s returned error: %s' % (self.IE_NAME, error_desc)) + if video_info.get('error_code') == 'GEOBLOCKED': + self.raise_geo_restricted(error_desc, video_info.get('geoList')) + raise ExtractorError(error_desc, expected=True) - chapters = video_info['chapters'] - if chapters: - first_chapter = chapters[0] - - def video_id_for_chapter(chapter): - return chapter['tc_start'].split('-')[0] - - if video_id_for_chapter(first_chapter) != video_id: - self.to_screen('Multipart video detected') - entries = [self.url_result('wat:%s' % video_id_for_chapter(chapter)) for chapter in chapters] - return self.playlist_result(entries, video_id, video_info['title']) - # Otherwise we can continue and extract just one part, we have to use - # the video id for getting the video url - else: - first_chapter = video_info - - title = first_chapter['title'] - - def extract_url(path_template, url_type): - req_url = 'http://www.wat.tv/get/%s' % (path_template % video_id) - head = self._request_webpage(HEADRequest(req_url), video_id, 'Extracting %s url' % url_type, fatal=False) - if head: - red_url = head.geturl() - if req_url != red_url: - return red_url - return None - - def remove_bitrate_limit(manifest_url): - return re.sub(r'(?:max|min)_bitrate=\d+&?', '', manifest_url) + title = video_info['title'] formats = [] - try: - alt_urls = lambda manifest_url: [re.sub(r'(?:wdv|ssm)?\.ism/', repl + '.ism/', manifest_url) for repl in ('', 'ssm')] - manifest_urls = self._download_json( - 'http://www.wat.tv/get/webhtml/' + video_id, video_id) - m3u8_url = manifest_urls.get('hls') - if m3u8_url: - m3u8_url = remove_bitrate_limit(m3u8_url) - for m3u8_alt_url in alt_urls(m3u8_url): - formats.extend(self._extract_m3u8_formats( - m3u8_alt_url, video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - formats.extend(self._extract_f4m_formats( - m3u8_alt_url.replace('ios', 'web').replace('.m3u8', '.f4m'), - video_id, f4m_id='hds', fatal=False)) - mpd_url = manifest_urls.get('mpd') - if mpd_url: - mpd_url = remove_bitrate_limit(mpd_url) - for mpd_alt_url in alt_urls(mpd_url): - formats.extend(self._extract_mpd_formats( - mpd_alt_url, video_id, mpd_id='dash', fatal=False)) - self._sort_formats(formats) - except ExtractorError: - abr = 64 - for vbr, width, height in self._FORMATS: - tbr = vbr + abr - format_id = 'http-%s' % tbr - fmt_url = 'http://dnl.adv.tf1.fr/2/USP-0x0/%s/%s/%s/ssm/%s-%s-64k.mp4' % (video_id[-4:-2], video_id[-2:], video_id, video_id, vbr) - if self._is_valid_url(fmt_url, video_id, format_id): - formats.append({ - 'format_id': format_id, - 'url': fmt_url, - 'vbr': vbr, - 'abr': abr, - 'width': width, - 'height': height, - }) - date_diffusion = first_chapter.get('date_diffusion') or video_data.get('configv4', {}).get('estatS4') - upload_date = unified_strdate(date_diffusion) if date_diffusion else None - duration = None - files = video_info['files'] - if files: - duration = int_or_none(files[0].get('duration')) + def extract_formats(manifest_urls): + for f, f_url in manifest_urls.items(): + if not f_url: + continue + if f in ('dash', 'mpd'): + formats.extend(self._extract_mpd_formats( + f_url.replace('://das-q1.tf1.fr/', '://das-q1-ssl.tf1.fr/'), + video_id, mpd_id='dash', fatal=False)) + elif f == 'hls': + formats.extend(self._extract_m3u8_formats( + f_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + + delivery = video_data.get('delivery') or {} + extract_formats({delivery.get('format'): delivery.get('url')}) + if not formats: + if delivery.get('drm'): + raise ExtractorError('This video is DRM protected.', expected=True) + manifest_urls = self._download_json( + 'http://www.wat.tv/get/webhtml/' + video_id, video_id, fatal=False) + if manifest_urls: + extract_formats(manifest_urls) + + self._sort_formats(formats) return { 'id': video_id, 'title': title, - 'thumbnail': first_chapter.get('preview'), - 'description': first_chapter.get('description'), - 'view_count': int_or_none(video_info.get('views')), - 'upload_date': upload_date, - 'duration': duration, + 'thumbnail': video_info.get('preview'), + 'upload_date': unified_strdate(try_get( + video_data, lambda x: x['mediametrie']['chapters'][0]['estatS4'])), + 'duration': int_or_none(video_info.get('duration')), 'formats': formats, } diff --git a/youtube_dl/extractor/xboxclips.py b/youtube_dl/extractor/xboxclips.py index d9c277bc3..25f487e1e 100644 --- a/youtube_dl/extractor/xboxclips.py +++ b/youtube_dl/extractor/xboxclips.py @@ -1,40 +1,55 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( int_or_none, + month_by_abbreviation, parse_filesize, - unified_strdate, ) class XboxClipsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?xboxclips\.com/(?:video\.php\?.*vid=|[^/]+/)(?P[\w-]{36})' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?(?:xboxclips\.com|gameclips\.io)/(?:video\.php\?.*vid=|[^/]+/)(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' + _TESTS = [{ 'url': 'http://xboxclips.com/video.php?uid=2533274823424419&gamertag=Iabdulelah&vid=074a69a9-5faf-46aa-b93b-9909c1720325', 'md5': 'fbe1ec805e920aeb8eced3c3e657df5d', 'info_dict': { 'id': '074a69a9-5faf-46aa-b93b-9909c1720325', 'ext': 'mp4', - 'title': 'Iabdulelah playing Titanfall', + 'title': 'iAbdulElah playing Titanfall', 'filesize_approx': 26800000, 'upload_date': '20140807', 'duration': 56, } - } + }, { + 'url': 'https://gameclips.io/iAbdulElah/074a69a9-5faf-46aa-b93b-9909c1720325', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + if '/video.php' in url: + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + url = 'https://gameclips.io/%s/%s' % (qs['gamertag'][0], qs['vid'][0]) - video_url = self._html_search_regex( - r'>(?:Link|Download): ]+href="([^"]+)"', webpage, 'video URL') - title = self._html_search_regex( - r'XboxClips \| ([^<]+)', webpage, 'title') - upload_date = unified_strdate(self._html_search_regex( - r'>Recorded: ([^<]+)<', webpage, 'upload date', fatal=False)) + webpage = self._download_webpage(url, video_id) + info = self._parse_html5_media_entries(url, webpage, video_id)[0] + + title = self._html_search_meta(['og:title', 'twitter:title'], webpage) + upload_date = None + mobj = re.search( + r'>Recorded: (\d{2})-(Jan|Feb|Mar|Apr|May|Ju[nl]|Aug|Sep|Oct|Nov|Dec)-(\d{4})', + webpage) + if mobj: + upload_date = '%s%.2d%s' % (mobj.group(3), month_by_abbreviation(mobj.group(2)), mobj.group(1)) filesize = parse_filesize(self._html_search_regex( r'>Size: ([^<]+)<', webpage, 'file size', fatal=False)) duration = int_or_none(self._html_search_regex( @@ -42,12 +57,12 @@ class XboxClipsIE(InfoExtractor): view_count = int_or_none(self._html_search_regex( r'>Views: (\d+)<', webpage, 'view count', fatal=False)) - return { + info.update({ 'id': video_id, - 'url': video_url, 'title': title, 'upload_date': upload_date, 'filesize_approx': filesize, 'duration': duration, 'view_count': view_count, - } + }) + return info diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index cbd5d1cbb..df9efa9fa 100644 --- a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -58,6 +58,7 @@ class XFileShareIE(InfoExtractor): (r'vidlocker\.xyz', 'VidLocker'), (r'vidshare\.tv', 'VidShare'), (r'vup\.to', 'VUp'), + (r'wolfstream\.tv', 'WolfStream'), (r'xvideosharing\.com', 'XVideoSharing'), ) @@ -82,6 +83,9 @@ class XFileShareIE(InfoExtractor): }, { 'url': 'https://aparat.cam/n4d6dh0wvlpr', 'only_matching': True, + }, { + 'url': 'https://wolfstream.tv/nthme29v9u2x', + 'only_matching': True, }] @staticmethod diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 76aeaf9a4..f73b9778f 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -11,11 +11,14 @@ from ..utils import ( dict_get, extract_attributes, ExtractorError, + float_or_none, int_or_none, parse_duration, + str_or_none, try_get, unified_strdate, url_or_none, + urljoin, ) @@ -146,36 +149,89 @@ class XHamsterIE(InfoExtractor): video = initials['videoModel'] title = video['title'] formats = [] - for format_id, formats_dict in video['sources'].items(): + format_urls = set() + format_sizes = {} + sources = try_get(video, lambda x: x['sources'], dict) or {} + for format_id, formats_dict in sources.items(): if not isinstance(formats_dict, dict): continue + download_sources = try_get(sources, lambda x: x['download'], dict) or {} + for quality, format_dict in download_sources.items(): + if not isinstance(format_dict, dict): + continue + format_sizes[quality] = float_or_none(format_dict.get('size')) for quality, format_item in formats_dict.items(): if format_id == 'download': # Download link takes some time to be generated, # skipping for now continue - if not isinstance(format_item, dict): - continue - format_url = format_item.get('link') - filesize = int_or_none( - format_item.get('size'), invscale=1000000) - else: - format_url = format_item - filesize = None + format_url = format_item format_url = url_or_none(format_url) - if not format_url: + if not format_url or format_url in format_urls: continue + format_urls.add(format_url) formats.append({ 'format_id': '%s-%s' % (format_id, quality), 'url': format_url, 'ext': determine_ext(format_url, 'mp4'), 'height': get_height(quality), - 'filesize': filesize, + 'filesize': format_sizes.get(quality), 'http_headers': { 'Referer': urlh.geturl(), }, }) - self._sort_formats(formats) + xplayer_sources = try_get( + initials, lambda x: x['xplayerSettings']['sources'], dict) + if xplayer_sources: + hls_sources = xplayer_sources.get('hls') + if isinstance(hls_sources, dict): + for hls_format_key in ('url', 'fallback'): + hls_url = hls_sources.get(hls_format_key) + if not hls_url: + continue + hls_url = urljoin(url, hls_url) + if not hls_url or hls_url in format_urls: + continue + format_urls.add(hls_url) + formats.extend(self._extract_m3u8_formats( + hls_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + standard_sources = xplayer_sources.get('standard') + if isinstance(standard_sources, dict): + for format_id, formats_list in standard_sources.items(): + if not isinstance(formats_list, list): + continue + for standard_format in formats_list: + if not isinstance(standard_format, dict): + continue + for standard_format_key in ('url', 'fallback'): + standard_url = standard_format.get(standard_format_key) + if not standard_url: + continue + standard_url = urljoin(url, standard_url) + if not standard_url or standard_url in format_urls: + continue + format_urls.add(standard_url) + ext = determine_ext(standard_url, 'mp4') + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + standard_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue + quality = (str_or_none(standard_format.get('quality')) + or str_or_none(standard_format.get('label')) + or '') + formats.append({ + 'format_id': '%s-%s' % (format_id, quality), + 'url': standard_url, + 'ext': ext, + 'height': get_height(quality), + 'filesize': format_sizes.get(quality), + 'http_headers': { + 'Referer': standard_url, + }, + }) + self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id')) categories_list = video.get('categories') if isinstance(categories_list, list): diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index 18969058f..7246409e3 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -11,6 +11,7 @@ from ..utils import ( parse_duration, sanitized_Request, str_to_int, + url_or_none, ) @@ -87,10 +88,10 @@ class XTubeIE(InfoExtractor): 'Cookie': 'age_verified=1; cookiesAccepted=1', }) - title, thumbnail, duration = [None] * 3 + title, thumbnail, duration, sources, media_definition = [None] * 5 config = self._parse_json(self._search_regex( - r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf)', webpage, 'config', + r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf|playerWrapper)', webpage, 'config', default='{}'), video_id, transform_source=js_to_json, fatal=False) if config: config = config.get('mainRoll') @@ -99,20 +100,52 @@ class XTubeIE(InfoExtractor): thumbnail = config.get('poster') duration = int_or_none(config.get('duration')) sources = config.get('sources') or config.get('format') + media_definition = config.get('mediaDefinition') - if not isinstance(sources, dict): + if not isinstance(sources, dict) and not media_definition: sources = self._parse_json(self._search_regex( r'(["\'])?sources\1?\s*:\s*(?P{.+?}),', webpage, 'sources', group='sources'), video_id, transform_source=js_to_json) formats = [] - for format_id, format_url in sources.items(): - formats.append({ - 'url': format_url, - 'format_id': format_id, - 'height': int_or_none(format_id), - }) + format_urls = set() + + if isinstance(sources, dict): + for format_id, format_url in sources.items(): + format_url = url_or_none(format_url) + if not format_url: + continue + if format_url in format_urls: + continue + format_urls.add(format_url) + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': int_or_none(format_id), + }) + + if isinstance(media_definition, list): + for media in media_definition: + video_url = url_or_none(media.get('videoUrl')) + if not video_url: + continue + if video_url in format_urls: + continue + format_urls.add(video_url) + format_id = media.get('format') + if format_id == 'hls': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif format_id == 'mp4': + height = int_or_none(media.get('quality')) + formats.append({ + 'url': video_url, + 'format_id': '%s-%d' % (format_id, height) if height else format_id, + 'height': height, + }) + self._remove_duplicate_formats(formats) self._sort_formats(formats) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index e4615376c..a17b10d6e 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -177,46 +177,9 @@ class YahooIE(InfoExtractor): 'only_matching': True, }] - def _real_extract(self, url): - url, country, display_id = re.match(self._VALID_URL, url).groups() - if not country: - country = 'us' - else: - country = country.split('-')[0] - api_base = 'https://%s.yahoo.com/_td/api/resource/' % country - - for i, uuid in enumerate(['url=' + url, 'ymedia-alias=' + display_id]): - content = self._download_json( - api_base + 'content;getDetailView=true;uuids=["%s"]' % uuid, - display_id, 'Downloading content JSON metadata', fatal=i == 1) - if content: - item = content['items'][0] - break - - if item.get('type') != 'video': - entries = [] - - cover = item.get('cover') or {} - if cover.get('type') == 'yvideo': - cover_url = cover.get('url') - if cover_url: - entries.append(self.url_result( - cover_url, 'Yahoo', cover.get('uuid'))) - - for e in item.get('body', []): - if e.get('type') == 'videoIframe': - iframe_url = e.get('url') - if not iframe_url: - continue - entries.append(self.url_result(iframe_url)) - - return self.playlist_result( - entries, item.get('uuid'), - item.get('title'), item.get('summary')) - - video_id = item['uuid'] + def _extract_yahoo_video(self, video_id, country): video = self._download_json( - api_base + 'VideoService.videos;view=full;video_ids=["%s"]' % video_id, + 'https://%s.yahoo.com/_td/api/resource/VideoService.videos;view=full;video_ids=["%s"]' % (country, video_id), video_id, 'Downloading video JSON metadata')[0] title = video['title'] @@ -298,7 +261,6 @@ class YahooIE(InfoExtractor): 'id': video_id, 'title': self._live_title(title) if is_live else title, 'formats': formats, - 'display_id': display_id, 'thumbnails': thumbnails, 'description': clean_html(video.get('description')), 'timestamp': parse_iso8601(video.get('publish_time')), @@ -311,6 +273,44 @@ class YahooIE(InfoExtractor): 'episode_number': int_or_none(series_info.get('episode_number')), } + def _real_extract(self, url): + url, country, display_id = re.match(self._VALID_URL, url).groups() + if not country: + country = 'us' + else: + country = country.split('-')[0] + + item = self._download_json( + 'https://%s.yahoo.com/caas/content/article' % country, display_id, + 'Downloading content JSON metadata', query={ + 'url': url + })['items'][0]['data']['partnerData'] + + if item.get('type') != 'video': + entries = [] + + cover = item.get('cover') or {} + if cover.get('type') == 'yvideo': + cover_url = cover.get('url') + if cover_url: + entries.append(self.url_result( + cover_url, 'Yahoo', cover.get('uuid'))) + + for e in (item.get('body') or []): + if e.get('type') == 'videoIframe': + iframe_url = e.get('url') + if not iframe_url: + continue + entries.append(self.url_result(iframe_url)) + + return self.playlist_result( + entries, item.get('uuid'), + item.get('title'), item.get('summary')) + + info = self._extract_yahoo_video(item['uuid'], country) + info['display_id'] = display_id + return info + class YahooSearchIE(SearchInfoExtractor): IE_DESC = 'Yahoo screen search' diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 7893f363e..84969f8e1 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -1,8 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals -import re import hashlib +import itertools +import re from .common import InfoExtractor from ..compat import compat_str @@ -209,17 +210,27 @@ class YandexMusicPlaylistBaseIE(YandexMusicBaseIE): missing_track_ids = [ track_id for track_id in track_ids if track_id not in present_track_ids] - missing_tracks = self._call_api( - 'track-entries', tld, url, item_id, - 'Downloading missing tracks JSON', { - 'entries': ','.join(missing_track_ids), - 'lang': tld, - 'external-domain': 'music.yandex.%s' % tld, - 'overembed': 'false', - 'strict': 'true', - }) - if missing_tracks: - tracks.extend(missing_tracks) + # Request missing tracks in chunks to avoid exceeding max HTTP header size, + # see https://github.com/ytdl-org/youtube-dl/issues/27355 + _TRACKS_PER_CHUNK = 250 + for chunk_num in itertools.count(0): + start = chunk_num * _TRACKS_PER_CHUNK + end = start + _TRACKS_PER_CHUNK + missing_track_ids_req = missing_track_ids[start:end] + assert missing_track_ids_req + missing_tracks = self._call_api( + 'track-entries', tld, url, item_id, + 'Downloading missing tracks JSON chunk %d' % (chunk_num + 1), { + 'entries': ','.join(missing_track_ids_req), + 'lang': tld, + 'external-domain': 'music.yandex.%s' % tld, + 'overembed': 'false', + 'strict': 'true', + }) + if missing_tracks: + tracks.extend(missing_tracks) + if end >= len(missing_track_ids): + break return tracks diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 61d1ab209..880c89687 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -154,7 +154,7 @@ class YoukuIE(InfoExtractor): # request basic data basic_data_params = { 'vid': video_id, - 'ccode': '0590', + 'ccode': '0532', 'client_ip': '192.168.1.1', 'utid': cna, 'client_ts': time.time() / 1000, diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 7b9feafeb..7084d3d12 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -4,13 +4,12 @@ import re from .common import InfoExtractor from ..utils import ( + extract_attributes, int_or_none, str_to_int, - unescapeHTML, unified_strdate, url_or_none, ) -from ..aes import aes_decrypt_text class YouPornIE(InfoExtractor): @@ -25,6 +24,7 @@ class YouPornIE(InfoExtractor): 'title': 'Sex Ed: Is It Safe To Masturbate Daily?', 'description': 'Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?', 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 210, 'uploader': 'Ask Dan And Jennifer', 'upload_date': '20101217', 'average_rating': int, @@ -33,6 +33,7 @@ class YouPornIE(InfoExtractor): 'tags': list, 'age_limit': 18, }, + 'skip': 'This video has been disabled', }, { # Unknown uploader 'url': 'http://www.youporn.com/watch/561726/big-tits-awesome-brunette-on-amazing-webcam-show/?from=related3&al=2&from_id=561726&pos=4', @@ -54,12 +55,16 @@ class YouPornIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': '404', }, { 'url': 'https://www.youporn.com/embed/505835/sex-ed-is-it-safe-to-masturbate-daily/', 'only_matching': True, }, { 'url': 'http://www.youporn.com/watch/505835', 'only_matching': True, + }, { + 'url': 'https://www.youporn.com/watch/13922959/femdom-principal/', + 'only_matching': True, }] @staticmethod @@ -73,6 +78,40 @@ class YouPornIE(InfoExtractor): video_id = mobj.group('id') display_id = mobj.group('display_id') or video_id + definitions = self._download_json( + 'https://www.youporn.com/api/video/media_definitions/%s/' % video_id, + display_id) + + formats = [] + for definition in definitions: + if not isinstance(definition, dict): + continue + video_url = url_or_none(definition.get('videoUrl')) + if not video_url: + continue + f = { + 'url': video_url, + 'filesize': int_or_none(definition.get('videoSize')), + } + height = int_or_none(definition.get('quality')) + # Video URL's path looks like this: + # /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 + # /201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 + # /videos/201703/11/109285532/1080P_4000K_109285532.mp4 + # We will benefit from it by extracting some metadata + mobj = re.search(r'(?P\d{3,4})[pP]_(?P\d+)[kK]_\d+', video_url) + if mobj: + if not height: + height = int(mobj.group('height')) + bitrate = int(mobj.group('bitrate')) + f.update({ + 'format_id': '%dp-%dk' % (height, bitrate), + 'tbr': bitrate, + }) + f['height'] = height + formats.append(f) + self._sort_formats(formats) + webpage = self._download_webpage( 'http://www.youporn.com/watch/%s' % video_id, display_id, headers={'Cookie': 'age_verified=1'}) @@ -83,64 +122,6 @@ class YouPornIE(InfoExtractor): webpage, default=None) or self._html_search_meta( 'title', webpage, fatal=True) - links = [] - - # Main source - definitions = self._parse_json( - self._search_regex( - r'mediaDefinition\s*=\s*(\[.+?\]);', webpage, - 'media definitions', default='[]'), - video_id, fatal=False) - if definitions: - for definition in definitions: - if not isinstance(definition, dict): - continue - video_url = url_or_none(definition.get('videoUrl')) - if video_url: - links.append(video_url) - - # Fallback #1, this also contains extra low quality 180p format - for _, link in re.findall(r']+href=(["\'])(http.+?)\1[^>]+title=["\']Download [Vv]ideo', webpage): - links.append(link) - - # Fallback #2 (unavailable as at 22.06.2017) - sources = self._search_regex( - r'(?s)sources\s*:\s*({.+?})', webpage, 'sources', default=None) - if sources: - for _, link in re.findall(r'[^:]+\s*:\s*(["\'])(http.+?)\1', sources): - links.append(link) - - # Fallback #3 (unavailable as at 22.06.2017) - for _, link in re.findall( - r'(?:videoSrc|videoIpadUrl|html5PlayerSrc)\s*[:=]\s*(["\'])(http.+?)\1', webpage): - links.append(link) - - # Fallback #4, encrypted links (unavailable as at 22.06.2017) - for _, encrypted_link in re.findall( - r'encryptedQuality\d{3,4}URL\s*=\s*(["\'])([\da-zA-Z+/=]+)\1', webpage): - links.append(aes_decrypt_text(encrypted_link, title, 32).decode('utf-8')) - - formats = [] - for video_url in set(unescapeHTML(link) for link in links): - f = { - 'url': video_url, - } - # Video URL's path looks like this: - # /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 - # /201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 - # We will benefit from it by extracting some metadata - mobj = re.search(r'(?P\d{3,4})[pP]_(?P\d+)[kK]_\d+/', video_url) - if mobj: - height = int(mobj.group('height')) - bitrate = int(mobj.group('bitrate')) - f.update({ - 'format_id': '%dp-%dk' % (height, bitrate), - 'height': height, - 'tbr': bitrate, - }) - formats.append(f) - self._sort_formats(formats) - description = self._html_search_regex( r'(?s)]+\bid=["\']description["\'][^>]*>(.+?)', webpage, 'description', @@ -149,6 +130,8 @@ class YouPornIE(InfoExtractor): thumbnail = self._search_regex( r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P.+?)\1', webpage, 'thumbnail', fatal=False, group='thumbnail') + duration = int_or_none(self._html_search_meta( + 'video:duration', webpage, 'duration', fatal=False)) uploader = self._html_search_regex( r'(?s)]+class=["\']submitByLink["\'][^>]*>(.+?)', @@ -161,13 +144,12 @@ class YouPornIE(InfoExtractor): age_limit = self._rta_search(webpage) - average_rating = int_or_none(self._search_regex( - r']+class=["\']videoRatingPercentage["\'][^>]*>(\d+)%', - webpage, 'average rating', fatal=False)) - - view_count = str_to_int(self._search_regex( - r'(?s)]+class=(["\']).*?\bvideoInfoViews\b.*?\1[^>]*>.*?(?P[\d,.]+)<', - webpage, 'view count', fatal=False, group='count')) + view_count = None + views = self._search_regex( + r'(]+\bclass=["\']js_videoInfoViews["\']>)', webpage, + 'views', default=None) + if views: + view_count = str_to_int(extract_attributes(views).get('data-value')) comment_count = str_to_int(self._search_regex( r'>All [Cc]omments? \(([\d,.]+)\)', webpage, 'comment count', default=None)) @@ -190,9 +172,9 @@ class YouPornIE(InfoExtractor): 'title': title, 'description': description, 'thumbnail': thumbnail, + 'duration': duration, 'uploader': uploader, 'upload_date': upload_date, - 'average_rating': average_rating, 'view_count': view_count, 'comment_count': comment_count, 'categories': categories, diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index bbd3e80d8..7cd651c67 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2,41 +2,35 @@ from __future__ import unicode_literals - import itertools import json import os.path import random import re -import time import traceback from .common import InfoExtractor, SearchInfoExtractor -from ..jsinterp import JSInterpreter -from ..swfinterp import SWFInterpreter from ..compat import ( compat_chr, compat_HTTPError, compat_parse_qs, - compat_urllib_parse_unquote, + compat_str, compat_urllib_parse_unquote_plus, compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, compat_urlparse, - compat_str, ) +from ..jsinterp import JSInterpreter from ..utils import ( - bool_or_none, - clean_html, - error_to_compat_str, ExtractorError, + clean_html, + dict_get, float_or_none, - get_element_by_id, int_or_none, mimetype2ext, parse_codecs, parse_duration, - remove_quotes, + qualities, remove_start, smuggle_url, str_or_none, @@ -46,13 +40,16 @@ from ..utils import ( unified_strdate, unsmuggle_url, update_url_query, - uppercase_escape, url_or_none, urlencode_postdata, urljoin, ) +def parse_qs(url): + return compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + + class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' @@ -68,17 +65,6 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM)' - def _set_language(self): - self._set_cookie( - '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en', - # YouTube sets the expire time to about two months - expire_time=time.time() + 2 * 30 * 24 * 3600) - - def _ids_to_results(self, ids): - return [ - self.url_result(vid_id, 'Youtube', video_id=vid_id) - for vid_id in ids] - def _login(self): """ Attempt to log in to YouTube. @@ -262,10 +248,25 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return True + def _initialize_consent(self): + cookies = self._get_cookies('https://www.youtube.com/') + if cookies.get('__Secure-3PSID'): + return + consent_id = None + consent = cookies.get('CONSENT') + if consent: + if 'YES' in consent.value: + return + consent_id = self._search_regex( + r'PENDING\+(\d+)', consent.value, 'consent', default=None) + if not consent_id: + consent_id = random.randint(100, 999) + self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id) + def _real_initialize(self): + self._initialize_consent() if self._downloader is None: return - self._set_language() if not self._login(): return @@ -282,19 +283,17 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;' _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta| you can pass the naked ID - (?P[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID - (?!.*?\blist= - (?: - %(playlist_id)s| # combined list/video URLs are handled by the playlist IE - WL # WL are handled by the watch later IE - ) - ) + (?P[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID (?(1).+)? # if we found the ID, everything can follow - $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} - _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' + $""" % { + 'invidious': '|'.join(_INVIDIOUS_SITES), + } _PLAYER_INFO_RE = ( - r'/(?P[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P[a-z]+)$', - r'\b(?Pvfl[a-zA-Z0-9_-]+)\b.*?\.(?P[a-z]+)$', + r'/s/player/(?P[a-zA-Z0-9_-]{8,})/player', + r'/(?P[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$', + r'\b(?Pvfl[a-zA-Z0-9_-]+)\b.*?\.js$', ) + _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') + + _GEO_BYPASS = False + + IE_NAME = 'youtube' + _TESTS = [ + { + 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9', + 'info_dict': { + 'id': 'BaW_jenozKc', + 'ext': 'mp4', + 'title': 'youtube-dl test video "\'/\\ä↭𝕐', + 'uploader': 'Philipp Hagemeister', + 'uploader_id': 'phihag', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', + 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', + 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', + 'upload_date': '20121002', + 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', + 'categories': ['Science & Technology'], + 'tags': ['youtube-dl'], + 'duration': 10, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'start_time': 1, + 'end_time': 9, + } + }, + { + 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ', + 'note': 'Embed-only video (#1746)', + 'info_dict': { + 'id': 'yZIXLfi8CZQ', + 'ext': 'mp4', + 'upload_date': '20120608', + 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012', + 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7', + 'uploader': 'SET India', + 'uploader_id': 'setindia', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia', + 'age_limit': 18, + }, + 'skip': 'Private video', + }, + { + 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ', + 'note': 'Use the first video ID in the URL', + 'info_dict': { + 'id': 'BaW_jenozKc', + 'ext': 'mp4', + 'title': 'youtube-dl test video "\'/\\ä↭𝕐', + 'uploader': 'Philipp Hagemeister', + 'uploader_id': 'phihag', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', + 'upload_date': '20121002', + 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', + 'categories': ['Science & Technology'], + 'tags': ['youtube-dl'], + 'duration': 10, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I', + 'note': '256k DASH audio (format 141) via DASH manifest', + 'info_dict': { + 'id': 'a9LDPn-MO4I', + 'ext': 'm4a', + 'upload_date': '20121002', + 'uploader_id': '8KVIDEO', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO', + 'description': '', + 'uploader': '8KVIDEO', + 'title': 'UHDTV TEST 8K VIDEO.mp4' + }, + 'params': { + 'youtube_include_dash_manifest': True, + 'format': '141', + }, + 'skip': 'format 141 not served anymore', + }, + # DASH manifest with encrypted signature + { + 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA', + 'info_dict': { + 'id': 'IB3lcPjvWLA', + 'ext': 'm4a', + 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson', + 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf', + 'duration': 244, + 'uploader': 'AfrojackVEVO', + 'uploader_id': 'AfrojackVEVO', + 'upload_date': '20131011', + 'abr': 129.495, + }, + 'params': { + 'youtube_include_dash_manifest': True, + 'format': '141/bestaudio[ext=m4a]', + }, + }, + # Controversy video + { + 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8', + 'info_dict': { + 'id': 'T4XJQO3qol8', + 'ext': 'mp4', + 'duration': 219, + 'upload_date': '20100909', + 'uploader': 'Amazing Atheist', + 'uploader_id': 'TheAmazingAtheist', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', + 'title': 'Burning Everyone\'s Koran', + 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms \r\n\r\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', + } + }, + # Normal age-gate video (No vevo, embed allowed), available via embed page + { + 'url': 'https://youtube.com/watch?v=HtVdAasjOgU', + 'info_dict': { + 'id': 'HtVdAasjOgU', + 'ext': 'mp4', + 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer', + 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', + 'duration': 142, + 'uploader': 'The Witcher', + 'uploader_id': 'WitcherGame', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame', + 'upload_date': '20140605', + 'age_limit': 18, + }, + }, + { + # Age-gated video only available with authentication (unavailable + # via embed page workaround) + 'url': 'XgnwCQzjau8', + 'only_matching': True, + }, + # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421) + # YouTube Red ad is not captured for creator + { + 'url': '__2ABJjxzNo', + 'info_dict': { + 'id': '__2ABJjxzNo', + 'ext': 'mp4', + 'duration': 266, + 'upload_date': '20100430', + 'uploader_id': 'deadmau5', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', + 'creator': 'deadmau5', + 'description': 'md5:6cbcd3a92ce1bc676fc4d6ab4ace2336', + 'uploader': 'deadmau5', + 'title': 'Deadmau5 - Some Chords (HD)', + 'alt_title': 'Some Chords', + }, + 'expected_warnings': [ + 'DASH manifest missing', + ] + }, + # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431) + { + 'url': 'lqQg6PlCWgI', + 'info_dict': { + 'id': 'lqQg6PlCWgI', + 'ext': 'mp4', + 'duration': 6085, + 'upload_date': '20150827', + 'uploader_id': 'olympic', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic', + 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', + 'uploader': 'Olympic', + 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', + }, + 'params': { + 'skip_download': 'requires avconv', + } + }, + # Non-square pixels + { + 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0', + 'info_dict': { + 'id': '_b-2C3KPAM0', + 'ext': 'mp4', + 'stretched_ratio': 16 / 9., + 'duration': 85, + 'upload_date': '20110310', + 'uploader_id': 'AllenMeow', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow', + 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', + 'uploader': '孫ᄋᄅ', + 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', + }, + }, + # url_encoded_fmt_stream_map is empty string + { + 'url': 'qEJwOuvDf7I', + 'info_dict': { + 'id': 'qEJwOuvDf7I', + 'ext': 'webm', + 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге', + 'description': '', + 'upload_date': '20150404', + 'uploader_id': 'spbelect', + 'uploader': 'Наблюдатели Петербурга', + }, + 'params': { + 'skip_download': 'requires avconv', + }, + 'skip': 'This live event has ended.', + }, + # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097) + { + 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y', + 'info_dict': { + 'id': 'FIl7x6_3R5Y', + 'ext': 'webm', + 'title': 'md5:7b81415841e02ecd4313668cde88737a', + 'description': 'md5:116377fd2963b81ec4ce64b542173306', + 'duration': 220, + 'upload_date': '20150625', + 'uploader_id': 'dorappi2000', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000', + 'uploader': 'dorappi2000', + 'formats': 'mincount:31', + }, + 'skip': 'not actual anymore', + }, + # DASH manifest with segment_list + { + 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8', + 'md5': '8ce563a1d667b599d21064e982ab9e31', + 'info_dict': { + 'id': 'CsmdDsKjzN8', + 'ext': 'mp4', + 'upload_date': '20150501', # According to ' valid video id redirection + 'url': 'DJztXj2GPfl', + 'info_dict': { + 'id': 'DJztXj2GPfk', + 'ext': 'mp4', + 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)', + 'description': 'md5:bf577a41da97918e94fa9798d9228825', + 'upload_date': '20090125', + 'uploader': 'Prochorowka', + 'uploader_id': 'Prochorowka', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka', + 'artist': 'Panjabi MC', + 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix', + 'album': 'Beware of the Boys (Mundian To Bach Ke)', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Video unavailable', + }, + { + # empty description results in an empty string + 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k', + 'info_dict': { + 'id': 'x41yOUIvK2k', + 'ext': 'mp4', + 'title': 'IMG 3456', + 'description': '', + 'upload_date': '20170613', + 'uploader_id': 'ElevageOrVert', + 'uploader': 'ElevageOrVert', + }, + 'params': { + 'skip_download': True, + }, + }, + { + # with '};' inside yt initial data (see [1]) + # see [2] for an example with '};' inside ytInitialPlayerResponse + # 1. https://github.com/ytdl-org/youtube-dl/issues/27093 + # 2. https://github.com/ytdl-org/youtube-dl/issues/27216 + 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no', + 'info_dict': { + 'id': 'CHqg6qOn4no', + 'ext': 'mp4', + 'title': 'Part 77 Sort a list of simple types in c#', + 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc', + 'upload_date': '20130831', + 'uploader_id': 'kudvenkat', + 'uploader': 'kudvenkat', + }, + 'params': { + 'skip_download': True, + }, + }, + { + # another example of '};' in ytInitialData + 'url': 'https://www.youtube.com/watch?v=gVfgbahppCY', + 'only_matching': True, + }, + { + 'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ', + 'only_matching': True, + }, + { + # https://github.com/ytdl-org/youtube-dl/pull/28094 + 'url': 'OtqTfy26tG0', + 'info_dict': { + 'id': 'OtqTfy26tG0', + 'ext': 'mp4', + 'title': 'Burn Out', + 'description': 'md5:8d07b84dcbcbfb34bc12a56d968b6131', + 'upload_date': '20141120', + 'uploader': 'The Cinematic Orchestra - Topic', + 'uploader_id': 'UCIzsJBIyo8hhpFm1NK0uLgw', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCIzsJBIyo8hhpFm1NK0uLgw', + 'artist': 'The Cinematic Orchestra', + 'track': 'Burn Out', + 'album': 'Every Day', + 'release_data': None, + 'release_year': None, + }, + 'params': { + 'skip_download': True, + }, + }, + { + # controversial video, only works with bpctr when authenticated with cookies + 'url': 'https://www.youtube.com/watch?v=nGC3D_FkCmg', + 'only_matching': True, + }, + { + # restricted location, https://github.com/ytdl-org/youtube-dl/issues/28685 + 'url': 'cBvYw8_A0vQ', + 'info_dict': { + 'id': 'cBvYw8_A0vQ', + 'ext': 'mp4', + 'title': '4K Ueno Okachimachi Street Scenes 上野御徒町歩き', + 'description': 'md5:ea770e474b7cd6722b4c95b833c03630', + 'upload_date': '20201120', + 'uploader': 'Walk around Japan', + 'uploader_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw', + }, + 'params': { + 'skip_download': True, + }, + }, + ] _formats = { '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, @@ -565,647 +1277,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, } - _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') - _GEO_BYPASS = False - - IE_NAME = 'youtube' - _TESTS = [ - { - 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9', - 'info_dict': { - 'id': 'BaW_jenozKc', - 'ext': 'mp4', - 'title': 'youtube-dl test video "\'/\\ä↭𝕐', - 'uploader': 'Philipp Hagemeister', - 'uploader_id': 'phihag', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', - 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', - 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', - 'upload_date': '20121002', - 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', - 'categories': ['Science & Technology'], - 'tags': ['youtube-dl'], - 'duration': 10, - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - 'start_time': 1, - 'end_time': 9, - } - }, - { - 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ', - 'note': 'Embed-only video (#1746)', - 'info_dict': { - 'id': 'yZIXLfi8CZQ', - 'ext': 'mp4', - 'upload_date': '20120608', - 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012', - 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7', - 'uploader': 'SET India', - 'uploader_id': 'setindia', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia', - 'age_limit': 18, - } - }, - { - 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ', - 'note': 'Use the first video ID in the URL', - 'info_dict': { - 'id': 'BaW_jenozKc', - 'ext': 'mp4', - 'title': 'youtube-dl test video "\'/\\ä↭𝕐', - 'uploader': 'Philipp Hagemeister', - 'uploader_id': 'phihag', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', - 'upload_date': '20121002', - 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', - 'categories': ['Science & Technology'], - 'tags': ['youtube-dl'], - 'duration': 10, - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I', - 'note': '256k DASH audio (format 141) via DASH manifest', - 'info_dict': { - 'id': 'a9LDPn-MO4I', - 'ext': 'm4a', - 'upload_date': '20121002', - 'uploader_id': '8KVIDEO', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO', - 'description': '', - 'uploader': '8KVIDEO', - 'title': 'UHDTV TEST 8K VIDEO.mp4' - }, - 'params': { - 'youtube_include_dash_manifest': True, - 'format': '141', - }, - 'skip': 'format 141 not served anymore', - }, - # DASH manifest with encrypted signature - { - 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA', - 'info_dict': { - 'id': 'IB3lcPjvWLA', - 'ext': 'm4a', - 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson', - 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf', - 'duration': 244, - 'uploader': 'AfrojackVEVO', - 'uploader_id': 'AfrojackVEVO', - 'upload_date': '20131011', - }, - 'params': { - 'youtube_include_dash_manifest': True, - 'format': '141/bestaudio[ext=m4a]', - }, - }, - # Controversy video - { - 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8', - 'info_dict': { - 'id': 'T4XJQO3qol8', - 'ext': 'mp4', - 'duration': 219, - 'upload_date': '20100909', - 'uploader': 'Amazing Atheist', - 'uploader_id': 'TheAmazingAtheist', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', - 'title': 'Burning Everyone\'s Koran', - 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', - } - }, - # Normal age-gate video (No vevo, embed allowed), available via embed page - { - 'url': 'https://youtube.com/watch?v=HtVdAasjOgU', - 'info_dict': { - 'id': 'HtVdAasjOgU', - 'ext': 'mp4', - 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer', - 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', - 'duration': 142, - 'uploader': 'The Witcher', - 'uploader_id': 'WitcherGame', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame', - 'upload_date': '20140605', - 'age_limit': 18, - }, - }, - { - # Age-gated video only available with authentication (unavailable - # via embed page workaround) - 'url': 'XgnwCQzjau8', - 'only_matching': True, - }, - # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421) - # YouTube Red ad is not captured for creator - { - 'url': '__2ABJjxzNo', - 'info_dict': { - 'id': '__2ABJjxzNo', - 'ext': 'mp4', - 'duration': 266, - 'upload_date': '20100430', - 'uploader_id': 'deadmau5', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', - 'creator': 'Dada Life, deadmau5', - 'description': 'md5:12c56784b8032162bb936a5f76d55360', - 'uploader': 'deadmau5', - 'title': 'Deadmau5 - Some Chords (HD)', - 'alt_title': 'This Machine Kills Some Chords', - }, - 'expected_warnings': [ - 'DASH manifest missing', - ] - }, - # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431) - { - 'url': 'lqQg6PlCWgI', - 'info_dict': { - 'id': 'lqQg6PlCWgI', - 'ext': 'mp4', - 'duration': 6085, - 'upload_date': '20150827', - 'uploader_id': 'olympic', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic', - 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', - 'uploader': 'Olympic', - 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', - }, - 'params': { - 'skip_download': 'requires avconv', - } - }, - # Non-square pixels - { - 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0', - 'info_dict': { - 'id': '_b-2C3KPAM0', - 'ext': 'mp4', - 'stretched_ratio': 16 / 9., - 'duration': 85, - 'upload_date': '20110310', - 'uploader_id': 'AllenMeow', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow', - 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', - 'uploader': '孫ᄋᄅ', - 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', - }, - }, - # url_encoded_fmt_stream_map is empty string - { - 'url': 'qEJwOuvDf7I', - 'info_dict': { - 'id': 'qEJwOuvDf7I', - 'ext': 'webm', - 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге', - 'description': '', - 'upload_date': '20150404', - 'uploader_id': 'spbelect', - 'uploader': 'Наблюдатели Петербурга', - }, - 'params': { - 'skip_download': 'requires avconv', - }, - 'skip': 'This live event has ended.', - }, - # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097) - { - 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y', - 'info_dict': { - 'id': 'FIl7x6_3R5Y', - 'ext': 'webm', - 'title': 'md5:7b81415841e02ecd4313668cde88737a', - 'description': 'md5:116377fd2963b81ec4ce64b542173306', - 'duration': 220, - 'upload_date': '20150625', - 'uploader_id': 'dorappi2000', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000', - 'uploader': 'dorappi2000', - 'formats': 'mincount:31', - }, - 'skip': 'not actual anymore', - }, - # DASH manifest with segment_list - { - 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8', - 'md5': '8ce563a1d667b599d21064e982ab9e31', - 'info_dict': { - 'id': 'CsmdDsKjzN8', - 'ext': 'mp4', - 'upload_date': '20150501', # According to ' valid video id redirection - 'url': 'DJztXj2GPfl', - 'info_dict': { - 'id': 'DJztXj2GPfk', - 'ext': 'mp4', - 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)', - 'description': 'md5:bf577a41da97918e94fa9798d9228825', - 'upload_date': '20090125', - 'uploader': 'Prochorowka', - 'uploader_id': 'Prochorowka', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka', - 'artist': 'Panjabi MC', - 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix', - 'album': 'Beware of the Boys (Mundian To Bach Ke)', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # empty description results in an empty string - 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k', - 'info_dict': { - 'id': 'x41yOUIvK2k', - 'ext': 'mp4', - 'title': 'IMG 3456', - 'description': '', - 'upload_date': '20170613', - 'uploader_id': 'ElevageOrVert', - 'uploader': 'ElevageOrVert', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # with '};' inside yt initial data (see [1]) - # see [2] for an example with '};' inside ytInitialPlayerResponse - # 1. https://github.com/ytdl-org/youtube-dl/issues/27093 - # 2. https://github.com/ytdl-org/youtube-dl/issues/27216 - 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no', - 'info_dict': { - 'id': 'CHqg6qOn4no', - 'ext': 'mp4', - 'title': 'Part 77 Sort a list of simple types in c#', - 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc', - 'upload_date': '20130831', - 'uploader_id': 'kudvenkat', - 'uploader': 'kudvenkat', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # another example of '};' in ytInitialData - 'url': 'https://www.youtube.com/watch?v=gVfgbahppCY', - 'only_matching': True, - }, - { - 'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ', - 'only_matching': True, - }, - ] + @classmethod + def suitable(cls, url): + # Hack for lazy extractors until more generic solution is implemented + # (see #28780) + from .youtube import parse_qs + qs = parse_qs(url) + if qs.get('list', [None])[0]: + return False + return super(YoutubeIE, cls).suitable(url) def __init__(self, *args, **kwargs): super(YoutubeIE, self).__init__(*args, **kwargs) + self._code_cache = {} self._player_cache = {} - def report_video_info_webpage_download(self, video_id): - """Report attempt to download video info webpage.""" - self.to_screen('%s: Downloading video info webpage' % video_id) - - def report_information_extraction(self, video_id): - """Report attempt to extract video information.""" - self.to_screen('%s: Extracting video information' % video_id) - - def report_unavailable_format(self, video_id, format): - """Report extracted video URL.""" - self.to_screen('%s: Format %s not available' % (video_id, format)) - - def report_rtmp_download(self): - """Indicate the download will use the RTMP protocol.""" - self.to_screen('RTMP download detected') - def _signature_cache_id(self, example_sig): """ Return a string representation of a signature """ return '.'.join(compat_str(len(part)) for part in example_sig.split('.')) @@ -1218,40 +1305,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor): break else: raise ExtractorError('Cannot identify player %r' % player_url) - return id_m.group('ext'), id_m.group('id') + return id_m.group('id') def _extract_signature_function(self, video_id, player_url, example_sig): - player_type, player_id = self._extract_player_info(player_url) + player_id = self._extract_player_info(player_url) # Read from filesystem cache - func_id = '%s_%s_%s' % ( - player_type, player_id, self._signature_cache_id(example_sig)) + func_id = 'js_%s_%s' % ( + player_id, self._signature_cache_id(example_sig)) assert os.path.basename(func_id) == func_id cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id) if cache_spec is not None: return lambda s: ''.join(s[i] for i in cache_spec) - download_note = ( - 'Downloading player %s' % player_url - if self._downloader.params.get('verbose') else - 'Downloading %s player %s' % (player_type, player_id) - ) - if player_type == 'js': - code = self._download_webpage( + if player_id not in self._code_cache: + self._code_cache[player_id] = self._download_webpage( player_url, video_id, - note=download_note, + note='Downloading player ' + player_id, errnote='Download of %s failed' % player_url) - res = self._parse_sig_js(code) - elif player_type == 'swf': - urlh = self._request_webpage( - player_url, video_id, - note=download_note, - errnote='Download of %s failed' % player_url) - code = urlh.read() - res = self._parse_sig_swf(code) - else: - assert False, 'Invalid player type %r' % player_type + code = self._code_cache[player_id] + res = self._parse_sig_js(code) test_string = ''.join(map(compat_chr, range(len(example_sig)))) cache_res = res(test_string) @@ -1303,7 +1377,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): funcname = self._search_regex( (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', - r'(?:\b|[^a-zA-Z0-9$])(?P[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', + r'\bm=(?P[a-zA-Z0-9$]{2,})\(decodeURIComponent\(h\.s\)\)', + r'\bc&&\(c=(?P[a-zA-Z0-9$]{2,})\(decodeURIComponent\(c\)\)', + r'(?:\b|[^a-zA-Z0-9$])(?P[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)', + r'(?:\b|[^a-zA-Z0-9$])(?P[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', r'(?P[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', # Obsolete patterns r'(["\'])signature\1\s*,\s*(?P[a-zA-Z0-9$]+)\(', @@ -1320,14 +1397,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): initial_function = jsi.extract_function(funcname) return lambda s: initial_function([s]) - def _parse_sig_swf(self, file_contents): - swfi = SWFInterpreter(file_contents) - TARGET_CLASSNAME = 'SignatureDecipher' - searched_class = swfi.extract_class(TARGET_CLASSNAME) - initial_function = swfi.extract_function(searched_class, 'decipher') - return lambda s: initial_function([s]) - - def _decrypt_signature(self, s, video_id, player_url, age_gate=False): + def _decrypt_signature(self, s, video_id, player_url): """Turn the encrypted s field into a working signature""" if player_url is None: @@ -1354,158 +1424,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): raise ExtractorError( 'Signature extraction failed: ' + tb, cause=e) - def _get_subtitles(self, video_id, webpage): - try: - subs_doc = self._download_xml( - 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, - video_id, note=False) - except ExtractorError as err: - self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err)) - return {} - - sub_lang_list = {} - for track in subs_doc.findall('track'): - lang = track.attrib['lang_code'] - if lang in sub_lang_list: - continue - sub_formats = [] - for ext in self._SUBTITLE_FORMATS: - params = compat_urllib_parse_urlencode({ - 'lang': lang, - 'v': video_id, - 'fmt': ext, - 'name': track.attrib['name'].encode('utf-8'), - }) - sub_formats.append({ - 'url': 'https://www.youtube.com/api/timedtext?' + params, - 'ext': ext, - }) - sub_lang_list[lang] = sub_formats - if not sub_lang_list: - self._downloader.report_warning('video doesn\'t have subtitles') - return {} - return sub_lang_list - - def _get_ytplayer_config(self, video_id, webpage): - patterns = ( - # User data may contain arbitrary character sequences that may affect - # JSON extraction with regex, e.g. when '};' is contained the second - # regex won't capture the whole JSON. Yet working around by trying more - # concrete regex first keeping in mind proper quoted string handling - # to be implemented in future that will replace this workaround (see - # https://github.com/ytdl-org/youtube-dl/issues/7468, - # https://github.com/ytdl-org/youtube-dl/pull/7599) - r';ytplayer\.config\s*=\s*({.+?});ytplayer', - r';ytplayer\.config\s*=\s*({.+?});', - ) - config = self._search_regex( - patterns, webpage, 'ytplayer.config', default=None) - if config: - return self._parse_json( - uppercase_escape(config), video_id, fatal=False) - - def _get_automatic_captions(self, video_id, player_response, player_config): - """We need the webpage for getting the captions url, pass it as an - argument to speed up the process.""" - self.to_screen('%s: Looking for automatic captions' % video_id) - err_msg = 'Couldn\'t find automatic captions for %s' % video_id - if not (player_response or player_config): - self._downloader.report_warning(err_msg) - return {} - try: - args = player_config.get('args') if player_config else {} - caption_url = args.get('ttsurl') - if caption_url: - timestamp = args['timestamp'] - # We get the available subtitles - list_params = compat_urllib_parse_urlencode({ - 'type': 'list', - 'tlangs': 1, - 'asrs': 1, - }) - list_url = caption_url + '&' + list_params - caption_list = self._download_xml(list_url, video_id) - original_lang_node = caption_list.find('track') - if original_lang_node is None: - self._downloader.report_warning('Video doesn\'t have automatic captions') - return {} - original_lang = original_lang_node.attrib['lang_code'] - caption_kind = original_lang_node.attrib.get('kind', '') - - sub_lang_list = {} - for lang_node in caption_list.findall('target'): - sub_lang = lang_node.attrib['lang_code'] - sub_formats = [] - for ext in self._SUBTITLE_FORMATS: - params = compat_urllib_parse_urlencode({ - 'lang': original_lang, - 'tlang': sub_lang, - 'fmt': ext, - 'ts': timestamp, - 'kind': caption_kind, - }) - sub_formats.append({ - 'url': caption_url + '&' + params, - 'ext': ext, - }) - sub_lang_list[sub_lang] = sub_formats - return sub_lang_list - - def make_captions(sub_url, sub_langs): - parsed_sub_url = compat_urllib_parse_urlparse(sub_url) - caption_qs = compat_parse_qs(parsed_sub_url.query) - captions = {} - for sub_lang in sub_langs: - sub_formats = [] - for ext in self._SUBTITLE_FORMATS: - caption_qs.update({ - 'tlang': [sub_lang], - 'fmt': [ext], - }) - sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace( - query=compat_urllib_parse_urlencode(caption_qs, True))) - sub_formats.append({ - 'url': sub_url, - 'ext': ext, - }) - captions[sub_lang] = sub_formats - return captions - - # New captions format as of 22.06.2017 - if player_response: - renderer = player_response['captions']['playerCaptionsTracklistRenderer'] - base_url = renderer['captionTracks'][0]['baseUrl'] - sub_lang_list = [] - for lang in renderer['translationLanguages']: - lang_code = lang.get('languageCode') - if lang_code: - sub_lang_list.append(lang_code) - return make_captions(base_url, sub_lang_list) - - # Some videos don't provide ttsurl but rather caption_tracks and - # caption_translation_languages (e.g. 20LmZk1hakA) - # Does not used anymore as of 22.06.2017 - caption_tracks = args['caption_tracks'] - caption_translation_languages = args['caption_translation_languages'] - caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0] - sub_lang_list = [] - for lang in caption_translation_languages.split(','): - lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang)) - sub_lang = lang_qs.get('lc', [None])[0] - if sub_lang: - sub_lang_list.append(sub_lang) - return make_captions(caption_url, sub_lang_list) - # An extractor error can be raise by the download process if there are - # no automatic captions but there are subtitles - except (KeyError, IndexError, ExtractorError): - self._downloader.report_warning(err_msg) - return {} - - def _mark_watched(self, video_id, video_info, player_response): + def _mark_watched(self, video_id, player_response): playback_url = url_or_none(try_get( player_response, - lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get( - video_info, lambda x: x['videostats_playback_base_url'][0])) + lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl'])) if not playback_url: return parsed_playback_url = compat_urlparse.urlparse(playback_url) @@ -1572,12 +1494,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_id = mobj.group(2) return video_id - def _extract_chapters_from_json(self, webpage, video_id, duration): - if not webpage: - return - data = self._extract_yt_initial_data(video_id, webpage) - if not data or not isinstance(data, dict): - return + def _extract_chapters_from_json(self, data, video_id, duration): chapters_list = try_get( data, lambda x: x['playerOverlays'] @@ -1617,244 +1534,89 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }) return chapters - @staticmethod - def _extract_chapters_from_description(description, duration): - if not description: - return None - chapter_lines = re.findall( - r'(?:^|)([^<]*]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)[^>]*)(?=$|)', - description) - if not chapter_lines: - return None - chapters = [] - for next_num, (chapter_line, time_point) in enumerate( - chapter_lines, start=1): - start_time = parse_duration(time_point) - if start_time is None: - continue - if start_time > duration: - break - end_time = (duration if next_num == len(chapter_lines) - else parse_duration(chapter_lines[next_num][1])) - if end_time is None: - continue - if end_time > duration: - end_time = duration - if start_time > end_time: - break - chapter_title = re.sub( - r']+>[^<]+', '', chapter_line).strip(' \t-') - chapter_title = re.sub(r'\s+', ' ', chapter_title) - chapters.append({ - 'start_time': start_time, - 'end_time': end_time, - 'title': chapter_title, - }) - return chapters - - def _extract_chapters(self, webpage, description, video_id, duration): - return (self._extract_chapters_from_json(webpage, video_id, duration) - or self._extract_chapters_from_description(description, duration)) + def _extract_yt_initial_variable(self, webpage, regex, video_id, name): + return self._parse_json(self._search_regex( + (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE), + regex), webpage, name, default='{}'), video_id, fatal=False) def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) + video_id = self._match_id(url) + base_url = self.http_scheme() + '//www.youtube.com/' + webpage_url = base_url + 'watch?v=' + video_id + webpage = self._download_webpage( + webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False) - proto = ( - 'http' if self._downloader.params.get('prefer_insecure', False) - else 'https') + player_response = None + if webpage: + player_response = self._extract_yt_initial_variable( + webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, + video_id, 'initial player response') + if not player_response: + player_response = self._call_api( + 'player', {'videoId': video_id}, video_id) - start_time = None - end_time = None - parsed_url = compat_urllib_parse_urlparse(url) - for component in [parsed_url.fragment, parsed_url.query]: - query = compat_parse_qs(component) - if start_time is None and 't' in query: - start_time = parse_duration(query['t'][0]) - if start_time is None and 'start' in query: - start_time = parse_duration(query['start'][0]) - if end_time is None and 'end' in query: - end_time = parse_duration(query['end'][0]) + playability_status = player_response.get('playabilityStatus') or {} + if playability_status.get('reason') == 'Sign in to confirm your age': + video_info = self._download_webpage( + base_url + 'get_video_info', video_id, + 'Refetching age-gated info webpage', + 'unable to download video info webpage', query={ + 'video_id': video_id, + 'eurl': 'https://youtube.googleapis.com/v/' + video_id, + 'html5': 1, + # See https://github.com/ytdl-org/youtube-dl/issues/29333#issuecomment-864049544 + 'c': 'TVHTML5', + 'cver': '6.20180913', + }, fatal=False) + if video_info: + pr = self._parse_json( + try_get( + compat_parse_qs(video_info), + lambda x: x['player_response'][0], compat_str) or '{}', + video_id, fatal=False) + if pr and isinstance(pr, dict): + player_response = pr - # Extract original video URL from URL with redirection, like age verification, using next_url parameter - mobj = re.search(self._NEXT_URL_RE, url) - if mobj: - url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/') - video_id = self.extract_id(url) + trailer_video_id = try_get( + playability_status, + lambda x: x['errorScreen']['playerLegacyDesktopYpcTrailerRenderer']['trailerVideoId'], + compat_str) + if trailer_video_id: + return self.url_result( + trailer_video_id, self.ie_key(), trailer_video_id) - # Get video webpage - url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id - video_webpage, urlh = self._download_webpage_handle(url, video_id) - - qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query) - video_id = qs.get('v', [None])[0] or video_id - - # Attempt to extract SWF player URL - mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) - if mobj is not None: - player_url = re.sub(r'\\(.)', r'\1', mobj.group(1)) - else: - player_url = None - - dash_mpds = [] - - def add_dash_mpd(video_info): - dash_mpd = video_info.get('dashmpd') - if dash_mpd and dash_mpd[0] not in dash_mpds: - dash_mpds.append(dash_mpd[0]) - - def add_dash_mpd_pr(pl_response): - dash_mpd = url_or_none(try_get( - pl_response, lambda x: x['streamingData']['dashManifestUrl'], - compat_str)) - if dash_mpd and dash_mpd not in dash_mpds: - dash_mpds.append(dash_mpd) - - is_live = None - view_count = None - - def extract_view_count(v_info): - return int_or_none(try_get(v_info, lambda x: x['view_count'][0])) - - def extract_player_response(player_response, video_id): - pl_response = str_or_none(player_response) - if not pl_response: + def get_text(x): + if not x: return - pl_response = self._parse_json(pl_response, video_id, fatal=False) - if isinstance(pl_response, dict): - add_dash_mpd_pr(pl_response) - return pl_response + text = x.get('simpleText') + if text and isinstance(text, compat_str): + return text + runs = x.get('runs') + if not isinstance(runs, list): + return + return ''.join([r['text'] for r in runs if isinstance(r.get('text'), compat_str)]) - player_response = {} - - # Get video info - video_info = {} - embed_webpage = None - ytplayer_config = None - - if re.search(r'["\']status["\']\s*:\s*["\']LOGIN_REQUIRED', video_webpage) is not None: - age_gate = True - # We simulate the access to the video from www.youtube.com/v/{video_id} - # this can be viewed without login into Youtube - url = proto + '://www.youtube.com/embed/%s' % video_id - embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage') - data = compat_urllib_parse_urlencode({ - 'video_id': video_id, - 'eurl': 'https://youtube.googleapis.com/v/' + video_id, - 'sts': self._search_regex( - r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''), - }) - video_info_url = proto + '://www.youtube.com/get_video_info?' + data - try: - video_info_webpage = self._download_webpage( - video_info_url, video_id, - note='Refetching age-gated info webpage', - errnote='unable to download video info webpage') - except ExtractorError: - video_info_webpage = None - if video_info_webpage: - video_info = compat_parse_qs(video_info_webpage) - pl_response = video_info.get('player_response', [None])[0] - player_response = extract_player_response(pl_response, video_id) - add_dash_mpd(video_info) - view_count = extract_view_count(video_info) - else: - age_gate = False - # Try looking directly into the video webpage - ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) - if ytplayer_config: - args = ytplayer_config['args'] - if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'): - # Convert to the same format returned by compat_parse_qs - video_info = dict((k, [v]) for k, v in args.items()) - add_dash_mpd(video_info) - # Rental video is not rented but preview is available (e.g. - # https://www.youtube.com/watch?v=yYr8q0y5Jfg, - # https://github.com/ytdl-org/youtube-dl/issues/10532) - if not video_info and args.get('ypc_vid'): - return self.url_result( - args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) - if args.get('livestream') == '1' or args.get('live_playback') == 1: - is_live = True - if not player_response: - player_response = extract_player_response(args.get('player_response'), video_id) - if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): - add_dash_mpd_pr(player_response) - - if not video_info and not player_response: - player_response = extract_player_response( - self._search_regex( - (r'%s\s*%s' % (self._YT_INITIAL_PLAYER_RESPONSE_RE, self._YT_INITIAL_BOUNDARY_RE), - self._YT_INITIAL_PLAYER_RESPONSE_RE), video_webpage, - 'initial player response', default='{}'), - video_id) - - def extract_unavailable_message(): - messages = [] - for tag, kind in (('h1', 'message'), ('div', 'submessage')): - msg = self._html_search_regex( - r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)'.format(tag=tag, kind=kind), - video_webpage, 'unavailable %s' % kind, default=None) - if msg: - messages.append(msg) - if messages: - return '\n'.join(messages) - - if not video_info and not player_response: - unavailable_message = extract_unavailable_message() - if not unavailable_message: - unavailable_message = 'Unable to extract video data' - raise ExtractorError( - 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id) - - if not isinstance(video_info, dict): - video_info = {} - - video_details = try_get( - player_response, lambda x: x['videoDetails'], dict) or {} + search_meta = ( + lambda x: self._html_search_meta(x, webpage, default=None)) \ + if webpage else lambda x: None + video_details = player_response.get('videoDetails') or {} microformat = try_get( - player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {} - - video_title = video_info.get('title', [None])[0] or video_details.get('title') - if not video_title: - self._downloader.report_warning('Unable to extract video title') - video_title = '_' - - description_original = video_description = get_element_by_id("eow-description", video_webpage) - if video_description: - - def replace_url(m): - redir_url = compat_urlparse.urljoin(url, m.group(1)) - parsed_redir_url = compat_urllib_parse_urlparse(redir_url) - if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect': - qs = compat_parse_qs(parsed_redir_url.query) - q = qs.get('q') - if q and q[0]: - return q[0] - return redir_url - - description_original = video_description = re.sub(r'''(?x) - ]*> - [^<]+\.{3}\s* - - ''', replace_url, video_description) - video_description = clean_html(video_description) - else: - video_description = video_details.get('shortDescription') - if video_description is None: - video_description = self._html_search_meta('description', video_webpage) + player_response, + lambda x: x['microformat']['playerMicroformatRenderer'], + dict) or {} + video_title = video_details.get('title') \ + or get_text(microformat.get('title')) \ + or search_meta(['og:title', 'twitter:title', 'title']) + video_description = video_details.get('shortDescription') if not smuggled_data.get('force_singlefeed', False): if not self._downloader.params.get('noplaylist'): multifeed_metadata_list = try_get( player_response, lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'], - compat_str) or try_get( - video_info, lambda x: x['multifeed_metadata_list'][0], compat_str) + compat_str) if multifeed_metadata_list: entries = [] feed_ids = [] @@ -1862,10 +1624,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Unquote should take place before split on comma (,) since textual # fields may contain comma as well (see # https://github.com/ytdl-org/youtube-dl/issues/8536) - feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed)) + feed_data = compat_parse_qs( + compat_urllib_parse_unquote_plus(feed)) def feed_entry(name): - return try_get(feed_data, lambda x: x[name][0], compat_str) + return try_get( + feed_data, lambda x: x[name][0], compat_str) feed_id = feed_entry('id') if not feed_id: @@ -1878,7 +1642,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '_type': 'url_transparent', 'ie_key': 'Youtube', 'url': smuggle_url( - '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]), + base_url + 'watch?v=' + feed_data['id'][0], {'force_singlefeed': True}), 'title': title, }) @@ -1886,631 +1650,416 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self.to_screen( 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' % (', '.join(feed_ids), video_id)) - return self.playlist_result(entries, video_id, video_title, video_description) + return self.playlist_result( + entries, video_id, video_title, video_description) else: self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - if view_count is None: - view_count = extract_view_count(video_info) - if view_count is None and video_details: - view_count = int_or_none(video_details.get('viewCount')) - if view_count is None and microformat: - view_count = int_or_none(microformat.get('viewCount')) + formats = [] + itags = [] + itag_qualities = {} + player_url = None + q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres']) + streaming_data = player_response.get('streamingData') or {} + streaming_formats = streaming_data.get('formats') or [] + streaming_formats.extend(streaming_data.get('adaptiveFormats') or []) + for fmt in streaming_formats: + if fmt.get('targetDurationSec') or fmt.get('drmFamilies'): + continue - if is_live is None: - is_live = bool_or_none(video_details.get('isLive')) + itag = str_or_none(fmt.get('itag')) + quality = fmt.get('quality') + if itag and quality: + itag_qualities[itag] = quality + # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment + # (adding `&sq=0` to the URL) and parsing emsg box to determine the + # number of fragment that would subsequently requested with (`&sq=N`) + if fmt.get('type') == 'FORMAT_STREAM_TYPE_OTF': + continue - # Check for "rental" videos - if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: - raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True) - - def _extract_filesize(media_url): - return int_or_none(self._search_regex( - r'\bclen[=/](\d+)', media_url, 'filesize', default=None)) - - streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or [] - streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or []) - - if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): - self.report_rtmp_download() - formats = [{ - 'format_id': '_rtmp', - 'protocol': 'rtmp', - 'url': video_info['conn'][0], - 'player_url': player_url, - }] - elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1): - encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0] - if 'rtmpe%3Dyes' in encoded_url_map: - raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True) - formats = [] - formats_spec = {} - fmt_list = video_info.get('fmt_list', [''])[0] - if fmt_list: - for fmt in fmt_list.split(','): - spec = fmt.split('/') - if len(spec) > 1: - width_height = spec[1].split('x') - if len(width_height) == 2: - formats_spec[spec[0]] = { - 'resolution': spec[1], - 'width': int_or_none(width_height[0]), - 'height': int_or_none(width_height[1]), - } - for fmt in streaming_formats: - itag = str_or_none(fmt.get('itag')) - if not itag: + fmt_url = fmt.get('url') + if not fmt_url: + sc = compat_parse_qs(fmt.get('signatureCipher')) + fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0])) + encrypted_sig = try_get(sc, lambda x: x['s'][0]) + if not (sc and fmt_url and encrypted_sig): continue - quality = fmt.get('quality') - quality_label = fmt.get('qualityLabel') or quality - formats_spec[itag] = { - 'asr': int_or_none(fmt.get('audioSampleRate')), - 'filesize': int_or_none(fmt.get('contentLength')), - 'format_note': quality_label, - 'fps': int_or_none(fmt.get('fps')), - 'height': int_or_none(fmt.get('height')), - # bitrate for itag 43 is always 2147483647 - 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None, - 'width': int_or_none(fmt.get('width')), - } - - for fmt in streaming_formats: - if fmt.get('drmFamilies') or fmt.get('drm_families'): - continue - url = url_or_none(fmt.get('url')) - - if not url: - cipher = fmt.get('cipher') or fmt.get('signatureCipher') - if not cipher: + if not player_url: + if not webpage: continue - url_data = compat_parse_qs(cipher) - url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str)) - if not url: + player_url = self._search_regex( + r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"', + webpage, 'player URL', fatal=False) + if not player_url: + continue + signature = self._decrypt_signature(sc['s'][0], video_id, player_url) + sp = try_get(sc, lambda x: x['sp'][0]) or 'signature' + fmt_url += '&' + sp + '=' + signature + + if itag: + itags.append(itag) + tbr = float_or_none( + fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) + dct = { + 'asr': int_or_none(fmt.get('audioSampleRate')), + 'filesize': int_or_none(fmt.get('contentLength')), + 'format_id': itag, + 'format_note': fmt.get('qualityLabel') or quality, + 'fps': int_or_none(fmt.get('fps')), + 'height': int_or_none(fmt.get('height')), + 'quality': q(quality), + 'tbr': tbr, + 'url': fmt_url, + 'width': fmt.get('width'), + } + mimetype = fmt.get('mimeType') + if mimetype: + mobj = re.match( + r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', mimetype) + if mobj: + dct['ext'] = mimetype2ext(mobj.group(1)) + dct.update(parse_codecs(mobj.group(2))) + no_audio = dct.get('acodec') == 'none' + no_video = dct.get('vcodec') == 'none' + if no_audio: + dct['vbr'] = tbr + if no_video: + dct['abr'] = tbr + if no_audio or no_video: + dct['downloader_options'] = { + # Youtube throttles chunks >~10M + 'http_chunk_size': 10485760, + } + if dct.get('ext'): + dct['container'] = dct['ext'] + '_dash' + formats.append(dct) + + hls_manifest_url = streaming_data.get('hlsManifestUrl') + if hls_manifest_url: + for f in self._extract_m3u8_formats( + hls_manifest_url, video_id, 'mp4', fatal=False): + itag = self._search_regex( + r'/itag/(\d+)', f['url'], 'itag', default=None) + if itag: + f['format_id'] = itag + formats.append(f) + + if self._downloader.params.get('youtube_include_dash_manifest', True): + dash_manifest_url = streaming_data.get('dashManifestUrl') + if dash_manifest_url: + for f in self._extract_mpd_formats( + dash_manifest_url, video_id, fatal=False): + itag = f['format_id'] + if itag in itags: continue - else: - cipher = None - url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + if itag in itag_qualities: + f['quality'] = q(itag_qualities[itag]) + filesize = int_or_none(self._search_regex( + r'/clen/(\d+)', f.get('fragment_base_url') + or f['url'], 'file size', default=None)) + if filesize: + f['filesize'] = filesize + formats.append(f) - stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0])) - # Unsupported FORMAT_STREAM_TYPE_OTF - if stream_type == 3: - continue + if not formats: + if streaming_data.get('licenseInfos'): + raise ExtractorError( + 'This video is DRM protected.', expected=True) + pemr = try_get( + playability_status, + lambda x: x['errorScreen']['playerErrorMessageRenderer'], + dict) or {} + reason = get_text(pemr.get('reason')) or playability_status.get('reason') + subreason = pemr.get('subreason') + if subreason: + subreason = clean_html(get_text(subreason)) + if subreason == 'The uploader has not made this video available in your country.': + countries = microformat.get('availableCountries') + if not countries: + regions_allowed = search_meta('regionsAllowed') + countries = regions_allowed.split(',') if regions_allowed else None + self.raise_geo_restricted( + subreason, countries) + reason += '\n' + subreason + if reason: + raise ExtractorError(reason, expected=True) - format_id = fmt.get('itag') or url_data['itag'][0] - if not format_id: - continue - format_id = compat_str(format_id) + self._sort_formats(formats) - if cipher: - if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True): - ASSETS_RE = ( - r']+\bsrc=("[^"]+")[^>]+\bname=["\']player_ias/base', - r'"jsUrl"\s*:\s*("[^"]+")', - r'"assets":.+?"js":\s*("[^"]+")') - jsplayer_url_json = self._search_regex( - ASSETS_RE, - embed_webpage if age_gate else video_webpage, - 'JS player URL (1)', default=None) - if not jsplayer_url_json and not age_gate: - # We need the embed website after all - if embed_webpage is None: - embed_url = proto + '://www.youtube.com/embed/%s' % video_id - embed_webpage = self._download_webpage( - embed_url, video_id, 'Downloading embed webpage') - jsplayer_url_json = self._search_regex( - ASSETS_RE, embed_webpage, 'JS player URL') - - player_url = json.loads(jsplayer_url_json) - if player_url is None: - player_url_json = self._search_regex( - r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', - video_webpage, 'age gate player URL') - player_url = json.loads(player_url_json) - - if 'sig' in url_data: - url += '&signature=' + url_data['sig'][0] - elif 's' in url_data: - encrypted_sig = url_data['s'][0] - - if self._downloader.params.get('verbose'): - if player_url is None: - player_desc = 'unknown' - else: - player_type, player_version = self._extract_player_info(player_url) - player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version) - parts_sizes = self._signature_cache_id(encrypted_sig) - self.to_screen('{%s} signature length %s, %s' % - (format_id, parts_sizes, player_desc)) - - signature = self._decrypt_signature( - encrypted_sig, video_id, player_url, age_gate) - sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature' - url += '&%s=%s' % (sp, signature) - if 'ratebypass' not in url: - url += '&ratebypass=yes' - - dct = { - 'format_id': format_id, - 'url': url, - 'player_url': player_url, - } - if format_id in self._formats: - dct.update(self._formats[format_id]) - if format_id in formats_spec: - dct.update(formats_spec[format_id]) - - # Some itags are not included in DASH manifest thus corresponding formats will - # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993). - # Trying to extract metadata from url_encoded_fmt_stream_map entry. - mobj = re.search(r'^(?P\d+)[xX](?P\d+)$', url_data.get('size', [''])[0]) - width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None) - - if width is None: - width = int_or_none(fmt.get('width')) - if height is None: - height = int_or_none(fmt.get('height')) - - filesize = int_or_none(url_data.get( - 'clen', [None])[0]) or _extract_filesize(url) - - quality = url_data.get('quality', [None])[0] or fmt.get('quality') - quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel') - - tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000) - or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None - fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps')) - - more_fields = { - 'filesize': filesize, - 'tbr': tbr, - 'width': width, - 'height': height, - 'fps': fps, - 'format_note': quality_label or quality, - } - for key, value in more_fields.items(): - if value: - dct[key] = value - type_ = url_data.get('type', [None])[0] or fmt.get('mimeType') - if type_: - type_split = type_.split(';') - kind_ext = type_split[0].split('/') - if len(kind_ext) == 2: - kind, _ = kind_ext - dct['ext'] = mimetype2ext(type_split[0]) - if kind in ('audio', 'video'): - codecs = None - for mobj in re.finditer( - r'(?P[a-zA-Z_-]+)=(?P["\']?)(?P.+?)(?P=quote)(?:;|$)', type_): - if mobj.group('key') == 'codecs': - codecs = mobj.group('val') - break - if codecs: - dct.update(parse_codecs(codecs)) - if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none': - dct['downloader_options'] = { - # Youtube throttles chunks >~10M - 'http_chunk_size': 10485760, - } - formats.append(dct) - else: - manifest_url = ( - url_or_none(try_get( - player_response, - lambda x: x['streamingData']['hlsManifestUrl'], - compat_str)) - or url_or_none(try_get( - video_info, lambda x: x['hlsvp'][0], compat_str))) - if manifest_url: - formats = [] - m3u8_formats = self._extract_m3u8_formats( - manifest_url, video_id, 'mp4', fatal=False) - for a_format in m3u8_formats: - itag = self._search_regex( - r'/itag/(\d+)/', a_format['url'], 'itag', default=None) - if itag: - a_format['format_id'] = itag - if itag in self._formats: - dct = self._formats[itag].copy() - dct.update(a_format) - a_format = dct - a_format['player_url'] = player_url - # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming - a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' - formats.append(a_format) - else: - error_message = extract_unavailable_message() - if not error_message: - reason_list = try_get( - player_response, - lambda x: x['playabilityStatus']['errorScreen']['playerErrorMessageRenderer']['subreason']['runs'], - list) or [] - for reason in reason_list: - if not isinstance(reason, dict): - continue - reason_text = try_get(reason, lambda x: x['text'], compat_str) - if reason_text: - if not error_message: - error_message = '' - error_message += reason_text - if error_message: - error_message = clean_html(error_message) - if not error_message: - error_message = clean_html(try_get( - player_response, lambda x: x['playabilityStatus']['reason'], - compat_str)) - if not error_message: - error_message = clean_html( - try_get(video_info, lambda x: x['reason'][0], compat_str)) - if error_message: - raise ExtractorError(error_message, expected=True) - raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info') - - # uploader - video_uploader = try_get( - video_info, lambda x: x['author'][0], - compat_str) or str_or_none(video_details.get('author')) - if video_uploader: - video_uploader = compat_urllib_parse_unquote_plus(video_uploader) - else: - self._downloader.report_warning('unable to extract uploader name') - - # uploader_id - video_uploader_id = None - video_uploader_url = None - mobj = re.search( - r'', - video_webpage) - if mobj is not None: - video_uploader_id = mobj.group('uploader_id') - video_uploader_url = mobj.group('uploader_url') - else: - owner_profile_url = url_or_none(microformat.get('ownerProfileUrl')) - if owner_profile_url: - video_uploader_id = self._search_regex( - r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id', - default=None) - video_uploader_url = owner_profile_url - - channel_id = ( - str_or_none(video_details.get('channelId')) - or self._html_search_meta( - 'channelId', video_webpage, 'channel id', default=None) - or self._search_regex( - r'data-channel-external-id=(["\'])(?P(?:(?!\1).)+)\1', - video_webpage, 'channel id', default=None, group='id')) - channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None + keywords = video_details.get('keywords') or [] + if not keywords and webpage: + keywords = [ + unescapeHTML(m.group('content')) + for m in re.finditer(self._meta_regex('og:video:tag'), webpage)] + for keyword in keywords: + if keyword.startswith('yt:stretch='): + mobj = re.search(r'(\d+)\s*:\s*(\d+)', keyword) + if mobj: + # NB: float is intentional for forcing float division + w, h = (float(v) for v in mobj.groups()) + if w > 0 and h > 0: + ratio = w / h + for f in formats: + if f.get('vcodec') != 'none': + f['stretched_ratio'] = ratio + break thumbnails = [] - thumbnails_list = try_get( - video_details, lambda x: x['thumbnail']['thumbnails'], list) or [] - for t in thumbnails_list: - if not isinstance(t, dict): - continue - thumbnail_url = url_or_none(t.get('url')) - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'width': int_or_none(t.get('width')), - 'height': int_or_none(t.get('height')), - }) - - if not thumbnails: - video_thumbnail = None - # We try first to get a high quality image: - m_thumb = re.search(r'', - video_webpage, re.DOTALL) - if m_thumb is not None: - video_thumbnail = m_thumb.group(1) - thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str) - if thumbnail_url: - video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url) - if video_thumbnail: - thumbnails.append({'url': video_thumbnail}) - - # upload date - upload_date = self._html_search_meta( - 'datePublished', video_webpage, 'upload date', default=None) - if not upload_date: - upload_date = self._search_regex( - [r'(?s)id="eow-date.*?>(.*?)', - r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], - video_webpage, 'upload date', default=None) - if not upload_date: - upload_date = microformat.get('publishDate') or microformat.get('uploadDate') - upload_date = unified_strdate(upload_date) - - video_license = self._html_search_regex( - r']+class="title"[^>]*>\s*License\s*\s*]*>\s*
  • (.+?)]+class="title"[^>]*>\s*Music\s*\s* - ]*>\s* -
  • (?P.+?) - by (?P<creator>.+?) - (?: - \(.+?\)| - <a[^>]* - (?: - \bhref=["\']/red[^>]*>| # drop possible - >\s*Listen ad-free with YouTube Red # YouTube Red ad - ) - .*? - )?</li - ''', - video_webpage) - if m_music: - video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) - video_creator = clean_html(m_music.group('creator')) + for container in (video_details, microformat): + for thumbnail in (try_get( + container, + lambda x: x['thumbnail']['thumbnails'], list) or []): + thumbnail_url = thumbnail.get('url') + if not thumbnail_url: + continue + thumbnails.append({ + 'height': int_or_none(thumbnail.get('height')), + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + }) + if thumbnails: + break else: - video_alt_title = video_creator = None + thumbnail = search_meta(['og:image', 'twitter:image']) + if thumbnail: + thumbnails = [{'url': thumbnail}] - def extract_meta(field): - return self._html_search_regex( - r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field, - video_webpage, field, default=None) + category = microformat.get('category') or search_meta('genre') + channel_id = video_details.get('channelId') \ + or microformat.get('externalChannelId') \ + or search_meta('channelId') + duration = int_or_none( + video_details.get('lengthSeconds') + or microformat.get('lengthSeconds')) \ + or parse_duration(search_meta('duration')) + is_live = video_details.get('isLive') + owner_profile_url = microformat.get('ownerProfileUrl') - track = extract_meta('Song') - artist = extract_meta('Artist') - album = extract_meta('Album') + info = { + 'id': video_id, + 'title': self._live_title(video_title) if is_live else video_title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': video_description, + 'upload_date': unified_strdate( + microformat.get('uploadDate') + or search_meta('uploadDate')), + 'uploader': video_details['author'], + 'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None, + 'uploader_url': owner_profile_url, + 'channel_id': channel_id, + 'channel_url': 'https://www.youtube.com/channel/' + channel_id if channel_id else None, + 'duration': duration, + 'view_count': int_or_none( + video_details.get('viewCount') + or microformat.get('viewCount') + or search_meta('interactionCount')), + 'average_rating': float_or_none(video_details.get('averageRating')), + 'age_limit': 18 if ( + microformat.get('isFamilySafe') is False + or search_meta('isFamilyFriendly') == 'false' + or search_meta('og:restrictions:age') == '18+') else 0, + 'webpage_url': webpage_url, + 'categories': [category] if category else None, + 'tags': keywords, + 'is_live': is_live, + } + + pctr = try_get( + player_response, + lambda x: x['captions']['playerCaptionsTracklistRenderer'], dict) + if pctr: + def process_language(container, base_url, lang_code, query): + lang_subs = [] + for fmt in self._SUBTITLE_FORMATS: + query.update({ + 'fmt': fmt, + }) + lang_subs.append({ + 'ext': fmt, + 'url': update_url_query(base_url, query), + }) + container[lang_code] = lang_subs + + subtitles = {} + for caption_track in (pctr.get('captionTracks') or []): + base_url = caption_track.get('baseUrl') + if not base_url: + continue + if caption_track.get('kind') != 'asr': + lang_code = caption_track.get('languageCode') + if not lang_code: + continue + process_language( + subtitles, base_url, lang_code, {}) + continue + automatic_captions = {} + for translation_language in (pctr.get('translationLanguages') or []): + translation_language_code = translation_language.get('languageCode') + if not translation_language_code: + continue + process_language( + automatic_captions, base_url, translation_language_code, + {'tlang': translation_language_code}) + info['automatic_captions'] = automatic_captions + info['subtitles'] = subtitles + + parsed_url = compat_urllib_parse_urlparse(url) + for component in [parsed_url.fragment, parsed_url.query]: + query = compat_parse_qs(component) + for k, v in query.items(): + for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]: + d_k += '_time' + if d_k not in info and k in s_ks: + info[d_k] = parse_duration(query[k][0]) - # Youtube Music Auto-generated description - release_date = release_year = None if video_description: mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description) if mobj: - if not track: - track = mobj.group('track').strip() - if not artist: - artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')) - if not album: - album = mobj.group('album'.strip()) release_year = mobj.group('release_year') release_date = mobj.group('release_date') if release_date: release_date = release_date.replace('-', '') if not release_year: - release_year = int(release_date[:4]) - if release_year: - release_year = int(release_year) + release_year = release_date[:4] + info.update({ + 'album': mobj.group('album'.strip()), + 'artist': mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')), + 'track': mobj.group('track').strip(), + 'release_date': release_date, + 'release_year': int_or_none(release_year), + }) - yt_initial_data = self._extract_yt_initial_data(video_id, video_webpage) - contents = try_get(yt_initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or [] - for content in contents: - rows = try_get(content, lambda x: x['videoSecondaryInfoRenderer']['metadataRowContainer']['metadataRowContainerRenderer']['rows'], list) or [] - multiple_songs = False - for row in rows: - if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True: - multiple_songs = True - break - for row in rows: - mrr = row.get('metadataRowRenderer') or {} - mrr_title = try_get( - mrr, lambda x: x['title']['simpleText'], compat_str) - mrr_contents = try_get( - mrr, lambda x: x['contents'][0], dict) or {} - mrr_contents_text = try_get(mrr_contents, [lambda x: x['simpleText'], lambda x: x['runs'][0]['text']], compat_str) - if not (mrr_title and mrr_contents_text): - continue - if mrr_title == 'License': - video_license = mrr_contents_text - elif not multiple_songs: - if mrr_title == 'Album': - album = mrr_contents_text - elif mrr_title == 'Artist': - artist = mrr_contents_text - elif mrr_title == 'Song': - track = mrr_contents_text + initial_data = None + if webpage: + initial_data = self._extract_yt_initial_variable( + webpage, self._YT_INITIAL_DATA_RE, video_id, + 'yt initial data') + if not initial_data: + initial_data = self._call_api( + 'next', {'videoId': video_id}, video_id, fatal=False) - m_episode = re.search( - r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>', - video_webpage) - if m_episode: - series = unescapeHTML(m_episode.group('series')) - season_number = int(m_episode.group('season')) - episode_number = int(m_episode.group('episode')) - else: - series = season_number = episode_number = None + if initial_data: + chapters = self._extract_chapters_from_json( + initial_data, video_id, duration) + if not chapters: + for engagment_pannel in (initial_data.get('engagementPanels') or []): + contents = try_get( + engagment_pannel, lambda x: x['engagementPanelSectionListRenderer']['content']['macroMarkersListRenderer']['contents'], + list) + if not contents: + continue - m_cat_container = self._search_regex( - r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', - video_webpage, 'categories', default=None) - category = None - if m_cat_container: - category = self._html_search_regex( - r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category', - default=None) - if not category: - category = try_get( - microformat, lambda x: x['category'], compat_str) - video_categories = None if category is None else [category] + def chapter_time(mmlir): + return parse_duration( + get_text(mmlir.get('timeDescription'))) - video_tags = [ - unescapeHTML(m.group('content')) - for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] - if not video_tags: - video_tags = try_get(video_details, lambda x: x['keywords'], list) + chapters = [] + for next_num, content in enumerate(contents, start=1): + mmlir = content.get('macroMarkersListItemRenderer') or {} + start_time = chapter_time(mmlir) + end_time = chapter_time(try_get( + contents, lambda x: x[next_num]['macroMarkersListItemRenderer'])) \ + if next_num < len(contents) else duration + if start_time is None or end_time is None: + continue + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + 'title': get_text(mmlir.get('title')), + }) + if chapters: + break + if chapters: + info['chapters'] = chapters - def _extract_count(count_name): - return str_to_int(self._search_regex( - (r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' % re.escape(count_name), - r'["\']label["\']\s*:\s*["\']([\d,.]+)\s+%ss["\']' % re.escape(count_name)), - video_webpage, count_name, default=None)) + contents = try_get( + initial_data, + lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], + list) or [] + for content in contents: + vpir = content.get('videoPrimaryInfoRenderer') + if vpir: + stl = vpir.get('superTitleLink') + if stl: + stl = get_text(stl) + if try_get( + vpir, + lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN': + info['location'] = stl + else: + mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl) + if mobj: + info.update({ + 'series': mobj.group(1), + 'season_number': int(mobj.group(2)), + 'episode_number': int(mobj.group(3)), + }) + for tlb in (try_get( + vpir, + lambda x: x['videoActions']['menuRenderer']['topLevelButtons'], + list) or []): + tbr = tlb.get('toggleButtonRenderer') or {} + for getter, regex in [( + lambda x: x['defaultText']['accessibility']['accessibilityData'], + r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([ + lambda x: x['accessibility'], + lambda x: x['accessibilityData']['accessibilityData'], + ], r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]: + label = (try_get(tbr, getter, dict) or {}).get('label') + if label: + mobj = re.match(regex, label) + if mobj: + info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count')) + break + sbr_tooltip = try_get( + vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip']) + if sbr_tooltip: + like_count, dislike_count = sbr_tooltip.split(' / ') + info.update({ + 'like_count': str_to_int(like_count), + 'dislike_count': str_to_int(dislike_count), + }) + vsir = content.get('videoSecondaryInfoRenderer') + if vsir: + info['channel'] = get_text(try_get( + vsir, + lambda x: x['owner']['videoOwnerRenderer']['title'], + dict)) + rows = try_get( + vsir, + lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'], + list) or [] + multiple_songs = False + for row in rows: + if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True: + multiple_songs = True + break + for row in rows: + mrr = row.get('metadataRowRenderer') or {} + mrr_title = mrr.get('title') + if not mrr_title: + continue + mrr_title = get_text(mrr['title']) + mrr_contents_text = get_text(mrr['contents'][0]) + if mrr_title == 'License': + info['license'] = mrr_contents_text + elif not multiple_songs: + if mrr_title == 'Album': + info['album'] = mrr_contents_text + elif mrr_title == 'Artist': + info['artist'] = mrr_contents_text + elif mrr_title == 'Song': + info['track'] = mrr_contents_text - like_count = _extract_count('like') - dislike_count = _extract_count('dislike') + for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]: + v = info.get(s_k) + if v: + info[d_k] = v - if view_count is None: - view_count = str_to_int(self._search_regex( - r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage, - 'view count', default=None)) + self.mark_watched(video_id, player_response) - average_rating = ( - float_or_none(video_details.get('averageRating')) - or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0]))) - - # subtitles - video_subtitles = self.extract_subtitles(video_id, video_webpage) - automatic_captions = self.extract_automatic_captions(video_id, player_response, ytplayer_config) - - video_duration = try_get( - video_info, lambda x: int_or_none(x['length_seconds'][0])) - if not video_duration: - video_duration = int_or_none(video_details.get('lengthSeconds')) - if not video_duration: - video_duration = parse_duration(self._html_search_meta( - 'duration', video_webpage, 'video duration')) - - # annotations - video_annotations = None - if self._downloader.params.get('writeannotations', False): - xsrf_token = None - ytcfg = self._extract_ytcfg(video_id, video_webpage) - if ytcfg: - xsrf_token = try_get(ytcfg, lambda x: x['XSRF_TOKEN'], compat_str) - if not xsrf_token: - xsrf_token = self._search_regex( - r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>(?:(?!\2).)+)\2', - video_webpage, 'xsrf token', group='xsrf_token', fatal=False) - invideo_url = try_get( - player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str) - if xsrf_token and invideo_url: - xsrf_field_name = None - if ytcfg: - xsrf_field_name = try_get(ytcfg, lambda x: x['XSRF_FIELD_NAME'], compat_str) - if not xsrf_field_name: - xsrf_field_name = self._search_regex( - r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2', - video_webpage, 'xsrf field name', - group='xsrf_field_name', default='session_token') - video_annotations = self._download_webpage( - self._proto_relative_url(invideo_url), - video_id, note='Downloading annotations', - errnote='Unable to download video annotations', fatal=False, - data=urlencode_postdata({xsrf_field_name: xsrf_token})) - - chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration) - - # Look for the DASH manifest - if self._downloader.params.get('youtube_include_dash_manifest', True): - dash_mpd_fatal = True - for mpd_url in dash_mpds: - dash_formats = {} - try: - def decrypt_sig(mobj): - s = mobj.group(1) - dec_s = self._decrypt_signature(s, video_id, player_url, age_gate) - return '/signature/%s' % dec_s - - mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url) - - for df in self._extract_mpd_formats( - mpd_url, video_id, fatal=dash_mpd_fatal, - formats_dict=self._formats): - if not df.get('filesize'): - df['filesize'] = _extract_filesize(df['url']) - # Do not overwrite DASH format found in some previous DASH manifest - if df['format_id'] not in dash_formats: - dash_formats[df['format_id']] = df - # Additional DASH manifests may end up in HTTP Error 403 therefore - # allow them to fail without bug report message if we already have - # some DASH manifest succeeded. This is temporary workaround to reduce - # burst of bug reports until we figure out the reason and whether it - # can be fixed at all. - dash_mpd_fatal = False - except (ExtractorError, KeyError) as e: - self.report_warning( - 'Skipping DASH manifest: %r' % e, video_id) - if dash_formats: - # Remove the formats we found through non-DASH, they - # contain less info and it can be wrong, because we use - # fixed values (for example the resolution). See - # https://github.com/ytdl-org/youtube-dl/issues/5774 for an - # example. - formats = [f for f in formats if f['format_id'] not in dash_formats.keys()] - formats.extend(dash_formats.values()) - - # Check for malformed aspect ratio - stretched_m = re.search( - r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">', - video_webpage) - if stretched_m: - w = float(stretched_m.group('w')) - h = float(stretched_m.group('h')) - # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0). - # We will only process correct ratios. - if w > 0 and h > 0: - ratio = w / h - for f in formats: - if f.get('vcodec') != 'none': - f['stretched_ratio'] = ratio - - if not formats: - if 'reason' in video_info: - if 'The uploader has not made this video available in your country.' in video_info['reason']: - regions_allowed = self._html_search_meta( - 'regionsAllowed', video_webpage, default=None) - countries = regions_allowed.split(',') if regions_allowed else None - self.raise_geo_restricted( - msg=video_info['reason'][0], countries=countries) - reason = video_info['reason'][0] - if 'Invalid parameters' in reason: - unavailable_message = extract_unavailable_message() - if unavailable_message: - reason = unavailable_message - raise ExtractorError( - 'YouTube said: %s' % reason, - expected=True, video_id=video_id) - if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']): - raise ExtractorError('This video is DRM protected.', expected=True) - - self._sort_formats(formats) - - self.mark_watched(video_id, video_info, player_response) - - return { - 'id': video_id, - 'uploader': video_uploader, - 'uploader_id': video_uploader_id, - 'uploader_url': video_uploader_url, - 'channel_id': channel_id, - 'channel_url': channel_url, - 'upload_date': upload_date, - 'license': video_license, - 'creator': video_creator or artist, - 'title': video_title, - 'alt_title': video_alt_title or track, - 'thumbnails': thumbnails, - 'description': video_description, - 'categories': video_categories, - 'tags': video_tags, - 'subtitles': video_subtitles, - 'automatic_captions': automatic_captions, - 'duration': video_duration, - 'age_limit': 18 if age_gate else 0, - 'annotations': video_annotations, - 'chapters': chapters, - 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id, - 'view_count': view_count, - 'like_count': like_count, - 'dislike_count': dislike_count, - 'average_rating': average_rating, - 'formats': formats, - 'is_live': is_live, - 'start_time': start_time, - 'end_time': end_time, - 'series': series, - 'season_number': season_number, - 'episode_number': episode_number, - 'track': track, - 'artist': artist, - 'album': album, - 'release_date': release_date, - 'release_year': release_year, - } + return info class YoutubeTabIE(YoutubeBaseInfoExtractor): @@ -2523,7 +2072,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): invidio\.us )/ (?: - (?:channel|c|user|feed)/| + (?:channel|c|user|feed|hashtag)/| (?:playlist|watch)\?.*?\blist=| (?!(?:watch|embed|v|e|results)\b) ) @@ -2549,6 +2098,15 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'title': 'Игорь Клейнер - Playlists', 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', }, + }, { + # playlists, series + 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3', + 'playlist_mincount': 5, + 'info_dict': { + 'id': 'UCYO_jab_esuFRV4b17AJtAw', + 'title': '3Blue1Brown - Playlists', + 'description': 'md5:e1384e8a133307dd10edee76e875d62f', + }, }, { # playlists, singlepage 'url': 'https://www.youtube.com/user/ThirstForScience/playlists', @@ -2809,6 +2367,16 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): }, { 'url': 'https://www.youtube.com/TheYoungTurks/live', 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/hashtag/cctv9', + 'info_dict': { + 'id': 'cctv9', + 'title': '#cctv9', + }, + 'playlist_mincount': 350, + }, { + 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU', + 'only_matching': True, }] @classmethod @@ -2831,40 +2399,13 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): @staticmethod def _extract_grid_item_renderer(item): - for item_kind in ('Playlist', 'Video', 'Channel'): - renderer = item.get('grid%sRenderer' % item_kind) - if renderer: - return renderer - - def _extract_video(self, renderer): - video_id = renderer.get('videoId') - title = try_get( - renderer, - (lambda x: x['title']['runs'][0]['text'], - lambda x: x['title']['simpleText']), compat_str) - description = try_get( - renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'], - compat_str) - duration = parse_duration(try_get( - renderer, lambda x: x['lengthText']['simpleText'], compat_str)) - view_count_text = try_get( - renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or '' - view_count = str_to_int(self._search_regex( - r'^([\d,]+)', re.sub(r'\s', '', view_count_text), - 'view count', default=None)) - uploader = try_get( - renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str) - return { - '_type': 'url_transparent', - 'ie_key': YoutubeIE.ie_key(), - 'id': video_id, - 'url': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'view_count': view_count, - 'uploader': uploader, - } + assert isinstance(item, dict) + for key, renderer in item.items(): + if not key.startswith('grid') or not key.endswith('Renderer'): + continue + if not isinstance(renderer, dict): + continue + return renderer def _grid_entries(self, grid_renderer): for item in grid_renderer['items']: @@ -2874,7 +2415,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): if not isinstance(renderer, dict): continue title = try_get( - renderer, lambda x: x['title']['runs'][0]['text'], compat_str) + renderer, (lambda x: x['title']['runs'][0]['text'], + lambda x: x['title']['simpleText']), compat_str) # playlist playlist_id = renderer.get('playlistId') if playlist_id: @@ -2882,10 +2424,12 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'https://www.youtube.com/playlist?list=%s' % playlist_id, ie=YoutubeTabIE.ie_key(), video_id=playlist_id, video_title=title) + continue # video video_id = renderer.get('videoId') if video_id: yield self._extract_video(renderer) + continue # channel channel_id = renderer.get('channelId') if channel_id: @@ -2894,6 +2438,17 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): yield self.url_result( 'https://www.youtube.com/channel/%s' % channel_id, ie=YoutubeTabIE.ie_key(), video_title=title) + continue + # generic endpoint URL support + ep_url = urljoin('https://www.youtube.com/', try_get( + renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], + compat_str)) + if ep_url: + for ie in (YoutubeTabIE, YoutubePlaylistIE, YoutubeIE): + if ie.suitable(ep_url): + yield self.url_result( + ep_url, ie=ie.ie_key(), video_id=ie._match_id(ep_url), video_title=title) + break def _shelf_entries_from_content(self, shelf_renderer): content = shelf_renderer.get('content') @@ -2986,6 +2541,14 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): for entry in self._post_thread_entries(renderer): yield entry + def _rich_grid_entries(self, contents): + for content in contents: + video_renderer = try_get(content, lambda x: x['richItemRenderer']['content']['videoRenderer'], dict) + if video_renderer: + entry = self._video_entry(video_renderer) + if entry: + yield entry + @staticmethod def _build_continuation_query(continuation, ctp=None): query = { @@ -3013,9 +2576,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): next_continuation = cls._extract_next_continuation_data(renderer) if next_continuation: return next_continuation - contents = renderer.get('contents') - if not isinstance(contents, list): - return + contents = [] + for key in ('contents', 'items'): + contents.extend(try_get(renderer, lambda x: x[key], list) or []) for content in contents: if not isinstance(content, dict): continue @@ -3031,82 +2594,111 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): ctp = continuation_ep.get('clickTrackingParams') return YoutubeTabIE._build_continuation_query(continuation, ctp) - def _entries(self, tab, identity_token): + def _entries(self, tab, item_id, webpage): tab_content = try_get(tab, lambda x: x['content'], dict) if not tab_content: return slr_renderer = try_get(tab_content, lambda x: x['sectionListRenderer'], dict) - if not slr_renderer: - return - is_channels_tab = tab.get('title') == 'Channels' - continuation = None - slr_contents = try_get(slr_renderer, lambda x: x['contents'], list) or [] - for slr_content in slr_contents: - if not isinstance(slr_content, dict): - continue - is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict) - if not is_renderer: - continue - isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] - for isr_content in isr_contents: - if not isinstance(isr_content, dict): + if slr_renderer: + is_channels_tab = tab.get('title') == 'Channels' + continuation = None + slr_contents = try_get(slr_renderer, lambda x: x['contents'], list) or [] + for slr_content in slr_contents: + if not isinstance(slr_content, dict): continue - renderer = isr_content.get('playlistVideoListRenderer') - if renderer: - for entry in self._playlist_entries(renderer): - yield entry - continuation = self._extract_continuation(renderer) + is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict) + if not is_renderer: continue - renderer = isr_content.get('gridRenderer') - if renderer: - for entry in self._grid_entries(renderer): - yield entry - continuation = self._extract_continuation(renderer) - continue - renderer = isr_content.get('shelfRenderer') - if renderer: - for entry in self._shelf_entries(renderer, not is_channels_tab): - yield entry - continue - renderer = isr_content.get('backstagePostThreadRenderer') - if renderer: - for entry in self._post_thread_entries(renderer): - yield entry - continuation = self._extract_continuation(renderer) - continue - renderer = isr_content.get('videoRenderer') - if renderer: - entry = self._video_entry(renderer) - if entry: - yield entry + isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] + for isr_content in isr_contents: + if not isinstance(isr_content, dict): + continue + renderer = isr_content.get('playlistVideoListRenderer') + if renderer: + for entry in self._playlist_entries(renderer): + yield entry + continuation = self._extract_continuation(renderer) + continue + renderer = isr_content.get('gridRenderer') + if renderer: + for entry in self._grid_entries(renderer): + yield entry + continuation = self._extract_continuation(renderer) + continue + renderer = isr_content.get('shelfRenderer') + if renderer: + for entry in self._shelf_entries(renderer, not is_channels_tab): + yield entry + continue + renderer = isr_content.get('backstagePostThreadRenderer') + if renderer: + for entry in self._post_thread_entries(renderer): + yield entry + continuation = self._extract_continuation(renderer) + continue + renderer = isr_content.get('videoRenderer') + if renderer: + entry = self._video_entry(renderer) + if entry: + yield entry + if not continuation: + continuation = self._extract_continuation(is_renderer) if not continuation: - continuation = self._extract_continuation(is_renderer) + continuation = self._extract_continuation(slr_renderer) + else: + rich_grid_renderer = tab_content.get('richGridRenderer') + if not rich_grid_renderer: + return + for entry in self._rich_grid_entries(rich_grid_renderer.get('contents') or []): + yield entry + continuation = self._extract_continuation(rich_grid_renderer) - if not continuation: - continuation = self._extract_continuation(slr_renderer) + ytcfg = self._extract_ytcfg(item_id, webpage) + client_version = try_get( + ytcfg, lambda x: x['INNERTUBE_CLIENT_VERSION'], compat_str) or '2.20210407.08.00' headers = { 'x-youtube-client-name': '1', - 'x-youtube-client-version': '2.20201112.04.01', + 'x-youtube-client-version': client_version, + 'content-type': 'application/json', } + + context = try_get(ytcfg, lambda x: x['INNERTUBE_CONTEXT'], dict) or { + 'client': { + 'clientName': 'WEB', + 'clientVersion': client_version, + } + } + visitor_data = try_get(context, lambda x: x['client']['visitorData'], compat_str) + + identity_token = self._extract_identity_token(ytcfg, webpage) if identity_token: headers['x-youtube-identity-token'] = identity_token + data = { + 'context': context, + } + for page_num in itertools.count(1): if not continuation: break + if visitor_data: + headers['x-goog-visitor-id'] = visitor_data + data['continuation'] = continuation['continuation'] + data['clickTracking'] = { + 'clickTrackingParams': continuation['itct'] + } count = 0 retries = 3 while count <= retries: try: # Downloading page may result in intermittent 5xx HTTP error # that is usually worked around with a retry - browse = self._download_json( - 'https://www.youtube.com/browse_ajax', None, - 'Downloading page %d%s' - % (page_num, ' (retry #%d)' % count if count else ''), - headers=headers, query=continuation) + response = self._download_json( + 'https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + None, 'Downloading page %d%s' % (page_num, ' (retry #%d)' % count if count else ''), + headers=headers, data=json.dumps(data).encode('utf8')) break except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503): @@ -3114,12 +2706,12 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): if count <= retries: continue raise - if not browse: - break - response = try_get(browse, lambda x: x[1]['response'], dict) if not response: break + visitor_data = try_get( + response, lambda x: x['responseContext']['visitorData'], compat_str) or visitor_data + continuation_contents = try_get( response, lambda x: x['continuationContents'], dict) if continuation_contents: @@ -3142,12 +2734,20 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): continuation = self._extract_continuation(continuation_renderer) continue + on_response_received = dict_get(response, ('onResponseReceivedActions', 'onResponseReceivedEndpoints')) continuation_items = try_get( - response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list) + on_response_received, lambda x: x[0]['appendContinuationItemsAction']['continuationItems'], list) if continuation_items: continuation_item = continuation_items[0] if not isinstance(continuation_item, dict): continue + renderer = self._extract_grid_item_renderer(continuation_item) + if renderer: + grid_renderer = {'items': continuation_items} + for entry in self._grid_entries(grid_renderer): + yield entry + continuation = self._extract_continuation(grid_renderer) + continue renderer = continuation_item.get('playlistVideoRenderer') or continuation_item.get('itemSectionRenderer') if renderer: video_list_renderer = {'contents': continuation_items} @@ -3155,6 +2755,19 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): yield entry continuation = self._extract_continuation(video_list_renderer) continue + renderer = continuation_item.get('backstagePostThreadRenderer') + if renderer: + continuation_renderer = {'contents': continuation_items} + for entry in self._post_thread_continuation_entries(continuation_renderer): + yield entry + continuation = self._extract_continuation(continuation_renderer) + continue + renderer = continuation_item.get('richItemRenderer') + if renderer: + for entry in self._rich_grid_entries(continuation_items): + yield entry + continuation = self._extract_continuation({'contents': continuation_items}) + continue break @@ -3207,11 +2820,12 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): alerts.append(text) return '\n'.join(alerts) - def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token): + def _extract_from_tabs(self, item_id, webpage, data, tabs): selected_tab = self._extract_selected_tab(tabs) renderer = try_get( data, lambda x: x['metadata']['channelMetadataRenderer'], dict) - playlist_id = title = description = None + playlist_id = item_id + title = description = None if renderer: channel_title = renderer.get('title') or item_id tab_title = selected_tab.get('title') @@ -3220,14 +2834,18 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): title += ' - %s' % tab_title description = renderer.get('description') playlist_id = renderer.get('externalId') - renderer = try_get( - data, lambda x: x['metadata']['playlistMetadataRenderer'], dict) - if renderer: - title = renderer.get('title') - description = None - playlist_id = item_id + else: + renderer = try_get( + data, lambda x: x['metadata']['playlistMetadataRenderer'], dict) + if renderer: + title = renderer.get('title') + else: + renderer = try_get( + data, lambda x: x['header']['hashtagHeaderRenderer'], dict) + if renderer: + title = try_get(renderer, lambda x: x['hashtag']['simpleText']) playlist = self.playlist_result( - self._entries(selected_tab, identity_token), + self._entries(selected_tab, item_id, webpage), playlist_id=playlist_id, playlist_title=title, playlist_description=description) playlist.update(self._extract_uploader(data)) @@ -3251,8 +2869,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): self._playlist_entries(playlist), playlist_id=playlist_id, playlist_title=title) - def _extract_identity_token(self, webpage, item_id): - ytcfg = self._extract_ytcfg(item_id, webpage) + def _extract_identity_token(self, ytcfg, webpage): if ytcfg: token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str) if token: @@ -3266,7 +2883,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): url = compat_urlparse.urlunparse( compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com')) # Handle both video/playlist URLs - qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + qs = parse_qs(url) video_id = qs.get('v', [None])[0] playlist_id = qs.get('list', [None])[0] if video_id and playlist_id: @@ -3275,12 +2892,11 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id) self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) webpage = self._download_webpage(url, item_id) - identity_token = self._extract_identity_token(webpage, item_id) data = self._extract_yt_initial_data(item_id, webpage) tabs = try_get( data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list) if tabs: - return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token) + return self._extract_from_tabs(item_id, webpage, data, tabs) playlist = try_get( data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict) if playlist: @@ -3363,12 +2979,19 @@ class YoutubePlaylistIE(InfoExtractor): @classmethod def suitable(cls, url): - return False if YoutubeTabIE.suitable(url) else super( - YoutubePlaylistIE, cls).suitable(url) + if YoutubeTabIE.suitable(url): + return False + # Hack for lazy extractors until more generic solution is implemented + # (see #28780) + from .youtube import parse_qs + qs = parse_qs(url) + if qs.get('v', [None])[0]: + return False + return super(YoutubePlaylistIE, cls).suitable(url) def _real_extract(self, url): playlist_id = self._match_id(url) - qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + qs = parse_qs(url) if not qs: qs = {'list': playlist_id} return self.url_result( diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 5ed2946c2..4dd56f66d 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -7,7 +7,9 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( determine_ext, + float_or_none, int_or_none, + merge_dicts, NO_DEFAULT, orderedSet, parse_codecs, @@ -21,49 +23,17 @@ from ..utils import ( class ZDFBaseIE(InfoExtractor): - def _call_api(self, url, player, referrer, video_id, item): - return self._download_json( - url, video_id, 'Downloading JSON %s' % item, - headers={ - 'Referer': referrer, - 'Api-Auth': 'Bearer %s' % player['apiToken'], - }) - - def _extract_player(self, webpage, video_id, fatal=True): - return self._parse_json( - self._search_regex( - r'(?s)data-zdfplayer-jsb=(["\'])(?P<json>{.+?})\1', webpage, - 'player JSON', default='{}' if not fatal else NO_DEFAULT, - group='json'), - video_id) - - -class ZDFIE(ZDFBaseIE): - _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?]+)\.html' - _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd') _GEO_COUNTRIES = ['DE'] + _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd') - _TESTS = [{ - 'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html', - 'info_dict': { - 'id': 'die-magie-der-farben-von-koenigspurpur-und-jeansblau-100', - 'ext': 'mp4', - 'title': 'Die Magie der Farben (2/2)', - 'description': 'md5:a89da10c928c6235401066b60a6d5c1a', - 'duration': 2615, - 'timestamp': 1465021200, - 'upload_date': '20160604', - }, - }, { - 'url': 'https://www.zdf.de/service-und-hilfe/die-neue-zdf-mediathek/zdfmediathek-trailer-100.html', - 'only_matching': True, - }, { - 'url': 'https://www.zdf.de/filme/taunuskrimi/die-lebenden-und-die-toten-1---ein-taunuskrimi-100.html', - 'only_matching': True, - }, { - 'url': 'https://www.zdf.de/dokumentation/planet-e/planet-e-uebersichtsseite-weitere-dokumentationen-von-planet-e-100.html', - 'only_matching': True, - }] + def _call_api(self, url, video_id, item, api_token=None, referrer=None): + headers = {} + if api_token: + headers['Api-Auth'] = 'Bearer %s' % api_token + if referrer: + headers['Referer'] = referrer + return self._download_json( + url, video_id, 'Downloading JSON %s' % item, headers=headers) @staticmethod def _extract_subtitles(src): @@ -109,20 +79,11 @@ class ZDFIE(ZDFBaseIE): }) formats.append(f) - def _extract_entry(self, url, player, content, video_id): - title = content.get('title') or content['teaserHeadline'] - - t = content['mainVideoContent']['http://zdf.de/rels/target'] - - ptmd_path = t.get('http://zdf.de/rels/streams/ptmd') - - if not ptmd_path: - ptmd_path = t[ - 'http://zdf.de/rels/streams/ptmd-template'].replace( - '{playerId}', 'ngplayer_2_4') - + def _extract_ptmd(self, ptmd_url, video_id, api_token, referrer): ptmd = self._call_api( - urljoin(url, ptmd_path), player, url, video_id, 'metadata') + ptmd_url, video_id, 'metadata', api_token, referrer) + + content_id = ptmd.get('basename') or ptmd_url.split('/')[-1] formats = [] track_uris = set() @@ -140,7 +101,7 @@ class ZDFIE(ZDFBaseIE): continue for track in tracks: self._extract_format( - video_id, formats, track_uris, { + content_id, formats, track_uris, { 'url': track.get('uri'), 'type': f.get('type'), 'mimeType': f.get('mimeType'), @@ -149,6 +110,103 @@ class ZDFIE(ZDFBaseIE): }) self._sort_formats(formats) + duration = float_or_none(try_get( + ptmd, lambda x: x['attributes']['duration']['value']), scale=1000) + + return { + 'extractor_key': ZDFIE.ie_key(), + 'id': content_id, + 'duration': duration, + 'formats': formats, + 'subtitles': self._extract_subtitles(ptmd), + } + + def _extract_player(self, webpage, video_id, fatal=True): + return self._parse_json( + self._search_regex( + r'(?s)data-zdfplayer-jsb=(["\'])(?P<json>{.+?})\1', webpage, + 'player JSON', default='{}' if not fatal else NO_DEFAULT, + group='json'), + video_id) + + +class ZDFIE(ZDFBaseIE): + _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html' + _TESTS = [{ + # Same as https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html + 'url': 'https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html', + 'md5': '34ec321e7eb34231fd88616c65c92db0', + 'info_dict': { + 'id': '210222_phx_nachgehakt_corona_protest', + 'ext': 'mp4', + 'title': 'Wohin führt der Protest in der Pandemie?', + 'description': 'md5:7d643fe7f565e53a24aac036b2122fbd', + 'duration': 1691, + 'timestamp': 1613948400, + 'upload_date': '20210221', + }, + }, { + # Same as https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html + 'url': 'https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html', + 'md5': '0aff3e7bc72c8813f5e0fae333316a1d', + 'info_dict': { + 'id': '141007_ab18_10wochensommer_film', + 'ext': 'mp4', + 'title': 'Ab 18! - 10 Wochen Sommer', + 'description': 'md5:8253f41dc99ce2c3ff892dac2d65fe26', + 'duration': 2660, + 'timestamp': 1608604200, + 'upload_date': '20201222', + }, + }, { + 'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html', + 'info_dict': { + 'id': '151025_magie_farben2_tex', + 'ext': 'mp4', + 'title': 'Die Magie der Farben (2/2)', + 'description': 'md5:a89da10c928c6235401066b60a6d5c1a', + 'duration': 2615, + 'timestamp': 1465021200, + 'upload_date': '20160604', + }, + }, { + # Same as https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche + 'url': 'https://www.zdf.de/politik/phoenix-sendungen/die-gesten-der-maechtigen-100.html', + 'only_matching': True, + }, { + # Same as https://www.3sat.de/film/spielfilm/der-hauptmann-100.html + 'url': 'https://www.zdf.de/filme/filme-sonstige/der-hauptmann-112.html', + 'only_matching': True, + }, { + # Same as https://www.3sat.de/wissen/nano/nano-21-mai-2019-102.html, equal media ids + 'url': 'https://www.zdf.de/wissen/nano/nano-21-mai-2019-102.html', + 'only_matching': True, + }, { + 'url': 'https://www.zdf.de/service-und-hilfe/die-neue-zdf-mediathek/zdfmediathek-trailer-100.html', + 'only_matching': True, + }, { + 'url': 'https://www.zdf.de/filme/taunuskrimi/die-lebenden-und-die-toten-1---ein-taunuskrimi-100.html', + 'only_matching': True, + }, { + 'url': 'https://www.zdf.de/dokumentation/planet-e/planet-e-uebersichtsseite-weitere-dokumentationen-von-planet-e-100.html', + 'only_matching': True, + }] + + def _extract_entry(self, url, player, content, video_id): + title = content.get('title') or content['teaserHeadline'] + + t = content['mainVideoContent']['http://zdf.de/rels/target'] + + ptmd_path = t.get('http://zdf.de/rels/streams/ptmd') + + if not ptmd_path: + ptmd_path = t[ + 'http://zdf.de/rels/streams/ptmd-template'].replace( + '{playerId}', 'ngplayer_2_4') + + info = self._extract_ptmd( + urljoin(url, ptmd_path), video_id, player['apiToken'], url) + thumbnails = [] layouts = try_get( content, lambda x: x['teaserImageRef']['layouts'], dict) @@ -169,33 +227,33 @@ class ZDFIE(ZDFBaseIE): }) thumbnails.append(thumbnail) - return { - 'id': video_id, + return merge_dicts(info, { 'title': title, 'description': content.get('leadParagraph') or content.get('teasertext'), 'duration': int_or_none(t.get('duration')), 'timestamp': unified_timestamp(content.get('editorialDate')), 'thumbnails': thumbnails, - 'subtitles': self._extract_subtitles(ptmd), - 'formats': formats, - } + }) def _extract_regular(self, url, player, video_id): content = self._call_api( - player['content'], player, url, video_id, 'content') + player['content'], video_id, 'content', player['apiToken'], url) return self._extract_entry(player['content'], player, content, video_id) def _extract_mobile(self, video_id): - document = self._download_json( + video = self._download_json( 'https://zdf-cdn.live.cellular.de/mediathekV2/document/%s' % video_id, - video_id)['document'] + video_id) + + document = video['document'] title = document['titel'] + content_id = document['basename'] formats = [] format_urls = set() for f in document['formitaeten']: - self._extract_format(video_id, formats, format_urls, f) + self._extract_format(content_id, formats, format_urls, f) self._sort_formats(formats) thumbnails = [] @@ -213,12 +271,12 @@ class ZDFIE(ZDFBaseIE): }) return { - 'id': video_id, + 'id': content_id, 'title': title, 'description': document.get('beschreibung'), 'duration': int_or_none(document.get('length')), - 'timestamp': unified_timestamp(try_get( - document, lambda x: x['meta']['editorialDate'], compat_str)), + 'timestamp': unified_timestamp(document.get('date')) or unified_timestamp( + try_get(video, lambda x: x['meta']['editorialDate'], compat_str)), 'thumbnails': thumbnails, 'subtitles': self._extract_subtitles(document), 'formats': formats, diff --git a/youtube_dl/extractor/zhihu.py b/youtube_dl/extractor/zhihu.py new file mode 100644 index 000000000..d1ed55be3 --- /dev/null +++ b/youtube_dl/extractor/zhihu.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import float_or_none, int_or_none + + +class ZhihuIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?zhihu\.com/zvideo/(?P<id>[0-9]+)' + _TEST = { + 'url': 'https://www.zhihu.com/zvideo/1342930761977176064', + 'md5': 'c8d4c9cd72dd58e6f9bc9c2c84266464', + 'info_dict': { + 'id': '1342930761977176064', + 'ext': 'mp4', + 'title': '写春联也太难了吧!', + 'thumbnail': r're:^https?://.*\.jpg', + 'uploader': '桥半舫', + 'timestamp': 1612959715, + 'upload_date': '20210210', + 'uploader_id': '244ecb13b0fd7daf92235288c8ca3365', + 'duration': 146.333, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + zvideo = self._download_json( + 'https://www.zhihu.com/api/v4/zvideos/' + video_id, video_id) + title = zvideo['title'] + video = zvideo.get('video') or {} + + formats = [] + for format_id, q in (video.get('playlist') or {}).items(): + play_url = q.get('url') or q.get('play_url') + if not play_url: + continue + formats.append({ + 'asr': int_or_none(q.get('sample_rate')), + 'filesize': int_or_none(q.get('size')), + 'format_id': format_id, + 'fps': int_or_none(q.get('fps')), + 'height': int_or_none(q.get('height')), + 'tbr': float_or_none(q.get('bitrate')), + 'url': play_url, + 'width': int_or_none(q.get('width')), + }) + self._sort_formats(formats) + + author = zvideo.get('author') or {} + url_token = author.get('url_token') + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': video.get('thumbnail') or zvideo.get('image_url'), + 'uploader': author.get('name'), + 'timestamp': int_or_none(zvideo.get('published_at')), + 'uploader_id': author.get('id'), + 'uploader_url': 'https://www.zhihu.com/people/' + url_token if url_token else None, + 'duration': float_or_none(video.get('duration')), + 'view_count': int_or_none(zvideo.get('play_count')), + 'like_count': int_or_none(zvideo.get('liked_count')), + 'comment_count': int_or_none(zvideo.get('comment_count')), + } diff --git a/youtube_dl/extractor/zingmp3.py b/youtube_dl/extractor/zingmp3.py index adfdcaabf..207c04f5e 100644 --- a/youtube_dl/extractor/zingmp3.py +++ b/youtube_dl/extractor/zingmp3.py @@ -1,93 +1,94 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, - update_url_query, ) -class ZingMp3BaseInfoExtractor(InfoExtractor): +class ZingMp3BaseIE(InfoExtractor): + _VALID_URL_TMPL = r'https?://(?:mp3\.zing|zingmp3)\.vn/(?:%s)/[^/]+/(?P<id>\w+)\.html' + _GEO_COUNTRIES = ['VN'] - def _extract_item(self, item, page_type, fatal=True): - error_message = item.get('msg') - if error_message: - if not fatal: - return - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error_message), - expected=True) + def _extract_item(self, item, fatal): + item_id = item['id'] + title = item.get('name') or item['title'] formats = [] - for quality, source_url in zip(item.get('qualities') or item.get('quality', []), item.get('source_list') or item.get('source', [])): - if not source_url or source_url == 'require vip': + for k, v in (item.get('source') or {}).items(): + if not v: continue - if not re.match(r'https?://', source_url): - source_url = '//' + source_url - source_url = self._proto_relative_url(source_url, 'http:') - quality_num = int_or_none(quality) - f = { - 'format_id': quality, - 'url': source_url, - } - if page_type == 'video': - f.update({ - 'height': quality_num, - 'ext': 'mp4', - }) + if k in ('mp4', 'hls'): + for res, video_url in v.items(): + if not video_url: + continue + if k == 'hls': + formats.extend(self._extract_m3u8_formats( + video_url, item_id, 'mp4', + 'm3u8_native', m3u8_id=k, fatal=False)) + elif k == 'mp4': + formats.append({ + 'format_id': 'mp4-' + res, + 'url': video_url, + 'height': int_or_none(self._search_regex( + r'^(\d+)p', res, 'resolution', default=None)), + }) else: - f.update({ - 'abr': quality_num, + formats.append({ 'ext': 'mp3', + 'format_id': k, + 'tbr': int_or_none(k), + 'url': self._proto_relative_url(v), + 'vcodec': 'none', }) - formats.append(f) + if not formats: + if not fatal: + return + msg = item['msg'] + if msg == 'Sorry, this content is not available in your country.': + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + raise ExtractorError(msg, expected=True) + self._sort_formats(formats) - cover = item.get('cover') + subtitles = None + lyric = item.get('lyric') + if lyric: + subtitles = { + 'origin': [{ + 'url': lyric, + }], + } + + album = item.get('album') or {} return { - 'title': (item.get('name') or item.get('title')).strip(), + 'id': item_id, + 'title': title, 'formats': formats, - 'thumbnail': 'http:/' + cover if cover else None, - 'artist': item.get('artist'), + 'thumbnail': item.get('thumbnail'), + 'subtitles': subtitles, + 'duration': int_or_none(item.get('duration')), + 'track': title, + 'artist': item.get('artists_names'), + 'album': album.get('name') or album.get('title'), + 'album_artist': album.get('artists_names'), } - def _extract_player_json(self, player_json_url, id, page_type, playlist_title=None): - player_json = self._download_json(player_json_url, id, 'Downloading Player JSON') - items = player_json['data'] - if 'item' in items: - items = items['item'] - - if len(items) == 1: - # one single song - data = self._extract_item(items[0], page_type) - data['id'] = id - - return data - else: - # playlist of songs - entries = [] - - for i, item in enumerate(items, 1): - entry = self._extract_item(item, page_type, fatal=False) - if not entry: - continue - entry['id'] = '%s-%d' % (id, i) - entries.append(entry) - - return { - '_type': 'playlist', - 'id': id, - 'title': playlist_title, - 'entries': entries, - } + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage( + url.replace('://zingmp3.vn/', '://mp3.zing.vn/'), + page_id, query={'play_song': 1}) + data_path = self._search_regex( + r'data-xml="([^"]+)', webpage, 'data path') + return self._process_data(self._download_json( + 'https://mp3.zing.vn/xhr' + data_path, page_id)['data']) -class ZingMp3IE(ZingMp3BaseInfoExtractor): - _VALID_URL = r'https?://mp3\.zing\.vn/(?:bai-hat|album|playlist|video-clip)/[^/]+/(?P<id>\w+)\.html' +class ZingMp3IE(ZingMp3BaseIE): + _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'bai-hat|video-clip' _TESTS = [{ 'url': 'http://mp3.zing.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html', 'md5': 'ead7ae13693b3205cbc89536a077daed', @@ -95,49 +96,66 @@ class ZingMp3IE(ZingMp3BaseInfoExtractor): 'id': 'ZWZB9WAB', 'title': 'Xa Mãi Xa', 'ext': 'mp3', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.+\.jpg', + 'subtitles': { + 'origin': [{ + 'ext': 'lrc', + }] + }, + 'duration': 255, + 'track': 'Xa Mãi Xa', + 'artist': 'Bảo Thy', + 'album': 'Special Album', + 'album_artist': 'Bảo Thy', }, }, { - 'url': 'http://mp3.zing.vn/video-clip/Let-It-Go-Frozen-OST-Sungha-Jung/ZW6BAEA0.html', - 'md5': '870295a9cd8045c0e15663565902618d', + 'url': 'https://mp3.zing.vn/video-clip/Suong-Hoa-Dua-Loi-K-ICM-RYO/ZO8ZF7C7.html', + 'md5': 'e9c972b693aa88301ef981c8151c4343', 'info_dict': { - 'id': 'ZW6BAEA0', - 'title': 'Let It Go (Frozen OST)', + 'id': 'ZO8ZF7C7', + 'title': 'Sương Hoa Đưa Lối', 'ext': 'mp4', + 'thumbnail': r're:^https?://.+\.jpg', + 'duration': 207, + 'track': 'Sương Hoa Đưa Lối', + 'artist': 'K-ICM, RYO', }, }, { - 'url': 'http://mp3.zing.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html', - 'info_dict': { - '_type': 'playlist', - 'id': 'ZWZBWDAF', - 'title': 'Lâu Đài Tình Ái - Bằng Kiều,Minh Tuyết | Album 320 lossless', - }, - 'playlist_count': 10, - 'skip': 'removed at the request of the owner', - }, { - 'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html', + 'url': 'https://zingmp3.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html', 'only_matching': True, }] IE_NAME = 'zingmp3' IE_DESC = 'mp3.zing.vn' - def _real_extract(self, url): - page_id = self._match_id(url) + def _process_data(self, data): + return self._extract_item(data, True) - webpage = self._download_webpage(url, page_id) - player_json_url = self._search_regex([ - r'data-xml="([^"]+)', - r'&xmlURL=([^&]+)&' - ], webpage, 'player xml url') +class ZingMp3AlbumIE(ZingMp3BaseIE): + _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'album|playlist' + _TESTS = [{ + 'url': 'http://mp3.zing.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html', + 'info_dict': { + '_type': 'playlist', + 'id': 'ZWZBWDAF', + 'title': 'Lâu Đài Tình Ái', + }, + 'playlist_count': 10, + }, { + 'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html', + 'only_matching': True, + }, { + 'url': 'https://zingmp3.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html', + 'only_matching': True, + }] + IE_NAME = 'zingmp3:album' - playlist_title = None - page_type = self._search_regex(r'/(?:html5)?xml/([^/-]+)', player_json_url, 'page type') - if page_type == 'video': - player_json_url = update_url_query(player_json_url, {'format': 'json'}) - else: - player_json_url = player_json_url.replace('/xml/', '/html5xml/') - if page_type == 'album': - playlist_title = self._og_search_title(webpage) - - return self._extract_player_json(player_json_url, page_id, page_type, playlist_title) + def _process_data(self, data): + def entries(): + for item in (data.get('items') or []): + entry = self._extract_item(item, False) + if entry: + yield entry + info = data.get('info') or {} + return self.playlist_result( + entries(), info.get('id'), info.get('name') or info.get('title')) diff --git a/youtube_dl/extractor/zoom.py b/youtube_dl/extractor/zoom.py new file mode 100644 index 000000000..db073d91d --- /dev/null +++ b/youtube_dl/extractor/zoom.py @@ -0,0 +1,68 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + js_to_json, + parse_filesize, + urlencode_postdata, +) + + +class ZoomIE(InfoExtractor): + IE_NAME = 'zoom' + _VALID_URL = r'(?P<base_url>https?://(?:[^.]+\.)?zoom.us/)rec(?:ording)?/(?:play|share)/(?P<id>[A-Za-z0-9_.-]+)' + _TEST = { + 'url': 'https://economist.zoom.us/rec/play/dUk_CNBETmZ5VA2BwEl-jjakPpJ3M1pcfVYAPRsoIbEByGsLjUZtaa4yCATQuOL3der8BlTwxQePl_j0.EImBkXzTIaPvdZO5', + 'md5': 'ab445e8c911fddc4f9adc842c2c5d434', + 'info_dict': { + 'id': 'dUk_CNBETmZ5VA2BwEl-jjakPpJ3M1pcfVYAPRsoIbEByGsLjUZtaa4yCATQuOL3der8BlTwxQePl_j0.EImBkXzTIaPvdZO5', + 'ext': 'mp4', + 'title': 'China\'s "two sessions" and the new five-year plan', + } + } + + def _real_extract(self, url): + base_url, play_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, play_id) + + try: + form = self._form_hidden_inputs('password_form', webpage) + except ExtractorError: + form = None + if form: + password = self._downloader.params.get('videopassword') + if not password: + raise ExtractorError( + 'This video is protected by a passcode, use the --video-password option', expected=True) + is_meeting = form.get('useWhichPasswd') == 'meeting' + validation = self._download_json( + base_url + 'rec/validate%s_passwd' % ('_meet' if is_meeting else ''), + play_id, 'Validating passcode', 'Wrong passcode', data=urlencode_postdata({ + 'id': form[('meet' if is_meeting else 'file') + 'Id'], + 'passwd': password, + 'action': form.get('action'), + })) + if not validation.get('status'): + raise ExtractorError(validation['errorMessage'], expected=True) + webpage = self._download_webpage(url, play_id) + + data = self._parse_json(self._search_regex( + r'(?s)window\.__data__\s*=\s*({.+?});', + webpage, 'data'), play_id, js_to_json) + + return { + 'id': play_id, + 'title': data['topic'], + 'url': data['viewMp4Url'], + 'width': int_or_none(data.get('viewResolvtionsWidth')), + 'height': int_or_none(data.get('viewResolvtionsHeight')), + 'http_headers': { + 'Referer': base_url, + }, + 'filesize_approx': parse_filesize(data.get('fileSize')), + } diff --git a/youtube_dl/extractor/zype.py b/youtube_dl/extractor/zype.py index 5288f40d8..f20f953cb 100644 --- a/youtube_dl/extractor/zype.py +++ b/youtube_dl/extractor/zype.py @@ -87,11 +87,16 @@ class ZypeIE(InfoExtractor): r'(["\'])(?P<url>(?:(?!\1).)+\.m3u8(?:(?!\1).)*)\1', body, 'm3u8 url', group='url', default=None) if not m3u8_url: - source = self._parse_json(self._search_regex( - r'(?s)sources\s*:\s*\[\s*({.+?})\s*\]', body, - 'source'), video_id, js_to_json) - if source.get('integration') == 'verizon-media': - m3u8_url = 'https://content.uplynk.com/%s.m3u8' % source['id'] + source = self._search_regex( + r'(?s)sources\s*:\s*\[\s*({.+?})\s*\]', body, 'source') + + def get_attr(key): + return self._search_regex( + r'\b%s\s*:\s*([\'"])(?P<val>(?:(?!\1).)+)\1' % key, + source, key, group='val') + + if get_attr('integration') == 'verizon-media': + m3u8_url = 'https://content.uplynk.com/%s.m3u8' % get_attr('id') formats = self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') text_tracks = self._search_regex( diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 3000ba41e..0a0641bd4 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -689,6 +689,10 @@ def parseOpts(overrideArguments=None): '-o', '--output', dest='outtmpl', metavar='TEMPLATE', help=('Output filename template, see the "OUTPUT TEMPLATE" for all the info')) + filesystem.add_option( + '--output-na-placeholder', + dest='outtmpl_na_placeholder', metavar='PLACEHOLDER', default='NA', + help=('Placeholder value for unavailable meta fields in output filename template (default is "%default")')) filesystem.add_option( '--autonumber-size', dest='autonumber_size', metavar='NUMBER', type=int, @@ -764,7 +768,7 @@ def parseOpts(overrideArguments=None): action='store_true', dest='rm_cachedir', help='Delete all filesystem cache files') - thumbnail = optparse.OptionGroup(parser, 'Thumbnail images') + thumbnail = optparse.OptionGroup(parser, 'Thumbnail Options') thumbnail.add_option( '--write-thumbnail', action='store_true', dest='writethumbnail', default=False, @@ -782,7 +786,7 @@ def parseOpts(overrideArguments=None): postproc.add_option( '-x', '--extract-audio', action='store_true', dest='extractaudio', default=False, - help='Convert video files to audio-only files (requires ffmpeg or avconv and ffprobe or avprobe)') + help='Convert video files to audio-only files (requires ffmpeg/avconv and ffprobe/avprobe)') postproc.add_option( '--audio-format', metavar='FORMAT', dest='audioformat', default='best', help='Specify audio format: "best", "aac", "flac", "mp3", "m4a", "opus", "vorbis", or "wav"; "%default" by default; No effect without -x') diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py index 5a3359588..3990908b6 100644 --- a/youtube_dl/postprocessor/embedthumbnail.py +++ b/youtube_dl/postprocessor/embedthumbnail.py @@ -89,10 +89,14 @@ class EmbedThumbnailPP(FFmpegPostProcessor): os.rename(encodeFilename(temp_filename), encodeFilename(filename)) elif info['ext'] in ['m4a', 'mp4']: - if not check_executable('AtomicParsley', ['-v']): + atomicparsley = next((x + for x in ['AtomicParsley', 'atomicparsley'] + if check_executable(x, ['-v'])), None) + + if atomicparsley is None: raise EmbedThumbnailPPError('AtomicParsley was not found. Please install.') - cmd = [encodeFilename('AtomicParsley', True), + cmd = [encodeFilename(atomicparsley, True), encodeFilename(filename, True), encodeArgument('--artwork'), encodeFilename(thumbnail_filename, True), diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 5f7298345..9f76c9d4e 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -231,7 +231,10 @@ class FFmpegPostProcessor(PostProcessor): stdout, stderr = p.communicate() if p.returncode != 0: stderr = stderr.decode('utf-8', 'replace') - msg = stderr.strip().split('\n')[-1] + msgs = stderr.strip().split('\n') + msg = msgs[-1] + if self._downloader.params.get('verbose', False): + self._downloader.to_screen('[debug] ' + '\n'.join(msgs[:-1])) raise FFmpegPostProcessorError(msg) self.try_utime(out_path, oldest_mtime, oldest_mtime) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 8e4d144c9..e722eed58 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -39,6 +39,7 @@ import zlib from .compat import ( compat_HTMLParseError, compat_HTMLParser, + compat_HTTPError, compat_basestring, compat_chr, compat_cookiejar, @@ -2879,12 +2880,60 @@ class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor): class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler): - if sys.version_info[0] < 3: - def redirect_request(self, req, fp, code, msg, headers, newurl): - # On python 2 urlh.geturl() may sometimes return redirect URL - # as byte string instead of unicode. This workaround allows - # to force it always return unicode. - return compat_urllib_request.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, headers, compat_str(newurl)) + """YoutubeDL redirect handler + + The code is based on HTTPRedirectHandler implementation from CPython [1]. + + This redirect handler solves two issues: + - ensures redirect URL is always unicode under python 2 + - introduces support for experimental HTTP response status code + 308 Permanent Redirect [2] used by some sites [3] + + 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py + 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308 + 3. https://github.com/ytdl-org/youtube-dl/issues/28768 + """ + + http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302 + + def redirect_request(self, req, fp, code, msg, headers, newurl): + """Return a Request or None in response to a redirect. + + This is called by the http_error_30x methods when a + redirection response is received. If a redirection should + take place, return a new Request to allow http_error_30x to + perform the redirect. Otherwise, raise HTTPError if no-one + else should try to handle this url. Return None if you can't + but another Handler might. + """ + m = req.get_method() + if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD") + or code in (301, 302, 303) and m == "POST")): + raise compat_HTTPError(req.full_url, code, msg, headers, fp) + # Strictly (according to RFC 2616), 301 or 302 in response to + # a POST MUST NOT cause a redirection without confirmation + # from the user (of urllib.request, in this case). In practice, + # essentially all clients do redirect in this case, so we do + # the same. + + # On python 2 urlh.geturl() may sometimes return redirect URL + # as byte string instead of unicode. This workaround allows + # to force it always return unicode. + if sys.version_info[0] < 3: + newurl = compat_str(newurl) + + # Be conciliant with URIs containing a space. This is mainly + # redundant with the more complete encoding done in http_error_302(), + # but it is kept for compatibility with other callers. + newurl = newurl.replace(' ', '%20') + + CONTENT_HEADERS = ("content-length", "content-type") + # NB: don't use dict comprehension for python 2.6 compatibility + newheaders = dict((k, v) for k, v in req.headers.items() + if k.lower() not in CONTENT_HEADERS) + return compat_urllib_request.Request( + newurl, headers=newheaders, origin_req_host=req.origin_req_host, + unverifiable=True) def extract_timezone(date_str): diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 0d9659b2b..b82fbc702 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.01.08' +__version__ = '2021.12.17'