From e50c3500b43d80e4492569c4b4523c4379c6fbb2 Mon Sep 17 00:00:00 2001 From: siddharth ravikumar Date: Thu, 2 Jun 2022 20:51:11 -0400 Subject: [PATCH] [extractor/npr] Use stream url from json-ld (#3455) Closes #1934 Authored by: r5d --- yt_dlp/extractor/common.py | 4 +++- yt_dlp/extractor/npr.py | 17 +++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index d88d5e6f9..71e982f02 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1461,7 +1461,7 @@ class InfoExtractor: assert e['@type'] == 'VideoObject' author = e.get('author') info.update({ - 'url': url_or_none(e.get('contentUrl')), + 'url': traverse_obj(e, 'contentUrl', 'embedUrl', expected_type=url_or_none), 'title': unescapeHTML(e.get('name')), 'description': unescapeHTML(e.get('description')), 'thumbnails': [{'url': url} @@ -1529,6 +1529,8 @@ class InfoExtractor: }) if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject': extract_video_object(e['video'][0]) + elif traverse_obj(e, ('subjectOf', 0, '@type')) == 'VideoObject': + extract_video_object(e['subjectOf'][0]) elif item_type == 'VideoObject': extract_video_object(e) if expected_type is None: diff --git a/yt_dlp/extractor/npr.py b/yt_dlp/extractor/npr.py index 6d93f154c..e4ff8d6c2 100644 --- a/yt_dlp/extractor/npr.py +++ b/yt_dlp/extractor/npr.py @@ -51,6 +51,15 @@ class NprIE(InfoExtractor): # multimedia, no formats, stream 'url': 'https://www.npr.org/2020/02/14/805476846/laura-stevenson-tiny-desk-concert', 'only_matching': True, + }, { + 'url': 'https://www.npr.org/2022/03/15/1084896560/bonobo-tiny-desk-home-concert', + 'info_dict': { + 'id': '1086468851', + 'ext': 'mp4', + 'title': 'Bonobo: Tiny Desk (Home) Concert', + 'duration': 1061, + 'thumbnail': r're:^https?://media.npr.org/assets/img/.*\.jpg$', + }, }] def _real_extract(self, url): @@ -65,6 +74,10 @@ class NprIE(InfoExtractor): })['list']['story'][0] playlist_title = story.get('title', {}).get('$text') + # Fetch the JSON-LD from the npr page. + json_ld = self._search_json_ld( + self._download_webpage(url, playlist_id), playlist_id, 'NewsArticle', fatal=False) + KNOWN_FORMATS = ('threegp', 'm3u8', 'smil', 'mp4', 'mp3') quality = qualities(KNOWN_FORMATS) @@ -110,6 +123,10 @@ class NprIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( stream_url, stream_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + + if not formats and json_ld.get('url'): + formats.extend(self._extract_m3u8_formats(json_ld['url'], media_id, 'mp4', m3u8_id='hls', fatal=False)) + self._sort_formats(formats) entries.append({