[extractors/podbayfm] Add extractor (#4971)

Authored by: schnusch
2022-10-10 22:31:01 +02:00 · 2022-10-10 22:31:01 +02:00 · 2c98d99818
parent 226c0f3a54
commit 2c98d99818
3 changed files with 78 additions and 1 deletions
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@ -1345,6 +1345,7 @@ from .pluralsight import (
    PluralsightIE,
    PluralsightCourseIE,
 )
+from .podbayfm import PodbayFMIE, PodbayFMChannelIE
 from .podchaser import PodchaserIE
 from .podomatic import PodomaticIE
 from .pokemon import (
--- a/yt_dlp/extractor/podbayfm.py
+++ b/yt_dlp/extractor/podbayfm.py
@ -0,0 +1,75 @@
+from .common import InfoExtractor
+from ..utils import OnDemandPagedList, int_or_none, jwt_decode_hs256, try_call
+
+
+def result_from_props(props, episode_id=None):
+    """Build an info dict for one episode from its ``props`` mapping.
+
+    ``episode_id`` is only used as the ``id`` when ``props`` has no truthy
+    ``podcast_id``. ``props['mediaURL']`` is mandatory; a missing key raises
+    ``KeyError``. All other fields are optional lookups.
+    """
+    def _thumbnail_url():
+        # the 'image' value is passed through the JWT decoder and the
+        # decoded payload's 'url' entry is taken as the thumbnail
+        return jwt_decode_hs256(props['image'])['url']
+
+    info = {'id': props.get('podcast_id') or episode_id}
+    info['title'] = props.get('title')
+    info['url'] = props['mediaURL']
+    info['ext'] = 'mp3'
+    # NOTE(review): relies on try_call yielding None when the lookup or the
+    # decoding fails -- confirm against utils.try_call
+    info['thumbnail'] = try_call(_thumbnail_url)
+    info['timestamp'] = props.get('timestamp')
+    info['duration'] = int_or_none(props.get('duration'))
+    return info
+
+
+class PodbayFMIE(InfoExtractor):
+    # Matches a single episode page: /p/<podcast>/e/<episode>.
+    # Both path segments exclude '?' and '#'. A plain [^/]* is greedy and would
+    # swallow a trailing query string or fragment, which then ends up inside
+    # the captured episode id.
+    _VALID_URL = r'https?://podbay\.fm/p/[^/?#]+/e/(?P<id>[^/?#]+)/?(?:[\?#].*)?$'
+    _TESTS = [{
+        'url': 'https://podbay.fm/p/behind-the-bastards/e/1647338400',
+        'md5': '98b41285dcf7989d105a4ed0404054cf',
+        'info_dict': {
+            'id': '1647338400',
+            'title': 'Part One: Kissinger',
+            'ext': 'mp3',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'timestamp': 1647338400,
+            'duration': 5001,
+            'upload_date': '20220315',
+        },
+    }]
+
+    def _real_extract(self, url):
+        """Fetch the episode page and turn its embedded episode props into an info dict."""
+        episode_id = self._match_id(url)
+        webpage = self._download_webpage(url, episode_id)
+        # page data as located by _search_nextjs_data; the episode lives under
+        # props -> pageProps -> episode (a missing level raises KeyError)
+        data = self._search_nextjs_data(webpage, episode_id)
+        # the id taken from the URL is the fallback when the props have no 'podcast_id'
+        return result_from_props(data['props']['pageProps']['episode'], episode_id)
+
+
+class PodbayFMChannelIE(InfoExtractor):
+    # Matches a podcast overview page: /p/<podcast>.
+    # The slug excludes '?' and '#'. A plain [^/]* is greedy and would capture
+    # a trailing query string or fragment as part of the slug, which is then
+    # interpolated into the API request and the generated episode URLs.
+    _VALID_URL = r'https?://podbay\.fm/p/(?P<id>[^/?#]+)/?(?:[\?#].*)?$'
+    _TESTS = [{
+        'url': 'https://podbay.fm/p/behind-the-bastards',
+        'info_dict': {
+            'id': 'behind-the-bastards',
+            'title': 'Behind the Bastards',
+        },
+    }]
+    # page size handed to OnDemandPagedList
+    # NOTE(review): presumably equals the number of episodes the API returns per page -- confirm
+    _PAGE_SIZE = 10
+
+    def _fetch_page(self, channel_id, pagenum):
+        """Download one page of the podcast API and return its 'podcast' object."""
+        return self._download_json(
+            f'https://podbay.fm/api/podcast?reverse=true&page={pagenum}&slug={channel_id}',
+            channel_id)['podcast']
+
+    @staticmethod
+    def _results_from_page(channel_id, page):
+        """Convert the 'episodes' of one API page into a list of info dicts.
+
+        Each entry is attributed to PodbayFMIE and given the episode page as
+        its webpage_url. An episode without 'timestamp' raises KeyError.
+        """
+        return [{
+            **result_from_props(e),
+            'extractor': PodbayFMIE.IE_NAME,
+            'extractor_key': PodbayFMIE.ie_key(),
+            # somehow they use timestamps as the episode identifier
+            'webpage_url': f'https://podbay.fm/p/{channel_id}/e/{e["timestamp"]}',
+        } for e in page['episodes']]
+
+    def _real_extract(self, url):
+        """Return a playlist of all episodes of the podcast, fetched page by page on demand."""
+        channel_id = self._match_id(url)
+
+        # page 0 is fetched eagerly because it also supplies the playlist title;
+        # it is reused below so it is not downloaded a second time
+        first_page = self._fetch_page(channel_id, 0)
+        entries = OnDemandPagedList(
+            lambda pagenum: self._results_from_page(
+                channel_id, self._fetch_page(channel_id, pagenum) if pagenum else first_page),
+            self._PAGE_SIZE)
+
+        return self.playlist_result(entries, channel_id, first_page.get('title'))
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@ -5499,7 +5499,8 @@ def jwt_encode_hs256(payload_data, key, headers={}):
 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
 def jwt_decode_hs256(jwt):
+    """Return the JSON-decoded payload of a three-part, dot-separated JWT.
+
+    The header and signature segments are split off but never inspected, so
+    the signature is NOT verified here. Raises ValueError when ``jwt`` does
+    not split into exactly three parts.
+    """
    header_b64, payload_b64, signature_b64 = jwt.split('.')
-    payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
+    # add trailing ='s that may have been stripped, superfluous ='s are ignored
+    payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
    return payload_data