[extractor/unscripted] Add extractor (#5008)

Closes #4903
Authored by: HobbyistDev
This commit is contained in:
HobbyistDev 2022-10-01 01:54:05 +09:00 committed by GitHub
parent 2e0f8d4f6e
commit 20a7304e4c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 54 additions and 0 deletions

View File

@ -1971,6 +1971,7 @@ from .drooble import DroobleIE
from .umg import UMGDeIE from .umg import UMGDeIE
from .unistra import UnistraIE from .unistra import UnistraIE
from .unity import UnityIE from .unity import UnityIE
from .unscripted import UnscriptedNewsVideoIE
from .uol import UOLIE from .uol import UOLIE
from .uplynk import ( from .uplynk import (
UplynkIE, UplynkIE,

View File

@ -0,0 +1,53 @@
from .common import InfoExtractor
from ..utils import parse_duration, traverse_obj
class UnscriptedNewsVideoIE(InfoExtractor):
_VALID_URL = r'https?://www\.unscripted\.news/videos/(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://www.unscripted.news/videos/a-day-at-the-farmers-protest',
'info_dict': {
'id': '60c0a55cd1e99b1079918a57',
'display_id': 'a-day-at-the-farmers-protest',
'ext': 'mp4',
'title': 'A Day at the Farmers\' Protest',
'description': 'md5:4b3df22747a03e8f14f746dd72190384',
'thumbnail': 'https://s3.unscripted.news/anj2/60c0a55cd1e99b1079918a57/5f199a65-c803-4a5c-8fce-2077359c3b72.jpg',
'duration': 2251.0,
'series': 'Ground Reports',
}
}, {
'url': 'https://www.unscripted.news/videos/you-get-the-politicians-you-deserve-ft-shashi-tharoor',
'info_dict': {
'id': '5fb3afbf18ac817d341a74d8',
'display_id': 'you-get-the-politicians-you-deserve-ft-shashi-tharoor',
'ext': 'mp4',
'cast': ['Avalok Langer', 'Ashwin Mehta'],
'thumbnail': 'https://s3.unscripted.news/anj2/5fb3afbf18ac817d341a74d8/82bd7942-4f20-4cd8-98ae-83f9e814f998.jpg',
'description': 'md5:1e91b069238a705ca3a40f87e6f1182c',
'duration': 1046.0,
'series': 'Dumb Questions Only',
'title': 'You Get The Politicians You Deserve! ft. Shashi Tharoor',
}
}]
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
nextjs_data = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['dataLocal']
# TODO: get subtitle from srt key
formats, subtitles = self._extract_m3u8_formats_and_subtitles(nextjs_data['alt_content'], display_id)
return {
'id': nextjs_data['_id'],
'display_id': display_id,
'title': nextjs_data.get('title') or self._og_search_title(webpage),
'description': nextjs_data.get('sh_heading') or self._og_search_description(webpage),
'formats': formats,
'subtitles': subtitles,
'thumbnail': self._og_search_thumbnail(webpage),
'duration': parse_duration(nextjs_data.get('duration')),
'series': traverse_obj(nextjs_data, ('show', 'topic')),
'cast': traverse_obj(nextjs_data, ('cast_crew', ..., 'displayname')),
}