[extractor/detik] Generalize extractors (#4899)

Authored by: HobbyistDev, coletdjnz
This commit is contained in:
HobbyistDev 2022-10-04 12:09:23 +09:00 committed by GitHub
parent 12f153a827
commit c7f540ea1e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 186 additions and 90 deletions

View File

@ -333,6 +333,7 @@ from .cnn import (
CNNIE, CNNIE,
CNNBlogsIE, CNNBlogsIE,
CNNArticleIE, CNNArticleIE,
CNNIndonesiaIE,
) )
from .coub import CoubIE from .coub import CoubIE
from .comedycentral import ( from .comedycentral import (
@ -411,7 +412,7 @@ from .deezer import (
DeezerAlbumIE, DeezerAlbumIE,
) )
from .democracynow import DemocracynowIE from .democracynow import DemocracynowIE
from .detik import Detik20IE from .detik import DetikEmbedIE
from .dfb import DFBIE from .dfb import DFBIE
from .dhm import DHMIE from .dhm import DHMIE
from .digg import DiggIE from .digg import DiggIE

View File

@ -1,6 +1,6 @@
from .common import InfoExtractor from .common import InfoExtractor
from .turner import TurnerBaseIE from .turner import TurnerBaseIE
from ..utils import url_basename from ..utils import merge_dicts, try_call, url_basename
class CNNIE(TurnerBaseIE): class CNNIE(TurnerBaseIE):
@ -141,3 +141,58 @@ class CNNArticleIE(InfoExtractor):
webpage = self._download_webpage(url, url_basename(url)) webpage = self._download_webpage(url, url_basename(url))
cnn_url = self._html_search_regex(r"video:\s*'([^']+)'", webpage, 'cnn url') cnn_url = self._html_search_regex(r"video:\s*'([^']+)'", webpage, 'cnn url')
return self.url_result('http://cnn.com/video/?/video/' + cnn_url, CNNIE.ie_key()) return self.url_result('http://cnn.com/video/?/video/' + cnn_url, CNNIE.ie_key())
class CNNIndonesiaIE(InfoExtractor):
    # Article pages on cnnindonesia.com. The URL path carries the upload date
    # (first 8 digits of the timestamp segment), the numeric video id and a slug.
    _VALID_URL = r'https?://www\.cnnindonesia\.com/[\w-]+/(?P<upload_date>\d{8})\d+-\d+-(?P<id>\d+)/(?P<display_id>[\w-]+)'
    _TESTS = [{
        'url': 'https://www.cnnindonesia.com/ekonomi/20220909212635-89-845885/alasan-harga-bbm-di-indonesia-masih-disubsidi',
        'info_dict': {
            'id': '845885',
            'ext': 'mp4',
            'description': 'md5:e7954bfa6f1749bc9ef0c079a719c347',
            'upload_date': '20220909',
            'title': 'Alasan Harga BBM di Indonesia Masih Disubsidi',
            'timestamp': 1662859088,
            'duration': 120.0,
            'thumbnail': r're:https://akcdn\.detik\.net\.id/visual/2022/09/09/thumbnail-ekopedia-alasan-harga-bbm-disubsidi_169\.jpeg',
            'tags': ['ekopedia', 'subsidi bbm', 'subsidi', 'bbm', 'bbm subsidi', 'harga pertalite naik'],
            'age_limit': 0,
            'release_timestamp': 1662859088,
            'release_date': '20220911',
            'uploader': 'Asfahan Yahsyi',
        }
    }, {
        'url': 'https://www.cnnindonesia.com/internasional/20220911104341-139-846189/video-momen-charles-disambut-meriah-usai-dilantik-jadi-raja-inggris',
        'info_dict': {
            'id': '846189',
            'ext': 'mp4',
            'upload_date': '20220911',
            'duration': 76.0,
            'timestamp': 1662869995,
            'description': 'md5:ece7b003b3ee7d81c6a5cfede7d5397d',
            'thumbnail': r're:https://akcdn\.detik\.net\.id/visual/2022/09/11/thumbnail-video-1_169\.jpeg',
            'title': 'VIDEO: Momen Charles Disambut Meriah usai Dilantik jadi Raja Inggris',
            'tags': ['raja charles', 'raja charles iii', 'ratu elizabeth', 'ratu elizabeth meninggal dunia', 'raja inggris', 'inggris'],
            'age_limit': 0,
            'release_date': '20220911',
            'uploader': 'REUTERS',
            'release_timestamp': 1662869995,
        }
    }]

    def _real_extract(self, url):
        """Resolve an article page to its embedded player URL.

        Returns a ``url_transparent`` result pointing at the ``embedUrl`` of the
        page's ``VideoObject`` JSON-LD entry, merged with the JSON-LD metadata,
        the upload date taken from the URL and the ``keywords`` meta tags.

        Raises ExtractorError when no ``VideoObject`` entry with an
        ``embedUrl`` is present on the page.
        """
        # Imported locally so this block does not depend on the module-level
        # import list being extended.
        from ..utils import ExtractorError

        upload_date, video_id, display_id = self._match_valid_url(url).group(
            'upload_date', 'id', 'display_id')
        webpage = self._download_webpage(url, display_id)

        # Materialise the generator: the entries are consumed twice below.
        json_ld_list = list(self._yield_json_ld(webpage, display_id))
        json_ld_data = self._json_ld(json_ld_list, display_id)

        # Give next() a default so a page without a usable VideoObject produces
        # a proper extractor error rather than a bare StopIteration, and skip
        # VideoObject entries that carry no embedUrl.
        embed_url = next((
            json_ld['embedUrl'] for json_ld in json_ld_list
            if json_ld.get('@type') == 'VideoObject' and json_ld.get('embedUrl')), None)
        if not embed_url:
            raise ExtractorError('Unable to find the embedded player URL', video_id=video_id)

        return merge_dicts(json_ld_data, {
            '_type': 'url_transparent',
            'url': embed_url,
            'upload_date': upload_date,
            # try_call guards the .split() on the meta lookup result.
            'tags': try_call(lambda: self._html_search_meta('keywords', webpage).split(', ')),
        })

View File

@ -1,122 +1,162 @@
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import merge_dicts, str_or_none from ..utils import int_or_none, merge_dicts, try_call, url_basename
class Detik20IE(InfoExtractor): class DetikEmbedIE(InfoExtractor):
IE_NAME = '20.detik.com' _VALID_URL = False
_VALID_URL = r'https?://20\.detik\.com/((?!program)[\w-]+)/[\d-]+/(?P<id>[\w-]+)' _WEBPAGE_TESTS = [{
_TESTS = [{ # cnn embed
# detikflash 'url': 'https://www.cnnindonesia.com/embed/video/846189',
'url': 'https://20.detik.com/detikflash/20220705-220705098/zulhas-klaim-sukses-turunkan-harga-migor-jawa-bali',
'info_dict': { 'info_dict': {
'id': '220705098', 'id': '846189',
'ext': 'mp4', 'ext': 'mp4',
'duration': 157, 'description': 'md5:ece7b003b3ee7d81c6a5cfede7d5397d',
'thumbnail': 'https://cdnv.detik.com/videoservice/AdminTV/2022/07/05/bfe0384db04f4bbb9dd5efc869c5d4b1-20220705164334-0s.jpg?w=650&q=80', 'thumbnail': r're:https?://akcdn\.detik\.net\.id/visual/2022/09/11/thumbnail-video-1_169.jpeg',
'description': 'md5:ac18dcee5b107abbec1ed46e0bf400e3', 'title': 'Video CNN Indonesia - VIDEO: Momen Charles Disambut Meriah usai Dilantik jadi Raja Inggris',
'title': 'Zulhas Klaim Sukses Turunkan Harga Migor Jawa-Bali', 'age_limit': 0,
'tags': ['zulkifli hasan', 'menteri perdagangan', 'minyak goreng'], 'tags': ['raja charles', ' raja charles iii', ' ratu elizabeth', ' ratu elizabeth meninggal dunia', ' raja inggris', ' inggris'],
'timestamp': 1657039548, 'release_timestamp': 1662869995,
'upload_date': '20220705' 'release_date': '20220911',
'uploader': 'REUTERS'
} }
}, { }, {
# e-flash # 20.detik
'url': 'https://20.detik.com/e-flash/20220705-220705109/ahli-level-ppkm-jadi-payung-strategi-protokol-kesehatan',
'info_dict': {
'id': '220705109',
'ext': 'mp4',
'tags': ['ppkm jabodetabek', 'dicky budiman', 'ppkm'],
'upload_date': '20220705',
'duration': 110,
'title': 'Ahli: Level PPKM Jadi Payung Strategi Protokol Kesehatan',
'thumbnail': 'https://cdnv.detik.com/videoservice/AdminTV/2022/07/05/Ahli-_Level_PPKM_Jadi_Payung_Strat_jOgUMCN-20220705182313-custom.jpg?w=650&q=80',
'description': 'md5:4eb825a9842e6bdfefd66f47b364314a',
'timestamp': 1657045255,
}
}, {
# otobuzz
'url': 'https://20.detik.com/otobuzz/20220704-220704093/mulai-rp-10-jutaan-ini-skema-kredit-mitsubishi-pajero-sport', 'url': 'https://20.detik.com/otobuzz/20220704-220704093/mulai-rp-10-jutaan-ini-skema-kredit-mitsubishi-pajero-sport',
'info_dict': { 'info_dict': {
'display_id': 'mulai-rp-10-jutaan-ini-skema-kredit-mitsubishi-pajero-sport',
'id': '220704093', 'id': '220704093',
'ext': 'mp4', 'ext': 'mp4',
'tags': ['cicilan mobil', 'mitsubishi pajero sport', 'mitsubishi', 'pajero sport'],
'timestamp': 1656951521,
'duration': 83,
'upload_date': '20220704',
'thumbnail': 'https://cdnv.detik.com/videoservice/AdminTV/2022/07/04/5d6187e402ec4a91877755a5886ff5b6-20220704161859-0s.jpg?w=650&q=80',
'description': 'md5:9b2257341b6f375cdcf90106146d5ffb', 'description': 'md5:9b2257341b6f375cdcf90106146d5ffb',
'thumbnail': r're:https?://cdnv\.detik\.com/videoservice/AdminTV/2022/07/04/5d6187e402ec4a91877755a5886ff5b6-20220704161859-0s.jpg',
'title': 'Mulai Rp 10 Jutaan! Ini Skema Kredit Mitsubishi Pajero Sport', 'title': 'Mulai Rp 10 Jutaan! Ini Skema Kredit Mitsubishi Pajero Sport',
} 'timestamp': 1656951521,
}, {
# sport-buzz
'url': 'https://20.detik.com/sport-buzz/20220704-220704054/crash-crash-horor-di-paruh-pertama-motogp-2022',
'info_dict': {
'id': '220704054',
'ext': 'mp4',
'thumbnail': 'https://cdnv.detik.com/videoservice/AdminTV/2022/07/04/6b172c6fb564411996ea145128315630-20220704090746-0s.jpg?w=650&q=80',
'title': 'Crash-crash Horor di Paruh Pertama MotoGP 2022',
'description': 'md5:fbcc6687572ad7d16eb521b76daa50e4',
'timestamp': 1656925591,
'duration': 107,
'tags': ['marc marquez', 'fabio quartararo', 'francesco bagnaia', 'motogp crash', 'motogp 2022'],
'upload_date': '20220704', 'upload_date': '20220704',
'duration': 83.0,
'tags': ['cicilan mobil', 'mitsubishi pajero sport', 'mitsubishi', 'pajero sport'],
'release_timestamp': 1656926321,
'release_date': '20220704',
'age_limit': 0,
'uploader': 'Ridwan Arifin ' # TODO: strip trailing whitespace from uploader
} }
}, { }, {
# adu-perspektif # pasangmata.detik
'url': 'https://20.detik.com/adu-perspektif/20220518-220518144/24-tahun-reformasi-dan-alarm-demokrasi-dari-filipina', 'url': 'https://pasangmata.detik.com/contribution/366649',
'info_dict': { 'info_dict': {
'id': '220518144', 'id': '366649',
'ext': 'mp4', 'ext': 'mp4',
'title': '24 Tahun Reformasi dan Alarm Demokrasi dari Filipina', 'title': 'Saling Dorong Aparat dan Pendemo di Aksi Tolak Kenaikan BBM',
'upload_date': '20220518', 'description': 'md5:7a6580876c8381c454679e028620bea7',
'timestamp': 1652913823, 'age_limit': 0,
'duration': 185.0, 'tags': 'count:17',
'tags': ['politik', 'adu perspektif', 'indonesia', 'filipina', 'demokrasi'], 'thumbnail': 'https://akcdn.detik.net.id/community/data/media/thumbs-pasangmata/2022/09/08/366649-16626229351533009620.mp4-03.jpg',
'description': 'md5:8eaaf440b839c3d02dca8c9bbbb099a9',
'thumbnail': 'https://cdnv.detik.com/videoservice/AdminTV/2022/05/18/adpers_18_mei_compressed-20220518230458-custom.jpg?w=650&q=80',
} }
}, { }, {
# sosok # insertlive embed
'url': 'https://20.detik.com/sosok/20220702-220703032/resa-boenard-si-princess-bantar-gebang', 'url': 'https://www.insertlive.com/embed/video/290482',
'info_dict': { 'info_dict': {
'id': '220703032', 'id': '290482',
'ext': 'mp4', 'ext': 'mp4',
'timestamp': 1656824438, 'release_timestamp': 1663063704,
'thumbnail': 'https://cdnv.detik.com/videoservice/AdminTV/2022/07/02/SOSOK_BGBJ-20220702191138-custom.jpg?w=650&q=80', 'thumbnail': 'https://akcdn.detik.net.id/visual/2022/09/13/leonardo-dicaprio_169.png?w=600&q=90',
'title': 'Resa Boenard Si \'Princess Bantar Gebang\'', 'age_limit': 0,
'description': 'md5:84ea66306a0285330de6a13fc6218b78', 'description': 'Aktor Leonardo DiCaprio memang baru saja putus dari kekasihnya yang bernama Camilla Morrone.',
'tags': ['sosok', 'sosok20d', 'bantar gebang', 'bgbj', 'resa boenard', 'bantar gebang bgbj', 'bgbj bantar gebang', 'sosok bantar gebang', 'sosok bgbj', 'bgbj resa boenard'], 'release_date': '20220913',
'upload_date': '20220703', 'title': 'Diincar Leonardo DiCaprio, Gigi Hadid Ngaku Tertarik Tapi Belum Cinta',
'duration': 650, 'tags': ['leonardo dicaprio', ' gigi hadid', ' hollywood'],
'uploader': '!nsertlive',
} }
}, { }, {
# viral # beautynesia embed
'url': 'https://20.detik.com/viral/20220603-220603135/merasakan-bus-imut-tanpa-pengemudi-muter-muter-di-kawasan-bsd-city', 'url': 'https://www.beautynesia.id/embed/video/261636',
'info_dict': { 'info_dict': {
'id': '220603135', 'id': '261636',
'ext': 'mp4', 'ext': 'mp4',
'description': 'md5:4771fe101aa303edb829c59c26f9e7c6', 'age_limit': 0,
'timestamp': 1654304305, 'release_timestamp': 1662375600,
'title': 'Merasakan Bus Imut Tanpa Pengemudi, Muter-muter di Kawasan BSD City', 'description': 'Menurut ramalan astrologi, tiga zodiak ini bakal hoki sepanjang September 2022.',
'tags': ['viral', 'autonomous vehicle', 'electric', 'shuttle bus'], 'title': '3 Zodiak Paling Beruntung Selama September 2022',
'thumbnail': 'https://cdnv.detik.com/videoservice/AdminTV/2022/06/03/VIRAL_BUS_NO_SUPIR-20220604004707-custom.jpg?w=650&q=80', 'release_date': '20220905',
'duration': 593, 'tags': ['zodiac update', ' zodiak', ' ramalan bintang', ' zodiak beruntung 2022', ' zodiak hoki september 2022', ' zodiak beruntung september 2022'],
'upload_date': '20220604', 'thumbnail': 'https://akcdn.detik.net.id/visual/2022/09/05/3-zodiak-paling-beruntung-selama-september-2022_169.jpeg?w=600&q=90',
'uploader': 'amh',
}
}, {
# cnbcindonesia embed
'url': 'https://www.cnbcindonesia.com/embed/video/371839',
'info_dict': {
'id': '371839',
'ext': 'mp4',
'title': 'Puluhan Pejabat Rusia Tuntut Putin Mundur',
'tags': ['putin'],
'age_limit': 0,
'thumbnail': 'https://awsimages.detik.net.id/visual/2022/09/13/cnbc-indonesia-tv-3_169.png?w=600&q=80',
'description': 'md5:8b9111e37555fcd95fe549a9b4ae6fdc',
}
}, {
# detik shortlink (we can get it from https://dtk.id/?<url>)
'url': 'https://dtk.id/NkISKr',
'info_dict': {
'id': '220914049',
'ext': 'mp4',
'release_timestamp': 1663114488,
'uploader': 'Tim 20Detik',
'title': 'Pakar Bicara soal Tim Khusus Jokowi dan Mereka yang Pro ke Bjorka',
'age_limit': 0,
'thumbnail': 'https://cdnv.detik.com/videoservice/AdminTV/2022/09/14/f15cae71d7b640c58e75b254ecbb1ce1-20220914071613-0s.jpg?w=400&q=80',
'display_id': 'pakar-bicara-soal-tim-khusus-jokowi-dan-mereka-yang-pro-ke-bjorka',
'upload_date': '20220914',
'release_date': '20220914',
'description': 'md5:5eb03225f7ee40207dd3a1e18a73f1ff',
'timestamp': 1663139688,
'duration': 213.0,
'tags': ['hacker bjorka', 'bjorka', 'hacker bjorka bocorkan data rahasia presiden jokowi', 'jokowi'],
} }
}] }]
def _real_extract(self, url): def _extract_from_webpage(self, url, webpage):
display_id = self._match_id(url) display_id = url_basename(url)
webpage = self._download_webpage(url, display_id) player_type, video_data = self._search_regex(
json_ld_data = self._search_json_ld(webpage, display_id) r'<script\s*[^>]+src="https?://(aws)?cdn\.detik\.net\.id/(?P<type>flowplayer|detikVideo)[^>]+>\s*(?P<video_data>{[^}]+})',
webpage, 'playerjs', group=('type', 'video_data'), default=(None, ''))
video_url = self._html_search_regex( json_ld_data = self._search_json_ld(webpage, display_id, default={})
r'videoUrl\s*:\s*"(?P<video_url>[^"]+)', webpage, 'videoUrl') extra_info_dict = {}
formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_url, display_id, ext='mp4')
return merge_dicts(json_ld_data, { if not player_type:
'id': self._html_search_meta('video_id', webpage), return
elif player_type == 'flowplayer':
video_json_data = self._parse_json(video_data.replace('\'', '"'), display_id)
video_url = video_json_data['videoUrl']
extra_info_dict = {
'id': self._search_regex(r'identifier\s*:\s*\'([^\']+)', webpage, 'identifier'),
'thumbnail': video_json_data.get('imageUrl'),
}
elif player_type == 'detikVideo':
video_url = self._search_regex(
r'videoUrl\s*:\s*[\'"]?([^"\']+)', video_data, 'videoUrl')
extra_info_dict = {
'id': self._html_search_meta(['video_id', 'dtk:video_id'], webpage),
'thumbnail': self._search_regex(r'imageUrl\s*:\s*[\'"]?([^"\']+)', video_data, 'videoUrl'),
'duration': int_or_none(self._html_search_meta('duration', webpage, fatal=False, default=None)),
'release_timestamp': int_or_none(self._html_search_meta('dtk:publishdateunix', webpage, fatal=False, default=None), 1000),
'timestamp': int_or_none(self._html_search_meta('dtk:createdateunix', webpage, fatal=False, default=None), 1000),
'uploader': self._search_regex(
r'([^-]+)', self._html_search_meta('dtk:author', webpage, default='').strip(), 'uploader',
default=None)
}
formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_url, display_id)
self._sort_formats(formats)
yield merge_dicts(json_ld_data, extra_info_dict, {
'display_id': display_id,
'title': self._html_search_meta(['og:title', 'originalTitle'], webpage) or self._html_extract_title(webpage),
'description': self._html_search_meta(['og:description', 'twitter:description', 'description'], webpage),
'formats': formats, 'formats': formats,
'subtitles': subtitles, 'subtitles': subtitles,
'tags': str_or_none(self._html_search_meta(['keywords', 'keyword', 'dtk:keywords'], webpage), '').split(','), 'tags': try_call(lambda: self._html_search_meta(
['keywords', 'keyword', 'dtk:keywords'], webpage).split(',')),
}) })