[PromoDJ] Add extractors

This commit is contained in:
DmitryScaletta 2024-02-13 22:57:05 +03:00
parent 05420227aa
commit 1a256e5d56
No known key found for this signature in database
GPG Key ID: 167A65222EDD4C2A
2 changed files with 506 additions and 0 deletions

View File

@ -1529,6 +1529,19 @@ from .prankcast import PrankCastIE, PrankCastPostIE
from .premiershiprugby import PremiershipRugbyIE from .premiershiprugby import PremiershipRugbyIE
from .presstv import PressTVIE from .presstv import PressTVIE
from .projectveritas import ProjectVeritasIE from .projectveritas import ProjectVeritasIE
from .promodj import (
PromoDJPageIE,
PromoDJUserIE,
PromoDJUserMediaIE,
PromoDJUserPagesIE,
PromoDJUserPageIE,
PromoDJBlogPageIE,
PromoDJPlaylistIE,
PromoDJIE,
PromoDJEmbedIE,
PromoDJShortIE,
PromoDJRadioIE,
)
from .prosiebensat1 import ProSiebenSat1IE from .prosiebensat1 import ProSiebenSat1IE
from .prx import ( from .prx import (
PRXStoryIE, PRXStoryIE,

493
yt_dlp/extractor/promodj.py Normal file
View File

@ -0,0 +1,493 @@
import datetime
import functools
import re
import urllib.parse
from .common import InfoExtractor
from ..utils import (
OnDemandPagedList,
clean_html,
dict_get,
extract_attributes,
float_or_none,
get_element_by_class,
get_elements_by_class,
int_or_none,
parse_duration,
str_or_none,
traverse_obj,
urlencode_postdata,
url_or_none,
)
# promodj.com
# Playlist types:
# /:login/:media_type - default
# /:login/groups/:id/:slug - user defined (groups). Can contain audios and/or videos
# A single media by default is attached to default playlist
# But it can be reattached to a user playlist (group), and no longer appears in the default one
# User pages
# /:login - all non-empty playlists
# /:login/music - all non-empty playlists with at least one audio (shows 10 audios per playlist max)
# /:login/video - all non-empty playlists with at least one video (shows 10 videos per playlist max)
# /:login/pages - a list of user pages
# /:login/:page_name - a single user page
# /:login/blog - a list of blog posts
# /:login/blog/:id/:slug - a single blog post
# If default playlist is empty, it redirects to the user's page
# Pages and blog posts can contain: audios, videos, youtube videos
# Tracks and remixes can be paid. See /shop page
class PromoDJBaseIE(InfoExtractor):
_MEDIA_TYPES = [
'tracks',
'remixes',
'mixes',
'promos',
'lives',
'podcasts',
'radioshows',
'tools',
'realtones', # doesn't appear on the site menu but still exists
'acapellas', # redirects to /tools, creates default playlist
'samples', # redirects to /tools, doesn't create default playlist
'videos',
]
_PAGES = ['featured', 'shop', *_MEDIA_TYPES]
_BASE_URL_RE = r'https?://(?:www\.)?promodj\.com'
_MEDIA_TYPES_RE = '|'.join(_MEDIA_TYPES)
_NOT_PAGE_RE = '|'.join(['radio', *_PAGES])
_LOGIN_RE = rf'(?:(?!{_NOT_PAGE_RE}).)[\w-]+'
def _set_url_page(self, url, page):
parsed_url = urllib.parse.urlparse(url)
qs = urllib.parse.parse_qs(parsed_url.query)
qs['page'] = page
return parsed_url._replace(query=urllib.parse.urlencode(qs, doseq=True)).geturl()
def _fetch_page(self, url, parsed_media_types, playlist_id, page):
page_url = self._set_url_page(url, page + 1)
html = self._download_webpage(page_url, f'{playlist_id}-page-{page + 1}')
current_page = int(clean_html(get_element_by_class('NavigatorCurrentPage', html)) or '1')
if current_page != page + 1:
return
tracks_dump_html = get_element_by_class('tracks_dump', html)
for item_html in get_elements_by_class('player_standard', tracks_dump_html):
if 'music' in parsed_media_types:
a = get_element_by_class('title', item_html)
if 'video' in parsed_media_types and not a:
a = get_element_by_class('h5videoplayer_promodj_video__title', item_html)
if not a:
continue
if url := traverse_obj(extract_attributes(a), ('href', {url_or_none})):
yield self.url_result(url, PromoDJIE)
def _parse_playlist_links(self, html):
PLAYLISTS_RE = r'<a class=\"files_group_title\" href=\"([^\"]+)\">'
DEFAULT_VIDEO_PLAYLIST_RE = r'<h5><a href=\"https://promodj\.com/([\w-]+)/video\">Видео</a></h5>'
playlist_links = []
for playlist_url in re.findall(PLAYLISTS_RE, html):
playlist_links.append(playlist_url)
login = self._search_regex(
DEFAULT_VIDEO_PLAYLIST_RE, html, 'video playlist url', None)
if login:
playlist_links.append(f'https://promodj.com/{login}/videos')
return playlist_links
def _get_playlist_page_size(self, url):
is_default_playlist = '/groups/' not in url
return 30 if is_default_playlist else 20
def _fetch_media_data(self, ids, video_id):
data = {}
for i, id in enumerate(ids):
data[f'multi[{i}][method]'] = 'players/config'
data[f'multi[{i}][params][kind]'] = 'standalone.big'
data[f'multi[{i}][params][fileID]'] = id
return self._download_json(
'https://promodj.com/api/multi.json', video_id, data=urlencode_postdata(data),
headers={'Content-Type': 'application/x-www-form-urlencoded'})
class PromoDJPageIE(PromoDJBaseIE):
_PAGES_RE = '|'.join(PromoDJBaseIE._PAGES)
_VALID_URL = rf'{PromoDJBaseIE._BASE_URL_RE}/(?P<id>{_PAGES_RE})'
_TESTS = [{
'url': 'https://promodj.com/featured',
'only_matching': True,
}, {
# second page
'url': 'https://promodj.com/featured/rap?download=1&page=2',
'only_matching': True,
}, {
# filtered
'url': 'https://promodj.com/remixes?top=1',
'only_matching': True,
}, {
# with genre
'url': 'https://promodj.com/tracks/hip_hop',
'only_matching': True,
}, {
# with search
'url': 'https://promodj.com/mixes?kind=mixes&styleID=&searchfor=dance',
'only_matching': True,
}, {
# no download button
'url': 'https://promodj.com/shop',
'only_matching': True,
}]
_PAGE_SIZE = 20
def _real_extract(self, url):
page_type = self._match_id(url)
return self.playlist_result(
OnDemandPagedList(
functools.partial(self._fetch_page, url, ['music', 'video'], page_type),
self._PAGE_SIZE),
playlist_id=page_type)
class PromoDJUserIE(PromoDJBaseIE):
_VALID_URL = rf'{PromoDJBaseIE._BASE_URL_RE}/(?P<login>{PromoDJBaseIE._LOGIN_RE})$'
_TESTS = [{
'url': 'https://promodj.com/djperetse',
'only_matching': True,
}, {
'url': 'https://promodj.com/dj-trojan',
'only_matching': True,
}]
def _real_extract(self, url):
login = self._match_valid_url(url).group('login')
html = self._download_webpage(url, login)
def entries():
for playlist_url in self._parse_playlist_links(html):
yield self.url_result(playlist_url, PromoDJPlaylistIE)
return self.playlist_result(entries(), playlist_id=login)
class PromoDJUserMediaIE(PromoDJBaseIE):
_VALID_URL = rf'{PromoDJBaseIE._BASE_URL_RE}/(?P<login>{PromoDJBaseIE._LOGIN_RE})/(?P<type>music|video)$'
_TESTS = [{
'url': 'https://promodj.com/feel/music',
'only_matching': True,
}, {
'url': 'https://promodj.com/djmikis/video',
'only_matching': True,
}, {
# a user without any videos
'url': 'https://promodj.com/worobyev/video',
'only_matching': True,
}]
def _real_extract(self, url):
login, type = self._match_valid_url(url).groups()
page_id = f'{login}-{type}'
html = self._download_webpage(url, page_id)
def entries():
for playlist_url in self._parse_playlist_links(html):
# TODO: parse only music or videos
yield self.url_result(playlist_url, PromoDJPlaylistIE)
return self.playlist_result(entries(), playlist_id=page_id)
class PromoDJUserPagesIE(PromoDJBaseIE):
_VALID_URL = rf'{PromoDJBaseIE._BASE_URL_RE}/(?P<login>{PromoDJBaseIE._LOGIN_RE})/(?P<type>(pages|blog))$'
_TESTS = [{
'url': 'https://promodj.com/djperetse/pages',
'only_matching': True,
}, {
'url': 'https://promodj.com/golub/blog',
'only_matching': True,
}]
def _real_extract(self, url):
login, type = self._match_valid_url(url).groups()
class PromoDJUserPageIE(PromoDJBaseIE):
_USER_PAGES = [
'pages',
'music',
'video',
'foto',
'avisha',
'blog',
'feedback',
'contact',
*PromoDJBaseIE._MEDIA_TYPES,
]
_NOT_USER_PAGE_RE = '|'.join(_USER_PAGES)
_USER_PAGE_RE = rf'(?:(?!{_NOT_USER_PAGE_RE}).)[\w-]+'
_VALID_URL = rf'{PromoDJBaseIE._BASE_URL_RE}/(?P<login>{PromoDJBaseIE._LOGIN_RE})/(?P<slug>{_USER_PAGE_RE})$'
_TESTS = [{
'url': 'https://promodj.com/djperetse/MaxMixes',
'only_matching': True,
}]
def _real_extract(self, url):
login, slug = self._match_valid_url(url).groups()
class PromoDJBlogPageIE(PromoDJBaseIE):
_VALID_URL = rf'{PromoDJBaseIE._BASE_URL_RE}/(?P<login>{PromoDJBaseIE._LOGIN_RE})/blog/(?P<id>\d+)(?:/(?P<slug>\w+))?'
_TESTS = [{
# with small and big audio players and youtube video
'url': 'https://promodj.com/golub/blog/1163895/DJ_Andrey_Golubev_To_Depeche_Mode_with_love_part_9_special_dj_edits_mix',
'only_matching': True,
}, {
# with audio and video
'url': 'https://promodj.com/svetmusic/blog/1101958/SVET_I_Like_It_Extra_Sound_Recordings',
'only_matching': True,
}, {
# without any media
'url': 'https://promodj.com/svetmusic/blog/915878/DJ_SVET_pobeditel_konkursa_Burn_City_Sound',
'only_matching': True,
}]
def _real_extract(self, url):
login, id, slug = self._match_valid_url(url).groups()
class PromoDJPlaylistIE(PromoDJBaseIE):
_VALID_URL = [
rf'{PromoDJBaseIE._BASE_URL_RE}/(?P<login>{PromoDJBaseIE._LOGIN_RE})/(?P<type>{PromoDJBaseIE._MEDIA_TYPES_RE})$',
rf'{PromoDJBaseIE._BASE_URL_RE}/(?P<login>{PromoDJBaseIE._LOGIN_RE})/(?P<type>groups)/(?P<id>\d+)(?:/(?P<slug>\w+))?',
]
_TESTS = [{
# default playlist: tracks (audio)
'url': 'https://promodj.com/gluk/tracks',
'only_matching': True,
}, {
# default playlist: video
'url': 'https://promodj.com/djperetse/videos',
'only_matching': True,
}, {
# user playlist: audio
'url': 'https://promodj.com/fonarev/groups/608158/Digital_Emotions_Night',
'only_matching': True,
}, {
# two pages
'url': 'https://promodj.com/lavrov/groups/677132/VINYL',
'only_matching': True,
}, {
# user playlist: video
'url': 'https://promodj.com/deeplecture/groups/672782/LAROCCA_TV',
'only_matching': True,
}, {
# user playlist: audio and video
'url': 'https://promodj.com/djperetse/groups/637358/Russkie_treki',
'only_matching': True,
}, {
# 900+ items
'url': 'https://promodj.com/fonarev/groups/17350/Digital_Emotions_Podcast',
'only_matching': True,
}]
def _real_extract(self, url):
match = self._match_valid_url(url)
login = match.group('login')
type = match.group('type')
playlist_id = f'{login}-{type}' if len(match.groups()) == 2 else f'{login}-{type}-{match.group("id")}'
page_size = self._get_playlist_page_size(url)
entries = OnDemandPagedList(
functools.partial(self._fetch_page, url, ['music', 'video'], playlist_id),
page_size)
return self.playlist_result(entries, playlist_id=playlist_id)
class PromoDJIE(PromoDJBaseIE):
_VALID_URL = rf'{PromoDJBaseIE._BASE_URL_RE}/{PromoDJBaseIE._LOGIN_RE}/(?P<type>{PromoDJBaseIE._MEDIA_TYPES_RE})/(?P<id>\d+)(?:/\w+)?',
_TESTS = [{
'url': 'https://promodj.com/antonpavlovsky/remixes/6259208/David_Usher_Black_Black_Heart_Anton_Pavlovsky_Cover',
'only_matching': True,
}, {
'url': 'https://promodj.com/j-factory/samples/7560171/Amedici_BW1_Intro',
'only_matching': True,
}, {
# no download links in html
'url': 'https://promodj.com/gluk/tracks/4713922/DJ_Glyuk_Folk_ing_DJ_Steven_Smile_Remix_2005',
'only_matching': True,
}, {
# no player
'url': 'https://promodj.com/gluk/tracks/420310/IMpulse_Zakat',
'only_matching': True,
}, {
# without slug
'url': 'https://promodj.com/djlykov/tracks/7551590',
'only_matching': True,
}, {
# lossless
'url': 'https://promodj.com/modi-glu/tracks/6081339/Modi_Glyu_Anabel',
'only_matching': True,
}, {
# paid audio
'url': 'https://promodj.com/boyko/tracks/1435682/Dj_Boyko_Katy_Queen_Nad_Oblakami',
'only_matching': True,
}, {
'url': 'https://promodj.com/sergeyfedotov306/videos/7457627/V_Matrice_Sboy',
'only_matching': True,
}, {
'url': 'https://promodj.com/djperetse/videos/5868236/Fatalist_Project_feat_DJ_Peretse_Den_pobedi_Videoklip',
'only_matching': True,
}]
_IS_PAID_RE = r'<b>Цена:</b>'
# examples: MP3, 320 Кбит | MP4, 20157 Кбит | WAV, 1412 Кбит
_FORMATS_RE = r'<a\s+href=\"(?P<url>[^\"]+\.(?:mp3|mp4|wav))\">\s*(?P<format>MP3|MP4|WAV), (?P<bitrate>\d+) Кбит\s*</a>'
_VIEW_COUNT_RE = r'<b>(?:Прослушиваний|Просмотров):</b>\s*(\d+)'
# examples: 0:21, 1:07, 74:38
_DURATION_RE = r'<b>Продолжительность:</b>\s*(\d{1,}:\d{2})'
# examples: 818.4 Кб, 12.9 Мб, 4 Гб, 1.76 Гб
_SIZE_RE = r'<b>Размер:</b>\s*(?P<size>\d{1,3}(?:\.\d{1,2})?)\s*(?P<unit>Кб|Мб|Гб)'
# examples: сегодня 2:55, вчера 23:17, 1 июня 2016 3:46
_TIMESTAMP_RE = r'<b>Публикация:</b>\s*(?P<day>вчера|сегодня|\d{1,2})(?: (?P<month>[а-я]+) (?P<year>\d{4}))?\s*(?P<hours>\d{1,2}):(?P<minutes>\d{2})'
_TAGS_RE = r'<span\s+class=\"styles\">([^\n]+)</span>'
def _parse_ru_date(self, raw_date):
RU_MONTHS = ['января', 'февраля', 'марта', 'апреля', 'мая', 'июня', 'июля', 'августа', 'сентября', 'октября', 'ноября', 'декабря']
day, month, year, hours, minutes = raw_date
if day == 'сегодня':
d = datetime.date.today()
day = d.day
month = d.month
year = d.year
elif day == 'вчера':
d = datetime.date.today() - datetime.timedelta(days=1)
day = d.day
month = d.month
year = d.year
else:
day = int(day)
month = RU_MONTHS.index(month) + 1
year = int(year)
return datetime.datetime(year, month, day, int(hours), int(minutes)).timestamp()
def _parse_ru_size(self, raw_size):
RU_SIZE_UNITS = ['Б', 'Кб', 'Мб', 'Гб']
size, size_unit = raw_size
return int(float(size) * pow(1024, RU_SIZE_UNITS.index(size_unit)))
def _parse_media(self, html, id):
meta_html = get_element_by_class('clearfix', get_element_by_class('dj_bblock', html))
is_paid = re.search(self._IS_PAID_RE, meta_html)
formats_from_html = re.findall(self._FORMATS_RE, meta_html)
if is_paid or len(formats_from_html) == 0:
media_data_raw = self._search_regex(
r'({\"no_preroll\":false,\"seekAny\":true,\"sources\":[^\n]+)\);', html, 'media data')
media_data = self._parse_json(media_data_raw, id)
formats = [{
'url': source.get('URL'),
'size': int_or_none(source.get('size')),
} for source in traverse_obj(media_data, ('sources')) if url_or_none(source.get('URL'))]
else:
formats = [{
'url': url,
'format': format.lower(),
'tbr': int(bitrate),
} for url, format, bitrate in formats_from_html if url_or_none(url)]
# size field describes best quality. best quality always comes first
formats[0]['size'] = self._parse_ru_size(re.findall(self._SIZE_RE, meta_html)[0])
return {
'id': id,
'title': clean_html(get_element_by_class('file_title', html)),
'formats': formats,
'view_count': int_or_none(self._search_regex(self._VIEW_COUNT_RE, meta_html, 'view_count')),
'duration': parse_duration(self._search_regex(self._DURATION_RE, meta_html, 'duration')),
'timestamp': self._parse_ru_date(re.findall(self._TIMESTAMP_RE, meta_html)[0]),
'tags': self._html_search_regex(self._TAGS_RE, meta_html, 'tags').split(', '),
}
def _real_extract(self, url):
id = self._match_id(url)
html = self._download_webpage(url, id)
return self._parse_media(html, id)
class PromoDJEmbedIE(PromoDJBaseIE):
_VALID_URL = rf'{PromoDJBaseIE._BASE_URL_RE}/embed/(?P<id>\d+)/(?P<type>cover|big)'
_TESTS = [{
'url': 'https://promodj.com/embed/7555440/cover',
'only_matching': True,
}, {
'url': 'https://promodj.com/embed/7540163/big',
'only_matching': True,
}, {
# video (can be only big)
'url': 'https://promodj.com/embed/3922099/big',
'only_matching': True,
}]
def _get_full_url(self, media_data, id):
if media_data.get('video'):
video_config = self._parse_json(media_data['config'], id)
video = traverse_obj(video_config, ('playlist', 'item', 0))
return traverse_obj(video, ('title', '@ico_url'))
else:
return media_data.get('titleURL')
def _real_extract(self, url):
id = self._match_id(url)
url = self._get_full_url(self._fetch_media_data([id], id)[0], id)
return self.url_result(url, PromoDJIE, id)
class PromoDJShortIE(PromoDJBaseIE):
_VALID_URL = r'https://pdj.cc/(?P<id>\w+)'
_TESTS = [{
'url': 'https://pdj.cc/fv8VD',
'only_matching': True,
}]
_PAGE_URL_REGEX = r'<meta property="og:url"\s*content="(?P<url>[^"]+)"'
def _real_extract(self, url):
id = self._match_id(url)
html = self._download_webpage(url, id)
url = re.findall(self._PAGE_URL_REGEX, html)[0]
return self.url_result(url, PromoDJIE, id)
class PromoDJRadioIE(PromoDJBaseIE):
_VALID_URL = rf'{PromoDJBaseIE._BASE_URL_RE}/radio#(?P<id>\w+)'
_TESTS = [{
'url': 'https://promodj.com/radio#dubstep',
'only_matching': True,
}, {
'url': 'https://promodj.com/radio#oldschool',
'only_matching': True,
}]
def _real_extract(self, url):
id = self._match_id(url)
return {
'id': id,
'formats': [{
'url': f'https://radio.promodj.com/{id}-192',
'abr': 192,
}],
'is_live': True,
}