mirror of https://github.com/yt-dlp/yt-dlp.git
[facebook] Improve title and uploader extraction
Closes #1943, closes #795
This commit is contained in:
parent
fabb27fcea
commit
80fa6e5327
|
@ -20,13 +20,13 @@ from ..utils import (
|
||||||
get_element_by_id,
|
get_element_by_id,
|
||||||
int_or_none,
|
int_or_none,
|
||||||
js_to_json,
|
js_to_json,
|
||||||
limit_length,
|
|
||||||
merge_dicts,
|
merge_dicts,
|
||||||
network_exceptions,
|
network_exceptions,
|
||||||
parse_count,
|
parse_count,
|
||||||
parse_qs,
|
parse_qs,
|
||||||
qualities,
|
qualities,
|
||||||
sanitized_Request,
|
sanitized_Request,
|
||||||
|
traverse_obj,
|
||||||
try_get,
|
try_get,
|
||||||
url_or_none,
|
url_or_none,
|
||||||
urlencode_postdata,
|
urlencode_postdata,
|
||||||
|
@ -398,28 +398,31 @@ class FacebookIE(InfoExtractor):
|
||||||
url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
|
url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
|
||||||
|
|
||||||
def extract_metadata(webpage):
|
def extract_metadata(webpage):
|
||||||
video_title = self._html_search_regex(
|
media_data = [self._parse_json(j, video_id, fatal=False) for j in re.findall(
|
||||||
r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage,
|
r'handleWithCustomApplyEach\(\s*ScheduledApplyEach\s*,\s*(\{.+?\})\s*\);', webpage)]
|
||||||
'title', default=None)
|
media = traverse_obj(media_data, (
|
||||||
if not video_title:
|
..., 'require', ..., ..., ..., '__bbox', 'result', 'data', 'attachments', ..., 'media'), expected_type=dict)
|
||||||
video_title = self._html_search_regex(
|
media = [m for m in media if str(m.get('id')) == video_id and m.get('__typename') == 'Video']
|
||||||
r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>',
|
|
||||||
webpage, 'alternative title', default=None)
|
video_title = traverse_obj(media, (..., 'title', 'text'), get_all=False)
|
||||||
if not video_title:
|
description = traverse_obj(media, (
|
||||||
video_title = self._html_search_meta(
|
..., 'creation_story', 'comet_sections', 'message', 'story', 'message', 'text'), get_all=False)
|
||||||
['og:title', 'twitter:title', 'description'],
|
uploader = traverse_obj(media, (..., 'owner', 'name'), get_all=False)
|
||||||
webpage, 'title', default=None)
|
uploader_id = traverse_obj(media, (..., 'owner', 'id'), get_all=False)
|
||||||
if video_title:
|
|
||||||
video_title = limit_length(video_title, 80)
|
video_title = video_title or self._html_search_regex((
|
||||||
else:
|
r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>(?P<content>[^<]*)</h2>',
|
||||||
video_title = 'Facebook video #%s' % video_id
|
r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(?P<content>.*?)</span>',
|
||||||
description = self._html_search_meta(
|
self._meta_regex('og:title'), self._meta_regex('twitter:title'), self._meta_regex('description'),
|
||||||
|
), webpage, 'title', default=None, group='content')
|
||||||
|
description = description or self._html_search_meta(
|
||||||
['description', 'og:description', 'twitter:description'],
|
['description', 'og:description', 'twitter:description'],
|
||||||
webpage, 'description', default=None)
|
webpage, 'description', default=None)
|
||||||
uploader = clean_html(get_element_by_id(
|
uploader = uploader or (
|
||||||
'fbPhotoPageAuthorName', webpage)) or self._search_regex(
|
clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage))
|
||||||
r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader',
|
or self._search_regex(
|
||||||
default=None) or self._og_search_title(webpage, fatal=False)
|
(r'ownerName\s*:\s*"([^"]+)"', *self._og_regexes('title')), webpage, 'uploader', fatal=False))
|
||||||
|
|
||||||
timestamp = int_or_none(self._search_regex(
|
timestamp = int_or_none(self._search_regex(
|
||||||
r'<abbr[^>]+data-utime=["\'](\d+)', webpage,
|
r'<abbr[^>]+data-utime=["\'](\d+)', webpage,
|
||||||
'timestamp', default=None))
|
'timestamp', default=None))
|
||||||
|
@ -434,17 +437,17 @@ class FacebookIE(InfoExtractor):
|
||||||
r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count',
|
r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count',
|
||||||
default=None))
|
default=None))
|
||||||
info_dict = {
|
info_dict = {
|
||||||
'title': video_title,
|
'title': video_title or description.replace('\n', ' ') or f'Facebook video #{video_id}',
|
||||||
'description': description,
|
'description': description,
|
||||||
'uploader': uploader,
|
'uploader': uploader,
|
||||||
|
'uploader_id': uploader_id,
|
||||||
'timestamp': timestamp,
|
'timestamp': timestamp,
|
||||||
'thumbnail': thumbnail,
|
'thumbnail': thumbnail,
|
||||||
'view_count': view_count,
|
'view_count': view_count,
|
||||||
}
|
}
|
||||||
info_json_ld = self._search_json_ld(webpage, video_id, default={})
|
info_json_ld = self._search_json_ld(webpage, video_id, default={})
|
||||||
if info_json_ld.get('title'):
|
if info_json_ld.get('title'):
|
||||||
info_json_ld['title'] = limit_length(
|
info_json_ld['title'] = re.sub(r'\s*\|\s*Facebook$', '', info_json_ld['title'])
|
||||||
re.sub(r'\s*\|\s*Facebook$', '', info_json_ld['title']), 80)
|
|
||||||
return merge_dicts(info_json_ld, info_dict)
|
return merge_dicts(info_json_ld, info_dict)
|
||||||
|
|
||||||
video_data = None
|
video_data = None
|
||||||
|
|
Loading…
Reference in New Issue