2021-01-01 05:26:37 -07:00
import json
2013-06-23 12:59:45 -06:00
import re
2022-06-24 02:10:17 -06:00
import urllib . parse
2013-06-23 12:59:45 -06:00
from . common import InfoExtractor
2024-06-11 17:09:58 -06:00
from . . compat import compat_etree_fromstring
2023-07-09 01:53:02 -06:00
from . . networking import Request
from . . networking . exceptions import network_exceptions
2014-11-02 03:23:40 -07:00
from . . utils import (
2022-06-24 02:10:17 -06:00
ExtractorError ,
2017-02-01 09:15:38 -07:00
clean_html ,
2022-01-01 13:47:24 -07:00
determine_ext ,
2021-01-01 05:26:37 -07:00
float_or_none ,
2023-11-25 19:17:16 -07:00
format_field ,
2017-02-01 09:15:38 -07:00
get_element_by_id ,
2022-03-08 13:54:41 -07:00
get_first ,
2016-08-31 09:12:37 -06:00
int_or_none ,
2024-01-21 23:28:11 -07:00
join_nonempty ,
2017-02-01 09:15:38 -07:00
js_to_json ,
2021-08-23 10:21:42 -06:00
merge_dicts ,
2018-07-18 13:25:19 -06:00
parse_count ,
2021-12-19 18:30:04 -07:00
parse_qs ,
2021-01-01 05:26:37 -07:00
qualities ,
2023-07-15 15:03:23 -06:00
str_or_none ,
2022-01-11 09:39:49 -07:00
traverse_obj ,
2017-02-01 09:15:38 -07:00
try_get ,
2021-12-19 18:30:04 -07:00
url_or_none ,
2014-12-10 07:18:34 -07:00
urlencode_postdata ,
2021-01-01 05:26:37 -07:00
urljoin ,
2022-01-13 04:02:21 -07:00
variadic ,
2013-06-23 12:59:45 -06:00
)
class FacebookIE ( InfoExtractor ) :
2014-01-21 10:10:14 -07:00
# URL matcher for this extractor (verbose regex). The named group ``id``
# captures either a ``pfbid...`` token or a purely numeric identifier.
# Also accepts the internal ``facebook:<id>`` shorthand.
_VALID_URL = r'''(?x)
    (?:
        https?://
            (?:[\w-]+\.)?(?:facebook\.com|facebookwkhpilnemxj7asaniu7vnjjbiltxjqhye3mhbshg7kx5tfyd\.onion)/
            (?:[^#]*?\#!/)?
            (?:
                (?:
                    permalink\.php|
                    video/video\.php|
                    photo\.php|
                    video\.php|
                    video/embed|
                    story\.php|
                    watch(?:/live)?/?
                )\?(?:.*?)(?:v|video_id|story_fbid)=|
                [^/]+/videos/(?:[^/]+/)?|
                [^/]+/posts/|
                events/(?:[^/]+/)?|
                groups/[^/]+/(?:permalink|posts)/|
                watchparty/
            )|
        facebook:
    )
    (?P<id>pfbid[A-Za-z0-9]+|\d+)
    '''
# Patterns locating embedded players in third-party HTML; each exposes the
# embedded address through the named group ``url``.
_EMBED_REGEX = [
    r'<iframe[^>]+?src=(["\'])(?P<url>https?://www\.facebook\.com/(?:video/embed|plugins/video\.php).+?)\1',
    # Facebook API embed https://developers.facebook.com/docs/plugins/embedded-video-player
    r'''(?x)<div[^>]+
        class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+
        data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''',
]
# Login form endpoint (fetched first, then POSTed to) and the follow-up
# checkpoint endpoint; both are used by _perform_login.
_LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1'
_CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'
# NOTE(review): presumably the machine name looked up in .netrc - confirm against the base class.
_NETRC_MACHINE = 'facebook'
IE_NAME = 'facebook'

# %-style templates taking a video id.
# NOTE(review): their consumers are outside this excerpt - confirm they are still used.
_VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s'
_VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary'
2016-03-09 23:33:48 -07:00
2014-08-27 03:08:47 -06:00
# Test fixtures: each entry pairs a URL with the metadata expected for it
# ('info_dict'), or marks it as URL-match only ('only_matching'). Entries that
# cannot currently be checked carry a 'skip' reason. The "# data...." comments
# record where in the page's JSON payload the video was found.
_TESTS = [{
    'url': 'https://www.facebook.com/radiokicksfm/videos/3676516585958356/',
    'info_dict': {
        'id': '3676516585958356',
        'ext': 'mp4',
        'title': 'dr Adam Przygoda',
        'description': 'md5:34675bda53336b1d16400265c2bb9b3b',
        'uploader': 'RADIO KICKS FM',
        'upload_date': '20230818',
        'timestamp': 1692346159,
        'thumbnail': r're:^https?://.*',
        'uploader_id': '100063551323670',
        'duration': 3132.184,
        'view_count': int,
        'concurrent_view_count': 0,
    },
}, {
    'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf',
    'md5': '6a40d33c0eccbb1af76cf0485a052659',
    'info_dict': {
        'id': '637842556329505',
        'ext': 'mp4',
        'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam',
        'uploader': 'Tennis on Facebook',
        'upload_date': '20140908',
        'timestamp': 1410199200,
    },
    'skip': 'Requires logging in',
}, {
    # data.video
    'url': 'https://www.facebook.com/video.php?v=274175099429670',
    'info_dict': {
        'id': '274175099429670',
        'ext': 'mp4',
        'title': 'Asif',
        'description': '',
        'uploader': 'Asif Nawab Butt',
        'upload_date': '20140506',
        'timestamp': 1399398998,
        'thumbnail': r're:^https?://.*',
        'uploader_id': 'pfbid028wxorhX2ErLFJ578N6P3crHD3PHmXTCqCvfBpsnbSLmbokwSY75p5hWBjHGkG4zxl',
        'duration': 131.03,
        'concurrent_view_count': int,
    },
}, {
    'note': 'Video with DASH manifest',
    'url': 'https://www.facebook.com/video.php?v=957955867617029',
    'md5': 'b2c28d528273b323abe5c6ab59f0f030',
    'info_dict': {
        'id': '957955867617029',
        'ext': 'mp4',
        'title': 'When you post epic content on instagram.com/433 8 million followers, this is ...',
        'uploader': 'Demy de Zeeuw',
        'upload_date': '20160110',
        'timestamp': 1452431627,
    },
    'skip': 'Requires logging in',
}, {
    'url': 'https://www.facebook.com/maxlayn/posts/10153807558977570',
    'md5': '037b1fa7f3c2d02b7a0d7bc16031ecc6',
    'info_dict': {
        'id': '544765982287235',
        'ext': 'mp4',
        'title': '"What are you doing running in the snow?"',
        'uploader': 'FailArmy',
    },
    'skip': 'Video gone',
}, {
    'url': 'https://m.facebook.com/story.php?story_fbid=1035862816472149&id=116132035111903',
    'md5': '1deb90b6ac27f7efcf6d747c8a27f5e3',
    'info_dict': {
        'id': '1035862816472149',
        'ext': 'mp4',
        'title': 'What the Flock Is Going On In New Zealand Credit: ViralHog',
        'uploader': 'S. Saint',
    },
    'skip': 'Video gone',
}, {
    'note': 'swf params escaped',
    'url': 'https://www.facebook.com/barackobama/posts/10153664894881749',
    'md5': '97ba073838964d12c70566e0085c2b91',
    'info_dict': {
        'id': '10153664894881749',
        'ext': 'mp4',
        'title': 'Average time to confirm recent Supreme Court nominees: 67 days Longest it\'s t...',
        'thumbnail': r're:^https?://.*',
        'timestamp': 1456259628,
        'upload_date': '20160223',
        'uploader': 'Barack Obama',
    },
    'skip': 'Gif on giphy.com gone',
}, {
    # have 1080P, but only up to 720p in swf params
    # data.video.story.attachments[].media
    'url': 'https://www.facebook.com/cnn/videos/10155529876156509/',
    'md5': 'ca63897a90c9452efee5f8c40d080e25',
    'info_dict': {
        'id': '10155529876156509',
        'ext': 'mp4',
        'title': 'Holocaust survivor becomes US citizen',
        'description': 'She survived the holocaust — and years later, she’s getting her citizenship so she can vote for Hillary Clinton http://cnn.it/2eERh5f',
        'timestamp': 1477818095,
        'upload_date': '20161030',
        'uploader': 'CNN',
        'thumbnail': r're:^https?://.*',
        'view_count': int,
        'uploader_id': '100059479812265',
        'concurrent_view_count': int,
        'duration': 44.478,
    },
}, {
    # bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall
    # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media
    'url': 'https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/',
    'info_dict': {
        'id': '1417995061575415',
        'ext': 'mp4',
        'title': 'Довгоочікуване відео | By Yaroslav - Facebook',
        'description': 'Довгоочікуване відео',
        'timestamp': 1486648217,
        'upload_date': '20170209',
        'uploader': 'Yaroslav Korpan',
        'uploader_id': 'pfbid06AScABAWcW91qpiuGrLt99Ef9tvwHoXP6t8KeFYEqkSfreMtfa9nTveh8b2ZEVSWl',
        'concurrent_view_count': int,
        'thumbnail': r're:^https?://.*',
        'view_count': int,
        'duration': 11736.446,
    },
    'params': {
        'skip_download': True,
    },
}, {
    # FIXME: Cannot parse data error
    'url': 'https://www.facebook.com/LaGuiaDelVaron/posts/1072691702860471',
    'info_dict': {
        'id': '1072691702860471',
        'ext': 'mp4',
        'title': 'md5:ae2d22a93fbb12dad20dc393a869739d',
        'timestamp': 1477305000,
        'upload_date': '20161024',
        'uploader': 'La Guía Del Varón',
        'thumbnail': r're:^https?://.*',
    },
    'skip': 'Requires logging in',
}, {
    # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media
    'url': 'https://www.facebook.com/groups/1024490957622648/permalink/1396382447100162/',
    'info_dict': {
        'id': '202882990186699',
        'ext': 'mp4',
        'title': 'birb (O v O") | Hello? Yes your uber ride is here',
        'description': 'Hello? Yes your uber ride is here * Jukin Media Verified * Find this video and others like it by visiting...',
        'timestamp': 1486035513,
        'upload_date': '20170202',
        'uploader': 'Elisabeth Ahtn',
        'uploader_id': '100013949973717',
    },
    'skip': 'Requires logging in',
}, {
    # data.node.comet_sections.content.story.attachments[].throwbackStyles.attachment_target_renderer.attachment.target.attachments[].styles.attachment.media
    'url': 'https://www.facebook.com/groups/1645456212344334/posts/3737828833107051/',
    'info_dict': {
        'id': '1569199726448814',
        'ext': 'mp4',
        'title': 'Pence MUST GO!',
        'description': 'Vickie Gentry shared a memory.',
        'timestamp': 1511548260,
        'upload_date': '20171124',
        'uploader': 'Vickie Gentry',
        'uploader_id': 'pfbid0FuZhHCeWDAxWxEbr3yKPFaRstXvRxgsp9uCPG6GjD4J2AitB35NUAuJ4Q75KcjiDl',
        'thumbnail': r're:^https?://.*',
        'duration': 148.435,
    },
}, {
    # data.node.comet_sections.content.story.attachments[].styles.attachment.media
    'url': 'https://www.facebook.com/attn/posts/pfbid0j1Czf2gGDVqeQ8KiMLFm3pWN8GxsQmeRrVhimWDzMuKQoR8r4b1knNsejELmUgyhl',
    'info_dict': {
        'id': '6968553779868435',
        'ext': 'mp4',
        'description': 'md5:2f2fcf93e97ac00244fe64521bbdb0cb',
        'uploader': 'ATTN:',
        'upload_date': '20231207',
        'title': 'ATTN:',
        'duration': 132.675,
        'uploader_id': '100064451419378',
        'view_count': int,
        'thumbnail': r're:^https?://.*',
        'timestamp': 1701975646,
    },
}, {
    # data.node.comet_sections.content.story.attachments[].styles.attachment.media
    'url': 'https://www.facebook.com/permalink.php?story_fbid=pfbid0fqQuVEQyXRa9Dp4RcaTR14KHU3uULHV1EK7eckNXSH63JMuoALsAvVCJ97zAGitil&id=100068861234290',
    'info_dict': {
        'id': '270103405756416',
        'ext': 'mp4',
        'title': 'Lela Evans',
        'description': 'Today Makkovik\'s own Pilot Mandy Smith made her inaugural landing on the airstrip in her hometown. What a proud moment as we all cheered and...',
        'thumbnail': r're:^https?://.*',
        'uploader': 'Lela Evans',
        'uploader_id': 'pfbid0shZJipuigyy5mqrUJn9ub5LJFWNHvan5prtyi3LrDuuuJ4NwrURgnQHYR9fywBepl',
        'upload_date': '20231228',
        'timestamp': 1703804085,
        'duration': 394.347,
        'view_count': int,
    },
}, {
    'url': 'https://www.facebook.com/story.php?story_fbid=pfbid0Fnzhm8UuzjBYpPMNFzaSpFE9UmLdU4fJN8qTANi1Dmtj5q7DNrL5NERXfsAzDEV7l&id=100073071055552',
    'only_matching': True,
}, {
    'url': 'https://www.facebook.com/video.php?v=10204634152394104',
    'only_matching': True,
}, {
    'url': 'https://www.facebook.com/amogood/videos/1618742068337349/?fref=nf',
    'only_matching': True,
}, {
    # data.mediaset.currMedia.edges
    'url': 'https://www.facebook.com/ChristyClarkForBC/videos/vb.22819070941/10153870694020942/?type=2&theater',
    'only_matching': True,
}, {
    # data.video.story.attachments[].media
    'url': 'facebook:544765982287235',
    'only_matching': True,
}, {
    # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media
    'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/',
    'only_matching': True,
}, {
    # data.video.creation_story.attachments[].media
    'url': 'https://zh-hk.facebook.com/peoplespower/videos/1135894589806027/',
    'only_matching': True,
}, {
    # data.video
    'url': 'https://www.facebookwkhpilnemxj7asaniu7vnjjbiltxjqhye3mhbshg7kx5tfyd.onion/video.php?v=274175099429670',
    'only_matching': True,
}, {
    # no title
    'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/',
    'only_matching': True,
}, {
    # data.video
    'url': 'https://www.facebook.com/WatchESLOne/videos/359649331226507/',
    'info_dict': {
        'id': '359649331226507',
        'ext': 'mp4',
        'title': 'Fnatic vs. EG - Group A - Opening Match - ESL One Birmingham Day 1',
        'description': '#ESLOne VoD - Birmingham Finals Day#1 Fnatic vs. @Evil Geniuses',
        'timestamp': 1527084179,
        'upload_date': '20180523',
        'uploader': 'ESL One Dota 2',
        'uploader_id': '100066514874195',
        'duration': 4524.212,
        'view_count': int,
        'thumbnail': r're:^https?://.*',
        'concurrent_view_count': int,
    },
    'params': {
        'skip_download': True,
    },
}, {
    # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media
    'url': 'https://www.facebook.com/100033620354545/videos/106560053808006/',
    'info_dict': {
        'id': '106560053808006',
        'ext': 'mp4',
        'title': 'Josef',
        'thumbnail': r're:^https?://.*',
        'concurrent_view_count': int,
        'uploader_id': 'pfbid0cibUN6tV7DYgdbJdsUFN46wc4jKpVSPAvJQhFofGqBGmVn3V3JtAs2tfUwziw2hUl',
        'timestamp': 1549275572,
        'duration': 3.413,
        'uploader': 'Josef Novak',
        'description': '',
        'upload_date': '20190204',
    },
}, {
    # data.video.story.attachments[].media
    'url': 'https://www.facebook.com/watch/?v=647537299265662',
    'only_matching': True,
}, {
    # FIXME: https://github.com/yt-dlp/yt-dlp/issues/542
    # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media
    'url': 'https://www.facebook.com/PankajShahLondon/posts/10157667649866271',
    'info_dict': {
        'id': '10157667649866271',
    },
    'playlist_count': 3,
    'skip': 'Requires logging in',
}, {
    # data.nodes[].comet_sections.content.story.attachments[].style_type_renderer.attachment.media
    'url': 'https://m.facebook.com/Alliance.Police.Department/posts/4048563708499330',
    'info_dict': {
        'id': '117576630041613',
        'ext': 'mp4',
        # TODO: title can be extracted from video page
        'title': 'Facebook video #117576630041613',
        'uploader_id': '189393014416438',
        'upload_date': '20201123',
        'timestamp': 1606162592,
    },
    'skip': 'Requires logging in',
}, {
    # node.comet_sections.content.story.attached_story.attachments.style_type_renderer.attachment.media
    'url': 'https://www.facebook.com/groups/ateistiskselskab/permalink/10154930137678856/',
    'info_dict': {
        'id': '211567722618337',
        'ext': 'mp4',
        'title': 'Facebook video #211567722618337',
        'uploader_id': '127875227654254',
        'upload_date': '20161122',
        'timestamp': 1479793574,
    },
    'skip': 'No video',
}, {
    # data.video.creation_story.attachments[].media
    'url': 'https://www.facebook.com/watch/live/?v=1823658634322275',
    'only_matching': True,
}, {
    'url': 'https://www.facebook.com/watchparty/211641140192478',
    'info_dict': {
        'id': '211641140192478',
    },
    'playlist_count': 1,
    'skip': 'Requires logging in',
}, {
    # data.event.cover_media_renderer.cover_video
    'url': 'https://m.facebook.com/events/1509582499515440',
    'info_dict': {
        'id': '637246984455045',
        'ext': 'mp4',
        'title': 'ANALISI IN CAMPO OSCURO "Coaguli nel sangue dei vaccinati"',
        'description': 'Other event by Comitato Liberi Pensatori on Tuesday, October 18 2022',
        'thumbnail': r're:^https?://.*',
        'uploader': 'Comitato Liberi Pensatori',
        'uploader_id': '100065709540881',
    },
}]
2021-01-01 05:26:37 -07:00
# Regex fragment (a non-capturing group) naming the page sections ("pagelets")
# recognised when searching bigPipe.onPageletArrive payloads; it is
# interpolated into larger patterns rather than used on its own.
_SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)'
# API configuration defaults.
# NOTE(review): the consumer of 'graphURI' is outside this excerpt - confirm.
_api_config = {
    'graphURI': '/api/graphql/',
}
2013-06-23 12:59:45 -06:00
2022-03-18 14:53:33 -06:00
def _perform_login(self, username, password):
    """Log in with the given credentials via the HTML login form.

    Fetches the login page, scrapes the hidden ``lsd`` and ``lgnrnd`` form
    tokens, and POSTs the credentials. If the response still contains the
    login form, the attempt is treated as failed. Otherwise, when the
    response carries ``fb_dtsg`` and ``h`` tokens, a follow-up checkpoint
    form is submitted choosing not to save the device.

    Raises:
        ExtractorError: when the login response contains an explicit
            error box; its text is included in the message.

    Other failures (login form returned without an error box, or a
    network exception during the POST requests) are only reported as
    warnings, and the method returns None.
    """
    login_page_req = Request(self._LOGIN_URL)
    # Set the locale cookie to en_US before requesting the login page.
    self._set_cookie('facebook.com', 'locale', 'en_US')
    login_page = self._download_webpage(
        login_page_req, None,
        note='Downloading login page',
        errnote='Unable to download login page')
    lsd = self._search_regex(
        r'<input type="hidden" name="lsd" value="([^"]*)"',
        login_page, 'lsd')
    lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, 'lgnrnd')

    login_form = {
        'email': username,
        'pass': password,
        'lsd': lsd,
        'lgnrnd': lgnrnd,
        'next': 'http://facebook.com/home.php',
        'default_persistent': '0',
        'legacy_return': '1',
        'timezone': '-60',
        'trynum': '1',
    }
    request = Request(self._LOGIN_URL, urlencode_postdata(login_form))
    request.headers['Content-Type'] = 'application/x-www-form-urlencoded'
    try:
        login_results = self._download_webpage(
            request, None,
            note='Logging in', errnote='unable to fetch login page')
        # Getting the login form back means the credentials were not accepted.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            error = self._html_search_regex(
                r'(?s)<div[^>]+class=(["\']).*?login_error_box.*?\1[^>]*><div[^>]*>.*?</div><div[^>]*>(?P<error>.+?)</div>',
                login_results, 'login error', default=None, group='error')
            if error:
                raise ExtractorError(f'Unable to login: {error}', expected=True)
            self.report_warning(
                'unable to log in: bad username/password, or exceeded login rate limit (~3/min). '
                'Check credentials or wait.')
            return

        fb_dtsg = self._search_regex(
            r'name="fb_dtsg" value="(.+?)"', login_results, 'fb_dtsg', default=None)
        h = self._search_regex(
            r'name="h"\s+(?:\w+="[^"]+"\s+)*?value="([^"]+)"', login_results, 'h', default=None)

        # Without both tokens there is no checkpoint form to submit.
        if not fb_dtsg or not h:
            return

        check_form = {
            'fb_dtsg': fb_dtsg,
            'h': h,
            'name_action_selected': 'dont_save',
        }
        check_req = Request(self._CHECKPOINT_URL, urlencode_postdata(check_form))
        check_req.headers['Content-Type'] = 'application/x-www-form-urlencoded'
        check_response = self._download_webpage(
            check_req, None,
            note='Confirming login')
        # The submit button still being present means the checkpoint was not cleared.
        if re.search(r'id="checkpointSubmitButton"', check_response) is not None:
            self.report_warning(
                'Unable to confirm login, you have to login in your browser and authorize the login.')
    except network_exceptions as err:
        self.report_warning(f'unable to log in: {err}')
        return
2021-01-01 05:26:37 -07:00
def _extract_from_url ( self , url , video_id ) :
webpage = self . _download_webpage (
url . replace ( ' ://m.facebook.com/ ' , ' ://www.facebook.com/ ' ) , video_id )
2013-06-23 12:59:45 -06:00
2021-08-23 10:21:42 -06:00
def extract_metadata ( webpage ) :
2022-01-13 04:02:21 -07:00
post_data = [ self . _parse_json ( j , video_id , fatal = False ) for j in re . findall (
2023-09-05 14:35:23 -06:00
r ' data-sjs>( { .*?ScheduledServerJS.*?})</script> ' , webpage ) ]
2022-01-13 04:02:21 -07:00
post = traverse_obj ( post_data , (
2023-09-05 14:35:23 -06:00
. . . , ' require ' , . . . , . . . , . . . , ' __bbox ' , ' require ' , . . . , . . . , . . . , ' __bbox ' , ' result ' , ' data ' ) , expected_type = dict ) or [ ]
2022-04-18 15:27:20 -06:00
media = traverse_obj ( post , ( . . . , ' attachments ' , . . . , lambda k , v : (
k == ' media ' and str ( v [ ' id ' ] ) == video_id and v [ ' __typename ' ] == ' Video ' ) ) , expected_type = dict )
2022-03-08 13:54:41 -07:00
title = get_first ( media , ( ' title ' , ' text ' ) )
description = get_first ( media , ( ' creation_story ' , ' comet_sections ' , ' message ' , ' story ' , ' message ' , ' text ' ) )
2022-01-13 04:02:21 -07:00
page_title = title or self . _html_search_regex ( (
2022-01-11 09:39:49 -07:00
r ' <h2 \ s+[^>]*class= " uiHeaderTitle " [^>]*>(?P<content>[^<]*)</h2> ' ,
r ' (?s)<span class= " fbPhotosPhotoCaption " .*?id= " fbPhotoPageCaption " ><span class= " hasCaption " >(?P<content>.*?)</span> ' ,
2024-06-11 17:09:58 -06:00
self . _meta_regex ( ' og:title ' ) , self . _meta_regex ( ' twitter:title ' ) , r ' <title>(?P<content>.+?)</title> ' ,
2022-01-11 09:39:49 -07:00
) , webpage , ' title ' , default = None , group = ' content ' )
description = description or self . _html_search_meta (
2021-08-23 10:21:42 -06:00
[ ' description ' , ' og:description ' , ' twitter:description ' ] ,
webpage , ' description ' , default = None )
2024-01-29 12:43:41 -07:00
uploader_data = (
get_first ( media , ( ' owner ' , { dict } ) )
2024-02-15 12:46:57 -07:00
or get_first ( post , ( ' video ' , ' creation_story ' , ' attachments ' , . . . , ' media ' , lambda k , v : k == ' owner ' and v [ ' name ' ] ) )
2024-01-29 12:43:41 -07:00
or get_first ( post , ( . . . , ' video ' , lambda k , v : k == ' owner ' and v [ ' name ' ] ) )
or get_first ( post , ( ' node ' , ' actors ' , . . . , { dict } ) )
or get_first ( post , ( ' event ' , ' event_creator ' , { dict } ) ) or { } )
2022-01-13 04:02:21 -07:00
uploader = uploader_data . get ( ' name ' ) or (
2022-01-11 09:39:49 -07:00
clean_html ( get_element_by_id ( ' fbPhotoPageAuthorName ' , webpage ) )
or self . _search_regex (
( r ' ownerName \ s*: \ s* " ([^ " ]+) " ' , * self . _og_regexes ( ' title ' ) ) , webpage , ' uploader ' , fatal = False ) )
2021-08-23 10:21:42 -06:00
timestamp = int_or_none ( self . _search_regex (
r ' <abbr[^>]+data-utime=[ " \' ]( \ d+) ' , webpage ,
' timestamp ' , default = None ) )
thumbnail = self . _html_search_meta (
[ ' og:image ' , ' twitter:image ' ] , webpage , ' thumbnail ' , default = None )
# some webpages contain unretrievable thumbnail urls
# like https://lookaside.fbsbx.com/lookaside/crawler/media/?media_id=10155168902769113&get_thumbnail=1
# in https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/
if thumbnail and not re . search ( r ' \ .(?:jpg|png) ' , thumbnail ) :
thumbnail = None
info_dict = {
' description ' : description ,
' uploader ' : uploader ,
2022-01-13 04:02:21 -07:00
' uploader_id ' : uploader_data . get ( ' id ' ) ,
2021-08-23 10:21:42 -06:00
' timestamp ' : timestamp ,
' thumbnail ' : thumbnail ,
2023-05-08 17:19:42 -06:00
' view_count ' : parse_count ( self . _search_regex (
2024-06-11 17:09:58 -06:00
( r ' \ bviewCount \ s*: \ s*[ " \' ]([ \ d,.]+) ' , r ' video_view_count[ " \' ] \ s*: \ s*( \ d+) ' ) ,
2023-05-08 17:19:42 -06:00
webpage , ' view count ' , default = None ) ) ,
' concurrent_view_count ' : get_first ( post , (
( ' video ' , ( . . . , . . . , ' attachments ' , . . . , ' media ' ) ) , ' liveViewerCount ' , { int_or_none } ) ) ,
2021-08-23 10:21:42 -06:00
}
2022-01-13 04:02:21 -07:00
2021-08-23 10:21:42 -06:00
info_json_ld = self . _search_json_ld ( webpage , video_id , default = { } )
2022-01-13 04:02:21 -07:00
info_json_ld [ ' title ' ] = ( re . sub ( r ' \ s* \ | \ s*Facebook$ ' , ' ' , title or info_json_ld . get ( ' title ' ) or page_title or ' ' )
or ( description or ' ' ) . replace ( ' \n ' , ' ' ) or f ' Facebook video # { video_id } ' )
2021-08-23 10:21:42 -06:00
return merge_dicts ( info_json_ld , info_dict )
2016-01-30 04:30:39 -07:00
video_data = None
2017-02-01 09:15:38 -07:00
def extract_video_data ( instances ) :
2021-01-01 05:26:37 -07:00
video_data = [ ]
2017-02-01 09:15:38 -07:00
for item in instances :
2021-01-01 05:26:37 -07:00
if try_get ( item , lambda x : x [ 1 ] [ 0 ] ) == ' VideoConfig ' :
2017-02-01 09:15:38 -07:00
video_item = item [ 2 ] [ 0 ]
2017-02-10 11:04:09 -07:00
if video_item . get ( ' video_id ' ) :
2021-01-01 05:26:37 -07:00
video_data . append ( video_item [ ' videoData ' ] )
return video_data
2017-02-01 09:15:38 -07:00
2016-10-30 04:20:55 -06:00
server_js_data = self . _parse_json ( self . _search_regex (
2021-01-01 05:26:37 -07:00
[ r ' handleServerJS \ (( { .+})(?: \ );|, " ) ' , r ' \ bs \ .handle \ (( { .+?}) \ ); ' ] ,
webpage , ' server js data ' , default = ' {} ' ) , video_id , fatal = False )
2017-02-01 09:15:38 -07:00
if server_js_data :
video_data = extract_video_data ( server_js_data . get ( ' instances ' , [ ] ) )
2018-06-01 12:32:18 -06:00
def extract_from_jsmods_instances ( js_data ) :
if js_data :
return extract_video_data ( try_get (
js_data , lambda x : x [ ' jsmods ' ] [ ' instances ' ] , list ) or [ ] )
2021-01-01 05:26:37 -07:00
def extract_dash_manifest ( video , formats ) :
2024-04-20 04:23:12 -06:00
dash_manifest = traverse_obj ( video , ' dash_manifest ' , ' playlist ' , expected_type = str )
2021-01-01 05:26:37 -07:00
if dash_manifest :
formats . extend ( self . _parse_mpd_formats (
2023-08-01 08:13:54 -06:00
compat_etree_fromstring ( urllib . parse . unquote_plus ( dash_manifest ) ) ,
mpd_url = video . get ( ' dash_manifest_url ' ) ) )
2021-01-01 05:26:37 -07:00
2022-11-16 22:10:03 -07:00
def process_formats ( info ) :
2021-01-01 05:26:37 -07:00
# Downloads with browser's User-Agent are rate limited. Working around
# with non-browser User-Agent.
2022-11-16 22:10:03 -07:00
for f in info [ ' formats ' ] :
2024-01-28 11:39:14 -07:00
# Downloads with browser's User-Agent are rate limited. Working around
# with non-browser User-Agent.
2021-01-01 05:26:37 -07:00
f . setdefault ( ' http_headers ' , { } ) [ ' User-Agent ' ] = ' facebookexternalhit/1.1 '
2024-01-28 11:39:14 -07:00
# Formats larger than ~500MB will return error 403 unless chunk size is regulated
f . setdefault ( ' downloader_options ' , { } ) [ ' http_chunk_size ' ] = 250 << 20
2021-01-01 05:26:37 -07:00
def extract_relay_data ( _filter ) :
return self . _parse_json ( self . _search_regex (
2024-06-11 17:09:58 -06:00
rf ' data-sjs>( {{ .*? { _filter } .*? }} )</script> ' ,
2021-01-01 05:26:37 -07:00
webpage , ' replay data ' , default = ' {} ' ) , video_id , fatal = False ) or { }
def extract_relay_prefetched_data ( _filter ) :
2023-09-05 14:35:23 -06:00
return traverse_obj ( extract_relay_data ( _filter ) , (
' require ' , ( None , ( . . . , . . . , . . . , ' __bbox ' , ' require ' ) ) ,
2024-02-15 12:46:57 -07:00
lambda _ , v : any ( key . startswith ( ' RelayPrefetchedStreamCache ' ) for key in v ) ,
. . . , . . . , ' __bbox ' , ' result ' , ' data ' , { dict } ) , get_all = False ) or { }
2021-01-01 05:26:37 -07:00
2017-02-01 09:15:38 -07:00
if not video_data :
2021-01-01 05:26:37 -07:00
server_js_data = self . _parse_json ( self . _search_regex ( [
r ' bigPipe \ .onPageletArrive \ (( { .+?}) \ ) \ s*; \ s*} \ s* \ ) \ s*, \ s*[ " \' ]onPageletArrive \ s+ ' + self . _SUPPORTED_PAGLETS_REGEX ,
2024-06-11 17:09:58 -06:00
rf ' bigPipe \ .onPageletArrive \ (( {{ .*?id \ s*: \ s* " { self . _SUPPORTED_PAGLETS_REGEX } " .*? }} ) \ ); ' ,
2021-01-01 05:26:37 -07:00
] , webpage , ' js data ' , default = ' {} ' ) , video_id , js_to_json , False )
2018-06-01 12:32:18 -06:00
video_data = extract_from_jsmods_instances ( server_js_data )
2016-01-30 04:30:39 -07:00
2018-05-25 10:34:22 -06:00
if not video_data :
2021-01-01 05:26:37 -07:00
data = extract_relay_prefetched_data (
2023-09-05 14:35:23 -06:00
r ' " (?:dash_manifest|playable_url(?:_quality_hd)?) ' )
2021-01-01 05:26:37 -07:00
if data :
entries = [ ]
def parse_graphql_video ( video ) :
2023-07-15 15:03:23 -06:00
v_id = video . get ( ' videoId ' ) or video . get ( ' id ' ) or video_id
reel_info = traverse_obj (
video , ( ' creation_story ' , ' short_form_video_context ' , ' playback_video ' , { dict } ) )
if reel_info :
video = video [ ' creation_story ' ]
video [ ' owner ' ] = traverse_obj ( video , ( ' short_form_video_context ' , ' video_owner ' ) )
video . update ( reel_info )
2021-01-01 05:26:37 -07:00
formats = [ ]
q = qualities ( [ ' sd ' , ' hd ' ] )
2022-01-01 13:47:24 -07:00
for key , format_id in ( ( ' playable_url ' , ' sd ' ) , ( ' playable_url_quality_hd ' , ' hd ' ) ,
2023-09-05 14:35:23 -06:00
( ' playable_url_dash ' , ' ' ) , ( ' browser_native_hd_url ' , ' hd ' ) ,
( ' browser_native_sd_url ' , ' sd ' ) ) :
2022-01-01 13:47:24 -07:00
playable_url = video . get ( key )
2021-01-01 05:26:37 -07:00
if not playable_url :
continue
2022-01-01 13:47:24 -07:00
if determine_ext ( playable_url ) == ' mpd ' :
formats . extend ( self . _extract_mpd_formats ( playable_url , video_id ) )
else :
formats . append ( {
' format_id ' : format_id ,
2023-09-16 15:18:04 -06:00
# sd, hd formats w/o resolution info should be deprioritized below DASH
' quality ' : q ( format_id ) - 3 ,
2022-01-01 13:47:24 -07:00
' url ' : playable_url ,
} )
2021-01-01 05:26:37 -07:00
extract_dash_manifest ( video , formats )
2024-06-21 17:21:45 -06:00
if not formats :
# Do not append false positive entry w/o any formats
return
2024-01-29 12:43:41 -07:00
automatic_captions , subtitles = { } , { }
is_broadcast = traverse_obj ( video , ( ' is_video_broadcast ' , { bool } ) )
for caption in traverse_obj ( video , (
' video_available_captions_locales ' ,
{ lambda x : sorted ( x , key = lambda c : c [ ' locale ' ] ) } ,
2024-06-11 17:09:58 -06:00
lambda _ , v : url_or_none ( v [ ' captions_url ' ] ) ,
2024-01-29 12:43:41 -07:00
) ) :
lang = caption . get ( ' localized_language ' ) or ' und '
subs = {
' url ' : caption [ ' captions_url ' ] ,
' name ' : format_field ( caption , ' localized_country ' , f ' { lang } (%s) ' , default = lang ) ,
}
if caption . get ( ' localized_creation_method ' ) or is_broadcast :
automatic_captions . setdefault ( caption [ ' locale ' ] , [ ] ) . append ( subs )
else :
subtitles . setdefault ( caption [ ' locale ' ] , [ ] ) . append ( subs )
captions_url = traverse_obj ( video , ( ' captions_url ' , { url_or_none } ) )
if captions_url and not automatic_captions and not subtitles :
locale = self . _html_search_meta (
[ ' og:locale ' , ' twitter:locale ' ] , webpage , ' locale ' , default = ' en_US ' )
( automatic_captions if is_broadcast else subtitles ) [ locale ] = [ { ' url ' : captions_url } ]
2021-01-01 05:26:37 -07:00
info = {
' id ' : v_id ,
' formats ' : formats ,
2022-04-12 19:21:23 -06:00
' thumbnail ' : traverse_obj (
video , ( ' thumbnailImage ' , ' uri ' ) , ( ' preferred_thumbnail ' , ' image ' , ' uri ' ) ) ,
2023-07-15 15:03:23 -06:00
' uploader_id ' : traverse_obj ( video , ( ' owner ' , ' id ' , { str_or_none } ) ) ,
' timestamp ' : traverse_obj ( video , ' publish_time ' , ' creation_time ' , expected_type = int_or_none ) ,
' duration ' : ( float_or_none ( video . get ( ' playable_duration_in_ms ' ) , 1000 )
or float_or_none ( video . get ( ' length_in_second ' ) ) ) ,
2024-01-29 12:43:41 -07:00
' automatic_captions ' : automatic_captions ,
' subtitles ' : subtitles ,
2021-01-01 05:26:37 -07:00
}
2022-11-16 22:10:03 -07:00
process_formats ( info )
2021-01-01 05:26:37 -07:00
description = try_get ( video , lambda x : x [ ' savable_description ' ] [ ' text ' ] )
title = video . get ( ' name ' )
if title :
info . update ( {
' title ' : title ,
' description ' : description ,
} )
else :
2024-06-11 17:09:58 -06:00
info [ ' title ' ] = description or f ' Facebook video # { v_id } '
2021-01-01 05:26:37 -07:00
entries . append ( info )
def parse_attachment ( attachment , key = ' media ' ) :
media = attachment . get ( key ) or { }
if media . get ( ' __typename ' ) == ' Video ' :
return parse_graphql_video ( media )
2022-01-13 04:02:21 -07:00
nodes = variadic ( traverse_obj ( data , ' nodes ' , ' node ' ) or [ ] )
attachments = traverse_obj ( nodes , (
. . . , ' comet_sections ' , ' content ' , ' story ' , ( None , ' attached_story ' ) , ' attachments ' ,
2023-12-24 15:43:35 -07:00
. . . , ( ' styles ' , ' style_type_renderer ' , ( ' throwbackStyles ' , ' attachment_target_renderer ' ) ) ,
' attachment ' , { dict } ) )
2022-01-13 04:02:21 -07:00
for attachment in attachments :
2023-12-24 15:43:35 -07:00
ns = traverse_obj ( attachment , ( ' all_subattachments ' , ' nodes ' , . . . , { dict } ) ,
( ' target ' , ' attachments ' , . . . , ' styles ' , ' attachment ' , { dict } ) )
2022-01-13 04:02:21 -07:00
for n in ns :
parse_attachment ( n )
parse_attachment ( attachment )
2021-01-01 05:26:37 -07:00
edges = try_get ( data , lambda x : x [ ' mediaset ' ] [ ' currMedia ' ] [ ' edges ' ] , list ) or [ ]
for edge in edges :
parse_attachment ( edge , key = ' node ' )
2024-01-29 12:43:41 -07:00
video = traverse_obj ( data , (
' event ' , ' cover_media_renderer ' , ' cover_video ' ) , ' video ' , expected_type = dict ) or { }
2021-01-01 05:26:37 -07:00
if video :
attachments = try_get ( video , [
lambda x : x [ ' story ' ] [ ' attachments ' ] ,
2024-06-11 17:09:58 -06:00
lambda x : x [ ' creation_story ' ] [ ' attachments ' ] ,
2021-01-01 05:26:37 -07:00
] , list ) or [ ]
for attachment in attachments :
parse_attachment ( attachment )
if not entries :
parse_graphql_video ( video )
2021-08-23 10:21:42 -06:00
if len ( entries ) > 1 :
return self . playlist_result ( entries , video_id )
2023-12-24 15:43:35 -07:00
video_info = entries [ 0 ] if entries else { ' id ' : video_id }
2021-08-23 10:21:42 -06:00
webpage_info = extract_metadata ( webpage )
# honor precise duration in video info
if video_info . get ( ' duration ' ) :
webpage_info [ ' duration ' ] = video_info [ ' duration ' ]
2024-01-28 11:41:56 -07:00
# preserve preferred_thumbnail in video info
if video_info . get ( ' thumbnail ' ) :
webpage_info [ ' thumbnail ' ] = video_info [ ' thumbnail ' ]
2021-08-23 10:21:42 -06:00
return merge_dicts ( webpage_info , video_info )
2021-01-01 05:26:37 -07:00
if not video_data :
2018-06-01 12:32:18 -06:00
m_msg = re . search ( r ' class= " [^ " ]*uiInterstitialContent[^ " ]* " ><div>(.*?)</div> ' , webpage )
if m_msg is not None :
raise ExtractorError (
2024-06-11 17:09:58 -06:00
f ' The video is not available, Facebook said: " { m_msg . group ( 1 ) } " ' ,
2018-06-01 12:32:18 -06:00
expected = True )
2021-06-07 12:58:32 -06:00
elif any ( p in webpage for p in (
' >You must log in to continue ' ,
' id= " login_form " ' ,
' id= " loginbutton " ' ) ) :
2018-06-01 12:32:18 -06:00
self . raise_login_required ( )
2021-01-01 05:26:37 -07:00
if not video_data and ' /watchparty/ ' in url :
post_data = {
' doc_id ' : 3731964053542869 ,
' variables ' : json . dumps ( {
' livingRoomID ' : video_id ,
} ) ,
}
prefetched_data = extract_relay_prefetched_data ( r ' " login_data " \ s*: \ s* { ' )
if prefetched_data :
lsd = try_get ( prefetched_data , lambda x : x [ ' login_data ' ] [ ' lsd ' ] , dict )
if lsd :
post_data [ lsd [ ' name ' ] ] = lsd [ ' value ' ]
relay_data = extract_relay_data ( r ' \ [ \ s* " RelayAPIConfigDefaults " \ s*, ' )
for define in ( relay_data . get ( ' define ' ) or [ ] ) :
if define [ 0 ] == ' RelayAPIConfigDefaults ' :
self . _api_config = define [ 2 ]
living_room = self . _download_json (
urljoin ( url , self . _api_config [ ' graphURI ' ] ) , video_id ,
data = urlencode_postdata ( post_data ) ) [ ' data ' ] [ ' living_room ' ]
entries = [ ]
for edge in ( try_get ( living_room , lambda x : x [ ' recap ' ] [ ' watched_content ' ] [ ' edges ' ] ) or [ ] ) :
video = try_get ( edge , lambda x : x [ ' node ' ] [ ' video ' ] ) or { }
v_id = video . get ( ' id ' )
if not v_id :
continue
2024-06-11 17:09:58 -06:00
v_id = str ( v_id )
2021-01-01 05:26:37 -07:00
entries . append ( self . url_result (
self . _VIDEO_PAGE_TEMPLATE % v_id ,
self . ie_key ( ) , v_id , video . get ( ' name ' ) ) )
return self . playlist_result ( entries , video_id )
if not video_data :
2018-06-01 12:32:18 -06:00
# Video info not in first request, do a secondary request using
# tahoe player specific URL
2018-05-25 10:34:22 -06:00
tahoe_data = self . _download_webpage (
self . _VIDEO_PAGE_TAHOE_TEMPLATE % video_id , video_id ,
data = urlencode_postdata ( {
' __a ' : 1 ,
2018-06-01 12:32:18 -06:00
' __pc ' : self . _search_regex (
r ' pkg_cohort[ " \' ] \ s*: \ s*[ " \' ](.+?)[ " \' ] ' , webpage ,
' pkg cohort ' , default = ' PHASED:DEFAULT ' ) ,
' __rev ' : self . _search_regex (
r ' client_revision[ " \' ] \ s*: \ s*( \ d+), ' , webpage ,
' client revision ' , default = ' 3944515 ' ) ,
2018-07-22 23:20:00 -06:00
' fb_dtsg ' : self . _search_regex (
r ' " DTSGInitialData " \ s*, \ s* \ [ \ ] \ s*, \ s* { \ s* " token " \ s*: \ s* " ([^ " ]+) " ' ,
webpage , ' dtsg token ' , default = ' ' ) ,
2018-05-25 10:34:22 -06:00
} ) ,
headers = {
' Content-Type ' : ' application/x-www-form-urlencoded ' ,
} )
2018-06-01 12:32:18 -06:00
tahoe_js_data = self . _parse_json (
self . _search_regex (
r ' for \ s+ \ ( \ s*; \ s*; \ s* \ ) \ s*;(.+) ' , tahoe_data ,
' tahoe js data ' , default = ' {} ' ) ,
video_id , fatal = False )
video_data = extract_from_jsmods_instances ( tahoe_js_data )
2018-05-25 10:34:22 -06:00
2016-01-30 04:30:39 -07:00
if not video_data :
2018-06-01 12:32:18 -06:00
raise ExtractorError ( ' Cannot parse data ' )
2015-02-23 10:54:15 -07:00
2021-01-01 05:26:37 -07:00
if len ( video_data ) > 1 :
entries = [ ]
for v in video_data :
video_url = v [ 0 ] . get ( ' video_url ' )
if not video_url :
continue
entries . append ( self . url_result ( urljoin (
url , video_url ) , self . ie_key ( ) , v [ 0 ] . get ( ' video_id ' ) ) )
return self . playlist_result ( entries , video_id )
video_data = video_data [ 0 ]
2015-02-23 10:54:15 -07:00
formats = [ ]
2021-01-01 05:26:37 -07:00
subtitles = { }
2016-10-30 04:20:55 -06:00
for f in video_data :
format_id = f [ ' stream_type ' ]
2016-06-22 05:52:15 -06:00
if f and isinstance ( f , dict ) :
f = [ f ]
2015-10-21 09:35:57 -06:00
if not f or not isinstance ( f , list ) :
continue
for quality in ( ' sd ' , ' hd ' ) :
for src_type in ( ' src ' , ' src_no_ratelimit ' ) :
2024-06-11 17:09:58 -06:00
src = f [ 0 ] . get ( f ' { quality } _ { src_type } ' )
2015-10-21 09:35:57 -06:00
if src :
2023-09-16 15:18:04 -06:00
# sd, hd formats w/o resolution info should be deprioritized below DASH
# TODO: investigate if progressive or src formats still exist
preference = - 10 if format_id == ' progressive ' else - 3
2016-02-23 12:43:24 -07:00
if quality == ' hd ' :
2023-09-16 15:18:04 -06:00
preference + = 1
2015-10-21 09:35:57 -06:00
formats . append ( {
2024-06-11 17:09:58 -06:00
' format_id ' : f ' { format_id } _ { quality } _ { src_type } ' ,
2015-10-21 09:35:57 -06:00
' url ' : src ,
2021-02-18 15:03:16 -07:00
' quality ' : preference ,
2024-06-11 17:09:58 -06:00
' height ' : 720 if quality == ' hd ' else None ,
2015-10-21 09:35:57 -06:00
} )
2021-01-01 05:26:37 -07:00
extract_dash_manifest ( f [ 0 ] , formats )
2019-10-22 10:53:47 -06:00
subtitles_src = f [ 0 ] . get ( ' subtitles_src ' )
if subtitles_src :
subtitles . setdefault ( ' en ' , [ ] ) . append ( { ' url ' : subtitles_src } )
2013-06-23 12:59:45 -06:00
2016-03-09 23:33:48 -07:00
info_dict = {
2013-06-23 12:59:45 -06:00
' id ' : video_id ,
2015-02-23 10:54:15 -07:00
' formats ' : formats ,
2019-10-22 10:53:47 -06:00
' subtitles ' : subtitles ,
2013-06-23 12:59:45 -06:00
}
2022-11-16 22:10:03 -07:00
process_formats ( info_dict )
2021-08-23 10:21:42 -06:00
info_dict . update ( extract_metadata ( webpage ) )
2016-01-25 09:18:34 -07:00
2021-01-01 05:26:37 -07:00
return info_dict
2016-01-25 09:18:34 -07:00
def _real_extract(self, url):
    """Extract the video referenced by *url*.

    URLs starting with ``facebook:`` are expanded through
    ``_VIDEO_PAGE_TEMPLATE`` using the matched ID; any other URL is
    passed to ``_extract_from_url`` unchanged.
    """
    video_id = self._match_id(url)
    if url.startswith('facebook:'):
        real_url = self._VIDEO_PAGE_TEMPLATE % video_id
    else:
        real_url = url
    return self._extract_from_url(real_url, video_id)
2016-09-02 08:13:50 -06:00
class FacebookPluginsVideoIE(InfoExtractor):
    """Resolve Facebook video-plugin embed URLs.

    The embed URL carries the wrapped video URL in its ``href`` query
    parameter (captured as the ``id`` group, possibly percent-encoded).
    The decoded URL is handed over to FacebookIE.
    """
    _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/plugins/video\.php\?.*?\bhref=(?P<id>https.+)'
    _TESTS = [{
        'url': 'https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2Fgov.sg%2Fvideos%2F10154383743583686%2F&show_text=0&width=560',
        'md5': '5954e92cdfe51fe5782ae9bda7058a07',
        'info_dict': {
            'id': '10154383743583686',
            'ext': 'mp4',
            # TODO: Fix title, uploader
            'title': 'What to do during the haze?',
            'uploader': 'Gov.sg',
            'upload_date': '20160826',
            'timestamp': 1472184808,
        },
        'add_ie': [FacebookIE.ie_key()],
    }, {
        'url': 'https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2Fvideo.php%3Fv%3D10204634152394104',
        'only_matching': True,
    }, {
        'url': 'https://www.facebook.com/plugins/video.php?href=https://www.facebook.com/gov.sg/videos/10154383743583686/&show_text=0&width=560',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Percent-decode the captured ``href`` value and delegate to FacebookIE."""
        target_url = urllib.parse.unquote(self._match_id(url))
        return self.url_result(target_url, FacebookIE.ie_key())
2021-12-19 18:30:04 -07:00
class FacebookRedirectURLIE(InfoExtractor):
    """Follow Facebook ``/flx/warn`` outbound-link pages to the URL in their ``u`` parameter."""
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/flx/warn[/?]'
    _TESTS = [{
        'url': 'https://www.facebook.com/flx/warn/?h=TAQHsoToz&u=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpO8h3EaFRdo&s=1',
        'info_dict': {
            'id': 'pO8h3EaFRdo',
            'ext': 'mp4',
            'title': 'Tripeo Boiler Room x Dekmantel Festival DJ Set',
            'description': 'md5:2d713ccbb45b686a1888397b2c77ca6b',
            'channel_id': 'UCGBpxWJr9FNOcFYA5GkKrMg',
            'playable_in_embed': True,
            'categories': ['Music'],
            'channel': 'Boiler Room',
            'uploader_id': 'brtvofficial',
            'uploader': 'Boiler Room',
            'tags': 'count:11',
            'duration': 3332,
            'live_status': 'not_live',
            'thumbnail': 'https://i.ytimg.com/vi/pO8h3EaFRdo/maxresdefault.jpg',
            'channel_url': 'https://www.youtube.com/channel/UCGBpxWJr9FNOcFYA5GkKrMg',
            'availability': 'public',
            'uploader_url': 'http://www.youtube.com/user/brtvofficial',
            'upload_date': '20150917',
            'age_limit': 0,
            'view_count': int,
            'like_count': int,
        },
        'add_ie': ['Youtube'],
        'params': {'skip_download': 'Youtube'},
    }]

    def _real_extract(self, url):
        """Return a URL result for the redirect target.

        Raises:
            ExtractorError: (expected) when the ``u`` query parameter is
                missing or is not a valid URL.
        """
        # When ``u`` is repeated, the last occurrence is used
        candidates = parse_qs(url).get('u', [None])
        redirect_url = url_or_none(candidates[-1])
        if not redirect_url:
            raise ExtractorError('Invalid facebook redirect URL', expected=True)
        return self.url_result(redirect_url)
2022-08-14 14:03:24 -06:00
class FacebookReelIE(InfoExtractor):
    """Map ``/reel/<id>`` URLs onto the mobile watch page handled by FacebookIE."""
    _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/reel/(?P<id>\d+)'
    IE_NAME = 'facebook:reel'

    _TESTS = [{
        'url': 'https://www.facebook.com/reel/1195289147628387',
        'md5': 'f13dd37f2633595982db5ed8765474d3',
        'info_dict': {
            'id': '1195289147628387',
            'ext': 'mp4',
            'title': 'md5:b05800b5b1ad56c0ca78bd3807b6a61e',
            'description': 'md5:22f03309b216ac84720183961441d8db',
            'uploader': 'md5:723e6cb3091241160f20b3c5dc282af1',
            'uploader_id': '100040874179269',
            'duration': 9.579,
            'timestamp': 1637502609,
            'upload_date': '20211121',
            'thumbnail': r're:^https?://.*',
        },
    }]

    def _real_extract(self, url):
        """Build the ``m.facebook.com/watch`` URL for the reel ID and delegate to FacebookIE."""
        video_id = self._match_id(url)
        watch_url = f'https://m.facebook.com/watch/?v={video_id}&_rdr'
        return self.url_result(watch_url, FacebookIE, video_id)
2024-01-21 23:28:11 -07:00
class FacebookAdsIE(InfoExtractor):
    """Extractor for videos embedded in Facebook Ads Library pages (``/ads/library/?id=...``)."""
    _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/ads/library/?\?(?:[^#]+&)?id=(?P<id>\d+)'
    IE_NAME = 'facebook:ads'

    _TESTS = [{
        'url': 'https://www.facebook.com/ads/library/?id=899206155126718',
        'info_dict': {
            'id': '899206155126718',
            'ext': 'mp4',
            'title': 'video by Kandao',
            'uploader': 'Kandao',
            'uploader_id': '774114102743284',
            'uploader_url': r're:^https?://.*',
            'timestamp': 1702548330,
            'thumbnail': r're:^https?://.*',
            'upload_date': '20231214',
            'like_count': int,
        },
    }, {
        'url': 'https://www.facebook.com/ads/library/?id=893637265423481',
        'info_dict': {
            'id': '893637265423481',
            'title': 'Jusqu\u2019\u00e0 -25% sur une s\u00e9lection de vins p\u00e9tillants italiens',
            'uploader': 'Eataly Paris Marais',
            'uploader_id': '2086668958314152',
            'uploader_url': r're:^https?://.*',
            'timestamp': 1703571529,
            'upload_date': '20231226',
            'like_count': int,
        },
        'playlist_count': 3,
    }, {
        'url': 'https://es-la.facebook.com/ads/library/?id=901230958115569',
        'only_matching': True,
    }, {
        'url': 'https://m.facebook.com/ads/library/?id=901230958115569',
        'only_matching': True,
    }]

    # Snapshot key -> (format_id, format_note). The key order is also the
    # ranking list handed to qualities() in _extract_formats.
    _FORMATS_MAP = {
        'watermarked_video_sd_url': ('sd-wmk', 'SD, watermarked'),
        'video_sd_url': ('sd', None),
        'watermarked_video_hd_url': ('hd-wmk', 'HD, watermarked'),
        'video_hd_url': ('hd', None),
    }

    def _extract_formats(self, video_dict):
        """Build the format list for one video/card dict.

        Only entries whose key is listed in ``_FORMATS_MAP`` and whose value
        passes ``url_or_none`` are turned into formats.
        """
        # The ranking function does not depend on the entry, so build it once
        quality = qualities(tuple(self._FORMATS_MAP))
        known_urls = traverse_obj(video_dict, (
            {dict.items}, lambda _, v: v[0] in self._FORMATS_MAP and url_or_none(v[1]),
        ))
        return [{
            'format_id': self._FORMATS_MAP[format_key][0],
            'format_note': self._FORMATS_MAP[format_key][1],
            'url': format_url,
            'ext': 'mp4',
            'quality': quality(format_key),
        } for format_key, format_url in known_urls]

    def _real_extract(self, url):
        """Extract a single video or a playlist of videos from an ad page.

        Returns a plain info dict when exactly one playable video/card is
        found, a playlist when several are found, and only the ad-level
        metadata when none are found.

        Raises:
            ExtractorError: when no ad snapshot data can be located in the page.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # Each ``s.handle({...});requireLazy(`` call embeds a JSON payload;
        # unparsable payloads are tolerated (fatal=False)
        post_data = [self._parse_json(j, video_id, fatal=False)
                     for j in re.findall(r's\.handle\(({.*})\);requireLazy\(', webpage)]
        data = traverse_obj(post_data, (
            ..., 'require', ..., ..., ..., 'props', 'deeplinkAdCard', 'snapshot', {dict}), get_all=False)
        if not data:
            raise ExtractorError('Unable to extract ad data')

        title = data.get('title')
        # '{{product.name}}' is an unfilled template placeholder, not a real title
        if not title or title == '{{product.name}}':
            title = join_nonempty('display_format', 'page_name', delim=' by ', from_dict=data)

        info_dict = traverse_obj(data, {
            'description': ('link_description', {str}, {lambda x: x if x != '{{product.description}}' else None}),
            'uploader': ('page_name', {str}),
            'uploader_id': ('page_id', {str_or_none}),
            'uploader_url': ('page_profile_uri', {url_or_none}),
            'timestamp': ('creation_time', {int_or_none}),
            'like_count': ('page_like_count', {int_or_none}),
        })

        # Use .get() so that a card lacking one of the earlier keys is still
        # accepted when a later key holds a valid URL; with v[f] the lookup
        # raises KeyError for the missing key and the whole card is rejected.
        playable = traverse_obj(data, (
            ('videos', 'cards'),
            lambda _, v: any(url_or_none(v.get(f)) for f in self._FORMATS_MAP)))

        entries = [{
            'id': f'{video_id}_{idx}',
            'title': entry.get('title') or title,
            'description': entry.get('link_description') or info_dict.get('description'),
            'thumbnail': url_or_none(entry.get('video_preview_image_url')),
            'formats': self._extract_formats(entry),
        } for idx, entry in enumerate(playable, 1)]

        if len(entries) == 1:
            info_dict.update(entries[0])
        elif len(entries) > 1:
            info_dict.update({
                'title': entries[0]['title'],
                'entries': entries,
                '_type': 'playlist',
            })
        # Always report the ad ID, overriding the per-entry '<id>_<idx>' value
        info_dict['id'] = video_id

        return info_dict