Fix preview of imgur and Tenor URLs. (#11669)
This is done by scraping Open Graph information from the HTML even when an oEmbed autodiscovery endpoint is found. The results are then combined to capture as much information as possible from the page.
This commit is contained in:
parent
9eab71aa93
commit
15ffc4143c
|
@ -0,0 +1 @@
|
||||||
|
Fix preview of some gif URLs (like tenor.com). Contributed by Philippe Daouadi.
|
|
@ -35,7 +35,12 @@ When Synapse is asked to preview a URL it does the following:
|
||||||
5. If the media is HTML:
|
5. If the media is HTML:
|
||||||
1. Decodes the HTML via the stored file.
|
1. Decodes the HTML via the stored file.
|
||||||
2. Generates an Open Graph response from the HTML.
|
2. Generates an Open Graph response from the HTML.
|
||||||
3. If an image exists in the Open Graph response:
|
3. If a JSON oEmbed URL was found in the HTML via autodiscovery:
|
||||||
|
1. Downloads the URL and stores it into a file via the media storage provider
|
||||||
|
and saves the local media metadata.
|
||||||
|
2. Converts the oEmbed response to an Open Graph response.
|
||||||
|
3. Overrides any Open Graph data from the HTML with data from oEmbed.
|
||||||
|
4. If an image exists in the Open Graph response:
|
||||||
1. Downloads the URL and stores it into a file via the media storage
|
1. Downloads the URL and stores it into a file via the media storage
|
||||||
provider and saves the local media metadata.
|
provider and saves the local media metadata.
|
||||||
2. Generates thumbnails.
|
2. Generates thumbnails.
|
||||||
|
|
|
@ -33,6 +33,8 @@ logger = logging.getLogger(__name__)
|
||||||
class OEmbedResult:
|
class OEmbedResult:
|
||||||
# The Open Graph result (converted from the oEmbed result).
|
# The Open Graph result (converted from the oEmbed result).
|
||||||
open_graph_result: JsonDict
|
open_graph_result: JsonDict
|
||||||
|
# The author_name of the oEmbed result
|
||||||
|
author_name: Optional[str]
|
||||||
# Number of milliseconds to cache the content, according to the oEmbed response.
|
# Number of milliseconds to cache the content, according to the oEmbed response.
|
||||||
#
|
#
|
||||||
# This will be None if no cache-age is provided in the oEmbed response (or
|
# This will be None if no cache-age is provided in the oEmbed response (or
|
||||||
|
@ -154,11 +156,12 @@ class OEmbedProvider:
|
||||||
"og:url": url,
|
"og:url": url,
|
||||||
}
|
}
|
||||||
|
|
||||||
# Use either title or author's name as the title.
|
title = oembed.get("title")
|
||||||
title = oembed.get("title") or oembed.get("author_name")
|
|
||||||
if title:
|
if title:
|
||||||
open_graph_response["og:title"] = title
|
open_graph_response["og:title"] = title
|
||||||
|
|
||||||
|
author_name = oembed.get("author_name")
|
||||||
|
|
||||||
# Use the provider name as the site.
|
# Use the provider name as the site.
|
||||||
provider_name = oembed.get("provider_name")
|
provider_name = oembed.get("provider_name")
|
||||||
if provider_name:
|
if provider_name:
|
||||||
|
@ -193,9 +196,10 @@ class OEmbedProvider:
|
||||||
# Trap any exception and let the code follow as usual.
|
# Trap any exception and let the code follow as usual.
|
||||||
logger.warning("Error parsing oEmbed metadata from %s: %r", url, e)
|
logger.warning("Error parsing oEmbed metadata from %s: %r", url, e)
|
||||||
open_graph_response = {}
|
open_graph_response = {}
|
||||||
|
author_name = None
|
||||||
cache_age = None
|
cache_age = None
|
||||||
|
|
||||||
return OEmbedResult(open_graph_response, cache_age)
|
return OEmbedResult(open_graph_response, author_name, cache_age)
|
||||||
|
|
||||||
|
|
||||||
def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]:
|
def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]:
|
||||||
|
|
|
@ -262,6 +262,7 @@ class PreviewUrlResource(DirectServeJsonResource):
|
||||||
|
|
||||||
# The number of milliseconds that the response should be considered valid.
|
# The number of milliseconds that the response should be considered valid.
|
||||||
expiration_ms = media_info.expires
|
expiration_ms = media_info.expires
|
||||||
|
author_name: Optional[str] = None
|
||||||
|
|
||||||
if _is_media(media_info.media_type):
|
if _is_media(media_info.media_type):
|
||||||
file_id = media_info.filesystem_id
|
file_id = media_info.filesystem_id
|
||||||
|
@ -294,17 +295,25 @@ class PreviewUrlResource(DirectServeJsonResource):
|
||||||
# Check if this HTML document points to oEmbed information and
|
# Check if this HTML document points to oEmbed information and
|
||||||
# defer to that.
|
# defer to that.
|
||||||
oembed_url = self._oembed.autodiscover_from_html(tree)
|
oembed_url = self._oembed.autodiscover_from_html(tree)
|
||||||
og = {}
|
og_from_oembed: JsonDict = {}
|
||||||
if oembed_url:
|
if oembed_url:
|
||||||
oembed_info = await self._download_url(oembed_url, user)
|
oembed_info = await self._download_url(oembed_url, user)
|
||||||
og, expiration_ms = await self._handle_oembed_response(
|
(
|
||||||
|
og_from_oembed,
|
||||||
|
author_name,
|
||||||
|
expiration_ms,
|
||||||
|
) = await self._handle_oembed_response(
|
||||||
url, oembed_info, expiration_ms
|
url, oembed_info, expiration_ms
|
||||||
)
|
)
|
||||||
|
|
||||||
# If there was no oEmbed URL (or oEmbed parsing failed), attempt
|
# Parse Open Graph information from the HTML in case the oEmbed
|
||||||
# to generate the Open Graph information from the HTML.
|
# response failed or is incomplete.
|
||||||
if not oembed_url or not og:
|
og_from_html = parse_html_to_open_graph(tree, media_info.uri)
|
||||||
og = parse_html_to_open_graph(tree, media_info.uri)
|
|
||||||
|
# Compile the Open Graph response by using the scraped
|
||||||
|
# information from the HTML and overlaying any information
|
||||||
|
# from the oEmbed response.
|
||||||
|
og = {**og_from_html, **og_from_oembed}
|
||||||
|
|
||||||
await self._precache_image_url(user, media_info, og)
|
await self._precache_image_url(user, media_info, og)
|
||||||
else:
|
else:
|
||||||
|
@ -312,7 +321,7 @@ class PreviewUrlResource(DirectServeJsonResource):
|
||||||
|
|
||||||
elif oembed_url:
|
elif oembed_url:
|
||||||
# Handle the oEmbed information.
|
# Handle the oEmbed information.
|
||||||
og, expiration_ms = await self._handle_oembed_response(
|
og, author_name, expiration_ms = await self._handle_oembed_response(
|
||||||
url, media_info, expiration_ms
|
url, media_info, expiration_ms
|
||||||
)
|
)
|
||||||
await self._precache_image_url(user, media_info, og)
|
await self._precache_image_url(user, media_info, og)
|
||||||
|
@ -321,6 +330,11 @@ class PreviewUrlResource(DirectServeJsonResource):
|
||||||
logger.warning("Failed to find any OG data in %s", url)
|
logger.warning("Failed to find any OG data in %s", url)
|
||||||
og = {}
|
og = {}
|
||||||
|
|
||||||
|
# If we don't have a title but we have author_name, copy it as
|
||||||
|
# title
|
||||||
|
if not og.get("og:title") and author_name:
|
||||||
|
og["og:title"] = author_name
|
||||||
|
|
||||||
# filter out any stupidly long values
|
# filter out any stupidly long values
|
||||||
keys_to_remove = []
|
keys_to_remove = []
|
||||||
for k, v in og.items():
|
for k, v in og.items():
|
||||||
|
@ -484,7 +498,7 @@ class PreviewUrlResource(DirectServeJsonResource):
|
||||||
|
|
||||||
async def _handle_oembed_response(
|
async def _handle_oembed_response(
|
||||||
self, url: str, media_info: MediaInfo, expiration_ms: int
|
self, url: str, media_info: MediaInfo, expiration_ms: int
|
||||||
) -> Tuple[JsonDict, int]:
|
) -> Tuple[JsonDict, Optional[str], int]:
|
||||||
"""
|
"""
|
||||||
Parse the downloaded oEmbed info.
|
Parse the downloaded oEmbed info.
|
||||||
|
|
||||||
|
@ -497,11 +511,12 @@ class PreviewUrlResource(DirectServeJsonResource):
|
||||||
Returns:
|
Returns:
|
||||||
A tuple of:
|
A tuple of:
|
||||||
The Open Graph dictionary, if the oEmbed info can be parsed.
|
The Open Graph dictionary, if the oEmbed info can be parsed.
|
||||||
|
The author name if it could be retrieved from oEmbed.
|
||||||
The (possibly updated) length of time, in milliseconds, the media is valid for.
|
The (possibly updated) length of time, in milliseconds, the media is valid for.
|
||||||
"""
|
"""
|
||||||
# If JSON was not returned, there's nothing to do.
|
# If JSON was not returned, there's nothing to do.
|
||||||
if not _is_json(media_info.media_type):
|
if not _is_json(media_info.media_type):
|
||||||
return {}, expiration_ms
|
return {}, None, expiration_ms
|
||||||
|
|
||||||
with open(media_info.filename, "rb") as file:
|
with open(media_info.filename, "rb") as file:
|
||||||
body = file.read()
|
body = file.read()
|
||||||
|
@ -513,7 +528,7 @@ class PreviewUrlResource(DirectServeJsonResource):
|
||||||
if open_graph_result and oembed_response.cache_age is not None:
|
if open_graph_result and oembed_response.cache_age is not None:
|
||||||
expiration_ms = oembed_response.cache_age
|
expiration_ms = oembed_response.cache_age
|
||||||
|
|
||||||
return open_graph_result, expiration_ms
|
return open_graph_result, oembed_response.author_name, expiration_ms
|
||||||
|
|
||||||
def _start_expire_url_cache_data(self) -> Deferred:
|
def _start_expire_url_cache_data(self) -> Deferred:
|
||||||
return run_as_background_process(
|
return run_as_background_process(
|
||||||
|
|
Loading…
Reference in New Issue