Try to recover from unknown encodings when previewing media. (#9164)

Treat unknown encodings (according to lxml) as UTF-8 when generating a preview for HTML documents. This isn't fully accurate, but will hopefully give a reasonable title and summary.
2021-01-26 07:32:17 -05:00 · 2021-01-26 07:32:17 -05:00 · 4937fe3d6b
parent e74bb96733
commit 4937fe3d6b
3 changed files with 64 additions and 10 deletions
--- a/changelog.d/9164.bugfix
+++ b/changelog.d/9164.bugfix
@@ -0,0 +1 @@
 Fix a long-standing bug where an internal server error was raised when attempting to preview an HTML document in an unknown character encoding.
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -386,7 +386,7 @@ class PreviewUrlResource(DirectServeJsonResource):
        """
        Check whether the URL should be downloaded as oEmbed content instead.
-        Params:
+        Args:
            url: The URL to check.
        Returns:
@@ -403,7 +403,7 @@ class PreviewUrlResource(DirectServeJsonResource):
        """
        Request content from an oEmbed endpoint.
-        Params:
+        Args:
            endpoint: The oEmbed API endpoint.
            url: The URL to pass to the API.
@@ -692,27 +692,51 @@ class PreviewUrlResource(DirectServeJsonResource):
 def decode_and_calc_og(
    body: bytes, media_uri: str, request_encoding: Optional[str] = None
 ) -> Dict[str, Optional[str]]:
    """
    Calculate metadata for an HTML document.
    This uses lxml to parse the HTML document into the OG response. If errors
    occur during processing of the document, an empty response is returned.
    Args:
        body: The HTML document, as bytes.
        media_url: The URI used to download the body.
        request_encoding: The character encoding of the body, as a string.
    Returns:
        The OG response as a dictionary.
    """
    # If there's no body, nothing useful is going to be found.
    if not body:
        return {}
    from lxml import etree
    # Create an HTML parser. If this fails, log and return no metadata.
    try:
        parser = etree.HTMLParser(recover=True, encoding=request_encoding)
-        tree = etree.fromstring(body, parser)
+    except LookupError:
-        og = _calc_og(tree, media_uri)
+        # blindly consider the encoding as utf-8.
        parser = etree.HTMLParser(recover=True, encoding="utf-8")
    except Exception as e:
        logger.warning("Unable to create HTML parser: %s" % (e,))
        return {}
    def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]:
        # Attempt to parse the body. If this fails, log and return no metadata.
        tree = etree.fromstring(body_attempt, parser)
        return _calc_og(tree, media_uri)
    # Attempt to parse the body. If this fails, log and return no metadata.
    try:
        return _attempt_calc_og(body)
    except UnicodeDecodeError:
        # blindly try decoding the body as utf-8, which seems to fix
        # the charset mismatches on https://google.com
-        parser = etree.HTMLParser(recover=True, encoding=request_encoding)
+        return _attempt_calc_og(body.decode("utf-8", "ignore"))
-        tree = etree.fromstring(body.decode("utf-8", "ignore"), parser)
-        og = _calc_og(tree, media_uri)
-    return og
-def _calc_og(tree, media_uri: str) -> Dict[str, Optional[str]]:
+def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
    # suck our tree into lxml and define our OG response.
    # if we see any image URLs in the OG response, then spider them
--- a/tests/test_preview.py
+++ b/tests/test_preview.py
@@ -261,3 +261,32 @@ class PreviewUrlTestCase(unittest.TestCase):
        html = ""
        og = decode_and_calc_og(html, "http://example.com/test.html")
        self.assertEqual(og, {})
    def test_invalid_encoding(self):
        """An invalid character encoding should be ignored and treated as UTF-8, if possible."""
        html = """
        <html>
        <head><title>Foo</title></head>
        <body>
        Some text.
        </body>
        </html>
        """
        og = decode_and_calc_og(
            html, "http://example.com/test.html", "invalid-encoding"
        )
        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
    def test_invalid_encoding2(self):
        """A body which doesn't match the sent character encoding."""
        # Note that this contains an invalid UTF-8 sequence in the title.
        html = b"""
        <html>
        <head><title>\xff\xff Foo</title></head>
        <body>
        Some text.
        </body>
        </html>
        """
        og = decode_and_calc_og(html, "http://example.com/test.html")
        self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})
		`@ -0,0 +1 @@`
							`Fix a long-standing bug where an internal server error was raised when attempting to preview an HTML document in an unknown character encoding.`