Try to recover from unknown encodings when previewing media. (#9164)
Treat unknown encodings (according to lxml) as UTF-8 when generating a preview for HTML documents. This isn't fully accurate, but will hopefully give a reasonable title and summary.
This commit is contained in:
parent
e74bb96733
commit
4937fe3d6b
|
@ -0,0 +1 @@
|
||||||
|
Fix a long-standing bug where an internal server error was raised when attempting to preview an HTML document in an unknown character encoding.
|
|
@ -386,7 +386,7 @@ class PreviewUrlResource(DirectServeJsonResource):
|
||||||
"""
|
"""
|
||||||
Check whether the URL should be downloaded as oEmbed content instead.
|
Check whether the URL should be downloaded as oEmbed content instead.
|
||||||
|
|
||||||
Params:
|
Args:
|
||||||
url: The URL to check.
|
url: The URL to check.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
@ -403,7 +403,7 @@ class PreviewUrlResource(DirectServeJsonResource):
|
||||||
"""
|
"""
|
||||||
Request content from an oEmbed endpoint.
|
Request content from an oEmbed endpoint.
|
||||||
|
|
||||||
Params:
|
Args:
|
||||||
endpoint: The oEmbed API endpoint.
|
endpoint: The oEmbed API endpoint.
|
||||||
url: The URL to pass to the API.
|
url: The URL to pass to the API.
|
||||||
|
|
||||||
|
@ -692,27 +692,51 @@ class PreviewUrlResource(DirectServeJsonResource):
|
||||||
def decode_and_calc_og(
|
def decode_and_calc_og(
|
||||||
body: bytes, media_uri: str, request_encoding: Optional[str] = None
|
body: bytes, media_uri: str, request_encoding: Optional[str] = None
|
||||||
) -> Dict[str, Optional[str]]:
|
) -> Dict[str, Optional[str]]:
|
||||||
|
"""
|
||||||
|
Calculate metadata for an HTML document.
|
||||||
|
|
||||||
|
This uses lxml to parse the HTML document into the OG response. If errors
|
||||||
|
occur during processing of the document, an empty response is returned.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
body: The HTML document, as bytes.
|
||||||
|
media_uri: The URI used to download the body.
|
||||||
|
request_encoding: The character encoding of the body, as a string.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The OG response as a dictionary.
|
||||||
|
"""
|
||||||
# If there's no body, nothing useful is going to be found.
|
# If there's no body, nothing useful is going to be found.
|
||||||
if not body:
|
if not body:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
|
# Create an HTML parser. If this fails, log and return no metadata.
|
||||||
try:
|
try:
|
||||||
parser = etree.HTMLParser(recover=True, encoding=request_encoding)
|
parser = etree.HTMLParser(recover=True, encoding=request_encoding)
|
||||||
tree = etree.fromstring(body, parser)
|
except LookupError:
|
||||||
og = _calc_og(tree, media_uri)
|
# blindly consider the encoding as utf-8.
|
||||||
|
parser = etree.HTMLParser(recover=True, encoding="utf-8")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Unable to create HTML parser: %s" % (e,))
|
||||||
|
return {}
|
||||||
|
|
||||||
|
def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]:
|
||||||
|
# Parse the body and calculate the OG metadata; any parsing error propagates to the caller.
|
||||||
|
tree = etree.fromstring(body_attempt, parser)
|
||||||
|
return _calc_og(tree, media_uri)
|
||||||
|
|
||||||
|
# Attempt to parse the body as-is. If that raises a UnicodeDecodeError, retry with the body decoded as UTF-8.
|
||||||
|
try:
|
||||||
|
return _attempt_calc_og(body)
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
# blindly try decoding the body as utf-8, which seems to fix
|
# blindly try decoding the body as utf-8, which seems to fix
|
||||||
# the charset mismatches on https://google.com
|
# the charset mismatches on https://google.com
|
||||||
parser = etree.HTMLParser(recover=True, encoding=request_encoding)
|
return _attempt_calc_og(body.decode("utf-8", "ignore"))
|
||||||
tree = etree.fromstring(body.decode("utf-8", "ignore"), parser)
|
|
||||||
og = _calc_og(tree, media_uri)
|
|
||||||
|
|
||||||
return og
|
|
||||||
|
|
||||||
|
|
||||||
def _calc_og(tree, media_uri: str) -> Dict[str, Optional[str]]:
|
def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
|
||||||
# suck our tree into lxml and define our OG response.
|
# suck our tree into lxml and define our OG response.
|
||||||
|
|
||||||
# if we see any image URLs in the OG response, then spider them
|
# if we see any image URLs in the OG response, then spider them
|
||||||
|
|
|
@ -261,3 +261,32 @@ class PreviewUrlTestCase(unittest.TestCase):
|
||||||
html = ""
|
html = ""
|
||||||
og = decode_and_calc_og(html, "http://example.com/test.html")
|
og = decode_and_calc_og(html, "http://example.com/test.html")
|
||||||
self.assertEqual(og, {})
|
self.assertEqual(og, {})
|
||||||
|
|
||||||
|
def test_invalid_encoding(self):
|
||||||
|
"""An invalid character encoding should be ignored and treated as UTF-8, if possible."""
|
||||||
|
html = """
|
||||||
|
<html>
|
||||||
|
<head><title>Foo</title></head>
|
||||||
|
<body>
|
||||||
|
Some text.
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
og = decode_and_calc_og(
|
||||||
|
html, "http://example.com/test.html", "invalid-encoding"
|
||||||
|
)
|
||||||
|
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
|
||||||
|
|
||||||
|
def test_invalid_encoding2(self):
|
||||||
|
"""A body which doesn't match the sent character encoding."""
|
||||||
|
# Note that this contains an invalid UTF-8 sequence in the title.
|
||||||
|
html = b"""
|
||||||
|
<html>
|
||||||
|
<head><title>\xff\xff Foo</title></head>
|
||||||
|
<body>
|
||||||
|
Some text.
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
og = decode_and_calc_og(html, "http://example.com/test.html")
|
||||||
|
self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})
|
||||||
|
|
Loading…
Reference in New Issue