From 84f9cac4d0a7f19b432e683981f66c20339a60f5 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Fri, 15 Apr 2016 13:19:57 +0100 Subject: [PATCH] fix cyrillic URL previews by hardcoding all page decoding to UTF-8 for now, rather than relying on lxml's heuristics which seem to get it wrong --- synapse/rest/media/v1/preview_url_resource.py | 24 +++++++------------ 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index c27ba72735..7e937b0446 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -181,22 +181,14 @@ class PreviewUrlResource(BaseMediaResource): from lxml import html - try: - tree = html.parse(media_info['filename']) - og = yield self._calc_og(tree, media_info, requester) - except UnicodeDecodeError: - # XXX: evil evil bodge - # Empirically, sites like google.com mix Latin-1 and utf-8 - # encodings in the same page. The rogue Latin-1 characters - # cause lxml to choke with a UnicodeDecodeError, so if we - # see this we go and do a manual decode of the HTML before - # handing it to lxml as utf-8 encoding, counter-intuitively, - # which seems to make it happier... - file = open(media_info['filename']) - body = file.read() - file.close() - tree = html.fromstring(body.decode('utf-8', 'ignore')) - og = yield self._calc_og(tree, media_info, requester) + # XXX: always manually try to decode body as utf-8 first, which + # seems to help with most character encoding woes. + # XXX: handle non-utf-8 encodings? + file = open(media_info['filename']) + body = file.read() + file.close() + tree = html.fromstring(body.decode('utf-8', 'ignore')) + og = yield self._calc_og(tree, media_info, requester) else: logger.warn("Failed to find any OG data in %s", url)