Explicitly pass the charset from the Content-Type header to lxml to better fix Cyrillic encoding woes
This commit is contained in:
parent
84f9cac4d0
commit
aaabbd3e9e
|
@ -179,15 +179,27 @@ class PreviewUrlResource(BaseMediaResource):
|
||||||
elif self._is_html(media_info['media_type']):
|
elif self._is_html(media_info['media_type']):
|
||||||
# TODO: somehow stop a big HTML tree from exploding synapse's RAM
|
# TODO: somehow stop a big HTML tree from exploding synapse's RAM
|
||||||
|
|
||||||
from lxml import html
|
from lxml import etree
|
||||||
|
|
||||||
# XXX: always manually try to decode body as utf-8 first, which
|
|
||||||
# seems to help with most character encoding woes.
|
|
||||||
# XXX: handle non-utf-8 encodings?
|
|
||||||
file = open(media_info['filename'])
|
file = open(media_info['filename'])
|
||||||
body = file.read()
|
body = file.read()
|
||||||
file.close()
|
file.close()
|
||||||
tree = html.fromstring(body.decode('utf-8', 'ignore'))
|
|
||||||
|
# clobber the encoding from the content-type, or default to utf-8
|
||||||
|
# XXX: this overrides any <meta/> or XML charset headers in the body
|
||||||
|
# which may pose problems, but so far seems to work okay.
|
||||||
|
match = re.match(r'.*; *charset=(.*?)(;|$)', media_info['media_type'], re.I)
|
||||||
|
encoding = match.group(1) if match else "utf-8"
|
||||||
|
|
||||||
|
try:
|
||||||
|
parser = etree.HTMLParser(recover=True, encoding=encoding)
|
||||||
|
tree = etree.fromstring(body, parser)
|
||||||
|
og = yield self._calc_og(tree, media_info, requester)
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
# blindly try decoding the body as utf-8, which seems to fix
|
||||||
|
# the charset mismatches on https://google.com
|
||||||
|
parser = etree.HTMLParser(recover=True, encoding=encoding)
|
||||||
|
tree = etree.fromstring(body.decode('utf-8', 'ignore'), parser)
|
||||||
og = yield self._calc_og(tree, media_info, requester)
|
og = yield self._calc_og(tree, media_info, requester)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
|
Loading…
Reference in New Issue