fix typos and needless try/except from PR review

Matthew Hodgson 2016-04-11 10:39:16 +01:00
parent 83b2f83da0
commit 5ffacc5e84
1 changed file with 137 additions and 140 deletions
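The two fixes named in the commit message are mechanical. Python's built-in exception class is spelled RuntimeError; the old spelling RunTimeError is not a defined name, so the raise itself would have failed with a NameError instead of the intended error. The wrapper deleted from _async_render_GET is the classic needless try/except: catching every exception only to re-raise it. A minimal illustrative sketch of that pattern (hypothetical names, not code from the patch):

def do_work():
    # stand-in for the real request handling
    raise ValueError("boom")

def wrapped():
    # the pattern this commit removes: the except clause adds nothing, and on
    # Python 2 "raise e" restarts the traceback at this line (a bare "raise"
    # would at least preserve the original one)
    try:
        do_work()
    except Exception as e:
        raise e

def unwrapped():
    # after the cleanup the body runs un-wrapped and exceptions propagate
    # with their full traceback
    do_work()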


@@ -54,7 +54,7 @@ class PreviewUrlResource(BaseMediaResource):
             if html:
                 pass
         except:
-            raise RunTimeError("Disabling PreviewUrlResource as lxml not available")
+            raise RuntimeError("Disabling PreviewUrlResource as lxml not available")

         if not hasattr(hs.config, "url_preview_ip_range_blacklist"):
             logger.warn(
@@ -62,7 +62,7 @@ class PreviewUrlResource(BaseMediaResource):
                 "blacklist in url_preview_ip_range_blacklist for url previewing "
                 "to work"
             )
-            raise RunTimeError(
+            raise RuntimeError(
                 "Disabling PreviewUrlResource as "
                 "url_preview_ip_range_blacklist not specified"
             )
@@ -91,157 +91,154 @@ class PreviewUrlResource(BaseMediaResource):

     @defer.inlineCallbacks
     def _async_render_GET(self, request):

-        try:
-            # XXX: if get_user_by_req fails, what should we do in an async render?
-            requester = yield self.auth.get_user_by_req(request)
-            url = request.args.get("url")[0]
-            if "ts" in request.args:
-                ts = int(request.args.get("ts")[0])
-            else:
-                ts = self.clock.time_msec()
+        # XXX: if get_user_by_req fails, what should we do in an async render?
+        requester = yield self.auth.get_user_by_req(request)
+        url = request.args.get("url")[0]
+        if "ts" in request.args:
+            ts = int(request.args.get("ts")[0])
+        else:
+            ts = self.clock.time_msec()

-            # impose the URL pattern blacklist
-            if hasattr(self, "url_preview_url_blacklist"):
-                url_tuple = urlsplit(url)
-                for entry in self.url_preview_url_blacklist:
-                    match = True
-                    for attrib in entry:
-                        pattern = entry[attrib]
-                        value = getattr(url_tuple, attrib)
-                        logger.debug((
-                            "Matching attrib '%s' with value '%s' against"
-                            " pattern '%s'"
-                        ) % (attrib, value, pattern))
+        # impose the URL pattern blacklist
+        if hasattr(self, "url_preview_url_blacklist"):
+            url_tuple = urlsplit(url)
+            for entry in self.url_preview_url_blacklist:
+                match = True
+                for attrib in entry:
+                    pattern = entry[attrib]
+                    value = getattr(url_tuple, attrib)
+                    logger.debug((
+                        "Matching attrib '%s' with value '%s' against"
+                        " pattern '%s'"
+                    ) % (attrib, value, pattern))

-                        if value is None:
-                            match = False
-                            continue
+                    if value is None:
+                        match = False
+                        continue

-                        if pattern.startswith('^'):
-                            if not re.match(pattern, getattr(url_tuple, attrib)):
-                                match = False
-                                continue
-                        else:
-                            if not fnmatch.fnmatch(getattr(url_tuple, attrib), pattern):
-                                match = False
-                                continue
-                    if match:
-                        logger.warn(
-                            "URL %s blocked by url_blacklist entry %s", url, entry
-                        )
-                        raise SynapseError(
-                            403, "URL blocked by url pattern blacklist entry",
-                            Codes.UNKNOWN
-                        )
+                    if pattern.startswith('^'):
+                        if not re.match(pattern, getattr(url_tuple, attrib)):
+                            match = False
+                            continue
+                    else:
+                        if not fnmatch.fnmatch(getattr(url_tuple, attrib), pattern):
+                            match = False
+                            continue
+                if match:
+                    logger.warn(
+                        "URL %s blocked by url_blacklist entry %s", url, entry
+                    )
+                    raise SynapseError(
+                        403, "URL blocked by url pattern blacklist entry",
+                        Codes.UNKNOWN
+                    )

-            # first check the memory cache - good to handle all the clients on this
-            # HS thundering away to preview the same URL at the same time.
-            og = self.cache.get(url)
-            if og:
-                respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True)
-                return
+        # first check the memory cache - good to handle all the clients on this
+        # HS thundering away to preview the same URL at the same time.
+        og = self.cache.get(url)
+        if og:
+            respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True)
+            return

-            # then check the URL cache in the DB (which will also provide us with
-            # historical previews, if we have any)
-            cache_result = yield self.store.get_url_cache(url, ts)
-            if (
-                cache_result and
-                cache_result["download_ts"] + cache_result["expires"] > ts and
-                cache_result["response_code"] / 100 == 2
-            ):
-                respond_with_json_bytes(
-                    request, 200, cache_result["og"].encode('utf-8'),
-                    send_cors=True
-                )
-                return
+        # then check the URL cache in the DB (which will also provide us with
+        # historical previews, if we have any)
+        cache_result = yield self.store.get_url_cache(url, ts)
+        if (
+            cache_result and
+            cache_result["download_ts"] + cache_result["expires"] > ts and
+            cache_result["response_code"] / 100 == 2
+        ):
+            respond_with_json_bytes(
+                request, 200, cache_result["og"].encode('utf-8'),
+                send_cors=True
+            )
+            return

-            # Ensure only one download for a given URL is active at a time
-            download = self.downloads.get(url)
-            if download is None:
-                download = self._download_url(url, requester.user)
-                download = ObservableDeferred(
-                    download,
-                    consumeErrors=True
-                )
-                self.downloads[url] = download
+        # Ensure only one download for a given URL is active at a time
+        download = self.downloads.get(url)
+        if download is None:
+            download = self._download_url(url, requester.user)
+            download = ObservableDeferred(
+                download,
+                consumeErrors=True
+            )
+            self.downloads[url] = download

-                @download.addBoth
-                def callback(media_info):
-                    del self.downloads[url]
-                    return media_info
-            media_info = yield download.observe()
+            @download.addBoth
+            def callback(media_info):
+                del self.downloads[url]
+                return media_info
+        media_info = yield download.observe()

-            # FIXME: we should probably update our cache now anyway, so that
-            # even if the OG calculation raises, we don't keep hammering on the
-            # remote server. For now, leave it uncached to aid debugging OG
-            # calculation problems
+        # FIXME: we should probably update our cache now anyway, so that
+        # even if the OG calculation raises, we don't keep hammering on the
+        # remote server. For now, leave it uncached to aid debugging OG
+        # calculation problems

-            logger.debug("got media_info of '%s'" % media_info)
+        logger.debug("got media_info of '%s'" % media_info)

-            if self._is_media(media_info['media_type']):
-                dims = yield self._generate_local_thumbnails(
-                    media_info['filesystem_id'], media_info
-                )
+        if self._is_media(media_info['media_type']):
+            dims = yield self._generate_local_thumbnails(
+                media_info['filesystem_id'], media_info
+            )

-                og = {
-                    "og:description": media_info['download_name'],
-                    "og:image": "mxc://%s/%s" % (
-                        self.server_name, media_info['filesystem_id']
-                    ),
-                    "og:image:type": media_info['media_type'],
-                    "matrix:image:size": media_info['media_length'],
-                }
+            og = {
+                "og:description": media_info['download_name'],
+                "og:image": "mxc://%s/%s" % (
+                    self.server_name, media_info['filesystem_id']
+                ),
+                "og:image:type": media_info['media_type'],
+                "matrix:image:size": media_info['media_length'],
+            }

-                if dims:
-                    og["og:image:width"] = dims['width']
-                    og["og:image:height"] = dims['height']
-                else:
-                    logger.warn("Couldn't get dims for %s" % url)
+            if dims:
+                og["og:image:width"] = dims['width']
+                og["og:image:height"] = dims['height']
+            else:
+                logger.warn("Couldn't get dims for %s" % url)

-                # define our OG response for this media
-            elif self._is_html(media_info['media_type']):
-                # TODO: somehow stop a big HTML tree from exploding synapse's RAM
+            # define our OG response for this media
+        elif self._is_html(media_info['media_type']):
+            # TODO: somehow stop a big HTML tree from exploding synapse's RAM

-                try:
-                    tree = html.parse(media_info['filename'])
-                    og = yield self._calc_og(tree, media_info, requester)
-                except UnicodeDecodeError:
-                    # XXX: evil evil bodge
-                    # Empirically, sites like google.com mix Latin-1 and utf-8
-                    # encodings in the same page. The rogue Latin-1 characters
-                    # cause lxml to choke with a UnicodeDecodeError, so if we
-                    # see this we go and do a manual decode of the HTML before
-                    # handing it to lxml as utf-8 encoding, counter-intuitively,
-                    # which seems to make it happier...
-                    file = open(media_info['filename'])
-                    body = file.read()
-                    file.close()
-                    tree = html.fromstring(body.decode('utf-8', 'ignore'))
-                    og = yield self._calc_og(tree, media_info, requester)
+            try:
+                tree = html.parse(media_info['filename'])
+                og = yield self._calc_og(tree, media_info, requester)
+            except UnicodeDecodeError:
+                # XXX: evil evil bodge
+                # Empirically, sites like google.com mix Latin-1 and utf-8
+                # encodings in the same page. The rogue Latin-1 characters
+                # cause lxml to choke with a UnicodeDecodeError, so if we
+                # see this we go and do a manual decode of the HTML before
+                # handing it to lxml as utf-8 encoding, counter-intuitively,
+                # which seems to make it happier...
+                file = open(media_info['filename'])
+                body = file.read()
+                file.close()
+                tree = html.fromstring(body.decode('utf-8', 'ignore'))
+                og = yield self._calc_og(tree, media_info, requester)

-            else:
-                logger.warn("Failed to find any OG data in %s", url)
-                og = {}
+        else:
+            logger.warn("Failed to find any OG data in %s", url)
+            og = {}

-            logger.debug("Calculated OG for %s as %s" % (url, og))
+        logger.debug("Calculated OG for %s as %s" % (url, og))

-            # store OG in ephemeral in-memory cache
-            self.cache[url] = og
+        # store OG in ephemeral in-memory cache
+        self.cache[url] = og

-            # store OG in history-aware DB cache
-            yield self.store.store_url_cache(
-                url,
-                media_info["response_code"],
-                media_info["etag"],
-                media_info["expires"],
-                json.dumps(og),
-                media_info["filesystem_id"],
-                media_info["created_ts"],
-            )
+        # store OG in history-aware DB cache
+        yield self.store.store_url_cache(
+            url,
+            media_info["response_code"],
+            media_info["etag"],
+            media_info["expires"],
+            json.dumps(og),
+            media_info["filesystem_id"],
+            media_info["created_ts"],
+        )

-            respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True)
-        except Exception as e:
-            raise e
+        respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True)

     @defer.inlineCallbacks