This commit is contained in:
Matthew Hodgson 2016-04-03 12:56:29 +01:00
parent eab4d462f8
commit 8b98a7e8c3
4 changed files with 85 additions and 60 deletions

View File

@ -15,7 +15,9 @@
from OpenSSL import SSL from OpenSSL import SSL
from OpenSSL.SSL import VERIFY_NONE from OpenSSL.SSL import VERIFY_NONE
from synapse.api.errors import CodeMessageException from synapse.api.errors import (
CodeMessageException, SynapseError, Codes,
)
from synapse.util.logcontext import preserve_context_over_fn from synapse.util.logcontext import preserve_context_over_fn
import synapse.metrics import synapse.metrics
@ -331,6 +333,7 @@ def _readBodyToFile(response, stream, max_size):
response.deliverBody(_ReadBodyToFileProtocol(stream, d, max_size)) response.deliverBody(_ReadBodyToFileProtocol(stream, d, max_size))
return d return d
class CaptchaServerHttpClient(SimpleHttpClient): class CaptchaServerHttpClient(SimpleHttpClient):
""" """
Separate HTTP client for talking to google's captcha servers Separate HTTP client for talking to google's captcha servers
@ -360,6 +363,7 @@ class CaptchaServerHttpClient(SimpleHttpClient):
# twisted dislikes google's response, no content length. # twisted dislikes google's response, no content length.
defer.returnValue(e.response) defer.returnValue(e.response)
class SpiderHttpClient(SimpleHttpClient): class SpiderHttpClient(SimpleHttpClient):
""" """
Separate HTTP client for spidering arbitrary URLs. Separate HTTP client for spidering arbitrary URLs.
@ -376,8 +380,10 @@ class SpiderHttpClient(SimpleHttpClient):
connectTimeout=15, connectTimeout=15,
contextFactory=hs.get_http_client_context_factory() contextFactory=hs.get_http_client_context_factory()
)), [('gzip', GzipDecoder)]) )), [('gzip', GzipDecoder)])
# Look like Chrome for now # We could look like Chrome:
#self.user_agent = ("Mozilla/5.0 (%s) (KHTML, like Gecko) Chrome Safari" % hs.version_string) # self.user_agent = ("Mozilla/5.0 (%s) (KHTML, like Gecko)
# Chrome Safari" % hs.version_string)
def encode_urlencode_args(args): def encode_urlencode_args(args):
return {k: encode_urlencode_arg(v) for k, v in args.items()} return {k: encode_urlencode_arg(v) for k, v in args.items()}

View File

@ -80,4 +80,3 @@ class MediaRepositoryResource(Resource):
self.putChild("thumbnail", ThumbnailResource(hs, filepaths)) self.putChild("thumbnail", ThumbnailResource(hs, filepaths))
self.putChild("identicon", IdenticonResource()) self.putChild("identicon", IdenticonResource())
self.putChild("preview_url", PreviewUrlResource(hs, filepaths)) self.putChild("preview_url", PreviewUrlResource(hs, filepaths))

View File

@ -13,25 +13,31 @@
# limitations under the License. # limitations under the License.
from .base_resource import BaseMediaResource from .base_resource import BaseMediaResource
from synapse.api.errors import Codes
from twisted.web.resource import Resource
from twisted.web.server import NOT_DONE_YET from twisted.web.server import NOT_DONE_YET
from twisted.internet import defer from twisted.internet import defer
from lxml import html from lxml import html
from urlparse import urlparse, urlunparse from urlparse import urlparse, urlunparse
from synapse.api.errors import Codes
from synapse.util.stringutils import random_string from synapse.util.stringutils import random_string
from synapse.util.caches.expiringcache import ExpiringCache from synapse.util.caches.expiringcache import ExpiringCache
from synapse.http.client import SpiderHttpClient from synapse.http.client import SpiderHttpClient
from synapse.http.server import request_handler, respond_with_json, respond_with_json_bytes from synapse.http.server import (
request_handler, respond_with_json, respond_with_json_bytes
)
from synapse.util.async import ObservableDeferred from synapse.util.async import ObservableDeferred
from synapse.util.stringutils import is_ascii
import os import os
import re import re
import cgi
import ujson as json import ujson as json
import logging import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class PreviewUrlResource(BaseMediaResource): class PreviewUrlResource(BaseMediaResource):
isLeaf = True isLeaf = True
@ -43,7 +49,8 @@ class PreviewUrlResource(BaseMediaResource):
self.cache = ExpiringCache( self.cache = ExpiringCache(
cache_name="url_previews", cache_name="url_previews",
clock=self.clock, clock=self.clock,
expiry_ms = 60*60*1000, # don't spider URLs more often than once an hour # don't spider URLs more often than once an hour
expiry_ms=60 * 60 * 1000,
) )
self.cache.start() self.cache.start()
@ -61,7 +68,10 @@ class PreviewUrlResource(BaseMediaResource):
# XXX: if get_user_by_req fails, what should we do in an async render? # XXX: if get_user_by_req fails, what should we do in an async render?
requester = yield self.auth.get_user_by_req(request) requester = yield self.auth.get_user_by_req(request)
url = request.args.get("url")[0] url = request.args.get("url")[0]
ts = int(request.args.get("ts")[0]) if "ts" in request.args else self.clock.time_msec() if "ts" in request.args:
ts = int(request.args.get("ts")[0])
else:
ts = self.clock.time_msec()
# first check the memory cache - good to handle all the clients on this # first check the memory cache - good to handle all the clients on this
# HS thundering away to preview the same URL at the same time. # HS thundering away to preview the same URL at the same time.
@ -98,7 +108,7 @@ class PreviewUrlResource(BaseMediaResource):
@download.addBoth @download.addBoth
def callback(media_info): def callback(media_info):
del self.downloads[key] del self.downloads[url]
return media_info return media_info
media_info = yield download.observe() media_info = yield download.observe()
@ -116,7 +126,9 @@ class PreviewUrlResource(BaseMediaResource):
og = { og = {
"og:description": media_info['download_name'], "og:description": media_info['download_name'],
"og:image" : "mxc://%s/%s" % (self.server_name, media_info['filesystem_id']), "og:image": "mxc://%s/%s" % (
self.server_name, media_info['filesystem_id']
),
"og:image:type": media_info['media_type'], "og:image:type": media_info['media_type'],
} }
@ -145,7 +157,7 @@ class PreviewUrlResource(BaseMediaResource):
logger.warn("Failed to find any OG data in %s", url) logger.warn("Failed to find any OG data in %s", url)
og = {} og = {}
logger.debug("Calculated OG for %s as %s" % (url, og)); logger.debug("Calculated OG for %s as %s" % (url, og))
# store OG in ephemeral in-memory cache # store OG in ephemeral in-memory cache
self.cache[url] = og self.cache[url] = og
@ -181,28 +193,20 @@ class PreviewUrlResource(BaseMediaResource):
# suck our tree into lxml and define our OG response. # suck our tree into lxml and define our OG response.
# if we see any image URLs in the OG response, then spider them # if we see any image URLs in the OG response, then spider them
# (although the client could choose to do this by asking for previews of those URLs to avoid DoSing the server) # (although the client could choose to do this by asking for previews of those
# URLs to avoid DoSing the server)
# "og:type" : "article"
# "og:url" : "https://twitter.com/matrixdotorg/status/684074366691356672"
# "og:title" : "Matrix on Twitter"
# "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png"
# "og:description" : "Synapse 0.12 is out! Lots of polishing, performance & bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP"
# "og:site_name" : "Twitter"
# or:
# "og:type" : "video", # "og:type" : "video",
# "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw", # "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw",
# "og:site_name" : "YouTube", # "og:site_name" : "YouTube",
# "og:video:type" : "application/x-shockwave-flash", # "og:video:type" : "application/x-shockwave-flash",
# "og:description" : " ", # "og:description" : "Fun stuff happening here",
# "og:title" : "RemoteJam - Matrix team hack for Disrupt Europe Hackathon", # "og:title" : "RemoteJam - Matrix team hack for Disrupt Europe Hackathon",
# "og:image" : "https://i.ytimg.com/vi/LXDBoHyjmtw/maxresdefault.jpg", # "og:image" : "https://i.ytimg.com/vi/LXDBoHyjmtw/maxresdefault.jpg",
# "og:video:url" : "http://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1", # "og:video:url" : "http://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1",
# "og:video:width" : "1280" # "og:video:width" : "1280"
# "og:video:height" : "720", # "og:video:height" : "720",
# "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1", # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",
og = {} og = {}
for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"): for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
@ -210,39 +214,44 @@ class PreviewUrlResource(BaseMediaResource):
# TODO: grab article: meta tags too, e.g.: # TODO: grab article: meta tags too, e.g.:
# <meta property="article:publisher" content="https://www.facebook.com/thethudonline" /> # "article:publisher" : "https://www.facebook.com/thethudonline" />
# <meta property="article:author" content="https://www.facebook.com/thethudonline" /> # "article:author" content="https://www.facebook.com/thethudonline" />
# <meta property="article:tag" content="baby" /> # "article:tag" content="baby" />
# <meta property="article:section" content="Breaking News" /> # "article:section" content="Breaking News" />
# <meta property="article:published_time" content="2016-03-31T19:58:24+00:00" /> # "article:published_time" content="2016-03-31T19:58:24+00:00" />
# <meta property="article:modified_time" content="2016-04-01T18:31:53+00:00" /> # "article:modified_time" content="2016-04-01T18:31:53+00:00" />
if 'og:title' not in og: if 'og:title' not in og:
# do some basic spidering of the HTML # do some basic spidering of the HTML
title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]") title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]")
og['og:title'] = title[0].text.strip() if title else None og['og:title'] = title[0].text.strip() if title else None
if 'og:image' not in og: if 'og:image' not in og:
# TODO: extract a favicon failing all else # TODO: extract a favicon failing all else
meta_image = tree.xpath("//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"); meta_image = tree.xpath(
"//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"
)
if meta_image: if meta_image:
og['og:image'] = self._rebase_url(meta_image[0], media_info['uri']) og['og:image'] = self._rebase_url(meta_image[0], media_info['uri'])
else: else:
# TODO: consider inlined CSS styles as well as width & height attribs # TODO: consider inlined CSS styles as well as width & height attribs
images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]") images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
images = sorted(images, key=lambda i: (-1 * int(i.attrib['width']) * int(i.attrib['height']))) images = sorted(images, key=lambda i: (
-1 * int(i.attrib['width']) * int(i.attrib['height'])
))
if not images: if not images:
images = tree.xpath("//img[@src]") images = tree.xpath("//img[@src]")
if images: if images:
og['og:image'] = images[0].attrib['src'] og['og:image'] = images[0].attrib['src']
# pre-cache the image for posterity # pre-cache the image for posterity
# FIXME: it might be cleaner to use the same flow as the main /preview_url request itself # FIXME: it might be cleaner to use the same flow as the main /preview_url request
# and benefit from the same caching etc. But for now we just rely on the caching # itself and benefit from the same caching etc. But for now we just rely on the
# of the master request to speed things up. # caching on the master request to speed things up.
if 'og:image' in og and og['og:image']: if 'og:image' in og and og['og:image']:
image_info = yield self._download_url(self._rebase_url(og['og:image'], media_info['uri']), requester.user) image_info = yield self._download_url(
self._rebase_url(og['og:image'], media_info['uri']), requester.user
)
if self._is_media(image_info['media_type']): if self._is_media(image_info['media_type']):
# TODO: make sure we don't choke on white-on-transparent images # TODO: make sure we don't choke on white-on-transparent images
@ -255,19 +264,26 @@ class PreviewUrlResource(BaseMediaResource):
else: else:
logger.warn("Couldn't get dims for %s" % og["og:image"]) logger.warn("Couldn't get dims for %s" % og["og:image"])
og["og:image"] = "mxc://%s/%s" % (self.server_name, image_info['filesystem_id']) og["og:image"] = "mxc://%s/%s" % (
self.server_name, image_info['filesystem_id']
)
og["og:image:type"] = image_info['media_type'] og["og:image:type"] = image_info['media_type']
else: else:
del og["og:image"] del og["og:image"]
if 'og:description' not in og: if 'og:description' not in og:
meta_description = tree.xpath("//*/meta[translate(@name, 'DESCRIPTION', 'description')='description']/@content"); meta_description = tree.xpath(
"//*/meta"
"[translate(@name, 'DESCRIPTION', 'description')='description']"
"/@content")
if meta_description: if meta_description:
og['og:description'] = meta_description[0] og['og:description'] = meta_description[0]
else: else:
# text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text() | //span/text() | //a/text()") # text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | "
text_nodes = tree.xpath("//text()[not(ancestor::header | ancestor::nav | ancestor::aside | " + # "//p/text() | //div/text() | //span/text() | //a/text()")
"ancestor::footer | ancestor::script | ancestor::style)]" + text_nodes = tree.xpath("//text()[not(ancestor::header | ancestor::nav | "
"ancestor::aside | ancestor::footer | "
"ancestor::script | ancestor::style)]" +
"[ancestor::body]") "[ancestor::body]")
text = '' text = ''
for text_node in text_nodes: for text_node in text_nodes:
@ -280,8 +296,9 @@ class PreviewUrlResource(BaseMediaResource):
text = text.strip()[:500] text = text.strip()[:500]
og['og:description'] = text if text else None og['og:description'] = text if text else None
# TODO: delete the url downloads to stop diskfilling, as we only ever cared about its OG # TODO: delete the url downloads to stop diskfilling,
defer.returnValue(og); # as we only ever cared about its OG
defer.returnValue(og)
def _rebase_url(self, url, base): def _rebase_url(self, url, base):
base = list(urlparse(base)) base = list(urlparse(base))
@ -377,6 +394,8 @@ class PreviewUrlResource(BaseMediaResource):
def _is_html(self, content_type): def _is_html(self, content_type):
content_type = content_type.lower() content_type = content_type.lower()
if (content_type.startswith("text/html") or if (
content_type.startswith("application/xhtml")): content_type.startswith("text/html") or
content_type.startswith("application/xhtml")
):
return True return True

View File

@ -87,7 +87,8 @@ class MediaRepositoryStore(SQLBaseStore):
"get_url_cache", get_url_cache_txn "get_url_cache", get_url_cache_txn
) )
def store_url_cache(self, url, response_code, etag, expires, og, media_id, download_ts): def store_url_cache(self, url, response_code, etag, expires, og, media_id,
download_ts):
return self._simple_insert( return self._simple_insert(
"local_media_repository_url_cache", "local_media_repository_url_cache",
{ {