Fix parsing of Content-Disposition headers (#4763)

* Fix parsing of Content-Disposition headers TIL: filenames in content-dispostion headers can contain semicolons, and aren't %-encoded. * fix python2 incompatibility * Fix docstrings
2019-02-27 22:29:10 +00:00 · 2019-02-27 22:29:10 +00:00 · 68f47d6744
parent f2a753ea38
commit 68f47d6744
3 changed files with 111 additions and 20 deletions
--- a/changelog.d/4763.bugfix
+++ b/changelog.d/4763.bugfix
@ -0,0 +1 @@
 Fix parsing of Content-Disposition headers on remote media requests and URL previews.
--- a/synapse/rest/media/v1/_base.py
+++ b/synapse/rest/media/v1/_base.py
@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 # Copyright 2014-2016 OpenMarket Ltd
 # Copyright 2019 New Vector Ltd.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -213,8 +214,7 @@ def get_filename_from_headers(headers):
    Content-Disposition HTTP header.
    Args:
-        headers (twisted.web.http_headers.Headers): The HTTP
+        headers (dict[bytes, list[bytes]]): The HTTP request headers.
            request headers.
    Returns:
        A Unicode string of the filename, or None.
@ -225,23 +225,12 @@ def get_filename_from_headers(headers):
    if not content_disposition[0]:
        return
-    # dict of unicode: bytes, corresponding to the key value sections of the
+    _, params = _parse_header(content_disposition[0])
    # Content-Disposition header.
    params = {}
    parts = content_disposition[0].split(b";")
    for i in parts:
        # Split into key-value pairs, if able
        # We don't care about things like `inline`, so throw it out
        if b"=" not in i:
            continue
        key, value = i.strip().split(b"=")
        params[key.decode('ascii')] = value
    upload_name = None
    # First check if there is a valid UTF-8 filename
-    upload_name_utf8 = params.get("filename*", None)
+    upload_name_utf8 = params.get(b"filename*", None)
    if upload_name_utf8:
        if upload_name_utf8.lower().startswith(b"utf-8''"):
            upload_name_utf8 = upload_name_utf8[7:]
@ -267,12 +256,68 @@ def get_filename_from_headers(headers):
    # If there isn't check for an ascii name.
    if not upload_name:
-        upload_name_ascii = params.get("filename", None)
+        upload_name_ascii = params.get(b"filename", None)
        if upload_name_ascii and is_ascii(upload_name_ascii):
-            # Make sure there's no %-quoted bytes. If there is, reject it as
+            upload_name = upload_name_ascii.decode('ascii')
            # non-valid ASCII.
            if b"%" not in upload_name_ascii:
                upload_name = upload_name_ascii.decode('ascii')
    # This may be None here, indicating we did not find a matching name.
    return upload_name
 def _parse_header(line):
    """Parse a Content-type like header.
    Cargo-culted from `cgi`, but works on bytes rather than strings.
    Args:
        line (bytes): header to be parsed
    Returns:
        Tuple[bytes, dict[bytes, bytes]]:
            the main content-type, followed by the parameter dictionary
    """
    parts = _parseparam(b';' + line)
    key = next(parts)
    pdict = {}
    for p in parts:
        i = p.find(b'=')
        if i >= 0:
            name = p[:i].strip().lower()
            value = p[i + 1:].strip()
            # strip double-quotes
            if len(value) >= 2 and value[0:1] == value[-1:] == b'"':
                value = value[1:-1]
                value = value.replace(b'\\\\', b'\\').replace(b'\\"', b'"')
            pdict[name] = value
    return key, pdict
 def _parseparam(s):
    """Generator which splits the input on ;, respecting double-quoted sequences
    Cargo-culted from `cgi`, but works on bytes rather than strings.
    Args:
        s (bytes): header to be parsed
    Returns:
        Iterable[bytes]: the split input
    """
    while s[:1] == b';':
        s = s[1:]
        # look for the next ;
        end = s.find(b';')
        # if there is an odd number of " marks between here and the next ;, skip to the
        # next ; instead
        while end > 0 and (s.count(b'"', 0, end) - s.count(b'\\"', 0, end)) % 2:
            end = s.find(b';', end + 1)
        if end < 0:
            end = len(s)
        f = s[:end]
        yield f.strip()
        s = s[end:]
--- a/tests/rest/media/v1/test_base.py
+++ b/tests/rest/media/v1/test_base.py
@ -0,0 +1,45 @@
 # -*- coding: utf-8 -*-
 # Copyright 2019 New Vector Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from synapse.rest.media.v1._base import get_filename_from_headers
 from tests import unittest
 class GetFileNameFromHeadersTests(unittest.TestCase):
    # input -> expected result
    TEST_CASES = {
        b"inline; filename=abc.txt": u"abc.txt",
        b'inline; filename="azerty"': u"azerty",
        b'inline; filename="aze%20rty"': u"aze%20rty",
        b'inline; filename="aze\"rty"': u'aze"rty',
        b'inline; filename="azer;ty"': u"azer;ty",
        b"inline; filename*=utf-8''foo%C2%A3bar": u"foo£bar",
    }
    def tests(self):
        for hdr, expected in self.TEST_CASES.items():
            res = get_filename_from_headers(
                {
                    b'Content-Disposition': [hdr],
                },
            )
            self.assertEqual(
                res, expected,
                "expected output for %s to be %s but was %s" % (
                    hdr, expected, res,
                )
            )
		`@ -0,0 +1 @@`
							`Fix parsing of Content-Disposition headers on remote media requests and URL previews.`