Add optional ICU support for user search (#14464)
Fixes #13655. This change uses ICU (International Components for Unicode) to improve word-boundary detection in user search. It also adds new build dependencies on libicu-dev and pkg-config for the Debian packages; both are available in all supported distros.
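For orientation, here is a minimal, self-contained sketch of the approach (the helper name `split_words` is mine, for illustration only; the logic mirrors the `_parse_words` / `_parse_words_with_icu` helpers added to the user directory store in the diff below):

```python
# Sketch only, not part of the diff: contrast the regex fallback with
# ICU word-boundary detection, mirroring the helpers added below.
import re
from typing import List

try:
    import icu  # PyICU; the optional dependency introduced by this change

    USE_ICU = True
except ModuleNotFoundError:
    USE_ICU = False


def split_words(text: str) -> List[str]:
    """Split `text` into words, preferring ICU boundary detection when available."""
    if not USE_ICU:
        # Regex fallback: fine for most Latin scripts, but e.g. "Gáo" written as
        # "a" + U+0301 (combining acute) is wrongly split into two tokens.
        return re.findall(r"([\w\-]+)", text, re.UNICODE)

    breaker = icu.BreakIterator.createWordInstance(icu.Locale.getDefault())
    breaker.setText(text)
    words: List[str] = []
    start = 0
    while True:
        end = breaker.nextBoundary()
        if end < 0:
            break
        token = text[start:end]
        # ICU also reports whitespace/punctuation runs as "words"; drop tokens
        # with no word characters so they don't end up in SQL search queries.
        if re.search(r"[\w\-]", token, re.UNICODE):
            words.append(token)
        start = end
    return words
```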
parent a5d8fee097
commit 2a3cd59dd0

@@ -0,0 +1 @@
+Improve user search for international display names.

@@ -1,3 +1,10 @@
+matrix-synapse-py3 (1.74.0~rc1) UNRELEASED; urgency=medium
+
+  * New dependency on libicu-dev to provide improved results for user
+    search.
+
+ -- Synapse Packaging team <packages@matrix.org>  Tue, 06 Dec 2022 15:28:10 +0000
+
 matrix-synapse-py3 (1.73.0) stable; urgency=medium

   * New Synapse release 1.73.0.

@@ -8,6 +8,8 @@ Build-Depends:
  dh-virtualenv (>= 1.1),
  libsystemd-dev,
  libpq-dev,
+ libicu-dev,
+ pkg-config,
  lsb-release,
  python3-dev,
  python3,

@@ -97,6 +97,8 @@ RUN \
     zlib1g-dev \
     git \
     curl \
+    libicu-dev \
+    pkg-config \
   && rm -rf /var/lib/apt/lists/*

@@ -84,6 +84,8 @@ RUN apt-get update -qq -o Acquire::Languages=none \
     python3-venv \
     sqlite3 \
     libpq-dev \
+    libicu-dev \
+    pkg-config \
     xmlsec1

 # Install rust and ensure it's in the PATH

@@ -837,6 +837,14 @@ category = "dev"
 optional = false
 python-versions = ">=3.5"

+[[package]]
+name = "pyicu"
+version = "2.10.2"
+description = "Python extension wrapping the ICU C++ API"
+category = "main"
+optional = true
+python-versions = "*"
+
 [[package]]
 name = "pyjwt"
 version = "2.4.0"

@@ -1622,7 +1630,7 @@ docs = ["Sphinx", "repoze.sphinx.autointerface"]
 test = ["zope.i18nmessageid", "zope.testing", "zope.testrunner"]

 [extras]
-all = ["matrix-synapse-ldap3", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pysaml2", "authlib", "lxml", "sentry-sdk", "jaeger-client", "opentracing", "txredisapi", "hiredis", "Pympler"]
+all = ["matrix-synapse-ldap3", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pysaml2", "authlib", "lxml", "sentry-sdk", "jaeger-client", "opentracing", "txredisapi", "hiredis", "Pympler", "pyicu"]
 cache-memory = ["Pympler"]
 jwt = ["authlib"]
 matrix-synapse-ldap3 = ["matrix-synapse-ldap3"]

@@ -1635,11 +1643,12 @@ sentry = ["sentry-sdk"]
 systemd = ["systemd-python"]
 test = ["parameterized", "idna"]
 url-preview = ["lxml"]
+user-search = ["pyicu"]

 [metadata]
 lock-version = "1.1"
 python-versions = "^3.7.1"
-content-hash = "8c44ceeb9df5c3ab43040400e0a6b895de49417e61293a1ba027640b34f03263"
+content-hash = "f20007013f33bc35a01e412c48adc62a936030f3074e06286674c5ad7f44d300"

 [metadata.files]
 attrs = [

@@ -2427,6 +2436,9 @@ pygments = [
     {file = "Pygments-2.11.2-py3-none-any.whl", hash = "sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65"},
     {file = "Pygments-2.11.2.tar.gz", hash = "sha256:4e426f72023d88d03b2fa258de560726ce890ff3b630f88c21cbb8b2503b8c6a"},
 ]
+pyicu = [
+    {file = "PyICU-2.10.2.tar.gz", hash = "sha256:0c3309eea7fab6857507ace62403515b60fe096cbfb4f90d14f55ff75c5441c1"},
+]
 pyjwt = [
     {file = "PyJWT-2.4.0-py3-none-any.whl", hash = "sha256:72d1d253f32dbd4f5c88eaf1fdc62f3a19f676ccbadb9dbc5d07e951b2b26daf"},
     {file = "PyJWT-2.4.0.tar.gz", hash = "sha256:d42908208c699b3b973cbeb01a969ba6a96c821eefb1c5bfe4c390c01d67abba"},

@@ -208,6 +208,7 @@ hiredis = { version = "*", optional = true }
 Pympler = { version = "*", optional = true }
 parameterized = { version = ">=0.7.4", optional = true }
 idna = { version = ">=2.5", optional = true }
+pyicu = { version = ">=2.10.2", optional = true }

 [tool.poetry.extras]
 # NB: Packages that should be part of `pip install matrix-synapse[all]` need to be specified

@@ -230,6 +231,10 @@ redis = ["txredisapi", "hiredis"]
 # Required to use experimental `caches.track_memory_usage` config option.
 cache-memory = ["pympler"]
 test = ["parameterized", "idna"]
+# Allows for better search for international characters in the user directory. This
+# requires libicu's development headers installed on the system (e.g. libicu-dev on
+# Debian-based distributions).
+user-search = ["pyicu"]

 # The duplication here is awful. I hate hate hate hate hate it. However, for now I want
 # to ensure you can still `pip install matrix-synapse[all]` like today. Two motivations:

@@ -261,6 +266,8 @@ all = [
     "txredisapi", "hiredis",
     # cache-memory
     "pympler",
+    # improved user search
+    "pyicu",
     # omitted:
     # - test: it's useful to have this separate from dev deps in the olddeps job
     # - systemd: this is a system-based requirement
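As a usage note (my assumption, not stated in the diff beyond the extras table): the new extra can be pulled in with `pip install matrix-synapse[user-search]`, and PyICU only builds when libicu's development headers and pkg-config are available on the system. A quick runtime check, mirroring the optional-import guard the change adds to the user directory store:

```python
# Illustrative check only; mirrors the try/except import guard added below.
try:
    import icu  # provided by the optional "pyicu" dependency (user-search extra)

    ICU_AVAILABLE = True
except ModuleNotFoundError:
    ICU_AVAILABLE = False

print("ICU-backed user search available:", ICU_AVAILABLE)
```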

@@ -0,0 +1,25 @@
+# Copyright 2022 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Stub for PyICU.
+
+class Locale:
+    @staticmethod
+    def getDefault() -> Locale: ...
+
+class BreakIterator:
+    @staticmethod
+    def createWordInstance(locale: Locale) -> BreakIterator: ...
+    def setText(self, text: str) -> None: ...
+    def nextBoundary(self) -> int: ...

@@ -26,6 +26,14 @@ from typing import (
     cast,
 )

+try:
+    # Figure out if ICU support is available for searching users.
+    import icu
+
+    USE_ICU = True
+except ModuleNotFoundError:
+    USE_ICU = False
+
 from typing_extensions import TypedDict

 from synapse.api.errors import StoreError

@@ -900,7 +908,7 @@ def _parse_query_sqlite(search_term: str) -> str:
     """

     # Pull out the individual words, discarding any non-word characters.
-    results = re.findall(r"([\w\-]+)", search_term, re.UNICODE)
+    results = _parse_words(search_term)
     return " & ".join("(%s* OR %s)" % (result, result) for result in results)

@@ -910,12 +918,63 @@ def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]:
     We use this so that we can add prefix matching, which isn't something
     that is supported by default.
     """
-    # Pull out the individual words, discarding any non-word characters.
-    results = re.findall(r"([\w\-]+)", search_term, re.UNICODE)
+    results = _parse_words(search_term)

     both = " & ".join("(%s:* | %s)" % (result, result) for result in results)
     exact = " & ".join("%s" % (result,) for result in results)
     prefix = " & ".join("%s:*" % (result,) for result in results)

     return both, exact, prefix


+def _parse_words(search_term: str) -> List[str]:
+    """Split the provided search string into a list of its words.
+
+    If support for ICU (International Components for Unicode) is available, use it.
+    Otherwise, fall back to using a regex to detect word boundaries. This latter
+    solution works well enough for most latin-based languages, but doesn't work as well
+    with other languages.
+
+    Args:
+        search_term: The search string.
+
+    Returns:
+        A list of the words in the search string.
+    """
+    if USE_ICU:
+        return _parse_words_with_icu(search_term)
+
+    return re.findall(r"([\w\-]+)", search_term, re.UNICODE)
+
+
+def _parse_words_with_icu(search_term: str) -> List[str]:
+    """Break down the provided search string into its individual words using ICU
+    (International Components for Unicode).
+
+    Args:
+        search_term: The search string.
+
+    Returns:
+        A list of the words in the search string.
+    """
+    results = []
+    breaker = icu.BreakIterator.createWordInstance(icu.Locale.getDefault())
+    breaker.setText(search_term)
+    i = 0
+    while True:
+        j = breaker.nextBoundary()
+        if j < 0:
+            break
+
+        result = search_term[i:j]
+
+        # libicu considers spaces and punctuation between words as words, but we don't
+        # want to include those in results as they would result in syntax errors in SQL
+        # queries (e.g. "foo bar" would result in the search query including "foo & &
+        # bar").
+        if len(re.findall(r"([\w\-]+)", result, re.UNICODE)):
+            results.append(result)
+
+        i = j
+
+    return results
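To make the effect of the parsing change concrete, a small illustration (not part of the diff; variable names are mine) of how the parsed words are joined into search queries by the expressions shown in the hunks above, assuming ICU is available:

```python
# Illustrative only: reproduces the join expressions from the hunks above.
words = ["Gáo", "bar"]  # e.g. what _parse_words("Gáo bar") yields with ICU available

sqlite_match = " & ".join("(%s* OR %s)" % (w, w) for w in words)
# -> "(Gáo* OR Gáo) & (bar* OR bar)"

postgres_both = " & ".join("(%s:* | %s)" % (w, w) for w in words)
# -> "(Gáo:* | Gáo) & (bar:* | bar)"

# If whitespace or punctuation tokens were not filtered out by
# _parse_words_with_icu, they would leak into these strings and break the
# full-text query syntax, which is why the helper drops them.
```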

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import re
 from typing import Any, Dict, Set, Tuple
 from unittest import mock
 from unittest.mock import Mock, patch

@@ -30,6 +31,12 @@ from synapse.util import Clock
 from tests.test_utils.event_injection import inject_member_event
 from tests.unittest import HomeserverTestCase, override_config

+try:
+    import icu
+except ImportError:
+    icu = None  # type: ignore
+
+
 ALICE = "@alice:a"
 BOB = "@bob:b"
 BOBBY = "@bobby:a"

@@ -467,3 +474,39 @@ class UserDirectoryStoreTestCase(HomeserverTestCase):
             r["results"][0],
             {"user_id": BELA, "display_name": "Bela", "avatar_url": None},
         )
+
+
+class UserDirectoryICUTestCase(HomeserverTestCase):
+    if not icu:
+        skip = "Requires PyICU"
+
+    def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
+        self.store = hs.get_datastores().main
+        self.user_dir_helper = GetUserDirectoryTables(self.store)
+
+    def test_icu_word_boundary(self) -> None:
+        """Tests that we correctly detect word boundaries when ICU (International
+        Components for Unicode) support is available.
+        """
+
+        display_name = "Gáo"
+
+        # This word is not broken down correctly by Python's regular expressions,
+        # likely because á is actually a lowercase a followed by a U+0301 combining
+        # acute accent. This is specifically something that ICU support fixes.
+        matches = re.findall(r"([\w\-]+)", display_name, re.UNICODE)
+        self.assertEqual(len(matches), 2)
+
+        self.get_success(
+            self.store.update_profile_in_user_dir(ALICE, display_name, None)
+        )
+        self.get_success(self.store.add_users_in_public_rooms("!room:id", (ALICE,)))
+
+        # Check that searching for this user yields the correct result.
+        r = self.get_success(self.store.search_user_dir(BOB, display_name, 10))
+        self.assertFalse(r["limited"])
+        self.assertEqual(len(r["results"]), 1)
+        self.assertDictEqual(
+            r["results"][0],
+            {"user_id": ALICE, "display_name": display_name, "avatar_url": None},
+        )