2016-01-06 21:26:29 -07:00
|
|
|
# Copyright 2015, 2016 OpenMarket Ltd
|
2015-10-09 08:48:31 -06:00
|
|
|
#
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
Unified search query syntax using the full-text search capabilities of the underlying DB. (#11635)
Support a unified search query syntax which leverages more of the full-text
search of each database supported by Synapse.
Supports, with the same syntax across Postgresql 11+ and Sqlite:
- quoted "search terms"
- `AND`, `OR`, `-` (negation) operators
- Matching words based on their stem, e.g. searches for "dog" matches
documents containing "dogs".
This is achieved by
- If on postgresql 11+, pass the user input to `websearch_to_tsquery`
- If on sqlite, manually parse the query and transform it into the sqlite-specific
query syntax.
Note that postgresql 10, which is close to end-of-life, falls back to using
`phraseto_tsquery`, which only supports a subset of the features.
Multiple terms separated by a space are implicitly ANDed.
Note that:
1. There is no escaping of full-text syntax that might be supported by the database;
e.g. `NOT`, `NEAR`, `*` in sqlite. This runs the risk that people might discover this
as accidental functionality and depend on something we don't guarantee.
2. English text is assumed for stemming. To support other languages, either the target
language needs to be known at the time of indexing the message (via room metadata,
or otherwise), or a separate index for each language supported could be created.
Sqlite docs: https://www.sqlite.org/fts3.html#full_text_index_queries
Postgres docs: https://www.postgresql.org/docs/11/textsearch-controls.html
2022-10-25 12:05:22 -06:00
|
|
|
import enum
|
2018-02-03 16:07:13 -07:00
|
|
|
import logging
|
|
|
|
import re
|
Unified search query syntax using the full-text search capabilities of the underlying DB. (#11635)
Support a unified search query syntax which leverages more of the full-text
search of each database supported by Synapse.
Supports, with the same syntax across Postgresql 11+ and Sqlite:
- quoted "search terms"
- `AND`, `OR`, `-` (negation) operators
- Matching words based on their stem, e.g. searches for "dog" matches
documents containing "dogs".
This is achieved by
- If on postgresql 11+, pass the user input to `websearch_to_tsquery`
- If on sqlite, manually parse the query and transform it into the sqlite-specific
query syntax.
Note that postgresql 10, which is close to end-of-life, falls back to using
`phraseto_tsquery`, which only supports a subset of the features.
Multiple terms separated by a space are implicitly ANDed.
Note that:
1. There is no escaping of full-text syntax that might be supported by the database;
e.g. `NOT`, `NEAR`, `*` in sqlite. This runs the risk that people might discover this
as accidental functionality and depend on something we don't guarantee.
2. English text is assumed for stemming. To support other languages, either the target
language needs to be known at the time of indexing the message (via room metadata,
or otherwise), or a separate index for each language supported could be created.
Sqlite docs: https://www.sqlite.org/fts3.html#full_text_index_queries
Postgres docs: https://www.postgresql.org/docs/11/textsearch-controls.html
2022-10-25 12:05:22 -06:00
|
|
|
from collections import deque
|
|
|
|
from dataclasses import dataclass
|
|
|
|
from typing import (
|
|
|
|
TYPE_CHECKING,
|
|
|
|
Any,
|
|
|
|
Collection,
|
|
|
|
Iterable,
|
|
|
|
List,
|
|
|
|
Optional,
|
|
|
|
Set,
|
|
|
|
Tuple,
|
|
|
|
Union,
|
|
|
|
)
|
2015-10-09 08:48:31 -06:00
|
|
|
|
2021-12-30 11:47:12 -07:00
|
|
|
import attr
|
|
|
|
|
2015-11-05 09:10:54 -07:00
|
|
|
from synapse.api.errors import SynapseError
|
2020-09-01 09:04:17 -06:00
|
|
|
from synapse.events import EventBase
|
2020-07-16 09:32:19 -06:00
|
|
|
from synapse.storage._base import SQLBaseStore, db_to_json, make_in_list_sql_clause
|
2021-12-13 10:05:00 -07:00
|
|
|
from synapse.storage.database import (
|
|
|
|
DatabasePool,
|
|
|
|
LoggingDatabaseConnection,
|
|
|
|
LoggingTransaction,
|
|
|
|
)
|
2020-08-05 14:38:57 -06:00
|
|
|
from synapse.storage.databases.main.events_worker import EventRedactBehaviour
|
Unified search query syntax using the full-text search capabilities of the underlying DB. (#11635)
Support a unified search query syntax which leverages more of the full-text
search of each database supported by Synapse.
Supports, with the same syntax across Postgresql 11+ and Sqlite:
- quoted "search terms"
- `AND`, `OR`, `-` (negation) operators
- Matching words based on their stem, e.g. searches for "dog" matches
documents containing "dogs".
This is achieved by
- If on postgresql 11+, pass the user input to `websearch_to_tsquery`
- If on sqlite, manually parse the query and transform it into the sqlite-specific
query syntax.
Note that postgresql 10, which is close to end-of-life, falls back to using
`phraseto_tsquery`, which only supports a subset of the features.
Multiple terms separated by a space are implicitly ANDed.
Note that:
1. There is no escaping of full-text syntax that might be supported by the database;
e.g. `NOT`, `NEAR`, `*` in sqlite. This runs the risk that people might discover this
as accidental functionality and depend on something we don't guarantee.
2. English text is assumed for stemming. To support other languages, either the target
language needs to be known at the time of indexing the message (via room metadata,
or otherwise), or a separate index for each language supported could be created.
Sqlite docs: https://www.sqlite.org/fts3.html#full_text_index_queries
Postgres docs: https://www.postgresql.org/docs/11/textsearch-controls.html
2022-10-25 12:05:22 -06:00
|
|
|
from synapse.storage.engines import PostgresEngine, Sqlite3Engine
|
2022-02-15 06:47:05 -07:00
|
|
|
from synapse.types import JsonDict
|
2015-10-09 08:48:31 -06:00
|
|
|
|
2021-10-22 11:15:41 -06:00
|
|
|
if TYPE_CHECKING:
|
|
|
|
from synapse.server import HomeServer
|
|
|
|
|
2015-11-04 10:57:44 -07:00
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
2021-12-30 11:47:12 -07:00
|
|
|
|
|
|
|
@attr.s(slots=True, frozen=True, auto_attribs=True)
class SearchEntry:
    """A single row to be written to the `event_search` table.

    Instances are immutable (`frozen=True`) and are consumed by
    `SearchWorkerStore.store_search_entries_txn`.
    """

    # Name of the event field the text was taken from; the reindexing code in
    # this file uses "content.body", "content.topic" and "content.name".
    key: str
    # The text to be indexed.
    value: str
    event_id: str
    room_id: str
    # Only written by the Postgres insert; the SQLite insert does not store it.
    stream_ordering: Optional[int]
    # Only written by the Postgres insert; the SQLite insert does not store it.
    origin_server_ts: int
|
2018-02-03 16:07:13 -07:00
|
|
|
|
2015-11-04 10:57:44 -07:00
|
|
|
|
2021-09-22 09:25:26 -06:00
|
|
|
def _clean_value_for_search(value: str) -> str:
|
|
|
|
"""
|
|
|
|
Replaces any null code points in the string with spaces as
|
|
|
|
Postgres and SQLite do not like the insertion of strings with
|
|
|
|
null code points into the full-text search tables.
|
|
|
|
"""
|
|
|
|
return value.replace("\u0000", " ")
|
|
|
|
|
|
|
|
|
2020-05-15 10:22:47 -06:00
|
|
|
class SearchWorkerStore(SQLBaseStore):
    """Store providing the write path into the `event_search` table."""

    def store_search_entries_txn(
        self, txn: LoggingTransaction, entries: Iterable[SearchEntry]
    ) -> None:
        """Insert a batch of entries into the search table.

        Nothing is written when search is disabled in the server config.

        Args:
            txn: the transaction in which to perform the inserts
            entries: the entries to add to the table

        Raises:
            Exception: if the database engine is neither Postgres nor SQLite.
        """
        if not self.hs.config.server.enable_search:
            return

        engine = self.database_engine

        if isinstance(engine, PostgresEngine):
            insert_sql = """
            INSERT INTO event_search
            (event_id, room_id, key, vector, stream_ordering, origin_server_ts)
            VALUES (?,?,?,to_tsvector('english', ?),?,?)
            """

            # One parameter tuple per entry, produced lazily; the text is
            # scrubbed of null code points before it reaches to_tsvector.
            postgres_params = (
                (
                    item.event_id,
                    item.room_id,
                    item.key,
                    _clean_value_for_search(item.value),
                    item.stream_ordering,
                    item.origin_server_ts,
                )
                for item in entries
            )

            txn.execute_batch(insert_sql, postgres_params)
            return

        if isinstance(engine, Sqlite3Engine):
            # The SQLite insert stores the raw (cleaned) text and carries no
            # ordering columns.
            sqlite_values = (
                (
                    item.event_id,
                    item.room_id,
                    item.key,
                    _clean_value_for_search(item.value),
                )
                for item in entries
            )

            self.db_pool.simple_insert_many_txn(
                txn,
                table="event_search",
                keys=("event_id", "room_id", "key", "value"),
                values=sqlite_values,
            )
            return

        # This should be unreachable.
        raise Exception("Unrecognized database engine")
|
|
|
|
|
|
|
|
|
|
|
|
class SearchBackgroundUpdateStore(SearchWorkerStore):
    """Registers and implements the background updates for `event_search`."""

    # Names under which the background update handlers below are registered.
    EVENT_SEARCH_UPDATE_NAME = "event_search"
    EVENT_SEARCH_ORDER_UPDATE_NAME = "event_search_order"
    EVENT_SEARCH_USE_GIN_POSTGRES_NAME = "event_search_postgres_gin"
    EVENT_SEARCH_DELETE_NON_STRINGS = "event_search_sqlite_delete_non_strings"

    def __init__(
        self,
        database: DatabasePool,
        db_conn: LoggingDatabaseConnection,
        hs: "HomeServer",
    ):
        """Initialise the store and register its background update handlers."""
        super().__init__(database, db_conn, hs)

        self.db_pool.updates.register_background_update_handler(
            self.EVENT_SEARCH_UPDATE_NAME, self._background_reindex_search
        )
        self.db_pool.updates.register_background_update_handler(
            self.EVENT_SEARCH_ORDER_UPDATE_NAME, self._background_reindex_search_order
        )

        self.db_pool.updates.register_background_update_handler(
            self.EVENT_SEARCH_USE_GIN_POSTGRES_NAME, self._background_reindex_gin_search
        )

        self.db_pool.updates.register_background_update_handler(
            self.EVENT_SEARCH_DELETE_NON_STRINGS, self._background_delete_non_strings
        )

    async def _background_reindex_search(
        self, progress: JsonDict, batch_size: int
    ) -> int:
        """Index one batch of name/message/topic events into `event_search`.

        Args:
            progress: the saved progress of this update; must contain
                "target_min_stream_id_inclusive" and "max_stream_id_exclusive",
                and may contain "rows_inserted".
            batch_size: the maximum number of events to read in this batch.

        Returns:
            The number of event rows examined in this batch. 0 means there is
            nothing left to do (or search is disabled), in which case the
            background update is marked as finished.
        """
        # we work through the events table from highest stream id to lowest
        target_min_stream_id = progress["target_min_stream_id_inclusive"]
        max_stream_id = progress["max_stream_id_exclusive"]
        rows_inserted = progress.get("rows_inserted", 0)

        TYPES = ["m.room.name", "m.room.message", "m.room.topic"]

        def reindex_search_txn(txn: LoggingTransaction) -> int:
            # TYPES is a fixed list of constants, so interpolating it into the
            # statement is safe; all variable values are bound parameters.
            sql = """
            SELECT stream_ordering, event_id, room_id, type, json, origin_server_ts
            FROM events
            JOIN event_json USING (room_id, event_id)
            WHERE ? <= stream_ordering AND stream_ordering < ?
            AND (%s)
            ORDER BY stream_ordering DESC
            LIMIT ?
            """ % (
                " OR ".join("type = '%s'" % (t,) for t in TYPES),
            )

            txn.execute(sql, (target_min_stream_id, max_stream_id, batch_size))

            # we could stream straight from the results into
            # store_search_entries_txn with a generator function, but that
            # would mean having two cursors open on the database at once.
            # Instead we just build a list of results.
            rows = self.db_pool.cursor_to_dict(txn)
            if not rows:
                return 0

            # Rows are ordered by stream_ordering DESC, so the last row holds
            # the lowest stream id seen; the next batch resumes below it.
            min_stream_id = rows[-1]["stream_ordering"]

            event_search_rows = []
            for row in rows:
                try:
                    event_id = row["event_id"]
                    room_id = row["room_id"]
                    etype = row["type"]
                    stream_ordering = row["stream_ordering"]
                    origin_server_ts = row["origin_server_ts"]
                    try:
                        event_json = db_to_json(row["json"])
                        content = event_json["content"]
                    except Exception:
                        # Deliberately best-effort: events whose JSON cannot
                        # be decoded or has no content are skipped.
                        continue

                    if etype == "m.room.message":
                        key = "content.body"
                        value = content["body"]
                    elif etype == "m.room.topic":
                        key = "content.topic"
                        value = content["topic"]
                    elif etype == "m.room.name":
                        key = "content.name"
                        value = content["name"]
                    else:
                        raise Exception("unexpected event type %s" % etype)
                except (KeyError, AttributeError):
                    # If the event is missing a necessary field then
                    # skip over it.
                    continue

                if not isinstance(value, str):
                    # If the event body, name or topic isn't a string
                    # then skip over it
                    continue

                event_search_rows.append(
                    SearchEntry(
                        key=key,
                        value=value,
                        event_id=event_id,
                        room_id=room_id,
                        stream_ordering=stream_ordering,
                        origin_server_ts=origin_server_ts,
                    )
                )

            self.store_search_entries_txn(txn, event_search_rows)

            new_progress = {
                "target_min_stream_id_inclusive": target_min_stream_id,
                "max_stream_id_exclusive": min_stream_id,
                "rows_inserted": rows_inserted + len(event_search_rows),
            }

            self.db_pool.updates._background_update_progress_txn(
                txn, self.EVENT_SEARCH_UPDATE_NAME, new_progress
            )

            # Report the number of rows *scanned*, not the number inserted: a
            # batch in which every event was skipped (e.g. non-string bodies)
            # would otherwise return 0 and cause the caller to end the update
            # while older events are still waiting to be indexed.
            return len(rows)

        if self.hs.config.server.enable_search:
            result = await self.db_pool.runInteraction(
                self.EVENT_SEARCH_UPDATE_NAME, reindex_search_txn
            )
        else:
            # Don't index anything if search is not enabled.
            result = 0

        if not result:
            await self.db_pool.updates._end_background_update(
                self.EVENT_SEARCH_UPDATE_NAME
            )

        return result

    async def _background_reindex_gin_search(
        self, progress: JsonDict, batch_size: int
    ) -> int:
        """This handles old synapses which used GIST indexes, if any;
        converting them back to be GIN as per the actual schema.

        Args:
            progress: unused.
            batch_size: unused.

        Returns:
            Always 1; the update completes in a single step.
        """

        def create_index(conn: LoggingDatabaseConnection) -> None:
            conn.rollback()

            # we have to set autocommit, because postgres refuses to
            # CREATE INDEX CONCURRENTLY without it.
            conn.set_session(autocommit=True)

            try:
                c = conn.cursor()

                # if we skipped the conversion to GIST, we may already/still
                # have an event_search_fts_idx; unfortunately postgres 9.4
                # doesn't support CREATE INDEX IF NOT EXISTS so we just catch
                # the exception and ignore it.
                import psycopg2

                try:
                    c.execute(
                        """
                        CREATE INDEX CONCURRENTLY event_search_fts_idx
                        ON event_search USING GIN (vector)
                        """
                    )
                except psycopg2.ProgrammingError as e:
                    logger.warning(
                        "Ignoring error %r when trying to switch from GIST to GIN", e
                    )

                # we should now be able to delete the GIST index.
                c.execute("DROP INDEX IF EXISTS event_search_fts_idx_gist")
            finally:
                conn.set_session(autocommit=False)

        if isinstance(self.database_engine, PostgresEngine):
            await self.db_pool.runWithConnection(create_index)

        await self.db_pool.updates._end_background_update(
            self.EVENT_SEARCH_USE_GIN_POSTGRES_NAME
        )
        return 1

    async def _background_reindex_search_order(
        self, progress: JsonDict, batch_size: int
    ) -> int:
        """Backfill `stream_ordering`/`origin_server_ts` on `event_search` rows.

        On the first run (when `progress["have_added_indexes"]` is false) the
        ordering indexes are created; every run then copies the two columns
        from `events` for one window of stream orderings.

        Args:
            progress: the saved progress of this update; must contain
                "target_min_stream_id_inclusive", "max_stream_id_exclusive"
                and "have_added_indexes", and may contain "rows_inserted".
            batch_size: the width of the stream-ordering window to update.

        Returns:
            The number of `event_search` rows updated in this batch.
        """
        target_min_stream_id = progress["target_min_stream_id_inclusive"]
        max_stream_id = progress["max_stream_id_exclusive"]
        rows_inserted = progress.get("rows_inserted", 0)
        have_added_index = progress["have_added_indexes"]

        if not have_added_index:

            def create_index(conn: LoggingDatabaseConnection) -> None:
                conn.rollback()
                conn.set_session(autocommit=True)

                # Always restore the session to non-autocommit mode, even if
                # index creation fails, so the connection is not left in
                # autocommit mode (same pattern as the GIN reindex above).
                try:
                    c = conn.cursor()

                    # We create with NULLS FIRST so that when we search *backwards*
                    # we get the ones with non null origin_server_ts *first*
                    c.execute(
                        """
                        CREATE INDEX CONCURRENTLY event_search_room_order
                        ON event_search(room_id, origin_server_ts NULLS FIRST, stream_ordering NULLS FIRST)
                        """
                    )
                    c.execute(
                        """
                        CREATE INDEX CONCURRENTLY event_search_order
                        ON event_search(origin_server_ts NULLS FIRST, stream_ordering NULLS FIRST)
                        """
                    )
                finally:
                    conn.set_session(autocommit=False)

            await self.db_pool.runWithConnection(create_index)

            # Record that the indexes now exist so a restart does not try to
            # create them again.
            pg = dict(progress)
            pg["have_added_indexes"] = True

            await self.db_pool.runInteraction(
                self.EVENT_SEARCH_ORDER_UPDATE_NAME,
                self.db_pool.updates._background_update_progress_txn,
                self.EVENT_SEARCH_ORDER_UPDATE_NAME,
                pg,
            )

        def reindex_search_txn(txn: LoggingTransaction) -> Tuple[int, bool]:
            """Update one window; returns (rows updated, whether more remains)."""
            sql = """
            UPDATE event_search AS es
            SET stream_ordering = e.stream_ordering, origin_server_ts = e.origin_server_ts
            FROM events AS e
            WHERE e.event_id = es.event_id
            AND ? <= e.stream_ordering AND e.stream_ordering < ?
            RETURNING es.stream_ordering
            """

            min_stream_id = max_stream_id - batch_size
            txn.execute(sql, (min_stream_id, max_stream_id))
            rows = txn.fetchall()

            if min_stream_id < target_min_stream_id:
                # We've reached the end.
                return len(rows), False

            new_progress = {
                "target_min_stream_id_inclusive": target_min_stream_id,
                "max_stream_id_exclusive": min_stream_id,
                "rows_inserted": rows_inserted + len(rows),
                "have_added_indexes": True,
            }

            self.db_pool.updates._background_update_progress_txn(
                txn, self.EVENT_SEARCH_ORDER_UPDATE_NAME, new_progress
            )

            return len(rows), True

        num_rows, more_to_do = await self.db_pool.runInteraction(
            self.EVENT_SEARCH_ORDER_UPDATE_NAME, reindex_search_txn
        )

        if not more_to_do:
            await self.db_pool.updates._end_background_update(
                self.EVENT_SEARCH_ORDER_UPDATE_NAME
            )

        return num_rows

    async def _background_delete_non_strings(
        self, progress: JsonDict, batch_size: int
    ) -> int:
        """Deletes rows with non-string `value`s from `event_search` if using sqlite.

        Prior to Synapse 1.44.0, malformed events received over federation could cause integers
        to be inserted into the `event_search` table when using sqlite.

        Args:
            progress: unused.
            batch_size: unused.

        Returns:
            Always 1; the update completes in a single step.
        """

        def delete_non_strings_txn(txn: LoggingTransaction) -> None:
            txn.execute("DELETE FROM event_search WHERE typeof(value) != 'text'")

        await self.db_pool.runInteraction(
            self.EVENT_SEARCH_DELETE_NON_STRINGS, delete_non_strings_txn
        )

        await self.db_pool.updates._end_background_update(
            self.EVENT_SEARCH_DELETE_NON_STRINGS
        )
        return 1
|
|
|
|
|
2019-10-03 10:47:42 -06:00
|
|
|
|
|
|
|
class SearchStore(SearchBackgroundUpdateStore):
|
2021-12-13 10:05:00 -07:00
|
|
|
def __init__(
    self,
    database: DatabasePool,
    db_conn: LoggingDatabaseConnection,
    hs: "HomeServer",
):
    """Initialise the store; all arguments are passed straight to the parent."""
    super().__init__(database, db_conn, hs)
|
2019-10-03 10:47:42 -06:00
|
|
|
|
2022-02-15 06:47:05 -07:00
|
|
|
async def search_msgs(
    self, room_ids: Collection[str], search_term: str, keys: Iterable[str]
) -> JsonDict:
    """Performs a full text search over events with given keys.

    Args:
        room_ids: List of room ids to search in
        search_term: Search term to search for
        keys: List of keys to search in, currently supports
            "content.body", "content.name", "content.topic"

    Returns:
        Dictionary of results, with the keys:
            "results": a list of `{"event": ..., "rank": ...}` dicts, one per
                matching event that could be fetched,
            "highlights": the highlights found on Postgres, or None on SQLite,
            "count": the total number of matches across `room_ids`.

    Raises:
        Exception: if the database engine is neither Postgres nor SQLite.
    """
    clauses = []

    args: List[Any] = []

    # Build the membership set once: `room_ids` may be any collection, and it
    # is consulted for every row returned by both queries below.
    allowed_room_ids = frozenset(room_ids)

    # Make sure we don't explode because the person is in too many rooms.
    # We filter the results below regardless.
    if len(room_ids) < 500:
        clause, args = make_in_list_sql_clause(
            self.database_engine, "room_id", room_ids
        )
        clauses = [clause]

    local_clauses = []
    for key in keys:
        local_clauses.append("key = ?")
        args.append(key)

    clauses.append("(%s)" % (" OR ".join(local_clauses),))

    # Take copies (as search_rooms does) so that later changes to the main
    # query's args/clauses can never leak into the count query.
    count_args = list(args)
    count_clauses = list(clauses)

    if isinstance(self.database_engine, PostgresEngine):
        search_query = search_term
        tsquery_func = self.database_engine.tsquery_func
        sql = f"""
        SELECT ts_rank_cd(vector, {tsquery_func}('english', ?)) AS rank,
        room_id, event_id
        FROM event_search
        WHERE vector @@ {tsquery_func}('english', ?)
        """
        # The search query is bound twice: once for the rank, once for the match.
        args = [search_query, search_query] + args

        count_sql = f"""
        SELECT room_id, count(*) as count FROM event_search
        WHERE vector @@ {tsquery_func}('english', ?)
        """
        count_args = [search_query] + count_args
    elif isinstance(self.database_engine, Sqlite3Engine):
        search_query = _parse_query_for_sqlite(search_term)

        sql = """
        SELECT rank(matchinfo(event_search)) as rank, room_id, event_id
        FROM event_search
        WHERE value MATCH ?
        """
        args = [search_query] + args

        count_sql = """
        SELECT room_id, count(*) as count FROM event_search
        WHERE value MATCH ?
        """
        count_args = [search_query] + count_args
    else:
        # This should be unreachable.
        raise Exception("Unrecognized database engine")

    for clause in clauses:
        sql += " AND " + clause

    for clause in count_clauses:
        count_sql += " AND " + clause

    # We add an arbitrary limit here to ensure we don't try to pull the
    # entire table from the database.
    sql += " ORDER BY rank DESC LIMIT 500"

    results = await self.db_pool.execute(
        "search_msgs", self.db_pool.cursor_to_dict, sql, *args
    )

    # The room filter may have been left out of the SQL (too many rooms), so
    # always filter here as well.
    results = [row for row in results if row["room_id"] in allowed_room_ids]

    # We set redact_behaviour to block here to prevent redacted events being returned in
    # search results (which is a data leak)
    events = await self.get_events_as_list(  # type: ignore[attr-defined]
        [r["event_id"] for r in results],
        redact_behaviour=EventRedactBehaviour.block,
    )

    event_map = {ev.event_id: ev for ev in events}

    highlights = None
    if isinstance(self.database_engine, PostgresEngine):
        highlights = await self._find_highlights_in_postgres(
            search_query, events, tsquery_func
        )

    count_sql += " GROUP BY room_id"

    count_results = await self.db_pool.execute(
        "search_rooms_count", self.db_pool.cursor_to_dict, count_sql, *count_args
    )

    count = sum(
        row["count"] for row in count_results if row["room_id"] in allowed_room_ids
    )
    return {
        "results": [
            {"event": event_map[r["event_id"]], "rank": r["rank"]}
            for r in results
            if r["event_id"] in event_map
        ],
        "highlights": highlights,
        "count": count,
    }
|
2015-11-04 10:57:44 -07:00
|
|
|
|
2020-08-07 10:17:17 -06:00
|
|
|
async def search_rooms(
    self,
    room_ids: Collection[str],
    search_term: str,
    keys: Iterable[str],
    limit: int,
    pagination_token: Optional[str] = None,
) -> JsonDict:
    """Performs a full text search over events with given keys.

    Matches are ordered by `origin_server_ts` and then `stream_ordering`,
    both descending.

    Args:
        room_ids: The room_ids to search in
        search_term: Search term to search for
        keys: List of keys to search in, currently supports "content.body",
            "content.name", "content.topic"
        limit: The maximum number of rows to fetch from the database
        pagination_token: A pagination token previously returned, of the
            form "<origin_server_ts>,<stream_ordering>"

    Returns:
        A dictionary with the following keys:
            results: a list of dictionaries, one per matching event which
                could be fetched, each holding the "event", its "rank" and
                the "pagination_token" to resume the search after it.
            highlights: the words to highlight when running on postgres,
                otherwise None.
            count: the number of matching events in the given rooms,
                ignoring pagination.

    Raises:
        SynapseError: (400) if `pagination_token` cannot be parsed.
    """
    clauses: List[str] = []
    args: List[Any] = []

    # Rows are filtered against the requested rooms in Python further down;
    # build a set once so each membership test is O(1) whatever collection
    # type the caller passed in.
    allowed_room_ids = set(room_ids)

    # Make sure we don't explode because the person is in too many rooms.
    # We filter the results below regardless.
    if len(room_ids) < 500:
        clause, args = make_in_list_sql_clause(
            self.database_engine, "room_id", room_ids
        )
        clauses = [clause]

    local_clauses = []
    for key in keys:
        local_clauses.append("key = ?")
        args.append(key)

    clauses.append("(%s)" % (" OR ".join(local_clauses),))

    # take copies of the current args and clauses lists, before adding
    # pagination clauses to main query.
    count_args = list(args)
    count_clauses = list(clauses)

    if pagination_token:
        try:
            origin_server_ts_str, stream_str = pagination_token.split(",")
            origin_server_ts = int(origin_server_ts_str)
            stream = int(stream_str)
        except Exception:
            raise SynapseError(400, "Invalid pagination token")

        # Only return rows strictly older than the position in the token.
        clauses.append(
            """
            (origin_server_ts < ? OR (origin_server_ts = ? AND stream_ordering < ?))
            """
        )
        args.extend([origin_server_ts, origin_server_ts, stream])

    if isinstance(self.database_engine, PostgresEngine):
        search_query = search_term
        tsquery_func = self.database_engine.tsquery_func
        sql = f"""
        SELECT ts_rank_cd(vector, {tsquery_func}('english', ?)) as rank,
        origin_server_ts, stream_ordering, room_id, event_id
        FROM event_search
        WHERE vector @@ {tsquery_func}('english', ?) AND
        """
        # The search query is bound twice: once for the rank, once for the
        # match. Both placeholders precede those of the clauses.
        args = [search_query, search_query] + args

        count_sql = f"""
        SELECT room_id, count(*) as count FROM event_search
        WHERE vector @@ {tsquery_func}('english', ?) AND
        """
        count_args = [search_query] + count_args
    elif isinstance(self.database_engine, Sqlite3Engine):

        # We use CROSS JOIN here to ensure we use the right indexes.
        # https://sqlite.org/optoverview.html#crossjoin
        #
        # We want to use the full text search index on event_search to
        # extract all possible matches first, then lookup those matches
        # in the events table to get the topological ordering. We need
        # to use the indexes in this order because sqlite refuses to
        # MATCH unless it uses the full text search index
        sql = """
        SELECT
            rank(matchinfo) as rank, room_id, event_id, origin_server_ts, stream_ordering
        FROM (
            SELECT key, event_id, matchinfo(event_search) as matchinfo
            FROM event_search
            WHERE value MATCH ?
        )
        CROSS JOIN events USING (event_id)
        WHERE
        """
        search_query = _parse_query_for_sqlite(search_term)
        args = [search_query] + args

        count_sql = """
        SELECT room_id, count(*) as count FROM event_search
        WHERE value MATCH ? AND
        """
        count_args = [search_query] + count_args
    else:
        # This should be unreachable.
        raise Exception("Unrecognized database engine")

    sql += " AND ".join(clauses)
    count_sql += " AND ".join(count_clauses)

    # We add an arbitrary limit here to ensure we don't try to pull the
    # entire table from the database.
    if isinstance(self.database_engine, PostgresEngine):
        sql += """
        ORDER BY origin_server_ts DESC NULLS LAST, stream_ordering DESC NULLS LAST
        LIMIT ?
        """
    elif isinstance(self.database_engine, Sqlite3Engine):
        sql += " ORDER BY origin_server_ts DESC, stream_ordering DESC LIMIT ?"
    else:
        raise Exception("Unrecognized database engine")

    args.append(limit)

    results = await self.db_pool.execute(
        "search_rooms", self.db_pool.cursor_to_dict, sql, *args
    )

    # The room filter may have been left out of the SQL above (too many
    # rooms), so always re-apply it here.
    results = [row for row in results if row["room_id"] in allowed_room_ids]

    # We set redact_behaviour to block here to prevent redacted events being returned in
    # search results (which is a data leak)
    events = await self.get_events_as_list(  # type: ignore[attr-defined]
        [r["event_id"] for r in results],
        redact_behaviour=EventRedactBehaviour.block,
    )

    event_map = {ev.event_id: ev for ev in events}

    highlights = None
    if isinstance(self.database_engine, PostgresEngine):
        highlights = await self._find_highlights_in_postgres(
            search_query, events, tsquery_func
        )

    count_sql += " GROUP BY room_id"

    count_results = await self.db_pool.execute(
        "search_rooms_count", self.db_pool.cursor_to_dict, count_sql, *count_args
    )

    count = sum(
        row["count"] for row in count_results if row["room_id"] in allowed_room_ids
    )

    return {
        "results": [
            {
                "event": event_map[r["event_id"]],
                "rank": r["rank"],
                "pagination_token": "%s,%s"
                % (r["origin_server_ts"], r["stream_ordering"]),
            }
            for r in results
            if r["event_id"] in event_map
        ],
        "highlights": highlights,
        "count": count,
    }
|
2015-11-27 09:40:42 -07:00
|
|
|
|
2020-09-01 09:04:17 -06:00
|
|
|
async def _find_highlights_in_postgres(
    self, search_query: str, events: List[EventBase], tsquery_func: str
) -> Set[str]:
    """Work out which words of the given events matched a search query.

    The returned words are intended for clients to match against, so that
    they can highlight the matching parts of each event.

    Args:
        search_query: The query text handed to `tsquery_func`
        events: A list of events
        tsquery_func: The tsquery_* function to use when making queries

    Returns:
        A set of lower-cased strings.
    """

    def _collect_highlights(txn: LoggingTransaction) -> Set[str]:
        found = set()

        for event in events:
            # As a hack we simply join the values of all possible keys. This
            # is fine since they are only used to find possible highlights.
            candidates = (
                event.content.get(field, None) for field in ("body", "name", "topic")
            )
            texts = [_clean_value_for_search(text) for text in candidates if text]
            if not texts:
                continue

            haystack = " ".join(texts)

            # Pick StartSel / StopSel markers which do not occur in the text
            # itself, so that the marked spans can be picked out afterwards.
            open_marker = "<"
            while open_marker in haystack:
                open_marker += "<"

            close_marker = ">"
            while close_marker in haystack:
                close_marker += ">"

            headline_options = _to_postgres_options(
                {
                    "StartSel": open_marker,
                    "StopSel": close_marker,
                    "MaxFragments": "50",
                }
            )
            query = f"SELECT ts_headline(?, {tsquery_func}('english', ?), %s)" % (
                headline_options,
            )
            txn.execute(query, (haystack, search_query))
            (headline,) = txn.fetchall()[0]

            # Pull every span wrapped in the markers out of the headline.
            span_pattern = "%s(.*?)%s" % (
                re.escape(open_marker),
                re.escape(close_marker),
            )
            found.update([span.lower() for span in re.findall(span_pattern, headline)])

        return found

    return await self.db_pool.runInteraction("_find_highlights", _collect_highlights)
|
2015-11-27 09:40:42 -07:00
|
|
|
|
|
|
|
|
2022-05-10 12:07:48 -06:00
|
|
|
def _to_postgres_options(options_dict: JsonDict) -> str:
|
2019-04-03 03:07:29 -06:00
|
|
|
return "'%s'" % (",".join("%s=%s" % (k, v) for k, v in options_dict.items()),)
|
2015-12-02 04:38:51 -07:00
|
|
|
|
|
|
|
|
Unified search query syntax using the full-text search capabilities of the underlying DB. (#11635)
Support a unified search query syntax which leverages more of the full-text
search of each database supported by Synapse.
Supports, with the same syntax across Postgresql 11+ and Sqlite:
- quoted "search terms"
- `AND`, `OR`, `-` (negation) operators
- Matching words based on their stem, e.g. searches for "dog" matches
documents containing "dogs".
This is achieved by
- If on postgresql 11+, pass the user input to `websearch_to_tsquery`
- If on sqlite, manually parse the query and transform it into the sqlite-specific
query syntax.
Note that postgresql 10, which is close to end-of-life, falls back to using
`phraseto_tsquery`, which only supports a subset of the features.
Multiple terms separated by a space are implicitly ANDed.
Note that:
1. There is no escaping of full-text syntax that might be supported by the database;
e.g. `NOT`, `NEAR`, `*` in sqlite. This runs the risk that people might discover this
as accidental functionality and depend on something we don't guarantee.
2. English text is assumed for stemming. To support other languages, either the target
language needs to be known at the time of indexing the message (via room metadata,
or otherwise), or a separate index for each language supported could be created.
Sqlite docs: https://www.sqlite.org/fts3.html#full_text_index_queries
Postgres docs: https://www.postgresql.org/docs/11/textsearch-controls.html
2022-10-25 12:05:22 -06:00
|
|
|
@dataclass
class Phrase:
    """A double-quoted phrase from a search query, held as its individual words."""

    # The words of the phrase, in the order they appeared in the query.
    phrase: List[str]
|
|
|
|
|
|
|
|
|
|
|
|
class SearchToken(enum.Enum):
    """The operators which can appear in a tokenized search query."""

    # Emitted by the tokenizer for a leading `-` on a search term.
    Not = enum.auto()
    # Emitted by the tokenizer for the word `or` (in any case).
    Or = enum.auto()
    # Inserted by the tokenizer between consecutive terms and phrases.
    And = enum.auto()
|
|
|
|
|
|
|
|
|
|
|
|
# A single element of a tokenized search query: a bare search term (str), a
# quoted phrase (Phrase) or an operator (SearchToken).
Token = Union[str, Phrase, SearchToken]
# The tokens of a search query, in order.
TokenList = List[Token]
|
|
|
|
|
|
|
|
|
|
|
|
def _is_stop_word(word: str) -> bool:
|
|
|
|
# TODO Pull these out of the dictionary:
|
|
|
|
# https://github.com/postgres/postgres/blob/master/src/backend/snowball/stopwords/english.stop
|
|
|
|
return word in {"the", "a", "you", "me", "and", "but"}
|
|
|
|
|
|
|
|
|
|
|
|
def _tokenize_query(query: str) -> TokenList:
    """
    Convert the user-supplied `query` into a TokenList, which can be translated into
    some DB-specific syntax.

    The following constructs are supported:

    - phrase queries using "double quotes"
    - a case-insensitive `or` operator
    - unary hyphen to denote NOT e.g. 'include -exclude'
    - consecutive terms and phrases are implicitly ANDed together

    Words for which `_is_stop_word` returns true (which includes `and`) are dropped,
    both inside and outside of phrases.

    The following differs from websearch_to_tsquery:

    - Unclosed phrases are treated differently.

    """
    result: TokenList = []

    def _add_implicit_and() -> None:
        # Join to the previous token with AND, unless this is the first token or
        # the previous token is an operator which is still waiting for its operand.
        if result and result[-1] not in (SearchToken.Not, SearchToken.Or):
            result.append(SearchToken.And)

    # Splitting on double quotes leaves the quoted sections at the odd indices.
    for index, segment in enumerate(query.split('"')):
        # Pull out the individual words, discarding any non-word characters.
        found = re.findall(r"([\w\-]+)", segment, re.UNICODE)

        if index % 2:
            # Inside a phrase: no operators are recognised, only stop words
            # are stripped.
            _add_implicit_and()
            result.append(Phrase([w for w in found if not _is_stop_word(w)]))
            continue

        # Outside of a phrase.
        pending = deque(found)
        while pending:
            current = pending.popleft()

            if current.startswith("-"):
                result.append(SearchToken.Not)

                # Whatever follows the hyphen is processed again as its own word.
                remainder = current[1:]
                if remainder:
                    pending.appendleft(remainder)
                continue

            if current.lower() == "or":
                result.append(SearchToken.Or)
                continue

            if _is_stop_word(current):
                continue

            _add_implicit_and()
            result.append(current)

    return result
|
|
|
|
|
|
|
|
|
|
|
|
def _tokens_to_sqlite_match_query(tokens: TokenList) -> str:
|
|
|
|
"""
|
|
|
|
Convert the list of tokens to a string suitable for passing to sqlite's MATCH.
|
|
|
|
Assume sqlite was compiled with enhanced query syntax.
|
|
|
|
|
|
|
|
Ref: https://www.sqlite.org/fts3.html#full_text_index_queries
|
2015-12-02 04:38:51 -07:00
|
|
|
"""
|
Unified search query syntax using the full-text search capabilities of the underlying DB. (#11635)
Support a unified search query syntax which leverages more of the full-text
search of each database supported by Synapse.
Supports, with the same syntax across Postgresql 11+ and Sqlite:
- quoted "search terms"
- `AND`, `OR`, `-` (negation) operators
- Matching words based on their stem, e.g. searches for "dog" matches
documents containing "dogs".
This is achieved by
- If on postgresql 11+, pass the user input to `websearch_to_tsquery`
- If on sqlite, manually parse the query and transform it into the sqlite-specific
query syntax.
Note that postgresql 10, which is close to end-of-life, falls back to using
`phraseto_tsquery`, which only supports a subset of the features.
Multiple terms separated by a space are implicitly ANDed.
Note that:
1. There is no escaping of full-text syntax that might be supported by the database;
e.g. `NOT`, `NEAR`, `*` in sqlite. This runs the risk that people might discover this
as accidental functionality and depend on something we don't guarantee.
2. English text is assumed for stemming. To support other languages, either the target
language needs to be known at the time of indexing the message (via room metadata,
or otherwise), or a separate index for each language supported could be created.
Sqlite docs: https://www.sqlite.org/fts3.html#full_text_index_queries
Postgres docs: https://www.postgresql.org/docs/11/textsearch-controls.html
2022-10-25 12:05:22 -06:00
|
|
|
match_query = []
|
|
|
|
for token in tokens:
|
|
|
|
if isinstance(token, str):
|
|
|
|
match_query.append(token)
|
|
|
|
elif isinstance(token, Phrase):
|
|
|
|
match_query.append('"' + " ".join(token.phrase) + '"')
|
|
|
|
elif token == SearchToken.Not:
|
|
|
|
# TODO: SQLite treats NOT as a *binary* operator. Hopefully a search
|
|
|
|
# term has already been added before this.
|
|
|
|
match_query.append(" NOT ")
|
|
|
|
elif token == SearchToken.Or:
|
|
|
|
match_query.append(" OR ")
|
|
|
|
elif token == SearchToken.And:
|
|
|
|
match_query.append(" AND ")
|
|
|
|
else:
|
|
|
|
raise ValueError(f"unknown token {token}")
|
|
|
|
|
|
|
|
return "".join(match_query)
|
2015-12-02 04:38:51 -07:00
|
|
|
|
|
|
|
|
Unified search query syntax using the full-text search capabilities of the underlying DB. (#11635)
Support a unified search query syntax which leverages more of the full-text
search of each database supported by Synapse.
Supports, with the same syntax across Postgresql 11+ and Sqlite:
- quoted "search terms"
- `AND`, `OR`, `-` (negation) operators
- Matching words based on their stem, e.g. searches for "dog" matches
documents containing "dogs".
This is achieved by
- If on postgresql 11+, pass the user input to `websearch_to_tsquery`
- If on sqlite, manually parse the query and transform it into the sqlite-specific
query syntax.
Note that postgresql 10, which is close to end-of-life, falls back to using
`phraseto_tsquery`, which only supports a subset of the features.
Multiple terms separated by a space are implicitly ANDed.
Note that:
1. There is no escaping of full-text syntax that might be supported by the database;
e.g. `NOT`, `NEAR`, `*` in sqlite. This runs the risk that people might discover this
as accidental functionality and depend on something we don't guarantee.
2. English text is assumed for stemming. To support other languages, either the target
language needs to be known at the time of indexing the message (via room metadata,
or otherwise), or a separate index for each language supported could be created.
Sqlite docs: https://www.sqlite.org/fts3.html#full_text_index_queries
Postgres docs: https://www.postgresql.org/docs/11/textsearch-controls.html
2022-10-25 12:05:22 -06:00
|
|
|
def _parse_query_for_sqlite(search_term: str) -> str:
    """Turn a plain unicode search string from the user into sqlite query syntax.

    The search term is first tokenized, and the tokens are then rendered as a
    string which can be handed to sqlite's MATCH operator.
    """
    tokens = _tokenize_query(search_term)
    match_query = _tokens_to_sqlite_match_query(tokens)
    return match_query
|