From 940254a95bd02fe30ccc3ee27729606b91a04514 Mon Sep 17 00:00:00 2001 From: anoa's Codex Agent Date: Fri, 6 Jun 2025 14:35:11 +0100 Subject: [PATCH 1/8] Replace PyICU with Rust icu_segmenter --- docs/development/contributing_guide.md | 2 +- docs/development/dependencies.md | 5 +- docs/setup/installation.md | 26 ++------ docs/upgrade.md | 15 +++-- poetry.lock | 63 ++++++++----------- pyproject.toml | 7 --- rust/Cargo.toml | 1 + rust/src/lib.rs | 2 + rust/src/segmenter.rs | 30 +++++++++ .../storage/databases/main/user_directory.py | 36 ++--------- synapse/synapse_rust/segmenter.pyi | 3 + tests/storage/test_user_directory.py | 19 ++---- 12 files changed, 89 insertions(+), 120 deletions(-) create mode 100644 rust/src/segmenter.rs create mode 100644 synapse/synapse_rust/segmenter.pyi diff --git a/docs/development/contributing_guide.md b/docs/development/contributing_guide.md index d6efab96cfb..64818d6f12c 100644 --- a/docs/development/contributing_guide.md +++ b/docs/development/contributing_guide.md @@ -29,7 +29,7 @@ easiest way of installing the latest version is to use [rustup](https://rustup.r Synapse can connect to PostgreSQL via the [psycopg2](https://pypi.org/project/psycopg2/) Python library. Building this library from source requires access to PostgreSQL's C header files. On Debian or Ubuntu Linux, these can be installed with `sudo apt install libpq-dev`. -Synapse has an optional, improved user search with better Unicode support. For that you need the development package of `libicu`. On Debian or Ubuntu Linux, this can be installed with `sudo apt install libicu-dev`. +Synapse bundles the ICU library via Rust, so no additional `libicu` package is required for improved user search. The source code of Synapse is hosted on GitHub. You will also need [a recent version of git](https://github.com/git-guides/install-git). 
diff --git a/docs/development/dependencies.md b/docs/development/dependencies.md index fa5ff4dcf7f..e381b3d1555 100644 --- a/docs/development/dependencies.md +++ b/docs/development/dependencies.md @@ -164,10 +164,7 @@ $ poetry cache clear --all . # including the wheel artifacts which is not covered by the above command # (see https://github.com/python-poetry/poetry/issues/10304) # -# This is necessary in order to rebuild or fetch new wheels. For example, if you update -# the `icu` library in on your system, you will need to rebuild the PyICU Python package -# in order to incorporate the correct dynamically linked library locations otherwise you -# will run into errors like: `ImportError: libicui18n.so.75: cannot open shared object file: No such file or directory` +# This is necessary in order to rebuild or fetch new wheels. $ rm -rf $(poetry config cache-dir) ``` diff --git a/docs/setup/installation.md b/docs/setup/installation.md index 0853496ab7d..05a557e77ac 100644 --- a/docs/setup/installation.md +++ b/docs/setup/installation.md @@ -286,7 +286,7 @@ Installing prerequisites on Ubuntu or Debian: ```sh sudo apt install build-essential python3-dev libffi-dev \ python3-pip python3-setuptools sqlite3 \ - libssl-dev virtualenv libjpeg-dev libxslt1-dev libicu-dev + libssl-dev virtualenv libjpeg-dev libxslt1-dev ``` ##### ArchLinux @@ -295,7 +295,7 @@ Installing prerequisites on ArchLinux: ```sh sudo pacman -S base-devel python python-pip \ - python-setuptools python-virtualenv sqlite3 icu + python-setuptools python-virtualenv sqlite3 ``` ##### CentOS/Fedora @@ -305,8 +305,7 @@ Installing prerequisites on CentOS or Fedora Linux: ```sh sudo dnf install libtiff-devel libjpeg-devel libzip-devel freetype-devel \ libwebp-devel libxml2-devel libxslt-devel libpq-devel \ - python3-virtualenv libffi-devel openssl-devel python3-devel \ - libicu-devel + python3-virtualenv libffi-devel openssl-devel python3-devel sudo dnf group install "Development Tools" ``` @@ -333,7 +332,7 @@ 
dnf install python3.12 python3.12-devel ``` Finally, install common prerequisites ```bash -dnf install libicu libicu-devel libpq5 libpq5-devel lz4 pkgconf +dnf install libpq5 libpq5-devel lz4 pkgconf dnf group install "Development Tools" ``` ###### Using venv module instead of virtualenv command @@ -365,19 +364,7 @@ xcode-select --install Some extra dependencies may be needed. You can use Homebrew (https://brew.sh) for them. -You may need to install icu, and make the icu binaries and libraries accessible. -Please follow [the official instructions of PyICU](https://pypi.org/project/PyICU/) to do so. - -If you're struggling to get icu discovered, and see: -``` - RuntimeError: - Please install pkg-config on your system or set the ICU_VERSION environment - variable to the version of ICU you have installed. -``` -despite it being installed and having your `PATH` updated, you can omit this dependency by -not specifying `--extras all` to `poetry`. If using postgres, you can install Synapse via -`poetry install --extras saml2 --extras oidc --extras postgres --extras opentracing --extras redis --extras sentry`. -ICU is not a hard dependency on getting a working installation. +The ICU library is bundled with Synapse and requires no additional setup. On ARM-based Macs you may also need to install libjpeg and libpq: ```sh @@ -400,8 +387,7 @@ Installing prerequisites on openSUSE: ```sh sudo zypper in -t pattern devel_basis sudo zypper in python-pip python-setuptools sqlite3 python-virtualenv \ - python-devel libffi-devel libopenssl-devel libjpeg62-devel \ - libicu-devel + python-devel libffi-devel libopenssl-devel libjpeg62-devel ``` ##### OpenBSD diff --git a/docs/upgrade.md b/docs/upgrade.md index d508e2231e3..1d07d00e98d 100644 --- a/docs/upgrade.md +++ b/docs/upgrade.md @@ -117,6 +117,13 @@ each upgrade are complete before moving on to the next upgrade, to avoid stacking them up. 
You can monitor the currently running background updates with [the Admin API](usage/administration/admin_api/background_updates.html#status). +# Upgrading to v1.131.0 + +## ICU bundled with Synapse + +Synapse now uses the Rust `icu` library for improved user search. Installing the +native ICU library on your system is no longer required. + # Upgrading to v1.130.0 ## Documented endpoint which can be delegated to a federation worker @@ -516,11 +523,11 @@ For all other installation methods, no acction is required. This version introduces optional support for an [improved user search dealing with Unicode characters](https://github.com/matrix-org/synapse/pull/14464). If you want to take advantage of this feature you need to install PyICU, -the ICU native dependency and its development headers -so that PyICU can build since no prebuilt wheels are available. +the ICU native dependency and its development headers so that PyICU can build +since no prebuilt wheels are available. -You can follow [the PyICU documentation](https://pypi.org/project/PyICU/) to do so, -and then do `pip install matrix-synapse[user-search]` for a PyPI install. +You can follow [the PyICU documentation](https://pypi.org/project/PyICU/) to do +so, and then do `pip install matrix-synapse[user-search]` for a PyPI install. Docker images and Debian packages need nothing specific as they already include or specify ICU as an explicit dependency. diff --git a/poetry.lock b/poetry.lock index cbed01b5646..75696f8ce48 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. 
[[package]] name = "annotated-types" @@ -39,7 +39,7 @@ description = "The ultimate Python library in building OAuth and OpenID Connect optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"jwt\" or extra == \"oidc\"" +markers = "extra == \"oidc\" or extra == \"jwt\" or extra == \"all\"" files = [ {file = "authlib-1.5.2-py2.py3-none-any.whl", hash = "sha256:8804dd4402ac5e4a0435ac49e0b6e19e395357cfa632a3f624dcb4f6df13b4b1"}, {file = "authlib-1.5.2.tar.gz", hash = "sha256:fe85ec7e50c5f86f1e2603518bb3b4f632985eb4a355e52256530790e326c512"}, @@ -451,7 +451,7 @@ description = "XML bomb protection for Python stdlib modules" optional = true python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" groups = ["main"] -markers = "extra == \"all\" or extra == \"saml2\"" +markers = "extra == \"saml2\" or extra == \"all\"" files = [ {file = "defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61"}, {file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"}, @@ -494,7 +494,7 @@ description = "XPath 1.0/2.0/3.0/3.1 parsers and selectors for ElementTree and l optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"all\" or extra == \"saml2\"" +markers = "extra == \"saml2\" or extra == \"all\"" files = [ {file = "elementpath-4.1.5-py3-none-any.whl", hash = "sha256:2ac1a2fb31eb22bbbf817f8cf6752f844513216263f0e3892c8e79782fe4bb55"}, {file = "elementpath-4.1.5.tar.gz", hash = "sha256:c2d6dc524b29ef751ecfc416b0627668119d8812441c555d7471da41d4bacb8d"}, @@ -544,7 +544,7 @@ description = "Python wrapper for hiredis" optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"all\" or extra == \"redis\"" +markers = "extra == \"redis\" or extra == \"all\"" files = [ {file = "hiredis-3.1.0-cp310-cp310-macosx_10_15_universal2.whl", hash = 
"sha256:2892db9db21f0cf7cc298d09f85d3e1f6dc4c4c24463ab67f79bc7a006d51867"}, {file = "hiredis-3.1.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:93cfa6cc25ee2ceb0be81dc61eca9995160b9e16bdb7cca4a00607d57e998918"}, @@ -890,7 +890,7 @@ description = "Jaeger Python OpenTracing Tracer implementation" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"all\" or extra == \"opentracing\"" +markers = "extra == \"opentracing\" or extra == \"all\"" files = [ {file = "jaeger-client-4.8.0.tar.gz", hash = "sha256:3157836edab8e2c209bd2d6ae61113db36f7ee399e66b1dcbb715d87ab49bfe0"}, ] @@ -1028,7 +1028,7 @@ description = "A strictly RFC 4510 conforming LDAP V3 pure Python client library optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"all\" or extra == \"matrix-synapse-ldap3\"" +markers = "extra == \"matrix-synapse-ldap3\" or extra == \"all\"" files = [ {file = "ldap3-2.9.1-py2.py3-none-any.whl", hash = "sha256:5869596fc4948797020d3f03b7939da938778a0f9e2009f7a072ccf92b8e8d70"}, {file = "ldap3-2.9.1.tar.gz", hash = "sha256:f3e7fc4718e3f09dda568b57100095e0ce58633bcabbed8667ce3f8fbaa4229f"}, @@ -1044,7 +1044,7 @@ description = "Powerful and Pythonic XML processing library combining libxml2/li optional = true python-versions = ">=3.6" groups = ["main"] -markers = "extra == \"all\" or extra == \"url-preview\"" +markers = "extra == \"url-preview\" or extra == \"all\"" files = [ {file = "lxml-5.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e7bc6df34d42322c5289e37e9971d6ed114e3776b45fa879f734bded9d1fea9c"}, {file = "lxml-5.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6854f8bd8a1536f8a1d9a3655e6354faa6406621cf857dc27b681b69860645c7"}, @@ -1324,7 +1324,7 @@ description = "An LDAP3 auth provider for Synapse" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"all\" or extra == \"matrix-synapse-ldap3\"" +markers = "extra == \"matrix-synapse-ldap3\" or extra == \"all\"" 
files = [ {file = "matrix-synapse-ldap3-0.3.0.tar.gz", hash = "sha256:8bb6517173164d4b9cc44f49de411d8cebdb2e705d5dd1ea1f38733c4a009e1d"}, {file = "matrix_synapse_ldap3-0.3.0-py3-none-any.whl", hash = "sha256:8b4d701f8702551e98cc1d8c20dbed532de5613584c08d0df22de376ba99159d"}, @@ -1545,7 +1545,7 @@ description = "OpenTracing API for Python. See documentation at http://opentraci optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"all\" or extra == \"opentracing\"" +markers = "extra == \"opentracing\" or extra == \"all\"" files = [ {file = "opentracing-2.4.0.tar.gz", hash = "sha256:a173117e6ef580d55874734d1fa7ecb6f3655160b8b8974a2a1e98e5ec9c840d"}, ] @@ -1714,7 +1714,7 @@ description = "psycopg2 - Python-PostgreSQL Database Adapter" optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"all\" or extra == \"postgres\"" +markers = "extra == \"postgres\" or extra == \"all\"" files = [ {file = "psycopg2-2.9.10-cp310-cp310-win32.whl", hash = "sha256:5df2b672140f95adb453af93a7d669d7a7bf0a56bcd26f1502329166f4a61716"}, {file = "psycopg2-2.9.10-cp310-cp310-win_amd64.whl", hash = "sha256:c6f7b8561225f9e711a9c47087388a97fdc948211c10a4bccbf0ba68ab7b3b5a"}, @@ -1735,7 +1735,7 @@ description = ".. 
image:: https://travis-ci.org/chtd/psycopg2cffi.svg?branch=mas optional = true python-versions = "*" groups = ["main"] -markers = "platform_python_implementation == \"PyPy\" and (extra == \"all\" or extra == \"postgres\")" +markers = "platform_python_implementation == \"PyPy\" and (extra == \"postgres\" or extra == \"all\")" files = [ {file = "psycopg2cffi-2.9.0.tar.gz", hash = "sha256:7e272edcd837de3a1d12b62185eb85c45a19feda9e62fa1b120c54f9e8d35c52"}, ] @@ -1751,7 +1751,7 @@ description = "A Simple library to enable psycopg2 compatability" optional = true python-versions = "*" groups = ["main"] -markers = "platform_python_implementation == \"PyPy\" and (extra == \"all\" or extra == \"postgres\")" +markers = "platform_python_implementation == \"PyPy\" and (extra == \"postgres\" or extra == \"all\")" files = [ {file = "psycopg2cffi-compat-1.1.tar.gz", hash = "sha256:d25e921748475522b33d13420aad5c2831c743227dc1f1f2585e0fdb5c914e05"}, ] @@ -1967,18 +1967,6 @@ files = [ [package.extras] plugins = ["importlib-metadata ; python_version < \"3.8\""] -[[package]] -name = "pyicu" -version = "2.14" -description = "Python extension wrapping the ICU C++ API" -optional = true -python-versions = "*" -groups = ["main"] -markers = "extra == \"all\" or extra == \"user-search\"" -files = [ - {file = "PyICU-2.14.tar.gz", hash = "sha256:acc7eb92bd5c554ed577249c6978450a4feda0aa6f01470152b3a7b382a02132"}, -] - [[package]] name = "pyjwt" version = "2.6.0" @@ -2023,7 +2011,7 @@ description = "A development tool to measure, monitor and analyze the memory beh optional = true python-versions = ">=3.6" groups = ["main"] -markers = "extra == \"all\" or extra == \"cache-memory\"" +markers = "extra == \"cache-memory\" or extra == \"all\"" files = [ {file = "Pympler-1.0.1-py3-none-any.whl", hash = "sha256:d260dda9ae781e1eab6ea15bacb84015849833ba5555f141d2d9b7b7473b307d"}, {file = "Pympler-1.0.1.tar.gz", hash = "sha256:993f1a3599ca3f4fcd7160c7545ad06310c9e12f70174ae7ae8d4e25f6c5d3fa"}, @@ -2083,7 
+2071,7 @@ description = "Python implementation of SAML Version 2 Standard" optional = true python-versions = ">=3.9,<4.0" groups = ["main"] -markers = "extra == \"all\" or extra == \"saml2\"" +markers = "extra == \"saml2\" or extra == \"all\"" files = [ {file = "pysaml2-7.5.0-py3-none-any.whl", hash = "sha256:bc6627cc344476a83c757f440a73fda1369f13b6fda1b4e16bca63ffbabb5318"}, {file = "pysaml2-7.5.0.tar.gz", hash = "sha256:f36871d4e5ee857c6b85532e942550d2cf90ea4ee943d75eb681044bbc4f54f7"}, @@ -2108,7 +2096,7 @@ description = "Extensions to the standard Python datetime module" optional = true python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" groups = ["main"] -markers = "extra == \"all\" or extra == \"saml2\"" +markers = "extra == \"saml2\" or extra == \"all\"" files = [ {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, @@ -2136,7 +2124,7 @@ description = "World timezone definitions, modern and historical" optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"all\" or extra == \"saml2\"" +markers = "extra == \"saml2\" or extra == \"all\"" files = [ {file = "pytz-2022.7.1-py2.py3-none-any.whl", hash = "sha256:78f4f37d8198e0627c5f1143240bb0206b8691d8d7ac6d78fee88b78733f8c4a"}, {file = "pytz-2022.7.1.tar.gz", hash = "sha256:01a0681c4b9684a28304615eba55d1ab31ae00bf68ec157ec3708a8182dbbcd0"}, @@ -2500,7 +2488,7 @@ description = "Python client for Sentry (https://sentry.io)" optional = true python-versions = ">=3.6" groups = ["main"] -markers = "extra == \"all\" or extra == \"sentry\"" +markers = "extra == \"sentry\" or extra == \"all\"" files = [ {file = "sentry_sdk-2.22.0-py2.py3-none-any.whl", hash = "sha256:3d791d631a6c97aad4da7074081a57073126c69487560c6f8bffcf586461de66"}, {file = "sentry_sdk-2.22.0.tar.gz", hash = 
"sha256:b4bf43bb38f547c84b2eadcefbe389b36ef75f3f38253d7a74d6b928c07ae944"}, @@ -2688,7 +2676,7 @@ description = "Tornado IOLoop Backed Concurrent Futures" optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"all\" or extra == \"opentracing\"" +markers = "extra == \"opentracing\" or extra == \"all\"" files = [ {file = "threadloop-1.0.2-py2-none-any.whl", hash = "sha256:5c90dbefab6ffbdba26afb4829d2a9df8275d13ac7dc58dccb0e279992679599"}, {file = "threadloop-1.0.2.tar.gz", hash = "sha256:8b180aac31013de13c2ad5c834819771992d350267bddb854613ae77ef571944"}, @@ -2704,7 +2692,7 @@ description = "Python bindings for the Apache Thrift RPC system" optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"all\" or extra == \"opentracing\"" +markers = "extra == \"opentracing\" or extra == \"all\"" files = [ {file = "thrift-0.16.0.tar.gz", hash = "sha256:2b5b6488fcded21f9d312aa23c9ff6a0195d0f6ae26ddbd5ad9e3e25dfc14408"}, ] @@ -2766,7 +2754,7 @@ description = "Tornado is a Python web framework and asynchronous networking lib optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"opentracing\"" +markers = "extra == \"opentracing\" or extra == \"all\"" files = [ {file = "tornado-6.5-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:f81067dad2e4443b015368b24e802d0083fecada4f0a4572fdb72fc06e54a9a6"}, {file = "tornado-6.5-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:9ac1cbe1db860b3cbb251e795c701c41d343f06a96049d6274e7c77559117e41"}, @@ -2901,7 +2889,7 @@ description = "non-blocking redis client for python" optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"all\" or extra == \"redis\"" +markers = "extra == \"redis\" or extra == \"all\"" files = [ {file = "txredisapi-1.4.11-py3-none-any.whl", hash = "sha256:ac64d7a9342b58edca13ef267d4fa7637c1aa63f8595e066801c1e8b56b22d0b"}, {file = "txredisapi-1.4.11.tar.gz", hash = 
"sha256:3eb1af99aefdefb59eb877b1dd08861efad60915e30ad5bf3d5bf6c5cedcdbc6"}, @@ -3244,7 +3232,7 @@ description = "An XML Schema validator and decoder" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"all\" or extra == \"saml2\"" +markers = "extra == \"saml2\" or extra == \"all\"" files = [ {file = "xmlschema-2.4.0-py3-none-any.whl", hash = "sha256:dc87be0caaa61f42649899189aab2fd8e0d567f2cf548433ba7b79278d231a4a"}, {file = "xmlschema-2.4.0.tar.gz", hash = "sha256:d74cd0c10866ac609e1ef94a5a69b018ad16e39077bc6393408b40c6babee793"}, @@ -3371,7 +3359,7 @@ docs = ["Sphinx", "repoze.sphinx.autointerface"] test = ["zope.i18nmessageid", "zope.testing", "zope.testrunner"] [extras] -all = ["Pympler", "authlib", "hiredis", "jaeger-client", "lxml", "matrix-synapse-ldap3", "opentracing", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pyicu", "pysaml2", "sentry-sdk", "txredisapi"] +all = ["Pympler", "authlib", "hiredis", "jaeger-client", "lxml", "matrix-synapse-ldap3", "opentracing", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pysaml2", "sentry-sdk", "txredisapi"] cache-memory = ["Pympler"] jwt = ["authlib"] matrix-synapse-ldap3 = ["matrix-synapse-ldap3"] @@ -3384,9 +3372,8 @@ sentry = ["sentry-sdk"] systemd = ["systemd-python"] test = ["idna", "parameterized"] url-preview = ["lxml"] -user-search = ["pyicu"] [metadata] lock-version = "2.1" python-versions = "^3.9.0" -content-hash = "9824e42dfc0e128129ee0c8641f7fe639bf47574cdd3f052dd995941abc6e44b" +content-hash = "457f188ae22af9663b2ed21f2586720ce5014edc7c34a697787f16aad733ea41" diff --git a/pyproject.toml b/pyproject.toml index d95881b53a3..e00fd151163 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -250,7 +250,6 @@ hiredis = { version = "*", optional = true } Pympler = { version = "*", optional = true } parameterized = { version = ">=0.7.4", optional = true } idna = { version = ">=2.5", optional = true } -pyicu = { version = ">=2.10.2", optional = true } [tool.poetry.extras] # 
NB: Packages that should be part of `pip install matrix-synapse[all]` need to be specified @@ -273,10 +272,6 @@ redis = ["txredisapi", "hiredis"] # Required to use experimental `caches.track_memory_usage` config option. cache-memory = ["pympler"] test = ["parameterized", "idna"] -# Allows for better search for international characters in the user directory. This -# requires libicu's development headers installed on the system (e.g. libicu-dev on -# Debian-based distributions). -user-search = ["pyicu"] # The duplication here is awful. I hate hate hate hate hate it. However, for now I want # to ensure you can still `pip install matrix-synapse[all]` like today. Two motivations: @@ -308,8 +303,6 @@ all = [ "txredisapi", "hiredis", # cache-memory "pympler", - # improved user search - "pyicu", # omitted: # - test: it's useful to have this separate from dev deps in the olddeps job # - systemd: this is a system-based requirement diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 840988e74eb..3afc4f47c00 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -43,6 +43,7 @@ sha2 = "0.10.8" serde = { version = "1.0.144", features = ["derive"] } serde_json = "1.0.85" ulid = "1.1.2" +icu_segmenter = { version = "2.0", features = ["compiled_data"] } [features] extension-module = ["pyo3/extension-module"] diff --git a/rust/src/lib.rs b/rust/src/lib.rs index d751889874b..b686c5574d1 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -12,6 +12,7 @@ pub mod identifier; pub mod matrix_const; pub mod push; pub mod rendezvous; +pub mod segmenter; lazy_static! 
{ static ref LOGGING_HANDLE: ResetHandle = pyo3_log::init(); @@ -51,6 +52,7 @@ fn synapse_rust(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { push::register_module(py, m)?; events::register_module(py, m)?; rendezvous::register_module(py, m)?; + segmenter::register_module(py, m)?; Ok(()) } diff --git a/rust/src/segmenter.rs b/rust/src/segmenter.rs new file mode 100644 index 00000000000..038f37640c8 --- /dev/null +++ b/rust/src/segmenter.rs @@ -0,0 +1,30 @@ +use pyo3::prelude::*; +use icu_segmenter::WordSegmenter; +use icu_segmenter::options::WordBreakInvariantOptions; + +#[pyfunction] +pub fn parse_words(text: &str) -> PyResult<Vec<String>> { + let segmenter = WordSegmenter::new_auto(WordBreakInvariantOptions::default()); + let mut parts = Vec::new(); + let mut last = 0usize; + for boundary in segmenter.segment_str(text) { + if boundary > last { + parts.push(text[last..boundary].to_string()); + } + last = boundary; + } + Ok(parts) +} + +pub fn register_module(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { + let child_module = PyModule::new(py, "segmenter")?; + child_module.add_function(wrap_pyfunction!(parse_words, m)?)?; + + m.add_submodule(&child_module)?; + + py.import("sys")? + .getattr("modules")? + .set_item("synapse.synapse_rust.segmenter", child_module)?; + + Ok(()) +} diff --git a/synapse/storage/databases/main/user_directory.py b/synapse/storage/databases/main/user_directory.py index 2b867cdb6ea..2c9427d7af0 100644 --- a/synapse/storage/databases/main/user_directory.py +++ b/synapse/storage/databases/main/user_directory.py @@ -37,15 +37,8 @@ import attr -try: - # Figure out if ICU support is available for searching users.
- import icu - - USE_ICU = True -except ModuleNotFoundError: - USE_ICU = False - from synapse.api.errors import StoreError +from synapse.synapse_rust import segmenter as icu from synapse.util.stringutils import non_null_str_or_none if TYPE_CHECKING: @@ -1270,12 +1263,7 @@ def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]: def _parse_words(search_term: str) -> List[str]: - """Split the provided search string into a list of its words. - - If support for ICU (International Components for Unicode) is available, use it. - Otherwise, fall back to using a regex to detect word boundaries. This latter - solution works well enough for most latin-based languages, but doesn't work as well - with other languages. + """Split the provided search string into a list of its words using ICU. Args: search_term: The search string. @@ -1283,10 +1271,7 @@ def _parse_words(search_term: str) -> List[str]: Returns: A list of the words in the search string. """ - if USE_ICU: - return _parse_words_with_icu(search_term) - - return _parse_words_with_regex(search_term) + return _parse_words_with_icu(search_term) def _parse_words_with_regex(search_term: str) -> List[str]: @@ -1308,21 +1293,10 @@ def _parse_words_with_icu(search_term: str) -> List[str]: A list of the words in the search string. """ results = [] - breaker = icu.BreakIterator.createWordInstance(icu.Locale.getDefault()) - breaker.setText(search_term) - i = 0 - while True: - j = breaker.nextBoundary() - if j < 0: - break - - # We want to make sure that we split on `@` and `:` specifically, as - # they occur in user IDs. - for result in re.split(r"[@:]+", search_term[i:j]): + for part in icu.parse_words(search_term): + for result in re.split(r"[@:]+", part): results.append(result.strip()) - i = j - # libicu will break up words that have punctuation in them, but to handle # cases where user IDs have '-', '.' 
and '_' in them we want to *not* break # those into words and instead allow the DB to tokenise them how it wants. diff --git a/synapse/synapse_rust/segmenter.pyi b/synapse/synapse_rust/segmenter.pyi new file mode 100644 index 00000000000..5f367659479 --- /dev/null +++ b/synapse/synapse_rust/segmenter.pyi @@ -0,0 +1,3 @@ +from typing import List + +def parse_words(text: str) -> List[str]: ... diff --git a/tests/storage/test_user_directory.py b/tests/storage/test_user_directory.py index c26932069f8..781832b3fc4 100644 --- a/tests/storage/test_user_directory.py +++ b/tests/storage/test_user_directory.py @@ -44,12 +44,6 @@ from tests.test_utils.event_injection import inject_member_event from tests.unittest import HomeserverTestCase, override_config -try: - import icu -except ImportError: - icu = None # type: ignore - - ALICE = "@alice:a" BOB = "@bob:b" BOBBY = "@bobby:a" @@ -451,11 +445,12 @@ def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None: self.get_success(self.store.update_profile_in_user_dir(BELA, "Bela", None)) self.get_success(self.store.add_users_in_public_rooms("!room:id", (ALICE, BOB))) - self._restore_use_icu = user_directory.USE_ICU - user_directory.USE_ICU = self.use_icu + self._restore_parse_words = user_directory._parse_words + if not self.use_icu: + user_directory._parse_words = user_directory._parse_words_with_regex def tearDown(self) -> None: - user_directory.USE_ICU = self._restore_use_icu + user_directory._parse_words = self._restore_parse_words def test_search_user_dir(self) -> None: # normally when alice searches the directory she should just find @@ -651,14 +646,8 @@ def test_search_user_dir_accent_insensitivity(self) -> None: class UserDirectoryStoreTestCaseWithIcu(UserDirectoryStoreTestCase): use_icu = True - if not icu: - skip = "Requires PyICU" - class UserDirectoryICUTestCase(HomeserverTestCase): - if not icu: - skip = "Requires PyICU" - def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> 
None: self.store = hs.get_datastores().main self.user_dir_helper = GetUserDirectoryTables(self.store) From 48f5d582ba7de8f45a151eee2053a8d40e50e71d Mon Sep 17 00:00:00 2001 From: Andrew Morgan Date: Fri, 13 Jun 2025 17:21:25 +0100 Subject: [PATCH 2/8] Updates to documentation surrounding ICU --- Cargo.lock | 247 +++++++++++++++++++++++++ docs/development/contributing_guide.md | 2 - docs/setup/installation.md | 2 - docs/upgrade.md | 8 +- docs/user_directory.md | 10 +- 5 files changed, 253 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 980dff6987f..402377a0cbe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -77,6 +77,15 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "core_maths" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77745e017f5edba1a9c1d854f6f3a52dac8a12dd5af5d2f54aecf61e43d80d30" +dependencies = [ + "libm", +] + [[package]] name = "cpufeatures" version = "0.2.12" @@ -107,6 +116,17 @@ dependencies = [ "subtle", ] +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "fnv" version = "1.0.7" @@ -188,6 +208,95 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" +[[package]] +name = "icu_collections" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47" +dependencies = [ + "displaydoc", + "potential_utf", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale" +version = "2.0.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ae5921528335e91da1b6c695dbf1ec37df5ac13faa3f91e5640be93aa2fbefd" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_locale_core", + "icu_locale_data", + "icu_provider", + "potential_utf", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_locale_data" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fdef0c124749d06a743c69e938350816554eb63ac979166590e2b4ee4252765" + +[[package]] +name = "icu_provider" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af" +dependencies = [ + "displaydoc", + "icu_locale_core", + "stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + +[[package]] +name = "icu_segmenter" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e185fc13b6401c138cf40db12b863b35f5edf31b88192a545857b41aeaf7d3d3" +dependencies = [ + "core_maths", + "displaydoc", + "icu_collections", + "icu_locale", + "icu_locale_core", + "icu_provider", + "icu_segmenter_data", + "potential_utf", + "utf8_iter", + "zerovec", +] + +[[package]] +name = "icu_segmenter_data" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5360a2fbe97f617c4f8b944356dedb36d423f7da7f13c070995cf89e59f01220" + [[package]] name = "indoc" version = "2.0.5" @@ -221,6 +330,18 @@ version = "0.2.154" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"ae743338b92ff9146ce83992f766a31066a91a8c84a45e0e9f21e7cf6de6d346" +[[package]] +name = "libm" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de" + +[[package]] +name = "litemap" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" + [[package]] name = "log" version = "0.4.27" @@ -260,6 +381,16 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0" +[[package]] +name = "potential_utf" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5a7c30837279ca13e7c867e9e40053bc68740f988cb07f7ca6df43cc734b585" +dependencies = [ + "serde", + "zerovec", +] + [[package]] name = "ppv-lite86" version = "0.2.17" @@ -489,6 +620,12 @@ dependencies = [ "digest", ] +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + [[package]] name = "subtle" version = "2.5.0" @@ -517,6 +654,7 @@ dependencies = [ "headers", "hex", "http", + "icu_segmenter", "lazy_static", "log", "mime", @@ -530,12 +668,33 @@ dependencies = [ "ulid", ] +[[package]] +name = "synstructure" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "target-lexicon" version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e502f78cdbb8ba4718f566c418c52bc729126ffd16baee5baa718cf25dd5a69a" +[[package]] +name = "tinystr" +version = "0.8.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b" +dependencies = [ + "displaydoc", + "zerovec", +] + [[package]] name = "typenum" version = "1.17.0" @@ -564,6 +723,12 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + [[package]] name = "version_check" version = "0.9.4" @@ -716,6 +881,36 @@ dependencies = [ "bitflags", ] +[[package]] +name = "writeable" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" + +[[package]] +name = "yoke" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + [[package]] name = "zerocopy" version = "0.8.17" @@ -735,3 +930,55 @@ dependencies = [ "quote", "syn", ] + +[[package]] +name = "zerofrom" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zerotrie" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595" +dependencies = [ + "displaydoc", +] + +[[package]] +name = "zerovec" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a05eb080e015ba39cc9e23bbe5e7fb04d5fb040350f99f34e338d5fdd294428" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/docs/development/contributing_guide.md b/docs/development/contributing_guide.md index 64818d6f12c..eb6f04e301c 100644 --- a/docs/development/contributing_guide.md +++ b/docs/development/contributing_guide.md @@ -29,8 +29,6 @@ easiest way of installing the latest version is to use [rustup](https://rustup.r Synapse can connect to PostgreSQL via the [psycopg2](https://pypi.org/project/psycopg2/) Python library. Building this library from source requires access to PostgreSQL's C header files. On Debian or Ubuntu Linux, these can be installed with `sudo apt install libpq-dev`. -Synapse bundles the ICU library via Rust, so no additional `libicu` package is required for improved user search. - The source code of Synapse is hosted on GitHub. You will also need [a recent version of git](https://github.com/git-guides/install-git). For some tests, you will need [a recent version of Docker](https://docs.docker.com/get-docker/). 
diff --git a/docs/setup/installation.md b/docs/setup/installation.md index 05a557e77ac..0840f532b05 100644 --- a/docs/setup/installation.md +++ b/docs/setup/installation.md @@ -364,8 +364,6 @@ xcode-select --install Some extra dependencies may be needed. You can use Homebrew (https://brew.sh) for them. -The ICU library is bundled with Synapse and requires no additional setup. - On ARM-based Macs you may also need to install libjpeg and libpq: ```sh brew install jpeg libpq diff --git a/docs/upgrade.md b/docs/upgrade.md index 1d07d00e98d..d42b935e0f7 100644 --- a/docs/upgrade.md +++ b/docs/upgrade.md @@ -523,11 +523,11 @@ For all other installation methods, no acction is required. This version introduces optional support for an [improved user search dealing with Unicode characters](https://github.com/matrix-org/synapse/pull/14464). If you want to take advantage of this feature you need to install PyICU, -the ICU native dependency and its development headers so that PyICU can build -since no prebuilt wheels are available. +the ICU native dependency and its development headers +so that PyICU can build since no prebuilt wheels are available. -You can follow [the PyICU documentation](https://pypi.org/project/PyICU/) to do -so, and then do `pip install matrix-synapse[user-search]` for a PyPI install. +You can follow [the PyICU documentation](https://pypi.org/project/PyICU/) to do so, +and then do `pip install matrix-synapse[user-search]` for a PyPI install. Docker images and Debian packages need nothing specific as they already include or specify ICU as an explicit dependency. 
diff --git a/docs/user_directory.md b/docs/user_directory.md index be8664a0163..f8a78a82066 100644 --- a/docs/user_directory.md +++ b/docs/user_directory.md @@ -77,14 +77,8 @@ The user provided search term is lowercased and normalized using [NFKC](https:// this treats the string as case-insensitive, canonicalizes different forms of the same text, and maps some "roughly equivalent" characters together. -The search term is then split into words: - -* If [ICU](https://en.wikipedia.org/wiki/International_Components_for_Unicode) is - available, then the system's [default locale](https://unicode-org.github.io/icu/userguide/locale/#default-locales) - will be used to break the search term into words. (See the - [installation instructions](setup/installation.md) for how to install ICU.) -* If unavailable, then runs of ASCII characters, numbers, underscores, and hyphens - are considered words. +The search term is then split into words by the [`icu_segmenter`](https://crates.io/crates/icu_segmenter) +Rust crate, using locale-independent word-break rules (the system locale is not consulted). The queries for PostgreSQL and SQLite are detailed below, but their overall goal is to find matching users, preferring users who are "real" (e.g. not bots, From f53f757e0b50c0f64315ab928e5594edcbd8ff67 Mon Sep 17 00:00:00 2001 From: Andrew Morgan Date: Fri, 13 Jun 2025 17:21:50 +0100 Subject: [PATCH 3/8] Remove `icu` from Nix development flake --- flake.nix | 1 - 1 file changed, 1 deletion(-) diff --git a/flake.nix b/flake.nix index 749c10da1d0..4ff6518aed7 100644 --- a/flake.nix +++ b/flake.nix @@ -96,7 +96,6 @@ gnumake # Native dependencies for running Synapse.
- icu libffi libjpeg libpqxx From 4807cd2f7b15c3a48107971ca61b5fc1b5f8a8b0 Mon Sep 17 00:00:00 2001 From: Andrew Morgan Date: Fri, 13 Jun 2025 17:29:36 +0100 Subject: [PATCH 4/8] lint --- rust/src/segmenter.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rust/src/segmenter.rs b/rust/src/segmenter.rs index 038f37640c8..f9becd04245 100644 --- a/rust/src/segmenter.rs +++ b/rust/src/segmenter.rs @@ -1,6 +1,6 @@ -use pyo3::prelude::*; -use icu_segmenter::WordSegmenter; use icu_segmenter::options::WordBreakInvariantOptions; +use icu_segmenter::WordSegmenter; +use pyo3::prelude::*; #[pyfunction] pub fn parse_words(text: &str) -> PyResult<Vec<String>> { From 79d1f176ff1268cf616a42c7254485922e45bd5b Mon Sep 17 00:00:00 2001 From: Andrew Morgan Date: Fri, 13 Jun 2025 17:39:19 +0100 Subject: [PATCH 5/8] Remove `_parse_words_by_regex` and associated tests --- .../storage/databases/main/user_directory.py | 8 ------- tests/storage/test_user_directory.py | 22 ++----------------- 2 files changed, 2 insertions(+), 28 deletions(-) diff --git a/synapse/storage/databases/main/user_directory.py b/synapse/storage/databases/main/user_directory.py index 2c9427d7af0..35d7431a950 100644 --- a/synapse/storage/databases/main/user_directory.py +++ b/synapse/storage/databases/main/user_directory.py @@ -1274,14 +1274,6 @@ def _parse_words(search_term: str) -> List[str]: return _parse_words_with_icu(search_term) -def _parse_words_with_regex(search_term: str) -> List[str]: - """ - Break down search term into words, when we don't have ICU available. - See: `_parse_words` - """ - return re.findall(r"([\w-]+)", search_term, re.UNICODE) - - def _parse_words_with_icu(search_term: str) -> List[str]: """Break down the provided search string into its individual words using ICU (International Components for Unicode).
diff --git a/tests/storage/test_user_directory.py b/tests/storage/test_user_directory.py index 781832b3fc4..8d1aa0dcde2 100644 --- a/tests/storage/test_user_directory.py +++ b/tests/storage/test_user_directory.py @@ -35,7 +35,6 @@ from synapse.storage.databases.main import user_directory from synapse.storage.databases.main.user_directory import ( _parse_words_with_icu, - _parse_words_with_regex, ) from synapse.storage.roommember import ProfileInfo from synapse.util import Clock @@ -432,8 +431,6 @@ async def mocked_process_users(*args: Any, **kwargs: Any) -> int: class UserDirectoryStoreTestCase(HomeserverTestCase): - use_icu = False - def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None: self.store = hs.get_datastores().main @@ -446,8 +443,6 @@ def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None: self.get_success(self.store.add_users_in_public_rooms("!room:id", (ALICE, BOB))) self._restore_parse_words = user_directory._parse_words - if not self.use_icu: - user_directory._parse_words = user_directory._parse_words_with_regex def tearDown(self) -> None: user_directory._parse_words = self._restore_parse_words @@ -643,18 +638,14 @@ def test_search_user_dir_accent_insensitivity(self) -> None: test_search_user_dir_accent_insensitivity.skip = "not supported yet" # type: ignore -class UserDirectoryStoreTestCaseWithIcu(UserDirectoryStoreTestCase): - use_icu = True - - class UserDirectoryICUTestCase(HomeserverTestCase): def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None: self.store = hs.get_datastores().main self.user_dir_helper = GetUserDirectoryTables(self.store) def test_icu_word_boundary(self) -> None: - """Tests that we correctly detect word boundaries when ICU (International - Components for Unicode) support is available. + """Tests that we correctly detect word boundaries with ICU + (International Components for Unicode). 
""" display_name = "Gáo" @@ -703,12 +694,3 @@ def test_icu_word_boundary_punctuation(self) -> None: self.assertEqual(_parse_words_with_icu("user-1"), ["user-1"]) self.assertEqual(_parse_words_with_icu("user-ab"), ["user-ab"]) self.assertEqual(_parse_words_with_icu("user.--1"), ["user", "-1"]) - - def test_regex_word_boundary_punctuation(self) -> None: - """ - Tests the behaviour of punctuation with the non-ICU tokeniser - """ - self.assertEqual( - _parse_words_with_regex("lazy'fox jumped:over the.dog"), - ["lazy", "fox", "jumped", "over", "the", "dog"], - ) From c386cefc37114b690591f3d6705a80ec3962ed4b Mon Sep 17 00:00:00 2001 From: Andrew Morgan Date: Fri, 13 Jun 2025 17:58:14 +0100 Subject: [PATCH 6/8] Add helpful comments --- rust/src/segmenter.rs | 3 +++ synapse/storage/databases/main/user_directory.py | 8 +++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/rust/src/segmenter.rs b/rust/src/segmenter.rs index f9becd04245..135b3c17796 100644 --- a/rust/src/segmenter.rs +++ b/rust/src/segmenter.rs @@ -7,6 +7,9 @@ pub fn parse_words(text: &str) -> PyResult> { let segmenter = WordSegmenter::new_auto(WordBreakInvariantOptions::default()); let mut parts = Vec::new(); let mut last = 0usize; + + // `segment_str` gives us word boundaries as a vector of indexes. Use that + // to build a vector of words, and return. for boundary in segmenter.segment_str(text) { if boundary > last { parts.push(text[last..boundary].to_string()); diff --git a/synapse/storage/databases/main/user_directory.py b/synapse/storage/databases/main/user_directory.py index 35d7431a950..73deae80504 100644 --- a/synapse/storage/databases/main/user_directory.py +++ b/synapse/storage/databases/main/user_directory.py @@ -1217,7 +1217,7 @@ def _filter_text_for_index(text: str) -> str: def _parse_query_sqlite(search_term: str) -> str: """Takes a plain unicode string from the user and converts it into a form - that can be passed to database. + that can be passed to the database. 
We use this so that we can add prefix matching, which isn't something that is supported by default. @@ -1233,7 +1233,7 @@ def _parse_query_sqlite(search_term: str) -> str: def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]: """Takes a plain unicode string from the user and converts it into a form - that can be passed to database. + that can be passed to the database. We use this so that we can add prefix matching, which isn't something that is supported by default. """ @@ -1286,10 +1286,12 @@ def _parse_words_with_icu(search_term: str) -> List[str]: """ results = [] for part in icu.parse_words(search_term): + # We want to make sure that we split on `@` and `:` specifically, as + # they occur in user IDs. for result in re.split(r"[@:]+", part): results.append(result.strip()) - # libicu will break up words that have punctuation in them, but to handle + # icu will break up words that have punctuation in them, but to handle # cases where user IDs have '-', '.' and '_' in them we want to *not* break # those into words and instead allow the DB to tokenise them how it wants. 
# From 1a8e3d76203d7640e3350f4d7db74eae1da938f8 Mon Sep 17 00:00:00 2001 From: Andrew Morgan Date: Fri, 13 Jun 2025 17:58:28 +0100 Subject: [PATCH 7/8] Remove already default feature flag --- rust/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 3afc4f47c00..6f897434936 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -43,7 +43,7 @@ sha2 = "0.10.8" serde = { version = "1.0.144", features = ["derive"] } serde_json = "1.0.85" ulid = "1.1.2" -icu_segmenter = { version = "2.0", features = ["compiled_data"] } +icu_segmenter = { version = "2.0" } [features] extension-module = ["pyo3/extension-module"] From 57de6422c95c83c14e01e15836a2e1b8d71adddc Mon Sep 17 00:00:00 2001 From: Andrew Morgan Date: Fri, 13 Jun 2025 18:05:48 +0100 Subject: [PATCH 8/8] newsfile --- changelog.d/18553.misc | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/18553.misc diff --git a/changelog.d/18553.misc b/changelog.d/18553.misc new file mode 100644 index 00000000000..bb143aacfca --- /dev/null +++ b/changelog.d/18553.misc @@ -0,0 +1 @@ +Replace the `PyICU` Python package with the equivalent `icu_segmenter` Rust crate. \ No newline at end of file