From 85784be084eaa59864773f022b436455bd12e3ba Mon Sep 17 00:00:00 2001 From: Janosch Hildebrand Date: Mon, 14 Oct 2024 01:00:32 +0200 Subject: [PATCH 1/3] Add support for the FTS5 trigram tokenizer --- Documentation/FTS5Tokenizers.md | 2 + Documentation/FullTextSearch.md | 41 +++-- GRDB/FTS/FTS5.swift | 42 ++++++ GRDB/FTS/FTS5Tokenizer.swift | 4 +- GRDB/FTS/FTS5TokenizerDescriptor.swift | 65 ++++++++ Tests/GRDBTests/FTS5TableBuilderTests.swift | 82 ++++++++++ Tests/GRDBTests/FTS5TokenizerTests.swift | 156 ++++++++++++++++++++ 7 files changed, 380 insertions(+), 12 deletions(-) diff --git a/Documentation/FTS5Tokenizers.md b/Documentation/FTS5Tokenizers.md index 29c9d75725..c25ea6bc01 100644 --- a/Documentation/FTS5Tokenizers.md +++ b/Documentation/FTS5Tokenizers.md @@ -39,6 +39,8 @@ All SQLite [built-in tokenizers](https://www.sqlite.org/fts5.html#tokenizers) to - The [porter](https://www.sqlite.org/fts5.html#porter_tokenizer) tokenizer turns English words into their root: "database engine" gives the "databas" and "engin" tokens. The query "database engines" will match, because it produces the same tokens. +- The [trigram](https://sqlite.org/fts5.html#the_trigram_tokenizer) tokenizer treats each contiguous sequence of three characters as a token to allow general substring matching. "Sequence" gives "seq", "equ", "que", "uen", "enc" and "nce". The queries "SEQUENCE", "SEQUEN", "QUENC" and "QUE" all match as they decompose into a subset of the same trigrams. + However, built-in tokenizers don't match "first" with "1st", because they produce the different "first" and "1st" tokens. Nor do they match "Grossmann" with "Großmann", because they produce the different "grossmann" and "großmann" tokens. 
diff --git a/Documentation/FullTextSearch.md b/Documentation/FullTextSearch.md index 69b1ae757a..2a0459a3a2 100644 --- a/Documentation/FullTextSearch.md +++ b/Documentation/FullTextSearch.md @@ -386,7 +386,7 @@ See [SQLite documentation](https://www.sqlite.org/fts5.html) for more informatio **A tokenizer defines what "matching" means.** Depending on the tokenizer you choose, full-text searches won't return the same results. -SQLite ships with three built-in FTS5 tokenizers: `ascii`, `porter` and `unicode61` that use different algorithms to match queries with indexed content. +SQLite ships with four built-in FTS5 tokenizers: `ascii`, `porter`, `unicode61` and `trigram` that use different algorithms to match queries with indexed content. ```swift try db.create(virtualTable: "book", using: FTS5()) { t in @@ -395,20 +395,23 @@ try db.create(virtualTable: "book", using: FTS5()) { t in t.tokenizer = .unicode61(...) t.tokenizer = .ascii t.tokenizer = .porter(...) + t.tokenizer = .trigram(...) } ``` See below some examples of matches: -| content | query | ascii | unicode61 | porter on ascii | porter on unicode61 | -| ----------- | ---------- | :----: | :-------: | :-------------: | :-----------------: | -| Foo | Foo | X | X | X | X | -| Foo | FOO | X | X | X | X | -| Jérôme | Jérôme | X ¹ | X ¹ | X ¹ | X ¹ | -| Jérôme | JÉRÔME | | X ¹ | | X ¹ | -| Jérôme | Jerome | | X ¹ | | X ¹ | -| Database | Databases | | | X | X | -| Frustration | Frustrated | | | X | X | +| content | query | ascii | unicode61 | porter on ascii | porter on unicode61 | trigram | +| ----------- | ---------- | :----: | :-------: | :-------------: | :-----------------: | :-----: | +| Foo | Foo | X | X | X | X | X | +| Foo | FOO | X | X | X | X | X | +| Jérôme | Jérôme | X ¹ | X ¹ | X ¹ | X ¹ | X ¹ | +| Jérôme | JÉRÔME | | X ¹ | | X ¹ | X ¹ | +| Jérôme | Jerome | | X ¹ | | X ¹ | | +| Database | Databases | | | X | X | | +| Frustration | Frustrated | | | X | X | | +| Sequence | quenc | | | | | X | + ¹
Don't miss [Unicode Full-Text Gotchas](#unicode-full-text-gotchas) @@ -455,6 +458,24 @@ See below some examples of matches: It strips diacritics from latin script characters if it wraps unicode61, and does not if it wraps ascii (see the example above). +- **trigram** + + ```swift + try db.create(virtualTable: "book", using: FTS5()) { t in + t.tokenizer = .trigram() + t.tokenizer = .trigram(matching: .caseInsensitiveRemovingDiacritics) + t.tokenizer = .trigram(matching: .caseSensitive) + } + ``` + + The "trigram" tokenizer is case-insensitive for unicode characters by default. It matches "Jérôme" with "JÉRÔME". + + Diacritics stripping can be enabled so it matches "jérôme" with "jerome". Case-sensitive matching can also be enabled but is mutually exclusive with diacritics stripping. + + Unlike the other tokenizers, it provides general substring matching, matching "Sequence" with "que" by splitting character sequences into overlapping 3 character tokens (trigrams). + + It can also act as an index for GLOB and LIKE queries depending on the configuration. + See [SQLite tokenizers](https://www.sqlite.org/fts5.html#tokenizers) for more information, and [custom FTS5 tokenizers](FTS5Tokenizers.md) in order to add your own tokenizers. diff --git a/GRDB/FTS/FTS5.swift b/GRDB/FTS/FTS5.swift index a87b184b1d..ef839736a0 100644 --- a/GRDB/FTS/FTS5.swift +++ b/GRDB/FTS/FTS5.swift @@ -74,6 +74,48 @@ public struct FTS5 { #endif } + #if GRDBCUSTOMSQLITE || GRDBCIPHER + /// Options for trigram tokenizer character matching. Matches the raw + /// "case_sensitive" and "remove_diacritics" tokenizer arguments. + /// + /// Related SQLite documentation: + public enum TrigramTokenizerMatching: Sendable { + /// Case insensitive matching without removing diacritics. This + /// option matches the raw "case_sensitive=0 remove_diacritics=0" + /// tokenizer argument. + case caseInsensitive + /// Case insensitive matching that removes diacritics before + /// matching. 
This option matches the raw + /// "case_sensitive=0 remove_diacritics=1" tokenizer argument. + case caseInsensitiveRemovingDiacritics + /// Case sensitive matching. Diacritics are not removed when + /// performing case sensitive matching. This option matches the raw + /// "case_sensitive=1 remove_diacritics=0" tokenizer argument. + case caseSensitive + } + #else + /// Options for trigram tokenizer character matching. Matches the raw + /// "case_sensitive" and "remove_diacritics" tokenizer arguments. + /// + /// Related SQLite documentation: + @available(iOS 15, macOS 12, tvOS 15, watchOS 8, *) // SQLite 3.35.0+ (3.34 actually) + public enum TrigramTokenizerMatching: Sendable { + /// Case insensitive matching without removing diacritics. This + /// option matches the raw "case_sensitive=0 remove_diacritics=0" + /// tokenizer argument. + case caseInsensitive + /// Case insensitive matching that removes diacritics before + /// matching. This option matches the raw + /// "case_sensitive=0 remove_diacritics=1" tokenizer argument. + @available(*, unavailable, message: "Requires a future OS release that includes SQLite >=3.45") + case caseInsensitiveRemovingDiacritics + /// Case sensitive matching. Diacritics are not removed when + /// performing case sensitive matching. This option matches the raw + /// "case_sensitive=1 remove_diacritics=0" tokenizer argument. + case caseSensitive + } + #endif + /// Creates an FTS5 module. 
/// /// For example: diff --git a/GRDB/FTS/FTS5Tokenizer.swift b/GRDB/FTS/FTS5Tokenizer.swift index e4aecc36f5..7af12968e0 100644 --- a/GRDB/FTS/FTS5Tokenizer.swift +++ b/GRDB/FTS/FTS5Tokenizer.swift @@ -148,11 +148,11 @@ extension FTS5Tokenizer { private func tokenize(_ string: String, for tokenization: FTS5Tokenization) throws -> [(token: String, flags: FTS5TokenFlags)] { - try ContiguousArray(string.utf8).withUnsafeBufferPointer { buffer -> [(String, FTS5TokenFlags)] in + try string.utf8CString.withUnsafeBufferPointer { buffer -> [(String, FTS5TokenFlags)] in guard let addr = buffer.baseAddress else { return [] } - let pText = UnsafeMutableRawPointer(mutating: addr).assumingMemoryBound(to: CChar.self) + let pText = addr let nText = CInt(buffer.count) var context = TokenizeContext() diff --git a/GRDB/FTS/FTS5TokenizerDescriptor.swift b/GRDB/FTS/FTS5TokenizerDescriptor.swift index 9750aa76fb..99a4927fb8 100644 --- a/GRDB/FTS/FTS5TokenizerDescriptor.swift +++ b/GRDB/FTS/FTS5TokenizerDescriptor.swift @@ -210,5 +210,70 @@ public struct FTS5TokenizerDescriptor: Sendable { } return FTS5TokenizerDescriptor(components: components) } + + #if GRDBCUSTOMSQLITE || GRDBCIPHER + /// The "trigram" tokenizer. + /// + /// For example: + /// + /// ```swift + /// try db.create(virtualTable: "book", using: FTS5()) { t in + /// t.tokenizer = .trigram() + /// } + /// ``` + /// + /// Related SQLite documentation: + /// + /// - parameters: + /// - matching: By default SQLite will perform case insensitive + /// matching and not remove diacritics before matching. 
+ public static func trigram( + matching: FTS5.TrigramTokenizerMatching = .caseInsensitive + ) -> FTS5TokenizerDescriptor { + var components = ["trigram"] + switch matching { + case .caseInsensitive: + break + case .caseInsensitiveRemovingDiacritics: + components.append(contentsOf: ["remove_diacritics", "1"]) + case .caseSensitive: + components.append(contentsOf: ["case_sensitive", "1"]) + } + + return FTS5TokenizerDescriptor(components: components) + } + #else + /// The "trigram" tokenizer. + /// + /// For example: + /// + /// ```swift + /// try db.create(virtualTable: "book", using: FTS5()) { t in + /// t.tokenizer = .trigram() + /// } + /// ``` + /// + /// Related SQLite documentation: + /// + /// - parameters: + /// - matching: By default SQLite will perform case insensitive + /// matching and not remove diacritics before matching. + @available(iOS 15, macOS 12, tvOS 15, watchOS 8, *) // SQLite 3.35.0+ (3.34 actually) + public static func trigram( + matching: FTS5.TrigramTokenizerMatching = .caseInsensitive + ) -> FTS5TokenizerDescriptor { + var components = ["trigram"] + switch matching { + case .caseInsensitive: + break + case .caseInsensitiveRemovingDiacritics: + components.append(contentsOf: ["remove_diacritics", "1"]) + case .caseSensitive: + components.append(contentsOf: ["case_sensitive", "1"]) + } + + return FTS5TokenizerDescriptor(components: components) + } + #endif } #endif diff --git a/Tests/GRDBTests/FTS5TableBuilderTests.swift b/Tests/GRDBTests/FTS5TableBuilderTests.swift index fe63cfc427..bb435a6188 100644 --- a/Tests/GRDBTests/FTS5TableBuilderTests.swift +++ b/Tests/GRDBTests/FTS5TableBuilderTests.swift @@ -166,7 +166,89 @@ class FTS5TableBuilderTests: GRDBTestCase { assertDidExecute(sql: "CREATE VIRTUAL TABLE \"documents\" USING fts5(content, tokenize='''unicode61'' ''tokenchars'' ''-.''')") } } + + func testTrigramTokenizer() throws { + #if GRDBCUSTOMSQLITE || GRDBCIPHER + guard sqlite3_libversion_number() >= 3034000 else { + throw 
XCTSkip("FTS5 trigram tokenizer is not available") + } + #else + guard #available(iOS 15, macOS 12, tvOS 15, watchOS 8, *) else { + throw XCTSkip("FTS5 trigram tokenizer is not available") + } + #endif + + let dbQueue = try makeDatabaseQueue() + try dbQueue.inDatabase { db in + try db.create(virtualTable: "documents", using: FTS5()) { t in + t.tokenizer = .trigram() + t.column("content") + } + assertDidExecute(sql: "CREATE VIRTUAL TABLE \"documents\" USING fts5(content, tokenize='''trigram''')") + } + } + + func testTrigramTokenizerCaseInsensitive() throws { + #if GRDBCUSTOMSQLITE || GRDBCIPHER + guard sqlite3_libversion_number() >= 3034000 else { + throw XCTSkip("FTS5 trigram tokenizer is not available") + } + #else + guard #available(iOS 15, macOS 12, tvOS 15, watchOS 8, *) else { + throw XCTSkip("FTS5 trigram tokenizer is not available") + } + #endif + + let dbQueue = try makeDatabaseQueue() + try dbQueue.inDatabase { db in + try db.create(virtualTable: "documents", using: FTS5()) { t in + t.tokenizer = .trigram(matching: .caseInsensitive) + t.column("content") + } + assertDidExecute(sql: "CREATE VIRTUAL TABLE \"documents\" USING fts5(content, tokenize='''trigram''')") + } + } + func testTrigramTokenizerCaseSensitive() throws { + #if GRDBCUSTOMSQLITE || GRDBCIPHER + guard sqlite3_libversion_number() >= 3034000 else { + throw XCTSkip("FTS5 trigram tokenizer is not available") + } + #else + guard #available(iOS 15, macOS 12, tvOS 15, watchOS 8, *) else { + throw XCTSkip("FTS5 trigram tokenizer is not available") + } + #endif + + let dbQueue = try makeDatabaseQueue() + try dbQueue.inDatabase { db in + try db.create(virtualTable: "documents", using: FTS5()) { t in + t.tokenizer = .trigram(matching: .caseSensitive) + t.column("content") + } + assertDidExecute(sql: "CREATE VIRTUAL TABLE \"documents\" USING fts5(content, tokenize='''trigram'' ''case_sensitive'' ''1''')") + } + } + + func testTrigramTokenizerCaseInsensitiveRemovingDiacritics() throws { + #if 
GRDBCUSTOMSQLITE || GRDBCIPHER + guard sqlite3_libversion_number() >= 3045000 else { + throw XCTSkip("FTS5 trigram tokenizer remove_diacritics is not available") + } + + let dbQueue = try makeDatabaseQueue() + try dbQueue.inDatabase { db in + try db.create(virtualTable: "documents", using: FTS5()) { t in + t.tokenizer = .trigram(matching: .caseInsensitiveRemovingDiacritics) + t.column("content") + } + assertDidExecute(sql: "CREATE VIRTUAL TABLE \"documents\" USING fts5(content, tokenize='''trigram'' ''remove_diacritics'' ''1''')") + } + #else + throw XCTSkip("FTS5 trigram tokenizer remove_diacritics is not available") + #endif + } + func testColumns() throws { let dbQueue = try makeDatabaseQueue() try dbQueue.inDatabase { db in diff --git a/Tests/GRDBTests/FTS5TokenizerTests.swift b/Tests/GRDBTests/FTS5TokenizerTests.swift index d7efff5c07..76790f41c7 100644 --- a/Tests/GRDBTests/FTS5TokenizerTests.swift +++ b/Tests/GRDBTests/FTS5TokenizerTests.swift @@ -286,6 +286,121 @@ class FTS5TokenizerTests: GRDBTestCase { } } + func testTrigramTokenizer() throws { + #if GRDBCUSTOMSQLITE || GRDBCIPHER + guard sqlite3_libversion_number() >= 3034000 else { + throw XCTSkip("FTS5 trigram tokenizer is not available") + } + #else + guard #available(iOS 15, macOS 12, tvOS 15, watchOS 8, *) else { + throw XCTSkip("FTS5 trigram tokenizer is not available") + } + #endif + + let dbQueue = try makeDatabaseQueue() + try dbQueue.inDatabase { db in + try db.create(virtualTable: "documents", using: FTS5()) { t in + t.tokenizer = .trigram() + t.column("content") + } + + // simple match + XCTAssertTrue(match(db, "abcDÉF", "abcDÉF")) + + // English stemming + XCTAssertFalse(match(db, "database", "databases")) + + // diacritics in latin characters + XCTAssertFalse(match(db, "eéÉ", "Èèe")) + + // unicode case + XCTAssertTrue(match(db, "jérôme", "JÉRÔME")) + + // substring match + XCTAssertTrue(match(db, "sequence", "que")) + } + } + + func testTrigramTokenizerCaseSensitive() throws { + #if 
GRDBCUSTOMSQLITE || GRDBCIPHER + guard sqlite3_libversion_number() >= 3034000 else { + throw XCTSkip("FTS5 trigram tokenizer is not available") + } + #else + guard #available(iOS 15, macOS 12, tvOS 15, watchOS 8, *) else { + throw XCTSkip("FTS5 trigram tokenizer is not available") + } + #endif + + let dbQueue = try makeDatabaseQueue() + try dbQueue.inDatabase { db in + try db.create(virtualTable: "documents", using: FTS5()) { t in + t.tokenizer = .trigram(matching: .caseSensitive) + t.column("content") + } + + // simple match + XCTAssertTrue(match(db, "abcDÉF", "abcDÉF")) + + // English stemming + XCTAssertFalse(match(db, "database", "databases")) + + // diacritics in latin characters + XCTAssertFalse(match(db, "eéÉ", "Èèe")) + + // unicode case + XCTAssertFalse(match(db, "jérôme", "JÉRÔME")) + + // substring match + XCTAssertTrue(match(db, "sequence", "que")) + + // substring match with too short query + XCTAssertFalse(match(db, "sequence", "qu")) + } + } + + func testTrigramTokenizerDiacriticsRemove() throws { + #if GRDBCUSTOMSQLITE || GRDBCIPHER + guard sqlite3_libversion_number() >= 3045000 else { + throw XCTSkip("FTS5 trigram tokenizer remove_diacritics is not available") + } + + let dbQueue = try makeDatabaseQueue() + try dbQueue.inDatabase { db in + do { + try db.create(virtualTable: "documents", using: FTS5()) { t in + t.tokenizer = .trigram(matching: .caseInsensitiveRemovingDiacritics) + t.column("content") + } + } catch { + print(error) + throw error + } + + + // simple match + XCTAssertTrue(match(db, "abcDÉF", "abcDÉF")) + + // English stemming + XCTAssertFalse(match(db, "database", "databases")) + + // diacritics in latin characters + XCTAssertTrue(match(db, "eéÉ", "Èèe")) + + // unicode case + XCTAssertTrue(match(db, "jérôme", "JÉRÔME")) + + // substring match + XCTAssertTrue(match(db, "sequence", "que")) + + // substring match with too short query + XCTAssertFalse(match(db, "sequence", "qu")) + } + #else + throw XCTSkip("FTS5 trigram tokenizer 
remove_diacritics is not available") + #endif + } + func testTokenize() throws { try makeDatabaseQueue().inDatabase { db in let ascii = try db.makeTokenizer(.ascii()) @@ -366,6 +481,47 @@ class FTS5TokenizerTests: GRDBTestCase { } } + func testTokenizeTrigram() throws { + #if GRDBCUSTOMSQLITE || GRDBCIPHER + guard sqlite3_libversion_number() >= 3034000 else { + throw XCTSkip("FTS5 trigram tokenizer is not available") + } + #else + guard #available(iOS 15, macOS 12, tvOS 15, watchOS 8, *) else { + throw XCTSkip("FTS5 trigram tokenizer is not available") + } + #endif + + try makeDatabaseQueue().inDatabase { db in + let trigram = try db.makeTokenizer(.trigram()) + + // Empty query + try XCTAssertEqual(trigram.tokenize(query: "").map(\.token), []) + try XCTAssertEqual(trigram.tokenize(query: "?!").map(\.token), []) + + // Token queries + try XCTAssertEqual(trigram.tokenize(query: "Moby").map(\.token), ["mob", "oby"]) + try XCTAssertEqual(trigram.tokenize(query: "écarlates").map(\.token), ["éca", "car", "arl", "rla", "lat", "ate", "tes"]) + try XCTAssertEqual(trigram.tokenize(query: "fooéı👨👨🏿🇫🇷🇨🇮").map(\.token), ["foo", "ooé", "oéı", "éı👨", "ı👨👨", "👨👨🏿", "👨🏿🇫", "\u{0001F3FF}🇫🇷", "🇫🇷🇨", "🇷🇨🇮"]) + try XCTAssertEqual(trigram.tokenize(query: "SQLite database").map(\.token), ["sql", "qli", "lit", "ite", "te ", "e d", " da", "dat", "ata", "tab", "aba", "bas", "ase"]) + try XCTAssertEqual(trigram.tokenize(query: "Édouard Manet").map(\.token), ["édo", "dou", "oua", "uar", "ard", "rd ", "d m", " ma", "man", "ane", "net"]) + + // Prefix queries + try XCTAssertEqual(trigram.tokenize(query: "*").map(\.token), []) + try XCTAssertEqual(trigram.tokenize(query: "Robin*").map(\.token), ["rob", "obi", "bin", "in*"]) + + // Phrase queries + try XCTAssertEqual(trigram.tokenize(query: "\"foulent muscles\"").map(\.token), ["\"fo", "fou", "oul", "ule", "len", "ent", "nt ", "t m", " mu", "mus", "usc", "scl", "cle", "les", "es\""]) + try XCTAssertEqual(trigram.tokenize(query: "\"Kim Stan* 
Robin*\"").map(\.token), ["\"ki", "kim", "im ", "m s", " st", "sta", "tan", "an*", "n* ", "* r", " ro", "rob", "obi", "bin", "in*", "n*\""]) + + // Logical queries + try XCTAssertEqual(trigram.tokenize(query: "years AND months").map(\.token), ["yea", "ear", "ars", "rs ", "s a", " an", "and", "nd ", "d m", " mo", "mon", "ont", "nth", "ths"]) + + // column queries + try XCTAssertEqual(trigram.tokenize(query: "title:brest").map(\.token), ["tit", "itl", "tle", "le:", "e:b", ":br", "bre", "res", "est"]) + } + } + func testTokenize_Unicode61TokenizerCategories() throws { // Prevent SQLCipher failures. // Categories are not mentioned in the SQLite release notes. From 3df8834e8520c792e0388aa002922e1f7d3a6592 Mon Sep 17 00:00:00 2001 From: Janosch Hildebrand Date: Sun, 27 Oct 2024 16:05:37 +0100 Subject: [PATCH 2/3] Add documentation suggestion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Gwendal Roué --- Documentation/FullTextSearch.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/FullTextSearch.md b/Documentation/FullTextSearch.md index 2a0459a3a2..6838503ca4 100644 --- a/Documentation/FullTextSearch.md +++ b/Documentation/FullTextSearch.md @@ -474,7 +474,7 @@ See below some examples of matches: Unlike the other tokenizers, it provides general substring matching, matching "Sequence" with "que" by splitting character sequences into overlapping 3 character tokens (trigrams). - It can also act as an index for GLOB and LIKE queries depending on the configuration. + It can also act as an index for GLOB and LIKE queries depending on the configuration (see the [SQLite Documentation](https://www.sqlite.org/fts5.html#the_trigram_tokenizer)). See [SQLite tokenizers](https://www.sqlite.org/fts5.html#tokenizers) for more information, and [custom FTS5 tokenizers](FTS5Tokenizers.md) in order to add your own tokenizers. 
From a303308f195941c403a7e18476d74c80acaa4652 Mon Sep 17 00:00:00 2001 From: Janosch Hildebrand Date: Thu, 31 Oct 2024 14:25:54 +0100 Subject: [PATCH 3/3] Split TrigramTokenizerMatching into TrigramCaseSensitiveOption and TrigramDiacriticsOption --- GRDB/FTS/FTS5.swift | 106 +++++++++++++------- GRDB/FTS/FTS5TokenizerDescriptor.swift | 82 ++++++++++----- Tests/GRDBTests/FTS5TableBuilderTests.swift | 31 ++++-- Tests/GRDBTests/FTS5TokenizerTests.swift | 4 +- 4 files changed, 158 insertions(+), 65 deletions(-) diff --git a/GRDB/FTS/FTS5.swift b/GRDB/FTS/FTS5.swift index ef839736a0..1297fca281 100644 --- a/GRDB/FTS/FTS5.swift +++ b/GRDB/FTS/FTS5.swift @@ -75,44 +75,82 @@ public struct FTS5 { } #if GRDBCUSTOMSQLITE || GRDBCIPHER - /// Options for trigram tokenizer character matching. Matches the raw - /// "case_sensitive" and "remove_diacritics" tokenizer arguments. - /// - /// Related SQLite documentation: - public enum TrigramTokenizerMatching: Sendable { - /// Case insensitive matching without removing diacritics. This - /// option matches the raw "case_sensitive=0 remove_diacritics=0" - /// tokenizer argument. - case caseInsensitive - /// Case insensitive matching that removes diacritics before - /// matching. This option matches the raw - /// "case_sensitive=0 remove_diacritics=1" tokenizer argument. - case caseInsensitiveRemovingDiacritics - /// Case sensitive matching. Diacritics are not removed when - /// performing case sensitive matching. This option matches the raw - /// "case_sensitive=1 remove_diacritics=0" tokenizer argument. - case caseSensitive + /// Case sensitivity options for the Trigram FTS5 tokenizer. + /// Matches the raw "case_sensitive" tokenizer argument. 
+ /// + /// Related SQLite documentation: + public struct TrigramCaseSensitiveOption: RawRepresentable, Sendable, ExpressibleByBooleanLiteral { + public var rawValue: Int + + public init(rawValue: Int) { + self.rawValue = rawValue + } + + /// When true, matches the "case_sensitive=1" trigram tokenizer argument. + /// When false, it is "case_sensitive=0". + public init(booleanLiteral value: Bool) { + self = value ? Self(rawValue: 1) : Self(rawValue: 0) + } + } + + /// Diacritics options for the Trigram FTS5 tokenizer. + /// Matches the raw "remove_diacritics" tokenizer argument. + /// + /// Related SQLite documentation: + public struct TrigramDiacriticsOption: RawRepresentable, Sendable { + public var rawValue: Int + + public init(rawValue: Int) { + self.rawValue = rawValue + } + + /// Do not remove diacritics. This option matches the raw + /// "remove_diacritics=0" trigram tokenizer argument. + public static let keep = Self(rawValue: 0) + + /// Remove diacritics. This option matches the raw + /// "remove_diacritics=1" trigram tokenizer argument. + public static let remove = Self(rawValue: 1) } #else - /// Options for trigram tokenizer character matching. Matches the raw - /// "case_sensitive" and "remove_diacritics" tokenizer arguments. + /// Case sensitivity options for the Trigram FTS5 tokenizer. + /// Matches the raw "case_sensitive" tokenizer argument. /// - /// Related SQLite documentation: + /// Related SQLite documentation: @available(iOS 15, macOS 12, tvOS 15, watchOS 8, *) // SQLite 3.35.0+ (3.34 actually) - public enum TrigramTokenizerMatching: Sendable { - /// Case insensitive matching without removing diacritics. This - /// option matches the raw "case_sensitive=0 remove_diacritics=0" - /// tokenizer argument. - case caseInsensitive - /// Case insensitive matching that removes diacritics before - /// matching. This option matches the raw - /// "case_sensitive=0 remove_diacritics=1" tokenizer argument. 
- @available(*, unavailable, message: "Requires a future OS release that includes SQLite >=3.45") - case caseInsensitiveRemovingDiacritics - /// Case sensitive matching. Diacritics are not removed when - /// performing case sensitive matching. This option matches the raw - /// "case_sensitive=1 remove_diacritics=0" tokenizer argument. - case caseSensitive + public struct TrigramCaseSensitiveOption: RawRepresentable, Sendable, ExpressibleByBooleanLiteral { + public var rawValue: Int + + public init(rawValue: Int) { + self.rawValue = rawValue + } + + /// When true, matches the "case_sensitive=1" trigram tokenizer argument. + /// When false, it is "case_sensitive=0". + public init(booleanLiteral value: Bool) { + self = value ? Self(rawValue: 1) : Self(rawValue: 0) + } + } + + /// Diacritics options for the Trigram FTS5 tokenizer. + /// Matches the raw "remove_diacritics" tokenizer argument. + /// + /// Related SQLite documentation: + @available(*, unavailable, message: "Requires a future OS release that includes SQLite >=3.45") + public struct TrigramDiacriticsOption: RawRepresentable, Sendable { + public var rawValue: Int + + public init(rawValue: Int) { + self.rawValue = rawValue + } + + /// Do not remove diacritics. This option matches the raw + /// "remove_diacritics=0" trigram tokenizer argument. + public static let keep = Self(rawValue: 0) + + /// Remove diacritics. This option matches the raw + /// "remove_diacritics=1" trigram tokenizer argument. + public static let remove = Self(rawValue: 1) } #endif diff --git a/GRDB/FTS/FTS5TokenizerDescriptor.swift b/GRDB/FTS/FTS5TokenizerDescriptor.swift index 99a4927fb8..7a783988a3 100644 --- a/GRDB/FTS/FTS5TokenizerDescriptor.swift +++ b/GRDB/FTS/FTS5TokenizerDescriptor.swift @@ -210,7 +210,7 @@ public struct FTS5TokenizerDescriptor: Sendable { } return FTS5TokenizerDescriptor(components: components) } - + #if GRDBCUSTOMSQLITE || GRDBCIPHER /// The "trigram" tokenizer. 
/// @@ -225,21 +225,25 @@ public struct FTS5TokenizerDescriptor: Sendable { /// Related SQLite documentation: /// /// - parameters: - /// - matching: By default SQLite will perform case insensitive - /// matching and not remove diacritics before matching. + /// - caseSensitive: By default SQLite will perform case insensitive + /// matching. + /// - removeDiacritics: By default SQLite will not remove diacritics + /// before matching. public static func trigram( - matching: FTS5.TrigramTokenizerMatching = .caseInsensitive + caseSensitive: FTS5.TrigramCaseSensitiveOption? = nil, + removeDiacritics: FTS5.TrigramDiacriticsOption? = nil ) -> FTS5TokenizerDescriptor { var components = ["trigram"] - switch matching { - case .caseInsensitive: - break - case .caseInsensitiveRemovingDiacritics: - components.append(contentsOf: ["remove_diacritics", "1"]) - case .caseSensitive: - components.append(contentsOf: ["case_sensitive", "1"]) + if let caseSensitive { + components.append(contentsOf: [ + "case_sensitive", String(caseSensitive.rawValue) + ]) + } + if let removeDiacritics { + components.append(contentsOf: [ + "remove_diacritics", String(removeDiacritics.rawValue) + ]) } - return FTS5TokenizerDescriptor(components: components) } #else @@ -256,22 +260,54 @@ public struct FTS5TokenizerDescriptor: Sendable { /// Related SQLite documentation: /// /// - parameters: - /// - matching: By default SQLite will perform case insensitive - /// matching and not remove diacritics before matching. + /// - caseSensitive: By default SQLite will perform case insensitive + /// matching. @available(iOS 15, macOS 12, tvOS 15, watchOS 8, *) // SQLite 3.35.0+ (3.34 actually) public static func trigram( - matching: FTS5.TrigramTokenizerMatching = .caseInsensitive + caseSensitive: FTS5.TrigramCaseSensitiveOption? 
= nil ) -> FTS5TokenizerDescriptor { var components = ["trigram"] - switch matching { - case .caseInsensitive: - break - case .caseInsensitiveRemovingDiacritics: - components.append(contentsOf: ["remove_diacritics", "1"]) - case .caseSensitive: - components.append(contentsOf: ["case_sensitive", "1"]) + if let caseSensitive { + components.append(contentsOf: [ + "case_sensitive", String(caseSensitive.rawValue) + ]) + } + return FTS5TokenizerDescriptor(components: components) + } + + /// The "trigram" tokenizer. + /// + /// For example: + /// + /// ```swift + /// try db.create(virtualTable: "book", using: FTS5()) { t in + /// t.tokenizer = .trigram() + /// } + /// ``` + /// + /// Related SQLite documentation: + /// + /// - parameters: + /// - caseSensitive: By default SQLite will perform case insensitive + /// matching. + /// - removeDiacritics: By default SQLite will not remove diacritics + /// before matching. + @available(*, unavailable, message: "Requires a future OS release that includes SQLite >=3.45") + public static func trigram( + caseSensitive: FTS5.TrigramCaseSensitiveOption? = nil, + removeDiacritics: FTS5.TrigramDiacriticsOption? 
= nil + ) -> FTS5TokenizerDescriptor { + var components = ["trigram"] + if let caseSensitive { + components.append(contentsOf: [ + "case_sensitive", String(caseSensitive.rawValue) + ]) + } + if let removeDiacritics { + components.append(contentsOf: [ + "remove_diacritics", String(removeDiacritics.rawValue) + ]) } - return FTS5TokenizerDescriptor(components: components) } #endif diff --git a/Tests/GRDBTests/FTS5TableBuilderTests.swift b/Tests/GRDBTests/FTS5TableBuilderTests.swift index bb435a6188..2415bf6626 100644 --- a/Tests/GRDBTests/FTS5TableBuilderTests.swift +++ b/Tests/GRDBTests/FTS5TableBuilderTests.swift @@ -202,10 +202,10 @@ class FTS5TableBuilderTests: GRDBTestCase { let dbQueue = try makeDatabaseQueue() try dbQueue.inDatabase { db in try db.create(virtualTable: "documents", using: FTS5()) { t in - t.tokenizer = .trigram(matching: .caseInsensitive) + t.tokenizer = .trigram(caseSensitive: false) t.column("content") } - assertDidExecute(sql: "CREATE VIRTUAL TABLE \"documents\" USING fts5(content, tokenize='''trigram''')") + assertDidExecute(sql: "CREATE VIRTUAL TABLE \"documents\" USING fts5(content, tokenize='''trigram'' ''case_sensitive'' ''0''')") } } @@ -223,14 +223,33 @@ class FTS5TableBuilderTests: GRDBTestCase { let dbQueue = try makeDatabaseQueue() try dbQueue.inDatabase { db in try db.create(virtualTable: "documents", using: FTS5()) { t in - t.tokenizer = .trigram(matching: .caseSensitive) + t.tokenizer = .trigram(caseSensitive: true) t.column("content") } assertDidExecute(sql: "CREATE VIRTUAL TABLE \"documents\" USING fts5(content, tokenize='''trigram'' ''case_sensitive'' ''1''')") } } - - func testTrigramTokenizerCaseInsensitiveRemovingDiacritics() throws { + + func testTrigramTokenizerWithoutRemovingDiacritics() throws { +#if GRDBCUSTOMSQLITE || GRDBCIPHER + guard sqlite3_libversion_number() >= 3045000 else { + throw XCTSkip("FTS5 trigram tokenizer remove_diacritics is not available") + } + + let dbQueue = try makeDatabaseQueue() + try 
dbQueue.inDatabase { db in + try db.create(virtualTable: "documents", using: FTS5()) { t in + t.tokenizer = .trigram(removeDiacritics: .keep) + t.column("content") + } + assertDidExecute(sql: "CREATE VIRTUAL TABLE \"documents\" USING fts5(content, tokenize='''trigram'' ''remove_diacritics'' ''0''')") + } +#else + throw XCTSkip("FTS5 trigram tokenizer remove_diacritics is not available") +#endif + } + + func testTrigramTokenizerRemoveDiacritics() throws { #if GRDBCUSTOMSQLITE || GRDBCIPHER guard sqlite3_libversion_number() >= 3045000 else { throw XCTSkip("FTS5 trigram tokenizer remove_diacritics is not available") @@ -239,7 +258,7 @@ class FTS5TableBuilderTests: GRDBTestCase { let dbQueue = try makeDatabaseQueue() try dbQueue.inDatabase { db in try db.create(virtualTable: "documents", using: FTS5()) { t in - t.tokenizer = .trigram(matching: .caseInsensitiveRemovingDiacritics) + t.tokenizer = .trigram(removeDiacritics: .remove) t.column("content") } assertDidExecute(sql: "CREATE VIRTUAL TABLE \"documents\" USING fts5(content, tokenize='''trigram'' ''remove_diacritics'' ''1''')") diff --git a/Tests/GRDBTests/FTS5TokenizerTests.swift b/Tests/GRDBTests/FTS5TokenizerTests.swift index 76790f41c7..68d9b7db92 100644 --- a/Tests/GRDBTests/FTS5TokenizerTests.swift +++ b/Tests/GRDBTests/FTS5TokenizerTests.swift @@ -335,7 +335,7 @@ class FTS5TokenizerTests: GRDBTestCase { let dbQueue = try makeDatabaseQueue() try dbQueue.inDatabase { db in try db.create(virtualTable: "documents", using: FTS5()) { t in - t.tokenizer = .trigram(matching: .caseSensitive) + t.tokenizer = .trigram(caseSensitive: true) t.column("content") } @@ -369,7 +369,7 @@ class FTS5TokenizerTests: GRDBTestCase { try dbQueue.inDatabase { db in do { try db.create(virtualTable: "documents", using: FTS5()) { t in - t.tokenizer = .trigram(matching: .caseInsensitiveRemovingDiacritics) + t.tokenizer = .trigram(removeDiacritics: .remove) t.column("content") } } catch {