From c798a6c210a30bb7b91f502dd4b5c77abcf583e5 Mon Sep 17 00:00:00 2001 From: Jai Radhakrishnan Date: Thu, 16 Oct 2025 13:08:25 -0700 Subject: [PATCH] [ENH] Add Schema to js client --- .../packages/chromadb/src/chroma-client.ts | 114 +- .../chromadb/src/collection-configuration.ts | 4 +- .../packages/chromadb/src/collection.ts | 213 ++- .../chromadb/src/embedding-function.ts | 19 +- clients/new-js/packages/chromadb/src/index.ts | 1 + .../new-js/packages/chromadb/src/schema.ts | 977 ++++++++++++ .../packages/chromadb/test/schema.test.ts | 1412 +++++++++++++++++ 7 files changed, 2680 insertions(+), 60 deletions(-) create mode 100644 clients/new-js/packages/chromadb/src/schema.ts create mode 100644 clients/new-js/packages/chromadb/test/schema.test.ts diff --git a/clients/new-js/packages/chromadb/src/chroma-client.ts b/clients/new-js/packages/chromadb/src/chroma-client.ts index 7f771ce726d..8a81fde5ba7 100644 --- a/clients/new-js/packages/chromadb/src/chroma-client.ts +++ b/clients/new-js/packages/chromadb/src/chroma-client.ts @@ -22,6 +22,23 @@ import { CreateCollectionConfiguration, processCreateCollectionConfig, } from "./collection-configuration"; +import { EMBEDDING_KEY, Schema } from "./schema"; + +const resolveSchemaEmbeddingFunction = ( + schema: Schema | undefined, +): EmbeddingFunction | undefined => { + if (!schema) { + return undefined; + } + + const embeddingOverride = + schema.keys[EMBEDDING_KEY]?.floatList?.vectorIndex?.config.embeddingFunction ?? undefined; + if (embeddingOverride) { + return embeddingOverride; + } + + return schema.defaults.floatList?.vectorIndex?.config.embeddingFunction ?? undefined; +}; /** * Configuration options for the ChromaClient. 
@@ -217,22 +234,27 @@ export class ChromaClient { }); return Promise.all( - data.map( - async (collection) => - new CollectionImpl({ - chromaClient: this, - apiClient: this.apiClient, - name: collection.name, - id: collection.id, - embeddingFunction: await getEmbeddingFunction( - collection.name, - collection.configuration_json.embedding_function ?? undefined, - ), - configuration: collection.configuration_json, - metadata: - deserializeMetadata(collection.metadata ?? undefined) ?? undefined, - }), - ), + data.map(async (collection) => { + const schema = Schema.deserializeFromJSON(collection.schema ?? undefined); + const schemaEmbeddingFunction = resolveSchemaEmbeddingFunction(schema); + const resolvedEmbeddingFunction = + getEmbeddingFunction( + collection.name, + collection.configuration_json.embedding_function ?? undefined, + ) ?? schemaEmbeddingFunction; + + return new CollectionImpl({ + chromaClient: this, + apiClient: this.apiClient, + name: collection.name, + id: collection.id, + embeddingFunction: resolvedEmbeddingFunction, + configuration: collection.configuration_json, + metadata: + deserializeMetadata(collection.metadata ?? undefined) ?? undefined, + schema, + }); + }), ); } @@ -264,11 +286,13 @@ export class ChromaClient { configuration, metadata, embeddingFunction, + schema, }: { name: string; configuration?: CreateCollectionConfiguration; metadata?: CollectionMetadata; embeddingFunction?: EmbeddingFunction | null; + schema?: Schema; }): Promise { const collectionConfig = await processCreateCollectionConfig({ configuration, @@ -284,22 +308,29 @@ export class ChromaClient { configuration: collectionConfig, metadata: serializeMetadata(metadata), get_or_create: false, + schema: schema ? schema.serializeToJSON() : undefined, }, }); + const serverSchema = Schema.deserializeFromJSON(data.schema ?? undefined); + const schemaEmbeddingFunction = resolveSchemaEmbeddingFunction(serverSchema); + const resolvedEmbeddingFunction = + embeddingFunction ?? 
+ getEmbeddingFunction( + data.name, + data.configuration_json.embedding_function ?? undefined, + ) ?? + schemaEmbeddingFunction; + return new CollectionImpl({ chromaClient: this, apiClient: this.apiClient, name, configuration: data.configuration_json, metadata: deserializeMetadata(data.metadata ?? undefined) ?? undefined, - embeddingFunction: - embeddingFunction ?? - (await getEmbeddingFunction( - data.name, - data.configuration_json.embedding_function ?? undefined, - )), + embeddingFunction: resolvedEmbeddingFunction, id: data.id, + schema: serverSchema, }); } @@ -323,19 +354,25 @@ export class ChromaClient { path: { ...(await this._path()), collection_id: name }, }); + const schema = Schema.deserializeFromJSON(data.schema ?? undefined); + const schemaEmbeddingFunction = resolveSchemaEmbeddingFunction(schema); + const resolvedEmbeddingFunction = + embeddingFunction ?? + getEmbeddingFunction( + data.name, + data.configuration_json.embedding_function ?? undefined, + ) ?? + schemaEmbeddingFunction; + return new CollectionImpl({ chromaClient: this, apiClient: this.apiClient, name, configuration: data.configuration_json, metadata: deserializeMetadata(data.metadata ?? undefined) ?? undefined, - embeddingFunction: embeddingFunction - ? embeddingFunction - : await getEmbeddingFunction( - data.name, - data.configuration_json.embedding_function ?? undefined, - ), + embeddingFunction: resolvedEmbeddingFunction, id: data.id, + schema, }); } @@ -382,11 +419,13 @@ export class ChromaClient { configuration, metadata, embeddingFunction, + schema, }: { name: string; configuration?: CreateCollectionConfiguration; metadata?: CollectionMetadata; embeddingFunction?: EmbeddingFunction | null; + schema?: Schema; }): Promise { const collectionConfig = await processCreateCollectionConfig({ configuration, @@ -402,22 +441,29 @@ export class ChromaClient { configuration: collectionConfig, metadata: serializeMetadata(metadata), get_or_create: true, + schema: schema ? 
schema.serializeToJSON() : undefined, }, }); + const serverSchema = Schema.deserializeFromJSON(data.schema ?? undefined); + const schemaEmbeddingFunction = resolveSchemaEmbeddingFunction(serverSchema); + const resolvedEmbeddingFunction = + embeddingFunction ?? + getEmbeddingFunction( + name, + data.configuration_json.embedding_function ?? undefined, + ) ?? + schemaEmbeddingFunction; + return new CollectionImpl({ chromaClient: this, apiClient: this.apiClient, name, configuration: data.configuration_json, metadata: deserializeMetadata(data.metadata ?? undefined) ?? undefined, - embeddingFunction: - embeddingFunction ?? - (await getEmbeddingFunction( - name, - data.configuration_json.embedding_function ?? undefined, - )), + embeddingFunction: resolvedEmbeddingFunction, id: data.id, + schema: serverSchema, }); } diff --git a/clients/new-js/packages/chromadb/src/collection-configuration.ts b/clients/new-js/packages/chromadb/src/collection-configuration.ts index a32c9ac0bcc..c9a3bb1cfad 100644 --- a/clients/new-js/packages/chromadb/src/collection-configuration.ts +++ b/clients/new-js/packages/chromadb/src/collection-configuration.ts @@ -164,10 +164,10 @@ export const processUpdateCollectionConfig = async ({ const embeddingFunction = currentEmbeddingFunction || - (await getEmbeddingFunction( + getEmbeddingFunction( collectionName, currentConfiguration.embeddingFunction ?? 
undefined, - )); + ); const newEmbeddingFunction = newConfiguration.embeddingFunction; diff --git a/clients/new-js/packages/chromadb/src/collection.ts b/clients/new-js/packages/chromadb/src/collection.ts index 807a2180940..42342c80294 100644 --- a/clients/new-js/packages/chromadb/src/collection.ts +++ b/clients/new-js/packages/chromadb/src/collection.ts @@ -1,5 +1,5 @@ import { ChromaClient } from "./chroma-client"; -import { EmbeddingFunction } from "./embedding-function"; +import { EmbeddingFunction, SparseEmbeddingFunction } from "./embedding-function"; import { BaseRecordSet, CollectionMetadata, @@ -13,7 +13,7 @@ import { Where, WhereDocument, } from "./types"; -import { Include } from "./api"; +import { Include, SparseVector } from "./api"; import { DefaultService as Api } from "./api"; import { validateRecordSetLengthConsistency, @@ -40,6 +40,8 @@ import { UpdateCollectionConfiguration, } from "./collection-configuration"; import { SearchLike, SearchResult, toSearch } from "./execution/expression"; +import { Schema, EMBEDDING_KEY, DOCUMENT_KEY } from "./schema"; +import type { SparseVectorIndexConfig } from "./schema"; /** * Interface for collection operations using collection ID. @@ -56,6 +58,8 @@ export interface Collection { configuration: CollectionConfiguration; /** Optional embedding function. Must match the one used to create the collection. 
*/ embeddingFunction?: EmbeddingFunction; + /** Collection schema describing index configuration */ + schema?: Schema; /** Gets the total number of records in the collection */ count(): Promise; /** @@ -212,6 +216,8 @@ export interface CollectionArgs { configuration: CollectionConfiguration; /** Optional collection metadata */ metadata?: CollectionMetadata; + /** Optional schema returned by the server */ + schema?: Schema; } /** @@ -226,6 +232,7 @@ export class CollectionImpl implements Collection { private _metadata: CollectionMetadata | undefined; private _configuration: CollectionConfiguration; protected _embeddingFunction: EmbeddingFunction | undefined; + protected _schema: Schema | undefined; /** * Creates a new CollectionAPIImpl instance. @@ -239,6 +246,7 @@ export class CollectionImpl implements Collection { metadata, configuration, embeddingFunction, + schema, }: CollectionArgs) { this.chromaClient = chromaClient; this.apiClient = apiClient; @@ -247,6 +255,7 @@ export class CollectionImpl implements Collection { this._metadata = metadata; this._configuration = configuration; this._embeddingFunction = embeddingFunction; + this._schema = schema; } public get name(): string { @@ -283,6 +292,14 @@ export class CollectionImpl implements Collection { this._embeddingFunction = embeddingFunction; } + public get schema(): Schema | undefined { + return this._schema; + } + + protected set schema(schema: Schema | undefined) { + this._schema = schema; + } + protected async path(): Promise<{ tenant: string; database: string; @@ -296,17 +313,177 @@ export class CollectionImpl implements Collection { } private async embed(inputs: string[], isQuery: boolean): Promise { - if (!this._embeddingFunction) { + const embeddingFunction = + this._embeddingFunction ?? 
this.getSchemaEmbeddingFunction(); + + if (!embeddingFunction) { throw new ChromaValueError( "Embedding function must be defined for operations requiring embeddings.", ); } - if (this._embeddingFunction.generateForQueries && isQuery) { - return await this._embeddingFunction.generateForQueries(inputs); - } else { - return await this._embeddingFunction.generate(inputs); - } + if (isQuery && embeddingFunction.generateForQueries) { + return await embeddingFunction.generateForQueries(inputs); + } + + return await embeddingFunction.generate(inputs); + } + + private async sparseEmbed(sparseEmbeddingFunction: SparseEmbeddingFunction, inputs: string[], isQuery: boolean): Promise { + if (isQuery && sparseEmbeddingFunction.generateForQueries) { + return await sparseEmbeddingFunction.generateForQueries(inputs); + } + + return await sparseEmbeddingFunction.generate(inputs); + } + + private getSparseEmbeddingTargets(): Record { + const schema = this._schema; + if (!schema) return {}; + + const targets: Record = {}; + for (const [key, valueTypes] of Object.entries(schema.keys)) { + const sparseVector = valueTypes.sparseVector; + const sparseIndex = sparseVector?.sparseVectorIndex; + if (!sparseIndex?.enabled) continue; + + const config = sparseIndex.config; + if (!config.embeddingFunction || !config.sourceKey) continue; + + targets[key] = config; + } + + return targets; + } + + private async applySparseEmbeddingsToMetadatas( + metadatas?: Metadata[], + documents?: string[], + ): Promise { + const sparseTargets = this.getSparseEmbeddingTargets(); + if (Object.keys(sparseTargets).length === 0) { + return metadatas; + } + + // If no metadatas provided, create empty objects based on documents length + if (!metadatas) { + if (!documents) { + return undefined; + } + metadatas = Array(documents.length).fill(null).map(() => ({})); + } + + // Create copies, converting null to empty object + const updatedMetadatas = metadatas.map((metadata) => + metadata !== null && metadata !== undefined 
? { ...metadata } : {} + ); + const documentsList = documents ? [...documents] : undefined; + + for (const [targetKey, config] of Object.entries(sparseTargets)) { + const sourceKey = config.sourceKey; + const embeddingFunction = config.embeddingFunction; + if (!sourceKey || !embeddingFunction) { + continue; + } + + const inputs: string[] = []; + const positions: number[] = []; + + // Handle special case: source_key is "#document" + if (sourceKey === DOCUMENT_KEY) { + if (!documentsList) { + continue; + } + + // Collect documents that need embedding + updatedMetadatas.forEach((metadata, index) => { + // Skip if target already exists in metadata + if (targetKey in metadata) { + return; + } + + // Get document at this position + if (index < documentsList.length) { + const doc = documentsList[index]; + if (typeof doc === "string") { + inputs.push(doc); + positions.push(index); + } + } + }); + + // Generate embeddings for all collected documents + if (inputs.length === 0) { + continue; + } + + const sparseEmbeddings = await this.sparseEmbed(embeddingFunction, inputs, false); + if (sparseEmbeddings.length !== positions.length) { + throw new ChromaValueError( + "Sparse embedding function returned unexpected number of embeddings.", + ); + } + + positions.forEach((position, idx) => { + updatedMetadatas[position][targetKey] = sparseEmbeddings[idx]; + }); + + continue; // Skip the metadata-based logic below + } + + // Handle normal case: source_key is a metadata field + updatedMetadatas.forEach((metadata, index) => { + if (targetKey in metadata) { + return; + } + + const sourceValue = metadata[sourceKey]; + if (typeof sourceValue !== "string") { + return; + } + + inputs.push(sourceValue); + positions.push(index); + }); + + if (inputs.length === 0) { + continue; + } + + const sparseEmbeddings = await this.sparseEmbed(embeddingFunction, inputs, false); + if (sparseEmbeddings.length !== positions.length) { + throw new ChromaValueError( + "Sparse embedding function returned 
unexpected number of embeddings.", + ); + } + + positions.forEach((position, idx) => { + updatedMetadatas[position][targetKey] = sparseEmbeddings[idx]; + }); + } + + // Convert empty objects back to null + const resultMetadatas = updatedMetadatas.map((metadata) => + Object.keys(metadata).length === 0 ? null : metadata + ); + + return resultMetadatas as Metadata[]; + } + + private getSchemaEmbeddingFunction(): EmbeddingFunction | undefined { + const schema = this._schema; + if (!schema) return undefined; + + const schemaOverride = schema.keys[EMBEDDING_KEY]; + const overrideFunction = schemaOverride?.floatList?.vectorIndex?.config + .embeddingFunction; + if (overrideFunction) { + return overrideFunction; + } + + const defaultFunction = schema.defaults.floatList?.vectorIndex?.config + .embeddingFunction; + return defaultFunction ?? undefined; } private async prepareRecords({ @@ -327,7 +504,15 @@ export class CollectionImpl implements Collection { recordSet.embeddings = await this.embed(recordSet.documents, false); } - const preparedRecordSet: PreparedRecordSet = { ...recordSet }; + const metadatasWithSparse = await this.applySparseEmbeddingsToMetadatas( + recordSet.metadatas, + recordSet.documents, + ); + + const preparedRecordSet: PreparedRecordSet = { + ...recordSet, + metadatas: metadatasWithSparse, + }; const base64Supported = await this.chromaClient.supportsBase64Encoding(); if (base64Supported && recordSet.embeddings) { @@ -587,11 +772,11 @@ export class CollectionImpl implements Collection { const { updateConfiguration, updateEmbeddingFunction } = configuration ? 
await processUpdateCollectionConfig({ - collectionName: this.name, - currentConfiguration: this.configuration, - newConfiguration: configuration, - currentEmbeddingFunction: this.embeddingFunction, - }) + collectionName: this.name, + currentConfiguration: this.configuration, + newConfiguration: configuration, + currentEmbeddingFunction: this.embeddingFunction, + }) : {}; if (updateEmbeddingFunction) { diff --git a/clients/new-js/packages/chromadb/src/embedding-function.ts b/clients/new-js/packages/chromadb/src/embedding-function.ts index 4b34f0d6bce..f0c66485b92 100644 --- a/clients/new-js/packages/chromadb/src/embedding-function.ts +++ b/clients/new-js/packages/chromadb/src/embedding-function.ts @@ -130,6 +130,11 @@ export const knownSparseEmbeddingFunctions = new Map< SparseEmbeddingFunctionClass >(); +/** + * Union type covering both dense and sparse embedding functions. + */ +export type AnyEmbeddingFunction = EmbeddingFunction | SparseEmbeddingFunction; + /** * Registers an embedding function in the global registry. * @param name - Unique name for the embedding function @@ -170,9 +175,9 @@ export const registerSparseEmbeddingFunction = ( * Retrieves and instantiates an embedding function from configuration. * @param collectionName - Name of the collection (for error messages) * @param efConfig - Configuration for the embedding function - * @returns Promise resolving to an EmbeddingFunction instance + * @returns EmbeddingFunction instance or undefined if it cannot be constructed */ -export const getEmbeddingFunction = async ( +export const getEmbeddingFunction = ( collectionName: string, efConfig?: EmbeddingFunctionConfiguration, ) => { @@ -235,23 +240,17 @@ export const getEmbeddingFunction = async ( * Retrieves and instantiates a sparse embedding function from configuration. 
* @param collectionName - Name of the collection (for error messages) * @param efConfig - Configuration for the sparse embedding function - * @returns Promise resolving to a SparseEmbeddingFunction instance + * @returns SparseEmbeddingFunction instance or undefined if it cannot be constructed */ -export const getSparseEmbeddingFunction = async ( +export const getSparseEmbeddingFunction = ( collectionName: string, efConfig?: EmbeddingFunctionConfiguration, ) => { if (!efConfig) { - console.warn( - `No sparse embedding function configuration found for collection ${collectionName}.`, - ); return undefined; } if (efConfig.type === "legacy") { - console.warn( - `No sparse embedding function configuration found for collection ${collectionName}.`, - ); return undefined; } diff --git a/clients/new-js/packages/chromadb/src/index.ts b/clients/new-js/packages/chromadb/src/index.ts index 35df1636fe0..fc6e6570990 100644 --- a/clients/new-js/packages/chromadb/src/index.ts +++ b/clients/new-js/packages/chromadb/src/index.ts @@ -16,3 +16,4 @@ export * from "./cloud-client"; export * from "./errors"; export * from "./collection-configuration"; export * from "./execution"; +export * from "./schema"; diff --git a/clients/new-js/packages/chromadb/src/schema.ts b/clients/new-js/packages/chromadb/src/schema.ts new file mode 100644 index 00000000000..03b0d5cd847 --- /dev/null +++ b/clients/new-js/packages/chromadb/src/schema.ts @@ -0,0 +1,977 @@ +import type { + EmbeddingFunctionConfiguration, + Schema as InternalSchema, + Space, + HnswIndexConfig as ApiHnswIndexConfig, + SpannIndexConfig as ApiSpannIndexConfig, + ValueTypes as ApiValueTypes, +} from "./api"; +import { + AnyEmbeddingFunction, + EmbeddingFunction, + SparseEmbeddingFunction, + getEmbeddingFunction, + getSparseEmbeddingFunction, +} from "./embedding-function"; + +export const DOCUMENT_KEY = "#document"; +export const EMBEDDING_KEY = "#embedding"; + +const STRING_VALUE_NAME = "string"; +const FLOAT_LIST_VALUE_NAME = 
"float_list"; +const SPARSE_VECTOR_VALUE_NAME = "sparse_vector"; +const INT_VALUE_NAME = "int"; +const FLOAT_VALUE_NAME = "float"; +const BOOL_VALUE_NAME = "bool"; + +const FTS_INDEX_NAME = "fts_index"; +const STRING_INVERTED_INDEX_NAME = "string_inverted_index"; +const VECTOR_INDEX_NAME = "vector_index"; +const SPARSE_VECTOR_INDEX_NAME = "sparse_vector_index"; +const INT_INVERTED_INDEX_NAME = "int_inverted_index"; +const FLOAT_INVERTED_INDEX_NAME = "float_inverted_index"; +const BOOL_INVERTED_INDEX_NAME = "bool_inverted_index"; + +export class FtsIndexConfig { + readonly type = "FtsIndexConfig"; +} + +export class StringInvertedIndexConfig { + readonly type = "StringInvertedIndexConfig"; +} + +export class IntInvertedIndexConfig { + readonly type = "IntInvertedIndexConfig"; +} + +export class FloatInvertedIndexConfig { + readonly type = "FloatInvertedIndexConfig"; +} + +export class BoolInvertedIndexConfig { + readonly type = "BoolInvertedIndexConfig"; +} + +export interface VectorIndexConfigOptions { + space?: Space | null; + embeddingFunction?: EmbeddingFunction | null; + sourceKey?: string | null; + hnsw?: ApiHnswIndexConfig | null; + spann?: ApiSpannIndexConfig | null; +} + +export class VectorIndexConfig { + readonly type = "VectorIndexConfig"; + space: Space | null; + embeddingFunction: EmbeddingFunction | null; + sourceKey: string | null; + hnsw: ApiHnswIndexConfig | null; + spann: ApiSpannIndexConfig | null; + + constructor(options: VectorIndexConfigOptions = {}) { + this.space = options.space ?? null; + this.embeddingFunction = options.embeddingFunction ?? null; + this.sourceKey = options.sourceKey ?? null; + this.hnsw = options.hnsw ?? null; + this.spann = options.spann ?? 
null; + } +} + +export interface SparseVectorIndexConfigOptions { + embeddingFunction?: SparseEmbeddingFunction | null; + sourceKey?: string | null; + bm25?: boolean | null; +} + +export class SparseVectorIndexConfig { + readonly type = "SparseVectorIndexConfig"; + embeddingFunction: SparseEmbeddingFunction | null; + sourceKey: string | null; + bm25: boolean | null; + + constructor(options: SparseVectorIndexConfigOptions = {}) { + this.embeddingFunction = options.embeddingFunction ?? null; + this.sourceKey = options.sourceKey ?? null; + this.bm25 = options.bm25 ?? null; + } +} + +export class FtsIndexType { + constructor(public enabled: boolean, public config: FtsIndexConfig) { } +} + +export class StringInvertedIndexType { + constructor(public enabled: boolean, public config: StringInvertedIndexConfig) { } +} + +export class VectorIndexType { + constructor(public enabled: boolean, public config: VectorIndexConfig) { } +} + +export class SparseVectorIndexType { + constructor(public enabled: boolean, public config: SparseVectorIndexConfig) { } +} + +export class IntInvertedIndexType { + constructor(public enabled: boolean, public config: IntInvertedIndexConfig) { } +} + +export class FloatInvertedIndexType { + constructor(public enabled: boolean, public config: FloatInvertedIndexConfig) { } +} + +export class BoolInvertedIndexType { + constructor(public enabled: boolean, public config: BoolInvertedIndexConfig) { } +} + +export class StringValueType { + constructor( + public ftsIndex: FtsIndexType | null = null, + public stringInvertedIndex: StringInvertedIndexType | null = null, + ) { } +} + +export class FloatListValueType { + constructor(public vectorIndex: VectorIndexType | null = null) { } +} + +export class SparseVectorValueType { + constructor(public sparseVectorIndex: SparseVectorIndexType | null = null) { } +} + +export class IntValueType { + constructor(public intInvertedIndex: IntInvertedIndexType | null = null) { } +} + +export class FloatValueType { + 
constructor(public floatInvertedIndex: FloatInvertedIndexType | null = null) { } +} + +export class BoolValueType { + constructor(public boolInvertedIndex: BoolInvertedIndexType | null = null) { } +} + +export class ValueTypes { + string: StringValueType | null = null; + floatList: FloatListValueType | null = null; + sparseVector: SparseVectorValueType | null = null; + intValue: IntValueType | null = null; + floatValue: FloatValueType | null = null; + boolean: BoolValueType | null = null; +} + +export type IndexConfig = + | FtsIndexConfig + | VectorIndexConfig + | SparseVectorIndexConfig + | StringInvertedIndexConfig + | IntInvertedIndexConfig + | FloatInvertedIndexConfig + | BoolInvertedIndexConfig; + +type ValueTypesJson = ApiValueTypes; + +type JsonDict = Record; + +const cloneObject = (value: T): T => { + if (value === null || value === undefined) { + return value; + } + if (typeof value !== "object") { + return value; + } + return Array.isArray(value) + ? (value.map((item) => cloneObject(item)) as T) + : (Object.fromEntries( + Object.entries(value as Record).map(([k, v]) => [ + k, + cloneObject(v), + ]), + ) as T); +}; + +const resolveEmbeddingFunctionName = ( + fn: AnyEmbeddingFunction | null | undefined, +): string | undefined => { + if (!fn) return undefined; + if (typeof (fn as any).name === "function") { + try { + const value = (fn as any).name(); + return typeof value === "string" ? value : undefined; + } catch (_err) { + return undefined; + } + } + if (typeof (fn as any).name === "string") { + return (fn as any).name; + } + return undefined; +}; + +const prepareEmbeddingFunctionConfig = ( + fn: AnyEmbeddingFunction | null | undefined, +): EmbeddingFunctionConfiguration => { + if (!fn) { + return { type: "legacy" }; + } + + const name = resolveEmbeddingFunctionName(fn); + const getConfig = typeof fn.getConfig === "function" ? 
fn.getConfig.bind(fn) : undefined; + const buildFromConfig = (fn.constructor as any)?.buildFromConfig; + + if (!name || !getConfig || typeof buildFromConfig !== "function") { + return { type: "legacy" }; + } + + const config = getConfig(); + if (typeof fn.validateConfig === "function") { + fn.validateConfig(config); + } + + return { + type: "known", + name, + config, + }; +}; + +const ensureValueTypes = ( + valueTypes: ValueTypes | null | undefined, +): ValueTypes => (valueTypes ?? new ValueTypes()); + +const ensureStringValueType = ( + valueTypes: ValueTypes, +): StringValueType => { + if (!valueTypes.string) { + valueTypes.string = new StringValueType(); + } + return valueTypes.string; +}; + +const ensureFloatListValueType = ( + valueTypes: ValueTypes, +): FloatListValueType => { + if (!valueTypes.floatList) { + valueTypes.floatList = new FloatListValueType(); + } + return valueTypes.floatList; +}; + +const ensureSparseVectorValueType = ( + valueTypes: ValueTypes, +): SparseVectorValueType => { + if (!valueTypes.sparseVector) { + valueTypes.sparseVector = new SparseVectorValueType(); + } + return valueTypes.sparseVector; +}; + +const ensureIntValueType = ( + valueTypes: ValueTypes, +): IntValueType => { + if (!valueTypes.intValue) { + valueTypes.intValue = new IntValueType(); + } + return valueTypes.intValue; +}; + +const ensureFloatValueType = ( + valueTypes: ValueTypes, +): FloatValueType => { + if (!valueTypes.floatValue) { + valueTypes.floatValue = new FloatValueType(); + } + return valueTypes.floatValue; +}; + +const ensureBoolValueType = ( + valueTypes: ValueTypes, +): BoolValueType => { + if (!valueTypes.boolean) { + valueTypes.boolean = new BoolValueType(); + } + return valueTypes.boolean; +}; + +export class Schema { + defaults: ValueTypes; + keys: Record; + + constructor() { + this.defaults = new ValueTypes(); + this.keys = {}; + this.initializeDefaults(); + this.initializeKeys(); + } + + createIndex(config?: IndexConfig, key?: string): this { + const 
configProvided = config !== undefined && config !== null; + const keyProvided = key !== undefined && key !== null; + + if (!configProvided && !keyProvided) { + throw new Error( + "Cannot enable all index types globally. Must specify either config or key.", + ); + } + + if (keyProvided && key && (key === EMBEDDING_KEY || key === DOCUMENT_KEY)) { + throw new Error( + `Cannot create index on special key '${key}'. These keys are managed automatically by the system.`, + ); + } + + if (config instanceof VectorIndexConfig) { + if (!keyProvided) { + this.setVectorIndexConfig(config); + return this; + } + throw new Error( + "Vector index cannot be enabled on specific keys. Use createIndex(config=VectorIndexConfig(...)) without specifying a key to configure the vector index globally.", + ); + } + + if (config instanceof FtsIndexConfig) { + if (!keyProvided) { + this.setFtsIndexConfig(config); + return this; + } + throw new Error( + "FTS index cannot be enabled on specific keys. Use createIndex(config=FtsIndexConfig(...)) without specifying a key to configure the FTS index globally.", + ); + } + + if (config instanceof SparseVectorIndexConfig && !keyProvided) { + throw new Error( + "Sparse vector index must be created on a specific key. Please specify a key using: createIndex(config=SparseVectorIndexConfig(...), key='your_key')", + ); + } + + if (!configProvided && keyProvided && key) { + this.enableAllIndexesForKey(key); + return this; + } + + if (configProvided && !keyProvided) { + this.setIndexInDefaults(config as IndexConfig, true); + } else if (configProvided && keyProvided && key) { + this.setIndexForKey(key, config as IndexConfig, true); + } + + return this; + } + + deleteIndex(config?: IndexConfig, key?: string): this { + const configProvided = config !== undefined && config !== null; + const keyProvided = key !== undefined && key !== null; + + if (!configProvided && !keyProvided) { + throw new Error("Cannot disable all indexes. 
Must specify either config or key."); + } + + if (keyProvided && key && (key === EMBEDDING_KEY || key === DOCUMENT_KEY)) { + throw new Error( + `Cannot delete index on special key '${key}'. These keys are managed automatically by the system.`, + ); + } + + if (config instanceof VectorIndexConfig) { + throw new Error("Deleting vector index is not currently supported."); + } + + if (config instanceof FtsIndexConfig) { + throw new Error("Deleting FTS index is not currently supported."); + } + + if (config instanceof SparseVectorIndexConfig) { + throw new Error("Deleting sparse vector index is not currently supported."); + } + + if (keyProvided && !configProvided && key) { + this.disableAllIndexesForKey(key); + return this; + } + + if (keyProvided && configProvided && key) { + this.setIndexForKey(key, config as IndexConfig, false); + } else if (!keyProvided && configProvided) { + this.setIndexInDefaults(config as IndexConfig, false); + } + + return this; + } + + serializeToJSON(): InternalSchema { + const defaults = this.serializeValueTypes(this.defaults); + + const keys: Record = {}; + for (const [keyName, valueTypes] of Object.entries(this.keys)) { + keys[keyName] = this.serializeValueTypes(valueTypes); + } + + return { + defaults, + keys, + }; + } + + static deserializeFromJSON(json?: InternalSchema | JsonDict | null): Schema | undefined { + if (json == null) { + return undefined; + } + + const data = json as JsonDict; + const instance = Object.create(Schema.prototype) as Schema; + instance.defaults = Schema.deserializeValueTypes( + (data.defaults ?? {}) as Record, + ); + instance.keys = {}; + const keys = (data.keys ?? 
{}) as Record>; + for (const [keyName, value] of Object.entries(keys)) { + instance.keys[keyName] = Schema.deserializeValueTypes(value); + } + return instance; + } + + private setVectorIndexConfig(config: VectorIndexConfig): void { + const defaultsFloatList = ensureFloatListValueType(this.defaults); + const currentDefaultsVector = + defaultsFloatList.vectorIndex ?? new VectorIndexType(false, new VectorIndexConfig()); + defaultsFloatList.vectorIndex = new VectorIndexType( + currentDefaultsVector.enabled, + new VectorIndexConfig({ + space: config.space ?? null, + embeddingFunction: config.embeddingFunction ?? null, + sourceKey: config.sourceKey ?? null, + hnsw: config.hnsw ? cloneObject(config.hnsw) : null, + spann: config.spann ? cloneObject(config.spann) : null, + }), + ); + + const embeddingValueTypes = ensureValueTypes(this.keys[EMBEDDING_KEY]); + this.keys[EMBEDDING_KEY] = embeddingValueTypes; + const overrideFloatList = ensureFloatListValueType(embeddingValueTypes); + const currentOverrideVector = + overrideFloatList.vectorIndex ?? new VectorIndexType(true, new VectorIndexConfig({ sourceKey: DOCUMENT_KEY })); + const preservedSourceKey = currentOverrideVector.config.sourceKey ?? DOCUMENT_KEY; + overrideFloatList.vectorIndex = new VectorIndexType( + currentOverrideVector.enabled, + new VectorIndexConfig({ + space: config.space ?? null, + embeddingFunction: config.embeddingFunction ?? null, + sourceKey: preservedSourceKey, + hnsw: config.hnsw ? cloneObject(config.hnsw) : null, + spann: config.spann ? cloneObject(config.spann) : null, + }), + ); + } + + private setFtsIndexConfig(config: FtsIndexConfig): void { + const defaultsString = ensureStringValueType(this.defaults); + const currentDefaultsFts = + defaultsString.ftsIndex ?? 
new FtsIndexType(false, new FtsIndexConfig()); + defaultsString.ftsIndex = new FtsIndexType(currentDefaultsFts.enabled, config); + + const documentValueTypes = ensureValueTypes(this.keys[DOCUMENT_KEY]); + this.keys[DOCUMENT_KEY] = documentValueTypes; + const overrideString = ensureStringValueType(documentValueTypes); + const currentOverrideFts = + overrideString.ftsIndex ?? new FtsIndexType(true, new FtsIndexConfig()); + overrideString.ftsIndex = new FtsIndexType(currentOverrideFts.enabled, config); + } + + /** Stores config, with the given enabled flag, on the value type of this.defaults that matches the config class: FTS and string inverted configs go to string, vector to floatList, sparse vector to sparseVector, and the int, float and bool inverted configs to their own value types. A config matching none of the instanceof checks leaves this.defaults unchanged. */ private setIndexInDefaults(config: IndexConfig, enabled: boolean): void { + if (config instanceof FtsIndexConfig) { + const valueType = ensureStringValueType(this.defaults); + valueType.ftsIndex = new FtsIndexType(enabled, config); + } else if (config instanceof StringInvertedIndexConfig) { + const valueType = ensureStringValueType(this.defaults); + valueType.stringInvertedIndex = new StringInvertedIndexType(enabled, config); + } else if (config instanceof VectorIndexConfig) { + const valueType = ensureFloatListValueType(this.defaults); + valueType.vectorIndex = new VectorIndexType(enabled, config); + } else if (config instanceof SparseVectorIndexConfig) { + const valueType = ensureSparseVectorValueType(this.defaults); + valueType.sparseVectorIndex = new SparseVectorIndexType(enabled, config); + } else if (config instanceof IntInvertedIndexConfig) { + const valueType = ensureIntValueType(this.defaults); + valueType.intInvertedIndex = new IntInvertedIndexType(enabled, config); + } else if (config instanceof FloatInvertedIndexConfig) { + const valueType = ensureFloatValueType(this.defaults); + valueType.floatInvertedIndex = new FloatInvertedIndexType(enabled, config); + } else if (config instanceof BoolInvertedIndexConfig) { + const valueType = ensureBoolValueType(this.defaults); + valueType.boolInvertedIndex = new BoolInvertedIndexType(enabled, config); + } + } + + /** Stores config, with the given enabled flag, on the override entry this.keys[key], choosing the value type by the config class. Enabling a sparse vector index first calls validateSingleSparseVectorIndex(key). */ private setIndexForKey(key: string, config: IndexConfig, enabled: boolean): void { + if (config 
instanceof SparseVectorIndexConfig && enabled) { + this.validateSingleSparseVectorIndex(key); + } + + /* Store the result of ensureValueTypes back on this.keys so the branches below update the registered override. */ const current = (this.keys[key] = ensureValueTypes(this.keys[key])); + + if (config instanceof StringInvertedIndexConfig) { + const valueType = ensureStringValueType(current); + valueType.stringInvertedIndex = new StringInvertedIndexType(enabled, config); + } else if (config instanceof FtsIndexConfig) { + const valueType = ensureStringValueType(current); + valueType.ftsIndex = new FtsIndexType(enabled, config); + } else if (config instanceof SparseVectorIndexConfig) { + const valueType = ensureSparseVectorValueType(current); + valueType.sparseVectorIndex = new SparseVectorIndexType(enabled, config); + } else if (config instanceof VectorIndexConfig) { + const valueType = ensureFloatListValueType(current); + valueType.vectorIndex = new VectorIndexType(enabled, config); + } else if (config instanceof IntInvertedIndexConfig) { + const valueType = ensureIntValueType(current); + valueType.intInvertedIndex = new IntInvertedIndexType(enabled, config); + } else if (config instanceof FloatInvertedIndexConfig) { + const valueType = ensureFloatValueType(current); + valueType.floatInvertedIndex = new FloatInvertedIndexType(enabled, config); + } else if (config instanceof BoolInvertedIndexConfig) { + const valueType = ensureBoolValueType(current); + valueType.boolInvertedIndex = new BoolInvertedIndexType(enabled, config); + } + } + + /** Overwrites every value type of the override for key with an enabled index built from a default-constructed config. Throws an Error when key is EMBEDDING_KEY or DOCUMENT_KEY. NOTE(review): the sparse vector index is enabled here without calling validateSingleSparseVectorIndex, unlike setIndexForKey - confirm this is intended. */ private enableAllIndexesForKey(key: string): void { + if (key === EMBEDDING_KEY || key === DOCUMENT_KEY) { + throw new Error( + `Cannot enable all indexes for special key '${key}'. 
These keys are managed automatically by the system.`, + ); + } + + const current = (this.keys[key] = ensureValueTypes(this.keys[key])); + current.string = new StringValueType( + new FtsIndexType(true, new FtsIndexConfig()), + new StringInvertedIndexType(true, new StringInvertedIndexConfig()), + ); + current.floatList = new FloatListValueType( + new VectorIndexType(true, new VectorIndexConfig()), + ); + current.sparseVector = new SparseVectorValueType( + new SparseVectorIndexType(true, new SparseVectorIndexConfig()), + ); + current.intValue = new IntValueType( + new IntInvertedIndexType(true, new IntInvertedIndexConfig()), + ); + current.floatValue = new FloatValueType( + new FloatInvertedIndexType(true, new FloatInvertedIndexConfig()), + ); + current.boolean = new BoolValueType( + new BoolInvertedIndexType(true, new BoolInvertedIndexConfig()), + ); + } + + private disableAllIndexesForKey(key: string): void { + if (key === EMBEDDING_KEY || key === DOCUMENT_KEY) { + throw new Error( + `Cannot disable all indexes for special key '${key}'. 
These keys are managed automatically by the system.`, + ); + } + + const current = (this.keys[key] = ensureValueTypes(this.keys[key])); + current.string = new StringValueType( + new FtsIndexType(false, new FtsIndexConfig()), + new StringInvertedIndexType(false, new StringInvertedIndexConfig()), + ); + current.floatList = new FloatListValueType( + new VectorIndexType(false, new VectorIndexConfig()), + ); + current.sparseVector = new SparseVectorValueType( + new SparseVectorIndexType(false, new SparseVectorIndexConfig()), + ); + current.intValue = new IntValueType( + new IntInvertedIndexType(false, new IntInvertedIndexConfig()), + ); + current.floatValue = new FloatValueType( + new FloatInvertedIndexType(false, new FloatInvertedIndexConfig()), + ); + current.boolean = new BoolValueType( + new BoolInvertedIndexType(false, new BoolInvertedIndexConfig()), + ); + } + + private validateSingleSparseVectorIndex(targetKey: string): void { + for (const [existingKey, valueTypes] of Object.entries(this.keys)) { + if (existingKey === targetKey) continue; + const sparseIndex = valueTypes.sparseVector?.sparseVectorIndex; + if (sparseIndex?.enabled) { + throw new Error( + `Cannot enable sparse vector index on key '${targetKey}'. A sparse vector index is already enabled on key '${existingKey}'. 
Only one sparse vector index is allowed per collection.`, + ); + } + } + } + + private initializeDefaults(): void { + this.defaults.string = new StringValueType( + new FtsIndexType(false, new FtsIndexConfig()), + new StringInvertedIndexType(true, new StringInvertedIndexConfig()), + ); + + this.defaults.floatList = new FloatListValueType( + new VectorIndexType(false, new VectorIndexConfig()), + ); + + this.defaults.sparseVector = new SparseVectorValueType( + new SparseVectorIndexType(false, new SparseVectorIndexConfig()), + ); + + this.defaults.intValue = new IntValueType( + new IntInvertedIndexType(true, new IntInvertedIndexConfig()), + ); + + this.defaults.floatValue = new FloatValueType( + new FloatInvertedIndexType(true, new FloatInvertedIndexConfig()), + ); + + this.defaults.boolean = new BoolValueType( + new BoolInvertedIndexType(true, new BoolInvertedIndexConfig()), + ); + } + + private initializeKeys(): void { + this.keys[DOCUMENT_KEY] = new ValueTypes(); + this.keys[DOCUMENT_KEY].string = new StringValueType( + new FtsIndexType(true, new FtsIndexConfig()), + new StringInvertedIndexType(false, new StringInvertedIndexConfig()), + ); + + this.keys[EMBEDDING_KEY] = new ValueTypes(); + this.keys[EMBEDDING_KEY].floatList = new FloatListValueType( + new VectorIndexType( + true, + new VectorIndexConfig({ sourceKey: DOCUMENT_KEY }), + ), + ); + } + + private serializeValueTypes(valueTypes: ValueTypes): ValueTypesJson { + const result: ValueTypesJson = {}; + + if (valueTypes.string) { + const serialized = this.serializeStringValueType(valueTypes.string); + if (Object.keys(serialized).length > 0) { + result[STRING_VALUE_NAME] = serialized; + } + } + + if (valueTypes.floatList) { + const serialized = this.serializeFloatListValueType(valueTypes.floatList); + if (Object.keys(serialized).length > 0) { + result[FLOAT_LIST_VALUE_NAME] = serialized; + } + } + + if (valueTypes.sparseVector) { + const serialized = this.serializeSparseVectorValueType(valueTypes.sparseVector); + 
if (Object.keys(serialized).length > 0) { + result[SPARSE_VECTOR_VALUE_NAME] = serialized; + } + } + + /* As with the value types above, an entry is written only when its serialized form has at least one key. */ if (valueTypes.intValue) { + const serialized = this.serializeIntValueType(valueTypes.intValue); + if (Object.keys(serialized).length > 0) { + result[INT_VALUE_NAME] = serialized; + } + } + + if (valueTypes.floatValue) { + const serialized = this.serializeFloatValueType(valueTypes.floatValue); + if (Object.keys(serialized).length > 0) { + result[FLOAT_VALUE_NAME] = serialized; + } + } + + if (valueTypes.boolean) { + const serialized = this.serializeBoolValueType(valueTypes.boolean); + if (Object.keys(serialized).length > 0) { + result[BOOL_VALUE_NAME] = serialized; + } + } + + return result; + } + + /** Serializes the FTS and string inverted indexes of a string value type as { enabled, config } entries keyed by FTS_INDEX_NAME and STRING_INVERTED_INDEX_NAME. An index that is not set is omitted. */ private serializeStringValueType(valueType: StringValueType): JsonDict { + const result: JsonDict = {}; + if (valueType.ftsIndex) { + result[FTS_INDEX_NAME] = { + enabled: valueType.ftsIndex.enabled, + config: this.serializeConfig(valueType.ftsIndex.config), + }; + } + if (valueType.stringInvertedIndex) { + result[STRING_INVERTED_INDEX_NAME] = { + enabled: valueType.stringInvertedIndex.enabled, + config: this.serializeConfig(valueType.stringInvertedIndex.config), + }; + } + return result; + } + + /** Serializes the vector index of a float-list value type as an { enabled, config } entry under VECTOR_INDEX_NAME, or returns an empty object when no vector index is set. */ private serializeFloatListValueType(valueType: FloatListValueType): JsonDict { + const result: JsonDict = {}; + if (valueType.vectorIndex) { + result[VECTOR_INDEX_NAME] = { + enabled: valueType.vectorIndex.enabled, + config: this.serializeConfig(valueType.vectorIndex.config), + }; + } + return result; + } + + /** Serializes the sparse vector index of a sparse-vector value type as an { enabled, config } entry under SPARSE_VECTOR_INDEX_NAME, or returns an empty object when no index is set. */ private serializeSparseVectorValueType(valueType: SparseVectorValueType): JsonDict { + const result: JsonDict = {}; + if (valueType.sparseVectorIndex) { + result[SPARSE_VECTOR_INDEX_NAME] = { + enabled: valueType.sparseVectorIndex.enabled, + config: this.serializeConfig(valueType.sparseVectorIndex.config), + }; + } + return result; + } + + /** Serializes the int inverted index of an int value type as an { enabled, config } entry under INT_INVERTED_INDEX_NAME, or returns an empty object when no index is set. */ private serializeIntValueType(valueType: IntValueType): JsonDict { + const result: JsonDict = {}; + if (valueType.intInvertedIndex) { 
+ result[INT_INVERTED_INDEX_NAME] = { + enabled: valueType.intInvertedIndex.enabled, + config: this.serializeConfig(valueType.intInvertedIndex.config), + }; + } + return result; + } + + /** Serializes the float inverted index of a float value type as an { enabled, config } entry under FLOAT_INVERTED_INDEX_NAME, or returns an empty object when no index is set. */ private serializeFloatValueType(valueType: FloatValueType): JsonDict { + const result: JsonDict = {}; + if (valueType.floatInvertedIndex) { + result[FLOAT_INVERTED_INDEX_NAME] = { + enabled: valueType.floatInvertedIndex.enabled, + config: this.serializeConfig(valueType.floatInvertedIndex.config), + }; + } + return result; + } + + /** Serializes the bool inverted index of a bool value type as an { enabled, config } entry under BOOL_INVERTED_INDEX_NAME, or returns an empty object when no index is set. */ private serializeBoolValueType(valueType: BoolValueType): JsonDict { + const result: JsonDict = {}; + if (valueType.boolInvertedIndex) { + result[BOOL_INVERTED_INDEX_NAME] = { + enabled: valueType.boolInvertedIndex.enabled, + config: this.serializeConfig(valueType.boolInvertedIndex.config), + }; + } + return result; + } + + /** Serializes an index config. Only VectorIndexConfig and SparseVectorIndexConfig produce fields; every other config class serializes to an empty object. */ private serializeConfig(config: IndexConfig): JsonDict { + if (config instanceof VectorIndexConfig) { + return this.serializeVectorConfig(config); + } + if (config instanceof SparseVectorIndexConfig) { + return this.serializeSparseVectorConfig(config); + } + return {}; + } + + /** Serializes a vector index config. embedding_function is always written, using the result of prepareEmbeddingFunctionConfig. space, source_key, hnsw and spann are written only when set, and an unset space falls back to the embedding function defaultSpace() when it has one. A space missing from supportedSpaces() is reported with console.warn and still serialized. */ private serializeVectorConfig(config: VectorIndexConfig): JsonDict { + const serialized: JsonDict = {}; + const embeddingFunction = config.embeddingFunction; + const efConfig = prepareEmbeddingFunctionConfig(embeddingFunction); + serialized["embedding_function"] = efConfig; + + let resolvedSpace = config.space ?? null; + if (!resolvedSpace && embeddingFunction?.defaultSpace) { + resolvedSpace = embeddingFunction.defaultSpace(); + } + + if ( + resolvedSpace && + embeddingFunction?.supportedSpaces && + !embeddingFunction.supportedSpaces().includes(resolvedSpace) + ) { + console.warn( + `Space '${resolvedSpace}' is not supported by embedding function '${resolveEmbeddingFunctionName(embeddingFunction) ?? "unknown"}'. 
Supported spaces: ${embeddingFunction + .supportedSpaces() + .join(", ")}`, + ); + } + + if (resolvedSpace) { + serialized.space = resolvedSpace; + } + + if (config.sourceKey) { + serialized.source_key = config.sourceKey; + } + + if (config.hnsw) { + serialized.hnsw = cloneObject(config.hnsw); + } + + if (config.spann) { + serialized.spann = cloneObject(config.spann); + } + + return serialized; + } + + private serializeSparseVectorConfig(config: SparseVectorIndexConfig): JsonDict { + const serialized: JsonDict = {}; + const embeddingFunction = config.embeddingFunction; + serialized["embedding_function"] = prepareEmbeddingFunctionConfig(embeddingFunction); + + if (config.sourceKey) { + serialized.source_key = config.sourceKey; + } + + if (typeof config.bm25 === "boolean") { + serialized.bm25 = config.bm25; + } + + return serialized; + } + + private static deserializeValueTypes(json: Record): ValueTypes { + const result = new ValueTypes(); + + if (json[STRING_VALUE_NAME]) { + result.string = Schema.deserializeStringValueType(json[STRING_VALUE_NAME]); + } + + if (json[FLOAT_LIST_VALUE_NAME]) { + result.floatList = Schema.deserializeFloatListValueType(json[FLOAT_LIST_VALUE_NAME]); + } + + if (json[SPARSE_VECTOR_VALUE_NAME]) { + result.sparseVector = Schema.deserializeSparseVectorValueType(json[SPARSE_VECTOR_VALUE_NAME]); + } + + if (json[INT_VALUE_NAME]) { + result.intValue = Schema.deserializeIntValueType(json[INT_VALUE_NAME]); + } + + if (json[FLOAT_VALUE_NAME]) { + result.floatValue = Schema.deserializeFloatValueType(json[FLOAT_VALUE_NAME]); + } + + if (json[BOOL_VALUE_NAME]) { + result.boolean = Schema.deserializeBoolValueType(json[BOOL_VALUE_NAME]); + } + + return result; + } + + private static deserializeStringValueType(json: Record): StringValueType { + let ftsIndex: FtsIndexType | null = null; + let stringIndex: StringInvertedIndexType | null = null; + + if (json[FTS_INDEX_NAME]) { + const data = json[FTS_INDEX_NAME]; + ftsIndex = new 
FtsIndexType(Boolean(data.enabled), new FtsIndexConfig()); + } + + if (json[STRING_INVERTED_INDEX_NAME]) { + const data = json[STRING_INVERTED_INDEX_NAME]; + stringIndex = new StringInvertedIndexType( + Boolean(data.enabled), + new StringInvertedIndexConfig(), + ); + } + + return new StringValueType(ftsIndex, stringIndex); + } + + /** Rebuilds a FloatListValueType from JSON. The vector index is null when VECTOR_INDEX_NAME is absent; otherwise enabled is coerced with Boolean() and the config is parsed by deserializeVectorConfig, with a missing config treated as an empty object. */ private static deserializeFloatListValueType(json: Record): FloatListValueType { + let vectorIndex: VectorIndexType | null = null; + if (json[VECTOR_INDEX_NAME]) { + const data = json[VECTOR_INDEX_NAME]; + const enabled = Boolean(data.enabled); + const config = Schema.deserializeVectorConfig(data.config ?? {}); + vectorIndex = new VectorIndexType(enabled, config); + } + return new FloatListValueType(vectorIndex); + } + + /** Rebuilds a SparseVectorValueType from JSON. The sparse index is null when SPARSE_VECTOR_INDEX_NAME is absent; otherwise enabled is coerced with Boolean() and the config is parsed by deserializeSparseVectorConfig, with a missing config treated as an empty object. */ private static deserializeSparseVectorValueType(json: Record): SparseVectorValueType { + let sparseIndex: SparseVectorIndexType | null = null; + if (json[SPARSE_VECTOR_INDEX_NAME]) { + const data = json[SPARSE_VECTOR_INDEX_NAME]; + const enabled = Boolean(data.enabled); + const config = Schema.deserializeSparseVectorConfig(data.config ?? 
{}); + sparseIndex = new SparseVectorIndexType(enabled, config); + } + return new SparseVectorValueType(sparseIndex); + } + + /** Rebuilds an IntValueType from JSON. Only the enabled flag is read; the config is always a fresh IntInvertedIndexConfig. The index is null when INT_INVERTED_INDEX_NAME is absent. */ private static deserializeIntValueType(json: Record): IntValueType { + let index: IntInvertedIndexType | null = null; + if (json[INT_INVERTED_INDEX_NAME]) { + const data = json[INT_INVERTED_INDEX_NAME]; + index = new IntInvertedIndexType(Boolean(data.enabled), new IntInvertedIndexConfig()); + } + return new IntValueType(index); + } + + /** Rebuilds a FloatValueType from JSON. Only the enabled flag is read; the config is always a fresh FloatInvertedIndexConfig. The index is null when FLOAT_INVERTED_INDEX_NAME is absent. */ private static deserializeFloatValueType(json: Record): FloatValueType { + let index: FloatInvertedIndexType | null = null; + if (json[FLOAT_INVERTED_INDEX_NAME]) { + const data = json[FLOAT_INVERTED_INDEX_NAME]; + index = new FloatInvertedIndexType(Boolean(data.enabled), new FloatInvertedIndexConfig()); + } + return new FloatValueType(index); + } + + /** Rebuilds a BoolValueType from JSON. Only the enabled flag is read; the config is always a fresh BoolInvertedIndexConfig. The index is null when BOOL_INVERTED_INDEX_NAME is absent. */ private static deserializeBoolValueType(json: Record): BoolValueType { + let index: BoolInvertedIndexType | null = null; + if (json[BOOL_INVERTED_INDEX_NAME]) { + const data = json[BOOL_INVERTED_INDEX_NAME]; + index = new BoolInvertedIndexType(Boolean(data.enabled), new BoolInvertedIndexConfig()); + } + return new BoolValueType(index); + } + + /** Rebuilds a VectorIndexConfig from JSON. space and source_key are copied, hnsw and spann are copied through cloneObject, and the embedding function comes from getEmbeddingFunction applied to json.embedding_function. When space is unset it falls back to the embedding function defaultSpace(), if it has one. NOTE(review): no embeddingFunction is passed to the constructor here, so the config.embeddingFunction fallback is presumably always empty - confirm against VectorIndexConfig. */ private static deserializeVectorConfig(json: Record): VectorIndexConfig { + const config = new VectorIndexConfig({ + space: (json.space as Space | null | undefined) ?? null, + sourceKey: (json.source_key as string | null | undefined) ?? null, + hnsw: json.hnsw ? cloneObject(json.hnsw) : null, + spann: json.spann ? cloneObject(json.spann) : null, + }); + + const embeddingFunction = + getEmbeddingFunction( + "schema deserialization", + json.embedding_function as EmbeddingFunctionConfiguration, + ) ?? (config.embeddingFunction as EmbeddingFunction | null | undefined) ?? undefined; + + config.embeddingFunction = embeddingFunction ?? 
null; + if (!config.space && config.embeddingFunction?.defaultSpace) { + config.space = config.embeddingFunction.defaultSpace(); + } + + return config; + } + + /** Rebuilds a SparseVectorIndexConfig from JSON. source_key is copied, bm25 is kept only when it is a boolean (otherwise null), and the embedding function comes from getSparseEmbeddingFunction applied to json.embedding_function, ending as null when nothing is resolved. */ private static deserializeSparseVectorConfig(json: Record): SparseVectorIndexConfig { + const config = new SparseVectorIndexConfig({ + sourceKey: (json.source_key as string | null | undefined) ?? null, + bm25: typeof json.bm25 === "boolean" ? json.bm25 : null, + }); + + const embeddingFunction = + getSparseEmbeddingFunction( + "schema deserialization", + json.embedding_function as EmbeddingFunctionConfiguration, + ) ?? + (config.embeddingFunction as SparseEmbeddingFunction | null | undefined) ?? + undefined; + + config.embeddingFunction = embeddingFunction ?? null; + return config; + } +} diff --git a/clients/new-js/packages/chromadb/test/schema.test.ts b/clients/new-js/packages/chromadb/test/schema.test.ts new file mode 100644 index 00000000000..a207b833823 --- /dev/null +++ b/clients/new-js/packages/chromadb/test/schema.test.ts @@ -0,0 +1,1412 @@ +import { CollectionImpl } from "../src/collection"; +import type { CollectionConfiguration } from "../src/collection-configuration"; +import type { CollectionMetadata } from "../src/types"; +import { + registerEmbeddingFunction, + registerSparseEmbeddingFunction, + EmbeddingFunction, + SparseEmbeddingFunction, +} from "../src/embedding-function"; +import { + DOCUMENT_KEY, + EMBEDDING_KEY, + Schema, + FtsIndexConfig, + StringInvertedIndexConfig, + IntInvertedIndexConfig, + FloatInvertedIndexConfig, + BoolInvertedIndexConfig, + SparseVectorIndexConfig, + VectorIndexConfig, +} from "../src/schema"; +import type { ChromaClient } from "../src/chroma-client"; + +/** Test embedding function named mock_embedding: generate() returns [1, 2, 3] for every input text and getConfig() exposes the model name given to the constructor. */ class MockEmbedding implements EmbeddingFunction { + public readonly name = "mock_embedding"; + + constructor(private readonly modelName = "mock_model") { } + + async generate(texts: string[]): Promise { + return texts.map(() => [1, 2, 3]); + } + + getConfig(): Record { + return { modelName: this.modelName }; + 
} + + defaultSpace(): "cosine" { + return "cosine"; + } + + supportedSpaces(): ("cosine" | "l2" | "ip")[] { + return ["cosine", "l2", "ip"]; + } + + static buildFromConfig(config: Record): MockEmbedding { + return new MockEmbedding(config.modelName); + } +} + +/** Test sparse embedding function named mock_sparse: generate() returns indices [0, 1] with values [1, 1] for every input text. */ class MockSparseEmbedding implements SparseEmbeddingFunction { + public readonly name = "mock_sparse"; + + constructor(private readonly identifier = "mock_sparse") { } + + async generate(texts: string[]) { + return texts.map(() => ({ indices: [0, 1], values: [1, 1] })); + } + + getConfig(): Record { + return { identifier: this.identifier }; + } + + static buildFromConfig(config: Record): MockSparseEmbedding { + return new MockSparseEmbedding(config.identifier); + } +} + +/** Test sparse embedding function whose output depends on the input: for the text at position index, character i yields the index value index * 1000 + i and the value charCode divided by 100. */ class DeterministicSparseEmbedding implements SparseEmbeddingFunction { + public readonly name = "deterministic_sparse"; + + constructor(private readonly label = "det") { } + + async generate(texts: string[]) { + return texts.map((text, index) => { + const indices: number[] = []; + const values: number[] = []; + + for (let i = 0; i < text.length; i++) { + indices.push(index * 1000 + i); + values.push(text.charCodeAt(i) / 100.0); + } + + return { indices, values }; + }); + } + + getConfig(): Record { + return { label: this.label }; + } + + static buildFromConfig(config: Record): DeterministicSparseEmbedding { + return new DeterministicSparseEmbedding(config.label); + } +} + +/* Registers the two mock embedding functions before the tests run; a registration error is swallowed so that repeated runs do not fail. */ beforeAll(() => { + try { + registerEmbeddingFunction("mock_embedding", MockEmbedding as any); + } catch (_err) { + // ignore double registration in watch mode + } + try { + registerSparseEmbeddingFunction("mock_sparse", MockSparseEmbedding as any); + } catch (_err) { + // ignore double registration in watch mode + } +}); + +describe("Schema", () => { + it("default schema initialization", () => { + const schema = new Schema(); + + expect(schema.defaults).toBeDefined(); + + expect(schema.defaults.string).not.toBeNull(); + 
expect(schema.defaults.string?.ftsIndex?.enabled).toBe(false); + expect(schema.defaults.string?.stringInvertedIndex?.enabled).toBe(true); + + expect(schema.defaults.floatList).not.toBeNull(); + expect(schema.defaults.floatList?.vectorIndex?.enabled).toBe(false); + + expect(schema.defaults.sparseVector).not.toBeNull(); + expect(schema.defaults.sparseVector?.sparseVectorIndex?.enabled).toBe(false); + + expect(schema.defaults.intValue).not.toBeNull(); + expect(schema.defaults.intValue?.intInvertedIndex?.enabled).toBe(true); + + expect(schema.defaults.floatValue).not.toBeNull(); + expect(schema.defaults.floatValue?.floatInvertedIndex?.enabled).toBe(true); + + expect(schema.defaults.boolean).not.toBeNull(); + expect(schema.defaults.boolean?.boolInvertedIndex?.enabled).toBe(true); + + /* A fresh schema carries exactly two key overrides: DOCUMENT_KEY and EMBEDDING_KEY. */ const overrideKeys = Object.keys(schema.keys); + expect(overrideKeys).toEqual(expect.arrayContaining([DOCUMENT_KEY, EMBEDDING_KEY])); + expect(overrideKeys).toHaveLength(2); + + const documentOverride = schema.keys[DOCUMENT_KEY]; + expect(documentOverride.string?.ftsIndex?.enabled).toBe(true); + expect(documentOverride.string?.stringInvertedIndex?.enabled).toBe(false); + + const embeddingOverride = schema.keys[EMBEDDING_KEY]; + expect(embeddingOverride.floatList?.vectorIndex?.enabled).toBe(true); + expect(embeddingOverride.floatList?.vectorIndex?.config.sourceKey).toBe(DOCUMENT_KEY); + }); + + /* createIndex with a sparse vector config and a key enables that index on the key override only; the other value types of the override stay null and the default stays disabled. */ it("create sparse vector index on key", () => { + const schema = new Schema(); + const config = new SparseVectorIndexConfig(); + + const result = schema.createIndex(config, "custom_sparse_key"); + expect(result).toBe(schema); + + const override = schema.keys["custom_sparse_key"]; + expect(override.sparseVector?.sparseVectorIndex?.enabled).toBe(true); + expect(override.sparseVector?.sparseVectorIndex?.config).toBe(config); + expect(override.string).toBeNull(); + expect(override.floatList).toBeNull(); + expect(override.intValue).toBeNull(); + expect(override.floatValue).toBeNull(); + 
expect(override.boolean).toBeNull(); + + expect(schema.defaults.sparseVector?.sparseVectorIndex?.enabled).toBe(false); + }); + + /* The key override keeps the exact config instance, including its embedding function and sourceKey, while the defaults keep a null embedding function. */ it("create sparse vector index with custom config", () => { + const schema = new Schema(); + const embeddingFunc = new MockSparseEmbedding("custom_sparse_ef"); + const config = new SparseVectorIndexConfig({ + embeddingFunction: embeddingFunc, + sourceKey: "custom_document_field", + }); + + const result = schema.createIndex(config, "sparse_embeddings"); + expect(result).toBe(schema); + + const override = schema.keys["sparse_embeddings"]; + const sparseIndex = override.sparseVector?.sparseVectorIndex; + expect(sparseIndex?.enabled).toBe(true); + expect(sparseIndex?.config).toBe(config); + expect(sparseIndex?.config.embeddingFunction).toBe(embeddingFunc); + expect(sparseIndex?.config.sourceKey).toBe("custom_document_field"); + + expect(schema.defaults.sparseVector?.sparseVectorIndex?.enabled).toBe(false); + expect( + schema.defaults.sparseVector?.sparseVectorIndex?.config.embeddingFunction, + ).toBeNull(); + }); + + /* deleteIndex with a string inverted config and a key records a disabled index on that key and leaves the default enabled. */ it("delete string inverted index on key", () => { + const schema = new Schema(); + const config = new StringInvertedIndexConfig(); + + const result = schema.deleteIndex(config, "custom_text_key"); + expect(result).toBe(schema); + + const override = schema.keys["custom_text_key"]; + expect(override.string?.stringInvertedIndex?.enabled).toBe(false); + expect(override.string?.stringInvertedIndex?.config).toBe(config); + + expect(schema.keys[DOCUMENT_KEY].string?.stringInvertedIndex?.enabled).toBe(false); + expect(schema.keys[EMBEDDING_KEY].string).toBeNull(); + expect(schema.defaults.string?.stringInvertedIndex?.enabled).toBe(true); + }); + + /* createIndex and deleteIndex return the schema, so calls can be chained; each key ends up with only the override it was given. */ it("chained create and delete operations", () => { + const schema = new Schema(); + const sparseConfig = new SparseVectorIndexConfig({ sourceKey: "raw_text" }); + const stringConfig = new StringInvertedIndexConfig(); + + const result = schema + .createIndex(sparseConfig, "embeddings_key") + 
.deleteIndex(stringConfig, "text_key_1") + .deleteIndex(stringConfig, "text_key_2"); + + expect(result).toBe(schema); + + const embeddingsOverride = schema.keys["embeddings_key"]; + expect(embeddingsOverride.sparseVector?.sparseVectorIndex?.enabled).toBe(true); + expect(embeddingsOverride.sparseVector?.sparseVectorIndex?.config.sourceKey).toBe("raw_text"); + expect(embeddingsOverride.string).toBeNull(); + expect(embeddingsOverride.floatList).toBeNull(); + + const textKey1 = schema.keys["text_key_1"]; + expect(textKey1.string?.stringInvertedIndex?.enabled).toBe(false); + expect(textKey1.sparseVector).toBeNull(); + + const textKey2 = schema.keys["text_key_2"]; + expect(textKey2.string?.stringInvertedIndex?.enabled).toBe(false); + expect(textKey2.sparseVector).toBeNull(); + + expect(schema.defaults.sparseVector?.sparseVectorIndex?.enabled).toBe(false); + expect(schema.defaults.string?.stringInvertedIndex?.enabled).toBe(true); + }); + + /* A vector config passed without a key updates the defaults (still disabled) and the EMBEDDING_KEY override (enabled, sourceKey stays DOCUMENT_KEY); passing any key throws. */ it("vector index config and restrictions", () => { + const schema = new Schema(); + const vectorConfig = new VectorIndexConfig({ + space: "cosine", + sourceKey: "custom_source", + }); + + const result = schema.createIndex(vectorConfig); + expect(result).toBe(schema); + + const defaultsVector = schema.defaults.floatList?.vectorIndex; + expect(defaultsVector?.enabled).toBe(false); + expect(defaultsVector?.config.space).toBe("cosine"); + expect(defaultsVector?.config.sourceKey).toBe("custom_source"); + + const embeddingVector = schema.keys[EMBEDDING_KEY].floatList?.vectorIndex; + expect(embeddingVector?.enabled).toBe(true); + expect(embeddingVector?.config.space).toBe("cosine"); + expect(embeddingVector?.config.sourceKey).toBe(DOCUMENT_KEY); + + expect(() => schema.createIndex(new VectorIndexConfig({ space: "l2" }), "my_vectors")).toThrow( + /Vector index cannot be enabled on specific keys/, + ); + expect(() => schema.createIndex(new VectorIndexConfig({ space: "l2" }), DOCUMENT_KEY)).toThrow( + /Cannot create index on special key 
'#document'/, + ); + expect(() => schema.createIndex(new VectorIndexConfig({ space: "ip" }), EMBEDDING_KEY)).toThrow( + /Cannot create index on special key '#embedding'/, + ); + }); + + /* The embedding function, space and hnsw settings reach both the defaults and the EMBEDDING_KEY override, while the override sourceKey stays DOCUMENT_KEY. */ it("vector index with embedding function and hnsw", () => { + const schema = new Schema(); + const mockEf = new MockEmbedding("custom_model_v2"); + const vectorConfig = new VectorIndexConfig({ + embeddingFunction: mockEf, + space: "l2", + hnsw: { ef_construction: 200, max_neighbors: 32, ef_search: 100 }, + sourceKey: "custom_document_field", + }); + + const result = schema.createIndex(vectorConfig); + expect(result).toBe(schema); + + const defaultsVector = schema.defaults.floatList?.vectorIndex; + expect(defaultsVector?.enabled).toBe(false); + expect(defaultsVector?.config.embeddingFunction).toBe(mockEf); + expect(defaultsVector?.config.space).toBe("l2"); + expect(defaultsVector?.config.hnsw).toEqual({ + ef_construction: 200, + max_neighbors: 32, + ef_search: 100, + }); + expect(defaultsVector?.config.sourceKey).toBe("custom_document_field"); + + const embeddingVector = schema.keys[EMBEDDING_KEY].floatList?.vectorIndex; + expect(embeddingVector?.enabled).toBe(true); + expect(embeddingVector?.config.embeddingFunction).toBe(mockEf); + expect(embeddingVector?.config.space).toBe("l2"); + expect(embeddingVector?.config.hnsw).toEqual({ + ef_construction: 200, + max_neighbors: 32, + ef_search: 100, + }); + expect(embeddingVector?.config.sourceKey).toBe(DOCUMENT_KEY); + }); + + /* An FTS config passed without a key is stored on the defaults (disabled) and on the DOCUMENT_KEY override (enabled); passing any key throws. */ it("fts index config and restrictions", () => { + const schema = new Schema(); + const ftsConfig = new FtsIndexConfig(); + + const result = schema.createIndex(ftsConfig); + expect(result).toBe(schema); + + const defaultsString = schema.defaults.string; + expect(defaultsString?.ftsIndex?.enabled).toBe(false); + expect(defaultsString?.ftsIndex?.config).toBe(ftsConfig); + + const documentOverride = schema.keys[DOCUMENT_KEY]; + expect(documentOverride.string?.ftsIndex?.enabled).toBe(true); + 
expect(documentOverride.string?.ftsIndex?.config).toBe(ftsConfig); + + expect(() => schema.createIndex(new FtsIndexConfig(), "custom_text_field")).toThrow( + /FTS index cannot be enabled on specific keys/, + ); + expect(() => schema.createIndex(new FtsIndexConfig(), DOCUMENT_KEY)).toThrow( + /Cannot create index on special key '#document'/, + ); + expect(() => schema.createIndex(new FtsIndexConfig(), EMBEDDING_KEY)).toThrow( + /Cannot create index on special key '#embedding'/, + ); + }); + + /* createIndex rejects DOCUMENT_KEY and EMBEDDING_KEY for string inverted and sparse vector configs as well. */ it("special keys blocked for all index types", () => { + const schema = new Schema(); + + expect(() => schema.createIndex(new StringInvertedIndexConfig(), DOCUMENT_KEY)).toThrow( + /Cannot create index on special key '#document'/, + ); + expect(() => schema.createIndex(new StringInvertedIndexConfig(), EMBEDDING_KEY)).toThrow( + /Cannot create index on special key '#embedding'/, + ); + expect(() => schema.createIndex(new SparseVectorIndexConfig(), DOCUMENT_KEY)).toThrow( + /Cannot create index on special key '#document'/, + ); + expect(() => schema.createIndex(new SparseVectorIndexConfig(), EMBEDDING_KEY)).toThrow( + /Cannot create index on special key '#embedding'/, + ); + }); + + /* With no config, createIndex enables every index type on the key and deleteIndex then disables every one. */ it("enable and disable all indexes for custom key", () => { + const schema = new Schema(); + + schema.createIndex(undefined, "my_key"); + + const enabled = schema.keys["my_key"]; + expect(enabled.string?.ftsIndex?.enabled).toBe(true); + expect(enabled.string?.stringInvertedIndex?.enabled).toBe(true); + expect(enabled.floatList?.vectorIndex?.enabled).toBe(true); + expect(enabled.sparseVector?.sparseVectorIndex?.enabled).toBe(true); + expect(enabled.intValue?.intInvertedIndex?.enabled).toBe(true); + expect(enabled.floatValue?.floatInvertedIndex?.enabled).toBe(true); + expect(enabled.boolean?.boolInvertedIndex?.enabled).toBe(true); + + schema.deleteIndex(undefined, "my_key"); + + const disabled = schema.keys["my_key"]; + expect(disabled.string?.ftsIndex?.enabled).toBe(false); + 
expect(disabled.string?.stringInvertedIndex?.enabled).toBe(false); + expect(disabled.floatList?.vectorIndex?.enabled).toBe(false); + expect(disabled.sparseVector?.sparseVectorIndex?.enabled).toBe(false); + expect(disabled.intValue?.intInvertedIndex?.enabled).toBe(false); + expect(disabled.floatValue?.floatInvertedIndex?.enabled).toBe(false); + expect(disabled.boolean?.boolInvertedIndex?.enabled).toBe(false); + }); + + /* deleteIndex throws for vector and FTS configs, with or without a key. */ it("cannot delete vector or fts index", () => { + const schema = new Schema(); + + expect(() => schema.deleteIndex(new VectorIndexConfig())).toThrow( + "Deleting vector index is not currently supported.", + ); + expect(() => schema.deleteIndex(new VectorIndexConfig(), "my_vectors")).toThrow( + "Deleting vector index is not currently supported.", + ); + expect(() => schema.deleteIndex(new FtsIndexConfig())).toThrow( + "Deleting FTS index is not currently supported.", + ); + expect(() => schema.deleteIndex(new FtsIndexConfig(), "my_text_field")).toThrow( + "Deleting FTS index is not currently supported.", + ); + }); + + /* deleteIndex without a key disables the string inverted index in the defaults and leaves the special key overrides as they were. */ it("disable string inverted index globally", () => { + const schema = new Schema(); + const config = new StringInvertedIndexConfig(); + + const result = schema.deleteIndex(config); + expect(result).toBe(schema); + + expect(schema.defaults.string?.stringInvertedIndex?.enabled).toBe(false); + expect(schema.defaults.string?.stringInvertedIndex?.config).toBe(config); + + expect(schema.keys[DOCUMENT_KEY].string?.stringInvertedIndex?.enabled).toBe(false); + expect(schema.keys[EMBEDDING_KEY].floatList?.vectorIndex?.enabled).toBe(true); + }); + + /* deleteIndex with a key disables the string inverted index on that key only; the default stays enabled. */ it("disable string inverted index on key", () => { + const schema = new Schema(); + const config = new StringInvertedIndexConfig(); + + const result = schema.deleteIndex(config, "my_text_field"); + expect(result).toBe(schema); + + const override = schema.keys["my_text_field"]; + expect(override.string?.stringInvertedIndex?.enabled).toBe(false); + 
expect(override.string?.stringInvertedIndex?.config).toBe(config); + expect(override.floatList).toBeNull(); + expect(override.sparseVector).toBeNull(); + expect(override.intValue).toBeNull(); + + expect(schema.defaults.string?.stringInvertedIndex?.enabled).toBe(true); + expect(schema.keys[DOCUMENT_KEY].string?.stringInvertedIndex?.enabled).toBe(false); + }); + + it("disable int inverted index", () => { + const schema = new Schema(); + const configGlobal = new IntInvertedIndexConfig(); + + expect(schema.defaults.intValue?.intInvertedIndex?.enabled).toBe(true); + + schema.deleteIndex(configGlobal); + expect(schema.defaults.intValue?.intInvertedIndex?.enabled).toBe(false); + expect(schema.defaults.intValue?.intInvertedIndex?.config).toBe(configGlobal); + + const configKey = new IntInvertedIndexConfig(); + schema.deleteIndex(configKey, "age_field"); + + const override = schema.keys["age_field"]; + expect(override.intValue?.intInvertedIndex?.enabled).toBe(false); + expect(override.intValue?.intInvertedIndex?.config).toBe(configKey); + expect(override.string).toBeNull(); + expect(override.floatList).toBeNull(); + expect(override.sparseVector).toBeNull(); + expect(override.floatValue).toBeNull(); + expect(override.boolean).toBeNull(); + }); + + // Additional tests will be appended below. 
+ + it("serialize and deserialize default schema", () => { + const schema = new Schema(); + const json = schema.serializeToJSON(); + + expect(json).toHaveProperty("defaults"); + expect(json).toHaveProperty("keys"); + + const defaults = json.defaults; + expect(defaults["string"]!["fts_index"]!.enabled).toBe(false); + expect(defaults["string"]!["fts_index"]!.config).toEqual({}); + expect(defaults["string"]!["string_inverted_index"]!.enabled).toBe(true); + expect(defaults["string"]!["string_inverted_index"]!.config).toEqual({}); + + const vectorJson = defaults["float_list"]!["vector_index"]!; + expect(vectorJson.enabled).toBe(false); + expect(vectorJson.config!.embedding_function).toEqual({ type: "legacy" }); + expect(vectorJson.config!.space).toBeUndefined(); + + const sparseJson = defaults["sparse_vector"]!["sparse_vector_index"]!; + expect(sparseJson.enabled).toBe(false); + expect(sparseJson.config!.embedding_function).toEqual({ type: "legacy" }); + + expect(defaults["int"]!["int_inverted_index"]!.enabled).toBe(true); + expect(defaults["float"]!["float_inverted_index"]!.enabled).toBe(true); + expect(defaults["bool"]!["bool_inverted_index"]!.enabled).toBe(true); + + const overrides = json.keys; + expect(overrides).toHaveProperty(DOCUMENT_KEY); + expect(overrides).toHaveProperty(EMBEDDING_KEY); + + const documentJson = overrides[DOCUMENT_KEY]!["string"]!; + expect(documentJson["fts_index"]!.enabled).toBe(true); + expect(documentJson["fts_index"]!.config).toEqual({}); + expect(documentJson["string_inverted_index"]!.enabled).toBe(false); + + const embeddingJson = overrides[EMBEDDING_KEY]!["float_list"]!["vector_index"]!; + expect(embeddingJson.enabled).toBe(true); + expect(embeddingJson.config!.embedding_function).toEqual({ type: "legacy" }); + expect(embeddingJson.config!.source_key).toBe(DOCUMENT_KEY); + + const deserialized = Schema.deserializeFromJSON(json); + expect(deserialized).toBeDefined(); + 
expect(deserialized!.defaults.string?.ftsIndex?.enabled).toBe(false); + expect(deserialized!.defaults.string?.stringInvertedIndex?.enabled).toBe(true); + expect(deserialized!.defaults.floatList?.vectorIndex?.enabled).toBe(false); + expect(deserialized!.defaults.sparseVector?.sparseVectorIndex?.enabled).toBe(false); + expect(deserialized!.defaults.intValue?.intInvertedIndex?.enabled).toBe(true); + expect(deserialized!.defaults.floatValue?.floatInvertedIndex?.enabled).toBe(true); + expect(deserialized!.defaults.boolean?.boolInvertedIndex?.enabled).toBe(true); + expect(deserialized!.keys[DOCUMENT_KEY].string?.ftsIndex?.enabled).toBe(true); + expect(deserialized!.keys[EMBEDDING_KEY].floatList?.vectorIndex?.enabled).toBe(true); + }); + + it("serialize and deserialize with vector config and no embedding function", () => { + const schema = new Schema(); + const vectorConfig = new VectorIndexConfig({ + space: "cosine", + embeddingFunction: null, + }); + + schema.createIndex(vectorConfig); + + const json = schema.serializeToJSON(); + const defaultsVector = json.defaults["float_list"]!["vector_index"]!; + expect(defaultsVector.enabled).toBe(false); + expect(defaultsVector.config!.space).toBe("cosine"); + expect(defaultsVector.config!.embedding_function!.type).toBe("legacy"); + + const embeddingVector = json.keys[EMBEDDING_KEY]!["float_list"]!["vector_index"]!; + expect(embeddingVector.enabled).toBe(true); + expect(embeddingVector.config!.space).toBe("cosine"); + expect(embeddingVector.config!.embedding_function!.type).toBe("legacy"); + expect(embeddingVector.config!.source_key).toBe(DOCUMENT_KEY); + + const deserialized = Schema.deserializeFromJSON(json); + expect(deserialized?.defaults.floatList?.vectorIndex?.config.space).toBe("cosine"); + expect(deserialized?.defaults.floatList?.vectorIndex?.config.embeddingFunction).toBeNull(); + expect(deserialized?.keys[EMBEDDING_KEY].floatList?.vectorIndex?.config.space).toBe("cosine"); + 
expect(deserialized?.keys[EMBEDDING_KEY].floatList?.vectorIndex?.config.embeddingFunction).toBeNull(); + }); + + it("serialize and deserialize with custom embedding function", () => { + const schema = new Schema(); + const mockEf = new MockEmbedding("custom_model_v3"); + const vectorConfig = new VectorIndexConfig({ + embeddingFunction: mockEf, + space: "ip", + hnsw: { ef_construction: 256, max_neighbors: 48, ef_search: 128 }, + }); + + schema.createIndex(vectorConfig); + + const json = schema.serializeToJSON(); + const defaultsVector = json.defaults["float_list"]!["vector_index"]!; + expect(defaultsVector.config!.space).toBe("ip"); + expect(defaultsVector.config!.embedding_function).toEqual({ + type: "known", + name: "mock_embedding", + config: { modelName: "custom_model_v3" }, + }); + expect(defaultsVector.config!.hnsw).toEqual({ + ef_construction: 256, + max_neighbors: 48, + ef_search: 128, + }); + + const embeddingVector = json.keys[EMBEDDING_KEY]!["float_list"]!["vector_index"]!; + expect(embeddingVector.config!.embedding_function).toEqual({ + type: "known", + name: "mock_embedding", + config: { modelName: "custom_model_v3" }, + }); + expect(embeddingVector.config!.space).toBe("ip"); + expect(embeddingVector.config!.hnsw).toEqual({ + ef_construction: 256, + max_neighbors: 48, + ef_search: 128, + }); + + const deserialized = Schema.deserializeFromJSON(json); + const desDefaultsVector = deserialized?.defaults.floatList?.vectorIndex; + expect(desDefaultsVector?.config.embeddingFunction).toBeDefined(); + expect(desDefaultsVector?.config.embeddingFunction?.getConfig?.()).toEqual({ + modelName: "custom_model_v3", + }); + expect(desDefaultsVector?.config.space).toBe("ip"); + expect(desDefaultsVector?.config.hnsw).toEqual({ + ef_construction: 256, + max_neighbors: 48, + ef_search: 128, + }); + }); + + it("serialize and deserialize with SPANN config", () => { + const schema = new Schema(); + const mockEf = new MockEmbedding("spann_model"); + const spannConfig = { + 
search_nprobe: 100, + write_nprobe: 50, + ef_construction: 200, + ef_search: 150, + }; + const vectorConfig = new VectorIndexConfig({ + embeddingFunction: mockEf, + space: "cosine", + spann: spannConfig, + }); + + schema.createIndex(vectorConfig); + + const json = schema.serializeToJSON(); + const defaultsVector = json.defaults["float_list"]!["vector_index"]!; + expect(defaultsVector.config!.space).toBe("cosine"); + expect(defaultsVector.config!.embedding_function).toEqual({ + type: "known", + name: "mock_embedding", + config: { modelName: "spann_model" }, + }); + expect(defaultsVector.config!.spann).toEqual(spannConfig); + expect(defaultsVector.config!.hnsw).toBeUndefined(); + + const embeddingVector = json.keys[EMBEDDING_KEY]!["float_list"]!["vector_index"]!; + expect(embeddingVector.config!.spann).toEqual(spannConfig); + expect(embeddingVector.config!.hnsw).toBeUndefined(); + + const deserialized = Schema.deserializeFromJSON(json); + const desDefaultsVector = deserialized?.defaults.floatList?.vectorIndex; + expect(desDefaultsVector?.config.spann).toEqual(spannConfig); + expect(desDefaultsVector?.config.hnsw).toBeNull(); + expect(desDefaultsVector?.config.embeddingFunction?.getConfig?.()).toEqual({ + modelName: "spann_model", + }); + const desEmbeddingVector = deserialized?.keys[EMBEDDING_KEY].floatList?.vectorIndex; + expect(desEmbeddingVector?.config.spann).toEqual(spannConfig); + expect(desEmbeddingVector?.config.hnsw).toBeNull(); + }); + + it("serialize and deserialize complex mixed modifications", () => { + const schema = new Schema(); + + const vectorConfig = new VectorIndexConfig({ + embeddingFunction: new MockEmbedding("mixed_test_model"), + space: "ip", + hnsw: { ef_construction: 300, max_neighbors: 64 }, + }); + schema.createIndex(vectorConfig); + + const sparseConfig = new SparseVectorIndexConfig({ + embeddingFunction: new MockSparseEmbedding("sparse_model"), + sourceKey: "text_field", + }); + schema.createIndex(sparseConfig, "embeddings_field"); + + 
schema.deleteIndex(new StringInvertedIndexConfig(), "tags"); + schema.deleteIndex(new IntInvertedIndexConfig(), "count"); + schema.deleteIndex(new FloatInvertedIndexConfig(), "price"); + + const json = schema.serializeToJSON(); + const defaultsVector = json.defaults["float_list"]!["vector_index"]!; + expect(defaultsVector.config!.space).toBe("ip"); + expect(defaultsVector.config!.hnsw).toEqual({ + ef_construction: 300, + max_neighbors: 64, + }); + + const overrides = json.keys; + expect(overrides).toHaveProperty("embeddings_field"); + expect(overrides).toHaveProperty("tags"); + expect(overrides).toHaveProperty("count"); + expect(overrides).toHaveProperty("price"); + expect(overrides).toHaveProperty(DOCUMENT_KEY); + expect(overrides).toHaveProperty(EMBEDDING_KEY); + + const embeddingsFieldJson = overrides["embeddings_field"]!; + expect(embeddingsFieldJson["sparse_vector"]!["sparse_vector_index"]!.enabled).toBe(true); + expect( + embeddingsFieldJson["sparse_vector"]!["sparse_vector_index"]!.config!.source_key, + ).toBe("text_field"); + expect( + embeddingsFieldJson["sparse_vector"]!["sparse_vector_index"]!.config!.embedding_function, + ).toEqual({ + type: "known", + name: "mock_sparse", + config: { identifier: "sparse_model" }, + }); + expect(Object.keys(embeddingsFieldJson)).toEqual(["sparse_vector"]); + + const tagsJson = overrides["tags"]!; + expect(tagsJson["string"]!["string_inverted_index"]!.enabled).toBe(false); + expect(tagsJson["string"]!["string_inverted_index"]!.config).toEqual({}); + + const countJson = overrides["count"]!; + expect(countJson["int"]!["int_inverted_index"]!.enabled).toBe(false); + expect(countJson["int"]!["int_inverted_index"]!.config).toEqual({}); + + const priceJson = overrides["price"]!; + expect(priceJson["float"]!["float_inverted_index"]!.enabled).toBe(false); + expect(priceJson["float"]!["float_inverted_index"]!.config).toEqual({}); + + const deserialized = Schema.deserializeFromJSON(json); + 
expect(deserialized?.keys["embeddings_field"].sparseVector?.sparseVectorIndex?.enabled).toBe(true); + expect(deserialized?.keys["embeddings_field"].sparseVector?.sparseVectorIndex?.config.sourceKey).toBe("text_field"); + expect(deserialized?.keys["tags"].string?.stringInvertedIndex?.enabled).toBe(false); + expect(deserialized?.keys["count"].intValue?.intInvertedIndex?.enabled).toBe(false); + expect(deserialized?.keys["price"].floatValue?.floatInvertedIndex?.enabled).toBe(false); + expect(deserialized?.keys[EMBEDDING_KEY].floatList?.vectorIndex?.config.space).toBe("ip"); + expect(deserialized?.defaults.string?.stringInvertedIndex?.enabled).toBe(true); + expect(deserialized?.defaults.sparseVector?.sparseVectorIndex?.enabled).toBe(false); + }); + + it("multiple index types on same key", () => { + const schema = new Schema(); + + schema.createIndex(new SparseVectorIndexConfig({ sourceKey: "source" }), "multi_field"); + schema.createIndex(new StringInvertedIndexConfig(), "multi_field"); + + const override = schema.keys["multi_field"]; + expect(override.sparseVector?.sparseVectorIndex?.enabled).toBe(true); + expect(override.string?.stringInvertedIndex?.enabled).toBe(true); + expect(override.floatList).toBeNull(); + expect(override.intValue).toBeNull(); + expect(override.floatValue).toBeNull(); + expect(override.boolean).toBeNull(); + + const json = schema.serializeToJSON(); + const multiFieldJson = json.keys["multi_field"]!; + expect(multiFieldJson["sparse_vector"]!["sparse_vector_index"]!.enabled).toBe(true); + expect(multiFieldJson["string"]!["string_inverted_index"]!.enabled).toBe(true); + + const deserialized = Schema.deserializeFromJSON(json); + const desOverride = deserialized?.keys["multi_field"]; + expect(desOverride?.sparseVector?.sparseVectorIndex?.enabled).toBe(true); + expect(desOverride?.string?.stringInvertedIndex?.enabled).toBe(true); + }); + + it("override then revert to default", () => { + const schema = new Schema(); + const stringConfig = new 
StringInvertedIndexConfig(); + + schema.createIndex(stringConfig, "temp_field"); + expect(schema.keys["temp_field"].string?.stringInvertedIndex?.enabled).toBe(true); + + schema.deleteIndex(stringConfig, "temp_field"); + expect(schema.keys["temp_field"].string?.stringInvertedIndex?.enabled).toBe(false); + + const json = schema.serializeToJSON(); + expect(json.keys["temp_field"]!["string"]!["string_inverted_index"]!.enabled).toBe(false); + + const deserialized = Schema.deserializeFromJSON(json); + expect(deserialized?.keys["temp_field"].string?.stringInvertedIndex?.enabled).toBe(false); + }); + + it("error handling invalid operations", () => { + const schema = new Schema(); + + expect(() => schema.createIndex(new VectorIndexConfig(), EMBEDDING_KEY)).toThrow( + /Cannot create index on special key '#embedding'/, + ); + expect(() => schema.createIndex(new FtsIndexConfig(), DOCUMENT_KEY)).toThrow( + /Cannot create index on special key '#document'/, + ); + expect(() => schema.createIndex()).toThrow( + /Cannot enable all index types globally/, + ); + expect(() => schema.createIndex(undefined, "mykey")).not.toThrow(); + expect(() => schema.deleteIndex(undefined, "mykey")).not.toThrow(); + expect(() => schema.deleteIndex(new VectorIndexConfig())).toThrow( + /Deleting vector index is not currently supported/, + ); + expect(() => schema.deleteIndex(new FtsIndexConfig())).toThrow( + /Deleting FTS index is not currently supported/, + ); + expect(() => schema.createIndex(new VectorIndexConfig(), "custom_field")).toThrow( + /Vector index cannot be enabled on specific keys/, + ); + expect(() => schema.createIndex(new FtsIndexConfig(), "custom_field")).toThrow( + /FTS index cannot be enabled on specific keys/, + ); + }); + + it("empty schema serialization", () => { + const schema = new Schema(); + const json = schema.serializeToJSON(); + + expect(Object.keys(json.defaults)).toEqual( + expect.arrayContaining(["string", "float_list", "sparse_vector", "int", "float", "bool"]), + ); + 
expect(Object.keys(json.keys)).toEqual( + expect.arrayContaining([DOCUMENT_KEY, EMBEDDING_KEY]), + ); + + const deserialized = Schema.deserializeFromJSON(json); + expect(deserialized?.defaults.string?.ftsIndex?.enabled).toBe(false); + expect(deserialized?.keys[EMBEDDING_KEY].floatList?.vectorIndex?.enabled).toBe(true); + }); + + it("multiple serialize deserialize roundtrips", () => { + const schema = new Schema(); + const json1 = schema.serializeToJSON(); + const schema2 = Schema.deserializeFromJSON(json1); + const json2 = schema2?.serializeToJSON(); + const schema3 = json2 ? Schema.deserializeFromJSON(json2) : undefined; + const json3 = schema3?.serializeToJSON(); + + expect(json1).toBeDefined(); + expect(json2).toBeDefined(); + expect(json3).toBeDefined(); + expect(schema3?.defaults.string?.stringInvertedIndex?.enabled).toBe(true); + expect(schema3?.keys[EMBEDDING_KEY].floatList?.vectorIndex?.enabled).toBe(true); + }); + + it("many key overrides stress", () => { + const schema = new Schema(); + + for (let i = 0; i < 50; i += 1) { + const key = `field_${i}`; + if (i === 0) { + schema.createIndex(new SparseVectorIndexConfig({ sourceKey: `source_${i}` }), key); + } else if (i % 2 === 1) { + schema.deleteIndex(new StringInvertedIndexConfig(), key); + } else { + schema.deleteIndex(new IntInvertedIndexConfig(), key); + } + } + + expect(Object.keys(schema.keys)).toHaveLength(52); + expect(schema.keys["field_0"].sparseVector?.sparseVectorIndex?.enabled).toBe(true); + expect(schema.keys["field_1"].string?.stringInvertedIndex?.enabled).toBe(false); + expect(schema.keys["field_2"].intValue?.intInvertedIndex?.enabled).toBe(false); + + const json = schema.serializeToJSON(); + expect(Object.keys(json.keys)).toHaveLength(52); + + const deserialized = Schema.deserializeFromJSON(json); + expect(Object.keys(deserialized!.keys)).toHaveLength(52); + expect( + deserialized!.keys["field_0"].sparseVector?.sparseVectorIndex?.config.sourceKey, + ).toBe("source_0"); + 
expect(deserialized!.keys["field_49"].string?.stringInvertedIndex?.enabled).toBe(false); + expect(deserialized!.keys["field_48"].intValue?.intInvertedIndex?.enabled).toBe(false); + }); + + it("chained operations maintain consistency", () => { + const schema = new Schema(); + + const result = schema + .createIndex(new SparseVectorIndexConfig({ sourceKey: "text" }), "field1") + .deleteIndex(new StringInvertedIndexConfig(), "field2") + .deleteIndex(new StringInvertedIndexConfig(), "field3") + .deleteIndex(new IntInvertedIndexConfig(), "field4"); + + expect(result).toBe(schema); + expect(schema.keys["field1"].sparseVector?.sparseVectorIndex?.enabled).toBe(true); + expect(schema.keys["field2"].string?.stringInvertedIndex?.enabled).toBe(false); + expect(schema.keys["field3"].string?.stringInvertedIndex?.enabled).toBe(false); + expect(schema.keys["field4"].intValue?.intInvertedIndex?.enabled).toBe(false); + }); + + it("float and bool inverted indexes", () => { + const schema = new Schema(); + expect(schema.defaults.floatValue?.floatInvertedIndex?.enabled).toBe(true); + expect(schema.defaults.boolean?.boolInvertedIndex?.enabled).toBe(true); + + schema.deleteIndex(new FloatInvertedIndexConfig()); + expect(schema.defaults.floatValue?.floatInvertedIndex?.enabled).toBe(false); + + schema.deleteIndex(new BoolInvertedIndexConfig()); + expect(schema.defaults.boolean?.boolInvertedIndex?.enabled).toBe(false); + + schema.createIndex(new FloatInvertedIndexConfig(), "price"); + expect(schema.keys["price"].floatValue?.floatInvertedIndex?.enabled).toBe(true); + + schema.deleteIndex(new BoolInvertedIndexConfig(), "is_active"); + expect(schema.keys["is_active"].boolean?.boolInvertedIndex?.enabled).toBe(false); + + const json = schema.serializeToJSON(); + expect(json.defaults["float"]!["float_inverted_index"]!.enabled).toBe(false); + expect(json.defaults["bool"]!["bool_inverted_index"]!.enabled).toBe(false); + 
expect(json.keys["price"]!["float"]!["float_inverted_index"]!.enabled).toBe(true); + expect(json.keys["is_active"]!["bool"]!["bool_inverted_index"]!.enabled).toBe(false); + + const deserialized = Schema.deserializeFromJSON(json); + expect(deserialized?.defaults.floatValue?.floatInvertedIndex?.enabled).toBe(false); + expect(deserialized?.defaults.boolean?.boolInvertedIndex?.enabled).toBe(false); + expect(deserialized?.keys["price"].floatValue?.floatInvertedIndex?.enabled).toBe(true); + expect(deserialized?.keys["is_active"].boolean?.boolInvertedIndex?.enabled).toBe(false); + }); + + it("space inference from embedding function", () => { + const schema = new Schema(); + schema.createIndex(new VectorIndexConfig({ embeddingFunction: new MockEmbedding("space_inference") })); + + const json = schema.serializeToJSON(); + expect(json.defaults["float_list"]!["vector_index"]!.config!.space).toBe("cosine"); + expect(json.keys[EMBEDDING_KEY]!["float_list"]!["vector_index"]!.config!.space).toBe("cosine"); + + const deserialized = Schema.deserializeFromJSON(json); + expect(deserialized?.defaults.floatList?.vectorIndex?.config.space).toBe("cosine"); + expect(deserialized?.keys[EMBEDDING_KEY].floatList?.vectorIndex?.config.space).toBe("cosine"); + }); + + it("explicit space overrides embedding function default", () => { + const schema = new Schema(); + schema.createIndex( + new VectorIndexConfig({ embeddingFunction: new MockEmbedding("override_space"), space: "l2" }), + ); + + const json = schema.serializeToJSON(); + expect(json.defaults["float_list"]!["vector_index"]!.config!.space).toBe("l2"); + expect(json.keys[EMBEDDING_KEY]!["float_list"]!["vector_index"]!.config!.space).toBe("l2"); + + const deserialized = Schema.deserializeFromJSON(json); + expect(deserialized?.defaults.floatList?.vectorIndex?.config.space).toBe("l2"); + expect(deserialized?.keys[EMBEDDING_KEY].floatList?.vectorIndex?.config.space).toBe("l2"); + }); + + it("space inference with no embedding function", () => 
{ + const schema = new Schema(); + schema.createIndex(new VectorIndexConfig({ embeddingFunction: null, space: "ip" })); + + const json = schema.serializeToJSON(); + expect(json.defaults["float_list"]!["vector_index"]!.config!.space).toBe("ip"); + expect(json.defaults["float_list"]!["vector_index"]!.config!.embedding_function!.type).toBe("legacy"); + + const embeddingVector = json.keys[EMBEDDING_KEY]!["float_list"]!["vector_index"]!; + expect(embeddingVector.config!.space).toBe("ip"); + expect(embeddingVector.config!.embedding_function!.type).toBe("legacy"); + + const deserialized = Schema.deserializeFromJSON(json); + expect(deserialized?.defaults.floatList?.vectorIndex?.config.space).toBe("ip"); + expect(deserialized?.defaults.floatList?.vectorIndex?.config.embeddingFunction).toBeNull(); + }); + + it("space inference remains stable across roundtrips", () => { + const schema = new Schema(); + schema.createIndex(new VectorIndexConfig({ embeddingFunction: new MockEmbedding("roundtrip_space") })); + + const json1 = schema.serializeToJSON(); + expect(json1["defaults"]["float_list"]!["vector_index"]!.config!.space).toBe("cosine"); + const schema2 = Schema.deserializeFromJSON(json1); + + const json2 = schema2?.serializeToJSON(); + expect(json2?.["defaults"]["float_list"]!["vector_index"]!.config!.space).toBe("cosine"); + const schema3 = json2 ? 
Schema.deserializeFromJSON(json2) : undefined; + + const json3 = schema3?.serializeToJSON(); + expect(json3?.["defaults"]["float_list"]!["vector_index"]!.config!.space).toBe("cosine"); + expect(schema3?.defaults.floatList?.vectorIndex?.config.space).toBe("cosine"); + }); + + it("key overrides have independent configs", () => { + const schema = new Schema(); + + schema.createIndex(new SparseVectorIndexConfig({ sourceKey: "default_source" }), "field1"); + schema.createIndex(new StringInvertedIndexConfig(), "field2"); + + expect(schema.keys["field1"].sparseVector?.sparseVectorIndex?.config.sourceKey).toBe( + "default_source", + ); + expect(schema.keys["field2"].string?.stringInvertedIndex?.enabled).toBe(true); + + const json = schema.serializeToJSON(); + const deserialized = Schema.deserializeFromJSON(json); + expect(deserialized?.keys["field1"].sparseVector?.sparseVectorIndex?.config.sourceKey).toBe( + "default_source", + ); + expect(deserialized?.keys["field2"].string?.stringInvertedIndex?.enabled).toBe(true); + }); + + it("global default changes do not affect existing overrides", () => { + const schema = new Schema(); + + const initialEf = new MockEmbedding("initial_model"); + schema.createIndex( + new VectorIndexConfig({ + embeddingFunction: initialEf, + space: "cosine", + hnsw: { ef_construction: 100, max_neighbors: 16 }, + }), + ); + + const initialOverride = schema.keys[EMBEDDING_KEY].floatList?.vectorIndex?.config.hnsw; + expect(initialOverride).toEqual({ ef_construction: 100, max_neighbors: 16 }); + + const updatedEf = new MockEmbedding("updated_model"); + schema.createIndex( + new VectorIndexConfig({ + embeddingFunction: updatedEf, + space: "l2", + hnsw: { ef_construction: 200, max_neighbors: 32 }, + }), + ); + + const defaultsVector = schema.defaults.floatList?.vectorIndex; + expect(defaultsVector?.config.space).toBe("l2"); + expect(defaultsVector?.config.hnsw).toEqual({ ef_construction: 200, max_neighbors: 32 }); + + const embeddingVector = 
schema.keys[EMBEDDING_KEY].floatList?.vectorIndex; + expect(embeddingVector?.config.space).toBe("l2"); + expect(embeddingVector?.config.hnsw).toEqual({ ef_construction: 200, max_neighbors: 32 }); + }); + + it("key specific overrides remain independent", () => { + const schema = new Schema(); + + schema.createIndex(new SparseVectorIndexConfig({ sourceKey: "source_a" }), "key_a"); + schema.createIndex(new StringInvertedIndexConfig(), "key_b"); + schema.createIndex(new StringInvertedIndexConfig(), "key_c"); + + expect(schema.keys["key_a"].sparseVector?.sparseVectorIndex?.config.sourceKey).toBe("source_a"); + expect(schema.keys["key_b"].string?.stringInvertedIndex?.enabled).toBe(true); + expect(schema.keys["key_c"].string?.stringInvertedIndex?.enabled).toBe(true); + + schema.deleteIndex(new StringInvertedIndexConfig(), "key_b"); + expect(schema.keys["key_b"].string?.stringInvertedIndex?.enabled).toBe(false); + + const json = schema.serializeToJSON(); + const deserialized = Schema.deserializeFromJSON(json); + expect(deserialized?.keys["key_a"].sparseVector?.sparseVectorIndex?.config.sourceKey).toBe( + "source_a", + ); + expect(deserialized?.keys["key_b"].string?.stringInvertedIndex?.enabled).toBe(false); + expect(deserialized?.keys["key_c"].string?.stringInvertedIndex?.enabled).toBe(true); + }); + + it("global default disable then key enable", () => { + const schema = new Schema(); + expect(schema.defaults.string?.stringInvertedIndex?.enabled).toBe(true); + + schema.deleteIndex(new StringInvertedIndexConfig()); + expect(schema.defaults.string?.stringInvertedIndex?.enabled).toBe(false); + + schema.createIndex(new StringInvertedIndexConfig(), "important_field"); + schema.createIndex(new StringInvertedIndexConfig(), "searchable_field"); + + expect(schema.defaults.string?.stringInvertedIndex?.enabled).toBe(false); + expect(schema.keys["important_field"].string?.stringInvertedIndex?.enabled).toBe(true); + 
expect(schema.keys["searchable_field"].string?.stringInvertedIndex?.enabled).toBe(true); + + const json = schema.serializeToJSON(); + expect(json.keys).toHaveProperty("important_field"); + expect(json.keys).toHaveProperty("searchable_field"); + expect(json.keys).toHaveProperty(DOCUMENT_KEY); + expect(json.keys).toHaveProperty(EMBEDDING_KEY); + expect(json.keys).not.toHaveProperty("other_field"); + }); + + it("partial override fills from defaults", () => { + const schema = new Schema(); + schema.createIndex(new SparseVectorIndexConfig({ sourceKey: "my_source" }), "multi_index_field"); + + const override = schema.keys["multi_index_field"]; + expect(override.sparseVector?.sparseVectorIndex?.enabled).toBe(true); + expect(override.string).toBeNull(); + expect(override.intValue).toBeNull(); + expect(override.floatValue).toBeNull(); + expect(override.boolean).toBeNull(); + expect(override.floatList).toBeNull(); + + const json = schema.serializeToJSON(); + const fieldJson = json.keys["multi_index_field"]; + expect(fieldJson["sparse_vector"]).toBeDefined(); + expect(fieldJson["string"]).toBeUndefined(); + expect(fieldJson["int"]).toBeUndefined(); + expect(fieldJson["float"]).toBeUndefined(); + expect(fieldJson["bool"]).toBeUndefined(); + expect(fieldJson["float_list"]).toBeUndefined(); + + const deserialized = Schema.deserializeFromJSON(json); + const desOverride = deserialized?.keys["multi_index_field"]; + expect(desOverride?.sparseVector?.sparseVectorIndex?.enabled).toBe(true); + expect(desOverride?.string).toBeNull(); + expect(desOverride?.intValue).toBeNull(); + }); + + it("sparse vector cannot be created globally", () => { + const schema = new Schema(); + expect(() => schema.createIndex(new SparseVectorIndexConfig())).toThrow( + /Sparse vector index must be created on a specific key/, + ); + }); + + it("sparse vector cannot be deleted", () => { + const schema = new Schema(); + const config = new SparseVectorIndexConfig(); + schema.createIndex(config, "my_key"); + 
expect(() => schema.deleteIndex(config, "my_key")).toThrow( + /Deleting sparse vector index is not currently supported/, + ); + }); + + it("uses schema embedding function fallback when collection embedding is missing", async () => { + const schema = new Schema(); + const embedding = new MockEmbedding("schema_model"); + schema.createIndex(new VectorIndexConfig({ embeddingFunction: embedding })); + + const collection = new CollectionImpl({ + chromaClient: null as unknown as ChromaClient, + apiClient: {} as any, + id: "test-id", + name: "test", + configuration: {} as CollectionConfiguration, + metadata: undefined as CollectionMetadata | undefined, + embeddingFunction: undefined, + schema, + }); + + const embedFn = (collection as unknown as { + getSchemaEmbeddingFunction: () => EmbeddingFunction | undefined; + }).getSchemaEmbeddingFunction(); + expect(embedFn).toBeDefined(); + const result = await embedFn!.generate(["hello"]); + expect(result).toEqual([[1, 2, 3]]); + }); + + it("sparse auto-embedding with #document source", async () => { + const sparseEf = new DeterministicSparseEmbedding("doc_sparse"); + const schema = new Schema(); + schema.createIndex(new SparseVectorIndexConfig({ + embeddingFunction: sparseEf, + sourceKey: DOCUMENT_KEY, + }), "doc_sparse"); + + let capturedRecords: any = null; + const mockApiClient = { + post: jest.fn().mockImplementation(async (options) => { + capturedRecords = options.body; + return { data: {} }; + }), + }; + + const mockChromaClient = { + getMaxBatchSize: jest.fn().mockResolvedValue(1000), + supportsBase64Encoding: jest.fn().mockResolvedValue(false), + _path: jest.fn().mockResolvedValue({ path: "/api/v1", tenant: "default_tenant", database: "default_database" }), + }; + + const collection = new CollectionImpl({ + chromaClient: mockChromaClient as unknown as ChromaClient, + apiClient: mockApiClient as any, + id: "test-id", + name: "test", + configuration: {} as CollectionConfiguration, + metadata: undefined as CollectionMetadata 
| undefined, + embeddingFunction: undefined, + schema, + }); + + await collection.add({ + ids: ["1", "2"], + documents: ["Hello, world!", "Test document"], + embeddings: [[1, 2, 3], [4, 5, 6]], // Provide dummy embeddings to skip auto-generation + }); + + expect(capturedRecords).not.toBeNull(); + expect(capturedRecords.metadatas).toHaveLength(2); + + // Expected from batch call + const expectedBatch = await sparseEf.generate(["Hello, world!", "Test document"]); + + expect(capturedRecords.metadatas[0]).toHaveProperty("doc_sparse"); + expect(capturedRecords.metadatas[0].doc_sparse).toEqual({ + "#type": "sparse_vector", + ...expectedBatch[0], + }); + + expect(capturedRecords.metadatas[1]).toHaveProperty("doc_sparse"); + expect(capturedRecords.metadatas[1].doc_sparse).toEqual({ + "#type": "sparse_vector", + ...expectedBatch[1], + }); + }); + + it("sparse auto-embedding with metadata field source", async () => { + const sparseEf = new DeterministicSparseEmbedding("content_sparse"); + const schema = new Schema(); + schema.createIndex(new SparseVectorIndexConfig({ + embeddingFunction: sparseEf, + sourceKey: "content", + }), "content_sparse"); + + let capturedRecords: any = null; + const mockApiClient = { + post: jest.fn().mockImplementation(async (options) => { + capturedRecords = options.body; + return { data: {} }; + }), + }; + + const mockChromaClient = { + getMaxBatchSize: jest.fn().mockResolvedValue(1000), + supportsBase64Encoding: jest.fn().mockResolvedValue(false), + _path: jest.fn().mockResolvedValue({ path: "/api/v1", tenant: "default_tenant", database: "default_database" }), + }; + + const collection = new CollectionImpl({ + chromaClient: mockChromaClient as unknown as ChromaClient, + apiClient: mockApiClient as any, + id: "test-id", + name: "test", + configuration: {} as CollectionConfiguration, + metadata: undefined as CollectionMetadata | undefined, + embeddingFunction: undefined, + schema, + }); + + await collection.add({ + ids: ["s1", "s2", "s3"], + 
documents: ["ignored1", "ignored2", "ignored3"], + embeddings: [[1, 2], [3, 4], [5, 6]], // Provide dummy embeddings to skip auto-generation + metadatas: [ + { content: "sparse content one" }, + { content: "sparse content two" }, + { content: "sparse content three" }, + ], + }); + + expect(capturedRecords).not.toBeNull(); + expect(capturedRecords.metadatas).toHaveLength(3); + + // Expected from batch call + const expectedBatch = await sparseEf.generate([ + "sparse content one", + "sparse content two", + "sparse content three", + ]); + + for (let i = 0; i < 3; i++) { + expect(capturedRecords.metadatas[i]).toHaveProperty("content_sparse"); + expect(capturedRecords.metadatas[i]).toHaveProperty("content"); + expect(capturedRecords.metadatas[i].content_sparse).toEqual({ + "#type": "sparse_vector", + ...expectedBatch[i], + }); + } + }); + + it("sparse auto-embedding with mixed metadata null and filled", async () => { + const sparseEf = new DeterministicSparseEmbedding("mixed_sparse"); + const schema = new Schema(); + schema.createIndex(new SparseVectorIndexConfig({ + embeddingFunction: sparseEf, + sourceKey: DOCUMENT_KEY, + }), "mixed_sparse"); + + let capturedRecords: any = null; + const mockApiClient = { + post: jest.fn().mockImplementation(async (options) => { + capturedRecords = options.body; + return { data: {} }; + }), + }; + + const mockChromaClient = { + getMaxBatchSize: jest.fn().mockResolvedValue(1000), + supportsBase64Encoding: jest.fn().mockResolvedValue(false), + _path: jest.fn().mockResolvedValue({ path: "/api/v1", tenant: "default_tenant", database: "default_database" }), + }; + + const collection = new CollectionImpl({ + chromaClient: mockChromaClient as unknown as ChromaClient, + apiClient: mockApiClient as any, + id: "test-id", + name: "test", + configuration: {} as CollectionConfiguration, + metadata: undefined as CollectionMetadata | undefined, + embeddingFunction: undefined, + schema, + }); + + await collection.add({ + ids: ["n1", "n2", "n3", "n4"], 
+ documents: ["doc one", "doc two", "doc three", "doc four"], + embeddings: [[1, 2], [3, 4], [5, 6], [7, 8]], // Provide dummy embeddings to skip auto-generation + metadatas: [ + null as any, + null as any, + { existing: "data" }, + null as any, + ], + }); + + expect(capturedRecords).not.toBeNull(); + expect(capturedRecords.metadatas).toHaveLength(4); + + // Expected from batch call + const expectedBatch = await sparseEf.generate(["doc one", "doc two", "doc three", "doc four"]); + + // All should have sparse embeddings added + for (let i = 0; i < 4; i++) { + expect(capturedRecords.metadatas[i]).toHaveProperty("mixed_sparse"); + expect(capturedRecords.metadatas[i].mixed_sparse).toEqual({ + "#type": "sparse_vector", + ...expectedBatch[i], + }); + } + + // Third one should still have existing data + expect(capturedRecords.metadatas[2].existing).toBe("data"); + }); + + it("sparse auto-embedding skips existing values", async () => { + const sparseEf = new DeterministicSparseEmbedding("preserve"); + const schema = new Schema(); + schema.createIndex(new SparseVectorIndexConfig({ + embeddingFunction: sparseEf, + sourceKey: DOCUMENT_KEY, + }), "preserve_sparse"); + + let capturedRecords: any = null; + const mockApiClient = { + post: jest.fn().mockImplementation(async (options) => { + capturedRecords = options.body; + return { data: {} }; + }), + }; + + const mockChromaClient = { + getMaxBatchSize: jest.fn().mockResolvedValue(1000), + supportsBase64Encoding: jest.fn().mockResolvedValue(false), + _path: jest.fn().mockResolvedValue({ path: "/api/v1", tenant: "default_tenant", database: "default_database" }), + }; + + const collection = new CollectionImpl({ + chromaClient: mockChromaClient as unknown as ChromaClient, + apiClient: mockApiClient as any, + id: "test-id", + name: "test", + configuration: {} as CollectionConfiguration, + metadata: undefined as CollectionMetadata | undefined, + embeddingFunction: undefined, + schema, + }); + + const existingSparse = { indices: [999], 
values: [123.456] }; + + await collection.add({ + ids: ["preserve1", "preserve2"], + documents: ["auto document", "manual document"], + embeddings: [[1, 2], [3, 4]], // Provide dummy embeddings to skip auto-generation + metadatas: [ + null as any, + { preserve_sparse: existingSparse }, + ], + }); + + expect(capturedRecords).not.toBeNull(); + expect(capturedRecords.metadatas).toHaveLength(2); + + // First should have auto-generated embedding (single item batch) + const expectedAuto = await sparseEf.generate(["auto document"]); + expect(capturedRecords.metadatas[0]).toHaveProperty("preserve_sparse"); + expect(capturedRecords.metadatas[0].preserve_sparse).toEqual({ + "#type": "sparse_vector", + ...expectedAuto[0], + }); + + // Second should preserve the manually provided one (already serialized in input) + expect(capturedRecords.metadatas[1].preserve_sparse).toEqual({ + "#type": "sparse_vector", + ...existingSparse, + }); + }); + + it("sparse auto-embedding with missing source field", async () => { + const sparseEf = new DeterministicSparseEmbedding("missing_field"); + const schema = new Schema(); + schema.createIndex(new SparseVectorIndexConfig({ + embeddingFunction: sparseEf, + sourceKey: "text_field", + }), "field_sparse"); + + let capturedRecords: any = null; + const mockApiClient = { + post: jest.fn().mockImplementation(async (options) => { + capturedRecords = options.body; + return { data: {} }; + }), + }; + + const mockChromaClient = { + getMaxBatchSize: jest.fn().mockResolvedValue(1000), + supportsBase64Encoding: jest.fn().mockResolvedValue(false), + _path: jest.fn().mockResolvedValue({ path: "/api/v1", tenant: "default_tenant", database: "default_database" }), + }; + + const collection = new CollectionImpl({ + chromaClient: mockChromaClient as unknown as ChromaClient, + apiClient: mockApiClient as any, + id: "test-id", + name: "test", + configuration: {} as CollectionConfiguration, + metadata: undefined as CollectionMetadata | undefined, + embeddingFunction: 
undefined, + schema, + }); + + await collection.add({ + ids: ["f1", "f2", "f3", "f4"], + documents: ["doc1", "doc2", "doc3", "doc4"], + embeddings: [[1, 2], [3, 4], [5, 6], [7, 8]], // Provide dummy embeddings to skip auto-generation + metadatas: [ + { text_field: "valid text" }, + { text_field: 123 }, + { other_field: "value" }, + null as any, + ], + }); + + expect(capturedRecords).not.toBeNull(); + expect(capturedRecords.metadatas).toHaveLength(4); + + // Only first one should have sparse embedding (single item batch) + const expected = await sparseEf.generate(["valid text"]); + expect(capturedRecords.metadatas[0]).toHaveProperty("field_sparse"); + expect(capturedRecords.metadatas[0].field_sparse).toEqual({ + "#type": "sparse_vector", + ...expected[0], + }); + + // Others should NOT have sparse embedding + expect(capturedRecords.metadatas[1]).not.toHaveProperty("field_sparse"); + expect(capturedRecords.metadatas[2]).not.toHaveProperty("field_sparse"); + expect(capturedRecords.metadatas[3]).toBeNull(); + }); + +});