diff --git a/api-extractor.tokenizer-node.json b/api-extractor.tokenizer-node.json
new file mode 100644
index 000000000..41ef18fb6
--- /dev/null
+++ b/api-extractor.tokenizer-node.json
@@ -0,0 +1,26 @@
+/**
+ * API Extractor configuration for the tokenizer/node subpackage.
+ * This configuration extends the one in api-extractor.json and changes
+ * input and output paths for the tokenizer/node subpath export.
+ */
+{
+  "extends": "./api-extractor.json",
+
+  "mainEntryPointFilePath": "<projectFolder>/dist/src/tokenizer/node.d.ts",
+
+  "apiReport": {
+    "reportFileName": "<unscopedPackageName>-tokenizer-node"
+  },
+
+  "docModel": {
+    "apiJsonFilePath": "<projectFolder>/temp/tokenizer-node/<unscopedPackageName>.api.json"
+  },
+
+  "dtsRollup": {
+    "untrimmedFilePath": "<projectFolder>/dist/tokenizer/node.d.ts"
+  },
+
+  "tsdocMetadata": {
+    "tsdocMetadataFilePath": "<projectFolder>/dist/tokenizer/node-tsdoc-metadata.json"
+  }
+}
diff --git a/api-report/genai-node.api.md b/api-report/genai-node.api.md
index ca6217e56..c8af73221 100644
--- a/api-report/genai-node.api.md
+++ b/api-report/genai-node.api.md
@@ -410,6 +410,11 @@ export class ComputeTokensResponse {
     tokensInfo?: TokensInfo[];
 }
 
+// @public
+export interface ComputeTokensResult {
+    tokensInfo?: TokensInfo[];
+}
+
 // @public
 export interface Content {
     parts?: Part[];
@@ -499,6 +504,11 @@ export class CountTokensResponse {
     totalTokens?: number;
 }
 
+// @public
+export interface CountTokensResult {
+    totalTokens?: number;
+}
+
 // @public
 export interface CreateAuthTokenConfig {
     abortSignal?: AbortSignal;
diff --git a/api-report/genai-tokenizer-node.api.md b/api-report/genai-tokenizer-node.api.md
new file mode 100644
index 000000000..2c1faa95a
--- /dev/null
+++ b/api-report/genai-tokenizer-node.api.md
@@ -0,0 +1,35 @@
+## API Report File for "@google/genai"
+
+> Do not edit this file. It is a report generated by [API Extractor](https://api-extractor.com/).
+
+```ts
+
+// @public
+export interface ComputeTokensResult {
+    tokensInfo?: TokensInfo[];
+}
+
+// @public
+export interface CountTokensResult {
+    totalTokens?: number;
+}
+
+// @public
+export class LocalTokenizer {
+    constructor(modelName: string);
+    computeTokens(contents: ContentListUnion): Promise<ComputeTokensResult>;
+    // Warning: (ae-forgotten-export) The symbol "ContentListUnion" needs to be exported by the entry point node.d.ts
+    // Warning: (ae-forgotten-export) The symbol "CountTokensConfig" needs to be exported by the entry point node.d.ts
+    countTokens(contents: ContentListUnion, config?: CountTokensConfig): Promise<CountTokensResult>;
+}
+
+// @public
+export interface TokensInfo {
+    role?: string;
+    tokenIds?: string[];
+    tokens?: string[];
+}
+
+// (No @packageDocumentation comment for this package)
+
+```
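A minimal consumption sketch for the surface reported above; it mirrors the samples added under sdk-samples/local-tokenizer/node later in this change (the model name is illustrative):

```ts
import {LocalTokenizer} from '@google/genai/tokenizer/node';

// Tokenization runs locally; no API call is made.
const tokenizer = new LocalTokenizer('gemini-2.0-flash-001');
const {totalTokens} = await tokenizer.countTokens('What is your name?');
const {tokensInfo} = await tokenizer.computeTokens('What is your name?');
console.log(totalTokens, tokensInfo?.[0]?.tokenIds);
```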
diff --git a/api-report/genai-tokenizer.api.md b/api-report/genai-tokenizer.api.md
new file mode 100644
index 000000000..5b748db42
--- /dev/null
+++ b/api-report/genai-tokenizer.api.md
@@ -0,0 +1,38 @@
+## API Report File for "@google/genai"
+
+> Do not edit this file. It is a report generated by [API Extractor](https://api-extractor.com/).
+
+```ts
+
+// @public
+export interface ComputeTokensResult {
+    tokensInfo?: TokensInfo[];
+}
+
+// @public
+export interface CountTokensResult {
+    totalTokens?: number;
+}
+
+// Warning: (ae-forgotten-export) The symbol "ILocalTokenizer" needs to be exported by the entry point index.d.ts
+//
+// @public
+export class LocalTokenizer implements ILocalTokenizer {
+    // Warning: (ae-forgotten-export) The symbol "TokenizerPlatform" needs to be exported by the entry point index.d.ts
+    constructor(modelName: string, platform: TokenizerPlatform);
+    computeTokens(contents: ContentListUnion): Promise<ComputeTokensResult>;
+    // Warning: (ae-forgotten-export) The symbol "ContentListUnion" needs to be exported by the entry point index.d.ts
+    // Warning: (ae-forgotten-export) The symbol "CountTokensConfig" needs to be exported by the entry point index.d.ts
+    countTokens(contents: ContentListUnion, config?: CountTokensConfig): Promise<CountTokensResult>;
+}
+
+// @public
+export interface TokensInfo {
+    role?: string;
+    tokenIds?: string[];
+    tokens?: string[];
+}
+
+// (No @packageDocumentation comment for this package)
+
+```
diff --git a/api-report/genai-web.api.md b/api-report/genai-web.api.md
index ca6217e56..c8af73221 100644
--- a/api-report/genai-web.api.md
+++ b/api-report/genai-web.api.md
@@ -410,6 +410,11 @@ export class ComputeTokensResponse {
     tokensInfo?: TokensInfo[];
 }
 
+// @public
+export interface ComputeTokensResult {
+    tokensInfo?: TokensInfo[];
+}
+
 // @public
 export interface Content {
     parts?: Part[];
@@ -499,6 +504,11 @@ export class CountTokensResponse {
     totalTokens?: number;
 }
 
+// @public
+export interface CountTokensResult {
+    totalTokens?: number;
+}
+
 // @public
 export interface CreateAuthTokenConfig {
     abortSignal?: AbortSignal;
diff --git a/api-report/genai.api.md b/api-report/genai.api.md
index ca6217e56..c8af73221 100644
--- a/api-report/genai.api.md
+++ b/api-report/genai.api.md
@@ -410,6 +410,11 @@ export class ComputeTokensResponse {
     tokensInfo?: TokensInfo[];
 }
 
+// @public
+export interface ComputeTokensResult {
+    tokensInfo?: TokensInfo[];
+}
+
 // @public
 export interface Content {
     parts?: Part[];
@@ -499,6 +504,11 @@ export class CountTokensResponse {
     totalTokens?: number;
 }
 
+// @public
+export interface CountTokensResult {
+    totalTokens?: number;
+}
+
 // @public
 export interface CreateAuthTokenConfig {
     abortSignal?: AbortSignal;
diff --git a/package-lock.json b/package-lock.json
index 398aadc7d..bb94bc848 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -10,6 +10,7 @@
     "license": "Apache-2.0",
     "dependencies": {
       "google-auth-library": "^10.3.0",
+      "protobufjs": "^7.5.4",
       "ws": "^8.18.0"
     },
     "devDependencies": {
@@ -34,6 +35,7 @@
       "patch-package": "^8.0.1",
       "prettier": "3.3.3",
       "prettier-plugin-organize-imports": "^4.1.0",
+      "protobufjs-cli": "^1.1.3",
       "rollup-plugin-typescript2": "^0.36.0",
       "test-server-sdk": "^0.2.9",
       "ts-node": "^10.9.2",
@@ -1236,6 +1238,19 @@
         "@jridgewell/sourcemap-codec": "^1.4.14"
       }
     },
+    "node_modules/@jsdoc/salty": {
+      "version": "0.2.9",
+      "resolved": "https://registry.npmjs.org/@jsdoc/salty/-/salty-0.2.9.tgz",
+      "integrity": "sha512-yYxMVH7Dqw6nO0d5NIV8OQWnitU8k6vXH8NtgqAfIa/IUqRMxRv/NUJJ08VEKbAakwxlgBl5PJdrU0dMPStsnw==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "lodash": "^4.17.21"
+      },
+      "engines": {
+        "node": ">=v12.0.0"
+      }
+    },
     "node_modules/@microsoft/api-extractor": {
       "version": "7.55.2",
       "resolved": "https://registry.npmjs.org/@microsoft/api-extractor/-/api-extractor-7.55.2.tgz",
@@ -1424,6 +1439,70 @@
       "url": "https://opencollective.com/unts"
     }
   },
+ "node_modules/@protobufjs/aspromise": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz", + "integrity": "sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/base64": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@protobufjs/base64/-/base64-1.1.2.tgz", + "integrity": "sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/codegen": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/@protobufjs/codegen/-/codegen-2.0.4.tgz", + "integrity": "sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/eventemitter": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/eventemitter/-/eventemitter-1.1.0.tgz", + "integrity": "sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/fetch": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/fetch/-/fetch-1.1.0.tgz", + "integrity": "sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ==", + "license": "BSD-3-Clause", + "dependencies": { + "@protobufjs/aspromise": "^1.1.1", + "@protobufjs/inquire": "^1.1.0" + } + }, + "node_modules/@protobufjs/float": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/@protobufjs/float/-/float-1.0.2.tgz", + "integrity": "sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/inquire": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/inquire/-/inquire-1.1.0.tgz", + "integrity": "sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/path": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@protobufjs/path/-/path-1.1.2.tgz", + "integrity": "sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/pool": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/pool/-/pool-1.1.0.tgz", + "integrity": "sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/utf8": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.0.tgz", + "integrity": "sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw==", + "license": "BSD-3-Clause" + }, "node_modules/@rollup/plugin-json": { "version": "6.1.0", "resolved": "https://registry.npmjs.org/@rollup/plugin-json/-/plugin-json-6.1.0.tgz", @@ -2001,6 +2080,28 @@ "dev": true, "license": "MIT" }, + "node_modules/@types/linkify-it": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/@types/linkify-it/-/linkify-it-5.0.0.tgz", + "integrity": "sha512-sVDA58zAw4eWAffKOaQH5/5j3XeayukzDk+ewSsnv3p4yJEZHCCzMDiZM8e0OUrRvmpGZ85jf4yDHkHsgBNr9Q==", + "dev": true + }, + "node_modules/@types/markdown-it": { + "version": "14.1.2", + 
"resolved": "https://registry.npmjs.org/@types/markdown-it/-/markdown-it-14.1.2.tgz", + "integrity": "sha512-promo4eFwuiW+TfGxhi+0x3czqTYJkG8qB17ZUJiVF10Xm7NLVRSLUsfRTU/6h1e24VvRnXCx+hG7li58lkzog==", + "dev": true, + "dependencies": { + "@types/linkify-it": "^5", + "@types/mdurl": "^2" + } + }, + "node_modules/@types/mdurl": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/@types/mdurl/-/mdurl-2.0.0.tgz", + "integrity": "sha512-RGdgjQUZba5p6QEFAVx2OGb8rQDL/cPRG7GiedRzMcJ1tYnUANBncjbSB1NRGwbvjcPeikRABz2nshyPk1bhWg==", + "dev": true + }, "node_modules/@types/minimist": { "version": "1.2.5", "resolved": "https://registry.npmjs.org/@types/minimist/-/minimist-1.2.5.tgz", @@ -2012,7 +2113,6 @@ "version": "20.19.25", "resolved": "https://registry.npmjs.org/@types/node/-/node-20.19.25.tgz", "integrity": "sha512-ZsJzA5thDQMSQO788d7IocwwQbI8B5OPzmqNvpf3NY/+MHDAS759Wo0gd2WQeXYt5AAAQjzcrTVC6SKCuYgoCQ==", - "dev": true, "license": "MIT", "dependencies": { "undici-types": "~6.21.0" @@ -2033,7 +2133,6 @@ "version": "6.21.0", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==", - "dev": true, "license": "MIT" }, "node_modules/@types/normalize-package-data": { @@ -2684,6 +2783,13 @@ "node": "*" } }, + "node_modules/bluebird": { + "version": "3.7.2", + "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.7.2.tgz", + "integrity": "sha512-XpNj6GDQzdfW+r2Wnn7xiSAd7TM3jzkxGXBGTtWKuSXv1xUV+azxAm8jdWZN06QTQk+2N2XB9jRDkvbmQmcRtg==", + "dev": true, + "license": "MIT" + }, "node_modules/body-parser": { "version": "2.2.1", "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-2.2.1.tgz", @@ -3008,6 +3114,18 @@ ], "license": "CC-BY-4.0" }, + "node_modules/catharsis": { + "version": "0.9.0", + "resolved": "https://registry.npmjs.org/catharsis/-/catharsis-0.9.0.tgz", + "integrity": "sha512-prMTQVpcns/tzFgFVkVp6ak6RykZyWb3gu8ckUpd6YkTlacOd3DXGJjIpD4Q6zJirizvaiAjSSHlOsA+6sNh2A==", + "dev": true, + "dependencies": { + "lodash": "^4.17.15" + }, + "engines": { + "node": ">= 10" + } + }, "node_modules/chalk": { "version": "4.1.2", "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", @@ -3808,6 +3926,93 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/escodegen": { + "version": "1.14.3", + "resolved": "https://registry.npmjs.org/escodegen/-/escodegen-1.14.3.tgz", + "integrity": "sha512-qFcX0XJkdg+PB3xjZZG/wKSuT1PnQWx57+TVSjIMmILd2yC/6ByYElPwJnslDsuWuSAp4AwJGumarAAmJch5Kw==", + "dev": true, + "license": "BSD-2-Clause", + "dependencies": { + "esprima": "^4.0.1", + "estraverse": "^4.2.0", + "esutils": "^2.0.2", + "optionator": "^0.8.1" + }, + "bin": { + "escodegen": "bin/escodegen.js", + "esgenerate": "bin/esgenerate.js" + }, + "engines": { + "node": ">=4.0" + }, + "optionalDependencies": { + "source-map": "~0.6.1" + } + }, + "node_modules/escodegen/node_modules/estraverse": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-4.3.0.tgz", + "integrity": "sha512-39nnKffWz8xN1BU/2c79n9nB9HDzo0niYUqx6xyqUnyoAnQyyWpOTdZEeiCch8BBu515t4wp9ZmgVfVhn9EBpw==", + "dev": true, + "license": "BSD-2-Clause", + "engines": { + "node": ">=4.0" + } + }, + "node_modules/escodegen/node_modules/levn": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/levn/-/levn-0.3.0.tgz", + "integrity": "sha512-0OO4y2iOHix2W6ujICbKIaEQXvFQHue65vUG3pb5EUomzPI90z9hsA1VsO/dbIIpC53J8gxM9Q4Oho0jrCM/yA==", + 
"dev": true, + "license": "MIT", + "dependencies": { + "prelude-ls": "~1.1.2", + "type-check": "~0.3.2" + }, + "engines": { + "node": ">= 0.8.0" + } + }, + "node_modules/escodegen/node_modules/optionator": { + "version": "0.8.3", + "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.8.3.tgz", + "integrity": "sha512-+IW9pACdk3XWmmTXG8m3upGUJst5XRGzxMRjXzAuJ1XnIFNvfhjjIuYkDvysnPQ7qzqVzLt78BCruntqRhWQbA==", + "dev": true, + "license": "MIT", + "dependencies": { + "deep-is": "~0.1.3", + "fast-levenshtein": "~2.0.6", + "levn": "~0.3.0", + "prelude-ls": "~1.1.2", + "type-check": "~0.3.2", + "word-wrap": "~1.2.3" + }, + "engines": { + "node": ">= 0.8.0" + } + }, + "node_modules/escodegen/node_modules/prelude-ls": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/prelude-ls/-/prelude-ls-1.1.2.tgz", + "integrity": "sha512-ESF23V4SKG6lVSGZgYNpbsiaAkdab6ZgOxe52p7+Kid3W3u3bxR4Vfd/o21dmN7jSt0IwgZ4v5MUd26FEtXE9w==", + "dev": true, + "engines": { + "node": ">= 0.8.0" + } + }, + "node_modules/escodegen/node_modules/type-check": { + "version": "0.3.2", + "resolved": "https://registry.npmjs.org/type-check/-/type-check-0.3.2.tgz", + "integrity": "sha512-ZCmOJdvOWDBYJlzAoFkC+Q0+bUyEOS1ltgp1MGU03fqHG+dbi9tBFU2Rd9QKiDZFAYrhPh2JUf7rZRIuHRKtOg==", + "dev": true, + "license": "MIT", + "dependencies": { + "prelude-ls": "~1.1.2" + }, + "engines": { + "node": ">= 0.8.0" + } + }, "node_modules/eslint": { "version": "8.57.0", "resolved": "https://registry.npmjs.org/eslint/-/eslint-8.57.0.tgz", @@ -6360,6 +6565,53 @@ "dev": true, "license": "Python-2.0" }, + "node_modules/js2xmlparser": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/js2xmlparser/-/js2xmlparser-4.0.2.tgz", + "integrity": "sha512-6n4D8gLlLf1n5mNLQPRfViYzu9RATblzPEtm1SthMX1Pjao0r9YI9nw7ZIfRxQMERS87mcswrg+r/OYrPRX6jA==", + "dev": true, + "dependencies": { + "xmlcreate": "^2.0.4" + } + }, + "node_modules/jsdoc": { + "version": "4.0.5", + "resolved": "https://registry.npmjs.org/jsdoc/-/jsdoc-4.0.5.tgz", + "integrity": "sha512-P4C6MWP9yIlMiK8nwoZvxN84vb6MsnXcHuy7XzVOvQoCizWX5JFCBsWIIWKXBltpoRZXddUOVQmCTOZt9yDj9g==", + "dev": true, + "dependencies": { + "@babel/parser": "^7.20.15", + "@jsdoc/salty": "^0.2.1", + "@types/markdown-it": "^14.1.1", + "bluebird": "^3.7.2", + "catharsis": "^0.9.0", + "escape-string-regexp": "^2.0.0", + "js2xmlparser": "^4.0.2", + "klaw": "^3.0.0", + "markdown-it": "^14.1.0", + "markdown-it-anchor": "^8.6.7", + "marked": "^4.0.10", + "mkdirp": "^1.0.4", + "requizzle": "^0.2.3", + "strip-json-comments": "^3.1.0", + "underscore": "~1.13.2" + }, + "bin": { + "jsdoc": "jsdoc.js" + }, + "engines": { + "node": ">=12.0.0" + } + }, + "node_modules/jsdoc/node_modules/escape-string-regexp": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-2.0.0.tgz", + "integrity": "sha512-UpzcLCXolUWcNu5HtVMHYdXJjArjsF9C0aNnquZYY4uW/Vu0miy5YoWvbV345HauVvcAUnpRuhMMcqTcGOY2+w==", + "dev": true, + "engines": { + "node": ">=8" + } + }, "node_modules/jsesc": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-3.1.0.tgz", @@ -6514,6 +6766,16 @@ "node": ">=0.10.0" } }, + "node_modules/klaw": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/klaw/-/klaw-3.0.0.tgz", + "integrity": "sha512-0Fo5oir+O9jnXu5EefYbVK+mHMBeEVEy2cmctR1O1NECcCkPRreJKrS6Qt/j3KC2C148Dfo9i3pCmCMsdqGr0g==", + "dev": true, + "license": "MIT", + "dependencies": { + "graceful-fs": "^4.1.9" + } + }, "node_modules/klaw-sync": { "version": "6.0.0", 
"resolved": "https://registry.npmjs.org/klaw-sync/-/klaw-sync-6.0.0.tgz", @@ -6618,6 +6880,11 @@ "dev": true, "license": "MIT" }, + "node_modules/long": { + "version": "5.3.2", + "resolved": "https://registry.npmjs.org/long/-/long-5.3.2.tgz", + "integrity": "sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==" + }, "node_modules/lru-cache": { "version": "6.0.0", "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz", @@ -6692,6 +6959,17 @@ "markdown-it": "bin/markdown-it.mjs" } }, + "node_modules/markdown-it-anchor": { + "version": "8.6.7", + "resolved": "https://registry.npmjs.org/markdown-it-anchor/-/markdown-it-anchor-8.6.7.tgz", + "integrity": "sha512-FlCHFwNnutLgVTflOYHPW2pPcl2AACqVzExlkGQNsi4CJgqOHN7YTgDd4LuhgN1BFO3TS0vLAruV1Td6dwWPJA==", + "dev": true, + "license": "Unlicense", + "peerDependencies": { + "@types/markdown-it": "*", + "markdown-it": "*" + } + }, "node_modules/markdown-it/node_modules/argparse": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz", @@ -6699,6 +6977,18 @@ "dev": true, "license": "Python-2.0" }, + "node_modules/marked": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/marked/-/marked-4.3.0.tgz", + "integrity": "sha512-PRsaiG84bK+AMvxziE/lCFss8juXjNaWzVbN5tXAm4XjeaS9NAHhop+PjQxz2A9h8Q4M/xGmzP8vqNwy6JeK0A==", + "dev": true, + "bin": { + "marked": "bin/marked.js" + }, + "engines": { + "node": ">= 12" + } + }, "node_modules/math-intrinsics": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz", @@ -8174,6 +8464,103 @@ "node": ">=8" } }, + "node_modules/protobufjs": { + "version": "7.5.4", + "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-7.5.4.tgz", + "integrity": "sha512-CvexbZtbov6jW2eXAvLukXjXUW1TzFaivC46BpWc/3BpcCysb5Vffu+B3XHMm8lVEuy2Mm4XGex8hBSg1yapPg==", + "hasInstallScript": true, + "license": "BSD-3-Clause", + "dependencies": { + "@protobufjs/aspromise": "^1.1.2", + "@protobufjs/base64": "^1.1.2", + "@protobufjs/codegen": "^2.0.4", + "@protobufjs/eventemitter": "^1.1.0", + "@protobufjs/fetch": "^1.1.0", + "@protobufjs/float": "^1.0.2", + "@protobufjs/inquire": "^1.1.0", + "@protobufjs/path": "^1.1.2", + "@protobufjs/pool": "^1.1.0", + "@protobufjs/utf8": "^1.1.0", + "@types/node": ">=13.7.0", + "long": "^5.0.0" + }, + "engines": { + "node": ">=12.0.0" + } + }, + "node_modules/protobufjs-cli": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/protobufjs-cli/-/protobufjs-cli-1.1.3.tgz", + "integrity": "sha512-MqD10lqF+FMsOayFiNOdOGNlXc4iKDCf0ZQPkPR+gizYh9gqUeGTWulABUCdI+N67w5RfJ6xhgX4J8pa8qmMXQ==", + "dev": true, + "license": "BSD-3-Clause", + "dependencies": { + "chalk": "^4.0.0", + "escodegen": "^1.13.0", + "espree": "^9.0.0", + "estraverse": "^5.1.0", + "glob": "^8.0.0", + "jsdoc": "^4.0.0", + "minimist": "^1.2.0", + "semver": "^7.1.2", + "tmp": "^0.2.1", + "uglify-js": "^3.7.7" + }, + "bin": { + "pbjs": "bin/pbjs", + "pbts": "bin/pbts" + }, + "engines": { + "node": ">=12.0.0" + }, + "peerDependencies": { + "protobufjs": "^7.0.0" + } + }, + "node_modules/protobufjs-cli/node_modules/brace-expansion": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.2.tgz", + "integrity": "sha512-Jt0vHyM+jmUBqojB7E1NIYadt0vI0Qxjxd2TErW94wDz+E2LAm5vKMXXwg6ZZBTHPuUlDgQHKXvjGBdfcF1ZDQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "balanced-match": "^1.0.0" + } + }, + 
"node_modules/protobufjs-cli/node_modules/glob": { + "version": "8.1.0", + "resolved": "https://registry.npmjs.org/glob/-/glob-8.1.0.tgz", + "integrity": "sha512-r8hpEjiQEYlF2QU0df3dS+nxxSIreXQS1qRhMJM0Q5NDdR386C7jb7Hwwod8Fgiuex+k0GFjgft18yvxm5XoCQ==", + "deprecated": "Glob versions prior to v9 are no longer supported", + "dev": true, + "license": "ISC", + "dependencies": { + "fs.realpath": "^1.0.0", + "inflight": "^1.0.4", + "inherits": "2", + "minimatch": "^5.0.1", + "once": "^1.3.0" + }, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/protobufjs-cli/node_modules/minimatch": { + "version": "5.1.6", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-5.1.6.tgz", + "integrity": "sha512-lKwV/1brpG6mBUFHtb7NUmtABCb2WZZmm2wNiOA5hAb8VdCS4B3dtMWyvcoViccwAW/COERjXLt0zP1zXUN26g==", + "dev": true, + "license": "ISC", + "dependencies": { + "brace-expansion": "^2.0.1" + }, + "engines": { + "node": ">=10" + } + }, "node_modules/proxy-addr": { "version": "2.0.7", "resolved": "https://registry.npmjs.org/proxy-addr/-/proxy-addr-2.0.7.tgz", @@ -8627,6 +9014,15 @@ "dev": true, "license": "ISC" }, + "node_modules/requizzle": { + "version": "0.2.4", + "resolved": "https://registry.npmjs.org/requizzle/-/requizzle-0.2.4.tgz", + "integrity": "sha512-JRrFk1D4OQ4SqovXOgdav+K8EAhSB/LJZqCz8tbX0KObcdeM15Ss59ozWMBWmmINMagCwmqn4ZNryUGpBsl6Jw==", + "dev": true, + "dependencies": { + "lodash": "^4.17.21" + } + }, "node_modules/resolve": { "version": "1.22.11", "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.11.tgz", @@ -10345,6 +10741,18 @@ "dev": true, "license": "MIT" }, + "node_modules/uglify-js": { + "version": "3.19.3", + "resolved": "https://registry.npmjs.org/uglify-js/-/uglify-js-3.19.3.tgz", + "integrity": "sha512-v3Xu+yuwBXisp6QYTcH4UbH+xYJXqnq2m/LtQVWKWzYc1iehYnLixoQDN9FH6/j9/oybfd6W9Ghwkl8+UMKTKQ==", + "dev": true, + "bin": { + "uglifyjs": "bin/uglifyjs" + }, + "engines": { + "node": ">=0.8.0" + } + }, "node_modules/unbox-primitive": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/unbox-primitive/-/unbox-primitive-1.1.0.tgz", @@ -10364,6 +10772,13 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/underscore": { + "version": "1.13.7", + "resolved": "https://registry.npmjs.org/underscore/-/underscore-1.13.7.tgz", + "integrity": "sha512-GMXzWtsc57XAtguZgaQViUOzs0KTkk8ojr3/xAxXLITqf/3EMwxC0inyETfDFjH/Krbhuep0HNbbjI9i/q3F3g==", + "dev": true, + "license": "MIT" + }, "node_modules/undici": { "version": "7.16.0", "resolved": "https://registry.npmjs.org/undici/-/undici-7.16.0.tgz", @@ -10778,6 +11193,13 @@ } } }, + "node_modules/xmlcreate": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/xmlcreate/-/xmlcreate-2.0.4.tgz", + "integrity": "sha512-nquOebG4sngPmGPICTS5EnxqhKbCmz5Ox5hsszI2T6U5qdrJizBc+0ilYSEjTSzU0yZcmvppztXe/5Al5fUwdg==", + "dev": true, + "license": "Apache-2.0" + }, "node_modules/y18n": { "version": "5.0.8", "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz", diff --git a/package.json b/package.json index e107fe1ab..f8fcc5e0c 100644 --- a/package.json +++ b/package.json @@ -34,6 +34,24 @@ "types": "./dist/node/node.d.ts", "import": "./dist/node/index.mjs", "default": "./dist/node/index.mjs" + }, + "./tokenizer": { + "node": { + "types": "./dist/tokenizer/node.d.ts", + "import": "./dist/tokenizer/node.mjs", + "require": "./dist/tokenizer/node.cjs", + "default": "./dist/tokenizer/node.mjs" + }, + "types": "./dist/tokenizer/node.d.ts", + 
"import": "./dist/tokenizer/node.mjs", + "require": "./dist/tokenizer/node.cjs", + "default": "./dist/tokenizer/node.mjs" + }, + "./tokenizer/node": { + "types": "./dist/tokenizer/node.d.ts", + "import": "./dist/tokenizer/node.mjs", + "require": "./dist/tokenizer/node.cjs", + "default": "./dist/tokenizer/node.mjs" } }, "scripts": { @@ -43,10 +61,12 @@ "api-extractor:dev:main": "api-extractor run --local --verbose", "api-extractor:dev:node": "api-extractor run -c api-extractor.node.json --local --verbose", "api-extractor:dev:web": "api-extractor run -c api-extractor.web.json --local --verbose", + "api-extractor:dev:tokenizer-node": "api-extractor run -c api-extractor.tokenizer-node.json --local --verbose", "api-extractor:prod:main": "api-extractor run --verbose", "api-extractor:prod:node": "api-extractor run -c api-extractor.node.json --verbose", "api-extractor:prod:web": "api-extractor run -c api-extractor.web.json --verbose", - "unit-test": "tsc && jasmine dist/test/unit/**/*_test.js dist/test/unit/*_test.js", + "api-extractor:prod:tokenizer-node": "api-extractor run -c api-extractor.tokenizer-node.json --verbose", + "unit-test": "tsc && cp src/cross/sentencepiece/sentencepiece_model.pb.js dist/src/cross/sentencepiece/ && jasmine dist/test/unit/**/*_test.js dist/test/unit/**/**/*_test.js dist/test/unit/*_test.js", "system-test": "tsc && jasmine dist/test/system/**/*_test.js", "test-server-tests": "tsc && GOOGLE_CLOUD_PROJECT=googcloudproj GOOGLE_CLOUD_LOCATION=googcloudloc jasmine dist/test/system/node/*_test.js -- --test-server", "test-server-tests:record": "tsc && jasmine --fail-fast dist/test/system/node/*_test.js -- --test-server --record", @@ -56,7 +76,8 @@ "format": "prettier '**/*.ts' '**/*.mjs' '**/*.json' --write", "lint": "eslint '**/*.ts'", "lint-fix": "eslint --fix '**/*.ts'", - "coverage-report": "./test/generate_report.sh" + "coverage-report": "./test/generate_report.sh", + "generate-proto": "pbjs -t static-module -w es6 -o src/cross/sentencepiece/sentencepiece_model.pb.js src/cross/sentencepiece/sentencepiece_model.proto && pbts -o src/cross/sentencepiece/sentencepiece_model.pb.d.ts src/cross/sentencepiece/sentencepiece_model.pb.js && sed -i.bak 's/import \\* as \\$protobuf from \"protobufjs\\/minimal\"/import \\$protobuf from \"protobufjs\\/minimal.js\"/' src/cross/sentencepiece/sentencepiece_model.pb.js && rm src/cross/sentencepiece/sentencepiece_model.pb.js.bak" }, "engines": { "node": ">=20.0.0" @@ -76,6 +97,10 @@ "dist/web/index.mjs", "dist/web/index.mjs.map", "dist/web/web.d.ts", + "dist/tokenizer/node.mjs", + "dist/tokenizer/node.cjs", + "dist/tokenizer/node.mjs.map", + "dist/tokenizer/node.d.ts", "node/package.json", "web/package.json" ], @@ -101,6 +126,7 @@ "patch-package": "^8.0.1", "prettier": "3.3.3", "prettier-plugin-organize-imports": "^4.1.0", + "protobufjs-cli": "^1.1.3", "rollup-plugin-typescript2": "^0.36.0", "test-server-sdk": "^0.2.9", "ts-node": "^10.9.2", @@ -116,6 +142,7 @@ }, "dependencies": { "google-auth-library": "^10.3.0", + "protobufjs": "^7.5.4", "ws": "^8.18.0" }, "peerDependencies": { diff --git a/rollup.config.mjs b/rollup.config.mjs index 65053ed53..e3113ae00 100644 --- a/rollup.config.mjs +++ b/rollup.config.mjs @@ -30,6 +30,10 @@ const externalDeps = [ '@modelcontextprotocol/sdk/client/index.js', '@modelcontextprotocol/sdk/types.js', 'path', + 'crypto', + 'os', + 'protobufjs/minimal', + 'protobufjs/minimal.js', ]; export default [ @@ -92,4 +96,28 @@ export default [ plugins: rollupPlugins, external: externalDeps, }, + + // The 
`tokenizer/node` ES module (dist/tokenizer/node.mjs)
+  {
+    input: 'src/tokenizer/node.ts',
+    output: {
+      file: 'dist/tokenizer/node.mjs',
+      format: 'es',
+      sourcemap: true,
+    },
+    plugins: rollupPlugins,
+    external: externalDeps,
+  },
+
+  // The `tokenizer/node` CJS module (dist/tokenizer/node.cjs)
+  {
+    input: 'src/tokenizer/node.ts',
+    output: {
+      file: 'dist/tokenizer/node.cjs',
+      format: 'cjs',
+      sourcemap: true,
+    },
+    plugins: rollupPlugins,
+    external: externalDeps,
+  },
 ];
diff --git a/sdk-samples/count_tokens.ts b/sdk-samples/count_tokens.ts
index 0217734fc..67d80ca1f 100644
--- a/sdk-samples/count_tokens.ts
+++ b/sdk-samples/count_tokens.ts
@@ -28,12 +28,43 @@ async function countTokensFromVertexAI() {
     location: GOOGLE_CLOUD_LOCATION,
   });
 
-  const response = await ai.models.countTokens({
-    model: 'gemini-2.0-flash',
-    contents: 'The quick brown fox jumps over the lazy dog.',
+  // Count tokens in a simple string
+  const result1 = await ai.models.countTokens({
+    model: 'gemini-2.0-flash-001',
+    contents: 'What is your name?',
   });
+  console.log('Input: "What is your name?"');
+  console.log(`Total tokens: ${result1.totalTokens}\n`);
 
-  console.debug(JSON.stringify(response));
+  // Count tokens in a longer text
+  const longText =
+    'The quick brown fox jumps over the lazy dog. This is a sample sentence for tokenization.';
+  const result2 = await ai.models.countTokens({
+    model: 'gemini-2.0-flash-001',
+    contents: longText,
+  });
+  console.log(`Input: "${longText}"`);
+  console.log(`Total tokens: ${result2.totalTokens}\n`);
+
+  // Count tokens with structured content
+  const result3 = await ai.models.countTokens({
+    model: 'gemini-2.0-flash-001',
+    contents: [{role: 'user', parts: [{text: 'Hello, how are you?'}]}],
+  });
+  console.log('Input: User message "Hello, how are you?"');
+  console.log(`Total tokens: ${result3.totalTokens}\n`);
+
+  // Count tokens in a multi-turn conversation
+  const result4 = await ai.models.countTokens({
+    model: 'gemini-2.0-flash-001',
+    contents: [
+      {role: 'user', parts: [{text: 'What is the capital of France?'}]},
+      {role: 'model', parts: [{text: 'The capital of France is Paris.'}]},
+      {role: 'user', parts: [{text: 'What about Spain?'}]},
+    ],
+  });
+  console.log('Input: Multi-turn conversation');
+  console.log(`Total tokens: ${result4.totalTokens}\n`);
 }
 
 async function main() {
diff --git a/sdk-samples/local-tokenizer/node/local_tokenizer_basic.ts b/sdk-samples/local-tokenizer/node/local_tokenizer_basic.ts
new file mode 100644
index 000000000..fabad5e7b
--- /dev/null
+++ b/sdk-samples/local-tokenizer/node/local_tokenizer_basic.ts
@@ -0,0 +1,57 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * Basic example of using LocalTokenizer to count tokens offline.
+ * No API calls are made - tokenization happens locally.
+ */ + +import {LocalTokenizer} from '@google/genai/tokenizer/node'; + +async function basicCountTokens() { + console.log('=== Basic Token Counting ===\n'); + + // Create a local tokenizer for gemini-2.0-flash + const tokenizer = new LocalTokenizer('gemini-2.0-flash-001'); + + // Count tokens in a simple string + const result1 = await tokenizer.countTokens('What is your name?'); + console.log('Input: "What is your name?"'); + console.log(`Total tokens: ${result1.totalTokens}\n`); + + // Count tokens in a longer text + const longText = + 'The quick brown fox jumps over the lazy dog. This is a sample sentence for tokenization.'; + const result2 = await tokenizer.countTokens(longText); + console.log(`Input: "${longText}"`); + console.log(`Total tokens: ${result2.totalTokens}\n`); + + // Count tokens with structured content + const result3 = await tokenizer.countTokens([ + {role: 'user', parts: [{text: 'Hello, how are you?'}]}, + ]); + console.log('Input: User message "Hello, how are you?"'); + console.log(`Total tokens: ${result3.totalTokens}\n`); + + // Count tokens in a multi-turn conversation + const result4 = await tokenizer.countTokens([ + {role: 'user', parts: [{text: 'What is the capital of France?'}]}, + {role: 'model', parts: [{text: 'The capital of France is Paris.'}]}, + {role: 'user', parts: [{text: 'What about Spain?'}]}, + ]); + console.log('Input: Multi-turn conversation'); + console.log(`Total tokens: ${result4.totalTokens}\n`); +} + +async function main() { + try { + await basicCountTokens(); + } catch (error) { + console.error('Error:', error); + } +} + +main(); diff --git a/sdk-samples/local-tokenizer/node/local_tokenizer_compute_tokens.ts b/sdk-samples/local-tokenizer/node/local_tokenizer_compute_tokens.ts new file mode 100644 index 000000000..3dafb0dad --- /dev/null +++ b/sdk-samples/local-tokenizer/node/local_tokenizer_compute_tokens.ts @@ -0,0 +1,60 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +/** + * Example of using LocalTokenizer.computeTokens() to get detailed token information. + * This includes token IDs, bytes (base64 encoded), and roles. + */ + +import {LocalTokenizer} from '@google/genai/tokenizer/node'; + +async function computeDetailedTokens() { + console.log('=== Compute Detailed Token Information ===\n'); + + const tokenizer = new LocalTokenizer('gemini-2.0-flash-001'); + + // Compute tokens for a simple string + const result1 = await tokenizer.computeTokens('Hello world'); + console.log('Input: "Hello world"'); + console.log('Token IDs:', result1.tokensInfo?.[0]?.tokenIds); + console.log('Tokens (base64):', result1.tokensInfo?.[0]?.tokens); + console.log('Role:', result1.tokensInfo?.[0]?.role); + + // Compute tokens for structured content + const result2 = await tokenizer.computeTokens([ + {role: 'user', parts: [{text: 'Explain quantum computing'}]}, + ]); + console.log('Input: "Explain quantum computing"'); + console.log( + `Number of token IDs: ${result2.tokensInfo?.[0]?.tokenIds?.length}\n`, + ); + + // Compute tokens for a multi-turn conversation + const result3 = await tokenizer.computeTokens([ + {role: 'user', parts: [{text: 'Hi'}]}, + {role: 'model', parts: [{text: 'Hello! 
How can I help you?'}]}, + ]); + console.log('Input: Multi-turn conversation'); + console.log(`Number of token segments: ${result3.tokensInfo?.length}`); + if (result3.tokensInfo) { + for (let i = 0; i < result3.tokensInfo.length; i++) { + const info = result3.tokensInfo[i]; + console.log( + ` Segment ${i + 1} (${info.role}): ${info.tokenIds?.length} tokens`, + ); + } + } +} + +async function main() { + try { + await computeDetailedTokens(); + } catch (error) { + console.error('Error:', error); + } +} + +main(); diff --git a/sdk-samples/local-tokenizer/node/local_tokenizer_with_tools.ts b/sdk-samples/local-tokenizer/node/local_tokenizer_with_tools.ts new file mode 100644 index 000000000..04d59a73b --- /dev/null +++ b/sdk-samples/local-tokenizer/node/local_tokenizer_with_tools.ts @@ -0,0 +1,158 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +/** + * Example of using LocalTokenizer with tools, system instructions, and response schemas. + * This shows how to count tokens for complex requests that include function calling. + */ + +import {Type} from '@google/genai/node'; +import {LocalTokenizer} from '@google/genai/tokenizer/node'; + +async function countTokensWithTools() { + console.log('=== Count Tokens with Tools and Configuration ===\n'); + + const tokenizer = new LocalTokenizer('gemini-2.0-flash-001'); + + // Define a tool for function calling + const calculatorTool = { + functionDeclarations: [ + { + name: 'calculate', + description: 'Performs arithmetic calculations on two numbers', + parameters: { + type: Type.OBJECT, + properties: { + operation: { + type: Type.STRING, + description: 'The operation to perform', + enum: ['add', 'subtract', 'multiply', 'divide'], + }, + a: { + type: Type.NUMBER, + description: 'First number', + }, + b: { + type: Type.NUMBER, + description: 'Second number', + }, + }, + required: ['operation', 'a', 'b'], + }, + }, + ], + }; + + // Count tokens with tool definition + const result1 = await tokenizer.countTokens( + [{role: 'user', parts: [{text: 'Calculate 15 + 27'}]}], + { + tools: [calculatorTool], + }, + ); + console.log('Request with calculator tool:'); + console.log(`Total tokens: ${result1.totalTokens}`); + console.log('(includes tool definition tokens)\n'); + + // Count tokens with system instruction + const result2 = await tokenizer.countTokens( + [{role: 'user', parts: [{text: 'What is 5 times 3?'}]}], + { + tools: [calculatorTool], + systemInstruction: + 'You are a helpful calculator assistant. 
Always use the calculate function to perform arithmetic operations.', + }, + ); + console.log('Request with tool and system instruction:'); + console.log(`Total tokens: ${result2.totalTokens}`); + console.log('(includes tool + system instruction tokens)\n'); + + // Count tokens with response schema + const responseSchema = { + type: Type.OBJECT, + properties: { + result: { + type: Type.NUMBER, + description: 'The result of the calculation', + }, + explanation: { + type: Type.STRING, + description: 'Step-by-step explanation', + }, + }, + required: ['result', 'explanation'], + }; + + const result3 = await tokenizer.countTokens( + [{role: 'user', parts: [{text: 'What is 100 divided by 4?'}]}], + { + tools: [calculatorTool], + systemInstruction: 'You are a helpful calculator assistant.', + generationConfig: { + responseSchema: responseSchema, + }, + }, + ); + console.log('Request with tool, system instruction, and response schema:'); + console.log(`Total tokens: ${result3.totalTokens}`); + console.log('(includes all configuration tokens)\n'); +} + +async function countTokensWithFunctionCall() { + console.log('=== Count Tokens with Function Call and Response ===\n'); + + const tokenizer = new LocalTokenizer('gemini-2.0-flash-001'); + + // Count tokens for a conversation with function call + const result = await tokenizer.countTokens([ + {role: 'user', parts: [{text: 'What is the weather in San Francisco?'}]}, + { + role: 'model', + parts: [ + { + functionCall: { + name: 'get_weather', + args: {city: 'San Francisco'}, + }, + }, + ], + }, + { + role: 'user', + parts: [ + { + functionResponse: { + name: 'get_weather', + response: {temperature: 68, condition: 'sunny'}, + }, + }, + ], + }, + { + role: 'model', + parts: [ + { + text: 'The weather in San Francisco is sunny with a temperature of 68°F.', + }, + ], + }, + ]); + + console.log('Multi-turn conversation with function calling:'); + console.log(`Total tokens: ${result.totalTokens}`); + console.log('(includes function call and response tokens)\n'); +} + +async function main() { + try { + await countTokensWithTools(); + await countTokensWithFunctionCall(); + } catch (error) { + console.error('Error:', error); + } +} + +main(); diff --git a/sdk-samples/local-tokenizer/node/package-lock.json b/sdk-samples/local-tokenizer/node/package-lock.json new file mode 100644 index 000000000..ae10427b1 --- /dev/null +++ b/sdk-samples/local-tokenizer/node/package-lock.json @@ -0,0 +1,637 @@ +{ + "name": "local-tokenizer-node-examples", + "version": "1.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "local-tokenizer-node-examples", + "version": "1.0.0", + "license": "Apache-2.0", + "dependencies": { + "@google/genai": "file:../../.." 
+ }, + "devDependencies": { + "tsx": "^4.20.6", + "typescript": "^5.9.3" + } + }, + "../../..": { + "name": "@google/genai", + "version": "1.34.0", + "license": "Apache-2.0", + "dependencies": { + "google-auth-library": "^10.3.0", + "protobufjs": "^7.5.4", + "ws": "^8.18.0" + }, + "devDependencies": { + "@cfworker/json-schema": "^4.1.1", + "@eslint/js": "9.20.0", + "@microsoft/api-extractor": "^7.52.9", + "@modelcontextprotocol/sdk": "^1.24.0", + "@rollup/plugin-json": "^6.1.0", + "@types/jasmine": "^5.1.2", + "@types/node": "^20.9.0", + "@types/node-fetch": "^2.6.13", + "@types/unist": "^3.0.3", + "@types/ws": "^8.5.14", + "c8": "^10.1.3", + "eslint": "8.57.0", + "gts": "^5.2.0", + "jasmine": "^5.5.0", + "jasmine-reporters": "^2.4.0", + "node-fetch": "^3.3.2", + "npm-run-all": "^4.1.5", + "nyc": "^17.1.0", + "patch-package": "^8.0.1", + "prettier": "3.3.3", + "prettier-plugin-organize-imports": "^4.1.0", + "protobufjs-cli": "^1.1.3", + "rollup-plugin-typescript2": "^0.36.0", + "test-server-sdk": "^0.2.9", + "ts-node": "^10.9.2", + "tslib": "^2.8.1", + "tsx": "^4.19.4", + "typedoc": "^0.27.0", + "typescript": "~5.4.0", + "typescript-eslint": "8.24.1", + "undici": "^7.16.0", + "undici-types": "^7.16.0", + "zod": "^3.25.0", + "zod-to-json-schema": "^3.25.0" + }, + "engines": { + "node": ">=20.0.0" + }, + "peerDependencies": { + "@modelcontextprotocol/sdk": "^1.24.0" + }, + "peerDependenciesMeta": { + "@modelcontextprotocol/sdk": { + "optional": true + } + } + }, + "node_modules/@esbuild/aix-ppc64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.25.12.tgz", + "integrity": "sha512-Hhmwd6CInZ3dwpuGTF8fJG6yoWmsToE+vYgD4nytZVxcu1ulHpUQRAB1UJ8+N1Am3Mz4+xOByoQoSZf4D+CpkA==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "aix" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.25.12.tgz", + "integrity": "sha512-VJ+sKvNA/GE7Ccacc9Cha7bpS8nyzVv0jdVgwNDaR4gDMC/2TTRc33Ip8qrNYUcpkOHUT5OZ0bUcNNVZQ9RLlg==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.25.12.tgz", + "integrity": "sha512-6AAmLG7zwD1Z159jCKPvAxZd4y/VTO0VkprYy+3N2FtJ8+BQWFXU+OxARIwA46c5tdD9SsKGZ/1ocqBS/gAKHg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.25.12.tgz", + "integrity": "sha512-5jbb+2hhDHx5phYR2By8GTWEzn6I9UqR11Kwf22iKbNpYrsmRB18aX/9ivc5cabcUiAT/wM+YIZ6SG9QO6a8kg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.25.12.tgz", + "integrity": "sha512-N3zl+lxHCifgIlcMUP5016ESkeQjLj/959RxxNYIthIg+CQHInujFuXeWbWMgnTo4cp5XVHqFPmpyu9J65C1Yg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + 
"node": ">=18" + } + }, + "node_modules/@esbuild/darwin-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.25.12.tgz", + "integrity": "sha512-HQ9ka4Kx21qHXwtlTUVbKJOAnmG1ipXhdWTmNXiPzPfWKpXqASVcWdnf2bnL73wgjNrFXAa3yYvBSd9pzfEIpA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.25.12.tgz", + "integrity": "sha512-gA0Bx759+7Jve03K1S0vkOu5Lg/85dou3EseOGUes8flVOGxbhDDh/iZaoek11Y8mtyKPGF3vP8XhnkDEAmzeg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.25.12.tgz", + "integrity": "sha512-TGbO26Yw2xsHzxtbVFGEXBFH0FRAP7gtcPE7P5yP7wGy7cXK2oO7RyOhL5NLiqTlBh47XhmIUXuGciXEqYFfBQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.25.12.tgz", + "integrity": "sha512-lPDGyC1JPDou8kGcywY0YILzWlhhnRjdof3UlcoqYmS9El818LLfJJc3PXXgZHrHCAKs/Z2SeZtDJr5MrkxtOw==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.25.12.tgz", + "integrity": "sha512-8bwX7a8FghIgrupcxb4aUmYDLp8pX06rGh5HqDT7bB+8Rdells6mHvrFHHW2JAOPZUbnjUpKTLg6ECyzvas2AQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ia32": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.25.12.tgz", + "integrity": "sha512-0y9KrdVnbMM2/vG8KfU0byhUN+EFCny9+8g202gYqSSVMonbsCfLjUO+rCci7pM0WBEtz+oK/PIwHkzxkyharA==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-loong64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.25.12.tgz", + "integrity": "sha512-h///Lr5a9rib/v1GGqXVGzjL4TMvVTv+s1DPoxQdz7l/AYv6LDSxdIwzxkrPW438oUXiDtwM10o9PmwS/6Z0Ng==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-mips64el": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.25.12.tgz", + "integrity": "sha512-iyRrM1Pzy9GFMDLsXn1iHUm18nhKnNMWscjmp4+hpafcZjrr2WbT//d20xaGljXDBYHqRcl8HnxbX6uaA/eGVw==", + "cpu": [ + "mips64el" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ppc64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.25.12.tgz", + "integrity": 
"sha512-9meM/lRXxMi5PSUqEXRCtVjEZBGwB7P/D4yT8UG/mwIdze2aV4Vo6U5gD3+RsoHXKkHCfSxZKzmDssVlRj1QQA==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-riscv64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.25.12.tgz", + "integrity": "sha512-Zr7KR4hgKUpWAwb1f3o5ygT04MzqVrGEGXGLnj15YQDJErYu/BGg+wmFlIDOdJp0PmB0lLvxFIOXZgFRrdjR0w==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-s390x": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.25.12.tgz", + "integrity": "sha512-MsKncOcgTNvdtiISc/jZs/Zf8d0cl/t3gYWX8J9ubBnVOwlk65UIEEvgBORTiljloIWnBzLs4qhzPkJcitIzIg==", + "cpu": [ + "s390x" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.25.12.tgz", + "integrity": "sha512-uqZMTLr/zR/ed4jIGnwSLkaHmPjOjJvnm6TVVitAa08SLS9Z0VM8wIRx7gWbJB5/J54YuIMInDquWyYvQLZkgw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-arm64/-/netbsd-arm64-0.25.12.tgz", + "integrity": "sha512-xXwcTq4GhRM7J9A8Gv5boanHhRa/Q9KLVmcyXHCTaM4wKfIpWkdXiMog/KsnxzJ0A1+nD+zoecuzqPmCRyBGjg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.25.12.tgz", + "integrity": "sha512-Ld5pTlzPy3YwGec4OuHh1aCVCRvOXdH8DgRjfDy/oumVovmuSzWfnSJg+VtakB9Cm0gxNO9BzWkj6mtO1FMXkQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-arm64/-/openbsd-arm64-0.25.12.tgz", + "integrity": "sha512-fF96T6KsBo/pkQI950FARU9apGNTSlZGsv1jZBAlcLL1MLjLNIWPBkj5NlSz8aAzYKg+eNqknrUJ24QBybeR5A==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.25.12.tgz", + "integrity": "sha512-MZyXUkZHjQxUvzK7rN8DJ3SRmrVrke8ZyRusHlP+kuwqTcfWLyqMOE3sScPPyeIXN/mDJIfGXvcMqCgYKekoQw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openharmony-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/openharmony-arm64/-/openharmony-arm64-0.25.12.tgz", + "integrity": "sha512-rm0YWsqUSRrjncSXGA7Zv78Nbnw4XL6/dzr20cyrQf7ZmRcsovpcRBdhD43Nuk3y7XIoW2OxMVvwuRvk9XdASg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + 
"openharmony" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/sunos-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.25.12.tgz", + "integrity": "sha512-3wGSCDyuTHQUzt0nV7bocDy72r2lI33QL3gkDNGkod22EsYl04sMf0qLb8luNKTOmgF/eDEDP5BFNwoBKH441w==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "sunos" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.25.12.tgz", + "integrity": "sha512-rMmLrur64A7+DKlnSuwqUdRKyd3UE7oPJZmnljqEptesKM8wx9J8gx5u0+9Pq0fQQW8vqeKebwNXdfOyP+8Bsg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-ia32": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.25.12.tgz", + "integrity": "sha512-HkqnmmBoCbCwxUKKNPBixiWDGCpQGVsrQfJoVGYLPT41XWF8lHuE5N6WhVia2n4o5QK5M4tYr21827fNhi4byQ==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.25.12.tgz", + "integrity": "sha512-alJC0uCZpTFrSL0CCDjcgleBXPnCrEAhTBILpeAp7M/OFgoqtAetfBzX0xM00MUsVVPpVjlPuMbREqnZCXaTnA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@google/genai": { + "resolved": "../../..", + "link": true + }, + "node_modules/esbuild": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.25.12.tgz", + "integrity": "sha512-bbPBYYrtZbkt6Os6FiTLCTFxvq4tt3JKall1vRwshA3fdVztsLAatFaZobhkBC8/BrPetoa0oksYoKXoG4ryJg==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "bin": { + "esbuild": "bin/esbuild" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "@esbuild/aix-ppc64": "0.25.12", + "@esbuild/android-arm": "0.25.12", + "@esbuild/android-arm64": "0.25.12", + "@esbuild/android-x64": "0.25.12", + "@esbuild/darwin-arm64": "0.25.12", + "@esbuild/darwin-x64": "0.25.12", + "@esbuild/freebsd-arm64": "0.25.12", + "@esbuild/freebsd-x64": "0.25.12", + "@esbuild/linux-arm": "0.25.12", + "@esbuild/linux-arm64": "0.25.12", + "@esbuild/linux-ia32": "0.25.12", + "@esbuild/linux-loong64": "0.25.12", + "@esbuild/linux-mips64el": "0.25.12", + "@esbuild/linux-ppc64": "0.25.12", + "@esbuild/linux-riscv64": "0.25.12", + "@esbuild/linux-s390x": "0.25.12", + "@esbuild/linux-x64": "0.25.12", + "@esbuild/netbsd-arm64": "0.25.12", + "@esbuild/netbsd-x64": "0.25.12", + "@esbuild/openbsd-arm64": "0.25.12", + "@esbuild/openbsd-x64": "0.25.12", + "@esbuild/openharmony-arm64": "0.25.12", + "@esbuild/sunos-x64": "0.25.12", + "@esbuild/win32-arm64": "0.25.12", + "@esbuild/win32-ia32": "0.25.12", + "@esbuild/win32-x64": "0.25.12" + } + }, + "node_modules/fsevents": { + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", + "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": 
"^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/get-tsconfig": { + "version": "4.13.0", + "resolved": "https://registry.npmjs.org/get-tsconfig/-/get-tsconfig-4.13.0.tgz", + "integrity": "sha512-1VKTZJCwBrvbd+Wn3AOgQP/2Av+TfTCOlE4AcRJE72W1ksZXbAx8PPBR9RzgTeSPzlPMHrbANMH3LbltH73wxQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "resolve-pkg-maps": "^1.0.0" + }, + "funding": { + "url": "https://github.com/privatenumber/get-tsconfig?sponsor=1" + } + }, + "node_modules/resolve-pkg-maps": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/resolve-pkg-maps/-/resolve-pkg-maps-1.0.0.tgz", + "integrity": "sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw==", + "dev": true, + "license": "MIT", + "funding": { + "url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1" + } + }, + "node_modules/tsx": { + "version": "4.20.6", + "resolved": "https://registry.npmjs.org/tsx/-/tsx-4.20.6.tgz", + "integrity": "sha512-ytQKuwgmrrkDTFP4LjR0ToE2nqgy886GpvRSpU0JAnrdBYppuY5rLkRUYPU1yCryb24SsKBTL/hlDQAEFVwtZg==", + "dev": true, + "license": "MIT", + "dependencies": { + "esbuild": "~0.25.0", + "get-tsconfig": "^4.7.5" + }, + "bin": { + "tsx": "dist/cli.mjs" + }, + "engines": { + "node": ">=18.0.0" + }, + "optionalDependencies": { + "fsevents": "~2.3.3" + } + }, + "node_modules/typescript": { + "version": "5.9.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", + "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + } + } +} diff --git a/sdk-samples/local-tokenizer/node/package.json b/sdk-samples/local-tokenizer/node/package.json new file mode 100644 index 000000000..f87f06c06 --- /dev/null +++ b/sdk-samples/local-tokenizer/node/package.json @@ -0,0 +1,17 @@ +{ + "name": "local-tokenizer-node-examples", + "version": "1.0.0", + "description": "Node.js examples for using LocalTokenizer from @google/genai/tokenizer", + "type": "module", + "scripts": { + "build": "tsc" + }, + "dependencies": { + "@google/genai": "file:../../.." 
+ }, + "license": "Apache-2.0", + "devDependencies": { + "tsx": "^4.20.6", + "typescript": "^5.9.3" + } +} diff --git a/sdk-samples/local-tokenizer/node/tsconfig.json b/sdk-samples/local-tokenizer/node/tsconfig.json new file mode 100644 index 000000000..21b154ebe --- /dev/null +++ b/sdk-samples/local-tokenizer/node/tsconfig.json @@ -0,0 +1,13 @@ +{ + "compilerOptions": { + "target": "es2020", + "module": "es2020", + "moduleResolution": "bundler", + "outDir": "./build", + "esModuleInterop": true, + "forceConsistentCasingInFileNames": true, + "strict": true, + "skipLibCheck": true + }, + "include": ["*.ts"] +} diff --git a/sdk-samples/package-lock.json b/sdk-samples/package-lock.json index c2113cc12..cef85b50f 100644 --- a/sdk-samples/package-lock.json +++ b/sdk-samples/package-lock.json @@ -22,10 +22,11 @@ }, "..": { "name": "@google/genai", - "version": "1.32.0", + "version": "1.34.0", "license": "Apache-2.0", "dependencies": { "google-auth-library": "^10.3.0", + "protobufjs": "^7.5.4", "ws": "^8.18.0" }, "devDependencies": { @@ -50,6 +51,7 @@ "patch-package": "^8.0.1", "prettier": "3.3.3", "prettier-plugin-organize-imports": "^4.1.0", + "protobufjs-cli": "^1.1.3", "rollup-plugin-typescript2": "^0.36.0", "test-server-sdk": "^0.2.9", "ts-node": "^10.9.2", @@ -886,7 +888,6 @@ "resolved": "https://registry.npmjs.org/express/-/express-4.22.1.tgz", "integrity": "sha512-F2X8g9P1X7uCPZMA3MVf9wcTqlyNp7IhH5qPCI0izhaOIYXaW9L535tGA3qmjRzpH+bZczqq7hVKxTR4NWnu+g==", "license": "MIT", - "peer": true, "dependencies": { "accepts": "~1.3.8", "array-flatten": "1.1.1", @@ -1922,7 +1923,6 @@ "resolved": "https://registry.npmjs.org/zod/-/zod-3.25.76.tgz", "integrity": "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==", "license": "MIT", - "peer": true, "funding": { "url": "https://github.com/sponsors/colinhacks" } diff --git a/sdk-samples/tsconfig.json b/sdk-samples/tsconfig.json index c64b8e18b..d7e3be28f 100644 --- a/sdk-samples/tsconfig.json +++ b/sdk-samples/tsconfig.json @@ -5,8 +5,7 @@ "moduleResolution": "node", "outDir": "./build", "esModuleInterop": true, - "forceConsistentCasingInFileNames": true, - "strict": true + "forceConsistentCasingInFileNames": true }, "include": ["*.ts"] } diff --git a/src/cross/sentencepiece/_model.ts b/src/cross/sentencepiece/_model.ts new file mode 100644 index 000000000..4abf02ddf --- /dev/null +++ b/src/cross/sentencepiece/_model.ts @@ -0,0 +1,56 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +/** + * TypeScript representation of the SentencePiece model protobuf. 
+ * Translated from sentencepiece_model.proto
+ */
+
+export enum ModelType {
+  UNIGRAM = 1,
+  BPE = 2,
+  WORD = 3,
+  CHAR = 4,
+}
+
+export enum SentencePieceType {
+  NORMAL = 1,
+  UNKNOWN = 2,
+  CONTROL = 3,
+  USER_DEFINED = 4,
+  BYTE = 6,
+  UNUSED = 5,
+}
+
+export interface TrainerSpec {
+  modelType?: ModelType;
+  vocabSize?: number;
+  characterCoverage?: number;
+  byteFallback?: boolean;
+  unkSurface?: string;
+  // Add other fields as needed
+}
+
+export interface NormalizerSpec {
+  name?: string;
+  precompiledCharsmap?: Uint8Array;
+  addDummyPrefix?: boolean;
+  removeExtraWhitespaces?: boolean;
+  escapeWhitespaces?: boolean;
+  normalizationRuleTsv?: string;
+}
+
+export interface SentencePiece {
+  piece?: string;
+  score?: number;
+  type?: SentencePieceType;
+}
+
+export interface ModelProto {
+  pieces?: SentencePiece[];
+  trainerSpec?: TrainerSpec;
+  normalizerSpec?: NormalizerSpec;
+}
diff --git a/src/cross/sentencepiece/_prefix_matcher.ts b/src/cross/sentencepiece/_prefix_matcher.ts
new file mode 100644
index 000000000..c49fb1253
--- /dev/null
+++ b/src/cross/sentencepiece/_prefix_matcher.ts
@@ -0,0 +1,82 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * PrefixMatcher helps find longest prefixes using a trie data structure.
+ * Translated from go-sentencepiece/internal/prefixmatcher
+ */
+
+interface TrieNode {
+  children: Map<string, TrieNode>;
+  final: boolean;
+}
+
+function newTrieNode(): TrieNode {
+  return {
+    children: new Map(),
+    final: false,
+  };
+}
+
+/**
+ * PrefixMatcher finds the longest prefix of a string that matches
+ * a vocabulary word using a trie data structure.
+ */
+export class PrefixMatcher {
+  private root: TrieNode;
+
+  /**
+   * Creates a new PrefixMatcher from a set of vocabulary strings.
+   */
+  constructor(vocab: Set<string>) {
+    this.root = newTrieNode();
+    for (const word of vocab) {
+      this.add(word);
+    }
+  }
+
+  /**
+   * Finds the longest prefix of text that matches a vocabulary word.
+   * Returns the length of the prefix, or 0 if no prefix was found.
+   */
+  findPrefixLen(text: string): number {
+    let node = this.root;
+    let maxLen = 0;
+
+    let len = 0;
+    for (const char of text) {
+      const child = node.children.get(char);
+      if (!child) {
+        return maxLen;
+      }
+      // Advance by char.length so the result is in UTF-16 code units and
+      // can be passed directly to String.prototype.substring, even for
+      // characters outside the Basic Multilingual Plane.
+      len += char.length;
+      if (child.final) {
+        maxLen = len;
+      }
+      node = child;
+    }
+
+    return maxLen;
+  }
+
+  /**
+   * Adds a word to the trie.
+   */
+  private add(word: string): void {
+    let node = this.root;
+
+    for (const char of word) {
+      let child = node.children.get(char);
+      if (!child) {
+        child = newTrieNode();
+        node.children.set(char, child);
+      }
+      node = child;
+    }
+
+    node.final = true;
+  }
+}
diff --git a/src/cross/sentencepiece/_priority_queue.ts b/src/cross/sentencepiece/_priority_queue.ts
new file mode 100644
index 000000000..0bf040bc8
--- /dev/null
+++ b/src/cross/sentencepiece/_priority_queue.ts
@@ -0,0 +1,142 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * Generic priority queue with Insert, PopMax, and RemoveFunc operations.
+ * Translated from go-sentencepiece/internal/priorityqueue
+ *
+ * Uses a binary heap data structure where items[0] is unused,
+ * and elements are stored at indices 1...N.
+ */
+export class PriorityQueue<T> {
+  private cmp: (a: T, b: T) => number;
+  private items: T[];
+
+  /**
+   * Creates a new PriorityQueue.
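+   *
+   * Hedged usage sketch (a max-queue of numbers; cmp follows the contract
+   * documented below):
+   *
+   *   const pq = new PriorityQueue<number>(8, (a, b) => a - b);
+   *   pq.insert(3);
+   *   pq.insert(7);
+   *   pq.popMax(); // 7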
+ * + * @param sizeHint Initial capacity hint for the queue + * @param cmp Comparison function that returns > 0 if a has higher priority than b, + * 0 if equal priority, < 0 otherwise + */ + constructor(sizeHint: number, cmp: (a: T, b: T) => number) { + this.cmp = cmp; + this.items = new Array(Math.max(1, sizeHint + 1)); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + this.items[0] = null as any; + this.items.length = 1; + } + + /** + * Returns the number of items in the queue. + */ + len(): number { + return this.items.length - 1; + } + + /** + * Inserts a new element into the priority queue. + */ + insert(elem: T): void { + this.items.push(elem); + this.siftUp(this.items.length - 1); + } + + /** + * Returns and removes the element with the maximal priority. + * Throws if the queue is empty. + */ + popMax(): T { + if (this.items.length < 2) { + throw new Error('popping from empty priority queue'); + } + const maxItem = this.items[1]; + this.items[1] = this.items[this.items.length - 1]; + this.items.pop(); + if (this.items.length > 1) { + this.siftDown(1); + } + return maxItem; + } + + /** + * Removes all elements for which the predicate returns true. + */ + removeFunc(rm: (elem: T) => boolean): void { + let i = 1; + while (i < this.items.length && !rm(this.items[i])) { + i++; + } + if (i === this.items.length) { + return; + } + + for (let j = i + 1; j < this.items.length; j++) { + if (!rm(this.items[j])) { + this.items[i] = this.items[j]; + i++; + } + } + + this.items.length = i; + + this.rebuildHeap(); + } + + /** + * Rebuilds the entire heap from scratch. + */ + private rebuildHeap(): void { + for (let i = Math.floor(this.items.length / 2); i >= 1; i--) { + this.siftDown(i); + } + } + + /** + * Moves an element up the heap until heap property is restored. + */ + private siftUp(n: number): void { + let i = n; + while (i > 1) { + const p = Math.floor(i / 2); + if (this.cmp(this.items[p], this.items[i]) >= 0) { + return; + } + [this.items[i], this.items[p]] = [this.items[p], this.items[i]]; + i = p; + } + } + + /** + * Moves an element down the heap until heap property is restored. + */ + private siftDown(i: number): void { + // eslint-disable-next-line + while (true) { + const c = 2 * i; + if (c >= this.items.length) { + return; + } + + let maxChild = c; + if (c + 1 < this.items.length) { + if (this.cmp(this.items[c + 1], this.items[c]) > 0) { + maxChild = c + 1; + } + } + + if (this.cmp(this.items[i], this.items[maxChild]) >= 0) { + return; + } + + [this.items[i], this.items[maxChild]] = [ + this.items[maxChild], + this.items[i], + ]; + i = maxChild; + } + } +} diff --git a/src/cross/sentencepiece/_processor.ts b/src/cross/sentencepiece/_processor.ts new file mode 100644 index 000000000..41230baf0 --- /dev/null +++ b/src/cross/sentencepiece/_processor.ts @@ -0,0 +1,464 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +/** + * Pure TypeScript implementation of SentencePiece tokenizer. 
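+ *
+ * Hedged usage sketch (`modelBytes` stands for the raw bytes of a BPE
+ * SentencePiece model file, loaded by the caller):
+ *
+ *   const processor = new SentencePieceProcessor(modelBytes);
+ *   const tokens = processor.encode('hello world');
+ *   const roundTrip = processor.decodeTokens(tokens);
+ *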
+ * Translated from github.com/eliben/go-sentencepiece
+ */
+
+import {ModelProto, ModelType, SentencePieceType} from './_model.js';
+import {PrefixMatcher} from './_prefix_matcher.js';
+import {PriorityQueue} from './_priority_queue.js';
+import {sentencepiece} from './sentencepiece_model.pb.js';
+
+const WHITESPACE_SEPARATOR = '▁';
+
+/** Token represents a single token from the input text */
+export interface Token {
+  id: number;
+  text: string;
+}
+
+/** Model information */
+export interface ModelInfo {
+  vocabularySize: number;
+  beginningOfSentenceID: number;
+  endOfSentenceID: number;
+  unknownID: number;
+  padID: number;
+}
+
+const SYMBOL_BOS = '<s>';
+const SYMBOL_EOS = '</s>';
+const SYMBOL_PAD = '<pad>';
+
+/**
+ * Processor represents a SentencePiece processor (tokenizer).
+ * It converts input text into a sequence of tokens and back.
+ */
+export class SentencePieceProcessor {
+  private model: ModelProto;
+  private pieces: Map<string, number>;
+  private reserved: Map<string, number>;
+  private unknownID: number;
+  private userDefinedMatcher: PrefixMatcher;
+  private byte2Token: Map<number, Token>;
+  private idToByte: Map<number, number>;
+  private maxPieceLength: number;
+
+  /**
+   * Creates a new Processor from model proto bytes.
+   */
+  constructor(modelProtoBytes: Uint8Array) {
+    this.model = this.parseModelProto(modelProtoBytes);
+
+    const tspec = this.model.trainerSpec;
+    if (!tspec || tspec.modelType !== ModelType.BPE) {
+      throw new Error(
+        `Model type ${tspec?.modelType} not supported, only BPE is supported`,
+      );
+    }
+
+    const nspec = this.model.normalizerSpec;
+    if (nspec?.addDummyPrefix || nspec?.removeExtraWhitespaces) {
+      throw new Error(
+        `Normalizer spec options not supported: ${JSON.stringify(nspec)}`,
+      );
+    }
+
+    const userDefined = new Set<string>();
+    this.pieces = new Map();
+    this.reserved = new Map();
+    this.byte2Token = new Map();
+    this.idToByte = new Map();
+    this.unknownID = -1;
+    this.maxPieceLength = 0;
+
+    if (!this.model.pieces) {
+      throw new Error('Model has no pieces');
+    }
+
+    for (let i = 0; i < this.model.pieces.length; i++) {
+      const piece = this.model.pieces[i];
+      const pieceText = piece.piece ?? '';
+      const pieceType = piece.type ?? SentencePieceType.NORMAL;
+
+      const isNormalPiece =
+        pieceType === SentencePieceType.NORMAL ||
+        pieceType === SentencePieceType.USER_DEFINED ||
+        pieceType === SentencePieceType.UNUSED;
+
+      if (isNormalPiece) {
+        this.pieces.set(pieceText, i);
+        this.maxPieceLength = Math.max(this.maxPieceLength, pieceText.length);
+      } else {
+        this.reserved.set(pieceText, i);
+      }
+
+      if (pieceType === SentencePieceType.USER_DEFINED) {
+        userDefined.add(pieceText);
+      } else if (pieceType === SentencePieceType.UNKNOWN) {
+        if (this.unknownID >= 0) {
+          throw new Error('unk redefined');
+        }
+        this.unknownID = i;
+      } else if (pieceType === SentencePieceType.BYTE) {
+        if (!tspec.byteFallback) {
+          throw new Error(
+            `byte piece "${pieceText}" found although byte_fallback=false`,
+          );
+        }
+        const bv = convertHexValue(pieceText);
+        if (bv >= 0 && bv < 256) {
+          this.byte2Token.set(bv, {id: i, text: pieceText});
+          this.idToByte.set(i, bv);
+        }
+      }
+    }
+
+    if (this.unknownID < 0) {
+      throw new Error('unk symbol is not defined');
+    }
+
+    // If byte_fallback is specified, ensure all 256 byte values are present
+    if (tspec.byteFallback) {
+      for (let i = 0; i < 256; i++) {
+        if (!this.byte2Token.has(i)) {
+          throw new Error(
+            `byte value 0x${i.toString(16).padStart(2, '0')} not found`,
+          );
+        }
+      }
+    }
+
+    this.userDefinedMatcher = new PrefixMatcher(userDefined);
+  }
+
+  /**
+   * Encodes text into a list of tokens.
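+   *
+   * The implementation follows classic BPE merging: the input is split into
+   * single-character symbols held in a doubly linked list, and adjacent
+   * pairs that form a known piece are merged greedily in descending score
+   * order via a priority queue of merge candidates.
+   *
+   * Hedged example (token ids depend entirely on the loaded model):
+   *
+   *   processor.encode('hi'); // e.g. [{id: 544, text: 'hi'}]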
+   */
+  encode(text: string): Token[] {
+    text = this.normalize(text);
+
+    // Symbol list element type
+    interface SymListElem {
+      prev: number;
+      next: number;
+      noMerge: boolean;
+      symbol: string;
+    }
+
+    const symList: SymListElem[] = [];
+
+    while (text.length > 0) {
+      const [slen, found] = this.symbolMatch(text);
+
+      const sym: SymListElem = {
+        noMerge: found,
+        symbol: text.substring(0, slen),
+        prev: symList.length - 1,
+        next: symList.length + 1,
+      };
+      symList.push(sym);
+
+      text = text.substring(slen);
+    }
+
+    if (symList.length === 0) {
+      return [];
+    }
+
+    symList[symList.length - 1].next = -1;
+    let _nTokens = symList.length;
+
+    interface MergeCandidate {
+      left: number;
+      right: number;
+      length: number;
+      score: number;
+    }
+
+    const mergeQueue = new PriorityQueue<MergeCandidate>(
+      symList.length,
+      (a, b) => {
+        if (a.score > b.score || (a.score === b.score && a.left < b.left)) {
+          return 1;
+        }
+        return -1;
+      },
+    );
+
+    const findMerged = (
+      x: SymListElem,
+      y: SymListElem,
+    ): [string, number, boolean] => {
+      const merged = x.symbol + y.symbol;
+      const id = this.pieces.get(merged);
+      if (id !== undefined && this.model.pieces) {
+        return [this.model.pieces[id].piece ?? '', id, true];
+      }
+      return ['', 0, false];
+    };
+
+    const suggestNewMergePair = (left: number, right: number) => {
+      if (
+        left === -1 ||
+        right === -1 ||
+        symList[left].noMerge ||
+        symList[right].noMerge
+      ) {
+        return;
+      }
+
+      const [mergedSymbol, id, ok] = findMerged(symList[left], symList[right]);
+      if (ok && this.model.pieces) {
+        mergeQueue.insert({
+          left,
+          right,
+          length: mergedSymbol.length,
+          score: this.model.pieces[id].score ?? 0,
+        });
+      }
+    };
+
+    for (let i = 1; i < symList.length; i++) {
+      suggestNewMergePair(i - 1, i);
+    }
+
+    const candidateIsDead = (candidate: MergeCandidate): boolean => {
+      const leftSymbol = symList[candidate.left].symbol;
+      const rightSymbol = symList[candidate.right].symbol;
+      return (
+        leftSymbol === '' ||
+        rightSymbol === '' ||
+        leftSymbol.length + rightSymbol.length !== candidate.length
+      );
+    };
+
+    let mergeQueueDead = 0;
+    while (mergeQueue.len() > 0) {
+      const candidate = mergeQueue.popMax();
+      const leftSymbol = symList[candidate.left];
+      const rightSymbol = symList[candidate.right];
+
+      if (candidateIsDead(candidate)) {
+        mergeQueueDead--;
+        continue;
+      }
+
+      if (mergeQueueDead * 3 > mergeQueue.len()) {
+        mergeQueue.removeFunc(candidateIsDead);
+        mergeQueueDead = 0;
+      }
+
+      const [mergedSymbol, , ok] = findMerged(leftSymbol, rightSymbol);
+      if (!ok) {
+        throw new Error('failed to merge symbols');
+      }
+      symList[candidate.left].symbol = mergedSymbol;
+      _nTokens--;
+
+      symList[candidate.left].next = rightSymbol.next;
+      if (rightSymbol.next >= 0) {
+        symList[rightSymbol.next].prev = candidate.left;
+      }
+
+      symList[candidate.right].symbol = '';
+      mergeQueueDead++;
+
+      suggestNewMergePair(leftSymbol.prev, candidate.left);
+      suggestNewMergePair(candidate.left, rightSymbol.next);
+    }
+
+    const tokens: Token[] = [];
+    for (let i = 0; i >= 0; i = symList[i].next) {
+      const symbol = symList[i].symbol;
+      const id = this.symbolToID(symbol);
+      if (id === this.unknownID && this.model.trainerSpec?.byteFallback) {
+        // Need to convert byte to token at UTF-8 bytes level
+        const bytes = new TextEncoder().encode(symbol);
+        for (let j = 0; j < bytes.length; j++) {
+          const byteToken = this.byte2Token.get(bytes[j]);
+          if (byteToken) {
+            tokens.push(byteToken);
+          }
+        }
+      } else {
+        tokens.push({id, text: symbol});
+      }
+    }
+
+    return tokens;
+  }
+
+  /**
+   * Decodes a list of token IDs
back into text. + */ + decode(ids: number[]): string { + const parts: string[] = []; + + let i = 0; + while (i < ids.length) { + let nextNonByte = i; + while (nextNonByte < ids.length && this.isByteID(ids[nextNonByte])) { + nextNonByte++; + } + const numBytes = nextNonByte - i; + + if (numBytes > 0) { + const bytes: number[] = []; + for (let bi = i; bi < nextNonByte; bi++) { + const byte = this.idToByte.get(ids[bi]); + if (byte !== undefined) { + bytes.push(byte); + } + } + + const textDecoder = new TextDecoder('utf-8', {fatal: false}); + const text = textDecoder.decode(new Uint8Array(bytes)); + parts.push(text); + } + + if (nextNonByte >= ids.length) { + break; + } + + const id = ids[nextNonByte]; + // eslint-disable-next-line no-empty + if (this.isControlID(id)) { + } else if (id === this.unknownID) { + parts.push(this.model.trainerSpec?.unkSurface ?? ''); + } else if (this.model.pieces && this.model.pieces[id]) { + const piece = this.model.pieces[id].piece ?? ''; + parts.push(this.replaceSeparatorsBySpace(piece)); + } + i = nextNonByte + 1; + } + + return parts.join(''); + } + + /** + * Decodes a list of tokens back into text. + */ + decodeTokens(tokens: Token[]): string { + return this.decode(tokens.map((t) => t.id)); + } + + /** + * Returns information about the loaded model. + */ + modelInfo(): ModelInfo { + const getControlID = (symbol: string): number => { + const id = this.symbolToID(symbol); + return this.isControlID(id) ? id : -1; + }; + + return { + vocabularySize: this.model.pieces?.length ?? 0, + beginningOfSentenceID: getControlID(SYMBOL_BOS), + endOfSentenceID: getControlID(SYMBOL_EOS), + padID: getControlID(SYMBOL_PAD), + unknownID: this.unknownID, + }; + } + + private normalize(text: string): string { + return text.replace(/ /g, WHITESPACE_SEPARATOR); + } + + private replaceSeparatorsBySpace(text: string): string { + return text.replace(new RegExp(WHITESPACE_SEPARATOR, 'g'), ' '); + } + + private symbolMatch(text: string): [number, boolean] { + const prefixLen = this.userDefinedMatcher.findPrefixLen(text); + if (prefixLen > 0) { + return [prefixLen, true]; + } + + // Return character length (1), not byte length + // This matches the Java implementation where i++ advances by 1 character + return [1, false]; + } + + private symbolToID(symbol: string): number { + const reservedID = this.reserved.get(symbol); + if (reservedID !== undefined) { + return reservedID; + } + const pieceID = this.pieces.get(symbol); + if (pieceID !== undefined) { + return pieceID; + } + return this.unknownID; + } + + private isByteID(id: number): boolean { + if (!this.model.pieces || id >= this.model.pieces.length) { + return false; + } + return this.model.pieces[id].type === SentencePieceType.BYTE; + } + + private isControlID(id: number): boolean { + if (!this.model.pieces || id >= this.model.pieces.length) { + return false; + } + return this.model.pieces[id].type === SentencePieceType.CONTROL; + } + + private parseModelProto(data: Uint8Array): ModelProto { + const decoded = sentencepiece.ModelProto.decode(data); + + const model: ModelProto = { + pieces: decoded.pieces?.map( + (p: sentencepiece.ModelProto.ISentencePiece) => ({ + piece: p.piece ?? undefined, + score: p.score ?? undefined, + type: (p.type as unknown as SentencePieceType) ?? undefined, + }), + ), + trainerSpec: decoded.trainerSpec + ? { + modelType: + (decoded.trainerSpec.modelType as unknown as ModelType) ?? + undefined, + vocabSize: decoded.trainerSpec.vocabSize ?? 
undefined, + characterCoverage: + decoded.trainerSpec.characterCoverage ?? undefined, + byteFallback: decoded.trainerSpec.byteFallback ?? undefined, + unkSurface: decoded.trainerSpec.unkSurface ?? undefined, + } + : undefined, + normalizerSpec: decoded.normalizerSpec + ? { + name: decoded.normalizerSpec.name ?? undefined, + precompiledCharsmap: decoded.normalizerSpec.precompiledCharsmap + ? new Uint8Array(decoded.normalizerSpec.precompiledCharsmap) + : undefined, + addDummyPrefix: decoded.normalizerSpec.addDummyPrefix ?? undefined, + removeExtraWhitespaces: + decoded.normalizerSpec.removeExtraWhitespaces ?? undefined, + escapeWhitespaces: + decoded.normalizerSpec.escapeWhitespaces ?? undefined, + normalizationRuleTsv: + decoded.normalizerSpec.normalizationRuleTsv ?? undefined, + } + : undefined, + }; + + return model; + } +} + +function convertHexValue(bv: string): number { + const match = bv.match(/^<0x([0-9A-Fa-f]{2})>$/); + if (!match) { + return -1; + } + return parseInt(match[1], 16); +} diff --git a/src/cross/sentencepiece/sentencepiece_model.pb.d.ts b/src/cross/sentencepiece/sentencepiece_model.pb.d.ts new file mode 100644 index 000000000..87574e6a3 --- /dev/null +++ b/src/cross/sentencepiece/sentencepiece_model.pb.d.ts @@ -0,0 +1,1058 @@ +/* eslint-disable */ + +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import * as $protobuf from 'protobufjs'; +import Long = require('long'); +/** Namespace sentencepiece. */ +export namespace sentencepiece { + /** Properties of a TrainerSpec. */ + interface ITrainerSpec { + /** TrainerSpec input */ + input?: string[] | null; + + /** TrainerSpec inputFormat */ + inputFormat?: string | null; + + /** TrainerSpec modelPrefix */ + modelPrefix?: string | null; + + /** TrainerSpec modelType */ + modelType?: sentencepiece.TrainerSpec.ModelType | null; + + /** TrainerSpec vocabSize */ + vocabSize?: number | null; + + /** TrainerSpec acceptLanguage */ + acceptLanguage?: string[] | null; + + /** TrainerSpec selfTestSampleSize */ + selfTestSampleSize?: number | null; + + /** TrainerSpec enableDifferentialPrivacy */ + enableDifferentialPrivacy?: boolean | null; + + /** TrainerSpec differentialPrivacyNoiseLevel */ + differentialPrivacyNoiseLevel?: number | null; + + /** TrainerSpec differentialPrivacyClippingThreshold */ + differentialPrivacyClippingThreshold?: number | Long | null; + + /** TrainerSpec characterCoverage */ + characterCoverage?: number | null; + + /** TrainerSpec inputSentenceSize */ + inputSentenceSize?: number | Long | null; + + /** TrainerSpec shuffleInputSentence */ + shuffleInputSentence?: boolean | null; + + /** TrainerSpec miningSentenceSize */ + miningSentenceSize?: number | null; + + /** TrainerSpec trainingSentenceSize */ + trainingSentenceSize?: number | null; + + /** TrainerSpec seedSentencepieceSize */ + seedSentencepieceSize?: number | null; + + /** TrainerSpec shrinkingFactor */ + shrinkingFactor?: number | null; + + /** TrainerSpec maxSentenceLength */ + maxSentenceLength?: number | null; + + /** TrainerSpec numThreads */ + numThreads?: number | null; + + /** TrainerSpec numSubIterations */ + numSubIterations?: number | null; + + /** TrainerSpec maxSentencepieceLength */ + maxSentencepieceLength?: number | null; + + /** TrainerSpec splitByUnicodeScript */ + splitByUnicodeScript?: boolean | null; + + /** TrainerSpec splitByNumber */ + splitByNumber?: boolean | null; + + /** TrainerSpec splitByWhitespace */ + splitByWhitespace?: boolean | null; + + /** TrainerSpec 
treatWhitespaceAsSuffix */ + treatWhitespaceAsSuffix?: boolean | null; + + /** TrainerSpec allowWhitespaceOnlyPieces */ + allowWhitespaceOnlyPieces?: boolean | null; + + /** TrainerSpec splitDigits */ + splitDigits?: boolean | null; + + /** TrainerSpec pretokenizationDelimiter */ + pretokenizationDelimiter?: string | null; + + /** TrainerSpec controlSymbols */ + controlSymbols?: string[] | null; + + /** TrainerSpec userDefinedSymbols */ + userDefinedSymbols?: string[] | null; + + /** TrainerSpec requiredChars */ + requiredChars?: string | null; + + /** TrainerSpec byteFallback */ + byteFallback?: boolean | null; + + /** TrainerSpec vocabularyOutputPieceScore */ + vocabularyOutputPieceScore?: boolean | null; + + /** TrainerSpec hardVocabLimit */ + hardVocabLimit?: boolean | null; + + /** TrainerSpec useAllVocab */ + useAllVocab?: boolean | null; + + /** TrainerSpec unkId */ + unkId?: number | null; + + /** TrainerSpec bosId */ + bosId?: number | null; + + /** TrainerSpec eosId */ + eosId?: number | null; + + /** TrainerSpec padId */ + padId?: number | null; + + /** TrainerSpec unkPiece */ + unkPiece?: string | null; + + /** TrainerSpec bosPiece */ + bosPiece?: string | null; + + /** TrainerSpec eosPiece */ + eosPiece?: string | null; + + /** TrainerSpec padPiece */ + padPiece?: string | null; + + /** TrainerSpec unkSurface */ + unkSurface?: string | null; + + /** TrainerSpec trainExtremelyLargeCorpus */ + trainExtremelyLargeCorpus?: boolean | null; + + /** TrainerSpec seedSentencepiecesFile */ + seedSentencepiecesFile?: string | null; + } + + /** Represents a TrainerSpec. */ + class TrainerSpec implements ITrainerSpec { + /** + * Constructs a new TrainerSpec. + * @param [properties] Properties to set + */ + constructor(properties?: sentencepiece.ITrainerSpec); + + /** TrainerSpec input. */ + public input: string[]; + + /** TrainerSpec inputFormat. */ + public inputFormat: string; + + /** TrainerSpec modelPrefix. */ + public modelPrefix: string; + + /** TrainerSpec modelType. */ + public modelType: sentencepiece.TrainerSpec.ModelType; + + /** TrainerSpec vocabSize. */ + public vocabSize: number; + + /** TrainerSpec acceptLanguage. */ + public acceptLanguage: string[]; + + /** TrainerSpec selfTestSampleSize. */ + public selfTestSampleSize: number; + + /** TrainerSpec enableDifferentialPrivacy. */ + public enableDifferentialPrivacy: boolean; + + /** TrainerSpec differentialPrivacyNoiseLevel. */ + public differentialPrivacyNoiseLevel: number; + + /** TrainerSpec differentialPrivacyClippingThreshold. */ + public differentialPrivacyClippingThreshold: number | Long; + + /** TrainerSpec characterCoverage. */ + public characterCoverage: number; + + /** TrainerSpec inputSentenceSize. */ + public inputSentenceSize: number | Long; + + /** TrainerSpec shuffleInputSentence. */ + public shuffleInputSentence: boolean; + + /** TrainerSpec miningSentenceSize. */ + public miningSentenceSize: number; + + /** TrainerSpec trainingSentenceSize. */ + public trainingSentenceSize: number; + + /** TrainerSpec seedSentencepieceSize. */ + public seedSentencepieceSize: number; + + /** TrainerSpec shrinkingFactor. */ + public shrinkingFactor: number; + + /** TrainerSpec maxSentenceLength. */ + public maxSentenceLength: number; + + /** TrainerSpec numThreads. */ + public numThreads: number; + + /** TrainerSpec numSubIterations. */ + public numSubIterations: number; + + /** TrainerSpec maxSentencepieceLength. */ + public maxSentencepieceLength: number; + + /** TrainerSpec splitByUnicodeScript. 
*/ + public splitByUnicodeScript: boolean; + + /** TrainerSpec splitByNumber. */ + public splitByNumber: boolean; + + /** TrainerSpec splitByWhitespace. */ + public splitByWhitespace: boolean; + + /** TrainerSpec treatWhitespaceAsSuffix. */ + public treatWhitespaceAsSuffix: boolean; + + /** TrainerSpec allowWhitespaceOnlyPieces. */ + public allowWhitespaceOnlyPieces: boolean; + + /** TrainerSpec splitDigits. */ + public splitDigits: boolean; + + /** TrainerSpec pretokenizationDelimiter. */ + public pretokenizationDelimiter: string; + + /** TrainerSpec controlSymbols. */ + public controlSymbols: string[]; + + /** TrainerSpec userDefinedSymbols. */ + public userDefinedSymbols: string[]; + + /** TrainerSpec requiredChars. */ + public requiredChars: string; + + /** TrainerSpec byteFallback. */ + public byteFallback: boolean; + + /** TrainerSpec vocabularyOutputPieceScore. */ + public vocabularyOutputPieceScore: boolean; + + /** TrainerSpec hardVocabLimit. */ + public hardVocabLimit: boolean; + + /** TrainerSpec useAllVocab. */ + public useAllVocab: boolean; + + /** TrainerSpec unkId. */ + public unkId: number; + + /** TrainerSpec bosId. */ + public bosId: number; + + /** TrainerSpec eosId. */ + public eosId: number; + + /** TrainerSpec padId. */ + public padId: number; + + /** TrainerSpec unkPiece. */ + public unkPiece: string; + + /** TrainerSpec bosPiece. */ + public bosPiece: string; + + /** TrainerSpec eosPiece. */ + public eosPiece: string; + + /** TrainerSpec padPiece. */ + public padPiece: string; + + /** TrainerSpec unkSurface. */ + public unkSurface: string; + + /** TrainerSpec trainExtremelyLargeCorpus. */ + public trainExtremelyLargeCorpus: boolean; + + /** TrainerSpec seedSentencepiecesFile. */ + public seedSentencepiecesFile: string; + + /** + * Creates a new TrainerSpec instance using the specified properties. + * @param [properties] Properties to set + * @returns TrainerSpec instance + */ + public static create( + properties?: sentencepiece.ITrainerSpec, + ): sentencepiece.TrainerSpec; + + /** + * Encodes the specified TrainerSpec message. Does not implicitly {@link sentencepiece.TrainerSpec.verify|verify} messages. + * @param message TrainerSpec message or plain object to encode + * @param [writer] Writer to encode to + * @returns Writer + */ + public static encode( + message: sentencepiece.ITrainerSpec, + writer?: $protobuf.Writer, + ): $protobuf.Writer; + + /** + * Encodes the specified TrainerSpec message, length delimited. Does not implicitly {@link sentencepiece.TrainerSpec.verify|verify} messages. + * @param message TrainerSpec message or plain object to encode + * @param [writer] Writer to encode to + * @returns Writer + */ + public static encodeDelimited( + message: sentencepiece.ITrainerSpec, + writer?: $protobuf.Writer, + ): $protobuf.Writer; + + /** + * Decodes a TrainerSpec message from the specified reader or buffer. + * @param reader Reader or buffer to decode from + * @param [length] Message length if known beforehand + * @returns TrainerSpec + * @throws {Error} If the payload is not a reader or valid buffer + * @throws {$protobuf.util.ProtocolError} If required fields are missing + */ + public static decode( + reader: $protobuf.Reader | Uint8Array, + length?: number, + ): sentencepiece.TrainerSpec; + + /** + * Decodes a TrainerSpec message from the specified reader or buffer, length delimited. 
+ * @param reader Reader or buffer to decode from + * @returns TrainerSpec + * @throws {Error} If the payload is not a reader or valid buffer + * @throws {$protobuf.util.ProtocolError} If required fields are missing + */ + public static decodeDelimited( + reader: $protobuf.Reader | Uint8Array, + ): sentencepiece.TrainerSpec; + + /** + * Verifies a TrainerSpec message. + * @param message Plain object to verify + * @returns `null` if valid, otherwise the reason why it is not + */ + public static verify(message: {[k: string]: any}): string | null; + + /** + * Creates a TrainerSpec message from a plain object. Also converts values to their respective internal types. + * @param object Plain object + * @returns TrainerSpec + */ + public static fromObject(object: { + [k: string]: any; + }): sentencepiece.TrainerSpec; + + /** + * Creates a plain object from a TrainerSpec message. Also converts values to other types if specified. + * @param message TrainerSpec + * @param [options] Conversion options + * @returns Plain object + */ + public static toObject( + message: sentencepiece.TrainerSpec, + options?: $protobuf.IConversionOptions, + ): {[k: string]: any}; + + /** + * Converts this TrainerSpec to JSON. + * @returns JSON object + */ + public toJSON(): {[k: string]: any}; + + /** + * Gets the default type url for TrainerSpec + * @param [typeUrlPrefix] your custom typeUrlPrefix(default "type.googleapis.com") + * @returns The default type url + */ + public static getTypeUrl(typeUrlPrefix?: string): string; + } + + namespace TrainerSpec { + /** ModelType enum. */ + enum ModelType { + UNIGRAM = 1, + BPE = 2, + WORD = 3, + CHAR = 4, + } + } + + /** Properties of a NormalizerSpec. */ + interface INormalizerSpec { + /** NormalizerSpec name */ + name?: string | null; + + /** NormalizerSpec precompiledCharsmap */ + precompiledCharsmap?: Uint8Array | null; + + /** NormalizerSpec addDummyPrefix */ + addDummyPrefix?: boolean | null; + + /** NormalizerSpec removeExtraWhitespaces */ + removeExtraWhitespaces?: boolean | null; + + /** NormalizerSpec escapeWhitespaces */ + escapeWhitespaces?: boolean | null; + + /** NormalizerSpec normalizationRuleTsv */ + normalizationRuleTsv?: string | null; + } + + /** Represents a NormalizerSpec. */ + class NormalizerSpec implements INormalizerSpec { + /** + * Constructs a new NormalizerSpec. + * @param [properties] Properties to set + */ + constructor(properties?: sentencepiece.INormalizerSpec); + + /** NormalizerSpec name. */ + public name: string; + + /** NormalizerSpec precompiledCharsmap. */ + public precompiledCharsmap: Uint8Array; + + /** NormalizerSpec addDummyPrefix. */ + public addDummyPrefix: boolean; + + /** NormalizerSpec removeExtraWhitespaces. */ + public removeExtraWhitespaces: boolean; + + /** NormalizerSpec escapeWhitespaces. */ + public escapeWhitespaces: boolean; + + /** NormalizerSpec normalizationRuleTsv. */ + public normalizationRuleTsv: string; + + /** + * Creates a new NormalizerSpec instance using the specified properties. + * @param [properties] Properties to set + * @returns NormalizerSpec instance + */ + public static create( + properties?: sentencepiece.INormalizerSpec, + ): sentencepiece.NormalizerSpec; + + /** + * Encodes the specified NormalizerSpec message. Does not implicitly {@link sentencepiece.NormalizerSpec.verify|verify} messages. 
+ * @param message NormalizerSpec message or plain object to encode + * @param [writer] Writer to encode to + * @returns Writer + */ + public static encode( + message: sentencepiece.INormalizerSpec, + writer?: $protobuf.Writer, + ): $protobuf.Writer; + + /** + * Encodes the specified NormalizerSpec message, length delimited. Does not implicitly {@link sentencepiece.NormalizerSpec.verify|verify} messages. + * @param message NormalizerSpec message or plain object to encode + * @param [writer] Writer to encode to + * @returns Writer + */ + public static encodeDelimited( + message: sentencepiece.INormalizerSpec, + writer?: $protobuf.Writer, + ): $protobuf.Writer; + + /** + * Decodes a NormalizerSpec message from the specified reader or buffer. + * @param reader Reader or buffer to decode from + * @param [length] Message length if known beforehand + * @returns NormalizerSpec + * @throws {Error} If the payload is not a reader or valid buffer + * @throws {$protobuf.util.ProtocolError} If required fields are missing + */ + public static decode( + reader: $protobuf.Reader | Uint8Array, + length?: number, + ): sentencepiece.NormalizerSpec; + + /** + * Decodes a NormalizerSpec message from the specified reader or buffer, length delimited. + * @param reader Reader or buffer to decode from + * @returns NormalizerSpec + * @throws {Error} If the payload is not a reader or valid buffer + * @throws {$protobuf.util.ProtocolError} If required fields are missing + */ + public static decodeDelimited( + reader: $protobuf.Reader | Uint8Array, + ): sentencepiece.NormalizerSpec; + + /** + * Verifies a NormalizerSpec message. + * @param message Plain object to verify + * @returns `null` if valid, otherwise the reason why it is not + */ + public static verify(message: {[k: string]: any}): string | null; + + /** + * Creates a NormalizerSpec message from a plain object. Also converts values to their respective internal types. + * @param object Plain object + * @returns NormalizerSpec + */ + public static fromObject(object: { + [k: string]: any; + }): sentencepiece.NormalizerSpec; + + /** + * Creates a plain object from a NormalizerSpec message. Also converts values to other types if specified. + * @param message NormalizerSpec + * @param [options] Conversion options + * @returns Plain object + */ + public static toObject( + message: sentencepiece.NormalizerSpec, + options?: $protobuf.IConversionOptions, + ): {[k: string]: any}; + + /** + * Converts this NormalizerSpec to JSON. + * @returns JSON object + */ + public toJSON(): {[k: string]: any}; + + /** + * Gets the default type url for NormalizerSpec + * @param [typeUrlPrefix] your custom typeUrlPrefix(default "type.googleapis.com") + * @returns The default type url + */ + public static getTypeUrl(typeUrlPrefix?: string): string; + } + + /** Properties of a SelfTestData. */ + interface ISelfTestData { + /** SelfTestData samples */ + samples?: sentencepiece.SelfTestData.ISample[] | null; + } + + /** Represents a SelfTestData. */ + class SelfTestData implements ISelfTestData { + /** + * Constructs a new SelfTestData. + * @param [properties] Properties to set + */ + constructor(properties?: sentencepiece.ISelfTestData); + + /** SelfTestData samples. */ + public samples: sentencepiece.SelfTestData.ISample[]; + + /** + * Creates a new SelfTestData instance using the specified properties. 
+ * @param [properties] Properties to set + * @returns SelfTestData instance + */ + public static create( + properties?: sentencepiece.ISelfTestData, + ): sentencepiece.SelfTestData; + + /** + * Encodes the specified SelfTestData message. Does not implicitly {@link sentencepiece.SelfTestData.verify|verify} messages. + * @param message SelfTestData message or plain object to encode + * @param [writer] Writer to encode to + * @returns Writer + */ + public static encode( + message: sentencepiece.ISelfTestData, + writer?: $protobuf.Writer, + ): $protobuf.Writer; + + /** + * Encodes the specified SelfTestData message, length delimited. Does not implicitly {@link sentencepiece.SelfTestData.verify|verify} messages. + * @param message SelfTestData message or plain object to encode + * @param [writer] Writer to encode to + * @returns Writer + */ + public static encodeDelimited( + message: sentencepiece.ISelfTestData, + writer?: $protobuf.Writer, + ): $protobuf.Writer; + + /** + * Decodes a SelfTestData message from the specified reader or buffer. + * @param reader Reader or buffer to decode from + * @param [length] Message length if known beforehand + * @returns SelfTestData + * @throws {Error} If the payload is not a reader or valid buffer + * @throws {$protobuf.util.ProtocolError} If required fields are missing + */ + public static decode( + reader: $protobuf.Reader | Uint8Array, + length?: number, + ): sentencepiece.SelfTestData; + + /** + * Decodes a SelfTestData message from the specified reader or buffer, length delimited. + * @param reader Reader or buffer to decode from + * @returns SelfTestData + * @throws {Error} If the payload is not a reader or valid buffer + * @throws {$protobuf.util.ProtocolError} If required fields are missing + */ + public static decodeDelimited( + reader: $protobuf.Reader | Uint8Array, + ): sentencepiece.SelfTestData; + + /** + * Verifies a SelfTestData message. + * @param message Plain object to verify + * @returns `null` if valid, otherwise the reason why it is not + */ + public static verify(message: {[k: string]: any}): string | null; + + /** + * Creates a SelfTestData message from a plain object. Also converts values to their respective internal types. + * @param object Plain object + * @returns SelfTestData + */ + public static fromObject(object: { + [k: string]: any; + }): sentencepiece.SelfTestData; + + /** + * Creates a plain object from a SelfTestData message. Also converts values to other types if specified. + * @param message SelfTestData + * @param [options] Conversion options + * @returns Plain object + */ + public static toObject( + message: sentencepiece.SelfTestData, + options?: $protobuf.IConversionOptions, + ): {[k: string]: any}; + + /** + * Converts this SelfTestData to JSON. + * @returns JSON object + */ + public toJSON(): {[k: string]: any}; + + /** + * Gets the default type url for SelfTestData + * @param [typeUrlPrefix] your custom typeUrlPrefix(default "type.googleapis.com") + * @returns The default type url + */ + public static getTypeUrl(typeUrlPrefix?: string): string; + } + + namespace SelfTestData { + /** Properties of a Sample. */ + interface ISample { + /** Sample input */ + input?: string | null; + + /** Sample expected */ + expected?: string | null; + } + + /** Represents a Sample. */ + class Sample implements ISample { + /** + * Constructs a new Sample. + * @param [properties] Properties to set + */ + constructor(properties?: sentencepiece.SelfTestData.ISample); + + /** Sample input. 
*/ + public input: string; + + /** Sample expected. */ + public expected: string; + + /** + * Creates a new Sample instance using the specified properties. + * @param [properties] Properties to set + * @returns Sample instance + */ + public static create( + properties?: sentencepiece.SelfTestData.ISample, + ): sentencepiece.SelfTestData.Sample; + + /** + * Encodes the specified Sample message. Does not implicitly {@link sentencepiece.SelfTestData.Sample.verify|verify} messages. + * @param message Sample message or plain object to encode + * @param [writer] Writer to encode to + * @returns Writer + */ + public static encode( + message: sentencepiece.SelfTestData.ISample, + writer?: $protobuf.Writer, + ): $protobuf.Writer; + + /** + * Encodes the specified Sample message, length delimited. Does not implicitly {@link sentencepiece.SelfTestData.Sample.verify|verify} messages. + * @param message Sample message or plain object to encode + * @param [writer] Writer to encode to + * @returns Writer + */ + public static encodeDelimited( + message: sentencepiece.SelfTestData.ISample, + writer?: $protobuf.Writer, + ): $protobuf.Writer; + + /** + * Decodes a Sample message from the specified reader or buffer. + * @param reader Reader or buffer to decode from + * @param [length] Message length if known beforehand + * @returns Sample + * @throws {Error} If the payload is not a reader or valid buffer + * @throws {$protobuf.util.ProtocolError} If required fields are missing + */ + public static decode( + reader: $protobuf.Reader | Uint8Array, + length?: number, + ): sentencepiece.SelfTestData.Sample; + + /** + * Decodes a Sample message from the specified reader or buffer, length delimited. + * @param reader Reader or buffer to decode from + * @returns Sample + * @throws {Error} If the payload is not a reader or valid buffer + * @throws {$protobuf.util.ProtocolError} If required fields are missing + */ + public static decodeDelimited( + reader: $protobuf.Reader | Uint8Array, + ): sentencepiece.SelfTestData.Sample; + + /** + * Verifies a Sample message. + * @param message Plain object to verify + * @returns `null` if valid, otherwise the reason why it is not + */ + public static verify(message: {[k: string]: any}): string | null; + + /** + * Creates a Sample message from a plain object. Also converts values to their respective internal types. + * @param object Plain object + * @returns Sample + */ + public static fromObject(object: { + [k: string]: any; + }): sentencepiece.SelfTestData.Sample; + + /** + * Creates a plain object from a Sample message. Also converts values to other types if specified. + * @param message Sample + * @param [options] Conversion options + * @returns Plain object + */ + public static toObject( + message: sentencepiece.SelfTestData.Sample, + options?: $protobuf.IConversionOptions, + ): {[k: string]: any}; + + /** + * Converts this Sample to JSON. + * @returns JSON object + */ + public toJSON(): {[k: string]: any}; + + /** + * Gets the default type url for Sample + * @param [typeUrlPrefix] your custom typeUrlPrefix(default "type.googleapis.com") + * @returns The default type url + */ + public static getTypeUrl(typeUrlPrefix?: string): string; + } + } + + /** Properties of a ModelProto. 
*/ + interface IModelProto { + /** ModelProto pieces */ + pieces?: sentencepiece.ModelProto.ISentencePiece[] | null; + + /** ModelProto trainerSpec */ + trainerSpec?: sentencepiece.ITrainerSpec | null; + + /** ModelProto normalizerSpec */ + normalizerSpec?: sentencepiece.INormalizerSpec | null; + + /** ModelProto selfTestData */ + selfTestData?: sentencepiece.ISelfTestData | null; + + /** ModelProto denormalizerSpec */ + denormalizerSpec?: sentencepiece.INormalizerSpec | null; + } + + /** Represents a ModelProto. */ + class ModelProto implements IModelProto { + /** + * Constructs a new ModelProto. + * @param [properties] Properties to set + */ + constructor(properties?: sentencepiece.IModelProto); + + /** ModelProto pieces. */ + public pieces: sentencepiece.ModelProto.ISentencePiece[]; + + /** ModelProto trainerSpec. */ + public trainerSpec?: sentencepiece.ITrainerSpec | null; + + /** ModelProto normalizerSpec. */ + public normalizerSpec?: sentencepiece.INormalizerSpec | null; + + /** ModelProto selfTestData. */ + public selfTestData?: sentencepiece.ISelfTestData | null; + + /** ModelProto denormalizerSpec. */ + public denormalizerSpec?: sentencepiece.INormalizerSpec | null; + + /** + * Creates a new ModelProto instance using the specified properties. + * @param [properties] Properties to set + * @returns ModelProto instance + */ + public static create( + properties?: sentencepiece.IModelProto, + ): sentencepiece.ModelProto; + + /** + * Encodes the specified ModelProto message. Does not implicitly {@link sentencepiece.ModelProto.verify|verify} messages. + * @param message ModelProto message or plain object to encode + * @param [writer] Writer to encode to + * @returns Writer + */ + public static encode( + message: sentencepiece.IModelProto, + writer?: $protobuf.Writer, + ): $protobuf.Writer; + + /** + * Encodes the specified ModelProto message, length delimited. Does not implicitly {@link sentencepiece.ModelProto.verify|verify} messages. + * @param message ModelProto message or plain object to encode + * @param [writer] Writer to encode to + * @returns Writer + */ + public static encodeDelimited( + message: sentencepiece.IModelProto, + writer?: $protobuf.Writer, + ): $protobuf.Writer; + + /** + * Decodes a ModelProto message from the specified reader or buffer. + * @param reader Reader or buffer to decode from + * @param [length] Message length if known beforehand + * @returns ModelProto + * @throws {Error} If the payload is not a reader or valid buffer + * @throws {$protobuf.util.ProtocolError} If required fields are missing + */ + public static decode( + reader: $protobuf.Reader | Uint8Array, + length?: number, + ): sentencepiece.ModelProto; + + /** + * Decodes a ModelProto message from the specified reader or buffer, length delimited. + * @param reader Reader or buffer to decode from + * @returns ModelProto + * @throws {Error} If the payload is not a reader or valid buffer + * @throws {$protobuf.util.ProtocolError} If required fields are missing + */ + public static decodeDelimited( + reader: $protobuf.Reader | Uint8Array, + ): sentencepiece.ModelProto; + + /** + * Verifies a ModelProto message. + * @param message Plain object to verify + * @returns `null` if valid, otherwise the reason why it is not + */ + public static verify(message: {[k: string]: any}): string | null; + + /** + * Creates a ModelProto message from a plain object. Also converts values to their respective internal types. 
+ * @param object Plain object + * @returns ModelProto + */ + public static fromObject(object: { + [k: string]: any; + }): sentencepiece.ModelProto; + + /** + * Creates a plain object from a ModelProto message. Also converts values to other types if specified. + * @param message ModelProto + * @param [options] Conversion options + * @returns Plain object + */ + public static toObject( + message: sentencepiece.ModelProto, + options?: $protobuf.IConversionOptions, + ): {[k: string]: any}; + + /** + * Converts this ModelProto to JSON. + * @returns JSON object + */ + public toJSON(): {[k: string]: any}; + + /** + * Gets the default type url for ModelProto + * @param [typeUrlPrefix] your custom typeUrlPrefix(default "type.googleapis.com") + * @returns The default type url + */ + public static getTypeUrl(typeUrlPrefix?: string): string; + } + + namespace ModelProto { + /** Properties of a SentencePiece. */ + interface ISentencePiece { + /** SentencePiece piece */ + piece?: string | null; + + /** SentencePiece score */ + score?: number | null; + + /** SentencePiece type */ + type?: sentencepiece.ModelProto.SentencePiece.Type | null; + } + + /** Represents a SentencePiece. */ + class SentencePiece implements ISentencePiece { + /** + * Constructs a new SentencePiece. + * @param [properties] Properties to set + */ + constructor(properties?: sentencepiece.ModelProto.ISentencePiece); + + /** SentencePiece piece. */ + public piece: string; + + /** SentencePiece score. */ + public score: number; + + /** SentencePiece type. */ + public type: sentencepiece.ModelProto.SentencePiece.Type; + + /** + * Creates a new SentencePiece instance using the specified properties. + * @param [properties] Properties to set + * @returns SentencePiece instance + */ + public static create( + properties?: sentencepiece.ModelProto.ISentencePiece, + ): sentencepiece.ModelProto.SentencePiece; + + /** + * Encodes the specified SentencePiece message. Does not implicitly {@link sentencepiece.ModelProto.SentencePiece.verify|verify} messages. + * @param message SentencePiece message or plain object to encode + * @param [writer] Writer to encode to + * @returns Writer + */ + public static encode( + message: sentencepiece.ModelProto.ISentencePiece, + writer?: $protobuf.Writer, + ): $protobuf.Writer; + + /** + * Encodes the specified SentencePiece message, length delimited. Does not implicitly {@link sentencepiece.ModelProto.SentencePiece.verify|verify} messages. + * @param message SentencePiece message or plain object to encode + * @param [writer] Writer to encode to + * @returns Writer + */ + public static encodeDelimited( + message: sentencepiece.ModelProto.ISentencePiece, + writer?: $protobuf.Writer, + ): $protobuf.Writer; + + /** + * Decodes a SentencePiece message from the specified reader or buffer. + * @param reader Reader or buffer to decode from + * @param [length] Message length if known beforehand + * @returns SentencePiece + * @throws {Error} If the payload is not a reader or valid buffer + * @throws {$protobuf.util.ProtocolError} If required fields are missing + */ + public static decode( + reader: $protobuf.Reader | Uint8Array, + length?: number, + ): sentencepiece.ModelProto.SentencePiece; + + /** + * Decodes a SentencePiece message from the specified reader or buffer, length delimited. 
+ * @param reader Reader or buffer to decode from + * @returns SentencePiece + * @throws {Error} If the payload is not a reader or valid buffer + * @throws {$protobuf.util.ProtocolError} If required fields are missing + */ + public static decodeDelimited( + reader: $protobuf.Reader | Uint8Array, + ): sentencepiece.ModelProto.SentencePiece; + + /** + * Verifies a SentencePiece message. + * @param message Plain object to verify + * @returns `null` if valid, otherwise the reason why it is not + */ + public static verify(message: {[k: string]: any}): string | null; + + /** + * Creates a SentencePiece message from a plain object. Also converts values to their respective internal types. + * @param object Plain object + * @returns SentencePiece + */ + public static fromObject(object: { + [k: string]: any; + }): sentencepiece.ModelProto.SentencePiece; + + /** + * Creates a plain object from a SentencePiece message. Also converts values to other types if specified. + * @param message SentencePiece + * @param [options] Conversion options + * @returns Plain object + */ + public static toObject( + message: sentencepiece.ModelProto.SentencePiece, + options?: $protobuf.IConversionOptions, + ): {[k: string]: any}; + + /** + * Converts this SentencePiece to JSON. + * @returns JSON object + */ + public toJSON(): {[k: string]: any}; + + /** + * Gets the default type url for SentencePiece + * @param [typeUrlPrefix] your custom typeUrlPrefix(default "type.googleapis.com") + * @returns The default type url + */ + public static getTypeUrl(typeUrlPrefix?: string): string; + } + + namespace SentencePiece { + /** Type enum. */ + enum Type { + NORMAL = 1, + UNKNOWN = 2, + CONTROL = 3, + USER_DEFINED = 4, + BYTE = 6, + UNUSED = 5, + } + } + } +} diff --git a/src/cross/sentencepiece/sentencepiece_model.pb.js b/src/cross/sentencepiece/sentencepiece_model.pb.js new file mode 100644 index 000000000..81236807c --- /dev/null +++ b/src/cross/sentencepiece/sentencepiece_model.pb.js @@ -0,0 +1,3205 @@ +/*eslint-disable block-scoped-var, id-length, no-control-regex, + * no-magic-numbers, no-prototype-builtins, no-redeclare, no-shadow, no-var, + * sort-vars*/ + +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import $protobuf from 'protobufjs/minimal.js'; + +// Common aliases +const $Reader = $protobuf.Reader, $Writer = $protobuf.Writer, + $util = $protobuf.util; + +// Exported root namespace +const $root = $protobuf.roots['default'] || ($protobuf.roots['default'] = {}); + +export const sentencepiece = $root.sentencepiece = (() => { + /** + * Namespace sentencepiece. + * @exports sentencepiece + * @namespace + */ + const sentencepiece = {}; + + sentencepiece.TrainerSpec = (function() { + /** + * Properties of a TrainerSpec. 
+     * @memberof sentencepiece
+     * @interface ITrainerSpec
+     * @property {Array.<string>|null} [input] TrainerSpec input
+     * @property {string|null} [inputFormat] TrainerSpec inputFormat
+     * @property {string|null} [modelPrefix] TrainerSpec modelPrefix
+     * @property {sentencepiece.TrainerSpec.ModelType|null} [modelType]
+     *     TrainerSpec modelType
+     * @property {number|null} [vocabSize] TrainerSpec vocabSize
+     * @property {Array.<string>|null} [acceptLanguage] TrainerSpec
+     *     acceptLanguage
+     * @property {number|null} [selfTestSampleSize] TrainerSpec
+     *     selfTestSampleSize
+     * @property {boolean|null} [enableDifferentialPrivacy] TrainerSpec
+     *     enableDifferentialPrivacy
+     * @property {number|null} [differentialPrivacyNoiseLevel] TrainerSpec
+     *     differentialPrivacyNoiseLevel
+     * @property {number|Long|null} [differentialPrivacyClippingThreshold]
+     *     TrainerSpec differentialPrivacyClippingThreshold
+     * @property {number|null} [characterCoverage] TrainerSpec characterCoverage
+     * @property {number|Long|null} [inputSentenceSize] TrainerSpec
+     *     inputSentenceSize
+     * @property {boolean|null} [shuffleInputSentence] TrainerSpec
+     *     shuffleInputSentence
+     * @property {number|null} [miningSentenceSize] TrainerSpec
+     *     miningSentenceSize
+     * @property {number|null} [trainingSentenceSize] TrainerSpec
+     *     trainingSentenceSize
+     * @property {number|null} [seedSentencepieceSize] TrainerSpec
+     *     seedSentencepieceSize
+     * @property {number|null} [shrinkingFactor] TrainerSpec shrinkingFactor
+     * @property {number|null} [maxSentenceLength] TrainerSpec maxSentenceLength
+     * @property {number|null} [numThreads] TrainerSpec numThreads
+     * @property {number|null} [numSubIterations] TrainerSpec numSubIterations
+     * @property {number|null} [maxSentencepieceLength] TrainerSpec
+     *     maxSentencepieceLength
+     * @property {boolean|null} [splitByUnicodeScript] TrainerSpec
+     *     splitByUnicodeScript
+     * @property {boolean|null} [splitByNumber] TrainerSpec splitByNumber
+     * @property {boolean|null} [splitByWhitespace] TrainerSpec
+     *     splitByWhitespace
+     * @property {boolean|null} [treatWhitespaceAsSuffix] TrainerSpec
+     *     treatWhitespaceAsSuffix
+     * @property {boolean|null} [allowWhitespaceOnlyPieces] TrainerSpec
+     *     allowWhitespaceOnlyPieces
+     * @property {boolean|null} [splitDigits] TrainerSpec splitDigits
+     * @property {string|null} [pretokenizationDelimiter] TrainerSpec
+     *     pretokenizationDelimiter
+     * @property {Array.<string>|null} [controlSymbols] TrainerSpec
+     *     controlSymbols
+     * @property {Array.<string>|null} [userDefinedSymbols] TrainerSpec
+     *     userDefinedSymbols
+     * @property {string|null} [requiredChars] TrainerSpec requiredChars
+     * @property {boolean|null} [byteFallback] TrainerSpec byteFallback
+     * @property {boolean|null} [vocabularyOutputPieceScore] TrainerSpec
+     *     vocabularyOutputPieceScore
+     * @property {boolean|null} [hardVocabLimit] TrainerSpec hardVocabLimit
+     * @property {boolean|null} [useAllVocab] TrainerSpec useAllVocab
+     * @property {number|null} [unkId] TrainerSpec unkId
+     * @property {number|null} [bosId] TrainerSpec bosId
+     * @property {number|null} [eosId] TrainerSpec eosId
+     * @property {number|null} [padId] TrainerSpec padId
+     * @property {string|null} [unkPiece] TrainerSpec unkPiece
+     * @property {string|null} [bosPiece] TrainerSpec bosPiece
+     * @property {string|null} [eosPiece] TrainerSpec eosPiece
+     * @property {string|null} [padPiece] TrainerSpec padPiece
+     * @property {string|null} [unkSurface] TrainerSpec unkSurface
+     * @property {boolean|null} [trainExtremelyLargeCorpus] TrainerSpec
+     *     trainExtremelyLargeCorpus
+     * @property {string|null} [seedSentencepiecesFile] TrainerSpec
+     *     seedSentencepiecesFile
+     */
+
+    /**
+     * Constructs a new TrainerSpec.
+     * @memberof sentencepiece
+     * @classdesc Represents a TrainerSpec.
+     * @implements ITrainerSpec
+     * @constructor
+     * @param {sentencepiece.ITrainerSpec=} [properties] Properties to set
+     */
+    function TrainerSpec(properties) {
+      this.input = [];
+      this.acceptLanguage = [];
+      this.controlSymbols = [];
+      this.userDefinedSymbols = [];
+      if (properties)
+        for (let keys = Object.keys(properties), i = 0; i < keys.length; ++i)
+          if (properties[keys[i]] != null) this[keys[i]] = properties[keys[i]];
+    }
+
+    /**
+     * TrainerSpec input.
+     * @member {Array.<string>} input
+     * @memberof sentencepiece.TrainerSpec
+     * @instance
+     */
+    TrainerSpec.prototype.input = $util.emptyArray;
+
+    /**
+     * TrainerSpec inputFormat.
+     * @member {string} inputFormat
+     * @memberof sentencepiece.TrainerSpec
+     * @instance
+     */
+    TrainerSpec.prototype.inputFormat = '';
+
+    /**
+     * TrainerSpec modelPrefix.
+     * @member {string} modelPrefix
+     * @memberof sentencepiece.TrainerSpec
+     * @instance
+     */
+    TrainerSpec.prototype.modelPrefix = '';
+
+    /**
+     * TrainerSpec modelType.
+     * @member {sentencepiece.TrainerSpec.ModelType} modelType
+     * @memberof sentencepiece.TrainerSpec
+     * @instance
+     */
+    TrainerSpec.prototype.modelType = 1;
+
+    /**
+     * TrainerSpec vocabSize.
+     * @member {number} vocabSize
+     * @memberof sentencepiece.TrainerSpec
+     * @instance
+     */
+    TrainerSpec.prototype.vocabSize = 8000;
+
+    /**
+     * TrainerSpec acceptLanguage.
+     * @member {Array.<string>} acceptLanguage
+     * @memberof sentencepiece.TrainerSpec
+     * @instance
+     */
+    TrainerSpec.prototype.acceptLanguage = $util.emptyArray;
+
+    /**
+     * TrainerSpec selfTestSampleSize.
+     * @member {number} selfTestSampleSize
+     * @memberof sentencepiece.TrainerSpec
+     * @instance
+     */
+    TrainerSpec.prototype.selfTestSampleSize = 0;
+
+    /**
+     * TrainerSpec enableDifferentialPrivacy.
+     * @member {boolean} enableDifferentialPrivacy
+     * @memberof sentencepiece.TrainerSpec
+     * @instance
+     */
+    TrainerSpec.prototype.enableDifferentialPrivacy = false;
+
+    /**
+     * TrainerSpec differentialPrivacyNoiseLevel.
+     * @member {number} differentialPrivacyNoiseLevel
+     * @memberof sentencepiece.TrainerSpec
+     * @instance
+     */
+    TrainerSpec.prototype.differentialPrivacyNoiseLevel = 0;
+
+    /**
+     * TrainerSpec differentialPrivacyClippingThreshold.
+     * @member {number|Long} differentialPrivacyClippingThreshold
+     * @memberof sentencepiece.TrainerSpec
+     * @instance
+     */
+    TrainerSpec.prototype.differentialPrivacyClippingThreshold =
+        $util.Long ? $util.Long.fromBits(0, 0, true) : 0;
+
+    /**
+     * TrainerSpec characterCoverage.
+     * @member {number} characterCoverage
+     * @memberof sentencepiece.TrainerSpec
+     * @instance
+     */
+    TrainerSpec.prototype.characterCoverage = 0.9995;
+
+    /**
+     * TrainerSpec inputSentenceSize.
+     * @member {number|Long} inputSentenceSize
+     * @memberof sentencepiece.TrainerSpec
+     * @instance
+     */
+    TrainerSpec.prototype.inputSentenceSize =
+        $util.Long ? $util.Long.fromBits(0, 0, true) : 0;
+
+    /**
+     * TrainerSpec shuffleInputSentence.
+     * @member {boolean} shuffleInputSentence
+     * @memberof sentencepiece.TrainerSpec
+     * @instance
+     */
+    TrainerSpec.prototype.shuffleInputSentence = true;
+
+    /**
+     * TrainerSpec miningSentenceSize.
+     * @member {number} miningSentenceSize
+     * @memberof sentencepiece.TrainerSpec
+     * @instance
+     */
+    TrainerSpec.prototype.miningSentenceSize = 0;
+
+    /**
+     * TrainerSpec trainingSentenceSize.
+
+ /**
+ * TrainerSpec input.
+ * @member {Array.<string>} input
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.input = $util.emptyArray;
+
+ /**
+ * TrainerSpec inputFormat.
+ * @member {string} inputFormat
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.inputFormat = '';
+
+ /**
+ * TrainerSpec modelPrefix.
+ * @member {string} modelPrefix
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.modelPrefix = '';
+
+ /**
+ * TrainerSpec modelType.
+ * @member {sentencepiece.TrainerSpec.ModelType} modelType
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.modelType = 1;
+
+ /**
+ * TrainerSpec vocabSize.
+ * @member {number} vocabSize
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.vocabSize = 8000;
+
+ /**
+ * TrainerSpec acceptLanguage.
+ * @member {Array.<string>} acceptLanguage
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.acceptLanguage = $util.emptyArray;
+
+ /**
+ * TrainerSpec selfTestSampleSize.
+ * @member {number} selfTestSampleSize
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.selfTestSampleSize = 0;
+
+ /**
+ * TrainerSpec enableDifferentialPrivacy.
+ * @member {boolean} enableDifferentialPrivacy
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.enableDifferentialPrivacy = false;
+
+ /**
+ * TrainerSpec differentialPrivacyNoiseLevel.
+ * @member {number} differentialPrivacyNoiseLevel
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.differentialPrivacyNoiseLevel = 0;
+
+ /**
+ * TrainerSpec differentialPrivacyClippingThreshold.
+ * @member {number|Long} differentialPrivacyClippingThreshold
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.differentialPrivacyClippingThreshold =
+     $util.Long ? $util.Long.fromBits(0, 0, true) : 0;
+
+ /**
+ * TrainerSpec characterCoverage.
+ * @member {number} characterCoverage
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.characterCoverage = 0.9995;
+
+ /**
+ * TrainerSpec inputSentenceSize.
+ * @member {number|Long} inputSentenceSize
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.inputSentenceSize =
+     $util.Long ? $util.Long.fromBits(0, 0, true) : 0;
+
+ /**
+ * TrainerSpec shuffleInputSentence.
+ * @member {boolean} shuffleInputSentence
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.shuffleInputSentence = true;
+
+ /**
+ * TrainerSpec miningSentenceSize.
+ * @member {number} miningSentenceSize
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.miningSentenceSize = 0;
+
+ /**
+ * TrainerSpec trainingSentenceSize.
+ * @member {number} trainingSentenceSize
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.trainingSentenceSize = 0;
+
+ /**
+ * TrainerSpec seedSentencepieceSize.
+ * @member {number} seedSentencepieceSize
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.seedSentencepieceSize = 1000000;
+
+ /**
+ * TrainerSpec shrinkingFactor.
+ * @member {number} shrinkingFactor
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.shrinkingFactor = 0.75;
+
+ /**
+ * TrainerSpec maxSentenceLength.
+ * @member {number} maxSentenceLength
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.maxSentenceLength = 4192;
+
+ /**
+ * TrainerSpec numThreads.
+ * @member {number} numThreads
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.numThreads = 16;
+
+ /**
+ * TrainerSpec numSubIterations.
+ * @member {number} numSubIterations
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.numSubIterations = 2;
+
+ /**
+ * TrainerSpec maxSentencepieceLength.
+ * @member {number} maxSentencepieceLength
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.maxSentencepieceLength = 16;
+
+ /**
+ * TrainerSpec splitByUnicodeScript.
+ * @member {boolean} splitByUnicodeScript
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.splitByUnicodeScript = true;
+
+ /**
+ * TrainerSpec splitByNumber.
+ * @member {boolean} splitByNumber
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.splitByNumber = true;
+
+ /**
+ * TrainerSpec splitByWhitespace.
+ * @member {boolean} splitByWhitespace
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.splitByWhitespace = true;
+
+ /**
+ * TrainerSpec treatWhitespaceAsSuffix.
+ * @member {boolean} treatWhitespaceAsSuffix
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.treatWhitespaceAsSuffix = false;
+
+ /**
+ * TrainerSpec allowWhitespaceOnlyPieces.
+ * @member {boolean} allowWhitespaceOnlyPieces
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.allowWhitespaceOnlyPieces = false;
+
+ /**
+ * TrainerSpec splitDigits.
+ * @member {boolean} splitDigits
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.splitDigits = false;
+
+ /**
+ * TrainerSpec pretokenizationDelimiter.
+ * @member {string} pretokenizationDelimiter
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.pretokenizationDelimiter = '';
+
+ /**
+ * TrainerSpec controlSymbols.
+ * @member {Array.<string>} controlSymbols
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.controlSymbols = $util.emptyArray;
+
+ /**
+ * TrainerSpec userDefinedSymbols.
+ * @member {Array.<string>} userDefinedSymbols
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.userDefinedSymbols = $util.emptyArray;
+
+ /**
+ * TrainerSpec requiredChars.
+ * @member {string} requiredChars
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.requiredChars = '';
+
+ /**
+ * TrainerSpec byteFallback.
+ * @member {boolean} byteFallback
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.byteFallback = false;
+
+ /**
+ * TrainerSpec vocabularyOutputPieceScore.
+ * @member {boolean} vocabularyOutputPieceScore
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.vocabularyOutputPieceScore = true;
+
+ /**
+ * TrainerSpec hardVocabLimit.
+ * @member {boolean} hardVocabLimit
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.hardVocabLimit = true;
+
+ /**
+ * TrainerSpec useAllVocab.
+ * @member {boolean} useAllVocab
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.useAllVocab = false;
+
+ /**
+ * TrainerSpec unkId.
+ * @member {number} unkId
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.unkId = 0;
+
+ /**
+ * TrainerSpec bosId.
+ * @member {number} bosId
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.bosId = 1;
+
+ /**
+ * TrainerSpec eosId.
+ * @member {number} eosId
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.eosId = 2;
+
+ /**
+ * TrainerSpec padId.
+ * @member {number} padId
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.padId = -1;
+
+ /**
+ * TrainerSpec unkPiece.
+ * @member {string} unkPiece
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.unkPiece = '';
+
+ /**
+ * TrainerSpec bosPiece.
+ * @member {string} bosPiece
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.bosPiece = '';
+
+ /**
+ * TrainerSpec eosPiece.
+ * @member {string} eosPiece
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.eosPiece = '';
+
+ /**
+ * TrainerSpec padPiece.
+ * @member {string} padPiece
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.padPiece = '';
+
+ /**
+ * TrainerSpec unkSurface.
+ * @member {string} unkSurface
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.unkSurface = ' \u2047 ';
+
+ /**
+ * TrainerSpec trainExtremelyLargeCorpus.
+ * @member {boolean} trainExtremelyLargeCorpus
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.trainExtremelyLargeCorpus = false;
+
+ /**
+ * TrainerSpec seedSentencepiecesFile.
+ * @member {string} seedSentencepiecesFile
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ */
+ TrainerSpec.prototype.seedSentencepiecesFile = '';
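+
+ // Usage sketch (hand-written, not pbjs output): create(), encode() and
+ // decode() below round-trip a spec through the protobuf wire format:
+ //
+ //   const spec = TrainerSpec.create({input: ['corpus.txt'], vocabSize: 32000});
+ //   const buf = TrainerSpec.encode(spec).finish();  // Uint8Array
+ //   TrainerSpec.decode(buf).vocabSize;              // 32000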
+
+ /**
+ * Creates a new TrainerSpec instance using the specified properties.
+ * @function create
+ * @memberof sentencepiece.TrainerSpec
+ * @static
+ * @param {sentencepiece.ITrainerSpec=} [properties] Properties to set
+ * @returns {sentencepiece.TrainerSpec} TrainerSpec instance
+ */
+ TrainerSpec.create = function create(properties) {
+   return new TrainerSpec(properties);
+ };
+
+ /**
+ * Encodes the specified TrainerSpec message. Does not implicitly {@link
+ * sentencepiece.TrainerSpec.verify|verify} messages.
+ * @function encode + * @memberof sentencepiece.TrainerSpec + * @static + * @param {sentencepiece.ITrainerSpec} message TrainerSpec message or plain + * object to encode + * @param {$protobuf.Writer} [writer] Writer to encode to + * @returns {$protobuf.Writer} Writer + */ + TrainerSpec.encode = function encode(message, writer) { + if (!writer) writer = $Writer.create(); + if (message.input != null && message.input.length) + for (let i = 0; i < message.input.length; ++i) + writer.uint32(/* id 1, wireType 2 =*/ 10).string(message.input[i]); + if (message.modelPrefix != null && + Object.hasOwnProperty.call(message, 'modelPrefix')) + writer.uint32(/* id 2, wireType 2 =*/ 18).string(message.modelPrefix); + if (message.modelType != null && + Object.hasOwnProperty.call(message, 'modelType')) + writer.uint32(/* id 3, wireType 0 =*/ 24).int32(message.modelType); + if (message.vocabSize != null && + Object.hasOwnProperty.call(message, 'vocabSize')) + writer.uint32(/* id 4, wireType 0 =*/ 32).int32(message.vocabSize); + if (message.acceptLanguage != null && message.acceptLanguage.length) + for (let i = 0; i < message.acceptLanguage.length; ++i) + writer.uint32(/* id 5, wireType 2 =*/ 42) + .string(message.acceptLanguage[i]); + if (message.selfTestSampleSize != null && + Object.hasOwnProperty.call(message, 'selfTestSampleSize')) + writer.uint32(/* id 6, wireType 0 =*/ 48) + .int32(message.selfTestSampleSize); + if (message.inputFormat != null && + Object.hasOwnProperty.call(message, 'inputFormat')) + writer.uint32(/* id 7, wireType 2 =*/ 58).string(message.inputFormat); + if (message.characterCoverage != null && + Object.hasOwnProperty.call(message, 'characterCoverage')) + writer.uint32(/* id 10, wireType 5 =*/ 85) + .float(message.characterCoverage); + if (message.inputSentenceSize != null && + Object.hasOwnProperty.call(message, 'inputSentenceSize')) + writer.uint32(/* id 11, wireType 0 =*/ 88) + .uint64(message.inputSentenceSize); + if (message.miningSentenceSize != null && + Object.hasOwnProperty.call(message, 'miningSentenceSize')) + writer.uint32(/* id 12, wireType 0 =*/ 96) + .int32(message.miningSentenceSize); + if (message.trainingSentenceSize != null && + Object.hasOwnProperty.call(message, 'trainingSentenceSize')) + writer.uint32(/* id 13, wireType 0 =*/ 104) + .int32(message.trainingSentenceSize); + if (message.seedSentencepieceSize != null && + Object.hasOwnProperty.call(message, 'seedSentencepieceSize')) + writer.uint32(/* id 14, wireType 0 =*/ 112) + .int32(message.seedSentencepieceSize); + if (message.shrinkingFactor != null && + Object.hasOwnProperty.call(message, 'shrinkingFactor')) + writer.uint32(/* id 15, wireType 5 =*/ 125) + .float(message.shrinkingFactor); + if (message.numThreads != null && + Object.hasOwnProperty.call(message, 'numThreads')) + writer.uint32(/* id 16, wireType 0 =*/ 128).int32(message.numThreads); + if (message.numSubIterations != null && + Object.hasOwnProperty.call(message, 'numSubIterations')) + writer.uint32(/* id 17, wireType 0 =*/ 136) + .int32(message.numSubIterations); + if (message.maxSentenceLength != null && + Object.hasOwnProperty.call(message, 'maxSentenceLength')) + writer.uint32(/* id 18, wireType 0 =*/ 144) + .int32(message.maxSentenceLength); + if (message.shuffleInputSentence != null && + Object.hasOwnProperty.call(message, 'shuffleInputSentence')) + writer.uint32(/* id 19, wireType 0 =*/ 152) + .bool(message.shuffleInputSentence); + if (message.maxSentencepieceLength != null && + Object.hasOwnProperty.call(message, 
'maxSentencepieceLength')) + writer.uint32(/* id 20, wireType 0 =*/ 160) + .int32(message.maxSentencepieceLength); + if (message.splitByUnicodeScript != null && + Object.hasOwnProperty.call(message, 'splitByUnicodeScript')) + writer.uint32(/* id 21, wireType 0 =*/ 168) + .bool(message.splitByUnicodeScript); + if (message.splitByWhitespace != null && + Object.hasOwnProperty.call(message, 'splitByWhitespace')) + writer.uint32(/* id 22, wireType 0 =*/ 176) + .bool(message.splitByWhitespace); + if (message.splitByNumber != null && + Object.hasOwnProperty.call(message, 'splitByNumber')) + writer.uint32(/* id 23, wireType 0 =*/ 184).bool(message.splitByNumber); + if (message.treatWhitespaceAsSuffix != null && + Object.hasOwnProperty.call(message, 'treatWhitespaceAsSuffix')) + writer.uint32(/* id 24, wireType 0 =*/ 192) + .bool(message.treatWhitespaceAsSuffix); + if (message.splitDigits != null && + Object.hasOwnProperty.call(message, 'splitDigits')) + writer.uint32(/* id 25, wireType 0 =*/ 200).bool(message.splitDigits); + if (message.allowWhitespaceOnlyPieces != null && + Object.hasOwnProperty.call(message, 'allowWhitespaceOnlyPieces')) + writer.uint32(/* id 26, wireType 0 =*/ 208) + .bool(message.allowWhitespaceOnlyPieces); + if (message.controlSymbols != null && message.controlSymbols.length) + for (let i = 0; i < message.controlSymbols.length; ++i) + writer.uint32(/* id 30, wireType 2 =*/ 242) + .string(message.controlSymbols[i]); + if (message.userDefinedSymbols != null && + message.userDefinedSymbols.length) + for (let i = 0; i < message.userDefinedSymbols.length; ++i) + writer.uint32(/* id 31, wireType 2 =*/ 250) + .string(message.userDefinedSymbols[i]); + if (message.vocabularyOutputPieceScore != null && + Object.hasOwnProperty.call(message, 'vocabularyOutputPieceScore')) + writer.uint32(/* id 32, wireType 0 =*/ 256) + .bool(message.vocabularyOutputPieceScore); + if (message.hardVocabLimit != null && + Object.hasOwnProperty.call(message, 'hardVocabLimit')) + writer.uint32(/* id 33, wireType 0 =*/ 264) + .bool(message.hardVocabLimit); + if (message.useAllVocab != null && + Object.hasOwnProperty.call(message, 'useAllVocab')) + writer.uint32(/* id 34, wireType 0 =*/ 272).bool(message.useAllVocab); + if (message.byteFallback != null && + Object.hasOwnProperty.call(message, 'byteFallback')) + writer.uint32(/* id 35, wireType 0 =*/ 280).bool(message.byteFallback); + if (message.requiredChars != null && + Object.hasOwnProperty.call(message, 'requiredChars')) + writer.uint32(/* id 36, wireType 2 =*/ 290) + .string(message.requiredChars); + if (message.unkId != null && Object.hasOwnProperty.call(message, 'unkId')) + writer.uint32(/* id 40, wireType 0 =*/ 320).int32(message.unkId); + if (message.bosId != null && Object.hasOwnProperty.call(message, 'bosId')) + writer.uint32(/* id 41, wireType 0 =*/ 328).int32(message.bosId); + if (message.eosId != null && Object.hasOwnProperty.call(message, 'eosId')) + writer.uint32(/* id 42, wireType 0 =*/ 336).int32(message.eosId); + if (message.padId != null && Object.hasOwnProperty.call(message, 'padId')) + writer.uint32(/* id 43, wireType 0 =*/ 344).int32(message.padId); + if (message.unkSurface != null && + Object.hasOwnProperty.call(message, 'unkSurface')) + writer.uint32(/* id 44, wireType 2 =*/ 354).string(message.unkSurface); + if (message.unkPiece != null && + Object.hasOwnProperty.call(message, 'unkPiece')) + writer.uint32(/* id 45, wireType 2 =*/ 362).string(message.unkPiece); + if (message.bosPiece != null && + Object.hasOwnProperty.call(message, 
'bosPiece')) + writer.uint32(/* id 46, wireType 2 =*/ 370).string(message.bosPiece); + if (message.eosPiece != null && + Object.hasOwnProperty.call(message, 'eosPiece')) + writer.uint32(/* id 47, wireType 2 =*/ 378).string(message.eosPiece); + if (message.padPiece != null && + Object.hasOwnProperty.call(message, 'padPiece')) + writer.uint32(/* id 48, wireType 2 =*/ 386).string(message.padPiece); + if (message.trainExtremelyLargeCorpus != null && + Object.hasOwnProperty.call(message, 'trainExtremelyLargeCorpus')) + writer.uint32(/* id 49, wireType 0 =*/ 392) + .bool(message.trainExtremelyLargeCorpus); + if (message.enableDifferentialPrivacy != null && + Object.hasOwnProperty.call(message, 'enableDifferentialPrivacy')) + writer.uint32(/* id 50, wireType 0 =*/ 400) + .bool(message.enableDifferentialPrivacy); + if (message.differentialPrivacyNoiseLevel != null && + Object.hasOwnProperty.call(message, 'differentialPrivacyNoiseLevel')) + writer.uint32(/* id 51, wireType 5 =*/ 413) + .float(message.differentialPrivacyNoiseLevel); + if (message.differentialPrivacyClippingThreshold != null && + Object.hasOwnProperty.call( + message, 'differentialPrivacyClippingThreshold')) + writer.uint32(/* id 52, wireType 0 =*/ 416) + .uint64(message.differentialPrivacyClippingThreshold); + if (message.pretokenizationDelimiter != null && + Object.hasOwnProperty.call(message, 'pretokenizationDelimiter')) + writer.uint32(/* id 53, wireType 2 =*/ 426) + .string(message.pretokenizationDelimiter); + if (message.seedSentencepiecesFile != null && + Object.hasOwnProperty.call(message, 'seedSentencepiecesFile')) + writer.uint32(/* id 54, wireType 2 =*/ 434) + .string(message.seedSentencepiecesFile); + return writer; + }; + + /** + * Encodes the specified TrainerSpec message, length delimited. Does not + * implicitly {@link sentencepiece.TrainerSpec.verify|verify} messages. + * @function encodeDelimited + * @memberof sentencepiece.TrainerSpec + * @static + * @param {sentencepiece.ITrainerSpec} message TrainerSpec message or plain + * object to encode + * @param {$protobuf.Writer} [writer] Writer to encode to + * @returns {$protobuf.Writer} Writer + */ + TrainerSpec.encodeDelimited = function encodeDelimited(message, writer) { + return this.encode(message, writer).ldelim(); + }; + + /** + * Decodes a TrainerSpec message from the specified reader or buffer. + * @function decode + * @memberof sentencepiece.TrainerSpec + * @static + * @param {$protobuf.Reader|Uint8Array} reader Reader or buffer to decode + * from + * @param {number} [length] Message length if known beforehand + * @returns {sentencepiece.TrainerSpec} TrainerSpec + * @throws {Error} If the payload is not a reader or valid buffer + * @throws {$protobuf.util.ProtocolError} If required fields are missing + */ + TrainerSpec.decode = function decode(reader, length, error) { + if (!(reader instanceof $Reader)) reader = $Reader.create(reader); + let end = length === undefined ? 
reader.len : reader.pos + length, + message = new $root.sentencepiece.TrainerSpec(); + while (reader.pos < end) { + let tag = reader.uint32(); + if (tag === error) break; + switch (tag >>> 3) { + case 1: { + if (!(message.input && message.input.length)) message.input = []; + message.input.push(reader.string()); + break; + } + case 7: { + message.inputFormat = reader.string(); + break; + } + case 2: { + message.modelPrefix = reader.string(); + break; + } + case 3: { + message.modelType = reader.int32(); + break; + } + case 4: { + message.vocabSize = reader.int32(); + break; + } + case 5: { + if (!(message.acceptLanguage && message.acceptLanguage.length)) + message.acceptLanguage = []; + message.acceptLanguage.push(reader.string()); + break; + } + case 6: { + message.selfTestSampleSize = reader.int32(); + break; + } + case 50: { + message.enableDifferentialPrivacy = reader.bool(); + break; + } + case 51: { + message.differentialPrivacyNoiseLevel = reader.float(); + break; + } + case 52: { + message.differentialPrivacyClippingThreshold = reader.uint64(); + break; + } + case 10: { + message.characterCoverage = reader.float(); + break; + } + case 11: { + message.inputSentenceSize = reader.uint64(); + break; + } + case 19: { + message.shuffleInputSentence = reader.bool(); + break; + } + case 12: { + message.miningSentenceSize = reader.int32(); + break; + } + case 13: { + message.trainingSentenceSize = reader.int32(); + break; + } + case 14: { + message.seedSentencepieceSize = reader.int32(); + break; + } + case 15: { + message.shrinkingFactor = reader.float(); + break; + } + case 18: { + message.maxSentenceLength = reader.int32(); + break; + } + case 16: { + message.numThreads = reader.int32(); + break; + } + case 17: { + message.numSubIterations = reader.int32(); + break; + } + case 20: { + message.maxSentencepieceLength = reader.int32(); + break; + } + case 21: { + message.splitByUnicodeScript = reader.bool(); + break; + } + case 23: { + message.splitByNumber = reader.bool(); + break; + } + case 22: { + message.splitByWhitespace = reader.bool(); + break; + } + case 24: { + message.treatWhitespaceAsSuffix = reader.bool(); + break; + } + case 26: { + message.allowWhitespaceOnlyPieces = reader.bool(); + break; + } + case 25: { + message.splitDigits = reader.bool(); + break; + } + case 53: { + message.pretokenizationDelimiter = reader.string(); + break; + } + case 30: { + if (!(message.controlSymbols && message.controlSymbols.length)) + message.controlSymbols = []; + message.controlSymbols.push(reader.string()); + break; + } + case 31: { + if (!(message.userDefinedSymbols && + message.userDefinedSymbols.length)) + message.userDefinedSymbols = []; + message.userDefinedSymbols.push(reader.string()); + break; + } + case 36: { + message.requiredChars = reader.string(); + break; + } + case 35: { + message.byteFallback = reader.bool(); + break; + } + case 32: { + message.vocabularyOutputPieceScore = reader.bool(); + break; + } + case 33: { + message.hardVocabLimit = reader.bool(); + break; + } + case 34: { + message.useAllVocab = reader.bool(); + break; + } + case 40: { + message.unkId = reader.int32(); + break; + } + case 41: { + message.bosId = reader.int32(); + break; + } + case 42: { + message.eosId = reader.int32(); + break; + } + case 43: { + message.padId = reader.int32(); + break; + } + case 45: { + message.unkPiece = reader.string(); + break; + } + case 46: { + message.bosPiece = reader.string(); + break; + } + case 47: { + message.eosPiece = reader.string(); + break; + } + case 48: { + 
+         message.padPiece = reader.string();
+         break;
+       }
+       case 44: {
+         message.unkSurface = reader.string();
+         break;
+       }
+       case 49: {
+         message.trainExtremelyLargeCorpus = reader.bool();
+         break;
+       }
+       case 54: {
+         message.seedSentencepiecesFile = reader.string();
+         break;
+       }
+       default:
+         reader.skipType(tag & 7);
+         break;
+     }
+   }
+   return message;
+ };
+
+ /**
+ * Decodes a TrainerSpec message from the specified reader or buffer, length
+ * delimited.
+ * @function decodeDelimited
+ * @memberof sentencepiece.TrainerSpec
+ * @static
+ * @param {$protobuf.Reader|Uint8Array} reader Reader or buffer to decode
+ *     from
+ * @returns {sentencepiece.TrainerSpec} TrainerSpec
+ * @throws {Error} If the payload is not a reader or valid buffer
+ * @throws {$protobuf.util.ProtocolError} If required fields are missing
+ */
+ TrainerSpec.decodeDelimited = function decodeDelimited(reader) {
+   if (!(reader instanceof $Reader)) reader = new $Reader(reader);
+   return this.decode(reader, reader.uint32());
+ };
+
+ /**
+ * Verifies a TrainerSpec message.
+ * @function verify
+ * @memberof sentencepiece.TrainerSpec
+ * @static
+ * @param {Object.<string,*>} message Plain object to verify
+ * @returns {string|null} `null` if valid, otherwise the reason why it is
+ *     not
+ */
+ TrainerSpec.verify = function verify(message) {
+   if (typeof message !== 'object' || message === null)
+     return 'object expected';
+   if (message.input != null && message.hasOwnProperty('input')) {
+     if (!Array.isArray(message.input)) return 'input: array expected';
+     for (let i = 0; i < message.input.length; ++i)
+       if (!$util.isString(message.input[i]))
+         return 'input: string[] expected';
+   }
+   if (message.inputFormat != null && message.hasOwnProperty('inputFormat'))
+     if (!$util.isString(message.inputFormat))
+       return 'inputFormat: string expected';
+   if (message.modelPrefix != null && message.hasOwnProperty('modelPrefix'))
+     if (!$util.isString(message.modelPrefix))
+       return 'modelPrefix: string expected';
+   if (message.modelType != null && message.hasOwnProperty('modelType'))
+     switch (message.modelType) {
+       default:
+         return 'modelType: enum value expected';
+       case 1:
+       case 2:
+       case 3:
+       case 4:
+         break;
+     }
+   if (message.vocabSize != null && message.hasOwnProperty('vocabSize'))
+     if (!$util.isInteger(message.vocabSize))
+       return 'vocabSize: integer expected';
+   if (message.acceptLanguage != null &&
+       message.hasOwnProperty('acceptLanguage')) {
+     if (!Array.isArray(message.acceptLanguage))
+       return 'acceptLanguage: array expected';
+     for (let i = 0; i < message.acceptLanguage.length; ++i)
+       if (!$util.isString(message.acceptLanguage[i]))
+         return 'acceptLanguage: string[] expected';
+   }
+   if (message.selfTestSampleSize != null &&
+       message.hasOwnProperty('selfTestSampleSize'))
+     if (!$util.isInteger(message.selfTestSampleSize))
+       return 'selfTestSampleSize: integer expected';
+   if (message.enableDifferentialPrivacy != null &&
+       message.hasOwnProperty('enableDifferentialPrivacy'))
+     if (typeof message.enableDifferentialPrivacy !== 'boolean')
+       return 'enableDifferentialPrivacy: boolean expected';
+   if (message.differentialPrivacyNoiseLevel != null &&
+       message.hasOwnProperty('differentialPrivacyNoiseLevel'))
+     if (typeof message.differentialPrivacyNoiseLevel !== 'number')
+       return 'differentialPrivacyNoiseLevel: number expected';
+   if (message.differentialPrivacyClippingThreshold != null &&
+       message.hasOwnProperty('differentialPrivacyClippingThreshold'))
+     if (!$util.isInteger(message.differentialPrivacyClippingThreshold) &&
!(message.differentialPrivacyClippingThreshold && + $util.isInteger( + message.differentialPrivacyClippingThreshold.low) && + $util.isInteger( + message.differentialPrivacyClippingThreshold.high))) + return 'differentialPrivacyClippingThreshold: integer|Long expected'; + if (message.characterCoverage != null && + message.hasOwnProperty('characterCoverage')) + if (typeof message.characterCoverage !== 'number') + return 'characterCoverage: number expected'; + if (message.inputSentenceSize != null && + message.hasOwnProperty('inputSentenceSize')) + if (!$util.isInteger(message.inputSentenceSize) && + !(message.inputSentenceSize && + $util.isInteger(message.inputSentenceSize.low) && + $util.isInteger(message.inputSentenceSize.high))) + return 'inputSentenceSize: integer|Long expected'; + if (message.shuffleInputSentence != null && + message.hasOwnProperty('shuffleInputSentence')) + if (typeof message.shuffleInputSentence !== 'boolean') + return 'shuffleInputSentence: boolean expected'; + if (message.miningSentenceSize != null && + message.hasOwnProperty('miningSentenceSize')) + if (!$util.isInteger(message.miningSentenceSize)) + return 'miningSentenceSize: integer expected'; + if (message.trainingSentenceSize != null && + message.hasOwnProperty('trainingSentenceSize')) + if (!$util.isInteger(message.trainingSentenceSize)) + return 'trainingSentenceSize: integer expected'; + if (message.seedSentencepieceSize != null && + message.hasOwnProperty('seedSentencepieceSize')) + if (!$util.isInteger(message.seedSentencepieceSize)) + return 'seedSentencepieceSize: integer expected'; + if (message.shrinkingFactor != null && + message.hasOwnProperty('shrinkingFactor')) + if (typeof message.shrinkingFactor !== 'number') + return 'shrinkingFactor: number expected'; + if (message.maxSentenceLength != null && + message.hasOwnProperty('maxSentenceLength')) + if (!$util.isInteger(message.maxSentenceLength)) + return 'maxSentenceLength: integer expected'; + if (message.numThreads != null && message.hasOwnProperty('numThreads')) + if (!$util.isInteger(message.numThreads)) + return 'numThreads: integer expected'; + if (message.numSubIterations != null && + message.hasOwnProperty('numSubIterations')) + if (!$util.isInteger(message.numSubIterations)) + return 'numSubIterations: integer expected'; + if (message.maxSentencepieceLength != null && + message.hasOwnProperty('maxSentencepieceLength')) + if (!$util.isInteger(message.maxSentencepieceLength)) + return 'maxSentencepieceLength: integer expected'; + if (message.splitByUnicodeScript != null && + message.hasOwnProperty('splitByUnicodeScript')) + if (typeof message.splitByUnicodeScript !== 'boolean') + return 'splitByUnicodeScript: boolean expected'; + if (message.splitByNumber != null && + message.hasOwnProperty('splitByNumber')) + if (typeof message.splitByNumber !== 'boolean') + return 'splitByNumber: boolean expected'; + if (message.splitByWhitespace != null && + message.hasOwnProperty('splitByWhitespace')) + if (typeof message.splitByWhitespace !== 'boolean') + return 'splitByWhitespace: boolean expected'; + if (message.treatWhitespaceAsSuffix != null && + message.hasOwnProperty('treatWhitespaceAsSuffix')) + if (typeof message.treatWhitespaceAsSuffix !== 'boolean') + return 'treatWhitespaceAsSuffix: boolean expected'; + if (message.allowWhitespaceOnlyPieces != null && + message.hasOwnProperty('allowWhitespaceOnlyPieces')) + if (typeof message.allowWhitespaceOnlyPieces !== 'boolean') + return 'allowWhitespaceOnlyPieces: boolean expected'; + if 
(message.splitDigits != null && message.hasOwnProperty('splitDigits')) + if (typeof message.splitDigits !== 'boolean') + return 'splitDigits: boolean expected'; + if (message.pretokenizationDelimiter != null && + message.hasOwnProperty('pretokenizationDelimiter')) + if (!$util.isString(message.pretokenizationDelimiter)) + return 'pretokenizationDelimiter: string expected'; + if (message.controlSymbols != null && + message.hasOwnProperty('controlSymbols')) { + if (!Array.isArray(message.controlSymbols)) + return 'controlSymbols: array expected'; + for (let i = 0; i < message.controlSymbols.length; ++i) + if (!$util.isString(message.controlSymbols[i])) + return 'controlSymbols: string[] expected'; + } + if (message.userDefinedSymbols != null && + message.hasOwnProperty('userDefinedSymbols')) { + if (!Array.isArray(message.userDefinedSymbols)) + return 'userDefinedSymbols: array expected'; + for (let i = 0; i < message.userDefinedSymbols.length; ++i) + if (!$util.isString(message.userDefinedSymbols[i])) + return 'userDefinedSymbols: string[] expected'; + } + if (message.requiredChars != null && + message.hasOwnProperty('requiredChars')) + if (!$util.isString(message.requiredChars)) + return 'requiredChars: string expected'; + if (message.byteFallback != null && + message.hasOwnProperty('byteFallback')) + if (typeof message.byteFallback !== 'boolean') + return 'byteFallback: boolean expected'; + if (message.vocabularyOutputPieceScore != null && + message.hasOwnProperty('vocabularyOutputPieceScore')) + if (typeof message.vocabularyOutputPieceScore !== 'boolean') + return 'vocabularyOutputPieceScore: boolean expected'; + if (message.hardVocabLimit != null && + message.hasOwnProperty('hardVocabLimit')) + if (typeof message.hardVocabLimit !== 'boolean') + return 'hardVocabLimit: boolean expected'; + if (message.useAllVocab != null && message.hasOwnProperty('useAllVocab')) + if (typeof message.useAllVocab !== 'boolean') + return 'useAllVocab: boolean expected'; + if (message.unkId != null && message.hasOwnProperty('unkId')) + if (!$util.isInteger(message.unkId)) return 'unkId: integer expected'; + if (message.bosId != null && message.hasOwnProperty('bosId')) + if (!$util.isInteger(message.bosId)) return 'bosId: integer expected'; + if (message.eosId != null && message.hasOwnProperty('eosId')) + if (!$util.isInteger(message.eosId)) return 'eosId: integer expected'; + if (message.padId != null && message.hasOwnProperty('padId')) + if (!$util.isInteger(message.padId)) return 'padId: integer expected'; + if (message.unkPiece != null && message.hasOwnProperty('unkPiece')) + if (!$util.isString(message.unkPiece)) + return 'unkPiece: string expected'; + if (message.bosPiece != null && message.hasOwnProperty('bosPiece')) + if (!$util.isString(message.bosPiece)) + return 'bosPiece: string expected'; + if (message.eosPiece != null && message.hasOwnProperty('eosPiece')) + if (!$util.isString(message.eosPiece)) + return 'eosPiece: string expected'; + if (message.padPiece != null && message.hasOwnProperty('padPiece')) + if (!$util.isString(message.padPiece)) + return 'padPiece: string expected'; + if (message.unkSurface != null && message.hasOwnProperty('unkSurface')) + if (!$util.isString(message.unkSurface)) + return 'unkSurface: string expected'; + if (message.trainExtremelyLargeCorpus != null && + message.hasOwnProperty('trainExtremelyLargeCorpus')) + if (typeof message.trainExtremelyLargeCorpus !== 'boolean') + return 'trainExtremelyLargeCorpus: boolean expected'; + if (message.seedSentencepiecesFile != 
+           null &&
+       message.hasOwnProperty('seedSentencepiecesFile'))
+     if (!$util.isString(message.seedSentencepiecesFile))
+       return 'seedSentencepiecesFile: string expected';
+   return null;
+ };
+
+ /**
+ * Creates a TrainerSpec message from a plain object. Also converts values
+ * to their respective internal types.
+ * @function fromObject
+ * @memberof sentencepiece.TrainerSpec
+ * @static
+ * @param {Object.<string,*>} object Plain object
+ * @returns {sentencepiece.TrainerSpec} TrainerSpec
+ */
+ TrainerSpec.fromObject = function fromObject(object) {
+   if (object instanceof $root.sentencepiece.TrainerSpec) return object;
+   let message = new $root.sentencepiece.TrainerSpec();
+   if (object.input) {
+     if (!Array.isArray(object.input))
+       throw TypeError('.sentencepiece.TrainerSpec.input: array expected');
+     message.input = [];
+     for (let i = 0; i < object.input.length; ++i)
+       message.input[i] = String(object.input[i]);
+   }
+   if (object.inputFormat != null)
+     message.inputFormat = String(object.inputFormat);
+   if (object.modelPrefix != null)
+     message.modelPrefix = String(object.modelPrefix);
+   switch (object.modelType) {
+     default:
+       if (typeof object.modelType === 'number') {
+         message.modelType = object.modelType;
+         break;
+       }
+       break;
+     case 'UNIGRAM':
+     case 1:
+       message.modelType = 1;
+       break;
+     case 'BPE':
+     case 2:
+       message.modelType = 2;
+       break;
+     case 'WORD':
+     case 3:
+       message.modelType = 3;
+       break;
+     case 'CHAR':
+     case 4:
+       message.modelType = 4;
+       break;
+   }
+   if (object.vocabSize != null) message.vocabSize = object.vocabSize | 0;
+   if (object.acceptLanguage) {
+     if (!Array.isArray(object.acceptLanguage))
+       throw TypeError(
+           '.sentencepiece.TrainerSpec.acceptLanguage: array expected');
+     message.acceptLanguage = [];
+     for (let i = 0; i < object.acceptLanguage.length; ++i)
+       message.acceptLanguage[i] = String(object.acceptLanguage[i]);
+   }
+   if (object.selfTestSampleSize != null)
+     message.selfTestSampleSize = object.selfTestSampleSize | 0;
+   if (object.enableDifferentialPrivacy != null)
+     message.enableDifferentialPrivacy =
+         Boolean(object.enableDifferentialPrivacy);
+   if (object.differentialPrivacyNoiseLevel != null)
+     message.differentialPrivacyNoiseLevel =
+         Number(object.differentialPrivacyNoiseLevel);
+   if (object.differentialPrivacyClippingThreshold != null)
+     if ($util.Long)
+       (message.differentialPrivacyClippingThreshold = $util.Long.fromValue(
+            object.differentialPrivacyClippingThreshold))
+           .unsigned = true;
+     else if (
+         typeof object.differentialPrivacyClippingThreshold === 'string')
+       message.differentialPrivacyClippingThreshold =
+           parseInt(object.differentialPrivacyClippingThreshold, 10);
+     else if (
+         typeof object.differentialPrivacyClippingThreshold === 'number')
+       message.differentialPrivacyClippingThreshold =
+           object.differentialPrivacyClippingThreshold;
+     else if (
+         typeof object.differentialPrivacyClippingThreshold === 'object')
+       message.differentialPrivacyClippingThreshold =
+           new $util
+               .LongBits(
+                   object.differentialPrivacyClippingThreshold.low >>> 0,
+                   object.differentialPrivacyClippingThreshold.high >>> 0)
+               .toNumber(true);
+   if (object.characterCoverage != null)
+     message.characterCoverage = Number(object.characterCoverage);
+   if (object.inputSentenceSize != null)
+     if ($util.Long)
+       (message.inputSentenceSize =
+            $util.Long.fromValue(object.inputSentenceSize))
+           .unsigned = true;
+     else if (typeof object.inputSentenceSize === 'string')
+       message.inputSentenceSize = parseInt(object.inputSentenceSize, 10);
+     else if (typeof object.inputSentenceSize ===
'number') + message.inputSentenceSize = object.inputSentenceSize; + else if (typeof object.inputSentenceSize === 'object') + message.inputSentenceSize = + new $util + .LongBits( + object.inputSentenceSize.low >>> 0, + object.inputSentenceSize.high >>> 0) + .toNumber(true); + if (object.shuffleInputSentence != null) + message.shuffleInputSentence = Boolean(object.shuffleInputSentence); + if (object.miningSentenceSize != null) + message.miningSentenceSize = object.miningSentenceSize | 0; + if (object.trainingSentenceSize != null) + message.trainingSentenceSize = object.trainingSentenceSize | 0; + if (object.seedSentencepieceSize != null) + message.seedSentencepieceSize = object.seedSentencepieceSize | 0; + if (object.shrinkingFactor != null) + message.shrinkingFactor = Number(object.shrinkingFactor); + if (object.maxSentenceLength != null) + message.maxSentenceLength = object.maxSentenceLength | 0; + if (object.numThreads != null) message.numThreads = object.numThreads | 0; + if (object.numSubIterations != null) + message.numSubIterations = object.numSubIterations | 0; + if (object.maxSentencepieceLength != null) + message.maxSentencepieceLength = object.maxSentencepieceLength | 0; + if (object.splitByUnicodeScript != null) + message.splitByUnicodeScript = Boolean(object.splitByUnicodeScript); + if (object.splitByNumber != null) + message.splitByNumber = Boolean(object.splitByNumber); + if (object.splitByWhitespace != null) + message.splitByWhitespace = Boolean(object.splitByWhitespace); + if (object.treatWhitespaceAsSuffix != null) + message.treatWhitespaceAsSuffix = + Boolean(object.treatWhitespaceAsSuffix); + if (object.allowWhitespaceOnlyPieces != null) + message.allowWhitespaceOnlyPieces = + Boolean(object.allowWhitespaceOnlyPieces); + if (object.splitDigits != null) + message.splitDigits = Boolean(object.splitDigits); + if (object.pretokenizationDelimiter != null) + message.pretokenizationDelimiter = + String(object.pretokenizationDelimiter); + if (object.controlSymbols) { + if (!Array.isArray(object.controlSymbols)) + throw TypeError( + '.sentencepiece.TrainerSpec.controlSymbols: array expected'); + message.controlSymbols = []; + for (let i = 0; i < object.controlSymbols.length; ++i) + message.controlSymbols[i] = String(object.controlSymbols[i]); + } + if (object.userDefinedSymbols) { + if (!Array.isArray(object.userDefinedSymbols)) + throw TypeError( + '.sentencepiece.TrainerSpec.userDefinedSymbols: array expected'); + message.userDefinedSymbols = []; + for (let i = 0; i < object.userDefinedSymbols.length; ++i) + message.userDefinedSymbols[i] = String(object.userDefinedSymbols[i]); + } + if (object.requiredChars != null) + message.requiredChars = String(object.requiredChars); + if (object.byteFallback != null) + message.byteFallback = Boolean(object.byteFallback); + if (object.vocabularyOutputPieceScore != null) + message.vocabularyOutputPieceScore = + Boolean(object.vocabularyOutputPieceScore); + if (object.hardVocabLimit != null) + message.hardVocabLimit = Boolean(object.hardVocabLimit); + if (object.useAllVocab != null) + message.useAllVocab = Boolean(object.useAllVocab); + if (object.unkId != null) message.unkId = object.unkId | 0; + if (object.bosId != null) message.bosId = object.bosId | 0; + if (object.eosId != null) message.eosId = object.eosId | 0; + if (object.padId != null) message.padId = object.padId | 0; + if (object.unkPiece != null) message.unkPiece = String(object.unkPiece); + if (object.bosPiece != null) message.bosPiece = String(object.bosPiece); + if 
+   (object.eosPiece != null) message.eosPiece = String(object.eosPiece);
+   if (object.padPiece != null) message.padPiece = String(object.padPiece);
+   if (object.unkSurface != null)
+     message.unkSurface = String(object.unkSurface);
+   if (object.trainExtremelyLargeCorpus != null)
+     message.trainExtremelyLargeCorpus =
+         Boolean(object.trainExtremelyLargeCorpus);
+   if (object.seedSentencepiecesFile != null)
+     message.seedSentencepiecesFile = String(object.seedSentencepiecesFile);
+   return message;
+ };
+
+ /**
+ * Creates a plain object from a TrainerSpec message. Also converts values
+ * to other types if specified.
+ * @function toObject
+ * @memberof sentencepiece.TrainerSpec
+ * @static
+ * @param {sentencepiece.TrainerSpec} message TrainerSpec
+ * @param {$protobuf.IConversionOptions} [options] Conversion options
+ * @returns {Object.<string,*>} Plain object
+ */
+ TrainerSpec.toObject = function toObject(message, options) {
+   if (!options) options = {};
+   let object = {};
+   if (options.arrays || options.defaults) {
+     object.input = [];
+     object.acceptLanguage = [];
+     object.controlSymbols = [];
+     object.userDefinedSymbols = [];
+   }
+   if (options.defaults) {
+     object.modelPrefix = '';
+     object.modelType = options.enums === String ? 'UNIGRAM' : 1;
+     object.vocabSize = 8000;
+     object.selfTestSampleSize = 0;
+     object.inputFormat = '';
+     object.characterCoverage = 0.9995;
+     if ($util.Long) {
+       let long = new $util.Long(0, 0, true);
+       object.inputSentenceSize = options.longs === String ?
+           long.toString() :
+           options.longs === Number ? long.toNumber() : long;
+     } else
+       object.inputSentenceSize = options.longs === String ? '0' : 0;
+     object.miningSentenceSize = 0;
+     object.trainingSentenceSize = 0;
+     object.seedSentencepieceSize = 1000000;
+     object.shrinkingFactor = 0.75;
+     object.numThreads = 16;
+     object.numSubIterations = 2;
+     object.maxSentenceLength = 4192;
+     object.shuffleInputSentence = true;
+     object.maxSentencepieceLength = 16;
+     object.splitByUnicodeScript = true;
+     object.splitByWhitespace = true;
+     object.splitByNumber = true;
+     object.treatWhitespaceAsSuffix = false;
+     object.splitDigits = false;
+     object.allowWhitespaceOnlyPieces = false;
+     object.vocabularyOutputPieceScore = true;
+     object.hardVocabLimit = true;
+     object.useAllVocab = false;
+     object.byteFallback = false;
+     object.requiredChars = '';
+     object.unkId = 0;
+     object.bosId = 1;
+     object.eosId = 2;
+     object.padId = -1;
+     object.unkSurface = ' \u2047 ';
+     object.unkPiece = '';
+     object.bosPiece = '';
+     object.eosPiece = '';
+     object.padPiece = '';
+     object.trainExtremelyLargeCorpus = false;
+     object.enableDifferentialPrivacy = false;
+     object.differentialPrivacyNoiseLevel = 0;
+     if ($util.Long) {
+       let long = new $util.Long(0, 0, true);
+       object.differentialPrivacyClippingThreshold =
+           options.longs === String ? long.toString() :
+           options.longs === Number ? long.toNumber() : long;
+     } else
+       object.differentialPrivacyClippingThreshold =
+           options.longs === String ? '0' : 0;
+     object.pretokenizationDelimiter = '';
+     object.seedSentencepiecesFile = '';
+   }
+   if (message.input && message.input.length) {
+     object.input = [];
+     for (let j = 0; j < message.input.length; ++j)
+       object.input[j] = message.input[j];
+   }
+   if (message.modelPrefix != null && message.hasOwnProperty('modelPrefix'))
+     object.modelPrefix = message.modelPrefix;
+   if (message.modelType != null && message.hasOwnProperty('modelType'))
+     object.modelType = options.enums === String ?
+         $root.sentencepiece.TrainerSpec.ModelType[message.modelType] ===
+             undefined ?
+ message.modelType : + $root.sentencepiece.TrainerSpec.ModelType[message.modelType] : + message.modelType; + if (message.vocabSize != null && message.hasOwnProperty('vocabSize')) + object.vocabSize = message.vocabSize; + if (message.acceptLanguage && message.acceptLanguage.length) { + object.acceptLanguage = []; + for (let j = 0; j < message.acceptLanguage.length; ++j) + object.acceptLanguage[j] = message.acceptLanguage[j]; + } + if (message.selfTestSampleSize != null && + message.hasOwnProperty('selfTestSampleSize')) + object.selfTestSampleSize = message.selfTestSampleSize; + if (message.inputFormat != null && message.hasOwnProperty('inputFormat')) + object.inputFormat = message.inputFormat; + if (message.characterCoverage != null && + message.hasOwnProperty('characterCoverage')) + object.characterCoverage = + options.json && !isFinite(message.characterCoverage) ? + String(message.characterCoverage) : + message.characterCoverage; + if (message.inputSentenceSize != null && + message.hasOwnProperty('inputSentenceSize')) + if (typeof message.inputSentenceSize === 'number') + object.inputSentenceSize = options.longs === String ? + String(message.inputSentenceSize) : + message.inputSentenceSize; + else + object.inputSentenceSize = options.longs === String ? + $util.Long.prototype.toString.call(message.inputSentenceSize) : + options.longs === Number ? + new $util + .LongBits( + message.inputSentenceSize.low >>> 0, + message.inputSentenceSize.high >>> 0) + .toNumber(true) : + message.inputSentenceSize; + if (message.miningSentenceSize != null && + message.hasOwnProperty('miningSentenceSize')) + object.miningSentenceSize = message.miningSentenceSize; + if (message.trainingSentenceSize != null && + message.hasOwnProperty('trainingSentenceSize')) + object.trainingSentenceSize = message.trainingSentenceSize; + if (message.seedSentencepieceSize != null && + message.hasOwnProperty('seedSentencepieceSize')) + object.seedSentencepieceSize = message.seedSentencepieceSize; + if (message.shrinkingFactor != null && + message.hasOwnProperty('shrinkingFactor')) + object.shrinkingFactor = + options.json && !isFinite(message.shrinkingFactor) ? 
+ String(message.shrinkingFactor) : + message.shrinkingFactor; + if (message.numThreads != null && message.hasOwnProperty('numThreads')) + object.numThreads = message.numThreads; + if (message.numSubIterations != null && + message.hasOwnProperty('numSubIterations')) + object.numSubIterations = message.numSubIterations; + if (message.maxSentenceLength != null && + message.hasOwnProperty('maxSentenceLength')) + object.maxSentenceLength = message.maxSentenceLength; + if (message.shuffleInputSentence != null && + message.hasOwnProperty('shuffleInputSentence')) + object.shuffleInputSentence = message.shuffleInputSentence; + if (message.maxSentencepieceLength != null && + message.hasOwnProperty('maxSentencepieceLength')) + object.maxSentencepieceLength = message.maxSentencepieceLength; + if (message.splitByUnicodeScript != null && + message.hasOwnProperty('splitByUnicodeScript')) + object.splitByUnicodeScript = message.splitByUnicodeScript; + if (message.splitByWhitespace != null && + message.hasOwnProperty('splitByWhitespace')) + object.splitByWhitespace = message.splitByWhitespace; + if (message.splitByNumber != null && + message.hasOwnProperty('splitByNumber')) + object.splitByNumber = message.splitByNumber; + if (message.treatWhitespaceAsSuffix != null && + message.hasOwnProperty('treatWhitespaceAsSuffix')) + object.treatWhitespaceAsSuffix = message.treatWhitespaceAsSuffix; + if (message.splitDigits != null && message.hasOwnProperty('splitDigits')) + object.splitDigits = message.splitDigits; + if (message.allowWhitespaceOnlyPieces != null && + message.hasOwnProperty('allowWhitespaceOnlyPieces')) + object.allowWhitespaceOnlyPieces = message.allowWhitespaceOnlyPieces; + if (message.controlSymbols && message.controlSymbols.length) { + object.controlSymbols = []; + for (let j = 0; j < message.controlSymbols.length; ++j) + object.controlSymbols[j] = message.controlSymbols[j]; + } + if (message.userDefinedSymbols && message.userDefinedSymbols.length) { + object.userDefinedSymbols = []; + for (let j = 0; j < message.userDefinedSymbols.length; ++j) + object.userDefinedSymbols[j] = message.userDefinedSymbols[j]; + } + if (message.vocabularyOutputPieceScore != null && + message.hasOwnProperty('vocabularyOutputPieceScore')) + object.vocabularyOutputPieceScore = message.vocabularyOutputPieceScore; + if (message.hardVocabLimit != null && + message.hasOwnProperty('hardVocabLimit')) + object.hardVocabLimit = message.hardVocabLimit; + if (message.useAllVocab != null && message.hasOwnProperty('useAllVocab')) + object.useAllVocab = message.useAllVocab; + if (message.byteFallback != null && + message.hasOwnProperty('byteFallback')) + object.byteFallback = message.byteFallback; + if (message.requiredChars != null && + message.hasOwnProperty('requiredChars')) + object.requiredChars = message.requiredChars; + if (message.unkId != null && message.hasOwnProperty('unkId')) + object.unkId = message.unkId; + if (message.bosId != null && message.hasOwnProperty('bosId')) + object.bosId = message.bosId; + if (message.eosId != null && message.hasOwnProperty('eosId')) + object.eosId = message.eosId; + if (message.padId != null && message.hasOwnProperty('padId')) + object.padId = message.padId; + if (message.unkSurface != null && message.hasOwnProperty('unkSurface')) + object.unkSurface = message.unkSurface; + if (message.unkPiece != null && message.hasOwnProperty('unkPiece')) + object.unkPiece = message.unkPiece; + if (message.bosPiece != null && message.hasOwnProperty('bosPiece')) + object.bosPiece = message.bosPiece; + 
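+   // Hand-written note (not pbjs output): each remaining field below is
+   // copied onto the plain object only when it is actually set on the
+   // message (proto2 field-presence semantics).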
+   if (message.eosPiece != null && message.hasOwnProperty('eosPiece'))
+     object.eosPiece = message.eosPiece;
+   if (message.padPiece != null && message.hasOwnProperty('padPiece'))
+     object.padPiece = message.padPiece;
+   if (message.trainExtremelyLargeCorpus != null &&
+       message.hasOwnProperty('trainExtremelyLargeCorpus'))
+     object.trainExtremelyLargeCorpus = message.trainExtremelyLargeCorpus;
+   if (message.enableDifferentialPrivacy != null &&
+       message.hasOwnProperty('enableDifferentialPrivacy'))
+     object.enableDifferentialPrivacy = message.enableDifferentialPrivacy;
+   if (message.differentialPrivacyNoiseLevel != null &&
+       message.hasOwnProperty('differentialPrivacyNoiseLevel'))
+     object.differentialPrivacyNoiseLevel =
+         options.json && !isFinite(message.differentialPrivacyNoiseLevel) ?
+         String(message.differentialPrivacyNoiseLevel) :
+         message.differentialPrivacyNoiseLevel;
+   if (message.differentialPrivacyClippingThreshold != null &&
+       message.hasOwnProperty('differentialPrivacyClippingThreshold'))
+     if (typeof message.differentialPrivacyClippingThreshold === 'number')
+       object.differentialPrivacyClippingThreshold =
+           options.longs === String ?
+           String(message.differentialPrivacyClippingThreshold) :
+           message.differentialPrivacyClippingThreshold;
+     else
+       object.differentialPrivacyClippingThreshold =
+           options.longs === String ?
+           $util.Long.prototype.toString.call(
+               message.differentialPrivacyClippingThreshold) :
+           options.longs === Number ?
+           new $util
+               .LongBits(
+                   message.differentialPrivacyClippingThreshold.low >>> 0,
+                   message.differentialPrivacyClippingThreshold.high >>> 0)
+               .toNumber(true) :
+           message.differentialPrivacyClippingThreshold;
+   if (message.pretokenizationDelimiter != null &&
+       message.hasOwnProperty('pretokenizationDelimiter'))
+     object.pretokenizationDelimiter = message.pretokenizationDelimiter;
+   if (message.seedSentencepiecesFile != null &&
+       message.hasOwnProperty('seedSentencepiecesFile'))
+     object.seedSentencepiecesFile = message.seedSentencepiecesFile;
+   return object;
+ };
+
+ /**
+ * Converts this TrainerSpec to JSON.
+ * @function toJSON
+ * @memberof sentencepiece.TrainerSpec
+ * @instance
+ * @returns {Object.<string,*>} JSON object
+ */
+ TrainerSpec.prototype.toJSON = function toJSON() {
+   return this.constructor.toObject(this, $protobuf.util.toJSONOptions);
+ };
+
+ /**
+ * Gets the default type url for TrainerSpec
+ * @function getTypeUrl
+ * @memberof sentencepiece.TrainerSpec
+ * @static
+ * @param {string} [typeUrlPrefix] your custom typeUrlPrefix(default
+ *     "type.googleapis.com")
+ * @returns {string} The default type url
+ */
+ TrainerSpec.getTypeUrl = function getTypeUrl(typeUrlPrefix) {
+   if (typeUrlPrefix === undefined) {
+     typeUrlPrefix = 'type.googleapis.com';
+   }
+   return typeUrlPrefix + '/sentencepiece.TrainerSpec';
+ };
+
+ /**
+ * ModelType enum.
+ * @name sentencepiece.TrainerSpec.ModelType
+ * @enum {number}
+ * @property {number} UNIGRAM=1 UNIGRAM value
+ * @property {number} BPE=2 BPE value
+ * @property {number} WORD=3 WORD value
+ * @property {number} CHAR=4 CHAR value
+ */
+ TrainerSpec.ModelType = (function() {
+   const valuesById = {}, values = Object.create(valuesById);
+   values[valuesById[1] = 'UNIGRAM'] = 1;
+   values[valuesById[2] = 'BPE'] = 2;
+   values[valuesById[3] = 'WORD'] = 3;
+   values[valuesById[4] = 'CHAR'] = 4;
+   return values;
+ })();
+
+ return TrainerSpec;
+ })();
+
+ sentencepiece.NormalizerSpec = (function() {
+ /**
+ * Properties of a NormalizerSpec.
+ * @memberof sentencepiece + * @interface INormalizerSpec + * @property {string|null} [name] NormalizerSpec name + * @property {Uint8Array|null} [precompiledCharsmap] NormalizerSpec + * precompiledCharsmap + * @property {boolean|null} [addDummyPrefix] NormalizerSpec addDummyPrefix + * @property {boolean|null} [removeExtraWhitespaces] NormalizerSpec + * removeExtraWhitespaces + * @property {boolean|null} [escapeWhitespaces] NormalizerSpec + * escapeWhitespaces + * @property {string|null} [normalizationRuleTsv] NormalizerSpec + * normalizationRuleTsv + */ + + /** + * Constructs a new NormalizerSpec. + * @memberof sentencepiece + * @classdesc Represents a NormalizerSpec. + * @implements INormalizerSpec + * @constructor + * @param {sentencepiece.INormalizerSpec=} [properties] Properties to set + */ + function NormalizerSpec(properties) { + if (properties) + for (let keys = Object.keys(properties), i = 0; i < keys.length; ++i) + if (properties[keys[i]] != null) this[keys[i]] = properties[keys[i]]; + } + + /** + * NormalizerSpec name. + * @member {string} name + * @memberof sentencepiece.NormalizerSpec + * @instance + */ + NormalizerSpec.prototype.name = ''; + + /** + * NormalizerSpec precompiledCharsmap. + * @member {Uint8Array} precompiledCharsmap + * @memberof sentencepiece.NormalizerSpec + * @instance + */ + NormalizerSpec.prototype.precompiledCharsmap = $util.newBuffer([]); + + /** + * NormalizerSpec addDummyPrefix. + * @member {boolean} addDummyPrefix + * @memberof sentencepiece.NormalizerSpec + * @instance + */ + NormalizerSpec.prototype.addDummyPrefix = true; + + /** + * NormalizerSpec removeExtraWhitespaces. + * @member {boolean} removeExtraWhitespaces + * @memberof sentencepiece.NormalizerSpec + * @instance + */ + NormalizerSpec.prototype.removeExtraWhitespaces = true; + + /** + * NormalizerSpec escapeWhitespaces. + * @member {boolean} escapeWhitespaces + * @memberof sentencepiece.NormalizerSpec + * @instance + */ + NormalizerSpec.prototype.escapeWhitespaces = true; + + /** + * NormalizerSpec normalizationRuleTsv. + * @member {string} normalizationRuleTsv + * @memberof sentencepiece.NormalizerSpec + * @instance + */ + NormalizerSpec.prototype.normalizationRuleTsv = ''; + + /** + * Creates a new NormalizerSpec instance using the specified properties. + * @function create + * @memberof sentencepiece.NormalizerSpec + * @static + * @param {sentencepiece.INormalizerSpec=} [properties] Properties to set + * @returns {sentencepiece.NormalizerSpec} NormalizerSpec instance + */ + NormalizerSpec.create = function create(properties) { + return new NormalizerSpec(properties); + }; + + /** + * Encodes the specified NormalizerSpec message. Does not implicitly {@link + * sentencepiece.NormalizerSpec.verify|verify} messages. 
+ * @function encode + * @memberof sentencepiece.NormalizerSpec + * @static + * @param {sentencepiece.INormalizerSpec} message NormalizerSpec message or + * plain object to encode + * @param {$protobuf.Writer} [writer] Writer to encode to + * @returns {$protobuf.Writer} Writer + */ + NormalizerSpec.encode = function encode(message, writer) { + if (!writer) writer = $Writer.create(); + if (message.name != null && Object.hasOwnProperty.call(message, 'name')) + writer.uint32(/* id 1, wireType 2 =*/ 10).string(message.name); + if (message.precompiledCharsmap != null && + Object.hasOwnProperty.call(message, 'precompiledCharsmap')) + writer.uint32(/* id 2, wireType 2 =*/ 18) + .bytes(message.precompiledCharsmap); + if (message.addDummyPrefix != null && + Object.hasOwnProperty.call(message, 'addDummyPrefix')) + writer.uint32(/* id 3, wireType 0 =*/ 24).bool(message.addDummyPrefix); + if (message.removeExtraWhitespaces != null && + Object.hasOwnProperty.call(message, 'removeExtraWhitespaces')) + writer.uint32(/* id 4, wireType 0 =*/ 32) + .bool(message.removeExtraWhitespaces); + if (message.escapeWhitespaces != null && + Object.hasOwnProperty.call(message, 'escapeWhitespaces')) + writer.uint32(/* id 5, wireType 0 =*/ 40) + .bool(message.escapeWhitespaces); + if (message.normalizationRuleTsv != null && + Object.hasOwnProperty.call(message, 'normalizationRuleTsv')) + writer.uint32(/* id 6, wireType 2 =*/ 50) + .string(message.normalizationRuleTsv); + return writer; + }; + + /** + * Encodes the specified NormalizerSpec message, length delimited. Does not + * implicitly {@link sentencepiece.NormalizerSpec.verify|verify} messages. + * @function encodeDelimited + * @memberof sentencepiece.NormalizerSpec + * @static + * @param {sentencepiece.INormalizerSpec} message NormalizerSpec message or + * plain object to encode + * @param {$protobuf.Writer} [writer] Writer to encode to + * @returns {$protobuf.Writer} Writer + */ + NormalizerSpec.encodeDelimited = function encodeDelimited(message, writer) { + return this.encode(message, writer).ldelim(); + }; + + /** + * Decodes a NormalizerSpec message from the specified reader or buffer. + * @function decode + * @memberof sentencepiece.NormalizerSpec + * @static + * @param {$protobuf.Reader|Uint8Array} reader Reader or buffer to decode + * from + * @param {number} [length] Message length if known beforehand + * @returns {sentencepiece.NormalizerSpec} NormalizerSpec + * @throws {Error} If the payload is not a reader or valid buffer + * @throws {$protobuf.util.ProtocolError} If required fields are missing + */ + NormalizerSpec.decode = function decode(reader, length, error) { + if (!(reader instanceof $Reader)) reader = $Reader.create(reader); + let end = length === undefined ? reader.len : reader.pos + length, + message = new $root.sentencepiece.NormalizerSpec(); + while (reader.pos < end) { + let tag = reader.uint32(); + if (tag === error) break; + switch (tag >>> 3) { + case 1: { + message.name = reader.string(); + break; + } + case 2: { + message.precompiledCharsmap = reader.bytes(); + break; + } + case 3: { + message.addDummyPrefix = reader.bool(); + break; + } + case 4: { + message.removeExtraWhitespaces = reader.bool(); + break; + } + case 5: { + message.escapeWhitespaces = reader.bool(); + break; + } + case 6: { + message.normalizationRuleTsv = reader.string(); + break; + } + default: + reader.skipType(tag & 7); + break; + } + } + return message; + }; + + /** + * Decodes a NormalizerSpec message from the specified reader or buffer, + * length delimited. 
+ * @function decodeDelimited
+ * @memberof sentencepiece.NormalizerSpec
+ * @static
+ * @param {$protobuf.Reader|Uint8Array} reader Reader or buffer to decode
+ *     from
+ * @returns {sentencepiece.NormalizerSpec} NormalizerSpec
+ * @throws {Error} If the payload is not a reader or valid buffer
+ * @throws {$protobuf.util.ProtocolError} If required fields are missing
+ */
+ NormalizerSpec.decodeDelimited = function decodeDelimited(reader) {
+   if (!(reader instanceof $Reader)) reader = new $Reader(reader);
+   return this.decode(reader, reader.uint32());
+ };
+
+ /**
+ * Verifies a NormalizerSpec message.
+ * @function verify
+ * @memberof sentencepiece.NormalizerSpec
+ * @static
+ * @param {Object.<string,*>} message Plain object to verify
+ * @returns {string|null} `null` if valid, otherwise the reason why it is
+ *     not
+ */
+ NormalizerSpec.verify = function verify(message) {
+   if (typeof message !== 'object' || message === null)
+     return 'object expected';
+   if (message.name != null && message.hasOwnProperty('name'))
+     if (!$util.isString(message.name)) return 'name: string expected';
+   if (message.precompiledCharsmap != null &&
+       message.hasOwnProperty('precompiledCharsmap'))
+     if (!(message.precompiledCharsmap &&
+               typeof message.precompiledCharsmap.length === 'number' ||
+           $util.isString(message.precompiledCharsmap)))
+       return 'precompiledCharsmap: buffer expected';
+   if (message.addDummyPrefix != null &&
+       message.hasOwnProperty('addDummyPrefix'))
+     if (typeof message.addDummyPrefix !== 'boolean')
+       return 'addDummyPrefix: boolean expected';
+   if (message.removeExtraWhitespaces != null &&
+       message.hasOwnProperty('removeExtraWhitespaces'))
+     if (typeof message.removeExtraWhitespaces !== 'boolean')
+       return 'removeExtraWhitespaces: boolean expected';
+   if (message.escapeWhitespaces != null &&
+       message.hasOwnProperty('escapeWhitespaces'))
+     if (typeof message.escapeWhitespaces !== 'boolean')
+       return 'escapeWhitespaces: boolean expected';
+   if (message.normalizationRuleTsv != null &&
+       message.hasOwnProperty('normalizationRuleTsv'))
+     if (!$util.isString(message.normalizationRuleTsv))
+       return 'normalizationRuleTsv: string expected';
+   return null;
+ };
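+
+ // Usage sketch (hand-written, not pbjs output): verify() reports the first
+ // structural problem as a string, or null for a valid plain object:
+ //
+ //   sentencepiece.NormalizerSpec.verify({name: 'nmt_nfkc'});  // null
+ //   sentencepiece.NormalizerSpec.verify({addDummyPrefix: 1});
+ //   // -> 'addDummyPrefix: boolean expected'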
+  /**
+   * Creates a NormalizerSpec message from a plain object. Also converts
+   * values to their respective internal types.
+   * @function fromObject
+   * @memberof sentencepiece.NormalizerSpec
+   * @static
+   * @param {Object.<string,*>} object Plain object
+   * @returns {sentencepiece.NormalizerSpec} NormalizerSpec
+   */
+  NormalizerSpec.fromObject = function fromObject(object) {
+    if (object instanceof $root.sentencepiece.NormalizerSpec) return object;
+    let message = new $root.sentencepiece.NormalizerSpec();
+    if (object.name != null) message.name = String(object.name);
+    if (object.precompiledCharsmap != null)
+      if (typeof object.precompiledCharsmap === 'string')
+        $util.base64.decode(
+            object.precompiledCharsmap,
+            message.precompiledCharsmap = $util.newBuffer(
+                $util.base64.length(object.precompiledCharsmap)),
+            0);
+      else if (object.precompiledCharsmap.length >= 0)
+        message.precompiledCharsmap = object.precompiledCharsmap;
+    if (object.addDummyPrefix != null)
+      message.addDummyPrefix = Boolean(object.addDummyPrefix);
+    if (object.removeExtraWhitespaces != null)
+      message.removeExtraWhitespaces = Boolean(object.removeExtraWhitespaces);
+    if (object.escapeWhitespaces != null)
+      message.escapeWhitespaces = Boolean(object.escapeWhitespaces);
+    if (object.normalizationRuleTsv != null)
+      message.normalizationRuleTsv = String(object.normalizationRuleTsv);
+    return message;
+  };
+
+  /**
+   * Creates a plain object from a NormalizerSpec message. Also converts
+   * values to other types if specified.
+   * @function toObject
+   * @memberof sentencepiece.NormalizerSpec
+   * @static
+   * @param {sentencepiece.NormalizerSpec} message NormalizerSpec
+   * @param {$protobuf.IConversionOptions} [options] Conversion options
+   * @returns {Object.<string,*>} Plain object
+   */
+  NormalizerSpec.toObject = function toObject(message, options) {
+    if (!options) options = {};
+    let object = {};
+    if (options.defaults) {
+      object.name = '';
+      if (options.bytes === String)
+        object.precompiledCharsmap = '';
+      else {
+        object.precompiledCharsmap = [];
+        if (options.bytes !== Array)
+          object.precompiledCharsmap =
+              $util.newBuffer(object.precompiledCharsmap);
+      }
+      object.addDummyPrefix = true;
+      object.removeExtraWhitespaces = true;
+      object.escapeWhitespaces = true;
+      object.normalizationRuleTsv = '';
+    }
+    if (message.name != null && message.hasOwnProperty('name'))
+      object.name = message.name;
+    if (message.precompiledCharsmap != null &&
+        message.hasOwnProperty('precompiledCharsmap'))
+      object.precompiledCharsmap = options.bytes === String ?
+          $util.base64.encode(
+              message.precompiledCharsmap, 0,
+              message.precompiledCharsmap.length) :
+          options.bytes === Array ?
+          Array.prototype.slice.call(message.precompiledCharsmap) :
+          message.precompiledCharsmap;
+    if (message.addDummyPrefix != null &&
+        message.hasOwnProperty('addDummyPrefix'))
+      object.addDummyPrefix = message.addDummyPrefix;
+    if (message.removeExtraWhitespaces != null &&
+        message.hasOwnProperty('removeExtraWhitespaces'))
+      object.removeExtraWhitespaces = message.removeExtraWhitespaces;
+    if (message.escapeWhitespaces != null &&
+        message.hasOwnProperty('escapeWhitespaces'))
+      object.escapeWhitespaces = message.escapeWhitespaces;
+    if (message.normalizationRuleTsv != null &&
+        message.hasOwnProperty('normalizationRuleTsv'))
+      object.normalizationRuleTsv = message.normalizationRuleTsv;
+    return object;
+  };
+
+  /**
+   * Converts this NormalizerSpec to JSON.
+   * @function toJSON
+   * @memberof sentencepiece.NormalizerSpec
+   * @instance
+   * @returns {Object.<string,*>} JSON object
+   */
+  NormalizerSpec.prototype.toJSON = function toJSON() {
+    return this.constructor.toObject(this, $protobuf.util.toJSONOptions);
+  };
+
+  /**
+   * Gets the default type url for NormalizerSpec
+   * @function getTypeUrl
+   * @memberof sentencepiece.NormalizerSpec
+   * @static
+   * @param {string} [typeUrlPrefix] your custom typeUrlPrefix(default
+   *     "type.googleapis.com")
+   * @returns {string} The default type url
+   */
+  NormalizerSpec.getTypeUrl = function getTypeUrl(typeUrlPrefix) {
+    if (typeUrlPrefix === undefined) {
+      typeUrlPrefix = 'type.googleapis.com';
+    }
+    return typeUrlPrefix + '/sentencepiece.NormalizerSpec';
+  };
+
+  return NormalizerSpec;
+})();
+
+sentencepiece.SelfTestData = (function() {
+  /**
+   * Properties of a SelfTestData.
+   * @memberof sentencepiece
+   * @interface ISelfTestData
+   * @property {Array.<sentencepiece.SelfTestData.ISample>|null} [samples]
+   *     SelfTestData samples
+   */
+
+  /**
+   * Constructs a new SelfTestData.
+   * @memberof sentencepiece
+   * @classdesc Represents a SelfTestData.
+   * @implements ISelfTestData
+   * @constructor
+   * @param {sentencepiece.ISelfTestData=} [properties] Properties to set
+   */
+  function SelfTestData(properties) {
+    this.samples = [];
+    if (properties)
+      for (let keys = Object.keys(properties), i = 0; i < keys.length; ++i)
+        if (properties[keys[i]] != null) this[keys[i]] = properties[keys[i]];
+  }
+
+  /**
+   * SelfTestData samples.
+   * @member {Array.<sentencepiece.SelfTestData.ISample>} samples
+   * @memberof sentencepiece.SelfTestData
+   * @instance
+   */
+  SelfTestData.prototype.samples = $util.emptyArray;
+
+  /**
+   * Creates a new SelfTestData instance using the specified properties.
+   * @function create
+   * @memberof sentencepiece.SelfTestData
+   * @static
+   * @param {sentencepiece.ISelfTestData=} [properties] Properties to set
+   * @returns {sentencepiece.SelfTestData} SelfTestData instance
+   */
+  SelfTestData.create = function create(properties) {
+    return new SelfTestData(properties);
+  };
+
+  /**
+   * Encodes the specified SelfTestData message. Does not implicitly {@link
+   * sentencepiece.SelfTestData.verify|verify} messages.
+   * @function encode
+   * @memberof sentencepiece.SelfTestData
+   * @static
+   * @param {sentencepiece.ISelfTestData} message SelfTestData message or
+   *     plain object to encode
+   * @param {$protobuf.Writer} [writer] Writer to encode to
+   * @returns {$protobuf.Writer} Writer
+   */
+  SelfTestData.encode = function encode(message, writer) {
+    if (!writer) writer = $Writer.create();
+    if (message.samples != null && message.samples.length)
+      for (let i = 0; i < message.samples.length; ++i)
+        $root.sentencepiece.SelfTestData.Sample
+            .encode(
+                message.samples[i],
+                writer.uint32(/* id 1, wireType 2 =*/ 10).fork())
+            .ldelim();
+    return writer;
+  };
+
+  /**
+   * Encodes the specified SelfTestData message, length delimited. Does not
+   * implicitly {@link sentencepiece.SelfTestData.verify|verify} messages.
+   * @function encodeDelimited
+   * @memberof sentencepiece.SelfTestData
+   * @static
+   * @param {sentencepiece.ISelfTestData} message SelfTestData message or
+   *     plain object to encode
+   * @param {$protobuf.Writer} [writer] Writer to encode to
+   * @returns {$protobuf.Writer} Writer
+   */
+  SelfTestData.encodeDelimited = function encodeDelimited(message, writer) {
+    return this.encode(message, writer).ldelim();
+  };
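`encodeDelimited`/`decodeDelimited` differ from plain `encode`/`decode` only in that each message is prefixed with its varint length, so several messages can share one buffer. A hedged sketch of the round trip (import paths illustrative; `protobufjs/minimal` is the `$protobuf` runtime this file is generated against):

```ts
import * as $protobuf from 'protobufjs/minimal';
import spm from './sentencepiece_model.js';

// Write two length-prefixed NormalizerSpec messages into one buffer.
const writer = spm.sentencepiece.NormalizerSpec.encodeDelimited({name: 'nmt'});
spm.sentencepiece.NormalizerSpec.encodeDelimited({name: 'identity'}, writer);
const buffer = writer.finish();

// Read them back in order; the reader advances past each length prefix.
const reader = $protobuf.Reader.create(buffer);
const first = spm.sentencepiece.NormalizerSpec.decodeDelimited(reader);
const second = spm.sentencepiece.NormalizerSpec.decodeDelimited(reader);
// first.name === 'nmt' && second.name === 'identity'
```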
+  /**
+   * Decodes a SelfTestData message from the specified reader or buffer.
+   * @function decode
+   * @memberof sentencepiece.SelfTestData
+   * @static
+   * @param {$protobuf.Reader|Uint8Array} reader Reader or buffer to decode
+   *     from
+   * @param {number} [length] Message length if known beforehand
+   * @returns {sentencepiece.SelfTestData} SelfTestData
+   * @throws {Error} If the payload is not a reader or valid buffer
+   * @throws {$protobuf.util.ProtocolError} If required fields are missing
+   */
+  SelfTestData.decode = function decode(reader, length, error) {
+    if (!(reader instanceof $Reader)) reader = $Reader.create(reader);
+    let end = length === undefined ? reader.len : reader.pos + length,
+        message = new $root.sentencepiece.SelfTestData();
+    while (reader.pos < end) {
+      let tag = reader.uint32();
+      if (tag === error) break;
+      switch (tag >>> 3) {
+        case 1: {
+          if (!(message.samples && message.samples.length))
+            message.samples = [];
+          message.samples.push($root.sentencepiece.SelfTestData.Sample.decode(
+              reader, reader.uint32()));
+          break;
+        }
+        default:
+          reader.skipType(tag & 7);
+          break;
+      }
+    }
+    return message;
+  };
+
+  /**
+   * Decodes a SelfTestData message from the specified reader or buffer,
+   * length delimited.
+   * @function decodeDelimited
+   * @memberof sentencepiece.SelfTestData
+   * @static
+   * @param {$protobuf.Reader|Uint8Array} reader Reader or buffer to decode
+   *     from
+   * @returns {sentencepiece.SelfTestData} SelfTestData
+   * @throws {Error} If the payload is not a reader or valid buffer
+   * @throws {$protobuf.util.ProtocolError} If required fields are missing
+   */
+  SelfTestData.decodeDelimited = function decodeDelimited(reader) {
+    if (!(reader instanceof $Reader)) reader = new $Reader(reader);
+    return this.decode(reader, reader.uint32());
+  };
+
+  /**
+   * Verifies a SelfTestData message.
+   * @function verify
+   * @memberof sentencepiece.SelfTestData
+   * @static
+   * @param {Object.<string,*>} message Plain object to verify
+   * @returns {string|null} `null` if valid, otherwise the reason why it is
+   *     not
+   */
+  SelfTestData.verify = function verify(message) {
+    if (typeof message !== 'object' || message === null)
+      return 'object expected';
+    if (message.samples != null && message.hasOwnProperty('samples')) {
+      if (!Array.isArray(message.samples)) return 'samples: array expected';
+      for (let i = 0; i < message.samples.length; ++i) {
+        let error = $root.sentencepiece.SelfTestData.Sample.verify(
+            message.samples[i]);
+        if (error) return 'samples.' + error;
+      }
+    }
+    return null;
+  };
+
+  /**
+   * Creates a SelfTestData message from a plain object. Also converts values
+   * to their respective internal types.
+   * @function fromObject
+   * @memberof sentencepiece.SelfTestData
+   * @static
+   * @param {Object.<string,*>} object Plain object
+   * @returns {sentencepiece.SelfTestData} SelfTestData
+   */
+  SelfTestData.fromObject = function fromObject(object) {
+    if (object instanceof $root.sentencepiece.SelfTestData) return object;
+    let message = new $root.sentencepiece.SelfTestData();
+    if (object.samples) {
+      if (!Array.isArray(object.samples))
+        throw TypeError(
+            '.sentencepiece.SelfTestData.samples: array expected');
+      message.samples = [];
+      for (let i = 0; i < object.samples.length; ++i) {
+        if (typeof object.samples[i] !== 'object')
+          throw TypeError(
+              '.sentencepiece.SelfTestData.samples: object expected');
+        message.samples[i] =
+            $root.sentencepiece.SelfTestData.Sample.fromObject(
+                object.samples[i]);
+      }
+    }
+    return message;
+  };
+
+  /**
+   * Creates a plain object from a SelfTestData message. Also converts values
+   * to other types if specified.
+   * @function toObject
+   * @memberof sentencepiece.SelfTestData
+   * @static
+   * @param {sentencepiece.SelfTestData} message SelfTestData
+   * @param {$protobuf.IConversionOptions} [options] Conversion options
+   * @returns {Object.<string,*>} Plain object
+   */
+  SelfTestData.toObject = function toObject(message, options) {
+    if (!options) options = {};
+    let object = {};
+    if (options.arrays || options.defaults) object.samples = [];
+    if (message.samples && message.samples.length) {
+      object.samples = [];
+      for (let j = 0; j < message.samples.length; ++j)
+        object.samples[j] = $root.sentencepiece.SelfTestData.Sample.toObject(
+            message.samples[j], options);
+    }
+    return object;
+  };
+
+  /**
+   * Converts this SelfTestData to JSON.
+   * @function toJSON
+   * @memberof sentencepiece.SelfTestData
+   * @instance
+   * @returns {Object.<string,*>} JSON object
+   */
+  SelfTestData.prototype.toJSON = function toJSON() {
+    return this.constructor.toObject(this, $protobuf.util.toJSONOptions);
+  };
+
+  /**
+   * Gets the default type url for SelfTestData
+   * @function getTypeUrl
+   * @memberof sentencepiece.SelfTestData
+   * @static
+   * @param {string} [typeUrlPrefix] your custom typeUrlPrefix(default
+   *     "type.googleapis.com")
+   * @returns {string} The default type url
+   */
+  SelfTestData.getTypeUrl = function getTypeUrl(typeUrlPrefix) {
+    if (typeUrlPrefix === undefined) {
+      typeUrlPrefix = 'type.googleapis.com';
+    }
+    return typeUrlPrefix + '/sentencepiece.SelfTestData';
+  };
+
+  SelfTestData.Sample = (function() {
+    /**
+     * Properties of a Sample.
+     * @memberof sentencepiece.SelfTestData
+     * @interface ISample
+     * @property {string|null} [input] Sample input
+     * @property {string|null} [expected] Sample expected
+     */
+
+    /**
+     * Constructs a new Sample.
+     * @memberof sentencepiece.SelfTestData
+     * @classdesc Represents a Sample.
+     * @implements ISample
+     * @constructor
+     * @param {sentencepiece.SelfTestData.ISample=} [properties] Properties to
+     *     set
+     */
+    function Sample(properties) {
+      if (properties)
+        for (let keys = Object.keys(properties), i = 0; i < keys.length; ++i)
+          if (properties[keys[i]] != null)
+            this[keys[i]] = properties[keys[i]];
+    }
+
+    /**
+     * Sample input.
+     * @member {string} input
+     * @memberof sentencepiece.SelfTestData.Sample
+     * @instance
+     */
+    Sample.prototype.input = '';
+
+    /**
+     * Sample expected.
+     * @member {string} expected
+     * @memberof sentencepiece.SelfTestData.Sample
+     * @instance
+     */
+    Sample.prototype.expected = '';
+
+    /**
+     * Creates a new Sample instance using the specified properties.
+     * @function create
+     * @memberof sentencepiece.SelfTestData.Sample
+     * @static
+     * @param {sentencepiece.SelfTestData.ISample=} [properties] Properties to
+     *     set
+     * @returns {sentencepiece.SelfTestData.Sample} Sample instance
+     */
+    Sample.create = function create(properties) {
+      return new Sample(properties);
+    };
+
+    /**
+     * Encodes the specified Sample message. Does not implicitly {@link
+     * sentencepiece.SelfTestData.Sample.verify|verify} messages.
+ * @function encode + * @memberof sentencepiece.SelfTestData.Sample + * @static + * @param {sentencepiece.SelfTestData.ISample} message Sample message or + * plain object to encode + * @param {$protobuf.Writer} [writer] Writer to encode to + * @returns {$protobuf.Writer} Writer + */ + Sample.encode = function encode(message, writer) { + if (!writer) writer = $Writer.create(); + if (message.input != null && + Object.hasOwnProperty.call(message, 'input')) + writer.uint32(/* id 1, wireType 2 =*/ 10).string(message.input); + if (message.expected != null && + Object.hasOwnProperty.call(message, 'expected')) + writer.uint32(/* id 2, wireType 2 =*/ 18).string(message.expected); + return writer; + }; + + /** + * Encodes the specified Sample message, length delimited. Does not + * implicitly {@link sentencepiece.SelfTestData.Sample.verify|verify} + * messages. + * @function encodeDelimited + * @memberof sentencepiece.SelfTestData.Sample + * @static + * @param {sentencepiece.SelfTestData.ISample} message Sample message or + * plain object to encode + * @param {$protobuf.Writer} [writer] Writer to encode to + * @returns {$protobuf.Writer} Writer + */ + Sample.encodeDelimited = function encodeDelimited(message, writer) { + return this.encode(message, writer).ldelim(); + }; + + /** + * Decodes a Sample message from the specified reader or buffer. + * @function decode + * @memberof sentencepiece.SelfTestData.Sample + * @static + * @param {$protobuf.Reader|Uint8Array} reader Reader or buffer to decode + * from + * @param {number} [length] Message length if known beforehand + * @returns {sentencepiece.SelfTestData.Sample} Sample + * @throws {Error} If the payload is not a reader or valid buffer + * @throws {$protobuf.util.ProtocolError} If required fields are missing + */ + Sample.decode = function decode(reader, length, error) { + if (!(reader instanceof $Reader)) reader = $Reader.create(reader); + let end = length === undefined ? reader.len : reader.pos + length, + message = new $root.sentencepiece.SelfTestData.Sample(); + while (reader.pos < end) { + let tag = reader.uint32(); + if (tag === error) break; + switch (tag >>> 3) { + case 1: { + message.input = reader.string(); + break; + } + case 2: { + message.expected = reader.string(); + break; + } + default: + reader.skipType(tag & 7); + break; + } + } + return message; + }; + + /** + * Decodes a Sample message from the specified reader or buffer, length + * delimited. + * @function decodeDelimited + * @memberof sentencepiece.SelfTestData.Sample + * @static + * @param {$protobuf.Reader|Uint8Array} reader Reader or buffer to decode + * from + * @returns {sentencepiece.SelfTestData.Sample} Sample + * @throws {Error} If the payload is not a reader or valid buffer + * @throws {$protobuf.util.ProtocolError} If required fields are missing + */ + Sample.decodeDelimited = function decodeDelimited(reader) { + if (!(reader instanceof $Reader)) reader = new $Reader(reader); + return this.decode(reader, reader.uint32()); + }; + + /** + * Verifies a Sample message. 
+     * @function verify
+     * @memberof sentencepiece.SelfTestData.Sample
+     * @static
+     * @param {Object.<string,*>} message Plain object to verify
+     * @returns {string|null} `null` if valid, otherwise the reason why it is
+     *     not
+     */
+    Sample.verify = function verify(message) {
+      if (typeof message !== 'object' || message === null)
+        return 'object expected';
+      if (message.input != null && message.hasOwnProperty('input'))
+        if (!$util.isString(message.input)) return 'input: string expected';
+      if (message.expected != null && message.hasOwnProperty('expected'))
+        if (!$util.isString(message.expected))
+          return 'expected: string expected';
+      return null;
+    };
+
+    /**
+     * Creates a Sample message from a plain object. Also converts values to
+     * their respective internal types.
+     * @function fromObject
+     * @memberof sentencepiece.SelfTestData.Sample
+     * @static
+     * @param {Object.<string,*>} object Plain object
+     * @returns {sentencepiece.SelfTestData.Sample} Sample
+     */
+    Sample.fromObject = function fromObject(object) {
+      if (object instanceof $root.sentencepiece.SelfTestData.Sample)
+        return object;
+      let message = new $root.sentencepiece.SelfTestData.Sample();
+      if (object.input != null) message.input = String(object.input);
+      if (object.expected != null) message.expected = String(object.expected);
+      return message;
+    };
+
+    /**
+     * Creates a plain object from a Sample message. Also converts values to
+     * other types if specified.
+     * @function toObject
+     * @memberof sentencepiece.SelfTestData.Sample
+     * @static
+     * @param {sentencepiece.SelfTestData.Sample} message Sample
+     * @param {$protobuf.IConversionOptions} [options] Conversion options
+     * @returns {Object.<string,*>} Plain object
+     */
+    Sample.toObject = function toObject(message, options) {
+      if (!options) options = {};
+      let object = {};
+      if (options.defaults) {
+        object.input = '';
+        object.expected = '';
+      }
+      if (message.input != null && message.hasOwnProperty('input'))
+        object.input = message.input;
+      if (message.expected != null && message.hasOwnProperty('expected'))
+        object.expected = message.expected;
+      return object;
+    };
+
+    /**
+     * Converts this Sample to JSON.
+     * @function toJSON
+     * @memberof sentencepiece.SelfTestData.Sample
+     * @instance
+     * @returns {Object.<string,*>} JSON object
+     */
+    Sample.prototype.toJSON = function toJSON() {
+      return this.constructor.toObject(this, $protobuf.util.toJSONOptions);
+    };
+
+    /**
+     * Gets the default type url for Sample
+     * @function getTypeUrl
+     * @memberof sentencepiece.SelfTestData.Sample
+     * @static
+     * @param {string} [typeUrlPrefix] your custom typeUrlPrefix(default
+     *     "type.googleapis.com")
+     * @returns {string} The default type url
+     */
+    Sample.getTypeUrl = function getTypeUrl(typeUrlPrefix) {
+      if (typeUrlPrefix === undefined) {
+        typeUrlPrefix = 'type.googleapis.com';
+      }
+      return typeUrlPrefix + '/sentencepiece.SelfTestData.Sample';
+    };
+
+    return Sample;
+  })();
+
+  return SelfTestData;
+})();
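`SelfTestData` lets a model file carry its own regression fixtures: each `Sample` pairs an input string with its expected segmentation. A hedged sketch of how such samples could be checked (`model` and `encodeAsPieces` are placeholders; this diff does not show the processor's piece-level API):

```ts
declare const model: {
  selfTestData?: {samples: {input: string; expected: string}[]};
};
declare function encodeAsPieces(text: string): string[]; // hypothetical

for (const sample of model.selfTestData?.samples ?? []) {
  const actual = encodeAsPieces(sample.input).join(' ');
  if (actual !== sample.expected) {
    throw new Error(`Tokenizer self-test failed for input "${sample.input}"`);
  }
}
```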
+sentencepiece.ModelProto = (function() {
+  /**
+   * Properties of a ModelProto.
+   * @memberof sentencepiece
+   * @interface IModelProto
+   * @property {Array.<sentencepiece.ModelProto.ISentencePiece>|null} [pieces]
+   *     ModelProto pieces
+   * @property {sentencepiece.ITrainerSpec|null} [trainerSpec] ModelProto
+   *     trainerSpec
+   * @property {sentencepiece.INormalizerSpec|null} [normalizerSpec]
+   *     ModelProto normalizerSpec
+   * @property {sentencepiece.ISelfTestData|null} [selfTestData] ModelProto
+   *     selfTestData
+   * @property {sentencepiece.INormalizerSpec|null} [denormalizerSpec]
+   *     ModelProto denormalizerSpec
+   */
+
+  /**
+   * Constructs a new ModelProto.
+   * @memberof sentencepiece
+   * @classdesc Represents a ModelProto.
+   * @implements IModelProto
+   * @constructor
+   * @param {sentencepiece.IModelProto=} [properties] Properties to set
+   */
+  function ModelProto(properties) {
+    this.pieces = [];
+    if (properties)
+      for (let keys = Object.keys(properties), i = 0; i < keys.length; ++i)
+        if (properties[keys[i]] != null) this[keys[i]] = properties[keys[i]];
+  }
+
+  /**
+   * ModelProto pieces.
+   * @member {Array.<sentencepiece.ModelProto.ISentencePiece>} pieces
+   * @memberof sentencepiece.ModelProto
+   * @instance
+   */
+  ModelProto.prototype.pieces = $util.emptyArray;
+
+  /**
+   * ModelProto trainerSpec.
+   * @member {sentencepiece.ITrainerSpec|null|undefined} trainerSpec
+   * @memberof sentencepiece.ModelProto
+   * @instance
+   */
+  ModelProto.prototype.trainerSpec = null;
+
+  /**
+   * ModelProto normalizerSpec.
+   * @member {sentencepiece.INormalizerSpec|null|undefined} normalizerSpec
+   * @memberof sentencepiece.ModelProto
+   * @instance
+   */
+  ModelProto.prototype.normalizerSpec = null;
+
+  /**
+   * ModelProto selfTestData.
+   * @member {sentencepiece.ISelfTestData|null|undefined} selfTestData
+   * @memberof sentencepiece.ModelProto
+   * @instance
+   */
+  ModelProto.prototype.selfTestData = null;
+
+  /**
+   * ModelProto denormalizerSpec.
+   * @member {sentencepiece.INormalizerSpec|null|undefined} denormalizerSpec
+   * @memberof sentencepiece.ModelProto
+   * @instance
+   */
+  ModelProto.prototype.denormalizerSpec = null;
+
+  /**
+   * Creates a new ModelProto instance using the specified properties.
+   * @function create
+   * @memberof sentencepiece.ModelProto
+   * @static
+   * @param {sentencepiece.IModelProto=} [properties] Properties to set
+   * @returns {sentencepiece.ModelProto} ModelProto instance
+   */
+  ModelProto.create = function create(properties) {
+    return new ModelProto(properties);
+  };
+
+  /**
+   * Encodes the specified ModelProto message. Does not implicitly {@link
+   * sentencepiece.ModelProto.verify|verify} messages.
+ * @function encode + * @memberof sentencepiece.ModelProto + * @static + * @param {sentencepiece.IModelProto} message ModelProto message or plain + * object to encode + * @param {$protobuf.Writer} [writer] Writer to encode to + * @returns {$protobuf.Writer} Writer + */ + ModelProto.encode = function encode(message, writer) { + if (!writer) writer = $Writer.create(); + if (message.pieces != null && message.pieces.length) + for (let i = 0; i < message.pieces.length; ++i) + $root.sentencepiece.ModelProto.SentencePiece + .encode( + message.pieces[i], + writer.uint32(/* id 1, wireType 2 =*/ 10).fork()) + .ldelim(); + if (message.trainerSpec != null && + Object.hasOwnProperty.call(message, 'trainerSpec')) + $root.sentencepiece.TrainerSpec + .encode( + message.trainerSpec, + writer.uint32(/* id 2, wireType 2 =*/ 18).fork()) + .ldelim(); + if (message.normalizerSpec != null && + Object.hasOwnProperty.call(message, 'normalizerSpec')) + $root.sentencepiece.NormalizerSpec + .encode( + message.normalizerSpec, + writer.uint32(/* id 3, wireType 2 =*/ 26).fork()) + .ldelim(); + if (message.selfTestData != null && + Object.hasOwnProperty.call(message, 'selfTestData')) + $root.sentencepiece.SelfTestData + .encode( + message.selfTestData, + writer.uint32(/* id 4, wireType 2 =*/ 34).fork()) + .ldelim(); + if (message.denormalizerSpec != null && + Object.hasOwnProperty.call(message, 'denormalizerSpec')) + $root.sentencepiece.NormalizerSpec + .encode( + message.denormalizerSpec, + writer.uint32(/* id 5, wireType 2 =*/ 42).fork()) + .ldelim(); + return writer; + }; + + /** + * Encodes the specified ModelProto message, length delimited. Does not + * implicitly {@link sentencepiece.ModelProto.verify|verify} messages. + * @function encodeDelimited + * @memberof sentencepiece.ModelProto + * @static + * @param {sentencepiece.IModelProto} message ModelProto message or plain + * object to encode + * @param {$protobuf.Writer} [writer] Writer to encode to + * @returns {$protobuf.Writer} Writer + */ + ModelProto.encodeDelimited = function encodeDelimited(message, writer) { + return this.encode(message, writer).ldelim(); + }; + + /** + * Decodes a ModelProto message from the specified reader or buffer. + * @function decode + * @memberof sentencepiece.ModelProto + * @static + * @param {$protobuf.Reader|Uint8Array} reader Reader or buffer to decode + * from + * @param {number} [length] Message length if known beforehand + * @returns {sentencepiece.ModelProto} ModelProto + * @throws {Error} If the payload is not a reader or valid buffer + * @throws {$protobuf.util.ProtocolError} If required fields are missing + */ + ModelProto.decode = function decode(reader, length, error) { + if (!(reader instanceof $Reader)) reader = $Reader.create(reader); + let end = length === undefined ? 
+        reader.len : reader.pos + length,
+        message = new $root.sentencepiece.ModelProto();
+    while (reader.pos < end) {
+      let tag = reader.uint32();
+      if (tag === error) break;
+      switch (tag >>> 3) {
+        case 1: {
+          if (!(message.pieces && message.pieces.length)) message.pieces = [];
+          message.pieces.push(
+              $root.sentencepiece.ModelProto.SentencePiece.decode(
+                  reader, reader.uint32()));
+          break;
+        }
+        case 2: {
+          message.trainerSpec =
+              $root.sentencepiece.TrainerSpec.decode(reader, reader.uint32());
+          break;
+        }
+        case 3: {
+          message.normalizerSpec = $root.sentencepiece.NormalizerSpec.decode(
+              reader, reader.uint32());
+          break;
+        }
+        case 4: {
+          message.selfTestData = $root.sentencepiece.SelfTestData.decode(
+              reader, reader.uint32());
+          break;
+        }
+        case 5: {
+          message.denormalizerSpec =
+              $root.sentencepiece.NormalizerSpec.decode(
+                  reader, reader.uint32());
+          break;
+        }
+        default:
+          reader.skipType(tag & 7);
+          break;
+      }
+    }
+    return message;
+  };
+
+  /**
+   * Decodes a ModelProto message from the specified reader or buffer, length
+   * delimited.
+   * @function decodeDelimited
+   * @memberof sentencepiece.ModelProto
+   * @static
+   * @param {$protobuf.Reader|Uint8Array} reader Reader or buffer to decode
+   *     from
+   * @returns {sentencepiece.ModelProto} ModelProto
+   * @throws {Error} If the payload is not a reader or valid buffer
+   * @throws {$protobuf.util.ProtocolError} If required fields are missing
+   */
+  ModelProto.decodeDelimited = function decodeDelimited(reader) {
+    if (!(reader instanceof $Reader)) reader = new $Reader(reader);
+    return this.decode(reader, reader.uint32());
+  };
+
+  /**
+   * Verifies a ModelProto message.
+   * @function verify
+   * @memberof sentencepiece.ModelProto
+   * @static
+   * @param {Object.<string,*>} message Plain object to verify
+   * @returns {string|null} `null` if valid, otherwise the reason why it is
+   *     not
+   */
+  ModelProto.verify = function verify(message) {
+    if (typeof message !== 'object' || message === null)
+      return 'object expected';
+    if (message.pieces != null && message.hasOwnProperty('pieces')) {
+      if (!Array.isArray(message.pieces)) return 'pieces: array expected';
+      for (let i = 0; i < message.pieces.length; ++i) {
+        let error = $root.sentencepiece.ModelProto.SentencePiece.verify(
+            message.pieces[i]);
+        if (error) return 'pieces.' + error;
+      }
+    }
+    if (message.trainerSpec != null &&
+        message.hasOwnProperty('trainerSpec')) {
+      let error = $root.sentencepiece.TrainerSpec.verify(message.trainerSpec);
+      if (error) return 'trainerSpec.' + error;
+    }
+    if (message.normalizerSpec != null &&
+        message.hasOwnProperty('normalizerSpec')) {
+      let error =
+          $root.sentencepiece.NormalizerSpec.verify(message.normalizerSpec);
+      if (error) return 'normalizerSpec.' + error;
+    }
+    if (message.selfTestData != null &&
+        message.hasOwnProperty('selfTestData')) {
+      let error =
+          $root.sentencepiece.SelfTestData.verify(message.selfTestData);
+      if (error) return 'selfTestData.' + error;
+    }
+    if (message.denormalizerSpec != null &&
+        message.hasOwnProperty('denormalizerSpec')) {
+      let error =
+          $root.sentencepiece.NormalizerSpec.verify(message.denormalizerSpec);
+      if (error) return 'denormalizerSpec.' + error;
+    }
+    return null;
+  };
+
+  /**
+   * Creates a ModelProto message from a plain object. Also converts values to
+   * their respective internal types.
+   * @function fromObject
+   * @memberof sentencepiece.ModelProto
+   * @static
+   * @param {Object.<string,*>} object Plain object
+   * @returns {sentencepiece.ModelProto} ModelProto
+   */
+  ModelProto.fromObject = function fromObject(object) {
+    if (object instanceof $root.sentencepiece.ModelProto) return object;
+    let message = new $root.sentencepiece.ModelProto();
+    if (object.pieces) {
+      if (!Array.isArray(object.pieces))
+        throw TypeError('.sentencepiece.ModelProto.pieces: array expected');
+      message.pieces = [];
+      for (let i = 0; i < object.pieces.length; ++i) {
+        if (typeof object.pieces[i] !== 'object')
+          throw TypeError(
+              '.sentencepiece.ModelProto.pieces: object expected');
+        message.pieces[i] =
+            $root.sentencepiece.ModelProto.SentencePiece.fromObject(
+                object.pieces[i]);
+      }
+    }
+    if (object.trainerSpec != null) {
+      if (typeof object.trainerSpec !== 'object')
+        throw TypeError(
+            '.sentencepiece.ModelProto.trainerSpec: object expected');
+      message.trainerSpec =
+          $root.sentencepiece.TrainerSpec.fromObject(object.trainerSpec);
+    }
+    if (object.normalizerSpec != null) {
+      if (typeof object.normalizerSpec !== 'object')
+        throw TypeError(
+            '.sentencepiece.ModelProto.normalizerSpec: object expected');
+      message.normalizerSpec = $root.sentencepiece.NormalizerSpec.fromObject(
+          object.normalizerSpec);
+    }
+    if (object.selfTestData != null) {
+      if (typeof object.selfTestData !== 'object')
+        throw TypeError(
+            '.sentencepiece.ModelProto.selfTestData: object expected');
+      message.selfTestData =
+          $root.sentencepiece.SelfTestData.fromObject(object.selfTestData);
+    }
+    if (object.denormalizerSpec != null) {
+      if (typeof object.denormalizerSpec !== 'object')
+        throw TypeError(
+            '.sentencepiece.ModelProto.denormalizerSpec: object expected');
+      message.denormalizerSpec =
+          $root.sentencepiece.NormalizerSpec.fromObject(
+              object.denormalizerSpec);
+    }
+    return message;
+  };
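As a usage sketch, `fromObject` upgrades untyped, JSON-shaped data into message instances, recursing into nested specs and pieces (names illustrative; `spm` is the module's default export):

```ts
import spm from './sentencepiece_model.js';

const plain = {
  pieces: [{piece: 'hello', score: -3.5, type: 'NORMAL'}],
  normalizerSpec: {name: 'nmt', addDummyPrefix: true},
};
// Nested plain objects become typed messages; enum names become numbers.
const message = spm.sentencepiece.ModelProto.fromObject(plain);
// message.pieces[0].type === 1 (Type.NORMAL)
```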
+  /**
+   * Creates a plain object from a ModelProto message. Also converts values to
+   * other types if specified.
+   * @function toObject
+   * @memberof sentencepiece.ModelProto
+   * @static
+   * @param {sentencepiece.ModelProto} message ModelProto
+   * @param {$protobuf.IConversionOptions} [options] Conversion options
+   * @returns {Object.<string,*>} Plain object
+   */
+  ModelProto.toObject = function toObject(message, options) {
+    if (!options) options = {};
+    let object = {};
+    if (options.arrays || options.defaults) object.pieces = [];
+    if (options.defaults) {
+      object.trainerSpec = null;
+      object.normalizerSpec = null;
+      object.selfTestData = null;
+      object.denormalizerSpec = null;
+    }
+    if (message.pieces && message.pieces.length) {
+      object.pieces = [];
+      for (let j = 0; j < message.pieces.length; ++j)
+        object.pieces[j] =
+            $root.sentencepiece.ModelProto.SentencePiece.toObject(
+                message.pieces[j], options);
+    }
+    if (message.trainerSpec != null && message.hasOwnProperty('trainerSpec'))
+      object.trainerSpec = $root.sentencepiece.TrainerSpec.toObject(
+          message.trainerSpec, options);
+    if (message.normalizerSpec != null &&
+        message.hasOwnProperty('normalizerSpec'))
+      object.normalizerSpec = $root.sentencepiece.NormalizerSpec.toObject(
+          message.normalizerSpec, options);
+    if (message.selfTestData != null &&
+        message.hasOwnProperty('selfTestData'))
+      object.selfTestData = $root.sentencepiece.SelfTestData.toObject(
+          message.selfTestData, options);
+    if (message.denormalizerSpec != null &&
+        message.hasOwnProperty('denormalizerSpec'))
+      object.denormalizerSpec = $root.sentencepiece.NormalizerSpec.toObject(
+          message.denormalizerSpec, options);
+    return object;
+  };
+
+  /**
+   * Converts this ModelProto to JSON.
+   * @function toJSON
+   * @memberof sentencepiece.ModelProto
+   * @instance
+   * @returns {Object.<string,*>} JSON object
+   */
+  ModelProto.prototype.toJSON = function toJSON() {
+    return this.constructor.toObject(this, $protobuf.util.toJSONOptions);
+  };
+
+  /**
+   * Gets the default type url for ModelProto
+   * @function getTypeUrl
+   * @memberof sentencepiece.ModelProto
+   * @static
+   * @param {string} [typeUrlPrefix] your custom typeUrlPrefix(default
+   *     "type.googleapis.com")
+   * @returns {string} The default type url
+   */
+  ModelProto.getTypeUrl = function getTypeUrl(typeUrlPrefix) {
+    if (typeUrlPrefix === undefined) {
+      typeUrlPrefix = 'type.googleapis.com';
+    }
+    return typeUrlPrefix + '/sentencepiece.ModelProto';
+  };
+
+  ModelProto.SentencePiece = (function() {
+    /**
+     * Properties of a SentencePiece.
+     * @memberof sentencepiece.ModelProto
+     * @interface ISentencePiece
+     * @property {string|null} [piece] SentencePiece piece
+     * @property {number|null} [score] SentencePiece score
+     * @property {sentencepiece.ModelProto.SentencePiece.Type|null} [type]
+     *     SentencePiece type
+     */
+
+    /**
+     * Constructs a new SentencePiece.
+     * @memberof sentencepiece.ModelProto
+     * @classdesc Represents a SentencePiece.
+     * @implements ISentencePiece
+     * @constructor
+     * @param {sentencepiece.ModelProto.ISentencePiece=} [properties]
+     *     Properties to set
+     */
+    function SentencePiece(properties) {
+      if (properties)
+        for (let keys = Object.keys(properties), i = 0; i < keys.length; ++i)
+          if (properties[keys[i]] != null)
+            this[keys[i]] = properties[keys[i]];
+    }
+
+    /**
+     * SentencePiece piece.
+     * @member {string} piece
+     * @memberof sentencepiece.ModelProto.SentencePiece
+     * @instance
+     */
+    SentencePiece.prototype.piece = '';
+
+    /**
+     * SentencePiece score.
+     * @member {number} score
+     * @memberof sentencepiece.ModelProto.SentencePiece
+     * @instance
+     */
+    SentencePiece.prototype.score = 0;
+
+    /**
+     * SentencePiece type.
+ * @member {sentencepiece.ModelProto.SentencePiece.Type} type + * @memberof sentencepiece.ModelProto.SentencePiece + * @instance + */ + SentencePiece.prototype.type = 1; + + /** + * Creates a new SentencePiece instance using the specified properties. + * @function create + * @memberof sentencepiece.ModelProto.SentencePiece + * @static + * @param {sentencepiece.ModelProto.ISentencePiece=} [properties] + * Properties to set + * @returns {sentencepiece.ModelProto.SentencePiece} SentencePiece + * instance + */ + SentencePiece.create = function create(properties) { + return new SentencePiece(properties); + }; + + /** + * Encodes the specified SentencePiece message. Does not implicitly {@link + * sentencepiece.ModelProto.SentencePiece.verify|verify} messages. + * @function encode + * @memberof sentencepiece.ModelProto.SentencePiece + * @static + * @param {sentencepiece.ModelProto.ISentencePiece} message SentencePiece + * message or plain object to encode + * @param {$protobuf.Writer} [writer] Writer to encode to + * @returns {$protobuf.Writer} Writer + */ + SentencePiece.encode = function encode(message, writer) { + if (!writer) writer = $Writer.create(); + if (message.piece != null && + Object.hasOwnProperty.call(message, 'piece')) + writer.uint32(/* id 1, wireType 2 =*/ 10).string(message.piece); + if (message.score != null && + Object.hasOwnProperty.call(message, 'score')) + writer.uint32(/* id 2, wireType 5 =*/ 21).float(message.score); + if (message.type != null && Object.hasOwnProperty.call(message, 'type')) + writer.uint32(/* id 3, wireType 0 =*/ 24).int32(message.type); + return writer; + }; + + /** + * Encodes the specified SentencePiece message, length delimited. Does not + * implicitly {@link sentencepiece.ModelProto.SentencePiece.verify|verify} + * messages. + * @function encodeDelimited + * @memberof sentencepiece.ModelProto.SentencePiece + * @static + * @param {sentencepiece.ModelProto.ISentencePiece} message SentencePiece + * message or plain object to encode + * @param {$protobuf.Writer} [writer] Writer to encode to + * @returns {$protobuf.Writer} Writer + */ + SentencePiece.encodeDelimited = function encodeDelimited( + message, writer) { + return this.encode(message, writer).ldelim(); + }; + + /** + * Decodes a SentencePiece message from the specified reader or buffer. + * @function decode + * @memberof sentencepiece.ModelProto.SentencePiece + * @static + * @param {$protobuf.Reader|Uint8Array} reader Reader or buffer to decode + * from + * @param {number} [length] Message length if known beforehand + * @returns {sentencepiece.ModelProto.SentencePiece} SentencePiece + * @throws {Error} If the payload is not a reader or valid buffer + * @throws {$protobuf.util.ProtocolError} If required fields are missing + */ + SentencePiece.decode = function decode(reader, length, error) { + if (!(reader instanceof $Reader)) reader = $Reader.create(reader); + let end = length === undefined ? reader.len : reader.pos + length, + message = new $root.sentencepiece.ModelProto.SentencePiece(); + while (reader.pos < end) { + let tag = reader.uint32(); + if (tag === error) break; + switch (tag >>> 3) { + case 1: { + message.piece = reader.string(); + break; + } + case 2: { + message.score = reader.float(); + break; + } + case 3: { + message.type = reader.int32(); + break; + } + default: + reader.skipType(tag & 7); + break; + } + } + return message; + }; + + /** + * Decodes a SentencePiece message from the specified reader or buffer, + * length delimited. 
+     * @function decodeDelimited
+     * @memberof sentencepiece.ModelProto.SentencePiece
+     * @static
+     * @param {$protobuf.Reader|Uint8Array} reader Reader or buffer to decode
+     *     from
+     * @returns {sentencepiece.ModelProto.SentencePiece} SentencePiece
+     * @throws {Error} If the payload is not a reader or valid buffer
+     * @throws {$protobuf.util.ProtocolError} If required fields are missing
+     */
+    SentencePiece.decodeDelimited = function decodeDelimited(reader) {
+      if (!(reader instanceof $Reader)) reader = new $Reader(reader);
+      return this.decode(reader, reader.uint32());
+    };
+
+    /**
+     * Verifies a SentencePiece message.
+     * @function verify
+     * @memberof sentencepiece.ModelProto.SentencePiece
+     * @static
+     * @param {Object.<string,*>} message Plain object to verify
+     * @returns {string|null} `null` if valid, otherwise the reason why it is
+     *     not
+     */
+    SentencePiece.verify = function verify(message) {
+      if (typeof message !== 'object' || message === null)
+        return 'object expected';
+      if (message.piece != null && message.hasOwnProperty('piece'))
+        if (!$util.isString(message.piece)) return 'piece: string expected';
+      if (message.score != null && message.hasOwnProperty('score'))
+        if (typeof message.score !== 'number')
+          return 'score: number expected';
+      if (message.type != null && message.hasOwnProperty('type'))
+        switch (message.type) {
+          default:
+            return 'type: enum value expected';
+          case 1:
+          case 2:
+          case 3:
+          case 4:
+          case 6:
+          case 5:
+            break;
+        }
+      return null;
+    };
+
+    /**
+     * Creates a SentencePiece message from a plain object. Also converts
+     * values to their respective internal types.
+     * @function fromObject
+     * @memberof sentencepiece.ModelProto.SentencePiece
+     * @static
+     * @param {Object.<string,*>} object Plain object
+     * @returns {sentencepiece.ModelProto.SentencePiece} SentencePiece
+     */
+    SentencePiece.fromObject = function fromObject(object) {
+      if (object instanceof $root.sentencepiece.ModelProto.SentencePiece)
+        return object;
+      let message = new $root.sentencepiece.ModelProto.SentencePiece();
+      if (object.piece != null) message.piece = String(object.piece);
+      if (object.score != null) message.score = Number(object.score);
+      switch (object.type) {
+        default:
+          if (typeof object.type === 'number') {
+            message.type = object.type;
+            break;
+          }
+          break;
+        case 'NORMAL':
+        case 1:
+          message.type = 1;
+          break;
+        case 'UNKNOWN':
+        case 2:
+          message.type = 2;
+          break;
+        case 'CONTROL':
+        case 3:
+          message.type = 3;
+          break;
+        case 'USER_DEFINED':
+        case 4:
+          message.type = 4;
+          break;
+        case 'BYTE':
+        case 6:
+          message.type = 6;
+          break;
+        case 'UNUSED':
+        case 5:
+          message.type = 5;
+          break;
+      }
+      return message;
+    };
+
+    /**
+     * Creates a plain object from a SentencePiece message. Also converts
+     * values to other types if specified.
+     * @function toObject
+     * @memberof sentencepiece.ModelProto.SentencePiece
+     * @static
+     * @param {sentencepiece.ModelProto.SentencePiece} message SentencePiece
+     * @param {$protobuf.IConversionOptions} [options] Conversion options
+     * @returns {Object.<string,*>} Plain object
+     */
+    SentencePiece.toObject = function toObject(message, options) {
+      if (!options) options = {};
+      let object = {};
+      if (options.defaults) {
+        object.piece = '';
+        object.score = 0;
+        object.type = options.enums === String ? 'NORMAL' : 1;
+      }
+      if (message.piece != null && message.hasOwnProperty('piece'))
+        object.piece = message.piece;
+      if (message.score != null && message.hasOwnProperty('score'))
+        object.score = options.json && !isFinite(message.score) ?
+            String(message.score) :
+            message.score;
+      if (message.type != null && message.hasOwnProperty('type'))
+        object.type = options.enums === String ?
+            $root.sentencepiece.ModelProto.SentencePiece
+                        .Type[message.type] === undefined ?
+            message.type :
+            $root.sentencepiece.ModelProto.SentencePiece.Type[message.type] :
+            message.type;
+      return object;
+    };
+
+    /**
+     * Converts this SentencePiece to JSON.
+     * @function toJSON
+     * @memberof sentencepiece.ModelProto.SentencePiece
+     * @instance
+     * @returns {Object.<string,*>} JSON object
+     */
+    SentencePiece.prototype.toJSON = function toJSON() {
+      return this.constructor.toObject(this, $protobuf.util.toJSONOptions);
+    };
+
+    /**
+     * Gets the default type url for SentencePiece
+     * @function getTypeUrl
+     * @memberof sentencepiece.ModelProto.SentencePiece
+     * @static
+     * @param {string} [typeUrlPrefix] your custom typeUrlPrefix(default
+     *     "type.googleapis.com")
+     * @returns {string} The default type url
+     */
+    SentencePiece.getTypeUrl = function getTypeUrl(typeUrlPrefix) {
+      if (typeUrlPrefix === undefined) {
+        typeUrlPrefix = 'type.googleapis.com';
+      }
+      return typeUrlPrefix + '/sentencepiece.ModelProto.SentencePiece';
+    };
+
+    /**
+     * Type enum.
+     * @name sentencepiece.ModelProto.SentencePiece.Type
+     * @enum {number}
+     * @property {number} NORMAL=1 NORMAL value
+     * @property {number} UNKNOWN=2 UNKNOWN value
+     * @property {number} CONTROL=3 CONTROL value
+     * @property {number} USER_DEFINED=4 USER_DEFINED value
+     * @property {number} BYTE=6 BYTE value
+     * @property {number} UNUSED=5 UNUSED value
+     */
+    SentencePiece.Type = (function() {
+      const valuesById = {}, values = Object.create(valuesById);
+      values[valuesById[1] = 'NORMAL'] = 1;
+      values[valuesById[2] = 'UNKNOWN'] = 2;
+      values[valuesById[3] = 'CONTROL'] = 3;
+      values[valuesById[4] = 'USER_DEFINED'] = 4;
+      values[valuesById[6] = 'BYTE'] = 6;
+      values[valuesById[5] = 'UNUSED'] = 5;
+      return values;
+    })();
+
+    return SentencePiece;
+  })();
+
+  return ModelProto;
+})();
+
+return sentencepiece;
+})();
+
+export {$root as default};
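That closes the generated module. For orientation, a minimal sketch of how the tokenizer code later in this PR consumes it, decoding the model bytes fetched by the loader (import path and variable illustrative):

```ts
import spm from './sentencepiece_model.js';

declare const modelBytes: Uint8Array; // e.g. from loadModelProtoBytes()

const model = spm.sentencepiece.ModelProto.decode(modelBytes);
// model.pieces holds the scored vocabulary; the spec fields mirror the
// sentencepiece_model.proto definitions that follow.
console.log(model.pieces.length, model.normalizerSpec?.addDummyPrefix);
```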
diff --git a/src/cross/sentencepiece/sentencepiece_model.proto b/src/cross/sentencepiece/sentencepiece_model.proto
new file mode 100644
index 000000000..38c0f7dc7
--- /dev/null
+++ b/src/cross/sentencepiece/sentencepiece_model.proto
@@ -0,0 +1,324 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+syntax = "proto2";
+
+// TODO(taku): Needs to use LITE RUNTIME in OSS release.
+package sentencepiece;
+
+option optimize_for = LITE_RUNTIME;
+
+// TrainerSpec encodes the various parameters for SentencePiece training.
+// Next id: 55
+message TrainerSpec {
+  ///////////////////////////////////////////////////////////////////
+  // General parameters
+  //
+  // Input corpus files.
+  // Trainer accepts the following two formats:
+  // A) Monolingual: plain text, one sentence per line.
+  // B) Bilingual: TSV, source sentence <tab> target sentence
+  // When bilingual data is passed, shared vocabulary model is built.
+  // Note that the input file must be raw corpus, not a preprocessed corpus.
+  // Trainer only loads the first `input_sentence_size` sentences specified
+  // with this parameter.
+  repeated string input = 1;
+
+  // Input corpus format:
+  // "text": one-sentence-per-line text format (default)
+  // "tsv":  sentence <tab> freq
+  optional string input_format = 7;
+
+  // Output model file prefix.
+  // <model_prefix>.model and <model_prefix>.vocab are generated.
+  optional string model_prefix = 2;
+
+  // Model type. only have UNIGRAM now.
+  enum ModelType {
+    UNIGRAM = 1;  // Unigram language model with dynamic algorithm
+    BPE = 2;      // Byte Pair Encoding
+    WORD = 3;     // Delimited by whitespace.
+    CHAR = 4;     // tokenizes into character sequence
+  }
+  optional ModelType model_type = 3 [default = UNIGRAM];
+
+  // Vocabulary size. 8k is the default size.
+  optional int32 vocab_size = 4 [default = 8000];
+
+  // List of the languages this model can accept.
+  // Since the model is language-agnostic, this field is used as a reference.
+  repeated string accept_language = 5;
+
+  // Size of self-test samples, which are encoded in the model file.
+  optional int32 self_test_sample_size = 6 [default = 0];
+
+  // Whether to use DP version of sentencepiece. Use it with TSV input format
+  // (requires precomputed word tab counts to work).
+  optional bool enable_differential_privacy = 50 [default = false];
+  // Set these parameters if you need DP version of sentencepiece.
+  // std of noise to add.
+  optional float differential_privacy_noise_level = 51 [default = 0.0];
+  // Clipping threshold to apply after adding noise. All the words with
+  // frequency less than this value are dropped.
+  optional uint64 differential_privacy_clipping_threshold = 52 [default = 0];
+
+  ///////////////////////////////////////////////////////////////////
+  // Training parameters.
+  //
+  // Uses characters which cover the corpus with the ratio of `chars_coverage`.
+  // This parameter determines the set of basic Alphabet of sentence piece.
+  // 1.0 - `chars_coverage` characters are treated as UNK.
+  // See also required_chars field.
+  optional float character_coverage = 10 [default = 0.9995];
+
+  // Maximum size of sentences the trainer loads from `input` parameter.
+  // Trainer simply loads the `input` files in sequence.
+  // It is better to shuffle the input corpus randomly.
+  optional uint64 input_sentence_size = 11 [default = 0];
+  optional bool shuffle_input_sentence = 19 [default = true];
+
+  // Maximum size of sentences to make seed sentence pieces.
+  // Extended suffix array is constructed to extract frequent
+  // sub-strings from the corpus. This uses 20N working space,
+  // where N is the size of corpus.
+  optional int32 mining_sentence_size = 12 [deprecated = true];
+
+  // Maximum size of sentences to train sentence pieces.
+  optional int32 training_sentence_size = 13 [deprecated = true];
+
+  // The size of seed sentencepieces.
+  // `seed_sentencepiece_size` must be larger than `vocab_size`.
+  optional int32 seed_sentencepiece_size = 14 [default = 1000000];
+
+  // In every EM sub-iterations, keeps top
+  // `shrinking_factor` * `current sentencepieces size` with respect to
+  // the loss of the sentence piece. This value should be smaller than 1.0.
+  optional float shrinking_factor = 15 [default = 0.75];
+
+  // The maximum sentence length in bytes. Sentences with a length
+  // larger than `max_sentence_length` are simply ignored.
+  // Longer input tends to bring the following risks:
+  // * Overflow during EM training (unigram language model only)
+  // * Performance drop because of O(n log n) cost in BPE.
+  optional int32 max_sentence_length = 18 [default = 4192];
+
+  // Number of threads in the training.
+  optional int32 num_threads = 16 [default = 16];
+
+  // Number of EM sub iterations.
+  optional int32 num_sub_iterations = 17 [default = 2];
+
+  ///////////////////////////////////////////////////////////////////
+  // SentencePiece parameters which control the shapes of sentence piece.
+  //
+  // Maximum length of sentencepiece.
+  optional int32 max_sentencepiece_length = 20 [default = 16];
+
+  // Uses Unicode script to split sentence pieces.
+  // When `split_by_unicode_script` is true, we do not allow sentence piece to
+  // include multiple Unicode scripts, e.g. "F1" is not a valid piece.
+  // Exception: CJ characters (Hiragana/Katakana/Han) are all handled
+  // as one script type, since Japanese word can consist of multiple scripts.
+  // This exception is always applied regardless of the accept-language
+  // parameter.
+  optional bool split_by_unicode_script = 21 [default = true];
+
+  // When `split_by_number` is true, put a boundary between number and
+  // non-number transition. If we want to treat "F1" as one token, set this
+  // flag to be false.
+  optional bool split_by_number = 23 [default = true];
+
+  // Use a white space to split sentence pieces.
+  // When `split_by_whitespace` is false, we may have the piece containing
+  // a white space in the middle. e.g., "in_the".
+  optional bool split_by_whitespace = 22 [default = true];
+
+  // Adds whitespace symbol (▁) as a suffix instead of prefix. e.g., ▁hello =>
+  // hello▁. When `treat_whitespace_as_suffix` is true,
+  // NormalizerSpec::add_dummy_prefix will add the dummy whitespace to the end
+  // of sentence.
+  optional bool treat_whitespace_as_suffix = 24 [default = false];
+
+  // Allows pieces that only contain whitespaces instead of appearing only as
+  // prefix or suffix of other pieces.
+  optional bool allow_whitespace_only_pieces = 26 [default = false];
+
+  // Split all digits (0-9) into separate pieces.
+  optional bool split_digits = 25 [default = false];
+
+  // Defines the pre-tokenization delimiter.
+  // When specified, pieces crossing this delimiter are not included
+  // in the vocab. The delimiter string is then virtually ignored
+  // during the training. This field allows constraints on the vocabulary
+  // selection. Note that this field is only available in unigram mode.
+  optional string pretokenization_delimiter = 53 [default = ""];
+
+  ///////////////////////////////////////////////////////////////////
+  // Vocabulary management
+  //
+  // Defines control symbols used as an indicator to
+  // change the behavior of the decoder. <s> and </s> are pre-defined.
+  // We can use this field to encode various meta information,
+  // including language indicator in multilingual model.
+  // These symbols are not visible to users, but visible to
+  // the decoder. Note that when the input sentence contains control symbols,
+  // they are not treated as one token, but segmented into normal pieces.
+  // Control symbols must be inserted independently from the segmentation.
+  repeated string control_symbols = 30;
+
+  // Defines user defined symbols.
+  // These symbols are added with extremely high score
+  // so they are always treated as one unique symbol in any context.
+  // Typical usage of user_defined_symbols is placeholder for named entities.
+  repeated string user_defined_symbols = 31;
+
+  // Defines required characters. Each UTF8 character in this string is
+  // included in the character set regardless of character_coverage value.
+  // Unlike user_defined_symbols, these characters have scores based on the
+  // frequency on input sentences, and the model can form subwords using
+  // characters in this field.
+  optional string required_chars = 36;
+
+  // Decomposes unknown pieces into UTF-8 bytes.
+  optional bool byte_fallback = 35 [default = false];
+
+  // When creating the vocabulary file, defines whether or not to additionally
+  // output the score for each piece.
+  optional bool vocabulary_output_piece_score = 32 [default = true];
+
+  // `vocab_size` is treated as hard limit. Crash if
+  // the model cannot produce the vocab of size `vocab_size`.
+  // When `hard_vocab_limit` is false, vocab_size is treated
+  // as soft limit. Note that when model_type=char,
+  // always assumes hard_vocab_limit = false.
+  optional bool hard_vocab_limit = 33 [default = true];
+
+  // Use all symbols for vocab extraction. This flag is valid
+  // if model type is either CHAR or WORD.
+  optional bool use_all_vocab = 34 [default = false];
+
+  ///////////////////////////////////////////////////////////////////
+  // Reserved special meta tokens.
+  // * -1 is not used.
+  // * unk_id must not be -1.
+  // Ids must start with 0 and be contiguous.
+  optional int32 unk_id = 40 [default = 0];   // <unk>
+  optional int32 bos_id = 41 [default = 1];   // <s>
+  optional int32 eos_id = 42 [default = 2];   // </s>
+  optional int32 pad_id = 43 [default = -1];  // <pad> (padding)
+  optional string unk_piece = 45 [default = "<unk>"];
+  optional string bos_piece = 46 [default = "<s>"];
+  optional string eos_piece = 47 [default = "</s>"];
+  optional string pad_piece = 48 [default = "<pad>"];
+
+  // Encodes <unk> into U+2047 (DOUBLE QUESTION MARK),
+  // since this character can be useful both for user and
+  // developer. We can easily figure out that <unk> is emitted.
+  optional string unk_surface = 44 [default = " \xE2\x81\x87 "];
+
+  // Increase bit depth to allow unigram model training on large
+  // (>10M sentences) corpora. A side-effect of enabling this flag
+  // is increased memory usage.
+  optional bool train_extremely_large_corpus = 49 [default = false];
+
+  // Path to a seed sentencepieces file, with one tab-separated
+  // seed sentencepiece <tab> frequency per line.
+  optional string seed_sentencepieces_file = 54 [default = ""];
+
+  // Customized extensions: the range of field numbers
+  // are open to third-party extensions.
+  extensions 200 to max;
+}
+
+// NormalizerSpec encodes the various parameters for string normalization.
+message NormalizerSpec {
+  // name of normalization rule.
+  optional string name = 1;
+
+  // Pre-compiled normalization rule created by
+  // Builder::GetPrecompiledCharsMap() or Builder::CompileCharsMap() method.
+  // Usually this field is set by Builder::GetNormalizerSpec() method.
+  optional bytes precompiled_charsmap = 2;
+
+  // Adds dummy whitespace at the beginning of text in order to
+  // treat "world" in "world" and "hello world" in the same way.
+  optional bool add_dummy_prefix = 3 [default = true];
+
+  // Removes leading, trailing, and duplicate internal whitespace.
+  optional bool remove_extra_whitespaces = 4 [default = true];
+
+  // Replaces whitespace with meta symbol.
+  // This field must be true to train sentence piece model.
+  optional bool escape_whitespaces = 5 [default = true];
+
+  // Custom normalization rule file in TSV format.
+  // https://github.com/google/sentencepiece/blob/master/doc/normalization.md
+  // This field is only used in SentencePieceTrainer::Train() method, which
+  // compiles the rule into the binary rule stored in `precompiled_charsmap`.
+  optional string normalization_rule_tsv = 6;
+
+  // Customized extensions: the range of field numbers
+  // are open to third-party extensions.
+  extensions 200 to max;
+}
+
+// Proto to store samples for self-testing.
+message SelfTestData {
+  message Sample {
+    optional string input = 1;
+    optional string expected = 2;
+  }
+  repeated Sample samples = 1;
+
+  // Customized extensions: the range of field numbers
+  // are open to third-party extensions.
+  extensions 200 to max;
+}
+
+// ModelProto stores model parameters.
+// SentencePieceProcessor is supposed to be self-contained.
+// All settings/parameters which may change the behavior must be encoded
+// in ModelProto.
+message ModelProto {
+  message SentencePiece {
+    enum Type {
+      NORMAL = 1;        // normal symbol
+      UNKNOWN = 2;       // unknown symbol. only <unk> for now.
+      CONTROL = 3;       // control symbols. </s>, <s>, <2ja> etc.
+      USER_DEFINED = 4;  // user defined symbols.
+                         // Typical usage of USER_DEFINED symbol
+                         // is placeholder.
+      BYTE = 6;          // byte symbols. Used when `byte_fallback` is true.
+      UNUSED = 5;        // this piece is not used.
+    }
+    optional string piece = 1;  // piece must not be empty.
+    optional float score = 2;
+    optional Type type = 3 [default = NORMAL];
+
+    // Customized extensions: the range of field numbers
+    // are open to third-party extensions.
+    extensions 200 to max;
+  }
+
+  // Sentence pieces with scores.
+  repeated SentencePiece pieces = 1;
+
+  // Spec used to generate this model file.
+  optional TrainerSpec trainer_spec = 2;
+
+  // Spec for text normalization.
+  optional NormalizerSpec normalizer_spec = 3;
+
+  // Stores sample input and its expected segmentation to verify the model.
+  optional SelfTestData self_test_data = 4;
+
+  // Spec for text de-normalization.
+  optional NormalizerSpec denormalizer_spec = 5;
+
+  // Customized extensions: the range of field numbers
+  // are open to third-party extensions.
+  extensions 200 to max;
+}
diff --git a/src/cross/tokenizer/_interfaces.ts b/src/cross/tokenizer/_interfaces.ts
new file mode 100644
index 000000000..f97424ab0
--- /dev/null
+++ b/src/cross/tokenizer/_interfaces.ts
@@ -0,0 +1,105 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import {
+  ComputeTokensResult,
+  ContentListUnion,
+  CountTokensConfig,
+  CountTokensResult,
+} from '../../types.js';
+
+/**
+ * Interface for platform-specific cache operations for tokenizer models.
+ */
+export interface TokenizerCache {
+  /**
+   * Loads tokenizer model data from cache if available and valid.
+   *
+   * @param cacheKey Unique identifier for the cached model
+   * @param expectedHash SHA-256 hash to validate cached data
+   * @return Cached model data if valid, null otherwise
+   */
+  load(cacheKey: string, expectedHash: string): Promise<Uint8Array | null>;
+
+  /**
+   * Saves tokenizer model data to cache.
+   *
+   * @param cacheKey Unique identifier for the model
+   * @param data Model data to cache
+   */
+  save(cacheKey: string, data: Uint8Array): Promise<void>;
+}
+
+/**
+ * Interface for platform-specific file operations for tokenizer.
+ */
+export interface TokenizerFileSystem {
+  /**
+   * Downloads file from URL.
+   *
+   * @param url URL to download from
+   * @return File contents as bytes
+   */
+  fetchFromUrl(url: string): Promise<Uint8Array>;
+
+  /**
+   * Validates file hash using SHA-256.
+   *
+   * @param data File data
+   * @param expectedHash Expected SHA-256 hash
+   * @return true if hash matches
+   */
+  validateHash(data: Uint8Array, expectedHash: string): Promise<boolean>;
+
+  /**
+   * Computes SHA-1 hash of the given bytes (used for cache keys).
+   *
+   * @param text Bytes to hash
+   * @return SHA-1 hash as hex string
+   */
+  computeSha1(text: Uint8Array): Promise<string>;
+}
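These two interfaces are all a platform has to supply. A minimal sketch of a Node-flavored `TokenizerCache` (the class and cache directory here are hypothetical, not part of this PR; the real Node implementation lives elsewhere in the package):

```ts
import {createHash} from 'node:crypto';
import {promises as fs} from 'node:fs';
import {tmpdir} from 'node:os';
import {join} from 'node:path';
import {TokenizerCache} from './_interfaces.js';

class NodeTokenizerCache implements TokenizerCache {
  async load(
    cacheKey: string,
    expectedHash: string,
  ): Promise<Uint8Array | null> {
    try {
      const data = new Uint8Array(await fs.readFile(join(tmpdir(), cacheKey)));
      const hash = createHash('sha256').update(data).digest('hex');
      return hash === expectedHash ? data : null; // treat stale data as a miss
    } catch {
      return null; // missing file -> cache miss
    }
  }

  async save(cacheKey: string, data: Uint8Array): Promise<void> {
    await fs.writeFile(join(tmpdir(), cacheKey), data);
  }
}
```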
+ */
+export interface TokenizerPlatform {
+  cache: TokenizerCache;
+  fileSystem: TokenizerFileSystem;
+}
+
+/**
+ * Configuration for a specific tokenizer model.
+ */
+export interface TokenizerConfig {
+  modelUrl: string;
+  modelHash: string;
+}
+
+/**
+ * Interface for local tokenizer implementation.
+ */
+export interface ILocalTokenizer {
+  /**
+   * Counts the number of tokens in the given content.
+   *
+   * @param contents The contents to tokenize
+   * @param config Optional configuration for counting tokens
+   * @return A CountTokensResult containing the total number of tokens
+   */
+  countTokens(
+    contents: ContentListUnion,
+    config?: CountTokensConfig,
+  ): Promise<CountTokensResult>;
+
+  /**
+   * Computes detailed token information for the given content.
+   *
+   * @param contents The contents to tokenize
+   * @return A ComputeTokensResult containing token IDs, bytes, and roles
+   */
+  computeTokens(contents: ContentListUnion): Promise<ComputeTokensResult>;
+}
diff --git a/src/cross/tokenizer/_loader.ts b/src/cross/tokenizer/_loader.ts
new file mode 100644
index 000000000..936cf5429
--- /dev/null
+++ b/src/cross/tokenizer/_loader.ts
@@ -0,0 +1,128 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import {TokenizerConfig, TokenizerPlatform} from './_interfaces.js';
+
+/**
+ * Source of truth: https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models
+ */
+const GEMINI_MODELS_TO_TOKENIZER_NAMES: Record<string, string> = {
+  'gemini-2.5-pro': 'gemma3',
+  'gemini-2.5-flash': 'gemma3',
+  'gemini-2.5-flash-lite': 'gemma3',
+  'gemini-2.0-flash': 'gemma3',
+  'gemini-2.0-flash-lite': 'gemma3',
+};
+
+const GEMINI_STABLE_MODELS_TO_TOKENIZER_NAMES: Record<string, string> = {
+  'gemini-3-pro-preview': 'gemma3',
+  'gemini-2.5-pro-preview-06-05': 'gemma3',
+  'gemini-2.5-pro-preview-05-06': 'gemma3',
+  'gemini-2.5-pro-exp-03-25': 'gemma3',
+  'gemini-live-2.5-flash': 'gemma3',
+  'gemini-2.5-flash-preview-05-20': 'gemma3',
+  'gemini-2.5-flash-preview-04-17': 'gemma3',
+  'gemini-2.5-flash-lite-preview-06-17': 'gemma3',
+  'gemini-2.0-flash-001': 'gemma3',
+  'gemini-2.0-flash-lite-001': 'gemma3',
+};
+
+const TOKENIZERS: Record<string, TokenizerConfig> = {
+  gemma2: {
+    modelUrl:
+      'https://raw.githubusercontent.com/google/gemma_pytorch/33b652c465537c6158f9a472ea5700e5e770ad3f/tokenizer/tokenizer.model',
+    modelHash:
+      '61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2',
+  },
+  gemma3: {
+    modelUrl:
+      'https://raw.githubusercontent.com/google/gemma_pytorch/014acb7ac4563a5f77c76d7ff98f31b568c16508/tokenizer/gemma3_cleaned_262144_v2.spiece.model',
+    modelHash:
+      '1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c',
+  },
+};
+
+/**
+ * Gets the tokenizer name for the given model name.
+ *
+ * @param modelName The Gemini model name
+ * @return The tokenizer name to use
+ * @throws Error if the model is not supported
+ */
+export function getTokenizerName(modelName: string): string {
+  if (modelName in GEMINI_MODELS_TO_TOKENIZER_NAMES) {
+    return GEMINI_MODELS_TO_TOKENIZER_NAMES[modelName];
+  }
+  if (modelName in GEMINI_STABLE_MODELS_TO_TOKENIZER_NAMES) {
+    return GEMINI_STABLE_MODELS_TO_TOKENIZER_NAMES[modelName];
+  }
+
+  const supportedModels = [
+    ...Object.keys(GEMINI_MODELS_TO_TOKENIZER_NAMES),
+    ...Object.keys(GEMINI_STABLE_MODELS_TO_TOKENIZER_NAMES),
+  ].join(', ');
+
+  throw new Error(
+    `Model ${modelName} is not supported for local tokenization. Supported models: ${supportedModels}.`,
+  );
+}
+
+/**
+ * Gets the tokenizer configuration for the given tokenizer name.
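+ * For example, 'gemma3' resolves to the pinned
+ * gemma3_cleaned_262144_v2.spiece.model URL and its SHA-256 hash above.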
+ *
+ * @param tokenizerName The tokenizer name
+ * @return The tokenizer configuration
+ * @throws Error if the tokenizer is not found
+ */
+export function getTokenizerConfig(tokenizerName: string): TokenizerConfig {
+  if (!(tokenizerName in TOKENIZERS)) {
+    throw new Error(
+      `Tokenizer ${tokenizerName} is not supported. Supported tokenizers: ${Object.keys(TOKENIZERS).join(', ')}`,
+    );
+  }
+  return TOKENIZERS[tokenizerName];
+}
+
+/**
+ * Loads tokenizer model bytes from cache or URL.
+ *
+ * @param tokenizerName The tokenizer name
+ * @param platform Platform-specific implementations
+ * @return The model bytes
+ */
+export async function loadModelProtoBytes(
+  tokenizerName: string,
+  platform: TokenizerPlatform,
+): Promise<Uint8Array> {
+  const config = getTokenizerConfig(tokenizerName);
+
+  const encoder = new TextEncoder();
+  const cacheKey = await platform.fileSystem.computeSha1(
+    encoder.encode(config.modelUrl),
+  );
+
+  let modelData = await platform.cache.load(cacheKey, config.modelHash);
+
+  if (!modelData) {
+    modelData = await platform.fileSystem.fetchFromUrl(config.modelUrl);
+
+    const isValid = await platform.fileSystem.validateHash(
+      modelData,
+      config.modelHash,
+    );
+
+    if (!isValid) {
+      const actualHash = await platform.fileSystem.computeSha1(modelData);
+      throw new Error(
+        `Downloaded model file is corrupted. Expected hash ${config.modelHash}. Got file hash ${actualHash}.`,
+      );
+    }
+
+    await platform.cache.save(cacheKey, modelData);
+  }
+
+  return modelData;
+}
diff --git a/src/cross/tokenizer/_local_tokenizer_impl.ts b/src/cross/tokenizer/_local_tokenizer_impl.ts
new file mode 100644
index 000000000..0349b833e
--- /dev/null
+++ b/src/cross/tokenizer/_local_tokenizer_impl.ts
@@ -0,0 +1,191 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * LocalTokenizer implementation that integrates SentencePiece with platform-specific
+ * caching and file operations.
+ *
+ * This is the main implementation that brings together:
+ * - SentencePiece BPE tokenizer
+ * - Platform-specific model loading and caching
+ * - Text extraction from Content/Tool/Schema objects
+ */
+
+import {tContent, tContents} from '../../_transformers.js';
+import type {
+  ComputeTokensResult,
+  ContentListUnion,
+  CountTokensConfig,
+  CountTokensResult,
+  TokensInfo,
+} from '../../types.js';
+import {SentencePieceProcessor} from '../sentencepiece/_processor.js';
+import {ILocalTokenizer, TokenizerPlatform} from './_interfaces.js';
+import {getTokenizerName, loadModelProtoBytes} from './_loader.js';
+import {TextsAccumulator} from './_texts_accumulator.js';
+
+type SentencePieceProcessorConstructor = new (
+  modelBytes: Uint8Array,
+) => SentencePieceProcessor;
+
+/**
+ * LocalTokenizer provides text-only local tokenization for Gemini models.
+ *
+ * LIMITATIONS:
+ * - Only supports text-based tokenization (no multimodal)
+ * - Forward compatibility depends on open-source tokenizer models
+ * - For tools/schemas, only supports types.Tool and types.Schema objects
+ *   (Python functions or Pydantic models cannot be passed directly)
+ */
+export class LocalTokenizer implements ILocalTokenizer {
+  private tokenizerName: string;
+  private platform: TokenizerPlatform;
+  private processor?: SentencePieceProcessor;
+  private modelName: string;
+
+  /**
+   * Creates a new LocalTokenizer.
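+   *
+   * The optional third parameter injects the SentencePiece processor
+   * class; it defaults to SentencePieceProcessor and exists mainly so
+   * unit tests can substitute a mock implementation.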
+   *
+   * @param modelName Gemini model name (e.g., 'gemini-2.0-flash-001')
+   * @param platform Platform-specific implementations for caching and file operations
+   */
+  constructor(
+    modelName: string,
+    platform: TokenizerPlatform,
+    private readonly ProcessorClass: SentencePieceProcessorConstructor = SentencePieceProcessor,
+  ) {
+    this.modelName = modelName;
+    this.tokenizerName = getTokenizerName(modelName);
+    this.platform = platform;
+  }
+
+  private async ensureProcessor(): Promise<void> {
+    if (this.processor) {
+      return;
+    }
+
+    const modelBytes = await loadModelProtoBytes(
+      this.tokenizerName,
+      this.platform,
+    );
+
+    this.processor = new this.ProcessorClass(modelBytes);
+  }
+
+  /**
+   * Counts the number of tokens in the given content.
+   *
+   * @param contents The contents to tokenize
+   * @param config Optional configuration for counting tokens
+   * @return A CountTokensResult containing the total number of tokens
+   *
+   * @example
+   * ```typescript
+   * const tokenizer = new LocalTokenizer('gemini-2.0-flash-001', platform);
+   * const result = await tokenizer.countTokens("What is your name?");
+   * console.log(result.totalTokens); // 5
+   * ```
+   */
+  async countTokens(
+    contents: ContentListUnion,
+    config?: CountTokensConfig,
+  ): Promise<CountTokensResult> {
+    await this.ensureProcessor();
+
+    const processedContents = tContents(contents);
+
+    const textAccumulator = new TextsAccumulator();
+    textAccumulator.addContents(processedContents);
+
+    if (config?.systemInstruction) {
+      const systemContent = tContent(config.systemInstruction);
+      textAccumulator.addContents([systemContent]);
+    }
+
+    if (config?.tools) {
+      textAccumulator.addTools(config.tools);
+    }
+
+    if (config?.generationConfig?.responseSchema) {
+      textAccumulator.addSchema(config.generationConfig.responseSchema);
+    }
+
+    const texts = textAccumulator.getTexts();
+    let totalTokens = 0;
+
+    for (const text of texts) {
+      const tokens = this.processor!.encode(text);
+      totalTokens += tokens.length;
+    }
+
+    return {
+      totalTokens,
+    };
+  }
+
+  /**
+   * Computes detailed token information for the given content.
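+   *
+   * Unlike countTokens, the result is grouped per input Content and pairs
+   * token IDs with base64-encoded token bytes and the content role.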
+   *
+   * @param contents The contents to tokenize
+   * @return A ComputeTokensResult containing token IDs, bytes, and roles
+   *
+   * @example
+   * ```typescript
+   * const tokenizer = new LocalTokenizer('gemini-2.0-flash-001', platform);
+   * const result = await tokenizer.computeTokens("What is your name?");
+   * console.log(result.tokensInfo);
+   * // [{tokenIds: ['279', '329', '1313', '2508', '13'],
+   * //   tokens: ['IFdoYXQ=', ...], // base64-encoded token bytes
+   * //   role: 'user'}]
+   * ```
+   */
+  async computeTokens(
+    contents: ContentListUnion,
+  ): Promise<ComputeTokensResult> {
+    await this.ensureProcessor();
+
+    const processedContents = tContents(contents);
+
+    const tokensInfo: TokensInfo[] = [];
+
+    for (const content of processedContents) {
+      const textAccumulator = new TextsAccumulator();
+      textAccumulator.addContent(content);
+
+      const texts = textAccumulator.getTexts();
+
+      const allTokenIds: number[] = [];
+      const allTokens: string[] = [];
+
+      for (const text of texts) {
+        const tokens = this.processor!.encode(text);
+        allTokenIds.push(...tokens.map((t) => t.id));
+        allTokens.push(...tokens.map((t) => this.tokenTextToBase64(t.text)));
+      }
+
+      if (allTokenIds.length > 0) {
+        tokensInfo.push({
+          tokenIds: allTokenIds.map((id) => id.toString()),
+          tokens: allTokens,
+          role: content.role,
+        });
+      }
+    }
+
+    return {
+      tokensInfo,
+    };
+  }
+
+  private tokenTextToBase64(text: string): string {
+    const encoder = new TextEncoder();
+    // SentencePiece uses '▁' (U+2581) as the whitespace meta symbol; map it
+    // back to a regular space before encoding.
+    const bytes = encoder.encode(text.replace(/▁/g, ' '));
+
+    let binary = '';
+    for (let i = 0; i < bytes.length; i++) {
+      binary += String.fromCharCode(bytes[i]);
+    }
+    return btoa(binary);
+  }
+}
diff --git a/src/cross/tokenizer/_texts_accumulator.ts b/src/cross/tokenizer/_texts_accumulator.ts
new file mode 100644
index 000000000..1fd1137a3
--- /dev/null
+++ b/src/cross/tokenizer/_texts_accumulator.ts
@@ -0,0 +1,246 @@
+/* eslint-disable @typescript-eslint/no-explicit-any */
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * TextsAccumulator extracts countable text from Content and Tool objects.
+ *
+ * This class traverses complex Content and Tool objects and extracts all text
+ * content that should be included when calculating token counts.
+ *
+ * A key feature is its ability to detect unsupported fields in Content objects.
+ * If a user provides a Content object with fields that this local tokenizer
+ * doesn't recognize, this class will log a warning.
+ *
+ * Translated from python-genai/local_tokenizer.py
+ */
+
+import type {
+  Content,
+  FunctionCall,
+  FunctionDeclaration,
+  FunctionResponse,
+  Part,
+  Schema,
+  Tool,
+} from '../../types.js';
+
+/**
+ * Accumulates countable texts from Content and Tool objects.
+ */
+export class TextsAccumulator {
+  private texts: string[];
+
+  constructor() {
+    this.texts = [];
+  }
+
+  /**
+   * Returns all accumulated texts.
+   */
+  getTexts(): string[] {
+    return this.texts;
+  }
+
+  /**
+   * Adds multiple Content objects.
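+   *
+   * @example
+   * ```typescript
+   * const acc = new TextsAccumulator();
+   * acc.addContents([{role: 'user', parts: [{text: 'hi'}]}]);
+   * acc.getTexts(); // ['hi']
+   * ```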
+   */
+  addContents(contents: Content[]): void {
+    for (const content of contents) {
+      this.addContent(content);
+    }
+  }
+
+  addContent(content: Content): void {
+    const countedContent: Content = {
+      parts: [],
+      role: content.role,
+    };
+
+    if (content.parts) {
+      for (const part of content.parts) {
+        const countedPart: Part = {};
+
+        if (part.fileData || part.inlineData) {
+          throw new Error(
+            'LocalTokenizers do not support non-text content types.',
+          );
+        }
+
+        if (part.videoMetadata) {
+          countedPart.videoMetadata = part.videoMetadata;
+        }
+
+        if (part.functionCall) {
+          this.addFunctionCall(part.functionCall);
+          countedPart.functionCall = part.functionCall;
+        }
+
+        if (part.functionResponse) {
+          this.addFunctionResponse(part.functionResponse);
+          countedPart.functionResponse = part.functionResponse;
+        }
+
+        if (part.text) {
+          countedPart.text = part.text;
+          this.texts.push(part.text);
+        }
+
+        if (countedContent.parts) {
+          countedContent.parts.push(countedPart);
+        }
+      }
+    }
+
+    if (!this.deepEqual(content, countedContent)) {
+      console.warn(
+        `Content contains unsupported types for token counting. ` +
+          `Supported fields: ${JSON.stringify(countedContent)}. ` +
+          `Got: ${JSON.stringify(content)}.`,
+      );
+    }
+  }
+
+  addFunctionCall(functionCall: FunctionCall): void {
+    if (functionCall.name) {
+      this.texts.push(functionCall.name);
+    }
+
+    if (functionCall.args) {
+      this.dictTraverse(functionCall.args);
+    }
+  }
+
+  addTools(tools: Tool[]): void {
+    for (const tool of tools) {
+      this.addTool(tool);
+    }
+  }
+
+  addTool(tool: Tool): void {
+    if (tool.functionDeclarations) {
+      for (const functionDeclaration of tool.functionDeclarations) {
+        this.functionDeclarationTraverse(functionDeclaration);
+      }
+    }
+  }
+
+  addFunctionResponses(functionResponses: FunctionResponse[]): void {
+    for (const functionResponse of functionResponses) {
+      this.addFunctionResponse(functionResponse);
+    }
+  }
+
+  addFunctionResponse(functionResponse: FunctionResponse): void {
+    if (functionResponse.name) {
+      this.texts.push(functionResponse.name);
+    }
+
+    if (functionResponse.response) {
+      this.dictTraverse(functionResponse.response);
+    }
+  }
+
+  private functionDeclarationTraverse(
+    functionDeclaration: FunctionDeclaration,
+  ): void {
+    if (functionDeclaration.name) {
+      this.texts.push(functionDeclaration.name);
+    }
+
+    if (functionDeclaration.description) {
+      this.texts.push(functionDeclaration.description);
+    }
+
+    if (functionDeclaration.parameters) {
+      this.addSchema(functionDeclaration.parameters);
+    }
+
+    if (functionDeclaration.response) {
+      this.addSchema(functionDeclaration.response);
+    }
+  }
+
+  addSchema(schema: Schema): void {
+    if (schema.format) {
+      this.texts.push(schema.format);
+    }
+
+    if (schema.description) {
+      this.texts.push(schema.description);
+    }
+
+    if (schema.enum) {
+      this.texts.push(...schema.enum);
+    }
+
+    if (schema.required) {
+      this.texts.push(...schema.required);
+    }
+
+    if (schema.items) {
+      this.addSchema(schema.items);
+    }
+
+    if (schema.properties) {
+      for (const [key, value] of Object.entries(schema.properties)) {
+        this.texts.push(key);
+        this.addSchema(value);
+      }
+    }
+
+    if (schema.example !== undefined && schema.example !== null) {
+      this.anyTraverse(schema.example);
+    }
+  }
+
+  private dictTraverse(obj: Record<string, any>): void {
+    this.texts.push(...Object.keys(obj));
+    for (const value of Object.values(obj)) {
+      this.anyTraverse(value);
+    }
+  }
+
+  private anyTraverse(value: any): void {
+    if (typeof value === 'string') {
+      this.texts.push(value);
+    } else if (typeof value === 'object' && value !== null) {
+      if (Array.isArray(value)) {
+        for (const item of value) {
+          this.anyTraverse(item);
+        }
+      } else {
+        this.dictTraverse(value);
+      }
+    }
+  }
+
+  private deepEqual(obj1: any, obj2: any): boolean {
+    if (obj1 === obj2) return true;
+    if (obj1 == null || obj2 == null) return obj1 === obj2;
+    if (typeof obj1 !== 'object' || typeof obj2 !== 'object') return false;
+
+    if (Array.isArray(obj1) && Array.isArray(obj2)) {
+      if (obj1.length !== obj2.length) return false;
+      for (let i = 0; i < obj1.length; i++) {
+        if (!this.deepEqual(obj1[i], obj2[i])) return false;
+      }
+      return true;
+    }
+
+    const keys1 = Object.keys(obj1).filter((k) => obj1[k] !== undefined);
+    const keys2 = Object.keys(obj2).filter((k) => obj2[k] !== undefined);
+
+    if (keys1.length !== keys2.length) return false;
+
+    for (const key of keys1) {
+      if (!keys2.includes(key)) return false;
+      if (!this.deepEqual(obj1[key], obj2[key])) return false;
+    }
+
+    return true;
+  }
+}
diff --git a/src/node/_node_tokenizer_platform.ts b/src/node/_node_tokenizer_platform.ts
new file mode 100644
index 000000000..792419608
--- /dev/null
+++ b/src/node/_node_tokenizer_platform.ts
@@ -0,0 +1,110 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import * as crypto from 'crypto';
+import * as fs from 'fs/promises';
+import * as os from 'os';
+import * as path from 'path';
+
+import {
+  TokenizerCache,
+  TokenizerFileSystem,
+  TokenizerPlatform,
+} from '../cross/tokenizer/_interfaces.js';
+
+/**
+ * Node.js implementation of tokenizer cache using the file system.
+ */
+export class NodeTokenizerCache implements TokenizerCache {
+  private cacheDir: string;
+
+  constructor() {
+    this.cacheDir = path.join(os.tmpdir(), 'vertexai_tokenizer_model');
+  }
+
+  async load(
+    cacheKey: string,
+    expectedHash: string,
+  ): Promise<Uint8Array | null> {
+    const filePath = path.join(this.cacheDir, cacheKey);
+    try {
+      const data = await fs.readFile(filePath);
+      const hash = crypto.createHash('sha256').update(data).digest('hex');
+
+      if (hash === expectedHash) {
+        return new Uint8Array(data);
+      }
+
+      await this.removeFile(filePath);
+      return null;
+      // eslint-disable-next-line @typescript-eslint/no-unused-vars
+    } catch (error) {
+      return null;
+    }
+  }
+
+  async save(cacheKey: string, data: Uint8Array): Promise<void> {
+    const filePath = path.join(this.cacheDir, cacheKey);
+    try {
+      await fs.mkdir(this.cacheDir, {recursive: true});
+
+      const tmpPath = `${this.cacheDir}.${crypto.randomUUID()}.tmp`;
+      await fs.writeFile(tmpPath, data);
+      await fs.rename(tmpPath, filePath);
+      // eslint-disable-next-line @typescript-eslint/no-unused-vars
+    } catch (error) {
+      // Cache is optional, so errors are silently ignored
+    }
+  }
+
+  private async removeFile(filePath: string): Promise<void> {
+    try {
+      await fs.unlink(filePath);
+      // eslint-disable-next-line @typescript-eslint/no-unused-vars
+    } catch (error) {
+      // Cache is optional, so errors are silently ignored
+    }
+  }
+}
+
+/**
+ * Node.js implementation of tokenizer file system operations.
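+ *
+ * Uses the global fetch API for downloads and node:crypto for SHA-256
+ * validation and SHA-1 cache keys.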
+ */
+export class NodeTokenizerFileSystem implements TokenizerFileSystem {
+  async fetchFromUrl(url: string): Promise<Uint8Array> {
+    const response = await fetch(url);
+    if (!response.ok) {
+      throw new Error(
+        `Failed to fetch tokenizer model from ${url}: ${response.statusText}`,
+      );
+    }
+    const arrayBuffer = await response.arrayBuffer();
+    return new Uint8Array(arrayBuffer);
+  }
+
+  async validateHash(
+    data: Uint8Array,
+    expectedHash: string,
+  ): Promise<boolean> {
+    const hash = crypto.createHash('sha256').update(data).digest('hex');
+    return hash === expectedHash;
+  }
+
+  async computeSha1(text: Uint8Array): Promise<string> {
+    const hash = crypto.createHash('sha1').update(text).digest('hex');
+    return hash;
+  }
+}
+
+/**
+ * Node.js platform implementation for tokenizer.
+ */
+export class NodeTokenizerPlatform implements TokenizerPlatform {
+  cache: TokenizerCache;
+  fileSystem: TokenizerFileSystem;
+
+  constructor() {
+    this.cache = new NodeTokenizerCache();
+    this.fileSystem = new NodeTokenizerFileSystem();
+  }
+}
diff --git a/src/node/local_tokenizer.ts b/src/node/local_tokenizer.ts
new file mode 100644
index 000000000..18be4d45c
--- /dev/null
+++ b/src/node/local_tokenizer.ts
@@ -0,0 +1,78 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * Node.js-specific LocalTokenizer implementation.
+ *
+ * This wrapper automatically uses the Node.js platform (filesystem caching, crypto hashing)
+ * without requiring users to manually create a platform instance.
+ */
+
+import {LocalTokenizer as BaseLocalTokenizer} from '../cross/tokenizer/_local_tokenizer_impl.js';
+import type {
+  ComputeTokensResult,
+  ContentListUnion,
+  CountTokensConfig,
+  CountTokensResult,
+} from '../types.js';
+import {NodeTokenizerPlatform} from './_node_tokenizer_platform.js';
+
+/**
+ * LocalTokenizer for the Node.js environment.
+ *
+ * Provides local tokenization for Gemini models without requiring API calls.
+ * Automatically uses the Node.js platform (filesystem caching in a temp directory).
+ *
+ * @example
+ * ```typescript
+ * import {LocalTokenizer} from '@google/genai/node';
+ *
+ * const tokenizer = new LocalTokenizer('gemini-2.0-flash-001');
+ * const result = await tokenizer.countTokens("What is your name?");
+ * console.log(result.totalTokens); // 5
+ * ```
+ *
+ * @experimental This API is experimental and may change in future versions.
+ */
+export class LocalTokenizer {
+  private baseTokenizer: BaseLocalTokenizer;
+
+  /**
+   * Creates a new LocalTokenizer for Node.js.
+   *
+   * @param modelName Gemini model name (e.g., 'gemini-2.0-flash-001')
+   */
+  constructor(modelName: string) {
+    const platform = new NodeTokenizerPlatform();
+    this.baseTokenizer = new BaseLocalTokenizer(modelName, platform);
+  }
+
+  /**
+   * Counts the number of tokens in the given content.
+   *
+   * @param contents The contents to tokenize
+   * @param config Optional configuration for counting tokens
+   * @return A CountTokensResult containing the total number of tokens
+   */
+  async countTokens(
+    contents: ContentListUnion,
+    config?: CountTokensConfig,
+  ): Promise<CountTokensResult> {
+    return this.baseTokenizer.countTokens(contents, config);
+  }
+
+  /**
+   * Computes detailed token information for the given content.
+   *
+   * @param contents The contents to tokenize
+   * @return A ComputeTokensResult containing token IDs, bytes, and roles
+   */
+  async computeTokens(
+    contents: ContentListUnion,
+  ): Promise<ComputeTokensResult> {
+    return this.baseTokenizer.computeTokens(contents);
+  }
+}
diff --git a/src/tokenizer/node.ts b/src/tokenizer/node.ts
new file mode 100644
index 000000000..7cf32597b
--- /dev/null
+++ b/src/tokenizer/node.ts
@@ -0,0 +1,34 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * Public API for LocalTokenizer (Node.js).
+ *
+ * This module provides local tokenization capabilities for Gemini models without
+ * requiring API calls. The tokenizer uses the SentencePiece BPE algorithm and
+ * supports text-only token counting and computation.
+ *
+ * @example Node.js Usage
+ * ```typescript
+ * import {LocalTokenizer} from '@google/genai/tokenizer/node';
+ *
+ * const tokenizer = new LocalTokenizer('gemini-2.0-flash-001');
+ * const result = await tokenizer.countTokens("What is your name?");
+ * console.log(result.totalTokens); // 5
+ * ```
+ *
+ * @experimental This API is experimental and may change in future versions.
+ */
+
+// Re-export from node-specific local_tokenizer
+export {LocalTokenizer} from '../node/local_tokenizer.js';
+
+// Re-export types that users might need
+export type {
+  ComputeTokensResult,
+  CountTokensResult,
+  TokensInfo,
+} from '../types.js';
diff --git a/src/tokenizer/web.ts b/src/tokenizer/web.ts
new file mode 100644
index 000000000..13cb977d2
--- /dev/null
+++ b/src/tokenizer/web.ts
@@ -0,0 +1,31 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * Public API for LocalTokenizer (Web/Browser).
+ *
+ * ⚠️ NOT YET IMPLEMENTED ⚠️
+ *
+ * Web tokenizer support is planned but not yet available. The implementation requires:
+ * - IndexedDB for caching tokenizer models
+ * - SubtleCrypto API for SHA-256/SHA-1 hashing
+ * - Fetch API for downloading models
+ *
+ * See LOCAL_TOKENIZER_DESIGN.md for implementation details and re-enablement steps.
+ *
+ * @experimental This API is experimental and may change in future versions.
+ */
+
+// Web tokenizer exports are disabled until the platform implementation is complete.
+// Uncomment these exports when WebTokenizerPlatform is fully implemented:
+//
+// export {LocalTokenizer} from '../web/local_tokenizer.js';
+//
+// export type {
+//   ComputeTokensResult,
+//   CountTokensResult,
+//   TokensInfo,
+// } from '../types.js';
diff --git a/src/types.ts b/src/types.ts
index 761ecdb93..7c8c7188d 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -6856,6 +6856,18 @@ export declare interface OperationGetParameters> {
   operation: U;
 }
 
+/** Local tokenizer count tokens result. */
+export declare interface CountTokensResult {
+  /** The total number of tokens. */
+  totalTokens?: number;
+}
+
+/** Local tokenizer compute tokens result. */
+export declare interface ComputeTokensResult {
+  /** Lists of tokens info from the input. */
+  tokensInfo?: TokensInfo[];
+}
+
 /** Fine-tuning job creation parameters - optional fields. */
 export declare interface CreateTuningJobParameters {
   /** The base model that is being tuned, e.g., "gemini-2.5-flash".
 */
diff --git a/src/web/_web_tokenizer_platform.ts b/src/web/_web_tokenizer_platform.ts
new file mode 100644
index 000000000..233a7f505
--- /dev/null
+++ b/src/web/_web_tokenizer_platform.ts
@@ -0,0 +1,73 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import {
+  TokenizerCache,
+  TokenizerFileSystem,
+  TokenizerPlatform,
+} from '../cross/tokenizer/_interfaces.js';
+
+/**
+ * Web implementation of tokenizer cache using IndexedDB.
+ * TODO: Implement using IndexedDB for web storage.
+ */
+export class WebTokenizerCache implements TokenizerCache {
+  async load(
+    _cacheKey: string,
+    _expectedHash: string,
+  ): Promise<Uint8Array | null> {
+    throw new Error(
+      'Web tokenizer cache not yet implemented. Use Node.js environment for local tokenization.',
+    );
+  }
+
+  async save(_cacheKey: string, _data: Uint8Array): Promise<void> {
+    throw new Error(
+      'Web tokenizer cache not yet implemented. Use Node.js environment for local tokenization.',
+    );
+  }
+}
+
+/**
+ * Web implementation of tokenizer file system operations.
+ * TODO: Implement using the fetch API and SubtleCrypto.
+ */
+export class WebTokenizerFileSystem implements TokenizerFileSystem {
+  async fetchFromUrl(_url: string): Promise<Uint8Array> {
+    throw new Error(
+      'Web tokenizer file system not yet implemented. Use Node.js environment for local tokenization.',
+    );
+  }
+
+  async validateHash(
+    _data: Uint8Array,
+    _expectedHash: string,
+  ): Promise<boolean> {
+    throw new Error(
+      'Web tokenizer file system not yet implemented. Use Node.js environment for local tokenization.',
+    );
+  }
+
+  async computeSha1(_text: Uint8Array): Promise<string> {
+    throw new Error(
+      'Web tokenizer file system not yet implemented. Use Node.js environment for local tokenization.',
+    );
+  }
+}
+
+/**
+ * Web platform implementation for tokenizer.
+ * TODO: Complete implementation for web environment.
+ */
+export class WebTokenizerPlatform implements TokenizerPlatform {
+  cache: TokenizerCache;
+  fileSystem: TokenizerFileSystem;
+
+  constructor() {
+    this.cache = new WebTokenizerCache();
+    this.fileSystem = new WebTokenizerFileSystem();
+  }
+}
diff --git a/src/web/local_tokenizer.ts b/src/web/local_tokenizer.ts
new file mode 100644
index 000000000..e9fd8737c
--- /dev/null
+++ b/src/web/local_tokenizer.ts
@@ -0,0 +1,81 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * Web-specific LocalTokenizer implementation.
+ *
+ * This wrapper automatically uses the Web platform (IndexedDB caching, SubtleCrypto hashing)
+ * without requiring users to manually create a platform instance.
+ *
+ * NOTE: The Web implementation is not yet complete. Use the Node.js environment for now.
+ */
+
+import {LocalTokenizer as BaseLocalTokenizer} from '../cross/tokenizer/_local_tokenizer_impl.js';
+import type {
+  ComputeTokensResult,
+  ContentListUnion,
+  CountTokensConfig,
+  CountTokensResult,
+} from '../types.js';
+import {WebTokenizerPlatform} from './_web_tokenizer_platform.js';
+
+/**
+ * LocalTokenizer for the Web environment.
+ *
+ * Provides local tokenization for Gemini models without requiring API calls.
+ * Automatically uses the Web platform (IndexedDB caching, SubtleCrypto hashing).
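+ *
+ * Note that WebTokenizerPlatform currently throws for every operation, so
+ * the example below will only work once the web platform is implemented.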
+ *
+ * @example
+ * ```typescript
+ * import {LocalTokenizer} from '@google/genai/web';
+ *
+ * const tokenizer = new LocalTokenizer('gemini-2.0-flash-001');
+ * const result = await tokenizer.countTokens("What is your name?");
+ * console.log(result.totalTokens); // 5
+ * ```
+ *
+ * @experimental This API is experimental and not yet fully implemented.
+ * Use the Node.js environment for now.
+ */
+export class LocalTokenizer {
+  private baseTokenizer: BaseLocalTokenizer;
+
+  /**
+   * Creates a new LocalTokenizer for Web.
+   *
+   * @param modelName Gemini model name (e.g., 'gemini-2.0-flash-001')
+   */
+  constructor(modelName: string) {
+    const platform = new WebTokenizerPlatform();
+    this.baseTokenizer = new BaseLocalTokenizer(modelName, platform);
+  }
+
+  /**
+   * Counts the number of tokens in the given content.
+   *
+   * @param contents The contents to tokenize
+   * @param config Optional configuration for counting tokens
+   * @return A CountTokensResult containing the total number of tokens
+   */
+  async countTokens(
+    contents: ContentListUnion,
+    config?: CountTokensConfig,
+  ): Promise<CountTokensResult> {
+    return this.baseTokenizer.countTokens(contents, config);
+  }
+
+  /**
+   * Computes detailed token information for the given content.
+   *
+   * @param contents The contents to tokenize
+   * @return A ComputeTokensResult containing token IDs, bytes, and roles
+   */
+  async computeTokens(
+    contents: ContentListUnion,
+  ): Promise<ComputeTokensResult> {
+    return this.baseTokenizer.computeTokens(contents);
+  }
+}
diff --git a/test/unit/cross/sentencepiece/processor_test.ts b/test/unit/cross/sentencepiece/processor_test.ts
new file mode 100644
index 000000000..daa7643b8
--- /dev/null
+++ b/test/unit/cross/sentencepiece/processor_test.ts
@@ -0,0 +1,287 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * SentencePiece Processor tests.
+ * Translated from https://github.com/eliben/go-sentencepiece/blob/main/processor_test.go
+ */
+
+import {
+  SentencePieceProcessor,
+  Token,
+} from '../../../../src/cross/sentencepiece/_processor.js';
+import {loadModelProtoBytes} from '../../../../src/cross/tokenizer/_loader.js';
+import {NodeTokenizerPlatform} from '../../../../src/node/_node_tokenizer_platform.js';
+
+/**
+ * Creates a processor instance for testing.
+ * Downloads the gemma2 tokenizer model if needed.
+ * Note: Test expectations are based on the gemma2 model with vocab size 256000.
+ */
+async function createProcessor(): Promise<SentencePieceProcessor> {
+  const platform = new NodeTokenizerPlatform();
+  const modelBytes = await loadModelProtoBytes('gemma2', platform);
+  return new SentencePieceProcessor(modelBytes);
+}
+
+describe('SentencePieceProcessor', () => {
+  describe('encode - IDs only', () => {
+    let proc: SentencePieceProcessor;
+
+    beforeAll(async () => {
+      proc = await createProcessor();
+    });
+
+    const tests = [
+      {text: 'hello world', wantIDs: [17534, 2134]},
+      {text: '12345', wantIDs: [235274, 235284, 235304, 235310, 235308]},
+      {text: ' ', wantIDs: [139]},
+      {text: ' ', wantIDs: [140]},
+      {text: ' ', wantIDs: [145]},
+      {text: 'ҔӌԐڎ', wantIDs: [427, 365, 428, 357, 429, 361, 435, 359]},
+      {text: ' ', wantIDs: [235248, 4, 139, 235322, 8939, 235313]},
+      {text: '', wantIDs: [169, 175, 183, 177]},
+      {
+        text: 'one line\nand another line',
+        wantIDs: [785, 2017, 108, 639, 2550, 2017],
+      },
+      {
+        text: 'Language: English\r\n\r\nCredits: Produced by David Widger\r\n',
+        wantIDs: [
+          14357, 235292, 4645, 235316, 108, 235316, 108, 34711, 235292, 99662,
+          731, 6046, 37303, 1197, 235316, 108,
+        ],
+      },
+      {text: 'Bienvenido a este proyecto', wantIDs: [176831, 476, 4004, 25431]},
+      {
+        text: 'अस्मिन् परियोजनायां स्वागतम्',
+        wantIDs: [
+          236088, 22740, 212361, 18029, 14480, 19900, 146166, 6751, 235563,
+          56545, 44071, 235550, 26989,
+        ],
+      },
+      {
+        text: 'if allow == true { return x;} else {return x+y;}',
+        wantIDs: [
+          648, 2765, 1159, 1382, 612, 2203, 1141, 22505, 1354, 612, 773, 1141,
+          235340, 235267, 22505,
+        ],
+      },
+    ];
+
+    tests.forEach(({text, wantIDs}) => {
+      it(`should encode "${text}"`, () => {
+        const got = proc.encode(text);
+        const gotIDs = got.map((t) => t.id);
+
+        expect(gotIDs).toEqual(wantIDs);
+      });
+    });
+  });
+
+  describe('encode - with text', () => {
+    let proc: SentencePieceProcessor;
+
+    beforeAll(async () => {
+      proc = await createProcessor();
+    });
+
+    const tests = [
+      {
+        text: 'hi bye',
+        wantTokens: [
+          {id: 544, text: 'hi'},
+          {id: 235248, text: '▁'},
+          {id: 176, text: ''},
+          {id: 44788, text: '▁bye'},
+        ],
+      },
+      {
+        text: 'hiƻ 🤨there ⇲bob, สวัสดี',
+        wantTokens: [
+          {id: 544, text: 'hi'},
+          {id: 415, text: '<0xC6>'},
+          {id: 404, text: '<0xBB>'},
+          {id: 235248, text: '▁'},
+          {id: 176, text: ''},
+          {id: 241847, text: '🤨'},
+          {id: 11048, text: 'there'},
+          {id: 235248, text: '▁'},
+          {id: 248372, text: '⇲'},
+          {id: 26242, text: 'bob'},
+          {id: 235269, text: ','},
+          {id: 12515, text: '▁ส'},
+          {id: 151622, text: 'วัส'},
+          {id: 28890, text: 'ดี'},
+        ],
+      },
+    ];
+
+    tests.forEach(({text, wantTokens}) => {
+      it(`should encode "${text}" with token text`, () => {
+        const got = proc.encode(text);
+        expect(got).toEqual(wantTokens);
+      });
+    });
+  });
+
+  describe('symbolMatch', () => {
+    let proc: SentencePieceProcessor;
+
+    beforeAll(async () => {
+      proc = await createProcessor();
+    });
+
+    const tests = [
+      {text: '', wantLen: 4, wantFound: true},
+      {text: '', wantLen: 3, wantFound: true},
+      {text: '', wantLen: 4, wantFound: true},
+      {text: '', wantLen: 15, wantFound: true},
+    ];
+
+    tests.forEach(({text, wantLen, wantFound}) => {
+      it(`should match symbol "${text}"`, () => {
+        // Note: symbolMatch is private, so we test it through encode behavior
+        // This test would need the method to be exposed or use type assertion
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        const [gotLen, gotFound] = (proc as any).symbolMatch(text);
+        expect(gotLen).toBe(wantLen);
+        expect(gotFound).toBe(wantFound);
+      });
+    });
+  });
+
+  describe('convertHexValue', () => {
+    const tests = [
+      {input: '<0x40>', wantN: 64},
+      {input: '<0x00>', wantN: 0},
+      {input: '<0x1a>', wantN: 26},
+      {input: '<0xF3>', wantN: 243},
+      {input: '0x12>', wantN: -1},
+      {input: '', wantN: -1},
+      {input: '<012>', wantN: -1},
+      {input: '<0xTA>', wantN: -1},
+    ];
+
+    tests.forEach(({input, wantN}) => {
+      it(`should convert "${input}" to ${wantN}`, () => {
+        // convertHexValue is a private function in the processor
+        // We need to test it indirectly or expose it for testing
+        const convertHexValue = (bv: string): number => {
+          const match = bv.match(/^<0x([0-9A-Fa-f]{2})>$/);
+          if (!match) {
+            return -1;
+          }
+          return parseInt(match[1], 16);
+        };
+
+        const gotN = convertHexValue(input);
+        expect(gotN).toBe(wantN);
+      });
+    });
+  });
+
+  describe('decode', () => {
+    let proc: SentencePieceProcessor;
+
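+    // Expected strings below are gemma2-specific (vocab size 256000);
+    // see the note on createProcessor() above.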
beforeAll(async () => { + proc = await createProcessor(); + }); + + const tests = [ + {ids: [17534, 2134], wantText: 'hello world'}, + { + ids: [427, 365, 428, 357, 29422, 1653, 427, 365, 428, 357], + wantText: 'Ҕӌnever againҔӌ', + }, + { + ids: [785, 2017, 108, 639, 2550, 2017], + wantText: 'one line\nand another line', + }, + {ids: [1001, 1002, 1003, 1004], wantText: 'buark}) res'}, + { + ids: [111001, 111002, 111003, 111004], + wantText: ' Wichita EducaçãoVocabulary天堂', + }, + {ids: [139], wantText: ' '}, + {ids: [140], wantText: ' '}, + {ids: [145], wantText: ' '}, + {ids: [441, 401, 387], wantText: 'ส'}, + {ids: [411, 380], wantText: '£'}, + + // control IDs (0, 1, 2) + {ids: [2, 411, 380], wantText: '£'}, + {ids: [1, 2, 411, 380], wantText: '£'}, + {ids: [2, 411, 380, 0, 1, 2, 0], wantText: '£'}, + + // unknown (id=3) + {ids: [3, 411, 380], wantText: ' ⁇ £'}, + {ids: [3, 3, 1000, 3], wantText: ' ⁇ ⁇ ew ⁇ '}, + + // invalid bytes for UTF-8, produce "invalid unicode" runes + {ids: [349, 349, 349], wantText: '���'}, + {ids: [800, 348, 500, 348], wantText: 'sed�it�'}, + ]; + + tests.forEach(({ids, wantText}) => { + it(`should decode [${ids}]`, () => { + const got = proc.decode(ids); + expect(got).toBe(wantText); + }); + }); + }); + + describe('decodeTokens', () => { + let proc: SentencePieceProcessor; + + beforeAll(async () => { + proc = await createProcessor(); + }); + + it('should decode tokens using their IDs', () => { + const wantText = 'hello world'; + const tokens: Token[] = [ + {id: 17534, text: 'xxx'}, + {id: 139, text: 'xxx'}, + {id: 2134, text: 'xxx'}, + ]; + + const text = proc.decodeTokens(tokens); + expect(text).toBe(wantText); + }); + }); + + describe('modelInfo', () => { + let proc: SentencePieceProcessor; + + beforeAll(async () => { + proc = await createProcessor(); + }); + + it('should return correct model info', () => { + const info = proc.modelInfo(); + + // Assumes we use the known model file + const wantVocabSize = 256000; + const wantBOS = 2; + const wantEOS = 1; + const wantPAD = 0; + const wantUNK = 3; + + expect(info.vocabularySize).toBe(wantVocabSize); + expect(info.beginningOfSentenceID).toBe(wantBOS); + expect(info.endOfSentenceID).toBe(wantEOS); + expect(info.padID).toBe(wantPAD); + expect(info.unknownID).toBe(wantUNK); + }); + }); +}); diff --git a/test/unit/cross/tokenizer/loader_test.ts b/test/unit/cross/tokenizer/loader_test.ts new file mode 100644 index 000000000..f92a14203 --- /dev/null +++ b/test/unit/cross/tokenizer/loader_test.ts @@ -0,0 +1,314 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { + TokenizerCache, + TokenizerFileSystem, + TokenizerPlatform, +} from '../../../../src/cross/tokenizer/_interfaces.js'; +import { + getTokenizerConfig, + getTokenizerName, + loadModelProtoBytes, +} from '../../../../src/cross/tokenizer/_loader.js'; + +describe('Tokenizer Loader', () => { + describe('getTokenizerName', () => { + it('should return gemma3 for gemini-2.5-pro', () => { + expect(getTokenizerName('gemini-2.5-pro')).toBe('gemma3'); + }); + + it('should return gemma3 for gemini-2.5-flash', () => { + expect(getTokenizerName('gemini-2.5-flash')).toBe('gemma3'); + }); + + it('should return gemma3 for gemini-2.5-flash-lite', () => { + expect(getTokenizerName('gemini-2.5-flash-lite')).toBe('gemma3'); + }); + + it('should return gemma3 for gemini-2.0-flash', () => { + expect(getTokenizerName('gemini-2.0-flash')).toBe('gemma3'); + }); + + it('should return gemma3 for gemini-2.0-flash-lite', () => { + 
expect(getTokenizerName('gemini-2.0-flash-lite')).toBe('gemma3'); + }); + + it('should return gemma3 for gemini-2.5-pro-preview-06-05', () => { + expect(getTokenizerName('gemini-2.5-pro-preview-06-05')).toBe('gemma3'); + }); + + it('should return gemma3 for gemini-2.5-pro-preview-05-06', () => { + expect(getTokenizerName('gemini-2.5-pro-preview-05-06')).toBe('gemma3'); + }); + + it('should return gemma3 for gemini-2.5-pro-exp-03-25', () => { + expect(getTokenizerName('gemini-2.5-pro-exp-03-25')).toBe('gemma3'); + }); + + it('should return gemma3 for gemini-live-2.5-flash', () => { + expect(getTokenizerName('gemini-live-2.5-flash')).toBe('gemma3'); + }); + + it('should return gemma3 for gemini-2.5-flash-preview-05-20', () => { + expect(getTokenizerName('gemini-2.5-flash-preview-05-20')).toBe('gemma3'); + }); + + it('should return gemma3 for gemini-2.5-flash-preview-04-17', () => { + expect(getTokenizerName('gemini-2.5-flash-preview-04-17')).toBe('gemma3'); + }); + + it('should return gemma3 for gemini-2.5-flash-lite-preview-06-17', () => { + expect(getTokenizerName('gemini-2.5-flash-lite-preview-06-17')).toBe( + 'gemma3', + ); + }); + + it('should return gemma3 for gemini-2.0-flash-001', () => { + expect(getTokenizerName('gemini-2.0-flash-001')).toBe('gemma3'); + }); + + it('should return gemma3 for gemini-2.0-flash-lite-001', () => { + expect(getTokenizerName('gemini-2.0-flash-lite-001')).toBe('gemma3'); + }); + + it('should throw error for unsupported model', () => { + expect(() => getTokenizerName('gemini-1.5-pro')).toThrowError( + /is not supported for local tokenization\. Supported models:/, + ); + }); + + it('should throw error for unknown model', () => { + expect(() => getTokenizerName('unknown-model')).toThrowError( + /is not supported for local tokenization\. Supported models:/, + ); + }); + + it('should include supported models in error message', () => { + try { + getTokenizerName('unsupported-model'); + fail('Should have thrown an error'); + } catch (error) { + expect((error as Error).message).toContain('gemini-2.5-pro'); + expect((error as Error).message).toContain('gemini-2.0-flash'); + } + }); + }); + + describe('getTokenizerConfig', () => { + it('should return config for gemma2 tokenizer', () => { + const config = getTokenizerConfig('gemma2'); + expect(config).toBeDefined(); + expect(config.modelUrl).toContain('tokenizer.model'); + expect(config.modelHash).toBe( + '61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2', + ); + }); + + it('should return config for gemma3 tokenizer', () => { + const config = getTokenizerConfig('gemma3'); + expect(config).toBeDefined(); + expect(config.modelUrl).toContain( + 'gemma3_cleaned_262144_v2.spiece.model', + ); + expect(config.modelHash).toBe( + '1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c', + ); + }); + + it('should throw error for unsupported tokenizer', () => { + expect(() => getTokenizerConfig('unknown-tokenizer')).toThrowError( + /is not supported\. 
Supported tokenizers:/,
+      );
+    });
+
+    it('should include supported tokenizers in error message', () => {
+      try {
+        getTokenizerConfig('unsupported-tokenizer');
+        fail('Should have thrown an error');
+      } catch (error) {
+        expect((error as Error).message).toContain('gemma2');
+        expect((error as Error).message).toContain('gemma3');
+      }
+    });
+  });
+
+  describe('loadModelProtoBytes', () => {
+    let mockCache: jasmine.SpyObj<TokenizerCache>;
+    let mockFileSystem: jasmine.SpyObj<TokenizerFileSystem>;
+    let mockPlatform: TokenizerPlatform;
+    let mockModelData: Uint8Array;
+
+    beforeEach(() => {
+      mockModelData = new Uint8Array([1, 2, 3, 4, 5]);
+
+      mockCache = jasmine.createSpyObj<TokenizerCache>('TokenizerCache', [
+        'load',
+        'save',
+      ]);
+
+      mockFileSystem = jasmine.createSpyObj<TokenizerFileSystem>(
+        'TokenizerFileSystem',
+        ['fetchFromUrl', 'validateHash', 'computeSha1'],
+      );
+
+      mockPlatform = {
+        cache: mockCache,
+        fileSystem: mockFileSystem,
+      };
+    });
+
+    it('should load model from cache if available and valid', async () => {
+      mockCache.load.and.returnValue(Promise.resolve(mockModelData));
+      mockFileSystem.computeSha1.and.returnValue(Promise.resolve('cache-key'));
+
+      const result = await loadModelProtoBytes('gemma3', mockPlatform);
+
+      expect(result).toBe(mockModelData);
+      expect(mockCache.load).toHaveBeenCalledWith(
+        'cache-key',
+        '1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c',
+      );
+      expect(mockFileSystem.fetchFromUrl).not.toHaveBeenCalled();
+    });
+
+    it('should download and cache model if not in cache', async () => {
+      mockCache.load.and.returnValue(Promise.resolve(null));
+      mockFileSystem.computeSha1.and.returnValue(Promise.resolve('cache-key'));
+      mockFileSystem.fetchFromUrl.and.returnValue(
+        Promise.resolve(mockModelData),
+      );
+      mockFileSystem.validateHash.and.returnValue(Promise.resolve(true));
+
+      const result = await loadModelProtoBytes('gemma3', mockPlatform);
+
+      expect(result).toBe(mockModelData);
+      expect(mockFileSystem.fetchFromUrl).toHaveBeenCalledWith(
+        jasmine.stringContaining('gemma3_cleaned_262144_v2.spiece.model'),
+      );
+      expect(mockFileSystem.validateHash).toHaveBeenCalledWith(
+        mockModelData,
+        '1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c',
+      );
+      expect(mockCache.save).toHaveBeenCalledWith('cache-key', mockModelData);
+    });
+
+    it('should compute cache key from model URL', async () => {
+      mockCache.load.and.returnValue(Promise.resolve(null));
+      mockFileSystem.computeSha1.and.returnValue(
+        Promise.resolve('computed-key'),
+      );
+      mockFileSystem.fetchFromUrl.and.returnValue(
+        Promise.resolve(mockModelData),
+      );
+      mockFileSystem.validateHash.and.returnValue(Promise.resolve(true));
+
+      await loadModelProtoBytes('gemma3', mockPlatform);
+
+      expect(mockFileSystem.computeSha1).toHaveBeenCalled();
+      const call = mockFileSystem.computeSha1.calls.first();
+      const urlBytes = call.args[0] as Uint8Array;
+      const decoder = new TextDecoder();
+      const url = decoder.decode(urlBytes);
+      expect(url).toContain('gemma3_cleaned_262144_v2.spiece.model');
+    });
+
+    it('should throw error if downloaded model hash is invalid', async () => {
+      mockCache.load.and.returnValue(Promise.resolve(null));
+      mockFileSystem.computeSha1.and.returnValue(Promise.resolve('cache-key'));
+      mockFileSystem.fetchFromUrl.and.returnValue(
+        Promise.resolve(mockModelData),
+      );
+      mockFileSystem.validateHash.and.returnValue(Promise.resolve(false));
+
+      await expectAsync(
+        loadModelProtoBytes('gemma3', mockPlatform),
+      ).toBeRejectedWithError(/Downloaded model file is corrupted/);
+
+      expect(mockCache.save).not.toHaveBeenCalled();
+    });
+
+    it('should include expected and actual hash in error message', async () => {
+      mockCache.load.and.returnValue(Promise.resolve(null));
+      mockFileSystem.computeSha1.and.returnValues(
+        Promise.resolve('cache-key'),
+        Promise.resolve('actual-hash-value'),
+      );
+      mockFileSystem.fetchFromUrl.and.returnValue(
+        Promise.resolve(mockModelData),
+      );
+      mockFileSystem.validateHash.and.returnValue(Promise.resolve(false));
+
+      try {
+        await loadModelProtoBytes('gemma3', mockPlatform);
+        fail('Should have thrown an error');
+      } catch (error) {
+        expect((error as Error).message).toContain('Expected hash');
+        expect((error as Error).message).toContain(
+          '1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c',
+        );
+        expect((error as Error).message).toContain('Got file hash');
+        expect((error as Error).message).toContain('actual-hash-value');
+      }
+    });
+
+    it('should work with gemma2 tokenizer', async () => {
+      mockCache.load.and.returnValue(Promise.resolve(null));
+      mockFileSystem.computeSha1.and.returnValue(Promise.resolve('cache-key'));
+      mockFileSystem.fetchFromUrl.and.returnValue(
+        Promise.resolve(mockModelData),
+      );
+      mockFileSystem.validateHash.and.returnValue(Promise.resolve(true));
+
+      const result = await loadModelProtoBytes('gemma2', mockPlatform);
+
+      expect(result).toBe(mockModelData);
+      expect(mockFileSystem.fetchFromUrl).toHaveBeenCalledWith(
+        jasmine.stringContaining('tokenizer.model'),
+      );
+      expect(mockFileSystem.validateHash).toHaveBeenCalledWith(
+        mockModelData,
+        '61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2',
+      );
+    });
+
+    it('should not save to cache if hash validation fails', async () => {
+      mockCache.load.and.returnValue(Promise.resolve(null));
+      mockFileSystem.computeSha1.and.returnValues(
+        Promise.resolve('cache-key'),
+        Promise.resolve('wrong-hash'),
+      );
+      mockFileSystem.fetchFromUrl.and.returnValue(
+        Promise.resolve(mockModelData),
+      );
+      mockFileSystem.validateHash.and.returnValue(Promise.resolve(false));
+
+      await expectAsync(
+        loadModelProtoBytes('gemma3', mockPlatform),
+      ).toBeRejected();
+
+      expect(mockCache.save).not.toHaveBeenCalled();
+    });
+
+    it('should call cache load with correct cache key and hash', async () => {
+      const expectedCacheKey = 'test-cache-key';
+      const expectedHash =
+        '1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c';
+
+      mockFileSystem.computeSha1.and.returnValue(
+        Promise.resolve(expectedCacheKey),
+      );
+      mockCache.load.and.returnValue(Promise.resolve(mockModelData));
+
+      await loadModelProtoBytes('gemma3', mockPlatform);
+
+      expect(mockCache.load).toHaveBeenCalledWith(
+        expectedCacheKey,
+        expectedHash,
+      );
+    });
+  });
+});
diff --git a/test/unit/cross/tokenizer/local_tokenizer_impl_test.ts b/test/unit/cross/tokenizer/local_tokenizer_impl_test.ts
new file mode 100644
index 000000000..2d3463c6e
--- /dev/null
+++ b/test/unit/cross/tokenizer/local_tokenizer_impl_test.ts
@@ -0,0 +1,610 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import {
+  SentencePieceProcessor,
+  Token,
+} from '../../../../src/cross/sentencepiece/_processor.js';
+import {
+  TokenizerCache,
+  TokenizerFileSystem,
+  TokenizerPlatform,
+} from '../../../../src/cross/tokenizer/_interfaces.js';
+import {LocalTokenizer} from '../../../../src/cross/tokenizer/_local_tokenizer_impl.js';
+import {Content, CountTokensConfig, Tool, Type} from '../../../../src/types.js';
+
+describe('LocalTokenizer', () => {
+  let mockCache: jasmine.SpyObj<TokenizerCache>;
+  let mockFileSystem: jasmine.SpyObj<TokenizerFileSystem>;
+  let mockPlatform: TokenizerPlatform;
+  let mockModelData: Uint8Array;
+  let tokenizer: LocalTokenizer;
+
+  beforeEach(() => {
+    mockModelData = new Uint8Array([1, 2, 3, 4, 5]);
+
+    mockCache = jasmine.createSpyObj<TokenizerCache>('TokenizerCache', [
+      'load',
+      'save',
+    ]);
+
+    mockFileSystem = jasmine.createSpyObj<TokenizerFileSystem>(
+      'TokenizerFileSystem',
+      ['fetchFromUrl', 'validateHash', 'computeSha1'],
+    );
+
+    mockPlatform = {
+      cache: mockCache,
+      fileSystem: mockFileSystem,
+    };
+
+    // Setup default mock behaviors
+    mockCache.load.and.returnValue(Promise.resolve(mockModelData));
+    mockFileSystem.computeSha1.and.returnValue(Promise.resolve('cache-key'));
+  });
+
+  describe('constructor', () => {
+    it('should create tokenizer instance for gemini-2.0-flash-001', () => {
+      tokenizer = new LocalTokenizer('gemini-2.0-flash-001', mockPlatform);
+      expect(tokenizer).toBeDefined();
+    });
+
+    it('should create tokenizer instance for gemini-2.5-pro', () => {
+      tokenizer = new LocalTokenizer('gemini-2.5-pro', mockPlatform);
+      expect(tokenizer).toBeDefined();
+    });
+
+    it('should create tokenizer instance for gemini-2.5-flash', () => {
+      tokenizer = new LocalTokenizer('gemini-2.5-flash', mockPlatform);
+      expect(tokenizer).toBeDefined();
+    });
+
+    it('should throw error for unsupported model', () => {
+      expect(
+        () => new LocalTokenizer('unsupported-model', mockPlatform),
+      ).toThrowError(/is not supported for local tokenization/);
+    });
+  });
+
+  describe('countTokens', () => {
+    let mockProcessor: jasmine.SpyObj<SentencePieceProcessor>;
+
+    beforeEach(() => {
+      // Create a spy for SentencePieceProcessor
+      mockProcessor = jasmine.createSpyObj<SentencePieceProcessor>(
+        'SentencePieceProcessor',
+        ['encode', 'decode'],
+      );
+
+      // Mock the encode method to return tokens
+      mockProcessor.encode.and.callFake((text: string): Token[] => {
+        // Simple mock: return one token per word
+        const words = text.split(/\s+/).filter((w) => w.length > 0);
+        return words.map((word, idx) => ({id: idx, text: word}));
+      });
+
+      // Spy on SentencePieceProcessor constructor
+      type ProcessorConstructor = new (
+        modelBytes: Uint8Array,
+      ) => SentencePieceProcessor;
+      const MockProcessorConstructor = jasmine
+        .createSpy('ProcessorConstructor')
+        .and.returnValue(mockProcessor) as unknown as ProcessorConstructor;
+
+      tokenizer = new LocalTokenizer(
+        'gemini-2.0-flash-001',
+        mockPlatform,
+        MockProcessorConstructor,
+      );
+    });
+
+    it('should count tokens for simple string content', async () => {
+      const result = await tokenizer.countTokens('Hello world');
+      expect(result.totalTokens).toBeGreaterThan(0);
+      expect(mockCache.load).toHaveBeenCalled();
+    });
+
+    it('should count tokens for Content object', async () => {
+      const content: Content = {
+        role: 'user',
+        parts: [{text: 'What is your name?'}],
+      };
+      const result = await tokenizer.countTokens(content);
+      expect(result.totalTokens).toBeGreaterThan(0);
+    });
+
+    it('should count tokens for array of Content objects', async () => {
+      const contents: Content[] = [
+        {role: 'user', parts: [{text: 'Hello'}]},
+        {role: 'model', parts: [{text: 'Hi there'}]},
+      ];
+      const result = await tokenizer.countTokens(contents);
+      expect(result.totalTokens).toBeGreaterThan(0);
+    });
+
+    it('should include system instruction in token count', async () => {
+      const config: CountTokensConfig = {
+        systemInstruction: 'You are a helpful assistant',
+      };
+      const result = await tokenizer.countTokens('Hello', config);
+      expect(result.totalTokens).toBeGreaterThan(0);
+    });
+
+    it('should include tools in token count', async () => {
+      const tools: Tool[] = [
+        {
+          functionDeclarations: [
+            {
+              name: 'getWeather',
+              description: 'Get the current weather',
+              parameters: {
+                type: Type.OBJECT,
+                properties: {
+                  location: {type: Type.STRING, description: 'City name'},
+                },
+              },
+            },
+          ],
+        },
+      ];
+      const config: CountTokensConfig = {tools};
+      const result = await tokenizer.countTokens(
+        'What is the weather?',
+        config,
+      );
+      expect(result.totalTokens).toBeGreaterThan(0);
+    });
+
+    it('should include response schema in token count', async () => {
+      const config: CountTokensConfig = {
+        generationConfig: {
+          responseSchema: {
+            type: Type.OBJECT,
+            properties: {
+              answer: {type: Type.STRING, description: 'The answer'},
+            },
+          },
+        },
+      };
+      const result = await tokenizer.countTokens('Question?', config);
+      expect(result.totalTokens).toBeGreaterThan(0);
+    });
+
+    it('should handle empty content', async () => {
+      const result = await tokenizer.countTokens('');
+      expect(result.totalTokens).toBe(0);
+    });
+
+    it('should count tokens for content with function calls', async () => {
+      const content: Content = {
+        role: 'model',
+        parts: [
+          {
+            functionCall: {
+              name: 'getWeather',
+              args: {location: 'San Francisco'},
+            },
+          },
+        ],
+      };
+      const result = await tokenizer.countTokens(content);
+      expect(result.totalTokens).toBeGreaterThan(0);
+    });
+
+    it('should count tokens for content with function responses', async () => {
+      const content: Content = {
+        role: 'function',
+        parts: [
+          {
+            functionResponse: {
+              name: 'getWeather',
+              response: {temperature: '72F', condition: 'sunny'},
+            },
+          },
+        ],
+      };
+      const result = await tokenizer.countTokens(content);
+      expect(result.totalTokens).toBeGreaterThan(0);
+    });
+
+    it('should load model only once for multiple calls', async () => {
+      await tokenizer.countTokens('First call');
+      await tokenizer.countTokens('Second call');
+      await tokenizer.countTokens('Third call');
+
+      // Cache load should be called only once (during first countTokens call)
+      expect(mockCache.load).toHaveBeenCalledTimes(1);
+    });
+
+    it('should handle complex config with all options', async () => {
+      const config: CountTokensConfig = {
+        systemInstruction: {
+          role: 'system',
+          parts: [{text: 'You are helpful'}],
+        },
+        tools: [
+          {
+            functionDeclarations: [
+              {
+                name: 'search',
+                description: 'Search the web',
+              },
+            ],
+          },
+        ],
+        generationConfig: {
+          responseSchema: {
+            type: Type.OBJECT,
+            properties: {result: {type: Type.STRING}},
+          },
+        },
+      };
+      const result = await tokenizer.countTokens('Hello', config);
+      expect(result.totalTokens).toBeGreaterThan(0);
+    });
+  });
+
+  describe('computeTokens', () => {
+    let mockProcessor: jasmine.SpyObj<SentencePieceProcessor>;
+
+    beforeEach(() => {
+      mockProcessor = jasmine.createSpyObj<SentencePieceProcessor>(
+        'SentencePieceProcessor',
+        ['encode', 'decode'],
+      );
+
+      mockProcessor.encode.and.callFake((text: string): Token[] => {
+        const words = text.split(/\s+/).filter((w) => w.length > 0);
+        return words.map((word, idx) => ({id: idx + 100, text: word}));
+      });
+
+      type ProcessorConstructor = new (
+        modelBytes: Uint8Array,
+      ) => SentencePieceProcessor;
+      const MockProcessorConstructor = jasmine
+        .createSpy('ProcessorConstructor')
+        .and.returnValue(mockProcessor) as unknown as ProcessorConstructor;
+
+      tokenizer = new LocalTokenizer(
+        'gemini-2.0-flash-001',
+        mockPlatform,
+        MockProcessorConstructor,
+      );
+    });
+
+    it('should compute tokens for simple string', async () => {
+      const result = await tokenizer.computeTokens('Hello world');
+      expect(result.tokensInfo).toBeDefined();
expect(result.tokensInfo!.length).toBeGreaterThan(0); + }); + + it('should compute tokens for Content object', async () => { + const content: Content = { + role: 'user', + parts: [{text: 'What is AI?'}], + }; + const result = await tokenizer.computeTokens(content); + expect(result.tokensInfo).toBeDefined(); + expect(result.tokensInfo!.length).toBe(1); + expect(result.tokensInfo![0].role).toBe('user'); + }); + + it('should compute tokens for multiple Content objects', async () => { + const contents: Content[] = [ + {role: 'user', parts: [{text: 'Hello'}]}, + {role: 'model', parts: [{text: 'Hi'}]}, + ]; + const result = await tokenizer.computeTokens(contents); + expect(result.tokensInfo!.length).toBe(2); + expect(result.tokensInfo![0].role).toBe('user'); + expect(result.tokensInfo![1].role).toBe('model'); + }); + + it('should return token IDs as strings', async () => { + const result = await tokenizer.computeTokens('Test'); + expect(result.tokensInfo!.length).toBeGreaterThan(0); + const tokenIds = result.tokensInfo![0].tokenIds; + expect(tokenIds).toBeDefined(); + expect(tokenIds!.length).toBeGreaterThan(0); + expect(typeof tokenIds![0]).toBe('string'); + }); + + it('should return base64 encoded tokens', async () => { + const result = await tokenizer.computeTokens('Test'); + expect(result.tokensInfo!.length).toBeGreaterThan(0); + const tokens = result.tokensInfo![0].tokens; + expect(tokens).toBeDefined(); + expect(tokens!.length).toBeGreaterThan(0); + // Base64 encoded strings should only contain valid base64 characters + tokens!.forEach((token) => { + expect(token).toMatch(/^[A-Za-z0-9+/=]*$/); + }); + }); + + it('should handle content with function calls', async () => { + const content: Content = { + role: 'model', + parts: [ + { + functionCall: { + name: 'calculate', + args: {operation: 'add', x: 5, y: 3}, + }, + }, + ], + }; + const result = await tokenizer.computeTokens(content); + expect(result.tokensInfo!.length).toBe(1); + expect(result.tokensInfo![0].tokenIds!.length).toBeGreaterThan(0); + }); + + it('should skip content with no text', async () => { + const content: Content = { + role: 'user', + parts: [{videoMetadata: {startOffset: '0', endOffset: '1000'}}], + }; + const result = await tokenizer.computeTokens(content); + // Should not include tokensInfo for content with no extractable text + expect(result.tokensInfo!.length).toBe(0); + }); + + it('should handle mixed content with text and function calls', async () => { + const content: Content = { + role: 'model', + parts: [ + {text: 'Let me check that'}, + { + functionCall: { + name: 'search', + args: {query: 'weather'}, + }, + }, + ], + }; + const result = await tokenizer.computeTokens(content); + expect(result.tokensInfo!.length).toBe(1); + expect(result.tokensInfo![0].tokenIds!.length).toBeGreaterThan(0); + }); + + it('should preserve role information', async () => { + const contents: Content[] = [ + {role: 'user', parts: [{text: 'Question'}]}, + {role: 'model', parts: [{text: 'Answer'}]}, + { + role: 'function', + parts: [{functionResponse: {name: 'fn', response: {}}}], + }, + ]; + const result = await tokenizer.computeTokens(contents); + expect(result.tokensInfo![0].role).toBe('user'); + expect(result.tokensInfo![1].role).toBe('model'); + expect(result.tokensInfo![2].role).toBe('function'); + }); + + it('should handle empty content', async () => { + const result = await tokenizer.computeTokens(''); + expect(result.tokensInfo!.length).toBe(0); + }); + + it('should load model only once for multiple calls', async () => { + await 
tokenizer.computeTokens('First'); + await tokenizer.computeTokens('Second'); + await tokenizer.computeTokens('Third'); + + expect(mockCache.load).toHaveBeenCalledTimes(1); + }); + }); + + describe('model loading', () => { + let mockProcessor: jasmine.SpyObj<SentencePieceProcessor>; + + beforeEach(() => { + mockProcessor = jasmine.createSpyObj( + 'SentencePieceProcessor', + ['encode', 'decode'], + ); + + mockProcessor.encode.and.callFake((text: string): Token[] => { + const words = text.split(/\s+/).filter((w) => w.length > 0); + return words.map((word, idx) => ({id: idx + 100, text: word})); + }); + + type ProcessorConstructor = new ( + modelBytes: Uint8Array, + ) => SentencePieceProcessor; + const MockProcessorConstructor = jasmine + .createSpy('ProcessorConstructor') + .and.returnValue(mockProcessor) as unknown as ProcessorConstructor; + + tokenizer = new LocalTokenizer( + 'gemini-2.0-flash-001', + mockPlatform, + MockProcessorConstructor, + ); + }); + + it('should load model from cache', async () => { + mockCache.load.and.returnValue(Promise.resolve(mockModelData)); + + await tokenizer.countTokens('Test'); + + expect(mockCache.load).toHaveBeenCalled(); + }); + + it('should download model if not in cache', async () => { + mockCache.load.and.returnValue(Promise.resolve(null)); + mockFileSystem.fetchFromUrl.and.returnValue( + Promise.resolve(mockModelData), + ); + mockFileSystem.validateHash.and.returnValue(Promise.resolve(true)); + + await tokenizer.countTokens('Test'); + + expect(mockFileSystem.fetchFromUrl).toHaveBeenCalled(); + expect(mockCache.save).toHaveBeenCalledWith('cache-key', mockModelData); + }); + + it('should throw error if model download fails validation', async () => { + mockCache.load.and.returnValue(Promise.resolve(null)); + mockFileSystem.fetchFromUrl.and.returnValue( + Promise.resolve(mockModelData), + ); + mockFileSystem.validateHash.and.returnValue(Promise.resolve(false)); + mockFileSystem.computeSha1.and.returnValues( + Promise.resolve('cache-key'), + Promise.resolve('wrong-hash'), + ); + + await expectAsync(tokenizer.countTokens('Test')).toBeRejectedWithError( + /Downloaded model file is corrupted/, + ); + }); + }); + + describe('error handling', () => { + let mockProcessor: jasmine.SpyObj<SentencePieceProcessor>; + + beforeEach(() => { + mockProcessor = jasmine.createSpyObj( + 'SentencePieceProcessor', + ['encode', 'decode'], + ); + + mockProcessor.encode.and.callFake((text: string): Token[] => { + const words = text.split(/\s+/).filter((w) => w.length > 0); + return words.map((word, idx) => ({id: idx + 100, text: word})); + }); + + type ProcessorConstructor = new ( + modelBytes: Uint8Array, + ) => SentencePieceProcessor; + const MockProcessorConstructor = jasmine + .createSpy('ProcessorConstructor') + .and.returnValue(mockProcessor) as unknown as ProcessorConstructor; + + tokenizer = new LocalTokenizer( + 'gemini-2.0-flash-001', + mockPlatform, + MockProcessorConstructor, + ); + }); + + it('should throw error for non-text content in countTokens', async () => { + const content: Content = { + role: 'user', + parts: [{fileData: {mimeType: 'image/png', fileUri: 'gs://test'}}], + }; + + await expectAsync(tokenizer.countTokens(content)).toBeRejectedWithError( + /LocalTokenizers do not support non-text content types/, + ); + }); + + it('should throw error for non-text content in computeTokens', async () => { + const content: Content = { + role: 'user', + parts: [{inlineData: {mimeType: 'audio/mp3', data: 'base64'}}], + }; + + await expectAsync(tokenizer.computeTokens(content)).toBeRejectedWithError( + 
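+        /* inlineData should be rejected with the same error as fileData */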
/LocalTokenizers do not support non-text content types/, + ); + }); + }); + + describe('integration scenarios', () => { + let mockProcessor: jasmine.SpyObj<SentencePieceProcessor>; + + beforeEach(() => { + mockProcessor = jasmine.createSpyObj( + 'SentencePieceProcessor', + ['encode', 'decode'], + ); + + mockProcessor.encode.and.callFake((text: string): Token[] => { + const words = text.split(/\s+/).filter((w) => w.length > 0); + return words.map((word, idx) => ({id: idx + 100, text: word})); + }); + + type ProcessorConstructor = new ( + modelBytes: Uint8Array, + ) => SentencePieceProcessor; + const MockProcessorConstructor = jasmine + .createSpy('ProcessorConstructor') + .and.returnValue(mockProcessor) as unknown as ProcessorConstructor; + + tokenizer = new LocalTokenizer( + 'gemini-2.0-flash-001', + mockPlatform, + MockProcessorConstructor, + ); + }); + + it('should handle chat-like conversation', async () => { + const conversation: Content[] = [ + {role: 'user', parts: [{text: 'What is the weather today?'}]}, + { + role: 'model', + parts: [ + { + functionCall: { + name: 'getWeather', + args: {location: 'current'}, + }, + }, + ], + }, + { + role: 'function', + parts: [ + { + functionResponse: { + name: 'getWeather', + response: {temp: '75F', condition: 'sunny'}, + }, + }, + ], + }, + {role: 'model', parts: [{text: 'It is sunny and 75 degrees.'}]}, + ]; + + const countResult = await tokenizer.countTokens(conversation); + expect(countResult.totalTokens).toBeGreaterThan(0); + + const computeResult = await tokenizer.computeTokens(conversation); + expect(computeResult.tokensInfo!.length).toBe(4); + }); + + it('should handle content with tools and system instruction', async () => { + const config: CountTokensConfig = { + systemInstruction: 'You are a weather assistant', + tools: [ + { + functionDeclarations: [ + { + name: 'getWeather', + description: 'Get current weather', + parameters: { + type: Type.OBJECT, + properties: { + location: {type: Type.STRING}, + }, + required: ['location'], + }, + }, + ], + }, + ], + }; + + const result = await tokenizer.countTokens( + 'What is the weather in Paris?', + config, + ); + expect(result.totalTokens).toBeGreaterThan(0); + }); + }); +}); diff --git a/test/unit/cross/tokenizer/texts_accumulator_test.ts b/test/unit/cross/tokenizer/texts_accumulator_test.ts new file mode 100644 index 000000000..7cb3db678 --- /dev/null +++ b/test/unit/cross/tokenizer/texts_accumulator_test.ts @@ -0,0 +1,542 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import {TextsAccumulator} from '../../../../src/cross/tokenizer/_texts_accumulator.js'; +import { + Content, + FunctionCall, + FunctionResponse, + Schema, + Tool, + Type, +} from '../../../../src/types.js'; + +describe('TextsAccumulator', () => { + let accumulator: TextsAccumulator; + + beforeEach(() => { + accumulator = new TextsAccumulator(); + }); + + describe('getTexts', () => { + it('should return empty array when no texts added', () => { + expect(accumulator.getTexts()).toEqual([]); + }); + + it('should return accumulated texts', () => { + const content: Content = { + role: 'user', + parts: [{text: 'Hello world'}], + }; + accumulator.addContent(content); + expect(accumulator.getTexts()).toEqual(['Hello world']); + }); + }); + + describe('addContent', () => { + it('should add text from simple content', () => { + const content: Content = { + role: 'user', + parts: [{text: 'Hello'}], + }; + accumulator.addContent(content); + expect(accumulator.getTexts()).toEqual(['Hello']); + }); + + it('should 
add multiple texts from content with multiple parts', () => { + const content: Content = { + role: 'user', + parts: [{text: 'Hello'}, {text: 'World'}], + }; + accumulator.addContent(content); + expect(accumulator.getTexts()).toEqual(['Hello', 'World']); + }); + + it('should throw error for fileData', () => { + const content: Content = { + role: 'user', + parts: [{fileData: {mimeType: 'image/png', fileUri: 'gs://test'}}], + }; + expect(() => accumulator.addContent(content)).toThrowError( + 'LocalTokenizers do not support non-text content types.', + ); + }); + + it('should throw error for inlineData', () => { + const content: Content = { + role: 'user', + parts: [{inlineData: {mimeType: 'image/png', data: 'base64data'}}], + }; + expect(() => accumulator.addContent(content)).toThrowError( + 'LocalTokenizers do not support non-text content types.', + ); + }); + + it('should handle videoMetadata without error', () => { + const content: Content = { + role: 'user', + parts: [ + { + text: 'Video content', + videoMetadata: {startOffset: '0', endOffset: '1000'}, + }, + ], + }; + accumulator.addContent(content); + expect(accumulator.getTexts()).toEqual(['Video content']); + }); + + it('should add texts from functionCall', () => { + const content: Content = { + role: 'model', + parts: [ + { + functionCall: { + name: 'getWeather', + args: {location: 'San Francisco', unit: 'celsius'}, + }, + }, + ], + }; + accumulator.addContent(content); + const texts = accumulator.getTexts(); + expect(texts).toContain('getWeather'); + expect(texts).toContain('location'); + expect(texts).toContain('San Francisco'); + expect(texts).toContain('unit'); + expect(texts).toContain('celsius'); + }); + + it('should add texts from functionResponse', () => { + const content: Content = { + role: 'function', + parts: [ + { + functionResponse: { + name: 'getWeather', + response: {temperature: '72', condition: 'sunny'}, + }, + }, + ], + }; + accumulator.addContent(content); + const texts = accumulator.getTexts(); + expect(texts).toContain('getWeather'); + expect(texts).toContain('temperature'); + expect(texts).toContain('72'); + expect(texts).toContain('condition'); + expect(texts).toContain('sunny'); + }); + + it('should warn for unsupported content fields', () => { + const consoleWarnSpy = spyOn(console, 'warn'); + const content = { + role: 'user', + parts: [{text: 'Hello'}], + unsupportedField: 'value', + } as unknown as Content; + + accumulator.addContent(content); + expect(consoleWarnSpy).toHaveBeenCalledWith( + jasmine.stringContaining('Content contains unsupported types'), + ); + }); + }); + + describe('addContents', () => { + it('should add multiple contents', () => { + const contents: Content[] = [ + {role: 'user', parts: [{text: 'Hello'}]}, + {role: 'model', parts: [{text: 'Hi there'}]}, + {role: 'user', parts: [{text: 'How are you?'}]}, + ]; + accumulator.addContents(contents); + expect(accumulator.getTexts()).toEqual([ + 'Hello', + 'Hi there', + 'How are you?', + ]); + }); + + it('should handle empty array', () => { + accumulator.addContents([]); + expect(accumulator.getTexts()).toEqual([]); + }); + }); + + describe('addFunctionCall', () => { + it('should add function name', () => { + const functionCall: FunctionCall = { + name: 'calculateSum', + args: {}, + }; + accumulator.addFunctionCall(functionCall); + expect(accumulator.getTexts()).toContain('calculateSum'); + }); + + it('should add function args', () => { + const functionCall: FunctionCall = { + name: 'add', + args: {a: 5, b: 10}, + }; + 
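+      // the function name and every argument key should be captured as text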
accumulator.addFunctionCall(functionCall); + const texts = accumulator.getTexts(); + expect(texts).toContain('add'); + expect(texts).toContain('a'); + expect(texts).toContain('b'); + }); + + it('should handle nested args', () => { + const functionCall: FunctionCall = { + name: 'complexFunction', + args: { + nested: { + level1: { + level2: 'deep value', + }, + }, + }, + }; + accumulator.addFunctionCall(functionCall); + const texts = accumulator.getTexts(); + expect(texts).toContain('complexFunction'); + expect(texts).toContain('nested'); + expect(texts).toContain('level1'); + expect(texts).toContain('level2'); + expect(texts).toContain('deep value'); + }); + + it('should handle array args', () => { + const functionCall: FunctionCall = { + name: 'processItems', + args: {items: ['apple', 'banana', 'cherry']}, + }; + accumulator.addFunctionCall(functionCall); + const texts = accumulator.getTexts(); + expect(texts).toContain('processItems'); + expect(texts).toContain('items'); + expect(texts).toContain('apple'); + expect(texts).toContain('banana'); + expect(texts).toContain('cherry'); + }); + }); + + describe('addFunctionResponse', () => { + it('should add function response name', () => { + const functionResponse: FunctionResponse = { + name: 'getResult', + response: {}, + }; + accumulator.addFunctionResponse(functionResponse); + expect(accumulator.getTexts()).toContain('getResult'); + }); + + it('should add response data', () => { + const functionResponse: FunctionResponse = { + name: 'getData', + response: {status: 'success', message: 'Data retrieved'}, + }; + accumulator.addFunctionResponse(functionResponse); + const texts = accumulator.getTexts(); + expect(texts).toContain('getData'); + expect(texts).toContain('status'); + expect(texts).toContain('success'); + expect(texts).toContain('message'); + expect(texts).toContain('Data retrieved'); + }); + }); + + describe('addFunctionResponses', () => { + it('should add multiple function responses', () => { + const functionResponses: FunctionResponse[] = [ + {name: 'func1', response: {result: 'result1'}}, + {name: 'func2', response: {result: 'result2'}}, + ]; + accumulator.addFunctionResponses(functionResponses); + const texts = accumulator.getTexts(); + expect(texts).toContain('func1'); + expect(texts).toContain('func2'); + expect(texts).toContain('result1'); + expect(texts).toContain('result2'); + }); + }); + + describe('addTool', () => { + it('should add tool with function declarations', () => { + const tool: Tool = { + functionDeclarations: [ + { + name: 'getWeather', + description: 'Get the current weather', + }, + ], + }; + accumulator.addTool(tool); + const texts = accumulator.getTexts(); + expect(texts).toContain('getWeather'); + expect(texts).toContain('Get the current weather'); + }); + + it('should handle tool without function declarations', () => { + const tool: Tool = {}; + accumulator.addTool(tool); + expect(accumulator.getTexts()).toEqual([]); + }); + + it('should add function declaration parameters', () => { + const tool: Tool = { + functionDeclarations: [ + { + name: 'calculateArea', + description: 'Calculate area of a rectangle', + parameters: { + type: Type.OBJECT, + properties: { + width: { + type: Type.NUMBER, + description: 'Width of the rectangle', + }, + height: { + type: Type.NUMBER, + description: 'Height of the rectangle', + }, + }, + required: ['width', 'height'], + }, + }, + ], + }; + accumulator.addTool(tool); + const texts = accumulator.getTexts(); + expect(texts).toContain('calculateArea'); + 
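+      // nested parameter names and their descriptions should all be accumulated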
expect(texts).toContain('Calculate area of a rectangle'); + expect(texts).toContain('width'); + expect(texts).toContain('Width of the rectangle'); + expect(texts).toContain('height'); + expect(texts).toContain('Height of the rectangle'); + }); + }); + + describe('addTools', () => { + it('should add multiple tools', () => { + const tools: Tool[] = [ + { + functionDeclarations: [ + {name: 'func1', description: 'First function'}, + ], + }, + { + functionDeclarations: [ + {name: 'func2', description: 'Second function'}, + ], + }, + ]; + accumulator.addTools(tools); + const texts = accumulator.getTexts(); + expect(texts).toContain('func1'); + expect(texts).toContain('First function'); + expect(texts).toContain('func2'); + expect(texts).toContain('Second function'); + }); + }); + + describe('addSchema', () => { + it('should add schema format', () => { + const schema: Schema = { + type: Type.STRING, + format: 'email', + }; + accumulator.addSchema(schema); + expect(accumulator.getTexts()).toContain('email'); + }); + + it('should add schema description', () => { + const schema: Schema = { + type: Type.STRING, + description: 'User email address', + }; + accumulator.addSchema(schema); + expect(accumulator.getTexts()).toContain('User email address'); + }); + + it('should add schema enum values', () => { + const schema: Schema = { + type: Type.STRING, + enum: ['red', 'green', 'blue'], + }; + accumulator.addSchema(schema); + const texts = accumulator.getTexts(); + expect(texts).toContain('red'); + expect(texts).toContain('green'); + expect(texts).toContain('blue'); + }); + + it('should add schema required fields', () => { + const schema: Schema = { + type: Type.OBJECT, + required: ['name', 'email'], + }; + accumulator.addSchema(schema); + const texts = accumulator.getTexts(); + expect(texts).toContain('name'); + expect(texts).toContain('email'); + }); + + it('should add schema properties', () => { + const schema: Schema = { + type: Type.OBJECT, + properties: { + username: {type: Type.STRING, description: 'User name'}, + age: {type: Type.NUMBER, description: 'User age'}, + }, + }; + accumulator.addSchema(schema); + const texts = accumulator.getTexts(); + expect(texts).toContain('username'); + expect(texts).toContain('User name'); + expect(texts).toContain('age'); + expect(texts).toContain('User age'); + }); + + it('should add schema items for arrays', () => { + const schema: Schema = { + type: Type.ARRAY, + items: { + type: Type.STRING, + description: 'Array item', + }, + }; + accumulator.addSchema(schema); + expect(accumulator.getTexts()).toContain('Array item'); + }); + + it('should add schema example', () => { + const schema: Schema = { + type: Type.STRING, + example: 'example value', + }; + accumulator.addSchema(schema); + expect(accumulator.getTexts()).toContain('example value'); + }); + + it('should handle complex nested schema', () => { + const schema: Schema = { + type: Type.OBJECT, + properties: { + address: { + type: Type.OBJECT, + properties: { + street: {type: Type.STRING, description: 'Street name'}, + city: {type: Type.STRING, description: 'City name'}, + }, + }, + }, + }; + accumulator.addSchema(schema); + const texts = accumulator.getTexts(); + expect(texts).toContain('address'); + expect(texts).toContain('street'); + expect(texts).toContain('Street name'); + expect(texts).toContain('city'); + expect(texts).toContain('City name'); + }); + + it('should handle null example', () => { + const schema: Schema = { + type: Type.STRING, + example: null, + }; + accumulator.addSchema(schema); + // 
Should not throw and should not add null to texts + expect(accumulator.getTexts()).toEqual([]); + }); + + it('should handle object example', () => { + const schema: Schema = { + type: Type.OBJECT, + example: {key: 'value', nested: {deep: 'data'}}, + }; + accumulator.addSchema(schema); + const texts = accumulator.getTexts(); + expect(texts).toContain('key'); + expect(texts).toContain('value'); + expect(texts).toContain('nested'); + expect(texts).toContain('deep'); + expect(texts).toContain('data'); + }); + + it('should handle array example', () => { + const schema: Schema = { + type: Type.ARRAY, + example: ['item1', 'item2', 'item3'], + }; + accumulator.addSchema(schema); + const texts = accumulator.getTexts(); + expect(texts).toContain('item1'); + expect(texts).toContain('item2'); + expect(texts).toContain('item3'); + }); + }); + + describe('complex scenarios', () => { + it('should handle content with mixed parts', () => { + const content: Content = { + role: 'user', + parts: [ + {text: 'First text'}, + { + functionCall: { + name: 'myFunction', + args: {param: 'value'}, + }, + }, + {text: 'Second text'}, + ], + }; + accumulator.addContent(content); + const texts = accumulator.getTexts(); + expect(texts).toContain('First text'); + expect(texts).toContain('myFunction'); + expect(texts).toContain('param'); + expect(texts).toContain('value'); + expect(texts).toContain('Second text'); + }); + + it('should handle function declaration with response schema', () => { + const tool: Tool = { + functionDeclarations: [ + { + name: 'getUser', + description: 'Get user information', + parameters: { + type: Type.OBJECT, + properties: { + userId: {type: Type.STRING, description: 'User ID'}, + }, + }, + response: { + type: Type.OBJECT, + properties: { + name: {type: Type.STRING, description: 'User name'}, + email: {type: Type.STRING, description: 'User email'}, + }, + }, + }, + ], + }; + accumulator.addTool(tool); + const texts = accumulator.getTexts(); + expect(texts).toContain('getUser'); + expect(texts).toContain('Get user information'); + expect(texts).toContain('userId'); + expect(texts).toContain('User ID'); + expect(texts).toContain('name'); + expect(texts).toContain('User name'); + expect(texts).toContain('email'); + expect(texts).toContain('User email'); + }); + }); +}); diff --git a/test/unit/node/local_tokenizer_test.ts b/test/unit/node/local_tokenizer_test.ts new file mode 100644 index 000000000..3ce53b8b6 --- /dev/null +++ b/test/unit/node/local_tokenizer_test.ts @@ -0,0 +1,348 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import {LocalTokenizer as BaseLocalTokenizer} from '../../../src/cross/tokenizer/_local_tokenizer_impl.js'; +import {NodeTokenizerPlatform} from '../../../src/node/_node_tokenizer_platform.js'; +import {LocalTokenizer} from '../../../src/node/local_tokenizer.js'; +import type {Content, CountTokensConfig} from '../../../src/types.js'; + +describe('LocalTokenizer (Node)', () => { + describe('constructor', () => { + it('should create a LocalTokenizer instance', () => { + const tokenizer = new LocalTokenizer('gemini-2.0-flash-001'); + expect(tokenizer).toBeDefined(); + expect(tokenizer).toBeInstanceOf(LocalTokenizer); + }); + + it('should initialize with NodeTokenizerPlatform', () => { + const tokenizer = new LocalTokenizer('gemini-2.0-flash-001'); + expect(tokenizer['baseTokenizer']).toBeDefined(); + expect(tokenizer['baseTokenizer']).toBeInstanceOf(BaseLocalTokenizer); + + const platform = 
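+      /* bracket notation sidesteps TypeScript's private-member checks */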
tokenizer['baseTokenizer']['platform']; + expect(platform).toBeInstanceOf(NodeTokenizerPlatform); + }); + + it('should accept different model names', () => { + const models = [ + 'gemini-2.0-flash-001', + 'gemini-2.5-pro', + 'gemini-2.5-flash', + ]; + + models.forEach((model) => { + const tokenizer = new LocalTokenizer(model); + expect(tokenizer).toBeDefined(); + }); + }); + + it('should throw error for unsupported model', () => { + expect(() => new LocalTokenizer('unsupported-model')).toThrowError( + /is not supported for local tokenization/, + ); + }); + }); + + describe('countTokens', () => { + let tokenizer: LocalTokenizer; + + beforeEach(() => { + tokenizer = new LocalTokenizer('gemini-2.0-flash-001'); + }); + + it('should delegate to base tokenizer', async () => { + const spy = spyOn( + tokenizer['baseTokenizer'], + 'countTokens', + ).and.returnValue(Promise.resolve({totalTokens: 5})); + + const result = await tokenizer.countTokens('What is your name?'); + + expect(spy).toHaveBeenCalledWith('What is your name?', undefined); + expect(result.totalTokens).toBe(5); + }); + + it('should pass config to base tokenizer', async () => { + const config: CountTokensConfig = { + systemInstruction: 'You are helpful', + }; + + const spy = spyOn( + tokenizer['baseTokenizer'], + 'countTokens', + ).and.returnValue(Promise.resolve({totalTokens: 10})); + + const result = await tokenizer.countTokens('Hello', config); + + expect(spy).toHaveBeenCalledWith('Hello', config); + expect(result.totalTokens).toBe(10); + }); + + it('should handle string content', async () => { + spyOn(tokenizer['baseTokenizer'], 'countTokens').and.returnValue( + Promise.resolve({totalTokens: 3}), + ); + + const result = await tokenizer.countTokens('Hello world'); + expect(result.totalTokens).toBe(3); + }); + + it('should handle Content object', async () => { + const content: Content = { + role: 'user', + parts: [{text: 'Hello'}], + }; + + spyOn(tokenizer['baseTokenizer'], 'countTokens').and.returnValue( + Promise.resolve({totalTokens: 1}), + ); + + const result = await tokenizer.countTokens(content); + expect(result.totalTokens).toBe(1); + }); + + it('should handle array of Content objects', async () => { + const contents: Content[] = [ + {role: 'user', parts: [{text: 'Hello'}]}, + {role: 'model', parts: [{text: 'Hi there!'}]}, + ]; + + spyOn(tokenizer['baseTokenizer'], 'countTokens').and.returnValue( + Promise.resolve({totalTokens: 5}), + ); + + const result = await tokenizer.countTokens(contents); + expect(result.totalTokens).toBe(5); + }); + + it('should propagate errors from base tokenizer', async () => { + const error = new Error('Model loading failed'); + spyOn(tokenizer['baseTokenizer'], 'countTokens').and.returnValue( + Promise.reject(error), + ); + + await expectAsync( + tokenizer.countTokens('Test text'), + ).toBeRejectedWithError('Model loading failed'); + }); + }); + + describe('computeTokens', () => { + let tokenizer: LocalTokenizer; + + beforeEach(() => { + tokenizer = new LocalTokenizer('gemini-2.0-flash-001'); + }); + + it('should delegate to base tokenizer', async () => { + const mockResult = { + tokensInfo: [ + { + tokenIds: ['1', '2', '3'], + tokens: ['dGVz', 'dA==', 'IQ=='], + role: 'user', + }, + ], + }; + + const spy = spyOn( + tokenizer['baseTokenizer'], + 'computeTokens', + ).and.returnValue(Promise.resolve(mockResult)); + + const result = await tokenizer.computeTokens('test!'); + + expect(spy).toHaveBeenCalledWith('test!'); + expect(result).toEqual(mockResult); + }); + + it('should return base64 encoded 
tokens', async () => { + const mockResult = { + tokensInfo: [ + { + tokenIds: ['100', '101'], + tokens: ['SGVs', 'bG8='], // base64 encoded + role: 'user', + }, + ], + }; + + spyOn(tokenizer['baseTokenizer'], 'computeTokens').and.returnValue( + Promise.resolve(mockResult), + ); + + const result = await tokenizer.computeTokens('Hello'); + expect(result.tokensInfo).toBeDefined(); + expect(result.tokensInfo!.length).toBe(1); + expect(result.tokensInfo![0].tokens).toBeDefined(); + // Verify tokens are base64 encoded + result.tokensInfo![0].tokens!.forEach((token) => { + expect(token).toMatch(/^[A-Za-z0-9+/=]*$/); + }); + }); + + it('should handle Content object', async () => { + const content: Content = { + role: 'user', + parts: [{text: 'What is AI?'}], + }; + + const mockResult = { + tokensInfo: [ + { + tokenIds: ['1', '2', '3'], + tokens: ['V2hh', 'dCBp', 'cyBBST8='], + role: 'user', + }, + ], + }; + + spyOn(tokenizer['baseTokenizer'], 'computeTokens').and.returnValue( + Promise.resolve(mockResult), + ); + + const result = await tokenizer.computeTokens(content); + expect(result.tokensInfo![0].role).toBe('user'); + expect(result.tokensInfo![0].tokenIds!.length).toBe(3); + }); + + it('should handle multiple Content objects', async () => { + const contents: Content[] = [ + {role: 'user', parts: [{text: 'Hello'}]}, + {role: 'model', parts: [{text: 'Hi'}]}, + ]; + + const mockResult = { + tokensInfo: [ + { + tokenIds: ['1'], + tokens: ['SGVsbG8='], + role: 'user', + }, + { + tokenIds: ['2'], + tokens: ['SGk='], + role: 'model', + }, + ], + }; + + spyOn(tokenizer['baseTokenizer'], 'computeTokens').and.returnValue( + Promise.resolve(mockResult), + ); + + const result = await tokenizer.computeTokens(contents); + expect(result.tokensInfo!.length).toBe(2); + expect(result.tokensInfo![0].role).toBe('user'); + expect(result.tokensInfo![1].role).toBe('model'); + }); + + it('should propagate errors from base tokenizer', async () => { + const error = new Error('Tokenization failed'); + spyOn(tokenizer['baseTokenizer'], 'computeTokens').and.returnValue( + Promise.reject(error), + ); + + await expectAsync( + tokenizer.computeTokens('Test text'), + ).toBeRejectedWithError('Tokenization failed'); + }); + }); + + describe('platform integration', () => { + it('should use NodeTokenizerPlatform cache', () => { + const tokenizer = new LocalTokenizer('gemini-2.0-flash-001'); + const platform = tokenizer['baseTokenizer']['platform']; + + expect(platform.cache).toBeDefined(); + expect(typeof platform.cache.load).toBe('function'); + expect(typeof platform.cache.save).toBe('function'); + }); + + it('should use NodeTokenizerPlatform fileSystem', () => { + const tokenizer = new LocalTokenizer('gemini-2.0-flash-001'); + const platform = tokenizer['baseTokenizer']['platform']; + + expect(platform.fileSystem).toBeDefined(); + expect(typeof platform.fileSystem.fetchFromUrl).toBe('function'); + expect(typeof platform.fileSystem.validateHash).toBe('function'); + expect(typeof platform.fileSystem.computeSha1).toBe('function'); + }); + }); + + describe('multiple instances', () => { + it('should create independent tokenizer instances', () => { + const tokenizer1 = new LocalTokenizer('gemini-2.0-flash-001'); + const tokenizer2 = new LocalTokenizer('gemini-2.5-pro'); + + expect(tokenizer1).not.toBe(tokenizer2); + expect(tokenizer1['baseTokenizer']).not.toBe(tokenizer2['baseTokenizer']); + }); + + it('should maintain independent state', async () => { + const tokenizer1 = new LocalTokenizer('gemini-2.0-flash-001'); + const tokenizer2 
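+      /* a second, independent instance backed by a different model */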
= new LocalTokenizer('gemini-2.5-pro'); + + spyOn(tokenizer1['baseTokenizer'], 'countTokens').and.returnValue( + Promise.resolve({totalTokens: 5}), + ); + spyOn(tokenizer2['baseTokenizer'], 'countTokens').and.returnValue( + Promise.resolve({totalTokens: 10}), + ); + + const result1 = await tokenizer1.countTokens('test'); + const result2 = await tokenizer2.countTokens('test'); + + expect(result1.totalTokens).toBe(5); + expect(result2.totalTokens).toBe(10); + }); + }); + + describe('type compatibility', () => { + it('should return CountTokensResult with totalTokens', async () => { + const tokenizer = new LocalTokenizer('gemini-2.0-flash-001'); + + spyOn(tokenizer['baseTokenizer'], 'countTokens').and.returnValue( + Promise.resolve({totalTokens: 42}), + ); + + const result = await tokenizer.countTokens('test'); + + // Type check - should have totalTokens property + expect(result.totalTokens).toBeDefined(); + expect(typeof result.totalTokens).toBe('number'); + }); + + it('should return ComputeTokensResult with tokensInfo', async () => { + const tokenizer = new LocalTokenizer('gemini-2.0-flash-001'); + + const mockResult = { + tokensInfo: [ + { + tokenIds: ['1', '2'], + tokens: ['dGVz', 'dA=='], + role: 'user', + }, + ], + }; + + spyOn(tokenizer['baseTokenizer'], 'computeTokens').and.returnValue( + Promise.resolve(mockResult), + ); + + const result = await tokenizer.computeTokens('test'); + + // Type check - should have tokensInfo array + expect(result.tokensInfo).toBeDefined(); + expect(Array.isArray(result.tokensInfo)).toBe(true); + if (result.tokensInfo && result.tokensInfo.length > 0) { + expect(result.tokensInfo[0].tokenIds).toBeDefined(); + expect(result.tokensInfo[0].tokens).toBeDefined(); + expect(result.tokensInfo[0].role).toBeDefined(); + } + }); + }); +}); diff --git a/test/unit/web/local_tokenizer_test.ts b/test/unit/web/local_tokenizer_test.ts new file mode 100644 index 000000000..e07b41f41 --- /dev/null +++ b/test/unit/web/local_tokenizer_test.ts @@ -0,0 +1,196 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import {LocalTokenizer as BaseLocalTokenizer} from '../../../src/cross/tokenizer/_local_tokenizer_impl.js'; +import type {Content} from '../../../src/types.js'; +import {WebTokenizerPlatform} from '../../../src/web/_web_tokenizer_platform.js'; +import {LocalTokenizer} from '../../../src/web/local_tokenizer.js'; + +describe('LocalTokenizer (Web)', () => { + describe('constructor', () => { + it('should create a LocalTokenizer instance', () => { + const tokenizer = new LocalTokenizer('gemini-2.0-flash-001'); + expect(tokenizer).toBeDefined(); + expect(tokenizer).toBeInstanceOf(LocalTokenizer); + }); + + it('should initialize with WebTokenizerPlatform', () => { + const tokenizer = new LocalTokenizer('gemini-2.0-flash-001'); + expect(tokenizer['baseTokenizer']).toBeDefined(); + expect(tokenizer['baseTokenizer']).toBeInstanceOf(BaseLocalTokenizer); + + const platform = tokenizer['baseTokenizer']['platform']; + expect(platform).toBeInstanceOf(WebTokenizerPlatform); + }); + + it('should accept different model names', () => { + const models = [ + 'gemini-2.0-flash-001', + 'gemini-2.5-pro', + 'gemini-2.5-flash', + ]; + + models.forEach((model) => { + const tokenizer = new LocalTokenizer(model); + expect(tokenizer).toBeDefined(); + }); + }); + + it('should throw error for unsupported model', () => { + expect(() => new LocalTokenizer('unsupported-model')).toThrowError( + /is not supported for local tokenization/, + ); + }); + }); + + 
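+  // The web platform is currently stubbed out: its cache and file system
+  // reject, so the tokenizer methods below surface those errors.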
describe('countTokens', () => { + let tokenizer: LocalTokenizer; + + beforeEach(() => { + tokenizer = new LocalTokenizer('gemini-2.0-flash-001'); + }); + + it('should throw error for unimplemented web platform', async () => { + await expectAsync( + tokenizer.countTokens('Test text'), + ).toBeRejectedWithError(/Web tokenizer file system not yet implemented/); + }); + + it('should throw error with Content object', async () => { + const content: Content = { + role: 'user', + parts: [{text: 'Hello'}], + }; + + await expectAsync(tokenizer.countTokens(content)).toBeRejectedWithError( + /Web tokenizer file system not yet implemented/, + ); + }); + + it('should throw error with array of Content objects', async () => { + const contents: Content[] = [ + {role: 'user', parts: [{text: 'Hello'}]}, + {role: 'model', parts: [{text: 'Hi there!'}]}, + ]; + + await expectAsync(tokenizer.countTokens(contents)).toBeRejectedWithError( + /Web tokenizer file system not yet implemented/, + ); + }); + }); + + describe('computeTokens', () => { + let tokenizer: LocalTokenizer; + + beforeEach(() => { + tokenizer = new LocalTokenizer('gemini-2.0-flash-001'); + }); + + it('should throw error for unimplemented web platform', async () => { + await expectAsync( + tokenizer.computeTokens('Test text'), + ).toBeRejectedWithError(/Web tokenizer file system not yet implemented/); + }); + + it('should throw error with Content object', async () => { + const content: Content = { + role: 'user', + parts: [{text: 'What is AI?'}], + }; + + await expectAsync(tokenizer.computeTokens(content)).toBeRejectedWithError( + /Web tokenizer file system not yet implemented/, + ); + }); + + it('should throw error with array of Content objects', async () => { + const contents: Content[] = [ + {role: 'user', parts: [{text: 'Hello'}]}, + {role: 'model', parts: [{text: 'Hi'}]}, + ]; + + await expectAsync( + tokenizer.computeTokens(contents), + ).toBeRejectedWithError(/Web tokenizer file system not yet implemented/); + }); + }); + + describe('platform integration', () => { + it('should use WebTokenizerPlatform cache', () => { + const tokenizer = new LocalTokenizer('gemini-2.0-flash-001'); + const platform = tokenizer['baseTokenizer']['platform']; + + expect(platform.cache).toBeDefined(); + expect(typeof platform.cache.load).toBe('function'); + expect(typeof platform.cache.save).toBe('function'); + }); + + it('should use WebTokenizerPlatform fileSystem', () => { + const tokenizer = new LocalTokenizer('gemini-2.0-flash-001'); + const platform = tokenizer['baseTokenizer']['platform']; + + expect(platform.fileSystem).toBeDefined(); + expect(typeof platform.fileSystem.fetchFromUrl).toBe('function'); + expect(typeof platform.fileSystem.validateHash).toBe('function'); + expect(typeof platform.fileSystem.computeSha1).toBe('function'); + }); + + it('should throw error for unimplemented cache.load', async () => { + const tokenizer = new LocalTokenizer('gemini-2.0-flash-001'); + const platform = tokenizer['baseTokenizer']['platform']; + + await expectAsync( + platform.cache.load('key', 'hash'), + ).toBeRejectedWithError(/Web tokenizer cache not yet implemented/); + }); + + it('should throw error for unimplemented cache.save', async () => { + const tokenizer = new LocalTokenizer('gemini-2.0-flash-001'); + const platform = tokenizer['baseTokenizer']['platform']; + + await expectAsync( + platform.cache.save('key', new Uint8Array()), + ).toBeRejectedWithError(/Web tokenizer cache not yet implemented/); + }); + + it('should throw error for unimplemented 
fileSystem.fetchFromUrl', async () => { + const tokenizer = new LocalTokenizer('gemini-2.0-flash-001'); + const platform = tokenizer['baseTokenizer']['platform']; + + await expectAsync( + platform.fileSystem.fetchFromUrl('https://example.com'), + ).toBeRejectedWithError(/Web tokenizer file system not yet implemented/); + }); + + it('should throw error for unimplemented fileSystem.validateHash', async () => { + const tokenizer = new LocalTokenizer('gemini-2.0-flash-001'); + const platform = tokenizer['baseTokenizer']['platform']; + + await expectAsync( + platform.fileSystem.validateHash(new Uint8Array(), 'hash'), + ).toBeRejectedWithError(/Web tokenizer file system not yet implemented/); + }); + + it('should throw error for unimplemented fileSystem.computeSha1', async () => { + const tokenizer = new LocalTokenizer('gemini-2.0-flash-001'); + const platform = tokenizer['baseTokenizer']['platform']; + + await expectAsync( + platform.fileSystem.computeSha1(new Uint8Array()), + ).toBeRejectedWithError(/Web tokenizer file system not yet implemented/); + }); + }); + + describe('multiple instances', () => { + it('should create independent tokenizer instances', () => { + const tokenizer1 = new LocalTokenizer('gemini-2.0-flash-001'); + const tokenizer2 = new LocalTokenizer('gemini-2.5-pro'); + + expect(tokenizer1).not.toBe(tokenizer2); + expect(tokenizer1['baseTokenizer']).not.toBe(tokenizer2['baseTokenizer']); + }); + }); +});