Commit b5ed686

- set max_length of the default model to 1024 due to excessive memory usage when working on text longer than 2048 tokens (the model supports up to 8192)
- set limits.memory to 850M
1 parent 6f461d0 commit b5ed686

4 files changed: +32 -13 lines


README.md

Lines changed: 2 additions & 2 deletions
@@ -100,9 +100,9 @@ Kubernetes: `>= 1.21.0`
 | readinessProbe.successThreshold | int | `1` | |
 | readinessProbe.timeoutSeconds | int | `5` | |
 | replicas | int | `2` | |
-| resources.limits.memory | string | `"2000M"` | the memory limit of the container. Due to [this issue of ONNX runtime](https://github.com/microsoft/onnxruntime/issues/15080), the peak memory usage of the service is much higher than the model file size. When changing the default model, be sure to test the peak memory usage of the service before setting the memory limit. When testing your model's memory requirements, note that memory usage often climbs much higher with long context lengths. E.g. the default model supports up to 8192 tokens, but once the content goes beyond 512 tokens, memory usage is much higher (around 2G is required). |
+| resources.limits.memory | string | `"850M"` | the memory limit of the container. Due to [this issue of ONNX runtime](https://github.com/microsoft/onnxruntime/issues/15080), the peak memory usage of the service is much higher than the model file size. When changing the default model, be sure to test the peak memory usage of the service before setting the memory limit. When testing your model's memory requirements, note that memory usage often climbs much higher with long context lengths. E.g. the default model supports up to 8192 tokens (default max_length is set to 1024), but once the content goes beyond 512 tokens, memory usage is much higher (around 2G is required). |
 | resources.requests.cpu | string | `"100m"` | |
-| resources.requests.memory | string | `"850M"` | the memory request of the container. Once the model is loaded, the memory usage of the service for serving requests is much lower. Set to 850M for the default model. |
+| resources.requests.memory | string | `"650M"` | the memory request of the container. Once the model is loaded, the memory usage of the service for serving requests is much lower. Set to 650M for the default model. |
 | service.annotations | object | `{}` | |
 | service.httpPortName | string | `"http"` | |
 | service.labels | object | `{}` | |

deploy/magda-embedding-api/values.yaml

Lines changed: 3 additions & 3 deletions
@@ -204,11 +204,11 @@ resources:
     cpu: "100m"
     # -- (string) the memory request of the container
     # Once the model is loaded, the memory usage of the service for serving requests is much lower. Set to 650M for the default model.
-    memory: "850M"
+    memory: "650M"
   limits:
     # -- (string) the memory limit of the container
     # Due to [this issue of ONNX runtime](https://github.com/microsoft/onnxruntime/issues/15080), the peak memory usage of the service is much higher than the model file size.
     # When changing the default model, be sure to test the peak memory usage of the service before setting the memory limit.
     # When testing your model's memory requirements, note that memory usage often climbs much higher with long context lengths.
-    # E.g. the default model supports up to 8192 tokens, but when the content goes beyond 512 tokens, the memory usage will be much higher (around 2G is required).
-    memory: "2000M"
+    # E.g. the default model supports up to 8192 tokens (default max_length is set to 1024), but when the content goes beyond 512 tokens, the memory usage will be much higher (around 2G is required).
+    memory: "850M"

deploy/test-deploy.yaml

Lines changed: 5 additions & 1 deletion
@@ -19,7 +19,11 @@ appConfig:
     - name: Xenova/bge-small-en-v1.5
       # set quantized to false to use the non-quantized version of the model
       # by default, the quantized version of the model will be used
-      quantized: true
+      dtype: "q8"
+      # optionally set the max length of the input text
+      # if not set, the value in the model config will be used
+      # if the model config does not have max_length, the default value (512) will be used
+      max_length: 512
       extraction_config:
         pooling: "mean"
         normalize: true

src/libs/EmbeddingEncoder.ts

Lines changed: 22 additions & 7 deletions
@@ -18,8 +18,11 @@ export interface ExtractionConfig {
     precision?: "binary" | "ubinary";
 }
 
+export const DEFAULT_MAX_LENGTH = 512;
+
 export const defaultModel: ModelItem = {
     name: "Alibaba-NLP/gte-base-en-v1.5",
+    max_length: 1024,
     dtype: "q8",
     extraction_config: {
         pooling: "cls",
@@ -37,6 +40,7 @@ export const DEFAULT_EXTRACTION_CONFIG: ExtractionConfig = {
 
 export interface ModelItem {
     name: string;
+    max_length?: number;
     // whether or not this model is the default model
     // if all models are not default, the first one will be used as default
     default?: boolean;
@@ -187,7 +191,8 @@ class EmbeddingEncoder {
 
     async featureExtraction(
         texts: string | string[],
-        opts: FeatureExtractionPipelineOptions = {}
+        opts: FeatureExtractionPipelineOptions = {},
+        max_length: number | undefined = undefined
     ) {
         if (!this.tokenizer || !this.model) {
             throw new Error("Tokenizer or model not initialized");
@@ -205,7 +210,13 @@ class EmbeddingEncoder {
         // Run tokenization
         const model_inputs = this.tokenizer(texts, {
             padding: true,
-            truncation: true
+            truncation: true,
+            max_length:
+                typeof max_length !== "undefined" && max_length > 0
+                    ? max_length
+                    : this.model.config.max_position_embeddings > 0
+                      ? this.model.config.max_position_embeddings
+                      : DEFAULT_MAX_LENGTH
         });
 
         // Run model
@@ -286,11 +297,15 @@ class EmbeddingEncoder {
         sentences: string | string[],
         model: string = this.defaultModel
     ) {
-        const { extraction_config } = this.getModelByName(model);
-
-        const output = await this.featureExtraction(sentences, {
-            ...extraction_config
-        });
+        const { extraction_config, max_length } = this.getModelByName(model);
+
+        const output = await this.featureExtraction(
+            sentences,
+            {
+                ...extraction_config
+            },
+            max_length
+        );
 
         const embeddings = output[0].tolist() as number[][];
         const tokenSize = output[1].input_ids.size as number;