Commit b5ed686

- set max_length of the default model to 1024 due to excessive memory usage when working on text longer than 2048 tokens (the model supports up to 8192)
- set limits.memory to 850M
1 parent 6f461d0 commit b5ed686

4 files changed: +32 -13 lines


README.md

Lines changed: 2 additions & 2 deletions
@@ -100,9 +100,9 @@ Kubernetes: `>= 1.21.0`
 | readinessProbe.successThreshold | int | `1` | |
 | readinessProbe.timeoutSeconds | int | `5` | |
 | replicas | int | `2` | |
-| resources.limits.memory | string | `"2000M"` | the memory limit of the container. Due to [this issue of ONNX runtime](https://github.com/microsoft/onnxruntime/issues/15080), the peak memory usage of the service is much higher than the model file size. When changing the default model, be sure to test the peak memory usage of the service before setting the memory limit. When testing your model's memory requirements, note that memory usage often climbs much higher with long context lengths. E.g. the default model supports up to 8192 tokens, but once the content goes beyond 512 tokens, memory usage is much higher (around 2G is required). |
+| resources.limits.memory | string | `"850M"` | the memory limit of the container. Due to [this issue of ONNX runtime](https://github.com/microsoft/onnxruntime/issues/15080), the peak memory usage of the service is much higher than the model file size. When changing the default model, be sure to test the peak memory usage of the service before setting the memory limit. When testing your model's memory requirements, note that memory usage often climbs much higher with long context lengths. E.g. the default model supports up to 8192 tokens (default max_length is set to 1024), but once the content goes beyond 512 tokens, memory usage is much higher (around 2G is required). |
 | resources.requests.cpu | string | `"100m"` | |
-| resources.requests.memory | string | `"850M"` | the memory request of the container. Once the model is loaded, the memory usage of the service for serving requests is much lower. Set to 850M for the default model. |
+| resources.requests.memory | string | `"650M"` | the memory request of the container. Once the model is loaded, the memory usage of the service for serving requests is much lower. Set to 650M for the default model. |
 | service.annotations | object | `{}` | |
 | service.httpPortName | string | `"http"` | |
 | service.labels | object | `{}` | |

deploy/magda-embedding-api/values.yaml

Lines changed: 3 additions & 3 deletions
@@ -204,11 +204,11 @@ resources:
     cpu: "100m"
     # -- (string) the memory request of the container
     # Once the model is loaded, the memory usage of the service for serving requests is much lower. Set to 650M for the default model.
-    memory: "850M"
+    memory: "650M"
   limits:
     # -- (string) the memory limit of the container
     # Due to [this issue of ONNX runtime](https://github.com/microsoft/onnxruntime/issues/15080), the peak memory usage of the service is much higher than the model file size.
     # When changing the default model, be sure to test the peak memory usage of the service before setting the memory limit.
     # When testing your model's memory requirements, note that memory usage often climbs much higher with long context lengths.
-    # E.g. the default model supports up to 8192 tokens, but when the content goes beyond 512 tokens, the memory usage will be much higher (around 2G is required).
-    memory: "2000M"
+    # E.g. the default model supports up to 8192 tokens (default max_length is set to 1024), but when the content goes beyond 512 tokens, the memory usage will be much higher (around 2G is required).
+    memory: "850M"

deploy/test-deploy.yaml

Lines changed: 5 additions & 1 deletion
@@ -19,7 +19,11 @@ appConfig:
     - name: Xenova/bge-small-en-v1.5
       # set quantized to false to use the non-quantized version of the model
       # by default, the quantized version of the model will be used
-      quantized: true
+      dtype: "q8"
+      # optionally set the max length of the input text
+      # if not set, the value in the model config will be used
+      # if the model config does not have max_length, the default value (512) will be used
+      max_length: 512
       extraction_config:
         pooling: "mean"
         normalize: true

src/libs/EmbeddingEncoder.ts

Lines changed: 22 additions & 7 deletions
@@ -18,8 +18,11 @@ export interface ExtractionConfig {
     precision?: "binary" | "ubinary";
 }
 
+export const DEFAULT_MAX_LENGTH = 512;
+
 export const defaultModel: ModelItem = {
     name: "Alibaba-NLP/gte-base-en-v1.5",
+    max_length: 1024,
     dtype: "q8",
     extraction_config: {
         pooling: "cls",
@@ -37,6 +40,7 @@ export const DEFAULT_EXTRACTION_CONFIG: ExtractionConfig = {
 
 export interface ModelItem {
     name: string;
+    max_length?: number;
     // whether or not this model is the default model
     // if all models are not default, the first one will be used as default
     default?: boolean;
@@ -187,7 +191,8 @@ class EmbeddingEncoder {
 
     async featureExtraction(
         texts: string | string[],
-        opts: FeatureExtractionPipelineOptions = {}
+        opts: FeatureExtractionPipelineOptions = {},
+        max_length: number | undefined = undefined
     ) {
         if (!this.tokenizer || !this.model) {
             throw new Error("Tokenizer or model not initialized");
@@ -205,7 +210,13 @@ class EmbeddingEncoder {
         // Run tokenization
         const model_inputs = this.tokenizer(texts, {
             padding: true,
-            truncation: true
+            truncation: true,
+            max_length:
+                typeof max_length !== "undefined" && max_length > 0
+                    ? max_length
+                    : this.model.config.max_position_embeddings > 0
+                      ? this.model.config.max_position_embeddings
+                      : DEFAULT_MAX_LENGTH
         });
 
         // Run model
@@ -286,11 +297,15 @@ class EmbeddingEncoder {
         sentences: string | string[],
         model: string = this.defaultModel
     ) {
-        const { extraction_config } = this.getModelByName(model);
-
-        const output = await this.featureExtraction(sentences, {
-            ...extraction_config
-        });
+        const { extraction_config, max_length } = this.getModelByName(model);
+
+        const output = await this.featureExtraction(
+            sentences,
+            {
+                ...extraction_config
+            },
+            max_length
+        );
 
         const embeddings = output[0].tolist() as number[][];
         const tokenSize = output[1].input_ids.size as number;