
Commit 3fdd248

Merge pull request #4 from magda-io/refactor
Refactor & improve stability
2 parents: 4a49dd1 + 2226460

27 files changed: +797, -591 lines

CHANGES.md

Lines changed: 19 additions & 1 deletion
@@ -1,5 +1,23 @@
 # CHANGELOG
 
+## v1.1.0
+
+- Rename EmbeddingGenerator to EmbeddingEncoder
+- Fix serverOptions not being passed through properly in test cases
+- Upgrade to @huggingface/transformers v3.2.4
+- Upgrade to onnxruntime-node v1.20.1
+- Avoid including unused models in docker images (smaller image size)
+- Increase probe timeout seconds
+- Use worker pool
+- Process sentence list with separate model runs
+- Set default `workerTaskTimeout` to `60` seconds
+- Use the quantized (q8) version of the default model
+- Set default `limits.memory` to `850M`
+- Set default replicas number to `2`
+- Add max_length config to model config (configurable via helm config)
+- Set max_length of the default model to 1024 due to excessive memory usage when working on text longer than 2048 (the default model supports up to 8192)
+- Only apply padding when multiple inputs are received for encoding
+
 ## v1.0.0
 
-- #1: Initial implementation
+- #1: Initial implementation
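Several of the entries above (the quantized q8 default model, the max_length cap, pooling and padding behaviour) correspond to the @huggingface/transformers v3 feature-extraction API. Below is a minimal sketch of what such an encode step can look like; the model name, option values and the `encode` wrapper are illustrative assumptions rather than the project's actual code.

```typescript
import { pipeline } from "@huggingface/transformers";

// Load the default model as a quantized (q8) feature extractor.
// In transformers.js v3 the old `quantized: true` flag is replaced by `dtype`.
const extractor = await pipeline(
  "feature-extraction",
  "Alibaba-NLP/gte-base-en-v1.5",
  { dtype: "q8" }
);

// Encode a list of sentences and return plain number arrays.
// Pooling/normalisation mirror the extraction_config options documented in the README below.
async function encode(sentences: string[]): Promise<number[][]> {
  const output = await extractor(sentences, { pooling: "mean", normalize: true });
  return output.tolist() as number[][];
}
```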

Dockerfile

Lines changed: 7 additions & 3 deletions
@@ -8,12 +8,16 @@ RUN mkdir -p /usr/src/app /etc/config && \
     chmod -R g=u /usr/src/app /etc/config
 
 COPY . /usr/src/app
-# make local cache folder writable by 1000 user
-RUN chown -R 1000 /usr/src/app/component/node_modules/@xenova/transformers/.cache
 # Reinstall onnxruntime-node based on current building platform architecture
 RUN cd /usr/src/app/component/node_modules/onnxruntime-node && npm run postinstall
 # Reinstall sharp based on current building platform architecture
-RUN cd /usr/src/app/component/node_modules/sharp && npm run clean && node install/libvips && node install/dll-copy && node ../prebuild-install/bin.js
+RUN cd /usr/src/app/component/node_modules && rm -Rf @img && cd sharp && npm install
+# remove downloaded model cache
+RUN cd /usr/src/app/component/node_modules/@huggingface/transformers && rm -Rf .cache && mkdir .cache
+# download default model
+RUN cd /usr/src/app/component && npm run download-default-model
+# make local cache folder writable by 1000 user
+RUN chown -R 1000 /usr/src/app/component/node_modules/@huggingface/transformers/.cache
 
 USER 1000
 WORKDIR /usr/src/app/component

README.md

Lines changed: 8 additions & 5 deletions
@@ -49,7 +49,7 @@ Kubernetes: `>= 1.21.0`
 | appConfig | object | `{}` | Application configuration of the service. You can supply a list of key-value pairs to be used as the application configuration. Currently, the only supported config field is `modelList`. Via the `modelList` field, you can specify a list of LLM models that the service supports. Although you can specify multiple models, only one model will be used at this moment. Each model item has the following fields: <ul> <li> `name` (string): The huggingface registered model name. We only support ONNX models at this moment. This field is required. </li> <li> `default` (bool): Optional; Whether this model is the default model. If not specified, the first model in the list will be the default model. Only the default model will be loaded. </li> <li> `quantized` (bool): Optional; Whether the quantized version of the model will be used. If not specified, the quantized version of the model will be loaded. </li> <li> `config` (object): Optional; The configuration object that will be passed to the model. </li> <li> `cache_dir` (string): Optional; The cache directory of the downloaded models. If not specified, the default cache directory will be used. </li> <li> `local_files_only` (bool): Optional; Whether to only load the model from local files. If not specified, the model will be downloaded from the huggingface model hub. </li> <li> `revision` (string) Optional, Default to 'main'; The specific model version to use. It can be a branch name, a tag name, or a commit id. Since we use a git-based system for storing models and other artifacts on huggingface.co, `revision` can be any identifier allowed by git. NOTE: This setting is ignored for local requests. </li> <li> `model_file_name` (string) Optional; </li> <li> `extraction_config` (object) Optional; The configuration object that will be passed to the model extraction function for embedding generation. <br/> <ul> <li> `pooling`: ('none' or 'mean' or 'cls') Default to 'none'. The pooling method to use. </li> <li> `normalize`: (bool) Default to true. Whether or not to normalize the embeddings in the last dimension. </li> <li> `quantize`: (bool) Default to `false`. Whether or not to quantize the embeddings. </li> <li> `precision`: ("binary" or "ubinary") default to "binary". The precision to use for quantization. Only used when `quantize` is true. </li> </ul> </li> </ul> Please note: The released docker image only contains the "Alibaba-NLP/gte-base-en-v1.5" model. If you specify other models, the server will download the model from the huggingface model hub at startup. You might want to adjust the `startupProbe` settings to accommodate the model downloading time. Depending on the model size, you might also want to adjust the `resources.limits.memory` & `resources.requests.memory` values. |
 | autoscaling.hpa.enabled | bool | `false` | |
 | autoscaling.hpa.maxReplicas | int | `3` | |
-| autoscaling.hpa.minReplicas | int | `1` | |
+| autoscaling.hpa.minReplicas | int | `2` | |
 | autoscaling.hpa.targetCPU | int | `90` | |
 | autoscaling.hpa.targetMemory | string | `""` | |
 | bodyLimit | int | Default to 10485760 (10MB). | Defines the maximum payload, in bytes, that the server is allowed to accept |
@@ -79,9 +79,11 @@ Kubernetes: `>= 1.21.0`
 | livenessProbe.successThreshold | int | `1` | |
 | livenessProbe.timeoutSeconds | int | `5` | |
 | logLevel | string | `"warn"` | The log level of the application. One of 'fatal', 'error', 'warn', 'info', 'debug', 'trace'; also 'silent' is supported to disable logging. Any other value defines a custom level and requires supplying a level value via levelVal. |
+| maxWorkers | int | Default to 1. | The maximum number of workers that run the model to serve the request. |
+| minWorkers | int | Default to 1. | The minimum number of workers that run the model to serve the request. |
 | nameOverride | string | `""` | |
 | nodeSelector | object | `{}` | |
-| pluginTimeout | int | Default to 10000 (10 seconds). | The maximum amount of time in milliseconds in which a fastify plugin can load. If not, ready will complete with an Error with code 'ERR_AVVIO_PLUGIN_TIMEOUT'. |
+| pluginTimeout | int | Default to 180000 (180 seconds). | The maximum amount of time in milliseconds in which a fastify plugin can load. If not, ready will complete with an Error with code 'ERR_AVVIO_PLUGIN_TIMEOUT'. |
 | podAnnotations | object | `{}` | |
 | podSecurityContext.runAsNonRoot | bool | `true` | |
 | podSecurityContext.runAsUser | int | `1000` | |
@@ -97,10 +99,10 @@ Kubernetes: `>= 1.21.0`
 | readinessProbe.periodSeconds | int | `20` | |
 | readinessProbe.successThreshold | int | `1` | |
 | readinessProbe.timeoutSeconds | int | `5` | |
-| replicas | int | `1` | |
-| resources.limits.memory | string | `"1100M"` | The memory limit of the container. Due to [this issue of ONNX runtime](https://github.com/microsoft/onnxruntime/issues/15080), the peak memory usage of the service is much higher than the model file size. When changing the default model, be sure to test the peak memory usage of the service before setting the memory limit. The quantized model will be used by default; the memory limit is set to 1100M to accommodate the default model size. |
+| replicas | int | `2` | |
+| resources.limits.memory | string | `"850M"` | The memory limit of the container. Due to [this issue of ONNX runtime](https://github.com/microsoft/onnxruntime/issues/15080), the peak memory usage of the service is much higher than the model file size. When changing the default model, be sure to test the peak memory usage of the service before setting the memory limit. When testing your model's memory requirements, note that memory usage often goes much higher with long context lengths. E.g. the default model supports up to 8192 tokens (default max_length set to 1024), but when the content goes beyond 512 tokens, memory usage will be much higher (around 2G required). |
 | resources.requests.cpu | string | `"100m"` | |
-| resources.requests.memory | string | `"650M"` | The memory request of the container. Once the model is loaded, the memory usage of the service for serving requests would be much lower. Set to 650M for the default model. |
+| resources.requests.memory | string | `"650M"` | The memory request of the container. Once the model is loaded, the memory usage of the service for serving requests would be much lower. Set to 850M for the default model. |
 | service.annotations | object | `{}` | |
 | service.httpPortName | string | `"http"` | |
 | service.labels | object | `{}` | |
@@ -120,6 +122,7 @@ Kubernetes: `>= 1.21.0`
 | startupProbe.timeoutSeconds | int | `5` | |
 | tolerations | list | `[]` | |
 | topologySpreadConstraints | list | `[]` | This is the pod topology spread constraints https://kubernetes.io/docs/concepts/workloads/pods/pod-topology-spread-constraints/ |
+| workerTaskTimeout | int | Default to 60000 (60 seconds). | The maximum time in milliseconds that a worker can run before being killed. |
 
 ### Build & Run for Local Development
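The `appConfig.modelList` entry described in the table above can be summarised as a type. This is a hedged TypeScript sketch inferred from the table only; it is not a type exported by the project:

```typescript
// Hypothetical shape of one `appConfig.modelList` item, inferred from the README table above.
interface ModelListItem {
  name: string;                        // huggingface model name (ONNX models only); required
  default?: boolean;                   // if omitted, the first item is the default; only the default model is loaded
  quantized?: boolean;                 // whether to load the quantized version (quantized by default)
  config?: Record<string, unknown>;    // passed straight to the model
  cache_dir?: string;                  // cache directory for downloaded models
  local_files_only?: boolean;          // skip the huggingface hub, load from local files only
  revision?: string;                   // branch, tag or commit id; defaults to "main"
  model_file_name?: string;
  extraction_config?: {
    pooling?: "none" | "mean" | "cls"; // defaults to "none"
    normalize?: boolean;               // defaults to true
    quantize?: boolean;                // defaults to false
    precision?: "binary" | "ubinary";  // only used when quantize is true
  };
}
```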

deploy/magda-embedding-api/Chart.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 apiVersion: v2
 name: magda-embedding-api
 description: An OpenAI embeddings API compatible microservice for Magda.
-version: "1.0.0"
+version: "1.1.0-alpha.0"
 kubeVersion: ">= 1.21.0"
 home: "https://github.com/magda-io/magda-embedding-api"
 sources: [ "https://github.com/magda-io/magda-embedding-api" ]
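Given the chart's description of the service as an OpenAI embeddings API compatible microservice, a client call would look roughly like the sketch below. The in-cluster host name and the standard `/v1/embeddings` route are assumptions for illustration, not details taken from this diff:

```typescript
// Hedged example: send an OpenAI-style embeddings request to the service.
const res = await fetch("http://magda-embedding-api/v1/embeddings", {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({
    model: "Alibaba-NLP/gte-base-en-v1.5",
    input: ["first sentence to embed", "second sentence to embed"]
  })
});
const { data } = await res.json();
console.log(data.length, data[0].embedding.length); // number of inputs, embedding dimension
```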

deploy/magda-embedding-api/templates/deployment.yaml

Lines changed: 6 additions & 0 deletions
@@ -105,6 +105,12 @@ spec:
 - "start"
 - "dist/app.js"
 - "--"
+- "--minWorkers"
+- {{ .Values.minWorkers | quote }}
+- "--maxWorkers"
+- {{ .Values.maxWorkers | quote }}
+- "--workerTaskTimeout"
+- {{ .Values.workerTaskTimeout | quote }}
 - "--appConfigFile"
 - "/etc/config/appConfig.json"
 {{- if .Values.readinessProbe }}

deploy/magda-embedding-api/values.yaml

Lines changed: 20 additions & 7 deletions
@@ -14,8 +14,8 @@ logLevel: "warn"
 
 # -- (int) The maximum amount of time in milliseconds in which a fastify plugin can load.
 # If not, ready will complete with an Error with code 'ERR_AVVIO_PLUGIN_TIMEOUT'.
-# @default -- Default to 10000 (10 seconds).
-pluginTimeout:
+# @default -- Default to 180000 (180 seconds).
+pluginTimeout: 180000
 
 # -- (int) Defines the maximum payload, in bytes, that the server is allowed to accept
 # @default -- Default to 10485760 (10MB).
@@ -57,6 +57,18 @@ closeGraceDelay: 25000
 # Depending on the model size, you might also want to adjust the `resources.limits.memory` & `resources.requests.memory` value.
 appConfig: {}
 
+# -- (int) The maximum number of workers that run the model to serve the request.
+# @default -- Default to 1.
+maxWorkers: 1
+
+# -- (int) The minimum number of workers that run the model to serve the request.
+# @default -- Default to 1.
+minWorkers: 1
+
+# -- (int) The maximum time in milliseconds that a worker can run before being killed.
+# @default -- Default to 60000 (60 seconds).
+workerTaskTimeout: 60000
+
 # image setting loading order: (from higher priority to lower priority)
 # - Values.image.x
 # - Values.defaultImage.x
@@ -76,12 +88,12 @@ defaultImage:
 nameOverride: ""
 fullnameOverride: ""
 
-replicas: 1
+replicas: 2
 
 autoscaling:
   hpa:
     enabled: false
-    minReplicas: 1
+    minReplicas: 2
     maxReplicas: 3
     targetCPU: 90
     targetMemory: ""
@@ -191,11 +203,12 @@ resources:
   requests:
     cpu: "100m"
     # -- (string) the memory request of the container
-    # Once the model is loaded, the memory usage of the service for serving requests would be much lower. Set to 650M for the default model.
+    # Once the model is loaded, the memory usage of the service for serving requests would be much lower. Set to 850M for the default model.
     memory: "650M"
   limits:
     # -- (string) the memory limit of the container
     # Due to [this issue of ONNX runtime](https://github.com/microsoft/onnxruntime/issues/15080), the peak memory usage of the service is much higher than the model file size.
     # When changing the default model, be sure to test the peak memory usage of the service before setting the memory limit.
-    # quantized model will be used by default, the memory limit is set to 1100M to accommodate the default model size.
-    memory: "1100M"
+    # When testing your model's memory requirements, note that memory usage often goes much higher with long context lengths.
+    # E.g. the default model supports up to 8192 tokens (default max_length set to 1024), but when the content goes beyond 512 tokens, memory usage will be much higher (around 2G required).
+    memory: "850M"

deploy/test-deploy.yaml

Lines changed: 5 additions & 1 deletion
@@ -19,7 +19,11 @@ appConfig:
     - name: Xenova/bge-small-en-v1.5
       # set quantized to false to use the non-quantized version of the model
      # by default, the quantized version of the model will be used
-      quantized: true
+      dtype: "q8"
+      # optional: set the max length of the input text
+      # if not set, the value in the model config will be used
+      # if the model config does not have max_length, the default value (512) will be used
+      max_length: 512
       extraction_config:
         pooling: "mean"
         normalize: true

package.json

Lines changed: 9 additions & 6 deletions
@@ -1,7 +1,7 @@
 {
   "name": "magda-embedding-api",
   "type": "module",
-  "version": "1.0.0",
+  "version": "1.1.0-alpha.0",
   "description": "An OpenAI embeddings API compatible microservice for Magda.",
   "main": "app.ts",
   "directories": {
@@ -22,7 +22,8 @@
     "prebuild": "rimraf dist",
     "build": "tsc",
     "watch": "tsc -w",
-    "start": "npm run build && fastify start -l info dist/app.js",
+    "start": "npm run build && fastify start -T 60000 -l info dist/app.js -- --workerTaskTimeout 10000",
+    "download-default-model": "node ./dist/loadDefaultModel.js",
     "dev": "npm run build && concurrently -k -p \"[{name}]\" -n \"TypeScript,App\" -c \"yellow.bold,cyan.bold\" \"npm:watch\" \"npm:dev:start\"",
     "dev:start": "fastify start --ignore-watch=.ts$ -w -l info -P dist/app.js",
     "docker-build-local": "create-docker-context-for-node-component --build --push --tag auto --repository example.com",
@@ -41,20 +42,22 @@
     "@fastify/autoload": "^5.10.0",
     "@fastify/sensible": "^5.6.0",
     "@fastify/type-provider-typebox": "^4.0.0",
+    "@huggingface/transformers": "^3.2.4",
     "@sinclair/typebox": "^0.32.34",
-    "@xenova/transformers": "^2.17.2",
     "fastify": "^4.28.1",
     "fastify-cli": "^6.2.1",
     "fastify-plugin": "^4.5.1",
     "fs-extra": "^11.2.0",
-    "onnxruntime-node": "^1.14.0"
+    "onnxruntime-node": "^1.20.1",
+    "workerpool": "^9.2.0"
   },
   "devDependencies": {
-    "@langchain/openai": "^0.2.1",
+    "@langchain/openai": "^0.2.8",
     "@magda/ci-utils": "^1.0.5",
     "@magda/docker-utils": "^4.2.1",
     "@types/fs-extra": "^11.0.4",
     "@types/node": "^18.19.3",
+    "@types/workerpool": "^6.4.7",
     "concurrently": "^8.2.2",
     "eslint": "^9.6.0",
     "fastify-tsconfig": "^2.0.0",
@@ -75,7 +78,7 @@
   "exclude": [
     "test/helper.ts"
   ],
-  "timeout": 120,
+  "timeout": 12000,
   "jobs": 1
 },
 "config": {

src/app.ts

Lines changed: 5 additions & 1 deletion
@@ -9,12 +9,16 @@ const __dirname = path.dirname(__filename);
 
 export type AppOptions = {
     appConfigFile: string;
+    maxWorkers: number;
+    minWorkers: number;
     // Place your custom options for app below here.
 } & Partial<AutoloadPluginOptions>;
 
 // Pass --options via CLI arguments in command to enable these options.
 const options: AppOptions = {
-    appConfigFile: ""
+    appConfigFile: "",
+    maxWorkers: 1,
+    minWorkers: 1
 };
 
 const app: FastifyPluginAsync<AppOptions> = async (
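The `minWorkers`, `maxWorkers` and `workerTaskTimeout` options introduced above line up with the `workerpool` dependency added in package.json. Below is a rough sketch of how such options could drive a pool; the worker script path, task name and wiring are assumptions, not the project's actual implementation:

```typescript
import workerpool from "workerpool";

// Hypothetical wiring: size the pool from the new CLI options and enforce a
// per-task timeout so a stuck encode run is rejected instead of blocking the server.
const pool = workerpool.pool("./dist/worker.js", {
  minWorkers: 1, // would come from options.minWorkers
  maxWorkers: 1  // would come from options.maxWorkers
});

async function encodeViaWorker(sentences: string[], timeoutMs = 60000) {
  // exec() returns a workerpool promise that supports .timeout(ms);
  // the task is cancelled if it runs longer than the configured workerTaskTimeout.
  const embeddings = await pool.exec("encode", [sentences]).timeout(timeoutMs);
  return embeddings as number[][];
}
```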
