Skip to content

Commit c9d1b8a

Browse files
committed
tokenizer: only use padding when multiple inputs are received
1 parent b5ed686 commit c9d1b8a

File tree

1 file changed

+9
-3
lines changed

1 file changed

+9
-3
lines changed

src/libs/EmbeddingEncoder.ts

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ class EmbeddingEncoder {
209209

210210
// Run tokenization
211211
const model_inputs = this.tokenizer(texts, {
212-
padding: true,
212+
padding: typeof texts !== "string",
213213
truncation: true,
214214
max_length:
215215
typeof max_length !== "undefined" && max_length > 0
@@ -334,6 +334,8 @@ class EmbeddingEncoder {
334334
`Model \`${model}\` is not supported. Supported models: ${this.supportModels.join(", ")}`
335335
);
336336
}
337+
const modelOpts = this.getModelByName(model);
338+
const { max_length } = modelOpts;
337339
opts = {
338340
...opts,
339341
...(typeof opts.padding !== "boolean" ? { padding: true } : {}),
@@ -343,8 +345,12 @@ class EmbeddingEncoder {
343345
};
344346

345347
return this.tokenizer(texts, {
346-
padding: true,
347-
truncation: true
348+
padding: typeof texts !== "string",
349+
truncation: true,
350+
max_length:
351+
typeof max_length !== "undefined" && max_length > 0
352+
? max_length
353+
: DEFAULT_MAX_LENGTH
348354
});
349355
}
350356
}

0 commit comments

Comments
 (0)