From 607cb462adfed97c77d63ddb44e3e5cea42e6b84 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Sun, 3 May 2026 06:17:55 -0600 Subject: [PATCH] fix(search): point jina-code alias at published HF repo The `jina-code` alias mapped to `Xenova/jina-embeddings-v2-base-code`, which 404s on Hugging Face. Point it at `jinaai/jina-embeddings-v2-base-code`, the published code embedding model, drop the stale "requires HF token" note in the README, and add a regression test for the alias. Fixes #1025. --- README.md | 2 +- src/domain/search/models.ts | 2 +- tests/search/embedding-strategy.test.ts | 4 ++++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7aaf2fdc..f2596403 100644 --- a/README.md +++ b/README.md @@ -428,7 +428,7 @@ A single trailing semicolon is ignored (falls back to single-query mode). The `- | `minilm` | all-MiniLM-L6-v2 | 384 | ~23 MB | Apache-2.0 | Fastest, good for quick iteration | | `jina-small` | jina-embeddings-v2-small-en | 512 | ~33 MB | Apache-2.0 | Better quality, still small | | `jina-base` | jina-embeddings-v2-base-en | 768 | ~137 MB | Apache-2.0 | High quality, 8192 token context | -| `jina-code` | jina-embeddings-v2-base-code | 768 | ~137 MB | Apache-2.0 | Best for code search, trained on code+text (requires HF token) | +| `jina-code` | jina-embeddings-v2-base-code | 768 | ~137 MB | Apache-2.0 | Best for code search, trained on code+text | | `nomic` | nomic-embed-text-v1 | 768 | ~137 MB | Apache-2.0 | Good quality, 8192 context | | `nomic-v1.5` (default) | nomic-embed-text-v1.5 | 768 | ~137 MB | Apache-2.0 | **Improved nomic, Matryoshka dimensions** | | `bge-large` | bge-large-en-v1.5 | 1024 | ~335 MB | MIT | Best general retrieval, top MTEB scores | diff --git a/src/domain/search/models.ts b/src/domain/search/models.ts index 54ce5956..920e6e62 100644 --- a/src/domain/search/models.ts +++ b/src/domain/search/models.ts @@ -42,7 +42,7 @@ export const MODELS: Record = { quantized: false, }, 'jina-code': { - name: 'Xenova/jina-embeddings-v2-base-code', + name: 'jinaai/jina-embeddings-v2-base-code', dim: 768, contextWindow: 8192, desc: 'Code-aware (~137MB). Trained on code+text, best for code search.', diff --git a/tests/search/embedding-strategy.test.ts b/tests/search/embedding-strategy.test.ts index 0f829bc0..f7a79f93 100644 --- a/tests/search/embedding-strategy.test.ts +++ b/tests/search/embedding-strategy.test.ts @@ -143,6 +143,10 @@ describe('MODELS contextWindow', () => { expect(config.contextWindow, `${key} missing contextWindow`).toBeGreaterThan(0); } }); + + test('jina-code points to the published code embedding model', () => { + expect(MODELS['jina-code'].name).toBe('jinaai/jina-embeddings-v2-base-code'); + }); }); describe('buildEmbeddings with structured strategy', () => {