refactor: @llamaindex/postgres #1597

Open · wants to merge 22 commits into base: main
8 changes: 8 additions & 0 deletions .changeset/polite-coats-return.md
@@ -0,0 +1,8 @@
---
"@llamaindex/e2e": patch
"@llamaindex/core": patch
"llamaindex": patch
"pg-vector-store": patch
---

refactor: @llamaindex/postgres
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -150,7 +150,7 @@ jobs:
done
- name: Pack provider packages
run: |
-for dir in packages/providers/*; do
+for dir in packages/providers/* packages/providers/storage/*; do
if [ -d "$dir" ] && [ -f "$dir/package.json" ]; then
echo "Packing $dir"
pnpm pack --pack-destination ${{ runner.temp }} -C $dir
3 changes: 1 addition & 2 deletions e2e/node/vector-store/pg-vector-store.e2e.ts
@@ -1,6 +1,5 @@
import { config } from "dotenv";
-import { Document, VectorStoreQueryMode } from "llamaindex";
-import { PGVectorStore } from "llamaindex/vector-store/PGVectorStore";
+import { Document, PGVectorStore, VectorStoreQueryMode } from "llamaindex";
import assert from "node:assert";
import { test } from "node:test";
import pg from "pg";
21 changes: 19 additions & 2 deletions examples/vector-store/pg/README.md
@@ -37,14 +37,31 @@ Read and follow the instructions in the README.md file located one directory up

To import documents and save the embedding vectors to your database:

-> `npx tsx pg-vector-store/load-docs.ts data`
+> `npx tsx vector-store/pg/load-docs.ts data`

where `data` is the directory containing your input files. Using the `data` directory as in the example above reads all of the files in that directory with the LlamaIndexTS default readers for each file type.

## RAG Querying

To query using the resulting vector store:

-> `npx tsx pg-vector-store/query.ts`
+> `npx tsx vector-store/pg/query.ts`

The script will prompt for a question, then process and present the answer using the PGVectorStore data and your OpenAI API key. It keeps prompting until you enter `q`, `quit`, or `exit` as the next query.
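For reference, the interactive loop can be approximated like this (a minimal sketch, not the actual `query.ts`; the real script routes each question through a PGVectorStore-backed query engine):

```ts
import readline from "node:readline/promises";

const rl = readline.createInterface({
  input: process.stdin,
  output: process.stdout,
});

// Keep prompting until the user types one of the exit words.
while (true) {
  const question = await rl.question("Question: ");
  if (["q", "quit", "exit"].includes(question.trim().toLowerCase())) break;
  // Hypothetical: queryEngine would come from a PGVectorStore-backed index.
  // const answer = await queryEngine.query({ query: question });
  // console.log(answer.toString());
}
rl.close();
```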

## Supabase

You can try the Supabase example by running:

> `npx tsx vector-store/pg/supabase.ts`

This will use the `POSTGRES_URL` environment variable to connect to your Supabase database.
Get a direct connection string from the Supabase project settings page; see https://supabase.com/docs/guides/database/connecting-to-postgres#direct-connection for details.

## Vercel

You can try the Vercel example by running:

> `npx tsx vector-store/pg/vercel.ts`

For more information on Vercel Postgres, see: https://vercel.com/docs/storage/vercel-postgres/sdk
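The `vercel.ts` example itself is not shown in this diff; assuming it mirrors the Supabase example (a hypothetical sketch, not the actual file), the store construction would look like:

```ts
import { PGVectorStore } from "llamaindex";

// Hypothetical: assumes Vercel Postgres exposes a POSTGRES_URL connection
// string, so the store is configured the same way as in the Supabase example.
const pgvs = new PGVectorStore({
  clientConfig: { connectionString: process.env.POSTGRES_URL },
});
```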
34 changes: 34 additions & 0 deletions examples/vector-store/pg/supabase.ts
@@ -0,0 +1,34 @@
import dotenv from "dotenv";
import {
PGVectorStore,
SimpleDirectoryReader,
storageContextFromDefaults,
VectorStoreIndex,
} from "llamaindex";

dotenv.config();

// Get direct connection string from Supabase and set it as POSTGRES_URL environment variable
// https://supabase.com/docs/guides/database/connecting-to-postgres#direct-connection

const sourceDir = "../data";
const connectionString = process.env.POSTGRES_URL;

// Read the source documents and target a PGVectorStore collection named
// after the source directory.
const rdr = new SimpleDirectoryReader();
const docs = await rdr.loadData({ directoryPath: sourceDir });
const pgvs = new PGVectorStore({ clientConfig: { connectionString } });
pgvs.setCollection(sourceDir);

const ctx = await storageContextFromDefaults({ vectorStore: pgvs });

const index = await VectorStoreIndex.fromDocuments(docs, {
storageContext: ctx,
});

const queryEngine = index.asQueryEngine();

const results = await queryEngine.query({
query: "Information about the planet",
});

console.log(results);
4 changes: 4 additions & 0 deletions packages/core/package.json
@@ -390,6 +390,8 @@
},
"devDependencies": {
"@edge-runtime/vm": "^4.0.4",
+"@types/lodash.get": "^4.4.9",
+"@types/lodash.clone": "^4.5.9",
"ajv": "^8.17.1",
"bunchee": "6.2.0",
"happy-dom": "^15.11.6",
@@ -398,6 +400,8 @@
"dependencies": {
"@llamaindex/env": "workspace:*",
"@types/node": "^22.9.0",
+"lodash.get": "^4.4.2",
+"lodash.clone": "^4.5.0",
"magic-bytes.js": "^1.10.0",
"zod": "^3.23.8",
"zod-to-json-schema": "^3.23.3"
15 changes: 15 additions & 0 deletions packages/core/src/global/settings.ts
@@ -1,5 +1,6 @@
import { getEnv } from "@llamaindex/env";
import type { Tokenizer } from "@llamaindex/env/tokenizers";
+import type { BaseEmbedding } from "../embeddings";
import type { LLM } from "../llms";
import {
type CallbackManager,
@@ -12,6 +13,11 @@ import {
setChunkSize,
withChunkSize,
} from "./settings/chunk-size";
+import {
+getEmbeddedModel,
+setEmbeddedModel,
+withEmbeddedModel,
+} from "./settings/embedModel";
import { getLLM, setLLM, withLLM } from "./settings/llm";
import {
getTokenizer,
@@ -29,6 +35,15 @@ export const Settings = {
withLLM<Result>(llm: LLM, fn: () => Result): Result {
return withLLM(llm, fn);
},
+get embedModel() {
+return getEmbeddedModel();
+},
+set embedModel(embedModel) {
+setEmbeddedModel(embedModel);
+},
+withEmbedModel<Result>(embedModel: BaseEmbedding, fn: () => Result): Result {
+return withEmbeddedModel(embedModel, fn);
+},
get tokenizer() {
return getTokenizer();
},
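With this change the embedding model is configured through `Settings`; a minimal sketch of the new surface (`OpenAIEmbedding` from `@llamaindex/openai` is just one choice, since core no longer ships a default):

```ts
import { OpenAIEmbedding } from "@llamaindex/openai";
import { Settings } from "llamaindex";

// Register an embedding model once, near the top of the application;
// core no longer falls back to OpenAIEmbedding implicitly.
Settings.embedModel = new OpenAIEmbedding();
```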
packages/core/src/global/settings/embedModel.ts
@@ -1,15 +1,18 @@
import type { BaseEmbedding } from "@llamaindex/core/embeddings";
import { AsyncLocalStorage } from "@llamaindex/env";
-import { OpenAIEmbedding } from "@llamaindex/openai";

const embeddedModelAsyncLocalStorage = new AsyncLocalStorage<BaseEmbedding>();
let globalEmbeddedModel: BaseEmbedding | null = null;

export function getEmbeddedModel(): BaseEmbedding {
-if (globalEmbeddedModel === null) {
-globalEmbeddedModel = new OpenAIEmbedding();
+const currentEmbeddedModel =
+embeddedModelAsyncLocalStorage.getStore() ?? globalEmbeddedModel;
+if (!currentEmbeddedModel) {
+throw new Error(
+"Cannot find Embedding, please set `Settings.embedModel = ...` on the top of your code",
+);
}
-return embeddedModelAsyncLocalStorage.getStore() ?? globalEmbeddedModel;
+return currentEmbeddedModel;
}

export function setEmbeddedModel(embeddedModel: BaseEmbedding) {
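The `AsyncLocalStorage` store above is what backs the scoped `withEmbedModel` variant; a minimal sketch of how it composes with the global default:

```ts
import { OpenAIEmbedding } from "@llamaindex/openai";
import { Settings } from "llamaindex";

Settings.embedModel = new OpenAIEmbedding();

// Inside the callback, getEmbeddedModel() resolves the scoped model from
// AsyncLocalStorage; outside it, the global model set above applies.
Settings.withEmbedModel(
  new OpenAIEmbedding({ model: "text-embedding-3-small" }),
  () => {
    // embedding calls here use text-embedding-3-small
  },
);
```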
167 changes: 167 additions & 0 deletions packages/core/src/storage/doc-store/base-document-store.ts
@@ -0,0 +1,167 @@
import { path } from "@llamaindex/env";
import {
DEFAULT_DOC_STORE_PERSIST_FILENAME,
DEFAULT_PERSIST_DIR,
} from "../../global";
import type { StoredValue } from "../../schema";
import { BaseNode, Document, ObjectType, TextNode } from "../../schema";

const TYPE_KEY = "__type__";
const DATA_KEY = "__data__";

export interface Serializer<T> {
toPersistence(data: Record<string, unknown>): T;

fromPersistence(data: T): Record<string, unknown>;
}

export const jsonSerializer: Serializer<string> = {
toPersistence(data) {
return JSON.stringify(data);
},
fromPersistence(data) {
return JSON.parse(data);
},
};

export const noneSerializer: Serializer<Record<string, unknown>> = {
toPersistence(data) {
return data;
},
fromPersistence(data) {
return data;
},
};

type DocJson<Data> = {
[TYPE_KEY]: ObjectType;
[DATA_KEY]: Data;
};

export function isValidDocJson(
docJson: StoredValue | null | undefined,
): docJson is DocJson<unknown> {
return (
typeof docJson === "object" &&
docJson !== null &&
docJson[TYPE_KEY] !== undefined &&
docJson[DATA_KEY] !== undefined
);
}

export function docToJson(
doc: BaseNode,
serializer: Serializer<unknown>,
): DocJson<unknown> {
return {
[DATA_KEY]: serializer.toPersistence(doc.toJSON()),
[TYPE_KEY]: doc.type,
};
}

export function jsonToDoc<Data>(
docDict: DocJson<Data>,
serializer: Serializer<Data>,
): BaseNode {
const docType = docDict[TYPE_KEY];
// fixme: zod type check this
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const dataDict: any = serializer.fromPersistence(docDict[DATA_KEY]);
let doc: BaseNode;

if (docType === ObjectType.DOCUMENT) {
doc = new Document({
text: dataDict.text,
id_: dataDict.id_,
embedding: dataDict.embedding,
hash: dataDict.hash,
metadata: dataDict.metadata,
});
} else if (docType === ObjectType.TEXT) {
doc = new TextNode({
text: dataDict.text,
id_: dataDict.id_,
hash: dataDict.hash,
metadata: dataDict.metadata,
relationships: dataDict.relationships,
});
} else {
throw new Error(`Unknown doc type: ${docType}`);
}

return doc;
}

const DEFAULT_PERSIST_PATH = path.join(
DEFAULT_PERSIST_DIR,
DEFAULT_DOC_STORE_PERSIST_FILENAME,
);

export interface RefDocInfo {
nodeIds: string[];
// eslint-disable-next-line @typescript-eslint/no-explicit-any
extraInfo: Record<string, any>;
}

export abstract class BaseDocumentStore {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
serializer: Serializer<any> = jsonSerializer;

// Save/load
persist(persistPath: string = DEFAULT_PERSIST_PATH): void {
// Persist the docstore to a file.
}

// Main interface
abstract docs(): Promise<Record<string, BaseNode>>;

abstract addDocuments(docs: BaseNode[], allowUpdate: boolean): Promise<void>;

abstract getDocument(
docId: string,
raiseError: boolean,
): Promise<BaseNode | undefined>;

abstract deleteDocument(docId: string, raiseError: boolean): Promise<void>;

abstract documentExists(docId: string): Promise<boolean>;

// Hash
abstract setDocumentHash(docId: string, docHash: string): Promise<void>;

abstract getDocumentHash(docId: string): Promise<string | undefined>;

abstract getAllDocumentHashes(): Promise<Record<string, string>>;

// Ref Docs
abstract getAllRefDocInfo(): Promise<Record<string, RefDocInfo> | undefined>;

abstract getRefDocInfo(refDocId: string): Promise<RefDocInfo | undefined>;

abstract deleteRefDoc(refDocId: string, raiseError: boolean): Promise<void>;

// Nodes
getNodes(nodeIds: string[], raiseError: boolean = true): Promise<BaseNode[]> {
return Promise.all(
nodeIds.map((nodeId) => this.getNode(nodeId, raiseError)),
);
}

async getNode(nodeId: string, raiseError: boolean = true): Promise<BaseNode> {
const doc = await this.getDocument(nodeId, raiseError);
if (!(doc instanceof BaseNode)) {
throw new Error(`Document ${nodeId} is not a Node.`);
}
return doc;
}

async getNodeDict(nodeIdDict: {
[index: number]: string;
}): Promise<Record<number, BaseNode>> {
const result: Record<number, BaseNode> = {};
for (const index in nodeIdDict) {
result[index] = await this.getNode(nodeIdDict[index]!);
}
return result;
}
}
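A round-trip sketch for the serialization helpers above (hypothetical usage, written as if inside this module so `DocJson`, `docToJson`, `jsonToDoc`, and `jsonSerializer` are in scope):

```ts
// Persist a Document to its stored shape, then rebuild the node from it.
const doc = new Document({ text: "hello world", id_: "doc-1" });
const persisted = docToJson(doc, jsonSerializer); // { __data__: "...", __type__: "DOCUMENT" }
const restored = jsonToDoc(persisted as DocJson<string>, jsonSerializer);
console.log(restored.id_ === doc.id_); // true
```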