diff --git a/package.json b/package.json
index 1fa66eca..c1b07819 100644
--- a/package.json
+++ b/package.json
@@ -73,11 +73,17 @@
   },
   "dependencies": {
     "@eeacms/volto-matomo": "*",
+    "@langchain/core": "0.3.76",
+    "@langchain/openai": "0.6.12",
     "@microsoft/fetch-event-source": "2.0.1",
+    "d3-array": "3.2.4",
     "fast-json-patch": "3.1.1",
     "highlight.js": "11.10.0",
+    "langchain": "0.3.34",
     "luxon": "3.5.0",
     "marked": "13.0.3",
+    "mathjs": "14.7.0",
+    "natural": "8.1.0",
     "node-fetch": "2.7.0",
     "react-markdown": "6.0.3",
     "react-textarea-autosize": "^8.5.3",
diff --git a/src/halloumi/chunking.ts b/src/halloumi/chunking.ts
new file mode 100644
index 00000000..f1611582
--- /dev/null
+++ b/src/halloumi/chunking.ts
@@ -0,0 +1,168 @@
+import { OpenAIEmbeddings } from '@langchain/openai';
+import * as math from 'mathjs';
+import { quantile } from 'd3-array';
+import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
+
+// --- Step 1: Sentence splitting (character-based safeguard) ---
+const splitToSentences = async (textCorpus: string): Promise<string[]> => {
+  const splitter = new RecursiveCharacterTextSplitter({
+    chunkSize: 200,
+    chunkOverlap: 0, // no overlap at character level
+  });
+
+  const output = await splitter.createDocuments([textCorpus]);
+  return output.map((out) => out.pageContent);
+};
+
+// --- Step 2: Sentence object structure ---
+const structureSentences = (sentences: string[]): SentenceObject[] =>
+  sentences.map((sentence, i) => ({
+    sentence,
+    index: i,
+  }));
+
+// --- Step 3: Embeddings for raw sentences ---
+const generateAndAttachEmbeddings = async (
+  sentencesArray: SentenceObject[],
+): Promise<SentenceObject[]> => {
+  const embeddings = new OpenAIEmbeddings({
+    modelName: process.env.OPENAI_EMBEDDING_MODEL_NAME,
+    configuration: { baseURL: process.env.OPENAI_API_BASE_URL },
+    apiKey: process.env.OPENAI_API_KEY,
+  });
+
+  const sentencesArrayCopy = sentencesArray.map((s) => ({ ...s }));
+  const embeddingsArray = await embeddings.embedDocuments(
+    sentencesArrayCopy.map((s) => s.sentence),
+  );
+
+  for (let i = 0; i < sentencesArrayCopy.length; i++) {
+    sentencesArrayCopy[i].embedding = embeddingsArray[i];
+  }
+
+  return sentencesArrayCopy;
+};
+
+// --- Step 4: Cosine similarity ---
+const cosineSimilarity = (vecA: number[], vecB: number[]): number => {
+  const dotProduct = math.dot(vecA, vecB) as number;
+  const normA = math.norm(vecA) as number;
+  const normB = math.norm(vecB) as number;
+
+  if (normA === 0 || normB === 0) return 0;
+  return dotProduct / (normA * normB);
+};
+
+// --- Step 5: Distance + semantic shifts ---
+const calculateCosineDistancesAndSignificantShifts = (
+  sentenceObjectArray: SentenceObject[],
+  percentileThreshold: number,
+): { updatedArray: SentenceObject[]; significantShiftIndices: number[] } => {
+  const distances: number[] = [];
+
+  const updatedSentenceObjectArray = sentenceObjectArray.map(
+    (item, index, array) => {
+      if (
+        index < array.length - 1 &&
+        item.embedding &&
+        array[index + 1].embedding
+      ) {
+        const similarity = cosineSimilarity(
+          item.embedding,
+          array[index + 1].embedding,
+        );
+        const distance = 1 - similarity;
+        distances.push(distance);
+        return { ...item, distance_to_next: distance };
+      } else {
+        return { ...item, distance_to_next: undefined };
+      }
+    },
+  );
+
+  if (distances.length === 0) {
+    return {
+      updatedArray: updatedSentenceObjectArray,
+      significantShiftIndices: [],
+    };
+  }
+
+  const sortedDistances = [...distances].sort((a, b) => a - b);
+  const quantileThreshold = percentileThreshold / 100;
+  const breakpointDistanceThreshold =
+    quantile(sortedDistances, quantileThreshold) ?? 0.0;
+
+  const significantShiftIndices = distances
+    .map((distance, index) =>
+      distance > breakpointDistanceThreshold ? index : -1,
+    )
+    .filter((index) => index !== -1);
+
+  return {
+    updatedArray: updatedSentenceObjectArray,
+    significantShiftIndices,
+  };
+};
+
+// --- Step 6: Group strictly (no overlap) ---
+const groupSentencesIntoChunks = (
+  sentenceObjectArray: SentenceObject[],
+  shiftIndices: number[],
+): string[] => {
+  const chunks: string[] = [];
+  let startIdx = 0;
+
+  for (const breakpoint of shiftIndices) {
+    const group = sentenceObjectArray.slice(startIdx, breakpoint + 1);
+    chunks.push(group.map((s) => s.sentence).join(' '));
+    startIdx = breakpoint + 1;
+  }
+
+  if (startIdx < sentenceObjectArray.length) {
+    chunks.push(
+      sentenceObjectArray
+        .slice(startIdx)
+        .map((s) => s.sentence)
+        .join(' '),
+    );
+  }
+
+  return chunks;
+};
+
+// --- Step 7: Main pipeline ---
+const processTextToSemanticChunks = async (
+  textCorpus: string,
+  percentileThreshold: number = 70,
+): Promise<string[]> => {
+  const sentences = await splitToSentences(textCorpus);
+  const structuredSentences = structureSentences(sentences);
+  const sentencesWithEmbeddings =
+    await generateAndAttachEmbeddings(structuredSentences);
+
+  const { updatedArray, significantShiftIndices } =
+    calculateCosineDistancesAndSignificantShifts(
+      sentencesWithEmbeddings,
+      percentileThreshold,
+    );
+
+  return groupSentencesIntoChunks(updatedArray, significantShiftIndices);
+};
+
+// --- Types ---
+interface SentenceObject {
+  sentence: string;
+  index: number;
+  embedding?: number[];
+  distance_to_next?: number;
+}
+
+export {
+  splitToSentences,
+  structureSentences,
+  generateAndAttachEmbeddings,
+  cosineSimilarity,
+  calculateCosineDistancesAndSignificantShifts,
+  groupSentencesIntoChunks,
+  processTextToSemanticChunks,
+};
diff --git a/src/halloumi/index.js b/src/halloumi/index.js
index 247f01f0..5b02caf7 100644
--- a/src/halloumi/index.js
+++ b/src/halloumi/index.js
@@ -1,13 +1,12 @@
-// import fs from 'fs';
 import debug from 'debug';
 import fetch from 'node-fetch';
 import {
   getClaimsFromResponse,
-  getClassifierProbabilitiesFromLogits,
+  // getClassifierProbabilitiesFromLogits,
   getTokenProbabilitiesFromLogits,
 } from './postprocessing';
 import {
-  createHalloumiClassifierPrompts,
+  // createHalloumiClassifierPrompts,
   createHalloumiPrompt,
 } from './preprocessing';
 
@@ -23,54 +22,55 @@ export function applyPlattScaling(platt, probability) {
   return sigmoid(-1 * (platt.a * log_prob + platt.b));
 }
 
-export async function halloumiClassifierAPI(model, context, claims) {
-  const classifierPrompts = createHalloumiClassifierPrompts(context, claims);
-  const headers = {
-    'Content-Type': 'application/json',
-    accept: 'application/json',
-  };
-  if (model.apiKey) {
-    headers['Authorization'] = `Bearer ${model.apiKey}`;
-  }
-  const data = {
-    input: classifierPrompts.prompts,
-    model: model.name,
-  };
-
-  const response = await fetch(model.apiUrl, {
-    method: 'POST',
-    headers: headers,
-    body: JSON.stringify(data),
-  });
-  const jsonData = await response.json();
-  const output = {
-    claims: [],
-  };
-  for (let i = 0; i < classifierPrompts.prompts.length; i++) {
-    const embedding = jsonData.data[i].embedding;
-    const probs = getClassifierProbabilitiesFromLogits(embedding);
-    if (model.plattScaling) {
-      const platt = model.plattScaling;
-      const unsupportedScore = applyPlattScaling(platt, probs[1]);
-      const supportedScore = 1 - unsupportedScore;
-      probs[0] = supportedScore;
-      probs[1] = unsupportedScore;
-    }
-    const offset = classifierPrompts.responseOffsets.get(i + 1);
-    // 0-th index is the supported class.
-    // 1-th index is the unsupported class.
-    output.claims.push({
-      startOffset: offset.startOffset,
-      endOffset: offset.endOffset,
-      citationIds: [],
-      score: probs[0],
-      rationale: '',
-    });
-  }
-
-  return output;
-}
-
+// export async function halloumiClassifierAPI(model, context, claims) {
+//   const classifierPrompts = createHalloumiClassifierPrompts(context, claims);
+//   const headers = {
+//     'Content-Type': 'application/json',
+//     accept: 'application/json',
+//   };
+//   if (model.apiKey) {
+//     headers['Authorization'] = `Bearer ${model.apiKey}`;
+//   }
+//   const data = {
+//     input: classifierPrompts.prompts,
+//     model: model.name,
+//   };
+//
+//   const response = await fetch(model.apiUrl, {
+//     method: 'POST',
+//     headers: headers,
+//     body: JSON.stringify(data),
+//   });
+//   const jsonData = await response.json();
+//   const output = {
+//     claims: [],
+//   };
+//   for (let i = 0; i < classifierPrompts.prompts.length; i++) {
+//     const embedding = jsonData.data[i].embedding;
+//     const probs = getClassifierProbabilitiesFromLogits(embedding);
+//     if (model.plattScaling) {
+//       const platt = model.plattScaling;
+//       const unsupportedScore = applyPlattScaling(platt, probs[1]);
+//       const supportedScore = 1 - unsupportedScore;
+//       probs[0] = supportedScore;
+//       probs[1] = unsupportedScore;
+//     }
+//     const offset = classifierPrompts.responseOffsets.get(i + 1);
+//     // 0-th index is the supported class.
+//     // 1-th index is the unsupported class.
+//     output.claims.push({
+//       startOffset: offset.startOffset,
+//       endOffset: offset.endOffset,
+//       citationIds: [],
+//       score: probs[0],
+//       rationale: '',
+//     });
+//   }
+//
+//   return output;
+// }
+
+// main function to get verify claim response, used directly by the middleware
 export async function getVerifyClaimResponse(model, context, claims) {
   if (!context || !claims) {
     const response = {
@@ -79,25 +79,27 @@ export async function getVerifyClaimResponse(model, context, claims) {
     };
     return response;
   }
-  if (model.isEmbeddingModel) {
-    return halloumiClassifierAPI(model, context, claims).then((response) => {
-      const parsedResponse = {
-        claims: response.claims,
-        citations: {},
-      };
-      return parsedResponse;
-    });
-  }
-  const prompt = createHalloumiPrompt(context, claims);
-  // write prompt to a file named prompt.txt
-  // fs.writeFileSync(
-  //   '/home/tibi/work/tmp/prompt.txt',
-  //   JSON.stringify(prompt, null, 2),
-  // );
+  // if (model.isEmbeddingModel) {
+  //   return halloumiClassifierAPI(model, context, claims).then((response) => {
+  //     const parsedResponse = {
+  //       claims: response.claims,
+  //       citations: {},
+  //     };
+  //     return parsedResponse;
+  //   });
+  // }
+  const prompt = await createHalloumiPrompt(context, claims);
+
   log('Halloumi prompt', JSON.stringify(prompt, null, 2));
-  const result = await halloumiGenerativeAPI(model, prompt).then((claims) => {
-    return convertGenerativesClaimToVerifyClaimResponse(claims, prompt);
-  });
+
+  const rawClaims = await halloumiGenerativeAPI(model, prompt);
+  console.log('Halloumi prompt responseOffsets ', prompt.responseOffsets);
+  const result = {
+    ...convertGenerativesClaimToVerifyClaimResponse(rawClaims, prompt),
+    rawClaims,
+    halloumiPrompt: prompt,
+  };
+  return result;
 }
 
@@ -132,13 +134,11 @@ export async function halloumiGenerativeAPI(model, prompt) {
 
   const jsonData = await response.json();
 
-  // write jsonData to a file named response.json
-  // fs.writeFileSync(
-  //   '/home/tibi/work/tmp/response.json',
-  //   JSON.stringify(jsonData, null, 2),
-  // );
-  log('Classifier response', jsonData);
-  log('Logprobs', jsonData.choices[0].logprobs.content);
+  log('Halloumi generative full response:', jsonData);
+  log(
+    'Halloumi generative message content:',
+    jsonData.choices[0].message.content,
+  );
 
   const logits = jsonData.choices[0].logprobs.content;
   const tokenProbabilities = getTokenProbabilitiesFromLogits(
@@ -193,12 +193,20 @@ export function convertGenerativesClaimToVerifyClaimResponse(
     citationIds.push(citation.toString());
   }
 
+  console.log('generativeClaims', generativeClaims);
+
   const claimId = generativeClaim.claimId;
   if (!prompt.responseOffsets.has(claimId)) {
     throw new Error(`Claim ${claimId} not found in response offsets.`);
   }
   const claimResponseWindow = prompt.responseOffsets.get(claimId);
+  console.log(
+    'claimResponseWindow',
+    claimResponseWindow,
+    prompt.responseOffsets,
+  );
+
   const score = generativeClaim.probabilities.get('supported');
   const claim = {
     startOffset: claimResponseWindow.startOffset,
diff --git a/src/halloumi/middleware.js b/src/halloumi/middleware.js
index 2593e3bd..3e2f22d4 100644
--- a/src/halloumi/middleware.js
+++ b/src/halloumi/middleware.js
@@ -57,9 +57,11 @@ export default async function middleware(req, res, next) {
       sources.join('\n---\n'),
       answer,
     );
-    log('Halloumi response', resp);
+    log('Halloumi parsed response:', JSON.stringify(resp, null, 2));
     res.send(resp);
   } catch (error) {
+    // eslint-disable-next-line no-console
+    console.error('Halloumi error', error);
     res.send({ error: `Halloumi error: ${error}` });
   }
 }
diff --git a/src/halloumi/postprocessing.js b/src/halloumi/postprocessing.js
index 1ee071c4..24dde30a 100644
--- a/src/halloumi/postprocessing.js
+++ b/src/halloumi/postprocessing.js
@@ -1,5 +1,4 @@
-import debug from 'debug';
-
+// import debug from 'debug';
 //
 // /**
 //  * Represents a claim object with all relevant information.
diff --git a/src/halloumi/preprocessing.js b/src/halloumi/preprocessing.js
index 9e1ab9b3..1e0975c7 100644
--- a/src/halloumi/preprocessing.js
+++ b/src/halloumi/preprocessing.js
@@ -1,13 +1,11 @@
-/**
- * Represents a prompt with appropriate metadata
- */
+import { processTextToSemanticChunks } from './chunking.ts';
 
 /**
  * Splits a given text into sentences using sentence-splitter.
  * @param text The input string to split.
  * @returns An array of sentence strings.
  */
-function splitIntoSentences(text) {
+async function processTextToSemanticChunks2(text) {
   const segmenter = new Intl.Segmenter('en', { granularity: 'sentence' });
   const segments = segmenter.segment(text);
@@ -68,21 +66,27 @@ function getOffsets(originalString, sentences) {
  * @param request The request or question that was used to produce the response.
  * @returns The Halloumi prompt.
  */
-export function createHalloumiPrompt(
+export async function createHalloumiPrompt(
   context,
   response,
   request = 'Make one or more claims about information in the documents.',
 ) {
-  const contextSentences = splitIntoSentences(context);
+  const contextSentences = await processTextToSemanticChunks(context);
+
+  const contextOffsets = getOffsets(context, contextSentences);
   const annotatedContextSentences = annotate(contextSentences, 's');
   const annotatedContext = `<|context|>${annotatedContextSentences}`;
 
   const annotatedRequest = `<|request|><${request.trim()}>`;
 
-  const responseSentences = splitIntoSentences(response);
+  const responseSentences = await processTextToSemanticChunks(response);
   const responseOffsets = getOffsets(response, responseSentences);
+
+  // console.log('responseSentences', { responseSentences, response });
+  console.log('dump', { response, responseSentences, responseOffsets });
+
   const annotatedResponseSentences = annotate(responseSentences, 'r');
+
   const annotatedResponse = `<|response|>${annotatedResponseSentences}`;
 
   const prompt = `${annotatedContext}${annotatedRequest}${annotatedResponse}`;
@@ -115,8 +119,8 @@ export function createHalloumiClassifierPrompt(context, response) {
  * @param response The response to the request.
  * @returns The Halloumi Classifier prompt strings.
  */
-export function createHalloumiClassifierPrompts(context, response) {
-  const responseSentences = splitIntoSentences(response);
+export async function createHalloumiClassifierPrompts(context, response) {
+  const responseSentences = await processTextToSemanticChunks(response);
   const responseOffsets = getOffsets(response, responseSentences);
   const prompts = [];
   for (const sentence of responseSentences) {
@@ -129,7 +133,6 @@ export function createHalloumiClassifierPrompts(context, response) {
     sentences: responseSentences,
     responseOffsets: responseOffsets,
   };
-  // console.log(halloumiPrompt);
   return halloumiPrompt;
 }
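
For reference, a minimal usage sketch (not part of the patch) of the semantic chunking pipeline added in src/halloumi/chunking.ts. It assumes OPENAI_API_KEY, OPENAI_API_BASE_URL and OPENAI_EMBEDDING_MODEL_NAME are set in the environment, since generateAndAttachEmbeddings reads them from process.env; the import path is illustrative.

import { processTextToSemanticChunks } from './src/halloumi/chunking';

const context = [
  'Soil erosion is accelerating in several EU regions.',
  'Cover crops and reduced tillage can limit the loss of topsoil.',
  'Unrelatedly, air quality in cities improved slightly last year.',
].join(' ');

// Break the text wherever the cosine distance between consecutive
// sentence embeddings exceeds the 70th percentile of all distances.
processTextToSemanticChunks(context, 70).then((chunks) => {
  // Each chunk groups consecutive, semantically related sentences.
  chunks.forEach((chunk, i) => console.log(`chunk ${i}:`, chunk));
});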