6 changes: 6 additions & 0 deletions package.json
@@ -73,11 +73,17 @@
},
"dependencies": {
"@eeacms/volto-matomo": "*",
"@langchain/core": "0.3.76",
"@langchain/openai": "0.6.12",
"@microsoft/fetch-event-source": "2.0.1",
"d3-array": "3.2.4",
"fast-json-patch": "3.1.1",
"highlight.js": "11.10.0",
"langchain": "0.3.34",
"luxon": "3.5.0",
"marked": "13.0.3",
"mathjs": "14.7.0",
"natural": "8.1.0",
"node-fetch": "2.7.0",
"react-markdown": "6.0.3",
"react-textarea-autosize": "^8.5.3",
168 changes: 168 additions & 0 deletions src/halloumi/chunking.ts
@@ -0,0 +1,168 @@
import { OpenAIEmbeddings } from '@langchain/openai';
import * as math from 'mathjs';
import { quantile } from 'd3-array';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';

// --- Step 1: Sentence splitting (character-based safeguard) ---
const splitToSentences = async (textCorpus: string): Promise<string[]> => {
const splitter = new RecursiveCharacterTextSplitter({
chunkSize: 200,
chunkOverlap: 0, // no overlap at character level
});

const output = await splitter.createDocuments([textCorpus]);
return output.map((out) => out.pageContent);
};

// --- Step 2: Sentence object structure ---
const structureSentences = (sentences: string[]): SentenceObject[] =>
sentences.map((sentence, i) => ({
sentence,
index: i,
}));

// --- Step 3: Embeddings for raw sentences ---
const generateAndAttachEmbeddings = async (
sentencesArray: SentenceObject[],
): Promise<SentenceObject[]> => {
const embeddings = new OpenAIEmbeddings({
modelName: process.env.OPENAI_EMBEDDING_MODEL_NAME,
configuration: { baseURL: process.env.OPENAI_API_BASE_URL },
apiKey: process.env.OPENAI_API_KEY,
});

const sentencesArrayCopy = sentencesArray.map((s) => ({ ...s }));
const embeddingsArray = await embeddings.embedDocuments(
sentencesArrayCopy.map((s) => s.sentence),
);

for (let i = 0; i < sentencesArrayCopy.length; i++) {
sentencesArrayCopy[i].embedding = embeddingsArray[i];
}

return sentencesArrayCopy;
};

// --- Step 4: Cosine similarity ---
const cosineSimilarity = (vecA: number[], vecB: number[]): number => {
const dotProduct = math.dot(vecA, vecB) as number;
const normA = math.norm(vecA) as number;
const normB = math.norm(vecB) as number;

if (normA === 0 || normB === 0) return 0;
return dotProduct / (normA * normB);
};

// --- Step 5: Distance + semantic shifts ---
const calculateCosineDistancesAndSignificantShifts = (
sentenceObjectArray: SentenceObject[],
percentileThreshold: number,
): { updatedArray: SentenceObject[]; significantShiftIndices: number[] } => {
const distances: number[] = [];

const updatedSentenceObjectArray = sentenceObjectArray.map(
(item, index, array) => {
if (
index < array.length - 1 &&
item.embedding &&
array[index + 1].embedding
) {
const similarity = cosineSimilarity(
item.embedding,
array[index + 1].embedding,
);
const distance = 1 - similarity;
distances.push(distance);
return { ...item, distance_to_next: distance };
} else {
return { ...item, distance_to_next: undefined };
}
},
);

if (distances.length === 0) {
return {
updatedArray: updatedSentenceObjectArray,
significantShiftIndices: [],
};
}

const sortedDistances = [...distances].sort((a, b) => a - b);
const quantileThreshold = percentileThreshold / 100;
const breakpointDistanceThreshold =
quantile(sortedDistances, quantileThreshold) ?? 0.0;

const significantShiftIndices = distances
.map((distance, index) =>
distance > breakpointDistanceThreshold ? index : -1,
)
.filter((index) => index !== -1);

return {
updatedArray: updatedSentenceObjectArray,
significantShiftIndices,
};
};

// --- Step 6: Group strictly (no overlap) ---
const groupSentencesIntoChunks = (
sentenceObjectArray: SentenceObject[],
shiftIndices: number[],
): string[] => {
const chunks: string[] = [];
let startIdx = 0;

for (const breakpoint of shiftIndices) {
const group = sentenceObjectArray.slice(startIdx, breakpoint + 1);
chunks.push(group.map((s) => s.sentence).join(' '));
startIdx = breakpoint + 1;
}

if (startIdx < sentenceObjectArray.length) {
chunks.push(
sentenceObjectArray
.slice(startIdx)
.map((s) => s.sentence)
.join(' '),
);
}

return chunks;
};

// --- Step 7: Main pipeline ---
const processTextToSemanticChunks = async (
textCorpus: string,
percentileThreshold: number = 70,
): Promise<string[]> => {
const sentences = await splitToSentences(textCorpus);
const structuredSentences = structureSentences(sentences);
const sentencesWithEmbeddings =
await generateAndAttachEmbeddings(structuredSentences);

const { updatedArray, significantShiftIndices } =
calculateCosineDistancesAndSignificantShifts(
sentencesWithEmbeddings,
percentileThreshold,
);

return groupSentencesIntoChunks(updatedArray, significantShiftIndices);
};

// --- Types ---
interface SentenceObject {
sentence: string;
index: number;
embedding?: number[];
distance_to_next?: number;
}

export {
splitToSentences,
structureSentences,
generateAndAttachEmbeddings,
cosineSimilarity,
calculateCosineDistancesAndSignificantShifts,
groupSentencesIntoChunks,
processTextToSemanticChunks,
};
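For context, a minimal usage sketch of the new chunking pipeline (not part of the diff): it assumes the environment variables the module reads (OPENAI_EMBEDDING_MODEL_NAME, OPENAI_API_BASE_URL, OPENAI_API_KEY) are already set, and the sample text and 70th-percentile threshold are illustrative only.

import { processTextToSemanticChunks } from './chunking';

// Illustrative caller only; assumes the OpenAI embedding env vars are configured.
const demo = async () => {
  const text =
    'First topic sentence. More detail on the first topic. ' +
    'Now a very different topic. Further detail on that second topic.';

  // 70 is the default percentile threshold; raising it keeps only the sharpest
  // semantic shifts as breakpoints, so the resulting chunks become fewer and larger.
  const chunks = await processTextToSemanticChunks(text, 70);
  chunks.forEach((chunk, i) => console.log(`chunk ${i}:`, chunk));
};

demo().catch(console.error);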
160 changes: 84 additions & 76 deletions src/halloumi/index.js
@@ -1,13 +1,12 @@
// import fs from 'fs';
import debug from 'debug';
import fetch from 'node-fetch';
import {
getClaimsFromResponse,
getClassifierProbabilitiesFromLogits,
// getClassifierProbabilitiesFromLogits,
getTokenProbabilitiesFromLogits,
} from './postprocessing';
import {
createHalloumiClassifierPrompts,
// createHalloumiClassifierPrompts,
createHalloumiPrompt,
} from './preprocessing';

@@ -23,54 +22,55 @@ export function applyPlattScaling(platt, probability) {
return sigmoid(-1 * (platt.a * log_prob + platt.b));
}

export async function halloumiClassifierAPI(model, context, claims) {
const classifierPrompts = createHalloumiClassifierPrompts(context, claims);
const headers = {
'Content-Type': 'application/json',
accept: 'application/json',
};
if (model.apiKey) {
headers['Authorization'] = `Bearer ${model.apiKey}`;
}
const data = {
input: classifierPrompts.prompts,
model: model.name,
};

const response = await fetch(model.apiUrl, {
method: 'POST',
headers: headers,
body: JSON.stringify(data),
});
const jsonData = await response.json();
const output = {
claims: [],
};
for (let i = 0; i < classifierPrompts.prompts.length; i++) {
const embedding = jsonData.data[i].embedding;
const probs = getClassifierProbabilitiesFromLogits(embedding);
if (model.plattScaling) {
const platt = model.plattScaling;
const unsupportedScore = applyPlattScaling(platt, probs[1]);
const supportedScore = 1 - unsupportedScore;
probs[0] = supportedScore;
probs[1] = unsupportedScore;
}
const offset = classifierPrompts.responseOffsets.get(i + 1);
// 0-th index is the supported class.
// 1-th index is the unsupported class.
output.claims.push({
startOffset: offset.startOffset,
endOffset: offset.endOffset,
citationIds: [],
score: probs[0],
rationale: '',
});
}

return output;
}

// export async function halloumiClassifierAPI(model, context, claims) {
// const classifierPrompts = createHalloumiClassifierPrompts(context, claims);
// const headers = {
// 'Content-Type': 'application/json',
// accept: 'application/json',
// };
// if (model.apiKey) {
// headers['Authorization'] = `Bearer ${model.apiKey}`;
// }
// const data = {
// input: classifierPrompts.prompts,
// model: model.name,
// };
//
// const response = await fetch(model.apiUrl, {
// method: 'POST',
// headers: headers,
// body: JSON.stringify(data),
// });
// const jsonData = await response.json();
// const output = {
// claims: [],
// };
// for (let i = 0; i < classifierPrompts.prompts.length; i++) {
// const embedding = jsonData.data[i].embedding;
// const probs = getClassifierProbabilitiesFromLogits(embedding);
// if (model.plattScaling) {
// const platt = model.plattScaling;
// const unsupportedScore = applyPlattScaling(platt, probs[1]);
// const supportedScore = 1 - unsupportedScore;
// probs[0] = supportedScore;
// probs[1] = unsupportedScore;
// }
// const offset = classifierPrompts.responseOffsets.get(i + 1);
// // 0-th index is the supported class.
// // 1-th index is the unsupported class.
// output.claims.push({
// startOffset: offset.startOffset,
// endOffset: offset.endOffset,
// citationIds: [],
// score: probs[0],
// rationale: '',
// });
// }
//
// return output;
// }

// main function to get verify claim response, used directly by the middleware
export async function getVerifyClaimResponse(model, context, claims) {
if (!context || !claims) {
const response = {
@@ -79,25 +79,27 @@ export async function getVerifyClaimResponse(model, context, claims) {
};
return response;
}
if (model.isEmbeddingModel) {
return halloumiClassifierAPI(model, context, claims).then((response) => {
const parsedResponse = {
claims: response.claims,
citations: {},
};
return parsedResponse;
});
}
const prompt = createHalloumiPrompt(context, claims);
// write prompt to a file named prompt.txt
// fs.writeFileSync(
// '/home/tibi/work/tmp/prompt.txt',
// JSON.stringify(prompt, null, 2),
// );
// if (model.isEmbeddingModel) {
// return halloumiClassifierAPI(model, context, claims).then((response) => {
// const parsedResponse = {
// claims: response.claims,
// citations: {},
// };
// return parsedResponse;
// });
// }
const prompt = await createHalloumiPrompt(context, claims);

log('Halloumi prompt', JSON.stringify(prompt, null, 2));
const result = await halloumiGenerativeAPI(model, prompt).then((claims) => {
return convertGenerativesClaimToVerifyClaimResponse(claims, prompt);
});

const rawClaims = await halloumiGenerativeAPI(model, prompt);
console.log('Halloumi prompt responseOffsets ', prompt.responseOffsets);
const result = {
...convertGenerativesClaimToVerifyClaimResponse(rawClaims, prompt),
rawClaims,
halloumiPrompt: prompt,
};

return result;
}

@@ -132,13 +134,11 @@ export async function halloumiGenerativeAPI(model, prompt) {

const jsonData = await response.json();

// write jsonData to a file named response.json
// fs.writeFileSync(
// '/home/tibi/work/tmp/response.json',
// JSON.stringify(jsonData, null, 2),
// );
log('Classifier response', jsonData);
log('Logprobs', jsonData.choices[0].logprobs.content);
log('Halloumi generative full response:', jsonData);
log(
'Halloumi generative message content:',
jsonData.choices[0].message.content,
);

const logits = jsonData.choices[0].logprobs.content;
const tokenProbabilities = getTokenProbabilitiesFromLogits(
@@ -193,12 +193,20 @@ export function convertGenerativesClaimToVerifyClaimResponse(
citationIds.push(citation.toString());
}

console.log('generativeClaims', generativeClaims);

const claimId = generativeClaim.claimId;
if (!prompt.responseOffsets.has(claimId)) {
throw new Error(`Claim ${claimId} not found in response offsets.`);
}

const claimResponseWindow = prompt.responseOffsets.get(claimId);
console.log(
'claimResponseWindow',
claimResponseWindow,
prompt.responseOffsets,
);

const score = generativeClaim.probabilities.get('supported');
const claim = {
startOffset: claimResponseWindow.startOffset,
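As a quick reference for how the two new fields surface to callers, a hypothetical consumer of getVerifyClaimResponse might look like the sketch below; the import path and the model, context and claims values are placeholders, not part of the PR.

import { getVerifyClaimResponse } from './halloumi'; // illustrative path

// Hypothetical middleware-side handler; model, context and claims are placeholders.
async function handleVerify(model, context, claims) {
  const result = await getVerifyClaimResponse(model, context, claims);

  // The converted claims (startOffset, endOffset, citationIds, score, rationale)
  // are unchanged; rawClaims and halloumiPrompt additionally expose the raw
  // generative output and the prompt (including its responseOffsets) for debugging.
  console.log(result.claims);
  console.log(result.halloumiPrompt.responseOffsets);
  return result;
}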