6 changes: 6 additions & 0 deletions package.json
@@ -73,11 +73,17 @@
},
"dependencies": {
"@eeacms/volto-matomo": "*",
"@langchain/core": "0.3.76",
"@langchain/openai": "0.6.12",
"@microsoft/fetch-event-source": "2.0.1",
"d3-array": "3.2.4",
"fast-json-patch": "3.1.1",
"highlight.js": "11.10.0",
"langchain": "0.3.34",
"luxon": "3.5.0",
"marked": "13.0.3",
"mathjs": "14.7.0",
"natural": "8.1.0",
"node-fetch": "2.7.0",
"react-markdown": "6.0.3",
"react-textarea-autosize": "^8.5.3",
168 changes: 168 additions & 0 deletions src/halloumi/chunking.ts
@@ -0,0 +1,168 @@
import { OpenAIEmbeddings } from '@langchain/openai';
import * as math from 'mathjs';
import { quantile } from 'd3-array';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';

// --- Step 1: Sentence splitting (character-based safeguard) ---
const splitToSentences = async (textCorpus: string): Promise<string[]> => {
const splitter = new RecursiveCharacterTextSplitter({
chunkSize: 200,
chunkOverlap: 0, // no overlap at character level
});

const output = await splitter.createDocuments([textCorpus]);
return output.map((out) => out.pageContent);
};

// --- Step 2: Sentence object structure ---
const structureSentences = (sentences: string[]): SentenceObject[] =>
sentences.map((sentence, i) => ({
sentence,
index: i,
}));

// --- Step 3: Embeddings for raw sentences ---
const generateAndAttachEmbeddings = async (
sentencesArray: SentenceObject[],
): Promise<SentenceObject[]> => {
const embeddings = new OpenAIEmbeddings({
modelName: process.env.OPENAI_EMBEDDING_MODEL_NAME,
configuration: { baseURL: process.env.OPENAI_API_BASE_URL },
apiKey: process.env.OPENAI_API_KEY,
});

const sentencesArrayCopy = sentencesArray.map((s) => ({ ...s }));
const embeddingsArray = await embeddings.embedDocuments(
sentencesArrayCopy.map((s) => s.sentence),
);

for (let i = 0; i < sentencesArrayCopy.length; i++) {
sentencesArrayCopy[i].embedding = embeddingsArray[i];
}

return sentencesArrayCopy;
};

// --- Step 4: Cosine similarity ---
const cosineSimilarity = (vecA: number[], vecB: number[]): number => {
const dotProduct = math.dot(vecA, vecB) as number;
const normA = math.norm(vecA) as number;
const normB = math.norm(vecB) as number;

if (normA === 0 || normB === 0) return 0;
return dotProduct / (normA * normB);
};

// --- Step 5: Distance + semantic shifts ---
const calculateCosineDistancesAndSignificantShifts = (
sentenceObjectArray: SentenceObject[],
percentileThreshold: number,
): { updatedArray: SentenceObject[]; significantShiftIndices: number[] } => {
const distances: number[] = [];

const updatedSentenceObjectArray = sentenceObjectArray.map(
(item, index, array) => {
if (
index < array.length - 1 &&
item.embedding &&
array[index + 1].embedding
) {
const similarity = cosineSimilarity(
item.embedding,
array[index + 1].embedding,
);
const distance = 1 - similarity;
distances.push(distance);
return { ...item, distance_to_next: distance };
} else {
return { ...item, distance_to_next: undefined };
}
},
);

if (distances.length === 0) {
return {
updatedArray: updatedSentenceObjectArray,
significantShiftIndices: [],
};
}

const sortedDistances = [...distances].sort((a, b) => a - b);
const quantileThreshold = percentileThreshold / 100;
const breakpointDistanceThreshold =
quantile(sortedDistances, quantileThreshold) ?? 0.0;

const significantShiftIndices = distances
.map((distance, index) =>
distance > breakpointDistanceThreshold ? index : -1,
)
.filter((index) => index !== -1);

return {
updatedArray: updatedSentenceObjectArray,
significantShiftIndices,
};
};

// --- Step 6: Group strictly (no overlap) ---
const groupSentencesIntoChunks = (
sentenceObjectArray: SentenceObject[],
shiftIndices: number[],
): string[] => {
const chunks: string[] = [];
let startIdx = 0;

for (const breakpoint of shiftIndices) {
const group = sentenceObjectArray.slice(startIdx, breakpoint + 1);
chunks.push(group.map((s) => s.sentence).join(' '));
startIdx = breakpoint + 1;
}

if (startIdx < sentenceObjectArray.length) {
chunks.push(
sentenceObjectArray
.slice(startIdx)
.map((s) => s.sentence)
.join(' '),
);
}

return chunks;
};

// --- Step 7: Main pipeline ---
const processTextToSemanticChunks = async (
textCorpus: string,
percentileThreshold: number = 70,
): Promise<string[]> => {
const sentences = await splitToSentences(textCorpus);
const structuredSentences = structureSentences(sentences);
const sentencesWithEmbeddings =
await generateAndAttachEmbeddings(structuredSentences);

const { updatedArray, significantShiftIndices } =
calculateCosineDistancesAndSignificantShifts(
sentencesWithEmbeddings,
percentileThreshold,
);

return groupSentencesIntoChunks(updatedArray, significantShiftIndices);
};

// --- Types ---
interface SentenceObject {
sentence: string;
index: number;
embedding?: number[];
distance_to_next?: number;
}

export {
splitToSentences,
structureSentences,
generateAndAttachEmbeddings,
cosineSimilarity,
calculateCosineDistancesAndSignificantShifts,
groupSentencesIntoChunks,
processTextToSemanticChunks,
};
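For context, a minimal usage sketch of the new chunking pipeline (not part of the diff): it assumes the environment variables the module reads (OPENAI_EMBEDDING_MODEL_NAME, OPENAI_API_BASE_URL, OPENAI_API_KEY) are already set, and the sample text and 70th-percentile threshold are illustrative only.

import { processTextToSemanticChunks } from './chunking';

// Illustrative caller only; assumes the OpenAI embedding env vars are configured.
const demo = async () => {
  const text =
    'First topic sentence. More detail on the first topic. ' +
    'Now a very different topic. Further detail on that second topic.';

  // 70 is the default percentile threshold; raising it keeps only the sharpest
  // semantic shifts as breakpoints, so the resulting chunks become fewer and larger.
  const chunks = await processTextToSemanticChunks(text, 70);
  chunks.forEach((chunk, i) => console.log(`chunk ${i}:`, chunk));
};

demo().catch(console.error);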
160 changes: 84 additions & 76 deletions src/halloumi/index.js
@@ -1,13 +1,12 @@
// import fs from 'fs';
import debug from 'debug';
import fetch from 'node-fetch';
import {
getClaimsFromResponse,
getClassifierProbabilitiesFromLogits,
// getClassifierProbabilitiesFromLogits,
getTokenProbabilitiesFromLogits,
} from './postprocessing';
import {
createHalloumiClassifierPrompts,
// createHalloumiClassifierPrompts,
createHalloumiPrompt,
} from './preprocessing';

@@ -23,54 +22,55 @@ export function applyPlattScaling(platt, probability) {
return sigmoid(-1 * (platt.a * log_prob + platt.b));
}

export async function halloumiClassifierAPI(model, context, claims) {
const classifierPrompts = createHalloumiClassifierPrompts(context, claims);
const headers = {
'Content-Type': 'application/json',
accept: 'application/json',
};
if (model.apiKey) {
headers['Authorization'] = `Bearer ${model.apiKey}`;
}
const data = {
input: classifierPrompts.prompts,
model: model.name,
};

const response = await fetch(model.apiUrl, {
method: 'POST',
headers: headers,
body: JSON.stringify(data),
});
const jsonData = await response.json();
const output = {
claims: [],
};
for (let i = 0; i < classifierPrompts.prompts.length; i++) {
const embedding = jsonData.data[i].embedding;
const probs = getClassifierProbabilitiesFromLogits(embedding);
if (model.plattScaling) {
const platt = model.plattScaling;
const unsupportedScore = applyPlattScaling(platt, probs[1]);
const supportedScore = 1 - unsupportedScore;
probs[0] = supportedScore;
probs[1] = unsupportedScore;
}
const offset = classifierPrompts.responseOffsets.get(i + 1);
// 0-th index is the supported class.
// 1-th index is the unsupported class.
output.claims.push({
startOffset: offset.startOffset,
endOffset: offset.endOffset,
citationIds: [],
score: probs[0],
rationale: '',
});
}

return output;
}

// export async function halloumiClassifierAPI(model, context, claims) {
// const classifierPrompts = createHalloumiClassifierPrompts(context, claims);
// const headers = {
// 'Content-Type': 'application/json',
// accept: 'application/json',
// };
// if (model.apiKey) {
// headers['Authorization'] = `Bearer ${model.apiKey}`;
// }
// const data = {
// input: classifierPrompts.prompts,
// model: model.name,
// };
//
// const response = await fetch(model.apiUrl, {
// method: 'POST',
// headers: headers,
// body: JSON.stringify(data),
// });
// const jsonData = await response.json();
// const output = {
// claims: [],
// };
// for (let i = 0; i < classifierPrompts.prompts.length; i++) {
// const embedding = jsonData.data[i].embedding;
// const probs = getClassifierProbabilitiesFromLogits(embedding);
// if (model.plattScaling) {
// const platt = model.plattScaling;
// const unsupportedScore = applyPlattScaling(platt, probs[1]);
// const supportedScore = 1 - unsupportedScore;
// probs[0] = supportedScore;
// probs[1] = unsupportedScore;
// }
// const offset = classifierPrompts.responseOffsets.get(i + 1);
// // 0-th index is the supported class.
// // 1-th index is the unsupported class.
// output.claims.push({
// startOffset: offset.startOffset,
// endOffset: offset.endOffset,
// citationIds: [],
// score: probs[0],
// rationale: '',
// });
// }
//
// return output;
// }

// main function to get verify claim response, used directly by the middleware
export async function getVerifyClaimResponse(model, context, claims) {
if (!context || !claims) {
const response = {
@@ -79,25 +79,27 @@ export async function getVerifyClaimResponse(model, context, claims) {
};
return response;
}
if (model.isEmbeddingModel) {
return halloumiClassifierAPI(model, context, claims).then((response) => {
const parsedResponse = {
claims: response.claims,
citations: {},
};
return parsedResponse;
});
}
const prompt = createHalloumiPrompt(context, claims);
// write prompt to a file named prompt.txt
// fs.writeFileSync(
// '/home/tibi/work/tmp/prompt.txt',
// JSON.stringify(prompt, null, 2),
// );
// if (model.isEmbeddingModel) {
// return halloumiClassifierAPI(model, context, claims).then((response) => {
// const parsedResponse = {
// claims: response.claims,
// citations: {},
// };
// return parsedResponse;
// });
// }
const prompt = await createHalloumiPrompt(context, claims);

log('Halloumi prompt', JSON.stringify(prompt, null, 2));
const result = await halloumiGenerativeAPI(model, prompt).then((claims) => {
return convertGenerativesClaimToVerifyClaimResponse(claims, prompt);
});

const rawClaims = await halloumiGenerativeAPI(model, prompt);
console.log('Halloumi prompt responseOffsets ', prompt.responseOffsets);
const result = {
...convertGenerativesClaimToVerifyClaimResponse(rawClaims, prompt),
rawClaims,
halloumiPrompt: prompt,
};

return result;
}

@@ -132,13 +134,11 @@ export async function halloumiGenerativeAPI(model, prompt) {

const jsonData = await response.json();

// write jsonData to a file named response.json
// fs.writeFileSync(
// '/home/tibi/work/tmp/response.json',
// JSON.stringify(jsonData, null, 2),
// );
log('Classifier response', jsonData);
log('Logprobs', jsonData.choices[0].logprobs.content);
log('Halloumi generative full response:', jsonData);
log(
'Halloumi generative message content:',
jsonData.choices[0].message.content,
);

const logits = jsonData.choices[0].logprobs.content;
const tokenProbabilities = getTokenProbabilitiesFromLogits(
@@ -193,12 +193,20 @@ export function convertGenerativesClaimToVerifyClaimResponse(
citationIds.push(citation.toString());
}

console.log('generativeClaims', generativeClaims);

const claimId = generativeClaim.claimId;
if (!prompt.responseOffsets.has(claimId)) {
throw new Error(`Claim ${claimId} not found in response offsets.`);
}

const claimResponseWindow = prompt.responseOffsets.get(claimId);
console.log(
'claimResponseWindow',
claimResponseWindow,
prompt.responseOffsets,
);

const score = generativeClaim.probabilities.get('supported');
const claim = {
startOffset: claimResponseWindow.startOffset,
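As a quick reference for how the two new fields surface to callers, a hypothetical consumer of getVerifyClaimResponse might look like the sketch below; the import path and the model, context and claims values are placeholders, not part of the PR.

import { getVerifyClaimResponse } from './halloumi'; // illustrative path

// Hypothetical middleware-side handler; model, context and claims are placeholders.
async function handleVerify(model, context, claims) {
  const result = await getVerifyClaimResponse(model, context, claims);

  // The converted claims (startOffset, endOffset, citationIds, score, rationale)
  // are unchanged; rawClaims and halloumiPrompt additionally expose the raw
  // generative output and the prompt (including its responseOffsets) for debugging.
  console.log(result.claims);
  console.log(result.halloumiPrompt.responseOffsets);
  return result;
}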