Commit 3de2525: "Updates"

1 parent 28f4ad1

File tree

11 files changed: +228 -45 lines
Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
+---
+"evalite": patch
+---
+
+Added a cache debug mode via the cacheDebug option in runEvalite to debug cache hits and misses.
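For reference, this is how the new flag reaches the runner when it is invoked programmatically. A minimal sketch, not the package's documented API: the import path and the surrounding options are assumptions; only cacheDebug itself comes from this commit.

import { runEvalite } from "evalite/runner"; // hypothetical import path

await runEvalite({
  cwd: process.cwd(),
  mode: "run-once-and-exit",
  cacheDebug: true, // print [CACHE] hit/miss lines for tasks and scorers
});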

apps/evalite-ui/app/routes/suite.$name.tsx

Lines changed: 2 additions & 1 deletion
@@ -129,7 +129,8 @@ function EvalTableRow({
             <Zap className="size-4 text-accent-foreground" />
           </TooltipTrigger>
           <TooltipContent>
-            {cacheHitCount} cache {cacheHitCount === 1 ? "hit" : "hits"}
+            {cacheHitCount} LLM{" "}
+            {cacheHitCount === 1 ? "call was cached" : "calls were cached"}
           </TooltipContent>
         </Tooltip>
       </TableCell>
Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
+import { expect, it } from "vitest";
+import { loadFixture } from "./test-utils.js";
+
+it("Should cache AI SDK in the task and scorers", async () => {
+  await using fixture = await loadFixture("ai-sdk-caching");
+
+  // First run - should log cache misses
+  await fixture.run({
+    mode: "run-once-and-exit",
+    cacheDebug: true,
+  });
+
+  // Second run - should log cache hits
+  await fixture.run({
+    mode: "run-once-and-exit",
+    cacheDebug: true,
+  });
+
+  const output = fixture.getOutput();
+
+  const storage = fixture.storage;
+
+  const runs = await storage.runs.getMany();
+
+  expect(runs).toHaveLength(2);
+
+  const allLogs = fixture.getOutput().split("\n");
+
+  const cachelogs = allLogs.filter((log) => log.includes("[CACHE]"));
+  expect(cachelogs.length).toBeGreaterThan(0);
+  expect(cachelogs.some((log) => log.includes("Task cache HIT"))).toBe(true);
+  expect(cachelogs.some((log) => log.includes("Scorer cache HIT"))).toBe(true);
+  expect(cachelogs.some((log) => log.includes("saved"))).toBe(true);
+});
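On the second run, the assertions above expect the captured output to contain lines in this shape (format taken from the console.log templates in packages/evalite/src/evalite.ts below; the durations are illustrative):

[CACHE] Task cache HIT (saved 142ms)
[CACHE] Scorer cache HIT (saved 38ms)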
Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
+import { generateText } from "ai";
+import { MockLanguageModelV2 } from "ai/test";
+import { wrapAISDKModel } from "evalite/ai-sdk";
+import { evalite } from "evalite";
+
+const model = new MockLanguageModelV2({
+  doGenerate: async (options) => ({
+    rawCall: { rawPrompt: null, rawSettings: {} },
+    finishReason: "stop",
+    usage: { inputTokens: 10, outputTokens: 20, totalTokens: 30 },
+    content: [{ type: "text", text: `Response for task` }],
+    warnings: [],
+    providerMetadata: undefined,
+    request: undefined,
+    response: undefined,
+  }),
+});
+
+const scorerModel = new MockLanguageModelV2({
+  doGenerate: async (options) => ({
+    rawCall: { rawPrompt: null, rawSettings: {} },
+    finishReason: "stop",
+    usage: { inputTokens: 5, outputTokens: 10, totalTokens: 15 },
+    content: [{ type: "text", text: `1` }],
+    warnings: [],
+    providerMetadata: undefined,
+    request: undefined,
+    response: undefined,
+  }),
+});
+
+const tracedModel = wrapAISDKModel(model);
+const tracedScorerModel = wrapAISDKModel(scorerModel);
+
+evalite("AI SDK Caching", {
+  data: () => {
+    return [
+      {
+        input: "test input 1",
+        expected: "expected output 1",
+      },
+      {
+        input: "test input 2",
+        expected: "expected output 2",
+      },
+    ];
+  },
+  task: async (input) => {
+    const result = await generateText({
+      model: tracedModel,
+      prompt: input,
+    });
+    return result.text;
+  },
+  scorers: [
+    {
+      name: "AI Scorer",
+      scorer: async ({ input, output, expected }) => {
+        const result = await generateText({
+          model: tracedScorerModel,
+          prompt: `Score this: ${output}`,
+        });
+        return { score: 1 };
+      },
+    },
+  ],
+});
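The fixture wraps both mock models with the default options. Per the wrapper changes below, tracing and caching can each be opted out and both default to true; a short sketch (option names taken from the options?.tracing and options?.caching reads in ai-sdk.ts):

const cachedOnly = wrapAISDKModel(model, { tracing: false });
const tracedOnly = wrapAISDKModel(model, { caching: false });
// With both disabled the wrapper returns the original model untouched:
const plain = wrapAISDKModel(model, { tracing: false, caching: false });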

packages/evalite-tests/tests/test-utils.ts

Lines changed: 2 additions & 0 deletions
@@ -61,13 +61,15 @@ export const loadFixture = async (
      * Set this to true if your test needs the server running (e.g., for cache functionality).
      */
     enableServer?: boolean;
+    cacheDebug?: boolean;
   }) => {
     const result = await runEvalite({
       ...opts,
       cwd: dirPath,
       storage,
       testOutputWritable: captured.writable,
       disableServer: !opts.enableServer,
+      cacheDebug: opts.cacheDebug ?? false,
     });
     vitestInstance = result.vitest;
     return vitestInstance;

packages/evalite/src/ai-sdk.ts

Lines changed: 30 additions & 30 deletions
@@ -4,7 +4,7 @@ import type {
   LanguageModelV2StreamPart,
 } from "@ai-sdk/provider";
 import { wrapLanguageModel } from "ai";
-import { reportTrace, shouldReportTrace } from "./traces.js";
+import { reportTraceLocalStorage } from "./traces.js";
 import { getCacheContext, generateCacheKey } from "./cache.js";
@@ -95,14 +95,8 @@
   const enableTracing = options?.tracing ?? true;
   const enableCaching = options?.caching ?? true;

-  const context = getCacheContext();
-  const cachingAvailable =
-    enableCaching && (context?.cacheEnabled ?? true) && context?.serverPort;
-
-  const tracingAvailable = enableTracing && shouldReportTrace();
-
   // If neither is enabled/available, return original model
-  if (!cachingAvailable && !tracingAvailable) {
+  if (!enableCaching && !enableTracing) {
     return model;
   }

@@ -112,10 +106,10 @@
     wrapGenerate: async (opts) => {
       const start = performance.now();
       let result: Awaited<ReturnType<typeof opts.doGenerate>> | undefined;
+      const cacheContext = getCacheContext();

       // Try cache if enabled
-      if (cachingAvailable) {
-        const context = getCacheContext()!;
+      if (cacheContext) {
         const keyHash = generateCacheKey({
           model: model.modelId,
           params: opts.params,
@@ -125,7 +119,7 @@

         try {
           const cacheResponse = await fetch(
-            `http://localhost:${context.serverPort}/api/cache/${keyHash}`
+            `http://localhost:${cacheContext.serverPort}/api/cache/${keyHash}`
           );

           if (cacheResponse.ok) {
@@ -134,7 +128,7 @@
               duration: number;
             };
             if (cached?.value) {
-              context.reportCacheHit({
+              cacheContext.reportCacheHit({
                 keyHash,
                 hit: true,
                 savedDuration: cached.duration,
@@ -161,8 +155,7 @@
       const duration = performance.now() - start;

       // Store in cache if caching enabled
-      if (cachingAvailable) {
-        const context = getCacheContext()!;
+      if (cacheContext) {
         const keyHash = generateCacheKey({
           model: model.modelId,
           params: opts.params,
@@ -172,7 +165,7 @@

         try {
           await fetch(
-            `http://localhost:${context.serverPort}/api/cache/${keyHash}`,
+            `http://localhost:${cacheContext.serverPort}/api/cache/${keyHash}`,
             {
               method: "POST",
               headers: { "Content-Type": "application/json" },
@@ -183,12 +176,18 @@
           console.warn("Cache write failed:", error);
         }

-        context.reportCacheHit({ keyHash, hit: false, savedDuration: 0 });
+        cacheContext.reportCacheHit({
+          keyHash,
+          hit: false,
+          savedDuration: 0,
+        });
       }

+      const reportTraceFromContext = reportTraceLocalStorage.getStore();
+
       // Report trace if enabled
-      if (tracingAvailable) {
+      if (reportTraceFromContext) {
         const end = performance.now();
         const textContent = result.content
           .filter((c) => c.type === "text")
@@ -208,7 +207,7 @@
           )
           .filter(Boolean);

-        reportTrace({
+        reportTraceFromContext({
           output: {
             text: textContent,
             toolCalls,
@@ -230,9 +229,11 @@
       const start = performance.now();
       let cachedParts: LanguageModelV2StreamPart[] | undefined;

+      const cacheContext = getCacheContext();
+      const reportTraceFromContext = reportTraceLocalStorage.getStore();
+
       // Try cache if enabled
-      if (cachingAvailable) {
-        const context = getCacheContext()!;
+      if (cacheContext) {
         const keyHash = generateCacheKey({
           model: model.modelId,
           params: params,
@@ -242,7 +243,7 @@

         try {
           const cacheResponse = await fetch(
-            `http://localhost:${context.serverPort}/api/cache/${keyHash}`
+            `http://localhost:${cacheContext.serverPort}/api/cache/${keyHash}`
           );

           if (cacheResponse.ok) {
@@ -251,7 +252,7 @@
               duration: number;
             };
             if (cached?.value) {
-              context.reportCacheHit({
+              cacheContext.reportCacheHit({
                 keyHash,
                 hit: true,
                 savedDuration: cached.duration,
@@ -260,12 +261,12 @@
              cachedParts = cached.value as LanguageModelV2StreamPart[];

              // If tracing enabled, report trace for cached stream
-              if (tracingAvailable) {
+              if (reportTraceFromContext) {
                const usage = cachedParts.find(
                  (part) => part.type === "finish"
                )?.usage;

-                reportTrace({
+                reportTraceFromContext({
                  start,
                  end: performance.now(),
                  input: processPromptForTracing(params.prompt),
@@ -319,8 +320,7 @@
       const duration = performance.now() - start;

       // Store in cache if enabled
-      if (cachingAvailable) {
-        const context = getCacheContext()!;
+      if (cacheContext) {
         const keyHash = generateCacheKey({
           model: model.modelId,
           params: params,
@@ -330,7 +330,7 @@

         try {
           await fetch(
-            `http://localhost:${context.serverPort}/api/cache/${keyHash}`,
+            `http://localhost:${cacheContext.serverPort}/api/cache/${keyHash}`,
             {
               method: "POST",
               headers: { "Content-Type": "application/json" },
@@ -344,20 +344,20 @@
           console.warn("Cache write failed:", error);
         }

-        context.reportCacheHit({
+        cacheContext.reportCacheHit({
           keyHash,
           hit: false,
           savedDuration: 0,
         });
       }

       // Report trace if enabled
-      if (tracingAvailable) {
+      if (reportTraceFromContext) {
         const usage = fullResponse.find(
           (part) => part.type === "finish"
         )?.usage;

-        reportTrace({
+        reportTraceFromContext({
           start,
           end: performance.now(),
           input: processPromptForTracing(params.prompt),
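The pattern repeated across these hunks: availability is no longer computed once at wrap time (the old cachingAvailable / tracingAvailable flags) but read from AsyncLocalStorage on every call, so a model wrapped at module scope still sees the context the runner enters later. A minimal sketch of that pattern, with the trace payload simplified (the traces.js internals are not shown in this commit):

import { AsyncLocalStorage } from "node:async_hooks";

type Trace = { start: number; end: number };

const reportTraceLocalStorage = new AsyncLocalStorage<(trace: Trace) => void>();

const traces: Trace[] = [];

// The runner enters a reporter for the current async scope...
reportTraceLocalStorage.enterWith((trace) => traces.push(trace));

// ...and the wrapped model looks it up lazily, per call:
const reportTrace = reportTraceLocalStorage.getStore();
reportTrace?.({ start: 0, end: performance.now() });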

packages/evalite/src/cache.ts

Lines changed: 0 additions & 1 deletion
@@ -5,7 +5,6 @@ export type CacheContextConfig = {
   trialCount: number | undefined;
   evalName: string;
   serverPort: number;
-  cacheEnabled: boolean;
 };

 export interface CacheContext extends CacheContextConfig {
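With cacheEnabled gone, the on/off switch is simply whether a cache context exists in the current async scope (the if (cacheContext) checks above). For reference, the resulting shape; the reportCacheHit signature is inferred from its call sites rather than shown verbatim in this commit:

export type CacheContextConfig = {
  trialCount: number | undefined;
  evalName: string;
  serverPort: number;
};

export interface CacheContext extends CacheContextConfig {
  reportCacheHit: (hit: {
    keyHash: string;
    hit: boolean;
    savedDuration: number;
  }) => void;
}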

packages/evalite/src/evalite.ts

Lines changed: 17 additions & 2 deletions
@@ -60,6 +60,7 @@ const runTask = async <TInput, TOutput, TExpected, TVariant = undefined>(
     variant: TVariant;
     traces: Evalite.Trace[];
     cacheContext: CacheContextConfig;
+    cacheDebug: boolean;
   } & Omit<Evalite.RunnerOpts<TInput, TOutput, TExpected, TVariant>, "data">
 ) => {
   const start = performance.now();
@@ -81,6 +82,11 @@
       ...opts.cacheContext,
       reportCacheHit: (hit) => {
         scorerCacheHits.push(hit);
+        if (opts.cacheDebug) {
+          console.log(
+            `[CACHE] Scorer cache ${hit.hit ? "HIT" : "MISS"}${hit.hit ? ` (saved ${hit.savedDuration.toFixed(0)}ms)` : ""}`
+          );
+        }
       },
     },
     async (): Promise<Evalite.ScoreWithCacheHits> => {
@@ -347,12 +353,20 @@ function registerEvalite<TInput, TOutput, TExpected>(
     trialCount: inject("trialCount"),
     evalName: evalName,
     serverPort: inject("serverPort"),
-    cacheEnabled: inject("cacheEnabled"),
   };

+  const cacheDebug = inject("cacheDebug");
+
   cacheContextLocalStorage.enterWith({
     ...cacheContext,
-    reportCacheHit: (hit) => taskCacheHits.push(hit),
+    reportCacheHit: (hit) => {
+      taskCacheHits.push(hit);
+      if (cacheDebug) {
+        console.log(
+          `[CACHE] Task cache HIT (saved ${hit.savedDuration.toFixed(0)}ms)`
+        );
+      }
+    },
   });

@@ -375,6 +389,7 @@
     columns: opts.columns,
     traces,
     cacheContext,
+    cacheDebug,
   });

   const [outputWithFiles, tracesWithFiles, renderedColumns] =
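The inject("cacheDebug") call needs a matching provide on the runner side, which lives in one of the files this view does not expand. A hypothetical sketch of that side, assuming Vitest's standard globalSetup provide/inject mechanism:

import type { GlobalSetupContext } from "vitest/node";

export default function globalSetup({ provide }: GlobalSetupContext) {
  // Hypothetical: mirrors how serverPort and trialCount would be provided.
  provide("cacheDebug", true);
}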
