Skip to content

Commit 28f4ad1

Browse files
committed
Got it half working
1 parent 478f0ed commit 28f4ad1

File tree

10 files changed

+135
-57
lines changed

10 files changed

+135
-57
lines changed

apps/evalite-ui/app/routes/suite.$name.tsx

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ type EvalTableRowProps = {
7171
hasScores: boolean;
7272
prevSuite: Evalite.SDK.GetSuiteByNameResult["prevSuite"];
7373
cacheHitCount: number;
74+
cacheHitsByScorer: Record<string, number>;
7475
trialConfig?: {
7576
isFirstTrial: boolean;
7677
rowSpan: number;
@@ -111,6 +112,7 @@ function EvalTableRow({
111112
hasScores,
112113
prevSuite: prevEvaluation,
113114
cacheHitCount,
115+
cacheHitsByScorer,
114116
trialConfig,
115117
}: EvalTableRowProps) {
116118
const Wrapper = useMemo(
@@ -202,10 +204,24 @@ function EvalTableRow({
202204
const scoreInPreviousEvaluation = prevEvaluation?.evals
203205
.find((r) => r.input === _eval.input)
204206
?.scores.find((s) => s.name === scorer.name);
207+
const scorerCacheHitCount = cacheHitsByScorer[scorer.name] ?? 0;
205208
return (
206209
<TableCell key={scorer.id} className={cn(index === 0 && "border-l")}>
207210
<Wrapper>
208211
<div className="flex items-center gap-2">
212+
{scorerCacheHitCount > 0 && (
213+
<Tooltip>
214+
<TooltipTrigger asChild>
215+
<Zap className="size-3 text-accent-foreground" />
216+
</TooltipTrigger>
217+
<TooltipContent>
218+
{scorerCacheHitCount} LLM{" "}
219+
{scorerCacheHitCount === 1
220+
? "call was cached"
221+
: "calls were cached"}
222+
</TooltipContent>
223+
</Tooltip>
224+
)}
209225
<Score
210226
hasScores={hasScores}
211227
score={scorer.score}
@@ -541,6 +557,8 @@ function SuiteComponent() {
541557
evaluationWithoutLayoutShift!.evals.indexOf(_eval);
542558
const cacheHitCount =
543559
serverState.cacheHitsByEval[_eval.id] ?? 0;
560+
const cacheHitsByScorer =
561+
serverState.cacheHitsByScorer[_eval.id] ?? {};
544562
return (
545563
<EvalTableRow
546564
key={`${JSON.stringify(_eval.input)}-${_eval.trial_index}`}
@@ -553,6 +571,7 @@ function SuiteComponent() {
553571
hasScores={hasScores}
554572
prevSuite={prevSuite}
555573
cacheHitCount={cacheHitCount}
574+
cacheHitsByScorer={cacheHitsByScorer}
556575
trialConfig={{
557576
isFirstTrial: trialIndex === 0,
558577
rowSpan: group.evals.length,
@@ -566,6 +585,8 @@ function SuiteComponent() {
566585
evaluationWithoutLayoutShift.evals.map((_eval, index) => {
567586
const cacheHitCount =
568587
serverState.cacheHitsByEval[_eval.id] ?? 0;
588+
const cacheHitsByScorer =
589+
serverState.cacheHitsByScorer[_eval.id] ?? {};
569590
return (
570591
<EvalTableRow
571592
key={JSON.stringify(_eval.input)}
@@ -578,6 +599,7 @@ function SuiteComponent() {
578599
hasScores={hasScores}
579600
prevSuite={prevSuite}
580601
cacheHitCount={cacheHitCount}
602+
cacheHitsByScorer={cacheHitsByScorer}
581603
/>
582604
);
583605
})}

packages/evalite/src/ai-sdk.ts

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import type {
55
} from "@ai-sdk/provider";
66
import { wrapLanguageModel } from "ai";
77
import { reportTrace, shouldReportTrace } from "./traces.js";
8-
import { getCacheContext, generateCacheKey, reportCacheHit } from "./cache.js";
8+
import { getCacheContext, generateCacheKey } from "./cache.js";
99

1010
const handlePromptContent = (
1111
content: LanguageModelV2CallOptions["prompt"][number]["content"][number]
@@ -98,6 +98,7 @@ export const wrapAISDKModel = (
9898
const context = getCacheContext();
9999
const cachingAvailable =
100100
enableCaching && (context?.cacheEnabled ?? true) && context?.serverPort;
101+
101102
const tracingAvailable = enableTracing && shouldReportTrace();
102103

103104
// If neither is enabled/available, return original model
@@ -133,7 +134,7 @@ export const wrapAISDKModel = (
133134
duration: number;
134135
};
135136
if (cached?.value) {
136-
reportCacheHit({
137+
context.reportCacheHit({
137138
keyHash,
138139
hit: true,
139140
savedDuration: cached.duration,
@@ -182,7 +183,7 @@ export const wrapAISDKModel = (
182183
console.warn("Cache write failed:", error);
183184
}
184185

185-
reportCacheHit({ keyHash, hit: false, savedDuration: 0 });
186+
context.reportCacheHit({ keyHash, hit: false, savedDuration: 0 });
186187
}
187188
}
188189

@@ -250,7 +251,7 @@ export const wrapAISDKModel = (
250251
duration: number;
251252
};
252253
if (cached?.value) {
253-
reportCacheHit({
254+
context.reportCacheHit({
254255
keyHash,
255256
hit: true,
256257
savedDuration: cached.duration,
@@ -343,7 +344,11 @@ export const wrapAISDKModel = (
343344
console.warn("Cache write failed:", error);
344345
}
345346

346-
reportCacheHit({ keyHash, hit: false, savedDuration: 0 });
347+
context.reportCacheHit({
348+
keyHash,
349+
hit: false,
350+
savedDuration: 0,
351+
});
347352
}
348353

349354
// Report trace if enabled

packages/evalite/src/cache.ts

Lines changed: 12 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,22 @@
11
import { AsyncLocalStorage } from "node:async_hooks";
22
import { createHash } from "node:crypto";
33

4-
export const cacheContextLocalStorage = new AsyncLocalStorage<{
4+
export type CacheContextConfig = {
55
trialCount: number | undefined;
66
evalName: string;
77
serverPort: number;
88
cacheEnabled: boolean;
9-
}>();
9+
};
10+
11+
export interface CacheContext extends CacheContextConfig {
12+
reportCacheHit: (hit: {
13+
keyHash: string;
14+
hit: boolean;
15+
savedDuration: number;
16+
}) => void;
17+
}
18+
19+
export const cacheContextLocalStorage = new AsyncLocalStorage<CacheContext>();
1020

1121
export const getCacheContext = () => {
1222
return cacheContextLocalStorage.getStore();
@@ -30,19 +40,3 @@ export const generateCacheKey = (params: {
3040

3141
return createHash("sha256").update(JSON.stringify(cacheObject)).digest("hex");
3242
};
33-
34-
export const reportCacheHitLocalStorage = new AsyncLocalStorage<
35-
(hit: { keyHash: string; hit: boolean; savedDuration: number }) => void
36-
>();
37-
38-
export const reportCacheHit = (hit: {
39-
keyHash: string;
40-
hit: boolean;
41-
savedDuration: number;
42-
}): void => {
43-
const _reportCacheHit = reportCacheHitLocalStorage.getStore();
44-
45-
if (_reportCacheHit) {
46-
_reportCacheHit(hit);
47-
}
48-
};

packages/evalite/src/evalite.ts

Lines changed: 52 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,7 @@ import type { Evalite } from "./types.js";
88
import { FILES_LOCATION } from "./backend-only-constants.js";
99
import { createScorer } from "./index.js";
1010
import { serializeAnnotation } from "./reporter/events.js";
11-
import {
12-
cacheContextLocalStorage,
13-
reportCacheHitLocalStorage,
14-
} from "./cache.js";
11+
import { cacheContextLocalStorage, type CacheContextConfig } from "./cache.js";
1512

1613
const makeSerializable = (obj: unknown): unknown => {
1714
try {
@@ -62,27 +59,53 @@ const runTask = async <TInput, TOutput, TExpected, TVariant = undefined>(
6259
expected: TExpected | undefined;
6360
variant: TVariant;
6461
traces: Evalite.Trace[];
62+
cacheContext: CacheContextConfig;
6563
} & Omit<Evalite.RunnerOpts<TInput, TOutput, TExpected, TVariant>, "data">
6664
) => {
6765
const start = performance.now();
6866
const output = await executeTask(opts.task, opts.input, opts.variant);
6967
const duration = Math.round(performance.now() - start);
7068

7169
const scores = await Promise.all(
72-
(opts.scorers || []).map(async (scorerOrOpts) => {
73-
if (typeof scorerOrOpts === "function") {
74-
return scorerOrOpts({
75-
input: opts.input,
76-
output,
77-
expected: opts.expected as TExpected,
78-
});
79-
} else {
80-
return createScorer(scorerOrOpts)({
81-
input: opts.input,
82-
output,
83-
expected: opts.expected as TExpected,
84-
});
85-
}
70+
(opts.scorers || []).map(async (scorerOrOpts, index) => {
71+
// Isolate scorer traces - LLM calls in scorers still get traced
72+
// but traces are discarded (not collected in parent eval)
73+
return reportTraceLocalStorage.run(
74+
() => {
75+
// no-op: discard traces
76+
},
77+
() => {
78+
const scorerCacheHits: Array<Evalite.CacheHit> = [];
79+
return cacheContextLocalStorage.run(
80+
{
81+
...opts.cacheContext,
82+
reportCacheHit: (hit) => {
83+
scorerCacheHits.push(hit);
84+
},
85+
},
86+
async (): Promise<Evalite.ScoreWithCacheHits> => {
87+
const score =
88+
typeof scorerOrOpts === "function"
89+
? await scorerOrOpts({
90+
input: opts.input,
91+
output,
92+
expected: opts.expected as TExpected,
93+
})
94+
: await createScorer(scorerOrOpts)({
95+
input: opts.input,
96+
output,
97+
expected: opts.expected as TExpected,
98+
});
99+
100+
// Always attach the cache events recorded for this scorer (empty array if none)
101+
return {
102+
...score,
103+
cacheHits: scorerCacheHits,
104+
};
105+
}
106+
);
107+
}
108+
);
86109
})
87110
);
88111

@@ -224,7 +247,7 @@ function registerEvalite<TInput, TOutput, TExpected>(
224247
output: datasetResult.error,
225248
scores: [],
226249
traces: [],
227-
cacheHits: [],
250+
taskCacheHits: [],
228251
renderedColumns: [],
229252
},
230253
})
@@ -318,18 +341,18 @@ function registerEvalite<TInput, TOutput, TExpected>(
318341
const traces: Evalite.Trace[] = [];
319342
reportTraceLocalStorage.enterWith((trace) => traces.push(trace));
320343

321-
const cacheHits: Array<{
322-
keyHash: string;
323-
hit: boolean;
324-
savedDuration: number;
325-
}> = [];
326-
reportCacheHitLocalStorage.enterWith((hit) => cacheHits.push(hit));
344+
const taskCacheHits: Array<Evalite.CacheHit> = [];
327345

328-
cacheContextLocalStorage.enterWith({
346+
const cacheContext: CacheContextConfig = {
329347
trialCount: inject("trialCount"),
330348
evalName: evalName,
331349
serverPort: inject("serverPort"),
332350
cacheEnabled: inject("cacheEnabled"),
351+
};
352+
353+
cacheContextLocalStorage.enterWith({
354+
...cacheContext,
355+
reportCacheHit: (hit) => taskCacheHits.push(hit),
333356
});
334357

335358
const [inputForMeta, expectedForMeta] = await Promise.all([
@@ -351,6 +374,7 @@ function registerEvalite<TInput, TOutput, TExpected>(
351374
task: opts.task,
352375
columns: opts.columns,
353376
traces,
377+
cacheContext,
354378
});
355379

356380
const [outputWithFiles, tracesWithFiles, renderedColumns] =
@@ -379,7 +403,7 @@ function registerEvalite<TInput, TOutput, TExpected>(
379403
output: serializableOutput,
380404
scores,
381405
traces: tracesWithFiles,
382-
cacheHits,
406+
taskCacheHits: taskCacheHits,
383407
status: "success",
384408
renderedColumns,
385409
variantName: vitestOpts.variantName,
@@ -415,7 +439,7 @@ function registerEvalite<TInput, TOutput, TExpected>(
415439
output: serializedError,
416440
scores: [],
417441
traces: await handleFilesInTraces(rootDir, traces),
418-
cacheHits,
442+
taskCacheHits: taskCacheHits,
419443
status: "fail",
420444
renderedColumns: [],
421445
variantName: vitestOpts.variantName,

packages/evalite/src/export-static.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,7 @@ export const exportStaticUI = async (
275275
const serverState: Evalite.ServerState = {
276276
type: "idle",
277277
cacheHitsByEval: {},
278+
cacheHitsByScorer: {},
278279
};
279280
await fs.writeFile(
280281
path.join(dataDir, "server-state.json"),

packages/evalite/src/reporter.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -368,7 +368,7 @@ export default class EvaliteReporter implements Reporter {
368368
output: null,
369369
scores: [],
370370
traces: [],
371-
cacheHits: [],
371+
taskCacheHits: [],
372372
status: "fail",
373373
renderedColumns: [],
374374
variantName: data.initialEval.variantName,

packages/evalite/src/reporter/EvaliteRunner.ts

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,11 @@ export interface EvaliteRunnerOptions {
1010

1111
export class EvaliteRunner {
1212
private opts: EvaliteRunnerOptions;
13-
private state: Evalite.ServerState = { type: "idle", cacheHitsByEval: {} };
13+
private state: Evalite.ServerState = {
14+
type: "idle",
15+
cacheHitsByEval: {},
16+
cacheHitsByScorer: {},
17+
};
1418
private didLastRunFailThreshold: "yes" | "no" | "unknown" = "unknown";
1519
private collectedResults: Map<string, Evalite.Eval> = new Map();
1620
private eventQueue: Promise<void> = Promise.resolve();
@@ -121,6 +125,7 @@ export class EvaliteRunner {
121125
this.updateState({
122126
type: "idle",
123127
cacheHitsByEval: this.state.cacheHitsByEval,
128+
cacheHitsByScorer: this.state.cacheHitsByScorer,
124129
});
125130
break;
126131
case "EVAL_STARTED":
@@ -271,13 +276,25 @@ export class EvaliteRunner {
271276
}
272277

273278
// Count cache hits for this eval
274-
const cacheHitCount = event.eval.cacheHits.filter(
279+
const cacheHitCount = event.eval.taskCacheHits.filter(
275280
(hit) => hit.hit
276281
).length;
277282
if (cacheHitCount > 0) {
278283
this.state.cacheHitsByEval[evalId] = cacheHitCount;
279284
}
280285

286+
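// Count cache hits per scorer. NOTE(review): this stores scorerCacheHits.length, which also counts entries reported with hit: false; the task count above filters on hit.hit - confirm whether misses should be excluded here too.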
287+
if (!this.state.cacheHitsByScorer[evalId]) {
288+
this.state.cacheHitsByScorer[evalId] = {};
289+
}
290+
for (const score of event.eval.scores) {
291+
const scorerCacheHits = score.cacheHits;
292+
if (scorerCacheHits && scorerCacheHits.length > 0) {
293+
this.state.cacheHitsByScorer[evalId]![score.name] =
294+
scorerCacheHits.length;
295+
}
296+
}
297+
281298
const allEvals = await this.opts.storage.evals.getMany({
282299
suiteIds: [suite.id],
283300
});
@@ -327,6 +344,7 @@ export class EvaliteRunner {
327344
// Clear collected results for new run
328345
this.collectedResults.clear();
329346
this.state.cacheHitsByEval = {};
347+
this.state.cacheHitsByScorer = {};
330348

331349
this.updateState({
332350
filepaths: event.filepaths,
@@ -336,6 +354,7 @@ export class EvaliteRunner {
336354
suiteNamesRunning: [],
337355
evalIdsRunning: [],
338356
cacheHitsByEval: this.state.cacheHitsByEval,
357+
cacheHitsByScorer: this.state.cacheHitsByScorer,
339358
});
340359
break;
341360
}

0 commit comments

Comments
 (0)