Skip to content

Commit bff00f6

Browse files
authored
Merge pull request #30 from ArshVermaGit/main
feat: production-grade baseline agent with LLM mode, batch evaluation, and CSV/JSON export
2 parents ea85d55 + 0d95482 commit bff00f6

3 files changed

Lines changed: 768 additions & 68 deletions

File tree

results.json

Lines changed: 362 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,362 @@
1+
[
2+
{
3+
"episode_id": "d35a5286-23e7-4a53-a34c-6ca93f4e7134",
4+
"task_id": "bug_detection",
5+
"seed": 0,
6+
"final_score": 0.0,
7+
"steps_taken": 1,
8+
"issues_found": 0,
9+
"issues_total": 1,
10+
"noise_penalties": 0,
11+
"terminated_reason": "terminal_action",
12+
"duration_seconds": 0.01
13+
},
14+
{
15+
"episode_id": "9c81d2b3-f0dd-4efc-915e-4b7dfcf355ef",
16+
"task_id": "bug_detection",
17+
"seed": 1,
18+
"final_score": 0.0,
19+
"steps_taken": 1,
20+
"issues_found": 0,
21+
"issues_total": 1,
22+
"noise_penalties": 0,
23+
"terminated_reason": "terminal_action",
24+
"duration_seconds": 0.01
25+
},
26+
{
27+
"episode_id": "38fba47b-2915-4fba-89ef-865834bcc67b",
28+
"task_id": "bug_detection",
29+
"seed": 2,
30+
"final_score": 0.9167,
31+
"steps_taken": 6,
32+
"issues_found": 1,
33+
"issues_total": 1,
34+
"noise_penalties": 5,
35+
"terminated_reason": "noise_exhausted",
36+
"duration_seconds": 0.02
37+
},
38+
{
39+
"episode_id": "ce85c7b9-2c34-4d29-96e6-83b66da4c4a2",
40+
"task_id": "bug_detection",
41+
"seed": 3,
42+
"final_score": 0.9167,
43+
"steps_taken": 6,
44+
"issues_found": 1,
45+
"issues_total": 1,
46+
"noise_penalties": 5,
47+
"terminated_reason": "noise_exhausted",
48+
"duration_seconds": 0.02
49+
},
50+
{
51+
"episode_id": "03b43be8-968b-4d35-8cb6-4a4a7211061d",
52+
"task_id": "bug_detection",
53+
"seed": 4,
54+
"final_score": 0.8267,
55+
"steps_taken": 6,
56+
"issues_found": 1,
57+
"issues_total": 1,
58+
"noise_penalties": 5,
59+
"terminated_reason": "noise_exhausted",
60+
"duration_seconds": 0.03
61+
},
62+
{
63+
"episode_id": "1acad7bc-2374-4d70-95ad-f5536ecc22a6",
64+
"task_id": "bug_detection",
65+
"seed": 5,
66+
"final_score": 0.0,
67+
"steps_taken": 1,
68+
"issues_found": 0,
69+
"issues_total": 1,
70+
"noise_penalties": 0,
71+
"terminated_reason": "terminal_action",
72+
"duration_seconds": 0.01
73+
},
74+
{
75+
"episode_id": "fa84dd18-e38c-412d-a252-206a514fc352",
76+
"task_id": "bug_detection",
77+
"seed": 6,
78+
"final_score": 0.0,
79+
"steps_taken": 1,
80+
"issues_found": 0,
81+
"issues_total": 1,
82+
"noise_penalties": 0,
83+
"terminated_reason": "terminal_action",
84+
"duration_seconds": 0.01
85+
},
86+
{
87+
"episode_id": "c43cf6db-d5ca-4c45-871d-1a0bc64602fa",
88+
"task_id": "bug_detection",
89+
"seed": 7,
90+
"final_score": 0.0,
91+
"steps_taken": 1,
92+
"issues_found": 0,
93+
"issues_total": 1,
94+
"noise_penalties": 0,
95+
"terminated_reason": "terminal_action",
96+
"duration_seconds": 0.02
97+
},
98+
{
99+
"episode_id": "7dcff1f7-41f4-483f-8fab-caa6d62f5b66",
100+
"task_id": "bug_detection",
101+
"seed": 8,
102+
"final_score": 0.9167,
103+
"steps_taken": 6,
104+
"issues_found": 1,
105+
"issues_total": 1,
106+
"noise_penalties": 5,
107+
"terminated_reason": "noise_exhausted",
108+
"duration_seconds": 0.02
109+
},
110+
{
111+
"episode_id": "b379af5c-4096-45fd-95fe-534a0bf4a7af",
112+
"task_id": "bug_detection",
113+
"seed": 9,
114+
"final_score": 0.0,
115+
"steps_taken": 5,
116+
"issues_found": 0,
117+
"issues_total": 1,
118+
"noise_penalties": 5,
119+
"terminated_reason": "noise_exhausted",
120+
"duration_seconds": 0.02
121+
},
122+
{
123+
"episode_id": "ee70e3aa-fbaf-4a2e-8b5e-fc62a8a93192",
124+
"task_id": "security_audit",
125+
"seed": 0,
126+
"final_score": 0.0,
127+
"steps_taken": 5,
128+
"issues_found": 0,
129+
"issues_total": 1,
130+
"noise_penalties": 5,
131+
"terminated_reason": "noise_exhausted",
132+
"duration_seconds": 0.02
133+
},
134+
{
135+
"episode_id": "c9df9d0e-1719-4fbd-b6e8-3b5c5663a0a2",
136+
"task_id": "security_audit",
137+
"seed": 1,
138+
"final_score": 0.85,
139+
"steps_taken": 6,
140+
"issues_found": 1,
141+
"issues_total": 1,
142+
"noise_penalties": 5,
143+
"terminated_reason": "noise_exhausted",
144+
"duration_seconds": 0.02
145+
},
146+
{
147+
"episode_id": "fbf2c333-8b32-4ab8-b260-bdeb2ccda91b",
148+
"task_id": "security_audit",
149+
"seed": 2,
150+
"final_score": 0.0,
151+
"steps_taken": 5,
152+
"issues_found": 0,
153+
"issues_total": 1,
154+
"noise_penalties": 5,
155+
"terminated_reason": "noise_exhausted",
156+
"duration_seconds": 0.04
157+
},
158+
{
159+
"episode_id": "4fd0a956-7b46-4819-b59d-5e54bec65311",
160+
"task_id": "security_audit",
161+
"seed": 3,
162+
"final_score": 0.775,
163+
"steps_taken": 6,
164+
"issues_found": 1,
165+
"issues_total": 1,
166+
"noise_penalties": 5,
167+
"terminated_reason": "noise_exhausted",
168+
"duration_seconds": 0.03
169+
},
170+
{
171+
"episode_id": "ee98565e-4fc1-430c-8463-c0bcd801f107",
172+
"task_id": "security_audit",
173+
"seed": 4,
174+
"final_score": 0.0,
175+
"steps_taken": 5,
176+
"issues_found": 0,
177+
"issues_total": 1,
178+
"noise_penalties": 5,
179+
"terminated_reason": "noise_exhausted",
180+
"duration_seconds": 0.03
181+
},
182+
{
183+
"episode_id": "7a5a3689-5f55-4f1c-8c8d-81cfaa1e35e6",
184+
"task_id": "security_audit",
185+
"seed": 5,
186+
"final_score": 0.0,
187+
"steps_taken": 5,
188+
"issues_found": 0,
189+
"issues_total": 1,
190+
"noise_penalties": 5,
191+
"terminated_reason": "noise_exhausted",
192+
"duration_seconds": 0.02
193+
},
194+
{
195+
"episode_id": "1a2c2666-389e-4835-8aab-7e7ff63a2511",
196+
"task_id": "security_audit",
197+
"seed": 6,
198+
"final_score": 0.0,
199+
"steps_taken": 5,
200+
"issues_found": 0,
201+
"issues_total": 1,
202+
"noise_penalties": 5,
203+
"terminated_reason": "noise_exhausted",
204+
"duration_seconds": 0.02
205+
},
206+
{
207+
"episode_id": "9e78465a-b7d6-4ca8-8aae-761a2e55be82",
208+
"task_id": "security_audit",
209+
"seed": 7,
210+
"final_score": 0.0,
211+
"steps_taken": 5,
212+
"issues_found": 0,
213+
"issues_total": 1,
214+
"noise_penalties": 5,
215+
"terminated_reason": "noise_exhausted",
216+
"duration_seconds": 0.02
217+
},
218+
{
219+
"episode_id": "e59ee756-fbf1-4aa1-ac42-d1cb23079d88",
220+
"task_id": "security_audit",
221+
"seed": 8,
222+
"final_score": 0.0,
223+
"steps_taken": 5,
224+
"issues_found": 0,
225+
"issues_total": 1,
226+
"noise_penalties": 5,
227+
"terminated_reason": "noise_exhausted",
228+
"duration_seconds": 0.02
229+
},
230+
{
231+
"episode_id": "f573727f-ac41-47ba-bcb9-55495da61615",
232+
"task_id": "security_audit",
233+
"seed": 9,
234+
"final_score": 0.0,
235+
"steps_taken": 5,
236+
"issues_found": 0,
237+
"issues_total": 1,
238+
"noise_penalties": 5,
239+
"terminated_reason": "noise_exhausted",
240+
"duration_seconds": 0.02
241+
},
242+
{
243+
"episode_id": "0c368016-4685-4699-abf0-d74337a3ea8d",
244+
"task_id": "architectural_review",
245+
"seed": 0,
246+
"final_score": 0.0,
247+
"steps_taken": 1,
248+
"issues_found": 0,
249+
"issues_total": 1,
250+
"noise_penalties": 0,
251+
"terminated_reason": "terminal_action",
252+
"duration_seconds": 0.01
253+
},
254+
{
255+
"episode_id": "5dbf1824-e62b-4491-aaf2-c6ec3a2ae597",
256+
"task_id": "architectural_review",
257+
"seed": 1,
258+
"final_score": 0.059,
259+
"steps_taken": 5,
260+
"issues_found": 0,
261+
"issues_total": 1,
262+
"noise_penalties": 5,
263+
"terminated_reason": "noise_exhausted",
264+
"duration_seconds": 0.02
265+
},
266+
{
267+
"episode_id": "b2249f5c-8e6a-4ee4-b973-2dd428613a7c",
268+
"task_id": "architectural_review",
269+
"seed": 2,
270+
"final_score": 0.661,
271+
"steps_taken": 6,
272+
"issues_found": 1,
273+
"issues_total": 1,
274+
"noise_penalties": 5,
275+
"terminated_reason": "noise_exhausted",
276+
"duration_seconds": 0.02
277+
},
278+
{
279+
"episode_id": "0e58c8c0-efa1-4c16-9002-6d48e8f82439",
280+
"task_id": "architectural_review",
281+
"seed": 3,
282+
"final_score": 0.658,
283+
"steps_taken": 5,
284+
"issues_found": 0,
285+
"issues_total": 1,
286+
"noise_penalties": 5,
287+
"terminated_reason": "noise_exhausted",
288+
"duration_seconds": 0.02
289+
},
290+
{
291+
"episode_id": "69cf00eb-5a20-4347-9887-f9806026a66b",
292+
"task_id": "architectural_review",
293+
"seed": 4,
294+
"final_score": 0.058,
295+
"steps_taken": 5,
296+
"issues_found": 0,
297+
"issues_total": 1,
298+
"noise_penalties": 5,
299+
"terminated_reason": "noise_exhausted",
300+
"duration_seconds": 0.02
301+
},
302+
{
303+
"episode_id": "233ff87c-475f-4485-bd76-9abab4d2a304",
304+
"task_id": "architectural_review",
305+
"seed": 5,
306+
"final_score": 0.657,
307+
"steps_taken": 6,
308+
"issues_found": 1,
309+
"issues_total": 1,
310+
"noise_penalties": 5,
311+
"terminated_reason": "noise_exhausted",
312+
"duration_seconds": 0.02
313+
},
314+
{
315+
"episode_id": "89210c97-a95a-49c8-a9d1-8dbe6db92238",
316+
"task_id": "architectural_review",
317+
"seed": 6,
318+
"final_score": 0.059,
319+
"steps_taken": 5,
320+
"issues_found": 0,
321+
"issues_total": 1,
322+
"noise_penalties": 5,
323+
"terminated_reason": "noise_exhausted",
324+
"duration_seconds": 0.02
325+
},
326+
{
327+
"episode_id": "80c89d9d-92e9-4fbc-9a4f-401848c92cce",
328+
"task_id": "architectural_review",
329+
"seed": 7,
330+
"final_score": 0.664,
331+
"steps_taken": 6,
332+
"issues_found": 1,
333+
"issues_total": 1,
334+
"noise_penalties": 5,
335+
"terminated_reason": "noise_exhausted",
336+
"duration_seconds": 0.02
337+
},
338+
{
339+
"episode_id": "325d65a3-94e7-40f8-90a6-d93bac2cbd9e",
340+
"task_id": "architectural_review",
341+
"seed": 8,
342+
"final_score": 0.039,
343+
"steps_taken": 5,
344+
"issues_found": 0,
345+
"issues_total": 1,
346+
"noise_penalties": 5,
347+
"terminated_reason": "noise_exhausted",
348+
"duration_seconds": 0.02
349+
},
350+
{
351+
"episode_id": "d94abdb2-90c6-424a-9a26-e798a2ea9b13",
352+
"task_id": "architectural_review",
353+
"seed": 9,
354+
"final_score": 0.075,
355+
"steps_taken": 5,
356+
"issues_found": 0,
357+
"issues_total": 1,
358+
"noise_penalties": 5,
359+
"terminated_reason": "noise_exhausted",
360+
"duration_seconds": 0.02
361+
}
362+
]

0 commit comments

Comments
 (0)