1+ [
2+ {
3+ "episode_id" : " d35a5286-23e7-4a53-a34c-6ca93f4e7134" ,
4+ "task_id" : " bug_detection" ,
5+ "seed" : 0 ,
6+ "final_score" : 0.0 ,
7+ "steps_taken" : 1 ,
8+ "issues_found" : 0 ,
9+ "issues_total" : 1 ,
10+ "noise_penalties" : 0 ,
11+ "terminated_reason" : " terminal_action" ,
12+ "duration_seconds" : 0.01
13+ },
14+ {
15+ "episode_id" : " 9c81d2b3-f0dd-4efc-915e-4b7dfcf355ef" ,
16+ "task_id" : " bug_detection" ,
17+ "seed" : 1 ,
18+ "final_score" : 0.0 ,
19+ "steps_taken" : 1 ,
20+ "issues_found" : 0 ,
21+ "issues_total" : 1 ,
22+ "noise_penalties" : 0 ,
23+ "terminated_reason" : " terminal_action" ,
24+ "duration_seconds" : 0.01
25+ },
26+ {
27+ "episode_id" : " 38fba47b-2915-4fba-89ef-865834bcc67b" ,
28+ "task_id" : " bug_detection" ,
29+ "seed" : 2 ,
30+ "final_score" : 0.9167 ,
31+ "steps_taken" : 6 ,
32+ "issues_found" : 1 ,
33+ "issues_total" : 1 ,
34+ "noise_penalties" : 5 ,
35+ "terminated_reason" : " noise_exhausted" ,
36+ "duration_seconds" : 0.02
37+ },
38+ {
39+ "episode_id" : " ce85c7b9-2c34-4d29-96e6-83b66da4c4a2" ,
40+ "task_id" : " bug_detection" ,
41+ "seed" : 3 ,
42+ "final_score" : 0.9167 ,
43+ "steps_taken" : 6 ,
44+ "issues_found" : 1 ,
45+ "issues_total" : 1 ,
46+ "noise_penalties" : 5 ,
47+ "terminated_reason" : " noise_exhausted" ,
48+ "duration_seconds" : 0.02
49+ },
50+ {
51+ "episode_id" : " 03b43be8-968b-4d35-8cb6-4a4a7211061d" ,
52+ "task_id" : " bug_detection" ,
53+ "seed" : 4 ,
54+ "final_score" : 0.8267 ,
55+ "steps_taken" : 6 ,
56+ "issues_found" : 1 ,
57+ "issues_total" : 1 ,
58+ "noise_penalties" : 5 ,
59+ "terminated_reason" : " noise_exhausted" ,
60+ "duration_seconds" : 0.03
61+ },
62+ {
63+ "episode_id" : " 1acad7bc-2374-4d70-95ad-f5536ecc22a6" ,
64+ "task_id" : " bug_detection" ,
65+ "seed" : 5 ,
66+ "final_score" : 0.0 ,
67+ "steps_taken" : 1 ,
68+ "issues_found" : 0 ,
69+ "issues_total" : 1 ,
70+ "noise_penalties" : 0 ,
71+ "terminated_reason" : " terminal_action" ,
72+ "duration_seconds" : 0.01
73+ },
74+ {
75+ "episode_id" : " fa84dd18-e38c-412d-a252-206a514fc352" ,
76+ "task_id" : " bug_detection" ,
77+ "seed" : 6 ,
78+ "final_score" : 0.0 ,
79+ "steps_taken" : 1 ,
80+ "issues_found" : 0 ,
81+ "issues_total" : 1 ,
82+ "noise_penalties" : 0 ,
83+ "terminated_reason" : " terminal_action" ,
84+ "duration_seconds" : 0.01
85+ },
86+ {
87+ "episode_id" : " c43cf6db-d5ca-4c45-871d-1a0bc64602fa" ,
88+ "task_id" : " bug_detection" ,
89+ "seed" : 7 ,
90+ "final_score" : 0.0 ,
91+ "steps_taken" : 1 ,
92+ "issues_found" : 0 ,
93+ "issues_total" : 1 ,
94+ "noise_penalties" : 0 ,
95+ "terminated_reason" : " terminal_action" ,
96+ "duration_seconds" : 0.02
97+ },
98+ {
99+ "episode_id" : " 7dcff1f7-41f4-483f-8fab-caa6d62f5b66" ,
100+ "task_id" : " bug_detection" ,
101+ "seed" : 8 ,
102+ "final_score" : 0.9167 ,
103+ "steps_taken" : 6 ,
104+ "issues_found" : 1 ,
105+ "issues_total" : 1 ,
106+ "noise_penalties" : 5 ,
107+ "terminated_reason" : " noise_exhausted" ,
108+ "duration_seconds" : 0.02
109+ },
110+ {
111+ "episode_id" : " b379af5c-4096-45fd-95fe-534a0bf4a7af" ,
112+ "task_id" : " bug_detection" ,
113+ "seed" : 9 ,
114+ "final_score" : 0.0 ,
115+ "steps_taken" : 5 ,
116+ "issues_found" : 0 ,
117+ "issues_total" : 1 ,
118+ "noise_penalties" : 5 ,
119+ "terminated_reason" : " noise_exhausted" ,
120+ "duration_seconds" : 0.02
121+ },
122+ {
123+ "episode_id" : " ee70e3aa-fbaf-4a2e-8b5e-fc62a8a93192" ,
124+ "task_id" : " security_audit" ,
125+ "seed" : 0 ,
126+ "final_score" : 0.0 ,
127+ "steps_taken" : 5 ,
128+ "issues_found" : 0 ,
129+ "issues_total" : 1 ,
130+ "noise_penalties" : 5 ,
131+ "terminated_reason" : " noise_exhausted" ,
132+ "duration_seconds" : 0.02
133+ },
134+ {
135+ "episode_id" : " c9df9d0e-1719-4fbd-b6e8-3b5c5663a0a2" ,
136+ "task_id" : " security_audit" ,
137+ "seed" : 1 ,
138+ "final_score" : 0.85 ,
139+ "steps_taken" : 6 ,
140+ "issues_found" : 1 ,
141+ "issues_total" : 1 ,
142+ "noise_penalties" : 5 ,
143+ "terminated_reason" : " noise_exhausted" ,
144+ "duration_seconds" : 0.02
145+ },
146+ {
147+ "episode_id" : " fbf2c333-8b32-4ab8-b260-bdeb2ccda91b" ,
148+ "task_id" : " security_audit" ,
149+ "seed" : 2 ,
150+ "final_score" : 0.0 ,
151+ "steps_taken" : 5 ,
152+ "issues_found" : 0 ,
153+ "issues_total" : 1 ,
154+ "noise_penalties" : 5 ,
155+ "terminated_reason" : " noise_exhausted" ,
156+ "duration_seconds" : 0.04
157+ },
158+ {
159+ "episode_id" : " 4fd0a956-7b46-4819-b59d-5e54bec65311" ,
160+ "task_id" : " security_audit" ,
161+ "seed" : 3 ,
162+ "final_score" : 0.775 ,
163+ "steps_taken" : 6 ,
164+ "issues_found" : 1 ,
165+ "issues_total" : 1 ,
166+ "noise_penalties" : 5 ,
167+ "terminated_reason" : " noise_exhausted" ,
168+ "duration_seconds" : 0.03
169+ },
170+ {
171+ "episode_id" : " ee98565e-4fc1-430c-8463-c0bcd801f107" ,
172+ "task_id" : " security_audit" ,
173+ "seed" : 4 ,
174+ "final_score" : 0.0 ,
175+ "steps_taken" : 5 ,
176+ "issues_found" : 0 ,
177+ "issues_total" : 1 ,
178+ "noise_penalties" : 5 ,
179+ "terminated_reason" : " noise_exhausted" ,
180+ "duration_seconds" : 0.03
181+ },
182+ {
183+ "episode_id" : " 7a5a3689-5f55-4f1c-8c8d-81cfaa1e35e6" ,
184+ "task_id" : " security_audit" ,
185+ "seed" : 5 ,
186+ "final_score" : 0.0 ,
187+ "steps_taken" : 5 ,
188+ "issues_found" : 0 ,
189+ "issues_total" : 1 ,
190+ "noise_penalties" : 5 ,
191+ "terminated_reason" : " noise_exhausted" ,
192+ "duration_seconds" : 0.02
193+ },
194+ {
195+ "episode_id" : " 1a2c2666-389e-4835-8aab-7e7ff63a2511" ,
196+ "task_id" : " security_audit" ,
197+ "seed" : 6 ,
198+ "final_score" : 0.0 ,
199+ "steps_taken" : 5 ,
200+ "issues_found" : 0 ,
201+ "issues_total" : 1 ,
202+ "noise_penalties" : 5 ,
203+ "terminated_reason" : " noise_exhausted" ,
204+ "duration_seconds" : 0.02
205+ },
206+ {
207+ "episode_id" : " 9e78465a-b7d6-4ca8-8aae-761a2e55be82" ,
208+ "task_id" : " security_audit" ,
209+ "seed" : 7 ,
210+ "final_score" : 0.0 ,
211+ "steps_taken" : 5 ,
212+ "issues_found" : 0 ,
213+ "issues_total" : 1 ,
214+ "noise_penalties" : 5 ,
215+ "terminated_reason" : " noise_exhausted" ,
216+ "duration_seconds" : 0.02
217+ },
218+ {
219+ "episode_id" : " e59ee756-fbf1-4aa1-ac42-d1cb23079d88" ,
220+ "task_id" : " security_audit" ,
221+ "seed" : 8 ,
222+ "final_score" : 0.0 ,
223+ "steps_taken" : 5 ,
224+ "issues_found" : 0 ,
225+ "issues_total" : 1 ,
226+ "noise_penalties" : 5 ,
227+ "terminated_reason" : " noise_exhausted" ,
228+ "duration_seconds" : 0.02
229+ },
230+ {
231+ "episode_id" : " f573727f-ac41-47ba-bcb9-55495da61615" ,
232+ "task_id" : " security_audit" ,
233+ "seed" : 9 ,
234+ "final_score" : 0.0 ,
235+ "steps_taken" : 5 ,
236+ "issues_found" : 0 ,
237+ "issues_total" : 1 ,
238+ "noise_penalties" : 5 ,
239+ "terminated_reason" : " noise_exhausted" ,
240+ "duration_seconds" : 0.02
241+ },
242+ {
243+ "episode_id" : " 0c368016-4685-4699-abf0-d74337a3ea8d" ,
244+ "task_id" : " architectural_review" ,
245+ "seed" : 0 ,
246+ "final_score" : 0.0 ,
247+ "steps_taken" : 1 ,
248+ "issues_found" : 0 ,
249+ "issues_total" : 1 ,
250+ "noise_penalties" : 0 ,
251+ "terminated_reason" : " terminal_action" ,
252+ "duration_seconds" : 0.01
253+ },
254+ {
255+ "episode_id" : " 5dbf1824-e62b-4491-aaf2-c6ec3a2ae597" ,
256+ "task_id" : " architectural_review" ,
257+ "seed" : 1 ,
258+ "final_score" : 0.059 ,
259+ "steps_taken" : 5 ,
260+ "issues_found" : 0 ,
261+ "issues_total" : 1 ,
262+ "noise_penalties" : 5 ,
263+ "terminated_reason" : " noise_exhausted" ,
264+ "duration_seconds" : 0.02
265+ },
266+ {
267+ "episode_id" : " b2249f5c-8e6a-4ee4-b973-2dd428613a7c" ,
268+ "task_id" : " architectural_review" ,
269+ "seed" : 2 ,
270+ "final_score" : 0.661 ,
271+ "steps_taken" : 6 ,
272+ "issues_found" : 1 ,
273+ "issues_total" : 1 ,
274+ "noise_penalties" : 5 ,
275+ "terminated_reason" : " noise_exhausted" ,
276+ "duration_seconds" : 0.02
277+ },
278+ {
279+ "episode_id" : " 0e58c8c0-efa1-4c16-9002-6d48e8f82439" ,
280+ "task_id" : " architectural_review" ,
281+ "seed" : 3 ,
282+ "final_score" : 0.658 ,
283+ "steps_taken" : 5 ,
284+ "issues_found" : 0 ,
285+ "issues_total" : 1 ,
286+ "noise_penalties" : 5 ,
287+ "terminated_reason" : " noise_exhausted" ,
288+ "duration_seconds" : 0.02
289+ },
290+ {
291+ "episode_id" : " 69cf00eb-5a20-4347-9887-f9806026a66b" ,
292+ "task_id" : " architectural_review" ,
293+ "seed" : 4 ,
294+ "final_score" : 0.058 ,
295+ "steps_taken" : 5 ,
296+ "issues_found" : 0 ,
297+ "issues_total" : 1 ,
298+ "noise_penalties" : 5 ,
299+ "terminated_reason" : " noise_exhausted" ,
300+ "duration_seconds" : 0.02
301+ },
302+ {
303+ "episode_id" : " 233ff87c-475f-4485-bd76-9abab4d2a304" ,
304+ "task_id" : " architectural_review" ,
305+ "seed" : 5 ,
306+ "final_score" : 0.657 ,
307+ "steps_taken" : 6 ,
308+ "issues_found" : 1 ,
309+ "issues_total" : 1 ,
310+ "noise_penalties" : 5 ,
311+ "terminated_reason" : " noise_exhausted" ,
312+ "duration_seconds" : 0.02
313+ },
314+ {
315+ "episode_id" : " 89210c97-a95a-49c8-a9d1-8dbe6db92238" ,
316+ "task_id" : " architectural_review" ,
317+ "seed" : 6 ,
318+ "final_score" : 0.059 ,
319+ "steps_taken" : 5 ,
320+ "issues_found" : 0 ,
321+ "issues_total" : 1 ,
322+ "noise_penalties" : 5 ,
323+ "terminated_reason" : " noise_exhausted" ,
324+ "duration_seconds" : 0.02
325+ },
326+ {
327+ "episode_id" : " 80c89d9d-92e9-4fbc-9a4f-401848c92cce" ,
328+ "task_id" : " architectural_review" ,
329+ "seed" : 7 ,
330+ "final_score" : 0.664 ,
331+ "steps_taken" : 6 ,
332+ "issues_found" : 1 ,
333+ "issues_total" : 1 ,
334+ "noise_penalties" : 5 ,
335+ "terminated_reason" : " noise_exhausted" ,
336+ "duration_seconds" : 0.02
337+ },
338+ {
339+ "episode_id" : " 325d65a3-94e7-40f8-90a6-d93bac2cbd9e" ,
340+ "task_id" : " architectural_review" ,
341+ "seed" : 8 ,
342+ "final_score" : 0.039 ,
343+ "steps_taken" : 5 ,
344+ "issues_found" : 0 ,
345+ "issues_total" : 1 ,
346+ "noise_penalties" : 5 ,
347+ "terminated_reason" : " noise_exhausted" ,
348+ "duration_seconds" : 0.02
349+ },
350+ {
351+ "episode_id" : " d94abdb2-90c6-424a-9a26-e798a2ea9b13" ,
352+ "task_id" : " architectural_review" ,
353+ "seed" : 9 ,
354+ "final_score" : 0.075 ,
355+ "steps_taken" : 5 ,
356+ "issues_found" : 0 ,
357+ "issues_total" : 1 ,
358+ "noise_penalties" : 5 ,
359+ "terminated_reason" : " noise_exhausted" ,
360+ "duration_seconds" : 0.02
361+ }
362+ ]
0 commit comments