@@ -2271,10 +2271,10 @@ <h2 id="installation">Installation<a class="headerlink" href="#installation" tit
22712271</ div >
22722272< div class ="tabbed-block ">
22732273< p > Pull the Docker image:</ p >
2274- < div class ="highlight "> < pre > < span > </ span > < code > < a id ="__codelineno-1-1 " name ="__codelineno-1-1 " href ="#__codelineno-1-1 "> </ a > docker< span class ="w "> </ span > pull< span class ="w "> </ span > am1n3e /webarena-verified:latest
2274+ < div class ="highlight "> < pre > < span > </ span > < code > < a id ="__codelineno-1-1 " name ="__codelineno-1-1 " href ="#__codelineno-1-1 "> </ a > docker< span class ="w "> </ span > pull< span class ="w "> </ span > ghcr.io/servicenow /webarena-verified:latest
22752275</ code > </ pre > </ div >
22762276< p > Verify the installation:</ p >
2277- < div class ="highlight "> < pre > < span > </ span > < code > < a id ="__codelineno-2-1 " name ="__codelineno-2-1 " href ="#__codelineno-2-1 "> </ a > docker< span class ="w "> </ span > run< span class ="w "> </ span > --rm< span class ="w "> </ span > am1n3e /webarena-verified:latest< span class ="w "> </ span > --help
2277+ < div class ="highlight "> < pre > < span > </ span > < code > < a id ="__codelineno-2-1 " name ="__codelineno-2-1 " href ="#__codelineno-2-1 "> </ a > docker< span class ="w "> </ span > run< span class ="w "> </ span > --rm< span class ="w "> </ span > ghcr.io/servicenow /webarena-verified:latest< span class ="w "> </ span > --help
22782278</ code > </ pre > </ div >
22792279</ div >
22802280< div class ="tabbed-block ">
@@ -2423,7 +2423,7 @@ <h3 id="basic-evaluation">Basic Evaluation<a class="headerlink" href="#basic-eva
24232423< div class ="tabbed-block ">
24242424< div class ="highlight "> < pre > < span > </ span > < code > < a id ="__codelineno-15-1 " name ="__codelineno-15-1 " href ="#__codelineno-15-1 "> </ a > docker< span class ="w "> </ span > run< span class ="w "> </ span > --rm< span class ="w "> </ span > < span class ="se "> \</ span >
24252425< a id ="__codelineno-15-2 " name ="__codelineno-15-2 " href ="#__codelineno-15-2 "> </ a > < span class ="w "> </ span > -v< span class ="w "> </ span > /path/to/output:/data< span class ="w "> </ span > < span class ="se "> \</ span >
2426- < a id ="__codelineno-15-3 " name ="__codelineno-15-3 " href ="#__codelineno-15-3 "> </ a > < span class ="w "> </ span > am1n3e /webarena-verified:latest< span class ="w "> </ span > < span class ="se "> \</ span >
2426+ < a id ="__codelineno-15-3 " name ="__codelineno-15-3 " href ="#__codelineno-15-3 "> </ a > < span class ="w "> </ span > ghcr.io/servicenow /webarena-verified:latest< span class ="w "> </ span > < span class ="se "> \</ span >
24272427< a id ="__codelineno-15-4 " name ="__codelineno-15-4 " href ="#__codelineno-15-4 "> </ a > < span class ="w "> </ span > eval-tasks< span class ="w "> </ span > --output-dir< span class ="w "> </ span > /data
24282428</ code > </ pre > </ div >
24292429</ div >
@@ -2488,49 +2488,49 @@ <h3 id="filtering-tasks">Filtering Tasks<a class="headerlink" href="#filtering-t
24882488< a id ="__codelineno-18-2 " name ="__codelineno-18-2 " href ="#__codelineno-18-2 "> </ a > docker< span class ="w "> </ span > run< span class ="w "> </ span > --rm< span class ="w "> </ span > < span class ="se "> \</ span >
24892489< a id ="__codelineno-18-3 " name ="__codelineno-18-3 " href ="#__codelineno-18-3 "> </ a > < span class ="w "> </ span > -v< span class ="w "> </ span > /path/to/output:/data< span class ="w "> </ span > < span class ="se "> \</ span >
24902490< a id ="__codelineno-18-4 " name ="__codelineno-18-4 " href ="#__codelineno-18-4 "> </ a > < span class ="w "> </ span > -v< span class ="w "> </ span > /path/to/config.json:/config.json< span class ="w "> </ span > < span class ="se "> \</ span >
2491- < a id ="__codelineno-18-5 " name ="__codelineno-18-5 " href ="#__codelineno-18-5 "> </ a > < span class ="w "> </ span > am1n3e /webarena-verified:latest< span class ="w "> </ span > < span class ="se "> \</ span >
2491+ < a id ="__codelineno-18-5 " name ="__codelineno-18-5 " href ="#__codelineno-18-5 "> </ a > < span class ="w "> </ span > ghcr.io/servicenow /webarena-verified:latest< span class ="w "> </ span > < span class ="se "> \</ span >
24922492< a id ="__codelineno-18-6 " name ="__codelineno-18-6 " href ="#__codelineno-18-6 "> </ a > < span class ="w "> </ span > eval-tasks< span class ="w "> </ span > --config< span class ="w "> </ span > /config.json< span class ="w "> </ span > --output-dir< span class ="w "> </ span > /data< span class ="w "> </ span > --task-ids< span class ="w "> </ span > < span class ="m "> 1</ span > ,2,3
24932493< a id ="__codelineno-18-7 " name ="__codelineno-18-7 " href ="#__codelineno-18-7 "> </ a >
24942494< a id ="__codelineno-18-8 " name ="__codelineno-18-8 " href ="#__codelineno-18-8 "> </ a > < span class ="c1 "> # Single task</ span >
24952495< a id ="__codelineno-18-9 " name ="__codelineno-18-9 " href ="#__codelineno-18-9 "> </ a > docker< span class ="w "> </ span > run< span class ="w "> </ span > --rm< span class ="w "> </ span > < span class ="se "> \</ span >
24962496< a id ="__codelineno-18-10 " name ="__codelineno-18-10 " href ="#__codelineno-18-10 "> </ a > < span class ="w "> </ span > -v< span class ="w "> </ span > /path/to/output:/data< span class ="w "> </ span > < span class ="se "> \</ span >
24972497< a id ="__codelineno-18-11 " name ="__codelineno-18-11 " href ="#__codelineno-18-11 "> </ a > < span class ="w "> </ span > -v< span class ="w "> </ span > /path/to/config.json:/config.json< span class ="w "> </ span > < span class ="se "> \</ span >
2498- < a id ="__codelineno-18-12 " name ="__codelineno-18-12 " href ="#__codelineno-18-12 "> </ a > < span class ="w "> </ span > am1n3e /webarena-verified:latest< span class ="w "> </ span > < span class ="se "> \</ span >
2498+ < a id ="__codelineno-18-12 " name ="__codelineno-18-12 " href ="#__codelineno-18-12 "> </ a > < span class ="w "> </ span > ghcr.io/servicenow /webarena-verified:latest< span class ="w "> </ span > < span class ="se "> \</ span >
24992499< a id ="__codelineno-18-13 " name ="__codelineno-18-13 " href ="#__codelineno-18-13 "> </ a > < span class ="w "> </ span > eval-tasks< span class ="w "> </ span > --config< span class ="w "> </ span > /config.json< span class ="w "> </ span > --output-dir< span class ="w "> </ span > /data< span class ="w "> </ span > --task-ids< span class ="w "> </ span > < span class ="m "> 42</ span >
25002500< a id ="__codelineno-18-14 " name ="__codelineno-18-14 " href ="#__codelineno-18-14 "> </ a >
25012501< a id ="__codelineno-18-15 " name ="__codelineno-18-15 " href ="#__codelineno-18-15 "> </ a > < span class ="c1 "> # By site</ span >
25022502< a id ="__codelineno-18-16 " name ="__codelineno-18-16 " href ="#__codelineno-18-16 "> </ a > docker< span class ="w "> </ span > run< span class ="w "> </ span > --rm< span class ="w "> </ span > < span class ="se "> \</ span >
25032503< a id ="__codelineno-18-17 " name ="__codelineno-18-17 " href ="#__codelineno-18-17 "> </ a > < span class ="w "> </ span > -v< span class ="w "> </ span > /path/to/output:/data< span class ="w "> </ span > < span class ="se "> \</ span >
25042504< a id ="__codelineno-18-18 " name ="__codelineno-18-18 " href ="#__codelineno-18-18 "> </ a > < span class ="w "> </ span > -v< span class ="w "> </ span > /path/to/config.json:/config.json< span class ="w "> </ span > < span class ="se "> \</ span >
2505- < a id ="__codelineno-18-19 " name ="__codelineno-18-19 " href ="#__codelineno-18-19 "> </ a > < span class ="w "> </ span > am1n3e /webarena-verified:latest< span class ="w "> </ span > < span class ="se "> \</ span >
2505+ < a id ="__codelineno-18-19 " name ="__codelineno-18-19 " href ="#__codelineno-18-19 "> </ a > < span class ="w "> </ span > ghcr.io/servicenow /webarena-verified:latest< span class ="w "> </ span > < span class ="se "> \</ span >
25062506< a id ="__codelineno-18-20 " name ="__codelineno-18-20 " href ="#__codelineno-18-20 "> </ a > < span class ="w "> </ span > eval-tasks< span class ="w "> </ span > --config< span class ="w "> </ span > /config.json< span class ="w "> </ span > --output-dir< span class ="w "> </ span > /data< span class ="w "> </ span > --sites< span class ="w "> </ span > shopping
25072507< a id ="__codelineno-18-21 " name ="__codelineno-18-21 " href ="#__codelineno-18-21 "> </ a >
25082508< a id ="__codelineno-18-22 " name ="__codelineno-18-22 " href ="#__codelineno-18-22 "> </ a > < span class ="c1 "> # By task type</ span >
25092509< a id ="__codelineno-18-23 " name ="__codelineno-18-23 " href ="#__codelineno-18-23 "> </ a > docker< span class ="w "> </ span > run< span class ="w "> </ span > --rm< span class ="w "> </ span > < span class ="se "> \</ span >
25102510< a id ="__codelineno-18-24 " name ="__codelineno-18-24 " href ="#__codelineno-18-24 "> </ a > < span class ="w "> </ span > -v< span class ="w "> </ span > /path/to/output:/data< span class ="w "> </ span > < span class ="se "> \</ span >
25112511< a id ="__codelineno-18-25 " name ="__codelineno-18-25 " href ="#__codelineno-18-25 "> </ a > < span class ="w "> </ span > -v< span class ="w "> </ span > /path/to/config.json:/config.json< span class ="w "> </ span > < span class ="se "> \</ span >
2512- < a id ="__codelineno-18-26 " name ="__codelineno-18-26 " href ="#__codelineno-18-26 "> </ a > < span class ="w "> </ span > am1n3e /webarena-verified:latest< span class ="w "> </ span > < span class ="se "> \</ span >
2512+ < a id ="__codelineno-18-26 " name ="__codelineno-18-26 " href ="#__codelineno-18-26 "> </ a > < span class ="w "> </ span > ghcr.io/servicenow /webarena-verified:latest< span class ="w "> </ span > < span class ="se "> \</ span >
25132513< a id ="__codelineno-18-27 " name ="__codelineno-18-27 " href ="#__codelineno-18-27 "> </ a > < span class ="w "> </ span > eval-tasks< span class ="w "> </ span > --config< span class ="w "> </ span > /config.json< span class ="w "> </ span > --output-dir< span class ="w "> </ span > /data< span class ="w "> </ span > --task-type< span class ="w "> </ span > mutate
25142514< a id ="__codelineno-18-28 " name ="__codelineno-18-28 " href ="#__codelineno-18-28 "> </ a >
25152515< a id ="__codelineno-18-29 " name ="__codelineno-18-29 " href ="#__codelineno-18-29 "> </ a > < span class ="c1 "> # By template ID</ span >
25162516< a id ="__codelineno-18-30 " name ="__codelineno-18-30 " href ="#__codelineno-18-30 "> </ a > docker< span class ="w "> </ span > run< span class ="w "> </ span > --rm< span class ="w "> </ span > < span class ="se "> \</ span >
25172517< a id ="__codelineno-18-31 " name ="__codelineno-18-31 " href ="#__codelineno-18-31 "> </ a > < span class ="w "> </ span > -v< span class ="w "> </ span > /path/to/output:/data< span class ="w "> </ span > < span class ="se "> \</ span >
25182518< a id ="__codelineno-18-32 " name ="__codelineno-18-32 " href ="#__codelineno-18-32 "> </ a > < span class ="w "> </ span > -v< span class ="w "> </ span > /path/to/config.json:/config.json< span class ="w "> </ span > < span class ="se "> \</ span >
2519- < a id ="__codelineno-18-33 " name ="__codelineno-18-33 " href ="#__codelineno-18-33 "> </ a > < span class ="w "> </ span > am1n3e /webarena-verified:latest< span class ="w "> </ span > < span class ="se "> \</ span >
2519+ < a id ="__codelineno-18-33 " name ="__codelineno-18-33 " href ="#__codelineno-18-33 "> </ a > < span class ="w "> </ span > ghcr.io/servicenow /webarena-verified:latest< span class ="w "> </ span > < span class ="se "> \</ span >
25202520< a id ="__codelineno-18-34 " name ="__codelineno-18-34 " href ="#__codelineno-18-34 "> </ a > < span class ="w "> </ span > eval-tasks< span class ="w "> </ span > --config< span class ="w "> </ span > /config.json< span class ="w "> </ span > --output-dir< span class ="w "> </ span > /data< span class ="w "> </ span > --template-id< span class ="w "> </ span > < span class ="m "> 5</ span >
25212521< a id ="__codelineno-18-35 " name ="__codelineno-18-35 " href ="#__codelineno-18-35 "> </ a >
25222522< a id ="__codelineno-18-36 " name ="__codelineno-18-36 " href ="#__codelineno-18-36 "> </ a > < span class ="c1 "> # Combined filters</ span >
25232523< a id ="__codelineno-18-37 " name ="__codelineno-18-37 " href ="#__codelineno-18-37 "> </ a > docker< span class ="w "> </ span > run< span class ="w "> </ span > --rm< span class ="w "> </ span > < span class ="se "> \</ span >
25242524< a id ="__codelineno-18-38 " name ="__codelineno-18-38 " href ="#__codelineno-18-38 "> </ a > < span class ="w "> </ span > -v< span class ="w "> </ span > /path/to/output:/data< span class ="w "> </ span > < span class ="se "> \</ span >
25252525< a id ="__codelineno-18-39 " name ="__codelineno-18-39 " href ="#__codelineno-18-39 "> </ a > < span class ="w "> </ span > -v< span class ="w "> </ span > /path/to/config.json:/config.json< span class ="w "> </ span > < span class ="se "> \</ span >
2526- < a id ="__codelineno-18-40 " name ="__codelineno-18-40 " href ="#__codelineno-18-40 "> </ a > < span class ="w "> </ span > am1n3e /webarena-verified:latest< span class ="w "> </ span > < span class ="se "> \</ span >
2526+ < a id ="__codelineno-18-40 " name ="__codelineno-18-40 " href ="#__codelineno-18-40 "> </ a > < span class ="w "> </ span > ghcr.io/servicenow /webarena-verified:latest< span class ="w "> </ span > < span class ="se "> \</ span >
25272527< a id ="__codelineno-18-41 " name ="__codelineno-18-41 " href ="#__codelineno-18-41 "> </ a > < span class ="w "> </ span > eval-tasks< span class ="w "> </ span > --config< span class ="w "> </ span > /config.json< span class ="w "> </ span > --output-dir< span class ="w "> </ span > /data< span class ="w "> </ span > --sites< span class ="w "> </ span > shopping,reddit< span class ="w "> </ span > --task-type< span class ="w "> </ span > mutate
25282528< a id ="__codelineno-18-42 " name ="__codelineno-18-42 " href ="#__codelineno-18-42 "> </ a >
25292529< a id ="__codelineno-18-43 " name ="__codelineno-18-43 " href ="#__codelineno-18-43 "> </ a > < span class ="c1 "> # Dry run (no scoring)</ span >
25302530< a id ="__codelineno-18-44 " name ="__codelineno-18-44 " href ="#__codelineno-18-44 "> </ a > docker< span class ="w "> </ span > run< span class ="w "> </ span > --rm< span class ="w "> </ span > < span class ="se "> \</ span >
25312531< a id ="__codelineno-18-45 " name ="__codelineno-18-45 " href ="#__codelineno-18-45 "> </ a > < span class ="w "> </ span > -v< span class ="w "> </ span > /path/to/output:/data< span class ="w "> </ span > < span class ="se "> \</ span >
25322532< a id ="__codelineno-18-46 " name ="__codelineno-18-46 " href ="#__codelineno-18-46 "> </ a > < span class ="w "> </ span > -v< span class ="w "> </ span > /path/to/config.json:/config.json< span class ="w "> </ span > < span class ="se "> \</ span >
2533- < a id ="__codelineno-18-47 " name ="__codelineno-18-47 " href ="#__codelineno-18-47 "> </ a > < span class ="w "> </ span > am1n3e /webarena-verified:latest< span class ="w "> </ span > < span class ="se "> \</ span >
2533+ < a id ="__codelineno-18-47 " name ="__codelineno-18-47 " href ="#__codelineno-18-47 "> </ a > < span class ="w "> </ span > ghcr.io/servicenow /webarena-verified:latest< span class ="w "> </ span > < span class ="se "> \</ span >
25342534< a id ="__codelineno-18-48 " name ="__codelineno-18-48 " href ="#__codelineno-18-48 "> </ a > < span class ="w "> </ span > eval-tasks< span class ="w "> </ span > --config< span class ="w "> </ span > /config.json< span class ="w "> </ span > --output-dir< span class ="w "> </ span > /data< span class ="w "> </ span > --sites< span class ="w "> </ span > reddit< span class ="w "> </ span > --dry-run
25352535</ code > </ pre > </ div >
25362536</ div >
0 commit comments