From bb00d9136830b3e861fe2b567d2b203e772078e5 Mon Sep 17 00:00:00 2001
From: Nicolas Gontier
Date: Mon, 16 Jun 2025 20:37:51 +0000
Subject: [PATCH 001/166] add README

---
 pipelinerl/miniwob/README.md | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 pipelinerl/miniwob/README.md

diff --git a/pipelinerl/miniwob/README.md b/pipelinerl/miniwob/README.md
new file mode 100644
index 00000000..9f91ddba
--- /dev/null
+++ b/pipelinerl/miniwob/README.md
@@ -0,0 +1,30 @@
+# Miniwob example
+
+## Prerequisites
+
+### TapeAgents
+
+Clone [TapeAgents](https://github.com/ServiceNow/TapeAgents/) in your parent folder and install it.
+```bash
+cd ..
+git clone git@github.com:ServiceNow/TapeAgents.git
+cd TapeAgents
+git checkout async_web_agent # required until #230 is merged into main
+pip install -e .
+pip install 'tapeagents[finetune,converters]==0.1.12'
+cd ../PipelineRL
+```
+
+### Miniwob
+
+See setup here: https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/README.md
+
+### Playwright
+
+The environment server will need to have playwright installed.
+
+`playwright install`
+
+## Launch Command
+
+`python -m pipelinerl.launch --config-name miniwob`

From dc81770ddde67527264c30604a65c2a2cb6cd920 Mon Sep 17 00:00:00 2001
From: Nicolas Gontier
Date: Tue, 17 Jun 2025 15:09:07 +0000
Subject: [PATCH 002/166] increase env session inactivity timeout

---
 conf/miniwob.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml
index a5bf8bc2..7e435c49 100644
--- a/conf/miniwob.yaml
+++ b/conf/miniwob.yaml
@@ -123,7 +123,7 @@ environment:
   miniwob_url: file:///home/toolkit/miniwob-plusplus/miniwob/html/miniwob/
   n_envs: 64
   host: "0.0.0.0"
-  max_session_inactivity_secs: 300
+  max_session_inactivity_secs: 600
   web_env_target: examples.rl_webagent.environment.WebEnvironment
   exp_path: ${output_dir}/env_server
   headless: true

From e60d4c137753fcf176b83f387fd0906afa3d9a9f Mon Sep 17 00:00:00 2001
From: Nicolas Gontier
Date: Tue, 17 Jun 2025 15:09:22 +0000
Subject: [PATCH 003/166] update readme

---
 pipelinerl/miniwob/README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pipelinerl/miniwob/README.md b/pipelinerl/miniwob/README.md
index 9f91ddba..04e63120 100644
--- a/pipelinerl/miniwob/README.md
+++ b/pipelinerl/miniwob/README.md
@@ -9,7 +9,6 @@ Clone [TapeAgents](https://github.com/ServiceNow/TapeAgents/) in your parent folder and install it.
 cd ..
 git clone git@github.com:ServiceNow/TapeAgents.git
 cd TapeAgents
-git checkout async_web_agent # required until #230 is merged into main
 pip install -e .
 pip install 'tapeagents[finetune,converters]==0.1.12'
 cd ../PipelineRL

From f9e45c26bafb65fc687dcca0f68d5066bc3a6678 Mon Sep 17 00:00:00 2001
From: Nicolas Gontier
Date: Wed, 18 Jun 2025 21:02:28 +0000
Subject: [PATCH 004/166] move miniwob to domains/

---
 pipelinerl/{ => domains}/miniwob/README.md                       | 0
 pipelinerl/{ => domains}/miniwob/environment_server.py           | 0
 pipelinerl/{ => domains}/miniwob/load_tasks.py                   | 0
 pipelinerl/{ => domains}/miniwob/rollouts.py                     | 0
 .../{ => domains}/miniwob/tool_chat_template_llama3.1_json.jinja | 0
 5 files changed, 0 insertions(+), 0 deletions(-)
 rename pipelinerl/{ => domains}/miniwob/README.md (100%)
 rename pipelinerl/{ => domains}/miniwob/environment_server.py (100%)
 rename pipelinerl/{ => domains}/miniwob/load_tasks.py (100%)
 rename pipelinerl/{ => domains}/miniwob/rollouts.py (100%)
 rename pipelinerl/{ => domains}/miniwob/tool_chat_template_llama3.1_json.jinja (100%)

diff --git a/pipelinerl/miniwob/README.md b/pipelinerl/domains/miniwob/README.md
similarity index 100%
rename from pipelinerl/miniwob/README.md
rename to pipelinerl/domains/miniwob/README.md
diff --git a/pipelinerl/miniwob/environment_server.py b/pipelinerl/domains/miniwob/environment_server.py
similarity index 100%
rename from pipelinerl/miniwob/environment_server.py
rename to pipelinerl/domains/miniwob/environment_server.py
diff --git a/pipelinerl/miniwob/load_tasks.py b/pipelinerl/domains/miniwob/load_tasks.py
similarity index 100%
rename from pipelinerl/miniwob/load_tasks.py
rename to pipelinerl/domains/miniwob/load_tasks.py
diff --git a/pipelinerl/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py
similarity index 100%
rename from pipelinerl/miniwob/rollouts.py
rename to pipelinerl/domains/miniwob/rollouts.py
diff --git a/pipelinerl/miniwob/tool_chat_template_llama3.1_json.jinja b/pipelinerl/domains/miniwob/tool_chat_template_llama3.1_json.jinja
similarity index 100%
rename from pipelinerl/miniwob/tool_chat_template_llama3.1_json.jinja
rename to pipelinerl/domains/miniwob/tool_chat_template_llama3.1_json.jinja

From 8cdbd06a4dbd2ded04f9d42b8dc95425d4e530dc Mon Sep 17 00:00:00 2001
From: Nicolas Gontier
Date: Mon, 7 Jul 2025 19:04:49 +0000
Subject: [PATCH 005/166] fix

---
 conf/miniwob.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml
index 7e435c49..86f84c51 100644
--- a/conf/miniwob.yaml
+++ b/conf/miniwob.yaml
@@ -45,7 +45,7 @@ vllm_config:
     enforce-eager: "" # speed the actor llm startup a bit

 actor:
-  rollout_policy: pipelinerl.miniwob.rollouts.generate_miniwob_rollout
+  rollout_policy: pipelinerl.domains.miniwob.rollouts.generate_miniwob_rollout
   shared_memory_entry_size: 100000000

 preprocess:
@@ -119,7 +119,7 @@ agent:
 # ENVIRONMENT CONFIGURATION
 start_attempts: 3 # number of attempts to start each task
 environment:
-  _target_: pipelinerl.miniwob.environment_server.WebEnvironmentServer
+  _target_: pipelinerl.domains.miniwob.environment_server.WebEnvironmentServer
   miniwob_url: file:///home/toolkit/miniwob-plusplus/miniwob/html/miniwob/
   n_envs: 64
   host: "0.0.0.0"
@@ -130,7 +130,7 @@ environment:
   observation_format: html

 # DATASET CONFIGURATION
-dataset_loader: pipelinerl.miniwob.load_tasks.load_tasks
+dataset_loader: pipelinerl.domains.miniwob.load_tasks.load_tasks
 dataset_loader_params:
   train_split: 0.6 # 0.6 of tasks for training, 0.4 for testing
   seeds: [0, 42, 1337, 900, 103]
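PATCH 004 moves the package but nothing fails until the dotted paths in the config are resolved at runtime, which is why PATCH 005 and 006 follow. A minimal sketch of that resolution, assuming PipelineRL resolves `rollout_policy` and `_target_` strings with Hydra utilities or an equivalent import helper:

```python
# Sketch (assumed resolution mechanism, not code from this repo): config values
# like rollout_policy are plain dotted import paths, resolved only when used.
from hydra.utils import get_method

cfg_value = "pipelinerl.domains.miniwob.rollouts.generate_miniwob_rollout"
rollout_fn = get_method(cfg_value)  # raises ImportError here if the path is stale
```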
From 551098270e3791e9f771e086852663464ebd0ac5 Mon Sep 17 00:00:00 2001
From: Nicolas Gontier
Date: Mon, 7 Jul 2025 19:14:47 +0000
Subject: [PATCH 006/166] fix path

---
 conf/miniwob.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml
index 86f84c51..658fdd05 100644
--- a/conf/miniwob.yaml
+++ b/conf/miniwob.yaml
@@ -41,7 +41,7 @@ vllm_config:
   vllm_kwargs:
     enable-auto-tool-choice: ""
     tool-call-parser: llama3_json # use hermes for qwen
-    chat_template: pipelinerl/miniwob/tool_chat_template_llama3.1_json.jinja # copy pasted from https://github.com/vllm-project/vllm/blob/main/examples/tool_chat_template_llama3.1_json.jinja
+    chat_template: pipelinerl/domains/miniwob/tool_chat_template_llama3.1_json.jinja # copy pasted from https://github.com/vllm-project/vllm/blob/main/examples/tool_chat_template_llama3.1_json.jinja
     enforce-eager: "" # speed the actor llm startup a bit

 actor:

From 07e858c3ef8e74e6393c7fd239ecb4e51afa44bb Mon Sep 17 00:00:00 2001
From: Nicolas Gontier
Date: Mon, 7 Jul 2025 20:27:33 +0000
Subject: [PATCH 007/166] return RuntimeError instead of HTTPException because
 not picklable

---
 pipelinerl/domains/miniwob/rollouts.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py
index bbf68860..fb437658 100644
--- a/pipelinerl/domains/miniwob/rollouts.py
+++ b/pipelinerl/domains/miniwob/rollouts.py
@@ -5,6 +5,7 @@ import random
 import time

 import aiohttp
+from fastapi import HTTPException
 from hydra.utils import instantiate
 from omegaconf import DictConfig

@@ -73,7 +74,10 @@ async def generate_miniwob_rollout(
             except Exception as e:
                 start_attempts -= 1
                 if start_attempts <= 0:
-                    raise e
+                    if isinstance(e, HTTPException):
+                        raise RuntimeError(f"HTTPException: {e.status_code} {e.detail}")
+                    else:
+                        raise e
                 logger.warning(f"Failed to start task, retry after 5 seconds: {e}")
                 await asyncio.sleep(5)
     logger.info(f"Task {problem['dataset']}/{problem['task']}/{problem['seed']} started in {time.perf_counter() - t:.2f} seconds")
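The conversion in PATCH 007 matters because rollout exceptions cross a process boundary. A minimal repro of the failure mode it guards against (an assumption about the mechanism, not code from this repo): exceptions whose `__init__` signature differs from `Exception`'s often fail to round-trip through pickle.

```python
import pickle

class CustomError(Exception):  # stands in for an HTTPException-like class
    def __init__(self, status_code: int, detail: str):
        super().__init__(detail)  # args=("boom",), but __init__ wants two arguments
        self.status_code = status_code

try:
    pickle.loads(pickle.dumps(CustomError(500, "boom")))
except TypeError as e:
    print(f"unpickling failed: {e}")

# A plain RuntimeError carries its message through a pickle round-trip fine:
pickle.loads(pickle.dumps(RuntimeError("HTTPException: 500 boom")))
```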
From 5e568964cd4f41ccd6852f72e415116f14bffba1 Mon Sep 17 00:00:00 2001
From: Nicolas Gontier
Date: Tue, 8 Jul 2025 14:22:13 +0000
Subject: [PATCH 008/166] add env_call_timeout

---
 conf/miniwob.yaml                                | 3 ++-
 pipelinerl/domains/miniwob/environment_server.py | 6 +++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml
index 658fdd05..d20ca1b6 100644
--- a/conf/miniwob.yaml
+++ b/conf/miniwob.yaml
@@ -123,7 +123,8 @@ environment:
   miniwob_url: file:///home/toolkit/miniwob-plusplus/miniwob/html/miniwob/
   n_envs: 64
   host: "0.0.0.0"
-  max_session_inactivity_secs: 600
+  max_session_inactivity_secs: 600 # kill session after 10 minutes of inactivity
+  env_call_timeout: 60 # timeout for each environment call (e.g. start_task, act, etc.)
   web_env_target: examples.rl_webagent.environment.WebEnvironment
   exp_path: ${output_dir}/env_server
   headless: true
diff --git a/pipelinerl/domains/miniwob/environment_server.py b/pipelinerl/domains/miniwob/environment_server.py
index 13839f7a..db0072c5 100644
--- a/pipelinerl/domains/miniwob/environment_server.py
+++ b/pipelinerl/domains/miniwob/environment_server.py
@@ -14,11 +14,15 @@ def __init__(self,
         headless: bool = True,
         observation_format: str = "html",
         max_session_inactivity_secs: int = 600,
+        env_call_timeout: int = 60,
     ):
         os.environ["MINIWOB_URL"] = miniwob_url
+        # Remote environment server configuration
         self.n_envs = n_envs
         self.host = host
         self.max_session_inactivity_secs = max_session_inactivity_secs
+        self.env_call_timeout = env_call_timeout
+        # Individual web environment configuration
         self.web_env_target = web_env_target
         self.exp_path = exp_path
         self.headless = headless
@@ -29,7 +33,7 @@ def launch(self, port: int):
         """
         Serve the web environment in TapeAgent.
         """
-        env_server = EnvironmentServer(n_envs=self.n_envs, host=self.host, port=port, max_session_inactivity_secs=self.max_session_inactivity_secs)
+        env_server = EnvironmentServer(n_envs=self.n_envs, host=self.host, port=port, max_session_inactivity_secs=self.max_session_inactivity_secs, env_call_timeout=self.env_call_timeout)
         env_server.launch(OmegaConf.create({
             "_target_": self.web_env_target,
             "exp_path": self.exp_path,
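`env_call_timeout` bounds how long any single remote environment call may block. The actual enforcement lives inside TapeAgents' `EnvironmentServer`, whose internals are not shown in this series; as an illustration only, a per-call timeout like this is typically a thin `asyncio.wait_for` wrapper:

```python
# Illustrative sketch, not the TapeAgents implementation.
import asyncio

async def call_with_timeout(coro, env_call_timeout: int = 60):
    try:
        return await asyncio.wait_for(coro, timeout=env_call_timeout)
    except asyncio.TimeoutError:
        raise RuntimeError(f"environment call exceeded {env_call_timeout}s")
```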
From c06b768f7e3ada49f9d0658174dd74cba4b1f79e Mon Sep 17 00:00:00 2001
From: Nicolas Gontier
Date: Tue, 8 Jul 2025 18:39:10 +0000
Subject: [PATCH 009/166] update gpu fractions

---
 conf/miniwob.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml
index d20ca1b6..a20c594b 100644
--- a/conf/miniwob.yaml
+++ b/conf/miniwob.yaml
@@ -2,9 +2,9 @@ defaults:
   - base

 world:
-  actor_fraction: 4
-  preprocessor_fraction: 1
-  finetune_fraction: 3
+  actor_fraction: 2
+  preprocessor_fraction: 0
+  finetune_fraction: 6

 # debug:
 #   mode: actor

From b1ad285cf43c8b2fd647146846933ec615f659fc Mon Sep 17 00:00:00 2001
From: Nicolas Gontier
Date: Tue, 8 Jul 2025 18:49:11 +0000
Subject: [PATCH 010/166] set kl coef to 0

---
 conf/miniwob.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml
index a20c594b..d93edeaf 100644
--- a/conf/miniwob.yaml
+++ b/conf/miniwob.yaml
@@ -21,7 +21,7 @@ finetune:
   learning_rate: 1e-6
   optim: adamw_torch
   rl:
-    kl_coef: 0.01 # GRPO beta coefficient
+    kl_coef: 0.0 # GRPO beta coefficient
     reward_minus_kl_coef: 0.0 # RLOO beta coefficient
     use_advantages: true
     algo: grpo

From c8ac64d59a8e59429449135cb37b26fa08163154 Mon Sep 17 00:00:00 2001
From: Nicolas Gontier
Date: Tue, 8 Jul 2025 19:27:27 +0000
Subject: [PATCH 011/166] update max seq len

---
 conf/miniwob.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml
index d93edeaf..4b8c2149 100644
--- a/conf/miniwob.yaml
+++ b/conf/miniwob.yaml
@@ -15,7 +15,7 @@ model_path: meta-llama/Llama-3.1-8B-Instruct

 finetune:
   save_checkpoint_steps: 10
-  seq_length: 4096
+  seq_length: 8192
   train_batch_size: 1
   gradient_accumulation_passes: 1024
   learning_rate: 1e-6

From b87a6d11102f121e5c5dcfceb7dd35dfd0b77952 Mon Sep 17 00:00:00 2001
From: Nicolas Gontier
Date: Wed, 9 Jul 2025 21:38:17 +0000
Subject: [PATCH 012/166] revert to json instead of tool use agent

---
 conf/miniwob.yaml | 76 ++++++++++++++++++++++++++++++++---------------
 1 file changed, 52 insertions(+), 24 deletions(-)

diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml
index 4b8c2149..8a9eb6cd 100644
--- a/conf/miniwob.yaml
+++ b/conf/miniwob.yaml
@@ -17,7 +17,7 @@ finetune:
   save_checkpoint_steps: 10
   seq_length: 8192
   train_batch_size: 1
-  gradient_accumulation_passes: 1024
+  gradient_accumulation_passes: 512
   learning_rate: 1e-6
   optim: adamw_torch
   rl:
@@ -28,7 +28,7 @@ finetune:

 llm:
   parameters:
-    max_tokens: 3072
+    max_tokens: 4096
     temperature: 1.0
 test_llm:
   parameters:
@@ -37,12 +37,12 @@ test_llm:
     top_p: 1.0
     top_k: 50

-vllm_config:
-  vllm_kwargs:
-    enable-auto-tool-choice: ""
-    tool-call-parser: llama3_json # use hermes for qwen
-    chat_template: pipelinerl/domains/miniwob/tool_chat_template_llama3.1_json.jinja # copy pasted from https://github.com/vllm-project/vllm/blob/main/examples/tool_chat_template_llama3.1_json.jinja
-    enforce-eager: "" # speed the actor llm startup a bit
+# vllm_config:
+#   vllm_kwargs:
+#     enable-auto-tool-choice: ""
+#     tool-call-parser: llama3_json # use hermes for qwen
+#     chat_template: pipelinerl/domains/miniwob/tool_chat_template_llama3.1_json.jinja # copy pasted from https://github.com/vllm-project/vllm/blob/main/examples/tool_chat_template_llama3.1_json.jinja
+#     enforce-eager: "" # speed the actor llm startup a bit

 actor:
   rollout_policy: pipelinerl.domains.miniwob.rollouts.generate_miniwob_rollout
@@ -68,6 +68,10 @@ agent:
     allowed_tools: |
       You have access to the following tools:
      {tools_description}
+    allowed_steps: |
+      You are allowed to produce ONLY steps with the following json schemas:
+      {allowed_steps}
+      Do not reproduce schema when producing the steps, use it as a reference.
     thought_format: |
       Important! Respond with the plain text, do not include any JSON or code.
       Do not output anything besides what I asked in this message.
@@ -75,13 +79,22 @@ agent:
   nodes:
   - _target_: examples.rl_webagent.agent.WebNode
     name: set_goal
    system_prompt: ${agent.templates.system_prompt}
+    # guidance: |
+    #   Produce the thought that describes the intended solution to the task. In the reasoning lines:
+    #   - review the instructions from the user and the content of the page.
+    #   - outline the main task to be accomplished and the steps to be taken to achieve it.
+    #   - produce definition of done, that will be checked later to verify if the task was completed.
+    #   ${agent.templates.thought_format}
     guidance: |
-      Produce the thought that describes the intended solution to the task. In the reasoning lines:
+      Produce the reasoning_thought step that describes the intended solution to the task. In the reasoning lines:
       - review the instructions from the user and the content of the page.
       - outline the main task to be accomplished and the steps to be taken to achieve it.
       - produce definition of done, that will be checked later to verify if the task was completed.
-      ${agent.templates.thought_format}
+      Produce only one step!
-    steps_prompt: ${agent.templates.allowed_tools}
+    # steps_prompt: ${agent.templates.allowed_tools}
+    steps_prompt: ${agent.templates.allowed_steps}
+    steps:
+    - tapeagents.steps.ReasoningThought
     trim_obs_except_last_n: 3 # keep the last 3 observations from the tape in prompt messages
     max_chars_page_observation: 3000 # keep up to 3000 chars in PageObservation steps
   - _target_: examples.rl_webagent.agent.WebNode
@@ -91,26 +104,41 @@ agent:
     name: reflect
     system_prompt: ${agent.templates.system_prompt}
     guidance: |
       Review the current state of the page and previous steps to find the best possible next action to accomplish the task.
       Produce the reflection_thought to describe the current page state, reflect on your last action, describe what is left to do, and what will be the immediate next action.
       Produce only one reflection_thought step!
-      ${agent.templates.thought_format}
-    steps_prompt: ${agent.templates.allowed_tools}
+      # ${agent.templates.thought_format}
+    # steps_prompt: ${agent.templates.allowed_tools}
+    steps_prompt: ${agent.templates.allowed_steps}
+    steps:
+    - examples.rl_webagent.steps.ReflectionThought
     trim_obs_except_last_n: 3 # keep the last 3 observations from the tape in prompt messages
     max_chars_page_observation: 3000 # keep up to 3000 chars in PageObservation steps
   - _target_: examples.rl_webagent.agent.WebNode
     name: act
     system_prompt: ${agent.templates.system_prompt}
+    # guidance: |
+    #   Produce the single next tool call to be performed with the current page.
+    #   If you think that the task is solved, call the FinalAnswer.
+    #   You can interact with the page elements using their BIDs or coordinates as arguments for actions.
+    #   HINTS:
+    #   - You can use the BIDs of the elements or the mouse position in x, y coordinates to interact with them.
+    #   - To select value in a dropdown or combobox, ALWAYS use SelectOption tool.
+    #   - To click on a checkbox or radio button, ALWAYS use BID (or coordinates) of the corresponding Text and not the BID (or coordinates) of the element itself.
+    #   - Press enter key to submit the search query.
     guidance: |
-      Produce the single next tool call to be performed with the current page.
-      If you think that the task is solved, call the FinalAnswer.
+      Produce the next action to be performed with the current page.
+      If you think that the task is solved, produce the final_answer_action.
       You can interact with the page elements using their BIDs or coordinates as arguments for actions.
       HINTS:
       - You can use the BIDs of the elements or the mouse position in x, y coordinates to interact with them.
-      - To select value in a dropdown or combobox, ALWAYS use SelectOption tool.
+      - To select value in a dropdown or combobox, ALWAYS use select_action.
       - To click on a checkbox or radio button, ALWAYS use BID (or coordinates) of the corresponding Text and not the BID (or coordinates) of the element itself.
       - Press enter key to submit the search query.
-    use_known_actions: true
-    use_function_calls: true
+      - Always produce only one step at a time.
+      - Step kind is always lowercase and underscore separated.
+    # steps_prompt: ${agent.templates.allowed_tools}
+    steps_prompt: ${agent.templates.allowed_steps}
+    # use_known_actions: true
     steps:
-    - examples.rl_webagent.steps.FinalAnswerAction
+    - examples.rl_webagent.steps.WebAgentAction
     trim_obs_except_last_n: 3 # keep the last 3 observations from the tape in prompt messages
     max_chars_page_observation: 3000 # keep up to 3000 chars in PageObservation steps
     next_node: reflect
@@ -120,11 +148,11 @@ agent:
 start_attempts: 3 # number of attempts to start each task
 environment:
   _target_: pipelinerl.domains.miniwob.environment_server.WebEnvironmentServer
-  miniwob_url: file:///home/toolkit/miniwob-plusplus/miniwob/html/miniwob/
-  n_envs: 64
+  miniwob_url: ???
+  n_envs: 8
   host: "0.0.0.0"
   max_session_inactivity_secs: 600 # kill session after 10 minutes of inactivity
-  env_call_timeout: 60 # timeout for each environment call (e.g. start_task, act, etc.)
+  env_call_timeout: 120 # timeout for each environment call (e.g. start_task, act, etc.)
   web_env_target: examples.rl_webagent.environment.WebEnvironment
   exp_path: ${output_dir}/env_server
   headless: true
@@ -136,6 +164,6 @@ dataset_loader_params:
   train_split: 0.6 # 0.6 of tasks for training, 0.4 for testing
   seeds: [0, 42, 1337, 900, 103]
   train_dataset_names:
-    - train
+    - debug
   test_dataset_names:
-    - test
+    - debug

From 824d841861ac1d2314d604ede6ac6f90ac724b24 Mon Sep 17 00:00:00 2001
From: Nicolas Gontier
Date: Wed, 9 Jul 2025 21:38:34 +0000
Subject: [PATCH 013/166] update README

---
 pipelinerl/domains/miniwob/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipelinerl/domains/miniwob/README.md b/pipelinerl/domains/miniwob/README.md
index 04e63120..0539f078 100644
--- a/pipelinerl/domains/miniwob/README.md
+++ b/pipelinerl/domains/miniwob/README.md
@@ -26,4 +26,4 @@ The environment server will need to have playwright installed.

 ## Launch Command

-`python -m pipelinerl.launch --config-name miniwob`
+`python -m pipelinerl.launch --config-name environment.miniwob_url=file:///PATH/TO/miniwob-plusplus/miniwob/html/miniwob/`

From 8d170eccad7d21985d79acd3132302365184b8fe Mon Sep 17 00:00:00 2001
From: Nicolas Gontier
Date: Thu, 10 Jul 2025 19:53:31 +0000
Subject: [PATCH 014/166] debug overflow counter

---
 pipelinerl/domains/miniwob/rollouts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py
index fb437658..22d1511e 100644
--- a/pipelinerl/domains/miniwob/rollouts.py
+++ b/pipelinerl/domains/miniwob/rollouts.py
@@ -123,7 +123,7 @@ async def generate_miniwob_rollout(
     ]

     # (4) For each LLM interaction in the tape, make a training example.
-    all_finished = 0
+    all_finished = 1
     prompt_tokens = [llm_call.prompt_length_tokens for llm_call in llm_calls]
     output_tokens = [llm_call.output_length_tokens for llm_call in llm_calls]
     training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls]

From 21a1b2afc91d8d7dc42eb0292b698d071a094f87 Mon Sep 17 00:00:00 2001
From: Nicolas Gontier
Date: Thu, 10 Jul 2025 19:54:52 +0000
Subject: [PATCH 015/166] fix prompts

---
 conf/miniwob.yaml | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml
index 8a9eb6cd..6ab67eaf 100644
--- a/conf/miniwob.yaml
+++ b/conf/miniwob.yaml
@@ -75,6 +75,9 @@ agent:
     thought_format: |
       Important! Respond with the plain text, do not include any JSON or code.
       Do not output anything besides what I asked in this message.
+    json_format: |
+      Important! Respond with parsable JSON, do not include any text or code.
+      Do not output anything besides one JSON object.
   nodes:
   - _target_: examples.rl_webagent.agent.WebNode
     name: set_goal
@@ -90,7 +93,8 @@ agent:
       - review the instructions from the user and the content of the page.
       - outline the main task to be accomplished and the steps to be taken to achieve it.
       - produce definition of done, that will be checked later to verify if the task was completed.
-      Produce only one step!
+      Produce only one reasoning_thought step!
+      ${agent.templates.json_format}
     # steps_prompt: ${agent.templates.allowed_tools}
     steps_prompt: ${agent.templates.allowed_steps}
     steps:
     - tapeagents.steps.ReasoningThought
     trim_obs_except_last_n: 3 # keep the last 3 observations from the tape in prompt messages
     max_chars_page_observation: 3000 # keep up to 3000 chars in PageObservation steps
   - _target_: examples.rl_webagent.agent.WebNode
@@ -104,6 +108,7 @@ agent:
       Review the current state of the page and previous steps to find the best possible next action to accomplish the task.
       Produce the reflection_thought to describe the current page state, reflect on your last action, describe what is left to do, and what will be the immediate next action.
       Produce only one reflection_thought step!
+      ${agent.templates.json_format}
       # ${agent.templates.thought_format}
     # steps_prompt: ${agent.templates.allowed_tools}
     steps_prompt: ${agent.templates.allowed_steps}
@@ -134,11 +139,10 @@ agent:
       - Press enter key to submit the search query.
       - Always produce only one step at a time.
       - Step kind is always lowercase and underscore separated.
+      ${agent.templates.json_format}
     # steps_prompt: ${agent.templates.allowed_tools}
     steps_prompt: ${agent.templates.allowed_steps}
-    # use_known_actions: true
-    steps:
-    - examples.rl_webagent.steps.WebAgentAction
+    use_known_actions: true
     trim_obs_except_last_n: 3 # keep the last 3 observations from the tape in prompt messages
     max_chars_page_observation: 3000 # keep up to 3000 chars in PageObservation steps
     next_node: reflect
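With these prompt changes the agent no longer emits native tool calls; each node must answer with exactly one JSON object whose `kind` names a step schema (lowercase, underscore-separated). A hypothetical example of a completion that would parse under this protocol — the field names are assumptions based on the step classes referenced in the config, not confirmed schemas:

```python
import json

# Hypothetical LLM output for the set_goal node (reasoning_thought step):
raw_completion = '{"kind": "reasoning_thought", "reasoning": "Find the OK button, click it, then submit."}'
step = json.loads(raw_completion)  # a parse failure here becomes an LLMOutputParsingFailureAction
assert step["kind"] == "reasoning_thought"
```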
Produce only one reflection_thought step! + ${agent.templates.json_format} # ${agent.templates.thought_format} # steps_prompt: ${agent.templates.allowed_tools} steps_prompt: ${agent.templates.allowed_steps} @@ -134,11 +139,10 @@ agent: - Press enter key to submit the search query. - Always produce only one step at a time. - Step kind is always lowercase and underscore separated. + ${agent.templates.json_format} # steps_prompt: ${agent.templates.allowed_tools} steps_prompt: ${agent.templates.allowed_steps} - # use_known_actions: true - steps: - - examples.rl_webagent.steps.WebAgentAction + use_known_actions: true trim_obs_except_last_n: 3 # keep the last 3 observations from the tape in prompt messages max_chars_page_observation: 3000 # keep up to 3000 chars in PageObservation steps next_node: reflect From 05b67941586085a0bde3cd43432dbce38b1e1c12 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 11 Jul 2025 14:44:22 +0000 Subject: [PATCH 016/166] update readme --- pipelinerl/domains/miniwob/README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pipelinerl/domains/miniwob/README.md b/pipelinerl/domains/miniwob/README.md index 0539f078..9ff8461c 100644 --- a/pipelinerl/domains/miniwob/README.md +++ b/pipelinerl/domains/miniwob/README.md @@ -14,6 +14,11 @@ pip install 'tapeagents[finetune,converters]=0.1.12' cd ../PipelineRL ``` +Make sure to add the TapeAgent folder to your python path. +```bash +export PYTHONPATH="/path/to/TapeAgents:$PYTHONPATH" +``` + ### Miniwob see setup here: https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/README.md From ef6b2b02687d642d1d9eeec3ed6822f7651aa00d Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 21 Jul 2025 17:52:46 +0000 Subject: [PATCH 017/166] flag tape as invalid instead of raising http errors --- pipelinerl/domains/miniwob/rollouts.py | 62 ++++++++++++++++---------- 1 file changed, 39 insertions(+), 23 deletions(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 22d1511e..5d483a0b 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -33,7 +33,8 @@ def tape_contains_an_error(tape: WebTape) -> bool: - the last step is a PageObservation with an error """ return ( - isinstance(tape.steps[-1], LLMOutputParsingFailureAction) + len(tape.steps) == 0 + or isinstance(tape.steps[-1], LLMOutputParsingFailureAction) or tape.metadata.result.get("error") is not None or (isinstance(tape.steps[-1], PageObservation) and tape.steps[-1].error) ) @@ -63,6 +64,7 @@ async def generate_miniwob_rollout( env_job_url = f"http://{env_job.hostname}:{env_job.port}" # (2) Generate environment, TapeAgent, and run them to get a Tape + no_error = True # track if there was an error in the tape environment = AsyncRemoteEnvironment(server_url=env_job_url) # type: ignore async with environment.acontext(session, wait_for_env=True) as env: start_attempts = cfg.start_attempts @@ -72,26 +74,35 @@ async def generate_miniwob_rollout( tape_dict, _ = await env.start_task(problem) break except Exception as e: + logger.warning(f"Failed to start task {problem['dataset']}/{problem['task']}/{problem['seed']}") start_attempts -= 1 if start_attempts <= 0: - if isinstance(e, HTTPException): - raise RuntimeError(f"HTTPException: {e.status_code} {e.detail}") - else: - raise e - logger.warning(f"Failed to start task, retry after 5 seconds: {e}") - await asyncio.sleep(5) + no_error = False + tape_dict = {} + break + # if isinstance(e, HTTPException): + # raise 
RuntimeError(f"HTTPException: {e.status_code} {e.detail}") + # else: + # raise e + else: + logger.warning(f"retry after 5 seconds: {e}") + await asyncio.sleep(5) logger.info(f"Task {problem['dataset']}/{problem['task']}/{problem['seed']} started in {time.perf_counter() - t:.2f} seconds") tape: WebTape = WebTape(**tape_dict) # convert http response dict to WebTape object t = time.perf_counter() - try: - actions = await env.a_actions() - tools_description = await env.a_tools_description() - logger.debug(f"Available tools: {tools_description}") - agent: Agent = instantiate(cfg.agent, known_actions=actions, tools_description=tools_description) - agent.llms = {DEFAULT: llm} - tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) - except Exception as e: - logger.error(f"Error occurred while running agent: {e}") + if no_error: # only run the agent if the task started successfully + logger.info(f"Running agent for task {problem['dataset']}/{problem['task']]}/{problem['seed']}") + try: + actions = await env.a_actions() + tools_description = await env.a_tools_description() + logger.debug(f"Available tools: {tools_description}") + agent: Agent = instantiate(cfg.agent, known_actions=actions, tools_description=tools_description) + agent.llms = {DEFAULT: llm} + tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) + except Exception as e: + logger.error(f"Error occurred while running agent: {e}") + no_error = False + logger.info(f"Agent finished task {problem['dataset']}/{problem['task']}/{problem['seed']} in {time.perf_counter() - t:.2f} seconds") tape.metadata.result = {"execution_time": time.perf_counter() - t} # save the tape as we go @@ -99,13 +110,18 @@ async def generate_miniwob_rollout( save_json_tape(tape, os.path.join(cfg.output_dir, "tapes"), tape.metadata.id) # (3) Compute rewards - last_obs = [step for step in tape if isinstance(step, Observation)][-1] - # in Miniwob, the observation "reward" is defined as RAW_REWARD_GLOBAL > 0 - # see here: https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/src/browsergym/miniwob/base.py#L183 - # Let's take directly the RAW_REWARD_GLOBAL from the metadata - # raw_reward = last_obs.metadata.other.get("reward", 0.0) - raw_reward = last_obs.metadata.other.get("info", {}).get("task_info", {}).get("REWARD_GLOBAL", -1.0) - no_error = not tape_contains_an_error(tape) + obs_steps = [step for step in tape if isinstance(step, Observation)] + if obs_steps: + last_obs = obs_steps[-1] + # in Miniwob, the observation "reward" is defined as RAW_REWARD_GLOBAL > 0 + # see here: https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/src/browsergym/miniwob/base.py#L183 + # Let's take directly the RAW_REWARD_GLOBAL from the metadata + # raw_reward = last_obs.metadata.other.get("reward", 0.0) + raw_reward = last_obs.metadata.other.get("info", {}).get("task_info", {}).get("REWARD_GLOBAL", -1.0) + else: + raw_reward = -1.0 + + no_error = no_error and not tape_contains_an_error(tape) # get the number of LLMOutputParsingFailureAction in the tape n_step_errors = len([step for step in tape.steps if isinstance(step, LLMOutputParsingFailureAction)]) # get the number of PageObservation steps in the tape From 0abc2b094071130249edce0f9b54e9a75fe9cd31 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 21 Jul 2025 17:59:27 +0000 Subject: [PATCH 018/166] use redis --- conf/miniwob.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml 
index 6ab67eaf..faa759ca 100644
--- a/conf/miniwob.yaml
+++ b/conf/miniwob.yaml
@@ -1,5 +1,7 @@
 defaults:
   - base
+  - override streams: redis
+  - _self_

 world:
   actor_fraction: 2

From d3f68893113988c6af2b50bdec97c884336366e2 Mon Sep 17 00:00:00 2001
From: Nicolas Gontier
Date: Mon, 21 Jul 2025 18:04:32 +0000
Subject: [PATCH 019/166] track task names instead of data splits

---
 pipelinerl/domains/miniwob/load_tasks.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/pipelinerl/domains/miniwob/load_tasks.py b/pipelinerl/domains/miniwob/load_tasks.py
index e5056c80..2c58f0e0 100644
--- a/pipelinerl/domains/miniwob/load_tasks.py
+++ b/pipelinerl/domains/miniwob/load_tasks.py
@@ -56,20 +56,24 @@ def load_tasks(dataset_names: list[str], train_split: float = 0.6, seeds: list[int] = [0]):
     for name in dataset_names:
         if name == "debug":
             tasks.extend([
-                {"dataset": "miniwob.debug", "task": task, "seed": 0} for task in DEBUG_SPLIT
+                # {"dataset": "miniwob.debug", "task": task, "seed": 0} for task in DEBUG_SPLIT
+                {"dataset": task, "task": task, "seed": 0} for task in DEBUG_SPLIT
             ])
         elif name == "easy":
             tasks.extend([
-                {"dataset": "miniwob.easy", "task": task, "seed": 0} for task in EASY_SPLIT
+                # {"dataset": "miniwob.easy", "task": task, "seed": 0} for task in EASY_SPLIT
+                {"dataset": task, "task": task, "seed": 0} for task in EASY_SPLIT
             ])
         elif name == "train":
             tasks.extend([
-                {"dataset": "miniwob.train", "task": task, "seed": seed}
+                # {"dataset": "miniwob.train", "task": task, "seed": seed}
+                {"dataset": task, "task": task, "seed": seed}
                 for task in TRAIN_SPLIT for seed in seeds
             ])
         elif name == "test":
             tasks.extend([
-                {"dataset": "miniwob.test", "task": task, "seed": seed}
+                # {"dataset": "miniwob.test", "task": task, "seed": seed}
+                {"dataset": task, "task": task, "seed": seed}
                 for task in TEST_SPLIT for seed in seeds
             ])
     return tasks
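After PATCH 019, the `dataset` field carries the task name itself rather than the split name, so downstream metrics can be grouped per task. A quick sketch of the resulting entries (the task names here are placeholders, not the actual contents of the MiniWoB splits):

```python
# Before: every entry in a split shared one label, e.g. "miniwob.train".
# After: each entry is labeled by its own task.
tasks = [
    {"dataset": "click-button", "task": "click-button", "seed": 0},
    {"dataset": "click-button", "task": "click-button", "seed": 42},
]
```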
From 9c319e3eeb95e6d448d57f6e4ad0de1a69623c2e Mon Sep 17 00:00:00 2001
From: Nicolas Gontier
Date: Mon, 21 Jul 2025 19:31:55 +0000
Subject: [PATCH 020/166] fix

---
 pipelinerl/domains/miniwob/rollouts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py
index 5d483a0b..e3f039dd 100644
--- a/pipelinerl/domains/miniwob/rollouts.py
+++ b/pipelinerl/domains/miniwob/rollouts.py
@@ -91,7 +91,7 @@ async def generate_miniwob_rollout(
         tape: WebTape = WebTape(**tape_dict)  # convert http response dict to WebTape object
         t = time.perf_counter()
         if no_error:  # only run the agent if the task started successfully
-            logger.info(f"Running agent for task {problem['dataset']}/{problem['task']]}/{problem['seed']}")
+            logger.info(f"Running agent for task {problem['dataset']}/{problem['task']}/{problem['seed']}")
             try:
                 actions = await env.a_actions()
                 tools_description = await env.a_tools_description()

From 92c8a93b227c65b5f044c04fafc33cc185f5b4d5 Mon Sep 17 00:00:00 2001
From: Nicolas Gontier
Date: Tue, 22 Jul 2025 19:38:27 +0000
Subject: [PATCH 021/166] remove unused var in new tapeagent remote_env

---
 conf/miniwob.yaml                                | 1 -
 pipelinerl/domains/miniwob/environment_server.py | 4 +---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml
index faa759ca..926f176b 100644
--- a/conf/miniwob.yaml
+++ b/conf/miniwob.yaml
@@ -157,7 +157,6 @@ environment:
   miniwob_url: ???
   n_envs: 8
   host: "0.0.0.0"
-  max_session_inactivity_secs: 600 # kill session after 10 minutes of inactivity
   env_call_timeout: 120 # timeout for each environment call (e.g. start_task, act, etc.)
   web_env_target: examples.rl_webagent.environment.WebEnvironment
   exp_path: ${output_dir}/env_server
diff --git a/pipelinerl/domains/miniwob/environment_server.py b/pipelinerl/domains/miniwob/environment_server.py
index db0072c5..b30f9ef7 100644
--- a/pipelinerl/domains/miniwob/environment_server.py
+++ b/pipelinerl/domains/miniwob/environment_server.py
@@ -13,14 +13,12 @@ def __init__(self,
         exp_path: str,
         headless: bool = True,
         observation_format: str = "html",
-        max_session_inactivity_secs: int = 600,
         env_call_timeout: int = 60,
     ):
         os.environ["MINIWOB_URL"] = miniwob_url
         # Remote environment server configuration
         self.n_envs = n_envs
         self.host = host
-        self.max_session_inactivity_secs = max_session_inactivity_secs
         self.env_call_timeout = env_call_timeout
         # Individual web environment configuration
         self.web_env_target = web_env_target
         self.exp_path = exp_path
         self.headless = headless
@@ -33,7 +31,7 @@ def launch(self, port: int):
         """
         Serve the web environment in TapeAgent.
         """
-        env_server = EnvironmentServer(n_envs=self.n_envs, host=self.host, port=port, max_session_inactivity_secs=self.max_session_inactivity_secs, env_call_timeout=self.env_call_timeout)
+        env_server = EnvironmentServer(n_envs=self.n_envs, host=self.host, port=port, env_call_timeout=self.env_call_timeout)
         env_server.launch(OmegaConf.create({
             "_target_": self.web_env_target,
             "exp_path": self.exp_path,

From edf4d000a10cd166116666659a44fbc9495807d6 Mon Sep 17 00:00:00 2001
From: Nicolas Gontier
Date: Wed, 23 Jul 2025 18:33:11 +0000
Subject: [PATCH 022/166] use BaseMetrics

---
 pipelinerl/domains/miniwob/rollouts.py | 34 +++++++++++++++++---------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py
index e3f039dd..b2de5373 100644
--- a/pipelinerl/domains/miniwob/rollouts.py
+++ b/pipelinerl/domains/miniwob/rollouts.py
@@ -25,6 +25,18 @@
 logger = logging.getLogger(__name__)

+class MiniwobMetrics(BaseMetrics):
+    reward: float
+    success: bool
+    no_error: bool
+    no_answer: bool
+    overflow: bool
+    n_llm_calls: int
+    n_step_errors: int
+    n_page_observations: int
+    n_steps: int
+
+
 def tape_contains_an_error(tape: WebTape) -> bool:
     """
     Returns true if the tape ends with an error, ie if one of the following is true:
@@ -149,17 +161,17 @@ async def generate_miniwob_rollout(
     latency = time.time() - start_time

-    metrics = {
-        "reward": reward,
-        "success": 1 if reward > 0.5 else 0,
-        "no_error": no_error,
-        "no_answer": 1 if reward < 0 else 0,
-        "overflow": 0 if all_finished else 1,
-        "n_llm_calls": n_llm_calls,
-        "n_step_errors": n_step_errors,
-        "n_page_observations": n_page_observations,
-        "n_steps": len(tape.steps),
-    }
+    metrics = MiniwobMetrics(
+        reward=reward,
+        success=reward > 0.5,
+        no_error=no_error,
+        no_answer=reward < 0,
+        overflow=not all_finished,
+        n_llm_calls=n_llm_calls,
+        n_step_errors=n_step_errors,
+        n_page_observations=n_page_observations,
+        n_steps=len(tape.steps),
+    )

     return RolloutResult(
         training_texts=training_texts,
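Replacing the plain dict with a `MiniwobMetrics` model means field names and types are checked once, when the rollout is produced, instead of failing later in aggregation. A minimal sketch of what that buys, assuming `BaseMetrics` is a pydantic model (which its use in `pipelinerl.rollouts` suggests, though the base class is not shown in this diff):

```python
from pydantic import BaseModel

class Metrics(BaseModel):  # stand-in for BaseMetrics
    reward: float
    success: bool

m = Metrics(reward=1, success=0.9 > 0.5)  # int coerced to float, bool validated
print(m.model_dump())                     # {'reward': 1.0, 'success': True}
```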
 from pipelinerl.async_llm import llm_async_generate, make_training_text
-from pipelinerl.rollouts import RolloutResult
+from pipelinerl.rollouts import BaseMetrics, RolloutResult
 from pipelinerl.world import Job

From a4f9f79bec163b558e24a660636d221c0b39508b Mon Sep 17 00:00:00 2001
From: Nicolas Gontier
Date: Wed, 23 Jul 2025 21:18:22 +0000
Subject: [PATCH 024/166] keep track of time taken

---
 pipelinerl/domains/miniwob/rollouts.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py
index eb6d563b..43bf70d3 100644
--- a/pipelinerl/domains/miniwob/rollouts.py
+++ b/pipelinerl/domains/miniwob/rollouts.py
@@ -35,6 +35,9 @@ class MiniwobMetrics(BaseMetrics):
     n_step_errors: int
     n_page_observations: int
     n_steps: int
+    total_execution_time: float
+    agent_execution_time: float
+    environment_execution_time: float
@@ -115,7 +118,7 @@ async def generate_miniwob_rollout(
             except Exception as e:
                 logger.error(f"Error occurred while running agent: {e}")
                 no_error = False
         logger.info(f"Agent finished task {problem['dataset']}/{problem['task']}/{problem['seed']} in {time.perf_counter() - t:.2f} seconds")
-        tape.metadata.result = {"execution_time": time.perf_counter() - t}
+        tape.metadata.result.update({"total_execution_time": time.perf_counter() - t})

     # save the tape as we go
@@ -171,6 +174,9 @@ async def generate_miniwob_rollout(
         n_page_observations=n_page_observations,
         n_steps=len(tape.steps),
+        total_execution_time=tape.metadata.result.get("total_execution_time", -1.0),
+        agent_execution_time=tape.metadata.result.get("agent_execution_time", -1.0),
+        environment_execution_time=tape.metadata.result.get("environment_execution_time", -1.0),
     )
From 8a6120f1a8261ffac963f9602d2bd503afe86a39 Mon Sep 17 00:00:00 2001
From: Oleh Shliazhko
Date: Thu, 24 Jul 2025 11:11:33 +0200
Subject: [PATCH 025/166] send per step times to wandb

---
 pipelinerl/domains/miniwob/rollouts.py | 45 +++++++++++++++-----------
 1 file changed, 27 insertions(+), 18 deletions(-)

diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py
index 43bf70d3..5c56c92a 100644
--- a/pipelinerl/domains/miniwob/rollouts.py
+++ b/pipelinerl/domains/miniwob/rollouts.py
@@ -1,26 +1,24 @@
-
 import asyncio
 import logging
 import os
 import random
 import time
+
 import aiohttp
-from fastapi import HTTPException
+from examples.rl_webagent.steps import WebTape
 from hydra.utils import instantiate
 from omegaconf import DictConfig
-
-from pipelinerl.async_llm import llm_async_generate, make_training_text
-from pipelinerl.rollouts import BaseMetrics, RolloutResult
-from pipelinerl.world import Job
-from tapeagents.agent import Agent, DEFAULT
-from tapeagents.core import LLMOutputParsingFailureAction, Observation, LLMCall
+from tapeagents.agent import DEFAULT, Agent
+from tapeagents.core import LLMCall, LLMOutputParsingFailureAction, Observation
+from tapeagents.io import save_json_tape
 from tapeagents.llms.trainable import TrainableLLM
+from tapeagents.orchestrator import async_execute_agent
 from tapeagents.remote_environment import AsyncRemoteEnvironment
 from tapeagents.tools.simple_browser import PageObservation
-from tapeagents.orchestrator import async_execute_agent
-from tapeagents.io import save_json_tape
-from examples.rl_webagent.steps import WebTape
+
+from pipelinerl.async_llm import make_training_text
+from pipelinerl.rollouts import BaseMetrics, RolloutResult
+from pipelinerl.world import Job

 logger = logging.getLogger(__name__)
@@ -38,6 +36,8 @@ class MiniwobMetrics(BaseMetrics):
     total_execution_time: float
     agent_execution_time: float
     environment_execution_time: float
+    env_step_time: float
+    agent_step_time: float
@@ -102,7 +102,9 @@ async def generate_miniwob_rollout(
                 else:
                     logger.warning(f"retry after 5 seconds: {e}")
                     await asyncio.sleep(5)
-        logger.info(f"Task {problem['dataset']}/{problem['task']}/{problem['seed']} started in {time.perf_counter() - t:.2f} seconds")
+        logger.info(
+            f"Task {problem['dataset']}/{problem['task']}/{problem['seed']} started in {time.perf_counter() - t:.2f} seconds"
+        )
         tape: WebTape = WebTape(**tape_dict)  # convert http response dict to WebTape object
         t = time.perf_counter()
         if no_error:  # only run the agent if the task started successfully
@@ -117,7 +119,9 @@ async def generate_miniwob_rollout(
             except Exception as e:
                 logger.error(f"Error occurred while running agent: {e}")
                 no_error = False
-        logger.info(f"Agent finished task {problem['dataset']}/{problem['task']}/{problem['seed']} in {time.perf_counter() - t:.2f} seconds")
+        logger.info(
+            f"Agent finished task {problem['dataset']}/{problem['task']}/{problem['seed']} in {time.perf_counter() - t:.2f} seconds"
+        )
         tape.metadata.result.update({"total_execution_time": time.perf_counter() - t})
@@ -148,7 +152,8 @@ async def generate_miniwob_rollout(
     llm_calls = [step for step in tape.steps if step.metadata.other.get("llm_call") is not None]
     n_llm_calls = len(llm_calls)
     llm_calls: list[LLMCall] = [
-        LLMCall(**step.metadata.other["llm_call"]) if isinstance(step.metadata.other["llm_call"], dict)
+        LLMCall(**step.metadata.other["llm_call"])
+        if isinstance(step.metadata.other["llm_call"], dict)
         else step.metadata.other["llm_call"]
         for step in llm_calls
     ]
@@ -163,7 +168,10 @@ async def generate_miniwob_rollout(
         all_finished &= 1 if text.input_ids[-1] == llm.tokenizer.eos_token_id else 0

     latency = time.time() - start_time
-
+    agent_time = tape.metadata.result.get("agent_execution_time", -1.0)
+    env_time = tape.metadata.result.get("environment_execution_time", -1.0)
+    n_observations = len([s for s in tape.steps if isinstance(s, Observation)])
+    n_other_steps = len(tape.steps) - n_observations
     metrics = MiniwobMetrics(
         reward=reward,
         success=reward > 0.5,
@@ -175,8 +183,10 @@ async def generate_miniwob_rollout(
         n_page_observations=n_page_observations,
         n_steps=len(tape.steps),
         total_execution_time=tape.metadata.result.get("total_execution_time", -1.0),
-        agent_execution_time=tape.metadata.result.get("agent_execution_time", -1.0),
-        environment_execution_time=tape.metadata.result.get("environment_execution_time", -1.0),
+        agent_execution_time=agent_time,
+        environment_execution_time=env_time,
+        env_step_time=env_time / n_observations if env_time > 0 and n_observations > 0 else -1.0,
+        agent_step_time=agent_time / n_other_steps if agent_time > 0 and n_other_steps > 0 else -1.0,
     )

     return RolloutResult(
@@ -187,4 +197,3 @@ async def generate_miniwob_rollout(
         prompt_tokens=prompt_tokens,
         output_tokens=output_tokens,
     )
-
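The per-step times reported above are plain averages with a sentinel for missing data. A worked example of the guard logic with made-up values:

```python
# env time recorded, agent time missing (values are illustrative):
env_time, n_observations = 30.0, 10    # -> 3.0 seconds per environment observation
agent_time, n_other_steps = -1.0, 12   # no timing recorded -> report the -1.0 sentinel

env_step_time = env_time / n_observations if env_time > 0 and n_observations > 0 else -1.0
agent_step_time = agent_time / n_other_steps if agent_time > 0 and n_other_steps > 0 else -1.0
assert (env_step_time, agent_step_time) == (3.0, -1.0)
```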
From 3d57d2effaaf4073338d13831f2277b4bdb0d970 Mon Sep 17 00:00:00 2001
From: Alex Piche
Date: Fri, 25 Jul 2025 17:21:01 +0000
Subject: [PATCH 026/166] processed_entries_queue_popped_data

---
 pipelinerl/preprocess.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pipelinerl/preprocess.py b/pipelinerl/preprocess.py
index 65e29b4b..887bdceb 100644
--- a/pipelinerl/preprocess.py
+++ b/pipelinerl/preprocess.py
@@ -637,6 +637,7 @@ def run_preprocessing_loop(
                 "preprocessor/queue/output": output_queue.qsize(),
                 "preprocessor/filtered_out_samples": num_filtered_out,
                 "preprocessor/total_filtered_out_samples": total_filtered_out,
+                "preprocessor/popped_entries_queue": processed_entries_queue_popped_data,
             }
             if stats_aggregator.has_enough_data():
                 stats.update({"preprocessor/" + k: v for k, v in stats_aggregator.get_stats().items()})

From 4fbc5c7dbcd64ac98aad60c8486c3acd913d2d44 Mon Sep 17 00:00:00 2001
From: Alex Piche
Date: Fri, 25 Jul 2025 18:54:27 +0000
Subject: [PATCH 027/166] faster preprocess

---
 conf/base.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/conf/base.yaml b/conf/base.yaml
index 3d426f4c..ac44fdde 100644
--- a/conf/base.yaml
+++ b/conf/base.yaml
@@ -23,9 +23,9 @@ preprocess:
   input: actor
   output: training_data
   n_workers: 8
-  chunk_n_groups: 2
+  chunk_n_groups: 8
   # queue for loaded raw groups
-  raw_queue_size: 8
+  raw_queue_size: 128
   # queue for processed chunks of multiple groups
   input_queue_size: 32
   # queue for ready chunks for multiple groups

From 91acbc4386cf413ed6d5646485b6adbe9b4df799 Mon Sep 17 00:00:00 2001
From: Alex Piche
Date: Fri, 25 Jul 2025 19:16:33 +0000
Subject: [PATCH 028/166] more logging

---
 pipelinerl/preprocess.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pipelinerl/preprocess.py b/pipelinerl/preprocess.py
index 887bdceb..a8676e7a 100644
--- a/pipelinerl/preprocess.py
+++ b/pipelinerl/preprocess.py
@@ -170,6 +170,7 @@ def run_dataset_loader(
     check_group_size: int,
     chunk_n_groups: int,
     pop_old_data: bool,
+    wandb_run,
 ):
     old_and_dropped = 0
     last_time_notice = 0
@@ -196,6 +197,8 @@ def run_dataset_loader(
                     if old_and_dropped // 100 != last_time_notice:
                         logger.info(f"So far removed {old_and_dropped} old elements from preprocessor queue")
                         last_time_notice = old_and_dropped // 100
+                    if wandb_run is not None:
+                        wandb_run.log({"preprocessor/old_and_dropped": old_and_dropped})
                 except Empty:
                     pass
                 # Put new element in now that we made space
@@ -382,6 +385,7 @@ def run_preprocessing_loop(
         check_group_size=cfg.attempts,
         chunk_n_groups=cfg.preprocess.chunk_n_groups,
         pop_old_data=pop_old_data,
+        wandb_run=wandb_run,
     )
     # Start the dataset loader thread using Thread
     dataset_loader_thread = threading.Thread(target=dataset_loader_worker_fn)
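The counters being logged here track a drop-oldest policy: when the preprocessor queue is full, `run_dataset_loader` evicts the stalest element to make room for fresh rollouts rather than blocking. A simplified sketch of that pattern (the real code also counts and logs the drops, as the diff above shows):

```python
from queue import Empty, Full, Queue

def put_drop_oldest(q: Queue, item):
    while True:
        try:
            q.put_nowait(item)
            return
        except Full:
            try:
                q.get_nowait()  # evict the oldest element to make space
            except Empty:
                pass  # another thread drained it first; retry the put
```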
"preprocessor/popped_entries_queue": processed_entries_queue_popped_data, + "preprocessor/dropped_after_preprocessing": processed_entries_queue_popped_data, } if stats_aggregator.has_enough_data(): stats.update({"preprocessor/" + k: v for k, v in stats_aggregator.get_stats().items()}) From 8c78c4517d61d7d8542e334488772d70f94ae946 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Fri, 25 Jul 2025 19:27:39 +0000 Subject: [PATCH 030/166] clean up --- pipelinerl/preprocess.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pipelinerl/preprocess.py b/pipelinerl/preprocess.py index fc238798..d758ff36 100644 --- a/pipelinerl/preprocess.py +++ b/pipelinerl/preprocess.py @@ -170,7 +170,6 @@ def run_dataset_loader( check_group_size: int, chunk_n_groups: int, pop_old_data: bool, - wandb_run, ): old_and_dropped = 0 last_time_notice = 0 @@ -197,8 +196,6 @@ def run_dataset_loader( if old_and_dropped // 100 != last_time_notice: logger.info(f"So far removed {old_and_dropped} old elements from preprocessor queue") last_time_notice = old_and_dropped // 100 - if wandb_run is not None: - wandb_run.log({"preprocessor/dropped_before_preprocessing": old_and_dropped}) except Empty: pass # Put new element in now that we made space @@ -385,7 +382,6 @@ def run_preprocessing_loop( check_group_size=cfg.attempts, chunk_n_groups=cfg.preprocess.chunk_n_groups, pop_old_data=pop_old_data, - wandb_run=wandb_run, ) # Start the dataset loader thread using Thread dataset_loader_thread = threading.Thread(target=dataset_loader_worker_fn) From 5eb3a4eb7b96212f8fad19268b10f09b49fb399e Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 25 Jul 2025 21:05:43 +0000 Subject: [PATCH 031/166] use all miniwob tasks --- conf/miniwob.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index 926f176b..48e66108 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -169,6 +169,6 @@ dataset_loader_params: train_split: 0.6 # 0.6 of tasks for training, 0.4 for testing seeds: [0, 42, 1337, 900, 103] train_dataset_names: - - debug + - train test_dataset_names: - - debug + - test From 1b90a4b9033a9842fdc8c5ae9538c39026c0368f Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Sat, 26 Jul 2025 18:03:48 +0000 Subject: [PATCH 032/166] add groups_in_progress --- pipelinerl/actor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index dad79e0b..b0908d08 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -498,6 +498,7 @@ def run(self, dataset: list[tuple[str, dict]]): "finished_groups": finished_groups, "trainer_model_version": trainer_version_to_publish, "time_since_start": time.time() - loop_start_time, + "groups_in_progress": in_progress, } trainer_version_to_publish = None else: From 3c8f338e9c5bd7c41106333de785f6ce68a026a7 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Sat, 26 Jul 2025 18:54:34 +0000 Subject: [PATCH 033/166] raise when finetune is done --- pipelinerl/finetune_loop.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelinerl/finetune_loop.py b/pipelinerl/finetune_loop.py index 68bbfb17..0948e056 100644 --- a/pipelinerl/finetune_loop.py +++ b/pipelinerl/finetune_loop.py @@ -483,6 +483,7 @@ def run_finetuning_loop( finally: if actor_update_group: dist.destroy_process_group(actor_update_group) + raise RuntimeError("Finetuning loop finished, exiting worker thread") def rl_finetuning_worker( From f88dceb9704be64d95dede133bd79f8a1725e430 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Sun, 27 Jul 2025 21:40:05 
From f88dceb9704be64d95dede133bd79f8a1725e430 Mon Sep 17 00:00:00 2001
From: Alex Piche
Date: Sun, 27 Jul 2025 21:40:05 +0000
Subject: [PATCH 034/166] constant lr

---
 conf/finetune/base.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conf/finetune/base.yaml b/conf/finetune/base.yaml
index 237e6d56..6fb09310 100644
--- a/conf/finetune/base.yaml
+++ b/conf/finetune/base.yaml
@@ -36,7 +36,7 @@ learning_rate: 1e-6
 # How much to clip the gradient (no clipping if null)
 gradient_clipping_threshold: 0.3
 # Learning rate scheduler type (indexed by completed_steps).
-lr_scheduler_type: cosine # could be cosine, constant_with_warmup
+lr_scheduler_type: constant # could be cosine, constant_with_warmup
 # Number of warmup (completed) steps in the learning rate schedule.
 num_warmup_steps: 50
 # Number of gradient accumulation steps.

From 75d3c9c303a8372e1d02673526b77691a6990ef3 Mon Sep 17 00:00:00 2001
From: Nicolas Gontier
Date: Mon, 28 Jul 2025 14:45:57 +0000
Subject: [PATCH 035/166] default save checkpoints

---
 conf/miniwob.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml
index 48e66108..16cd1068 100644
--- a/conf/miniwob.yaml
+++ b/conf/miniwob.yaml
@@ -16,7 +16,6 @@ model_path: meta-llama/Llama-3.1-8B-Instruct

 finetune:
-  save_checkpoint_steps: 10
   seq_length: 8192
   train_batch_size: 1
   gradient_accumulation_passes: 512

From 6b97c7b0cbb607d83ebb927b84b0ae8b61c8538a Mon Sep 17 00:00:00 2001
From: Nicolas Gontier
Date: Mon, 28 Jul 2025 15:03:22 +0000
Subject: [PATCH 036/166] update vllm max tokens

---
 conf/base.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/conf/base.yaml b/conf/base.yaml
index 3d426f4c..2f20d18c 100644
--- a/conf/base.yaml
+++ b/conf/base.yaml
@@ -47,7 +47,7 @@ llm:
     temperature: 1.0
 test_llm:
   parameters:
-    max_tokens: 16000
+    max_tokens: 8192
     temperature: 1.0
     top_p: 0.95
     top_k: 50
@@ -67,6 +67,7 @@ vllm_config:
     tensor-parallel-size: 1
     pipeline-parallel-size: 1
     generation-config: vllm
+    max_model_len: 10000

 world:
   replicas: 1

From d3cf30b9c44ba03fc9edb1ae2507418bad5828cc Mon Sep 17 00:00:00 2001
From: Nicolas Gontier
Date: Mon, 28 Jul 2025 18:31:18 +0000
Subject: [PATCH 037/166] assert group size is as expected

---
 pipelinerl/actor.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py
index dad79e0b..6a1f2447 100644
--- a/pipelinerl/actor.py
+++ b/pipelinerl/actor.py
@@ -462,6 +462,9 @@ def run(self, dataset: list[tuple[str, dict]]):

                 assert isinstance(rollout_results, list)
                 assert isinstance(rollout_results[0], RolloutResult)
+                assert len(rollout_results) == attempts, (
+                    f"Expected {attempts} rollouts, got {len(rollout_results)}"
+                )

                 group_samples = sum(len(r.training_texts) for r in rollout_results)
                 published_samples += group_samples

From 4c50f1f4e9721e1956c2fe794a8f87c2908397e4 Mon Sep 17 00:00:00 2001
From: Nicolas Gontier
Date: Mon, 28 Jul 2025 18:32:10 +0000
Subject: [PATCH 038/166] assert finetuning length is as much as vllm max
 length

---
 pipelinerl/launch.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/pipelinerl/launch.py b/pipelinerl/launch.py
index b03ab8d7..ac87457e 100644
--- a/pipelinerl/launch.py
+++ b/pipelinerl/launch.py
@@ -71,6 +71,13 @@ def validate_config(cfg: DictConfig):
         if not hasattr(cfg.finetune.rl, "value_loss_coef") or cfg.finetune.rl.value_loss_coef <= 0.0:
             raise ValueError("value_loss_coef must be greater than 0 when using causal-language-modeling-with-value-head")

+    if cfg.finetune.seq_length < cfg.vllm_config.vllm_kwargs.max_model_len:
+        raise ValueError(
+            f"seq_length {cfg.finetune.seq_length} must be greater than or equal to "
+            f"vllm_kwargs.max_model_len {cfg.vllm_config.vllm_kwargs.max_model_len}"
+        )
+
+

 def run_ref_llm(cfg: DictConfig, preprocessor_llm_idx: int, local_idx: int, gpus: list[int], exp_dir: Path):
     kwargs = cfg.vllm_config.vllm_kwargs
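Read together, PATCH 036–038 pin down a token budget: every training sequence is prompt plus completion, so the finetuning window must cover at least what vLLM is allowed to produce at sampling time. A small check with illustrative numbers (taken from the miniwob config that follows in this series):

```python
seq_length = 16384     # finetune.seq_length: trainer context window (input + output)
max_model_len = 16384  # vllm_kwargs.max_model_len: prompt + output at sampling time
max_tokens = 4096      # llm.parameters.max_tokens: output budget per LLM call

# the invariant validate_config enforces, plus the sampling-side corollary:
assert max_tokens <= max_model_len <= seq_length
```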
From ff61d73dc92ad71cba9bf4d2f6cdf8a5d2f9b00e Mon Sep 17 00:00:00 2001
From: Nicolas Gontier
Date: Mon, 28 Jul 2025 18:32:47 +0000
Subject: [PATCH 039/166] update finetuning & vllm max lengths

---
 conf/miniwob.yaml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml
index 16cd1068..225a7e6e 100644
--- a/conf/miniwob.yaml
+++ b/conf/miniwob.yaml
@@ -16,7 +16,7 @@ output_dir: results/miniwob_debug/${now:%Y-%m-%d}/${now:%H-%M-%S}
 model_path: meta-llama/Llama-3.1-8B-Instruct

 finetune:
-  seq_length: 8192
+  seq_length: 16384
   train_batch_size: 1
   gradient_accumulation_passes: 512
   learning_rate: 1e-6
@@ -38,8 +38,9 @@ test_llm:
     top_p: 1.0
     top_k: 50

-# vllm_config:
-#   vllm_kwargs:
+vllm_config:
+  vllm_kwargs:
+    max_model_len: 16384
 #     enable-auto-tool-choice: ""
 #     tool-call-parser: llama3_json # use hermes for qwen
 #     chat_template: pipelinerl/domains/miniwob/tool_chat_template_llama3.1_json.jinja # copy pasted from https://github.com/vllm-project/vllm/blob/main/examples/tool_chat_template_llama3.1_json.jinja

From a00e6e63a74d3212c53c41bdf681fad6b5e94162 Mon Sep 17 00:00:00 2001
From: Nicolas Gontier
Date: Mon, 28 Jul 2025 21:19:48 +0000
Subject: [PATCH 040/166] debug agent

---
 conf/miniwob.yaml                      | 2 ++
 pipelinerl/domains/miniwob/rollouts.py | 4 ----
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml
index 225a7e6e..91c89ee6 100644
--- a/conf/miniwob.yaml
+++ b/conf/miniwob.yaml
@@ -145,6 +145,8 @@ agent:
     # steps_prompt: ${agent.templates.allowed_tools}
     steps_prompt: ${agent.templates.allowed_steps}
     use_known_actions: true
+    steps:
+    - examples.rl_webagent.steps.FinalAnswerAction
     trim_obs_except_last_n: 3 # keep the last 3 observations from the tape in prompt messages
     max_chars_page_observation: 3000 # keep up to 3000 chars in PageObservation steps
     next_node: reflect
diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py
index 5c56c92a..3d3287be 100644
--- a/pipelinerl/domains/miniwob/rollouts.py
+++ b/pipelinerl/domains/miniwob/rollouts.py
@@ -95,10 +95,6 @@ async def generate_miniwob_rollout(
                     no_error = False
                     tape_dict = {}
                     break
-                    # if isinstance(e, HTTPException):
-                    #     raise RuntimeError(f"HTTPException: {e.status_code} {e.detail}")
-                    # else:
-                    #     raise e
                 else:
                     logger.warning(f"retry after 5 seconds: {e}")
                     await asyncio.sleep(5)

From 6f149c89e67094fa1e8633072ba72c534082b099 Mon Sep 17 00:00:00 2001
From: Nicolas Gontier
Date: Fri, 8 Aug 2025 15:05:43 +0000
Subject: [PATCH 041/166] use ppo & upd config

---
 conf/miniwob.yaml | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml
index 91c89ee6..57cd8b89 100644
--- a/conf/miniwob.yaml
+++ b/conf/miniwob.yaml
@@ -1,6 +1,7 @@
 defaults:
   - base
   - override streams: redis
+  - override finetune: ppo
   - _self_

 world:
@@ -12,24 +13,15 @@ world:
 #   mode: actor

 save_tapes: False
-output_dir: results/miniwob_debug/${now:%Y-%m-%d}/${now:%H-%M-%S}
+output_dir: results/miniwob/${now:%Y-%m-%d}/${now:%H-%M-%S}

 model_path: meta-llama/Llama-3.1-8B-Instruct

 finetune:
-  seq_length: 16384
-  train_batch_size: 1
-  gradient_accumulation_passes: 512
-  learning_rate: 1e-6
-  optim: adamw_torch
adamw_torch - rl: - kl_coef: 0.0 # GRPO beta coefficient - reward_minus_kl_coef: 0.0 # RLOO beta coefficient - use_advantages: true - algo: grpo + seq_length: 16384 # input + output tokens llm: parameters: - max_tokens: 4096 + max_tokens: 4096 # output tokens temperature: 1.0 test_llm: parameters: @@ -40,7 +32,7 @@ test_llm: vllm_config: vllm_kwargs: - max_model_len: 16384 + max_model_len: 16384 # input + output tokens # enable-auto-tool-choice: "" # tool-call-parser: llama3_json # use hermes for qwen # chat_template: pipelinerl/domains/miniwob/tool_chat_template_llama3.1_json.jinja # copy pasted from https://github.com/vllm-project/vllm/blob/main/examples/tool_chat_template_llama3.1_json.jinja @@ -157,9 +149,9 @@ start_attempts: 3 # number of attempts to start each task environment: _target_: pipelinerl.domains.miniwob.environment_server.WebEnvironmentServer miniwob_url: ??? - n_envs: 8 + n_envs: 32 host: "0.0.0.0" - env_call_timeout: 120 # timeout for each environment call (e.g. start_task, act, etc.) + env_call_timeout: 600 # timeout for each environment call (e.g. start_task, act, etc.) web_env_target: examples.rl_webagent.environment.WebEnvironment exp_path: ${output_dir}/env_server headless: true From 2ae2dd8d9fc41a733b06cbbd4bc36b95ea0c2434 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 8 Aug 2025 15:05:59 +0000 Subject: [PATCH 042/166] update readme --- pipelinerl/domains/miniwob/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelinerl/domains/miniwob/README.md b/pipelinerl/domains/miniwob/README.md index 9ff8461c..e9af1b42 100644 --- a/pipelinerl/domains/miniwob/README.md +++ b/pipelinerl/domains/miniwob/README.md @@ -31,4 +31,4 @@ The environment server will need to have playwright installed. ## Launch Command -`python -m pipelinerl.launch --config-name environment.miniwob_url=file:///PATH/TO/miniwob-plusplus/miniwob/html/miniwob/` +`python -m pipelinerl.launch --config-name miniwob environment.miniwob_url=file:///PATH/TO/miniwob-plusplus/miniwob/html/miniwob/` From 913c8e27771d307ba1bf84ea76a211463bd2001b Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 11 Aug 2025 14:03:22 +0000 Subject: [PATCH 043/166] stop training after 1k steps --- conf/miniwob.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index 57cd8b89..07943203 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -18,6 +18,7 @@ model_path: meta-llama/Llama-3.1-8B-Instruct finetune: seq_length: 16384 # input + output tokens + max_train_steps: 1000 llm: parameters: From 812aafcc100f4b56a8ebe9e6365e5b109bef4a75 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Fri, 15 Aug 2025 22:39:44 +0000 Subject: [PATCH 044/166] first mcp --- .gitignore | 3 +- conf/mcp/python.json | 16 +++ conf/tir_mcp.yaml | 119 +++++++++++++++++++++++ pipelinerl/domains/math/__init__.py | 2 +- pipelinerl/domains/math/rollouts.py | 45 +++++---- pipelinerl/domains/tir_mcp/__init__.py | 1 + pipelinerl/domains/tir_mcp/env_server.py | 44 +++++++++ pipelinerl/domains/tir_mcp/rollouts.py | 77 +++++++++++++++ 8 files changed, 284 insertions(+), 23 deletions(-) create mode 100644 conf/mcp/python.json create mode 100644 conf/tir_mcp.yaml create mode 100644 pipelinerl/domains/tir_mcp/__init__.py create mode 100644 pipelinerl/domains/tir_mcp/env_server.py create mode 100644 pipelinerl/domains/tir_mcp/rollouts.py diff --git a/.gitignore b/.gitignore index 476aab77..1469bc67 100644 --- a/.gitignore +++ b/.gitignore @@ -120,6 +120,7 @@ celerybeat.pid # SageMath parsed 
files *.sage.py +node_modules/ # Environments .env @@ -185,4 +186,4 @@ results results/ data/ cache/ -dump.rdb \ No newline at end of file +dump.rdb diff --git a/conf/mcp/python.json b/conf/mcp/python.json new file mode 100644 index 00000000..50ccbe69 --- /dev/null +++ b/conf/mcp/python.json @@ -0,0 +1,16 @@ +{ + "mcpServers": { + "python_exec": { + "command": "deno", + "args": [ + "run", + "-N", + "-R=node_modules", + "-W=node_modules", + "--node-modules-dir=auto", + "jsr:@pydantic/mcp-run-python", + "stdio" + ] + } + } +} \ No newline at end of file diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml new file mode 100644 index 00000000..45596607 --- /dev/null +++ b/conf/tir_mcp.yaml @@ -0,0 +1,119 @@ +defaults: + - base + - _self_ + +actor: + rollout_policy: pipelinerl.domains.tir_mcp.generate_math_rollout2 + system_prompt: Please reason step by step, and put your final answer within \boxed{}. + task_template: |- + {task} + +dataset_loader: pipelinerl.domains.math.load_datasets +train_dataset_names: +- open_reasoner_zero_57k +- open_reasoner_zero_extended_72k +test_dataset_names: + - aime_2024 + - amc_2023 + - math_500 + +vllm_config: + use_v1: true + vllm_kwargs: + enable-auto-tool-choice: "" + tool-call-parser: hermes + +environment: + _target_: pipelinerl.domains.tir_mcp.env_server.MCPEnvironmentServer + n_envs: 8 + n_envs_mcp: 7 + n_envs_math: 1 + host: localhost + exp_path: ${output_dir}/env_server + mcp_target: tapeagents.mcp.MCPEnvironment + mcp_config_path: /home/toolkit/research-now-reasoner/pipelinerl/conf/mcp/python.json + mcp_tools_whitelist: + - run_python_code + math_target: pipelinerl.domains.math.MathEnvironment + + +agent_max_loops: 2 +agent: + _target_: tapeagents.agent.Agent + name : mcp_agent + max_iterations: 2 + templates: + system_prompt: | + You are an expert AI Agent trained to assist users with complex information processing tasks. + Your role is to understand user queries and respond in a helpful and accurate manner. + Keep your replies concise and direct. Prioritize clarity and avoid over-elaboration. + Do not express emotions or opinions about user questions. + allowed_tools: | + You have access to the following tools: + {tools_description} + thought_format: | + Important! Respond with the plain text, do not include any JSON or code. + Do not output anything besides what I asked in this message. + allowed_steps: | + You have access to the following tools: + {tools_description} + format: > + Output only a single JSON dict. + Do not repeat the last thought again. + If the last action does not change the observation, do not repeat it! + DO NOT OUTPUT ANYTHING BESIDES THE JSON! DO NOT PLACE ANY COMMENTS INSIDE THE JSON. + It will break the system that processes the output. + + nodes: + - _target_: tapeagents.nodes.StandardNode + name: plan + system_prompt: ${agent.templates.system_prompt} + guidance: | + Write a concise multi-step plan explaining which steps should be performed to find the answer for the given task. + Be specific about how each step should be performed. Only describe the intended actions here, do not perform them yet. + Consider that next steps may depend on results of previous steps, so include conditional branching using "if" statements where needed. + Start with the title "Plan". Every step should have short name and description. 
+ ${agent.templates.thought_format} + steps_prompt: ${agent.templates.allowed_tools} + + - _target_: tapeagents.nodes.StandardNode + name: select + system_prompt: ${agent.templates.system_prompt} + trim_obs_except_last_n: 1 + guidance: | + Select the next step to do to move forward with the plan. Describe the expected effect of the proposed action. + ${agent.templates.thought_format} + steps_prompt: ${agent.templates.allowed_tools} + + - _target_: tapeagents.nodes.StandardNode + name: act + system_prompt: ${agent.templates.system_prompt} + trim_obs_except_last_n: 1 + guidance: Then produce single function call for the next step. If the answer is ready, call GaiaAnswer. + steps: + - examples.gaia_agent.steps.GaiaAnswer + use_known_actions: true + use_function_calls: true + + - _target_: tapeagents.nodes.StandardNode + name: summarize + system_prompt: ${agent.templates.system_prompt} + trim_obs_except_last_n: 1 + guidance: | + Summarize last observation. If its an image, thoroughly describe it with all details. + Describe the results of the last action and observed changes + Do not hallucinate or make up any information, only describe what you see in the observation. + Do not guess or assume action effects, describe only visible changes. + ${agent.templates.thought_format} + + - _target_: tapeagents.nodes.StandardNode + name: reflect + system_prompt: ${agent.templates.system_prompt} + trim_obs_except_last_n: 1 + guidance: | + 1. Evaluate the action's success, explain its effect on current step, overall plan and task solution. + 2. If the last action was not successful, describe errors and the possible reasons for failure. + 3. Check if the current plan step is finished. + 4. If the step is finished, update the following steps of the plan with new information and choose the next step. 
+      ${agent.templates.thought_format}
+    next_node: select
\ No newline at end of file
diff --git a/pipelinerl/domains/math/__init__.py b/pipelinerl/domains/math/__init__.py
index 9aee0b8f..1c7310f2 100644
--- a/pipelinerl/domains/math/__init__.py
+++ b/pipelinerl/domains/math/__init__.py
@@ -1,3 +1,3 @@
 from .load_datasets import load_datasets
-from .rollouts import generate_math_rollout, RewardTable
+from .rollouts import generate_math_rollout, RewardTable, get_reward
 from .verifier_api import MathEnvironment, verify_answer, verify_answer_rpc
\ No newline at end of file
diff --git a/pipelinerl/domains/math/rollouts.py b/pipelinerl/domains/math/rollouts.py
index 41a61021..cdb7ba2a 100644
--- a/pipelinerl/domains/math/rollouts.py
+++ b/pipelinerl/domains/math/rollouts.py
@@ -26,6 +26,28 @@ class RewardTable(BaseModel):
     correct_answer_finished: float
     buffer_tokens: int = 0 # 0 means no overlong reward shaping
 
+def get_reward(answer_status: str, finished: bool, reward_table: RewardTable) -> float:
+    match (answer_status, finished):
+        case ("wrong", False):
+            return reward_table.wrong_answer_not_finished
+        case ("wrong", True):
+            return reward_table.wrong_answer_finished
+        case ("no_answer", False):
+            reward = reward_table.no_answer_not_finished
+        case ("no_answer", True):
+            reward = reward_table.no_answer_finished
+        case ("unparsable", False):
+            reward = reward_table.unparsable_not_finished
+        case ("unparsable", True):
+            reward = reward_table.unparsable_finished
+        case ("correct", False):
+            reward = reward_table.correct_answer_not_finished
+        case ("correct", True):
+            reward = reward_table.correct_answer_finished
+        case _:
+            raise ValueError(f"Invalid answer_status/finished combination: {answer_status}/{finished}")
+
+
 def length_penalty(max_length: int, sequence_length: int, buffer_tokens: int) -> float:
     """
     Compute the overlong penalty
@@ -51,7 +73,7 @@ async def generate_math_rollout(
     latency = time.time() - time_start
     assert llm_call.output.content is not None
 
-    rewards = RewardTable(**dict(cfg.rewards))
+    reward_table = RewardTable(**dict(cfg.rewards))
     discount_factor = cfg.actor.discount_factor
 
     # math_verify is a fast environment, no support for environment replicas for now
@@ -70,26 +92,7 @@ async def generate_math_rollout(
     trace = make_training_text(llm, llm_call)
 
     # Determine reward based on answer status and finished state
-    match (answer_status, trace.finished):
-        case ("wrong", False):
-            reward = rewards.wrong_answer_not_finished
-        case ("wrong", True):
-            reward = rewards.wrong_answer_finished
-        case ("no_answer", False):
-            reward = rewards.no_answer_not_finished
-        case ("no_answer", True):
-            reward = rewards.no_answer_finished
-        case ("unparsable", False):
-            reward = rewards.unparsable_not_finished
-        case ("unparsable", True):
-            reward = rewards.unparsable_finished
-        case ("correct", False):
-            reward = rewards.correct_answer_not_finished
-        case ("correct", True):
-            reward = rewards.correct_answer_finished
-        case _:
-            raise ValueError(f"Invalid answer_status/finished combination: {answer_status}/{trace.finished}")
-
+    reward = get_reward(answer_status, trace.finished, reward_table)
     # Apply discount factor based on output length
     reward *= discount_factor**llm_call.output_length_tokens
     overlong_penalty = 0
diff --git a/pipelinerl/domains/tir_mcp/__init__.py b/pipelinerl/domains/tir_mcp/__init__.py
new file mode 100644
index 00000000..c558147b
--- /dev/null
+++ b/pipelinerl/domains/tir_mcp/__init__.py
@@ -0,0 +1 @@
+from .rollouts import generate_math_rollout2
\ No newline at end
of file diff --git a/pipelinerl/domains/tir_mcp/env_server.py b/pipelinerl/domains/tir_mcp/env_server.py new file mode 100644 index 00000000..53259069 --- /dev/null +++ b/pipelinerl/domains/tir_mcp/env_server.py @@ -0,0 +1,44 @@ +import os +from tapeagents.remote_environment import EnvironmentServer +from omegaconf import OmegaConf +from typing import List + +class MCPEnvironmentServer: + + def __init__(self, + n_envs: int, + n_envs_mcp: int, + n_envs_math: int, + host: str, + mcp_target: str, + mcp_config_path: str, + mcp_tools_whitelist: List[str], + math_target: str, + exp_path: str, + env_call_timeout: int = 60, + ): + # Remote environment server configuration + self.n_envs = n_envs + self.host = host + self.env_call_timeout = env_call_timeout + # Individual web environment configuration + self.mcp_target = mcp_target + self.mcp_config_path = mcp_config_path + self.mcp_tools_whitelist = mcp_tools_whitelist + self.exp_path = exp_path + + + def launch(self, port: int): + """ + Serve the environment in TapeAgent. + """ + if port != 7777: + env_server = EnvironmentServer(n_envs=self.n_envs, host=self.host, port=port, env_call_timeout=self.env_call_timeout) + env_server.launch(OmegaConf.create({ + "_target_": self.mcp_target, + "config_path": self.mcp_config_path, + "tools_whitelist": self.mcp_tools_whitelist, + })) + else: + MathEnvironment.launch(port) + diff --git a/pipelinerl/domains/tir_mcp/rollouts.py b/pipelinerl/domains/tir_mcp/rollouts.py new file mode 100644 index 00000000..3a85804a --- /dev/null +++ b/pipelinerl/domains/tir_mcp/rollouts.py @@ -0,0 +1,77 @@ +import time +import random +import logging + +import aiohttp +from omegaconf import DictConfig +from pydantic import BaseModel +from pipelinerl.world import Job +from tapeagents.core import Prompt +from tapeagents.llms.trainable import TrainableLLM +from tapeagents.remote_environment import AsyncRemoteEnvironment +from pipelinerl.async_llm import llm_async_generate, make_training_text +from tapeagents.orchestrator import async_execute_agent +from tapeagents.agent import DEFAULT, Agent +from hydra.utils import instantiate +from tapeagents.core import StopStep, Tape +from tapeagents.dialog_tape import UserStep + +from pipelinerl.domains.math import verify_answer_rpc, RewardTable, get_reward +from pipelinerl.rollouts import RolloutResult, BaseMetrics + +logger = logging.getLogger(__name__) + + + +async def generate_math_rollout2( + cfg: DictConfig, + llm: TrainableLLM, + problem: dict, + session: aiohttp.ClientSession, +) -> RolloutResult: + # (1) Choose a random environment server + start = time.perf_counter() + env_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment"] + math_job, mcp_jobs = env_jobs[:1], env_jobs[1:] + # choose the env job randomly + mcp_job = random.choice(mcp_jobs) + assert mcp_job.port is not None + mcp_job_url = f"http://{mcp_job.hostname}:{mcp_job.port}" + environment = AsyncRemoteEnvironment(server_url=mcp_job_url) # type: ignore + async with environment.acontext(session, wait_for_env=True) as env: + actions = await env.a_actions() + tools_description = await env.a_tools_description() + logger.debug(f"Available tools: {tools_description}") + agent: Agent = instantiate(cfg.agent, known_actions=actions, tools_description=tools_description) + agent.llms = {DEFAULT: llm} + tape = Tape(steps=[UserStep(content=problem["task"])]) + tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) + + reward_table = RewardTable(**dict(cfg.rewards)) + answer_status = 
await verify_answer_rpc(
+            session=session,
+            host=math_job.hostname,
+            port=math_job.port,
+            prediction=llm_call.output.content,
+            gold=problem["answer"],
+            strict=True,
+        )
+        reward = get_reward(answer_status, tape.finished, reward_table)
+
+        metrics = BaseMetrics(
+            reward=reward,
+            success=answer_status == "correct",
+            no_error=answer_status != "unparsable",
+            no_answer=answer_status == "no_answer",
+        )
+
+        training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls]
+        for text in training_texts:
+            text.reward = reward
+        latency = time.perf_counter() - start
+        return RolloutResult(
+            training_texts=training_texts,
+            metrics=metrics,
+            latency=latency,
+            dataset_name=problem["dataset"],
+        )

From ca8516b6b9d9b4d98a9e028298613657e50b054f Mon Sep 17 00:00:00 2001
From: Alex Piche
Date: Sat, 16 Aug 2025 00:01:58 +0000
Subject: [PATCH 045/166] fix the env server

---
 conf/tir_mcp.yaml                        | 11 +++++-----
 pipelinerl/domains/tir_mcp/env_server.py |  5 ++++-
 pipelinerl/domains/tir_mcp/rollouts.py   | 28 +++++++++++++++++-------
 3 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml
index 45596607..f4e95376 100644
--- a/conf/tir_mcp.yaml
+++ b/conf/tir_mcp.yaml
@@ -42,12 +42,11 @@ agent:
   _target_: tapeagents.agent.Agent
   name : mcp_agent
   max_iterations: 2
+  store_llm_calls: true
   templates:
     system_prompt: |
-      You are an expert AI Agent trained to assist users with complex information processing tasks.
-      Your role is to understand user queries and respond in a helpful and accurate manner.
-      Keep your replies concise and direct. Prioritize clarity and avoid over-elaboration.
-      Do not express emotions or opinions about user questions.
+      You are a math and code expert AI Agent.
+      Please reason step by step, and put your final answer within \boxed{}.
     allowed_tools: |
       You have access to the following tools:
       {tools_description}
@@ -116,4 +115,6 @@ agent:
       3. Check if the current plan step is finished.
       4. If the step is finished, update the following steps of the plan with new information and choose the next step.
${agent.templates.thought_format} - next_node: select \ No newline at end of file + next_node: select + +model_path: Qwen/Qwen3-8B \ No newline at end of file diff --git a/pipelinerl/domains/tir_mcp/env_server.py b/pipelinerl/domains/tir_mcp/env_server.py index 53259069..8c265549 100644 --- a/pipelinerl/domains/tir_mcp/env_server.py +++ b/pipelinerl/domains/tir_mcp/env_server.py @@ -3,6 +3,9 @@ from omegaconf import OmegaConf from typing import List + +from pipelinerl.domains.math import MathEnvironment + class MCPEnvironmentServer: def __init__(self, @@ -40,5 +43,5 @@ def launch(self, port: int): "tools_whitelist": self.mcp_tools_whitelist, })) else: - MathEnvironment.launch(port) + MathEnvironment().launch(port) diff --git a/pipelinerl/domains/tir_mcp/rollouts.py b/pipelinerl/domains/tir_mcp/rollouts.py index 3a85804a..43e404c3 100644 --- a/pipelinerl/domains/tir_mcp/rollouts.py +++ b/pipelinerl/domains/tir_mcp/rollouts.py @@ -15,6 +15,7 @@ from hydra.utils import instantiate from tapeagents.core import StopStep, Tape from tapeagents.dialog_tape import UserStep +from tapeagents.core import LLMCall from pipelinerl.domains.math import verify_answer_rpc, RewardTable, get_reward from pipelinerl.rollouts import RolloutResult, BaseMetrics @@ -32,7 +33,7 @@ async def generate_math_rollout2( # (1) Choose a random environment server start = time.perf_counter() env_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment"] - math_job, mcp_jobs = env_jobs[:1], env_jobs[1:] + math_job, mcp_jobs = env_jobs[0], env_jobs[1:] # choose the env job randomly mcp_job = random.choice(mcp_jobs) assert mcp_job.port is not None @@ -48,15 +49,31 @@ async def generate_math_rollout2( tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) reward_table = RewardTable(**dict(cfg.rewards)) + + + llm_calls = [step for step in tape.steps if step.metadata.other.get("llm_call") is not None] + llm_calls: list[LLMCall] = [ + LLMCall(**step.metadata.other["llm_call"]) + if isinstance(step.metadata.other["llm_call"], dict) + else step.metadata.other["llm_call"] + for step in llm_calls + ] + assert len(llm_calls) > 0, "No LLM calls found" + training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] answer_status = await verify_answer_rpc( session=session, host=math_job.hostname, port=math_job.port, - prediction=llm_call.output.content, + prediction=llm_calls[-1].output.content, gold=problem["answer"], strict=True, ) - reward = get_reward(answer_status, tape.finished, reward_table) + tape_finished = True # TODO + reward = get_reward(answer_status, tape_finished, reward_table) + for text in training_texts: + text.reward = reward + + latency = time.perf_counter() - start metrics = BaseMetrics( reward=reward, @@ -64,11 +81,6 @@ async def generate_math_rollout2( no_error=answer_status != "unparsable", no_answer=answer_status == "no_answer", ) - - training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] - for text in training_texts: - text.reward = reward - latency = time.perf_counter() - start return RolloutResult( training_texts=training_texts, metrics=metrics, From f3af1bc9cff436edef31e379581198df3f737477 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Sat, 16 Aug 2025 00:05:00 +0000 Subject: [PATCH 046/166] tweak prompt --- conf/tir_mcp.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index f4e95376..67b025ec 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -45,8 
+45,11 @@ agent:
   store_llm_calls: true
   templates:
     system_prompt: |
-      You are a math and code expert AI Agent.
-      Please reason step by step, and put your final answer within \boxed{}.
+      You are an expert AI Agent trained to assist users with complex information processing tasks.
+      Your role is to understand user queries and respond in a helpful and accurate manner.
+      Keep your replies concise and direct. Prioritize clarity and avoid over-elaboration.
+      Do not express emotions or opinions about user questions.
+      Put your final answer within \boxed{}.
     allowed_tools: |
       You have access to the following tools:
       {tools_description}

From 5b10c33e2be57c6f64b7eedae23538c505787fa8 Mon Sep 17 00:00:00 2001
From: Alex Piche
Date: Sat, 16 Aug 2025 16:49:20 +0000
Subject: [PATCH 047/166] update prompts and config

---
 conf/tir_mcp.yaml                      | 20 ++++++++------------
 pipelinerl/domains/tir_mcp/rollouts.py | 10 ++++++----
 2 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml
index 67b025ec..555523d9 100644
--- a/conf/tir_mcp.yaml
+++ b/conf/tir_mcp.yaml
@@ -48,8 +48,7 @@ agent:
       You are an expert AI Agent trained to assist users with complex information processing tasks.
       Your role is to understand user queries and respond in a helpful and accurate manner.
       Keep your replies concise and direct. Prioritize clarity and avoid over-elaboration.
-      Do not express emotions or opinions about user questions.
-      Put your final answer within \boxed{}.
+      Do not express emotions or opinions about user questions. You must use the python tool for computation.
     allowed_tools: |
       You have access to the following tools:
       {tools_description}
@@ -65,23 +64,20 @@ agent:
       If the last action does not change the observation, do not repeat it!
       DO NOT OUTPUT ANYTHING BESIDES THE JSON! DO NOT PLACE ANY COMMENTS INSIDE THE JSON.
       It will break the system that processes the output.
+
   nodes:
     - _target_: tapeagents.nodes.StandardNode
       name: plan
      system_prompt: ${agent.templates.system_prompt}
       guidance: |
-        Write a concise multi-step plan explaining which steps should be performed to find the answer for the given task.
-        Be specific about how each step should be performed. Only describe the intended actions here, do not perform them yet.
-        Consider that next steps may depend on results of previous steps, so include conditional branching using "if" statements where needed.
-        Start with the title "Plan". Every step should have short name and description.
-        ${agent.templates.thought_format}
+        Use python to compute the correct answer
       steps_prompt: ${agent.templates.allowed_tools}
 
     - _target_: tapeagents.nodes.StandardNode
       name: select
       system_prompt: ${agent.templates.system_prompt}
-      trim_obs_except_last_n: 1
+      trim_obs_except_last_n: 100
       guidance: |
         Select the next step to do to move forward with the plan. Describe the expected effect of the proposed action.
         ${agent.templates.thought_format}
       steps_prompt: ${agent.templates.allowed_tools}
 
     - _target_: tapeagents.nodes.StandardNode
       name: act
       system_prompt: ${agent.templates.system_prompt}
-      trim_obs_except_last_n: 1
+      trim_obs_except_last_n: 100
-      guidance: Then produce single function call for the next step. If the answer is ready, call GaiaAnswer.
+      guidance: Then produce single function call for the next step. If the answer is ready, call GaiaAnswer. Put your final answer within \boxed{}.
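+      # editor's note (assumption): GaiaAnswer is the terminal answer action reused
+      # from the GAIA example agent; the \boxed{} instruction is what the math
+      # verifier later parses the final prediction from.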
steps: - examples.gaia_agent.steps.GaiaAnswer use_known_actions: true @@ -100,7 +96,7 @@ agent: - _target_: tapeagents.nodes.StandardNode name: summarize system_prompt: ${agent.templates.system_prompt} - trim_obs_except_last_n: 1 + trim_obs_except_last_n: 100 guidance: | Summarize last observation. If its an image, thoroughly describe it with all details. Describe the results of the last action and observed changes @@ -111,7 +107,7 @@ agent: - _target_: tapeagents.nodes.StandardNode name: reflect system_prompt: ${agent.templates.system_prompt} - trim_obs_except_last_n: 1 + trim_obs_except_last_n: 100 guidance: | 1. Evaluate the action's success, explain its effect on current step, overall plan and task solution. 2. If the last action was not successful, describe errors and the possible reasons for failure. diff --git a/pipelinerl/domains/tir_mcp/rollouts.py b/pipelinerl/domains/tir_mcp/rollouts.py index 43e404c3..5dd20104 100644 --- a/pipelinerl/domains/tir_mcp/rollouts.py +++ b/pipelinerl/domains/tir_mcp/rollouts.py @@ -45,18 +45,20 @@ async def generate_math_rollout2( logger.debug(f"Available tools: {tools_description}") agent: Agent = instantiate(cfg.agent, known_actions=actions, tools_description=tools_description) agent.llms = {DEFAULT: llm} - tape = Tape(steps=[UserStep(content=problem["task"])]) + + tape = Tape(steps=[ + #UserStep(content=f"{problem['task']}. You have access to the following tools: {tools_description}") + UserStep(content=f"Use run_python_code to compute 32+45") + ]) tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) reward_table = RewardTable(**dict(cfg.rewards)) - - llm_calls = [step for step in tape.steps if step.metadata.other.get("llm_call") is not None] llm_calls: list[LLMCall] = [ LLMCall(**step.metadata.other["llm_call"]) if isinstance(step.metadata.other["llm_call"], dict) else step.metadata.other["llm_call"] - for step in llm_calls + for step in tape.steps if step.metadata.other.get("llm_call") is not None ] assert len(llm_calls) > 0, "No LLM calls found" training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] From d2e6d09deb3a2efddd79d8e42d14ac2ca8e6101f Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Mon, 18 Aug 2025 14:33:11 +0000 Subject: [PATCH 048/166] clean up --- conf/tir_mcp.yaml | 4 ++-- pipelinerl/domains/math/rollouts.py | 12 ++++++------ pipelinerl/domains/tir_mcp/rollouts.py | 12 +++++++++--- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index 555523d9..03fd7699 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -37,11 +37,11 @@ environment: math_target: pipelinerl.domains.math.MathEnvironment -agent_max_loops: 2 +agent_max_loops: 3 agent: _target_: tapeagents.agent.Agent name : mcp_agent - max_iterations: 2 + max_iterations: 5 store_llm_calls: true templates: system_prompt: | diff --git a/pipelinerl/domains/math/rollouts.py b/pipelinerl/domains/math/rollouts.py index cdb7ba2a..7bc21a8f 100644 --- a/pipelinerl/domains/math/rollouts.py +++ b/pipelinerl/domains/math/rollouts.py @@ -33,17 +33,17 @@ def get_reward(answer_status: str, finished: bool, reward_table: RewardTable) -> case ("wrong", True): return reward_table.wrong_answer_finished case ("no_answer", False): - reward = reward_table.no_answer_not_finished + return reward_table.no_answer_not_finished case ("no_answer", True): - reward = reward_table.no_answer_finished + return reward_table.no_answer_finished case ("unparsable", False): - reward = 
reward_table.unparsable_not_finished
+            return reward_table.unparsable_not_finished
         case ("unparsable", True):
-            reward = reward_table.unparsable_finished
+            return reward_table.unparsable_finished
         case ("correct", False):
-            reward = reward_table.correct_answer_not_finished
+            return reward_table.correct_answer_not_finished
         case ("correct", True):
-            reward = reward_table.correct_answer_finished
+            return reward_table.correct_answer_finished
         case _:
             raise ValueError(f"Invalid answer_status/finished combination: {answer_status}/{finished}")
diff --git a/pipelinerl/domains/tir_mcp/rollouts.py b/pipelinerl/domains/tir_mcp/rollouts.py
index 5dd20104..fcdba1f3 100644
--- a/pipelinerl/domains/tir_mcp/rollouts.py
+++ b/pipelinerl/domains/tir_mcp/rollouts.py
@@ -23,6 +23,9 @@
 logger = logging.getLogger(__name__)
 
+class Metrics(BaseMetrics):
+    num_tool_calls: int
+    num_python_calls: int
 
 async def generate_math_rollout2(
     cfg: DictConfig,
@@ -47,18 +50,20 @@ async def generate_math_rollout2(
         logger.debug(f"Available tools: {tools_description}")
         agent: Agent = instantiate(cfg.agent, known_actions=actions, tools_description=tools_description)
         agent.llms = {DEFAULT: llm}
-        tape = Tape(steps=[
-            #UserStep(content=f"{problem['task']}. You have access to the following tools: {tools_description}")
-            UserStep(content=f"Use run_python_code to compute 32+45")
-        ])
+        tape = Tape(steps=[
+            UserStep(content=f"{problem['task']}. You have access to the following tools: {tools_description}")
+        ])
         tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops)
 
         reward_table = RewardTable(**dict(cfg.rewards))
-
-        llm_calls = [step for step in tape.steps if step.metadata.other.get("llm_call") is not None]
         llm_calls: list[LLMCall] = [
             LLMCall(**step.metadata.other["llm_call"])
             if isinstance(step.metadata.other["llm_call"], dict)
             else step.metadata.other["llm_call"]
-            for step in llm_calls
+            for step in tape.steps if step.metadata.other.get("llm_call") is not None
         ]
+        num_tool_call = len([llm_call for llm_call in llm_calls if llm_call.output.tool_calls])
         assert len(llm_calls) > 0, "No LLM calls found"
         training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls]
         answer_status = await verify_answer_rpc(
@@ -77,12 +80,15 @@ async def generate_math_rollout2(
         latency = time.perf_counter() - start
 
-        metrics = BaseMetrics(
+        metrics = Metrics(
             reward=reward,
             success=answer_status == "correct",
             no_error=answer_status != "unparsable",
             no_answer=answer_status == "no_answer",
+            num_tool_calls=num_tool_call,
+            num_python_calls=len([llm_call for llm_call in llm_calls if llm_call.output.tool_calls and llm_call.output.tool_calls[0].function.name != "GaiaAnswer"])
         )
+
         return RolloutResult(
             training_texts=training_texts,
             metrics=metrics,

From 228cb42d8f389a4102b627c1776967d3f9c383de Mon Sep 17 00:00:00 2001
From: Alex Piche
Date: Mon, 18 Aug 2025 18:33:55 +0000
Subject: [PATCH 049/166] hard code deno

---
 conf/mcp/python.json                   |  2 +-
 conf/tir_mcp.yaml                      |  8 ++++--
 pipelinerl/domains/math/rollouts.py    |  2 +-
 pipelinerl/domains/tir_mcp/rollouts.py | 37 +++++++++++++++++++++-----
 4 files changed, 38 insertions(+), 11 deletions(-)

diff --git a/conf/mcp/python.json b/conf/mcp/python.json
index 50ccbe69..f9ff1a04 100644
--- a/conf/mcp/python.json
+++ b/conf/mcp/python.json
@@ -1,7 +1,7 @@
 {
     "mcpServers": {
         "python_exec": {
-            "command": "deno",
+            "command": "/home/toolkit/.deno/bin/deno",
             "args": [
                 "run",
                 "-N",
diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml
index 03fd7699..1b846b60 100644
--- a/conf/tir_mcp.yaml
+++ b/conf/tir_mcp.yaml
@@ -5,6 +5,7 @@ defaults:
 
 actor:
   rollout_policy: pipelinerl.domains.tir_mcp.generate_math_rollout2
   system_prompt: Please reason step by step, and put your final answer within \boxed{}.
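+  # editor's note (assumption): llm_max_rollouts appears to cap how many rollouts
+  # may run concurrently against each actor LLM server; this reading is not verified
+  # against the PipelineRL docs.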
+ llm_max_rollouts: 8 task_template: |- {task} @@ -17,6 +18,9 @@ test_dataset_names: - amc_2023 - math_500 +world: + env_replicas: 16 + vllm_config: use_v1: true vllm_kwargs: @@ -37,11 +41,11 @@ environment: math_target: pipelinerl.domains.math.MathEnvironment -agent_max_loops: 3 +agent_max_loops: 1 agent: _target_: tapeagents.agent.Agent name : mcp_agent - max_iterations: 5 + max_iterations: 4 store_llm_calls: true templates: system_prompt: | diff --git a/pipelinerl/domains/math/rollouts.py b/pipelinerl/domains/math/rollouts.py index 7bc21a8f..7f370214 100644 --- a/pipelinerl/domains/math/rollouts.py +++ b/pipelinerl/domains/math/rollouts.py @@ -96,7 +96,7 @@ async def generate_math_rollout( # Apply discount factor based on output length reward *= discount_factor**llm_call.output_length_tokens overlong_penalty = 0 - if rewards.buffer_tokens > 0: + if reward_table.buffer_tokens > 0: overlong_penalty = length_penalty(llm.parameters['max_tokens'], llm_call.output_length_tokens, rewards.buffer_tokens) reward += overlong_penalty trace.reward = reward diff --git a/pipelinerl/domains/tir_mcp/rollouts.py b/pipelinerl/domains/tir_mcp/rollouts.py index fcdba1f3..bd984337 100644 --- a/pipelinerl/domains/tir_mcp/rollouts.py +++ b/pipelinerl/domains/tir_mcp/rollouts.py @@ -1,6 +1,8 @@ import time import random import logging +from collections import Counter +from typing import List, Dict import aiohttp from omegaconf import DictConfig @@ -23,9 +25,29 @@ logger = logging.getLogger(__name__) +def count_tool_calls_by_category(llm_calls: List[LLMCall]) -> Dict[str, int]: + """ + Count the number of tool calls for each function name category. + + Args: + llm_calls: List of LLMCall objects + + Returns: + Dictionary mapping function names to their counts + """ + tool_call_names = [] + + for llm_call in llm_calls: + if llm_call.output.tool_calls: + for tool_call in llm_call.output.tool_calls: + tool_call_names.append(tool_call.function.name) + + return dict(Counter(tool_call_names)) + + class Metrics(BaseMetrics): - num_tool_calls: int - num_python_calls: int + num_python_calls: int = 0 + num_steps: int = 0 async def generate_math_rollout2( cfg: DictConfig, @@ -62,14 +84,13 @@ async def generate_math_rollout2( else step.metadata.other["llm_call"] for step in tape.steps if step.metadata.other.get("llm_call") is not None ] - num_tool_call = len([llm_call for llm_call in llm_calls if llm_call.output.tool_calls]) assert len(llm_calls) > 0, "No LLM calls found" training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] answer_status = await verify_answer_rpc( session=session, host=math_job.hostname, - port=math_job.port, - prediction=llm_calls[-1].output.content, + port=math_job.port, # type: ignore + prediction=llm_calls[-1].output.content, # type: ignore gold=problem["answer"], strict=True, ) @@ -80,13 +101,15 @@ async def generate_math_rollout2( latency = time.perf_counter() - start + tool_call_counts = count_tool_calls_by_category(llm_calls) + metrics = Metrics( reward=reward, success=answer_status == "correct", no_error=answer_status != "unparsable", no_answer=answer_status == "no_answer", - num_tool_calls=num_tool_call, - num_python_calls=len([llm_call for llm_call in llm_calls if llm_call.output.tool_calls and llm_call.output.tool_calls[0].function.name != "GaiaAnswer"]) + num_steps=len(tape.steps), + num_python_calls=tool_call_counts.get("run_python_code", 0), ) return RolloutResult( From fdf3c830f185ce23d31ebcbd7b6969dee0c8e1cc Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Mon, 18 
Aug 2025 18:44:23 +0000 Subject: [PATCH 050/166] less envs --- conf/tir_mcp.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index 1b846b60..ef4ef28d 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -19,7 +19,7 @@ test_dataset_names: - math_500 world: - env_replicas: 16 + env_replicas: 3 vllm_config: use_v1: true From 1165397c6482d166a3ab174105d18e0a491b8004 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Mon, 18 Aug 2025 18:56:20 +0000 Subject: [PATCH 051/166] less envs --- conf/tir_mcp.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index ef4ef28d..2cdf6602 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -29,7 +29,7 @@ vllm_config: environment: _target_: pipelinerl.domains.tir_mcp.env_server.MCPEnvironmentServer - n_envs: 8 + n_envs: 2 n_envs_mcp: 7 n_envs_math: 1 host: localhost From 40a144aa716661848087fb9fea784288194b323b Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Mon, 18 Aug 2025 19:18:09 +0000 Subject: [PATCH 052/166] longer timeout --- conf/tir_mcp.yaml | 1 + pipelinerl/domains/tir_mcp/env_server.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index 2cdf6602..305aa610 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -39,6 +39,7 @@ environment: mcp_tools_whitelist: - run_python_code math_target: pipelinerl.domains.math.MathEnvironment + mcp_read_timeout_seconds: 300 agent_max_loops: 1 diff --git a/pipelinerl/domains/tir_mcp/env_server.py b/pipelinerl/domains/tir_mcp/env_server.py index 8c265549..e1662990 100644 --- a/pipelinerl/domains/tir_mcp/env_server.py +++ b/pipelinerl/domains/tir_mcp/env_server.py @@ -19,6 +19,7 @@ def __init__(self, math_target: str, exp_path: str, env_call_timeout: int = 60, + mcp_read_timeout_seconds: int = 10, ): # Remote environment server configuration self.n_envs = n_envs @@ -29,6 +30,7 @@ def __init__(self, self.mcp_config_path = mcp_config_path self.mcp_tools_whitelist = mcp_tools_whitelist self.exp_path = exp_path + self.mcp_read_timeout_seconds = mcp_read_timeout_seconds def launch(self, port: int): @@ -41,6 +43,7 @@ def launch(self, port: int): "_target_": self.mcp_target, "config_path": self.mcp_config_path, "tools_whitelist": self.mcp_tools_whitelist, + "read_timeout_seconds": self.mcp_read_timeout_seconds, })) else: MathEnvironment().launch(port) From 2d25d8870d9f3e3ae2c0048aedca5c3ace046efd Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Mon, 18 Aug 2025 19:30:43 +0000 Subject: [PATCH 053/166] longer seq length --- conf/tir_mcp.yaml | 4 ++++ pipelinerl/preprocess.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index 305aa610..5974132b 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -9,6 +9,10 @@ actor: task_template: |- {task} +finetune: + seq_length: 50000 + seq_parallel: 4 + dataset_loader: pipelinerl.domains.math.load_datasets train_dataset_names: - open_reasoner_zero_57k diff --git a/pipelinerl/preprocess.py b/pipelinerl/preprocess.py index 65e29b4b..5f2b4af5 100644 --- a/pipelinerl/preprocess.py +++ b/pipelinerl/preprocess.py @@ -573,6 +573,10 @@ def run_preprocessing_loop( sample_length = len(entry["input_ids"]) if current_length + sample_length > cfg.finetune.seq_length: + if len(current_batch) == 0: + raise ValueError( + f"sample_length is {sample_length}, but cfg.finetune.seq_length is {cfg.finetune.seq_length}" + ) time_to_write = True break # Current micro batch is full From 
20361677a2b560f34123d6af84c290f079c8e6fb Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Mon, 18 Aug 2025 19:48:45 +0000 Subject: [PATCH 054/166] more envs --- conf/tir_mcp.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index 5974132b..24d5b977 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -10,7 +10,7 @@ actor: {task} finetune: - seq_length: 50000 + seq_length: 60000 seq_parallel: 4 dataset_loader: pipelinerl.domains.math.load_datasets @@ -23,7 +23,7 @@ test_dataset_names: - math_500 world: - env_replicas: 3 + env_replicas: 16 vllm_config: use_v1: true @@ -33,7 +33,7 @@ vllm_config: environment: _target_: pipelinerl.domains.tir_mcp.env_server.MCPEnvironmentServer - n_envs: 2 + n_envs: 8 n_envs_mcp: 7 n_envs_math: 1 host: localhost From 664b53968520b6c5428b968b4b26be8de2778e77 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Mon, 18 Aug 2025 19:52:03 +0000 Subject: [PATCH 055/166] more llms per actor --- conf/tir_mcp.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index 24d5b977..5a26861a 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -5,7 +5,7 @@ defaults: actor: rollout_policy: pipelinerl.domains.tir_mcp.generate_math_rollout2 system_prompt: Please reason step by step, and put your final answer within \boxed{}. - llm_max_rollouts: 8 + llm_max_rollouts: 64 task_template: |- {task} From 4b0db03d50827b8535a7bb090cec297179caa6d5 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Mon, 18 Aug 2025 19:59:57 +0000 Subject: [PATCH 056/166] even more envs --- conf/tir_mcp.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index 5a26861a..44db4fbe 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -33,7 +33,7 @@ vllm_config: environment: _target_: pipelinerl.domains.tir_mcp.env_server.MCPEnvironmentServer - n_envs: 8 + n_envs: 16 n_envs_mcp: 7 n_envs_math: 1 host: localhost From 63d40924642dd56fabfd00b905373a8bb90758a0 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Mon, 18 Aug 2025 20:08:45 +0000 Subject: [PATCH 057/166] longer timeout and revert prompt --- conf/tir_mcp.yaml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index 44db4fbe..e6cb0349 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -43,7 +43,7 @@ environment: mcp_tools_whitelist: - run_python_code math_target: pipelinerl.domains.math.MathEnvironment - mcp_read_timeout_seconds: 300 + mcp_read_timeout_seconds: 3000 agent_max_loops: 1 @@ -57,7 +57,7 @@ agent: You are an expert AI Agent trained to assist users with complex information processing tasks. Your role is to understand user queries and respond in a helpful and accurate manner. Keep your replies concise and direct. Prioritize clarity and avoid over-elaboration. - Do not express emotions or opinions about user questions. You must use the python tool for computation. + Do not express emotions or opinions about user questions. allowed_tools: | You have access to the following tools: {tools_description} @@ -80,7 +80,11 @@ agent: name: plan system_prompt: ${agent.templates.system_prompt} guidance: | - Use python to compute the correct answer + Write a concise multi-step plan explaining which steps should be performed to find the answer for the given task. + Be specific about how each step should be performed. Only describe the intended actions here, do not perform them yet. 
+ Consider that next steps may depend on results of previous steps, so include conditional branching using "if" statements where needed. + Start with the title "Plan". Every step should have short name and description. + ${agent.templates.thought_format} steps_prompt: ${agent.templates.allowed_tools} - _target_: tapeagents.nodes.StandardNode From 6d81456d544734bcc26bf8dd65b78d73728b4702 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Mon, 18 Aug 2025 20:23:45 +0000 Subject: [PATCH 058/166] retry task --- pipelinerl/domains/tir_mcp/rollouts.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pipelinerl/domains/tir_mcp/rollouts.py b/pipelinerl/domains/tir_mcp/rollouts.py index bd984337..a22b690a 100644 --- a/pipelinerl/domains/tir_mcp/rollouts.py +++ b/pipelinerl/domains/tir_mcp/rollouts.py @@ -1,3 +1,4 @@ +import asyncio import time import random import logging @@ -74,7 +75,12 @@ async def generate_math_rollout2( tape = Tape(steps=[ UserStep(content=f"{problem['task']}. You have access to the following tools: {tools_description}") ]) - tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) + while True: + try: + tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) + break + except Exception as e: + await asyncio.sleep(5) reward_table = RewardTable(**dict(cfg.rewards)) From 373b0ac16e8ac282dfd877561b1ce1229c01931d Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Mon, 18 Aug 2025 21:18:37 +0000 Subject: [PATCH 059/166] pid deno module --- conf/mcp/python.json | 6 +++--- conf/tir_mcp.yaml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/conf/mcp/python.json b/conf/mcp/python.json index f9ff1a04..f6e79890 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -5,9 +5,9 @@ "args": [ "run", "-N", - "-R=node_modules", - "-W=node_modules", - "--node-modules-dir=auto", + "-R=.mcp_node_modules_$$", + "-W=.mcp_node_modules_$$", + "--node-modules-dir=.mcp_node_modules_$$", "jsr:@pydantic/mcp-run-python", "stdio" ] diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index e6cb0349..9856e5c0 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -23,7 +23,7 @@ test_dataset_names: - math_500 world: - env_replicas: 16 + env_replicas: 64 vllm_config: use_v1: true From e2de76821eb69fa38490c0fc88026332d02bd369 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Mon, 18 Aug 2025 21:30:01 +0000 Subject: [PATCH 060/166] diff deno tmp dir --- conf/mcp/python.json | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/conf/mcp/python.json b/conf/mcp/python.json index f6e79890..59531b79 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -5,12 +5,13 @@ "args": [ "run", "-N", - "-R=.mcp_node_modules_$$", - "-W=.mcp_node_modules_$$", - "--node-modules-dir=.mcp_node_modules_$$", + "--node-modules-dir=auto", "jsr:@pydantic/mcp-run-python", "stdio" - ] + ], + "env": { + "DENO_DIR": "/tmp/deno_cache_mcp_python_$$" + } } } } \ No newline at end of file From 763b594d860184cdde9605c530e405e28ab1df4c Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Mon, 18 Aug 2025 21:33:55 +0000 Subject: [PATCH 061/166] none node modules --- conf/mcp/python.json | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/conf/mcp/python.json b/conf/mcp/python.json index 59531b79..44699388 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -5,13 +5,10 @@ "args": [ "run", "-N", - "--node-modules-dir=auto", + "--node-modules-dir=none", 
"jsr:@pydantic/mcp-run-python", "stdio" - ], - "env": { - "DENO_DIR": "/tmp/deno_cache_mcp_python_$$" - } + ] } } } \ No newline at end of file From 07835700ebf21a6d883fd46001452ca1691ed98b Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Tue, 19 Aug 2025 02:07:36 +0000 Subject: [PATCH 062/166] bigger timeout --- conf/tir_mcp.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index 9856e5c0..0506a0bf 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -10,7 +10,7 @@ actor: {task} finetune: - seq_length: 60000 + seq_length: 48000 seq_parallel: 4 dataset_loader: pipelinerl.domains.math.load_datasets @@ -23,7 +23,7 @@ test_dataset_names: - math_500 world: - env_replicas: 64 + env_replicas: 5 vllm_config: use_v1: true @@ -33,7 +33,7 @@ vllm_config: environment: _target_: pipelinerl.domains.tir_mcp.env_server.MCPEnvironmentServer - n_envs: 16 + n_envs: 8 n_envs_mcp: 7 n_envs_math: 1 host: localhost @@ -43,6 +43,7 @@ environment: mcp_tools_whitelist: - run_python_code math_target: pipelinerl.domains.math.MathEnvironment + env_call_timeout: 600 # Increased from default 60s to 10 minutes mcp_read_timeout_seconds: 3000 From b284fcb43523474c8d926a06951bea9187f0afd2 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Tue, 19 Aug 2025 10:26:09 +0000 Subject: [PATCH 063/166] diff temp dir for each mcp --- conf/mcp/python.json | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/conf/mcp/python.json b/conf/mcp/python.json index 44699388..b0881201 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -1,13 +1,10 @@ { "mcpServers": { "python_exec": { - "command": "/home/toolkit/.deno/bin/deno", + "command": "bash", "args": [ - "run", - "-N", - "--node-modules-dir=none", - "jsr:@pydantic/mcp-run-python", - "stdio" + "-c", + "mkdir -p /tmp/mcp_work_$$ && cd /tmp/mcp_work_$$ && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio" ] } } From eb48d90ca945af5f6490829496dece843708edcd Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Tue, 19 Aug 2025 17:40:43 +0000 Subject: [PATCH 064/166] 0.0.0.0 --- conf/tir_mcp.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index 0506a0bf..3d2c9678 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -36,7 +36,7 @@ environment: n_envs: 8 n_envs_mcp: 7 n_envs_math: 1 - host: localhost + host: "0.0.0.0" exp_path: ${output_dir}/env_server mcp_target: tapeagents.mcp.MCPEnvironment mcp_config_path: /home/toolkit/research-now-reasoner/pipelinerl/conf/mcp/python.json From efa271767ff7883bdb352cbdc548a6abe6210037 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Tue, 19 Aug 2025 19:04:36 +0000 Subject: [PATCH 065/166] filter based on port --- pipelinerl/domains/tir_mcp/rollouts.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pipelinerl/domains/tir_mcp/rollouts.py b/pipelinerl/domains/tir_mcp/rollouts.py index a22b690a..417a214c 100644 --- a/pipelinerl/domains/tir_mcp/rollouts.py +++ b/pipelinerl/domains/tir_mcp/rollouts.py @@ -58,10 +58,11 @@ async def generate_math_rollout2( ) -> RolloutResult: # (1) Choose a random environment server start = time.perf_counter() - env_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment"] - math_job, mcp_jobs = env_jobs[0], env_jobs[1:] + mcp_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment" and job["port"] != 7777] + math_jobs = [Job(**job) for job in 
cfg.jobs if job["kind"] == "environment" and job["port"] == 7777] # choose the env job randomly mcp_job = random.choice(mcp_jobs) + math_job = random.choice(math_jobs) assert mcp_job.port is not None mcp_job_url = f"http://{mcp_job.hostname}:{mcp_job.port}" environment = AsyncRemoteEnvironment(server_url=mcp_job_url) # type: ignore From 402eeb239e82694a29f4378be17d75dc2148f322 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Wed, 20 Aug 2025 18:17:28 +0000 Subject: [PATCH 066/166] scale up env servers by llm_servers --- conf/base.yaml | 3 ++- pipelinerl/world.py | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/conf/base.yaml b/conf/base.yaml index 2f20d18c..995db7c5 100644 --- a/conf/base.yaml +++ b/conf/base.yaml @@ -76,7 +76,8 @@ world: preprocessor_fraction: 0 finetune_fraction: 4 - env_replicas: 2 + # Number of environment servers per actor VLLM server + env_replicas_per_actor: 1 actor_group_port: 9000 environment_start_port: 7777 diff --git a/pipelinerl/world.py b/pipelinerl/world.py index f41714e4..cc23afd0 100644 --- a/pipelinerl/world.py +++ b/pipelinerl/world.py @@ -188,7 +188,10 @@ def _place_pipeline_stages(self, cfg): self.add_job(kind="preprocessor", replica_idx=worker_idx, node_rank=node, gpus=[], cpu_heavy=True) def _place_environments(self, cfg): - for worker_idx in range(cfg.world.env_replicas): + # Scale environment servers to be the same as llm servers + env_replicas_per_actor = getattr(cfg.world, "env_replicas_per_actor", 1) + total_env_replicas = cfg.world.replicas * self.llms_per_actor * env_replicas_per_actor + for worker_idx in range(total_env_replicas): node = self.get_least_busy_node() envs_at_node = len([job for job in self.job_map[node] if job.kind == "environment"]) self.add_job( From 58f31ccd6f441bc286cea798dfe201f2b267265b Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Wed, 20 Aug 2025 18:19:30 +0000 Subject: [PATCH 067/166] reweight actor/trainer --- conf/miniwob.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index 07943203..8ab18d3a 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -5,9 +5,9 @@ defaults: - _self_ world: - actor_fraction: 2 + actor_fraction: 3 preprocessor_fraction: 0 - finetune_fraction: 6 + finetune_fraction: 5 # debug: # mode: actor From 4101d777bfcb2664d76d1a876eb1e02f959da44f Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Wed, 20 Aug 2025 18:21:16 +0000 Subject: [PATCH 068/166] add massimo miniwob split --- pipelinerl/domains/miniwob/load_tasks.py | 136 +++++++++++++++++++++++ 1 file changed, 136 insertions(+) diff --git a/pipelinerl/domains/miniwob/load_tasks.py b/pipelinerl/domains/miniwob/load_tasks.py index 2c58f0e0..4bade257 100644 --- a/pipelinerl/domains/miniwob/load_tasks.py +++ b/pipelinerl/domains/miniwob/load_tasks.py @@ -34,6 +34,132 @@ "miniwob.tic-tac-toe", "miniwob.use-autocomplete-nodelay" ] +MASSIMO_TRAIN_SPLIT = [ + "miniwob.ascending-numbers", + "miniwob.bisect-angle", + "miniwob.book-flight", + "miniwob.choose-date", + "miniwob.choose-date-easy", + "miniwob.choose-date-medium", + "miniwob.choose-date-nodelay", + "miniwob.choose-list", + "miniwob.circle-center", + "miniwob.click-button-sequence", + "miniwob.click-checkboxes-soft", + "miniwob.click-checkboxes-transfer", + "miniwob.click-collapsible-2", + "miniwob.click-collapsible-2-nodelay", + "miniwob.click-collapsible-nodelay", + "miniwob.click-color", + "miniwob.click-dialog", + "miniwob.click-dialog-2", + "miniwob.click-link", + "miniwob.click-menu", + 
"miniwob.click-menu-2", + "miniwob.click-scroll-list", + "miniwob.click-shape", + "miniwob.click-tab", + "miniwob.click-tab-2", + "miniwob.click-tab-2-hard", + "miniwob.click-tab-2-medium", + "miniwob.click-test", + "miniwob.click-test-2", + "miniwob.click-test-transfer", + "miniwob.click-widget", + "miniwob.copy-paste", + "miniwob.copy-paste-2", + "miniwob.count-shape", + "miniwob.count-sides", + "miniwob.daily-calendar", + "miniwob.drag-box", + "miniwob.drag-circle", + "miniwob.drag-cube", + "miniwob.drag-items", + "miniwob.drag-items-grid", + "miniwob.drag-shapes", + "miniwob.drag-shapes-2", + "miniwob.drag-sort-numbers", + "miniwob.draw-circle", + "miniwob.draw-line", + "miniwob.email-inbox", + "miniwob.email-inbox-delete", + "miniwob.email-inbox-forward", + "miniwob.email-inbox-forward-nl", + "miniwob.email-inbox-forward-nl-turk", + "miniwob.email-inbox-important", + "miniwob.email-inbox-noscroll", + "miniwob.email-inbox-reply", + "miniwob.email-inbox-star-reply", + "miniwob.enter-date", + "miniwob.enter-text", + "miniwob.enter-text-dynamic", + "miniwob.enter-time", + "miniwob.find-greatest", + "miniwob.find-word", + "miniwob.focus-text-2", + "miniwob.form-sequence", + "miniwob.form-sequence-2", + "miniwob.generate-number", + "miniwob.grid-coordinate", + "miniwob.guess-number", + "miniwob.highlight-text", + "miniwob.hot-cold", + "miniwob.identify-shape", + "miniwob.login-user", + "miniwob.login-user-popup", + "miniwob.multi-layouts", + "miniwob.multi-orderings", + "miniwob.navigate-tree", + "miniwob.odd-or-even", + "miniwob.order-food", + "miniwob.phone-book", + "miniwob.read-table", + "miniwob.read-table-2", + "miniwob.resize-textarea", + "miniwob.right-angle", + "miniwob.scroll-text", + "miniwob.scroll-text-2", + "miniwob.search-engine", + "miniwob.sign-agreement", + "miniwob.simple-algebra", + "miniwob.social-media", + "miniwob.social-media-all", + "miniwob.social-media-some", + "miniwob.text-editor", + "miniwob.text-transform", + "miniwob.tic-tac-toe", + "miniwob.use-autocomplete", + "miniwob.use-autocomplete-nodelay", + "miniwob.use-colorwheel", + "miniwob.use-colorwheel-2", + "miniwob.use-spinner", + "miniwob.visual-addition", +] +MASSIMO_TEST_SPLIT = [ + "miniwob.buy-ticket", + "miniwob.click-button", + "miniwob.click-option", + "miniwob.click-pie-nodelay", + "miniwob.drag-single-shape", + "miniwob.email-inbox-nl-turk", + "miniwob.enter-text-2", + "miniwob.find-midpoint", + "miniwob.focus-text", + "miniwob.simple-arithmetic", + "miniwob.stock-market", + "miniwob.use-slider-2", + "miniwob.click-checkboxes", + "miniwob.click-checkboxes-large", + "miniwob.click-collapsible", + "miniwob.click-pie", + "miniwob.click-shades", + "miniwob.click-tab-2-easy", + "miniwob.enter-password", + "miniwob.form-sequence-3", + "miniwob.highlight-text-2", + "miniwob.unicode-test", + "miniwob.use-slider", +] TRAIN_SPLIT = None TEST_SPLIT = None @@ -76,5 +202,15 @@ def load_tasks(dataset_names: list[str], train_split: float = 0.6, seeds: list[i {"dataset": task, "task": task, "seed": seed} for task in TEST_SPLIT for seed in seeds ]) + elif name == "massimo_train": + tasks.extend([ + {"dataset": task, "task": task, "seed": seed} + for task in MASSIMO_TRAIN_SPLIT for seed in seeds + ]) + elif name == "massimo_test": + tasks.extend([ + {"dataset": task, "task": task, "seed": seed} + for task in MASSIMO_TEST_SPLIT for seed in seeds + ]) return tasks From b00e4760a8ed3b41e6a91dfd798269f9f0fdba85 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Wed, 20 Aug 2025 18:40:07 +0000 Subject: [PATCH 069/166] 
cleanup --- conf/miniwob.yaml | 4 - .../tool_chat_template_llama3.1_json.jinja | 120 ------------------ 2 files changed, 124 deletions(-) delete mode 100644 pipelinerl/domains/miniwob/tool_chat_template_llama3.1_json.jinja diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index 8ab18d3a..08ade1ed 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -34,10 +34,6 @@ test_llm: vllm_config: vllm_kwargs: max_model_len: 16384 # input + output tokens -# enable-auto-tool-choice: "" -# tool-call-parser: llama3_json # use hermes for qwen -# chat_template: pipelinerl/domains/miniwob/tool_chat_template_llama3.1_json.jinja # copy pasted from https://github.com/vllm-project/vllm/blob/main/examples/tool_chat_template_llama3.1_json.jinja -# enforce-eager: "" # speed the actor llm startup a bit actor: rollout_policy: pipelinerl.domains.miniwob.rollouts.generate_miniwob_rollout diff --git a/pipelinerl/domains/miniwob/tool_chat_template_llama3.1_json.jinja b/pipelinerl/domains/miniwob/tool_chat_template_llama3.1_json.jinja deleted file mode 100644 index a3bc9f02..00000000 --- a/pipelinerl/domains/miniwob/tool_chat_template_llama3.1_json.jinja +++ /dev/null @@ -1,120 +0,0 @@ -{{- bos_token }} -{%- if custom_tools is defined %} - {%- set tools = custom_tools %} -{%- endif %} -{%- if not tools_in_user_message is defined %} - {#- Llama 3.1 doesn't pass all tests if the tools are in the system prompt #} - {%- set tools_in_user_message = true %} -{%- endif %} -{%- if not date_string is defined %} - {%- if strftime_now is defined %} - {%- set date_string = strftime_now("%d %b %Y") %} - {%- else %} - {%- set date_string = "26 Jul 2024" %} - {%- endif %} -{%- endif %} -{%- if not tools is defined %} - {%- set tools = none %} -{%- endif %} - -{#- This block extracts the system message, so we can slot it into the right place. #} -{%- if messages[0]['role'] == 'system' %} - {%- if messages[0]['content'] is string %} - {%- set system_message = messages[0]['content']|trim %} - {%- else %} - {%- set system_message = messages[0]['content'][0]['text']|trim %} - {%- endif %} - {%- set messages = messages[1:] %} -{%- else %} - {%- if tools is not none %} - {%- set system_message = "You are a helpful assistant with tool calling capabilities. Only reply with a tool call if the function exists in the library provided by the user. If it doesn't exist, just reply directly in natural language. When you receive a tool call response, use the output to format an answer to the original user question." %} - {%- else %} - {%- set system_message = "" %} - {%- endif %} -{%- endif %} - -{#- System message #} -{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} -{%- if tools is not none %} - {{- "Environment: ipython\n" }} -{%- endif %} -{{- "Cutting Knowledge Date: December 2023\n" }} -{{- "Today Date: " + date_string + "\n\n" }} -{%- if tools is not none and not tools_in_user_message %} - {{- "You have access to the following functions. To call a function, please respond with JSON for a function call. " }} - {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}. 
' }} - {{- "Do not use variables.\n\n" }} - {%- for t in tools %} - {{- t | tojson(indent=4) }} - {{- "\n\n" }} - {%- endfor %} -{%- endif %} -{{- system_message }} -{{- "<|eot_id|>" }} - -{#- Custom tools are passed in a user message with some extra guidance #} -{%- if tools_in_user_message and not tools is none %} - {#- Extract the first user message so we can plug it in here #} - {%- if messages | length != 0 %} - {%- if messages[0]['content'] is string %} - {%- set first_user_message = messages[0]['content']|trim %} - {%- else %} - {%- set first_user_message = messages[0]['content'] | selectattr('type', 'equalto', 'text') | map(attribute='text') | map('trim') | join('\n') %} - {%- endif %} - {%- set messages = messages[1:] %} - {%- else %} - {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} - {%- endif %} - {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} - {{- "Given the following functions, please respond with a JSON for a function call " }} - {{- "with its proper arguments that best answers the given prompt.\n\n" }} - {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}. ' }} - {{- "Do not use variables.\n\n" }} - {%- for t in tools %} - {{- t | tojson(indent=4) }} - {{- "\n\n" }} - {%- endfor %} - {{- first_user_message + "<|eot_id|>"}} -{%- endif %} - -{%- for message in messages %} - {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} - {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }} - {%- if message['content'] is string %} - {{- message['content'] | trim}} - {%- else %} - {%- for content in message['content'] %} - {%- if content['type'] == 'text' %} - {{- content['text'] | trim }} - {%- endif %} - {%- endfor %} - {%- endif %} - {{- '<|eot_id|>' }} - {%- elif 'tool_calls' in message %} - {%- if not message.tool_calls|length == 1 %} - {{- raise_exception("This model only supports single tool-calls at once!") }} - {%- endif %} - {%- set tool_call = message.tool_calls[0].function %} - {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} - {{- '{"name": "' + tool_call.name + '", ' }} - {{- '"parameters": ' }} - {{- tool_call.arguments | tojson }} - {{- "}" }} - {{- "<|eot_id|>" }} - {%- elif message.role == "tool" or message.role == "ipython" %} - {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} - {%- if message.content is string %} - {{- { "output": message.content } | tojson }} - {%- else %} - {%- for content in message['content'] %} - {%- if content['type'] == 'text' %} - {{- { "output": content['text'] } | tojson }} - {%- endif %} - {%- endfor %} - {%- endif %} - {{- "<|eot_id|>" }} - {%- endif %} -{%- endfor %} -{%- if add_generation_prompt %} - {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} -{%- endif %} \ No newline at end of file From 3d86a28c92edd74c3a3dd6d4bddbdca766b2454b Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Thu, 21 Aug 2025 15:01:04 +0000 Subject: [PATCH 070/166] change port to 7778 --- pipelinerl/domains/tir_mcp/env_server.py | 2 +- pipelinerl/domains/tir_mcp/rollouts.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelinerl/domains/tir_mcp/env_server.py b/pipelinerl/domains/tir_mcp/env_server.py index e1662990..d2be2dd8 100644 --- a/pipelinerl/domains/tir_mcp/env_server.py +++ b/pipelinerl/domains/tir_mcp/env_server.py @@ -37,7 +37,7 @@ def launch(self, port: int): """ Serve the environment in TapeAgent. 
""" - if port != 7777: + if port != 7778: env_server = EnvironmentServer(n_envs=self.n_envs, host=self.host, port=port, env_call_timeout=self.env_call_timeout) env_server.launch(OmegaConf.create({ "_target_": self.mcp_target, diff --git a/pipelinerl/domains/tir_mcp/rollouts.py b/pipelinerl/domains/tir_mcp/rollouts.py index 417a214c..27a15b71 100644 --- a/pipelinerl/domains/tir_mcp/rollouts.py +++ b/pipelinerl/domains/tir_mcp/rollouts.py @@ -58,8 +58,8 @@ async def generate_math_rollout2( ) -> RolloutResult: # (1) Choose a random environment server start = time.perf_counter() - mcp_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment" and job["port"] != 7777] - math_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment" and job["port"] == 7777] + mcp_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment" and job["port"] != 7778] + math_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment" and job["port"] == 7778] # choose the env job randomly mcp_job = random.choice(mcp_jobs) math_job = random.choice(math_jobs) From 0b561258c946b470149094fe309029e8c7195198 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Thu, 21 Aug 2025 15:34:58 +0000 Subject: [PATCH 071/166] update agent reflection node --- conf/miniwob.yaml | 39 +++++++++------------------------------ 1 file changed, 9 insertions(+), 30 deletions(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index 08ade1ed..a55dfd65 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -47,7 +47,7 @@ agent_max_loops: 10 # max number of agent - environment interactions for each t agent: _target_: tapeagents.agent.Agent name : web_agent - max_iterations: 4 # max number of iterations (make_prompt + llm? + generate_steps) for each loop + max_iterations: 4 # max number of iterations (make_prompt + llm + generate_steps) for each loop store_llm_calls: true templates: system_prompt: | @@ -56,16 +56,10 @@ agent: Keep your replies concise and direct. Prioritize clarity and avoid over-elaboration. You will be provided with the content of the current page and a task from the user. Do not express your emotions or opinions about the user question. - allowed_tools: | - You have access to the following tools: - {tools_description} allowed_steps: | You are allowed to produce ONLY steps with the following json schemas: {allowed_steps} Do not reproduce schema when producing the steps, use it as a reference. - thought_format: | - Important! Respond with the plain text, do not include any JSON or code. - Do not output anything besides what I asked in this message. json_format: | Important! Respond with parsable JSON, do not include any text or code. Do not output anything besides one JSON object. @@ -73,12 +67,6 @@ agent: - _target_: examples.rl_webagent.agent.WebNode name: set_goal system_prompt: ${agent.templates.system_prompt} - # guidance: | - # Produce the thought that describes the intended solution to the task. In the reasoning lines: - # - review the instructions from the user and the content of the page. - # - outline the main task to be accomplished and the steps to be taken to achieve it. - # - produce definiton of done, that will be checked later to verify if the task was completed. - # ${agent.templates.thought_format} guidance: | Produce the reasoning_thought step that describes the intended solution to the task. In the reasoning lines: - review the instructions from the user and the content of the page. 
@@ -86,7 +74,6 @@ agent: - produce definiton of done, that will be checked later to verify if the task was completed. Produce only one reasoning_thought step! ${agent.templates.json_format} - # steps_prompt: ${agent.templates.allowed_tools} steps_prompt: ${agent.templates.allowed_steps} steps: - tapeagents.steps.ReasoningThought @@ -96,29 +83,22 @@ agent: name: reflect system_prompt: ${agent.templates.system_prompt} guidance: | - Review the current state of the page and previous steps to find the best possible next action to accomplish the task. - Produce the reflection_thought to describe the current page state, reflect on your last action, describe what is left to do, and what will be the immediate next action. - Produce only one reflection_thought step! + Produce the reasoning_thought step that describes the current state of the page, the previous actions, and what should be the next best action to accomplish the task. In the reasoning lines: + - think about which information could be relevant to the given task, note relevant BIDs and coordinates. + - describe the last action taken, what were its expected effects on the page, versus the actual effects you can observe. Are they the same or not? if not, what could have gone wrong? + - check if you are stuck with repeating the same action over and over again, if so, try something else and change the action. + - check if you think the task is done, if not give a detailed list of actions to do next to accomplish the task. + - finally, if the task is not done, describe the immediate next action to be performed and its expected effect on the page. + Produce only one reasoning_thought step! ${agent.templates.json_format} - # ${agent.templates.thought_format} - # steps_prompt: ${agent.templates.allowed_tools} steps_prompt: ${agent.templates.allowed_steps} steps: - - examples.rl_webagent.steps.ReflectionThought + - tapeagents.steps.ReasoningThought trim_obs_except_last_n: 3 # keep the last 3 observations from the tape in prompt messages max_chars_page_observation: 3000 # keep up to 3000 chars in PageObservation steps - _target_: examples.rl_webagent.agent.WebNode name: act system_prompt: ${agent.templates.system_prompt} - # guidance: | - # Produce the single next tool call to be performed with the current page. - # If you think that the task is solved, call the FinalAnswer. - # You can interact with the page elements using their BIDs or coordinates as arguments for actions. - # HINTS: - # - You can use the BIDs of the elements or the mouse position in x, y coordinates to interact with them. - # - To select value in a dropdown or combobox, ALWAYS use SelectOption tool. - # - To click on a checkbox or radio button, ALWAYS use BID (or coordinates) of the corresponding Text and not the BID (or coordinates) of the element itself. - # - Press enter key to submit the search query. guidance: | Produce the next action to be performed with the current page. If you think that the task is solved, produce the final_answer_action. @@ -131,7 +111,6 @@ agent: - Always produce only one step at a time. - Step kind is always lowercase and underscore separated. 
${agent.templates.json_format} - # steps_prompt: ${agent.templates.allowed_tools} steps_prompt: ${agent.templates.allowed_steps} use_known_actions: true steps: From 96a75c176bcfe9dd60a1a8573865f476a30b8d98 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Thu, 21 Aug 2025 16:39:36 +0000 Subject: [PATCH 072/166] mcp and verify server --- pipelinerl/domains/tir_mcp/env_server.py | 78 ++++++++++++++++++++---- 1 file changed, 66 insertions(+), 12 deletions(-) diff --git a/pipelinerl/domains/tir_mcp/env_server.py b/pipelinerl/domains/tir_mcp/env_server.py index d2be2dd8..d1f14961 100644 --- a/pipelinerl/domains/tir_mcp/env_server.py +++ b/pipelinerl/domains/tir_mcp/env_server.py @@ -2,9 +2,61 @@ from tapeagents.remote_environment import EnvironmentServer from omegaconf import OmegaConf from typing import List +from fastapi import HTTPException +from pydantic import BaseModel +import logging +import asyncio +from concurrent.futures import ProcessPoolExecutor +from functools import partial +from pipelinerl.domains.math.verifier_api import verify_answer + +logger = logging.getLogger(__name__) + + +class EnvironmentServerWithVerifier(EnvironmentServer): + """Environment server that includes the verify_answer endpoint.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.process_pool = ProcessPoolExecutor(max_workers=4) + + def create_app(self): + app = super().create_app() + + class VerifyAnswerRequest(BaseModel): + prediction: str + gold: str + strict: bool = True + max_prediction_length: int = 1000 + + @app.post("/verify_answer") + async def verify_answer_endpoint(request: VerifyAnswerRequest): + try: + # Run verification in the process pool to avoid blocking the main thread + loop = asyncio.get_event_loop() + answer_status = await loop.run_in_executor( + self.process_pool, + partial( + verify_answer, + request.prediction, + request.gold, + request.strict, + request.max_prediction_length + ) + ) + return {"answer_status": answer_status} + except Exception as e: + logger.exception(f"Error in verify_answer: {e}") + raise HTTPException(status_code=500, detail=f"Error verifying answer: {str(e)}") + + return app + + def shutdown(self): + super().shutdown() + if hasattr(self, 'process_pool'): + self.process_pool.shutdown(wait=True) -from pipelinerl.domains.math import MathEnvironment class MCPEnvironmentServer: @@ -35,16 +87,18 @@ def __init__(self, def launch(self, port: int): """ - Serve the environment in TapeAgent. + Serve the environment in TapeAgent with verify_answer endpoint. 
""" - if port != 7778: - env_server = EnvironmentServer(n_envs=self.n_envs, host=self.host, port=port, env_call_timeout=self.env_call_timeout) - env_server.launch(OmegaConf.create({ - "_target_": self.mcp_target, - "config_path": self.mcp_config_path, - "tools_whitelist": self.mcp_tools_whitelist, - "read_timeout_seconds": self.mcp_read_timeout_seconds, - })) - else: - MathEnvironment().launch(port) + env_server = EnvironmentServerWithVerifier( + n_envs=self.n_envs, + host=self.host, + port=port, + env_call_timeout=self.env_call_timeout + ) + env_server.launch(OmegaConf.create({ + "_target_": self.mcp_target, + "config_path": self.mcp_config_path, + "tools_whitelist": self.mcp_tools_whitelist, + "read_timeout_seconds": self.mcp_read_timeout_seconds, + })) From 0b4c9922fafcfe8984a58dd804fe2917b2f31a92 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Thu, 21 Aug 2025 17:37:46 +0000 Subject: [PATCH 073/166] use custom parser --- conf/tir_mcp.yaml | 11 +- pipelinerl/domains/tir_mcp/rollouts.py | 16 ++- pipelinerl/rl_tool_parser_plugin.py | 141 +++++++++++++++++++++++++ 3 files changed, 158 insertions(+), 10 deletions(-) create mode 100644 pipelinerl/rl_tool_parser_plugin.py diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index 3d2c9678..c3b2658a 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -2,6 +2,14 @@ defaults: - base - _self_ +llm: + stop: + - "" + +test_llm: + stop: + - "" + actor: rollout_policy: pipelinerl.domains.tir_mcp.generate_math_rollout2 system_prompt: Please reason step by step, and put your final answer within \boxed{}. @@ -29,7 +37,8 @@ vllm_config: use_v1: true vllm_kwargs: enable-auto-tool-choice: "" - tool-call-parser: hermes + tool-call-parser: rl_tool + tool-parser-plugin: /home/toolkit/research-now-reasoner/pipelinerl/pipelinerl/rl_tool_parser_plugin.py environment: _target_: pipelinerl.domains.tir_mcp.env_server.MCPEnvironmentServer diff --git a/pipelinerl/domains/tir_mcp/rollouts.py b/pipelinerl/domains/tir_mcp/rollouts.py index 27a15b71..f0c751f3 100644 --- a/pipelinerl/domains/tir_mcp/rollouts.py +++ b/pipelinerl/domains/tir_mcp/rollouts.py @@ -58,14 +58,12 @@ async def generate_math_rollout2( ) -> RolloutResult: # (1) Choose a random environment server start = time.perf_counter() - mcp_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment" and job["port"] != 7778] - math_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment" and job["port"] == 7778] + env_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment"] # choose the env job randomly - mcp_job = random.choice(mcp_jobs) - math_job = random.choice(math_jobs) - assert mcp_job.port is not None - mcp_job_url = f"http://{mcp_job.hostname}:{mcp_job.port}" - environment = AsyncRemoteEnvironment(server_url=mcp_job_url) # type: ignore + env_job = random.choice(env_jobs) + assert env_job.port is not None + env_job_url = f"http://{env_job.hostname}:{env_job.port}" + environment = AsyncRemoteEnvironment(server_url=env_job_url) # type: ignore async with environment.acontext(session, wait_for_env=True) as env: actions = await env.a_actions() tools_description = await env.a_tools_description() @@ -95,8 +93,8 @@ async def generate_math_rollout2( training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] answer_status = await verify_answer_rpc( session=session, - host=math_job.hostname, - port=math_job.port, # type: ignore + host=env_job.hostname, + port=env_job.port, # type: ignore prediction=llm_calls[-1].output.content, # type: ignore 
gold=problem["answer"], strict=True, diff --git a/pipelinerl/rl_tool_parser_plugin.py b/pipelinerl/rl_tool_parser_plugin.py new file mode 100644 index 00000000..23c67d66 --- /dev/null +++ b/pipelinerl/rl_tool_parser_plugin.py @@ -0,0 +1,141 @@ +""" +Tool parser plugin for RL tool calling format. +""" + +import json +import re +from typing import Any, Dict, List, Optional, Union, Sequence + +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ToolParser +from vllm.entrypoints.openai.tool_parsers import ToolParserManager +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + ExtractedToolCallInformation, + ToolCall, + FunctionCall +) + + +@ToolParserManager.register_module("rl_tool") +class HermesRLToolParser(ToolParser): + """ + Tool parser for RL tool calling format using markers. + """ + + def __init__(self, tokenizer): + super().__init__(tokenizer) + + # Tool call markers + self.tool_call_start_token = "" + self.tool_call_end_token = "" + + # Regex pattern for parsing tool calls + self.tool_call_regex = re.compile( + r"(.*?)|(.*)", re.DOTALL + ) + + # State for streaming + self.current_tool_name_sent = False + self.prev_tool_call_arr = [] + self.current_tool_id = -1 + self.streamed_args_for_tool = [] + + def extract_tool_calls(self, model_output: str, request: ChatCompletionRequest) -> ExtractedToolCallInformation: + """ + Extract tool calls from the model output. + + Args: + model_output: The raw model output string + request: The request object + + Returns: + ExtractedToolCallInformation with tool calls and metadata + """ + # Quick check to avoid unnecessary processing + if self.tool_call_start_token not in model_output: + return ExtractedToolCallInformation( + tools_called=False, + tool_calls=[], + content=model_output + ) + + try: + # Find all tool call matches + function_call_tuples = self.tool_call_regex.findall(model_output) + + # Parse JSON from matches + tool_calls = [] + for i, match in enumerate(function_call_tuples): + json_str = match[0] if match[0] else match[1] + try: + parsed_call = json.loads(json_str.strip()) + + tool_call = ToolCall( + id=f"call_{i}", + type="function", + function=FunctionCall( + name=parsed_call.get("name", ""), + arguments=json.dumps( + parsed_call.get("arguments", {}), + ensure_ascii=False + ) + ) + ) + tool_calls.append(tool_call) + except json.JSONDecodeError: + continue + + # Extract content before first tool call + content = model_output#[:model_output.find(self.tool_call_end_token)].strip() + if not content: + content = None + + return ExtractedToolCallInformation( + tools_called=bool(tool_calls), + tool_calls=tool_calls, + content=content + ) + + except Exception: + return ExtractedToolCallInformation( + tools_called=False, + tool_calls=[], + content=model_output + ) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + request + ) -> Optional[Dict[str, Any]]: + """ + Extract tool calls in streaming mode. 
+ + Args: + previous_text: The previous text + current_text: The current complete text + delta_text: The new text delta + request: The request object + + Returns: + Dictionary with streaming tool call information + """ + # Simple streaming implementation + if self.tool_call_start_token not in current_text: + return {"content": delta_text} + + # Check if we're starting a new tool call + if self.tool_call_start_token in delta_text: + self.current_tool_id += 1 + return { + "tool_calls": [{ + "index": self.current_tool_id, + "type": "function", + "id": f"call_{self.current_tool_id}", + "function": {"name": ""} + }] + } + + return {"content": delta_text} \ No newline at end of file From 471d28d4e8f0338720756b27ed819d2a4a8613c3 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Thu, 21 Aug 2025 17:44:34 +0000 Subject: [PATCH 074/166] relative path --- conf/tir_mcp.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index c3b2658a..fa5cee77 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -38,7 +38,7 @@ vllm_config: vllm_kwargs: enable-auto-tool-choice: "" tool-call-parser: rl_tool - tool-parser-plugin: /home/toolkit/research-now-reasoner/pipelinerl/pipelinerl/rl_tool_parser_plugin.py + tool-parser-plugin: pipelinerl/rl_tool_parser_plugin.py environment: _target_: pipelinerl.domains.tir_mcp.env_server.MCPEnvironmentServer From 8e0eeffc23865e78c8b0c22b858adc37e5e17100 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Thu, 21 Aug 2025 17:56:12 +0000 Subject: [PATCH 075/166] test apth --- pipelinerl/launch.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pipelinerl/launch.py b/pipelinerl/launch.py index b03ab8d7..b0ec5785 100644 --- a/pipelinerl/launch.py +++ b/pipelinerl/launch.py @@ -18,8 +18,9 @@ logger = logging.getLogger(__name__) -# All the launch commands in this file pass the environment to child processes -os.environ["PYTHONPATH"] = f"/home/toolkit/TapeAgents" +# TODO: rm debug code +import tapeagents +logger.info(f"TapeAgents loaded from: {tapeagents.__file__}") os.environ["NCCL_CUMEM_ENABLE"] = "0" os.environ["TORCH_DISABLE_SHARE_RDZV_TCP_STORE"] = "1" os.environ["HF_DATASETS_DISABLE_PROGRESS_BARS"] = "1" From f93d7560e6a6449f9e6e8f7b947d9414c59d7e7b Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Thu, 21 Aug 2025 18:02:21 +0000 Subject: [PATCH 076/166] typo --- pipelinerl/launch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelinerl/launch.py b/pipelinerl/launch.py index b0ec5785..26f26b70 100644 --- a/pipelinerl/launch.py +++ b/pipelinerl/launch.py @@ -20,7 +20,6 @@ # TODO: rm debug code import tapeagents -logger.info(f"TapeAgents loaded from: {tapeagents.__file__}") os.environ["NCCL_CUMEM_ENABLE"] = "0" os.environ["TORCH_DISABLE_SHARE_RDZV_TCP_STORE"] = "1" os.environ["HF_DATASETS_DISABLE_PROGRESS_BARS"] = "1" @@ -538,6 +537,7 @@ def main(cfg: DictConfig): processes = [] + logger.info(f"TapeAgents loaded from: {tapeagents.__file__}") lead_launcher_stream = SingleStreamSpec(exp_path=exp_dir, topic="launcher_0") init_msg = {"exp_init": "true"} if world_map.my_rank == 0: From 32e3eb62aa58af5e51c6546ce4980863dd987ceb Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Thu, 21 Aug 2025 20:09:47 +0000 Subject: [PATCH 077/166] clean up --- conf/tir_mcp.yaml | 12 ++++++------ pipelinerl/domains/math/rollouts.py | 2 +- pipelinerl/domains/tir_mcp/__init__.py | 3 ++- pipelinerl/domains/tir_mcp/rollouts.py | 8 +++++--- pipelinerl/domains/tir_mcp/steps.py | 13 +++++++++++++ 5 files changed, 27 
insertions(+), 11 deletions(-) create mode 100644 pipelinerl/domains/tir_mcp/steps.py diff --git a/conf/tir_mcp.yaml b/conf/tir_mcp.yaml index fa5cee77..11063092 100644 --- a/conf/tir_mcp.yaml +++ b/conf/tir_mcp.yaml @@ -11,7 +11,7 @@ test_llm: - "" actor: - rollout_policy: pipelinerl.domains.tir_mcp.generate_math_rollout2 + rollout_policy: pipelinerl.domains.tir_mcp.generate_mcp_rollout system_prompt: Please reason step by step, and put your final answer within \boxed{}. llm_max_rollouts: 64 task_template: |- @@ -41,7 +41,7 @@ vllm_config: tool-parser-plugin: pipelinerl/rl_tool_parser_plugin.py environment: - _target_: pipelinerl.domains.tir_mcp.env_server.MCPEnvironmentServer + _target_: pipelinerl.domains.tir_mcp.MCPEnvironmentServer n_envs: 8 n_envs_mcp: 7 n_envs_math: 1 @@ -56,11 +56,11 @@ environment: mcp_read_timeout_seconds: 3000 -agent_max_loops: 1 +agent_max_loops: 3 agent: _target_: tapeagents.agent.Agent name : mcp_agent - max_iterations: 4 + max_iterations: 3 store_llm_calls: true templates: system_prompt: | @@ -110,9 +110,9 @@ agent: name: act system_prompt: ${agent.templates.system_prompt} trim_obs_except_last_n: 100 - guidance: Then produce single function call for the next step. If the answer is ready, call GaiaAnswer. Put your final answer within \boxed{}. + guidance: Then produce single function call for the next step. If the answer is ready, call MathAnswer. Put your final answer within \boxed{}. steps: - - examples.gaia_agent.steps.GaiaAnswer + - pipelinerl.domains.tir_mcp.steps.MathAnswer use_known_actions: true use_function_calls: true diff --git a/pipelinerl/domains/math/rollouts.py b/pipelinerl/domains/math/rollouts.py index 7f370214..c293b36f 100644 --- a/pipelinerl/domains/math/rollouts.py +++ b/pipelinerl/domains/math/rollouts.py @@ -45,7 +45,7 @@ def get_reward(answer_status: str, finished: bool, reward_table: RewardTable) -> case ("correct", True): return reward_table.correct_answer_finished case _: - raise ValueError(f"Invalid answer_status/finished combination: {answer_status}/{trace.finished}") + raise ValueError(f"Invalid answer_status/finished combination: {answer_status}/{finished}") def length_penalty(max_length: int, sequence_length: int, buffer_tokens: int) -> float: diff --git a/pipelinerl/domains/tir_mcp/__init__.py b/pipelinerl/domains/tir_mcp/__init__.py index c558147b..a47458a5 100644 --- a/pipelinerl/domains/tir_mcp/__init__.py +++ b/pipelinerl/domains/tir_mcp/__init__.py @@ -1 +1,2 @@ -from .rollouts import generate_math_rollout2 \ No newline at end of file +from .rollouts import generate_mcp_rollout +from .env_server import MCPEnvironmentServer \ No newline at end of file diff --git a/pipelinerl/domains/tir_mcp/rollouts.py b/pipelinerl/domains/tir_mcp/rollouts.py index f0c751f3..5ca29cb8 100644 --- a/pipelinerl/domains/tir_mcp/rollouts.py +++ b/pipelinerl/domains/tir_mcp/rollouts.py @@ -8,6 +8,7 @@ import aiohttp from omegaconf import DictConfig from pydantic import BaseModel +from pipelinerl.domains.tir_mcp.steps import MathAnswer from pipelinerl.world import Job from tapeagents.core import Prompt from tapeagents.llms.trainable import TrainableLLM @@ -50,7 +51,7 @@ class Metrics(BaseMetrics): num_python_calls: int = 0 num_steps: int = 0 -async def generate_math_rollout2( +async def generate_mcp_rollout( cfg: DictConfig, llm: TrainableLLM, problem: dict, @@ -90,6 +91,7 @@ async def generate_math_rollout2( for step in tape.steps if step.metadata.other.get("llm_call") is not None ] assert len(llm_calls) > 0, "No LLM calls found" + 
tool_call_counts = count_tool_calls_by_category(llm_calls) training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] answer_status = await verify_answer_rpc( session=session, @@ -99,14 +101,14 @@ async def generate_math_rollout2( gold=problem["answer"], strict=True, ) - tape_finished = True # TODO + # Tape should finish with an answer + tape_finished = True if isinstance(tape.steps[-1], MathAnswer) else False reward = get_reward(answer_status, tape_finished, reward_table) for text in training_texts: text.reward = reward latency = time.perf_counter() - start - tool_call_counts = count_tool_calls_by_category(llm_calls) metrics = Metrics( reward=reward, diff --git a/pipelinerl/domains/tir_mcp/steps.py b/pipelinerl/domains/tir_mcp/steps.py new file mode 100644 index 00000000..f33d6efa --- /dev/null +++ b/pipelinerl/domains/tir_mcp/steps.py @@ -0,0 +1,13 @@ +from typing import Any, Literal +from pydantic import Field +from tapeagents.core import StopStep + + +class MathAnswer(StopStep): + """ + Action that indicates the agent has finished solving a math problem. + The final answer must be contained within \\boxed{} format. + """ + + kind: Literal["math_answer_action"] = "math_answer_action" + answer: Any = Field(description="Final answer in \\boxed{} format") \ No newline at end of file From 5a3ab0ee44ff8542025e2060844b0809e088b986 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Thu, 21 Aug 2025 20:55:01 +0000 Subject: [PATCH 078/166] clean up --- pipelinerl/rl_tool_parser_plugin.py | 45 ++--------------------------- 1 file changed, 2 insertions(+), 43 deletions(-) diff --git a/pipelinerl/rl_tool_parser_plugin.py b/pipelinerl/rl_tool_parser_plugin.py index 23c67d66..194a5d87 100644 --- a/pipelinerl/rl_tool_parser_plugin.py +++ b/pipelinerl/rl_tool_parser_plugin.py @@ -85,15 +85,10 @@ def extract_tool_calls(self, model_output: str, request: ChatCompletionRequest) except json.JSONDecodeError: continue - # Extract content before first tool call - content = model_output#[:model_output.find(self.tool_call_end_token)].strip() - if not content: - content = None - return ExtractedToolCallInformation( tools_called=bool(tool_calls), tool_calls=tool_calls, - content=content + content=model_output ) except Exception: @@ -102,40 +97,4 @@ def extract_tool_calls(self, model_output: str, request: ChatCompletionRequest) tool_calls=[], content=model_output ) - - def extract_tool_calls_streaming( - self, - previous_text: str, - current_text: str, - delta_text: str, - request - ) -> Optional[Dict[str, Any]]: - """ - Extract tool calls in streaming mode. 
- - Args: - previous_text: The previous text - current_text: The current complete text - delta_text: The new text delta - request: The request object - - Returns: - Dictionary with streaming tool call information - """ - # Simple streaming implementation - if self.tool_call_start_token not in current_text: - return {"content": delta_text} - - # Check if we're starting a new tool call - if self.tool_call_start_token in delta_text: - self.current_tool_id += 1 - return { - "tool_calls": [{ - "index": self.current_tool_id, - "type": "function", - "id": f"call_{self.current_tool_id}", - "function": {"name": ""} - }] - } - - return {"content": delta_text} \ No newline at end of file + \ No newline at end of file From 436e2333d8638acd601046fa0159b0e73b913583 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Fri, 22 Aug 2025 13:43:51 +0000 Subject: [PATCH 079/166] rename domain to mcp --- conf/{tir_mcp.yaml => mcp.yaml} | 6 +++--- pipelinerl/domains/{tir_mcp => mcp}/__init__.py | 0 pipelinerl/domains/{tir_mcp => mcp}/env_server.py | 0 pipelinerl/domains/{tir_mcp => mcp}/rollouts.py | 5 +++-- pipelinerl/domains/{tir_mcp => mcp}/steps.py | 0 5 files changed, 6 insertions(+), 5 deletions(-) rename conf/{tir_mcp.yaml => mcp.yaml} (96%) rename pipelinerl/domains/{tir_mcp => mcp}/__init__.py (100%) rename pipelinerl/domains/{tir_mcp => mcp}/env_server.py (100%) rename pipelinerl/domains/{tir_mcp => mcp}/rollouts.py (97%) rename pipelinerl/domains/{tir_mcp => mcp}/steps.py (100%) diff --git a/conf/tir_mcp.yaml b/conf/mcp.yaml similarity index 96% rename from conf/tir_mcp.yaml rename to conf/mcp.yaml index 11063092..2ffb097d 100644 --- a/conf/tir_mcp.yaml +++ b/conf/mcp.yaml @@ -11,7 +11,7 @@ test_llm: - "" actor: - rollout_policy: pipelinerl.domains.tir_mcp.generate_mcp_rollout + rollout_policy: pipelinerl.domains.mcp.generate_mcp_rollout system_prompt: Please reason step by step, and put your final answer within \boxed{}. llm_max_rollouts: 64 task_template: |- @@ -41,7 +41,7 @@ vllm_config: tool-parser-plugin: pipelinerl/rl_tool_parser_plugin.py environment: - _target_: pipelinerl.domains.tir_mcp.MCPEnvironmentServer + _target_: pipelinerl.domains.mcp.MCPEnvironmentServer n_envs: 8 n_envs_mcp: 7 n_envs_math: 1 @@ -112,7 +112,7 @@ agent: trim_obs_except_last_n: 100 guidance: Then produce single function call for the next step. If the answer is ready, call MathAnswer. Put your final answer within \boxed{}. 
steps: - - pipelinerl.domains.tir_mcp.steps.MathAnswer + - pipelinerl.domains.mcp.steps.MathAnswer use_known_actions: true use_function_calls: true diff --git a/pipelinerl/domains/tir_mcp/__init__.py b/pipelinerl/domains/mcp/__init__.py similarity index 100% rename from pipelinerl/domains/tir_mcp/__init__.py rename to pipelinerl/domains/mcp/__init__.py diff --git a/pipelinerl/domains/tir_mcp/env_server.py b/pipelinerl/domains/mcp/env_server.py similarity index 100% rename from pipelinerl/domains/tir_mcp/env_server.py rename to pipelinerl/domains/mcp/env_server.py diff --git a/pipelinerl/domains/tir_mcp/rollouts.py b/pipelinerl/domains/mcp/rollouts.py similarity index 97% rename from pipelinerl/domains/tir_mcp/rollouts.py rename to pipelinerl/domains/mcp/rollouts.py index 5ca29cb8..099b0abe 100644 --- a/pipelinerl/domains/tir_mcp/rollouts.py +++ b/pipelinerl/domains/mcp/rollouts.py @@ -8,7 +8,7 @@ import aiohttp from omegaconf import DictConfig from pydantic import BaseModel -from pipelinerl.domains.tir_mcp.steps import MathAnswer +from pipelinerl.domains.mcp.steps import MathAnswer from pipelinerl.world import Job from tapeagents.core import Prompt from tapeagents.llms.trainable import TrainableLLM @@ -66,6 +66,7 @@ async def generate_mcp_rollout( env_job_url = f"http://{env_job.hostname}:{env_job.port}" environment = AsyncRemoteEnvironment(server_url=env_job_url) # type: ignore async with environment.acontext(session, wait_for_env=True) as env: + await env.start_task(problem) actions = await env.a_actions() tools_description = await env.a_tools_description() logger.debug(f"Available tools: {tools_description}") @@ -79,7 +80,7 @@ async def generate_mcp_rollout( try: tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) break - except Exception as e: + except Exception: await asyncio.sleep(5) reward_table = RewardTable(**dict(cfg.rewards)) diff --git a/pipelinerl/domains/tir_mcp/steps.py b/pipelinerl/domains/mcp/steps.py similarity index 100% rename from pipelinerl/domains/tir_mcp/steps.py rename to pipelinerl/domains/mcp/steps.py From 366263ba0ad71ef3f620f654d8b7d42b2897050b Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Fri, 22 Aug 2025 15:02:18 +0000 Subject: [PATCH 080/166] more envs --- conf/mcp.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/mcp.yaml b/conf/mcp.yaml index 2ffb097d..b6fc7e1a 100644 --- a/conf/mcp.yaml +++ b/conf/mcp.yaml @@ -31,7 +31,7 @@ test_dataset_names: - math_500 world: - env_replicas: 5 + env_replicas: 16 vllm_config: use_v1: true @@ -42,7 +42,7 @@ vllm_config: environment: _target_: pipelinerl.domains.mcp.MCPEnvironmentServer - n_envs: 8 + n_envs: 16 n_envs_mcp: 7 n_envs_math: 1 host: "0.0.0.0" From 9b0a74cebeed9f61bf4ca156366b5e75f54b404e Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 22 Aug 2025 15:58:34 +0000 Subject: [PATCH 081/166] towards massimo setup --- pipelinerl/domains/miniwob/load_tasks.py | 4 ++-- pipelinerl/domains/miniwob/rollouts.py | 11 ++++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/pipelinerl/domains/miniwob/load_tasks.py b/pipelinerl/domains/miniwob/load_tasks.py index 4bade257..a056a311 100644 --- a/pipelinerl/domains/miniwob/load_tasks.py +++ b/pipelinerl/domains/miniwob/load_tasks.py @@ -205,12 +205,12 @@ def load_tasks(dataset_names: list[str], train_split: float = 0.6, seeds: list[i elif name == "massimo_train": tasks.extend([ {"dataset": task, "task": task, "seed": seed} - for task in MASSIMO_TRAIN_SPLIT for seed in seeds + for 
task in MASSIMO_TRAIN_SPLIT for seed in range(3,10) # seeds 0-2 are used for held out goals in Mass setup ]) elif name == "massimo_test": tasks.extend([ {"dataset": task, "task": task, "seed": seed} - for task in MASSIMO_TEST_SPLIT for seed in seeds + for task in MASSIMO_TEST_SPLIT for seed in range(10) ]) return tasks diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 3d3287be..5b590665 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -129,7 +129,7 @@ async def generate_miniwob_rollout( if obs_steps: last_obs = obs_steps[-1] # in Miniwob, the observation "reward" is defined as RAW_REWARD_GLOBAL > 0 - # see here: https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/src/browsergym/miniwob/base.py#L183 + # see here: https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/src/browsergym/miniwob/base.py#L188 # Let's take directly the RAW_REWARD_GLOBAL from the metadata # raw_reward = last_obs.metadata.other.get("reward", 0.0) raw_reward = last_obs.metadata.other.get("info", {}).get("task_info", {}).get("REWARD_GLOBAL", -1.0) @@ -142,7 +142,12 @@ async def generate_miniwob_rollout( # get the number of PageObservation steps in the tape n_page_observations = len([step for step in tape.steps if isinstance(step, PageObservation)]) - reward = raw_reward * 0.99**n_step_errors if no_error and raw_reward >= 0 else -1.0 + #reward = raw_reward * 0.99**n_step_errors if no_error and raw_reward >= 0 else -1.0 + # massimo's setup: + reward = float(raw_reward>0) + if reward == 0.0: + reward = -1.0 + reward *= 0.98 ** n_page_observations # (3) Get LLM calls from Tape llm_calls = [step for step in tape.steps if step.metadata.other.get("llm_call") is not None] @@ -166,7 +171,7 @@ async def generate_miniwob_rollout( latency = time.time() - start_time agent_time = tape.metadata.result.get("agent_execution_time", -1.0) env_time = tape.metadata.result.get("environment_execution_time", -1.0) - n_observations = len([s for s in tape.steps if isinstance(s, Observation)]) + n_observations = len([s for s in tape.steps if isinstance(s, Observation)]) # TODO: is this not the same n_page_observations?? 
n_other_steps = len(tape.steps) - n_observations metrics = MiniwobMetrics( reward=reward, From 371be6ed40464aad773b6836c3a499fc6ef32a1d Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Fri, 22 Aug 2025 15:59:23 +0000 Subject: [PATCH 082/166] less env replicas --- conf/mcp.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/mcp.yaml b/conf/mcp.yaml index b6fc7e1a..2de0318a 100644 --- a/conf/mcp.yaml +++ b/conf/mcp.yaml @@ -31,7 +31,7 @@ test_dataset_names: - math_500 world: - env_replicas: 16 + env_replicas: 8 vllm_config: use_v1: true @@ -42,7 +42,7 @@ vllm_config: environment: _target_: pipelinerl.domains.mcp.MCPEnvironmentServer - n_envs: 16 + n_envs: 32 n_envs_mcp: 7 n_envs_math: 1 host: "0.0.0.0" From 46b39d1b196906f659c8ee9ea5328c822ffe4fbd Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Fri, 22 Aug 2025 18:56:23 +0000 Subject: [PATCH 083/166] clean up tmp --- conf/mcp/python.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/mcp/python.json b/conf/mcp/python.json index b0881201..7b270065 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -4,7 +4,7 @@ "command": "bash", "args": [ "-c", - "mkdir -p /tmp/mcp_work_$$ && cd /tmp/mcp_work_$$ && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio" + "mkdir -p /tmp/mcp_work_$$ && cd /tmp/mcp_work_$$ && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio && rm -rf /tmp/mcp_work_$$" ] } } From af63f51320e7ed1838de19644152cf0b5c4968c2 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Fri, 22 Aug 2025 19:38:21 +0000 Subject: [PATCH 084/166] change mcp dir --- conf/mcp/python.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/mcp/python.json b/conf/mcp/python.json index 7b270065..e1b7dd63 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -4,7 +4,7 @@ "command": "bash", "args": [ "-c", - "mkdir -p /tmp/mcp_work_$$ && cd /tmp/mcp_work_$$ && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio && rm -rf /tmp/mcp_work_$$" + "mkdir -p /home/toolkit/.cache/mcp && cd /home/toolkit/.cache/mcp && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio" ] } } From 55a96e55b4bb79de19af8dcb2c93fa75968c8dd0 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Fri, 22 Aug 2025 19:54:41 +0000 Subject: [PATCH 085/166] bigger model len --- conf/mcp.yaml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/conf/mcp.yaml b/conf/mcp.yaml index 2de0318a..826f2445 100644 --- a/conf/mcp.yaml +++ b/conf/mcp.yaml @@ -30,19 +30,17 @@ test_dataset_names: - amc_2023 - math_500 -world: - env_replicas: 8 - vllm_config: use_v1: true vllm_kwargs: enable-auto-tool-choice: "" tool-call-parser: rl_tool tool-parser-plugin: pipelinerl/rl_tool_parser_plugin.py + max_model_len: 48000 environment: _target_: pipelinerl.domains.mcp.MCPEnvironmentServer - n_envs: 32 + n_envs: ${actor.llm_max_rollouts} n_envs_mcp: 7 n_envs_math: 1 host: "0.0.0.0" From dd0ea2bd1c534b3e8b44e9ad35b34043408d64af Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Fri, 22 Aug 2025 20:01:46 +0000 Subject: [PATCH 086/166] typo --- conf/mcp.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/mcp.yaml b/conf/mcp.yaml index 826f2445..f0de57f4 100644 --- a/conf/mcp.yaml +++ b/conf/mcp.yaml 
@@ -31,12 +31,12 @@ test_dataset_names: - math_500 vllm_config: - use_v1: true + use_v1: false vllm_kwargs: enable-auto-tool-choice: "" tool-call-parser: rl_tool tool-parser-plugin: pipelinerl/rl_tool_parser_plugin.py - max_model_len: 48000 + max_model_len: 40960 environment: _target_: pipelinerl.domains.mcp.MCPEnvironmentServer From dc4052d9bd7f51888b656fa1192a91840521efb7 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Sat, 23 Aug 2025 16:09:42 +0000 Subject: [PATCH 087/166] typo --- conf/base.yaml | 2 +- conf/mcp/python.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/base.yaml b/conf/base.yaml index 995db7c5..b91f113b 100644 --- a/conf/base.yaml +++ b/conf/base.yaml @@ -67,7 +67,7 @@ vllm_config: tensor-parallel-size: 1 pipeline-parallel-size: 1 generation-config: vllm - max_model_len: 10000 + max_model_len: 16000 world: replicas: 1 diff --git a/conf/mcp/python.json b/conf/mcp/python.json index e1b7dd63..5e44e30f 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -4,7 +4,7 @@ "command": "bash", "args": [ "-c", - "mkdir -p /home/toolkit/.cache/mcp && cd /home/toolkit/.cache/mcp && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio" + "mkdir -p /home/toolkit/.cache/mcp_$$ && cd /home/toolkit/.cache/mcp_$$ && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio" ] } } From bb4d0c59128900a7f723ba0b9edd1ed7c860ddda Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Tue, 26 Aug 2025 19:51:30 +0000 Subject: [PATCH 088/166] clean up --- conf/finetune/base.yaml | 2 +- conf/mcp.yaml | 15 ++++----------- pipelinerl/domains/mcp/env_server.py | 3 --- 3 files changed, 5 insertions(+), 15 deletions(-) diff --git a/conf/finetune/base.yaml b/conf/finetune/base.yaml index 237e6d56..6fb09310 100644 --- a/conf/finetune/base.yaml +++ b/conf/finetune/base.yaml @@ -36,7 +36,7 @@ learning_rate: 1e-6 # How much to clip the gradient (no clipping if null) gradient_clipping_threshold: 0.3 # Learning rate scheduler type (indexed by completed_steps). -lr_scheduler_type: cosine # could be cosine, constant_with_warmup +lr_scheduler_type: constant # could be cosine, constant_with_warmup # Number of warmup (completed) steps in the learning rate schedule. num_warmup_steps: 50 # Number of gradient accumulation steps. 
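An aside on the scheduler switch above: with HuggingFace's `get_scheduler`, the `constant` schedule keeps the learning rate fixed, while `cosine` decays it over the training run. A self-contained sketch of how these config values typically map onto that API (the wiring below is illustrative, not the actual PipelineRL code):

```python
import torch
from transformers import get_scheduler

# Toy optimizer so the snippet runs on its own.
params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = torch.optim.AdamW(params, lr=1e-6)  # finetune.learning_rate

scheduler = get_scheduler(
    name="constant",          # this patch switches from "cosine"
    optimizer=optimizer,
    num_warmup_steps=50,      # finetune.num_warmup_steps (unused by "constant")
    num_training_steps=1000,  # finetune.max_train_steps
)
```
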
diff --git a/conf/mcp.yaml b/conf/mcp.yaml
index f0de57f4..e8aa33cd 100644
--- a/conf/mcp.yaml
+++ b/conf/mcp.yaml
@@ -2,13 +2,6 @@ defaults:
   - base
   - _self_

-llm:
-  stop:
-    - "</tool_call>"
-
-test_llm:
-  stop:
-    - "</tool_call>"

 actor:
   rollout_policy: pipelinerl.domains.mcp.generate_mcp_rollout
@@ -40,20 +33,20 @@ vllm_config:

 environment:
   _target_: pipelinerl.domains.mcp.MCPEnvironmentServer
-  n_envs: ${actor.llm_max_rollouts}
-  n_envs_mcp: 7
-  n_envs_math: 1
+  n_envs: 32
   host: "0.0.0.0"
   exp_path: ${output_dir}/env_server
   mcp_target: tapeagents.mcp.MCPEnvironment
   mcp_config_path: /home/toolkit/research-now-reasoner/pipelinerl/conf/mcp/python.json
   mcp_tools_whitelist:
     - run_python_code
-  math_target: pipelinerl.domains.math.MathEnvironment
   env_call_timeout: 600 # Increased from default 60s to 10 minutes
   mcp_read_timeout_seconds: 3000

+world:
+  env_replicas_per_actor: 8
+
 agent_max_loops: 3
 agent:
   _target_: tapeagents.agent.Agent
diff --git a/pipelinerl/domains/mcp/env_server.py b/pipelinerl/domains/mcp/env_server.py
index d1f14961..fabc5af2 100644
--- a/pipelinerl/domains/mcp/env_server.py
+++ b/pipelinerl/domains/mcp/env_server.py
@@ -62,13 +62,10 @@ class MCPEnvironmentServer:

     def __init__(self,
                  n_envs: int,
-                 n_envs_mcp: int,
-                 n_envs_math: int,
                  host: str,
                  mcp_target: str,
                  mcp_config_path: str,
                  mcp_tools_whitelist: List[str],
-                 math_target: str,
                  exp_path: str,
                  env_call_timeout: int = 60,
                  mcp_read_timeout_seconds: int = 10,

From ccdcd325c31fd6f1fdcea314d26df150c4b5a064 Mon Sep 17 00:00:00 2001
From: Alex Piche
Date: Tue, 26 Aug 2025 20:33:50 +0000
Subject: [PATCH 089/166] center reward

---
 pipelinerl/finetune/rl/__init__.py | 3 +++
 pipelinerl/finetune_loop.py        | 7 ++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/pipelinerl/finetune/rl/__init__.py b/pipelinerl/finetune/rl/__init__.py
index 57aa4fa7..fd014bb9 100644
--- a/pipelinerl/finetune/rl/__init__.py
+++ b/pipelinerl/finetune/rl/__init__.py
@@ -133,6 +133,7 @@ def rl_step(
     current_step: int,
     max_step: int,
     config: RLConfig,
+    running_avg_reward: float = 0.0,
 ) -> tuple[torch.Tensor, dict[str, float]]:
     """
     Perform a single RL step on the model using the given batch and config.
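The hunks below subtract a running average from the rewards before the RL loss is computed. A self-contained sketch of the scheme this and the next two patches converge on (batch averages invented for illustration):

```python
# EMA reward centering as introduced here: the trainer keeps an exponential
# moving average of batch rewards and subtracts it from each new batch.
alpha = 0.1  # EMA coefficient used in finetune_loop.py
running_avg_reward = None  # a later patch initializes it from the first batch mean

for batch_mean_reward in [0.2, 0.4, 0.1, 0.5]:  # made-up batch averages
    if running_avg_reward is None:
        running_avg_reward = batch_mean_reward
    else:
        running_avg_reward = (1 - alpha) * running_avg_reward + alpha * batch_mean_reward
    centered = batch_mean_reward - running_avg_reward
    print(f"avg={running_avg_reward:.3f} centered={centered:.3f}")
```

Note the ordering subtlety in the actual patch: `rl_step` centers with the average accumulated over *previous* steps, and the training loop only updates the average after the step completes.
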
@@ -211,6 +212,8 @@ def rl_step( # get shifted values and compute ratios rewards = batch.rewards[:, 1:] + # Center rewards using running average + rewards = rewards - running_avg_reward ref_logprobs = batch.ref_logprobs[:, 1:] old_logprobs = batch.old_logprobs[:, 1:] group_tokens = batch.group_tokens[:, 1:] diff --git a/pipelinerl/finetune_loop.py b/pipelinerl/finetune_loop.py index a91d1aa2..32b1dbba 100644 --- a/pipelinerl/finetune_loop.py +++ b/pipelinerl/finetune_loop.py @@ -659,7 +659,7 @@ def toggle_sync(sync: bool): assert batch.seq_boundaries is not None update_ring_flash_attn_params(batch.seq_boundaries, seq_parallel_group) loss, this_step_rl_metrics = rl_step( - model, batch, training_metrics.completed_steps, final_train_steps, rl_config + model, batch, training_metrics.completed_steps, final_train_steps, rl_config, training_metrics.running_avg_reward ) if is_sentinel_batch: # zero out the loss and do not update the metrics @@ -668,6 +668,11 @@ def toggle_sync(sync: bool): # update the metrics for k, v in this_step_rl_metrics.items(): rl_metrics[k].append(v) + + # Update running average reward + current_reward = this_step_rl_metrics.get('reward', 0.0) + alpha = 0.1 # Exponential moving average coefficient + training_metrics.running_avg_reward = (1 - alpha) * training_metrics.running_avg_reward + alpha * current_reward backward(loss, is_final_micro_batch=do_optimizer_step) From 7f5ed953e166d5d91a26f44144cd3404c5374111 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Tue, 26 Aug 2025 20:59:54 +0000 Subject: [PATCH 090/166] running avg reward --- pipelinerl/finetune/types.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelinerl/finetune/types.py b/pipelinerl/finetune/types.py index 33194c90..8c8f85be 100644 --- a/pipelinerl/finetune/types.py +++ b/pipelinerl/finetune/types.py @@ -41,6 +41,7 @@ class TrainingMetrics: best_completed_steps: int = 0 lr: float = 0.0 time_waiting_for_data: float = 0.0 + running_avg_reward: float = 0.0 class PipelineBatchEncoding(BaseModel): From 88a0ee7c291d97f446862f348f91e9a82000b342 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Tue, 26 Aug 2025 21:33:02 +0000 Subject: [PATCH 091/166] start from real mean --- pipelinerl/finetune/rl/__init__.py | 3 +++ pipelinerl/finetune/types.py | 2 +- pipelinerl/finetune_loop.py | 5 ++++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pipelinerl/finetune/rl/__init__.py b/pipelinerl/finetune/rl/__init__.py index fd014bb9..499cccf8 100644 --- a/pipelinerl/finetune/rl/__init__.py +++ b/pipelinerl/finetune/rl/__init__.py @@ -11,6 +11,7 @@ from datasets import Dataset from transformers import PreTrainedModel from pipelinerl.finetune.types import PipelineBatchEncoding +from tapeagents.tapeagents.finetune.rl.utils import masked_mean from .utils import ( sum_sum, @@ -213,6 +214,8 @@ def rl_step( # get shifted values and compute ratios rewards = batch.rewards[:, 1:] # Center rewards using running average + if running_avg_reward is None: + running_avg_reward = masked_mean(rewards, masks_shifted).item() rewards = rewards - running_avg_reward ref_logprobs = batch.ref_logprobs[:, 1:] old_logprobs = batch.old_logprobs[:, 1:] diff --git a/pipelinerl/finetune/types.py b/pipelinerl/finetune/types.py index 8c8f85be..2af9edfd 100644 --- a/pipelinerl/finetune/types.py +++ b/pipelinerl/finetune/types.py @@ -41,7 +41,7 @@ class TrainingMetrics: best_completed_steps: int = 0 lr: float = 0.0 time_waiting_for_data: float = 0.0 - running_avg_reward: float = 0.0 + running_avg_reward: float | None = None class 
PipelineBatchEncoding(BaseModel): diff --git a/pipelinerl/finetune_loop.py b/pipelinerl/finetune_loop.py index 32b1dbba..82d9ee31 100644 --- a/pipelinerl/finetune_loop.py +++ b/pipelinerl/finetune_loop.py @@ -672,7 +672,10 @@ def toggle_sync(sync: bool): # Update running average reward current_reward = this_step_rl_metrics.get('reward', 0.0) alpha = 0.1 # Exponential moving average coefficient - training_metrics.running_avg_reward = (1 - alpha) * training_metrics.running_avg_reward + alpha * current_reward + if training_metrics.running_avg_reward is None: + training_metrics.running_avg_reward = current_reward + else: + training_metrics.running_avg_reward = (1 - alpha) * training_metrics.running_avg_reward + alpha * current_reward backward(loss, is_final_micro_batch=do_optimizer_step) From ef46f392ae856bf30bfe3639a7750e13dac7d9b6 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Thu, 28 Aug 2025 13:57:23 +0000 Subject: [PATCH 092/166] upd configs --- conf/miniwob.yaml | 4 ++-- conf/miniwob_massimo.yaml | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) create mode 100644 conf/miniwob_massimo.yaml diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index a55dfd65..5c090823 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -18,7 +18,7 @@ model_path: meta-llama/Llama-3.1-8B-Instruct finetune: seq_length: 16384 # input + output tokens - max_train_steps: 1000 + max_train_steps: 1000 # 1000 optim steps = 1000 * bs samples llm: parameters: @@ -125,7 +125,7 @@ start_attempts: 3 # number of attempts to start each task environment: _target_: pipelinerl.domains.miniwob.environment_server.WebEnvironmentServer miniwob_url: ??? - n_envs: 32 + n_envs: ${actor.llm_max_rollouts} host: "0.0.0.0" env_call_timeout: 600 # timeout for each environment call (e.g. start_task, act, etc.) 
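The new config added below pairs a per-device batch size of 1 with 512 gradient-accumulation passes. A quick sanity check of the `eval_every_n_versions` comment in it (plain arithmetic, no PipelineRL imports needed):

```python
train_batch_size = 1
gradient_accumulation_passes = 512
effective_batch_size = train_batch_size * gradient_accumulation_passes  # 512 samples per optimizer step

optim_steps_between_evals = 10
eval_every_n_versions = effective_batch_size * optim_steps_between_evals
assert eval_every_n_versions == 5120  # matches the value in miniwob_massimo.yaml
```
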
web_env_target: examples.rl_webagent.environment.WebEnvironment diff --git a/conf/miniwob_massimo.yaml b/conf/miniwob_massimo.yaml new file mode 100644 index 00000000..7f2c3da3 --- /dev/null +++ b/conf/miniwob_massimo.yaml @@ -0,0 +1,14 @@ +defaults: + - miniwob + - _self_ + +train_dataset_names: + - massimo_train +test_dataset_names: + - massimo_test + +finetune: + train_batch_size: 1 + gradient_accumulation_passes: 512 + +eval_every_n_versions: 5120 # 512 effective bs * 10 "optim steps" From 1274748dae4c8c36c7d746405471b7746d3021ea Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Thu, 28 Aug 2025 13:57:41 +0000 Subject: [PATCH 093/166] upd --- pipelinerl/domains/miniwob/rollouts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 5b590665..8d34e6ec 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -142,7 +142,7 @@ async def generate_miniwob_rollout( # get the number of PageObservation steps in the tape n_page_observations = len([step for step in tape.steps if isinstance(step, PageObservation)]) - #reward = raw_reward * 0.99**n_step_errors if no_error and raw_reward >= 0 else -1.0 + # reward = raw_reward * 0.99**n_step_errors if no_error and raw_reward >= 0 else -1.0 # massimo's setup: reward = float(raw_reward>0) if reward == 0.0: From 66bcfbde9f7306357e198eaf4d07ec43688f7786 Mon Sep 17 00:00:00 2001 From: rafapi Date: Thu, 28 Aug 2025 16:19:43 +0000 Subject: [PATCH 094/166] Fix paths --- conf/mcp.yaml | 2 +- conf/mcp/python.json | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/mcp.yaml b/conf/mcp.yaml index e8aa33cd..be79fde5 100644 --- a/conf/mcp.yaml +++ b/conf/mcp.yaml @@ -37,7 +37,7 @@ environment: host: "0.0.0.0" exp_path: ${output_dir}/env_server mcp_target: tapeagents.mcp.MCPEnvironment - mcp_config_path: /home/toolkit/research-now-reasoner/pipelinerl/conf/mcp/python.json + mcp_config_path: /home/toolkit/PipelineRL/conf/mcp/python.json mcp_tools_whitelist: - run_python_code env_call_timeout: 600 # Increased from default 60s to 10 minutes diff --git a/conf/mcp/python.json b/conf/mcp/python.json index 5e44e30f..580f70ef 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -4,8 +4,8 @@ "command": "bash", "args": [ "-c", - "mkdir -p /home/toolkit/.cache/mcp_$$ && cd /home/toolkit/.cache/mcp_$$ && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio" - ] + "DIR=$(mktemp -d -p /tmp mcp_XXXXXXXX) && cd \"$DIR\" && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio; EC=$?; cd /; rm -rf \"$DIR\"; exit $EC" + ] } } } \ No newline at end of file From 3fcb847988ae6e4e27c17fb375f0133d56ed0a4c Mon Sep 17 00:00:00 2001 From: rafapi Date: Thu, 28 Aug 2025 17:00:25 +0000 Subject: [PATCH 095/166] Use relative path --- conf/mcp.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/mcp.yaml b/conf/mcp.yaml index be79fde5..fc30208e 100644 --- a/conf/mcp.yaml +++ b/conf/mcp.yaml @@ -33,11 +33,11 @@ vllm_config: environment: _target_: pipelinerl.domains.mcp.MCPEnvironmentServer - n_envs: 32 + n_envs: 8 host: "0.0.0.0" exp_path: ${output_dir}/env_server mcp_target: tapeagents.mcp.MCPEnvironment - mcp_config_path: /home/toolkit/PipelineRL/conf/mcp/python.json + mcp_config_path: ${hydra:runtime.cwd}/conf/mcp/python.json mcp_tools_whitelist: 
- run_python_code env_call_timeout: 600 # Increased from default 60s to 10 minutes From b16d45c6b2496a017aa5e47817e57485b8ed83a2 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Thu, 28 Aug 2025 18:39:11 +0000 Subject: [PATCH 096/166] revert reward calculation --- pipelinerl/domains/miniwob/rollouts.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 8d34e6ec..0c5a4396 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -142,12 +142,12 @@ async def generate_miniwob_rollout( # get the number of PageObservation steps in the tape n_page_observations = len([step for step in tape.steps if isinstance(step, PageObservation)]) - # reward = raw_reward * 0.99**n_step_errors if no_error and raw_reward >= 0 else -1.0 + reward = raw_reward * 0.99**n_step_errors if no_error and raw_reward >= 0 else -1.0 # massimo's setup: - reward = float(raw_reward>0) - if reward == 0.0: - reward = -1.0 - reward *= 0.98 ** n_page_observations + # reward = float(raw_reward>0) + # if reward == 0.0: + # reward = -1.0 + # reward *= 0.98 ** n_page_observations # (3) Get LLM calls from Tape llm_calls = [step for step in tape.steps if step.metadata.other.get("llm_call") is not None] From 9f239c6b52fea245f89b2408ad928a4253e0ebc3 Mon Sep 17 00:00:00 2001 From: rafapi Date: Thu, 28 Aug 2025 19:26:02 +0000 Subject: [PATCH 097/166] Fix path --- conf/mcp/python.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/mcp/python.json b/conf/mcp/python.json index 580f70ef..977ab8c3 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -4,7 +4,7 @@ "command": "bash", "args": [ "-c", - "DIR=$(mktemp -d -p /tmp mcp_XXXXXXXX) && cd \"$DIR\" && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio; EC=$?; cd /; rm -rf \"$DIR\"; exit $EC" + "DIR=$(mktemp -d -p /tmp/mcp_XXXXXXXX) && cd \"$DIR\" && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio; EC=$?; cd /; rm -rf \"$DIR\"; exit $EC" ] } } From 9e61c35f51963de30a963fe3cc5bb46ac2ecccd9 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Thu, 28 Aug 2025 19:47:54 +0000 Subject: [PATCH 098/166] update massimo cfg to grpo --- conf/miniwob_massimo.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/conf/miniwob_massimo.yaml b/conf/miniwob_massimo.yaml index 7f2c3da3..99ba0c56 100644 --- a/conf/miniwob_massimo.yaml +++ b/conf/miniwob_massimo.yaml @@ -1,5 +1,6 @@ defaults: - miniwob + - override finetune: grpo - _self_ train_dataset_names: @@ -8,6 +9,8 @@ test_dataset_names: - massimo_test finetune: + seq_length: 16384 # input + output tokens + max_train_steps: 1000 # 1000 optim steps = 1000 * bs samples train_batch_size: 1 gradient_accumulation_passes: 512 From 020a02172b1859684a8bc3a9f5f4185d3040f268 Mon Sep 17 00:00:00 2001 From: rafapi Date: Thu, 28 Aug 2025 19:50:40 +0000 Subject: [PATCH 099/166] revert mktemp changes --- conf/mcp/python.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/mcp/python.json b/conf/mcp/python.json index 977ab8c3..580f70ef 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -4,7 +4,7 @@ "command": "bash", "args": [ "-c", - "DIR=$(mktemp -d -p /tmp/mcp_XXXXXXXX) && cd \"$DIR\" && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto 
jsr:@pydantic/mcp-run-python stdio; EC=$?; cd /; rm -rf \"$DIR\"; exit $EC" + "DIR=$(mktemp -d -p /tmp mcp_XXXXXXXX) && cd \"$DIR\" && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio; EC=$?; cd /; rm -rf \"$DIR\"; exit $EC" ] } } From ef884f22cc4f3b655a067df405562a925fc35ed5 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Thu, 28 Aug 2025 20:49:15 +0000 Subject: [PATCH 100/166] test with ppo --- conf/miniwob_massimo.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/miniwob_massimo.yaml b/conf/miniwob_massimo.yaml index 99ba0c56..003238be 100644 --- a/conf/miniwob_massimo.yaml +++ b/conf/miniwob_massimo.yaml @@ -1,6 +1,6 @@ defaults: - miniwob - - override finetune: grpo + - override finetune: ppo - _self_ train_dataset_names: From 4323f571853c15c8459b4ed127e8979a36b617b4 Mon Sep 17 00:00:00 2001 From: rafapi Date: Fri, 29 Aug 2025 17:44:27 +0000 Subject: [PATCH 101/166] Fix deno paths --- conf/mcp.yaml | 2 +- conf/mcp/python.json | 2 +- pipelinerl/finetune/rl/__init__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/mcp.yaml b/conf/mcp.yaml index fc30208e..efdd196a 100644 --- a/conf/mcp.yaml +++ b/conf/mcp.yaml @@ -28,7 +28,7 @@ vllm_config: vllm_kwargs: enable-auto-tool-choice: "" tool-call-parser: rl_tool - tool-parser-plugin: pipelinerl/rl_tool_parser_plugin.py + tool-parser-plugin: ${hydra:runtime.cwd}/pipelinerl/rl_tool_parser_plugin.py max_model_len: 40960 environment: diff --git a/conf/mcp/python.json b/conf/mcp/python.json index 580f70ef..b26cd498 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -4,7 +4,7 @@ "command": "bash", "args": [ "-c", - "DIR=$(mktemp -d -p /tmp mcp_XXXXXXXX) && cd \"$DIR\" && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio; EC=$?; cd /; rm -rf \"$DIR\"; exit $EC" + "JOB_TAG=${SLURM_JOB_ID:-${SLURM_PROCID:-$(hostname)-$$-$(date +%s%N)}} && BASE=/home/toolkit/.cache && mkdir -p \"$BASE/mcp_tmp/$JOB_TAG\" \"$BASE/deno_mcp/$JOB_TAG\" \"$BASE/tmp/$JOB_TAG\" && export DENO_DIR=\"$BASE/deno_mcp/$JOB_TAG\" TMPDIR=\"$BASE/tmp/$JOB_TAG\" && DIR=$(mktemp -d -p \"$BASE/mcp_tmp/$JOB_TAG\" mcp_XXXXXXXX) && cd \"$DIR\" && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio; EC=$?; cd /; rm -rf \"$DIR\"; exit $EC" ] } } diff --git a/pipelinerl/finetune/rl/__init__.py b/pipelinerl/finetune/rl/__init__.py index 499cccf8..289b8f74 100644 --- a/pipelinerl/finetune/rl/__init__.py +++ b/pipelinerl/finetune/rl/__init__.py @@ -11,7 +11,7 @@ from datasets import Dataset from transformers import PreTrainedModel from pipelinerl.finetune.types import PipelineBatchEncoding -from tapeagents.tapeagents.finetune.rl.utils import masked_mean +from tapeagents.finetune.rl.utils import masked_mean from .utils import ( sum_sum, From 2b5e9f5fcee5c824b13867898d8a6b20bc526a53 Mon Sep 17 00:00:00 2001 From: rafapi Date: Fri, 29 Aug 2025 20:31:01 +0000 Subject: [PATCH 102/166] udt --- conf/mcp/python.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/mcp/python.json b/conf/mcp/python.json index b26cd498..0029ea0e 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -4,7 +4,7 @@ "command": "bash", "args": [ "-c", - "JOB_TAG=${SLURM_JOB_ID:-${SLURM_PROCID:-$(hostname)-$$-$(date +%s%N)}} && BASE=/home/toolkit/.cache && mkdir -p \"$BASE/mcp_tmp/$JOB_TAG\" 
\"$BASE/deno_mcp/$JOB_TAG\" \"$BASE/tmp/$JOB_TAG\" && export DENO_DIR=\"$BASE/deno_mcp/$JOB_TAG\" TMPDIR=\"$BASE/tmp/$JOB_TAG\" && DIR=$(mktemp -d -p \"$BASE/mcp_tmp/$JOB_TAG\" mcp_XXXXXXXX) && cd \"$DIR\" && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio; EC=$?; cd /; rm -rf \"$DIR\"; exit $EC" + "JOB_TAG=${JOB_ID:-${PROCID:-$(hostname)-$$-$(date +%s%N)}} && BASE=/home/toolkit/.cache && mkdir -p \"$BASE/mcp_tmp/$JOB_TAG\" \"$BASE/deno_mcp/$JOB_TAG\" \"$BASE/tmp/$JOB_TAG\" && export DENO_DIR=\"$BASE/deno_mcp/$JOB_TAG\" TMPDIR=\"$BASE/tmp/$JOB_TAG\" && DIR=$(mktemp -d -p \"$BASE/mcp_tmp/$JOB_TAG\" mcp_XXXXXXXX) && cd \"$DIR\" && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio; EC=$?; cd /; rm -rf \"$DIR\"; exit $EC" ] } } From 565d25c1f7774a5981c49b1e79aea7dfd3f358d0 Mon Sep 17 00:00:00 2001 From: rafapi Date: Fri, 29 Aug 2025 20:41:40 +0000 Subject: [PATCH 103/166] make the cache tag stable across all processes --- conf/mcp/python.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/mcp/python.json b/conf/mcp/python.json index 0029ea0e..1224755b 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -4,7 +4,7 @@ "command": "bash", "args": [ "-c", - "JOB_TAG=${JOB_ID:-${PROCID:-$(hostname)-$$-$(date +%s%N)}} && BASE=/home/toolkit/.cache && mkdir -p \"$BASE/mcp_tmp/$JOB_TAG\" \"$BASE/deno_mcp/$JOB_TAG\" \"$BASE/tmp/$JOB_TAG\" && export DENO_DIR=\"$BASE/deno_mcp/$JOB_TAG\" TMPDIR=\"$BASE/tmp/$JOB_TAG\" && DIR=$(mktemp -d -p \"$BASE/mcp_tmp/$JOB_TAG\" mcp_XXXXXXXX) && cd \"$DIR\" && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio; EC=$?; cd /; rm -rf \"$DIR\"; exit $EC" + "JOB_TAG=${MCP_JOB_TAG:-${SLURM_JOB_ID:-$HOSTNAME}} && BASE=/home/toolkit/.cache && mkdir -p \"$BASE/mcp_tmp/$JOB_TAG\" \"$BASE/deno_mcp/$JOB_TAG\" \"$BASE/tmp/$JOB_TAG\" && export DENO_DIR=\"$BASE/deno_mcp/$JOB_TAG\" TMPDIR=\"$BASE/tmp/$JOB_TAG\" && /home/toolkit/.deno/bin/deno cache jsr:@pydantic/mcp-run-python >/dev/null 2>&1 || true; DIR=$(mktemp -d -p \"$BASE/mcp_tmp/$JOB_TAG\" mcp_XXXXXXXX) && cd \"$DIR\" && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio; EC=$?; cd /; rm -rf \"$DIR\"; exit $EC" ] } } From e39ff7b3043a3bcf0a220252d100bc0be8cce537 Mon Sep 17 00:00:00 2001 From: Alex Piche Date: Fri, 29 Aug 2025 22:56:34 +0000 Subject: [PATCH 104/166] remove running avg --- pipelinerl/finetune/rl/__init__.py | 6 ------ pipelinerl/finetune/types.py | 1 - pipelinerl/finetune_loop.py | 10 +--------- 3 files changed, 1 insertion(+), 16 deletions(-) diff --git a/pipelinerl/finetune/rl/__init__.py b/pipelinerl/finetune/rl/__init__.py index 289b8f74..57aa4fa7 100644 --- a/pipelinerl/finetune/rl/__init__.py +++ b/pipelinerl/finetune/rl/__init__.py @@ -11,7 +11,6 @@ from datasets import Dataset from transformers import PreTrainedModel from pipelinerl.finetune.types import PipelineBatchEncoding -from tapeagents.finetune.rl.utils import masked_mean from .utils import ( sum_sum, @@ -134,7 +133,6 @@ def rl_step( current_step: int, max_step: int, config: RLConfig, - running_avg_reward: float = 0.0, ) -> tuple[torch.Tensor, dict[str, float]]: """ Perform a single RL step on the model using the given batch and config. 
@@ -213,10 +211,6 @@ def rl_step( # get shifted values and compute ratios rewards = batch.rewards[:, 1:] - # Center rewards using running average - if running_avg_reward is None: - running_avg_reward = masked_mean(rewards, masks_shifted).item() - rewards = rewards - running_avg_reward ref_logprobs = batch.ref_logprobs[:, 1:] old_logprobs = batch.old_logprobs[:, 1:] group_tokens = batch.group_tokens[:, 1:] diff --git a/pipelinerl/finetune/types.py b/pipelinerl/finetune/types.py index 2af9edfd..33194c90 100644 --- a/pipelinerl/finetune/types.py +++ b/pipelinerl/finetune/types.py @@ -41,7 +41,6 @@ class TrainingMetrics: best_completed_steps: int = 0 lr: float = 0.0 time_waiting_for_data: float = 0.0 - running_avg_reward: float | None = None class PipelineBatchEncoding(BaseModel): diff --git a/pipelinerl/finetune_loop.py b/pipelinerl/finetune_loop.py index 82d9ee31..a91d1aa2 100644 --- a/pipelinerl/finetune_loop.py +++ b/pipelinerl/finetune_loop.py @@ -659,7 +659,7 @@ def toggle_sync(sync: bool): assert batch.seq_boundaries is not None update_ring_flash_attn_params(batch.seq_boundaries, seq_parallel_group) loss, this_step_rl_metrics = rl_step( - model, batch, training_metrics.completed_steps, final_train_steps, rl_config, training_metrics.running_avg_reward + model, batch, training_metrics.completed_steps, final_train_steps, rl_config ) if is_sentinel_batch: # zero out the loss and do not update the metrics @@ -668,14 +668,6 @@ def toggle_sync(sync: bool): # update the metrics for k, v in this_step_rl_metrics.items(): rl_metrics[k].append(v) - - # Update running average reward - current_reward = this_step_rl_metrics.get('reward', 0.0) - alpha = 0.1 # Exponential moving average coefficient - if training_metrics.running_avg_reward is None: - training_metrics.running_avg_reward = current_reward - else: - training_metrics.running_avg_reward = (1 - alpha) * training_metrics.running_avg_reward + alpha * current_reward backward(loss, is_final_micro_batch=do_optimizer_step) From fc17df72414ef5ab0bcb8b1b677295442445af60 Mon Sep 17 00:00:00 2001 From: rafapi Date: Sat, 30 Aug 2025 21:31:23 +0000 Subject: [PATCH 105/166] fix --- conf/mcp/python.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/mcp/python.json b/conf/mcp/python.json index 1224755b..d64cb8eb 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -4,7 +4,7 @@ "command": "bash", "args": [ "-c", - "JOB_TAG=${MCP_JOB_TAG:-${SLURM_JOB_ID:-$HOSTNAME}} && BASE=/home/toolkit/.cache && mkdir -p \"$BASE/mcp_tmp/$JOB_TAG\" \"$BASE/deno_mcp/$JOB_TAG\" \"$BASE/tmp/$JOB_TAG\" && export DENO_DIR=\"$BASE/deno_mcp/$JOB_TAG\" TMPDIR=\"$BASE/tmp/$JOB_TAG\" && /home/toolkit/.deno/bin/deno cache jsr:@pydantic/mcp-run-python >/dev/null 2>&1 || true; DIR=$(mktemp -d -p \"$BASE/mcp_tmp/$JOB_TAG\" mcp_XXXXXXXX) && cd \"$DIR\" && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio; EC=$?; cd /; rm -rf \"$DIR\"; exit $EC" + "JOB_TAG=${MCP_JOB_TAG:-${JOB_ID:-$HOSTNAME}} && BASE=/home/toolkit/.cache && mkdir -p \"$BASE/mcp_tmp/$JOB_TAG\" \"$BASE/deno_mcp/$JOB_TAG\" \"$BASE/tmp/$JOB_TAG\" && export DENO_DIR=\"$BASE/deno_mcp/$JOB_TAG\" TMPDIR=\"$BASE/tmp/$JOB_TAG\" && /home/toolkit/.deno/bin/deno cache jsr:@pydantic/mcp-run-python >/dev/null 2>&1 || true; DIR=$(mktemp -d -p \"$BASE/mcp_tmp/$JOB_TAG\" mcp_XXXXXXXX) && cd \"$DIR\" && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio; 
EC=$?; cd /; rm -rf \"$DIR\"; exit $EC" ] } } From 537ec7a08b5e5f210fda381c99757438cc921ff7 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Tue, 2 Sep 2025 14:57:07 +0000 Subject: [PATCH 106/166] update configs --- conf/miniwob_grpo.yaml | 149 ++++++++++++++++++ conf/miniwob_massimo_grpo.yaml | 18 +++ ..._massimo.yaml => miniwob_massimo_ppo.yaml} | 5 +- conf/{miniwob.yaml => miniwob_ppo.yaml} | 5 + pipelinerl/domains/miniwob/rollouts.py | 15 +- 5 files changed, 184 insertions(+), 8 deletions(-) create mode 100644 conf/miniwob_grpo.yaml create mode 100644 conf/miniwob_massimo_grpo.yaml rename conf/{miniwob_massimo.yaml => miniwob_massimo_ppo.yaml} (88%) rename conf/{miniwob.yaml => miniwob_ppo.yaml} (97%) diff --git a/conf/miniwob_grpo.yaml b/conf/miniwob_grpo.yaml new file mode 100644 index 00000000..864dbc75 --- /dev/null +++ b/conf/miniwob_grpo.yaml @@ -0,0 +1,149 @@ +defaults: + - base + - override streams: redis + - override finetune: grpo + - _self_ + +world: + actor_fraction: 3 + preprocessor_fraction: 0 + finetune_fraction: 5 + +# debug: +# mode: actor +save_tapes: False + +output_dir: results/miniwob/${now:%Y-%m-%d}/${now:%H-%M-%S} +model_path: meta-llama/Llama-3.1-8B-Instruct + +finetune: + seq_length: 16384 # input + output tokens + max_train_steps: 1000 # 1000 optim steps = 1000 * bs samples + train_batch_size: 1 + gradient_accumulation_passes: 1024 + +eval_every_n_versions: 10240 # 1024 effective bs * 10 "optim steps" + +llm: + parameters: + max_tokens: 4096 # output tokens + temperature: 1.0 +test_llm: + parameters: + max_tokens: ${...llm.parameters.max_tokens} + temperature: 0.0 + top_p: 1.0 + top_k: 50 + +vllm_config: + vllm_kwargs: + max_model_len: 16384 # input + output tokens + +actor: + rollout_policy: pipelinerl.domains.miniwob.rollouts.generate_miniwob_rollout + shared_memory_entry_size: 100000000 + +preprocess: + shared_memory_entry_size: 1000000000 + +# AGENT CONFIGURATION +agent_max_loops: 10 # max number of agent - environment interactions for each task +reward_computation: nico +agent: + _target_: tapeagents.agent.Agent + name : web_agent + max_iterations: 4 # max number of iterations (make_prompt + llm + generate_steps) for each loop + store_llm_calls: true + templates: + system_prompt: | + You are an expert AI Agent, your goal is to help the user perform tasks using a web browser. + Your role is to understand user queries and respond in a helpful and accurate manner. + Keep your replies concise and direct. Prioritize clarity and avoid over-elaboration. + You will be provided with the content of the current page and a task from the user. + Do not express your emotions or opinions about the user question. + allowed_steps: | + You are allowed to produce ONLY steps with the following json schemas: + {allowed_steps} + Do not reproduce schema when producing the steps, use it as a reference. + json_format: | + Important! Respond with parsable JSON, do not include any text or code. + Do not output anything besides one JSON object. + nodes: + - _target_: examples.rl_webagent.agent.WebNode + name: set_goal + system_prompt: ${agent.templates.system_prompt} + guidance: | + Produce the reasoning_thought step that describes the intended solution to the task. In the reasoning lines: + - review the instructions from the user and the content of the page. + - outline the main task to be accomplished and the steps to be taken to achieve it. + - produce definiton of done, that will be checked later to verify if the task was completed. + Produce only one reasoning_thought step! 
+ ${agent.templates.json_format} + steps_prompt: ${agent.templates.allowed_steps} + steps: + - tapeagents.steps.ReasoningThought + trim_obs_except_last_n: 3 # keep the last 3 observations from the tape in prompt messages + max_chars_page_observation: 3000 # keep up to 3000 chars in PageObservation steps + - _target_: examples.rl_webagent.agent.WebNode + name: reflect + system_prompt: ${agent.templates.system_prompt} + guidance: | + Produce the reasoning_thought step that describes the current state of the page, the previous actions, and what should be the next best action to accomplish the task. In the reasoning lines: + - think about which information could be relevant to the given task, note relevant BIDs and coordinates. + - describe the last action taken, what were its expected effects on the page, versus the actual effects you can observe. Are they the same or not? if not, what could have gone wrong? + - check if you are stuck with repeating the same action over and over again, if so, try something else and change the action. + - check if you think the task is done, if not give a detailed list of actions to do next to accomplish the task. + - finally, if the task is not done, describe the immediate next action to be performed and its expected effect on the page. + Produce only one reasoning_thought step! + ${agent.templates.json_format} + steps_prompt: ${agent.templates.allowed_steps} + steps: + - tapeagents.steps.ReasoningThought + trim_obs_except_last_n: 3 # keep the last 3 observations from the tape in prompt messages + max_chars_page_observation: 3000 # keep up to 3000 chars in PageObservation steps + - _target_: examples.rl_webagent.agent.WebNode + name: act + system_prompt: ${agent.templates.system_prompt} + guidance: | + Produce the next action to be performed with the current page. + If you think that the task is solved, produce the final_answer_action. + You can interact with the page elements using their BIDs or coordinates as arguments for actions. + HINTS: + - You can use the BIDs of the elements or the mouse position in x, y coordinates to interact with them. + - To select value in a dropdown or combobox, ALWAYS use select_action. + - To click on a checkbox or radio button, ALWAYS use BID (or coordinates) of the corresponding Text and not the BID (or coordinates) of the element itself. + - Press enter key to submit the search query. + - Always produce only one step at a time. + - Step kind is always lowercase and underscore separated. + ${agent.templates.json_format} + steps_prompt: ${agent.templates.allowed_steps} + use_known_actions: true + steps: + - examples.rl_webagent.steps.FinalAnswerAction + trim_obs_except_last_n: 3 # keep the last 3 observations from the tape in prompt messages + max_chars_page_observation: 3000 # keep up to 3000 chars in PageObservation steps + next_node: reflect + + +# ENVIRONMENT CONFIGURATION +start_attempts: 3 # number of attempts to start each task +environment: + _target_: pipelinerl.domains.miniwob.environment_server.WebEnvironmentServer + miniwob_url: ??? + n_envs: ${actor.llm_max_rollouts} + host: "0.0.0.0" + env_call_timeout: 600 # timeout for each environment call (e.g. start_task, act, etc.) 
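A note on the retry knob just added above: start_attempts bounds how many times a task launch is retried before the rollout is abandoned. A compressed sketch of that policy, with illustrative names (the real loop lives in pipelinerl/domains/miniwob/rollouts.py and is hardened further in PATCH 107 below):

```python
import asyncio

async def start_with_retries(env, problem: dict, attempts: int = 3):
    # Assumed shape of the start_attempts policy, not the verbatim implementation:
    # retry env.start_task with a fixed 5 s back-off, give up after `attempts` tries.
    for attempt in range(attempts):
        try:
            return await env.start_task(problem)
        except Exception:
            if attempt == attempts - 1:
                raise
            await asyncio.sleep(5)
```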
+ web_env_target: examples.rl_webagent.environment.WebEnvironment + exp_path: ${output_dir}/env_server + headless: true + observation_format: html + +# DATASET CONFIGURATION +dataset_loader: pipelinerl.domains.miniwob.load_tasks.load_tasks +dataset_loader_params: + train_split: 0.6 # 0.6 of tasks for training, 0.4 for testing + seeds: [0, 42, 1337, 900, 103] +train_dataset_names: + - train +test_dataset_names: + - test diff --git a/conf/miniwob_massimo_grpo.yaml b/conf/miniwob_massimo_grpo.yaml new file mode 100644 index 00000000..761ee43b --- /dev/null +++ b/conf/miniwob_massimo_grpo.yaml @@ -0,0 +1,18 @@ +defaults: + - miniwob_grpo + - _self_ + +train_dataset_names: + - massimo_train +test_dataset_names: + - massimo_test + +reward_computation: massimo + +finetune: + seq_length: 16384 # input + output tokens + max_train_steps: 1000 # 1000 optim steps = 1000 * bs samples + train_batch_size: 1 + gradient_accumulation_passes: 512 + +eval_every_n_versions: 5120 # 512 effective bs * 10 "optim steps" diff --git a/conf/miniwob_massimo.yaml b/conf/miniwob_massimo_ppo.yaml similarity index 88% rename from conf/miniwob_massimo.yaml rename to conf/miniwob_massimo_ppo.yaml index 003238be..8b1fefb8 100644 --- a/conf/miniwob_massimo.yaml +++ b/conf/miniwob_massimo_ppo.yaml @@ -1,6 +1,5 @@ defaults: - - miniwob - - override finetune: ppo + - miniwob_ppo - _self_ train_dataset_names: @@ -8,6 +7,8 @@ train_dataset_names: test_dataset_names: - massimo_test +reward_computation: massimo + finetune: seq_length: 16384 # input + output tokens max_train_steps: 1000 # 1000 optim steps = 1000 * bs samples diff --git a/conf/miniwob.yaml b/conf/miniwob_ppo.yaml similarity index 97% rename from conf/miniwob.yaml rename to conf/miniwob_ppo.yaml index 5c090823..656e7839 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob_ppo.yaml @@ -19,6 +19,10 @@ model_path: meta-llama/Llama-3.1-8B-Instruct finetune: seq_length: 16384 # input + output tokens max_train_steps: 1000 # 1000 optim steps = 1000 * bs samples + train_batch_size: 1 + gradient_accumulation_passes: 1024 + +eval_every_n_versions: 10240 # 1024 effective bs * 10 "optim steps" llm: parameters: @@ -44,6 +48,7 @@ preprocess: # AGENT CONFIGURATION agent_max_loops: 10 # max number of agent - environment interactions for each task +reward_computation: nico agent: _target_: tapeagents.agent.Agent name : web_agent diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 0c5a4396..8168bcd5 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -142,12 +142,15 @@ async def generate_miniwob_rollout( # get the number of PageObservation steps in the tape n_page_observations = len([step for step in tape.steps if isinstance(step, PageObservation)]) - reward = raw_reward * 0.99**n_step_errors if no_error and raw_reward >= 0 else -1.0 - # massimo's setup: - # reward = float(raw_reward>0) - # if reward == 0.0: - # reward = -1.0 - # reward *= 0.98 ** n_page_observations + if cfg.reward_computation == "nico": + reward = raw_reward * 0.99**n_step_errors if no_error and raw_reward >= 0 else -1.0 + elif cfg.reward_computation == "massimo": + reward = float(raw_reward>0) + if reward == 0.0: + reward = -1.0 + reward *= 0.98 ** n_page_observations + else: + raise ValueError(f"Invalid reward configuration: {cfg.reward_computation}") # (3) Get LLM calls from Tape llm_calls = [step for step in tape.steps if step.metadata.other.get("llm_call") is not None] From 7a4e73fb4389dfa87c691a3168d8de21802d0eff Mon Sep 
17 00:00:00 2001 From: Nicolas Gontier Date: Tue, 2 Sep 2025 20:55:23 +0000 Subject: [PATCH 107/166] add retry mechanism for agent loop --- conf/miniwob_grpo.yaml | 1 + conf/miniwob_ppo.yaml | 1 + pipelinerl/domains/miniwob/rollouts.py | 48 +++++++++++++++++++------- 3 files changed, 38 insertions(+), 12 deletions(-) diff --git a/conf/miniwob_grpo.yaml b/conf/miniwob_grpo.yaml index 864dbc75..5e82caae 100644 --- a/conf/miniwob_grpo.yaml +++ b/conf/miniwob_grpo.yaml @@ -48,6 +48,7 @@ preprocess: # AGENT CONFIGURATION agent_max_loops: 10 # max number of agent - environment interactions for each task +agent_attempts: 3 # number of attempts to run the agent (retry on timeout/errors) reward_computation: nico agent: _target_: tapeagents.agent.Agent diff --git a/conf/miniwob_ppo.yaml b/conf/miniwob_ppo.yaml index 656e7839..05b7ff0d 100644 --- a/conf/miniwob_ppo.yaml +++ b/conf/miniwob_ppo.yaml @@ -48,6 +48,7 @@ preprocess: # AGENT CONFIGURATION agent_max_loops: 10 # max number of agent - environment interactions for each task +agent_attempts: 3 # number of attempts to run the agent (retry on timeout/errors) reward_computation: nico agent: _target_: tapeagents.agent.Agent diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 8168bcd5..a356911f 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -84,7 +84,7 @@ async def generate_miniwob_rollout( async with environment.acontext(session, wait_for_env=True) as env: start_attempts = cfg.start_attempts t = time.perf_counter() - while True: + while start_attempts > 0: try: tape_dict, _ = await env.start_task(problem) break @@ -92,11 +92,12 @@ async def generate_miniwob_rollout( logger.warning(f"Failed to start task {problem['dataset']}/{problem['task']}/{problem['seed']}") start_attempts -= 1 if start_attempts <= 0: + logger.error("Failed to start task after all retry attempts") no_error = False tape_dict = {} break else: - logger.warning(f"retry after 5 seconds: {e}") + logger.warning(f"retry after 5 seconds: {e}, {start_attempts} attempts remaining") await asyncio.sleep(5) logger.info( f"Task {problem['dataset']}/{problem['task']}/{problem['seed']} started in {time.perf_counter() - t:.2f} seconds" @@ -105,16 +106,39 @@ async def generate_miniwob_rollout( t = time.perf_counter() if no_error: # only run the agent if the task started successfully logger.info(f"Running agent for task {problem['dataset']}/{problem['task']}/{problem['seed']}") - try: - actions = await env.a_actions() - tools_description = await env.a_tools_description() - logger.debug(f"Available tools: {tools_description}") - agent: Agent = instantiate(cfg.agent, known_actions=actions, tools_description=tools_description) - agent.llms = {DEFAULT: llm} - tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) - except Exception as e: - logger.error(f"Error occurred while running agent: {e}") - no_error = False + agent_attempts = cfg.agent_attempts + while agent_attempts > 0: + try: + actions = await env.a_actions() + tools_description = await env.a_tools_description() + agent: Agent = instantiate(cfg.agent, known_actions=actions, tools_description=tools_description) + agent.llms = {DEFAULT: llm} + tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) + # Check if the tape has an error from the orchestrator (e.g., SocketTimeoutError) + if tape.metadata.error: + logger.warning(f"Agent execution failed with error: 
{tape.metadata.error}") + agent_attempts -= 1 + if agent_attempts <= 0: + logger.error("Agent execution failed after all retry attempts") + no_error = False + break + else: + logger.warning(f"Retrying agent execution after 5 seconds, {agent_attempts} attempts remaining") + await asyncio.sleep(5) + continue + else: + # Success - break out of retry loop + break + except Exception as e: + logger.warning(f"Error occurred while running agent: {e}") + agent_attempts -= 1 + if agent_attempts <= 0: + logger.error("Agent execution failed after all retry attempts") + no_error = False + break + else: + logger.warning(f"Retrying agent execution after 5 seconds, {agent_attempts} attempts remaining") + await asyncio.sleep(5) logger.info( f"Agent finished task {problem['dataset']}/{problem['task']}/{problem['seed']} in {time.perf_counter() - t:.2f} seconds" ) From 42e811e66d641f3e2219a2721c8ae6c576f0624f Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Wed, 3 Sep 2025 20:45:20 +0000 Subject: [PATCH 108/166] add 30min timeout to rollout function --- conf/miniwob_grpo.yaml | 3 +- conf/miniwob_ppo.yaml | 3 +- pipelinerl/domains/miniwob/rollouts.py | 54 ++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 2 deletions(-) diff --git a/conf/miniwob_grpo.yaml b/conf/miniwob_grpo.yaml index 5e82caae..eb733148 100644 --- a/conf/miniwob_grpo.yaml +++ b/conf/miniwob_grpo.yaml @@ -49,6 +49,7 @@ preprocess: # AGENT CONFIGURATION agent_max_loops: 10 # max number of agent - environment interactions for each task agent_attempts: 3 # number of attempts to run the agent (retry on timeout/errors) +rollout_timeout: 1800 # overall timeout for entire rollout in seconds (30 minutes) reward_computation: nico agent: _target_: tapeagents.agent.Agent @@ -133,7 +134,7 @@ environment: miniwob_url: ??? n_envs: ${actor.llm_max_rollouts} host: "0.0.0.0" - env_call_timeout: 600 # timeout for each environment call (e.g. start_task, act, etc.) + env_call_timeout: 120 # timeout for each environment call (e.g. start_task, act, etc.) web_env_target: examples.rl_webagent.environment.WebEnvironment exp_path: ${output_dir}/env_server headless: true diff --git a/conf/miniwob_ppo.yaml b/conf/miniwob_ppo.yaml index 05b7ff0d..9a85a8cd 100644 --- a/conf/miniwob_ppo.yaml +++ b/conf/miniwob_ppo.yaml @@ -49,6 +49,7 @@ preprocess: # AGENT CONFIGURATION agent_max_loops: 10 # max number of agent - environment interactions for each task agent_attempts: 3 # number of attempts to run the agent (retry on timeout/errors) +rollout_timeout: 1800 # overall timeout for entire rollout in seconds (30 minutes) reward_computation: nico agent: _target_: tapeagents.agent.Agent @@ -133,7 +134,7 @@ environment: miniwob_url: ??? n_envs: ${actor.llm_max_rollouts} host: "0.0.0.0" - env_call_timeout: 600 # timeout for each environment call (e.g. start_task, act, etc.) + env_call_timeout: 120 # timeout for each environment call (e.g. start_task, act, etc.) 
web_env_target: examples.rl_webagent.environment.WebEnvironment exp_path: ${output_dir}/env_server headless: true diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index a356911f..2df03815 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -70,7 +70,29 @@ async def generate_miniwob_rollout( # get training text from llm calls start_time = time.time() + + # Overall timeout for the entire rollout to prevent hanging + rollout_timeout = getattr(cfg, 'rollout_timeout', 1800) # 30 minutes default + try: + # Execute the entire rollout with a timeout + return await asyncio.wait_for( + _execute_rollout_with_timeout(cfg, llm, problem, session, start_time), + timeout=rollout_timeout + ) + except asyncio.TimeoutError: + logger.error(f"Rollout timed out after {rollout_timeout} seconds for task {problem['dataset']}/{problem['task']}/{problem['seed']}") + # Return a failed rollout result + return _create_failed_rollout_result(problem, start_time, "timeout") + + +async def _execute_rollout_with_timeout( + cfg: DictConfig, + llm: TrainableLLM, + problem: dict, + session: aiohttp.ClientSession, + start_time: float, +) -> RolloutResult: # (1) Choose a random environment server env_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment"] # choose the env job randomly @@ -225,3 +247,35 @@ async def generate_miniwob_rollout( prompt_tokens=prompt_tokens, output_tokens=output_tokens, ) + + +def _create_failed_rollout_result(problem: dict, start_time: float, error_type: str) -> RolloutResult: + """Create a failed rollout result for timeout or other errors.""" + latency = time.time() - start_time + + # Create empty training texts and metrics for failed rollout + metrics = MiniwobMetrics( + reward=-1.0, + success=False, + no_error=False, + no_answer=True, + overflow=False, + n_llm_calls=0, + n_step_errors=0, + n_page_observations=0, + n_steps=0, + total_execution_time=latency, + agent_execution_time=-1.0, + environment_execution_time=-1.0, + env_step_time=-1.0, + agent_step_time=-1.0, + ) + + return RolloutResult( + training_texts=[], + metrics=metrics, + latency=latency, + dataset_name=problem["dataset"], + prompt_tokens=[], + output_tokens=[], + ) From a4e8f5fd581e0bcfc62bda9f61dffbef21cb7df3 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 5 Sep 2025 02:29:14 +0000 Subject: [PATCH 109/166] upd configs --- conf/{miniwob_ppo.yaml => miniwob.yaml} | 7 +- conf/miniwob_grpo.yaml | 150 +----------------------- 2 files changed, 5 insertions(+), 152 deletions(-) rename conf/{miniwob_ppo.yaml => miniwob.yaml} (97%) diff --git a/conf/miniwob_ppo.yaml b/conf/miniwob.yaml similarity index 97% rename from conf/miniwob_ppo.yaml rename to conf/miniwob.yaml index 9a85a8cd..341512ca 100644 --- a/conf/miniwob_ppo.yaml +++ b/conf/miniwob.yaml @@ -42,6 +42,7 @@ vllm_config: actor: rollout_policy: pipelinerl.domains.miniwob.rollouts.generate_miniwob_rollout shared_memory_entry_size: 100000000 + llm_max_rollouts: 32 preprocess: shared_memory_entry_size: 1000000000 @@ -68,7 +69,7 @@ agent: {allowed_steps} Do not reproduce schema when producing the steps, use it as a reference. json_format: | - Important! Respond with parsable JSON, do not include any text or code. + Important! Respond with parsable JSON, do not include any special characters or code. Do not output anything besides one JSON object. 
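For concreteness, a reply that satisfies this json_format contract is a single flat JSON object naming the step kind and nothing else. A hedged example follows; the field names are assumed from the ReasoningThought step the config references, not verified here:

```python
import json

reply = '{"kind": "reasoning_thought", "reasoning": "The search box is bid 12; type the query, then press Enter."}'
step = json.loads(reply)  # parses cleanly: no code fences, no surrounding text
assert step["kind"] == "reasoning_thought"  # lowercase, underscore-separated, as the act node hints require
```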
nodes: - _target_: examples.rl_webagent.agent.WebNode @@ -96,7 +97,7 @@ agent: - check if you are stuck with repeating the same action over and over again, if so, try something else and change the action. - check if you think the task is done, if not give a detailed list of actions to do next to accomplish the task. - finally, if the task is not done, describe the immediate next action to be performed and its expected effect on the page. - Produce only one reasoning_thought step! + Produce only one reasoning_thought step! Be brief and to the point. You can skip some details if they are not relevant for this step. ${agent.templates.json_format} steps_prompt: ${agent.templates.allowed_steps} steps: @@ -132,7 +133,7 @@ start_attempts: 3 # number of attempts to start each task environment: _target_: pipelinerl.domains.miniwob.environment_server.WebEnvironmentServer miniwob_url: ??? - n_envs: ${actor.llm_max_rollouts} + n_envs: 64 host: "0.0.0.0" env_call_timeout: 120 # timeout for each environment call (e.g. start_task, act, etc.) web_env_target: examples.rl_webagent.environment.WebEnvironment diff --git a/conf/miniwob_grpo.yaml b/conf/miniwob_grpo.yaml index eb733148..7837c14b 100644 --- a/conf/miniwob_grpo.yaml +++ b/conf/miniwob_grpo.yaml @@ -1,151 +1,3 @@ defaults: - - base - - override streams: redis + - miniwob - override finetune: grpo - - _self_ - -world: - actor_fraction: 3 - preprocessor_fraction: 0 - finetune_fraction: 5 - -# debug: -# mode: actor -save_tapes: False - -output_dir: results/miniwob/${now:%Y-%m-%d}/${now:%H-%M-%S} -model_path: meta-llama/Llama-3.1-8B-Instruct - -finetune: - seq_length: 16384 # input + output tokens - max_train_steps: 1000 # 1000 optim steps = 1000 * bs samples - train_batch_size: 1 - gradient_accumulation_passes: 1024 - -eval_every_n_versions: 10240 # 1024 effective bs * 10 "optim steps" - -llm: - parameters: - max_tokens: 4096 # output tokens - temperature: 1.0 -test_llm: - parameters: - max_tokens: ${...llm.parameters.max_tokens} - temperature: 0.0 - top_p: 1.0 - top_k: 50 - -vllm_config: - vllm_kwargs: - max_model_len: 16384 # input + output tokens - -actor: - rollout_policy: pipelinerl.domains.miniwob.rollouts.generate_miniwob_rollout - shared_memory_entry_size: 100000000 - -preprocess: - shared_memory_entry_size: 1000000000 - -# AGENT CONFIGURATION -agent_max_loops: 10 # max number of agent - environment interactions for each task -agent_attempts: 3 # number of attempts to run the agent (retry on timeout/errors) -rollout_timeout: 1800 # overall timeout for entire rollout in seconds (30 minutes) -reward_computation: nico -agent: - _target_: tapeagents.agent.Agent - name : web_agent - max_iterations: 4 # max number of iterations (make_prompt + llm + generate_steps) for each loop - store_llm_calls: true - templates: - system_prompt: | - You are an expert AI Agent, your goal is to help the user perform tasks using a web browser. - Your role is to understand user queries and respond in a helpful and accurate manner. - Keep your replies concise and direct. Prioritize clarity and avoid over-elaboration. - You will be provided with the content of the current page and a task from the user. - Do not express your emotions or opinions about the user question. - allowed_steps: | - You are allowed to produce ONLY steps with the following json schemas: - {allowed_steps} - Do not reproduce schema when producing the steps, use it as a reference. - json_format: | - Important! Respond with parsable JSON, do not include any text or code. 
- Do not output anything besides one JSON object. - nodes: - - _target_: examples.rl_webagent.agent.WebNode - name: set_goal - system_prompt: ${agent.templates.system_prompt} - guidance: | - Produce the reasoning_thought step that describes the intended solution to the task. In the reasoning lines: - - review the instructions from the user and the content of the page. - - outline the main task to be accomplished and the steps to be taken to achieve it. - - produce definiton of done, that will be checked later to verify if the task was completed. - Produce only one reasoning_thought step! - ${agent.templates.json_format} - steps_prompt: ${agent.templates.allowed_steps} - steps: - - tapeagents.steps.ReasoningThought - trim_obs_except_last_n: 3 # keep the last 3 observations from the tape in prompt messages - max_chars_page_observation: 3000 # keep up to 3000 chars in PageObservation steps - - _target_: examples.rl_webagent.agent.WebNode - name: reflect - system_prompt: ${agent.templates.system_prompt} - guidance: | - Produce the reasoning_thought step that describes the current state of the page, the previous actions, and what should be the next best action to accomplish the task. In the reasoning lines: - - think about which information could be relevant to the given task, note relevant BIDs and coordinates. - - describe the last action taken, what were its expected effects on the page, versus the actual effects you can observe. Are they the same or not? if not, what could have gone wrong? - - check if you are stuck with repeating the same action over and over again, if so, try something else and change the action. - - check if you think the task is done, if not give a detailed list of actions to do next to accomplish the task. - - finally, if the task is not done, describe the immediate next action to be performed and its expected effect on the page. - Produce only one reasoning_thought step! - ${agent.templates.json_format} - steps_prompt: ${agent.templates.allowed_steps} - steps: - - tapeagents.steps.ReasoningThought - trim_obs_except_last_n: 3 # keep the last 3 observations from the tape in prompt messages - max_chars_page_observation: 3000 # keep up to 3000 chars in PageObservation steps - - _target_: examples.rl_webagent.agent.WebNode - name: act - system_prompt: ${agent.templates.system_prompt} - guidance: | - Produce the next action to be performed with the current page. - If you think that the task is solved, produce the final_answer_action. - You can interact with the page elements using their BIDs or coordinates as arguments for actions. - HINTS: - - You can use the BIDs of the elements or the mouse position in x, y coordinates to interact with them. - - To select value in a dropdown or combobox, ALWAYS use select_action. - - To click on a checkbox or radio button, ALWAYS use BID (or coordinates) of the corresponding Text and not the BID (or coordinates) of the element itself. - - Press enter key to submit the search query. - - Always produce only one step at a time. - - Step kind is always lowercase and underscore separated. 
- ${agent.templates.json_format} - steps_prompt: ${agent.templates.allowed_steps} - use_known_actions: true - steps: - - examples.rl_webagent.steps.FinalAnswerAction - trim_obs_except_last_n: 3 # keep the last 3 observations from the tape in prompt messages - max_chars_page_observation: 3000 # keep up to 3000 chars in PageObservation steps - next_node: reflect - - -# ENVIRONMENT CONFIGURATION -start_attempts: 3 # number of attempts to start each task -environment: - _target_: pipelinerl.domains.miniwob.environment_server.WebEnvironmentServer - miniwob_url: ??? - n_envs: ${actor.llm_max_rollouts} - host: "0.0.0.0" - env_call_timeout: 120 # timeout for each environment call (e.g. start_task, act, etc.) - web_env_target: examples.rl_webagent.environment.WebEnvironment - exp_path: ${output_dir}/env_server - headless: true - observation_format: html - -# DATASET CONFIGURATION -dataset_loader: pipelinerl.domains.miniwob.load_tasks.load_tasks -dataset_loader_params: - train_split: 0.6 # 0.6 of tasks for training, 0.4 for testing - seeds: [0, 42, 1337, 900, 103] -train_dataset_names: - - train -test_dataset_names: - - test From 95b735b7e1ee686cee046ba596b0dbe536083168 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 5 Sep 2025 03:21:24 +0000 Subject: [PATCH 110/166] upd --- conf/miniwob.yaml | 5 +++-- conf/miniwob_massimo_ppo.yaml | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index 341512ca..c9499b48 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -69,8 +69,9 @@ agent: {allowed_steps} Do not reproduce schema when producing the steps, use it as a reference. json_format: | - Important! Respond with parsable JSON, do not include any special characters or code. - Do not output anything besides one JSON object. + Important! Respond with very simple parsable JSON! + Do not use any special characters or code. Do not use new lines, tabs, or any other formatting inside the JSON. + Do not output anything besides one simple JSON object. 
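The stricter wording is not cosmetic: a raw newline inside a JSON string is an invalid control character, so a reply formatted with line breaks fails to parse at all. A quick demonstration:

```python
import json

flat = '{"kind": "reasoning_thought", "reasoning": "step 1, then step 2"}'
json.loads(flat)  # parses fine

broken = '{"kind": "reasoning_thought", "reasoning": "step 1\nthen step 2"}'
try:
    json.loads(broken)
except json.JSONDecodeError as err:
    print(err)  # "Invalid control character": the failure the new prompt text guards against
```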
nodes: - _target_: examples.rl_webagent.agent.WebNode name: set_goal diff --git a/conf/miniwob_massimo_ppo.yaml b/conf/miniwob_massimo_ppo.yaml index 8b1fefb8..b2e3b8ca 100644 --- a/conf/miniwob_massimo_ppo.yaml +++ b/conf/miniwob_massimo_ppo.yaml @@ -1,5 +1,5 @@ defaults: - - miniwob_ppo + - miniwob - _self_ train_dataset_names: From 8616303e9bebe222ce56b82bd4b75d03e99e55ef Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 5 Sep 2025 03:25:27 +0000 Subject: [PATCH 111/166] upd configs --- conf/miniwob_grpo.yaml | 7 +++++++ conf/miniwob_massimo_grpo.yaml | 3 --- conf/miniwob_massimo_ppo.yaml | 3 --- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/conf/miniwob_grpo.yaml b/conf/miniwob_grpo.yaml index 7837c14b..f6cfeed3 100644 --- a/conf/miniwob_grpo.yaml +++ b/conf/miniwob_grpo.yaml @@ -1,3 +1,10 @@ defaults: - miniwob - override finetune: grpo + - _self_ + +finetune: + seq_length: 16384 # input + output tokens + max_train_steps: 1000 # 1000 optim steps = 1000 * bs samples + train_batch_size: 1 + gradient_accumulation_passes: 1024 diff --git a/conf/miniwob_massimo_grpo.yaml b/conf/miniwob_massimo_grpo.yaml index 761ee43b..b61dcf32 100644 --- a/conf/miniwob_massimo_grpo.yaml +++ b/conf/miniwob_massimo_grpo.yaml @@ -10,9 +10,6 @@ test_dataset_names: reward_computation: massimo finetune: - seq_length: 16384 # input + output tokens - max_train_steps: 1000 # 1000 optim steps = 1000 * bs samples - train_batch_size: 1 gradient_accumulation_passes: 512 eval_every_n_versions: 5120 # 512 effective bs * 10 "optim steps" diff --git a/conf/miniwob_massimo_ppo.yaml b/conf/miniwob_massimo_ppo.yaml index b2e3b8ca..53703d56 100644 --- a/conf/miniwob_massimo_ppo.yaml +++ b/conf/miniwob_massimo_ppo.yaml @@ -10,9 +10,6 @@ test_dataset_names: reward_computation: massimo finetune: - seq_length: 16384 # input + output tokens - max_train_steps: 1000 # 1000 optim steps = 1000 * bs samples - train_batch_size: 1 gradient_accumulation_passes: 512 eval_every_n_versions: 5120 # 512 effective bs * 10 "optim steps" From f4d8e0d60abfb4ed432f5a29d23db41eeafb47f7 Mon Sep 17 00:00:00 2001 From: rafapi Date: Fri, 5 Sep 2025 15:13:25 +0000 Subject: [PATCH 112/166] Avoid hot-spotting env; add extra metrics --- pipelinerl/domains/mcp/rollouts.py | 99 +++++++++++++++++++++--------- 1 file changed, 70 insertions(+), 29 deletions(-) diff --git a/pipelinerl/domains/mcp/rollouts.py b/pipelinerl/domains/mcp/rollouts.py index 099b0abe..cd82e351 100644 --- a/pipelinerl/domains/mcp/rollouts.py +++ b/pipelinerl/domains/mcp/rollouts.py @@ -1,4 +1,5 @@ import asyncio +from urllib.parse import urlparse import time import random import logging @@ -7,17 +8,15 @@ import aiohttp from omegaconf import DictConfig -from pydantic import BaseModel from pipelinerl.domains.mcp.steps import MathAnswer from pipelinerl.world import Job -from tapeagents.core import Prompt from tapeagents.llms.trainable import TrainableLLM from tapeagents.remote_environment import AsyncRemoteEnvironment -from pipelinerl.async_llm import llm_async_generate, make_training_text +from pipelinerl.async_llm import make_training_text from tapeagents.orchestrator import async_execute_agent from tapeagents.agent import DEFAULT, Agent from hydra.utils import instantiate -from tapeagents.core import StopStep, Tape +from tapeagents.core import Tape from tapeagents.dialog_tape import UserStep from tapeagents.core import LLMCall @@ -50,6 +49,10 @@ def count_tool_calls_by_category(llm_calls: List[LLMCall]) -> Dict[str, int]: class Metrics(BaseMetrics): 
num_python_calls: int = 0 num_steps: int = 0 + n_llm_calls: int = 0 + total_execution_time: float = -1.0 + agent_execution_time: float = -1.0 + environment_execution_time: float = -1.0 async def generate_mcp_rollout( cfg: DictConfig, @@ -57,31 +60,58 @@ async def generate_mcp_rollout( problem: dict, session: aiohttp.ClientSession, ) -> RolloutResult: - # (1) Choose a random environment server + # choose and retry env servers if one is saturated start = time.perf_counter() env_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment"] - # choose the env job randomly - env_job = random.choice(env_jobs) - assert env_job.port is not None - env_job_url = f"http://{env_job.hostname}:{env_job.port}" - environment = AsyncRemoteEnvironment(server_url=env_job_url) # type: ignore - async with environment.acontext(session, wait_for_env=True) as env: - await env.start_task(problem) - actions = await env.a_actions() - tools_description = await env.a_tools_description() - logger.debug(f"Available tools: {tools_description}") - agent: Agent = instantiate(cfg.agent, known_actions=actions, tools_description=tools_description) - agent.llms = {DEFAULT: llm} - - tape = Tape(steps=[ - UserStep(content=f"{problem['task']}. You have access to the following tools: {tools_description}") - ]) - while True: + if not env_jobs: + raise RuntimeError("No environment servers available") + + # shuffle to avoid dead-locking a single server + env_urls_all = [f"http://{job.hostname}:{job.port}" for job in env_jobs if job.port is not None] + if not env_urls_all: + raise RuntimeError("Environment server definitions missing ports") + + while True: + env_urls = env_urls_all[:] + random.shuffle(env_urls) + chosen_url = None + for env_url in env_urls: try: - tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) - break - except Exception: - await asyncio.sleep(5) + environment = AsyncRemoteEnvironment( + server_url=env_url, start_timeout_sec=600, start_repeat_delay=5) + context_manager = environment.acontext(session, wait_for_env=True) + env = await context_manager.__aenter__() + try: + await env.start_task(problem) + chosen_url = env_url + actions = await env.a_actions() + tools_description = await env.a_tools_description() + logger.debug(f"Available tools: {tools_description}") + agent: Agent = instantiate(cfg.agent, known_actions=actions, tools_description=tools_description) + agent.llms = {DEFAULT: llm} + + tape = Tape(steps=[ + UserStep(content=f"{problem['task']}. 
You have access to the following tools: {tools_description}") + ]) + t_exec = time.perf_counter() + while True: + try: + tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) + tape.metadata.result.update({"total_execution_time": time.perf_counter() - t_exec}) + break + except Exception: + await asyncio.sleep(5) + break # success + finally: + await context_manager.__aexit__(None, None, None) + except Exception as e: + # try the next server on errors (503: busy slots) + logger.warning(f"Env start failed at {env_url}: {e}") + continue + if chosen_url is not None: + break # success + # if none succeeded, back off and retry the whole list + await asyncio.sleep(1.0) reward_table = RewardTable(**dict(cfg.rewards)) @@ -94,11 +124,14 @@ async def generate_mcp_rollout( assert len(llm_calls) > 0, "No LLM calls found" tool_call_counts = count_tool_calls_by_category(llm_calls) training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] + n_llm_calls = len(llm_calls) + parsed = urlparse(chosen_url) + assert parsed.hostname is not None and parsed.port is not None answer_status = await verify_answer_rpc( session=session, - host=env_job.hostname, - port=env_job.port, # type: ignore - prediction=llm_calls[-1].output.content, # type: ignore + host=parsed.hostname, + port=parsed.port, + prediction=llm_calls[-1].output.content, # type: ignore gold=problem["answer"], strict=True, ) @@ -110,6 +143,10 @@ async def generate_mcp_rollout( latency = time.perf_counter() - start + agent_time = tape.metadata.result.get("agent_execution_time", -1.0) + env_time = tape.metadata.result.get("environment_execution_time", -1.0) + total_time = tape.metadata.result.get("total_execution_time", -1.0) + metrics = Metrics( reward=reward, @@ -118,6 +155,10 @@ async def generate_mcp_rollout( no_answer=answer_status == "no_answer", num_steps=len(tape.steps), num_python_calls=tool_call_counts.get("run_python_code", 0), + n_llm_calls=n_llm_calls, + total_execution_time=total_time, + agent_execution_time=agent_time, + environment_execution_time=env_time, ) return RolloutResult( From 23decf758de8ab1bdfd38b58e2092d72b969ab5e Mon Sep 17 00:00:00 2001 From: rafapi Date: Fri, 5 Sep 2025 15:14:33 +0000 Subject: [PATCH 113/166] Print correct policy info --- pipelinerl/actor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index ce63ac72..46b6606f 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -135,7 +135,7 @@ async def schedule_rollouts( # Track rollouts per problem group group_rollouts = {} rollout_policy = hydra.utils.get_method(cfg.actor.rollout_policy) - logger.info(f"Use rollout policy: {rollout_policy}") + logger.info(f"Use rollout policy: {rollout_policy.__name__}") async def rollout_and_maybe_produce_result( problem: dict, From 29118b719b722b2ccbdca4d28185ba7b9fbc0904 Mon Sep 17 00:00:00 2001 From: rafapi Date: Fri, 5 Sep 2025 15:16:04 +0000 Subject: [PATCH 114/166] Add aime2025 --- pipelinerl/domains/math/load_datasets.py | 26 ++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/pipelinerl/domains/math/load_datasets.py b/pipelinerl/domains/math/load_datasets.py index 4b44dfb6..7cbf9c18 100644 --- a/pipelinerl/domains/math/load_datasets.py +++ b/pipelinerl/domains/math/load_datasets.py @@ -170,6 +170,26 @@ def _load_aime_dataset(year: int, upsample_factor: int = 0) -> list[dict]: return add_ids(samples) +def _load_aime_2025_opencompass(upsample_factor: int = 0) -> list[dict]: + configs =
["AIME2025-I", "AIME2025-II"] + dataset_name = "aime_2025" + ("" if upsample_factor > 0 else "_original") + + samples: list[dict] = [] + for config_name in configs: + ds = load_dataset("opencompass/AIME2025", config_name, split="test") + samples.extend([s for s in process_math(ds, dataset_name) if s is not None]) + + original_size = len(samples) + if upsample_factor > 0: + samples *= upsample_factor + + logger.info( + f"Loading aime 2025 (OpenCompass) dataset: {len(samples)} samples" + + (f" (upsampled from {original_size})" if upsample_factor > 0 else "") + ) + return add_ids(samples) + + def _load_amc_dataset(year: int, upsample_factor: int = 0) -> list[dict]: amc_dataset = load_dataset("AI-MO/aimo-validation-amc", split="train", trust_remote_code=True) amc_dataset = amc_dataset.filter(lambda x: str(year) in x["url"]) @@ -335,6 +355,12 @@ def load_datasets(dataset_names: List[str] | str | None, seed: int | None = None if "aime_2024_original" in dataset_names: datasets += _load_aime_dataset(2024) + if "aime_2025" in dataset_names: + datasets += _load_aime_2025_opencompass(upsample_factor=16) + + if "aime_2025_original" in dataset_names: + datasets += _load_aime_2025_opencompass() + if "amc_2022" in dataset_names: # TODO: AMC 2022 is 43 problems, is that to be expected? datasets += _load_amc_dataset(2022, upsample_factor=16) From 88828596361fe13ad27b8fb8cc6cbb58dda41ce0 Mon Sep 17 00:00:00 2001 From: rafapi Date: Fri, 5 Sep 2025 15:16:37 +0000 Subject: [PATCH 115/166] Test on aime2025 --- conf/mcp.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/conf/mcp.yaml b/conf/mcp.yaml index efdd196a..c4b050b8 100644 --- a/conf/mcp.yaml +++ b/conf/mcp.yaml @@ -19,6 +19,7 @@ train_dataset_names: - open_reasoner_zero_57k - open_reasoner_zero_extended_72k test_dataset_names: + - aime_2025 - aime_2024 - amc_2023 - math_500 From 923cf6a8c1372a7929cb4ef0ebfb9959da5f717d Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Sat, 6 Sep 2025 02:37:45 +0000 Subject: [PATCH 116/166] reduce n_env --- conf/miniwob.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index c9499b48..cecf7e3e 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -134,7 +134,7 @@ start_attempts: 3 # number of attempts to start each task environment: _target_: pipelinerl.domains.miniwob.environment_server.WebEnvironmentServer miniwob_url: ??? - n_envs: 64 + n_envs: 32 host: "0.0.0.0" env_call_timeout: 120 # timeout for each environment call (e.g. start_task, act, etc.) 
web_env_target: examples.rl_webagent.environment.WebEnvironment From 44a033f306b37b89a5413e2224326e3891bc7ffd Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Sat, 6 Sep 2025 03:46:23 +0000 Subject: [PATCH 117/166] boost preprocess power --- conf/miniwob.yaml | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index cecf7e3e..af0397fe 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -5,9 +5,9 @@ defaults: - _self_ world: - actor_fraction: 3 + actor_fraction: 2 preprocessor_fraction: 0 - finetune_fraction: 5 + finetune_fraction: 6 # debug: # mode: actor @@ -45,7 +45,21 @@ actor: llm_max_rollouts: 32 preprocess: - shared_memory_entry_size: 1000000000 + n_workers: 16 # Increase from 8 + chunk_n_groups: 4 # Increase from 2 for better throughput + # queue for loaded raw groups + raw_queue_size: 16 # Increase from 8 + # queue for processed chunks of multiple groups + input_queue_size: 64 # Increase from 32 + # queue for ready chunks for multiple groups + output_queue_size: 64 # Increase from 32 + # queue for accumulating samples before further processing + dataset_buffer_size: 512 # Enable buffering (was 0) + # ring buffer to replace old samples with new ones when training is slow + ring_buffer_size: 1024 # Increase from 128 + # "virtual" sample queue per lead trainer + max_ready_samples_per_lead: 256 # Increase from 64 + shared_memory_entry_size: 1000000000 # Increase from 100M # AGENT CONFIGURATION agent_max_loops: 10 # max number of agent - environment interactions for each task From 2918d1fe15b72a6930c83c4bfeeffab47f846e7f Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Sat, 6 Sep 2025 04:00:18 +0000 Subject: [PATCH 118/166] pop old data --- conf/miniwob.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index af0397fe..0f13dc64 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -53,8 +53,6 @@ preprocess: input_queue_size: 64 # Increase from 32 # queue for ready chunks for multiple groups output_queue_size: 64 # Increase from 32 - # queue for accumulating samples before further processing - dataset_buffer_size: 512 # Enable buffering (was 0) # ring buffer to replace old samples with new ones when training is slow ring_buffer_size: 1024 # Increase from 128 # "virtual" sample queue per lead trainer From dacaa1f6c366a2996d2db3d88b375de801dca2b3 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Sun, 7 Sep 2025 04:27:41 +0000 Subject: [PATCH 119/166] do not save playwright traces & screenshots --- conf/miniwob.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index 0f13dc64..d10fbbb2 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -150,7 +150,7 @@ environment: host: "0.0.0.0" env_call_timeout: 120 # timeout for each environment call (e.g. start_task, act, etc.) 
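One remark on the preprocess tuning in PATCHES 117 and 118 above: the ring buffer comment describes a drop-oldest policy, the same shape as a bounded deque. A minimal illustration, not the actual implementation:

```python
from collections import deque

ring = deque(maxlen=1024)  # ring_buffer_size: once full, each append evicts the oldest sample
for sample in range(2000):
    ring.append(sample)
assert list(ring)[0] == 976  # samples 0..975 were replaced while training lagged
```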
web_env_target: examples.rl_webagent.environment.WebEnvironment - exp_path: ${output_dir}/env_server + exp_path: null headless: true observation_format: html From fcee5ee6bb27f4787d9700c109f718261d19a246 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Sun, 7 Sep 2025 05:01:47 +0000 Subject: [PATCH 120/166] return empty aggregate stats if empty stats --- pipelinerl/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pipelinerl/utils.py b/pipelinerl/utils.py index 2b0a252c..a6467271 100644 --- a/pipelinerl/utils.py +++ b/pipelinerl/utils.py @@ -239,6 +239,9 @@ def calculate_stats(stats: List | Dict[Any, Any]) -> Dict[str, float]: if not isinstance(stats, list): raise TypeError(f"Expected stats to be a list, got {type(stats)}") + if len(stats) == 0: + return {} + aggregated_stats = { "max": float(max(stats)), "min": float(min(stats)), From 631389f312a737fcf8852b9eea82ce0aa329ad43 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Sun, 7 Sep 2025 05:05:12 +0000 Subject: [PATCH 121/166] increase preprocessor power --- conf/miniwob.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index d10fbbb2..65e23d9c 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -45,10 +45,10 @@ actor: llm_max_rollouts: 32 preprocess: - n_workers: 16 # Increase from 8 - chunk_n_groups: 4 # Increase from 2 for better throughput + n_workers: 32 # Increase from 8 + chunk_n_groups: 8 # Increase from 2 for better throughput # queue for loaded raw groups - raw_queue_size: 16 # Increase from 8 + raw_queue_size: 32 # Increase from 8 # queue for processed chunks of multiple groups input_queue_size: 64 # Increase from 32 # queue for ready chunks for multiple groups From f7912114dd701a0d0a6484161a0bb5feba9ecced Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 8 Sep 2025 18:55:36 +0000 Subject: [PATCH 122/166] better error handling --- pipelinerl/domains/miniwob/rollouts.py | 31 ++++++++++++-------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 2df03815..72c52678 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -108,18 +108,24 @@ async def _execute_rollout_with_timeout( t = time.perf_counter() while start_attempts > 0: try: - tape_dict, _ = await env.start_task(problem) + start_result = await env.start_task(problem) + if isinstance(start_result, dict) and "error" in start_result: + raise ValueError(start_result['error']) + elif isinstance(start_result, tuple): + tape_dict, _ = start_result + else: + raise ValueError(f"Invalid start result: {start_result}") break except Exception as e: - logger.warning(f"Failed to start task {problem['dataset']}/{problem['task']}/{problem['seed']}") start_attempts -= 1 + logger.warning(f"Failed to start task {problem['dataset']}/{problem['task']}/{problem['seed']}. {start_attempts} attempts remaining. 
Error: {e}") if start_attempts <= 0: - logger.error("Failed to start task after all retry attempts") + logger.error(f"Failed to start task after all retry attempts: {e}") no_error = False tape_dict = {} break else: - logger.warning(f"retry after 5 seconds: {e}, {start_attempts} attempts remaining") + logger.warning("Retry start task after 5 seconds.") await asyncio.sleep(5) logger.info( f"Task {problem['dataset']}/{problem['task']}/{problem['seed']} started in {time.perf_counter() - t:.2f} seconds" @@ -138,28 +144,19 @@ async def _execute_rollout_with_timeout( tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) # Check if the tape has an error from the orchestrator (e.g., SocketTimeoutError) if tape.metadata.error: - logger.warning(f"Agent execution failed with error: {tape.metadata.error}") - agent_attempts -= 1 - if agent_attempts <= 0: - logger.error("Agent execution failed after all retry attempts") - no_error = False - break - else: - logger.warning(f"Retrying agent execution after 5 seconds, {agent_attempts} attempts remaining") - await asyncio.sleep(5) - continue + raise ValueError(tape.metadata.error) else: # Success - break out of retry loop break except Exception as e: - logger.warning(f"Error occurred while running agent: {e}") agent_attempts -= 1 + logger.warning(f"Error occurred while running agent. {agent_attempts} attempts remaining. Error: {e}") if agent_attempts <= 0: - logger.error("Agent execution failed after all retry attempts") + logger.error(f"Agent execution failed after all retry attempts: {e}") no_error = False break else: - logger.warning(f"Retrying agent execution after 5 seconds, {agent_attempts} attempts remaining") + logger.warning("Retry agent execution after 5 seconds.") await asyncio.sleep(5) logger.info( f"Agent finished task {problem['dataset']}/{problem['task']}/{problem['seed']} in {time.perf_counter() - t:.2f} seconds" From c54d90070583ee920797e1fb1c18bbbfbb8f4cc3 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 8 Sep 2025 19:21:36 +0000 Subject: [PATCH 123/166] fix --- pipelinerl/domains/miniwob/rollouts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 72c52678..4399f6a1 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -111,7 +111,7 @@ async def _execute_rollout_with_timeout( start_result = await env.start_task(problem) if isinstance(start_result, dict) and "error" in start_result: raise ValueError(start_result['error']) - elif isinstance(start_result, tuple): + elif isinstance(start_result, list): tape_dict, _ = start_result else: raise ValueError(f"Invalid start result: {start_result}") From ea4918a67cd0d35532fe0af4a7a3253e7fc3d160 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Tue, 9 Sep 2025 03:31:04 +0000 Subject: [PATCH 124/166] reduce timeouts --- conf/miniwob.yaml | 6 +++--- pipelinerl/domains/miniwob/rollouts.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/conf/miniwob.yaml b/conf/miniwob.yaml index 65e23d9c..1454e774 100644 --- a/conf/miniwob.yaml +++ b/conf/miniwob.yaml @@ -61,8 +61,8 @@ preprocess: # AGENT CONFIGURATION agent_max_loops: 10 # max number of agent - environment interactions for each task -agent_attempts: 3 # number of attempts to run the agent (retry on timeout/errors) -rollout_timeout: 1800 # overall timeout for entire rollout in seconds (30 minutes) +agent_attempts: 3 # number of attempts to 
run the agent (retry on errors) +rollout_timeout: 600 # overall timeout for entire rollout in seconds (10 minutes) reward_computation: nico agent: _target_: tapeagents.agent.Agent @@ -148,7 +148,7 @@ environment: miniwob_url: ??? n_envs: 32 host: "0.0.0.0" - env_call_timeout: 120 # timeout for each environment call (e.g. start_task, act, etc.) + env_call_timeout: 60 # timeout for each environment call (e.g. start_task, act, etc.) web_env_target: examples.rl_webagent.environment.WebEnvironment exp_path: null headless: true diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 4399f6a1..34ded1b6 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -72,7 +72,7 @@ async def generate_miniwob_rollout( start_time = time.time() # Overall timeout for the entire rollout to prevent hanging - rollout_timeout = getattr(cfg, 'rollout_timeout', 1800) # 30 minutes default + rollout_timeout = getattr(cfg, 'rollout_timeout', 600) # 10 minutes default try: # Execute the entire rollout with a timeout From e5fca104d7dfa7c9ea3b9c6e141e4e3e0d82a21b Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 12 Sep 2025 20:44:34 +0000 Subject: [PATCH 125/166] log number of groups done so far --- pipelinerl/actor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index ce63ac72..a329598f 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -196,6 +196,7 @@ async def rollout_and_maybe_produce_result( f"groups in progress: {len(group_rollouts)}, " f"rollouts started so far: {started_rollouts}, " f"rollouts finished so far: {finished_rollouts}, " + f"groups finished so far: {group_id}, " f"max group size in bytes: {result_queue.max_actual_entry_size()}, " ) last_logged = time.time() @@ -482,7 +483,6 @@ def run(self, dataset: list[tuple[str, dict]]): f" {in_progress} groups in progress" ) - self.update_stats(rollout_results=rollout_results) finished_groups += 1 From df66a88a8fb7c0e30907ba242019d0f5b91c9cfa Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 12 Sep 2025 20:45:45 +0000 Subject: [PATCH 126/166] log everything if populate_rl_data fails --- pipelinerl/preprocess.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/pipelinerl/preprocess.py b/pipelinerl/preprocess.py index 65e29b4b..cd34b54d 100644 --- a/pipelinerl/preprocess.py +++ b/pipelinerl/preprocess.py @@ -160,7 +160,18 @@ def preprocess_dataset( entry["step_index"] = entry["metadata"]["step_index"] if not isinstance(tokenizer.eos_token_id, int): raise ValueError(f"Tokenizer {tokenizer} does not have an eos_token_id") - dataset = populate_rl_data(dataset=dataset, eos_token_id=tokenizer.eos_token_id, config=rl_config) + try: + dataset = populate_rl_data(dataset=dataset, eos_token_id=tokenizer.eos_token_id, config=rl_config) + except Exception as e: + logger.error(f"Error in populate_rl_data: {e}") + logger.error(f"Data: {data}") + logger.error(f"Dataset: {dataset}") + logger.error(f"Tokenizer: {tokenizer}") + logger.error(f"Tokenizer eos_token_id: {tokenizer.eos_token_id}") + logger.error(f"RL config: {rl_config}") + logger.error(f"LLM: {llm}") + logger.error(f"Seq length: {seq_length}") + raise e return dataset From c8d017122fff127c4218b3b5d2b24a0a426897e4 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 12 Sep 2025 20:49:23 +0000 Subject: [PATCH 127/166] monitor env servers and reset if needed --- pipelinerl/domains/miniwob/rollouts.py | 100 
++++++++++++++++++++----- 1 file changed, 83 insertions(+), 17 deletions(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 34ded1b6..dff461c1 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -1,4 +1,5 @@ import asyncio +import json import logging import os import random @@ -55,6 +56,41 @@ def tape_contains_an_error(tape: WebTape) -> bool: ) +async def check_env_server_health(env_job: Job, session: aiohttp.ClientSession) -> dict: + """Check environment server health via HTTP API.""" + try: + url = f"http://{env_job.hostname}:{env_job.port}/health" + async with session.get(url, timeout=5) as response: + if response.status == 200: + health_data = await response.json() + return { + "healthy": True, + "active_workers": health_data.get("active_workers", 0), + "max_workers": health_data.get("max_workers", 0), + "stopped_workers": health_data.get("stopped_workers", 0) + } + else: + return {"healthy": False, "error": f"HTTP {response.status}"} + except Exception as e: + return {"healthy": False, "error": str(e)} + + +async def reset_env_server(env_job: Job, session: aiohttp.ClientSession) -> bool: + """Reset environment server via HTTP API.""" + try: + url = f"http://{env_job.hostname}:{env_job.port}/reset_all" + async with session.post(url, timeout=10) as response: + if response.status == 200: + logger.info(f"Reset environment server {env_job.hostname}:{env_job.port}") + return True + else: + logger.error(f"Reset failed: HTTP {response.status}") + return False + except Exception as e: + logger.error(f"Reset failed: {e}") + return False + + async def generate_miniwob_rollout( cfg: DictConfig, llm: TrainableLLM, @@ -74,16 +110,52 @@ async def generate_miniwob_rollout( # Overall timeout for the entire rollout to prevent hanging rollout_timeout = getattr(cfg, 'rollout_timeout', 600) # 10 minutes default - try: - # Execute the entire rollout with a timeout - return await asyncio.wait_for( - _execute_rollout_with_timeout(cfg, llm, problem, session, start_time), - timeout=rollout_timeout - ) - except asyncio.TimeoutError: - logger.error(f"Rollout timed out after {rollout_timeout} seconds for task {problem['dataset']}/{problem['task']}/{problem['seed']}") - # Return a failed rollout result - return _create_failed_rollout_result(problem, start_time, "timeout") + env_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment"] + env_jobs_url_tried = [] + + # Try each environment server with health checks until one of them returns a rollout result + for _ in range(len(env_jobs)): + # Choose the next environment server to try randomly from the ones that have not been tried yet + env_job = random.choice([job for job in env_jobs if f"http://{job.hostname}:{job.port}" not in env_jobs_url_tried]) + env_job_url = f"http://{env_job.hostname}:{env_job.port}" + env_jobs_url_tried.append(env_job_url) + + # Check server health before using + health = await check_env_server_health(env_job, session) + if not health["healthy"]: + logger.warning(f"Environment server {env_job_url} is unhealthy: {json.dumps(health, indent=2)}") + # Try to reset the server + if await reset_env_server(env_job, session): + logger.info(f"Reset environment server {env_job_url} successfully, retrying health check") + await asyncio.sleep(5) # Wait for server to restart + health = await check_env_server_health(env_job, session) + if not health["healthy"]: + logger.error(f"Environment server {env_job_url} still unhealthy after reset: 
{json.dumps(health, indent=2)}") + continue + else: + logger.error(f"Failed to reset environment server {env_job_url}") + continue + # Log health status for monitoring + if health["healthy"]: + logger.info(f"Using healthy environment server {env_job_url}: {json.dumps(health, indent=2)}") + + try: + # Execute the entire rollout with a timeout + return await asyncio.wait_for( + _execute_rollout_with_timeout(cfg, llm, problem, session, start_time, env_job_url), + timeout=rollout_timeout + ) + except asyncio.TimeoutError: + health = await check_env_server_health(env_job, session) + logger.warning(f"Rollout timed out after {rollout_timeout} seconds for task {problem['dataset']}/{problem['task']}/{problem['seed']} on environment {env_job_url}. Health: {json.dumps(health, indent=2)}. Trying next server.") + continue + except Exception as e: + health = await check_env_server_health(env_job, session) + logger.warning(f"Rollout failed for task {problem['dataset']}/{problem['task']}/{problem['seed']} on environment {env_job_url}. Health: {json.dumps(health, indent=2)}. Trying next server.") + continue + # If all servers failed + logger.error(f"All environment servers failed for task {problem['dataset']}/{problem['task']}/{problem['seed']}. Returning a failed rollout result.") + return _create_failed_rollout_result(problem, start_time, "all environment servers failed") async def _execute_rollout_with_timeout( @@ -92,14 +164,8 @@ async def _execute_rollout_with_timeout( problem: dict, session: aiohttp.ClientSession, start_time: float, + env_job_url: str, ) -> RolloutResult: - # (1) Choose a random environment server - env_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment"] - # choose the env job randomly - env_job = random.choice(env_jobs) - assert env_job.port is not None - env_job_url = f"http://{env_job.hostname}:{env_job.port}" - # (2) Generate environment, TapeAgent, and run them to get a Tape no_error = True # track if there was an error in the tape environment = AsyncRemoteEnvironment(server_url=env_job_url) # type: ignore From 981cd85a22dff384b011dab47f67b9daedba4c69 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 12 Sep 2025 21:04:32 +0000 Subject: [PATCH 128/166] better health message --- pipelinerl/domains/miniwob/rollouts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index dff461c1..b506a88c 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -70,9 +70,9 @@ async def check_env_server_health(env_job: Job, session: aiohttp.ClientSession) "stopped_workers": health_data.get("stopped_workers", 0) } else: - return {"healthy": False, "error": f"HTTP {response.status}"} + return {"healthy": False, "error_status": f"HTTP {response.status}", "error_message": response.text} except Exception as e: - return {"healthy": False, "error": str(e)} + return {"healthy": False, "error_status": "Unknown", "error_message": str(e)} async def reset_env_server(env_job: Job, session: aiohttp.ClientSession) -> bool: From 9c755ed82803f346aadb7826fda968d7e5909e8d Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Sat, 13 Sep 2025 03:16:00 +0000 Subject: [PATCH 129/166] small fix --- pipelinerl/domains/miniwob/rollouts.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index b506a88c..b9a9cb74 100644 --- 
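The rollout worker now treats an environment server as usable only after a successful `GET /health` probe, falling back to `POST /reset_all` and one re-check. A minimal standalone sketch of the probe (the endpoint paths follow the diff above; any response fields beyond the worker counts read there are assumptions):

```python
import asyncio
import aiohttp

async def probe(hostname: str, port: int) -> dict:
    # Sketch of the /health check added in PATCH 127/128: HTTP 200 with a JSON
    # body means healthy; anything else (or a transport error) means unhealthy.
    url = f"http://{hostname}:{port}/health"
    async with aiohttp.ClientSession() as session:
        try:
            async with session.get(url, timeout=5) as response:
                if response.status == 200:
                    return {"healthy": True, "health_data": await response.json()}
                return {"healthy": False, "error_message": f"HTTP {response.status}: {await response.text()}"}
        except Exception as e:
            return {"healthy": False, "error_message": str(e)}

# Example (hypothetical host/port): asyncio.run(probe("localhost", 7777))
```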
From 9c755ed82803f346aadb7826fda968d7e5909e8d Mon Sep 17 00:00:00 2001
From: Nicolas Gontier
Date: Sat, 13 Sep 2025 03:16:00 +0000
Subject: [PATCH 129/166] small fix

---
 pipelinerl/domains/miniwob/rollouts.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py
index b506a88c..b9a9cb74 100644
--- a/pipelinerl/domains/miniwob/rollouts.py
+++ b/pipelinerl/domains/miniwob/rollouts.py
@@ -70,7 +70,8 @@ async def check_env_server_health(env_job: Job, session: aiohttp.ClientSession)
                     "stopped_workers": health_data.get("stopped_workers", 0)
                 }
             else:
-                return {"healthy": False, "error_status": f"HTTP {response.status}", "error_message": response.text}
+                error_text = await response.text()
+                return {"healthy": False, "error_status": f"HTTP {response.status}", "error_message": error_text}
     except Exception as e:
         return {"healthy": False, "error_status": "Unknown", "error_message": str(e)}
@@ -123,21 +124,21 @@ async def generate_miniwob_rollout(
         # Check server health before using
         health = await check_env_server_health(env_job, session)
         if not health["healthy"]:
-            logger.warning(f"Environment server {env_job_url} is unhealthy: {json.dumps(health, indent=2)}")
+            logger.warning(f"Environment server {env_job_url} is unhealthy: {health}")
             # Try to reset the server
             if await reset_env_server(env_job, session):
                 logger.info(f"Reset environment server {env_job_url} successfully, retrying health check")
                 await asyncio.sleep(5)  # Wait for server to restart
                 health = await check_env_server_health(env_job, session)
                 if not health["healthy"]:
-                    logger.error(f"Environment server {env_job_url} still unhealthy after reset: {json.dumps(health, indent=2)}")
+                    logger.error(f"Environment server {env_job_url} still unhealthy after reset: {health}")
                     continue
             else:
                 logger.error(f"Failed to reset environment server {env_job_url}")
                 continue
         # Log health status for monitoring
         if health["healthy"]:
-            logger.info(f"Using healthy environment server {env_job_url}: {json.dumps(health, indent=2)}")
+            logger.info(f"Using healthy environment server {env_job_url}: {health}")
 
         try:
             # Execute the entire rollout with a timeout
@@ -147,11 +148,11 @@ async def generate_miniwob_rollout(
             )
         except asyncio.TimeoutError:
             health = await check_env_server_health(env_job, session)
-            logger.warning(f"Rollout timed out after {rollout_timeout} seconds for task {problem['dataset']}/{problem['task']}/{problem['seed']} on environment {env_job_url}. Health: {json.dumps(health, indent=2)}. Trying next server.")
+            logger.warning(f"Rollout timed out after {rollout_timeout} seconds for task {problem['dataset']}/{problem['task']}/{problem['seed']} on environment {env_job_url}. Health: {health}. Trying next server.")
             continue
         except Exception as e:
             health = await check_env_server_health(env_job, session)
-            logger.warning(f"Rollout failed for task {problem['dataset']}/{problem['task']}/{problem['seed']} on environment {env_job_url}. Health: {json.dumps(health, indent=2)}. Trying next server.")
+            logger.warning(f"Rollout failed for task {problem['dataset']}/{problem['task']}/{problem['seed']} on environment {env_job_url}. Health: {health}. Trying next server.")
             continue
     # If all servers failed
     logger.error(f"All environment servers failed for task {problem['dataset']}/{problem['task']}/{problem['seed']}. Returning a failed rollout result.")

From ea2d393005cd17b6bd2000234fae4cceec94c1c1 Mon Sep 17 00:00:00 2001
From: Alex Piche
Date: Mon, 22 Sep 2025 16:37:31 +0000
Subject: [PATCH 130/166] kl new old

---
 pipelinerl/finetune/rl/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pipelinerl/finetune/rl/__init__.py b/pipelinerl/finetune/rl/__init__.py
index 57aa4fa7..3211d579 100644
--- a/pipelinerl/finetune/rl/__init__.py
+++ b/pipelinerl/finetune/rl/__init__.py
@@ -260,6 +260,7 @@ def rl_step(
     )
 
     approx_kl = torch.exp(log_ratio_ref_new_clamp) - log_ratio_ref_new_clamp - 1  # Schulman KL approx
+    approx_kl_new_old = torch.exp(log_ratio_new_old) - log_ratio_new_old - 1  # Schulman KL approx
     assert torch.isfinite(approx_kl).all(), f"approx_kl is not finite: {approx_kl}"
 
     entropy_bonus_coef = linear_decay_coef(current_step, max_step, config.entropy_bonus, config.final_entropy_bonus)
@@ -337,6 +338,7 @@ def rl_step(
         "max_advantage": advantages[masks_shifted].max().item(),
         "min_advantage": advantages[masks_shifted].min().item(),
         "kl": sum_sum(approx_kl / num_labels_in_seq, masks_shifted, segments).item(),
+        "kl_new_old": sum_sum(approx_kl_new_old / num_labels_in_seq, masks_shifted, segments).item(),
         "max_kl": approx_kl[masks_shifted].max().item(),
         "min_kl": approx_kl[masks_shifted].min().item(),
         "policy_loss": sum_sum(policy_loss / num_labels_in_seq, masks_shifted, segments).item(),
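Both KL metrics use Schulman's k3 estimator: for a per-token log-ratio l, k3 = exp(l) - l - 1, which is non-negative for any l and has lower variance than the naive estimator. A minimal sketch of the formula used by both `kl` and the new `kl_new_old` metric:

```python
import torch

def k3_kl(log_ratio: torch.Tensor) -> torch.Tensor:
    # Schulman's k3 KL estimator: exp(l) - l - 1 >= 0, with equality at l = 0.
    return torch.exp(log_ratio) - log_ratio - 1

# Worked check: identical policies (l = 0) give exactly 0 estimated KL.
assert k3_kl(torch.tensor(0.0)).item() == 0.0
```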
""" # Convert to pandas for processing df_init = pd.DataFrame(dataset) @@ -398,7 +391,7 @@ def populate_rl_data(dataset: list[dict[str, Any]], eos_token_id: int, config: R # Step 1: calculate group-level statistics df_stats = df_init[["group_id", "rollout_index", "step_index"]].copy() - df_stats["num_tokens"] = df_init["input_ids"].apply(lambda x: len(x)) + df_stats["num_tokens"] = df_init["input_ids"].apply(len) # We assume that rewards for all tokens are the same df_stats["rollout_reward"] = df_init["rewards"].apply(lambda x: x[0]) # Check that the reward is the same for each step in the rollout @@ -408,15 +401,22 @@ def populate_rl_data(dataset: list[dict[str, Any]], eos_token_id: int, config: R df_grouped = ( df_stats.groupby("group_id") .agg( - rollout_reward_mean=("rollout_reward", "mean"), + rollout_reward_sum=("rollout_reward", "sum"), + rollout_reward_count=("rollout_reward", "count"), rollout_reward_std=("rollout_reward", "std"), - group_tokens=("num_tokens", "mean"), + group_tokens=("num_tokens", "mean"), ) .reset_index() ) - assert df_grouped.columns.tolist() == ["group_id", "rollout_reward_mean", "rollout_reward_std", "group_tokens"] - - # Step 2: calculate advantages for each sample + assert df_grouped.columns.tolist() == [ + "group_id", + "rollout_reward_sum", + "rollout_reward_count", + "rollout_reward_std", + "group_tokens", + ] + + # Step 2: calculate advantages for each sample (with LOO mean) df_advantages = pd.merge( df_init[["group_id", "rollout_index", "step_index", "rewards"]], df_grouped, @@ -424,26 +424,37 @@ def populate_rl_data(dataset: list[dict[str, Any]], eos_token_id: int, config: R how="left" ) assert len(df_advantages) == len(df_init) + def calculate_advantages(row): rewards = row["rewards"] - mean = row["rollout_reward_mean"] + group_sum = row["rollout_reward_sum"] + group_count = row["rollout_reward_count"] + current_reward = rewards[0] # same reward across tokens in rollout + + # Leave-one-out mean + if group_count > 1: + loo_mean = (group_sum - current_reward) / (group_count - 1) + else: + loo_mean = current_reward # degenerate case: only one rollout in group + std = row["rollout_reward_std"] if config.divide_advantage_by_std: - advantages = [(reward - mean) / (np.nan_to_num(std) + 1e-4) for reward in rewards] + advantages = [(r - loo_mean) / (np.nan_to_num(std) + 1e-4) for r in rewards] else: - advantages = [(reward - mean) for reward in rewards] + advantages = [(r - loo_mean) for r in rewards] return advantages - df_advantages["advantages"] = df_advantages.apply( - calculate_advantages, - axis=1, + + df_advantages["advantages"] = df_advantages.apply(calculate_advantages, axis=1) + df_advantages = df_advantages.drop( + columns=["rewards", "rollout_reward_sum", "rollout_reward_count", "rollout_reward_std"] ) - df_advantages = df_advantages.drop(columns=["rewards", "rollout_reward_mean", "rollout_reward_std"]) - assert df_advantages.columns.tolist() == ["group_id", "rollout_index", "step_index", "group_tokens", "advantages"] + assert df_advantages.columns.tolist() == [ + "group_id", "rollout_index", "step_index", "group_tokens", "advantages" + ] # Step 3: bring advantages and group level stats back to the main df df = df_init.drop(columns=["advantages", "group_tokens"]) df = pd.merge(df, df_advantages, on=["group_id", "rollout_index", "step_index"], how="left") - # Debug print lengths of all dataframes assert len(df) == len(df_init) # Step 4: make token-level overflow and mean group length information @@ -452,7 +463,9 @@ def calculate_advantages(row): 
axis=1, ) df["group_tokens"] = df.apply(lambda row: [row["group_tokens"]] * len(row["input_ids"]), axis=1) - df["num_labels"] = df.apply(lambda row: [sum(1 for label in row["labels"] if label != -100)] * len(row["input_ids"]), axis=1) + df["num_labels"] = df.apply( + lambda row: [sum(1 for label in row["labels"] if label != -100)] * len(row["input_ids"]), axis=1 + ) # Step 5: move the results back to the dataset advantages_list = df["advantages"].tolist() From 0b8a24d3a209cef477442d37ba7f238f36c32839 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 26 Sep 2025 04:31:39 +0000 Subject: [PATCH 132/166] better logs --- pipelinerl/actor.py | 2 +- pipelinerl/domains/miniwob/rollouts.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index a329598f..bcce006b 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -196,7 +196,7 @@ async def rollout_and_maybe_produce_result( f"groups in progress: {len(group_rollouts)}, " f"rollouts started so far: {started_rollouts}, " f"rollouts finished so far: {finished_rollouts}, " - f"groups finished so far: {group_id}, " + f"groups started so far: {group_id}, " f"max group size in bytes: {result_queue.max_actual_entry_size()}, " ) last_logged = time.time() diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index b9a9cb74..3e941dae 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -73,7 +73,10 @@ async def check_env_server_health(env_job: Job, session: aiohttp.ClientSession) error_text = await response.text() return {"healthy": False, "error_status": f"HTTP {response.status}", "error_message": error_text} except Exception as e: - return {"healthy": False, "error_status": "Unknown", "error_message": str(e)} + exception_type = type(e).__name__ + exception_message = str(e) if str(e) else "No message available" + logger.exception(f"Error checking environment server health: {exception_type}: {exception_message}", stack_info=True) + return {"healthy": False, "error_status": f"Exception: {exception_type}", "error_message": exception_message} async def reset_env_server(env_job: Job, session: aiohttp.ClientSession) -> bool: From 1247360cb1545e9ada775a06ca9bad05442c5816 Mon Sep 17 00:00:00 2001 From: rafapi Date: Fri, 26 Sep 2025 11:29:57 +0000 Subject: [PATCH 133/166] Add new metrics --- pipelinerl/actor.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index 46b6606f..cb9a4434 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -349,6 +349,8 @@ def update_stats(self, rollout_results: List[RolloutResult]): self.model_versions_list.append(result.model_version) domain_agnostic_metrics = self.compute_domain_agnostic_metrics(result) all_metrics = result.metrics.model_dump() | domain_agnostic_metrics + all_metrics["used_python"] = int(all_metrics.get("used_python", False)) + all_metrics["used_math_answer"] = int(all_metrics.get("used_math_answer", False)) for k, v in all_metrics.items(): if isinstance(v, list): self.stats[k][dataset_name][group_id] += v @@ -549,6 +551,21 @@ def publish_stats(self, stats_writer: StreamWriter, loop_stats: Dict): stats |= loop_stats for k, v in self.sliding_stats.items(): stats[k] = sum(v) / len(v) if v else 0 + + rename_suffixes = { + "num_python_calls_mean": "python_calls_mean", + "used_python_mean": "python_usage_rate", + "num_math_answer_calls_mean": "math_answer_calls_mean", + 
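The effect of the leave-one-out baseline, on a toy group of four rollouts (illustrative numbers only, not taken from any run):

```python
# Toy illustration of the LOO baseline introduced above (made-up rewards).
rewards = [1.0, 0.0, 0.0, 1.0]  # one scalar reward per rollout in a group
group_sum, group_count = sum(rewards), len(rewards)

for r in rewards:
    loo_mean = (group_sum - r) / (group_count - 1)  # baseline excludes the rollout itself
    advantage = r - loo_mean
    print(r, loo_mean, advantage)
# r=1.0 -> baseline (0+0+1)/3 = 1/3, advantage +2/3
# r=0.0 -> baseline (1+0+1)/3 = 2/3, advantage -2/3
# Unlike the plain group mean, the baseline never leaks the rollout's own reward.
```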
"used_math_answer_mean": "math_answer_usage_rate", + } + + for key in list(stats.keys()): + for old_suffix, new_suffix in rename_suffixes.items(): + if key.endswith(old_suffix): + prefix = key[: -len(old_suffix)] + stats[f"{prefix}{new_suffix}"] = stats[key] + break + if self.cfg.wandb.use_wandb: wandb.log({f"actor/{k}": v for k, v in stats.items()}) stats_writer.write(stats) @@ -592,11 +609,18 @@ def run_actor_loop(cfg: DictConfig): else: actor_model_path = cfg.model_path + # Align client-side context size with vLLM server max_model_len when available + try: + _context_size = int(cfg.vllm_config.vllm_kwargs.max_model_len) + except Exception: + _context_size = 32000 + train_llms = [ TrainableLLM( base_url=url, model_name=str(actor_model_path), tokenizer_name=str(actor_model_path), + context_size=_context_size, parameters=cfg.llm.parameters, use_cache=False, collect_logprobs=True, @@ -609,6 +633,7 @@ def run_actor_loop(cfg: DictConfig): base_url=url, model_name=str(actor_model_path), tokenizer_name=str(actor_model_path), + context_size=_context_size, parameters=cfg.test_llm.parameters, use_cache=False, collect_logprobs=True, From cd27e30f485594393163ab742758510cd82c14cc Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 26 Sep 2025 19:44:15 +0000 Subject: [PATCH 134/166] always check the worker before launching the agent on it + more detailed logs --- pipelinerl/domains/miniwob/rollouts.py | 67 +++++++++++--------------- 1 file changed, 28 insertions(+), 39 deletions(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 3e941dae..3243ea1f 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -65,34 +65,17 @@ async def check_env_server_health(env_job: Job, session: aiohttp.ClientSession) health_data = await response.json() return { "healthy": True, - "active_workers": health_data.get("active_workers", 0), - "max_workers": health_data.get("max_workers", 0), - "stopped_workers": health_data.get("stopped_workers", 0) + "health_data": health_data, + "last_check": time.time() } else: error_text = await response.text() - return {"healthy": False, "error_status": f"HTTP {response.status}", "error_message": error_text} + return {"healthy": False, "error_message": f"HTTP {response.status}: {error_text}", "last_check": time.time()} except Exception as e: exception_type = type(e).__name__ exception_message = str(e) if str(e) else "No message available" logger.exception(f"Error checking environment server health: {exception_type}: {exception_message}", stack_info=True) - return {"healthy": False, "error_status": f"Exception: {exception_type}", "error_message": exception_message} - - -async def reset_env_server(env_job: Job, session: aiohttp.ClientSession) -> bool: - """Reset environment server via HTTP API.""" - try: - url = f"http://{env_job.hostname}:{env_job.port}/reset_all" - async with session.post(url, timeout=10) as response: - if response.status == 200: - logger.info(f"Reset environment server {env_job.hostname}:{env_job.port}") - return True - else: - logger.error(f"Reset failed: HTTP {response.status}") - return False - except Exception as e: - logger.error(f"Reset failed: {e}") - return False + return {"healthy": False, "error_message": f"Exception: {exception_type}: {exception_message}", "last_check": time.time()} async def generate_miniwob_rollout( @@ -128,17 +111,7 @@ async def generate_miniwob_rollout( health = await check_env_server_health(env_job, session) if not health["healthy"]: 
logger.warning(f"Environment server {env_job_url} is unhealthy: {health}") - # Try to reset the server - if await reset_env_server(env_job, session): - logger.info(f"Reset environment server {env_job_url} successfully, retrying health check") - await asyncio.sleep(5) # Wait for server to restart - health = await check_env_server_health(env_job, session) - if not health["healthy"]: - logger.error(f"Environment server {env_job_url} still unhealthy after reset: {health}") - continue - else: - logger.error(f"Failed to reset environment server {env_job_url}") - continue + continue # Log health status for monitoring if health["healthy"]: logger.info(f"Using healthy environment server {env_job_url}: {health}") @@ -198,38 +171,54 @@ async def _execute_rollout_with_timeout( logger.warning("Retry start task after 5 seconds.") await asyncio.sleep(5) logger.info( - f"Task {problem['dataset']}/{problem['task']}/{problem['seed']} started in {time.perf_counter() - t:.2f} seconds" + f"Task {problem['dataset']}/{problem['task']}/{problem['seed']} started in {time.perf_counter() - t:.2f} seconds. Worker ID: {env.worker_id}. Tape dict: {tape_dict}" ) tape: WebTape = WebTape(**tape_dict) # convert http response dict to WebTape object t = time.perf_counter() if no_error: # only run the agent if the task started successfully - logger.info(f"Running agent for task {problem['dataset']}/{problem['task']}/{problem['seed']}") + logger.info(f"Running agent for task {problem['dataset']}/{problem['task']}/{problem['seed']} with worker ID: {env.worker_id} and tape ID {tape.metadata.id}") agent_attempts = cfg.agent_attempts while agent_attempts > 0: + # check if the worker is alive. + try: + # this will either raise RuntimeError if worker is not alive anymore, or return a dictionary with the worker status + worker_status = await env.check_worker_alive() + if worker_status.get("status") == "starting": + logger.warning(f"Worker {env.worker_id} for task {problem['dataset']}/{problem['task']}/{problem['seed']} and tape ID {tape.metadata.id} is starting, waiting 5 seconds for it to be fully started.") + await asyncio.sleep(5) + continue + except Exception as e: + # if worker is dead, no need to retry + logger.exception(f"Worker {env.worker_id} for task {problem['dataset']}/{problem['task']}/{problem['seed']} and tape ID {tape.metadata.id} is dead. Error: {e}", stack_info=True) + no_error = False + break + # if worker is alive, run the agent try: actions = await env.a_actions() tools_description = await env.a_tools_description() agent: Agent = instantiate(cfg.agent, known_actions=actions, tools_description=tools_description) agent.llms = {DEFAULT: llm} tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) - # Check if the tape has an error from the orchestrator (e.g., SocketTimeoutError) + # Check if the tape has an error from the orchestrator (e.g., SocketTimeoutError, RuntimeError: Worker is not alive, etc.) 
if tape.metadata.error: + logger.error(f"Agent execution for task {problem['dataset']}/{problem['task']}/{problem['seed']} with worker ID: {env.worker_id} and tape ID {tape.metadata.id} returned a tape with error: {tape.metadata.error}") raise ValueError(tape.metadata.error) else: # Success - break out of retry loop + logger.info(f"Agent execution for task {problem['dataset']}/{problem['task']}/{problem['seed']} with worker ID: {env.worker_id} and tape ID {tape.metadata.id} finished successfully") break except Exception as e: agent_attempts -= 1 - logger.warning(f"Error occurred while running agent. {agent_attempts} attempts remaining. Error: {e}") + logger.warning(f"Error occurred while running agent for task {problem['dataset']}/{problem['task']}/{problem['seed']} with worker ID: {env.worker_id} and tape ID {tape.metadata.id}. {agent_attempts} attempts remaining. Error: {e}") if agent_attempts <= 0: - logger.error(f"Agent execution failed after all retry attempts: {e}") + logger.error(f"Agent execution failed after all retry attempts for task {problem['dataset']}/{problem['task']}/{problem['seed']} with worker ID: {env.worker_id} and tape ID {tape.metadata.id}: {e}") no_error = False break else: - logger.warning("Retry agent execution after 5 seconds.") + logger.warning(f"Retry agent execution after 5 seconds for task {problem['dataset']}/{problem['task']}/{problem['seed']} with worker ID: {env.worker_id} and tape ID {tape.metadata.id}.") await asyncio.sleep(5) logger.info( - f"Agent finished task {problem['dataset']}/{problem['task']}/{problem['seed']} in {time.perf_counter() - t:.2f} seconds" + f"Agent finished task {problem['dataset']}/{problem['task']}/{problem['seed']} in {time.perf_counter() - t:.2f} seconds with worker ID: {env.worker_id} and tape ID {tape.metadata.id}" ) tape.metadata.result.update({"total_execution_time": time.perf_counter() - t}) From f9ce99e7efa0392085c03cb346db8786031a8de7 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 29 Sep 2025 19:35:17 +0000 Subject: [PATCH 135/166] log stack trace --- pipelinerl/domains/miniwob/rollouts.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index 3243ea1f..bdd753a8 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -4,6 +4,7 @@ import os import random import time +import traceback import aiohttp from examples.rl_webagent.steps import WebTape @@ -75,7 +76,7 @@ async def check_env_server_health(env_job: Job, session: aiohttp.ClientSession) exception_type = type(e).__name__ exception_message = str(e) if str(e) else "No message available" logger.exception(f"Error checking environment server health: {exception_type}: {exception_message}", stack_info=True) - return {"healthy": False, "error_message": f"Exception: {exception_type}: {exception_message}", "last_check": time.time()} + return {"healthy": False, "error_message": f"Exception: {exception_type}: {exception_message}", "last_check": time.time(), "error_stacktrace": traceback.format_exc()} async def generate_miniwob_rollout( @@ -111,6 +112,7 @@ async def generate_miniwob_rollout( health = await check_env_server_health(env_job, session) if not health["healthy"]: logger.warning(f"Environment server {env_job_url} is unhealthy: {health}") + logger.warning(f"Get health error stacktrace: {health['error_stacktrace']}") continue # Log health status for monitoring if health["healthy"]: @@ -124,10 +126,16 @@ async def 
generate_miniwob_rollout( ) except asyncio.TimeoutError: health = await check_env_server_health(env_job, session) + if stack_trace := health.get("error_stacktrace"): + logger.warning(f"Get health error stacktrace: {stack_trace}") + logger.warning(f"Rollout timeout error stacktrace: {traceback.format_exc()}") logger.warning(f"Rollout timed out after {rollout_timeout} seconds for task {problem['dataset']}/{problem['task']}/{problem['seed']} on environment {env_job_url}. Health: {health}. Trying next server.") continue except Exception as e: health = await check_env_server_health(env_job, session) + if stack_trace := health.get("error_stacktrace"): + logger.warning(f"Get health error stacktrace: {stack_trace}") + logger.warning(f"Rollout failed error stacktrace: {traceback.format_exc()}") logger.warning(f"Rollout failed for task {problem['dataset']}/{problem['task']}/{problem['seed']} on environment {env_job_url}. Health: {health}. Trying next server.") continue # If all servers failed From 60fb04282d3157eaa771dab5c6282e84a3b0bcb6 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 29 Sep 2025 19:50:02 +0000 Subject: [PATCH 136/166] small cleanup --- pipelinerl/domains/miniwob/rollouts.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/pipelinerl/domains/miniwob/rollouts.py b/pipelinerl/domains/miniwob/rollouts.py index bdd753a8..ec71ff8e 100644 --- a/pipelinerl/domains/miniwob/rollouts.py +++ b/pipelinerl/domains/miniwob/rollouts.py @@ -159,13 +159,9 @@ async def _execute_rollout_with_timeout( t = time.perf_counter() while start_attempts > 0: try: - start_result = await env.start_task(problem) - if isinstance(start_result, dict) and "error" in start_result: - raise ValueError(start_result['error']) - elif isinstance(start_result, list): - tape_dict, _ = start_result - else: - raise ValueError(f"Invalid start result: {start_result}") + tape_dict, info = await env.start_task(problem) + if info.get("error"): + raise ValueError(info['error']) break except Exception as e: start_attempts -= 1 From 61c91c73bac1903b5d6260f5686af7102ff8954c Mon Sep 17 00:00:00 2001 From: rafapi Date: Tue, 30 Sep 2025 18:42:11 +0000 Subject: [PATCH 137/166] Embedded envs --- conf/base.yaml | 2 + conf/mcp.yaml | 151 +++-- pipelinerl/domains/mcp/__init__.py | 2 +- pipelinerl/domains/mcp/env_server.py | 948 ++++++++++++++++++++++++++- pipelinerl/domains/mcp/rollouts.py | 245 +++++-- pipelinerl/domains/mcp/steps.py | 6 +- pipelinerl/launch.py | 41 +- pipelinerl/rl_tool_parser_plugin.py | 189 +++++- pipelinerl/utils.py | 10 +- pipelinerl/vllm0.py | 19 + pipelinerl/world.py | 2 +- 11 files changed, 1445 insertions(+), 170 deletions(-) diff --git a/conf/base.yaml b/conf/base.yaml index 82b95d91..638d2c13 100644 --- a/conf/base.yaml +++ b/conf/base.yaml @@ -81,6 +81,8 @@ world: actor_group_port: 9000 environment_start_port: 7777 +# Remote vs embedded environment execution strategy + environment_mode: remote # this will be autocreated based on the config jobs: [] diff --git a/conf/mcp.yaml b/conf/mcp.yaml index c4b050b8..cf85ca18 100644 --- a/conf/mcp.yaml +++ b/conf/mcp.yaml @@ -1,18 +1,27 @@ defaults: - base + - override finetune: grpo - _self_ +llm: + parameters: + max_tokens: 8192 + +test_llm: + parameters: + max_tokens: 8192 actor: rollout_policy: pipelinerl.domains.mcp.generate_mcp_rollout - system_prompt: Please reason step by step, and put your final answer within \boxed{}. + system_prompt: Please reason step by step, and put your final answer within \boxed{{}}. 
From 61c91c73bac1903b5d6260f5686af7102ff8954c Mon Sep 17 00:00:00 2001
From: rafapi
Date: Tue, 30 Sep 2025 18:42:11 +0000
Subject: [PATCH 137/166] Embedded envs

---
 conf/base.yaml                       |   2 +
 conf/mcp.yaml                        | 151 +++--
 pipelinerl/domains/mcp/__init__.py   |   2 +-
 pipelinerl/domains/mcp/env_server.py | 948 ++++++++++++++++++++++++++-
 pipelinerl/domains/mcp/rollouts.py   | 245 +++++--
 pipelinerl/domains/mcp/steps.py      |   6 +-
 pipelinerl/launch.py                 |  41 +-
 pipelinerl/rl_tool_parser_plugin.py  | 189 +++++-
 pipelinerl/utils.py                  |  10 +-
 pipelinerl/vllm0.py                  |  19 +
 pipelinerl/world.py                  |   2 +-
 11 files changed, 1445 insertions(+), 170 deletions(-)

diff --git a/conf/base.yaml b/conf/base.yaml
index 82b95d91..638d2c13 100644
--- a/conf/base.yaml
+++ b/conf/base.yaml
@@ -81,6 +81,8 @@ world:
   actor_group_port: 9000
   environment_start_port: 7777
 
+# Remote vs embedded environment execution strategy
+  environment_mode: remote
 
 # this will be autocreated based on the config
 jobs: []

diff --git a/conf/mcp.yaml b/conf/mcp.yaml
index c4b050b8..cf85ca18 100644
--- a/conf/mcp.yaml
+++ b/conf/mcp.yaml
@@ -1,18 +1,27 @@
 defaults:
   - base
+  - override finetune: grpo
   - _self_
 
+llm:
+  parameters:
+    max_tokens: 8192
+
+test_llm:
+  parameters:
+    max_tokens: 8192
 
 actor:
   rollout_policy: pipelinerl.domains.mcp.generate_mcp_rollout
-  system_prompt: Please reason step by step, and put your final answer within \boxed{}.
+  system_prompt: Please reason step by step, and put your final answer within \boxed{{}}.
   llm_max_rollouts: 64
   task_template: |-
     {task}
+  shared_memory_entry_size: 10000000
 
 finetune:
-  seq_length: 48000
-  seq_parallel: 4
+  seq_length: 128000
+  seq_parallel: 8
 
 dataset_loader: pipelinerl.domains.math.load_datasets
 train_dataset_names:
   - open_reasoner_zero_57k
   - open_reasoner_zero_extended_72k
 test_dataset_names:
   - aime_2025
-  - aime_2024
-  - amc_2023
-  - math_500
 
 vllm_config:
   use_v1: false
   vllm_kwargs:
     enable-auto-tool-choice: ""
     tool-call-parser: rl_tool
     tool-parser-plugin: ${hydra:runtime.cwd}/pipelinerl/rl_tool_parser_plugin.py
-    max_model_len: 40960
+    max-num-seqs: ${actor.llm_max_rollouts}
+    max-num-batched-tokens: 4096
+    max_model_len: 128000
+    gpu-memory-utilization: 0.85
 
 environment:
-  _target_: pipelinerl.domains.mcp.MCPEnvironmentServer
-  n_envs: 8
-  host: "0.0.0.0"
-  exp_path: ${output_dir}/env_server
-  mcp_target: tapeagents.mcp.MCPEnvironment
-  mcp_config_path: ${hydra:runtime.cwd}/conf/mcp/python.json
-  mcp_tools_whitelist:
+  _target_: pipelinerl.domains.mcp.env_server.EmbeddedMCPEnvironment
+  config_path: ${hydra:runtime.cwd}/conf/mcp/python.json
+  tools_whitelist:
+    - run_python_code
+  read_timeout_seconds: 600
+  use_cache: false
+  runtime_pool_workers: 4
+  offload_tools:
     - run_python_code
-  env_call_timeout: 600 # Increased from default 60s to 10 minutes
-  mcp_read_timeout_seconds: 3000
 
 world:
   env_replicas_per_actor: 8
+  environment_mode: embedded
 
 agent_max_loops: 3
 
 agent:
   _target_: tapeagents.agent.Agent
   max_iterations: 2
   store_llm_calls: true
   templates:
     system_prompt: |
-      You are an expert AI Agent trained to assist users with complex information processing tasks.
-      Your role is to understand user queries and respond in a helpful and accurate manner.
+      You are a math-focused AI Agent. Solve problems by combining clear symbolic reasoning
+      with short, deterministic Python code. Keep your replies concise and direct.
       Prioritize clarity and avoid over-elaboration.
-      Do not express emotions or opinions about user questions.
+      Always present the final answer in LaTeX \boxed{{}}.
+      Do not express emotions or opinions about user questions.
+
+      Workflow:
+      1. Draft a brief plan in plain text.
+      2. Execute one run_python_code call to compute or verify the result.
+      3. Finalize by calling MathAnswer with the LaTeX-formatted answer.
+
+      Python execution policy (run_python_code):
+      - Use Python strictly for pure computation to verify and validate the final answer.
+      - No network, file system, OS or environment access.
+      - Keep snippets minimal and self-contained; avoid large outputs and long-running loops; print only the final result.
+
+      Validation:
+      - Cross-check results (alternative derivation, invariants, higher precision) before finalizing.
+      - If execution fails, propose the minimal fix and retry.
+
       Keep replies direct and avoid unnecessary text.
     allowed_tools: |
-      You have access to the following tools:
+      You can call the following tools:
       {tools_description}
+      - run_python_code: deterministic math code; print only the final value.
+      - MathAnswer: return the LaTeX \boxed{{}} answer when the solution is verified.
+      Always verify with run_python_code before invoking MathAnswer.
     thought_format: |
       Important! Respond with the plain text, do not include any JSON or code.
       Do not output anything besides what I asked in this message.
     allowed_steps: |
-      You have access to the following tools:
-      {tools_description}
-    format: >
-      Output only a single JSON dict.
-      Do not repeat the last thought again.
-      If the last action does not change the observation, do not repeat it!
-      DO NOT OUTPUT ANYTHING BESIDES THE JSON! DO NOT PLACE ANY COMMENTS INSIDE THE JSON.
-      It will break the system that processes the output.
+      Workflow summary:
+      - Plan briefly in plain text.
+      - Call run_python_code exactly once per loop to compute/verify.
+      - Finish with a single MathAnswer tool call carrying the \boxed{{}} result.
+    format: |
+      For finalization, reply with a single short sentence that ends in the \boxed{{}} answer,
+      immediately followed by the MathAnswer function call containing the same \boxed{{}} value.
+      Never emit unrelated JSON wrappers or duplicate the final thought.
 
   nodes:
     - _target_: tapeagents.nodes.StandardNode
      name: plan
       system_prompt: ${agent.templates.system_prompt}
       guidance: |
-        Write a concise multi-step plan explaining which steps should be performed to find the answer for the given task.
-        Be specific about how each step should be performed. Only describe the intended actions here, do not perform them yet.
-        Consider that next steps may depend on results of previous steps, so include conditional branching using "if" statements where needed.
-        Start with the title "Plan". Every step should have short name and description.
+        Produce a concise math plan (formulas/checks). You will ALWAYS verify by executing Python code.
         ${agent.templates.thought_format}
       steps_prompt: ${agent.templates.allowed_tools}
+      trim_obs_except_last_n: 2
 
     - _target_: tapeagents.nodes.StandardNode
-      name: select
+      name: code
       system_prompt: ${agent.templates.system_prompt}
-      trim_obs_except_last_n: 100
       guidance: |
-        Select the next step to do to move forward with the plan. Describe the expected effect of the proposed action.
-        ${agent.templates.thought_format}
-      steps_prompt: ${agent.templates.allowed_tools}
+        ALWAYS call run_python_code once to compute/verify the result.
+        Use exact, deterministic code; print only the final scalar or tuple.
+        If code fails, fix minimally and call run_python_code again after reviewing the error.
+      use_known_actions: true
+      use_function_calls: true
+      trim_obs_except_last_n: 2
 
     - _target_: tapeagents.nodes.StandardNode
-      name: act
+      name: finalize
       system_prompt: ${agent.templates.system_prompt}
-      trim_obs_except_last_n: 100
-      guidance: Then produce single function call for the next step. If the answer is ready, call MathAnswer. Put your final answer within \boxed{}.
+      guidance: |
+        Read the last Python stdout value. First, state the answer in one short sentence that ends with LaTeX \boxed{{}}.
+        Immediately after that sentence, call the MathAnswer tool exactly once with:
+          name: MathAnswer
+          arguments: {"answer": ""}
+        Do not add any extra text around the tool call. Once the sentence is emitted, return only the MathAnswer function call.
       steps:
         - pipelinerl.domains.mcp.steps.MathAnswer
       use_known_actions: true
       use_function_calls: true
+      trim_obs_except_last_n: 2
+      next_node: code
 
-    - _target_: tapeagents.nodes.StandardNode
-      name: summarize
-      system_prompt: ${agent.templates.system_prompt}
-      trim_obs_except_last_n: 100
-      guidance: |
-        Summarize last observation. If its an image, thoroughly describe it with all details.
-        Describe the results of the last action and observed changes
-        Do not hallucinate or make up any information, only describe what you see in the observation.
-        Do not guess or assume action effects, describe only visible changes.
-        ${agent.templates.thought_format}
-
-    - _target_: tapeagents.nodes.StandardNode
-      name: reflect
-      system_prompt: ${agent.templates.system_prompt}
-      trim_obs_except_last_n: 100
-      guidance: |
-        1. Evaluate the action's success, explain its effect on current step, overall plan and task solution.
-        2. If the last action was not successful, describe errors and the possible reasons for failure.
-        3. Check if the current plan step is finished.
-        4. If the step is finished, update the following steps of the plan with new information and choose the next step.
-        ${agent.templates.thought_format}
-      next_node: select
+# model_path: Qwen/Qwen3-8B
+model_path: /mnt/llmd/base_models/ServiceNow-AI/7_9_25_14b_text_reasoning_sft
 
-model_path: Qwen/Qwen3-8B
\ No newline at end of file
+# Local reward shaping for tool usage
+python_tool_shaping:
+  bonus_on_correct_with_python: 0.2
+  penalty_on_incorrect_without_python: 0.1
+  max_abs: 0.2
+
+# Encourage concise outputs (penalize long completions)
+length_shaping:
+  target_ratio: 0.1 # 10% of max_tokens; auto scales with max_tokens
+  min_target_tokens: 256 # lower clamp
+  max_target_tokens: 2048 # upper clamp
+  slope: 0.001 # penalty per token beyond target
+  max_penalty: 0.2 # clamp absolute penalty
+  bonus_on_short_correct: 0.05 # bonus if correct and concise
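The `length_shaping` block reads as: derive a token budget from `max_tokens`, clamp it between the two target bounds, then charge `slope` per token over budget up to `max_penalty` (plus a small bonus for short correct answers). A hedged sketch of those semantics; the actual shaping code lives outside this hunk, so treat this as an illustration of the config, not the implementation:

```python
def length_penalty(n_tokens: int, max_tokens: int = 8192) -> float:
    # Assumed combination of the length_shaping knobs above.
    target = min(max(0.1 * max_tokens, 256), 2048)  # target_ratio with clamps
    overshoot = max(0, n_tokens - target)
    return min(0.2, 0.001 * overshoot)  # slope per extra token, capped at max_penalty

# With max_tokens=8192 the target is ~819 tokens; a 1500-token completion is
# penalized by the 0.2 cap, while a 900-token one loses only ~0.08.
```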
diff --git a/pipelinerl/domains/mcp/__init__.py b/pipelinerl/domains/mcp/__init__.py
index a47458a5..4218ca1b 100644
--- a/pipelinerl/domains/mcp/__init__.py
+++ b/pipelinerl/domains/mcp/__init__.py
@@ -1,2 +1,2 @@
 from .rollouts import generate_mcp_rollout
-from .env_server import MCPEnvironmentServer
\ No newline at end of file
+from .env_server import EmbeddedMCPEnvironment, MCPEnvironmentServer, EmbeddedEnvironmentWorker

diff --git a/pipelinerl/domains/mcp/env_server.py b/pipelinerl/domains/mcp/env_server.py
index fabc5af2..2298e5cd 100644
--- a/pipelinerl/domains/mcp/env_server.py
+++ b/pipelinerl/domains/mcp/env_server.py
@@ -1,19 +1,361 @@
-import os
-from tapeagents.remote_environment import EnvironmentServer
-from omegaconf import OmegaConf
-from typing import List
-from fastapi import HTTPException
-from pydantic import BaseModel
-import logging
 import asyncio
+import atexit
+import inspect
+import json
+import logging
+import os
+import re
+import threading
+import time
+import traceback
 from concurrent.futures import ProcessPoolExecutor
+from contextlib import asynccontextmanager
 from functools import partial
+from typing import Any, AsyncIterator, List
+
+import multiprocessing
+
+from fastapi import HTTPException
+from hydra.utils import instantiate
+from omegaconf import DictConfig, OmegaConf
+from pydantic import BaseModel
+from tapeagents.core import Action, Observation
+from tapeagents.environment import Environment
+from tapeagents.mcp import MCPClient, MCPEnvironment, NoTool
+from tapeagents.remote_environment import EnvironmentServer
+from tapeagents.tool_calling import FunctionSpec, ToolCallAction, ToolResult, ToolSpec
+from mcp.types import CallToolResult, TextContent
 
 from pipelinerl.domains.math.verifier_api import verify_answer
+from pipelinerl.domains.mcp.steps import MathAnswer
 
 logger = logging.getLogger(__name__)
 
+_CONNECTION_ERROR_PATTERNS = (
+    "closedresourceerror",
+    "brokenresourceerror",
+    "broken pipe",
+    "connectionreseterror",
+    "timed out while waiting for response",
+)
+
+
+_MCP_WORKER_STATE: dict[str, Any] | None = None
+
+
+def _shutdown_mcp_worker() -> None:
+    global _MCP_WORKER_STATE
+    if not _MCP_WORKER_STATE:
+        return
+    loop: asyncio.AbstractEventLoop = _MCP_WORKER_STATE["loop"]
+    client: MCPClient = _MCP_WORKER_STATE["client"]
+    try:
+        loop.run_until_complete(client.close())
+    except Exception:
+        logger.warning("Failed to close MCP client in worker", exc_info=True)
+    finally:
+        loop.close()
+        _MCP_WORKER_STATE = None
+
+
+def _initialize_mcp_worker(
+    config_path: str,
+    tools_whitelist: list[str] | tuple[str, ...] | None,
+    use_cache: bool,
+    read_timeout_seconds: int,
+) -> None:
+    """Initializer for the ProcessPool workers that own MCP runtimes."""
+    global _MCP_WORKER_STATE
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    client = MCPClient(
+        config_path=config_path,
+        use_cache=use_cache,
+        read_timeout_seconds=read_timeout_seconds,
+    )
+    loop.run_until_complete(client.start_servers())
+    _MCP_WORKER_STATE = {
+        "loop": loop,
+        "client": client,
+        "tools_whitelist": list(tools_whitelist or []),
+    }
+    atexit.register(_shutdown_mcp_worker)
+
+
+def _call_tool_in_worker(tool_name: str, tool_arguments: Any) -> dict[str, Any]:
+    """Execute an MCP tool call inside a worker process."""
+    if not _MCP_WORKER_STATE:
+        raise RuntimeError("MCP worker not initialized")
+    loop: asyncio.AbstractEventLoop = _MCP_WORKER_STATE["loop"]
+    client: MCPClient = _MCP_WORKER_STATE["client"]
+    whitelist: list[str] = _MCP_WORKER_STATE.get("tools_whitelist", [])
+    if whitelist and tool_name not in whitelist:
+        raise NoTool(f"Tool {tool_name} not allowed by whitelist")
+    result = loop.run_until_complete(client.call_tool(tool_name, tool_arguments))
+    return result.model_dump(exclude_none=True)
+
+
+class _RemoteCallError(RuntimeError):
+    def __init__(self, message: str, details: dict[str, Any] | None = None) -> None:
+        super().__init__(message)
+        self.details = details or {}
+
+
+def _invoke_environment_method(
+    environment: Environment,
+    method_name: str,
+    args: tuple[Any, ...],
+    kwargs: dict[str, Any],
+    loop: asyncio.AbstractEventLoop,
+) -> Any:
+    attr = getattr(environment, method_name)
+    if inspect.iscoroutinefunction(attr):
+        return loop.run_until_complete(attr(*args, **kwargs))
+    result = attr(*args, **kwargs)
+    if inspect.isawaitable(result):
+        return loop.run_until_complete(result)
+    return result
+
+
+def _environment_process_main(env_cfg_container: dict[str, Any], conn) -> None:
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    try:
+        env_cfg = OmegaConf.create(env_cfg_container)
+        environment: Environment = instantiate(env_cfg)
+    except Exception:
+        conn.send(
+            (
+                "exception",
+                {
+                    "type": "EnvironmentBootstrapError",
+                    "message": "Failed to instantiate environment",
+                    "traceback": traceback.format_exc(),
+                },
+            )
+        )
+        conn.close()
+        loop.close()
+        return
+
+    async_methods = {
+        name
+        for name in ("ainitialize", "areset", "aclose", "astep", "areact")
+        if hasattr(environment, name) and inspect.iscoroutinefunction(getattr(environment, name))
+    }
+    sync_methods = {
+        name
+        for name in (
+            "initialize",
+            "reset",
+            "close",
+            "start_task",
+            "actions",
+            "tools_description",
+            "mark_healthy",
+            "is_healthy",
+            "step",
+            "react",
+        )
+        if callable(getattr(environment, name, None))
+    }
+
+    conn.send(("capabilities", {"sync": list(sync_methods), "async": list(async_methods)}))
+
+    running = True
+    while running:
+        try:
+            message = conn.recv()
+        except EOFError:
+            break
+        if not isinstance(message, tuple) or len(message) != 3:
+            continue
+        command, args, kwargs = message
+        if command == "__shutdown__":
+            running = False
+            conn.send(("ok", None))
+            break
+        try:
+            result = _invoke_environment_method(environment, command, args, kwargs, loop)
+            conn.send(("ok", result))
+        except Exception as exc:
+            conn.send(
+                (
+                    "exception",
+                    {
+                        "type": exc.__class__.__name__,
+                        "message": str(exc),
+                        "traceback": traceback.format_exc(),
+                    },
+                )
+            )
+
+    try:
+        if "aclose" in async_methods:
+            loop.run_until_complete(environment.aclose())
+        elif "close" in sync_methods:
+            environment.close()
+    except Exception:
+        logger.debug("Failed to close environment during shutdown", exc_info=True)
+    finally:
+        conn.close()
+        loop.close()
+
+
+class _ProcessEnvironmentProxy:
+    def __init__(self, env_cfg: DictConfig):
+        self._ctx = multiprocessing.get_context("spawn")
+        self._parent_conn, child_conn = self._ctx.Pipe()
+        cfg_container = OmegaConf.to_container(env_cfg, resolve=True)
+        self._process = self._ctx.Process(
+            target=_environment_process_main,
+            args=(cfg_container, child_conn),
+        )
+        self._process.daemon = False
+        self._process.start()
+        self._lock = threading.Lock()
+        self._closed = False
+        try:
+            status, payload = self._parent_conn.recv()
+        except EOFError as error:
+            raise _RemoteCallError("Environment process terminated prematurely") from error
+        if status == "exception":
+            raise _RemoteCallError(payload.get("message", "Environment bootstrap failed"), payload)
+        if status != "capabilities":
+            raise _RemoteCallError("Unexpected handshake from environment process")
+        self._sync_methods = set(payload.get("sync", []))
+        self._async_methods = set(payload.get("async", []))
+
+    def supports_async(self, name: str) -> bool:
+        return name in self._async_methods
+
+    def supports_sync(self, name: str) -> bool:
+        return name in self._sync_methods
+
+    def _ensure_alive(self) -> None:
+        if self._closed:
+            raise _RemoteCallError("Environment proxy is closed")
+        if not self._process.is_alive():
+            raise _RemoteCallError("Environment process died unexpectedly")
+
+    def _call_remote(self, method: str, *args: Any, **kwargs: Any) -> Any:
+        self._ensure_alive()
+        with self._lock:
+            try:
+                self._parent_conn.send((method, args, kwargs))
+                status, payload = self._parent_conn.recv()
+            except EOFError as error:
+                raise _RemoteCallError("Lost connection to environment process") from error
+        if status == "ok":
+            return payload
+        if status == "exception":
+            raise _RemoteCallError(payload.get("message", "Remote call failed"), payload)
+        raise _RemoteCallError(f"Unexpected response type: {status}")
+
+    def start_task(self, task: dict) -> dict:
+        return self._call_remote("start_task", task)
+
+    def actions(self) -> tuple[type[Action], ...]:
+        return tuple(self._call_remote("actions"))
+
+    def tools_description(self) -> str:
+        return self._call_remote("tools_description")
+
+    def initialize(self):
+        if self.supports_sync("initialize"):
+            return self._call_remote("initialize")
+        if self.supports_async("ainitialize"):
+            return self._call_remote("ainitialize")
+        return None
+
+    async def ainitialize(self) -> None:
+        loop = asyncio.get_running_loop()
+        await loop.run_in_executor(None, self.initialize)
+
+    def reset(self) -> None:
+        if self.supports_sync("reset"):
+            self._call_remote("reset")
+        elif self.supports_async("areset"):
+            self._call_remote("areset")
+
+    async def areset(self) -> None:
+        loop = asyncio.get_running_loop()
+        await loop.run_in_executor(None, self.reset)
+
+    def step(self, action: Action) -> Observation:
+        if self.supports_sync("step"):
+            return self._call_remote("step", action)
+        if self.supports_async("astep"):
+            return self._call_remote("astep", action)
+        raise _RemoteCallError("Remote environment does not support step or astep")
+
+    async def astep(self, action: Action) -> Observation:
+        loop = asyncio.get_running_loop()
+        return await loop.run_in_executor(None, self.step, action)
+
+    def react(self, tape) -> Any:
+        if self.supports_sync("react"):
+            return self._call_remote("react", tape)
+        if self.supports_async("areact"):
+            return self._call_remote("areact", tape)
+        raise _RemoteCallError("Remote environment does not support react or areact")
+
+    async def areact(self, tape) -> Any:
+        loop = asyncio.get_running_loop()
+        return await loop.run_in_executor(None, self.react, tape)
+
+    def mark_healthy(self) -> None:
+        if self.supports_sync("mark_healthy"):
+            self._call_remote("mark_healthy")
+
+    def is_healthy(self) -> bool:
+        if self.supports_sync("is_healthy"):
+            return bool(self._call_remote("is_healthy"))
+        return True
+
+    def close(self) -> None:
+        if self._closed:
+            return
+        try:
+            if self.supports_sync("close"):
+                self._call_remote("close")
+            elif self.supports_async("aclose"):
+                self._call_remote("aclose")
+        except _RemoteCallError:
+            logger.debug("Remote close failed", exc_info=True)
+        finally:
+            self._shutdown()
+
+    async def aclose(self) -> None:
+        loop = asyncio.get_running_loop()
+        await loop.run_in_executor(None, self.close)
+
+    def _shutdown(self) -> None:
+        if self._closed:
+            return
+        try:
+            with self._lock:
+                if self._process.is_alive():
+                    self._parent_conn.send(("__shutdown__", (), {}))
+                    try:
+                        self._parent_conn.recv()
+                    except EOFError:
+                        pass
+        except Exception:
+            logger.debug("Failed to send shutdown to environment process", exc_info=True)
+        finally:
+            self._parent_conn.close()
+            self._process.join(timeout=5)
+            if self._process.is_alive():
+                self._process.terminate()
+            self._closed = True
+
+    def __del__(self) -> None:
+        try:
+            self._shutdown()
+        except Exception:
+            pass
+
 
 class EnvironmentServerWithVerifier(EnvironmentServer):
     """Environment server that includes the verify_answer endpoint."""
@@ -99,3 +441,595 @@ def launch(self, port: int):
             "read_timeout_seconds": self.mcp_read_timeout_seconds,
         }))
 
+class EmbeddedMCPEnvironment(MCPEnvironment):
+    def __init__(
+        self,
+        *args,
+        math_answer_description: str = "Submit the final answer in LaTeX \\boxed{} format.",
+        **kwargs,
+    ) -> None:
+        config_path = kwargs.get("config_path", "")
+        use_cache = kwargs.get("use_cache", False)
+        read_timeout_seconds = kwargs.get("read_timeout_seconds", 10)
+        runtime_pool_workers = kwargs.pop("runtime_pool_workers", 0)
+        offload_tools = tuple(kwargs.pop("offload_tools", ()))
+
+        super().__init__(*args, **kwargs)
+        self._broken = False
+        self._last_failure_reason: str | None = None
+        self._runtime_guard_installed: bool = False
+        self._runtime_pool: ProcessPoolExecutor | None = None
+        self._runtime_pool_lock = threading.Lock()
+        self._runtime_pool_workers = runtime_pool_workers
+        self._offload_tools = set(offload_tools)
+        self._config_path = getattr(self.client, "config_path", config_path)
+        self._use_cache = getattr(self.client, "use_cache", use_cache)
+        self._read_timeout_seconds = getattr(self.client, "read_timeout_seconds", read_timeout_seconds)
+
+        # try to catch time wasting patterns before execution
+        self._python_blocklist = (
+            (re.compile(r"\bsys\s*\.\s*exit\s*\(", re.IGNORECASE), "sys.exit"),
+            (re.compile(r"\bos\s*\.\s*_exit\s*\(", re.IGNORECASE), "os._exit"),
+            (re.compile(r"\bexit\s*\(", re.IGNORECASE), "exit"),
+            (re.compile(r"\bquit\s*\(", re.IGNORECASE), "quit"),
+            (re.compile(r"raise\s+systemexit", re.IGNORECASE), "raise SystemExit"),
+            (re.compile(r"from\s+sys\s+import\s+exit", re.IGNORECASE), "from sys import exit"),
+            (
+                re.compile(r"__import__\s*\(\s*['\"]os['\"]\s*\)\s*\.\s*_exit", re.IGNORECASE),
+                "__import__('os')._exit",
+            ),
+            (
+                re.compile(r"__import__\s*\(\s*['\"]sys['\"]\s*\)\s*\.\s*exit", re.IGNORECASE),
+                "__import__('sys').exit",
+            ),
+        )
+        self._math_answer_spec = ToolSpec(
+            function=FunctionSpec(
+                name="MathAnswer",
+                description=math_answer_description,
+                parameters={
+                    "type": "object",
+                    "properties": {
+                        "answer": {
+                            "type": "string",
+                            "description": "Final answer expressed in LaTeX \\boxed{} format.",
+                        }
+                    },
+                    "required": ["answer"],
+                },
+            )
+        )
+
+    def initialize(self):
+        super().initialize()
+        self._reset_health()
+        self._ensure_math_answer_tool()
+
+    async def ainitialize(self) -> None:
+        self.loop = asyncio.get_running_loop()
+        await super().ainitialize()
+        self._reset_health()
+        self._ensure_math_answer_tool()
+        await self._install_runtime_guard()
+
+    def actions(self):
+        base_actions = super().actions()
+        if not any(
+            getattr(action, "function", None) and action.function.name == "MathAnswer"
+            for action in base_actions
+        ):
+            base_actions = base_actions + (self._math_answer_spec,)
+        return base_actions
+
+    def _should_offload(self, tool_name: str) -> bool:
+        return bool(self._runtime_pool_workers) and tool_name in self._offload_tools
+
+    def _ensure_runtime_pool(self) -> ProcessPoolExecutor:
+        if self._runtime_pool is not None:
+            return self._runtime_pool
+        with self._runtime_pool_lock:
+            if self._runtime_pool is not None:
+                return self._runtime_pool
+            cpu_count = os.cpu_count() or 1
+            default_workers = max(1, cpu_count // 2)
+            max_workers = self._runtime_pool_workers or default_workers
+            whitelist = tuple(self.tools_whitelist) if getattr(self, "tools_whitelist", None) else tuple()
+            self._runtime_pool = ProcessPoolExecutor(
+                max_workers=max_workers,
+                initializer=_initialize_mcp_worker,
+                initargs=(
+                    self._config_path,
+                    whitelist,
+                    bool(self._use_cache),
+                    int(self._read_timeout_seconds),
+                ),
+            )
+        return self._runtime_pool
+
+    @staticmethod
+    def _make_error_call_result(tool_name: str, message: str) -> CallToolResult:
+        return CallToolResult(
+            content=[TextContent(type="text", text=message)],
+            isError=True,
+        )
+
+    def _resolve_pool_future_sync(self, future, tool_name: str) -> CallToolResult:
+        try:
+            payload = future.result()
+            return CallToolResult.model_validate(payload)
+        except NoTool:
+            logger.exception(f"Tool {tool_name} not found in MCP client")
+            return self._make_error_call_result(tool_name, f"Tool {tool_name} not found")
+        except KeyError as error:
+            logger.exception(f"KeyError when executing MCP tool call: {error}")
+            return self._make_error_call_result(
+                tool_name, f"Error executing tool {tool_name}: KeyError {error}"
+            )
+        except Exception as error:
+            logger.exception(f"Error executing MCP tool call: {error}")
+            return self._make_error_call_result(
+                tool_name, f"Error executing tool {tool_name}: {error}"
+            )
+
+    async def _resolve_pool_future_async(self, future, tool_name: str) -> CallToolResult:
+        try:
+            payload = await asyncio.wrap_future(future)
+            return CallToolResult.model_validate(payload)
+        except NoTool:
+            logger.exception(f"Tool {tool_name} not found in MCP client")
+            return self._make_error_call_result(tool_name, f"Tool {tool_name} not found")
+        except KeyError as error:
+            logger.exception(f"KeyError when executing MCP tool call: {error}")
+            return self._make_error_call_result(
+                tool_name, f"Error executing tool {tool_name}: KeyError {error}"
+            )
+        except Exception as error:
logger.exception(f"Error executing MCP tool call: {error}") + return self._make_error_call_result( + tool_name, f"Error executing tool {tool_name}: {error}" + ) + + def _shutdown_runtime_pool(self) -> None: + if self._runtime_pool is not None: + self._runtime_pool.shutdown(wait=True) + self._runtime_pool = None + + def _execute_tool_via_pool_sync(self, action: ToolCallAction) -> ToolResult: + start = time.perf_counter() + future = self._ensure_runtime_pool().submit( + _call_tool_in_worker, + action.function.name, + action.function.arguments, + ) + call_result = self._resolve_pool_future_sync(future, action.function.name) + observation = ToolResult(tool_call_id=getattr(action, "id", ""), content=call_result) + observation.metadata.other["action_execution_time"] = time.perf_counter() - start + observation.metadata.other["action_kind"] = action.kind + return observation + + async def _execute_tool_via_pool_async(self, action: ToolCallAction) -> ToolResult: + start = time.perf_counter() + future = self._ensure_runtime_pool().submit( + _call_tool_in_worker, + action.function.name, + action.function.arguments, + ) + call_result = await self._resolve_pool_future_async(future, action.function.name) + observation = ToolResult(tool_call_id=getattr(action, "id", ""), content=call_result) + observation.metadata.other["action_execution_time"] = time.perf_counter() - start + observation.metadata.other["action_kind"] = action.kind + return observation + + def step(self, action: Action) -> Observation: + if not isinstance(action, ToolCallAction): + return super().step(action) + + outcome, message = self._precheck_tool_action(action) + if outcome == "math_answer": + return self._create_math_answer(action) + if outcome == "error": + return self._make_error_tool_result(action, message or "") + + try: + observation = self._execute_tool_call_sync(action) + except BaseException: + self._broken = True + raise + + return self._postprocess_after_tool(action, observation) + + async def astep(self, action: Action) -> Observation: + if not isinstance(action, ToolCallAction): + return await super().astep(action) + + outcome, message = self._precheck_tool_action(action) + if outcome == "math_answer": + return self._create_math_answer(action) + if outcome == "error": + return self._make_error_tool_result(action, message or "") + + try: + observation = await self._execute_tool_call_async(action) + except BaseException: + self._broken = True + raise + + return self._postprocess_after_tool(action, observation) + + def _precheck_tool_action(self, action: ToolCallAction) -> tuple[str, str | None]: + if action.function.name == "MathAnswer": + return "math_answer", None + if self._broken: + return "error", self._backend_unavailable_message() + if action.function.name == "run_python_code": + block_message = self._check_python_safety(action.function.arguments) + if block_message is not None: + return "error", block_message + return "ok", None + + def _execute_tool_call_sync(self, action: ToolCallAction) -> Observation: + if self._should_offload(action.function.name): + return self._execute_tool_via_pool_sync(action) + return super().step(action) + + async def _execute_tool_call_async(self, action: ToolCallAction) -> Observation: + if self._should_offload(action.function.name): + return await self._execute_tool_via_pool_async(action) + return await super().astep(action) + + def _postprocess_after_tool( + self, + action: ToolCallAction, + observation: Observation, + ) -> Observation: + if action.function.name != "MathAnswer": + 
return self._postprocess_tool_observation(action, observation) + return observation + + def _ensure_math_answer_tool(self) -> None: + if not any( + getattr(tool, "function", None) and tool.function.name == "MathAnswer" + for tool in self.tools + ): + self.tools.append(self._math_answer_spec) + + def _reset_health(self) -> None: + self._broken = False + self._last_failure_reason = None + self._runtime_guard_installed = False + + def _create_math_answer(self, action: ToolCallAction) -> MathAnswer: + answer_value = self._extract_answer(action.function.arguments) + math_answer = MathAnswer(answer=answer_value) + math_answer.metadata.other.update({ + "action_kind": "MathAnswer", + "tool_call_id": getattr(action, "id", ""), + "action_execution_time": 0.0, + }) + return math_answer + + def mark_healthy(self) -> None: + self._reset_health() + + def is_healthy(self) -> bool: + return not self._broken + + def close(self) -> None: + self._shutdown_runtime_pool() + super().close() + + async def aclose(self) -> None: + self._shutdown_runtime_pool() + await super().aclose() + + @staticmethod + def _guard_snippet() -> str: + """generate Python code that installs safety guards""" + return ( + "import builtins, sys, os, time, atexit\n" + "try:\n" + " _PIPELINERL_TIME_LIMIT = float(os.environ.get('PIPELINERL_PY_TIMEOUT', '30'))\n" + "except ValueError:\n" + " _PIPELINERL_TIME_LIMIT = 30.0\n" + "_PIPELINERL_START = time.perf_counter()\n" + "class _ExitBlocked(RuntimeError):\n" + " pass\n" + "def _blocked_exit(*_args, **_kwargs):\n" + " raise _ExitBlocked('exit() and os._exit() are disabled in this environment.')\n" + "for _target in (builtins, sys):\n" + " for _name in ('exit', 'quit'):\n" + " if hasattr(_target, _name):\n" + " setattr(_target, _name, _blocked_exit)\n" + "if hasattr(os, '_exit'):\n" + " os._exit = _blocked_exit\n" + "def _pipelinerl_trace(frame, event, arg):\n" + " if event == 'line' and (time.perf_counter() - _PIPELINERL_START) > _PIPELINERL_TIME_LIMIT:\n" + " sys.settrace(None)\n" + " raise RuntimeError(f'Python execution timed out after {_PIPELINERL_TIME_LIMIT} seconds.')\n" + " return _pipelinerl_trace\n" + "sys.settrace(_pipelinerl_trace)\n" + "atexit.register(lambda: sys.settrace(None))\n" + ) + + async def _install_runtime_guard(self) -> None: + """Install runtime safety guard in the Python environment.""" + if self._runtime_guard_installed or not getattr(self, "client", None): + return + try: + snippet = self._guard_snippet() + if self._should_offload("run_python_code"): + future = self._ensure_runtime_pool().submit( + _call_tool_in_worker, + "run_python_code", + {"python_code": snippet}, + ) + await self._resolve_pool_future_async(future, "run_python_code") + else: + await self.client.call_tool( + "run_python_code", + {"python_code": snippet}, + ) + self._runtime_guard_installed = True + logger.debug("Runtime guard installed successfully") + except Exception: + logger.warning("Failed to install runtime guard in MCP environment", exc_info=True) + + def _postprocess_tool_observation( + self, + action: ToolCallAction, + observation: Observation, + ) -> Observation: + if not isinstance(observation, ToolResult): + return observation + call_result = observation.content + if not isinstance(call_result, CallToolResult): + return observation + if not getattr(call_result, "isError", False): + return observation + error_text = self._extract_call_result_text(call_result) + if not self._is_connection_error_message(error_text): + return observation + logger.warning( + "MCP backend failure 
detected for tool %s: %s", + action.function.name, + error_text, + ) + return self._handle_connection_failure(action, observation, error_text) + + @staticmethod + def _extract_call_result_text(call_result: CallToolResult) -> str: + if not isinstance(call_result.content, list): + return "" + parts: list[str] = [] + for block in call_result.content: + if isinstance(block, TextContent) and isinstance(block.text, str): + parts.append(block.text) + return "\n".join(parts).strip() + + @staticmethod + def _is_connection_error_message(message: str) -> bool: + lowered = message.lower() + return any(pattern in lowered for pattern in _CONNECTION_ERROR_PATTERNS) + + def _handle_connection_failure( + self, + action: ToolCallAction, + observation: ToolResult, + error_text: str, + ) -> ToolResult: + """Mark environment as broken and update observation.""" + self._broken = True + failure_message = ( + "Python tool backend became unavailable (connection lost). " + "Environment will restart after this attempt; stop issuing additional tool calls." + ) + if error_text: + failure_message = f"{failure_message}\nOriginal error: {error_text}" + + observation.content = CallToolResult( + content=[TextContent(type="text", text=failure_message)], + isError=True, + ) + observation.metadata.other.setdefault("action_execution_time", observation.metadata.other.get("action_execution_time", 0.0)) + observation.metadata.other["connection_failure"] = True + observation.metadata.other["original_error"] = error_text + self._last_failure_reason = failure_message + return observation + + def _backend_unavailable_message(self) -> str: + """Get message for unavailable backend.""" + return self._last_failure_reason or ( + "Python tool backend is restarting after a connection failure. " + "Abort this attempt and wait for a fresh environment." + ) + + @staticmethod + def _extract_answer(arguments: dict | str | None) -> str: + """Extract answer string from arguments.""" + if arguments is None: + return "" + if isinstance(arguments, str): + try: + parsed = json.loads(arguments) + return str(parsed.get("answer", "")) if isinstance(parsed, dict) else str(parsed) + except json.JSONDecodeError: + return arguments + if isinstance(arguments, dict): + return str(arguments.get("answer", "")) + return str(arguments) + + def _check_python_safety(self, arguments: dict | str | None) -> str | None: + """check for Python code problems""" + code = self._extract_python_code(arguments) + if not code: + return None + for pattern, label in self._python_blocklist: + if pattern.search(code): + return ( + f"Python execution rejected: forbidden call detected ({label}). " + "Use pure computation without exiting the runtime." 
+ ) + return None + + @staticmethod + def _extract_python_code(arguments: dict | str | None) -> str: + if arguments is None: + return "" + if isinstance(arguments, str): + try: + parsed = json.loads(arguments) + if isinstance(parsed, dict): + return str(parsed.get("python_code", parsed.get("code", ""))) + return str(parsed) + except json.JSONDecodeError: + return arguments + if isinstance(arguments, dict): + return str(arguments.get("python_code", arguments.get("code", ""))) + return str(arguments) + + def _make_error_tool_result(self, action: ToolCallAction, message: str) -> ToolResult: + result = CallToolResult( + content=[TextContent(type="text", text=message)], + isError=True, + ) + tool_result = ToolResult( + tool_call_id=getattr(action, "id", ""), + content=result, + ) + tool_result.metadata.other["action_execution_time"] = 0.0 + tool_result.metadata.other["action_kind"] = action.kind + return tool_result + + +class EmbeddedEnvironmentWorker: + def __init__(self, env_cfg: DictConfig, concurrency: int = 1): + # make repeated instantiations stable even if the caller changes its copy + self._env_cfg = OmegaConf.create(env_cfg) + self._cfg_signature = self._make_cfg_signature(self._env_cfg) + self._concurrency = max(1, concurrency) + self._init_lock = asyncio.Lock() + self._available: asyncio.Queue[_ProcessEnvironmentProxy] | None = None + self._all_envs: set[_ProcessEnvironmentProxy] = set() + + @staticmethod + def _make_cfg_signature(cfg: DictConfig) -> str: + try: + container = OmegaConf.to_container(cfg, resolve=True) + except Exception: + container = OmegaConf.to_container(cfg, resolve=False) + return json.dumps(container, sort_keys=True, default=str) + + @property + def concurrency(self) -> int: + return self._concurrency + + def matches(self, env_cfg: DictConfig) -> bool: + return self._cfg_signature == self._make_cfg_signature(env_cfg) + + def set_concurrency(self, concurrency: int) -> None: + self._concurrency = max(1, concurrency) + + async def _ensure_pool(self) -> None: + if self._available is None: + self._available = asyncio.Queue() + if len(self._all_envs) >= self._concurrency: + return + async with self._init_lock: + if len(self._all_envs) >= self._concurrency: + return + missing = self._concurrency - len(self._all_envs) + for _ in range(missing): + environment = _ProcessEnvironmentProxy(self._env_cfg) + try: + await self._init_and_reset(environment) + except Exception: + logger.exception("Failed to initialize embedded environment instance") + await self._close(environment) + raise + self._all_envs.add(environment) + await self._available.put(environment) + + @asynccontextmanager + async def alifecycle(self) -> AsyncIterator[Environment]: + """Context manager for environment lifecycle with automatic health checking.""" + await self._ensure_pool() + assert self._available is not None + + environment = await self._available.get() + try: + await self._reset(environment) + yield environment + finally: + try: + unhealthy = ( + hasattr(environment, "is_healthy") + and not environment.is_healthy() # type: ignore + ) + except Exception: + logger.warning("Failed to query embedded environment health; replacing", exc_info=True) + unhealthy = True + is_healthy = not unhealthy + + if is_healthy: + # try to reset and recycle healthy environment + try: + await self._reset(environment) + if hasattr(environment, "mark_healthy"): + environment.mark_healthy() # type: ignore + await self._available.put(environment) + except Exception: + logger.exception("Failed to recycle embedded 
environment; replacing") + await self._replace(environment) + else: + # environment is unhealthy, replace it + logger.warning("Embedded environment is unhealthy, replacing") + await self._replace(environment) + + async def _replace(self, environment: Environment) -> None: + """Replace a broken environment with a new one.""" + if environment in self._all_envs: + self._all_envs.remove(environment) + try: + await self._close(environment) + except Exception: + logger.exception("Failed to close environment during replacement") + # Refill the pool + await self._ensure_pool() + + async def _init_and_reset(self, env: Environment) -> None: + # init + if hasattr(env, "ainitialize") and inspect.iscoroutinefunction(env.ainitialize): + await env.ainitialize() # type: ignore + else: + loop = asyncio.get_running_loop() + await loop.run_in_executor(None, env.initialize) + + # reset + await self._reset(env) + + async def _reset(self, env: Environment) -> None: + if hasattr(env, "areset") and inspect.iscoroutinefunction(env.areset): + await env.areset() # type: ignore + else: + reset_fn = getattr(env, "reset", None) + if callable(reset_fn): + loop = asyncio.get_running_loop() + await loop.run_in_executor(None, reset_fn) + + async def _close(self, env: Environment) -> None: + loop = asyncio.get_running_loop() + + # try async close first + if hasattr(env, "aclose") and inspect.iscoroutinefunction(env.aclose): + try: + await env.aclose() # type: ignore + return + except Exception as e: + logger.debug(f"Async close failed: {e}, trying sync close") + + # fallback to sync close + try: + await loop.run_in_executor(None, env.close) + except Exception as e: + logger.debug(f"Sync close failed: {e}") diff --git a/pipelinerl/domains/mcp/rollouts.py b/pipelinerl/domains/mcp/rollouts.py index cd82e351..f62f0567 100644 --- a/pipelinerl/domains/mcp/rollouts.py +++ b/pipelinerl/domains/mcp/rollouts.py @@ -1,31 +1,46 @@ import asyncio -from urllib.parse import urlparse import time import random import logging from collections import Counter -from typing import List, Dict +from typing import Dict, List import aiohttp +from urllib.parse import urlparse from omegaconf import DictConfig from pipelinerl.domains.mcp.steps import MathAnswer from pipelinerl.world import Job from tapeagents.llms.trainable import TrainableLLM -from tapeagents.remote_environment import AsyncRemoteEnvironment from pipelinerl.async_llm import make_training_text +from tapeagents.environment import Environment from tapeagents.orchestrator import async_execute_agent from tapeagents.agent import DEFAULT, Agent from hydra.utils import instantiate from tapeagents.core import Tape from tapeagents.dialog_tape import UserStep from tapeagents.core import LLMCall +from tapeagents.remote_environment import AsyncRemoteEnvironment -from pipelinerl.domains.math import verify_answer_rpc, RewardTable, get_reward +from pipelinerl.domains.mcp.env_server import EmbeddedEnvironmentWorker +from pipelinerl.domains.math import RewardTable, get_reward, verify_answer, verify_answer_rpc from pipelinerl.rollouts import RolloutResult, BaseMetrics logger = logging.getLogger(__name__) +_embedded_worker: EmbeddedEnvironmentWorker | None = None + + +def _get_embedded_worker(env_cfg: DictConfig, concurrency: int) -> EmbeddedEnvironmentWorker: + global _embedded_worker + concurrency = max(1, concurrency) + if _embedded_worker is None or not _embedded_worker.matches(env_cfg): + _embedded_worker = EmbeddedEnvironmentWorker(env_cfg, concurrency=concurrency) + else: + 
_embedded_worker.set_concurrency(concurrency) + return _embedded_worker + + def count_tool_calls_by_category(llm_calls: List[LLMCall]) -> Dict[str, int]: """ Count the number of tool calls for each function name category. @@ -53,6 +68,7 @@ class Metrics(BaseMetrics): total_execution_time: float = -1.0 agent_execution_time: float = -1.0 environment_execution_time: float = -1.0 + overflow: bool = False async def generate_mcp_rollout( cfg: DictConfig, @@ -60,58 +76,90 @@ async def generate_mcp_rollout( problem: dict, session: aiohttp.ClientSession, ) -> RolloutResult: - # choose and retry env servers if one is saturated start = time.perf_counter() - env_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment"] - if not env_jobs: - raise RuntimeError("No environment servers available") - - # shuffle to avoid dead-locking a single server - env_urls_all = [f"http://{job.hostname}:{job.port}" for job in env_jobs if job.port is not None] - if not env_urls_all: - raise RuntimeError("Environment server definitions missing ports") - - while True: - env_urls = env_urls_all[:] - random.shuffle(env_urls) - chosen_url = None - for env_url in env_urls: - try: - environment = AsyncRemoteEnvironment( - server_url=env_url, start_timeout_sec=600, start_repeat_delay=5) - context_manager = environment.acontext(session, wait_for_env=True) - env = await context_manager.__aenter__() + + chosen_url: str | None = None + env_host: str | None = None + env_port: int | None = None + + if cfg.world.environment_mode == "remote": + env_jobs = [Job(**job) for job in cfg.jobs if job["kind"] == "environment"] + if not env_jobs: + raise RuntimeError("No environment servers available") + + env_urls_all = [f"http://{job.hostname}:{job.port}" for job in env_jobs if job.port is not None] + if not env_urls_all: + raise RuntimeError("Environment server definitions missing ports") + + while True: + env_urls = env_urls_all[:] + random.shuffle(env_urls) + chosen_url = None + for env_url in env_urls: + jitter = random.randint(3, 12) try: - await env.start_task(problem) - chosen_url = env_url - actions = await env.a_actions() - tools_description = await env.a_tools_description() - logger.debug(f"Available tools: {tools_description}") - agent: Agent = instantiate(cfg.agent, known_actions=actions, tools_description=tools_description) - agent.llms = {DEFAULT: llm} - - tape = Tape(steps=[ - UserStep(content=f"{problem['task']}. 
You have access to the following tools: {tools_description}") - ]) - t_exec = time.perf_counter() - while True: - try: - tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) - tape.metadata.result.update({"total_execution_time": time.perf_counter() - t_exec}) - break - except Exception: - await asyncio.sleep(5) - break # success - finally: - await context_manager.__aexit__(None, None, None) - except Exception as e: - # try the next server on errors (503: busyslots) - logger.warning(f"Env start failed at {env_url}: {e}") - continue - if chosen_url is not None: - break # success - # if none succeeded backoff and retry the whole list - await asyncio.sleep(1.0) + environment = AsyncRemoteEnvironment( + server_url=env_url, start_timeout_sec=600, start_repeat_delay=jitter) + context_manager = environment.acontext(session, wait_for_env=True) + env = await context_manager.__aenter__() + try: + await env.start_task(problem) + chosen_url = env_url + actions = await env.a_actions() + tools_description = await env.a_tools_description() + logger.debug(f"Available tools: {tools_description}") + agent: Agent = instantiate(cfg.agent, known_actions=actions, tools_description=tools_description) + agent.llms = {DEFAULT: llm} + + tape = Tape(steps=[ + UserStep(content=f"{problem['task']}. You have access to the following tools: {tools_description}") + ]) + t_exec = time.perf_counter() + while True: + try: + tape = await async_execute_agent(agent, tape, env, session, max_loops=cfg.agent_max_loops) + tape.metadata.result.update({"total_execution_time": time.perf_counter() - t_exec}) + break + except Exception: + await asyncio.sleep(5) + break # success + finally: + await context_manager.__aexit__(None, None, None) + except Exception as e: + logger.warning(f"Env start failed at {env_url}: {e}") + continue + if chosen_url is not None: + break # success + await asyncio.sleep(1.0) + + parsed = urlparse(chosen_url) + env_host, env_port = parsed.hostname, parsed.port + else: + concurrency = max(1, int(getattr(cfg.world, "env_replicas_per_actor", 1))) + env_worker = _get_embedded_worker(cfg.environment, concurrency) + async with env_worker.alifecycle() as environment: + start_result = environment.start_task(problem) + tape_metadata = start_result if isinstance(start_result, dict) else {} + + actions = environment.actions() + tools_description = environment.tools_description() + logger.debug(f"Embedded tools: {tools_description}") + agent: Agent = instantiate(cfg.agent, known_actions=actions, tools_description=tools_description) + agent.llms = {DEFAULT: llm} + tape = Tape( + steps=[ + UserStep( + content=f"{problem['task']}. 
You have access to the following tools: {tools_description}" + ) + ] + ) + if tape_metadata: + tape.metadata.other.update(tape_metadata) + + t_exec = time.perf_counter() + tape = await async_execute_agent(agent, tape, environment, session, max_loops=cfg.agent_max_loops) + tape.metadata.result.update({"total_execution_time": time.perf_counter() - t_exec}) + env_host = env_port = None reward_table = RewardTable(**dict(cfg.rewards)) @@ -125,21 +173,87 @@ async def generate_mcp_rollout( tool_call_counts = count_tool_calls_by_category(llm_calls) training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] n_llm_calls = len(llm_calls) - parsed = urlparse(chosen_url) - assert parsed.hostname is not None and parsed.port is not None - answer_status = await verify_answer_rpc( - session=session, - host=parsed.hostname, - port=parsed.port, - prediction=llm_calls[-1].output.content, # type: ignore - gold=problem["answer"], - strict=True, - ) + if env_host and env_port: + answer_status = await verify_answer_rpc( + session=session, + host=env_host, + port=env_port, + prediction=llm_calls[-1].output.content, # type: ignore + gold=problem["answer"], + strict=True, + ) + else: + answer_status = verify_answer( + prediction=llm_calls[-1].output.content, # type: ignore + gold=problem["answer"], + strict=True, + ) # Tape should finish with an answer tape_finished = True if isinstance(tape.steps[-1], MathAnswer) else False - reward = get_reward(answer_status, tape_finished, reward_table) + base_reward = get_reward(answer_status, tape_finished, reward_table) + + # Local reward shaping (configurable in conf/mcp.yaml) + total_shaping = 0.0 + shaping_cfg = getattr(cfg, "python_tool_shaping", None) + if shaping_cfg is not None: + num_python_calls = tool_call_counts.get("run_python_code", 0) + bonus_on_correct_with_python = float(getattr(shaping_cfg, "bonus_on_correct_with_python", 0.0)) + penalty_on_incorrect_without_python = float(getattr(shaping_cfg, "penalty_on_incorrect_without_python", 0.0)) + max_abs = float(getattr(shaping_cfg, "max_abs", 0.2)) + + # Episode-level bonuses/penalties + if answer_status == "correct" and num_python_calls >= 1: + total_shaping += bonus_on_correct_with_python + if answer_status in ("wrong", "unparsable") and num_python_calls == 0: + total_shaping -= penalty_on_incorrect_without_python + + # Clamp total shaping + if total_shaping > max_abs: + total_shaping = max_abs + if total_shaping < -max_abs: + total_shaping = -max_abs + + # Length shaping: discourage very long completions; award concise correct ones + length_cfg = getattr(cfg, "length_shaping", None) + if length_cfg is not None: + try: + # Prefer ratio-based target if provided; otherwise use absolute + if hasattr(length_cfg, "target_ratio"): + ratio = float(getattr(length_cfg, "target_ratio")) + max_gen = int(llm.parameters.get("max_tokens", 2048)) + target_tokens = int(max(1, ratio * max_gen)) + # Optional clamps + min_t = int(getattr(length_cfg, "min_target_tokens", 0)) + max_t = int(getattr(length_cfg, "max_target_tokens", 10**9)) + target_tokens = max(min_t, min(max_t, target_tokens)) + else: + target_tokens = int(getattr(length_cfg, "target_output_tokens", 512)) + slope = float(getattr(length_cfg, "slope", 0.0)) + max_penalty = float(getattr(length_cfg, "max_penalty", 0.0)) + bonus_short_correct = float(getattr(length_cfg, "bonus_on_short_correct", 0.0)) + except Exception: + target_tokens, slope, max_penalty, bonus_short_correct = 512, 0.0, 0.0, 0.0 + + # average output tokens across llm calls for this 
rollout + try: + avg_output_tokens = sum(t.output_tokens for t in training_texts) / max(1, len(training_texts)) + except Exception: + avg_output_tokens = 0.0 + + if slope > 0.0 and max_penalty > 0.0 and avg_output_tokens > target_tokens: + over_by = float(avg_output_tokens - target_tokens) + penalty = min(max_penalty, slope * over_by) + total_shaping -= penalty + + if bonus_short_correct > 0.0 and answer_status == "correct" and avg_output_tokens <= target_tokens: + total_shaping += bonus_short_correct + + reward = base_reward + total_shaping + + # Assign identical reward to all steps in the rollout (pipeline expects uniform rollout_reward) for text in training_texts: text.reward = reward + text.finished = tape_finished latency = time.perf_counter() - start @@ -159,6 +273,7 @@ async def generate_mcp_rollout( total_execution_time=total_time, agent_execution_time=agent_time, environment_execution_time=env_time, + overflow=not tape_finished, ) return RolloutResult( diff --git a/pipelinerl/domains/mcp/steps.py b/pipelinerl/domains/mcp/steps.py index f33d6efa..9b29a717 100644 --- a/pipelinerl/domains/mcp/steps.py +++ b/pipelinerl/domains/mcp/steps.py @@ -1,13 +1,13 @@ from typing import Any, Literal from pydantic import Field -from tapeagents.core import StopStep +from tapeagents.core import FinalObservation -class MathAnswer(StopStep): +class MathAnswer(FinalObservation): """ Action that indicates the agent has finished solving a math problem. The final answer must be contained within \\boxed{} format. """ kind: Literal["math_answer_action"] = "math_answer_action" - answer: Any = Field(description="Final answer in \\boxed{} format") \ No newline at end of file + answer: Any = Field(description="Final answer in \\boxed{} format") diff --git a/pipelinerl/launch.py b/pipelinerl/launch.py index e56c0e80..be5c8faf 100644 --- a/pipelinerl/launch.py +++ b/pipelinerl/launch.py @@ -1,6 +1,7 @@ import logging import math import os +import shlex import shutil import subprocess import sys @@ -157,6 +158,29 @@ def run_actor_llm( str(world_map.weight_update_group_size), ] + # Provide deterministic rendezvous port defaults when env vars are absent. + # vLLM spins up a torch.distributed TCPStore using VLLM_PORT. On the remote + # scheduler we observed replica crashes (store collisions, connection + # refused) because every start script inherited the same default port. By + # exporting VLLM_PORT_BASE/VLLM_PORT_STRIDE we carve out a rendezvous range + # per actor_idx while keeping the public HTTP listener at 8080+local_idx. + env = dict(os.environ) + if "VLLM_PORT_BASE" not in env: + # Each rank gets 1000 ports; 43000 leaves room below. 
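+        # Worked example (assuming these defaults and the stride logic in
+        # pipelinerl/vllm0.py): rank 0 -> base 43000, rank 1 -> base 44000;
+        # with VLLM_PORT_STRIDE=20, actor_idx 2 on rank 1 rendezvouses at
+        # VLLM_PORT = 44000 + 20 * 2 = 44040, so replicas never collide.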
+        env["VLLM_PORT_BASE"] = str(43000 + 1000 * world_map.my_rank)
+        logger.debug(
+            "Setting default VLLM_PORT_BASE=%s for rank %s",
+            env["VLLM_PORT_BASE"], world_map.my_rank,
+        )
+    if "VLLM_PORT_STRIDE" not in env:
+        env["VLLM_PORT_STRIDE"] = "20"
+
+    env_overrides = {
+        key: str(env[key])
+        for key in ("VLLM_PORT_BASE", "VLLM_PORT_STRIDE")
+        if key in env
+    }
+
     # Add vLLM kwargs as separate arguments
     if cfg.vllm_config.vllm_kwargs:
         for k, v in cfg.vllm_config.vllm_kwargs.items():
@@ -169,13 +193,13 @@
     gpu_str = ",".join([str(gpu) for gpu in gpus])

     logger.info(f"Running actor_llm with command: {' '.join(cmd)} on gpus: {gpu_str}")
-    save_command(log_dir, cmd)
+    save_command(log_dir, cmd, env_overrides or None)
     log_file_path = os.path.join(log_dir, "stdout.log")
     err_file_path = os.path.join(log_dir, "stderr.log")
     with open(log_file_path, "a") as log_file, open(err_file_path, "a") as err_file:
         yield _popen(
             cmd,
-            env={**os.environ, "CUDA_VISIBLE_DEVICES": gpu_str},
+            env={**env, "CUDA_VISIBLE_DEVICES": gpu_str},
             stdout=log_file,
             stderr=err_file,
         )
@@ -372,14 +396,21 @@ def run_redis(cfg: DictConfig):
     yield _popen(cmd, env=dict(os.environ))


-def save_command(script_dir: Path, cmd):
+def save_command(script_dir: Path, cmd, env: dict | None = None):
     os.makedirs(script_dir, exist_ok=True)
     script_path = script_dir / "start.sh"
     with open(script_path, "w") as f:
         f.write("#!/bin/bash\n")
+        f.write("set -e\n")
+        if env:
+            for key, value in sorted(env.items()):
+                quoted_value = shlex.quote(value)
+                f.write(f"export {key}={quoted_value}\n")
         # Properly quote arguments for the shell script
-        quoted_cmd = [f"'{arg}'" if " " in arg or "$" in arg else arg for arg in cmd]
-        f.write(" ".join(quoted_cmd) + "\n")
+        quoted_cmd = [shlex.quote(arg) for arg in cmd]
+        f.write("exec ")
+        f.write(" ".join(quoted_cmd))
+        f.write("\n")
     os.chmod(script_path, 0o755)
     logger.info(f"Saved start script to {script_path}")
diff --git a/pipelinerl/rl_tool_parser_plugin.py b/pipelinerl/rl_tool_parser_plugin.py
index 194a5d87..12e6fc2d 100644
--- a/pipelinerl/rl_tool_parser_plugin.py
+++ b/pipelinerl/rl_tool_parser_plugin.py
@@ -4,7 +4,8 @@
 import json
 import re
-from typing import Any, Dict, List, Optional, Union, Sequence
+from typing import Any  # noqa: F401
+import logging

 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ToolParser
 from vllm.entrypoints.openai.tool_parsers import ToolParserManager
@@ -20,6 +21,9 @@ class HermesRLToolParser(ToolParser):
     """
     Tool parser for RL tool calling format using <tool_call> markers.

+    Supports both standard format and Apriel-style formats:
+    - <tool_calls>[{...}, {...}]</tool_calls> (preferred if present)
+    - [BEGIN FINAL RESPONSE] ... [END FINAL RESPONSE] wrapper
     """

     def __init__(self, tokenizer):
@@ -34,6 +38,16 @@ def __init__(self, tokenizer):
             r"<tool_call>(.*?)</tool_call>|<tool_call>(.*)", re.DOTALL
         )

+        # Apriel-specific patterns
+        self.apriel_final_response_regex = re.compile(
+            r"\[BEGIN FINAL RESPONSE\](.*?)\[END FINAL RESPONSE\]", re.DOTALL
+        )
+        # Prefer parsing aggregated tool calls from <tool_calls>...</tool_calls>
+        # Be lenient: case-insensitive; tolerate missing closing tag by capturing to end.
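+        # Illustrative sketch of the Apriel-style output this pattern targets
+        # (the tool name and arguments below are hypothetical):
+        #   <tool_calls>[{"name": "run_python_code",
+        #                 "arguments": {"python_code": "print(1 + 1)"}}]</tool_calls>
+        # A truncated block without the closing tag is also accepted.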
+        self.apriel_tool_calls_regex = re.compile(
+            r"<tool_calls>\s*(.*?)\s*(?:</tool_calls>|$)", re.DOTALL | re.IGNORECASE
+        )
+
         # State for streaming
         self.current_tool_name_sent = False
         self.prev_tool_call_arr = []
@@ -51,47 +65,180 @@ def extract_tool_calls(self, model_output: str, request: ChatCompletionRequest)
         Returns:
             ExtractedToolCallInformation with tool calls and metadata
         """
-        # Quick check to avoid unnecessary processing
-        if self.tool_call_start_token not in model_output:
-            return ExtractedToolCallInformation(
-                tools_called=False,
-                tool_calls=[],
-                content=model_output
-            )
-
+        logger = logging.getLogger("pipelinerl.tool_parser")
+        # Ensure variable exists for any fallback references below
+        final_response_match = None
+        try:
+            # 1) Apriel aggregated tool calls block has priority
+            tool_calls_matches = list(self.apriel_tool_calls_regex.finditer(model_output))
+            if tool_calls_matches:
+                # Use the last match (in case of multiple blocks)
+                last_match = tool_calls_matches[-1]
+                tool_calls_json = last_match.group(1).strip()
+                parsed_calls = []
+                try:
+                    parsed_calls = json.loads(tool_calls_json) if tool_calls_json else []
+                except Exception:
+                    logger.debug("Failed to parse aggregated JSON; falling back", exc_info=True)
+                    parsed_calls = []
+
+                tool_calls: list[ToolCall] = []
+                for i, pc in enumerate(parsed_calls):
+                    try:
+                        name = pc.get("name", "")
+                        args_obj = pc.get("arguments", {})
+                        if not isinstance(args_obj, (dict, list, str, int, float, bool)):
+                            args_obj = {}
+                        args_str = json.dumps(args_obj, ensure_ascii=False)
+                        call_id = pc.get("id", f"call_{i}")
+                        tool_calls.append(
+                            ToolCall(
+                                id=call_id,
+                                type="function",
+                                function=FunctionCall(name=str(name), arguments=args_str),
+                            )
+                        )
+                    except Exception:
+                        logger.debug("Skipping malformed aggregated tool call", exc_info=True)
+                        continue
+
+                # Prefer final response content if present; otherwise empty string
+                final_response_match = self.apriel_final_response_regex.search(model_output)
+                content = final_response_match.group(1).strip() if final_response_match else ""
+
+                return ExtractedToolCallInformation(
+                    tools_called=bool(tool_calls),
+                    tool_calls=tool_calls,
+                    content=content,
+                )
+
+            # 2) Try bare JSON tool-calls (no tags), but only if tools are declared in the request
+            # Accept either a list of {name, arguments} or a single dict
+            try:
+                tools_declared = bool(getattr(request, "tools", None))
+            except Exception:
+                tools_declared = False
+
+            if tools_declared:
+                candidate_strings: list[str] = []
+                final_response_match = self.apriel_final_response_regex.search(model_output)
+                if final_response_match:
+                    candidate_strings.append(final_response_match.group(1).strip())
+                candidate_strings.append(model_output.strip())
+
+                for candidate in candidate_strings:
+                    try:
+                        parsed = json.loads(candidate)
+                    except Exception:
+                        continue
+                    parsed_list = []
+                    if isinstance(parsed, dict) and "name" in parsed and "arguments" in parsed:
+                        parsed_list = [parsed]
+                    elif isinstance(parsed, list) and all(isinstance(it, dict) for it in parsed):
+                        parsed_list = [it for it in parsed if "name" in it and "arguments" in it]
+                    if not parsed_list:
+                        continue
+                    tool_calls: list[ToolCall] = []
+                    for i, pc in enumerate(parsed_list):
+                        try:
+                            name = pc.get("name", "")
+                            args_obj = pc.get("arguments", {})
+                            if not isinstance(args_obj, (dict, list, str, int, float, bool)):
+                                args_obj = {}
+                            args_str = json.dumps(args_obj, ensure_ascii=False)
+                            call_id = pc.get("id", f"call_{i}")
+                            tool_calls.append(
+                                ToolCall(
+                                    id=call_id,
+                                    type="function",
+                                    function=FunctionCall(name=str(name),
arguments=args_str), + ) + ) + except Exception: + logger.debug("Skipping malformed bare-JSON tool call", exc_info=True) + continue + content = final_response_match.group(1).strip() if final_response_match else "" + return ExtractedToolCallInformation( + tools_called=bool(tool_calls), + tool_calls=tool_calls, + content=content, + ) + + # 3) Fallback: look for single blocks (legacy / other models) + content_to_search = model_output + final_response_match = self.apriel_final_response_regex.search(model_output) + if final_response_match: + final_response_content = final_response_match.group(1).strip() + if self.tool_call_start_token in final_response_content: + content_to_search = final_response_content + elif self.tool_call_start_token not in model_output: + # No tool calls found, return final response as content + return ExtractedToolCallInformation( + tools_called=False, + tool_calls=[], + content=final_response_content + ) + + # Quick check to avoid unnecessary processing + if self.tool_call_start_token not in content_to_search: + return ExtractedToolCallInformation( + tools_called=False, + tool_calls=[], + content=model_output + ) + # Find all tool call matches - function_call_tuples = self.tool_call_regex.findall(model_output) - + function_call_tuples = self.tool_call_regex.findall(content_to_search) + # Parse JSON from matches tool_calls = [] for i, match in enumerate(function_call_tuples): json_str = match[0] if match[0] else match[1] try: parsed_call = json.loads(json_str.strip()) - + args_obj = parsed_call.get("arguments", {}) + if not isinstance(args_obj, (dict, list, str, int, float, bool)): + args_obj = {} tool_call = ToolCall( id=f"call_{i}", type="function", function=FunctionCall( - name=parsed_call.get("name", ""), - arguments=json.dumps( - parsed_call.get("arguments", {}), - ensure_ascii=False - ) + name=str(parsed_call.get("name", "")), + arguments=json.dumps(args_obj, ensure_ascii=False) ) ) tool_calls.append(tool_call) - except json.JSONDecodeError: + except Exception: + logger.debug("Skipping malformed JSON", exc_info=True) continue - + + # Determine content based on whether we found tool calls + if tool_calls and final_response_match: + # If we found tool calls in final response, use just the tool calls + content = "" + elif final_response_match: + # If we have final response but no tool calls there, use final response + content = final_response_match.group(1).strip() + else: + # Standard processing + content = model_output + return ExtractedToolCallInformation( tools_called=bool(tool_calls), tool_calls=tool_calls, - content=model_output + content=content ) - + except Exception: + # Never propagate exceptions to the server; log and return a safe fallback. + logger.exception("Tool parser encountered an exception; returning safe fallback.") + if final_response_match: + return ExtractedToolCallInformation( + tools_called=False, + tool_calls=[], + content=final_response_match.group(1).strip() + ) return ExtractedToolCallInformation( tools_called=False, tool_calls=[], diff --git a/pipelinerl/utils.py b/pipelinerl/utils.py index 2b0a252c..7cb58ede 100644 --- a/pipelinerl/utils.py +++ b/pipelinerl/utils.py @@ -293,19 +293,19 @@ def wait_for_inference_servers(urls: list[str]): def wait_for_environments(cfg: DictConfig): - """ - Wait for the verifier to be ready. 
- """ + """Wait for remote environment servers to report healthy.""" + if cfg.world.environment_mode != "remote": + return + env_jobs = [Job(**job) for job in cfg.jobs if job.kind == "environment"] for job in env_jobs: while True: url = f"http://{job.hostname}:{job.port}/health" - # use requests try: response = requests.get(url) if response.status_code == 200: break - except: + except requests.exceptions.RequestException: logger.info(f"Waiting for environment at {url} to be ready...") time.sleep(5.0) diff --git a/pipelinerl/vllm0.py b/pipelinerl/vllm0.py index 92c51085..32c17093 100644 --- a/pipelinerl/vllm0.py +++ b/pipelinerl/vllm0.py @@ -180,6 +180,25 @@ async def run_server(args, **uvicorn_kwargs) -> None: f"invalid tool call parser: {args.tool_call_parser} (chose from {{ {','.join(valide_tool_parses)} }})" ) + # Choose a unique rendezvous port per actor to avoid torch.distributed + # TCPStore collisions across concurrently launched vLLM processes. + try: + if "VLLM_PORT" not in os.environ: + actor_idx = getattr(args, "actor_llm_idx", None) + base_str = os.environ.get("VLLM_PORT_BASE", "") + stride_str = os.environ.get("VLLM_PORT_STRIDE", "10") + if actor_idx is not None and base_str.isdigit(): + base = int(base_str) + stride = int(stride_str) if stride_str.isdigit() else 10 + port = base + stride * int(actor_idx) + os.environ["VLLM_PORT"] = str(port) + logger.info( + "Using VLLM_PORT=%s (base=%s stride=%s actor_idx=%s)", + port, base, stride, actor_idx, + ) + except Exception as e: + logger.warning("Failed to set VLLM_PORT from actor_idx: %s", e) + # workaround to make sure that we bind the port before the engine is set up. # This avoids race conditions with ray. # see https://github.com/vllm-project/vllm/issues/8204 diff --git a/pipelinerl/world.py b/pipelinerl/world.py index cc23afd0..6a06fc9f 100644 --- a/pipelinerl/world.py +++ b/pipelinerl/world.py @@ -71,7 +71,7 @@ def __init__(self, cfg: DictConfig, verbose: bool = False): if place_inference_jobs: self._place_inference_jobs(cfg) self._place_pipeline_stages(cfg) - if cfg.environment: + if cfg.environment and cfg.world.environment_mode == "remote": self._place_environments(cfg) # Place the finetune workers on the remaining gpus, take all remaining GPUs From bd46a7d69c40b9d6a5e108659998c26c8fc9971a Mon Sep 17 00:00:00 2001 From: rafapi Date: Tue, 30 Sep 2025 18:43:01 +0000 Subject: [PATCH 138/166] Remove imports --- pipelinerl/utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pipelinerl/utils.py b/pipelinerl/utils.py index 7cb58ede..6243c2c7 100644 --- a/pipelinerl/utils.py +++ b/pipelinerl/utils.py @@ -6,14 +6,13 @@ import time from pathlib import Path import traceback -from typing import Dict, Mapping, List, Any, Union +from typing import Dict, Mapping, List, Any import numpy as np from omegaconf import DictConfig import psutil import requests from importlib.metadata import distributions from transformers import PreTrainedTokenizer -from collections import defaultdict from pipelinerl.world import Job from tapeagents.llms import LLMOutput @@ -321,7 +320,7 @@ def better_crashing(entrypoint_name: str): # get process if of the current process process_id = os.getpid() terminate_with_children(process_id) - logger.error(f"I should not even be here...") + logger.error("I should not even be here...") import sys sys.exit(1) From 724f318daf22e058ef277e2b32f16fc0114f140d Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 1 Oct 2025 17:15:17 +0000 Subject: [PATCH 139/166] sketch of new actor loop class, 
reuse most of the current one --- pipelinerl/actor.py | 105 +++++++++++++++++++++++++++++++------------- 1 file changed, 75 insertions(+), 30 deletions(-) diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index 38b2daf2..d1907ed4 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -4,24 +4,26 @@ import multiprocessing as mp import os import queue -from queue import Empty import random import time from collections import defaultdict from multiprocessing.managers import SharedMemoryManager from pathlib import Path +from queue import Empty +from typing import Dict, List import aiohttp import hydra +import ray import uvloop from omegaconf import DictConfig from pydantic import BaseModel, Field from tapeagents.llms import TrainableLLM -from typing import Dict, List +from tapeagents.orchestrator import save_debug_line import wandb from pipelinerl.finetune.logging_ import flatten_dict_config, init_wandb -from pipelinerl.rollouts import RolloutResult, BaseMetrics +from pipelinerl.rollouts import BaseMetrics, RolloutResult from pipelinerl.shared_memory_array import SharedMemoryQueue from pipelinerl.state import TrainerState from pipelinerl.streams import ( @@ -107,6 +109,10 @@ def make_stats_dict() -> dict: return defaultdict(lambda: defaultdict(list)) +def get_number_of_tokens_in_result(result: RolloutResult) -> int: + return sum(training_text.prompt_tokens + training_text.output_tokens for training_text in result.training_texts) + + async def schedule_rollouts( cfg: DictConfig, attempts: int, @@ -132,6 +138,7 @@ async def schedule_rollouts( active_rollouts = [0] * len(llms) started_rollouts = 0 finished_rollouts = 0 + token_count = 0 # Track rollouts per problem group group_rollouts = {} rollout_policy = hydra.utils.get_method(cfg.actor.rollout_policy) @@ -144,13 +151,16 @@ async def rollout_and_maybe_produce_result( llm_index: int, session: aiohttp.ClientSession, ): - nonlocal started_rollouts, finished_rollouts + nonlocal started_rollouts, finished_rollouts, token_count try: llm = llms[llm_index] model_version = trainer_state.propagated_weight_version assert model_version is not None - rollout_result = await rollout_policy(cfg, llm, problem, session) + logger.info(f"Starting rollout policy for problem {problem['id']}") + rollout_result: RolloutResult = await rollout_policy(cfg, llm, problem, session) + logger.info(f"Finished rollout policy for problem {problem['id']}") rollout_result.model_version = model_version + token_count += get_number_of_tokens_in_result(rollout_result) # Make a group id that will be different from groups made by another rollout maker full_group_id = f"{scheduler_name}_{group_id}" rollout_result.group_id = full_group_id @@ -187,15 +197,20 @@ async def rollout_and_maybe_produce_result( logger.info("Starting rollout scheduler") connector = aiohttp.TCPConnector(limit=50000, limit_per_host=50000, keepalive_timeout=1.0) timeout = aiohttp.ClientTimeout(total=3600.0, connect=3600.0, sock_read=3600.0) + old_finished_rollouts = 0 async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session: while True: if time.time() - last_logged > 10.0 and sum(active_rollouts): + if finished_rollouts > old_finished_rollouts: + old_finished_rollouts = finished_rollouts + save_debug_line({"rollouts_finished": finished_rollouts, "tokens_produced": token_count}) logger.info( f"{scheduler_name}: " f"rollouts in progress: {sum(active_rollouts)}, " f"groups in progress: {len(group_rollouts)}, " f"rollouts started so far: {started_rollouts}, " f"rollouts finished so 
far: {finished_rollouts}, " + f"total tokens produced so far: {token_count}, " f"max group size in bytes: {result_queue.max_actual_entry_size()}, " ) last_logged = time.time() @@ -217,7 +232,6 @@ async def rollout_and_maybe_produce_result( await asyncio.sleep(0.01) continue active_rollouts[next_llm] += 1 - started_rollouts += 1 assert problem is not None loop.create_task( rollout_and_maybe_produce_result( @@ -228,6 +242,7 @@ async def rollout_and_maybe_produce_result( session=session, ) ) + started_rollouts += 1 group_rollout_index += 1 logger.info("Rollout scheduler finished") @@ -281,40 +296,41 @@ def __init__( self.sliding_aggregator = SlidingWindowAggregator(window_size=cfg.actor.throughput_window_size) self.llms = llms self.loop_start_time = -1 - self.cfg = cfg + self.cfg: DictConfig = cfg self.is_training = is_training self.is_scheduling_paused = False self.debug_mode = bool(cfg.debug.mode) # Determine the number of processes to use num_processes = min(self.cfg.actor.rollout_workers, len(self.llms)) - attempts = self.cfg.attempts if is_training else 1 # Divide LLMs approximately equally across processes - llm_groups = [[] for _ in range(num_processes)] + self.llm_groups = [[] for _ in range(num_processes)] for i, llm in enumerate(self.llms): - llm_groups[i % num_processes].append((i, llm)) + self.llm_groups[i % num_processes].append((i, llm)) self.smm = SharedMemoryManager() self.smm.start() - + # Use SharedMemoryQueue instead of separate problem_queue, result_queue, and io_buffer self.problem_queue = SharedMemoryQueue(self.smm, self.cfg.actor.problem_queue_size, cfg.actor.shared_memory_entry_size) self.result_queue = SharedMemoryQueue(self.smm, self.cfg.actor.result_queue_size, cfg.actor.shared_memory_entry_size) - + logger.info(f"Initialized {'train' if self.is_training else 'test'} actor loop") logger.info(f"Problem queue size: {self.problem_queue.max_size}, result queue size: {self.result_queue.max_size}") logger.info(f"Result queue buffer size: {self.result_queue.get_memory_size() / 2**30} Gb") + def start_backend(self): # Create and start multiple rollout processes + attempts = self.cfg.attempts if self.is_training else 1 self.rollout_processes = [] - for llm_group in llm_groups: + for llm_group in self.llm_groups: assert llm_group llm_idxs = [llm[0] for llm in llm_group] llms = [llm[1] for llm in llm_group] scheduler_name = ( - f"{'train' if is_training else 'test'} scheduler for llms {','.join([str(i) for i in llm_idxs])}" + f"{'train' if self.is_training else 'test'} scheduler for llms {','.join([str(i) for i in llm_idxs])}" ) process = mp.Process( target=rollout_maker_entrypoint, @@ -328,15 +344,15 @@ def init_stats(self): self.latency_list = [] self.model_versions_list = [] self.sliding_stats = defaultdict(list) - + def compute_domain_agnostic_metrics(self, result: RolloutResult) -> Dict[str, float]: metrics = {} - + metrics['overflow'] = all([not training_text.finished for training_text in result.training_texts ]) metrics['num_turns'] = len(result.training_texts) metrics['prompt_tokens'] = [training_text.prompt_tokens for training_text in result.training_texts] metrics['output_tokens'] = [training_text.output_tokens for training_text in result.training_texts] - + return metrics def update_stats(self, rollout_results: List[RolloutResult]): @@ -347,7 +363,7 @@ def update_stats(self, rollout_results: List[RolloutResult]): group_id = result.group_id self.latency_list.append(result.latency) self.model_versions_list.append(result.model_version) - domain_agnostic_metrics = 
self.compute_domain_agnostic_metrics(result) + domain_agnostic_metrics = self.compute_domain_agnostic_metrics(result) all_metrics = result.metrics.model_dump() | domain_agnostic_metrics all_metrics["used_python"] = int(all_metrics.get("used_python", False)) all_metrics["used_math_answer"] = int(all_metrics.get("used_math_answer", False)) @@ -358,7 +374,7 @@ def update_stats(self, rollout_results: List[RolloutResult]): self.stats[k][dataset_name][group_id].append(v) else: raise ValueError(f"Unsupported metric type: {type(v)} for key {k}") - + prompt_length_tokens = [training_text.prompt_tokens for result in rollout_results for training_text in result.training_texts] output_length_tokens = [training_text.output_tokens for result in rollout_results for training_text in result.training_texts] self.sliding_aggregator.update(prompt_length_tokens, output_length_tokens) @@ -366,7 +382,7 @@ def update_stats(self, rollout_results: List[RolloutResult]): if sliding_window_stats is not None: for k, v in sliding_window_stats.items(): self.sliding_stats[k].append(v) - + def run(self, dataset: list[tuple[str, dict]]): @@ -443,9 +459,9 @@ def run(self, dataset: list[tuple[str, dict]]): try: try: problem = next(problem_iter) - self.problem_queue.put(problem, block=False) + self.submit_problem(problem) submitted_groups += 1 - except queue.Full: + except queue.Full: assert False, "Problem queue was not full just a moment ago, but now it is full" except StopIteration: break @@ -455,7 +471,7 @@ def run(self, dataset: list[tuple[str, dict]]): # Second, try return a result try: # Directly get the result from the SharedMemoryQueue - rollout_results = self.result_queue.get(block=False) + rollout_results = self.check_for_new_results() except queue.Empty: continue @@ -484,14 +500,14 @@ def run(self, dataset: list[tuple[str, dict]]): f" {in_progress} groups in progress" ) - + self.update_stats(rollout_results=rollout_results) finished_groups += 1 time_to_publish_train_stats = ( self.is_training and trainer_version_to_publish is not None - ) or self.debug_mode + ) or self.debug_mode time_to_publish_test_stats = finished_groups == expected_rollouts # Publish stats at every new model version or if all tapes are finished @@ -502,7 +518,7 @@ def run(self, dataset: list[tuple[str, dict]]): "problem_queue_size": self.problem_queue.qsize(), "result_queue_size": self.result_queue.qsize(), "finished_groups": finished_groups, - "trainer_model_version": trainer_version_to_publish, + "trainer_model_version": trainer_version_to_publish, "time_since_start": time.time() - loop_start_time, "groups_in_progress": in_progress, } @@ -520,6 +536,7 @@ def run(self, dataset: list[tuple[str, dict]]): if finished_groups == expected_rollouts: logger.info(f"Finished {expected_rollouts} rollouts, stopping actor loop") + self.stop_tasks() break def publish_stats(self, stats_writer: StreamWriter, loop_stats: Dict): @@ -572,6 +589,34 @@ def publish_stats(self, stats_writer: StreamWriter, loop_stats: Dict): stats_writer.write(stats) self.init_stats() # Reset stats for the next iteration + def submit_problem(self, problem: dict): + self.problem_queue.put(problem, block=False) + + def stop_tasks(self): + pass + + def check_for_new_results(self): + rollout_results = self.result_queue.get(block=False) + return rollout_results + + +class ActorLoop2(ActorLoop): + """ + Loop that runs the ray tasks for n_jobs to perform rollouts in parallel + """ + def start_backend(self): + ray.init(num_cpus=self.cfg.actor.rollout_workers, dashboard_host="0.0.0.0") + + def 
submit_problem(self, problem: dict): + pass + + def stop_tasks(self): + pass + + def check_for_new_results(self): + pass + + def run_actor_loop(cfg: DictConfig): set_streams_backend(**cfg.streams) @@ -609,7 +654,7 @@ def run_actor_loop(cfg: DictConfig): actor_model_path = finetune_model_path else: actor_model_path = cfg.model_path - + # Align client-side context size with vLLM server max_model_len when available try: _context_size = int(cfg.vllm_config.vllm_kwargs.max_model_len) @@ -655,9 +700,8 @@ def run_actor_loop(cfg: DictConfig): train_loop = ActorLoop( data_stream=data_stream, cfg=cfg, trainer_state=trainer_state, stats_stream=stats_stream, llms=train_llms ) - train_loop_run = train_loop.run( - dataset=train_dataset, - ) + train_loop.start_backend() + train_loop_run = train_loop.run(dataset=train_dataset) test_loop = ActorLoop( data_stream=test_data_stream, cfg=cfg, @@ -687,6 +731,7 @@ def run_actor_loop(cfg: DictConfig): and test_loop_run is None ): logger.info("Create test loop") + test_loop.start_backend() test_loop_run = test_loop.run( dataset=test_dataset, ) From b5c8d8917c272a881a4a2ad43ddeec5c6b279145 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 1 Oct 2025 17:16:02 +0000 Subject: [PATCH 140/166] seq len 32k fits 1 h100, use qwen3-8b --- conf/mcp.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/conf/mcp.yaml b/conf/mcp.yaml index cf85ca18..330c6c9e 100644 --- a/conf/mcp.yaml +++ b/conf/mcp.yaml @@ -20,7 +20,7 @@ actor: shared_memory_entry_size: 10000000 finetune: - seq_length: 128000 + seq_length: 32000 seq_parallel: 8 dataset_loader: pipelinerl.domains.math.load_datasets @@ -38,7 +38,7 @@ vllm_config: tool-parser-plugin: ${hydra:runtime.cwd}/pipelinerl/rl_tool_parser_plugin.py max-num-seqs: ${actor.llm_max_rollouts} max-num-batched-tokens: 4096 - max_model_len: 128000 + max_model_len: 32000 gpu-memory-utilization: 0.85 environment: @@ -142,8 +142,8 @@ agent: trim_obs_except_last_n: 2 next_node: code -# model_path: Qwen/Qwen3-8B -model_path: /mnt/llmd/base_models/ServiceNow-AI/7_9_25_14b_text_reasoning_sft +model_path: Qwen/Qwen3-8B +# model_path: /mnt/llmd/base_models/ServiceNow-AI/7_9_25_14b_text_reasoning_sft # Local reward shaping for tool usage python_tool_shaping: From b2fbc2b4cbca8a4670acb382f4fa3919c01bf738 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 1 Oct 2025 17:16:17 +0000 Subject: [PATCH 141/166] debug entrypoint --- debug.sh | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100755 debug.sh diff --git a/debug.sh b/debug.sh new file mode 100755 index 00000000..c1e2822a --- /dev/null +++ b/debug.sh @@ -0,0 +1,18 @@ +#!/bin/bash +python -m pipelinerl.launch \ + output_dir=results/actor_debug1 \ + force_restart=true \ + world.env_replicas_per_actor=1 \ + actor.llm_max_rollouts=16 \ + finetune.seq_parallel=8 \ + eval_every_n_versions=0 \ + actor.rollout_workers=1 \ + debug.mode=actor \ + world.actor_fraction=8 \ + world.finetune_fraction=0 \ + world.preprocessor_fraction=0 \ + --config-name mcp + + # environment.n_envs=4 \ + # environment.mcp_read_timeout_seconds=300 \ + # environment.env_call_timeout=300 \ \ No newline at end of file From 550cb6369558fce34e3838e2b9a6799b67e6106a Mon Sep 17 00:00:00 2001 From: rafapi Date: Thu, 2 Oct 2025 07:57:31 +0000 Subject: [PATCH 142/166] Increase shared_memory_entry_size --- conf/mcp.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/conf/mcp.yaml b/conf/mcp.yaml index cf85ca18..43ebf586 100644 --- a/conf/mcp.yaml +++ b/conf/mcp.yaml 
@@ -17,7 +17,10 @@ actor: llm_max_rollouts: 64 task_template: |- {task} - shared_memory_entry_size: 10000000 + shared_memory_entry_size: 200000000 + +preprocess: + shared_memory_entry_size: 2000000000 finetune: seq_length: 128000 From c13a71b2c342ff4e6ebf4f75eac15596fb6a5487 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Thu, 2 Oct 2025 13:17:54 +0000 Subject: [PATCH 143/166] synchronous rollout policy --- pipelinerl/domains/mcp/__init__.py | 4 +- pipelinerl/domains/mcp/rollouts.py | 189 +++++++++++++++++++++++++---- 2 files changed, 170 insertions(+), 23 deletions(-) diff --git a/pipelinerl/domains/mcp/__init__.py b/pipelinerl/domains/mcp/__init__.py index 4218ca1b..4557fa53 100644 --- a/pipelinerl/domains/mcp/__init__.py +++ b/pipelinerl/domains/mcp/__init__.py @@ -1,2 +1,2 @@ -from .rollouts import generate_mcp_rollout -from .env_server import EmbeddedMCPEnvironment, MCPEnvironmentServer, EmbeddedEnvironmentWorker +from .env_server import EmbeddedEnvironmentWorker, EmbeddedMCPEnvironment, MCPEnvironmentServer +from .rollouts import generate_mcp_rollout, generate_mcp_rollout_with_local_env diff --git a/pipelinerl/domains/mcp/rollouts.py b/pipelinerl/domains/mcp/rollouts.py index f62f0567..c867cbc4 100644 --- a/pipelinerl/domains/mcp/rollouts.py +++ b/pipelinerl/domains/mcp/rollouts.py @@ -1,29 +1,28 @@ import asyncio -import time +import logging import random -import logging +import time from collections import Counter from typing import Dict, List +from urllib.parse import urlparse import aiohttp -from urllib.parse import urlparse -from omegaconf import DictConfig -from pipelinerl.domains.mcp.steps import MathAnswer -from pipelinerl.world import Job -from tapeagents.llms.trainable import TrainableLLM -from pipelinerl.async_llm import make_training_text -from tapeagents.environment import Environment -from tapeagents.orchestrator import async_execute_agent -from tapeagents.agent import DEFAULT, Agent from hydra.utils import instantiate -from tapeagents.core import Tape +from omegaconf import DictConfig, OmegaConf +from tapeagents.agent import DEFAULT, Agent +from tapeagents.core import LLMCall, Tape from tapeagents.dialog_tape import UserStep -from tapeagents.core import LLMCall +from tapeagents.llms.trainable import TrainableLLM +from tapeagents.mcp import MCPEnvironment +from tapeagents.orchestrator import async_execute_agent, execute_agent, get_agent_and_env_from_config from tapeagents.remote_environment import AsyncRemoteEnvironment -from pipelinerl.domains.mcp.env_server import EmbeddedEnvironmentWorker +from pipelinerl.async_llm import make_training_text from pipelinerl.domains.math import RewardTable, get_reward, verify_answer, verify_answer_rpc -from pipelinerl.rollouts import RolloutResult, BaseMetrics +from pipelinerl.domains.mcp.env_server import EmbeddedEnvironmentWorker +from pipelinerl.domains.mcp.steps import MathAnswer +from pipelinerl.rollouts import BaseMetrics, RolloutResult +from pipelinerl.world import Job logger = logging.getLogger(__name__) @@ -44,20 +43,20 @@ def _get_embedded_worker(env_cfg: DictConfig, concurrency: int) -> EmbeddedEnvir def count_tool_calls_by_category(llm_calls: List[LLMCall]) -> Dict[str, int]: """ Count the number of tool calls for each function name category. 
- + Args: llm_calls: List of LLMCall objects - + Returns: Dictionary mapping function names to their counts """ tool_call_names = [] - + for llm_call in llm_calls: if llm_call.output.tool_calls: for tool_call in llm_call.output.tool_calls: tool_call_names.append(tool_call.function.name) - + return dict(Counter(tool_call_names)) @@ -260,8 +259,8 @@ async def generate_mcp_rollout( agent_time = tape.metadata.result.get("agent_execution_time", -1.0) env_time = tape.metadata.result.get("environment_execution_time", -1.0) total_time = tape.metadata.result.get("total_execution_time", -1.0) - - + + metrics = Metrics( reward=reward, success=answer_status == "correct", @@ -282,3 +281,151 @@ async def generate_mcp_rollout( latency=latency, dataset_name=problem["dataset"], ) + + + +def generate_mcp_rollout_with_local_env( + cfg: DictConfig | dict, + llm: TrainableLLM, + problem: dict, +) -> RolloutResult: + start = time.perf_counter() + if isinstance(cfg, dict): + cfg = OmegaConf.create(cfg) + agent, _env = get_agent_and_env_from_config(cfg) + environment: MCPEnvironment = _env + logger.info("Agent and environment loaded") + try: + start_result = environment.start_task(problem) + logger.info("Task started") + tape_metadata = start_result if isinstance(start_result, dict) else {} + agent.llms = {DEFAULT: llm} + tape = Tape( + steps=[ + UserStep( + content=f"{problem['task']}. You have access to the following tools: {environment.tools_description()}" + ) + ] + ) + if tape_metadata: + tape.metadata.other.update(tape_metadata) + + t_exec = time.perf_counter() + logger.info("Running agent..") + tape = execute_agent(agent, tape, environment, max_loops=cfg.agent_max_loops) + logger.info("Agent finished") + tape.metadata.result.update({"total_execution_time": time.perf_counter() - t_exec}) + + reward_table = RewardTable(**dict(cfg.rewards)) + + llm_calls: list[LLMCall] = [ + LLMCall(**step.metadata.other["llm_call"]) + if isinstance(step.metadata.other["llm_call"], dict) + else step.metadata.other["llm_call"] + for step in tape.steps if step.metadata.other.get("llm_call") is not None + ] + assert len(llm_calls) > 0, "No LLM calls found" + tool_call_counts = count_tool_calls_by_category(llm_calls) + training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] + n_llm_calls = len(llm_calls) + answer_status = verify_answer( + prediction=llm_calls[-1].output.content, # type: ignore + gold=problem["answer"], + strict=True, + ) + # Tape should finish with an answer + tape_finished = True if isinstance(tape.steps[-1], MathAnswer) else False + base_reward = get_reward(answer_status, tape_finished, reward_table) + + # Local reward shaping (configurable in conf/mcp.yaml) + total_shaping = 0.0 + shaping_cfg = getattr(cfg, "python_tool_shaping", None) + if shaping_cfg is not None: + num_python_calls = tool_call_counts.get("run_python_code", 0) + bonus_on_correct_with_python = float(getattr(shaping_cfg, "bonus_on_correct_with_python", 0.0)) + penalty_on_incorrect_without_python = float(getattr(shaping_cfg, "penalty_on_incorrect_without_python", 0.0)) + max_abs = float(getattr(shaping_cfg, "max_abs", 0.2)) + + # Episode-level bonuses/penalties + if answer_status == "correct" and num_python_calls >= 1: + total_shaping += bonus_on_correct_with_python + if answer_status in ("wrong", "unparsable") and num_python_calls == 0: + total_shaping -= penalty_on_incorrect_without_python + + # Clamp total shaping + if total_shaping > max_abs: + total_shaping = max_abs + if total_shaping < -max_abs: + total_shaping = 
-max_abs + + # Length shaping: discourage very long completions; award concise correct ones + length_cfg = getattr(cfg, "length_shaping", None) + if length_cfg is not None: + try: + # Prefer ratio-based target if provided; otherwise use absolute + if hasattr(length_cfg, "target_ratio"): + ratio = float(getattr(length_cfg, "target_ratio")) + max_gen = int(llm.parameters.get("max_tokens", 2048)) + target_tokens = int(max(1, ratio * max_gen)) + # Optional clamps + min_t = int(getattr(length_cfg, "min_target_tokens", 0)) + max_t = int(getattr(length_cfg, "max_target_tokens", 10**9)) + target_tokens = max(min_t, min(max_t, target_tokens)) + else: + target_tokens = int(getattr(length_cfg, "target_output_tokens", 512)) + slope = float(getattr(length_cfg, "slope", 0.0)) + max_penalty = float(getattr(length_cfg, "max_penalty", 0.0)) + bonus_short_correct = float(getattr(length_cfg, "bonus_on_short_correct", 0.0)) + except Exception: + target_tokens, slope, max_penalty, bonus_short_correct = 512, 0.0, 0.0, 0.0 + + # average output tokens across llm calls for this rollout + try: + avg_output_tokens = sum(t.output_tokens for t in training_texts) / max(1, len(training_texts)) + except Exception: + avg_output_tokens = 0.0 + + if slope > 0.0 and max_penalty > 0.0 and avg_output_tokens > target_tokens: + over_by = float(avg_output_tokens - target_tokens) + penalty = min(max_penalty, slope * over_by) + total_shaping -= penalty + + if bonus_short_correct > 0.0 and answer_status == "correct" and avg_output_tokens <= target_tokens: + total_shaping += bonus_short_correct + + reward = base_reward + total_shaping + + # Assign identical reward to all steps in the rollout (pipeline expects uniform rollout_reward) + for text in training_texts: + text.reward = reward + text.finished = tape_finished + + latency = time.perf_counter() - start + + agent_time = tape.metadata.result.get("agent_execution_time", -1.0) + env_time = tape.metadata.result.get("environment_execution_time", -1.0) + total_time = tape.metadata.result.get("total_execution_time", -1.0) + + metrics = Metrics( + reward=reward, + success=answer_status == "correct", + no_error=answer_status != "unparsable", + no_answer=answer_status == "no_answer", + num_steps=len(tape.steps), + num_python_calls=tool_call_counts.get("run_python_code", 0), + n_llm_calls=n_llm_calls, + total_execution_time=total_time, + agent_execution_time=agent_time, + environment_execution_time=env_time, + overflow=not tape_finished, + ) + + return RolloutResult( + training_texts=training_texts, + metrics=metrics, + latency=latency, + dataset_name=problem["dataset"], + llm_url=llm.get_base_url(), + ) + finally: + environment.close() From 44d6fd4e1267036d8d46e13d80175a6d6c0ca55b Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Thu, 2 Oct 2025 13:19:14 +0000 Subject: [PATCH 144/166] fix import --- pipelinerl/async_llm.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pipelinerl/async_llm.py b/pipelinerl/async_llm.py index e375b6a5..aa75d4ed 100644 --- a/pipelinerl/async_llm.py +++ b/pipelinerl/async_llm.py @@ -8,12 +8,16 @@ from tapeagents.core import LLMCall, LLMOutput, Prompt, TokenLogprob from tapeagents.llms.trainable import TrainableLLM -from pipelinerl.finetune.data import MASKED_TOKEN_ID -from pipelinerl.rollouts import TrainingText from pipelinerl.processor_factory import get_processor +from pipelinerl.rollouts import TrainingText logger = logging.getLogger(__name__) +# -100 is the default "ignore_index" in nn.CrossEntropyLoss +# Defined here to avoid 
importing dependencies from finetune.data +# Do not replace. Import from finetune module breaks ray parallelization! +MASKED_TOKEN_ID = -100 + def extract_images_from_messages(messages: list[dict]) -> list[Image.Image]: """Extract PIL Images from multimodal messages.""" From 053a532344989753885d00f6f2792c63d9d8e586 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Fri, 3 Oct 2025 17:42:28 +0000 Subject: [PATCH 145/166] llm benchmarking scripts --- llm.sh | 69 +++++++++++++++++++++++++++ llm_bench.py | 130 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 199 insertions(+) create mode 100755 llm.sh create mode 100644 llm_bench.py diff --git a/llm.sh b/llm.sh new file mode 100755 index 00000000..9ed54075 --- /dev/null +++ b/llm.sh @@ -0,0 +1,69 @@ +#!/bin/bash +echo "Run LLM only" + +# python -m pipelinerl.launch \ +# output_dir=results/llm_debug1 \ +# force_restart=true \ +# actor.llm_max_rollouts=16 \ +# finetune.seq_parallel=8 \ +# eval_every_n_versions=0 \ +# debug.mode=llm \ +# world.actor_fraction=8 \ +# world.finetune_fraction=0 \ +# world.preprocessor_fraction=0 \ +# --config-name mcp + + +python -m pipelinerl.entrypoints.run_vllm0 \ + --model Qwen/Qwen3-8B \ + --host 0.0.0.0 \ + --port 8080 \ + --seed 42 \ + --actor-llm-idx 0 \ + --weight-update-group-init-method tcp://localhost:9000 \ + --weight-update-group-world-size 2 \ + --dtype bfloat16 \ + --gpu-memory-utilization 0.9 \ + --num-scheduler-steps 1 \ + --disable-log-requests \ + --disable-frontend-multiprocessing \ + --max-num-seqs 256 \ + --max-num-batched-tokens 32000 \ + --enable-chunked-prefill \ + --return-tokens-as-token-ids \ + --tensor-parallel-size 1 \ + --pipeline-parallel-size 1 \ + --generation-config vllm \ + --max_model_len 32000 \ + --enable-auto-tool-choice \ + --tool-call-parser rl_tool \ + --tool-parser-plugin /home/toolkit/PipelineRL/pipelinerl/rl_tool_parser_plugin.py \ + --disable-weight-update + + +# python -m pipelinerl.entrypoints.run_vllm0 \ +# --model Qwen/Qwen2.5-7B \ +# --host 0.0.0.0 \ +# --port 8080 \ +# --seed 13 \ +# --actor-llm-idx 0 \ +# --weight-update-group-init-method tcp://localhost:9000 \ +# --weight-update-group-world-size 2 \ +# --dtype bfloat16 \ +# --gpu-memory-utilization 0.9 \ +# --num-scheduler-steps 1 \ +# --disable-log-requests \ +# --disable-frontend-multiprocessing \ +# --max-num-seqs 64 \ +# --max-num-batched-tokens 1024 \ +# --enable-chunked-prefill \ +# --return-tokens-as-token-ids \ +# --tensor-parallel-size 1 \ +# --pipeline-parallel-size 1 \ +# --generation-config vllm \ +# --max_model_len 64000 \ +# --disable-weight-update + +# python -m pipelinerl.entrypoints.run_vllm0 --model /mnt/llmd/base_models/Mistral-Small-24B-Base-2501 --host 0.0.0.0 --port 8080 --seed 78 --actor-llm-idx 36 --weight-update-group-init-method tcp://dns-99833624-2133-43c0-a112-07520ffee505-0:9000 --weight-update-group-world-size 49 --dtype bfloat16 --gpu-memory-utilization 0.9 --num-scheduler-steps 1 --disable-log-requests --disable-frontend-multiprocessing --max-num-seqs 256 --max-num-batched-tokens 1024 --enable-chunked-prefill --return-tokens-as-token-ids --tensor-parallel-size 1 --pipeline-parallel-size 1 --generation-config vllm --max_model_len 32768 + + \ No newline at end of file diff --git a/llm_bench.py b/llm_bench.py new file mode 100644 index 00000000..f4a014ee --- /dev/null +++ b/llm_bench.py @@ -0,0 +1,130 @@ +import json +import os +import time + +import numpy as np +import ray +import requests +from tapeagents.llms import TrainableLLM + 
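+# Benchmarks generation throughput: replays prompts recorded in
+# debug_training_texts.jsonl against a local vLLM server with varying numbers
+# of Ray workers, reporting tokens/sec (responses re-encoded with the model
+# tokenizer) and per-request latency.
+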
+os.environ["TOKENIZERS_PARALLELISM"] = "false" + +llm_url = "http://localhost:8080" +# llm_model = "Qwen/Qwen3-8B" +llm_model = "Qwen/Qwen2.5-7B" +# exp_name = "qwen3-8b" +exp_name = "qwen2.5-7b" + +def llm_quick_response(prompt: str): + t = time.perf_counter() + r = requests.post( + url=f"{llm_url}/v1/chat/completions", + json={ + "model": llm_model, + "messages": [{"role": "user", "content": prompt}], + "stream": False, + }, + headers={"Content-Type": "application/json"}, + stream=False, + verify=False, + ) + d = r.json() + dt = time.perf_counter() - t + return d["choices"][0]["message"]["content"], dt + + +llm = TrainableLLM(base_url=llm_url, model_name=llm_model) +response = llm.quick_response("Hello, how are you?") +response2, _ = llm_quick_response("Hello, how are you?") +assert len(response) > 0 +assert len(response2) > 0 +assert llm.tokenizer is not None +print("LLM is ready") + + +with open("debug_training_texts.jsonl", "r", encoding="utf-8") as f: + all_dicts = [json.loads(line) for line in f if line.strip()] +total_tokens = 0 +for d in all_dicts: + text = d["text"] + n_predicted = d["n_predicted"] + prompt = text[:-n_predicted] + response = text[-n_predicted:] + tokens = llm.tokenizer.encode(text) + total_tokens += len(tokens) +print(f"Loaded {len(all_dicts)} texts, total tokens: {total_tokens}") + +prompts = [d["text"][:-d["n_predicted"]] for d in all_dicts] +chunk_size = 4 +prompts_chunks = [prompts[i:i+chunk_size] for i in range(0, len(prompts), chunk_size)] +print(f"Chunked to {len(prompts_chunks)} chunks") + + +def benchmark_llm(n_workers: int): + ray.shutdown() + ray.init(num_cpus=n_workers) + + def get_responses(prompts: str): + responses = [] + # local_llm = TrainableLLM(base_url=llm_url, model_name=llm_model) + for i, prompt in enumerate(prompts): + r, dt = llm_quick_response(prompt) + responses.append((prompt + r, dt)) + return responses + + remote_fn = ray.remote(get_responses) + + t = time.perf_counter() + + chunks = prompts_chunks + if n_workers > len(chunks): + multiplier = n_workers // len(chunks) + 1 + chunks = chunks * multiplier + print(f"Multiplied to {len(chunks)} chunks") + unfinished_tasks = [] + for chunk in chunks: + unfinished_tasks.append(remote_fn.remote(chunk)) + + responses = [] + total_tokens = 0 + total_finished = 0 + latencies = [] + print(f"Submitted {len(unfinished_tasks)} tasks") + while unfinished_tasks: + finished_tasks, unfinished_tasks = ray.wait(unfinished_tasks, num_returns=len(unfinished_tasks), timeout=0.1) + for finished_task in finished_tasks: + responses = ray.get(finished_task) + total_finished += 1 + for response, dt in responses: + latencies.append(dt) + tokens = llm.tokenizer.encode(response) + total_tokens += len(tokens) + dt = time.perf_counter() - t + if len(finished_tasks) > 0: + print(f"t: {dt:.2f}s, {total_finished} finished, Total tokens: {total_tokens}, tokens/sec: {total_tokens / dt:.2f}") + # if dt > 600: + # print("Timeout 10 minutes, stopping") + # break + time.sleep(1.0) + + final_time = time.perf_counter() - t + print(f"Final, workers:{n_workers}, t:{final_time:.2f}s, total tokens: {total_tokens}, tokens/sec: {total_tokens / final_time:.2f}") + ray.shutdown() + mean_latency = np.mean(latencies) + return total_tokens, final_time, mean_latency + +stats = {} +for n_workers in [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]: + print(f"Benchmarking {n_workers} workers..") + tokens, dt, mean_latency = benchmark_llm(n_workers) + print(f"Done {n_workers} workers: {tokens} tokens, {dt:.2f}s, speed {tokens / dt:.2f} tokens/sec, 
mean latency: {mean_latency:.2f}s") + stats[n_workers] = {"tokens": tokens, "dt": dt, "mean_latency": mean_latency} + with open(f"llm_token_stats_chunk{chunk_size}_{exp_name}.jsonl", "a") as f: + ts = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + row = json.dumps({"ts": ts, "n_workers": n_workers, "tokens": tokens, "dt": dt, "mean_latency": mean_latency}) + f.write(row + "\n") + +print("Benchmarking done") +with open(f"llm_token_stats_all_chunk{chunk_size}_{exp_name}.json", "w") as f: + json.dump(stats, f, indent=4) +print("All stats saved") \ No newline at end of file From 0f9bf6a685675063e813fefa34550a5ee8090325 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 6 Oct 2025 13:09:41 +0000 Subject: [PATCH 146/166] move to vllm 0.8.5 to support qwen3 --- pipelinerl/vllm0.py | 38 +++++++++++++++++++------------------- pipelinerl/vllm1.py | 30 +++++++++++++++--------------- pyproject.toml | 5 +++-- 3 files changed, 37 insertions(+), 36 deletions(-) diff --git a/pipelinerl/vllm0.py b/pipelinerl/vllm0.py index 32c17093..4ff219a2 100644 --- a/pipelinerl/vllm0.py +++ b/pipelinerl/vllm0.py @@ -3,39 +3,39 @@ import logging import os import signal -from pydantic import TypeAdapter + import torch +import torch.distributed as dist import uvloop +from pydantic import TypeAdapter from vllm import AsyncLLMEngine -from vllm.utils import FlexibleArgumentParser, set_ulimit -from vllm.entrypoints.openai.cli_args import ( - make_arg_parser, - validate_parsed_serve_args, -) +from vllm._version import version +from vllm.engine.arg_utils import AsyncEngineArgs from vllm.entrypoints.launcher import serve_http from vllm.entrypoints.openai.api_server import ( - run_server, - create_server_socket, build_app, + create_server_socket, init_app_state, + run_server, +) +from vllm.entrypoints.openai.cli_args import ( + make_arg_parser, + validate_parsed_serve_args, ) -from vllm.engine.arg_utils import AsyncEngineArgs from vllm.entrypoints.openai.tool_parsers import ToolParserManager -from vllm.logger import init_logger -from vllm._version import version -from vllm.worker.worker import Worker -from vllm.executor.multiproc_worker_utils import ProcessWorkerWrapper from vllm.executor.mp_distributed_executor import MultiprocessingDistributedExecutor +from vllm.executor.multiproc_worker_utils import ProcessWorkerWrapper +from vllm.logger import init_logger from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import ExecuteModelRequest from vllm.usage.usage_lib import UsageContext -from vllm.worker.multi_step_worker import MultiStepWorker +from vllm.utils import FlexibleArgumentParser, set_ulimit from vllm.worker.multi_step_model_runner import MultiStepModelRunner +from vllm.worker.multi_step_worker import MultiStepWorker +from vllm.worker.worker import Worker - -import torch.distributed as dist -from pipelinerl.finetune_loop import TrainerMessage, WeightUpdateRequest import pipelinerl.torch_utils +from pipelinerl.finetune_loop import TrainerMessage, WeightUpdateRequest logger = logging.getLogger(__name__) # configure this logger individually, in order to avoid messign @@ -247,8 +247,8 @@ async def _receive_weight_update(request: WeightUpdateRequest): await weight_update_manager.receive_weight_update(request) return {"status": "ok"} - model_config = await engine.get_model_config() - await init_app_state(engine, model_config, app.state, args) + # model_config = await engine.get_model_config() + await init_app_state(engine, engine_config, app.state, args) shutdown_task = await serve_http( 
app, sock, diff --git a/pipelinerl/vllm1.py b/pipelinerl/vllm1.py index 80cba297..48311d8e 100644 --- a/pipelinerl/vllm1.py +++ b/pipelinerl/vllm1.py @@ -1,32 +1,32 @@ import logging import signal +from typing import Any, Protocol, runtime_checkable + import torch import uvloop -from vllm.utils import FlexibleArgumentParser, set_ulimit -from vllm.entrypoints.openai.cli_args import ( - make_arg_parser, - validate_parsed_serve_args, -) +from vllm._version import version +from vllm.config import ModelConfig +from vllm.engine.arg_utils import AsyncEngineArgs from vllm.entrypoints.launcher import serve_http from vllm.entrypoints.openai.api_server import ( - run_server, - create_server_socket, build_app, + create_server_socket, init_app_state, + run_server, +) +from vllm.entrypoints.openai.cli_args import ( + make_arg_parser, + validate_parsed_serve_args, ) -from vllm.engine.arg_utils import AsyncEngineArgs from vllm.entrypoints.openai.tool_parsers import ToolParserManager -from vllm._version import version from vllm.usage.usage_lib import UsageContext -from vllm.config import ModelConfig +from vllm.utils import FlexibleArgumentParser, set_ulimit from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.core_client import AsyncMPClient from vllm.v1.worker.gpu_model_runner import GPUModelRunner - -from pipelinerl.finetune_loop import WeightUpdateRequest -from typing import Any, Protocol, runtime_checkable import pipelinerl.torch_utils +from pipelinerl.finetune_loop import WeightUpdateRequest logger = logging.getLogger(__name__) # configure this logger individually, in order to avoid messign @@ -172,8 +172,8 @@ async def _receive_weight_update(request: WeightUpdateRequest): await weight_update_manager.receive_weight_update(request) return {"status": "ok"} - model_config = await engine.get_model_config() - await init_app_state(engine, model_config, app.state, args) + # model_config = await engine.get_model_config() + await init_app_state(engine, engine_config, app.state, args) shutdown_task = await serve_http( app, sock, diff --git a/pyproject.toml b/pyproject.toml index f950d75d..c069389a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,16 +14,17 @@ authors = [ ] dependencies = [ "torch>=2.6", - "vllm==0.8.3", + "vllm==0.8.5.post1", "accelerate==1.7.0", "Tapeagents[finetune]==0.1.15", - "transformers==4.51.0", + "transformers==4.51.1", "flash-attn==2.7.4.post1", "ring-flash-attn==0.1.6", "math-verify[antlr4_9_3]==0.7.0", "orjson==3.10.16", "redis==5.2.1", "hydra-core>=1.3.2", + "ray[default]~=2.47.1", ] [tool.setuptools.packages.find] From 44d0de4b8bf75a053565f5d926a4d058c91161da Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 6 Oct 2025 17:45:41 +0000 Subject: [PATCH 147/166] launch mode to run inference llm only --- pipelinerl/launch.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pipelinerl/launch.py b/pipelinerl/launch.py index be5c8faf..e2109e34 100644 --- a/pipelinerl/launch.py +++ b/pipelinerl/launch.py @@ -21,6 +21,7 @@ # TODO: rm debug code import tapeagents + os.environ["NCCL_CUMEM_ENABLE"] = "0" os.environ["TORCH_DISABLE_SHARE_RDZV_TCP_STORE"] = "1" os.environ["HF_DATASETS_DISABLE_PROGRESS_BARS"] = "1" @@ -615,6 +616,8 @@ def main(cfg: DictConfig): if cfg.debug.mode == "finetune": processes.extend(launch_jobs(cfg, world_map, ["finetune"])) + elif cfg.debug.mode == "llm": + processes.extend(launch_jobs(cfg, world_map, ["actor_llm"])) elif cfg.debug.mode == "actor": processes.extend(launch_jobs(cfg, world_map, ["actor", "environment", "actor_llm"])) elif 
cfg.debug.mode == "preprocessor": From 81675bdc60747f440a5bef064dee46ea12668b3c Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 6 Oct 2025 17:46:27 +0000 Subject: [PATCH 148/166] updated ray-based actor loop --- pipelinerl/actor.py | 168 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 140 insertions(+), 28 deletions(-) diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index d1907ed4..44b5daa7 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -10,13 +10,14 @@ from multiprocessing.managers import SharedMemoryManager from pathlib import Path from queue import Empty -from typing import Dict, List +from typing import Callable, Dict, List import aiohttp import hydra +import numpy as np import ray import uvloop -from omegaconf import DictConfig +from omegaconf import DictConfig, OmegaConf from pydantic import BaseModel, Field from tapeagents.llms import TrainableLLM from tapeagents.orchestrator import save_debug_line @@ -198,12 +199,13 @@ async def rollout_and_maybe_produce_result( connector = aiohttp.TCPConnector(limit=50000, limit_per_host=50000, keepalive_timeout=1.0) timeout = aiohttp.ClientTimeout(total=3600.0, connect=3600.0, sock_read=3600.0) old_finished_rollouts = 0 + start_time = time.time() async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session: while True: if time.time() - last_logged > 10.0 and sum(active_rollouts): if finished_rollouts > old_finished_rollouts: old_finished_rollouts = finished_rollouts - save_debug_line({"rollouts_finished": finished_rollouts, "tokens_produced": token_count}) + save_debug_line({"rollouts_finished": finished_rollouts, "tokens_produced": token_count, "dt": time.time() - start_time, "token_speed": token_count / (time.time() - start_time)}) logger.info( f"{scheduler_name}: " f"rollouts in progress: {sum(active_rollouts)}, " @@ -300,32 +302,36 @@ def __init__( self.is_training = is_training self.is_scheduling_paused = False self.debug_mode = bool(cfg.debug.mode) + self.cfg: DictConfig = cfg - # Determine the number of processes to use - num_processes = min(self.cfg.actor.rollout_workers, len(self.llms)) - - # Divide LLMs approximately equally across processes - self.llm_groups = [[] for _ in range(num_processes)] - for i, llm in enumerate(self.llms): - self.llm_groups[i % num_processes].append((i, llm)) + self.smm: SharedMemoryManager | None = None + self.problem_queue: SharedMemoryQueue | None = None + self.result_queue: SharedMemoryQueue | None = None + logger.info(f"Initialized {'train' if self.is_training else 'test'} actor loop") + def start_backend(self): self.smm = SharedMemoryManager() self.smm.start() - # Use SharedMemoryQueue instead of separate problem_queue, result_queue, and io_buffer self.problem_queue = SharedMemoryQueue(self.smm, self.cfg.actor.problem_queue_size, cfg.actor.shared_memory_entry_size) self.result_queue = SharedMemoryQueue(self.smm, self.cfg.actor.result_queue_size, cfg.actor.shared_memory_entry_size) - logger.info(f"Initialized {'train' if self.is_training else 'test'} actor loop") logger.info(f"Problem queue size: {self.problem_queue.max_size}, result queue size: {self.result_queue.max_size}") logger.info(f"Result queue buffer size: {self.result_queue.get_memory_size() / 2**30} Gb") - def start_backend(self): # Create and start multiple rollout processes attempts = self.cfg.attempts if self.is_training else 1 + # Determine the number of processes to use + num_processes = min(self.cfg.actor.rollout_workers, len(self.llms)) + + # Divide LLMs approximately 
equally across processes + llm_groups = [[] for _ in range(num_processes)] + for i, llm in enumerate(self.llms): + llm_groups[i % num_processes].append((i, llm)) + self.rollout_processes = [] - for llm_group in self.llm_groups: + for llm_group in llm_groups: assert llm_group llm_idxs = [llm[0] for llm in llm_group] llms = [llm[1] for llm in llm_group] @@ -455,7 +461,7 @@ def run(self, dataset: list[tuple[str, dict]]): if not self.is_scheduling_paused: while True: blocked_by_lag = submitted_groups == can_submit_before_update and self.is_training - if not blocked_by_lag and not self.problem_queue.full(): + if not blocked_by_lag and self.have_capacity(): try: try: problem = next(problem_iter) @@ -471,7 +477,7 @@ def run(self, dataset: list[tuple[str, dict]]): # Second, try return a result try: # Directly get the result from the SharedMemoryQueue - rollout_results = self.check_for_new_results() + rollout_results = self.get_new_results() except queue.Empty: continue @@ -480,6 +486,8 @@ def run(self, dataset: list[tuple[str, dict]]): raise rollout_results assert isinstance(rollout_results, list) + if len(rollout_results) == 0: + continue assert isinstance(rollout_results[0], RolloutResult) assert len(rollout_results) == attempts, ( f"Expected {attempts} rollouts, got {len(rollout_results)}" @@ -589,37 +597,141 @@ def publish_stats(self, stats_writer: StreamWriter, loop_stats: Dict): stats_writer.write(stats) self.init_stats() # Reset stats for the next iteration + def have_capacity(self) -> bool: + return not self.problem_queue.full() + def submit_problem(self, problem: dict): self.problem_queue.put(problem, block=False) def stop_tasks(self): pass - def check_for_new_results(self): - rollout_results = self.result_queue.get(block=False) - return rollout_results + def get_new_results(self) -> list[RolloutResult]: + return self.result_queue.get(block=False) -class ActorLoop2(ActorLoop): +class ActorLoopRay(ActorLoop): """ Loop that runs the ray tasks for n_jobs to perform rollouts in parallel """ + ray_ready: bool = False + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.cfg_dict = OmegaConf.to_container(self.cfg, resolve=True) + self.unfinished_tasks = [] + self.llms_by_url = {llm.get_base_url(): llm for llm in self.llms} + self.llms_utilization = {llm.get_base_url(): 0 for llm in self.llms} + self.problem_id = 0 + self.unfinished_problems = defaultdict(list) # up to `attempts` rollout results for each problem + self.finished_problems = [] + self.token_count = 0 + self.finished_rollouts_count = 0 + def start_backend(self): - ray.init(num_cpus=self.cfg.actor.rollout_workers, dashboard_host="0.0.0.0") + if not self.ray_ready: + logger.info(f"Initializing Ray with {self.cfg.actor.rollout_workers} workers..") + ray_context = ray.init(num_cpus=self.cfg.actor.rollout_workers, dashboard_host="0.0.0.0", include_dashboard=True) + logger.info(f"Ray initialized, dashboard at {ray_context.dashboard_url}") + self.ray_ready = True + else: + logger.info("Ray already initialized") + + rollout_policy: Callable[[DictConfig, TrainableLLM, dict], RolloutResult] = hydra.utils.get_method(self.cfg.actor.rollout_policy) + def rollout_wrapper(cfg: DictConfig, llm: TrainableLLM, problem: dict, problem_id: int) -> RolloutResult: + rollout_result: RolloutResult = rollout_policy(cfg, llm, problem) + ts = time.monotonic() + return rollout_result, llm.get_base_url(), problem_id, ts + self.ray_remote = ray.remote(rollout_wrapper) + self.start_time = time.time() + + def have_capacity(self) -> 
bool:
+        have_capacity = len(self.unfinished_tasks) < self.cfg.actor.problem_queue_size
+        have_llm = any(self.llms_utilization[llm_url] < self.cfg.actor.llm_max_rollouts for llm_url in self.llms_utilization)
+        have_capacity = have_capacity and have_llm
+        if not have_capacity:
+            time.sleep(0.1)  # back off briefly so the scheduling loop does not busy-spin while at capacity
+        return have_capacity
 
     def submit_problem(self, problem: dict):
-        pass
+        attempts = self.cfg.attempts if self.is_training else 1
+        for attempt_number in range(attempts):
+            llm_url, task_count = min(self.llms_utilization.items(), key=lambda x: x[1])
+            logger.info(f"Submitting problem attempt {attempt_number} to the least busy LLM {llm_url} with {task_count} tasks")
+            llm = self.llms_by_url[llm_url]
+            task_ref = self.ray_remote.remote(self.cfg_dict, llm, problem, self.problem_id)
+            self.llms_utilization[llm_url] += 1
+            self.unfinished_tasks.append(task_ref)
+        # Keep one problem id for the whole group of attempts;
+        # receive_finished_tasks() counts results per id to decide when a group
+        # is complete.
+        self.problem_id += 1
 
     def stop_tasks(self):
-        pass
-
-    def check_for_new_results(self):
-        pass
+        ray.shutdown()
 
+    def receive_finished_tasks(self):
+        num_returns = min(100, len(self.unfinished_tasks))
+        try:
+            finished_tasks, unfinished_tasks = ray.wait(self.unfinished_tasks, num_returns=num_returns, timeout=0.1)
+        except Exception as e:
+            logger.error(f"Error waiting for finished ray tasks: {e}")
+            return
+        if len(finished_tasks) > 0:
+            logger.info(f"Found {len(finished_tasks)} finished tasks, {len(unfinished_tasks)} unfinished tasks left")
+        self.unfinished_tasks = unfinished_tasks
+        dt = time.time() - self.start_time
+        ray_result_latencies = []
+        for finished_task in finished_tasks:
+            try:
+                rollout_result, llm_url, problem_id, inner_ts = ray.get(finished_task)
+                outer_ts = time.monotonic()
+                ray_result_latency = outer_ts - inner_ts
+                ray_result_latencies.append(ray_result_latency)
+            except Exception as e:
+                logger.error(f"Error getting finished ray task: {e}")
+                continue
+            if self.llms_utilization[llm_url] > 0:
+                self.llms_utilization[llm_url] -= 1
+            else:
+                logger.warning(f"LLM {llm_url} utilization is 0, but got a result")
+            self.token_count += get_number_of_tokens_in_result(rollout_result)
+            self.finished_rollouts_count += 1
+            self.unfinished_problems[problem_id].append(rollout_result)
+            logger.info(f"Problem {problem_id} has {len(self.unfinished_problems[problem_id])} rollout results")
+            attempts = self.cfg.attempts if self.is_training else 1
+            if len(self.unfinished_problems[problem_id]) == attempts:
+                logger.info(f"Group for problem {problem_id} finished")
+                self.finished_problems.append(self.unfinished_problems[problem_id])
+                del self.unfinished_problems[problem_id]
+                logger.info(f"{len(self.finished_problems)} finished problems ready to return")
+        logger.info(
+            f"Ray {'train' if self.is_training else 'test'} actor loop: "
+            f"rollouts in progress: {len(self.unfinished_tasks)}, "
+            f"problems in progress: {len(self.unfinished_problems)}, "
+            f"rollouts finished: {self.finished_rollouts_count}, "
+            f"total tokens: {self.token_count}, "
+            f"gen speed: {self.token_count / dt:.2f} tokens/sec, "
+            f"ray latency: {np.mean(ray_result_latencies):.4f} seconds"
+        )
+        save_debug_line({
+            "rollouts_finished": self.finished_rollouts_count,
+            "rollouts_in_progress": len(self.unfinished_tasks),
+            "problems_in_progress": len(self.unfinished_problems),
+            "tokens_produced": self.token_count,
+            "dt": dt,
+            "token_speed": self.token_count / dt,
+            "ray_latency": np.mean(ray_result_latencies),
+        })
+        logger.info(f"LLMs utilization: {self.llms_utilization}")
+
+    def get_new_results(self) -> list[list[RolloutResult]]:
+        self.receive_finished_tasks()
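+        # receive_finished_tasks() above drains completed Ray futures and groups
+        # rollout results by problem id; a finished group is returned one at a
+        # time, and callers poll this method until one is available.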
if len(self.finished_problems) > 0: + logger.info(f"have {len(self.finished_problems)} finished problems, pop one") + return self.finished_problems.pop(0) + return [] def run_actor_loop(cfg: DictConfig): set_streams_backend(**cfg.streams) + actor_loop_class = ActorLoopRay if cfg.use_ray else ActorLoop # set seed for reproducibility (mostly intended for dataset loading) random.seed(cfg.seed) @@ -697,12 +809,12 @@ def run_actor_loop(cfg: DictConfig): trainer_state.start_listening() trainer_state.wait_for_model_version() - train_loop = ActorLoop( + train_loop = actor_loop_class( data_stream=data_stream, cfg=cfg, trainer_state=trainer_state, stats_stream=stats_stream, llms=train_llms ) train_loop.start_backend() train_loop_run = train_loop.run(dataset=train_dataset) - test_loop = ActorLoop( + test_loop = actor_loop_class( data_stream=test_data_stream, cfg=cfg, trainer_state=trainer_state, From d16222ae969ba3dfb273a4d991bf2b8e37ae0d96 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 6 Oct 2025 17:48:41 +0000 Subject: [PATCH 149/166] rollout debug --- pipelinerl/domains/mcp/rollouts.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/pipelinerl/domains/mcp/rollouts.py b/pipelinerl/domains/mcp/rollouts.py index c867cbc4..2758ed15 100644 --- a/pipelinerl/domains/mcp/rollouts.py +++ b/pipelinerl/domains/mcp/rollouts.py @@ -1,4 +1,5 @@ import asyncio +import json import logging import random import time @@ -10,11 +11,12 @@ from hydra.utils import instantiate from omegaconf import DictConfig, OmegaConf from tapeagents.agent import DEFAULT, Agent -from tapeagents.core import LLMCall, Tape +from tapeagents.core import LLMCall, Tape, TrainingText from tapeagents.dialog_tape import UserStep +from tapeagents.llms import LiteLLM from tapeagents.llms.trainable import TrainableLLM from tapeagents.mcp import MCPEnvironment -from tapeagents.orchestrator import async_execute_agent, execute_agent, get_agent_and_env_from_config +from tapeagents.orchestrator import async_execute_agent, execute_agent, get_agent_and_env_from_config, save_debug_tape from tapeagents.remote_environment import AsyncRemoteEnvironment from pipelinerl.async_llm import make_training_text @@ -29,6 +31,8 @@ _embedded_worker: EmbeddedEnvironmentWorker | None = None +class FailedRollout(Exception): + pass def _get_embedded_worker(env_cfg: DictConfig, concurrency: int) -> EmbeddedEnvironmentWorker: global _embedded_worker @@ -294,8 +298,9 @@ def generate_mcp_rollout_with_local_env( cfg = OmegaConf.create(cfg) agent, _env = get_agent_and_env_from_config(cfg) environment: MCPEnvironment = _env - logger.info("Agent and environment loaded") + logger.info(f"Agent and environment loaded, using llm {llm.model_name} at {llm.get_base_url()}") try: + t_exec = time.perf_counter() start_result = environment.start_task(problem) logger.info("Task started") tape_metadata = start_result if isinstance(start_result, dict) else {} @@ -310,12 +315,11 @@ def generate_mcp_rollout_with_local_env( if tape_metadata: tape.metadata.other.update(tape_metadata) - t_exec = time.perf_counter() logger.info("Running agent..") tape = execute_agent(agent, tape, environment, max_loops=cfg.agent_max_loops) logger.info("Agent finished") tape.metadata.result.update({"total_execution_time": time.perf_counter() - t_exec}) - + # save_debug_tape(tape) reward_table = RewardTable(**dict(cfg.rewards)) llm_calls: list[LLMCall] = [ @@ -326,6 +330,7 @@ def generate_mcp_rollout_with_local_env( ] assert len(llm_calls) > 0, "No LLM calls 
found" tool_call_counts = count_tool_calls_by_category(llm_calls) + logger.info(f'Use {type(llm)} LLM to generate training texts') training_texts = [make_training_text(llm, llm_call) for llm_call in llm_calls] n_llm_calls = len(llm_calls) answer_status = verify_answer( @@ -397,6 +402,7 @@ def generate_mcp_rollout_with_local_env( # Assign identical reward to all steps in the rollout (pipeline expects uniform rollout_reward) for text in training_texts: + # debug_save_training_text(text) text.reward = reward text.finished = tape_finished @@ -427,5 +433,16 @@ def generate_mcp_rollout_with_local_env( dataset_name=problem["dataset"], llm_url=llm.get_base_url(), ) + except Exception as e: + err_msg = f"Error generating rollout: {e}" + logger.error(err_msg) + raise FailedRollout(err_msg) finally: - environment.close() + try: + environment.close() + except Exception as e: + logger.error(f"Error closing environment: {e}") + +def debug_save_training_text(text: TrainingText): + with open("debug_training_texts.jsonl", "a") as f: + f.write(json.dumps({"text": text.text, "n_predicted": text.n_predicted}, ensure_ascii=False) + "\n") \ No newline at end of file From 74857cff059866702e7c8c1b3c6b7f9858cd5f86 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 6 Oct 2025 17:49:08 +0000 Subject: [PATCH 150/166] llm benchmark scripts update --- llm.sh | 15 ++-- llm_bench.py | 71 +++++++++------- llm_bench_async.py | 200 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 249 insertions(+), 37 deletions(-) create mode 100644 llm_bench_async.py diff --git a/llm.sh b/llm.sh index 9ed54075..78758724 100755 --- a/llm.sh +++ b/llm.sh @@ -14,7 +14,7 @@ echo "Run LLM only" # --config-name mcp -python -m pipelinerl.entrypoints.run_vllm0 \ +python -m pipelinerl.entrypoints.run_vllm1 \ --model Qwen/Qwen3-8B \ --host 0.0.0.0 \ --port 8080 \ @@ -29,12 +29,12 @@ python -m pipelinerl.entrypoints.run_vllm0 \ --disable-frontend-multiprocessing \ --max-num-seqs 256 \ --max-num-batched-tokens 32000 \ + --max_model_len 32000 \ --enable-chunked-prefill \ --return-tokens-as-token-ids \ --tensor-parallel-size 1 \ --pipeline-parallel-size 1 \ --generation-config vllm \ - --max_model_len 32000 \ --enable-auto-tool-choice \ --tool-call-parser rl_tool \ --tool-parser-plugin /home/toolkit/PipelineRL/pipelinerl/rl_tool_parser_plugin.py \ @@ -45,7 +45,7 @@ python -m pipelinerl.entrypoints.run_vllm0 \ # --model Qwen/Qwen2.5-7B \ # --host 0.0.0.0 \ # --port 8080 \ -# --seed 13 \ +# --seed 42 \ # --actor-llm-idx 0 \ # --weight-update-group-init-method tcp://localhost:9000 \ # --weight-update-group-world-size 2 \ @@ -54,14 +54,17 @@ python -m pipelinerl.entrypoints.run_vllm0 \ # --num-scheduler-steps 1 \ # --disable-log-requests \ # --disable-frontend-multiprocessing \ -# --max-num-seqs 64 \ -# --max-num-batched-tokens 1024 \ +# --max-num-seqs 256 \ +# --max-num-batched-tokens 32000 \ # --enable-chunked-prefill \ # --return-tokens-as-token-ids \ # --tensor-parallel-size 1 \ # --pipeline-parallel-size 1 \ # --generation-config vllm \ -# --max_model_len 64000 \ +# --max_model_len 32000 \ +# --enable-auto-tool-choice \ +# --tool-call-parser rl_tool \ +# --tool-parser-plugin /home/toolkit/PipelineRL/pipelinerl/rl_tool_parser_plugin.py \ # --disable-weight-update # python -m pipelinerl.entrypoints.run_vllm0 --model /mnt/llmd/base_models/Mistral-Small-24B-Base-2501 --host 0.0.0.0 --port 8080 --seed 78 --actor-llm-idx 36 --weight-update-group-init-method tcp://dns-99833624-2133-43c0-a112-07520ffee505-0:9000 
--weight-update-group-world-size 49 --dtype bfloat16 --gpu-memory-utilization 0.9 --num-scheduler-steps 1 --disable-log-requests --disable-frontend-multiprocessing --max-num-seqs 256 --max-num-batched-tokens 1024 --enable-chunked-prefill --return-tokens-as-token-ids --tensor-parallel-size 1 --pipeline-parallel-size 1 --generation-config vllm --max_model_len 32768 diff --git a/llm_bench.py b/llm_bench.py index f4a014ee..e61cf342 100644 --- a/llm_bench.py +++ b/llm_bench.py @@ -1,5 +1,6 @@ import json import os +import random import time import numpy as np @@ -10,13 +11,13 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false" llm_url = "http://localhost:8080" -# llm_model = "Qwen/Qwen3-8B" -llm_model = "Qwen/Qwen2.5-7B" -# exp_name = "qwen3-8b" -exp_name = "qwen2.5-7b" +llm_model = "Qwen/Qwen3-8B" +# llm_model = "Qwen/Qwen2.5-7B" +exp_name = "qwen3-8b-v1" +# exp_name = "qwen2.5-7b" +max_tokens = 8192 def llm_quick_response(prompt: str): - t = time.perf_counter() r = requests.post( url=f"{llm_url}/v1/chat/completions", json={ @@ -29,15 +30,13 @@ def llm_quick_response(prompt: str): verify=False, ) d = r.json() - dt = time.perf_counter() - t - return d["choices"][0]["message"]["content"], dt + return d["choices"][0]["message"]["content"] llm = TrainableLLM(base_url=llm_url, model_name=llm_model) response = llm.quick_response("Hello, how are you?") -response2, _ = llm_quick_response("Hello, how are you?") +response = llm_quick_response("Hello, how are you?") assert len(response) > 0 -assert len(response2) > 0 assert llm.tokenizer is not None print("LLM is ready") @@ -55,10 +54,12 @@ def llm_quick_response(prompt: str): print(f"Loaded {len(all_dicts)} texts, total tokens: {total_tokens}") prompts = [d["text"][:-d["n_predicted"]] for d in all_dicts] +random.seed(42) +random.shuffle(prompts) chunk_size = 4 prompts_chunks = [prompts[i:i+chunk_size] for i in range(0, len(prompts), chunk_size)] print(f"Chunked to {len(prompts_chunks)} chunks") - +too_many_chunks = prompts_chunks * 20 def benchmark_llm(n_workers: int): ray.shutdown() @@ -66,21 +67,24 @@ def benchmark_llm(n_workers: int): def get_responses(prompts: str): responses = [] - # local_llm = TrainableLLM(base_url=llm_url, model_name=llm_model) + # local_llm = TrainableLLM(base_url=llm_url, model_name=llm_model, parameters={"max_tokens": max_tokens}) for i, prompt in enumerate(prompts): - r, dt = llm_quick_response(prompt) + t = time.perf_counter() + # r = local_llm.quick_response(prompt) + r = llm_quick_response(prompt) + dt = time.perf_counter() - t responses.append((prompt + r, dt)) return responses remote_fn = ray.remote(get_responses) - t = time.perf_counter() + start_time = time.perf_counter() - chunks = prompts_chunks - if n_workers > len(chunks): - multiplier = n_workers // len(chunks) + 1 - chunks = chunks * multiplier - print(f"Multiplied to {len(chunks)} chunks") + n_chunks = max(200, n_workers * 2) + chunks = too_many_chunks[:n_chunks] + print(f"Multiplied to {len(chunks)} chunks") + random.seed(42) + random.shuffle(chunks) unfinished_tasks = [] for chunk in chunks: unfinished_tasks.append(remote_fn.remote(chunk)) @@ -95,36 +99,41 @@ def get_responses(prompts: str): for finished_task in finished_tasks: responses = ray.get(finished_task) total_finished += 1 - for response, dt in responses: - latencies.append(dt) + for response, latency in responses: + latencies.append(latency) tokens = llm.tokenizer.encode(response) total_tokens += len(tokens) - dt = time.perf_counter() - t + dt = time.perf_counter() - start_time if len(finished_tasks) 
> 0: - print(f"t: {dt:.2f}s, {total_finished} finished, Total tokens: {total_tokens}, tokens/sec: {total_tokens / dt:.2f}") - # if dt > 600: - # print("Timeout 10 minutes, stopping") - # break - time.sleep(1.0) - - final_time = time.perf_counter() - t + print(f"t: {dt:.2f}s, {total_finished} finished, Total tokens: {total_tokens}, tokens/sec: {total_tokens / dt:.2f}, last 10 latency: {np.mean(latencies[-10:]):.2f}s") + with open(f"llm_token_stats_chunk{chunk_size}_{exp_name}_log.jsonl", "a") as f: + ts = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + row = json.dumps({"ts": ts, "exp_name": exp_name, "n_workers": n_workers, "tokens": total_tokens, "dt": dt, "mean_latency": np.mean(latencies), "last_10_latency": np.mean(latencies[-10:]), "total_finished": total_finished, "token_speed": total_tokens / dt}) + f.write(row + "\n") + if len(unfinished_tasks) < n_workers: + print(f"Saturation mode ended, stopping") + break + time.sleep(2.0) + + final_time = time.perf_counter() - start_time print(f"Final, workers:{n_workers}, t:{final_time:.2f}s, total tokens: {total_tokens}, tokens/sec: {total_tokens / final_time:.2f}") ray.shutdown() mean_latency = np.mean(latencies) return total_tokens, final_time, mean_latency stats = {} -for n_workers in [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]: +for n_workers in [128]: #[64, 256, 128, 32, 4, 8, 16, 512, 1024]: # most optimal first print(f"Benchmarking {n_workers} workers..") tokens, dt, mean_latency = benchmark_llm(n_workers) print(f"Done {n_workers} workers: {tokens} tokens, {dt:.2f}s, speed {tokens / dt:.2f} tokens/sec, mean latency: {mean_latency:.2f}s") stats[n_workers] = {"tokens": tokens, "dt": dt, "mean_latency": mean_latency} - with open(f"llm_token_stats_chunk{chunk_size}_{exp_name}.jsonl", "a") as f: + with open(f"llm_token_stats_ray_chunk{chunk_size}_{exp_name}.jsonl", "a") as f: ts = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) row = json.dumps({"ts": ts, "n_workers": n_workers, "tokens": tokens, "dt": dt, "mean_latency": mean_latency}) f.write(row + "\n") + time.sleep(3.0) print("Benchmarking done") -with open(f"llm_token_stats_all_chunk{chunk_size}_{exp_name}.json", "w") as f: +with open(f"llm_token_stats_ray_all_chunk{chunk_size}_{exp_name}.json", "w") as f: json.dump(stats, f, indent=4) print("All stats saved") \ No newline at end of file diff --git a/llm_bench_async.py b/llm_bench_async.py new file mode 100644 index 00000000..0f307576 --- /dev/null +++ b/llm_bench_async.py @@ -0,0 +1,200 @@ +import asyncio +import json +import os +import random +import time + +import aiohttp +import numpy as np +from tapeagents.llms import TrainableLLM + +os.environ["TOKENIZERS_PARALLELISM"] = "false" + +llm_url = "http://localhost:8080" +llm_model = "Qwen/Qwen3-8B" +# llm_model = "Qwen/Qwen2.5-7B" +exp_name = "qwen3-8b-v1" +# exp_name = "qwen2.5-7b" +max_tokens = 8192 + + +async def llm_quick_response_async(session: aiohttp.ClientSession, prompt: str): + """Async version of LLM quick response""" + async with session.post( + url=f"{llm_url}/v1/chat/completions", + json={ + "model": llm_model, + "messages": [{"role": "user", "content": prompt}], + "stream": False, + }, + headers={"Content-Type": "application/json"}, + ssl=False, + ) as response: + d = await response.json() + return d["choices"][0]["message"]["content"] + + + + +# Initial LLM test (synchronous) +llm = TrainableLLM(base_url=llm_url, model_name=llm_model) +response = llm.quick_response("Hello, how are you?") +assert len(response) > 0 +assert llm.tokenizer is not None +print("LLM 
is ready") + + +with open("debug_training_texts.jsonl", "r", encoding="utf-8") as f: + all_dicts = [json.loads(line) for line in f if line.strip()] +total_tokens = 0 +for d in all_dicts: + text = d["text"] + n_predicted = d["n_predicted"] + prompt = text[:-n_predicted] + response = text[-n_predicted:] + tokens = llm.tokenizer.encode(text) + total_tokens += len(tokens) +print(f"Loaded {len(all_dicts)} texts, total tokens: {total_tokens}") + +prompts = [d["text"][:-d["n_predicted"]] for d in all_dicts] +random.seed(42) +random.shuffle(prompts) +chunk_size = 4 +prompts_chunks = [prompts[i:i+chunk_size] for i in range(0, len(prompts), chunk_size)] +print(f"Chunked to {len(prompts_chunks)} chunks") +too_many_chunks = prompts_chunks * 20 + + +async def get_responses_async(session: aiohttp.ClientSession, prompts: list[str], tokenizer): + """Process a chunk of prompts asynchronously""" + responses = [] + for prompt in prompts: + t = time.perf_counter() + try: + r = await llm_quick_response_async(session, prompt) + dt = time.perf_counter() - t + responses.append((prompt + r, dt)) + except Exception as e: + print(f"Error processing prompt: {e}") + dt = time.perf_counter() - t + responses.append((prompt, dt)) + return responses + + +async def benchmark_llm_async(n_workers: int): + """Benchmark LLM using async/await with controlled concurrency""" + print(f"Starting async benchmark with {n_workers} concurrent workers") + + start_time = time.perf_counter() + + n_chunks = max(200, n_workers * 2) + chunks = too_many_chunks[:n_chunks] + print(f"Multiplied to {len(chunks)} chunks") + random.seed(42) + random.shuffle(chunks) + + total_tokens = 0 + total_finished = 0 + latencies = [] + + # Create shared aiohttp session with connection pooling + connector = aiohttp.TCPConnector(limit=n_workers, limit_per_host=n_workers) + timeout = aiohttp.ClientTimeout(total=300) # 5 minute timeout + + async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session: + # Create all tasks + tasks = [] + for chunk in chunks: + task = asyncio.create_task(get_responses_async(session, chunk, llm.tokenizer)) + tasks.append(task) + + print(f"Created {len(tasks)} tasks") + + # Process tasks with controlled concurrency + pending = set(tasks) + active = set() + + while pending or active: + # Fill up active tasks up to n_workers limit + while len(active) < n_workers and pending: + task = pending.pop() + active.add(task) + + if not active: + break + + # Wait for at least one task to complete + done, active = await asyncio.wait(active, timeout=0.1, return_when=asyncio.FIRST_COMPLETED) + + # Process completed tasks + for finished_task in done: + try: + responses = await finished_task + total_finished += 1 + for response, latency in responses: + latencies.append(latency) + tokens = llm.tokenizer.encode(response) + total_tokens += len(tokens) + except Exception as e: + print(f"Task failed with error: {e}") + total_finished += 1 + + # Log progress + dt = time.perf_counter() - start_time + if len(done) > 0: + print(f"t: {dt:.2f}s, {total_finished} finished, Total tokens: {total_tokens}, tokens/sec: {total_tokens / dt:.2f}, last 10 latency: {np.mean(latencies[-10:]) if latencies else 0:.2f}s") + with open(f"llm_token_stats_chunk{chunk_size}_{exp_name}_log.jsonl", "a") as f: + ts = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + row = json.dumps({ + "ts": ts, + "exp_name": exp_name, + "n_workers": n_workers, + "tokens": total_tokens, + "dt": dt, + "mean_latency": np.mean(latencies) if latencies else 0, + 
"last_10_latency": np.mean(latencies[-10:]) if latencies else 0, + "total_finished": total_finished, + "token_speed": total_tokens / dt if dt > 0 else 0 + }) + f.write(row + "\n") + + # Check saturation mode + if len(pending) + len(active) < n_workers: + print(f"Saturation mode ended, stopping") + # Cancel remaining tasks + for task in active: + task.cancel() + break + + await asyncio.sleep(2.0) + + final_time = time.perf_counter() - start_time + print(f"Final, workers:{n_workers}, t:{final_time:.2f}s, total tokens: {total_tokens}, tokens/sec: {total_tokens / final_time:.2f}") + mean_latency = np.mean(latencies) if latencies else 0 + return total_tokens, final_time, mean_latency + + +async def run_benchmarks(): + """Run benchmarks for different worker counts""" + stats = {} + for n_workers in [128]: # [64, 256, 128, 32, 4, 8, 16, 512, 1024]: # most optimal first + print(f"Benchmarking {n_workers} workers..") + tokens, dt, mean_latency = await benchmark_llm_async(n_workers) + print(f"Done {n_workers} workers: {tokens} tokens, {dt:.2f}s, speed {tokens / dt:.2f} tokens/sec, mean latency: {mean_latency:.2f}s") + stats[n_workers] = {"tokens": tokens, "dt": dt, "mean_latency": mean_latency} + with open(f"llm_token_stats_chunk{chunk_size}_{exp_name}.jsonl", "a") as f: + ts = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + row = json.dumps({"ts": ts, "n_workers": n_workers, "tokens": tokens, "dt": dt, "mean_latency": mean_latency}) + f.write(row + "\n") + await asyncio.sleep(3.0) + + print("Benchmarking done") + with open(f"llm_token_stats_all_chunk{chunk_size}_{exp_name}.json", "w") as f: + json.dump(stats, f, indent=4) + print("All stats saved") + + +if __name__ == "__main__": + # Run the async benchmarks + asyncio.run(run_benchmarks()) + From 1332fc29b59f6c8e33d5b59e543c88e2cc4cefbc Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 6 Oct 2025 17:49:30 +0000 Subject: [PATCH 151/166] flag to control ray usage --- conf/base.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/conf/base.yaml b/conf/base.yaml index 638d2c13..5bf30c59 100644 --- a/conf/base.yaml +++ b/conf/base.yaml @@ -5,6 +5,7 @@ defaults: - _self_ seed: 42 +use_ray: false finetune: seed: ${..seed} From 2163313c70faa8f4b17bde39320341d72c10ae19 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 6 Oct 2025 17:51:23 +0000 Subject: [PATCH 152/166] mcp config with ray and local envs --- conf/mcp.yaml | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/conf/mcp.yaml b/conf/mcp.yaml index 330c6c9e..fa911e1a 100644 --- a/conf/mcp.yaml +++ b/conf/mcp.yaml @@ -3,7 +3,10 @@ defaults: - override finetune: grpo - _self_ +use_ray: true + llm: + use_cache: false parameters: max_tokens: 8192 @@ -12,9 +15,11 @@ test_llm: max_tokens: 8192 actor: - rollout_policy: pipelinerl.domains.mcp.generate_mcp_rollout + rollout_policy: pipelinerl.domains.mcp.generate_mcp_rollout_with_local_env system_prompt: Please reason step by step, and put your final answer within \boxed{{}}. 
- llm_max_rollouts: 64 + rollout_workers: 64 + llm_max_rollouts: 256 + problem_queue_size: 256 task_template: |- {task} shared_memory_entry_size: 10000000 @@ -31,26 +36,23 @@ test_dataset_names: - aime_2025 vllm_config: - use_v1: false + use_v1: true vllm_kwargs: enable-auto-tool-choice: "" tool-call-parser: rl_tool tool-parser-plugin: ${hydra:runtime.cwd}/pipelinerl/rl_tool_parser_plugin.py - max-num-seqs: ${actor.llm_max_rollouts} - max-num-batched-tokens: 4096 + max-num-seqs: 256 + max-num-batched-tokens: 32000 max_model_len: 32000 - gpu-memory-utilization: 0.85 + gpu-memory-utilization: 0.9 environment: - _target_: pipelinerl.domains.mcp.env_server.EmbeddedMCPEnvironment + _target_: tapeagents.mcp.MCPEnvironment config_path: ${hydra:runtime.cwd}/conf/mcp/python.json tools_whitelist: - run_python_code read_timeout_seconds: 600 use_cache: false - runtime_pool_workers: 4 - offload_tools: - - run_python_code world: From e6f232911759b43d159eb1742571967819bb8a62 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 6 Oct 2025 17:51:47 +0000 Subject: [PATCH 153/166] update debug entrypoint --- debug.sh | 69 +++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 61 insertions(+), 8 deletions(-) diff --git a/debug.sh b/debug.sh index c1e2822a..a2aca44b 100755 --- a/debug.sh +++ b/debug.sh @@ -1,18 +1,71 @@ #!/bin/bash -python -m pipelinerl.launch \ - output_dir=results/actor_debug1 \ +echo "Run 32 workers" +DEBUG_FILE=timing_debug_workers32_3.jsonl python -m pipelinerl.launch \ + output_dir=results/actor_debug32_3 \ force_restart=true \ - world.env_replicas_per_actor=1 \ - actor.llm_max_rollouts=16 \ + actor.llm_max_rollouts=256 \ finetune.seq_parallel=8 \ eval_every_n_versions=0 \ - actor.rollout_workers=1 \ + actor.rollout_workers=32 \ debug.mode=actor \ world.actor_fraction=8 \ world.finetune_fraction=0 \ world.preprocessor_fraction=0 \ --config-name mcp - # environment.n_envs=4 \ - # environment.mcp_read_timeout_seconds=300 \ - # environment.env_call_timeout=300 \ \ No newline at end of file +# echo "Run 10 workers" +# DEBUG_FILE=timing_debug_gpt_workers10.jsonl python -m pipelinerl.launch \ +# output_dir=results/actor_debug2 \ +# force_restart=true \ +# actor.llm_max_rollouts=16 \ +# finetune.seq_parallel=8 \ +# eval_every_n_versions=0 \ +# actor.rollout_workers=10 \ +# debug.mode=actor \ +# world.actor_fraction=8 \ +# world.finetune_fraction=0 \ +# world.preprocessor_fraction=0 \ +# --config-name mcp + + +# echo "Run 5 workers" +# DEBUG_FILE=timing_debug_gpt_workers5.jsonl python -m pipelinerl.launch \ +# output_dir=results/actor_debug2 \ +# force_restart=true \ +# actor.llm_max_rollouts=16 \ +# finetune.seq_parallel=8 \ +# eval_every_n_versions=0 \ +# actor.rollout_workers=5 \ +# debug.mode=actor \ +# world.actor_fraction=8 \ +# world.finetune_fraction=0 \ +# world.preprocessor_fraction=0 \ +# --config-name mcp + +# echo "Run 40 workers" +# DEBUG_FILE=timing_debug_gpt_workers40.jsonl python -m pipelinerl.launch \ +# output_dir=results/actor_debug2 \ +# force_restart=true \ +# actor.llm_max_rollouts=16 \ +# finetune.seq_parallel=8 \ +# eval_every_n_versions=0 \ +# actor.rollout_workers=40 \ +# debug.mode=actor \ +# world.actor_fraction=8 \ +# world.finetune_fraction=0 \ +# world.preprocessor_fraction=0 \ +# --config-name mcp + +# echo "Run 30 workers" +# DEBUG_FILE=timing_debug_gpt_workers30.jsonl python -m pipelinerl.launch \ +# output_dir=results/actor_debug2 \ +# force_restart=true \ +# actor.llm_max_rollouts=16 \ +# finetune.seq_parallel=8 \ +# 
eval_every_n_versions=0 \ +# actor.rollout_workers=30 \ +# debug.mode=actor \ +# world.actor_fraction=8 \ +# world.finetune_fraction=0 \ +# world.preprocessor_fraction=0 \ +# --config-name mcp \ No newline at end of file From 3dcdf096f6a0f0e5cc4e7e67aaeea2e0952ed1d1 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 8 Oct 2025 12:16:02 +0000 Subject: [PATCH 154/166] better timing logging --- debug.sh | 8 ++++---- pipelinerl/actor.py | 20 +++++++++++++------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/debug.sh b/debug.sh index a2aca44b..b72841c1 100755 --- a/debug.sh +++ b/debug.sh @@ -1,12 +1,12 @@ #!/bin/bash -echo "Run 32 workers" -DEBUG_FILE=timing_debug_workers32_3.jsonl python -m pipelinerl.launch \ - output_dir=results/actor_debug32_3 \ +echo "Run 40 workers" +DEBUG_FILE=timing_debug_workers40_1.jsonl python -m pipelinerl.launch \ + output_dir=results/actor_debug40_1 \ force_restart=true \ actor.llm_max_rollouts=256 \ finetune.seq_parallel=8 \ eval_every_n_versions=0 \ - actor.rollout_workers=32 \ + actor.rollout_workers=40 \ debug.mode=actor \ world.actor_fraction=8 \ world.finetune_fraction=0 \ diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index 44b5daa7..b7193be1 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -627,6 +627,8 @@ def __init__(self, *args, **kwargs): self.finished_problems = [] self.token_count = 0 self.finished_rollouts_count = 0 + self.task_latencies = [] + self.ray_result_latencies = [] def start_backend(self): if not self.ray_ready: @@ -639,9 +641,10 @@ def start_backend(self): rollout_policy: Callable[[DictConfig, TrainableLLM, dict], RolloutResult] = hydra.utils.get_method(self.cfg.actor.rollout_policy) def rollout_wrapper(cfg: DictConfig, llm: TrainableLLM, problem: dict, problem_id: int) -> RolloutResult: + start_ts = time.monotonic() rollout_result: RolloutResult = rollout_policy(cfg, llm, problem) ts = time.monotonic() - return rollout_result, llm.get_base_url(), problem_id, ts + return rollout_result, llm.get_base_url(), problem_id, ts, start_ts self.ray_remote = ray.remote(rollout_wrapper) self.start_time = time.time() @@ -678,13 +681,14 @@ def receive_finished_tasks(self): logger.info(f"Found {len(finished_tasks)} finished tasks, {len(unfinished_tasks)} unfinished tasks left") self.unfinished_tasks = unfinished_tasks dt = time.time() - self.start_time - ray_result_latencies = [] for finished_task in finished_tasks: try: - rollout_result, llm_url, problem_id, inner_ts = ray.get(finished_task) + rollout_result, llm_url, problem_id, stop_ts, start_ts = ray.get(finished_task) + task_dt = stop_ts - start_ts + self.task_latencies.append(task_dt) outer_ts = time.monotonic() - ray_result_latency = outer_ts - inner_ts - ray_result_latencies.append(ray_result_latency) + ray_result_latency = outer_ts - stop_ts + self.ray_result_latencies.append(ray_result_latency) except Exception as e: logger.error(f"Error getting finished ray task: {e}") continue @@ -708,7 +712,8 @@ def receive_finished_tasks(self): f"rollouts finished: {self.finished_rollouts_count}, " f"total tokens: {self.token_count}, " f"gen speed: {self.token_count / dt:.2f} tokens/sec, " - f"ray latency: {np.mean(ray_result_latencies):.4f} seconds" + f"task latency: {np.mean(self.task_latencies[-10:]):.2f} sec, " + f"ray delay: {np.mean(self.ray_result_latencies[-10:]):.4f} sec" ) save_debug_line({ "rollouts_finished": self.finished_rollouts_count, @@ -717,7 +722,8 @@ def receive_finished_tasks(self): "tokens_produced": self.token_count, "dt": dt, 
"token_speed": self.token_count / dt, - "ray_latency": np.mean(ray_result_latencies), + "ray_latency": np.mean(self.ray_result_latencies[-10:]), + "task_latency": np.mean(self.task_latencies[-10:]), }) logger.info(f"LLMs utilization: {self.llms_utilization}") From ec567cc830f7b7ad076d627b40d7b67c0831ede1 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 8 Oct 2025 17:25:46 +0000 Subject: [PATCH 155/166] fixes --- pipelinerl/actor.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index b7193be1..c44e6fd3 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -495,7 +495,7 @@ def run(self, dataset: list[tuple[str, dict]]): group_samples = sum(len(r.training_texts) for r in rollout_results) published_samples += group_samples - samples_in_queue = self.result_queue.qsize() * attempts + samples_in_queue = self.results_ready_to_publish() all_text_dumps = [] for r in rollout_results: for text in r.training_texts: @@ -609,6 +609,9 @@ def stop_tasks(self): def get_new_results(self) -> list[RolloutResult]: return self.result_queue.get(block=False) + def results_ready_to_publish(self) -> int: + return self.result_queue.qsize() * self.cfg.attempts + class ActorLoopRay(ActorLoop): """ @@ -623,6 +626,7 @@ def __init__(self, *args, **kwargs): self.llms_by_url = {llm.get_base_url(): llm for llm in self.llms} self.llms_utilization = {llm.get_base_url(): 0 for llm in self.llms} self.problem_id = 0 + self.attempts = self.cfg.attempts if self.is_training else 1 self.unfinished_problems = defaultdict(list) # up to `attempts` rollout results for each problem self.finished_problems = [] self.token_count = 0 @@ -644,28 +648,28 @@ def rollout_wrapper(cfg: DictConfig, llm: TrainableLLM, problem: dict, problem_i start_ts = time.monotonic() rollout_result: RolloutResult = rollout_policy(cfg, llm, problem) ts = time.monotonic() + logger.info(f"Problem {problem_id} finished in {ts - start_ts:.2f} seconds") return rollout_result, llm.get_base_url(), problem_id, ts, start_ts self.ray_remote = ray.remote(rollout_wrapper) self.start_time = time.time() def have_capacity(self) -> bool: have_capacity = len(self.unfinished_tasks) < self.cfg.actor.problem_queue_size - have_llm = any(self.llms_utilization[llm_url] < self.cfg.actor.llm_max_rollouts for llm_url in self.llms_utilization) - have_capacity = have_capacity and have_llm + have_llm_capacity = any(self.llms_utilization[llm_url] < (self.cfg.actor.llm_max_rollouts - self.attempts) for llm_url in self.llms_utilization) + have_capacity = have_capacity and have_llm_capacity if not have_capacity: time.sleep(0.1) # sleep for a while to avoid quick loops when no capacity return have_capacity def submit_problem(self, problem: dict): - attempts = self.cfg.attempts if self.is_training else 1 - for attempt_number in range(attempts): + for attempt_number in range(self.attempts): llm_url, task_count = min(self.llms_utilization.items(), key=lambda x: x[1]) - logger.info(f"Submitting problem attempt {attempt_number} to the least busy LLM {llm_url} with {task_count} tasks") + logger.info(f"Submitting problem {self.problem_id} attempt {attempt_number}/{self.attempts} to the least busy LLM {llm_url} with {task_count} tasks") llm = self.llms_by_url[llm_url] task_ref = self.ray_remote.remote(self.cfg_dict, llm, problem, self.problem_id) - self.problem_id += 1 self.llms_utilization[llm_url] += 1 self.unfinished_tasks.append(task_ref) + self.problem_id += 1 def stop_tasks(self): ray.shutdown() @@ 
-701,7 +705,7 @@ def receive_finished_tasks(self): self.unfinished_problems[problem_id].append(rollout_result) logger.info(f"Problem {problem_id} has {len(self.unfinished_problems[problem_id])} rollout results") if len(self.unfinished_problems[problem_id]) == self.cfg.attempts: - logger.info(f"Group for problem {problem_id} finished") + logger.info(f"Problem {problem_id} group finished") self.finished_problems.append(self.unfinished_problems[problem_id]) del self.unfinished_problems[problem_id] logger.info(f"{len(self.finished_problems)} finished problems ready to return") @@ -734,6 +738,9 @@ def get_new_results(self) -> list[list[RolloutResult]]: return self.finished_problems.pop(0) return [] + def results_ready_to_publish(self) -> int: + return len(self.finished_problems) * self.cfg.attempts + def run_actor_loop(cfg: DictConfig): set_streams_backend(**cfg.streams) From 72c04a6715bfc7d512f5848a155d9a77f3cadb33 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 8 Oct 2025 17:58:53 +0000 Subject: [PATCH 156/166] fixes --- pipelinerl/actor.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index c44e6fd3..90c30d31 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -314,8 +314,8 @@ def start_backend(self): self.smm.start() # Use SharedMemoryQueue instead of separate problem_queue, result_queue, and io_buffer - self.problem_queue = SharedMemoryQueue(self.smm, self.cfg.actor.problem_queue_size, cfg.actor.shared_memory_entry_size) - self.result_queue = SharedMemoryQueue(self.smm, self.cfg.actor.result_queue_size, cfg.actor.shared_memory_entry_size) + self.problem_queue = SharedMemoryQueue(self.smm, self.cfg.actor.problem_queue_size, self.cfg.actor.shared_memory_entry_size) + self.result_queue = SharedMemoryQueue(self.smm, self.cfg.actor.result_queue_size, self.cfg.actor.shared_memory_entry_size) logger.info(f"Problem queue size: {self.problem_queue.max_size}, result queue size: {self.result_queue.max_size}") logger.info(f"Result queue buffer size: {self.result_queue.get_memory_size() / 2**30} Gb") @@ -523,8 +523,8 @@ def run(self, dataset: list[tuple[str, dict]]): if self.is_training: loop_stats = { "published_samples": published_samples, - "problem_queue_size": self.problem_queue.qsize(), - "result_queue_size": self.result_queue.qsize(), + "problem_queue_size": self.problem_queue_size(), + "result_queue_size": self.result_queue_size(), "finished_groups": finished_groups, "trainer_model_version": trainer_version_to_publish, "time_since_start": time.time() - loop_start_time, @@ -610,7 +610,13 @@ def get_new_results(self) -> list[RolloutResult]: return self.result_queue.get(block=False) def results_ready_to_publish(self) -> int: - return self.result_queue.qsize() * self.cfg.attempts + return self.result_queue_size() * self.cfg.attempts + + def problem_queue_size(self) -> int: + return self.problem_queue.qsize() + + def result_queue_size(self) -> int: + return self.result_queue.qsize() class ActorLoopRay(ActorLoop): @@ -643,6 +649,7 @@ def start_backend(self): else: logger.info("Ray already initialized") + assert self.trainer_state.propagated_weight_version is not None rollout_policy: Callable[[DictConfig, TrainableLLM, dict], RolloutResult] = hydra.utils.get_method(self.cfg.actor.rollout_policy) def rollout_wrapper(cfg: DictConfig, llm: TrainableLLM, problem: dict, problem_id: int) -> RolloutResult: start_ts = time.monotonic() @@ -688,6 +695,7 @@ def receive_finished_tasks(self): for 
finished_task in finished_tasks: try: rollout_result, llm_url, problem_id, stop_ts, start_ts = ray.get(finished_task) + rollout_result.model_version = self.trainer_state.propagated_weight_version task_dt = stop_ts - start_ts self.task_latencies.append(task_dt) outer_ts = time.monotonic() @@ -738,8 +746,11 @@ def get_new_results(self) -> list[list[RolloutResult]]: return self.finished_problems.pop(0) return [] - def results_ready_to_publish(self) -> int: - return len(self.finished_problems) * self.cfg.attempts + def problem_queue_size(self) -> int: + return len(self.unfinished_tasks) + + def result_queue_size(self) -> int: + return len(self.finished_problems) def run_actor_loop(cfg: DictConfig): From 68c95343a9d80bd99ea59f61bf60390a28dfcea2 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Thu, 9 Oct 2025 11:43:06 +0000 Subject: [PATCH 157/166] faster mcp server startup, significant speedup --- conf/mcp/python.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/mcp/python.json b/conf/mcp/python.json index d64cb8eb..fcbb4dcf 100644 --- a/conf/mcp/python.json +++ b/conf/mcp/python.json @@ -4,7 +4,7 @@ "command": "bash", "args": [ "-c", - "JOB_TAG=${MCP_JOB_TAG:-${JOB_ID:-$HOSTNAME}} && BASE=/home/toolkit/.cache && mkdir -p \"$BASE/mcp_tmp/$JOB_TAG\" \"$BASE/deno_mcp/$JOB_TAG\" \"$BASE/tmp/$JOB_TAG\" && export DENO_DIR=\"$BASE/deno_mcp/$JOB_TAG\" TMPDIR=\"$BASE/tmp/$JOB_TAG\" && /home/toolkit/.deno/bin/deno cache jsr:@pydantic/mcp-run-python >/dev/null 2>&1 || true; DIR=$(mktemp -d -p \"$BASE/mcp_tmp/$JOB_TAG\" mcp_XXXXXXXX) && cd \"$DIR\" && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio; EC=$?; cd /; rm -rf \"$DIR\"; exit $EC" + "deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio" ] } } From 872059d2abf5552e9440beafcdf8c229438a1574 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Thu, 9 Oct 2025 12:37:45 +0000 Subject: [PATCH 158/166] fix training texts metadata --- pipelinerl/actor.py | 16 +++++++++++++++- pipelinerl/domains/mcp/rollouts.py | 3 +-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index 90c30d31..56834363 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -74,6 +74,7 @@ def update(self, prompt_tokens: list[int], output_tokens: list[int]): def get_stats(self): if len(self.data.prompt_tokens_window) < self.window_size: + logger.warning(f"Not enough data to compute sliding stats, window size: {self.window_size}, data length: {len(self.data.prompt_tokens_window)}") return None # 1. How many samples do we produce per second? 
@@ -592,6 +593,7 @@ def publish_stats(self, stats_writer: StreamWriter, loop_stats: Dict): stats[f"{prefix}{new_suffix}"] = stats[key] break + logger.info(f"Publish actor stats to wandb: {stats}") if self.cfg.wandb.use_wandb: wandb.log({f"actor/{k}": v for k, v in stats.items()}) stats_writer.write(stats) @@ -631,6 +633,7 @@ def __init__(self, *args, **kwargs): self.unfinished_tasks = [] self.llms_by_url = {llm.get_base_url(): llm for llm in self.llms} self.llms_utilization = {llm.get_base_url(): 0 for llm in self.llms} + self.scheduler_name = f"{'train' if self.is_training else 'test'} ray scheduler" self.problem_id = 0 self.attempts = self.cfg.attempts if self.is_training else 1 self.unfinished_problems = defaultdict(list) # up to `attempts` rollout results for each problem @@ -696,6 +699,15 @@ def receive_finished_tasks(self): try: rollout_result, llm_url, problem_id, stop_ts, start_ts = ray.get(finished_task) rollout_result.model_version = self.trainer_state.propagated_weight_version + full_group_id = f"{self.scheduler_name}_{problem_id}" + rollout_result.group_id = full_group_id + rollout_index = len(self.unfinished_problems[problem_id]) + for step_index, sample in enumerate(rollout_result.training_texts): + # Downstream in the pipeline we'll need these fields in every sample + sample.metadata["model_version"] = rollout_result.model_version + sample.metadata["rollout_index"] = rollout_index + sample.metadata["step_index"] = step_index + sample.group_id = full_group_id task_dt = stop_ts - start_ts self.task_latencies.append(task_dt) outer_ts = time.monotonic() @@ -714,7 +726,9 @@ def receive_finished_tasks(self): logger.info(f"Problem {problem_id} has {len(self.unfinished_problems[problem_id])} rollout results") if len(self.unfinished_problems[problem_id]) == self.cfg.attempts: logger.info(f"Problem {problem_id} group finished") - self.finished_problems.append(self.unfinished_problems[problem_id]) + group = self.unfinished_problems[problem_id] + random.shuffle(group) + self.finished_problems.append(group) del self.unfinished_problems[problem_id] logger.info(f"{len(self.finished_problems)} finished problems ready to return") logger.info( diff --git a/pipelinerl/domains/mcp/rollouts.py b/pipelinerl/domains/mcp/rollouts.py index 2758ed15..360290cb 100644 --- a/pipelinerl/domains/mcp/rollouts.py +++ b/pipelinerl/domains/mcp/rollouts.py @@ -430,8 +430,7 @@ def generate_mcp_rollout_with_local_env( training_texts=training_texts, metrics=metrics, latency=latency, - dataset_name=problem["dataset"], - llm_url=llm.get_base_url(), + dataset_name=problem["dataset"] ) except Exception as e: err_msg = f"Error generating rollout: {e}" From b702489b3bc0ad8e2fb4d54bf4bd21a19c1367cb Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Thu, 9 Oct 2025 13:00:50 +0000 Subject: [PATCH 159/166] fixes --- debug.sh | 8 ++++---- pipelinerl/actor.py | 13 ++++++++++--- pipelinerl/domains/mcp/rollouts.py | 5 +---- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/debug.sh b/debug.sh index b72841c1..115a5bb2 100755 --- a/debug.sh +++ b/debug.sh @@ -1,12 +1,12 @@ #!/bin/bash -echo "Run 40 workers" -DEBUG_FILE=timing_debug_workers40_1.jsonl python -m pipelinerl.launch \ - output_dir=results/actor_debug40_1 \ +echo "Run 38 workers" +DEBUG_FILE=timing_debug_workers38_3.jsonl python -m pipelinerl.launch \ + output_dir=results/actor_debug38_3 \ force_restart=true \ actor.llm_max_rollouts=256 \ finetune.seq_parallel=8 \ eval_every_n_versions=0 \ - actor.rollout_workers=40 \ + actor.rollout_workers=38 \ 
debug.mode=actor \ world.actor_fraction=8 \ world.finetune_fraction=0 \ diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index 56834363..19d3bc34 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -1,4 +1,5 @@ import asyncio +import json import logging import math import multiprocessing as mp @@ -20,7 +21,6 @@ from omegaconf import DictConfig, OmegaConf from pydantic import BaseModel, Field from tapeagents.llms import TrainableLLM -from tapeagents.orchestrator import save_debug_line import wandb from pipelinerl.finetune.logging_ import flatten_dict_config, init_wandb @@ -45,6 +45,11 @@ logger = logging.getLogger(__name__) +def save_debug_line(data:dict): + data["ts"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + fname = os.environ.get("DEBUG_FILE", "timing_debug.jsonl") + with open(fname, "a") as f: + f.write(json.dumps(data, ensure_ascii=False) + "\n") class SlidingWindowData(BaseModel): prompt_tokens_window: list[list[int]] = Field( @@ -73,9 +78,11 @@ def update(self, prompt_tokens: list[int], output_tokens: list[int]): self.data.timestamps.pop(0) def get_stats(self): - if len(self.data.prompt_tokens_window) < self.window_size: - logger.warning(f"Not enough data to compute sliding stats, window size: {self.window_size}, data length: {len(self.data.prompt_tokens_window)}") + if len(self.data.prompt_tokens_window) < 2: + logger.warning("Not enough data to compute sliding stats") return None + elif len(self.data.prompt_tokens_window) < self.window_size: + logger.warning(f"Compute sliding stats over just {len(self.data.prompt_tokens_window)} samples") # 1. How many samples do we produce per second? # 2. How many output tokens do we produce per second? diff --git a/pipelinerl/domains/mcp/rollouts.py b/pipelinerl/domains/mcp/rollouts.py index 360290cb..d3e998c2 100644 --- a/pipelinerl/domains/mcp/rollouts.py +++ b/pipelinerl/domains/mcp/rollouts.py @@ -13,10 +13,9 @@ from tapeagents.agent import DEFAULT, Agent from tapeagents.core import LLMCall, Tape, TrainingText from tapeagents.dialog_tape import UserStep -from tapeagents.llms import LiteLLM from tapeagents.llms.trainable import TrainableLLM from tapeagents.mcp import MCPEnvironment -from tapeagents.orchestrator import async_execute_agent, execute_agent, get_agent_and_env_from_config, save_debug_tape +from tapeagents.orchestrator import async_execute_agent, execute_agent, get_agent_and_env_from_config from tapeagents.remote_environment import AsyncRemoteEnvironment from pipelinerl.async_llm import make_training_text @@ -43,7 +42,6 @@ def _get_embedded_worker(env_cfg: DictConfig, concurrency: int) -> EmbeddedEnvir _embedded_worker.set_concurrency(concurrency) return _embedded_worker - def count_tool_calls_by_category(llm_calls: List[LLMCall]) -> Dict[str, int]: """ Count the number of tool calls for each function name category. 
@@ -319,7 +317,6 @@ def generate_mcp_rollout_with_local_env( tape = execute_agent(agent, tape, environment, max_loops=cfg.agent_max_loops) logger.info("Agent finished") tape.metadata.result.update({"total_execution_time": time.perf_counter() - t_exec}) - # save_debug_tape(tape) reward_table = RewardTable(**dict(cfg.rewards)) llm_calls: list[LLMCall] = [ From dea891a1c50e064ef26d574573fcfdd97015ff02 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Thu, 9 Oct 2025 13:03:27 +0000 Subject: [PATCH 160/166] fix --- pipelinerl/actor.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pipelinerl/actor.py b/pipelinerl/actor.py index 19d3bc34..b5cdb596 100644 --- a/pipelinerl/actor.py +++ b/pipelinerl/actor.py @@ -64,8 +64,9 @@ class SlidingWindowData(BaseModel): class SlidingWindowAggregator: - def __init__(self, window_size: int): + def __init__(self, window_size: int, min_samples: int = 5): self.window_size = window_size + self.min_samples = min_samples self.data = SlidingWindowData() def update(self, prompt_tokens: list[int], output_tokens: list[int]): @@ -78,7 +79,7 @@ def update(self, prompt_tokens: list[int], output_tokens: list[int]): self.data.timestamps.pop(0) def get_stats(self): - if len(self.data.prompt_tokens_window) < 2: + if len(self.data.prompt_tokens_window) < self.min_samples: logger.warning("Not enough data to compute sliding stats") return None elif len(self.data.prompt_tokens_window) < self.window_size: From e6723127558584d5fcf50326a36b6c5ebe872081 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Thu, 9 Oct 2025 13:19:39 +0000 Subject: [PATCH 161/166] make exp dir with all my scripts --- debug.sh => experiments/olmer/env_speed/debug.sh | 4 ++-- llm.sh => experiments/olmer/env_speed/llm.sh | 0 llm_bench.py => experiments/olmer/env_speed/llm_bench.py | 0 .../olmer/env_speed/llm_bench_async.py | 0 4 files changed, 2 insertions(+), 2 deletions(-) rename debug.sh => experiments/olmer/env_speed/debug.sh (95%) rename llm.sh => experiments/olmer/env_speed/llm.sh (100%) rename llm_bench.py => experiments/olmer/env_speed/llm_bench.py (100%) rename llm_bench_async.py => experiments/olmer/env_speed/llm_bench_async.py (100%) diff --git a/debug.sh b/experiments/olmer/env_speed/debug.sh similarity index 95% rename from debug.sh rename to experiments/olmer/env_speed/debug.sh index 115a5bb2..c79ba094 100755 --- a/debug.sh +++ b/experiments/olmer/env_speed/debug.sh @@ -1,7 +1,7 @@ #!/bin/bash echo "Run 38 workers" -DEBUG_FILE=timing_debug_workers38_3.jsonl python -m pipelinerl.launch \ - output_dir=results/actor_debug38_3 \ +DEBUG_FILE=timing_debug_workers38_4.jsonl python -m pipelinerl.launch \ + output_dir=results/actor_debug38_4 \ force_restart=true \ actor.llm_max_rollouts=256 \ finetune.seq_parallel=8 \ diff --git a/llm.sh b/experiments/olmer/env_speed/llm.sh similarity index 100% rename from llm.sh rename to experiments/olmer/env_speed/llm.sh diff --git a/llm_bench.py b/experiments/olmer/env_speed/llm_bench.py similarity index 100% rename from llm_bench.py rename to experiments/olmer/env_speed/llm_bench.py diff --git a/llm_bench_async.py b/experiments/olmer/env_speed/llm_bench_async.py similarity index 100% rename from llm_bench_async.py rename to experiments/olmer/env_speed/llm_bench_async.py From 4ad27f9d6895bd147a2b5b22e560bf87b365f988 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Fri, 10 Oct 2025 17:00:13 +0200 Subject: [PATCH 162/166] move personal scripts out --- experiments/olmer/env_speed/debug.sh | 71 ------- experiments/olmer/env_speed/llm.sh | 
72 ------- experiments/olmer/env_speed/llm_bench.py | 139 ------------ .../olmer/env_speed/llm_bench_async.py | 200 ------------------ 4 files changed, 482 deletions(-) delete mode 100755 experiments/olmer/env_speed/debug.sh delete mode 100755 experiments/olmer/env_speed/llm.sh delete mode 100644 experiments/olmer/env_speed/llm_bench.py delete mode 100644 experiments/olmer/env_speed/llm_bench_async.py diff --git a/experiments/olmer/env_speed/debug.sh b/experiments/olmer/env_speed/debug.sh deleted file mode 100755 index c79ba094..00000000 --- a/experiments/olmer/env_speed/debug.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/bash -echo "Run 38 workers" -DEBUG_FILE=timing_debug_workers38_4.jsonl python -m pipelinerl.launch \ - output_dir=results/actor_debug38_4 \ - force_restart=true \ - actor.llm_max_rollouts=256 \ - finetune.seq_parallel=8 \ - eval_every_n_versions=0 \ - actor.rollout_workers=38 \ - debug.mode=actor \ - world.actor_fraction=8 \ - world.finetune_fraction=0 \ - world.preprocessor_fraction=0 \ - --config-name mcp - -# echo "Run 10 workers" -# DEBUG_FILE=timing_debug_gpt_workers10.jsonl python -m pipelinerl.launch \ -# output_dir=results/actor_debug2 \ -# force_restart=true \ -# actor.llm_max_rollouts=16 \ -# finetune.seq_parallel=8 \ -# eval_every_n_versions=0 \ -# actor.rollout_workers=10 \ -# debug.mode=actor \ -# world.actor_fraction=8 \ -# world.finetune_fraction=0 \ -# world.preprocessor_fraction=0 \ -# --config-name mcp - - -# echo "Run 5 workers" -# DEBUG_FILE=timing_debug_gpt_workers5.jsonl python -m pipelinerl.launch \ -# output_dir=results/actor_debug2 \ -# force_restart=true \ -# actor.llm_max_rollouts=16 \ -# finetune.seq_parallel=8 \ -# eval_every_n_versions=0 \ -# actor.rollout_workers=5 \ -# debug.mode=actor \ -# world.actor_fraction=8 \ -# world.finetune_fraction=0 \ -# world.preprocessor_fraction=0 \ -# --config-name mcp - -# echo "Run 40 workers" -# DEBUG_FILE=timing_debug_gpt_workers40.jsonl python -m pipelinerl.launch \ -# output_dir=results/actor_debug2 \ -# force_restart=true \ -# actor.llm_max_rollouts=16 \ -# finetune.seq_parallel=8 \ -# eval_every_n_versions=0 \ -# actor.rollout_workers=40 \ -# debug.mode=actor \ -# world.actor_fraction=8 \ -# world.finetune_fraction=0 \ -# world.preprocessor_fraction=0 \ -# --config-name mcp - -# echo "Run 30 workers" -# DEBUG_FILE=timing_debug_gpt_workers30.jsonl python -m pipelinerl.launch \ -# output_dir=results/actor_debug2 \ -# force_restart=true \ -# actor.llm_max_rollouts=16 \ -# finetune.seq_parallel=8 \ -# eval_every_n_versions=0 \ -# actor.rollout_workers=30 \ -# debug.mode=actor \ -# world.actor_fraction=8 \ -# world.finetune_fraction=0 \ -# world.preprocessor_fraction=0 \ -# --config-name mcp \ No newline at end of file diff --git a/experiments/olmer/env_speed/llm.sh b/experiments/olmer/env_speed/llm.sh deleted file mode 100755 index 78758724..00000000 --- a/experiments/olmer/env_speed/llm.sh +++ /dev/null @@ -1,72 +0,0 @@ -#!/bin/bash -echo "Run LLM only" - -# python -m pipelinerl.launch \ -# output_dir=results/llm_debug1 \ -# force_restart=true \ -# actor.llm_max_rollouts=16 \ -# finetune.seq_parallel=8 \ -# eval_every_n_versions=0 \ -# debug.mode=llm \ -# world.actor_fraction=8 \ -# world.finetune_fraction=0 \ -# world.preprocessor_fraction=0 \ -# --config-name mcp - - -python -m pipelinerl.entrypoints.run_vllm1 \ - --model Qwen/Qwen3-8B \ - --host 0.0.0.0 \ - --port 8080 \ - --seed 42 \ - --actor-llm-idx 0 \ - --weight-update-group-init-method tcp://localhost:9000 \ - --weight-update-group-world-size 2 \ - 
--dtype bfloat16 \ - --gpu-memory-utilization 0.9 \ - --num-scheduler-steps 1 \ - --disable-log-requests \ - --disable-frontend-multiprocessing \ - --max-num-seqs 256 \ - --max-num-batched-tokens 32000 \ - --max_model_len 32000 \ - --enable-chunked-prefill \ - --return-tokens-as-token-ids \ - --tensor-parallel-size 1 \ - --pipeline-parallel-size 1 \ - --generation-config vllm \ - --enable-auto-tool-choice \ - --tool-call-parser rl_tool \ - --tool-parser-plugin /home/toolkit/PipelineRL/pipelinerl/rl_tool_parser_plugin.py \ - --disable-weight-update - - -# python -m pipelinerl.entrypoints.run_vllm0 \ -# --model Qwen/Qwen2.5-7B \ -# --host 0.0.0.0 \ -# --port 8080 \ -# --seed 42 \ -# --actor-llm-idx 0 \ -# --weight-update-group-init-method tcp://localhost:9000 \ -# --weight-update-group-world-size 2 \ -# --dtype bfloat16 \ -# --gpu-memory-utilization 0.9 \ -# --num-scheduler-steps 1 \ -# --disable-log-requests \ -# --disable-frontend-multiprocessing \ -# --max-num-seqs 256 \ -# --max-num-batched-tokens 32000 \ -# --enable-chunked-prefill \ -# --return-tokens-as-token-ids \ -# --tensor-parallel-size 1 \ -# --pipeline-parallel-size 1 \ -# --generation-config vllm \ -# --max_model_len 32000 \ -# --enable-auto-tool-choice \ -# --tool-call-parser rl_tool \ -# --tool-parser-plugin /home/toolkit/PipelineRL/pipelinerl/rl_tool_parser_plugin.py \ -# --disable-weight-update - -# python -m pipelinerl.entrypoints.run_vllm0 --model /mnt/llmd/base_models/Mistral-Small-24B-Base-2501 --host 0.0.0.0 --port 8080 --seed 78 --actor-llm-idx 36 --weight-update-group-init-method tcp://dns-99833624-2133-43c0-a112-07520ffee505-0:9000 --weight-update-group-world-size 49 --dtype bfloat16 --gpu-memory-utilization 0.9 --num-scheduler-steps 1 --disable-log-requests --disable-frontend-multiprocessing --max-num-seqs 256 --max-num-batched-tokens 1024 --enable-chunked-prefill --return-tokens-as-token-ids --tensor-parallel-size 1 --pipeline-parallel-size 1 --generation-config vllm --max_model_len 32768 - - \ No newline at end of file diff --git a/experiments/olmer/env_speed/llm_bench.py b/experiments/olmer/env_speed/llm_bench.py deleted file mode 100644 index e61cf342..00000000 --- a/experiments/olmer/env_speed/llm_bench.py +++ /dev/null @@ -1,139 +0,0 @@ -import json -import os -import random -import time - -import numpy as np -import ray -import requests -from tapeagents.llms import TrainableLLM - -os.environ["TOKENIZERS_PARALLELISM"] = "false" - -llm_url = "http://localhost:8080" -llm_model = "Qwen/Qwen3-8B" -# llm_model = "Qwen/Qwen2.5-7B" -exp_name = "qwen3-8b-v1" -# exp_name = "qwen2.5-7b" -max_tokens = 8192 - -def llm_quick_response(prompt: str): - r = requests.post( - url=f"{llm_url}/v1/chat/completions", - json={ - "model": llm_model, - "messages": [{"role": "user", "content": prompt}], - "stream": False, - }, - headers={"Content-Type": "application/json"}, - stream=False, - verify=False, - ) - d = r.json() - return d["choices"][0]["message"]["content"] - - -llm = TrainableLLM(base_url=llm_url, model_name=llm_model) -response = llm.quick_response("Hello, how are you?") -response = llm_quick_response("Hello, how are you?") -assert len(response) > 0 -assert llm.tokenizer is not None -print("LLM is ready") - - -with open("debug_training_texts.jsonl", "r", encoding="utf-8") as f: - all_dicts = [json.loads(line) for line in f if line.strip()] -total_tokens = 0 -for d in all_dicts: - text = d["text"] - n_predicted = d["n_predicted"] - prompt = text[:-n_predicted] - response = text[-n_predicted:] - tokens = 
llm.tokenizer.encode(text) - total_tokens += len(tokens) -print(f"Loaded {len(all_dicts)} texts, total tokens: {total_tokens}") - -prompts = [d["text"][:-d["n_predicted"]] for d in all_dicts] -random.seed(42) -random.shuffle(prompts) -chunk_size = 4 -prompts_chunks = [prompts[i:i+chunk_size] for i in range(0, len(prompts), chunk_size)] -print(f"Chunked to {len(prompts_chunks)} chunks") -too_many_chunks = prompts_chunks * 20 - -def benchmark_llm(n_workers: int): - ray.shutdown() - ray.init(num_cpus=n_workers) - - def get_responses(prompts: str): - responses = [] - # local_llm = TrainableLLM(base_url=llm_url, model_name=llm_model, parameters={"max_tokens": max_tokens}) - for i, prompt in enumerate(prompts): - t = time.perf_counter() - # r = local_llm.quick_response(prompt) - r = llm_quick_response(prompt) - dt = time.perf_counter() - t - responses.append((prompt + r, dt)) - return responses - - remote_fn = ray.remote(get_responses) - - start_time = time.perf_counter() - - n_chunks = max(200, n_workers * 2) - chunks = too_many_chunks[:n_chunks] - print(f"Multiplied to {len(chunks)} chunks") - random.seed(42) - random.shuffle(chunks) - unfinished_tasks = [] - for chunk in chunks: - unfinished_tasks.append(remote_fn.remote(chunk)) - - responses = [] - total_tokens = 0 - total_finished = 0 - latencies = [] - print(f"Submitted {len(unfinished_tasks)} tasks") - while unfinished_tasks: - finished_tasks, unfinished_tasks = ray.wait(unfinished_tasks, num_returns=len(unfinished_tasks), timeout=0.1) - for finished_task in finished_tasks: - responses = ray.get(finished_task) - total_finished += 1 - for response, latency in responses: - latencies.append(latency) - tokens = llm.tokenizer.encode(response) - total_tokens += len(tokens) - dt = time.perf_counter() - start_time - if len(finished_tasks) > 0: - print(f"t: {dt:.2f}s, {total_finished} finished, Total tokens: {total_tokens}, tokens/sec: {total_tokens / dt:.2f}, last 10 latency: {np.mean(latencies[-10:]):.2f}s") - with open(f"llm_token_stats_chunk{chunk_size}_{exp_name}_log.jsonl", "a") as f: - ts = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) - row = json.dumps({"ts": ts, "exp_name": exp_name, "n_workers": n_workers, "tokens": total_tokens, "dt": dt, "mean_latency": np.mean(latencies), "last_10_latency": np.mean(latencies[-10:]), "total_finished": total_finished, "token_speed": total_tokens / dt}) - f.write(row + "\n") - if len(unfinished_tasks) < n_workers: - print(f"Saturation mode ended, stopping") - break - time.sleep(2.0) - - final_time = time.perf_counter() - start_time - print(f"Final, workers:{n_workers}, t:{final_time:.2f}s, total tokens: {total_tokens}, tokens/sec: {total_tokens / final_time:.2f}") - ray.shutdown() - mean_latency = np.mean(latencies) - return total_tokens, final_time, mean_latency - -stats = {} -for n_workers in [128]: #[64, 256, 128, 32, 4, 8, 16, 512, 1024]: # most optimal first - print(f"Benchmarking {n_workers} workers..") - tokens, dt, mean_latency = benchmark_llm(n_workers) - print(f"Done {n_workers} workers: {tokens} tokens, {dt:.2f}s, speed {tokens / dt:.2f} tokens/sec, mean latency: {mean_latency:.2f}s") - stats[n_workers] = {"tokens": tokens, "dt": dt, "mean_latency": mean_latency} - with open(f"llm_token_stats_ray_chunk{chunk_size}_{exp_name}.jsonl", "a") as f: - ts = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) - row = json.dumps({"ts": ts, "n_workers": n_workers, "tokens": tokens, "dt": dt, "mean_latency": mean_latency}) - f.write(row + "\n") - time.sleep(3.0) - -print("Benchmarking done") 
-with open(f"llm_token_stats_ray_all_chunk{chunk_size}_{exp_name}.json", "w") as f: - json.dump(stats, f, indent=4) -print("All stats saved") \ No newline at end of file diff --git a/experiments/olmer/env_speed/llm_bench_async.py b/experiments/olmer/env_speed/llm_bench_async.py deleted file mode 100644 index 0f307576..00000000 --- a/experiments/olmer/env_speed/llm_bench_async.py +++ /dev/null @@ -1,200 +0,0 @@ -import asyncio -import json -import os -import random -import time - -import aiohttp -import numpy as np -from tapeagents.llms import TrainableLLM - -os.environ["TOKENIZERS_PARALLELISM"] = "false" - -llm_url = "http://localhost:8080" -llm_model = "Qwen/Qwen3-8B" -# llm_model = "Qwen/Qwen2.5-7B" -exp_name = "qwen3-8b-v1" -# exp_name = "qwen2.5-7b" -max_tokens = 8192 - - -async def llm_quick_response_async(session: aiohttp.ClientSession, prompt: str): - """Async version of LLM quick response""" - async with session.post( - url=f"{llm_url}/v1/chat/completions", - json={ - "model": llm_model, - "messages": [{"role": "user", "content": prompt}], - "stream": False, - }, - headers={"Content-Type": "application/json"}, - ssl=False, - ) as response: - d = await response.json() - return d["choices"][0]["message"]["content"] - - - - -# Initial LLM test (synchronous) -llm = TrainableLLM(base_url=llm_url, model_name=llm_model) -response = llm.quick_response("Hello, how are you?") -assert len(response) > 0 -assert llm.tokenizer is not None -print("LLM is ready") - - -with open("debug_training_texts.jsonl", "r", encoding="utf-8") as f: - all_dicts = [json.loads(line) for line in f if line.strip()] -total_tokens = 0 -for d in all_dicts: - text = d["text"] - n_predicted = d["n_predicted"] - prompt = text[:-n_predicted] - response = text[-n_predicted:] - tokens = llm.tokenizer.encode(text) - total_tokens += len(tokens) -print(f"Loaded {len(all_dicts)} texts, total tokens: {total_tokens}") - -prompts = [d["text"][:-d["n_predicted"]] for d in all_dicts] -random.seed(42) -random.shuffle(prompts) -chunk_size = 4 -prompts_chunks = [prompts[i:i+chunk_size] for i in range(0, len(prompts), chunk_size)] -print(f"Chunked to {len(prompts_chunks)} chunks") -too_many_chunks = prompts_chunks * 20 - - -async def get_responses_async(session: aiohttp.ClientSession, prompts: list[str], tokenizer): - """Process a chunk of prompts asynchronously""" - responses = [] - for prompt in prompts: - t = time.perf_counter() - try: - r = await llm_quick_response_async(session, prompt) - dt = time.perf_counter() - t - responses.append((prompt + r, dt)) - except Exception as e: - print(f"Error processing prompt: {e}") - dt = time.perf_counter() - t - responses.append((prompt, dt)) - return responses - - -async def benchmark_llm_async(n_workers: int): - """Benchmark LLM using async/await with controlled concurrency""" - print(f"Starting async benchmark with {n_workers} concurrent workers") - - start_time = time.perf_counter() - - n_chunks = max(200, n_workers * 2) - chunks = too_many_chunks[:n_chunks] - print(f"Multiplied to {len(chunks)} chunks") - random.seed(42) - random.shuffle(chunks) - - total_tokens = 0 - total_finished = 0 - latencies = [] - - # Create shared aiohttp session with connection pooling - connector = aiohttp.TCPConnector(limit=n_workers, limit_per_host=n_workers) - timeout = aiohttp.ClientTimeout(total=300) # 5 minute timeout - - async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session: - # Create all tasks - tasks = [] - for chunk in chunks: - task = 
asyncio.create_task(get_responses_async(session, chunk, llm.tokenizer)) - tasks.append(task) - - print(f"Created {len(tasks)} tasks") - - # Process tasks with controlled concurrency - pending = set(tasks) - active = set() - - while pending or active: - # Fill up active tasks up to n_workers limit - while len(active) < n_workers and pending: - task = pending.pop() - active.add(task) - - if not active: - break - - # Wait for at least one task to complete - done, active = await asyncio.wait(active, timeout=0.1, return_when=asyncio.FIRST_COMPLETED) - - # Process completed tasks - for finished_task in done: - try: - responses = await finished_task - total_finished += 1 - for response, latency in responses: - latencies.append(latency) - tokens = llm.tokenizer.encode(response) - total_tokens += len(tokens) - except Exception as e: - print(f"Task failed with error: {e}") - total_finished += 1 - - # Log progress - dt = time.perf_counter() - start_time - if len(done) > 0: - print(f"t: {dt:.2f}s, {total_finished} finished, Total tokens: {total_tokens}, tokens/sec: {total_tokens / dt:.2f}, last 10 latency: {np.mean(latencies[-10:]) if latencies else 0:.2f}s") - with open(f"llm_token_stats_chunk{chunk_size}_{exp_name}_log.jsonl", "a") as f: - ts = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) - row = json.dumps({ - "ts": ts, - "exp_name": exp_name, - "n_workers": n_workers, - "tokens": total_tokens, - "dt": dt, - "mean_latency": np.mean(latencies) if latencies else 0, - "last_10_latency": np.mean(latencies[-10:]) if latencies else 0, - "total_finished": total_finished, - "token_speed": total_tokens / dt if dt > 0 else 0 - }) - f.write(row + "\n") - - # Check saturation mode - if len(pending) + len(active) < n_workers: - print(f"Saturation mode ended, stopping") - # Cancel remaining tasks - for task in active: - task.cancel() - break - - await asyncio.sleep(2.0) - - final_time = time.perf_counter() - start_time - print(f"Final, workers:{n_workers}, t:{final_time:.2f}s, total tokens: {total_tokens}, tokens/sec: {total_tokens / final_time:.2f}") - mean_latency = np.mean(latencies) if latencies else 0 - return total_tokens, final_time, mean_latency - - -async def run_benchmarks(): - """Run benchmarks for different worker counts""" - stats = {} - for n_workers in [128]: # [64, 256, 128, 32, 4, 8, 16, 512, 1024]: # most optimal first - print(f"Benchmarking {n_workers} workers..") - tokens, dt, mean_latency = await benchmark_llm_async(n_workers) - print(f"Done {n_workers} workers: {tokens} tokens, {dt:.2f}s, speed {tokens / dt:.2f} tokens/sec, mean latency: {mean_latency:.2f}s") - stats[n_workers] = {"tokens": tokens, "dt": dt, "mean_latency": mean_latency} - with open(f"llm_token_stats_chunk{chunk_size}_{exp_name}.jsonl", "a") as f: - ts = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) - row = json.dumps({"ts": ts, "n_workers": n_workers, "tokens": tokens, "dt": dt, "mean_latency": mean_latency}) - f.write(row + "\n") - await asyncio.sleep(3.0) - - print("Benchmarking done") - with open(f"llm_token_stats_all_chunk{chunk_size}_{exp_name}.json", "w") as f: - json.dump(stats, f, indent=4) - print("All stats saved") - - -if __name__ == "__main__": - # Run the async benchmarks - asyncio.run(run_benchmarks()) - From 41a080dead71a3ffd451b9b010fbc37ff6803b6a Mon Sep 17 00:00:00 2001 From: rafapi Date: Fri, 10 Oct 2025 16:03:55 +0000 Subject: [PATCH 163/166] Remove test reward shaping --- conf/mcp.yaml | 21 +++------ pipelinerl/domains/mcp/rollouts.py | 68 ++++++------------------------ 2 files changed, 
17 insertions(+), 72 deletions(-) diff --git a/conf/mcp.yaml b/conf/mcp.yaml index 43ebf586..a2fa2bb4 100644 --- a/conf/mcp.yaml +++ b/conf/mcp.yaml @@ -11,6 +11,10 @@ test_llm: parameters: max_tokens: 8192 +rewards: + correct_answer_not_finished: 0.0 + buffer_tokens: 2000 + actor: rollout_policy: pipelinerl.domains.mcp.generate_mcp_rollout system_prompt: Please reason step by step, and put your final answer within \boxed{{}}. @@ -146,19 +150,4 @@ agent: next_node: code # model_path: Qwen/Qwen3-8B -model_path: /mnt/llmd/base_models/ServiceNow-AI/7_9_25_14b_text_reasoning_sft - -# Local reward shaping for tool usage -python_tool_shaping: - bonus_on_correct_with_python: 0.2 - penalty_on_incorrect_without_python: 0.1 - max_abs: 0.2 - -# Encourage concise outputs (penalize long completions) -length_shaping: - target_ratio: 0.1 # 10% of max_tokens; auto scales with max_tokens - min_target_tokens: 256 # lower clamp - max_target_tokens: 2048 # upper clamp - slope: 0.001 # penalty per token beyond target - max_penalty: 0.2 # clamp absolute penalty - bonus_on_short_correct: 0.05 # bonus if correct and concise +model_path: /mnt/llmd/base_models/ServiceNow-AI/7_9_25_14b_text_reasoning_sft \ No newline at end of file diff --git a/pipelinerl/domains/mcp/rollouts.py b/pipelinerl/domains/mcp/rollouts.py index f62f0567..861c5fae 100644 --- a/pipelinerl/domains/mcp/rollouts.py +++ b/pipelinerl/domains/mcp/rollouts.py @@ -22,7 +22,7 @@ from tapeagents.remote_environment import AsyncRemoteEnvironment from pipelinerl.domains.mcp.env_server import EmbeddedEnvironmentWorker -from pipelinerl.domains.math import RewardTable, get_reward, verify_answer, verify_answer_rpc +from pipelinerl.domains.math import RewardTable, get_reward, verify_answer, verify_answer_rpc, length_penalty from pipelinerl.rollouts import RolloutResult, BaseMetrics logger = logging.getLogger(__name__) @@ -192,63 +192,19 @@ async def generate_mcp_rollout( tape_finished = True if isinstance(tape.steps[-1], MathAnswer) else False base_reward = get_reward(answer_status, tape_finished, reward_table) - # Local reward shaping (configurable in conf/mcp.yaml) - total_shaping = 0.0 - shaping_cfg = getattr(cfg, "python_tool_shaping", None) - if shaping_cfg is not None: - num_python_calls = tool_call_counts.get("run_python_code", 0) - bonus_on_correct_with_python = float(getattr(shaping_cfg, "bonus_on_correct_with_python", 0.0)) - penalty_on_incorrect_without_python = float(getattr(shaping_cfg, "penalty_on_incorrect_without_python", 0.0)) - max_abs = float(getattr(shaping_cfg, "max_abs", 0.2)) + reward = base_reward - # Episode-level bonuses/penalties - if answer_status == "correct" and num_python_calls >= 1: - total_shaping += bonus_on_correct_with_python - if answer_status in ("wrong", "unparsable") and num_python_calls == 0: - total_shaping -= penalty_on_incorrect_without_python + discount_factor = float(getattr(cfg.actor, "discount_factor", 1.0)) + if discount_factor != 1.0: + total_generated_tokens = sum(getattr(call, "output_length_tokens", 0) for call in llm_calls) + reward *= discount_factor ** total_generated_tokens - # Clamp total shaping - if total_shaping > max_abs: - total_shaping = max_abs - if total_shaping < -max_abs: - total_shaping = -max_abs - - # Length shaping: discourage very long completions; award concise correct ones - length_cfg = getattr(cfg, "length_shaping", None) - if length_cfg is not None: - try: - # Prefer ratio-based target if provided; otherwise use absolute - if hasattr(length_cfg, "target_ratio"): - ratio = 
float(getattr(length_cfg, "target_ratio")) - max_gen = int(llm.parameters.get("max_tokens", 2048)) - target_tokens = int(max(1, ratio * max_gen)) - # Optional clamps - min_t = int(getattr(length_cfg, "min_target_tokens", 0)) - max_t = int(getattr(length_cfg, "max_target_tokens", 10**9)) - target_tokens = max(min_t, min(max_t, target_tokens)) - else: - target_tokens = int(getattr(length_cfg, "target_output_tokens", 512)) - slope = float(getattr(length_cfg, "slope", 0.0)) - max_penalty = float(getattr(length_cfg, "max_penalty", 0.0)) - bonus_short_correct = float(getattr(length_cfg, "bonus_on_short_correct", 0.0)) - except Exception: - target_tokens, slope, max_penalty, bonus_short_correct = 512, 0.0, 0.0, 0.0 - - # average output tokens across llm calls for this rollout - try: - avg_output_tokens = sum(t.output_tokens for t in training_texts) / max(1, len(training_texts)) - except Exception: - avg_output_tokens = 0.0 - - if slope > 0.0 and max_penalty > 0.0 and avg_output_tokens > target_tokens: - over_by = float(avg_output_tokens - target_tokens) - penalty = min(max_penalty, slope * over_by) - total_shaping -= penalty - - if bonus_short_correct > 0.0 and answer_status == "correct" and avg_output_tokens <= target_tokens: - total_shaping += bonus_short_correct - - reward = base_reward + total_shaping + buffer_tokens = getattr(reward_table, "buffer_tokens", 0) + if buffer_tokens: + max_tokens = int(llm.parameters.get("max_tokens", 0)) + total_output_tokens = sum(getattr(text, "output_tokens", 0) for text in training_texts) + if max_tokens > 0: + reward += length_penalty(max_tokens, total_output_tokens, buffer_tokens) # Assign identical reward to all steps in the rollout (pipeline expects uniform rollout_reward) for text in training_texts: From bd234113564717b0982f475f2a075d3f38a70ca1 Mon Sep 17 00:00:00 2001 From: rafapi Date: Fri, 10 Oct 2025 16:29:17 +0000 Subject: [PATCH 164/166] Fix imports --- conf/mcp.yaml | 4 ++-- pipelinerl/domains/mcp/rollouts.py | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/conf/mcp.yaml b/conf/mcp.yaml index a2fa2bb4..27c48a42 100644 --- a/conf/mcp.yaml +++ b/conf/mcp.yaml @@ -149,5 +149,5 @@ agent: trim_obs_except_last_n: 2 next_node: code -# model_path: Qwen/Qwen3-8B -model_path: /mnt/llmd/base_models/ServiceNow-AI/7_9_25_14b_text_reasoning_sft \ No newline at end of file +model_path: Qwen/Qwen3-8B +# model_path: /mnt/llmd/base_models/ServiceNow-AI/7_9_25_14b_text_reasoning_sft \ No newline at end of file diff --git a/pipelinerl/domains/mcp/rollouts.py b/pipelinerl/domains/mcp/rollouts.py index 861c5fae..b3116672 100644 --- a/pipelinerl/domains/mcp/rollouts.py +++ b/pipelinerl/domains/mcp/rollouts.py @@ -21,7 +21,10 @@ from tapeagents.core import LLMCall from tapeagents.remote_environment import AsyncRemoteEnvironment +from pipelinerl.async_llm import make_training_text from pipelinerl.domains.mcp.env_server import EmbeddedEnvironmentWorker +from pipelinerl.domains.mcp.steps import MathAnswer +from pipelinerl.world import Job from pipelinerl.domains.math import RewardTable, get_reward, verify_answer, verify_answer_rpc, length_penalty from pipelinerl.rollouts import RolloutResult, BaseMetrics From 50f3ff9699dec81d2f3a3b30254154c7b947d10b Mon Sep 17 00:00:00 2001 From: rafapi Date: Fri, 10 Oct 2025 16:35:03 +0000 Subject: [PATCH 165/166] Fix conflicts --- conf/mcp.yaml | 8 ++++---- pipelinerl/domains/mcp/rollouts.py | 6 ------ 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/conf/mcp.yaml b/conf/mcp.yaml index 
ec44ace0..ca6da70e 100644
--- a/conf/mcp.yaml
+++ b/conf/mcp.yaml
@@ -151,6 +151,7 @@ agent:
   trim_obs_except_last_n: 2
   next_node: code
 
+<<<<<<< HEAD
 <<<<<<< HEAD
 model_path: Qwen/Qwen3-8B
 # model_path: /mnt/llmd/base_models/ServiceNow-AI/7_9_25_14b_text_reasoning_sft
@@ -169,7 +170,6 @@ length_shaping:
   slope: 0.001 # penalty per token beyond target
   max_penalty: 0.2 # clamp absolute penalty
   bonus_on_short_correct: 0.05 # bonus if correct and concise
-=======
-# model_path: Qwen/Qwen3-8B
-model_path: /mnt/llmd/base_models/ServiceNow-AI/7_9_25_14b_text_reasoning_sft
->>>>>>> mcp_tir
+
+model_path: Qwen/Qwen3-8B
+# model_path: /mnt/llmd/base_models/ServiceNow-AI/7_9_25_14b_text_reasoning_sft
diff --git a/pipelinerl/domains/mcp/rollouts.py b/pipelinerl/domains/mcp/rollouts.py
index a71a7d2f..782d4978 100644
--- a/pipelinerl/domains/mcp/rollouts.py
+++ b/pipelinerl/domains/mcp/rollouts.py
@@ -18,18 +18,12 @@
 from tapeagents.orchestrator import async_execute_agent, execute_agent, get_agent_and_env_from_config
 from tapeagents.remote_environment import AsyncRemoteEnvironment
 
-<<<<<<< HEAD
 from pipelinerl.async_llm import make_training_text
-from pipelinerl.domains.math import RewardTable, get_reward, verify_answer, verify_answer_rpc
 from pipelinerl.domains.mcp.env_server import EmbeddedEnvironmentWorker
 from pipelinerl.domains.mcp.steps import MathAnswer
-from pipelinerl.rollouts import BaseMetrics, RolloutResult
 from pipelinerl.world import Job
-=======
-from pipelinerl.domains.mcp.env_server import EmbeddedEnvironmentWorker
 from pipelinerl.domains.math import RewardTable, get_reward, verify_answer, verify_answer_rpc, length_penalty
 from pipelinerl.rollouts import RolloutResult, BaseMetrics
->>>>>>> mcp_tir
 
 logger = logging.getLogger(__name__)
 

From d9a65b3105be673f0aee1d6187f9d210153c6d39 Mon Sep 17 00:00:00 2001
From: rafapi
Date: Fri, 10 Oct 2025 18:38:33 +0000
Subject: [PATCH 166/166] Fix: export length_penalty from math domain

---
 pipelinerl/domains/math/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipelinerl/domains/math/__init__.py b/pipelinerl/domains/math/__init__.py
index 1c7310f2..7a9809b7 100644
--- a/pipelinerl/domains/math/__init__.py
+++ b/pipelinerl/domains/math/__init__.py
@@ -1,3 +1,3 @@
 from .load_datasets import load_datasets
-from .rollouts import generate_math_rollout, RewardTable, get_reward
+from .rollouts import generate_math_rollout, RewardTable, get_reward, length_penalty
 from .verifier_api import MathEnvironment, verify_answer, verify_answer_rpc
\ No newline at end of file
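
Note on length_penalty: the reward code added in PATCH 163 calls length_penalty(max_tokens, total_output_tokens, buffer_tokens), and PATCH 166 re-exports the helper from pipelinerl.domains.math, but its definition in pipelinerl/domains/math/rollouts.py never appears in this series. A minimal sketch of a linear over-length ramp that would be consistent with the call site and with the rewards.buffer_tokens: 2000 default in conf/mcp.yaml (the exact shape and scale are assumptions, not the repository's actual implementation):

# Hypothetical sketch; the real length_penalty lives in
# pipelinerl/domains/math/rollouts.py and is not shown in this patch series.
def length_penalty(max_tokens: int, output_tokens: int, buffer_tokens: int) -> float:
    """Non-positive reward adjustment that ramps linearly from 0 to -1 as
    output_tokens moves through the last buffer_tokens before max_tokens."""
    threshold = max_tokens - buffer_tokens
    if output_tokens <= threshold:
        return 0.0
    # Fraction of the buffer consumed, clamped once the generation limit is hit.
    overflow = min(output_tokens - threshold, buffer_tokens)
    return -overflow / buffer_tokens

# With the conf/mcp.yaml defaults (max_tokens: 8192, buffer_tokens: 2000):
assert length_penalty(8192, 6192, 2000) == 0.0   # still under the threshold
assert length_penalty(8192, 7192, 2000) == -0.5  # halfway into the buffer
assert length_penalty(8192, 9000, 2000) == -1.0  # at or past the limit

A ramp of this form leaves short completions untouched and only starts subtracting reward once a rollout eats into the final buffer_tokens before the generation limit, which matches how the call site gates the adjustment on a non-zero buffer_tokens and max_tokens > 0.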