From 7a480ae462ad2f79fc8b5e34292d6e76b3c2bd7f Mon Sep 17 00:00:00 2001 From: BY571 Date: Tue, 28 May 2024 14:30:04 +0200 Subject: [PATCH 01/53] init keyboard agent --- conf/agent/keyboard.yaml | 6 ++ conf/config.yaml | 4 +- experiments/roboarm/train.py | 2 +- src/agents/__init__.py | 10 ++- src/agents/keyboard.py | 139 +++++++++++++++++++++++++++++++++++ 5 files changed, 157 insertions(+), 4 deletions(-) create mode 100644 conf/agent/keyboard.yaml create mode 100644 src/agents/keyboard.py diff --git a/conf/agent/keyboard.yaml b/conf/agent/keyboard.yaml new file mode 100644 index 0000000..39d1ec7 --- /dev/null +++ b/conf/agent/keyboard.yaml @@ -0,0 +1,6 @@ +name: keyboard + +batch_size: 256 +buffer_size: 1000000 +num_updates: 2500 +prefill_episodes: 0 \ No newline at end of file diff --git a/conf/config.yaml b/conf/config.yaml index 609b137..2bae809 100644 --- a/conf/config.yaml +++ b/conf/config.yaml @@ -9,5 +9,5 @@ episodes: 200 defaults: - _self_ # random, sac, td3, droq - - agent: sac - - env: roboarm_sim-v0 \ No newline at end of file + - agent: keyboard + - env: roboarm-v0 \ No newline at end of file diff --git a/experiments/roboarm/train.py b/experiments/roboarm/train.py index 39314b0..ef45e74 100644 --- a/experiments/roboarm/train.py +++ b/experiments/roboarm/train.py @@ -41,7 +41,7 @@ def run(cfg: DictConfig) -> None: # initialize wandb wandb.init(project=project_name) wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True) - wandb.watch(agent.actor, log_freq=1) if agent.actor else None + # wandb.watch(agent.actor, log_freq=1) if agent.actor else None # prefill buffer with random actions prefill_buffer( diff --git a/src/agents/__init__.py b/src/agents/__init__.py index 7715150..294b839 100644 --- a/src/agents/__init__.py +++ b/src/agents/__init__.py @@ -1,9 +1,10 @@ from src.agents.behavior_cloning import BehavioralCloningAgent +from src.agents.keyboard import KeyboardAgent from src.agents.random import RandomAgent from src.agents.sac import SACAgent from src.agents.td3 import TD3Agent -all_agents = ["td3", "sac", "bc", "random"] +all_agents = ["td3", "sac", "bc", "random", "keyboard"] def get_agent(action_spec, state_spec, cfg): @@ -35,6 +36,13 @@ def get_agent(action_spec, state_spec, cfg): agent_config=cfg.agent, device=cfg.device, ) + elif cfg.agent.name == "keyboard": + agent = KeyboardAgent( + action_spec=action_spec, + state_spec=state_spec, + agent_config=cfg.agent, + device=cfg.device, + ) else: raise NotImplementedError( f"Agent {cfg.agent.name} not implemented, please choose from {all_agents}" diff --git a/src/agents/keyboard.py b/src/agents/keyboard.py new file mode 100644 index 0000000..5952341 --- /dev/null +++ b/src/agents/keyboard.py @@ -0,0 +1,139 @@ +import time + +import tensordict as td +import torch +from pynput import keyboard +from tensordict import TensorDictBase +from torchrl.data import TensorDictPrioritizedReplayBuffer, TensorDictReplayBuffer +from torchrl.data.replay_buffers.storages import LazyMemmapStorage, LazyTensorStorage + +from src.agents.base import BaseAgent + + +class KeyboardAgent(BaseAgent): + def __init__(self, state_spec, action_spec, agent_config, device="cpu"): + super(KeyboardAgent, self).__init__( + state_spec, action_spec, agent_config.name, device + ) + + # Define the key to action mapping + self.key_action_mapping = { + "a": [-0.15, 0, 0, 0], # Rotate motor -30 + "d": [0.15, 0, 0, 0], # Rotate motor +30 + "s": [0, -0.25, 0, 0], # Low motor -10 + "w": [0, 0.25, 0, 0], # Low motor +10 + "q": [0, 0, -0.25, 0], # 
High motor -15 + "e": [0, 0, 0.25, 0], # High motor +15 + "f": [0, 0, 0, -0.25], # Grab motor -10 + "g": [0, 0, 0, 0.25], # Grab motor +10 + } + self.current_action = None + self.setup_key_listener() + + # Define Replay Buffer + self.replay_buffer = self.create_replay_buffer( + batch_size=agent_config.batch_size, + prb=False, + buffer_size=agent_config.buffer_size, + device=device, + ) + + # general stats + self.collected_transitions = 0 + self.total_updates = 0 + + def setup_key_listener(self): + def on_press(key): + try: + if key.char in self.key_action_mapping: + self.current_action = self.key_action_mapping[key.char] + except AttributeError: + pass + + def on_release(key): + self.current_action = None + + self.listener = keyboard.Listener(on_press=on_press, on_release=on_release) + self.listener.start() + + def load_model(self, path): + """load model""" + try: + statedict = torch.load(path) + self.actor.load_state_dict(statedict["actor"]) + self.critic.load_state_dict(statedict["critic"]) + print("Model loaded") + except: + raise ValueError("Model not loaded") + + def load_replaybuffer(self, path): + """load replay buffer""" + try: + self.replay_buffer.load_state_dict(torch.load(path)) + print("Replay Buffer loaded") + print("Replay Buffer size: ", self.replay_buffer.__len__(), "\n") + except: + raise ValueError("Replay Buffer not loaded") + + def eval(self): + """Sets the agent to evaluation mode.""" + pass + + def create_replay_buffer( + self, + batch_size=256, + prb=False, + buffer_size=100000, + buffer_scratch_dir=None, + device="cpu", + prefetch=3, + ): + """Create replay buffer""" + if prb: + replay_buffer = TensorDictPrioritizedReplayBuffer( + alpha=0.7, + beta=0.5, + pin_memory=False, + prefetch=1, + storage=LazyTensorStorage( + buffer_size, + device=device, + ), + ) + else: + replay_buffer = TensorDictReplayBuffer( + pin_memory=False, + prefetch=prefetch, + storage=LazyMemmapStorage( + buffer_size, + scratch_dir=buffer_scratch_dir, + device=device, + ), + batch_size=batch_size, + ) + return replay_buffer + + @torch.no_grad() + def get_action(self, td: TensorDictBase) -> TensorDictBase: + """Get action from actor network or keyboard""" + while self.current_action is None: + time.sleep(0.01) # Add a small sleep to avoid blocking + td.set("action", torch.tensor(self.current_action).float().unsqueeze(0)) + return td + + @torch.no_grad() + def get_eval_action(self, td: TensorDictBase) -> TensorDictBase: + """Get eval action from actor network""" + with set_exploration_type(ExplorationType.MODE): + out_td = self.actor(td.to(self.device)) + self.td_preprocessing(out_td) + return out_td + + def add_experience(self, transition: td.TensorDict): + """Add experience to replay buffer""" + self.replay_buffer.extend(transition) + self.collected_transitions += 1 + + def train(self, batch_size=64, num_updates=1): + """Train the agent""" + return {} From 414b9698098052707a7022504c15529dfb9409e9 Mon Sep 17 00:00:00 2001 From: BY571 Date: Wed, 29 May 2024 19:48:16 +0200 Subject: [PATCH 02/53] add pretrain script for offlineRL --- experiments/walker/pretrain.py | 61 ++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 experiments/walker/pretrain.py diff --git a/experiments/walker/pretrain.py b/experiments/walker/pretrain.py new file mode 100644 index 0000000..e1c59a1 --- /dev/null +++ b/experiments/walker/pretrain.py @@ -0,0 +1,61 @@ +import os +import sys + +import hydra +import wandb +from omegaconf import DictConfig, OmegaConf +from tqdm import tqdm + +# Add 
the project root to PYTHONPATH +project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) +if project_root not in sys.path: + sys.path.insert(0, project_root) + +from environments import make_env +from src.agents import get_agent +from src.utils import login, logout, setup_check, tensordict2dict + + +@hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config") +def run(cfg: DictConfig) -> None: + print(OmegaConf.to_yaml(cfg)) + + # make environment. + setup_check(robot="walker", config=cfg) + env, action_space, state_space = make_env(cfg) + + # make agent + agent, project_name = get_agent(action_space, state_space, cfg) + login(agent) + + # initialize wandb + wandb.init(project=project_name) + wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True) + wandb.watch(agent.actor, log_freq=1) if agent.actor else None + + batch_size = cfg.agent.batch_size + num_updates = cfg.agent.num_updates + train_episodes = cfg.episodes + print("Start training...") + try: + for e in tqdm(range(train_episodes), desc="Training"): + + loss_info = agent.train(batch_size=batch_size, num_updates=num_updates) + + # Metrics Logging + log_dict = { + "epoch": e, + "buffer_size": agent.replay_buffer.__len__(), + } + log_dict.update(tensordict2dict(loss_info)) + wandb.log(log_dict) + + except KeyboardInterrupt: + print("Training interrupted by user.") + + logout(agent) + env.close() + + +if __name__ == "__main__": + run() From f78b7f5ff02b7dc8957c433d0a691a1dde458c4c Mon Sep 17 00:00:00 2001 From: BY571 Date: Thu, 30 May 2024 11:44:25 +0200 Subject: [PATCH 03/53] add iql --- conf/agent/iql.yaml | 20 ++++ src/agents/__init__.py | 10 +- src/agents/iql.py | 245 +++++++++++++++++++++++++++++++++++++++ src/networks/networks.py | 108 +++++++++++++++++ 4 files changed, 382 insertions(+), 1 deletion(-) create mode 100644 conf/agent/iql.yaml create mode 100644 src/agents/iql.py diff --git a/conf/agent/iql.yaml b/conf/agent/iql.yaml new file mode 100644 index 0000000..268c298 --- /dev/null +++ b/conf/agent/iql.yaml @@ -0,0 +1,20 @@ +name: iql +lr: 3e-4 +batch_size: 256 +num_updates: 1 +prefill_episodes: 10 + +num_cells: 256 +gamma: 0.99 +soft_update_eps: 0.995 +loss_function: l2 +temperature: 1.0 +expectile: 0.5 + +normalization: None +dropout: 0.0 + +prb: 0 +buffer_size: 1000000 +pretrain: False +reset_params: False \ No newline at end of file diff --git a/src/agents/__init__.py b/src/agents/__init__.py index 7715150..e95366a 100644 --- a/src/agents/__init__.py +++ b/src/agents/__init__.py @@ -1,9 +1,10 @@ from src.agents.behavior_cloning import BehavioralCloningAgent +from src.agents.iql import IQLAgent from src.agents.random import RandomAgent from src.agents.sac import SACAgent from src.agents.td3 import TD3Agent -all_agents = ["td3", "sac", "bc", "random"] +all_agents = ["td3", "sac", "bc", "iql", "random"] def get_agent(action_spec, state_spec, cfg): @@ -35,6 +36,13 @@ def get_agent(action_spec, state_spec, cfg): agent_config=cfg.agent, device=cfg.device, ) + elif cfg.agent.name == "iql": + agent = IQLAgent( + action_spec=action_spec, + state_spec=state_spec, + agent_config=cfg.agent, + device=cfg.device, + ) else: raise NotImplementedError( f"Agent {cfg.agent.name} not implemented, please choose from {all_agents}" diff --git a/src/agents/iql.py b/src/agents/iql.py new file mode 100644 index 0000000..e0c3893 --- /dev/null +++ b/src/agents/iql.py @@ -0,0 +1,245 @@ +import tensordict as td +import torch +from tensordict import TensorDictBase +from 
torch import optim +from torchrl.data import TensorDictPrioritizedReplayBuffer, TensorDictReplayBuffer +from torchrl.data.replay_buffers.storages import LazyMemmapStorage, LazyTensorStorage +from torchrl.envs.utils import ExplorationType, set_exploration_type +from torchrl.objectives import SoftUpdate + +from torchrl.objectives.iql import IQLLoss + +from src.agents.base import BaseAgent +from src.networks.networks import get_critic, get_stochastic_actor, get_value_operator + + +class IQLAgent(BaseAgent): + def __init__(self, state_spec, action_spec, agent_config, device="cpu"): + super(IQLAgent, self).__init__( + state_spec, action_spec, agent_config.name, device + ) + + self.actor = get_stochastic_actor( + self.observation_keys, action_spec, agent_config + ) + self.critic = get_critic(self.observation_keys, agent_config) + + self.value = get_value_operator(self.observation_keys, agent_config) + + self.actor.to(device) + self.critic.to(device) + self.value.to(device) + + # initialize networks + self.init_nets([self.actor, self.critic, self.value]) + + # define loss function + self.loss_module = IQLLoss( + actor_network=self.actor, + qvalue_network=self.critic, + value_network=self.value, + num_qvalue_nets=2, + temperature=agent_config.temperature, + expectile=agent_config.expectile, + loss_function=agent_config.loss_function, + ) + # Define Target Network Updater + self.target_net_updater = SoftUpdate( + self.loss_module, eps=agent_config.soft_update_eps + ) + self.target_net_updater.init_() + + # Reset weights + self.reset_params = agent_config.reset_params + + # Define Replay Buffer + self.replay_buffer = self.create_replay_buffer( + batch_size=agent_config.batch_size, + prb=agent_config.prb, + buffer_size=agent_config.buffer_size, + device=device, + ) + + # Define Optimizer + critic_params = list( + self.loss_module.qvalue_network_params.flatten_keys().values() + ) + value_params = list( + self.loss_module.value_network_params.flatten_keys().values() + ) + actor_params = list( + self.loss_module.actor_network_params.flatten_keys().values() + ) + self.optimizer_actor = optim.Adam( + actor_params, lr=agent_config.lr, weight_decay=0.0 + ) + self.optimizer_critic = optim.Adam( + critic_params, lr=agent_config.lr, weight_decay=0.0 + ) + self.optimizer_value = optim.Adam( + value_params, lr=agent_config.lr, weight_decay=0.0 + ) + + # general stats + self.collected_transitions = 0 + self.total_updates = 0 + self.do_pretrain = agent_config.pretrain + + def get_agent_statedict(self): + """Save agent""" + act_statedict = self.actor.state_dict() + critic_statedict = self.critic.state_dict() + value_statedict = self.value.state_dict() + return { + "actor": act_statedict, + "critic": critic_statedict, + "value": value_statedict, + } + + def load_model(self, path): + """load model""" + try: + statedict = torch.load(path) + self.actor.load_state_dict(statedict["actor"]) + self.critic.load_state_dict(statedict["critic"]) + self.value.load_state_dict(statedict["value"]) + print("Model loaded") + except: + raise ValueError("Model not loaded") + + def load_replaybuffer(self, path): + """load replay buffer""" + try: + self.replay_buffer.load_state_dict(torch.load(path)) + print("Replay Buffer loaded") + print("Replay Buffer size: ", self.replay_buffer.__len__(), "\n") + except: + raise ValueError("Replay Buffer not loaded") + + def reset_networks(self): + """reset network parameters""" + print("Resetting Networks!") + self.loss_module.actor_network_params.apply(self.reset_parameter) + 
self.loss_module.target_actor_network_params.apply(self.reset_parameter) + self.loss_module.qvalue_network_params.apply(self.reset_parameter) + self.loss_module.target_qvalue_network_params.apply(self.reset_parameter) + self.loss_module.value_network_params.apply(self.reset_parameter) + + def eval(self): + """Sets the agent to evaluation mode.""" + self.actor.eval() + + def td_preprocessing(self, td: TensorDictBase) -> TensorDictBase: + # TODO not ideal to have this here + td.pop("scale") + td.pop("loc") + td.pop("params") + if "vector_obs_embedding" in td.keys(): + td.pop("vector_obs_embedding") + if "image_embedding" in td.keys(): + td.pop("image_embedding") + + def create_replay_buffer( + self, + batch_size=256, + prb=False, + buffer_size=100000, + buffer_scratch_dir=None, + device="cpu", + prefetch=3, + ): + """Create replay buffer""" + # TODO: make this part of base off policy agent + if prb: + replay_buffer = TensorDictPrioritizedReplayBuffer( + alpha=0.7, + beta=0.5, + pin_memory=False, + prefetch=1, + storage=LazyTensorStorage( + buffer_size, + device=device, + ), + ) + else: + replay_buffer = TensorDictReplayBuffer( + pin_memory=False, + prefetch=prefetch, + storage=LazyMemmapStorage( + buffer_size, + scratch_dir=buffer_scratch_dir, + device=device, + ), + batch_size=batch_size, + ) + return replay_buffer + + @torch.no_grad() + def get_action(self, td: TensorDictBase) -> TensorDictBase: + """Get action from actor network""" + with set_exploration_type(ExplorationType.RANDOM): + out_td = self.actor(td.to(self.device)) + self.td_preprocessing(out_td) + return out_td + + @torch.no_grad() + def get_eval_action(self, td: TensorDictBase) -> TensorDictBase: + """Get eval action from actor network""" + with set_exploration_type(ExplorationType.MODE): + out_td = self.actor(td.to(self.device)) + self.td_preprocessing(out_td) + return out_td + + def add_experience(self, transition: td.TensorDict): + """Add experience to replay buffer""" + self.replay_buffer.extend(transition) + self.collected_transitions += 1 + + def pretrain(self, wandb, batch_size=64, num_updates=1): + """Pretrain the agent with simple behavioral cloning""" + # TODO: implement pretrain for testing + # for i in range(num_updates): + # batch = self.replay_buffer.sample(batch_size) + # pred, _ = self.actor(batch["observations"].float()) + # loss = torch.mean((pred - batch["actions"]) ** 2) + # self.optimizer.zero_grad() + # loss.backward() + # self.optimizer.step() + # wandb.log({"pretrain/loss": loss.item()}) + + def train(self, batch_size=64, num_updates=1): + """Train the agent""" + self.actor.train() + for i in range(num_updates): + self.total_updates += 1 + if self.reset_params and self.total_updates % self.reset_params == 0: + self.reset_networks() + # Sample a batch from the replay buffer + batch = self.replay_buffer.sample(batch_size) + # Compute IQL Loss + loss = self.loss_module(batch) + + # Update Actpr Network + self.optimizer_actor.zero_grad() + loss["loss_actor"].backward() + self.optimizer_actor.step() + # Update Critic Network + self.optimizer_critic.zero_grad() + loss["loss_qvalue"].backward() + torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5) + self.optimizer_critic.step() + # Update Value Network + self.optimizer_value.zero_grad() + loss["loss_value"].backward() + self.optimizer_value.step() + + # Update Target Networks + self.target_net_updater.step() + # Update Prioritized Replay Buffer + if isinstance(self.replay_buffer, TensorDictPrioritizedReplayBuffer): + 
self.replay_buffer.update_priorities( + batch["indices"], + loss["critic_loss"].detach().cpu().numpy(), + ) + self.actor.eval() + return loss diff --git a/src/networks/networks.py b/src/networks/networks.py index 35964b1..0dce70b 100644 --- a/src/networks/networks.py +++ b/src/networks/networks.py @@ -56,6 +56,114 @@ def get_critic(observation_keys, agent_config): raise NotImplementedError("Critic for this observation space not implemented") +def get_value_operator(observation_keys, agent_config): + if ( + "vec_observation" in observation_keys + and not "image_observation" in observation_keys + ): + return get_vec_value( + in_keys=observation_keys, + num_cells=[agent_config.num_cells, agent_config.num_cells], + out_features=1, + activation_class=nn.ReLU, + normalization=agent_config.normalization, + dropout=agent_config.dropout, + ) + elif ( + "image_observation" in observation_keys + and "vec_observation" in observation_keys + ): + return get_mixed_value( + vec_in_keys="vec_observation", + img_in_keys="image_observation", + num_cells=[agent_config.num_cells, agent_config.num_cells], + out_features=1, + activation_class=nn.ReLU, + normalization=agent_config.normalization, + dropout=agent_config.dropout, + ) + + +def get_vec_value( + in_keys=["observation"], + num_cells=[256, 256], + out_features=1, + activation_class=nn.ReLU, + normalization="None", + dropout=0.0, +): + """Returns a critic network""" + normalization = get_normalization(normalization) + qvalue_net = MLP( + num_cells=num_cells, + out_features=out_features, + activation_class=activation_class, + norm_class=normalization, + norm_kwargs={"normalized_shape": num_cells[-1]} if normalization else None, + dropout=dropout, + ) + + qvalue = ValueOperator( + in_keys=in_keys, + module=qvalue_net, + ) + return qvalue + + +def get_mixed_value( + vec_in_keys, + img_in_keys, + num_cells=[256, 256], + out_features=1, + activation_class=nn.ReLU, + normalization="None", + dropout=0.0, +): + normalization = get_normalization(normalization) + # image encoder + cnn = ConvNet( + activation_class=activation_class, + num_cells=[32, 64, 64], + kernel_sizes=[8, 4, 3], + strides=[4, 2, 1], + ) + cnn_output = cnn(torch.ones((3, 64, 64))) + mlp = MLP( + in_features=cnn_output.shape[-1], + activation_class=activation_class, + out_features=128, + num_cells=[256], + ) + image_encoder = SafeModule( + torch.nn.Sequential(cnn, mlp), + in_keys=[img_in_keys], + out_keys=["image_embedding"], + ) + + # vector_obs encoder + mlp = MLP( + activation_class=activation_class, + out_features=32, + num_cells=[128], + ) + vector_obs_encoder = SafeModule( + mlp, in_keys=[vec_in_keys], out_keys=["vec_obs_embedding"] + ) + + # output head + mlp = MLP( + activation_class=torch.nn.ReLU, + out_features=out_features, + num_cells=num_cells, + norm_class=normalization, + norm_kwargs={"normalized_shape": num_cells[-1]} if normalization else None, + dropout=dropout, + ) + v_head = ValueOperator(mlp, ["image_embedding", "vec_obs_embedding"]) + # model + return SafeSequential(image_encoder, vector_obs_encoder, v_head) + + def get_vec_critic( in_keys=["observation"], num_cells=[256, 256], From c9b9dadd207e3b4023c22b419cf56b7a33ee83c7 Mon Sep 17 00:00:00 2001 From: BY571 Date: Thu, 30 May 2024 13:05:17 +0200 Subject: [PATCH 04/53] init cql --- src/agents/__init__.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/agents/__init__.py b/src/agents/__init__.py index e95366a..2fb2399 100644 --- a/src/agents/__init__.py +++ b/src/agents/__init__.py @@ -1,10 
+1,11 @@ from src.agents.behavior_cloning import BehavioralCloningAgent +from src.agents.cql import CQLAgent from src.agents.iql import IQLAgent from src.agents.random import RandomAgent from src.agents.sac import SACAgent from src.agents.td3 import TD3Agent -all_agents = ["td3", "sac", "bc", "iql", "random"] +all_agents = ["td3", "sac", "bc", "iql", "cql", "random"] def get_agent(action_spec, state_spec, cfg): @@ -43,6 +44,13 @@ def get_agent(action_spec, state_spec, cfg): agent_config=cfg.agent, device=cfg.device, ) + elif cfg.agent.name == "cql": + agent = CQLAgent( + action_spec=action_spec, + state_spec=state_spec, + agent_config=cfg.agent, + device=cfg.device, + ) else: raise NotImplementedError( f"Agent {cfg.agent.name} not implemented, please choose from {all_agents}" From 5eefbfed87cc9c356e25ab326bdae3c416d9a289 Mon Sep 17 00:00:00 2001 From: BY571 Date: Thu, 30 May 2024 13:05:51 +0200 Subject: [PATCH 05/53] add cql --- conf/agent/cql.yaml | 28 +++++ src/agents/cql.py | 248 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 276 insertions(+) create mode 100644 conf/agent/cql.yaml create mode 100644 src/agents/cql.py diff --git a/conf/agent/cql.yaml b/conf/agent/cql.yaml new file mode 100644 index 0000000..07c2082 --- /dev/null +++ b/conf/agent/cql.yaml @@ -0,0 +1,28 @@ +name: cql +lr: 3e-4 +batch_size: 256 +num_updates: 1 +prefill_episodes: 10 + +bc_steps: 1000 + +# CQL specific +num_cells: 256 +gamma: 0.99 +soft_update_eps: 0.995 +loss_function: l2 +temperature: 1.0 +min_q_weight: 1.0 +max_q_backup: False +deterministic_backup: False +num_random: 10 +with_lagrange: True +lagrange_thresh: 5.0 # tau + +normalization: None +dropout: 0.0 + +prb: 0 +buffer_size: 1000000 +pretrain: False +reset_params: False \ No newline at end of file diff --git a/src/agents/cql.py b/src/agents/cql.py new file mode 100644 index 0000000..c65cbcc --- /dev/null +++ b/src/agents/cql.py @@ -0,0 +1,248 @@ +import tensordict as td +import torch +from tensordict import TensorDictBase +from torch import optim +from torchrl.data import TensorDictPrioritizedReplayBuffer, TensorDictReplayBuffer +from torchrl.data.replay_buffers.storages import LazyMemmapStorage, LazyTensorStorage +from torchrl.envs.utils import ExplorationType, set_exploration_type +from torchrl.objectives import SoftUpdate + +from torchrl.objectives.cql import CQLLoss + +from src.agents.base import BaseAgent +from src.networks.networks import get_critic, get_stochastic_actor + + +class CQLAgent(BaseAgent): + def __init__(self, state_spec, action_spec, agent_config, device="cpu"): + super(CQLAgent, self).__init__( + state_spec, action_spec, agent_config.name, device + ) + + with_lagrange = agent_config.with_lagrange + + self.actor = get_stochastic_actor( + self.observation_keys, action_spec, agent_config + ) + self.critic = get_critic(self.observation_keys, agent_config) + + self.actor.to(device) + self.critic.to(device) + + # initialize networks + self.init_nets([self.actor, self.critic]) + + # define loss function + self.loss_module = CQLLoss( + actor_network=self.actor, + qvalue_network=self.critic, + loss_function=agent_config.loss_function, + temperature=agent_config.temperature, + min_q_weight=agent_config.min_q_weight, + max_q_backup=agent_config.max_q_backup, + deterministic_backup=agent_config.deterministic_backup, + num_random=agent_config.num_random, + with_lagrange=agent_config.with_lagrange, + lagrange_thresh=agent_config.lagrange_thresh, + ) + # Define Target Network Updater + self.target_net_updater = SoftUpdate( + 
self.loss_module, eps=agent_config.soft_update_eps + ) + self.target_net_updater.init_() + + # Reset weights + self.reset_params = agent_config.reset_params + + # Define Replay Buffer + self.replay_buffer = self.create_replay_buffer( + batch_size=agent_config.batch_size, + prb=agent_config.prb, + buffer_size=agent_config.buffer_size, + device=device, + ) + + # Define Optimizer + critic_params = list( + self.loss_module.qvalue_network_params.flatten_keys().values() + ) + actor_params = list( + self.loss_module.actor_network_params.flatten_keys().values() + ) + self.optimizer_actor = optim.Adam( + actor_params, lr=agent_config.lr, weight_decay=0.0 + ) + self.optimizer_critic = optim.Adam( + critic_params, lr=agent_config.lr, weight_decay=0.0 + ) + self.optimizer_alpha = optim.Adam( + [self.loss_module.log_alpha], + lr=3.0e-4, + ) + if with_lagrange: + self.alpha_prime_optim = torch.optim.Adam( + [self.loss_module.log_alpha_prime], + lr=agent_config.lr, + ) + else: + self.alpha_prime_optim = None + # general stats + self.collected_transitions = 0 + self.total_updates = 0 + self.do_pretrain = agent_config.pretrain + self.bc_steps = agent_config.bc_steps + + def get_agent_statedict(self): + """Save agent""" + act_statedict = self.actor.state_dict() + critic_statedict = self.critic.state_dict() + return {"actor": act_statedict, "critic": critic_statedict} + + def load_model(self, path): + """load model""" + try: + statedict = torch.load(path) + self.actor.load_state_dict(statedict["actor"]) + self.critic.load_state_dict(statedict["critic"]) + print("Model loaded") + except: + raise ValueError("Model not loaded") + + def load_replaybuffer(self, path): + """load replay buffer""" + try: + self.replay_buffer.load_state_dict(torch.load(path)) + print("Replay Buffer loaded") + print("Replay Buffer size: ", self.replay_buffer.__len__(), "\n") + except: + raise ValueError("Replay Buffer not loaded") + + def reset_networks(self): + """reset network parameters""" + print("Resetting Networks!") + self.loss_module.actor_network_params.apply(self.reset_parameter) + self.loss_module.target_actor_network_params.apply(self.reset_parameter) + self.loss_module.qvalue_network_params.apply(self.reset_parameter) + self.loss_module.target_qvalue_network_params.apply(self.reset_parameter) + + def eval(self): + """Sets the agent to evaluation mode.""" + self.actor.eval() + + def td_preprocessing(self, td: TensorDictBase) -> TensorDictBase: + # TODO not ideal to have this here + td.pop("scale") + td.pop("loc") + td.pop("params") + if "vector_obs_embedding" in td.keys(): + td.pop("vector_obs_embedding") + if "image_embedding" in td.keys(): + td.pop("image_embedding") + + def create_replay_buffer( + self, + batch_size=256, + prb=False, + buffer_size=100000, + buffer_scratch_dir=None, + device="cpu", + prefetch=3, + ): + """Create replay buffer""" + # TODO: make this part of base off policy agent + if prb: + replay_buffer = TensorDictPrioritizedReplayBuffer( + alpha=0.7, + beta=0.5, + pin_memory=False, + prefetch=1, + storage=LazyTensorStorage( + buffer_size, + device=device, + ), + ) + else: + replay_buffer = TensorDictReplayBuffer( + pin_memory=False, + prefetch=prefetch, + storage=LazyMemmapStorage( + buffer_size, + scratch_dir=buffer_scratch_dir, + device=device, + ), + batch_size=batch_size, + ) + return replay_buffer + + @torch.no_grad() + def get_action(self, td: TensorDictBase) -> TensorDictBase: + """Get action from actor network""" + with set_exploration_type(ExplorationType.RANDOM): + out_td = 
self.actor(td.to(self.device)) + self.td_preprocessing(out_td) + return out_td + + @torch.no_grad() + def get_eval_action(self, td: TensorDictBase) -> TensorDictBase: + """Get eval action from actor network""" + with set_exploration_type(ExplorationType.MODE): + out_td = self.actor(td.to(self.device)) + self.td_preprocessing(out_td) + return out_td + + def add_experience(self, transition: td.TensorDict): + """Add experience to replay buffer""" + self.replay_buffer.extend(transition) + self.collected_transitions += 1 + + def train(self, batch_size=64, num_updates=1): + """Train the agent""" + self.actor.train() + for i in range(num_updates): + self.total_updates += 1 + # Sample a batch from the replay buffer + batch = self.replay_buffer.sample(batch_size) + # Compute CQL Loss + loss = self.loss_module(batch) + + # Update alpha + alpha_loss = loss["loss_alpha"] + alpha_prime_loss = loss["loss_alpha_prime"] + self.optimizer_alpha.zero_grad() + alpha_loss.backward() + self.optimizer_alpha.step() + + # Update Actpr Network + # official cql implementation uses behavior cloning loss for first few updating steps as it helps for some tasks + if self.total_updates >= self.bc_steps: + actor_loss = loss["loss_actor"] + else: + actor_loss = loss["loss_actor_bc"] + self.optimizer_actor.zero_grad() + actor_loss.backward() + self.optimizer_actor.step() + + if self.alpha_prime_optim is not None: + self.alpha_prime_optim.zero_grad() + alpha_prime_loss.backward(retain_graph=True) + self.alpha_prime_optim.step() + + # Update Critic Network + q_loss = loss["loss_qvalue"] + cql_loss = loss["loss_cql"] + + q_loss = q_loss + cql_loss + self.optimizer_critic.zero_grad() + q_loss.backward(retain_graph=False) + self.optimizer_critic.step() + + # Update Target Networks + self.target_net_updater.step() + # Update Prioritized Replay Buffer + if isinstance(self.replay_buffer, TensorDictPrioritizedReplayBuffer): + self.replay_buffer.update_priorities( + batch["indices"], + loss["critic_loss"].detach().cpu().numpy(), + ) + self.actor.eval() + return loss From 47893240fe98ce59cf22267676c18559bf3ac653 Mon Sep 17 00:00:00 2001 From: BY571 Date: Tue, 4 Jun 2024 14:47:39 +0200 Subject: [PATCH 06/53] update random agent --- conf/agent/bc.yaml | 4 +-- src/agents/behavior_cloning.py | 62 ++++++++++++++++------------------ src/agents/random.py | 46 +++++++++++++++++++++++-- 3 files changed, 75 insertions(+), 37 deletions(-) diff --git a/conf/agent/bc.yaml b/conf/agent/bc.yaml index 3467319..37e07ed 100644 --- a/conf/agent/bc.yaml +++ b/conf/agent/bc.yaml @@ -1,7 +1,7 @@ name: bc lr: 3e-4 batch_size: 256 -num_updates: 300 +num_updates: 1 prefill_episodes: 0 @@ -9,5 +9,3 @@ policy_type: deterministic # stochastic or deterministic num_cells: 256 dropout: 0.01 normalization: LayerNorm - -offline_data_path: datasets/walker_v0/walker_gait_250ms_50epochs.npy \ No newline at end of file diff --git a/src/agents/behavior_cloning.py b/src/agents/behavior_cloning.py index f54e33e..cddbbfb 100644 --- a/src/agents/behavior_cloning.py +++ b/src/agents/behavior_cloning.py @@ -1,10 +1,12 @@ import numpy as np import tensordict as td import torch +from tensordict import TensorDictBase from torch import nn, optim from torchrl.data import BoundedTensorSpec, TensorDictReplayBuffer from torchrl.data.replay_buffers.storages import LazyMemmapStorage +from torchrl.envs.utils import ExplorationType, set_exploration_type from src.agents.base import BaseAgent from src.networks.networks import get_deterministic_actor, get_stochastic_actor @@ -27,29 +29,19 @@ 
def __init__(self, state_spec, action_spec, agent_config, device="cpu"): if agent_config.policy_type == "deterministic": self.actor = get_deterministic_actor( - action_spec, - in_keys=["observation"], - num_cells=[agent_config.num_cells, agent_config.num_cells], - activation_class=nn.ReLU, - normalization=agent_config.normalization, - dropout=agent_config.dropout, + self.observation_keys, action_spec, agent_config ) self.pretrain = self.pretrain_deter elif agent_config.policy_type == "stochastic": self.actor = get_stochastic_actor( - action_spec, - in_keys=["observation"], - num_cells=[agent_config.num_cells, agent_config.num_cells], - activation_class=nn.ReLU, - normalization=agent_config.normalization, - dropout=agent_config.dropout, + self.observation_keys, action_spec, agent_config ) self.pretrain = self.pretrain_stoch else: raise ValueError( "policy_type not recognized, choose deterministic or stochastic" ) - + self.actor.to(device) # initialize networks self.init_nets([self.actor]) @@ -58,12 +50,12 @@ def __init__(self, state_spec, action_spec, agent_config, device="cpu"): ) # create replay buffer - self.offline_data_path = agent_config.offline_data_path + self.offline_data_path = None self.replay_buffer = self.create_replay_buffer() # general stats self.collected_transitions = 0 - self.do_pretrain = True + self.do_pretrain = False self.episodes = 0 def get_agent_statedict(self): @@ -89,15 +81,16 @@ def load_replaybuffer(self, path): except: raise ValueError("Replay Buffer not loaded") - def load_offline_data(self, path): - """load offline data""" - # TODO: cleanup! - try: - data = np.load(path, allow_pickle=True).item() - except: - raise ValueError("Cannot load offline data, check path!") + def eval(self): + """Sets the agent to evaluation mode.""" + self.actor.eval() - return td.TensorDict(data, batch_size=len(data["observations"])) + @torch.no_grad() + def get_eval_action(self, td: TensorDictBase) -> TensorDictBase: + """Get eval action from actor network""" + with set_exploration_type(ExplorationType.MODE): + out_td = self.actor(td.to(self.device)) + return out_td def create_replay_buffer( self, @@ -120,12 +113,6 @@ def create_replay_buffer( batch_size=batch_size, ) - # load offline data - if self.offline_data_path is not None: - offline_data = self.load_offline_data(self.offline_data_path) - - replay_buffer.extend(offline_data) - return replay_buffer @torch.no_grad() @@ -150,7 +137,7 @@ def pretrain_stoch(self, wandb, batch_size=64, num_updates=1): for i in range(num_updates): batch = self.replay_buffer.sample(batch_size) input_td = td.TensorDict( - {"observation": batch["observations"].float()}, batch_size=(256) + {"observation": batch["vec_observations"].float()}, batch_size=(256) ) dist = self.actor.get_dist(input_td) loss = -dist.log_prob(batch["actions"]).mean() @@ -166,7 +153,7 @@ def pretrain_deter(self, wandb, batch_size=64, num_updates=1): for i in range(num_updates): batch = self.replay_buffer.sample(batch_size) - pred, _ = self.actor(batch["observations"].float()) + pred, _ = self.actor(batch["vec_observations"].float()) loss = torch.mean((pred - batch["actions"]) ** 2) self.optimizer.zero_grad() loss.backward() @@ -177,4 +164,15 @@ def pretrain_deter(self, wandb, batch_size=64, num_updates=1): def train(self, batch_size=64, num_updates=1): """Train the agent""" - return {} + log_data = {} + + for i in range(num_updates): + batch = self.replay_buffer.sample(batch_size).to(self.device) + orig_action = batch.get("action").clone() + out_dict = self.actor(batch) + loss = 
torch.mean((out_dict.get("action") - orig_action) ** 2) + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + log_data.update({"loss": loss}) + return log_data diff --git a/src/agents/random.py b/src/agents/random.py index 187f5ad..acbea60 100644 --- a/src/agents/random.py +++ b/src/agents/random.py @@ -1,5 +1,7 @@ import torch from tensordict import TensorDictBase +from torchrl.data import TensorDictPrioritizedReplayBuffer, TensorDictReplayBuffer +from torchrl.data.replay_buffers.storages import LazyMemmapStorage, LazyTensorStorage from src.agents.base import BaseAgent @@ -12,7 +14,47 @@ def __init__(self, state_spec, action_spec, agent_config, device="cpu"): self.actor = None self.do_pretrain = False - self.replay_buffer = {} + self.replay_buffer = self.create_replay_buffer( + batch_size=agent_config.batch_size, + prb=False, + buffer_size=100000, + device=device, + ) + + def create_replay_buffer( + self, + batch_size=256, + prb=False, + buffer_size=100000, + buffer_scratch_dir=None, + device="cpu", + prefetch=3, + ): + """Create replay buffer""" + # TODO: make this part of base off policy agent + if prb: + replay_buffer = TensorDictPrioritizedReplayBuffer( + alpha=0.7, + beta=0.5, + pin_memory=False, + prefetch=1, + storage=LazyTensorStorage( + buffer_size, + device=device, + ), + ) + else: + replay_buffer = TensorDictReplayBuffer( + pin_memory=False, + prefetch=prefetch, + storage=LazyMemmapStorage( + buffer_size, + scratch_dir=buffer_scratch_dir, + device=device, + ), + batch_size=batch_size, + ) + return replay_buffer def eval(self): """Sets the agent to evaluation mode.""" @@ -31,7 +73,7 @@ def get_eval_action(self, tensordict: TensorDictBase): def add_experience(self, transition: TensorDictBase): """Add experience to replay buffer""" - pass + self.replay_buffer.extend(transition) def train(self, batch_size=64, num_updates=1): """Train the agent""" From 063d801075677e072da8043b2dc8b0d85890b225 Mon Sep 17 00:00:00 2001 From: BY571 Date: Tue, 4 Jun 2024 19:55:51 +0200 Subject: [PATCH 07/53] add roboarm eval --- experiments/roboarm/eval.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/experiments/roboarm/eval.py b/experiments/roboarm/eval.py index bb70b11..60089f4 100644 --- a/experiments/roboarm/eval.py +++ b/experiments/roboarm/eval.py @@ -16,7 +16,7 @@ from environments import make_env, VIDEO_LOGGING_ENVS from src.agents import get_agent -from src.utils import create_video_from_images, login, setup_check +from src.utils import create_video_from_images, login, logout, setup_check @hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config") @@ -62,6 +62,7 @@ def run(cfg: DictConfig) -> None: image_caputres.append( td.get(("next", "original_image")).cpu().numpy() ) + agent.add_experience(td) total_agent_step_time = time.time() - step_start_time total_step_times.append(total_agent_step_time) done = td.get(("next", "done"), False) @@ -102,6 +103,7 @@ def run(cfg: DictConfig) -> None: except KeyboardInterrupt: print("Evaluation interrupted by user.") + logout(agent) env.close() From bee43030a06b0f1a420bda300d7328085231dce1 Mon Sep 17 00:00:00 2001 From: BY571 Date: Thu, 6 Jun 2024 12:11:47 +0200 Subject: [PATCH 08/53] add pretrain roboarm --- experiments/roboarm/pretrain.py | 61 +++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 experiments/roboarm/pretrain.py diff --git a/experiments/roboarm/pretrain.py b/experiments/roboarm/pretrain.py new file mode 100644 index 0000000..bd30a2c --- 
/dev/null +++ b/experiments/roboarm/pretrain.py @@ -0,0 +1,61 @@ +import os +import sys + +import hydra +import wandb +from omegaconf import DictConfig, OmegaConf +from tqdm import tqdm + +# Add the project root to PYTHONPATH +project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) +if project_root not in sys.path: + sys.path.insert(0, project_root) + +from environments import make_env +from src.agents import get_agent +from src.utils import login, logout, setup_check, tensordict2dict + + +@hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config") +def run(cfg: DictConfig) -> None: + print(OmegaConf.to_yaml(cfg)) + + # make environment. + setup_check(robot="roboarm", config=cfg) + env, action_space, state_space = make_env(cfg) + + # make agent + agent, project_name = get_agent(action_space, state_space, cfg) + login(agent) + + # initialize wandb + wandb.init(project=project_name) + wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True) + wandb.watch(agent.actor, log_freq=1) if agent.actor else None + + batch_size = cfg.agent.batch_size + num_updates = cfg.agent.num_updates + train_episodes = cfg.episodes + print("Start training...") + try: + for e in tqdm(range(train_episodes), desc="Training"): + + loss_info = agent.train(batch_size=batch_size, num_updates=num_updates) + + # Metrics Logging + log_dict = { + "epoch": e, + "buffer_size": agent.replay_buffer.__len__(), + } + log_dict.update(tensordict2dict(loss_info)) + wandb.log(log_dict) + + except KeyboardInterrupt: + print("Training interrupted by user.") + + logout(agent) + env.close() + + +if __name__ == "__main__": + run() From 410c3a950fee976e7571e253e7aa1b71c1590ae2 Mon Sep 17 00:00:00 2001 From: BY571 Date: Thu, 6 Jun 2024 12:13:05 +0200 Subject: [PATCH 09/53] Update eval scripts roboarm walker --- experiments/roboarm/eval.py | 4 ++-- experiments/walker/eval.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/experiments/roboarm/eval.py b/experiments/roboarm/eval.py index 60089f4..ae62a66 100644 --- a/experiments/roboarm/eval.py +++ b/experiments/roboarm/eval.py @@ -85,14 +85,14 @@ def run(cfg: DictConfig) -> None: } if env_name == "roboarm-v0" or env_name == "roboarm_sim-v0": achieved_state = td.get(env.original_observation_key).cpu().numpy() - error = np.sum( + final_error = np.sum( np.abs( env.shortest_angular_distance_vectorized( goal_state, achieved_state ) ) ) - log_dict["final_error"] = error + log_dict["final_error"] = final_error wandb.log(log_dict) if env_name in VIDEO_LOGGING_ENVS: diff --git a/experiments/walker/eval.py b/experiments/walker/eval.py index d7314eb..a5f5204 100644 --- a/experiments/walker/eval.py +++ b/experiments/walker/eval.py @@ -17,7 +17,7 @@ from environments import make_env, VIDEO_LOGGING_ENVS from src.agents import get_agent -from src.utils import create_video_from_images, login, setup_check +from src.utils import create_video_from_images, login, logout, setup_check @hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config") @@ -61,6 +61,7 @@ def run(cfg: DictConfig) -> None: td = agent.get_eval_action(td) actions.append(td.get("action").cpu().numpy()) td = env.step(td) + agent.add_experience(td) total_agent_step_time = time.time() - step_start_time total_step_times.append(total_agent_step_time) done = td.get(("next", "done"), False) @@ -96,6 +97,7 @@ def run(cfg: DictConfig) -> None: except KeyboardInterrupt: print("Evaluation interrupted by user.") + logout(agent) 
env.close() From 6e4be4272c3ff6d8bd89a31b5c4d628996622067 Mon Sep 17 00:00:00 2001 From: BY571 Date: Tue, 18 Jun 2024 10:44:53 +0200 Subject: [PATCH 10/53] update keyboard agent --- src/agents/keyboard.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/agents/keyboard.py b/src/agents/keyboard.py index 5952341..40a54b6 100644 --- a/src/agents/keyboard.py +++ b/src/agents/keyboard.py @@ -20,8 +20,8 @@ def __init__(self, state_spec, action_spec, agent_config, device="cpu"): self.key_action_mapping = { "a": [-0.15, 0, 0, 0], # Rotate motor -30 "d": [0.15, 0, 0, 0], # Rotate motor +30 - "s": [0, -0.25, 0, 0], # Low motor -10 - "w": [0, 0.25, 0, 0], # Low motor +10 + "s": [0, -0.20, 0, 0], # Low motor -10 + "w": [0, 0.20, 0, 0], # Low motor +10 "q": [0, 0, -0.25, 0], # High motor -15 "e": [0, 0, 0.25, 0], # High motor +15 "f": [0, 0, 0, -0.25], # Grab motor -10 From f0ffdc1bb37da412876ba825be663254e14c35f1 Mon Sep 17 00:00:00 2001 From: BY571 Date: Wed, 26 Jun 2024 13:55:30 +0200 Subject: [PATCH 11/53] inint pickplace roboarm env --- environments/__init__.py | 24 +- .../RoboArmPickPlaceEnv.py | 221 ++++++++++++++++++ environments/roboarm_pickplace_v0/client.py | 149 ++++++++++++ src/agents/keyboard.py | 32 +-- 4 files changed, 401 insertions(+), 25 deletions(-) create mode 100644 environments/roboarm_pickplace_v0/RoboArmPickPlaceEnv.py create mode 100644 environments/roboarm_pickplace_v0/client.py diff --git a/environments/__init__.py b/environments/__init__.py index 34bb3d6..f0cd754 100644 --- a/environments/__init__.py +++ b/environments/__init__.py @@ -1,3 +1,4 @@ +import torch from torchrl.envs import ( CatFrames, Compose, @@ -8,6 +9,7 @@ ) from environments.roboarm_mixed_v0.RoboArmMixedEnv import RoboArmMixedEnv_v0 +from environments.roboarm_pickplace_v0.RoboArmPickPlaceEnv import RoboArmPickPlaceEnv_v0 from environments.roboarm_v0.RoboArmEnv import RoboArmEnv_v0 from environments.roboarm_v0.RoboArmSim import RoboArmSimEnv_v0 from environments.runaway_v0.RunAwayEnv import RunAwayEnv_v0 @@ -15,14 +17,18 @@ from environments.walker_v0.WalkerEnv import WalkerEnv_v0 from environments.walker_v0.WalkerEnvSim import WalkerEnvSim_v0 - VIDEO_LOGGING_ENVS = ["roboarm_mixed-v0", "walker_mixed-v0"] ALL_2WHEELER_ENVS = ["spinning-v0", "runaway-v0"] ALL_WALKER_ENVS = [ "walker-v0", "walker_sim-v0", ] -ALL_ROBOARM_ENVS = ["roboarm-v0", "roboarm_mixed-v0", "roboarm_sim-v0"] +ALL_ROBOARM_ENVS = [ + "roboarm-v0", + "roboarm_mixed-v0", + "roboarm_sim-v0", + "roboarm_pickplace-v0", +] ALL_ENVS = ALL_2WHEELER_ENVS + ALL_WALKER_ENVS + ALL_ROBOARM_ENVS @@ -55,7 +61,11 @@ def make_env(config): # env, current_action_influence=config.env.action_filter # ) if "image_observation" in observation_keys: - transforms.append(ToTensorImage(in_keys=["image_observation"], from_int=True)) + transforms.append( + ToTensorImage( + in_keys=["image_observation"], from_int=False, dtype=torch.uint8 + ) + ) # from_int=True, dtype=torch.float32 env = TransformedEnv(env, Compose(*transforms)) @@ -115,5 +125,13 @@ def make(name="RunAway", env_conf=None): camera_id=env_conf.camera_id, goal_radius=env_conf.goal_radius, ) + elif name == "roboarm_pickplace-v0": + return RoboArmPickPlaceEnv_v0( + max_episode_steps=env_conf.max_episode_steps, + sleep_time=env_conf.sleep_time, + verbose=env_conf.verbose, + reward_signal=env_conf.reward_signal, + camera_id=env_conf.camera_id, + ) else: print("Environment not found") diff --git a/environments/roboarm_pickplace_v0/RoboArmPickPlaceEnv.py 
b/environments/roboarm_pickplace_v0/RoboArmPickPlaceEnv.py new file mode 100644 index 0000000..11e7140 --- /dev/null +++ b/environments/roboarm_pickplace_v0/RoboArmPickPlaceEnv.py @@ -0,0 +1,221 @@ +import random +import time +from typing import Tuple + +import cv2 + +import numpy as np +import torch + +from environments.base.base_env import BaseEnv +from tensordict import TensorDict, TensorDictBase +from torchrl.data.tensor_specs import BoundedTensorSpec, CompositeSpec + + +class RoboArmPickPlaceEnv_v0(BaseEnv): + """ """ + + action_dim = ( + 4 # (grab_motor, high_motor_action, low_motor_action, rotation_motor_action) + ) + + state_dim = 4 # (GM, HM, LM, RM) + + motor_ranges = { + "GM": (-148, -45), # Grab motor range is 0-180 + "HM": (-150, 10), + "LM": (10, 100), + "RM": (-179, 179), # Rotation motor needs to be place in the center + } + vec_observation_key = "vec_observation" + image_observation_key = "image_observation" + # original_image_key = "original_image" + + def __init__( + self, + max_episode_steps: int = 50, + sleep_time: float = 0.0, + verbose: bool = False, + reward_signal: str = "dense", + camera_id: int = 0, + ): + self.sleep_time = sleep_time + + assert reward_signal in [ + "dense", + "sparse", + ], "Reward signal must be dense or sparse." + self.reward_signal = reward_signal + self.max_episode_steps = max_episode_steps + + self.camera = cv2.VideoCapture(int(camera_id)) + self._batch_size = torch.Size([1]) + + # Define action spec + self.action_spec = BoundedTensorSpec( + low=-1, + high=1, + shape=(1, self.action_dim), + ) + + # Observation 3 motors (HM, LM, RM) + # Define observation spec + bounds = torch.tensor( + [ + self.motor_ranges["GM"], + self.motor_ranges["HM"], + self.motor_ranges["LM"], + self.motor_ranges["RM"], + ] + ) + + low_bounds = bounds[:, 0].unsqueeze(0) + high_bounds = bounds[:, 1].unsqueeze(0) + + observation_spec = BoundedTensorSpec( + low=low_bounds, + high=high_bounds, + dtype=torch.float32, + ) + # get initial observation to define image observation spec + ret, frame = self.camera.read() + if not ret: + raise ValueError("Camera not available.") + resized_frame = cv2.resize(frame, (64, 64)) + shape = resized_frame.shape + image_observation_spec = BoundedTensorSpec( + low=torch.zeros((1,) + shape, dtype=torch.uint8), + high=torch.ones((1,) + shape, dtype=torch.uint8) * 255, + dtype=torch.uint8, + ) + + self.observation_spec = CompositeSpec( + { + self.vec_observation_key: observation_spec, + self.image_observation_key: image_observation_spec, + }, + shape=(1,), + ) + + self.goal_positions = self.init_camera_position() + + super().__init__( + action_dim=self.action_dim, state_dim=self.state_dim, verbose=verbose + ) + + def normalize_state(self, state: np.ndarray, key: str) -> torch.Tensor: + """ + Normalize the state to be processed by the agent. + + Args: + state (np.ndarray): The state to be normalized. + + Returns: + torch.Tensor: The normalized state. + """ + state = (torch.from_numpy(state) - self.observation_spec[key].space.low) / ( + self.observation_spec[key].space.high - self.observation_spec[key].space.low + ) + return state + + def init_camera_position( + self, + ): + print( + "\nInitializing camera position... \nMake sure the robot is in the center of the frame.\nPlease press 'c' to continue..." + ) + while True: + ret, frame = self.camera.read() + if not ret: + print("Error: Can't receive frame. 
Exiting ...") + break + + cv2.imshow("Init RobotPosition", frame) + + if cv2.waitKey(1) == ord("c"): + break + + return + + def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: + """ + Reset the environment and return the initial state. + + Returns: + TensorDictBase: The initial state of the environment. + """ + # TODO solve this fake action sending before to receive first state + self.episode_step_iter = 0 + if tensordict is not None: + action = tensordict.get("action").cpu().numpy().squeeze() + else: + action = np.zeros(self.action_dim) + self.send_to_hub(action) + time.sleep(self.sleep_time) + observation = self.read_from_hub() + # assert + norm_obs = self.normalize_state(observation, self.vec_observation_key) + + ret, frame = self.camera.read() + resized_frame = cv2.resize(frame, (64, 64)) + + return TensorDict( + { + self.vec_observation_key: norm_obs.float(), + self.image_observation_key: torch.from_numpy(resized_frame)[None, :].to( + torch.uint8 + ), + }, + batch_size=[1], + ) + + def reward( + self, + frame: np.ndarray, + ) -> Tuple[float, bool]: + """ """ + # TODO: Find a way to classify if cup is in the goal location + done = False + reward = 0.0 + return reward, done + + def _step(self, tensordict: TensorDictBase) -> TensorDictBase: + """ """ + # Send action to hub to receive next state + self.send_to_hub(tensordict.get("action").cpu().numpy().squeeze()) + time.sleep( + self.sleep_time + ) # we need to wait some time for sensors to read and to + + # receive the next state + next_observation = self.read_from_hub() + + # get next frame + ret, frame = self.camera.read() + + cv2.imshow("Camera", frame) + cv2.waitKey(1) + # calc reward and done + reward, done = self.reward( + frame, + ) + resized_frame = cv2.resize(frame, (64, 64)) + next_tensordict = TensorDict( + { + self.vec_observation_key: self.normalize_state( + next_observation, self.vec_observation_key + ).float(), + self.image_observation_key: torch.from_numpy(resized_frame)[None, :].to( + torch.uint8 + ), + "reward": torch.tensor([reward]).float(), + "done": torch.tensor([done]).bool(), + }, + batch_size=[1], + ) + + # increment episode step counter + self.episode_step_iter += 1 + if self.episode_step_iter >= self.max_episode_steps: + next_tensordict.set("done", torch.tensor([True])) + return next_tensordict diff --git a/environments/roboarm_pickplace_v0/client.py b/environments/roboarm_pickplace_v0/client.py new file mode 100644 index 0000000..1490c94 --- /dev/null +++ b/environments/roboarm_pickplace_v0/client.py @@ -0,0 +1,149 @@ +import umath +import urandom +import ustruct +from micropython import kbd_intr +from pybricks.hubs import InventorHub +from pybricks.parameters import Port +from pybricks.pupdevices import ColorSensor, Motor +from pybricks.tools import wait +from uselect import poll + +# Standard MicroPython modules +from usys import stdin, stdout + +kbd_intr(-1) + +hub = InventorHub() + +# Initialize the drive base. 
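+# Motor naming used below: GM = grab/gripper motor, HM = high arm joint,
+# LM = low arm joint, RM = base rotation. The main loop skips any GM/HM/LM
+# command whose resulting angle would fall outside the configured range.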
+# Grab Motor range (130, 179) left side closed (-148, -45) +grab_motor_range = (-148, -45) +grab_motor = Motor(Port.E) +grab_motor.run_target(speed=400, target_angle=-95) # start roughly in the middle +# High Motor range (-150, 30) +high_motor_range = (-150, 10) +high_motor = Motor(Port.A) +high_motor.run_target(speed=400, target_angle=-70) + +# Low motor range (10, 70) +low_motor_range = (10, 100) +low_motor = Motor(Port.D) +low_motor.control.limits(500, 1200, 1000) +low_motor.run_target(speed=400, target_angle=40) + +# Rotation motor range (-360, 360) +# observe as its basically ~ 180 +rotation_motor = Motor(Port.B) + +# color_sensor = ColorSensor(Port.C) +motors = {"GM": grab_motor, "HM": high_motor, "LM": low_motor, "RM": rotation_motor} + + +def get_current_motor_angles(): + angles = {} + for k, v in motors.items(): + angle = normalize_angle(get_angle(v)) + angles.update({k: angle}) + return angles + + +def run_angle(motor, angle, speed=300): + motor.run_angle(speed=speed, rotation_angle=angle, wait=False) + + +def get_angle(motor): + return motor.angle() + + +def normalize_angle(angle, low_angle=-180, high_angle=179, original_one_round=360): + # Normalize angle to be within -179 to 179 degrees + while angle <= low_angle: + angle += original_one_round + while angle > high_angle: + angle -= original_one_round + return angle + + +def transform_range(value, old_min, old_max, new_min, new_max): + """ + Transform a value from one range to another. + + Parameters: + value (float): The value to transform. + old_min (float): The minimum value of the old range. + old_max (float): The maximum value of the old range. + new_min (float): The minimum value of the new range. + new_max (float): The maximum value of the new range. + + Returns: + float: The transformed value. 
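+
+    Example (illustrative): transform_range(0.5, -1, 1, -180, 180) returns 90.0,
+    i.e. an action of 0.5 maps three quarters of the way from -180 to 180.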
+ """ + # Compute the scale factor between the old and new ranges + scale = (new_max - new_min) / (old_max - old_min) + # Apply the transformation + return new_min + (value - old_min) * scale + + +keyboard = poll() +keyboard.register(stdin) + +while True: + + while not keyboard.poll(0): + wait(1) + + # Read action values for the motors + data = stdin.buffer.read(16) # Reading 4 bytes (4 floats) + rotation_action, low_action, high_action, grab_action = ustruct.unpack( + "!ffff", data + ) + + # transform action range for motors + grab_action = transform_range(grab_action, -1, 1, -25, 25) + high_action = transform_range(high_action, -1, 1, -60, 60) + low_action = transform_range(low_action, -1, 1, -30, 30) + rotation_action = transform_range(rotation_action, -1, 1, -180, 180) + + angles = get_current_motor_angles() + + if not (angles["GM"] + grab_action > max(grab_motor_range)) and not ( + angles["GM"] + grab_action < min(grab_motor_range) + ): + grab_motor.run_angle(speed=250, rotation_angle=grab_action, wait=False) + + if not (angles["HM"] + high_action > max(high_motor_range)) and not ( + angles["HM"] + high_action < min(high_motor_range) + ): + high_motor.run_angle(speed=250, rotation_angle=high_action, wait=False) + + if not (angles["LM"] + low_action > max(low_motor_range)) and not ( + angles["LM"] + low_action < min(low_motor_range) + ): + low_motor.control.limits(500, 1200, 1000) + low_motor.run_angle(speed=250, rotation_angle=low_action, wait=False) + + # if not (angles["RM"] + rotation_action > 180) or not (angles["RM"] + rotation_action < -180): + rotation_motor.run_angle(speed=250, rotation_angle=rotation_action, wait=False) + + wait(250) + + rotation_angle = rotation_motor.angle() + high_angle = high_motor.angle() + grab_angle = grab_motor.angle() + low_angle = low_motor.angle() + + # sometimes low angle jumps out of range and cant move back this corrects those cases + if low_angle < 10: + low_motor.run_target(speed=200, target_angle=10) + + # GM HM LM RM + out_msg = ustruct.pack( + "!ffff", + grab_angle, + normalize_angle(high_angle), + low_angle, + normalize_angle( + rotation_angle, low_angle=-900, high_angle=900, original_one_round=1800 + ), + ) + stdout.buffer.write(out_msg) diff --git a/src/agents/keyboard.py b/src/agents/keyboard.py index 40a54b6..56929e9 100644 --- a/src/agents/keyboard.py +++ b/src/agents/keyboard.py @@ -89,28 +89,16 @@ def create_replay_buffer( prefetch=3, ): """Create replay buffer""" - if prb: - replay_buffer = TensorDictPrioritizedReplayBuffer( - alpha=0.7, - beta=0.5, - pin_memory=False, - prefetch=1, - storage=LazyTensorStorage( - buffer_size, - device=device, - ), - ) - else: - replay_buffer = TensorDictReplayBuffer( - pin_memory=False, - prefetch=prefetch, - storage=LazyMemmapStorage( - buffer_size, - scratch_dir=buffer_scratch_dir, - device=device, - ), - batch_size=batch_size, - ) + replay_buffer = TensorDictReplayBuffer( + pin_memory=False, + prefetch=prefetch, + storage=LazyMemmapStorage( + buffer_size, + scratch_dir=buffer_scratch_dir, + device=device, + ), + batch_size=batch_size, + ) return replay_buffer @torch.no_grad() From 732226bbdf49c5438e96a26a7d6f390b3fd1b82e Mon Sep 17 00:00:00 2001 From: BY571 Date: Wed, 3 Jul 2024 09:50:16 +0200 Subject: [PATCH 12/53] update pickplace client, roboarm eval --- environments/roboarm_pickplace_v0/client.py | 2 +- experiments/roboarm/eval.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/environments/roboarm_pickplace_v0/client.py b/environments/roboarm_pickplace_v0/client.py 
index 1490c94..e7a6b72 100644 --- a/environments/roboarm_pickplace_v0/client.py +++ b/environments/roboarm_pickplace_v0/client.py @@ -4,7 +4,7 @@ from micropython import kbd_intr from pybricks.hubs import InventorHub from pybricks.parameters import Port -from pybricks.pupdevices import ColorSensor, Motor +from pybricks.pupdevices import Motor from pybricks.tools import wait from uselect import poll diff --git a/experiments/roboarm/eval.py b/experiments/roboarm/eval.py index bb70b11..085d98b 100644 --- a/experiments/roboarm/eval.py +++ b/experiments/roboarm/eval.py @@ -16,7 +16,7 @@ from environments import make_env, VIDEO_LOGGING_ENVS from src.agents import get_agent -from src.utils import create_video_from_images, login, setup_check +from src.utils import create_video_from_images, login, setup_check, logout @hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config") @@ -101,7 +101,7 @@ def run(cfg: DictConfig) -> None: except KeyboardInterrupt: print("Evaluation interrupted by user.") - + logout(agent) env.close() From b429e9dca31a7eab6ea015406e3f9bbd5899bdbc Mon Sep 17 00:00:00 2001 From: BY571 Date: Wed, 3 Jul 2024 09:51:53 +0200 Subject: [PATCH 13/53] update replay buffer saving --- src/agents/keyboard.py | 25 +++++++++++++++---------- src/agents/sac.py | 14 ++++++++++---- src/agents/td3.py | 7 ++++++- src/utils.py | 2 +- 4 files changed, 32 insertions(+), 16 deletions(-) diff --git a/src/agents/keyboard.py b/src/agents/keyboard.py index 56929e9..d652504 100644 --- a/src/agents/keyboard.py +++ b/src/agents/keyboard.py @@ -29,10 +29,10 @@ def __init__(self, state_spec, action_spec, agent_config, device="cpu"): } self.current_action = None self.setup_key_listener() - + self.buffer_batch_size = agent_config.batch_size # Define Replay Buffer self.replay_buffer = self.create_replay_buffer( - batch_size=agent_config.batch_size, + batch_size=self.buffer_batch_size, prb=False, buffer_size=agent_config.buffer_size, device=device, @@ -69,7 +69,12 @@ def load_model(self, path): def load_replaybuffer(self, path): """load replay buffer""" try: - self.replay_buffer.load_state_dict(torch.load(path)) + self.replay_buffer.load(path) + if self.replay_buffer._batch_size != self.buffer_batch_size: + Warning( + "Batch size of the loaded replay buffer is different from the agent's config batch size! Rewriting the batch size to match the agent's config batch size." 
+ ) + self.replay_buffer._batch_size = self.buffer_batch_size print("Replay Buffer loaded") print("Replay Buffer size: ", self.replay_buffer.__len__(), "\n") except: @@ -84,7 +89,7 @@ def create_replay_buffer( batch_size=256, prb=False, buffer_size=100000, - buffer_scratch_dir=None, + buffer_scratch_dir="./scratch", device="cpu", prefetch=3, ): @@ -95,10 +100,10 @@ def create_replay_buffer( storage=LazyMemmapStorage( buffer_size, scratch_dir=buffer_scratch_dir, - device=device, ), batch_size=batch_size, ) + replay_buffer.append_transform(lambda x: x.to(device)) return replay_buffer @torch.no_grad() @@ -111,11 +116,11 @@ def get_action(self, td: TensorDictBase) -> TensorDictBase: @torch.no_grad() def get_eval_action(self, td: TensorDictBase) -> TensorDictBase: - """Get eval action from actor network""" - with set_exploration_type(ExplorationType.MODE): - out_td = self.actor(td.to(self.device)) - self.td_preprocessing(out_td) - return out_td + """Get action from actor network or keyboard""" + while self.current_action is None: + time.sleep(0.01) # Add a small sleep to avoid blocking + td.set("action", torch.tensor(self.current_action).float().unsqueeze(0)) + return td def add_experience(self, transition: td.TensorDict): """Add experience to replay buffer""" diff --git a/src/agents/sac.py b/src/agents/sac.py index 9303b06..639dd2a 100644 --- a/src/agents/sac.py +++ b/src/agents/sac.py @@ -51,12 +51,13 @@ def __init__(self, state_spec, action_spec, agent_config, device="cpu"): self.reset_params = agent_config.reset_params # Define Replay Buffer + self.buffer_batch_size = agent_config.batch_size self.replay_buffer = self.create_replay_buffer( - batch_size=agent_config.batch_size, + batch_size=self.buffer_batch_size, prb=agent_config.prb, buffer_size=agent_config.buffer_size, - device=device, ) + self.replay_buffer.append_transform(lambda x: x.to(device)) # Define Optimizer critic_params = list( @@ -100,7 +101,12 @@ def load_model(self, path): def load_replaybuffer(self, path): """load replay buffer""" try: - self.replay_buffer.load_state_dict(torch.load(path)) + self.replay_buffer.load(path) + if self.replay_buffer._batch_size != self.buffer_batch_size: + Warning( + "Batch size of the loaded replay buffer is different from the agent's config batch size! Rewriting the batch size to match the agent's config batch size." + ) + self.replay_buffer._batch_size = self.buffer_batch_size print("Replay Buffer loaded") print("Replay Buffer size: ", self.replay_buffer.__len__(), "\n") except: @@ -133,7 +139,7 @@ def create_replay_buffer( batch_size=256, prb=False, buffer_size=100000, - buffer_scratch_dir=None, + buffer_scratch_dir=".", device="cpu", prefetch=3, ): diff --git a/src/agents/td3.py b/src/agents/td3.py index 885109d..f0a033e 100644 --- a/src/agents/td3.py +++ b/src/agents/td3.py @@ -111,7 +111,12 @@ def load_model(self, path): def load_replaybuffer(self, path): """load replay buffer""" try: - self.replay_buffer.load_state_dict(torch.load(path)) + self.replay_buffer.load(path) + if self.replay_buffer._batch_size != self.buffer_batch_size: + Warning( + "Batch size of the loaded replay buffer is different from the agent's config batch size! Rewriting the batch size to match the agent's config batch size." 
+ ) + self.replay_buffer._batch_size = self.buffer_batch_size print("Replay Buffer loaded") print("Replay Buffer size: ", self.replay_buffer.__len__(), "\n") except: diff --git a/src/utils.py b/src/utils.py index 31f6803..f48eacc 100644 --- a/src/utils.py +++ b/src/utils.py @@ -79,7 +79,7 @@ def logout(agent): save_dict.update(buffer_dict) if len(save_dict) > 0: save_name = input("Enter the name of the file to save: ") - torch.save(save_dict, save_name + ".pth") + agent.replay_buffer.dump(save_name) def login(agent): From 79a90c623ceec98c8802596d3fc995cf829d697d Mon Sep 17 00:00:00 2001 From: BY571 Date: Wed, 3 Jul 2024 09:53:40 +0200 Subject: [PATCH 14/53] tanh low high update --- src/networks/networks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/networks/networks.py b/src/networks/networks.py index 35964b1..c954d41 100644 --- a/src/networks/networks.py +++ b/src/networks/networks.py @@ -318,8 +318,8 @@ def get_vec_stochastic_actor( dist_class = TanhNormal dist_kwargs = { - "min": action_spec.space.minimum, - "max": action_spec.space.maximum, + "min": action_spec.space.low, + "max": action_spec.space.high, "tanh_loc": False, } actor_extractor = NormalParamExtractor( From 0149606dab4febbedc2190c1e948020851403ee5 Mon Sep 17 00:00:00 2001 From: BY571 Date: Wed, 3 Jul 2024 14:47:58 +0200 Subject: [PATCH 15/53] add pynput install to requirements --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 7ca509c..bcfa85e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,4 +9,5 @@ tqdm==4.66.1 pytest==8.0.2 ufmt pre-commit -numpy==1.24.1 \ No newline at end of file +numpy==1.24.1 +pynput \ No newline at end of file From e9a14e1a8c9b849443d30142567afac9a3ef7262 Mon Sep 17 00:00:00 2001 From: BY571 Date: Wed, 3 Jul 2024 15:07:54 +0200 Subject: [PATCH 16/53] update pickplace env and keyboard agent --- .../RoboArmPickPlaceEnv.py | 56 ++++++------------- src/agents/keyboard.py | 4 +- 2 files changed, 20 insertions(+), 40 deletions(-) diff --git a/environments/roboarm_pickplace_v0/RoboArmPickPlaceEnv.py b/environments/roboarm_pickplace_v0/RoboArmPickPlaceEnv.py index 11e7140..f676c0c 100644 --- a/environments/roboarm_pickplace_v0/RoboArmPickPlaceEnv.py +++ b/environments/roboarm_pickplace_v0/RoboArmPickPlaceEnv.py @@ -21,15 +21,14 @@ class RoboArmPickPlaceEnv_v0(BaseEnv): state_dim = 4 # (GM, HM, LM, RM) - motor_ranges = { + observation_ranges = { "GM": (-148, -45), # Grab motor range is 0-180 "HM": (-150, 10), "LM": (10, 100), "RM": (-179, 179), # Rotation motor needs to be place in the center } - vec_observation_key = "vec_observation" - image_observation_key = "image_observation" - # original_image_key = "original_image" + observation_key = "observation" + pixels_observation_key = "pixels" def __init__( self, @@ -62,10 +61,10 @@ def __init__( # Define observation spec bounds = torch.tensor( [ - self.motor_ranges["GM"], - self.motor_ranges["HM"], - self.motor_ranges["LM"], - self.motor_ranges["RM"], + self.observation_ranges["GM"], + self.observation_ranges["HM"], + self.observation_ranges["LM"], + self.observation_ranges["RM"], ] ) @@ -83,7 +82,7 @@ def __init__( raise ValueError("Camera not available.") resized_frame = cv2.resize(frame, (64, 64)) shape = resized_frame.shape - image_observation_spec = BoundedTensorSpec( + pixels_observation_spec = BoundedTensorSpec( low=torch.zeros((1,) + shape, dtype=torch.uint8), high=torch.ones((1,) + shape, dtype=torch.uint8) * 255, 
dtype=torch.uint8, @@ -91,8 +90,8 @@ def __init__( self.observation_spec = CompositeSpec( { - self.vec_observation_key: observation_spec, - self.image_observation_key: image_observation_spec, + self.observation_key: observation_spec, + self.pixels_observation_key: pixels_observation_spec, }, shape=(1,), ) @@ -103,21 +102,6 @@ def __init__( action_dim=self.action_dim, state_dim=self.state_dim, verbose=verbose ) - def normalize_state(self, state: np.ndarray, key: str) -> torch.Tensor: - """ - Normalize the state to be processed by the agent. - - Args: - state (np.ndarray): The state to be normalized. - - Returns: - torch.Tensor: The normalized state. - """ - state = (torch.from_numpy(state) - self.observation_spec[key].space.low) / ( - self.observation_spec[key].space.high - self.observation_spec[key].space.low - ) - return state - def init_camera_position( self, ): @@ -153,18 +137,16 @@ def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: self.send_to_hub(action) time.sleep(self.sleep_time) observation = self.read_from_hub() - # assert - norm_obs = self.normalize_state(observation, self.vec_observation_key) ret, frame = self.camera.read() resized_frame = cv2.resize(frame, (64, 64)) return TensorDict( { - self.vec_observation_key: norm_obs.float(), - self.image_observation_key: torch.from_numpy(resized_frame)[None, :].to( - torch.uint8 - ), + self.observation_key: torch.tensor(observation).float(), + self.pixels_observation_key: torch.from_numpy(resized_frame)[ + None, : + ].float(), }, batch_size=[1], ) @@ -202,12 +184,10 @@ def _step(self, tensordict: TensorDictBase) -> TensorDictBase: resized_frame = cv2.resize(frame, (64, 64)) next_tensordict = TensorDict( { - self.vec_observation_key: self.normalize_state( - next_observation, self.vec_observation_key - ).float(), - self.image_observation_key: torch.from_numpy(resized_frame)[None, :].to( - torch.uint8 - ), + self.observation_key: torch.tensor(next_observation).float(), + self.pixels_observation_key: torch.from_numpy(resized_frame)[ + None, : + ].float(), "reward": torch.tensor([reward]).float(), "done": torch.tensor([done]).bool(), }, diff --git a/src/agents/keyboard.py b/src/agents/keyboard.py index d652504..b42d19d 100644 --- a/src/agents/keyboard.py +++ b/src/agents/keyboard.py @@ -4,8 +4,8 @@ import torch from pynput import keyboard from tensordict import TensorDictBase -from torchrl.data import TensorDictPrioritizedReplayBuffer, TensorDictReplayBuffer -from torchrl.data.replay_buffers.storages import LazyMemmapStorage, LazyTensorStorage +from torchrl.data import TensorDictReplayBuffer +from torchrl.data.replay_buffers.storages import LazyMemmapStorage from src.agents.base import BaseAgent From 30f0aebaa4140b15379c3c30d0110aaa9cd77049 Mon Sep 17 00:00:00 2001 From: BY571 Date: Thu, 4 Jul 2024 16:43:02 +0200 Subject: [PATCH 17/53] tests --- .../RoboArmPickPlaceEnv.py | 4 ++-- src/agents/behavior_cloning.py | 18 +++++++++++++----- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/environments/roboarm_pickplace_v0/RoboArmPickPlaceEnv.py b/environments/roboarm_pickplace_v0/RoboArmPickPlaceEnv.py index f676c0c..80df5b0 100644 --- a/environments/roboarm_pickplace_v0/RoboArmPickPlaceEnv.py +++ b/environments/roboarm_pickplace_v0/RoboArmPickPlaceEnv.py @@ -146,7 +146,7 @@ def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: self.observation_key: torch.tensor(observation).float(), self.pixels_observation_key: torch.from_numpy(resized_frame)[ None, : - ].float(), + ].to(torch.uint8), }, 
batch_size=[1], ) @@ -187,7 +187,7 @@ def _step(self, tensordict: TensorDictBase) -> TensorDictBase: self.observation_key: torch.tensor(next_observation).float(), self.pixels_observation_key: torch.from_numpy(resized_frame)[ None, : - ].float(), + ].to(torch.uint8), "reward": torch.tensor([reward]).float(), "done": torch.tensor([done]).bool(), }, diff --git a/src/agents/behavior_cloning.py b/src/agents/behavior_cloning.py index cddbbfb..4b7f216 100644 --- a/src/agents/behavior_cloning.py +++ b/src/agents/behavior_cloning.py @@ -7,6 +7,7 @@ from torchrl.data.replay_buffers.storages import LazyMemmapStorage from torchrl.envs.utils import ExplorationType, set_exploration_type +from torchrl.envs import RenameTransform from src.agents.base import BaseAgent from src.networks.networks import get_deterministic_actor, get_stochastic_actor @@ -50,7 +51,7 @@ def __init__(self, state_spec, action_spec, agent_config, device="cpu"): ) # create replay buffer - self.offline_data_path = None + self.batch_size = agent_config.batch_size self.replay_buffer = self.create_replay_buffer() # general stats @@ -75,7 +76,12 @@ def load_model(self, path): def load_replaybuffer(self, path): """load replay buffer""" try: - self.replay_buffer.load_state_dict(torch.load(path)) + self.replay_buffer.load(path) + if self.replay_buffer._batch_size != self.batch_size: + Warning( + "Batch size of the loaded replay buffer is different from the agent's config batch size! Rewriting the batch size to match the agent's config batch size." + ) + self.replay_buffer._batch_size = self.batch_size print("Replay Buffer loaded") print("Replay Buffer size: ", self.replay_buffer.__len__(), "\n") except: @@ -95,8 +101,8 @@ def get_eval_action(self, td: TensorDictBase) -> TensorDictBase: def create_replay_buffer( self, batch_size=256, - buffer_size=10000, - buffer_scratch_dir=None, + buffer_size=1000000, + buffer_scratch_dir="./tmp", device="cpu", prefetch=3, ): @@ -108,10 +114,10 @@ def create_replay_buffer( storage=LazyMemmapStorage( buffer_size, scratch_dir=buffer_scratch_dir, - device=device, ), batch_size=batch_size, ) + replay_buffer.append_transform(lambda x: x.to(device)) return replay_buffer @@ -165,10 +171,12 @@ def pretrain_deter(self, wandb, batch_size=64, num_updates=1): def train(self, batch_size=64, num_updates=1): """Train the agent""" log_data = {} + rename = RenameTransform(in_keys=["image_observation", ("next", "image_observation")], out_keys=["pixels", ("next", "pixels")]) for i in range(num_updates): batch = self.replay_buffer.sample(batch_size).to(self.device) orig_action = batch.get("action").clone() + out_dict = self.actor(batch) loss = torch.mean((out_dict.get("action") - orig_action) ** 2) self.optimizer.zero_grad() From ad3892795d0f5c8ee3bf762347cf705caf8c80ca Mon Sep 17 00:00:00 2001 From: BY571 Date: Thu, 4 Jul 2024 16:43:38 +0200 Subject: [PATCH 18/53] update config --- conf/config.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/config.yaml b/conf/config.yaml index 2bae809..4f22f49 100644 --- a/conf/config.yaml +++ b/conf/config.yaml @@ -4,10 +4,10 @@ run_name: "" verbose: 0 device: "cuda" -episodes: 200 +episodes: 5000 defaults: - _self_ # random, sac, td3, droq - - agent: keyboard - - env: roboarm-v0 \ No newline at end of file + - agent: bc + - env: roboarm_pickplace-v0 \ No newline at end of file From 540edee08ee76b005b73d1ce4b656a229c850fd1 Mon Sep 17 00:00:00 2001 From: BY571 Date: Mon, 5 Aug 2024 20:50:38 +0200 Subject: [PATCH 19/53] Update batched --- conf/agent/iql.yaml 
| 2 +- conf/agent/td3.yaml | 4 +- conf/config.yaml | 6 +- environments/__init__.py | 17 ++ environments/base/base_env.py | 10 + .../RoboArmPickPlaceEnv.py | 86 ++++-- experiments/2wheeler/eval.py | 5 +- experiments/roboarm/eval.py | 4 +- experiments/roboarm/train.py | 5 +- src/agents/behavior_cloning.py | 42 +-- src/agents/cql.py | 10 +- src/agents/iql.py | 25 +- src/agents/keyboard.py | 20 +- src/agents/random.py | 42 ++- src/agents/sac.py | 1 + src/agents/td3.py | 50 +++- src/networks/networks.py | 277 ++++++++++++++++-- src/utils.py | 11 +- 18 files changed, 505 insertions(+), 112 deletions(-) diff --git a/conf/agent/iql.yaml b/conf/agent/iql.yaml index 268c298..d2ca4a8 100644 --- a/conf/agent/iql.yaml +++ b/conf/agent/iql.yaml @@ -2,7 +2,7 @@ name: iql lr: 3e-4 batch_size: 256 num_updates: 1 -prefill_episodes: 10 +prefill_episodes: 0 num_cells: 256 gamma: 0.99 diff --git a/conf/agent/td3.yaml b/conf/agent/td3.yaml index 4465a71..3d9f440 100644 --- a/conf/agent/td3.yaml +++ b/conf/agent/td3.yaml @@ -15,4 +15,6 @@ dropout: 0.0 prb: 0 buffer_size: 1000000 -reset_params: False \ No newline at end of file +reset_params: False +use_bc: True +alpha: 1.0 \ No newline at end of file diff --git a/conf/config.yaml b/conf/config.yaml index 4f22f49..478d1c4 100644 --- a/conf/config.yaml +++ b/conf/config.yaml @@ -4,10 +4,10 @@ run_name: "" verbose: 0 device: "cuda" -episodes: 5000 +episodes: 250 defaults: - _self_ # random, sac, td3, droq - - agent: bc - - env: roboarm_pickplace-v0 \ No newline at end of file + - agent: sac + - env: walker_sim-v0 \ No newline at end of file diff --git a/environments/__init__.py b/environments/__init__.py index 190e7f2..fcde699 100644 --- a/environments/__init__.py +++ b/environments/__init__.py @@ -5,9 +5,11 @@ Compose, DoubleToFloat, ObservationNorm, + PermuteTransform, RewardSum, ToTensorImage, TransformedEnv, + VIPRewardTransform, ) from environments.roboarm_mixed_v0.RoboArmMixedEnv import RoboArmMixedEnv_v0 @@ -74,6 +76,18 @@ def make_env(config): if "pixels" in observation_keys: transforms.append(ToTensorImage(in_keys=["pixels"], from_int=True)) + if config.env.name == "roboarm_pickplace-v0" and config.env.use_vip_reward: + transforms.append(PermuteTransform((-1, -2, -3), in_keys=["pixels"])) + transforms.append( + VIPRewardTransform( + in_keys=["pixels"], + download=True, + size=100, + model_name="resnet50", + tensor_pixels_keys=["pixels", ("next", "pixels")], # Does not seem to work + ) + ) + env = TransformedEnv(env, Compose(*transforms)) action_spec = env.action_spec @@ -139,6 +153,9 @@ def make(name="RunAway", env_conf=None): verbose=env_conf.verbose, reward_signal=env_conf.reward_signal, camera_id=env_conf.camera_id, + image_size=env_conf.image_size, + target_image_path=env_conf.target_image_path, + use_vip_reward=env_conf.use_vip_reward, ) else: print("Environment not found") diff --git a/environments/base/base_env.py b/environments/base/base_env.py index 761a034..3861f58 100644 --- a/environments/base/base_env.py +++ b/environments/base/base_env.py @@ -121,6 +121,16 @@ def _reset( def _set_seed(self, seed: int): return super()._set_seed(seed) + def get_reset_tensordict(self, **kwargs) -> TensorDictBase: + """ """ + return TensorDict( + { + }, + batch_size=[ + 1, + ], + ) + class BaseSimEnv(EnvBase): """ diff --git a/environments/roboarm_pickplace_v0/RoboArmPickPlaceEnv.py b/environments/roboarm_pickplace_v0/RoboArmPickPlaceEnv.py index 80df5b0..948adf4 100644 --- a/environments/roboarm_pickplace_v0/RoboArmPickPlaceEnv.py +++ 
b/environments/roboarm_pickplace_v0/RoboArmPickPlaceEnv.py @@ -37,6 +37,11 @@ def __init__( verbose: bool = False, reward_signal: str = "dense", camera_id: int = 0, + image_size: Tuple[int, int] = (64, 64), + human_control: bool = False, + use_vip_reward: bool = False, + target_image_path: str = None, + mixed_observation: bool = True, ): self.sleep_time = sleep_time @@ -46,10 +51,22 @@ def __init__( ], "Reward signal must be dense or sparse." self.reward_signal = reward_signal self.max_episode_steps = max_episode_steps + self.image_size = image_size + self.human_control = human_control self.camera = cv2.VideoCapture(int(camera_id)) self._batch_size = torch.Size([1]) + if target_image_path is not None: + target_image = np.load(target_image_path) + else: + target_image = np.load( + "environments/roboarm_pickplace_v0/pickplace_green100_target.npy" + ) + self.target_image = target_image + self.use_vip_reward = use_vip_reward + self.mixed_observation = mixed_observation + # Define action spec self.action_spec = BoundedTensorSpec( low=-1, @@ -80,23 +97,30 @@ def __init__( ret, frame = self.camera.read() if not ret: raise ValueError("Camera not available.") - resized_frame = cv2.resize(frame, (64, 64)) + resized_frame = cv2.resize(frame, self.image_size) shape = resized_frame.shape pixels_observation_spec = BoundedTensorSpec( - low=torch.zeros((1,) + shape, dtype=torch.uint8), - high=torch.ones((1,) + shape, dtype=torch.uint8) * 255, - dtype=torch.uint8, - ) - - self.observation_spec = CompositeSpec( - { - self.observation_key: observation_spec, - self.pixels_observation_key: pixels_observation_spec, - }, - shape=(1,), + low=torch.zeros((1,) + shape, dtype=torch.int64), + high=torch.ones((1,) + shape, dtype=torch.int64) * 255, + dtype=torch.int64, ) + if self.mixed_observation: + self.observation_spec = CompositeSpec( + { + self.observation_key: observation_spec, + self.pixels_observation_key: pixels_observation_spec, + }, + shape=(1,), + ) + else: + self.observation_spec = CompositeSpec( + { + self.pixels_observation_key: pixels_observation_spec, + }, + shape=(1,), + ) - self.goal_positions = self.init_camera_position() + _ = self.init_camera_position() super().__init__( action_dim=self.action_dim, state_dim=self.state_dim, verbose=verbose @@ -121,6 +145,21 @@ def init_camera_position( return + def get_reset_tensordict(self, **kwargs) -> TensorDictBase: + """ """ + if self.use_vip_reward: + return TensorDict( + { + "goal_image": torch.from_numpy(self.target_image) + .to(torch.int64) + .unsqueeze(0), + }, + batch_size=[ + 1, + ], + ) + return TensorDict({},batch_size=[1]) + def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: """ Reset the environment and return the initial state. 
@@ -130,23 +169,23 @@ def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: """ # TODO solve this fake action sending before to receive first state self.episode_step_iter = 0 - if tensordict is not None: - action = tensordict.get("action").cpu().numpy().squeeze() - else: - action = np.zeros(self.action_dim) + action = np.zeros(self.action_dim) self.send_to_hub(action) time.sleep(self.sleep_time) observation = self.read_from_hub() ret, frame = self.camera.read() - resized_frame = cv2.resize(frame, (64, 64)) + resized_frame = cv2.resize(frame, self.image_size) return TensorDict( { self.observation_key: torch.tensor(observation).float(), self.pixels_observation_key: torch.from_numpy(resized_frame)[ None, : - ].to(torch.uint8), + ].to(torch.int64), + # "goal_image": torch.from_numpy(self.target_image) + # .to(torch.int64) + # .unsqueeze(0), }, batch_size=[1], ) @@ -181,18 +220,21 @@ def _step(self, tensordict: TensorDictBase) -> TensorDictBase: reward, done = self.reward( frame, ) - resized_frame = cv2.resize(frame, (64, 64)) + resized_frame = cv2.resize(frame, self.image_size) next_tensordict = TensorDict( { self.observation_key: torch.tensor(next_observation).float(), self.pixels_observation_key: torch.from_numpy(resized_frame)[ None, : - ].to(torch.uint8), + ].to(torch.int64), "reward": torch.tensor([reward]).float(), "done": torch.tensor([done]).bool(), + # "goal_image": torch.from_numpy(self.target_image) + # .to(torch.int64) + # .unsqueeze(0), }, batch_size=[1], - ) + ) # .to(tensordict.device) # increment episode step counter self.episode_step_iter += 1 diff --git a/experiments/2wheeler/eval.py b/experiments/2wheeler/eval.py index 1bf471f..f1710a4 100644 --- a/experiments/2wheeler/eval.py +++ b/experiments/2wheeler/eval.py @@ -17,7 +17,7 @@ from environments import make_env from src.agents import get_agent -from src.utils import login, setup_check +from src.utils import login, setup_check , logout @hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config") @@ -56,6 +56,7 @@ def run(cfg: DictConfig) -> None: td = agent.get_eval_action(td) actions.append(td.get("action").cpu().numpy()) td = env.step(td) + agent.add_experience(td) total_agent_step_time = time.time() - step_start_time total_step_times.append(total_agent_step_time) done = td.get(("next", "done"), False) @@ -90,7 +91,7 @@ def run(cfg: DictConfig) -> None: except KeyboardInterrupt: print("Evaluation interrupted by user.") - + logout(agent) env.close() diff --git a/experiments/roboarm/eval.py b/experiments/roboarm/eval.py index a70998b..8ecd0b0 100644 --- a/experiments/roboarm/eval.py +++ b/experiments/roboarm/eval.py @@ -42,7 +42,7 @@ def run(cfg: DictConfig) -> None: _ = input("Press Enter to start evaluation...") try: for e in tqdm(range(eval_episodes), desc="Evaluation"): - td = env.reset() + td = env.reset(env.get_reset_tensordict()) done = td.get("done", False) truncated = td.get("truncated", False) ep_return = 0 @@ -55,7 +55,7 @@ def run(cfg: DictConfig) -> None: ep_steps += 1 step_start_time = time.time() td = agent.get_eval_action(td) - td = env.step(td) + td = env.step(td.to("cpu")) if env_name in VIDEO_LOGGING_ENVS: image_caputres.append( td.get(("next", "original_pixels")).cpu().numpy() diff --git a/experiments/roboarm/train.py b/experiments/roboarm/train.py index 98d194e..5f8901f 100644 --- a/experiments/roboarm/train.py +++ b/experiments/roboarm/train.py @@ -1,7 +1,8 @@ import os import sys import time - +import torch +from tensordict import TensorDict import hydra 
import numpy as np import wandb @@ -60,7 +61,7 @@ def run(cfg: DictConfig) -> None: quit = False try: for e in tqdm(range(train_episodes), desc="Training"): - td = env.reset() + td = env.reset(env.get_reset_tensordict()) done = td.get("done", False) truncated = td.get("truncated", False) ep_return = 0 diff --git a/src/agents/behavior_cloning.py b/src/agents/behavior_cloning.py index 4b7f216..0c0d312 100644 --- a/src/agents/behavior_cloning.py +++ b/src/agents/behavior_cloning.py @@ -7,7 +7,7 @@ from torchrl.data.replay_buffers.storages import LazyMemmapStorage from torchrl.envs.utils import ExplorationType, set_exploration_type -from torchrl.envs import RenameTransform +from torchrl.envs import RenameTransform, ToTensorImage from src.agents.base import BaseAgent from src.networks.networks import get_deterministic_actor, get_stochastic_actor @@ -32,12 +32,10 @@ def __init__(self, state_spec, action_spec, agent_config, device="cpu"): self.actor = get_deterministic_actor( self.observation_keys, action_spec, agent_config ) - self.pretrain = self.pretrain_deter elif agent_config.policy_type == "stochastic": self.actor = get_stochastic_actor( self.observation_keys, action_spec, agent_config ) - self.pretrain = self.pretrain_stoch else: raise ValueError( "policy_type not recognized, choose deterministic or stochastic" @@ -76,7 +74,9 @@ def load_model(self, path): def load_replaybuffer(self, path): """load replay buffer""" try: - self.replay_buffer.load(path) + # self.replay_buffer.load(path) + loaded_data = TensorDictBase.load_memmap(path) + self.replay_buffer.extend(loaded_data) if self.replay_buffer._batch_size != self.batch_size: Warning( "Batch size of the loaded replay buffer is different from the agent's config batch size! Rewriting the batch size to match the agent's config batch size." 
@@ -118,6 +118,7 @@ def create_replay_buffer( batch_size=batch_size, ) replay_buffer.append_transform(lambda x: x.to(device)) + replay_buffer.append_transform(ToTensorImage(from_int=True, shape_tolerant=True)) return replay_buffer @@ -134,44 +135,11 @@ def get_action(self, state): def add_experience(self, transition: td.TensorDict): """Add experience to replay buffer""" - # TODO: for bc we dont want to add to replay buffer pass - def pretrain_stoch(self, wandb, batch_size=64, num_updates=1): - """Pretrain the agent with simple behavioral cloning""" - - for i in range(num_updates): - batch = self.replay_buffer.sample(batch_size) - input_td = td.TensorDict( - {"observation": batch["vec_observations"].float()}, batch_size=(256) - ) - dist = self.actor.get_dist(input_td) - loss = -dist.log_prob(batch["actions"]).mean() - self.optimizer.zero_grad() - loss.backward() - self.optimizer.step() - wandb.log({"pretrain/loss": loss.item()}) - - self.actor.eval() - - def pretrain_deter(self, wandb, batch_size=64, num_updates=1): - """Pretrain the agent with simple behavioral cloning""" - - for i in range(num_updates): - batch = self.replay_buffer.sample(batch_size) - pred, _ = self.actor(batch["vec_observations"].float()) - loss = torch.mean((pred - batch["actions"]) ** 2) - self.optimizer.zero_grad() - loss.backward() - self.optimizer.step() - wandb.log({"pretrain/loss": loss.item()}) - - self.actor.eval() - def train(self, batch_size=64, num_updates=1): """Train the agent""" log_data = {} - rename = RenameTransform(in_keys=["image_observation", ("next", "image_observation")], out_keys=["pixels", ("next", "pixels")]) for i in range(num_updates): batch = self.replay_buffer.sample(batch_size).to(self.device) diff --git a/src/agents/cql.py b/src/agents/cql.py index c65cbcc..c99a2e8 100644 --- a/src/agents/cql.py +++ b/src/agents/cql.py @@ -111,7 +111,12 @@ def load_model(self, path): def load_replaybuffer(self, path): """load replay buffer""" try: - self.replay_buffer.load_state_dict(torch.load(path)) + self.replay_buffer.load(path) + if self.replay_buffer._batch_size != self.batch_size: + Warning( + "Batch size of the loaded replay buffer is different from the agent's config batch size! Rewriting the batch size to match the agent's config batch size." 
+ ) + self.replay_buffer._batch_size = self.batch_size print("Replay Buffer loaded") print("Replay Buffer size: ", self.replay_buffer.__len__(), "\n") except: @@ -158,7 +163,6 @@ def create_replay_buffer( prefetch=1, storage=LazyTensorStorage( buffer_size, - device=device, ), ) else: @@ -168,10 +172,10 @@ def create_replay_buffer( storage=LazyMemmapStorage( buffer_size, scratch_dir=buffer_scratch_dir, - device=device, ), batch_size=batch_size, ) + replay_buffer.append_transform(lambda x: x.to(device)) return replay_buffer @torch.no_grad() diff --git a/src/agents/iql.py b/src/agents/iql.py index e0c3893..ed7f7a9 100644 --- a/src/agents/iql.py +++ b/src/agents/iql.py @@ -4,6 +4,7 @@ from torch import optim from torchrl.data import TensorDictPrioritizedReplayBuffer, TensorDictReplayBuffer from torchrl.data.replay_buffers.storages import LazyMemmapStorage, LazyTensorStorage +from torchrl.envs.transforms import ToTensorImage from torchrl.envs.utils import ExplorationType, set_exploration_type from torchrl.objectives import SoftUpdate @@ -53,8 +54,10 @@ def __init__(self, state_spec, action_spec, agent_config, device="cpu"): self.reset_params = agent_config.reset_params # Define Replay Buffer + self.batch_size = agent_config.batch_size + self.replay_buffer = self.create_replay_buffer( - batch_size=agent_config.batch_size, + batch_size=self.batch_size, prb=agent_config.prb, buffer_size=agent_config.buffer_size, device=device, @@ -98,6 +101,7 @@ def get_agent_statedict(self): def load_model(self, path): """load model""" + try: statedict = torch.load(path) self.actor.load_state_dict(statedict["actor"]) @@ -110,7 +114,14 @@ def load_model(self, path): def load_replaybuffer(self, path): """load replay buffer""" try: - self.replay_buffer.load_state_dict(torch.load(path)) + # self.replay_buffer.load(path) + loaded_data = TensorDictBase.load_memmap(path) + self.replay_buffer.extend(loaded_data) + if self.replay_buffer._batch_size != self.batch_size: + Warning( + "Batch size of the loaded replay buffer is different from the agent's config batch size! Rewriting the batch size to match the agent's config batch size." 
+ ) + self.replay_buffer._batch_size = self.batch_size print("Replay Buffer loaded") print("Replay Buffer size: ", self.replay_buffer.__len__(), "\n") except: @@ -168,10 +179,18 @@ def create_replay_buffer( storage=LazyMemmapStorage( buffer_size, scratch_dir=buffer_scratch_dir, - device=device, ), batch_size=batch_size, ) + replay_buffer.append_transform(lambda x: x.to(device)) + replay_buffer.append_transform( + ToTensorImage( + from_int=True, + shape_tolerant=True, + in_keys=["pixels", ("next", "pixels")], + ) + ) + return replay_buffer @torch.no_grad() diff --git a/src/agents/keyboard.py b/src/agents/keyboard.py index b42d19d..bf511d4 100644 --- a/src/agents/keyboard.py +++ b/src/agents/keyboard.py @@ -18,8 +18,8 @@ def __init__(self, state_spec, action_spec, agent_config, device="cpu"): # Define the key to action mapping self.key_action_mapping = { - "a": [-0.15, 0, 0, 0], # Rotate motor -30 - "d": [0.15, 0, 0, 0], # Rotate motor +30 + "a": [0.15, 0, 0, 0], # Rotate motor -30 + "d": [-0.15, 0, 0, 0], # Rotate motor +30 "s": [0, -0.20, 0, 0], # Low motor -10 "w": [0, 0.20, 0, 0], # Low motor +10 "q": [0, 0, -0.25, 0], # High motor -15 @@ -69,7 +69,9 @@ def load_model(self, path): def load_replaybuffer(self, path): """load replay buffer""" try: - self.replay_buffer.load(path) + # self.replay_buffer.load(path) + loaded_data = TensorDictBase.load_memmap(path).to_tensordict() + self.replay_buffer.extend(loaded_data) if self.replay_buffer._batch_size != self.buffer_batch_size: Warning( "Batch size of the loaded replay buffer is different from the agent's config batch size! Rewriting the batch size to match the agent's config batch size." @@ -104,6 +106,9 @@ def create_replay_buffer( batch_size=batch_size, ) replay_buffer.append_transform(lambda x: x.to(device)) + + + return replay_buffer @torch.no_grad() @@ -124,7 +129,14 @@ def get_eval_action(self, td: TensorDictBase) -> TensorDictBase: def add_experience(self, transition: td.TensorDict): """Add experience to replay buffer""" - self.replay_buffer.extend(transition) + + # transform pixels to int if pixels in observation spec + save_transition = transition.copy() + if "pixels" in transition.keys(): + save_transition.set("pixels", (save_transition.get("pixels")*255).to(torch.int64)) + save_transition.set(("next", "pixels"), (save_transition.get(("next", "pixels"))*255).to(torch.int64)) + + self.replay_buffer.extend(save_transition) self.collected_transitions += 1 def train(self, batch_size=64, num_updates=1): diff --git a/src/agents/random.py b/src/agents/random.py index 86fd87d..389e918 100644 --- a/src/agents/random.py +++ b/src/agents/random.py @@ -13,7 +13,13 @@ def __init__(self, state_spec, action_spec, agent_config, device="cpu"): ) self.actor = None - self.replay_buffer = {} + self.replay_buffer = self.create_replay_buffer( + batch_size=256, + prb=False, + buffer_size=1000000, + device=device, + buffer_scratch_dir="/tmp", + ) def eval(self): """Sets the agent to evaluation mode.""" @@ -37,3 +43,37 @@ def add_experience(self, transition: TensorDictBase): def train(self, batch_size=64, num_updates=1): """Train the agent""" return {} + + + def create_replay_buffer( + self, + batch_size=256, + prb=False, + buffer_size=100000, + buffer_scratch_dir=None, + device="cpu", + prefetch=3, + ): + """Create replay buffer""" + # TODO: make this part of base off policy agent + if prb: + replay_buffer = TensorDictPrioritizedReplayBuffer( + alpha=0.7, + beta=0.5, + pin_memory=False, + prefetch=1, + storage=LazyTensorStorage( + buffer_size, + ), + ) + 
else: + replay_buffer = TensorDictReplayBuffer( + pin_memory=False, + prefetch=prefetch, + storage=LazyMemmapStorage( + buffer_size, + scratch_dir=buffer_scratch_dir, + ), + batch_size=batch_size, + ) + return replay_buffer \ No newline at end of file diff --git a/src/agents/sac.py b/src/agents/sac.py index d2b5753..e36d327 100644 --- a/src/agents/sac.py +++ b/src/agents/sac.py @@ -58,6 +58,7 @@ def __init__(self, state_spec, action_spec, agent_config, device="cpu"): prb=agent_config.prb, buffer_size=agent_config.buffer_size, buffer_scratch_dir="/tmp", + device=device, ) # Define Optimizer critic_params = list( diff --git a/src/agents/td3.py b/src/agents/td3.py index 5c1b9e5..5c4c024 100644 --- a/src/agents/td3.py +++ b/src/agents/td3.py @@ -10,6 +10,8 @@ from torchrl.modules import AdditiveGaussianWrapper from torchrl.objectives import SoftUpdate from torchrl.objectives.td3 import TD3Loss +from torchrl.objectives.td3_bc import TD3BCLoss +from torchrl.envs.transforms import ToTensorImage from src.agents.base import BaseAgent from src.networks.networks import get_critic, get_deterministic_actor @@ -36,6 +38,9 @@ def __init__(self, state_spec, action_spec, agent_config, device="cpu"): self.critic = get_critic(self.observation_keys, agent_config) self.model = nn.ModuleList([self.actor, self.critic]).to(device) + + print(self.actor) + print(self.critic) # initialize networks self.init_nets(self.model) @@ -48,14 +53,27 @@ def __init__(self, state_spec, action_spec, agent_config, device="cpu"): ).to(device) # define loss function - self.loss_module = TD3Loss( - actor_network=self.model[0], - qvalue_network=self.model[1], - action_spec=action_spec, - num_qvalue_nets=2, - loss_function=agent_config.loss_function, - separate_losses=False, - ) + self.use_bc = agent_config.use_bc + if not self.use_bc: + self.loss_module = TD3Loss( + actor_network=self.model[0], + qvalue_network=self.model[1], + action_spec=action_spec, + num_qvalue_nets=2, + loss_function=agent_config.loss_function, + separate_losses=False, + ) + else: + self.loss_module = TD3BCLoss( + actor_network=self.model[0], + qvalue_network=self.model[1], + action_spec=action_spec, + num_qvalue_nets=2, + loss_function=agent_config.loss_function, + separate_losses=False, + alpha=agent_config.alpha, + ) + # Define Target Network Updater self.target_net_updater = SoftUpdate( self.loss_module, eps=agent_config.soft_update_eps @@ -112,7 +130,9 @@ def load_model(self, path): def load_replaybuffer(self, path): """load replay buffer""" try: - self.replay_buffer.load(path) + #self.replay_buffer.load(path) + loaded_data = TensorDictBase.load_memmap(path) + self.replay_buffer.extend(loaded_data) if self.replay_buffer._batch_size != self.batch_size: Warning( "Batch size of the loaded replay buffer is different from the agent's config batch size! Rewriting the batch size to match the agent's config batch size." 
@@ -163,6 +183,13 @@ def create_replay_buffer( batch_size=batch_size, ) replay_buffer.append_transform(lambda x: x.to(device)) + # replay_buffer.append_transform( + # ToTensorImage( + # from_int=True, + # shape_tolerant=True, + # in_keys=["pixels", ("next", "pixels")], + # ) + # ) return replay_buffer def td_preprocessing(self, td: TensorDictBase) -> TensorDictBase: @@ -215,7 +242,10 @@ def train(self, batch_size=64, num_updates=1): else: sampled_tensordict = sampled_tensordict.clone() # Update Critic Network - q_loss, _ = self.loss_module.value_loss(sampled_tensordict) + if self.use_bc: + q_loss, _ = self.loss_module.qvalue_loss(sampled_tensordict) + else: + q_loss, _ = self.loss_module.value_loss(sampled_tensordict) self.optimizer_critic.zero_grad() q_loss.backward() self.optimizer_critic.step() diff --git a/src/networks/networks.py b/src/networks/networks.py index c6b13cb..98d74d3 100644 --- a/src/networks/networks.py +++ b/src/networks/networks.py @@ -36,6 +36,16 @@ def get_critic(observation_keys, agent_config): normalization=agent_config.normalization, dropout=agent_config.dropout, ) + elif "pixels" in observation_keys and not "observation" in observation_keys: + return get_img_only_critic( + img_in_keys="pixels", + num_cells=[agent_config.num_cells, agent_config.num_cells], + out_features=1, + activation_class=nn.ReLU, + normalization=agent_config.normalization, + dropout=agent_config.dropout, + ) + elif "pixels" in observation_keys and "observation" in observation_keys: return get_mixed_critic( vec_in_keys="observation", @@ -51,10 +61,7 @@ def get_critic(observation_keys, agent_config): def get_value_operator(observation_keys, agent_config): - if ( - "vec_observation" in observation_keys - and not "image_observation" in observation_keys - ): + if "observation" in observation_keys and not "pixels" in observation_keys: return get_vec_value( in_keys=observation_keys, num_cells=[agent_config.num_cells, agent_config.num_cells], @@ -63,13 +70,19 @@ def get_value_operator(observation_keys, agent_config): normalization=agent_config.normalization, dropout=agent_config.dropout, ) - elif ( - "image_observation" in observation_keys - and "vec_observation" in observation_keys - ): + elif "pixels" in observation_keys and not "observation" in observation_keys: + return get_img_only_value( + img_in_keys="pixels", + num_cells=[agent_config.num_cells, agent_config.num_cells], + out_features=1, + activation_class=nn.ReLU, + normalization=agent_config.normalization, + dropout=agent_config.dropout + ) + elif "pixels" in observation_keys and "observation" in observation_keys: return get_mixed_value( - vec_in_keys="vec_observation", - img_in_keys="image_observation", + vec_in_keys="observation", + img_in_keys="pixels", num_cells=[agent_config.num_cells, agent_config.num_cells], out_features=1, activation_class=nn.ReLU, @@ -103,6 +116,47 @@ def get_vec_value( ) return qvalue +def get_img_only_value( + img_in_keys, + num_cells=[256, 256], + out_features=1, + activation_class=nn.ReLU, + normalization="None", + dropout=0.0, +): + normalization = get_normalization(normalization) + # image encoder + cnn = ConvNet( + activation_class=activation_class, + num_cells=[32, 64, 64], + kernel_sizes=[8, 4, 3], + strides=[4, 2, 1], + ) + cnn_output = cnn(torch.ones((3, 100, 100))) + mlp = MLP( + in_features=cnn_output.shape[-1], + activation_class=activation_class, + out_features=128, + num_cells=[256], + ) + image_encoder = SafeModule( + torch.nn.Sequential(cnn, mlp), + in_keys=[img_in_keys], + 
out_keys=["pixel_embedding"], + ) + + # output head + mlp = MLP( + activation_class=torch.nn.ReLU, + out_features=out_features, + num_cells=num_cells, + norm_class=normalization, + norm_kwargs={"normalized_shape": num_cells[-1]} if normalization else None, + dropout=dropout, + ) + v_head = ValueOperator(mlp, ["pixel_embedding"]) + # model + return SafeSequential(image_encoder, v_head) def get_mixed_value( vec_in_keys, @@ -121,7 +175,7 @@ def get_mixed_value( kernel_sizes=[8, 4, 3], strides=[4, 2, 1], ) - cnn_output = cnn(torch.ones((3, 64, 64))) + cnn_output = cnn(torch.ones((3, 100, 100))) mlp = MLP( in_features=cnn_output.shape[-1], activation_class=activation_class, @@ -131,7 +185,7 @@ def get_mixed_value( image_encoder = SafeModule( torch.nn.Sequential(cnn, mlp), in_keys=[img_in_keys], - out_keys=["image_embedding"], + out_keys=["pixel_embedding"], ) # vector_obs encoder @@ -141,7 +195,7 @@ def get_mixed_value( num_cells=[128], ) vector_obs_encoder = SafeModule( - mlp, in_keys=[vec_in_keys], out_keys=["vec_obs_embedding"] + mlp, in_keys=[vec_in_keys], out_keys=["obs_embedding"] ) # output head @@ -153,7 +207,7 @@ def get_mixed_value( norm_kwargs={"normalized_shape": num_cells[-1]} if normalization else None, dropout=dropout, ) - v_head = ValueOperator(mlp, ["image_embedding", "vec_obs_embedding"]) + v_head = ValueOperator(mlp, ["pixel_embedding", "obs_embedding"]) # model return SafeSequential(image_encoder, vector_obs_encoder, v_head) @@ -201,7 +255,7 @@ def get_mixed_critic( kernel_sizes=[8, 4, 3], strides=[4, 2, 1], ) - cnn_output = cnn(torch.ones((3, 64, 64))) + cnn_output = cnn(torch.ones((3, 100, 100))) mlp = MLP( in_features=cnn_output.shape[-1], activation_class=activation_class, @@ -238,6 +292,49 @@ def get_mixed_critic( return SafeSequential(image_encoder, vector_obs_encoder, v_head) +def get_img_only_critic( + img_in_keys, + num_cells=[256, 256], + out_features=1, + activation_class=nn.ReLU, + normalization="None", + dropout=0.0, +): + normalization = get_normalization(normalization) + # image encoder + cnn = ConvNet( + activation_class=activation_class, + num_cells=[32, 64, 64], + kernel_sizes=[8, 4, 3], + strides=[4, 2, 1], + ) + cnn_output = cnn(torch.ones((3, 100, 100))) + mlp = MLP( + in_features=cnn_output.shape[-1], + activation_class=activation_class, + out_features=128, + num_cells=[256], + ) + image_encoder = SafeModule( + torch.nn.Sequential(cnn, mlp), + in_keys=[img_in_keys], + out_keys=["pixel_embedding"], + ) + + # output head + mlp = MLP( + activation_class=torch.nn.ReLU, + out_features=out_features, + num_cells=num_cells, + norm_class=normalization, + norm_kwargs={"normalized_shape": num_cells[-1]} if normalization else None, + dropout=dropout, + ) + v_head = ValueOperator(mlp, ["pixel_embedding", "action"]) + # model + return SafeSequential(image_encoder, v_head) + + def get_deterministic_actor(observation_keys, action_spec, agent_config): if "observation" in observation_keys and not "pixels" in observation_keys: return get_vec_deterministic_actor( @@ -246,6 +343,14 @@ def get_deterministic_actor(observation_keys, action_spec, agent_config): num_cells=[agent_config.num_cells, agent_config.num_cells], activation_class=nn.ReLU, ) + + elif "pixels" in observation_keys and not "observation" in observation_keys: + return get_img_only_det_actor( + img_in_keys="pixels", + action_spec=action_spec, + num_cells=[agent_config.num_cells, agent_config.num_cells], + activation_class=nn.ReLU, + ) elif "pixels" in observation_keys and "observation" in observation_keys: 
return get_mixed_deterministic_actor( @@ -297,6 +402,56 @@ def get_vec_deterministic_actor( return actor +def get_img_only_det_actor( + img_in_keys, + action_spec, + num_cells=[256, 256], + activation_class=nn.ReLU, + normalization="None", + dropout=0.0, +): + normalization = get_normalization(normalization) + # image encoder + cnn = ConvNet( + activation_class=activation_class, + num_cells=[32, 64, 64], + kernel_sizes=[8, 4, 3], + strides=[4, 2, 1], + ) + cnn_output = cnn(torch.ones((3, 100, 100))) + mlp = MLP( + in_features=cnn_output.shape[-1], + activation_class=activation_class, + out_features=128, + num_cells=[256], + ) + image_encoder = SafeModule( + torch.nn.Sequential(cnn, mlp), + in_keys=[img_in_keys], + out_keys=["pixel_embedding"], + ) + + + # output head + mlp = MLP( + activation_class=torch.nn.ReLU, + out_features=action_spec.shape[-1], + num_cells=num_cells, + norm_class=normalization, + norm_kwargs={"normalized_shape": num_cells[-1]} if normalization else None, + dropout=dropout, + ) + combined = SafeModule(mlp, ["pixel_embedding"], out_keys=["param"]) + out_module = TanhModule( + in_keys=["param"], + out_keys=["action"], + spec=action_spec, + ) + return SafeSequential( + image_encoder, + combined, + out_module, + ) def get_mixed_deterministic_actor( vec_in_keys, @@ -315,7 +470,7 @@ def get_mixed_deterministic_actor( kernel_sizes=[8, 4, 3], strides=[4, 2, 1], ) - cnn_output = cnn(torch.ones((3, 64, 64))) + cnn_output = cnn(torch.ones((3, 100, 100))) mlp = MLP( in_features=cnn_output.shape[-1], activation_class=activation_class, @@ -371,6 +526,15 @@ def get_stochastic_actor(observation_keys, action_spec, agent_config): dropout=agent_config.dropout, activation_class=nn.ReLU, ) + elif "pixels" in observation_keys and not "observation" in observation_keys: + return get_img_only_stochastic_actor( + img_in_keys="pixels", + action_spec=action_spec, + num_cells=[agent_config.num_cells, agent_config.num_cells], + normalization=agent_config.normalization, + dropout=agent_config.dropout, + activation_class=nn.ReLU, + ) elif "pixels" in observation_keys and "observation" in observation_keys: return get_mixed_stochastic_actor( action_spec, @@ -437,6 +601,85 @@ def get_vec_stochastic_actor( ) return actor +def get_img_only_stochastic_actor( + action_spec, + img_in_keys, + num_cells=[256, 256], + normalization="None", + dropout=0.0, + activation_class=nn.ReLU, +): + + normalization = get_normalization(normalization) + # image encoder + cnn = ConvNet( + activation_class=activation_class, + num_cells=[32, 64, 64], + kernel_sizes=[8, 4, 3], + strides=[4, 2, 1], + ) + cnn_output = cnn(torch.ones((3, 100, 100))) + mlp = MLP( + in_features=cnn_output.shape[-1], + activation_class=activation_class, + out_features=128, + num_cells=[256], + ) + image_encoder = SafeModule( + torch.nn.Sequential(cnn, mlp), + in_keys=[img_in_keys], + out_keys=["pixel_embedding"], + ) + + # output head + mlp = MLP( + activation_class=torch.nn.ReLU, + out_features=2 * action_spec.shape[-1], + num_cells=num_cells, + norm_class=normalization, + norm_kwargs={"normalized_shape": num_cells[-1]} if normalization else None, + dropout=dropout, + ) + actor_module = SafeModule( + mlp, + in_keys=["pixel_embedding"], + out_keys=["params"], + ) + actor_extractor = NormalParamExtractor( + scale_mapping=f"biased_softplus_{1.0}", + scale_lb=0.1, + ) + + extractor_module = SafeModule( + actor_extractor, + in_keys=["params"], + out_keys=[ + "loc", + "scale", + ], + ) + actor_net_combined = SafeSequential( + image_encoder, actor_module, 
extractor_module + ) + + dist_class = TanhNormal + dist_kwargs = { + "min": action_spec.space.low, + "max": action_spec.space.high, + "tanh_loc": False, + } + actor = ProbabilisticActor( + spec=action_spec, + in_keys=["loc", "scale"], + out_keys=["action"], + module=actor_net_combined, + distribution_class=dist_class, + distribution_kwargs=dist_kwargs, + default_interaction_mode="random", + return_log_prob=False, + ) + return actor + def get_mixed_stochastic_actor( action_spec, @@ -456,7 +699,7 @@ def get_mixed_stochastic_actor( kernel_sizes=[8, 4, 3], strides=[4, 2, 1], ) - cnn_output = cnn(torch.ones((3, 64, 64))) + cnn_output = cnn(torch.ones((3, 100, 100))) mlp = MLP( in_features=cnn_output.shape[-1], activation_class=activation_class, diff --git a/src/utils.py b/src/utils.py index 3ea4c73..a6a63f8 100644 --- a/src/utils.py +++ b/src/utils.py @@ -6,7 +6,7 @@ from environments import ALL_2WHEELER_ENVS, ALL_ROBOARM_ENVS, ALL_WALKER_ENVS from moviepy.editor import concatenate_videoclips, ImageClip from omegaconf import DictConfig -from tensordict import TensorDictBase +from tensordict import TensorDict, TensorDictBase from torchrl.envs.utils import step_mdp from tqdm import tqdm @@ -49,7 +49,11 @@ def logout(agent): x = input("Do you want to save the replay buffer? (y/n)") if x == "y": save_name = input("Enter the name of the file to save: ") - agent.replay_buffer.dump(save_name) + # agent.replay_buffer.dump(save_name) + batched_data = agent.replay_buffer.storage._storage[ + : agent.replay_buffer.__len__() + ] + batched_data.save(save_name, copy_existing=True) def login(agent): @@ -82,8 +86,7 @@ def prefill_buffer(env, agent, num_episodes=10, stop_on_done=False): inpt = input("Press Enter to start prefilling episode: ") for e in tqdm(range(num_episodes), desc="Prefilling buffer"): print("Prefill episode: ", e) - - td = env.reset() + td = env.reset(env.get_reset_tensordict()) done = False truncated = False while not done and not truncated: From 7e0b3cef83c56d75b0833484e35228303709de23 Mon Sep 17 00:00:00 2001 From: BY571 Date: Tue, 17 Sep 2024 09:28:38 +0200 Subject: [PATCH 20/53] set bc default false --- conf/agent/td3.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/agent/td3.yaml b/conf/agent/td3.yaml index 3d9f440..f341f50 100644 --- a/conf/agent/td3.yaml +++ b/conf/agent/td3.yaml @@ -16,5 +16,5 @@ dropout: 0.0 prb: 0 buffer_size: 1000000 reset_params: False -use_bc: True +use_bc: False alpha: 1.0 \ No newline at end of file From 19568b5ca7adea3ace0923aa6617fb29ce389081 Mon Sep 17 00:00:00 2001 From: BY571 Date: Tue, 17 Sep 2024 09:33:41 +0200 Subject: [PATCH 21/53] update bc agent buffer loading --- src/agents/behavior_cloning.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/agents/behavior_cloning.py b/src/agents/behavior_cloning.py index 0c0d312..8be954e 100644 --- a/src/agents/behavior_cloning.py +++ b/src/agents/behavior_cloning.py @@ -74,7 +74,6 @@ def load_model(self, path): def load_replaybuffer(self, path): """load replay buffer""" try: - # self.replay_buffer.load(path) loaded_data = TensorDictBase.load_memmap(path) self.replay_buffer.extend(loaded_data) if self.replay_buffer._batch_size != self.batch_size: @@ -118,7 +117,8 @@ def create_replay_buffer( batch_size=batch_size, ) replay_buffer.append_transform(lambda x: x.to(device)) - replay_buffer.append_transform(ToTensorImage(from_int=True, shape_tolerant=True)) + # TODO: check if we have image in observation space if so add this transform + 
#replay_buffer.append_transform(ToTensorImage(from_int=True, shape_tolerant=True)) return replay_buffer From 5a65efc71fcc2377091babe7cfc40b662240d0e2 Mon Sep 17 00:00:00 2001 From: BY571 Date: Tue, 17 Sep 2024 09:38:23 +0200 Subject: [PATCH 22/53] update agents buffer loading --- src/agents/behavior_cloning.py | 3 +-- src/agents/cql.py | 10 ++++++---- src/agents/iql.py | 13 +++---------- src/agents/sac.py | 10 ++++++---- src/agents/td3.py | 8 +++----- 5 files changed, 19 insertions(+), 25 deletions(-) diff --git a/src/agents/behavior_cloning.py b/src/agents/behavior_cloning.py index 8be954e..1fb3e5f 100644 --- a/src/agents/behavior_cloning.py +++ b/src/agents/behavior_cloning.py @@ -99,7 +99,6 @@ def get_eval_action(self, td: TensorDictBase) -> TensorDictBase: def create_replay_buffer( self, - batch_size=256, buffer_size=1000000, buffer_scratch_dir="./tmp", device="cpu", @@ -114,7 +113,7 @@ def create_replay_buffer( buffer_size, scratch_dir=buffer_scratch_dir, ), - batch_size=batch_size, + batch_size=self.batch_size, ) replay_buffer.append_transform(lambda x: x.to(device)) # TODO: check if we have image in observation space if so add this transform diff --git a/src/agents/cql.py b/src/agents/cql.py index c99a2e8..f3ebd26 100644 --- a/src/agents/cql.py +++ b/src/agents/cql.py @@ -55,8 +55,8 @@ def __init__(self, state_spec, action_spec, agent_config, device="cpu"): self.reset_params = agent_config.reset_params # Define Replay Buffer + self.batch_size = agent_config.batch_size self.replay_buffer = self.create_replay_buffer( - batch_size=agent_config.batch_size, prb=agent_config.prb, buffer_size=agent_config.buffer_size, device=device, @@ -111,7 +111,8 @@ def load_model(self, path): def load_replaybuffer(self, path): """load replay buffer""" try: - self.replay_buffer.load(path) + loaded_data = TensorDictBase.load_memmap(path) + self.replay_buffer.extend(loaded_data) if self.replay_buffer._batch_size != self.batch_size: Warning( "Batch size of the loaded replay buffer is different from the agent's config batch size! Rewriting the batch size to match the agent's config batch size." 
@@ -146,7 +147,6 @@ def td_preprocessing(self, td: TensorDictBase) -> TensorDictBase: def create_replay_buffer( self, - batch_size=256, prb=False, buffer_size=100000, buffer_scratch_dir=None, @@ -173,9 +173,11 @@ def create_replay_buffer( buffer_size, scratch_dir=buffer_scratch_dir, ), - batch_size=batch_size, + batch_size=self.batch_size, ) replay_buffer.append_transform(lambda x: x.to(device)) + # TODO: check if we have image in observation space if so add this transform + # replay_buffer.append_transform(ToTensorImage(from_int=True, shape_tolerant=True)) return replay_buffer @torch.no_grad() diff --git a/src/agents/iql.py b/src/agents/iql.py index ed7f7a9..c90f34b 100644 --- a/src/agents/iql.py +++ b/src/agents/iql.py @@ -114,7 +114,6 @@ def load_model(self, path): def load_replaybuffer(self, path): """load replay buffer""" try: - # self.replay_buffer.load(path) loaded_data = TensorDictBase.load_memmap(path) self.replay_buffer.extend(loaded_data) if self.replay_buffer._batch_size != self.batch_size: @@ -152,7 +151,6 @@ def td_preprocessing(self, td: TensorDictBase) -> TensorDictBase: def create_replay_buffer( self, - batch_size=256, prb=False, buffer_size=100000, buffer_scratch_dir=None, @@ -180,16 +178,11 @@ def create_replay_buffer( buffer_size, scratch_dir=buffer_scratch_dir, ), - batch_size=batch_size, + batch_size=self.batch_size, ) replay_buffer.append_transform(lambda x: x.to(device)) - replay_buffer.append_transform( - ToTensorImage( - from_int=True, - shape_tolerant=True, - in_keys=["pixels", ("next", "pixels")], - ) - ) + # TODO: check if we have image in observation space if so add this transform + # replay_buffer.append_transform(ToTensorImage(from_int=True, shape_tolerant=True)) return replay_buffer diff --git a/src/agents/sac.py b/src/agents/sac.py index e36d327..001a3d5 100644 --- a/src/agents/sac.py +++ b/src/agents/sac.py @@ -54,7 +54,6 @@ def __init__(self, state_spec, action_spec, agent_config, device="cpu"): # Define Replay Buffer self.buffer_batch_size = agent_config.batch_size self.replay_buffer = self.create_replay_buffer( - batch_size=self.batch_size, prb=agent_config.prb, buffer_size=agent_config.buffer_size, buffer_scratch_dir="/tmp", @@ -101,7 +100,8 @@ def load_model(self, path): def load_replaybuffer(self, path): """load replay buffer""" try: - self.replay_buffer.load(path) + loaded_data = TensorDictBase.load_memmap(path) + self.replay_buffer.extend(loaded_data) if self.replay_buffer._batch_size != self.batch_size: Warning( "Batch size of the loaded replay buffer is different from the agent's config batch size! Rewriting the batch size to match the agent's config batch size." 
@@ -136,7 +136,6 @@ def td_preprocessing(self, td: TensorDictBase) -> TensorDictBase: def create_replay_buffer( self, - batch_size=256, prb=False, buffer_size=100000, buffer_scratch_dir=".", @@ -163,9 +162,12 @@ def create_replay_buffer( buffer_size, scratch_dir=buffer_scratch_dir, ), - batch_size=batch_size, + batch_size=self.batch_size, ) replay_buffer.append_transform(lambda x: x.to(device)) + # TODO: check if we have image in observation space if so add this transform + # replay_buffer.append_transform(ToTensorImage(from_int=True, shape_tolerant=True)) + return replay_buffer @torch.no_grad() diff --git a/src/agents/td3.py b/src/agents/td3.py index 5c4c024..450dada 100644 --- a/src/agents/td3.py +++ b/src/agents/td3.py @@ -5,13 +5,13 @@ from torch import nn, optim from torchrl.data import TensorDictPrioritizedReplayBuffer, TensorDictReplayBuffer from torchrl.data.replay_buffers.storages import LazyMemmapStorage, LazyTensorStorage +from torchrl.envs.transforms import ToTensorImage from torchrl.envs.utils import ExplorationType, set_exploration_type from torchrl.modules import AdditiveGaussianWrapper from torchrl.objectives import SoftUpdate from torchrl.objectives.td3 import TD3Loss from torchrl.objectives.td3_bc import TD3BCLoss -from torchrl.envs.transforms import ToTensorImage from src.agents.base import BaseAgent from src.networks.networks import get_critic, get_deterministic_actor @@ -83,7 +83,6 @@ def __init__(self, state_spec, action_spec, agent_config, device="cpu"): self.batch_size = agent_config.batch_size # Define Replay Buffer self.replay_buffer = self.create_replay_buffer( - batch_size=self.batch_size, prb=agent_config.prb, buffer_size=agent_config.buffer_size, device=device, @@ -130,7 +129,6 @@ def load_model(self, path): def load_replaybuffer(self, path): """load replay buffer""" try: - #self.replay_buffer.load(path) loaded_data = TensorDictBase.load_memmap(path) self.replay_buffer.extend(loaded_data) if self.replay_buffer._batch_size != self.batch_size: @@ -153,7 +151,6 @@ def reset_networks(self): def create_replay_buffer( self, - batch_size=256, prb=False, buffer_size=100000, buffer_scratch_dir=None, @@ -180,9 +177,10 @@ def create_replay_buffer( buffer_size, scratch_dir=buffer_scratch_dir, ), - batch_size=batch_size, + batch_size=self.batch_size, ) replay_buffer.append_transform(lambda x: x.to(device)) + # TODO: check if we have image in observation space if so add this transform # replay_buffer.append_transform( # ToTensorImage( # from_int=True, From 021ae45a44c3b625f45f30b148c5166333e521e4 Mon Sep 17 00:00:00 2001 From: BY571 Date: Tue, 17 Sep 2024 11:10:02 +0200 Subject: [PATCH 23/53] update readme with dataset info --- README.md | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/README.md b/README.md index 3426489..b614093 100644 --- a/README.md +++ b/README.md @@ -158,5 +158,38 @@ Evaluation videos of the trained agents can be found [here](https://sites.google +### Offline RL +
+  Click me
+Using the precollected [offline datasets]() we can pretrain agents with offline RL to perform a task without any real-world interaction. Such pretrained policies can be evaluated directly or used as a starting point for fine-tuning on the real robot.
+
+#### Pretrain an Agent
+
+Running an offline-training experiment works just like online training, except that you run the **pretrain.py** script:
+
+  ```bash
+  python experiments/walker/pretrain.py
+  ```
+
+Trained policies can then be evaluated as before with:
+
+  ```bash
+  python experiments/walker/eval.py
+  ```
+
+Or run training to fine-tune the pretrained policy on the real robot:
+
+  ```bash
+  python experiments/walker/train.py
+  ```
+#### Datasets
+The datasets can be downloaded from Hugging Face and contain expert and random transitions for the 2Wheeler (RunAway-v0 and Spinning-v0), Walker (Walker-v0) and RoboArm (RoboArm-v0) robots:
+  ```bash
+  git lfs install
+  git clone git@hf.co:datasets/Sebasdi/BricksRL-Datasets
+  ```
+
+
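The datasets are stored as memmapped TensorDicts, which is what the updated `load_replaybuffer` methods earlier in this series consume. A minimal sketch of loading one of them into a replay buffer is shown below; it is not part of the patches, the dataset path is a placeholder, and the buffer size and batch size are example values only.

```python
# Sketch only: mirrors the load_replaybuffer() pattern used by the agents in this
# series (TensorDictBase.load_memmap + replay_buffer.extend). Path is hypothetical.
from tensordict import TensorDictBase
from torchrl.data import TensorDictReplayBuffer
from torchrl.data.replay_buffers.storages import LazyMemmapStorage

# Build a memmap-backed replay buffer (sizes are example values).
replay_buffer = TensorDictReplayBuffer(
    storage=LazyMemmapStorage(1_000_000, scratch_dir="./tmp"),
    batch_size=256,
)

# Load a cloned dataset folder as a memmapped TensorDict and append it.
loaded_data = TensorDictBase.load_memmap("BricksRL-Datasets/roboarm-v0/expert")
replay_buffer.extend(loaded_data)

batch = replay_buffer.sample()  # TensorDict batch ready for offline updates
```

If the loaded buffer's batch size differs from the agent config, the agents in this series simply overwrite it to match, as the warning in `load_replaybuffer` indicates.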
From 57ccf2726a807019e73e2eeb8c8ea6e36af848c9 Mon Sep 17 00:00:00 2001 From: BY571 Date: Tue, 17 Sep 2024 11:24:11 +0200 Subject: [PATCH 24/53] update roboarm envs with pretrain flag --- environments/__init__.py | 19 +++++++++++++++---- environments/base/base_env.py | 14 +++++++++----- .../roboarm_mixed_v0/RoboArmMixedEnv.py | 6 +++++- .../RoboArmPickPlaceEnv.py | 8 ++++++-- environments/roboarm_v0/RoboArmEnv.py | 6 +++++- environments/roboarm_v0/RoboArmSim.py | 6 +++++- 6 files changed, 45 insertions(+), 14 deletions(-) diff --git a/environments/__init__.py b/environments/__init__.py index fcde699..9c5857b 100644 --- a/environments/__init__.py +++ b/environments/__init__.py @@ -36,7 +36,7 @@ ALL_ENVS = ALL_2WHEELER_ENVS + ALL_WALKER_ENVS + ALL_ROBOARM_ENVS -def make_env(config): +def make_env(config, pretrain=False): """ Creates a new environment based on the provided configuration. @@ -46,7 +46,7 @@ def make_env(config): Returns: A tuple containing the new environment, its action space, and its state space. """ - env = make(name=config.env.name, env_conf=config.env) + env = make(name=config.env.name, env_conf=config.env, pretain=pretrain) observation_keys = [key for key in env.observation_spec.keys()] transforms = [] @@ -84,7 +84,10 @@ def make_env(config): download=True, size=100, model_name="resnet50", - tensor_pixels_keys=["pixels", ("next", "pixels")], # Does not seem to work + tensor_pixels_keys=[ + "pixels", + ("next", "pixels"), + ], # Does not seem to work ) ) @@ -96,24 +99,27 @@ def make_env(config): return env, action_spec, state_spec -def make(name="RunAway", env_conf=None): +def make(name="RunAway", env_conf=None, pretain=False): if name == "runaway-v0": return RunAwayEnv_v0( max_episode_steps=env_conf.max_episode_steps, min_distance=env_conf.min_distance, verbose=env_conf.verbose, + pretain=pretain, ) elif name == "spinning-v0": return SpinningEnv_v0( max_episode_steps=env_conf.max_episode_steps, sleep_time=env_conf.sleep_time, verbose=env_conf.verbose, + pretain=pretain, ) elif name == "walker-v0": return WalkerEnv_v0( max_episode_steps=env_conf.max_episode_steps, verbose=env_conf.verbose, sleep_time=env_conf.sleep_time, + pretain=pretain, ) elif name == "walker_sim-v0": return WalkerEnvSim_v0( @@ -122,6 +128,7 @@ def make(name="RunAway", env_conf=None): low_action_angle=env_conf.low_action_angle, high_action_angle=env_conf.high_action_angle, verbose=env_conf.verbose, + pretain=pretain, ) elif name == "roboarm-v0": return RoboArmEnv_v0( @@ -129,6 +136,7 @@ def make(name="RunAway", env_conf=None): verbose=env_conf.verbose, sleep_time=env_conf.sleep_time, reward_signal=env_conf.reward_signal, + pretain=pretain, ) elif name == "roboarm_sim-v0": return RoboArmSimEnv_v0( @@ -136,6 +144,7 @@ def make(name="RunAway", env_conf=None): verbose=env_conf.verbose, noise=env_conf.noise, reward_signal=env_conf.reward_signal, + pretain=pretain, ) elif name == "roboarm_mixed-v0": return RoboArmMixedEnv_v0( @@ -145,6 +154,7 @@ def make(name="RunAway", env_conf=None): reward_signal=env_conf.reward_signal, camera_id=env_conf.camera_id, goal_radius=env_conf.goal_radius, + pretain=pretain, ) elif name == "roboarm_pickplace-v0": return RoboArmPickPlaceEnv_v0( @@ -156,6 +166,7 @@ def make(name="RunAway", env_conf=None): image_size=env_conf.image_size, target_image_path=env_conf.target_image_path, use_vip_reward=env_conf.use_vip_reward, + pretain=pretain, ) else: print("Environment not found") diff --git a/environments/base/base_env.py b/environments/base/base_env.py index 3861f58..4507789 100644 
--- a/environments/base/base_env.py +++ b/environments/base/base_env.py @@ -22,6 +22,7 @@ def __init__( self, action_dim: int, state_dim: int, + use_hub: bool = True, verbose: bool = False, ): self.verbose = verbose @@ -36,11 +37,14 @@ def __init__( # buffer state in case of missing data self.buffered_state = np.zeros(self.state_dim, dtype=np.float32) - self.hub = PybricksHub( - state_dim=state_dim, out_format_str=self.state_format_str - ) - self.hub.connect() - print("Connected to hub.") + if use_hub: + self.hub = PybricksHub( + state_dim=state_dim, out_format_str=self.state_format_str + ) + self.hub.connect() + print("Connected to hub.") + else: + self.hub = None super().__init__(batch_size=torch.Size([1])) def send_to_hub(self, action: np.array) -> None: diff --git a/environments/roboarm_mixed_v0/RoboArmMixedEnv.py b/environments/roboarm_mixed_v0/RoboArmMixedEnv.py index 7c760c3..702ce35 100644 --- a/environments/roboarm_mixed_v0/RoboArmMixedEnv.py +++ b/environments/roboarm_mixed_v0/RoboArmMixedEnv.py @@ -68,6 +68,7 @@ def __init__( max_episode_steps: int = 50, sleep_time: float = 0.0, verbose: bool = False, + pretrain: bool = False, reward_signal: str = "dense", camera_id: int = 0, goal_radius: float = 25, @@ -131,7 +132,10 @@ def __init__( self.goal_positions = self.init_camera_position() super().__init__( - action_dim=self.action_dim, state_dim=self.state_dim, verbose=verbose + action_dim=self.action_dim, + state_dim=self.state_dim, + verbose=verbose, + use_hub=1 - pretrain, ) def init_camera_position( diff --git a/environments/roboarm_pickplace_v0/RoboArmPickPlaceEnv.py b/environments/roboarm_pickplace_v0/RoboArmPickPlaceEnv.py index 948adf4..7650d22 100644 --- a/environments/roboarm_pickplace_v0/RoboArmPickPlaceEnv.py +++ b/environments/roboarm_pickplace_v0/RoboArmPickPlaceEnv.py @@ -35,6 +35,7 @@ def __init__( max_episode_steps: int = 50, sleep_time: float = 0.0, verbose: bool = False, + pretrain: bool = False, reward_signal: str = "dense", camera_id: int = 0, image_size: Tuple[int, int] = (64, 64), @@ -123,7 +124,10 @@ def __init__( _ = self.init_camera_position() super().__init__( - action_dim=self.action_dim, state_dim=self.state_dim, verbose=verbose + action_dim=self.action_dim, + state_dim=self.state_dim, + verbose=verbose, + use_hub=1 - pretrain, ) def init_camera_position( @@ -158,7 +162,7 @@ def get_reset_tensordict(self, **kwargs) -> TensorDictBase: 1, ], ) - return TensorDict({},batch_size=[1]) + return TensorDict({}, batch_size=[1]) def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: """ diff --git a/environments/roboarm_v0/RoboArmEnv.py b/environments/roboarm_v0/RoboArmEnv.py index cd8028a..ea8b176 100644 --- a/environments/roboarm_v0/RoboArmEnv.py +++ b/environments/roboarm_v0/RoboArmEnv.py @@ -32,6 +32,7 @@ def __init__( max_episode_steps: int = 50, sleep_time: float = 0.0, verbose: bool = False, + pretrain: bool = False, reward_signal: str = "dense", ): self.sleep_time = sleep_time @@ -77,7 +78,10 @@ def __init__( self.observation_spec.set(self.observation_key, observation_spec) self.observation_spec.set(self.goal_observation_key, observation_spec) super().__init__( - action_dim=self.action_dim, state_dim=self.state_dim, verbose=verbose + action_dim=self.action_dim, + state_dim=self.state_dim, + verbose=verbose, + use_hub=1 - pretrain, ) def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: diff --git a/environments/roboarm_v0/RoboArmSim.py b/environments/roboarm_v0/RoboArmSim.py index d0c3ee7..41e72ac 100644 --- 
a/environments/roboarm_v0/RoboArmSim.py +++ b/environments/roboarm_v0/RoboArmSim.py @@ -30,6 +30,7 @@ def __init__( max_episode_steps: int = 50, noise: float = 0.1, verbose: bool = False, + pretrain: bool = False, reward_signal: str = "dense", ): self.noise = noise @@ -77,7 +78,10 @@ def __init__( self.observation_spec.set(self.observation_key, observation_spec) self.observation_spec.set(self.goal_observation_key, observation_spec) super().__init__( - action_dim=self.action_dim, state_dim=self.state_dim, verbose=verbose + action_dim=self.action_dim, + state_dim=self.state_dim, + verbose=verbose, + use_hub=False, ) def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: From bc4e53e8b501263c6bdb244171b7c8f6c61abb10 Mon Sep 17 00:00:00 2001 From: BY571 Date: Tue, 17 Sep 2024 11:25:00 +0200 Subject: [PATCH 25/53] Update pretrain script with pretrain flag --- environments/runaway_v0/RunAwayEnv.py | 6 ++- environments/spinning_v0/SpinningEnv.py | 6 ++- environments/walker_v0/WalkerEnv.py | 6 ++- environments/walker_v0/WalkerEnvSim.py | 6 ++- experiments/2wheeler/pretrain.py | 61 +++++++++++++++++++++++++ experiments/roboarm/pretrain.py | 2 +- experiments/walker/pretrain.py | 2 +- 7 files changed, 83 insertions(+), 6 deletions(-) create mode 100644 experiments/2wheeler/pretrain.py diff --git a/environments/runaway_v0/RunAwayEnv.py b/environments/runaway_v0/RunAwayEnv.py index 89297ed..7adb5ba 100644 --- a/environments/runaway_v0/RunAwayEnv.py +++ b/environments/runaway_v0/RunAwayEnv.py @@ -45,6 +45,7 @@ def __init__( min_distance: float = 40, sleep_time: float = 0.2, verbose: bool = False, + pretrain: bool = False, ): self.sleep_time = sleep_time self.min_distance = min_distance @@ -81,7 +82,10 @@ def __init__( ) self.verbose = verbose super().__init__( - action_dim=self.action_dim, state_dim=self.state_dim, verbose=verbose + action_dim=self.action_dim, + state_dim=self.state_dim, + verbose=verbose, + use_hub=1 - pretrain, ) def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: diff --git a/environments/spinning_v0/SpinningEnv.py b/environments/spinning_v0/SpinningEnv.py index 84a86fe..eab608b 100644 --- a/environments/spinning_v0/SpinningEnv.py +++ b/environments/spinning_v0/SpinningEnv.py @@ -39,6 +39,7 @@ def __init__( max_episode_steps: int = 50, sleep_time: float = 0.2, verbose: bool = False, + pretrain: bool = False, ): self.sleep_time = sleep_time self._batch_size = torch.Size([1]) @@ -74,7 +75,10 @@ def __init__( ) super().__init__( - action_dim=self.action_dim, state_dim=self.state_dim, verbose=verbose + action_dim=self.action_dim, + state_dim=self.state_dim, + verbose=verbose, + use_hub=1 - pretrain, ) def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: diff --git a/environments/walker_v0/WalkerEnv.py b/environments/walker_v0/WalkerEnv.py index 6371cfb..9d8bdb9 100644 --- a/environments/walker_v0/WalkerEnv.py +++ b/environments/walker_v0/WalkerEnv.py @@ -45,6 +45,7 @@ def __init__( max_episode_steps: int = 50, sleep_time: float = 0.0, verbose: bool = False, + pretrain: bool = False, ): self.sleep_time = sleep_time self._batch_size = torch.Size([1]) @@ -83,7 +84,10 @@ def __init__( {self.observation_key: observation_spec}, shape=(1,) ) super().__init__( - action_dim=self.action_dim, state_dim=self.state_dim, verbose=verbose + action_dim=self.action_dim, + state_dim=self.state_dim, + verbose=verbose, + use_hub=1 - pretrain, ) def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: diff --git 
a/environments/walker_v0/WalkerEnvSim.py b/environments/walker_v0/WalkerEnvSim.py index 2ab17a2..b9f76f2 100644 --- a/environments/walker_v0/WalkerEnvSim.py +++ b/environments/walker_v0/WalkerEnvSim.py @@ -35,6 +35,7 @@ def __init__( low_action_angle: int = -100, high_action_angle: int = 0, verbose: bool = False, + pretrain: bool = False, ): self._batch_size = torch.Size([1]) self.max_episode_steps = max_episode_steps @@ -74,7 +75,10 @@ def __init__( {self.observation_key: observation_spec}, shape=(1,) ) super().__init__( - action_dim=self.action_dim, state_dim=self.state_dim, verbose=verbose + action_dim=self.action_dim, + state_dim=self.state_dim, + verbose=verbose, + use_hub=False, ) def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: diff --git a/experiments/2wheeler/pretrain.py b/experiments/2wheeler/pretrain.py new file mode 100644 index 0000000..0849e94 --- /dev/null +++ b/experiments/2wheeler/pretrain.py @@ -0,0 +1,61 @@ +import os +import sys + +import hydra +import wandb +from omegaconf import DictConfig, OmegaConf +from tqdm import tqdm + +# Add the project root to PYTHONPATH +project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) +if project_root not in sys.path: + sys.path.insert(0, project_root) + +from environments import make_env +from src.agents import get_agent +from src.utils import login, logout, setup_check, tensordict2dict + + +@hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config") +def run(cfg: DictConfig) -> None: + print(OmegaConf.to_yaml(cfg)) + + # make environment. + setup_check(robot="2wheeler", config=cfg) + env, action_space, state_space = make_env(cfg, pretrain=True) + + # make agent + agent, project_name = get_agent(action_space, state_space, cfg) + login(agent) + + # initialize wandb + wandb.init(project=project_name) + wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True) + wandb.watch(agent.actor, log_freq=1) if agent.actor else None + + batch_size = cfg.agent.batch_size + num_updates = cfg.agent.num_updates + train_episodes = cfg.episodes + print("Start training...") + try: + for e in tqdm(range(train_episodes), desc="Training"): + + loss_info = agent.train(batch_size=batch_size, num_updates=num_updates) + + # Metrics Logging + log_dict = { + "epoch": e, + "buffer_size": agent.replay_buffer.__len__(), + } + log_dict.update(tensordict2dict(loss_info)) + wandb.log(log_dict) + + except KeyboardInterrupt: + print("Training interrupted by user.") + + logout(agent) + env.close() + + +if __name__ == "__main__": + run() diff --git a/experiments/roboarm/pretrain.py b/experiments/roboarm/pretrain.py index bd30a2c..5ff54d7 100644 --- a/experiments/roboarm/pretrain.py +++ b/experiments/roboarm/pretrain.py @@ -22,7 +22,7 @@ def run(cfg: DictConfig) -> None: # make environment. setup_check(robot="roboarm", config=cfg) - env, action_space, state_space = make_env(cfg) + env, action_space, state_space = make_env(cfg, pretrain=True) # make agent agent, project_name = get_agent(action_space, state_space, cfg) diff --git a/experiments/walker/pretrain.py b/experiments/walker/pretrain.py index e1c59a1..602d3e6 100644 --- a/experiments/walker/pretrain.py +++ b/experiments/walker/pretrain.py @@ -22,7 +22,7 @@ def run(cfg: DictConfig) -> None: # make environment. 
setup_check(robot="walker", config=cfg) - env, action_space, state_space = make_env(cfg) + env, action_space, state_space = make_env(cfg, pretrain=True) # make agent agent, project_name = get_agent(action_space, state_space, cfg) From 3d802aadc80e96a3a715e3081c651f0043521a1f Mon Sep 17 00:00:00 2001 From: BY571 Date: Fri, 27 Sep 2024 09:37:16 +0200 Subject: [PATCH 26/53] update gitgnore for .pth --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 374c6a1..a4fac52 100644 --- a/.gitignore +++ b/.gitignore @@ -132,4 +132,6 @@ dmypy.json # wandb wandb/ # hydra -outputs/ \ No newline at end of file +outputs/ +# .pth files +*.pth \ No newline at end of file From dd0bc2540cbff32f59d86c4799930afa5d6e21ce Mon Sep 17 00:00:00 2001 From: BY571 Date: Fri, 27 Sep 2024 10:10:43 +0200 Subject: [PATCH 27/53] take off keyboard agent pickplace --- conf/agent/keyboard.yaml | 6 - environments/__init__.py | 17 -- .../RoboArmPickPlaceEnv.py | 247 ------------------ environments/roboarm_pickplace_v0/client.py | 149 ----------- src/agents/keyboard.py | 144 ---------- 5 files changed, 563 deletions(-) delete mode 100644 conf/agent/keyboard.yaml delete mode 100644 environments/roboarm_pickplace_v0/RoboArmPickPlaceEnv.py delete mode 100644 environments/roboarm_pickplace_v0/client.py delete mode 100644 src/agents/keyboard.py diff --git a/conf/agent/keyboard.yaml b/conf/agent/keyboard.yaml deleted file mode 100644 index 39d1ec7..0000000 --- a/conf/agent/keyboard.yaml +++ /dev/null @@ -1,6 +0,0 @@ -name: keyboard - -batch_size: 256 -buffer_size: 1000000 -num_updates: 2500 -prefill_episodes: 0 \ No newline at end of file diff --git a/environments/__init__.py b/environments/__init__.py index 9c5857b..cbdf5f9 100644 --- a/environments/__init__.py +++ b/environments/__init__.py @@ -1,19 +1,15 @@ import numpy as np -import torch from torchrl.envs import ( CatFrames, Compose, - DoubleToFloat, ObservationNorm, PermuteTransform, - RewardSum, ToTensorImage, TransformedEnv, VIPRewardTransform, ) from environments.roboarm_mixed_v0.RoboArmMixedEnv import RoboArmMixedEnv_v0 -from environments.roboarm_pickplace_v0.RoboArmPickPlaceEnv import RoboArmPickPlaceEnv_v0 from environments.roboarm_v0.RoboArmEnv import RoboArmEnv_v0 from environments.roboarm_v0.RoboArmSim import RoboArmSimEnv_v0 from environments.runaway_v0.RunAwayEnv import RunAwayEnv_v0 @@ -31,7 +27,6 @@ "roboarm-v0", "roboarm_mixed-v0", "roboarm_sim-v0", - "roboarm_pickplace-v0", ] ALL_ENVS = ALL_2WHEELER_ENVS + ALL_WALKER_ENVS + ALL_ROBOARM_ENVS @@ -156,17 +151,5 @@ def make(name="RunAway", env_conf=None, pretain=False): goal_radius=env_conf.goal_radius, pretain=pretain, ) - elif name == "roboarm_pickplace-v0": - return RoboArmPickPlaceEnv_v0( - max_episode_steps=env_conf.max_episode_steps, - sleep_time=env_conf.sleep_time, - verbose=env_conf.verbose, - reward_signal=env_conf.reward_signal, - camera_id=env_conf.camera_id, - image_size=env_conf.image_size, - target_image_path=env_conf.target_image_path, - use_vip_reward=env_conf.use_vip_reward, - pretain=pretain, - ) else: print("Environment not found") diff --git a/environments/roboarm_pickplace_v0/RoboArmPickPlaceEnv.py b/environments/roboarm_pickplace_v0/RoboArmPickPlaceEnv.py deleted file mode 100644 index 7650d22..0000000 --- a/environments/roboarm_pickplace_v0/RoboArmPickPlaceEnv.py +++ /dev/null @@ -1,247 +0,0 @@ -import random -import time -from typing import Tuple - -import cv2 - -import numpy as np -import torch - -from environments.base.base_env 
import BaseEnv -from tensordict import TensorDict, TensorDictBase -from torchrl.data.tensor_specs import BoundedTensorSpec, CompositeSpec - - -class RoboArmPickPlaceEnv_v0(BaseEnv): - """ """ - - action_dim = ( - 4 # (grab_motor, high_motor_action, low_motor_action, rotation_motor_action) - ) - - state_dim = 4 # (GM, HM, LM, RM) - - observation_ranges = { - "GM": (-148, -45), # Grab motor range is 0-180 - "HM": (-150, 10), - "LM": (10, 100), - "RM": (-179, 179), # Rotation motor needs to be place in the center - } - observation_key = "observation" - pixels_observation_key = "pixels" - - def __init__( - self, - max_episode_steps: int = 50, - sleep_time: float = 0.0, - verbose: bool = False, - pretrain: bool = False, - reward_signal: str = "dense", - camera_id: int = 0, - image_size: Tuple[int, int] = (64, 64), - human_control: bool = False, - use_vip_reward: bool = False, - target_image_path: str = None, - mixed_observation: bool = True, - ): - self.sleep_time = sleep_time - - assert reward_signal in [ - "dense", - "sparse", - ], "Reward signal must be dense or sparse." - self.reward_signal = reward_signal - self.max_episode_steps = max_episode_steps - self.image_size = image_size - self.human_control = human_control - - self.camera = cv2.VideoCapture(int(camera_id)) - self._batch_size = torch.Size([1]) - - if target_image_path is not None: - target_image = np.load(target_image_path) - else: - target_image = np.load( - "environments/roboarm_pickplace_v0/pickplace_green100_target.npy" - ) - self.target_image = target_image - self.use_vip_reward = use_vip_reward - self.mixed_observation = mixed_observation - - # Define action spec - self.action_spec = BoundedTensorSpec( - low=-1, - high=1, - shape=(1, self.action_dim), - ) - - # Observation 3 motors (HM, LM, RM) - # Define observation spec - bounds = torch.tensor( - [ - self.observation_ranges["GM"], - self.observation_ranges["HM"], - self.observation_ranges["LM"], - self.observation_ranges["RM"], - ] - ) - - low_bounds = bounds[:, 0].unsqueeze(0) - high_bounds = bounds[:, 1].unsqueeze(0) - - observation_spec = BoundedTensorSpec( - low=low_bounds, - high=high_bounds, - dtype=torch.float32, - ) - # get initial observation to define image observation spec - ret, frame = self.camera.read() - if not ret: - raise ValueError("Camera not available.") - resized_frame = cv2.resize(frame, self.image_size) - shape = resized_frame.shape - pixels_observation_spec = BoundedTensorSpec( - low=torch.zeros((1,) + shape, dtype=torch.int64), - high=torch.ones((1,) + shape, dtype=torch.int64) * 255, - dtype=torch.int64, - ) - if self.mixed_observation: - self.observation_spec = CompositeSpec( - { - self.observation_key: observation_spec, - self.pixels_observation_key: pixels_observation_spec, - }, - shape=(1,), - ) - else: - self.observation_spec = CompositeSpec( - { - self.pixels_observation_key: pixels_observation_spec, - }, - shape=(1,), - ) - - _ = self.init_camera_position() - - super().__init__( - action_dim=self.action_dim, - state_dim=self.state_dim, - verbose=verbose, - use_hub=1 - pretrain, - ) - - def init_camera_position( - self, - ): - print( - "\nInitializing camera position... \nMake sure the robot is in the center of the frame.\nPlease press 'c' to continue..." - ) - while True: - ret, frame = self.camera.read() - if not ret: - print("Error: Can't receive frame. 
Exiting ...") - break - - cv2.imshow("Init RobotPosition", frame) - - if cv2.waitKey(1) == ord("c"): - break - - return - - def get_reset_tensordict(self, **kwargs) -> TensorDictBase: - """ """ - if self.use_vip_reward: - return TensorDict( - { - "goal_image": torch.from_numpy(self.target_image) - .to(torch.int64) - .unsqueeze(0), - }, - batch_size=[ - 1, - ], - ) - return TensorDict({}, batch_size=[1]) - - def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: - """ - Reset the environment and return the initial state. - - Returns: - TensorDictBase: The initial state of the environment. - """ - # TODO solve this fake action sending before to receive first state - self.episode_step_iter = 0 - action = np.zeros(self.action_dim) - self.send_to_hub(action) - time.sleep(self.sleep_time) - observation = self.read_from_hub() - - ret, frame = self.camera.read() - resized_frame = cv2.resize(frame, self.image_size) - - return TensorDict( - { - self.observation_key: torch.tensor(observation).float(), - self.pixels_observation_key: torch.from_numpy(resized_frame)[ - None, : - ].to(torch.int64), - # "goal_image": torch.from_numpy(self.target_image) - # .to(torch.int64) - # .unsqueeze(0), - }, - batch_size=[1], - ) - - def reward( - self, - frame: np.ndarray, - ) -> Tuple[float, bool]: - """ """ - # TODO: Find a way to classify if cup is in the goal location - done = False - reward = 0.0 - return reward, done - - def _step(self, tensordict: TensorDictBase) -> TensorDictBase: - """ """ - # Send action to hub to receive next state - self.send_to_hub(tensordict.get("action").cpu().numpy().squeeze()) - time.sleep( - self.sleep_time - ) # we need to wait some time for sensors to read and to - - # receive the next state - next_observation = self.read_from_hub() - - # get next frame - ret, frame = self.camera.read() - - cv2.imshow("Camera", frame) - cv2.waitKey(1) - # calc reward and done - reward, done = self.reward( - frame, - ) - resized_frame = cv2.resize(frame, self.image_size) - next_tensordict = TensorDict( - { - self.observation_key: torch.tensor(next_observation).float(), - self.pixels_observation_key: torch.from_numpy(resized_frame)[ - None, : - ].to(torch.int64), - "reward": torch.tensor([reward]).float(), - "done": torch.tensor([done]).bool(), - # "goal_image": torch.from_numpy(self.target_image) - # .to(torch.int64) - # .unsqueeze(0), - }, - batch_size=[1], - ) # .to(tensordict.device) - - # increment episode step counter - self.episode_step_iter += 1 - if self.episode_step_iter >= self.max_episode_steps: - next_tensordict.set("done", torch.tensor([True])) - return next_tensordict diff --git a/environments/roboarm_pickplace_v0/client.py b/environments/roboarm_pickplace_v0/client.py deleted file mode 100644 index e7a6b72..0000000 --- a/environments/roboarm_pickplace_v0/client.py +++ /dev/null @@ -1,149 +0,0 @@ -import umath -import urandom -import ustruct -from micropython import kbd_intr -from pybricks.hubs import InventorHub -from pybricks.parameters import Port -from pybricks.pupdevices import Motor -from pybricks.tools import wait -from uselect import poll - -# Standard MicroPython modules -from usys import stdin, stdout - -kbd_intr(-1) - -hub = InventorHub() - -# Initialize the drive base. 
-# Grab Motor range (130, 179) left side closed (-148, -45) -grab_motor_range = (-148, -45) -grab_motor = Motor(Port.E) -grab_motor.run_target(speed=400, target_angle=-95) # start roughly in the middle -# High Motor range (-150, 30) -high_motor_range = (-150, 10) -high_motor = Motor(Port.A) -high_motor.run_target(speed=400, target_angle=-70) - -# Low motor range (10, 70) -low_motor_range = (10, 100) -low_motor = Motor(Port.D) -low_motor.control.limits(500, 1200, 1000) -low_motor.run_target(speed=400, target_angle=40) - -# Rotation motor range (-360, 360) -# observe as its basically ~ 180 -rotation_motor = Motor(Port.B) - -# color_sensor = ColorSensor(Port.C) -motors = {"GM": grab_motor, "HM": high_motor, "LM": low_motor, "RM": rotation_motor} - - -def get_current_motor_angles(): - angles = {} - for k, v in motors.items(): - angle = normalize_angle(get_angle(v)) - angles.update({k: angle}) - return angles - - -def run_angle(motor, angle, speed=300): - motor.run_angle(speed=speed, rotation_angle=angle, wait=False) - - -def get_angle(motor): - return motor.angle() - - -def normalize_angle(angle, low_angle=-180, high_angle=179, original_one_round=360): - # Normalize angle to be within -179 to 179 degrees - while angle <= low_angle: - angle += original_one_round - while angle > high_angle: - angle -= original_one_round - return angle - - -def transform_range(value, old_min, old_max, new_min, new_max): - """ - Transform a value from one range to another. - - Parameters: - value (float): The value to transform. - old_min (float): The minimum value of the old range. - old_max (float): The maximum value of the old range. - new_min (float): The minimum value of the new range. - new_max (float): The maximum value of the new range. - - Returns: - float: The transformed value. 
- """ - # Compute the scale factor between the old and new ranges - scale = (new_max - new_min) / (old_max - old_min) - # Apply the transformation - return new_min + (value - old_min) * scale - - -keyboard = poll() -keyboard.register(stdin) - -while True: - - while not keyboard.poll(0): - wait(1) - - # Read action values for the motors - data = stdin.buffer.read(16) # Reading 4 bytes (4 floats) - rotation_action, low_action, high_action, grab_action = ustruct.unpack( - "!ffff", data - ) - - # transform action range for motors - grab_action = transform_range(grab_action, -1, 1, -25, 25) - high_action = transform_range(high_action, -1, 1, -60, 60) - low_action = transform_range(low_action, -1, 1, -30, 30) - rotation_action = transform_range(rotation_action, -1, 1, -180, 180) - - angles = get_current_motor_angles() - - if not (angles["GM"] + grab_action > max(grab_motor_range)) and not ( - angles["GM"] + grab_action < min(grab_motor_range) - ): - grab_motor.run_angle(speed=250, rotation_angle=grab_action, wait=False) - - if not (angles["HM"] + high_action > max(high_motor_range)) and not ( - angles["HM"] + high_action < min(high_motor_range) - ): - high_motor.run_angle(speed=250, rotation_angle=high_action, wait=False) - - if not (angles["LM"] + low_action > max(low_motor_range)) and not ( - angles["LM"] + low_action < min(low_motor_range) - ): - low_motor.control.limits(500, 1200, 1000) - low_motor.run_angle(speed=250, rotation_angle=low_action, wait=False) - - # if not (angles["RM"] + rotation_action > 180) or not (angles["RM"] + rotation_action < -180): - rotation_motor.run_angle(speed=250, rotation_angle=rotation_action, wait=False) - - wait(250) - - rotation_angle = rotation_motor.angle() - high_angle = high_motor.angle() - grab_angle = grab_motor.angle() - low_angle = low_motor.angle() - - # sometimes low angle jumps out of range and cant move back this corrects those cases - if low_angle < 10: - low_motor.run_target(speed=200, target_angle=10) - - # GM HM LM RM - out_msg = ustruct.pack( - "!ffff", - grab_angle, - normalize_angle(high_angle), - low_angle, - normalize_angle( - rotation_angle, low_angle=-900, high_angle=900, original_one_round=1800 - ), - ) - stdout.buffer.write(out_msg) diff --git a/src/agents/keyboard.py b/src/agents/keyboard.py deleted file mode 100644 index bf511d4..0000000 --- a/src/agents/keyboard.py +++ /dev/null @@ -1,144 +0,0 @@ -import time - -import tensordict as td -import torch -from pynput import keyboard -from tensordict import TensorDictBase -from torchrl.data import TensorDictReplayBuffer -from torchrl.data.replay_buffers.storages import LazyMemmapStorage - -from src.agents.base import BaseAgent - - -class KeyboardAgent(BaseAgent): - def __init__(self, state_spec, action_spec, agent_config, device="cpu"): - super(KeyboardAgent, self).__init__( - state_spec, action_spec, agent_config.name, device - ) - - # Define the key to action mapping - self.key_action_mapping = { - "a": [0.15, 0, 0, 0], # Rotate motor -30 - "d": [-0.15, 0, 0, 0], # Rotate motor +30 - "s": [0, -0.20, 0, 0], # Low motor -10 - "w": [0, 0.20, 0, 0], # Low motor +10 - "q": [0, 0, -0.25, 0], # High motor -15 - "e": [0, 0, 0.25, 0], # High motor +15 - "f": [0, 0, 0, -0.25], # Grab motor -10 - "g": [0, 0, 0, 0.25], # Grab motor +10 - } - self.current_action = None - self.setup_key_listener() - self.buffer_batch_size = agent_config.batch_size - # Define Replay Buffer - self.replay_buffer = self.create_replay_buffer( - batch_size=self.buffer_batch_size, - prb=False, - 
buffer_size=agent_config.buffer_size, - device=device, - ) - - # general stats - self.collected_transitions = 0 - self.total_updates = 0 - - def setup_key_listener(self): - def on_press(key): - try: - if key.char in self.key_action_mapping: - self.current_action = self.key_action_mapping[key.char] - except AttributeError: - pass - - def on_release(key): - self.current_action = None - - self.listener = keyboard.Listener(on_press=on_press, on_release=on_release) - self.listener.start() - - def load_model(self, path): - """load model""" - try: - statedict = torch.load(path) - self.actor.load_state_dict(statedict["actor"]) - self.critic.load_state_dict(statedict["critic"]) - print("Model loaded") - except: - raise ValueError("Model not loaded") - - def load_replaybuffer(self, path): - """load replay buffer""" - try: - # self.replay_buffer.load(path) - loaded_data = TensorDictBase.load_memmap(path).to_tensordict() - self.replay_buffer.extend(loaded_data) - if self.replay_buffer._batch_size != self.buffer_batch_size: - Warning( - "Batch size of the loaded replay buffer is different from the agent's config batch size! Rewriting the batch size to match the agent's config batch size." - ) - self.replay_buffer._batch_size = self.buffer_batch_size - print("Replay Buffer loaded") - print("Replay Buffer size: ", self.replay_buffer.__len__(), "\n") - except: - raise ValueError("Replay Buffer not loaded") - - def eval(self): - """Sets the agent to evaluation mode.""" - pass - - def create_replay_buffer( - self, - batch_size=256, - prb=False, - buffer_size=100000, - buffer_scratch_dir="./scratch", - device="cpu", - prefetch=3, - ): - """Create replay buffer""" - replay_buffer = TensorDictReplayBuffer( - pin_memory=False, - prefetch=prefetch, - storage=LazyMemmapStorage( - buffer_size, - scratch_dir=buffer_scratch_dir, - ), - batch_size=batch_size, - ) - replay_buffer.append_transform(lambda x: x.to(device)) - - - - return replay_buffer - - @torch.no_grad() - def get_action(self, td: TensorDictBase) -> TensorDictBase: - """Get action from actor network or keyboard""" - while self.current_action is None: - time.sleep(0.01) # Add a small sleep to avoid blocking - td.set("action", torch.tensor(self.current_action).float().unsqueeze(0)) - return td - - @torch.no_grad() - def get_eval_action(self, td: TensorDictBase) -> TensorDictBase: - """Get action from actor network or keyboard""" - while self.current_action is None: - time.sleep(0.01) # Add a small sleep to avoid blocking - td.set("action", torch.tensor(self.current_action).float().unsqueeze(0)) - return td - - def add_experience(self, transition: td.TensorDict): - """Add experience to replay buffer""" - - # transform pixels to int if pixels in observation spec - save_transition = transition.copy() - if "pixels" in transition.keys(): - save_transition.set("pixels", (save_transition.get("pixels")*255).to(torch.int64)) - save_transition.set(("next", "pixels"), (save_transition.get(("next", "pixels"))*255).to(torch.int64)) - - self.replay_buffer.extend(save_transition) - self.collected_transitions += 1 - - def train(self, batch_size=64, num_updates=1): - """Train the agent""" - return {} From 2f56409d21e8f1801191d7061ccbf90268c7c972 Mon Sep 17 00:00:00 2001 From: BY571 Date: Fri, 27 Sep 2024 13:41:26 +0200 Subject: [PATCH 28/53] take off pickplace transform --- environments/__init__.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/environments/__init__.py b/environments/__init__.py index cbdf5f9..d32a59f 100644 --- 
a/environments/__init__.py +++ b/environments/__init__.py @@ -3,10 +3,8 @@ CatFrames, Compose, ObservationNorm, - PermuteTransform, ToTensorImage, TransformedEnv, - VIPRewardTransform, ) from environments.roboarm_mixed_v0.RoboArmMixedEnv import RoboArmMixedEnv_v0 @@ -71,21 +69,6 @@ def make_env(config, pretrain=False): if "pixels" in observation_keys: transforms.append(ToTensorImage(in_keys=["pixels"], from_int=True)) - if config.env.name == "roboarm_pickplace-v0" and config.env.use_vip_reward: - transforms.append(PermuteTransform((-1, -2, -3), in_keys=["pixels"])) - transforms.append( - VIPRewardTransform( - in_keys=["pixels"], - download=True, - size=100, - model_name="resnet50", - tensor_pixels_keys=[ - "pixels", - ("next", "pixels"), - ], # Does not seem to work - ) - ) - env = TransformedEnv(env, Compose(*transforms)) action_spec = env.action_spec From 5047954e45dbfecb30c5049e194b22fa213ed807 Mon Sep 17 00:00:00 2001 From: BY571 Date: Fri, 27 Sep 2024 13:43:06 +0200 Subject: [PATCH 29/53] take off keyboard agent --- src/agents/__init__.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/agents/__init__.py b/src/agents/__init__.py index c251579..add51cd 100644 --- a/src/agents/__init__.py +++ b/src/agents/__init__.py @@ -1,12 +1,11 @@ from src.agents.behavior_cloning import BehavioralCloningAgent from src.agents.cql import CQLAgent from src.agents.iql import IQLAgent -from src.agents.keyboard import KeyboardAgent from src.agents.random import RandomAgent from src.agents.sac import SACAgent from src.agents.td3 import TD3Agent -all_agents = ["td3", "sac", "iql", "cql", "random", "keyboard"] +all_agents = ["td3", "sac", "iql", "cql", "random"] def get_agent(action_spec, state_spec, cfg): @@ -52,13 +51,6 @@ def get_agent(action_spec, state_spec, cfg): agent_config=cfg.agent, device=cfg.device, ) - elif cfg.agent.name == "keyboard": - agent = KeyboardAgent( - action_spec=action_spec, - state_spec=state_spec, - agent_config=cfg.agent, - device=cfg.device, - ) else: raise NotImplementedError( f"Agent {cfg.agent.name} not implemented, please choose from {all_agents}" From 1fc9e83de661ea6ba77e9f713e775e16cd6c3d03 Mon Sep 17 00:00:00 2001 From: BY571 Date: Fri, 27 Sep 2024 14:01:54 +0200 Subject: [PATCH 30/53] pretrain docstring --- environments/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/environments/__init__.py b/environments/__init__.py index d32a59f..7298277 100644 --- a/environments/__init__.py +++ b/environments/__init__.py @@ -35,6 +35,7 @@ def make_env(config, pretrain=False): Args: config: A configuration object containing the environment name and maximum episode steps. + pretrain: A boolean indicating whether the environment is for pretraining. Returns: A tuple containing the new environment, its action space, and its state space. 
@@ -106,7 +107,6 @@ def make(name="RunAway", env_conf=None, pretain=False): low_action_angle=env_conf.low_action_angle, high_action_angle=env_conf.high_action_angle, verbose=env_conf.verbose, - pretain=pretain, ) elif name == "roboarm-v0": return RoboArmEnv_v0( @@ -122,7 +122,6 @@ def make(name="RunAway", env_conf=None, pretain=False): verbose=env_conf.verbose, noise=env_conf.noise, reward_signal=env_conf.reward_signal, - pretain=pretain, ) elif name == "roboarm_mixed-v0": return RoboArmMixedEnv_v0( From d07c5ea0813f0841c8ca529eea2c1a6de79921ce Mon Sep 17 00:00:00 2001 From: BY571 Date: Fri, 27 Sep 2024 14:02:54 +0200 Subject: [PATCH 31/53] update base_env_sim with use_hub --- environments/base/base_env.py | 1 + 1 file changed, 1 insertion(+) diff --git a/environments/base/base_env.py b/environments/base/base_env.py index 4507789..c1652a8 100644 --- a/environments/base/base_env.py +++ b/environments/base/base_env.py @@ -150,6 +150,7 @@ def __init__( action_dim: int, state_dim: int, verbose: bool = False, + use_hub: bool = False, ): self.verbose = verbose self.action_dim = action_dim From 153e00cb4751204ab9f800569f5bc84121c227a1 Mon Sep 17 00:00:00 2001 From: BY571 Date: Fri, 27 Sep 2024 14:03:36 +0200 Subject: [PATCH 32/53] take off pretrain from sim envs --- environments/roboarm_v0/RoboArmSim.py | 1 - environments/walker_v0/WalkerEnvSim.py | 1 - 2 files changed, 2 deletions(-) diff --git a/environments/roboarm_v0/RoboArmSim.py b/environments/roboarm_v0/RoboArmSim.py index 41e72ac..cca28cb 100644 --- a/environments/roboarm_v0/RoboArmSim.py +++ b/environments/roboarm_v0/RoboArmSim.py @@ -30,7 +30,6 @@ def __init__( max_episode_steps: int = 50, noise: float = 0.1, verbose: bool = False, - pretrain: bool = False, reward_signal: str = "dense", ): self.noise = noise diff --git a/environments/walker_v0/WalkerEnvSim.py b/environments/walker_v0/WalkerEnvSim.py index b9f76f2..1a1e8e4 100644 --- a/environments/walker_v0/WalkerEnvSim.py +++ b/environments/walker_v0/WalkerEnvSim.py @@ -35,7 +35,6 @@ def __init__( low_action_angle: int = -100, high_action_angle: int = 0, verbose: bool = False, - pretrain: bool = False, ): self._batch_size = torch.Size([1]) self.max_episode_steps = max_episode_steps From 32add07d085af80ddc065bb0aa9020e196999b04 Mon Sep 17 00:00:00 2001 From: BY571 Date: Fri, 27 Sep 2024 14:03:47 +0200 Subject: [PATCH 33/53] update tests --- tests/test_agents.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_agents.py b/tests/test_agents.py index 0a21c6c..bfc0bd2 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -49,7 +49,7 @@ def test_random_agent(env, device): else: device = "cpu" with initialize(config_path="../conf"): - cfg = compose(config_name="config", overrides=["device=" + device]) + cfg = compose(config_name="config", overrides=["device=" + device, "agent=random"]) # Test data collection env = get_env(env) agent, _ = get_agent(env.action_spec, env.observation_spec, cfg) From f67c052f83a3617da4989e33425c2883a6ba37a0 Mon Sep 17 00:00:00 2001 From: BY571 Date: Fri, 27 Sep 2024 14:59:48 +0200 Subject: [PATCH 34/53] update agent network creation --- environments/base/base_env.py | 9 ----- experiments/roboarm/train.py | 7 ++-- src/agents/base.py | 19 +++++----- src/agents/behavior_cloning.py | 4 +-- src/agents/cql.py | 4 +-- src/agents/gail.py | 0 src/agents/iql.py | 7 ++-- src/agents/sac.py | 4 +-- src/agents/td3.py | 5 ++- src/networks/networks.py | 63 +++++++++++++++++++++++----------- src/utils.py | 2 +- 11 files changed, 67 
insertions(+), 57 deletions(-) create mode 100644 src/agents/gail.py diff --git a/environments/base/base_env.py b/environments/base/base_env.py index c1652a8..3d03b01 100644 --- a/environments/base/base_env.py +++ b/environments/base/base_env.py @@ -125,15 +125,6 @@ def _reset( def _set_seed(self, seed: int): return super()._set_seed(seed) - def get_reset_tensordict(self, **kwargs) -> TensorDictBase: - """ """ - return TensorDict( - { - }, - batch_size=[ - 1, - ], - ) class BaseSimEnv(EnvBase): diff --git a/experiments/roboarm/train.py b/experiments/roboarm/train.py index 5f8901f..64dc6bd 100644 --- a/experiments/roboarm/train.py +++ b/experiments/roboarm/train.py @@ -1,8 +1,7 @@ import os import sys import time -import torch -from tensordict import TensorDict + import hydra import numpy as np import wandb @@ -42,7 +41,7 @@ def run(cfg: DictConfig) -> None: # initialize wandb wandb.init(project=project_name) wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True) - # wandb.watch(agent.actor, log_freq=1) if agent.actor else None + wandb.watch(agent.actor, log_freq=1) if agent.actor else None # prefill buffer with random actions prefill_buffer( @@ -61,7 +60,7 @@ def run(cfg: DictConfig) -> None: quit = False try: for e in tqdm(range(train_episodes), desc="Training"): - td = env.reset(env.get_reset_tensordict()) + td = env.reset() done = td.get("done", False) truncated = td.get("truncated", False) ep_return = 0 diff --git a/src/agents/base.py b/src/agents/base.py index d43b6d9..8605478 100644 --- a/src/agents/base.py +++ b/src/agents/base.py @@ -13,18 +13,17 @@ class BaseAgent: """Implements a base agent used to interact with the lego robots. Args: - state_space (gym.Space): The state space of the environment. - action_space (gym.Space): The action space of the environment. - device (torch.device): The device to use for computation. - observation_keys (Tuple[str]): The keys used to access the observation in the tensor dictionary. + state_spec (TensorSpec): The state specification of the environment. + action_spec (TensorSpec): The action specification of the environment. + agent_name (str): The name of the agent. + device (str): The device to use for computation. Attributes: - state_space (gym.Space): The state space of the environment. - action_space (gym.Space): The action space of the environment. - state_dim (int): The dimension of the state space. - action_dim (int): The dimension of the action space. - device (torch.device): The device to use for computation. - observation_keys (Tuple[str]): The keys used to access the observation in the tensor dictionary. + name (str): The name of the agent. + observation_spec (TensorSpec): The state specification of the environment. + action_spec (TensorSpec): The action specification of the environment. + device (str): The device to use for computation. + observation_keys (List[str]): The keys used to access the observation in the tensor dictionary. 
""" def __init__( diff --git a/src/agents/behavior_cloning.py b/src/agents/behavior_cloning.py index 1fb3e5f..8faf7f4 100644 --- a/src/agents/behavior_cloning.py +++ b/src/agents/behavior_cloning.py @@ -30,11 +30,11 @@ def __init__(self, state_spec, action_spec, agent_config, device="cpu"): if agent_config.policy_type == "deterministic": self.actor = get_deterministic_actor( - self.observation_keys, action_spec, agent_config + state_spec, action_spec, agent_config ) elif agent_config.policy_type == "stochastic": self.actor = get_stochastic_actor( - self.observation_keys, action_spec, agent_config + state_spec, action_spec, agent_config ) else: raise ValueError( diff --git a/src/agents/cql.py b/src/agents/cql.py index f3ebd26..8375035 100644 --- a/src/agents/cql.py +++ b/src/agents/cql.py @@ -22,9 +22,9 @@ def __init__(self, state_spec, action_spec, agent_config, device="cpu"): with_lagrange = agent_config.with_lagrange self.actor = get_stochastic_actor( - self.observation_keys, action_spec, agent_config + state_spec, action_spec, agent_config ) - self.critic = get_critic(self.observation_keys, agent_config) + self.critic = get_critic(state_spec, agent_config) self.actor.to(device) self.critic.to(device) diff --git a/src/agents/gail.py b/src/agents/gail.py new file mode 100644 index 0000000..e69de29 diff --git a/src/agents/iql.py b/src/agents/iql.py index c90f34b..9c7e684 100644 --- a/src/agents/iql.py +++ b/src/agents/iql.py @@ -21,11 +21,11 @@ def __init__(self, state_spec, action_spec, agent_config, device="cpu"): ) self.actor = get_stochastic_actor( - self.observation_keys, action_spec, agent_config + state_spec, action_spec, agent_config ) - self.critic = get_critic(self.observation_keys, agent_config) + self.critic = get_critic(state_spec, agent_config) - self.value = get_value_operator(self.observation_keys, agent_config) + self.value = get_value_operator(state_spec, agent_config) self.actor.to(device) self.critic.to(device) @@ -57,7 +57,6 @@ def __init__(self, state_spec, action_spec, agent_config, device="cpu"): self.batch_size = agent_config.batch_size self.replay_buffer = self.create_replay_buffer( - batch_size=self.batch_size, prb=agent_config.prb, buffer_size=agent_config.buffer_size, device=device, diff --git a/src/agents/sac.py b/src/agents/sac.py index 001a3d5..1e77f11 100644 --- a/src/agents/sac.py +++ b/src/agents/sac.py @@ -20,9 +20,9 @@ def __init__(self, state_spec, action_spec, agent_config, device="cpu"): ) self.actor = get_stochastic_actor( - self.observation_keys, action_spec, agent_config + state_spec, action_spec, agent_config ) - self.critic = get_critic(self.observation_keys, agent_config) + self.critic = get_critic(state_spec, agent_config) self.actor.to(device) self.critic.to(device) diff --git a/src/agents/td3.py b/src/agents/td3.py index 450dada..db43b8e 100644 --- a/src/agents/td3.py +++ b/src/agents/td3.py @@ -5,7 +5,6 @@ from torch import nn, optim from torchrl.data import TensorDictPrioritizedReplayBuffer, TensorDictReplayBuffer from torchrl.data.replay_buffers.storages import LazyMemmapStorage, LazyTensorStorage -from torchrl.envs.transforms import ToTensorImage from torchrl.envs.utils import ExplorationType, set_exploration_type from torchrl.modules import AdditiveGaussianWrapper @@ -33,9 +32,9 @@ def __init__(self, state_spec, action_spec, agent_config, device="cpu"): ) self.actor = get_deterministic_actor( - self.observation_keys, action_spec, agent_config + state_spec, action_spec, agent_config ) - self.critic = get_critic(self.observation_keys, 
agent_config) + self.critic = get_critic(state_spec, agent_config) self.model = nn.ModuleList([self.actor, self.critic]).to(device) diff --git a/src/networks/networks.py b/src/networks/networks.py index 98d74d3..70d24a0 100644 --- a/src/networks/networks.py +++ b/src/networks/networks.py @@ -3,7 +3,6 @@ from tensordict.nn.distributions import NormalParamExtractor from torchrl.modules import ( - AdditiveGaussianWrapper, ConvNet, MLP, ProbabilisticActor, @@ -12,7 +11,7 @@ TanhModule, ValueOperator, ) -from torchrl.modules.distributions import TanhDelta, TanhNormal +from torchrl.modules.distributions import TanhNormal def get_normalization(normalization): @@ -26,7 +25,9 @@ def get_normalization(normalization): raise NotImplementedError(f"Normalization {normalization} not implemented") -def get_critic(observation_keys, agent_config): +def get_critic(observation_spec, agent_config): + observation_keys = [key for key in observation_spec.keys()] + if "observation" in observation_keys and not "pixels" in observation_keys: return get_vec_critic( in_keys=observation_keys, @@ -44,6 +45,7 @@ def get_critic(observation_keys, agent_config): activation_class=nn.ReLU, normalization=agent_config.normalization, dropout=agent_config.dropout, + img_shape=observation_spec["pixels"].shape, ) elif "pixels" in observation_keys and "observation" in observation_keys: @@ -55,12 +57,14 @@ def get_critic(observation_keys, agent_config): activation_class=nn.ReLU, normalization=agent_config.normalization, dropout=agent_config.dropout, + img_shape=observation_spec["pixels"].shape, ) else: raise NotImplementedError("Critic for this observation space not implemented") -def get_value_operator(observation_keys, agent_config): +def get_value_operator(observation_spec, agent_config): + observation_keys = [key for key in observation_spec.keys()] if "observation" in observation_keys and not "pixels" in observation_keys: return get_vec_value( in_keys=observation_keys, @@ -77,7 +81,8 @@ def get_value_operator(observation_keys, agent_config): out_features=1, activation_class=nn.ReLU, normalization=agent_config.normalization, - dropout=agent_config.dropout + dropout=agent_config.dropout, + img_shape=observation_spec["pixels"].shape, ) elif "pixels" in observation_keys and "observation" in observation_keys: return get_mixed_value( @@ -88,6 +93,7 @@ def get_value_operator(observation_keys, agent_config): activation_class=nn.ReLU, normalization=agent_config.normalization, dropout=agent_config.dropout, + img_shape=observation_spec["pixels"].shape, ) @@ -116,6 +122,7 @@ def get_vec_value( ) return qvalue + def get_img_only_value( img_in_keys, num_cells=[256, 256], @@ -123,6 +130,7 @@ def get_img_only_value( activation_class=nn.ReLU, normalization="None", dropout=0.0, + img_shape=(3, 64, 64), ): normalization = get_normalization(normalization) # image encoder @@ -132,7 +140,7 @@ def get_img_only_value( kernel_sizes=[8, 4, 3], strides=[4, 2, 1], ) - cnn_output = cnn(torch.ones((3, 100, 100))) + cnn_output = cnn(torch.ones(img_shape)) mlp = MLP( in_features=cnn_output.shape[-1], activation_class=activation_class, @@ -158,6 +166,7 @@ def get_img_only_value( # model return SafeSequential(image_encoder, v_head) + def get_mixed_value( vec_in_keys, img_in_keys, @@ -166,6 +175,7 @@ def get_mixed_value( activation_class=nn.ReLU, normalization="None", dropout=0.0, + img_shape=(3, 64, 64), ): normalization = get_normalization(normalization) # image encoder @@ -175,7 +185,7 @@ def get_mixed_value( kernel_sizes=[8, 4, 3], strides=[4, 2, 1], ) - 
cnn_output = cnn(torch.ones((3, 100, 100))) + cnn_output = cnn(torch.ones(img_shape)) mlp = MLP( in_features=cnn_output.shape[-1], activation_class=activation_class, @@ -246,6 +256,7 @@ def get_mixed_critic( activation_class=nn.ReLU, normalization="None", dropout=0.0, + img_shape=(3, 64, 64), ): normalization = get_normalization(normalization) # image encoder @@ -255,7 +266,7 @@ def get_mixed_critic( kernel_sizes=[8, 4, 3], strides=[4, 2, 1], ) - cnn_output = cnn(torch.ones((3, 100, 100))) + cnn_output = cnn(torch.ones(img_shape)) mlp = MLP( in_features=cnn_output.shape[-1], activation_class=activation_class, @@ -299,6 +310,7 @@ def get_img_only_critic( activation_class=nn.ReLU, normalization="None", dropout=0.0, + img_shape=(3, 64, 64), ): normalization = get_normalization(normalization) # image encoder @@ -308,7 +320,7 @@ def get_img_only_critic( kernel_sizes=[8, 4, 3], strides=[4, 2, 1], ) - cnn_output = cnn(torch.ones((3, 100, 100))) + cnn_output = cnn(torch.ones(img_shape)) mlp = MLP( in_features=cnn_output.shape[-1], activation_class=activation_class, @@ -335,7 +347,9 @@ def get_img_only_critic( return SafeSequential(image_encoder, v_head) -def get_deterministic_actor(observation_keys, action_spec, agent_config): +def get_deterministic_actor(observation_spec, action_spec, agent_config): + observation_keys = [key for key in observation_spec.keys()] + if "observation" in observation_keys and not "pixels" in observation_keys: return get_vec_deterministic_actor( action_spec=action_spec, @@ -343,13 +357,14 @@ def get_deterministic_actor(observation_keys, action_spec, agent_config): num_cells=[agent_config.num_cells, agent_config.num_cells], activation_class=nn.ReLU, ) - + elif "pixels" in observation_keys and not "observation" in observation_keys: return get_img_only_det_actor( img_in_keys="pixels", action_spec=action_spec, num_cells=[agent_config.num_cells, agent_config.num_cells], activation_class=nn.ReLU, + img_shape=observation_spec["pixels"].shape ) elif "pixels" in observation_keys and "observation" in observation_keys: @@ -359,6 +374,7 @@ def get_deterministic_actor(observation_keys, action_spec, agent_config): action_spec=action_spec, num_cells=[agent_config.num_cells, agent_config.num_cells], activation_class=nn.ReLU, + img_shape=observation_spec["pixels"].shape ) else: raise NotImplementedError("Actor for this observation space not implemented") @@ -402,6 +418,7 @@ def get_vec_deterministic_actor( return actor + def get_img_only_det_actor( img_in_keys, action_spec, @@ -409,6 +426,7 @@ def get_img_only_det_actor( activation_class=nn.ReLU, normalization="None", dropout=0.0, + img_shape=(3, 64, 64), ): normalization = get_normalization(normalization) # image encoder @@ -418,7 +436,7 @@ def get_img_only_det_actor( kernel_sizes=[8, 4, 3], strides=[4, 2, 1], ) - cnn_output = cnn(torch.ones((3, 100, 100))) + cnn_output = cnn(torch.ones(img_shape)) mlp = MLP( in_features=cnn_output.shape[-1], activation_class=activation_class, @@ -431,7 +449,6 @@ def get_img_only_det_actor( out_keys=["pixel_embedding"], ) - # output head mlp = MLP( activation_class=torch.nn.ReLU, @@ -453,6 +470,7 @@ def get_img_only_det_actor( out_module, ) + def get_mixed_deterministic_actor( vec_in_keys, img_in_keys, @@ -461,6 +479,7 @@ def get_mixed_deterministic_actor( activation_class=nn.ReLU, normalization="None", dropout=0.0, + img_shape=(3, 64, 64), ): normalization = get_normalization(normalization) # image encoder @@ -470,7 +489,7 @@ def get_mixed_deterministic_actor( kernel_sizes=[8, 4, 3], strides=[4, 2, 1], 
) - cnn_output = cnn(torch.ones((3, 100, 100))) + cnn_output = cnn(torch.ones(img_shape)) mlp = MLP( in_features=cnn_output.shape[-1], activation_class=activation_class, @@ -516,7 +535,8 @@ def get_mixed_deterministic_actor( ) -def get_stochastic_actor(observation_keys, action_spec, agent_config): +def get_stochastic_actor(observation_spec, action_spec, agent_config): + observation_keys = [key for key in observation_spec.keys()] if "observation" in observation_keys and not "pixels" in observation_keys: return get_vec_stochastic_actor( action_spec, @@ -534,6 +554,7 @@ def get_stochastic_actor(observation_keys, action_spec, agent_config): normalization=agent_config.normalization, dropout=agent_config.dropout, activation_class=nn.ReLU, + img_shape=observation_spec["pixels"].shape, ) elif "pixels" in observation_keys and "observation" in observation_keys: return get_mixed_stochastic_actor( @@ -544,6 +565,7 @@ def get_stochastic_actor(observation_keys, action_spec, agent_config): normalization=agent_config.normalization, dropout=agent_config.dropout, activation_class=nn.ReLU, + img_shape=observation_spec["pixels"].shape, ) else: raise NotImplementedError("Actor for this observation space not implemented") @@ -601,6 +623,7 @@ def get_vec_stochastic_actor( ) return actor + def get_img_only_stochastic_actor( action_spec, img_in_keys, @@ -608,6 +631,7 @@ def get_img_only_stochastic_actor( normalization="None", dropout=0.0, activation_class=nn.ReLU, + img_shape=(3, 64, 64), ): normalization = get_normalization(normalization) @@ -618,7 +642,7 @@ def get_img_only_stochastic_actor( kernel_sizes=[8, 4, 3], strides=[4, 2, 1], ) - cnn_output = cnn(torch.ones((3, 100, 100))) + cnn_output = cnn(torch.ones(img_shape)) mlp = MLP( in_features=cnn_output.shape[-1], activation_class=activation_class, @@ -658,9 +682,7 @@ def get_img_only_stochastic_actor( "scale", ], ) - actor_net_combined = SafeSequential( - image_encoder, actor_module, extractor_module - ) + actor_net_combined = SafeSequential(image_encoder, actor_module, extractor_module) dist_class = TanhNormal dist_kwargs = { @@ -689,6 +711,7 @@ def get_mixed_stochastic_actor( normalization="None", dropout=0.0, activation_class=nn.ReLU, + img_shape=(3, 64, 64), ): normalization = get_normalization(normalization) @@ -699,7 +722,7 @@ def get_mixed_stochastic_actor( kernel_sizes=[8, 4, 3], strides=[4, 2, 1], ) - cnn_output = cnn(torch.ones((3, 100, 100))) + cnn_output = cnn(torch.ones(img_shape)) mlp = MLP( in_features=cnn_output.shape[-1], activation_class=activation_class, diff --git a/src/utils.py b/src/utils.py index a6a63f8..50adae4 100644 --- a/src/utils.py +++ b/src/utils.py @@ -86,7 +86,7 @@ def prefill_buffer(env, agent, num_episodes=10, stop_on_done=False): inpt = input("Press Enter to start prefilling episode: ") for e in tqdm(range(num_episodes), desc="Prefilling buffer"): print("Prefill episode: ", e) - td = env.reset(env.get_reset_tensordict()) + td = env.reset() done = False truncated = False while not done and not truncated: From e1c261bfd26714f372f0376541e6abc84b880bcb Mon Sep 17 00:00:00 2001 From: BY571 Date: Fri, 27 Sep 2024 15:00:12 +0200 Subject: [PATCH 35/53] add mixed obs dummy image shape tests --- tests/test_agents.py | 86 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 84 insertions(+), 2 deletions(-) diff --git a/tests/test_agents.py b/tests/test_agents.py index bfc0bd2..2d6f9dc 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -17,9 +17,9 @@ def collection_round(env, agent, max_steps=1000): td = 
step_mdp(td) -def get_env(env): +def get_env(env, img_shape=(64, 64, 3)): if env == "mixed": - env = MixedObsDummyEnv() + env = MixedObsDummyEnv(img_shape=img_shape) env = TransformedEnv( env, Compose(ToTensorImage(in_keys=["pixels"], from_int=True)) ) @@ -171,3 +171,85 @@ def test_drq_agent(env, device): eval_td2 = agent.get_eval_action(td) assert torch.allclose(eval_td1["action"], eval_td2["action"]) + +@pytest.mark.parametrize( + "env", + ["mixed", "vec", "vec_goal"], +) +@pytest.mark.parametrize( + "device", + ["cpu", "cuda"], +) +def test_iql_agent(env, device): + if torch.cuda.is_available() and device == "cuda": + device = "cuda" + else: + device = "cpu" + with initialize(config_path="../conf"): + cfg = compose( + config_name="config", overrides=["agent=iql", "device=" + device] + ) + + # Test data collection + env = get_env(env) + agent, _ = get_agent(env.action_spec, env.observation_spec, cfg) + collection_round(env, agent, max_steps=10) + # Test training + agent.train(batch_size=1, num_updates=1) + + # Test evaluation + td = env.reset() + td1 = agent.get_action(td) + td2 = agent.get_action(td) + + assert not torch.allclose(td1["action"], td2["action"]) + + agent.eval() + td = env.reset() + eval_td1 = agent.get_eval_action(td) + eval_td2 = agent.get_eval_action(td) + + assert torch.allclose(eval_td1["action"], eval_td2["action"]) + + +@pytest.mark.parametrize( + "env", + ["mixed"], +) +@pytest.mark.parametrize( + "img_shape", + [(64, 64, 3), (128, 128, 3)], +) +@pytest.mark.parametrize( + "device", + ["cpu", "cuda"], +) +def test_mixd_obs_size_agent(env, device, img_shape): + if torch.cuda.is_available() and device == "cuda": + device = "cuda" + else: + device = "cpu" + with initialize(config_path="../conf"): + cfg = compose(config_name="config", overrides=["agent=td3", "device=" + device]) + + # Test data collection + env = get_env(env, img_shape) + agent, _ = get_agent(env.action_spec, env.observation_spec, cfg) + collection_round(env, agent, max_steps=10) + + # Test training + agent.train(batch_size=1, num_updates=1) + + # Test evaluation + td = env.reset() + td1 = agent.get_action(td) + td2 = agent.get_action(td) + + assert not torch.allclose(td1["action"], td2["action"]) + + agent.eval() + td = env.reset() + eval_td1 = agent.get_eval_action(td) + eval_td2 = agent.get_eval_action(td) + + assert torch.allclose(eval_td1["action"], eval_td2["action"]) \ No newline at end of file From 96c5818c2a23d5129efd4c51418034737cb36c5c Mon Sep 17 00:00:00 2001 From: BY571 Date: Fri, 27 Sep 2024 15:00:34 +0200 Subject: [PATCH 36/53] udpate dummy mixed obs env for image shape --- environments/dummy/mixed_obs_dummy.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/environments/dummy/mixed_obs_dummy.py b/environments/dummy/mixed_obs_dummy.py index 09b9dd3..14f9b16 100644 --- a/environments/dummy/mixed_obs_dummy.py +++ b/environments/dummy/mixed_obs_dummy.py @@ -21,7 +21,7 @@ class MixedObsDummyEnv(EnvBase): observation_key = "observation" pixel_observation_key = "pixels" - def __init__(self, max_episode_steps=10): + def __init__(self, max_episode_steps=10, img_shape=(64, 64, 3)): self.max_episode_steps = max_episode_steps self._batch_size = torch.Size([1]) self.action_spec = BoundedTensorSpec( @@ -36,8 +36,8 @@ def __init__(self, max_episode_steps=10): ) pixel_observation_spec = BoundedTensorSpec( - low=torch.zeros((1,) + (64, 64, 3), dtype=torch.uint8), - high=torch.ones((1,) + (64, 64, 3), dtype=torch.uint8) * 255, + low=torch.zeros((1,) + img_shape, 
dtype=torch.uint8), + high=torch.ones((1,) + img_shape, dtype=torch.uint8) * 255, ) self.observation_spec = CompositeSpec(shape=(1,)) From 383766b7fea01e6bfa447b9158a8567f2110cb46 Mon Sep 17 00:00:00 2001 From: BY571 Date: Fri, 27 Sep 2024 15:02:23 +0200 Subject: [PATCH 37/53] add cql agent test --- experiments/roboarm/train.py | 2 +- tests/test_agents.py | 38 ++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/experiments/roboarm/train.py b/experiments/roboarm/train.py index 64dc6bd..971ef2b 100644 --- a/experiments/roboarm/train.py +++ b/experiments/roboarm/train.py @@ -56,7 +56,7 @@ def run(cfg: DictConfig) -> None: train_episodes = cfg.episodes max_episode_steps = cfg.env.max_episode_steps - print("Start training...") + print("Start training...") quit = False try: for e in tqdm(range(train_episodes), desc="Training"): diff --git a/tests/test_agents.py b/tests/test_agents.py index 2d6f9dc..0fb6beb 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -211,6 +211,44 @@ def test_iql_agent(env, device): assert torch.allclose(eval_td1["action"], eval_td2["action"]) +@pytest.mark.parametrize( + "env", + ["mixed", "vec", "vec_goal"], +) +@pytest.mark.parametrize( + "device", + ["cpu", "cuda"], +) +def test_cql_agent(env, device): + if torch.cuda.is_available() and device == "cuda": + device = "cuda" + else: + device = "cpu" + with initialize(config_path="../conf"): + cfg = compose( + config_name="config", overrides=["agent=cql", "device=" + device] + ) + + # Test data collection + env = get_env(env) + agent, _ = get_agent(env.action_spec, env.observation_spec, cfg) + collection_round(env, agent, max_steps=10) + # Test training + agent.train(batch_size=1, num_updates=1) + + # Test evaluation + td = env.reset() + td1 = agent.get_action(td) + td2 = agent.get_action(td) + + assert not torch.allclose(td1["action"], td2["action"]) + + agent.eval() + td = env.reset() + eval_td1 = agent.get_eval_action(td) + eval_td2 = agent.get_eval_action(td) + + assert torch.allclose(eval_td1["action"], eval_td2["action"]) @pytest.mark.parametrize( "env", From 793d881927e805014d7394081a825d03c141afbd Mon Sep 17 00:00:00 2001 From: BY571 Date: Fri, 27 Sep 2024 15:13:00 +0200 Subject: [PATCH 38/53] update bc agent --- src/agents/behavior_cloning.py | 24 +++++++++++--------- tests/test_agents.py | 41 ++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 11 deletions(-) diff --git a/src/agents/behavior_cloning.py b/src/agents/behavior_cloning.py index 8faf7f4..4d1b1f8 100644 --- a/src/agents/behavior_cloning.py +++ b/src/agents/behavior_cloning.py @@ -33,9 +33,13 @@ def __init__(self, state_spec, action_spec, agent_config, device="cpu"): state_spec, action_spec, agent_config ) elif agent_config.policy_type == "stochastic": - self.actor = get_stochastic_actor( - state_spec, action_spec, agent_config + raise NotImplementedError( + "Stochastic actor training is not implemented yet" ) + # TODO: Implement stochastic actor training + # self.actor = get_stochastic_actor( + # state_spec, action_spec, agent_config + # ) else: raise ValueError( "policy_type not recognized, choose deterministic or stochastic" @@ -122,19 +126,17 @@ def create_replay_buffer( return replay_buffer @torch.no_grad() - def get_action(self, state): + def get_action(self, td: TensorDictBase) -> TensorDictBase: """Get action from actor network""" - - state = torch.from_numpy(state).float().to(self.device)[None, :] - input_td = td.TensorDict({"observation": state}, batch_size=1) - # 
set exploration mode? - out_td = self.actor(input_td).squeeze(0) - return out_td["action"].cpu().numpy() + with set_exploration_type(ExplorationType.RANDOM): + out_td = self.actor(td.to(self.device)) + return out_td def add_experience(self, transition: td.TensorDict): """Add experience to replay buffer""" - - pass + """Add experience to replay buffer""" + self.replay_buffer.extend(transition) + self.collected_transitions += 1 def train(self, batch_size=64, num_updates=1): """Train the agent""" diff --git a/tests/test_agents.py b/tests/test_agents.py index 0fb6beb..7048ebc 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -250,6 +250,47 @@ def test_cql_agent(env, device): assert torch.allclose(eval_td1["action"], eval_td2["action"]) +@pytest.mark.parametrize( + "env", + ["mixed", "vec", "vec_goal"], +) +@pytest.mark.parametrize( + "device", + ["cpu", "cuda"], +) +def test_bc_agent(env, device): + if torch.cuda.is_available() and device == "cuda": + device = "cuda" + else: + device = "cpu" + with initialize(config_path="../conf"): + cfg = compose( + config_name="config", overrides=["agent=bc", "device=" + device] + ) + + # Test data collection + env = get_env(env) + agent, _ = get_agent(env.action_spec, env.observation_spec, cfg) + collection_round(env, agent, max_steps=10) + # Test training + agent.train(batch_size=1, num_updates=1) + + # Test evaluation + td = env.reset() + td1 = agent.get_action(td) + td2 = agent.get_action(td) + + assert not torch.allclose(td1["action"], td2["action"]) + + agent.eval() + td = env.reset() + eval_td1 = agent.get_eval_action(td) + eval_td2 = agent.get_eval_action(td) + + assert torch.allclose(eval_td1["action"], eval_td2["action"]) + + + @pytest.mark.parametrize( "env", ["mixed"], From f19cbcb78d4dec33975b329d3d31e4e73b6b212d Mon Sep 17 00:00:00 2001 From: BY571 Date: Fri, 27 Sep 2024 15:13:13 +0200 Subject: [PATCH 39/53] add bc tests --- tests/test_agents.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/test_agents.py b/tests/test_agents.py index 7048ebc..b322fc7 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -276,12 +276,6 @@ def test_bc_agent(env, device): agent.train(batch_size=1, num_updates=1) # Test evaluation - td = env.reset() - td1 = agent.get_action(td) - td2 = agent.get_action(td) - - assert not torch.allclose(td1["action"], td2["action"]) - agent.eval() td = env.reset() eval_td1 = agent.get_eval_action(td) From 7614d88717fc4f1cc28e47b16765d02001793044 Mon Sep 17 00:00:00 2001 From: BY571 Date: Fri, 27 Sep 2024 15:18:37 +0200 Subject: [PATCH 40/53] update torchrl version --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index bcfa85e..d9ff24f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ pybricksdev tensordict==0.4.0 -torchrl==0.4.0 +torchrl==0.5.0 hydra-core==1.3.2 wandb==0.16.1 opencv-python==4.9.0.80 From 46fc4a6afb15d5bd1889e7ce9bfdb1281656d157 Mon Sep 17 00:00:00 2001 From: BY571 Date: Fri, 27 Sep 2024 15:21:00 +0200 Subject: [PATCH 41/53] update req --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d9ff24f..8b7f967 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ pybricksdev -tensordict==0.4.0 +tensordict==0.5.0 torchrl==0.5.0 hydra-core==1.3.2 wandb==0.16.1 From 34241925c915cee3261d77a5726ecb01a56b3698 Mon Sep 17 00:00:00 2001 From: BY571 Date: Fri, 27 Sep 2024 15:27:03 +0200 Subject: [PATCH 42/53] 
update formatting agents --- src/agents/behavior_cloning.py | 8 +++----- src/agents/cql.py | 4 +--- src/agents/iql.py | 4 +--- src/agents/random.py | 3 +-- src/agents/sac.py | 4 +--- src/agents/td3.py | 4 +--- 6 files changed, 8 insertions(+), 19 deletions(-) diff --git a/src/agents/behavior_cloning.py b/src/agents/behavior_cloning.py index 4d1b1f8..bfb39cb 100644 --- a/src/agents/behavior_cloning.py +++ b/src/agents/behavior_cloning.py @@ -6,8 +6,8 @@ from torchrl.data import BoundedTensorSpec, TensorDictReplayBuffer from torchrl.data.replay_buffers.storages import LazyMemmapStorage -from torchrl.envs.utils import ExplorationType, set_exploration_type from torchrl.envs import RenameTransform, ToTensorImage +from torchrl.envs.utils import ExplorationType, set_exploration_type from src.agents.base import BaseAgent from src.networks.networks import get_deterministic_actor, get_stochastic_actor @@ -29,9 +29,7 @@ def __init__(self, state_spec, action_spec, agent_config, device="cpu"): ) if agent_config.policy_type == "deterministic": - self.actor = get_deterministic_actor( - state_spec, action_spec, agent_config - ) + self.actor = get_deterministic_actor(state_spec, action_spec, agent_config) elif agent_config.policy_type == "stochastic": raise NotImplementedError( "Stochastic actor training is not implemented yet" @@ -121,7 +119,7 @@ def create_replay_buffer( ) replay_buffer.append_transform(lambda x: x.to(device)) # TODO: check if we have image in observation space if so add this transform - #replay_buffer.append_transform(ToTensorImage(from_int=True, shape_tolerant=True)) + # replay_buffer.append_transform(ToTensorImage(from_int=True, shape_tolerant=True)) return replay_buffer diff --git a/src/agents/cql.py b/src/agents/cql.py index 8375035..54d9d2d 100644 --- a/src/agents/cql.py +++ b/src/agents/cql.py @@ -21,9 +21,7 @@ def __init__(self, state_spec, action_spec, agent_config, device="cpu"): with_lagrange = agent_config.with_lagrange - self.actor = get_stochastic_actor( - state_spec, action_spec, agent_config - ) + self.actor = get_stochastic_actor(state_spec, action_spec, agent_config) self.critic = get_critic(state_spec, agent_config) self.actor.to(device) diff --git a/src/agents/iql.py b/src/agents/iql.py index 9c7e684..244bb17 100644 --- a/src/agents/iql.py +++ b/src/agents/iql.py @@ -20,9 +20,7 @@ def __init__(self, state_spec, action_spec, agent_config, device="cpu"): state_spec, action_spec, agent_config.name, device ) - self.actor = get_stochastic_actor( - state_spec, action_spec, agent_config - ) + self.actor = get_stochastic_actor(state_spec, action_spec, agent_config) self.critic = get_critic(state_spec, agent_config) self.value = get_value_operator(state_spec, agent_config) diff --git a/src/agents/random.py b/src/agents/random.py index 389e918..f599f43 100644 --- a/src/agents/random.py +++ b/src/agents/random.py @@ -44,7 +44,6 @@ def train(self, batch_size=64, num_updates=1): """Train the agent""" return {} - def create_replay_buffer( self, batch_size=256, @@ -76,4 +75,4 @@ def create_replay_buffer( ), batch_size=batch_size, ) - return replay_buffer \ No newline at end of file + return replay_buffer diff --git a/src/agents/sac.py b/src/agents/sac.py index 1e77f11..a87049f 100644 --- a/src/agents/sac.py +++ b/src/agents/sac.py @@ -19,9 +19,7 @@ def __init__(self, state_spec, action_spec, agent_config, device="cpu"): state_spec, action_spec, agent_config.name, device ) - self.actor = get_stochastic_actor( - state_spec, action_spec, agent_config - ) + self.actor = 
get_stochastic_actor(state_spec, action_spec, agent_config) self.critic = get_critic(state_spec, agent_config) self.actor.to(device) diff --git a/src/agents/td3.py b/src/agents/td3.py index db43b8e..706d821 100644 --- a/src/agents/td3.py +++ b/src/agents/td3.py @@ -31,9 +31,7 @@ def __init__(self, state_spec, action_spec, agent_config, device="cpu"): state_spec, action_spec, agent_config.name, device ) - self.actor = get_deterministic_actor( - state_spec, action_spec, agent_config - ) + self.actor = get_deterministic_actor(state_spec, action_spec, agent_config) self.critic = get_critic(state_spec, agent_config) self.model = nn.ModuleList([self.actor, self.critic]).to(device) From 41829199d5ca2eb738fa1835a68bc04892721417 Mon Sep 17 00:00:00 2001 From: BY571 Date: Fri, 27 Sep 2024 15:28:36 +0200 Subject: [PATCH 43/53] format nets --- src/networks/networks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/networks/networks.py b/src/networks/networks.py index 70d24a0..b1ff9c0 100644 --- a/src/networks/networks.py +++ b/src/networks/networks.py @@ -364,7 +364,7 @@ def get_deterministic_actor(observation_spec, action_spec, agent_config): action_spec=action_spec, num_cells=[agent_config.num_cells, agent_config.num_cells], activation_class=nn.ReLU, - img_shape=observation_spec["pixels"].shape + img_shape=observation_spec["pixels"].shape, ) elif "pixels" in observation_keys and "observation" in observation_keys: @@ -374,7 +374,7 @@ def get_deterministic_actor(observation_spec, action_spec, agent_config): action_spec=action_spec, num_cells=[agent_config.num_cells, agent_config.num_cells], activation_class=nn.ReLU, - img_shape=observation_spec["pixels"].shape + img_shape=observation_spec["pixels"].shape, ) else: raise NotImplementedError("Actor for this observation space not implemented") From 64328b7695cc0362ac8b4befa4ab6d82686dfffa Mon Sep 17 00:00:00 2001 From: BY571 Date: Fri, 27 Sep 2024 15:29:16 +0200 Subject: [PATCH 44/53] formatting tests --- environments/base/base_env.py | 1 - tests/test_agents.py | 22 ++++++++++------------ 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/environments/base/base_env.py b/environments/base/base_env.py index 3d03b01..4959188 100644 --- a/environments/base/base_env.py +++ b/environments/base/base_env.py @@ -126,7 +126,6 @@ def _set_seed(self, seed: int): return super()._set_seed(seed) - class BaseSimEnv(EnvBase): """ The base class for reinforcement learning environments used to simulate Lego robots. 
diff --git a/tests/test_agents.py b/tests/test_agents.py index b322fc7..754a812 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -49,7 +49,9 @@ def test_random_agent(env, device): else: device = "cpu" with initialize(config_path="../conf"): - cfg = compose(config_name="config", overrides=["device=" + device, "agent=random"]) + cfg = compose( + config_name="config", overrides=["device=" + device, "agent=random"] + ) # Test data collection env = get_env(env) agent, _ = get_agent(env.action_spec, env.observation_spec, cfg) @@ -172,6 +174,7 @@ def test_drq_agent(env, device): assert torch.allclose(eval_td1["action"], eval_td2["action"]) + @pytest.mark.parametrize( "env", ["mixed", "vec", "vec_goal"], @@ -186,9 +189,7 @@ def test_iql_agent(env, device): else: device = "cpu" with initialize(config_path="../conf"): - cfg = compose( - config_name="config", overrides=["agent=iql", "device=" + device] - ) + cfg = compose(config_name="config", overrides=["agent=iql", "device=" + device]) # Test data collection env = get_env(env) @@ -211,6 +212,7 @@ def test_iql_agent(env, device): assert torch.allclose(eval_td1["action"], eval_td2["action"]) + @pytest.mark.parametrize( "env", ["mixed", "vec", "vec_goal"], @@ -225,9 +227,7 @@ def test_cql_agent(env, device): else: device = "cpu" with initialize(config_path="../conf"): - cfg = compose( - config_name="config", overrides=["agent=cql", "device=" + device] - ) + cfg = compose(config_name="config", overrides=["agent=cql", "device=" + device]) # Test data collection env = get_env(env) @@ -250,6 +250,7 @@ def test_cql_agent(env, device): assert torch.allclose(eval_td1["action"], eval_td2["action"]) + @pytest.mark.parametrize( "env", ["mixed", "vec", "vec_goal"], @@ -264,9 +265,7 @@ def test_bc_agent(env, device): else: device = "cpu" with initialize(config_path="../conf"): - cfg = compose( - config_name="config", overrides=["agent=bc", "device=" + device] - ) + cfg = compose(config_name="config", overrides=["agent=bc", "device=" + device]) # Test data collection env = get_env(env) @@ -284,7 +283,6 @@ def test_bc_agent(env, device): assert torch.allclose(eval_td1["action"], eval_td2["action"]) - @pytest.mark.parametrize( "env", ["mixed"], @@ -325,4 +323,4 @@ def test_mixd_obs_size_agent(env, device, img_shape): eval_td1 = agent.get_eval_action(td) eval_td2 = agent.get_eval_action(td) - assert torch.allclose(eval_td1["action"], eval_td2["action"]) \ No newline at end of file + assert torch.allclose(eval_td1["action"], eval_td2["action"]) From 52ef271d9b73f6bb6919944bc4d62fbb758a9e6b Mon Sep 17 00:00:00 2001 From: BY571 Date: Fri, 27 Sep 2024 15:30:45 +0200 Subject: [PATCH 45/53] update experiment train evals --- experiments/2wheeler/eval.py | 2 +- experiments/roboarm/eval.py | 2 +- experiments/roboarm/train.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/experiments/2wheeler/eval.py b/experiments/2wheeler/eval.py index f1710a4..7d91075 100644 --- a/experiments/2wheeler/eval.py +++ b/experiments/2wheeler/eval.py @@ -17,7 +17,7 @@ from environments import make_env from src.agents import get_agent -from src.utils import login, setup_check , logout +from src.utils import login, logout, setup_check @hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config") diff --git a/experiments/roboarm/eval.py b/experiments/roboarm/eval.py index 8ecd0b0..371db03 100644 --- a/experiments/roboarm/eval.py +++ b/experiments/roboarm/eval.py @@ -42,7 +42,7 @@ def run(cfg: DictConfig) -> None: _ = input("Press Enter to 
start evaluation...") try: for e in tqdm(range(eval_episodes), desc="Evaluation"): - td = env.reset(env.get_reset_tensordict()) + td = env.reset() done = td.get("done", False) truncated = td.get("truncated", False) ep_return = 0 diff --git a/experiments/roboarm/train.py b/experiments/roboarm/train.py index 971ef2b..64dc6bd 100644 --- a/experiments/roboarm/train.py +++ b/experiments/roboarm/train.py @@ -56,7 +56,7 @@ def run(cfg: DictConfig) -> None: train_episodes = cfg.episodes max_episode_steps = cfg.env.max_episode_steps - print("Start training...") + print("Start training...") quit = False try: for e in tqdm(range(train_episodes), desc="Training"): From a60ab4316f41dc6bfa8c1978c506382cc8c267ab Mon Sep 17 00:00:00 2001 From: BY571 Date: Fri, 27 Sep 2024 15:47:40 +0200 Subject: [PATCH 46/53] update readme --- README.md | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index b614093..8ec6913 100644 --- a/README.md +++ b/README.md @@ -163,6 +163,18 @@ Evaluation videos of the trained agents can be found [here](https://sites.google Click me With the use of precollected [offline datasets]() we can pretrain agents with offline RL to perform a task without the need of real world interaction. Such pretrained policies can be evaluated directly or used for later training to fine tuning the pretrained policy on the real robot. +#### Datasets +The datasets can be downloaded from huggingface and contain expert and random transitions for the 2Wheeler (RunAway-v0 and Spinning-v0), Walker (Walker-v0) and RoboArm (RoboArm-v0) robots. + + ```bash + git lfs install + git clone git@hf.co:datasets/compsciencelab/BricksRL-Datasets + ``` + +The datasets consist of TensorDicts containing expert and random transitions, which can be directly loaded into the replay buffer. When initiating (pre-)training, simply provide the path to the desired TensorDict when prompted to load the replay buffer. + + + #### Pretrain an Agent The execution of an experiment for offline training is similar to the online training except that you run the **pretrain.py** script: @@ -183,13 +195,5 @@ Or run training for fine-tuning the policy on the real robot: python experiments/walker/train.py ``` -#### Datasets -The datasets can be downloaded from huggingface and contain expert and random transitions for the 2Wheeler (RunAway-v0 and Spinning-v0), Walker (Walker-v0) and RoboArm (RoboArm-v0) robots. - - ```bash - git lfs install - git clone git@hf.co:datasets/Sebasdi/BricksRL-Datasets - ``` - From 3e41ca80aead2dd1f37df27b46c36b3216284081 Mon Sep 17 00:00:00 2001 From: BY571 Date: Mon, 30 Sep 2024 09:48:51 +0200 Subject: [PATCH 47/53] fix spelling bug pretrain --- environments/__init__.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/environments/__init__.py b/environments/__init__.py index 7298277..e53dbf7 100644 --- a/environments/__init__.py +++ b/environments/__init__.py @@ -40,7 +40,7 @@ def make_env(config, pretrain=False): Returns: A tuple containing the new environment, its action space, and its state space. 
""" - env = make(name=config.env.name, env_conf=config.env, pretain=pretrain) + env = make(name=config.env.name, env_conf=config.env, pretrain=pretrain) observation_keys = [key for key in env.observation_spec.keys()] transforms = [] @@ -78,27 +78,27 @@ def make_env(config, pretrain=False): return env, action_spec, state_spec -def make(name="RunAway", env_conf=None, pretain=False): +def make(name="RunAway", env_conf=None, pretrain=False): if name == "runaway-v0": return RunAwayEnv_v0( max_episode_steps=env_conf.max_episode_steps, min_distance=env_conf.min_distance, verbose=env_conf.verbose, - pretain=pretain, + pretrain=pretrain, ) elif name == "spinning-v0": return SpinningEnv_v0( max_episode_steps=env_conf.max_episode_steps, sleep_time=env_conf.sleep_time, verbose=env_conf.verbose, - pretain=pretain, + pretrain=pretrain, ) elif name == "walker-v0": return WalkerEnv_v0( max_episode_steps=env_conf.max_episode_steps, verbose=env_conf.verbose, sleep_time=env_conf.sleep_time, - pretain=pretain, + pretrain=pretrain, ) elif name == "walker_sim-v0": return WalkerEnvSim_v0( @@ -114,7 +114,7 @@ def make(name="RunAway", env_conf=None, pretain=False): verbose=env_conf.verbose, sleep_time=env_conf.sleep_time, reward_signal=env_conf.reward_signal, - pretain=pretain, + pretrain=pretrain, ) elif name == "roboarm_sim-v0": return RoboArmSimEnv_v0( @@ -131,7 +131,7 @@ def make(name="RunAway", env_conf=None, pretain=False): reward_signal=env_conf.reward_signal, camera_id=env_conf.camera_id, goal_radius=env_conf.goal_radius, - pretain=pretain, + pretrain=pretrain, ) else: print("Environment not found") From 274f4bcf4ec74c9ca35cb84d42f1d90c5c3a0d4e Mon Sep 17 00:00:00 2001 From: BY571 Date: Mon, 30 Sep 2024 15:49:29 +0200 Subject: [PATCH 48/53] update readme --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 8ec6913..078db9d 100644 --- a/README.md +++ b/README.md @@ -116,7 +116,7 @@ Before running experiments, please review and modify the configuration settings ### Robots -Robots utilized for our experiments. Building instructions can be found [here](https://sites.google.com/view/bricksrl/building-instructions). +Robots utilized for our experiments. Building instructions can be found [here](https://bricksrl.github.io/ProjectPage/). | ![2wheeler](https://drive.google.com/uc?export=view&id=1IxqQ1VZchPZMNXyZnTULuNy53-LMYT6W) | ![Walker](https://drive.google.com/uc?export=view&id=1ImR0f1UNjC4sUHXWWg_D06eukrh-doW9) | ![RoboArm](https://drive.google.com/uc?export=view&id=1IYCJrl5rZBvOb6xKwbSUZqYrVwKjCpJH) | |:--:|:--:|:--:| @@ -139,7 +139,7 @@ Robots utilized for our experiments. Building instructions can be found [here](h
 Click me
 
-Evaluation videos of the trained agents can be found [here](https://sites.google.com/view/bricksrl/main).
+Evaluation videos of the trained agents can be found [here](https://bricksrl.github.io/ProjectPage/).
 
 ### 2Wheeler Results:
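The datasets section added to the README above says the pre-collected TensorDicts can be loaded directly into the replay buffer when (pre-)training starts. A minimal sketch of that loading step, assuming the cloned BricksRL-Datasets repository provides transition TensorDicts readable with `torch.load` (the file name below is hypothetical), using the same TorchRL buffer classes the agents in this series already import:

```python
import torch
from torchrl.data import TensorDictReplayBuffer
from torchrl.data.replay_buffers.storages import LazyMemmapStorage

# Hypothetical path inside the cloned BricksRL-Datasets repository.
dataset_path = "BricksRL-Datasets/roboarm-v0/expert.pt"

# Assumption: the file holds a TensorDict of stacked transitions
# (observation, action, reward, next, done, ...) saved with torch.save.
transitions = torch.load(dataset_path)

# Same buffer setup the agents use: memmap storage plus a sampling batch size.
replay_buffer = TensorDictReplayBuffer(
    storage=LazyMemmapStorage(max_size=1_000_000),
    batch_size=256,
)
replay_buffer.extend(transitions)  # bulk-load the offline transitions
batch = replay_buffer.sample()     # ready for offline updates (BC, IQL, CQL, ...)
```

Only the path changes between the expert and random splits described in the README.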
@@ -161,7 +161,7 @@ Evaluation videos of the trained agents can be found [here](https://sites.google
 Click me
 
-With the use of precollected [offline datasets]() we can pretrain agents with offline RL to perform a task without the need of real world interaction. Such pretrained policies can be evaluated directly or used for later training to fine tuning the pretrained policy on the real robot.
+With the use of precollected [offline datasets](https://huggingface.co/datasets/compsciencelab/BricksRL-Datasets) we can pretrain agents with offline RL to perform a task without the need of real world interaction. Such pretrained policies can be evaluated directly or used for later training to fine tuning the pretrained policy on the real robot.
 
 #### Datasets
 The datasets can be downloaded from huggingface and contain expert and random transitions for the 2Wheeler (RunAway-v0 and Spinning-v0), Walker (Walker-v0) and RoboArm (RoboArm-v0) robots.
@@ -171,7 +171,7 @@ With the use of precollected [offline datasets]() we can pretrain agents with of
  git lfs install
- git clone git@hf.co:datasets/Sebasdi/BricksRL-Datasets
+ git clone git@hf.co:datasets/compsciencelab/BricksRL-Datasets

From 6e1de6a8dc2afb76e5b103512bf0ab66bca65248 Mon Sep 17 00:00:00 2001
From: BY571
Date: Mon, 30 Sep 2024 18:06:18 +0200
Subject: [PATCH 49/53] update docstring

---
 environments/base/base_env.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/environments/base/base_env.py b/environments/base/base_env.py
index 4959188..b731c72 100644
--- a/environments/base/base_env.py
+++ b/environments/base/base_env.py
@@ -16,6 +16,8 @@ class BaseEnv(EnvBase):
     Args:
         action_dim (int): The dimensionality of the action space.
         state_dim (int): The dimensionality of the state space.
+        use_hub (bool): Whether to use the Pybricks hub for communication, if False, only the observation spec and action specs are created and can be used.
+        verbose (bool): Whether to print verbose output.
     """

     def __init__(

From aa08ab3682dabc87191270db21125fd815217846 Mon Sep 17 00:00:00 2001
From: BY571
Date: Mon, 30 Sep 2024 18:07:34 +0200
Subject: [PATCH 50/53] update docstring

---
 environments/base/base_env.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/environments/base/base_env.py b/environments/base/base_env.py
index b731c72..bc8f936 100644
--- a/environments/base/base_env.py
+++ b/environments/base/base_env.py
@@ -135,6 +135,8 @@ class BaseSimEnv(EnvBase):
     Args:
         action_dim (int): The dimensionality of the action space.
         state_dim (int): The dimensionality of the state space.
+        verbose (bool): Whether to print verbose output.
+        use_hub (bool): This argument is kept for compatibility but is not used in the simulation environment.
     """

     def __init__(

From 65c5859371bc2dac5e4f542847a3cb4c37a149ac Mon Sep 17 00:00:00 2001
From: BY571
Date: Mon, 30 Sep 2024 18:10:15 +0200
Subject: [PATCH 51/53] Update device in eval

---
 experiments/roboarm/eval.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/experiments/roboarm/eval.py b/experiments/roboarm/eval.py
index 371db03..a70998b 100644
--- a/experiments/roboarm/eval.py
+++ b/experiments/roboarm/eval.py
@@ -55,7 +55,7 @@ def run(cfg: DictConfig) -> None:
                 ep_steps += 1
                 step_start_time = time.time()
                 td = agent.get_eval_action(td)
-                td = env.step(td.to("cpu"))
+                td = env.step(td)
                 if env_name in VIDEO_LOGGING_ENVS:
                     image_caputres.append(
                         td.get(("next", "original_pixels")).cpu().numpy()

From c7e49492463086f80b209a8bc6b55032a80ffdab Mon Sep 17 00:00:00 2001
From: BY571
Date: Mon, 30 Sep 2024 18:13:21 +0200
Subject: [PATCH 52/53] update all agents

---
 src/agents/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/agents/__init__.py b/src/agents/__init__.py
index add51cd..5e5cafd 100644
--- a/src/agents/__init__.py
+++ b/src/agents/__init__.py
@@ -5,7 +5,7 @@ from src.agents.sac import SACAgent
 from src.agents.td3 import TD3Agent
 
-all_agents = ["td3", "sac", "iql", "cql", "random"]
+all_agents = ["td3", "sac", "iql", "cql", "bc", "random"]
 
 def get_agent(action_spec, state_spec, cfg):

From 61c6b95737d021186faa4a04699f6bebb54685ef Mon Sep 17 00:00:00 2001
From: BY571
Date: Mon, 30 Sep 2024 18:16:58 +0200
Subject: [PATCH 53/53] remove gail

---
 src/agents/gail.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 src/agents/gail.py

diff --git a/src/agents/gail.py b/src/agents/gail.py
deleted file mode 100644
index e69de29..0000000
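A closing note on the `img_shape` changes to `src/networks/networks.py` earlier in this series: every encoder now probes the CNN with `torch.ones(img_shape)`, where `img_shape` comes from `observation_spec["pixels"]`, instead of a hard-coded `(3, 100, 100)` dummy image, so the first linear layer is sized for whatever camera resolution the environment reports. A minimal sketch of that shape-probe pattern, assuming TorchRL's `ConvNet` and `MLP` modules; the channel counts and output width below are illustrative, not values taken from the patch:

```python
import torch
from torch import nn
from torchrl.modules import MLP, ConvNet

# Shape as it would come from the transformed observation spec (C, H, W); any
# resolution works because the feature size is inferred, not hard-coded.
img_shape = (3, 128, 128)

cnn = ConvNet(
    num_cells=[32, 64, 64],     # illustrative channel progression
    kernel_sizes=[8, 4, 3],
    strides=[4, 2, 1],
    activation_class=nn.ReLU,
)

# Probe with a dummy image: ConvNet flattens its output, so the last dim
# is exactly the feature size the downstream MLP needs.
cnn_output = cnn(torch.ones(img_shape))

mlp = MLP(
    in_features=cnn_output.shape[-1],
    out_features=256,
    num_cells=[256],
    activation_class=nn.ReLU,
)
print(cnn_output.shape, mlp(cnn_output).shape)
```

Probing with a dummy tensor keeps the feature size implicit in the architecture rather than duplicated as a constant, which is what lets the mixed-observation tests above swap in 64x64 and 128x128 images without touching the network definitions.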