
OfflineRL #4

Merged (57 commits, Sep 30, 2024)

Commits (57)
7a480ae  init keyboard agent (BY571, May 28, 2024)
414b969  add pretrain script for offlineRL (BY571, May 29, 2024)
f78b7f5  add iql (BY571, May 30, 2024)
c9b9dad  init cql (BY571, May 30, 2024)
5eefbfe  add cql (BY571, May 30, 2024)
f40f681  Merge branch 'main' into offlinerl (BY571, May 31, 2024)
4789324  update random agent (BY571, Jun 4, 2024)
063d801  add roboarm eval (BY571, Jun 4, 2024)
bee4303  add pretrain roboarm (BY571, Jun 6, 2024)
410c3a9  Update eval scripts roboarm walker (BY571, Jun 6, 2024)
6e4be42  update keyboard agent (BY571, Jun 18, 2024)
f0ffdc1  inint pickplace roboarm env (BY571, Jun 26, 2024)
732226b  update pickplace client, roboarm eval (BY571, Jul 3, 2024)
b429e9d  update replay buffer saving (BY571, Jul 3, 2024)
79a90c6  tanh low high update (BY571, Jul 3, 2024)
29dfeb0  Merge branch 'main' into offlinerl (BY571, Jul 3, 2024)
5b07957  Merge branch 'imitation_setup' into offlinerl (BY571, Jul 3, 2024)
0149606  add pynput install to requirements (BY571, Jul 3, 2024)
e9a14e1  update pickplace env and keyboard agent (BY571, Jul 3, 2024)
30f0aeb  tests (BY571, Jul 4, 2024)
ad38927  update config (BY571, Jul 4, 2024)
540edee  Update batched (BY571, Aug 5, 2024)
7e0b3ce  set bc default false (BY571, Sep 17, 2024)
19568b5  update bc agent buffer loading (BY571, Sep 17, 2024)
5a65efc  update agents buffer loading (BY571, Sep 17, 2024)
021ae45  update readme with dataset info (BY571, Sep 17, 2024)
57ccf27  update roboarm envs with pretrain flag (BY571, Sep 17, 2024)
bc4e53e  Update pretrain script with pretrain flag (BY571, Sep 17, 2024)
3d802aa  update gitgnore for .pth (BY571, Sep 27, 2024)
dd0bc25  take off keyboard agent pickplace (BY571, Sep 27, 2024)
2f56409  take off pickplace transform (BY571, Sep 27, 2024)
5047954  take off keyboard agent (BY571, Sep 27, 2024)
1fc9e83  pretrain docstring (BY571, Sep 27, 2024)
d07c5ea  update base_env_sim with use_hub (BY571, Sep 27, 2024)
153e00c  take off pretrain from sim envs (BY571, Sep 27, 2024)
32add07  update tests (BY571, Sep 27, 2024)
f67c052  update agent network creation (BY571, Sep 27, 2024)
e1c261b  add mixed obs dummy image shape tests (BY571, Sep 27, 2024)
96c5818  udpate dummy mixed obs env for image shape (BY571, Sep 27, 2024)
383766b  add cql agent test (BY571, Sep 27, 2024)
793d881  update bc agent (BY571, Sep 27, 2024)
f19cbcb  add bc tests (BY571, Sep 27, 2024)
7614d88  update torchrl version (BY571, Sep 27, 2024)
46fc4a6  update req (BY571, Sep 27, 2024)
3424192  update formatting agents (BY571, Sep 27, 2024)
4182919  format nets (BY571, Sep 27, 2024)
64328b7  formatting tests (BY571, Sep 27, 2024)
52ef271  update experiment train evals (BY571, Sep 27, 2024)
a60ab43  update readme (BY571, Sep 27, 2024)
3e41ca8  fix spelling bug pretrain (BY571, Sep 30, 2024)
274f4bc  update readme (BY571, Sep 30, 2024)
6e1de6a  update docstring (BY571, Sep 30, 2024)
aa08ab3  update docstring (BY571, Sep 30, 2024)
65c5859  Update device in eval (BY571, Sep 30, 2024)
c7e4949  update all agents (BY571, Sep 30, 2024)
61c6b95  remove gail (BY571, Sep 30, 2024)
0cfa784  Merge branch 'main' into offlinerl (BY571, Sep 30, 2024)
Changes from 1 commit: "Update batched" (BY571 committed Aug 5, 2024)
commit 540edee08ee76b005b73d1ce4b656a229c850fd1
conf/agent/iql.yaml (2 changes: 1 addition & 1 deletion)
@@ -2,7 +2,7 @@ name: iql
lr: 3e-4
batch_size: 256
num_updates: 1
prefill_episodes: 10
prefill_episodes: 0

num_cells: 256
gamma: 0.99
conf/agent/td3.yaml (4 changes: 3 additions & 1 deletion)
@@ -15,4 +15,6 @@ dropout: 0.0

prb: 0
buffer_size: 1000000
reset_params: False
reset_params: False
use_bc: True
alpha: 1.0
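The new use_bc and alpha entries in conf/agent/td3.yaml suggest a TD3+BC-style actor update (Fujimoto & Gu, 2021). Below is a minimal sketch of that recipe under the assumption that the two parameters map onto it directly; it is illustrative only and not the repository's actual update code.

import torch
import torch.nn.functional as F

def td3_bc_actor_loss(actor, critic, batch, alpha: float = 1.0, use_bc: bool = True):
    # `batch` is assumed to be a TensorDict-like mapping with "observation" and "action" entries.
    obs, behavior_actions = batch["observation"], batch["action"]
    pi = actor(obs)                        # deterministic policy action
    q = critic(obs, pi)                    # Q(s, pi(s))
    if not use_bc:
        return -q.mean()                   # plain TD3 actor objective
    lam = alpha / q.abs().mean().detach()  # TD3+BC scaling of the Q term
    return -(lam * q).mean() + F.mse_loss(pi, behavior_actions)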
conf/config.yaml (6 changes: 3 additions & 3 deletions)
@@ -4,10 +4,10 @@ run_name: ""
verbose: 0

device: "cuda"
episodes: 5000
episodes: 250

defaults:
- _self_
# random, sac, td3, droq
- agent: bc
- env: roboarm_pickplace-v0
- agent: sac
- env: walker_sim-v0
environments/__init__.py (17 changes: 17 additions & 0 deletions)
@@ -5,9 +5,11 @@
Compose,
DoubleToFloat,
ObservationNorm,
PermuteTransform,
RewardSum,
ToTensorImage,
TransformedEnv,
VIPRewardTransform,
)

from environments.roboarm_mixed_v0.RoboArmMixedEnv import RoboArmMixedEnv_v0
@@ -74,6 +76,18 @@ def make_env(config):
if "pixels" in observation_keys:
transforms.append(ToTensorImage(in_keys=["pixels"], from_int=True))

if config.env.name == "roboarm_pickplace-v0" and config.env.use_vip_reward:
transforms.append(PermuteTransform((-1, -2, -3), in_keys=["pixels"]))
transforms.append(
VIPRewardTransform(
in_keys=["pixels"],
download=True,
size=100,
model_name="resnet50",
tensor_pixels_keys=["pixels", ("next", "pixels")], # Does not seem to work
)
)

env = TransformedEnv(env, Compose(*transforms))

action_spec = env.action_spec
@@ -139,6 +153,9 @@ def make(name="RunAway", env_conf=None):
verbose=env_conf.verbose,
reward_signal=env_conf.reward_signal,
camera_id=env_conf.camera_id,
image_size=env_conf.image_size,
target_image_path=env_conf.target_image_path,
use_vip_reward=env_conf.use_vip_reward,
)
else:
print("Environment not found")
environments/base/base_env.py (10 changes: 10 additions & 0 deletions)
@@ -121,6 +121,16 @@ def _reset(
def _set_seed(self, seed: int):
return super()._set_seed(seed)

def get_reset_tensordict(self, **kwargs) -> TensorDictBase:
""" """
return TensorDict(
{
},
batch_size=[
1,
],
)


class BaseSimEnv(EnvBase):
"""
environments/roboarm_pickplace_v0/RoboArmPickPlaceEnv.py (86 changes: 64 additions & 22 deletions)
@@ -37,6 +37,11 @@ def __init__(
verbose: bool = False,
reward_signal: str = "dense",
camera_id: int = 0,
image_size: Tuple[int, int] = (64, 64),
human_control: bool = False,
use_vip_reward: bool = False,
target_image_path: str = None,
mixed_observation: bool = True,
):
self.sleep_time = sleep_time

@@ -46,10 +51,22 @@ def __init__(
], "Reward signal must be dense or sparse."
self.reward_signal = reward_signal
self.max_episode_steps = max_episode_steps
self.image_size = image_size
self.human_control = human_control

self.camera = cv2.VideoCapture(int(camera_id))
self._batch_size = torch.Size([1])

if target_image_path is not None:
target_image = np.load(target_image_path)
else:
target_image = np.load(
"environments/roboarm_pickplace_v0/pickplace_green100_target.npy"
)
self.target_image = target_image
self.use_vip_reward = use_vip_reward
self.mixed_observation = mixed_observation

# Define action spec
self.action_spec = BoundedTensorSpec(
low=-1,
@@ -80,23 +97,30 @@ def __init__(
ret, frame = self.camera.read()
if not ret:
raise ValueError("Camera not available.")
resized_frame = cv2.resize(frame, (64, 64))
resized_frame = cv2.resize(frame, self.image_size)
shape = resized_frame.shape
pixels_observation_spec = BoundedTensorSpec(
low=torch.zeros((1,) + shape, dtype=torch.uint8),
high=torch.ones((1,) + shape, dtype=torch.uint8) * 255,
dtype=torch.uint8,
)

self.observation_spec = CompositeSpec(
{
self.observation_key: observation_spec,
self.pixels_observation_key: pixels_observation_spec,
},
shape=(1,),
low=torch.zeros((1,) + shape, dtype=torch.int64),
high=torch.ones((1,) + shape, dtype=torch.int64) * 255,
dtype=torch.int64,
)
if self.mixed_observation:
self.observation_spec = CompositeSpec(
{
self.observation_key: observation_spec,
self.pixels_observation_key: pixels_observation_spec,
},
shape=(1,),
)
else:
self.observation_spec = CompositeSpec(
{
self.pixels_observation_key: pixels_observation_spec,
},
shape=(1,),
)

self.goal_positions = self.init_camera_position()
_ = self.init_camera_position()

super().__init__(
action_dim=self.action_dim, state_dim=self.state_dim, verbose=verbose
@@ -121,6 +145,21 @@ def init_camera_position(

return

def get_reset_tensordict(self, **kwargs) -> TensorDictBase:
""" """
if self.use_vip_reward:
return TensorDict(
{
"goal_image": torch.from_numpy(self.target_image)
.to(torch.int64)
.unsqueeze(0),
},
batch_size=[
1,
],
)
return TensorDict({},batch_size=[1])

def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase:
"""
Reset the environment and return the initial state.
@@ -130,23 +169,23 @@ def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase:
"""
# TODO solve this fake action sending before to receive first state
self.episode_step_iter = 0
if tensordict is not None:
action = tensordict.get("action").cpu().numpy().squeeze()
else:
action = np.zeros(self.action_dim)
action = np.zeros(self.action_dim)
self.send_to_hub(action)
time.sleep(self.sleep_time)
observation = self.read_from_hub()

ret, frame = self.camera.read()
resized_frame = cv2.resize(frame, (64, 64))
resized_frame = cv2.resize(frame, self.image_size)

return TensorDict(
{
self.observation_key: torch.tensor(observation).float(),
self.pixels_observation_key: torch.from_numpy(resized_frame)[
None, :
].to(torch.uint8),
].to(torch.int64),
# "goal_image": torch.from_numpy(self.target_image)
# .to(torch.int64)
# .unsqueeze(0),
},
batch_size=[1],
)
@@ -181,18 +220,21 @@ def _step(self, tensordict: TensorDictBase) -> TensorDictBase:
reward, done = self.reward(
frame,
)
resized_frame = cv2.resize(frame, (64, 64))
resized_frame = cv2.resize(frame, self.image_size)
next_tensordict = TensorDict(
{
self.observation_key: torch.tensor(next_observation).float(),
self.pixels_observation_key: torch.from_numpy(resized_frame)[
None, :
].to(torch.uint8),
].to(torch.int64),
"reward": torch.tensor([reward]).float(),
"done": torch.tensor([done]).bool(),
# "goal_image": torch.from_numpy(self.target_image)
# .to(torch.int64)
# .unsqueeze(0),
},
batch_size=[1],
)
) # .to(tensordict.device)

# increment episode step counter
self.episode_step_iter += 1
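The observation spec now branches on mixed_observation. A condensed sketch of that branch follows, with placeholder key names ("observation", "pixels") and a vec_spec argument standing in for the env's vector observation spec.

import torch
from torchrl.data import BoundedTensorSpec, CompositeSpec

def build_observation_spec(image_shape, vec_spec, mixed_observation: bool) -> CompositeSpec:
    # Pixel spec mirrors the diff: int64 images in [0, 255] with a leading batch dim of 1.
    pixels_spec = BoundedTensorSpec(
        low=torch.zeros((1,) + tuple(image_shape), dtype=torch.int64),
        high=torch.ones((1,) + tuple(image_shape), dtype=torch.int64) * 255,
        dtype=torch.int64,
    )
    if mixed_observation:
        return CompositeSpec(
            {"observation": vec_spec, "pixels": pixels_spec},
            shape=(1,),
        )
    return CompositeSpec({"pixels": pixels_spec}, shape=(1,))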
experiments/2wheeler/eval.py (5 changes: 3 additions & 2 deletions)
@@ -17,7 +17,7 @@

from environments import make_env
from src.agents import get_agent
from src.utils import login, setup_check
from src.utils import login, setup_check , logout


@hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config")
@@ -56,6 +56,7 @@ def run(cfg: DictConfig) -> None:
td = agent.get_eval_action(td)
actions.append(td.get("action").cpu().numpy())
td = env.step(td)
agent.add_experience(td)
total_agent_step_time = time.time() - step_start_time
total_step_times.append(total_agent_step_time)
done = td.get(("next", "done"), False)
@@ -90,7 +91,7 @@ def run(cfg: DictConfig) -> None:

except KeyboardInterrupt:
print("Evaluation interrupted by user.")

logout(agent)
env.close()


experiments/roboarm/eval.py (4 changes: 2 additions & 2 deletions)
@@ -42,7 +42,7 @@ def run(cfg: DictConfig) -> None:
_ = input("Press Enter to start evaluation...")
try:
for e in tqdm(range(eval_episodes), desc="Evaluation"):
td = env.reset()
td = env.reset(env.get_reset_tensordict())
done = td.get("done", False)
truncated = td.get("truncated", False)
ep_return = 0
@@ -55,7 +55,7 @@ def run(cfg: DictConfig) -> None:
ep_steps += 1
step_start_time = time.time()
td = agent.get_eval_action(td)
td = env.step(td)
td = env.step(td.to("cpu"))
if env_name in VIDEO_LOGGING_ENVS:
image_caputres.append(
td.get(("next", "original_pixels")).cpu().numpy()
experiments/roboarm/train.py (5 changes: 3 additions & 2 deletions)
@@ -1,7 +1,8 @@
import os
import sys
import time

import torch
from tensordict import TensorDict
import hydra
import numpy as np
import wandb
@@ -60,7 +61,7 @@ def run(cfg: DictConfig) -> None:
quit = False
try:
for e in tqdm(range(train_episodes), desc="Training"):
td = env.reset()
td = env.reset(env.get_reset_tensordict())
done = td.get("done", False)
truncated = td.get("truncated", False)
ep_return = 0
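Both eval and train scripts now reset via env.reset(env.get_reset_tensordict()). A minimal usage sketch, assuming the pick-and-place env with use_vip_reward enabled; the example shape is a placeholder.

import torch
from tensordict import TensorDict

def reset_env(env):
    # Empty TensorDict (batch_size [1]) by default; with use_vip_reward the
    # pick-and-place env adds a "goal_image" entry that the VIP reward
    # transform reads at reset time.
    reset_td = env.get_reset_tensordict()
    return env.reset(reset_td)

# Roughly the shape of what get_reset_tensordict returns when use_vip_reward is set:
example_reset_td = TensorDict(
    {"goal_image": torch.zeros(1, 64, 64, 3, dtype=torch.int64)},  # placeholder image shape
    batch_size=[1],
)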
src/agents/behavior_cloning.py (42 changes: 5 additions & 37 deletions)
@@ -7,7 +7,7 @@

from torchrl.data.replay_buffers.storages import LazyMemmapStorage
from torchrl.envs.utils import ExplorationType, set_exploration_type
from torchrl.envs import RenameTransform
from torchrl.envs import RenameTransform, ToTensorImage

from src.agents.base import BaseAgent
from src.networks.networks import get_deterministic_actor, get_stochastic_actor
@@ -32,12 +32,10 @@ def __init__(self, state_spec, action_spec, agent_config, device="cpu"):
self.actor = get_deterministic_actor(
self.observation_keys, action_spec, agent_config
)
self.pretrain = self.pretrain_deter
elif agent_config.policy_type == "stochastic":
self.actor = get_stochastic_actor(
self.observation_keys, action_spec, agent_config
)
self.pretrain = self.pretrain_stoch
else:
raise ValueError(
"policy_type not recognized, choose deterministic or stochastic"
@@ -76,7 +74,9 @@ def load_model(self, path):
def load_replaybuffer(self, path):
"""load replay buffer"""
try:
self.replay_buffer.load(path)
# self.replay_buffer.load(path)
loaded_data = TensorDictBase.load_memmap(path)
self.replay_buffer.extend(loaded_data)
if self.replay_buffer._batch_size != self.batch_size:
Warning(
"Batch size of the loaded replay buffer is different from the agent's config batch size! Rewriting the batch size to match the agent's config batch size."
@@ -118,6 +118,7 @@ def create_replay_buffer(
batch_size=batch_size,
)
replay_buffer.append_transform(lambda x: x.to(device))
replay_buffer.append_transform(ToTensorImage(from_int=True, shape_tolerant=True))

return replay_buffer

@@ -134,44 +135,11 @@ def get_action(self, state):
def add_experience(self, transition: td.TensorDict):
"""Add experience to replay buffer"""

# TODO: for bc we dont want to add to replay buffer
pass

def pretrain_stoch(self, wandb, batch_size=64, num_updates=1):
"""Pretrain the agent with simple behavioral cloning"""

for i in range(num_updates):
batch = self.replay_buffer.sample(batch_size)
input_td = td.TensorDict(
{"observation": batch["vec_observations"].float()}, batch_size=(256)
)
dist = self.actor.get_dist(input_td)
loss = -dist.log_prob(batch["actions"]).mean()
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
wandb.log({"pretrain/loss": loss.item()})

self.actor.eval()

def pretrain_deter(self, wandb, batch_size=64, num_updates=1):
"""Pretrain the agent with simple behavioral cloning"""

for i in range(num_updates):
batch = self.replay_buffer.sample(batch_size)
pred, _ = self.actor(batch["vec_observations"].float())
loss = torch.mean((pred - batch["actions"]) ** 2)
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
wandb.log({"pretrain/loss": loss.item()})

self.actor.eval()

def train(self, batch_size=64, num_updates=1):
"""Train the agent"""
log_data = {}
rename = RenameTransform(in_keys=["image_observation", ("next", "image_observation")], out_keys=["pixels", ("next", "pixels")])

for i in range(num_updates):
batch = self.replay_buffer.sample(batch_size).to(self.device)
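The BC agent now fills its replay buffer from a memmapped TensorDict dataset instead of calling replay_buffer.load(). A minimal sketch of that loading pattern; dataset_path, the buffer size, and the default batch size are placeholders.

from tensordict import TensorDictBase
from torchrl.data import TensorDictReplayBuffer
from torchrl.data.replay_buffers.storages import LazyMemmapStorage

def load_offline_dataset(dataset_path: str, batch_size: int = 256) -> TensorDictReplayBuffer:
    replay_buffer = TensorDictReplayBuffer(
        storage=LazyMemmapStorage(max_size=1_000_000),
        batch_size=batch_size,
    )
    # Load the memmapped TensorDict written during data collection and extend
    # the buffer with it, mirroring load_replaybuffer in the diff above.
    data = TensorDictBase.load_memmap(dataset_path)
    replay_buffer.extend(data)
    return replay_buffer

# replay_buffer = load_offline_dataset("datasets/roboarm_pickplace")  # hypothetical path
# batch = replay_buffer.sample()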