diff --git a/.gitignore b/.gitignore
index 47a7942..ae1b171 100644
--- a/.gitignore
+++ b/.gitignore
@@ -233,6 +233,7 @@
 tools/caption_video.html
 experiments/2_libero/logs
 experiments/2_libero/logs/videos
 demo_data/demos25
+**/videos/**
 demo_data/libero_spatial_no_noops_1.0.0_lerobot
 experiments/test
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..c369b77
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "experiments/7_franka/deoxys_control"]
+	path = experiments/7_franka/deoxys_control
+	url = https://github.com/UT-Austin-RPL/deoxys_control.git
diff --git a/experiments/7_franka/README.md b/experiments/7_franka/README.md
new file mode 100644
index 0000000..4533e9f
--- /dev/null
+++ b/experiments/7_franka/README.md
@@ -0,0 +1,96 @@
+# Franka Emika Panda Robot Control with EO-1
+
+This directory contains the implementation for controlling Franka Emika Panda robots using the EO-1 model. The system enables real-time robot manipulation tasks through vision-language-action integration.
+
+## 🚀 Quick Start
+
+### Prerequisites
+
+**Hardware Requirements:**
+
+- Franka Emika Panda robot arm
+- RealSense cameras (or compatible RGB cameras)
+- **NUC**: Configured with a real-time kernel for robot control
+- **Workstation**: Equipped with a GPU for model inference
+
+**Software Requirements:**
+
+- Ubuntu 20.04+ with CUDA support
+- Python 3.10+
+- Real-time kernel configured on the NUC
+- Deoxys control system properly configured
+
+### Installation
+
+1. **Set up submodules:**
+
+```bash
+git submodule update --init --recursive experiments/7_franka/deoxys_control
+```
+
+2. **Configure the robot control system:**
+   Follow the [Deoxys Documentation](https://zhuyifengzju.github.io/deoxys_docs/html/index.html) to configure your NUC and workstation for Franka control.
+
+3. **Install dependencies on the workstation:**
+
+```bash
+# Create conda environment
+conda create -n eo python=3.10
+conda activate eo
+
+# Install deoxys for the workstation
+pip install -e experiments/7_franka/deoxys_control/deoxys
+
+# Install additional requirements
+pip install -r experiments/7_franka/requirements.txt
+```
+
+**Note**: The NUC handles real-time robot control while the workstation runs the EO-1 model inference. Both systems must be properly configured according to the Deoxys documentation.
+
+## 🤖 Running Robot Control
+
+### Basic Usage
+
+```bash
+python experiments/7_franka/eval_franka.py \
+    --model-path "path/to/your/model" \
+    --repo-id libero_spatial_no_noops_1.0.0_lerobot \
+    --task "Pick and place a cube" \
+    --video-out-path experiments/7_franka/videos \
+    --max-timesteps 300
+```
+
+### Parameters
+
+| Parameter          | Description                                                                               | Default                                 |
+| ------------------ | ----------------------------------------------------------------------------------------- | --------------------------------------- |
+| `--model-path`     | Path to the trained EO-1 model checkpoint                                                  | Required                                |
+| `--repo-id`        | Dataset repository ID for task specification                                               | `libero_spatial_no_noops_1.0.0_lerobot` |
+| `--task`           | Natural language description of the task                                                   | `"Pick and place a cube"`               |
+| `--video-out-path` | Directory to save recorded videos                                                          | `experiments/7_franka/videos`           |
+| `--max-timesteps`  | Maximum number of control steps                                                            | `300`                                   |
+| `--resize-size`    | Image resize dimensions for model input                                                    | `224`                                   |
+| `--replan-steps`   | Number of actions executed from each predicted chunk before re-planning (receding-horizon control; see the sketch below) | `5`                                     |
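+
+The `--replan-steps` value controls how many actions from each predicted chunk are executed before the policy is queried again. Below is a minimal, self-contained sketch of this receding-horizon loop as used by `eval_franka.py`; the model call and the robot command are replaced with placeholder stubs here:
+
+```python
+import collections
+
+import numpy as np
+
+REPLAN_STEPS = 5  # corresponds to --replan-steps
+
+
+def predict_action_chunk(observation):
+    # Placeholder for model inference; the real script builds a batch of camera
+    # images and robot state and calls processor.select_action(model, batch).
+    return np.zeros((8, 7))
+
+
+action_plan = collections.deque()
+for step in range(300):  # corresponds to --max-timesteps
+    observation = None  # placeholder for camera frames + end-effector state
+    if not action_plan:
+        # Re-plan only when the queue is empty, keeping the first REPLAN_STEPS actions.
+        chunk = predict_action_chunk(observation)
+        action_plan.extend(chunk[:REPLAN_STEPS])
+    action = action_plan.popleft()
+    # The real script sends `action` to the robot via robot_interface.control(...).
+```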
+
+### Camera Configuration
+
+The system supports multiple camera inputs. Update the camera serial numbers in `eval_franka.py`:
+
+```python
+# Camera serial numbers (update these with your actual camera IDs)
+EGO_CAMERA = "213522070137"  # Wrist camera
+THIRD_CAMERA = "243222074139"  # External camera
+```
+
+## 🔒 Safety Considerations
+
+- Always ensure proper workspace setup before operation
+- Monitor robot movements and be ready to use emergency stop
+- Verify camera positioning for optimal visual coverage
+
+## 📝 Notes
+
+- The system requires both wrist and external cameras for optimal performance
+- Model performance depends on lighting conditions and camera positioning
+- Regular calibration of the robot and cameras is recommended
+- Check the video output directory for recorded demonstrations
diff --git a/experiments/7_franka/deoxys_control b/experiments/7_franka/deoxys_control
new file mode 160000
index 0000000..97396fd
--- /dev/null
+++ b/experiments/7_franka/deoxys_control
@@ -0,0 +1 @@
+Subproject commit 97396fd91324e9e961f061544e80a208889526ff
diff --git a/experiments/7_franka/eval_franka.py b/experiments/7_franka/eval_franka.py
new file mode 100644
index 0000000..2ba5bba
--- /dev/null
+++ b/experiments/7_franka/eval_franka.py
@@ -0,0 +1,196 @@
+import os
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+import collections
+import copy
+import dataclasses
+import os.path as osp
+import time
+from datetime import datetime
+from pathlib import Path
+
+import cv2
+import deoxys.utils.transform_utils as dft
+import imageio
+import numpy as np
+import torch
+import tqdm
+import tyro
+from deoxys import config_root
+from deoxys.experimental.motion_utils import reset_joints_to
+from deoxys.franka_interface import FrankaInterface
+from deoxys.utils import YamlConfig
+from PIL import Image
+from realsense_camera import MultiCamera
+from transformers import AutoModel, AutoProcessor
+
+# Add your camera serial numbers here
+EGO_CAMERA = "213522070137"
+THIRD_CAMERA = "243222074139"
+
+reset_joint_positions = [
+    0.0760389047913384,
+    -1.0362613022620384,
+    -0.054254247684777324,
+    -2.383951857286591,
+    -0.004505598470154735,
+    1.3820559157131187,
+    0.784935455988679,
+]
+
+
+def save_rollout_video(rollout_images, save_dir):
+    """Saves an MP4 replay of an episode."""
+    date_time = time.strftime("%Y_%m_%d-%H_%M_%S")
+    mp4_path = Path(save_dir) / f"{date_time}.mp4"
+    video_writer = imageio.get_writer(mp4_path, fps=5)
+    for img in rollout_images:
+        video_writer.append_data(img)
+    video_writer.close()
+    print(f"Saved rollout MP4 at path {mp4_path}")
+
+
+@dataclasses.dataclass
+class Args:
+    #################################################################################################################
+    # Model parameters
+    #################################################################################################################
+    resize_size: int = 224
+    replan_steps: int = 5
+    model_path: str = ""
+    repo_id: str = ""
+    task: str = ""
+
+    #################################################################################################################
+    # Utils
+    #################################################################################################################
+    video_out_path: Path = Path("experiments/7_franka/videos")  # Path to save videos
+    max_timesteps: int = 300  # Number of timesteps to run
+
+
+def convert_gripper_action(action):
+    action[-1] = 1 - action[-1]
+    if action[-1] < 0.5:
+        action[-1] = -1
+
+    return action
+
+
+def get_robot_interface():
+    robot_interface = FrankaInterface(osp.join(config_root, "charmander.yml"))
+    controller_cfg = YamlConfig(osp.join(config_root, "osc-pose-controller.yml")).as_easydict()
+    controller_type = "OSC_POSE"
+
+    return robot_interface, controller_cfg, controller_type
+
+
+def main(args: Args):
+    multi_camera = MultiCamera()
+    robot_interface, controller_cfg, controller_type = get_robot_interface()
+
+    model = (
+        AutoModel.from_pretrained(args.model_path, dtype=torch.bfloat16, trust_remote_code=True).eval().cuda()
+    )
+
+    processor = AutoProcessor.from_pretrained(args.model_path, trust_remote_code=True)
+
+    while True:
+        action_plan = collections.deque()
+        input("Press Enter to start episode ...")
+        reset_joints_to(robot_interface, reset_joint_positions)
+
+        replay_images = []
+        bar = tqdm.tqdm(
+            range(args.max_timesteps),
+            position=0,
+            leave=True,
+            ncols=80,
+            desc="Rollout steps",
+        )
+
+        for _ in bar:
+            try:
+                images = multi_camera.get_frame()
+                last_state = robot_interface._state_buffer[-1]
+                last_gripper_state = robot_interface._gripper_state_buffer[-1]
+                frame, _ = images[THIRD_CAMERA]
+                ego_frame, _ = images[EGO_CAMERA]
+
+                if not action_plan:
+                    # Convert the BGR camera frames to RGB once, for both the replay video and the model input.
+                    frame_rgb = cv2.cvtColor(copy.deepcopy(frame), cv2.COLOR_BGR2RGB)
+                    ego_frame_rgb = cv2.cvtColor(copy.deepcopy(ego_frame), cv2.COLOR_BGR2RGB)
+                    replay_images.append(frame_rgb.copy())
+
+                    eef_pose = np.asarray(last_state.O_T_EE, dtype=np.float32).reshape(4, 4).T
+                    eef_state = np.concatenate(
+                        (
+                            eef_pose[:3, -1],
+                            dft.mat2euler(
+                                eef_pose[:3, :-1],
+                            ),
+                        ),
+                        axis=-1,
+                    )
+                    gripper_state = np.array([last_gripper_state.width])
+                    state_data = np.concatenate([eef_state.flatten(), np.array([0]), gripper_state])
+
+                    frame_resized = cv2.resize(frame_rgb, (args.resize_size, args.resize_size))
+                    ego_frame_resized = cv2.resize(ego_frame_rgb, (args.resize_size, args.resize_size))
+
+                    ego_frame = Image.fromarray(ego_frame_resized)
+                    frame = Image.fromarray(frame_resized)
+
+                    # NOTE: Change the keys to match your model
+                    batch = {
+                        "observation.images.image": [frame],
+                        "observation.images.wrist_image": [ego_frame],
+                        "observation.state": [state_data],
+                        "task": [args.task],
+                        "repo_id": [args.repo_id],
+                    }
+                    ov_out = processor.select_action(
+                        model,
+                        batch,
+                    )
+                    action_chunk = ov_out.action[0].numpy()
+                    assert len(action_chunk) >= args.replan_steps, (
+                        f"We want to replan every {args.replan_steps} steps, but policy only predicts {len(action_chunk)} steps."
+                    )
+                    action_plan.extend(action_chunk[: args.replan_steps])
+
+                pred_action_chunk = action_plan.popleft()
+                action = pred_action_chunk
+                rotation_matrix = dft.euler2mat(action[3:6])
+                quat = dft.mat2quat(rotation_matrix)
+                axis_angle = dft.quat2axisangle(quat)
+                action[3:6] = axis_angle
+                action = convert_gripper_action(action)
+
+                robot_interface.control(
+                    controller_type=controller_type, action=action, controller_cfg=controller_cfg
+                )
+
+            except KeyboardInterrupt:
+                break
+
+        # saving video
+        video_save_path = args.video_out_path / args.task.replace(" ", "_")
+        video_save_path.mkdir(parents=True, exist_ok=True)
+        curr_time = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
+        save_path = video_save_path / f"{curr_time}.mp4"
+        video = np.stack(replay_images)
+        imageio.mimsave(save_path, video, fps=20)
+
+        if input("Do one more eval (default y)? [y/n]").lower() == "n":
+            break
+
+
+if __name__ == "__main__":
+    args = tyro.cli(Args)
+    main(args)
diff --git a/experiments/7_franka/realsense_camera.py b/experiments/7_franka/realsense_camera.py
new file mode 100755
index 0000000..bd29d95
--- /dev/null
+++ b/experiments/7_franka/realsense_camera.py
@@ -0,0 +1,185 @@
+import numpy as np
+import pyrealsense2 as rs
+
+# the camera intrinsics from the camera calibration
+serial_number_to_cam_intr = {
+    "243222074139": {
+        "fx": 604.987548828125,
+        "fy": 604.9332885742188,
+        "px": 325.9312438964844,
+        "py": 243.00851440429688,
+    },  # eye-to-hand
+    "213522070137": {
+        "fx": 596.382688,
+        "fy": 596.701788,
+        "px": 333.837001,
+        "py": 254.401211,
+    },  # eye-in-hand
+}
+
+
+class Camera:
+    def __init__(
+        self,
+        serial_number: str | None = "213522070137",
+        camera_width: int = 640,
+        camera_height: int = 480,
+        camera_fps: int = 30,
+    ) -> None:
+        self.camera_width = camera_width
+        self.camera_height = camera_height
+        self.camera_fps = camera_fps
+        self.pipeline = rs.pipeline()
+        # self.pipeline.wait_for_frames(9999)
+        self.config = rs.config()
+
+        if serial_number is not None:
+            self.config.enable_device(serial_number)
+        self.config.enable_stream(
+            rs.stream.depth,
+            self.camera_width,
+            self.camera_height,
+            rs.format.z16,
+            self.camera_fps,
+        )
+        self.config.enable_stream(
+            rs.stream.color,
+            self.camera_width,
+            self.camera_height,
+            rs.format.bgr8,
+            self.camera_fps,
+        )
+
+        self.profile = self.pipeline.start(self.config)
+
+        depth_sensor = self.profile.get_device().first_depth_sensor()
+        self.depth_scale: float = depth_sensor.get_depth_scale()
+        print("Depth Scale is: ", self.depth_scale)
+
+        align_to = rs.stream.color
+        self.align = rs.align(align_to)
+
+    def close(self):
+        self.pipeline.stop()
+
+    def get_frame(self, filter=True) -> tuple[np.ndarray, np.ndarray]:
+        """
+        Args:
+            filter (bool, optional): Whether to apply filters to depth frames. Defaults to True.
+
+        Returns:
+            color_image (np.ndarray): shape (H, W, 3), color image (BGR).
+            depth_image (np.ndarray): shape (H, W), depth image in meters.
+        """
+
+        depth_to_disparity = rs.disparity_transform(True)
+
+        spatial = rs.spatial_filter()
+        spatial.set_option(rs.option.filter_magnitude, 2)
+        spatial.set_option(rs.option.filter_smooth_alpha, 0.5)
+        spatial.set_option(rs.option.filter_smooth_delta, 20)
+
+        temporal = rs.temporal_filter()
+        temporal.set_option(rs.option.filter_smooth_alpha, 0.4)
+        temporal.set_option(rs.option.filter_smooth_delta, 20)
+
+        disparity_to_depth = rs.disparity_transform(False)
+
+        hole_filling = rs.hole_filling_filter()
+
+        frames = self.pipeline.wait_for_frames()
+        aligned_frames = self.align.process(frames)
+        depth_frame = aligned_frames.get_depth_frame()
+        color_frame = aligned_frames.get_color_frame()
+
+        # Apply the filters to depth frames
+        if filter:
+            filtered_depth = depth_to_disparity.process(depth_frame)
+            filtered_depth = spatial.process(filtered_depth)
+            filtered_depth = temporal.process(filtered_depth)
+            filtered_depth = disparity_to_depth.process(filtered_depth)
+            filtered_depth = hole_filling.process(filtered_depth)
+
+            depth_image = np.asanyarray(filtered_depth.get_data())
+            color_image = np.asanyarray(color_frame.get_data())
+        else:
+            depth_image = np.asanyarray(depth_frame.get_data())
+            color_image = np.asanyarray(color_frame.get_data())
+
+        return color_image, depth_image * self.depth_scale
+
+    def get_camera_intrinsics(self, use_raw=False, serial_number: str | None = "213522070137"):
+        """
+        Args:
+            use_raw (bool, optional): Whether to use the camera intrinsics from the raw stream. Defaults to False.
+
+        Returns:
+            {
+                "fx": Focal length x,
+                "fy": Focal length y,
+                "px": Principal point x,
+                "py": Principal point y
+            }
+        """
+        if use_raw:
+            profile = self.profile.get_stream(rs.stream.color)
+            intr = profile.as_video_stream_profile().get_intrinsics()
+            return {"px": intr.ppx, "py": intr.ppy, "fx": intr.fx, "fy": intr.fy}
+        return serial_number_to_cam_intr[serial_number]
+
+
+class MultiCamera:
+    def __init__(
+        self,
+        serial_numbers: list[str] | None = None,
+        camera_width: int = 640,
+        camera_height: int = 480,
+        camera_fps: int = 30,
+    ) -> None:
+        all_serial_numbers = []
+        for d in sorted(rs.context().devices, key=lambda x: x.get_info(rs.camera_info.serial_number)):
+            print(
+                "Found device: ",
+                d.get_info(rs.camera_info.name),
+                " ",
+                d.get_info(rs.camera_info.serial_number),
+            )
+            if d.get_info(rs.camera_info.name).lower() != "platform camera":
+                all_serial_numbers.append(d.get_info(rs.camera_info.serial_number))
+
+        if serial_numbers is None:
+            serial_numbers = all_serial_numbers
+        else:
+            serial_numbers = [
+                serial_number for serial_number in serial_numbers if serial_number in all_serial_numbers
+            ]
+
+        print("Using cameras with serial numbers: ", serial_numbers)
+
+        self.cameras = {
+            serial_number: Camera(serial_number, camera_width, camera_height, camera_fps)
+            for serial_number in serial_numbers
+        }
+
+        # Warm up the streams so the temporal depth filters converge before first real use.
+        for _ in range(20):
+            self.get_frame()
+
+    def close(self):
+        for camera in self.cameras.values():
+            camera.close()
+
+    def get_frame(
+        self, serial_numbers: list[str] | None = None, filter=True
+    ) -> dict[str, tuple[np.ndarray, np.ndarray]]:
+        if serial_numbers is None:
+            serial_numbers = list(self.cameras.keys())
+        return {serial_number: self.cameras[serial_number].get_frame() for serial_number in serial_numbers}
+
+    def get_camera_intrinsics(
+        self, serial_numbers: list[str] | None = None, use_raw=False
+    ) -> dict[str, dict[str, float]]:
+        if serial_numbers is None:
+            serial_numbers = self.cameras.keys()
+        return {
+            serial_number: self.cameras[serial_number].get_camera_intrinsics(use_raw, serial_number)
+            for serial_number in serial_numbers
+        }
diff --git a/experiments/7_franka/requirements.txt b/experiments/7_franka/requirements.txt
new file mode 100644
index 0000000..c9ca9a2
--- /dev/null
+++ b/experiments/7_franka/requirements.txt
@@ -0,0 +1,8 @@
+torch
+transformers>=4.56.0
+opencv-python
+imageio
+tyro
+pillow
+pyrealsense2
+lerobot
diff --git a/scripts/eval_policy.py b/scripts/eval_policy.py
index 01192d5..e494895 100644
--- a/scripts/eval_policy.py
+++ b/scripts/eval_policy.py
@@ -22,12 +22,16 @@ def eval_policy():
     # set the observation (image, state, etc.)
-    image0 = "demo_data/example.png"
-    image1 = Image.open("demo_data/example.png")
+    import numpy as np
 
-    model = AutoModel.from_pretrained(args.model_path, dtype=torch.bfloat16).eval().cuda()
+    image0 = (np.random.rand(224, 224, 3) * 255).astype(np.uint8)
+    image1 = Image.fromarray(image0.copy())
 
-    processor = AutoProcessor.from_pretrained(args.model_path)
+    model = (
+        AutoModel.from_pretrained(args.model_path, dtype=torch.bfloat16, trust_remote_code=True).eval().cuda()
+    )
+
+    processor = AutoProcessor.from_pretrained(args.model_path, trust_remote_code=True)
 
     batch = {
         "observation.images.image": [image0],