Skip to content

Commit

Permalink
Release 0.3.0 (ICML version, only L1) (#17)
Browse files Browse the repository at this point in the history
* Internal repo sync

* version bump 0.3.0

* Update README.md

---------

Co-authored-by: Leo Boisvert <[email protected]>
Co-authored-by: Alexandre Drouin <[email protected]>
  • Loading branch information
3 people authored Jun 17, 2024
1 parent e672831 commit d2c0536
Show file tree
Hide file tree
Showing 96 changed files with 27,161 additions and 2,079 deletions.
33 changes: 17 additions & 16 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,9 @@ WorkArena is included in [BrowserGym](https://github.com/ServiceNow/BrowserGym),

https://github.com/ServiceNow/WorkArena/assets/2374980/68640f09-7d6f-4eb1-b556-c294a6afef70

## ⚠️ Pre-Release warning ⚠️
Please note that the WorkArena benchmark is still undergoing minor bug fixes and updates, which may cause discrepancies with the results reported in our latest arXiv preprint. We plan to soon release a stable version of WorkArena with enhanced stability, and a final version v1.0.0 with a new suite of tasks.

## Benchmark Contents

At the moment, WorkArena includes `19,951` task instances drawn from `33` tasks that cover the main components of the ServiceNow user interface. The following videos show an agent built on `GPT-4-vision` interacting with every such component. As emphasized by our results, this benchmark is not solved and thus, the performance of the agent is not always on point.
At the moment, WorkArena includes `19,912` unique instances drawn from `33` tasks that cover the main components of the ServiceNow user interface. The following videos show an agent built on `GPT-4-vision` interacting with every such component. As emphasized by our results, this benchmark is not solved and thus, the performance of the agent is not always on point.

### Knowledge Bases

Expand Down Expand Up @@ -53,8 +50,11 @@ https://github.com/ServiceNow/WorkArena/assets/1726818/ca26dfaf-2358-4418-855f-8

### Dashboards

**Goal:** The agent must extract information from a dashboard.
**Goal:** The agent must answer a question that requires reading charts and (optionally) performing simple reasoning over them.

*Note: For demonstration purposes, a human is controlling the cursor since this is a pure retrieval task*

https://github.com/ServiceNow/WorkArena/assets/1726818/0023232c-081f-4be4-99bd-f60c766e6c3f


## Getting Started
Expand Down Expand Up @@ -98,6 +98,8 @@ Your installation is now complete! 🎉

Run this code to see WorkArena in action.

Note: the following example executes WorkArena's oracle (cheat) function to solve each task. To evaluate an agent, calls to `env.step()` must be used instead.

```python
import random

Expand All @@ -112,28 +114,27 @@ for task in ALL_WORKARENA_TASKS:

# Instantiate a new environment
env = BrowserEnv(task_entrypoint=task,
headless=False,
slow_mo=1000)
headless=False)
env.reset()

# Cheat functions use Playwright to automatically solve the task
env.chat.add_message(role="assistant", msg="On it. Please wait...")
env.task.cheat(env.page, env.chat.messages)
cheat_messages = []
env.task.cheat(env.page, cheat_messages)

# Send cheat messages to chat
for cheat_msg in cheat_messages:
env.chat.add_message(role=cheat_msg["role"], msg=cheat_msg["message"])

# Post solution to chat
if "KnowledgeBaseSearchTask" in str(task):
answer = env.chat.messages[-1]["message"]
env.chat.add_message(role="assistant", msg=f"The answer is:")
env.chat.add_message(role="assistant", msg=answer)
else:
env.chat.add_message(role="assistant", msg="I'm done!")
env.chat.add_message(role="assistant", msg="I'm done!")

# Validate the solution
reward, stop, info, message = env.task.validate(env.page, env.chat.messages)
reward, stop, message, info = env.task.validate(env.page, cheat_messages)
if reward == 1:
env.chat.add_message(role="user", msg="Yes, that works. Thanks!")
else:
env.chat.add_message(role="user", msg=f"No, that doesn't work. {message.get('message', '')}")
env.chat.add_message(role="user", msg=f"No, that doesn't work. {info.get('message', '')}")

sleep(3)
env.close()
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ homepage = "https://github.com/ServiceNow/WorkArena"

[project.scripts]
workarena-install = "browsergym.workarena.install:main"
workarena-human-eval = "browsergym.workarena.human_eval.tool:main"

[tool.hatch.version]
path = "src/browsergym/workarena/__init__.py"
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
browsergym-core>=0.2
english-words>=2.0.1
faker>=24.11.0
Faker>=24.8.0
numpy>=1.14
requests>=2.31
tenacity>=8.2.3 # only used in cheat() -> move to tests?
Expand Down
131 changes: 131 additions & 0 deletions scripts/extract_finetuning_traces.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
"""
A demonstration of how observation/action traces can be extracted
for WorkArena tasks without modifying the task code.
Author: Alexandre Drouin ([email protected])
Notes:
- This approach relies on monkey patching the Playwright actions to log the actions and observations.
It has not been tested for parallel execution. It might work with multiprocessing, but it will
certainly not work with multithreading.
"""

import importlib
import logging
import os
import pickle
import playwright.sync_api as playwright_sync

from browsergym.core.env import BrowserEnv
from browsergym.workarena import ALL_WORKARENA_TASKS
from collections import defaultdict
from tenacity import retry, stop_after_attempt, wait_fixed
from time import time


# Number of traces extracted for each task when this file is run as a script
N_PER_TASK = 10


def monkey_patch_playwright(observation_callback, trace_storage):
    """
    Override the default playwright actions to log the actions and observations.

    Each patched action first records an entry in ``trace_storage`` (observation,
    action name, arguments, BID of the target element and timestamp) and then
    executes the real playwright action.

    Parameters:
    ------------
    observation_callback: callable
        A function that returns the observation of the environment.
    trace_storage: list
        A list to store the trace of the actions and observations.
        These will be appended in-place.

    Notes:
    ------
    This function can be called several times in the same process. Wrappers
    installed by a previous call are replaced instead of being wrapped again,
    so only the most recent callback and storage are used.

    """
    import functools

    # Attribute under which a tracing wrapper remembers the function it wraps
    original_attr = "_trace_original"
    # JavaScript snippet that reads the "bid" attribute of an element
    bid_js = '(el) => el.getAttribute("bid")'

    def wrapper(func, interface):
        @functools.wraps(func)
        def wrapped(*args, **kwargs):
            # Get the observation (before the action is executed)
            obs = observation_callback()

            # Get the BID of the element on which we are acting.
            # args[0] is the instance on which the action was called.
            if interface.__name__ == "Locator":
                bid = args[0].element_handle().evaluate(bid_js)
            elif interface.__name__ == "Keyboard":
                # Keyboard actions are not attached to a specific element
                bid = "keyboard"
            elif interface.__name__ in ("Page", "Frame"):
                # Page/Frame actions receive a selector rather than an element, so
                # evaluating the snippet on the page itself would run it without
                # any element. Resolve the selector to its first match instead.
                selector = args[1] if len(args) > 1 else kwargs.get("selector")
                if selector is None:
                    bid = None
                else:
                    bid = args[0].locator(selector).first.element_handle().evaluate(bid_js)
            else:
                # ElementHandle: evaluate directly on the element
                bid = args[0].evaluate(bid_js)

            logging.info(f"Action: {func.__name__} BID: {bid} -- Args: {args[1:]} {kwargs}")
            trace_storage.append(
                {
                    "obs": obs,
                    "action": func.__name__,
                    "args": args[1:],
                    "kwargs": kwargs,
                    "bid": bid,
                    "time": time(),
                }
            )

            # Resume action
            return func(*args, **kwargs)

        # Remember the unpatched function so that a later call can re-patch from it
        setattr(wrapped, original_attr, func)
        return wrapped

    # Interfaces and actions we want to monkey patch
    # NOTE(review): this reload presumably does not re-create the classes (they
    # appear to be defined in a submodule), hence the explicit unwrapping below.
    importlib.reload(playwright_sync)
    from playwright.sync_api import Page, Frame, Locator, Keyboard, ElementHandle

    # TODO: Make sure the list of interfaces and actions is exhaustive
    #       It covers all that is used in WorkArena cheats as of April 11, 2024
    interfaces = [Page, Frame, Locator, Keyboard, ElementHandle]
    actions = ["click", "select_option", "set_checked", "fill", "press", "type", "down", "up"]

    for interface in interfaces:
        for action in actions:
            if hasattr(interface, action):
                current = getattr(interface, action)
                # Start from the original action if a wrapper is already installed,
                # otherwise wrappers (and their stale callbacks) would stack up.
                original = getattr(current, original_attr, current)
                setattr(interface, action, wrapper(original, interface))
                print(f"Monkey patched {interface.__name__}.{action}")


@retry(stop=stop_after_attempt(3), wait=wait_fixed(2))
def extract_trace(task_cls, headless=True):
    """
    Extracts the trace of actions and observations for a given task.

    The task is solved with its cheat function while the playwright actions are
    monkey patched to record every action. The whole extraction is attempted up
    to 3 times, waiting 2 seconds between attempts.

    Parameters:
    ------------
    task_cls: class
        The class of the task to extract the trace from.
    headless: bool
        Passed to the environment to control whether the browser is headless.

    Returns:
    --------
    trace: list
        The entries recorded by the patched playwright actions.

    """
    # Instantiate a new environment
    env = BrowserEnv(task_entrypoint=task_cls, headless=headless, slow_mo=1000)

    try:
        # Setup customized tracing
        trace = []
        monkey_patch_playwright(observation_callback=env._get_obs, trace_storage=trace)

        env.reset()
        env.task.cheat(env.page, env.chat.messages)
    finally:
        # Always release the environment, otherwise every failed attempt
        # (followed by a retry) would leave one behind.
        env.close()

    return trace


if __name__ == "__main__":
    # Extract N_PER_TASK traces for every WorkArena task and pickle them all
    # into a single file in the trace_profiling directory.
    os.makedirs("trace_profiling", exist_ok=True)

    task_traces = defaultdict(list)
    for task in ALL_WORKARENA_TASKS:
        print("Task:", task)
        for i in range(N_PER_TASK):
            print(f"Extracting trace {i+1}/{N_PER_TASK}")
            trace = extract_trace(task, headless=True)
            task_traces[task].append(trace)

    # Use a context manager so the file is flushed and closed deterministically
    with open("trace_profiling/task_traces.pkl", "wb") as traces_file:
        pickle.dump(task_traces, traces_file)
Loading

0 comments on commit d2c0536

Please sign in to comment.