From e91ab057fecb0bc58196d7bd8fb4b09deffd6995 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal Date: Thu, 4 Dec 2025 16:56:58 -0500 Subject: [PATCH 01/11] overlay_utils can return array if needed. --- src/agentlab/agents/agent_utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/agentlab/agents/agent_utils.py b/src/agentlab/agents/agent_utils.py index 179a94d2..954977b2 100644 --- a/src/agentlab/agents/agent_utils.py +++ b/src/agentlab/agents/agent_utils.py @@ -5,6 +5,7 @@ from agentlab.analyze import overlay_utils from agentlab.llm.llm_utils import img_to_base_64 +import numpy as np def draw_mouse_pointer(image: Image.Image, x: int, y: int) -> Image.Image: @@ -135,7 +136,7 @@ def zoom_webpage(page: Page, zoom_factor: float = 1.5): return page -def overlay_action(obs, action): +def overlay_action(obs, action, return_array=False): """Overlays actions on screenshot in-place""" act_img = copy.deepcopy(obs["screenshot"]) act_img = Image.fromarray(act_img) @@ -153,4 +154,7 @@ def overlay_action(obs, action): pass overlay_utils.annotate_action(act_img, action, properties=new_obs_properties) - return img_to_base_64(act_img) + if return_array: + return np.array(act_img) + else: + return img_to_base_64(act_img) From 2e275cd1bf705009e1c79434df8367ccd518416d Mon Sep 17 00:00:00 2001 From: Aman Jaiswal Date: Thu, 4 Dec 2025 16:57:44 -0500 Subject: [PATCH 02/11] exact goal loading in the tool-use-agent --- .../agents/tool_use_agent/tool_use_agent.py | 13 +++++++++---- src/agentlab/utils/hinting.py | 1 - 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/agentlab/agents/tool_use_agent/tool_use_agent.py b/src/agentlab/agents/tool_use_agent/tool_use_agent.py index 062d8ef3..a8c4a9e7 100644 --- a/src/agentlab/agents/tool_use_agent/tool_use_agent.py +++ b/src/agentlab/agents/tool_use_agent/tool_use_agent.py @@ -325,12 +325,17 @@ def _init(self): embedder_server=self.embedder_server, ) - def apply(self, llm, discussion: 
StructuredDiscussion, task_name: str) -> dict: + def apply(self, llm, discussion: StructuredDiscussion, obs: dict, task_name: str) -> dict: if not self.use_task_hint: return {} - goal = "\n".join([c.get("text", "") for c in discussion.groups[0].messages[1].content]) - task_hints = self.hints_source.choose_hints(llm, task_name, goal) + # goal = "\n".join([c.get("text", "") for c in discussion.groups[0].messages[1].content]) + try: + goal_text = obs["goal_object"][0]["text"] + except (KeyError, IndexError): + Warning("Goal text not found in observation") + goal_text = "" + task_hints = self.hints_source.choose_hints(llm, task_name, goal_text) hints = [] for hint in task_hints: @@ -472,7 +477,7 @@ def get_action(self, obs: Any) -> float: self.config.summarizer.apply_init(self.llm, self.discussion) self.config.general_hints.apply(self.llm, self.discussion) - self.task_hint.apply(self.llm, self.discussion, self.task_name) + self.task_hint.apply(self.llm, self.discussion, obs=obs, task_name=self.task_name) self.discussion.new_group() diff --git a/src/agentlab/utils/hinting.py b/src/agentlab/utils/hinting.py index 506513d5..83e55efc 100644 --- a/src/agentlab/utils/hinting.py +++ b/src/agentlab/utils/hinting.py @@ -52,7 +52,6 @@ def __init__( self.hint_db_path, header=0, index_col=None, - dtype=str, converters={ "trace_paths_json": lambda x: json.loads(x) if pd.notna(x) else [], "source_trace_goals": lambda x: json.loads(x) if pd.notna(x) else [], From 9ea0ff92dcbc15580c8bcb327c547eda60a53ff5 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal Date: Thu, 4 Dec 2025 17:00:19 -0500 Subject: [PATCH 03/11] add tool use cua_like_agent --- .../agents/tool_use_agent/cua_like_agent.py | 786 ++++++++++++++++++ 1 file changed, 786 insertions(+) create mode 100644 src/agentlab/agents/tool_use_agent/cua_like_agent.py diff --git a/src/agentlab/agents/tool_use_agent/cua_like_agent.py b/src/agentlab/agents/tool_use_agent/cua_like_agent.py new file mode 100644 index 00000000..3f1925f9 --- 
/dev/null +++ b/src/agentlab/agents/tool_use_agent/cua_like_agent.py @@ -0,0 +1,786 @@ +import fnmatch +import json +import logging +import os +import random +import time +from abc import ABC, abstractmethod +from collections import defaultdict +from copy import copy, deepcopy +from dataclasses import asdict, dataclass, field +from pathlib import Path +from typing import Any, Literal + +import bgym +import numpy as np +import pandas as pd +import requests +from bgym import Benchmark as BgymBenchmark +from browsergym.core.observation import extract_screenshot +from browsergym.utils.obs import ( + flatten_axtree_to_str, + flatten_dom_to_str, + overlay_som, + prune_html, +) + +from agentlab.agents.agent_args import AgentArgs +from agentlab.benchmarks.abstract_env import AbstractBenchmark as AgentLabBenchmark +from agentlab.benchmarks.osworld import OSWorldActionSet +from agentlab.llm.base_api import BaseModelArgs +from agentlab.llm.chat_api import ChatModel +from agentlab.llm.litellm_api import LiteLLMModelArgs +from agentlab.llm.llm_utils import image_to_png_base64_url +from agentlab.llm.response_api import ( + APIPayload, + ClaudeResponseModelArgs, + LLMOutput, + MessageBuilder, + OpenAIChatModelArgs, + OpenAIResponseModelArgs, + OpenRouterModelArgs, + AzureChatModelArgs, + ToolCalls, +) +from agentlab.llm.tracking import cost_tracker_decorator +from agentlab.utils.hinting import HintsSource +from agentlab.agents import agent_utils + + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +ADDITIONAL_ACTION_INSTRUCTIONS = """ +**Important Rules:** +- Coordinates (x, y) must be NUMBERS, not strings +- Do NOT use named parameters for coordinates unless necessary for clarity +- Button parameter is optional, defaults to 'left' +- String values must be in quotes +- Call send_msg_to_user only with a single number in the answer when sending the final answer for evaluation. 
+ +**Correct Examples:** +- mouse_click(347, 192) +- mouse_click(56, 712.56, 'right') +- keyboard_type('hello@example.com') +- keyboard_type('System Diagnostics') +- keyboard_press('ControlOrMeta+v') +- keyboard_press('Escape') +- mouse_drag_and_drop(100, 200, 300, 400) + +**WRONG Examples (DO NOT DO THIS):** +- mouse_click(x='347, 192', y=192) ❌ x is a string with both coords +- mouse_click('347', '192') ❌ coordinates as strings +- "mouse_click(100, 200)" ❌ wrapped in quotes +- keyboard_press(Escape) ❌ string argument missing quotes +- keyboard_type(System Diagnostics) ❌ text argument missing quotes +""" + +simple_bgym_action_tool = { + "name": "perform_action", + "type": "function", + "description": f"""Return a string representation of a Python function call for browsergym actions. + You must return ONLY the function call string, exactly as it would appear in Python code.""", + "parameters": { + "type": "object", + "properties": { + "thought": { + "type": "string", + "description": "The agent's internal chain of thought for performing the action.", + }, + "action": { + "type": "string", + "description": "The Python function call string (e.g., 'mouse_click(100, 200)' or 'keyboard_type(\"hello\")')", + }, + }, + "required": ["thought", "action"], + }, +} + + +def action_from_generalized_bgym_action_tool( + response: LLMOutput, tool_name: str = "perform_action" +) -> tuple[str | None, str | None]: + + """Extract the action string from the tool call in the LLM response.""" + action, think = None, None + if response.tool_calls is not None: + for tc in response.tool_calls.tool_calls: + if tc.name == tool_name: + action = tc.arguments.get("action") + think = tc.arguments.get("thought") + break + return action, think + + +@dataclass +class Block(ABC): + def _init(self): + """Initialize the block.""" + pass + + def make(self) -> "Block": + """Returns a copy so the init can start adding some stuff to `self` without changing the + original datatclass that should only 
contain a config. + The aim is avoid having 2 calss definition for each block, e.g. Block and BlockArgs. + + Returns: + Block: A copy of the current block instance with initialization applied. + """ + block = self.__class__(**asdict(self)) + block._init() + return block + + @abstractmethod + def apply(self, llm, messages: list[MessageBuilder], **kwargs): + pass + + +@dataclass +class MsgGroup: + name: str = None + messages: list[MessageBuilder] = field(default_factory=list) + summary: MessageBuilder = None + + @property + def tool_summary(self) -> None: + return [msg for msg in self.messages if msg.role == "tool"] + + @property + def messages_without_images(self) -> list[MessageBuilder]: + _messages = deepcopy(self.messages) + for msg in _messages: + for content in msg.content: + if "image" in content: + content.pop("image") + content["text"] = "[Screenshot Placeholder]" + + return _messages + + +class StructuredDiscussion: + """ + A structured discussion that groups messages into named groups with a potential summary for each group. + + When the discussion is flattened, only the last `keep_last_n_obs` groups are kept in the final list, + the other groups are replaced by their summaries if they have one. 
+ """ + + def __init__(self, keep_last_n_obs=None): + self.groups: list[MsgGroup] = [] + self.keep_last_n_obs: int | None = keep_last_n_obs + + def append(self, message: MessageBuilder): + """Append a message to the last group.""" + self.groups[-1].messages.append(message) + + def new_group(self, name: str = None): + """Start a new group of messages.""" + if name is None: + name = f"group_{len(self.groups)}" + self.groups.append(MsgGroup(name)) + + def flatten(self) -> list[MessageBuilder]: + """Flatten the groups into a single list of messages.""" + + keep_last_n_obs = self.keep_last_n_obs or len(self.groups) + messages = [] + for i, group in enumerate(self.groups): + is_tail = i >= len(self.groups) - keep_last_n_obs + + if not is_tail: + if group.summary is not None: + messages.append(group.summary) + else: + messages.extend(group.messages_without_images) + + else: + messages.extend(group.messages) + # Mark all summarized messages for caching + if i == len(self.groups) - keep_last_n_obs: + for msg in messages: # unset previous cache breakpoints + msg._cache_breakpoint = False + # set new cache breakpoint + messages[i].mark_all_previous_msg_for_caching() + return messages + + def set_last_summary(self, summary: MessageBuilder): + # append None to summaries until we reach the current group index + self.groups[-1].summary = summary + + def get_last_summary(self) -> MessageBuilder | None: + """Get the last summary message.""" + if len(self.groups) == 0: + return None + return self.groups[-1].summary + + def is_goal_set(self) -> bool: + """Check if the goal is set in the first group.""" + return len(self.groups) > 0 + + +SYS_MSG = """You are a web agent. Based on the observation, you will decide which action to take to accomplish your goal. +You strive for excellence and need to be as meticulous as possible. Make sure to explore when not sure. 
+""" + + +@dataclass +class Goal(Block): + """Block to add the goal to the messages.""" + + goal_as_system_msg: bool = True + + def apply( + self, llm, discussion: StructuredDiscussion, obs: dict, sys_msg: str = SYS_MSG + ) -> dict: + system_message = llm.msg.system().add_text(sys_msg) + discussion.append(system_message) + + if self.goal_as_system_msg: + goal_message = llm.msg.system() + else: + goal_message = llm.msg.user() + + goal_message.add_text("# Goal:\n") + for content in obs["goal_object"]: + if content["type"] == "text": + goal_message.add_text(content["text"]) + elif content["type"] == "image_url": + goal_message.add_image(content["image_url"]) + discussion.append(goal_message) + + +AXTREE_NOTE = """ +AXTree extracts most of the interactive elements of the DOM in a tree structure. It may also contain information that is not visible in the screenshot. +A line starting with [bid] is a node in the AXTree. It is a unique alpha-numeric identifier to be used when calling tools, e.g, click(bid="a253"). Make sure to include letters and numbers in the bid. +""" + + +@dataclass +class Obs(Block): + """Block to add the observation to the messages.""" + + use_last_error: bool = True + use_screenshot: bool = True + use_axtree: bool = False + use_dom: bool = False + use_som: bool = False + use_tabs: bool = False + overlay_mouse_action: bool = False + use_zoomed_webpage: bool = False + skip_preprocessing: bool = False + + def _init(self): + self._last_observation = None + + def apply( + self, llm, discussion: StructuredDiscussion, obs: dict, last_llm_output: LLMOutput + ) -> dict: + obs_msg = llm.msg.user() + tool_calls = last_llm_output.tool_calls + # add the tool call response first in the observation + # to maintain continuity with last response. 
+ if tool_calls: + for call in tool_calls: + call.response_text("See Observation") + tool_response = llm.msg.add_responded_tool_calls(tool_calls) + discussion.append(tool_response) + + if self.use_last_error: + if obs["last_action_error"] != "": + obs_msg.add_text(f"Last action error:\n{obs['last_action_error']}") + + if self.use_screenshot: + if self.use_som: + screenshot = obs["screenshot_som"] + else: + screenshot = obs["screenshot"] + + if self.overlay_mouse_action and self._last_observation is not None: + self.overlay_last_screenshot_with_action( + discussion, obs["last_action"], self._last_observation + ) + + + obs_msg.add_image(image_to_png_base64_url(screenshot)) + if self.use_axtree: + obs_msg.add_text(f"AXTree:\n{AXTREE_NOTE}\n{obs['axtree_txt']}") + if self.use_dom: + obs_msg.add_text(f"DOM:\n{obs['pruned_html']}") + if self.use_tabs: + obs_msg.add_text(_format_tabs(obs)) + + discussion.append(obs_msg) + self._last_observation = deepcopy(obs) + return obs_msg + + @staticmethod + def overlay_last_screenshot_with_action(discussion: StructuredDiscussion, action, obs): + """Update the last image with new_image_base64 overlayed with the action.""" + import base64 + from agentlab.analyze import overlay_utils + from PIL import Image + from io import BytesIO + + for msg_groups in reversed(discussion.groups): + for msg in reversed(msg_groups.messages): + for content in reversed(msg.content): + if "image" in content: + data_url = content["image"] + header, encoded = data_url.split(",", 1) + new_obs_properties = deepcopy(obs["extra_element_properties"]) + sc = Image.open(BytesIO(base64.b64decode(encoded))) + overlay_utils.annotate_action(sc, action, properties=new_obs_properties) + new_base64_image = image_to_png_base64_url(sc) + content["image"] = new_base64_image + return + + +def _format_tabs(obs): + """Format the open tabs in a llm-readable way.""" + prompt_pieces = ["Currently open tabs:"] + for page_index, (page_url, page_title) in enumerate( + 
zip(obs["open_pages_urls"], obs["open_pages_titles"]) + ): + active_or_not = " (active tab)" if page_index == obs["active_page_index"] else "" + prompt_piece = f"""\ +Tab {page_index}{active_or_not}: + Title: {page_title} + URL: {page_url} +""" + prompt_pieces.append(prompt_piece) + return "\n".join(prompt_pieces) + + +@dataclass +class GeneralHints(Block): + use_hints: bool = True + + def apply(self, llm, discussion: StructuredDiscussion) -> dict: + if not self.use_hints: + return + + hints = [] + + hints.append( + """Use ControlOrMeta instead of Control and Meta for keyboard shortcuts, to be cross-platform compatible. E.g. use ControlOrMeta for mutliple selection in lists.\n""" + ) + # simulated a hint. + # hints.append( + # """Remember to submit the form once all the fields are filled out.\n""" + # ) + + discussion.append(llm.msg.user().add_text("\n".join(hints))) + + +@dataclass +class Summarizer(Block): + """Block to summarize the last action and the current state of the environment.""" + + do_summary: bool = False + high_details: bool = True + + def apply(self, llm, discussion: StructuredDiscussion) -> dict: + if not self.do_summary: + + return + + msg = llm.msg.user().add_text("""Summarize\n""") + + discussion.append(msg) + + summary_response = llm(APIPayload(messages=discussion.flatten())) + + summary_msg = llm.msg.assistant().add_text(summary_response.think) + discussion.append(summary_msg) + discussion.set_last_summary(summary_msg) + return summary_msg + + def apply_init(self, llm, discussion: StructuredDiscussion) -> dict: + """Initialize the summarizer block.""" + if not self.do_summary: + return + + system_msg = llm.msg.system() + if self.high_details: + # Add a system message to the LLM to indicate that it should summarize + system_msg.add_text( + """# Summarizer instructions:\nWhen asked to summarize, do the following: +1) Summarize the effect of the last action, with attention to details. 
+2) Give a semantic description of the current state of the environment, with attention to details. If there was a repeating mistake, mention the cause of it. +3) Reason about the overall task at a high level. +4) What hint can be relevant for the next action? Only chose from the hints provided in the task description. Or select none. +5) Reason about the next action to take, based on the current state and the goal. +""" + ) + else: + system_msg.add_text( + """When asked to summarize, give a semantic description of the current state of the environment.""" + ) + discussion.append(system_msg) + + +@dataclass +class TaskHint(Block): + use_task_hint: bool = True + hint_db_rel_path: str = "hint_db.csv" + hint_retrieval_mode: Literal["direct", "llm", "emb"] = "direct" + top_n: int = 4 # Number of top hints to return when using embedding retrieval + embedder_model: str = "Qwen/Qwen3-Embedding-0.6B" # Model for embedding hints + embedder_server: str = "http://localhost:5000" + skip_hints_for_current_task: bool = False + skip_hints_for_current_goal: bool = False + + def _init(self): + """Initialize the block.""" + if self.use_task_hint: + self.hints_source = HintsSource( + hint_db_path=self.hint_db_rel_path, + hint_retrieval_mode=self.hint_retrieval_mode, + top_n=self.top_n, + embedder_model=self.embedder_model, + embedder_server=self.embedder_server, + skip_hints_for_current_task=self.skip_hints_for_current_task, + skip_hints_for_current_goal=self.skip_hints_for_current_goal, + ) + + def apply(self, llm, discussion: StructuredDiscussion, obs: dict, task_name: str) -> dict: + if not self.use_task_hint: + return {} + + try: + goal_text = obs["goal_object"][0]["text"] + except (KeyError, IndexError): + Warning("Goal text not found in observation") + goal_text = "" + task_hints = self.hints_source.choose_hints(llm, task_name, goal_text) + + hints = [] + for hint in task_hints: + hint = hint.strip() + if hint: + hints.append(f"- {hint}") + + if len(hints) > 0: + hints_str = ( + 
"\n# Hints:\nHere are some hints for the task you are working on:\n" + + "\n".join(hints) + ) + msg = llm.msg.user().add_text(hints_str) + + discussion.append(msg) + + +@dataclass +class PromptConfig: + tag_screenshot: bool = True # Whether to tag the screenshot with the last action. + goal: Goal = None + obs: Obs = None + summarizer: Summarizer = None + general_hints: GeneralHints = None + task_hint: TaskHint = None + keep_last_n_obs: int = 1 + multiaction: bool = False + action_subsets: tuple[str] = None + use_noop_as_default_action: bool = False + use_generalized_bgym_action_tool: bool = True + + +@dataclass +class ToolUseAgentArgs(AgentArgs): + model_args: BaseModelArgs = None + config: PromptConfig = None + use_raw_page_output: bool = False # This attribute is used in loop.py to setup the env. + action_set: bgym.AbstractActionSet | None = None + + def __post_init__(self): + try: + self.agent_name = f"CUAv2-{self.model_args.model_name}".replace("/", "_") + if self.config.task_hint.use_task_hint: + if self.config.task_hint.hint_retrieval_mode == "direct": + self.agent_name += f"-direct-hint" + if self.config.task_hint.hint_retrieval_mode == "emb": + self.agent_name += f"-emb-hint" + if self.config.task_hint.hint_retrieval_mode == "llm": + self.agent_name += f"-llm-hint" + + except AttributeError: + pass + + def make_agent(self) -> bgym.Agent: + if self.config is None: + self.config = DEFAULT_PROMPT_CONFIG + return ToolUseAgent( + model_args=self.model_args, # type: ignore + config=self.config, + action_set=self.action_set, + ) + + def prepare(self): + return self.model_args.prepare_server() + + def close(self): + return self.model_args.close_server() + + def set_benchmark(self, benchmark: AgentLabBenchmark | BgymBenchmark, demo_mode: bool): + """Set benchmark specific flags.""" + benchmark_name = benchmark.name + if benchmark_name == "osworld": + self.config.obs.skip_preprocessing = True + + self.config.obs.use_tabs = benchmark.is_multi_tab + 
benchmark_action_set = ( + deepcopy(benchmark.high_level_action_set_args).make_action_set().action_set + ) + # these actions are added based on the benchmark action set + if "send_msg_to_user" in benchmark_action_set: + self.config.action_subsets += ("chat",) + if "report_infeasible" in benchmark_action_set: + self.config.action_subsets += ("infeas",) + + +class ToolUseAgent(bgym.Agent): + def __init__( + self, + model_args: LiteLLMModelArgs, + config: PromptConfig = None, + action_set: bgym.AbstractActionSet | None = None, + ): + self.model_args = model_args + self.config = config + self.action_set: bgym.AbstractActionSet = action_set or bgym.HighLevelActionSet( + self.config.action_subsets, + multiaction=self.config.multiaction, # type: ignore + ) + if self.config.use_generalized_bgym_action_tool: + self.tools = [simple_bgym_action_tool] + else: + self.tools = self.action_set.to_tool_description(api=model_args.api) + + self.call_ids = [] + + self.llm = model_args.make_model() + self.msg_builder = model_args.get_message_builder() + self.llm.msg = self.msg_builder + + self.task_hint = self.config.task_hint.make() + self.obs_block = self.config.obs.make() + + self.discussion = StructuredDiscussion(self.config.keep_last_n_obs) + self.last_response: LLMOutput = LLMOutput() + self._responses: list[LLMOutput] = [] + + def obs_preprocessor(self, obs): + obs = copy(obs) + if self.config.obs.skip_preprocessing: + return obs + page = obs.pop("page", None) + if page is not None: + obs["screenshot"] = extract_screenshot(page) + else: + if self.config.obs.use_dom: + obs["dom_txt"] = flatten_dom_to_str( + obs["dom_object"], + extra_properties=obs["extra_element_properties"], + ) + obs["pruned_html"] = prune_html(obs["dom_txt"]) + + if self.config.obs.use_axtree: + obs["axtree_txt"] = flatten_axtree_to_str( + obs["axtree_object"], + extra_properties=obs["extra_element_properties"], + ) + + if self.config.obs.use_som: + obs["screenshot_som"] = overlay_som( + obs["screenshot"], 
extra_properties=obs["extra_element_properties"] + ) + if self.config.obs.use_zoomed_webpage: + pass + + return obs + + def set_task_name(self, task_name: str): + """Cheater function that is supposed to be called by loop.py before callling get_action""" + self.task_name = task_name + + @cost_tracker_decorator + def get_action(self, obs: Any) -> float: + self.llm.reset_stats() + if not self.discussion.is_goal_set(): + self.discussion.new_group("goal") + + if self.config.multiaction: + sys_msg = SYS_MSG + "\nYou can take multiple actions in a single step, if needed." + else: + sys_msg = SYS_MSG + "\nYou can only take one action at a time." + + sys_msg += ( + "\nAvailable browsergym actions that can be returned with get_action:\n" + + self.action_set.describe() + ) + sys_msg += ADDITIONAL_ACTION_INSTRUCTIONS + self.config.goal.apply(self.llm, self.discussion, obs, sys_msg) + + self.config.summarizer.apply_init(self.llm, self.discussion) + self.config.general_hints.apply(self.llm, self.discussion) + self.task_hint.apply(self.llm, self.discussion, obs, self.task_name) + + self.discussion.new_group() + + self.obs_block.apply(self.llm, self.discussion, obs, last_llm_output=self.last_response) + + self.config.summarizer.apply(self.llm, self.discussion) + + messages = self.discussion.flatten() + response: LLMOutput = self.llm( + APIPayload( + messages=messages, + tools=self.tools, + tool_choice="any", + cache_tool_definition=True, + cache_complete_prompt=False, + use_cache_breakpoints=True, + ) + ) + + if self.config.use_generalized_bgym_action_tool: + action, think = action_from_generalized_bgym_action_tool(response) + else: + action = response.action + think = response.think + + if action is None and self.config.use_noop_as_default_action: + action = "noop()" + + last_summary = self.discussion.get_last_summary() + if last_summary is not None: + think = last_summary.content[0]["text"] + "\n" + think + else: + # Add the think to the history when use_summarizer is False + if 
think is not None: + self.discussion.append(self.llm.msg.assistant().add_text(think)) + + self.discussion.new_group() + + self.last_response = response + self._responses.append(response) # may be useful for debugging + + tools_str = json.dumps(self.tools, indent=2) + tools_msg = MessageBuilder("tool_description").add_text(tools_str) + + # Adding these extra messages to visualize in gradio + messages.insert(0, tools_msg) # insert at the beginning of the message + # This avoids the assertion error with self.llm.user().add_responded_tool_calls(tool_calls) + msg = self.llm.msg("tool") + msg.responded_tool_calls = response.tool_calls + messages.append(msg) + + agent_info = bgym.AgentInfo( + think=think, + chat_messages=messages, + stats=self.llm.stats.stats_dict, + ) + return action, agent_info + + +CUA_PROMPT_CONFIG = PromptConfig( + tag_screenshot=True, + goal=Goal(goal_as_system_msg=True), + obs=Obs( + use_last_error=True, + use_screenshot=True, + use_axtree=False, + use_dom=False, + use_som=False, + use_tabs=False, + overlay_mouse_action=True, + ), + summarizer=Summarizer(do_summary=False), + general_hints=GeneralHints(use_hints=False), + task_hint=TaskHint(use_task_hint=False), + action_subsets=("coord",), + keep_last_n_obs=5, # max 20 no more than 20 screenshots for claude + multiaction=True, + use_noop_as_default_action = False, + use_generalized_bgym_action_tool = True +) + + +def get_cua_like_agent_config_with_hint( + model_name: str, + hint_db_path: str, + hint_retrieval_mode: Literal["direct", "llm", "emb"] = "direct", +) -> ToolUseAgentArgs: + config = deepcopy(CUA_PROMPT_CONFIG) + config.task_hint.use_task_hint = True + config.task_hint.hint_db_rel_path = hint_db_path + config.task_hint.hint_retrieval_mode = hint_retrieval_mode + return ToolUseAgentArgs( + model_args=LiteLLMModelArgs( + model_name=model_name, + max_new_tokens=2000, + temperature=None, # NONE for claude-4-5 to enable reasoning effort. 
+ ), + config=config, + ) + + +def get_cua_like_agent_config_with_hint_skip_for_current_goal( + model_name: str, + hint_db_path: str, + hint_retrieval_mode: Literal["llm", "emb"] = "llm", +) -> ToolUseAgentArgs: + config = deepcopy(CUA_PROMPT_CONFIG) + config.task_hint.use_task_hint = True + config.task_hint.skip_hints_for_current_goal = True + config.task_hint.hint_db_rel_path = hint_db_path + config.task_hint.hint_retrieval_mode = hint_retrieval_mode + return ToolUseAgentArgs( + model_args=LiteLLMModelArgs( + model_name=model_name, + max_new_tokens=2000, + temperature=None, # NONE for claude-4-5 to enable reasoning effort. + ), + config=config, + ) + + +def get_cua_like_agent_config(model_name: str) -> ToolUseAgentArgs: + + return ToolUseAgentArgs( + model_args=LiteLLMModelArgs( + model_name=model_name, + max_new_tokens=2000, + temperature=None, + ), + config=CUA_PROMPT_CONFIG, + ) + + +CUA_LIKE_CLAUDE_4_SONNET = get_cua_like_agent_config("anthropic/claude-sonnet-4-20250514") + + +if __name__ == "__main__": + + from agentlab.agents.tool_use_agent.cua_like_agent import CUA_LIKE_CLAUDE_4_SONNET + from agentlab.experiments.study import Study + import bgym + import logging + + logging.getLogger().setLevel(logging.INFO) + os.environ["LITELLM_LOG"] = "WARNING" + + benchmark = "workarena_l1" + benchmark = bgym.DEFAULT_BENCHMARKS[benchmark](n_repeats=2) + benchmark = benchmark.subset_from_glob("task_name", "*create*") + for env_args in benchmark.env_args_list: + env_args.max_steps = 20 # increase the number of steps for coord agent testing + + agent_args = [CUA_LIKE_CLAUDE_4_SONNET] + study = Study(agent_args, benchmark, logging_level_stdout=logging.WARNING) + study.run( + n_jobs=5, + parallel_backend="ray", + strict_reproducibility=False, + n_relaunch=1, + ) From 9861ca501ee63363c43faab22c2387ec06023e59 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal Date: Thu, 4 Dec 2025 17:05:54 -0500 Subject: [PATCH 04/11] remove unused imports --- 
.../agents/tool_use_agent/cua_like_agent.py | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/src/agentlab/agents/tool_use_agent/cua_like_agent.py b/src/agentlab/agents/tool_use_agent/cua_like_agent.py index 3f1925f9..fb230f08 100644 --- a/src/agentlab/agents/tool_use_agent/cua_like_agent.py +++ b/src/agentlab/agents/tool_use_agent/cua_like_agent.py @@ -1,20 +1,12 @@ -import fnmatch import json import logging import os -import random -import time from abc import ABC, abstractmethod -from collections import defaultdict from copy import copy, deepcopy from dataclasses import asdict, dataclass, field -from pathlib import Path from typing import Any, Literal import bgym -import numpy as np -import pandas as pd -import requests from bgym import Benchmark as BgymBenchmark from browsergym.core.observation import extract_screenshot from browsergym.utils.obs import ( @@ -26,25 +18,17 @@ from agentlab.agents.agent_args import AgentArgs from agentlab.benchmarks.abstract_env import AbstractBenchmark as AgentLabBenchmark -from agentlab.benchmarks.osworld import OSWorldActionSet from agentlab.llm.base_api import BaseModelArgs -from agentlab.llm.chat_api import ChatModel from agentlab.llm.litellm_api import LiteLLMModelArgs from agentlab.llm.llm_utils import image_to_png_base64_url from agentlab.llm.response_api import ( APIPayload, - ClaudeResponseModelArgs, LLMOutput, MessageBuilder, - OpenAIChatModelArgs, - OpenAIResponseModelArgs, - OpenRouterModelArgs, - AzureChatModelArgs, - ToolCalls, ) from agentlab.llm.tracking import cost_tracker_decorator from agentlab.utils.hinting import HintsSource -from agentlab.agents import agent_utils + logger = logging.getLogger(__name__) From 307446e4b2f859e8b371668455a3f9a12f1793e8 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal Date: Thu, 4 Dec 2025 18:14:40 -0500 Subject: [PATCH 05/11] black --- src/agentlab/agents/tool_use_agent/cua_like_agent.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) 
diff --git a/src/agentlab/agents/tool_use_agent/cua_like_agent.py b/src/agentlab/agents/tool_use_agent/cua_like_agent.py index fb230f08..3930deb7 100644 --- a/src/agentlab/agents/tool_use_agent/cua_like_agent.py +++ b/src/agentlab/agents/tool_use_agent/cua_like_agent.py @@ -30,7 +30,6 @@ from agentlab.utils.hinting import HintsSource - logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -84,7 +83,6 @@ def action_from_generalized_bgym_action_tool( response: LLMOutput, tool_name: str = "perform_action" ) -> tuple[str | None, str | None]: - """Extract the action string from the tool call in the LLM response.""" action, think = None, None if response.tool_calls is not None: @@ -284,7 +282,6 @@ def apply( discussion, obs["last_action"], self._last_observation ) - obs_msg.add_image(image_to_png_base64_url(screenshot)) if self.use_axtree: obs_msg.add_text(f"AXTree:\n{AXTREE_NOTE}\n{obs['axtree_txt']}") @@ -685,8 +682,8 @@ def get_action(self, obs: Any) -> float: action_subsets=("coord",), keep_last_n_obs=5, # max 20 no more than 20 screenshots for claude multiaction=True, - use_noop_as_default_action = False, - use_generalized_bgym_action_tool = True + use_noop_as_default_action=False, + use_generalized_bgym_action_tool=True, ) From fa8b526ce4951ba7be0b01b2fdfb21b6ff37b12f Mon Sep 17 00:00:00 2001 From: Aman Jaiswal Date: Fri, 5 Dec 2025 13:57:08 -0500 Subject: [PATCH 06/11] make extra-dependency user facing and update dev dependency group --- pyproject.toml | 26 +++++++++++++------------- uv.lock | 44 +++++++++++++++++++++----------------------- 2 files changed, 34 insertions(+), 36 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 64285668..98fde2d7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,24 +49,20 @@ dependencies = [ "pillow", "gymnasium>=0.27", "torch>=2.2.2", - "safetensors>=0.4.0", - "transformers>=4.38.2", "anthropic>=0.62.0", "litellm>=1.75.3", "python-dotenv>=1.1.1", ] [project.optional-dependencies] -dev = [ - 
"black[jupyter]>=24.2.0", - "blacken-docs", - "pre-commit", - "pytest==7.3.2", - "flaky", - "pytest-xdist", - "pytest-playwright", +hint = [ + "sentence-transformers>=5.0.0", +] +transformers = [ + "transformers>=4.38.2", ] + [project.urls] Homepage = "https://github.com/ServiceNow/AgentLab" @@ -107,9 +103,13 @@ dev = [ "darglint>=1.8.1", "ipykernel>=6.30.1", "pip>=25.2", -] -hint = [ - "sentence-transformers>=5.0.0", + "black[jupyter]>=24.2.0", + "blacken-docs", + "pre-commit", + "pytest==7.3.2", + "flaky", + "pytest-xdist", + "pytest-playwright", ] diff --git a/uv.lock b/uv.lock index eaf9483c..fbfd4be8 100644 --- a/uv.lock +++ b/uv.lock @@ -32,43 +32,39 @@ dependencies = [ { name = "pyyaml" }, { name = "ray", extra = ["default"] }, { name = "requests" }, - { name = "safetensors" }, { name = "tiktoken" }, { name = "torch" }, - { name = "transformers" }, ] [package.optional-dependencies] +hint = [ + { name = "sentence-transformers" }, +] +transformers = [ + { name = "transformers" }, +] + +[package.dev-dependencies] dev = [ { name = "black", extra = ["jupyter"] }, { name = "blacken-docs" }, + { name = "darglint" }, { name = "flaky" }, + { name = "ipykernel" }, + { name = "pip" }, { name = "pre-commit" }, { name = "pytest" }, { name = "pytest-playwright" }, { name = "pytest-xdist" }, ] -[package.dev-dependencies] -dev = [ - { name = "darglint" }, - { name = "ipykernel" }, - { name = "pip" }, -] -hint = [ - { name = "sentence-transformers" }, -] - [package.metadata] requires-dist = [ { name = "anthropic", specifier = ">=0.62.0" }, - { name = "black", extras = ["jupyter"], marker = "extra == 'dev'", specifier = ">=24.2.0" }, - { name = "blacken-docs", marker = "extra == 'dev'" }, { name = "browsergym", specifier = ">=0.7.1" }, { name = "contexttimer" }, { name = "dask" }, { name = "distributed" }, - { name = "flaky", marker = "extra == 'dev'" }, { name = "gitpython" }, { name = "gradio", specifier = ">=5.5" }, { name = "gymnasium", specifier = ">=0.27" }, @@ -80,30 
+76,32 @@ requires-dist = [ { name = "openai", specifier = ">=1.7,<2" }, { name = "pandas" }, { name = "pillow" }, - { name = "pre-commit", marker = "extra == 'dev'" }, { name = "pydantic", specifier = "~=2.9" }, - { name = "pytest", marker = "extra == 'dev'", specifier = "==7.3.2" }, - { name = "pytest-playwright", marker = "extra == 'dev'" }, - { name = "pytest-xdist", marker = "extra == 'dev'" }, { name = "python-dotenv", specifier = ">=1.1.1" }, { name = "python-slugify" }, { name = "pyyaml", specifier = ">=6" }, { name = "ray", extras = ["default"] }, { name = "requests" }, - { name = "safetensors", specifier = ">=0.4.0" }, + { name = "sentence-transformers", marker = "extra == 'hint'", specifier = ">=5.0.0" }, { name = "tiktoken" }, { name = "torch", specifier = ">=2.2.2" }, - { name = "transformers", specifier = ">=4.38.2" }, + { name = "transformers", marker = "extra == 'transformers'", specifier = ">=4.38.2" }, ] -provides-extras = ["dev"] +provides-extras = ["hint", "transformers"] [package.metadata.requires-dev] dev = [ + { name = "black", extras = ["jupyter"], specifier = ">=24.2.0" }, + { name = "blacken-docs" }, { name = "darglint", specifier = ">=1.8.1" }, + { name = "flaky" }, { name = "ipykernel", specifier = ">=6.30.1" }, { name = "pip", specifier = ">=25.2" }, + { name = "pre-commit" }, + { name = "pytest", specifier = "==7.3.2" }, + { name = "pytest-playwright" }, + { name = "pytest-xdist" }, ] -hint = [{ name = "sentence-transformers", specifier = ">=5.0.0" }] [[package]] name = "aiofiles" From b5b2af3cf5d4c32805ff62cc90925ea50d3c3477 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal Date: Fri, 5 Dec 2025 14:07:32 -0500 Subject: [PATCH 07/11] update CI/CD env installation (dev is default group) --- .github/workflows/code_format.yml | 2 +- .github/workflows/darglint.yml | 2 +- .github/workflows/unit_tests.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/code_format.yml b/.github/workflows/code_format.yml index 
f5551244..1985c4e2 100644 --- a/.github/workflows/code_format.yml +++ b/.github/workflows/code_format.yml @@ -27,7 +27,7 @@ jobs: run: uv python install 3.11 - name: Install dependencies - run: uv sync --frozen --extra dev + run: uv sync --frozen - name: List packages run: uv pip list diff --git a/.github/workflows/darglint.yml b/.github/workflows/darglint.yml index 7fca9321..414a9781 100644 --- a/.github/workflows/darglint.yml +++ b/.github/workflows/darglint.yml @@ -27,7 +27,7 @@ jobs: run: uv python install 3.12 # this fails in 3.11 - name: Install dependencies - run: uv sync --frozen --extra dev + run: uv sync --frozen - name: List packages run: uv pip list diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 7ce65722..d3194d93 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -32,7 +32,7 @@ jobs: run: uv python install 3.11 - name: Install AgentLab - run: uv sync --frozen --extra dev + run: uv sync --frozen - name: List packages run: uv pip list From a41baaf7f444b2ea7e626f97c9a25af1ba474d64 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal Date: Fri, 5 Dec 2025 14:48:34 -0500 Subject: [PATCH 08/11] update makefile to use uv --- Makefile | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 23799f32..1c33ce09 100644 --- a/Makefile +++ b/Makefile @@ -1,14 +1,14 @@ .PHONY: test setup miniwob lint stop-miniwob osworld setup: - @pip install -e . - @playwright install chromium --with-deps - @python -c 'import nltk; nltk.download("punkt_tab")' + @uv sync --python 3.12 + @uv run playwright install chromium --with-deps + @uv run python -c 'import nltk; nltk.download("punkt_tab")' miniwob: stop-miniwob @git clone https://github.com/Farama-Foundation/miniwob-plusplus.git || true @cd miniwob-plusplus && git checkout 7fd85d71a4b60325c6585396ec4f48377d049838 - @python -m http.server 8080 --directory miniwob-plusplus/miniwob/html & echo $$! 
> .miniwob-server.pid + @uv run python -m http.server 8080 --directory miniwob-plusplus/miniwob/html & echo $$! > .miniwob-server.pid @sleep 3 @echo "MiniWob server started on http://localhost:8080" @@ -22,14 +22,14 @@ stop-miniwob: @echo "MiniWob server stopped" run-tests: - @MINIWOB_URL="http://localhost:8080/miniwob/" pytest -n 5 --durations=10 -m 'not pricy' tests/ + @MINIWOB_URL="http://localhost:8080/miniwob/" uv run pytest -n 5 --durations=10 -m 'not pricy' tests/ @echo "Tests completed" test: setup miniwob check-miniwob run-tests stop-miniwob lint: setup - @black src/ --check --diff - @darglint -v 2 -z short src/ + @uv run black src/ --check --diff + @uv run darglint -v 2 -z short src/ osworld: @echo "Setting up OSWorld..." @@ -42,9 +42,9 @@ osworld: sed -i.bak 's/tqdm~=.*/tqdm/' requirements.txt && \ sed -i.bak 's/pandas~=.*/pandas/' requirements.txt @echo "Installing OSWorld requirements..." - @cd OSWorld && pip install -r requirements.txt + @cd OSWorld && uv pip install -r requirements.txt @echo "Installing OSWorld in development mode..." - @cd OSWorld && pip install -e . + @cd OSWorld && uv pip install -e . @echo "OSWorld setup completed!" @echo "Next steps:" @echo "1. Configure your VM (VMware/VirtualBox) according to OSWorld documentation" From 241f2cd33040d4bb7dfd7d33c65e92c4a13aef09 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal Date: Fri, 5 Dec 2025 15:06:22 -0500 Subject: [PATCH 09/11] black and remove unnecessary items pip list from code formatting CI/CD. 
--- .github/workflows/code_format.yml | 5 +---- .github/workflows/darglint.yml | 3 --- src/agentlab/agents/tool_use_agent/cua_like_agent.py | 6 +++--- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/.github/workflows/code_format.yml b/.github/workflows/code_format.yml index 1985c4e2..7c052c4e 100644 --- a/.github/workflows/code_format.yml +++ b/.github/workflows/code_format.yml @@ -24,13 +24,10 @@ jobs: enable-cache: true - name: Set up Python - run: uv python install 3.11 + run: uv python install 3.12 - name: Install dependencies run: uv sync --frozen - - name: List packages - run: uv pip list - - name: Code Formatting run: uv run black src/ --check --diff diff --git a/.github/workflows/darglint.yml b/.github/workflows/darglint.yml index 414a9781..76c947ae 100644 --- a/.github/workflows/darglint.yml +++ b/.github/workflows/darglint.yml @@ -29,8 +29,5 @@ jobs: - name: Install dependencies run: uv sync --frozen - - name: List packages - run: uv pip list - - name: Darglint checks run: uv run darglint -v 2 -z short src/ diff --git a/src/agentlab/agents/tool_use_agent/cua_like_agent.py b/src/agentlab/agents/tool_use_agent/cua_like_agent.py index 3930deb7..2b485a49 100644 --- a/src/agentlab/agents/tool_use_agent/cua_like_agent.py +++ b/src/agentlab/agents/tool_use_agent/cua_like_agent.py @@ -281,7 +281,7 @@ def apply( self.overlay_last_screenshot_with_action( discussion, obs["last_action"], self._last_observation ) - + obs_msg.add_image(image_to_png_base64_url(screenshot)) if self.use_axtree: obs_msg.add_text(f"AXTree:\n{AXTREE_NOTE}\n{obs['axtree_txt']}") @@ -617,7 +617,7 @@ def get_action(self, obs: Any) -> float: APIPayload( messages=messages, tools=self.tools, - tool_choice="any", + tool_choice="any", cache_tool_definition=True, cache_complete_prompt=False, use_cache_breakpoints=True, @@ -631,7 +631,7 @@ def get_action(self, obs: Any) -> float: think = response.think if action is None and self.config.use_noop_as_default_action: - action = "noop()" + 
action = "noop()" last_summary = self.discussion.get_last_summary() if last_summary is not None: From 11ffb9b7e10531ef8abbd58541366bf49a6f14e0 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Fri, 5 Dec 2025 15:42:54 -0500 Subject: [PATCH 10/11] fix typos Co-authored-by: Patrice Bechard --- src/agentlab/agents/tool_use_agent/cua_like_agent.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/agentlab/agents/tool_use_agent/cua_like_agent.py b/src/agentlab/agents/tool_use_agent/cua_like_agent.py index 2b485a49..bac76724 100644 --- a/src/agentlab/agents/tool_use_agent/cua_like_agent.py +++ b/src/agentlab/agents/tool_use_agent/cua_like_agent.py @@ -102,8 +102,8 @@ def _init(self): def make(self) -> "Block": """Returns a copy so the init can start adding some stuff to `self` without changing the - original datatclass that should only contain a config. - The aim is avoid having 2 calss definition for each block, e.g. Block and BlockArgs. + original dataclass that should only contain a config. + The aim is avoid having 2 class definition for each block, e.g. Block and BlockArgs. Returns: Block: A copy of the current block instance with initialization applied. 
From 51126ae59a73e53b62028aaa460f6e46a3daee28 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Fri, 5 Dec 2025 15:43:30 -0500 Subject: [PATCH 11/11] fix typos 2 Co-authored-by: Patrice Bechard --- src/agentlab/agents/tool_use_agent/cua_like_agent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agentlab/agents/tool_use_agent/cua_like_agent.py b/src/agentlab/agents/tool_use_agent/cua_like_agent.py index bac76724..8a6259a2 100644 --- a/src/agentlab/agents/tool_use_agent/cua_like_agent.py +++ b/src/agentlab/agents/tool_use_agent/cua_like_agent.py @@ -680,7 +680,7 @@ def get_action(self, obs: Any) -> float: general_hints=GeneralHints(use_hints=False), task_hint=TaskHint(use_task_hint=False), action_subsets=("coord",), - keep_last_n_obs=5, # max 20 no more than 20 screenshots for claude + keep_last_n_obs=5, # no more than 20 screenshots for claude multiaction=True, use_noop_as_default_action=False, use_generalized_bgym_action_tool=True,