Skip to content

Commit 519abed

Browse files
authored
WebArena-Verified (#319)
* lazy import for webarena-verified
* use litellm pricing for Azure and Anthropic ChatModels
* update README
1 parent 5dfec6d commit 519abed

File tree

3 files changed

+47
-7
lines changed

3 files changed

+47
-7
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ AgentLab Features:
5252
| Benchmark | Setup <br> Link | # Task <br> Template| Seed <br> Diversity | Max <br> Step | Multi-tab | Hosted Method | BrowserGym <br> Leaderboard |
5353
|-----------|------------|---------|----------------|-----------|-----------|---------------|----------------------|
5454
| [WebArena](https://webarena.dev/) | [setup](https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/webarena/README.md) | 812 | None | 30 | yes | self hosted (docker) | soon |
55+
| [WebArena-Verified](https://github.com/ServiceNow/webarena-verified) | [setup](https://github.com/ServiceNow/BrowserGym/blob/wa_verified/browsergym/webarena_verified/README.md) | 812 | None | 30 | yes | self hosted | soon |
5556
| [WorkArena](https://github.com/ServiceNow/WorkArena) L1 | [setup](https://github.com/ServiceNow/WorkArena?tab=readme-ov-file#getting-started) | 33 | High | 30 | no | demo instance | soon |
5657
| [WorkArena](https://github.com/ServiceNow/WorkArena) L2 | [setup](https://github.com/ServiceNow/WorkArena?tab=readme-ov-file#getting-started) | 341 | High | 50 | no | demo instance | soon |
5758
| [WorkArena](https://github.com/ServiceNow/WorkArena) L3 | [setup](https://github.com/ServiceNow/WorkArena?tab=readme-ov-file#getting-started) | 341 | High | 50 | no | demo instance | soon |

src/agentlab/experiments/loop.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -915,6 +915,11 @@ def _get_env_name(task_name: str):
915915
elif task_name.startswith("webarena"):
916916
import browsergym.webarena
917917
import browsergym.webarenalite
918+
919+
try:
920+
import browsergym.webarena_verified
921+
except ImportError:
922+
logger.warning("browsergym.webarena_verified not found. Skipping import.")
918923
elif task_name.startswith("visualwebarena"):
919924
import browsergym.visualwebarena
920925
elif task_name.startswith("assistantbench"):

src/agentlab/llm/chat_api.py

Lines changed: 41 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -433,7 +433,7 @@ def __init__(
433433
min_retry_wait_time=min_retry_wait_time,
434434
client_class=OpenAI,
435435
client_args=client_args,
436-
pricing_func=tracking.get_pricing_openai,
436+
pricing_func=partial(tracking.get_pricing_litellm, model_name=model_name),
437437
log_probs=log_probs,
438438
)
439439

@@ -492,6 +492,7 @@ def __init__(
492492
temperature=0.5,
493493
max_tokens=100,
494494
max_retry=4,
495+
pricing_func=None,
495496
):
496497
self.model_name = model_name
497498
self.temperature = temperature
@@ -501,6 +502,22 @@ def __init__(
501502
api_key = api_key or os.getenv("ANTHROPIC_API_KEY")
502503
self.client = anthropic.Anthropic(api_key=api_key)
503504

505+
# Get pricing information
506+
if pricing_func:
507+
pricings = pricing_func()
508+
try:
509+
self.input_cost = float(pricings[model_name]["prompt"])
510+
self.output_cost = float(pricings[model_name]["completion"])
511+
except KeyError:
512+
logging.warning(
513+
f"Model {model_name} not found in the pricing information, prices are set to 0. Maybe try upgrading litellm."
514+
)
515+
self.input_cost = 0.0
516+
self.output_cost = 0.0
517+
else:
518+
self.input_cost = 0.0
519+
self.output_cost = 0.0
520+
504521
def __call__(self, messages: list[dict], n_samples: int = 1, temperature: float = None) -> dict:
505522
# Convert OpenAI format to Anthropic format
506523
system_message = None
@@ -528,13 +545,29 @@ def __call__(self, messages: list[dict], n_samples: int = 1, temperature: float
528545

529546
response = self.client.messages.create(**kwargs)
530547

548+
usage = getattr(response, "usage", {})
549+
new_input_tokens = getattr(usage, "input_tokens", 0)
550+
output_tokens = getattr(usage, "output_tokens", 0)
551+
cache_read_tokens = getattr(usage, "cache_read_input_tokens", 0)
552+
cache_write_tokens = getattr(usage, "cache_creation_input_tokens", 0)
553+
cache_read_cost = (
554+
self.input_cost * tracking.ANTHROPIC_CACHE_PRICING_FACTOR["cache_read_tokens"]
555+
)
556+
cache_write_cost = (
557+
self.input_cost * tracking.ANTHROPIC_CACHE_PRICING_FACTOR["cache_write_tokens"]
558+
)
559+
cost = (
560+
new_input_tokens * self.input_cost
561+
+ output_tokens * self.output_cost
562+
+ cache_read_tokens * cache_read_cost
563+
+ cache_write_tokens * cache_write_cost
564+
)
565+
531566
# Track usage if available
532-
if hasattr(tracking.TRACKER, "instance"):
533-
tracking.TRACKER.instance(
534-
response.usage.input_tokens,
535-
response.usage.output_tokens,
536-
0, # cost calculation would need pricing info
537-
)
567+
if hasattr(tracking.TRACKER, "instance") and isinstance(
568+
tracking.TRACKER.instance, tracking.LLMTracker
569+
):
570+
tracking.TRACKER.instance(new_input_tokens, output_tokens, cost)
538571

539572
return AIMessage(response.content[0].text)
540573

@@ -552,6 +585,7 @@ def make_model(self):
552585
model_name=self.model_name,
553586
temperature=self.temperature,
554587
max_tokens=self.max_new_tokens,
588+
pricing_func=partial(tracking.get_pricing_litellm, model_name=self.model_name),
555589
)
556590

557591

0 commit comments

Comments
 (0)