11 changes: 11 additions & 0 deletions .claude/settings.local.json
@@ -0,0 +1,11 @@
{
"permissions": {
"allow": [
"Bash(printf:*)",
"Bash(\"C:\\work2\\README_EXPLORATION_RESULTS.md\")",
"Bash(cat:*)"
],
"deny": [],
"ask": []
}
}
47 changes: 42 additions & 5 deletions src/uipath/_cli/_evals/_console_progress_reporter.py
@@ -7,6 +7,7 @@
from rich.rule import Rule
from rich.table import Table

from uipath._cli._evals._models._evaluation_set import AnyEvaluator
from uipath._events._event_bus import EventBus
from uipath._events._events import (
EvalRunCreatedEvent,
@@ -15,7 +16,6 @@
EvalSetRunUpdatedEvent,
EvaluationEvents,
)
from uipath.eval.evaluators import BaseEvaluator
from uipath.eval.models import ScoreType

logger = logging.getLogger(__name__)
@@ -26,9 +26,10 @@ class ConsoleProgressReporter:

def __init__(self):
self.console = Console()
self.evaluators: Dict[str, BaseEvaluator[Any, Any, Any]] = {}
self.evaluators: Dict[str, AnyEvaluator] = {}
self.display_started = False
self.eval_results_by_name: Dict[str, list[Any]] = {}
self.evaluator_weights: Dict[str, float] = {}

def _convert_score_to_numeric(self, eval_result) -> float:
"""Convert evaluation result score to numeric value."""
@@ -99,6 +100,8 @@ async def handle_create_eval_set_run(self, payload: EvalSetRunCreatedEvent) -> N
"""Handle evaluation set run creation."""
try:
self.evaluators = {eval.id: eval for eval in payload.evaluators}
if payload.evaluator_weights:
self.evaluator_weights = payload.evaluator_weights
except Exception as e:
logger.error(f"Failed to handle create eval set run event: {e}")

@@ -206,9 +209,20 @@ def display_final_results(self):

summary_table.add_row(*row_values)

# Add separator row before average
# Add separator row before weights and average
summary_table.add_section()

# Add weights row if weights are defined
if self.evaluator_weights:
weight_row_values = ["[bold]Weights[/bold]"]
for evaluator_id in evaluator_ids:
weight = self.evaluator_weights.get(evaluator_id, "-")
if weight != "-":
weight_row_values.append(f"[bold]{weight:.2f}[/bold]")
else:
weight_row_values.append("[bold]-[/bold]")
summary_table.add_row(*weight_row_values)

# Add average row
avg_row_values = ["[bold]Average[/bold]"]
for evaluator_id in evaluator_ids:
@@ -217,8 +231,31 @@ def display_final_results(self):

summary_table.add_row(*avg_row_values)

self.console.print(summary_table)
self.console.print()
# Calculate and display weighted final score if weights are defined
if self.evaluator_weights:
weighted_total = 0.0
weights_sum = 0.0
for evaluator_id in evaluator_ids:
weight = self.evaluator_weights.get(evaluator_id)
if weight is not None:
avg_score = self.final_results[evaluator_id]
weighted_total += weight * avg_score
weights_sum += weight

# Display as a separate info line
self.console.print(summary_table)
self.console.print()
self.console.print(
f"[bold cyan]Weighted Final Score:[/bold cyan] [bold green]{weighted_total:.2f}[/bold green]"
)
if weights_sum != 1.0:
self.console.print(
f"[dim](Note: Weights sum to {weights_sum:.2f})[/dim]"
)
self.console.print()
else:
self.console.print(summary_table)
self.console.print()
else:
self.console.print(
"→ [bold green]All evaluations completed successfully![/bold green]"
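Note: the weighted-score block added to display_final_results reduces to a weighted sum over the per-evaluator average scores, skipping any evaluator that has no configured weight. A minimal standalone sketch of that arithmetic, with made-up evaluator ids and scores for illustration:

def weighted_final_score(
    averages: dict[str, float], weights: dict[str, float]
) -> tuple[float, float]:
    """Combine per-evaluator averages using the configured weights."""
    weighted_total = 0.0
    weights_sum = 0.0
    for evaluator_id, avg_score in averages.items():
        weight = weights.get(evaluator_id)
        if weight is not None:  # evaluators without a weight are skipped entirely
            weighted_total += weight * avg_score
            weights_sum += weight
    return weighted_total, weights_sum

score, applied = weighted_final_score(
    {"exact-match": 0.8, "llm-judge": 0.6},  # hypothetical per-evaluator averages
    {"exact-match": 0.7, "llm-judge": 0.3},  # hypothetical weights
)
# score is roughly 0.74 and the applied weights sum to 1.0 (up to float rounding)

Because the total is not normalized, the reporter prints the "(Note: Weights sum to ...)" line whenever the applied weights do not add up to 1.0.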
19 changes: 18 additions & 1 deletion src/uipath/_cli/_evals/_models/_evaluation_set.py
@@ -1,9 +1,11 @@
from enum import Enum, IntEnum
from typing import Annotated, Any, Dict, List, Literal, Optional, Union

from pydantic import BaseModel, ConfigDict, Field
from pydantic import BaseModel, ConfigDict, Discriminator, Field, Tag
from pydantic.alias_generators import to_camel

from uipath.eval.evaluators import BaseEvaluator, LegacyBaseEvaluator


class EvaluationSimulationTool(BaseModel):
name: str = Field(..., alias="name")
@@ -158,6 +160,9 @@ class EvaluationSet(BaseModel):
version: Literal["1.0"] = "1.0"
evaluator_refs: List[str] = Field(default_factory=list)
evaluations: List[EvaluationItem] = Field(default_factory=list)
evaluator_weights: Optional[Dict[str, float]] = Field(
default=None, alias="evaluatorWeights"
)

def extract_selected_evals(self, eval_ids) -> None:
selected_evals: list[EvaluationItem] = []
@@ -214,3 +219,15 @@ def _discriminate_eval_set(
if version == "1.0":
return "evaluation_set"
return "legacy_evaluation_set"


AnyEvaluationSet = Annotated[
Union[
Annotated[EvaluationSet, Tag("evaluation_set")],
Annotated[LegacyEvaluationSet, Tag("legacy_evaluation_set")],
],
Discriminator(_discriminate_eval_set),
]

AnyEvaluationItem = Union[EvaluationItem, LegacyEvaluationItem]
AnyEvaluator = Union[LegacyBaseEvaluator[Any], BaseEvaluator[Any, Any, Any]]
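Note: with the new optional evaluatorWeights field, weights are declared next to the evaluator references in the evaluation set definition. A hypothetical fragment of such a definition, assuming the camelCase aliases implied by this module's to_camel generator; the evaluator ids are made up and any other fields the schema requires are omitted here:

evaluation_set_fragment = {
    "version": "1.0",
    "evaluatorRefs": ["exact-match", "llm-judge"],
    "evaluations": [],
    # Optional mapping of evaluator id -> weight; leave it out to keep the
    # existing unweighted behavior.
    "evaluatorWeights": {"exact-match": 0.4, "llm-judge": 0.6},
}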
8 changes: 7 additions & 1 deletion src/uipath/_cli/_evals/_models/_output.py
@@ -46,7 +46,7 @@ class EvaluationResultDto(BaseModel):
model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)

score: float
details: Optional[str | BaseModel] = None
details: Optional[str | Dict[str, Any] | BaseModel] = None
evaluation_time: Optional[float] = None

@model_serializer(mode="wrap")
@@ -56,6 +56,7 @@ def serialize_model(
info: core_schema.SerializationInfo,
) -> Any:
data = serializer(self)
# Only remove details if it's None, keep empty dicts and populated dicts
if self.details is None and isinstance(data, dict):
data.pop("details", None)
return data
@@ -85,6 +86,8 @@ class EvaluationRunResultDto(BaseModel):

evaluator_name: str
evaluator_id: str
evaluator_type: Optional[str] = None
node_id: Optional[str] = None
result: EvaluationResultDto


@@ -93,6 +96,7 @@ class EvaluationRunResult(BaseModel):

evaluation_name: str
evaluation_run_results: List[EvaluationRunResultDto]
workflow: Optional[List[str]] = None
agent_execution_output: Optional[UiPathSerializableEvalRunExecutionOutput] = None

@property
@@ -110,6 +114,8 @@ class UiPathEvalOutput(BaseModel):

evaluation_set_name: str
evaluation_set_results: List[EvaluationRunResult]
weighted_final_score: Optional[float] = None
evaluator_weights: Optional[Dict[str, float]] = None

@property
def score(self) -> float:
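Note: widening details to accept plain dicts, together with the custom serializer, means a missing details is dropped from the serialized payload while an empty dict is preserved. A quick sketch of that behavior, assuming the import path shown in this diff and no serializer settings beyond what the diff shows:

from uipath._cli._evals._models._output import EvaluationResultDto

dumped = EvaluationResultDto(score=1.0).model_dump()
assert "details" not in dumped  # None details are stripped from the output

dumped = EvaluationResultDto(score=1.0, details={}).model_dump()
assert dumped["details"] == {}  # empty dicts survive, per the new comment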
61 changes: 33 additions & 28 deletions src/uipath/_cli/_evals/_progress_reporter.py
@@ -9,23 +9,21 @@
from urllib.parse import urlparse

from opentelemetry import trace
from pydantic import BaseModel
from rich.console import Console

from uipath import UiPath
from uipath._cli._evals._models._evaluation_set import (
AnyEvaluationItem,
AnyEvaluator,
EvaluationItem,
EvaluationStatus,
)
from uipath._cli._evals._models._evaluator import Evaluator
from uipath._cli._evals._models._sw_reporting import (
StudioWebAgentSnapshot,
StudioWebProgressItem,
)
from uipath._cli._utils._console import ConsoleLogger
from uipath._cli._utils._project_files import ( # type: ignore
get_project_config,
)
from uipath._cli._utils._project_files import get_project_config # type: ignore
from uipath._events._event_bus import EventBus
from uipath._events._events import (
EvalRunCreatedEvent,
@@ -40,10 +38,7 @@
ENV_TENANT_ID,
HEADER_INTERNAL_TENANT_ID,
)
from uipath.eval.evaluators import (
BaseEvaluator,
LegacyBaseEvaluator,
)
from uipath.eval.evaluators import BaseEvaluator, LegacyBaseEvaluator
from uipath.eval.models import EvalItemResult, ScoreType
from uipath.tracing import LlmOpsHttpExporter

@@ -136,9 +131,7 @@ def _get_endpoint_prefix(self) -> str:
return "api/"
return "agentsruntime_/api/"

def _is_coded_evaluator(
self, evaluators: List[BaseEvaluator[Any, Any, Any]]
) -> bool:
def _is_coded_evaluator(self, evaluators: List[AnyEvaluator]) -> bool:
"""Check if evaluators are coded (BaseEvaluator) vs legacy (LegacyBaseEvaluator).

Args:
@@ -150,7 +143,7 @@ def _is_coded_evaluator(
if not evaluators:
return False
# Check the first evaluator type
return not isinstance(evaluators[0], LegacyBaseEvaluator)
return isinstance(evaluators[0], BaseEvaluator)

def _extract_usage_from_spans(
self, spans: list[Any]
@@ -240,7 +233,7 @@ async def create_eval_set_run_sw(

@gracefully_handle_errors
async def create_eval_run(
self, eval_item: EvaluationItem, eval_set_run_id: str, is_coded: bool = False
self, eval_item: AnyEvaluationItem, eval_set_run_id: str, is_coded: bool = False
) -> str:
"""Create a new evaluation run in StudioWeb.

@@ -267,7 +260,7 @@ async def create_eval_run(
async def update_eval_run(
self,
sw_progress_item: StudioWebProgressItem,
evaluators: dict[str, Evaluator],
evaluators: dict[str, AnyEvaluator],
is_coded: bool = False,
spans: list[Any] | None = None,
):
@@ -334,10 +327,11 @@ async def update_eval_set_run(
eval_set_run_id: str,
evaluator_scores: dict[str, float],
is_coded: bool = False,
weighted_final_score: float | None = None,
):
"""Update the evaluation set run status to complete."""
spec = self._update_eval_set_run_spec(
eval_set_run_id, evaluator_scores, is_coded
eval_set_run_id, evaluator_scores, is_coded, weighted_final_score
)
await self._client.request_async(
method=spec.method,
@@ -457,6 +451,7 @@ async def handle_update_eval_set_run(self, payload: EvalSetRunUpdatedEvent) -> N
eval_set_run_id,
payload.evaluator_scores,
is_coded=is_coded,
weighted_final_score=payload.weighted_final_score,
)
logger.debug(
f"Updated eval set run with ID: {eval_set_run_id} (coded={is_coded})"
@@ -485,9 +480,7 @@ async def subscribe_to_eval_runtime_events(self, event_bus: EventBus) -> None:

logger.debug("StudioWeb progress reporter subscribed to evaluation events")

def _serialize_justification(
self, justification: BaseModel | str | None
) -> str | None:
def _serialize_justification(self, justification: Any) -> str | None:
"""Serialize justification to JSON string for API compatibility.

Args:
@@ -497,9 +490,12 @@ def _serialize_justification(
Returns:
JSON string representation or None if justification is None
"""
if isinstance(justification, BaseModel):
justification = json.dumps(justification.model_dump())

if justification is None:
return None
if hasattr(justification, "model_dump"):
return json.dumps(justification.model_dump())
if not isinstance(justification, str):
return json.dumps(justification)
return justification

def _extract_agent_snapshot(self, entrypoint: str) -> StudioWebAgentSnapshot:
@@ -708,7 +704,7 @@ def _update_coded_eval_run_spec(
)

def _create_eval_run_spec(
self, eval_item: EvaluationItem, eval_set_run_id: str, is_coded: bool = False
self, eval_item: AnyEvaluationItem, eval_set_run_id: str, is_coded: bool = False
) -> RequestSpec:
# Legacy API expects eval IDs as GUIDs, coded accepts strings
# Convert string IDs to deterministic GUIDs for legacy
@@ -801,6 +797,7 @@ def _update_eval_set_run_spec(
eval_set_run_id: str,
evaluator_scores: dict[str, float],
is_coded: bool = False,
weighted_final_score: float | None = None,
) -> RequestSpec:
# Legacy API expects evaluatorId as GUID, coded accepts string
evaluator_scores_list = []
@@ -824,16 +821,24 @@

# For legacy evaluations, endpoint is without /coded
endpoint_suffix = "coded/" if is_coded else ""

# Build the JSON payload
json_payload = {
"evalSetRunId": eval_set_run_id,
"status": EvaluationStatus.COMPLETED.value,
"evaluatorScores": evaluator_scores_list,
}

# Add weighted final score if available
if weighted_final_score is not None:
json_payload["weightedFinalScore"] = weighted_final_score

return RequestSpec(
method="PUT",
endpoint=Endpoint(
f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/{endpoint_suffix}evalSetRun"
),
json={
"evalSetRunId": eval_set_run_id,
"status": EvaluationStatus.COMPLETED.value,
"evaluatorScores": evaluator_scores_list,
},
json=json_payload,
headers=self._tenant_header(),
)

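Note: _serialize_justification now accepts any justification shape and normalizes it to a JSON string, or None when no justification is present. A standalone sketch of the same branching, independent of the reporter class; the sample values are made up:

import json
from typing import Any


def serialize_justification(justification: Any) -> str | None:
    """Mirror of the branching added to _serialize_justification."""
    if justification is None:
        return None  # absent justification stays absent
    if hasattr(justification, "model_dump"):
        return json.dumps(justification.model_dump())  # pydantic models -> JSON string
    if not isinstance(justification, str):
        return json.dumps(justification)  # dicts, lists, numbers -> JSON string
    return justification  # plain strings pass through unchanged


assert serialize_justification(None) is None
assert serialize_justification("looks good") == "looks good"
assert serialize_justification({"reason": "exact match"}) == '{"reason": "exact match"}'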