11 changes: 11 additions & 0 deletions .claude/settings.local.json
@@ -0,0 +1,11 @@
{
"permissions": {
"allow": [
"Bash(printf:*)",
"Bash(\"C:\\work2\\README_EXPLORATION_RESULTS.md\")",
"Bash(cat:*)"
],
"deny": [],
"ask": []
}
}
47 changes: 42 additions & 5 deletions src/uipath/_cli/_evals/_console_progress_reporter.py
@@ -7,6 +7,7 @@
from rich.rule import Rule
from rich.table import Table

from uipath._cli._evals._models._evaluation_set import AnyEvaluator
from uipath._events._event_bus import EventBus
from uipath._events._events import (
EvalRunCreatedEvent,
@@ -15,7 +16,6 @@
EvalSetRunUpdatedEvent,
EvaluationEvents,
)
from uipath.eval.evaluators import BaseEvaluator
from uipath.eval.models import ScoreType

logger = logging.getLogger(__name__)
@@ -26,9 +26,10 @@ class ConsoleProgressReporter:

def __init__(self):
self.console = Console()
self.evaluators: Dict[str, BaseEvaluator[Any, Any, Any]] = {}
self.evaluators: Dict[str, AnyEvaluator] = {}
self.display_started = False
self.eval_results_by_name: Dict[str, list[Any]] = {}
self.evaluator_weights: Dict[str, float] = {}

def _convert_score_to_numeric(self, eval_result) -> float:
"""Convert evaluation result score to numeric value."""
@@ -99,6 +100,8 @@ async def handle_create_eval_set_run(self, payload: EvalSetRunCreatedEvent) -> N
"""Handle evaluation set run creation."""
try:
self.evaluators = {eval.id: eval for eval in payload.evaluators}
if payload.evaluator_weights:
self.evaluator_weights = payload.evaluator_weights
except Exception as e:
logger.error(f"Failed to handle create eval set run event: {e}")

@@ -206,9 +209,20 @@ def display_final_results(self):

summary_table.add_row(*row_values)

# Add separator row before average
# Add separator row before weights and average
summary_table.add_section()

# Add weights row if weights are defined
if self.evaluator_weights:
weight_row_values = ["[bold]Weights[/bold]"]
for evaluator_id in evaluator_ids:
weight = self.evaluator_weights.get(evaluator_id, "-")
if weight != "-":
weight_row_values.append(f"[bold]{weight:.2f}[/bold]")
else:
weight_row_values.append("[bold]-[/bold]")
summary_table.add_row(*weight_row_values)

# Add average row
avg_row_values = ["[bold]Average[/bold]"]
for evaluator_id in evaluator_ids:
@@ -217,8 +231,31 @@ def display_final_results(self):

summary_table.add_row(*avg_row_values)

self.console.print(summary_table)
self.console.print()
# Calculate and display weighted final score if weights are defined
if self.evaluator_weights:
weighted_total = 0.0
weights_sum = 0.0
for evaluator_id in evaluator_ids:
weight = self.evaluator_weights.get(evaluator_id)
if weight is not None:
avg_score = self.final_results[evaluator_id]
weighted_total += weight * avg_score
weights_sum += weight

# Display as a separate info line
self.console.print(summary_table)
self.console.print()
self.console.print(
f"[bold cyan]Weighted Final Score:[/bold cyan] [bold green]{weighted_total:.2f}[/bold green]"
)
if weights_sum != 1.0:
self.console.print(
f"[dim](Note: Weights sum to {weights_sum:.2f})[/dim]"
)
self.console.print()
else:
self.console.print(summary_table)
self.console.print()
else:
self.console.print(
"→ [bold green]All evaluations completed successfully![/bold green]"
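Note: the weighted-score block added to display_final_results reduces to a weighted sum over the per-evaluator average scores, skipping any evaluator that has no configured weight. A minimal standalone sketch of that arithmetic, with made-up evaluator ids and scores for illustration:

def weighted_final_score(
    averages: dict[str, float], weights: dict[str, float]
) -> tuple[float, float]:
    """Combine per-evaluator averages using the configured weights."""
    weighted_total = 0.0
    weights_sum = 0.0
    for evaluator_id, avg_score in averages.items():
        weight = weights.get(evaluator_id)
        if weight is not None:  # evaluators without a weight are skipped entirely
            weighted_total += weight * avg_score
            weights_sum += weight
    return weighted_total, weights_sum

score, applied = weighted_final_score(
    {"exact-match": 0.8, "llm-judge": 0.6},  # hypothetical per-evaluator averages
    {"exact-match": 0.7, "llm-judge": 0.3},  # hypothetical weights
)
# score is roughly 0.74 and the applied weights sum to 1.0 (up to float rounding)

Because the total is not normalized, the reporter prints the "(Note: Weights sum to ...)" line whenever the applied weights do not add up to 1.0.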
19 changes: 18 additions & 1 deletion src/uipath/_cli/_evals/_models/_evaluation_set.py
@@ -1,9 +1,11 @@
from enum import Enum, IntEnum
from typing import Annotated, Any, Dict, List, Literal, Optional, Union

from pydantic import BaseModel, ConfigDict, Field
from pydantic import BaseModel, ConfigDict, Discriminator, Field, Tag
from pydantic.alias_generators import to_camel

from uipath.eval.evaluators import BaseEvaluator, LegacyBaseEvaluator


class EvaluationSimulationTool(BaseModel):
name: str = Field(..., alias="name")
@@ -158,6 +160,9 @@ class EvaluationSet(BaseModel):
version: Literal["1.0"] = "1.0"
evaluator_refs: List[str] = Field(default_factory=list)
evaluations: List[EvaluationItem] = Field(default_factory=list)
evaluator_weights: Optional[Dict[str, float]] = Field(
default=None, alias="evaluatorWeights"
)

def extract_selected_evals(self, eval_ids) -> None:
selected_evals: list[EvaluationItem] = []
@@ -214,3 +219,15 @@ def _discriminate_eval_set(
if version == "1.0":
return "evaluation_set"
return "legacy_evaluation_set"


AnyEvaluationSet = Annotated[
Union[
Annotated[EvaluationSet, Tag("evaluation_set")],
Annotated[LegacyEvaluationSet, Tag("legacy_evaluation_set")],
],
Discriminator(_discriminate_eval_set),
]

AnyEvaluationItem = Union[EvaluationItem, LegacyEvaluationItem]
AnyEvaluator = Union[LegacyBaseEvaluator[Any], BaseEvaluator[Any, Any, Any]]
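Note: with the new optional evaluatorWeights field, weights are declared next to the evaluator references in the evaluation set definition. A hypothetical fragment of such a definition, assuming the camelCase aliases implied by this module's to_camel generator; the evaluator ids are made up and any other fields the schema requires are omitted here:

evaluation_set_fragment = {
    "version": "1.0",
    "evaluatorRefs": ["exact-match", "llm-judge"],
    "evaluations": [],
    # Optional mapping of evaluator id -> weight; leave it out to keep the
    # existing unweighted behavior.
    "evaluatorWeights": {"exact-match": 0.4, "llm-judge": 0.6},
}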
8 changes: 7 additions & 1 deletion src/uipath/_cli/_evals/_models/_output.py
@@ -46,7 +46,7 @@ class EvaluationResultDto(BaseModel):
model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)

score: float
details: Optional[str | BaseModel] = None
details: Optional[str | Dict[str, Any] | BaseModel] = None
evaluation_time: Optional[float] = None

@model_serializer(mode="wrap")
@@ -56,6 +56,7 @@ def serialize_model(
info: core_schema.SerializationInfo,
) -> Any:
data = serializer(self)
# Only remove details if it's None, keep empty dicts and populated dicts
if self.details is None and isinstance(data, dict):
data.pop("details", None)
return data
@@ -85,6 +86,8 @@ class EvaluationRunResultDto(BaseModel):

evaluator_name: str
evaluator_id: str
evaluator_type: Optional[str] = None
node_id: Optional[str] = None
result: EvaluationResultDto


@@ -93,6 +96,7 @@ class EvaluationRunResult(BaseModel):

evaluation_name: str
evaluation_run_results: List[EvaluationRunResultDto]
workflow: Optional[List[str]] = None
agent_execution_output: Optional[UiPathSerializableEvalRunExecutionOutput] = None

@property
@@ -110,6 +114,8 @@ class UiPathEvalOutput(BaseModel):

evaluation_set_name: str
evaluation_set_results: List[EvaluationRunResult]
weighted_final_score: Optional[float] = None
evaluator_weights: Optional[Dict[str, float]] = None

@property
def score(self) -> float:
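Note: widening details to accept plain dicts, together with the custom serializer, means a missing details is dropped from the serialized payload while an empty dict is preserved. A quick sketch of that behavior, assuming the import path shown in this diff and no serializer settings beyond what the diff shows:

from uipath._cli._evals._models._output import EvaluationResultDto

dumped = EvaluationResultDto(score=1.0).model_dump()
assert "details" not in dumped  # None details are stripped from the output

dumped = EvaluationResultDto(score=1.0, details={}).model_dump()
assert dumped["details"] == {}  # empty dicts survive, per the new comment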
61 changes: 33 additions & 28 deletions src/uipath/_cli/_evals/_progress_reporter.py
@@ -9,23 +9,21 @@
from urllib.parse import urlparse

from opentelemetry import trace
from pydantic import BaseModel
from rich.console import Console

from uipath import UiPath
from uipath._cli._evals._models._evaluation_set import (
AnyEvaluationItem,
AnyEvaluator,
EvaluationItem,
EvaluationStatus,
)
from uipath._cli._evals._models._evaluator import Evaluator
from uipath._cli._evals._models._sw_reporting import (
StudioWebAgentSnapshot,
StudioWebProgressItem,
)
from uipath._cli._utils._console import ConsoleLogger
from uipath._cli._utils._project_files import ( # type: ignore
get_project_config,
)
from uipath._cli._utils._project_files import get_project_config # type: ignore
from uipath._events._event_bus import EventBus
from uipath._events._events import (
EvalRunCreatedEvent,
@@ -40,10 +38,7 @@
ENV_TENANT_ID,
HEADER_INTERNAL_TENANT_ID,
)
from uipath.eval.evaluators import (
BaseEvaluator,
LegacyBaseEvaluator,
)
from uipath.eval.evaluators import BaseEvaluator, LegacyBaseEvaluator
from uipath.eval.models import EvalItemResult, ScoreType
from uipath.tracing import LlmOpsHttpExporter

@@ -136,9 +131,7 @@ def _get_endpoint_prefix(self) -> str:
return "api/"
return "agentsruntime_/api/"

def _is_coded_evaluator(
self, evaluators: List[BaseEvaluator[Any, Any, Any]]
) -> bool:
def _is_coded_evaluator(self, evaluators: List[AnyEvaluator]) -> bool:
"""Check if evaluators are coded (BaseEvaluator) vs legacy (LegacyBaseEvaluator).

Args:
@@ -150,7 +143,7 @@ def _is_coded_evaluator(
if not evaluators:
return False
# Check the first evaluator type
return not isinstance(evaluators[0], LegacyBaseEvaluator)
return isinstance(evaluators[0], BaseEvaluator)

def _extract_usage_from_spans(
self, spans: list[Any]
@@ -240,7 +233,7 @@ async def create_eval_set_run_sw(

@gracefully_handle_errors
async def create_eval_run(
self, eval_item: EvaluationItem, eval_set_run_id: str, is_coded: bool = False
self, eval_item: AnyEvaluationItem, eval_set_run_id: str, is_coded: bool = False
) -> str:
"""Create a new evaluation run in StudioWeb.

@@ -267,7 +260,7 @@ async def create_eval_run(
async def update_eval_run(
self,
sw_progress_item: StudioWebProgressItem,
evaluators: dict[str, Evaluator],
evaluators: dict[str, AnyEvaluator],
is_coded: bool = False,
spans: list[Any] | None = None,
):
@@ -334,10 +327,11 @@ async def update_eval_set_run(
eval_set_run_id: str,
evaluator_scores: dict[str, float],
is_coded: bool = False,
weighted_final_score: float | None = None,
):
"""Update the evaluation set run status to complete."""
spec = self._update_eval_set_run_spec(
eval_set_run_id, evaluator_scores, is_coded
eval_set_run_id, evaluator_scores, is_coded, weighted_final_score
)
await self._client.request_async(
method=spec.method,
@@ -457,6 +451,7 @@ async def handle_update_eval_set_run(self, payload: EvalSetRunUpdatedEvent) -> N
eval_set_run_id,
payload.evaluator_scores,
is_coded=is_coded,
weighted_final_score=payload.weighted_final_score,
)
logger.debug(
f"Updated eval set run with ID: {eval_set_run_id} (coded={is_coded})"
@@ -485,9 +480,7 @@ async def subscribe_to_eval_runtime_events(self, event_bus: EventBus) -> None:

logger.debug("StudioWeb progress reporter subscribed to evaluation events")

def _serialize_justification(
self, justification: BaseModel | str | None
) -> str | None:
def _serialize_justification(self, justification: Any) -> str | None:
"""Serialize justification to JSON string for API compatibility.

Args:
@@ -497,9 +490,12 @@ def _serialize_justification(
Returns:
JSON string representation or None if justification is None
"""
if isinstance(justification, BaseModel):
justification = json.dumps(justification.model_dump())

if justification is None:
return None
if hasattr(justification, "model_dump"):
return json.dumps(justification.model_dump())
if not isinstance(justification, str):
return json.dumps(justification)
return justification

def _extract_agent_snapshot(self, entrypoint: str) -> StudioWebAgentSnapshot:
@@ -708,7 +704,7 @@ def _update_coded_eval_run_spec(
)

def _create_eval_run_spec(
self, eval_item: EvaluationItem, eval_set_run_id: str, is_coded: bool = False
self, eval_item: AnyEvaluationItem, eval_set_run_id: str, is_coded: bool = False
) -> RequestSpec:
# Legacy API expects eval IDs as GUIDs, coded accepts strings
# Convert string IDs to deterministic GUIDs for legacy
@@ -801,6 +797,7 @@ def _update_eval_set_run_spec(
eval_set_run_id: str,
evaluator_scores: dict[str, float],
is_coded: bool = False,
weighted_final_score: float | None = None,
) -> RequestSpec:
# Legacy API expects evaluatorId as GUID, coded accepts string
evaluator_scores_list = []
@@ -824,16 +821,24 @@

# For legacy evaluations, endpoint is without /coded
endpoint_suffix = "coded/" if is_coded else ""

# Build the JSON payload
json_payload = {
"evalSetRunId": eval_set_run_id,
"status": EvaluationStatus.COMPLETED.value,
"evaluatorScores": evaluator_scores_list,
}

# Add weighted final score if available
if weighted_final_score is not None:
json_payload["weightedFinalScore"] = weighted_final_score

return RequestSpec(
method="PUT",
endpoint=Endpoint(
f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/{endpoint_suffix}evalSetRun"
),
json={
"evalSetRunId": eval_set_run_id,
"status": EvaluationStatus.COMPLETED.value,
"evaluatorScores": evaluator_scores_list,
},
json=json_payload,
headers=self._tenant_header(),
)

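Note: _serialize_justification now accepts any justification shape and normalizes it to a JSON string, or None when no justification is present. A standalone sketch of the same branching, independent of the reporter class; the sample values are made up:

import json
from typing import Any


def serialize_justification(justification: Any) -> str | None:
    """Mirror of the branching added to _serialize_justification."""
    if justification is None:
        return None  # absent justification stays absent
    if hasattr(justification, "model_dump"):
        return json.dumps(justification.model_dump())  # pydantic models -> JSON string
    if not isinstance(justification, str):
        return json.dumps(justification)  # dicts, lists, numbers -> JSON string
    return justification  # plain strings pass through unchanged


assert serialize_justification(None) is None
assert serialize_justification("looks good") == "looks good"
assert serialize_justification({"reason": "exact match"}) == '{"reason": "exact match"}'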