diff --git a/.claude/settings.local.json b/.claude/settings.local.json new file mode 100644 index 000000000..8cbde18fd --- /dev/null +++ b/.claude/settings.local.json @@ -0,0 +1,11 @@ +{ + "permissions": { + "allow": [ + "Bash(printf:*)", + "Bash(\"C:\\work2\\README_EXPLORATION_RESULTS.md\")", + "Bash(cat:*)" + ], + "deny": [], + "ask": [] + } +} diff --git a/src/uipath/_cli/_evals/_console_progress_reporter.py b/src/uipath/_cli/_evals/_console_progress_reporter.py index 4e0d7a177..3b813e9a3 100644 --- a/src/uipath/_cli/_evals/_console_progress_reporter.py +++ b/src/uipath/_cli/_evals/_console_progress_reporter.py @@ -7,6 +7,7 @@ from rich.rule import Rule from rich.table import Table +from uipath._cli._evals._models._evaluation_set import AnyEvaluator from uipath._events._event_bus import EventBus from uipath._events._events import ( EvalRunCreatedEvent, @@ -15,7 +16,6 @@ EvalSetRunUpdatedEvent, EvaluationEvents, ) -from uipath.eval.evaluators import BaseEvaluator from uipath.eval.models import ScoreType logger = logging.getLogger(__name__) @@ -26,9 +26,10 @@ class ConsoleProgressReporter: def __init__(self): self.console = Console() - self.evaluators: Dict[str, BaseEvaluator[Any, Any, Any]] = {} + self.evaluators: Dict[str, AnyEvaluator] = {} self.display_started = False self.eval_results_by_name: Dict[str, list[Any]] = {} + self.evaluator_weights: Dict[str, float] = {} def _convert_score_to_numeric(self, eval_result) -> float: """Convert evaluation result score to numeric value.""" @@ -99,6 +100,8 @@ async def handle_create_eval_set_run(self, payload: EvalSetRunCreatedEvent) -> N """Handle evaluation set run creation.""" try: self.evaluators = {eval.id: eval for eval in payload.evaluators} + if payload.evaluator_weights: + self.evaluator_weights = payload.evaluator_weights except Exception as e: logger.error(f"Failed to handle create eval set run event: {e}") @@ -206,9 +209,20 @@ def display_final_results(self): summary_table.add_row(*row_values) - # Add separator row before average + # Add separator row before weights and average summary_table.add_section() + # Add weights row if weights are defined + if self.evaluator_weights: + weight_row_values = ["[bold]Weights[/bold]"] + for evaluator_id in evaluator_ids: + weight = self.evaluator_weights.get(evaluator_id, "-") + if weight != "-": + weight_row_values.append(f"[bold]{weight:.2f}[/bold]") + else: + weight_row_values.append("[bold]-[/bold]") + summary_table.add_row(*weight_row_values) + # Add average row avg_row_values = ["[bold]Average[/bold]"] for evaluator_id in evaluator_ids: @@ -217,8 +231,31 @@ def display_final_results(self): summary_table.add_row(*avg_row_values) - self.console.print(summary_table) - self.console.print() + # Calculate and display weighted final score if weights are defined + if self.evaluator_weights: + weighted_total = 0.0 + weights_sum = 0.0 + for evaluator_id in evaluator_ids: + weight = self.evaluator_weights.get(evaluator_id) + if weight is not None: + avg_score = self.final_results[evaluator_id] + weighted_total += weight * avg_score + weights_sum += weight + + # Display as a separate info line + self.console.print(summary_table) + self.console.print() + self.console.print( + f"[bold cyan]Weighted Final Score:[/bold cyan] [bold green]{weighted_total:.2f}[/bold green]" + ) + if weights_sum != 1.0: + self.console.print( + f"[dim](Note: Weights sum to {weights_sum:.2f})[/dim]" + ) + self.console.print() + else: + self.console.print(summary_table) + self.console.print() else: self.console.print( "→ [bold 
green]All evaluations completed successfully![/bold green]" diff --git a/src/uipath/_cli/_evals/_models/_evaluation_set.py b/src/uipath/_cli/_evals/_models/_evaluation_set.py index 31e87dd95..e81c46d77 100644 --- a/src/uipath/_cli/_evals/_models/_evaluation_set.py +++ b/src/uipath/_cli/_evals/_models/_evaluation_set.py @@ -1,9 +1,11 @@ from enum import Enum, IntEnum from typing import Annotated, Any, Dict, List, Literal, Optional, Union -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict, Discriminator, Field, Tag from pydantic.alias_generators import to_camel +from uipath.eval.evaluators import BaseEvaluator, LegacyBaseEvaluator + class EvaluationSimulationTool(BaseModel): name: str = Field(..., alias="name") @@ -158,6 +160,9 @@ class EvaluationSet(BaseModel): version: Literal["1.0"] = "1.0" evaluator_refs: List[str] = Field(default_factory=list) evaluations: List[EvaluationItem] = Field(default_factory=list) + evaluator_weights: Optional[Dict[str, float]] = Field( + default=None, alias="evaluatorWeights" + ) def extract_selected_evals(self, eval_ids) -> None: selected_evals: list[EvaluationItem] = [] @@ -214,3 +219,15 @@ def _discriminate_eval_set( if version == "1.0": return "evaluation_set" return "legacy_evaluation_set" + + +AnyEvaluationSet = Annotated[ + Union[ + Annotated[EvaluationSet, Tag("evaluation_set")], + Annotated[LegacyEvaluationSet, Tag("legacy_evaluation_set")], + ], + Discriminator(_discriminate_eval_set), +] + +AnyEvaluationItem = Union[EvaluationItem, LegacyEvaluationItem] +AnyEvaluator = Union[LegacyBaseEvaluator[Any], BaseEvaluator[Any, Any, Any]] diff --git a/src/uipath/_cli/_evals/_models/_output.py b/src/uipath/_cli/_evals/_models/_output.py index c3ba1c728..28aa7f42e 100644 --- a/src/uipath/_cli/_evals/_models/_output.py +++ b/src/uipath/_cli/_evals/_models/_output.py @@ -46,7 +46,7 @@ class EvaluationResultDto(BaseModel): model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) score: float - details: Optional[str | BaseModel] = None + details: Optional[str | Dict[str, Any] | BaseModel] = None evaluation_time: Optional[float] = None @model_serializer(mode="wrap") @@ -56,6 +56,7 @@ def serialize_model( info: core_schema.SerializationInfo, ) -> Any: data = serializer(self) + # Only remove details if it's None, keep empty dicts and populated dicts if self.details is None and isinstance(data, dict): data.pop("details", None) return data @@ -85,6 +86,8 @@ class EvaluationRunResultDto(BaseModel): evaluator_name: str evaluator_id: str + evaluator_type: Optional[str] = None + node_id: Optional[str] = None result: EvaluationResultDto @@ -93,6 +96,7 @@ class EvaluationRunResult(BaseModel): evaluation_name: str evaluation_run_results: List[EvaluationRunResultDto] + workflow: Optional[List[str]] = None agent_execution_output: Optional[UiPathSerializableEvalRunExecutionOutput] = None @property @@ -110,6 +114,8 @@ class UiPathEvalOutput(BaseModel): evaluation_set_name: str evaluation_set_results: List[EvaluationRunResult] + weighted_final_score: Optional[float] = None + evaluator_weights: Optional[Dict[str, float]] = None @property def score(self) -> float: diff --git a/src/uipath/_cli/_evals/_progress_reporter.py b/src/uipath/_cli/_evals/_progress_reporter.py index dbf305eaf..3dc23efb9 100644 --- a/src/uipath/_cli/_evals/_progress_reporter.py +++ b/src/uipath/_cli/_evals/_progress_reporter.py @@ -9,23 +9,21 @@ from urllib.parse import urlparse from opentelemetry import trace -from pydantic import BaseModel from 
rich.console import Console from uipath import UiPath from uipath._cli._evals._models._evaluation_set import ( + AnyEvaluationItem, + AnyEvaluator, EvaluationItem, EvaluationStatus, ) -from uipath._cli._evals._models._evaluator import Evaluator from uipath._cli._evals._models._sw_reporting import ( StudioWebAgentSnapshot, StudioWebProgressItem, ) from uipath._cli._utils._console import ConsoleLogger -from uipath._cli._utils._project_files import ( # type: ignore - get_project_config, -) +from uipath._cli._utils._project_files import get_project_config # type: ignore from uipath._events._event_bus import EventBus from uipath._events._events import ( EvalRunCreatedEvent, @@ -40,10 +38,7 @@ ENV_TENANT_ID, HEADER_INTERNAL_TENANT_ID, ) -from uipath.eval.evaluators import ( - BaseEvaluator, - LegacyBaseEvaluator, -) +from uipath.eval.evaluators import BaseEvaluator, LegacyBaseEvaluator from uipath.eval.models import EvalItemResult, ScoreType from uipath.tracing import LlmOpsHttpExporter @@ -136,9 +131,7 @@ def _get_endpoint_prefix(self) -> str: return "api/" return "agentsruntime_/api/" - def _is_coded_evaluator( - self, evaluators: List[BaseEvaluator[Any, Any, Any]] - ) -> bool: + def _is_coded_evaluator(self, evaluators: List[AnyEvaluator]) -> bool: """Check if evaluators are coded (BaseEvaluator) vs legacy (LegacyBaseEvaluator). Args: @@ -150,7 +143,7 @@ def _is_coded_evaluator( if not evaluators: return False # Check the first evaluator type - return not isinstance(evaluators[0], LegacyBaseEvaluator) + return isinstance(evaluators[0], BaseEvaluator) def _extract_usage_from_spans( self, spans: list[Any] @@ -240,7 +233,7 @@ async def create_eval_set_run_sw( @gracefully_handle_errors async def create_eval_run( - self, eval_item: EvaluationItem, eval_set_run_id: str, is_coded: bool = False + self, eval_item: AnyEvaluationItem, eval_set_run_id: str, is_coded: bool = False ) -> str: """Create a new evaluation run in StudioWeb. @@ -267,7 +260,7 @@ async def create_eval_run( async def update_eval_run( self, sw_progress_item: StudioWebProgressItem, - evaluators: dict[str, Evaluator], + evaluators: dict[str, AnyEvaluator], is_coded: bool = False, spans: list[Any] | None = None, ): @@ -334,10 +327,11 @@ async def update_eval_set_run( eval_set_run_id: str, evaluator_scores: dict[str, float], is_coded: bool = False, + weighted_final_score: float | None = None, ): """Update the evaluation set run status to complete.""" spec = self._update_eval_set_run_spec( - eval_set_run_id, evaluator_scores, is_coded + eval_set_run_id, evaluator_scores, is_coded, weighted_final_score ) await self._client.request_async( method=spec.method, @@ -457,6 +451,7 @@ async def handle_update_eval_set_run(self, payload: EvalSetRunUpdatedEvent) -> N eval_set_run_id, payload.evaluator_scores, is_coded=is_coded, + weighted_final_score=payload.weighted_final_score, ) logger.debug( f"Updated eval set run with ID: {eval_set_run_id} (coded={is_coded})" @@ -485,9 +480,7 @@ async def subscribe_to_eval_runtime_events(self, event_bus: EventBus) -> None: logger.debug("StudioWeb progress reporter subscribed to evaluation events") - def _serialize_justification( - self, justification: BaseModel | str | None - ) -> str | None: + def _serialize_justification(self, justification: Any) -> str | None: """Serialize justification to JSON string for API compatibility. 
Args: @@ -497,9 +490,12 @@ def _serialize_justification( Returns: JSON string representation or None if justification is None """ - if isinstance(justification, BaseModel): - justification = json.dumps(justification.model_dump()) - + if justification is None: + return None + if hasattr(justification, "model_dump"): + return json.dumps(justification.model_dump()) + if not isinstance(justification, str): + return json.dumps(justification) return justification def _extract_agent_snapshot(self, entrypoint: str) -> StudioWebAgentSnapshot: @@ -708,7 +704,7 @@ def _update_coded_eval_run_spec( ) def _create_eval_run_spec( - self, eval_item: EvaluationItem, eval_set_run_id: str, is_coded: bool = False + self, eval_item: AnyEvaluationItem, eval_set_run_id: str, is_coded: bool = False ) -> RequestSpec: # Legacy API expects eval IDs as GUIDs, coded accepts strings # Convert string IDs to deterministic GUIDs for legacy @@ -801,6 +797,7 @@ def _update_eval_set_run_spec( eval_set_run_id: str, evaluator_scores: dict[str, float], is_coded: bool = False, + weighted_final_score: float | None = None, ) -> RequestSpec: # Legacy API expects evaluatorId as GUID, coded accepts string evaluator_scores_list = [] @@ -824,16 +821,24 @@ def _update_eval_set_run_spec( # For legacy evaluations, endpoint is without /coded endpoint_suffix = "coded/" if is_coded else "" + + # Build the JSON payload + json_payload = { + "evalSetRunId": eval_set_run_id, + "status": EvaluationStatus.COMPLETED.value, + "evaluatorScores": evaluator_scores_list, + } + + # Add weighted final score if available + if weighted_final_score is not None: + json_payload["weightedFinalScore"] = weighted_final_score + return RequestSpec( method="PUT", endpoint=Endpoint( f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/{endpoint_suffix}evalSetRun" ), - json={ - "evalSetRunId": eval_set_run_id, - "status": EvaluationStatus.COMPLETED.value, - "evaluatorScores": evaluator_scores_list, - }, + json=json_payload, headers=self._tenant_header(), ) diff --git a/src/uipath/_cli/_evals/_runtime.py b/src/uipath/_cli/_evals/_runtime.py index a605d9835..347d68474 100644 --- a/src/uipath/_cli/_evals/_runtime.py +++ b/src/uipath/_cli/_evals/_runtime.py @@ -12,10 +12,7 @@ from opentelemetry.sdk.trace import ReadableSpan, Span from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult -from uipath._cli._evals.mocks.cache_manager import CacheManager -from uipath._cli._evals.mocks.input_mocker import ( - generate_llm_input, -) +from uipath._cli._evals.mocks.input_mocker import generate_llm_input from ..._events._event_bus import EventBus from ..._events._events import ( @@ -26,7 +23,7 @@ EvalSetRunUpdatedEvent, EvaluationEvents, ) -from ...eval.evaluators import BaseEvaluator +from ...eval.evaluators import BaseEvaluator, LegacyBaseEvaluator from ...eval.models import EvaluationResult from ...eval.models.models import AgentExecution, EvalItemResult from .._runtime._contracts import ( @@ -44,8 +41,11 @@ from ..models.runtime_schema import Entrypoint from ._evaluator_factory import EvaluatorFactory from ._models._evaluation_set import ( + AnyEvaluationItem, + AnyEvaluationSet, + AnyEvaluator, EvaluationItem, - EvaluationSet, + LegacyEvaluationItem, ) from ._models._exceptions import EvaluationRuntimeException from ._models._output import ( @@ -57,16 +57,116 @@ convert_eval_execution_output_to_serializable, ) from ._span_collection import ExecutionSpanCollector -from .mocks.mocks import ( - cache_manager_context, - clear_execution_context, - 
set_execution_context, -) +from .mocks.mocks import clear_execution_context, set_execution_context T = TypeVar("T", bound=UiPathBaseRuntime) C = TypeVar("C", bound=UiPathRuntimeContext) +def extract_workflow_from_spans(spans: list[ReadableSpan]) -> list[str]: + """Extract ordered list of main workflow nodes from execution spans. + + Only captures workflow nodes that are direct children of a LangGraph parent span, + which naturally filters out sub-nodes and internal components. + + Args: + spans: List of ReadableSpan objects from agent execution + + Returns: + List of unique main node names in execution order + """ + + for i, span in enumerate(spans): + span_name = getattr(span, "name", "NO_NAME") + attributes = getattr(span, "attributes", {}) + parent_context = getattr(span, "parent", None) + parent_span_id = None + if parent_context: + parent_span_id = getattr(parent_context, "span_id", None) + + span_context = span.get_span_context() + span_id = span_context.span_id if span_context else "NO_ID" + + if isinstance(attributes, dict): + node_name = attributes.get("node_name") + langgraph_node = attributes.get("langgraph.node") + + node_order = [] + seen_nodes = set() + + # System nodes to exclude + system_nodes = {"__start__", "__end__"} + + # First pass: Find LangGraph-related parent span IDs + # Look for spans that could be the main graph span (could have different names) + langgraph_span_ids = set() + for span in spans: + span_name = getattr(span, "name", "") + # Check if this is a LangGraph main span + if span_name and "langgraph" in span_name.lower(): + span_context = span.get_span_context() + if span_context: + langgraph_span_ids.add(span_context.span_id) + + # If we found potential parent spans, use them; otherwise we'll check all spans with langgraph.node + if langgraph_span_ids: + # Second pass: Collect spans that have a LangGraph parent + for span in spans: + # Get parent span ID + parent_context = getattr(span, "parent", None) + parent_span_id = None + if parent_context: + parent_span_id = getattr(parent_context, "span_id", None) + + # Skip if parent is not one of the LangGraph spans + if parent_span_id not in langgraph_span_ids: + continue + + # Get node name - use span name directly since attributes might not have it + span_name = getattr(span, "name", "") + attributes = getattr(span, "attributes", {}) + + # Try to get from attributes first, then fall back to span name + node_name = None + if isinstance(attributes, dict): + node_name = attributes.get("langgraph.node") or attributes.get( + "node_name" + ) + + if not node_name: + node_name = span_name + + # Skip if no node name found + if not node_name: + continue + + # Filter out system nodes + if node_name in system_nodes: + continue + + # Add to workflow if not seen before + if node_name not in seen_nodes: + seen_nodes.add(node_name) + node_order.append(node_name) + else: + # Fallback: Just get all spans with langgraph.node attribute + for span in spans: + attributes = getattr(span, "attributes", None) + if not attributes or not isinstance(attributes, dict): + continue + + node_name = attributes.get("langgraph.node") + + if not node_name or node_name in system_nodes: + continue + + if node_name not in seen_nodes: + seen_nodes.add(node_name) + node_order.append(node_name) + + return node_order + + class ExecutionSpanExporter(SpanExporter): """Custom exporter that stores spans grouped by execution ids.""" @@ -152,7 +252,6 @@ class UiPathEvalContext(UiPathRuntimeContext): eval_ids: Optional[List[str]] = None eval_set_run_id: 
Optional[str] = None verbose: bool = False - enable_mocker_cache: bool = False class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]): @@ -204,60 +303,45 @@ async def execute(self) -> UiPathRuntimeResult: event_bus = self.event_bus - # Create cache manager if enabled - if self.context.enable_mocker_cache: - cache_mgr = CacheManager() - cache_manager_context.set(cache_mgr) + # Load eval set (path is already resolved in cli_eval.py) + evaluation_set, _ = EvalHelpers.load_eval_set( + self.context.eval_set, self.context.eval_ids + ) + evaluators = self._load_evaluators(evaluation_set) - try: - # Load eval set (path is already resolved in cli_eval.py) - evaluation_set, _ = EvalHelpers.load_eval_set( - self.context.eval_set, self.context.eval_ids - ) - evaluators = self._load_evaluators(evaluation_set) + await event_bus.publish( + EvaluationEvents.CREATE_EVAL_SET_RUN, + EvalSetRunCreatedEvent( + execution_id=self.execution_id, + entrypoint=self.context.entrypoint or "", + eval_set_run_id=self.context.eval_set_run_id, + eval_set_id=evaluation_set.id, + no_of_evals=len(evaluation_set.evaluations), + evaluators=evaluators, + evaluator_weights=getattr(evaluation_set, "evaluator_weights", None), + ), + ) - await event_bus.publish( - EvaluationEvents.CREATE_EVAL_SET_RUN, - EvalSetRunCreatedEvent( - execution_id=self.execution_id, - entrypoint=self.context.entrypoint or "", - eval_set_run_id=self.context.eval_set_run_id, - eval_set_id=evaluation_set.id, - no_of_evals=len(evaluation_set.evaluations), - evaluators=evaluators, - ), + # Check if parallel execution should be used + if ( + self.context.workers + and self.context.workers > 1 + and len(evaluation_set.evaluations) > 1 + ): + eval_run_result_list = await self._execute_parallel( + evaluation_set, evaluators, event_bus, self.context.workers ) - - # Check if parallel execution should be used - if ( - self.context.workers - and self.context.workers > 1 - and len(evaluation_set.evaluations) > 1 - ): - eval_run_result_list = await self._execute_parallel( - evaluation_set, evaluators, event_bus, self.context.workers - ) - else: - eval_run_result_list = await self._execute_sequential( - evaluation_set, evaluators, event_bus - ) - results = UiPathEvalOutput( - evaluation_set_name=evaluation_set.name, - evaluation_set_results=eval_run_result_list, + else: + eval_run_result_list = await self._execute_sequential( + evaluation_set, evaluators, event_bus ) - finally: - # Flush cache to disk at end of eval set and cleanup - if self.context.enable_mocker_cache: - cache_manager = cache_manager_context.get() - if cache_manager is not None: - cache_manager.flush() - cache_manager_context.set(None) # Computing evaluator averages evaluator_averages: Dict[str, float] = defaultdict(float) evaluator_count: Dict[str, int] = defaultdict(int) - for eval_run_result in results.evaluation_set_results: + # Collect all evaluation results first + for eval_run_result in eval_run_result_list: for result_dto in eval_run_result.evaluation_run_results: evaluator_averages[result_dto.evaluator_id] += result_dto.result.score evaluator_count[result_dto.evaluator_id] += 1 @@ -266,11 +350,33 @@ async def execute(self) -> UiPathRuntimeResult: evaluator_averages[eval_id] = ( evaluator_averages[eval_id] / evaluator_count[eval_id] ) + + # Calculate weighted final score if weights are defined + evaluator_weights = getattr(evaluation_set, "evaluator_weights", None) + weighted_final_score = None + if evaluator_weights: + weighted_total = 0.0 + for evaluator_id, avg_score in 
evaluator_averages.items(): + weight = evaluator_weights.get(evaluator_id) + if weight is not None: + weighted_total += weight * avg_score + weighted_final_score = weighted_total + + # Create results with weighted score and weights + results = UiPathEvalOutput( + evaluation_set_name=evaluation_set.name, + evaluation_set_results=eval_run_result_list, + weighted_final_score=weighted_final_score, + evaluator_weights=evaluator_weights, + ) + await event_bus.publish( EvaluationEvents.UPDATE_EVAL_SET_RUN, EvalSetRunUpdatedEvent( execution_id=self.execution_id, evaluator_scores=evaluator_averages, + weighted_final_score=weighted_final_score, + evaluator_weights=evaluator_weights, ), wait_for_completion=False, ) @@ -283,8 +389,8 @@ async def execute(self) -> UiPathRuntimeResult: async def _execute_sequential( self, - evaluation_set: EvaluationSet, - evaluators: List[BaseEvaluator[Any, Any, Any]], + evaluation_set: AnyEvaluationSet, + evaluators: List[AnyEvaluator], event_bus: EventBus, ) -> List[EvaluationRunResult]: all_eval_run_result: list[EvaluationRunResult] = [] @@ -298,13 +404,13 @@ async def _execute_sequential( async def _execute_parallel( self, - evaluation_set: EvaluationSet, - evaluators: List[BaseEvaluator[Any, Any, Any]], + evaluation_set: AnyEvaluationSet, + evaluators: List[AnyEvaluator], event_bus: EventBus, workers: int, ) -> List[EvaluationRunResult]: # Create a queue with max concurrency - queue: asyncio.Queue[tuple[int, EvaluationItem]] = asyncio.Queue( + queue: asyncio.Queue[tuple[int, AnyEvaluationItem]] = asyncio.Queue( maxsize=workers ) @@ -314,7 +420,7 @@ async def _execute_parallel( # Producer task to fill the queue async def producer() -> None: for index, eval_item in enumerate(evaluation_set.evaluations): - await queue.put((index, eval_item)) + await queue.put((index, eval_item)) # type: ignore[arg-type] # Signal completion by putting None markers for _ in range(workers): await queue.put(None) # type: ignore @@ -356,8 +462,8 @@ async def worker(worker_id: int) -> None: async def _execute_eval( self, - eval_item: EvaluationItem, - evaluators: List[BaseEvaluator[Any, Any, Any]], + eval_item: AnyEvaluationItem, + evaluators: List[AnyEvaluator], event_bus: EventBus, ) -> EvaluationRunResult: # Generate LLM-based input if input_mocking_strategy is defined @@ -418,6 +524,11 @@ async def _execute_eval( ) ) ) + # Extract workflow nodes from spans even in error case + if spans: + workflow = extract_workflow_from_spans(spans) + if workflow: + evaluation_run_results.workflow = workflow raise if self.context.verbose: @@ -426,34 +537,79 @@ async def _execute_eval( agent_execution_output ) ) + + # Extract workflow nodes from spans + workflow = extract_workflow_from_spans(agent_execution_output.spans) + # Always set workflow, even if empty, to distinguish from no extraction + evaluation_run_results.workflow = workflow if workflow else None + evaluation_item_results: list[EvalItemResult] = [] for evaluator in evaluators: - if evaluator.id not in eval_item.evaluation_criterias: - # Skip! 
- continue - evaluation_criteria = eval_item.evaluation_criterias[evaluator.id] + # Determine which evaluator method to use based on evaluation set/item type + evaluation_result: Optional[EvaluationResult] = None + + match eval_item: + case LegacyEvaluationItem(): + # Legacy evaluation - use run_legacy_evaluator + evaluation_result = await self.run_legacy_evaluator( + evaluator=evaluator, # type: ignore + execution_output=agent_execution_output, + eval_item=eval_item, + ) + case EvaluationItem() if ( + evaluator.id in eval_item.evaluation_criterias + ): + # New evaluation with criteria + evaluation_criteria = eval_item.evaluation_criterias[ + evaluator.id + ] + + evaluation_result = await self.run_evaluator( + evaluator=evaluator, # type: ignore + execution_output=agent_execution_output, + eval_item=eval_item, + evaluation_criteria=evaluator.evaluation_criteria_type( # type: ignore + **evaluation_criteria + ) + if evaluation_criteria + else evaluator.evaluator_config.default_evaluation_criteria, # type: ignore + ) + case _: + # Skip if evaluator not in evaluation criteria + continue - evaluation_result = await self.run_evaluator( - evaluator=evaluator, - execution_output=agent_execution_output, - eval_item=eval_item, - evaluation_criteria=evaluator.evaluation_criteria_type( - **evaluation_criteria - ) - if evaluation_criteria - else evaluator.evaluator_config.default_evaluation_criteria, - ) + if evaluation_result is None: + continue dto_result = EvaluationResultDto.from_evaluation_result( evaluation_result ) + # Extract node_id from evaluation criteria if available + node_id = None + if ( + isinstance(eval_item, EvaluationItem) + and evaluator.id in eval_item.evaluation_criterias + ): + criteria_dict = eval_item.evaluation_criterias[evaluator.id] + if criteria_dict: + node_id = criteria_dict.get("nodeId") + + # Get evaluator type from evaluator's get_evaluator_id method + evaluator_type = None + try: + evaluator_type = evaluator.get_evaluator_id() + except AttributeError: + pass + evaluation_run_results.evaluation_run_results.append( EvaluationRunResultDto( evaluator_name=evaluator.name, result=dto_result, evaluator_id=evaluator.id, + evaluator_type=evaluator_type, + node_id=node_id, ) ) evaluation_item_results.append( @@ -482,10 +638,29 @@ async def _execute_eval( exception_details = EvalItemExceptionDetails(exception=e) for evaluator in evaluators: + # Extract node_id from evaluation criteria if available + node_id = None + if ( + isinstance(eval_item, EvaluationItem) + and evaluator.id in eval_item.evaluation_criterias + ): + criteria_dict = eval_item.evaluation_criterias[evaluator.id] + if criteria_dict: + node_id = criteria_dict.get("nodeId") + + # Get evaluator type from evaluator's get_evaluator_id method + evaluator_type = None + try: + evaluator_type = evaluator.get_evaluator_id() + except AttributeError: + pass + evaluation_run_results.evaluation_run_results.append( EvaluationRunResultDto( evaluator_name=evaluator.name, evaluator_id=evaluator.id, + evaluator_type=evaluator_type, + node_id=node_id, result=EvaluationResultDto(score=0), ) ) @@ -520,8 +695,8 @@ async def _execute_eval( return evaluation_run_results async def _generate_input_for_eval( - self, eval_item: EvaluationItem - ) -> EvaluationItem: + self, eval_item: AnyEvaluationItem + ) -> AnyEvaluationItem: """Use LLM to generate a mock input for an evaluation item.""" generated_input = await generate_llm_input( eval_item, (await self.get_entrypoint()).input @@ -542,7 +717,7 @@ def _get_and_clear_execution_data( return 
spans, logs async def execute_runtime( - self, eval_item: EvaluationItem, execution_id: str + self, eval_item: AnyEvaluationItem, execution_id: str ) -> UiPathEvalRunExecutionOutput: context_args = self.context.model_dump() context_args["execution_id"] = execution_id @@ -615,9 +790,28 @@ async def run_evaluator( return result - def _load_evaluators( - self, evaluation_set: EvaluationSet - ) -> list[BaseEvaluator[Any, Any, Any]]: + async def run_legacy_evaluator( + self, + evaluator: LegacyBaseEvaluator[Any], + execution_output: UiPathEvalRunExecutionOutput, + eval_item: LegacyEvaluationItem, + ) -> EvaluationResult: + agent_execution = AgentExecution( + agent_input=eval_item.inputs, + agent_output=execution_output.result.output or {}, + agent_trace=execution_output.spans, + expected_agent_behavior=eval_item.expected_agent_behavior, + ) + + result = await evaluator.evaluate( + agent_execution=agent_execution, + # at the moment evaluation_criteria is always the expected output + evaluation_criteria=eval_item.expected_output, + ) + + return result + + def _load_evaluators(self, evaluation_set: AnyEvaluationSet) -> list[AnyEvaluator]: """Load evaluators referenced by the evaluation set.""" evaluators = [] evaluators_dir = Path(self.context.eval_set).parent.parent / "evaluators" # type: ignore diff --git a/src/uipath/_cli/cli_add.py b/src/uipath/_cli/cli_add.py index ec0856fad..1064e4972 100644 --- a/src/uipath/_cli/cli_add.py +++ b/src/uipath/_cli/cli_add.py @@ -6,7 +6,6 @@ import click -from ..telemetry import track from ._utils._console import ConsoleLogger from ._utils._constants import EVALS_DIRECTORY_NAME from ._utils._resources import Resources @@ -85,7 +84,6 @@ def create_evaluator(evaluator_name): @click.command() @click.argument("resource", required=True) @click.argument("args", nargs=-1) -@track def add(resource: str, args: tuple[str]) -> None: """Create a local resource. diff --git a/src/uipath/_cli/cli_auth.py b/src/uipath/_cli/cli_auth.py index 0ee1dca55..e56d99657 100644 --- a/src/uipath/_cli/cli_auth.py +++ b/src/uipath/_cli/cli_auth.py @@ -2,7 +2,6 @@ import click -from ..telemetry import track from ._auth._auth_service import AuthService from ._utils._common import environment_options from ._utils._console import ConsoleLogger @@ -46,7 +45,6 @@ default="OR.Execution", help="Space-separated list of OAuth scopes to request (e.g., 'OR.Execution OR.Queues'). 
Defaults to 'OR.Execution'", ) -@track def auth( environment: str, force: bool = False, diff --git a/src/uipath/_cli/cli_debug.py b/src/uipath/_cli/cli_debug.py index d3a329c49..5bc942d79 100644 --- a/src/uipath/_cli/cli_debug.py +++ b/src/uipath/_cli/cli_debug.py @@ -1,7 +1,6 @@ # type: ignore import asyncio import os -from os import environ as env from typing import Optional import click @@ -12,16 +11,9 @@ from uipath._utils._bindings import ResourceOverwritesContext from uipath.tracing import LlmOpsHttpExporter -from .._utils.constants import ( - ENV_JOB_ID, -) -from ..telemetry import track from ._debug._bridge import UiPathDebugBridge, get_debug_bridge from ._debug._runtime import UiPathDebugRuntime -from ._runtime._contracts import ( - UiPathRuntimeContext, - UiPathRuntimeFactory, -) +from ._runtime._contracts import UiPathRuntimeContext, UiPathRuntimeFactory from ._runtime._runtime import UiPathScriptRuntime from ._utils._console import ConsoleLogger from .middlewares import Middlewares @@ -63,7 +55,6 @@ default=5678, help="Port for the debug server (default: 5678)", ) -@track(when=lambda *_a, **_kw: env.get(ENV_JOB_ID) is None) def debug( entrypoint: Optional[str], input: Optional[str], diff --git a/src/uipath/_cli/cli_deploy.py b/src/uipath/_cli/cli_deploy.py index fdbf44e7d..377417d40 100644 --- a/src/uipath/_cli/cli_deploy.py +++ b/src/uipath/_cli/cli_deploy.py @@ -1,7 +1,6 @@ # type: ignore import click -from ..telemetry import track from .cli_pack import pack from .cli_publish import publish @@ -22,7 +21,6 @@ help="Whether to publish to the personal workspace", ) @click.argument("root", type=str, default="./") -@track def deploy(root, feed): """Pack and publish the project.""" ctx = click.get_current_context() diff --git a/src/uipath/_cli/cli_dev.py b/src/uipath/_cli/cli_dev.py index 3ff1fa275..ef15ba8f9 100644 --- a/src/uipath/_cli/cli_dev.py +++ b/src/uipath/_cli/cli_dev.py @@ -5,13 +5,13 @@ import click from uipath._cli._dev._terminal import UiPathDevTerminal -from uipath._cli._runtime._contracts import UiPathRuntimeContext, UiPathRuntimeFactory +from uipath._cli._runtime._contracts import (UiPathRuntimeContext, + UiPathRuntimeFactory) from uipath._cli._runtime._runtime import UiPathScriptRuntime from uipath._cli._utils._console import ConsoleLogger from uipath._cli._utils._debug import setup_debugging from uipath._cli.cli_init import init # type: ignore[attr-defined] from uipath._cli.middlewares import Middlewares -from uipath.telemetry import track console = ConsoleLogger() @@ -29,7 +29,6 @@ default=5678, help="Port for the debug server (default: 5678)", ) -@track def dev(interface: Optional[str], debug: bool, debug_port: int) -> None: """Launch interactive debugging interface.""" project_file = os.path.join(os.getcwd(), "uipath.json") diff --git a/src/uipath/_cli/cli_eval.py b/src/uipath/_cli/cli_eval.py index 5db1e1722..11501433a 100644 --- a/src/uipath/_cli/cli_eval.py +++ b/src/uipath/_cli/cli_eval.py @@ -8,9 +8,7 @@ from uipath._cli._evals._console_progress_reporter import ConsoleProgressReporter from uipath._cli._evals._evaluate import evaluate from uipath._cli._evals._progress_reporter import StudioWebProgressReporter -from uipath._cli._evals._runtime import ( - UiPathEvalContext, -) +from uipath._cli._evals._runtime import UiPathEvalContext from uipath._cli._runtime._runtime_factory import generate_runtime_factory from uipath._cli._utils._folders import get_personal_workspace_key_async from uipath._cli._utils._studio_project import StudioClient @@ -21,8 +19,6 @@ 
from uipath.eval._helpers import auto_discover_entrypoint from uipath.tracing import LlmOpsHttpExporter -from .._utils.constants import ENV_JOB_ID -from ..telemetry import track from ._utils._console import ConsoleLogger from ._utils._eval_set import EvalHelpers @@ -83,12 +79,11 @@ def setup_reporting_prereq(no_report: bool) -> bool: help="File path where the output will be written", ) @click.option( - "--enable-mocker-cache", + "--verbose", is_flag=True, + help="Enable verbose debug output for evaluators", default=False, - help="Enable caching for LLM mocker responses", ) -@track(when=lambda *_a, **_kw: os.getenv(ENV_JOB_ID) is None) def eval( entrypoint: Optional[str], eval_set: Optional[str], @@ -97,7 +92,7 @@ def eval( no_report: bool, workers: int, output_file: Optional[str], - enable_mocker_cache: bool, + verbose: bool, ) -> None: """Run an evaluation set against the agent. @@ -108,8 +103,16 @@ def eval( eval_set_run_id: Custom evaluation set run ID (optional, will generate UUID if not specified) workers: Number of parallel workers for running evaluations no_report: Do not report the evaluation results - enable_mocker_cache: Enable caching for LLM mocker responses + verbose: Enable verbose debug output for evaluators """ + # Configure logging level for evaluators if verbose is enabled + if verbose: + import logging + + logging.basicConfig(level=logging.DEBUG, format="%(message)s") + # Set the evaluators logger to DEBUG + logging.getLogger("uipath.eval.evaluators").setLevel(logging.DEBUG) + context_args = { "entrypoint": entrypoint or auto_discover_entrypoint(), "eval_set": eval_set, @@ -118,7 +121,6 @@ def eval( "workers": workers, "no_report": no_report, "output_file": output_file, - "enable_mocker_cache": enable_mocker_cache, } should_register_progress_reporter = setup_reporting_prereq(no_report) @@ -152,7 +154,6 @@ def eval( eval_context.no_report = no_report eval_context.workers = workers eval_context.eval_set_run_id = eval_set_run_id - eval_context.enable_mocker_cache = enable_mocker_cache # Load eval set to resolve the path eval_set_path = eval_set or EvalHelpers.auto_discover_eval_set() @@ -195,3 +196,4 @@ async def execute_eval(): if __name__ == "__main__": eval() + eval() diff --git a/src/uipath/_cli/cli_init.py b/src/uipath/_cli/cli_init.py index 9811023a4..72600e48a 100644 --- a/src/uipath/_cli/cli_init.py +++ b/src/uipath/_cli/cli_init.py @@ -12,7 +12,6 @@ from .._config import UiPathConfig from .._utils.constants import ENV_TELEMETRY_ENABLED -from ..telemetry import track from ..telemetry._constants import _PROJECT_KEY, _TELEMETRY_CONFIG_FILE from ._runtime._runtime import get_user_script from ._runtime._runtime_factory import generate_runtime_factory @@ -180,7 +179,6 @@ def write_config_file(config_data: Dict[str, Any] | RuntimeSchema) -> None: default=False, help="Won't override existing .agent files and AGENTS.md file.", ) -@track def init(entrypoint: str, infer_bindings: bool, no_agents_md_override: bool) -> None: """Create uipath.json with input/output schemas and bindings.""" with console.spinner("Initializing UiPath project ..."): diff --git a/src/uipath/_cli/cli_invoke.py b/src/uipath/_cli/cli_invoke.py index 14fef6ebb..72e111703 100644 --- a/src/uipath/_cli/cli_invoke.py +++ b/src/uipath/_cli/cli_invoke.py @@ -15,7 +15,6 @@ import tomli as tomllib from .._utils._ssl_context import get_httpx_client_kwargs -from ..telemetry import track from ._utils._common import get_env_vars from ._utils._folders import get_personal_workspace_info_async from ._utils._processes 
import get_release_info @@ -51,7 +50,6 @@ def _read_project_details() -> [str, str]: type=click.Path(exists=True), help="File path for the .json input", ) -@track def invoke( entrypoint: Optional[str], input: Optional[str], file: Optional[str] ) -> None: diff --git a/src/uipath/_cli/cli_new.py b/src/uipath/_cli/cli_new.py index a5088a3cd..c0e96dc21 100644 --- a/src/uipath/_cli/cli_new.py +++ b/src/uipath/_cli/cli_new.py @@ -4,7 +4,6 @@ import click -from ..telemetry import track from ._utils._console import ConsoleLogger from .middlewares import Middlewares @@ -39,7 +38,6 @@ def generate_pyproject(target_directory, project_name): @click.command() @click.argument("name", type=str, default="") -@track def new(name: str): """Generate a quick-start project.""" directory = os.getcwd() diff --git a/src/uipath/_cli/cli_pack.py b/src/uipath/_cli/cli_pack.py index 276233def..39977855f 100644 --- a/src/uipath/_cli/cli_pack.py +++ b/src/uipath/_cli/cli_pack.py @@ -11,7 +11,6 @@ from uipath._cli.models.runtime_schema import Bindings, RuntimeSchema from uipath._config import UiPathConfig -from ..telemetry import track from ..telemetry._constants import _PROJECT_KEY, _TELEMETRY_CONFIG_FILE from ._utils._console import ConsoleLogger from ._utils._project_files import ( @@ -311,7 +310,6 @@ def display_project_info(config): is_flag=True, help="Skip running uv lock and exclude uv.lock from the package", ) -@track def pack(root, nolock): """Pack the project.""" version = get_project_version(root) diff --git a/src/uipath/_cli/cli_publish.py b/src/uipath/_cli/cli_publish.py index cecfd143c..825de68a4 100644 --- a/src/uipath/_cli/cli_publish.py +++ b/src/uipath/_cli/cli_publish.py @@ -7,7 +7,6 @@ import httpx from .._utils._ssl_context import get_httpx_client_kwargs -from ..telemetry import track from ._utils._common import get_env_vars from ._utils._console import ConsoleLogger from ._utils._folders import get_personal_workspace_info_async @@ -67,7 +66,6 @@ def get_available_feeds( flag_value="personal", help="Whether to publish to the personal workspace", ) -@track def publish(feed): """Publish the package.""" [base_url, token] = get_env_vars() diff --git a/src/uipath/_cli/cli_pull.py b/src/uipath/_cli/cli_pull.py index 2a2d7f63a..4f7be1499 100644 --- a/src/uipath/_cli/cli_pull.py +++ b/src/uipath/_cli/cli_pull.py @@ -16,7 +16,6 @@ import click from .._config import UiPathConfig -from ..telemetry import track from ._utils._console import ConsoleLogger from ._utils._constants import EVALS_DIRECTORY_NAME from ._utils._project_files import ( @@ -34,7 +33,6 @@ type=click.Path(exists=False, file_okay=False, dir_okay=True, path_type=Path), default=Path("."), ) -@track def pull(root: Path) -> None: """Pull remote project files from Studio Web Project. diff --git a/src/uipath/_cli/cli_push.py b/src/uipath/_cli/cli_push.py index afeffe84d..6b3688a7e 100644 --- a/src/uipath/_cli/cli_push.py +++ b/src/uipath/_cli/cli_push.py @@ -8,7 +8,6 @@ from uipath.models.exceptions import EnrichedException from .._config import UiPathConfig -from ..telemetry import track from ._push.sw_file_handler import FileOperationUpdate, SwFileHandler from ._utils._console import ConsoleLogger from ._utils._project_files import ( @@ -71,7 +70,6 @@ async def upload_source_files_to_project( is_flag=True, help="Skip running uv lock and exclude uv.lock from the package", ) -@track def push(root: str, nolock: bool) -> None: """Push local project files to Studio Web Project. 
diff --git a/src/uipath/_cli/cli_register.py b/src/uipath/_cli/cli_register.py index f18e23470..b371e5982 100644 --- a/src/uipath/_cli/cli_register.py +++ b/src/uipath/_cli/cli_register.py @@ -3,7 +3,6 @@ import click -from ..telemetry import track from ._evals._helpers import register_evaluator from ._utils._console import ConsoleLogger from ._utils._resources import Resources @@ -15,7 +14,6 @@ @click.command() @click.argument("resource", required=True) @click.argument("args", nargs=-1) -@track def register(resource: str, args: tuple[str]) -> None: """Register a local resource. diff --git a/src/uipath/_cli/cli_run.py b/src/uipath/_cli/cli_run.py index 2f8cf0571..e50815cfe 100644 --- a/src/uipath/_cli/cli_run.py +++ b/src/uipath/_cli/cli_run.py @@ -1,28 +1,443 @@ # type: ignore import asyncio +import json import os -from os import environ as env -from typing import Optional +import uuid +from datetime import datetime +from typing import Optional, Sequence import click +from opentelemetry.sdk.trace import ReadableSpan +from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult from uipath._cli._runtime._runtime_factory import generate_runtime_factory from uipath._cli._utils._common import read_resource_overwrites_from_file from uipath._cli._utils._debug import setup_debugging from uipath._utils._bindings import ResourceOverwritesContext from uipath.tracing import JsonLinesFileExporter, LlmOpsHttpExporter +from uipath.tracing._utils import _SpanUtils -from .._utils.constants import ( - ENV_JOB_ID, -) -from ..telemetry import track from ._runtime._contracts import UiPathRuntimeError from ._utils._console import ConsoleLogger from .middlewares import Middlewares +# Import LangChain instrumentor for automatic span generation +try: + from openinference.instrumentation.langchain import ( + LangChainInstrumentor, + get_current_span, + ) + + LANGCHAIN_INSTRUMENTATION_AVAILABLE = True +except ImportError: + LANGCHAIN_INSTRUMENTATION_AVAILABLE = False + console = ConsoleLogger() +class MemorySpanExporter(SpanExporter): + """Span exporter that collects spans in memory for later processing.""" + + def __init__(self): + self.spans = [] + + def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult: + """Export spans to memory.""" + try: + for span in spans: + uipath_span = _SpanUtils.otel_span_to_uipath_span( + span, serialize_attributes=True + ) + self.spans.append(uipath_span.to_dict(serialize_attributes=False)) + return SpanExportResult.SUCCESS + except Exception: + return SpanExportResult.FAILURE + + def shutdown(self) -> None: + """Shutdown the exporter.""" + pass + + def force_flush(self, timeout_millis: int = 30000) -> bool: + """Force flush any buffered spans.""" + return True + + +def _generate_evaluation_set( + input_data: str, + output_data: str, + entrypoint: str, + eval_set_path: str, + evaluators: list[str] = None, + spans: list[dict] = None, +) -> None: + """Generate an evaluation set JSON file from a run execution. 
+ + Args: + input_data: The input data used for the run (as JSON string) + output_data: The output data from the run (as JSON string) + entrypoint: Path to the agent script + eval_set_path: Path where the evaluation set JSON file will be saved + evaluators: List of evaluator names to use (e.g., ['json_similarity', 'exact_match']) + spans: Optional list of span dictionaries containing node execution data + """ + try: + # Use json_similarity as default if no evaluators specified + if not evaluators: + evaluators = ["json_similarity"] + + # Create the directory structure for eval sets and evaluators + eval_set_file = os.path.abspath(eval_set_path) + eval_set_dir = os.path.dirname(eval_set_file) + + # If not already in an eval-sets dir, create proper structure + if not eval_set_dir.endswith("eval-sets"): + eval_set_dir = os.path.join(eval_set_dir, "evals", "eval-sets") + eval_set_file = os.path.join(eval_set_dir, os.path.basename(eval_set_path)) + + os.makedirs(eval_set_dir, exist_ok=True) + + # Create evaluators directory at the sibling level + evaluators_dir = os.path.join(os.path.dirname(eval_set_dir), "evaluators") + os.makedirs(evaluators_dir, exist_ok=True) + # Parse input and output + try: + parsed_input = json.loads(input_data) if input_data else {} + except (json.JSONDecodeError, TypeError): + # If input_data is already a dict or not JSON, handle it + if isinstance(input_data, dict): + parsed_input = input_data + else: + parsed_input = {"raw_input": str(input_data)} + + try: + # Handle output_data which might be a string, dict, or other object + if isinstance(output_data, str): + parsed_output = json.loads(output_data) + elif isinstance(output_data, dict): + parsed_output = output_data + else: + # For other types, try to convert to dict + parsed_output = json.loads(str(output_data)) + except (json.JSONDecodeError, TypeError): + parsed_output = {"raw_output": str(output_data)} + + # Generate unique IDs + eval_id = str(uuid.uuid4()) + timestamp = datetime.utcnow().isoformat() + "Z" + + # Build evaluation criteria and create evaluator files + evaluation_criteria = {} + evaluator_refs = [] + + # Evaluator type mapping (supports both short names and full type IDs) + evaluator_type_map = { + "json_similarity": { + "name": "JsonSimilarityEvaluator", + "evaluatorTypeId": "uipath-json-similarity", + "config_defaults": {"name": "JsonSimilarityEvaluator"}, + }, + "uipath-json-similarity": { + "name": "JsonSimilarityEvaluator", + "evaluatorTypeId": "uipath-json-similarity", + "config_defaults": {"name": "JsonSimilarityEvaluator"}, + }, + "exact_match": { + "name": "ExactMatchEvaluator", + "evaluatorTypeId": "uipath-exact-match", + "config_defaults": { + "name": "ExactMatchEvaluator", + "case_sensitive": False, + }, + }, + "uipath-exact-match": { + "name": "ExactMatchEvaluator", + "evaluatorTypeId": "uipath-exact-match", + "config_defaults": { + "name": "ExactMatchEvaluator", + "case_sensitive": False, + }, + }, + "contains": { + "name": "ContainsEvaluator", + "evaluatorTypeId": "uipath-contains", + "config_defaults": {"name": "ContainsEvaluator"}, + }, + "uipath-contains": { + "name": "ContainsEvaluator", + "evaluatorTypeId": "uipath-contains", + "config_defaults": {"name": "ContainsEvaluator"}, + }, + "llm_judge": { + "name": "LLMJudgeOutputEvaluator", + "evaluatorTypeId": "uipath-llm-judge-output-semantic-similarity", + "config_defaults": { + "name": "LLMJudgeOutputEvaluator", + "model": "anthropic.claude-3-5-sonnet-20240620-v1:0", + }, + }, + "uipath-llm-judge-output-semantic-similarity": { + 
"name": "LLMJudgeOutputEvaluator", + "evaluatorTypeId": "uipath-llm-judge-output-semantic-similarity", + "config_defaults": { + "name": "LLMJudgeOutputEvaluator", + "model": "anthropic.claude-3-5-sonnet-20240620-v1:0", + }, + }, + "llm_judge_strict_json": { + "name": "LLMJudgeStrictJSONSimilarityOutputEvaluator", + "evaluatorTypeId": "uipath-llm-judge-output-strict-json-similarity", + "config_defaults": { + "name": "LLMJudgeStrictJSONSimilarityOutputEvaluator", + "model": "anthropic.claude-3-5-sonnet-20240620-v1:0", + }, + }, + "uipath-llm-judge-output-strict-json-similarity": { + "name": "LLMJudgeStrictJSONSimilarityOutputEvaluator", + "evaluatorTypeId": "uipath-llm-judge-output-strict-json-similarity", + "config_defaults": { + "name": "LLMJudgeStrictJSONSimilarityOutputEvaluator", + "model": "anthropic.claude-3-5-sonnet-20240620-v1:0", + }, + }, + "llm_judge_trajectory": { + "name": "LLMJudgeTrajectoryEvaluator", + "evaluatorTypeId": "uipath-llm-judge-trajectory", + "config_defaults": { + "name": "LLMJudgeTrajectoryEvaluator", + "model": "anthropic.claude-3-5-sonnet-20240620-v1:0", + }, + }, + "uipath-llm-judge-trajectory": { + "name": "LLMJudgeTrajectoryEvaluator", + "evaluatorTypeId": "uipath-llm-judge-trajectory", + "config_defaults": { + "name": "LLMJudgeTrajectoryEvaluator", + "model": "anthropic.claude-3-5-sonnet-20240620-v1:0", + }, + }, + "llm_judge_trajectory_simulation": { + "name": "LLMJudgeTrajectorySimulationEvaluator", + "evaluatorTypeId": "uipath-llm-judge-trajectory-simulation", + "config_defaults": { + "name": "LLMJudgeTrajectorySimulationEvaluator", + "model": "anthropic.claude-3-5-sonnet-20240620-v1:0", + }, + }, + "uipath-llm-judge-trajectory-simulation": { + "name": "LLMJudgeTrajectorySimulationEvaluator", + "evaluatorTypeId": "uipath-llm-judge-trajectory-simulation", + "config_defaults": { + "name": "LLMJudgeTrajectorySimulationEvaluator", + "model": "anthropic.claude-3-5-sonnet-20240620-v1:0", + }, + }, + "tool_call_args": { + "name": "ToolCallArgsEvaluator", + "evaluatorTypeId": "uipath-tool-call-args", + "config_defaults": {"name": "ToolCallArgsEvaluator"}, + }, + "uipath-tool-call-args": { + "name": "ToolCallArgsEvaluator", + "evaluatorTypeId": "uipath-tool-call-args", + "config_defaults": {"name": "ToolCallArgsEvaluator"}, + }, + "tool_call_count": { + "name": "ToolCallCountEvaluator", + "evaluatorTypeId": "uipath-tool-call-count", + "config_defaults": {"name": "ToolCallCountEvaluator"}, + }, + "uipath-tool-call-count": { + "name": "ToolCallCountEvaluator", + "evaluatorTypeId": "uipath-tool-call-count", + "config_defaults": {"name": "ToolCallCountEvaluator"}, + }, + "tool_call_order": { + "name": "ToolCallOrderEvaluator", + "evaluatorTypeId": "uipath-tool-call-order", + "config_defaults": {"name": "ToolCallOrderEvaluator"}, + }, + "uipath-tool-call-order": { + "name": "ToolCallOrderEvaluator", + "evaluatorTypeId": "uipath-tool-call-order", + "config_defaults": {"name": "ToolCallOrderEvaluator"}, + }, + "tool_call_output": { + "name": "ToolCallOutputEvaluator", + "evaluatorTypeId": "uipath-tool-call-output", + "config_defaults": {"name": "ToolCallOutputEvaluator"}, + }, + "uipath-tool-call-output": { + "name": "ToolCallOutputEvaluator", + "evaluatorTypeId": "uipath-tool-call-output", + "config_defaults": {"name": "ToolCallOutputEvaluator"}, + }, + } + + for evaluator_name in evaluators: + if evaluator_name not in evaluator_type_map: + console.warning(f"Unknown evaluator '{evaluator_name}', skipping") + continue + + evaluator_info = 
evaluator_type_map[evaluator_name] + evaluator_id = str(uuid.uuid4()) + evaluator_refs.append(evaluator_id) + + # Create evaluator JSON file + evaluator_def = { + "id": evaluator_id, + "name": f"{evaluator_info['name']} (auto-generated)", + "version": "1.0", + "evaluatorTypeId": evaluator_info["evaluatorTypeId"], + "evaluatorConfig": evaluator_info["config_defaults"], + } + + evaluator_file = os.path.join( + evaluators_dir, f"{evaluator_name}-{evaluator_id[:8]}.json" + ) + with open(evaluator_file, "w") as f: + json.dump(evaluator_def, f, indent=2) + + # Add evaluation criteria for this eval item (keyed by evaluator ID) + evaluation_criteria[evaluator_id] = { + "expected_output": parsed_output, + } + + # Create evaluation items + evaluation_items = [] + + # If spans are provided, create per-node evaluations + if spans: + # Filter spans to only include workflow nodes + node_spans = {} + node_order = [] # Track order of nodes + + for span in spans: + # First try to get the span name from the Name field (UiPath format) + span_name = span.get("Name", span.get("name", "")) + attributes = span.get("Attributes", span.get("attributes", {})) + + # Parse attributes if they're a JSON string + if isinstance(attributes, str): + try: + attributes = json.loads(attributes) + except: + attributes = {} + + # Determine the node name from various possible sources + node_name = None + if isinstance(attributes, dict): + node_name = attributes.get( + "node_name", attributes.get("langgraph.node", None) + ) + + # If no node_name attribute, use the span Name as the node name + if not node_name and span_name: + node_name = span_name + + # Only include valid workflow nodes (exclude system nodes, internal components, and LLM calls) + if ( + node_name + and node_name not in ["__start__", "__end__"] + and not any( + node_name.startswith(prefix) + for prefix in ["Runnable", "UiPath", "JsonOutput"] + ) + ): + if node_name not in node_spans: + node_spans[node_name] = [] + node_order.append(node_name) + node_spans[node_name].append(span) + + if node_spans: + console.info( + f"Found {len(node_spans)} workflow node(s) for evaluation generation" + ) + + # Create evaluation for each node in execution order + for node_name in node_order: + node_span_list = node_spans[node_name] + # Get the most recent span for this node + node_span = node_span_list[-1] + node_attributes = node_span.get( + "Attributes", node_span.get("attributes", {}) + ) + + # Parse attributes if they're a JSON string + if isinstance(node_attributes, str): + try: + node_attributes = json.loads(node_attributes) + except: + node_attributes = {} + + # Try different output keys: output.value, output, outputs + node_output = node_attributes.get( + "output.value", + node_attributes.get( + "output", node_attributes.get("outputs", None) + ), + ) + if isinstance(node_output, str): + try: + node_output = json.loads(node_output) + except: + pass + + if node_output: + # Create node-specific evaluation + node_eval_id = str(uuid.uuid4()) + node_evaluation_criteria = {} + + # Add evaluation criteria for each evaluator with node output + for evaluator_id in evaluator_refs: + node_evaluation_criteria[evaluator_id] = { + "expected_output": node_output, + } + + evaluation_items.append( + { + "id": node_eval_id, + "name": f"Node: {node_name}", + "inputs": parsed_input, # Use agent input, not node-specific input + "evaluationCriterias": node_evaluation_criteria, + "expectedAgentBehavior": f"The agent should execute node '{node_name}' and produce the expected output during the 
workflow execution.", + "nodeId": node_name, # Add node identifier for evaluators to match against trace + } + ) + + # Always include final output evaluation + evaluation_item = { + "id": eval_id, + "name": f"Final Output", + "inputs": parsed_input, + "evaluationCriterias": evaluation_criteria, + "expectedAgentBehavior": "Agent should produce the expected output for the given input", + } + evaluation_items.append(evaluation_item) + + # Create evaluation set + eval_set = { + "id": str(uuid.uuid4()), + "name": f"Evaluation set generated from {entrypoint}", + "version": "1.0", + "evaluatorRefs": evaluator_refs, + "evaluations": evaluation_items, + } + + # Save eval set to file + with open(eval_set_file, "w") as f: + json.dump(eval_set, f, indent=2) + + console.success(f"Evaluation set generated and saved to: {eval_set_file}") + console.info( + f"Generated {len(evaluation_items)} evaluation(s) with {len(evaluator_refs)} evaluator(s) in: {evaluators_dir}" + ) + + except Exception as e: + console.error( + f"Failed to generate evaluation set: {str(e)}", include_traceback=True + ) + + @click.command() @click.argument("entrypoint", required=False) @click.argument("input", required=False, default="{}") @@ -43,8 +458,8 @@ @click.option( "--output-file", required=False, - type=click.Path(exists=False), - help="File path where the output will be written", + type=click.Path(), + help="File path where the output will be written (will overwrite if exists)", ) @click.option( "--trace-file", @@ -63,7 +478,18 @@ default=5678, help="Port for the debug server (default: 5678)", ) -@track(when=lambda *_a, **_kw: env.get(ENV_JOB_ID) is None) +@click.option( + "--generate-evals", + required=False, + type=click.Path(), + help="Generate an evaluation set file from this run and save it to the specified path (will overwrite if exists)", +) +@click.option( + "--eval-evaluators", + multiple=True, + default=["json_similarity"], + help="Evaluators to use for generated eval set (can be specified multiple times). Available: json_similarity, exact_match, contains, llm_judge, llm_judge_strict_json, llm_judge_trajectory, llm_judge_trajectory_simulation, tool_call_args, tool_call_count, tool_call_order, tool_call_output. 
You can also use full type IDs like 'uipath-json-similarity'.", +) def run( entrypoint: Optional[str], input: Optional[str], @@ -74,6 +500,8 @@ def run( trace_file: Optional[str], debug: bool, debug_port: int, + generate_evals: Optional[str], + eval_evaluators: tuple[str], ) -> None: """Execute the project.""" context_args = { @@ -84,6 +512,9 @@ def run( "execution_output_file": output_file, "trace_file": trace_file, "debug": debug, + "generate_evals": generate_evals, + # Enable tracing if we're generating evals to capture node data + "tracing_enabled": True if generate_evals else None, } input_file = file or input_file # Setup debugging if requested @@ -115,8 +546,11 @@ def run( Usage: `uipath run [-f ]`""") try: + execution_result = None + memory_span_exporter = None async def execute() -> None: + nonlocal execution_result, memory_span_exporter runtime_factory = generate_runtime_factory() context = runtime_factory.new_context(**context_args) if context.job_id: @@ -125,6 +559,18 @@ async def execute() -> None: if trace_file: runtime_factory.add_span_exporter(JsonLinesFileExporter(trace_file)) + # Add memory span exporter if generating evals to capture node-level data + # Use batch=False to ensure immediate export of spans + if generate_evals: + memory_span_exporter = MemorySpanExporter() + runtime_factory.add_span_exporter(memory_span_exporter, batch=False) + + # Add LangChain instrumentor to automatically trace LangChain/LangGraph operations + if LANGCHAIN_INSTRUMENTATION_AVAILABLE: + runtime_factory.add_instrumentor( + LangChainInstrumentor, get_current_span + ) + if context.job_id: async with ResourceOverwritesContext( lambda: read_resource_overwrites_from_file(context.runtime_dir) @@ -133,15 +579,64 @@ async def execute() -> None: f"Applied {ctx.overwrites_count} resource overwrite(s)" ) - result = await runtime_factory.execute(context) + execution_result = await runtime_factory.execute(context) else: - result = await runtime_factory.execute(context) + execution_result = await runtime_factory.execute(context) if not context.job_id: - console.info(result.output) + console.info(execution_result.output) asyncio.run(execute()) + # Generate evaluation set if requested + if generate_evals and execution_result: + # Get the actual input data (from file or argument) + actual_input = input + if input_file and os.path.exists(input_file): + try: + with open(input_file, "r") as f: + actual_input = f.read() + except Exception as e: + console.warning( + f"Failed to read input file for eval generation: {e}" + ) + + # Convert output to proper format for eval generation + output_for_eval = ( + execution_result.output + if hasattr(execution_result, "output") + else execution_result + ) + + # If output is a Pydantic model, convert to dict + if hasattr(output_for_eval, "model_dump"): + output_for_eval = output_for_eval.model_dump() + elif hasattr(output_for_eval, "dict"): + output_for_eval = output_for_eval.dict() + # If it's already a dict, ensure it's not wrapped + elif isinstance(output_for_eval, dict) and "dict" in output_for_eval: + # Unwrap if it's in the format {"dict": "..."} + try: + import ast + + output_for_eval = ast.literal_eval(output_for_eval["dict"]) + except: + pass # Keep as-is if parsing fails + + # Get spans from memory exporter if available + collected_spans = ( + memory_span_exporter.spans if memory_span_exporter else None + ) + + _generate_evaluation_set( + input_data=actual_input, + output_data=output_for_eval, + entrypoint=entrypoint, + eval_set_path=generate_evals, + 
+                evaluators=list(eval_evaluators) if eval_evaluators else None,
+                spans=collected_spans,
+            )
+
     except UiPathRuntimeError as e:
         console.error(f"{e.error_info.title} - {e.error_info.detail}")
     except Exception as e:
diff --git a/src/uipath/_events/_events.py b/src/uipath/_events/_events.py
index 486c48aaf..44c6ba8e7 100644
--- a/src/uipath/_events/_events.py
+++ b/src/uipath/_events/_events.py
@@ -5,8 +5,7 @@
 from opentelemetry.sdk.trace import ReadableSpan
 from pydantic import BaseModel, ConfigDict, Field, SkipValidation, model_validator
 
-from uipath._cli._evals._models._evaluation_set import EvaluationItem
-from uipath.eval.evaluators import BaseEvaluator
+from uipath._cli._evals._models._evaluation_set import AnyEvaluationItem, AnyEvaluator
 from uipath.eval.models import EvalItemResult
 
 
@@ -24,12 +23,13 @@ class EvalSetRunCreatedEvent(BaseModel):
     eval_set_run_id: Optional[str] = None
     no_of_evals: int
     # skip validation to avoid abstract class instantiation
-    evaluators: SkipValidation[List[BaseEvaluator[Any, Any, Any]]]
+    evaluators: SkipValidation[List[AnyEvaluator]]
+    evaluator_weights: Optional[Dict[str, float]] = None
 
 
 class EvalRunCreatedEvent(BaseModel):
     execution_id: str
-    eval_item: EvaluationItem
+    eval_item: AnyEvaluationItem
 
 
 class EvalItemExceptionDetails(BaseModel):
@@ -43,7 +43,7 @@ class EvalRunUpdatedEvent(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
     execution_id: str
-    eval_item: EvaluationItem
+    eval_item: AnyEvaluationItem
     eval_results: List[EvalItemResult]
     success: bool
     agent_output: Any
@@ -62,6 +62,8 @@ def validate_exception_details(self):
 class EvalSetRunUpdatedEvent(BaseModel):
     execution_id: str
     evaluator_scores: dict[str, float]
+    weighted_final_score: Optional[float] = None
+    evaluator_weights: Optional[Dict[str, float]] = None
 
 
 ProgressEvent = Union[
diff --git a/src/uipath/_services/context_grounding_service.py b/src/uipath/_services/context_grounding_service.py
index fdb706a38..1fe0a63a3 100644
--- a/src/uipath/_services/context_grounding_service.py
+++ b/src/uipath/_services/context_grounding_service.py
@@ -20,7 +20,7 @@
 from ..models import IngestionInProgressException
 from ..models.context_grounding import ContextGroundingQueryResponse
 from ..models.context_grounding_index import ContextGroundingIndex
-from ..tracing import traced
+from ..tracing._traced import traced
 from ._base_service import BaseService
 from .buckets_service import BucketsService
 from .folder_service import FolderService
diff --git a/src/uipath/eval/_helpers/evaluators_helpers.py b/src/uipath/eval/_helpers/evaluators_helpers.py
index 8620130cf..3bfb9573f 100644
--- a/src/uipath/eval/_helpers/evaluators_helpers.py
+++ b/src/uipath/eval/_helpers/evaluators_helpers.py
@@ -6,10 +6,7 @@
 
 from opentelemetry.sdk.trace import ReadableSpan
 
-from ..models import (
-    ToolCall,
-    ToolOutput,
-)
+from ..models import ToolCall, ToolOutput
 
 COMPARATOR_MAPPINGS = {
     ">": "gt",
@@ -420,6 +417,47 @@ def tool_calls_output_score(
     ), justifications
 
 
+def extract_node_output_from_trace(
+    agent_trace: Sequence[ReadableSpan], node_id: str
+) -> Any:
+    """Extract the output of a specific node from the agent execution trace.
+
+    Args:
+        agent_trace: List of ReadableSpan objects from agent execution.
+        node_id: The identifier of the node to extract output from.
+
+    Returns:
+        The output value of the node, or None if not found.
+    """
+    for span in agent_trace:
+        if not span.attributes:
+            continue
+
+        # Check if this span matches the node_id
+        span_name = span.name
+        node_name_attr = span.attributes.get("node_name") or span.attributes.get(
+            "langgraph.node"
+        )
+
+        # Match by span name or node_name attribute
+        if span_name == node_id or node_name_attr == node_id:
+            # Extract output from span attributes
+            output_value = span.attributes.get("output.value") or span.attributes.get(
+                "output"
+            )
+
+            # Try to parse if it's a JSON string
+            if isinstance(output_value, str):
+                try:
+                    return json.loads(output_value)
+                except (json.JSONDecodeError, ValueError):
+                    return output_value
+
+            return output_value
+
+    return None
+
+
 def trace_to_str(agent_trace: Sequence[ReadableSpan]) -> str:
     """Convert OTEL spans to a platform-style agent run history string.
diff --git a/src/uipath/eval/models/models.py b/src/uipath/eval/models/models.py
index f3e9e3ca9..5ed4805ea 100644
--- a/src/uipath/eval/models/models.py
+++ b/src/uipath/eval/models/models.py
@@ -39,7 +39,7 @@ class ScoreType(IntEnum):
 class BaseEvaluationResult(BaseModel):
     """Base class for evaluation results."""
 
-    details: Optional[str | BaseModel] = None
+    details: Optional[str | Dict[str, Any] | BaseModel] = None
     # this is marked as optional, as it is populated inside the 'measure_execution_time' decorator
     evaluation_time: Optional[float] = None