diff --git a/.claude/settings.local.json b/.claude/settings.local.json new file mode 100644 index 000000000..8cbde18fd --- /dev/null +++ b/.claude/settings.local.json @@ -0,0 +1,11 @@ +{ + "permissions": { + "allow": [ + "Bash(printf:*)", + "Bash(\"C:\\work2\\README_EXPLORATION_RESULTS.md\")", + "Bash(cat:*)" + ], + "deny": [], + "ask": [] + } +} diff --git a/src/uipath/_cli/_evals/_console_progress_reporter.py b/src/uipath/_cli/_evals/_console_progress_reporter.py index 4e0d7a177..3b813e9a3 100644 --- a/src/uipath/_cli/_evals/_console_progress_reporter.py +++ b/src/uipath/_cli/_evals/_console_progress_reporter.py @@ -7,6 +7,7 @@ from rich.rule import Rule from rich.table import Table +from uipath._cli._evals._models._evaluation_set import AnyEvaluator from uipath._events._event_bus import EventBus from uipath._events._events import ( EvalRunCreatedEvent, @@ -15,7 +16,6 @@ EvalSetRunUpdatedEvent, EvaluationEvents, ) -from uipath.eval.evaluators import BaseEvaluator from uipath.eval.models import ScoreType logger = logging.getLogger(__name__) @@ -26,9 +26,10 @@ class ConsoleProgressReporter: def __init__(self): self.console = Console() - self.evaluators: Dict[str, BaseEvaluator[Any, Any, Any]] = {} + self.evaluators: Dict[str, AnyEvaluator] = {} self.display_started = False self.eval_results_by_name: Dict[str, list[Any]] = {} + self.evaluator_weights: Dict[str, float] = {} def _convert_score_to_numeric(self, eval_result) -> float: """Convert evaluation result score to numeric value.""" @@ -99,6 +100,8 @@ async def handle_create_eval_set_run(self, payload: EvalSetRunCreatedEvent) -> N """Handle evaluation set run creation.""" try: self.evaluators = {eval.id: eval for eval in payload.evaluators} + if payload.evaluator_weights: + self.evaluator_weights = payload.evaluator_weights except Exception as e: logger.error(f"Failed to handle create eval set run event: {e}") @@ -206,9 +209,20 @@ def display_final_results(self): summary_table.add_row(*row_values) - # Add separator row before average + # Add separator row before weights and average summary_table.add_section() + # Add weights row if weights are defined + if self.evaluator_weights: + weight_row_values = ["[bold]Weights[/bold]"] + for evaluator_id in evaluator_ids: + weight = self.evaluator_weights.get(evaluator_id, "-") + if weight != "-": + weight_row_values.append(f"[bold]{weight:.2f}[/bold]") + else: + weight_row_values.append("[bold]-[/bold]") + summary_table.add_row(*weight_row_values) + # Add average row avg_row_values = ["[bold]Average[/bold]"] for evaluator_id in evaluator_ids: @@ -217,8 +231,31 @@ def display_final_results(self): summary_table.add_row(*avg_row_values) - self.console.print(summary_table) - self.console.print() + # Calculate and display weighted final score if weights are defined + if self.evaluator_weights: + weighted_total = 0.0 + weights_sum = 0.0 + for evaluator_id in evaluator_ids: + weight = self.evaluator_weights.get(evaluator_id) + if weight is not None: + avg_score = self.final_results[evaluator_id] + weighted_total += weight * avg_score + weights_sum += weight + + # Display as a separate info line + self.console.print(summary_table) + self.console.print() + self.console.print( + f"[bold cyan]Weighted Final Score:[/bold cyan] [bold green]{weighted_total:.2f}[/bold green]" + ) + if weights_sum != 1.0: + self.console.print( + f"[dim](Note: Weights sum to {weights_sum:.2f})[/dim]" + ) + self.console.print() + else: + self.console.print(summary_table) + self.console.print() else: self.console.print( "→ [bold 
green]All evaluations completed successfully![/bold green]" diff --git a/src/uipath/_cli/_evals/_models/_evaluation_set.py b/src/uipath/_cli/_evals/_models/_evaluation_set.py index 31e87dd95..e81c46d77 100644 --- a/src/uipath/_cli/_evals/_models/_evaluation_set.py +++ b/src/uipath/_cli/_evals/_models/_evaluation_set.py @@ -1,9 +1,11 @@ from enum import Enum, IntEnum from typing import Annotated, Any, Dict, List, Literal, Optional, Union -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict, Discriminator, Field, Tag from pydantic.alias_generators import to_camel +from uipath.eval.evaluators import BaseEvaluator, LegacyBaseEvaluator + class EvaluationSimulationTool(BaseModel): name: str = Field(..., alias="name") @@ -158,6 +160,9 @@ class EvaluationSet(BaseModel): version: Literal["1.0"] = "1.0" evaluator_refs: List[str] = Field(default_factory=list) evaluations: List[EvaluationItem] = Field(default_factory=list) + evaluator_weights: Optional[Dict[str, float]] = Field( + default=None, alias="evaluatorWeights" + ) def extract_selected_evals(self, eval_ids) -> None: selected_evals: list[EvaluationItem] = [] @@ -214,3 +219,15 @@ def _discriminate_eval_set( if version == "1.0": return "evaluation_set" return "legacy_evaluation_set" + + +AnyEvaluationSet = Annotated[ + Union[ + Annotated[EvaluationSet, Tag("evaluation_set")], + Annotated[LegacyEvaluationSet, Tag("legacy_evaluation_set")], + ], + Discriminator(_discriminate_eval_set), +] + +AnyEvaluationItem = Union[EvaluationItem, LegacyEvaluationItem] +AnyEvaluator = Union[LegacyBaseEvaluator[Any], BaseEvaluator[Any, Any, Any]] diff --git a/src/uipath/_cli/_evals/_models/_output.py b/src/uipath/_cli/_evals/_models/_output.py index c3ba1c728..28aa7f42e 100644 --- a/src/uipath/_cli/_evals/_models/_output.py +++ b/src/uipath/_cli/_evals/_models/_output.py @@ -46,7 +46,7 @@ class EvaluationResultDto(BaseModel): model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) score: float - details: Optional[str | BaseModel] = None + details: Optional[str | Dict[str, Any] | BaseModel] = None evaluation_time: Optional[float] = None @model_serializer(mode="wrap") @@ -56,6 +56,7 @@ def serialize_model( info: core_schema.SerializationInfo, ) -> Any: data = serializer(self) + # Only remove details if it's None, keep empty dicts and populated dicts if self.details is None and isinstance(data, dict): data.pop("details", None) return data @@ -85,6 +86,8 @@ class EvaluationRunResultDto(BaseModel): evaluator_name: str evaluator_id: str + evaluator_type: Optional[str] = None + node_id: Optional[str] = None result: EvaluationResultDto @@ -93,6 +96,7 @@ class EvaluationRunResult(BaseModel): evaluation_name: str evaluation_run_results: List[EvaluationRunResultDto] + workflow: Optional[List[str]] = None agent_execution_output: Optional[UiPathSerializableEvalRunExecutionOutput] = None @property @@ -110,6 +114,8 @@ class UiPathEvalOutput(BaseModel): evaluation_set_name: str evaluation_set_results: List[EvaluationRunResult] + weighted_final_score: Optional[float] = None + evaluator_weights: Optional[Dict[str, float]] = None @property def score(self) -> float: diff --git a/src/uipath/_cli/_evals/_progress_reporter.py b/src/uipath/_cli/_evals/_progress_reporter.py index dbf305eaf..3dc23efb9 100644 --- a/src/uipath/_cli/_evals/_progress_reporter.py +++ b/src/uipath/_cli/_evals/_progress_reporter.py @@ -9,23 +9,21 @@ from urllib.parse import urlparse from opentelemetry import trace -from pydantic import BaseModel from 
rich.console import Console from uipath import UiPath from uipath._cli._evals._models._evaluation_set import ( + AnyEvaluationItem, + AnyEvaluator, EvaluationItem, EvaluationStatus, ) -from uipath._cli._evals._models._evaluator import Evaluator from uipath._cli._evals._models._sw_reporting import ( StudioWebAgentSnapshot, StudioWebProgressItem, ) from uipath._cli._utils._console import ConsoleLogger -from uipath._cli._utils._project_files import ( # type: ignore - get_project_config, -) +from uipath._cli._utils._project_files import get_project_config # type: ignore from uipath._events._event_bus import EventBus from uipath._events._events import ( EvalRunCreatedEvent, @@ -40,10 +38,7 @@ ENV_TENANT_ID, HEADER_INTERNAL_TENANT_ID, ) -from uipath.eval.evaluators import ( - BaseEvaluator, - LegacyBaseEvaluator, -) +from uipath.eval.evaluators import BaseEvaluator, LegacyBaseEvaluator from uipath.eval.models import EvalItemResult, ScoreType from uipath.tracing import LlmOpsHttpExporter @@ -136,9 +131,7 @@ def _get_endpoint_prefix(self) -> str: return "api/" return "agentsruntime_/api/" - def _is_coded_evaluator( - self, evaluators: List[BaseEvaluator[Any, Any, Any]] - ) -> bool: + def _is_coded_evaluator(self, evaluators: List[AnyEvaluator]) -> bool: """Check if evaluators are coded (BaseEvaluator) vs legacy (LegacyBaseEvaluator). Args: @@ -150,7 +143,7 @@ def _is_coded_evaluator( if not evaluators: return False # Check the first evaluator type - return not isinstance(evaluators[0], LegacyBaseEvaluator) + return isinstance(evaluators[0], BaseEvaluator) def _extract_usage_from_spans( self, spans: list[Any] @@ -240,7 +233,7 @@ async def create_eval_set_run_sw( @gracefully_handle_errors async def create_eval_run( - self, eval_item: EvaluationItem, eval_set_run_id: str, is_coded: bool = False + self, eval_item: AnyEvaluationItem, eval_set_run_id: str, is_coded: bool = False ) -> str: """Create a new evaluation run in StudioWeb. @@ -267,7 +260,7 @@ async def create_eval_run( async def update_eval_run( self, sw_progress_item: StudioWebProgressItem, - evaluators: dict[str, Evaluator], + evaluators: dict[str, AnyEvaluator], is_coded: bool = False, spans: list[Any] | None = None, ): @@ -334,10 +327,11 @@ async def update_eval_set_run( eval_set_run_id: str, evaluator_scores: dict[str, float], is_coded: bool = False, + weighted_final_score: float | None = None, ): """Update the evaluation set run status to complete.""" spec = self._update_eval_set_run_spec( - eval_set_run_id, evaluator_scores, is_coded + eval_set_run_id, evaluator_scores, is_coded, weighted_final_score ) await self._client.request_async( method=spec.method, @@ -457,6 +451,7 @@ async def handle_update_eval_set_run(self, payload: EvalSetRunUpdatedEvent) -> N eval_set_run_id, payload.evaluator_scores, is_coded=is_coded, + weighted_final_score=payload.weighted_final_score, ) logger.debug( f"Updated eval set run with ID: {eval_set_run_id} (coded={is_coded})" @@ -485,9 +480,7 @@ async def subscribe_to_eval_runtime_events(self, event_bus: EventBus) -> None: logger.debug("StudioWeb progress reporter subscribed to evaluation events") - def _serialize_justification( - self, justification: BaseModel | str | None - ) -> str | None: + def _serialize_justification(self, justification: Any) -> str | None: """Serialize justification to JSON string for API compatibility. 
Args: @@ -497,9 +490,12 @@ def _serialize_justification( Returns: JSON string representation or None if justification is None """ - if isinstance(justification, BaseModel): - justification = json.dumps(justification.model_dump()) - + if justification is None: + return None + if hasattr(justification, "model_dump"): + return json.dumps(justification.model_dump()) + if not isinstance(justification, str): + return json.dumps(justification) return justification def _extract_agent_snapshot(self, entrypoint: str) -> StudioWebAgentSnapshot: @@ -708,7 +704,7 @@ def _update_coded_eval_run_spec( ) def _create_eval_run_spec( - self, eval_item: EvaluationItem, eval_set_run_id: str, is_coded: bool = False + self, eval_item: AnyEvaluationItem, eval_set_run_id: str, is_coded: bool = False ) -> RequestSpec: # Legacy API expects eval IDs as GUIDs, coded accepts strings # Convert string IDs to deterministic GUIDs for legacy @@ -801,6 +797,7 @@ def _update_eval_set_run_spec( eval_set_run_id: str, evaluator_scores: dict[str, float], is_coded: bool = False, + weighted_final_score: float | None = None, ) -> RequestSpec: # Legacy API expects evaluatorId as GUID, coded accepts string evaluator_scores_list = [] @@ -824,16 +821,24 @@ def _update_eval_set_run_spec( # For legacy evaluations, endpoint is without /coded endpoint_suffix = "coded/" if is_coded else "" + + # Build the JSON payload + json_payload = { + "evalSetRunId": eval_set_run_id, + "status": EvaluationStatus.COMPLETED.value, + "evaluatorScores": evaluator_scores_list, + } + + # Add weighted final score if available + if weighted_final_score is not None: + json_payload["weightedFinalScore"] = weighted_final_score + return RequestSpec( method="PUT", endpoint=Endpoint( f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/{endpoint_suffix}evalSetRun" ), - json={ - "evalSetRunId": eval_set_run_id, - "status": EvaluationStatus.COMPLETED.value, - "evaluatorScores": evaluator_scores_list, - }, + json=json_payload, headers=self._tenant_header(), ) diff --git a/src/uipath/_cli/_evals/_runtime.py b/src/uipath/_cli/_evals/_runtime.py index a605d9835..347d68474 100644 --- a/src/uipath/_cli/_evals/_runtime.py +++ b/src/uipath/_cli/_evals/_runtime.py @@ -12,10 +12,7 @@ from opentelemetry.sdk.trace import ReadableSpan, Span from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult -from uipath._cli._evals.mocks.cache_manager import CacheManager -from uipath._cli._evals.mocks.input_mocker import ( - generate_llm_input, -) +from uipath._cli._evals.mocks.input_mocker import generate_llm_input from ..._events._event_bus import EventBus from ..._events._events import ( @@ -26,7 +23,7 @@ EvalSetRunUpdatedEvent, EvaluationEvents, ) -from ...eval.evaluators import BaseEvaluator +from ...eval.evaluators import BaseEvaluator, LegacyBaseEvaluator from ...eval.models import EvaluationResult from ...eval.models.models import AgentExecution, EvalItemResult from .._runtime._contracts import ( @@ -44,8 +41,11 @@ from ..models.runtime_schema import Entrypoint from ._evaluator_factory import EvaluatorFactory from ._models._evaluation_set import ( + AnyEvaluationItem, + AnyEvaluationSet, + AnyEvaluator, EvaluationItem, - EvaluationSet, + LegacyEvaluationItem, ) from ._models._exceptions import EvaluationRuntimeException from ._models._output import ( @@ -57,16 +57,116 @@ convert_eval_execution_output_to_serializable, ) from ._span_collection import ExecutionSpanCollector -from .mocks.mocks import ( - cache_manager_context, - clear_execution_context, - 
set_execution_context, -) +from .mocks.mocks import clear_execution_context, set_execution_context T = TypeVar("T", bound=UiPathBaseRuntime) C = TypeVar("C", bound=UiPathRuntimeContext) +def extract_workflow_from_spans(spans: list[ReadableSpan]) -> list[str]: + """Extract ordered list of main workflow nodes from execution spans. + + Only captures workflow nodes that are direct children of a LangGraph parent span, + which naturally filters out sub-nodes and internal components. + + Args: + spans: List of ReadableSpan objects from agent execution + + Returns: + List of unique main node names in execution order + """ + + for i, span in enumerate(spans): + span_name = getattr(span, "name", "NO_NAME") + attributes = getattr(span, "attributes", {}) + parent_context = getattr(span, "parent", None) + parent_span_id = None + if parent_context: + parent_span_id = getattr(parent_context, "span_id", None) + + span_context = span.get_span_context() + span_id = span_context.span_id if span_context else "NO_ID" + + if isinstance(attributes, dict): + node_name = attributes.get("node_name") + langgraph_node = attributes.get("langgraph.node") + + node_order = [] + seen_nodes = set() + + # System nodes to exclude + system_nodes = {"__start__", "__end__"} + + # First pass: Find LangGraph-related parent span IDs + # Look for spans that could be the main graph span (could have different names) + langgraph_span_ids = set() + for span in spans: + span_name = getattr(span, "name", "") + # Check if this is a LangGraph main span + if span_name and "langgraph" in span_name.lower(): + span_context = span.get_span_context() + if span_context: + langgraph_span_ids.add(span_context.span_id) + + # If we found potential parent spans, use them; otherwise we'll check all spans with langgraph.node + if langgraph_span_ids: + # Second pass: Collect spans that have a LangGraph parent + for span in spans: + # Get parent span ID + parent_context = getattr(span, "parent", None) + parent_span_id = None + if parent_context: + parent_span_id = getattr(parent_context, "span_id", None) + + # Skip if parent is not one of the LangGraph spans + if parent_span_id not in langgraph_span_ids: + continue + + # Get node name - use span name directly since attributes might not have it + span_name = getattr(span, "name", "") + attributes = getattr(span, "attributes", {}) + + # Try to get from attributes first, then fall back to span name + node_name = None + if isinstance(attributes, dict): + node_name = attributes.get("langgraph.node") or attributes.get( + "node_name" + ) + + if not node_name: + node_name = span_name + + # Skip if no node name found + if not node_name: + continue + + # Filter out system nodes + if node_name in system_nodes: + continue + + # Add to workflow if not seen before + if node_name not in seen_nodes: + seen_nodes.add(node_name) + node_order.append(node_name) + else: + # Fallback: Just get all spans with langgraph.node attribute + for span in spans: + attributes = getattr(span, "attributes", None) + if not attributes or not isinstance(attributes, dict): + continue + + node_name = attributes.get("langgraph.node") + + if not node_name or node_name in system_nodes: + continue + + if node_name not in seen_nodes: + seen_nodes.add(node_name) + node_order.append(node_name) + + return node_order + + class ExecutionSpanExporter(SpanExporter): """Custom exporter that stores spans grouped by execution ids.""" @@ -152,7 +252,6 @@ class UiPathEvalContext(UiPathRuntimeContext): eval_ids: Optional[List[str]] = None eval_set_run_id: 
Optional[str] = None verbose: bool = False - enable_mocker_cache: bool = False class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]): @@ -204,60 +303,45 @@ async def execute(self) -> UiPathRuntimeResult: event_bus = self.event_bus - # Create cache manager if enabled - if self.context.enable_mocker_cache: - cache_mgr = CacheManager() - cache_manager_context.set(cache_mgr) + # Load eval set (path is already resolved in cli_eval.py) + evaluation_set, _ = EvalHelpers.load_eval_set( + self.context.eval_set, self.context.eval_ids + ) + evaluators = self._load_evaluators(evaluation_set) - try: - # Load eval set (path is already resolved in cli_eval.py) - evaluation_set, _ = EvalHelpers.load_eval_set( - self.context.eval_set, self.context.eval_ids - ) - evaluators = self._load_evaluators(evaluation_set) + await event_bus.publish( + EvaluationEvents.CREATE_EVAL_SET_RUN, + EvalSetRunCreatedEvent( + execution_id=self.execution_id, + entrypoint=self.context.entrypoint or "", + eval_set_run_id=self.context.eval_set_run_id, + eval_set_id=evaluation_set.id, + no_of_evals=len(evaluation_set.evaluations), + evaluators=evaluators, + evaluator_weights=getattr(evaluation_set, "evaluator_weights", None), + ), + ) - await event_bus.publish( - EvaluationEvents.CREATE_EVAL_SET_RUN, - EvalSetRunCreatedEvent( - execution_id=self.execution_id, - entrypoint=self.context.entrypoint or "", - eval_set_run_id=self.context.eval_set_run_id, - eval_set_id=evaluation_set.id, - no_of_evals=len(evaluation_set.evaluations), - evaluators=evaluators, - ), + # Check if parallel execution should be used + if ( + self.context.workers + and self.context.workers > 1 + and len(evaluation_set.evaluations) > 1 + ): + eval_run_result_list = await self._execute_parallel( + evaluation_set, evaluators, event_bus, self.context.workers ) - - # Check if parallel execution should be used - if ( - self.context.workers - and self.context.workers > 1 - and len(evaluation_set.evaluations) > 1 - ): - eval_run_result_list = await self._execute_parallel( - evaluation_set, evaluators, event_bus, self.context.workers - ) - else: - eval_run_result_list = await self._execute_sequential( - evaluation_set, evaluators, event_bus - ) - results = UiPathEvalOutput( - evaluation_set_name=evaluation_set.name, - evaluation_set_results=eval_run_result_list, + else: + eval_run_result_list = await self._execute_sequential( + evaluation_set, evaluators, event_bus ) - finally: - # Flush cache to disk at end of eval set and cleanup - if self.context.enable_mocker_cache: - cache_manager = cache_manager_context.get() - if cache_manager is not None: - cache_manager.flush() - cache_manager_context.set(None) # Computing evaluator averages evaluator_averages: Dict[str, float] = defaultdict(float) evaluator_count: Dict[str, int] = defaultdict(int) - for eval_run_result in results.evaluation_set_results: + # Collect all evaluation results first + for eval_run_result in eval_run_result_list: for result_dto in eval_run_result.evaluation_run_results: evaluator_averages[result_dto.evaluator_id] += result_dto.result.score evaluator_count[result_dto.evaluator_id] += 1 @@ -266,11 +350,33 @@ async def execute(self) -> UiPathRuntimeResult: evaluator_averages[eval_id] = ( evaluator_averages[eval_id] / evaluator_count[eval_id] ) + + # Calculate weighted final score if weights are defined + evaluator_weights = getattr(evaluation_set, "evaluator_weights", None) + weighted_final_score = None + if evaluator_weights: + weighted_total = 0.0 + for evaluator_id, avg_score in 
evaluator_averages.items(): + weight = evaluator_weights.get(evaluator_id) + if weight is not None: + weighted_total += weight * avg_score + weighted_final_score = weighted_total + + # Create results with weighted score and weights + results = UiPathEvalOutput( + evaluation_set_name=evaluation_set.name, + evaluation_set_results=eval_run_result_list, + weighted_final_score=weighted_final_score, + evaluator_weights=evaluator_weights, + ) + await event_bus.publish( EvaluationEvents.UPDATE_EVAL_SET_RUN, EvalSetRunUpdatedEvent( execution_id=self.execution_id, evaluator_scores=evaluator_averages, + weighted_final_score=weighted_final_score, + evaluator_weights=evaluator_weights, ), wait_for_completion=False, ) @@ -283,8 +389,8 @@ async def execute(self) -> UiPathRuntimeResult: async def _execute_sequential( self, - evaluation_set: EvaluationSet, - evaluators: List[BaseEvaluator[Any, Any, Any]], + evaluation_set: AnyEvaluationSet, + evaluators: List[AnyEvaluator], event_bus: EventBus, ) -> List[EvaluationRunResult]: all_eval_run_result: list[EvaluationRunResult] = [] @@ -298,13 +404,13 @@ async def _execute_sequential( async def _execute_parallel( self, - evaluation_set: EvaluationSet, - evaluators: List[BaseEvaluator[Any, Any, Any]], + evaluation_set: AnyEvaluationSet, + evaluators: List[AnyEvaluator], event_bus: EventBus, workers: int, ) -> List[EvaluationRunResult]: # Create a queue with max concurrency - queue: asyncio.Queue[tuple[int, EvaluationItem]] = asyncio.Queue( + queue: asyncio.Queue[tuple[int, AnyEvaluationItem]] = asyncio.Queue( maxsize=workers ) @@ -314,7 +420,7 @@ async def _execute_parallel( # Producer task to fill the queue async def producer() -> None: for index, eval_item in enumerate(evaluation_set.evaluations): - await queue.put((index, eval_item)) + await queue.put((index, eval_item)) # type: ignore[arg-type] # Signal completion by putting None markers for _ in range(workers): await queue.put(None) # type: ignore @@ -356,8 +462,8 @@ async def worker(worker_id: int) -> None: async def _execute_eval( self, - eval_item: EvaluationItem, - evaluators: List[BaseEvaluator[Any, Any, Any]], + eval_item: AnyEvaluationItem, + evaluators: List[AnyEvaluator], event_bus: EventBus, ) -> EvaluationRunResult: # Generate LLM-based input if input_mocking_strategy is defined @@ -418,6 +524,11 @@ async def _execute_eval( ) ) ) + # Extract workflow nodes from spans even in error case + if spans: + workflow = extract_workflow_from_spans(spans) + if workflow: + evaluation_run_results.workflow = workflow raise if self.context.verbose: @@ -426,34 +537,79 @@ async def _execute_eval( agent_execution_output ) ) + + # Extract workflow nodes from spans + workflow = extract_workflow_from_spans(agent_execution_output.spans) + # Always set workflow, even if empty, to distinguish from no extraction + evaluation_run_results.workflow = workflow if workflow else None + evaluation_item_results: list[EvalItemResult] = [] for evaluator in evaluators: - if evaluator.id not in eval_item.evaluation_criterias: - # Skip! 
- continue - evaluation_criteria = eval_item.evaluation_criterias[evaluator.id] + # Determine which evaluator method to use based on evaluation set/item type + evaluation_result: Optional[EvaluationResult] = None + + match eval_item: + case LegacyEvaluationItem(): + # Legacy evaluation - use run_legacy_evaluator + evaluation_result = await self.run_legacy_evaluator( + evaluator=evaluator, # type: ignore + execution_output=agent_execution_output, + eval_item=eval_item, + ) + case EvaluationItem() if ( + evaluator.id in eval_item.evaluation_criterias + ): + # New evaluation with criteria + evaluation_criteria = eval_item.evaluation_criterias[ + evaluator.id + ] + + evaluation_result = await self.run_evaluator( + evaluator=evaluator, # type: ignore + execution_output=agent_execution_output, + eval_item=eval_item, + evaluation_criteria=evaluator.evaluation_criteria_type( # type: ignore + **evaluation_criteria + ) + if evaluation_criteria + else evaluator.evaluator_config.default_evaluation_criteria, # type: ignore + ) + case _: + # Skip if evaluator not in evaluation criteria + continue - evaluation_result = await self.run_evaluator( - evaluator=evaluator, - execution_output=agent_execution_output, - eval_item=eval_item, - evaluation_criteria=evaluator.evaluation_criteria_type( - **evaluation_criteria - ) - if evaluation_criteria - else evaluator.evaluator_config.default_evaluation_criteria, - ) + if evaluation_result is None: + continue dto_result = EvaluationResultDto.from_evaluation_result( evaluation_result ) + # Extract node_id from evaluation criteria if available + node_id = None + if ( + isinstance(eval_item, EvaluationItem) + and evaluator.id in eval_item.evaluation_criterias + ): + criteria_dict = eval_item.evaluation_criterias[evaluator.id] + if criteria_dict: + node_id = criteria_dict.get("nodeId") + + # Get evaluator type from evaluator's get_evaluator_id method + evaluator_type = None + try: + evaluator_type = evaluator.get_evaluator_id() + except AttributeError: + pass + evaluation_run_results.evaluation_run_results.append( EvaluationRunResultDto( evaluator_name=evaluator.name, result=dto_result, evaluator_id=evaluator.id, + evaluator_type=evaluator_type, + node_id=node_id, ) ) evaluation_item_results.append( @@ -482,10 +638,29 @@ async def _execute_eval( exception_details = EvalItemExceptionDetails(exception=e) for evaluator in evaluators: + # Extract node_id from evaluation criteria if available + node_id = None + if ( + isinstance(eval_item, EvaluationItem) + and evaluator.id in eval_item.evaluation_criterias + ): + criteria_dict = eval_item.evaluation_criterias[evaluator.id] + if criteria_dict: + node_id = criteria_dict.get("nodeId") + + # Get evaluator type from evaluator's get_evaluator_id method + evaluator_type = None + try: + evaluator_type = evaluator.get_evaluator_id() + except AttributeError: + pass + evaluation_run_results.evaluation_run_results.append( EvaluationRunResultDto( evaluator_name=evaluator.name, evaluator_id=evaluator.id, + evaluator_type=evaluator_type, + node_id=node_id, result=EvaluationResultDto(score=0), ) ) @@ -520,8 +695,8 @@ async def _execute_eval( return evaluation_run_results async def _generate_input_for_eval( - self, eval_item: EvaluationItem - ) -> EvaluationItem: + self, eval_item: AnyEvaluationItem + ) -> AnyEvaluationItem: """Use LLM to generate a mock input for an evaluation item.""" generated_input = await generate_llm_input( eval_item, (await self.get_entrypoint()).input @@ -542,7 +717,7 @@ def _get_and_clear_execution_data( return 
spans, logs async def execute_runtime( - self, eval_item: EvaluationItem, execution_id: str + self, eval_item: AnyEvaluationItem, execution_id: str ) -> UiPathEvalRunExecutionOutput: context_args = self.context.model_dump() context_args["execution_id"] = execution_id @@ -615,9 +790,28 @@ async def run_evaluator( return result - def _load_evaluators( - self, evaluation_set: EvaluationSet - ) -> list[BaseEvaluator[Any, Any, Any]]: + async def run_legacy_evaluator( + self, + evaluator: LegacyBaseEvaluator[Any], + execution_output: UiPathEvalRunExecutionOutput, + eval_item: LegacyEvaluationItem, + ) -> EvaluationResult: + agent_execution = AgentExecution( + agent_input=eval_item.inputs, + agent_output=execution_output.result.output or {}, + agent_trace=execution_output.spans, + expected_agent_behavior=eval_item.expected_agent_behavior, + ) + + result = await evaluator.evaluate( + agent_execution=agent_execution, + # at the moment evaluation_criteria is always the expected output + evaluation_criteria=eval_item.expected_output, + ) + + return result + + def _load_evaluators(self, evaluation_set: AnyEvaluationSet) -> list[AnyEvaluator]: """Load evaluators referenced by the evaluation set.""" evaluators = [] evaluators_dir = Path(self.context.eval_set).parent.parent / "evaluators" # type: ignore diff --git a/src/uipath/_cli/cli_add.py b/src/uipath/_cli/cli_add.py index ec0856fad..1064e4972 100644 --- a/src/uipath/_cli/cli_add.py +++ b/src/uipath/_cli/cli_add.py @@ -6,7 +6,6 @@ import click -from ..telemetry import track from ._utils._console import ConsoleLogger from ._utils._constants import EVALS_DIRECTORY_NAME from ._utils._resources import Resources @@ -85,7 +84,6 @@ def create_evaluator(evaluator_name): @click.command() @click.argument("resource", required=True) @click.argument("args", nargs=-1) -@track def add(resource: str, args: tuple[str]) -> None: """Create a local resource. diff --git a/src/uipath/_cli/cli_auth.py b/src/uipath/_cli/cli_auth.py index 0ee1dca55..e56d99657 100644 --- a/src/uipath/_cli/cli_auth.py +++ b/src/uipath/_cli/cli_auth.py @@ -2,7 +2,6 @@ import click -from ..telemetry import track from ._auth._auth_service import AuthService from ._utils._common import environment_options from ._utils._console import ConsoleLogger @@ -46,7 +45,6 @@ default="OR.Execution", help="Space-separated list of OAuth scopes to request (e.g., 'OR.Execution OR.Queues'). 
Defaults to 'OR.Execution'", ) -@track def auth( environment: str, force: bool = False, diff --git a/src/uipath/_cli/cli_debug.py b/src/uipath/_cli/cli_debug.py index d3a329c49..5bc942d79 100644 --- a/src/uipath/_cli/cli_debug.py +++ b/src/uipath/_cli/cli_debug.py @@ -1,7 +1,6 @@ # type: ignore import asyncio import os -from os import environ as env from typing import Optional import click @@ -12,16 +11,9 @@ from uipath._utils._bindings import ResourceOverwritesContext from uipath.tracing import LlmOpsHttpExporter -from .._utils.constants import ( - ENV_JOB_ID, -) -from ..telemetry import track from ._debug._bridge import UiPathDebugBridge, get_debug_bridge from ._debug._runtime import UiPathDebugRuntime -from ._runtime._contracts import ( - UiPathRuntimeContext, - UiPathRuntimeFactory, -) +from ._runtime._contracts import UiPathRuntimeContext, UiPathRuntimeFactory from ._runtime._runtime import UiPathScriptRuntime from ._utils._console import ConsoleLogger from .middlewares import Middlewares @@ -63,7 +55,6 @@ default=5678, help="Port for the debug server (default: 5678)", ) -@track(when=lambda *_a, **_kw: env.get(ENV_JOB_ID) is None) def debug( entrypoint: Optional[str], input: Optional[str], diff --git a/src/uipath/_cli/cli_deploy.py b/src/uipath/_cli/cli_deploy.py index fdbf44e7d..377417d40 100644 --- a/src/uipath/_cli/cli_deploy.py +++ b/src/uipath/_cli/cli_deploy.py @@ -1,7 +1,6 @@ # type: ignore import click -from ..telemetry import track from .cli_pack import pack from .cli_publish import publish @@ -22,7 +21,6 @@ help="Whether to publish to the personal workspace", ) @click.argument("root", type=str, default="./") -@track def deploy(root, feed): """Pack and publish the project.""" ctx = click.get_current_context() diff --git a/src/uipath/_cli/cli_dev.py b/src/uipath/_cli/cli_dev.py index 3ff1fa275..ef15ba8f9 100644 --- a/src/uipath/_cli/cli_dev.py +++ b/src/uipath/_cli/cli_dev.py @@ -5,13 +5,13 @@ import click from uipath._cli._dev._terminal import UiPathDevTerminal -from uipath._cli._runtime._contracts import UiPathRuntimeContext, UiPathRuntimeFactory +from uipath._cli._runtime._contracts import (UiPathRuntimeContext, + UiPathRuntimeFactory) from uipath._cli._runtime._runtime import UiPathScriptRuntime from uipath._cli._utils._console import ConsoleLogger from uipath._cli._utils._debug import setup_debugging from uipath._cli.cli_init import init # type: ignore[attr-defined] from uipath._cli.middlewares import Middlewares -from uipath.telemetry import track console = ConsoleLogger() @@ -29,7 +29,6 @@ default=5678, help="Port for the debug server (default: 5678)", ) -@track def dev(interface: Optional[str], debug: bool, debug_port: int) -> None: """Launch interactive debugging interface.""" project_file = os.path.join(os.getcwd(), "uipath.json") diff --git a/src/uipath/_cli/cli_eval.py b/src/uipath/_cli/cli_eval.py index 5db1e1722..11501433a 100644 --- a/src/uipath/_cli/cli_eval.py +++ b/src/uipath/_cli/cli_eval.py @@ -8,9 +8,7 @@ from uipath._cli._evals._console_progress_reporter import ConsoleProgressReporter from uipath._cli._evals._evaluate import evaluate from uipath._cli._evals._progress_reporter import StudioWebProgressReporter -from uipath._cli._evals._runtime import ( - UiPathEvalContext, -) +from uipath._cli._evals._runtime import UiPathEvalContext from uipath._cli._runtime._runtime_factory import generate_runtime_factory from uipath._cli._utils._folders import get_personal_workspace_key_async from uipath._cli._utils._studio_project import StudioClient @@ -21,8 +19,6 @@ 
from uipath.eval._helpers import auto_discover_entrypoint from uipath.tracing import LlmOpsHttpExporter -from .._utils.constants import ENV_JOB_ID -from ..telemetry import track from ._utils._console import ConsoleLogger from ._utils._eval_set import EvalHelpers @@ -83,12 +79,11 @@ def setup_reporting_prereq(no_report: bool) -> bool: help="File path where the output will be written", ) @click.option( - "--enable-mocker-cache", + "--verbose", is_flag=True, + help="Enable verbose debug output for evaluators", default=False, - help="Enable caching for LLM mocker responses", ) -@track(when=lambda *_a, **_kw: os.getenv(ENV_JOB_ID) is None) def eval( entrypoint: Optional[str], eval_set: Optional[str], @@ -97,7 +92,7 @@ def eval( no_report: bool, workers: int, output_file: Optional[str], - enable_mocker_cache: bool, + verbose: bool, ) -> None: """Run an evaluation set against the agent. @@ -108,8 +103,16 @@ def eval( eval_set_run_id: Custom evaluation set run ID (optional, will generate UUID if not specified) workers: Number of parallel workers for running evaluations no_report: Do not report the evaluation results - enable_mocker_cache: Enable caching for LLM mocker responses + verbose: Enable verbose debug output for evaluators """ + # Configure logging level for evaluators if verbose is enabled + if verbose: + import logging + + logging.basicConfig(level=logging.DEBUG, format="%(message)s") + # Set the evaluators logger to DEBUG + logging.getLogger("uipath.eval.evaluators").setLevel(logging.DEBUG) + context_args = { "entrypoint": entrypoint or auto_discover_entrypoint(), "eval_set": eval_set, @@ -118,7 +121,6 @@ def eval( "workers": workers, "no_report": no_report, "output_file": output_file, - "enable_mocker_cache": enable_mocker_cache, } should_register_progress_reporter = setup_reporting_prereq(no_report) @@ -152,7 +154,6 @@ def eval( eval_context.no_report = no_report eval_context.workers = workers eval_context.eval_set_run_id = eval_set_run_id - eval_context.enable_mocker_cache = enable_mocker_cache # Load eval set to resolve the path eval_set_path = eval_set or EvalHelpers.auto_discover_eval_set() @@ -195,3 +196,4 @@ async def execute_eval(): if __name__ == "__main__": eval() + eval() diff --git a/src/uipath/_cli/cli_init.py b/src/uipath/_cli/cli_init.py index 9811023a4..72600e48a 100644 --- a/src/uipath/_cli/cli_init.py +++ b/src/uipath/_cli/cli_init.py @@ -12,7 +12,6 @@ from .._config import UiPathConfig from .._utils.constants import ENV_TELEMETRY_ENABLED -from ..telemetry import track from ..telemetry._constants import _PROJECT_KEY, _TELEMETRY_CONFIG_FILE from ._runtime._runtime import get_user_script from ._runtime._runtime_factory import generate_runtime_factory @@ -180,7 +179,6 @@ def write_config_file(config_data: Dict[str, Any] | RuntimeSchema) -> None: default=False, help="Won't override existing .agent files and AGENTS.md file.", ) -@track def init(entrypoint: str, infer_bindings: bool, no_agents_md_override: bool) -> None: """Create uipath.json with input/output schemas and bindings.""" with console.spinner("Initializing UiPath project ..."): diff --git a/src/uipath/_cli/cli_invoke.py b/src/uipath/_cli/cli_invoke.py index 14fef6ebb..72e111703 100644 --- a/src/uipath/_cli/cli_invoke.py +++ b/src/uipath/_cli/cli_invoke.py @@ -15,7 +15,6 @@ import tomli as tomllib from .._utils._ssl_context import get_httpx_client_kwargs -from ..telemetry import track from ._utils._common import get_env_vars from ._utils._folders import get_personal_workspace_info_async from ._utils._processes 
import get_release_info @@ -51,7 +50,6 @@ def _read_project_details() -> [str, str]: type=click.Path(exists=True), help="File path for the .json input", ) -@track def invoke( entrypoint: Optional[str], input: Optional[str], file: Optional[str] ) -> None: diff --git a/src/uipath/_cli/cli_new.py b/src/uipath/_cli/cli_new.py index a5088a3cd..c0e96dc21 100644 --- a/src/uipath/_cli/cli_new.py +++ b/src/uipath/_cli/cli_new.py @@ -4,7 +4,6 @@ import click -from ..telemetry import track from ._utils._console import ConsoleLogger from .middlewares import Middlewares @@ -39,7 +38,6 @@ def generate_pyproject(target_directory, project_name): @click.command() @click.argument("name", type=str, default="") -@track def new(name: str): """Generate a quick-start project.""" directory = os.getcwd() diff --git a/src/uipath/_cli/cli_pack.py b/src/uipath/_cli/cli_pack.py index 276233def..39977855f 100644 --- a/src/uipath/_cli/cli_pack.py +++ b/src/uipath/_cli/cli_pack.py @@ -11,7 +11,6 @@ from uipath._cli.models.runtime_schema import Bindings, RuntimeSchema from uipath._config import UiPathConfig -from ..telemetry import track from ..telemetry._constants import _PROJECT_KEY, _TELEMETRY_CONFIG_FILE from ._utils._console import ConsoleLogger from ._utils._project_files import ( @@ -311,7 +310,6 @@ def display_project_info(config): is_flag=True, help="Skip running uv lock and exclude uv.lock from the package", ) -@track def pack(root, nolock): """Pack the project.""" version = get_project_version(root) diff --git a/src/uipath/_cli/cli_publish.py b/src/uipath/_cli/cli_publish.py index cecfd143c..825de68a4 100644 --- a/src/uipath/_cli/cli_publish.py +++ b/src/uipath/_cli/cli_publish.py @@ -7,7 +7,6 @@ import httpx from .._utils._ssl_context import get_httpx_client_kwargs -from ..telemetry import track from ._utils._common import get_env_vars from ._utils._console import ConsoleLogger from ._utils._folders import get_personal_workspace_info_async @@ -67,7 +66,6 @@ def get_available_feeds( flag_value="personal", help="Whether to publish to the personal workspace", ) -@track def publish(feed): """Publish the package.""" [base_url, token] = get_env_vars() diff --git a/src/uipath/_cli/cli_pull.py b/src/uipath/_cli/cli_pull.py index 2a2d7f63a..4f7be1499 100644 --- a/src/uipath/_cli/cli_pull.py +++ b/src/uipath/_cli/cli_pull.py @@ -16,7 +16,6 @@ import click from .._config import UiPathConfig -from ..telemetry import track from ._utils._console import ConsoleLogger from ._utils._constants import EVALS_DIRECTORY_NAME from ._utils._project_files import ( @@ -34,7 +33,6 @@ type=click.Path(exists=False, file_okay=False, dir_okay=True, path_type=Path), default=Path("."), ) -@track def pull(root: Path) -> None: """Pull remote project files from Studio Web Project. diff --git a/src/uipath/_cli/cli_push.py b/src/uipath/_cli/cli_push.py index afeffe84d..6b3688a7e 100644 --- a/src/uipath/_cli/cli_push.py +++ b/src/uipath/_cli/cli_push.py @@ -8,7 +8,6 @@ from uipath.models.exceptions import EnrichedException from .._config import UiPathConfig -from ..telemetry import track from ._push.sw_file_handler import FileOperationUpdate, SwFileHandler from ._utils._console import ConsoleLogger from ._utils._project_files import ( @@ -71,7 +70,6 @@ async def upload_source_files_to_project( is_flag=True, help="Skip running uv lock and exclude uv.lock from the package", ) -@track def push(root: str, nolock: bool) -> None: """Push local project files to Studio Web Project. 
diff --git a/src/uipath/_cli/cli_register.py b/src/uipath/_cli/cli_register.py index f18e23470..b371e5982 100644 --- a/src/uipath/_cli/cli_register.py +++ b/src/uipath/_cli/cli_register.py @@ -3,7 +3,6 @@ import click -from ..telemetry import track from ._evals._helpers import register_evaluator from ._utils._console import ConsoleLogger from ._utils._resources import Resources @@ -15,7 +14,6 @@ @click.command() @click.argument("resource", required=True) @click.argument("args", nargs=-1) -@track def register(resource: str, args: tuple[str]) -> None: """Register a local resource. diff --git a/src/uipath/_cli/cli_run.py b/src/uipath/_cli/cli_run.py index 2f8cf0571..e50815cfe 100644 --- a/src/uipath/_cli/cli_run.py +++ b/src/uipath/_cli/cli_run.py @@ -1,28 +1,443 @@ # type: ignore import asyncio +import json import os -from os import environ as env -from typing import Optional +import uuid +from datetime import datetime +from typing import Optional, Sequence import click +from opentelemetry.sdk.trace import ReadableSpan +from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult from uipath._cli._runtime._runtime_factory import generate_runtime_factory from uipath._cli._utils._common import read_resource_overwrites_from_file from uipath._cli._utils._debug import setup_debugging from uipath._utils._bindings import ResourceOverwritesContext from uipath.tracing import JsonLinesFileExporter, LlmOpsHttpExporter +from uipath.tracing._utils import _SpanUtils -from .._utils.constants import ( - ENV_JOB_ID, -) -from ..telemetry import track from ._runtime._contracts import UiPathRuntimeError from ._utils._console import ConsoleLogger from .middlewares import Middlewares +# Import LangChain instrumentor for automatic span generation +try: + from openinference.instrumentation.langchain import ( + LangChainInstrumentor, + get_current_span, + ) + + LANGCHAIN_INSTRUMENTATION_AVAILABLE = True +except ImportError: + LANGCHAIN_INSTRUMENTATION_AVAILABLE = False + console = ConsoleLogger() +class MemorySpanExporter(SpanExporter): + """Span exporter that collects spans in memory for later processing.""" + + def __init__(self): + self.spans = [] + + def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult: + """Export spans to memory.""" + try: + for span in spans: + uipath_span = _SpanUtils.otel_span_to_uipath_span( + span, serialize_attributes=True + ) + self.spans.append(uipath_span.to_dict(serialize_attributes=False)) + return SpanExportResult.SUCCESS + except Exception: + return SpanExportResult.FAILURE + + def shutdown(self) -> None: + """Shutdown the exporter.""" + pass + + def force_flush(self, timeout_millis: int = 30000) -> bool: + """Force flush any buffered spans.""" + return True + + +def _generate_evaluation_set( + input_data: str, + output_data: str, + entrypoint: str, + eval_set_path: str, + evaluators: list[str] = None, + spans: list[dict] = None, +) -> None: + """Generate an evaluation set JSON file from a run execution. 
+ + Args: + input_data: The input data used for the run (as JSON string) + output_data: The output data from the run (as JSON string) + entrypoint: Path to the agent script + eval_set_path: Path where the evaluation set JSON file will be saved + evaluators: List of evaluator names to use (e.g., ['json_similarity', 'exact_match']) + spans: Optional list of span dictionaries containing node execution data + """ + try: + # Use json_similarity as default if no evaluators specified + if not evaluators: + evaluators = ["json_similarity"] + + # Create the directory structure for eval sets and evaluators + eval_set_file = os.path.abspath(eval_set_path) + eval_set_dir = os.path.dirname(eval_set_file) + + # If not already in an eval-sets dir, create proper structure + if not eval_set_dir.endswith("eval-sets"): + eval_set_dir = os.path.join(eval_set_dir, "evals", "eval-sets") + eval_set_file = os.path.join(eval_set_dir, os.path.basename(eval_set_path)) + + os.makedirs(eval_set_dir, exist_ok=True) + + # Create evaluators directory at the sibling level + evaluators_dir = os.path.join(os.path.dirname(eval_set_dir), "evaluators") + os.makedirs(evaluators_dir, exist_ok=True) + # Parse input and output + try: + parsed_input = json.loads(input_data) if input_data else {} + except (json.JSONDecodeError, TypeError): + # If input_data is already a dict or not JSON, handle it + if isinstance(input_data, dict): + parsed_input = input_data + else: + parsed_input = {"raw_input": str(input_data)} + + try: + # Handle output_data which might be a string, dict, or other object + if isinstance(output_data, str): + parsed_output = json.loads(output_data) + elif isinstance(output_data, dict): + parsed_output = output_data + else: + # For other types, try to convert to dict + parsed_output = json.loads(str(output_data)) + except (json.JSONDecodeError, TypeError): + parsed_output = {"raw_output": str(output_data)} + + # Generate unique IDs + eval_id = str(uuid.uuid4()) + timestamp = datetime.utcnow().isoformat() + "Z" + + # Build evaluation criteria and create evaluator files + evaluation_criteria = {} + evaluator_refs = [] + + # Evaluator type mapping (supports both short names and full type IDs) + evaluator_type_map = { + "json_similarity": { + "name": "JsonSimilarityEvaluator", + "evaluatorTypeId": "uipath-json-similarity", + "config_defaults": {"name": "JsonSimilarityEvaluator"}, + }, + "uipath-json-similarity": { + "name": "JsonSimilarityEvaluator", + "evaluatorTypeId": "uipath-json-similarity", + "config_defaults": {"name": "JsonSimilarityEvaluator"}, + }, + "exact_match": { + "name": "ExactMatchEvaluator", + "evaluatorTypeId": "uipath-exact-match", + "config_defaults": { + "name": "ExactMatchEvaluator", + "case_sensitive": False, + }, + }, + "uipath-exact-match": { + "name": "ExactMatchEvaluator", + "evaluatorTypeId": "uipath-exact-match", + "config_defaults": { + "name": "ExactMatchEvaluator", + "case_sensitive": False, + }, + }, + "contains": { + "name": "ContainsEvaluator", + "evaluatorTypeId": "uipath-contains", + "config_defaults": {"name": "ContainsEvaluator"}, + }, + "uipath-contains": { + "name": "ContainsEvaluator", + "evaluatorTypeId": "uipath-contains", + "config_defaults": {"name": "ContainsEvaluator"}, + }, + "llm_judge": { + "name": "LLMJudgeOutputEvaluator", + "evaluatorTypeId": "uipath-llm-judge-output-semantic-similarity", + "config_defaults": { + "name": "LLMJudgeOutputEvaluator", + "model": "anthropic.claude-3-5-sonnet-20240620-v1:0", + }, + }, + "uipath-llm-judge-output-semantic-similarity": { + 
"name": "LLMJudgeOutputEvaluator", + "evaluatorTypeId": "uipath-llm-judge-output-semantic-similarity", + "config_defaults": { + "name": "LLMJudgeOutputEvaluator", + "model": "anthropic.claude-3-5-sonnet-20240620-v1:0", + }, + }, + "llm_judge_strict_json": { + "name": "LLMJudgeStrictJSONSimilarityOutputEvaluator", + "evaluatorTypeId": "uipath-llm-judge-output-strict-json-similarity", + "config_defaults": { + "name": "LLMJudgeStrictJSONSimilarityOutputEvaluator", + "model": "anthropic.claude-3-5-sonnet-20240620-v1:0", + }, + }, + "uipath-llm-judge-output-strict-json-similarity": { + "name": "LLMJudgeStrictJSONSimilarityOutputEvaluator", + "evaluatorTypeId": "uipath-llm-judge-output-strict-json-similarity", + "config_defaults": { + "name": "LLMJudgeStrictJSONSimilarityOutputEvaluator", + "model": "anthropic.claude-3-5-sonnet-20240620-v1:0", + }, + }, + "llm_judge_trajectory": { + "name": "LLMJudgeTrajectoryEvaluator", + "evaluatorTypeId": "uipath-llm-judge-trajectory", + "config_defaults": { + "name": "LLMJudgeTrajectoryEvaluator", + "model": "anthropic.claude-3-5-sonnet-20240620-v1:0", + }, + }, + "uipath-llm-judge-trajectory": { + "name": "LLMJudgeTrajectoryEvaluator", + "evaluatorTypeId": "uipath-llm-judge-trajectory", + "config_defaults": { + "name": "LLMJudgeTrajectoryEvaluator", + "model": "anthropic.claude-3-5-sonnet-20240620-v1:0", + }, + }, + "llm_judge_trajectory_simulation": { + "name": "LLMJudgeTrajectorySimulationEvaluator", + "evaluatorTypeId": "uipath-llm-judge-trajectory-simulation", + "config_defaults": { + "name": "LLMJudgeTrajectorySimulationEvaluator", + "model": "anthropic.claude-3-5-sonnet-20240620-v1:0", + }, + }, + "uipath-llm-judge-trajectory-simulation": { + "name": "LLMJudgeTrajectorySimulationEvaluator", + "evaluatorTypeId": "uipath-llm-judge-trajectory-simulation", + "config_defaults": { + "name": "LLMJudgeTrajectorySimulationEvaluator", + "model": "anthropic.claude-3-5-sonnet-20240620-v1:0", + }, + }, + "tool_call_args": { + "name": "ToolCallArgsEvaluator", + "evaluatorTypeId": "uipath-tool-call-args", + "config_defaults": {"name": "ToolCallArgsEvaluator"}, + }, + "uipath-tool-call-args": { + "name": "ToolCallArgsEvaluator", + "evaluatorTypeId": "uipath-tool-call-args", + "config_defaults": {"name": "ToolCallArgsEvaluator"}, + }, + "tool_call_count": { + "name": "ToolCallCountEvaluator", + "evaluatorTypeId": "uipath-tool-call-count", + "config_defaults": {"name": "ToolCallCountEvaluator"}, + }, + "uipath-tool-call-count": { + "name": "ToolCallCountEvaluator", + "evaluatorTypeId": "uipath-tool-call-count", + "config_defaults": {"name": "ToolCallCountEvaluator"}, + }, + "tool_call_order": { + "name": "ToolCallOrderEvaluator", + "evaluatorTypeId": "uipath-tool-call-order", + "config_defaults": {"name": "ToolCallOrderEvaluator"}, + }, + "uipath-tool-call-order": { + "name": "ToolCallOrderEvaluator", + "evaluatorTypeId": "uipath-tool-call-order", + "config_defaults": {"name": "ToolCallOrderEvaluator"}, + }, + "tool_call_output": { + "name": "ToolCallOutputEvaluator", + "evaluatorTypeId": "uipath-tool-call-output", + "config_defaults": {"name": "ToolCallOutputEvaluator"}, + }, + "uipath-tool-call-output": { + "name": "ToolCallOutputEvaluator", + "evaluatorTypeId": "uipath-tool-call-output", + "config_defaults": {"name": "ToolCallOutputEvaluator"}, + }, + } + + for evaluator_name in evaluators: + if evaluator_name not in evaluator_type_map: + console.warning(f"Unknown evaluator '{evaluator_name}', skipping") + continue + + evaluator_info = 
evaluator_type_map[evaluator_name] + evaluator_id = str(uuid.uuid4()) + evaluator_refs.append(evaluator_id) + + # Create evaluator JSON file + evaluator_def = { + "id": evaluator_id, + "name": f"{evaluator_info['name']} (auto-generated)", + "version": "1.0", + "evaluatorTypeId": evaluator_info["evaluatorTypeId"], + "evaluatorConfig": evaluator_info["config_defaults"], + } + + evaluator_file = os.path.join( + evaluators_dir, f"{evaluator_name}-{evaluator_id[:8]}.json" + ) + with open(evaluator_file, "w") as f: + json.dump(evaluator_def, f, indent=2) + + # Add evaluation criteria for this eval item (keyed by evaluator ID) + evaluation_criteria[evaluator_id] = { + "expected_output": parsed_output, + } + + # Create evaluation items + evaluation_items = [] + + # If spans are provided, create per-node evaluations + if spans: + # Filter spans to only include workflow nodes + node_spans = {} + node_order = [] # Track order of nodes + + for span in spans: + # First try to get the span name from the Name field (UiPath format) + span_name = span.get("Name", span.get("name", "")) + attributes = span.get("Attributes", span.get("attributes", {})) + + # Parse attributes if they're a JSON string + if isinstance(attributes, str): + try: + attributes = json.loads(attributes) + except: + attributes = {} + + # Determine the node name from various possible sources + node_name = None + if isinstance(attributes, dict): + node_name = attributes.get( + "node_name", attributes.get("langgraph.node", None) + ) + + # If no node_name attribute, use the span Name as the node name + if not node_name and span_name: + node_name = span_name + + # Only include valid workflow nodes (exclude system nodes, internal components, and LLM calls) + if ( + node_name + and node_name not in ["__start__", "__end__"] + and not any( + node_name.startswith(prefix) + for prefix in ["Runnable", "UiPath", "JsonOutput"] + ) + ): + if node_name not in node_spans: + node_spans[node_name] = [] + node_order.append(node_name) + node_spans[node_name].append(span) + + if node_spans: + console.info( + f"Found {len(node_spans)} workflow node(s) for evaluation generation" + ) + + # Create evaluation for each node in execution order + for node_name in node_order: + node_span_list = node_spans[node_name] + # Get the most recent span for this node + node_span = node_span_list[-1] + node_attributes = node_span.get( + "Attributes", node_span.get("attributes", {}) + ) + + # Parse attributes if they're a JSON string + if isinstance(node_attributes, str): + try: + node_attributes = json.loads(node_attributes) + except: + node_attributes = {} + + # Try different output keys: output.value, output, outputs + node_output = node_attributes.get( + "output.value", + node_attributes.get( + "output", node_attributes.get("outputs", None) + ), + ) + if isinstance(node_output, str): + try: + node_output = json.loads(node_output) + except: + pass + + if node_output: + # Create node-specific evaluation + node_eval_id = str(uuid.uuid4()) + node_evaluation_criteria = {} + + # Add evaluation criteria for each evaluator with node output + for evaluator_id in evaluator_refs: + node_evaluation_criteria[evaluator_id] = { + "expected_output": node_output, + } + + evaluation_items.append( + { + "id": node_eval_id, + "name": f"Node: {node_name}", + "inputs": parsed_input, # Use agent input, not node-specific input + "evaluationCriterias": node_evaluation_criteria, + "expectedAgentBehavior": f"The agent should execute node '{node_name}' and produce the expected output during the 
workflow execution.", + "nodeId": node_name, # Add node identifier for evaluators to match against trace + } + ) + + # Always include final output evaluation + evaluation_item = { + "id": eval_id, + "name": f"Final Output", + "inputs": parsed_input, + "evaluationCriterias": evaluation_criteria, + "expectedAgentBehavior": "Agent should produce the expected output for the given input", + } + evaluation_items.append(evaluation_item) + + # Create evaluation set + eval_set = { + "id": str(uuid.uuid4()), + "name": f"Evaluation set generated from {entrypoint}", + "version": "1.0", + "evaluatorRefs": evaluator_refs, + "evaluations": evaluation_items, + } + + # Save eval set to file + with open(eval_set_file, "w") as f: + json.dump(eval_set, f, indent=2) + + console.success(f"Evaluation set generated and saved to: {eval_set_file}") + console.info( + f"Generated {len(evaluation_items)} evaluation(s) with {len(evaluator_refs)} evaluator(s) in: {evaluators_dir}" + ) + + except Exception as e: + console.error( + f"Failed to generate evaluation set: {str(e)}", include_traceback=True + ) + + @click.command() @click.argument("entrypoint", required=False) @click.argument("input", required=False, default="{}") @@ -43,8 +458,8 @@ @click.option( "--output-file", required=False, - type=click.Path(exists=False), - help="File path where the output will be written", + type=click.Path(), + help="File path where the output will be written (will overwrite if exists)", ) @click.option( "--trace-file", @@ -63,7 +478,18 @@ default=5678, help="Port for the debug server (default: 5678)", ) -@track(when=lambda *_a, **_kw: env.get(ENV_JOB_ID) is None) +@click.option( + "--generate-evals", + required=False, + type=click.Path(), + help="Generate an evaluation set file from this run and save it to the specified path (will overwrite if exists)", +) +@click.option( + "--eval-evaluators", + multiple=True, + default=["json_similarity"], + help="Evaluators to use for generated eval set (can be specified multiple times). Available: json_similarity, exact_match, contains, llm_judge, llm_judge_strict_json, llm_judge_trajectory, llm_judge_trajectory_simulation, tool_call_args, tool_call_count, tool_call_order, tool_call_output. 
You can also use full type IDs like 'uipath-json-similarity'.", +) def run( entrypoint: Optional[str], input: Optional[str], @@ -74,6 +500,8 @@ def run( trace_file: Optional[str], debug: bool, debug_port: int, + generate_evals: Optional[str], + eval_evaluators: tuple[str], ) -> None: """Execute the project.""" context_args = { @@ -84,6 +512,9 @@ def run( "execution_output_file": output_file, "trace_file": trace_file, "debug": debug, + "generate_evals": generate_evals, + # Enable tracing if we're generating evals to capture node data + "tracing_enabled": True if generate_evals else None, } input_file = file or input_file # Setup debugging if requested @@ -115,8 +546,11 @@ def run( Usage: `uipath run [-f ]`""") try: + execution_result = None + memory_span_exporter = None async def execute() -> None: + nonlocal execution_result, memory_span_exporter runtime_factory = generate_runtime_factory() context = runtime_factory.new_context(**context_args) if context.job_id: @@ -125,6 +559,18 @@ async def execute() -> None: if trace_file: runtime_factory.add_span_exporter(JsonLinesFileExporter(trace_file)) + # Add memory span exporter if generating evals to capture node-level data + # Use batch=False to ensure immediate export of spans + if generate_evals: + memory_span_exporter = MemorySpanExporter() + runtime_factory.add_span_exporter(memory_span_exporter, batch=False) + + # Add LangChain instrumentor to automatically trace LangChain/LangGraph operations + if LANGCHAIN_INSTRUMENTATION_AVAILABLE: + runtime_factory.add_instrumentor( + LangChainInstrumentor, get_current_span + ) + if context.job_id: async with ResourceOverwritesContext( lambda: read_resource_overwrites_from_file(context.runtime_dir) @@ -133,15 +579,64 @@ async def execute() -> None: f"Applied {ctx.overwrites_count} resource overwrite(s)" ) - result = await runtime_factory.execute(context) + execution_result = await runtime_factory.execute(context) else: - result = await runtime_factory.execute(context) + execution_result = await runtime_factory.execute(context) if not context.job_id: - console.info(result.output) + console.info(execution_result.output) asyncio.run(execute()) + # Generate evaluation set if requested + if generate_evals and execution_result: + # Get the actual input data (from file or argument) + actual_input = input + if input_file and os.path.exists(input_file): + try: + with open(input_file, "r") as f: + actual_input = f.read() + except Exception as e: + console.warning( + f"Failed to read input file for eval generation: {e}" + ) + + # Convert output to proper format for eval generation + output_for_eval = ( + execution_result.output + if hasattr(execution_result, "output") + else execution_result + ) + + # If output is a Pydantic model, convert to dict + if hasattr(output_for_eval, "model_dump"): + output_for_eval = output_for_eval.model_dump() + elif hasattr(output_for_eval, "dict"): + output_for_eval = output_for_eval.dict() + # If it's already a dict, ensure it's not wrapped + elif isinstance(output_for_eval, dict) and "dict" in output_for_eval: + # Unwrap if it's in the format {"dict": "..."} + try: + import ast + + output_for_eval = ast.literal_eval(output_for_eval["dict"]) + except: + pass # Keep as-is if parsing fails + + # Get spans from memory exporter if available + collected_spans = ( + memory_span_exporter.spans if memory_span_exporter else None + ) + + _generate_evaluation_set( + input_data=actual_input, + output_data=output_for_eval, + entrypoint=entrypoint, + eval_set_path=generate_evals, + 
+                evaluators=list(eval_evaluators) if eval_evaluators else None,
+                spans=collected_spans,
+            )
+
     except UiPathRuntimeError as e:
         console.error(f"{e.error_info.title} - {e.error_info.detail}")
     except Exception as e:
diff --git a/src/uipath/_events/_events.py b/src/uipath/_events/_events.py
index 486c48aaf..44c6ba8e7 100644
--- a/src/uipath/_events/_events.py
+++ b/src/uipath/_events/_events.py
@@ -5,8 +5,7 @@
 from opentelemetry.sdk.trace import ReadableSpan
 from pydantic import BaseModel, ConfigDict, Field, SkipValidation, model_validator
 
-from uipath._cli._evals._models._evaluation_set import EvaluationItem
-from uipath.eval.evaluators import BaseEvaluator
+from uipath._cli._evals._models._evaluation_set import AnyEvaluationItem, AnyEvaluator
 from uipath.eval.models import EvalItemResult
 
 
@@ -24,12 +23,13 @@ class EvalSetRunCreatedEvent(BaseModel):
     eval_set_run_id: Optional[str] = None
     no_of_evals: int
     # skip validation to avoid abstract class instantiation
-    evaluators: SkipValidation[List[BaseEvaluator[Any, Any, Any]]]
+    evaluators: SkipValidation[List[AnyEvaluator]]
+    evaluator_weights: Optional[Dict[str, float]] = None
 
 
 class EvalRunCreatedEvent(BaseModel):
     execution_id: str
-    eval_item: EvaluationItem
+    eval_item: AnyEvaluationItem
 
 
 class EvalItemExceptionDetails(BaseModel):
@@ -43,7 +43,7 @@ class EvalRunUpdatedEvent(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
     execution_id: str
-    eval_item: EvaluationItem
+    eval_item: AnyEvaluationItem
     eval_results: List[EvalItemResult]
     success: bool
     agent_output: Any
@@ -62,6 +62,8 @@ def validate_exception_details(self):
 class EvalSetRunUpdatedEvent(BaseModel):
     execution_id: str
     evaluator_scores: dict[str, float]
+    weighted_final_score: Optional[float] = None
+    evaluator_weights: Optional[Dict[str, float]] = None
 
 
 ProgressEvent = Union[
diff --git a/src/uipath/_services/context_grounding_service.py b/src/uipath/_services/context_grounding_service.py
index fdb706a38..1fe0a63a3 100644
--- a/src/uipath/_services/context_grounding_service.py
+++ b/src/uipath/_services/context_grounding_service.py
@@ -20,7 +20,7 @@
 from ..models import IngestionInProgressException
 from ..models.context_grounding import ContextGroundingQueryResponse
 from ..models.context_grounding_index import ContextGroundingIndex
-from ..tracing import traced
+from ..tracing._traced import traced
 from ._base_service import BaseService
 from .buckets_service import BucketsService
 from .folder_service import FolderService
diff --git a/src/uipath/eval/_helpers/evaluators_helpers.py b/src/uipath/eval/_helpers/evaluators_helpers.py
index 8620130cf..3bfb9573f 100644
--- a/src/uipath/eval/_helpers/evaluators_helpers.py
+++ b/src/uipath/eval/_helpers/evaluators_helpers.py
@@ -6,10 +6,7 @@
 
 from opentelemetry.sdk.trace import ReadableSpan
 
-from ..models import (
-    ToolCall,
-    ToolOutput,
-)
+from ..models import ToolCall, ToolOutput
 
 COMPARATOR_MAPPINGS = {
     ">": "gt",
@@ -420,6 +417,47 @@ def tool_calls_output_score(
     ), justifications
 
 
+def extract_node_output_from_trace(
+    agent_trace: Sequence[ReadableSpan], node_id: str
+) -> Any:
+    """Extract the output of a specific node from the agent execution trace.
+
+    Args:
+        agent_trace: List of ReadableSpan objects from agent execution.
+        node_id: The identifier of the node to extract output from.
+
+    Returns:
+        The output value of the node, or None if not found.
+    """
+    for span in agent_trace:
+        if not span.attributes:
+            continue
+
+        # Check if this span matches the node_id
+        span_name = span.name
+        node_name_attr = span.attributes.get("node_name") or span.attributes.get(
+            "langgraph.node"
+        )
+
+        # Match by span name or node_name attribute
+        if span_name == node_id or node_name_attr == node_id:
+            # Extract output from span attributes
+            output_value = span.attributes.get("output.value") or span.attributes.get(
+                "output"
+            )
+
+            # Try to parse if it's a JSON string
+            if isinstance(output_value, str):
+                try:
+                    return json.loads(output_value)
+                except (json.JSONDecodeError, ValueError):
+                    return output_value
+
+            return output_value
+
+    return None
+
+
 def trace_to_str(agent_trace: Sequence[ReadableSpan]) -> str:
     """Convert OTEL spans to a platform-style agent run history string.
diff --git a/src/uipath/eval/models/models.py b/src/uipath/eval/models/models.py
index f3e9e3ca9..5ed4805ea 100644
--- a/src/uipath/eval/models/models.py
+++ b/src/uipath/eval/models/models.py
@@ -39,7 +39,7 @@ class ScoreType(IntEnum):
 class BaseEvaluationResult(BaseModel):
     """Base class for evaluation results."""
 
-    details: Optional[str | BaseModel] = None
+    details: Optional[str | Dict[str, Any] | BaseModel] = None
     # this is marked as optional, as it is populated inside the 'measure_execution_time' decorator
     evaluation_time: Optional[float] = None