diff --git a/assets/evaluators/builtin/task_navigation_efficiency/evaluator/_task_navigation_efficiency.py b/assets/evaluators/builtin/task_navigation_efficiency/evaluator/_task_navigation_efficiency.py index 0dc67d2ae1..e7405995df 100644 --- a/assets/evaluators/builtin/task_navigation_efficiency/evaluator/_task_navigation_efficiency.py +++ b/assets/evaluators/builtin/task_navigation_efficiency/evaluator/_task_navigation_efficiency.py @@ -397,7 +397,11 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, Dict[s ground_truth_names = [] ground_truth_params_dict: Dict[str, Dict[str, Any]] = {} - if isinstance(ground_truth, tuple) and len(ground_truth) == 2: + if isinstance(ground_truth, list) and all(isinstance(step, str) for step in ground_truth): + # List format: just tool names + ground_truth_names = [step.strip() for step in ground_truth] + use_parameter_matching = False + elif (isinstance(ground_truth, tuple) or isinstance(ground_truth, list)) and len(ground_truth) == 2: # Tuple format: (tool_names, parameters_dict) tool_names_list, params_dict = ground_truth @@ -429,10 +433,6 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, Dict[s ground_truth_names = [name.strip() for name in tool_names_list] ground_truth_params_dict = params_dict use_parameter_matching = True - elif isinstance(ground_truth, list) and all(isinstance(step, str) for step in ground_truth): - # List format: just tool names - ground_truth_names = [step.strip() for step in ground_truth] - use_parameter_matching = False else: raise TypeError( "ground_truth must be a list of strings or a tuple of (list[str], dict[str, dict[str, str]])" diff --git a/assets/evaluators/builtin/task_navigation_efficiency/spec.yaml b/assets/evaluators/builtin/task_navigation_efficiency/spec.yaml index 6aba3a98fb..723307c5c9 100644 --- a/assets/evaluators/builtin/task_navigation_efficiency/spec.yaml +++ b/assets/evaluators/builtin/task_navigation_efficiency/spec.yaml @@ -1,6 +1,6 @@ type: "evaluator" name: "builtin.task_navigation_efficiency" -version: 2 +version: 3 displayName: "Task-Navigation-Efficiency-Evaluator" description: "Determines whether an agent’s sequence of steps (e.g., tool calls and parameters) matches an optimal or ground truth path for completing a task. Use it to evaluate how effectively an agent follows expected sequence of actions and executes multi-step workflows." evaluatorType: "builtin" @@ -29,11 +29,10 @@ dataMappingSchema: items: type: "string" - type: "array" - items: - - type: "array" - items: - type: "string" - - type: "object" + prefixItems: [ + { type: "array", items: { type: "string" } }, + { type: "object" } + ] required: ["response", "ground_truth"] outputSchema: task_navigation_efficiency: