@@ -3,7 +3,7 @@
 import time
 import traceback
 from typing import List, Tuple
-
+from pathlib import Path
 import openai
 from langgraph.errors import GraphRecursionError
 from pydantic import ValidationError
@@ -17,6 +17,8 @@
 from evaluator.interfaces.algorithm import Algorithm
 from evaluator.utils.csv_logger import CSVLogger
 from evaluator.components.llm_provider import get_llm
+from evaluator.utils.parsing_tools import generate_and_save_additional_queries
+import json as _json
 from dotenv import load_dotenv
 
 from evaluator.utils.tool_logger import ToolLogger
@@ -35,13 +37,13 @@ class Evaluator(object):
 
     config: EvaluationConfig
 
-    def __init__(self, config_path: str | None, use_defaults: bool):
+    def __init__(self, config_path: str | None, use_defaults: bool, test_with_additional_queries: bool = False):
         try:
             self.config = load_config(config_path, use_defaults=use_defaults)
         except ConfigError as ce:
             log(f"Configuration error: {ce}")
             raise SystemExit(2)
-
+        self.test_with_additional_queries = test_with_additional_queries
     async def run(self) -> None:
 
         # Set up the necessary components for the experiments:
@@ -112,15 +114,13 @@ async def _run_experiment(self,
         Runs the specified experiment and returns the number of evaluated queries.
         """
         processed_queries_num = 0
-
         try:
             queries = await self._set_up_experiment(spec, metric_collectors, mcp_proxy_manager)
             algorithm, environment = spec
 
             try:
                 for i, query_spec in enumerate(queries):
                     log(f"Processing query #{query_spec.id} (Experiment {exp_index} of {total_exp_num}, query {i + 1} of {len(queries)})...")
-
                     for mc in metric_collectors:
                         mc.prepare_for_measurement(query_spec)
 
@@ -199,12 +199,12 @@ async def _set_up_experiment(self,
         log(f"Initializing LLM connection: {environment.model_id}")
         llm = get_llm(model_id=environment.model_id, model_config=self.config.models)
         log("Connection established successfully.\n")
-
         log("Fetching queries for the current experiment...")
         queries = get_queries(environment, self.config.data)
         log(f"Successfully loaded {len(queries)} queries.\n")
         print_iterable_verbose("The following queries will be executed:\n", queries)
-
+        log("Generating additional queries.\n")
+        generate_and_save_additional_queries(llm, queries)
         log("Retrieving tool definitions for the current experiment...")
         tool_specs = get_tools_from_queries(queries)
         tools = await mcp_proxy_manager.run_mcp_proxy(tool_specs, init_client=True).get_tools()
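
Note: generate_and_save_additional_queries (imported above from evaluator.utils.parsing_tools) is not part of this diff, so its behavior is only assumed here. Below is a minimal sketch of what such a helper could look like, assuming a LangChain-style chat model exposing .invoke(), query objects with .id and .text attributes, and an output file named additional_queries.json; all of these names are assumptions for illustration, not the repository's actual API.

    import json
    from pathlib import Path

    def generate_and_save_additional_queries(llm, queries, out_path="additional_queries.json"):
        # Ask the LLM for paraphrases of each query and persist them for later embedding.
        additional = {}
        for query_spec in queries:
            prompt = (
                "Rephrase the following user query in three different ways, one per line:\n"
                f"{query_spec.text}"
            )
            response = llm.invoke(prompt)
            variants = [line.strip() for line in response.content.splitlines() if line.strip()]
            additional[query_spec.id] = variants
        Path(out_path).write_text(json.dumps(additional, indent=2), encoding="utf-8")
        return additional
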
@@ -213,8 +213,40 @@ async def _set_up_experiment(self,
 
         log("Setting up the algorithm and the metric collectors...")
         algorithm.set_up(llm, tools)
+
+        if algorithm.__module__ == "evaluator.algorithms.tool_rag_algorithm":
+            log("Embedding additional queries...")
+            additional_query_embeddings = algorithm.embed_additional_queries(queries)
+            print("Additional query embedding counts per query/tool:",
+                  {k: {tk: len(tv) for tk, tv in v.items()} for k, v in additional_query_embeddings.items()})
+        else:
+            print("No additional queries to embed.")
         for mc in metric_collectors:
             mc.set_up()
         log("All set!\n")
 
         return queries
+
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser(description="Run the Evaluator experiments.")
+    parser.add_argument("--config", type=str, default=None, help="Path to evaluation config YAML file")
+    parser.add_argument("--defaults", action="store_true", help="Use default config options if set")
+    parser.add_argument("--test-with-additional-queries", action="store_true", help="Test with additional queries")
+    args = parser.parse_args()
+
+    from evaluator.utils.utils import log
+
+    log("Starting Evaluator main...")
+    evaluator = Evaluator(
+        config_path=args.config,
+        use_defaults=args.defaults,
+        test_with_additional_queries=args.test_with_additional_queries
+    )
+    try:
+        import asyncio
+        asyncio.run(evaluator.run())
+        log("Evaluator finished successfully!")
+    except Exception as e:
+        log(f"Evaluator failed: {e}")
+        raise
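
With the new __main__ block, the evaluator can be launched directly from the command line. A possible invocation is shown below; the file name and config path are assumptions, adjust them to wherever this module and your config actually live:

    # hypothetical invocation; --config, --defaults and --test-with-additional-queries
    # come from the argparse setup in the diff above
    python evaluator.py --config configs/eval.yaml --defaults --test-with-additional-queries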