Skip to content

Commit 2f96e31

Browse files
authored
Multi turn evals (#347)
# Create multi-turn complex evals We want to check if the agent can perform a chain of actions. Example: ``` turns: [ { input: "What's the price of KING?" }, { input: "The mint address is 5eqNDjbsWL9hfAqUfhegTxgEa3XardzGdVAboMA4pump", expectedToolCall: { tool: "solana_token_data", params: "5eqNDjbsWL9hfAqUfhegTxgEa3XardzGdVAboMA4pump", }, }, { input: "Buy 20 tokens using USDC", expectedToolCall: { tool: "solana_trade", params: { inputAmount: 20, inputMint: "EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v", outputMint: "5eqNDjbsWL9hfAqUfhegTxgEa3XardzGdVAboMA4pump", slippageBps: 100, }, }, }, { input: "And check my KING balance", expectedToolCall: { tool: "solana_balance_other", params: { walletAddress: "GZbQmKYYzwjP3nbdqRWPLn98ipAni9w5eXMGp7bmZbGB", tokenAddress: "5eqNDjbsWL9hfAqUfhegTxgEa3XardzGdVAboMA4pump", }, }, }, ] ``` This test checks if the agent responds appropriately to the 4 user queries. 1) The agent should ask for the mint address to find the right token. 2) Once provided, it should call `solana_token_data` and get the token information. 3) The user asked it to buy the token using 20 USDC. The agent has all the information needed to execute the trade in its context and should use `solana_trade`. 4) The agent should get the user's balance for the token using `solana_balance_other`. ## Changes Made This PR adds the following changes: ## Implementation Details - Added `runComplexEval` function that handles multi-turn evals ## Additional Notes Most of the evals are for basic actions and already pass <img width="641" alt="Screenshot 2025-03-26 at 1 28 14 PM" src="https://github.com/user-attachments/assets/a042d754-56ce-4f4b-b877-4382ae6f84b5" /> ## Checklist - [X] I have tested these changes locally - [X] I have added the prompt used to test it
2 parents 57805d6 + 5e97454 commit 2f96e31

14 files changed

+1214
-23
lines changed
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
import { runComplexEval, ComplexEvalDataset } from "../utils/runEvals";

/**
 * Multi-turn eval for creating a Gibwork task.
 *
 * The task details are supplied piecemeal across the turns, with a JUP price
 * lookup in between. The final turn expects a single `create_gibwork_task`
 * call whose params combine values given in earlier turns.
 */
const DATASET: ComplexEvalDataset[] = [
  {
    description: "Multi-turn create Gibwork task",
    inputs: {
      query: "I need to create a new Gibwork task",
    },
    turns: [
      {
        input: "I need to create a new Gibwork task",
        expectedResponse: "Sure, please provide the task title or details.",
      },
      {
        input: "The task is titled 'Fix my website'",
        expectedResponse:
          "Understood, 'Fix my website' is the task title. Any more details?",
      },
      {
        // Side question in the middle of the task flow; expects a token data
        // lookup instead of a text reply.
        input: "Also, what's the current price of JUP?",
        expectedToolCall: {
          tool: "solana_token_data",
          params: "JUPyiwrYJFskUPiHa7hkeR8VUtAeFoSYbKedZNsDvCN",
        },
      },
      {
        input:
          "The Gibwork job should be for 1000 JUP tokens with no extra content.",
        expectedResponse:
          "Okay, 1000 JUP tokens, no additional content. Any requirements or tags?",
      },
      {
        input: "Set content and requirements to N/A and tag it as webdev",
        expectedToolCall: {
          tool: "create_gibwork_task",
          params: {
            // Title comes from turn 2.
            title: "Fix my website",
            content: "N/A",
            requirements: "N/A",
            tags: ["webdev"],
            // Same mint address as the JUP lookup in turn 3.
            tokenMintAddress: "JUPyiwrYJFskUPiHa7hkeR8VUtAeFoSYbKedZNsDvCN",
            // Must match the "1000 JUP tokens" requested in turn 4.
            amount: 1000,
          },
        },
      },
    ],
  },
];

runComplexEval(DATASET, "Multi-turn Create Gibwork Task test");
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
import { runComplexEval, ComplexEvalDataset } from "../utils/runEvals";

// Addresses that appear in more than one turn of this conversation.
const KING_MINT = "5eqNDjbsWL9hfAqUfhegTxgEa3XardzGdVAboMA4pump";
const USDC_MINT = "EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v";
const WALLET_ADDRESS = "GZbQmKYYzwjP3nbdqRWPLn98ipAni9w5eXMGp7bmZbGB";

const SUITE_NAME = "Multi-turn Token Data test";

/**
 * Conversation script: a price question with no mint address, then the mint
 * address, then a purchase paid in USDC, then a balance check on the token.
 * The first turn expects a text reply; the rest expect tool calls.
 */
const TURNS: ComplexEvalDataset["turns"] = [
  {
    input: "What's the price of KING?",
    expectedResponse: "Sure, can you provide the mint address of KING?",
  },
  {
    input: `The mint address is ${KING_MINT}`,
    expectedToolCall: {
      tool: "solana_token_data",
      params: KING_MINT,
    },
  },
  {
    input: "Buy 20 tokens using USDC",
    expectedToolCall: {
      tool: "solana_trade",
      params: {
        inputAmount: 20,
        inputMint: USDC_MINT,
        outputMint: KING_MINT,
        slippageBps: 100,
      },
    },
  },
  {
    input: "And check my KING balance",
    expectedToolCall: {
      tool: "solana_balance_other",
      params: {
        walletAddress: WALLET_ADDRESS,
        tokenAddress: KING_MINT,
      },
    },
  },
];

const DATASET: ComplexEvalDataset[] = [
  {
    description: "Multi-turn token data inquiry",
    inputs: {
      query: "What's the price of KING?",
    },
    turns: TURNS,
  },
];

runComplexEval(DATASET, SUITE_NAME);
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
import { runComplexEval, ComplexEvalDataset } from "../utils/runEvals";

/**
 * Multi-turn eval for a token swap.
 *
 * The first two turns expect text replies while the swap details are still
 * incomplete. The third turn supplies everything and expects `solana_trade`.
 * The last turn expects a balance lookup for an explicitly named wallet.
 */
const DATASET: ComplexEvalDataset[] = [
  {
    description: "Multi-turn token swap",
    inputs: {
      query: "I want to swap some tokens",
    },
    turns: [
      {
        input: "I want to swap some tokens",
        expectedResponse: "Sure, which tokens would you like to swap?",
      },
      {
        input: "I want to exchange USDC for JUP tokens",
        expectedResponse: "How much USDC?",
      },
      {
        input: "Swap 10 USDC for JUP with 1% slippage",
        expectedToolCall: {
          tool: "solana_trade",
          params: {
            outputMint: "JUPyiwrYJFskUPiHa7hkeR8VUtAeFoSYbKedZNsDvCN",
            inputAmount: 10,
            inputMint: "EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v",
            // 1% slippage expressed in basis points.
            slippageBps: 100,
          },
        },
      },
      {
        input:
          "Then check the USDC balance of GZbQmKYYzwjP3nbdqRWPLn98ipAni9w5eXMGp7bmZbGB",
        expectedToolCall: {
          tool: "solana_balance_other",
          params: {
            // The user names the wallet explicitly, so the expected call pins
            // it as well; otherwise a lookup against the wrong wallet could
            // still satisfy this turn.
            walletAddress: "GZbQmKYYzwjP3nbdqRWPLn98ipAni9w5eXMGp7bmZbGB",
            tokenAddress: "EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v",
          },
        },
      },
    ],
  },
];

runComplexEval(DATASET, "Multi-turn Token Swap test");
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
import { runComplexEval, ComplexEvalDataset } from "../utils/runEvals";

// Values the user supplies across turns and that the expected tool calls reuse.
const COLLECTION_NAME = "MyCollection";
const METADATA_URI = "https://metadata.mycoll.io/collection.json";
const ROYALTY_BPS = 250;

const SUITE_NAME = "Multi-turn solana_deploy_collection test";

/**
 * Conversation script: the user asks to deploy an NFT collection and supplies
 * the name, then the metadata URI and royalty. The first two turns expect text
 * replies; the last two expect tool calls.
 */
const TURNS: ComplexEvalDataset["turns"] = [
  {
    input: "I want to deploy an NFT collection",
    expectedResponse:
      "Sure, what's the name of your collection? I also need the metadata URI and royalty basis points.",
  },
  {
    input: `The collection should be named ${COLLECTION_NAME}`,
    expectedResponse: "Got it. Metadata URI and royalty basis points?",
  },
  {
    input: `Its metadata URI is ${METADATA_URI}. Set the royalty to ${ROYALTY_BPS} basis points`,
    expectedToolCall: {
      tool: "solana_deploy_collection",
      params: {
        name: COLLECTION_NAME,
        uri: METADATA_URI,
        royaltyBasisPoints: ROYALTY_BPS,
      },
    },
  },
  {
    input: "Also, retrieve the deployed collection details",
    expectedToolCall: {
      tool: "solana_get_asset",
      // NOTE(review): this keys the lookup by collection name rather than an
      // on-chain address — confirm it matches the tool's input schema.
      params: { collection: COLLECTION_NAME },
    },
  },
];

const DATASET: ComplexEvalDataset[] = [
  {
    description: "Multi-turn NFT collection deployment",
    inputs: {
      query: "I want to deploy an NFT collection",
    },
    turns: TURNS,
  },
];

runComplexEval(DATASET, SUITE_NAME);

0 commit comments

Comments
 (0)