78 changes: 29 additions & 49 deletions javascript/examples/vitest/tests/multimodal-audio-to-audio.test.ts
@@ -1,79 +1,59 @@
 import { openai } from "@ai-sdk/openai";
-import scenario, { AgentRole } from "@langwatch/scenario";
+import scenario, { AgentRole, audioFromFile } from "@langwatch/scenario";
 import { UserModelMessage } from "ai";
 import { describe, it, expect } from "vitest";
-import {
-  encodeAudioToBase64,
-  getFixturePath,
-  wrapJudgeForAudioTranscription,
-} from "./helpers";
+import { getFixturePath } from "./helpers";
 import { OpenAiVoiceAgent } from "./helpers/openai-voice-agent";
 
 class AudioAgent extends OpenAiVoiceAgent {
   role: AgentRole = AgentRole.AGENT;
 }
 
 // Use setId to group together for visualizing in the UI
 const setId = "multimodal-audio-test";
 
+/**
+ * This example shows how to test an agent that can take audio input
+ * from a fixture and respond with audio output.
+ *
+ * Uses:
+ * - audioFromFile() to load audio
+ * - scenario.message() to inject the audio message
+ * - scenario.judgeAgent({ audio: true }) for multimodal evaluation
+ */
 describe("Multimodal Audio to Audio Tests", () => {
-  it("should handle audio input", async () => {
+  it("should handle audio input from file", async () => {
     const myAgent = new AudioAgent({
-      systemPrompt: `
-        You are a helpful assistant that can analyze audio input and respond with audio output.
-        You must respond with audio output.
-      `,
+      systemPrompt: `You are a helpful assistant that analyzes audio input.
+        Answer questions about the audio content.`,
       voice: "alloy",
       forceUserRole: true,
     });
 
-    const data = encodeAudioToBase64(
-      getFixturePath("male_or_female_voice.wav"),
-    );
+    // Load audio file using the utility
+    const audio = audioFromFile(getFixturePath("male_or_female_voice.wav"));
 
-    // The AI-SDK will only support file parts,
-    // so we cannot use the OpenAI shape from above
-    // @see https://ai-sdk.dev/docs/foundations/prompts#file-parts
-    const audioMessage = {
+    // Create audio message with instructions
+    const audioMessage: UserModelMessage = {
       role: "user",
       content: [
-        {
-          type: "text",
-          text: `
-            Answer the question in the a text.
-            If you're not sure, you're required to take a best guess.
-            After you've guessed, you must repeat the question and say what format the input was in (audio or text)
-          `,
-        },
-        {
-          type: "file",
-          mediaType: "audio/wav",
-          data,
-        },
+        { type: "text", text: "Is this a male or female voice? Take a guess." },
+        { type: "file", mediaType: audio.mediaType, data: audio.data },
       ],
-    } satisfies UserModelMessage;
-
-    const audioJudge = wrapJudgeForAudioTranscription(
-      scenario.judgeAgent({
-        model: openai("gpt-4o"),
-        criteria: [
-          "The agent correctly guesses it's a male voice",
-          "The agent repeats the question",
-          "The agent says what format the input was in (audio or text)",
-        ],
-      }),
-    );
+    };
 
     const result = await scenario.run({
       setId,
-      name: "multimodal audio to audio",
-      description:
-        "User sends audio file, agent analyzes and transcribes the content",
-      agents: [myAgent, scenario.userSimulatorAgent(), audioJudge],
+      name: "audio to audio - file input",
+      description: "User sends audio file, agent analyzes and responds",
+      agents: [
+        myAgent,
+        scenario.userSimulatorAgent(),
+        scenario.judgeAgent({
+          model: openai("gpt-4o"),
+          criteria: ["The agent guesses the voice gender"],
+          audio: true,
+
+    [Review comment on this line (collaborator author). Suggested change: "audio: true," -> "transcribeOnly?: true"]
+
+        }),
+      ],
       script: [
         scenario.message(audioMessage),
         scenario.agent(),
@@ -94,7 +74,7 @@ describe("Multimodal Audio to Audio Tests", () => {
   it.todo("should handle multiple audio formats (WAV, MP3, M4A)");
   it.todo("should handle long audio files gracefully");
   it.todo(
-    "should provide appropriate responses for unclear or corrupted audio",
+    "should provide appropriate responses for unclear or corrupted audio"
   );
   it.todo("should handle audio with background noise");
   it.todo("should transcribe speech in different languages");
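
The core change in this file swaps manual base64 encoding (encodeAudioToBase64) for the audioFromFile() helper, whose result feeds an AI-SDK file part directly. A minimal standalone sketch of that pattern, assuming audioFromFile returns an object exposing mediaType and data fields compatible with AI-SDK file parts, as the diff implies; the fixture path and prompt text here are illustrative, not taken from the repo:

import scenario, { audioFromFile } from "@langwatch/scenario";
import { UserModelMessage } from "ai";

// Load a fixture once; the helper carries both the payload and its media
// type, so the message no longer hard-codes "audio/wav".
const audio = audioFromFile("./fixtures/male_or_female_voice.wav");

// The AI-SDK only supports "file" parts for binary payloads, so the audio
// rides alongside the text prompt as a file part.
const audioMessage: UserModelMessage = {
  role: "user",
  content: [
    { type: "text", text: "Is this a male or female voice? Take a guess." },
    { type: "file", mediaType: audio.mediaType, data: audio.data },
  ],
};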
90 changes: 33 additions & 57 deletions javascript/examples/vitest/tests/multimodal-audio-to-text.test.ts
@@ -3,104 +3,80 @@ import scenario, {
   AgentAdapter,
   AgentInput,
   AgentRole,
+  audioFromFile,
 } from "@langwatch/scenario";
 import { UserModelMessage } from "ai";
 import OpenAI from "openai";
 import { ChatCompletionMessageParam } from "openai/resources/chat/completions.mjs";
 import { describe, it, expect } from "vitest";
-import {
-  encodeAudioToBase64,
-  getFixturePath,
-  wrapJudgeForAudioTranscription,
-} from "./helpers";
+import { getFixturePath } from "./helpers";
 import { convertModelMessagesToOpenAIMessages } from "./helpers/convert-core-messages-to-openai";
 
-class AudioAgent extends AgentAdapter {
+/**
+ * Agent that takes audio input and responds with text
+ */
+class AudioToTextAgent extends AgentAdapter {
   role: AgentRole = AgentRole.AGENT;
   private openai = new OpenAI();
 
   call = async (input: AgentInput) => {
-    // To use the OpenAI "voice-to-voice" model, we need to use the
-    // OpenAI api directly, and so we need to convert the messages to the correct
-    // shape here.
-    // @see https://platform.openai.com/docs/guides/audio?example=audio-in
     const messages = convertModelMessagesToOpenAIMessages(input.messages);
     const response = await this.respond(messages);
 
     // Scenario expects the response to be a string, so we only send the transcript
     const transcript = response.choices[0].message?.audio?.transcript;
 
-    // Handle text response
     if (typeof transcript === "string") {
       return transcript;
-    } else {
-      throw new Error("Agent failed to generate a response");
     }
+    throw new Error("Agent failed to generate a response");
   };
 
   private async respond(messages: ChatCompletionMessageParam[]) {
     return await this.openai.chat.completions.create({
       model: "gpt-4o-audio-preview",
       modalities: ["text", "audio"],
       audio: { voice: "alloy", format: "wav" },
-      // We need to strip the id, or the openai client will throw an error
       messages,
       store: false,
     });
   }
 }
 
-// Use setId to group together for visualizing in the UI
-const setId = "multimodal-audio-test";
-
 /**
- * This example shows how to test an agent that can take audio input
- * and respond with text output.
+ * This example shows how to test an agent that takes audio input
+ * and responds with text output.
+ *
+ * Uses:
+ * - audioFromFile() to load audio
+ * - scenario.message() to inject the audio message
+ * - scenario.judgeAgent({ audio: true }) for multimodal evaluation
  */
 describe("Multimodal Audio to Text Tests", () => {
-  it("should handle audio input", async () => {
-    const data = encodeAudioToBase64(
-      getFixturePath("male_or_female_voice.wav"),
-    );
+  it("should handle audio input from file", async () => {
+    // Load audio file
+    const audio = audioFromFile(getFixturePath("male_or_female_voice.wav"));
 
-    // The AI-SDK will only support file parts,
-    // so we cannot use the OpenAI shape from above
-    // @see https://ai-sdk.dev/docs/foundations/prompts#file-parts
-    const audioMessage = {
+    const audioMessage: UserModelMessage = {
       role: "user",
       content: [
-        {
-          type: "text",
-          text: `
-            Answer the question in the audio.
-            If you're not sure, you're required to take a best guess.
-            After you've guessed, you must repeat the question and say what format the input was in (audio or text)
-          `,
-        },
-        {
-          type: "file",
-          mediaType: "audio/wav",
-          data,
-        },
+        { type: "text", text: "Is this a male or female voice?" },
+        { type: "file", mediaType: audio.mediaType, data: audio.data },
       ],
-    } satisfies UserModelMessage;
-
-    const audioJudge = wrapJudgeForAudioTranscription(
-      scenario.judgeAgent({
-        model: openai("gpt-5"),
-        criteria: [
-          "The agent guesses it's a male voice",
-          "The agent repeats the question",
-          "The agent says what format the input was in (audio or text)",
-        ],
-      }),
-    );
+    };
 
     const result = await scenario.run({
-      name: "multimodal audio to text",
-      description:
-        "User sends audio file, agent analyzes and transcribes the content",
-      agents: [new AudioAgent(), scenario.userSimulatorAgent(), audioJudge],
+      name: "audio to text",
+      description: "User sends audio, agent responds with text",
+      agents: [
+        new AudioToTextAgent(),
+        scenario.userSimulatorAgent(),
+        scenario.judgeAgent({
+          model: openai("gpt-4o"),
+          criteria: ["The agent identifies the voice gender"],
+          audio: true,
+        }),
+      ],
       script: [
         scenario.message(audioMessage),
         scenario.agent(),
@@ -122,7 +98,7 @@ describe("Multimodal Audio to Text Tests", () => {
   it.todo("should handle multiple audio formats (WAV, MP3)");
   it.todo("should handle long audio files gracefully");
   it.todo(
-    "should provide appropriate responses for unclear or corrupted audio",
+    "should provide appropriate responses for unclear or corrupted audio"
  );
   it.todo("should handle audio with background noise");
   it.todo("should transcribe speech in different languages");