78 changes: 29 additions & 49 deletions javascript/examples/vitest/tests/multimodal-audio-to-audio.test.ts
@@ -1,79 +1,59 @@
 import { openai } from "@ai-sdk/openai";
-import scenario, { AgentRole } from "@langwatch/scenario";
+import scenario, { AgentRole, audioFromFile } from "@langwatch/scenario";
 import { UserModelMessage } from "ai";
 import { describe, it, expect } from "vitest";
-import {
-  encodeAudioToBase64,
-  getFixturePath,
-  wrapJudgeForAudioTranscription,
-} from "./helpers";
+import { getFixturePath } from "./helpers";
 import { OpenAiVoiceAgent } from "./helpers/openai-voice-agent";
 
 class AudioAgent extends OpenAiVoiceAgent {
   role: AgentRole = AgentRole.AGENT;
 }
 
 // Use setId to group together for visualizing in the UI
 const setId = "multimodal-audio-test";
 
+/**
+ * This example shows how to test an agent that can take audio input
+ * from a fixture and respond with audio output.
+ *
+ * Uses:
+ * - audioFromFile() to load audio
+ * - scenario.message() to inject the audio message
+ * - scenario.judgeAgent({ audio: true }) for multimodal evaluation
+ */
 describe("Multimodal Audio to Audio Tests", () => {
-  it("should handle audio input", async () => {
+  it("should handle audio input from file", async () => {
     const myAgent = new AudioAgent({
-      systemPrompt: `
-        You are a helpful assistant that can analyze audio input and respond with audio output.
-        You must respond with audio output.
-      `,
+      systemPrompt: `You are a helpful assistant that analyzes audio input.
+        Answer questions about the audio content.`,
       voice: "alloy",
       forceUserRole: true,
     });
 
-    const data = encodeAudioToBase64(
-      getFixturePath("male_or_female_voice.wav"),
-    );
+    // Load audio file using the utility
+    const audio = audioFromFile(getFixturePath("male_or_female_voice.wav"));
 
-    // The AI-SDK will only support file parts,
-    // so we cannot use the OpenAI shape from above
-    // @see https://ai-sdk.dev/docs/foundations/prompts#file-parts
-    const audioMessage = {
+    // Create audio message with instructions
+    const audioMessage: UserModelMessage = {
       role: "user",
       content: [
-        {
-          type: "text",
-          text: `
-            Answer the question in the a text.
-            If you're not sure, you're required to take a best guess.
-            After you've guessed, you must repeat the question and say what format the input was in (audio or text)
-          `,
-        },
-        {
-          type: "file",
-          mediaType: "audio/wav",
-          data,
-        },
+        { type: "text", text: "Is this a male or female voice? Take a guess." },
+        { type: "file", mediaType: audio.mediaType, data: audio.data },
       ],
-    } satisfies UserModelMessage;
-
-    const audioJudge = wrapJudgeForAudioTranscription(
-      scenario.judgeAgent({
-        model: openai("gpt-4o"),
-        criteria: [
-          "The agent correctly guesses it's a male voice",
-          "The agent repeats the question",
-          "The agent says what format the input was in (audio or text)",
-        ],
-      }),
-    );
+    };
 
     const result = await scenario.run({
       setId,
-      name: "multimodal audio to audio",
-      description:
-        "User sends audio file, agent analyzes and transcribes the content",
-      agents: [myAgent, scenario.userSimulatorAgent(), audioJudge],
+      name: "audio to audio - file input",
+      description: "User sends audio file, agent analyzes and responds",
+      agents: [
+        myAgent,
+        scenario.userSimulatorAgent(),
+        scenario.judgeAgent({
+          model: openai("gpt-4o"),
+          criteria: ["The agent guesses the voice gender"],
+          audio: true,
+
+    [Review comment on this line (collaborator author). Suggested change: "audio: true," -> "transcribeOnly?: true"]
+
+        }),
+      ],
       script: [
         scenario.message(audioMessage),
         scenario.agent(),
@@ -94,7 +74,7 @@ describe("Multimodal Audio to Audio Tests", () => {
   it.todo("should handle multiple audio formats (WAV, MP3, M4A)");
   it.todo("should handle long audio files gracefully");
   it.todo(
-    "should provide appropriate responses for unclear or corrupted audio",
+    "should provide appropriate responses for unclear or corrupted audio"
   );
   it.todo("should handle audio with background noise");
   it.todo("should transcribe speech in different languages");
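
The core change in this file swaps manual base64 encoding (encodeAudioToBase64) for the audioFromFile() helper, whose result feeds an AI-SDK file part directly. A minimal standalone sketch of that pattern, assuming audioFromFile returns an object exposing mediaType and data fields compatible with AI-SDK file parts, as the diff implies; the fixture path and prompt text here are illustrative, not taken from the repo:

import scenario, { audioFromFile } from "@langwatch/scenario";
import { UserModelMessage } from "ai";

// Load a fixture once; the helper carries both the payload and its media
// type, so the message no longer hard-codes "audio/wav".
const audio = audioFromFile("./fixtures/male_or_female_voice.wav");

// The AI-SDK only supports "file" parts for binary payloads, so the audio
// rides alongside the text prompt as a file part.
const audioMessage: UserModelMessage = {
  role: "user",
  content: [
    { type: "text", text: "Is this a male or female voice? Take a guess." },
    { type: "file", mediaType: audio.mediaType, data: audio.data },
  ],
};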
90 changes: 33 additions & 57 deletions javascript/examples/vitest/tests/multimodal-audio-to-text.test.ts
@@ -3,104 +3,80 @@ import scenario, {
   AgentAdapter,
   AgentInput,
   AgentRole,
+  audioFromFile,
 } from "@langwatch/scenario";
 import { UserModelMessage } from "ai";
 import OpenAI from "openai";
 import { ChatCompletionMessageParam } from "openai/resources/chat/completions.mjs";
 import { describe, it, expect } from "vitest";
-import {
-  encodeAudioToBase64,
-  getFixturePath,
-  wrapJudgeForAudioTranscription,
-} from "./helpers";
+import { getFixturePath } from "./helpers";
 import { convertModelMessagesToOpenAIMessages } from "./helpers/convert-core-messages-to-openai";
 
-class AudioAgent extends AgentAdapter {
+/**
+ * Agent that takes audio input and responds with text
+ */
+class AudioToTextAgent extends AgentAdapter {
   role: AgentRole = AgentRole.AGENT;
   private openai = new OpenAI();
 
   call = async (input: AgentInput) => {
-    // To use the OpenAI "voice-to-voice" model, we need to use the
-    // OpenAI api directly, and so we need to convert the messages to the correct
-    // shape here.
-    // @see https://platform.openai.com/docs/guides/audio?example=audio-in
     const messages = convertModelMessagesToOpenAIMessages(input.messages);
     const response = await this.respond(messages);
 
     // Scenario expects the response to be a string, so we only send the transcript
     const transcript = response.choices[0].message?.audio?.transcript;
 
-    // Handle text response
     if (typeof transcript === "string") {
       return transcript;
-    } else {
-      throw new Error("Agent failed to generate a response");
     }
+    throw new Error("Agent failed to generate a response");
   };
 
   private async respond(messages: ChatCompletionMessageParam[]) {
     return await this.openai.chat.completions.create({
       model: "gpt-4o-audio-preview",
       modalities: ["text", "audio"],
       audio: { voice: "alloy", format: "wav" },
-      // We need to strip the id, or the openai client will throw an error
       messages,
       store: false,
     });
   }
 }
 
-// Use setId to group together for visualizing in the UI
-const setId = "multimodal-audio-test";
-
 /**
- * This example shows how to test an agent that can take audio input
- * and respond with text output.
+ * This example shows how to test an agent that takes audio input
+ * and responds with text output.
+ *
+ * Uses:
+ * - audioFromFile() to load audio
+ * - scenario.message() to inject the audio message
+ * - scenario.judgeAgent({ audio: true }) for multimodal evaluation
  */
 describe("Multimodal Audio to Text Tests", () => {
-  it("should handle audio input", async () => {
-    const data = encodeAudioToBase64(
-      getFixturePath("male_or_female_voice.wav"),
-    );
+  it("should handle audio input from file", async () => {
+    // Load audio file
+    const audio = audioFromFile(getFixturePath("male_or_female_voice.wav"));
 
-    // The AI-SDK will only support file parts,
-    // so we cannot use the OpenAI shape from above
-    // @see https://ai-sdk.dev/docs/foundations/prompts#file-parts
-    const audioMessage = {
+    const audioMessage: UserModelMessage = {
       role: "user",
       content: [
-        {
-          type: "text",
-          text: `
-            Answer the question in the audio.
-            If you're not sure, you're required to take a best guess.
-            After you've guessed, you must repeat the question and say what format the input was in (audio or text)
-          `,
-        },
-        {
-          type: "file",
-          mediaType: "audio/wav",
-          data,
-        },
+        { type: "text", text: "Is this a male or female voice?" },
+        { type: "file", mediaType: audio.mediaType, data: audio.data },
       ],
-    } satisfies UserModelMessage;
-
-    const audioJudge = wrapJudgeForAudioTranscription(
-      scenario.judgeAgent({
-        model: openai("gpt-5"),
-        criteria: [
-          "The agent guesses it's a male voice",
-          "The agent repeats the question",
-          "The agent says what format the input was in (audio or text)",
-        ],
-      }),
-    );
+    };
 
     const result = await scenario.run({
-      name: "multimodal audio to text",
-      description:
-        "User sends audio file, agent analyzes and transcribes the content",
-      agents: [new AudioAgent(), scenario.userSimulatorAgent(), audioJudge],
+      name: "audio to text",
+      description: "User sends audio, agent responds with text",
+      agents: [
+        new AudioToTextAgent(),
+        scenario.userSimulatorAgent(),
+        scenario.judgeAgent({
+          model: openai("gpt-4o"),
+          criteria: ["The agent identifies the voice gender"],
+          audio: true,
+        }),
+      ],
       script: [
         scenario.message(audioMessage),
         scenario.agent(),
@@ -122,7 +98,7 @@ describe("Multimodal Audio to Text Tests", () => {
   it.todo("should handle multiple audio formats (WAV, MP3)");
   it.todo("should handle long audio files gracefully");
   it.todo(
-    "should provide appropriate responses for unclear or corrupted audio",
+    "should provide appropriate responses for unclear or corrupted audio"
  );
   it.todo("should handle audio with background noise");
   it.todo("should transcribe speech in different languages");