
Commit cfcd777

refactor: move enable_structured_output_with_tools to LitellmModel
Moved the enable_structured_output_with_tools parameter from the Agent class to LitellmModel.__init__() to minimize the diff and isolate changes within the LiteLLM adapter, as requested during code review.

Changes:
- Added enable_structured_output_with_tools parameter to LitellmModel.__init__()
- Stored it as an instance variable and used it throughout LitellmModel
- Removed the parameter from the Agent class and related validation
- Removed the parameter from the Model interface (get_response / stream_response)
- Removed the parameter from the Runner (no longer passed to model calls)
- Removed the parameter from the OpenAI model implementations
- Reverted test mock models to their original signatures
- Updated test_gemini_local.py for model-level configuration
- Updated documentation to reflect model-level usage

Before: Agent(model=..., enable_structured_output_with_tools=True)
After: Agent(model=LitellmModel(..., enable_structured_output_with_tools=True))
1 parent 6c637fd commit cfcd777
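
In code, the before/after from the commit message amounts to the following. This is a minimal sketch of the new call site, not part of the diff itself; the toy tool and Pydantic output type are stand-ins modeled on the docs examples further down:

```python
from pydantic import BaseModel

from agents import Agent, function_tool
from agents.extensions.models.litellm_model import LitellmModel


class WeatherReport(BaseModel):
    city: str
    temperature_c: float


@function_tool
def get_weather(city: str) -> dict:
    """Toy tool, for illustration only."""
    return {"city": city, "temperature_c": 21.0}


# Before: Agent(model=..., enable_structured_output_with_tools=True)
# After: the flag is configured on the LiteLLM adapter itself.
agent = Agent(
    name="Weather assistant",
    model=LitellmModel(
        "gemini/gemini-2.5-flash",
        enable_structured_output_with_tools=True,  # Required for Gemini
    ),
    tools=[get_weather],
    output_type=WeatherReport,
)
```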

14 files changed (+42, −96 lines)


docs/agents.md

Lines changed: 5 additions & 3 deletions
@@ -81,14 +81,16 @@ from agents.extensions.models.litellm_model import LitellmModel
 
 agent = Agent(
     name="Weather assistant",
-    model=LitellmModel("gemini/gemini-1.5-flash"),
+    model=LitellmModel(
+        "gemini/gemini-2.5-flash",
+        enable_structured_output_with_tools=True,  # Required for Gemini
+    ),
     tools=[get_weather],
     output_type=WeatherReport,
-    enable_structured_output_with_tools=True,  # Required for Gemini
 )
 ```
 
-The `enable_structured_output_with_tools` parameter injects JSON formatting instructions into the system prompt as a workaround. This is only needed for models accessed via [`LitellmModel`][agents.extensions.models.litellm_model.LitellmModel] that lack native support. OpenAI models ignore this parameter.
+The `enable_structured_output_with_tools` parameter on [`LitellmModel`][agents.extensions.models.litellm_model.LitellmModel] injects JSON formatting instructions into the system prompt as a workaround. This is only needed for models that lack native support for using tools and structured outputs simultaneously (like Gemini).
 
 See the [prompt injection documentation](models/structured_output_with_tools.md) for more details.
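
For a sense of what the "injected JSON formatting instructions" described above could look like, here is a hypothetical sketch. The commit does not show `get_json_output_prompt` itself, so the helper name and wording below are illustrative only:

```python
import json


def sketch_json_output_prompt(json_schema: dict) -> str:
    """Hypothetical stand-in for get_json_output_prompt (not shown in this diff)."""
    return (
        "You must respond with a single JSON object conforming to this JSON "
        "Schema, with no surrounding prose or code fences:\n"
        + json.dumps(json_schema, indent=2)
    )


# For a WeatherReport-like schema, the injected instructions might read:
print(sketch_json_output_prompt({
    "type": "object",
    "properties": {"city": {"type": "string"}, "temperature_c": {"type": "number"}},
    "required": ["city", "temperature_c"],
}))
```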

docs/models/litellm.md

Lines changed: 5 additions & 3 deletions
@@ -111,13 +111,15 @@ def analyze_data(query: str) -> dict:
 
 agent = Agent(
     name="Analyst",
-    model=LitellmModel("gemini/gemini-1.5-flash"),
+    model=LitellmModel(
+        "gemini/gemini-2.5-flash",
+        enable_structured_output_with_tools=True,  # Required for Gemini
+    ),
     tools=[analyze_data],
     output_type=Report,
-    enable_structured_output_with_tools=True,  # Required for Gemini
 )
 ```
 
-The `enable_structured_output_with_tools` parameter enables a workaround that injects JSON formatting instructions into the system prompt instead of using the native API. This allows models like Gemini to return structured outputs even when using tools.
+The `enable_structured_output_with_tools` parameter on `LitellmModel` enables a workaround that injects JSON formatting instructions into the system prompt instead of using the native API. This allows models like Gemini to return structured outputs even when using tools.
 
 See the [prompt injection documentation](structured_output_with_tools.md) for complete details.

docs/models/structured_output_with_tools.md

Lines changed: 10 additions & 6 deletions
@@ -25,7 +25,7 @@ def get_weather(city: str) -> dict:
 
 # This causes an error with Gemini
 agent = Agent(
-    model=LitellmModel("gemini/gemini-1.5-flash"),
+    model=LitellmModel("gemini/gemini-2.5-flash"),
     tools=[get_weather],
     output_type=WeatherReport,  # Error: can't use both!
 )
@@ -40,14 +40,16 @@ GeminiException BadRequestError - Function calling with a response mime type
 
 ## The Solution
 
-Enable prompt injection by setting `enable_structured_output_with_tools=True` on your agent:
+Enable prompt injection by setting `enable_structured_output_with_tools=True` on the `LitellmModel`:
 
 ```python
 agent = Agent(
-    model=LitellmModel("gemini/gemini-1.5-flash"),
+    model=LitellmModel(
+        "gemini/gemini-2.5-flash",
+        enable_structured_output_with_tools=True,  # ← Enables the workaround
+    ),
     tools=[get_weather],
     output_type=WeatherReport,
-    enable_structured_output_with_tools=True,  # ← Enables the workaround
 )
 ```
 
@@ -90,10 +92,12 @@ async def main():
     agent = Agent(
         name="WeatherBot",
         instructions="Use the get_weather tool, then provide a structured report.",
-        model=LitellmModel("gemini/gemini-1.5-flash"),
+        model=LitellmModel(
+            "gemini/gemini-2.5-flash",
+            enable_structured_output_with_tools=True,  # Required for Gemini
+        ),
         tools=[get_weather],
         output_type=WeatherReport,
-        enable_structured_output_with_tools=True,  # Required for Gemini
     )
 
     result = await Runner.run(agent, "What's the weather in Tokyo?")

src/agents/agent.py

Lines changed: 0 additions & 16 deletions
@@ -231,16 +231,6 @@ class Agent(AgentBase, Generic[TContext]):
     """Whether to reset the tool choice to the default value after a tool has been called. Defaults
     to True. This ensures that the agent doesn't enter an infinite loop of tool usage."""
 
-    enable_structured_output_with_tools: bool = False
-    """Enable structured outputs when using tools on models that don't natively support both
-    simultaneously (e.g., Gemini). When enabled, injects JSON formatting instructions into the
-    system prompt as a workaround instead of using the native API. Defaults to False (use native
-    API support when available).
-
-    Set to True when using models that don't support both features natively (e.g., Gemini via
-    LiteLLM).
-    """
-
     def __post_init__(self):
         from typing import get_origin
 
@@ -374,12 +364,6 @@ def __post_init__(self):
                 f"got {type(self.reset_tool_choice).__name__}"
             )
 
-        if not isinstance(self.enable_structured_output_with_tools, bool):
-            raise TypeError(
-                f"Agent enable_structured_output_with_tools must be a boolean, "
-                f"got {type(self.enable_structured_output_with_tools).__name__}"
-            )
-
     def clone(self, **kwargs: Any) -> Agent[TContext]:
         """Make a copy of the agent, with the given arguments changed.
         Notes:

src/agents/extensions/models/litellm_model.py

Lines changed: 5 additions & 10 deletions
@@ -74,10 +74,12 @@ def __init__(
         model: str,
         base_url: str | None = None,
         api_key: str | None = None,
+        enable_structured_output_with_tools: bool = False,
     ):
         self.model = model
         self.base_url = base_url
         self.api_key = api_key
+        self.enable_structured_output_with_tools = enable_structured_output_with_tools
 
     async def get_response(
         self,
@@ -89,9 +91,8 @@ async def get_response(
         handoffs: list[Handoff],
         tracing: ModelTracing,
         previous_response_id: str | None = None,  # unused
-        conversation_id: str | None = None,  # unused
+        conversation_id: str | None = None,
         prompt: Any | None = None,
-        enable_structured_output_with_tools: bool = False,
     ) -> ModelResponse:
         with generation_span(
             model=str(self.model),
@@ -110,7 +111,6 @@
                 tracing,
                 stream=False,
                 prompt=prompt,
-                enable_structured_output_with_tools=enable_structured_output_with_tools,
             )
 
             message: litellm.types.utils.Message | None = None
@@ -195,9 +195,8 @@ async def stream_response(
         handoffs: list[Handoff],
         tracing: ModelTracing,
         previous_response_id: str | None = None,  # unused
-        conversation_id: str | None = None,  # unused
+        conversation_id: str | None = None,
        prompt: Any | None = None,
-        enable_structured_output_with_tools: bool = False,
     ) -> AsyncIterator[TResponseStreamEvent]:
         with generation_span(
             model=str(self.model),
@@ -216,7 +215,6 @@
                 tracing,
                 stream=True,
                 prompt=prompt,
-                enable_structured_output_with_tools=enable_structured_output_with_tools,
             )
 
             final_response: Response | None = None
@@ -248,7 +246,6 @@ async def _fetch_response(
         tracing: ModelTracing,
         stream: Literal[True],
         prompt: Any | None = None,
-        enable_structured_output_with_tools: bool = False,
    ) -> tuple[Response, AsyncStream[ChatCompletionChunk]]: ...
 
     @overload
@@ -264,7 +261,6 @@ async def _fetch_response(
         tracing: ModelTracing,
         stream: Literal[False],
         prompt: Any | None = None,
-        enable_structured_output_with_tools: bool = False,
     ) -> litellm.types.utils.ModelResponse: ...
 
     async def _fetch_response(
@@ -279,7 +275,6 @@
         tracing: ModelTracing,
         stream: bool = False,
         prompt: Any | None = None,
-        enable_structured_output_with_tools: bool = False,
     ) -> litellm.types.utils.ModelResponse | tuple[Response, AsyncStream[ChatCompletionChunk]]:
         # Preserve reasoning messages for tool calls when reasoning is on
         # This is needed for models like Claude 4 Sonnet/Opus which support interleaved thinking
@@ -298,7 +293,7 @@
         # Check if we need to inject JSON output prompt for models that don't support
         # tools + structured output simultaneously (like Gemini)
         inject_json_prompt = should_inject_json_prompt(
-            output_schema, tools, enable_structured_output_with_tools
+            output_schema, tools, self.enable_structured_output_with_tools
         )
         if inject_json_prompt and output_schema:
            json_prompt = get_json_output_prompt(output_schema)
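
The last hunk above only threads the stored flag into `should_inject_json_prompt`; the helper's body is not part of this commit. Its gating presumably reduces to a three-way conjunction, roughly as follows (a sketch under that assumption, with the signature inferred from the call site):

```python
from typing import Any


def should_inject_json_prompt(
    output_schema: Any | None,
    tools: list[Any],
    enable_structured_output_with_tools: bool,
) -> bool:
    """Sketch only: the real implementation lives elsewhere in the repo."""
    # Inject only when the workaround is opted in AND the request actually
    # combines a structured output schema with tool definitions.
    return (
        enable_structured_output_with_tools
        and output_schema is not None
        and len(tools) > 0
    )
```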

src/agents/models/interface.py

Lines changed: 0 additions & 8 deletions
@@ -50,7 +50,6 @@ async def get_response(
         previous_response_id: str | None,
         conversation_id: str | None,
         prompt: ResponsePromptParam | None,
-        enable_structured_output_with_tools: bool = False,
     ) -> ModelResponse:
         """Get a response from the model.
 
@@ -66,9 +65,6 @@
                 except for the OpenAI Responses API.
             conversation_id: The ID of the stored conversation, if any.
             prompt: The prompt config to use for the model.
-            enable_structured_output_with_tools: Whether to inject JSON formatting instructions
-                into the system prompt when using structured outputs with tools. Required for
-                models that don't support both features natively (like Gemini).
 
         Returns:
             The full model response.
@@ -89,7 +85,6 @@ def stream_response(
         previous_response_id: str | None,
         conversation_id: str | None,
         prompt: ResponsePromptParam | None,
-        enable_structured_output_with_tools: bool = False,
     ) -> AsyncIterator[TResponseStreamEvent]:
         """Stream a response from the model.
 
@@ -105,9 +100,6 @@
                 except for the OpenAI Responses API.
             conversation_id: The ID of the stored conversation, if any.
             prompt: The prompt config to use for the model.
-            enable_structured_output_with_tools: Whether to inject JSON formatting instructions
-                into the system prompt when using structured outputs with tools. Required for
-                models that don't support both features natively (like Gemini).
 
         Returns:
             An iterator of response stream events, in OpenAI Responses format.

src/agents/models/openai_chatcompletions.py

Lines changed: 0 additions & 11 deletions
@@ -59,7 +59,6 @@ async def get_response(
         previous_response_id: str | None = None,  # unused
         conversation_id: str | None = None,  # unused
         prompt: ResponsePromptParam | None = None,
-        enable_structured_output_with_tools: bool = False,
     ) -> ModelResponse:
         with generation_span(
             model=str(self.model),
@@ -77,7 +76,6 @@
                 tracing,
                 stream=False,
                 prompt=prompt,
-                enable_structured_output_with_tools=enable_structured_output_with_tools,
             )
 
             message: ChatCompletionMessage | None = None
@@ -149,7 +147,6 @@ async def stream_response(
         previous_response_id: str | None = None,  # unused
         conversation_id: str | None = None,  # unused
         prompt: ResponsePromptParam | None = None,
-        enable_structured_output_with_tools: bool = False,
     ) -> AsyncIterator[TResponseStreamEvent]:
         """
         Yields a partial message as it is generated, as well as the usage information.
@@ -170,7 +167,6 @@
                 tracing,
                 stream=True,
                 prompt=prompt,
-                enable_structured_output_with_tools=enable_structured_output_with_tools,
             )
 
             final_response: Response | None = None
@@ -202,7 +198,6 @@ async def _fetch_response(
         tracing: ModelTracing,
         stream: Literal[True],
         prompt: ResponsePromptParam | None = None,
-        enable_structured_output_with_tools: bool = False,
     ) -> tuple[Response, AsyncStream[ChatCompletionChunk]]: ...
 
     @overload
@@ -218,7 +213,6 @@ async def _fetch_response(
         tracing: ModelTracing,
         stream: Literal[False],
         prompt: ResponsePromptParam | None = None,
-        enable_structured_output_with_tools: bool = False,
     ) -> ChatCompletion: ...
 
     async def _fetch_response(
@@ -233,12 +227,7 @@
         tracing: ModelTracing,
         stream: bool = False,
         prompt: ResponsePromptParam | None = None,
-        enable_structured_output_with_tools: bool = False,
     ) -> ChatCompletion | tuple[Response, AsyncStream[ChatCompletionChunk]]:
-        # Note: enable_structured_output_with_tools parameter is accepted for interface consistency
-        # but not used for OpenAI models since they have native support for
-        # tools + structured outputs simultaneously
-
         converted_messages = Converter.items_to_messages(input)
 
         if system_instructions:

src/agents/models/openai_responses.py

Lines changed: 0 additions & 2 deletions
@@ -84,7 +84,6 @@ async def get_response(
         previous_response_id: str | None = None,
         conversation_id: str | None = None,
         prompt: ResponsePromptParam | None = None,
-        enable_structured_output_with_tools: bool = False,
     ) -> ModelResponse:
         with response_span(disabled=tracing.is_disabled()) as span_response:
             try:
@@ -162,7 +161,6 @@ async def stream_response(
         previous_response_id: str | None = None,
         conversation_id: str | None = None,
         prompt: ResponsePromptParam | None = None,
-        enable_structured_output_with_tools: bool = False,
     ) -> AsyncIterator[ResponseStreamEvent]:
         """
         Yields a partial message as it is generated, as well as the usage information.

src/agents/run.py

Lines changed: 6 additions & 26 deletions
@@ -1287,18 +1287,6 @@ async def _run_single_turn_streamed(
         )
 
         # 1. Stream the output events
-        # Build kwargs for model call
-        model_kwargs: dict[str, Any] = {
-            "previous_response_id": previous_response_id,
-            "conversation_id": conversation_id,
-            "prompt": prompt_config,
-        }
-
-        # Only pass enable_structured_output_with_tools when enabled
-        # to maintain backward compatibility with third-party Model implementations
-        if agent.enable_structured_output_with_tools:
-            model_kwargs["enable_structured_output_with_tools"] = True
-
         async for event in model.stream_response(
             filtered.instructions,
             filtered.input,
@@ -1309,7 +1297,9 @@
             get_model_tracing_impl(
                 run_config.tracing_disabled, run_config.trace_include_sensitive_data
             ),
-            **model_kwargs,
+            previous_response_id=previous_response_id,
+            conversation_id=conversation_id,
+            prompt=prompt_config,
         ):
             # Emit the raw event ASAP
             streamed_result._event_queue.put_nowait(RawResponsesStreamEvent(data=event))
@@ -1732,18 +1722,6 @@ async def _get_new_response(
         server_conversation_tracker.conversation_id if server_conversation_tracker else None
     )
 
-    # Build kwargs for model call
-    model_kwargs: dict[str, Any] = {
-        "previous_response_id": previous_response_id,
-        "conversation_id": conversation_id,
-        "prompt": prompt_config,
-    }
-
-    # Only pass enable_structured_output_with_tools when enabled
-    # to maintain backward compatibility with third-party Model implementations
-    if agent.enable_structured_output_with_tools:
-        model_kwargs["enable_structured_output_with_tools"] = True
-
     new_response = await model.get_response(
         system_instructions=filtered.instructions,
         input=filtered.input,
@@ -1754,7 +1732,9 @@
         tracing=get_model_tracing_impl(
             run_config.tracing_disabled, run_config.trace_include_sensitive_data
         ),
-        **model_kwargs,
+        previous_response_id=previous_response_id,
+        conversation_id=conversation_id,
+        prompt=prompt_config,
    )
 
     context_wrapper.usage.add(new_response.usage)
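
The deleted `model_kwargs` dance existed because unconditionally forwarding the keyword would break third-party `Model` implementations that never declared it. A self-contained toy illustration of that failure mode (these classes are stand-ins, not the SDK's):

```python
import asyncio


class LegacyThirdPartyModel:
    """Toy stand-in for a third-party Model written before the flag existed."""

    async def get_response(self, *, prompt=None):  # accepts no extra keyword
        return "ok"


async def main() -> None:
    model = LegacyThirdPartyModel()

    # The removed Runner code only added the keyword when it was True:
    kwargs = {"prompt": None}
    enable_structured_output_with_tools = False  # toy local flag
    if enable_structured_output_with_tools:
        kwargs["enable_structured_output_with_tools"] = True

    print(await model.get_response(**kwargs))  # -> ok

    # Forwarding the keyword unconditionally would instead raise:
    #   TypeError: get_response() got an unexpected keyword argument
    # Configuring the flag on LitellmModel removes the need for this dance.


asyncio.run(main())
```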

tests/fake_model.py

Lines changed: 0 additions & 2 deletions
@@ -90,7 +90,6 @@ async def get_response(
         previous_response_id: str | None,
         conversation_id: str | None,
         prompt: Any | None,
-        enable_structured_output_with_tools: bool = False,
     ) -> ModelResponse:
         turn_args = {
             "system_instructions": system_instructions,
@@ -141,7 +140,6 @@
         previous_response_id: str | None = None,
         conversation_id: str | None = None,
         prompt: Any | None = None,
-        enable_structured_output_with_tools: bool = False,
     ) -> AsyncIterator[TResponseStreamEvent]:
         turn_args = {
             "system_instructions": system_instructions,
