Skip to content

Commit e45a1f4

Browse files
committed
[Responses API] Implement parse_referenced_documents_from_responses_api
Implements the function parse_referenced_documents_from_responses_api, which inspects the Responses API output for: - file_search_call objects (filename and attributes) - annotations within message content (type, url, title) - two types of annotations, url_citation and file_citation
1 parent 6cfaeab commit e45a1f4

File tree

3 files changed

+280
-24
lines changed

3 files changed

+280
-24
lines changed

src/app/endpoints/query_v2.py

Lines changed: 6 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Handler for REST API call to provide answer to query using Response API."""
22

3+
import json
34
import logging
45
from typing import Annotated, Any, cast
56

@@ -38,7 +39,10 @@
3839
get_topic_summary_system_prompt,
3940
)
4041
from utils.mcp_headers import mcp_headers_dependency
41-
from utils.responses import extract_text_from_response_output_item
42+
from utils.responses import (
43+
extract_text_from_response_output_item,
44+
parse_referenced_documents_from_responses_api,
45+
)
4246
from utils.shields import detect_shield_violations, get_available_shields
4347
from utils.token_counter import TokenCounter
4448
from utils.types import ToolCallSummary, TurnSummary
@@ -132,7 +136,7 @@ def _build_tool_call_summary( # pylint: disable=too-many-return-statements,too-
132136
id=str(getattr(output_item, "id")),
133137
name=DEFAULT_RAG_TOOL,
134138
args=args,
135-
response=response_payload,
139+
response=json.dumps(response_payload) if response_payload else None,
136140
)
137141

138142
if item_type == "web_search_call":
@@ -394,27 +398,6 @@ async def retrieve_response( # pylint: disable=too-many-locals,too-many-branche
394398
return (summary, conversation_id, referenced_documents, token_usage)
395399

396400

397-
def parse_referenced_documents_from_responses_api(
398-
response: OpenAIResponseObject, # pylint: disable=unused-argument
399-
) -> list[ReferencedDocument]:
400-
"""
401-
Parse referenced documents from OpenAI Responses API response.
402-
403-
Args:
404-
response: The OpenAI Response API response object
405-
406-
Returns:
407-
list[ReferencedDocument]: List of referenced documents with doc_url and doc_title
408-
"""
409-
# TODO(ltomasbo): need to parse source documents from Responses API response.
410-
# The Responses API has a different structure than Agent API for referenced documents.
411-
# Need to extract from:
412-
# - OpenAIResponseOutputMessageFileSearchToolCall.results
413-
# - OpenAIResponseAnnotationCitation in message content
414-
# - OpenAIResponseAnnotationFileCitation in message content
415-
return []
416-
417-
418401
def extract_token_usage_from_responses_api(
419402
response: OpenAIResponseObject,
420403
model: str,

src/utils/responses.py

Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22

33
from typing import Any
44

5+
from pydantic import AnyUrl, ValidationError
6+
7+
from models.responses import ReferencedDocument
8+
59

610
def extract_text_from_response_output_item(output_item: Any) -> str:
711
"""Extract assistant message text from a Responses API output item.
@@ -54,3 +58,177 @@ def extract_text_from_response_output_item(output_item: Any) -> str:
5458
text_fragments.append(str(dict_text))
5559

5660
return "".join(text_fragments)
61+
62+
63+
def _parse_file_search_result(
64+
result: Any,
65+
) -> tuple[str | None, str | None]:
66+
"""
67+
Extract filename and URL from a file search result.
68+
69+
Args:
70+
result: A file search result (dict or object)
71+
72+
Returns:
73+
tuple[str | None, str | None]: (doc_url, filename) tuple
74+
"""
75+
# Handle both object and dict access
76+
if isinstance(result, dict):
77+
filename = result.get("filename")
78+
attributes = result.get("attributes", {})
79+
else:
80+
filename = getattr(result, "filename", None)
81+
attributes = getattr(result, "attributes", {}) or {}
82+
83+
# Try to get URL from attributes - look for common URL fields
84+
doc_url = (
85+
attributes.get("link") or attributes.get("url") or attributes.get("doc_url")
86+
)
87+
# Treat empty string as None for URL to satisfy AnyUrl | None
88+
final_url = doc_url if doc_url else None
89+
return (final_url, filename)
90+
91+
92+
def _parse_annotation(
93+
annotation: Any,
94+
) -> tuple[str | None, str | None, str | None]:
95+
"""
96+
Extract type, URL, and title from an annotation.
97+
98+
Args:
99+
annotation: An annotation (dict or object)
100+
101+
Returns:
102+
tuple[str | None, str | None, str | None]: (type, url, title) tuple
103+
"""
104+
# Handle both object and dict access for annotations
105+
if isinstance(annotation, dict):
106+
anno_type = annotation.get("type")
107+
anno_url = annotation.get("url")
108+
anno_title = annotation.get("title") or annotation.get("filename")
109+
else:
110+
anno_type = getattr(annotation, "type", None)
111+
anno_url = getattr(annotation, "url", None)
112+
anno_title = getattr(annotation, "title", None) or getattr(
113+
annotation, "filename", None
114+
)
115+
return (anno_type, anno_url, anno_title)
116+
117+
118+
def _add_document_if_unique(
    documents: list[ReferencedDocument],
    seen_docs: set[tuple[str | None, str | None]],
    doc_url: str | None,
    doc_title: str | None,
) -> None:
    """
    Append a document unless its (url, title) pair was already recorded.

    Documents whose non-empty URL fails ``AnyUrl`` validation are dropped
    and are not recorded as seen.

    Args:
        documents: List of documents to append to
        seen_docs: Set of seen (url, title) tuples
        doc_url: Document URL string (may be None)
        doc_title: Document title (may be None)
    """
    key = (doc_url, doc_title)
    if key in seen_docs:
        return

    # A missing/empty URL is stored as None; anything else must validate.
    url_value: AnyUrl | None = None
    if doc_url:
        try:
            url_value = AnyUrl(doc_url)  # type: ignore[arg-type]
        except ValidationError:
            # Skip documents with invalid URLs
            return

    documents.append(ReferencedDocument(doc_url=url_value, doc_title=doc_title))
    seen_docs.add(key)
144+
145+
146+
def _parse_file_search_output(
    output_item: Any,
    documents: list[ReferencedDocument],
    seen_docs: set[tuple[str | None, str | None]],
) -> None:
    """
    Collect referenced documents from a file search output item.

    Each entry in the item's ``results`` is reduced to a (url, filename)
    pair; entries with neither a URL nor a filename are ignored.

    Args:
        output_item: Output item of type "file_search_call"
        documents: List to append found documents to
        seen_docs: Set of seen (url, title) tuples
    """
    # A missing or None "results" attribute is treated as no results.
    for entry in getattr(output_item, "results", []) or []:
        url, name = _parse_file_search_result(entry)
        if not (name or url):
            continue
        _add_document_if_unique(documents, seen_docs, url, name)
165+
166+
167+
def _parse_message_annotations(
    output_item: Any,
    documents: list[ReferencedDocument],
    seen_docs: set[tuple[str | None, str | None]],
) -> None:
    """
    Parse citation annotations from a message output item.

    Only ``url_citation`` and ``file_citation`` annotations are considered.
    URL citations contribute their URL and title; file citations contribute
    only a title (the annotation's title or filename) with no URL.
    Annotations that yield neither a URL nor a title are skipped, matching
    the filtering applied to file search results.

    Args:
        output_item: Output item of type "message"
        documents: List to append found documents to
        seen_docs: Set of seen (url, title) tuples
    """
    content = getattr(output_item, "content", None)
    if not isinstance(content, list):
        return

    for part in content:
        # Plain-string parts carry no annotations
        if isinstance(part, str):
            continue

        # Content parts may be dicts or attribute-style objects, just like
        # the annotations they contain.
        if isinstance(part, dict):
            annotations = part.get("annotations") or []
        else:
            annotations = getattr(part, "annotations", []) or []

        for annotation in annotations:
            anno_type, anno_url, anno_title = _parse_annotation(annotation)

            if anno_type == "url_citation":
                # Treat empty string as None
                doc_url = anno_url or None
            elif anno_type == "file_citation":
                # File citations never carry a URL
                doc_url = None
            else:
                continue

            # Do not emit an empty document when the annotation has neither
            # a usable URL nor a title/filename.
            if doc_url or anno_title:
                _add_document_if_unique(documents, seen_docs, doc_url, anno_title)
199+
200+
201+
def parse_referenced_documents_from_responses_api(
    response: Any,
) -> list[ReferencedDocument]:
    """
    Parse referenced documents from OpenAI Responses API response.

    This function extracts document references from two sources:
    1. file_search_call results - Documents retrieved via RAG/file search
    2. message content annotations - Citation annotations in assistant messages

    Output items of any other type are ignored. A response that is None,
    has no ``output`` attribute, or has an empty/None ``output`` yields an
    empty list instead of raising.

    Args:
        response: The OpenAI Response API response object (OpenAIResponseObject)

    Returns:
        list[ReferencedDocument]: List of unique referenced documents with doc_url and doc_title
    """
    documents: list[ReferencedDocument] = []
    # Use a set to track unique documents by (doc_url, doc_title) tuple
    seen_docs: set[tuple[str | None, str | None]] = set()

    # Read "output" defensively: the parameter is typed Any, so tolerate a
    # missing attribute or a None value rather than raising AttributeError.
    output_items = getattr(response, "output", None) or []

    for output_item in output_items:
        item_type = getattr(output_item, "type", None)

        # 1. Parse from file_search_call results
        if item_type == "file_search_call":
            _parse_file_search_output(output_item, documents, seen_docs)
        # 2. Parse from message content annotations
        elif item_type == "message":
            _parse_message_annotations(output_item, documents, seen_docs)

    return documents

tests/unit/app/endpoints/test_query_v2.py

Lines changed: 96 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,10 +198,15 @@ async def test_retrieve_response_parses_output_and_tool_calls(
198198
mock_client = mocker.Mock()
199199

200200
# Build output with content variants and tool calls
201+
part1 = mocker.Mock(text="Hello ")
202+
part1.annotations = [] # Ensure annotations is a list to avoid iteration error
203+
part2 = mocker.Mock(text="world")
204+
part2.annotations = []
205+
201206
output_item_1 = mocker.Mock()
202207
output_item_1.type = "message"
203208
output_item_1.role = "assistant"
204-
output_item_1.content = [mocker.Mock(text="Hello "), mocker.Mock(text="world")]
209+
output_item_1.content = [part1, part2]
205210

206211
output_item_2 = mocker.Mock()
207212
output_item_2.type = "message"
@@ -710,3 +715,93 @@ async def test_retrieve_response_no_violation_with_shields(
710715

711716
# Verify that the validation error metric was NOT incremented
712717
validation_metric.inc.assert_not_called()
718+
719+
720+
def _create_message_output_with_annotations(mocker: MockerFixture) -> Any:
    """Build a mocked assistant message carrying one URL and one file citation."""
    url_citation = mocker.Mock(
        type="url_citation",
        url="http://example.com/doc1",
        title="Doc 1",
    )
    # File citations expose a filename only; url/title are explicitly None so
    # the mock does not auto-create truthy child mocks for them.
    file_citation = mocker.Mock(
        type="file_citation",
        filename="file1.txt",
        url=None,
        title=None,
    )

    text_part = mocker.Mock(
        type="output_text",
        text="Here is a citation.",
        annotations=[url_citation, file_citation],
    )

    return mocker.Mock(type="message", role="assistant", content=[text_part])
744+
745+
746+
@pytest.mark.asyncio
async def test_retrieve_response_parses_referenced_documents(
    mocker: MockerFixture,
) -> None:
    """Check that retrieve_response returns documents from citations and file search."""
    # 1. Output item with message content annotations (citations)
    message_item = _create_message_output_with_annotations(mocker)

    # 2. Output item with file search tool call results.
    # queries must be a real list so the tool summary code can iterate it.
    file_search_item = mocker.Mock(
        type="file_search_call",
        queries=[],
        status="completed",
        results=[
            {"filename": "file2.pdf", "attributes": {"url": "http://example.com/doc2"}},
            {"filename": "file3.docx", "attributes": {}},  # No URL
        ],
    )

    response_obj = mocker.Mock(
        id="resp-docs",
        output=[message_item, file_search_item],
        usage=None,
    )

    mock_client = mocker.Mock()
    mock_client.responses.create = mocker.AsyncMock(return_value=response_obj)
    mock_client.vector_stores.list = mocker.AsyncMock(
        return_value=mocker.Mock(data=[])
    )
    mock_client.shields.list = mocker.AsyncMock(return_value=[])

    mocker.patch("app.endpoints.query_v2.get_system_prompt", return_value="PROMPT")
    mocker.patch("app.endpoints.query_v2.configuration", mocker.Mock(mcp_servers=[]))

    qr = QueryRequest(query="query with docs")
    _summary, _conv_id, referenced_docs, _token_usage = await retrieve_response(
        mock_client, "model-docs", qr, token="tkn", provider_id="test-provider"
    )

    assert len(referenced_docs) == 4

    # Expected URL per document title; None means the document has no URL.
    expected_urls = {
        "Doc 1": "http://example.com/doc1",  # URL citation
        "file1.txt": None,  # File citation
        "file2.pdf": "http://example.com/doc2",  # File search result with URL
        "file3.docx": None,  # File search result without URL
    }
    for title, expected_url in expected_urls.items():
        doc = next((d for d in referenced_docs if d.doc_title == title), None)
        assert doc
        if expected_url is None:
            assert doc.doc_url is None
        else:
            assert str(doc.doc_url) == expected_url

0 commit comments

Comments
 (0)