|
2 | 2 |
|
3 | 3 | from typing import Any |
4 | 4 |
|
| 5 | +from pydantic import AnyUrl, ValidationError |
| 6 | + |
| 7 | +from models.responses import ReferencedDocument |
| 8 | + |
5 | 9 |
|
6 | 10 | def extract_text_from_response_output_item(output_item: Any) -> str: |
7 | 11 | """Extract assistant message text from a Responses API output item. |
@@ -54,3 +58,177 @@ def extract_text_from_response_output_item(output_item: Any) -> str: |
54 | 58 | text_fragments.append(str(dict_text)) |
55 | 59 |
|
56 | 60 | return "".join(text_fragments) |
| 61 | + |
| 62 | + |
| 63 | +def _parse_file_search_result( |
| 64 | + result: Any, |
| 65 | +) -> tuple[str | None, str | None]: |
| 66 | + """ |
| 67 | + Extract filename and URL from a file search result. |
| 68 | +
|
| 69 | + Args: |
| 70 | + result: A file search result (dict or object) |
| 71 | +
|
| 72 | + Returns: |
| 73 | + tuple[str | None, str | None]: (doc_url, filename) tuple |
| 74 | + """ |
| 75 | + # Handle both object and dict access |
| 76 | + if isinstance(result, dict): |
| 77 | + filename = result.get("filename") |
| 78 | + attributes = result.get("attributes", {}) |
| 79 | + else: |
| 80 | + filename = getattr(result, "filename", None) |
| 81 | + attributes = getattr(result, "attributes", {}) or {} |
| 82 | + |
| 83 | + # Try to get URL from attributes - look for common URL fields |
| 84 | + doc_url = ( |
| 85 | + attributes.get("link") or attributes.get("url") or attributes.get("doc_url") |
| 86 | + ) |
| 87 | + # Treat empty string as None for URL to satisfy AnyUrl | None |
| 88 | + final_url = doc_url if doc_url else None |
| 89 | + return (final_url, filename) |
| 90 | + |
| 91 | + |
| 92 | +def _parse_annotation( |
| 93 | + annotation: Any, |
| 94 | +) -> tuple[str | None, str | None, str | None]: |
| 95 | + """ |
| 96 | + Extract type, URL, and title from an annotation. |
| 97 | +
|
| 98 | + Args: |
| 99 | + annotation: An annotation (dict or object) |
| 100 | +
|
| 101 | + Returns: |
| 102 | + tuple[str | None, str | None, str | None]: (type, url, title) tuple |
| 103 | + """ |
| 104 | + # Handle both object and dict access for annotations |
| 105 | + if isinstance(annotation, dict): |
| 106 | + anno_type = annotation.get("type") |
| 107 | + anno_url = annotation.get("url") |
| 108 | + anno_title = annotation.get("title") or annotation.get("filename") |
| 109 | + else: |
| 110 | + anno_type = getattr(annotation, "type", None) |
| 111 | + anno_url = getattr(annotation, "url", None) |
| 112 | + anno_title = getattr(annotation, "title", None) or getattr( |
| 113 | + annotation, "filename", None |
| 114 | + ) |
| 115 | + return (anno_type, anno_url, anno_title) |
| 116 | + |
| 117 | + |
def _add_document_if_unique(
    documents: list[ReferencedDocument],
    seen_docs: set[tuple[str | None, str | None]],
    doc_url: str | None,
    doc_title: str | None,
) -> None:
    """
    Append a document to ``documents`` unless its (url, title) pair was seen.

    A document whose non-empty URL fails ``AnyUrl`` validation is skipped and
    is not recorded in ``seen_docs``.

    Args:
        documents: List of documents to append to
        seen_docs: Set of seen (url, title) tuples
        doc_url: Document URL string (may be None)
        doc_title: Document title (may be None)
    """
    key = (doc_url, doc_title)
    if key in seen_docs:
        return

    # None or an empty string is stored as None; anything else must validate.
    url: AnyUrl | None = None
    if doc_url:
        try:
            url = AnyUrl(doc_url)  # type: ignore[arg-type]
        except ValidationError:
            # Skip documents with invalid URLs
            return

    documents.append(ReferencedDocument(doc_url=url, doc_title=doc_title))
    seen_docs.add(key)
| 144 | + |
| 145 | + |
def _parse_file_search_output(
    output_item: Any,
    documents: list[ReferencedDocument],
    seen_docs: set[tuple[str | None, str | None]],
) -> None:
    """
    Collect referenced documents from the results of a file search output item.

    Results that yield neither a filename nor a URL are ignored.

    Args:
        output_item: Output item of type "file_search_call"
        documents: List to append found documents to
        seen_docs: Set of seen (url, title) tuples
    """
    # A missing or None ``results`` attribute is treated as no results.
    for entry in getattr(output_item, "results", []) or []:
        url, name = _parse_file_search_result(entry)
        if not name and not url:
            continue
        _add_document_if_unique(documents, seen_docs, url, name)
| 165 | + |
| 166 | + |
def _parse_message_annotations(
    output_item: Any,
    documents: list[ReferencedDocument],
    seen_docs: set[tuple[str | None, str | None]],
) -> None:
    """
    Collect referenced documents from the annotations of a message output item.

    Only "url_citation" and "file_citation" annotations are considered; file
    citations are recorded without a URL.

    Args:
        output_item: Output item of type "message"
        documents: List to append found documents to
        seen_docs: Set of seen (url, title) tuples
    """
    parts = getattr(output_item, "content", None)
    if not isinstance(parts, list):
        return

    for part in parts:
        # Plain string parts carry no annotations.
        if isinstance(part, str):
            continue

        for annotation in getattr(part, "annotations", []) or []:
            kind, url, title = _parse_annotation(annotation)

            if kind == "url_citation":
                # Treat empty string as None
                _add_document_if_unique(documents, seen_docs, url or None, title)
            elif kind == "file_citation":
                _add_document_if_unique(documents, seen_docs, None, title)
| 199 | + |
| 200 | + |
def parse_referenced_documents_from_responses_api(
    response: Any,
) -> list[ReferencedDocument]:
    """
    Collect the documents referenced by an OpenAI Responses API response.

    Two kinds of output item contribute references:
    1. "file_search_call" items - their file search results
    2. "message" items - citation annotations on the message content

    Output items of any other type are ignored.

    Args:
        response: The OpenAI Response API response object (OpenAIResponseObject)

    Returns:
        list[ReferencedDocument]: Referenced documents, unique by
        (doc_url, doc_title), in the order they were first encountered
    """
    found: list[ReferencedDocument] = []
    # (doc_url, doc_title) pairs already emitted, used for de-duplication.
    seen: set[tuple[str | None, str | None]] = set()

    if response.output:
        for item in response.output:
            kind = getattr(item, "type", None)

            if kind == "file_search_call":
                _parse_file_search_output(item, found, seen)
            elif kind == "message":
                _parse_message_annotations(item, found, seen)

    return found
0 commit comments