22import io
33import json
44import logging
5- import operator
65import os
76import tempfile
87from collections .abc import Mapping , Sequence
1211import pandas as pd
1312import pypdfium2 # type: ignore
1413import yaml # type: ignore
14+ from docx .document import Document
15+ from docx .oxml .table import CT_Tbl
16+ from docx .oxml .text .paragraph import CT_P
1517from docx .table import Table
1618from docx .text .paragraph import Paragraph
1719
@@ -231,6 +233,13 @@ def _extract_text_from_doc(file_content: bytes) -> str:
231233 raise TextExtractionError (f"Failed to extract text from DOC: { str (e )} " ) from e
232234
233235
def paser_docx_part(block, doc: Document, content_items, i):
    """
    Wrap one document body element and record it in ``content_items``.

    Args:
        block: A body element; only ``CT_P`` (paragraph) and ``CT_Tbl``
            (table) instances are handled.
        doc: The document passed as the second argument when constructing
            the ``Paragraph`` / ``Table`` wrapper.
        content_items: List that receives an ``(i, kind, wrapper)`` tuple,
            where ``kind`` is ``"paragraph"`` or ``"table"``. Mutated in place.
        i: Position index stored as the first element of the tuple.

    Elements of any other type are ignored and nothing is appended.
    """
    if isinstance(block, CT_P):
        entry = (i, "paragraph", Paragraph(block, doc))
    elif isinstance(block, CT_Tbl):
        entry = (i, "table", Table(block, doc))
    else:
        # Neither a paragraph nor a table: leave content_items untouched.
        return
    content_items.append(entry)
241+
242+
234243def _extract_text_from_docx (file_content : bytes ) -> str :
235244 """
236245 Extract text from a DOCX file.
@@ -244,16 +253,13 @@ def _extract_text_from_docx(file_content: bytes) -> str:
244253 # Keep track of paragraph and table positions
245254 content_items : list [tuple [int , str , Table | Paragraph ]] = []
246255
247- # Process paragraphs and tables
248- for i , paragraph in enumerate (doc .paragraphs ):
249- if paragraph .text .strip ():
250- content_items .append ((i , "paragraph" , paragraph ))
251-
252- for i , table in enumerate (doc .tables ):
253- content_items .append ((i , "table" , table ))
254-
255- # Sort content items based on their original position
256- content_items .sort (key = operator .itemgetter (0 ))
256+ it = iter (doc .element .body )
257+ part = next (it , None )
258+ i = 0
259+ while part is not None :
260+ paser_docx_part (part , doc , content_items , i )
261+ i = i + 1
262+ part = next (it , None )
257263
258264 # Process sorted content
259265 for _ , item_type , item in content_items :
0 commit comments