Skip to content

Commit 1789437

Browse files
authored
1 parent ac80c04 commit 1789437

File tree

2 files changed

+24
-12
lines changed

2 files changed

+24
-12
lines changed

api/core/workflow/nodes/document_extractor/node.py

Lines changed: 17 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,6 @@
22
import io
33
import json
44
import logging
5-
import operator
65
import os
76
import tempfile
87
from collections.abc import Mapping, Sequence
@@ -12,6 +11,9 @@
1211
import pandas as pd
1312
import pypdfium2 # type: ignore
1413
import yaml # type: ignore
14+
from docx.document import Document
15+
from docx.oxml.table import CT_Tbl
16+
from docx.oxml.text.paragraph import CT_P
1517
from docx.table import Table
1618
from docx.text.paragraph import Paragraph
1719

@@ -231,6 +233,13 @@ def _extract_text_from_doc(file_content: bytes) -> str:
231233
raise TextExtractionError(f"Failed to extract text from DOC: {str(e)}") from e
232234

233235

236+
def paser_docx_part(block, doc: Document, content_items, i):
    """Wrap one document-body element and record it in ``content_items``.

    A ``CT_P`` element is wrapped in ``Paragraph`` and a ``CT_Tbl`` element is
    wrapped in ``Table``. The wrapper is appended to ``content_items`` as the
    tuple ``(i, kind, wrapper)``, where ``kind`` is ``"paragraph"`` or
    ``"table"``. Any other element type is ignored and nothing is appended.

    Args:
        block: The element to classify.
        doc: Passed as the second argument to the ``Paragraph`` / ``Table``
            constructors.
        content_items: List that receives the new tuple; mutated in place.
        i: Index stored as the first member of the appended tuple.
    """
    if isinstance(block, CT_P):
        entry = (i, "paragraph", Paragraph(block, doc))
    elif isinstance(block, CT_Tbl):
        entry = (i, "table", Table(block, doc))
    else:
        # Neither a paragraph nor a table element: nothing to record.
        return
    content_items.append(entry)
241+
242+
234243
def _extract_text_from_docx(file_content: bytes) -> str:
235244
"""
236245
Extract text from a DOCX file.
@@ -244,16 +253,13 @@ def _extract_text_from_docx(file_content: bytes) -> str:
244253
# Keep track of paragraph and table positions
245254
content_items: list[tuple[int, str, Table | Paragraph]] = []
246255

247-
# Process paragraphs and tables
248-
for i, paragraph in enumerate(doc.paragraphs):
249-
if paragraph.text.strip():
250-
content_items.append((i, "paragraph", paragraph))
251-
252-
for i, table in enumerate(doc.tables):
253-
content_items.append((i, "table", table))
254-
255-
# Sort content items based on their original position
256-
content_items.sort(key=operator.itemgetter(0))
256+
it = iter(doc.element.body)
257+
part = next(it, None)
258+
i = 0
259+
while part is not None:
260+
paser_docx_part(part, doc, content_items, i)
261+
i = i + 1
262+
part = next(it, None)
257263

258264
# Process sorted content
259265
for _, item_type, item in content_items:

api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
from unittest.mock import Mock, patch
22

33
import pytest
4+
from docx.oxml.text.paragraph import CT_P
45

56
from core.file import File, FileTransferMethod
67
from core.variables import ArrayFileSegment
@@ -169,7 +170,12 @@ def test_extract_text_from_docx(mock_document):
169170
mock_paragraph2 = Mock()
170171
mock_paragraph2.text = "Paragraph 2"
171172
mock_document.return_value.paragraphs = [mock_paragraph1, mock_paragraph2]
172-
173+
mock_ct_p1 = Mock(spec=CT_P)
174+
mock_ct_p1.text = "Paragraph 1"
175+
mock_ct_p2 = Mock(spec=CT_P)
176+
mock_ct_p2.text = "Paragraph 2"
177+
mock_element = Mock(body=[mock_ct_p1, mock_ct_p2])
178+
mock_document.return_value.element = mock_element
173179
text = _extract_text_from_docx(b"PK\x03\x04")
174180
assert text == "Paragraph 1\nParagraph 2"
175181

0 commit comments

Comments
 (0)