Skip to content

Commit deb0413

Browse files
Merge pull request #88 from akaashdash/main
Adding Support for Other Filing Types
2 parents 0171480 + 034817c commit deb0413

File tree

18 files changed

+5113
-4025
lines changed

18 files changed

+5113
-4025
lines changed

docs/rtd_requirements.txt

Lines changed: 1656 additions & 1453 deletions
Large diffs are not rendered by default.

docs/source/notebooks/developer_guide.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -696,7 +696,7 @@
696696
" SEC filings usually have a flat HTML structure, which simplifies the\n",
697697
" parsing process. Each top-level HTML tag often directly corresponds\n",
698698
" to a single semantic element. This is different from many websites\n",
699-
" where HTML tags are nested deeply, requiring more complex parsing.\n",
699+
" where HTML tags are nested deeply,requiring more complex parsing.\n",
700700
"\n",
701701
" For Advanced Users:\n",
702702
" ====================\n",

docs/source/notebooks/user_guide.ipynb

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,7 @@
212212
"\u001b[1;34mTopSectionTitle\u001b[0m: Item 1.    Financial Statements\n",
213213
"\u001b[1;34mTitleElement\u001b[0m: CONDENSED CONSOLIDATED STATEMENTS OF OPERATIONS (Unaudited)\n",
214214
"\u001b[1;34mSupplementaryText\u001b[0m: (In millions, except number of ...ousands, and per-share amounts)\n",
215-
"\u001b[1;34mTableElement\u001b[0m: Table with ~24 rows, ~40 numbers, and 742 characters.\n",
215+
"\u001b[1;34mTableElement\u001b[0m: Table with ~24 rows, ~80 numbers, and 1058 characters.\n",
216216
"\u001b[1;34mSupplementaryText\u001b[0m: See accompanying Notes to Conde...solidated Financial Statements.\n",
217217
"\u001b[1;34mTitleElement\u001b[0m: CONDENSED CONSOLIDATED STATEMEN...OMPREHENSIVE INCOME (Unaudited)\n",
218218
"...\n"
@@ -248,7 +248,7 @@
248248
"├── \u001b[1;34mTopSectionTitle\u001b[0m: Item 1.    Financial Statements\n",
249249
"│ ├── \u001b[1;34mTitleElement\u001b[0m: CONDENSED CONSOLIDATED STATEMENTS OF OPERATIONS (Unaudited)\n",
250250
"│ │ ├── \u001b[1;34mSupplementaryText\u001b[0m: (In millions, except number of ...ousands, and per-share amounts)\n",
251-
"│ │ ├── \u001b[1;34mTableElement\u001b[0m: Table with ~24 rows, ~40 numbers, and 742 characters.\n",
251+
"│ │ ├── \u001b[1;34mTableElement\u001b[0m: Table with ~24 rows, ~80 numbers, and 1058 characters.\n",
252252
"│ │ └── \u001b[1;34mSupplementaryText\u001b[0m: See accompanying Notes to Conde...solidated Financial Statements.\n",
253253
"│ ├── \u001b[1;34mTitleElement\u001b[0m: CONDENSED CONSOLIDATED STATEMEN...OMPREHENSIVE INCOME (Unaudited)\n",
254254
"...\n"
@@ -390,16 +390,18 @@
390390
"output_type": "stream",
391391
"text": [
392392
"\u001b[1;34mTitleElement\u001b[0m: Segment Operating Performance\n",
393-
"├── \u001b[1;34mMyElement\u001b[0m: The following table shows net s...31, 2022 (dollars in millions):\n",
394-
"├── \u001b[1;34mTableElement\u001b[0m: Table with ~7 rows, ~20 numbers, and 264 characters.\n",
393+
"├── \u001b[1;34mMyElement\u001b[0m: The following table shows net s... 1, 2023 (dollars in millions):\n",
394+
"├── \u001b[1;34mTableElement\u001b[0m: Table with ~7 rows, ~39 numbers, and 408 characters.\n",
395395
"├── \u001b[1;34mTitleElement\u001b[0m: Americas\n",
396-
"│ └── \u001b[1;34mTextElement\u001b[0m: Americas net sales increased 2%...ring the first quarter of 2024.\n",
396+
"│ └── \u001b[1;34mMyElement\u001b[0m: Americas net sales increased du... the first nine months of 2024.\n",
397+
"├── \u001b[1;34mTitleElement\u001b[0m: Europe\n",
398+
"│ └── \u001b[1;34mMyElement\u001b[0m: Europe net sales increased duri...earables, Home and Accessories.\n",
397399
"├── \u001b[1;34mTitleElement\u001b[0m: Greater China\n",
398-
"│ └── \u001b[1;34mMyElement\u001b[0m: Greater China net sales decreas...ring the first quarter of 2024.\n",
400+
"│ └── \u001b[1;34mMyElement\u001b[0m: Greater China net sales decreas... and first nine months of 2024.\n",
399401
"├── \u001b[1;34mTitleElement\u001b[0m: Japan\n",
400-
"│ └── \u001b[1;34mMyElement\u001b[0m: Japan net sales increased 15% o...ring the first quarter of 2024.\n",
402+
"│ └── \u001b[1;34mMyElement\u001b[0m: Japan net sales increased durin... and first nine months of 2024.\n",
401403
"└── \u001b[1;34mTitleElement\u001b[0m: Rest of Asia Pacific\n",
402-
" └── \u001b[1;34mMyElement\u001b[0m: Rest of Asia Pacific net sales ...earables, Home and Accessories.\n",
404+
" └── \u001b[1;34mMyElement\u001b[0m: Rest of Asia Pacific net sales ... and first nine months of 2024.\n",
403405
"...\n"
404406
]
405407
}

poetry.lock

Lines changed: 2812 additions & 2410 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

sec_parser/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,10 @@
33
SecParserRuntimeError,
44
SecParserValueError,
55
)
6-
from sec_parser.processing_engine.core import Edgar10QParser
6+
from sec_parser.processing_engine.core import (
7+
Edgar10KParser,
8+
Edgar10QParser,
9+
)
710
from sec_parser.processing_engine.html_tag import HtmlTag
811
from sec_parser.processing_engine.types import ParsingOptions
912
from sec_parser.processing_steps.abstract_classes.abstract_processing_step import (
@@ -34,6 +37,7 @@
3437

3538
__all__ = [
3639
# Main parser classes
40+
"Edgar10KParser",
3741
"Edgar10QParser",
3842
"TreeBuilder",
3943
# Common semantic elements

sec_parser/processing_engine/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
from sec_parser.processing_engine.core import (
1010
AbstractSemanticElementParser,
11+
Edgar10KParser,
1112
Edgar10QParser,
1213
)
1314
from sec_parser.processing_engine.html_tag import HtmlTag
@@ -16,6 +17,7 @@
1617
__all__ = [
1718
"HtmlTagParser",
1819
"AbstractSemanticElementParser",
20+
"Edgar10KParser",
1921
"Edgar10QParser",
2022
"HtmlTag",
2123
]

sec_parser/processing_engine/core.py

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@
4343
from sec_parser.processing_steps.text_classifier import TextClassifier
4444
from sec_parser.processing_steps.text_element_merger import TextElementMerger
4545
from sec_parser.processing_steps.title_classifier import TitleClassifier
46-
from sec_parser.processing_steps.top_section_manager_for_10q import (
46+
from sec_parser.processing_steps.top_section_manager import (
47+
TopSectionManagerFor10K,
4748
TopSectionManagerFor10Q,
4849
)
4950
from sec_parser.semantic_elements.composite_semantic_element import (
@@ -85,7 +86,7 @@ class AbstractSemanticElementParser(ABC):
8586
Why Focus on Top-Level Tags?
8687
============================
8788
SEC filings usually have a flat HTML structure, which simplifies the
88-
parsing process.Each top-level HTML tag often directly corresponds
89+
parsing process. Each top-level HTML tag often directly corresponds
8990
to a single semantic element. This is different from many websites
9091
where HTML tags are nested deeply, requiring more complex parsing.
9192
@@ -208,3 +209,48 @@ def get_default_single_element_checks(self) -> list[AbstractSingleElementCheck]:
208209
ImageCheck(),
209210
TopSectionTitleCheck(),
210211
]
212+
213+
class Edgar10KParser(AbstractSemanticElementParser):
214+
"""
215+
The Edgar10KParser class is responsible for parsing SEC EDGAR 10-K
216+
annual reports. It transforms the HTML documents into a list
217+
of elements. Each element in this list represents a part of
218+
the visual structure of the original document.
219+
"""
220+
221+
def get_default_steps(
222+
self,
223+
get_checks: Callable[[], list[AbstractSingleElementCheck]] | None = None,
224+
) -> list[AbstractProcessingStep]:
225+
return [
226+
IndividualSemanticElementExtractor(
227+
get_checks=get_checks or self.get_default_single_element_checks,
228+
),
229+
ImageClassifier(types_to_process={NotYetClassifiedElement}),
230+
EmptyElementClassifier(types_to_process={NotYetClassifiedElement}),
231+
TableClassifier(types_to_process={NotYetClassifiedElement}),
232+
TableOfContentsClassifier(types_to_process={TableElement}),
233+
TopSectionManagerFor10K(types_to_process={NotYetClassifiedElement}),
234+
IntroductorySectionElementClassifier(),
235+
TextClassifier(types_to_process={NotYetClassifiedElement}),
236+
HighlightedTextClassifier(types_to_process={TextElement}),
237+
SupplementaryTextClassifier(
238+
types_to_process={TextElement, HighlightedTextElement},
239+
),
240+
PageHeaderClassifier(
241+
types_to_process={TextElement, HighlightedTextElement},
242+
),
243+
PageNumberClassifier(
244+
types_to_process={TextElement, HighlightedTextElement},
245+
),
246+
TitleClassifier(types_to_process={HighlightedTextElement}),
247+
TextElementMerger(),
248+
]
249+
250+
def get_default_single_element_checks(self) -> list[AbstractSingleElementCheck]:
251+
return [
252+
TableCheck(),
253+
XbrlTagCheck(),
254+
ImageCheck(),
255+
TopSectionTitleCheck(),
256+
]

sec_parser/processing_steps/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,8 @@
4646
from sec_parser.processing_steps.text_classifier import TextClassifier
4747
from sec_parser.processing_steps.text_element_merger import TextElementMerger
4848
from sec_parser.processing_steps.title_classifier import TitleClassifier
49-
from sec_parser.processing_steps.top_section_manager_for_10q import (
49+
from sec_parser.processing_steps.top_section_manager import (
50+
TopSectionManagerFor10K,
5051
TopSectionManagerFor10Q,
5152
)
5253

@@ -68,6 +69,7 @@
6869
"TextClassifier",
6970
"TextElementMerger",
7071
"TitleClassifier",
72+
"TopSectionManagerFor10K",
7173
"TopSectionManagerFor10Q",
7274
"TopSectionTitleCheck",
7375
"XbrlTagCheck",

sec_parser/processing_steps/individual_semantic_element_extractor/single_element_checks/top_section_title_check.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.abstract_single_element_check import (
66
AbstractSingleElementCheck,
77
)
8-
from sec_parser.processing_steps.top_section_manager_for_10q import (
8+
from sec_parser.processing_steps.top_section_manager import (
99
TopSectionManagerFor10Q,
1010
)
1111

sec_parser/processing_steps/top_section_manager_for_10q.py renamed to sec_parser/processing_steps/top_section_manager.py

Lines changed: 73 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,11 @@
1212
)
1313
from sec_parser.semantic_elements.top_section_title import TopSectionTitle
1414
from sec_parser.semantic_elements.top_section_title_types import (
15-
IDENTIFIER_TO_10Q_SECTION,
16-
InvalidTopSectionIn10Q,
17-
TopSectionType,
15+
FilingSections,
16+
FilingSectionsIn10K,
17+
FilingSectionsIn10Q,
18+
InvalidTopSectionInFiling,
19+
TopSectionInFiling,
1820
)
1921

2022
if TYPE_CHECKING: # pragma: no cover
@@ -23,17 +25,17 @@
2325
)
2426

2527

26-
part_pattern = re.compile(r"part\s+(i+)[.\s]*", re.IGNORECASE)
27-
item_pattern = re.compile(r"item\s+(\d+a?)[.\s]*", re.IGNORECASE)
28+
part_pattern = re.compile(r"part\s+([iv]+)[.\s]*", re.IGNORECASE)
29+
item_pattern = re.compile(r"item\s+(\d+[a-c]?)[.\s]*", re.IGNORECASE)
2830

2931

3032
@dataclass
3133
class _Candidate:
32-
section_type: TopSectionType
34+
section_type: TopSectionInFiling
3335
element: AbstractSemanticElement
3436

3537

36-
class TopSectionManagerFor10Q(AbstractElementwiseProcessingStep):
38+
class TopSectionManager(AbstractElementwiseProcessingStep):
3739
"""
3840
Documents are divided into sections, subsections, and so on.
3941
Top level sections are the highest level of sections and are
@@ -49,6 +51,7 @@ class TopSectionManagerFor10Q(AbstractElementwiseProcessingStep):
4951

5052
def __init__(
5153
self,
54+
filing_sections: FilingSections,
5255
*,
5356
types_to_process: set[type[AbstractSemanticElement]] | None = None,
5457
types_to_exclude: set[type[AbstractSemanticElement]] | None = None,
@@ -57,6 +60,7 @@ def __init__(
5760
types_to_process=types_to_process,
5861
types_to_exclude=types_to_exclude,
5962
)
63+
self._filing_sections = filing_sections
6064
self._candidates: list[_Candidate] = []
6165
self._selected_candidates: tuple[_Candidate, ...] | None = None
6266
self._last_part: str = "?"
@@ -71,7 +75,10 @@ def is_match_part_or_item(cls, text: str) -> bool:
7175
@staticmethod
7276
def match_part(text: str) -> str | None:
7377
if match := part_pattern.match(text):
74-
return str(len(match.group(1)))
78+
part_text = match.group(1).lower()
79+
# Map roman numerals to arabic numbers
80+
roman_map = {"i": "1", "ii": "2", "iii": "3", "iv": "4"}
81+
return roman_map.get(part_text)
7582
return None
7683

7784
@staticmethod
@@ -156,21 +163,21 @@ def _identify_candidate(self, element: AbstractSemanticElement) -> None:
156163
if part := self.match_part(element.text):
157164
self._last_part = part
158165
section_type = self._get_section_type(f"part{self._last_part}")
159-
if section_type is InvalidTopSectionIn10Q:
160-
warnings.warn(
161-
f"Invalid section type for part{self._last_part}. Defaulting to InvalidTopSectionIn10Q.",
162-
UserWarning,
163-
stacklevel=8,
164-
)
166+
if section_type is InvalidTopSectionInFiling:
167+
warnings.warn(
168+
f"Invalid section type for part{self._last_part}. Defaulting to InvalidTopSectionInFiling.",
169+
UserWarning,
170+
stacklevel=8,
171+
)
165172
candidate = _Candidate(section_type, element)
166173
elif item := self.match_item(element.text):
167174
section_type = self._get_section_type(f"part{self._last_part}item{item}")
168-
if section_type is InvalidTopSectionIn10Q:
169-
warnings.warn(
170-
f"Invalid section type for part{self._last_part}item{item}. Defaulting to InvalidTopSectionIn10Q.",
171-
UserWarning,
172-
stacklevel=8,
173-
)
175+
if section_type is InvalidTopSectionInFiling:
176+
warnings.warn(
177+
f"Invalid section type for part{self._last_part}item{item}. Defaulting to InvalidTopSectionInFiling.",
178+
UserWarning,
179+
stacklevel=8,
180+
)
174181
candidate = _Candidate(section_type, element)
175182

176183

@@ -182,17 +189,19 @@ def _identify_candidate(self, element: AbstractSemanticElement) -> None:
182189
)
183190

184191
"""
185-
Returns the corresponding TopSectionType of the given identifier. The TopSectionType represents a standard top section type in the context of a 10-Q report.
186-
The function utilizes the IDENTIFIER_TO_10Q_SECTION dictionary.
192+
Returns the corresponding TopSectionInFiling of the given identifier.
193+
The TopSectionInFiling represents a standard top section type in the context of an SEC filing.
194+
The function utilizes the identifier_to_section dictionary of the given FilingSections object.
187195
188196
Input:
189197
- identifier (type: String): an identifier of a top section title expressed by a string
190198
191199
Output:
192-
- returns the corresponding TopSectionType of the given identifier. Returns InvalisTopSectionIn10Q if the identifier doesn't match any TopSectionType.
200+
- returns the corresponding TopSectionInFiling of the given identifier.
201+
- Returns InvalidTopSectionInFiling if the identifier doesn't match any TopSectionInFiling.
193202
"""
194-
def _get_section_type(self, identifier: str) -> TopSectionType:
195-
return IDENTIFIER_TO_10Q_SECTION.get(identifier, InvalidTopSectionIn10Q)
203+
def _get_section_type(self, identifier: str) -> TopSectionInFiling:
204+
return self._filing_sections.identifier_to_section.get(identifier, InvalidTopSectionInFiling)
196205

197206
"""
198207
Groups candidates by section type. Then selects the first element candidate of each section type by using the helper function select_element.
@@ -229,7 +238,7 @@ def select_element(elements: list[AbstractSemanticElement]) -> AbstractSemanticE
229238
if not element.html_tag.contains_tag("table", include_self = True)
230239
]
231240
if len(elements_without_table) >= 1:
232-
return elements_without_table[0]
241+
return elements_without_table[0]
233242
return elements[0]
234243

235244

@@ -296,6 +305,44 @@ def _create_top_section_title(
296305
log_origin=self.__class__.__name__,
297306
)
298307

308+
class TopSectionManagerFor10Q(TopSectionManager):
309+
"""
310+
Specialized version of TopSectionManager for handling 10-Q filings.
311+
Automatically uses FilingSectionsIn10Q while maintaining all the functionality
312+
of the base class.
313+
"""
314+
315+
def __init__(
316+
self,
317+
*,
318+
types_to_process: set[type[AbstractSemanticElement]] | None = None,
319+
types_to_exclude: set[type[AbstractSemanticElement]] | None = None,
320+
) -> None:
321+
super().__init__(
322+
filing_sections=FilingSectionsIn10Q,
323+
types_to_process=types_to_process,
324+
types_to_exclude=types_to_exclude,
325+
)
326+
327+
class TopSectionManagerFor10K(TopSectionManager):
328+
"""
329+
Specialized version of TopSectionManager for handling 10-K filings.
330+
Automatically uses FilingSectionsIn10K while maintaining all the functionality
331+
of the base class.
332+
"""
333+
334+
def __init__(
335+
self,
336+
*,
337+
types_to_process: set[type[AbstractSemanticElement]] | None = None,
338+
types_to_exclude: set[type[AbstractSemanticElement]] | None = None,
339+
) -> None:
340+
super().__init__(
341+
filing_sections=FilingSectionsIn10K,
342+
types_to_process=types_to_process,
343+
types_to_exclude=types_to_exclude,
344+
)
345+
299346

300347
"""
301348
Algorithm:

0 commit comments

Comments
 (0)