1212)
1313from sec_parser .semantic_elements .top_section_title import TopSectionTitle
1414from sec_parser .semantic_elements .top_section_title_types import (
15- IDENTIFIER_TO_10Q_SECTION ,
16- InvalidTopSectionIn10Q ,
17- TopSectionType ,
15+ FilingSections ,
16+ FilingSectionsIn10K ,
17+ FilingSectionsIn10Q ,
18+ InvalidTopSectionInFiling ,
19+ TopSectionInFiling ,
1820)
1921
2022if TYPE_CHECKING : # pragma: no cover
2325 )
2426
2527
# Heading matchers for top-level section titles; both ignore letter case
# and consume any dots/whitespace that trail the captured identifier.

# Captures the run of "i"/"v" characters that follows the word "part".
part_pattern = re.compile(
    r"part\s+([iv]+)[.\s]*",
    flags=re.IGNORECASE,
)

# Captures the item number that follows the word "item": one or more
# digits, optionally followed by a single letter in the range a-c.
item_pattern = re.compile(
    r"item\s+(\d+[a-c]?)[.\s]*",
    flags=re.IGNORECASE,
)
2830
2931
@dataclass
class _Candidate:
    """
    Pairs a semantic element with the top section type resolved for it.
    """

    # Section type looked up from the element's part/item identifier.
    section_type: TopSectionInFiling

    # The element whose text matched a part or item heading.
    element: AbstractSemanticElement
3436
3537
36- class TopSectionManagerFor10Q (AbstractElementwiseProcessingStep ):
38+ class TopSectionManager (AbstractElementwiseProcessingStep ):
3739 """
3840 Documents are divided into sections, subsections, and so on.
3941 Top level sections are the highest level of sections and are
@@ -49,6 +51,7 @@ class TopSectionManagerFor10Q(AbstractElementwiseProcessingStep):
4951
5052 def __init__ (
5153 self ,
54+ filing_sections : FilingSections ,
5255 * ,
5356 types_to_process : set [type [AbstractSemanticElement ]] | None = None ,
5457 types_to_exclude : set [type [AbstractSemanticElement ]] | None = None ,
@@ -57,6 +60,7 @@ def __init__(
5760 types_to_process = types_to_process ,
5861 types_to_exclude = types_to_exclude ,
5962 )
63+ self ._filing_sections = filing_sections
6064 self ._candidates : list [_Candidate ] = []
6165 self ._selected_candidates : tuple [_Candidate , ...] | None = None
6266 self ._last_part : str = "?"
@@ -71,7 +75,10 @@ def is_match_part_or_item(cls, text: str) -> bool:
@staticmethod
def match_part(text: str) -> str | None:
    """
    Extract the part number from text that begins with a part heading.

    Input:
    - text (type: String): the text to test against part_pattern

    Output:
    - the part number as a string ("1" to "4") when the captured roman
      numeral is one of i, ii, iii or iv (case-insensitive).
    - None when the text does not match, or when the captured numeral
      is not one of those four.
    """
    found = part_pattern.match(text)
    if found is None:
        return None

    # Only the four numerals below are recognised; any other captured
    # combination of "i"/"v" falls through to None via dict.get.
    arabic_by_roman = {"i": "1", "ii": "2", "iii": "3", "iv": "4"}
    numeral = found.group(1).lower()
    return arabic_by_roman.get(numeral)
7683
7784 @staticmethod
@@ -156,21 +163,21 @@ def _identify_candidate(self, element: AbstractSemanticElement) -> None:
156163 if part := self .match_part (element .text ):
157164 self ._last_part = part
158165 section_type = self ._get_section_type (f"part{ self ._last_part } " )
159- if section_type is InvalidTopSectionIn10Q :
160- warnings .warn (
161- f"Invalid section type for part{ self ._last_part } . Defaulting to InvalidTopSectionIn10Q ." ,
162- UserWarning ,
163- stacklevel = 8 ,
164- )
166+ if section_type is InvalidTopSectionInFiling :
167+ warnings .warn (
168+ f"Invalid section type for part{ self ._last_part } . Defaulting to InvalidTopSectionInFiling ." ,
169+ UserWarning ,
170+ stacklevel = 8 ,
171+ )
165172 candidate = _Candidate (section_type , element )
166173 elif item := self .match_item (element .text ):
167174 section_type = self ._get_section_type (f"part{ self ._last_part } item{ item } " )
168- if section_type is InvalidTopSectionIn10Q :
169- warnings .warn (
170- f"Invalid section type for part{ self ._last_part } item{ item } . Defaulting to InvalidTopSectionIn10Q ." ,
171- UserWarning ,
172- stacklevel = 8 ,
173- )
175+ if section_type is InvalidTopSectionInFiling :
176+ warnings .warn (
177+ f"Invalid section type for part{ self ._last_part } item{ item } . Defaulting to InvalidTopSectionInFiling ." ,
178+ UserWarning ,
179+ stacklevel = 8 ,
180+ )
174181 candidate = _Candidate (section_type , element )
175182
176183
@@ -182,17 +189,19 @@ def _identify_candidate(self, element: AbstractSemanticElement) -> None:
182189 )
183190
def _get_section_type(self, identifier: str) -> TopSectionInFiling:
    """
    Return the TopSectionInFiling registered for the given identifier.

    The TopSectionInFiling represents a standard top section type in the
    context of an SEC filing. The lookup uses the identifier_to_section
    dictionary of the FilingSections object stored on this instance.

    Input:
    - identifier (type: String): an identifier of a top section title
      expressed by a string, e.g. "part1" or "part1item2"

    Output:
    - returns the corresponding TopSectionInFiling of the given identifier.
    - Returns InvalidTopSectionInFiling if the identifier doesn't match
      any TopSectionInFiling.
    """
    sections_by_identifier = self._filing_sections.identifier_to_section
    return sections_by_identifier.get(identifier, InvalidTopSectionInFiling)
196205
197206 """"
198207 Groups candidates by section type. Then selects the first element candidate of each section type by using the helper function select_element.
@@ -229,7 +238,7 @@ def select_element(elements: list[AbstractSemanticElement]) -> AbstractSemanticE
229238 if not element .html_tag .contains_tag ("table" , include_self = True )
230239 ]
231240 if len (elements_without_table ) >= 1 :
232- return elements_without_table [0 ]
241+ return elements_without_table [0 ]
233242 return elements [0 ]
234243
235244
@@ -296,6 +305,44 @@ def _create_top_section_title(
296305 log_origin = self .__class__ .__name__ ,
297306 )
298307
class TopSectionManagerFor10Q(TopSectionManager):
    """
    TopSectionManager preconfigured for 10-Q filings.

    The constructor always supplies FilingSectionsIn10Q as the filing
    sections and forwards the element type filters to the base class
    unchanged.
    """

    def __init__(
        self,
        *,
        types_to_process: set[type[AbstractSemanticElement]] | None = None,
        types_to_exclude: set[type[AbstractSemanticElement]] | None = None,
    ) -> None:
        # The section table is fixed for this subclass; only the element
        # type filters remain configurable by the caller.
        super().__init__(
            FilingSectionsIn10Q,
            types_to_process=types_to_process,
            types_to_exclude=types_to_exclude,
        )
326+
class TopSectionManagerFor10K(TopSectionManager):
    """
    TopSectionManager preconfigured for 10-K filings.

    The constructor always supplies FilingSectionsIn10K as the filing
    sections and forwards the element type filters to the base class
    unchanged.
    """

    def __init__(
        self,
        *,
        types_to_process: set[type[AbstractSemanticElement]] | None = None,
        types_to_exclude: set[type[AbstractSemanticElement]] | None = None,
    ) -> None:
        # The section table is fixed for this subclass; only the element
        # type filters remain configurable by the caller.
        super().__init__(
            FilingSectionsIn10K,
            types_to_process=types_to_process,
            types_to_exclude=types_to_exclude,
        )
345+
299346
300347"""
301348Algorithm:
0 commit comments