alphanome-ai
diff --git a/‎docs/rtd_requirements.txt‎
Lines changed: 1656 additions & 1453 deletions b/‎docs/rtd_requirements.txt‎
Lines changed: 1656 additions & 1453 deletions
diff --git a/‎docs/source/notebooks/developer_guide.ipynb‎
Lines changed: 1 addition & 1 deletion b/‎docs/source/notebooks/developer_guide.ipynb‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/source/notebooks/user_guide.ipynb‎
Lines changed: 10 additions & 8 deletions b/‎docs/source/notebooks/user_guide.ipynb‎
Lines changed: 10 additions & 8 deletions
diff --git a/‎poetry.lock‎
Lines changed: 2812 additions & 2410 deletions b/‎poetry.lock‎
Lines changed: 2812 additions & 2410 deletions
diff --git a/‎sec_parser/__init__.py‎
Lines changed: 5 additions & 1 deletion b/‎sec_parser/__init__.py‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎sec_parser/processing_engine/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎sec_parser/processing_engine/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎sec_parser/processing_engine/core.py‎
Lines changed: 48 additions & 2 deletions b/‎sec_parser/processing_engine/core.py‎
Lines changed: 48 additions & 2 deletions
diff --git a/‎sec_parser/processing_steps/__init__.py‎
Lines changed: 3 additions & 1 deletion b/‎sec_parser/processing_steps/__init__.py‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎sec_parser/processing_steps/individual_semantic_element_extractor/single_element_checks/top_section_title_check.py‎
Lines changed: 1 addition & 1 deletion b/‎sec_parser/processing_steps/individual_semantic_element_extractor/single_element_checks/top_section_title_check.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎sec_parser/processing_steps/top_section_manager_for_10q.py‎ renamed to ‎sec_parser/processing_steps/top_section_manager.py‎
Lines changed: 73 additions & 26 deletions b/‎sec_parser/processing_steps/top_section_manager_for_10q.py‎ renamed to ‎sec_parser/processing_steps/top_section_manager.py‎
Lines changed: 73 additions & 26 deletions
@@ -696,7 +696,7 @@
       "    SEC filings usually have a flat HTML structure, which simplifies the\n",
       "    parsing process. Each top-level HTML tag often directly corresponds\n",
       "    to a single semantic element. This is different from many websites\n",
-      "    where HTML tags are nested deeply, requiring more complex parsing.\n",
+      "    where HTML tags are nested deeply,requiring more complex parsing.\n",
       "\n",
       "    For Advanced Users:\n",
       "    ====================\n",
 
@@ -212,7 +212,7 @@
       "\u001b[1;34mTopSectionTitle\u001b[0m: Item 1.    Financial Statements\n",
       "\u001b[1;34mTitleElement\u001b[0m: CONDENSED CONSOLIDATED STATEMENTS OF OPERATIONS (Unaudited)\n",
       "\u001b[1;34mSupplementaryText\u001b[0m: (In millions, except number of ...ousands, and per-share amounts)\n",
-      "\u001b[1;34mTableElement\u001b[0m: Table with ~24 rows, ~40 numbers, and 742 characters.\n",
+      "\u001b[1;34mTableElement\u001b[0m: Table with ~24 rows, ~80 numbers, and 1058 characters.\n",
       "\u001b[1;34mSupplementaryText\u001b[0m: See accompanying Notes to Conde...solidated Financial Statements.\n",
       "\u001b[1;34mTitleElement\u001b[0m: CONDENSED CONSOLIDATED STATEMEN...OMPREHENSIVE INCOME (Unaudited)\n",
       "...\n"
@@ -248,7 +248,7 @@
       "├── \u001b[1;34mTopSectionTitle\u001b[0m: Item 1.    Financial Statements\n",
       "│   ├── \u001b[1;34mTitleElement\u001b[0m: CONDENSED CONSOLIDATED STATEMENTS OF OPERATIONS (Unaudited)\n",
       "│   │   ├── \u001b[1;34mSupplementaryText\u001b[0m: (In millions, except number of ...ousands, and per-share amounts)\n",
-      "│   │   ├── \u001b[1;34mTableElement\u001b[0m: Table with ~24 rows, ~40 numbers, and 742 characters.\n",
+      "│   │   ├── \u001b[1;34mTableElement\u001b[0m: Table with ~24 rows, ~80 numbers, and 1058 characters.\n",
       "│   │   └── \u001b[1;34mSupplementaryText\u001b[0m: See accompanying Notes to Conde...solidated Financial Statements.\n",
       "│   ├── \u001b[1;34mTitleElement\u001b[0m: CONDENSED CONSOLIDATED STATEMEN...OMPREHENSIVE INCOME (Unaudited)\n",
       "...\n"
@@ -390,16 +390,18 @@
      "output_type": "stream",
      "text": [
       "\u001b[1;34mTitleElement\u001b[0m: Segment Operating Performance\n",
-      "├── \u001b[1;34mMyElement\u001b[0m: The following table shows net s...31, 2022 (dollars in millions):\n",
-      "├── \u001b[1;34mTableElement\u001b[0m: Table with ~7 rows, ~20 numbers, and 264 characters.\n",
+      "├── \u001b[1;34mMyElement\u001b[0m: The following table shows net s... 1, 2023 (dollars in millions):\n",
+      "├── \u001b[1;34mTableElement\u001b[0m: Table with ~7 rows, ~39 numbers, and 408 characters.\n",
       "├── \u001b[1;34mTitleElement\u001b[0m: Americas\n",
-      "│   └── \u001b[1;34mTextElement\u001b[0m: Americas net sales increased 2%...ring the first quarter of 2024.\n",
+      "│   └── \u001b[1;34mMyElement\u001b[0m: Americas net sales increased du... the first nine months of 2024.\n",
+      "├── \u001b[1;34mTitleElement\u001b[0m: Europe\n",
+      "│   └── \u001b[1;34mMyElement\u001b[0m: Europe net sales increased duri...earables, Home and Accessories.\n",
       "├── \u001b[1;34mTitleElement\u001b[0m: Greater China\n",
-      "│   └── \u001b[1;34mMyElement\u001b[0m: Greater China net sales decreas...ring the first quarter of 2024.\n",
+      "│   └── \u001b[1;34mMyElement\u001b[0m: Greater China net sales decreas... and first nine months of 2024.\n",
       "├── \u001b[1;34mTitleElement\u001b[0m: Japan\n",
-      "│   └── \u001b[1;34mMyElement\u001b[0m: Japan net sales increased 15% o...ring the first quarter of 2024.\n",
+      "│   └── \u001b[1;34mMyElement\u001b[0m: Japan net sales increased durin... and first nine months of 2024.\n",
       "└── \u001b[1;34mTitleElement\u001b[0m: Rest of Asia Pacific\n",
-      "    └── \u001b[1;34mMyElement\u001b[0m: Rest of Asia Pacific net sales ...earables, Home and Accessories.\n",
+      "    └── \u001b[1;34mMyElement\u001b[0m: Rest of Asia Pacific net sales ... and first nine months of 2024.\n",
       "...\n"
      ]
     }
 
@@ -3,7 +3,10 @@
     SecParserRuntimeError,
     SecParserValueError,
 )
-from sec_parser.processing_engine.core import Edgar10QParser
+from sec_parser.processing_engine.core import (
+    Edgar10KParser,
+    Edgar10QParser,
+)
 from sec_parser.processing_engine.html_tag import HtmlTag
 from sec_parser.processing_engine.types import ParsingOptions
 from sec_parser.processing_steps.abstract_classes.abstract_processing_step import (
@@ -34,6 +37,7 @@
 
 __all__ = [
     # Main parser classes
+    "Edgar10KParser",
     "Edgar10QParser",
     "TreeBuilder",
     # Common semantic elements
 
@@ -8,6 +8,7 @@
 
 from sec_parser.processing_engine.core import (
     AbstractSemanticElementParser,
+    Edgar10KParser,
     Edgar10QParser,
 )
 from sec_parser.processing_engine.html_tag import HtmlTag
@@ -16,6 +17,7 @@
 __all__ = [
     "HtmlTagParser",
     "AbstractSemanticElementParser",
+    "Edgar10KParser",
     "Edgar10QParser",
     "HtmlTag",
 ]
@@ -43,7 +43,8 @@
 from sec_parser.processing_steps.text_classifier import TextClassifier
 from sec_parser.processing_steps.text_element_merger import TextElementMerger
 from sec_parser.processing_steps.title_classifier import TitleClassifier
-from sec_parser.processing_steps.top_section_manager_for_10q import (
+from sec_parser.processing_steps.top_section_manager import (
+    TopSectionManagerFor10K,
     TopSectionManagerFor10Q,
 )
 from sec_parser.semantic_elements.composite_semantic_element import (
@@ -85,7 +86,7 @@ class AbstractSemanticElementParser(ABC):
     Why Focus on Top-Level Tags?
     ============================
     SEC filings usually have a flat HTML structure, which simplifies the
-    parsing process.Each top-level HTML tag often directly corresponds
+    parsing process. Each top-level HTML tag often directly corresponds
     to a single semantic element. This is different from many websites
     where HTML tags are nested deeply,requiring more complex parsing.
 
@@ -208,3 +209,48 @@ def get_default_single_element_checks(self) -> list[AbstractSingleElementCheck]:
             ImageCheck(),
             TopSectionTitleCheck(),
         ]
+
+class Edgar10KParser(AbstractSemanticElementParser):
+    """
+    The Edgar10KParser class is responsible for parsing SEC EDGAR 10-K
+    annual reports. It transforms the HTML documents into a list
+    of elements. Each element in this list represents a part of
+    the visual structure of the original document.
+    """
+
+    def get_default_steps(
+        self,
+        get_checks: Callable[[], list[AbstractSingleElementCheck]] | None = None,
+    ) -> list[AbstractProcessingStep]:
+        return [
+            IndividualSemanticElementExtractor(
+                get_checks=get_checks or self.get_default_single_element_checks,
+            ),
+            ImageClassifier(types_to_process={NotYetClassifiedElement}),
+            EmptyElementClassifier(types_to_process={NotYetClassifiedElement}),
+            TableClassifier(types_to_process={NotYetClassifiedElement}),
+            TableOfContentsClassifier(types_to_process={TableElement}),
+            TopSectionManagerFor10K(types_to_process={NotYetClassifiedElement}),
+            IntroductorySectionElementClassifier(),
+            TextClassifier(types_to_process={NotYetClassifiedElement}),
+            HighlightedTextClassifier(types_to_process={TextElement}),
+            SupplementaryTextClassifier(
+                types_to_process={TextElement, HighlightedTextElement},
+            ),
+            PageHeaderClassifier(
+                types_to_process={TextElement, HighlightedTextElement},
+            ),
+            PageNumberClassifier(
+                types_to_process={TextElement, HighlightedTextElement},
+            ),
+            TitleClassifier(types_to_process={HighlightedTextElement}),
+            TextElementMerger(),
+        ]
+
+    def get_default_single_element_checks(self) -> list[AbstractSingleElementCheck]:
+        return [
+            TableCheck(),
+            XbrlTagCheck(),
+            ImageCheck(),
+            TopSectionTitleCheck(),
+        ]
@@ -46,7 +46,8 @@
 from sec_parser.processing_steps.text_classifier import TextClassifier
 from sec_parser.processing_steps.text_element_merger import TextElementMerger
 from sec_parser.processing_steps.title_classifier import TitleClassifier
-from sec_parser.processing_steps.top_section_manager_for_10q import (
+from sec_parser.processing_steps.top_section_manager import (
+    TopSectionManagerFor10K,
     TopSectionManagerFor10Q,
 )
 
@@ -68,6 +69,7 @@
     "TextClassifier",
     "TextElementMerger",
     "TitleClassifier",
+    "TopSectionManagerFor10K",
     "TopSectionManagerFor10Q",
     "TopSectionTitleCheck",
     "XbrlTagCheck",
 
@@ -5,7 +5,7 @@
 from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.abstract_single_element_check import (
     AbstractSingleElementCheck,
 )
-from sec_parser.processing_steps.top_section_manager_for_10q import (
+from sec_parser.processing_steps.top_section_manager import (
     TopSectionManagerFor10Q,
 )
 
 
@@ -12,9 +12,11 @@
 )
 from sec_parser.semantic_elements.top_section_title import TopSectionTitle
 from sec_parser.semantic_elements.top_section_title_types import (
-    IDENTIFIER_TO_10Q_SECTION,
-    InvalidTopSectionIn10Q,
-    TopSectionType,
+    FilingSections,
+    FilingSectionsIn10K,
+    FilingSectionsIn10Q,
+    InvalidTopSectionInFiling,
+    TopSectionInFiling,
 )
 
 if TYPE_CHECKING:  # pragma: no cover
@@ -23,17 +25,17 @@
     )
 
 
-part_pattern = re.compile(r"part\s+(i+)[.\s]*", re.IGNORECASE)
-item_pattern = re.compile(r"item\s+(\d+a?)[.\s]*", re.IGNORECASE)
+part_pattern = re.compile(r"part\s+([iv]+)[.\s]*", re.IGNORECASE)
+item_pattern = re.compile(r"item\s+(\d+[a-c]?)[.\s]*", re.IGNORECASE)
 
 
 @dataclass
 class _Candidate:
-    section_type: TopSectionType
+    section_type: TopSectionInFiling
     element: AbstractSemanticElement
 
 
-class TopSectionManagerFor10Q(AbstractElementwiseProcessingStep):
+class TopSectionManager(AbstractElementwiseProcessingStep):
     """
     Documents are divided into sections, subsections, and so on.
     Top level sections are the highest level of sections and are
@@ -49,6 +51,7 @@ class TopSectionManagerFor10Q(AbstractElementwiseProcessingStep):
 
     def __init__(
         self,
+        filing_sections: FilingSections,
         *,
         types_to_process: set[type[AbstractSemanticElement]] | None = None,
         types_to_exclude: set[type[AbstractSemanticElement]] | None = None,
@@ -57,6 +60,7 @@ def __init__(
             types_to_process=types_to_process,
             types_to_exclude=types_to_exclude,
         )
+        self._filing_sections = filing_sections
         self._candidates: list[_Candidate] = []
         self._selected_candidates: tuple[_Candidate, ...] | None = None
         self._last_part: str = "?"
@@ -71,7 +75,10 @@ def is_match_part_or_item(cls, text: str) -> bool:
     @staticmethod
     def match_part(text: str) -> str | None:
         if match := part_pattern.match(text):
-            return str(len(match.group(1)))
+            part_text = match.group(1).lower()
+            # Map Roman numerals I-IV to digit strings; other matches (e.g. "v") give None
+            roman_map = {"i": "1", "ii": "2", "iii": "3", "iv": "4"}
+            return roman_map.get(part_text)
         return None
 
     @staticmethod
@@ -156,21 +163,21 @@ def _identify_candidate(self, element: AbstractSemanticElement) -> None:
         if part := self.match_part(element.text):
             self._last_part = part
             section_type = self._get_section_type(f"part{self._last_part}")
-            if section_type is InvalidTopSectionIn10Q:
-                    warnings.warn(
-                        f"Invalid section type for part{self._last_part}. Defaulting to InvalidTopSectionIn10Q.",
-                        UserWarning,
-                        stacklevel=8,
-                    )
+            if section_type is InvalidTopSectionInFiling:
+                warnings.warn(
+                    f"Invalid section type for part{self._last_part}. Defaulting to InvalidTopSectionInFiling.",
+                    UserWarning,
+                    stacklevel=8,
+                )
             candidate = _Candidate(section_type, element)
         elif item := self.match_item(element.text):
             section_type = self._get_section_type(f"part{self._last_part}item{item}")
-            if section_type is InvalidTopSectionIn10Q:
-                    warnings.warn(
-                        f"Invalid section type for part{self._last_part}item{item}. Defaulting to InvalidTopSectionIn10Q.",
-                        UserWarning,
-                        stacklevel=8,
-                    )
+            if section_type is InvalidTopSectionInFiling:
+                warnings.warn(
+                    f"Invalid section type for part{self._last_part}item{item}. Defaulting to InvalidTopSectionInFiling.",
+                    UserWarning,
+                    stacklevel=8,
+                )
             candidate = _Candidate(section_type, element)
 
 
@@ -182,17 +189,19 @@ def _identify_candidate(self, element: AbstractSemanticElement) -> None:
             )
 
     """
-    Returns the corresponding TopSectionType of the given identifier. The TopSectionType represents a standard top section type in the context of a 10-Q report.
-    The function utilizes the IDENTIFIER_TO_10Q_SECTION dictionary.
+    Returns the corresponding TopSectionInFiling of the given identifier.
+    The TopSectionInFiling represents a standard top section type in the context of an SEC filing.
+    The function utilizes the identifier_to_section dictionary of the given FilingSections object.
 
     Input:
     - identifier (type: String): an identifier of a top section title expressed by a string
 
     Output:
-    - returns the corresponding TopSectionType of the given identifier. Returns InvalisTopSectionIn10Q if the identifier doesn't match any TopSectionType.
+    - returns the corresponding TopSectionInFiling of the given identifier.
+    - Returns InvalidTopSectionInFiling if the identifier doesn't match any TopSectionInFiling.
     """
-    def _get_section_type(self, identifier: str) -> TopSectionType:
-        return IDENTIFIER_TO_10Q_SECTION.get(identifier, InvalidTopSectionIn10Q)
+    def _get_section_type(self, identifier: str) -> TopSectionInFiling:
+        return self._filing_sections.identifier_to_section.get(identifier, InvalidTopSectionInFiling)
 
     """"
     Groups candidates by section type. Then selects the first element candidate of each section type by using the helper function select_element.
@@ -229,7 +238,7 @@ def select_element(elements: list[AbstractSemanticElement]) -> AbstractSemanticE
                         if not element.html_tag.contains_tag("table", include_self = True)
                     ]
             if len(elements_without_table) >= 1:
-                    return elements_without_table[0]
+                return elements_without_table[0]
             return elements[0]
 
 
@@ -296,6 +305,44 @@ def _create_top_section_title(
             log_origin=self.__class__.__name__,
         )
 
+class TopSectionManagerFor10Q(TopSectionManager):
+    """
+    Specialized version of TopSectionManager for handling 10-Q filings.
+    Automatically uses FilingSectionsIn10Q while maintaining all the functionality
+    of the base class.
+    """
+
+    def __init__(
+        self,
+        *,
+        types_to_process: set[type[AbstractSemanticElement]] | None = None,
+        types_to_exclude: set[type[AbstractSemanticElement]] | None = None,
+    ) -> None:
+        super().__init__(
+            filing_sections=FilingSectionsIn10Q,
+            types_to_process=types_to_process,
+            types_to_exclude=types_to_exclude,
+        )
+
+class TopSectionManagerFor10K(TopSectionManager):
+    """
+    Specialized version of TopSectionManager for handling 10-K filings.
+    Automatically uses FilingSectionsIn10K while maintaining all the functionality
+    of the base class.
+    """
+
+    def __init__(
+        self,
+        *,
+        types_to_process: set[type[AbstractSemanticElement]] | None = None,
+        types_to_exclude: set[type[AbstractSemanticElement]] | None = None,
+    ) -> None:
+        super().__init__(
+            filing_sections=FilingSectionsIn10K,
+            types_to_process=types_to_process,
+            types_to_exclude=types_to_exclude,
+        )
+
 
 """
 Algorithm:
Original file line number	Diff line number	Diff line change
`@@ -8,6 +8,7 @@`
`8`	`8`
`9`	`9`	`from sec_parser.processing_engine.core import (`
`10`	`10`	`AbstractSemanticElementParser,`
	`11`	`+ Edgar10KParser,`
`11`	`12`	`Edgar10QParser,`
`12`	`13`	`)`
`13`	`14`	`from sec_parser.processing_engine.html_tag import HtmlTag`
`@@ -16,6 +17,7 @@`
`16`	`17`	`__all__ = [`
`17`	`18`	`"HtmlTagParser",`
`18`	`19`	`"AbstractSemanticElementParser",`
	`20`	`+ "Edgar10KParser",`
`19`	`21`	`"Edgar10QParser",`
`20`	`22`	`"HtmlTag",`
`21`	`23`	`]`
Original file line number	Diff line number	Diff line change
`@@ -5,7 +5,7 @@`
`5`	`5`	`from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.abstract_single_element_check import (`
`6`	`6`	`AbstractSingleElementCheck,`
`7`	`7`	`)`
`8`		`-from sec_parser.processing_steps.top_section_manager_for_10q import (`
	`8`	`+from sec_parser.processing_steps.top_section_manager import (`
`9`	`9`	`TopSectionManagerFor10Q,`
`10`	`10`	`)`
`11`	`11`