feat(dev_utils): remove sec-api.io integration from Debug Dashboard
Elijas committed Oct 12, 2023
1 parent 1a8ef9f commit 2dbdc33
Showing 6 changed files with 181 additions and 201 deletions.
README.md — 21 changes: 5 additions & 16 deletions

@@ -68,30 +68,19 @@ pip install sec-parser
 
 # Usage
 
-Before using the parser, you need to create an account at https://sec-api.io/. The first 100 API calls are free. After creating an account, you will receive an API key. This key should be added to your environment variables. You can do this using the following *bash* command:
+To retrieve the most recent 10-Q SEC EDGAR document in HTML format for Apple, follow these steps:
 
-```bash
-# Replace "your key here" with your actual key.
-# An example key might look like "aef7f2f22c8b3456de55"
-export SECAPIO_API_KEY="your key here"
-```
+```
 
-> **Note**
-sec-api.io is a third-party service that is not affiliated with `sec-parser`. We are planning to move away from this service in the near future and download the documents directly from the SEC EDGAR website.
 
-> **Note**
-The parser utilizes caching, so multiple calls to retrieve the same data will not consume your API calls limit.
+report into a collection of semantic elements extracted from the document.
 
-Once you have set up your API key, you can start using the parser in your *Python* code. Start by importing the `sec_parser` module as shown below:
+The following code snippet demonstrates how to do this:
 
 ```python
 import sec_parser as sp
 
-# Fetch and parse the latest Apple 10-Q report
-tree = sp.parse_latest("10-Q", ticker="AAPL")
-
-# Display the tree structure of the parsed document
-print(tree.render())
+elements = sp.SecParser().parse(html)
 ```
 Here is an example of the output you can expect:
 ```
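The rewritten Usage section parses an `html` string that the surrounding README text (outside this hunk) is expected to provide. Below is a minimal end-to-end sketch of the new flow, assuming the `sec-downloader` and `sec-edgar-downloader` packages behave as in the `get_latest_10q_html` helper introduced later in this commit; the company name and email are placeholders required by SEC EDGAR's fair-access policy:

```python
from sec_downloader import DownloadStorage
from sec_edgar_downloader import Downloader

import sec_parser as sp

# Download the latest Apple 10-Q filing from SEC EDGAR into a temporary
# folder, mirroring the call pattern of the new get_latest_10q_html helper.
storage = DownloadStorage(filter_pattern="**/*.htm*")
with storage as path:
    dl = Downloader("MyCompanyName", "my.email@example.com", path)
    dl.get("10-Q", "AAPL", limit=1, download_details=True)
html = storage.get_file_contents()[0].content

# Parse the filing HTML into a flat list of semantic elements.
elements = sp.SecParser().parse(html)
print(f"Parsed {len(elements)} semantic elements")
```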
dev_utils/debug_dashboard/app.py — 77 changes: 37 additions & 40 deletions

@@ -1,4 +1,3 @@
-import os
 from collections import Counter
 from dataclasses import dataclass
 from itertools import zip_longest
@@ -14,8 +13,7 @@
 import sec_parser.semantic_elements as se
 from dev_utils.debug_dashboard.general_utils import interleave_lists
 from dev_utils.debug_dashboard.sec_data_retrieval import (
-    download_html,
-    get_metadata,
+    get_latest_10q_html,
     get_semantic_elements,
     get_semantic_tree,
 )
@@ -79,7 +77,7 @@ def streamlit_app(
         do_element_render_html = True
         selected_step = 0
         do_interleave = False
-        use_tree_view = True
+        use_tree_view = False
         show_text_length = False
 
     if not HIDE_UI_ELEMENTS:
@@ -89,14 +87,15 @@
         with PassthroughContext():  # replace with st.expander("") if needed
             FIND_BY_TICKER = "Ticker symbols"
             ENTER_URL_DIRECTLY = "URLs"
-            data_source_option = sac.segmented(
-                items=[
-                    sac.SegmentedItem(label=FIND_BY_TICKER),
-                    sac.SegmentedItem(label=ENTER_URL_DIRECTLY),
-                ],
-                size="xs",
-                grow=True,
-            )
+            # data_source_option = sac.segmented(
+            #     items=[
+            #         sac.SegmentedItem(label=FIND_BY_TICKER),
+            #         sac.SegmentedItem(label=ENTER_URL_DIRECTLY),
+            #     ],
+            #     size="xs",
+            #     grow=True,
+            # )
+            data_source_option = FIND_BY_TICKER
             selected_ticker = data_source_option == FIND_BY_TICKER
             selected_url = data_source_option == ENTER_URL_DIRECTLY
             if selected_ticker:
@@ -140,32 +139,26 @@ def streamlit_app(
                 if not input_urls:
                     st.info("Please enter at least one URL.")
                     st.stop()
-            section_1_2, all_sections = st_radio(
-                "Select Report Sections",
-                ["Only MD&A", "All Report Sections"],
-                horizontal=True,
-                help="MD&A stands for Management Discussion and Analysis. It's a section of a company's annual report in which management discusses numerous aspects of the company, such as market dynamics, operating results, risk factors, and more.",
-            )
-            if section_1_2:
-                sections = ["part1item2"]
-            elif all_sections:
-                sections = None
+            section_1_2 = False
+            all_sections = True
+            # section_1_2, all_sections = st_radio(
+            #     "Select Report Sections",
+            #     ["Only MD&A", "All Report Sections"],
+            #     horizontal=True,
+            #     help="MD&A stands for Management Discussion and Analysis. It's a section of a company's annual report in which management discusses numerous aspects of the company, such as market dynamics, operating results, risk factors, and more.",
+            # )
+            # if section_1_2:
+            #     sections = ["part1item2"]
+            # elif all_sections:
+            #     sections = None
 
     assert tickers or input_urls
     for ticker in tickers:
-        metadata = get_metadata(
-            doc="10-Q",
-            latest_from_ticker=ticker,
-        )
+        metadata = None
         metadatas.append(metadata)
-        url = metadata["linkToFilingDetails"]
-        html = download_html(
-            doc="10-Q",
-            url=url,
-            sections=sections,
-            ticker=ticker,
-        )
-        htmls_urls.append(url)
+        # url = metadata["linkToFilingDetails"]
+        html = get_latest_10q_html(ticker=ticker)
+        htmls_urls.append(ticker)
         htmls.append(html)
     for url in input_urls:
         html = download_html(
@@ -174,7 +167,7 @@ def streamlit_app(
             sections=sections,
             ticker=None,
         )
-        metadata = get_metadata(doc="10-Q", url=url)
+        metadata = None
         metadatas.append(metadata)
         htmls_urls.append(url)
         htmls.append(html)
@@ -295,11 +288,10 @@ def format_cls(cls):
                 value=True,
             )
         if selected_step == 3:
-            use_expanders = st.checkbox(
-                "Merged view",
-                value=not use_tree_view,
+            use_tree_view = st.checkbox(
+                "Tree view",
+                value=use_tree_view,
             )
-            use_tree_view = not use_expanders
     with right:
         if selected_step == 2:
             element_column_count = st.number_input(
@@ -345,7 +337,10 @@ def format_cls(cls):
 
 def get_label(metadata, url):
     if not metadata:
-        return url.split("/")[-1]
+        if url and "/" in url:
+            return url.split("/")[-1]
+        else:
+            return url
     company_name = normalize_company_name(metadata["companyName"])
     form_type = metadata["formType"]
     filed_at = parse(metadata["filedAt"]).astimezone(tzutc()).strftime("%b %d, %Y")
@@ -355,6 +350,8 @@ def get_label(metadata, url):
     return f"**{company_name}** | {form_type} filed on {filed_at} for the period ended {period_of_report}"
 
 def get_buttons(metadata, url, *, align="end"):
+    if "/" not in url:
+        return
     if metadata:
         url_buttons = [
             {
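With sec-api.io metadata gone, the ticker path stores the ticker itself in `htmls_urls`, so `get_label` and `get_buttons` now branch on whether the string contains a "/". A standalone sketch of the new fallback, with hypothetical example values:

```python
def get_label(metadata, url):
    # Only the metadata-less branch is reproduced here; after this commit,
    # the dashboard always passes metadata=None.
    if not metadata:
        if url and "/" in url:
            return url.split("/")[-1]  # URL input: label by the last path segment
        else:
            return url  # ticker input: the stored "url" is just the ticker

print(get_label(None, "AAPL"))  # -> AAPL
print(get_label(None, "https://www.sec.gov/Archives/aapl-20230701.htm"))  # -> aapl-20230701.htm
```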
dev_utils/debug_dashboard/sec_data_retrieval.py — 51 changes: 14 additions & 37 deletions

@@ -1,53 +1,30 @@
-from typing import Optional
+from pathlib import Path
 
 import streamlit as st
+from sec_downloader import DownloadStorage
+from sec_edgar_downloader import Downloader
 
 import sec_parser as sp
 from dev_utils.debug_dashboard.cache_utils import cache_to_file
 
 
-@st.cache_data(
-    experimental_allow_widgets=True,
-    show_spinner="Retrieving SEC EDGAR document...",
-)
-@cache_to_file(
-    cache_by_keys={"latest_from_ticker", "doc", "url", "sections"},
-    cache_dir=".cache/metadata",
-)
-def get_metadata(
-    _secapi_api_key: str,  # prefix _ prevents hashing in st.cache_data
-    *,
-    doc: sp.DocumentType | str,
-    url: str | None = None,
-    latest_from_ticker: str | None = None,
-) -> str:
-    from sec_edgar_downloader import Downloader
-    retriever = sp.SecapioDataRetriever(api_key=_secapi_api_key)
-    return retriever.retrieve_report_metadata(
-        doc,
-        url=url,
-        latest_from_ticker=latest_from_ticker,
-    )
+EDGAR_CLIENT_NAME = "Alphanome.AI"
+EDGAR_CLIENT_EMAIL = "[email protected]"
 
 
-@st.cache_data(
-    experimental_allow_widgets=True,
-    show_spinner="Retrieving SEC EDGAR document...",
-)
-@cache_to_file(
-    cache_by_keys={"url", "ticker", "doc", "sections"},
-    cache_dir=".cache/html",
-)
-def download_html(
-    _secapi_api_key: str,  # prefix _ prevents hashing in st.cache_data
+def get_latest_10q_html(
     *,
-    ticker: str,  # added just to make the cache write ticker as part of the filename
-    doc: sp.DocumentType | str,
-    url: str,
-    sections: Optional[list[sp.SectionType | str]] = None,
+    ticker: str,
 ) -> str:
-    retriever = sp.SecapioDataRetriever(api_key=_secapi_api_key)
-    return retriever.get_report_html(doc, url, sections=sections)
+    ticker = ticker.upper().strip()
+    assert ticker, "Ticker must not be empty"
+    storage = DownloadStorage(filter_pattern="**/*.htm*")
+    with storage as path:
+        dl = Downloader(EDGAR_CLIENT_NAME, EDGAR_CLIENT_EMAIL, path)
+        dl.get("10-Q", ticker, limit=1, download_details=True)
+    return storage.get_file_contents()[0].content
 
 
 @st.cache_resource
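A usage sketch for the new helper, assuming `dev_utils` is importable (for example, when run from the repository root):

```python
from dev_utils.debug_dashboard.sec_data_retrieval import get_latest_10q_html

# The ticker is keyword-only and normalized inside the helper, so
# "aapl " and "AAPL" fetch the same latest 10-Q filing.
html = get_latest_10q_html(ticker="aapl ")
print(html[:200])
```

Note that, as the hunk above shows, the new helper drops the `@st.cache_data` and `cache_to_file` decorators that wrapped the sec-api.io calls, so each call downloads the filing again.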
