catalyst-cooperative · e-belfer · Apr 11, 2023 · Apr 24, 2023 · Apr 24, 2023 · Apr 24, 2023
diff --git a/inputs/deprish_eia.xlsx b/inputs/deprish_eia.xlsx
diff --git a/inputs/deprish_raw.csv b/inputs/deprish_raw.csv
diff --git a/inputs/deprish_raw.xlsx b/inputs/deprish_raw.xlsx
diff --git a/notebooks/connect_deprish_to_eia.ipynb b/notebooks/connect_deprish_to_eia.ipynb
@@ -9,7 +9,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -19,9 +19,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "pudl_settings is being deprecated in favor of environment variables PUDL_OUTPUT and PUDL_INPUT. For more info see: https://catalystcoop-pudl.readthedocs.io/en/dev/dev/dev_setup.html\n",
+      "sqlite and parquet directories are no longer being used. Make sure there is a single directory named 'output' at the root of your workspace. For more info see: https://catalystcoop-pudl.readthedocs.io/en/dev/dev/dev_setup.html\n",
+      "pudl_settings is being deprecated in favor of environment variables PUDL_OUTPUT and PUDL_INPUT. For more info see: https://catalystcoop-pudl.readthedocs.io/en/dev/dev/dev_setup.html\n",
+      "sqlite and parquet directories are no longer being used. Make sure there is a single directory named 'output' at the root of your workspace. For more info see: https://catalystcoop-pudl.readthedocs.io/en/dev/dev/dev_setup.html\n"
+     ]
+    }
+   ],
    "source": [
     "import pandas as pd\n",
     "import numpy as np\n",
@@ -33,9 +44,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "ModuleNotFoundError",
+     "evalue": "No module named 'pudl_rmi'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[3], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mpudl_rmi\u001b[39;00m\n\u001b[1;32m      2\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mpudl_rmi\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mconnect_deprish_to_eia\u001b[39;00m \u001b[39mimport\u001b[39;00m \u001b[39m*\u001b[39m\n\u001b[1;32m      4\u001b[0m \u001b[39m# from fuzzywuzzy import fuzz \u001b[39;00m\n\u001b[1;32m      5\u001b[0m \u001b[39m# from fuzzywuzzy import process\u001b[39;00m\n",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'pudl_rmi'"
+     ]
+    }
+   ],
    "source": [
     "import pudl_rmi\n",
     "from pudl_rmi.connect_deprish_to_eia import *\n",
@@ -144,7 +167,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.2"
+   "version": "3.11.3"
   }
  },
  "nbformat": 4,

diff --git a/src/pudl_rmi/__init__.py b/src/pudl_rmi/__init__.py
@@ -31,8 +31,10 @@
 ``pudl_rmi.coordinate.Output()`` will either grab them or clobber them.
 """
 
-DEPRISH_RAW_XLSX: Path = INPUTS_DIR / "deprish_raw.xlsx"
+DEPRISH_RAW_CSV: Path = INPUTS_DIR / "deprish_raw.csv"
 """Path to the raw depreciation data."""
+DEPRISH_EIA_XLSX: Path = INPUTS_DIR / "deprish_eia.xlsx"
+"""Path to the EIA depreciation column maps and plant parts linkages."""
 DEPRISH_COMMON_LABELS_XLSX: Path = INPUTS_DIR / "deprish_common_labels.xlsx"
 """Path to mannual label of common records in depreciation studies."""
 FERC_ACCT_NAMES_CSV: Path = INPUTS_DIR / "ferc_acct_names.csv"

diff --git a/src/pudl_rmi/connect_deprish_to_eia.py b/src/pudl_rmi/connect_deprish_to_eia.py
@@ -101,7 +101,7 @@ def prep_deprish(deprish, plant_parts_eia, key_deprish):
         raise AssertionError(
             f"Found {len(baddies)} depreciation records which don't have "
             "cooresponding EIA plant-parts records. Check plant_id_eia's "
-            f"in {pudl_rmi.DEPRISH_RAW_XLSX}"
+            f"in {pudl_rmi.DEPRISH_EIA_XLSX}"
         )
     deprish_ids = (
         deprish_ids.loc[deprish_ids._merge == "both"]
@@ -280,7 +280,7 @@ def match_deprish_eia(deprish, plant_parts_eia, sheet_name_output):
         .pipe(add_record_id_fuzzy, plant_parts_eia=ppe, key_ppe=key_ppe)
         .pipe(
             add_overrides,
-            file_path_deprish=pudl_rmi.DEPRISH_RAW_XLSX,
+            file_path_deprish=pudl_rmi.DEPRISH_EIA_XLSX,
             sheet_name_output=sheet_name_output,
         )
     )
@@ -418,7 +418,7 @@ def execute(
             "Subset of Master Unit List": possible_matches_ppe,
         }
         save_to_workbook(
-            file_path=pudl_rmi.DEPRISH_RAW_XLSX, sheets_df_dict=sheets_df_dict
+            file_path=pudl_rmi.DEPRISH_EIA_XLSX, sheets_df_dict=sheets_df_dict
         )
     return deprish_match
 

diff --git a/src/pudl_rmi/deprish.py b/src/pudl_rmi/deprish.py
@@ -26,11 +26,9 @@
 
 how to run this module:
 file_path_deprish = pathlib.Path().cwd().parent/'depreciation_rmi.xlsx'
-sheet_name_deprish='Depreciation Studies Raw'
 transformer = deprish.Transformer(
     deprish.Extractor(
-        file_path=file_path_deprish,
-        sheet_name=sheet_name_deprish
+        file_path=file_path_deprish
     ).execute())
 deprish_df = transformer.execute()
 """
@@ -101,19 +99,16 @@ def execute(start_date=None, end_date=None):
 
 class Extractor:
     """
-    Extractor for turning excel based depreciation data into a dataframe.
-
-    Note: this should be overhualed if/when we switch from storing the
-    depreciation studies into a CSV. Also, if/when we integrate this into pudl,
-    we need to think more seriously about where to store the excel sheet/CSV.
-    Is it in pudl.package_data or do we store it through the datastore? If it
-    felt stable would it be worthwhile to store via zendo?.. in which case we
-    will want to use a datastore object to handle the path.
+    Extractor for turning CSV-based depreciation data into a dataframe.
+
+    Note: If/when we integrate this into pudl, we need to think more seriously about
+    where to store the CSV. Should it live in pudl.package_data, or should we store
+    it through the datastore? If the data feels stable, it may be worthwhile to archive
+    it on Zenodo, in which case we will want a datastore object to handle the path.
     """
 
     def __init__(
         self,
-        sheet_name="Depreciation Studies Raw",
         skiprows=0,
         start_date=None,
         end_date=None,
@@ -122,32 +117,30 @@ def __init__(
         Initialize a for deprish.Extractor.
 
         Args:
-            sheet_name (str, int): String used for excel sheet name or
-                integer used for zero-indexed sheet location.
             skiprows (int): rows to skip in zero-indexed column location,
                 default is 0.
             start_date (int): The start date of the date range to extract.
                 Default is None and all records before end_date will be extracted.
             end_date (int): The end date of the date range to extract.
                 Default is None and all records after start_date will be extracted.
         """
-        self.sheet_name = sheet_name
         self.skiprows = skiprows
         self.start_date = start_date
         self.end_date = end_date
 
     def execute(self):
-        """Turn excel-based depreciation data into a dataframe."""
-        logger.info(
-            "Reading the depreciation data from " f"{pudl_rmi.DEPRISH_RAW_XLSX}"
-        )
-        df = pd.read_excel(
-            pudl_rmi.DEPRISH_RAW_XLSX,
+        """Turn CSV-based depreciation data into a dataframe."""
+        logger.info("Reading the depreciation data from " f"{pudl_rmi.DEPRISH_RAW_CSV}")
+        df = pd.read_csv(
+            pudl_rmi.DEPRISH_RAW_CSV,
             skiprows=self.skiprows,
-            sheet_name=self.sheet_name,
             dtype={i: pd.Int64Dtype() for i in INT_IDS},
             na_values=NA_VALUES,
         )
+        df["report_date"] = pd.to_datetime(df["report_date"])
+        logger.info(df)
+        logger.info(df.dtypes)
+
         if self.start_date is None:
             self.start_date = min(df.report_date)
         if self.end_date is None:
@@ -408,7 +401,7 @@ def convert_rate_cols(self, tidy_df):
         # and decimal rates (i.e. .882 for 88.2%).
         # numbers of decimals (e.g. 88.2% would either be represented as
         # 88.2 or .882). Some % columns have boolean columns (ending in
-        # "type_pct") that we fleshed out to know wether the values were
+        # "type_pct") that we fleshed out to know whether the values were
         # reported as numbers or %s.
         to_num_cols = ["net_salvage_rate", "reserve_rate", "depreciation_annual_rate"]
         for col in to_num_cols: