Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

migrate the depreciation studies out of excel sheet -> CSV for easier multi-person editing/adding #336

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added inputs/deprish_eia.xlsx
Binary file not shown.
18,767 changes: 18,767 additions & 0 deletions inputs/deprish_raw.csv

Large diffs are not rendered by default.

Binary file removed inputs/deprish_raw.xlsx
Binary file not shown.
35 changes: 29 additions & 6 deletions notebooks/connect_deprish_to_eia.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -19,9 +19,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"pudl_settings is being deprecated in favor of environment variables PUDL_OUTPUT and PUDL_INPUT. For more info see: https://catalystcoop-pudl.readthedocs.io/en/dev/dev/dev_setup.html\n",
"sqlite and parquet directories are no longer being used. Make sure there is a single directory named 'output' at the root of your workspace. For more info see: https://catalystcoop-pudl.readthedocs.io/en/dev/dev/dev_setup.html\n",
"pudl_settings is being deprecated in favor of environment variables PUDL_OUTPUT and PUDL_INPUT. For more info see: https://catalystcoop-pudl.readthedocs.io/en/dev/dev/dev_setup.html\n",
"sqlite and parquet directories are no longer being used. Make sure there is a single directory named 'output' at the root of your workspace. For more info see: https://catalystcoop-pudl.readthedocs.io/en/dev/dev/dev_setup.html\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
Expand All @@ -33,9 +44,21 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'pudl_rmi'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[3], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mpudl_rmi\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mpudl_rmi\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mconnect_deprish_to_eia\u001b[39;00m \u001b[39mimport\u001b[39;00m \u001b[39m*\u001b[39m\n\u001b[1;32m 4\u001b[0m \u001b[39m# from fuzzywuzzy import fuzz \u001b[39;00m\n\u001b[1;32m 5\u001b[0m \u001b[39m# from fuzzywuzzy import process\u001b[39;00m\n",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'pudl_rmi'"
]
}
],
"source": [
"import pudl_rmi\n",
"from pudl_rmi.connect_deprish_to_eia import *\n",
Expand Down Expand Up @@ -144,7 +167,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.2"
"version": "3.11.3"
}
},
"nbformat": 4,
Expand Down
4 changes: 3 additions & 1 deletion src/pudl_rmi/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,10 @@
``pudl_rmi.coordinate.Output()`` will either grab them or clobber them.
"""

DEPRISH_RAW_XLSX: Path = INPUTS_DIR / "deprish_raw.xlsx"
DEPRISH_RAW_CSV: Path = INPUTS_DIR / "deprish_raw.csv"
"""Path to the raw depreciation data."""
DEPRISH_EIA_XLSX: Path = INPUTS_DIR / "deprish_eia.xlsx"
"""Path to the EIA depreciation column maps and plant parts linkages."""
DEPRISH_COMMON_LABELS_XLSX: Path = INPUTS_DIR / "deprish_common_labels.xlsx"
"""Path to manual labels of common records in depreciation studies."""
FERC_ACCT_NAMES_CSV: Path = INPUTS_DIR / "ferc_acct_names.csv"
Expand Down
6 changes: 3 additions & 3 deletions src/pudl_rmi/connect_deprish_to_eia.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def prep_deprish(deprish, plant_parts_eia, key_deprish):
raise AssertionError(
f"Found {len(baddies)} depreciation records which don't have "
"cooresponding EIA plant-parts records. Check plant_id_eia's "
f"in {pudl_rmi.DEPRISH_RAW_XLSX}"
f"in {pudl_rmi.DEPRISH_EIA_XLSX}"
)
deprish_ids = (
deprish_ids.loc[deprish_ids._merge == "both"]
Expand Down Expand Up @@ -280,7 +280,7 @@ def match_deprish_eia(deprish, plant_parts_eia, sheet_name_output):
.pipe(add_record_id_fuzzy, plant_parts_eia=ppe, key_ppe=key_ppe)
.pipe(
add_overrides,
file_path_deprish=pudl_rmi.DEPRISH_RAW_XLSX,
file_path_deprish=pudl_rmi.DEPRISH_EIA_XLSX,
sheet_name_output=sheet_name_output,
)
)
Expand Down Expand Up @@ -418,7 +418,7 @@ def execute(
"Subset of Master Unit List": possible_matches_ppe,
}
save_to_workbook(
file_path=pudl_rmi.DEPRISH_RAW_XLSX, sheets_df_dict=sheets_df_dict
file_path=pudl_rmi.DEPRISH_EIA_XLSX, sheets_df_dict=sheets_df_dict
)
return deprish_match

Expand Down
39 changes: 16 additions & 23 deletions src/pudl_rmi/deprish.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,9 @@

how to run this module:
file_path_deprish = pathlib.Path().cwd().parent/'depreciation_rmi.xlsx'
sheet_name_deprish='Depreciation Studies Raw'
transformer = deprish.Transformer(
deprish.Extractor(
file_path=file_path_deprish,
sheet_name=sheet_name_deprish
file_path=file_path_deprish
).execute())
deprish_df = transformer.execute()
"""
Expand Down Expand Up @@ -101,19 +99,16 @@ def execute(start_date=None, end_date=None):

class Extractor:
"""
Extractor for turning excel based depreciation data into a dataframe.

Note: this should be overhualed if/when we switch from storing the
depreciation studies into a CSV. Also, if/when we integrate this into pudl,
we need to think more seriously about where to store the excel sheet/CSV.
Is it in pudl.package_data or do we store it through the datastore? If it
felt stable would it be worthwhile to store via zendo?.. in which case we
will want to use a datastore object to handle the path.
Extractor for turning CSV-based depreciation data into a dataframe.

Note: If/when we integrate this into pudl, we need to think more seriously about
where to store the CSV. Is it in pudl.package_data or do we store it
through the datastore? If it felt stable, would it be worthwhile to store it via
Zenodo? In that case we will want to use a datastore object to handle the path.
"""

def __init__(
self,
sheet_name="Depreciation Studies Raw",
skiprows=0,
start_date=None,
end_date=None,
Expand All @@ -122,32 +117,30 @@ def __init__(
Initialize a deprish.Extractor.

Args:
sheet_name (str, int): String used for excel sheet name or
integer used for zero-indexed sheet location.
skiprows (int): rows to skip in zero-indexed column location,
default is 0.
start_date (int): The start date of the date range to extract.
Default is None and all records before end_date will be extracted.
end_date (int): The end date of the date range to extract.
Default is None and all records after start_date will be extracted.
"""
self.sheet_name = sheet_name
self.skiprows = skiprows
self.start_date = start_date
self.end_date = end_date

def execute(self):
"""Turn excel-based depreciation data into a dataframe."""
logger.info(
"Reading the depreciation data from " f"{pudl_rmi.DEPRISH_RAW_XLSX}"
)
df = pd.read_excel(
pudl_rmi.DEPRISH_RAW_XLSX,
"""Turn CSV-based depreciation data into a dataframe."""
logger.info("Reading the depreciation data from " f"{pudl_rmi.DEPRISH_RAW_CSV}")
df = pd.read_csv(
pudl_rmi.DEPRISH_RAW_CSV,
skiprows=self.skiprows,
sheet_name=self.sheet_name,
dtype={i: pd.Int64Dtype() for i in INT_IDS},
na_values=NA_VALUES,
)
df["report_date"] = pd.to_datetime(df["report_date"])
logger.info(df)
logger.info(df.dtypes)

if self.start_date is None:
self.start_date = min(df.report_date)
if self.end_date is None:
Expand Down Expand Up @@ -408,7 +401,7 @@ def convert_rate_cols(self, tidy_df):
# and decimal rates (i.e. .882 for 88.2%).
# numbers of decimals (e.g. 88.2% would either be represented as
# 88.2 or .882). Some % columns have boolean columns (ending in
# "type_pct") that we fleshed out to know wether the values were
# "type_pct") that we fleshed out to know whether the values were
# reported as numbers or %s.
to_num_cols = ["net_salvage_rate", "reserve_rate", "depreciation_annual_rate"]
for col in to_num_cols:
Expand Down