Refactor calculation of annualized_respondents_ferc714
Rather than loading the huge demand_hourly_pa_ferc714 dataset and deriving the
report_date values from it, we can infer them from the ferc714_settings.
Additionally, we can use a cross-product merge to broadcast the respondents across
report dates, rather than the more complex procedure used up to this point.
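
For illustration (not part of the commit): a minimal sketch of the cross-product merge described above. The respondent IDs, names, and years below are made up; only the merge(how="cross") pattern mirrors the change.

    import pandas as pd

    # Illustrative respondent table with no date information; IDs and names are invented.
    respondents = pd.DataFrame(
        {
            "respondent_id_ferc714": [101, 102],
            "respondent_name_ferc714": ["Utility A", "Utility B"],
        }
    )

    # Report dates inferred from configured processing years rather than from the
    # hourly demand table (these years stand in for ferc714_settings.years).
    years = [2021, 2020]
    report_dates = pd.DataFrame(
        {"report_date": pd.to_datetime(sorted(years), format="%Y")}
    )

    # how="cross" pairs every respondent with every report_date: 2 x 2 = 4 rows.
    annualized = respondents.merge(report_dates, how="cross")
    print(annualized)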
rousik committed Nov 7, 2023
1 parent f251def commit 14b7978
Showing 1 changed file with 10 additions and 49 deletions.
src/pudl/output/ferc714.py (59 changes: 10 additions & 49 deletions)
@@ -1,5 +1,4 @@
 """Functions & classes for compiling derived aspects of the FERC Form 714 data."""
-from datetime import datetime
 from typing import Any
 
 import geopandas as gpd
@@ -83,41 +82,6 @@
 ################################################################################
 
 
-def add_dates(rids_ferc714: pd.DataFrame, report_dates: list[datetime]) -> pd.DataFrame:
-    """Broadcast respondent data across dates.
-
-    Args:
-        rids_ferc714: A simple FERC 714 Respondent ID dataframe,
-            without any date information.
-        report_dates: Dates for which each respondent
-            should be given a record.
-
-    Raises:
-        ValueError: if a ``report_date`` column exists in ``rids_ferc714``.
-
-    Returns:
-        A Dataframe having all the same columns as the input
-        ``rids_ferc714`` with the addition of a ``report_date`` column, but with all
-        records associated with each ``respondent_id_ferc714`` duplicated on a per-date
-        basis.
-    """
-    if "report_date" in rids_ferc714.columns:
-        raise ValueError("report_date already present, can't be added again!")
-    # Create DataFrame with all report_date and respondent_id_ferc714 combos
-    logger.info(f"Got {len(report_dates)} report_dates.")
-    unique_rids = rids_ferc714.respondent_id_ferc714.unique()
-    logger.info(f"found {len(unique_rids)} unique FERC-714 respondent IDs.")
-    dates_rids_df = pd.DataFrame(
-        index=pd.MultiIndex.from_product(
-            [report_dates, unique_rids],
-            names=["report_date", "respondent_id_ferc714"],
-        )
-    ).reset_index()
-    rids_with_dates = pd.merge(rids_ferc714, dates_rids_df, on="respondent_id_ferc714")
-    logger.info(f"Generated {len(rids_with_dates)} report_date + respondent_id rows.")
-    return rids_with_dates
-
-
 def categorize_eia_code(
     eia_codes: list[int],
     ba_ids: list[int],
@@ -372,26 +336,23 @@ def filled_service_territory_eia861(
 
 @asset(compute_kind="Python")
 def annualized_respondents_ferc714(
-    demand_hourly_pa_ferc714: pd.DataFrame, respondent_id_ferc714: pd.DataFrame
+    context, respondent_id_ferc714: pd.DataFrame
 ) -> pd.DataFrame:
     """Broadcast respondent data across all years with reported demand.
 
     The FERC 714 Respondent IDs and names are reported in their own table, without any
     refence to individual years, but much of the information we are associating with
     them varies annually. This method creates an annualized version of the respondent
-    table, with each respondent having an entry corresponding to every year in which
-    hourly demand was reported in the FERC 714 dataset as a whole -- this necessarily
-    means that many of the respondents will end up having entries for years in which
-    they reported no demand, and that's fine. They can be filtered later.
+    table, with each respondent having an entry corresponding to every year for which
+    FERC 714 has been processed. This means that many of the respondents will end up
+    having entries for years in which they reported no demand, and that's fine.
+    They can be filtered later.
     """
-    # Calculate the total demand per respondent, per year:
-    report_dates = [
-        time for time in demand_hourly_pa_ferc714.report_date.unique() if pd.notna(time)
-    ]
-    annualized_respondents_ferc714 = respondent_id_ferc714.pipe(
-        add_dates, report_dates
-    ).pipe(apply_pudl_dtypes)
-    return annualized_respondents_ferc714
+    ferc714_settings = context.resources.dataset_settings.ferc714
+    report_dates = pd.DataFrame(
+        {"report_date": pd.to_datetime(sorted(ferc714_settings.years), format="%Y")}
+    )
+    return respondent_id_ferc714.merge(report_dates, how="cross")
 
 
 @asset(
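
For reference (also not part of the commit): a rough sketch suggesting that the cross merge yields the same respondent/report_date combinations that the removed add_dates() helper built via MultiIndex.from_product. All IDs and years below are made up.

    import pandas as pd

    rids = pd.DataFrame({"respondent_id_ferc714": [1, 2, 3]})
    report_dates = pd.to_datetime([2019, 2020], format="%Y")

    # Old approach (as in the removed add_dates): enumerate all combinations, then merge.
    combos = pd.DataFrame(
        index=pd.MultiIndex.from_product(
            [report_dates, rids.respondent_id_ferc714.unique()],
            names=["report_date", "respondent_id_ferc714"],
        )
    ).reset_index()
    old = pd.merge(rids, combos, on="respondent_id_ferc714")

    # New approach: cross merge against a one-column report_date frame.
    new = rids.merge(pd.DataFrame({"report_date": report_dates}), how="cross")

    # Both produce the same (respondent, report_date) pairs; only row order may differ.
    cols = ["respondent_id_ferc714", "report_date"]
    assert old[cols].sort_values(cols).reset_index(drop=True).equals(
        new[cols].sort_values(cols).reset_index(drop=True)
    )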
