Skip to content

Commit

Permalink
Take only report_years to calculate ferc-ferc distance penalty
Browse files Browse the repository at this point in the history
  • Loading branch information
zschira committed Nov 9, 2023
1 parent 688b577 commit 1878348
Showing 1 changed file with 14 additions and 16 deletions.
30 changes: 14 additions & 16 deletions src/pudl/analysis/record_linkage/classify_plants_ferc1.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,16 @@
class DistancePenalizeSameYear(BaseEstimator, TransformerMixin):
"""Custom estimator to compute distances used to identify clusters of plants."""

def __init__(self, plants_steam_df, metric="euclidean", penalty=100):
def __init__(self, report_years: np.array, metric="euclidean", penalty=100):
"""Initialize estimator with configurable parameters.
Args:
plants_steam_df: Plants steam DataFrame used to get report years.
report_years: report_year column used to penalize distance between records
from the same year.
metric: Distance metric to use in computation.
penalty: Penalty to apply to records with the same report year.
"""
self.plants_steam_df = plants_steam_df
self.report_years = report_years
self.metric = metric
self.penalty = penalty

Expand All @@ -48,18 +49,13 @@ def fit(self, X, y=None, **fit_params): # noqa: N803
def transform(self, X, y=None, **fit_params): # noqa: N803
"""Compute distance between records then add penalty to records from same year."""
dist_matrix = pairwise_distances(X, metric=self.metric)
report_years = range(
self.plants_steam_df.report_year.min(),
self.plants_steam_df.report_year.max() + 1,
)
penalty_matrix = np.full(dist_matrix.shape, 0)
for yr in report_years:
# get the indices of all the record pairs that have matching report years
yr_idx = self.plants_steam_df[self.plants_steam_df.report_year == yr].index
yr_match_pairs_idx = np.array(np.meshgrid(yr_idx, yr_idx)).T.reshape(-1, 2)
idx_x = yr_match_pairs_idx[:, 0]
idx_y = yr_match_pairs_idx[:, 1]
penalty_matrix[idx_x, idx_y] = self.penalty

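# Create penalty matrix: the pairwise distance between report years is 0 for
# same-year pairs, so after adding the penalty those entries equal exactly
# the penalty, while entries for pairs from different years exceed it and
# are reset to 0.
# Probably not the most elegant way to handle this.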
penalty_matrix = pairwise_distances(self.report_years.reshape(-1, 1))
penalty_matrix += self.penalty
penalty_matrix[penalty_matrix > self.penalty] = 0

# distance from node to itself should still be 0
np.fill_diagonal(penalty_matrix, 0)
dist_matrix += penalty_matrix
Expand Down Expand Up @@ -222,7 +218,9 @@ def make_ferc1_clf(
),
(
"precompute_dist",
DistancePenalizeSameYear(plants_df, metric="euclidean"),
DistancePenalizeSameYear(
np.array(plants_df.report_year), metric="euclidean"
),
),
(
"classifier",
Expand Down

0 comments on commit 1878348

Please sign in to comment.