Skip to content

Commit

Permalink
330 consecutive imputation links (#15)
Browse files Browse the repository at this point in the history
* add function for cumulative imputation links

* added tests for forward and backward cumulative links

* adding pre-commit hooks

* changes after review
  • Loading branch information
robertswh authored May 21, 2024
1 parent 8f5c987 commit 0e4b261
Show file tree
Hide file tree
Showing 3 changed files with 143 additions and 0 deletions.
72 changes: 72 additions & 0 deletions src/cumulative_imputation_links.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import numpy as np


def _changed_from_previous(series, lag):
    """Flag rows whose value differs from the value ``lag`` rows earlier."""
    # Comparing against a shifted copy (instead of ``diff() != 0``) gives the
    # same answer for numeric data but also works for non-numeric columns such
    # as string strata labels. Rows without an earlier row are compared with a
    # missing value and are therefore flagged as changed.
    return series.ne(series.shift(lag)).fillna(True).astype(bool)


def get_cumulative_links(
    dataframe,
    forward_or_backward,
    strata,
    reference,
    target,
    period,
    imputation_link,
    time_difference=1,
):
    """
    Create cumulative imputation links for multiple consecutive periods
    without a return.

    Rows are sorted by strata, reference and period and then split into
    imputation groups: a new group starts whenever the strata, the reference
    or the "target is missing" flag differs from the row ``time_difference``
    positions earlier. Within each group the imputation links are multiplied
    cumulatively, and the result is blanked (NaN) on rows where the target
    is present.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input data. NOTE: it is modified in place - it is sorted by
        ``[strata, reference, period]`` and gains the columns
        ``missing_value``, ``imputation_group`` and
        ``"cumulative_" + imputation_link``.
    forward_or_backward : str
        either "f" or "b" for forward or backward method
    strata : str
        column name containing strata information
    reference : str
        column name containing business reference id
    target : str
        column name containing target variable
    period : str
        column name containing time period
    imputation_link : str
        column name containing imputation links
    time_difference : int
        number of rows (after sorting) between a row and the row it is
        compared with when detecting group boundaries; presumably one row
        per monthly period is expected - confirm against callers

    Returns
    -------
    pandas.DataFrame
        dataframe with imputation_group and
        cumulative_forward/backward_imputation_link column

    Raises
    ------
    ValueError
        If ``forward_or_backward`` is neither "f" nor "b". The check runs
        before ``dataframe`` is touched.
    """
    # Validate first so that a bad argument neither mutates the caller's
    # frame nor silently reuses a pre-existing cumulative column.
    if forward_or_backward not in ("f", "b"):
        raise ValueError(
            "forward_or_backward must be 'f' or 'b', got "
            + repr(forward_or_backward)
        )

    cumulative_column = "cumulative_" + imputation_link

    dataframe.sort_values([strata, reference, period], inplace=True)
    dataframe["missing_value"] = np.where(dataframe[target].isnull(), True, False)

    starts_new_group = (
        _changed_from_previous(dataframe["missing_value"], time_difference)
        | _changed_from_previous(dataframe[strata], time_difference)
        | _changed_from_previous(dataframe[reference], time_difference)
    )
    # Running count of group starts gives every row its group number.
    dataframe["imputation_group"] = starts_new_group.astype("int").cumsum()

    if forward_or_backward == "f":
        dataframe[cumulative_column] = dataframe.groupby("imputation_group")[
            imputation_link
        ].cumprod()
    else:
        # Accumulate from the last row of each group towards the first, then
        # restore the original row order.
        dataframe[cumulative_column] = (
            dataframe[::-1].groupby("imputation_group")[imputation_link].cumprod()[::-1]
        )

    # Cumulative links only apply to rows that need imputing.
    dataframe[cumulative_column] = np.where(
        ~dataframe[target].isnull(),
        np.nan,
        dataframe[cumulative_column],
    )

    return dataframe[["imputation_group", cumulative_column]]
7 changes: 7 additions & 0 deletions tests/cumulative_links.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
strata,reference,target,period,forward_imputation_link,backward_imputation_link,imputation_group,cumulative_forward_imputation_link,cumulative_backward_imputation_link
100,100000,200,202402,1,2,1,,
100,100000,,202403,2,0.6,2,2,0.6
100,100000,,202404,3,1,2,6,1
200,100001,,202402,1,4,3,1,2
200,100001,,202403,3,0.5,3,3,0.5
200,100001,300,202404,0.5,1,4,,
64 changes: 64 additions & 0 deletions tests/test_cumulative_imputation_links.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from pathlib import Path

import pytest
from helper_functions import load_and_format
from pandas.testing import assert_frame_equal

from src.cumulative_imputation_links import get_cumulative_links


@pytest.fixture(scope="class")
def cumulative_links_test_data():
    """Load ``tests/cumulative_links.csv`` once per test class.

    The path is relative to the current working directory.
    """
    csv_path = Path("tests") / "cumulative_links.csv"
    return load_and_format(csv_path)


class TestComulativeLinks:
    """Compare get_cumulative_links output with the expected CSV columns."""

    @staticmethod
    def _assert_links(test_data, direction, link_column, cumulative_column):
        """Run get_cumulative_links for one direction and compare the frames."""
        # The expected columns are removed from the input so the function has
        # to recreate them itself.
        frame_in = test_data.drop(columns=[cumulative_column, "imputation_group"])
        wanted = test_data[["imputation_group", cumulative_column]]

        produced = get_cumulative_links(
            frame_in,
            direction,
            "strata",
            "reference",
            "target",
            "period",
            link_column,
            1,
        )

        assert_frame_equal(produced, wanted)

    def test_get_cumulative_links_forward(self, cumulative_links_test_data):
        self._assert_links(
            cumulative_links_test_data,
            "f",
            "forward_imputation_link",
            "cumulative_forward_imputation_link",
        )

    def test_get_cumulative_links_backward(self, cumulative_links_test_data):
        self._assert_links(
            cumulative_links_test_data,
            "b",
            "backward_imputation_link",
            "cumulative_backward_imputation_link",
        )

0 comments on commit 0e4b261

Please sign in to comment.