Skip to content

Commit

Permalink
Update branch
Browse files Browse the repository at this point in the history
*Branch is behind main by many commits, which are
needed for the imputation wrapper
  • Loading branch information
AntonZogk committed Jun 24, 2024
2 parents 0a203c4 + 6ea8d2a commit d65cf94
Show file tree
Hide file tree
Showing 23 changed files with 275 additions and 300 deletions.
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ repos:
types: [python]
stages: [commit]
args: ["--verbose"]
exclude: ^playground/

#works
- repo: local
Expand Down
13 changes: 13 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import pandas as pd

from src.utils.hdfs_mods import hdfs_load_json as read_json

# Location of the snapshot file to load.
# TODO: read from config instead of hard-coding the folder and file name.
folder_path = "/dapsen/workspace_zone/mbs-results/"
file_name = "snapshot-202212-002-2156d36b-e61f-42f1-a0f1-61d1f8568b8e.json"
# folder_path already ends with "/", so plain concatenation yields the path.
file_path = folder_path + file_name

# read_json is hdfs_load_json from src.utils.hdfs_mods; presumably it returns
# the parsed JSON as a dict-like object -- TODO confirm against that module.
snapshot = read_json(file_path)

# Build one dataframe per top-level section of the snapshot.
contributors = pd.DataFrame(snapshot["contributors"])
responses = pd.DataFrame(snapshot["responses"])
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ black
isort
nbstripout
nbqa
#research_and_development==1.0.0
pre_commit_hooks
flake8
pandas==1.1.5
Expand Down
3 changes: 3 additions & 0 deletions src/apply_imputation_link.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ def create_and_merge_imputation_values(
auxiliary,
construction_link,
imputation_types=("c", "fir", "bir", "fic"),
**kwargs
):
"""
Loop through different imputation types and merge the results according
Expand Down Expand Up @@ -44,6 +45,8 @@ def create_and_merge_imputation_values(
tuple. If 'fic' is selected 'c' must also be selected and precede 'fic'.
For 'fic' to produce the correct result, the C marker must be in the first
period for a given reference.
kwargs : mapping, optional
A dictionary of keyword arguments passed into func.
Returns
-------
Expand Down
8 changes: 8 additions & 0 deletions src/calculate_imputation_link.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ def calculate_imputation_link(
target_variable: str,
predictive_variable: str,
link_col: str,
**kwargs
link_col: str,
) -> pd.DataFrame:
"""
Calculate link between target_variable and predictive_variable by strata,
Expand All @@ -32,11 +34,16 @@ def calculate_imputation_link(
Column name of the predicted target variable.
link_col : str
Name to use for the new column containing imputation link
kwargs : mapping, optional
A dictionary of keyword arguments passed into func.
link_col : str
Name to use for the new column containing imputation link
Returns
-------
df : pd.DataFrame
A pandas DataFrame with a new column containing imputation link.
A pandas DataFrame with a new column containing imputation link.
"""

df_intermediate = df.copy()
Expand All @@ -59,6 +66,7 @@ def calculate_imputation_link(

denominator.replace(0, np.nan, inplace=True) # cover division with 0

df[link_col] = numerator / denominator
df[link_col] = numerator / denominator

return df
5 changes: 3 additions & 2 deletions src/construction_matches.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pandas as pd


def flag_construction_matches(dataframe, target, period, auxiliary):
def flag_construction_matches(dataframe, target, period, auxiliary, **kwargs):
"""
Add flag to indicate whether the record has non-null target, period and
auxiliary variables, and is therefore valid to use when calculating
Expand All @@ -16,7 +16,8 @@ def flag_construction_matches(dataframe, target, period, auxiliary):
name of column containing time period
auxiliary : string
name of column containing auxiliary information
kwargs : mapping, optional
A dictionary of keyword arguments passed into func.
Returns
-------
pandas.DataFrame
Expand Down
3 changes: 3 additions & 0 deletions src/cumulative_imputation_links.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ def get_cumulative_links(
period,
imputation_link,
time_difference=1,
**kwargs
):
"""
Create cumulative imputation links for multiple consecutive periods
Expand All @@ -33,6 +34,8 @@ def get_cumulative_links(
column name containing imputation links
time_difference : int
time difference between predictive and target period in months
kwargs : mapping, optional
A dictionary of keyword arguments passed into func
Returns
-------
Expand Down
150 changes: 45 additions & 105 deletions src/flag_and_count_matched_pairs.py
Original file line number Diff line number Diff line change
@@ -1,72 +1,16 @@
import numpy as np
import pandas as pd


def flag_matched_pair_merge(
    df, forward_or_backward, target, period, reference, strata, time_difference=1
):
    """
    Flag rows that form a matched pair using a self-merge.

    A row is a matched pair when the target is non-null both in the row's own
    period and in its predictive period, for the same reference and strata.

    Parameters
    ----------
    df : pd.DataFrame
        pandas dataframe of original data
    forward_or_backward: str
        either f or b for forward or backward method; also used as the
        prefix of the two columns that are added
    target : str
        column name containing target variable
    period : str
        column name containing time period
    reference : str
        column name containing business reference id
    strata : str
        column name containing strata information (sic)
    time_difference : int
        time difference between predictive and target period in months

    Returns
    -------
    pd.DataFrame
        new dataframe with two columns added:
        ``<forward_or_backward>_predictive_<target>`` holding the target
        value from the predictive period, and
        ``<forward_or_backward>_matched_pair_<target>`` flagging matched
        pairs. The input dataframe is not modified.
    """

    # Only "b" reverses the direction; any other value keeps the offset as is.
    month_offset = (
        -time_difference if forward_or_backward == "b" else time_difference
    )

    predictive_col_name = forward_or_backward + "_predictive_" + target
    matched_col_name = forward_or_backward + "_matched_pair_" + target

    # Lookup table: each record's target, keyed by the period it is the
    # predictive value for (its own period moved by the offset).
    shifted_period = df[period] + pd.DateOffset(months=month_offset)
    lookup = (
        df[[reference, strata, target]]
        .assign(predictive_period=shifted_period)
        .rename(columns={target: predictive_col_name})
    )

    merged = df.merge(
        lookup,
        how="left",
        left_on=[reference, period, strata],
        right_on=[reference, "predictive_period", strata],
    )

    # Matched pair: both the current and the predictive value are present.
    has_both = merged[[target, predictive_col_name]].notnull().all(axis=1)
    merged[matched_col_name] = has_both.to_numpy()

    # The merge key from the lookup table is only a helper column.
    return merged.drop(columns=["predictive_period"])


def flag_matched_pair_shift(
df, forward_or_backward, target, period, reference, strata, shift=1
import numpy as np # noqa F401
import pandas as pd # noqa F401


def flag_matched_pair(
df,
forward_or_backward,
target,
period,
reference,
strata,
time_difference=1,
**kwargs
):
"""
function to flag matched pairs using the shift method
Expand All @@ -75,7 +19,7 @@ def flag_matched_pair_shift(
----------
df : pd.DataFrame
pandas dataframe of original data
shift : int
forward_or_backward : str
either f or b for forward or backward method
target : str
column name containing target variable
Expand All @@ -85,67 +29,63 @@ def flag_matched_pair_shift(
column name containing business reference id
strata : str
column name containing strata information (sic)
time_difference: int
lookup distance for matched pairs
kwargs : mapping, optional
A dictionary of keyword arguments passed into func.
Returns
-------
_type_
pandas dataframe with column added flagging forward matched pairs and
two pandas dataframes: the main dataframe with column added flagging
forward matched pairs and
predictive target variable data column
"""

if forward_or_backward == "f":
shift = shift
elif forward_or_backward == "b":
shift = -shift

df = df.sort_values(by=[reference, period])
predictive_col_name = forward_or_backward + "_predictive_" + target
df[[predictive_col_name, "predictive_period"]] = df.groupby(
[reference, strata]
).shift(shift)[[target, period]]

df["validate_date"] = np.where(
df[period].dt.month - df["predictive_period"].dt.month == shift, True, False
)
matched_col_name = forward_or_backward + "_matched_pair_" + target
if forward_or_backward == "b":
time_difference = -time_difference

df[matched_col_name] = np.where(
df[[target, predictive_col_name]].isnull().any(axis=1) | (~df["validate_date"]),
False,
True,
df[forward_or_backward + "_match"] = (
df.groupby([strata, reference])
.shift(time_difference)[target]
.notnull()
.mul(df[target].notnull())
.mul(
df[period] - pd.DateOffset(months=time_difference)
== df.shift(time_difference)[period]
)
)

df.drop(["validate_date", "predictive_period"], axis=1, inplace=True)
df.reset_index(drop=True, inplace=True)

return df


def count_matches(df, flag_column_name, period, strata, count_column_name=None):
def count_matches(df, flag, period, strata, **kwargs):
"""
Function to count the number of records with matches per period and stratum
function to flag matched pairs using the shift method
Parameters
----------
df : pd.DataFrame
pandas dataframe of original data
flag_column_name : str
name of column containing flags if a match exists
pandas dataframe of original data with imputation flags
flag : str/list
the imputation flag column/s. Single string if one column, list of
strings for multiple columns.
period : str
column name containing time period
strata : str
column name containing strata information (sic)
count_col_name : str, None
name to give to count column. If `None`, name will be derived based on
flag column name
kwargs : mapping, optional
A dictionary of keyword arguments passed into func.
Returns
-------
pd.DataFrame
dataframe with column added for count of records with matches
_type_
pandas dataframe: match counts for each flag column.
"""
if count_column_name is None:
count_column_name = flag_column_name.split("_")[0] + "_matched_pair_count"
df[count_column_name] = df.groupby([strata, period])[flag_column_name].transform(
"sum"
)
return df

return df.groupby([strata, period])[flag].agg("sum").reset_index()
3 changes: 3 additions & 0 deletions src/imputation_flags.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ def create_impute_flags(
strata: str,
auxiliary: str,
predictive_auxiliary: str,
**kwargs
):

"""
Expand Down Expand Up @@ -36,6 +37,8 @@ def create_impute_flags(
predictive_auxiliary: str
Column name containing predictive auxiliary data, this is created,
by flag_matched_pair_merge function.
kwargs : mapping, optional
A dictionary of keyword arguments passed into func.
Returns
-------
Expand Down
55 changes: 55 additions & 0 deletions src/predictive_variable.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import pandas as pd


def shift_by_strata_period(
    df: pd.DataFrame,
    target: str,
    period: str,
    strata: str,
    reference: str,
    time_difference: int,
    new_col: str,
    **kwargs
) -> pd.DataFrame:
    """
    Shift the target column within runs of consecutive monthly periods.

    Rows are sorted by reference, strata and period and then split into runs.
    A new run starts whenever the previous row is not exactly one month
    earlier, or has a different strata or reference. The target is shifted
    by `time_difference` rows inside each run, so values never leak across
    a gap in the periods or across references/strata.

    Parameters
    ----------
    df : pd.DataFrame
        Pandas dataframe of original data. It is sorted in place and the new
        column is added to it.
    target : str
        Column name containing target variable to be shifted.
    period : str
        Column name containing time period (datetime-like values).
    strata : str
        Column name containing strata information (sic). Any dtype that
        supports equality comparison is accepted, not only numeric ones.
    reference : str
        Column name containing business reference id. Any dtype that
        supports equality comparison is accepted, not only numeric ones.
    time_difference : int
        Number of periods to shift. Can be positive or negative.
    new_col : str
        Column name containing the shifted values.
    kwargs : mapping, optional
        Extra keyword arguments; accepted and ignored by this function.

    Returns
    -------
    df : pd.DataFrame
        The same dataframe object that was passed in, sorted by reference,
        strata and period, with a new column containing the shifted values.
    """

    df.sort_values([reference, strata, period], inplace=True)

    # Compare each row with the previous one. Using != against the shifted
    # column (rather than diff) also works for string identifiers; the first
    # row compares against a missing value and therefore always opens a run.
    starts_new_run = (
        (df[period] - pd.DateOffset(months=1) != df[period].shift(1))
        | (df[strata] != df[strata].shift(1))
        | (df[reference] != df[reference].shift(1))
    )

    # The running count of run starts gives every run its own group id.
    run_id = starts_new_run.cumsum()

    df[new_col] = df.groupby(run_id)[target].shift(time_difference)

    return df
Loading

0 comments on commit d65cf94

Please sign in to comment.