Update branch

* Branch is behind main by many commits, which are needed for the imputation wrapper
ONSdigital · Jun 24, 2024 · d65cf94 · d65cf94
2 parents 0a203c4 + 6ea8d2a
commit d65cf94
Show file tree

Hide file tree

Showing 23 changed files with 275 additions and 300 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -73,6 +73,7 @@ repos:
         types: [python]
         stages: [commit]
         args: ["--verbose"]
+        exclude: ^playground/
 
 #works
 -   repo: local

diff --git a/main.py b/main.py
@@ -0,0 +1,16 @@
+import pandas as pd
+
+from src.utils.hdfs_mods import hdfs_load_json as read_json
+
+# TODO: read from config
+folder_path = "/dapsen/workspace_zone/mbs-results/"
+file_name = "snapshot-202212-002-2156d36b-e61f-42f1-a0f1-61d1f8568b8e.json"
+file_path = f"{folder_path}{file_name}"
+
+# snapshot is indexed by key below, so read_json must return a mapping.
+snapshot = read_json(file_path)
+
+# Build one DataFrame per top-level section of the snapshot.
+contributors, responses = (
+    pd.DataFrame(snapshot[key]) for key in ("contributors", "responses")
+)
diff --git a/requirements.txt b/requirements.txt
@@ -9,6 +9,7 @@ black
 isort
 nbstripout
 nbqa
+#research_and_development==1.0.0
 pre_commit_hooks
 flake8
 pandas==1.1.5

diff --git a/src/apply_imputation_link.py b/src/apply_imputation_link.py
@@ -11,6 +11,7 @@ def create_and_merge_imputation_values(
     auxiliary,
     construction_link,
     imputation_types=("c", "fir", "bir", "fic"),
+    **kwargs
 ):
     """
     Loop through different imputation types and merge the results according
@@ -44,6 +45,8 @@ def create_and_merge_imputation_values(
         tuple. If 'fic' is selected 'c' must also be selected and proceed 'fic'.
         For 'fic' to produce the correct result, the C marker must be in the first
         period for a given reference.
+    kwargs : mapping, optional
+        A dictionary of keyword arguments passed into func.
 
     Returns
     -------

diff --git a/src/calculate_imputation_link.py b/src/calculate_imputation_link.py
@@ -10,6 +10,8 @@ def calculate_imputation_link(
     target_variable: str,
     predictive_variable: str,
     link_col: str,
+    **kwargs
+    link_col: str,
 ) -> pd.DataFrame:
     """
     Calculate link between target_variable and predictive_variable by strata,
@@ -32,11 +34,16 @@ def calculate_imputation_link(
         Column name of the predicted target variable.
     link_col : str
         Name to use for the new column containing imputation link
+    kwargs : mapping, optional
+        A dictionary of keyword arguments passed into func.
+    link_col : str
+        Name to use for the new column containing imputation link
 
     Returns
     -------
     df : pd.DataFrame
         A pandas DataFrame with a new column containing imputation link.
+        A pandas DataFrame with a new column containing imputation link.
     """
 
     df_intermediate = df.copy()
@@ -59,6 +66,7 @@ def calculate_imputation_link(
 
     denominator.replace(0, np.nan, inplace=True)  # cover division with 0
 
+    df[link_col] = numerator / denominator
     df[link_col] = numerator / denominator
 
     return df
diff --git a/src/construction_matches.py b/src/construction_matches.py
@@ -1,7 +1,7 @@
 import pandas as pd
 
 
-def flag_construction_matches(dataframe, target, period, auxiliary):
+def flag_construction_matches(dataframe, target, period, auxiliary, **kwargs):
     """
     Add flag to indicate whether the record has non-null target, period and
     auxiliary variables, and is therefore valid to use when calculating
@@ -16,7 +16,8 @@ def flag_construction_matches(dataframe, target, period, auxiliary):
         name of column containing time period
     auxiliary : string
         name of column containing auxiliary information
-
+    kwargs : mapping, optional
+        A dictionary of keyword arguments passed into func.
     Returns
     -------
     pandas.DataFrame

diff --git a/src/cumulative_imputation_links.py b/src/cumulative_imputation_links.py
@@ -10,6 +10,7 @@ def get_cumulative_links(
     period,
     imputation_link,
     time_difference=1,
+    **kwargs
 ):
     """
     Create cumulative imputation links for multiple consecutive periods
@@ -33,6 +34,8 @@ def get_cumulative_links(
         column name containing imputation links
     time_difference : int
         time difference between predictive and target period in months
+    kwargs : mapping, optional
+        A dictionary of keyword arguments passed into func
 
     Returns
     -------

diff --git a/src/flag_and_count_matched_pairs.py b/src/flag_and_count_matched_pairs.py
@@ -1,72 +1,16 @@
-import numpy as np
-import pandas as pd
-
-
-def flag_matched_pair_merge(
-    df, forward_or_backward, target, period, reference, strata, time_difference=1
-):
-    """
-    function to add flag to df if data forms a matched pair
-    i.e. data is given for both period and predictive period
-    Parameters
-    ----------
-    df : pd.DataFrame
-        pandas dataframe of original data
-    forward_or_backward: str
-        either f or b for forward or backward method
-    target : str
-        column name containing target variable
-    period : str
-        column name containing time period
-    reference : str
-        column name containing business reference id
-    strata : str
-        column name containing strata information (sic)
-    time_difference : int
-        time difference between predictive and target period in months
-
-
-    Returns
-    -------
-    pd.DataFrame
-        dataframe with column added flagging forward matched paris and
-        predictive target variable data column
-    """
-
-    if forward_or_backward == "f":
-        time_difference = time_difference
-    elif forward_or_backward == "b":
-        time_difference = -time_difference
-
-    # Creating new DF, shifting period for forward or backward
-    df_with_predictive_column = df.copy()[[reference, strata, target]]
-    df_with_predictive_column["predictive_period"] = df[period] + pd.DateOffset(
-        months=time_difference
-    )
-    predictive_col_name = forward_or_backward + "_predictive_" + target
-    df_with_predictive_column.rename(
-        columns={target: predictive_col_name}, inplace=True
-    )
-
-    df = df.merge(
-        df_with_predictive_column,
-        left_on=[reference, period, strata],
-        right_on=[reference, "predictive_period", strata],
-        how="left",
-    )
-
-    matched_col_name = forward_or_backward + "_matched_pair_" + target
-
-    df[matched_col_name] = np.where(
-        df[[target, predictive_col_name]].isnull().any(axis=1), False, True
-    )
-
-    df.drop(["predictive_period"], axis=1, inplace=True)
-    return df
-
-
-def flag_matched_pair_shift(
-    df, forward_or_backward, target, period, reference, strata, shift=1
+import numpy as np  # noqa F401
+import pandas as pd  # noqa F401
+
+
+def flag_matched_pair(
+    df,
+    forward_or_backward,
+    target,
+    period,
+    reference,
+    strata,
+    time_difference=1,
+    **kwargs
 ):
     """
     function to flag matched pairs using the shift method
@@ -75,7 +19,7 @@ def flag_matched_pair_shift(
     ----------
     df : pd.DataFrame
         pandas dataframe of original data
-    shift : int
+    forward_or_backward : str
         number of rows to shift up or down
     target : str
         column name containing target variable
@@ -85,67 +29,63 @@ def flag_matched_pair_shift(
         column name containing business reference id
     strata : str
         column name containing strata information (sic)
+    time_difference: int
+        lookup distance for matched pairs
+    kwargs : mapping, optional
+        A dictionary of keyword arguments passed into func.
 
     Returns
     -------
     _type_
-        pandas dataframe with column added flagging forward matched pairs and
+        two pandas dataframes: the main dataframe with column added flagging
+        forward matched pairs and
         predictive target variable data column
     """
 
-    if forward_or_backward == "f":
-        shift = shift
-    elif forward_or_backward == "b":
-        shift = -shift
-
     df = df.sort_values(by=[reference, period])
-    predictive_col_name = forward_or_backward + "_predictive_" + target
-    df[[predictive_col_name, "predictive_period"]] = df.groupby(
-        [reference, strata]
-    ).shift(shift)[[target, period]]
 
-    df["validate_date"] = np.where(
-        df[period].dt.month - df["predictive_period"].dt.month == shift, True, False
-    )
-    matched_col_name = forward_or_backward + "_matched_pair_" + target
+    if forward_or_backward == "b":
+        time_difference = -time_difference
 
-    df[matched_col_name] = np.where(
-        df[[target, predictive_col_name]].isnull().any(axis=1) | (~df["validate_date"]),
-        False,
-        True,
+    df[forward_or_backward + "_match"] = (
+        df.groupby([strata, reference])
+        .shift(time_difference)[target]
+        .notnull()
+        .mul(df[target].notnull())
+        .mul(
+            df[period] - pd.DateOffset(months=time_difference)
+            == df.shift(time_difference)[period]
+        )
     )
 
-    df.drop(["validate_date", "predictive_period"], axis=1, inplace=True)
+    df.reset_index(drop=True, inplace=True)
 
     return df
 
 
-def count_matches(df, flag_column_name, period, strata, count_column_name=None):
+def count_matches(df, flag, period, strata, **kwargs):
     """
-    Function to count the number of records with matches per period and stratum
+    Function to sum the match flag column/s for each stratum and period
+
 
     Parameters
     ----------
     df : pd.DataFrame
-        pandas dataframe of original data
-    flag_column_name : str
-        name of column containing flags if a match exists
+        pandas dataframe of original data with imputation flags
+    flag : str/list
+        the imputation flag column/s. Single string if one column, list of
+        strings for multiple columns.
     period : str
         column name containing time period
     strata : str
         column name containing strata information (sic)
-    count_col_name : str, None
-        name to give to count column. If `None`, name will be derived based on
-        flag column name
+    kwargs : mapping, optional
+        Extra keyword arguments; accepted but not used by this function.
 
     Returns
     -------
-    pd.DataFrame
-        dataframe with column added for count of records with matches
+    pd.DataFrame
+        one row per strata and period with the summed flag column/s.
     """
-    if count_column_name is None:
-        count_column_name = flag_column_name.split("_")[0] + "_matched_pair_count"
-    df[count_column_name] = df.groupby([strata, period])[flag_column_name].transform(
-        "sum"
-    )
-    return df
+
+    return df.groupby([strata, period])[flag].agg("sum").reset_index()
diff --git a/src/imputation_flags.py b/src/imputation_flags.py
@@ -9,6 +9,7 @@ def create_impute_flags(
     strata: str,
     auxiliary: str,
     predictive_auxiliary: str,
+    **kwargs
 ):
 
     """
@@ -36,6 +37,8 @@ def create_impute_flags(
     predictive_auxiliary: str
         Column name containing predictive auxiliary data, this is created,
         by flag_matched_pair_merge function.
+    kwargs : mapping, optional
+        A dictionary of keyword arguments passed into func.
 
     Returns
     -------

diff --git a/src/predictive_variable.py b/src/predictive_variable.py
@@ -0,0 +1,66 @@
+import pandas as pd
+
+
+def shift_by_strata_period(
+    df: pd.DataFrame,
+    target: str,
+    period: str,
+    strata: str,
+    reference: str,
+    time_difference: int,
+    new_col: str,
+    **kwargs
+) -> pd.DataFrame:
+    """
+    Shift target by time_difference rows within runs of consecutive months.
+
+    The dataframe is sorted by reference, strata and period. A new run starts
+    whenever the reference changes, the strata changes, or the period is not
+    exactly one month after the period of the previous row. The shift is
+    applied inside each run, so values are never carried across references,
+    strata or gaps in the periods.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Pandas dataframe of original data. It is sorted in place and the new
+        column is added to it.
+    target : str
+        Column name containing target variable to be shifted.
+    period : str
+        Column name containing time period. The column must support
+        subtracting a pd.DateOffset (e.g. datetime64 values).
+    strata : str
+        Column name containing strata information (sic).
+    reference : str
+        Column name containing business reference id.
+    time_difference : int
+        Number of periods to shift. Can be positive or negative.
+    new_col : str
+        Column name containing the shifted values.
+    kwargs : mapping, optional
+        Extra keyword arguments; accepted but not used by this function.
+
+    Returns
+    -------
+    df : pd.DataFrame
+        The input dataframe, sorted, with a new column containing the
+        shifted values.
+    """
+
+    # Sorting in place is kept so callers relying on the mutation still work.
+    df.sort_values([reference, strata, period], inplace=True)
+
+    # True on the first row of every run of consecutive months.
+    starts_new_run = (
+        (df[period] - pd.DateOffset(months=1) != df[period].shift(1))
+        # Compare with the previous row rather than calling diff(): diff()
+        # subtracts, which fails for non-numeric (e.g. string) columns.
+        | (df[strata] != df[strata].shift(1))
+        | (df[reference] != df[reference].shift(1))
+    )
+
+    run_id = starts_new_run.cumsum()
+    df[new_col] = df.groupby(run_id)[target].shift(time_difference)
+
+    return df