418 anon data estimation (#64)

* Rename estimation module
* Add apply_estimation function
* Fix estimation preprocessing and application
* Fix filepaths
* Fix estimation after running on anon data
* Update tests
* Update utils and config
* Optimise for memory usage
* Refactor for combined output run
* Fix get_estimation_data period parameter
* Remove get_qv_df function
* Make is_close tolerance explicit
ONSdigital · Aug 1, 2024 · 0559c73 · 0559c73
1 parent e5ed3c2
commit 0559c73
Show file tree

Hide file tree

Showing 10 changed files with 203 additions and 93 deletions.
diff --git a/config.json b/config.json
@@ -19,6 +19,11 @@
                 "frozenturnover" : "float",
                 "cellnumber" : "int"},
     "temporarily_remove_cols": [],
+    "non_sampled_strata": [
+        "5141", "5142", "5143",
+        "5371", "5372", "5373",
+        "5661", "5662", "5663"
+    ],
     "current_period" : 202401,
     "previous_period" : 202312
 }
diff --git a/mbs_results/apply_estimation.py b/mbs_results/apply_estimation.py
@@ -0,0 +1,59 @@
+import glob
+
+import pandas as pd
+
+from mbs_results.calculate_estimation_weights import (
+    calculate_calibration_factor,
+    calculate_design_weight,
+)
+from mbs_results.pre_processing_estimation import get_estimation_data
+
+# from mbs_results.validate_estimation import validate_estimation
+
+
+def apply_estimation(population_path, sample_path, period, **config):
+    """
+    Read population frame and sample, merge key variables onto df then derive
+    estimation weights (the validation step is currently commented out).
+
+    Parameters
+    ----------
+    population_path : str
+        glob pattern for population frame files, paired by position with samples
+    sample_path : str
+        glob pattern for sample files, paired by position with population frames
+    period : str
+        name of column containing period
+
+    Returns
+    -------
+    population frame with calibration group, sampled flag, design weight and
+    calibration factor
+
+    Raises
+    ------
+    `ValueError`
+        presumably from `pd.concat` when no file pairs are found - TODO confirm
+    """
+    population_files = glob.glob(population_path)
+    sample_files = glob.glob(sample_path)
+
+    estimation_df_list = []
+
+    for population_file, sample_file in zip(population_files, sample_files):
+        estimation_data = get_estimation_data(
+            population_file, sample_file, period, **config
+        )
+
+        estimation_data = calculate_design_weight(estimation_data, period, **config)
+        estimation_data = calculate_calibration_factor(
+            estimation_data, period, **config
+        )
+
+        estimation_df_list.append(estimation_data)
+
+    estimation_df = pd.concat(estimation_df_list, ignore_index=True)
+
+    # validate_estimation(estimation_df, **config)
+
+    return estimation_df
diff --git a/mbs_results/estimation.py → mbs_results/calculate_estimation_weights.py b/mbs_results/estimation.py → mbs_results/calculate_estimation_weights.py
@@ -2,18 +2,19 @@
 
 
 def calculate_design_weight(
-    dataframe: pd.DataFrame,
+    population_frame: pd.DataFrame,
     period: str,
     strata: str,
     sampled: str,
+    **config,
 ) -> pd.DataFrame:
     """
     Add column to dataframe containing design weights based on sampled flag
 
     Parameters
     ----------
-    dataframe : pd.DataFrame
-        data to be estimated
+    population_frame : pd.DataFrame
+        data to use to calculate weights
     period : str
         name of column in dataframe containing period variable
     strata : str
@@ -30,60 +31,63 @@ def calculate_design_weight(
     -----
     #TODO: Add link to specification once added to repository
     """
-    population_counts = dataframe.groupby([period, strata]).size()
+    population_counts = population_frame.groupby([period, strata]).size()
 
-    sample = dataframe[dataframe[sampled] == 1]
+    sample = population_frame[population_frame[sampled] == 1]
     sample_counts = sample.groupby([period, strata]).size()
 
     design_weights = population_counts / sample_counts
 
     design_weights.name = "design_weight"
     design_weights = design_weights.reset_index()
 
-    dataframe = dataframe.merge(design_weights, how="left", on=[period, strata])
+    population_frame = population_frame.merge(
+        design_weights, how="left", on=[period, strata]
+    )
 
-    return dataframe
+    return population_frame
 
 
 def calculate_calibration_factor(
-    dataframe: pd.DataFrame,
+    population_frame: pd.DataFrame,
     period: str,
     group: str,
     sampled: str,
     auxiliary: str,
     design_weight: str,
+    **config,
 ) -> pd.DataFrame:
     """
-     Add column to dataframe to calculate calibration factor
-
-     Parameters
-     ----------
-     dataframe : pd.DataFrame
-         data to be weighted
-     period : str
-         name of column in dataframe containing period variable
+    Add column to dataframe to calculate calibration factor
+
+    Parameters
+    ----------
+    population_frame : pd.DataFrame
+        data to use to calculate weights
+    period : str
+        name of column in dataframe containing period variable
     group: str
-         name of column in dataframe containing group level data
-         for separate ratio, use strata variable
-         for combined ratio, use calibration group level variable
-     sampled : str
-         name of column in dataframe containing sample flag
-     auxiliary : str
-         name of column in dataframe containing auxiliary variable
-     design_weight: str
-         name of column in dataframe containing design weight
-
-     Returns
-     -------
-     pd.DataFrame
-         dataframe with new column `calibration_factor`
+        name of column in dataframe containing group level data
+        for separate ratio, use strata variable
+        for combined ratio, use calibration group level variable
+    sampled : str
+        name of column in dataframe containing sample flag
+    auxiliary : str
+        name of column in dataframe containing auxiliary variable
+    design_weight: str
+        name of column in dataframe containing design weight
+
+    Returns
+    -------
+    pd.DataFrame
+        dataframe with new column `calibration_factor`
     """
 
-    population_sums = dataframe.groupby([period, group])[auxiliary].sum()
+    population_sums = population_frame.groupby([period, group])[auxiliary].sum()
 
     # copy to avoid SettingWithCopy warning
     # (not required with later versions of pandas)
-    sample = dataframe.copy()[dataframe[sampled] == 1]
+    sample = population_frame.copy()[population_frame[sampled] == 1]
     sample["weighted_auxiliary"] = sample[auxiliary] * sample[design_weight]
     weighted_sample_sums = sample.groupby([period, group])["weighted_auxiliary"].sum()
 
@@ -92,6 +96,8 @@ def calculate_calibration_factor(
     calibration_factor.name = "calibration_factor"
     calibration_factor = calibration_factor.reset_index()
 
-    dataframe = dataframe.merge(calibration_factor, how="left", on=[period, group])
+    population_frame = population_frame.merge(
+        calibration_factor, how="left", on=[period, group]
+    )
 
-    return dataframe
+    return population_frame
diff --git a/mbs_results/pre_processing_estimation.py b/mbs_results/pre_processing_estimation.py
@@ -1,17 +1,15 @@
-from glob import glob
-
-import pandas as pd
-
 from mbs_results.utils import read_colon_separated_file
 
 
 def get_estimation_data(
-    population_path,
+    population_file,
+    sample_file,
+    period,
     population_column_names,
-    sample_path,
     sample_column_names,
+    population_keep_columns,
+    sample_keep_columns,
     calibration_group_map,
-    period,
     reference,
     cell_number,
     **config
@@ -21,14 +19,18 @@ def get_estimation_data(
 
     Parameters
     ----------
-    population_path: pd.DataFrame
+    population_file: str
         file path to the folder containing the population frames
-    population_column_names: List
-        list of column names for the population frames
     sample_path: pd.DataFrame
         file path to the folder containing the sample data
-    sample_column_names: List
+    population_column_names: List[str]
+        list of column names for the population frames
+    sample_column_names: List[str]
         list of column names for the sample data
+    population_keep_columns: List[str]
+        list of names of columns to keep from population frame
+    sample_keep_columns: List[str]
+        list of names of columns to keep from sample
     calibration_group_map: pd.DataFrame
         dataframe containing map between cell number and calibration group
     period: Str
@@ -46,29 +48,16 @@ def get_estimation_data(
         population frame containing period and sampled columns.
 
     """
-    population_files = glob("universe.*", root_dir=population_path)
-    sample_files = glob("finalsel.*", root_dir=sample_path)
-
-    population_dfs = []
-    for file in population_files:
+    population_df = read_colon_separated_file(population_file, population_column_names)
 
-        population_df = read_colon_separated_file(file, population_column_names)
+    population_df = population_df[population_keep_columns]
 
-        population_dfs.append(population_df)
+    sample_df = read_colon_separated_file(sample_file, sample_column_names)
 
-    population = pd.concat(population_dfs, ignore_index=True)
-
-    sample_dfs = []
-    for file in sample_files:
-
-        sample_df = read_colon_separated_file(file, sample_column_names)
-
-        sample_dfs.append(sample_df)
-
-    sample = pd.concat(sample_dfs, ignore_index=True)
+    sample_df = sample_df[sample_keep_columns]
 
     estimation_data = derive_estimation_variables(
-        population, sample, calibration_group_map, period, reference, cell_number
+        population_df, sample_df, calibration_group_map, period, reference, cell_number
     )
 
     return estimation_data
@@ -109,10 +98,18 @@ def derive_estimation_variables(
         population frame containing sampled column
 
     """
-    population_frame.merge(calibration_group_map, on=[cell_number], how="left")
-    # TODO: check if cell_no is the strata or if it should be dropped
+    population_frame[cell_number] = (
+        population_frame[cell_number]
+        .astype(str)
+        .map(lambda x: str(5) + x[1:] if x[0] == str(7) else x)
+        .astype(int)
+    )
+
+    population_frame = population_frame.merge(
+        calibration_group_map, on=[cell_number], how="left"
+    )
 
-    sample = sample[[reference, period]]
+    sample = sample.copy()[[reference, period]]
     sample["sampled"] = 1
 
     return population_frame.merge(sample, on=[reference, period], how="left").fillna(

diff --git a/mbs_results/utils.py b/mbs_results/utils.py
@@ -20,7 +20,9 @@ def convert_column_to_datetime(dates):
     return pd.to_datetime(dates, format="%Y%m")
 
 
-def read_colon_separated_file(filepath: str, column_names: List[str]) -> pd.DataFrame:
+def read_colon_separated_file(
+    filepath: str, column_names: List[str], period="period"
+) -> pd.DataFrame:
     """
     Read data stored as text file, columns separated by colon and any amount of
     white space, and return the data as a dataframe with an additional column
@@ -42,6 +44,6 @@ def read_colon_separated_file(filepath: str, column_names: List[str]) -> pd.Data
         buffer = BytesIO(file.read())
         df = pd.read_csv(buffer, sep=r"\s*:\s*", names=column_names, engine="python")
         date_string = re.findall(r"_(\d{6})", filepath)
-        df["period"] = int(date_string[0])
+        df[period] = int(date_string[0])
 
     return df