Skip to content

Commit

Permalink
418 anon data estimation (#64)
Browse files Browse the repository at this point in the history
* Rename estimation module

* Add apply_estimation function

* Fix estimation preprocessing and application

* Fix filepaths

* Fix estimation after running on anon data

* Update tests

* Update utils and config

* Optimise for memory usage

* Refactor for combined output run

* Fix get_estimation_data period parameter

* Remove get_qv_df function

* Make is_close tolerance explicit
  • Loading branch information
rowanhemsi authored Aug 1, 2024
1 parent e5ed3c2 commit 0559c73
Show file tree
Hide file tree
Showing 10 changed files with 203 additions and 93 deletions.
5 changes: 5 additions & 0 deletions config.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@
"frozenturnover" : "float",
"cellnumber" : "int"},
"temporarily_remove_cols": [],
"non_sampled_strata": [
"5141", "5142", "5143",
"5371", "5372", "5373",
"5661", "5662", "5663"
],
"current_period" : 202401,
"previous_period" : 202312
}
59 changes: 59 additions & 0 deletions mbs_results/apply_estimation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import glob

import pandas as pd

from mbs_results.calculate_estimation_weights import (
calculate_calibration_factor,
calculate_design_weight,
)
from mbs_results.pre_processing_estimation import get_estimation_data

# from mbs_results.validate_estimation import validate_estimation


def apply_estimation(
    population_path: str, sample_path: str, period: str, **config
) -> pd.DataFrame:
    """
    Read population frame and sample, merge key variables onto df then derive
    estimation weights.

    Every population file matched by `population_path` is paired with the
    sample file at the same position in `sample_path`'s (sorted) matches.
    Each pair is processed independently and the results are concatenated.

    Parameters
    ----------
    population_path : str
        glob pattern matching the population frame file(s)
    sample_path : str
        glob pattern matching the sample file(s)
    period : str
        name of column containing period
    **config
        remaining settings, forwarded unchanged to `get_estimation_data`,
        `calculate_design_weight` and `calculate_calibration_factor`

    Returns
    -------
    pd.DataFrame
        population frame with calibration group, sampled flag, design weight
        and calibration factor

    Raises
    ------
    `ValueError`
        if no population files match `population_path`, or if the number of
        population files differs from the number of sample files
    """
    # glob.glob returns matches in arbitrary order, so sort both lists to make
    # the positional pairing below deterministic.
    # NOTE(review): assumes population and sample file names sort into the
    # same period order -- confirm against the naming convention in use.
    population_files = sorted(glob.glob(population_path))
    sample_files = sorted(glob.glob(sample_path))

    if not population_files:
        raise ValueError(
            f"No population files found matching {population_path!r}"
        )

    # zip() would silently drop the unpaired files, so fail loudly instead.
    if len(population_files) != len(sample_files):
        raise ValueError(
            f"Found {len(population_files)} population file(s) but "
            f"{len(sample_files)} sample file(s); expected one sample file "
            "per population file"
        )

    estimation_df_list = []

    # Each pair is read and weighted on its own before being collected.
    for population_file, sample_file in zip(population_files, sample_files):
        estimation_data = get_estimation_data(
            population_file, sample_file, period, **config
        )

        estimation_data = calculate_design_weight(
            estimation_data, period, **config
        )
        estimation_data = calculate_calibration_factor(
            estimation_data, period, **config
        )

        estimation_df_list.append(estimation_data)

    estimation_df = pd.concat(estimation_df_list, ignore_index=True)

    # TODO: validation is currently switched off; re-enable once ready:
    # validate_estimation(estimation_df, **config)

    return estimation_df
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,19 @@


def calculate_design_weight(
dataframe: pd.DataFrame,
population_frame: pd.DataFrame,
period: str,
strata: str,
sampled: str,
**config,
) -> pd.DataFrame:
"""
Add column to dataframe containing design weights based on sampled flag
Parameters
----------
dataframe : pd.DataFrame
data to be estimated
population_frame : pd.DataFrame
data to use to calculate weights
period : str
name of column in dataframe containing period variable
strata : str
Expand All @@ -30,60 +31,63 @@ def calculate_design_weight(
-----
#TODO: Add link to specification once added to repository
"""
population_counts = dataframe.groupby([period, strata]).size()
population_counts = population_frame.groupby([period, strata]).size()

sample = dataframe[dataframe[sampled] == 1]
sample = population_frame[population_frame[sampled] == 1]
sample_counts = sample.groupby([period, strata]).size()

design_weights = population_counts / sample_counts

design_weights.name = "design_weight"
design_weights = design_weights.reset_index()

dataframe = dataframe.merge(design_weights, how="left", on=[period, strata])
population_frame = population_frame.merge(
design_weights, how="left", on=[period, strata]
)

return dataframe
return population_frame


def calculate_calibration_factor(
dataframe: pd.DataFrame,
population_frame: pd.DataFrame,
period: str,
group: str,
sampled: str,
auxiliary: str,
design_weight: str,
**config,
) -> pd.DataFrame:
"""
Add column to dataframe to calculate calibration factor
Parameters
----------
dataframe : pd.DataFrame
data to be weighted
period : str
name of column in dataframe containing period variable
Add column to dataframe to calculate calibration factor
Parameters
----------
population_frame : pd.DataFrame
data to use to calculate weights
period : str
name of column in dataframe containing period variable
group: str
name of column in dataframe containing group level data
for separate ratio, use strata variable
for combined ratio, use calibration group level variable
sampled : str
name of column in dataframe containing sample flag
auxiliary : str
name of column in dataframe containing auxiliary variable
design_weight: str
name of column in dataframe containing design weight
Returns
-------
pd.DataFrame
dataframe with new column `calibration_factor`
name of column in dataframe containing group level data
for separate ratio, use strata variable
for combined ratio, use calibration group level variable
sampled : str
name of column in dataframe containing sample flag
auxiliary : str
name of column in dataframe containing auxiliary variable
design_weight: str
name of column in dataframe containing design weight
Returns
-------
pd.DataFrame
dataframe with new column `calibration_factor`
"""

population_sums = dataframe.groupby([period, group])[auxiliary].sum()
population_sums = population_frame.groupby([period, group])[auxiliary].sum()

# copy to avoid SettingWithCopy warning
# (not required with later versions of pandas)
sample = dataframe.copy()[dataframe[sampled] == 1]
sample = population_frame.copy()[population_frame[sampled] == 1]
sample["weighted_auxiliary"] = sample[auxiliary] * sample[design_weight]
weighted_sample_sums = sample.groupby([period, group])["weighted_auxiliary"].sum()

Expand All @@ -92,6 +96,8 @@ def calculate_calibration_factor(
calibration_factor.name = "calibration_factor"
calibration_factor = calibration_factor.reset_index()

dataframe = dataframe.merge(calibration_factor, how="left", on=[period, group])
population_frame = population_frame.merge(
calibration_factor, how="left", on=[period, group]
)

return dataframe
return population_frame
61 changes: 29 additions & 32 deletions mbs_results/pre_processing_estimation.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
from glob import glob

import pandas as pd

from mbs_results.utils import read_colon_separated_file


def get_estimation_data(
population_path,
population_file,
sample_file,
period,
population_column_names,
sample_path,
sample_column_names,
population_keep_columns,
sample_keep_columns,
calibration_group_map,
period,
reference,
cell_number,
**config
Expand All @@ -21,14 +19,18 @@ def get_estimation_data(
Parameters
----------
population_path: pd.DataFrame
population_file: pd.DataFrame
file path to the folder containing the population frames
population_column_names: List
list of column names for the population frames
sample_path: pd.DataFrame
file path to the folder containing the sample data
sample_column_names: List
population_column_names: List[str]
list of column names for the population frames
sample_column_names: List[str]
list of column names for the sample data
population_keep_columns: List[str]
list of names of columns to keep from population frame
sample_keep_columns: List[str]
list of names of columns to keep from sample
calibration_group_map: pd.DataFrame
dataframe containing map between cell number and calibration group
period: Str
Expand All @@ -46,29 +48,16 @@ def get_estimation_data(
population frame containing period and sampled columns.
"""
population_files = glob("universe.*", root_dir=population_path)
sample_files = glob("finalsel.*", root_dir=sample_path)

population_dfs = []
for file in population_files:
population_df = read_colon_separated_file(population_file, population_column_names)

population_df = read_colon_separated_file(file, population_column_names)
population_df = population_df[population_keep_columns]

population_dfs.append(population_df)
sample_df = read_colon_separated_file(sample_file, sample_column_names)

population = pd.concat(population_dfs, ignore_index=True)

sample_dfs = []
for file in sample_files:

sample_df = read_colon_separated_file(file, sample_column_names)

sample_dfs.append(sample_df)

sample = pd.concat(sample_dfs, ignore_index=True)
sample_df = sample_df[sample_keep_columns]

estimation_data = derive_estimation_variables(
population, sample, calibration_group_map, period, reference, cell_number
population_df, sample_df, calibration_group_map, period, reference, cell_number
)

return estimation_data
Expand Down Expand Up @@ -109,10 +98,18 @@ def derive_estimation_variables(
population frame containing sampled column
"""
population_frame.merge(calibration_group_map, on=[cell_number], how="left")
# TODO: check if cell_no is the strata or if it should be dropped
population_frame[cell_number] = (
population_frame[cell_number]
.astype(str)
.map(lambda x: str(5) + x[1:] if x[0] == str(7) else x)
.astype(int)
)

population_frame = population_frame.merge(
calibration_group_map, on=[cell_number], how="left"
)

sample = sample[[reference, period]]
sample = sample.copy()[[reference, period]]
sample["sampled"] = 1

return population_frame.merge(sample, on=[reference, period], how="left").fillna(
Expand Down
6 changes: 4 additions & 2 deletions mbs_results/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ def convert_column_to_datetime(dates):
return pd.to_datetime(dates, format="%Y%m")


def read_colon_separated_file(filepath: str, column_names: List[str]) -> pd.DataFrame:
def read_colon_separated_file(
filepath: str, column_names: List[str], period="period"
) -> pd.DataFrame:
"""
Read data stored as text file, columns separated by colon and any amount of
white space, and return the data as a dataframe with an additional column
Expand All @@ -42,6 +44,6 @@ def read_colon_separated_file(filepath: str, column_names: List[str]) -> pd.Data
buffer = BytesIO(file.read())
df = pd.read_csv(buffer, sep=r"\s*:\s*", names=column_names, engine="python")
date_string = re.findall(r"_(\d{6})", filepath)
df["period"] = int(date_string[0])
df[period] = int(date_string[0])

return df
Loading

0 comments on commit 0559c73

Please sign in to comment.