Skip to content

Commit

Permalink
Add function and unit test
Browse files Browse the repository at this point in the history
  • Loading branch information
sarahcollyer committed Jun 25, 2024
1 parent f2fa736 commit 3b85894
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 43 deletions.
33 changes: 33 additions & 0 deletions src/flag_for_winsorisation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
def winsorisation_flag(df, a_weight, g_weight):
    """
    Add a column flagging rows whose a weight * g weight is at most 1.

    The a weight and g weight of every row are multiplied together. Rows
    where the product is less than or equal to 1 receive a flag of 1; all
    other rows, including rows where either weight is missing, receive 0.
    NOTE(review): the column name ("nw") suggests that 1 marks rows that
    are *not* to be winsorised -- confirm against the method specification.

    Function requires a_weight and g_weight columns produced
    by the estimation method.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing a weights and g weights supplied through
        estimation. It is not modified.
    a_weight : str
        Name of the column containing the a weights.
    g_weight : str
        Name of the column containing the g weights.

    Returns
    -------
    pd.DataFrame
        New DataFrame holding every column of ``df`` plus an integer
        column (nw_ag_flag) that is 1 where a_weight * g_weight <= 1 and
        0 otherwise.

    Raises
    ------
    KeyError
        If ``a_weight`` or ``g_weight`` is not a column of ``df``.
    """
    ag_product = df[a_weight] * df[g_weight]

    # A missing weight gives a NaN product; NaN <= 1 is False, so those rows
    # are flagged 0 rather than raising or propagating NaN.
    is_flagged = ag_product <= 1

    # assign() returns a new frame, so the caller's DataFrame is left
    # untouched and no temporary calculation column is needed.
    return df.assign(nw_ag_flag=is_flagged.astype("int64"))
28 changes: 15 additions & 13 deletions tests/data/winsorisation/flag_data.csv
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
strata,period,aux,sampled,a_weight,g_weight,target_variable,
101, 202401, 10, 0, 1.666666667, 1.023809524, 12,
101, 202401, 23, 1, 1.666666667, 1.023809524, 20,
101, 202401, 41, 1, 1.666666667, 1.023809524, 20,
101, 202401, 53, 1, 1.666666667, 1.023809524, 40,
101, 202401, 12, 0, 1.666666667, 1.023809524, 10,
102, 202401, 50, 1, 2.5, 1.023809524, 60,
102, 202401, 40, 1, 2.5, 1.023809524, 50,
102, 202401, 45, 0, 2.5, 1.023809524, 50,
102, 202401, 70, 0, 2.5, 1.023809524, 60,
102, 202401, 86, 0, 2.5, 1.023809524, 90,
103, 202401, 20, 0, 0.32, 0.004, 90,
103, 202401, 30, 0, 0.32, 0.004, 90,
a_weight,g_weight,nw_ag_flag
1.666666667,1.023809524,0
1.666666667,1.023809524,0
1.666666667,1.023809524,0
1.666666667,1.023809524,0
1.666666667,1.023809524,0
2.5,1.023809524,0
2.5,1.023809524,0
2.5,1.023809524,0
2.5,1.023809524,0
2.5,1.023809524,0
0.32,0.004,1
0.32,0.004,1
,0.004,0
,0.004,0
15 changes: 15 additions & 0 deletions tests/data/winsorisation/predicted_unit_data.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
strata,period,aux,sampled,a_weight,g_weight,target_variable,nw_ag_flag
101,202401,10,0,1.666666667,1.023809524,12,0
101,202401,23,1,1.666666667,1.023809524,20,0
101,202401,41,1,1.666666667,1.023809524,20,0
101,202401,53,1,1.666666667,1.023809524,40,0
101,202401,12,0,1.666666667,1.023809524,10,0
102,202401,50,1,2.5,1.023809524,60,0
102,202401,40,1,2.5,1.023809524,50,0
102,202401,45,0,2.5,1.023809524,50,0
102,202401,70,0,2.5,1.023809524,60,0
102,202401,86,0,2.5,1.023809524,90,0
103,202401,20,0,0.32,0.004,90,1
103,202401,30,0,0.32,0.004,90,1
104,202401,20,0,,0.004,90,0
104,202401,30,0,,0.004,90,0
62 changes: 32 additions & 30 deletions tests/test_flag_for_winsorisation.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,33 @@
""" What we need
dataframe
a weight
g weight
a * g to create a new column
then flag anything <=1 as not to be winsorised
"""

import pandas as pd


def winsorisation_flag(df, a_weight, g_weight):
    """
    Add the a*g product and a text flag to ``df`` in place and return it.

    Two columns are written onto the DataFrame that was passed in:
    ``new_col`` (a_weight * g_weight) and ``NW_AG_flag`` ("NW_AG" where the
    product is <= 1, otherwise an empty string).

    Parameters
    ----------
    df : pd.DataFrame
        Frame with ``a_weight`` and ``g_weight`` columns; modified in place.
    a_weight, g_weight
        Not used by the body; the columns are read from ``df`` by their
        fixed names.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, now carrying the two extra columns.
    """

    def _label(product):
        # A NaN product compares False, so it gets the empty label.
        if product <= 1:
            return "NW_AG"
        return ""

    df["new_col"] = df["a_weight"] * df["g_weight"]
    df["NW_AG_flag"] = df["new_col"].apply(_label)

    return df


# Ad-hoc check: read the flag test data from a hard-coded absolute path,
# show it, then run the flagging function over it.
data = pd.read_csv(
    filepath_or_buffer=(
        "/home/cdsw/monthly-business-survey-results/tests/data/winsorisation/flag_data.csv"
    )
)

print(data)

# The weight arguments are passed as Series taken from the loaded frame.
test = winsorisation_flag(data, data["a_weight"], data["g_weight"])
from pathlib import Path

import pytest
from pandas.testing import assert_frame_equal

from src.flag_for_winsorisation import winsorisation_flag

# Repository root, derived from this file's location (this module lives in
# <root>/tests/) so the tests do not depend on one particular checkout path.
path = str(Path(__file__).resolve().parents[1])


@pytest.fixture(scope="class")
def winsorisation_flag_test_data():
    """
    Load the expected-output data for the winsorisation flag test.

    Reads tests/data/winsorisation/flag_data.csv from the repository,
    skipping any column whose header starts with "Unnamed:" (what pandas
    names a header-less column, e.g. one caused by a trailing comma).

    Returns
    -------
    pd.DataFrame
        Contents of flag_data.csv.
    """
    return pd.read_csv(
        Path(path) / "tests" / "data" / "winsorisation" / "flag_data.csv",
        low_memory=False,
        usecols=lambda c: not c.startswith("Unnamed:"),
    )


class TestWinsorisationFlag:
    """Checks winsorisation_flag against the stored expected output."""

    def test_winsorisation_flag(self, winsorisation_flag_test_data):
        """Flagging the two weight columns must reproduce the expected frame."""
        expected = winsorisation_flag_test_data.copy()

        # Build the input by removing the expected flag column and keeping
        # only the two weight columns, in this order.
        weight_columns = ["a_weight", "g_weight"]
        without_flag = expected.drop(columns=["nw_ag_flag"])
        weights_only = without_flag[weight_columns]

        actual = winsorisation_flag(
            df=weights_only,
            a_weight="a_weight",
            g_weight="g_weight",
        )

        assert_frame_equal(actual, expected)

0 comments on commit 3b85894

Please sign in to comment.