Skip to content

Commit

Permalink
Merge pull request #82 from ONSdigital/bugfix-join-manual-construction
Browse files Browse the repository at this point in the history
quick and easy fix for period and reference being in the index
  • Loading branch information
giuliag92 authored Sep 5, 2024
2 parents 4a0af88 + 170f8d1 commit b117311
Show file tree
Hide file tree
Showing 22 changed files with 346 additions and 323 deletions.
49 changes: 35 additions & 14 deletions mbs_results/data_cleaning.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ def join_manual_constructions(
manual_constructions: pd.DataFrame,
reference: str,
period: str,
question_no: str = "question_no",
**config
):
"""
Expand All @@ -206,6 +207,8 @@ def join_manual_constructions(
the name of the reference column
period: str
the name of the period column
question_no: str
the name of the question number column
**config: Dict
main pipeline configuration. Can be used to input the entire config dictionary
Expand All @@ -214,24 +217,42 @@ def join_manual_constructions(
pd.DataFrame
dataframe with correctly formatted column datatypes.
"""
if not is_same_dtype(df, manual_constructions, period):
manual_constructions[period] = convert_column_to_datetime(
manual_constructions[period]
)

if not is_same_dtype(df, manual_constructions, reference):
manual_constructions[reference] = manual_constructions[reference].astype(
df[reference].dtype
)
question_no_from_df = df[question_no].unique().tolist()
manual_constructions_filter = manual_constructions.loc[
manual_constructions[question_no].isin(question_no_from_df)
]

if manual_constructions_filter.empty:
# return original df as nothing present to use
# as manual construction
return df
else:
manual_constructions_filter.drop(columns=[question_no], inplace=True)
if period not in df.columns or reference not in df.columns:
df = df.reset_index()

if not is_same_dtype(df, manual_constructions_filter, period):
manual_constructions_filter[period] = convert_column_to_datetime(
manual_constructions_filter[period]
)

manual_constructions.set_index([reference, period], inplace=True)
df.set_index([reference, period], inplace=True)
if not is_same_dtype(df, manual_constructions_filter, reference):
manual_constructions_filter[reference] = manual_constructions_filter[
reference
].astype(df[reference].dtype)

validate_manual_constructions(df, manual_constructions)
manual_constructions_filter.set_index([reference, period], inplace=True)
df.set_index([reference, period], inplace=True)

return df.merge(
manual_constructions, on=[reference, period], how="left", suffixes=("", "_man")
).reset_index()
validate_manual_constructions(df, manual_constructions_filter)

return df.merge(
manual_constructions_filter,
on=[reference, period],
how="left",
suffixes=("", "_man"),
).reset_index()


def is_same_dtype(df: pd.DataFrame, df2: pd.DataFrame, col_name: str) -> bool:
Expand Down
26 changes: 13 additions & 13 deletions tests/data/ratio_of_means/rom_test_data_case_mc_10_input.csv
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
identifier,date,group,question,other,question_man
30001,202001,100,8444,51,
30001,202002,100,7476,51,
30001,202003,100,2003,51,
30002,202001,100,9343,72,
30002,202002,100,7818,72,
30002,202003,100,4897,72,
30003,202001,100,7511,7,
30003,202002,100,1761,7,
30003,202003,100,6492,7,
30004,202001,100,64,81,4321
30004,202002,100,2113,81,
30004,202003,100,,81,
identifier,date,group,question,other,question_man,question_no
30001,202001,100,8444,51,,42
30001,202002,100,7476,51,,42
30001,202003,100,2003,51,,42
30002,202001,100,9343,72,,42
30002,202002,100,7818,72,,42
30002,202003,100,4897,72,,42
30003,202001,100,7511,7,,42
30003,202002,100,1761,7,,42
30003,202003,100,6492,7,,42
30004,202001,100,64,81,4321,42
30004,202002,100,2113,81,,42
30004,202003,100,,81,,42
26 changes: 13 additions & 13 deletions tests/data/ratio_of_means/rom_test_data_case_mc_10_output.csv
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
backward,construction,count_backward,count_construction,count_forward,default_backward,default_construction,default_forward,forward,group,marker,output,date,identifier
1.3231427378965,120.1990521327014,4,4,0,False,False,True,1.0000000000000,100,R,8444.000000,202001,30001
1.2735215053763,90.8436018957346,3,4,4,False,False,False,0.7557763583314,100,R,7476.000000,202002,30001
1.0000000000000,103.0153846153846,0,3,3,True,False,False,0.7852242744063,100,R,2003.000000,202003,30001
1.3231427378965,120.1990521327014,4,4,0,False,False,True,1.0000000000000,100,R,9343.000000,202001,30002
1.2735215053763,90.8436018957346,3,4,4,False,False,False,0.7557763583314,100,R,7818.000000,202002,30002
1.0000000000000,103.0153846153846,0,3,3,True,False,False,0.7852242744063,100,R,4897.000000,202003,30002
1.3231427378965,120.1990521327014,4,4,0,False,False,True,1.0000000000000,100,R,7511.000000,202001,30003
1.2735215053763,90.8436018957346,3,4,4,False,False,False,0.7557763583314,100,R,1761.000000,202002,30003
1.0000000000000,103.0153846153846,0,3,3,True,False,False,0.7852242744063,100,R,6492.000000,202003,30003
1.3231427378965,120.1990521327014,4,4,0,False,False,True,1.0000000000000,100,R,64.000000,202001,30004
1.2735215053763,90.8436018957346,3,4,4,False,False,False,0.7557763583314,100,R,2113.000000,202002,30004
1.0000000000000,103.0153846153846,0,3,3,True,False,False,0.7852242744063,100,FIR,1659.178892,202003,30004
backward,construction,count_backward,count_construction,count_forward,default_backward,default_construction,default_forward,forward,group,marker,output,date,identifier,question_no
1.3231427378965,120.1990521327014,4,4,0,False,False,True,1.0000000000000,100,R,8444.000000,202001,30001,42
1.2735215053763,90.8436018957346,3,4,4,False,False,False,0.7557763583314,100,R,7476.000000,202002,30001,42
1.0000000000000,103.0153846153846,0,3,3,True,False,False,0.7852242744063,100,R,2003.000000,202003,30001,42
1.3231427378965,120.1990521327014,4,4,0,False,False,True,1.0000000000000,100,R,9343.000000,202001,30002,42
1.2735215053763,90.8436018957346,3,4,4,False,False,False,0.7557763583314,100,R,7818.000000,202002,30002,42
1.0000000000000,103.0153846153846,0,3,3,True,False,False,0.7852242744063,100,R,4897.000000,202003,30002,42
1.3231427378965,120.1990521327014,4,4,0,False,False,True,1.0000000000000,100,R,7511.000000,202001,30003,42
1.2735215053763,90.8436018957346,3,4,4,False,False,False,0.7557763583314,100,R,1761.000000,202002,30003,42
1.0000000000000,103.0153846153846,0,3,3,True,False,False,0.7852242744063,100,R,6492.000000,202003,30003,42
1.3231427378965,120.1990521327014,4,4,0,False,False,True,1.0000000000000,100,R,64.000000,202001,30004,42
1.2735215053763,90.8436018957346,3,4,4,False,False,False,0.7557763583314,100,R,2113.000000,202002,30004,42
1.0000000000000,103.0153846153846,0,3,3,True,False,False,0.7852242744063,100,FIR,1659.178892,202003,30004,42
26 changes: 13 additions & 13 deletions tests/data/ratio_of_means/rom_test_data_case_mc_1_input.csv
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
identifier,date,group,question,other,question_man
30001,202001,100,8444,51,
30001,202002,100,7476,51,
30001,202003,100,2003,51,
30002,202001,100,9343,72,
30002,202002,100,7818,72,
30002,202003,100,4897,72,
30003,202001,100,7511,7,
30003,202002,100,1761,7,
30003,202003,100,6492,7,
30004,202001,100,,81,4321
30004,202002,100,2113,81,
30004,202003,100,,81,3189
identifier,date,group,question,other,question_man,question_no
30001,202001,100,8444,51,,42
30001,202002,100,7476,51,,42
30001,202003,100,2003,51,,42
30002,202001,100,9343,72,,42
30002,202002,100,7818,72,,42
30002,202003,100,4897,72,,42
30003,202001,100,7511,7,,42
30003,202002,100,1761,7,,42
30003,202003,100,6492,7,,42
30004,202001,100,,81,4321,42
30004,202002,100,2113,81,,42
30004,202003,100,,81,3189,42
26 changes: 13 additions & 13 deletions tests/data/ratio_of_means/rom_test_data_case_mc_1_output.csv
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
backward,construction,count_backward,count_construction,count_forward,default_backward,default_construction,default_forward,forward,group,marker,output,date,identifier
1.4833186748754,194.6000000000000,3,3,0,False,False,True,1.0000000000000,100,R,8444.000000,202001,30001
1.2735215053763,90.8436018957346,3,4,3,False,False,False,0.6741639655309,100,R,7476.000000,202002,30001
1.0000000000000,103.0153846153846,0,3,3,True,False,False,0.7852242744063,100,R,2003.000000,202003,30001
1.4833186748754,194.6000000000000,3,3,0,False,False,True,1.0000000000000,100,R,9343.000000,202001,30002
1.2735215053763,90.8436018957346,3,4,3,False,False,False,0.6741639655309,100,R,7818.000000,202002,30002
1.0000000000000,103.0153846153846,0,3,3,True,False,False,0.7852242744063,100,R,4897.000000,202003,30002
1.4833186748754,194.6000000000000,3,3,0,False,False,True,1.0000000000000,100,R,7511.000000,202001,30003
1.2735215053763,90.8436018957346,3,4,3,False,False,False,0.6741639655309,100,R,1761.000000,202002,30003
1.0000000000000,103.0153846153846,0,3,3,True,False,False,0.7852242744063,100,R,6492.000000,202003,30003
1.4833186748754,194.6000000000000,3,3,0,False,False,True,1.0000000000000,100,MC,4321.000000,202001,30004
1.2735215053763,90.8436018957346,3,4,3,False,False,False,0.6741639655309,100,R,2113.000000,202002,30004
1.0000000000000,103.0153846153846,0,3,3,True,False,False,0.7852242744063,100,MC,3189.000000,202003,30004
backward,construction,count_backward,count_construction,count_forward,default_backward,default_construction,default_forward,forward,group,marker,output,date,identifier,question_no
1.4833186748754,194.6000000000000,3,3,0,False,False,True,1.0000000000000,100,R,8444.000000,202001,30001,42
1.2735215053763,90.8436018957346,3,4,3,False,False,False,0.6741639655309,100,R,7476.000000,202002,30001,42
1.0000000000000,103.0153846153846,0,3,3,True,False,False,0.7852242744063,100,R,2003.000000,202003,30001,42
1.4833186748754,194.6000000000000,3,3,0,False,False,True,1.0000000000000,100,R,9343.000000,202001,30002,42
1.2735215053763,90.8436018957346,3,4,3,False,False,False,0.6741639655309,100,R,7818.000000,202002,30002,42
1.0000000000000,103.0153846153846,0,3,3,True,False,False,0.7852242744063,100,R,4897.000000,202003,30002,42
1.4833186748754,194.6000000000000,3,3,0,False,False,True,1.0000000000000,100,R,7511.000000,202001,30003,42
1.2735215053763,90.8436018957346,3,4,3,False,False,False,0.6741639655309,100,R,1761.000000,202002,30003,42
1.0000000000000,103.0153846153846,0,3,3,True,False,False,0.7852242744063,100,R,6492.000000,202003,30003,42
1.4833186748754,194.6000000000000,3,3,0,False,False,True,1.0000000000000,100,MC,4321.000000,202001,30004,42
1.2735215053763,90.8436018957346,3,4,3,False,False,False,0.6741639655309,100,R,2113.000000,202002,30004,42
1.0000000000000,103.0153846153846,0,3,3,True,False,False,0.7852242744063,100,MC,3189.000000,202003,30004,42
34 changes: 17 additions & 17 deletions tests/data/ratio_of_means/rom_test_data_case_mc_2_input.csv
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
identifier,date,group,question,other,question_man
40001,202001,100,9491,35,
40001,202002,100,4783,35,
40001,202003,100,7902,35,
40001,202004,100,4911,35,
40002,202001,100,2095,63,
40002,202002,100,442,63,
40002,202003,100,3136,63,
40002,202004,100,2115,63,
40003,202001,100,7863,16,
40003,202002,100,8121,16,
40003,202003,100,2151,16,
40003,202004,100,1377,16,
40004,202001,100,5131,78,
40004,202002,100,9836,78,
40004,202003,100,,78,7525
40004,202004,100,,78,
identifier,date,group,question,other,question_man,question_no
40001,202001,100,9491,35,,42
40001,202002,100,4783,35,,42
40001,202003,100,7902,35,,42
40001,202004,100,4911,35,,42
40002,202001,100,2095,63,,42
40002,202002,100,442,63,,42
40002,202003,100,3136,63,,42
40002,202004,100,2115,63,,42
40003,202001,100,7863,16,,42
40003,202002,100,8121,16,,42
40003,202003,100,2151,16,,42
40003,202004,100,1377,16,,42
40004,202001,100,5131,78,,42
40004,202002,100,9836,78,,42
40004,202003,100,,78,7525,42
40004,202004,100,,78,,42
34 changes: 17 additions & 17 deletions tests/data/ratio_of_means/rom_test_data_case_mc_2_output.csv
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
backward,construction,count_backward,count_construction,count_forward,default_backward,default_construction,default_forward,forward,group,marker,output,date,identifier
1.0603054093693,128.0208333333333,4,4,0,False,False,True,1.0000000000000,100,R,9491.000000,202001,40001
1.0119038592767,120.7395833333333,3,4,4,False,False,False,0.9431244914565,100,R,4783.000000,202002,40001
1.5695584910151,115.6929824561404,3,3,3,False,False,False,0.9882361756331,100,R,7902.000000,202003,40001
1.0000000000000,73.7105263157895,0,3,3,True,False,False,0.6371218439609,100,R,4911.000000,202004,40001
1.0603054093693,128.0208333333333,4,4,0,False,False,True,1.0000000000000,100,R,2095.000000,202001,40002
1.0119038592767,120.7395833333333,3,4,4,False,False,False,0.9431244914565,100,R,442.000000,202002,40002
1.5695584910151,115.6929824561404,3,3,3,False,False,False,0.9882361756331,100,R,3136.000000,202003,40002
1.0000000000000,73.7105263157895,0,3,3,True,False,False,0.6371218439609,100,R,2115.000000,202004,40002
1.0603054093693,128.0208333333333,4,4,0,False,False,True,1.0000000000000,100,R,7863.000000,202001,40003
1.0119038592767,120.7395833333333,3,4,4,False,False,False,0.9431244914565,100,R,8121.000000,202002,40003
1.5695584910151,115.6929824561404,3,3,3,False,False,False,0.9882361756331,100,R,2151.000000,202003,40003
1.0000000000000,73.7105263157895,0,3,3,True,False,False,0.6371218439609,100,R,1377.000000,202004,40003
1.0603054093693,128.0208333333333,4,4,0,False,False,True,1.0000000000000,100,R,5131.000000,202001,40004
1.0119038592767,120.7395833333333,3,4,4,False,False,False,0.9431244914565,100,R,9836.000000,202002,40004
1.5695584910151,115.6929824561404,3,3,3,False,False,False,0.9882361756331,100,MC,7525.000000,202003,40004
1.0000000000000,73.7105263157895,0,3,3,True,False,False,0.6371218439609,100,FIMC,4794.341876,202004,40004
backward,construction,count_backward,count_construction,count_forward,default_backward,default_construction,default_forward,forward,group,marker,output,date,identifier,question_no
1.0603054093693,128.0208333333333,4,4,0,False,False,True,1.0000000000000,100,R,9491.000000,202001,40001,42
1.0119038592767,120.7395833333333,3,4,4,False,False,False,0.9431244914565,100,R,4783.000000,202002,40001,42
1.5695584910151,115.6929824561404,3,3,3,False,False,False,0.9882361756331,100,R,7902.000000,202003,40001,42
1.0000000000000,73.7105263157895,0,3,3,True,False,False,0.6371218439609,100,R,4911.000000,202004,40001,42
1.0603054093693,128.0208333333333,4,4,0,False,False,True,1.0000000000000,100,R,2095.000000,202001,40002,42
1.0119038592767,120.7395833333333,3,4,4,False,False,False,0.9431244914565,100,R,442.000000,202002,40002,42
1.5695584910151,115.6929824561404,3,3,3,False,False,False,0.9882361756331,100,R,3136.000000,202003,40002,42
1.0000000000000,73.7105263157895,0,3,3,True,False,False,0.6371218439609,100,R,2115.000000,202004,40002,42
1.0603054093693,128.0208333333333,4,4,0,False,False,True,1.0000000000000,100,R,7863.000000,202001,40003,42
1.0119038592767,120.7395833333333,3,4,4,False,False,False,0.9431244914565,100,R,8121.000000,202002,40003,42
1.5695584910151,115.6929824561404,3,3,3,False,False,False,0.9882361756331,100,R,2151.000000,202003,40003,42
1.0000000000000,73.7105263157895,0,3,3,True,False,False,0.6371218439609,100,R,1377.000000,202004,40003,42
1.0603054093693,128.0208333333333,4,4,0,False,False,True,1.0000000000000,100,R,5131.000000,202001,40004,42
1.0119038592767,120.7395833333333,3,4,4,False,False,False,0.9431244914565,100,R,9836.000000,202002,40004,42
1.5695584910151,115.6929824561404,3,3,3,False,False,False,0.9882361756331,100,MC,7525.000000,202003,40004,42
1.0000000000000,73.7105263157895,0,3,3,True,False,False,0.6371218439609,100,FIMC,4794.341876,202004,40004,42
42 changes: 21 additions & 21 deletions tests/data/ratio_of_means/rom_test_data_case_mc_3_input.csv
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
identifier,date,group,question,other,question_man
50001,202001,100,6362,59,
50001,202002,100,8542,59,
50001,202003,100,5623,59,
50001,202004,100,7769,59,
50001,202005,100,4687,59,
50002,202001,100,4851,36,
50002,202002,100,8894,36,
50002,202003,100,3372,36,
50002,202004,100,3522,36,
50002,202005,100,2327,36,
50003,202001,100,2238,76,
50003,202002,100,769,76,
50003,202003,100,7722,76,
50003,202004,100,6445,76,
50003,202005,100,1521,76,
50004,202001,100,688,30,
50004,202002,100,3245,30,
50004,202003,100,,30,1487
50004,202004,100,,30,
50004,202005,100,,30,
identifier,date,group,question,other,question_man,question_no
50001,202001,100,6362,59,,42
50001,202002,100,8542,59,,42
50001,202003,100,5623,59,,42
50001,202004,100,7769,59,,42
50001,202005,100,4687,59,,42
50002,202001,100,4851,36,,42
50002,202002,100,8894,36,,42
50002,202003,100,3372,36,,42
50002,202004,100,3522,36,,42
50002,202005,100,2327,36,,42
50003,202001,100,2238,76,,42
50003,202002,100,769,76,,42
50003,202003,100,7722,76,,42
50003,202004,100,6445,76,,42
50003,202005,100,1521,76,,42
50004,202001,100,688,30,,42
50004,202002,100,3245,30,,42
50004,202003,100,,30,1487,42
50004,202004,100,,30,,42
50004,202005,100,,30,,42
Loading

0 comments on commit b117311

Please sign in to comment.