From cc0f73632359d7f3944ffa747f08f43e5fcecd0d Mon Sep 17 00:00:00 2001 From: katiebuntic Date: Wed, 23 Aug 2023 15:26:36 +0100 Subject: [PATCH 1/9] initial changes to main.py --- src/rred_reports/redcap/main.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/rred_reports/redcap/main.py b/src/rred_reports/redcap/main.py index bfe6e219..24ba372f 100644 --- a/src/rred_reports/redcap/main.py +++ b/src/rred_reports/redcap/main.py @@ -86,7 +86,8 @@ def preprocess_wide_data(cls, raw_data: pd.DataFrame, labelled_data: pd.DataFram processed_extract = labelled_data.copy(deep=True) # Unify on using the raw_data column names, labelled uses the questions given on the survey as column names processed_extract.columns = raw_data.columns - cls._fill_school_id_with_coalesce(raw_data, processed_extract) + cls._fill_school_id_with_coalesce(raw_data, processed_extract, "school_id") + cls._fill_school_id_with_coalesce(processed_extract, processed_extract, "rrcp_school") cls._fill_region_with_coalesce(processed_extract) cls._convert_timestamps_to_dates(processed_extract) # Making a copy, so we have a de-fragmented frame for adding row number, was getting a performance warning @@ -97,9 +98,9 @@ def preprocess_wide_data(cls, raw_data: pd.DataFrame, labelled_data: pd.DataFram return cls._rename_wide_cols_with_student_number_suffix(filtered) @staticmethod - def _fill_school_id_with_coalesce(raw_data, processed_extract): + def _fill_school_id_with_coalesce(raw_data, processed_extract, column_name): school_id_cols = [col for col in raw_data if col.startswith("entry_school_")] - processed_extract["school_id"] = raw_data[school_id_cols].bfill(axis=1).iloc[:, 0] + processed_extract[column_name] = raw_data[school_id_cols].bfill(axis=1).iloc[:, 0] @staticmethod def _fill_region_with_coalesce(extract: pd.DataFrame): From 54eb15ae545aef4d784af0be2450260739cb9e92 Mon Sep 17 00:00:00 2001 From: katiebuntic Date: Wed, 23 Aug 2023 15:40:33 +0100 Subject: [PATCH 2/9] adding to wide columns --- src/rred_reports/redcap/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/rred_reports/redcap/main.py b/src/rred_reports/redcap/main.py index 24ba372f..b3d8ef0e 100644 --- a/src/rred_reports/redcap/main.py +++ b/src/rred_reports/redcap/main.py @@ -138,6 +138,7 @@ def _rename_wide_cols_with_student_number_suffix(extract: pd.DataFrame) -> pd.Da _parsing_cols = { "non_wide_columns": ["reg_rr_title", "rrcp_country", "rrcp_area", "school_id"], "wide_columns": [ + "rrcp_school_name", "assessi_engtest2", "assessi_iretest1", "assessi_iretype1", From a684b6a6718e4ef2c900c0e2ca025ba562a33592 Mon Sep 17 00:00:00 2001 From: katiebuntic Date: Wed, 23 Aug 2023 15:47:38 +0100 Subject: [PATCH 3/9] changing the name of rrcp_name --- src/rred_reports/redcap/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/rred_reports/redcap/main.py b/src/rred_reports/redcap/main.py index b3d8ef0e..b5ebbf6d 100644 --- a/src/rred_reports/redcap/main.py +++ b/src/rred_reports/redcap/main.py @@ -87,7 +87,7 @@ def preprocess_wide_data(cls, raw_data: pd.DataFrame, labelled_data: pd.DataFram # Unify on using the raw_data column names, labelled uses the questions given on the survey as column names processed_extract.columns = raw_data.columns cls._fill_school_id_with_coalesce(raw_data, processed_extract, "school_id") - cls._fill_school_id_with_coalesce(processed_extract, processed_extract, "rrcp_school") + cls._fill_school_id_with_coalesce(processed_extract, processed_extract, "redcap_school_name") cls._fill_region_with_coalesce(processed_extract) cls._convert_timestamps_to_dates(processed_extract) # Making a copy, so we have a de-fragmented frame for adding row number, was getting a performance warning @@ -138,7 +138,7 @@ def _rename_wide_cols_with_student_number_suffix(extract: pd.DataFrame) -> pd.Da _parsing_cols = { "non_wide_columns": ["reg_rr_title", "rrcp_country", "rrcp_area", "school_id"], "wide_columns": [ - "rrcp_school_name", + "redcap_school_name", "assessi_engtest2", "assessi_iretest1", "assessi_iretype1", From 79b308cffafd50a0cb87d5b13525c853449782df Mon Sep 17 00:00:00 2001 From: katiebuntic Date: Wed, 23 Aug 2023 15:57:19 +0100 Subject: [PATCH 4/9] solving stubname issues --- src/rred_reports/redcap/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rred_reports/redcap/main.py b/src/rred_reports/redcap/main.py index b5ebbf6d..0a507330 100644 --- a/src/rred_reports/redcap/main.py +++ b/src/rred_reports/redcap/main.py @@ -138,7 +138,7 @@ def _rename_wide_cols_with_student_number_suffix(extract: pd.DataFrame) -> pd.Da _parsing_cols = { "non_wide_columns": ["reg_rr_title", "rrcp_country", "rrcp_area", "school_id"], "wide_columns": [ - "redcap_school_name", + "rrcp_school", "assessi_engtest2", "assessi_iretest1", "assessi_iretype1", From d2d9873fdeb88d9345c8c66587172f633d2af3ba Mon Sep 17 00:00:00 2001 From: katiebuntic Date: Wed, 23 Aug 2023 17:03:34 +0100 Subject: [PATCH 5/9] changes to the test_redcap --- src/rred_reports/redcap/main.py | 5 ++--- tests/test_redcap.py | 8 +++++++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/rred_reports/redcap/main.py b/src/rred_reports/redcap/main.py index 0a507330..93bcdbce 100644 --- a/src/rred_reports/redcap/main.py +++ b/src/rred_reports/redcap/main.py @@ -87,7 +87,7 @@ def preprocess_wide_data(cls, raw_data: pd.DataFrame, labelled_data: pd.DataFram # Unify on using the raw_data column names, labelled uses the questions given on the survey as column names processed_extract.columns = raw_data.columns cls._fill_school_id_with_coalesce(raw_data, processed_extract, "school_id") - cls._fill_school_id_with_coalesce(processed_extract, processed_extract, "redcap_school_name") + cls._fill_school_id_with_coalesce(processed_extract, processed_extract, "rrcp_school") cls._fill_region_with_coalesce(processed_extract) cls._convert_timestamps_to_dates(processed_extract) # Making a copy, so we have a de-fragmented frame for adding row number, was getting a performance warning @@ -136,9 +136,8 @@ def _rename_wide_cols_with_student_number_suffix(extract: pd.DataFrame) -> pd.Da # Hardcoded columns for exporting, could finesse this but probably isn't worth the time # The final columns output are under unit testing so will catch any changes to input or output data _parsing_cols = { - "non_wide_columns": ["reg_rr_title", "rrcp_country", "rrcp_area", "school_id"], + "non_wide_columns": ["reg_rr_title", "rrcp_country", "rrcp_area", "rrcp_school", "school_id"], "wide_columns": [ - "rrcp_school", "assessi_engtest2", "assessi_iretest1", "assessi_iretype1", diff --git a/tests/test_redcap.py b/tests/test_redcap.py index 3a648cfe..885e24be 100644 --- a/tests/test_redcap.py +++ b/tests/test_redcap.py @@ -52,8 +52,11 @@ def test_read_redcap_extract_rows_and_cols(redcap_extract): 6 rows should exist, and the output columns should match what is in our masterfile definition """ + expected_columns = set(masterfile_columns()) + actual_columns = set(redcap_extract.columns) + assert redcap_extract.shape[0] == 6 - assert list(redcap_extract.columns.values) == masterfile_columns() + assert actual_columns == expected_columns def test_redcap_calculated_columns(redcap_extract): @@ -69,3 +72,6 @@ def test_redcap_calculated_columns(redcap_extract): not_summer_dob_and_not_ongoing = redcap_extract.loc[redcap_extract.pupil_no == "2_2021-2022"] assert (not_summer_dob_and_not_ongoing["summer"] == "No").all() assert (not_summer_dob_and_not_ongoing["exit_outcome"] == "Discontinued").all() + + +# list(redcap_extract.columns.values) == masterfile_columns() From 1da650162397db60a13e16131e6b00a90d439e75 Mon Sep 17 00:00:00 2001 From: katiebuntic Date: Thu, 24 Aug 2023 15:33:11 +0100 Subject: [PATCH 6/9] changes to rrcp_school name and adding to masterfile_columns --- src/rred_reports/masterfile.py | 2 +- src/rred_reports/redcap/main.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/rred_reports/masterfile.py b/src/rred_reports/masterfile.py index 37b1a2b2..50ef24ce 100644 --- a/src/rred_reports/masterfile.py +++ b/src/rred_reports/masterfile.py @@ -167,7 +167,7 @@ def masterfile_columns() -> list[str]: assert _school_id == school_id, "Sanity check for school ID columns being the same failed, these were not the same" - return [pupil_no, user_id, *other_teacher_fields, *other_school_fields, school_id, *other_pupil_fields] + return [pupil_no, user_id, *other_teacher_fields, *other_school_fields, school_id, *other_pupil_fields, "redcap_school_name"] def read_and_process_masterfile(data_path: Path) -> pd.DataFrame: diff --git a/src/rred_reports/redcap/main.py b/src/rred_reports/redcap/main.py index 93bcdbce..a19b1f5f 100644 --- a/src/rred_reports/redcap/main.py +++ b/src/rred_reports/redcap/main.py @@ -60,7 +60,8 @@ def read_single_redcap_year(self, redcap_fields: ExtractInput) -> pd.DataFrame: processed_wide = self.preprocess_wide_data(raw_data, labelled_data) long = self.wide_to_long(processed_wide, redcap_fields.survey_period) long_with_names = self._add_school_name_column(long) - return long_with_names[masterfile_columns()].copy() + masterfile_and_debug_columns = [*masterfile_columns(), "redcap_school_name"] + return long_with_names[masterfile_and_debug_columns].copy() @classmethod def preprocess_wide_data(cls, raw_data: pd.DataFrame, labelled_data: pd.DataFrame) -> pd.DataFrame: @@ -87,7 +88,7 @@ def preprocess_wide_data(cls, raw_data: pd.DataFrame, labelled_data: pd.DataFram # Unify on using the raw_data column names, labelled uses the questions given on the survey as column names processed_extract.columns = raw_data.columns cls._fill_school_id_with_coalesce(raw_data, processed_extract, "school_id") - cls._fill_school_id_with_coalesce(processed_extract, processed_extract, "rrcp_school") + cls._fill_school_id_with_coalesce(processed_extract, processed_extract, "redcap_school_name") cls._fill_region_with_coalesce(processed_extract) cls._convert_timestamps_to_dates(processed_extract) # Making a copy, so we have a de-fragmented frame for adding row number, was getting a performance warning @@ -136,7 +137,7 @@ def _rename_wide_cols_with_student_number_suffix(extract: pd.DataFrame) -> pd.Da # Hardcoded columns for exporting, could finesse this but probably isn't worth the time # The final columns output are under unit testing so will catch any changes to input or output data _parsing_cols = { - "non_wide_columns": ["reg_rr_title", "rrcp_country", "rrcp_area", "rrcp_school", "school_id"], + "non_wide_columns": ["reg_rr_title", "rrcp_country", "rrcp_area", "redcap_school_name", "school_id"], "wide_columns": [ "assessi_engtest2", "assessi_iretest1", From 4bd1283ff0f990e68483480f4f9dbdce7474fe83 Mon Sep 17 00:00:00 2001 From: Katie Buntic <96536608+katiebuntic@users.noreply.github.com> Date: Fri, 25 Aug 2023 09:58:00 +0100 Subject: [PATCH 7/9] Update src/rred_reports/redcap/main.py Co-authored-by: Stef Piatek --- src/rred_reports/redcap/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rred_reports/redcap/main.py b/src/rred_reports/redcap/main.py index a19b1f5f..3984c2cf 100644 --- a/src/rred_reports/redcap/main.py +++ b/src/rred_reports/redcap/main.py @@ -99,7 +99,7 @@ def preprocess_wide_data(cls, raw_data: pd.DataFrame, labelled_data: pd.DataFram return cls._rename_wide_cols_with_student_number_suffix(filtered) @staticmethod - def _fill_school_id_with_coalesce(raw_data, processed_extract, column_name): + def _fill_school_column_with_coalesce(school_data: pd.DataFrame, processed_extract: pd.DataFrame, column_name: str): school_id_cols = [col for col in raw_data if col.startswith("entry_school_")] processed_extract[column_name] = raw_data[school_id_cols].bfill(axis=1).iloc[:, 0] From 261ff98536051f603dfff9e835c2b51dcd06de3b Mon Sep 17 00:00:00 2001 From: katiebuntic Date: Fri, 25 Aug 2023 10:47:07 +0100 Subject: [PATCH 8/9] reverted test back to og and changes to read_single_redcap_year to avoid duplicates --- src/rred_reports/redcap/main.py | 10 +++++----- tests/test_redcap.py | 8 +------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/src/rred_reports/redcap/main.py b/src/rred_reports/redcap/main.py index 3984c2cf..85490f96 100644 --- a/src/rred_reports/redcap/main.py +++ b/src/rred_reports/redcap/main.py @@ -60,7 +60,7 @@ def read_single_redcap_year(self, redcap_fields: ExtractInput) -> pd.DataFrame: processed_wide = self.preprocess_wide_data(raw_data, labelled_data) long = self.wide_to_long(processed_wide, redcap_fields.survey_period) long_with_names = self._add_school_name_column(long) - masterfile_and_debug_columns = [*masterfile_columns(), "redcap_school_name"] + masterfile_and_debug_columns = [*masterfile_columns()] return long_with_names[masterfile_and_debug_columns].copy() @classmethod @@ -87,8 +87,8 @@ def preprocess_wide_data(cls, raw_data: pd.DataFrame, labelled_data: pd.DataFram processed_extract = labelled_data.copy(deep=True) # Unify on using the raw_data column names, labelled uses the questions given on the survey as column names processed_extract.columns = raw_data.columns - cls._fill_school_id_with_coalesce(raw_data, processed_extract, "school_id") - cls._fill_school_id_with_coalesce(processed_extract, processed_extract, "redcap_school_name") + cls._fill_school_column_with_coalesce(raw_data, processed_extract, "school_id") + cls._fill_school_column_with_coalesce(processed_extract, processed_extract, "redcap_school_name") cls._fill_region_with_coalesce(processed_extract) cls._convert_timestamps_to_dates(processed_extract) # Making a copy, so we have a de-fragmented frame for adding row number, was getting a performance warning @@ -100,8 +100,8 @@ def preprocess_wide_data(cls, raw_data: pd.DataFrame, labelled_data: pd.DataFram @staticmethod def _fill_school_column_with_coalesce(school_data: pd.DataFrame, processed_extract: pd.DataFrame, column_name: str): - school_id_cols = [col for col in raw_data if col.startswith("entry_school_")] - processed_extract[column_name] = raw_data[school_id_cols].bfill(axis=1).iloc[:, 0] + school_id_cols = [col for col in school_data if col.startswith("entry_school_")] + processed_extract[column_name] = school_data[school_id_cols].bfill(axis=1).iloc[:, 0] @staticmethod def _fill_region_with_coalesce(extract: pd.DataFrame): diff --git a/tests/test_redcap.py b/tests/test_redcap.py index 885e24be..3a648cfe 100644 --- a/tests/test_redcap.py +++ b/tests/test_redcap.py @@ -52,11 +52,8 @@ def test_read_redcap_extract_rows_and_cols(redcap_extract): 6 rows should exist, and the output columns should match what is in our masterfile definition """ - expected_columns = set(masterfile_columns()) - actual_columns = set(redcap_extract.columns) - assert redcap_extract.shape[0] == 6 - assert actual_columns == expected_columns + assert list(redcap_extract.columns.values) == masterfile_columns() def test_redcap_calculated_columns(redcap_extract): @@ -72,6 +69,3 @@ def test_redcap_calculated_columns(redcap_extract): not_summer_dob_and_not_ongoing = redcap_extract.loc[redcap_extract.pupil_no == "2_2021-2022"] assert (not_summer_dob_and_not_ongoing["summer"] == "No").all() assert (not_summer_dob_and_not_ongoing["exit_outcome"] == "Discontinued").all() - - -# list(redcap_extract.columns.values) == masterfile_columns() From 61314c4197e9517bc631a6845a4770e9e2833cc8 Mon Sep 17 00:00:00 2001 From: Katie Buntic <96536608+katiebuntic@users.noreply.github.com> Date: Wed, 30 Aug 2023 10:26:05 +0100 Subject: [PATCH 9/9] Update src/rred_reports/redcap/main.py Co-authored-by: Stef Piatek --- src/rred_reports/redcap/main.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/rred_reports/redcap/main.py b/src/rred_reports/redcap/main.py index 85490f96..110ffee9 100644 --- a/src/rred_reports/redcap/main.py +++ b/src/rred_reports/redcap/main.py @@ -60,8 +60,7 @@ def read_single_redcap_year(self, redcap_fields: ExtractInput) -> pd.DataFrame: processed_wide = self.preprocess_wide_data(raw_data, labelled_data) long = self.wide_to_long(processed_wide, redcap_fields.survey_period) long_with_names = self._add_school_name_column(long) - masterfile_and_debug_columns = [*masterfile_columns()] - return long_with_names[masterfile_and_debug_columns].copy() + return long_with_names[masterfile_columns()].copy() @classmethod def preprocess_wide_data(cls, raw_data: pd.DataFrame, labelled_data: pd.DataFrame) -> pd.DataFrame: