diff --git a/src/rred_reports/masterfile.py b/src/rred_reports/masterfile.py index 37b1a2b2..50ef24ce 100644 --- a/src/rred_reports/masterfile.py +++ b/src/rred_reports/masterfile.py @@ -167,7 +167,7 @@ def masterfile_columns() -> list[str]: assert _school_id == school_id, "Sanity check for school ID columns being the same failed, these were not the same" - return [pupil_no, user_id, *other_teacher_fields, *other_school_fields, school_id, *other_pupil_fields] + return [pupil_no, user_id, *other_teacher_fields, *other_school_fields, school_id, *other_pupil_fields, "redcap_school_name"] def read_and_process_masterfile(data_path: Path) -> pd.DataFrame: diff --git a/src/rred_reports/redcap/main.py b/src/rred_reports/redcap/main.py index bfe6e219..110ffee9 100644 --- a/src/rred_reports/redcap/main.py +++ b/src/rred_reports/redcap/main.py @@ -86,7 +86,8 @@ def preprocess_wide_data(cls, raw_data: pd.DataFrame, labelled_data: pd.DataFram processed_extract = labelled_data.copy(deep=True) # Unify on using the raw_data column names, labelled uses the questions given on the survey as column names processed_extract.columns = raw_data.columns - cls._fill_school_id_with_coalesce(raw_data, processed_extract) + cls._fill_school_column_with_coalesce(raw_data, processed_extract, "school_id") + cls._fill_school_column_with_coalesce(processed_extract, processed_extract, "redcap_school_name") cls._fill_region_with_coalesce(processed_extract) cls._convert_timestamps_to_dates(processed_extract) # Making a copy, so we have a de-fragmented frame for adding row number, was getting a performance warning @@ -97,9 +98,9 @@ def preprocess_wide_data(cls, raw_data: pd.DataFrame, labelled_data: pd.DataFram return cls._rename_wide_cols_with_student_number_suffix(filtered) @staticmethod - def _fill_school_id_with_coalesce(raw_data, processed_extract): - school_id_cols = [col for col in raw_data if col.startswith("entry_school_")] - processed_extract["school_id"] = raw_data[school_id_cols].bfill(axis=1).iloc[:, 0] + def _fill_school_column_with_coalesce(school_data: pd.DataFrame, processed_extract: pd.DataFrame, column_name: str): + school_id_cols = [col for col in school_data if col.startswith("entry_school_")] + processed_extract[column_name] = school_data[school_id_cols].bfill(axis=1).iloc[:, 0] @staticmethod def _fill_region_with_coalesce(extract: pd.DataFrame): @@ -135,7 +136,7 @@ def _rename_wide_cols_with_student_number_suffix(extract: pd.DataFrame) -> pd.Da # Hardcoded columns for exporting, could finesse this but probably isn't worth the time # The final columns output are under unit testing so will catch any changes to input or output data _parsing_cols = { - "non_wide_columns": ["reg_rr_title", "rrcp_country", "rrcp_area", "school_id"], + "non_wide_columns": ["reg_rr_title", "rrcp_country", "rrcp_area", "redcap_school_name", "school_id"], "wide_columns": [ "assessi_engtest2", "assessi_iretest1",