From cc0f73632359d7f3944ffa747f08f43e5fcecd0d Mon Sep 17 00:00:00 2001
From: katiebuntic <k.buntic@ucl.ac.uk>
Date: Wed, 23 Aug 2023 15:26:36 +0100
Subject: [PATCH 1/9] initial changes to main.py

---
 src/rred_reports/redcap/main.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/rred_reports/redcap/main.py b/src/rred_reports/redcap/main.py
index bfe6e219..24ba372f 100644
--- a/src/rred_reports/redcap/main.py
+++ b/src/rred_reports/redcap/main.py
@@ -86,7 +86,8 @@ def preprocess_wide_data(cls, raw_data: pd.DataFrame, labelled_data: pd.DataFram
         processed_extract = labelled_data.copy(deep=True)
         # Unify on using the raw_data column names, labelled uses the questions given on the survey as column names
         processed_extract.columns = raw_data.columns
-        cls._fill_school_id_with_coalesce(raw_data, processed_extract)
+        cls._fill_school_id_with_coalesce(raw_data, processed_extract, "school_id")
+        cls._fill_school_id_with_coalesce(processed_extract, processed_extract, "rrcp_school")
         cls._fill_region_with_coalesce(processed_extract)
         cls._convert_timestamps_to_dates(processed_extract)
         # Making a copy, so we have a de-fragmented frame for adding row number, was getting a performance warning
@@ -97,9 +98,9 @@ def preprocess_wide_data(cls, raw_data: pd.DataFrame, labelled_data: pd.DataFram
         return cls._rename_wide_cols_with_student_number_suffix(filtered)
 
     @staticmethod
-    def _fill_school_id_with_coalesce(raw_data, processed_extract):
+    def _fill_school_id_with_coalesce(raw_data, processed_extract, column_name):
         school_id_cols = [col for col in raw_data if col.startswith("entry_school_")]
-        processed_extract["school_id"] = raw_data[school_id_cols].bfill(axis=1).iloc[:, 0]
+        processed_extract[column_name] = raw_data[school_id_cols].bfill(axis=1).iloc[:, 0]
 
     @staticmethod
     def _fill_region_with_coalesce(extract: pd.DataFrame):

From 54eb15ae545aef4d784af0be2450260739cb9e92 Mon Sep 17 00:00:00 2001
From: katiebuntic <k.buntic@ucl.ac.uk>
Date: Wed, 23 Aug 2023 15:40:33 +0100
Subject: [PATCH 2/9] adding to wide columns

---
 src/rred_reports/redcap/main.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/rred_reports/redcap/main.py b/src/rred_reports/redcap/main.py
index 24ba372f..b3d8ef0e 100644
--- a/src/rred_reports/redcap/main.py
+++ b/src/rred_reports/redcap/main.py
@@ -138,6 +138,7 @@ def _rename_wide_cols_with_student_number_suffix(extract: pd.DataFrame) -> pd.Da
     _parsing_cols = {
         "non_wide_columns": ["reg_rr_title", "rrcp_country", "rrcp_area", "school_id"],
         "wide_columns": [
+            "rrcp_school_name",
             "assessi_engtest2",
             "assessi_iretest1",
             "assessi_iretype1",

From a684b6a6718e4ef2c900c0e2ca025ba562a33592 Mon Sep 17 00:00:00 2001
From: katiebuntic <k.buntic@ucl.ac.uk>
Date: Wed, 23 Aug 2023 15:47:38 +0100
Subject: [PATCH 3/9] changing the name of rrcp_school to redcap_school_name

---
 src/rred_reports/redcap/main.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/rred_reports/redcap/main.py b/src/rred_reports/redcap/main.py
index b3d8ef0e..b5ebbf6d 100644
--- a/src/rred_reports/redcap/main.py
+++ b/src/rred_reports/redcap/main.py
@@ -87,7 +87,7 @@ def preprocess_wide_data(cls, raw_data: pd.DataFrame, labelled_data: pd.DataFram
         # Unify on using the raw_data column names, labelled uses the questions given on the survey as column names
         processed_extract.columns = raw_data.columns
         cls._fill_school_id_with_coalesce(raw_data, processed_extract, "school_id")
-        cls._fill_school_id_with_coalesce(processed_extract, processed_extract, "rrcp_school")
+        cls._fill_school_id_with_coalesce(processed_extract, processed_extract, "redcap_school_name")
         cls._fill_region_with_coalesce(processed_extract)
         cls._convert_timestamps_to_dates(processed_extract)
         # Making a copy, so we have a de-fragmented frame for adding row number, was getting a performance warning
@@ -138,7 +138,7 @@ def _rename_wide_cols_with_student_number_suffix(extract: pd.DataFrame) -> pd.Da
     _parsing_cols = {
         "non_wide_columns": ["reg_rr_title", "rrcp_country", "rrcp_area", "school_id"],
         "wide_columns": [
-            "rrcp_school_name",
+            "redcap_school_name",
             "assessi_engtest2",
             "assessi_iretest1",
             "assessi_iretype1",

From 79b308cffafd50a0cb87d5b13525c853449782df Mon Sep 17 00:00:00 2001
From: katiebuntic <k.buntic@ucl.ac.uk>
Date: Wed, 23 Aug 2023 15:57:19 +0100
Subject: [PATCH 4/9] solving stubname issues

---
 src/rred_reports/redcap/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/rred_reports/redcap/main.py b/src/rred_reports/redcap/main.py
index b5ebbf6d..0a507330 100644
--- a/src/rred_reports/redcap/main.py
+++ b/src/rred_reports/redcap/main.py
@@ -138,7 +138,7 @@ def _rename_wide_cols_with_student_number_suffix(extract: pd.DataFrame) -> pd.Da
     _parsing_cols = {
         "non_wide_columns": ["reg_rr_title", "rrcp_country", "rrcp_area", "school_id"],
         "wide_columns": [
-            "redcap_school_name",
+            "rrcp_school",
             "assessi_engtest2",
             "assessi_iretest1",
             "assessi_iretype1",

From d2d9873fdeb88d9345c8c66587172f633d2af3ba Mon Sep 17 00:00:00 2001
From: katiebuntic <k.buntic@ucl.ac.uk>
Date: Wed, 23 Aug 2023 17:03:34 +0100
Subject: [PATCH 5/9] changes to the test_redcap

---
 src/rred_reports/redcap/main.py | 5 ++---
 tests/test_redcap.py            | 8 +++++++-
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/rred_reports/redcap/main.py b/src/rred_reports/redcap/main.py
index 0a507330..93bcdbce 100644
--- a/src/rred_reports/redcap/main.py
+++ b/src/rred_reports/redcap/main.py
@@ -87,7 +87,7 @@ def preprocess_wide_data(cls, raw_data: pd.DataFrame, labelled_data: pd.DataFram
         # Unify on using the raw_data column names, labelled uses the questions given on the survey as column names
         processed_extract.columns = raw_data.columns
         cls._fill_school_id_with_coalesce(raw_data, processed_extract, "school_id")
-        cls._fill_school_id_with_coalesce(processed_extract, processed_extract, "redcap_school_name")
+        cls._fill_school_id_with_coalesce(processed_extract, processed_extract, "rrcp_school")
         cls._fill_region_with_coalesce(processed_extract)
         cls._convert_timestamps_to_dates(processed_extract)
         # Making a copy, so we have a de-fragmented frame for adding row number, was getting a performance warning
@@ -136,9 +136,8 @@ def _rename_wide_cols_with_student_number_suffix(extract: pd.DataFrame) -> pd.Da
     # Hardcoded columns for exporting, could finesse this but probably isn't worth the time
     # The final columns output are under unit testing so will catch any changes to input or output data
     _parsing_cols = {
-        "non_wide_columns": ["reg_rr_title", "rrcp_country", "rrcp_area", "school_id"],
+        "non_wide_columns": ["reg_rr_title", "rrcp_country", "rrcp_area", "rrcp_school", "school_id"],
         "wide_columns": [
-            "rrcp_school",
             "assessi_engtest2",
             "assessi_iretest1",
             "assessi_iretype1",
diff --git a/tests/test_redcap.py b/tests/test_redcap.py
index 3a648cfe..885e24be 100644
--- a/tests/test_redcap.py
+++ b/tests/test_redcap.py
@@ -52,8 +52,11 @@ def test_read_redcap_extract_rows_and_cols(redcap_extract):
     6 rows should exist, and the output columns should match what is in our masterfile definition
     """
 
+    expected_columns = set(masterfile_columns())
+    actual_columns = set(redcap_extract.columns)
+
     assert redcap_extract.shape[0] == 6
-    assert list(redcap_extract.columns.values) == masterfile_columns()
+    assert actual_columns == expected_columns
 
 
 def test_redcap_calculated_columns(redcap_extract):
@@ -69,3 +72,6 @@ def test_redcap_calculated_columns(redcap_extract):
     not_summer_dob_and_not_ongoing = redcap_extract.loc[redcap_extract.pupil_no == "2_2021-2022"]
     assert (not_summer_dob_and_not_ongoing["summer"] == "No").all()
     assert (not_summer_dob_and_not_ongoing["exit_outcome"] == "Discontinued").all()
+
+
+# list(redcap_extract.columns.values) == masterfile_columns()

From 1da650162397db60a13e16131e6b00a90d439e75 Mon Sep 17 00:00:00 2001
From: katiebuntic <k.buntic@ucl.ac.uk>
Date: Thu, 24 Aug 2023 15:33:11 +0100
Subject: [PATCH 6/9] changes to rrcp_school name and adding to
 masterfile_columns

---
 src/rred_reports/masterfile.py  | 2 +-
 src/rred_reports/redcap/main.py | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/rred_reports/masterfile.py b/src/rred_reports/masterfile.py
index 37b1a2b2..50ef24ce 100644
--- a/src/rred_reports/masterfile.py
+++ b/src/rred_reports/masterfile.py
@@ -167,7 +167,7 @@ def masterfile_columns() -> list[str]:
 
     assert _school_id == school_id, "Sanity check for school ID columns being the same failed, these were not the same"
 
-    return [pupil_no, user_id, *other_teacher_fields, *other_school_fields, school_id, *other_pupil_fields]
+    return [pupil_no, user_id, *other_teacher_fields, *other_school_fields, school_id, *other_pupil_fields, "redcap_school_name"]
 
 
 def read_and_process_masterfile(data_path: Path) -> pd.DataFrame:
diff --git a/src/rred_reports/redcap/main.py b/src/rred_reports/redcap/main.py
index 93bcdbce..a19b1f5f 100644
--- a/src/rred_reports/redcap/main.py
+++ b/src/rred_reports/redcap/main.py
@@ -60,7 +60,8 @@ def read_single_redcap_year(self, redcap_fields: ExtractInput) -> pd.DataFrame:
         processed_wide = self.preprocess_wide_data(raw_data, labelled_data)
         long = self.wide_to_long(processed_wide, redcap_fields.survey_period)
         long_with_names = self._add_school_name_column(long)
-        return long_with_names[masterfile_columns()].copy()
+        masterfile_and_debug_columns = [*masterfile_columns(), "redcap_school_name"]
+        return long_with_names[masterfile_and_debug_columns].copy()
 
     @classmethod
     def preprocess_wide_data(cls, raw_data: pd.DataFrame, labelled_data: pd.DataFrame) -> pd.DataFrame:
@@ -87,7 +88,7 @@ def preprocess_wide_data(cls, raw_data: pd.DataFrame, labelled_data: pd.DataFram
         # Unify on using the raw_data column names, labelled uses the questions given on the survey as column names
         processed_extract.columns = raw_data.columns
         cls._fill_school_id_with_coalesce(raw_data, processed_extract, "school_id")
-        cls._fill_school_id_with_coalesce(processed_extract, processed_extract, "rrcp_school")
+        cls._fill_school_id_with_coalesce(processed_extract, processed_extract, "redcap_school_name")
         cls._fill_region_with_coalesce(processed_extract)
         cls._convert_timestamps_to_dates(processed_extract)
         # Making a copy, so we have a de-fragmented frame for adding row number, was getting a performance warning
@@ -136,7 +137,7 @@ def _rename_wide_cols_with_student_number_suffix(extract: pd.DataFrame) -> pd.Da
     # Hardcoded columns for exporting, could finesse this but probably isn't worth the time
     # The final columns output are under unit testing so will catch any changes to input or output data
     _parsing_cols = {
-        "non_wide_columns": ["reg_rr_title", "rrcp_country", "rrcp_area", "rrcp_school", "school_id"],
+        "non_wide_columns": ["reg_rr_title", "rrcp_country", "rrcp_area", "redcap_school_name", "school_id"],
         "wide_columns": [
             "assessi_engtest2",
             "assessi_iretest1",

From 4bd1283ff0f990e68483480f4f9dbdce7474fe83 Mon Sep 17 00:00:00 2001
From: Katie Buntic <96536608+katiebuntic@users.noreply.github.com>
Date: Fri, 25 Aug 2023 09:58:00 +0100
Subject: [PATCH 7/9] Update src/rred_reports/redcap/main.py

Co-authored-by: Stef Piatek <s.piatek@ucl.ac.uk>
---
 src/rred_reports/redcap/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/rred_reports/redcap/main.py b/src/rred_reports/redcap/main.py
index a19b1f5f..3984c2cf 100644
--- a/src/rred_reports/redcap/main.py
+++ b/src/rred_reports/redcap/main.py
@@ -99,7 +99,7 @@ def preprocess_wide_data(cls, raw_data: pd.DataFrame, labelled_data: pd.DataFram
         return cls._rename_wide_cols_with_student_number_suffix(filtered)
 
     @staticmethod
-    def _fill_school_id_with_coalesce(raw_data, processed_extract, column_name):
+    def _fill_school_column_with_coalesce(school_data: pd.DataFrame, processed_extract: pd.DataFrame, column_name: str):
         school_id_cols = [col for col in raw_data if col.startswith("entry_school_")]
         processed_extract[column_name] = raw_data[school_id_cols].bfill(axis=1).iloc[:, 0]
 

From 261ff98536051f603dfff9e835c2b51dcd06de3b Mon Sep 17 00:00:00 2001
From: katiebuntic <k.buntic@ucl.ac.uk>
Date: Fri, 25 Aug 2023 10:47:07 +0100
Subject: [PATCH 8/9] reverted test back to original and changes to
 read_single_redcap_year to avoid duplicate columns

---
 src/rred_reports/redcap/main.py | 10 +++++-----
 tests/test_redcap.py            |  8 +-------
 2 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/src/rred_reports/redcap/main.py b/src/rred_reports/redcap/main.py
index 3984c2cf..85490f96 100644
--- a/src/rred_reports/redcap/main.py
+++ b/src/rred_reports/redcap/main.py
@@ -60,7 +60,7 @@ def read_single_redcap_year(self, redcap_fields: ExtractInput) -> pd.DataFrame:
         processed_wide = self.preprocess_wide_data(raw_data, labelled_data)
         long = self.wide_to_long(processed_wide, redcap_fields.survey_period)
         long_with_names = self._add_school_name_column(long)
-        masterfile_and_debug_columns = [*masterfile_columns(), "redcap_school_name"]
+        masterfile_and_debug_columns = [*masterfile_columns()]
         return long_with_names[masterfile_and_debug_columns].copy()
 
     @classmethod
@@ -87,8 +87,8 @@ def preprocess_wide_data(cls, raw_data: pd.DataFrame, labelled_data: pd.DataFram
         processed_extract = labelled_data.copy(deep=True)
         # Unify on using the raw_data column names, labelled uses the questions given on the survey as column names
         processed_extract.columns = raw_data.columns
-        cls._fill_school_id_with_coalesce(raw_data, processed_extract, "school_id")
-        cls._fill_school_id_with_coalesce(processed_extract, processed_extract, "redcap_school_name")
+        cls._fill_school_column_with_coalesce(raw_data, processed_extract, "school_id")
+        cls._fill_school_column_with_coalesce(processed_extract, processed_extract, "redcap_school_name")
         cls._fill_region_with_coalesce(processed_extract)
         cls._convert_timestamps_to_dates(processed_extract)
         # Making a copy, so we have a de-fragmented frame for adding row number, was getting a performance warning
@@ -100,8 +100,8 @@ def preprocess_wide_data(cls, raw_data: pd.DataFrame, labelled_data: pd.DataFram
 
     @staticmethod
     def _fill_school_column_with_coalesce(school_data: pd.DataFrame, processed_extract: pd.DataFrame, column_name: str):
-        school_id_cols = [col for col in raw_data if col.startswith("entry_school_")]
-        processed_extract[column_name] = raw_data[school_id_cols].bfill(axis=1).iloc[:, 0]
+        school_id_cols = [col for col in school_data if col.startswith("entry_school_")]
+        processed_extract[column_name] = school_data[school_id_cols].bfill(axis=1).iloc[:, 0]
 
     @staticmethod
     def _fill_region_with_coalesce(extract: pd.DataFrame):
diff --git a/tests/test_redcap.py b/tests/test_redcap.py
index 885e24be..3a648cfe 100644
--- a/tests/test_redcap.py
+++ b/tests/test_redcap.py
@@ -52,11 +52,8 @@ def test_read_redcap_extract_rows_and_cols(redcap_extract):
     6 rows should exist, and the output columns should match what is in our masterfile definition
     """
 
-    expected_columns = set(masterfile_columns())
-    actual_columns = set(redcap_extract.columns)
-
     assert redcap_extract.shape[0] == 6
-    assert actual_columns == expected_columns
+    assert list(redcap_extract.columns.values) == masterfile_columns()
 
 
 def test_redcap_calculated_columns(redcap_extract):
@@ -72,6 +69,3 @@ def test_redcap_calculated_columns(redcap_extract):
     not_summer_dob_and_not_ongoing = redcap_extract.loc[redcap_extract.pupil_no == "2_2021-2022"]
     assert (not_summer_dob_and_not_ongoing["summer"] == "No").all()
     assert (not_summer_dob_and_not_ongoing["exit_outcome"] == "Discontinued").all()
-
-
-# list(redcap_extract.columns.values) == masterfile_columns()

From 61314c4197e9517bc631a6845a4770e9e2833cc8 Mon Sep 17 00:00:00 2001
From: Katie Buntic <96536608+katiebuntic@users.noreply.github.com>
Date: Wed, 30 Aug 2023 10:26:05 +0100
Subject: [PATCH 9/9] Update src/rred_reports/redcap/main.py

Co-authored-by: Stef Piatek <s.piatek@ucl.ac.uk>
---
 src/rred_reports/redcap/main.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/rred_reports/redcap/main.py b/src/rred_reports/redcap/main.py
index 85490f96..110ffee9 100644
--- a/src/rred_reports/redcap/main.py
+++ b/src/rred_reports/redcap/main.py
@@ -60,8 +60,7 @@ def read_single_redcap_year(self, redcap_fields: ExtractInput) -> pd.DataFrame:
         processed_wide = self.preprocess_wide_data(raw_data, labelled_data)
         long = self.wide_to_long(processed_wide, redcap_fields.survey_period)
         long_with_names = self._add_school_name_column(long)
-        masterfile_and_debug_columns = [*masterfile_columns()]
-        return long_with_names[masterfile_and_debug_columns].copy()
+        return long_with_names[masterfile_columns()].copy()
 
     @classmethod
     def preprocess_wide_data(cls, raw_data: pd.DataFrame, labelled_data: pd.DataFrame) -> pd.DataFrame: