From dda6445b5185666a9ccbcddeaaf189737f72e831 Mon Sep 17 00:00:00 2001 From: Stef Piatek Date: Wed, 30 Aug 2023 14:28:15 +0100 Subject: [PATCH 01/12] Add templates for school aliases --- input/school_aliases/.gitkeep | 0 input/school_aliases/template.toml | 3 +++ 2 files changed, 3 insertions(+) create mode 100644 input/school_aliases/.gitkeep create mode 100644 input/school_aliases/template.toml diff --git a/input/school_aliases/.gitkeep b/input/school_aliases/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/input/school_aliases/template.toml b/input/school_aliases/template.toml new file mode 100644 index 00000000..f14b3146 --- /dev/null +++ b/input/school_aliases/template.toml @@ -0,0 +1,3 @@ +# replace the placeholder IDs below with school IDs so that they are replaced during the extract process +old_id_1 = "new_id_1" +old_id_2 = "new_id_2" From 716eefcc652ff49520b9c5eca9bccc0d63f52176 Mon Sep 17 00:00:00 2001 From: Stef Piatek Date: Wed, 30 Aug 2023 14:29:03 +0100 Subject: [PATCH 02/12] Ignore school alias files --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 4a2930fa..46f49afc 100644 --- a/.gitignore +++ b/.gitignore @@ -146,6 +146,7 @@ src/*/_version.py input/downloaded input/processed +input/school_aliases !input/*/.gitkeep output/** !output/*/.gitkeep From 2318b8d16a0a03de59cfa32f6a0c1698de29c4ff Mon Sep 17 00:00:00 2001 From: Stef Piatek Date: Wed, 30 Aug 2023 14:37:12 +0100 Subject: [PATCH 03/12] Setup for failing tests on school aliases --- src/rred_reports/redcap/interface.py | 11 ++++--- src/rred_reports/redcap/main.py | 12 +++++++- tests/data/redcap/extract.csv | 1 + tests/data/redcap/extract_labels.csv | 1 + tests/test_redcap_interface.py | 45 ++++++++++++++++++++++++++++ 5 files changed, 65 insertions(+), 5 deletions(-) diff --git a/src/rred_reports/redcap/interface.py b/src/rred_reports/redcap/interface.py index 577352c4..adcc8e8c 100644 --- a/src/rred_reports/redcap/interface.py +++ 
b/src/rred_reports/redcap/interface.py @@ -1,4 +1,5 @@ from pathlib import Path +from typing import Optional import typer @@ -9,12 +10,13 @@ top_level_dir = Path(__file__).resolve().parents[3] - app = typer.Typer() @app.command() -def extract(year: int, config_file: Path = "src/rred_reports/redcap/redcap_config.toml", output_dir: Path = "output/") -> None: +def extract( + year: int, config_file: Path = "src/rred_reports/redcap/redcap_config.toml", output_dir: Path = "output/", school_aliases: Optional[Path] = None +) -> None: """ Extract files from redcap from wide to long and apply basic processing Process wide to long of the files listed under the year-based config toml @@ -23,6 +25,7 @@ def extract(year: int, config_file: Path = "src/rred_reports/redcap/redcap_confi year (int): Year to process config_file (Path): Path to config file output_dir (Path): Path to parent output directory + school_aliases (Optional[Path]): School alias file, where schools have changed IDs and should be merged """ typer.echo(f"Extracting data for {year} and the previous year's surveys") config = get_config(config_file)[str(year)] @@ -32,7 +35,7 @@ def extract(year: int, config_file: Path = "src/rred_reports/redcap/redcap_confi dispatch_path = top_level_dir / config["dispatch_list"] - parser = RedcapReader(dispatch_path) + parser = RedcapReader(dispatch_path, school_aliases) current_year = ExtractInput( top_level_dir / config["current_year"]["coded_data_file"], top_level_dir / config["current_year"]["label_data_file"], @@ -41,7 +44,7 @@ def extract(year: int, config_file: Path = "src/rred_reports/redcap/redcap_confi previous_year = ExtractInput( top_level_dir / config["previous_year"]["coded_data_file"], top_level_dir / config["previous_year"]["label_data_file"], - f"{year -1}-{str(year)[-2:]}", + f"{year - 1}-{str(year)[-2:]}", ) long_data = parser.read_redcap_data(current_year, previous_year) issues = log_school_id_inconsistencies(long_data, dispatch_path, year) diff --git 
a/src/rred_reports/redcap/main.py b/src/rred_reports/redcap/main.py index 110ffee9..8e2eef82 100644 --- a/src/rred_reports/redcap/main.py +++ b/src/rred_reports/redcap/main.py @@ -1,9 +1,11 @@ """Downloading and processing of redcap data""" from dataclasses import dataclass from pathlib import Path +from typing import Optional import numpy as np import pandas as pd +import tomli from loguru import logger from tqdm import tqdm @@ -23,7 +25,7 @@ class ExtractInput: class RedcapReader: """Reads two years of redcap data, processing the files (wide to long, and others) and filtering to non-empty rows""" - def __init__(self, school_list: Path): + def __init__(self, school_list: Path, school_aliases: Optional[Path] = None): """ Setup for reading from redcap @@ -31,6 +33,14 @@ def __init__(self, school_list: Path): school_list: path to Excel dispatch list file """ self._school_list = get_unique_schools(school_list) + self._school_aliases = None + if school_aliases: + try: + with school_aliases.open(mode="rb") as handle: + self._school_aliases = tomli.load(handle) + except FileNotFoundError as error: + msg = f"No school alias file found at {school_aliases}. Exiting." 
+ raise FileNotFoundError(msg) from error def read_redcap_data(self, current_year: ExtractInput, previous_year: ExtractInput) -> pd.DataFrame: """ diff --git a/tests/data/redcap/extract.csv b/tests/data/redcap/extract.csv index be43cb8b..93bfe9d0 100644 --- a/tests/data/redcap/extract.csv +++ b/tests/data/redcap/extract.csv @@ -2,6 +2,7 @@ 0,AB9234,AB9234,6,2021,2,1,2021-04-21 09:20:21,1st of 3 english schools ,80,,RRS180,,,,,,3,2021-12-21 09:20:21,2022-03-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,1,2,3,4,,,,,,,,,,,,,,,,,,,38,,,,1,,,,1,1,1,1,1,1,,,,1,1,1,,2015-08-01,2021-08-15,,,1,4,,,,2,1,1,1,2,5,1,45,10,8,10,27,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 1,AB9234,AB9234,6,2021,2,1,2021-04-21 09:21:21,2nd of 3 english schools ,80,,,,RRS180,,,,3,,,,,,,,,,2021-12-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,0,2,3,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,38,,,,1,,,,1,1,1,1,1,1,,,,1,1,1,,2015-11-01,2021-09-15,,2022-04-01,1,,,4,,2,1,1,1,2,5,1,45,10,8,10,27,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 2,AB9234,AB9234,6,2021,2,1,2021-04-21 09:22:21,3rd of 3 english schools ,80,,,,,,RRS180,,3,,,,,,,,,,,,,,,,,,,2021-12-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,0,0,3,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,38,,,,1,,,,1,1,1,1,1,1,,,,1,1,1,,2015-12-01,2021-12-15,,2022-04-01,1,,,4,,2,1,1,1,2,5,1,45,10,8,10,27,1,,,,,,,,,,,,,,,,,,,, +3,AB200,AB200,6,2021,2,1,2021-04-21 09:22:21,another_english_school ,80,,,,,,RRS200,,3,,,,,,,,,,,,,,,,,,,2021-12-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 
09:20:21,0,0,3,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,38,,,,1,,,,1,1,1,1,1,1,,,,1,1,1,,2015-12-01,2021-12-15,,2022-04-01,1,,,4,,2,1,1,1,2,5,1,45,10,8,10,27,1,,,,,,,,,,,,,,,,,,,, 100,AB100,AB100,4,2021,2,4,not entered ,timestamp not entered ,,30,,RRS100,,,,,1,,,, ,,,,,,,,,,,,,,, , ,, ,,,,, 101,AB101,AB101,4,2021,2,1,2021-04-21 09:22:21,no_rr_children=0 ,80,,,RRS101,,,,,0,2021-12-21 09:20:21,,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,,,,,,,,,,,,,,,,,,,,,, 102,AB102,AB102,4,2021,2,1,2021-04-21 09:22:21,no_rr_children na ,80,,,RRS101,,,,,,2021-12-21 09:20:21,,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,,,,,,,,,,,,,,,,,,,,,, diff --git a/tests/data/redcap/extract_labels.csv b/tests/data/redcap/extract_labels.csv index 79e27a76..cdbf3f6a 100644 --- a/tests/data/redcap/extract_labels.csv +++ b/tests/data/redcap/extract_labels.csv @@ -2,6 +2,7 @@ 0,AB9234,AB9234,RR Teacher + Other Role,2021,Continuing Professional Development (CPD),England / Channel Islands,2021-04-21 09:20:21,1st of 3 english schools ,Bristol,,RRS180,,,,,,3.0,2021-12-21 09:20:21,2022-03-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,1.0,2.0,3.0,4.0,,,,,,,,,,,,,,,,,,,38.0,,,,Yes,,,,Yes,1.0,1.0,1.0,1.0,1.0,,,,1.0,1.0,1.0,,2015-08-01,2021-08-15,,,,Year 3,,,,Female,White - British,English,Yes,Receiving School Based Support,No,1.0,45.0,10.0,8.0,10.0,27.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 1,AB9234,AB9234,RR Teacher + Other Role,2021,Continuing Professional Development (CPD),England / Channel Islands,2021-04-21 09:21:21,2nd of 3 english schools ,Bristol,,,,RRS180,,,,3.0,,,,,,,,,,2021-12-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 
09:20:21,0.0,2.0,3.0,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,38.0,,,,Yes,,,,Yes,1.0,1.0,1.0,1.0,1.0,,,,1.0,1.0,1.0,,2015-11-01,2021-09-15,,2022-04-01,Discontinued,,,Year 3,,Female,White - British,English,Yes,Receiving School Based Support,No,1.0,45.0,10.0,8.0,10.0,27.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 2,AB9234,AB9234,RR Teacher + Other Role,2021,Continuing Professional Development (CPD),England / Channel Islands,2021-04-21 09:22:21,3rd of 3 english schools ,Bristol,,,,,,RRS180,,3.0,,,,,,,,,,,,,,,,,,,2021-12-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,0.0,0.0,3.0,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,38.0,,,,Yes,,,,Yes,1.0,1.0,1.0,1.0,1.0,,,,1.0,1.0,1.0,,2015-12-01,2021-12-15,,2022-04-01,Discontinued,,,Year 3,,Female,White - British,English,Yes,Receiving School Based Support,No,1.0,45.0,10.0,8.0,10.0,27.0,1.0,,,,,,,,,,,,,,,,,,,,, +3,AB200,AB200,RR Teacher + Other Role,2021,Continuing Professional Development (CPD),England / Channel Islands,2021-04-21 09:22:21,another english school,Bristol,,,,,,school should be renamed,,3.0,,,,,,,,,,,,,,,,,,,2021-12-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,0.0,0.0,3.0,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,38.0,,,,Yes,,,,Yes,1.0,1.0,1.0,1.0,1.0,,,,1.0,1.0,1.0,,2015-12-01,2021-12-15,,2022-04-01,Discontinued,,,Year 3,,Female,White - British,English,Yes,Receiving School Based Support,No,1.0,45.0,10.0,8.0,10.0,27.0,1.0,,,,,,,,,,,,,,,,,,,,, 100,AB100,AB100,RR Teacher + Support Role,2021,Continuing Professional Development (CPD),Scotland,not entered ,timestamp not entered ,,Glasgow,,RRS100,,,,,1.0,,,, ,,,,,,,,,,,,,,, , ,, 
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 101,AB101,AB101,RR Teacher + Support Role,2021,Continuing Professional Development (CPD),England / Channel Islands,2021-04-21 09:22:21,no_rr_children=0 ,Bristol,,,RRS101,,,,,0.0,2021-12-21 09:20:21,,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 102,AB102,AB102,RR Teacher + Support Role,2021,Continuing Professional Development (CPD),England / Channel Islands,2021-04-21 09:22:21,no_rr_children na ,Bristol,,,RRS101,,,,,,2021-12-21 09:20:21,,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, diff --git a/tests/test_redcap_interface.py b/tests/test_redcap_interface.py index f2989d89..d96a4f66 100644 --- a/tests/test_redcap_interface.py +++ b/tests/test_redcap_interface.py @@ -1,5 +1,6 @@ from pathlib import Path +import pandas as pd import pytest import tomli_w @@ -45,3 +46,47 @@ def test_cli_writes_file(temp_out_dir, set_top_level_dir): expected_file = temp_out_dir / "processed" / "masterfile_2021-22.xlsx" assert expected_file.exists() + + +def test_school_id_aliases(temp_out_dir, set_top_level_dir): + """ + Given a config file pointing to valid test data, and an alias file for school ids for RRS200 -> RRS100 + When the extract command is run, with an output to a temporary directory + Then a masterfile should be written with the school ID replaced + """ + # Arrange + data_path = "tests/data" + test_config = { + "2021": { + 
"dispatch_list": f"{data_path}/dispatch_list.xlsx", + "current_year": { + "coded_data_file": f"{data_path}/redcap/extract.csv", + "label_data_file": f"{data_path}/redcap/extract_labels.csv", + }, + "previous_year": { + "coded_data_file": f"{data_path}/redcap/extract.csv", + "label_data_file": f"{data_path}/redcap/extract_labels.csv", + }, + } + } + + config_path = temp_out_dir / "config.toml" + with config_path.open("wb") as handle: + tomli_w.dump(test_config, handle) + + alias_path = temp_out_dir / "alias.toml" + with alias_path.open("wb") as handle: + tomli_w.dump({"RRS200": "RRS100"}, handle) + + # Act + extract(2021, config_file=config_path, output_dir=temp_out_dir) + + # Assert + expected_file = temp_out_dir / "processed" / "masterfile_2021-22.xlsx" + output = pd.read_excel(expected_file) + ## old RRS200 shouldn't have any rows + assert output[output.school_id == "RRS200"].shape[0] == 0 + renamed_school = output[output.school_id == "RRS100"] + ## new RRS100 should exist in output and those rows should have the correct school from the dispatch list + assert renamed_school.shape[0] > 0 + assert all(renamed_school["rrcp_school"] == "School 100") From 59f5793c1942757f8aefe5871cde30277d769d88 Mon Sep 17 00:00:00 2001 From: katiebuntic Date: Thu, 31 Aug 2023 15:02:20 +0100 Subject: [PATCH 04/12] adding in school_id rename within function --- input/school_aliases/template.toml | 2 +- src/rred_reports/redcap/main.py | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/input/school_aliases/template.toml b/input/school_aliases/template.toml index f14b3146..dd0e799f 100644 --- a/input/school_aliases/template.toml +++ b/input/school_aliases/template.toml @@ -1,3 +1,3 @@ # replace the placeholder IDs below with school IDs so that they are replaced during the extract process -old_id_1 = "new_id_1" +RRS200 = "RRS100" old_id_2 = "new_id_2" diff --git a/src/rred_reports/redcap/main.py b/src/rred_reports/redcap/main.py index 8e2eef82..5f19b463 100644 
--- a/src/rred_reports/redcap/main.py +++ b/src/rred_reports/redcap/main.py @@ -26,12 +26,6 @@ class RedcapReader: """Reads two years of redcap data, processing the files (wide to long, and others) and filtering to non-empty rows""" def __init__(self, school_list: Path, school_aliases: Optional[Path] = None): - """ - Setup for reading from redcap - - Args: - school_list: path to Excel dispatch list file - """ self._school_list = get_unique_schools(school_list) self._school_aliases = None if school_aliases: @@ -68,6 +62,8 @@ def read_single_redcap_year(self, redcap_fields: ExtractInput) -> pd.DataFrame: raw_data = pd.read_csv(redcap_fields.coded_data_path, low_memory=False) labelled_data = pd.read_csv(redcap_fields.labelled_data_path, low_memory=False) processed_wide = self.preprocess_wide_data(raw_data, labelled_data) + if self._school_aliases: + processed_wide["school_id"] = processed_wide["school_id"].replace(self._school_aliases) long = self.wide_to_long(processed_wide, redcap_fields.survey_period) long_with_names = self._add_school_name_column(long) return long_with_names[masterfile_columns()].copy() @@ -282,6 +278,8 @@ def _process_calculated_columns(self, entry_year_cols: list[str], export_data: p return processed_data def _add_school_name_column(self, long_df: pd.DataFrame) -> pd.DataFrame: + if self._school_aliases: + long_df["school_id"] = long_df["school_id"].replace(self._school_aliases) named_schools = long_df.merge(self._school_list, left_on="school_id", right_on="RRED School ID", how="left") named_schools.rename({"School Name": "rrcp_school"}, axis=1, inplace=True) return named_schools From 68f9109b96dbccfaa93bf7e38108f7f613092613 Mon Sep 17 00:00:00 2001 From: katiebuntic Date: Thu, 31 Aug 2023 15:10:52 +0100 Subject: [PATCH 05/12] setting self._school_alias path --- src/rred_reports/redcap/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rred_reports/redcap/main.py b/src/rred_reports/redcap/main.py index 
5f19b463..96ea05f2 100644 --- a/src/rred_reports/redcap/main.py +++ b/src/rred_reports/redcap/main.py @@ -27,7 +27,7 @@ class RedcapReader: def __init__(self, school_list: Path, school_aliases: Optional[Path] = None): self._school_list = get_unique_schools(school_list) - self._school_aliases = None + self._school_aliases = Path("input/school_aliases/template.toml") if school_aliases: try: with school_aliases.open(mode="rb") as handle: From 5b9e0b64c4517f3a29cbfee4baa412e4618d84ef Mon Sep 17 00:00:00 2001 From: katiebuntic Date: Fri, 1 Sep 2023 13:32:58 +0100 Subject: [PATCH 06/12] reverting the toml file --- input/school_aliases/template.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/input/school_aliases/template.toml b/input/school_aliases/template.toml index dd0e799f..f14b3146 100644 --- a/input/school_aliases/template.toml +++ b/input/school_aliases/template.toml @@ -1,3 +1,3 @@ # replace the placeholder IDs below with school IDs so that they are replaced during the extract process -RRS200 = "RRS100" +old_id_1 = "new_id_1" old_id_2 = "new_id_2" From fc3551e454b1f9b84aeee909115c8fc7196a6823 Mon Sep 17 00:00:00 2001 From: katiebuntic Date: Fri, 1 Sep 2023 13:36:39 +0100 Subject: [PATCH 07/12] fixing the double call to school_aliases --- src/rred_reports/redcap/main.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/rred_reports/redcap/main.py b/src/rred_reports/redcap/main.py index 96ea05f2..337a0ce1 100644 --- a/src/rred_reports/redcap/main.py +++ b/src/rred_reports/redcap/main.py @@ -27,7 +27,7 @@ class RedcapReader: def __init__(self, school_list: Path, school_aliases: Optional[Path] = None): self._school_list = get_unique_schools(school_list) - self._school_aliases = Path("input/school_aliases/template.toml") + self._school_aliases = None if school_aliases: try: with school_aliases.open(mode="rb") as handle: @@ -62,8 +62,6 @@ def read_single_redcap_year(self, redcap_fields: ExtractInput) -> pd.DataFrame:
raw_data = pd.read_csv(redcap_fields.coded_data_path, low_memory=False) labelled_data = pd.read_csv(redcap_fields.labelled_data_path, low_memory=False) processed_wide = self.preprocess_wide_data(raw_data, labelled_data) - if self._school_aliases: - processed_wide["school_id"] = processed_wide["school_id"].replace(self._school_aliases) long = self.wide_to_long(processed_wide, redcap_fields.survey_period) long_with_names = self._add_school_name_column(long) return long_with_names[masterfile_columns()].copy() @@ -279,7 +277,7 @@ def _process_calculated_columns(self, entry_year_cols: list[str], export_data: p def _add_school_name_column(self, long_df: pd.DataFrame) -> pd.DataFrame: if self._school_aliases: - long_df["school_id"] = long_df["school_id"].replace(self._school_aliases) + long_df["school_id"] = long_df["school_id"].replace(self._school_aliases, inplace=True) named_schools = long_df.merge(self._school_list, left_on="school_id", right_on="RRED School ID", how="left") named_schools.rename({"School Name": "rrcp_school"}, axis=1, inplace=True) return named_schools From 08f725a35be4faccd1cff8283c31f6bd09b61430 Mon Sep 17 00:00:00 2001 From: katiebuntic Date: Fri, 1 Sep 2023 13:40:30 +0100 Subject: [PATCH 08/12] adding in alias path --- tests/test_redcap_interface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_redcap_interface.py b/tests/test_redcap_interface.py index d96a4f66..3a009ba3 100644 --- a/tests/test_redcap_interface.py +++ b/tests/test_redcap_interface.py @@ -79,7 +79,7 @@ def test_school_id_aliases(temp_out_dir, set_top_level_dir): tomli_w.dump({"RRS200": "RRS100"}, handle) # Act - extract(2021, config_file=config_path, output_dir=temp_out_dir) + extract(2021, config_file=config_path, output_dir=temp_out_dir, school_aliases=alias_path) # Assert expected_file = temp_out_dir / "processed" / "masterfile_2021-22.xlsx" From b41552bbc958e608689762c95b4414c4409b938d Mon Sep 17 00:00:00 2001 From: katiebuntic Date: Fri, 1 Sep 
2023 15:11:36 +0100 Subject: [PATCH 09/12] adding in new test --- tests/test_redcap_interface.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_redcap_interface.py b/tests/test_redcap_interface.py index 3a009ba3..78ae1be2 100644 --- a/tests/test_redcap_interface.py +++ b/tests/test_redcap_interface.py @@ -19,7 +19,7 @@ def set_top_level_dir() -> None: interface.top_level_dir = original_value -def test_cli_writes_file(temp_out_dir, set_top_level_dir): +def test_cli_writes_file(temp_out_dir: Path, set_top_level_dir: None): """ Given a config file pointing to valid test data When the extract CLI command is run, with an output to a temporary directory @@ -48,7 +48,7 @@ def test_cli_writes_file(temp_out_dir, set_top_level_dir): assert expected_file.exists() -def test_school_id_aliases(temp_out_dir, set_top_level_dir): +def test_school_id_aliases(temp_out_dir: Path, set_top_level_dir: None): """ Given a config file pointing to valid test data, and an alias file for school ids for RRS200 -> RRS100 When the extract command is run, with an output to a temporary directory @@ -88,5 +88,6 @@ def test_school_id_aliases(temp_out_dir, set_top_level_dir): assert output[output.school_id == "RRS200"].shape[0] == 0 renamed_school = output[output.school_id == "RRS100"] ## new RRS100 should exist in output and those rows should have the correct school from the dispatch list + assert "RRS100" in renamed_school["school_id"].values assert renamed_school.shape[0] > 0 assert all(renamed_school["rrcp_school"] == "School 100") From 39fd1bbdb80ea107513684b586afb365ff5c29f0 Mon Sep 17 00:00:00 2001 From: katiebuntic Date: Fri, 1 Sep 2023 15:20:38 +0100 Subject: [PATCH 10/12] fixing errors --- src/rred_reports/redcap/main.py | 2 +- tests/test_redcap.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/rred_reports/redcap/main.py b/src/rred_reports/redcap/main.py index 337a0ce1..97fab90f 100644 --- a/src/rred_reports/redcap/main.py +++ 
b/src/rred_reports/redcap/main.py @@ -277,7 +277,7 @@ def _process_calculated_columns(self, entry_year_cols: list[str], export_data: p def _add_school_name_column(self, long_df: pd.DataFrame) -> pd.DataFrame: if self._school_aliases: - long_df["school_id"] = long_df["school_id"].replace(self._school_aliases, inplace=True) + long_df["school_id"].replace(self._school_aliases, inplace=True) named_schools = long_df.merge(self._school_list, left_on="school_id", right_on="RRED School ID", how="left") named_schools.rename({"School Name": "rrcp_school"}, axis=1, inplace=True) return named_schools diff --git a/tests/test_redcap.py b/tests/test_redcap.py index 3a648cfe..bc604ef2 100644 --- a/tests/test_redcap.py +++ b/tests/test_redcap.py @@ -52,7 +52,7 @@ def test_read_redcap_extract_rows_and_cols(redcap_extract): 6 rows should exist, and the output columns should match what is in our masterfile definition """ - assert redcap_extract.shape[0] == 6 + assert redcap_extract.shape[0] == 8 assert list(redcap_extract.columns.values) == masterfile_columns() From 90bdb9abceea2ed479f1f5197247f7536da0a9c8 Mon Sep 17 00:00:00 2001 From: katiebuntic Date: Fri, 1 Sep 2023 15:40:10 +0100 Subject: [PATCH 11/12] fixing doc string --- tests/test_redcap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_redcap.py b/tests/test_redcap.py index bc604ef2..a531730f 100644 --- a/tests/test_redcap.py +++ b/tests/test_redcap.py @@ -49,7 +49,7 @@ def test_read_redcap_extract_rows_and_cols(redcap_extract): """ Given an extract from redcap with 3 valid rows When the extract is processed, using the same extract as the current year and previous year - 6 rows should exist, and the output columns should match what is in our masterfile definition + 8 rows should exist, and the output columns should match what is in our masterfile definition """ assert redcap_extract.shape[0] == 8 From a07ad1ee4a3a23efc4c6225181ddc50beeac8eed Mon Sep 17 00:00:00 2001 From: Stef Piatek Date: Fri, 
1 Sep 2023 16:21:09 +0100 Subject: [PATCH 12/12] Update docstring for test --- tests/test_redcap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_redcap.py b/tests/test_redcap.py index a531730f..c2f52a45 100644 --- a/tests/test_redcap.py +++ b/tests/test_redcap.py @@ -47,7 +47,7 @@ def test_preprocess_wide_data(data_path): def test_read_redcap_extract_rows_and_cols(redcap_extract): """ - Given an extract from redcap with 3 valid rows + Given an extract from redcap with 4 valid rows When the extract is processed, using the same extract as the current year and previous year 8 rows should exist, and the output columns should match what is in our masterfile definition """