Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

School aliases in extract #57

Merged
merged 12 commits into from
Sep 1, 2023
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ src/*/_version.py

input/downloaded
input/processed
input/school_aliases
!input/*/.gitkeep
output/**
!output/*/.gitkeep
Expand Down
Empty file added input/school_aliases/.gitkeep
Empty file.
3 changes: 3 additions & 0 deletions input/school_aliases/template.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Replace the placeholder entries below: each key is an old school ID and its value is the school ID that replaces it during the extract process
RRS200 = "RRS100"
stefpiatek marked this conversation as resolved.
Show resolved Hide resolved
old_id_2 = "new_id_2"
11 changes: 7 additions & 4 deletions src/rred_reports/redcap/interface.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from pathlib import Path
from typing import Optional

import typer

Expand All @@ -9,12 +10,13 @@

top_level_dir = Path(__file__).resolve().parents[3]


app = typer.Typer()


@app.command()
def extract(year: int, config_file: Path = "src/rred_reports/redcap/redcap_config.toml", output_dir: Path = "output/") -> None:
def extract(
year: int, config_file: Path = "src/rred_reports/redcap/redcap_config.toml", output_dir: Path = "output/", school_aliases: Optional[Path] = None
) -> None:
"""
Extract files from redcap from wide to long and apply basic processing
Process wide to long of the files listed under the year-based config toml
Expand All @@ -23,6 +25,7 @@ def extract(year: int, config_file: Path = "src/rred_reports/redcap/redcap_confi
year (int): Year to process
config_file (Path): Path to config file
output_dir (Path): Path to parent output directory
school_aliases (Optional[Path]): School alias file, where schools have changed IDs and should be merged
"""
typer.echo(f"Extracting data for {year} and the previous year's surveys")
config = get_config(config_file)[str(year)]
Expand All @@ -32,7 +35,7 @@ def extract(year: int, config_file: Path = "src/rred_reports/redcap/redcap_confi

dispatch_path = top_level_dir / config["dispatch_list"]

parser = RedcapReader(dispatch_path)
parser = RedcapReader(dispatch_path, school_aliases)
current_year = ExtractInput(
top_level_dir / config["current_year"]["coded_data_file"],
top_level_dir / config["current_year"]["label_data_file"],
Expand All @@ -41,7 +44,7 @@ def extract(year: int, config_file: Path = "src/rred_reports/redcap/redcap_confi
previous_year = ExtractInput(
top_level_dir / config["previous_year"]["coded_data_file"],
top_level_dir / config["previous_year"]["label_data_file"],
f"{year -1}-{str(year)[-2:]}",
f"{year - 1}-{str(year)[-2:]}",
)
long_data = parser.read_redcap_data(current_year, previous_year)
issues = log_school_id_inconsistencies(long_data, dispatch_path, year)
Expand Down
22 changes: 15 additions & 7 deletions src/rred_reports/redcap/main.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
"""Downloading and processing of redcap data"""
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd
import tomli
from loguru import logger
from tqdm import tqdm

Expand All @@ -23,14 +25,16 @@ class ExtractInput:
class RedcapReader:
"""Reads two years of redcap data, processing the files (wide to long, and others) and filtering to non-empty rows"""

def __init__(self, school_list: Path, school_aliases: Optional[Path] = None):
    """
    Setup for reading from redcap

    Args:
        school_list: path to Excel dispatch list file
        school_aliases: optional path to a TOML file mapping old school IDs to the school IDs that replace them

    Raises:
        FileNotFoundError: if `school_aliases` is given but no file exists at that path
    """
    self._school_list = get_unique_schools(school_list)
    # Default to "no aliases". A placeholder Path here would be truthy, so the
    # `if self._school_aliases:` guards elsewhere in this class would hand a Path
    # (not a mapping) to `Series.replace`.
    self._school_aliases: Optional[dict] = None
    if not school_aliases:
        return
    try:
        # TOML must be opened in binary mode for tomli
        with Path(school_aliases).open(mode="rb") as handle:
            self._school_aliases = tomli.load(handle)
    except FileNotFoundError as error:
        msg = f"No school alias file found at {school_aliases}. Exiting."
        raise FileNotFoundError(msg) from error

def read_redcap_data(self, current_year: ExtractInput, previous_year: ExtractInput) -> pd.DataFrame:
"""
Expand Down Expand Up @@ -58,6 +62,8 @@ def read_single_redcap_year(self, redcap_fields: ExtractInput) -> pd.DataFrame:
raw_data = pd.read_csv(redcap_fields.coded_data_path, low_memory=False)
labelled_data = pd.read_csv(redcap_fields.labelled_data_path, low_memory=False)
processed_wide = self.preprocess_wide_data(raw_data, labelled_data)
if self._school_aliases:
processed_wide["school_id"] = processed_wide["school_id"].replace(self._school_aliases)
stefpiatek marked this conversation as resolved.
Show resolved Hide resolved
long = self.wide_to_long(processed_wide, redcap_fields.survey_period)
long_with_names = self._add_school_name_column(long)
return long_with_names[masterfile_columns()].copy()
Expand Down Expand Up @@ -272,6 +278,8 @@ def _process_calculated_columns(self, entry_year_cols: list[str], export_data: p
return processed_data

def _add_school_name_column(self, long_df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply school ID aliases (if any) and attach the school name from the school list

    The input frame is not modified; a new frame is returned.

    Args:
        long_df: long-format data containing a "school_id" column

    Returns:
        Left-merge of `long_df` with the school list on "school_id" == "RRED School ID",
        with the "School Name" column renamed to "rrcp_school". Rows whose school ID is not
        in the school list are kept, with missing values in the school list columns.
    """
    if self._school_aliases:
        # `assign` builds a new frame, so the caller's data is left untouched
        long_df = long_df.assign(school_id=long_df["school_id"].replace(self._school_aliases))
    named_schools = long_df.merge(self._school_list, left_on="school_id", right_on="RRED School ID", how="left")
    return named_schools.rename(columns={"School Name": "rrcp_school"})
1 change: 1 addition & 0 deletions tests/data/redcap/extract.csv
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
0,AB9234,AB9234,6,2021,2,1,2021-04-21 09:20:21,1st of 3 english schools ,80,,RRS180,,,,,,3,2021-12-21 09:20:21,2022-03-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,1,2,3,4,,,,,,,,,,,,,,,,,,,38,,,,1,,,,1,1,1,1,1,1,,,,1,1,1,,2015-08-01,2021-08-15,,,1,4,,,,2,1,1,1,2,5,1,45,10,8,10,27,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,AB9234,AB9234,6,2021,2,1,2021-04-21 09:21:21,2nd of 3 english schools ,80,,,,RRS180,,,,3,,,,,,,,,,2021-12-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,0,2,3,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,38,,,,1,,,,1,1,1,1,1,1,,,,1,1,1,,2015-11-01,2021-09-15,,2022-04-01,1,,,4,,2,1,1,1,2,5,1,45,10,8,10,27,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,AB9234,AB9234,6,2021,2,1,2021-04-21 09:22:21,3rd of 3 english schools ,80,,,,,,RRS180,,3,,,,,,,,,,,,,,,,,,,2021-12-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,0,0,3,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,38,,,,1,,,,1,1,1,1,1,1,,,,1,1,1,,2015-12-01,2021-12-15,,2022-04-01,1,,,4,,2,1,1,1,2,5,1,45,10,8,10,27,1,,,,,,,,,,,,,,,,,,,,
3,AB200,AB200,6,2021,2,1,2021-04-21 09:22:21,another_english_school ,80,,,,,,RRS200,,3,,,,,,,,,,,,,,,,,,,2021-12-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,0,0,3,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,38,,,,1,,,,1,1,1,1,1,1,,,,1,1,1,,2015-12-01,2021-12-15,,2022-04-01,1,,,4,,2,1,1,1,2,5,1,45,10,8,10,27,1,,,,,,,,,,,,,,,,,,,,
100,AB100,AB100,4,2021,2,4,not entered ,timestamp not entered ,,30,,RRS100,,,,,1,,,, ,,,,,,,,,,,,,,, , ,, ,,,,,
101,AB101,AB101,4,2021,2,1,2021-04-21 09:22:21,no_rr_children=0 ,80,,,RRS101,,,,,0,2021-12-21 09:20:21,,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,,,,,,,,,,,,,,,,,,,,,,
102,AB102,AB102,4,2021,2,1,2021-04-21 09:22:21,no_rr_children na ,80,,,RRS101,,,,,,2021-12-21 09:20:21,,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,,,,,,,,,,,,,,,,,,,,,,
Expand Down
1 change: 1 addition & 0 deletions tests/data/redcap/extract_labels.csv
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
0,AB9234,AB9234,RR Teacher + Other Role,2021,Continuing Professional Development (CPD),England / Channel Islands,2021-04-21 09:20:21,1st of 3 english schools ,Bristol,,RRS180,,,,,,3.0,2021-12-21 09:20:21,2022-03-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,1.0,2.0,3.0,4.0,,,,,,,,,,,,,,,,,,,38.0,,,,Yes,,,,Yes,1.0,1.0,1.0,1.0,1.0,,,,1.0,1.0,1.0,,2015-08-01,2021-08-15,,,,Year 3,,,,Female,White - British,English,Yes,Receiving School Based Support,No,1.0,45.0,10.0,8.0,10.0,27.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,AB9234,AB9234,RR Teacher + Other Role,2021,Continuing Professional Development (CPD),England / Channel Islands,2021-04-21 09:21:21,2nd of 3 english schools ,Bristol,,,,RRS180,,,,3.0,,,,,,,,,,2021-12-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,0.0,2.0,3.0,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,38.0,,,,Yes,,,,Yes,1.0,1.0,1.0,1.0,1.0,,,,1.0,1.0,1.0,,2015-11-01,2021-09-15,,2022-04-01,Discontinued,,,Year 3,,Female,White - British,English,Yes,Receiving School Based Support,No,1.0,45.0,10.0,8.0,10.0,27.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,AB9234,AB9234,RR Teacher + Other Role,2021,Continuing Professional Development (CPD),England / Channel Islands,2021-04-21 09:22:21,3rd of 3 english schools ,Bristol,,,,,,RRS180,,3.0,,,,,,,,,,,,,,,,,,,2021-12-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,0.0,0.0,3.0,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,38.0,,,,Yes,,,,Yes,1.0,1.0,1.0,1.0,1.0,,,,1.0,1.0,1.0,,2015-12-01,2021-12-15,,2022-04-01,Discontinued,,,Year 3,,Female,White - British,English,Yes,Receiving School Based Support,No,1.0,45.0,10.0,8.0,10.0,27.0,1.0,,,,,,,,,,,,,,,,,,,,,
3,AB200,AB200,RR Teacher + Other Role,2021,Continuing Professional Development (CPD),England / Channel Islands,2021-04-21 09:22:21,another english school,Bristol,,,,,,school should be renamed,,3.0,,,,,,,,,,,,,,,,,,,2021-12-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,0.0,0.0,3.0,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,38.0,,,,Yes,,,,Yes,1.0,1.0,1.0,1.0,1.0,,,,1.0,1.0,1.0,,2015-12-01,2021-12-15,,2022-04-01,Discontinued,,,Year 3,,Female,White - British,English,Yes,Receiving School Based Support,No,1.0,45.0,10.0,8.0,10.0,27.0,1.0,,,,,,,,,,,,,,,,,,,,,
100,AB100,AB100,RR Teacher + Support Role,2021,Continuing Professional Development (CPD),Scotland,not entered ,timestamp not entered ,,Glasgow,,RRS100,,,,,1.0,,,, ,,,,,,,,,,,,,,, , ,, ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
101,AB101,AB101,RR Teacher + Support Role,2021,Continuing Professional Development (CPD),England / Channel Islands,2021-04-21 09:22:21,no_rr_children=0 ,Bristol,,,RRS101,,,,,0.0,2021-12-21 09:20:21,,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
102,AB102,AB102,RR Teacher + Support Role,2021,Continuing Professional Development (CPD),England / Channel Islands,2021-04-21 09:22:21,no_rr_children na ,Bristol,,,RRS101,,,,,,2021-12-21 09:20:21,,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Expand Down
45 changes: 45 additions & 0 deletions tests/test_redcap_interface.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from pathlib import Path

import pandas as pd
import pytest
import tomli_w

Expand Down Expand Up @@ -45,3 +46,47 @@ def test_cli_writes_file(temp_out_dir, set_top_level_dir):

expected_file = temp_out_dir / "processed" / "masterfile_2021-22.xlsx"
assert expected_file.exists()


def test_school_id_aliases(temp_out_dir, set_top_level_dir):
    """
    Given a config file pointing to valid test data, and an alias file mapping school ID RRS200 -> RRS100
    When the extract command is run with that alias file, with an output to a temporary directory
    Then a masterfile should be written with the school ID replaced
    """
    # Arrange
    data_path = "tests/data"
    test_config = {
        "2021": {
            "dispatch_list": f"{data_path}/dispatch_list.xlsx",
            "current_year": {
                "coded_data_file": f"{data_path}/redcap/extract.csv",
                "label_data_file": f"{data_path}/redcap/extract_labels.csv",
            },
            "previous_year": {
                "coded_data_file": f"{data_path}/redcap/extract.csv",
                "label_data_file": f"{data_path}/redcap/extract_labels.csv",
            },
        }
    }

    config_path = temp_out_dir / "config.toml"
    with config_path.open("wb") as handle:
        tomli_w.dump(test_config, handle)

    alias_path = temp_out_dir / "alias.toml"
    with alias_path.open("wb") as handle:
        tomli_w.dump({"RRS200": "RRS100"}, handle)

    # Act
    # The alias file must be passed explicitly, otherwise no replacement is configured
    extract(2021, config_file=config_path, output_dir=temp_out_dir, school_aliases=alias_path)

    # Assert
    expected_file = temp_out_dir / "processed" / "masterfile_2021-22.xlsx"
    output = pd.read_excel(expected_file)
    ## old RRS200 shouldn't have any rows
    assert output[output.school_id == "RRS200"].shape[0] == 0
    renamed_school = output[output.school_id == "RRS100"]
    ## new RRS100 should exist in output and those rows should have the correct school from the dispatch list
    assert renamed_school.shape[0] > 0
    assert (renamed_school["rrcp_school"] == "School 100").all()
Loading