Skip to content

Commit

Permalink
Merge pull request #57 from UCL-ARC/school_aliases_in_extract
Browse files Browse the repository at this point in the history
School aliases in extract
  • Loading branch information
stefpiatek authored Sep 1, 2023
2 parents 38e80d6 + a07ad1e commit bcf1c3a
Show file tree
Hide file tree
Showing 9 changed files with 76 additions and 15 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ src/*/_version.py

input/downloaded
input/processed
input/school_aliases
!input/*/.gitkeep
output/**
!output/*/.gitkeep
Expand Down
Empty file added input/school_aliases/.gitkeep
Empty file.
3 changes: 3 additions & 0 deletions input/school_aliases/template.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Replace the placeholder pairs below: each key is an old school ID and each value is the school ID that replaces it during the extract process
old_id_1 = "new_id_1"
old_id_2 = "new_id_2"
11 changes: 7 additions & 4 deletions src/rred_reports/redcap/interface.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from pathlib import Path
from typing import Optional

import typer

Expand All @@ -9,12 +10,13 @@

top_level_dir = Path(__file__).resolve().parents[3]


app = typer.Typer()


@app.command()
def extract(year: int, config_file: Path = "src/rred_reports/redcap/redcap_config.toml", output_dir: Path = "output/") -> None:
def extract(
year: int, config_file: Path = "src/rred_reports/redcap/redcap_config.toml", output_dir: Path = "output/", school_aliases: Optional[Path] = None
) -> None:
"""
Extract files from redcap from wide to long and apply basic processing
Process wide to long of the files listed under the year-based config toml
Expand All @@ -23,6 +25,7 @@ def extract(year: int, config_file: Path = "src/rred_reports/redcap/redcap_confi
year (int): Year to process
config_file (Path): Path to config file
output_dir (Path): Path to parent output directory
school_aliases (Optional[Path]): School alias file, where schools have changed IDs and should be merged
"""
typer.echo(f"Extracting data for {year} and the previous year's surveys")
config = get_config(config_file)[str(year)]
Expand All @@ -32,7 +35,7 @@ def extract(year: int, config_file: Path = "src/rred_reports/redcap/redcap_confi

dispatch_path = top_level_dir / config["dispatch_list"]

parser = RedcapReader(dispatch_path)
parser = RedcapReader(dispatch_path, school_aliases)
current_year = ExtractInput(
top_level_dir / config["current_year"]["coded_data_file"],
top_level_dir / config["current_year"]["label_data_file"],
Expand All @@ -41,7 +44,7 @@ def extract(year: int, config_file: Path = "src/rred_reports/redcap/redcap_confi
previous_year = ExtractInput(
top_level_dir / config["previous_year"]["coded_data_file"],
top_level_dir / config["previous_year"]["label_data_file"],
f"{year -1}-{str(year)[-2:]}",
f"{year - 1}-{str(year)[-2:]}",
)
long_data = parser.read_redcap_data(current_year, previous_year)
issues = log_school_id_inconsistencies(long_data, dispatch_path, year)
Expand Down
20 changes: 13 additions & 7 deletions src/rred_reports/redcap/main.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
"""Downloading and processing of redcap data"""
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd
import tomli
from loguru import logger
from tqdm import tqdm

Expand All @@ -23,14 +25,16 @@ class ExtractInput:
class RedcapReader:
"""Reads two years of redcap data, processing the files (wide to long, and others) and filtering to non-empty rows"""

def __init__(self, school_list: Path):
"""
Setup for reading from redcap
Args:
school_list: path to Excel dispatch list file
"""
def __init__(self, school_list: Path, school_aliases: Optional[Path] = None):
self._school_list = get_unique_schools(school_list)
self._school_aliases = None
if school_aliases:
try:
with school_aliases.open(mode="rb") as handle:
self._school_aliases = tomli.load(handle)
except FileNotFoundError as error:
msg = f"No school alias file found at {school_aliases}. Exiting."
raise FileNotFoundError(msg) from error

def read_redcap_data(self, current_year: ExtractInput, previous_year: ExtractInput) -> pd.DataFrame:
"""
Expand Down Expand Up @@ -272,6 +276,8 @@ def _process_calculated_columns(self, entry_year_cols: list[str], export_data: p
return processed_data

def _add_school_name_column(self, long_df: pd.DataFrame) -> pd.DataFrame:
    """
    Attach the school name from the school list to each row of the long data

    If school aliases were loaded, any "school_id" that matches an alias key is first replaced by
    the alias value, so rows recorded under an old ID are merged against the school's new ID.

    Args:
        long_df (pd.DataFrame): Long-format data with a "school_id" column

    Returns:
        pd.DataFrame: Left merge of the data with the school list on "RRED School ID",
            with the "School Name" column renamed to "rrcp_school"
    """
    if self._school_aliases:
        # Assign the replaced column back instead of calling replace(inplace=True) on the selected column:
        # the in-place form is a chained assignment, which no longer updates long_df under pandas copy-on-write
        long_df["school_id"] = long_df["school_id"].replace(self._school_aliases)
    named_schools = long_df.merge(self._school_list, left_on="school_id", right_on="RRED School ID", how="left")
    return named_schools.rename({"School Name": "rrcp_school"}, axis=1)
1 change: 1 addition & 0 deletions tests/data/redcap/extract.csv
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
0,AB9234,AB9234,6,2021,2,1,2021-04-21 09:20:21,1st of 3 english schools ,80,,RRS180,,,,,,3,2021-12-21 09:20:21,2022-03-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,1,2,3,4,,,,,,,,,,,,,,,,,,,38,,,,1,,,,1,1,1,1,1,1,,,,1,1,1,,2015-08-01,2021-08-15,,,1,4,,,,2,1,1,1,2,5,1,45,10,8,10,27,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,AB9234,AB9234,6,2021,2,1,2021-04-21 09:21:21,2nd of 3 english schools ,80,,,,RRS180,,,,3,,,,,,,,,,2021-12-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,0,2,3,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,38,,,,1,,,,1,1,1,1,1,1,,,,1,1,1,,2015-11-01,2021-09-15,,2022-04-01,1,,,4,,2,1,1,1,2,5,1,45,10,8,10,27,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,AB9234,AB9234,6,2021,2,1,2021-04-21 09:22:21,3rd of 3 english schools ,80,,,,,,RRS180,,3,,,,,,,,,,,,,,,,,,,2021-12-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,0,0,3,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,38,,,,1,,,,1,1,1,1,1,1,,,,1,1,1,,2015-12-01,2021-12-15,,2022-04-01,1,,,4,,2,1,1,1,2,5,1,45,10,8,10,27,1,,,,,,,,,,,,,,,,,,,,
3,AB200,AB200,6,2021,2,1,2021-04-21 09:22:21,another_english_school ,80,,,,,,RRS200,,3,,,,,,,,,,,,,,,,,,,2021-12-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,0,0,3,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,38,,,,1,,,,1,1,1,1,1,1,,,,1,1,1,,2015-12-01,2021-12-15,,2022-04-01,1,,,4,,2,1,1,1,2,5,1,45,10,8,10,27,1,,,,,,,,,,,,,,,,,,,,
100,AB100,AB100,4,2021,2,4,not entered ,timestamp not entered ,,30,,RRS100,,,,,1,,,, ,,,,,,,,,,,,,,, , ,, ,,,,,
101,AB101,AB101,4,2021,2,1,2021-04-21 09:22:21,no_rr_children=0 ,80,,,RRS101,,,,,0,2021-12-21 09:20:21,,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,,,,,,,,,,,,,,,,,,,,,,
102,AB102,AB102,4,2021,2,1,2021-04-21 09:22:21,no_rr_children na ,80,,,RRS101,,,,,,2021-12-21 09:20:21,,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,,,,,,,,,,,,,,,,,,,,,,
Expand Down
1 change: 1 addition & 0 deletions tests/data/redcap/extract_labels.csv
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
0,AB9234,AB9234,RR Teacher + Other Role,2021,Continuing Professional Development (CPD),England / Channel Islands,2021-04-21 09:20:21,1st of 3 english schools ,Bristol,,RRS180,,,,,,3.0,2021-12-21 09:20:21,2022-03-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,1.0,2.0,3.0,4.0,,,,,,,,,,,,,,,,,,,38.0,,,,Yes,,,,Yes,1.0,1.0,1.0,1.0,1.0,,,,1.0,1.0,1.0,,2015-08-01,2021-08-15,,,,Year 3,,,,Female,White - British,English,Yes,Receiving School Based Support,No,1.0,45.0,10.0,8.0,10.0,27.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,AB9234,AB9234,RR Teacher + Other Role,2021,Continuing Professional Development (CPD),England / Channel Islands,2021-04-21 09:21:21,2nd of 3 english schools ,Bristol,,,,RRS180,,,,3.0,,,,,,,,,,2021-12-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,0.0,2.0,3.0,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,38.0,,,,Yes,,,,Yes,1.0,1.0,1.0,1.0,1.0,,,,1.0,1.0,1.0,,2015-11-01,2021-09-15,,2022-04-01,Discontinued,,,Year 3,,Female,White - British,English,Yes,Receiving School Based Support,No,1.0,45.0,10.0,8.0,10.0,27.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,AB9234,AB9234,RR Teacher + Other Role,2021,Continuing Professional Development (CPD),England / Channel Islands,2021-04-21 09:22:21,3rd of 3 english schools ,Bristol,,,,,,RRS180,,3.0,,,,,,,,,,,,,,,,,,,2021-12-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,0.0,0.0,3.0,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,38.0,,,,Yes,,,,Yes,1.0,1.0,1.0,1.0,1.0,,,,1.0,1.0,1.0,,2015-12-01,2021-12-15,,2022-04-01,Discontinued,,,Year 3,,Female,White - British,English,Yes,Receiving School Based Support,No,1.0,45.0,10.0,8.0,10.0,27.0,1.0,,,,,,,,,,,,,,,,,,,,,
3,AB200,AB200,RR Teacher + Other Role,2021,Continuing Professional Development (CPD),England / Channel Islands,2021-04-21 09:22:21,another english school,Bristol,,,,,,school should be renamed,,3.0,,,,,,,,,,,,,,,,,,,2021-12-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,0.0,0.0,3.0,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,38.0,,,,Yes,,,,Yes,1.0,1.0,1.0,1.0,1.0,,,,1.0,1.0,1.0,,2015-12-01,2021-12-15,,2022-04-01,Discontinued,,,Year 3,,Female,White - British,English,Yes,Receiving School Based Support,No,1.0,45.0,10.0,8.0,10.0,27.0,1.0,,,,,,,,,,,,,,,,,,,,,
100,AB100,AB100,RR Teacher + Support Role,2021,Continuing Professional Development (CPD),Scotland,not entered ,timestamp not entered ,,Glasgow,,RRS100,,,,,1.0,,,, ,,,,,,,,,,,,,,, , ,, ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
101,AB101,AB101,RR Teacher + Support Role,2021,Continuing Professional Development (CPD),England / Channel Islands,2021-04-21 09:22:21,no_rr_children=0 ,Bristol,,,RRS101,,,,,0.0,2021-12-21 09:20:21,,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
102,AB102,AB102,RR Teacher + Support Role,2021,Continuing Professional Development (CPD),England / Channel Islands,2021-04-21 09:22:21,no_rr_children na ,Bristol,,,RRS101,,,,,,2021-12-21 09:20:21,,2022-09-21 09:20:21,2022-09-21 09:20:21,2022-09-21 09:20:21,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Expand Down
6 changes: 3 additions & 3 deletions tests/test_redcap.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,12 @@ def test_preprocess_wide_data(data_path):

def test_read_redcap_extract_rows_and_cols(redcap_extract):
"""
Given an extract from redcap with 3 valid rows
Given an extract from redcap with 4 valid rows
When the extract is processed, using the same extract as the current year and previous year
6 rows should exist, and the output columns should match what is in our masterfile definition
8 rows should exist, and the output columns should match what is in our masterfile definition
"""

assert redcap_extract.shape[0] == 6
assert redcap_extract.shape[0] == 8
assert list(redcap_extract.columns.values) == masterfile_columns()


Expand Down
48 changes: 47 additions & 1 deletion tests/test_redcap_interface.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from pathlib import Path

import pandas as pd
import pytest
import tomli_w

Expand All @@ -18,7 +19,7 @@ def set_top_level_dir() -> None:
interface.top_level_dir = original_value


def test_cli_writes_file(temp_out_dir, set_top_level_dir):
def test_cli_writes_file(temp_out_dir: Path, set_top_level_dir: None):
"""
Given a config file pointing to valid test data
When the extract CLI command is run, with an output to a temporary directory
Expand All @@ -45,3 +46,48 @@ def test_cli_writes_file(temp_out_dir, set_top_level_dir):

expected_file = temp_out_dir / "processed" / "masterfile_2021-22.xlsx"
assert expected_file.exists()


def test_school_id_aliases(temp_out_dir: Path, set_top_level_dir: None):
    """
    Given test data that contains school RRS200, and an alias file mapping RRS200 to RRS100
    When the extract command is run with that alias file, writing to a temporary directory
    Then the masterfile should have no RRS200 rows, and its RRS100 rows should be named "School 100"
    """
    # Arrange
    data_path = "tests/data"
    coded_file = f"{data_path}/redcap/extract.csv"
    label_file = f"{data_path}/redcap/extract_labels.csv"
    # the same extract is used for both the current and the previous year
    test_config = {
        "2021": {
            "dispatch_list": f"{data_path}/dispatch_list.xlsx",
            "current_year": {"coded_data_file": coded_file, "label_data_file": label_file},
            "previous_year": {"coded_data_file": coded_file, "label_data_file": label_file},
        }
    }
    aliases = {"RRS200": "RRS100"}

    config_path = temp_out_dir / "config.toml"
    config_path.write_bytes(tomli_w.dumps(test_config).encode())

    alias_path = temp_out_dir / "alias.toml"
    alias_path.write_bytes(tomli_w.dumps(aliases).encode())

    # Act
    extract(2021, config_file=config_path, output_dir=temp_out_dir, school_aliases=alias_path)

    # Assert
    masterfile = pd.read_excel(temp_out_dir / "processed" / "masterfile_2021-22.xlsx")

    # no rows should remain under the old ID
    old_id_rows = masterfile.loc[masterfile["school_id"] == "RRS200"]
    assert len(old_id_rows) == 0

    # rows under the new ID should exist and carry the school name from the dispatch list
    new_id_rows = masterfile.loc[masterfile["school_id"] == "RRS100"]
    assert "RRS100" in new_id_rows["school_id"].values
    assert len(new_id_rows) > 0
    assert (new_id_rows["rrcp_school"] == "School 100").all()

0 comments on commit bcf1c3a

Please sign in to comment.