Merge pull request #61 from UCL-ARC/duplicate_report_rows

Warnings for duplicate report rows
UCL-ARC · Sep 8, 2023 · 8358b75 · 8358b75
2 parents a836efb + f4c5ea1
commit 8358b75
Show file tree

Hide file tree

Showing 8 changed files with 70 additions and 11 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -36,6 +36,7 @@ dependencies = [
   "pypdf == 3.8.1",
   "python-docx == 0.8.11",
   "numpy == 1.24.2",
+  "tabulate == 0.9.0",
   "tomli == 2.0.1",
   "tomli-w == 1.0.0",
   "typer == 0.7.0",
@@ -47,12 +48,14 @@ test = [
   "pytest-cov >=4",
   "pytest-mock >=3.10.0",
   "email-validator==1.3.0",
+  "loguru_caplog >= 0.2.0",
 ]
 dev = [
   "pytest >=7",
   "pytest-cov >=4",
   "pytest-mock >=3.10.0",
   "pre-commit >= 3",
+  "loguru_caplog >= 0.2.0",
 ]
 
 [project.urls]

diff --git a/src/rred_reports/masterfile.py b/src/rred_reports/masterfile.py
@@ -129,6 +129,10 @@ def clmnlist(i: int, data: pd.DataFrame = full_data) -> list:
 
     all_schools_df = School.new(clmnlist(6), clmnlist(3), clmnlist(4), clmnlist(5))  # pylint: disable=E1121
     all_schools_df = all_schools_df.drop_duplicates()  # pylint: disable=E1101
+    is_duplicated = all_schools_df.duplicated(["rrcp_school"])
+    if any(is_duplicated):
+        duplicated_schools = all_schools_df[all_schools_df["rrcp_school"].isin(all_schools_df.loc[is_duplicated, "rrcp_school"])]
+        logger.warning("The following School IDs had duplicate information:\n{duplicated_df}", duplicated_df=duplicated_schools.to_markdown())
 
     teach_df = Teacher.new(clmnlist(1), clmnlist(2), clmnlist(6))  # pylint: disable=E1121
     teach_df.drop_duplicates(subset=["rred_user_id", "school_id"], inplace=True)  # pylint: disable=E1101

diff --git a/src/rred_reports/reports/interface.py b/src/rred_reports/reports/interface.py
@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import Optional
+from typing import Annotated, Optional
 
 import typer
 from loguru import logger
@@ -133,7 +133,7 @@ def create(level: ReportType, year: int, config_file: Path = "src/rred_reports/r
 @app.command()
 def send_school(
     year: int,
-    manual_id: Optional[list[str]] = typer.Option(None),  # noqa: B008
+    manual_id: Annotated[Optional[list[str]], typer.Option([])] = (),
     attachment_name: str = "RRED_report.pdf",
     config_file: Path = "src/rred_reports/reports/report_config.toml",
     top_level_dir: Optional[Path] = None,
@@ -157,16 +157,15 @@ def send_school(
         top_level_dir = TOP_LEVEL_DIR
 
     dispatch_list = top_level_dir / dispatch_path
-
+    school_ids = list(manual_id)
     if not manual_id:
-        manual_id = []
         report_directory = top_level_dir / "output" / "reports" / str(year) / "schools"
         for report_path in sorted(report_directory.glob("report_*.pdf")):
-            manual_id.append(report_path.stem.split("_")[-1])
+            school_ids.append(report_path.stem.split("_")[-1])
 
     email_details = []
     logger.info("Getting dispatch list details for each school report pdf found")
-    for school_id in tqdm(manual_id):
+    for school_id in tqdm(school_ids):
         email_info = get_mailing_info(school_id, dispatch_list, override_mailto)
         email_details.append({"school_id": school_id, "mail_info": email_info})
 
@@ -177,7 +176,7 @@ def send_school(
             school_mailer(email_detail["school_id"], year, email_detail["mail_info"], report_name=attachment_name)
             emailed_ids.add(email_detail["school_id"])
         except Exception as error:
-            all_schools = set(manual_id)
+            all_schools = set(school_ids)
             schools_to_send = sorted(all_schools.difference(emailed_ids))
             school_command = f"--manual-id {' --manual-id '.join(schools_to_send)}"
             logger.error(
@@ -198,8 +197,13 @@ def main():
 
 
 if __name__ == "__main__":
-    create(ReportType.SCHOOL, 2022, TOP_LEVEL_DIR / "src/rred_reports/reports/report_config.toml")
+    # create(ReportType.SCHOOL, 2022, TOP_LEVEL_DIR / "src/rred_reports/reports/report_config.toml")
     ## test sending reports to specific UCL user
-    # send_school(2021, config_file=TOP_LEVEL_DIR / "src/rred_reports/reports/report_config.toml", top_level_dir=TOP_LEVEL_DIR, override_mailto="[email protected]")
+    send_school(
+        2022,
+        config_file=TOP_LEVEL_DIR / "src/rred_reports/reports/report_config.toml",
+        top_level_dir=TOP_LEVEL_DIR,
+        override_mailto="[email protected]",
+    )
     ## test sending reports to RRED email for UAT
     # send_school(2021, config_file=TOP_LEVEL_DIR / "src/rred_reports/reports/report_config.toml", top_level_dir=TOP_LEVEL_DIR, override_mailto="[email protected]")
diff --git a/src/rred_reports/reports/schools.py b/src/rred_reports/reports/schools.py
@@ -2,6 +2,7 @@
 from pathlib import Path
 
 import pandas as pd
+from loguru import logger
 
 from rred_reports.reports.filler import TemplateFiller
 
@@ -192,7 +193,7 @@ def get_outcome_from_summary(outcome_df: pd.DataFrame, outcome_type: str) -> int
             return 0
 
     filtered = filter_by_entry_and_exit(school_df, report_year)
-    filtered_summary_table = filtered[columns_used].copy()
+    filtered_summary_table = filtered[columns_used].drop_duplicates().copy()
     # let's try and reduce the pain with exit outcome labels
     filtered_summary_table["exit_outcome"] = filtered_summary_table["exit_outcome"].str.lower().str.strip()
 
@@ -245,7 +246,12 @@ def populate_school_tables(school_df: pd.DataFrame, template_path: Path, report_
         columns, filter_function = column_and_filter
         filtered = filter_function(school_df, report_year)
         table_to_write = filtered[columns]
-        template_filler.populate_table(index + 1, table_to_write)
+        if index == 0 and any(table_to_write.duplicated()):
+            logger.warning(
+                "Duplicate students found, this suggests an issue with the masterfile school or teacher data. Table 1 data:\n{school_data}",
+                school_data=table_to_write.to_markdown(),
+            )
+        template_filler.populate_table(index + 1, table_to_write.drop_duplicates())
 
     return template_filler
 

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,3 +1,5 @@
+from loguru_caplog import loguru_caplog  # noqa: F401
+
 pytest_plugins = [
     "tests.fixtures.test_template_files",
     "tests.fixtures.test_redcap_files",

diff --git a/tests/data/masterfile_with_school_in_two_regions.xlsx b/tests/data/masterfile_with_school_in_two_regions.xlsx
diff --git a/tests/test_masterfile.py b/tests/test_masterfile.py
@@ -16,3 +16,27 @@ def test_masterfile_read(data_path):
     assert nested_data["teachers"].shape == (11, 3)
     assert nested_data["schools"].shape == (10, 4)
     assert joined_data.shape == (40, 69)  # should be the same number of students as in the pupils df
+
+
+def test_masterfile_warns_duplicate_school(data_path, loguru_caplog):
+    """
+    Given a masterfile where two pupils share a school, but each pupil lists a different region
+    When the masterfile is parsed
+    Then the nested data holds two school rows, and loguru reports the duplicate school ID
+    """
+    masterfile = data_path / "masterfile_with_school_in_two_regions.xlsx"
+    schools = parse_masterfile(masterfile)["schools"]
+    # both rows are expected to remain, despite sharing one school id
+    assert schools.shape[0] == 2
+    assert "The following School IDs had duplicate information" in loguru_caplog.text
+
+
+def test_masterfile_no_duplicate_school(data_path, loguru_caplog):
+    """
+    Given a masterfile with no duplicate school details
+    When the masterfile is parsed
+    Then no duplicate school ID message is logged (loguru_caplog, as plain caplog does not receive loguru output by default)
+    """
+    file_path = data_path / "example_masterfile.xlsx"
+    parse_masterfile(file_path)
+    assert "The following School IDs had duplicate information" not in loguru_caplog.text
diff --git a/tests/test_report_writing.py b/tests/test_report_writing.py
@@ -57,6 +57,22 @@ def test_school_tables_filled(example_school_data: pd.DataFrame, templates_dir:
     assert output_doc.exists()
 
 
+def test_duplicate_student_warning(example_school_data: pd.DataFrame, templates_dir: Path, temp_out_dir: Path, loguru_caplog):
+    """
+    Given school data in which one row has been overwritten with a copy of another row
+    When the school template is populated from that data
+    Then the populated table at index 1 has one row fewer than the input data, and loguru reports the duplication
+    """
+    template_path = templates_dir / "2021/2021-22_template.docx"
+    report_path = temp_out_dir / "school.docx"
+    # overwrite row 3 with row 5 so the frame contains a repeated record
+    school_data = example_school_data.copy()
+    school_data.iloc[3] = school_data.iloc[5]
+    filled = populate_school_data(school_data, template_path, 2021, report_path)
+    assert len(filled.tables[1].rows) == (school_data.shape[0] - 1)
+    assert "Duplicate students found" in loguru_caplog.text
+
+
 def test_school_name_replaced_in_paragraphs(example_school_data, templates_dir: Path, temp_out_dir: Path):
     """
     Given a school template with the first non-blank paragraph having a "School A" placeholder