From 356bf26c2b5b781dfc01c195114578cdb09484cd Mon Sep 17 00:00:00 2001
From: SimonKonar
Date: Tue, 9 Jan 2024 13:50:20 +0100
Subject: [PATCH] feat: add sample donor csv repository along with tests

---
 persistence/sample_donor_csv_repository.py | 66 +++++++++++++++++++
 .../test_sample_donor_csv_repository.py    | 65 ++++++++++++++++++
 2 files changed, 131 insertions(+)
 create mode 100644 persistence/sample_donor_csv_repository.py
 create mode 100644 test/unit/persistence/test_sample_donor_csv_repository.py

diff --git a/persistence/sample_donor_csv_repository.py b/persistence/sample_donor_csv_repository.py
new file mode 100644
index 0000000..72c7d84
--- /dev/null
+++ b/persistence/sample_donor_csv_repository.py
@@ -0,0 +1,66 @@
+from datetime import datetime
+import logging
+import os
+from typing import Iterator, List
+
+from model.gender import get_gender_from_abbreviation
+from model.sample_donor import SampleDonor
+from persistence.sample_donor_repository import SampleDonorRepository
+from util.custom_logger import setup_logger
+import pandas as pd
+
+setup_logger()
+logger = logging.getLogger()
+
+
+class SampleDonorCsvRepository(SampleDonorRepository):
+    """Repository that reads sample donors from CSV files in a directory.
+
+    Donor identifiers that have already been yielded are remembered on the
+    instance, so every donor is returned at most once per repository object.
+    """
+
+    def __init__(self, records_path: str, separator: str, donor_parsing_map: dict):
+        """Store the CSV location and parsing configuration.
+
+        :param records_path: directory that is scanned for ``*.csv`` files.
+        :param separator: column separator passed to ``pandas.read_csv``.
+        :param donor_parsing_map: maps the keys ``id``, ``gender`` and
+            ``birthDate`` to the CSV column names holding those values.
+        """
+        self._dir_path = records_path
+        self._ids: set = set()
+        self.separator = separator
+        self._donor_parsing_map = donor_parsing_map
+        logger.debug(f"Loaded the following donor parsing map {donor_parsing_map}")
+
+    def get_all(self) -> List[SampleDonor]:
+        """Lazily yield the donors found in every ``.csv`` file of the directory.
+
+        NOTE: implemented as a generator, so the result can only be iterated
+        once even though the annotation says ``List``.
+        """
+        for dir_entry in os.scandir(self._dir_path):
+            if dir_entry.name.endswith(".csv"):
+                yield from self.__extract_donor_from_csv_file(dir_entry)
+
+    def __extract_donor_from_csv_file(self, dir_entry: os.DirEntry) -> Iterator[SampleDonor]:
+        """Yield one donor per row of the file, skipping already seen identifiers."""
+        file_content = pd.read_csv(dir_entry, sep=self.separator, dtype=str)
+        for _, row in file_content.iterrows():
+            try:
+                donor = SampleDonor(row[self._donor_parsing_map.get("id")])
+                donor.gender = get_gender_from_abbreviation(row[self._donor_parsing_map.get("gender")])
+                year_of_birth = row[self._donor_parsing_map.get("birthDate")]
+                # Empty cells are read as NaN (not None) by pandas, so test with notna().
+                if pd.notna(year_of_birth):
+                    donor.date_of_birth = datetime.strptime(year_of_birth, '%Y')
+                if donor.identifier not in self._ids:
+                    self._ids.add(donor.identifier)
+                    yield donor
+            except TypeError as e:
+                # The exception has to be a %-argument: the first positional
+                # parameter of logger.info() is the message format string.
+                logger.info("%s Skipping...", e)
+                # Stops reading the remaining rows of this file.
+                return
diff --git a/test/unit/persistence/test_sample_donor_csv_repository.py b/test/unit/persistence/test_sample_donor_csv_repository.py
new file mode 100644
index 0000000..68f74f1
--- /dev/null
+++ b/test/unit/persistence/test_sample_donor_csv_repository.py
@@ -0,0 +1,65 @@
+import unittest
+
+import pytest
+from pyfakefs.fake_filesystem_unittest import patchfs
+
+from model.gender import Gender
+from model.sample_donor import SampleDonor
+from persistence.sample_donor_csv_repository import SampleDonorCsvRepository
+from util.config import PARSING_MAP_CSV
+
+
+class TestDonorCsvRepo(unittest.TestCase):
+    """Unit tests for SampleDonorCsvRepository running on a fake file system."""
+
+    header = "sample_ID;patient_pseudonym;sex;birth_year;date_of_diagnosis;diagnosis;donor_age;sampling_date;sampling_type;storage_temperature;available_number_of_samples \n"
+
+    content = "34;1113;f;1939;2100-10-22;M329;49;2007-10-22;serum;-20;1"
+
+    dir_path = "/mock_dir/"
+
+    @pytest.fixture(autouse=True)
+    def run_around_tests(self):
+        """Create a fresh repository for every test."""
+        self.donor_repository = SampleDonorCsvRepository(records_path=self.dir_path,
+                                                         donor_parsing_map=PARSING_MAP_CSV['donor_map'],
+                                                         separator=";")
+
+    @patchfs
+    def test_get_all_ok(self, fake_fs):
+        """Donors read from a well-formed file carry the expected id and gender."""
+        fake_fs.create_file(self.dir_path + "mock_file.csv", contents=self.header + self.content)
+        for donor in self.donor_repository.get_all():
+            self.assertIsInstance(donor, SampleDonor)
+            self.assertEqual("1113", donor.identifier)
+            self.assertEqual(Gender.FEMALE, donor.gender)
+
+    @patchfs
+    def test_get_all_with_one_wrongly_formatted_file(self, fake_fs):
+        """Reading succeeds when a badly formatted CSV file sits next to a valid one."""
+        fake_fs.create_file(self.dir_path + "mock_file.csv", contents=self.header + self.content)
+        fake_fs.create_file(self.dir_path + "mock_wrong_file.csv", contents="badly_formated_csv")
+        for donor in self.donor_repository.get_all():
+            self.assertIsInstance(donor, SampleDonor)
+            self.assertEqual("1113", donor.identifier)
+
+    @patchfs
+    def test_get_all_does_not_return_duplicate_patients(self, fake_fs):
+        """A donor present in two files is returned only once."""
+        fake_fs.create_file(self.dir_path + "mock_file.csv", contents=self.header + self.content)
+        fake_fs.create_file(self.dir_path + "mock_file_duplicate.csv", contents=self.header + self.content)
+        counter = 0
+        for donor in self.donor_repository.get_all():
+            self.assertIsInstance(donor, SampleDonor)
+            self.assertEqual("1113", donor.identifier)
+            counter += 1
+        self.assertEqual(1, counter)
+
+    @patchfs
+    def test_get_all_with_empty_repository_throws_no_errors(self, fake_fs):
+        """An empty directory yields no donors and raises no error."""
+        fake_fs.create_dir(self.dir_path)
+        counter = 0
+        for _ in self.donor_repository.get_all():
+            counter += 1
+        self.assertEqual(0, counter)