# Splits a file name into (everything before the trailing digit run, the digit run).
_TRAILING_NUMBER = re.compile(r'(.*(?:\D|^))(\d+)')
# Captures "<digits>_", the text in between, and the trailing "_<digits>".
_SUBJECT = re.compile(r'(\d+_)(.*)(_\d+)')


def process_metadata(df: pd.DataFrame) -> pd.DataFrame:
    """Processes the metadata dataframe.

    Args:
        df (pd.DataFrame): The metadata dataframe.

    Returns:
        pd.DataFrame: A metadata dataframe with rearranged and newly derived columns.

    Raises:
        ValueError: If a file name is invalid or the injection order is not of integer type.
    """
    df = rearrange_columns(df)
    validate_filenames_column(df)
    # The validator only reports a boolean; act on it here so that a malformed
    # injection order does not silently end up in the exported file.
    if not validate_injection_order(df):
        raise ValueError("Invalid injection order: the column must contain integers only.")
    df = derive_additional_metadata(df)
    return cleanup(df)


def cleanup(df: pd.DataFrame) -> pd.DataFrame:
    """Removes the File name column and moves the sampleName column to the front.

    Args:
        df (pd.DataFrame): The metadata dataframe.

    Returns:
        pd.DataFrame: The processed dataframe.
    """
    # drop() returns a new frame, so the pop/insert below do not touch the input.
    df = df.drop('File name', axis=1)
    sample_names = df.pop("sampleName")
    df.insert(0, "sampleName", sample_names)
    return df


def validate_injection_order(df: pd.DataFrame) -> bool:
    """Validates if injectionOrder is of integer type.

    Any integer dtype is accepted (not only numpy's int64), so the result does
    not depend on the integer width the column happens to be stored with.

    Args:
        df (pd.DataFrame): The metadata dataframe.

    Returns:
        bool: Whether the injectionOrder is integer.
    """
    return bool(pd.api.types.is_integer_dtype(df['injectionOrder']))


def derive_additional_metadata(df: pd.DataFrame) -> pd.DataFrame:
    """Derives additional metadata columns from the File name column.

    The new columns are added to the passed dataframe in place.

    Args:
        df (pd.DataFrame): The metadata dataframe.

    Returns:
        pd.DataFrame: The processed dataframe.
    """
    file_names = df['File name']
    df['sampleName'] = file_names.apply(replace_spaces)
    df['sequenceIdentifier'] = file_names.apply(add_sequence_identifier)
    df['subjectIdentifier'] = file_names.apply(add_subject_identifier)
    df['localOrder'] = file_names.apply(add_local_order)
    return df


def rearrange_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Keeps only the required columns and renames them.

    Args:
        df (pd.DataFrame): The metadata dataframe.

    Returns:
        pd.DataFrame: The processed dataframe.

    Raises:
        KeyError: If any of the required columns is missing (raised by pandas).
    """
    renamed = {
        "Type": "sampleType",
        "Class ID": "class",
        "Batch": "batch",
        "Analytical order": "injectionOrder",
    }
    # "File name" is kept under its original name; it is needed later on to
    # derive the remaining columns and is dropped only during cleanup.
    columns_to_keep = ["File name", *renamed]
    return df[columns_to_keep].rename(columns=renamed)


def validate_filenames_column(df: pd.DataFrame) -> None:
    """Validates the file names.

    Args:
        df (pd.DataFrame): A dataframe to process.

    Raises:
        ValueError: An error if there is any invalid file name.
    """
    if not df['File name'].apply(validate_filename).all():
        raise ValueError("Invalid File name.")


def replace_spaces(file_name: str) -> str:
    """Replaces spaces with underscores in the file name.

    Args:
        file_name (str): The filename.

    Returns:
        str: The replaced filename.
    """
    return file_name.replace(" ", "_")


def add_local_order(file_name: str) -> int:
    """Returns the localOrder value, i.e. the trailing digits of the file name.

    Args:
        file_name (str): The filename.

    Returns:
        int: The localOrder value.

    Raises:
        ValueError: If the file name contains no digits.
    """
    _, digits = separate_filename(file_name)
    return int(digits)


def add_sequence_identifier(file_name: str) -> str:
    """Returns the sequenceIdentifier value, i.e. everything before last _[digits].

    Args:
        file_name (str): The filename.

    Returns:
        str: The sequenceIdentifier value.

    Raises:
        ValueError: If the file name contains no digits.
    """
    prefix, _ = separate_filename(file_name)
    # Drop the separating underscore(s) first, then any surrounding whitespace.
    return prefix.rstrip('_').strip()


def separate_filename(file_name: str) -> Tuple[str, str]:
    """Splits the file_name into the part before the trailing digits and the digits.

    Args:
        file_name (str): The filename.

    Returns:
        Tuple[str, str]: The prefix (separator included) and the trailing digits.

    Raises:
        ValueError: If the file name contains no digits to split on.
    """
    match = _TRAILING_NUMBER.search(file_name)
    if match is None:
        raise ValueError(f"Invalid File name: {file_name!r} contains no digits.")
    return (match.group(1), match.group(2))


def add_subject_identifier(file_name: str) -> str:
    """Returns the subjectIdentifier value, i.e. everything between [digit_] and [_digit].

    Args:
        file_name (str): The filename.

    Returns:
        str: The subjectIdentifier value.

    Raises:
        ValueError: If the file name does not follow the [digits]_[subject]_[digits] layout.
    """
    match = _SUBJECT.search(file_name)
    if match is None:
        raise ValueError(
            f"Invalid File name: {file_name!r} does not match '<digits>_<subject>_<digits>'."
        )
    return match.group(2).strip()
"instrument blank" + ], + "localOrder": [ + 1, + 4, + 6, + 7, + 8, + 11, + 12, + 15, + 18, + 19, + 29 + ] } return pd.DataFrame(data=d) - - @pytest.fixture def alkanes() -> pd.DataFrame: """Creates a dataframe corresponding to processed alkane file. @@ -192,7 +235,6 @@ def test_read_save_dataframe_as_tsv_error(dataframe: pd.DataFrame, tmp_path: str save_dataframe_as_tsv(dataframe, out_path) -@pytest.mark.skip(reason="Test fails due to a inconsistency in the input file (metadata)") def test_process_metadata_file(processed_dataframe: pd.DataFrame, tmp_path: str): """Tests processing the metadata file. @@ -246,3 +288,89 @@ def test_process_metadata_file_raise_columns_missing(tmp_path: str): def test_validate_filename(file_name: str, expected: bool): """Test to validate filenames.""" assert validate_filename(file_name) == expected + +@pytest.mark.parametrize("file_name, expected", [ + ["18_QC 4 _18", 18], + ["1_QC_1", 1] +]) +def test_add_localOrder(file_name: str, expected: int): + """Tests the add_localOrder function. + + Args: + file_name (str): The filename. + expected (int): The localOrder value. + """ + actual = add_local_order(file_name) + assert actual == expected + +@pytest.mark.parametrize("file_name, expected", [ + ["18_QC 4 _18", "18_QC 4"], + ["1_QC_1", "1_QC"] +]) +def test_add_sequenceIdentifier(file_name: str, expected: str): + """Tests the add_sequenceIdentifier function. + + Args: + file_name (str): The filename. + expected (str): The sequenceIdentifier value. + """ + actual = add_sequence_identifier(file_name) + assert actual == expected + +@pytest.mark.parametrize("file_name, expected", [ + ["18_QC 4 _18", "QC 4"], + ["1_QC_1", "QC"], + ["11_QC 16_11", "QC 16"] +]) +def test_add_subjectIdentifier(file_name: str, expected: str): + """Tests the add_subjectIdentifier function. + + Args: + file_name (str): The filename. + expected (str): The subjectIdentifier value. 
+ """ + actual = add_subject_identifier(file_name) + assert actual == expected + +@pytest.mark.parametrize("file_name, expected", [ + ["18_QC 4 _18", "18_QC_4__18"], + ["1_QC_1", "1_QC_1"] +]) +def test_replace_fileName(file_name: str, expected: str): + """Tests tge replace_fileName function. + + Args: + file_name (str): The filename. + expected (str): The filename with replaced spaces by underscores. + """ + actual = replace_spaces(file_name) + assert actual == expected + + +@pytest.mark.parametrize("file_name, expected", [ + ["18_QC 4 _18", ("18_QC 4 _", "18")], + ["1_QC_1", ("1_QC_", "1")] +]) +def test_separate_filename(file_name: str, expected: str): + """Tests the regex to separate filenames. + + Args: + file_name (str): The filename. + expected (str): The splitted filename. + """ + actual = separate_filename(file_name) + assert actual == expected + +@pytest.mark.parametrize("dataFrame, expected", [ + [pd.DataFrame({"injectionOrder": [1,4,5]}), True], + [pd.DataFrame({"injectionOrder": ["1",None,5]}), False] +]) +def test_validateInjectionOrder(dataFrame: pd.DataFrame, expected: bool): + """Tests the injection order validation function. + + Args: + dataFrame (pd.DataFrame): A dataframe with injection order. + expected (bool): Whether it is of integer (True) or other data type (False) + """ + actual = validate_injection_order(dataFrame) + assert expected == actual