Skip to content

Commit

Permalink
Merge pull request #30 from RECETOX/sonarcloud_runner
Browse files Browse the repository at this point in the history
Sonarcloud runner
  • Loading branch information
hechth authored Jul 15, 2024
2 parents 0ddd9ee + faea82a commit 461da35
Show file tree
Hide file tree
Showing 2 changed files with 295 additions and 13 deletions.
170 changes: 162 additions & 8 deletions src/rcx_tk/process_metadata_file.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import os
import re
from typing import Tuple
import pandas as pd
from numpy import int64


def read_file(file_path: str) -> pd.DataFrame:
Expand Down Expand Up @@ -47,19 +50,117 @@ def process_metadata_file(file_path: str, out_path: str) -> None:
file_path (str): A path to the metadata file.
out_path (str): A path where processed metadata dataframe is exported.
"""
columns_to_keep = {
"File name": "sampleName",
df = read_file(file_path)
df = process_metadata(df)
save_dataframe_as_tsv(df, out_path)

def process_metadata(df: pd.DataFrame) -> pd.DataFrame:
    """Validates the metadata dataframe and derives the processed layout.

    The steps run in a fixed order: the relevant columns are selected and
    renamed, the file names and the injection order are validated, the
    additional columns are derived from the file names and finally the
    helper "File name" column is removed.

    Args:
        df (pd.DataFrame): The metadata dataframe.

    Returns:
        pd.DataFrame: A metadata dataframe with rearranged and newly derived columns.

    Raises:
        ValueError: If any file name is invalid or the injection order is
            not of integer type.
    """
    df = rearrange_columns(df)
    validate_filenames_column(df)
    # validate_injection_order only reports a verdict; act on it here so an
    # invalid column is not silently accepted.
    if not validate_injection_order(df):
        raise ValueError("Invalid injection order.")
    df = derive_additional_metadata(df)
    return cleanup(df)

def cleanup(df: pd.DataFrame) -> pd.DataFrame:
    """Drops the "File name" column and puts "sampleName" first.

    Args:
        df (pd.DataFrame): The metadata dataframe.

    Returns:
        pd.DataFrame: A new dataframe without "File name" whose leading
            column is "sampleName"; the other columns keep their order.
    """
    trimmed = df.drop(columns=['File name'])
    remaining = [name for name in trimmed.columns if name != "sampleName"]
    return trimmed[["sampleName", *remaining]]

def validate_injection_order(df: pd.DataFrame) -> bool:
    """Checks whether the injectionOrder column holds integers only.

    Any integer dtype is accepted (not just 64-bit), so columns that were
    down-cast or stored as unsigned/nullable integers validate as well.

    Args:
        df (pd.DataFrame): The metadata dataframe with an "injectionOrder" column.

    Returns:
        bool: True if the column has an integer dtype and no missing values,
            False otherwise.
    """
    injection_order = df['injectionOrder']
    # Nullable integer columns may carry missing values; those are not
    # usable injection orders, so they fail the validation.
    return bool(
        pd.api.types.is_integer_dtype(injection_order)
        and not injection_order.isna().any()
    )

def derive_additional_metadata(df: pd.DataFrame) -> pd.DataFrame:
    """Adds the columns that are computed from the "File name" column.

    The dataframe is modified in place and also returned.

    Args:
        df (pd.DataFrame): The metadata dataframe with a "File name" column.

    Returns:
        pd.DataFrame: The same dataframe extended by sampleName,
            sequenceIdentifier, subjectIdentifier and localOrder.
    """
    file_names = df['File name']
    derivations = (
        ('sampleName', replace_spaces),
        ('sequenceIdentifier', add_sequence_identifier),
        ('subjectIdentifier', add_subject_identifier),
        ('localOrder', add_local_order),
    )
    for column, derive in derivations:
        df[column] = file_names.apply(derive)
    return df

def rearrange_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Selects the relevant metadata columns and renames them.

    Args:
        df (pd.DataFrame): The metadata dataframe.

    Returns:
        pd.DataFrame: A new dataframe holding only the kept columns, in the
            order listed below, with the renamed headers.

    Raises:
        KeyError: If any of the required columns is missing.
    """
    columns_to_keep = [
        "File name",
        "Type",
        "Class ID",
        "Batch",
        "Analytical order",
    ]
    # "File name" deliberately keeps its original header here.
    new_names = {
        "Type": "sampleType",
        "Class ID": "class",
        "Batch": "batch",
        "Analytical order": "injectionOrder",
    }
    return df[columns_to_keep].rename(columns=new_names)

def validate_filenames_column(df: pd.DataFrame) -> None:
    """Ensures every entry of the "File name" column is a valid file name.

    Args:
        df (pd.DataFrame): A dataframe with a "File name" column.

    Raises:
        ValueError: If at least one file name fails validate_filename.
    """
    verdicts = df['File name'].apply(validate_filename)
    if verdicts.all():
        return
    raise ValueError("Invalid File name.")

def replace_spaces(file_name: str) -> str:
    """Returns the file name with every space turned into an underscore.

    Args:
        file_name (str): The filename.

    Returns:
        str: The filename in which each " " is replaced by "_".
    """
    return "_".join(file_name.split(" "))

def process_alkane_ri_file(file_path: str, out_path: str) -> None:
"""Processes an alkane file, keeping and renaming specific columns.
Expand Down Expand Up @@ -90,3 +191,56 @@ def is_not_empty(x: str) -> bool:

tokens: list[str] = list(filter(is_not_empty, file_name.split('_')))
return len(tokens) > 1 and tokens[-1].isdigit()



def add_local_order(file_name: str) -> int:
    """Extracts the localOrder value from a file name.

    The value is the digit part returned by separate_filename, converted
    to an integer.

    Args:
        file_name (str): The filename.

    Returns:
        int: The localOrder value.
    """
    digits = separate_filename(file_name)[1]
    return int(digits)

def add_sequence_identifier(file_name: str) -> str:
    """Extracts the sequenceIdentifier value from a file name.

    The value is the prefix returned by separate_filename with trailing
    underscores removed and surrounding whitespace stripped.

    Args:
        file_name (str): The filename.

    Returns:
        str: The sequenceIdentifier value.
    """
    prefix = separate_filename(file_name)[0]
    # Underscores are removed first so that whitespace sitting in front of
    # them is exposed to the subsequent strip.
    return prefix.rstrip('_').strip()

def separate_filename(file_name: str) -> Tuple[str, str]:
    """Splits a file name into a leading part and a run of digits.

    Only the first regex match is used; e.g. "1_QC_1" yields ("1_QC_", "1").

    Args:
        file_name (str): The filename.

    Returns:
        Tuple[str, str]: The prefix and the digits that follow it.

    Raises:
        IndexError: If the file name contains no match (e.g. no digits).
    """
    pattern = re.compile(r'(.*(?:\D|^))(\d+)')
    first_match = pattern.findall(file_name)[0]
    prefix, digits = first_match
    return prefix, digits

def add_subject_identifier(file_name: str) -> str:
    """Extracts the subjectIdentifier value from a file name.

    The value is the text between a leading "<digits>_" and a trailing
    "_<digits>", with surrounding whitespace stripped.

    Args:
        file_name (str): The filename.

    Returns:
        str: The subjectIdentifier value.

    Raises:
        IndexError: If the file name does not match the expected pattern.
    """
    matches = re.findall(r'(\d+_)(.*)(_\d+)', file_name)
    subject = matches[0][1]
    return subject.strip()
138 changes: 133 additions & 5 deletions tests/test_process_metadata_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,17 @@
from typing import Final
import pandas as pd
import pytest
from rcx_tk.process_metadata_file import add_local_order
from rcx_tk.process_metadata_file import add_sequence_identifier
from rcx_tk.process_metadata_file import add_subject_identifier
from rcx_tk.process_metadata_file import process_alkane_ri_file
from rcx_tk.process_metadata_file import process_metadata_file
from rcx_tk.process_metadata_file import read_file
from rcx_tk.process_metadata_file import replace_spaces
from rcx_tk.process_metadata_file import save_dataframe_as_tsv
from rcx_tk.process_metadata_file import separate_filename
from rcx_tk.process_metadata_file import validate_filename
from rcx_tk.process_metadata_file import validate_injection_order

# Absolute, resolved path of the directory that contains this test module.
__location__: Final[Path] = Path(__file__).parent.resolve()

Expand Down Expand Up @@ -89,9 +95,9 @@ def processed_dataframe() -> pd.DataFrame:
"11_QC_16_11",
"12_QC_8_12",
"15_QC_non-dilute_15",
"18_QC_4_18",
"18_QC_4__18",
"19_QC_8_19",
"29_instrument_blank_29",
"29_instrument_blank_29"
],
"sampleType": [
"Standard",
Expand All @@ -109,12 +115,49 @@ def processed_dataframe() -> pd.DataFrame:
"class": [3, 5, 3, 6, 2, 2, 2, 2, 2, 2, 3],
"batch": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
"injectionOrder": [1, 4, 6, 7, 8, 11, 12, 15, 18, 19, 29],
"sequenceIdentifier" : [
"1_instrumental blank",
"4_Alkane mix",
"6_instrumental blank",
"7_procedural blank",
"8_QC non-dilute",
"11_QC 16",
"12_QC 8",
"15_QC non-dilute",
"18_QC 4",
"19_QC 8",
"29_instrument blank"
],
"subjectIdentifier": [
"instrumental blank",
"Alkane mix",
"instrumental blank",
"procedural blank",
"QC non-dilute",
"QC 16",
"QC 8",
"QC non-dilute",
"QC 4",
"QC 8",
"instrument blank"
],
"localOrder": [
1,
4,
6,
7,
8,
11,
12,
15,
18,
19,
29
]
}

return pd.DataFrame(data=d)



@pytest.fixture
def alkanes() -> pd.DataFrame:
"""Creates a dataframe corresponding to processed alkane file.
Expand Down Expand Up @@ -192,7 +235,6 @@ def test_read_save_dataframe_as_tsv_error(dataframe: pd.DataFrame, tmp_path: str
save_dataframe_as_tsv(dataframe, out_path)


@pytest.mark.skip(reason="Test fails due to a inconsistency in the input file (metadata)")
def test_process_metadata_file(processed_dataframe: pd.DataFrame, tmp_path: str):
"""Tests processing the metadata file.
Expand Down Expand Up @@ -246,3 +288,89 @@ def test_process_metadata_file_raise_columns_missing(tmp_path: str):
def test_validate_filename(file_name: str, expected: bool):
    """Checks that validate_filename gives the expected verdict for a file name."""
    verdict = validate_filename(file_name)
    assert verdict == expected

@pytest.mark.parametrize("file_name, expected", [
    ("18_QC 4 _18", 18),
    ("1_QC_1", 1),
])
def test_add_localOrder(file_name: str, expected: int):
    """Checks the localOrder value derived from a file name.

    Args:
        file_name (str): The filename.
        expected (int): The expected localOrder value.
    """
    assert add_local_order(file_name) == expected

@pytest.mark.parametrize("file_name, expected", [
    ("18_QC 4 _18", "18_QC 4"),
    ("1_QC_1", "1_QC"),
])
def test_add_sequenceIdentifier(file_name: str, expected: str):
    """Checks the sequenceIdentifier value derived from a file name.

    Args:
        file_name (str): The filename.
        expected (str): The expected sequenceIdentifier value.
    """
    assert add_sequence_identifier(file_name) == expected

@pytest.mark.parametrize("file_name, expected", [
    ("18_QC 4 _18", "QC 4"),
    ("1_QC_1", "QC"),
    ("11_QC 16_11", "QC 16"),
])
def test_add_subjectIdentifier(file_name: str, expected: str):
    """Checks the subjectIdentifier value derived from a file name.

    Args:
        file_name (str): The filename.
        expected (str): The expected subjectIdentifier value.
    """
    assert add_subject_identifier(file_name) == expected

@pytest.mark.parametrize("file_name, expected", [
    ("18_QC 4 _18", "18_QC_4__18"),
    ("1_QC_1", "1_QC_1"),
])
def test_replace_fileName(file_name: str, expected: str):
    """Checks that replace_spaces swaps spaces for underscores.

    Args:
        file_name (str): The filename.
        expected (str): The filename with spaces replaced by underscores.
    """
    assert replace_spaces(file_name) == expected


@pytest.mark.parametrize("file_name, expected", [
    ("18_QC 4 _18", ("18_QC 4 _", "18")),
    ("1_QC_1", ("1_QC_", "1")),
])
def test_separate_filename(file_name: str, expected: str):
    """Checks how separate_filename splits a file name.

    Args:
        file_name (str): The filename.
        expected (str): The expected split, given as a (prefix, digits) pair.
    """
    assert separate_filename(file_name) == expected

@pytest.mark.parametrize("dataFrame, expected", [
    (pd.DataFrame({"injectionOrder": [1, 4, 5]}), True),
    (pd.DataFrame({"injectionOrder": ["1", None, 5]}), False),
])
def test_validateInjectionOrder(dataFrame: pd.DataFrame, expected: bool):
    """Checks the injection order validation.

    Args:
        dataFrame (pd.DataFrame): A dataframe with an injectionOrder column.
        expected (bool): True if the column is of integer type, False otherwise.
    """
    is_valid = validate_injection_order(dataFrame)
    assert is_valid == expected

0 comments on commit 461da35

Please sign in to comment.