Skip to content

Commit

Permalink
Column assignment: write tests, optimize variable names and add docst…
Browse files Browse the repository at this point in the history
…ring, add messages for special cases
  • Loading branch information
henninggaertner committed Nov 17, 2023
1 parent 17ff747 commit ddb8d04
Show file tree
Hide file tree
Showing 4 changed files with 102 additions and 43 deletions.
26 changes: 10 additions & 16 deletions protzilla/constants/workflow_meta.json
Original file line number Diff line number Diff line change
Expand Up @@ -83,34 +83,28 @@
},
"metadata_column_assignment": {
"name": "Metadata Column Assignment",
"description": "Assign columns to metadata categories",
"description": "Assign columns to metadata categories, repeatable for each category",
"parameters": {
"metadata_df": {
"type": "empty",
"name": "metadata_df",
"default": null
},
"metadata_sample_column": {
"name": "Sample Column",
"fill": "metadata_columns",
"metadata_required_column": {
"name": "Missing, but required Metadata Columns",
"fill": "metadata_required_columns",
"type": "categorical",
"categories": [],
"default": "Sample"
},
"metadata_group_column": {
"name": "Group Column",
"fill": "metadata_columns",
"type": "categorical",
"categories": [],
"default": "Group"
"default": null
},
"metadata_batch_column": {
"name": "Batch Column",
"fill": "metadata_columns",
"metadata_unknown_column": {
"name": "Existing, but unknown Metadata Columns",
"fill": "metadata_unknown_columns",
"type": "categorical",
"categories": [],
"default": "Batch"
"default": null
}

}
}
},
Expand Down
57 changes: 32 additions & 25 deletions protzilla/importing/metadata_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,32 +60,39 @@ def metadata_import_method(df, file_path, feature_orientation):


def metadata_column_assignment(
df,
metadata_df,
metadata_sample_column,
metadata_group_column,
metadata_batch_column,
df: pd.DataFrame,
metadata_df: pd.DataFrame,
metadata_required_column: str = None,
metadata_unknown_column: str = None,
):
assert (
metadata_sample_column in metadata_df.columns
), f"Sample column {metadata_sample_column} not found in metadata file."
assert (
metadata_group_column in metadata_df.columns
), f"Group column {metadata_group_column} not found in metadata file."
assert (
metadata_batch_column in metadata_df.columns
), f"Batch column {metadata_batch_column} not found in metadata file."
# assert that all columns parameters are unique
assert (
len({metadata_sample_column, metadata_group_column, metadata_batch_column}) == 3
), "The columns parameters must be unique."
"""
This function renames a column in the metadata dataframe to the required column name.
rename_dict = {
metadata_sample_column: "Sample",
metadata_group_column: "Group",
metadata_batch_column: "Batch",
}
# rename given in metadata_sample_column column to "Sample" if it is called otherwise
renamed_metadata_df = metadata_df.rename(columns=rename_dict, inplace=True)
:param df: this is passed for consistency, but not used
:type df: pandas DataFrame
:param metadata_df: the metadata dataframe to be changed
:type metadata_df: pandas DataFrame
:param metadata_required_column: the name of the column in the dataframe that is used for the metadata assignment
:type metadata_required_column: str
:param metadata_unknown_column: the name of the column in the metadata dataframe that is renamed to the
required column name
:type metadata_unknown_column: str
:return: returns the unchanged dataframe and a dict with messages, potentially empty if no messages
:rtype: pd.DataFrame, dict
"""

# TODO add info box in UI explaining that no option for unknown columns means all columns are named correctly
# check if required column already in metadata, if so give error message
if metadata_required_column is None or metadata_unknown_column is None:
msg = f"You can proceed, as there is nothing that needs to be changed."
return df, dict(messages=[dict(level=messages.INFO, msg=msg)])

if metadata_required_column in metadata_df.columns:
msg = f"Metadata already contains column '{metadata_required_column}'. \
Please rename the column or select another column."
return df, dict(messages=[dict(level=messages.ERROR, msg=msg)])
# rename the column named by metadata_unknown_column to the name given in metadata_required_column
renamed_metadata_df = metadata_df.rename(
columns={metadata_unknown_column: metadata_required_column}, inplace=True
)
return df, dict()
18 changes: 16 additions & 2 deletions protzilla/run_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,22 @@ def insert_special_params(param_dict, run):
param_dict["categories"] = run.metadata.columns[
run.metadata.columns != "Sample"
].unique()
elif param_dict["fill"] == "metadata_columns":
param_dict["categories"] = run.metadata.columns.unique()
elif param_dict["fill"] == "metadata_unknown_columns":
# give selection of existing columns without ["Sample", "Group", "Batch"]
# as they are already named correctly for our purposes
param_dict["categories"] = run.metadata.columns[
~run.metadata.columns.isin(["Sample", "Group", "Batch"])
].unique()

elif param_dict["fill"] == "metadata_required_columns":
# TODO add other possible metadata columns
# exclude columns that are already in metadata and known to be required
param_dict["categories"] = [
col
for col in ["Sample", "Group", "Batch"]
if col not in run.metadata.columns
]

elif param_dict["fill"] == "metadata_column_data":
# per default fill with second column data since it is selected in dropdown
param_dict["categories"] = run.metadata.iloc[:, 1].unique()
Expand Down
44 changes: 44 additions & 0 deletions tests/protzilla/importing/test_metadata_import.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from shutil import rmtree

import pandas as pd
from django.contrib import messages

from protzilla.constants.paths import PROJECT_PATH, RUNS_PATH
from protzilla.importing import metadata_import
Expand Down Expand Up @@ -42,3 +43,46 @@ def test_metadata_orientation():
pd.testing.assert_frame_equal(run1.metadata, run2.metadata)
rmtree(RUNS_PATH / name1)
rmtree(RUNS_PATH / name2)


def test_metadata_column_assignment():
    """Exercise ``metadata_column_assignment`` on an imported metadata table.

    Steps:
    1. rename "Sample" to "Sample_renamed" and back, checking the first column
       name after each call (the function renames ``run.metadata`` in place);
    2. request a rename that is expected to be rejected and check that an
       ERROR message with non-empty text is returned;
    3. call the function with empty-string arguments as a smoke check.

    The run directory is removed in a ``finally`` block so a failing assertion
    does not leave it behind in ``RUNS_PATH``.
    """
    name = "test_run" + random_string()
    run = Run.create(name)
    try:
        run.step_index += 1
        run.calculate_and_next(
            metadata_import.metadata_import_method,
            file_path=f"{PROJECT_PATH}/tests/metadata_cut_columns.csv",
            feature_orientation="Columns (samples in rows, features in columns)",
        )
        # this is a workaround because the metadata is not passed properly using calculate_and_next,
        # TODO but it works in the UI, it would be better to fix this
        metadata_import.metadata_column_assignment(
            df=run.df,
            metadata_df=run.metadata,
            metadata_required_column="Sample_renamed",
            metadata_unknown_column="Sample",
        )
        assert run.metadata.columns[0] == "Sample_renamed"

        # rename back so the metadata has its "Sample" column again
        metadata_import.metadata_column_assignment(
            df=run.df,
            metadata_df=run.metadata,
            metadata_required_column="Sample",
            metadata_unknown_column="Sample_renamed",
        )
        assert run.metadata.columns[0] == "Sample"

        # NOTE(review): the ERROR is presumably triggered because the fixture
        # already contains a "Group" column -- verify against the CSV.
        _, out = metadata_import.metadata_column_assignment(
            df=run.df,
            metadata_df=run.metadata,
            metadata_required_column="Group",
            metadata_unknown_column="Sample",
        )
        assert out["messages"][0]["level"] == messages.ERROR
        assert out["messages"][0]["msg"]

        # Smoke check with empty-string arguments: the call must not raise.
        # NOTE(review): the original test made no assertion on this result;
        # only the return type is checked here -- confirm the intended
        # expectation (e.g. an INFO message) and tighten if needed.
        _, out_new = metadata_import.metadata_column_assignment(
            df=run.df,
            metadata_df=run.metadata,
            metadata_required_column="",
            metadata_unknown_column="",
        )
        assert isinstance(out_new, dict)
    finally:
        rmtree(RUNS_PATH / name)

0 comments on commit ddb8d04

Please sign in to comment.