From ddb8d04d3b6f42d759d2126e9f9be7ca3e61f74b Mon Sep 17 00:00:00 2001 From: henninggaertner Date: Fri, 17 Nov 2023 17:31:29 +0100 Subject: [PATCH] Column assignment: write tests, optimize variable names and add docstring, add messages for special cases --- protzilla/constants/workflow_meta.json | 26 ++++----- protzilla/importing/metadata_import.py | 57 +++++++++++-------- protzilla/run_helper.py | 18 +++++- .../importing/test_metadata_import.py | 44 ++++++++++++++ 4 files changed, 102 insertions(+), 43 deletions(-) diff --git a/protzilla/constants/workflow_meta.json b/protzilla/constants/workflow_meta.json index 1c8df8aa3..fd685f864 100644 --- a/protzilla/constants/workflow_meta.json +++ b/protzilla/constants/workflow_meta.json @@ -83,34 +83,28 @@ }, "metadata_column_assignment": { "name": "Metadata Column Assignment", - "description": "Assign columns to metadata categories", + "description": "Assign columns to metadata categories, repeatable for each category", "parameters": { "metadata_df": { "type": "empty", "name": "metadata_df", "default": null }, - "metadata_sample_column": { - "name": "Sample Column", - "fill": "metadata_columns", + "metadata_required_column": { + "name": "Missing, but required Metadata Columns", + "fill": "metadata_required_columns", "type": "categorical", "categories": [], - "default": "Sample" - }, - "metadata_group_column": { - "name": "Group Column", - "fill": "metadata_columns", - "type": "categorical", - "categories": [], - "default": "Group" + "default": null }, - "metadata_batch_column": { - "name": "Batch Column", - "fill": "metadata_columns", + "metadata_unknown_column": { + "name": "Existing, but unknown Metadata Columns", + "fill": "metadata_unknown_columns", "type": "categorical", "categories": [], - "default": "Batch" + "default": null } + } } }, diff --git a/protzilla/importing/metadata_import.py b/protzilla/importing/metadata_import.py index ed596e882..6a82a1bd7 100644 --- a/protzilla/importing/metadata_import.py +++ 
b/protzilla/importing/metadata_import.py @@ -60,32 +60,39 @@ def metadata_import_method(df, file_path, feature_orientation): def metadata_column_assignment( - df, - metadata_df, - metadata_sample_column, - metadata_group_column, - metadata_batch_column, + df: pd.DataFrame, + metadata_df: pd.DataFrame, + metadata_required_column: str = None, + metadata_unknown_column: str = None, ): - assert ( - metadata_sample_column in metadata_df.columns - ), f"Sample column {metadata_sample_column} not found in metadata file." - assert ( - metadata_group_column in metadata_df.columns - ), f"Group column {metadata_group_column} not found in metadata file." - assert ( - metadata_batch_column in metadata_df.columns - ), f"Batch column {metadata_batch_column} not found in metadata file." - # assert that all columns parameters are unique - assert ( - len({metadata_sample_column, metadata_group_column, metadata_batch_column}) == 3 - ), "The columns parameters must be unique." + """ + This function renames a column in the metadata dataframe to the required column name. 
- rename_dict = { - metadata_sample_column: "Sample", - metadata_group_column: "Group", - metadata_batch_column: "Batch", - } - # rename given in metadata_sample_column column to "Sample" if it is called otherwise - renamed_metadata_df = metadata_df.rename(columns=rename_dict, inplace=True) + :param df: this is passed for consistency, but not used + :type df: pandas DataFrame + :param metadata_df: the metadata dataframe to be changed + :type metadata_df: pandas DataFrame + :param metadata_required_column: the name of the column in the dataframe that is used for the metadata assignment + :type metadata_required_column: str + :param metadata_unknown_column: the name of the column in the metadata dataframe that is renamed to the + required column name + :type metadata_unknown_column: str + :return: returns the unchanged dataframe and a dict with messages, potentially empty if no messages + :rtype: pd.DataFrame, dict + """ + + # TODO add info box in UI explaining that no option for unknown columns means all columns are named correctly + # check if required column already in metadata, if so give error message + if metadata_required_column is None or metadata_unknown_column is None: + msg = f"You can proceed, as there is nothing that needs to be changed." + return df, dict(messages=[dict(level=messages.INFO, msg=msg)]) + if metadata_required_column in metadata_df.columns: + msg = f"Metadata already contains column '{metadata_required_column}'. \ + Please rename the column or select another column." 
+ return df, dict(messages=[dict(level=messages.ERROR, msg=msg)]) + # rename the column given in metadata_unknown_column to the name given in metadata_required_column + renamed_metadata_df = metadata_df.rename( + columns={metadata_unknown_column: metadata_required_column}, inplace=True + ) return df, dict() diff --git a/protzilla/run_helper.py b/protzilla/run_helper.py index ff1f6c3a7..6ffc695aa 100644 --- a/protzilla/run_helper.py +++ b/protzilla/run_helper.py @@ -29,8 +29,22 @@ def insert_special_params(param_dict, run): param_dict["categories"] = run.metadata.columns[ run.metadata.columns != "Sample" ].unique() - elif param_dict["fill"] == "metadata_columns": - param_dict["categories"] = run.metadata.columns.unique() + elif param_dict["fill"] == "metadata_unknown_columns": + # give selection of existing columns without ["Sample", "Group", "Batch"] + # as they are already named correctly for our purposes + param_dict["categories"] = run.metadata.columns[ + ~run.metadata.columns.isin(["Sample", "Group", "Batch"]) + ].unique() + + elif param_dict["fill"] == "metadata_required_columns": + # TODO add other possible metadata columns + # exclude columns that are already in metadata and known to be required + param_dict["categories"] = [ + col + for col in ["Sample", "Group", "Batch"] + if col not in run.metadata.columns + ] + elif param_dict["fill"] == "metadata_column_data": # per default fill with second column data since it is selected in dropdown param_dict["categories"] = run.metadata.iloc[:, 1].unique() diff --git a/tests/protzilla/importing/test_metadata_import.py b/tests/protzilla/importing/test_metadata_import.py index 7ed1d1785..5f29ce38f 100644 --- a/tests/protzilla/importing/test_metadata_import.py +++ b/tests/protzilla/importing/test_metadata_import.py @@ -1,6 +1,7 @@ from shutil import rmtree import pandas as pd +from django.contrib import messages from protzilla.constants.paths import PROJECT_PATH, RUNS_PATH from protzilla.importing import metadata_import @@ -42,3 
+43,46 @@ def test_metadata_orientation(): pd.testing.assert_frame_equal(run1.metadata, run2.metadata) rmtree(RUNS_PATH / name1) rmtree(RUNS_PATH / name2) + + +def test_metadata_column_assignment(): + name = "test_run" + random_string() + run = Run.create(name) + run.step_index += 1 + run.calculate_and_next( + metadata_import.metadata_import_method, + file_path=f"{PROJECT_PATH}/tests/metadata_cut_columns.csv", + feature_orientation="Columns (samples in rows, features in columns)", + ) + # this is a workaround because the metadata is not passed properly using calculate_and_next, + # TODO but it works in the UI, it would be better to fix this + metadata_import.metadata_column_assignment( + df=run.df, + metadata_df=run.metadata, + metadata_required_column="Sample_renamed", + metadata_unknown_column="Sample", + ) + assert run.metadata.columns[0] == "Sample_renamed" + metadata_import.metadata_column_assignment( + df=run.df, + metadata_df=run.metadata, + metadata_required_column="Sample", + metadata_unknown_column="Sample_renamed", + ) + assert run.metadata.columns[0] == "Sample" + df, out = metadata_import.metadata_column_assignment( + df=run.df, + metadata_df=run.metadata, + metadata_required_column="Group", + metadata_unknown_column="Sample", + ) + assert out["messages"][0]["level"] == messages.ERROR + assert out["messages"][0]["msg"] + df_new, out_new = metadata_import.metadata_column_assignment( + df=run.df, + metadata_df=run.metadata, + metadata_required_column="", + metadata_unknown_column="", + ) + + rmtree(RUNS_PATH / name)