diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py
index 64260ad6..a1571988 100644
--- a/spras/analysis/ml.py
+++ b/spras/analysis/ml.py
@@ -37,7 +37,7 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra
     edge_tuples = []
     for file in file_paths:
         try:
-            # collecting and sorting the edge pairs per algortihm
+            # collecting and sorting the edge pairs per algorithm
             with open(file, 'r') as f:
                 lines = f.readlines()
 
@@ -61,9 +61,8 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra
             p = PurePath(file)
             edge_tuples.append((p.parts[-2], edges))
 
-        except FileNotFoundError:
-            print(file, ' not found during ML analysis') # should not hit this
-            continue
+        except FileNotFoundError as exc:
+            raise FileNotFoundError(str(file) + ' not found during ML analysis') from exc
 
     # initially construct separate dataframes per algorithm
     edge_dataframes = []
@@ -81,8 +80,19 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra
     concated_df = pd.concat(edge_dataframes, axis=1, join='outer')
     concated_df = concated_df.fillna(0)
     concated_df = concated_df.astype('int64')
+
+    # don't do ml post-processing if there is an empty dataframe or the number of samples is <= 1
+    if concated_df.empty:
+        raise ValueError("ML post-processing cannot proceed because the summarize network dataframe is empty.\nWe "
+                         "suggest setting ml include: false in the configuration file to avoid this error.")
+    if min(concated_df.shape) <= 1:
+        raise ValueError(f"ML post-processing cannot proceed because the available number of pathways is insufficient. "
+                         f"The ml post-processing requires more than one pathway, but currently "
+                         f"there are only {min(concated_df.shape)} pathways.")
+
     return concated_df
 
+
 def create_palette(column_names):
     """
     Generates a dictionary mapping each column name (algorithm name)
@@ -141,7 +151,7 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord:
 
     # saving the coordinates of each algorithm
     make_required_dirs(output_coord)
-    coordinates_df = pd.DataFrame(X_pca, columns = ['PC' + str(i) for i in range(1, components+1)])
+    coordinates_df = pd.DataFrame(X_pca, columns=['PC' + str(i) for i in range(1, components+1)])
     coordinates_df.insert(0, 'algorithm', columns.tolist())
     coordinates_df.to_csv(output_coord, sep='\t', index=False)
 
diff --git a/test/ml/input/test-data-single/single.txt b/test/ml/input/test-data-single/single.txt
new file mode 100644
index 00000000..30397283
--- /dev/null
+++ b/test/ml/input/test-data-single/single.txt
@@ -0,0 +1 @@
+L M 1 U
diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py
index fa868a59..6e8132b2 100644
--- a/test/ml/test_ml.py
+++ b/test/ml/test_ml.py
@@ -2,6 +2,7 @@
 from pathlib import Path
 
 import pandas as pd
+import pytest
 
 import spras.analysis.ml as ml
 
@@ -25,6 +26,14 @@ def test_summarize_networks(self):
         dataframe.to_csv(OUT_DIR + 'dataframe.csv')
         assert filecmp.cmp(OUT_DIR + 'dataframe.csv', EXPECT_DIR + 'expected-dataframe.csv', shallow=False)
 
+    def test_summarize_networks_empty(self):
+        with pytest.raises(ValueError): #raises error if empty dataframe is used for post processing
+            ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt'])
+
+    def test_single_line(self):
+        with pytest.raises(ValueError): #raises error if single line in file s.t. single row in dataframe is used for post processing
+            ml.summarize_networks([INPUT_DIR + 'test-data-single/single.txt'])
+
     def test_pca(self):
         dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt'])
         ml.pca(dataframe, OUT_DIR + 'pca.png', OUT_DIR + 'pca-variance.txt',