From f223dc64f5cab58c0239b509553eb4df82244c27 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Fri, 22 Dec 2023 13:17:48 -0800 Subject: [PATCH 01/13] fix this bug: https://github.com/Reed-CompBio/spras/issues/117 --- spras/analysis/ml.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index 64260ad6..05edde4e 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -112,6 +112,7 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord: df = df.transpose() # based on the algorithms rather than the edges X = df.values + min_shape = min(df.shape) if components < 2: raise ValueError(f"components={components} must be greater than or equal to 2 in the config file.") @@ -121,6 +122,10 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord: if not isinstance(labels, bool): raise ValueError(f"labels={labels} must be True or False") + if df.empty: + raise ValueError("The summarize network dataFrame is empty.\nEnsure that the output files and configuration parameters are correct and non-empty to produce a non-empty dataframe for PCA.") + + scaler = StandardScaler() scaler.fit(X) # calc mean and standard deviation X_scaled = scaler.transform(X) From 3df91181a0b6bed6327dce2c1420cfb85809b2ce Mon Sep 17 00:00:00 2001 From: ntalluri Date: Sat, 23 Dec 2023 13:08:36 -0800 Subject: [PATCH 02/13] changed placement of raising the error and added a test case --- spras/analysis/ml.py | 10 +++++----- test/ml/test_ml.py | 5 +++++ 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index 05edde4e..bbf6493a 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -81,6 +81,11 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra concated_df = pd.concat(edge_dataframes, axis=1, join='outer') concated_df = concated_df.fillna(0) concated_df = concated_df.astype('int64') + + # don't do ml post 
processing if there is an empty dataframe + if concated_df.empty: + raise ValueError("The summarize network dataFrame is empty.\nEnsure that the output files and configuration parameters are correct and non-empty to produce a non-empty dataframe for ml post processing.") + return concated_df def create_palette(column_names): @@ -112,7 +117,6 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord: df = df.transpose() # based on the algorithms rather than the edges X = df.values - min_shape = min(df.shape) if components < 2: raise ValueError(f"components={components} must be greater than or equal to 2 in the config file.") @@ -122,10 +126,6 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord: if not isinstance(labels, bool): raise ValueError(f"labels={labels} must be True or False") - if df.empty: - raise ValueError("The summarize network dataFrame is empty.\nEnsure that the output files and configuration parameters are correct and non-empty to produce a non-empty dataframe for PCA.") - - scaler = StandardScaler() scaler.fit(X) # calc mean and standard deviation X_scaled = scaler.transform(X) diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py index fa868a59..61fe69c5 100644 --- a/test/ml/test_ml.py +++ b/test/ml/test_ml.py @@ -2,6 +2,7 @@ from pathlib import Path import pandas as pd +import pytest import spras.analysis.ml as ml @@ -25,6 +26,10 @@ def test_summarize_networks(self): dataframe.to_csv(OUT_DIR + 'dataframe.csv') assert filecmp.cmp(OUT_DIR + 'dataframe.csv', EXPECT_DIR + 'expected-dataframe.csv', shallow=False) + def test_summarize_networks_empty(self): + with pytest.raises(ValueError): #raises error if empty dataframe is used for post processing + ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) + def test_pca(self): dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt']) ml.pca(dataframe, 
OUT_DIR + 'pca.png', OUT_DIR + 'pca-variance.txt', From 2d67e95e3eac70994a6fb6c58a55b69167f9cd1d Mon Sep 17 00:00:00 2001 From: ntalluri Date: Sat, 23 Dec 2023 13:10:34 -0800 Subject: [PATCH 03/13] precommit --- spras/analysis/ml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index bbf6493a..374aef4f 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -81,7 +81,7 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra concated_df = pd.concat(edge_dataframes, axis=1, join='outer') concated_df = concated_df.fillna(0) concated_df = concated_df.astype('int64') - + # don't do ml post processing if there is an empty dataframe if concated_df.empty: raise ValueError("The summarize network dataFrame is empty.\nEnsure that the output files and configuration parameters are correct and non-empty to produce a non-empty dataframe for ml post processing.") From 51e5a7633dd751820fc47b774c477bae40a87c06 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Tue, 26 Dec 2023 11:58:29 -0800 Subject: [PATCH 04/13] Fixed https://github.com/Reed-CompBio/spras/issues/131 --- spras/analysis/ml.py | 6 ++++-- test/ml/input/test-data-single/single.txt | 1 + test/ml/test_ml.py | 4 ++++ 3 files changed, 9 insertions(+), 2 deletions(-) create mode 100644 test/ml/input/test-data-single/single.txt diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index 374aef4f..cfc3c3f6 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -82,9 +82,11 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra concated_df = concated_df.fillna(0) concated_df = concated_df.astype('int64') - # don't do ml post processing if there is an empty dataframe + # don't do ml post processing if there is an empty dataframe or the number of samples is <= 1 if concated_df.empty: - raise ValueError("The summarize network dataFrame is empty.\nEnsure that the output files and configuration 
parameters are correct and non-empty to produce a non-empty dataframe for ml post processing.") + raise ValueError("ML post-processing cannot proceed because the summarize network dataFrame is empty.\nCheck that the output files and configuration parameters are correct and non-empty to produce a non-empty dataframe for ml post processing.") + if min(concated_df.shape) <= 1: + raise ValueError (f"ML post-processing cannot proceed as the available number of samples is insufficient. The process requires more than one sample, but currently, there are only {min(concated_df.shape)} samples.") return concated_df diff --git a/test/ml/input/test-data-single/single.txt b/test/ml/input/test-data-single/single.txt new file mode 100644 index 00000000..30397283 --- /dev/null +++ b/test/ml/input/test-data-single/single.txt @@ -0,0 +1 @@ +L M 1 U diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py index 61fe69c5..9eb6795d 100644 --- a/test/ml/test_ml.py +++ b/test/ml/test_ml.py @@ -30,6 +30,10 @@ def test_summarize_networks_empty(self): with pytest.raises(ValueError): #raises error if empty dataframe is used for post processing ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) + def test_summarize_networks_empty(self): + with pytest.raises(ValueError): #raises error if empty dataframe is used for post processing + ml.summarize_networks([INPUT_DIR + 'test-data-single/single.txt']) + def test_pca(self): dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt']) ml.pca(dataframe, OUT_DIR + 'pca.png', OUT_DIR + 'pca-variance.txt', From 5b4e4b6da03f1852b0f0e6466a5536a0a2894fd6 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Fri, 9 Feb 2024 13:58:05 -0600 Subject: [PATCH 05/13] updated errors --- spras/analysis/ml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index cfc3c3f6..61891437 100644 --- a/spras/analysis/ml.py +++ 
b/spras/analysis/ml.py @@ -84,9 +84,9 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra # don't do ml post processing if there is an empty dataframe or the number of samples is <= 1 if concated_df.empty: - raise ValueError("ML post-processing cannot proceed because the summarize network dataFrame is empty.\nCheck that the output files and configuration parameters are correct and non-empty to produce a non-empty dataframe for ml post processing.") + raise ValueError("ML post-processing cannot proceed because the summarize network dataFrame is empty.\nWe suggest setting ml's include: true in the configuration file to false to avoid this error.") if min(concated_df.shape) <= 1: - raise ValueError (f"ML post-processing cannot proceed as the available number of samples is insufficient. The process requires more than one sample, but currently, there are only {min(concated_df.shape)} samples.") + raise ValueError (f"ML post-processing cannot proceed as the available number of pathways is insufficient. 
The process requires more than one sample, but currently, there are only {min(concated_df.shape)} pathways.") return concated_df From d6fe57a538ab1982de6d7047da36332122e4b0fd Mon Sep 17 00:00:00 2001 From: ntalluri Date: Fri, 9 Feb 2024 14:01:00 -0600 Subject: [PATCH 06/13] updated errors --- spras/analysis/ml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index 61891437..77b73be3 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -86,7 +86,7 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra if concated_df.empty: raise ValueError("ML post-processing cannot proceed because the summarize network dataFrame is empty.\nWe suggest setting ml's include: true in the configuration file to false to avoid this error.") if min(concated_df.shape) <= 1: - raise ValueError (f"ML post-processing cannot proceed as the available number of pathways is insufficient. The process requires more than one sample, but currently, there are only {min(concated_df.shape)} pathways.") + raise ValueError (f"ML post-processing cannot proceed because the available number of pathways is insufficient. 
The ml post processing requires more than one pathway, but currently, there are only {min(concated_df.shape)} pathways.") return concated_df From 741d104f6240fe6c4a346338a6d27ba28d3af328 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Fri, 9 Feb 2024 14:31:29 -0600 Subject: [PATCH 07/13] added empty test case --- test/ml/test_ml.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py index 9eb6795d..7c2ca77c 100644 --- a/test/ml/test_ml.py +++ b/test/ml/test_ml.py @@ -67,3 +67,7 @@ def test_ensemble_network(self): expected = expected.round(5) assert en.equals(expected) + + def test_summarize_networks_empty(self): + with pytest.raises(ValueError): + ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) From 45b266b3ab65ca52182bd720a0c68d9c03525320 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Fri, 9 Feb 2024 14:33:33 -0600 Subject: [PATCH 08/13] added single pathway test --- test/ml/input/test-data-s4/s4.txt | 1 + test/ml/test_ml.py | 4 ++++ 2 files changed, 5 insertions(+) create mode 100644 test/ml/input/test-data-s4/s4.txt diff --git a/test/ml/input/test-data-s4/s4.txt b/test/ml/input/test-data-s4/s4.txt new file mode 100644 index 00000000..30397283 --- /dev/null +++ b/test/ml/input/test-data-s4/s4.txt @@ -0,0 +1 @@ +L M 1 U diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py index 7c2ca77c..f60de43a 100644 --- a/test/ml/test_ml.py +++ b/test/ml/test_ml.py @@ -71,3 +71,7 @@ def test_ensemble_network(self): def test_summarize_networks_empty(self): with pytest.raises(ValueError): ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) + + def test_summarize_networks_single_pathway(self): + with pytest.raises(ValueError): + ml.summarize_networks([INPUT_DIR + 'test-data-s4/s4.txt']) From 6744c68ade136e18aa163a8ba7e9f2c3cae35643 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Mon, 18 Mar 2024 16:52:20 -0500 Subject: [PATCH 09/13] update ml and test_ml --- spras/analysis/ml.py | 4 ++-- test/ml/test_ml.py | 13 
+++---------- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index 77b73be3..4dfe0ce6 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -84,9 +84,9 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra # don't do ml post processing if there is an empty dataframe or the number of samples is <= 1 if concated_df.empty: - raise ValueError("ML post-processing cannot proceed because the summarize network dataFrame is empty.\nWe suggest setting ml's include: true in the configuration file to false to avoid this error.") + raise OSError("ML post-processing cannot proceed because the summarize network dataFrame is empty.\nWe suggest setting ml's include: true in the configuration file to false to avoid this error.") if min(concated_df.shape) <= 1: - raise ValueError (f"ML post-processing cannot proceed because the available number of pathways is insufficient. The ml post processing requires more than one pathway, but currently, there are only {min(concated_df.shape)} pathways.") + raise OSError (f"ML post-processing cannot proceed because the available number of pathways is insufficient. 
The ml post processing requires more than one pathway, but currently, there are only {min(concated_df.shape)} pathways.") return concated_df diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py index f60de43a..b4c1289d 100644 --- a/test/ml/test_ml.py +++ b/test/ml/test_ml.py @@ -27,11 +27,11 @@ def test_summarize_networks(self): assert filecmp.cmp(OUT_DIR + 'dataframe.csv', EXPECT_DIR + 'expected-dataframe.csv', shallow=False) def test_summarize_networks_empty(self): - with pytest.raises(ValueError): #raises error if empty dataframe is used for post processing + with pytest.raises(OSError): #raises error if empty dataframe is used for post processing ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) - def test_summarize_networks_empty(self): - with pytest.raises(ValueError): #raises error if empty dataframe is used for post processing + def test_single_line(self): + with pytest.raises(OSError): #raises error if single line in file ml.summarize_networks([INPUT_DIR + 'test-data-single/single.txt']) def test_pca(self): @@ -68,10 +68,3 @@ def test_ensemble_network(self): assert en.equals(expected) - def test_summarize_networks_empty(self): - with pytest.raises(ValueError): - ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) - - def test_summarize_networks_single_pathway(self): - with pytest.raises(ValueError): - ml.summarize_networks([INPUT_DIR + 'test-data-s4/s4.txt']) From 24987872a3db47d555083f4eba44dda4e37437ec Mon Sep 17 00:00:00 2001 From: ntalluri Date: Mon, 18 Mar 2024 16:57:05 -0500 Subject: [PATCH 10/13] cleaned up --- test/ml/input/test-data-s4/s4.txt | 1 - test/ml/test_ml.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) delete mode 100644 test/ml/input/test-data-s4/s4.txt diff --git a/test/ml/input/test-data-s4/s4.txt b/test/ml/input/test-data-s4/s4.txt deleted file mode 100644 index 30397283..00000000 --- a/test/ml/input/test-data-s4/s4.txt +++ /dev/null @@ -1 +0,0 @@ -L M 1 U diff --git a/test/ml/test_ml.py 
b/test/ml/test_ml.py index b4c1289d..95768817 100644 --- a/test/ml/test_ml.py +++ b/test/ml/test_ml.py @@ -31,7 +31,7 @@ def test_summarize_networks_empty(self): ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) def test_single_line(self): - with pytest.raises(OSError): #raises error if single line in file + with pytest.raises(OSError): #raises error if single line in file s.t. single row in dataframe is used for post processing ml.summarize_networks([INPUT_DIR + 'test-data-single/single.txt']) def test_pca(self): From ee10817c4c520164b4a5707483e8499e7c9d9c03 Mon Sep 17 00:00:00 2001 From: Anthony Gitter Date: Thu, 13 Jun 2024 21:28:55 -0500 Subject: [PATCH 11/13] Apply local formatting suggestions: --- spras/analysis/ml.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index 4dfe0ce6..1c85d3da 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -37,7 +37,7 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra edge_tuples = [] for file in file_paths: try: - # collecting and sorting the edge pairs per algortihm + # collecting and sorting the edge pairs per algorithm with open(file, 'r') as f: lines = f.readlines() @@ -82,14 +82,18 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra concated_df = concated_df.fillna(0) concated_df = concated_df.astype('int64') - # don't do ml post processing if there is an empty dataframe or the number of samples is <= 1 + # don't do ml post-processing if there is an empty dataframe or the number of samples is <= 1 if concated_df.empty: - raise OSError("ML post-processing cannot proceed because the summarize network dataFrame is empty.\nWe suggest setting ml's include: true in the configuration file to false to avoid this error.") + raise OSError("ML post-processing cannot proceed because the summarize network dataframe is empty.\nWe " + "suggest setting ml include: 
false in the configuration file to avoid this error.") if min(concated_df.shape) <= 1: - raise OSError (f"ML post-processing cannot proceed because the available number of pathways is insufficient. The ml post processing requires more than one pathway, but currently, there are only {min(concated_df.shape)} pathways.") + raise OSError(f"ML post-processing cannot proceed because the available number of pathways is insufficient. " + f"The ml post-processing requires more than one pathway, but currently " + f"there are only {min(concated_df.shape)} pathways.") return concated_df + def create_palette(column_names): """ Generates a dictionary mapping each column name (algorithm name) @@ -148,7 +152,7 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord: # saving the coordinates of each algorithm make_required_dirs(output_coord) - coordinates_df = pd.DataFrame(X_pca, columns = ['PC' + str(i) for i in range(1, components+1)]) + coordinates_df = pd.DataFrame(X_pca, columns=['PC' + str(i) for i in range(1, components+1)]) coordinates_df.insert(0, 'algorithm', columns.tolist()) coordinates_df.to_csv(output_coord, sep='\t', index=False) From 1841f6572f62f26968f8c04201217f70ac32c0e1 Mon Sep 17 00:00:00 2001 From: Anthony Gitter Date: Thu, 13 Jun 2024 21:47:18 -0500 Subject: [PATCH 12/13] Switch error type and raise error when file missing --- spras/analysis/ml.py | 9 ++++----- test/ml/test_ml.py | 6 +++--- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index 1c85d3da..a1571988 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -61,9 +61,8 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra p = PurePath(file) edge_tuples.append((p.parts[-2], edges)) - except FileNotFoundError: - print(file, ' not found during ML analysis') # should not hit this - continue + except FileNotFoundError as exc: + raise FileNotFoundError(str(file) + ' not found during 
ML analysis') from exc # initially construct separate dataframes per algorithm edge_dataframes = [] @@ -84,10 +83,10 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra # don't do ml post-processing if there is an empty dataframe or the number of samples is <= 1 if concated_df.empty: - raise OSError("ML post-processing cannot proceed because the summarize network dataframe is empty.\nWe " + raise ValueError("ML post-processing cannot proceed because the summarize network dataframe is empty.\nWe " "suggest setting ml include: false in the configuration file to avoid this error.") if min(concated_df.shape) <= 1: - raise OSError(f"ML post-processing cannot proceed because the available number of pathways is insufficient. " + raise ValueError(f"ML post-processing cannot proceed because the available number of pathways is insufficient. " f"The ml post-processing requires more than one pathway, but currently " f"there are only {min(concated_df.shape)} pathways.") diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py index 95768817..93a9cc43 100644 --- a/test/ml/test_ml.py +++ b/test/ml/test_ml.py @@ -27,11 +27,11 @@ def test_summarize_networks(self): assert filecmp.cmp(OUT_DIR + 'dataframe.csv', EXPECT_DIR + 'expected-dataframe.csv', shallow=False) def test_summarize_networks_empty(self): - with pytest.raises(OSError): #raises error if empty dataframe is used for post processing - ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) + with pytest.raises(ValueError): #raises error if empty dataframe is used for post processing + ml.summarize_networks([INPUT_DIR + 'test-data-empty/emptya.txt']) def test_single_line(self): - with pytest.raises(OSError): #raises error if single line in file s.t. single row in dataframe is used for post processing + with pytest.raises(ValueError): #raises error if single line in file s.t. 
single row in dataframe is used for post processing ml.summarize_networks([INPUT_DIR + 'test-data-single/single.txt']) def test_pca(self): From ab554027b75d3459902851e534dd0c03fadd63c3 Mon Sep 17 00:00:00 2001 From: Anthony Gitter Date: Fri, 14 Jun 2024 07:56:27 -0500 Subject: [PATCH 13/13] Revert test filename change Was testing file not found error behavior --- test/ml/test_ml.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py index 93a9cc43..6e8132b2 100644 --- a/test/ml/test_ml.py +++ b/test/ml/test_ml.py @@ -28,7 +28,7 @@ def test_summarize_networks(self): def test_summarize_networks_empty(self): with pytest.raises(ValueError): #raises error if empty dataframe is used for post processing - ml.summarize_networks([INPUT_DIR + 'test-data-empty/emptya.txt']) + ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) def test_single_line(self): with pytest.raises(ValueError): #raises error if single line in file s.t. single row in dataframe is used for post processing @@ -67,4 +67,3 @@ def test_ensemble_network(self): expected = expected.round(5) assert en.equals(expected) -