Merge pull request Reed-CompBio#143 from ntalluri/debugging

Debugging ML
ntalluri · Jun 14, 2024 · 1f59293
2 parents 5616f76 + ab55402
commit 1f59293
Show file tree

Hide file tree

Showing 3 changed files with 25 additions and 5 deletions.
diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py
@@ -37,7 +37,7 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra
     edge_tuples = []
     for file in file_paths:
         try:
-            # collecting and sorting the edge pairs per algortihm
+            # collecting and sorting the edge pairs per algorithm
             with open(file, 'r') as f:
                 lines = f.readlines()
 
@@ -61,9 +61,8 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra
             p = PurePath(file)
             edge_tuples.append((p.parts[-2], edges))
 
-        except FileNotFoundError:
-            print(file, ' not found during ML analysis')  # should not hit this
-            continue
+        except FileNotFoundError as exc:
+            raise FileNotFoundError(str(file) + ' not found during ML analysis') from exc
 
     # initially construct separate dataframes per algorithm
     edge_dataframes = []
@@ -81,8 +80,19 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra
     concated_df = pd.concat(edge_dataframes, axis=1, join='outer')
     concated_df = concated_df.fillna(0)
     concated_df = concated_df.astype('int64')
+
+    # don't do ml post-processing if there is an empty dataframe or the number of samples is <= 1
+    if concated_df.empty:
+        raise ValueError("ML post-processing cannot proceed because the summarize network dataframe is empty.\nWe "
+                      "suggest setting ml include: false in the configuration file to avoid this error.")
+    if min(concated_df.shape) <= 1:
+        raise ValueError(f"ML post-processing cannot proceed because the available number of pathways is insufficient. "
+                      f"The ml post-processing requires more than one pathway, but currently "
+                      f"there are only {min(concated_df.shape)} pathways.")
+
     return concated_df
 
+
 def create_palette(column_names):
     """
     Generates a dictionary mapping each column name (algorithm name)
@@ -141,7 +151,7 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord:
 
     # saving the coordinates of each algorithm
     make_required_dirs(output_coord)
-    coordinates_df = pd.DataFrame(X_pca, columns = ['PC' + str(i) for i in range(1, components+1)])
+    coordinates_df = pd.DataFrame(X_pca, columns=['PC' + str(i) for i in range(1, components+1)])
     coordinates_df.insert(0, 'algorithm', columns.tolist())
     coordinates_df.to_csv(output_coord, sep='\t', index=False)
 

diff --git a/test/ml/input/test-data-single/single.txt b/test/ml/input/test-data-single/single.txt
@@ -0,0 +1 @@
+L	M	1	U
diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py
@@ -2,6 +2,7 @@
 from pathlib import Path
 
 import pandas as pd
+import pytest
 
 import spras.analysis.ml as ml
 
@@ -25,6 +26,14 @@ def test_summarize_networks(self):
         dataframe.to_csv(OUT_DIR + 'dataframe.csv')
         assert filecmp.cmp(OUT_DIR + 'dataframe.csv', EXPECT_DIR + 'expected-dataframe.csv', shallow=False)
 
+    def test_summarize_networks_empty(self):
+        with pytest.raises(ValueError): #raises error if empty dataframe is used for post processing
+            ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt'])
+
+    def test_single_line(self):
+        with pytest.raises(ValueError): #raises error if single line in file s.t. single row in dataframe is used for post processing
+            ml.summarize_networks([INPUT_DIR + 'test-data-single/single.txt'])
+
     def test_pca(self):
         dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt'])
         ml.pca(dataframe, OUT_DIR + 'pca.png', OUT_DIR + 'pca-variance.txt',