First pass at fixing the OmicsIntegrator2 (oi2) parse_output error

ntalluri · Sep 3, 2024 · 059c0fc · 059c0fc
1 parent cf0b401
commit 059c0fc
Show file tree

Hide file tree

Showing 9 changed files with 66 additions and 19 deletions.
diff --git a/config/config.yaml b/config/config.yaml
@@ -45,13 +45,13 @@ container_registry:
 algorithms:
       - name: "pathlinker"
         params:
-              include: true
+              include: false
               run1:
                   k: range(100,201,100)
 
       - name: "omicsintegrator1"
         params:
-              include: true
+              include: false
               run1:
                   b: [5, 6]
                   w: np.linspace(0,5,2)
@@ -69,26 +69,26 @@ algorithms:
 
       - name: "meo"
         params:
-              include: true
+              include: false
               run1:
                   max_path_length: [3]
                   local_search: ["Yes"]
                   rand_restarts: [10]
 
       - name: "mincostflow"
         params:
-              include: true
+              include: false
               run1:
                   flow: [1] # The flow must be an int
                   capacity: [1]
 
       - name: "allpairs"
         params:
-              include: true
+              include: false
 
       - name: "domino"
         params:
-              include: true
+              include: false
               run1:
                   slice_threshold: [0.3]
                   module_threshold: [0.05]
@@ -149,28 +149,28 @@ reconstruction_settings:
 analysis:
       # Create one summary per pathway file and a single summary table for all pathways for each dataset
       summary:
-        include: true
+        include: false
       # Create output files for each pathway that can be visualized with GraphSpace
       graphspace:
-        include: true
+        include: false
       # Create Cytoscape session file with all pathway graphs for each dataset
       cytoscape:
-        include: true
+        include: false
       # Machine learning analysis (e.g. clustering) of the pathway output files for each dataset
       ml:
         # ml analysis per dataset
-        include: true
+        include: false
         # adds ml analysis per algorithm output
         # only runs for algorithms with multiple parameter combinations chosen
-        aggregate_per_algorithm: true
+        aggregate_per_algorithm: false
         # specify how many principal components to calculate
         components: 2
         # boolean to show the labels on the pca graph
-        labels: true
+        labels: false
         # 'ward', 'complete', 'average', 'single'
         # if linkage: ward, must use metric: euclidean
         linkage: 'ward'
         # 'euclidean', 'manhattan', 'cosine'
         metric: 'euclidean'
       evaluation:
-        include: true
+        include: false
diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py
@@ -148,14 +148,22 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
         """
         # Omicsintegrator2 returns a single line file if no network is found
         num_lines = sum(1 for line in open(raw_pathway_file))
+        df = pd.read_csv(raw_pathway_file, sep='\t', header=0)
+        print(df)
+        # Omicsintegrator2 may produce malformed output; these are the expected column names, in the expected order
+        correct_columns = ['protein1', 'protein2', 'cost', 'in_solution']
+
         if num_lines < 2:
             df = pd.DataFrame(columns=['Node1', 'Node2', 'Rank', 'Direction'])
         else:
             df = pd.read_csv(raw_pathway_file, sep='\t', header=0)
-            df = df[df['in_solution'] == True]  # Check whether this column can be empty before revising this line
-            df = df.take([0, 1], axis=1)
-            df = add_rank_column(df)
-            df = reinsert_direction_col_undirected(df)
-            df.columns = ['Node1', 'Node2', 'Rank', "Direction"]
+            if (len(df.columns) == len(correct_columns)) and all(df.columns == correct_columns):
+                df = df[df['in_solution'] == True]  # Check whether this column can be empty before revising this line 
+                df = df.take([0, 1], axis=1)
+                df = add_rank_column(df)
+                df = reinsert_direction_col_undirected(df)
+                df.columns = ['Node1', 'Node2', 'Rank', "Direction"]
+            else: 
+                df = pd.DataFrame(columns=['Node1', 'Node2', 'Rank', 'Direction'])
 
         df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t')
diff --git a/test/parse-outputs/expected/oi2-expected/oi2-expected-empty.txt b/test/parse-outputs/expected/oi2-expected/oi2-expected-empty.txt
@@ -0,0 +1 @@
+Node1	Node2	Rank	Direction
diff --git a/test/parse-outputs/expected/oi2-expected/oi2-expected.txt b/test/parse-outputs/expected/oi2-expected/oi2-expected.txt
@@ -0,0 +1,3 @@
+Node1	Node2	Rank	Direction
+B	A	1	U
+B	C	1	U
diff --git a/test/parse-outputs/input/oi2-raw-pathways/oi2-correct.txt b/test/parse-outputs/input/oi2-raw-pathways/oi2-correct.txt
@@ -0,0 +1,3 @@
+protein1	protein2	cost	in_solution
+B	A	0.52	True
+B	C	0.73	True
diff --git a/test/parse-outputs/input/oi2-raw-pathways/oi2-empty.txt b/test/parse-outputs/input/oi2-raw-pathways/oi2-empty.txt
@@ -0,0 +1 @@
+protein1	protein2
diff --git a/test/parse-outputs/input/oi2-raw-pathways/oi2-miss-insolution.txt b/test/parse-outputs/input/oi2-raw-pathways/oi2-miss-insolution.txt
@@ -0,0 +1,3 @@
+protein1	protein2	cost
+B	A	0.52
+B	C	0.73
diff --git a/test/parse-outputs/input/oi2-raw-pathways/oi2-wrong-order.txt b/test/parse-outputs/input/oi2-raw-pathways/oi2-wrong-order.txt
@@ -0,0 +1,3 @@
+protein1	protein2	in_solution	cost
+B	A	True	0.52
+B	C	True	0.73
diff --git a/test/parse-outputs/test_parse_outputs.py b/test/parse-outputs/test_parse_outputs.py
@@ -6,14 +6,15 @@
 INDIR = "test/parse-outputs/input/"
 OUTDIR = "test/parse-outputs/output/"
 EXPDIR = "test/parse-outputs/expected/"
+RAW_PATHS_INDIR = 'test/parse-outputs/input/oi2-raw-pathways/'
+RAW_PATHS_EXPDIR = 'test/parse-outputs/expected/oi2-expected/'
 
 # DOMINO input is the concatenated module_0.html and module_1.html file from
 # the DOMINO output of the network dip.sif and the nodes tnfa_active_genes_file.txt
 # from https://github.com/Shamir-Lab/DOMINO/tree/master/examples
 
 algorithms = ['mincostflow', 'meo', 'omicsintegrator1', 'omicsintegrator2', 'pathlinker', 'allpairs', 'domino']
 
-
 class TestParseOutputs:
     @classmethod
     def setup_class(cls):
@@ -37,3 +38,27 @@ def test_empty_file(self):
 
             runner.parse_output(algo, test_file, out_file)
             assert filecmp.cmp(OUTDIR + f"{algo}-empty-pathway.txt", EXPDIR + f"empty-pathway-expected.txt", shallow=False)
+
+    def test_oi2_correct_parse_output(self):
+        test_file = RAW_PATHS_INDIR + f"oi2-correct.txt"
+        out_file = OUTDIR + f"oi2-correct-pathway.txt"
+        runner.parse_output('omicsintegrator2', test_file, out_file)
+        assert filecmp.cmp(out_file, RAW_PATHS_EXPDIR + f"oi2-expected.txt", shallow=False)
+
+    def test_oi2_empty_parse_output(self):
+        test_file = RAW_PATHS_INDIR + f"oi2-empty.txt"
+        out_file = OUTDIR + f"oi2-empty-pathway.txt"
+        runner.parse_output('omicsintegrator2', test_file, out_file)
+        assert filecmp.cmp(out_file, RAW_PATHS_EXPDIR + f"oi2-expected-empty.txt", shallow=False)
+
+    def test_oi2_miss_insolution_parse_output(self):
+        test_file = RAW_PATHS_INDIR + f"oi2-miss-insolution.txt"
+        out_file = OUTDIR + f"oi2-miss-insolution-pathway.txt"
+        runner.parse_output('omicsintegrator2', test_file, out_file)
+        assert filecmp.cmp(out_file, RAW_PATHS_EXPDIR + f"oi2-expected-empty.txt", shallow=False)
+
+    def test_oi2_wrong_order_parse_output(self):
+        test_file = RAW_PATHS_INDIR + f"oi2-wrong-order.txt"
+        out_file = OUTDIR + f"oi2-wrong-order-pathway.txt"
+        runner.parse_output('omicsintegrator2', test_file, out_file)
+        assert filecmp.cmp(out_file, RAW_PATHS_EXPDIR + f"oi2-expected-empty.txt", shallow=False)