added in parse_output directionality

ntalluri · Aug 30, 2023 · 1e7fef9 · 1e7fef9
1 parent a214cc8
commit 1e7fef9
Show file tree

Hide file tree

Showing 9 changed files with 117 additions and 42 deletions.
diff --git a/src/allpairs.py b/src/allpairs.py
@@ -3,6 +3,7 @@
 
 import pandas as pd
 
+from src.dataset import convert_directed_to_undirected, readd_direction_col_undirected
 from src.prm import PRM
 from src.util import prepare_volume, run_container
 
@@ -42,8 +43,12 @@ def generate_inputs(data, filename_map):
 
         input_df.to_csv(filename_map["nodetypes"], sep="\t", index=False, columns=["#Node", "Node type"])
 
+        # Create network file
+        edges_df = data.get_interactome()
+        # Format network file
+        edges_df = convert_directed_to_undirected(edges_df)
         # This is pretty memory intensive. We might want to keep the interactome centralized.
-        data.get_interactome().to_csv(filename_map["network"], sep="\t", index=False,
+        edges_df.to_csv(filename_map["network"], sep="\t", index=False,
                                       columns=["Interactor1", "Interactor2", "Weight"],
                                       header=["#Interactor1", "Interactor2", "Weight"])
 
@@ -100,4 +105,5 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
         """
         df = pd.read_csv(raw_pathway_file, sep='\t', header=None)
         df['Rank'] = 1  # add a rank column of 1s since the edges are not ranked.
+        df = readd_direction_col_undirected(df, 2)
         df.to_csv(standardized_pathway_file, header=False, index=False, sep='\t')
diff --git a/src/dataset.py b/src/dataset.py
@@ -264,3 +264,55 @@ def add_directionality_seperators(df: pd.DataFrame, col_loc: int, col_name: str,
             )
 
     return df
+
+def readd_direction_col_mixed(df: pd.DataFrame, direction_col_loc: int, existing_direction_column: str, dir_sep: str, undir_sep: str) -> pd.DataFrame:
+    """
+    Re-adds a 'Direction' column containing 'U' or 'D', derived from the directed/undirected
+    separators stored in an existing direction column
+
+    *the existing direction column must still be present in df when calling this function
+    *df is modified in place (the new column is inserted into it) and the same object is returned
+
+    @param df: input network df that contains directionality
+    @param direction_col_loc: the position in the dataframe at which to insert the 'Direction' column
+    @param existing_direction_column: the name of the existing directionality column
+    @param dir_sep: the directed edge separator
+    @param undir_sep: the undirected edge separator
+    @return the same df with the 'Direction' column added back
+    @raises ValueError: if the existing direction column holds a value other than dir_sep or undir_sep
+    """
+    existing = df[existing_direction_column]
+
+    # Validate every value before touching df, so a failure does not leave
+    # a partially filled 'Direction' column behind in the caller's frame.
+    invalid = existing[~existing.isin([dir_sep, undir_sep])]
+    if not invalid.empty:
+        raise ValueError(
+            f'direction must be a \'{dir_sep}\' or \'{undir_sep}\', but found {invalid.iloc[0]}'
+        )
+
+    # undir_sep is listed last so that 'U' takes precedence if both separators are equal.
+    # to_numpy() makes the insert positional rather than index-aligned, so rows that
+    # share an index label are still labelled individually.
+    directions = existing.map({dir_sep: "D", undir_sep: "U"}).to_numpy()
+    df.insert(direction_col_loc, "Direction", directions)
+
+    return df
+
+def readd_direction_col_undirected(df: pd.DataFrame, direction_col_loc: int) -> pd.DataFrame:
+    """
+    Inserts a 'Direction' column in which every row is set to 'U'
+
+    *df is modified in place and the same object is returned
+
+    @param df: input network df that should receive the 'Direction' column
+    @param direction_col_loc: the position in the dataframe at which to insert the 'Direction' column
+    @return the same df with the 'Direction' column added back
+    """
+    df.insert(loc=direction_col_loc, column="Direction", value="U")
+    return df
+
+def readd_direction_col_directed(df: pd.DataFrame, direction_col_loc: int) -> pd.DataFrame:
+    """
+    Inserts a 'Direction' column in which every row is set to 'D'
+
+    *df is modified in place and the same object is returned
+
+    @param df: input network df that should receive the 'Direction' column
+    @param direction_col_loc: the position in the dataframe at which to insert the 'Direction' column
+    @return the same df with the 'Direction' column added back
+    """
+    df.insert(loc=direction_col_loc, column="Direction", value="D")
+    return df
diff --git a/src/domino.py b/src/domino.py
@@ -3,7 +3,11 @@
 
 import pandas as pd
 
-from src.dataset import add_seperator, convert_directed_to_undirected
+from src.dataset import (
+    add_seperator,
+    convert_directed_to_undirected,
+    readd_direction_col_undirected,
+)
 from src.prm import PRM
 from src.util import prepare_volume, run_container
 
@@ -199,7 +203,9 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
             # Remove the prefix
             edges_df['source'] = edges_df['source'].apply(post_domino_id_transform)
             edges_df['target'] = edges_df['target'].apply(post_domino_id_transform)
+            edges_df = readd_direction_col_undirected(edges_df, 3)
 
+        print(edges_df)
         edges_df.to_csv(standardized_pathway_file, sep='\t', header=False, index=False)
 
 

diff --git a/src/meo.py b/src/meo.py
@@ -2,7 +2,7 @@
 
 import pandas as pd
 
-from src.dataset import add_directionality_seperators
+from src.dataset import add_directionality_seperators, readd_direction_col_mixed
 from src.prm import PRM
 from src.util import prepare_volume, run_container
 
@@ -179,4 +179,8 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
         # TODO what should be the edge rank?
         # Would need to load the paths output file to rank edges correctly
         df.insert(5, 'Rank', 1)  # Add a constant rank of 1
-        df.to_csv(standardized_pathway_file, columns=['Source', 'Target', 'Rank'], header=False, index=False, sep='\t')
+
+        # Add the direction column from the 'Type' column ('pd' is the directed separator, 'pp' the undirected one)
+        df = readd_direction_col_mixed(df, 6, "Type", "pd", "pp")
+
+        df.to_csv(standardized_pathway_file, columns=['Source', 'Target', 'Rank', "Direction"], header=False, index=False, sep='\t')
diff --git a/src/mincostflow.py b/src/mincostflow.py
@@ -2,7 +2,7 @@
 
 import pandas as pd
 
-from src.dataset import convert_undirected_to_directed
+from src.dataset import convert_undirected_to_directed, readd_direction_col_directed
 from src.prm import PRM
 from src.util import prepare_volume, run_container
 
@@ -140,4 +140,5 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
 
         df = pd.read_csv(raw_pathway_file, sep='\t', header=None)
         df.insert(2, 'Rank', 1)  # adds in a rank column of 1s because the edges are not ranked
+        df = readd_direction_col_directed(df, 3)
         df.to_csv(standardized_pathway_file, header=False, index=False, sep='\t')
diff --git a/src/omicsintegrator1.py b/src/omicsintegrator1.py
@@ -2,6 +2,7 @@
 
 import pandas as pd
 
+from src.dataset import readd_direction_col_mixed
 from src.prm import PRM
 from src.util import prepare_volume, run_container
 
@@ -193,6 +194,9 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
             with open(standardized_pathway_file, 'w'):
                 pass
             return
-        df = df.take([0, 2], axis=1)
-        df[3] = [1 for _ in range(len(df.index))]
-        df.to_csv(standardized_pathway_file, header=False, index=False, sep='\t')
+
+        df.columns = ["Edge1", "InteractionType", "Edge2"]
+        df.insert (3, "Rank", 1)
+        df = readd_direction_col_mixed(df, 4, "InteractionType", "pd", "pp")
+
+        df.to_csv(standardized_pathway_file,columns=['Edge1', 'Edge2', 'Rank', "Direction"], header=False, index=False, sep='\t')
diff --git a/src/omicsintegrator2.py b/src/omicsintegrator2.py
@@ -4,7 +4,7 @@
 import docker
 import pandas as pd
 
-from src.dataset import convert_directed_to_undirected
+from src.dataset import convert_directed_to_undirected, readd_direction_col_undirected
 from src.prm import PRM
 from src.util import prepare_path_docker
 
@@ -173,4 +173,5 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
         df = df[df['in_solution'] == True]  # Check whether this column can be empty before revising this line
         df = df.take([0, 1], axis=1)
         df[3] = [1 for _ in range(len(df.index))]
+        df = readd_direction_col_undirected(df, 3)
         df.to_csv(standardized_pathway_file, header=False, index=False, sep='\t')
diff --git a/src/pathlinker.py b/src/pathlinker.py
@@ -3,7 +3,7 @@
 
 import pandas as pd
 
-from src.dataset import convert_undirected_to_directed
+from src.dataset import convert_undirected_to_directed, readd_direction_col_directed
 from src.prm import PRM
 from src.util import prepare_volume, run_container
 
@@ -137,4 +137,5 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
         # Questions: should there be a header/optional columns?
         # What about multiple raw_pathway_files
         df = pd.read_csv(raw_pathway_file, sep='\t').take([0, 1, 2], axis=1)
+        df = readd_direction_col_directed(df, 3)
         df.to_csv(standardized_pathway_file, header=False, index=False, sep='\t')
diff --git a/test/DOMINO/expected_output/domino-parse-output.txt b/test/DOMINO/expected_output/domino-parse-output.txt
@@ -1,32 +1,32 @@
-ENSG00000122691	ENSG00000138757	1
-ENSG00000122691	ENSG00000109320	1
-ENSG00000134954	ENSG00000077150	1
-ENSG00000077150	ENSG00000107968	1
-ENSG00000077150	ENSG00000157557	1
-ENSG00000077150	ENSG00000109320	1
-ENSG00000173120	ENSG00000173039	1
-ENSG00000173039	ENSG00000109320	1
-ENSG00000168884	ENSG00000109320	1
-ENSG00000109320	ENSG00000282905	1
-ENSG00000109320	ENSG00000104856	1
-ENSG00000109320	ENSG00000146232	1
-ENSG00000109320	ENSG00000166135	1
-ENSG00000109320	ENSG00000170606	1
-ENSG00000100906	ENSG00000166135	1
-ENSG00000100906	ENSG00000198873	1
-ENSG00000100906	ENSG00000173020	1
-ENSG00000162924	ENSG00000170606	1
-ENSG00000187079	ENSG00000177606	1
-ENSG00000177606	ENSG00000168062	1
-ENSG00000177606	ENSG00000182979	1
-ENSG00000177606	ENSG00000050748	1
-ENSG00000177606	ENSG00000109339	1
-ENSG00000177606	ENSG00000170345	1
-ENSG00000177606	ENSG00000175592	1
-ENSG00000177606	ENSG00000085276	1
-ENSG00000171223	ENSG00000100721	1
-ENSG00000171223	ENSG00000075426	1
-ENSG00000171223	ENSG00000085276	1
-ENSG00000130522	ENSG00000175592	1
-ENSG00000175592	ENSG00000128272	1
-ENSG00000128272	ENSG00000162772	1
+ENSG00000122691	ENSG00000138757	1	U
+ENSG00000122691	ENSG00000109320	1	U
+ENSG00000134954	ENSG00000077150	1	U
+ENSG00000077150	ENSG00000107968	1	U
+ENSG00000077150	ENSG00000157557	1	U
+ENSG00000077150	ENSG00000109320	1	U
+ENSG00000173120	ENSG00000173039	1	U
+ENSG00000173039	ENSG00000109320	1	U
+ENSG00000168884	ENSG00000109320	1	U
+ENSG00000109320	ENSG00000282905	1	U
+ENSG00000109320	ENSG00000104856	1	U
+ENSG00000109320	ENSG00000146232	1	U
+ENSG00000109320	ENSG00000166135	1	U
+ENSG00000109320	ENSG00000170606	1	U
+ENSG00000100906	ENSG00000166135	1	U
+ENSG00000100906	ENSG00000198873	1	U
+ENSG00000100906	ENSG00000173020	1	U
+ENSG00000162924	ENSG00000170606	1	U
+ENSG00000187079	ENSG00000177606	1	U
+ENSG00000177606	ENSG00000168062	1	U
+ENSG00000177606	ENSG00000182979	1	U
+ENSG00000177606	ENSG00000050748	1	U
+ENSG00000177606	ENSG00000109339	1	U
+ENSG00000177606	ENSG00000170345	1	U
+ENSG00000177606	ENSG00000175592	1	U
+ENSG00000177606	ENSG00000085276	1	U
+ENSG00000171223	ENSG00000100721	1	U
+ENSG00000171223	ENSG00000075426	1	U
+ENSG00000171223	ENSG00000085276	1	U
+ENSG00000130522	ENSG00000175592	1	U
+ENSG00000175592	ENSG00000128272	1	U
+ENSG00000128272	ENSG00000162772	1	U