added directionality for generate inputs

ntalluri · Aug 29, 2023 · 3f173b5 · 3f173b5
1 parent af6a590
commit 3f173b5
Show file tree

Hide file tree

Showing 9 changed files with 200 additions and 42 deletions.
diff --git a/input/alternative-network.txt b/input/alternative-network.txt
@@ -1,9 +1,9 @@
-A	B	0.98
-B	C	0.77
-A	D	0.12
-C	D	0.89
-C	E	0.59
-C	F	0.50
-F	G	0.76
-G	H	0.92
-G	I	0.66
+A	B	0.98	U
+B	C	0.77	U
+A	D	0.12	U
+C	D	0.89	U
+C	E	0.59	U
+C	F	0.50	U
+F	G	0.76	U
+G	H	0.92	U
+G	I	0.66	U
diff --git a/input/network.txt b/input/network.txt
@@ -1,2 +1,2 @@
-A	B	0.98
-B	C	0.77
+A	B	0.98	U
+B	C	0.77	U
diff --git a/src/dataset.py b/src/dataset.py
@@ -14,7 +14,7 @@
 class Dataset:
 
     NODE_ID = "NODEID"
-    warning_threshold = 0.05 #Threshold for scarcity of columns to warn user
+    warning_threshold = 0.05  # Threshold for scarcity of columns to warn user
 
     def __init__(self, dataset_dict):
         self.label = None
@@ -63,25 +63,57 @@ def load_files_from_dict(self, dataset_dict):
 
         self.label = dataset_dict["label"]
 
-        #Get file paths from config
+        # Get file paths from config
         # TODO support multiple edge files
         interactome_loc = dataset_dict["edge_files"][0]
         node_data_files = dataset_dict["node_files"]
-        #edge_data_files = [""]  # Currently None
+        # edge_data_files = [""]  # Currently None
         data_loc = dataset_dict["data_dir"]
 
-        #Load everything as pandas tables
-        self.interactome = pd.read_table(os.path.join(data_loc,interactome_loc), names = ["Interactor1","Interactor2","Weight"])
+        # Load everything as pandas tables
+        print("about to create self.interactome")
+        print(data_loc)
+        print(interactome_loc)
+
+        with open(os.path.join(data_loc, interactome_loc), "r") as f:
+            for _i in range(9):  # first 5 lines
+                print(f.readline())
+
+        self.interactome = pd.read_table(
+            os.path.join(data_loc, interactome_loc), sep="\t", header=None
+        )
+        print(self.interactome)
+        num_cols = self.interactome.shape[1]
+        print(num_cols)
+        if num_cols == 3:
+
+            self.interactome.columns = ["Interactor1", "Interactor2", "Weight"]
+            self.interactome["Direction"] = "U"
+
+        elif num_cols == 4:
+            self.interactome.columns = [
+                "Interactor1",
+                "Interactor2",
+                "Weight",
+                "Direction",
+            ]
+        else:
+            raise ValueError(
+                f"edge_files must have three or four columns but found {num_cols}"
+            )
+
         node_set = set(self.interactome.Interactor1.unique())
         node_set = node_set.union(set(self.interactome.Interactor2.unique()))
 
-        #Load generic node tables
+        # Load generic node tables
         self.node_table = pd.DataFrame(node_set, columns=[self.NODE_ID])
         for node_file in node_data_files:
-            single_node_table = pd.read_table(os.path.join(data_loc,node_file))
-            #If we have only 1 column, assume this is an indicator variable
-            if len(single_node_table.columns)==1:
-                single_node_table = pd.read_table(os.path.join(data_loc,node_file),header=None)
+            single_node_table = pd.read_table(os.path.join(data_loc, node_file))
+            # If we have only 1 column, assume this is an indicator variable
+            if len(single_node_table.columns) == 1:
+                single_node_table = pd.read_table(
+                    os.path.join(data_loc, node_file), header=None
+                )
                 single_node_table.columns = [self.NODE_ID]
                 new_col_name = node_file.split(".")[0]
                 single_node_table[new_col_name] = True
@@ -91,7 +123,9 @@ def load_files_from_dict(self, dataset_dict):
             # will be ignored
             # TODO may want to warn about duplicate before removing them, for instance, if a user loads two files that
             #  both have prizes
-            self.node_table = self.node_table.merge(single_node_table, how="left", on=self.NODE_ID, suffixes=(None, "_DROP")).filter(regex="^(?!.*DROP)")
+            self.node_table = self.node_table.merge(
+                single_node_table, how="left", on=self.NODE_ID, suffixes=(None, "_DROP")
+            ).filter(regex="^(?!.*DROP)")
         # Ensure that the NODEID column always appears first, which is required for some downstream analyses
         self.node_table.insert(0, "NODEID", self.node_table.pop("NODEID"))
         self.other_files = dataset_dict["other_files"]
@@ -103,11 +137,18 @@ def request_node_columns(self, col_names):
         """
         col_names.append(self.NODE_ID)
         filtered_table = self.node_table[col_names]
-        filtered_table = filtered_table.dropna(axis=0, how='all',subset=filtered_table.columns.difference([self.NODE_ID]))
-        percent_hit = (float(len(filtered_table))/len(self.node_table))*100
-        if percent_hit <= self.warning_threshold*100:
+        filtered_table = filtered_table.dropna(
+            axis=0, how="all", subset=filtered_table.columns.difference([self.NODE_ID])
+        )
+        percent_hit = (float(len(filtered_table)) / len(self.node_table)) * 100
+        if percent_hit <= self.warning_threshold * 100:
             # Only use stacklevel 1 because this is due to the data not the code context
-            warnings.warn("Only %0.2f of data had one or more of the following columns filled:"%(percent_hit) + str(col_names), stacklevel=1)
+            warnings.warn(
+                "Only %0.2f of data had one or more of the following columns filled:"
+                % (percent_hit)
+                + str(col_names),
+                stacklevel=1,
+            )
         return filtered_table
 
     def contains_node_columns(self, col_names):
@@ -131,3 +172,95 @@ def get_other_files(self):
 
     def get_interactome(self):
         return self.interactome.copy()
+
+def convert_undirected_to_directed(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Return a fully directed version of an edge table.
+
+    - every undirected edge ("U" in the Direction column) becomes two directed
+      edges: the original Interactor1 -> Interactor2 edge plus a reversed copy
+    - with a pair of opposite directed edges we are not losing too much
+      information because the relationship of the undirected edge is preserved
+    - rows with any other Direction value are kept unchanged
+
+    The input dataframe is not modified; a new dataframe is returned. When
+    reversed edges are added they are appended after the original rows, in the
+    order of the undirected rows, and the result is re-indexed from 0.
+
+    *A user must keep the Direction column when using this function
+
+    @param df: input network df of edges, weights, and directionality
+    @return a new dataframe with no undirected edges in Direction column
+    @raise KeyError: if df has no Direction column
+    """
+
+    is_undirected = df["Direction"] == "U"
+    if not is_undirected.any():
+        # Nothing to mirror; still return a separate frame so the caller
+        # always receives a new object
+        return df.copy()
+
+    # Swapping the two column labels reverses each edge. concat aligns on
+    # column names, so the reversed rows end up in df's column order.
+    reversed_edges = df.loc[is_undirected].rename(
+        columns={"Interactor1": "Interactor2", "Interactor2": "Interactor1"}
+    )
+
+    directed = pd.concat([df, reversed_edges], ignore_index=True)
+    directed.loc[directed["Direction"] == "U", "Direction"] = "D"
+    return directed
+
+
+def convert_directed_to_undirected(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Turn an edge table into a fully undirected graph.
+
+    - every directed edge ("D" in the Direction column) is relabelled "U"
+    - we will lose any sense of directionality and the graph won't be
+      inherently accurate, but the basic relationship between the two
+      connected nodes will still remain intact
+    - rows with any other Direction value are left unchanged
+
+    The Direction column of df is updated in place and the same dataframe
+    object is returned.
+
+    @param df: input network df of edges, weights, and directionality
+    @return the same dataframe with no directed edges in Direction column
+    @raise KeyError: if df has no Direction column
+    """
+
+    # One boolean-mask assignment replaces the row-by-row relabelling
+    df.loc[df["Direction"] == "D", "Direction"] = "U"
+    return df
+
+
+def add_seperator(df: pd.DataFrame, col_loc: int, col_name: str, sep: str) -> pd.DataFrame:
+    """
+    Insert a constant-valued separator column into the input dataframe.
+
+    The column is inserted into df itself and that same object is returned.
+
+    @param df: input network df of edges, weights, and directionality
+    @param col_loc: the position in the dataframe at which to put the new column
+    @param col_name: the name of the new column
+    @param sep: the separator value written to every row of the new column
+    @return df with the separator column added
+    """
+
+    df.insert(loc=col_loc, column=col_name, value=sep)
+    return df
+
+
+def add_directionality_seperators(df: pd.DataFrame, col_loc: int, col_name: str, dir_sep: str, undir_sep: str) -> pd.DataFrame:
+    """
+    Add a column that encodes each edge's directionality with
+    algorithm-specific separators, for mixed graphs whose expected format
+    differs from the universal input.
+
+    Rows whose Direction is "D" get dir_sep and rows whose Direction is "U"
+    get undir_sep. The column is inserted into df itself and that same
+    object is returned. Directions are validated before anything is
+    inserted, so df is left untouched when an error is raised.
+
+    *user must keep the Direction column when using the function
+
+    @param df: input network df of edges, weights, and directionality
+    @param col_loc: the position in the dataframe at which to put the new column
+    @param col_name: the name of the new column
+    @param dir_sep: the directed edge sep
+    @param undir_sep: the undirected edge sep
+    @return df with the directionality separator column added
+    @raise KeyError: if df has no Direction column
+    @raise ValueError: if any Direction value is neither 'U' nor 'D'
+    """
+
+    directions = df["Direction"]
+
+    # Validate first so a bad value cannot leave df half-modified
+    invalid = directions[~directions.isin(["U", "D"])]
+    if not invalid.empty:
+        raise ValueError(
+            f"direction must be a 'U' or 'D', but found {invalid.iloc[0]}"
+        )
+
+    # Default every row to the directed separator, then overwrite the
+    # undirected rows in one vectorised assignment
+    df.insert(col_loc, col_name, dir_sep)
+    df.loc[directions == "U", col_name] = undir_sep
+
+    return df
diff --git a/src/domino.py b/src/domino.py
@@ -3,6 +3,7 @@
 
 import pandas as pd
 
+from src.dataset import add_seperator, convert_directed_to_undirected
 from src.prm import PRM
 from src.util import prepare_volume, run_container
 
@@ -15,7 +16,7 @@
 Domino will construct a fully undirected graph from the provided input file
 - in the algorithm, it uses nx.Graph()
 
-Expected raw input format: 
+Expected raw input format:
 Interactor1     ppi     Interactor2
 - the expected raw input file should have node pairs in the 1st and 3rd columns, with a 'ppi' in the 2nd column
 - it can include repeated and bidirectional edges
@@ -51,7 +52,10 @@ def generate_inputs(data, filename_map):
 
         # Create network file
         edges_df = data.get_interactome()
-        edges_df['ppi'] = 'ppi'
+
+        # Format network file
+        edges_df = convert_directed_to_undirected(edges_df)
+        edges_df = add_seperator(edges_df, 1, 'ppi', 'ppi')
 
         # Transform each node id with a prefix
         edges_df['Interactor1'] = edges_df['Interactor1'].apply(pre_domino_id_transform)

diff --git a/src/meo.py b/src/meo.py
@@ -2,6 +2,7 @@
 
 import pandas as pd
 
+from src.dataset import add_directionality_seperators
 from src.prm import PRM
 from src.util import prepare_volume, run_container
 
@@ -49,7 +50,7 @@ def write_properties(filename=Path('properties.txt'), edges=None, sources=None,
 """
 MEO can support partially directed graphs
 
-Expected raw input format: 
+Expected raw input format:
 Interactor1   pp/pd   Interactor2   Weight
 - the expected raw input file should have node pairs in the 1st and 3rd columns, with a directionality in the 2nd column and the weight in the 4th column
 - it uses pp for undirected edges and pd for directed edges
@@ -83,11 +84,12 @@ def generate_inputs(data, filename_map):
             nodes = nodes.loc[nodes[node_type]]
             nodes.to_csv(filename_map[node_type], index=False, columns=['NODEID'], header=False)
 
-        # TODO need to support partially directed graphs
-        # Expected columns are Node1 EdgeType Node2 Weight
+        # Create network file
         edges = data.get_interactome()
-        # For now all edges are undirected
-        edges.insert(1, 'EdgeType', '(pp)')
+
+        # Format network file
+        edges = add_directionality_seperators(edges, 1, 'EdgeType', '(pd)', '(pp)')
+
         edges.to_csv(filename_map['edges'], sep='\t', index=False, columns=['Interactor1', 'EdgeType', 'Interactor2', 'Weight'], header=False)
 
 

diff --git a/src/mincostflow.py b/src/mincostflow.py
@@ -2,6 +2,7 @@
 
 import pandas as pd
 
+from src.dataset import convert_undirected_to_directed
 from src.prm import PRM
 from src.util import prepare_volume, run_container
 
@@ -12,7 +13,7 @@
 - OR Tools MCF is designed for directed graphs
 - an edge (arc) has a source and a target node, so flow is only allowed to move from the source to the target
 
-Expected raw input format: 
+Expected raw input format:
 Interactor1  Interactor2   Weight
 - the expected raw input file should have node pairs in the 1st and 2nd columns, with the weight in the 3rd column
 - it can include repeated and bidirectional edges
@@ -46,6 +47,9 @@ def generate_inputs(data, filename_map):
         # create the network of edges
         edges = data.get_interactome()
 
+        # Format network edges
+        edges = convert_undirected_to_directed(edges)
+
         # creates the edges files that contains the head and tail nodes and the weights after them
         edges.to_csv(filename_map['edges'], sep='\t', index=False, columns=["Interactor1","Interactor2","Weight"], header=False)
 

diff --git a/src/omicsintegrator1.py b/src/omicsintegrator1.py
@@ -37,9 +37,9 @@ def write_conf(filename=Path('config.txt'), w=None, b=None, d=None, mu=None, noi
 
 """
 Omics Integrator 1 works with partially directed graphs
-- it takes in the universal input directly 
+- it takes in the universal input directly
 
-Expected raw input format: 
+Expected raw input format:
 Interactor1    Interactor2   Weight    Direction
 - the expected raw input file should have node pairs in the 1st and 2nd columns, with a weight in the 3rd column and directionality in the 4th column
 - it can include repeated and bidirectional edges
@@ -74,10 +74,11 @@ def generate_inputs(data, filename_map):
         #Omics Integrator already gives warnings for strange prize values, so we won't here
         node_df.to_csv(filename_map['prizes'],sep='\t',index=False,columns=['NODEID','prize'],header=['name','prize'])
 
-        #For now we assume all input networks are undirected until we expand how edge tables work
+        # Get network file
         edges_df = data.get_interactome()
-        edges_df['directionality'] = 'U'
-        edges_df.to_csv(filename_map['edges'],sep='\t',index=False,columns=['Interactor1','Interactor2','Weight','directionality'],header=['protein1','protein2','weight','directionality'])
+
+        # Rename Direction column
+        edges_df.to_csv(filename_map['edges'],sep='\t',index=False,columns=['Interactor1','Interactor2','Weight','Direction'],header=['protein1','protein2','weight','directionality'])
 
 
     # TODO add parameter validation

diff --git a/src/omicsintegrator2.py b/src/omicsintegrator2.py
@@ -4,6 +4,7 @@
 import docker
 import pandas as pd
 
+from src.dataset import convert_directed_to_undirected
 from src.prm import PRM
 from src.util import prepare_path_docker
 
@@ -14,7 +15,7 @@
 - in the algorithm, it uses nx.Graph() objects, which are undirected
 - uses a pcst_fast solver which supports undirected graphs
 
-Expected raw input format: 
+Expected raw input format:
 Interactor1   Interactor2   Weight
 - the expected raw input file should have node pairs in the 1st and 2nd columns, with a weight in the 3rd column
 - it can include repeated and bidirectional edges
@@ -46,8 +47,13 @@ def generate_inputs(data, filename_map):
 
         #Omics Integrator already gives warnings for strange prize values, so we won't here
         node_df.to_csv(filename_map['prizes'],sep='\t',index=False,columns=['NODEID','prize'],header=['name','prize'])
+
+        # Create network file
         edges_df = data.get_interactome()
 
+        # Format network file
+        edges_df = convert_directed_to_undirected(edges_df)
+
         #We'll have to update this when we make iteractomes more proper, but for now
         # assume we always get a weight and turn it into a cost.
         # use the same approach as omicsintegrator2 by adding half the max cost as the base cost.