Skip to content

Commit

Permalink
added directionality for generate inputs
Browse files Browse the repository at this point in the history
  • Loading branch information
ntalluri committed Aug 29, 2023
1 parent af6a590 commit 3f173b5
Show file tree
Hide file tree
Showing 9 changed files with 200 additions and 42 deletions.
18 changes: 9 additions & 9 deletions input/alternative-network.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
A B 0.98
B C 0.77
A D 0.12
C D 0.89
C E 0.59
C F 0.50
F G 0.76
G H 0.92
G I 0.66
A B 0.98 U
B C 0.77 U
A D 0.12 U
C D 0.89 U
C E 0.59 U
C F 0.50 U
F G 0.76 U
G H 0.92 U
G I 0.66 U
4 changes: 2 additions & 2 deletions input/network.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
A B 0.98
B C 0.77
A B 0.98 U
B C 0.77 U
163 changes: 148 additions & 15 deletions src/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
class Dataset:

NODE_ID = "NODEID"
warning_threshold = 0.05 #Threshold for scarcity of columns to warn user
warning_threshold = 0.05 # Threshold for scarcity of columns to warn user

def __init__(self, dataset_dict):
self.label = None
Expand Down Expand Up @@ -63,25 +63,57 @@ def load_files_from_dict(self, dataset_dict):

self.label = dataset_dict["label"]

#Get file paths from config
# Get file paths from config
# TODO support multiple edge files
interactome_loc = dataset_dict["edge_files"][0]
node_data_files = dataset_dict["node_files"]
#edge_data_files = [""] # Currently None
# edge_data_files = [""] # Currently None
data_loc = dataset_dict["data_dir"]

#Load everything as pandas tables
self.interactome = pd.read_table(os.path.join(data_loc,interactome_loc), names = ["Interactor1","Interactor2","Weight"])
# Load everything as pandas tables
print("about to create self.interactome")
print(data_loc)
print(interactome_loc)

with open(os.path.join(data_loc, interactome_loc), "r") as f:
for _i in range(9): # first 5 lines
print(f.readline())

self.interactome = pd.read_table(
os.path.join(data_loc, interactome_loc), sep="\t", header=None
)
print(self.interactome)
num_cols = self.interactome.shape[1]
print(num_cols)
if num_cols == 3:

self.interactome.columns = ["Interactor1", "Interactor2", "Weight"]
self.interactome["Direction"] = "U"

elif num_cols == 4:
self.interactome.columns = [
"Interactor1",
"Interactor2",
"Weight",
"Direction",
]
else:
raise ValueError(
f"edge_files must have three or four columns but found {num_cols}"
)

node_set = set(self.interactome.Interactor1.unique())
node_set = node_set.union(set(self.interactome.Interactor2.unique()))

#Load generic node tables
# Load generic node tables
self.node_table = pd.DataFrame(node_set, columns=[self.NODE_ID])
for node_file in node_data_files:
single_node_table = pd.read_table(os.path.join(data_loc,node_file))
#If we have only 1 column, assume this is an indicator variable
if len(single_node_table.columns)==1:
single_node_table = pd.read_table(os.path.join(data_loc,node_file),header=None)
single_node_table = pd.read_table(os.path.join(data_loc, node_file))
# If we have only 1 column, assume this is an indicator variable
if len(single_node_table.columns) == 1:
single_node_table = pd.read_table(
os.path.join(data_loc, node_file), header=None
)
single_node_table.columns = [self.NODE_ID]
new_col_name = node_file.split(".")[0]
single_node_table[new_col_name] = True
Expand All @@ -91,7 +123,9 @@ def load_files_from_dict(self, dataset_dict):
# will be ignored
# TODO may want to warn about duplicate before removing them, for instance, if a user loads two files that
# both have prizes
self.node_table = self.node_table.merge(single_node_table, how="left", on=self.NODE_ID, suffixes=(None, "_DROP")).filter(regex="^(?!.*DROP)")
self.node_table = self.node_table.merge(
single_node_table, how="left", on=self.NODE_ID, suffixes=(None, "_DROP")
).filter(regex="^(?!.*DROP)")
# Ensure that the NODEID column always appears first, which is required for some downstream analyses
self.node_table.insert(0, "NODEID", self.node_table.pop("NODEID"))
self.other_files = dataset_dict["other_files"]
Expand All @@ -103,11 +137,18 @@ def request_node_columns(self, col_names):
"""
col_names.append(self.NODE_ID)
filtered_table = self.node_table[col_names]
filtered_table = filtered_table.dropna(axis=0, how='all',subset=filtered_table.columns.difference([self.NODE_ID]))
percent_hit = (float(len(filtered_table))/len(self.node_table))*100
if percent_hit <= self.warning_threshold*100:
filtered_table = filtered_table.dropna(
axis=0, how="all", subset=filtered_table.columns.difference([self.NODE_ID])
)
percent_hit = (float(len(filtered_table)) / len(self.node_table)) * 100
if percent_hit <= self.warning_threshold * 100:
# Only use stacklevel 1 because this is due to the data not the code context
warnings.warn("Only %0.2f of data had one or more of the following columns filled:"%(percent_hit) + str(col_names), stacklevel=1)
warnings.warn(
"Only %0.2f of data had one or more of the following columns filled:"
% (percent_hit)
+ str(col_names),
stacklevel=1,
)
return filtered_table

def contains_node_columns(self, col_names):
Expand All @@ -131,3 +172,95 @@ def get_other_files(self):

def get_interactome(self):
return self.interactome.copy()

def convert_undirected_to_directed(df: pd.DataFrame) -> pd.DataFrame:
    """
    turns a graph into a fully directed graph
    - turns every undirected edge (Direction == "U") into a pair of directed
      edges: the original Interactor1 -> Interactor2 edge plus a reversed copy
    - with bi-directed edges, we are not losing too much information because
      the relationship of the undirected edge is still preserved
    - rows whose Direction is anything other than "U" are passed through as-is
    *A user must keep the Direction column when using this function
    @param df: input network df of edges, weights, and directionality
    @return a new dataframe with no undirected edges in the Direction column;
            the original rows come first, followed by the reversed copies of
            the formerly undirected edges, and the index is renumbered 0..n-1.
            The input dataframe is left unmodified.
    @raise KeyError: if df has no Direction column
    """

    undirected = df["Direction"] == "U"

    # Nothing to mirror: still hand back a fresh, renumbered frame so the
    # return value never aliases the caller's dataframe.
    if not undirected.any():
        return df.reset_index(drop=True)

    # Swap the two endpoint columns by renaming, then restore the original
    # column order so the reversed edges line up with df for concatenation.
    reversed_edges = df.loc[undirected].rename(
        columns={"Interactor1": "Interactor2", "Interactor2": "Interactor1"}
    )[list(df.columns)]

    # Concatenating once (instead of appending row by row at label len(df))
    # cannot overwrite an existing row when df has a non-default index.
    directed = pd.concat([df, reversed_edges], ignore_index=True)
    directed.loc[directed["Direction"] == "U", "Direction"] = "D"

    return directed


def convert_directed_to_undirected(df: pd.DataFrame) -> pd.DataFrame:
    """
    turns a graph into a fully undirected graph
    - turns the directed edges (Direction == "D") directly into undirected
      edges (Direction == "U")
    - we will lose any sense of directionality and the graph won't be
      inherently accurate, but the basic relationship between the two
      connected nodes will still remain intact.
    - rows whose Direction is anything other than "D" are left untouched
    @param df: input network df of edges, weights, and directionality;
               it is modified in place
    @return the same dataframe object, with no directed edges in the
            Direction column
    @raise KeyError: if df has no Direction column
    """

    # One vectorised assignment replaces the row-by-row iterrows() loop.
    df.loc[df["Direction"] == "D", "Direction"] = "U"

    return df


def add_seperator(df: pd.DataFrame, col_loc: int, col_name: str, sep: str) -> pd.DataFrame:
    """
    inserts a constant-valued separator column into the input dataframe
    - the dataframe is modified in place and the same object is returned
    @param df: input network df of edges, weights, and directionality
    @param col_loc: the positional index at which the new column is inserted
    @param col_name: the name of the new column
    @param sep: the separator value written to every row of the new column
    @return the same df, now containing the separator column
    """

    # DataFrame.insert broadcasts the scalar `sep` down the whole column.
    df.insert(loc=col_loc, column=col_name, value=sep)
    return df


def add_directionality_seperators(df: pd.DataFrame, col_loc: int, col_name: str, dir_sep: str, undir_sep: str) -> pd.DataFrame:
    """
    adds a column of directionality separators for mixed graphs whose
    algorithm-specific format differs from the universal input
    - rows with Direction == "D" receive dir_sep
    - rows with Direction == "U" receive undir_sep
    - the dataframe is modified in place and the same object is returned
    *user must keep the Direction column when using the function
    @param df: input network df of edges, weights, and directionality
    @param col_loc: the positional index at which the new column is inserted
    @param col_name: the name of the new column
    @param dir_sep: the directed edge sep
    @param undir_sep: the undirected edge sep
    @return the same df with the directionality separator column added
    @raise KeyError: if df has no Direction column
    @raise ValueError: if any Direction value is neither 'U' nor 'D'; df is
                       left unmodified in that case
    """

    direction = df["Direction"]
    is_undirected = direction == "U"

    # Validate every row before touching df, so a bad value cannot leave the
    # caller's dataframe with a half-populated separator column.
    invalid = ~(is_undirected | (direction == "D"))
    if invalid.any():
        bad_value = direction[invalid].iloc[0]
        raise ValueError(
            f"direction must be a 'U' or 'D', but found {bad_value}"
        )

    # A plain list is inserted positionally, so the result does not depend on
    # the dataframe's index labels.
    separators = [undir_sep if undirected else dir_sep for undirected in is_undirected]
    df.insert(col_loc, col_name, separators)

    return df
8 changes: 6 additions & 2 deletions src/domino.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import pandas as pd

from src.dataset import add_seperator, convert_directed_to_undirected
from src.prm import PRM
from src.util import prepare_volume, run_container

Expand All @@ -15,7 +16,7 @@
Domino will construct a fully undirected graph from the provided input file
- in the algorithm, it uses nx.Graph()
Expected raw input format:
Expected raw input format:
Interactor1 ppi Interactor2
- the expected raw input file should have node pairs in the 1st and 3rd columns, with a 'ppi' in the 2nd column
- it can include repeated and bidirectional edges
Expand Down Expand Up @@ -51,7 +52,10 @@ def generate_inputs(data, filename_map):

# Create network file
edges_df = data.get_interactome()
edges_df['ppi'] = 'ppi'

# Format network file
edges_df = convert_directed_to_undirected(edges_df)
edges_df = add_seperator(edges_df, 1, 'ppi', 'ppi')

# Transform each node id with a prefix
edges_df['Interactor1'] = edges_df['Interactor1'].apply(pre_domino_id_transform)
Expand Down
12 changes: 7 additions & 5 deletions src/meo.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import pandas as pd

from src.dataset import add_directionality_seperators
from src.prm import PRM
from src.util import prepare_volume, run_container

Expand Down Expand Up @@ -49,7 +50,7 @@ def write_properties(filename=Path('properties.txt'), edges=None, sources=None,
"""
MEO can support partially directed graphs
Expected raw input format:
Expected raw input format:
Interactor1 pp/pd Interactor2 Weight
- the expected raw input file should have node pairs in the 1st and 3rd columns, with a directionality in the 2nd column and the weight in the 4th column
- it uses pp for undirected edges and pd for directed edges
Expand Down Expand Up @@ -83,11 +84,12 @@ def generate_inputs(data, filename_map):
nodes = nodes.loc[nodes[node_type]]
nodes.to_csv(filename_map[node_type], index=False, columns=['NODEID'], header=False)

# TODO need to support partially directed graphs
# Expected columns are Node1 EdgeType Node2 Weight
# Create network file
edges = data.get_interactome()
# For now all edges are undirected
edges.insert(1, 'EdgeType', '(pp)')

# Format network file
edges = add_directionality_seperators(edges, 1, 'EdgeType', '(pd)', '(pp)')

edges.to_csv(filename_map['edges'], sep='\t', index=False, columns=['Interactor1', 'EdgeType', 'Interactor2', 'Weight'], header=False)


Expand Down
6 changes: 5 additions & 1 deletion src/mincostflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import pandas as pd

from src.dataset import convert_undirected_to_directed
from src.prm import PRM
from src.util import prepare_volume, run_container

Expand All @@ -12,7 +13,7 @@
- OR Tools MCF is designed for directed graphs
- an edge (arc) has a source and a target node, so flow is only allowed to move from the source to the target
Expected raw input format:
Expected raw input format:
Interactor1 Interactor2 Weight
- the expected raw input file should have node pairs in the 1st and 2nd columns, with the weight in the 3rd column
- it can include repeated and bidirectional edges
Expand Down Expand Up @@ -46,6 +47,9 @@ def generate_inputs(data, filename_map):
# create the network of edges
edges = data.get_interactome()

# Format network edges
edges = convert_undirected_to_directed(edges)

# creates the edges files that contains the head and tail nodes and the weights after them
edges.to_csv(filename_map['edges'], sep='\t', index=False, columns=["Interactor1","Interactor2","Weight"], header=False)

Expand Down
11 changes: 6 additions & 5 deletions src/omicsintegrator1.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ def write_conf(filename=Path('config.txt'), w=None, b=None, d=None, mu=None, noi

"""
Omics Integrator 1 works with partially directed graphs
- it takes in the universal input directly
- it takes in the universal input directly
Expected raw input format:
Expected raw input format:
Interactor1 Interactor2 Weight Direction
- the expected raw input file should have node pairs in the 1st and 2nd columns, with a weight in the 3rd column and directionality in the 4th column
- it can include repeated and bidirectional edges
Expand Down Expand Up @@ -74,10 +74,11 @@ def generate_inputs(data, filename_map):
#Omics Integrator already gives warnings for strange prize values, so we won't here
node_df.to_csv(filename_map['prizes'],sep='\t',index=False,columns=['NODEID','prize'],header=['name','prize'])

#For now we assume all input networks are undirected until we expand how edge tables work
# Get network file
edges_df = data.get_interactome()
edges_df['directionality'] = 'U'
edges_df.to_csv(filename_map['edges'],sep='\t',index=False,columns=['Interactor1','Interactor2','Weight','directionality'],header=['protein1','protein2','weight','directionality'])

# Rename Direction column
edges_df.to_csv(filename_map['edges'],sep='\t',index=False,columns=['Interactor1','Interactor2','Weight','Direction'],header=['protein1','protein2','weight','directionality'])


# TODO add parameter validation
Expand Down
8 changes: 7 additions & 1 deletion src/omicsintegrator2.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import docker
import pandas as pd

from src.dataset import convert_directed_to_undirected
from src.prm import PRM
from src.util import prepare_path_docker

Expand All @@ -14,7 +15,7 @@
- in the algorithm, it uses nx.Graph() objects, which are undirected
- uses a pcst_fast solver which supports undirected graphs
Expected raw input format:
Expected raw input format:
Interactor1 Interactor2 Weight
- the expected raw input file should have node pairs in the 1st and 2nd columns, with a weight in the 3rd column
- it can include repeated and bidirectional edges
Expand Down Expand Up @@ -46,8 +47,13 @@ def generate_inputs(data, filename_map):

#Omics Integrator already gives warnings for strange prize values, so we won't here
node_df.to_csv(filename_map['prizes'],sep='\t',index=False,columns=['NODEID','prize'],header=['name','prize'])

# Create network file
edges_df = data.get_interactome()

# Format network file
edges_df = convert_directed_to_undirected(edges_df)

#We'll have to update this when we make interactomes more proper, but for now
# assume we always get a weight and turn it into a cost.
# use the same approach as omicsintegrator2 by adding half the max cost as the base cost.
Expand Down
Loading

0 comments on commit 3f173b5

Please sign in to comment.