-
Notifications
You must be signed in to change notification settings - Fork 0
/
interfaces_all.py
executable file
·131 lines (106 loc) · 7.22 KB
/
interfaces_all.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import pandas as pd
import numpy as np
import glob
from biopandas.pdb import PandasPdb
import warnings
warnings.filterwarnings("ignore")
from get_distances import *
import sys
from multiprocessing import Pool
import time
import math
import datetime
import pickle
import os
# Interface summary table (5 Å contact cutoff, per the filename) loaded once at
# import time; used as the default `interfaces_data` argument of
# find_interfaces_per_uniprot and inherited by the worker processes.
interfaces_data = pd.read_csv("/rcfs/projects/proteometer/ProtVar/predictions/interfaces/2024.05.28_interface_summary_5A.tsv", delimiter='\t', header=0)
def run_parallel_interfaces(number_of_threads, stability_data):
    """Run the per-protein interface annotation in parallel.

    Parameters
    ----------
    number_of_threads : int
        Number of worker processes for the multiprocessing Pool.
    stability_data : pandas.DataFrame
        Must contain a 'protein_acc' column; rows are partitioned by it and
        each partition is handed to find_interfaces_per_uniprot.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all per-uniprot result frames.
    """
    # One groupby pass instead of one boolean-mask scan per unique accession
    # (the original was O(n_rows * n_accessions)).  sort=False preserves the
    # first-appearance order that .unique() produced; NaN accessions are
    # dropped by groupby, which previously would have crashed the worker on
    # an empty sub-frame anyway.
    unique_uniprot_stability = [
        group.copy() for _, group in stability_data.groupby("protein_acc", sort=False)
    ]
    start_time = time.perf_counter()
    with Pool(number_of_threads) as pool:
        output = pool.map(find_interfaces_per_uniprot, unique_uniprot_stability)
    finish_time = time.perf_counter()
    # report wall-clock time for the parallel section
    elapsed = datetime.timedelta(seconds=finish_time - start_time)
    print("Program finished in {} - using multiprocessing with {} cores".format(str(elapsed), number_of_threads))
    full_df = pd.concat(output)
    return(full_df)
'''
this function does interface calculations for each uniprot that it is given
'''
# for each unique uniprotID...
# for uniprot in unique_uniprots:
def find_interfaces_per_uniprot(uniprot_only_stability, interfaces_data = interfaces_data, pickle_output = "/qfs/projects/proteometer/pheno_analysis/interfaces_pickle_files"):
    """Annotate one protein's rows with protein-protein interface proximity.

    Parameters
    ----------
    uniprot_only_stability : pandas.DataFrame
        Rows for a single protein; must contain 'protein_acc' and 'position'
        columns.  Modified in place (four columns added) and also returned.
    interfaces_data : pandas.DataFrame
        Interface summary table with 'uniprot_id1'/'uniprot_id2',
        'ifresid1'/'ifresid2' and 'interaction_id' columns.  NOTE: the
        default binds the module-level DataFrame at definition time.
    pickle_output : str
        Directory used as a per-uniprot cache; if '<dir>/<uniprot>.pkl'
        exists it is unpickled and returned immediately, skipping all work.

    Returns
    -------
    pandas.DataFrame
        Input frame with 'closest_interface', 'inside_interface',
        'min_distance_from_interface' and 'mean_distance_from_interface'
        filled in (left at their defaults when no AlphaFold PDB is found).
    """
    # read in interfaces data, uniprot human data, and get uniprot
    uniprot = uniprot_only_stability["protein_acc"].to_list()[0]
    pickle_file_path = f"{pickle_output}/{uniprot}.pkl"
    interface_only_uniprot = interfaces_data.loc[(interfaces_data['uniprot_id1'] == uniprot) | (interfaces_data['uniprot_id2'] == uniprot)] # isolate to uniprot in either 1 or 2
    # cache hit: return the previously pickled result unchanged
    if os.path.isfile(pickle_file_path):
        with open(pickle_file_path, 'rb') as handle:
            full_data = pickle.load(handle)
        return(full_data)
    # add columns to df (defaults mean "no interface found")
    uniprot_only_stability['closest_interface'] = ""
    uniprot_only_stability['inside_interface'] = 0
    uniprot_only_stability['min_distance_from_interface'] = np.nan
    uniprot_only_stability['mean_distance_from_interface'] = np.nan
    # parse your structure here (AlphaFold model, fragment F1 only)
    pdb_name = glob.glob("/rcfs/projects/proteometer/alphafold_swissprot_pdb/*" + uniprot + "-F1-*") # change this to have F1 using pdb file name
    print(pdb_name)
    #print("name of pdb is:", pdb_name)
    if len(pdb_name) != 0:
        ppdb = PandasPdb()
        ppdb.read_pdb(pdb_name[0])
        input_struct = ppdb.df['ATOM']
        # for each psp (phosphosite row)...
        for phosphosite_row_index in uniprot_only_stability.index:
            if pd.notna(uniprot_only_stability.loc[phosphosite_row_index,'position']): # only if there is a residue # (this threw an error previously)
                residue_num = int(uniprot_only_stability.loc[phosphosite_row_index,'position']) # finding the residue number of the psp
                min_dist = np.inf # make min dist extremely high at first
                mean_dist = np.inf
                #print(residue_num)
                # use the residue # to get the coordinates in space from pdb file
                interface_list = ""
                for interface_index in interface_only_uniprot.index : # get all the residues in all of the interfaces
                    if pd.notna(interface_only_uniprot.loc[interface_index,'ifresid1']) & pd.notna(interface_only_uniprot.loc[interface_index,'ifresid2']):
                        # pick the residue list for whichever side of the pair is this
                        # protein.  NOTE(review): these two condition lookups go through
                        # interfaces_data while the rest of the loop uses
                        # interface_only_uniprot; the indices coincide because .loc
                        # filtering preserves the index, but it's inconsistent.
                        if interfaces_data.loc[interface_index,'uniprot_id1'] == uniprot:
                            interface_residues = interface_only_uniprot.loc[interface_index,'ifresid1']
                        elif interfaces_data.loc[interface_index,'uniprot_id2'] == uniprot:
                            interface_residues = interface_only_uniprot.loc[interface_index,'ifresid2']
                        # check if it's inside of a interface
                        #print(interfaces_data.loc[interface_index,'interaction_id'])
                        interface_residues = [int(e[1:]) for e in interface_residues.split(",")] # strip the leading residue-type letter (e.g. 'S123' -> 123); the column is a comma-separated string
                        #print(interface_residues)
                        if residue_num in interface_residues:
                            #print("found inside interface!")
                            uniprot_only_stability.loc[phosphosite_row_index,'inside_interface'] = 1 # if residue is in the interface, put 1 in the inside interface column
                            # interaction_id looks like 'ACC1_ACC2'; drop this protein's
                            # accession so only the partner accession is recorded
                            interface_to_add = (interface_only_uniprot.loc[interface_index,'interaction_id'].split('_'))
                            interface_to_add.remove(uniprot)
                            # NOTE(review): joining onto the initial "" leaves a leading
                            # comma in 'closest_interface' (",P12345,...") -- confirm intended
                            interface_list = ','.join([interface_list, interface_to_add[0]])
                            uniprot_only_stability.loc[phosphosite_row_index,'closest_interface'] = interface_list # put unique interfaceID in closest interface
                            uniprot_only_stability.loc[phosphosite_row_index,'min_distance_from_interface'] = 0.0
                            new_min_dist, new_mean_dist = find_min_and_mean_distance(input_struct, residue_num, interface_residues)
                            # keep the smallest mean distance across interfaces
                            if new_mean_dist < mean_dist:
                                mean_dist = new_mean_dist
                                uniprot_only_stability.loc[phosphosite_row_index,'mean_distance_from_interface'] = mean_dist
                            min_dist = 0.0
                        elif min_dist != 0.0: # if the phosphosite isn't in any interfaces
                            new_min_dist, new_mean_dist = find_min_and_mean_distance(input_struct, residue_num, interface_residues)
                            #print("the new dist is:" , new_dist)
                            # NOTE(review): truthiness test also skips a legitimate mean
                            # distance of exactly 0.0 (and None/NaN-ish returns) -- confirm
                            if new_mean_dist:
                                if mean_dist > new_mean_dist: # if this is the smallest distance so far, replace min_dist with new_dist
                                    interface_to_add = (interface_only_uniprot.loc[interface_index,'interaction_id'].split('_'))
                                    interface_to_add.remove(uniprot)
                                    uniprot_only_stability.loc[phosphosite_row_index,'closest_interface'] = interface_to_add[0]
                                    uniprot_only_stability.loc[phosphosite_row_index,'mean_distance_from_interface'] = new_mean_dist # replace distance_from_interface with min_dist
                                    mean_dist = new_mean_dist
                                    uniprot_only_stability.loc[phosphosite_row_index,'min_distance_from_interface'] = new_min_dist
                                    min_dist = new_min_dist
                                    #print("replaced old dist with", min_dist)
    # cache the annotated frame so reruns can skip this uniprot entirely
    with open(pickle_file_path, 'wb') as handle:
        pickle.dump(uniprot_only_stability, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return(uniprot_only_stability)
if __name__ == "__main__":
    # Usage: interfaces_all.py <stability_csv> <output_csv> [num_threads]
    # The optional third argument generalizes the previously hard-coded
    # worker count; omitting it keeps the old behavior (64 workers).
    num_threads = int(sys.argv[3]) if len(sys.argv) > 3 else 64
    stability_data = pd.read_csv(sys.argv[1])
    output_location = sys.argv[2]
    # run_parallel_interfaces already returns a DataFrame, so no extra
    # pd.DataFrame(...) wrapper is needed before writing the CSV.
    df_to_export = run_parallel_interfaces(num_threads, stability_data)
    df_to_export.to_csv(output_location)