diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index 421468061..1b9e26f80 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -32,6 +32,10 @@ from prody.ensemble import Ensemble import multiprocessing +from .fixer import * +from .compare import * +from prody.measure import calcTransformation, calcDistance, calcRMSD, superpose + __all__ = ['calcHydrogenBonds', 'calcChHydrogenBonds', 'calcSaltBridges', 'calcRepulsiveIonicBonding', 'calcPiStacking', 'calcPiCation', @@ -47,7 +51,8 @@ 'Interactions', 'InteractionsTrajectory', 'LigandInteractionsTrajectory', 'calcSminaBindingAffinity', 'calcSminaPerAtomInteractions', 'calcSminaTermValues', 'showSminaTermValues', 'showPairEnergy', 'checkNonstandardResidues', - 'saveInteractionsAsDummyAtoms'] + 'saveInteractionsAsDummyAtoms', 'createFoldseekAlignment', 'runFoldseek', 'runDali', + 'runBLAST', 'extractMultiModelPDB', 'calcSignatureInteractions'] def cleanNumbers(listContacts): @@ -310,6 +315,225 @@ def showPairEnergy(data, **kwargs): return data +# SignatureInteractions supporting functions +def remove_empty_strings(row): + """Remove empty strings from a list.""" + return [elem for elem in row if elem != ''] + +def log_message(message, level="INFO"): + """Log a message with a specified log level.""" + LOGGER.info("[{}] {}".format(level, message)) + +def is_module_installed(module_name): + """Check if a Python module is installed.""" + import importlib.util + spec = importlib.util.find_spec(module_name) + return spec is not None + +def is_command_installed(command): + """Check if a command-line tool is installed.""" + import shutil + return shutil.which(command) is not None + +def load_residues_from_pdb(pdb_file): + """Extract residue numbers and their corresponding one-letter amino acid codes from a PDB file.""" + from prody.atomic.atomic import AAMAP + structure = parsePDB(pdb_file) + residues = structure.iterResidues() + residue_dict = {} + + for res in residues: + resnum = res.getResnum() + resname = res.getResname() # Three-letter amino acid code + try: + one_letter_code = AAMAP[resname] + residue_dict[resnum] = one_letter_code + except KeyError: + log_message("Unknown residue: {} at position {}".format(resname, resnum), "WARNING") + + return residue_dict + +def append_residue_code(residue_num, residue_dict): + """Return a string with one-letter amino acid code and residue number.""" + aa_code = residue_dict.get(residue_num, "X") # Use "X" for unknown residues + return "{}{}".format(aa_code, residue_num) + +def process_data(mapping_file, pdb_folder, interaction_func, bond_type, files_for_analysis): + """Process the mapping file and the PDB folder to compute interaction counts and percentages.""" + log_message("Loading mapping file: {}".format(mapping_file)) + + # Load and clean the mapping file + try: + mapping = np.loadtxt(mapping_file, delimiter=' ', dtype=str) + except Exception as e: + log_message("Error loading mapping file: {}".format(e), "ERROR") + return None + + mapping = np.where(mapping == '-', np.nan, mapping) + filtered_mapping = np.array([remove_empty_strings(row) for row in mapping]) + mapping_num = filtered_mapping.astype(float) + + # Load the one-letter amino acid codes from model1.pdb + import os + pdb_model_path = os.path.join(pdb_folder, 'model1.pdb') + residue_dict = load_residues_from_pdb(pdb_model_path) + + log_message("Processing PDB files in folder: {}".format(pdb_folder)) + + tar_bond_ind = [] + processed_files = 0 # To track the number of files successfully 
processed + fixed_files = [] # Store paths of fixed files to remove at the end + + for i, files in enumerate(files_for_analysis): + log_message("Processing file {}: {}".format(i + 1, files)) + + try: + coords = parsePDB(files) + atoms = coords.select('protein') + bonds = interaction_func(atoms) + saveInteractionsAsDummyAtoms(atoms, bonds, filename=files.rstrip(files.split('/')[-1])+'INT_'+bond_type+'_'+files.split('/')[-1]) + except Exception as e: + log_message("Error processing PDB file {}: {}".format(files, e), "ERROR") + continue + + # If no bonds were found, skip this file + if len(bonds) == 0: + log_message("No {} found in file {}, skipping.".format(bond_type, files), "WARNING") + continue + + processed_files += 1 # Increment successfully processed files + fixed_files.append(files) + + if processed_files == 1: # First valid file with bonds determines target indices + for entries in bonds: + ent = list(np.sort((int(entries[0][3:]), int(entries[3][3:])))) # Ensure integers + tar_bond_ind.append(ent) + tar_bond_ind = np.unique(np.array(tar_bond_ind), axis=0).astype(int) + count = np.zeros(tar_bond_ind.shape[0], dtype=int) + + bond_ind = [] + for entries in bonds: + ent = list(np.sort((int(entries[0][3:]), int(entries[3][3:])))) # Ensure integers + bond_ind.append(ent) + bond_ind = np.unique(np.array(bond_ind), axis=0) + + # Ensure bond_ind is a 2D array + if bond_ind.ndim == 1: + bond_ind = bond_ind.reshape(-1, 2) # Reshape to (n, 2) if it's 1D + + for j, pairs in enumerate(tar_bond_ind): + ind1_matches = np.where(mapping_num[0] == pairs[0])[0] + ind2_matches = np.where(mapping_num[0] == pairs[1])[0] + + if ind1_matches.size > 0 and ind2_matches.size > 0: + if processed_files - 1 < mapping_num.shape[0]: + ind1 = mapping_num[processed_files - 1, ind1_matches[0]] + ind2 = mapping_num[processed_files - 1, ind2_matches[0]] + + if not (np.isnan(ind1) or np.isnan(ind2)): + index = np.where(np.logical_and(bond_ind[:, 0] == int(ind1), bond_ind[:, 1] == int(ind2)))[0] + if index.size != 0: + count[j] += 1 + else: + log_message("Skipping file {} due to index out of bounds error".format(files), "WARNING") + else: + log_message("No matching indices found for {} in {}".format(pairs, files), "WARNING") + + # If no files were successfully processed or no bonds were found + if processed_files == 0 or len(tar_bond_ind) == 0: + log_message("No valid {} entries found across all PDB files.".format(bond_type), "ERROR") + return None + + count_reshaped = count.reshape(-1, 1) + count_normalized = (count / processed_files * 100).reshape(-1, 1) + + # Modify tar_bond_ind to append the amino acid code before residue index + tar_bond_with_aa = [] + for bond in tar_bond_ind: + res1 = append_residue_code(bond[0], residue_dict) # Append AA code for Res1 + res2 = append_residue_code(bond[1], residue_dict) # Append AA code for Res2 + tar_bond_with_aa.append([res1, res2]) + + tar_bond_with_aa = np.array(tar_bond_with_aa) + + # Combine tar_bond_with_aa with count and percentage + output_data = np.hstack((tar_bond_with_aa, count_reshaped, count_normalized)) + + log_message("Finished processing {} PDB files.".format(processed_files)) + output_filename = '{}_consensus.txt'.format(bond_type) + + # Save the result with amino acid codes and numeric values + np.savetxt(output_filename, output_data, fmt='%s %s %s %s', delimiter=' ', + header='Res1 Res2 Count Percentage', comments='') + + return output_data, fixed_files # Return fixed_files to remove later + + +def plot_barh(result, bond_type, **kwargs): + """Plot horizontal bar 
plots of percentages of interactions, splitting the data into fixed-sized plots. + + :arg n_per_plot: The number of results per one plot + Default is 20 if set to None + :type n_per_plot: int + + :arg min_height: Size of the bar plot + Default is 8 + :type min_height: int + """ + + import matplotlib.pylab as plt + plt.rcParams.update({'font.size': 20}) + + n_per_plot = kwargs.pop('n_per_plot', None) + min_height = kwargs.pop('min_height', 8) + + if n_per_plot is None: + n_per_plot = 20 + + # Error handling if result is None or empty + if result is None or len(result) == 0: + log_message("Skipping plot for {} due to insufficient data.".format(bond_type), "ERROR") + return + + num_entries = result.shape[0] + num_plots = (num_entries + n_per_plot - 1) // n_per_plot # Number of plots required + + for plot_idx in range(num_plots): + # Slice the data for the current plot (take the next 'n_per_plot' entries) + start_idx = plot_idx * n_per_plot + end_idx = min((plot_idx + 1) * n_per_plot, num_entries) + result_chunk = result[start_idx:end_idx] + + log_message("Plotting entries {} to {} for {}.".format(start_idx + 1, end_idx, bond_type)) + # Use residue numbers for y-axis labels + y_labels = ["{}-{}".format(str(row[0]), str(row[1])) for row in result_chunk] + percentage_values = result_chunk[:, 3].astype('float') + + norm = plt.Normalize(vmin=0, vmax=100) + cmap = plt.cm.get_cmap('coolwarm') + + # Set the figure height with a minimum height of `min_height` for small datasets + fig_height = max(min_height, len(result_chunk) * 0.4) + plt.figure(figsize=(18, fig_height)) + bars = plt.barh(y_labels, percentage_values, color=cmap(norm(percentage_values))) + + sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm) + sm.set_array([]) + + # Adjust color bar height to match the number of entries + plt.colorbar(sm, label='Percentage', fraction=0.02, pad=0.04) + + plt.ylim(-1, result_chunk.shape[0]) + plt.ylabel('{} Pairs of Residue Numbers'.format(bond_type)) + plt.xlabel('Percentage') + plt.title('Persistence of {} Bonds (entries {}-{})'.format(bond_type, start_idx + 1, end_idx)) + # Save each plot with an incremented filename for multiple plots + output_plot_file = '{}_plot_part{}.png'.format(bond_type, plot_idx + 1) + log_message("Saving plot to: {}".format(output_plot_file)) + plt.savefig(output_plot_file) + plt.close() + log_message("Plot saved successfully.") + def calcHydrophobicOverlapingAreas(atoms, **kwargs): """Provide information about hydrophobic contacts between pairs of residues based on @@ -392,15 +616,15 @@ def calcSASA(atoms, **kwargs): :type atoms: :class:`.Atomic` :arg selection: selection string - by default all the protein structure is used + Default all the protein structure is used :type selection: str :arg sasa_cutoff: cutoff for SASA values - default is 0.0 + Default is 0.0 :type sasa_cutoff: float, int :arg resnames: residues name included - default is False + Default is False :type resnames: bool """ @@ -450,19 +674,19 @@ def calcVolume(atoms, **kwargs): :type atoms: :class:`.Atomic` :arg selection: selection string - by default all the protein structure is used + Default all the protein structure is used :type selection: str :arg volume_cutoff: cutoff for volume - default is 0.0 to include all the results + Default is 0.0 to include all the results :type sasa_volume: float, int :arg split_residues: it will provide values for each residue - default is False + Default is False :type split_residues: bool :arg resnames: residues name included - default is False + Default is False True - 
will give residue names and values for each residue False - will give only the values for each residues :type resnames: bool @@ -1905,35 +2129,35 @@ def showInteractionsGraph(statistics, **kwargs): :type statistics: list :arg cutoff: Minimal number of counts per residue in the trajectory - by default 0.1. + Default is 0.1. :type cutoff: int, float :arg code: representation of the residues, 3-letter or 1-letter - by default 3-letter + Default is 3-letter :type code: str :arg multiple_chains: display chain name for structure with many chains - by default False + Default is False :type multiple_chains: bool :arg edge_cmap: color of the residue connection - by default plt.cm.Blues (blue color) + Default is plt.cm.Blues (blue color) :type edge_cmap: str :arg node_size: size of the nodes which describes residues - by default 300 + Default is 300 :type node_size: int :arg node_distance: value which will scale residue-residue interactions - by default 5 + Default is 5 :type node_distance: int :arg font_size: size of the font - by default 14 + Default is 14 :type font_size: int :arg seed: random number which affect the distribution of residues - by default 42 + Default is 42 :type seed: int """ @@ -2069,11 +2293,11 @@ def showInteractionsHist(statistics, **kwargs): :type statistics: list :arg clip: maxmimal number of residue pairs on the bar plot - by default 20 + Default is 20 :type clip: int :arg font_size: size of the font - by default 18 + Default is 18 :type font_size: int """ @@ -2376,7 +2600,7 @@ def listLigandInteractions(PLIP_output, **kwargs): :arg output: parameter to print the interactions on the screen while analyzing the structure (True | False) - by default is False + Default is False :type output: bool Note that five types of interactions are considered: hydrogen bonds, salt bridges, @@ -2758,23 +2982,23 @@ def calcSminaBindingAffinity(atoms, trajectory=None, **kwargs): :arg protein_selection: selection string for the protein and other compoment of the system that should be included, e.g. "protein and chain A", - by default "protein" + Default is "protein" :type protein_selection: str :arg ligand_selection: selection string for ligand, e.g. "resname ADP", - by default "all not protein" + Default is "all not protein" :type ligand_selection: str :arg ligand_selection: scoring function (vina or vinardo) - by default is "vina" + Default is "vina" :type ligand_selection: str :arg atom_terms: write per-atom interaction term values. It will provide more information as dictionary for each frame/model, and details for atom terms will be saved in terms_*frame_number*.txt, - by default is False + Default is False :type atom_terms: bool @@ -3028,6 +3252,748 @@ def showSminaTermValues(data): return show +def createFoldseekAlignment(prot_seq, prot_foldseek, **kwargs): + """Aligns sequences from prot_seq with homologous sequences identified in prot_foldseek, + generating a multiple sequence alignment. 
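+ + Example usage (illustrative; these are the intermediate file names that runFoldseek itself writes out and passes to this function): + >>> createFoldseekAlignment('prot.seq', 'prot.foldseek', msa_output_name='prot_struc.msa')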
+ + :arg prot_seq: The natural sequence extracted from the PDB (seq file) + :type prot_seq: str + + :arg prot_foldseek: The results from foldseek (foldseek file) + :type prot_foldseek: str + + :arg msa_output_name: Name of the output MSA file + Default is 'prot_struc.msa' + :type msa_output_name: str + """ + + msa_output_name = kwargs.pop('msa_output_name', 'prot_struc.msa') + + def find_match_index(tar_nogap, nat_seq): + tar_nogap_str = ''.join(tar_nogap) + nat_seq_str = ''.join(nat_seq) + index = nat_seq_str.find(tar_nogap_str) + return index + + # Read input files + with open(prot_seq, 'r') as f: + file1 = f.readlines() + + with open(prot_foldseek, 'r') as f: + file2 = f.readlines() + + # Open output file + with open(msa_output_name, 'w') as fp: + nat_seq = list(file1[0].strip()) + + # Write the natural sequence to the output file + fp.write(''.join(nat_seq) + "\n") + + # Process each foldseek entry + for line in file2: + entries = line.split() + + if float(entries[2]) >= 0.5: + tar_seq = list(entries[11].strip()) + mat_seq = list(entries[12].strip()) + + tar_nogap = [] + processed_mat = [] + + for j in range(len(tar_seq)): + if tar_seq[j] != '-': + tar_nogap.append(tar_seq[j]) + processed_mat.append(mat_seq[j]) + + match_index = find_match_index(tar_nogap, nat_seq) + end_index = match_index + len(tar_nogap) + m = 0 + + for l in range(len(nat_seq)): + if l < match_index: + fp.write("-") + elif l >= match_index and l < end_index: + fp.write(processed_mat[m]) + m += 1 + else: + fp.write("-") + fp.write("\n") + + LOGGER.info("MSA file is now created and saved as {}.".format(msa_output_name)) + + +def extractMultiModelPDB(multimodelPDB, **kwargs): + """Extracts individual PDB models from a multi-model PDB file and places them in the indicated directory. + If the models are to be used for calcSignatureInteractions, they should be aligned first. + + :arg multimodelPDB: The file containing models in multi-model PDB format + :type multimodelPDB: str + + :arg folder_name: The name of the folder to which PDBs will be extracted + :type folder_name: str + """ + + import os + + folder_name = kwargs.pop('folder_name', 'struc_homologs') + + with open(multimodelPDB, 'r') as f: + file = f.readlines() + os.makedirs(folder_name, exist_ok=True) + + fp = None + for line in file: + line = line.strip() + sig1 = line[:5] + sig2 = line[:6] + sig3 = line[:4] + + if sig1 == 'MODEL': + model_number = line.split()[1] + filename = 'model{}.pdb'.format(model_number) + fp = open(filename, 'w') + continue + + if sig2 == 'ENDMDL': + if fp: + fp.close() + os.rename(filename, './{}/{}'.format(folder_name,filename)) + continue + + if sig3 == 'ATOM' and fp: + fp.write("{}\n".format(line)) + + LOGGER.info("Individual models are saved in {}.".format(folder_name)) + + +def runFoldseek(pdb_file, chain, **kwargs): + """This function processes a PDB file to extract a specified chain's sequence, searches for + homologous structures using foldseek, and prepares alignment outputs for further analysis. + + Before using the function, install Foldseek: + >>> conda install bioconda::foldseek + More information can be found: + https://github.com/steineggerlab/foldseek?tab=readme-ov-file#databasesand + + This function will not work under Windows.
+ Example usage: runFoldseek('5kqm.pdb', 'A', database_folder='~/Downloads/foldseek/pdb') + where previous a folder called 'foldseek' were created and PDB database was uploaded using: + >>> foldseek databases PDB pdb tmp (Linux console) + + :arg pdb_file: A PDB file path + :type pdb_file: str + + :arg chain: Chain identifier + :type chain: str + + :arg coverage_threshold: Coverage threshold + Default is 0.3 + :type coverage_threshold: float + + :arg tm_threshold: TM-score threshold + Default is 0.5 + :type tm_threshold: float + + :arg database_folder: Folder with the database + Default is '~/Downloads/foldseek/pdb' + To download the database use: 'foldseek databases PDB pdb tmp' in the console + :type database_folder: str + + :arg fixer: The method for fixing lack of hydrogen bonds + Default is 'pdbfixer' + :type fixer: 'pdbfixer' or 'openbabel' + + :arg subset: subsets of atoms: 'ca', 'bb', 'heavy', 'noh', 'all' (see matchChains()) + Default is 'ca' + :type subset: str + + :arg seqid: Minimum value of the sequence identity (see matchChains()) + Default is 5 + :type seqid: float + + :arg overlap: percent overlap (see matchChains()) + Default is 50 + :type overlap: float + + :arg folder_name: Folder where the results will be collected + Default is 'struc_homologs' + :type folder_name: str """ + + import os + import subprocess + import re + import sys + + database_folder = kwargs.pop('database_folder', '~/Downloads/foldseek/pdb') + coverage_threshold = kwargs.pop('coverage_threshold', 0.3) + tm_threshold = kwargs.pop('tm_threshold', 0.5) + folder_name = kwargs.pop('folder_name', 'struc_homologs') + subset = kwargs.pop('subset', 'ca') + seqid = kwargs.pop('seqid', 5) + overlap = kwargs.pop('overlap', 50) + + if not isinstance(pdb_file, str): + raise TypeError('Please provide the name of the PDB file.') + + full_path = os.path.expanduser(database_folder) + if not os.path.exists(full_path.strip('pdb')): + raise ValueError('The required database is not found in {0}. 
Please download it first.'.format(database_folder.strip('pdb'))) + + # Define the amino acid conversion function + def aa_onelet(three_letter_code): + codes = { + 'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C', + 'GLN': 'Q', 'GLU': 'E', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I', + 'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P', + 'SER': 'S', 'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V', + 'MSE': 'M' + } + return codes.get(three_letter_code) + + # Function to extract the sequence from the PDB file + def extract_sequence_from_pdb(pdb_file, chain, output_file): + sequence = [] + prev_resid = -9999 + + with open(pdb_file, 'r') as pdb: + for line in pdb: + if line.startswith("ATOM"): + ch = line[21] + resid = int(line[22:26].strip()) + aa = line[17:20].strip() + + if ch == chain and resid != prev_resid: + one_aa = aa_onelet(aa) + if one_aa: + sequence.append(one_aa) + prev_resid = resid + + with open(output_file, 'w') as seq_file: + seq_file.write(''.join(sequence)) + + # Inputs + fpath = pdb_file.strip() + chain = chain.strip() + cov_threshold = float(coverage_threshold) + tm_threshold = float(tm_threshold) + + # Filter PDB file + awk_command = "awk '{{if (substr($0, 22, 1) == \"{0}\") print}}'".format(chain) + subprocess.run("cat {0} | grep '^ATOM' | {1} > inp.pdb".format(fpath, awk_command), shell=True) + + # Run foldseek and other commands + subprocess.run([ + 'foldseek', 'easy-search', 'inp.pdb', database_folder, 'prot.foldseek', + 'tmp2', '--exhaustive-search', '1', '--format-output', + "query,target,qstart,qend,tstart,tend,qcov,tcov,qtmscore,ttmscore,rmsd,qaln,taln", + '-c', str(cov_threshold), '--cov-mode', '0' + ]) + + # Extract sequence and write to prot.seq + extract_sequence_from_pdb('inp.pdb', chain, 'prot.seq') + createFoldseekAlignment('prot.seq', 'prot.foldseek', msa_output_name='prot_struc.msa') + + # Read input files + with open('inp.pdb', 'r') as f: + file1 = f.readlines() + + with open('prot.foldseek', 'r') as f: + file2 = f.readlines() + + with open('prot_struc.msa', 'r') as f: + file3 = f.readlines() + + # Open output files + fp1 = open("aligned_structures.pdb", 'w') + fp2 = open("shortlisted_resind.msa", 'w') + fp6 = open("seq_match_reconfirm.txt", 'w') + fp7 = open("aligned_structures_extended.pdb", 'w') + fp9 = open("shortlisted.foldseek", 'w') + fp10 = open("shortlisted.msa", 'w') + + # Initialize variables + mod_count = 1 + fp1.write("MODEL\t{0}\n".format(mod_count)) + fp7.write("MODEL\t{0}\n".format(mod_count)) + + seq1 = list(file3[0].strip()) + ind1 = 0 + prev_id = -9999 + + # Process input PDB file + for line in file1: + line = line.strip() + id_ = line[0:4] + ch = line[21] + resid = int(line[22:26].strip()) + aa = line[17:20] + + if id_ == 'ATOM' and ch == chain and resid != prev_id: + prev_id = resid + one_aa = aa_onelet(aa) + if one_aa == seq1[ind1]: + fp1.write("{0}\n".format(line)) + fp7.write("{0}\n".format(line)) + fp2.write("{0} ".format(resid)) + ind1 += 1 + else: + print("Mismatch in sequence and structure of Query protein at Index {0}".format(ind1)) + break + elif id_ == 'ATOM' and ch == chain and resid == prev_id: + fp1.write("{0}\n".format(line)) + fp7.write("{0}\n".format(line)) + + fp1.write("TER\nENDMDL\n\n") + fp7.write("TER\nENDMDL\n\n") + fp2.write("\n") + fp10.write("{0}\n".format(file3[0].strip())) + + # Processing foldseek results + os.makedirs("temp", exist_ok=True) + + for i, entry in enumerate(file2): + entries = re.split(r'\s+', entry.strip()) + + if float(entries[8]) < tm_threshold: + continue + + tstart = 
int(entries[4]) + tend = int(entries[5]) + pdb = entries[1][:4] + chain = entries[1][-1] + fname = "{0}.pdb".format(pdb) + + # Download and process the target PDB file + subprocess.run(['wget', '-P', 'temp', "https://files.rcsb.org/download/{0}".format(fname)]) + + awk_command = "awk '{{if (substr($0, 22, 1) == \"{0}\") print}}'".format(chain) + subprocess.run("cat ./temp/{0} | grep -E '^(ATOM|HETATM)' | {1} > target.pdb".format(fname, awk_command), shell=True) + + # Check if target.pdb has fewer than 5 lines + if sum(1 for _ in open("target.pdb")) < 5: + LOGGER.info("target.pdb has fewer than 5 lines. Skipping further processing.") + subprocess.run(['rm', 'target.pdb']) + continue + + with open('target.pdb', 'r') as target_file: + file4 = target_file.readlines() + + qseq = list(entries[11]) + tseq = list(entries[12]) + + start_line = 0 + prevind = -9999 + tarind = 0 + + for j, line in enumerate(file4): + resid = int(line[22:26].strip()) + if resid != prevind: + prevind = resid + tarind += 1 + if tarind == tstart: + start_line = j + break + + prevind = -9999 + ind2 = 0 + j = start_line + flag2 = False + + with open("temp_coord.txt", 'w') as fp4, \ + open("temp_resind.txt", 'w') as fp3, \ + open("temp_seq.txt", 'w') as fp5: + + for k in range(len(qseq)): + if tseq[k] != '-' and qseq[k] != '-': + line = file4[j].strip() + resid = int(line[22:26].strip()) + aa = line[17:20] + one_aa = aa_onelet(aa) + + if one_aa == tseq[k]: + fp3.write("{0} ".format(resid)) + fp5.write("{0}".format(one_aa)) + prevind = resid + else: + print("Mismatch in sequence and structure of Target protein {0}{1} at line {2} Index {3}-{4} ne {5}".format( + pdb, chain, j, k, one_aa, tseq[k])) + flag2 = True + break + + while resid == prevind: + fp4.write("{0}\n".format(line)) + j += 1 + if j >= len(file4): + break + line = file4[j].strip() + resid = int(line[22:26].strip()) + elif tseq[k] != '-' and qseq[k] == '-': + line = file4[j].strip() + resid = int(line[22:26].strip()) + aa = line[17:20] + one_aa = aa_onelet(aa) + + if one_aa == tseq[k]: + prevind = resid + else: + print("Mismatch in sequence and structure of Target protein {0}{1} at Line {2} Index {3}-{4} ne {5}".format( + pdb, chain, j, k, one_aa, tseq[k])) + flag2 = True + break + + while resid == prevind: + j += 1 + if j >= len(file4): + break + line = file4[j].strip() + resid = int(line[22:26].strip()) + + elif tseq[k] == '-' and qseq[k] != '-': + fp3.write("- ") + fp5.write("-") + + if flag2: + continue + + with open("temp_coord.txt", 'r') as f: + tmpcord = f.readlines() + + with open("temp_resind.txt", 'r') as f: + tmpresind = f.readlines() + + with open("temp_seq.txt", 'r') as f: + tmpseq = f.readlines() + + ind3 = i + 1 + seq1 = list(file3[ind3].strip()) + seq2 = tmpresind[0].strip().split() + + fp9.write("{0}".format(file2[i])) + fp10.write("{0}\n".format(file3[ind3].strip())) + + for m in range(len(seq1)): + if seq1[m] == '-': + fp2.write("{0} ".format(seq1[m])) + else: + break + + for n in range(len(seq2)): + fp2.write("{0} ".format(seq2[n])) + fp6.write("{0}".format(seq1[m])) + m += 1 + + for o in range(m, len(seq1)): + fp2.write("{0} ".format(seq1[m])) + + fp2.write("\n") + fp6.write("\n{0}\n\n\n".format(tmpseq[0])) + + mod_count += 1 + fp1.write("MODEL\t{0}\n".format(mod_count)) + fp7.write("MODEL\t{0}\n".format(mod_count)) + + for line in file4: + if line.strip(): + fp7.write("{0}\n".format(line.strip())) + + for line in tmpcord: + fp1.write("{0}".format(line)) + + fp1.write("TER\nENDMDL\n\n") + fp7.write("TER\nENDMDL\n\n") + + # Cleanup + fp1.close() 
+ fp2.close() + fp6.close() + fp7.close() + fp9.close() + fp10.close() + + extractMultiModelPDB('aligned_structures.pdb', folder_name=folder_name) + subprocess.run("rm -f inp.pdb prot.seq target.pdb temp_coord.txt temp_resind.txt temp_seq.txt", shell=True) + subprocess.run("rm -rf tmp2 temp", shell=True) + + # New part + pwd = os.path.abspath(folder_name) + list_pdbs = [pwd+'/'+ff for ff in os.listdir(pwd) if ff.endswith('.pdb')] + list_pdbs.sort(key=lambda x: int(''.join(filter(str.isdigit, x)))) # all structures will be aligned onto model1.pdb, as in the original code + + LOGGER.info('Adding hydrogens to the structures..') + new_pdbids = fixStructuresMissingAtoms(list_pdbs, method='pdbfixer', model_residues=True, overwrite=True) + + structures = parsePDB(new_pdbids) + target = structures[0] + rmsds = [] + for mobile in structures[1:]: + try: + LOGGER.info('Aligning the structures..') + i = mobile.getTitle() + LOGGER.info(i) + matches = matchChains(mobile.protein, target.protein, subset=subset, seqid=seqid, overlap=overlap) + m = matches[0] + m0_alg, T = superpose(m[0], m[1], weights=m[0].getFlags("mapped")) + rmsds.append(calcRMSD(m[0], m[1], weights=m[0].getFlags("mapped"))) + source_file = pwd+'/'+'align__'+i+'.pdb' + writePDB(source_file, mobile) + except: + LOGGER.warn('There is a problem with {}. Change seqid or overlap parameter to include the structure.'.format(i)) + + +def runDali(pdb, chain, **kwargs): + """This function calls searchDali(), downloads all the matched PDB files, separates them by chain, adds hydrogens and missing side chains, + and finally aligns them and puts them into the newly created folder. + + :arg pdb: A PDB code + :type pdb: str + + :arg chain: chain identifier + :type chain: str + + :arg cutoff_len: Length of aligned residues < cutoff_len (must be an integer or a float between 0 and 1) + See searchDali for more details + Default is 0.5 + :type cutoff_len: float + + :arg cutoff_rmsd: RMSD cutoff (see searchDali) + Default is 1.0 + :type cutoff_rmsd: float + + :arg subset_Dali: fullPDB, PDB25, PDB50, PDB90 + Default is 'fullPDB' + :type subset_Dali: str + + :arg fixer: The method for fixing lack of hydrogen bonds + Default is 'pdbfixer' + :type fixer: 'pdbfixer' or 'openbabel' + + :arg subset: subsets of atoms: 'ca', 'bb', 'heavy', 'noh', 'all' (see matchChains()) + Default is 'ca' + :type subset: str + + :arg seqid: Minimum value of the sequence identity (see matchChains()) + Default is 5 + :type seqid: float + + :arg overlap: percent overlap (see matchChains()) + Default is 50 + :type overlap: float + + :arg folder_name: Folder where the results will be collected + Default is 'struc_homologs' + :type folder_name: str """ + + import os + import shutil + from prody.database import dali + + cutoff_len = kwargs.pop('cutoff_len', 0.5) + cutoff_rmsd = kwargs.pop('cutoff_rmsd', 1.0) + fixer = kwargs.pop('fixer', 'pdbfixer') + subset_Dali = kwargs.pop('subset_Dali', 'fullPDB') + subset = kwargs.pop('subset', 'ca') + seqid = kwargs.pop('seqid', 5) + overlap = kwargs.pop('overlap', 50) + folder_name = kwargs.pop('folder_name', 'struc_homologs') + + dali_rec = dali.searchDali(pdb, chain, subset=subset_Dali) + + while not dali_rec.isSuccess: + dali_rec.fetch() + + pdb_ids = dali_rec.filter(cutoff_len=cutoff_len, cutoff_rmsd=cutoff_rmsd) + pdb_hits = [ (i[:4], i[4:]) for i in pdb_ids ] + + list_pdbs = [] + LOGGER.info('Separating chains and saving into PDB file') + for i in pdb_hits: + LOGGER.info('PDB code {} and chain {}'.format(i[0], i[1])) + p = parsePDB(i[0]).select('chain '+i[1]+' 
and protein') + writePDB(i[0]+i[1]+'.pdb', p) + list_pdbs.append(i[0]+i[1]+'.pdb') + + LOGGER.info('Adding hydrogens to the structures..') + new_pdbids = fixStructuresMissingAtoms(list_pdbs, method='pdbfixer', model_residues=True, overwrite=True) + + os.makedirs(folder_name) + structures = parsePDB(new_pdbids) + target = structures[0] + rmsds = [] + for mobile in structures[1:]: + try: + LOGGER.info('Aligning the structures..') + i = mobile.getTitle() + LOGGER.info(i) + matches = matchChains(mobile.protein, target.protein, subset=subset, seqid=seqid, overlap=overlap) + m = matches[0] + m0_alg, T = superpose(m[0], m[1], weights=m[0].getFlags("mapped")) + rmsds.append(calcRMSD(m[0], m[1], weights=m[0].getFlags("mapped"))) + source_file = 'align__'+i+'.pdb' + writePDB(source_file, mobile) + shutil.move(source_file, os.path.join(folder_name, os.path.basename(source_file))) + except: + LOGGER.warn('There is a problem with {}. Change seqid or overlap parameter to include the structure.'.format(i)) + + +def runBLAST(pdb, chain, **kwargs): + """This function calls blastPDB to find homologs and downloads all of them in PDB format to the local directory, + separate chains that were identified by BLAST, add hydrogens and missing side chains, + and finally align them and put into the newly created folder. + + :arg pdb: A PDB code + :type pdb: str + + :arg chain: chain identifier + :type chain: str + + :arg fixer: The method for fixing lack of hydrogen bonds + Default is 'pdbfixer' + :type fixer: 'pdbfixer' or 'openbabel' + + :arg subset: subsets of atoms: 'ca', 'bb', 'heavy', 'noh', 'all' (see matchChains()) + Default is 'bb' + :type subset: str + + :arg seqid: Minimum value of the sequence identity (see matchChains()) + Default is 90 + :type seqid: float + + :arg overlap: percent overlap (see matchChains()) + Default is 50 + :type overlap: float + + :arg folder_name: Folder where the results will be collected + Default is 'struc_homologs' + :type folder_name: str """ + + import os + import shutil + from prody.proteins.blastpdb import blastPDB + + fixer = kwargs.pop('fixer', 'pdbfixer') + seqid = kwargs.pop('seqid', 90) + overlap = kwargs.pop('overlap', 50) + subset = kwargs.pop('subset', 'bb') + folder_name = kwargs.pop('folder_name', 'struc_homologs') + + ref_prot = parsePDB(pdb) + ref_hv = ref_prot.getHierView()[chain] + sequence = ref_hv.getSequence() + + blast_record = blastPDB(sequence) + while not blast_record.isSuccess: + blast_record.fetch() + + pdb_hits = [] + for key, item in blast_record.getHits(seqid).items(): + pdb_hits.append((key, item['chain_id'])) + + list_pdbs = [] + LOGGER.info('Separating chains and saving into PDB file') + for i in pdb_hits: + LOGGER.info('PDB code {} and chain {}'.format(i[0], i[1])) + p = parsePDB(i[0]).select('chain '+i[1]+' and protein') + writePDB(i[0]+i[1]+'.pdb', p) + list_pdbs.append(i[0]+i[1]+'.pdb') + + LOGGER.info('Adding hydrogens to the structures..') + new_pdbids = fixStructuresMissingAtoms(list_pdbs, method='pdbfixer', model_residues=True, overwrite=True) + + os.makedirs(folder_name) + structures = parsePDB(new_pdbids) + target = structures[0] + rmsds = [] + for mobile in structures[1:]: + try: + LOGGER.info('Aligning the structures..') + i = mobile.getTitle() + LOGGER.info(i) + matches = matchChains(mobile.protein, target.protein, subset=subset, seqid=seqid, overlap=overlap) + m = matches[0] + m0_alg, T = superpose(m[0], m[1], weights=m[0].getFlags("mapped")) + rmsds.append(calcRMSD(m[0], m[1], weights=m[0].getFlags("mapped"))) + source_file = 
'align__'+i+'.pdb' + writePDB(source_file, mobile) + shutil.move(source_file, os.path.join(folder_name, os.path.basename(source_file))) + except: + LOGGER.warn('There is a problem with {}. Change seqid or overlap parameter to include the structure.'.format(i)) + + +def calcSignatureInteractions(PDB_folder, **kwargs): + """Analyzes protein structures to identify various interactions using InSty. + Processes data from the MSA file and folder with selected models. + + Example usage: + >>> calcSignatureInteractions('./struc_homologs') + >>> calcSignatureInteractions('./struc_homologs', mapping_file='shortlisted_resind.msa') + + :arg PDB_folder: Directory containing PDB model files + :type PDB_folder: str + + :arg use_prefix: Whether to use the 'align_' prefix to select particular file names in the PDB_folder + Default is True + :type use_prefix: bool + + :arg mapping_file: Aligned residue indices, MSA file type, + required for the Foldseek analysis + :type mapping_file: str """ + + import os + mapping_file = kwargs.get('mapping_file') + use_prefix = kwargs.pop('use_prefix', True) + + if use_prefix: + align_files = [os.path.join(PDB_folder, file) for file in os.listdir(PDB_folder) if file.startswith("align_")] + + else: + align_files = [os.path.join(PDB_folder, file) for file in os.listdir(PDB_folder)] + + functions_dict = { + "HBs": calcHydrogenBonds, + "SBs": calcSaltBridges, + "RIB": calcRepulsiveIonicBonding, + "PiStack": calcPiStacking, + "PiCat": calcPiCation, + "HPh": calcHydrophobic, + "DiBs": calcDisulfideBonds + } + + if not mapping_file: + for bond_type, func in functions_dict.items(): + for file in align_files: + try: + LOGGER.info(file) + atoms = parsePDB(file) + interactions = func(atoms.select('protein')) + saveInteractionsAsDummyAtoms(atoms, interactions, filename=file.rstrip(file.split('/')[-1])+'INT_'+bond_type+'_'+file.split('/')[-1]) + except: pass + + if mapping_file: + # for MSA file (Foldseek) + n_per_plot = kwargs.pop('n_per_plot', None) + min_height = kwargs.pop('min_height', 8) + + # Process each bond type + for bond_type, func in functions_dict.items(): + # Check if the consensus file already exists + consensus_file = '{}_consensus.txt'.format(bond_type) + if os.path.exists(consensus_file): + log_message("Consensus file for {} already exists, skipping.".format(bond_type), "INFO") + continue + + log_message("Processing {}".format(bond_type)) + result = process_data(mapping_file, PDB_folder, func, bond_type, files_for_analysis=align_files) + + # Check if the result is None (no valid bonds found) + if result is None: + log_message("No valid {} entries found, skipping further processing.".format(bond_type), "WARNING") + continue + + result, fixed_files = result + + # Proceed with plotting + plot_barh(result, bond_type, n_per_plot=n_per_plot, min_height=min_height) + + class Interactions(object): @@ -3462,7 +4428,7 @@ def buildInteractionMatrixEnergy(self, **kwargs): """Build matrix with interaction energy coming from energy of pairs of specific residues. :arg energy_list_type: name of the list with energies - default is 'IB_solv' + Default is 'IB_solv' acceptable values are 'IB_nosolv', 'IB_solv', 'CS' :type energy_list_type: str @@ -3470,8 +4436,7 @@ O Keskin, I Bahar and colleagues from [OK98]_ and have RT units. 'CS' is from MD simulations of amino acid pairs from Carlos Simmerling - and Gary Wu in the InSty paper (under preparation) and have units of kcal/mol. 
- """ + and Gary Wu in the InSty paper (under preparation) and have units of kcal/mol. """ import numpy as np import matplotlib @@ -3634,7 +4599,7 @@ def getFrequentInteractors(self, contacts_min=2): (7) Disulfide bonds (disb) :arg contacts_min: Minimal number of contacts which residue may form with other residues, - by default 2. + Default is 2. :type contacts_min: int """ atoms = self._atoms @@ -4761,7 +5726,7 @@ def getLigandInteractions(self, **kwargs): :arg include_frames: used with filters, it will leave selected keyword in orginal lists, if False it will collect selected interactions in one list, Use True to assign new selection using setLigandInteractions. - by default True + Default is True :type include_frames: bool """ @@ -4811,7 +5776,7 @@ def getLigandInteractionsNumber(self, **kwargs): be a total number of interactions or it can be divided into interaction types. :arg types: Interaction types can be included (True) or not (False). - by default is True. + Default is True. :type types: bool """ @@ -4953,7 +5918,7 @@ def saveInteractionsPDB(self, **kwargs): :type filename: str :arg ligand_sele: ligand selection, - by default is 'all not (protein or water or ion)'. + Default is 'all not (protein or water or ion)'. :type ligand_sele: str """