From f2956d96bcc5a099f4a80e646e19de91ffcb7c89 Mon Sep 17 00:00:00 2001 From: karolamik13 Date: Sun, 10 Nov 2024 17:44:15 +0100 Subject: [PATCH] createFoldseekAlignment --- prody/proteins/interactions.py | 70 +++++++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py index 428e04be2..1e7c4a106 100644 --- a/prody/proteins/interactions.py +++ b/prody/proteins/interactions.py @@ -51,7 +51,8 @@ 'Interactions', 'InteractionsTrajectory', 'LigandInteractionsTrajectory', 'calcSminaBindingAffinity', 'calcSminaPerAtomInteractions', 'calcSminaTermValues', 'showSminaTermValues', 'showPairEnergy', 'checkNonstandardResidues', - 'saveInteractionsAsDummyAtoms', 'extractMultiModelPDB', 'calcSignatureInteractions'] + 'saveInteractionsAsDummyAtoms', 'createFoldseekAlignment', + 'extractMultiModelPDB', 'calcSignatureInteractions'] def cleanNumbers(listContacts): @@ -3239,6 +3240,73 @@ def showSminaTermValues(data): return show +def createFoldseekAlignment(prot_seq, prot_foldseek, *kwargs): + """Aligns sequences from prot_seq with homologous sequences identified in prot_foldseek, + generating a multiple sequence alignment. + + :arg prot_seq: The natural sequence extracted from the PDB (seq file) + :type prot_seq: str + + :arg prot_foldseek: The results from foldseek (foldseek file) + :type prot_foldseek: str + + :arg msa_output_name: The natural sequence extracted from the PDB (msa file) + :type msa_output_name: str + """ + + msa_output_name = kwargs.pop('msa_output_name', 'prot_struc.msa') + + def find_match_index(tar_nogap, nat_seq): + tar_nogap_str = ''.join(tar_nogap) + nat_seq_str = ''.join(nat_seq) + index = nat_seq_str.find(tar_nogap_str) + return index + + # Read input files + with open(prot_seq, 'r') as f: + file1 = f.readlines() + + with open(prot_foldseek, 'r') as f: + file2 = f.readlines() + + # Open output file + with open(msa_output_name, 'w') as fp: + nat_seq = list(file1[0].strip()) + + # Write the natural sequence to the output file + fp.write(''.join(nat_seq) + "\n") + + # Process each foldseek entry + for line in file2: + entries = line.split() + + if float(entries[2]) >= 0.5: + tar_seq = list(entries[11].strip()) + mat_seq = list(entries[12].strip()) + + tar_nogap = [] + processed_mat = [] + + for j in range(len(tar_seq)): + if tar_seq[j] != '-': + tar_nogap.append(tar_seq[j]) + processed_mat.append(mat_seq[j]) + + match_index = find_match_index(tar_nogap, nat_seq) + end_index = match_index + len(tar_nogap) + m = 0 + + for l in range(len(nat_seq)): + if l < match_index: + fp.write("-") + elif l >= match_index and l < end_index: + fp.write(processed_mat[m]) + m += 1 + else: + fp.write("-") + fp.write("\n") + + def extractMultiModelPDB(multimodelPDB, **kwargs): """Extracts individual PDB models from multimodel PDB and places them into the pointed directory. If used for calculating calcSignatureInteractions align the models.