From 22ff338198a530dff27720475421ad96628b26a7 Mon Sep 17 00:00:00 2001
From: karolamik13 <karolami@pitt.edu>
Date: Sat, 16 Nov 2024 13:55:17 +0100
Subject: [PATCH] runDali added

---
 prody/proteins/interactions.py | 101 ++++++++++++++++++++++++++++++++-
 1 file changed, 100 insertions(+), 1 deletion(-)

diff --git a/prody/proteins/interactions.py b/prody/proteins/interactions.py
index 629898632..f799f3398 100644
--- a/prody/proteins/interactions.py
+++ b/prody/proteins/interactions.py
@@ -33,6 +33,8 @@
 
 import multiprocessing
 from .fixer import *
+from .compare import *
+from prody.measure import calcTransformation, calcDistance, calcRMSD, superpose
 
 
 __all__ = ['calcHydrogenBonds', 'calcChHydrogenBonds', 'calcSaltBridges',
@@ -49,7 +51,7 @@
            'Interactions', 'InteractionsTrajectory', 'LigandInteractionsTrajectory',
            'calcSminaBindingAffinity', 'calcSminaPerAtomInteractions', 'calcSminaTermValues',
            'showSminaTermValues', 'showPairEnergy', 'checkNonstandardResidues',
-           'saveInteractionsAsDummyAtoms', 'createFoldseekAlignment', 'runFoldseek', 
+           'saveInteractionsAsDummyAtoms', 'createFoldseekAlignment', 'runFoldseek', 'runDali', 
            'extractMultiModelPDB', 'calcSignatureInteractions']
 
 
@@ -3683,6 +3685,103 @@ def extract_sequence_from_pdb(pdb_file, chain, output_file):
     subprocess.run("rm -f inp.pdb prot.seq target.pdb temp_coord.txt temp_resind.txt temp_seq.txt", shell=True)
     subprocess.run("rm -rf tmp2 temp", shell=True)
     
+    
+def runDali(pdb, chain, **kwargs):
+    """This function calls searchDali() and downloads all the PDB files, separate by chains, add hydrogens and missing side chains,
+    and finally align them and put into the newly created folder.
+       
+    :arg pdb: A PDB code
+    :type pdb: str
+
+    :arg chain: chain identifier
+    :type chain: str
+
+    :arg cutoff_len: Length of aligned residues < cutoff_len (must be an integer or a float between 0 and 1)
+            See searchDali for more details 
+            by default 0.5
+    :type cutoff_len: float
+
+    :arg cutoff_rmsd: RMSD cutoff (see searchDali)
+            by default 1.0
+    :type cutoff_rmsd: float
+    
+    :arg subsetDali: fullPDB, PDB25, PDB50, PDB90
+            by default is 'fullPDB'
+    :type subsetDali: str    
+    
+    :arg fixer: The method for fixing lack of hydrogen bonds
+            by default is 'pdbfixer'
+    :type fixer: 'pdbfixer' or 'openbabel'
+    
+    :arg subset: subsets of atoms: 'ca', 'bb', 'heavy', 'noh', 'all'  (see matchChains())
+            by default is 'ca'
+    :type subset: str
+
+    :arg seqid: Minimum value of the sequence identity (see matchChains())
+            by default 5
+    :type seqid: float
+    
+    :arg overlap: percent overlap (see matchChains())
+            by default 50
+    :type overlap: float
+
+    :arg folder_name: Folder where the results will be collected
+            by default is 'struc_homologs'
+    :type folder_name: str
+    """
+
+    import os
+    import shutil
+    from prody.database import dali 
+    
+    cutoff_len = kwargs.pop('cutoff_len', 0.5)
+    cutoff_rmsd = kwargs.pop('cutoff_rmsd', 1.0)
+    fixer = kwargs.pop('fixer', 'pdbfixer')
+    subset_Dali = kwargs.pop('subset_Dali', 'fullPDB')
+    subset = kwargs.pop('subset', 'ca')
+    seqid = kwargs.pop('seqid', 5)
+    overlap = kwargs.pop('overlap', 50)
+    folder_name = kwargs.pop('folder_name', 'struc_homologs')
+    
+    dali_rec = dali.searchDali(pdb, chain, subset=subset_Dali)
+
+    while not dali_rec.isSuccess:
+        dali_rec.fetch()
+    
+    pdb_ids = dali_rec.filter(cutoff_len=cutoff_len, cutoff_rmsd=cutoff_rmsd)
+    pdb_hits = [ (i[:4], i[4:]) for i in pdb_ids ]
+    
+    list_pdbs = []
+    LOGGER.info('Separating chains and saving into PDB file')
+    for i in pdb_hits:
+        LOGGER.info('PDB code {} and chain {}'.format(i[0], i[1]))
+        p = parsePDB(i[0]).select('chain '+i[1]+' and protein')
+        writePDB(i[0]+i[1]+'.pdb', p)
+        list_pdbs.append(i[0]+i[1]+'.pdb')
+    
+    #LOGGER.info('Number of selected structures: ', len(list_pdbs))
+    LOGGER.info('Adding hydrogens to the structures..')
+    new_pdbids = fixStructuresMissingAtoms(list_pdbs, method='pdbfixer', model_residues=True, overwrite=True)
+
+    os.makedirs(folder_name)
+    structures = parsePDB(new_pdbids)
+    target = structures[0]
+    rmsds = []
+    for mobile in structures[1:]:
+        try:
+            LOGGER.info('Aligning the structures..')
+            i = mobile.getTitle()
+            LOGGER.info(i)
+            matches = matchChains(mobile.protein, target.protein, subset=subset, seqid=seqid, overlap=overlap)
+            m = matches[0]
+            m0_alg, T = superpose(m[0], m[1], weights=m[0].getFlags("mapped"))
+            rmsds.append(calcRMSD(m[0], m[1], weights=m[0].getFlags("mapped")))
+            source_file = 'align__'+i+'.pdb'
+            writePDB(source_file, mobile)
+            shutil.move(source_file, os.path.join(folder_name, os.path.basename(source_file)))
+        except TypeError:
+            raise TypeError('There is a problem with {}. Change seqid or overlap parameter to include the structure.'.format(i))
+    
 
 def calcSignatureInteractions(mapping_file, PDB_folder, **kwargs):
     """Analyzes protein structures to identify various interactions using InSty.