From 8b53f715cebd77977c6ce1b782c386da33e71225 Mon Sep 17 00:00:00 2001 From: James Krieger Date: Fri, 7 Jun 2024 14:22:28 +0200 Subject: [PATCH 1/8] add fixer for multiple structures --- prody/proteins/fixer.py | 92 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 81 insertions(+), 11 deletions(-) diff --git a/prody/proteins/fixer.py b/prody/proteins/fixer.py index 26114bcbb..7cff7d1a9 100644 --- a/prody/proteins/fixer.py +++ b/prody/proteins/fixer.py @@ -19,12 +19,16 @@ __email__ = ['karolamik@fizyka.umk.pl', 'jamesmkrieger@gmail.com'] from prody import LOGGER +from numbers import Integral, Number -__all__ = ['addMissingAtoms'] +__all__ = ['addMissingAtoms', 'fixStructuresMissingAtoms'] def addMissingAtoms(infile, method='openbabel', pH=7.0, outfile=None, **kwargs): - """Function will add hydrogens to the protein and ligand structure using Openbabel [NO11]_ - or PDBFixer with OpenMM. + """This function will add hydrogens to the protein and ligand structure using Openbabel [NO11]_ + or PDBFixer with OpenMM. + + There are also options whether to *model_residues* (default False), *remove_heterogens* + (default False) and *keep_waters* (default True). 
:arg infile: PDB file name :type infile: str @@ -56,6 +60,24 @@ def addMissingAtoms(infile, method='openbabel', pH=7.0, outfile=None, **kwargs): import os + if not isinstance(model_residues, bool): + raise TypeError('model_residues should be True or False') + + if not isinstance(remove_heterogens, bool): + raise TypeError('remove_heterogens should be True or False') + + if not isinstance(keep_water, bool): + raise TypeError('keep_water should be True or False') + + if not isinstance(infile, str): + raise TypeError('infile should be a string pointing to a file') + + if not os.path.exists(infile): + raise ValueError('infile {0} does not exist'.format(infile)) + + if not isinstance(pH, Number): + raise TypeError('pH should be a number') + if outfile == None: outfile = os.path.join(os.path.split(infile)[0], "addH_" + os.path.split(infile)[1]) @@ -70,17 +92,18 @@ def addMissingAtoms(infile, method='openbabel', pH=7.0, outfile=None, **kwargs): raise ValueError('Openbabel cannot handle cif files') try: - #import openbabel from openbabel import openbabel - obconversion = openbabel.OBConversion() - obconversion.SetInFormat("pdb") - mol = openbabel.OBMol() - obconversion.ReadFile(mol, infile) - mol.AddHydrogens() - obconversion.WriteFile(mol, outfile) - LOGGER.info("Hydrogens were added to the structure. Structure {0} is saved in the local directry.".format(outfile)) except ImportError: raise ImportError("Install Openbabel to add hydrogens to the structure or use PDBFixer/OpenMM.") + + obconversion = openbabel.OBConversion() + obconversion.SetInFormat("pdb") + mol = openbabel.OBMol() + obconversion.ReadFile(mol, infile) + mol.AddHydrogens() + obconversion.WriteFile(mol, outfile) + LOGGER.info("Hydrogens were added to the structure. 
Structure {0} is saved in the local directry.".format(outfile)) + elif method == 'pdbfixer': try: @@ -115,3 +138,50 @@ def addMissingAtoms(infile, method='openbabel', pH=7.0, outfile=None, **kwargs): return outfile +def fixStructuresMissingAtoms(infiles, method='openbabel', pH=7.0, outfiles=None, **kwargs): + """This function will add hydrogens to the protein and ligand structure using Openbabel [NO11]_ + or PDBFixer with OpenMM. + + There are also options whether to *model_residues* (default False), *remove_heterogens* + (default False) and *keep_waters* (default True). + + :arg infiles: a list of PDB file names + :type infiles: list + + :arg method: Name of program which will be used to fix protein structure. + Two alternative options are available: 'openbabel' and 'pdbfixer'. + For either option additional software needs to be installed: + 'openbabel': OpenBabel + 'pdbfixer': PDBFixer and OpenMM + default is 'openbabel' + :type method: str + + :arg pH: pH value applied only for PDBfixer. + :type pH: int, float + + Installation of Openbabel: + conda install -c conda-forge openbabel + + Find more information here: https://anaconda.org/conda-forge/openbabel + https://github.com/openmm/pdbfixer + Program will create new file in the same directory with 'addH_' prefix. + + .. [NO11] O'Boyle, N. M., Banck M., James C. A., Morley C., Vandermeersch T., Hutchison G. R. + Open Babel: An open chemical toolbox *Journal of cheminformatics* **2011** 3:1-14.
""" + + if not isinstance(infiles, list): + raise TypeError('infiles should be a list') + + if outfiles is None: + outfiles = [None for infile in infiles] + + if not isinstance(outfiles, list): + raise TypeError('outfiles should be None or a list') + if len(outfiles) != len(infiles): + raise ValueError('outfiles should have the same length as infiles') + + results = [] + for i, infile in enumerate(infiles): + results.append(addMissingAtoms(infile, method, pH, + outfiles[i], **kwargs)) + return results From 2e8bfedc2bf5ac25f6b5600789e91db4acfb0757 Mon Sep 17 00:00:00 2001 From: James Krieger Date: Fri, 7 Jun 2024 14:34:05 +0200 Subject: [PATCH 2/8] add option not overwrite outfile --- prody/proteins/fixer.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/prody/proteins/fixer.py b/prody/proteins/fixer.py index 7cff7d1a9..2372e0774 100644 --- a/prody/proteins/fixer.py +++ b/prody/proteins/fixer.py @@ -28,7 +28,7 @@ def addMissingAtoms(infile, method='openbabel', pH=7.0, outfile=None, **kwargs): or PDBFixer with OpenMM. There are also options whether to *model_residues* (default False), *remove_heterogens* - (default False) and *keep_waters* (default True). + (default False), *keep_waters* (default True), *overwrite* (default False). 
:arg infile: PDB file name :type infile: str @@ -57,6 +57,7 @@ def addMissingAtoms(infile, method='openbabel', pH=7.0, outfile=None, **kwargs): model_residues = kwargs.get("model_residues", False) remove_heterogens = kwargs.get("remove_heterogens", False) keep_water = kwargs.get("keep_water", True) + overwrite = kwargs.get("overwrite", False) import os @@ -68,6 +69,9 @@ def addMissingAtoms(infile, method='openbabel', pH=7.0, outfile=None, **kwargs): if not isinstance(keep_water, bool): raise TypeError('keep_water should be True or False') + + if not isinstance(overwrite, bool): + raise TypeError('overwrite should be True or False') if not isinstance(infile, str): raise TypeError('infile should be a string pointing to a file') @@ -79,7 +83,13 @@ def addMissingAtoms(infile, method='openbabel', pH=7.0, outfile=None, **kwargs): raise TypeError('pH should be a number') if outfile == None: - outfile = os.path.join(os.path.split(infile)[0], "addH_" + os.path.split(infile)[1]) + outfile = os.path.join(os.path.split(infile)[0], + "addH_" + os.path.split(infile)[1]) + + if os.path.exists(outfile) and not overwrite: + LOGGER.warn('outfile {0} already exists, so returning it. 
\ +Set overwrite=True to overwrite it'.format(outfile)) + return outfile if outfile == infile: raise ValueError('outfile cannot be the same as infile') From 12abe341d2d5fc11e426013cc3103abf793b546e Mon Sep 17 00:00:00 2001 From: James Krieger Date: Fri, 7 Jun 2024 14:51:34 +0200 Subject: [PATCH 3/8] add filter water --- prody/proteins/waterbridges.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/prody/proteins/waterbridges.py b/prody/proteins/waterbridges.py index a01c9f114..9179a7d01 100644 --- a/prody/proteins/waterbridges.py +++ b/prody/proteins/waterbridges.py @@ -28,7 +28,8 @@ 'calcWaterBridgesStatistics', 'getWaterBridgeStatInfo', 'calcWaterBridgeMatrix', 'showWaterBridgeMatrix', 'calcBridgingResiduesHistogram', 'calcWaterBridgesDistribution', 'savePDBWaterBridges', 'savePDBWaterBridgesTrajectory', - 'saveWaterBridges', 'parseWaterBridges', 'findClusterCenters'] + 'saveWaterBridges', 'parseWaterBridges', 'findClusterCenters', + 'filterStructuresWithoutWater'] class ResType(Enum): @@ -1149,3 +1150,21 @@ def findClusterCenters(file_pattern, **kwargs): filename = 'clusters.pdb' writePDB(filename, selectedWaters) LOGGER.info("Results are saved in {0}.".format(filename)) + +def filterStructuresWithoutWater(structures, min_water=0): + new_structures = [] + for struct in structures: + title = struct.getTitle() + waters = struct.select('water') + + if waters == None: + LOGGER.warn(title+" doesn't contain water molecules") + continue + + numWaters = waters.numAtoms() + if numWaters < min_water: + LOGGER.warn(title+" doesn't contain enough water molecules ({0})".format(numWaters)) + else: + new_structures.append(struct) + + return new_structures From 4bf1ffe1a21de5dcb5d4ddce9bab43b46c7de546 Mon Sep 17 00:00:00 2001 From: James Krieger Date: Fri, 7 Jun 2024 15:27:47 +0200 Subject: [PATCH 4/8] filter filenames too --- prody/proteins/waterbridges.py | 56 +++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 5 
deletions(-) diff --git a/prody/proteins/waterbridges.py b/prody/proteins/waterbridges.py index 9179a7d01..3694c206e 100644 --- a/prody/proteins/waterbridges.py +++ b/prody/proteins/waterbridges.py @@ -8,6 +8,7 @@ __email__ = ['karolamik@fizyka.umk.pl', 'fdoljanin@pmfst.hr'] import numpy as np +import os from itertools import combinations from collections import deque @@ -1151,20 +1152,65 @@ def findClusterCenters(file_pattern, **kwargs): writePDB(filename, selectedWaters) LOGGER.info("Results are saved in {0}.".format(filename)) -def filterStructuresWithoutWater(structures, min_water=0): +def filterStructuresWithoutWater(structures, min_water=0, filenames=None): + """This function will filter out structures from *structures* that have no water + or fewer water molecules than *min_water*. + + :arg structures: list of :class:`.Atomic` structures to be filtered + :type structures: list + + :arg min_water: minimum number of water O atoms, + default is 0 + :type min_water: int + + :arg filenames: an optional list of filenames to filter too + This is an output argument + :type filenames: list + """ + + if not isinstance(structures, list): + raise TypeError('structures should be a list') + + if not np.alltrue([isinstance(struct, Atomic) for struct in structures]): + raise ValueError('elements of structures should be Atomic objects') + + if not isinstance(min_water, int): + raise TypeError('min_water should be an integer') + + if filenames is None: filenames = [] + + if not isinstance(filenames, list): + raise TypeError('filenames should be None or a list') + + if len(filenames) not in [0, len(structures)]: + raise TypeError('filenames should have the same length as structures') + + if not np.alltrue([isinstance(filename, str) for filename in filenames]): + raise ValueError('elements of filenames should be strings') + + if not np.alltrue([os.path.exists(filename) for filename in filenames]): + raise ValueError('at least one of the filenames does not exist') + + 
have_filenames = len(filenames)>0 + new_structures = [] - for struct in structures: + for i, struct in enumerate(reversed(structures)): title = struct.getTitle() - waters = struct.select('water') + waters = struct.select('water and name O') if waters == None: LOGGER.warn(title+" doesn't contain water molecules") + if have_filenames: + filenames.pop(-i) continue numWaters = waters.numAtoms() if numWaters < min_water: LOGGER.warn(title+" doesn't contain enough water molecules ({0})".format(numWaters)) - else: - new_structures.append(struct) + if have_filenames: + filenames.pop(-i) + continue + + new_structures.append(struct) return new_structures From ec001c751c192a87463df90e7de3b16b6143d097 Mon Sep 17 00:00:00 2001 From: James Krieger Date: Fri, 7 Jun 2024 15:27:55 +0200 Subject: [PATCH 5/8] improve fixer docs --- prody/proteins/fixer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/prody/proteins/fixer.py b/prody/proteins/fixer.py index 2372e0774..ad30aa731 100644 --- a/prody/proteins/fixer.py +++ b/prody/proteins/fixer.py @@ -149,8 +149,8 @@ def addMissingAtoms(infile, method='openbabel', pH=7.0, outfile=None, **kwargs): def fixStructuresMissingAtoms(infiles, method='openbabel', pH=7.0, outfiles=None, **kwargs): - """This function will add hydrogens to the protein and ligand structure using Openbabel [NO11]_ - or PDBFixer with OpenMM. + """This function will add hydrogens to the protein and ligand structure from a set of files + using Openbabel [NO11]_ or PDBFixer with OpenMM. There are also options whether to *model_residues* (default False), *remove_heterogens* (default False) and *keep_waters* (default True). 
From 7010ce549855ec85734b07957136db2c704cb1d4 Mon Sep 17 00:00:00 2001 From: James Krieger Date: Fri, 7 Jun 2024 15:37:18 +0200 Subject: [PATCH 6/8] fix filter filenames --- prody/proteins/waterbridges.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/prody/proteins/waterbridges.py b/prody/proteins/waterbridges.py index 3694c206e..53e271320 100644 --- a/prody/proteins/waterbridges.py +++ b/prody/proteins/waterbridges.py @@ -1194,6 +1194,7 @@ def filterStructuresWithoutWater(structures, min_water=0, filenames=None): have_filenames = len(filenames)>0 new_structures = [] + numStructures = len(structures) for i, struct in enumerate(reversed(structures)): title = struct.getTitle() waters = struct.select('water and name O') @@ -1201,16 +1202,16 @@ def filterStructuresWithoutWater(structures, min_water=0, filenames=None): if waters == None: LOGGER.warn(title+" doesn't contain water molecules") if have_filenames: - filenames.pop(-i) + filenames.pop(numStructures-i-1) continue numWaters = waters.numAtoms() if numWaters < min_water: LOGGER.warn(title+" doesn't contain enough water molecules ({0})".format(numWaters)) if have_filenames: - filenames.pop(-i) + filenames.pop(numStructures-i-1) continue new_structures.append(struct) - return new_structures + return list(reversed(new_structures)) From 5c1c58983d936053bc38087b4b42d39bf8e9932e Mon Sep 17 00:00:00 2001 From: James Krieger Date: Fri, 7 Jun 2024 16:13:41 +0200 Subject: [PATCH 7/8] fix findClusterCenters for None within distC of center --- prody/proteins/waterbridges.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/prody/proteins/waterbridges.py b/prody/proteins/waterbridges.py index 53e271320..494beb28f 100644 --- a/prody/proteins/waterbridges.py +++ b/prody/proteins/waterbridges.py @@ -1124,8 +1124,8 @@ def findClusterCenters(file_pattern, **kwargs): removeCoords = [] for ii in range(len(coords_all)): sel = coords_all.select('water within '+str(distC)+' of center', - 
center=coords_all.getCoords()[ii]) - if len(sel) <= int(numC): + center=coords_all.getCoords()[ii]) + if sel is not None and len(sel) <= int(numC): removeResid.append(coords_all.getResnums()[ii]) removeCoords.append(list(coords_all.getCoords()[ii])) From e229d71cb1121e2f2480241f64d1ff2a00af3fb7 Mon Sep 17 00:00:00 2001 From: James Krieger Date: Fri, 7 Jun 2024 16:27:48 +0200 Subject: [PATCH 8/8] remove bacterial PF00497 from tests --- prody/tests/database/test_pfam.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/prody/tests/database/test_pfam.py b/prody/tests/database/test_pfam.py index f041cb76b..385d8b372 100644 --- a/prody/tests/database/test_pfam.py +++ b/prody/tests/database/test_pfam.py @@ -34,7 +34,7 @@ def testUniprotAccMulti(self): 'searchPfam failed to return a dict instance') self.assertEqual(sorted(list(a.keys())), - ['PF00060', 'PF00497', 'PF01094', 'PF10613'], + ['PF00060', 'PF01094', 'PF10613'], 'searchPfam failed to return the right domain family IDs') def testPdbIdChMulti(self): @@ -46,7 +46,7 @@ def testPdbIdChMulti(self): self.assertIsInstance(a, dict, 'searchPfam failed to return a dict instance') - self.assertEqual(sorted(list(a.keys())), ['PF00060', 'PF00497', 'PF01094', 'PF10613'], + self.assertEqual(sorted(list(a.keys())), ['PF00060', 'PF01094', 'PF10613'], 'searchPfam failed to return the right domain family IDs for AMPAR') def testPdbIdChSingle(self):