Skip to content

Commit

Permalink
Merge pull request prody#1903 from jamesmkrieger/streamline_fix_filter
Browse files Browse the repository at this point in the history
Streamline fixer and water filter for multiple structures
  • Loading branch information
karolamik13 authored Jun 7, 2024
2 parents 3078bef + e229d71 commit e189826
Show file tree
Hide file tree
Showing 3 changed files with 163 additions and 17 deletions.
104 changes: 92 additions & 12 deletions prody/proteins/fixer.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,16 @@
__email__ = ['[email protected]', '[email protected]']

from prody import LOGGER
from numbers import Integral, Number

__all__ = ['addMissingAtoms']
__all__ = ['addMissingAtoms', 'fixStructuresMissingAtoms']

def addMissingAtoms(infile, method='openbabel', pH=7.0, outfile=None, **kwargs):
"""Function will add hydrogens to the protein and ligand structure using Openbabel [NO11]_
or PDBFixer with OpenMM.
"""This function will add hydrogens to the protein and ligand structure using Openbabel [NO11]_
or PDBFixer with OpenMM.
There are also options whether to *model_residues* (default False), *remove_heterogens*
(default False), *keep_waters* (default True), *overwrite* (default False).
:arg infile: PDB file name
:type infile: str
Expand Down Expand Up @@ -53,11 +57,39 @@ def addMissingAtoms(infile, method='openbabel', pH=7.0, outfile=None, **kwargs):
model_residues = kwargs.get("model_residues", False)
remove_heterogens = kwargs.get("remove_heterogens", False)
keep_water = kwargs.get("keep_water", True)
overwrite = kwargs.get("overwrite", False)

import os

if not isinstance(model_residues, bool):
raise TypeError('model_residues should be True or False')

if not isinstance(remove_heterogens, bool):
raise TypeError('remove_heterogens should be True or False')

if not isinstance(keep_water, bool):
raise TypeError('keep_water should be True or False')

if not isinstance(overwrite, bool):
raise TypeError('overwrite should be True or False')

if not isinstance(infile, str):
raise TypeError('infile should be a string pointing to a file')

if not os.path.exists(infile):
raise ValueError('infile {0} does not exist'.format(infile))

if not isinstance(pH, Number):
raise TypeError('pH should be a number')

if outfile == None:
outfile = os.path.join(os.path.split(infile)[0], "addH_" + os.path.split(infile)[1])
outfile = os.path.join(os.path.split(infile)[0],
"addH_" + os.path.split(infile)[1])

if os.path.exists(outfile) and not overwrite:
LOGGER.warn('outfile {0} already exists, so returning it. \
Set overwrite=True to overwrite it'.format(outfile))
return outfile

if outfile == infile:
raise ValueError('outfile cannot be the same as infile')
Expand All @@ -70,17 +102,18 @@ def addMissingAtoms(infile, method='openbabel', pH=7.0, outfile=None, **kwargs):
raise ValueError('Openbabel cannot handle cif files')

try:
#import openbabel
from openbabel import openbabel
obconversion = openbabel.OBConversion()
obconversion.SetInFormat("pdb")
mol = openbabel.OBMol()
obconversion.ReadFile(mol, infile)
mol.AddHydrogens()
obconversion.WriteFile(mol, outfile)
LOGGER.info("Hydrogens were added to the structure. Structure {0} is saved in the local directry.".format(outfile))
except ImportError:
raise ImportError("Install Openbabel to add hydrogens to the structure or use PDBFixer/OpenMM.")

obconversion = openbabel.OBConversion()
obconversion.SetInFormat("pdb")
mol = openbabel.OBMol()
obconversion.ReadFile(mol, infile)
mol.AddHydrogens()
obconversion.WriteFile(mol, outfile)
LOGGER.info("Hydrogens were added to the structure. Structure {0} is saved in the local directry.".format(outfile))


elif method == 'pdbfixer':
try:
Expand Down Expand Up @@ -115,3 +148,50 @@ def addMissingAtoms(infile, method='openbabel', pH=7.0, outfile=None, **kwargs):
return outfile


def fixStructuresMissingAtoms(infiles, method='openbabel', pH=7.0, outfiles=None, **kwargs):
    """Run :func:`addMissingAtoms` on every file in *infiles* and collect
    the results.

    Each input file is passed to :func:`addMissingAtoms` together with
    *method*, *pH*, the matching entry of *outfiles* and any extra keyword
    arguments (for example *model_residues*, *remove_heterogens*,
    *keep_water* and *overwrite*).

    :arg infiles: a list of PDB file names
    :type infiles: list

    :arg method: Name of the program used to fix the structures.
        Two alternative options are available: 'openbabel' and 'pdbfixer'.
        For either option additional software needs to be installed:
        'openbabel': OpenBabel
        'pdbfixer': PDBFixer and OpenMM
        default is 'openbabel'
    :type method: str

    :arg pH: pH value applied only for PDBFixer.
    :type pH: int, float

    :arg outfiles: output file names, one per entry of *infiles*.
        Default is **None**, in which case **None** is passed on for every
        file and :func:`addMissingAtoms` derives a name with the 'addH_'
        prefix in the directory of the input file.
    :type outfiles: list

    :returns: list with the value returned by :func:`addMissingAtoms`
        for each input file, in the order of *infiles*

    :raises TypeError: if *infiles* is not a list, or *outfiles* is neither
        **None** nor a list
    :raises ValueError: if *outfiles* and *infiles* differ in length

    Installation of Openbabel:

    conda install -c conda-forge openbabel

    Find more information here: https://anaconda.org/conda-forge/openbabel
                                https://github.com/openmm/pdbfixer

    .. [NO11] O'Boyle, N. M., Banck M., James C. A., Morley C., Vandermeersch T., Hutchison G. R.
        Open Babel: An open chemical toolbox *Journal of cheminformatics* **2011** 3:1-14. """

    if not isinstance(infiles, list):
        raise TypeError('infiles should be a list')

    if outfiles is None:
        # One placeholder per input so the two lists can be paired below.
        outfiles = [None] * len(infiles)
    elif not isinstance(outfiles, list):
        raise TypeError('outfiles should be None or a list')

    if len(outfiles) != len(infiles):
        raise ValueError('outfiles should have the same length as infiles')

    return [addMissingAtoms(source, method, pH, target, **kwargs)
            for source, target in zip(infiles, outfiles)]
72 changes: 69 additions & 3 deletions prody/proteins/waterbridges.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
__email__ = ['[email protected]', '[email protected]']

import numpy as np
import os

from itertools import combinations
from collections import deque
Expand All @@ -28,7 +29,8 @@
'calcWaterBridgesStatistics', 'getWaterBridgeStatInfo', 'calcWaterBridgeMatrix', 'showWaterBridgeMatrix',
'calcBridgingResiduesHistogram', 'calcWaterBridgesDistribution',
'savePDBWaterBridges', 'savePDBWaterBridgesTrajectory',
'saveWaterBridges', 'parseWaterBridges', 'findClusterCenters']
'saveWaterBridges', 'parseWaterBridges', 'findClusterCenters',
'filterStructuresWithoutWater']


class ResType(Enum):
Expand Down Expand Up @@ -1122,8 +1124,8 @@ def findClusterCenters(file_pattern, **kwargs):
removeCoords = []
for ii in range(len(coords_all)):
sel = coords_all.select('water within '+str(distC)+' of center',
center=coords_all.getCoords()[ii])
if len(sel) <= int(numC):
center=coords_all.getCoords()[ii])
if sel is not None and len(sel) <= int(numC):
removeResid.append(coords_all.getResnums()[ii])
removeCoords.append(list(coords_all.getCoords()[ii]))

Expand All @@ -1149,3 +1151,67 @@ def findClusterCenters(file_pattern, **kwargs):
filename = 'clusters.pdb'
writePDB(filename, selectedWaters)
LOGGER.info("Results are saved in {0}.".format(filename))

def filterStructuresWithoutWater(structures, min_water=0, filenames=None):
    """Filter out structures from *structures* that have no water
    or fewer water molecules than *min_water*.

    Water molecules are counted as the atoms matched by the selection
    ``'water and name O'``. A structure for which that selection returns
    **None** is always dropped, even when *min_water* is 0.

    :arg structures: list of :class:`.Atomic` structures to be filtered
    :type structures: list

    :arg min_water: minimum number of water O atoms,
        default is 0
    :type min_water: int

    :arg filenames: an optional list of filenames to filter too.
        This is an output argument: when given, it must have one existing
        file name per structure, and the entries belonging to removed
        structures are deleted from the list in place.
    :type filenames: list

    :returns: list of the structures that passed the filter, in the same
        order as in *structures*

    :raises TypeError: if *structures* is not a list, *min_water* is not an
        integer, *filenames* is neither **None** nor a list, or *filenames*
        is non-empty with a length different from *structures*
    :raises ValueError: if an element of *structures* is not an
        :class:`.Atomic`, an element of *filenames* is not a string, or a
        file name does not exist
    """

    if not isinstance(structures, list):
        raise TypeError('structures should be a list')

    # Builtin all() is used instead of np.alltrue, which was removed in NumPy 2.0.
    if not all(isinstance(struct, Atomic) for struct in structures):
        raise ValueError('elements of structures should be Atomic objects')

    # NumPy integer scalars (e.g. from array arithmetic) are accepted too.
    if not isinstance(min_water, (int, np.integer)):
        raise TypeError('min_water should be an integer')

    if filenames is None:
        filenames = []

    if not isinstance(filenames, list):
        raise TypeError('filenames should be None or a list')

    # TypeError is kept here (rather than ValueError) for backward compatibility.
    if len(filenames) not in (0, len(structures)):
        raise TypeError('filenames should have the same length as structures')

    if not all(isinstance(filename, str) for filename in filenames):
        raise ValueError('elements of filenames should be strings')

    if not all(os.path.exists(filename) for filename in filenames):
        raise ValueError('at least one of the filenames does not exist')

    have_filenames = len(filenames) > 0

    new_structures = []
    kept_filenames = []
    for i, struct in enumerate(structures):
        title = struct.getTitle()
        waters = struct.select('water and name O')

        if waters is None:
            LOGGER.warn(title+" doesn't contain water molecules")
            continue

        numWaters = waters.numAtoms()
        if numWaters < min_water:
            LOGGER.warn(title+" doesn't contain enough water molecules ({0})".format(numWaters))
            continue

        new_structures.append(struct)
        if have_filenames:
            kept_filenames.append(filenames[i])

    if have_filenames:
        # Slice assignment updates the caller's list object in place, and
        # only after every structure has been processed successfully.
        filenames[:] = kept_filenames

    return new_structures
4 changes: 2 additions & 2 deletions prody/tests/database/test_pfam.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def testUniprotAccMulti(self):
'searchPfam failed to return a dict instance')

self.assertEqual(sorted(list(a.keys())),
['PF00060', 'PF00497', 'PF01094', 'PF10613'],
['PF00060', 'PF01094', 'PF10613'],
'searchPfam failed to return the right domain family IDs')

def testPdbIdChMulti(self):
Expand All @@ -46,7 +46,7 @@ def testPdbIdChMulti(self):
self.assertIsInstance(a, dict,
'searchPfam failed to return a dict instance')

self.assertEqual(sorted(list(a.keys())), ['PF00060', 'PF00497', 'PF01094', 'PF10613'],
self.assertEqual(sorted(list(a.keys())), ['PF00060', 'PF01094', 'PF10613'],
'searchPfam failed to return the right domain family IDs for AMPAR')

def testPdbIdChSingle(self):
Expand Down

0 comments on commit e189826

Please sign in to comment.