From 92c971a0ff1acd96ae428ce600e6c9476e741253 Mon Sep 17 00:00:00 2001 From: Zhuoran Qiao Date: Tue, 26 Apr 2022 00:11:44 -0700 Subject: [PATCH 1/5] fixing the bond matching algorithm --- prody/atomic/atomgroup.py | 22 +++---- prody/proteins/pdbfile.py | 128 ++++++++++++++++++++++---------------- 2 files changed, 84 insertions(+), 66 deletions(-) diff --git a/prody/atomic/atomgroup.py b/prody/atomic/atomgroup.py index 2763d5862..3f39f06f6 100644 --- a/prody/atomic/atomgroup.py +++ b/prody/atomic/atomgroup.py @@ -1095,8 +1095,8 @@ def setBonds(self, bonds): index 1"``. See :mod:`.select` module documentation for details. Also, a data array with number of bonds will be generated and stored with label *numbonds*. This can be used in atom selections, e.g. - ``'numbonds 0'`` can be used to select ions in a system. If *bonds* - is empty or **None**, then all bonds will be removed for this + ``'numbonds 0'`` can be used to select ions in a system. If *bonds* + is empty or **None**, then all bonds will be removed for this :class:`.AtomGroup`. """ if bonds is None or len(bonds) == 0: @@ -1126,7 +1126,7 @@ def setBonds(self, bonds): self._fragments = None def numBonds(self): - """Returns number of bonds. Use :meth:`setBonds` or + """Returns number of bonds. Use :meth:`setBonds` or :meth:`inferBonds` for setting bonds.""" if self._bonds is not None: @@ -1134,7 +1134,7 @@ def numBonds(self): return 0 def getBonds(self): - """Returns bonds. Use :meth:`setBonds` or + """Returns bonds. Use :meth:`setBonds` or :meth:`inferBonds` for setting bonds.""" if self._bonds is not None: @@ -1171,7 +1171,7 @@ def iterBonds(self): yield Bond(self, bond, acsi) def _iterBonds(self): - """Yield pairs of bonded atom indices. Use :meth:`setBonds` + """Yield pairs of bonded atom indices. Use :meth:`setBonds` or `inferBonds` for setting bonds.""" if self._bonds is not None: @@ -1366,8 +1366,8 @@ def setDonors(self, donors): index 1"``. See :mod:`.select` module documentation for details. Also, a data array with number of donors will be generated and stored with label *numdonors*. This can be used in atom selections, e.g. - ``'numdonors 0'`` can be used to select ions in a system. If *donors* - is empty or **None**, then all donors will be removed for this + ``'numdonors 0'`` can be used to select ions in a system. If *donors* + is empty or **None**, then all donors will be removed for this :class:`.AtomGroup`. """ if donors is None or len(donors) == 0: @@ -1432,8 +1432,8 @@ def setAcceptors(self, acceptors): index 1"``. See :mod:`.select` module documentation for details. Also, a data array with number of acceptors will be generated and stored with label *numacceptors*. This can be used in atom selections, e.g. - ``'numacceptors 0'`` can be used to select ions in a system. If *acceptors* - is empty or **None**, then all acceptors will be removed for this + ``'numacceptors 0'`` can be used to select ions in a system. If *acceptors* + is empty or **None**, then all acceptors will be removed for this :class:`.AtomGroup`. """ if acceptors is None or len(acceptors) == 0: @@ -1498,8 +1498,8 @@ def setNBExclusions(self, nbexclusions): index 1"``. See :mod:`.select` module documentation for details. Also, a data array with number of nbexclusions will be generated and stored with label *numnbexclusions*. This can be used in atom selections, e.g. - ``'numnbexclusions 0'`` can be used to select ions in a system. If *nbexclusions* - is empty or **None**, then all nbexclusions will be removed for this + ``'numnbexclusions 0'`` can be used to select ions in a system. If *nbexclusions* + is empty or **None**, then all nbexclusions will be removed for this :class:`.AtomGroup`. """ if nbexclusions is None or len(nbexclusions) == 0: diff --git a/prody/proteins/pdbfile.py b/prody/proteins/pdbfile.py index da5eeb553..a3bf68c35 100644 --- a/prody/proteins/pdbfile.py +++ b/prody/proteins/pdbfile.py @@ -26,7 +26,7 @@ 'writePDBStream', 'writePDB', 'writeChainsList', 'writePQR', 'writePQRStream'] -MAX_N_ATOM = 99999 +MAX_N_ATOM = 99999 MAX_N_RES = 9999 class PDBParseError(Exception): @@ -71,9 +71,9 @@ class PDBParseError(Exception): :arg biomol: if **True**, biomolecules are obtained by transforming the coordinates using information from header section will be returned. - This option uses :func:`.buildBiomolecules` and as noted there, - atoms in biomolecules are ordered according to the original chain - IDs. Chains may have the same chain ID, in which case they are given + This option uses :func:`.buildBiomolecules` and as noted there, + atoms in biomolecules are ordered according to the original chain + IDs. Chains may have the same chain ID, in which case they are given different segment names. Default is **False** :type biomol: bool @@ -99,15 +99,15 @@ def parsePDB(*pdb, **kwargs): :arg pdb: one PDB identifier or filename, or a list of them. If needed, PDB files are downloaded using :func:`.fetchPDB()` function. - + You can also provide arguments that you would like passed on to fetchPDB(). :arg extend_biomol: whether to extend the list of results with a list - rather than appending, which can create a mixed list, + rather than appending, which can create a mixed list, especially when biomol=True. Default value is False to reproduce previous behaviour. This value is ignored when result is not a list (header=True or model=0). - :type extend_biomol: bool + :type extend_biomol: bool """ extend_biomol = kwargs.pop('extend_biomol', False) @@ -119,7 +119,7 @@ def parsePDB(*pdb, **kwargs): if isListLike(pdb[0]) or isinstance(pdb[0], dict): pdb = pdb[0] n_pdb = len(pdb) - + if n_pdb == 1: return _parsePDB(pdb[0], **kwargs) else: @@ -139,7 +139,7 @@ def parsePDB(*pdb, **kwargs): for key in lstkwargs: kwargs[key] = lstkwargs[key][i] c = kwargs.get('chain','') - LOGGER.update(i, 'Retrieving {0}...'.format(p+c), + LOGGER.update(i, 'Retrieving {0}...'.format(p+c), label='_prody_parsePDB') result = _parsePDB(p, **kwargs) if not isinstance(result, tuple): @@ -151,7 +151,7 @@ def parsePDB(*pdb, **kwargs): results = list(zip(*results)) LOGGER.finish() - + for i in reversed(range(len(results))): if all(j is None for j in results[i]): results.pop(i) @@ -216,7 +216,7 @@ def _parsePDB(pdb, **kwargs): try: LOGGER.info("Trying to parse EMD file instead") return parseEMD(pdb+chain, **kwargs) - except: + except: raise IOError('PDB file for {0} could not be downloaded.' .format(pdb)) pdb = filename @@ -246,7 +246,7 @@ def _parsePDB(pdb, **kwargs): except ValueError: LOGGER.warn("Could not parse anything so returning None") return None - except: + except: raise IOError('PDB file for {0} could not be downloaded.' .format(pdb)) @@ -257,8 +257,8 @@ def parsePDBStream(stream, **kwargs): parsed from a stream of PDB lines. :arg stream: Anything that implements the method ``readlines`` - (e.g. :class:`file`, buffer, stdin)""" - + (e.g. :class:`file`, buffer, stdin)""" + model = kwargs.get('model') header = kwargs.get('header', False) assert isinstance(header, bool), 'header must be a boolean' @@ -326,6 +326,7 @@ def parsePDBStream(stream, **kwargs): try: ag.setBonds(bonds) except ValueError: + raise LOGGER.warn('Bonds read from CONECT records do not apply to subset so were not added') if ag.numAtoms() > 0: LOGGER.report('{0} atoms and {1} coordinate set(s) were ' @@ -489,6 +490,7 @@ def _parsePDBLines(atomgroup, lines, split, model, chain, subset, start = split stop = len(lines) nmodel = 0 + bonds_serial = [] # if a specific model is requested, skip lines until that one if isPDB and model is not None and model != 1: for i in range(split, len(lines)): @@ -537,7 +539,7 @@ def _parsePDBLines(atomgroup, lines, split, model, chain, subset, startswith = line[0:6].strip() else: startswith = fields[0] - + if startswith == 'ATOM' or startswith == 'HETATM': if isPDB: atomname = line[12:16].strip() @@ -560,7 +562,7 @@ def _parsePDBLines(atomgroup, lines, split, model, chain, subset, if not chid in chain: i += 1 continue - + if isPDB: alt = line[16] if alt not in which_altlocs and which_altlocs != 'all': @@ -620,14 +622,14 @@ def _parsePDBLines(atomgroup, lines, split, model, chain, subset, LOGGER.warn('failed to parse serial number in line {0}. Assigning it as 1.' .format(i)) serials[acount] = 1 - + altlocs[acount] = alt atomnames[acount] = atomname resnames[acount] = resname chainids[acount] = chid if isPDB: resnum_str = line[22:26] - icode = line[26] + icode = line[26] if dec: try: resnum = int(resnum_str) @@ -638,7 +640,7 @@ def _parsePDBLines(atomgroup, lines, split, model, chain, subset, if not warned_5_digit: LOGGER.warn('parsed 5 digit residue number including numeric insertion code') warned_5_digit = True - resnum = int(str(resnum) + icode) + resnum = int(str(resnum) + icode) else: icodes[acount] = icode @@ -654,7 +656,7 @@ def _parsePDBLines(atomgroup, lines, split, model, chain, subset, else: # lower case is found in hexadecimal PDB files resnum = int(resnum_str, 16) - + except ValueError: if acount > 0: LOGGER.warn('failed to parse residue number in line {0}. Assigning it by incrementing.' @@ -758,19 +760,23 @@ def _parsePDBLines(atomgroup, lines, split, model, chain, subset, if bonds is not None: atom_serial = line[6:11] bonded1_serial = line[11:16] - bonds.append([int(atom_serial)-1-num_ters, int(bonded1_serial)-1-num_ters]) - + # bonds.append([int(atom_serial)-1-num_ters, int(bonded1_serial)-1-num_ters]) + bonds_serial.append([int(atom_serial), int(bonded1_serial)]) + bonded2_serial = line[16:21] if len(bonded2_serial.strip()): - bonds.append([int(atom_serial)-1-num_ters, int(bonded2_serial)-1-num_ters]) + # bonds.append([int(atom_serial)-1-num_ters, int(bonded2_serial)-1-num_ters]) + bonds_serial.append([int(atom_serial), int(bonded2_serial)]) bonded3_serial = line[21:26] if len(bonded3_serial.strip()): - bonds.append([int(atom_serial)-1-num_ters, int(bonded3_serial)-1-num_ters]) - - bonded4_serial = line[27:31] + # bonds.append([int(atom_serial)-1-num_ters, int(bonded3_serial)-1-num_ters]) + bonds_serial.append([int(atom_serial), int(bonded3_serial)]) + + bonded4_serial = line[26:31] # fixed typo if len(bonded4_serial.strip()): - bonds.append([int(atom_serial)-1-num_ters, int(bonded4_serial)-1-num_ters]) + # bonds.append([int(atom_serial)-1-num_ters, int(bonded4_serial)-1-num_ters]) + bonds_serial.append([int(atom_serial), int(bonded4_serial)]) elif not onlycoords and (startswith == 'TER ' or startswith.strip() == 'TER'): @@ -977,6 +983,18 @@ def _parsePDBLines(atomgroup, lines, split, model, chain, subset, charges.resize(acount, refcheck=False) atomgroup.setCharges(charges) + # hard-coded patch + serial_to_id = {} + for aidx in range(len(serials)): + serial_to_id[int(serials[aidx])] = aidx + # rematch the bond-atom serial numbers to atom ids + for bond in bonds_serial: + try: + bonds.append([serial_to_id[bond[0]], serial_to_id[bond[1]]]) + except KeyError: + LOGGER.warn(f"Bond connecting atom serial numbers {bond[0]}" + f" and {bond[1]} contains atoms not included in the model") + if altloc and altloc_torf: _evalAltlocs(atomgroup, altloc, chainids, resnums, resnames, atomnames) @@ -1045,7 +1063,7 @@ def _evalAltlocs(atomgroup, altloc, chainids, resnums, resnames, atomnames): HELIXLINE = ('HELIX {serNum:3d} {helixID:>3s} ' '{initResName:<3s} {initChainID:1s} {initSeqNum:4d}{initICode:1s} ' '{endResName:<3s} {endChainID:1s} {endSeqNum:4d}{endICode:1s}' - '{helixClass:2d} {length:5d}\n') + '{helixClass:2d} {length:5d}\n') SHEETLINE = ('SHEET {strand:3d} {sheetID:>3s}{numStrands:2d} ' '{initResName:3s} {initChainID:1s}{initSeqNum:4d}{initICode:1s} ' @@ -1113,7 +1131,7 @@ def _evalAltlocs(atomgroup, altloc, chainids, resnums, resnames, atomnames): :arg hybrid36: whether to use hybrid36 format for atoms with serial greater than 99999. Hexadecimal is used otherwise. Default is False - :type hybrid36: bool + :type hybrid36: bool """ def writeChainsList(chains, filename): @@ -1122,7 +1140,7 @@ def writeChainsList(chains, filename): :arg chains: a list of :class:`.Chain` objects :type chains: list - + :arg filename: the name of the file to be written :type filename: str """ @@ -1140,10 +1158,10 @@ def parseChainsList(filename): :arg filename: the name of the file to be read :type filename: str - Returns: lists containing an :class:'.AtomGroup' for each PDB, + Returns: lists containing an :class:'.AtomGroup' for each PDB, the headers for those PDBs, and the requested :class:`.Chain` objects """ - + fi = open(filename,'r') lines = fi.readlines() fi.close() @@ -1179,7 +1197,7 @@ def writePDBStream(stream, atoms, csets=None, **kwargs): :arg stream: anything that implements a :meth:`write` method (e.g. file, buffer, stdout) - + :arg renumber: whether to renumber atoms with serial indices Default is **True** :type renumber: bool @@ -1343,13 +1361,13 @@ def writePDBStream(stream, atoms, csets=None, **kwargs): helix_icodes = icodes[torf] L = helix_resnums[-1] - helix_resnums[0] + 1 - stream.write(HELIXLINE.format(serNum=i, helixID=helix_secids[0], - initResName=helix_resnames[0], initChainID=helix_chainids[0], + stream.write(HELIXLINE.format(serNum=i, helixID=helix_secids[0], + initResName=helix_resnames[0], initChainID=helix_chainids[0], initSeqNum=helix_resnums[0], initICode=helix_icodes[0], - endResName=helix_resnames[-1], endChainID=helix_chainids[-1], + endResName=helix_resnames[-1], endChainID=helix_chainids[-1], endSeqNum=helix_resnums[-1], endICode=helix_icodes[-1], helixClass=helix_secclasses[0], length=L)) - + # write strands torf_all_sheets = isSheet(secstrs) sheet_secids = secids[torf_all_sheets] @@ -1368,9 +1386,9 @@ def writePDBStream(stream, atoms, csets=None, **kwargs): strand_icodes = icodes[torf_strand] stream.write(SHEETLINE.format(strand=i, sheetID=sheet_id, numStrands=numStrands, - initResName=strand_resnames[0], initChainID=strand_chainids[0], + initResName=strand_resnames[0], initChainID=strand_chainids[0], initSeqNum=strand_resnums[0], initICode=strand_icodes[0], - endResName=strand_resnames[-1], endChainID=strand_chainids[-1], + endResName=strand_resnames[-1], endChainID=strand_chainids[-1], endSeqNum=strand_resnums[-1], endICode=strand_icodes[-1], sense=strand_secclasses[0])) pass @@ -1393,7 +1411,7 @@ def writePDBStream(stream, atoms, csets=None, **kwargs): warned_hybrid36 = False warned_5_digit = False - + for i, xyz in enumerate(coords): if hybrid36: pdbline = PDBLINE_H36 @@ -1421,7 +1439,7 @@ def writePDBStream(stream, atoms, csets=None, **kwargs): pdbline = PDBLINE_GE100K_GE10K anisouline = ANISOULINE_GE100K_GE10K LOGGER.warn('Resnums are exceeding 9999 and hexadecimal format is being used for indices and resnums') - + elif reached_max_n_res and not reached_max_n_atom and (i == MAX_N_ATOM or serials[i] > MAX_N_ATOM): reached_max_n_atom = True pdbline = PDBLINE_GE100K_GE10K @@ -1439,7 +1457,7 @@ def writePDBStream(stream, atoms, csets=None, **kwargs): if len(str(resnum)) == 5: if icodes[i] == '': icodes[i] = str(resnum)[4] - + if not warned_5_digit: LOGGER.warn('Storing 5-digit resnums using insertion codes') warned_5_digit = True @@ -1447,21 +1465,21 @@ def writePDBStream(stream, atoms, csets=None, **kwargs): LOGGER.warn('Truncating 5-digit resnum as insertion code is busy.') resnum = int(str(resnum)[:4]) - + elif len(str(resnum)) > 5: if not warned_5_digit: LOGGER.warn('Truncating {0}-digit resnum as too long to be ' 'supported by insertion code.'.format(len(str(resnum)))) warned_5_digit = True - + resnum = int(str(resnum)[:4]) else: final_resnum = '%4x' % int(resnum) - + if len(str(final_resnum)) == 5: if icodes[i] == '': icodes[i] = str(final_resnum)[4] - + if not warned_5_digit: LOGGER.warn('Storing 5-digit hex resnums using insertion codes') warned_5_digit = True @@ -1469,16 +1487,16 @@ def writePDBStream(stream, atoms, csets=None, **kwargs): LOGGER.warn('Truncating 5-digit hex resnum as insertion code is busy.') resnum = int(str(final_resnum)[:4], 16) - + elif len(str(final_resnum)) > 5: if not warned_5_digit: LOGGER.warn('Truncating {0}-digit hex resnum ({1}) as too long to be ' - 'supported by insertion code.'.format(len(str(final_resnum)), + 'supported by insertion code.'.format(len(str(final_resnum)), final_resnum)) warned_5_digit = True - + resnum = int(str(final_resnum)[:4], 16) - + write(pdbline % (hetero[i], serial, atomnames[i], altlocs[i], @@ -1511,7 +1529,7 @@ def writePDBStream(stream, atoms, csets=None, **kwargs): def writePDB(filename, atoms, csets=None, autoext=True, **kwargs): """Write *atoms* in PDB format to a file with name *filename* and return *filename*. If *filename* ends with :file:`.gz`, a compressed file will - be written. + be written. :arg renumber: whether to renumber atoms with serial indices Default is **True** @@ -1568,12 +1586,12 @@ def writePQRStream(stream, atoms, **kwargs): length = ss_end - ss_start + 1 entries = [init.getSecindex(), init.getSecid(), - init.getResname(), init.getChid(), + init.getResname(), init.getChid(), init.getResnum(), init.getIcode(), end.getResname(), end.getChid(), end.getResnum(), end.getIcode(), init.getSecclass()] - + if ssa[ss_end] == 'H': helix.append(["HELIX "] + entries + ['', length]) @@ -1607,7 +1625,7 @@ def writePQRStream(stream, atoms, **kwargs): num_strands = item1[1] for item2 in sorted_sheet[i-num_strands+1:]: - item2.append(num_strands) + item2.append(num_strands) format_sheet = ('{0:6s} {1:3d} {2:3s}{12:2d} ' + '{3:3s} {4:1s}{5:4d}{6:1s}' + @@ -1651,7 +1669,7 @@ def writePQRStream(stream, atoms, **kwargs): '{8:8.3f} {9:8.3f} {10:8.3f}' + '{11:8.4f} {12:7.4f}\n').format coords = atoms._getCoords() - + for i, xyz in enumerate(coords): write(format(hetero[i], i+1, atomnames[i], altlocs[i], resnames[i], chainids[i], int(resnums[i]), From 56055ab8114e0452dfe7d6de9fd1415df98957ab Mon Sep 17 00:00:00 2001 From: Zhuoran Qiao Date: Tue, 26 Apr 2022 00:15:43 -0700 Subject: [PATCH 2/5] switch back to warning --- prody/proteins/pdbfile.py | 1 - 1 file changed, 1 deletion(-) diff --git a/prody/proteins/pdbfile.py b/prody/proteins/pdbfile.py index a3bf68c35..140739b89 100644 --- a/prody/proteins/pdbfile.py +++ b/prody/proteins/pdbfile.py @@ -326,7 +326,6 @@ def parsePDBStream(stream, **kwargs): try: ag.setBonds(bonds) except ValueError: - raise LOGGER.warn('Bonds read from CONECT records do not apply to subset so were not added') if ag.numAtoms() > 0: LOGGER.report('{0} atoms and {1} coordinate set(s) were ' From f53d77c877e796a4dbbcef253dcc52cc6597a519 Mon Sep 17 00:00:00 2001 From: Zhuoran Qiao Date: Tue, 26 Apr 2022 09:36:49 -0700 Subject: [PATCH 3/5] restore format changes --- prody/atomic/atomgroup.py | 22 +++++++++++----------- prody/proteins/pdbfile.py | 36 ++++++++++++++++++------------------ 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/prody/atomic/atomgroup.py b/prody/atomic/atomgroup.py index 3f39f06f6..2763d5862 100644 --- a/prody/atomic/atomgroup.py +++ b/prody/atomic/atomgroup.py @@ -1095,8 +1095,8 @@ def setBonds(self, bonds): index 1"``. See :mod:`.select` module documentation for details. Also, a data array with number of bonds will be generated and stored with label *numbonds*. This can be used in atom selections, e.g. - ``'numbonds 0'`` can be used to select ions in a system. If *bonds* - is empty or **None**, then all bonds will be removed for this + ``'numbonds 0'`` can be used to select ions in a system. If *bonds* + is empty or **None**, then all bonds will be removed for this :class:`.AtomGroup`. """ if bonds is None or len(bonds) == 0: @@ -1126,7 +1126,7 @@ def setBonds(self, bonds): self._fragments = None def numBonds(self): - """Returns number of bonds. Use :meth:`setBonds` or + """Returns number of bonds. Use :meth:`setBonds` or :meth:`inferBonds` for setting bonds.""" if self._bonds is not None: @@ -1134,7 +1134,7 @@ def numBonds(self): return 0 def getBonds(self): - """Returns bonds. Use :meth:`setBonds` or + """Returns bonds. Use :meth:`setBonds` or :meth:`inferBonds` for setting bonds.""" if self._bonds is not None: @@ -1171,7 +1171,7 @@ def iterBonds(self): yield Bond(self, bond, acsi) def _iterBonds(self): - """Yield pairs of bonded atom indices. Use :meth:`setBonds` + """Yield pairs of bonded atom indices. Use :meth:`setBonds` or `inferBonds` for setting bonds.""" if self._bonds is not None: @@ -1366,8 +1366,8 @@ def setDonors(self, donors): index 1"``. See :mod:`.select` module documentation for details. Also, a data array with number of donors will be generated and stored with label *numdonors*. This can be used in atom selections, e.g. - ``'numdonors 0'`` can be used to select ions in a system. If *donors* - is empty or **None**, then all donors will be removed for this + ``'numdonors 0'`` can be used to select ions in a system. If *donors* + is empty or **None**, then all donors will be removed for this :class:`.AtomGroup`. """ if donors is None or len(donors) == 0: @@ -1432,8 +1432,8 @@ def setAcceptors(self, acceptors): index 1"``. See :mod:`.select` module documentation for details. Also, a data array with number of acceptors will be generated and stored with label *numacceptors*. This can be used in atom selections, e.g. - ``'numacceptors 0'`` can be used to select ions in a system. If *acceptors* - is empty or **None**, then all acceptors will be removed for this + ``'numacceptors 0'`` can be used to select ions in a system. If *acceptors* + is empty or **None**, then all acceptors will be removed for this :class:`.AtomGroup`. """ if acceptors is None or len(acceptors) == 0: @@ -1498,8 +1498,8 @@ def setNBExclusions(self, nbexclusions): index 1"``. See :mod:`.select` module documentation for details. Also, a data array with number of nbexclusions will be generated and stored with label *numnbexclusions*. This can be used in atom selections, e.g. - ``'numnbexclusions 0'`` can be used to select ions in a system. If *nbexclusions* - is empty or **None**, then all nbexclusions will be removed for this + ``'numnbexclusions 0'`` can be used to select ions in a system. If *nbexclusions* + is empty or **None**, then all nbexclusions will be removed for this :class:`.AtomGroup`. """ if nbexclusions is None or len(nbexclusions) == 0: diff --git a/prody/proteins/pdbfile.py b/prody/proteins/pdbfile.py index 140739b89..1beaf517f 100644 --- a/prody/proteins/pdbfile.py +++ b/prody/proteins/pdbfile.py @@ -26,7 +26,7 @@ 'writePDBStream', 'writePDB', 'writeChainsList', 'writePQR', 'writePQRStream'] -MAX_N_ATOM = 99999 +MAX_N_ATOM = 99999 MAX_N_RES = 9999 class PDBParseError(Exception): @@ -71,9 +71,9 @@ class PDBParseError(Exception): :arg biomol: if **True**, biomolecules are obtained by transforming the coordinates using information from header section will be returned. - This option uses :func:`.buildBiomolecules` and as noted there, - atoms in biomolecules are ordered according to the original chain - IDs. Chains may have the same chain ID, in which case they are given + This option uses :func:`.buildBiomolecules` and as noted there, + atoms in biomolecules are ordered according to the original chain + IDs. Chains may have the same chain ID, in which case they are given different segment names. Default is **False** :type biomol: bool @@ -99,15 +99,15 @@ def parsePDB(*pdb, **kwargs): :arg pdb: one PDB identifier or filename, or a list of them. If needed, PDB files are downloaded using :func:`.fetchPDB()` function. - + You can also provide arguments that you would like passed on to fetchPDB(). :arg extend_biomol: whether to extend the list of results with a list - rather than appending, which can create a mixed list, + rather than appending, which can create a mixed list, especially when biomol=True. Default value is False to reproduce previous behaviour. This value is ignored when result is not a list (header=True or model=0). - :type extend_biomol: bool + :type extend_biomol: bool """ extend_biomol = kwargs.pop('extend_biomol', False) @@ -139,7 +139,7 @@ def parsePDB(*pdb, **kwargs): for key in lstkwargs: kwargs[key] = lstkwargs[key][i] c = kwargs.get('chain','') - LOGGER.update(i, 'Retrieving {0}...'.format(p+c), + LOGGER.update(i, 'Retrieving {0}...'.format(p+c), label='_prody_parsePDB') result = _parsePDB(p, **kwargs) if not isinstance(result, tuple): @@ -257,7 +257,7 @@ def parsePDBStream(stream, **kwargs): parsed from a stream of PDB lines. :arg stream: Anything that implements the method ``readlines`` - (e.g. :class:`file`, buffer, stdin)""" + (e.g. :class:`file`, buffer, stdin)""" model = kwargs.get('model') header = kwargs.get('header', False) @@ -1130,7 +1130,7 @@ def _evalAltlocs(atomgroup, altloc, chainids, resnums, resnames, atomnames): :arg hybrid36: whether to use hybrid36 format for atoms with serial greater than 99999. Hexadecimal is used otherwise. Default is False - :type hybrid36: bool + :type hybrid36: bool """ def writeChainsList(chains, filename): @@ -1157,7 +1157,7 @@ def parseChainsList(filename): :arg filename: the name of the file to be read :type filename: str - Returns: lists containing an :class:'.AtomGroup' for each PDB, + Returns: lists containing an :class:'.AtomGroup' for each PDB, the headers for those PDBs, and the requested :class:`.Chain` objects """ @@ -1360,10 +1360,10 @@ def writePDBStream(stream, atoms, csets=None, **kwargs): helix_icodes = icodes[torf] L = helix_resnums[-1] - helix_resnums[0] + 1 - stream.write(HELIXLINE.format(serNum=i, helixID=helix_secids[0], - initResName=helix_resnames[0], initChainID=helix_chainids[0], + stream.write(HELIXLINE.format(serNum=i, helixID=helix_secids[0], + initResName=helix_resnames[0], initChainID=helix_chainids[0], initSeqNum=helix_resnums[0], initICode=helix_icodes[0], - endResName=helix_resnames[-1], endChainID=helix_chainids[-1], + endResName=helix_resnames[-1], endChainID=helix_chainids[-1], endSeqNum=helix_resnums[-1], endICode=helix_icodes[-1], helixClass=helix_secclasses[0], length=L)) @@ -1385,9 +1385,9 @@ def writePDBStream(stream, atoms, csets=None, **kwargs): strand_icodes = icodes[torf_strand] stream.write(SHEETLINE.format(strand=i, sheetID=sheet_id, numStrands=numStrands, - initResName=strand_resnames[0], initChainID=strand_chainids[0], + initResName=strand_resnames[0], initChainID=strand_chainids[0], initSeqNum=strand_resnums[0], initICode=strand_icodes[0], - endResName=strand_resnames[-1], endChainID=strand_chainids[-1], + endResName=strand_resnames[-1], endChainID=strand_chainids[-1], endSeqNum=strand_resnums[-1], endICode=strand_icodes[-1], sense=strand_secclasses[0])) pass @@ -1490,7 +1490,7 @@ def writePDBStream(stream, atoms, csets=None, **kwargs): elif len(str(final_resnum)) > 5: if not warned_5_digit: LOGGER.warn('Truncating {0}-digit hex resnum ({1}) as too long to be ' - 'supported by insertion code.'.format(len(str(final_resnum)), + 'supported by insertion code.'.format(len(str(final_resnum)), final_resnum)) warned_5_digit = True @@ -1585,7 +1585,7 @@ def writePQRStream(stream, atoms, **kwargs): length = ss_end - ss_start + 1 entries = [init.getSecindex(), init.getSecid(), - init.getResname(), init.getChid(), + init.getResname(), init.getChid(), init.getResnum(), init.getIcode(), end.getResname(), end.getChid(), end.getResnum(), end.getIcode(), From 37749076659ba878732d351f68ff5cd31e6cecca Mon Sep 17 00:00:00 2001 From: Zhuoran Qiao Date: Thu, 28 Apr 2022 14:49:53 -0700 Subject: [PATCH 4/5] cleanup and log warning for unsupported mmcif bond parsing --- prody/proteins/ciffile.py | 4 ++++ prody/proteins/pdbfile.py | 18 ++++++++---------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/prody/proteins/ciffile.py b/prody/proteins/ciffile.py index 3d583511a..adeb464f3 100644 --- a/prody/proteins/ciffile.py +++ b/prody/proteins/ciffile.py @@ -72,6 +72,10 @@ def parseMMCIF(pdb, **kwargs): """ chain = kwargs.pop('chain', None) title = kwargs.get('title', None) + auto_bonds = SETTINGS.get('auto_bonds') + get_bonds = kwargs.get('bonds', auto_bonds) + if get_bonds: + LOGGER.warn('Parsing struct_conn information from mmCIF is current unsupported and no bond information is added to the results') if not os.path.isfile(pdb): if len(pdb) == 5 and pdb.isalnum(): if chain is None: diff --git a/prody/proteins/pdbfile.py b/prody/proteins/pdbfile.py index 1beaf517f..4003c2cad 100644 --- a/prody/proteins/pdbfile.py +++ b/prody/proteins/pdbfile.py @@ -209,6 +209,9 @@ def _parsePDB(pdb, **kwargs): kwargs['title'] = title filename = fetchPDB(pdb, **kwargs) if filename is None: + # ZQ 04/26: escape from mmcif + raise IOError('PDB file for {0} could not be downloaded.' + .format(pdb)) try: LOGGER.info("Trying to parse mmCIF file instead") return parseMMCIF(pdb+chain, **kwargs) @@ -759,22 +762,18 @@ def _parsePDBLines(atomgroup, lines, split, model, chain, subset, if bonds is not None: atom_serial = line[6:11] bonded1_serial = line[11:16] - # bonds.append([int(atom_serial)-1-num_ters, int(bonded1_serial)-1-num_ters]) bonds_serial.append([int(atom_serial), int(bonded1_serial)]) bonded2_serial = line[16:21] if len(bonded2_serial.strip()): - # bonds.append([int(atom_serial)-1-num_ters, int(bonded2_serial)-1-num_ters]) bonds_serial.append([int(atom_serial), int(bonded2_serial)]) bonded3_serial = line[21:26] if len(bonded3_serial.strip()): - # bonds.append([int(atom_serial)-1-num_ters, int(bonded3_serial)-1-num_ters]) bonds_serial.append([int(atom_serial), int(bonded3_serial)]) bonded4_serial = line[26:31] # fixed typo if len(bonded4_serial.strip()): - # bonds.append([int(atom_serial)-1-num_ters, int(bonded4_serial)-1-num_ters]) bonds_serial.append([int(atom_serial), int(bonded4_serial)]) elif not onlycoords and (startswith == 'TER ' or @@ -982,17 +981,16 @@ def _parsePDBLines(atomgroup, lines, split, model, chain, subset, charges.resize(acount, refcheck=False) atomgroup.setCharges(charges) - # hard-coded patch - serial_to_id = {} - for aidx in range(len(serials)): - serial_to_id[int(serials[aidx])] = aidx # rematch the bond-atom serial numbers to atom ids + serial_to_id = {int(serial): aidx for aidx, serial in enumerate(serials)} for bond in bonds_serial: try: bonds.append([serial_to_id[bond[0]], serial_to_id[bond[1]]]) except KeyError: - LOGGER.warn(f"Bond connecting atom serial numbers {bond[0]}" - f" and {bond[1]} contains atoms not included in the model") + LOGGER.warn("Bond connecting atom serial numbers {0}" + " and {1} contains atoms not included in the model" + .format(bond[0], bond[1]) + ) if altloc and altloc_torf: _evalAltlocs(atomgroup, altloc, chainids, resnums, resnames, atomnames) From ad13c6834b461ecae9334753ef51f35ec3d1f47d Mon Sep 17 00:00:00 2001 From: Zhuoran Qiao Date: Thu, 28 Apr 2022 14:51:58 -0700 Subject: [PATCH 5/5] revert inhouse changes --- prody/proteins/pdbfile.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/prody/proteins/pdbfile.py b/prody/proteins/pdbfile.py index 4003c2cad..d014a21a5 100644 --- a/prody/proteins/pdbfile.py +++ b/prody/proteins/pdbfile.py @@ -209,9 +209,6 @@ def _parsePDB(pdb, **kwargs): kwargs['title'] = title filename = fetchPDB(pdb, **kwargs) if filename is None: - # ZQ 04/26: escape from mmcif - raise IOError('PDB file for {0} could not be downloaded.' - .format(pdb)) try: LOGGER.info("Trying to parse mmCIF file instead") return parseMMCIF(pdb+chain, **kwargs)