Merge pull request #250 from tovrstra/pdb-bonds

Add support for `CONECT` lines in PDB file (load and dump bonds)
theochem · Apr 29, 2021 · d7eccdf · d7eccdf
2 parents a0bd8aa + 54b4127
commit d7eccdf
Show file tree

Hide file tree

Showing 6 changed files with 225 additions and 48 deletions.
diff --git a/iodata/formats/pdb.py b/iodata/formats/pdb.py
@@ -31,7 +31,7 @@
 from ..docstrings import (document_load_one, document_load_many, document_dump_one,
  document_dump_many)
 from ..iodata import IOData
-from ..periodic import sym2num, num2sym
+from ..periodic import sym2num, num2sym, bond2num
 from ..utils import angstrom, LineIterator
 
 
@@ -41,9 +41,7 @@
 PATTERNS = ['*.pdb']
 
 
-@document_load_one("PDB", ['atcoords', 'atnums', 'atffparams', 'extra'], ['title'])
-def load_one(lit: LineIterator) -> dict:
- """Do not edit this docstring. It will be overwritten."""
+def _parse_pdb_atom_line(line):
  # Overview of ATOM records
  # COLUMNS DATA TYPE FIELD DEFINITION
  # -------------------------------------------------------------------------------------
@@ -62,66 +60,137 @@ def load_one(lit: LineIterator) -> dict:
  # 61 - 66 Real(6.2) tempFactor Temperature factor.
  # 77 - 78 LString(2) element Element symbol, right-justified.
  # 79 - 80 LString(2) charge Charge on the atom.
+
+ # Get element symbol from position 77:78 in pdb format
+ words = line[76:78].split()
+ if not words:
+ # If not present, guess it from position 13:16 (atom name)
+ words = line[12:16].split()
+ # assign atomic number
+ symbol = words[0].title()
+ atnum = sym2num.get(symbol, sym2num.get(symbol[0], None))
+ # atom name, residue name, chain id, & residue sequence number
+ attype = line[12:16].strip()
+ restype = line[17:20].strip()
+ chainid = line[21]
+ resnum = int(line[22:26])
+ # add x, y, and z
+ atcoord = [
+ float(line[30:38]) * angstrom,
+ float(line[38:46]) * angstrom,
+ float(line[46:54]) * angstrom,
+ ]
+ # get occupancies & temperature factor
+ occupancy = float(line[54:60])
+ bfactor = float(line[60:66])
+ return atnum, attype, restype, chainid, resnum, atcoord, occupancy, bfactor
+
+
+def _parse_pdb_conect_line(line):
+ # Overview of CONECT records
+ # COLUMNS DATA TYPE FIELD DEFINITION
+ # -------------------------------------------------------------------------
+ # 1 - 6 Record name "CONECT"
+ # 7 - 11 Integer serial Atom serial number
+ # 12 - 16 Integer serial Serial number of bonded atom
+ # 17 - 21 Integer serial Serial number of bonded atom
+ # 22 - 26 Integer serial Serial number of bonded atom
+ # 27 - 31 Integer serial Serial number of bonded atom
+ iatom0 = int(line[7:12]) - 1
+ for ipos in 12, 17, 22, 27:
+ serial_str = line[ipos: ipos + 5].strip()
+ if serial_str != "":
+ iatom1 = int(serial_str) - 1
+ if iatom1 > iatom0:
+ yield iatom0, iatom1
+
+
+@document_load_one("PDB", ['atcoords', 'atnums', 'atffparams', 'extra'], ['title', 'bonds'])
+def load_one(lit: LineIterator) -> dict: # pylint: disable=too-many-branches, too-many-statements
+ """Do not edit this docstring. It will be overwritten."""
+ title_lines = []
+ compnd_lines = []
  atnums = []
  attypes = []
  restypes = []
  chainids = []
  resnums = []
- coords = []
+ atcoords = []
  occupancies = []
  bfactors = []
+ bonds = []
  molecule_found = False
  end_reached = False
- title = "PDB file from IOData"
  while True:
  try:
  line = next(lit)
  except StopIteration:
  break
- # If the PDB file has a title replace it.
- if line.startswith("TITLE") or line.startswith("COMPND"):
- title = line[10:].rstrip()
+ # If the PDB file has a title, replace the default.
+ if line.startswith("TITLE"):
+ title_lines.append(line[10:].strip())
+ if line.startswith("COMPND"):
+ compnd_lines.append(line[10:].strip())
  if line.startswith("ATOM") or line.startswith("HETATM"):
- # get element symbol from position 77:78 in pdb format
- words = line[76:78].split()
- if not words:
- # If not present, guess it from position 13:16 (atom name)
- words = line[12:16].split()
- # assign atomic number
- symbol = words[0].title()
- atnums.append(sym2num.get(symbol, sym2num.get(symbol[0], None)))
- # atom name, residue name, chain id, & residue sequence number
- attypes.append(line[12:16].strip())
- restypes.append(line[17:20].strip())
- chainids.append(line[21])
- resnums.append(int(line[22:26]))
- # add x, y, and z
- coords.append([float(line[30:38]), float(line[38:46]), float(line[46:54])])
- # get occupancies & temperature factor
- occupancies.append(float(line[54:60]))
- bfactors.append(float(line[60:66]))
+ (atnum, attype, restype, chainid, resnum, atcoord, occupancy,
+ bfactor) = _parse_pdb_atom_line(line)
+ atnums.append(atnum)
+ attypes.append(attype)
+ restypes.append(restype)
+ chainids.append(chainid)
+ resnums.append(resnum)
+ atcoords.append(atcoord)
+ occupancies.append(occupancy)
+ bfactors.append(bfactor)
  molecule_found = True
+ if line.startswith("CONECT"):
+ for iatom0, iatom1 in _parse_pdb_conect_line(line):
+ bonds.append([iatom0, iatom1, bond2num["un"]])
  if line.startswith("END") and molecule_found:
  end_reached = True
  break
- if molecule_found is False:
+ if not molecule_found:
  lit.error("Molecule could not be read!")
  if not end_reached:
  lit.warn("The END is not found, but the parsed data is returned!")
 
- atffparams = {"attypes": np.array(attypes), "restypes": np.array(restypes),
- "resnums": np.array(resnums)}
- extra = {"occupancies": np.array(occupancies), "bfactors": np.array(bfactors)}
+ # Data related to force fields
+ atffparams = {
+ "attypes": np.array(attypes),
+ "restypes": np.array(restypes),
+ "resnums": np.array(resnums),
+ }
+ # Extra data
+ extra = {
+ "occupancies": np.array(occupancies),
+ "bfactors": np.array(bfactors),
+ }
+ if len(compnd_lines) > 0:
+ extra["compound"] = "\n".join(compnd_lines)
  # add chain id, if it wasn't all empty
  if not np.all(chainids == [' '] * len(chainids)):
  extra["chainids"] = np.array(chainids)
+ # Set a useful title
+ if len(title_lines) == 0:
+ # Some files use COMPND instead of TITLE, in which case COMPND will be
+ # used as title.
+ if "compound" in extra:
+ title = extra["compound"]
+ del extra["compound"]
+ else:
+ title = "PDB file loaded by IOData"
+ else:
+ title = "\n".join(title_lines)
  result = {
- 'atcoords': np.array(coords) * angstrom,
+ 'atcoords': np.array(atcoords),
  'atnums': np.array(atnums),
  'atffparams': atffparams,
  'title': title,
- 'extra': extra
+ 'extra': extra,
  }
+ # assign bonds only if some were present
+ if len(bonds) > 0:
+ result["bonds"] = np.array(bonds)
  return result
 
 
@@ -137,16 +206,39 @@ def load_many(lit: LineIterator) -> Iterator[dict]:
  return
 
 
-@document_dump_one("PDB", ['atcoords', 'atnums', 'extra'], ['atffparams', 'title'])
+def _dump_multiline_str(f: TextIO, key: str, value: str):
+ r"""Write a multiline string in PDB format.
+
+ Parameters
+ ----------
+ f
+ A file object to write to.
+ key
+ The key used to prefix the multiline string, e.g. `"TITLE"`.
+ value
+ A (multiline) string, with multiple lines separated by `\n`.
+
+ """
+ prefix = key.ljust(10)
+ for iline, line in enumerate(value.split("\n")):
+ print(prefix + line, file=f)
+ prefix = key + str(iline + 2).rjust(10 - len(key)) + " "
+
+
+@document_dump_one("PDB", ['atcoords', 'atnums', 'extra'], ['atffparams', 'title', 'bonds'])
 def dump_one(f: TextIO, data: IOData):
  """Do not edit this docstring. It will be overwritten."""
- print(str("TITLE " + data.title) or "TITLE Created with IOData", file=f)
+ _dump_multiline_str(f, "TITLE", data.title or "Created with IOData")
+ if "compound" in data.extra:
+ _dump_multiline_str(f, "COMPND", data.extra["compound"])
+ # Prepare for ATOM lines.
  attypes = data.atffparams.get('attypes', None)
  restypes = data.atffparams.get('restypes', None)
  resnums = data.atffparams.get('resnums', None)
  occupancies = data.extra.get('occupancies', None)
  bfactors = data.extra.get('bfactors', None)
  chainids = data.extra.get('chainids', None)
+ # Write ATOM lines.
  for i in range(data.natom):
  n = num2sym[data.atnums[i]]
  resnum = -1 if resnums is None else resnums[i]
@@ -159,6 +251,22 @@ def dump_one(f: TextIO, data: IOData):
  out1 = f'{i+1:>5d} {attype:<4s} {restype:3s} {chain:1s}{resnum:>4d} '
  out2 = f'{x:8.3f}{y:8.3f}{z:8.3f}{occ:6.2f}{b:6.2f}{n:>12s}'
  print("ATOM " + out1 + out2, file=f)
+ if data.bonds is not None:
+ # Prepare for CONECT lines.
+ connections = [[] for iatom in range(data.natom)]
+ for iatom0, iatom1 in data.bonds[:, :2]:
+ connections[iatom0].append(iatom1)
+ connections[iatom1].append(iatom0)
+ # Write CONECT lines.
+ for iatom0, iatoms1 in enumerate(connections):
+ if len(iatoms1) > 0:
+ # Write connection in groups of max 4
+ for ichunk in range(len(iatoms1) // 4 + 1):
+ other_atoms_str = "".join(
+ "{:5d}".format(iatom1 + 1)
+ for iatom1 in iatoms1[ichunk * 4:ichunk * 4 + 4])
+ conect_line = f"CONECT{iatom0 + 1:5d}{other_atoms_str}"
+ print(conect_line, file=f)
  print("END", file=f)
 
 

diff --git a/iodata/test/data/ch5plus.pdb b/iodata/test/data/ch5plus.pdb
@@ -0,0 +1,17 @@
+COMPND UNNAMED
+AUTHOR GENERATED BY OPEN BABEL 2.4.1
+HETATM 1 C UNL 1 1.084 0.018 -0.200 1.00 0.00 C1+
+HETATM 2 H UNL 1 1.536 0.152 -1.213 1.00 0.00 H 
+HETATM 3 H UNL 1 0.314 0.733 -0.579 1.00 0.00 H 
+HETATM 4 H UNL 1 0.636 -0.992 -0.139 1.00 0.00 H 
+HETATM 5 H UNL 1 0.925 0.544 0.772 1.00 0.00 H 
+HETATM 6 H UNL 1 2.147 -0.036 0.138 1.00 0.00 H 
+CONECT 1 2 3 4 5 
+CONECT 1 6 
+CONECT 2 1 
+CONECT 3 1 
+CONECT 4 1 
+CONECT 5 1 
+CONECT 6 1 
+MASTER 0 0 0 0 0 0 0 0 6 0 6 0
+END
diff --git a/iodata/test/data/water_single_model.pdb b/iodata/test/data/water_single_model.pdb
@@ -1,6 +1,10 @@
 MODEL
+TITLE water
 HETATM 1 H HOH 0 0.784 -0.492 0.000 1.00 0.00 H 
 HETATM 2 O HOH 1 0.000 0.062 0.000 1.00 0.00 O 
 HETATM 3 H HOH 0 -0.784 -0.492 0.000 1.00 0.00 H 
+CONECT 1 2
+CONECT 2 1 3
+CONECT 3 2
 ENDMDL
 END
diff --git a/iodata/test/data/water_single_no_end.pdb b/iodata/test/data/water_single_no_end.pdb
@@ -3,3 +3,6 @@ AUTHOR GENERATED BY OPEN BABEL 2.4.1
 HETATM 1 H HOH 0 0.784 -0.492 0.000 1.00 0.00 H 
 HETATM 2 O HOH 1 0.000 0.062 0.000 1.00 0.00 O 
 HETATM 3 H HOH 0 -0.784 -0.492 0.000 1.00 0.00 H 
+CONECT 1 2 
+CONECT 2 1 3 
+CONECT 3 2 
diff --git a/iodata/test/data/water_trajectory_no_model.pdb b/iodata/test/data/water_trajectory_no_model.pdb
@@ -1,20 +1,35 @@
 HETATM 1 O HOH 1 3.341 0.264 2.538 1.00 0.00 O 
 HETATM 2 H HOH 0 3.391 -0.627 2.172 1.00 0.00 H 
 HETATM 3 H HOH 0 2.864 0.114 3.364 1.00 0.00 H 
+CONECT 1 2 
+CONECT 2 1 3 
+CONECT 3 2 
 END
 HETATM 1 O HOH 1 0.345 2.460 -3.307 1.00 0.00 O 
 HETATM 2 H HOH 0 -0.015 3.354 -3.352 1.00 0.00 H 
 HETATM 3 H HOH 0 0.165 2.218 -2.391 1.00 0.00 H 
+CONECT 1 2 
+CONECT 2 1 3 
+CONECT 3 2 
 END
 HETATM 1 O HOH 1 -0.233 -0.790 -3.248 1.00 0.00 O 
 HETATM 2 H HOH 0 0.409 -0.077 -3.354 1.00 0.00 H 
 HETATM 3 H HOH 0 -0.872 -0.395 -2.642 1.00 0.00 H 
+CONECT 1 2 
+CONECT 2 1 3 
+CONECT 3 2 
 END
 HETATM 1 O HOH 1 1.359 3.030 -0.351 1.00 0.00 O 
 HETATM 2 H HOH 0 1.799 3.355 0.444 1.00 0.00 H 
 HETATM 3 H HOH 0 1.936 3.363 -1.049 1.00 0.00 H 
+CONECT 1 2 
+CONECT 2 1 3 
+CONECT 3 2 
 END
 HETATM 1 O HOH 1 -1.382 -3.328 -2.737 1.00 0.00 O 
 HETATM 2 H HOH 0 -2.123 -3.355 -3.354 1.00 0.00 H 
 HETATM 3 H HOH 0 -0.627 -3.266 -3.336 1.00 0.00 H 
+CONECT 1 2 
+CONECT 2 1 3 
+CONECT 3 2 
 END