Skip to content

Commit

Permalink
Merge pull request #250 from tovrstra/pdb-bonds
Browse files Browse the repository at this point in the history
Add support for `CONECT` lines in PDB file (load and dump bonds)
  • Loading branch information
tovrstra authored Apr 29, 2021
2 parents a0bd8aa + 54b4127 commit d7eccdf
Show file tree
Hide file tree
Showing 6 changed files with 225 additions and 48 deletions.
178 changes: 143 additions & 35 deletions iodata/formats/pdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
from ..docstrings import (document_load_one, document_load_many, document_dump_one,
document_dump_many)
from ..iodata import IOData
from ..periodic import sym2num, num2sym
from ..periodic import sym2num, num2sym, bond2num
from ..utils import angstrom, LineIterator


Expand All @@ -41,9 +41,7 @@
PATTERNS = ['*.pdb']


@document_load_one("PDB", ['atcoords', 'atnums', 'atffparams', 'extra'], ['title'])
def load_one(lit: LineIterator) -> dict:
"""Do not edit this docstring. It will be overwritten."""
def _parse_pdb_atom_line(line):
# Overview of ATOM records
# COLUMNS DATA TYPE FIELD DEFINITION
# -------------------------------------------------------------------------------------
Expand All @@ -62,66 +60,137 @@ def load_one(lit: LineIterator) -> dict:
# 61 - 66 Real(6.2) tempFactor Temperature factor.
# 77 - 78 LString(2) element Element symbol, right-justified.
# 79 - 80 LString(2) charge Charge on the atom.

# Get element symbol from position 77:78 in pdb format
words = line[76:78].split()
if not words:
# If not present, guess it from position 13:16 (atom name)
words = line[12:16].split()
# assign atomic number
symbol = words[0].title()
atnum = sym2num.get(symbol, sym2num.get(symbol[0], None))
# atom name, residue name, chain id, & residue sequence number
attype = line[12:16].strip()
restype = line[17:20].strip()
chainid = line[21]
resnum = int(line[22:26])
# add x, y, and z
atcoord = [
float(line[30:38]) * angstrom,
float(line[38:46]) * angstrom,
float(line[46:54]) * angstrom,
]
# get occupancies & temperature factor
occupancy = float(line[54:60])
bfactor = float(line[60:66])
return atnum, attype, restype, chainid, resnum, atcoord, occupancy, bfactor


def _parse_pdb_conect_line(line):
# Overview of CONECT records
# COLUMNS DATA TYPE FIELD DEFINITION
# -------------------------------------------------------------------------
# 1 - 6 Record name "CONECT"
# 7 - 11 Integer serial Atom serial number
# 12 - 16 Integer serial Serial number of bonded atom
# 17 - 21 Integer serial Serial number of bonded atom
# 22 - 26 Integer serial Serial number of bonded atom
# 27 - 31 Integer serial Serial number of bonded atom
iatom0 = int(line[7:12]) - 1
for ipos in 12, 17, 22, 27:
serial_str = line[ipos: ipos + 5].strip()
if serial_str != "":
iatom1 = int(serial_str) - 1
if iatom1 > iatom0:
yield iatom0, iatom1


@document_load_one("PDB", ['atcoords', 'atnums', 'atffparams', 'extra'], ['title', 'bonds'])
def load_one(lit: LineIterator) -> dict: # pylint: disable=too-many-branches, too-many-statements
"""Do not edit this docstring. It will be overwritten."""
title_lines = []
compnd_lines = []
atnums = []
attypes = []
restypes = []
chainids = []
resnums = []
coords = []
atcoords = []
occupancies = []
bfactors = []
bonds = []
molecule_found = False
end_reached = False
title = "PDB file from IOData"
while True:
try:
line = next(lit)
except StopIteration:
break
# If the PDB file has a title replace it.
if line.startswith("TITLE") or line.startswith("COMPND"):
title = line[10:].rstrip()
# If the PDB file has a title, replace the default.
if line.startswith("TITLE"):
title_lines.append(line[10:].strip())
if line.startswith("COMPND"):
compnd_lines.append(line[10:].strip())
if line.startswith("ATOM") or line.startswith("HETATM"):
# get element symbol from position 77:78 in pdb format
words = line[76:78].split()
if not words:
# If not present, guess it from position 13:16 (atom name)
words = line[12:16].split()
# assign atomic number
symbol = words[0].title()
atnums.append(sym2num.get(symbol, sym2num.get(symbol[0], None)))
# atom name, residue name, chain id, & residue sequence number
attypes.append(line[12:16].strip())
restypes.append(line[17:20].strip())
chainids.append(line[21])
resnums.append(int(line[22:26]))
# add x, y, and z
coords.append([float(line[30:38]), float(line[38:46]), float(line[46:54])])
# get occupancies & temperature factor
occupancies.append(float(line[54:60]))
bfactors.append(float(line[60:66]))
(atnum, attype, restype, chainid, resnum, atcoord, occupancy,
bfactor) = _parse_pdb_atom_line(line)
atnums.append(atnum)
attypes.append(attype)
restypes.append(restype)
chainids.append(chainid)
resnums.append(resnum)
atcoords.append(atcoord)
occupancies.append(occupancy)
bfactors.append(bfactor)
molecule_found = True
if line.startswith("CONECT"):
for iatom0, iatom1 in _parse_pdb_conect_line(line):
bonds.append([iatom0, iatom1, bond2num["un"]])
if line.startswith("END") and molecule_found:
end_reached = True
break
if molecule_found is False:
if not molecule_found:
lit.error("Molecule could not be read!")
if not end_reached:
lit.warn("The END is not found, but the parsed data is returned!")

atffparams = {"attypes": np.array(attypes), "restypes": np.array(restypes),
"resnums": np.array(resnums)}
extra = {"occupancies": np.array(occupancies), "bfactors": np.array(bfactors)}
# Data related to force fields
atffparams = {
"attypes": np.array(attypes),
"restypes": np.array(restypes),
"resnums": np.array(resnums),
}
# Extra data
extra = {
"occupancies": np.array(occupancies),
"bfactors": np.array(bfactors),
}
if len(compnd_lines) > 0:
extra["compound"] = "\n".join(compnd_lines)
# add chain id, if it wasn't all empty
if not np.all(chainids == [' '] * len(chainids)):
extra["chainids"] = np.array(chainids)
# Set a useful title
if len(title_lines) == 0:
# Some files use COMPND instead of TITLE, in which case COMPND will be
# used as title.
if "compound" in extra:
title = extra["compound"]
del extra["compound"]
else:
title = "PDB file loaded by IOData"
else:
title = "\n".join(title_lines)
result = {
'atcoords': np.array(coords) * angstrom,
'atcoords': np.array(atcoords),
'atnums': np.array(atnums),
'atffparams': atffparams,
'title': title,
'extra': extra
'extra': extra,
}
# assign bonds only if some were present
if len(bonds) > 0:
result["bonds"] = np.array(bonds)
return result


Expand All @@ -137,16 +206,39 @@ def load_many(lit: LineIterator) -> Iterator[dict]:
return


@document_dump_one("PDB", ['atcoords', 'atnums', 'extra'], ['atffparams', 'title'])
def _dump_multiline_str(f: TextIO, key: str, value: str):
r"""Write a multiline string in PDB format.
Parameters
----------
f
A file object to write to.
key
The key used to prefix the multiline string, e.g. `"TITLE"`.
value
A (multiline) string, with multiple lines separated by `\n`.
"""
prefix = key.ljust(10)
for iline, line in enumerate(value.split("\n")):
print(prefix + line, file=f)
prefix = key + str(iline + 2).rjust(10 - len(key)) + " "


@document_dump_one("PDB", ['atcoords', 'atnums', 'extra'], ['atffparams', 'title', 'bonds'])
def dump_one(f: TextIO, data: IOData):
"""Do not edit this docstring. It will be overwritten."""
print(str("TITLE " + data.title) or "TITLE Created with IOData", file=f)
_dump_multiline_str(f, "TITLE", data.title or "Created with IOData")
if "compound" in data.extra:
_dump_multiline_str(f, "COMPND", data.extra["compound"])
# Prepare for ATOM lines.
attypes = data.atffparams.get('attypes', None)
restypes = data.atffparams.get('restypes', None)
resnums = data.atffparams.get('resnums', None)
occupancies = data.extra.get('occupancies', None)
bfactors = data.extra.get('bfactors', None)
chainids = data.extra.get('chainids', None)
# Write ATOM lines.
for i in range(data.natom):
n = num2sym[data.atnums[i]]
resnum = -1 if resnums is None else resnums[i]
Expand All @@ -159,6 +251,22 @@ def dump_one(f: TextIO, data: IOData):
out1 = f'{i+1:>5d} {attype:<4s} {restype:3s} {chain:1s}{resnum:>4d} '
out2 = f'{x:8.3f}{y:8.3f}{z:8.3f}{occ:6.2f}{b:6.2f}{n:>12s}'
print("ATOM " + out1 + out2, file=f)
if data.bonds is not None:
# Prepare for CONECT lines.
connections = [[] for iatom in range(data.natom)]
for iatom0, iatom1 in data.bonds[:, :2]:
connections[iatom0].append(iatom1)
connections[iatom1].append(iatom0)
# Write CONECT lines.
for iatom0, iatoms1 in enumerate(connections):
if len(iatoms1) > 0:
# Write connection in groups of max 4
for ichunk in range(len(iatoms1) // 4 + 1):
other_atoms_str = "".join(
"{:5d}".format(iatom1 + 1)
for iatom1 in iatoms1[ichunk * 4:ichunk * 4 + 4])
conect_line = f"CONECT{iatom0 + 1:5d}{other_atoms_str}"
print(conect_line, file=f)
print("END", file=f)


Expand Down
17 changes: 17 additions & 0 deletions iodata/test/data/ch5plus.pdb
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
COMPND UNNAMED
AUTHOR GENERATED BY OPEN BABEL 2.4.1
HETATM 1 C UNL 1 1.084 0.018 -0.200 1.00 0.00 C1+
HETATM 2 H UNL 1 1.536 0.152 -1.213 1.00 0.00 H
HETATM 3 H UNL 1 0.314 0.733 -0.579 1.00 0.00 H
HETATM 4 H UNL 1 0.636 -0.992 -0.139 1.00 0.00 H
HETATM 5 H UNL 1 0.925 0.544 0.772 1.00 0.00 H
HETATM 6 H UNL 1 2.147 -0.036 0.138 1.00 0.00 H
CONECT 1 2 3 4 5
CONECT 1 6
CONECT 2 1
CONECT 3 1
CONECT 4 1
CONECT 5 1
CONECT 6 1
MASTER 0 0 0 0 0 0 0 0 6 0 6 0
END
4 changes: 4 additions & 0 deletions iodata/test/data/water_single_model.pdb
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
MODEL
TITLE water
HETATM 1 H HOH 0 0.784 -0.492 0.000 1.00 0.00 H
HETATM 2 O HOH 1 0.000 0.062 0.000 1.00 0.00 O
HETATM 3 H HOH 0 -0.784 -0.492 0.000 1.00 0.00 H
CONECT 1 2
CONECT 2 1 3
CONECT 3 2
ENDMDL
END
3 changes: 3 additions & 0 deletions iodata/test/data/water_single_no_end.pdb
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,6 @@ AUTHOR GENERATED BY OPEN BABEL 2.4.1
HETATM 1 H HOH 0 0.784 -0.492 0.000 1.00 0.00 H
HETATM 2 O HOH 1 0.000 0.062 0.000 1.00 0.00 O
HETATM 3 H HOH 0 -0.784 -0.492 0.000 1.00 0.00 H
CONECT 1 2
CONECT 2 1 3
CONECT 3 2
15 changes: 15 additions & 0 deletions iodata/test/data/water_trajectory_no_model.pdb
Original file line number Diff line number Diff line change
@@ -1,20 +1,35 @@
HETATM 1 O HOH 1 3.341 0.264 2.538 1.00 0.00 O
HETATM 2 H HOH 0 3.391 -0.627 2.172 1.00 0.00 H
HETATM 3 H HOH 0 2.864 0.114 3.364 1.00 0.00 H
CONECT 1 2
CONECT 2 1 3
CONECT 3 2
END
HETATM 1 O HOH 1 0.345 2.460 -3.307 1.00 0.00 O
HETATM 2 H HOH 0 -0.015 3.354 -3.352 1.00 0.00 H
HETATM 3 H HOH 0 0.165 2.218 -2.391 1.00 0.00 H
CONECT 1 2
CONECT 2 1 3
CONECT 3 2
END
HETATM 1 O HOH 1 -0.233 -0.790 -3.248 1.00 0.00 O
HETATM 2 H HOH 0 0.409 -0.077 -3.354 1.00 0.00 H
HETATM 3 H HOH 0 -0.872 -0.395 -2.642 1.00 0.00 H
CONECT 1 2
CONECT 2 1 3
CONECT 3 2
END
HETATM 1 O HOH 1 1.359 3.030 -0.351 1.00 0.00 O
HETATM 2 H HOH 0 1.799 3.355 0.444 1.00 0.00 H
HETATM 3 H HOH 0 1.936 3.363 -1.049 1.00 0.00 H
CONECT 1 2
CONECT 2 1 3
CONECT 3 2
END
HETATM 1 O HOH 1 -1.382 -3.328 -2.737 1.00 0.00 O
HETATM 2 H HOH 0 -2.123 -3.355 -3.354 1.00 0.00 H
HETATM 3 H HOH 0 -0.627 -3.266 -3.336 1.00 0.00 H
CONECT 1 2
CONECT 2 1 3
CONECT 3 2
END
Loading

0 comments on commit d7eccdf

Please sign in to comment.