Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for CONECT lines in PDB file (load and dump bonds) #250

Merged
merged 5 commits into from
Apr 29, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
178 changes: 143 additions & 35 deletions iodata/formats/pdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
from ..docstrings import (document_load_one, document_load_many, document_dump_one,
document_dump_many)
from ..iodata import IOData
from ..periodic import sym2num, num2sym
from ..periodic import sym2num, num2sym, bond2num
from ..utils import angstrom, LineIterator


Expand All @@ -41,9 +41,7 @@
PATTERNS = ['*.pdb']


@document_load_one("PDB", ['atcoords', 'atnums', 'atffparams', 'extra'], ['title'])
def load_one(lit: LineIterator) -> dict:
"""Do not edit this docstring. It will be overwritten."""
def _parse_pdb_atom_line(line):
# Overview of ATOM records
# COLUMNS DATA TYPE FIELD DEFINITION
# -------------------------------------------------------------------------------------
Expand All @@ -62,66 +60,137 @@ def load_one(lit: LineIterator) -> dict:
# 61 - 66 Real(6.2) tempFactor Temperature factor.
# 77 - 78 LString(2) element Element symbol, right-justified.
# 79 - 80 LString(2) charge Charge on the atom.

# Get element symbol from position 77:78 in pdb format
words = line[76:78].split()
if not words:
# If not present, guess it from position 13:16 (atom name)
words = line[12:16].split()
# assign atomic number
symbol = words[0].title()
atnum = sym2num.get(symbol, sym2num.get(symbol[0], None))
# atom name, residue name, chain id, & residue sequence number
attype = line[12:16].strip()
restype = line[17:20].strip()
chainid = line[21]
resnum = int(line[22:26])
# add x, y, and z
atcoord = [
float(line[30:38]) * angstrom,
float(line[38:46]) * angstrom,
float(line[46:54]) * angstrom,
]
# get occupancies & temperature factor
occupancy = float(line[54:60])
bfactor = float(line[60:66])
return atnum, attype, restype, chainid, resnum, atcoord, occupancy, bfactor


def _parse_pdb_conect_line(line):
# Overview of CONECT records
# COLUMNS DATA TYPE FIELD DEFINITION
# -------------------------------------------------------------------------
# 1 - 6 Record name "CONECT"
# 7 - 11 Integer serial Atom serial number
# 12 - 16 Integer serial Serial number of bonded atom
# 17 - 21 Integer serial Serial number of bonded atom
# 22 - 26 Integer serial Serial number of bonded atom
# 27 - 31 Integer serial Serial number of bonded atom
iatom0 = int(line[7:12]) - 1
for ipos in 12, 17, 22, 27:
serial_str = line[ipos: ipos + 5].strip()
if serial_str != "":
iatom1 = int(serial_str) - 1
if iatom1 > iatom0:
yield iatom0, iatom1


@document_load_one("PDB", ['atcoords', 'atnums', 'atffparams', 'extra'], ['title', 'bonds'])
def load_one(lit: LineIterator) -> dict: # pylint: disable=too-many-branches, too-many-statements
"""Do not edit this docstring. It will be overwritten."""
title_lines = []
compnd_lines = []
atnums = []
attypes = []
restypes = []
chainids = []
resnums = []
coords = []
atcoords = []
occupancies = []
bfactors = []
bonds = []
molecule_found = False
end_reached = False
title = "PDB file from IOData"
while True:
try:
line = next(lit)
except StopIteration:
break
# If the PDB file has a title replace it.
if line.startswith("TITLE") or line.startswith("COMPND"):
title = line[10:].rstrip()
# If the PDB file has a title, replace the default.
if line.startswith("TITLE"):
title_lines.append(line[10:].strip())
if line.startswith("COMPND"):
compnd_lines.append(line[10:].strip())
if line.startswith("ATOM") or line.startswith("HETATM"):
# get element symbol from position 77:78 in pdb format
words = line[76:78].split()
if not words:
# If not present, guess it from position 13:16 (atom name)
words = line[12:16].split()
# assign atomic number
symbol = words[0].title()
atnums.append(sym2num.get(symbol, sym2num.get(symbol[0], None)))
# atom name, residue name, chain id, & residue sequence number
attypes.append(line[12:16].strip())
restypes.append(line[17:20].strip())
chainids.append(line[21])
resnums.append(int(line[22:26]))
# add x, y, and z
coords.append([float(line[30:38]), float(line[38:46]), float(line[46:54])])
# get occupancies & temperature factor
occupancies.append(float(line[54:60]))
bfactors.append(float(line[60:66]))
(atnum, attype, restype, chainid, resnum, atcoord, occupancy,
bfactor) = _parse_pdb_atom_line(line)
atnums.append(atnum)
attypes.append(attype)
restypes.append(restype)
chainids.append(chainid)
resnums.append(resnum)
atcoords.append(atcoord)
occupancies.append(occupancy)
bfactors.append(bfactor)
molecule_found = True
if line.startswith("CONECT"):
for iatom0, iatom1 in _parse_pdb_conect_line(line):
bonds.append([iatom0, iatom1, bond2num["un"]])
if line.startswith("END") and molecule_found:
end_reached = True
break
if molecule_found is False:
if not molecule_found:
lit.error("Molecule could not be read!")
if not end_reached:
lit.warn("The END is not found, but the parsed data is returned!")

atffparams = {"attypes": np.array(attypes), "restypes": np.array(restypes),
"resnums": np.array(resnums)}
extra = {"occupancies": np.array(occupancies), "bfactors": np.array(bfactors)}
# Data related to force fields
atffparams = {
"attypes": np.array(attypes),
"restypes": np.array(restypes),
"resnums": np.array(resnums),
}
# Extra data
extra = {
"occupancies": np.array(occupancies),
"bfactors": np.array(bfactors),
}
if len(compnd_lines) > 0:
extra["compound"] = "\n".join(compnd_lines)
# add chain id, if it wasn't all empty
if not np.all(chainids == [' '] * len(chainids)):
extra["chainids"] = np.array(chainids)
# Set a useful title
if len(title_lines) == 0:
# Some files use COMPND instead of TITLE, in which case COMPND will be
# used as title.
if "compound" in extra:
title = extra["compound"]
del extra["compound"]
else:
title = "PDB file loaded by IOData"
else:
title = "\n".join(title_lines)
result = {
'atcoords': np.array(coords) * angstrom,
'atcoords': np.array(atcoords),
'atnums': np.array(atnums),
'atffparams': atffparams,
'title': title,
'extra': extra
'extra': extra,
}
# assign bonds only if some were present
if len(bonds) > 0:
result["bonds"] = np.array(bonds)
return result


Expand All @@ -137,16 +206,39 @@ def load_many(lit: LineIterator) -> Iterator[dict]:
return


@document_dump_one("PDB", ['atcoords', 'atnums', 'extra'], ['atffparams', 'title'])
def _dump_multiline_str(f: TextIO, key: str, value: str):
r"""Write a multiline string in PDB format.

Parameters
----------
f
A file object to write to.
key
The key used to prefix the multiline string, e.g. `"TITLE"`.
value
A (multiline) string, with multiple lines separated by `\n`.

"""
prefix = key.ljust(10)
for iline, line in enumerate(value.split("\n")):
print(prefix + line, file=f)
prefix = key + str(iline + 2).rjust(10 - len(key)) + " "


@document_dump_one("PDB", ['atcoords', 'atnums', 'extra'], ['atffparams', 'title', 'bonds'])
def dump_one(f: TextIO, data: IOData):
"""Do not edit this docstring. It will be overwritten."""
print(str("TITLE " + data.title) or "TITLE Created with IOData", file=f)
_dump_multiline_str(f, "TITLE", data.title or "Created with IOData")
if "compound" in data.extra:
_dump_multiline_str(f, "COMPND", data.extra["compound"])
# Prepare for ATOM lines.
attypes = data.atffparams.get('attypes', None)
restypes = data.atffparams.get('restypes', None)
resnums = data.atffparams.get('resnums', None)
occupancies = data.extra.get('occupancies', None)
bfactors = data.extra.get('bfactors', None)
chainids = data.extra.get('chainids', None)
# Write ATOM lines.
for i in range(data.natom):
n = num2sym[data.atnums[i]]
resnum = -1 if resnums is None else resnums[i]
Expand All @@ -159,6 +251,22 @@ def dump_one(f: TextIO, data: IOData):
out1 = f'{i+1:>5d} {attype:<4s} {restype:3s} {chain:1s}{resnum:>4d} '
out2 = f'{x:8.3f}{y:8.3f}{z:8.3f}{occ:6.2f}{b:6.2f}{n:>12s}'
print("ATOM " + out1 + out2, file=f)
if data.bonds is not None:
# Prepare for CONECT lines.
connections = [[] for iatom in range(data.natom)]
for iatom0, iatom1 in data.bonds[:, :2]:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would create the list connections and loop over connections only if data.bonds is not None.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point!

connections[iatom0].append(iatom1)
connections[iatom1].append(iatom0)
# Write CONECT lines.
for iatom0, iatoms1 in enumerate(connections):
if len(iatoms1) > 0:
# Write connection in groups of max 4
for ichunk in range(len(iatoms1) // 4 + 1):
other_atoms_str = "".join(
"{:5d}".format(iatom1 + 1)
for iatom1 in iatoms1[ichunk * 4:ichunk * 4 + 4])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would increase the readability of the conde defining the string first of the connected atoms (iatom1) and then concatenate the string "CONECT" and iatom0 string with the newly created string. It took me a some time to understand what is written.

conect_line = f"CONECT{iatom0 + 1:5d}{other_atoms_str}"
print(conect_line, file=f)
print("END", file=f)


Expand Down
17 changes: 17 additions & 0 deletions iodata/test/data/ch5plus.pdb
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
COMPND UNNAMED
AUTHOR GENERATED BY OPEN BABEL 2.4.1
HETATM 1 C UNL 1 1.084 0.018 -0.200 1.00 0.00 C1+
HETATM 2 H UNL 1 1.536 0.152 -1.213 1.00 0.00 H
HETATM 3 H UNL 1 0.314 0.733 -0.579 1.00 0.00 H
HETATM 4 H UNL 1 0.636 -0.992 -0.139 1.00 0.00 H
HETATM 5 H UNL 1 0.925 0.544 0.772 1.00 0.00 H
HETATM 6 H UNL 1 2.147 -0.036 0.138 1.00 0.00 H
CONECT 1 2 3 4 5
CONECT 1 6
CONECT 2 1
CONECT 3 1
CONECT 4 1
CONECT 5 1
CONECT 6 1
MASTER 0 0 0 0 0 0 0 0 6 0 6 0
END
4 changes: 4 additions & 0 deletions iodata/test/data/water_single_model.pdb
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
MODEL
TITLE water
HETATM 1 H HOH 0 0.784 -0.492 0.000 1.00 0.00 H
HETATM 2 O HOH 1 0.000 0.062 0.000 1.00 0.00 O
HETATM 3 H HOH 0 -0.784 -0.492 0.000 1.00 0.00 H
CONECT 1 2
CONECT 2 1 3
CONECT 3 2
ENDMDL
END
3 changes: 3 additions & 0 deletions iodata/test/data/water_single_no_end.pdb
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,6 @@ AUTHOR GENERATED BY OPEN BABEL 2.4.1
HETATM 1 H HOH 0 0.784 -0.492 0.000 1.00 0.00 H
HETATM 2 O HOH 1 0.000 0.062 0.000 1.00 0.00 O
HETATM 3 H HOH 0 -0.784 -0.492 0.000 1.00 0.00 H
CONECT 1 2
CONECT 2 1 3
CONECT 3 2
15 changes: 15 additions & 0 deletions iodata/test/data/water_trajectory_no_model.pdb
Original file line number Diff line number Diff line change
@@ -1,20 +1,35 @@
HETATM 1 O HOH 1 3.341 0.264 2.538 1.00 0.00 O
HETATM 2 H HOH 0 3.391 -0.627 2.172 1.00 0.00 H
HETATM 3 H HOH 0 2.864 0.114 3.364 1.00 0.00 H
CONECT 1 2
CONECT 2 1 3
CONECT 3 2
END
HETATM 1 O HOH 1 0.345 2.460 -3.307 1.00 0.00 O
HETATM 2 H HOH 0 -0.015 3.354 -3.352 1.00 0.00 H
HETATM 3 H HOH 0 0.165 2.218 -2.391 1.00 0.00 H
CONECT 1 2
CONECT 2 1 3
CONECT 3 2
END
HETATM 1 O HOH 1 -0.233 -0.790 -3.248 1.00 0.00 O
HETATM 2 H HOH 0 0.409 -0.077 -3.354 1.00 0.00 H
HETATM 3 H HOH 0 -0.872 -0.395 -2.642 1.00 0.00 H
CONECT 1 2
CONECT 2 1 3
CONECT 3 2
END
HETATM 1 O HOH 1 1.359 3.030 -0.351 1.00 0.00 O
HETATM 2 H HOH 0 1.799 3.355 0.444 1.00 0.00 H
HETATM 3 H HOH 0 1.936 3.363 -1.049 1.00 0.00 H
CONECT 1 2
CONECT 2 1 3
CONECT 3 2
END
HETATM 1 O HOH 1 -1.382 -3.328 -2.737 1.00 0.00 O
HETATM 2 H HOH 0 -2.123 -3.355 -3.354 1.00 0.00 H
HETATM 3 H HOH 0 -0.627 -3.266 -3.336 1.00 0.00 H
CONECT 1 2
CONECT 2 1 3
CONECT 3 2
END
Loading