-
Notifications
You must be signed in to change notification settings - Fork 46
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add support for CONECT
lines in PDB file (load and dump bonds)
#250
Merged
Merged
Changes from all commits
Commits
Show all changes
5 commits
Select commit
Hold shift + click to select a range
166817d
PDB format: Load and dump CONECT lines
tovrstra c85e546
Use more pytest.mark.parametrize
tovrstra 9a66652
Change bond type 1 to unknown in pdb
tovrstra 2a2717b
Fix few issues in CONECT dumping
tovrstra 54b4127
Support multiline TITLE and improve handling of COMPND
tovrstra File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,7 +31,7 @@ | |
from ..docstrings import (document_load_one, document_load_many, document_dump_one, | ||
document_dump_many) | ||
from ..iodata import IOData | ||
from ..periodic import sym2num, num2sym | ||
from ..periodic import sym2num, num2sym, bond2num | ||
from ..utils import angstrom, LineIterator | ||
|
||
|
||
|
@@ -41,9 +41,7 @@ | |
PATTERNS = ['*.pdb'] | ||
|
||
|
||
@document_load_one("PDB", ['atcoords', 'atnums', 'atffparams', 'extra'], ['title']) | ||
def load_one(lit: LineIterator) -> dict: | ||
"""Do not edit this docstring. It will be overwritten.""" | ||
def _parse_pdb_atom_line(line): | ||
# Overview of ATOM records | ||
# COLUMNS DATA TYPE FIELD DEFINITION | ||
# ------------------------------------------------------------------------------------- | ||
|
@@ -62,66 +60,137 @@ def load_one(lit: LineIterator) -> dict: | |
# 61 - 66 Real(6.2) tempFactor Temperature factor. | ||
# 77 - 78 LString(2) element Element symbol, right-justified. | ||
# 79 - 80 LString(2) charge Charge on the atom. | ||
|
||
# Get element symbol from position 77:78 in pdb format | ||
words = line[76:78].split() | ||
if not words: | ||
# If not present, guess it from position 13:16 (atom name) | ||
words = line[12:16].split() | ||
# assign atomic number | ||
symbol = words[0].title() | ||
atnum = sym2num.get(symbol, sym2num.get(symbol[0], None)) | ||
# atom name, residue name, chain id, & residue sequence number | ||
attype = line[12:16].strip() | ||
restype = line[17:20].strip() | ||
chainid = line[21] | ||
resnum = int(line[22:26]) | ||
# add x, y, and z | ||
atcoord = [ | ||
float(line[30:38]) * angstrom, | ||
float(line[38:46]) * angstrom, | ||
float(line[46:54]) * angstrom, | ||
] | ||
# get occupancies & temperature factor | ||
occupancy = float(line[54:60]) | ||
bfactor = float(line[60:66]) | ||
return atnum, attype, restype, chainid, resnum, atcoord, occupancy, bfactor | ||
|
||
|
||
def _parse_pdb_conect_line(line): | ||
# Overview of CONECT records | ||
# COLUMNS DATA TYPE FIELD DEFINITION | ||
# ------------------------------------------------------------------------- | ||
# 1 - 6 Record name "CONECT" | ||
# 7 - 11 Integer serial Atom serial number | ||
# 12 - 16 Integer serial Serial number of bonded atom | ||
# 17 - 21 Integer serial Serial number of bonded atom | ||
# 22 - 26 Integer serial Serial number of bonded atom | ||
# 27 - 31 Integer serial Serial number of bonded atom | ||
iatom0 = int(line[7:12]) - 1 | ||
for ipos in 12, 17, 22, 27: | ||
serial_str = line[ipos: ipos + 5].strip() | ||
if serial_str != "": | ||
iatom1 = int(serial_str) - 1 | ||
if iatom1 > iatom0: | ||
yield iatom0, iatom1 | ||
|
||
|
||
@document_load_one("PDB", ['atcoords', 'atnums', 'atffparams', 'extra'], ['title', 'bonds']) | ||
def load_one(lit: LineIterator) -> dict: # pylint: disable=too-many-branches, too-many-statements | ||
"""Do not edit this docstring. It will be overwritten.""" | ||
title_lines = [] | ||
compnd_lines = [] | ||
atnums = [] | ||
attypes = [] | ||
restypes = [] | ||
chainids = [] | ||
resnums = [] | ||
coords = [] | ||
atcoords = [] | ||
occupancies = [] | ||
bfactors = [] | ||
bonds = [] | ||
molecule_found = False | ||
end_reached = False | ||
title = "PDB file from IOData" | ||
while True: | ||
try: | ||
line = next(lit) | ||
except StopIteration: | ||
break | ||
# If the PDB file has a title replace it. | ||
if line.startswith("TITLE") or line.startswith("COMPND"): | ||
title = line[10:].rstrip() | ||
# If the PDB file has a title, replace the default. | ||
if line.startswith("TITLE"): | ||
title_lines.append(line[10:].strip()) | ||
if line.startswith("COMPND"): | ||
compnd_lines.append(line[10:].strip()) | ||
if line.startswith("ATOM") or line.startswith("HETATM"): | ||
# get element symbol from position 77:78 in pdb format | ||
words = line[76:78].split() | ||
if not words: | ||
# If not present, guess it from position 13:16 (atom name) | ||
words = line[12:16].split() | ||
# assign atomic number | ||
symbol = words[0].title() | ||
atnums.append(sym2num.get(symbol, sym2num.get(symbol[0], None))) | ||
# atom name, residue name, chain id, & residue sequence number | ||
attypes.append(line[12:16].strip()) | ||
restypes.append(line[17:20].strip()) | ||
chainids.append(line[21]) | ||
resnums.append(int(line[22:26])) | ||
# add x, y, and z | ||
coords.append([float(line[30:38]), float(line[38:46]), float(line[46:54])]) | ||
# get occupancies & temperature factor | ||
occupancies.append(float(line[54:60])) | ||
bfactors.append(float(line[60:66])) | ||
(atnum, attype, restype, chainid, resnum, atcoord, occupancy, | ||
bfactor) = _parse_pdb_atom_line(line) | ||
atnums.append(atnum) | ||
attypes.append(attype) | ||
restypes.append(restype) | ||
chainids.append(chainid) | ||
resnums.append(resnum) | ||
atcoords.append(atcoord) | ||
occupancies.append(occupancy) | ||
bfactors.append(bfactor) | ||
molecule_found = True | ||
if line.startswith("CONECT"): | ||
for iatom0, iatom1 in _parse_pdb_conect_line(line): | ||
bonds.append([iatom0, iatom1, bond2num["un"]]) | ||
if line.startswith("END") and molecule_found: | ||
end_reached = True | ||
break | ||
if molecule_found is False: | ||
if not molecule_found: | ||
lit.error("Molecule could not be read!") | ||
if not end_reached: | ||
lit.warn("The END is not found, but the parsed data is returned!") | ||
|
||
atffparams = {"attypes": np.array(attypes), "restypes": np.array(restypes), | ||
"resnums": np.array(resnums)} | ||
extra = {"occupancies": np.array(occupancies), "bfactors": np.array(bfactors)} | ||
# Data related to force fields | ||
atffparams = { | ||
"attypes": np.array(attypes), | ||
"restypes": np.array(restypes), | ||
"resnums": np.array(resnums), | ||
} | ||
# Extra data | ||
extra = { | ||
"occupancies": np.array(occupancies), | ||
"bfactors": np.array(bfactors), | ||
} | ||
if len(compnd_lines) > 0: | ||
extra["compound"] = "\n".join(compnd_lines) | ||
# add chain id, if it wasn't all empty | ||
if not np.all(chainids == [' '] * len(chainids)): | ||
extra["chainids"] = np.array(chainids) | ||
# Set a useful title | ||
if len(title_lines) == 0: | ||
# Some files use COMPND instead of TITLE, in which case COMPND will be | ||
# used as title. | ||
if "compound" in extra: | ||
title = extra["compound"] | ||
del extra["compound"] | ||
else: | ||
title = "PDB file loaded by IOData" | ||
else: | ||
title = "\n".join(title_lines) | ||
result = { | ||
'atcoords': np.array(coords) * angstrom, | ||
'atcoords': np.array(atcoords), | ||
'atnums': np.array(atnums), | ||
'atffparams': atffparams, | ||
'title': title, | ||
'extra': extra | ||
'extra': extra, | ||
} | ||
# assign bonds only if some were present | ||
if len(bonds) > 0: | ||
result["bonds"] = np.array(bonds) | ||
return result | ||
|
||
|
||
|
@@ -137,16 +206,39 @@ def load_many(lit: LineIterator) -> Iterator[dict]: | |
return | ||
|
||
|
||
@document_dump_one("PDB", ['atcoords', 'atnums', 'extra'], ['atffparams', 'title']) | ||
def _dump_multiline_str(f: TextIO, key: str, value: str): | ||
r"""Write a multiline string in PDB format. | ||
|
||
Parameters | ||
---------- | ||
f | ||
A file object to write to. | ||
key | ||
The key used to prefix the multiline string, e.g. `"TITLE"`. | ||
value | ||
A (multiline) string, with multiple lines separated by `\n`. | ||
|
||
""" | ||
prefix = key.ljust(10) | ||
for iline, line in enumerate(value.split("\n")): | ||
print(prefix + line, file=f) | ||
prefix = key + str(iline + 2).rjust(10 - len(key)) + " " | ||
|
||
|
||
@document_dump_one("PDB", ['atcoords', 'atnums', 'extra'], ['atffparams', 'title', 'bonds']) | ||
def dump_one(f: TextIO, data: IOData): | ||
"""Do not edit this docstring. It will be overwritten.""" | ||
print(str("TITLE " + data.title) or "TITLE Created with IOData", file=f) | ||
_dump_multiline_str(f, "TITLE", data.title or "Created with IOData") | ||
if "compound" in data.extra: | ||
_dump_multiline_str(f, "COMPND", data.extra["compound"]) | ||
# Prepare for ATOM lines. | ||
attypes = data.atffparams.get('attypes', None) | ||
restypes = data.atffparams.get('restypes', None) | ||
resnums = data.atffparams.get('resnums', None) | ||
occupancies = data.extra.get('occupancies', None) | ||
bfactors = data.extra.get('bfactors', None) | ||
chainids = data.extra.get('chainids', None) | ||
# Write ATOM lines. | ||
for i in range(data.natom): | ||
n = num2sym[data.atnums[i]] | ||
resnum = -1 if resnums is None else resnums[i] | ||
|
@@ -159,6 +251,22 @@ def dump_one(f: TextIO, data: IOData): | |
out1 = f'{i+1:>5d} {attype:<4s} {restype:3s} {chain:1s}{resnum:>4d} ' | ||
out2 = f'{x:8.3f}{y:8.3f}{z:8.3f}{occ:6.2f}{b:6.2f}{n:>12s}' | ||
print("ATOM " + out1 + out2, file=f) | ||
if data.bonds is not None: | ||
# Prepare for CONECT lines. | ||
connections = [[] for iatom in range(data.natom)] | ||
for iatom0, iatom1 in data.bonds[:, :2]: | ||
connections[iatom0].append(iatom1) | ||
connections[iatom1].append(iatom0) | ||
# Write CONECT lines. | ||
for iatom0, iatoms1 in enumerate(connections): | ||
if len(iatoms1) > 0: | ||
# Write connection in groups of max 4 | ||
for ichunk in range(len(iatoms1) // 4 + 1): | ||
other_atoms_str = "".join( | ||
"{:5d}".format(iatom1 + 1) | ||
for iatom1 in iatoms1[ichunk * 4:ichunk * 4 + 4]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would increase the readability of the conde defining the string first of the connected atoms (iatom1) and then concatenate the string "CONECT" and iatom0 string with the newly created string. It took me a some time to understand what is written. |
||
conect_line = f"CONECT{iatom0 + 1:5d}{other_atoms_str}" | ||
print(conect_line, file=f) | ||
print("END", file=f) | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
COMPND UNNAMED | ||
AUTHOR GENERATED BY OPEN BABEL 2.4.1 | ||
HETATM 1 C UNL 1 1.084 0.018 -0.200 1.00 0.00 C1+ | ||
HETATM 2 H UNL 1 1.536 0.152 -1.213 1.00 0.00 H | ||
HETATM 3 H UNL 1 0.314 0.733 -0.579 1.00 0.00 H | ||
HETATM 4 H UNL 1 0.636 -0.992 -0.139 1.00 0.00 H | ||
HETATM 5 H UNL 1 0.925 0.544 0.772 1.00 0.00 H | ||
HETATM 6 H UNL 1 2.147 -0.036 0.138 1.00 0.00 H | ||
CONECT 1 2 3 4 5 | ||
CONECT 1 6 | ||
CONECT 2 1 | ||
CONECT 3 1 | ||
CONECT 4 1 | ||
CONECT 5 1 | ||
CONECT 6 1 | ||
MASTER 0 0 0 0 0 0 0 0 6 0 6 0 | ||
END |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,10 @@ | ||
MODEL | ||
TITLE water | ||
HETATM 1 H HOH 0 0.784 -0.492 0.000 1.00 0.00 H | ||
HETATM 2 O HOH 1 0.000 0.062 0.000 1.00 0.00 O | ||
HETATM 3 H HOH 0 -0.784 -0.492 0.000 1.00 0.00 H | ||
CONECT 1 2 | ||
CONECT 2 1 3 | ||
CONECT 3 2 | ||
ENDMDL | ||
END |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,20 +1,35 @@ | ||
HETATM 1 O HOH 1 3.341 0.264 2.538 1.00 0.00 O | ||
HETATM 2 H HOH 0 3.391 -0.627 2.172 1.00 0.00 H | ||
HETATM 3 H HOH 0 2.864 0.114 3.364 1.00 0.00 H | ||
CONECT 1 2 | ||
CONECT 2 1 3 | ||
CONECT 3 2 | ||
END | ||
HETATM 1 O HOH 1 0.345 2.460 -3.307 1.00 0.00 O | ||
HETATM 2 H HOH 0 -0.015 3.354 -3.352 1.00 0.00 H | ||
HETATM 3 H HOH 0 0.165 2.218 -2.391 1.00 0.00 H | ||
CONECT 1 2 | ||
CONECT 2 1 3 | ||
CONECT 3 2 | ||
END | ||
HETATM 1 O HOH 1 -0.233 -0.790 -3.248 1.00 0.00 O | ||
HETATM 2 H HOH 0 0.409 -0.077 -3.354 1.00 0.00 H | ||
HETATM 3 H HOH 0 -0.872 -0.395 -2.642 1.00 0.00 H | ||
CONECT 1 2 | ||
CONECT 2 1 3 | ||
CONECT 3 2 | ||
END | ||
HETATM 1 O HOH 1 1.359 3.030 -0.351 1.00 0.00 O | ||
HETATM 2 H HOH 0 1.799 3.355 0.444 1.00 0.00 H | ||
HETATM 3 H HOH 0 1.936 3.363 -1.049 1.00 0.00 H | ||
CONECT 1 2 | ||
CONECT 2 1 3 | ||
CONECT 3 2 | ||
END | ||
HETATM 1 O HOH 1 -1.382 -3.328 -2.737 1.00 0.00 O | ||
HETATM 2 H HOH 0 -2.123 -3.355 -3.354 1.00 0.00 H | ||
HETATM 3 H HOH 0 -0.627 -3.266 -3.336 1.00 0.00 H | ||
CONECT 1 2 | ||
CONECT 2 1 3 | ||
CONECT 3 2 | ||
END |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would create the list connections and loop over connections only if data.bonds is not None.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good point!