Skip to content

Commit

Permalink
Added PDB -> oxDNA converter to oat
Browse files Browse the repository at this point in the history
  • Loading branch information
ErikPoppleton committed Nov 4, 2024
1 parent 3ad9bb7 commit e0084bc
Show file tree
Hide file tree
Showing 3 changed files with 298 additions and 14 deletions.
270 changes: 270 additions & 0 deletions analysis/src/oxDNA_analysis_tools/PDB_oxDNA.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,270 @@
import time
start_time = time.time()
import argparse
import numpy as np
from os import path
from typing import Tuple, List, Dict
from dataclasses import dataclass
from itertools import permutations
from oxDNA_analysis_tools.UTILS.logger import log, logger_settings
from oxDNA_analysis_tools.UTILS.RyeReader import write_conf, write_top
from oxDNA_analysis_tools.UTILS.data_structures import Configuration, System, Strand, Monomer

BASE_SHIFT = 1.13
COM_SHIFT = 0.5
OXDNA_TO_ANGSTROM = 8.518
ANGSTROM_TO_OXDNA = 1. / OXDNA_TO_ANGSTROM

@dataclass
class Atom():
"""A dataclass defining an atom"""
id : int
type : str
alt : str
resn : str
chain : str
resi : int
pos : np.ndarray

class Residue():
"""A class defining a residue"""
resn : str
resi : int
atoms : List[Atom]
sugar_atoms : List[Atom]
base_atoms : List[Atom]
phosphate_atoms : List[Atom]
atom_lookup : Dict[str, Atom]

def __init__(self, resn, resi):
self.resn = resn.strip('D35*')
self.resi = resi
self.atoms = []
self.sugar_atoms = []
self.phosphate_atoms = []
self.base_atoms = []
self.atom_lookup = {}
if self.resn not in {'A', 'G', 'C', 'T', 'U'}:
raise RuntimeError(f'Residue name {resn} at position {resi} could not be mapped to any standard nucleic acid name.')

def parse_nucleic_atoms(self):
for a in self.atoms:
if 'P' in a.type or a.type == "HO5'":
self.phosphate_atoms.append(a)
elif "'" in a.type:
self.sugar_atoms.append(a)
else:
self.base_atoms.append(a)

self.atom_lookup[a.type] = a

def calc_ox_properties(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
self.parse_nucleic_atoms() # should make this work for protein ANMs at some point
ring_names = ["C2", "C4", "C5", "C6", "N1", "N3"] # These are found in all rings

# Take atoms and figure out the pos, a1 and a3
pos = np.mean([a.pos for a in self.atoms], axis=0) * ANGSTROM_TO_OXDNA # Center of mass of the atoms
#sugar_com = np.mean([a.pos for a in self.sugar_atoms], axis=0)

# Figure out the a3 vector by finding the average normal vector to the base
a3 = np.zeros(3)
# In the Taco script they use O4' and the center of the base as the reference, however this is a bad assumption for RNA
# Instead I'll use C5' and C3' because that seems much more likely to point up the helix.
ref_vec = self.atom_lookup["C5'"].pos - self.atom_lookup["C3'"].pos
for perm in permutations(ring_names, 3):
p = self.atom_lookup[perm[0]]
q = self.atom_lookup[perm[1]]
r = self.atom_lookup[perm[2]]
v1 = p.pos - q.pos
v2 = p.pos - r.pos
v1 /= np.linalg.norm(v1)
v2 /= np.linalg.norm(v2)
a3_i = np.cross(v1, v2)
a3_i /= np.linalg.norm(a3_i)
if np.dot(a3_i, ref_vec) < 0:
a3_i *= -1
a3 += a3_i
a3 /= np.linalg.norm(a3)

# Figure out the a1 vector based on the ring atom positions
if "C" in self.resn or "T" in self.resn or "U" in self.resn:
pairs = [ ["N3", "C6"], ["C2", "N1"], ["C4", "C5"] ]
else:
pairs = [ ["N1", "C4"], ["C2", "N3"], ["C6", "C5"] ]
a1 = np.zeros(3)
for pair in pairs:
p = self.atom_lookup[pair[0]]
q = self.atom_lookup[pair[1]]
v1 = p.pos - q.pos
v1 /= np.linalg.norm(v1)
a1 += v1
a1 /= np.linalg.norm(a1)

return (pos, a1, a3)

def to_monomer(self, strand:Strand) -> Monomer:
p, a1, a3 = self.calc_ox_properties()
return Monomer(self.resi, self.resn, strand, None, None, None, p, a1, a3)

def parse_atom(l:str):
return Atom(
int(l[6:11].strip()), # atom index
l[12:16].strip(), # atom name
l[16].strip(), # alternate location
l[17:20].strip(), # resname
l[21], # chain
int(l[22:26].strip()), # resid
np.array([float(l[30:38]), float(l[38:46]), float(l[46:54])]) # xyz coords
)

def PDB_oxDNA(pdb_str:str, old_top:bool=False) -> Tuple[List[Configuration], List[System]]:
"""
Convert a PDB string to an oxDNA Configuration/System
Parameters:
pdb_str (str) : The contents of a PDB file as a string
old_top (bool) : (optional) If True, create an old-style topology. default=False
Returns:
(tuple(System, Configuration)) : The system and configuration in oxDNA ready for write-out
"""

# Cleanup function to run whenever we end a strand
def end_strand():
nonlocal a, r, strand, sys, prev_chain, prev_resi
monomer = r.to_monomer(strand)
if "O2'" in r.atom_lookup.keys(): # Check for the 2' hydroxyl
strand.type='RNA'
strand.append(monomer)
sys.append(strand)
strand = Strand(strand.id+1)
prev_chain = a.chain
prev_resi = -1

# Cleanup function to run whenever we end a model
def end_system():
nonlocal sys, old_top, systems, configs
if old_top:
for s in sys.strands:
s.monomers = s.monomers[::-1]
s.set_old(True)

positions = np.array([m.pos for s in sys.strands for m in s])
box = 1.5*(np.max(positions) - np.min(positions))

conf = Configuration(0, np.array([box, box, box]), np.array([0, 0, 0]), positions, np.array([m.a1 for s in sys.strands for m in s]), np.array([m.a3 for s in sys.strands for m in s]))
systems.append(sys)
configs.append(conf)
sys = System('', [])

systems = []
configs = []
sys = System('')
strand = Strand(1)
prev_resi:int = -1
prev_chain:str = ''

lines = pdb_str.split('\n')
for l in lines:
if l.startswith('ATOM'): # We have found an atom
a = parse_atom(l)

# Catch case where there was no TER on the previous strand
if prev_chain != '':
if a.chain != prev_chain and len(strand) != 0:
end_strand()

# Use the first available alternate position
if a.alt:
if not (a.alt == 'A' or a.alt == '1'):
continue
log(f"Alternate location for atom {a.id} of residue {a.resi} encountered, using location {a.alt}")

# We're in a new residue
if a.resi != prev_resi:
if prev_resi != -1:
monomer = r.to_monomer(strand)
strand.append(monomer)
r = Residue(a.resn, a.resi)

r.atoms.append(a)
prev_resi = a.resi
prev_chain = a.chain
continue

# End of a chain
elif l.startswith('TER'):
end_strand()
continue

# End the system, start a new one (if there is one)
# Also catch the last strand if there was no TER identifier
elif l.startswith('END'):
if len(strand) > 0:
end_strand()
if len(sys.strands) > 0:
end_system()
continue

# Catch the case where there was no TER or END identifier
if len(strand) > 0:
end_strand()
end_system()

return configs, systems

# This is what gets picked up by the cli documentation builder
def cli_parser(prog="program_name"):
parser = argparse.ArgumentParser(prog = prog, description="Convert a PDB file to oxDNA")
parser.add_argument('pdb_file', type=str, help='The pdb file you wish to convert')
parser.add_argument('-o', '--output', metavar='output_file', help='The filename to save the output oxDNA files to')
parser.add_argument('-b', '--backward', action='store_true', default=False, help="Use the old topology format? (strands will be listed 3'-5')")
parser.add_argument('-q', '--quiet', metavar='quiet', dest='quiet', action='store_const', const=True, default=False, help="Don't print 'INFO' messages to stderr")
return parser

def main():
# Get arguments from the CLI
parser = cli_parser(path.basename(__file__))
args = parser.parse_args()

#run system checks
logger_settings.set_quiet(args.quiet)
from oxDNA_analysis_tools.config import check
check(["python", "numpy"])

# Parse CLI input
pdb_file = args.pdb_file

old_top = args.backward

# -o names the output file
if args.output:
outbase = args.output
else:
outbase = pdb_file.replace('.pdb', '')
log(f"No outfile name provided, defaulting to \"{outbase}\"")

# Get the string from the PDB
with open(pdb_file, 'r') as f:
pdb_str = f.read()

# Get the oxDNA-style data structures
configs, systems = PDB_oxDNA(pdb_str, old_top=old_top)

for i, (conf, sys) in enumerate(zip(configs, systems)):
# Figure out the names
if len(configs) == 1:
sysn = ''
else:
sysn = '_'+str(i)
outtop = outbase+sysn+'.top'
outdat = outbase+sysn+'.dat'
# Write out the files
write_top(outtop, sys, old_format=old_top)
write_conf(outdat, conf)
log(f"Wrote outfiles {outtop}, {outdat}")

print("--- %s seconds ---" % (time.time() - start_time))

if __name__ == '__main__':
main()
40 changes: 27 additions & 13 deletions analysis/src/oxDNA_analysis_tools/UTILS/data_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import List, Dict
import inspect
import numpy as np
#import numpy.typing as npt
from typing import Union


Expand Down Expand Up @@ -66,7 +67,7 @@ class Configuration:
a3s (numpy.ndarray) : the orientations of the stacking sites
"""
time : int
box : np.ndarray
box : np.ndarray #np.ndarray[tuple[Literal[3]], np.dtype[np.float]] # This is possible in np > 2.1.0
energy : np.ndarray
positions : np.ndarray
a1s : np.ndarray
Expand Down Expand Up @@ -98,7 +99,7 @@ class System:
top_file: str
strands: List[Strand]

def __init__(self, top_file:str='', strands:List = []):
def __init__(self, top_file:str='', strands:List[Strand] = []):
self.top_file = top_file
self.strands = strands

Expand Down Expand Up @@ -134,21 +135,21 @@ class Strand:
**kwargs (dict) : Set addtional attributes of the strand object from key:value pairs.
Attributes:
__from_old (bool) : Was this created from an old-style topology file? (default : False)
_from_old (bool) : Was this created from an old-style topology file? (default : False)
id (int) : ID of this strand in the topology file
monomers (list[Monomer]) : List of consitutent monomer objects (default : [])
type (str) : Type of molecule this represents (default : DNA)
circular (bool) : Is this a circular strand? (default : False)
"""

__from_old: bool
_from_old: bool
id: int
monomers: List[Monomer]
type: str
circular: bool

def __init__(self, id, *initial_data, **kwargs):
self.__from_old = False
self._from_old = False
self.id = id
self.monomers = []
self.type = 'DNA'
Expand Down Expand Up @@ -184,21 +185,21 @@ def get_kwdata(self) -> Dict[str,str]:
Used for writing out new-style topology files.
"""
attributes = inspect.getmembers(self, lambda a:not(inspect.isroutine(a)))
attributes = [a for a in attributes if not(a[0].startswith('__') or a[0].endswith('__'))]
attributes = [a for a in attributes if not(a[0].startswith('_') or a[0].endswith('_'))]
d = {k:str(v) for k,v in attributes if k != 'monomers'}
return d

def is_old(self):
"""
Returns whether this Strand came from an old-style topology file.
"""
return self.__from_old
return self._from_old

def set_old(self, from_old):
"""
Sets the __from_old attribute read by `is_old`.
Sets the _from_old attribute read by `is_old`.
"""
self.__from_old = from_old
self._from_old = from_old

def is_circular(self) -> bool:
"""
Expand Down Expand Up @@ -228,6 +229,16 @@ def set_sequence(self, new_seq:str) -> None:
for m, s, in zip(self, new_seq):
m.btype = s

def append(self, monomer:Monomer):
"""
Append a monomer to the strand.
Modifies this strand in-place.
Parameters:
monomer (Monomer) : The monomer to append
"""
self.monomers.append(monomer)

#No, you cannot make n3, n5 and pair refs to other Monomers
#Scaffold strands are long enough that it would stack overflow while pickling for Pool processing
Expand All @@ -239,8 +250,11 @@ class Monomer:
"""
id : int
btype : str
strand : Union[Strand, None]
n3 : Union[int, None]
n5 : Union[int, None]
pair : Union[int, None]
strand : Union[Strand, None] = None
n3 : Union[int, None] = None
n5 : Union[int, None] = None
pair : Union[int, None] = None
pos : Union[np.ndarray, None] = None
a1 : Union[np.ndarray, None] = None
a3 : Union[np.ndarray, None] = None

2 changes: 1 addition & 1 deletion analysis/src/oxDNA_analysis_tools/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '2.4.2'
__version__ = '2.5.0'

0 comments on commit e0084bc

Please sign in to comment.