From e0084bc259ff139ef0b7fe5d38902a72c7198b41 Mon Sep 17 00:00:00 2001 From: Erik Poppleton Date: Mon, 4 Nov 2024 15:53:17 +0100 Subject: [PATCH] Added PDB -> oxDNA converter to oat --- .../src/oxDNA_analysis_tools/PDB_oxDNA.py | 270 ++++++++++++++++++ .../UTILS/data_structures.py | 40 ++- analysis/src/oxDNA_analysis_tools/__init__.py | 2 +- 3 files changed, 298 insertions(+), 14 deletions(-) create mode 100644 analysis/src/oxDNA_analysis_tools/PDB_oxDNA.py diff --git a/analysis/src/oxDNA_analysis_tools/PDB_oxDNA.py b/analysis/src/oxDNA_analysis_tools/PDB_oxDNA.py new file mode 100644 index 00000000..3eeb5a9b --- /dev/null +++ b/analysis/src/oxDNA_analysis_tools/PDB_oxDNA.py @@ -0,0 +1,270 @@ +import time +start_time = time.time() +import argparse +import numpy as np +from os import path +from typing import Tuple, List, Dict +from dataclasses import dataclass +from itertools import permutations +from oxDNA_analysis_tools.UTILS.logger import log, logger_settings +from oxDNA_analysis_tools.UTILS.RyeReader import write_conf, write_top +from oxDNA_analysis_tools.UTILS.data_structures import Configuration, System, Strand, Monomer + +BASE_SHIFT = 1.13 +COM_SHIFT = 0.5 +OXDNA_TO_ANGSTROM = 8.518 +ANGSTROM_TO_OXDNA = 1. / OXDNA_TO_ANGSTROM + +@dataclass +class Atom(): + """A dataclass defining an atom""" + id : int + type : str + alt : str + resn : str + chain : str + resi : int + pos : np.ndarray + +class Residue(): + """A class defining a residue""" + resn : str + resi : int + atoms : List[Atom] + sugar_atoms : List[Atom] + base_atoms : List[Atom] + phosphate_atoms : List[Atom] + atom_lookup : Dict[str, Atom] + + def __init__(self, resn, resi): + self.resn = resn.strip('D35*') + self.resi = resi + self.atoms = [] + self.sugar_atoms = [] + self.phosphate_atoms = [] + self.base_atoms = [] + self.atom_lookup = {} + if self.resn not in {'A', 'G', 'C', 'T', 'U'}: + raise RuntimeError(f'Residue name {resn} at position {resi} could not be mapped to any standard nucleic acid name.') + + def parse_nucleic_atoms(self): + for a in self.atoms: + if 'P' in a.type or a.type == "HO5'": + self.phosphate_atoms.append(a) + elif "'" in a.type: + self.sugar_atoms.append(a) + else: + self.base_atoms.append(a) + + self.atom_lookup[a.type] = a + + def calc_ox_properties(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + self.parse_nucleic_atoms() # should make this work for protein ANMs at some point + ring_names = ["C2", "C4", "C5", "C6", "N1", "N3"] # These are found in all rings + + # Take atoms and figure out the pos, a1 and a3 + pos = np.mean([a.pos for a in self.atoms], axis=0) * ANGSTROM_TO_OXDNA # Center of mass of the atoms + #sugar_com = np.mean([a.pos for a in self.sugar_atoms], axis=0) + + # Figure out the a3 vector by finding the average normal vector to the base + a3 = np.zeros(3) + # In the Taco script they use O4' and the center of the base as the reference, however this is a bad assumption for RNA + # Instead I'll use C5' and C3' because that seems much more likely to point up the helix. + ref_vec = self.atom_lookup["C5'"].pos - self.atom_lookup["C3'"].pos + for perm in permutations(ring_names, 3): + p = self.atom_lookup[perm[0]] + q = self.atom_lookup[perm[1]] + r = self.atom_lookup[perm[2]] + v1 = p.pos - q.pos + v2 = p.pos - r.pos + v1 /= np.linalg.norm(v1) + v2 /= np.linalg.norm(v2) + a3_i = np.cross(v1, v2) + a3_i /= np.linalg.norm(a3_i) + if np.dot(a3_i, ref_vec) < 0: + a3_i *= -1 + a3 += a3_i + a3 /= np.linalg.norm(a3) + + # Figure out the a1 vector based on the ring atom positions + if "C" in self.resn or "T" in self.resn or "U" in self.resn: + pairs = [ ["N3", "C6"], ["C2", "N1"], ["C4", "C5"] ] + else: + pairs = [ ["N1", "C4"], ["C2", "N3"], ["C6", "C5"] ] + a1 = np.zeros(3) + for pair in pairs: + p = self.atom_lookup[pair[0]] + q = self.atom_lookup[pair[1]] + v1 = p.pos - q.pos + v1 /= np.linalg.norm(v1) + a1 += v1 + a1 /= np.linalg.norm(a1) + + return (pos, a1, a3) + + def to_monomer(self, strand:Strand) -> Monomer: + p, a1, a3 = self.calc_ox_properties() + return Monomer(self.resi, self.resn, strand, None, None, None, p, a1, a3) + +def parse_atom(l:str): + return Atom( + int(l[6:11].strip()), # atom index + l[12:16].strip(), # atom name + l[16].strip(), # alternate location + l[17:20].strip(), # resname + l[21], # chain + int(l[22:26].strip()), # resid + np.array([float(l[30:38]), float(l[38:46]), float(l[46:54])]) # xyz coords + ) + +def PDB_oxDNA(pdb_str:str, old_top:bool=False) -> Tuple[List[Configuration], List[System]]: + """ + Convert a PDB string to an oxDNA Configuration/System + + Parameters: + pdb_str (str) : The contents of a PDB file as a string + old_top (bool) : (optional) If True, create an old-style topology. default=False + + Returns: + (tuple(System, Configuration)) : The system and configuration in oxDNA ready for write-out + """ + + # Cleanup function to run whenever we end a strand + def end_strand(): + nonlocal a, r, strand, sys, prev_chain, prev_resi + monomer = r.to_monomer(strand) + if "O2'" in r.atom_lookup.keys(): # Check for the 2' hydroxyl + strand.type='RNA' + strand.append(monomer) + sys.append(strand) + strand = Strand(strand.id+1) + prev_chain = a.chain + prev_resi = -1 + + # Cleanup function to run whenever we end a model + def end_system(): + nonlocal sys, old_top, systems, configs + if old_top: + for s in sys.strands: + s.monomers = s.monomers[::-1] + s.set_old(True) + + positions = np.array([m.pos for s in sys.strands for m in s]) + box = 1.5*(np.max(positions) - np.min(positions)) + + conf = Configuration(0, np.array([box, box, box]), np.array([0, 0, 0]), positions, np.array([m.a1 for s in sys.strands for m in s]), np.array([m.a3 for s in sys.strands for m in s])) + systems.append(sys) + configs.append(conf) + sys = System('', []) + + systems = [] + configs = [] + sys = System('') + strand = Strand(1) + prev_resi:int = -1 + prev_chain:str = '' + + lines = pdb_str.split('\n') + for l in lines: + if l.startswith('ATOM'): # We have found an atom + a = parse_atom(l) + + # Catch case where there was no TER on the previous strand + if prev_chain != '': + if a.chain != prev_chain and len(strand) != 0: + end_strand() + + # Use the first available alternate position + if a.alt: + if not (a.alt == 'A' or a.alt == '1'): + continue + log(f"Alternate location for atom {a.id} of residue {a.resi} encountered, using location {a.alt}") + + # We're in a new residue + if a.resi != prev_resi: + if prev_resi != -1: + monomer = r.to_monomer(strand) + strand.append(monomer) + r = Residue(a.resn, a.resi) + + r.atoms.append(a) + prev_resi = a.resi + prev_chain = a.chain + continue + + # End of a chain + elif l.startswith('TER'): + end_strand() + continue + + # End the system, start a new one (if there is one) + # Also catch the last strand if there was no TER identifier + elif l.startswith('END'): + if len(strand) > 0: + end_strand() + if len(sys.strands) > 0: + end_system() + continue + + # Catch the case where there was no TER or END identifier + if len(strand) > 0: + end_strand() + end_system() + + return configs, systems + +# This is what gets picked up by the cli documentation builder +def cli_parser(prog="program_name"): + parser = argparse.ArgumentParser(prog = prog, description="Convert a PDB file to oxDNA") + parser.add_argument('pdb_file', type=str, help='The pdb file you wish to convert') + parser.add_argument('-o', '--output', metavar='output_file', help='The filename to save the output oxDNA files to') + parser.add_argument('-b', '--backward', action='store_true', default=False, help="Use the old topology format? (strands will be listed 3'-5')") + parser.add_argument('-q', '--quiet', metavar='quiet', dest='quiet', action='store_const', const=True, default=False, help="Don't print 'INFO' messages to stderr") + return parser + +def main(): + # Get arguments from the CLI + parser = cli_parser(path.basename(__file__)) + args = parser.parse_args() + + #run system checks + logger_settings.set_quiet(args.quiet) + from oxDNA_analysis_tools.config import check + check(["python", "numpy"]) + + # Parse CLI input + pdb_file = args.pdb_file + + old_top = args.backward + + # -o names the output file + if args.output: + outbase = args.output + else: + outbase = pdb_file.replace('.pdb', '') + log(f"No outfile name provided, defaulting to \"{outbase}\"") + + # Get the string from the PDB + with open(pdb_file, 'r') as f: + pdb_str = f.read() + + # Get the oxDNA-style data structures + configs, systems = PDB_oxDNA(pdb_str, old_top=old_top) + + for i, (conf, sys) in enumerate(zip(configs, systems)): + # Figure out the names + if len(configs) == 1: + sysn = '' + else: + sysn = '_'+str(i) + outtop = outbase+sysn+'.top' + outdat = outbase+sysn+'.dat' + # Write out the files + write_top(outtop, sys, old_format=old_top) + write_conf(outdat, conf) + log(f"Wrote outfiles {outtop}, {outdat}") + + print("--- %s seconds ---" % (time.time() - start_time)) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/analysis/src/oxDNA_analysis_tools/UTILS/data_structures.py b/analysis/src/oxDNA_analysis_tools/UTILS/data_structures.py index 1a5f6eda..37df303a 100644 --- a/analysis/src/oxDNA_analysis_tools/UTILS/data_structures.py +++ b/analysis/src/oxDNA_analysis_tools/UTILS/data_structures.py @@ -3,6 +3,7 @@ from typing import List, Dict import inspect import numpy as np +#import numpy.typing as npt from typing import Union @@ -66,7 +67,7 @@ class Configuration: a3s (numpy.ndarray) : the orientations of the stacking sites """ time : int - box : np.ndarray + box : np.ndarray #np.ndarray[tuple[Literal[3]], np.dtype[np.float]] # This is possible in np > 2.1.0 energy : np.ndarray positions : np.ndarray a1s : np.ndarray @@ -98,7 +99,7 @@ class System: top_file: str strands: List[Strand] - def __init__(self, top_file:str='', strands:List = []): + def __init__(self, top_file:str='', strands:List[Strand] = []): self.top_file = top_file self.strands = strands @@ -134,21 +135,21 @@ class Strand: **kwargs (dict) : Set addtional attributes of the strand object from key:value pairs. Attributes: - __from_old (bool) : Was this created from an old-style topology file? (default : False) + _from_old (bool) : Was this created from an old-style topology file? (default : False) id (int) : ID of this strand in the topology file monomers (list[Monomer]) : List of consitutent monomer objects (default : []) type (str) : Type of molecule this represents (default : DNA) circular (bool) : Is this a circular strand? (default : False) """ - __from_old: bool + _from_old: bool id: int monomers: List[Monomer] type: str circular: bool def __init__(self, id, *initial_data, **kwargs): - self.__from_old = False + self._from_old = False self.id = id self.monomers = [] self.type = 'DNA' @@ -184,7 +185,7 @@ def get_kwdata(self) -> Dict[str,str]: Used for writing out new-style topology files. """ attributes = inspect.getmembers(self, lambda a:not(inspect.isroutine(a))) - attributes = [a for a in attributes if not(a[0].startswith('__') or a[0].endswith('__'))] + attributes = [a for a in attributes if not(a[0].startswith('_') or a[0].endswith('_'))] d = {k:str(v) for k,v in attributes if k != 'monomers'} return d @@ -192,13 +193,13 @@ def is_old(self): """ Returns whether this Strand came from an old-style topology file. """ - return self.__from_old + return self._from_old def set_old(self, from_old): """ - Sets the __from_old attribute read by `is_old`. + Sets the _from_old attribute read by `is_old`. """ - self.__from_old = from_old + self._from_old = from_old def is_circular(self) -> bool: """ @@ -228,6 +229,16 @@ def set_sequence(self, new_seq:str) -> None: for m, s, in zip(self, new_seq): m.btype = s + def append(self, monomer:Monomer): + """ + Append a monomer to the strand. + + Modifies this strand in-place. + + Parameters: + monomer (Monomer) : The monomer to append + """ + self.monomers.append(monomer) #No, you cannot make n3, n5 and pair refs to other Monomers #Scaffold strands are long enough that it would stack overflow while pickling for Pool processing @@ -239,8 +250,11 @@ class Monomer: """ id : int btype : str - strand : Union[Strand, None] - n3 : Union[int, None] - n5 : Union[int, None] - pair : Union[int, None] + strand : Union[Strand, None] = None + n3 : Union[int, None] = None + n5 : Union[int, None] = None + pair : Union[int, None] = None + pos : Union[np.ndarray, None] = None + a1 : Union[np.ndarray, None] = None + a3 : Union[np.ndarray, None] = None diff --git a/analysis/src/oxDNA_analysis_tools/__init__.py b/analysis/src/oxDNA_analysis_tools/__init__.py index 42298073..4ceedc39 100644 --- a/analysis/src/oxDNA_analysis_tools/__init__.py +++ b/analysis/src/oxDNA_analysis_tools/__init__.py @@ -1 +1 @@ -__version__ = '2.4.2' \ No newline at end of file +__version__ = '2.5.0' \ No newline at end of file