From e0084bc259ff139ef0b7fe5d38902a72c7198b41 Mon Sep 17 00:00:00 2001
From: Erik Poppleton <rkpoppleton@gmail.com>
Date: Mon, 4 Nov 2024 15:53:17 +0100
Subject: [PATCH] Added PDB -> oxDNA converter to oat

---
 .../src/oxDNA_analysis_tools/PDB_oxDNA.py     | 270 ++++++++++++++++++
 .../UTILS/data_structures.py                  |  40 ++-
 analysis/src/oxDNA_analysis_tools/__init__.py |   2 +-
 3 files changed, 298 insertions(+), 14 deletions(-)
 create mode 100644 analysis/src/oxDNA_analysis_tools/PDB_oxDNA.py

diff --git a/analysis/src/oxDNA_analysis_tools/PDB_oxDNA.py b/analysis/src/oxDNA_analysis_tools/PDB_oxDNA.py
new file mode 100644
index 00000000..3eeb5a9b
--- /dev/null
+++ b/analysis/src/oxDNA_analysis_tools/PDB_oxDNA.py
@@ -0,0 +1,270 @@
+import time
+start_time = time.time()
+import argparse
+import numpy as np
+from os import path
+from typing import Tuple, List, Dict
+from dataclasses import dataclass
+from itertools import permutations
+from oxDNA_analysis_tools.UTILS.logger import log, logger_settings
+from oxDNA_analysis_tools.UTILS.RyeReader import write_conf, write_top
+from oxDNA_analysis_tools.UTILS.data_structures import Configuration, System, Strand, Monomer
+
+BASE_SHIFT = 1.13
+COM_SHIFT = 0.5
+OXDNA_TO_ANGSTROM = 8.518
+ANGSTROM_TO_OXDNA = 1. / OXDNA_TO_ANGSTROM
+
+@dataclass
+class Atom():
+    """A dataclass defining an atom"""
+    id : int
+    type : str
+    alt : str
+    resn : str
+    chain : str
+    resi : int
+    pos : np.ndarray
+
+class Residue():
+    """A class defining a residue"""
+    resn : str
+    resi : int
+    atoms : List[Atom]
+    sugar_atoms : List[Atom]
+    base_atoms : List[Atom]
+    phosphate_atoms : List[Atom]
+    atom_lookup : Dict[str, Atom]
+
+    def __init__(self, resn, resi):
+        self.resn = resn.strip('D35*')
+        self.resi = resi
+        self.atoms = []
+        self.sugar_atoms = []
+        self.phosphate_atoms = []
+        self.base_atoms = []
+        self.atom_lookup = {}
+        if self.resn not in {'A', 'G', 'C', 'T', 'U'}:
+            raise RuntimeError(f'Residue name {resn} at position {resi} could not be mapped to any standard nucleic acid name.')
+
+    def parse_nucleic_atoms(self):
+        for a in self.atoms:
+            if 'P' in a.type or a.type == "HO5'":
+                self.phosphate_atoms.append(a)
+            elif "'" in a.type:
+                self.sugar_atoms.append(a)
+            else:
+                self.base_atoms.append(a)
+
+            self.atom_lookup[a.type] = a
+
+    def calc_ox_properties(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+        self.parse_nucleic_atoms() # should make this work for protein ANMs at some point
+        ring_names = ["C2", "C4", "C5", "C6", "N1", "N3"] # These are found in all rings
+
+        # Take atoms and figure out the pos, a1 and a3
+        pos = np.mean([a.pos for a in self.atoms], axis=0) * ANGSTROM_TO_OXDNA # Center of mass of the atoms
+        #sugar_com = np.mean([a.pos for a in self.sugar_atoms], axis=0)
+
+        # Figure out the a3 vector by finding the average normal vector to the base
+        a3 = np.zeros(3)
+        # In the Taco script they use O4' and the center of the base as the reference, however this is a bad assumption for RNA
+        # Instead I'll use C5' and C3' because that seems much more likely to point up the helix.
+        ref_vec = self.atom_lookup["C5'"].pos - self.atom_lookup["C3'"].pos
+        for perm in permutations(ring_names, 3):
+            p = self.atom_lookup[perm[0]]
+            q = self.atom_lookup[perm[1]]
+            r = self.atom_lookup[perm[2]]
+            v1 = p.pos - q.pos
+            v2 = p.pos - r.pos
+            v1 /= np.linalg.norm(v1)
+            v2 /= np.linalg.norm(v2)
+            a3_i = np.cross(v1, v2)
+            a3_i /= np.linalg.norm(a3_i)
+            if np.dot(a3_i, ref_vec) < 0:
+                a3_i *= -1
+            a3 += a3_i
+        a3 /= np.linalg.norm(a3)
+
+        # Figure out the a1 vector based on the ring atom positions
+        if "C" in self.resn or "T" in self.resn or "U" in self.resn:
+            pairs = [ ["N3", "C6"], ["C2", "N1"], ["C4", "C5"] ]
+        else:
+            pairs = [ ["N1", "C4"], ["C2", "N3"], ["C6", "C5"] ]
+        a1 = np.zeros(3)
+        for pair in pairs:
+            p = self.atom_lookup[pair[0]]
+            q = self.atom_lookup[pair[1]]
+            v1 = p.pos - q.pos
+            v1 /= np.linalg.norm(v1)
+            a1 += v1
+        a1 /= np.linalg.norm(a1)
+
+        return (pos, a1, a3)
+    
+    def to_monomer(self, strand:Strand) -> Monomer:
+        p, a1, a3 = self.calc_ox_properties()
+        return Monomer(self.resi, self.resn, strand, None, None, None, p, a1, a3)
+
+def parse_atom(l:str):
+    return Atom(
+        int(l[6:11].strip()),                                          # atom index
+        l[12:16].strip(),                                              # atom name
+        l[16].strip(),                                                 # alternate location
+        l[17:20].strip(),                                              # resname
+        l[21],                                                         # chain
+        int(l[22:26].strip()),                                         # resid
+        np.array([float(l[30:38]), float(l[38:46]), float(l[46:54])])  # xyz coords
+    )
+
+def PDB_oxDNA(pdb_str:str, old_top:bool=False) -> Tuple[List[Configuration], List[System]]:
+    """
+    Convert a PDB string to an oxDNA Configuration/System
+
+    Parameters:
+        pdb_str (str) : The contents of a PDB file as a string
+        old_top (bool) : (optional) If True, create an old-style topology. default=False
+
+    Returns:
+        (tuple(System, Configuration)) : The system and configuration in oxDNA ready for write-out
+    """
+    
+    # Cleanup function to run whenever we end a strand
+    def end_strand():
+        nonlocal a, r, strand, sys, prev_chain, prev_resi
+        monomer = r.to_monomer(strand)
+        if "O2'" in r.atom_lookup.keys(): # Check for the 2' hydroxyl 
+            strand.type='RNA'
+        strand.append(monomer)
+        sys.append(strand)
+        strand = Strand(strand.id+1)
+        prev_chain = a.chain
+        prev_resi = -1
+
+    # Cleanup function to run whenever we end a model
+    def end_system():
+        nonlocal sys, old_top, systems, configs
+        if old_top:
+            for s in sys.strands:
+                s.monomers = s.monomers[::-1]
+                s.set_old(True)
+
+        positions = np.array([m.pos for s in sys.strands for m in s])
+        box = 1.5*(np.max(positions) - np.min(positions))
+            
+        conf = Configuration(0, np.array([box, box, box]), np.array([0, 0, 0]), positions, np.array([m.a1 for s in sys.strands for m in s]), np.array([m.a3 for s in sys.strands for m in s]))
+        systems.append(sys)
+        configs.append(conf)
+        sys = System('', [])
+
+    systems = []
+    configs = []
+    sys = System('')
+    strand = Strand(1)
+    prev_resi:int = -1
+    prev_chain:str = ''
+
+    lines = pdb_str.split('\n')
+    for l in lines:
+        if l.startswith('ATOM'): # We have found an atom
+            a = parse_atom(l)
+
+            # Catch case where there was no TER on the previous strand
+            if prev_chain != '':
+                if a.chain != prev_chain and len(strand) != 0:
+                    end_strand()
+            
+            # Use the first available alternate position
+            if a.alt:
+                if not (a.alt == 'A' or a.alt == '1'):
+                    continue 
+                log(f"Alternate location for atom {a.id} of residue {a.resi} encountered, using location {a.alt}")
+            
+            # We're in a new residue
+            if a.resi != prev_resi:
+                if prev_resi != -1:
+                    monomer = r.to_monomer(strand)
+                    strand.append(monomer)
+                r = Residue(a.resn, a.resi)
+
+            r.atoms.append(a)
+            prev_resi = a.resi
+            prev_chain = a.chain
+            continue
+
+        # End of a chain
+        elif l.startswith('TER'):
+            end_strand()
+            continue
+
+        # End the system, start a new one (if there is one)
+        # Also catch the last strand if there was no TER identifier
+        elif l.startswith('END'):
+            if len(strand) > 0:
+                end_strand()
+            if len(sys.strands) > 0:
+                end_system()
+            continue
+    
+    # Catch the case where there was no TER or END identifier
+    if len(strand) > 0: 
+        end_strand()
+        end_system()
+    
+    return configs, systems
+
+# This is what gets picked up by the cli documentation builder
+def cli_parser(prog="program_name"):
+    parser = argparse.ArgumentParser(prog = prog, description="Convert a PDB file to oxDNA")
+    parser.add_argument('pdb_file', type=str, help='The pdb file you wish to convert')
+    parser.add_argument('-o', '--output', metavar='output_file', help='The filename to save the output oxDNA files to')
+    parser.add_argument('-b', '--backward', action='store_true', default=False, help="Use the old topology format? (strands will be listed 3'-5')")
+    parser.add_argument('-q', '--quiet', metavar='quiet', dest='quiet', action='store_const', const=True, default=False, help="Don't print 'INFO' messages to stderr")
+    return parser
+
+def main():
+    # Get arguments from the CLI
+    parser = cli_parser(path.basename(__file__))
+    args = parser.parse_args()
+
+    #run system checks
+    logger_settings.set_quiet(args.quiet)
+    from oxDNA_analysis_tools.config import check
+    check(["python", "numpy"])
+
+    # Parse CLI input
+    pdb_file = args.pdb_file
+
+    old_top = args.backward
+ 
+    # -o names the output file
+    if args.output:
+        outbase = args.output
+    else:
+        outbase = pdb_file.replace('.pdb', '')
+        log(f"No outfile name provided, defaulting to \"{outbase}\"")
+
+    # Get the string from the PDB
+    with open(pdb_file, 'r') as f:
+        pdb_str = f.read()
+
+    # Get the oxDNA-style data structures
+    configs, systems = PDB_oxDNA(pdb_str, old_top=old_top)
+
+    for i, (conf, sys) in enumerate(zip(configs, systems)):
+        # Figure out the names
+        if len(configs) == 1:
+            sysn = ''
+        else:
+            sysn = '_'+str(i)
+        outtop = outbase+sysn+'.top'
+        outdat = outbase+sysn+'.dat'
+        # Write out the files
+        write_top(outtop, sys, old_format=old_top)
+        write_conf(outdat, conf)
+        log(f"Wrote outfiles {outtop}, {outdat}")
+
+    print("--- %s seconds ---" % (time.time() - start_time))
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/analysis/src/oxDNA_analysis_tools/UTILS/data_structures.py b/analysis/src/oxDNA_analysis_tools/UTILS/data_structures.py
index 1a5f6eda..37df303a 100644
--- a/analysis/src/oxDNA_analysis_tools/UTILS/data_structures.py
+++ b/analysis/src/oxDNA_analysis_tools/UTILS/data_structures.py
@@ -3,6 +3,7 @@
 from typing import List, Dict
 import inspect
 import numpy as np
+#import numpy.typing as npt
 from typing import Union
 
 
@@ -66,7 +67,7 @@ class Configuration:
             a3s (numpy.ndarray) : the orientations of the stacking sites
     """
     time : int
-    box : np.ndarray
+    box : np.ndarray #np.ndarray[tuple[Literal[3]], np.dtype[np.float]] # This is possible in np > 2.1.0
     energy : np.ndarray
     positions : np.ndarray
     a1s : np.ndarray
@@ -98,7 +99,7 @@ class System:
     top_file: str
     strands: List[Strand]
 
-    def __init__(self, top_file:str='', strands:List = []):
+    def __init__(self, top_file:str='', strands:List[Strand] = []):
         self.top_file = top_file
         self.strands = strands
 
@@ -134,21 +135,21 @@ class Strand:
             **kwargs (dict) : Set addtional attributes of the strand object from key:value pairs.
 
         Attributes:
-            __from_old (bool) : Was this created from an old-style topology file? (default : False)
+            _from_old (bool) : Was this created from an old-style topology file? (default : False)
             id (int) : ID of this strand in the topology file
             monomers (list[Monomer]) : List of consitutent monomer objects (default : [])
             type (str) : Type of molecule this represents (default : DNA)
             circular (bool) : Is this a circular strand? (default : False)
     """
 
-    __from_old: bool
+    _from_old: bool
     id: int
     monomers: List[Monomer]
     type: str
     circular: bool
 
     def __init__(self, id, *initial_data, **kwargs):
-        self.__from_old = False
+        self._from_old = False
         self.id = id
         self.monomers = []
         self.type = 'DNA'
@@ -184,7 +185,7 @@ def get_kwdata(self) -> Dict[str,str]:
         Used for writing out new-style topology files.
         """
         attributes = inspect.getmembers(self, lambda a:not(inspect.isroutine(a)))
-        attributes = [a for a in attributes if not(a[0].startswith('__') or a[0].endswith('__'))]
+        attributes = [a for a in attributes if not(a[0].startswith('_') or a[0].endswith('_'))]
         d = {k:str(v) for k,v in attributes if k != 'monomers'}
         return d
 
@@ -192,13 +193,13 @@ def is_old(self):
         """
         Returns whether this Strand came from an old-style topology file.
         """
-        return self.__from_old
+        return self._from_old
     
     def set_old(self, from_old):
         """
-        Sets the __from_old attribute read by `is_old`.
+        Sets the _from_old attribute read by `is_old`.
         """
-        self.__from_old = from_old
+        self._from_old = from_old
 
     def is_circular(self) -> bool:
         """
@@ -228,6 +229,16 @@ def set_sequence(self, new_seq:str) -> None:
         for m, s, in zip(self, new_seq):
             m.btype = s
 
+    def append(self, monomer:Monomer):
+        """
+            Append a monomer to the strand.
+
+            Modifies this strand in-place.
+
+            Parameters:
+                monomer (Monomer) : The monomer to append
+        """
+        self.monomers.append(monomer)
 
 #No, you cannot make n3, n5 and pair refs to other Monomers
 #Scaffold strands are long enough that it would stack overflow while pickling for Pool processing
@@ -239,8 +250,11 @@ class Monomer:
     """
     id : int
     btype : str
-    strand : Union[Strand, None]
-    n3 : Union[int, None]
-    n5 : Union[int, None]
-    pair : Union[int, None]
+    strand : Union[Strand, None] = None
+    n3 : Union[int, None] = None
+    n5 : Union[int, None] = None
+    pair : Union[int, None] = None
+    pos : Union[np.ndarray, None] = None
+    a1 : Union[np.ndarray, None] = None
+    a3 : Union[np.ndarray, None] = None
     
diff --git a/analysis/src/oxDNA_analysis_tools/__init__.py b/analysis/src/oxDNA_analysis_tools/__init__.py
index 42298073..4ceedc39 100644
--- a/analysis/src/oxDNA_analysis_tools/__init__.py
+++ b/analysis/src/oxDNA_analysis_tools/__init__.py
@@ -1 +1 @@
-__version__ = '2.4.2'
\ No newline at end of file
+__version__ = '2.5.0'
\ No newline at end of file