Autodesk · ktbolt · Sep 21, 2017 · Sep 21, 2017 · Oct 6, 2017 · Oct 12, 2017
diff --git a/moldesign/helpers/pdb.py b/moldesign/helpers/pdb.py
@@ -31,6 +31,10 @@
 
 BioAssembly = collections.namedtuple('BioAssembly', 'desc chains transforms')
 
+# This named tuple is used to store the mapping between PDB and CIF residue
+# sequence numbering.
+ResidueSequenceMap = collections.namedtuple('ResidueSequenceMap', 'pdb_chain_id, begin_seq, end_seq, pdb_begin_seq, pdb_end_seq')
+
 
 def get_conect_pairs(mol):
     """ Returns a dicitonary of HETATM bonds for a PDB CONECT record

diff --git a/moldesign/interfaces/biopython_interface.py b/moldesign/interfaces/biopython_interface.py
@@ -117,15 +117,18 @@ def get_mmcif_assemblies(fileobj=None, mmcdata=None):
         opers = [opers]
 
     # now create the assembly specifications
-    assemblies = {}
-    for id, detail, chainlist, operlist in zip(ids, details, chains, opers):
-        assert id not in assemblies
-        transforms = [transforms[i] for i in operlist.split(',')]
-        assemblies[id] = BioAssembly(detail, chainlist.split(','), transforms)
+    try:
+        assemblies = {}
+        for id, detail, chainlist, operlist in zip(ids, details, chains, opers):
+            assert id not in assemblies
+            transforms = [transforms[i] for i in operlist.split(',')]
+            assemblies[id] = BioAssembly(detail, chainlist.split(','), transforms)
+    except Exception:
+        print('WARNING: failed to construct assemblies')
+        assemblies = {}
 
     return assemblies
 
-
 def _make_transform_dict(tmat, transform_ids):
     if isinstance(transform_ids, list):
         for i, j in itertools.product(range(0, 3), range(0, 4)):

diff --git a/moldesign/interfaces/parmed_interface.py b/moldesign/interfaces/parmed_interface.py
@@ -29,6 +29,7 @@
 import moldesign as mdt
 from .. import units as u
 from .. import utils
+from moldesign.helpers.pdb import ResidueSequenceMap
 
 if future.utils.PY2:
     from cStringIO import StringIO
@@ -161,13 +162,16 @@ def parmed_to_mdt(pmdmol):
     positions = [[pa.xx, pa.xy, pa.xz] for pa in pmdmol.atoms] * u.angstrom
 
     for iatom, patm in enumerate(pmdmol.atoms):
+        if patm.residue.insertion_code:
+            continue
+
         if patm.residue.chain not in chains:
             chains[patm.residue.chain] = mdt.Chain(pdbname=patm.residue.chain)
         chain = chains[patm.residue.chain]
 
         if patm.residue not in residues:
             residues[patm.residue] = mdt.Residue(resname=patm.residue.name,
-                                                 pdbindex=patm.residue.number)
+                                                     pdbindex=patm.residue.number)
             residues[patm.residue].chain = chain
             chain.add(residues[patm.residue])
         residue = residues[patm.residue]
@@ -184,6 +188,8 @@ def parmed_to_mdt(pmdmol):
         atoms[patm] = atom
 
     for pbnd in pmdmol.bonds:
+        if pbnd.atom1.residue.insertion_code or pbnd.atom2.residue.insertion_code:
+            continue
         atoms[pbnd.atom1].bond_to(atoms[pbnd.atom2], int(pbnd.order))
 
     mol = mdt.Molecule(list(atoms.values()),
@@ -266,7 +272,11 @@ def _reassign_chains(f, mol):
         poly_seq_ids = _aslist(data['_pdbx_poly_seq_scheme.asym_id'])
         nonpoly_ids = _aslist(data['_pdbx_nonpoly_scheme.asym_id'])
     except KeyError:
-        return mol.copy(name=mol.name)
+        if '_pdbx_poly_seq_scheme.asym_id' in data:
+            residue_seq_map = _add_residue_map(data)
+        else:
+            residue_seq_map = _add_default_residue_map(mol)
+        return mol.copy(name=mol.name, residue_map=residue_seq_map)
 
     newchain_names = set(poly_seq_ids + nonpoly_ids)
     newchains = {name: mdt.Chain(name) for name in newchain_names}
@@ -276,7 +286,6 @@ def _reassign_chains(f, mol):
                 _aslist(data['_pdbx_poly_seq_scheme.pdb_seq_num']),
                 _aslist(data['_pdbx_poly_seq_scheme.pdb_strand_id']),
                 _aslist(data['_pdbx_poly_seq_scheme.asym_id'])),
-
             zip(_aslist(data['_pdbx_nonpoly_scheme.mon_id']),
                 _aslist(data['_pdbx_nonpoly_scheme.pdb_seq_num']),
                 _aslist(data['_pdbx_nonpoly_scheme.pdb_strand_id']),
@@ -290,8 +299,63 @@ def _reassign_chains(f, mol):
 
         residue.chain = newchain
 
-    return mdt.Molecule(mol.atoms,
-                        name=mol.name, metadata=mol.metadata)
+    # Add data to map residue numbering between CIF and PDB.
+    residue_seq_map = _add_residue_map(data)
+
+    return mdt.Molecule(mol.atoms, name=mol.name, metadata=mol.metadata, 
+        residue_map=residue_seq_map)
+
+def _add_default_residue_map(mol):
+    """ Create a default map between CIF and PDB residue numbers. 
+
+    If the CIF file does not have '_pdbx_poly_seq_scheme' records then
+    create the residue map from the existing chains.
+    """
+    residue_seq_map = {}
+    chains = mol.chains
+    for chain in chains:
+        chain_id = chain.name
+        residues = chain.residues
+        start = residues[0].pdbindex
+        end = residues[-1].pdbindex
+        residue_seq_map[chain_id] = ResidueSequenceMap(chain_id, begin_seq=start, end_seq=end, 
+            pdb_begin_seq=start, pdb_end_seq=end)
+
+    return residue_seq_map
+
+
+def _add_residue_map(data):
+    """ Create a map between CIF and PDB residue numbers. """
+    residue_seq_map = {}
+    chain_ids = data["_pdbx_poly_seq_scheme.asym_id"]
+    pdb_chain_ids = data["_pdbx_poly_seq_scheme.pdb_strand_id"]
+    seq_ids = data["_pdbx_poly_seq_scheme.seq_id"]
+    res_names = data["_pdbx_poly_seq_scheme.mon_id"]
+    auth_seq_ids = data["_pdbx_poly_seq_scheme.auth_seq_num"]
+    list_size = len(chain_ids)
+
+    # Find the min/max residue sequence number for each chain.
+    chain_res_range = {}
+    for i in range(list_size-1):
+        chain_id = chain_ids[i]
+        seq_id = seq_ids[i]
+        res_name = res_names[i]
+        pdb_seq_id = auth_seq_ids[i]
+        if (pdb_seq_id == "?") or (res_name not in mdt.data.mdt.data.AMINO_NAMES):
+            continue 
+        if chain_id not in chain_res_range:
+            pdb_chain = pdb_chain_ids[i]
+            chain_res_range[chain_id] = [int(seq_id), int(seq_id), pdb_chain,
+                int(pdb_seq_id), int(pdb_seq_id)]
+        chain_res_range[chain_id][1] = int(seq_id)
+        chain_res_range[chain_id][4] = int(pdb_seq_id)
+
+    for chain_id in chain_res_range:
+        start, end, pdb_chain, pdb_start, pdb_end = chain_res_range[chain_id]
+        residue_seq_map[chain_id] = ResidueSequenceMap(pdb_chain, begin_seq=start, 
+            end_seq=end, pdb_begin_seq=pdb_start, pdb_end_seq=pdb_end)
+
+    return residue_seq_map
 
 
 def _aslist(l):

diff --git a/moldesign/molecules/molecule.py b/moldesign/molecules/molecule.py
@@ -378,7 +378,7 @@ class MolTopologyMixin(object):
         are here for code organization only - they could be included in the main
         Atom class without changing any functionality
     """
-    def copy(self, name=None):
+    def copy(self, name=None, residue_map=None):
         """ Create a copy of the molecule and all of its substructures, metadata, and methods
 
         Returns:
@@ -390,7 +390,7 @@ def copy(self, name=None):
                           name=name,
                           pdbname=self.pdbname,
                           charge=self.charge,
-                          metadata=self.metadata)
+                          metadata=self.metadata, residue_map=residue_map)
         if self.energy_model is not None:
             newmodel = self._copy_method('energy_model')
             newmol.set_energy_model(newmodel)
@@ -992,7 +992,8 @@ def __init__(self, atomcontainer,
                  copy_atoms=False,
                  pdbname=None,
                  charge=None,
-                 metadata=None):
+                 metadata=None,
+                 residue_map=None):
         super().__init__()
 
         # initial property init
@@ -1015,6 +1016,7 @@ def __init__(self, atomcontainer,
         self.integrator = None
         self.metadata = metadata
         self.electronic_state_index = 0
+        self.residue_map = residue_map 
 
         if charge is not None:
             self.charge = charge

diff --git a/moldesign/tools/build.py b/moldesign/tools/build.py
@@ -83,4 +83,4 @@ def build_assembly(mol, assembly_name):
             all_atoms.extend(chain.atoms)
     newmol = mdt.Molecule(all_atoms,
                           name="%s (bioassembly %s)" % (mol.name, assembly_name))
-    return newmol
+    return newmol