diff --git a/moldesign/helpers/pdb.py b/moldesign/helpers/pdb.py index 4b33986..5c23b14 100644 --- a/moldesign/helpers/pdb.py +++ b/moldesign/helpers/pdb.py @@ -31,6 +31,10 @@ BioAssembly = collections.namedtuple('BioAssembly', 'desc chains transforms') +# This named tuple is used to store the mapping between PDB and CIF residue +# sequence numbering. +ResidueSequenceMap = collections.namedtuple('ResidueSequenceMap', 'pdb_chain_id, begin_seq, end_seq, pdb_begin_seq, pdb_end_seq') + def get_conect_pairs(mol): """ Returns a dicitonary of HETATM bonds for a PDB CONECT record diff --git a/moldesign/interfaces/biopython_interface.py b/moldesign/interfaces/biopython_interface.py index 610c796..14f348b 100644 --- a/moldesign/interfaces/biopython_interface.py +++ b/moldesign/interfaces/biopython_interface.py @@ -117,15 +117,18 @@ def get_mmcif_assemblies(fileobj=None, mmcdata=None): opers = [opers] # now create the assembly specifications - assemblies = {} - for id, detail, chainlist, operlist in zip(ids, details, chains, opers): - assert id not in assemblies - transforms = [transforms[i] for i in operlist.split(',')] - assemblies[id] = BioAssembly(detail, chainlist.split(','), transforms) + try: + assemblies = {} + for id, detail, chainlist, operlist in zip(ids, details, chains, opers): + assert id not in assemblies + transforms = [transforms[i] for i in operlist.split(',')] + assemblies[id] = BioAssembly(detail, chainlist.split(','), transforms) + except Exception: + print('WARNING: failed to construct assemblies') + assemblies = {} return assemblies - def _make_transform_dict(tmat, transform_ids): if isinstance(transform_ids, list): for i, j in itertools.product(range(0, 3), range(0, 4)): diff --git a/moldesign/interfaces/parmed_interface.py b/moldesign/interfaces/parmed_interface.py index 2663265..1a509b2 100644 --- a/moldesign/interfaces/parmed_interface.py +++ b/moldesign/interfaces/parmed_interface.py @@ -29,6 +29,7 @@ import moldesign as mdt from .. import units as u from .. import utils +from moldesign.helpers.pdb import ResidueSequenceMap if future.utils.PY2: from cStringIO import StringIO @@ -161,13 +162,16 @@ def parmed_to_mdt(pmdmol): positions = [[pa.xx, pa.xy, pa.xz] for pa in pmdmol.atoms] * u.angstrom for iatom, patm in enumerate(pmdmol.atoms): + if patm.residue.insertion_code: + continue + if patm.residue.chain not in chains: chains[patm.residue.chain] = mdt.Chain(pdbname=patm.residue.chain) chain = chains[patm.residue.chain] if patm.residue not in residues: residues[patm.residue] = mdt.Residue(resname=patm.residue.name, - pdbindex=patm.residue.number) + pdbindex=patm.residue.number) residues[patm.residue].chain = chain chain.add(residues[patm.residue]) residue = residues[patm.residue] @@ -184,6 +188,8 @@ def parmed_to_mdt(pmdmol): atoms[patm] = atom for pbnd in pmdmol.bonds: + if pbnd.atom1.residue.insertion_code or pbnd.atom2.residue.insertion_code: + continue atoms[pbnd.atom1].bond_to(atoms[pbnd.atom2], int(pbnd.order)) mol = mdt.Molecule(list(atoms.values()), @@ -266,7 +272,11 @@ def _reassign_chains(f, mol): poly_seq_ids = _aslist(data['_pdbx_poly_seq_scheme.asym_id']) nonpoly_ids = _aslist(data['_pdbx_nonpoly_scheme.asym_id']) except KeyError: - return mol.copy(name=mol.name) + if '_pdbx_poly_seq_scheme.asym_id' in data: + residue_seq_map = _add_residue_map(data) + else: + residue_seq_map = _add_default_residue_map(mol) + return mol.copy(name=mol.name, residue_map=residue_seq_map) newchain_names = set(poly_seq_ids + nonpoly_ids) newchains = {name: mdt.Chain(name) for name in newchain_names} @@ -276,7 +286,6 @@ def _reassign_chains(f, mol): _aslist(data['_pdbx_poly_seq_scheme.pdb_seq_num']), _aslist(data['_pdbx_poly_seq_scheme.pdb_strand_id']), _aslist(data['_pdbx_poly_seq_scheme.asym_id'])), - zip(_aslist(data['_pdbx_nonpoly_scheme.mon_id']), _aslist(data['_pdbx_nonpoly_scheme.pdb_seq_num']), _aslist(data['_pdbx_nonpoly_scheme.pdb_strand_id']), @@ -290,8 +299,63 @@ def _reassign_chains(f, mol): residue.chain = newchain - return mdt.Molecule(mol.atoms, - name=mol.name, metadata=mol.metadata) + # Add data to map residue numbering between CIF and PDB. + residue_seq_map = _add_residue_map(data) + + return mdt.Molecule(mol.atoms, name=mol.name, metadata=mol.metadata, + residue_map=residue_seq_map) + +def _add_default_residue_map(mol): + """ Create a default map between CIF and PDB residue numbers. + + If the CIF file does not have '_pdbx_poly_seq_scheme' records then + create the residue map from the existing chains. + """ + residue_seq_map = {} + chains = mol.chains + for chain in chains: + chain_id = chain.name + residues = chain.residues + start = residues[0].pdbindex + end = residues[-1].pdbindex + residue_seq_map[chain_id] = ResidueSequenceMap(chain_id, begin_seq=start, end_seq=end, + pdb_begin_seq=start, pdb_end_seq=end) + + return residue_seq_map + + +def _add_residue_map(data): + """ Create a map between CIF and PDB residue numbers. """ + residue_seq_map = {} + chain_ids = data["_pdbx_poly_seq_scheme.asym_id"] + pdb_chain_ids = data["_pdbx_poly_seq_scheme.pdb_strand_id"] + seq_ids = data["_pdbx_poly_seq_scheme.seq_id"] + res_names = data["_pdbx_poly_seq_scheme.mon_id"] + auth_seq_ids = data["_pdbx_poly_seq_scheme.auth_seq_num"] + list_size = len(chain_ids) + + # Find the min/max residue sequence number for each chain. + chain_res_range = {} + for i in range(list_size-1): + chain_id = chain_ids[i] + seq_id = seq_ids[i] + res_name = res_names[i] + pdb_seq_id = auth_seq_ids[i] + if (pdb_seq_id == "?") or (res_name not in mdt.data.mdt.data.AMINO_NAMES): + continue + if chain_id not in chain_res_range: + pdb_chain = pdb_chain_ids[i] + chain_res_range[chain_id] = [int(seq_id), int(seq_id), pdb_chain, + int(pdb_seq_id), int(pdb_seq_id)] + chain_res_range[chain_id][1] = int(seq_id) + chain_res_range[chain_id][4] = int(pdb_seq_id) + + for chain_id in chain_res_range: + start, end, pdb_chain, pdb_start, pdb_end = chain_res_range[chain_id] + residue_seq_map[chain_id] = ResidueSequenceMap(pdb_chain, begin_seq=start, + end_seq=end, pdb_begin_seq=pdb_start, pdb_end_seq=pdb_end) + + return residue_seq_map def _aslist(l): diff --git a/moldesign/molecules/molecule.py b/moldesign/molecules/molecule.py index 30f15ec..bafa501 100644 --- a/moldesign/molecules/molecule.py +++ b/moldesign/molecules/molecule.py @@ -378,7 +378,7 @@ class MolTopologyMixin(object): are here for code organization only - they could be included in the main Atom class without changing any functionality """ - def copy(self, name=None): + def copy(self, name=None, residue_map=None): """ Create a copy of the molecule and all of its substructures, metadata, and methods Returns: @@ -390,7 +390,7 @@ def copy(self, name=None): name=name, pdbname=self.pdbname, charge=self.charge, - metadata=self.metadata) + metadata=self.metadata, residue_map=residue_map) if self.energy_model is not None: newmodel = self._copy_method('energy_model') newmol.set_energy_model(newmodel) @@ -992,7 +992,8 @@ def __init__(self, atomcontainer, copy_atoms=False, pdbname=None, charge=None, - metadata=None): + metadata=None, + residue_map=None): super().__init__() # initial property init @@ -1015,6 +1016,7 @@ def __init__(self, atomcontainer, self.integrator = None self.metadata = metadata self.electronic_state_index = 0 + self.residue_map = residue_map if charge is not None: self.charge = charge diff --git a/moldesign/tools/build.py b/moldesign/tools/build.py index a057e91..5b4e635 100644 --- a/moldesign/tools/build.py +++ b/moldesign/tools/build.py @@ -83,4 +83,4 @@ def build_assembly(mol, assembly_name): all_atoms.extend(chain.atoms) newmol = mdt.Molecule(all_atoms, name="%s (bioassembly %s)" % (mol.name, assembly_name)) - return newmol \ No newline at end of file + return newmol