From 5f4c3f843f296956e0a1804c63a204c7301b7212 Mon Sep 17 00:00:00 2001 From: Dave Parker Date: Thu, 21 Sep 2017 12:01:07 -0700 Subject: [PATCH 1/4] Add a new function build_model() to build a structural model from a template and target sequence. --- moldesign/tools/build.py | 76 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) diff --git a/moldesign/tools/build.py b/moldesign/tools/build.py index a057e91..fb27b95 100644 --- a/moldesign/tools/build.py +++ b/moldesign/tools/build.py @@ -83,4 +83,78 @@ def build_assembly(mol, assembly_name): all_atoms.extend(chain.atoms) newmol = mdt.Molecule(all_atoms, name="%s (bioassembly %s)" % (mol.name, assembly_name)) - return newmol \ No newline at end of file + return newmol + +@toplevel +def build_model(temp_molecule, temp_chain_id, temp_start, temp_end, temp_sequence, + targ_start, targ_end, targ_sequence): + """ Build a model of a structure for a given target sequence from a + template sequence and structure. + + Note that there is often a discrepancy between the number of residues in the temp + sequence and the number of actual residues contained in the molecule. + + Args: + temp_molecule(moldesign.Molecule): The molecule used as a template structure. + temp_chain_id(string): The ID of the chain in the template structure. + temp_start(int): The start of the template sequence. + temp_end(int): The end of the template sequence. + temp_sequence(string): The 1 character amino acid sequence of the template. + targ_start(int): The start of the target sequence. + targ_end(int): The end of the target sequence. + targ_sequence(string): The 1 charcter amino acid sequence of the target. + + Returns: + moldesign.Molecule: Molecule representing a structural model. + """ + if temp_chain_id not in temp_molecule.chains: + raise ValueError("No chain '%s' found in molecule '%s'" % + (temp_chain_id,temp_molecule.name)) + + chain = temp_molecule.chains[temp_chain_id] + residues = chain.residues + atoms = chain.atoms + + # Add residues from the template sequence that are present in the template + # molecule and create a list of mutations where they differ from the target + # sequence. + mutation_list = [] + temp_residues = [] + temp_seq_num = temp_start + targ_seq_num = targ_start + for temp_aa, targ_aa in zip(temp_sequence, targ_sequence): + temp_res_name = mdt.data.mdt.data.RESIDUE_CODE_TO_NAME[temp_aa] + str(temp_seq_num) + #print("mdt.build_model] temp_res_name %s" % temp_res_name) + if temp_res_name in residues: + if temp_aa != targ_aa: + #mutate_str = "%s.%s%d%s" % (temp_chain_id, temp_aa, targ_seq_num, targ_aa) + mutate_str = "%s.%s%d%s" % (temp_chain_id, temp_aa, residues[temp_res_name].pdbindex, targ_aa) + mutation_list.append(mutate_str) + temp_res = residues[temp_res_name] + #temp_res.pdbindex = targ_seq_num + temp_residues.append(temp_res) + temp_seq_num += 1 + #__for temp_aa, targ_aa in zip(temp_sequence, targ_sequence) + + # Create a template molecule from the template sequence. + temp_molecule = mdt.Molecule(temp_residues) + if len(mutation_list) == 0: + return temp_molecule + + # Change residues in template molecule to residues of the target sequence. + model_molecule = mdt.mutate_residues(temp_molecule, mutation_list) + + # Check that the number of bonds for backbone atoms match. + for res, model_res in zip(temp_molecule.residues, model_molecule.residues): + if not res.backbone: + continue + for atom in res.backbone: + bonds = [bond for bond in temp_molecule.bond_graph[atom] if bond.name in res.backbone] + model_atom = model_molecule.chains[temp_chain_id].residues[model_res.name].atoms[atom.name] + model_bonds = [bond for bond in model_molecule.bond_graph[model_atom] \ + if bond.name in model_res.backbone] + if len(bonds) != len(model_bonds): + raise Exception("Backbone atom bonds are missing") + + return model_molecule + From 36887675ac1f9bc4164d36b774588bbebb6e0a24 Mon Sep 17 00:00:00 2001 From: Dave Parker Date: Thu, 21 Sep 2017 12:13:03 -0700 Subject: [PATCH 2/4] Add code to build_model() to renumber residues to reflect target sequence. --- moldesign/tools/build.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/moldesign/tools/build.py b/moldesign/tools/build.py index fb27b95..c1c7c00 100644 --- a/moldesign/tools/build.py +++ b/moldesign/tools/build.py @@ -124,16 +124,17 @@ def build_model(temp_molecule, temp_chain_id, temp_start, temp_end, temp_sequenc targ_seq_num = targ_start for temp_aa, targ_aa in zip(temp_sequence, targ_sequence): temp_res_name = mdt.data.mdt.data.RESIDUE_CODE_TO_NAME[temp_aa] + str(temp_seq_num) - #print("mdt.build_model] temp_res_name %s" % temp_res_name) if temp_res_name in residues: if temp_aa != targ_aa: - #mutate_str = "%s.%s%d%s" % (temp_chain_id, temp_aa, targ_seq_num, targ_aa) - mutate_str = "%s.%s%d%s" % (temp_chain_id, temp_aa, residues[temp_res_name].pdbindex, targ_aa) + mutate_str = "%s.%s%d%s" % (temp_chain_id, temp_aa, targ_seq_num, targ_aa) mutation_list.append(mutate_str) temp_res = residues[temp_res_name] - #temp_res.pdbindex = targ_seq_num + # Renumber the residue for the target sequence. + temp_res.pdbindex = targ_seq_num + temp_res.name = mdt.data.mdt.data.RESIDUE_CODE_TO_NAME[temp_aa] + str(targ_seq_num) temp_residues.append(temp_res) temp_seq_num += 1 + targ_seq_num += 1 #__for temp_aa, targ_aa in zip(temp_sequence, targ_sequence) # Create a template molecule from the template sequence. From 4c346b1fc5550a2353ebb16fd3590caf7ca108ef Mon Sep 17 00:00:00 2001 From: Dave Parker Date: Fri, 6 Oct 2017 12:25:08 -0700 Subject: [PATCH 3/4] Added some functions to support homology modeling. --- moldesign/helpers/pdb.py | 4 + moldesign/interfaces/biopython_interface.py | 15 ++-- moldesign/interfaces/parmed_interface.py | 83 +++++++++++++++++++-- moldesign/molecules/molecule.py | 8 +- moldesign/tools/build.py | 39 ++++++++-- 5 files changed, 127 insertions(+), 22 deletions(-) diff --git a/moldesign/helpers/pdb.py b/moldesign/helpers/pdb.py index 4b33986..5c23b14 100644 --- a/moldesign/helpers/pdb.py +++ b/moldesign/helpers/pdb.py @@ -31,6 +31,10 @@ BioAssembly = collections.namedtuple('BioAssembly', 'desc chains transforms') +# This named tuple is used to store the mapping between PDB and CIF residue +# sequence numbering. +ResidueSequenceMap = collections.namedtuple('ResidueSequenceMap', 'pdb_chain_id, begin_seq, end_seq, pdb_begin_seq, pdb_end_seq') + def get_conect_pairs(mol): """ Returns a dicitonary of HETATM bonds for a PDB CONECT record diff --git a/moldesign/interfaces/biopython_interface.py b/moldesign/interfaces/biopython_interface.py index 610c796..14f348b 100644 --- a/moldesign/interfaces/biopython_interface.py +++ b/moldesign/interfaces/biopython_interface.py @@ -117,15 +117,18 @@ def get_mmcif_assemblies(fileobj=None, mmcdata=None): opers = [opers] # now create the assembly specifications - assemblies = {} - for id, detail, chainlist, operlist in zip(ids, details, chains, opers): - assert id not in assemblies - transforms = [transforms[i] for i in operlist.split(',')] - assemblies[id] = BioAssembly(detail, chainlist.split(','), transforms) + try: + assemblies = {} + for id, detail, chainlist, operlist in zip(ids, details, chains, opers): + assert id not in assemblies + transforms = [transforms[i] for i in operlist.split(',')] + assemblies[id] = BioAssembly(detail, chainlist.split(','), transforms) + except Exception: + print('WARNING: failed to construct assemblies') + assemblies = {} return assemblies - def _make_transform_dict(tmat, transform_ids): if isinstance(transform_ids, list): for i, j in itertools.product(range(0, 3), range(0, 4)): diff --git a/moldesign/interfaces/parmed_interface.py b/moldesign/interfaces/parmed_interface.py index b5fe99f..58ef8dc 100644 --- a/moldesign/interfaces/parmed_interface.py +++ b/moldesign/interfaces/parmed_interface.py @@ -29,6 +29,7 @@ import moldesign as mdt from .. import units as u from .. import utils +from moldesign.helpers.pdb import ResidueSequenceMap if future.utils.PY2: from cStringIO import StringIO @@ -161,13 +162,16 @@ def parmed_to_mdt(pmdmol): positions = [[pa.xx, pa.xy, pa.xz] for pa in pmdmol.atoms] * u.angstrom for iatom, patm in enumerate(pmdmol.atoms): + if patm.residue.insertion_code: + continue + if patm.residue.chain not in chains: chains[patm.residue.chain] = mdt.Chain(pdbname=patm.residue.chain) chain = chains[patm.residue.chain] if patm.residue not in residues: residues[patm.residue] = mdt.Residue(resname=patm.residue.name, - pdbindex=patm.residue.number) + pdbindex=patm.residue.number) residues[patm.residue].chain = chain chain.add(residues[patm.residue]) residue = residues[patm.residue] @@ -184,6 +188,8 @@ def parmed_to_mdt(pmdmol): atoms[patm] = atom for pbnd in pmdmol.bonds: + if pbnd.atom1.residue.insertion_code or pbnd.atom2.residue.insertion_code: + continue atoms[pbnd.atom1].bond_to(atoms[pbnd.atom2], int(pbnd.order)) mol = mdt.Molecule(list(atoms.values()), @@ -264,8 +270,14 @@ def _reassign_chains(f, mol): try: newchain_names = set(data['_pdbx_poly_seq_scheme.asym_id']+ data['_pdbx_nonpoly_scheme.asym_id']) + except KeyError: - return mol.copy(name=mol.name) + if '_pdbx_poly_seq_scheme.asym_id' in data: + residue_seq_map = _add_residue_map(data) + else: + residue_seq_map = _add_default_residue_map(mol) + return mol.copy(name=mol.name,residue_map=residue_seq_map) + newchains = {name: mdt.Chain(name) for name in newchain_names} residue_iterator = itertools.chain( @@ -274,10 +286,10 @@ def _reassign_chains(f, mol): data['_pdbx_poly_seq_scheme.pdb_strand_id'], data['_pdbx_poly_seq_scheme.asym_id']), - zip(data['_pdbx_nonpoly_scheme.mon_id'], - data['_pdbx_nonpoly_scheme.pdb_seq_num'], - data['_pdbx_nonpoly_scheme.pdb_strand_id'], - data['_pdbx_nonpoly_scheme.asym_id'])) + zip(list(data['_pdbx_nonpoly_scheme.mon_id']), + list(data['_pdbx_nonpoly_scheme.pdb_seq_num']), + list(data['_pdbx_nonpoly_scheme.pdb_strand_id']), + list(data['_pdbx_nonpoly_scheme.asym_id']))) reschains = {(rname, ridx, rchain): newchains[chainid] for rname, ridx, rchain, chainid in residue_iterator} @@ -287,6 +299,61 @@ def _reassign_chains(f, mol): residue.chain = newchain - return mdt.Molecule(mol.atoms, - name=mol.name, metadata=mol.metadata) + # Add data to map residue numbering between CIF and PDB. + residue_seq_map = _add_residue_map(data) + + return mdt.Molecule(mol.atoms, name=mol.name, metadata=mol.metadata, + residue_map=residue_seq_map) + +def _add_default_residue_map(mol): + """ Create a default map between CIF and PDB residue numbers. + + If the CIF file does not have '_pdbx_poly_seq_scheme' records then + create the residue map from the existing chains. + """ + residue_seq_map = {} + chains = mol.chains + for chain in chains: + chain_id = chain.name + residues = chain.residues + start = residues[0].pdbindex + end = residues[-1].pdbindex + residue_seq_map[chain_id] = ResidueSequenceMap(chain_id, begin_seq=start, end_seq=end, + pdb_begin_seq=start, pdb_end_seq=end) + + return residue_seq_map + + +def _add_residue_map(data): + """ Create a map between CIF and PDB residue numbers. """ + residue_seq_map = {} + chain_ids = data["_pdbx_poly_seq_scheme.asym_id"] + pdb_chain_ids = data["_pdbx_poly_seq_scheme.pdb_strand_id"] + seq_ids = data["_pdbx_poly_seq_scheme.seq_id"] + res_names = data["_pdbx_poly_seq_scheme.mon_id"] + auth_seq_ids = data["_pdbx_poly_seq_scheme.auth_seq_num"] + list_size = len(chain_ids) + + # Find the min/max residue sequence number for each chain. + chain_res_range = {} + for i in range(list_size-1): + chain_id = chain_ids[i] + seq_id = seq_ids[i] + res_name = res_names[i] + pdb_seq_id = auth_seq_ids[i] + if (pdb_seq_id == "?") or (res_name not in mdt.data.mdt.data.AMINO_NAMES): + continue + if chain_id not in chain_res_range: + pdb_chain = pdb_chain_ids[i] + chain_res_range[chain_id] = [int(seq_id), int(seq_id), pdb_chain, + int(pdb_seq_id), int(pdb_seq_id)] + chain_res_range[chain_id][1] = int(seq_id) + chain_res_range[chain_id][4] = int(pdb_seq_id) + + for chain_id in chain_res_range: + start, end, pdb_chain, pdb_start, pdb_end = chain_res_range[chain_id] + residue_seq_map[chain_id] = ResidueSequenceMap(pdb_chain, begin_seq=start, + end_seq=end, pdb_begin_seq=pdb_start, pdb_end_seq=pdb_end) + + return residue_seq_map diff --git a/moldesign/molecules/molecule.py b/moldesign/molecules/molecule.py index 30f15ec..bafa501 100644 --- a/moldesign/molecules/molecule.py +++ b/moldesign/molecules/molecule.py @@ -378,7 +378,7 @@ class MolTopologyMixin(object): are here for code organization only - they could be included in the main Atom class without changing any functionality """ - def copy(self, name=None): + def copy(self, name=None, residue_map=None): """ Create a copy of the molecule and all of its substructures, metadata, and methods Returns: @@ -390,7 +390,7 @@ def copy(self, name=None): name=name, pdbname=self.pdbname, charge=self.charge, - metadata=self.metadata) + metadata=self.metadata, residue_map=residue_map) if self.energy_model is not None: newmodel = self._copy_method('energy_model') newmol.set_energy_model(newmodel) @@ -992,7 +992,8 @@ def __init__(self, atomcontainer, copy_atoms=False, pdbname=None, charge=None, - metadata=None): + metadata=None, + residue_map=None): super().__init__() # initial property init @@ -1015,6 +1016,7 @@ def __init__(self, atomcontainer, self.integrator = None self.metadata = metadata self.electronic_state_index = 0 + self.residue_map = residue_map if charge is not None: self.charge = charge diff --git a/moldesign/tools/build.py b/moldesign/tools/build.py index c1c7c00..afbaa80 100644 --- a/moldesign/tools/build.py +++ b/moldesign/tools/build.py @@ -87,7 +87,7 @@ def build_assembly(mol, assembly_name): @toplevel def build_model(temp_molecule, temp_chain_id, temp_start, temp_end, temp_sequence, - targ_start, targ_end, targ_sequence): + targ_start, targ_end, targ_sequence, residue_map): """ Build a model of a structure for a given target sequence from a template sequence and structure. @@ -111,31 +111,58 @@ def build_model(temp_molecule, temp_chain_id, temp_start, temp_end, temp_sequenc raise ValueError("No chain '%s' found in molecule '%s'" % (temp_chain_id,temp_molecule.name)) + if not residue_map: + raise ValueError("No residue sequence mapping, molecule '%s' was not created from a CIF file" % temp_molecule.name) + chain = temp_molecule.chains[temp_chain_id] residues = chain.residues atoms = chain.atoms + # Check if the molecule residue sequence is different + # from the residue sequence used by the template. + cif_begin_seq = residue_map.begin_seq + cif_end_seq = residue_map.end_seq + mol_begin_seq = residue_map.pdb_begin_seq + mol_end_seq = residue_map.pdb_end_seq + temp_seq_num = mol_begin_seq - (cif_begin_seq-temp_start) + #print("[build_model] cif_begin_seq %s cif_end_seq %s" % (cif_begin_seq, cif_end_seq)) + #print("[build_model] mol_begin_seq %s mol_end_seq %s" % (mol_begin_seq, mol_end_seq)) + #print("[build_model] temp_sequence %s " % temp_sequence) + #print("[build_model] temp_seq_num %s " % temp_seq_num) + # Add residues from the template sequence that are present in the template # molecule and create a list of mutations where they differ from the target # sequence. mutation_list = [] temp_residues = [] - temp_seq_num = temp_start targ_seq_num = targ_start + #print("[build_model] temp_seq_num %d" % temp_seq_num) + #print("[build_model] targ_seq_num %d" % targ_seq_num) + #print("[build_model] ") + temp_res_names = [] for temp_aa, targ_aa in zip(temp_sequence, targ_sequence): temp_res_name = mdt.data.mdt.data.RESIDUE_CODE_TO_NAME[temp_aa] + str(temp_seq_num) + #print("[build_model] temp_res_name %s" % temp_res_name) if temp_res_name in residues: - if temp_aa != targ_aa: + temp_res = residues[temp_res_name] + if (temp_aa != targ_aa) and (targ_aa != '*'): mutate_str = "%s.%s%d%s" % (temp_chain_id, temp_aa, targ_seq_num, targ_aa) mutation_list.append(mutate_str) - temp_res = residues[temp_res_name] # Renumber the residue for the target sequence. temp_res.pdbindex = targ_seq_num - temp_res.name = mdt.data.mdt.data.RESIDUE_CODE_TO_NAME[temp_aa] + str(targ_seq_num) temp_residues.append(temp_res) + temp_res_names.append(temp_res.name) temp_seq_num += 1 targ_seq_num += 1 #__for temp_aa, targ_aa in zip(temp_sequence, targ_sequence) + print("[build_model] Number of template residues %d" % len(temp_res_names)) + print("[build_model] Template residues %s" % ' '.join(temp_res_names)) + print("[build_model] ") + print("[build_model] Number of residues to change %d" % len(mutation_list)) + print("[build_model] Residues to change %s" % ' '.join(mutation_list)) + + if len(temp_residues) == 0: + raise Exception("No residues found for the template sequence.") # Create a template molecule from the template sequence. temp_molecule = mdt.Molecule(temp_residues) @@ -150,6 +177,8 @@ def build_model(temp_molecule, temp_chain_id, temp_start, temp_end, temp_sequenc if not res.backbone: continue for atom in res.backbone: + if atom.element == "H": + continue bonds = [bond for bond in temp_molecule.bond_graph[atom] if bond.name in res.backbone] model_atom = model_molecule.chains[temp_chain_id].residues[model_res.name].atoms[atom.name] model_bonds = [bond for bond in model_molecule.bond_graph[model_atom] \ From 869169e89c4d082ed442b99199e2c161ec3542f9 Mon Sep 17 00:00:00 2001 From: Aaron Virshup Date: Thu, 7 Dec 2017 15:07:37 -0800 Subject: [PATCH 4/4] Removes build-model to its own script --- moldesign/tools/build.py | 104 --------------------------------------- 1 file changed, 104 deletions(-) diff --git a/moldesign/tools/build.py b/moldesign/tools/build.py index afbaa80..5b4e635 100644 --- a/moldesign/tools/build.py +++ b/moldesign/tools/build.py @@ -84,107 +84,3 @@ def build_assembly(mol, assembly_name): newmol = mdt.Molecule(all_atoms, name="%s (bioassembly %s)" % (mol.name, assembly_name)) return newmol - -@toplevel -def build_model(temp_molecule, temp_chain_id, temp_start, temp_end, temp_sequence, - targ_start, targ_end, targ_sequence, residue_map): - """ Build a model of a structure for a given target sequence from a - template sequence and structure. - - Note that there is often a discrepancy between the number of residues in the temp - sequence and the number of actual residues contained in the molecule. - - Args: - temp_molecule(moldesign.Molecule): The molecule used as a template structure. - temp_chain_id(string): The ID of the chain in the template structure. - temp_start(int): The start of the template sequence. - temp_end(int): The end of the template sequence. - temp_sequence(string): The 1 character amino acid sequence of the template. - targ_start(int): The start of the target sequence. - targ_end(int): The end of the target sequence. - targ_sequence(string): The 1 charcter amino acid sequence of the target. - - Returns: - moldesign.Molecule: Molecule representing a structural model. - """ - if temp_chain_id not in temp_molecule.chains: - raise ValueError("No chain '%s' found in molecule '%s'" % - (temp_chain_id,temp_molecule.name)) - - if not residue_map: - raise ValueError("No residue sequence mapping, molecule '%s' was not created from a CIF file" % temp_molecule.name) - - chain = temp_molecule.chains[temp_chain_id] - residues = chain.residues - atoms = chain.atoms - - # Check if the molecule residue sequence is different - # from the residue sequence used by the template. - cif_begin_seq = residue_map.begin_seq - cif_end_seq = residue_map.end_seq - mol_begin_seq = residue_map.pdb_begin_seq - mol_end_seq = residue_map.pdb_end_seq - temp_seq_num = mol_begin_seq - (cif_begin_seq-temp_start) - #print("[build_model] cif_begin_seq %s cif_end_seq %s" % (cif_begin_seq, cif_end_seq)) - #print("[build_model] mol_begin_seq %s mol_end_seq %s" % (mol_begin_seq, mol_end_seq)) - #print("[build_model] temp_sequence %s " % temp_sequence) - #print("[build_model] temp_seq_num %s " % temp_seq_num) - - # Add residues from the template sequence that are present in the template - # molecule and create a list of mutations where they differ from the target - # sequence. - mutation_list = [] - temp_residues = [] - targ_seq_num = targ_start - #print("[build_model] temp_seq_num %d" % temp_seq_num) - #print("[build_model] targ_seq_num %d" % targ_seq_num) - #print("[build_model] ") - temp_res_names = [] - for temp_aa, targ_aa in zip(temp_sequence, targ_sequence): - temp_res_name = mdt.data.mdt.data.RESIDUE_CODE_TO_NAME[temp_aa] + str(temp_seq_num) - #print("[build_model] temp_res_name %s" % temp_res_name) - if temp_res_name in residues: - temp_res = residues[temp_res_name] - if (temp_aa != targ_aa) and (targ_aa != '*'): - mutate_str = "%s.%s%d%s" % (temp_chain_id, temp_aa, targ_seq_num, targ_aa) - mutation_list.append(mutate_str) - # Renumber the residue for the target sequence. - temp_res.pdbindex = targ_seq_num - temp_residues.append(temp_res) - temp_res_names.append(temp_res.name) - temp_seq_num += 1 - targ_seq_num += 1 - #__for temp_aa, targ_aa in zip(temp_sequence, targ_sequence) - print("[build_model] Number of template residues %d" % len(temp_res_names)) - print("[build_model] Template residues %s" % ' '.join(temp_res_names)) - print("[build_model] ") - print("[build_model] Number of residues to change %d" % len(mutation_list)) - print("[build_model] Residues to change %s" % ' '.join(mutation_list)) - - if len(temp_residues) == 0: - raise Exception("No residues found for the template sequence.") - - # Create a template molecule from the template sequence. - temp_molecule = mdt.Molecule(temp_residues) - if len(mutation_list) == 0: - return temp_molecule - - # Change residues in template molecule to residues of the target sequence. - model_molecule = mdt.mutate_residues(temp_molecule, mutation_list) - - # Check that the number of bonds for backbone atoms match. - for res, model_res in zip(temp_molecule.residues, model_molecule.residues): - if not res.backbone: - continue - for atom in res.backbone: - if atom.element == "H": - continue - bonds = [bond for bond in temp_molecule.bond_graph[atom] if bond.name in res.backbone] - model_atom = model_molecule.chains[temp_chain_id].residues[model_res.name].atoms[atom.name] - model_bonds = [bond for bond in model_molecule.bond_graph[model_atom] \ - if bond.name in model_res.backbone] - if len(bonds) != len(model_bonds): - raise Exception("Backbone atom bonds are missing") - - return model_molecule -