Merge pull request #395 from BradyAJohnston/dev-ss-mmcif

dev ss mmcif
BradyAJohnston · Jan 5, 2024 · 105da27 · 105da27
2 parents c62d3d6 + eec09ee
commit 105da27
Show file tree

Hide file tree

Showing 8 changed files with 404 additions and 287 deletions.
diff --git a/molecularnodes/blender/nodes.py b/molecularnodes/blender/nodes.py
@@ -663,7 +663,7 @@ def chain_color(name, input_list, label_prefix = "Chain ", field = "chain_id", s
 
         # create an input for this chain
         group.interface.new_socket(current_chain, in_out='INPUT', socket_type='NodeSocketColor').default_value = color.random_rgb(i)
-        # switch input colours 10 and 11
+        # switch color input values for colors are index 10 and 11
         link(node_input.outputs[current_chain], node_color.inputs[11])
         link(node_compare.outputs['Result'], node_color.inputs['Switch'])
 

diff --git a/molecularnodes/io/load.py b/molecularnodes/io/load.py
@@ -71,68 +71,8 @@ def pdb_get_b_factors(file):
         b_factors.append(atoms.b_factor)
     return b_factors
 
-def get_secondary_structure(array, file) -> np.array:
-    """
-    Gets the secondary structure annotation that is included in mmtf files and returns it as a numerical numpy array.
 
-    Parameters:
-    -----------
-    array : numpy.array
-        The molecular coordinates array, from mmtf.get_structure()
-    file : mmtf.MMTFFile
-        The MMTF file containing the secondary structure information, from mmtf.MMTFFile.read()
 
-    Returns:
-    --------
-    atom_sse : numpy.array
-        Numerical numpy array representing the secondary structure of the molecule.
-    
-    Description:
-    ------------
-    This function uses the biotite.structure package to extract the secondary structure information from the MMTF file.
-    The resulting secondary structures are `1: Alpha Helix, 2: Beta-sheet, 3: loop`.
-    """
-
-    from biotite.structure import spread_residue_wise
-
-    sec_struct_codes = {
-        -1: "X",
-        0 : "I",
-        1 : "S",
-        2 : "H",
-        3 : "E",
-        4 : "G",
-        5 : "B",
-        6 : "T",
-        7 : "C"
-    }
-
-    dssp_to_abc = {
-        "X" : 0,
-        "I" : 1, #"a",
-        "S" : 3, #"c",
-        "H" : 1, #"a",
-        "E" : 2, #"b",
-        "G" : 1, #"a",
-        "B" : 2, #"b",
-        "T" : 3, #"c",
-        "C" : 3 #"c"
-    }
-
-    try:
-        sse = file["secStructList"]
-    except KeyError:
-        ss_int = np.full(len(array), 3)
-        print('Warning: "secStructList" field missing from MMTF file. Defaulting \
-            to "loop" for all residues.')
-    else:
-        ss_int = np.array(
-            [dssp_to_abc.get(sec_struct_codes.get(ss)) for ss in sse], 
-            dtype = int
-        )
-    atom_sse = spread_residue_wise(array, ss_int)
-
-    return atom_sse
 
 
 def comp_secondary_structure(array):
@@ -351,14 +291,10 @@ def att_is_carb():
         return struc.filter_carbohydrates(array)
 
     def att_sec_struct():
-        if calculate_ss or not file:
+        if hasattr(array, "sec_struct"):
+            return array.sec_struct
+        if calculate_ss:
             return comp_secondary_structure(array)
-        else:
-            return get_secondary_structure(array, file)
-
-
-
-
 
     # these are all of the attributes that will be added to the structure
     # TODO add capcity for selection of particular attributes to include / not include to potentially

diff --git a/molecularnodes/io/local.py b/molecularnodes/io/local.py
@@ -1,4 +1,5 @@
 import bpy
+import numpy as np
 import warnings
 from .. import assembly
 from .load import create_molecule
@@ -52,6 +53,7 @@ def load(
         except InvalidFileError:
             pass
 
+
     else:
         warnings.warn("Unable to open local file. Format not supported.")
     # if bonds chosen but no bonds currently exist (mn.bonds is None)
@@ -90,6 +92,64 @@ def load(
 
     return mol
 
+def ss_id_to_numeric(id: str) -> int:
+    "Convert the given ids in the mmmCIF file to 1 AH / 2 BS / 3 Loop integers"
+    if "HELX" in id:
+        return int(1)
+    elif "STRN" in id:
+        return int(2)
+    else:
+        return int(3)
+
+class NoSecondaryStructureError(Exception):
+    """Raised when no secondary structure is found"""
+    pass
+
+def get_ss_mmcif(mol, file):
+    import biotite.structure as struc
+
+    conf = file.get_category('struct_conf')
+    if not conf:
+        raise NoSecondaryStructureError
+    starts = conf['beg_auth_seq_id'].astype(int)
+    ends = conf['end_auth_seq_id'].astype(int)
+    chains = conf['end_auth_asym_id'].astype(str)
+    id_label = conf['id'].astype(str)
+
+    sheet = file.get_category('struct_sheet_range')
+    if sheet:
+        starts = np.append(starts, sheet['beg_auth_seq_id'].astype(int))
+        ends = np.append(ends, sheet['end_auth_seq_id'].astype(int))
+        chains = np.append(chains, sheet['end_auth_asym_id'].astype(str))
+        id_label = np.append(id_label, np.repeat('STRN', len(sheet['id'])))
+
+    id_int = np.array([ss_id_to_numeric(x) for x in id_label])
+    lookup = dict()
+    for chain in np.unique(chains):
+        arrays = []
+        mask = (chain == chains)
+        start_sub = starts[mask]
+        end_sub = ends[mask]
+        id_sub = id_int[mask]
+
+        for (start, end, id) in zip(start_sub, end_sub, id_sub):
+            idx = np.arange(start, end + 1, dtype = int)
+            arr = np.zeros((len(idx), 2), dtype = int)
+            arr[:, 0] = idx
+            arr[:, 1] = 3
+            arr[:, 1] = id
+            arrays.append(arr)
+
+        lookup[chain] =  dict(np.vstack(arrays).tolist())
+
+    ss = []
+
+    for i, (chain_id, res_id) in enumerate(zip(mol.chain_id, mol.res_id)):
+        ss.append(lookup[chain_id].get(res_id, 3))
+
+    arr = np.array(ss, dtype = int)
+    arr[~struc.filter_amino_acids(mol)] = 0
+    return arr
 
 def open_structure_local_pdb(file_path):
     import biotite.structure.io.pdb as pdb
@@ -101,6 +161,7 @@ def open_structure_local_pdb(file_path):
     mol = pdb.get_structure(file, extra_fields = ['b_factor', 'charge', 'occupancy', 'atom_id'], include_bonds = True)
     return mol, file
 
+
 def open_structure_local_pdbx(file_path):
     import biotite.structure as struc
     import biotite.structure.io.pdbx as pdbx
@@ -116,6 +177,11 @@ def open_structure_local_pdbx(file_path):
         mol  = pdbx.get_structure(file, extra_fields = ['b_factor', 'charge'])
     except InvalidFileError:
         mol = pdbx.get_component(file)
+
+    try:
+        mol.set_annotation('sec_struct', get_ss_mmcif(mol, file))
+    except NoSecondaryStructureError:
+        pass
 
     # pdbx doesn't include bond information apparently, so manually create them here
     if not mol.bonds:

diff --git a/molecularnodes/io/pdb.py b/molecularnodes/io/pdb.py
@@ -44,7 +44,7 @@ def load(
 
     mol, coll_frames = create_molecule(
         array = mol,
-        name = pdb_code,
+        name = pdb_code,    
         file = file,
         calculate_ss = False,
         centre = centre,
@@ -87,6 +87,8 @@ def get_chain_entity_id(file):
 
     return ent_dic
 
+
+
 def set_atom_entity_id(mol, file):
     mol.add_annotation('entity_id', int)
     ent_dic = get_chain_entity_id(file)
@@ -97,6 +99,91 @@ def set_atom_entity_id(mol, file):
     mol.set_annotation('entity_id', entity_ids)
     return entity_ids
 
+def get_secondary_structure(array, file) -> np.array:
+    """
+    Gets the secondary structure annotation that is included in mmtf files and returns it as a numerical numpy array.
+
+    Parameters:
+    -----------
+    array : numpy.array
+        The molecular coordinates array, from mmtf.get_structure()
+    file : mmtf.MMTFFile
+        The MMTF file containing the secondary structure information, from mmtf.MMTFFile.read()
+
+    Returns:
+    --------
+    atom_sse : numpy.array
+        Numerical numpy array representing the secondary structure of the molecule.
+    
+    Description:
+    ------------
+    This function uses the biotite.structure package to extract the secondary structure information from the MMTF file.
+    The resulting secondary structures are `1: Alpha Helix, 2: Beta-sheet, 3: loop`.
+    """
+
+    from biotite.structure import spread_residue_wise
+
+    sec_struct_codes = {
+        -1: "X", # undefined
+        0 : "I", # pi helix
+        1 : "S", # bend
+        2 : "H", # alpha helix
+        3 : "E", # extended
+        4 : "G", # 3-10 helix
+        5 : "B", # bridge
+        6 : "T", # turn
+        7 : "C"  # coil
+    }
+
+    # convert to 1 AH / 2 BS / 3 LOOP
+    dssp_codes_to_int = {
+        -1: 0, # undefined
+
+        0 : 1, # pi helix
+        2 : 1, # alpha helix
+        4 : 1, # 3-10 helix
+
+        3 : 2, # extended
+        5 : 2, # bridge
+
+        6 : 3, # turn
+        1 : 3, # bend
+        7 : 3  # coil
+    }
+
+
+
+    dssp_to_abc = {
+        "X" : 0,
+        "I" : 3, #"a",
+        "G" : 1, #"a",
+        "H" : 1, #"a",
+
+        "E" : 2, #"b",
+        "B" : 2, #"b",
+
+        "T" : 3, #"c",
+        "S" : 3, #"c",
+        "C" : 3  #"c"
+    }
+
+    try:
+        sse = file["secStructList"]
+    except KeyError:
+        ss_int = np.full(len(array), 3)
+        print('Warning: "secStructList" field missing from MMTF file. Defaulting \
+            to "loop" for all residues.')
+    else:
+        pass
+        ss_int = np.array(
+            [dssp_to_abc.get(sec_struct_codes.get(ss)) for ss in sse], 
+            dtype = int
+        )
+    atom_sse = spread_residue_wise(array, ss_int)
+    # atom_sse = spread_residue_wise(array, sse)
+
+    return atom_sse
+
 def open_structure_rcsb(pdb_code, cache_dir = None):
     import biotite.structure.io.mmtf as mmtf
     import biotite.database.rcsb as rcsb
@@ -108,6 +195,7 @@ def open_structure_rcsb(pdb_code, cache_dir = None):
     # the file. The stack will be of length = 1 if there is only one model in the file
     mol = mmtf.get_structure(file, extra_fields = ["b_factor", "charge", 'occupancy', 'atom_id'], include_bonds = True) 
     set_atom_entity_id(mol, file)
+    mol.set_annotation('sec_struct', get_secondary_structure(mol, file))
     return mol, file
 
 # operator that calls the function to import the structure from the PDB