Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MMTF Parser #1752

Merged
merged 18 commits into from
Oct 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,12 @@ jobs:
source activate test
conda install --yes numpy scipy nose pyparsing requests
if [[ ${{ matrix.python-version }} == "2.7" ]]; then conda install --yes unittest2; fi
pip install mmtf-python
pip install .
python setup.py build_ext --inplace --force
- name: Test with pytest
run: |
source activate test
conda install --yes pytest
pytest


4 changes: 4 additions & 0 deletions prody/proteins/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,10 @@
from .pdbfile import *
__all__.extend(pdbfile.__all__)

from . import mmtffile
from .mmtffile import *
__all__.extend(mmtffile.__all__)

from . import emdfile
from .emdfile import *
__all__.extend(emdfile.__all__)
Expand Down
349 changes: 349 additions & 0 deletions prody/proteins/mmtffile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,349 @@
# -*- coding: utf-8 -*-
"""This module defines functions for parsing MMTF files and structure objects.

.. MMTF files: https://mmtf.rcsb.org"""

from numbers import Number
import os.path
from collections import OrderedDict

from prody.atomic import AtomGroup
from prody.atomic import flags
from prody.atomic import ATOMIC_FIELDS

from prody.utilities import openFile, isListLike, copy
from prody import LOGGER
from prody.utilities.misctools import getMasses
from .header import getHeaderDict, buildBiomolecules

import struct as st
import numpy as np

__all__ = ['parseMMTF']

_parseMMTFdoc = """
:arg chain: Chain identifier(s) to parse (e.g., 'A' for chain A). If not provided,
all chains are parsed. If a PDB ID is used, chain can also be specified as part
of the ID (e.g., '1abcA' to parse chain A).
:type chain: str, optional

:arg title: Title to assign to the resulting AtomGroup. If not provided, the
title is extracted from the MMTF structure or set to the PDB ID.
:type title: str, optional
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

optional isn't really part of the type

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally this should change but it’s ok really


:arg subset: a predefined keyword to parse subset of atoms, valid keywords
are ``'calpha'`` (``'ca'``), ``'backbone'`` (``'bb'``), or **None**
(read all atoms), e.g. ``subset='bb'``
:type subset: str

:arg model: model index or None (read all models), e.g. ``model=10``
:type model: int, list

:arg altloc: if a location indicator is passed, such as ``'A'`` or ``'B'``,
only indicated alternate locations will be parsed as the single
coordinate set of the AtomGroup, if *altloc* is set **True** all
alternate locations will be parsed and each will be appended as a
distinct coordinate set, default is ``"A"``
:type altloc: str
"""

def parseMMTF(mmtf_struc, **kwargs):
jamesmkrieger marked this conversation as resolved.
Show resolved Hide resolved
"""
Parse an MMTF (Macromolecular Transmission Format) structure or fetch it from the PDB,
and return an AtomGroup containing the parsed data.

:param mmtf_struc: The MMTF structure to parse. It can be provided in one of the following ways:
- A string representing a PDB ID (e.g., '1abc').
- The filename of an MMTF file (ending with '.mmtf' or '.mmtf.gz').
- An MMTF structure object (file parsed through mmtf-python).
:type mmtf_struc: str or MMTF Structure object

:param chain: Chain identifier(s) to parse (e.g., 'A' for chain A). If not provided,
all chains are parsed. If a PDB ID is used, chain can also be specified as part
of the ID (e.g., '1abcA' to parse chain A).
:type chain: str, optional

:return: An AtomGroup containing the parsed atomic data.
:rtype: AtomGroup
"""
try:
from mmtf import fetch, parse
import mmtf
except ImportError:
raise ImportError("Install mmtf to read in mmtf structure objects (e.g. pip install mmtf-python)")

chain = kwargs.pop('chain', None)
title = kwargs.get('title', None)

if type(mmtf_struc)==str:
if not mmtf_struc.endswith(".mmtf") and not mmtf_struc.endswith(".mmtf.gz"): #entering just the pdb id
if len(mmtf_struc) == 5 and mmtf_struc.isalnum():
if chain is None:
chain = mmtf_struc[-1]
mmtf_struc = mmtf_struc[:4]
else:
raise ValueError('Please provide chain as a keyword argument '
'or part of the PDB ID, not both')
else:
chain = chain

if len(mmtf_struc) == 4 and mmtf_struc.isalnum():
if title is None:
title = mmtf_struc
kwargs['title'] = title
structure = fetch(mmtf_struc)
if structure is None:
raise IOError('mmtf file for {0} could not be downloaded.'
.format(mmtf_struc))
mmtf_struc = structure
else:
raise IOError('{0} is not a valid filename or a valid PDB '
'identifier.'.format(mmtf_struc))

else: #entering the mmtf file name
structure=parse(mmtf_struc)
if structure is None:
raise IOError('mmtf file for {0} could not be downloaded.'
.format(mmtf_struc))
mmtf_struc = structure
title = mmtf_struc.structure_id

#if none of the above loops are entered, user should have passed a mmtf structure object
if title is None:
title = mmtf_struc.structure_id

result = _parseMMTF(mmtf_struc, chain=chain, **kwargs)

return result

parseMMTF.__doc__ += _parseMMTFdoc


def _parseMMTF(mmtf_struc, **kwargs):
LOGGER.timeit()
ag = AtomGroup()
model = kwargs.get('model')
subset = kwargs.get('subset')
chain = kwargs.get('chain')
header = kwargs.get('header', False)
get_bonds = kwargs.get('bonds',False)
altloc_sel = kwargs.get('altloc', 'A')

assert isinstance(header, bool), 'header must be a boolean'

if model is not None:
if isinstance(model, int):
if model < 0:
raise ValueError('model must be greater than 0')
else:
raise TypeError('model must be an integer, {0} is invalid'
.format(str(model)))
title_suffix = ''

biomol = kwargs.get('biomol', False)
hd = set_header(mmtf_struc)
ag = set_info(ag, mmtf_struc, get_bonds, altloc_sel)

if ag.numAtoms() > 0:
LOGGER.report('{0} atoms and {1} coordinate set(s) were '
'parsed in %.2fs.'.format(ag.numAtoms(),
ag.numCoordsets()))
else:
ag = None
LOGGER.warn('Atomic data could not be parsed, please '
'check the input file.')



if ag is not None and isinstance(hd, dict):
if biomol:
ag = buildBiomolecules(hd, ag)
if isinstance(ag, list):
LOGGER.info('Biomolecular transformations were applied, {0} '
'biomolecule(s) are returned.'.format(len(ag)))
else:
LOGGER.info('Biomolecular transformations were applied to the '
'coordinate data.')

if header:
return ag, hd
else:
return ag


def set_header(data_api):

#get the transform here and convert it to the format that prody wants
assembly = data_api.bio_assembly[0]
chain_list = data_api.chain_name_list
assembly = bio_transform(assembly, chain_list)

header = {
'r_work': data_api.r_work,
'r_free': data_api.r_free,
'resolution': data_api.resolution,
'title': data_api.title,
'deposition_date': data_api.deposition_date,
'release_date': data_api.release_date,
'experimental_methods': data_api.experimental_methods,
'biomoltrans': assembly
}

return header

def bio_transform(input_data, chain_list):

output_dict = {}
index = input_data['name']
transforms=[]

for item in input_data['transformList']:
chains = list(set([chain_list[i] for i in item['chainIndexList']]))[::-1]
transforms.append(chains)
for i in range(0, len(item['matrix'])-4, 4):
string_slice = item['matrix'][i:i+4]
formatted_string = ' '.join(map(str, string_slice))
transforms.append(formatted_string)

output_dict[index]=transforms

return output_dict

def set_info(atomgroup, mmtf_data,get_bonds=False,altloc_sel='A'):

mmtfHETATMtypes = set([
"D-SACCHARIDE",
"D-SACCHARIDE 1,4 AND 1,4 LINKING",
"D-SACCHARIDE 1,4 AND 1,6 LINKING",
"L-SACCHARIDE",
"L-SACCHARIDE 1,4 AND 1,4 LINKING",
"L-SACCHARIDE 1,4 AND 1,6 LINKING",
"NON-POLYMER",
"OTHER",
"PEPTIDE-LIKE",
"SACCHARIDE"])

asize = mmtf_data.num_atoms

if len(mmtf_data.chains_per_model) > 1:
# get number of atoms in first model
asize = 0
groupIndex = 0
modelChainCount = mmtf_data.chains_per_model[0]
for chainGroupCount in mmtf_data.groups_per_chain[:modelChainCount]:
#traverse groups
for _ in range(chainGroupCount):
group = mmtf_data.group_list[mmtf_data.group_type_list[groupIndex]]
asize += len(group['atomNameList'])
groupIndex += 1

fields = OrderedDict()

x = mmtf_data.x_coord_list
y = mmtf_data.y_coord_list
z = mmtf_data.z_coord_list

if len(x) != mmtf_data.num_models*asize:
LOGGER.warn('Multi-model MMTF files with different molecules not supported. Keeping only first model')
coords = np.array([x, y, z]).T[:asize].reshape(1, asize, 3)
else:
coords = np.array([x, y, z]).T.reshape(mmtf_data.num_models, asize, 3)

# Initialize arrays for atom properties
atom_names = np.empty(asize, dtype=ATOMIC_FIELDS['name'].dtype)
resnames = np.empty(asize, dtype=ATOMIC_FIELDS['resname'].dtype)
resnums = np.empty(asize, dtype=ATOMIC_FIELDS['resnum'].dtype)
chainids = np.empty(asize, dtype=ATOMIC_FIELDS['chain'].dtype)
segnames = np.empty(asize, dtype=ATOMIC_FIELDS['segment'].dtype)
hetero = np.zeros(asize, dtype=bool)
termini = np.zeros(asize, dtype=bool)
altlocs = np.empty(asize, dtype=ATOMIC_FIELDS['altloc'].dtype)
icodes = np.empty(asize, dtype=ATOMIC_FIELDS['icode'].dtype)
serials = np.empty(asize, dtype=ATOMIC_FIELDS['serial'].dtype)
elements = np.empty(asize, dtype=ATOMIC_FIELDS['element'].dtype)
bfactors = np.empty(asize, dtype=ATOMIC_FIELDS['beta'].dtype)
occupancies = np.empty(asize, dtype=ATOMIC_FIELDS['occupancy'].dtype)

modelIndex = 0
chainIndex = 0
groupIndex = 0
atomIndex = 0

#traverse models
for modelChainCount in mmtf_data.chains_per_model:
chain_name_list = mmtf_data.chain_name_list

#traverse chains
for k in range(modelChainCount):
chainGroupCount = mmtf_data.groups_per_chain[chainIndex]

#traverse groups
for _ in range(chainGroupCount):
group = mmtf_data.group_list[mmtf_data.group_type_list[groupIndex]]
groupAtomCount = len(group.get('atomNameList'))

#traverse atoms
for i in range(groupAtomCount):
bfactors[atomIndex] = mmtf_data.b_factor_list[atomIndex]
altlocs[atomIndex] = mmtf_data.alt_loc_list[atomIndex]
occupancies[atomIndex] = mmtf_data.occupancy_list[atomIndex]
atom_names[atomIndex] = group.get('atomNameList')[i]
elements[atomIndex] = group.get('elementList')[i]
resnames[atomIndex] = group.get('groupName')
resnums[atomIndex] = mmtf_data.group_id_list[groupIndex]
icodes[atomIndex] = mmtf_data.ins_code_list[groupIndex]
chainids[atomIndex] = chain_name_list[k]
if group.get('chemCompType') in mmtfHETATMtypes:
hetero[atomIndex] = True
serials[atomIndex] = atomIndex + 1
segnames[atomIndex]=''

atomIndex += 1

groupIndex += 1
chainIndex += 1
modelIndex += 1
break

#detect termini based on chain changes
termini[:-1] = chainids[1:] != chainids[:-1]
termini[-1] = True #the last atom is always a terminus

mask = np.full(asize, True, dtype=bool)
if altloc_sel != 'all':
#mask out any unwanted alternative locations
mask = (altlocs == '') | (altlocs == altloc_sel)

atomgroup.setCoords(coords[:,mask])
atomgroup.setNames(atom_names[mask])
atomgroup.setResnums(resnums[mask])
atomgroup.setResnames(resnames[mask])
atomgroup.setChids(chainids[mask])
atomgroup.setElements(elements[mask])
atomgroup.setMasses(getMasses(elements[mask]))
atomgroup.setBetas(bfactors[mask])
atomgroup.setAltlocs(altlocs[mask])
atomgroup.setOccupancies(occupancies[mask])
atomgroup.setFlags('hetatm', hetero[mask])
atomgroup.setFlags('pdbter', termini[mask])
atomgroup.setFlags('selpdbter', termini[mask])
atomgroup.setSerials(serials[mask])
atomgroup.setIcodes(icodes[mask])
atomgroup.setSegnames(segnames[mask])
atomgroup.setTitle(mmtf_data.structure_id)

if get_bonds and hasattr(mmtf_data,'bond_atom_list'):
#have to remap any masked out atoms
remaining = np.arange(asize)[mask]
remap = np.full(asize,-1)
remap[remaining] = np.arange(len(remaining))
allbonds = np.array(mmtf_data.bond_atom_list).reshape(-1,2)
nonpeptide = []
#irgnore bonds between C and N in adjacent residues
for a,b in allbonds:
if a < asize and b < asize and mask[a] and mask[b] :
if atom_names[a] != 'N' or atom_names[b] != 'C' or resnums[a]-resnums[b] != 1:
nonpeptide.append((remap[a],remap[b]))
atomgroup.setBonds(nonpeptide)

return atomgroup
Binary file added prody/tests/datafiles/1pwc.mmtf
Binary file not shown.
Loading