From aa2b73d77f43d6cbca029ac9150993481e2a2fec Mon Sep 17 00:00:00 2001 From: David Koes Date: Wed, 4 Oct 2023 11:26:55 -0400 Subject: [PATCH 1/3] Less ambiguous selection. If a protein has a chain 'x' and a chain '2' and the x happens to be next to the 2 in the selection string (chain a b c x 2) the parser fails to realize that x and 2 are chain identifiers. This makes sure there is no ambiguity. There is still a problem with PDBs that have a chain 'bb' (e.g. 2VVJ), but I'm not up for diving into the parser to fix that. --- prody/proteins/header.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prody/proteins/header.py b/prody/proteins/header.py index 129fbc9f4..30dff23a4 100644 --- a/prody/proteins/header.py +++ b/prody/proteins/header.py @@ -1111,7 +1111,7 @@ def buildBiomolecules(header, atoms, biomol=None): translation[2] = line2[3] t = Transformation(rotation, translation) - newag = atoms.select('chain ' + ' '.join(mt[times*4+0])).copy() + newag = atoms.select('chain ' + ' or chain '.join(mt[times*4+0])).copy() if newag is None: continue newag.all.setSegnames(decToHybrid36(times+1,resnum=True)) From 7102c7b77924f167a2832ee92c0ba7daff7996f8 Mon Sep 17 00:00:00 2001 From: David Koes Date: Wed, 4 Oct 2023 16:04:37 -0400 Subject: [PATCH 2/3] Improvements to MMTF bioassemlies Correctly parse biotransform header and include mutiple assemblies if present. Some tweaks to the atom parser for efficiency. --- prody/proteins/mmtffile.py | 66 ++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 32 deletions(-) diff --git a/prody/proteins/mmtffile.py b/prody/proteins/mmtffile.py index 7d637d38c..6d22203ab 100644 --- a/prody/proteins/mmtffile.py +++ b/prody/proteins/mmtffile.py @@ -118,7 +118,15 @@ def parseMMTF(mmtf_struc, **kwargs): parseMMTF.__doc__ += _parseMMTFdoc - +def _has_biomol(hd): + '''Return true if header actually has multiple transformations''' + if 'biomoltrans' not in hd: + return False + for v in hd['biomoltrans'].values(): + if len(v) > 4: + return True + return False + def _parseMMTF(mmtf_struc, **kwargs): LOGGER.timeit() ag = AtomGroup() @@ -156,7 +164,7 @@ def _parseMMTF(mmtf_struc, **kwargs): if ag is not None and isinstance(hd, dict): - if biomol: + if biomol and _has_biomol(hd): ag = buildBiomolecules(hd, ag) if isinstance(ag, list): LOGGER.info('Biomolecular transformations were applied, {0} ' @@ -176,8 +184,7 @@ def set_header(data_api): #get the transform here and convert it to the format that prody wants assembly = data_api.bio_assembly[0] chain_list = data_api.chain_name_list - assembly = bio_transform(assembly, chain_list) - + assembly = _bio_transform(data_api) header = { 'r_work': data_api.r_work, 'r_free': data_api.r_free, @@ -191,23 +198,20 @@ def set_header(data_api): return header -def bio_transform(input_data, chain_list): - - output_dict = {} - index = input_data['name'] - transforms=[] - - for item in input_data['transformList']: - chains = list(set([chain_list[i] for i in item['chainIndexList']]))[::-1] - transforms.append(chains) - for i in range(0, len(item['matrix'])-4, 4): - string_slice = item['matrix'][i:i+4] - formatted_string = ' '.join(map(str, string_slice)) - transforms.append(formatted_string) - - output_dict[index]=transforms - - return output_dict +def _bio_transform(dec): + ret = {} + for t in dec.bio_assembly: + name = t['name'] + L = [] + for trans in t['transformList']: + chis = trans['chainIndexList'] + chains = sorted(set([dec.chain_name_list[c] for c in chis])) + m = trans['matrix'] + L += [chains, '%f %f %f %f'%tuple(m[:4]), + '%f %f %f %f'%tuple(m[4:8]), + '%f %f %f %f'%tuple(m[8:12])] + ret[t['name']] = L + return ret def set_info(atomgroup, mmtf_data,get_bonds=False,altloc_sel='A'): @@ -255,14 +259,17 @@ def set_info(atomgroup, mmtf_data,get_bonds=False,altloc_sel='A'): resnums = np.empty(asize, dtype=ATOMIC_FIELDS['resnum'].dtype) chainids = np.empty(asize, dtype=ATOMIC_FIELDS['chain'].dtype) segnames = np.empty(asize, dtype=ATOMIC_FIELDS['segment'].dtype) + segnames[:] = '' + elements = np.empty(asize, dtype=ATOMIC_FIELDS['element'].dtype) + icodes = np.empty(asize, dtype=ATOMIC_FIELDS['icode'].dtype) + hetero = np.zeros(asize, dtype=bool) termini = np.zeros(asize, dtype=bool) - altlocs = np.empty(asize, dtype=ATOMIC_FIELDS['altloc'].dtype) - icodes = np.empty(asize, dtype=ATOMIC_FIELDS['icode'].dtype) - serials = np.empty(asize, dtype=ATOMIC_FIELDS['serial'].dtype) - elements = np.empty(asize, dtype=ATOMIC_FIELDS['element'].dtype) - bfactors = np.empty(asize, dtype=ATOMIC_FIELDS['beta'].dtype) - occupancies = np.empty(asize, dtype=ATOMIC_FIELDS['occupancy'].dtype) + + altlocs = np.array(mmtf_data.alt_loc_list[:asize], dtype=ATOMIC_FIELDS['altloc'].dtype) + serials = np.array(mmtf_data.atom_id_list[:asize], dtype=ATOMIC_FIELDS['serial'].dtype) + bfactors = np.array(mmtf_data.b_factor_list[:asize], dtype=ATOMIC_FIELDS['beta'].dtype) + occupancies = np.array(mmtf_data.occupancy_list[:asize], dtype=ATOMIC_FIELDS['occupancy'].dtype) modelIndex = 0 chainIndex = 0 @@ -284,9 +291,6 @@ def set_info(atomgroup, mmtf_data,get_bonds=False,altloc_sel='A'): #traverse atoms for i in range(groupAtomCount): - bfactors[atomIndex] = mmtf_data.b_factor_list[atomIndex] - altlocs[atomIndex] = mmtf_data.alt_loc_list[atomIndex] - occupancies[atomIndex] = mmtf_data.occupancy_list[atomIndex] atom_names[atomIndex] = group.get('atomNameList')[i] elements[atomIndex] = group.get('elementList')[i] resnames[atomIndex] = group.get('groupName') @@ -295,8 +299,6 @@ def set_info(atomgroup, mmtf_data,get_bonds=False,altloc_sel='A'): chainids[atomIndex] = chain_name_list[k] if group.get('chemCompType') in mmtfHETATMtypes: hetero[atomIndex] = True - serials[atomIndex] = atomIndex + 1 - segnames[atomIndex]='' atomIndex += 1 From 4d4a732ffcab422a6db33f9c2b4aa269e31d8e37 Mon Sep 17 00:00:00 2001 From: David Koes Date: Fri, 6 Oct 2023 13:42:49 -0400 Subject: [PATCH 3/3] support reading a bytearray (as found in hadoop seq file) --- prody/proteins/mmtffile.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/prody/proteins/mmtffile.py b/prody/proteins/mmtffile.py index 6d22203ab..66896c282 100644 --- a/prody/proteins/mmtffile.py +++ b/prody/proteins/mmtffile.py @@ -68,6 +68,7 @@ def parseMMTF(mmtf_struc, **kwargs): """ try: from mmtf import fetch, parse + import gzip, msgpack import mmtf except ImportError: raise ImportError("Install mmtf to read in mmtf structure objects (e.g. pip install mmtf-python)") @@ -107,6 +108,13 @@ def parseMMTF(mmtf_struc, **kwargs): .format(mmtf_struc)) mmtf_struc = structure title = mmtf_struc.structure_id + elif type(mmtf_struc)==bytearray: + data = gzip.decompress(mmtf_struc) + unpack = msgpack.unpackb(data) + dec = mmtf.MMTFDecoder() + dec.decode_data(unpack) + mmtf_struc = dec + #if none of the above loops are entered, user should have passed a mmtf structure object if title is None: