Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Swap asym ids #1773

Merged
merged 23 commits into from
Oct 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
0b5227b
fix get unobserved seq
jamesmkrieger Oct 12, 2023
1005eb5
swap asym ids and fix downstream
jamesmkrieger Oct 12, 2023
3054d80
make unite_chains True
jamesmkrieger Oct 12, 2023
ce01519
Merge branch 'fix_unobs' into swap_asym_id
jamesmkrieger Oct 12, 2023
de67ffc
allow forgotten None case
jamesmkrieger Oct 12, 2023
074ec84
add segment selection to parse mmcif
jamesmkrieger Oct 12, 2023
fb4d2e8
make parseMMCIF chain sensitive to unite_chains
jamesmkrieger Oct 12, 2023
5c664b3
check segment and add it to title_suffix
jamesmkrieger Oct 12, 2023
3dccd38
update mmcif 3o21 into test datafiles
jamesmkrieger Oct 12, 2023
39feb39
tests for unite chains and biomols
jamesmkrieger Oct 12, 2023
709014a
make unite_chains True in parseMMCIFStream
jamesmkrieger Oct 12, 2023
6713df2
unobs header tests
jamesmkrieger Oct 12, 2023
99784a5
fixes from failed tests
jamesmkrieger Oct 12, 2023
28c70fb
hopefully last fixes
jamesmkrieger Oct 12, 2023
692c1a5
fix biomols for same chid
jamesmkrieger Oct 16, 2023
43b524f
fix unite_chains docs to True
jamesmkrieger Oct 16, 2023
4deb25e
set unite_chains False and improve docs
jamesmkrieger Oct 16, 2023
951a126
unite chains docs
jamesmkrieger Oct 16, 2023
1e2cb8a
fix exanm exgnm docs
jamesmkrieger Oct 16, 2023
8de2c03
parser fix
jamesmkrieger Oct 16, 2023
14ecff6
extend parser fix to unary
jamesmkrieger Oct 16, 2023
e6d15d7
fix unite chains test for default False
jamesmkrieger Oct 16, 2023
98cebd0
fix unite chains biomol test for default False
jamesmkrieger Oct 16, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion prody/atomic/select.py
Original file line number Diff line number Diff line change
Expand Up @@ -1341,6 +1341,7 @@ def _and2(self, sel, loc, tokens, subset=None):
isDataLabel = atoms.isDataLabel
append = None
wasand = False
wasdata = False
while tokens:
# check whether token is an array to avoid array == str comparison
token = tokens.pop(0)
Expand All @@ -1354,9 +1355,10 @@ def _and2(self, sel, loc, tokens, subset=None):
.format(repr('and ... and')), ['and', 'and'])
append = None
wasand = True
wasdata = False
continue

elif isFlagLabel(token):
elif isFlagLabel(token) and not wasdata:
flags.append(token)
append = None

Expand Down Expand Up @@ -1396,10 +1398,12 @@ def _and2(self, sel, loc, tokens, subset=None):
evals.append([])
append = evals[-1].append
append(token)
wasdata = True

elif token in UNARY:
unary.append([])
append = unary[-1].append
wasdata = False

if token == 'not':
append((token,))
Expand Down
2 changes: 1 addition & 1 deletion prody/dynamics/exanm.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ def buildMembrane(self, coords, **kwargs):

def buildHessian(self, coords, cutoff=15., gamma=1., **kwargs):
"""Build Hessian matrix for given coordinate set.
**kwargs** are passed to :method:`.buildMembrane`.
**kwargs** are passed to :meth:`.buildMembrane`.

:arg coords: a coordinate set or an object with ``getCoords`` method
:type coords: :class:`numpy.ndarray`
Expand Down
2 changes: 1 addition & 1 deletion prody/dynamics/exgnm.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ def buildMembrane(self, coords, **kwargs):

def buildKirchhoff(self, coords, cutoff=15., gamma=1., **kwargs):
"""Build Kirchhoff matrix for given coordinate set.
**kwargs** are passed to :method:`.buildMembrane`.
**kwargs** are passed to :meth:`.buildMembrane`.

:arg coords: a coordinate set or an object with ``getCoords`` method
:type coords: :class:`numpy.ndarray`
Expand Down
75 changes: 60 additions & 15 deletions prody/proteins/ciffile.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from .cifheader import getCIFHeaderDict
from .header import buildBiomolecules, assignSecstr, isHelix, isSheet

__all__ = ['parseMMCIFStream', 'parseMMCIF']
__all__ = ['parseMMCIFStream', 'parseMMCIF', 'parseCIF']


class MMCIFParseError(Exception):
Expand All @@ -36,6 +36,11 @@ class MMCIFParseError(Exception):
chains are parsed
:type chain: str

:arg segment: segment identifiers for parsing specific chains, e.g.
``segment='A'``, ``segment='B'``, ``segment='DE'``, by default all
segment are parsed
:type segment: str

:arg subset: a predefined keyword to parse subset of atoms, valid keywords
are ``'calpha'`` (``'ca'``), ``'backbone'`` (``'bb'``), or **None**
(read all atoms), e.g. ``subset='bb'``
Expand All @@ -50,6 +55,13 @@ class MMCIFParseError(Exception):
alternate locations will be parsed and each will be appended as a
distinct coordinate set, default is ``"A"``
:type altloc: str

:arg unite_chains: unite chains with the same segment name (auth_asym_id), making chain ids be
auth_asym_id instead of label_asym_id. This can be helpful in some cases e.g. alignments, but can
cause some problems too. For example, using :meth:`.buildBiomolecules` afterwards requires original
chain id (label_asym_id). Using biomol=True, inside parseMMCIF is fine.
Default is *False*
jamesmkrieger marked this conversation as resolved.
Show resolved Hide resolved
:type unite_chains: bool
"""

_PDBSubsets = {'ca': 'ca', 'calpha': 'ca', 'bb': 'bb', 'backbone': 'bb'}
Expand All @@ -66,15 +78,9 @@ def parseMMCIF(pdb, **kwargs):
:arg pdb: a PDB identifier or a filename
If needed, mmCIF files are downloaded using :func:`.fetchPDB()` function.
:type pdb: str

:arg chain: comma separated string or list-like of chain IDs
:type chain: str, tuple, list, :class:`~numpy.ndarray`

:arg unite_chains: unite chains with the same segment name
Default is *False*
:type unite_chains: bool
"""
chain = kwargs.pop('chain', None)
segment = kwargs.pop('segment', None)
title = kwargs.get('title', None)
unite_chains = kwargs.get('unite_chains', False)
auto_bonds = SETTINGS.get('auto_bonds')
Expand Down Expand Up @@ -119,12 +125,30 @@ def parseMMCIF(pdb, **kwargs):
title = title[3:]
kwargs['title'] = title
cif = openFile(pdb, 'rt')
result = parseMMCIFStream(cif, chain=chain, **kwargs)
result = parseMMCIFStream(cif, chain=chain, segment=segment, **kwargs)
cif.close()
if unite_chains:
result.setSegnames(result.getChids())
if isinstance(result, AtomGroup):
result.setChids(result.getSegnames())

elif isinstance(result, list):
# e.g. multiple biomol assemblies
[r.setChids(r.getSegnames()) for r in result if isinstance(r, AtomGroup)]

elif isinstance(result, tuple):
# atoms, header
if isinstance(result[0], AtomGroup):
result[0].setChids(result[0].getSegnames())

elif isinstance(result[0], list):
# e.g. multiple biomol assemblies
[r.setChids(r.getSegnames()) for r in result[0] if isinstance(r, AtomGroup)]

elif result is not None:
raise TypeError('result from parseMMCIFStream should be a tuple, AtomGroup or list')
return result

parseCIF = parseMMCIF

def parseMMCIFStream(stream, **kwargs):
"""Returns an :class:`.AtomGroup` and/or a class:`.StarDict`
Expand All @@ -138,6 +162,8 @@ def parseMMCIFStream(stream, **kwargs):
model = kwargs.get('model')
subset = kwargs.get('subset')
chain = kwargs.get('chain')
segment = kwargs.get('segment')
unite_chains = kwargs.get('unite_chains', False)
altloc = kwargs.get('altloc', 'A')
header = kwargs.get('header', False)
assert isinstance(header, bool), 'header must be a boolean'
Expand All @@ -150,6 +176,7 @@ def parseMMCIFStream(stream, **kwargs):
raise TypeError('model must be an integer, {0} is invalid'
.format(str(model)))
title_suffix = ''

if subset:
try:
subset = _PDBSubsets[subset.lower()]
Expand All @@ -159,13 +186,21 @@ def parseMMCIFStream(stream, **kwargs):
raise ValueError('{0} is not a valid subset'
.format(repr(subset)))
title_suffix = '_' + subset

if chain is not None:
if not isinstance(chain, str):
raise TypeError('chain must be a string')
elif len(chain) == 0:
raise ValueError('chain must not be an empty string')
title_suffix = '_' + chain + title_suffix

if segment is not None:
if not isinstance(segment, str):
raise TypeError('segment must be a string')
elif len(segment) == 0:
raise ValueError('segment must not be an empty string')
title_suffix = '_' + segment + title_suffix

ag = None
if 'ag' in kwargs:
ag = kwargs['ag']
Expand Down Expand Up @@ -194,7 +229,8 @@ def parseMMCIFStream(stream, **kwargs):
if header or biomol or secondary:
hd = getCIFHeaderDict(lines)

_parseMMCIFLines(ag, lines, model, chain, subset, altloc)
_parseMMCIFLines(ag, lines, model, chain, subset, altloc,
segment, unite_chains)

if ag.numAtoms() > 0:
LOGGER.report('{0} atoms and {1} coordinate set(s) were '
Expand Down Expand Up @@ -239,7 +275,7 @@ def parseMMCIFStream(stream, **kwargs):


def _parseMMCIFLines(atomgroup, lines, model, chain, subset,
altloc_torf):
altloc_torf, segment, unite_chains):
"""Returns an AtomGroup. See also :func:`.parsePDBStream()`.

:arg lines: mmCIF lines
Expand Down Expand Up @@ -375,15 +411,24 @@ def _parseMMCIFLines(atomgroup, lines, model, chain, subset,
if not (atomname in subset and resname in protein_resnames):
continue

chID = line.split()[fields['auth_asym_id']]
chID = line.split()[fields['label_asym_id']]
segID = line.split()[fields['auth_asym_id']]

if chain is not None:
if isinstance(chain, str):
chain = chain.split(',')
if not chID in chain:
if not unite_chains:
continue
if not segID in chain:
continue

if segment is not None:
if isinstance(segment, str):
segment = segment.split(',')
if not segID in segment:
continue

segID = line.split()[fields['label_asym_id']]

alt = line.split()[fields['label_alt_id']]
if alt not in which_altlocs and which_altlocs != 'all':
continue
Expand Down
53 changes: 42 additions & 11 deletions prody/proteins/cifheader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"""This module defines functions for parsing header data from PDB files."""

from collections import defaultdict, OrderedDict
import numpy as np
import os.path

from prody import LOGGER
Expand Down Expand Up @@ -185,7 +186,7 @@ def _getBiomoltrans(lines):
currentBiomolecule = item1["_pdbx_struct_assembly_gen.assembly_id"]
applyToChains = []

chains = item1["_pdbx_struct_assembly_gen.asym_id_list"].split(',')
chains = item1["_pdbx_struct_assembly_gen.asym_id_list"].replace(';','').strip().split(',')
applyToChains.extend(chains)

biomt = biomolecule[currentBiomolecule]
Expand Down Expand Up @@ -777,7 +778,7 @@ def _getPolymers(lines):
poly = polymers.get(ch, Polymer(ch))
polymers[ch] = poly
poly.sequence += ''.join(item[
'_entity_poly.pdbx_seq_one_letter_code'][1:-2].split())
'_entity_poly.pdbx_seq_one_letter_code_can'].replace(';', '').split())

# DBREF block 1
items2 = parseSTARSection(lines, '_struct_ref')
Expand Down Expand Up @@ -1251,10 +1252,10 @@ def _getUnobservedSeq(lines):

unobs_seqs = OrderedDict()
for item in unobs:
chid = item['_pdbx_unobs_or_zero_occ_residues.label_asym_id']
chid = item['_pdbx_unobs_or_zero_occ_residues.auth_asym_id']
if not chid in unobs_seqs.keys():
unobs_seqs[chid] = ''
unobs_seqs[chid] += AAMAP[item['_pdbx_unobs_or_zero_occ_residues.label_comp_id']]
unobs_seqs[chid] += AAMAP[item['_pdbx_unobs_or_zero_occ_residues.auth_comp_id']]

if len(unobs_seqs) == 0:
return None
Expand All @@ -1271,14 +1272,44 @@ def _getUnobservedSeq(lines):
return None

alns = OrderedDict()
for key, seq in full_seqs.items():
for k, (key, seq) in enumerate(full_seqs.items()):
if key in unobs_seqs.keys():
unobs_seq = unobs_seqs[key]
alns[key] = alignBioPairwise(unobs_seq, seq, MATCH_SCORE=1000,
MISMATCH_SCORE=-1000,
ALIGNMENT_METHOD='global',
GAP_PENALTY=GAP_PENALTY,
GAP_EXT_PENALTY=GAP_EXT_PENALTY)[0][:2]
# initialise alignment (quite possibly incorrect)
aln = list(alignBioPairwise(unobs_seq, seq, MATCH_SCORE=1000,
MISMATCH_SCORE=-1000,
ALIGNMENT_METHOD='global',
GAP_PENALTY=GAP_PENALTY,
GAP_EXT_PENALTY=GAP_EXT_PENALTY)[0][:2])

# fix it
prev_chid = unobs[0]['_pdbx_unobs_or_zero_occ_residues.auth_asym_id']
i = 0
for item in unobs:
chid = item['_pdbx_unobs_or_zero_occ_residues.auth_asym_id']
if chid != prev_chid:
prev_chid = chid
i = 0

if chid == key:
one_letter = AAMAP[item['_pdbx_unobs_or_zero_occ_residues.auth_comp_id']]
good_pos = int(item['_pdbx_unobs_or_zero_occ_residues.label_seq_id']) - 1

row1_list = list(aln[0])

arr_unobs_seq = np.array(list(unobs_seq))
unobs_rep = np.where(arr_unobs_seq[:i+1] == one_letter)[0].shape[0] - 1
actual_pos = np.where(np.array(row1_list) == one_letter)[0][unobs_rep]

if actual_pos != good_pos:
row1_list[good_pos] = one_letter
row1_list[actual_pos] = '-'

aln[0] = ''.join(row1_list)

i += 1

alns[key] = aln

return alns

Expand All @@ -1303,7 +1334,7 @@ def _getUnobservedSeq(lines):
for line in lines][0],
'identifier': lambda lines: [line.split('_')[1]
if line.find("data") == 0 else ''
for line in lines][0],
for line in lines][0].strip(),
'title': _getTitle,
'experiment': lambda lines: [line.split()[1]
if line.find("_exptl.method") != -1 else None
Expand Down
7 changes: 5 additions & 2 deletions prody/proteins/header.py
Original file line number Diff line number Diff line change
Expand Up @@ -1111,10 +1111,13 @@ def buildBiomolecules(header, atoms, biomol=None):
translation[2] = line2[3]
t = Transformation(rotation, translation)

newag = atoms.select('chain ' + ' or chain '.join(mt[times*4+0])).copy()
newag = atoms.select('chain ' + ' or chain '.join(mt[times*4+0]))
if newag is None:
continue
newag.all.setSegnames(decToHybrid36(times+1,resnum=True))
jamesmkrieger marked this conversation as resolved.
Show resolved Hide resolved
newag = newag.copy()
segnames = newag.all.getSegnames()
newag.all.setSegnames(np.array([segname + decToHybrid36(times+1, resnum=True)
for segname in segnames]))
for acsi in range(newag.numCoordsets()):
newag.setACSIndex(acsi)
newag = t.apply(newag)
Expand Down
18 changes: 17 additions & 1 deletion prody/tests/datafiles/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,11 +224,26 @@
'bb_atoms': 144,
'models': 26,
},
'biomols_cif': {
'pdb': '3o21',
'file': 'mmcif_3o21.cif',
'atoms': 12793,
'bm0_atoms': 6281,
'num_chains_united': 4,
'bm_chains_united': [2, 2],
'bm_chains_alone': [9, 10],
'chainA_atoms_united': 3170,
'chainA_atoms_alone': 3025,
'ca_atoms': 1489,
'bb_atoms': 5956,
'models': 1,
'unobs_B_start': 'G-------------------------------NQNTTEK-'
},
'long_chid_cif': {
'pdb': '6zu5',
'file': 'mmcif_6zu5.cif',
'atoms': 165175,
'chain_SX0_atoms': 1089,
'segment_SX0_atoms': 1089,
},
'3hsy': {
'pdb': '3hsy',
Expand All @@ -242,6 +257,7 @@
'pdb': '3o21',
'file': 'pdb3o21.pdb',
'atoms': 12793,
'chainA_atoms': 3170,
'ca_atoms': 1489,
'models': 1,
'biomols': 2
Expand Down
Loading
Loading