diff --git a/prody/atomic/atomic.py b/prody/atomic/atomic.py index b4a6700f4..1541c1993 100644 --- a/prody/atomic/atomic.py +++ b/prody/atomic/atomic.py @@ -24,6 +24,15 @@ except: continue +CORE_AAMAP = AAMAP = { + 'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C', 'GLN': 'Q', + 'GLU': 'E', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K', + 'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S', 'THR': 'T', 'TRP': 'W', + 'TYR': 'Y', 'VAL': 'V' +} + +invAAMAP = dict((v, k) for k, v in CORE_AAMAP.items()) + AAMAP = { 'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C', 'GLN': 'Q', 'GLU': 'E', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K', @@ -242,17 +251,25 @@ def getTitle(self): return ag._title def getSequence(self, **kwargs): - """Returns one-letter sequence string for amino acids. + """Returns one-letter sequence string for amino acids, unless *longSeq* is **True**. When *allres* keyword argument is **True**, sequence will include all residues (e.g. water molecules) in the chain and **X** will be used for non-standard residue names.""" + longSeq = kwargs.get('longSeq', False) + get = AAMAP.get if hasattr(self, 'getResnames'): - seq = ''.join([get(res, 'X') for res in self.getResnames()]) + if longSeq: + seq = ' '.join(self.getResnames()) + else: + seq = ''.join([get(res, 'X') for res in self.getResnames()]) else: res = self.getResname() - seq = get(res, 'X') + if longSeq: + seq = res + else: + seq = get(res, 'X') return seq diff --git a/prody/atomic/chain.py b/prody/atomic/chain.py index d5534f783..db74231c9 100644 --- a/prody/atomic/chain.py +++ b/prody/atomic/chain.py @@ -8,9 +8,14 @@ __all__ = ['Chain'] -def getSequence(resnames): - """Returns polypeptide sequence as from list of *resnames* (residue - name abbreviations).""" +def getSequence(resnames, **kwargs): + """Returns polypeptide sequence from a list of *resnames* using one-letter residue + name abbreviations by default, or long (usually three letter) abbrevations + if *longSeq* is **True**.""" + + longSeq = kwargs.get('longSeq', False) + if longSeq: + return ' '.join(resnames) get = AAMAP.get return ''.join([get(rn, 'X') for rn in resnames]) @@ -134,13 +139,16 @@ def getSequence(self, **kwargs): if kwargs.get('allres', False): get = AAMAP.get - seq = ''.join([get(res.getResname(), 'X') for res in self]) + if kwargs.get('longSeq', False): + seq = ' '.join([res.getResname() for res in self]) + else: + seq = ''.join([get(res.getResname(), 'X') for res in self]) elif self._seq: seq = self._seq else: calpha = self.calpha if calpha: - seq = getSequence(calpha.getResnames()) + seq = getSequence(calpha.getResnames(), **kwargs) else: seq = '' self._seq = seq diff --git a/prody/atomic/flags.py b/prody/atomic/flags.py index cee3c466a..56d9c4701 100644 --- a/prody/atomic/flags.py +++ b/prody/atomic/flags.py @@ -67,6 +67,8 @@ NONSTANDARD = { 'ASX': set(['acyclic', 'surface', 'polar', 'medium']), 'GLX': set(['acyclic', 'surface', 'large', 'polar']), + 'ASH': set(['acyclic', 'acidic', 'surface', 'polar', 'medium']), + 'GLH': set(['acyclic', 'acidic', 'surface', 'large', 'polar']), 'CSO': set(['acyclic', 'neutral', 'surface', 'medium', 'polar']), 'CYX': set(['acyclic', 'neutral', 'buried', 'medium', 'polar']), 'HIP': set(['cyclic', 'basic', 'surface', 'large', 'polar']), @@ -75,6 +77,12 @@ 'HSD': set(['cyclic', 'basic', 'surface', 'large', 'polar']), 'HSE': set(['cyclic', 'basic', 'surface', 'large', 'polar']), 'HSP': set(['cyclic', 'acidic', 'surface', 'large', 'polar']), + 'HISD': set(['cyclic', 'basic', 'surface', 'large', 'polar']), + 'HISE': set(['cyclic', 'basic', 'surface', 'large', 'polar']), + 'HISP': set(['cyclic', 'acidic', 'surface', 'large', 'polar']), + 'LYN': set(['acyclic', 'neutral', 'surface', 'large', 'polar']), + 'TYM': set(['cyclic', 'aromatic', 'surface', 'basic', 'large', 'polar']), + 'ARN': set(['acyclic', 'neutral', 'surface', 'large', 'polar']), 'MSE': set(['acyclic', 'neutral', 'buried', 'large']), 'CME': set(['acyclic', 'neutral', 'buried', 'large']), 'SEC': set(['acyclic', 'neutral', 'buried', 'polar', 'medium']), diff --git a/prody/proteins/cifheader.py b/prody/proteins/cifheader.py index 9a31112da..71c634486 100644 --- a/prody/proteins/cifheader.py +++ b/prody/proteins/cifheader.py @@ -7,7 +7,8 @@ from prody import LOGGER from prody.atomic import flags, AAMAP -from prody.utilities import openFile, alignBioPairwise, GAP_PENALTY, GAP_EXT_PENALTY +from prody.atomic.atomic import invAAMAP +from prody.utilities import openFile, alignBioPairwise, GAP_EXT_PENALTY from .localpdb import fetchPDB from .header import (Chemical, Polymer, DBRef, _PDB_DBREF, @@ -57,7 +58,7 @@ def _natomsFromFormulaPart(part): return 1 return int("".join(digits)) -def parseCIFHeader(pdb, *keys): +def parseCIFHeader(pdb, *keys, **kwargs): """Returns header data dictionary for *pdb*. This function is equivalent to ``parsePDB(pdb, header=True, model=0, meta=False)``, likewise *pdb* may be an identifier or a filename. @@ -119,14 +120,18 @@ def parseCIFHeader(pdb, *keys): raise IOError('{0} is not a valid filename or a valid PDB ' 'identifier.'.format(pdb)) pdb = openFile(pdb, 'rt') - header = getCIFHeaderDict(pdb, *keys) + header = getCIFHeaderDict(pdb, *keys, **kwargs) pdb.close() return header -def getCIFHeaderDict(stream, *keys): +def getCIFHeaderDict(stream, *keys, **kwargs): """Returns header data in a dictionary. *stream* may be a list of PDB lines - or a stream.""" + or a stream. + + Polymers have sequences that usually use one-letter residue name abbreviations by default. + To obtain long (usually three letter) abbrevations, set *longSeq* to **True**. + """ try: lines = stream.readlines() @@ -139,11 +144,17 @@ def getCIFHeaderDict(stream, *keys): keys = list(keys) for k, key in enumerate(keys): if key in _PDB_HEADER_MAP: - value = _PDB_HEADER_MAP[key](lines) + if key == 'polymers': + value = _PDB_HEADER_MAP[key](lines, **kwargs) + else: + value = _PDB_HEADER_MAP[key](lines) keys[k] = value else: try: - value = _PDB_HEADER_MAP['others'](lines, key) + if key == 'polymers': + value = _PDB_HEADER_MAP[key](lines, **kwargs) + else: + value = _PDB_HEADER_MAP[key](lines) keys[k] = value except: raise KeyError('{0} is not a valid header data identifier' @@ -758,8 +769,11 @@ def _getReference(lines): return ref -def _getPolymers(lines): - """Returns list of polymers (macromolecules).""" +def _getPolymers(lines, **kwargs): + """Returns list of polymers (macromolecules). + + Sequence is usually one-letter abbreviations, but can be long + abbreviations (usually three letters) if *longSeq* is **True**""" pdbid = _PDB_HEADER_MAP['identifier'](lines) polymers = dict() @@ -777,8 +791,28 @@ def _getPolymers(lines): entities[entity].append(ch) poly = polymers.get(ch, Polymer(ch)) polymers[ch] = poly - poly.sequence += ''.join(item[ - '_entity_poly.pdbx_seq_one_letter_code_can'].replace(';', '').split()) + + longSeq = kwargs.get('longSeq', False) + if longSeq: + poly.sequence += ''.join(item[ + '_entity_poly.pdbx_seq_one_letter_code'].replace(';', '').split()) + else: + poly.sequence += ''.join(item[ + '_entity_poly.pdbx_seq_one_letter_code_can'].replace(';', '').split()) + + if longSeq: + for poly in polymers.values(): + seq = poly.sequence + resnames = [] + for item in seq.split('('): + if item.find(')') != -1: + resnames.append(item[:item.find(')')]) + letters = list(item[item.find(')')+1:]) + else: + letters = list(item) + resnames.extend([invAAMAP[letter] for letter in letters]) + + poly.sequence = ' '.join(resnames) # DBREF block 1 items2 = parseSTARSection(lines, '_struct_ref', report=False) @@ -1237,13 +1271,17 @@ def _getOther(lines, key=None): return data -def _getUnobservedSeq(lines): +def _getUnobservedSeq(lines, **kwargs): + """Get sequence of unobserved residues. + + This sequence is usually using one-letter residue name abbreviations by default. + To obtain long (usually three letter) abbrevations, set *longSeq* to **True**.""" key_unobs = '_pdbx_unobs_or_zero_occ_residues' try: unobs = parseSTARSection(lines, key_unobs, report=False) - polymers = _getPolymers(lines) + polymers = _getPolymers(lines, **kwargs) except: pass diff --git a/prody/proteins/header.py b/prody/proteins/header.py index 1a64132c6..2de4e7123 100644 --- a/prody/proteins/header.py +++ b/prody/proteins/header.py @@ -235,7 +235,7 @@ def cleanString(string, nows=False): return ' '.join(string.strip().split()) -def parsePDBHeader(pdb, *keys): +def parsePDBHeader(pdb, *keys, **kwargs): """Returns header data dictionary for *pdb*. This function is equivalent to ``parsePDB(pdb, header=True, model=0, meta=False)``, likewise *pdb* may be an identifier or a filename. @@ -297,14 +297,17 @@ def parsePDBHeader(pdb, *keys): raise IOError('{0} is not a valid filename or a valid PDB ' 'identifier.'.format(pdb)) pdb = openFile(pdb, 'rt') - header, _ = getHeaderDict(pdb, *keys) + header, _ = getHeaderDict(pdb, *keys, **kwargs) pdb.close() return header -def getHeaderDict(stream, *keys): +def getHeaderDict(stream, *keys, **kwargs): """Returns header data in a dictionary. *stream* may be a list of PDB lines - or a stream.""" + or a stream. + + Polymers have sequences that usually use one-letter residue name abbreviations by default. + To obtain long (usually three letter) abbrevations, set *longSeq* to **True**.""" lines = defaultdict(list) loc = 0 @@ -325,7 +328,10 @@ def getHeaderDict(stream, *keys): keys = list(keys) for k, key in enumerate(keys): if key in _PDB_HEADER_MAP: - value = _PDB_HEADER_MAP[key](lines) + if key == 'polymers': + value = _PDB_HEADER_MAP[key](lines, **kwargs) + else: + value = _PDB_HEADER_MAP[key](lines) keys[k] = value else: raise KeyError('{0} is not a valid header data identifier' @@ -555,8 +561,11 @@ def _getReference(lines): return ref -def _getPolymers(lines): - """Returns list of polymers (macromolecules).""" +def _getPolymers(lines, **kwargs): + """Returns list of polymers (macromolecules). + + Polymers have sequences that usually use one-letter residue name abbreviations by default. + To obtain long (usually three letter) abbrevations, set *longSeq* to **True**.""" pdbid = lines['pdbid'] polymers = dict() @@ -564,7 +573,14 @@ def _getPolymers(lines): ch = line[11] poly = polymers.get(ch, Polymer(ch)) polymers[ch] = poly - poly.sequence += ''.join(getSequence(line[19:].split())) + + longSeq = kwargs.get('longSeq', False) + if longSeq: + if poly.sequence != '': + poly.sequence += ' ' + poly.sequence += getSequence(line[19:].split(), **kwargs) + else: + poly.sequence += ''.join(getSequence(line[19:].split(), **kwargs)) for i, line in lines['DBREF ']: i += 1 diff --git a/prody/utilities/catchall.py b/prody/utilities/catchall.py index 33d166dcc..d88aeee70 100644 --- a/prody/utilities/catchall.py +++ b/prody/utilities/catchall.py @@ -314,7 +314,7 @@ def calcTree(names, distance_matrix, method='upgma', linkage=False): method = method.lower().strip() - if method in ['ward', 'single', 'average', 'weighted', 'centroid', 'median']: + if method in ['ward', 'single', 'average', 'weighted', 'centroid', 'median', 'complete']: from scipy.cluster.hierarchy import linkage as hlinkage from scipy.spatial.distance import squareform