Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add threeLetter SEQRES #1950

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 20 additions & 3 deletions prody/atomic/atomic.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,15 @@
except:
continue

CORE_AAMAP = AAMAP = {
'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C', 'GLN': 'Q',
'GLU': 'E', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K',
'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S', 'THR': 'T', 'TRP': 'W',
'TYR': 'Y', 'VAL': 'V'
}

invAAMAP = dict((v, k) for k, v in CORE_AAMAP.items())

AAMAP = {
'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C', 'GLN': 'Q',
'GLU': 'E', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K',
Expand Down Expand Up @@ -242,17 +251,25 @@ def getTitle(self):
return ag._title

def getSequence(self, **kwargs):
"""Returns one-letter sequence string for amino acids.
"""Returns one-letter sequence string for amino acids, unless *longSeq* is **True**.
When *allres* keyword argument is **True**, sequence will include all
residues (e.g. water molecules) in the chain and **X** will be used for
non-standard residue names."""

longSeq = kwargs.get('longSeq', False)

get = AAMAP.get
if hasattr(self, 'getResnames'):
seq = ''.join([get(res, 'X') for res in self.getResnames()])
if longSeq:
seq = ' '.join(self.getResnames())
else:
seq = ''.join([get(res, 'X') for res in self.getResnames()])
else:
res = self.getResname()
seq = get(res, 'X')
if longSeq:
seq = res
else:
seq = get(res, 'X')

return seq

Expand Down
18 changes: 13 additions & 5 deletions prody/atomic/chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,14 @@

__all__ = ['Chain']

def getSequence(resnames):
"""Returns polypeptide sequence as from list of *resnames* (residue
name abbreviations)."""
def getSequence(resnames, **kwargs):
"""Returns polypeptide sequence from a list of *resnames* using one-letter residue
name abbreviations by default, or long (usually three letter) abbrevations
if *longSeq* is **True**."""

longSeq = kwargs.get('longSeq', False)
if longSeq:
return ' '.join(resnames)

get = AAMAP.get
return ''.join([get(rn, 'X') for rn in resnames])
Expand Down Expand Up @@ -134,13 +139,16 @@ def getSequence(self, **kwargs):

if kwargs.get('allres', False):
get = AAMAP.get
seq = ''.join([get(res.getResname(), 'X') for res in self])
if kwargs.get('longSeq', False):
seq = ' '.join([res.getResname() for res in self])
else:
seq = ''.join([get(res.getResname(), 'X') for res in self])
elif self._seq:
seq = self._seq
else:
calpha = self.calpha
if calpha:
seq = getSequence(calpha.getResnames())
seq = getSequence(calpha.getResnames(), **kwargs)
else:
seq = ''
self._seq = seq
Expand Down
8 changes: 8 additions & 0 deletions prody/atomic/flags.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@
NONSTANDARD = {
'ASX': set(['acyclic', 'surface', 'polar', 'medium']),
'GLX': set(['acyclic', 'surface', 'large', 'polar']),
'ASH': set(['acyclic', 'acidic', 'surface', 'polar', 'medium']),
'GLH': set(['acyclic', 'acidic', 'surface', 'large', 'polar']),
'CSO': set(['acyclic', 'neutral', 'surface', 'medium', 'polar']),
'CYX': set(['acyclic', 'neutral', 'buried', 'medium', 'polar']),
'HIP': set(['cyclic', 'basic', 'surface', 'large', 'polar']),
Expand All @@ -75,6 +77,12 @@
'HSD': set(['cyclic', 'basic', 'surface', 'large', 'polar']),
'HSE': set(['cyclic', 'basic', 'surface', 'large', 'polar']),
'HSP': set(['cyclic', 'acidic', 'surface', 'large', 'polar']),
'HISD': set(['cyclic', 'basic', 'surface', 'large', 'polar']),
'HISE': set(['cyclic', 'basic', 'surface', 'large', 'polar']),
'HISP': set(['cyclic', 'acidic', 'surface', 'large', 'polar']),
'LYN': set(['acyclic', 'neutral', 'surface', 'large', 'polar']),
'TYM': set(['cyclic', 'aromatic', 'surface', 'basic', 'large', 'polar']),
'ARN': set(['acyclic', 'neutral', 'surface', 'large', 'polar']),
'MSE': set(['acyclic', 'neutral', 'buried', 'large']),
'CME': set(['acyclic', 'neutral', 'buried', 'large']),
'SEC': set(['acyclic', 'neutral', 'buried', 'polar', 'medium']),
Expand Down
64 changes: 51 additions & 13 deletions prody/proteins/cifheader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@

from prody import LOGGER
from prody.atomic import flags, AAMAP
from prody.utilities import openFile, alignBioPairwise, GAP_PENALTY, GAP_EXT_PENALTY
from prody.atomic.atomic import invAAMAP
from prody.utilities import openFile, alignBioPairwise, GAP_EXT_PENALTY

from .localpdb import fetchPDB
from .header import (Chemical, Polymer, DBRef, _PDB_DBREF,
Expand Down Expand Up @@ -57,7 +58,7 @@ def _natomsFromFormulaPart(part):
return 1
return int("".join(digits))

def parseCIFHeader(pdb, *keys):
def parseCIFHeader(pdb, *keys, **kwargs):
"""Returns header data dictionary for *pdb*. This function is equivalent to
``parsePDB(pdb, header=True, model=0, meta=False)``, likewise *pdb* may be
an identifier or a filename.
Expand Down Expand Up @@ -119,14 +120,18 @@ def parseCIFHeader(pdb, *keys):
raise IOError('{0} is not a valid filename or a valid PDB '
'identifier.'.format(pdb))
pdb = openFile(pdb, 'rt')
header = getCIFHeaderDict(pdb, *keys)
header = getCIFHeaderDict(pdb, *keys, **kwargs)
pdb.close()
return header


def getCIFHeaderDict(stream, *keys):
def getCIFHeaderDict(stream, *keys, **kwargs):
"""Returns header data in a dictionary. *stream* may be a list of PDB lines
or a stream."""
or a stream.

Polymers have sequences that usually use one-letter residue name abbreviations by default.
To obtain long (usually three letter) abbrevations, set *longSeq* to **True**.
"""

try:
lines = stream.readlines()
Expand All @@ -139,11 +144,17 @@ def getCIFHeaderDict(stream, *keys):
keys = list(keys)
for k, key in enumerate(keys):
if key in _PDB_HEADER_MAP:
value = _PDB_HEADER_MAP[key](lines)
if key == 'polymers':
value = _PDB_HEADER_MAP[key](lines, **kwargs)
else:
value = _PDB_HEADER_MAP[key](lines)
keys[k] = value
else:
try:
value = _PDB_HEADER_MAP['others'](lines, key)
if key == 'polymers':
value = _PDB_HEADER_MAP[key](lines, **kwargs)
else:
value = _PDB_HEADER_MAP[key](lines)
keys[k] = value
except:
raise KeyError('{0} is not a valid header data identifier'
Expand Down Expand Up @@ -758,8 +769,11 @@ def _getReference(lines):
return ref


def _getPolymers(lines):
"""Returns list of polymers (macromolecules)."""
def _getPolymers(lines, **kwargs):
"""Returns list of polymers (macromolecules).

Sequence is usually one-letter abbreviations, but can be long
abbreviations (usually three letters) if *longSeq* is **True**"""

pdbid = _PDB_HEADER_MAP['identifier'](lines)
polymers = dict()
Expand All @@ -777,8 +791,28 @@ def _getPolymers(lines):
entities[entity].append(ch)
poly = polymers.get(ch, Polymer(ch))
polymers[ch] = poly
poly.sequence += ''.join(item[
'_entity_poly.pdbx_seq_one_letter_code_can'].replace(';', '').split())

longSeq = kwargs.get('longSeq', False)
if longSeq:
poly.sequence += ''.join(item[
'_entity_poly.pdbx_seq_one_letter_code'].replace(';', '').split())
else:
poly.sequence += ''.join(item[
'_entity_poly.pdbx_seq_one_letter_code_can'].replace(';', '').split())

if longSeq:
for poly in polymers.values():
seq = poly.sequence
resnames = []
for item in seq.split('('):
if item.find(')') != -1:
resnames.append(item[:item.find(')')])
letters = list(item[item.find(')')+1:])
else:
letters = list(item)
resnames.extend([invAAMAP[letter] for letter in letters])

poly.sequence = ' '.join(resnames)

# DBREF block 1
items2 = parseSTARSection(lines, '_struct_ref', report=False)
Expand Down Expand Up @@ -1237,13 +1271,17 @@ def _getOther(lines, key=None):
return data


def _getUnobservedSeq(lines):
def _getUnobservedSeq(lines, **kwargs):
"""Get sequence of unobserved residues.

This sequence is usually using one-letter residue name abbreviations by default.
To obtain long (usually three letter) abbrevations, set *longSeq* to **True**."""

key_unobs = '_pdbx_unobs_or_zero_occ_residues'

try:
unobs = parseSTARSection(lines, key_unobs, report=False)
polymers = _getPolymers(lines)
polymers = _getPolymers(lines, **kwargs)
except:
pass

Expand Down
32 changes: 24 additions & 8 deletions prody/proteins/header.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ def cleanString(string, nows=False):
return ' '.join(string.strip().split())


def parsePDBHeader(pdb, *keys):
def parsePDBHeader(pdb, *keys, **kwargs):
"""Returns header data dictionary for *pdb*. This function is equivalent to
``parsePDB(pdb, header=True, model=0, meta=False)``, likewise *pdb* may be
an identifier or a filename.
Expand Down Expand Up @@ -297,14 +297,17 @@ def parsePDBHeader(pdb, *keys):
raise IOError('{0} is not a valid filename or a valid PDB '
'identifier.'.format(pdb))
pdb = openFile(pdb, 'rt')
header, _ = getHeaderDict(pdb, *keys)
header, _ = getHeaderDict(pdb, *keys, **kwargs)
pdb.close()
return header


def getHeaderDict(stream, *keys):
def getHeaderDict(stream, *keys, **kwargs):
"""Returns header data in a dictionary. *stream* may be a list of PDB lines
or a stream."""
or a stream.

Polymers have sequences that usually use one-letter residue name abbreviations by default.
To obtain long (usually three letter) abbrevations, set *longSeq* to **True**."""

lines = defaultdict(list)
loc = 0
Expand All @@ -325,7 +328,10 @@ def getHeaderDict(stream, *keys):
keys = list(keys)
for k, key in enumerate(keys):
if key in _PDB_HEADER_MAP:
value = _PDB_HEADER_MAP[key](lines)
if key == 'polymers':
value = _PDB_HEADER_MAP[key](lines, **kwargs)
else:
value = _PDB_HEADER_MAP[key](lines)
keys[k] = value
else:
raise KeyError('{0} is not a valid header data identifier'
Expand Down Expand Up @@ -555,16 +561,26 @@ def _getReference(lines):
return ref


def _getPolymers(lines):
"""Returns list of polymers (macromolecules)."""
def _getPolymers(lines, **kwargs):
"""Returns list of polymers (macromolecules).

Polymers have sequences that usually use one-letter residue name abbreviations by default.
To obtain long (usually three letter) abbrevations, set *longSeq* to **True**."""

pdbid = lines['pdbid']
polymers = dict()
for i, line in lines['SEQRES']:
ch = line[11]
poly = polymers.get(ch, Polymer(ch))
polymers[ch] = poly
poly.sequence += ''.join(getSequence(line[19:].split()))

longSeq = kwargs.get('longSeq', False)
if longSeq:
if poly.sequence != '':
poly.sequence += ' '
poly.sequence += getSequence(line[19:].split(), **kwargs)
else:
poly.sequence += ''.join(getSequence(line[19:].split(), **kwargs))

for i, line in lines['DBREF ']:
i += 1
Expand Down
2 changes: 1 addition & 1 deletion prody/utilities/catchall.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,7 @@ def calcTree(names, distance_matrix, method='upgma', linkage=False):

method = method.lower().strip()

if method in ['ward', 'single', 'average', 'weighted', 'centroid', 'median']:
if method in ['ward', 'single', 'average', 'weighted', 'centroid', 'median', 'complete']:
from scipy.cluster.hierarchy import linkage as hlinkage
from scipy.spatial.distance import squareform

Expand Down
Loading