Merge branch 'prody:master' into interactions

karolamik13 · Nov 9, 2023 · cfa2128 · cfa2128
2 parents 05b6fdf + 0a0c500
commit cfa2128
Show file tree

Hide file tree

Showing 8 changed files with 89 additions and 43 deletions.
diff --git a/prody/proteins/ciffile.py b/prody/proteins/ciffile.py
@@ -19,6 +19,8 @@
 from .cifheader import getCIFHeaderDict
 from .header import buildBiomolecules, assignSecstr, isHelix, isSheet
 
+from string import ascii_uppercase
+
 __all__ = ['parseMMCIFStream', 'parseMMCIF', 'parseCIF']
 
 
@@ -86,7 +88,7 @@ def parseMMCIF(pdb, **kwargs):
     auto_bonds = SETTINGS.get('auto_bonds')
     get_bonds = kwargs.get('bonds', auto_bonds)
     if get_bonds:
-        LOGGER.warn('Parsing struct_conn information from mmCIF is current unsupported and no bond information is added to the results')
+        LOGGER.warn('Parsing struct_conn information from mmCIF is currently unsupported and no bond information is added to the results')
     if not os.path.isfile(pdb):
         if len(pdb) == 5 and pdb.isalnum():
             if chain is None:
@@ -105,8 +107,12 @@ def parseMMCIF(pdb, **kwargs):
 
             if os.path.isfile(pdb + '.cif'):
                 filename = pdb + '.cif'
+                LOGGER.debug('CIF file is found in working directory ({0}).'
+                            .format(filename))
             elif os.path.isfile(pdb + '.cif.gz'):
                 filename = pdb + '.cif.gz'
+                LOGGER.debug('CIF file is found in working directory ({0}).'
+                            .format(filename))
             else:
                 filename = fetchPDB(pdb, report=True,
                                     format='cif', compressed=False)
@@ -300,6 +306,7 @@ def _parseMMCIFLines(atomgroup, lines, model, chain, subset,
     doneAtomBlock = False
     start = 0
     stop = 0
+    warnedAltloc = False
     while not doneAtomBlock:
         line = lines[i]
         if line[:11] == '_atom_site.':
@@ -431,7 +438,7 @@ def _parseMMCIFLines(atomgroup, lines, model, chain, subset,
                 continue
 
         alt = line.split()[fields['label_alt_id']]
-        if alt not in which_altlocs and which_altlocs != 'all':
+        if not (alt in which_altlocs or ascii_uppercase[int(alt)-1] in which_altlocs) and which_altlocs != 'all':
             continue
 
         if alt == '.':
@@ -505,12 +512,8 @@ def _parseMMCIFLines(atomgroup, lines, model, chain, subset,
 
     anisou = None
     siguij = None
-    try:
-        data = parseSTARSection(lines, "_atom_site_anisotrop")
-        x = data[0] # check if data has anything in it
-    except IndexError:
-        LOGGER.warn("No anisotropic B factors found")
-    else:
+    data = parseSTARSection(lines, "_atom_site_anisotrop", report=False)
+    if len(data) > 0:
         anisou = np.zeros((acount, 6),
                           dtype=float)
 

diff --git a/prody/proteins/cifheader.py b/prody/proteins/cifheader.py
@@ -178,8 +178,8 @@ def _getBiomoltrans(lines):
     # 2 blocks are needed for this:
     # _pdbx_struct_assembly_gen: what to apply to which chains
     # _pdbx_struct_oper_list: everything else
-    data1 = parseSTARSection(lines, '_pdbx_struct_assembly_gen')
-    data2 = parseSTARSection(lines, '_pdbx_struct_oper_list')
+    data1 = parseSTARSection(lines, '_pdbx_struct_assembly_gen', report=False)
+    data2 = parseSTARSection(lines, '_pdbx_struct_oper_list', report=False)
 
     # extracting the data
     for n, item1 in enumerate(data1):
@@ -225,7 +225,7 @@ def _getRelatedEntries(lines):
 
     try:
         key = "_pdbx_database_related"
-        data = parseSTARSection(lines, key)
+        data = parseSTARSection(lines, key, report=False)
         for item in data:
             dbref = DBRef()
             dbref.accession = item[key + ".db_id"]
@@ -715,8 +715,8 @@ def _getReference(lines):
 
     # JRNL double block. Blocks 6 and 7 as copied from COMPND
     # Block 1 has most info. Block 2 has author info
-    items1 = parseSTARSection(lines, "_citation")
-    items2 = parseSTARSection(lines, "_citation_author")
+    items1 = parseSTARSection(lines, "_citation", report=False)
+    items2 = parseSTARSection(lines, "_citation_author", report=False)
 
     for row in items1:
         for k, value in row.items():
@@ -767,7 +767,7 @@ def _getPolymers(lines):
     entities = defaultdict(list)
 
     # SEQRES block
-    items1 = parseSTARSection(lines, '_entity_poly')
+    items1 = parseSTARSection(lines, '_entity_poly', report=False)
 
     for item in items1:
         chains = item['_entity_poly.pdbx_strand_id']
@@ -781,7 +781,7 @@ def _getPolymers(lines):
                 '_entity_poly.pdbx_seq_one_letter_code_can'].replace(';', '').split())
 
     # DBREF block 1
-    items2 = parseSTARSection(lines, '_struct_ref')
+    items2 = parseSTARSection(lines, '_struct_ref', report=False)
 
     for item in items2:
         entity = item["_struct_ref.id"]
@@ -798,7 +798,7 @@ def _getPolymers(lines):
             poly.dbrefs.append(dbref)
 
     # DBREF block 2
-    items3 = parseSTARSection(lines, "_struct_ref_seq")
+    items3 = parseSTARSection(lines, "_struct_ref_seq", report=False)
 
     for i, item in enumerate(items3):
         i += 1
@@ -884,7 +884,7 @@ def _getPolymers(lines):
             last = temp
 
     # MODRES block
-    data4 = parseSTARSection(lines, "_pdbx_struct_mod_residue")
+    data4 = parseSTARSection(lines, "_pdbx_struct_mod_residue", report=False)
 
     for data in data4:
         ch = data["_pdbx_struct_mod_residue.label_asym_id"]
@@ -904,7 +904,7 @@ def _getPolymers(lines):
                                 data["_pdbx_struct_mod_residue.details"]))
 
     # SEQADV block
-    data5 = parseSTARSection(lines, "_struct_ref_seq_dif")
+    data5 = parseSTARSection(lines, "_struct_ref_seq_dif", report=False)
 
     for i, data in enumerate(data5):
         ch = data["_struct_ref_seq_dif.pdbx_pdb_strand_id"]
@@ -964,8 +964,8 @@ def _getPolymers(lines):
 
     # COMPND double block. 
     # Block 6 has most info. Block 7 has synonyms
-    data6 = parseSTARSection(lines, "_entity")
-    data7 = parseSTARSection(lines, "_entity_name_com")
+    data6 = parseSTARSection(lines, "_entity", report=False)
+    data7 = parseSTARSection(lines, "_entity_name_com", report=False)
 
     dict_ = {}
     for molecule in data6:
@@ -1045,7 +1045,7 @@ def _getChemicals(lines):
     # 1st block we need is has info about location in structure
 
     # this instance only includes single sugars not branched structures
-    items = parseSTARSection(lines, "_pdbx_nonpoly_scheme")
+    items = parseSTARSection(lines, "_pdbx_nonpoly_scheme", report=False)
 
     for data in items:
         resname = data["_pdbx_nonpoly_scheme.mon_id"]
@@ -1064,7 +1064,7 @@ def _getChemicals(lines):
         chemicals[chem.resname].append(chem)
 
     # next we get the equivalent one for branched sugars part
-    items = parseSTARSection(lines, "_pdbx_branch_scheme")
+    items = parseSTARSection(lines, "_pdbx_branch_scheme", report=False)
 
     for data in items:
         resname = data["_pdbx_branch_scheme.mon_id"]
@@ -1080,7 +1080,7 @@ def _getChemicals(lines):
         chemicals[chem.resname].append(chem)
 
     # 2nd block to get has general info e.g. name and formula
-    items = parseSTARSection(lines, "_chem_comp")
+    items = parseSTARSection(lines, "_chem_comp", report=False)
 
     for data in items:
         resname = data["_chem_comp.id"]
@@ -1155,7 +1155,7 @@ def _getTitle(lines):
     title = ''
 
     try:
-        data = parseSTARSection(lines, "_struct")
+        data = parseSTARSection(lines, "_struct", report=False)
         for item in data:
             title += item['_struct.title'].upper()
     except:
@@ -1172,7 +1172,7 @@ def _getAuthors(lines):
     authors = []
 
     try:
-        data = parseSTARSection(lines, "_audit_author")
+        data = parseSTARSection(lines, "_audit_author", report=False)
         for item in data:
             author = ''.join(item['_audit_author.name'].split(', ')[::-1])
             authors.append(author.upper())
@@ -1192,7 +1192,7 @@ def _getSplit(lines):
     key = "_pdbx_database_related"
 
     try:
-        data, _ = parseSTARSection(lines, key)
+        data, _ = parseSTARSection(lines, key, report=False)
         for item in data:
             if item[key + '.content_type'] == 'split':
                 split.append(item[key + '.db_id'])
@@ -1227,7 +1227,7 @@ def _getOther(lines, key=None):
     data = []
 
     try:
-        data = parseSTARSection(lines, key)
+        data = parseSTARSection(lines, key, report=False)
     except:
         pass
 
@@ -1242,7 +1242,7 @@ def _getUnobservedSeq(lines):
     key_unobs = '_pdbx_unobs_or_zero_occ_residues'
 
     try:
-        unobs = parseSTARSection(lines, key_unobs)
+        unobs = parseSTARSection(lines, key_unobs, report=False)
         polymers = _getPolymers(lines)
     except:
         pass

diff --git a/prody/proteins/emdfile.py b/prody/proteins/emdfile.py
@@ -72,8 +72,12 @@ def parseEMD(emd, **kwargs):
 
             if os.path.isfile(emd + '.map'):
                 filename = emd + '.map'
+                LOGGER.debug('EMD file is found in working directory ({0}).'
+                            .format(filename))
             elif os.path.isfile(emd + '.map.gz'):
                 filename = emd + '.map.gz'
+                LOGGER.debug('EMD file is found in working directory ({0}).'
+                            .format(filename))
             else:
                 filename = fetchPDB(emd, report=True,
                                     format='emd', compressed=False)
@@ -91,6 +95,14 @@ def parseEMD(emd, **kwargs):
     result = parseEMDStream(emdStream, **kwargs)
     emdStream.close()
 
+    if hasattr(result, 'numAtoms'):
+        LOGGER.info('Output is an AtomGroup with {0} atoms fitted.'.format(result.numAtoms()))
+    elif hasattr(result, 'apix'):
+        LOGGER.info('Output is an EMDMAP with {:4.2f} A/pix.'.format(result.apix[0]))
+    else:
+        LOGGER.warn('Atomic data could not be parsed, please '
+                    'check the input file.')
+
     return result
 
 
@@ -128,8 +140,6 @@ def parseEMDStream(stream, **kwargs):
     else:
         make_nodes = False
         map = True
-        LOGGER.info('As n_nodes is less than or equal to 0, no nodes will be'
-                    ' made and the raw map will be returned')
 
     emd = EMDMAP(stream, min_cutoff, max_cutoff)
 

diff --git a/prody/proteins/localpdb.py b/prody/proteins/localpdb.py
@@ -212,16 +212,15 @@ def fetchPDB(*pdb, **kwargs):
     if len(pdb) == 1 and isinstance(pdb[0], list):
         pdb = pdb[0]
 
-    if 'format' in kwargs and kwargs.get('format') != 'pdb':
-        return fetchPDBviaFTP(*pdb, **kwargs)
-
     identifiers = checkIdentifiers(*pdb)
 
     folder = kwargs.get('folder', '.')
     compressed = kwargs.get('compressed')
+    format_ = kwargs.get('format')
 
     # check *folder* specified by the user, usually pwd ('.')
-    filedict = findPDBFiles(folder, compressed=compressed)
+    filedict = findPDBFiles(folder, compressed=compressed, 
+                            format=format_)
 
     filenames = []
     not_found = []
@@ -240,8 +239,8 @@ def fetchPDB(*pdb, **kwargs):
         if len(filenames) == 1:
             filenames = filenames[0]
             if exists:
-                LOGGER.debug('PDB file is found in working directory ({0}).'
-                             .format(sympath(filenames)))
+                LOGGER.debug('{0} file is found in working directory ({1}).'
+                             .format(format_.upper(), sympath(filedict[pdb])))
         return filenames
 
     if not isWritable(folder):
@@ -414,6 +413,8 @@ def iterPDBFilenames(path=None, sort=False, unique=True, **kwargs):
 
     from re import compile, IGNORECASE
 
+    format = kwargs.get('format')
+
     if path is None or kwargs.get('mirror') is True:
         if path is None:
             path = pathPDBMirror()
@@ -436,10 +437,20 @@ def iterPDBFilenames(path=None, sort=False, unique=True, **kwargs):
         compressed = kwargs.get('compressed')
         if compressed is None:
             pdbext = compile('\.(pdb|ent)(\.gz)?$', IGNORECASE)
+            cifext = compile('\.(cif)(\.gz)?$', IGNORECASE)
+            emdext = compile('\.(emd|map|mrc)(\.gz)?$', IGNORECASE)
         elif compressed:
             pdbext = compile('\.(pdb|ent)\.gz$', IGNORECASE)
+            cifext = compile('\.(cif)\.gz$', IGNORECASE)
+            emdext = compile('\.(emd|map|mrc)\.gz$', IGNORECASE)
         else:
             pdbext = compile('\.(pdb|ent)$', IGNORECASE)
+            cifext = compile('\.(cif)$', IGNORECASE)
+            emdext = compile('\.(emd|map|mrc)$', IGNORECASE)
+        if format == 'cif':
+            pdbext = cifext
+        if format == 'emd':
+            pdbext = emdext
         pdbs = [pdb for pdb in iglob(join(path, '*')) if pdbext.search(pdb)]
         if sort:
             pdbs.sort(reverse=kwargs.get('reverse'))
@@ -476,7 +487,7 @@ def findPDBFiles(path, case=None, **kwargs):
         pdb = splitext(split(fn)[1])[0]
         ending = splitext(splitext(split(fn)[1])[0])[1]
         if ending == 'gz':
-            pdb = splittext(pdb)[0]
+            pdb = splitext(pdb)[0]
         if len(pdb) == 7 and pdb.startswith('pdb'):
             pdb = pdb[3:]
         if upper:

diff --git a/prody/proteins/mmtffile.py b/prody/proteins/mmtffile.py
@@ -323,6 +323,9 @@ def set_info(atomgroup, mmtf_data,get_bonds=False,altloc_sel='A'):
     if altloc_sel != 'all':
         #mask out any unwanted alternative locations
         mask = (altlocs == '') | (altlocs == altloc_sel)
+
+    if np.all(mask == False):
+        mask = (altlocs == '') | (altlocs == altlocs[0])
 
     atomgroup.setCoords(coords[:,mask])
     atomgroup.setNames(atom_names[mask])

diff --git a/prody/proteins/pdbfile.py b/prody/proteins/pdbfile.py
@@ -1242,8 +1242,7 @@ def writePDBStream(stream, atoms, csets=None, **kwargs):
         Default is **False**, which means using hexadecimal instead.
         NB: ChimeraX seems to prefer hybrid36 and may have problems with hexadecimal.
     :type hybrid36: bool
-    """
-    initialACSI = atoms.getACSIndex()
+    """    
     renumber = kwargs.get('renumber', True)
 
     remark = str(atoms)
@@ -1262,8 +1261,10 @@ def writePDBStream(stream, atoms, csets=None, **kwargs):
     if coordsets is None:
         raise ValueError('atoms does not have any coordinate sets')
 
+    had_atoms = False
     try:
         acsi = atoms.getACSIndex()
+        had_atoms = True
     except AttributeError:
         try:
             atoms = atoms.getAtoms()
@@ -1438,7 +1439,8 @@ def writePDBStream(stream, atoms, csets=None, **kwargs):
     num_ter_lines = 0
     for m, coords in enumerate(coordsets):
 
-        atoms.setACSIndex(m)
+        if had_atoms:
+            atoms.setACSIndex(m)
         anisous = atoms._getAnisous()
         if anisous is not None:
             anisous = np.array(anisous * 10000, dtype=int)
@@ -1614,7 +1616,8 @@ def writePDBStream(stream, atoms, csets=None, **kwargs):
 
     write('END   ' + " "*74 + '\n')
 
-    atoms.setACSIndex(initialACSI)
+    if had_atoms:
+        atoms.setACSIndex(acsi)
 
 writePDBStream.__doc__ += _writePDBdoc
 

diff --git a/prody/proteins/starfile.py b/prody/proteins/starfile.py
@@ -1026,7 +1026,7 @@ def parseImagesFromSTAR(particlesSTAR, **kwargs):
     return np.array(images), parsed_images_data
 
 
-def parseSTARSection(lines, key):
+def parseSTARSection(lines, key, report=True):
     """Parse a section of data from *lines* from a STAR file 
     corresponding to a *key* (part before the dot). 
     This can be a loop or data block.
@@ -1077,7 +1077,8 @@ def parseSTARSection(lines, key):
         else:
             data = [loop_dict["data"]]
     else:
-        LOGGER.warn("Could not find {0} in lines.".format(key))
+        if report:
+            LOGGER.warn("Could not find {0} in lines.".format(key))
         return []
 
     return data