added python3 compatibility (for dlpisces.py and download_pdbs.py), f…

…ixed numerous packaging bugs preventing pip installation, and made the code slightly more modular.
jewettaij · Jan 25, 2021 · 19a95fd · 19a95fd
1 parent c4613c9
commit 19a95fd
Show file tree

Hide file tree

Showing 9 changed files with 367 additions and 134 deletions.
diff --git a/dlpdb/__init__.py b/dlpdb/__init__.py
@@ -3,18 +3,20 @@
 # Hence, only a few of the modules contain useful functions that can be
 # accessed from within python.  They are below:
 from .resid import ResID
-from .closest_points import ClosestPoints
+from .closest_line_points import ClosestLinePoints
 from .coords2angles import Coords2AnglesLengths, Coords2Angles
 from .coords2dihedrals import Coords2DihedralsAnglesLengths, Coords2Dihedrals
-from .coords2dihedrals_inner import Coords2DihedralsLengthsInner, Coords2DihedralsInner
+from .coords2dihedrals_inner import Coords2DihedralsInnerLengths, Coords2DihedralsInner
+from .helixAngleOmega import CalcOmegaFromThetaPhi, CalcOmega
 
 # I no longer remember why I import "main" from the executable scripts.
 # Perhaps these next few lines are unnecessary, but they seem to do no harm:
 from .coords2angles import main
 from .coords2dihedrals_inner import main
 from .coords2dihedrals import main
 from .coords2distances import main
-from .coords2helixAngleOmega.py import main
+from .coords2helixAngleOmega import main
+from .dlpdbfile import main
 from .dlpisces import main
 from .dna_interleave_residues import main
 from .download_pdbs import main
@@ -26,7 +28,6 @@
 from .has_secondary_str import main
 from .has_sheets import main
 from .has_turns import main
-from .helixAngleOmega import main
 from .merge_lines_periodic import main
 from .pdb2coords_ave import main
 from .pdb2coords import main
@@ -46,7 +47,7 @@
            'coords2dihedrals_inner',
            'coords2dihedrals',
            'coords2distances',
-           'coords2helixAngleOmega.py',
+           'coords2helixAngleOmega',
            'dlpisces',
            'dna_interleave_residues',
            'download_pdbs',

diff --git a/dlpdb/closest_points.py → dlpdb/closest_line_points.py b/dlpdb/closest_points.py → dlpdb/closest_line_points.py
@@ -47,7 +47,7 @@ def PrintVect(v):
     sys.stdout.write('\n')
 
 
-def ClosestPoints(ra0, rb0, va, vb):
+def ClosestLinePoints(ra0, rb0, va, vb):
     """
     Return the closest pair of points in 3D along two lines of infinite length
     pointing in directions va,vb and passing through points ra0,rb0.

diff --git a/dlpdb/coords2dihedrals_inner.py b/dlpdb/coords2dihedrals_inner.py
@@ -61,11 +61,11 @@
 import sys
 from math import sqrt, cos, sin, tan, acos, asin, atan, pi, floor
 try:
-    from .closest_points import ClosestPoints
+    from .closest_line_points import ClosestLinePoints
     from .coords2dihedrals import Coords2DihedralsAnglesLengths,Coords2Dihedrals
 except (ImportError, SystemError, ValueError):
     # not installed as a package
-    from closest_points import ClosestPoints
+    from closest_line_points import ClosestLinePoints
     from coords2dihedrals import Coords2DihedralsAnglesLengths,Coords2Dihedrals
 
 

diff --git a/dlpdb/coords2helixAngleOmega.py b/dlpdb/coords2helixAngleOmega.py
@@ -8,32 +8,16 @@
 """
 
 
-from math import sqrt, cos, sin, tan, acos, asin, atan, pi
 import sys
-from helixAngleOmega import CalcOmega
-
-
-
-def length_v(r):
-    lsqd = 0.0
-    for d in range(0,len(r)):
-        lsqd += r[d]*r[d]
-    return sqrt(lsqd)
-
+from math import sqrt, cos, sin, tan, acos, asin, atan, pi
+try:
+    from .helixAngleOmega import CalcOmegaFromThetaPhi, CalcOmega
+except (ImportError, SystemError, ValueError):
+    # not installed as a package
+    from helixAngleOmega import CalcOmegaFromThetaPhi, CalcOmega
 
-def inner_prod_v(r1,r2):
-    result = 0.0
-    for d in range(0,len(r1)):
-        result += r1[d]*r2[d]
-    return result
 
 
-def cross_prod_v3(a,b):
-    c = [0.0,0.0,0.0]
-    c[0] = a[1]*b[2] - a[2]*b[1]
-    c[1] = a[2]*b[0] - a[0]*b[2]
-    c[2] = a[0]*b[1] - a[1]*b[0]
-    return c
 
 
 def main():
@@ -83,69 +67,7 @@ def main():
 
         if (len(r_i[i])==3)and(len(r_i[i+1])==3)and(len(r_i[i+2])==3)and(len(r_i[i+3])==3):
 
-            r10 = [0.0, 0.0, 0.0]
-            r21 = [0.0, 0.0, 0.0]
-            r32 = [0.0, 0.0, 0.0]
-            for d in range(0,3):
-                r10[d] = r_i[i+1][d] - r_i[i][d]
-                r21[d] = r_i[i+2][d] - r_i[i+1][d]
-                r32[d] = r_i[i+3][d] - r_i[i+2][d]
-            l10 = length_v(r10)
-            l21 = length_v(r21)
-            l32 = length_v(r32)
-
-            n012 = cross_prod_v3(r10, r21)
-            n123 = cross_prod_v3(r21, r32)
-
-
-            # The torsion-angle or 4-body angle is named "angle0124"
-            cos_phi = inner_prod_v(n012, n123) /(length_v(n012)*length_v(n123))
-
-            # There is a problem whenever 4 consecutive atoms are coplanar:
-            #
-            #            *---*
-            #                |      (all 4 atoms are coplanar, and phi = 0)
-            #            *---*
-            #
-            # In this example, the torsion angle phi is well defined and =0.
-            # The problem is that, due to floating point roundoff
-            # "cos_phi" can sometimes slightly exceed 1.
-            # This causes a NAN when you calculate acos(cos_phi).
-
-            if (cos_phi > 1.0):
-                cos_phi = 1.0
-            elif (cos_phi < -1.0):
-                cos_phi = -1.0
-
-            phi = acos(cos_phi)
-
-            # This formula does not distinguish positive and negative phi.
-            #
-            # Negative torsion angles:
-            #
-            # Check if  the position of atom i+3 is above the phi=0 plane
-            # (in the region of positive phi), or below the phi=0 plane.
-            # It is above the phi=0 plane if the bond from atom i+2 to i+3
-            # points in the same direction as the negative-phi-tangent-vector 
-            # for atom i (not i+3)  (...which points in the n012 direction)
-            if inner_prod_v(n012, r32) < 0.0:
-                phi = -phi
-
-            # The two bond-angles or 3-body angles are named "angle012" and "angle123"
-            angle012 = acos( -inner_prod_v(r10, r21) / (l10 * l21) )
-            angle123 = acos( -inner_prod_v(r21, r32) / (l21 * l32) )
-            # (The negative sign above comes from the fact that we are really 
-            #  interested in the angle between r21 and r01 (which is -r10).)
-
-            # Convert these angles to degrees, and print it out
-            #sys.stdout.write(str(angle012*180.0/pi)+' ')
-            #sys.stdout.write(str(angle123*180.0/pi)+' ')
-
-            # Let theta = the average of the two 3-body angles
-            theta = 0.5 * (angle012 + angle123) 
-
-            # Omega (usually 360/3.6 ~= 100 degrees) is the helix rotation angle.
-            Omega = CalcOmega(theta, phi)
+            Omega = CalcOmega(r_i[i], r_i[i+1], r_i[i+2], r_i[i+3])
 
             sys.stdout.write(str(Omega*180.0/pi))
 

diff --git a/dlpdb/dlpdbfile.py b/dlpdb/dlpdbfile.py
@@ -0,0 +1,218 @@
+#!/usr/bin/env python
+
+"""
+ Typical usage (example):
+
+    dlpdbfile.py < list_of_pdb_codes.txt
+
+Where the input file contains a list of pdb-codes, one per line:
+100d
+101d
+102d
+103d
+104d
+107d
+108d
+109d
+ Every line in this file contains a 4-letter pdb identifier.
+Each 4-letter pdb code is compared with a list of pdb codes downloaded so far
+(which is initially read from the file "pdbs_most_recent.txt", if present).
+If it is new 4-letter pdb code, then the corresponding pdb file is downloaded,
+and appended to the end of the "pdbs_most_recent.txt" file.
+The corresponding DSSP file is also downloaded, if available.
+
+"""
+
+# author: Andrew Jewett
+g_program_name = __file__.split('/')[-1]
+g_date_str = '2021-1-24'
+g_version_str = '0.6.0'
+
+
+import sys, urllib, urllib.request, random, time, gzip
+
+
+def FileExists(fname):
+    try:
+        file = open(fname, 'r')
+    except IOError:
+        exists = False
+    else:
+        exists = True
+    return exists
+
+
+
+def ExtractFileName(url):
+    # The name of the file we create will be the same as the last few 
+    # characters of the URL. (Whatever comes after the last '/' character.)
+    out_fname = url
+    slash_loc = out_fname.rfind('/')
+    if (slash_loc >= 0):  # if '/' was found, then remove text preceeding '/'
+        out_fname = out_fname[slash_loc+1:]
+        if (out_fname == ''):
+            sys.stderr.write('Error: filename difficult to determine from URL:\n' +\
+                                 '       \"'+url+'\"\n')
+            sys.exit(-1)
+    return out_fname
+
+
+
+# This function downloads a file from a URL, and (if requested by the user)
+# prints out a warning message if there was a problem.
+
+import os
+
+def DownloadFileTo(url, file_name, verbose_mode = True):
+    # Open the input file (at url) for reading
+    # Simple way:
+    #   in_file = urllib.urlopen(url)
+    # But the web site may return garbage if the file doesn't exist.
+    # Instead, we check for errors first:
+    try: in_file = urllib.request.urlopen(url)
+    except urllib.error.URLError as e:
+        if (verbose_mode):
+            sys.stderr.write(str(e)+'\n')
+            sys.stderr.write('   omitting file \"'+file_name+'\"\n')
+    else:
+        if (verbose_mode):
+            sys.stderr.write('downloading file \"'+file_name+'\"\n')
+        # Open the output file for writing
+        out_file = open(file_name, 'wb')
+        out_file.write(in_file.read())
+        out_file.close()
+
+
+
+# This version infers the file name from the URL.
+def DownloadFile(url, verbose_mode = True):
+    DownloadFileTo(url, ExtractFileName(url), verbose_mode)
+
+
+
+
+# Read in the list of PDB files already downloaded:
+
+pdbs_old = set([])
+try:
+    pdbs_old_file = open('pdbs_old.txt', 'r')
+    for pdb_code in pdbs_old_file:
+        pdb_code = pdb_code.strip()#eliminate trailing and preceeding whitespace
+        pdb_code = pdb_code.lower()#pdb codes are case-insensitive
+        if (len(pdb_code) != 4):
+            sys.stderr.write('Error in in \"pdbs_old.txt\":\n')
+            sys.stderr.write('      Invalid PDB-code: \"'+pdb_code+'\"\n')
+            exit(-1)
+        pdbs_old.add(pdb_code)
+    pdbs_old_file.close()
+except IOError:
+    pass
+
+
+
+
+def main():
+    # Here we keep track of the list of pdb files which
+    #  i) are in the current list of pdb files requested in 
+    #     sys.stdin
+    # ii) are in the current list, but are not downloaded yet 
+    #     (new pdb files)
+    pdbs_current_file = open('pdbs_most_recent.txt', 'w')
+    pdbs_current = set([]) #entire list of pdb codes requested
+    pdbs_new  = set([]) #a list of new pdb codes requested that were not in the old list
+    pdbs_old_file = open('pdbs_old.txt', 'a')
+
+
+    for line in sys.stdin:
+        line = line.strip()
+        if len(line) == 0:
+            continue
+        pdb_code = line.lower()  #(pdb codes are case-insensitive)
+
+        if (pdb_code in pdbs_new):
+            sys.stderr.write(pdb_code+' appears redundantly. skipping\n')
+        elif (pdb_code in pdbs_old):
+            if (not (pdb_code in pdbs_current)):
+                sys.stderr.write(pdb_code+' downloaded already. skipping.\n')
+                pdbs_current.add(pdb_code)
+                pdbs_current_file.write(pdb_code+'\n')
+                pdbs_current_file.flush()  #<- necessary in case we get interrupted
+            else:
+                sys.stderr.write(pdb_code+' appears redundantly. skipping\n')
+
+        else:
+            #Download the corresponding pdb file
+            file_name = pdb_code+'.pdb.gz' # <- these are compressed files
+            url = 'http://www.rcsb.org/pdb/files/'+file_name
+            # Note, if the URL above files, try this one instead:
+            # url = 'https://files.rcsb.org/download/'+file_name
+            DownloadFileTo(url, file_name)
+
+            #Unzip the pdb file:
+            with gzip.open(file_name, 'rb') as f:
+                file_content = f.read()
+                f.close()
+            f = open(pdb_code+'.pdb', 'wb')
+            f.write(file_content)
+            if not FileExists(pdb_code+'.pdb'):
+                sys.stderr.write('Error: A problem occured when trying to download PDB code \"'+pdb_code+'\"\n'
+                                 '       Delete this entry from the file \"pdbs_old.txt\", and rerun '+g_program_name+'\n')
+                sys.exit(-1)
+
+            #Optional: Download the corresponding DSSP file
+            url = 'ftp://ftp.cmbi.ru.nl/pub/molbio/data/dssp/'+pdb_code+'.dssp'
+            DownloadFile(url)
+            if not FileExists(pdb_code+'.dssp'):
+                sys.stderr.write("    (The old DSSP PDB is server flaking out again.\n"
+                                 "     Don't worry.  DSSP files are not needed.)\n")
+
+            #Keep track of the the pdbs we have downloaded so far:
+
+            pdbs_current.add(pdb_code)
+            pdbs_current_file.write(pdb_code+'\n')
+            pdbs_current_file.flush()  #<- necessary in case we get interrupted
+            pdbs_old_file.write(pdb_code+'\n')
+            pdbs_old_file.flush()      #<- necessary in case we get interrupted
+            pdbs_new.add(pdb_code)
+
+
+    pdbs_current_file.close()
+
+    def ChainFromPDBfile(chainID, pdb_file):
+        for line in pdb_file:
+            line_type = line[0:6]
+            if line_type in set(['ATOM  ', 'HETATM', 'ANISOU', 'SIGATM', 'SIGUIJ']):
+                if line[21:22] == chainID:
+                    yield line
+            elif line_type == "HET   ":
+                if line[12:13] == chainID:
+                    yield line
+            elif line_type == "HELIX ":
+                initChainID = line[19:20]
+                endChainID  = line[31:32]
+                if (initChainID == chainID) or (endChainID == chainID):
+                    yield line
+            elif line_type == "SHEET ":
+                initChainID = line[21:22]
+                endChainID  = line[32:33]
+                if (initChainID == chainID) or (endChainID == chainID):
+                    yield line
+            elif line_type == "TURN  ":
+                initChainID = line[19:20]
+                endChainID  = line[30:31]
+                if (initChainID == chainID) or (endChainID == chainID):
+                    yield line
+            elif line_type == "SEQRES":
+                if line[11:12] == chainID:
+                    yield line
+            elif line_type == "TER   ":
+                ter_chainID = line[21:22]
+                if (ter_chainID == chainID) or (ter_chainID == " "):
+                    yield line
+            else:
+                yield line
+        return
+
+
+if __name__ == "__main__":
+    main()