From aab01281d04466a81cdc13a2870999c6dbeb2954 Mon Sep 17 00:00:00 2001
From: David K
Date: Mon, 20 Jul 2020 22:06:49 +0000
Subject: [PATCH 1/8] Made changes to auth.py file

Made changes to auth.py so that it gets the token and saves it to the
.htrc file, where it can then be read by htrc/volumes/__init__.py. I am
getting an API timeout error from the download_volumes function, but
cannot figure out why.
---
 htrc/auth.py   | 41 ++++++++++++++++++++++++-----------------
 htrc/config.py | 20 ++++++++++----------
 2 files changed, 34 insertions(+), 27 deletions(-)

diff --git a/htrc/auth.py b/htrc/auth.py
index a24255d..53c2290 100644
--- a/htrc/auth.py
+++ b/htrc/auth.py
@@ -3,32 +3,39 @@
 import http.client
 import ssl
 import time
-
+import subprocess
 import requests
 import requests.auth
+import configparser
 
 import htrc.config
 
 
 def get_jwt_token():
     # Currently we just store one common jwt token locally at .htrc file for simplicity
     # Expect to add POST method to query unique jwt token with the combo of username and password
-    username, password = credential_prompt()
-
-    client_id, client_secret = htrc.config.get_credentials()
-
-    auth = requests.auth.HTTPBasicAuth(client_id, client_secret)
-    data = { "grant_type": "password",
-    "username": username,
-    "password": password,
-    "scope" : "openid"}
-
-    url = htrc.config.get_idp_url()
-    r = requests.post(url, data=data, auth=auth)
+    #username, password = credential_prompt()
+
+    #client_id, client_secret = htrc.config.get_credentials()
+
+    #auth = requests.auth.HTTPBasicAuth(client_id, client_secret)
+    #data = { "grant_type": "password",
+    #"username": username,
+    #"password": password,
+    #"scope" : "openid"}
+
+    url1 = htrc.config.get_idp_url()
+    capsule_id = htrc.config._get_value("jwt", "capsule_id")
+    result = subprocess.check_output(['hostname', '-s', '-I'])
+    result = result.decode('utf-8')
+    result = result[:-1]
+    capsule_ip = result.strip()
+    url = url1 + "/" + capsule_id + "/" + capsule_ip
+    r = requests.get(url, verify = False)
 
     data = r.json()
     if 'error' not in data:
-        expiration = int(time.time()) + data['expires_in']
-        return data['id_token'], expiration
+        #expiration = int(time.time()) + data['expires_in']
+        return data['token']
     elif data['error'] == 'invalid_grant':
         print("Invalid username or password. Please try again.\n")
         return get_jwt_token()
@@ -51,5 +58,5 @@ def credential_prompt():
 
 if __name__ == '__main__':
-    token, expiration = get_jwt_token()
-    htrc.config.save_jwt_token(token, expiration)
+    token = get_jwt_token()
+    htrc.config.save_jwt_token(token)
diff --git a/htrc/config.py b/htrc/config.py
index ccd7d54..deb0ffe 100644
--- a/htrc/config.py
+++ b/htrc/config.py
@@ -81,19 +81,19 @@ def get_jwt_token(path=None):
         token = _get_value('jwt', 'token', path)
 
         # check expiration date
-        expiration = int(_get_value('jwt', 'expiration', path))
-        if time.time() > expiration:
-            raise RuntimeError("JWT token expired.")
+        #expiration = int(_get_value('jwt', 'expiration', path))
+        #if time.time() > expiration:
+            #raise RuntimeError("JWT token expired.")
     except:
         # This should run on either a missing or expired token.
         import htrc.auth
-        token, expiration = htrc.auth.get_jwt_token()
-        htrc.config.save_jwt_token(token, expiration, path)
+        token = htrc.auth.get_jwt_token()
+        htrc.config.save_jwt_token(token, path)
 
     return token
 
-def save_jwt_token(token, expiration=None, path=None):
+def save_jwt_token(token, path=None):
     """
     Saves JWT token in the config file.
""" @@ -102,8 +102,8 @@ def save_jwt_token(token, expiration=None, path=None): path = DEFAULT_PATH # Default to expiration of now - force a new token on next request - if expiration is None: - expiration = time.time() + #if expiration is None: + #expiration = time.time() # Open and modify existing config file, if it exists. config = ConfigParser(allow_no_value=True) @@ -114,7 +114,7 @@ def save_jwt_token(token, expiration=None, path=None): # set token and expiration config.set('jwt', 'token', token) - config.set('jwt', 'expiration', expiration) + #config.set('jwt', 'expiration', expiration) with open(path, 'w') as credential_file: config.write(credential_file) @@ -137,7 +137,7 @@ def remove_jwt_token(path=None): config.add_section('jwt') # set token and expiration config.set('jwt', 'token', " ") - config.set('jwt', 'expiration', " ") + #config.set('jwt', 'expiration', " ") with open(path, 'w') as credential_file: config.write(credential_file) From 46b9710b50a2f5718c4aaf1f2b70fb61cddd2765 Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Fri, 24 Jul 2020 14:15:56 -0400 Subject: [PATCH 2/8] skip config file for JWT. --- htrc/config.py | 14 ++------------ setup.py | 2 +- 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/htrc/config.py b/htrc/config.py index deb0ffe..eb1dbf7 100644 --- a/htrc/config.py +++ b/htrc/config.py @@ -77,19 +77,9 @@ def get_idp_url(path=None): # Add jwt credential access methods def get_jwt_token(path=None): - try: - token = _get_value('jwt', 'token', path) - - # check expiration date - #expiration = int(_get_value('jwt', 'expiration', path)) - #if time.time() > expiration: - #raise RuntimeError("JWT token expired.") - except: - # This should run on either a missing or expired token. - import htrc.auth - token = htrc.auth.get_jwt_token() - htrc.config.save_jwt_token(token, path) + import htrc.auth + token = htrc.auth.get_jwt_token() return token diff --git a/setup.py b/setup.py index 221915a..bd2ea20 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ import atexit import tarfile -__version__ = '0.1.53' +__version__ = '0.1.54b2' install_requires = ['PyLD', 'future', 'prov', 'unicodecsv', 'progressbar2', 'requests', 'argparse==1.1', 'topicexplorer==1.0b226'] From 4618d532e1320343ef5d62bffb6d5276f9c9fca0 Mon Sep 17 00:00:00 2001 From: David K Date: Mon, 27 Jul 2020 20:20:41 +0000 Subject: [PATCH 3/8] Add files via upload --- htrc/auth.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/htrc/auth.py b/htrc/auth.py index 53c2290..cd44c48 100644 --- a/htrc/auth.py +++ b/htrc/auth.py @@ -1,12 +1,12 @@ -from base64 import b64encode +#from base64 import b64encode from getpass import getpass -import http.client -import ssl -import time +#import http.client +#import ssl +#import time import subprocess import requests import requests.auth -import configparser +#import configparser import htrc.config @@ -30,7 +30,7 @@ def get_jwt_token(): result = result[:-1] capsule_ip = result.strip() url = url1 + "/" + capsule_id + "/" + capsule_ip - r = requests.get(url, verify = False) + r = requests.get(url) data = r.json() if 'error' not in data: From cd6e90db3cff8d97d74af2ac7d6641c47667785c Mon Sep 17 00:00:00 2001 From: David K Date: Mon, 27 Jul 2020 20:22:39 +0000 Subject: [PATCH 4/8] Commented out unused functions Commented out unused functions and packages. 
--- htrc/volumes/__init__.py | 85 ++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 43 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index 6ddb9a7..6c8787a 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -12,30 +12,29 @@ from future import standard_library standard_library.install_aliases() -from builtins import input +#from builtins import input import http.client from io import BytesIO # used to stream http response into zipfile. -import json +#import json import logging import os.path import progressbar -import re +#import re import socket import ssl -import sys -from time import sleep -from urllib.request import urlopen -from urllib.error import HTTPError -from urllib.parse import quote_plus, urlencode -import xml.etree.ElementTree as ET +#import sys +#from time import sleep +#from urllib.request import urlopen +#from urllib.error import HTTPError +from urllib.parse import urlencode +#import xml.etree.ElementTree as ET from zipfile import ZipFile # used to decompress requested zip archives. -from htrc.lib.cli import bool_prompt +#from htrc.lib.cli import bool_prompt from htrc.util import split_items import htrc.config -import logging from logging import NullHandler logging.getLogger(__name__).addHandler(NullHandler()) @@ -77,7 +76,7 @@ def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, met # TODO: Fix SSL cert verification ctx = ssl.create_default_context() ctx.check_hostname = False - ctx.verify_mode = ssl.CERT_NONE + #ctx.verify_mode = ssl.CERT_NONE # Retrieve the volumes httpsConnection = http.client.HTTPSConnection(host, port, context=ctx, key_file=key, cert_file=cert) @@ -154,7 +153,7 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa # TODO: Fix SSL cert verification ctx = ssl.create_default_context() ctx.check_hostname = False - ctx.verify_mode = ssl.CERT_NONE + #ctx.verify_mode = ssl.CERT_NONE # Retrieve the volumes httpsConnection = http.client.HTTPSConnection(host, port, context=ctx, key_file=key, cert_file=cert) @@ -191,49 +190,49 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa return data -def get_oauth2_token(username, password): +#def get_oauth2_token(username, password): # make sure to set the request content-type as application/x-www-form-urlencoded - headers = {"Content-type": "application/x-www-form-urlencoded"} - data = { "grant_type": "client_credentials", - "client_secret": password, - "client_id": username } - data = urlencode(data) + #headers = {"Content-type": "application/x-www-form-urlencoded"} + #data = { "grant_type": "client_credentials", + #"client_secret": password, + #"client_id": username } + #data = urlencode(data) # create an SSL context - ctx = ssl.create_default_context() - ctx.check_hostname = False - ctx.verify_mode = ssl.CERT_NONE + #ctx = ssl.create_default_context() + #ctx.check_hostname = False + #ctx.verify_mode = ssl.CERT_NONE # make sure the request method is POST - host, port = htrc.config.get_oauth2_host_port() - oauth2port = htrc.config.get_oauth2_port() - oauth2EPRurl = htrc.config.get_oauth2_url() - httpsConnection = http.client.HTTPSConnection(host, oauth2port, context=ctx) - httpsConnection.request("POST", oauth2EPRurl + "?" 
+ data, "", headers)
+    #httpsConnection = http.client.HTTPSConnection(host, oauth2port, context=ctx)
+    #httpsConnection.request("POST", oauth2EPRurl + "?" + data, "", headers)
 
-    response = httpsConnection.getresponse()
+    #response = httpsConnection.getresponse()
 
     # if response status is OK
-    if response.status == 200:
-        data = response.read().decode('utf8')
+    #if response.status == 200:
+        #data = response.read().decode('utf8')
 
-        jsonData = json.loads(data)
-        logging.info("*** JSON: {}".format(jsonData))
+        #jsonData = json.loads(data)
+        #logging.info("*** JSON: {}".format(jsonData))
 
-        token = jsonData["access_token"]
-        logging.info("*** parsed token: {}".format(token))
+        #token = jsonData["access_token"]
+        #logging.info("*** parsed token: {}".format(token))
 
-    else:
-        logging.debug("Unable to get token")
-        logging.debug("Response Code: {}".format(response.status))
-        logging.debug("Response: {}".format(response.reason))
-        logging.debug(response.read())
-        raise EnvironmentError("Unable to get token.")
+    #else:
+        #logging.debug("Unable to get token")
+        #logging.debug("Response Code: {}".format(response.status))
+        #logging.debug("Response: {}".format(response.reason))
+        #logging.debug(response.read())
+        #raise EnvironmentError("Unable to get token.")
 
-    if httpsConnection is not None:
-        httpsConnection.close()
+    #if httpsConnection is not None:
+        #httpsConnection.close()
 
-    return token
+    #return token
 
 def grep(file_name, output_dir, pattern):
     na_volume = []

From eac2786e08341911d94a424653dc0656477fbf4d Mon Sep 17 00:00:00 2001
From: David K
Date: Wed, 5 Aug 2020 18:42:55 +0000
Subject: [PATCH 5/8] Changes to work with Docker

Made changes so the IP address can be found while running in Docker.
---
 htrc/auth.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/htrc/auth.py b/htrc/auth.py
index cd44c48..76b9152 100644
--- a/htrc/auth.py
+++ b/htrc/auth.py
@@ -25,7 +25,7 @@ def get_jwt_token():
 
     url1 = htrc.config.get_idp_url()
     capsule_id = htrc.config._get_value("jwt", "capsule_id")
-    result = subprocess.check_output(['hostname', '-s', '-I'])
+    result = subprocess.check_output("hostname -s -I | awk '{print $1}'", shell=True)
     result = result.decode('utf-8')
     result = result[:-1]
     capsule_ip = result.strip()

From 700f7f42522121b12ad69a65f19a584f6bf86cc1 Mon Sep 17 00:00:00 2001
From: Samitha Liyanage
Date: Thu, 7 Oct 2021 12:23:27 -0400
Subject: [PATCH 6/8] Merge develop into dk_test.
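This merge brings in the HtrcDataApiConfig object and the header/footer
removal pipeline. A minimal sketch of the new programmatic entry point,
mirroring the updated tests below (the token, host, certificate paths, and
volume id are placeholder values):

    import htrc.config
    import htrc.volumes

    data_api_config = htrc.config.HtrcDataApiConfig(
        token='1234',                      # JWT; omitted fields fall back to .htrc values
        host='data-host',
        port=443,
        epr='/',
        cert='/home/client-certs/client.pem',
        key='/home/client-certs/client.pem')

    htrc.volumes.download_volumes(
        ['coo.31924089593846'], '/media/secure_volume/workset',
        remove_headers_footers=True,       # run the new extractor on every page
        hf_window_size=6,                  # pages to look ahead for matching lines
        hf_min_similarity=0.7,             # Levenshtein similarity threshold
        data_api_config=data_api_config)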
---
 .gitignore                      |   2 +
 docs/source/cli.rst             |  39 +++-
 htrc/.htrc.default              |   1 +
 htrc/__main__.py                |  76 ++++++--
 htrc/auth.py                    |   3 +-
 htrc/config.py                  |  42 ++++-
 htrc/hf_utils/__init__.py       | 110 ++++++++++++
 htrc/lib/cli.py                 |   2 +
 htrc/models/__init__.py         |  68 +++++++
 htrc/runningheaders/__init__.py | 163 +++++++++++++++++
 htrc/tools/mallet.py            |   1 +
 htrc/tools/topicexplorer.py     |   1 +
 htrc/util/__init__.py           |   7 +-
 htrc/util/resolve.py            |  20 ++-
 htrc/volumes/__init__.py        | 307 +++++++++++++++++++++++---------
 setup.py                        |   6 +-
 tests/test_htrc_util_resolve.py |   3 +
 tests/test_htrc_volumes.py      |  56 ++++--
 18 files changed, 771 insertions(+), 136 deletions(-)
 create mode 100644 htrc/hf_utils/__init__.py
 create mode 100644 htrc/models/__init__.py
 create mode 100644 htrc/runningheaders/__init__.py

diff --git a/.gitignore b/.gitignore
index d77aad6..efb0815 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,3 +17,5 @@ htrc.egg-info
 .coverage
 htmlcov/
 .eggs
+ssl-cert-trust
+venv/
diff --git a/docs/source/cli.rst b/docs/source/cli.rst
index 0d19316..2a0ff7c 100644
--- a/docs/source/cli.rst
+++ b/docs/source/cli.rst
@@ -1,6 +1,6 @@
 HTRC Workset Toolkit
 ======================
-The HTRC Workset Toolkit provides a command line interface for interacting with
+The HTRC Workset Toolkit provides a command line interface for interacting with 
 and analyzing volumes in the HathiTrust Digital Library:
 
 - Volume Download (``htrc download``)
@@ -11,7 +11,7 @@ and analyzing volumes in the HathiTrust Digital Library:
 
 Workset Path
 --------------
-Each of these commands takes a *workset path*. Valid types of workset paths
+Each of these commands takes a *workset path*. Valid types of workset paths 
 and examples of each are:
 
 ================================== ==============================================================================
@@ -71,7 +71,7 @@ download`_, the
 
 Topic Modeling
 ''''''''''''''''
-There are two implementations of LDA topic modeling supported by the
+There are two implementations of LDA topic modeling supported by the 
 
 
 Arguments
@@ -114,6 +114,35 @@ Following are the use cases and examples of ``htrc`` commands inside the HTRC Da
 
   ``htrc download /home/dcuser/HTRC/htrc-id -o /media/secure_volume/my-workset -c``
 
+* Download specific pages from a single volume :
+
+  ``htrc download -pg coo.31924089593846[5,10,15,20,25,30]``
+
+* Download volumes and then extract headers/footers from the volumes :
+
+  ``htrc download -hf /home/dcuser/HTRC/htrc-id``
+
+* Download volumes, extract headers/footers from the volume pages, then concatenate the pages - (This will concatenate all the pages of the volume into one txt file.) :
+
+  ``htrc download -hfc /home/dcuser/HTRC/htrc-id``
+
+* Download volumes, extract headers/footers from the volumes, skip downloading the .csv files containing removed headers and footers :
+
+  ``htrc download -hf -s /home/dcuser/HTRC/htrc-id``
+
+* Download volumes, extract headers/footers from volumes, change the window of pages in the extractor algorithm (The default is 6; lower numbers increase speed, but are less accurate) :
+
+  ``htrc download -hf -w 3 /home/dcuser/HTRC/htrc-id``
+
+* Download volumes, extract headers/footers from volumes, change the minimum similarity ratio for lines on pages to be considered a header or footer (Default is .7 or 70%, so if a line is 70% the same as other lines on other pages within the window of pages, it is labeled a header or footer and removed) :
+
+  ``htrc download -hf -msr .9 /home/dcuser/HTRC/htrc-id``
+
+* Download volumes, extract headers/footers from volumes, change the max number of concurrent tasks (note that the only options are 1 or 2):
+
+  ``htrc download -hf --parallelism 2 /home/dcuser/HTRC/htrc-id``
+
 
 |
 +---------------------------------+-----------------------------------------------+
 | command: ``htrc metadata``      | capsule mode: **secure** and **maintenance**  |
@@ -246,7 +275,3 @@ Following are the use cases and examples of ``htrc`` commands inside the HTRC Da
 * Run topicexplorer on already downloaded volume - (Sample volumes are available in capsules created with ubuntu-16-04-with-sample-volumes image. Those sample volumes are available as zip files. Please unzip before using them because the metadata function gets volume ids from volume directory names).
 
   ``htrc topicexplorer /home/dcuser/unzipped_volumes -k 20``
-
-
-
-
diff --git a/htrc/.htrc.default b/htrc/.htrc.default
index bfeadee..3ec2327 100644
--- a/htrc/.htrc.default
+++ b/htrc/.htrc.default
@@ -8,6 +8,7 @@ port = 443
 url = /
 cert =
 key =
+pd_only =
 
 [oauth]
 host = silvermaple.pti.indiana.edu
diff --git a/htrc/__main__.py b/htrc/__main__.py
index 04b26b4..6102ad9 100644
--- a/htrc/__main__.py
+++ b/htrc/__main__.py
@@ -6,8 +6,8 @@
 from future import standard_library
 standard_library.install_aliases()
 
-import json
-import os, os.path
+import os
+import os.path
 import shutil
 import sys
 from tempfile import NamedTemporaryFile
@@ -16,6 +16,7 @@ import htrc.volumes
 import htrc.workset
 import htrc.tools.mallet
+
 from argparse import ArgumentParser
 import htrc.tools.topicexplorer
 from htrc.lib.cli import bool_prompt
@@ -25,18 +26,37 @@ def download_parser(parser=None):
     if parser is None:
         parser = ArgumentParser()
-    parser.add_argument("-u", "--username", help="HTRC username")
-    parser.add_argument("-p", "--password", help="HTRC password")
+    #parser.add_argument("-u", "--username", help="HTRC username")
+    #parser.add_argument("-p", "--password", help="HTRC password")
     parser.add_argument("file", nargs='?', default=sys.stdin,
-        help="workset path[s]")
+        help="Workset path[s]")
     parser.add_argument("-f", "--force", action='store_true',
-        help="remove folder if exists")
-    parser.add_argument("-o", "--output", help="output directory",
+        help="Remove folder if exists")
+    parser.add_argument("-o", "--output", help="Output directory",
         default='/media/secure_volume/workset/')
+    parser.add_argument("-hf", "--remove-headers-footers", action='store_true',
+        help="Remove headers and footers from individual pages and save in a separate csv file for inspection")
+    parser.add_argument("-hfc", "--remove-headers-footers-and-concat", action='store_true',
+        help="Remove headers and footers from individual pages and save in a separate csv file for inspection then concatenate pages")
+    parser.add_argument("-w", "--window-size", required=False, type=int, metavar="N", default=6,
+        help="How many pages ahead does the header/footer extractor algorithm look to find potential "
+             "matching headers/footers (higher value gives potentially more accurate results on lower "
+             "quality OCR volumes at the expense of runtime)")
+    parser.add_argument("-msr", "--min-similarity-ratio", required=False, type=float, metavar="N", default=0.7,
+        help="The minimum string similarity ratio required for the Levenshtein distance fuzzy-matching "
+             "algorithm to declare that two headers are considered 'the same' (the higher the value, up "
+             "to a max of 1.0, the more strict the matching has to be; lower values allow for more "
+             "fuzziness to account for OCR errors)")
+    parser.add_argument("-s", "--skip-removed-hf", action='store_true',
+        help="Skip creating a saved report of the removed headers and footers for each page for inspection")
+    parser.add_argument("--parallelism", required=False, type=int, metavar="N", default=os.cpu_count(),
+        help="The max number of concurrent tasks to start when downloading or removing headers/footers")
+    parser.add_argument("--batch-size", required=False, type=int, metavar="N", default=250,
+        help="The max number of volumes to download at a time from DataAPI")
     parser.add_argument("-c", "--concat", action='store_true',
-        help="concatenate a volume's pages into a single file")
+        help="Concatenate a volume's pages into a single file")
     parser.add_argument("-m", "--mets", action='store_true',
-        help="add volume's METS file")
+        help="Add volume's METS file")
     parser.add_argument("-pg", "--pages",action='store_true', help="Download given page numbers of a volume.")
     parser.add_argument("-t", "--token", help="JWT for volumes download.")
@@ -47,17 +67,17 @@
     parser.add_argument("-dk", "--datakey", help="Client key file for mutual TLS with Data API.")
     return parser
 
+
 def add_workset_path(parser=None):
     if parser is None:
         parser = ArgumentParser()
-    parser.add_argument("path", nargs='+', help="workset path[s]")
+    parser.add_argument("path", nargs='+', help="Workset path[s]")
     return parser
 
-
 def main():
     parser = ArgumentParser()
-    parser.add_argument('-d', '--debug', help="print long debug messages",
+    parser.add_argument('-d', '--debug', help="Print long debug messages",
         action='store_true')
     parsers = parser.add_subparsers(help="select a command")
@@ -78,10 +98,11 @@
         help="Download HathiTrust volumes to disk [requires auth]")
     download_parser(parser_download)
     parser_download.set_defaults(func='download')
-
+
+    # Run helper
     parser_run = parsers.add_parser('run', help="Run a built-in algorithm.")
 
-    run_parsers = parser_run.add_subparsers(help="select a command")
+    run_parsers = parser_run.add_subparsers(help="Select a command")
 
     parser_mallet = run_parsers.add_parser('mallet')
     htrc.tools.mallet.populate_parser(parser_mallet)
@@ -94,6 +115,9 @@
     parser_run.set_defaults(func='run')
 
     args = parser.parse_args()
+    if 'func' not in args:
+        parser.print_help()
+        sys.exit(1)
 
     if args.func in ['metadata', 'export']:
         volumes = []
@@ -113,6 +137,9 @@
         metadata = get_metadata(volumes)
         print(json.dumps(metadata))
     elif args.func == 'run':
+        if 'run' not in args:
+            parser_run.print_help()
+            sys.exit(1)
         if args.run == 'mallet':
             htrc.tools.mallet.main(args.path, args.k, args.iter)
         if args.run == 'topicexplorer':
@@ -125,10 +152,25 @@
         else:
print("Please choose another output folder and try again.") sys.exit(1) - + + if args.concat and args.remove_headers_footers: + print("Cannot set both concat and remove-headers-footers") + sys.exit(1) + if args.concat and args.remove_headers_footers_and_concat: + print("Cannot set both concat and remove-headers-footers-and-concat") + sys.exit(1) + if args.remove_headers_footers and args.remove_headers_footers_and_concat: + print("Cannot set both remove_headers_footers and remove_headers_footers_and_concat") + sys.exit(1) + if args.mets and args.remove_headers_footers_and_concat: + print("Cannot set both mets and remove_headers_footers_and_concat") + sys.exit(1) if args.pages: if args.mets and args.concat: - print ("Cannot set both concat and mets with pages") + print("Cannot set both concat and mets with pages") + sys.exit(1) + if args.mets and args.remove_headers_footers_and_concat: + print("Cannot set both mets and remove_headers_footers_and_concat with pages") sys.exit(1) try: @@ -137,6 +179,7 @@ def main(): print("Invalid identifier:", args.file) sys.exit(1) + def resolve_and_download(args): if args.file == sys.stdin: # For use with UNIX pipes @@ -201,6 +244,7 @@ def download(args): else: raise e + def download_with_tempfile(args, volumes): f = NamedTemporaryFile() for volume in volumes: diff --git a/htrc/auth.py b/htrc/auth.py index 76b9152..b975e63 100644 --- a/htrc/auth.py +++ b/htrc/auth.py @@ -10,6 +10,7 @@ import htrc.config + def get_jwt_token(): # Currently we just store one common jwt token locally at .htrc file for simplicity # Expect to add POST method to query unique jwt token with the combo of username and password @@ -22,7 +23,7 @@ def get_jwt_token(): #"username": username, #"password": password, #"scope" : "openid"} - + url1 = htrc.config.get_idp_url() capsule_id = htrc.config._get_value("jwt", "capsule_id") result = subprocess.check_output("hostname -s -I | awk '{print $1}'", shell=True) diff --git a/htrc/config.py b/htrc/config.py index eb1dbf7..c6b5743 100644 --- a/htrc/config.py +++ b/htrc/config.py @@ -6,18 +6,14 @@ """ from future import standard_library standard_library.install_aliases() -from builtins import input - +from typing import Optional from configparser import RawConfigParser as ConfigParser, NoSectionError from codecs import open -from getpass import getpass import logging import os.path import shutil import time -from htrc.lib.cli import bool_prompt - DEFAULT_PATH = os.path.expanduser('~') DEFAULT_PATH = os.path.join(DEFAULT_PATH, '.htrc') if not os.path.exists(DEFAULT_PATH): @@ -26,6 +22,25 @@ logging.info("Copying default config file to home directory.") shutil.copyfile(DEFAULT_FILE, DEFAULT_PATH) + +class HtrcDataApiConfig: + def __init__(self, + token: Optional[str] = None, + host: Optional[str] = None, + port: Optional[int] = None, + epr: Optional[str] = None, + cert: Optional[str] = None, + key: Optional[str] = None) -> None: + super().__init__() + + self.token = token or get_jwt_token(save_new_token=False) + self.host = host or get_dataapi_host() + self.port = port or get_dataapi_port() + self.epr = epr or get_dataapi_epr() + self.cert = cert or get_dataapi_cert() + self.key = key or get_dataapi_key() + + def _get_value(section, key, path=None): if path is None: path = DEFAULT_PATH @@ -38,33 +53,45 @@ def _get_value(section, key, path=None): except NoSectionError: raise EnvironmentError("Config not set for {} {} in {}".format( section, key, path)) - + + def get_dataapi_port(path=None): port = int(_get_value('data', 'port', path)) return (port) 
+ def get_dataapi_host(path=None): host = _get_value('data', 'host', path) return (host) + def get_dataapi_epr(path=None): return _get_value('data', 'url', path) + def get_dataapi_cert(path=None): return _get_value('data', 'cert', path) + def get_dataapi_key(path=None): return _get_value('data', 'key', path) + +def get_dataapi_access(path=None): + return _get_value('data', 'pd_only', path) + + def get_idp_host_port(path=None): host = _get_value('idp', 'host', path) port = _get_value('idp', 'port', path) return (host, port) + def get_idp_path(path=None): return _get_value('idp', 'url') + def get_idp_url(path=None): host, port = get_idp_host_port(path) path = get_idp_path(path) @@ -111,6 +138,7 @@ def save_jwt_token(token, path=None): return token + def remove_jwt_token(path=None): """ Removes JWT token from the config file. @@ -148,9 +176,11 @@ def get_credentials(path=None): return (client_id, client_secret) + def populate_parser(parser): return parser + if __name__ == '__main__': from argparse import ArgumentParser diff --git a/htrc/hf_utils/__init__.py b/htrc/hf_utils/__init__.py new file mode 100644 index 0000000..81553de --- /dev/null +++ b/htrc/hf_utils/__init__.py @@ -0,0 +1,110 @@ +import re +from typing import TypeVar, List, Iterator, Tuple, Callable + +T = TypeVar('T') + + +def clean_text(s: str) -> str: + # replace all characters which aren't letters with whitespaces ([\W\d_] is equivalent of \P{L} which is unsupported) + s = re.sub(r'[\W\d_]+', " ", s, flags=re.UNICODE) + # replace multiple sequential whitespaces with single whitespace + s = re.sub(r'\s{2,}', " ", s, flags=re.UNICODE) + # trim whitespaces at the beginning and end + s = s.strip() + # lowercase + s = s.lower() + + return s + + +def levenshtein(s: str, t: str, insert_cost: int = 1, delete_cost: int = 1, replace_cost: int = 1) -> int: + """ From Wikipedia article; Iterative with two matrix rows. 
""" + # degenerate cases + if s == t: + return 0 + + len0 = len(s) + len1 = len(t) + + if not len0: + return len1 + + if not len1: + return len0 + + # the array of distances + v0 = [0] * (len0 + 1) + v1 = [0] * (len0 + 1) + + # initial cost of skipping prefix in s + for i in range(len(v0)): + v0[i] = i + + # dynamically compute the array of distances + + # transformation cost for each letter in t + for j in range(len1): + # initial cost of skipping prefix in t + v1[0] = j + 1 + + # transformation cost for each letter in s + for i in range(len0): + # matching current letters in both strings + match = 0 if s[i] == t[j] else 1 + + # computing cost for each transformation + cost_insert = v0[i + 1] + insert_cost + cost_delete = v1[i] + delete_cost + cost_replace = v0[i] + match * replace_cost + + # keep minimum cost + v1[i + 1] = min(cost_insert, cost_delete, cost_replace) + + # swap cost arrays + v0, v1 = v1, v0 + + # the distance is the cost for transforming all letters in both strings + return v0[len0] + + +def pairwise_combine_within_distance(xs: List[T], n: int) -> List[Tuple[T, T]]: + if not xs: + return [] + + result = [] + x, xs = xs[0], xs[1:] + + while xs: + result = result + [(x, v) for v in xs[:n - 1]] + x, xs = xs[0], xs[1:] + + return result + + +def group_consecutive_when(xs: List[T], pred: Callable[[T, T], bool]) -> Iterator[List[T]]: + result = [] + _prev, _next = None, None + + while len(xs) > 1: + _prev, _next = xs[0], xs[1] + result.append(_prev) + if not pred(_prev, _next): + yield result + result = [] + xs = xs[1:] + + if len(xs) == 1: + _prev, _next = _next, xs[0] + + if _prev is not None and _next is not None and pred(_prev, _next): + result.extend([_prev, _next]) + elif _next is not None: + result.append(_next) + + yield result + + +def flatten(xss: List[tuple]) -> Iterator[T]: + for xs in xss: + for x in xs: + yield x diff --git a/htrc/lib/cli.py b/htrc/lib/cli.py index 33c378e..11a6e10 100644 --- a/htrc/lib/cli.py +++ b/htrc/lib/cli.py @@ -1,4 +1,6 @@ from builtins import input + + def bool_prompt(prompt_str, default=None): if default is True: default = 'y' diff --git a/htrc/models/__init__.py b/htrc/models/__init__.py new file mode 100644 index 0000000..e86e115 --- /dev/null +++ b/htrc/models/__init__.py @@ -0,0 +1,68 @@ +import os +from abc import ABC, abstractmethod +from typing import List + + +class Page(ABC): + @property + @abstractmethod + def text_lines(self) -> List[str]: + """ + The lines of text on the page + """ + pass + + @property + def text(self) -> str: + return os.linesep.join(self.text_lines) + + +class PageStructure(Page, ABC): + def __init__(self) -> None: + self.num_header_lines = 0 + self.num_footer_lines = 0 + + @property + def has_header(self) -> bool: + return self.num_header_lines > 0 + + @property + def has_body(self) -> bool: + return len(self.text_lines) - self.num_header_lines - self.num_footer_lines > 0 + + @property + def has_footer(self) -> bool: + return self.num_footer_lines > 0 + + @property + def header_lines(self) -> List[str]: + return self.text_lines[:self.num_header_lines] + + @property + def body_lines(self) -> List[str]: + return self.text_lines[self.num_header_lines:len(self.text_lines) - self.num_footer_lines] + + @property + def footer_lines(self) -> List[str]: + return self.text_lines[-self.num_footer_lines:] if self.has_footer else [] + + @property + def header(self) -> str: + return os.linesep.join(self.header_lines) + + @property + def body(self) -> str: + return os.linesep.join(self.body_lines) + + @property + def 
footer(self) -> str: + return os.linesep.join(self.footer_lines) + + +class HtrcPage(Page): + def __init__(self, lines: List[str]) -> None: + self._lines = lines + + @property + def text_lines(self) -> List[str]: + return self._lines diff --git a/htrc/runningheaders/__init__.py b/htrc/runningheaders/__init__.py new file mode 100644 index 0000000..799bf39 --- /dev/null +++ b/htrc/runningheaders/__init__.py @@ -0,0 +1,163 @@ +import re +from collections import defaultdict +from typing import List, TypeVar, Set, Iterator, Optional, Tuple, Dict + +from htrc.models import Page, PageStructure +from htrc.hf_utils import clean_text, levenshtein, pairwise_combine_within_distance, flatten, group_consecutive_when + +T = TypeVar('T', bound=Page) +U = TypeVar('U', bound=PageStructure) + + +class _Line: + def __init__(self, text: str, line_number: int, page: Page) -> None: + self.text = text + self.line_number = line_number + self.page = page + self.cleaned_text = clean_text(text) + + def __eq__(self, o: object) -> bool: + if not isinstance(o, _Line): + raise NotImplemented + + are_equal = self.page is o.page and self.line_number == o.line_number + + return are_equal + + def __ne__(self, o: object) -> bool: + return not self == o + + def __hash__(self) -> int: + line_hash = hash(self.line_number) + page_hash = hash(self.page) + hash_value = 31 * line_hash + page_hash + + return hash_value + + def __str__(self) -> str: + return str((self.line_number, self.cleaned_text)) + + def similarity_ratio(self, line: '_Line') -> float: + ratio = 1 - float(levenshtein(self.cleaned_text, line.cleaned_text)) / max(len(self.cleaned_text), + len(line.cleaned_text)) + + return ratio + + +def parse_page_structure(pages: List[T], + window_size: int = 6, + min_similarity_ratio: float = 0.7, + min_cluster_size: int = 3, + max_header_lines: int = 3, + max_footer_lines: int = 3) -> List[U]: + def _get_page_lines(p: T) -> List[_Line]: + return [_Line(text, line_num, p) for line_num, text in enumerate(p.text_lines)] + + def _cluster_lines(lines: List[Tuple[_Line, _Line]]) -> Set[tuple]: + cluster_map = {} + + for l1, l2 in lines: + c1 = cluster_map.get(l1) + c2 = cluster_map.get(l2) + + if c1 is not None and c2 is not None and c1 is not c2: + smaller, larger = (c1, c2) if len(c1) < len(c2) else (c2, c1) + larger.extend(smaller) + for x in smaller: + cluster_map[x] = larger + elif c1 is not None and c2 is None: + c1.append(l2) + cluster_map[l2] = c1 + elif c1 is None and c2 is not None: + c2.append(l1) + cluster_map[l1] = c2 + elif c1 is None and c2 is None: + c = [l1, l2] + cluster_map[l1] = c + cluster_map[l2] = c + + return set(map(tuple, cluster_map.values())) + + def _group_lines_by_page(lines: Iterator[_Line]) -> Dict[Page, List[_Line]]: + lines_grouped_by_page = defaultdict(list) + for line in lines: + lines_grouped_by_page[line.page].append(line) + + return lines_grouped_by_page + + def _get_last_header_line(lines: List[_Line]) -> Optional[int]: + if not lines: + return None + + return max(l.line_number for l in lines) + + def _get_first_footer_line(lines: List[_Line]) -> Optional[int]: + if not lines: + return None + + return min(l.line_number for l in lines) + + def _extract_line_numbers(line: _Line) -> Tuple[_Line, List[int]]: + numbers = [int(match.group(0)) for match in + re.finditer(r"(?:(?<=^)|(?<=\s))\d{1,4}(?=\s|$)", line.text, flags=re.UNICODE)] + + return line, numbers + + def _extract_potential_page_numbers(lines: List[_Line]) -> Tuple[_Line, List[int]]: + assert len(lines) > 0 + line, numbers = 
_extract_line_numbers(lines[-1]) + if not numbers and not str.strip(line.text) and len(lines) > 1: + line, numbers = _extract_line_numbers(lines[-2]) + + return line, numbers + + candidate_header_lines = [] + candidate_footer_lines = [] + + pages_lines = [_get_page_lines(p) for p in pages] + + for lines in pages_lines: + # ignore lines that are <4 characters long and/or have no alphabetic characters + candidate_header_lines.append([l for l in lines[:max_header_lines] if not len(l.cleaned_text) < 4]) + candidate_footer_lines.append([l for l in lines[-max_footer_lines:] if not len(l.cleaned_text) < 4]) + + headers_for_comparison = pairwise_combine_within_distance(candidate_header_lines, window_size) + footers_for_comparison = pairwise_combine_within_distance(candidate_footer_lines, window_size) + + header_line_similarities = [] + for (lines1, lines2) in headers_for_comparison: + header_line_similarities.extend( + (l1, l2) for l1 in lines1 for l2 in lines2 if l1.similarity_ratio(l2) >= min_similarity_ratio) + + footer_line_similarities = [] + for (lines1, lines2) in footers_for_comparison: + footer_line_similarities.extend( + (l1, l2) for l1 in lines1 for l2 in lines2 if l1.similarity_ratio(l2) >= min_similarity_ratio) + + header_clusters = [cluster for cluster in _cluster_lines(header_line_similarities) if + len(cluster) >= min_cluster_size] + footer_clusters = [cluster for cluster in _cluster_lines(footer_line_similarities) if + len(cluster) >= min_cluster_size] + + if not footer_clusters: + potential_page_numbers = [_extract_potential_page_numbers(lines) for lines in pages_lines if lines] + potential_page_numbers = [(line, numbers[0]) for line, numbers in potential_page_numbers if len(numbers) == 1] + potential_clusters = map(lambda group: tuple(map(lambda t: t[0], group)), + group_consecutive_when(potential_page_numbers, lambda x, y: y[1] - x[1] == 1)) + footer_clusters = [cluster for cluster in potential_clusters if len(cluster) >= min_cluster_size] + + header_lines_grouped_by_page = _group_lines_by_page(flatten(header_clusters)) + footer_lines_grouped_by_page = _group_lines_by_page(flatten(footer_clusters)) + + last_header_line_pages_map = {p: _get_last_header_line(lines) for p, lines in header_lines_grouped_by_page.items()} + first_footer_line_pages_map = {p: _get_first_footer_line(lines) for p, lines in + footer_lines_grouped_by_page.items()} + + for page in pages: + last_header_line = last_header_line_pages_map.get(page) + first_footer_line = first_footer_line_pages_map.get(page) + page.__class__ = type('StructuredPage', (page.__class__, PageStructure), {}) + page.num_header_lines = last_header_line + 1 if last_header_line is not None else 0 + page.num_footer_lines = len(page.text_lines) - first_footer_line if first_footer_line is not None else 0 + + return pages diff --git a/htrc/tools/mallet.py b/htrc/tools/mallet.py index a005e93..e82758a 100644 --- a/htrc/tools/mallet.py +++ b/htrc/tools/mallet.py @@ -19,6 +19,7 @@ def install_mallet(): mallet_dir.extractall(path=MALLET_DIR) mallet_dir.close() + def main(path, topics, iterations, output_dir='/media/secure_volume/workset/'): if not os.path.exists(MALLET_DIR): if not os.path.exists('/media/secure_volume/'): diff --git a/htrc/tools/topicexplorer.py b/htrc/tools/topicexplorer.py index 293baca..5149cc3 100644 --- a/htrc/tools/topicexplorer.py +++ b/htrc/tools/topicexplorer.py @@ -6,6 +6,7 @@ from htrc.volumes import download_volumes from htrc.workset import path_to_volumes + def main(path, topics, iterations, 
output_dir='/media/secure_volume/workset'):
     if os.path.exists("/media/secure_volume"):
         # If in secure mode, download the volumes from data api
diff --git a/htrc/util/__init__.py b/htrc/util/__init__.py
index edbddd1..2b1dd3e 100644
--- a/htrc/util/__init__.py
+++ b/htrc/util/__init__.py
@@ -4,6 +4,7 @@
 
 from .resolve import ORG_CODES
 
+
 def split_items(seq, split_size):
     """
     Returns a generator that returns portions of `seq` up to `split_size`.
@@ -13,7 +14,7 @@ def split_items(seq, split_size):
     :param split_size: The maximum size of each split.
     """
     full_segments = int(math.floor(len(seq) / split_size))
-    for i in range(1,full_segments+1):
-        yield seq[(i-1)*split_size:i*split_size]
+    for i in range(1, full_segments + 1):
+        yield seq[(i - 1) * split_size:i * split_size]
     if (full_segments * split_size) < len(seq):
-        yield seq[full_segments*split_size:]
+        yield seq[full_segments * split_size:]
diff --git a/htrc/util/resolve.py b/htrc/util/resolve.py
index e3b2b4f..1d1a7e2 100644
--- a/htrc/util/resolve.py
+++ b/htrc/util/resolve.py
@@ -94,29 +94,31 @@ def parse_volume_id(string):
     Organization codes for the volumes can be found in ORG_CODES.
     '''
-    # First extract the volume ID from a URL, fallbck to assume string.
+    # First extract the volume ID from a URL, fallback to assume string.
     parsed_url = urlparse(string)
     if parsed_url.netloc == 'hdl.handle.net':
         # Parse the Handle ID, ex:
         # https://hdl.handle.net/2027/uc2.ark:/13960/fk92805m1s'
         # Note that if the Handle URL contains page info, this is discarded.
-        id = parsed_url.path.replace('/2027/', '')
+        htid = parsed_url.path.replace('/2027/', '')
 
     elif parsed_url.netloc == 'babel.hathitrust.org':
         # Parse the HT Digital Library URL, ex:
         # https://babel.hathitrust.org/cgi/pt?id=uc2.ark:/13960/fk92805m1s;view=1up;seq=7
         if parsed_url.query:
-            id = parse_qs(parsed_url.query).get('id', None)
-            if id is not None:
-                id = id[0]
+            htid = parse_qs(parsed_url.query).get('id', None)
+            if htid is not None:
+                htid = htid[0]
+                if ';' in htid:
+                    htid = htid.split(';')[0]
 
     else:
-        id = string
+        htid = string
 
     # Validate ID against ORG_CODES.
-    # Won't guarantee volume existance, but is a sanity check.
-    if id and any(id.startswith(org) for org in ORG_CODES):
-        return id
+    # Won't guarantee volume existence, but it is a sanity check.
+    if htid and any(htid.startswith(org) for org in ORG_CODES):
+        return htid
     else:
         raise ValueError("Invalid Organization Code in HathiTrust ID")
diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py
index 6c8787a..aabb171 100644
--- a/htrc/volumes/__init__.py
+++ b/htrc/volumes/__init__.py
@@ -10,13 +10,15 @@
 """
 from __future__ import print_function
 from future import standard_library
+
 standard_library.install_aliases()
 
 #from builtins import input
+from htrc.models import HtrcPage
 
 import http.client
-from io import BytesIO # used to stream http response into zipfile.
-#import json
+from io import BytesIO, TextIOWrapper
+import json
 import logging
 import os.path
 import progressbar
@@ -29,16 +31,25 @@
 #from urllib.error import HTTPError
 from urllib.parse import urlencode
 #import xml.etree.ElementTree as ET
+from urllib.parse import urlencode
 from zipfile import ZipFile # used to decompress requested zip archives.
+from tqdm import tqdm
+from htrc.runningheaders import parse_page_structure
+from functools import partial
+import pandas as pd
 
 #from htrc.lib.cli import bool_prompt
 from htrc.util import split_items
 import htrc.config
+import multiprocessing
+import logging
 
 from logging import NullHandler
+
 logging.getLogger(__name__).addHandler(NullHandler())
 
-def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, mets=False):
+
+def get_volumes(data_api_config: htrc.config.HtrcDataApiConfig, volume_ids, concat=False, mets=False, buffer_size=128):
     """
     Returns volumes from the Data API as a raw zip stream.
 
@@ -53,15 +64,16 @@
     if not volume_ids:
         raise ValueError("volume_ids is empty.")
 
-    url = epr + "volumes"
+    url = data_api_config.epr + "volumes"
 
     for id in volume_ids:
         if ("." not in id
-            or " " in id):
+                or " " in id):
             print("Invalid volume id " + id + ". Please correct this volume id and try again.")
 
     data = {'volumeIDs': '|'.join(
         [id.replace('+', ':').replace('=', '/') for id in volume_ids])}
+
     if concat:
         data['concat'] = 'true'
 
@@ -69,7 +81,7 @@
         data['mets'] = 'true'
 
     # Authorization
-    headers = {"Authorization": "Bearer " + token,
+    headers = {"Authorization": "Bearer " + data_api_config.token,
                "Content-type": "application/x-www-form-urlencoded"}
 
     # Create SSL lookup
@@ -79,8 +91,12 @@
     #ctx.verify_mode = ssl.CERT_NONE
 
     # Retrieve the volumes
-    httpsConnection = http.client.HTTPSConnection(host, port, context=ctx, key_file=key, cert_file=cert)
-
+    httpsConnection = http.client.HTTPSConnection(
+        data_api_config.host,
+        data_api_config.port,
+        context=ctx,
+        key_file=data_api_config.key,
+        cert_file=data_api_config.cert)
 
     httpsConnection.request("POST", url, urlencode(data), headers)
 
@@ -91,12 +107,12 @@
     data = BytesIO()
     bytes_downloaded = 0
     bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength,
-                            widgets=[progressbar.AnimatedMarker(), ' ',
-                                     progressbar.DataSize(),
-                                     ' (', progressbar.FileTransferSpeed(), ')'])
+                                  widgets=[progressbar.AnimatedMarker(), ' ',
+                                           progressbar.DataSize(),
+                                           ' (', progressbar.FileTransferSpeed(), ')'])
 
     while body:
-        body = response.read(128)
+        body = response.read(buffer_size)
         data.write(body)
         bytes_downloaded += len(body)
         bar.update(bytes_downloaded)
@@ -114,12 +130,12 @@
     return data
 
-def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=False):
+def get_pages(data_api_config: htrc.config.HtrcDataApiConfig, page_ids, concat=False, mets=False, buffer_size=128):
     """
     Returns a ZIP file containing specific pages.
 
     Parameters:
-    :token: An OAuth2 token for the app.
+    :data_api_config: The configuration data of the DataAPI endpoint.
     :volume_ids: A list of volume_ids
     :concat: If True, return a single file per volume. If False, return a single
     file per page (default).
     """
     if not page_ids:
         raise ValueError("page_ids is empty.")
 
-    url = epr + "pages"
+    url = data_api_config.epr + "pages"
 
     for id in page_ids:
         if ("." not in id
-            or " " in id):
+                or " " in id):
             print("Invalid volume id " + id + ". 
Please correct this volume id and try again.") data = {'pageIDs': '|'.join( @@ -145,10 +161,9 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa data['mets'] = 'true' # Authorization - headers = {"Authorization": "Bearer " + token, + headers = {"Authorization": "Bearer " + data_api_config.token, "Content-type": "application/x-www-form-urlencoded"} - # Create SSL lookup # TODO: Fix SSL cert verification ctx = ssl.create_default_context() @@ -156,8 +171,13 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa #ctx.verify_mode = ssl.CERT_NONE # Retrieve the volumes - httpsConnection = http.client.HTTPSConnection(host, port, context=ctx, key_file=key, cert_file=cert) - + httpsConnection = http.client.HTTPSConnection( + data_api_config.host, + data_api_config.port, + context=ctx, + key_file=data_api_config.key, + cert_file=data_api_config.cert + ) httpsConnection.request("POST", url, urlencode(data), headers) @@ -168,12 +188,12 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa data = BytesIO() bytes_downloaded = 0 bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength, - widgets=[progressbar.AnimatedMarker(), ' ', + widgets=[progressbar.AnimatedMarker(), ' ', progressbar.DataSize(), ' (', progressbar.FileTransferSpeed(), ')']) while body: - body = response.read(128) + body = response.read(buffer_size) data.write(body) bytes_downloaded += len(body) bar.update(bytes_downloaded) @@ -234,82 +254,194 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa #return token -def grep(file_name, output_dir, pattern): - na_volume = [] - for line in open(file_name): - if pattern in line: - na_volume.append(line.split()[-1]) - if len(na_volume) < 100: - print("\nFollowing volume ids are not available.") - print("\n".join(str(item) for item in na_volume)) - with open(os.path.join(output_dir, "volume_not_available.txt"), "w") as volume_na: - volume_na.write("\n".join(str(item) for item in na_volume)) - else: - if len(na_volume) == 100: - print("\nThere are 100 or more unavailable volumes.\nTo check the validity of volumes in your workset or volume id file go to:\n https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org for assistance.") - -def check_error_file(output_dir): - file_name = "ERROR.err" +def grep_error(file_name, output_dir, pattern, txt_index): + na_volume = [] if output_dir.endswith("/"): - file_path = output_dir+ file_name + file_path = output_dir + file_name else: - file_path = output_dir+"/"+file_name + file_path = output_dir + "/" + file_name if os.path.isfile(file_path): - grep(file_path, output_dir, "KeyNotFoundException") + for line in open(file_path): + if pattern in line: + na_volume.append(line.split()[txt_index]) + return na_volume -def download_volumes(volume_ids, output_dir, username=None, password=None, - config_path=None, token=None, concat=False, mets=False, pages=False, host=None, port=None, cert=None, key=None, epr=None): - # create output_dir folder, if nonexistant - if not os.path.isdir(output_dir): - os.makedirs(output_dir) - # get token if not specified - if not token: - token = htrc.config.get_jwt_token() - htrc.config.remove_jwt_token() +def _to_htrc_page(page_file, zip): + with TextIOWrapper(BytesIO(zip.read(page_file)), encoding='utf-8') as page: + return HtrcPage([line.rstrip() for line in page.readlines()]) - if not host: - host= htrc.config.get_dataapi_host() - if not port: - port = 
htrc.config.get_dataapi_port() +def download_volumes(volume_ids, output_dir, concat=False, mets=False, pages=False, + remove_headers_footers=False, hf_window_size=6, hf_min_similarity=0.7, skip_removed_hf=False, + parallelism=multiprocessing.cpu_count(), batch_size=250, data_api_config=None): + if not 0 < parallelism <= multiprocessing.cpu_count(): + raise ValueError("Invalid parallelism level specified") - if not epr: - epr = htrc.config.get_dataapi_epr() + remove_hf_fun = partial( + _remove_headers_footers_and_save, + concat=concat, + hf_min_similarity=hf_min_similarity, + hf_window_size=hf_window_size, + skip_removed_hf=skip_removed_hf, + output_dir=output_dir + ) - if not cert: - cert = htrc.config.get_dataapi_cert() + volume_ids = list(set(volume_ids)) # ensure unique volume ids + num_vols = len(volume_ids) - if not key: - key = htrc.config.get_dataapi_key() + data_api_config = data_api_config or htrc.config.HtrcDataApiConfig() - if any((token, host, port)) is not None: - logging.info("obtained token: %s\n" % token) + os.makedirs(output_dir, exist_ok=True) + + if any((data_api_config.token, data_api_config.host, data_api_config.port)) is not None: + logging.info("obtained token: %s\n" % data_api_config.token) try: - for ids in split_items(volume_ids, 250): - if pages: - if concat & mets: - raise ValueError("Cannot set both concat and mets with pages.") + errors = [] + rights = [] + + with tqdm(total=num_vols) as progress, multiprocessing.Pool(processes=parallelism) as pool: + for ids in split_items(volume_ids, batch_size): + if pages: + if concat and mets: + raise ValueError("Cannot set both concat and mets with pages.") + else: + data = get_pages(data_api_config, ids, concat and not remove_headers_footers, mets) else: - data = get_pages(token, ids, host, port, cert, key, epr, concat, mets) + data = get_volumes(data_api_config, ids, concat and not remove_headers_footers, mets) + + volumes = [] + + with ZipFile(BytesIO(data)) as vols_zip: + zip_list = vols_zip.namelist() + if 'ERROR.err' in zip_list: + errors.append(vols_zip.read('ERROR.err').decode('utf-8')) + zip_list.remove('ERROR.err') + if 'volume-rights.txt' in zip_list: + rights_data = vols_zip.read('volume-rights.txt').decode('utf-8') + zip_list.remove('volume-rights.txt') + if not rights: + rights.append(rights_data) + else: + # due to the format in which 'volume-rights.txt' is created, we have to skip + # the first 4 lines which make up the header of the file, to extract only the + # actual volume rights data for accumulation + rights.append(''.join(rights_data.splitlines(keepends=True)[4:])) + + zip_volume_paths = [zip_vol_path for zip_vol_path in zip_list if zip_vol_path.endswith('/')] + num_vols_in_zip = len(zip_volume_paths) + + if not remove_headers_footers: + vols_zip.extractall(output_dir, members=zip_list) + progress.update(num_vols_in_zip) + else: + for zip_vol_path in zip_volume_paths: + sorted_vol_zip_page_paths = sorted(zip_page_path for zip_page_path in zip_list if zip_page_path.startswith(zip_vol_path) and not zip_page_path.endswith('/')) + vol_pages = [_to_htrc_page(page_path, vols_zip) for page_path in sorted_vol_zip_page_paths] + volumes.append((zip_vol_path, sorted_vol_zip_page_paths, vol_pages)) + + del data, vols_zip + + num_missing = batch_size - num_vols_in_zip if num_vols >= batch_size else num_vols - num_vols_in_zip + progress.update(num_missing) # update progress bar state to include the missing volumes also + + # `volumes` will be empty if `remove_headers_footers=False` since the ZIP was extracted + 
# without further processing + if volumes: + for _ in pool.imap_unordered(remove_hf_fun, volumes): + progress.update() + + na_volumes_all = [] + + if errors: + with open(os.path.join(output_dir, 'ERROR.err'), 'w') as err_file: + err_file.write(''.join(errors)) + + na_volumes_error = grep_error('ERROR.err', output_dir, 'KeyNotFoundException', -1) + na_volumes_all.extend(na_volumes_error) + + if rights: + with open(os.path.join(output_dir, 'volume-rights.txt'), 'w') as rights_file: + rights_file.write(''.join(rights)) + + if htrc.config.get_dataapi_access() == "true": + na_volumes_rights = grep_error('volume-rights.txt', output_dir, ' 3', 0) + na_volumes_all.extend(na_volumes_rights) + + num_na = len(na_volumes_all) + + if num_na > 0: + with open(os.path.join(output_dir, 'volumes_not_available.txt'), 'w') as volumes_na: + volumes_na.write("\n".join(str(item) for item in na_volumes_all)) + + if num_na < 100: + print("\nThe following volume ids are not available. \n Please check volumes_not_available.txt " + "for the complete list. ") + print('\n'.join(str(item) for item in na_volumes_all)) else: - data = get_volumes(token, ids, host, port, cert, key, epr, concat, mets) - - myzip = ZipFile(BytesIO(data)) - myzip.extractall(output_dir) - myzip.close() - - check_error_file(output_dir) + print("\nThere are {:,} unavailable volumes.\n Please check volumes_not_available.txt " + "for the " + "complete list. \nTo check the validity of volumes in your workset or volume id file go " + "to:\n " + "https://analytics.hathitrust.org/validateworkset \n or email us at " + "htrc-help@hathitrust.org " + "for assistance.".format(num_na)) except socket.error: - raise RuntimeError("Data API request timeout. Is your Data Capsule in Secure Mode?") + raise RuntimeError("HTRC Data API time out. Check your inode usage if downloading a large workset. 
" + "Contact HTRC for further help.") else: - raise RuntimeError("Failed to obtain jwt token.") + raise RuntimeError("Failed to obtain the JWT token.") + + +def _remove_headers_footers_and_save(vol_data, concat, hf_min_similarity, hf_window_size, skip_removed_hf, output_dir): + zip_vol_path, sorted_vol_zip_page_paths, vol_pages = vol_data + clean_volid = zip_vol_path[:-1] + + vol_pages = parse_page_structure(vol_pages, window_size=hf_window_size, min_similarity_ratio=hf_min_similarity) + pages_body = (page.body for page in vol_pages) + # save the removed headers/footers for user inspection + if skip_removed_hf: + if concat: + with open(os.path.join(output_dir, clean_volid + '.txt'), 'w', encoding='utf-8') as vol_file: + vol_file.write('\n'.join(pages_body)) + else: + vol_path = os.path.join(output_dir, zip_vol_path) + os.mkdir(vol_path) + for vol_page_path, page_body in zip(sorted_vol_zip_page_paths, pages_body): + with open(os.path.join(output_dir, vol_page_path), 'w', encoding='utf-8') as page_file: + page_file.write(page_body) + else: + if concat: + with open(os.path.join(output_dir, clean_volid + '.txt'), 'w', encoding='utf-8') as vol_file: + vol_file.write('\n'.join(pages_body)) + else: + vol_path = os.path.join(output_dir, zip_vol_path) + os.mkdir(vol_path) + for vol_page_path, page_body in zip(sorted_vol_zip_page_paths, pages_body): + with open(os.path.join(output_dir, vol_page_path), 'w', encoding='utf-8') as page_file: + page_file.write(page_body) + + removed_hf = [] + for vol_page_path, vol_page in zip(sorted_vol_zip_page_paths, vol_pages): + if not (vol_page.has_header or vol_page.has_footer): + # skip reporting pages that don't have an identified header or footer + continue + _, page_name = os.path.split(vol_page_path) + page_name, _ = os.path.splitext(page_name) + removed_hf.append({'page': page_name, 'header': vol_page.header, 'footer': vol_page.footer}) + + if concat: + removed_hf_filename = os.path.join(output_dir, clean_volid + '_removed_hf.csv') + else: + removed_hf_filename = os.path.join(output_dir, clean_volid, 'removed_hf.csv') + + pd.DataFrame(removed_hf, columns=['page', 'header', 'footer']).to_csv(removed_hf_filename, index=False) def download(args): @@ -317,9 +449,24 @@ def download(args): with open(args.file) as IDfile: volumeIDs = [line.strip() for line in IDfile] + data_api_config = htrc.config.HtrcDataApiConfig( + token=args.token, + host=args.datahost, + port=args.dataport, + epr=args.dataepr, + cert=args.datacert, + key=args.datakey + ) + return download_volumes(volumeIDs, args.output, - username=args.username, password=args.password, - token=args.token, concat=args.concat, mets=args.mets, pages=args.pages, host=args.datahost, - port=args.dataport, cert=args.datacert, key=args.datakey, - epr=args.dataepr) + remove_headers_footers=args.remove_headers_footers or args.remove_headers_footers_and_concat, + concat=args.concat or args.remove_headers_footers_and_concat, + mets=args.mets, + pages=args.pages, + hf_window_size=args.window_size, + hf_min_similarity=args.min_similarity_ratio, + parallelism=args.parallelism, + batch_size=args.batch_size, + skip_removed_hf=args.skip_removed_hf, + data_api_config=data_api_config) diff --git a/setup.py b/setup.py index 91f8bf7..e2d26f3 100644 --- a/setup.py +++ b/setup.py @@ -9,10 +9,10 @@ import atexit import tarfile -__version__ = '0.1.55b0' +__version__ = '0.1.57b0' -install_requires = ['PyLD', 'future', 'prov', 'unicodecsv', 'progressbar2', - 'requests', 'argparse==1.1', 'topicexplorer==1.0b226', 'numpy==1.16.2'] 
+install_requires = ['PyLD', 'future', 'prov', 'unicodecsv', 'progressbar2', 'pandas', + 'requests', 'argparse==1.1', 'topicexplorer==1.0b226', 'numpy==1.16.2', 'tqdm==4.46.0'] # TODO: migrate to docs confix:, 'sphinx-argparse', 'sphinxcontrib-fulltoc'] if sys.version_info.major == 2: install_requires.append('configparser') diff --git a/tests/test_htrc_util_resolve.py b/tests/test_htrc_util_resolve.py index 432734d..6bbbfd0 100644 --- a/tests/test_htrc_util_resolve.py +++ b/tests/test_htrc_util_resolve.py @@ -42,6 +42,9 @@ def test_parse_volume_id(self): id = resolve.parse_volume_id('https://babel.hathitrust.org/cgi/pt?id=uc2.ark:/13960/fk92805m1s;view=1up;seq=7') self.assertEqual(id, 'uc2.ark:/13960/fk92805m1s') + id = resolve.parse_volume_id('https://babel.hathitrust.org/cgi/pt?id=uc2.ark:/13960/fk92805m1s&view=1up&seq=7') + self.assertEqual(id, 'uc2.ark:/13960/fk92805m1s') + id = resolve.parse_volume_id('uc2.ark:/13960/fk92805m1s') self.assertEqual(id, 'uc2.ark:/13960/fk92805m1s') diff --git a/tests/test_htrc_volumes.py b/tests/test_htrc_volumes.py index d4d9abf..752cbf4 100644 --- a/tests/test_htrc_volumes.py +++ b/tests/test_htrc_volumes.py @@ -60,27 +60,53 @@ def test_get_volumes_and_pages(self, https_mock): response_mock.read.return_value =\ ''.encode('utf8') https_mock.return_value.getresponse.return_value = response_mock - - htrc.volumes.get_volumes('1234', self.test_vols, 'data-host', '443', '/home/client-certs/client.pem', '/home/client-certs/client.pem', '/' ) - htrc.volumes.get_pages('1234', self.test_vols, 'data-host', '443', '/home/client-certs/client.pem', '/home/client-certs/client.pem', '/') + data_api_config = htrc.config.HtrcDataApiConfig( + token='1234', + host='data-host', + port=443, + epr='/', + cert='/home/client-certs/client.pem', + key='/home/client-certs/client.pem' + ) + + htrc.volumes.get_volumes(data_api_config, self.test_vols) + htrc.volumes.get_pages(data_api_config, self.test_vols) @patch('htrc.volumes.http.client.HTTPSConnection') def test_get_volumes_and_pages_error(self, https_mock): response_mock = Mock(status=500) https_mock.return_value.getresponse.return_value = response_mock + data_api_config = htrc.config.HtrcDataApiConfig( + token='1234', + host='data-host', + port=443, + epr='/', + cert='/home/client-certs/client.pem', + key='/home/client-certs/client.pem' + ) + with self.assertRaises(EnvironmentError): - htrc.volumes.get_volumes('1234', self.test_vols, 'data-host', '443', '/home/client-certs/client.pem', '/home/client-certs/client.pem', '/' ) + htrc.volumes.get_volumes(data_api_config, self.test_vols) with self.assertRaises(EnvironmentError): - htrc.volumes.get_pages('1234', self.test_vols, 'data-host', '443', '/home/client-certs/client.pem', '/home/client-certs/client.pem', '/') + htrc.volumes.get_pages(data_api_config, self.test_vols) def test_get_volumes_and_pages_empty(self): + data_api_config = htrc.config.HtrcDataApiConfig( + token='1234', + host='data-host', + port=443, + epr='/', + cert='/home/client-certs/client.pem', + key='/home/client-certs/client.pem' + ) + with self.assertRaises(ValueError): - htrc.volumes.get_volumes('1234', [], 'data-host', '443', '/home/client-certs/client.pem', '/home/client-certs/client.pem', '/' ) + htrc.volumes.get_volumes(data_api_config, []) with self.assertRaises(ValueError): - htrc.volumes.get_pages('1234', [], 'data-host', '443', '/home/client-certs/client.pem', '/home/client-certs/client.pem', '/' ) + htrc.volumes.get_pages(data_api_config, []) @patch('htrc.volumes.ZipFile') 
@patch('htrc.volumes.get_volumes') @@ -93,14 +119,21 @@ def test_download_volumes(self, https_mock, oauth2_mock, volumes_mock, oauth2_mock.return_value = 'a1b2c3d4e5' volumes_mock.return_value = b'' - htrc.volumes.download_volumes(self.test_vols, self.output_path, - username='1234', password='1234', token='1234') + data_api_config = htrc.config.HtrcDataApiConfig( + token='1234', + host='data-host', + port=443, + epr='/', + cert='/home/client-certs/client.pem', + key='/home/client-certs/client.pem' + ) + + htrc.volumes.download_volumes(self.test_vols, self.output_path, data_api_config=data_api_config) # test directory creation import shutil shutil.rmtree(self.output_path) - htrc.volumes.download_volumes(self.test_vols, self.output_path, - username='1234', password='1234', token='1234') + htrc.volumes.download_volumes(self.test_vols, self.output_path, data_api_config=data_api_config) # TODO: Fix this test for case where config file exists, but creds not set """ @@ -132,6 +165,7 @@ def test_download_volumes_saved_creds(self, https_mock, oauth2_mock, volumes_moc def test_download(self): pass + suite = unittest.TestLoader().loadTestsFromTestCase(TestVolumes) unittest.TextTestRunner(verbosity=2).run(suite) From a06dfbf0b50989d2c42969b140dd5f553d09be24 Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Thu, 7 Oct 2021 12:49:36 -0400 Subject: [PATCH 7/8] Fixed a conflict in config.py --- htrc/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htrc/config.py b/htrc/config.py index c6b5743..fa2815a 100644 --- a/htrc/config.py +++ b/htrc/config.py @@ -33,7 +33,7 @@ def __init__(self, key: Optional[str] = None) -> None: super().__init__() - self.token = token or get_jwt_token(save_new_token=False) + self.token = token or get_jwt_token() self.host = host or get_dataapi_host() self.port = port or get_dataapi_port() self.epr = epr or get_dataapi_epr() From 703de27dfd83db015e301bf61dc8fc5668080809 Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Tue, 18 Jan 2022 21:54:26 -0500 Subject: [PATCH 8/8] Set final version. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e2d26f3..9fa5ac0 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ import atexit import tarfile -__version__ = '0.1.57b0' +__version__ = '0.1.57' install_requires = ['PyLD', 'future', 'prov', 'unicodecsv', 'progressbar2', 'pandas', 'requests', 'argparse==1.1', 'topicexplorer==1.0b226', 'numpy==1.16.2', 'tqdm==4.46.0']
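As an end-to-end illustration of the header/footer detector that this series
ships, the new modules can be exercised as follows (a sketch that uses only
names defined in patch 6; the page text and page numbers are invented):

    from htrc.models import HtrcPage
    from htrc.runningheaders import parse_page_structure

    # Five pages sharing a running header and carrying consecutive page
    # numbers; the clustering step needs at least min_cluster_size (default 3)
    # similar lines within the look-ahead window.
    bodies = [
        ("An account of mosses observed", "near the rocky northern coast."),
        ("Further notes on the ferns", "growing in the inland valleys."),
        ("Remarks on lichen species", "collected at high altitude."),
        ("Seed dispersal by wind across", "the open chalk grassland."),
        ("A catalogue of the specimens", "gathered during early spring."),
    ]
    pages = [HtrcPage(["THE JOURNAL OF BOTANY", first, second, str(17 + n)])
             for n, (first, second) in enumerate(bodies)]

    structured = parse_page_structure(pages)  # window_size=6, min_similarity_ratio=0.7
    for page in structured:
        print(repr(page.header), '->', repr(page.body))

The detector clusters near-identical lines across nearby pages (Levenshtein
similarity at or above min_similarity_ratio) and, when no footer cluster is
found, falls back to runs of consecutive page numbers on the last lines.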