From fefcf5ada39ca8222b2e27d82092440700cdded4 Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 12:12:05 -0500 Subject: [PATCH 01/49] Added -hf and -hfc arguments --- htrc/__main__.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/htrc/__main__.py b/htrc/__main__.py index 04b26b4..bacc652 100644 --- a/htrc/__main__.py +++ b/htrc/__main__.py @@ -12,10 +12,12 @@ import sys from tempfile import NamedTemporaryFile + from htrc.metadata import get_metadata, get_volume_metadata import htrc.volumes import htrc.workset import htrc.tools.mallet + from argparse import ArgumentParser import htrc.tools.topicexplorer from htrc.lib.cli import bool_prompt @@ -33,6 +35,10 @@ def download_parser(parser=None): help="remove folder if exists") parser.add_argument("-o", "--output", help="output directory", default='/media/secure_volume/workset/') + parser.add_argument("-hf", "--headfoot", action = 'store_true', + help="remove headers and footers from individual pages") + parser.add_argument("-hfc", "--headfootcon", action = 'store_true', + help="remove headers and footers from individual pages then concatenate pages") parser.add_argument("-c", "--concat", action='store_true', help="concatenate a volume's pages in to a single file") parser.add_argument("-m", "--mets", action='store_true', @@ -78,7 +84,8 @@ def main(): help="Download HathiTrust volumes to disk [requires auth]") download_parser(parser_download) parser_download.set_defaults(func='download') - + + # Run helper parser_run = parsers.add_parser('run', help="Run a built-in algorithm.") run_parsers = parser_run.add_subparsers(help="select a command") @@ -125,7 +132,14 @@ def main(): else: print("Please choose another output folder and try again.") sys.exit(1) - + d = os.listdir(args.output) + if args.headfoot is True: + if len(d) == 0: + print("This director is empty") + else: + htrc.volumes.remove_hf(args.output) + if args.headfootcon is True: + htrc.volumes.remove_hf_concat(args.output) if args.pages: if args.mets and args.concat: print ("Cannot set both concat and mets with pages") From 187836b65294391333efeae826b74b2df175077c Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 12:13:14 -0500 Subject: [PATCH 02/49] Added code to try and run header/footer extractor --- htrc/volumes/__init__.py | 120 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 116 insertions(+), 4 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index 6ddb9a7..ed22628 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -30,7 +30,13 @@ from urllib.parse import quote_plus, urlencode import xml.etree.ElementTree as ET from zipfile import ZipFile # used to decompress requested zip archives. 
- +from htrc.runningheaders import parse_page_structure +from htrc.hf_vol_load import load_vol +import pandas as pd +import fnmatch +import glob +from tqdm import tqdm +import shutil from htrc.lib.cli import bool_prompt from htrc.util import split_items import htrc.config @@ -63,6 +69,7 @@ def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, met data = {'volumeIDs': '|'.join( [id.replace('+', ':').replace('=', '/') for id in volume_ids])} + if concat: data['concat'] = 'true' @@ -260,9 +267,110 @@ def check_error_file(output_dir): if os.path.isfile(file_path): grep(file_path, output_dir, "KeyNotFoundException") +def remove_hf(output_dir): + if __name__ == '__main__': + os.makedirs(os.path.join(output_dir, "removed_hf_files"), exist_ok = True) + removed_hf = os.path.join(output_dir, "removed_hf_files") + vol_paths = glob.glob(os.path.join(output_dir,'**')) + df = pd.DataFrame() + + + for path in tqdm(vol_paths): + if os.path.isdir(path): + page_paths = sorted(glob.glob(os.path.join(path, '**', '*.txt'), recursive=True)) + n = len(page_paths) + num = 1 + + while num <= n: + for pg in page_paths: + parsed_path = str(path).split('/') + clean_path_root = '/'.join(parsed_path) + page_num = str(num).zfill(8) + new_filename = page_num+'.txt' + os.rename(pg, clean_path_root+'/'+new_filename) + num += 1 + + folder = os.path.basename(path) + n_pgs = len(fnmatch.filter(os.listdir(path), "*.txt")) + pages = parse_page_structure(load_vol(path, num_pages=n_pgs)) + + body = [] + for n, page in enumerate(pages): + s = "\nPage {} (has_header: {}, has_body: {}, has_footer: {})".format(n+1, page.has_header, page.has_body, page.has_footer) + + pg_boolean = s + "\n" + "-"*len(s) + pg_header = "Header:\n{}".format(page.header if page.has_header else "N/A") + #pg_body = page.body if page.has_body else "" + pg_footer = "Footer:\n{}".format(page.footer if page.has_footer else "N/A") + + body.append(page.body) + + df = df.append({"Volume":folder, "Page Info":pg_boolean, "Header":pg_header, "Footer":pg_footer}, ignore_index = True) + df.sort_values("Volume") + for i, g in df.groupby("Volume"): + g.to_csv(os.path.join(removed_hf, "removed_hf_data_{}.csv".format(i))) + + count = 1 + for item in body: + pg_n = str(count).zfill(8) + filename = '{}.txt'.format(pg_n) + count += 1 + with open(os.path.join(clean_path_root, filename), "w") as f_out: + f_out.write('{}\n'.format(item)) + +def remove_hf_concat(output_dir): + if __name__ == '__main__': + os.makedirs(os.path.join(output_dir, "removed_hf_files"), exist_ok = True) + removed_hf = os.path.join(output_dir, "removed_hf_files") + vol_paths = glob.glob(os.path.join(output_dir,'**')) + df = pd.DataFrame() + retain = ["removed_hf_files"] + + + for path in tqdm(vol_paths): + if os.path.isdir(path): + page_paths = sorted(glob.glob(os.path.join(path, '**', '*.txt'), recursive=True)) + n = len(page_paths) + num = 1 + + while num <= n: + for pg in page_paths: + parsed_path = str(path).split('/') + clean_path_root = '/'.join(parsed_path) + page_num = str(num).zfill(8) + new_filename = page_num+'.txt' + os.rename(pg, clean_path_root+'/'+new_filename) + num += 1 + + folder = os.path.basename(path) + n_pgs = len(fnmatch.filter(os.listdir(path), "*.txt")) + pages = parse_page_structure(load_vol(path, num_pages=n_pgs)) + + filename = '{}.txt'.format(folder) + body = [] + for n, page in enumerate(pages): + s = "\nPage {} (has_header: {}, has_body: {}, has_footer: {})".format(n+1, page.has_header, page.has_body, page.has_footer) + + pg_boolean = s + "\n" + 
"-"*len(s) + pg_header = "Header:\n{}".format(page.header if page.has_header else "N/A") + #pg_body = page.body if page.has_body else "" + pg_footer = "Footer:\n{}".format(page.footer if page.has_footer else "N/A") + + body.append(page.body) + + df = df.append({"Volume":folder, "Page Info":pg_boolean, "Header":pg_header, "Footer":pg_footer}, ignore_index = True) + df.sort_values("Volume") + for i, g in df.groupby("Volume"): + g.to_csv(os.path.join(removed_hf, "removed_hf_data_{}.csv".format(i))) + + + with open(os.path.join(output_dir, filename), "w") as f_out: + f_out.write('\n'.join([str(item) + '\n' for item in body]) + '\n') + if folder not in retain: + shutil.rmtree(os.path.join(output_dir, folder)) def download_volumes(volume_ids, output_dir, username=None, password=None, - config_path=None, token=None, concat=False, mets=False, pages=False, host=None, port=None, cert=None, key=None, epr=None): + config_path=None, token=None, headfootcon=False, headfoot=False, concat=False, mets=False, pages=False, host=None, port=None, cert=None, key=None, epr=None): # create output_dir folder, if nonexistant if not os.path.isdir(output_dir): os.makedirs(output_dir) @@ -305,7 +413,11 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, myzip.close() check_error_file(output_dir) - + if headfoot: + remove_hf(output_dir) + if headfootcon: + remove_hf_concat(output_dir) + except socket.error: raise RuntimeError("Data API request timeout. Is your Data Capsule in Secure Mode?") @@ -320,7 +432,7 @@ def download(args): return download_volumes(volumeIDs, args.output, username=args.username, password=args.password, - token=args.token, concat=args.concat, mets=args.mets, pages=args.pages, host=args.datahost, + token=args.token, headfoot=args.headfoot, headfootcon=args.headfootcon, concat=args.concat, mets=args.mets, pages=args.pages, host=args.datahost, port=args.dataport, cert=args.datacert, key=args.datakey, epr=args.dataepr) From fa7ff5ac10864b60787996e2b841eac7bfaefbfa Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 15:53:12 -0500 Subject: [PATCH 03/49] Create na --- htrc/hf_utils/na | 1 + 1 file changed, 1 insertion(+) create mode 100644 htrc/hf_utils/na diff --git a/htrc/hf_utils/na b/htrc/hf_utils/na new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/htrc/hf_utils/na @@ -0,0 +1 @@ + From a546eca9da4908fe23ec03db893f7e6d81bbc997 Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 15:53:53 -0500 Subject: [PATCH 04/49] Create na --- htrc/hf_vol_load/na | 1 + 1 file changed, 1 insertion(+) create mode 100644 htrc/hf_vol_load/na diff --git a/htrc/hf_vol_load/na b/htrc/hf_vol_load/na new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/htrc/hf_vol_load/na @@ -0,0 +1 @@ + From 9ef790adacec73af01a0f20638f6168ec20babfd Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 15:54:45 -0500 Subject: [PATCH 05/49] Create na --- htrc/models/na | 1 + 1 file changed, 1 insertion(+) create mode 100644 htrc/models/na diff --git a/htrc/models/na b/htrc/models/na new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/htrc/models/na @@ -0,0 +1 @@ + From 9094677a8933a213500de0411db0265e7be8dd87 Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 15:55:34 -0500 Subject: [PATCH 06/49] Create na --- htrc/runningheaders/na | 1 + 1 file changed, 1 insertion(+) create mode 100644 htrc/runningheaders/na diff --git a/htrc/runningheaders/na b/htrc/runningheaders/na new file mode 100644 index 0000000..8b13789 --- /dev/null +++ 
b/htrc/runningheaders/na @@ -0,0 +1 @@ + From 9d2520be71d98670a94fd0709e8eb2808d678364 Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 15:56:34 -0500 Subject: [PATCH 07/49] Add files via upload --- htrc/runningheaders/__init__.py | 163 ++++++++++++++++++++++++++++++++ 1 file changed, 163 insertions(+) create mode 100644 htrc/runningheaders/__init__.py diff --git a/htrc/runningheaders/__init__.py b/htrc/runningheaders/__init__.py new file mode 100644 index 0000000..799bf39 --- /dev/null +++ b/htrc/runningheaders/__init__.py @@ -0,0 +1,163 @@ +import re +from collections import defaultdict +from typing import List, TypeVar, Set, Iterator, Optional, Tuple, Dict + +from htrc.models import Page, PageStructure +from htrc.hf_utils import clean_text, levenshtein, pairwise_combine_within_distance, flatten, group_consecutive_when + +T = TypeVar('T', bound=Page) +U = TypeVar('U', bound=PageStructure) + + +class _Line: + def __init__(self, text: str, line_number: int, page: Page) -> None: + self.text = text + self.line_number = line_number + self.page = page + self.cleaned_text = clean_text(text) + + def __eq__(self, o: object) -> bool: + if not isinstance(o, _Line): + raise NotImplemented + + are_equal = self.page is o.page and self.line_number == o.line_number + + return are_equal + + def __ne__(self, o: object) -> bool: + return not self == o + + def __hash__(self) -> int: + line_hash = hash(self.line_number) + page_hash = hash(self.page) + hash_value = 31 * line_hash + page_hash + + return hash_value + + def __str__(self) -> str: + return str((self.line_number, self.cleaned_text)) + + def similarity_ratio(self, line: '_Line') -> float: + ratio = 1 - float(levenshtein(self.cleaned_text, line.cleaned_text)) / max(len(self.cleaned_text), + len(line.cleaned_text)) + + return ratio + + +def parse_page_structure(pages: List[T], + window_size: int = 6, + min_similarity_ratio: float = 0.7, + min_cluster_size: int = 3, + max_header_lines: int = 3, + max_footer_lines: int = 3) -> List[U]: + def _get_page_lines(p: T) -> List[_Line]: + return [_Line(text, line_num, p) for line_num, text in enumerate(p.text_lines)] + + def _cluster_lines(lines: List[Tuple[_Line, _Line]]) -> Set[tuple]: + cluster_map = {} + + for l1, l2 in lines: + c1 = cluster_map.get(l1) + c2 = cluster_map.get(l2) + + if c1 is not None and c2 is not None and c1 is not c2: + smaller, larger = (c1, c2) if len(c1) < len(c2) else (c2, c1) + larger.extend(smaller) + for x in smaller: + cluster_map[x] = larger + elif c1 is not None and c2 is None: + c1.append(l2) + cluster_map[l2] = c1 + elif c1 is None and c2 is not None: + c2.append(l1) + cluster_map[l1] = c2 + elif c1 is None and c2 is None: + c = [l1, l2] + cluster_map[l1] = c + cluster_map[l2] = c + + return set(map(tuple, cluster_map.values())) + + def _group_lines_by_page(lines: Iterator[_Line]) -> Dict[Page, List[_Line]]: + lines_grouped_by_page = defaultdict(list) + for line in lines: + lines_grouped_by_page[line.page].append(line) + + return lines_grouped_by_page + + def _get_last_header_line(lines: List[_Line]) -> Optional[int]: + if not lines: + return None + + return max(l.line_number for l in lines) + + def _get_first_footer_line(lines: List[_Line]) -> Optional[int]: + if not lines: + return None + + return min(l.line_number for l in lines) + + def _extract_line_numbers(line: _Line) -> Tuple[_Line, List[int]]: + numbers = [int(match.group(0)) for match in + re.finditer(r"(?:(?<=^)|(?<=\s))\d{1,4}(?=\s|$)", line.text, flags=re.UNICODE)] + + return line, numbers + + def 
_extract_potential_page_numbers(lines: List[_Line]) -> Tuple[_Line, List[int]]: + assert len(lines) > 0 + line, numbers = _extract_line_numbers(lines[-1]) + if not numbers and not str.strip(line.text) and len(lines) > 1: + line, numbers = _extract_line_numbers(lines[-2]) + + return line, numbers + + candidate_header_lines = [] + candidate_footer_lines = [] + + pages_lines = [_get_page_lines(p) for p in pages] + + for lines in pages_lines: + # ignore lines that are <4 characters long and/or have no alphabetic characters + candidate_header_lines.append([l for l in lines[:max_header_lines] if not len(l.cleaned_text) < 4]) + candidate_footer_lines.append([l for l in lines[-max_footer_lines:] if not len(l.cleaned_text) < 4]) + + headers_for_comparison = pairwise_combine_within_distance(candidate_header_lines, window_size) + footers_for_comparison = pairwise_combine_within_distance(candidate_footer_lines, window_size) + + header_line_similarities = [] + for (lines1, lines2) in headers_for_comparison: + header_line_similarities.extend( + (l1, l2) for l1 in lines1 for l2 in lines2 if l1.similarity_ratio(l2) >= min_similarity_ratio) + + footer_line_similarities = [] + for (lines1, lines2) in footers_for_comparison: + footer_line_similarities.extend( + (l1, l2) for l1 in lines1 for l2 in lines2 if l1.similarity_ratio(l2) >= min_similarity_ratio) + + header_clusters = [cluster for cluster in _cluster_lines(header_line_similarities) if + len(cluster) >= min_cluster_size] + footer_clusters = [cluster for cluster in _cluster_lines(footer_line_similarities) if + len(cluster) >= min_cluster_size] + + if not footer_clusters: + potential_page_numbers = [_extract_potential_page_numbers(lines) for lines in pages_lines if lines] + potential_page_numbers = [(line, numbers[0]) for line, numbers in potential_page_numbers if len(numbers) == 1] + potential_clusters = map(lambda group: tuple(map(lambda t: t[0], group)), + group_consecutive_when(potential_page_numbers, lambda x, y: y[1] - x[1] == 1)) + footer_clusters = [cluster for cluster in potential_clusters if len(cluster) >= min_cluster_size] + + header_lines_grouped_by_page = _group_lines_by_page(flatten(header_clusters)) + footer_lines_grouped_by_page = _group_lines_by_page(flatten(footer_clusters)) + + last_header_line_pages_map = {p: _get_last_header_line(lines) for p, lines in header_lines_grouped_by_page.items()} + first_footer_line_pages_map = {p: _get_first_footer_line(lines) for p, lines in + footer_lines_grouped_by_page.items()} + + for page in pages: + last_header_line = last_header_line_pages_map.get(page) + first_footer_line = first_footer_line_pages_map.get(page) + page.__class__ = type('StructuredPage', (page.__class__, PageStructure), {}) + page.num_header_lines = last_header_line + 1 if last_header_line is not None else 0 + page.num_footer_lines = len(page.text_lines) - first_footer_line if first_footer_line is not None else 0 + + return pages From c526c45de7bf389643488ed7587c058a4cacb40d Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 15:56:59 -0500 Subject: [PATCH 08/49] Delete na --- htrc/runningheaders/na | 1 - 1 file changed, 1 deletion(-) delete mode 100644 htrc/runningheaders/na diff --git a/htrc/runningheaders/na b/htrc/runningheaders/na deleted file mode 100644 index 8b13789..0000000 --- a/htrc/runningheaders/na +++ /dev/null @@ -1 +0,0 @@ - From 2ea0cdc4f49294319952ca18d7ffeb4273a98f20 Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 15:57:34 -0500 Subject: [PATCH 09/49] Add files via upload --- 
htrc/models/__init__.py | 68 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 htrc/models/__init__.py diff --git a/htrc/models/__init__.py b/htrc/models/__init__.py new file mode 100644 index 0000000..e86e115 --- /dev/null +++ b/htrc/models/__init__.py @@ -0,0 +1,68 @@ +import os +from abc import ABC, abstractmethod +from typing import List + + +class Page(ABC): + @property + @abstractmethod + def text_lines(self) -> List[str]: + """ + The lines of text on the page + """ + pass + + @property + def text(self) -> str: + return os.linesep.join(self.text_lines) + + +class PageStructure(Page, ABC): + def __init__(self) -> None: + self.num_header_lines = 0 + self.num_footer_lines = 0 + + @property + def has_header(self) -> bool: + return self.num_header_lines > 0 + + @property + def has_body(self) -> bool: + return len(self.text_lines) - self.num_header_lines - self.num_footer_lines > 0 + + @property + def has_footer(self) -> bool: + return self.num_footer_lines > 0 + + @property + def header_lines(self) -> List[str]: + return self.text_lines[:self.num_header_lines] + + @property + def body_lines(self) -> List[str]: + return self.text_lines[self.num_header_lines:len(self.text_lines) - self.num_footer_lines] + + @property + def footer_lines(self) -> List[str]: + return self.text_lines[-self.num_footer_lines:] if self.has_footer else [] + + @property + def header(self) -> str: + return os.linesep.join(self.header_lines) + + @property + def body(self) -> str: + return os.linesep.join(self.body_lines) + + @property + def footer(self) -> str: + return os.linesep.join(self.footer_lines) + + +class HtrcPage(Page): + def __init__(self, lines: List[str]) -> None: + self._lines = lines + + @property + def text_lines(self) -> List[str]: + return self._lines From ee06a59fe5a2bfdac00a6513d953c4e571943002 Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 15:57:59 -0500 Subject: [PATCH 10/49] Delete na --- htrc/models/na | 1 - 1 file changed, 1 deletion(-) delete mode 100644 htrc/models/na diff --git a/htrc/models/na b/htrc/models/na deleted file mode 100644 index 8b13789..0000000 --- a/htrc/models/na +++ /dev/null @@ -1 +0,0 @@ - From a2479c57a7ee3e74889300e905babf55ae26e5fa Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 15:58:51 -0500 Subject: [PATCH 11/49] Add files via upload --- htrc/hf_vol_load/__init__.py | 117 +++++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 htrc/hf_vol_load/__init__.py diff --git a/htrc/hf_vol_load/__init__.py b/htrc/hf_vol_load/__init__.py new file mode 100644 index 0000000..72f08d5 --- /dev/null +++ b/htrc/hf_vol_load/__init__.py @@ -0,0 +1,117 @@ +import unittest +from typing import List + +from htrc.models import HtrcPage +from htrc.runningheaders import parse_page_structure, clean_text, levenshtein + + +class TestRunningHeaders(unittest.TestCase): + def test_finding_running_headers(self): + pages = load_vol("data/vol1", num_pages=10) + structured_pages = parse_page_structure(pages) + headers = ["|".join(page.header_lines) for page in structured_pages] + expected = [ + "", + "", + "CHAPTER 1|INTRODUCTION TO RUNNING HEADERS|Lorem Ipsum style", + "1 INTRODUCTION TO RUNNING HEADERS|Lorem Ipsum style", + "INTRODUCTION TO RUNNING HEADERS 1|Lorem Ipsum style", + "1 INTRODUCTION TO RUNNING HEADERS|Lorem Ipsum style", + "CHAPTER 2|EVERYTHING IS RELATIVE", + "2 EVERYTHING IS RELATIVE", + "EVERYTHING IS RELATIVE 2", + "2 EVERYTHING IS RELATIVE" + ] + self.assertListEqual(expected, 
headers) + + def test_finding_running_footers(self): + pages = load_vol("data/vol1", num_pages=10) + structured_pages = parse_page_structure(pages) + footers = ["|".join(page.footer_lines) for page in structured_pages] + expected = [ + "", + "", + "Page 2", + "Page 3", + "Page 4", + "Page 5", + "Page 6", + "Page 7", + "Page 8", + "Page 9" + ] + self.assertListEqual(expected, footers) + + def test_identify_correct_page_body(self): + pages = load_vol("data/vol1", num_pages=10) + structured_pages = parse_page_structure(pages) + len_body_per_page = [len(page.body_lines) for page in structured_pages] + expected = [0, 7, 43, 28, 26, 30, 31, 27, 28, 15] + self.assertListEqual(expected, len_body_per_page) + + def test_find_footer_with_page_numbers(self): + pages = load_vol("data/vol2", num_pages=10) + structured_pages = parse_page_structure(pages) + footers = ["|".join(page.footer_lines) for page in structured_pages] + expected = [ + "", + "", + "2", + " 3", + "4", + " 5", + "6", + " 7", + "8", + " 9" + ] + self.assertListEqual(expected, footers) + + +class TestUtils(unittest.TestCase): + def test_clean_text(self): + s1 = u"\t На берегу \tпустынных волн \t\n" + s1_expected = u"на берегу пустынных волн" + s2 = u" Pot să mănânc sticlă și ea nu mă rănește. " + s2_expected = u"pot să mănânc sticlă și ea nu mă rănește" + s1_clean = clean_text(s1) + s2_clean = clean_text(s2) + + self.assertEqual(s1_expected, s1_clean) + self.assertEqual(s2_expected, s2_clean) + + def test_levenshtein(self): + s1 = "rosettacode" + s2 = "raisethysword" + lev = levenshtein(s1, s2) + self.assertEqual(8, lev) + + s1 = "kitten" + s2 = "sitting" + lev = levenshtein(s1, s2, replace_cost=2) + self.assertEqual(5, lev) + + s1 = "abracadabra" + s2 = "abracadabra" + lev = levenshtein(s1, s2) + self.assertEqual(0, lev) + + s1 = "" + s2 = "abc" + lev = levenshtein(s1, s2) + self.assertEqual(3, lev) + + +def load_vol(path: str, num_pages: int) -> List[HtrcPage]: + pages = [] + for n in range(num_pages): + page_num = str(n+1).zfill(8) + with open('{}/{}.txt'.format(path, page_num), encoding='utf-8') as f: + lines = [line.rstrip() for line in f.readlines()] + pages.append(HtrcPage(lines)) + + return pages + + +if __name__ == '__main__': + unittest.main() From 6ff15e617ebb812c6c29bfc32892efb94375129b Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 15:59:33 -0500 Subject: [PATCH 12/49] Add files via upload --- htrc/hf_utils/__init__.py | 110 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 htrc/hf_utils/__init__.py diff --git a/htrc/hf_utils/__init__.py b/htrc/hf_utils/__init__.py new file mode 100644 index 0000000..81553de --- /dev/null +++ b/htrc/hf_utils/__init__.py @@ -0,0 +1,110 @@ +import re +from typing import TypeVar, List, Iterator, Tuple, Callable + +T = TypeVar('T') + + +def clean_text(s: str) -> str: + # replace all characters which aren't letters with whitespaces ([\W\d_] is equivalent of \P{L} which is unsupported) + s = re.sub(r'[\W\d_]+', " ", s, flags=re.UNICODE) + # replace multiple sequential whitespaces with single whitespace + s = re.sub(r'\s{2,}', " ", s, flags=re.UNICODE) + # trim whitespaces at the beginning and end + s = s.strip() + # lowercase + s = s.lower() + + return s + + +def levenshtein(s: str, t: str, insert_cost: int = 1, delete_cost: int = 1, replace_cost: int = 1) -> int: + """ From Wikipedia article; Iterative with two matrix rows. 
""" + # degenerate cases + if s == t: + return 0 + + len0 = len(s) + len1 = len(t) + + if not len0: + return len1 + + if not len1: + return len0 + + # the array of distances + v0 = [0] * (len0 + 1) + v1 = [0] * (len0 + 1) + + # initial cost of skipping prefix in s + for i in range(len(v0)): + v0[i] = i + + # dynamically compute the array of distances + + # transformation cost for each letter in t + for j in range(len1): + # initial cost of skipping prefix in t + v1[0] = j + 1 + + # transformation cost for each letter in s + for i in range(len0): + # matching current letters in both strings + match = 0 if s[i] == t[j] else 1 + + # computing cost for each transformation + cost_insert = v0[i + 1] + insert_cost + cost_delete = v1[i] + delete_cost + cost_replace = v0[i] + match * replace_cost + + # keep minimum cost + v1[i + 1] = min(cost_insert, cost_delete, cost_replace) + + # swap cost arrays + v0, v1 = v1, v0 + + # the distance is the cost for transforming all letters in both strings + return v0[len0] + + +def pairwise_combine_within_distance(xs: List[T], n: int) -> List[Tuple[T, T]]: + if not xs: + return [] + + result = [] + x, xs = xs[0], xs[1:] + + while xs: + result = result + [(x, v) for v in xs[:n - 1]] + x, xs = xs[0], xs[1:] + + return result + + +def group_consecutive_when(xs: List[T], pred: Callable[[T, T], bool]) -> Iterator[List[T]]: + result = [] + _prev, _next = None, None + + while len(xs) > 1: + _prev, _next = xs[0], xs[1] + result.append(_prev) + if not pred(_prev, _next): + yield result + result = [] + xs = xs[1:] + + if len(xs) == 1: + _prev, _next = _next, xs[0] + + if _prev is not None and _next is not None and pred(_prev, _next): + result.extend([_prev, _next]) + elif _next is not None: + result.append(_next) + + yield result + + +def flatten(xss: List[tuple]) -> Iterator[T]: + for xs in xss: + for x in xs: + yield x From f0d26f0c57c2146ffa9db5926ce0d3fa46406824 Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 15:59:51 -0500 Subject: [PATCH 13/49] Delete na --- htrc/hf_vol_load/na | 1 - 1 file changed, 1 deletion(-) delete mode 100644 htrc/hf_vol_load/na diff --git a/htrc/hf_vol_load/na b/htrc/hf_vol_load/na deleted file mode 100644 index 8b13789..0000000 --- a/htrc/hf_vol_load/na +++ /dev/null @@ -1 +0,0 @@ - From 13a099de206e5e355ee57b3d6aec9eba2b6adb6f Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 16:00:08 -0500 Subject: [PATCH 14/49] Delete na --- htrc/hf_utils/na | 1 - 1 file changed, 1 deletion(-) delete mode 100644 htrc/hf_utils/na diff --git a/htrc/hf_utils/na b/htrc/hf_utils/na deleted file mode 100644 index 8b13789..0000000 --- a/htrc/hf_utils/na +++ /dev/null @@ -1 +0,0 @@ - From e7dca21c2698222aa050cbdc04fb4a74b7241a33 Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 16:01:25 -0500 Subject: [PATCH 15/49] Add files via upload --- htrc/__main__.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/htrc/__main__.py b/htrc/__main__.py index bacc652..3f35557 100644 --- a/htrc/__main__.py +++ b/htrc/__main__.py @@ -132,18 +132,26 @@ def main(): else: print("Please choose another output folder and try again.") sys.exit(1) - d = os.listdir(args.output) - if args.headfoot is True: - if len(d) == 0: - print("This director is empty") - else: - htrc.volumes.remove_hf(args.output) - if args.headfootcon is True: - htrc.volumes.remove_hf_concat(args.output) + + if args.concat and args.headfoot: + print("Cannot set both concat and headfoot") + sys.exit(1) + if args.concat and 
args.headfootcon: + print("Cannot set both concat and headfootcon") + sys.exit(1) + if args.headfoot and args.headfootcon: + print("Cannot set both headfoot and headfootcon") + sys.exit(1) + if args.mets and args.headfootcon: + print("Cannot set both mets and headfootcon") + sys.exit(1) if args.pages: if args.mets and args.concat: print ("Cannot set both concat and mets with pages") sys.exit(1) + if args.mets and args.headfootcon: + print("Cannot set both mets and headfootcon with pages") + sys.exit(1) try: resolve_and_download(args) From 87ff2b91ecd24e55bb7944c5f04d898481e32ccb Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 16:02:05 -0500 Subject: [PATCH 16/49] Add files via upload --- htrc/volumes/__init__.py | 169 ++++++++++++++++++++------------------- 1 file changed, 88 insertions(+), 81 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index ed22628..a50ced6 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -268,106 +268,104 @@ def check_error_file(output_dir): grep(file_path, output_dir, "KeyNotFoundException") def remove_hf(output_dir): - if __name__ == '__main__': - os.makedirs(os.path.join(output_dir, "removed_hf_files"), exist_ok = True) - removed_hf = os.path.join(output_dir, "removed_hf_files") - vol_paths = glob.glob(os.path.join(output_dir,'**')) - df = pd.DataFrame() + os.makedirs(os.path.join(output_dir, "removed_hf_files"), exist_ok = True) + removed_hf = os.path.join(output_dir, "removed_hf_files") + vol_paths = glob.glob(os.path.join(output_dir,'**')) + df = pd.DataFrame() - for path in tqdm(vol_paths): - if os.path.isdir(path): - page_paths = sorted(glob.glob(os.path.join(path, '**', '*.txt'), recursive=True)) - n = len(page_paths) - num = 1 + for path in tqdm(vol_paths): + if os.path.isdir(path): + page_paths = sorted(glob.glob(os.path.join(path, '**', '*.txt'), recursive=True)) + n = len(page_paths) + num = 1 - while num <= n: - for pg in page_paths: - parsed_path = str(path).split('/') - clean_path_root = '/'.join(parsed_path) - page_num = str(num).zfill(8) - new_filename = page_num+'.txt' - os.rename(pg, clean_path_root+'/'+new_filename) - num += 1 + while num <= n: + for pg in page_paths: + parsed_path = str(path).split('/') + clean_path_root = '/'.join(parsed_path) + page_num = str(num).zfill(8) + new_filename = page_num+'.txt' + os.rename(pg, clean_path_root+'/'+new_filename) + num += 1 - folder = os.path.basename(path) - n_pgs = len(fnmatch.filter(os.listdir(path), "*.txt")) - pages = parse_page_structure(load_vol(path, num_pages=n_pgs)) + folder = os.path.basename(path) + n_pgs = len(fnmatch.filter(os.listdir(path), "*.txt")) + pages = parse_page_structure(load_vol(path, num_pages=n_pgs)) - body = [] - for n, page in enumerate(pages): - s = "\nPage {} (has_header: {}, has_body: {}, has_footer: {})".format(n+1, page.has_header, page.has_body, page.has_footer) + body = [] + for n, page in enumerate(pages): + s = "\nPage {} (has_header: {}, has_body: {}, has_footer: {})".format(n+1, page.has_header, page.has_body, page.has_footer) - pg_boolean = s + "\n" + "-"*len(s) - pg_header = "Header:\n{}".format(page.header if page.has_header else "N/A") - #pg_body = page.body if page.has_body else "" - pg_footer = "Footer:\n{}".format(page.footer if page.has_footer else "N/A") + pg_boolean = s + "\n" + "-"*len(s) + pg_header = "Header:\n{}".format(page.header if page.has_header else "N/A") + #pg_body = page.body if page.has_body else "" + pg_footer = "Footer:\n{}".format(page.footer if page.has_footer else 
"N/A") - body.append(page.body) + body.append(page.body) - df = df.append({"Volume":folder, "Page Info":pg_boolean, "Header":pg_header, "Footer":pg_footer}, ignore_index = True) - df.sort_values("Volume") - for i, g in df.groupby("Volume"): - g.to_csv(os.path.join(removed_hf, "removed_hf_data_{}.csv".format(i))) + df = df.append({"Volume":folder, "Page Info":pg_boolean, "Header":pg_header, "Footer":pg_footer}, ignore_index = True) + df.sort_values("Volume") + for i, g in df.groupby("Volume"): + g.to_csv(os.path.join(removed_hf, "removed_hf_data_{}.csv".format(i))) - count = 1 - for item in body: - pg_n = str(count).zfill(8) - filename = '{}.txt'.format(pg_n) - count += 1 - with open(os.path.join(clean_path_root, filename), "w") as f_out: - f_out.write('{}\n'.format(item)) + count = 1 + for item in body: + pg_n = str(count).zfill(8) + filename = '{}.txt'.format(pg_n) + count += 1 + with open(os.path.join(clean_path_root, filename), "w") as f_out: + f_out.write('{}\n'.format(item)) def remove_hf_concat(output_dir): - if __name__ == '__main__': - os.makedirs(os.path.join(output_dir, "removed_hf_files"), exist_ok = True) - removed_hf = os.path.join(output_dir, "removed_hf_files") - vol_paths = glob.glob(os.path.join(output_dir,'**')) - df = pd.DataFrame() - retain = ["removed_hf_files"] + os.makedirs(os.path.join(output_dir, "removed_hf_files"), exist_ok = True) + removed_hf = os.path.join(output_dir, "removed_hf_files") + vol_paths = glob.glob(os.path.join(output_dir,'**')) + df = pd.DataFrame() + retain = ["removed_hf_files"] - for path in tqdm(vol_paths): - if os.path.isdir(path): - page_paths = sorted(glob.glob(os.path.join(path, '**', '*.txt'), recursive=True)) - n = len(page_paths) - num = 1 + for path in tqdm(vol_paths): + if os.path.isdir(path): + page_paths = sorted(glob.glob(os.path.join(path, '**', '*.txt'), recursive=True)) + n = len(page_paths) + num = 1 - while num <= n: - for pg in page_paths: - parsed_path = str(path).split('/') - clean_path_root = '/'.join(parsed_path) - page_num = str(num).zfill(8) - new_filename = page_num+'.txt' - os.rename(pg, clean_path_root+'/'+new_filename) - num += 1 + while num <= n: + for pg in page_paths: + parsed_path = str(path).split('/') + clean_path_root = '/'.join(parsed_path) + page_num = str(num).zfill(8) + new_filename = page_num+'.txt' + os.rename(pg, clean_path_root+'/'+new_filename) + num += 1 - folder = os.path.basename(path) - n_pgs = len(fnmatch.filter(os.listdir(path), "*.txt")) - pages = parse_page_structure(load_vol(path, num_pages=n_pgs)) + folder = os.path.basename(path) + n_pgs = len(fnmatch.filter(os.listdir(path), "*.txt")) + pages = parse_page_structure(load_vol(path, num_pages=n_pgs)) - filename = '{}.txt'.format(folder) - body = [] - for n, page in enumerate(pages): - s = "\nPage {} (has_header: {}, has_body: {}, has_footer: {})".format(n+1, page.has_header, page.has_body, page.has_footer) + filename = '{}.txt'.format(folder) + body = [] + for n, page in enumerate(pages): + s = "\nPage {} (has_header: {}, has_body: {}, has_footer: {})".format(n+1, page.has_header, page.has_body, page.has_footer) - pg_boolean = s + "\n" + "-"*len(s) - pg_header = "Header:\n{}".format(page.header if page.has_header else "N/A") - #pg_body = page.body if page.has_body else "" - pg_footer = "Footer:\n{}".format(page.footer if page.has_footer else "N/A") + pg_boolean = s + "\n" + "-"*len(s) + pg_header = "Header:\n{}".format(page.header if page.has_header else "N/A") + #pg_body = page.body if page.has_body else "" + pg_footer = 
"Footer:\n{}".format(page.footer if page.has_footer else "N/A") - body.append(page.body) + body.append(page.body) - df = df.append({"Volume":folder, "Page Info":pg_boolean, "Header":pg_header, "Footer":pg_footer}, ignore_index = True) - df.sort_values("Volume") - for i, g in df.groupby("Volume"): - g.to_csv(os.path.join(removed_hf, "removed_hf_data_{}.csv".format(i))) + df = df.append({"Volume":folder, "Page Info":pg_boolean, "Header":pg_header, "Footer":pg_footer}, ignore_index = True) + df.sort_values("Volume") + for i, g in df.groupby("Volume"): + g.to_csv(os.path.join(removed_hf, "removed_hf_data_{}.csv".format(i))) - with open(os.path.join(output_dir, filename), "w") as f_out: - f_out.write('\n'.join([str(item) + '\n' for item in body]) + '\n') - if folder not in retain: - shutil.rmtree(os.path.join(output_dir, folder)) + with open(os.path.join(output_dir, filename), "w") as f_out: + f_out.write('\n'.join([str(item) + '\n' for item in body]) + '\n') + if folder not in retain: + shutil.rmtree(os.path.join(output_dir, folder)) def download_volumes(volume_ids, output_dir, username=None, password=None, config_path=None, token=None, headfootcon=False, headfoot=False, concat=False, mets=False, pages=False, host=None, port=None, cert=None, key=None, epr=None): @@ -413,10 +411,19 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, myzip.close() check_error_file(output_dir) + d = os.listdir(output_dir) if headfoot: - remove_hf(output_dir) + if len(d) == 0: + print("This directory is empty") + sys.exit(1) + else: + remove_hf(output_dir) if headfootcon: - remove_hf_concat(output_dir) + if len(d) == 0: + print("This directory is empty") + sys.exit(1) + else: + remove_hf_concat(output_dir) except socket.error: raise RuntimeError("Data API request timeout. 
Is your Data Capsule in Secure Mode?") From 29f2e249fc80135046c30315f92c684226f0d845 Mon Sep 17 00:00:00 2001 From: David K Date: Mon, 30 Nov 2020 13:14:28 -0500 Subject: [PATCH 17/49] Pinned tqdm 4.46.0 package to setup.py file --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 300957c..70287f0 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ __version__ = '0.1.54' install_requires = ['PyLD', 'future', 'prov', 'unicodecsv', 'progressbar2', - 'requests', 'argparse==1.1', 'topicexplorer==1.0b226', 'numpy==1.16.2'] + 'requests', 'argparse==1.1', 'topicexplorer==1.0b226', 'numpy==1.16.2', 'tqdm==4.46.0'] # TODO: migrate to docs confix:, 'sphinx-argparse', 'sphinxcontrib-fulltoc'] if sys.version_info.major == 2: install_requires.append('configparser') From b7fa1a2f15aa094af7da3ffa6b6f89369650e8f8 Mon Sep 17 00:00:00 2001 From: David K Date: Mon, 30 Nov 2020 18:06:36 -0500 Subject: [PATCH 18/49] Made changes to hf_remove_concat function --- htrc/volumes/__init__.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index a50ced6..d85ec1f 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -268,8 +268,8 @@ def check_error_file(output_dir): grep(file_path, output_dir, "KeyNotFoundException") def remove_hf(output_dir): - os.makedirs(os.path.join(output_dir, "removed_hf_files"), exist_ok = True) - removed_hf = os.path.join(output_dir, "removed_hf_files") + os.makedirs(os.path.join(output_dir, "removed_hf_data"), exist_ok = True) + removed_hf = os.path.join(output_dir, "removed_hf_data") vol_paths = glob.glob(os.path.join(output_dir,'**')) df = pd.DataFrame() @@ -318,11 +318,12 @@ def remove_hf(output_dir): f_out.write('{}\n'.format(item)) def remove_hf_concat(output_dir): - os.makedirs(os.path.join(output_dir, "removed_hf_files"), exist_ok = True) - removed_hf = os.path.join(output_dir, "removed_hf_files") + os.makedirs(os.path.join(output_dir, "removed_hf_data"), exist_ok = True) + removed_hf = os.path.join(output_dir, "removed_hf_data") vol_paths = glob.glob(os.path.join(output_dir,'**')) df = pd.DataFrame() - retain = ["removed_hf_files"] + retain = ["removed_hf_data"] + rm_txt = "removed_hf_data.txt" for path in tqdm(vol_paths): @@ -366,7 +367,11 @@ def remove_hf_concat(output_dir): f_out.write('\n'.join([str(item) + '\n' for item in body]) + '\n') if folder not in retain: shutil.rmtree(os.path.join(output_dir, folder)) - + if os.path.exists(os.path.join(output_dir, rm_txt)): + os.remove(os.path.join(output_dir, rm_txt)) + + + def download_volumes(volume_ids, output_dir, username=None, password=None, config_path=None, token=None, headfootcon=False, headfoot=False, concat=False, mets=False, pages=False, host=None, port=None, cert=None, key=None, epr=None): # create output_dir folder, if nonexistant From 8b74d1f02ed01066319395e21f394f5d6934dcaa Mon Sep 17 00:00:00 2001 From: David K Date: Wed, 2 Dec 2020 14:02:48 -0500 Subject: [PATCH 19/49] Added documentation for header/footer extractor --- docs/source/cli.rst | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/docs/source/cli.rst b/docs/source/cli.rst index 0d19316..8101173 100644 --- a/docs/source/cli.rst +++ b/docs/source/cli.rst @@ -1,6 +1,6 @@ HTRC Workset Toolkit ====================== -The HTRC Workset Toolkit povides a command line interface for interacting with +The HTRC Workset Toolkit povides a command line interface for interacting 
with and analyzing volumes in the HathiTrust Digital Library: - Volume Download (``htrc download``) @@ -11,7 +11,7 @@ and analyzing volumes in the HathiTrust Digital Library: Workset Path -------------- -Each of these commands takes a *workset path*. Valid types of workset paths +Each of these commands takes a *workset path*. Valid types of workset paths and examples of each are: ================================== ============================================================================== @@ -71,7 +71,7 @@ download`_, the Topic Modeling '''''''''''''''' -There are two implementations of LDA topic modeling supported by the +There are two implementations of LDA topic modeling supported by the Arguments @@ -114,6 +114,18 @@ Following are the use cases and examples of ``htrc`` commands inside the HTRC Da ``htrc download /home/dcuser/HTRC/htrc-id -o /media/secure_volume/my-workset -c`` +* Download specific pages from a single volume : + + ``htrc download -pg coo.31924089593846[5,10,15,20,25,30]`` + +* Download volumes and then extract headers/footers from the volumes : + + ``htrc download -hf /home/dcuser/HTRC/htrc-id`` + +* Download volumes, extract headers/footers from the volume pages then concatenate the pages - (This will concatenate all the pages of the volume into one txt file.) : + + ``htrc download -hfc /home/dcuser/HTRC/htrc-id`` + | +---------------------------------+-----------------------------------------------+ | command: ``htrc metadata`` | capsule mode: **secure** and **maintenance** | @@ -246,7 +258,3 @@ Following are the use cases and examples of ``htrc`` commands inside the HTRC Da * Run topicexplorer on already downloaded volume - (Sample volumes are available in capsules created with ubuntu-16-04-with-sample-volumes image. Those sample volumes are available as zip files. Please unzip before use them because the metadata function gets volume ids from volume directory names). 
``htrc topicexplorer /home/dcuser/unzipped_volumes -k 20`` - - - - From 528e127897495d05c940fd767fa6ac444d072049 Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Thu, 18 Feb 2021 10:39:23 -0500 Subject: [PATCH 20/49] Added volume not found error for pd-only access --- htrc/.htrc.default | 1 + htrc/config.py | 3 +++ htrc/volumes/__init__.py | 10 ++++++---- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/htrc/.htrc.default b/htrc/.htrc.default index bfeadee..3ec2327 100644 --- a/htrc/.htrc.default +++ b/htrc/.htrc.default @@ -8,6 +8,7 @@ port = 443 url = / cert = key = +pd_only = [oauth] host = silvermaple.pti.indiana.edu diff --git a/htrc/config.py b/htrc/config.py index ccd7d54..13ecf80 100644 --- a/htrc/config.py +++ b/htrc/config.py @@ -56,6 +56,9 @@ def get_dataapi_cert(path=None): def get_dataapi_key(path=None): return _get_value('data', 'key', path) +def get_dataapi_access(path=None): + return _get_value('data', 'pd_only', path) + def get_idp_host_port(path=None): host = _get_value('idp', 'host', path) port = _get_value('idp', 'port', path) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index 6ddb9a7..1282159 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -249,8 +249,7 @@ def grep(file_name, output_dir, pattern): if len(na_volume) == 100: print("\nThere are 100 or more unavailable volumes.\nTo check the validity of volumes in your workset or volume id file go to:\n https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org for assistance.") -def check_error_file(output_dir): - file_name = "ERROR.err" +def check_error_file(output_dir,file_name,grep_text): if output_dir.endswith("/"): file_path = output_dir+ file_name @@ -258,7 +257,7 @@ def check_error_file(output_dir): file_path = output_dir+"/"+file_name if os.path.isfile(file_path): - grep(file_path, output_dir, "KeyNotFoundException") + grep(file_path, output_dir, grep_text) def download_volumes(volume_ids, output_dir, username=None, password=None, @@ -304,7 +303,10 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, myzip.extractall(output_dir) myzip.close() - check_error_file(output_dir) + if(htrc.config.get_dataapi_access()): + check_error_file(output_dir,"volume-rights.txt", " 3") + + check_error_file(output_dir,"ERROR.err","KeyNotFoundException") except socket.error: raise RuntimeError("Data API request timeout. 
Is your Data Capsule in Secure Mode?") From 10dd91b8e2879aec9540b9eb05934f01864e0b61 Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Thu, 18 Feb 2021 10:58:22 -0500 Subject: [PATCH 21/49] Added volume not found error for pd-only access --- htrc/volumes/__init__.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index 1282159..b4db7c9 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -235,11 +235,11 @@ def get_oauth2_token(username, password): return token -def grep(file_name, output_dir, pattern): +def grep(file_name, output_dir, pattern, txt_index): na_volume = [] for line in open(file_name): if pattern in line: - na_volume.append(line.split()[-1]) + na_volume.append(line.split()[txt_index]) if len(na_volume) < 100: print("\nFollowing volume ids are not available.") print("\n".join(str(item) for item in na_volume)) @@ -249,7 +249,7 @@ def grep(file_name, output_dir, pattern): if len(na_volume) == 100: print("\nThere are 100 or more unavailable volumes.\nTo check the validity of volumes in your workset or volume id file go to:\n https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org for assistance.") -def check_error_file(output_dir,file_name,grep_text): +def check_error_file(output_dir,file_name,grep_text,txt_index): if output_dir.endswith("/"): file_path = output_dir+ file_name @@ -257,7 +257,7 @@ def check_error_file(output_dir,file_name,grep_text): file_path = output_dir+"/"+file_name if os.path.isfile(file_path): - grep(file_path, output_dir, grep_text) + grep(file_path, output_dir, grep_text,txt_index) def download_volumes(volume_ids, output_dir, username=None, password=None, @@ -304,9 +304,9 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, myzip.close() if(htrc.config.get_dataapi_access()): - check_error_file(output_dir,"volume-rights.txt", " 3") + check_error_file(output_dir,"volume-rights.txt", " 3", 0) - check_error_file(output_dir,"ERROR.err","KeyNotFoundException") + check_error_file(output_dir,"ERROR.err","KeyNotFoundException", -1) except socket.error: raise RuntimeError("Data API request timeout. 
Is your Data Capsule in Secure Mode?") From cfbbf205dbcc47dabd08e7e4ec69307d24efb8f5 Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Thu, 18 Feb 2021 13:12:32 -0500 Subject: [PATCH 22/49] Added volume not found error for pd-only access --- htrc/volumes/__init__.py | 60 ++++++++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index b4db7c9..5796dfe 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -235,29 +235,39 @@ def get_oauth2_token(username, password): return token -def grep(file_name, output_dir, pattern, txt_index): - na_volume = [] - for line in open(file_name): - if pattern in line: - na_volume.append(line.split()[txt_index]) - if len(na_volume) < 100: - print("\nFollowing volume ids are not available.") - print("\n".join(str(item) for item in na_volume)) - with open(os.path.join(output_dir, "volume_not_available.txt"), "w") as volume_na: - volume_na.write("\n".join(str(item) for item in na_volume)) - else: - if len(na_volume) == 100: - print("\nThere are 100 or more unavailable volumes.\nTo check the validity of volumes in your workset or volume id file go to:\n https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org for assistance.") - -def check_error_file(output_dir,file_name,grep_text,txt_index): +def grep_error(file_name, output_dir, pattern, txt_index): if output_dir.endswith("/"): file_path = output_dir+ file_name else: file_path = output_dir+"/"+file_name + na_volume = [] if os.path.isfile(file_path): - grep(file_path, output_dir, grep_text,txt_index) + for line in open(file_name): + if pattern in line: + volume_id = line.split()[txt_index] + na_volume.append(volume_id) + return na_volume + +# def check_error_file(output_dir,file_name,grep_text,txt_index): +# +# if output_dir.endswith("/"): +# file_path = output_dir+ file_name +# else: +# file_path = output_dir+"/"+file_name +# +# if os.path.isfile(file_path): +# grep(file_path, output_dir, grep_text,txt_index) +# +# if len(na_volume) < 100: +# print("\nFollowing volume ids are not available.") +# print("\n".join(str(item) for item in na_volume)) +# with open(os.path.join(output_dir, "volume_not_available.txt"), "w") as volume_na: +# volume_na.write("\n".join(str(item) for item in na_volume)) +# else: +# if len(na_volume) >= 100: +# print("\nThere are 100 or more unavailable volumes.\nTo check the validity of volumes in your workset or volume id file go to:\n https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org for assistance.") def download_volumes(volume_ids, output_dir, username=None, password=None, @@ -303,10 +313,24 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, myzip.extractall(output_dir) myzip.close() + + na_volume = [] if(htrc.config.get_dataapi_access()): - check_error_file(output_dir,"volume-rights.txt", " 3", 0) + na_volume = grep_error("volume-rights.txt",output_dir," 3",0) + + na_volume = na_volume + grep_error("ERROR.err",output_dir,"KeyNotFoundException", -1) + + if len(na_volume) > 0: + with open(os.path.join(output_dir, "volume_not_available.txt"), "w") as volume_na:volume_na.write("\n".join(str(item) for item in na_volume)) + + if len(na_volume) > 0 and len(na_volume) < 100: + print("\nFollowing volume ids are not available.\n Please check volume_not_available.txt for the complete list. 
\nTo check the validity of volumes in your workset or volume id file go to:\n https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org for assistance.") + print("\n".join(str(item) for item in na_volume)) + + else: + if len(na_volume) >= 100: + print("\nThere are 100 or more unavailable volumes. \n Please check volume_not_available.txt for the complete list. \nTo check the validity of volumes in your workset or volume id file go to:\n https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org for assistance.") - check_error_file(output_dir,"ERROR.err","KeyNotFoundException", -1) except socket.error: raise RuntimeError("Data API request timeout. Is your Data Capsule in Secure Mode?") From 560ef6b45f96c2820179b5d339b73d7cce3d36d5 Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Thu, 18 Feb 2021 13:38:37 -0500 Subject: [PATCH 23/49] Added volume not found error for pd-only access --- htrc/volumes/__init__.py | 73 +++++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 31 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index 5796dfe..5453368 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -10,6 +10,7 @@ """ from __future__ import print_function from future import standard_library + standard_library.install_aliases() from builtins import input @@ -37,8 +38,10 @@ import logging from logging import NullHandler + logging.getLogger(__name__).addHandler(NullHandler()) + def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, mets=False): """ Returns volumes from the Data API as a raw zip stream. @@ -58,7 +61,7 @@ def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, met for id in volume_ids: if ("." not in id - or " " in id): + or " " in id): print("Invalid volume id " + id + ". Please correct this volume id and try again.") data = {'volumeIDs': '|'.join( @@ -82,7 +85,6 @@ def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, met # Retrieve the volumes httpsConnection = http.client.HTTPSConnection(host, port, context=ctx, key_file=key, cert_file=cert) - httpsConnection.request("POST", url, urlencode(data), headers) response = httpsConnection.getresponse() @@ -92,9 +94,9 @@ def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, met data = BytesIO() bytes_downloaded = 0 bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength, - widgets=[progressbar.AnimatedMarker(), ' ', - progressbar.DataSize(), - ' (', progressbar.FileTransferSpeed(), ')']) + widgets=[progressbar.AnimatedMarker(), ' ', + progressbar.DataSize(), + ' (', progressbar.FileTransferSpeed(), ')']) while body: body = response.read(128) @@ -132,7 +134,7 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa for id in page_ids: if ("." not in id - or " " in id): + or " " in id): print("Invalid volume id " + id + ". 
Please correct this volume id and try again.") data = {'pageIDs': '|'.join( @@ -149,7 +151,6 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa headers = {"Authorization": "Bearer " + token, "Content-type": "application/x-www-form-urlencoded"} - # Create SSL lookup # TODO: Fix SSL cert verification ctx = ssl.create_default_context() @@ -159,7 +160,6 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa # Retrieve the volumes httpsConnection = http.client.HTTPSConnection(host, port, context=ctx, key_file=key, cert_file=cert) - httpsConnection.request("POST", url, urlencode(data), headers) response = httpsConnection.getresponse() @@ -169,7 +169,7 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa data = BytesIO() bytes_downloaded = 0 bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength, - widgets=[progressbar.AnimatedMarker(), ' ', + widgets=[progressbar.AnimatedMarker(), ' ', progressbar.DataSize(), ' (', progressbar.FileTransferSpeed(), ')']) @@ -191,12 +191,13 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa return data + def get_oauth2_token(username, password): # make sure to set the request content-type as application/x-www-form-urlencoded headers = {"Content-type": "application/x-www-form-urlencoded"} - data = { "grant_type": "client_credentials", - "client_secret": password, - "client_id": username } + data = {"grant_type": "client_credentials", + "client_secret": password, + "client_id": username} data = urlencode(data) # create an SSL context @@ -235,12 +236,12 @@ def get_oauth2_token(username, password): return token -def grep_error(file_name, output_dir, pattern, txt_index): +def grep_error(file_name, output_dir, pattern, txt_index): if output_dir.endswith("/"): - file_path = output_dir+ file_name + file_path = output_dir + file_name else: - file_path = output_dir+"/"+file_name + file_path = output_dir + "/" + file_name na_volume = [] if os.path.isfile(file_path): @@ -250,6 +251,7 @@ def grep_error(file_name, output_dir, pattern, txt_index): na_volume.append(volume_id) return na_volume + # def check_error_file(output_dir,file_name,grep_text,txt_index): # # if output_dir.endswith("/"): @@ -271,7 +273,8 @@ def grep_error(file_name, output_dir, pattern, txt_index): def download_volumes(volume_ids, output_dir, username=None, password=None, - config_path=None, token=None, concat=False, mets=False, pages=False, host=None, port=None, cert=None, key=None, epr=None): + config_path=None, token=None, concat=False, mets=False, pages=False, host=None, port=None, + cert=None, key=None, epr=None): # create output_dir folder, if nonexistant if not os.path.isdir(output_dir): os.makedirs(output_dir) @@ -282,7 +285,7 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, htrc.config.remove_jwt_token() if not host: - host= htrc.config.get_dataapi_host() + host = htrc.config.get_dataapi_host() if not port: port = htrc.config.get_dataapi_port() @@ -313,23 +316,32 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, myzip.extractall(output_dir) myzip.close() - na_volume = [] - if(htrc.config.get_dataapi_access()): - na_volume = grep_error("volume-rights.txt",output_dir," 3",0) + if htrc.config.get_dataapi_access(): + print("PD Access Only") + na_volume = grep_error("volume-rights.txt", output_dir, " 3", 0) - na_volume = na_volume + grep_error("ERROR.err",output_dir,"KeyNotFoundException", -1) + na_volume = 
na_volume + grep_error("ERROR.err", output_dir, "KeyNotFoundException", -1) if len(na_volume) > 0: - with open(os.path.join(output_dir, "volume_not_available.txt"), "w") as volume_na:volume_na.write("\n".join(str(item) for item in na_volume)) - - if len(na_volume) > 0 and len(na_volume) < 100: - print("\nFollowing volume ids are not available.\n Please check volume_not_available.txt for the complete list. \nTo check the validity of volumes in your workset or volume id file go to:\n https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org for assistance.") + with open(os.path.join(output_dir, "volume_not_available.txt"), "w") as volume_na: volume_na.write( + "\n".join(str(item) for item in na_volume)) + + if 0 < len(na_volume) < 100: + print( + "\nFollowing volume ids are not available.\n Please check volume_not_available.txt for the " + "complete list. \nTo check the validity of volumes in your workset or volume id file go to:\n " + "https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org " + "for assistance.") print("\n".join(str(item) for item in na_volume)) else: if len(na_volume) >= 100: - print("\nThere are 100 or more unavailable volumes. \n Please check volume_not_available.txt for the complete list. \nTo check the validity of volumes in your workset or volume id file go to:\n https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org for assistance.") + print( + "\nThere are 100 or more unavailable volumes. \n Please check volume_not_available.txt " + "for the complete list. \nTo check the validity of volumes in your workset or volume id " + "file go to:\n https://analytics.hathitrust.org/validateworkset \n or email us at " + "htrc-help@hathitrust.org for assistance.") except socket.error: @@ -345,8 +357,7 @@ def download(args): volumeIDs = [line.strip() for line in IDfile] return download_volumes(volumeIDs, args.output, - username=args.username, password=args.password, - token=args.token, concat=args.concat, mets=args.mets, pages=args.pages, host=args.datahost, - port=args.dataport, cert=args.datacert, key=args.datakey, - epr=args.dataepr) - + username=args.username, password=args.password, + token=args.token, concat=args.concat, mets=args.mets, pages=args.pages, host=args.datahost, + port=args.dataport, cert=args.datacert, key=args.datakey, + epr=args.dataepr) From d278da00275a3d49fb9477082a8bbfc30b1c89a8 Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Thu, 18 Feb 2021 13:51:45 -0500 Subject: [PATCH 24/49] Added volume not found error for pd-only access --- htrc/volumes/__init__.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index 5453368..240c828 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -316,12 +316,16 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, myzip.extractall(output_dir) myzip.close() + na_volume_rights = [] + na_volume_error = [] na_volume = [] if htrc.config.get_dataapi_access(): + na_volume_rights = grep_error("volume-rights.txt", output_dir, " 3", 0) print("PD Access Only") - na_volume = grep_error("volume-rights.txt", output_dir, " 3", 0) + print(na_volume_rights) - na_volume = na_volume + grep_error("ERROR.err", output_dir, "KeyNotFoundException", -1) + na_volume_error = grep_error("ERROR.err", output_dir, "KeyNotFoundException", -1) + na_volume = na_volume_error + na_volume_rights if len(na_volume) > 0: with 
open(os.path.join(output_dir, "volume_not_available.txt"), "w") as volume_na: volume_na.write( From af681b73ec7c8e775a45059db5d5a90d28d56839 Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Thu, 18 Feb 2021 14:51:26 -0500 Subject: [PATCH 25/49] FIxes errors in message building --- htrc/volumes/__init__.py | 114 ++++++++++++++------------------------- 1 file changed, 41 insertions(+), 73 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index 240c828..da97d48 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -10,7 +10,6 @@ """ from __future__ import print_function from future import standard_library - standard_library.install_aliases() from builtins import input @@ -38,10 +37,8 @@ import logging from logging import NullHandler - logging.getLogger(__name__).addHandler(NullHandler()) - def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, mets=False): """ Returns volumes from the Data API as a raw zip stream. @@ -85,6 +82,7 @@ def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, met # Retrieve the volumes httpsConnection = http.client.HTTPSConnection(host, port, context=ctx, key_file=key, cert_file=cert) + httpsConnection.request("POST", url, urlencode(data), headers) response = httpsConnection.getresponse() @@ -151,6 +149,7 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa headers = {"Authorization": "Bearer " + token, "Content-type": "application/x-www-form-urlencoded"} + # Create SSL lookup # TODO: Fix SSL cert verification ctx = ssl.create_default_context() @@ -160,6 +159,7 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa # Retrieve the volumes httpsConnection = http.client.HTTPSConnection(host, port, context=ctx, key_file=key, cert_file=cert) + httpsConnection.request("POST", url, urlencode(data), headers) response = httpsConnection.getresponse() @@ -191,13 +191,12 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa return data - def get_oauth2_token(username, password): # make sure to set the request content-type as application/x-www-form-urlencoded headers = {"Content-type": "application/x-www-form-urlencoded"} - data = {"grant_type": "client_credentials", - "client_secret": password, - "client_id": username} + data = { "grant_type": "client_credentials", + "client_secret": password, + "client_id": username } data = urlencode(data) # create an SSL context @@ -236,45 +235,40 @@ def get_oauth2_token(username, password): return token +def grep(file_name, output_dir, pattern, txt_index): + na_volume = [] + for line in open(file_name): + if pattern in line: + na_volume.append(line.split()[txt_index]) + if 0 < len(na_volume) < 100: + print("\nFollowing volume ids are not available. \n Please check volume_not_available.txt for the " + "complete list. \nTo check the validity of volumes in your workset or volume id file go to:\n " + "https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org " + "for assistance.") + print("\n".join(str(item) for item in na_volume)) + with open(os.path.join(output_dir, "volume_not_available.txt"), "w") as volume_na: + volume_na.write("\n".join(str(item) for item in na_volume)) + volume_na.write("\n") + else: + if len(na_volume) >= 100: + print("\nThere are 100 or more unavailable volumes.\n Please check volume_not_available.txt for the " + "complete list. 
\nTo check the validity of volumes in your workset or volume id file go to:\n " + "https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org " + "for assistance.") + +def check_error_file(output_dir,file_name,grep_text,txt_index): -def grep_error(file_name, output_dir, pattern, txt_index): if output_dir.endswith("/"): - file_path = output_dir + file_name + file_path = output_dir+ file_name else: - file_path = output_dir + "/" + file_name + file_path = output_dir+"/"+file_name - na_volume = [] if os.path.isfile(file_path): - for line in open(file_name): - if pattern in line: - volume_id = line.split()[txt_index] - na_volume.append(volume_id) - return na_volume - - -# def check_error_file(output_dir,file_name,grep_text,txt_index): -# -# if output_dir.endswith("/"): -# file_path = output_dir+ file_name -# else: -# file_path = output_dir+"/"+file_name -# -# if os.path.isfile(file_path): -# grep(file_path, output_dir, grep_text,txt_index) -# -# if len(na_volume) < 100: -# print("\nFollowing volume ids are not available.") -# print("\n".join(str(item) for item in na_volume)) -# with open(os.path.join(output_dir, "volume_not_available.txt"), "w") as volume_na: -# volume_na.write("\n".join(str(item) for item in na_volume)) -# else: -# if len(na_volume) >= 100: -# print("\nThere are 100 or more unavailable volumes.\nTo check the validity of volumes in your workset or volume id file go to:\n https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org for assistance.") + grep(file_path, output_dir, grep_text,txt_index) def download_volumes(volume_ids, output_dir, username=None, password=None, - config_path=None, token=None, concat=False, mets=False, pages=False, host=None, port=None, - cert=None, key=None, epr=None): + config_path=None, token=None, concat=False, mets=False, pages=False, host=None, port=None, cert=None, key=None, epr=None): # create output_dir folder, if nonexistant if not os.path.isdir(output_dir): os.makedirs(output_dir) @@ -285,7 +279,7 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, htrc.config.remove_jwt_token() if not host: - host = htrc.config.get_dataapi_host() + host= htrc.config.get_dataapi_host() if not port: port = htrc.config.get_dataapi_port() @@ -316,37 +310,10 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, myzip.extractall(output_dir) myzip.close() - na_volume_rights = [] - na_volume_error = [] - na_volume = [] - if htrc.config.get_dataapi_access(): - na_volume_rights = grep_error("volume-rights.txt", output_dir, " 3", 0) - print("PD Access Only") - print(na_volume_rights) - - na_volume_error = grep_error("ERROR.err", output_dir, "KeyNotFoundException", -1) - na_volume = na_volume_error + na_volume_rights - - if len(na_volume) > 0: - with open(os.path.join(output_dir, "volume_not_available.txt"), "w") as volume_na: volume_na.write( - "\n".join(str(item) for item in na_volume)) - - if 0 < len(na_volume) < 100: - print( - "\nFollowing volume ids are not available.\n Please check volume_not_available.txt for the " - "complete list. \nTo check the validity of volumes in your workset or volume id file go to:\n " - "https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org " - "for assistance.") - print("\n".join(str(item) for item in na_volume)) - - else: - if len(na_volume) >= 100: - print( - "\nThere are 100 or more unavailable volumes. \n Please check volume_not_available.txt " - "for the complete list. 
\nTo check the validity of volumes in your workset or volume id " - "file go to:\n https://analytics.hathitrust.org/validateworkset \n or email us at " - "htrc-help@hathitrust.org for assistance.") + if(htrc.config.get_dataapi_access()): + check_error_file(output_dir,"volume-rights.txt", " 3", 0) + check_error_file(output_dir,"ERROR.err","KeyNotFoundException", -1) except socket.error: raise RuntimeError("Data API request timeout. Is your Data Capsule in Secure Mode?") @@ -361,7 +328,8 @@ def download(args): volumeIDs = [line.strip() for line in IDfile] return download_volumes(volumeIDs, args.output, - username=args.username, password=args.password, - token=args.token, concat=args.concat, mets=args.mets, pages=args.pages, host=args.datahost, - port=args.dataport, cert=args.datacert, key=args.datakey, - epr=args.dataepr) + username=args.username, password=args.password, + token=args.token, concat=args.concat, mets=args.mets, pages=args.pages, host=args.datahost, + port=args.dataport, cert=args.datacert, key=args.datakey, + epr=args.dataepr) + From 6264cb1fa21e8c2f94237e68285d52a647aa9fbc Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Thu, 18 Feb 2021 15:15:27 -0500 Subject: [PATCH 26/49] FIxes errors in message building- WIP --- htrc/volumes/__init__.py | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index da97d48..c8e0550 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -240,31 +240,22 @@ def grep(file_name, output_dir, pattern, txt_index): for line in open(file_name): if pattern in line: na_volume.append(line.split()[txt_index]) - if 0 < len(na_volume) < 100: - print("\nFollowing volume ids are not available. \n Please check volume_not_available.txt for the " - "complete list. \nTo check the validity of volumes in your workset or volume id file go to:\n " - "https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org " - "for assistance.") - print("\n".join(str(item) for item in na_volume)) - with open(os.path.join(output_dir, "volume_not_available.txt"), "w") as volume_na: - volume_na.write("\n".join(str(item) for item in na_volume)) - volume_na.write("\n") - else: - if len(na_volume) >= 100: - print("\nThere are 100 or more unavailable volumes.\n Please check volume_not_available.txt for the " - "complete list. 
\nTo check the validity of volumes in your workset or volume id file go to:\n " - "https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org " - "for assistance.") -def check_error_file(output_dir,file_name,grep_text,txt_index): + with open(os.path.join(output_dir, "volume_not_available.txt"), "a") as volume_na: + volume_na.write("\n".join(str(item) for item in na_volume)) + volume_na.write("\n") + + return na_volume + +def check_error_file(output_dir,file_name,grep_text,txt_index): if output_dir.endswith("/"): file_path = output_dir+ file_name else: file_path = output_dir+"/"+file_name if os.path.isfile(file_path): - grep(file_path, output_dir, grep_text,txt_index) + return (grep(file_path, output_dir, grep_text,txt_index)) def download_volumes(volume_ids, output_dir, username=None, password=None, @@ -310,10 +301,13 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, myzip.extractall(output_dir) myzip.close() + if(htrc.config.get_dataapi_access()): - check_error_file(output_dir,"volume-rights.txt", " 3", 0) + na_volumes_rights = check_error_file(output_dir,"volume-rights.txt", " 3", 0) + print(na_volumes_rights) - check_error_file(output_dir,"ERROR.err","KeyNotFoundException", -1) + na_volumes_error = check_error_file(output_dir,"ERROR.err","KeyNotFoundException", -1) + print(na_volumes_error) except socket.error: raise RuntimeError("Data API request timeout. Is your Data Capsule in Secure Mode?") From b4c1d74bf70fde321ad90fec3e659f71f877fe69 Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Thu, 18 Feb 2021 15:20:43 -0500 Subject: [PATCH 27/49] FIxes errors in message building- WIP --- htrc/volumes/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index c8e0550..7beeabb 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -301,13 +301,16 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, myzip.extractall(output_dir) myzip.close() - + na_volumes_all = [] if(htrc.config.get_dataapi_access()): na_volumes_rights = check_error_file(output_dir,"volume-rights.txt", " 3", 0) print(na_volumes_rights) + na_volumes_all = na_volumes_rights na_volumes_error = check_error_file(output_dir,"ERROR.err","KeyNotFoundException", -1) print(na_volumes_error) + na_volumes_all = na_volumes_all + na_volumes_error + print(na_volumes_all) except socket.error: raise RuntimeError("Data API request timeout. 
Is your Data Capsule in Secure Mode?")

From a19d47256aa515447c548229e33aff704f1f88c5 Mon Sep 17 00:00:00 2001
From: Samitha Liyanage
Date: Thu, 18 Feb 2021 15:28:01 -0500
Subject: [PATCH 28/49] Fixes errors in message building - WIP

---
 htrc/volumes/__init__.py | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py
index 7beeabb..666d472 100644
--- a/htrc/volumes/__init__.py
+++ b/htrc/volumes/__init__.py
@@ -304,13 +304,29 @@ def download_volumes(volume_ids, output_dir, username=None, password=None,
             na_volumes_all = []
             if(htrc.config.get_dataapi_access()):
                 na_volumes_rights = check_error_file(output_dir,"volume-rights.txt", " 3", 0)
-                print(na_volumes_rights)
                 na_volumes_all = na_volumes_rights

                 na_volumes_error = check_error_file(output_dir,"ERROR.err","KeyNotFoundException", -1)
-                print(na_volumes_error)
                 na_volumes_all = na_volumes_all + na_volumes_error
-                print(na_volumes_all)
+
+                if len(na_volumes_all) > 0:
+                    with open(os.path.join(output_dir, "volume_not_available.txt"), "a") as volume_na:
+                        volume_na.write("\n".join(str(item) for item in na_volumes_all))
+
+                    if 0 < len(na_volumes_all) < 100:
+                        print("\nFollowing volume ids are not available. \n Please check volume_not_available.txt for the "
+                              "complete list. ")
+                        print("\n".join(str(item) for item in na_volumes_all))
+                    else:
+                        if len(na_volumes_all) >= 100:
+                            print("\nThere are 100 or more unavailable volumes.\n Please check volume_not_available.txt "
+                                  "for the "
+                                  "complete list. \nTo check the validity of volumes in your workset or volume id file go "
+                                  "to:\n "
+                                  "https://analytics.hathitrust.org/validateworkset \n or email us at "
+                                  "htrc-help@hathitrust.org "
+                                  "for assistance.")
+

         except socket.error:
             raise RuntimeError("Data API request timeout. 
Is your Data Capsule in Secure Mode?") From 6f653b8fda1fd0cfd6113de92f66f80de3173e83 Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Thu, 18 Feb 2021 15:31:37 -0500 Subject: [PATCH 29/49] FIxes errors in message building- WIP --- htrc/volumes/__init__.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index 666d472..b69aec5 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -241,10 +241,6 @@ def grep(file_name, output_dir, pattern, txt_index): if pattern in line: na_volume.append(line.split()[txt_index]) - with open(os.path.join(output_dir, "volume_not_available.txt"), "a") as volume_na: - volume_na.write("\n".join(str(item) for item in na_volume)) - volume_na.write("\n") - return na_volume From 768b018cf78fa7df81819edc03ee196afb04bca3 Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Fri, 19 Feb 2021 10:53:46 -0500 Subject: [PATCH 30/49] Added inode error message --- htrc/volumes/__init__.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index b69aec5..ae3dcb7 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -298,15 +298,17 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, myzip.close() na_volumes_all = [] - if(htrc.config.get_dataapi_access()): + if htrc.config.get_dataapi_access() == "true": na_volumes_rights = check_error_file(output_dir,"volume-rights.txt", " 3", 0) na_volumes_all = na_volumes_rights - na_volumes_error = check_error_file(output_dir,"ERROR.err","KeyNotFoundException", -1) - na_volumes_all = na_volumes_all + na_volumes_error + na_volumes_error = check_error_file(output_dir,"volume-rights.txt", " unavailable", 0) + + if len(na_volumes_error) > 0: + na_volumes_all = na_volumes_all + na_volumes_error if len(na_volumes_all) > 0: - with open(os.path.join(output_dir, "volume_not_available.txt"), "a") as volume_na: + with open(os.path.join(output_dir, "volume_not_available.txt"), "w") as volume_na: volume_na.write("\n".join(str(item) for item in na_volumes_all)) if 0 < len(na_volumes_all) < 100: @@ -325,7 +327,8 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, except socket.error: - raise RuntimeError("Data API request timeout. Is your Data Capsule in Secure Mode?") + raise RuntimeError("HTRC Data API time out. Check your inode usage if downloading a large workset. " + "Contact HTRC for further help.") else: raise RuntimeError("Failed to obtain jwt token.") From cf01765c3406dc102fa56679b92aefe31856c177 Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Fri, 19 Feb 2021 11:11:11 -0500 Subject: [PATCH 31/49] FIxes errors in message building- WIP --- htrc/volumes/__init__.py | 62 +++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 27 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index ae3dcb7..5278773 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -10,6 +10,7 @@ """ from __future__ import print_function from future import standard_library + standard_library.install_aliases() from builtins import input @@ -37,8 +38,10 @@ import logging from logging import NullHandler + logging.getLogger(__name__).addHandler(NullHandler()) + def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, mets=False): """ Returns volumes from the Data API as a raw zip stream. 
@@ -82,7 +85,6 @@ def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, met # Retrieve the volumes httpsConnection = http.client.HTTPSConnection(host, port, context=ctx, key_file=key, cert_file=cert) - httpsConnection.request("POST", url, urlencode(data), headers) response = httpsConnection.getresponse() @@ -149,7 +151,6 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa headers = {"Authorization": "Bearer " + token, "Content-type": "application/x-www-form-urlencoded"} - # Create SSL lookup # TODO: Fix SSL cert verification ctx = ssl.create_default_context() @@ -159,7 +160,6 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa # Retrieve the volumes httpsConnection = http.client.HTTPSConnection(host, port, context=ctx, key_file=key, cert_file=cert) - httpsConnection.request("POST", url, urlencode(data), headers) response = httpsConnection.getresponse() @@ -191,12 +191,13 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa return data + def get_oauth2_token(username, password): # make sure to set the request content-type as application/x-www-form-urlencoded headers = {"Content-type": "application/x-www-form-urlencoded"} - data = { "grant_type": "client_credentials", - "client_secret": password, - "client_id": username } + data = {"grant_type": "client_credentials", + "client_secret": password, + "client_id": username} data = urlencode(data) # create an SSL context @@ -235,27 +236,35 @@ def get_oauth2_token(username, password): return token -def grep(file_name, output_dir, pattern, txt_index): - na_volume = [] - for line in open(file_name): - if pattern in line: - na_volume.append(line.split()[txt_index]) - return na_volume - - -def check_error_file(output_dir,file_name,grep_text,txt_index): +def grep_error(file_name, output_dir, pattern, txt_index): + na_volume = [] if output_dir.endswith("/"): - file_path = output_dir+ file_name + file_path = output_dir + file_name else: - file_path = output_dir+"/"+file_name + file_path = output_dir + "/" + file_name if os.path.isfile(file_path): - return (grep(file_path, output_dir, grep_text,txt_index)) + for line in open(file_name): + if pattern in line: + na_volume.append(line.split()[txt_index]) + + return na_volume + + +# def check_error_file(output_dir, file_name, grep_text, txt_index): +# if output_dir.endswith("/"): +# file_path = output_dir + file_name +# else: +# file_path = output_dir + "/" + file_name +# +# if os.path.isfile(file_path): +# return grep(file_path, output_dir, grep_text, txt_index) def download_volumes(volume_ids, output_dir, username=None, password=None, - config_path=None, token=None, concat=False, mets=False, pages=False, host=None, port=None, cert=None, key=None, epr=None): + config_path=None, token=None, concat=False, mets=False, pages=False, host=None, port=None, + cert=None, key=None, epr=None): # create output_dir folder, if nonexistant if not os.path.isdir(output_dir): os.makedirs(output_dir) @@ -266,7 +275,7 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, htrc.config.remove_jwt_token() if not host: - host= htrc.config.get_dataapi_host() + host = htrc.config.get_dataapi_host() if not port: port = htrc.config.get_dataapi_port() @@ -299,10 +308,10 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, na_volumes_all = [] if htrc.config.get_dataapi_access() == "true": - na_volumes_rights = check_error_file(output_dir,"volume-rights.txt", " 3", 0) + 
na_volumes_rights = grep_error("volume-rights.txt", output_dir, " 3", 0) na_volumes_all = na_volumes_rights - na_volumes_error = check_error_file(output_dir,"volume-rights.txt", " unavailable", 0) + na_volumes_error = grep_error("volume-rights.txt", output_dir, " unavailable", 0) if len(na_volumes_error) > 0: na_volumes_all = na_volumes_all + na_volumes_error @@ -340,8 +349,7 @@ def download(args): volumeIDs = [line.strip() for line in IDfile] return download_volumes(volumeIDs, args.output, - username=args.username, password=args.password, - token=args.token, concat=args.concat, mets=args.mets, pages=args.pages, host=args.datahost, - port=args.dataport, cert=args.datacert, key=args.datakey, - epr=args.dataepr) - + username=args.username, password=args.password, + token=args.token, concat=args.concat, mets=args.mets, pages=args.pages, host=args.datahost, + port=args.dataport, cert=args.datacert, key=args.datakey, + epr=args.dataepr) From 5c7780094b751e6a808b128e5139a2f1fa8bd59a Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Fri, 19 Feb 2021 12:04:31 -0500 Subject: [PATCH 32/49] Removed check_error_file method --- htrc/volumes/__init__.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index 5278773..5453fec 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -245,23 +245,12 @@ def grep_error(file_name, output_dir, pattern, txt_index): file_path = output_dir + "/" + file_name if os.path.isfile(file_path): - for line in open(file_name): + for line in open(file_path): if pattern in line: na_volume.append(line.split()[txt_index]) return na_volume - -# def check_error_file(output_dir, file_name, grep_text, txt_index): -# if output_dir.endswith("/"): -# file_path = output_dir + file_name -# else: -# file_path = output_dir + "/" + file_name -# -# if os.path.isfile(file_path): -# return grep(file_path, output_dir, grep_text, txt_index) - - def download_volumes(volume_ids, output_dir, username=None, password=None, config_path=None, token=None, concat=False, mets=False, pages=False, host=None, port=None, cert=None, key=None, epr=None): From 391396d2d7e4d979cab13e4d3d097fbd3d885a51 Mon Sep 17 00:00:00 2001 From: Boris Capitanu Date: Wed, 24 Feb 2021 15:53:20 -0600 Subject: [PATCH 33/49] Fixes #46 --- htrc/__main__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/htrc/__main__.py b/htrc/__main__.py index 04b26b4..bb53626 100644 --- a/htrc/__main__.py +++ b/htrc/__main__.py @@ -94,6 +94,9 @@ def main(): parser_run.set_defaults(func='run') args = parser.parse_args() + if 'func' not in args: + parser.print_help() + sys.exit(1) if args.func in ['metadata', 'export']: volumes = [] @@ -113,6 +116,9 @@ def main(): metadata = get_metadata(volumes) print(json.dumps(metadata)) elif args.func == 'run': + if 'run' not in args: + parser_run.print_help() + sys.exit(1) if args.run == 'mallet': htrc.tools.mallet.main(args.path, args.k, args.iter) if args.run == 'topicexplorer': From 64157ec9d9c2c29a0b3c3e5d13d9de7f93f55c8a Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Tue, 9 Mar 2021 11:18:58 -0500 Subject: [PATCH 34/49] Version bump --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 91f8bf7..c77fcee 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ import atexit import tarfile -__version__ = '0.1.55b0' +__version__ = '0.1.55' install_requires = ['PyLD', 'future', 'prov', 'unicodecsv', 'progressbar2', 'requests', 'argparse==1.1', 
'topicexplorer==1.0b226', 'numpy==1.16.2'] From 3289e4ff5b32b6909966294afa604ad1ce0c1c9e Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Tue, 9 Mar 2021 11:21:58 -0500 Subject: [PATCH 35/49] Added files to gitignore. --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index d77aad6..efb0815 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,5 @@ htrc.egg-info .coverage htmlcov/ .eggs +ssl-cert-trust +venv/ From 1c4c7adaa8af21629a3ae7d20f855e41e875223e Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Tue, 9 Mar 2021 11:25:24 -0500 Subject: [PATCH 36/49] Changed the version in setup.py to 0.1.56b0 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c77fcee..1bd8806 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ import atexit import tarfile -__version__ = '0.1.55' +__version__ = '0.1.56b0' install_requires = ['PyLD', 'future', 'prov', 'unicodecsv', 'progressbar2', 'requests', 'argparse==1.1', 'topicexplorer==1.0b226', 'numpy==1.16.2'] From 8120c7891a793817d907db63475ecdc11ac638fa Mon Sep 17 00:00:00 2001 From: Boris Capitanu Date: Wed, 24 Mar 2021 22:38:56 -0500 Subject: [PATCH 37/49] Formatting changes --- htrc/auth.py | 15 +++++++-------- htrc/lib/cli.py | 2 ++ htrc/tools/mallet.py | 1 + htrc/tools/topicexplorer.py | 1 + htrc/util/__init__.py | 7 ++++--- htrc/util/resolve.py | 20 +++++++++++--------- 6 files changed, 26 insertions(+), 20 deletions(-) diff --git a/htrc/auth.py b/htrc/auth.py index a24255d..c366717 100644 --- a/htrc/auth.py +++ b/htrc/auth.py @@ -1,14 +1,12 @@ -from base64 import b64encode -from getpass import getpass -import http.client -import ssl import time +from getpass import getpass import requests import requests.auth import htrc.config + def get_jwt_token(): # Currently we just store one common jwt token locally at .htrc file for simplicity # Expect to add POST method to query unique jwt token with the combo of username and password @@ -17,10 +15,10 @@ def get_jwt_token(): client_id, client_secret = htrc.config.get_credentials() auth = requests.auth.HTTPBasicAuth(client_id, client_secret) - data = { "grant_type": "password", - "username": username, - "password": password, - "scope" : "openid"} + data = {"grant_type": "password", + "username": username, + "password": password, + "scope": "openid"} url = htrc.config.get_idp_url() r = requests.post(url, data=data, auth=auth) @@ -35,6 +33,7 @@ def get_jwt_token(): else: raise RuntimeError("JWT token retrieval failed: {}".format(data['error'])) + def credential_prompt(): """ A prompt for entering HathiTrust Research Center credentials. 
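# --- Illustrative usage sketch (editor's addition, not part of this patch) ---
# The refactored htrc.auth.get_jwt_token() shown above gathers HathiTrust
# credentials plus the client id/secret from the local config, posts a password
# grant to the identity provider via requests, and returns (token, expiration);
# htrc.config.save_jwt_token() can cache that token in the ~/.htrc config file.
# The direct call sequence below is an assumption for demonstration only; the
# package normally performs these steps internally when a token is needed.
import htrc.auth
import htrc.config

token, expiration = htrc.auth.get_jwt_token()   # interactive credential prompt
htrc.config.save_jwt_token(token, expiration)   # persist for reuse on later runs
# ------------------------------------------------------------------------------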
diff --git a/htrc/lib/cli.py b/htrc/lib/cli.py index 33c378e..11a6e10 100644 --- a/htrc/lib/cli.py +++ b/htrc/lib/cli.py @@ -1,4 +1,6 @@ from builtins import input + + def bool_prompt(prompt_str, default=None): if default is True: default = 'y' diff --git a/htrc/tools/mallet.py b/htrc/tools/mallet.py index a005e93..e82758a 100644 --- a/htrc/tools/mallet.py +++ b/htrc/tools/mallet.py @@ -19,6 +19,7 @@ def install_mallet(): mallet_dir.extractall(path=MALLET_DIR) mallet_dir.close() + def main(path, topics, iterations, output_dir='/media/secure_volume/workset/'): if not os.path.exists(MALLET_DIR): if not os.path.exists('/media/secure_volume/'): diff --git a/htrc/tools/topicexplorer.py b/htrc/tools/topicexplorer.py index 293baca..5149cc3 100644 --- a/htrc/tools/topicexplorer.py +++ b/htrc/tools/topicexplorer.py @@ -6,6 +6,7 @@ from htrc.volumes import download_volumes from htrc.workset import path_to_volumes + def main(path, topics, iterations, output_dir='/media/secure_volume/workset'): if os.path.exists("/media/secure_volume"): # If in secure mode, downlaod the volumes from data api diff --git a/htrc/util/__init__.py b/htrc/util/__init__.py index edbddd1..2b1dd3e 100644 --- a/htrc/util/__init__.py +++ b/htrc/util/__init__.py @@ -4,6 +4,7 @@ from .resolve import ORG_CODES + def split_items(seq, split_size): """ Returns a generator that returns portions of `seq` up to `split_size`. @@ -13,7 +14,7 @@ def split_items(seq, split_size): :param split_size: The maximum size of each split. """ full_segments = int(math.floor(len(seq) / split_size)) - for i in range(1,full_segments+1): - yield seq[(i-1)*split_size:i*split_size] + for i in range(1, full_segments + 1): + yield seq[(i - 1) * split_size:i * split_size] if (full_segments * split_size) < len(seq): - yield seq[full_segments*split_size:] + yield seq[full_segments * split_size:] diff --git a/htrc/util/resolve.py b/htrc/util/resolve.py index e3b2b4f..1d1a7e2 100644 --- a/htrc/util/resolve.py +++ b/htrc/util/resolve.py @@ -94,29 +94,31 @@ def parse_volume_id(string): Organization codes for the volumes can be found in ORG_CODES. ''' - # First extract the volume ID from a URL, fallbck to assume string. + # First extract the volume ID from a URL, fallback to assume string. parsed_url = urlparse(string) if parsed_url.netloc == 'hdl.handle.net': # Parse the Handle ID, ex: # https://hdl.handle.net/2027/uc2.ark:/13960/fk92805m1s' # Note that if the Handle URL contains page info, this is discarded. - id = parsed_url.path.replace('/2027/', '') + htid = parsed_url.path.replace('/2027/', '') elif parsed_url.netloc == 'babel.hathitrust.org': # Parse the HT Digital Library URL, ex: # https://babel.hathitrust.org/cgi/pt?id=uc2.ark:/13960/fk92805m1s;view=1up;seq=7 if parsed_url.query: - id = parse_qs(parsed_url.query).get('id', None) - if id is not None: - id = id[0] + htid = parse_qs(parsed_url.query).get('id', None) + if htid is not None: + htid = htid[0] + if ';' in htid: + htid = htid.split(';')[0] else: - id = string + htid = string # Validate ID against ORG_CODES. - # Won't guarantee volume existance, but is a sanity check. - if id and any(id.startswith(org) for org in ORG_CODES): - return id + # Won't guarantee volume existence, but it is a sanity check. 
+ if htid and any(htid.startswith(org) for org in ORG_CODES): + return htid else: raise ValueError("Invalid Organization Code in HathiTrust ID") From c4b2c01cbf971f99d587584ef0aa2ceb5e312034 Mon Sep 17 00:00:00 2001 From: Boris Capitanu Date: Wed, 24 Mar 2021 22:41:46 -0500 Subject: [PATCH 38/49] Added missing package --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 70287f0..56cfa52 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ __version__ = '0.1.54' -install_requires = ['PyLD', 'future', 'prov', 'unicodecsv', 'progressbar2', +install_requires = ['PyLD', 'future', 'prov', 'unicodecsv', 'progressbar2', 'pandas', 'requests', 'argparse==1.1', 'topicexplorer==1.0b226', 'numpy==1.16.2', 'tqdm==4.46.0'] # TODO: migrate to docs confix:, 'sphinx-argparse', 'sphinxcontrib-fulltoc'] if sys.version_info.major == 2: From fad8fafbe5be46fe94dd9f7fbd686a6196535cd7 Mon Sep 17 00:00:00 2001 From: Boris Capitanu Date: Wed, 24 Mar 2021 22:42:06 -0500 Subject: [PATCH 39/49] Added additional test --- tests/test_htrc_util_resolve.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_htrc_util_resolve.py b/tests/test_htrc_util_resolve.py index 432734d..6bbbfd0 100644 --- a/tests/test_htrc_util_resolve.py +++ b/tests/test_htrc_util_resolve.py @@ -42,6 +42,9 @@ def test_parse_volume_id(self): id = resolve.parse_volume_id('https://babel.hathitrust.org/cgi/pt?id=uc2.ark:/13960/fk92805m1s;view=1up;seq=7') self.assertEqual(id, 'uc2.ark:/13960/fk92805m1s') + id = resolve.parse_volume_id('https://babel.hathitrust.org/cgi/pt?id=uc2.ark:/13960/fk92805m1s&view=1up&seq=7') + self.assertEqual(id, 'uc2.ark:/13960/fk92805m1s') + id = resolve.parse_volume_id('uc2.ark:/13960/fk92805m1s') self.assertEqual(id, 'uc2.ark:/13960/fk92805m1s') From 2a7aa8380d00f098abe8381ea019c70852dc7847 Mon Sep 17 00:00:00 2001 From: Boris Capitanu Date: Wed, 24 Mar 2021 22:42:33 -0500 Subject: [PATCH 40/49] Reduced the amount of I/O necessary for removing headers/footers from volumes; added parallelism to the header/footer removal process --- htrc/__main__.py | 47 +++-- htrc/config.py | 47 ++++- htrc/hf_vol_load/__init__.py | 117 ----------- htrc/volumes/__init__.py | 388 +++++++++++++++++------------------ tests/test_htrc_volumes.py | 56 ++++- 5 files changed, 301 insertions(+), 354 deletions(-) delete mode 100644 htrc/hf_vol_load/__init__.py diff --git a/htrc/__main__.py b/htrc/__main__.py index 3f35557..add1f0f 100644 --- a/htrc/__main__.py +++ b/htrc/__main__.py @@ -7,12 +7,12 @@ standard_library.install_aliases() import json -import os, os.path +import os +import os.path import shutil import sys from tempfile import NamedTemporaryFile - from htrc.metadata import get_metadata, get_volume_metadata import htrc.volumes import htrc.workset @@ -35,10 +35,23 @@ def download_parser(parser=None): help="remove folder if exists") parser.add_argument("-o", "--output", help="output directory", default='/media/secure_volume/workset/') - parser.add_argument("-hf", "--headfoot", action = 'store_true', + parser.add_argument("-hf", "--remove-headers-footers", action='store_true', help="remove headers and footers from individual pages") - parser.add_argument("-hfc", "--headfootcon", action = 'store_true', + parser.add_argument("-hfc", "--remove-headers-footers-and-concat", action='store_true', help="remove headers and footers from individual pages then concatenate pages") + parser.add_argument("-w", "--window-size", required=False, type=int, metavar="N", default=6, + help="How many 
pages ahead does the header/footer extractor algorithm look to find potential " + "matching headers/footers (higher value gives potentially more accurate results on lower " + "quality OCR volumes at the expense of runtime)") + parser.add_argument("-msr", "--min-similarity-ratio", required=False, type=float, metavar="N", default=0.7, + help="The minimum string similarity ratio required for the Levenshtein distance fuzzy-matching " + "algorithm to declare that two headers are considered 'the same' (the higher the value, up " + "to a max of 1.0, the more strict the matching has to be; lower values allow for more " + "fuzziness to account for OCR errors)") + parser.add_argument("--parallelism", required=False, type=int, metavar="N", default=os.cpu_count(), + help="The max number of concurrent tasks to start when downloading or removing headers/footers") + parser.add_argument("--batch-size", required=False, type=int, metavar="N", default=250, + help="The max number of volumes to download at a time from DataAPI") parser.add_argument("-c", "--concat", action='store_true', help="concatenate a volume's pages in to a single file") parser.add_argument("-m", "--mets", action='store_true', @@ -53,13 +66,13 @@ def download_parser(parser=None): parser.add_argument("-dk", "--datakey", help="Client key file for mutual TLS with Data API.") return parser + def add_workset_path(parser=None): if parser is None: parser = ArgumentParser() parser.add_argument("path", nargs='+', help="workset path[s]") return parser - def main(): parser = ArgumentParser() @@ -133,24 +146,24 @@ def main(): print("Please choose another output folder and try again.") sys.exit(1) - if args.concat and args.headfoot: - print("Cannot set both concat and headfoot") + if args.concat and args.remove_headers_footers: + print("Cannot set both concat and remove-headers-footers") sys.exit(1) - if args.concat and args.headfootcon: - print("Cannot set both concat and headfootcon") + if args.concat and args.remove_headers_footers_and_concat: + print("Cannot set both concat and remove-headers-footers-and-concat") sys.exit(1) - if args.headfoot and args.headfootcon: - print("Cannot set both headfoot and headfootcon") + if args.remove_headers_footers and args.remove_headers_footers_and_concat: + print("Cannot set both remove_headers_footers and remove_headers_footers_and_concat") sys.exit(1) - if args.mets and args.headfootcon: - print("Cannot set both mets and headfootcon") + if args.mets and args.remove_headers_footers_and_concat: + print("Cannot set both mets and remove_headers_footers_and_concat") sys.exit(1) if args.pages: if args.mets and args.concat: - print ("Cannot set both concat and mets with pages") + print("Cannot set both concat and mets with pages") sys.exit(1) - if args.mets and args.headfootcon: - print("Cannot set both mets and headfootcon with pages") + if args.mets and args.remove_headers_footers_and_concat: + print("Cannot set both mets and remove_headers_footers_and_concat with pages") sys.exit(1) try: @@ -159,6 +172,7 @@ def main(): print("Invalid identifier:", args.file) sys.exit(1) + def resolve_and_download(args): if args.file == sys.stdin: # For use with UNIX pipes @@ -223,6 +237,7 @@ def download(args): else: raise e + def download_with_tempfile(args, volumes): f = NamedTemporaryFile() for volume in volumes: diff --git a/htrc/config.py b/htrc/config.py index ccd7d54..c09f916 100644 --- a/htrc/config.py +++ b/htrc/config.py @@ -6,18 +6,14 @@ """ from future import standard_library standard_library.install_aliases() -from 
builtins import input - +from typing import Optional from configparser import RawConfigParser as ConfigParser, NoSectionError from codecs import open -from getpass import getpass import logging import os.path import shutil import time -from htrc.lib.cli import bool_prompt - DEFAULT_PATH = os.path.expanduser('~') DEFAULT_PATH = os.path.join(DEFAULT_PATH, '.htrc') if not os.path.exists(DEFAULT_PATH): @@ -26,6 +22,25 @@ logging.info("Copying default config file to home directory.") shutil.copyfile(DEFAULT_FILE, DEFAULT_PATH) + +class HtrcDataApiConfig: + def __init__(self, + token: Optional[str] = None, + host: Optional[str] = None, + port: Optional[int] = None, + epr: Optional[str] = None, + cert: Optional[str] = None, + key: Optional[str] = None) -> None: + super().__init__() + + self.token = token or get_jwt_token(save_new_token=False) + self.host = host or get_dataapi_host() + self.port = port or get_dataapi_port() + self.epr = epr or get_dataapi_epr() + self.cert = cert or get_dataapi_cert() + self.key = key or get_dataapi_key() + + def _get_value(section, key, path=None): if path is None: path = DEFAULT_PATH @@ -38,33 +53,41 @@ def _get_value(section, key, path=None): except NoSectionError: raise EnvironmentError("Config not set for {} {} in {}".format( section, key, path)) - + + def get_dataapi_port(path=None): port = int(_get_value('data', 'port', path)) return (port) + def get_dataapi_host(path=None): host = _get_value('data', 'host', path) return (host) + def get_dataapi_epr(path=None): return _get_value('data', 'url', path) + def get_dataapi_cert(path=None): return _get_value('data', 'cert', path) + def get_dataapi_key(path=None): return _get_value('data', 'key', path) + def get_idp_host_port(path=None): host = _get_value('idp', 'host', path) port = _get_value('idp', 'port', path) return (host, port) + def get_idp_path(path=None): return _get_value('idp', 'url') + def get_idp_url(path=None): host, port = get_idp_host_port(path) path = get_idp_path(path) @@ -76,23 +99,26 @@ def get_idp_url(path=None): # Add jwt credential access methods -def get_jwt_token(path=None): +def get_jwt_token(path=None, save_new_token=True): try: token = _get_value('jwt', 'token', path) # check expiration date expiration = int(_get_value('jwt', 'expiration', path)) if time.time() > expiration: + import htrc + htrc.config.remove_jwt_token() raise RuntimeError("JWT token expired.") except: # This should run on either a missing or expired token. import htrc.auth token, expiration = htrc.auth.get_jwt_token() - htrc.config.save_jwt_token(token, expiration, path) - + if save_new_token: + htrc.config.save_jwt_token(token, expiration, path) return token + def save_jwt_token(token, expiration=None, path=None): """ Saves JWT token in the config file. @@ -121,6 +147,7 @@ def save_jwt_token(token, expiration=None, path=None): return token + def remove_jwt_token(path=None): """ Removes JWT token from the config file. 
@@ -158,9 +185,11 @@ def get_credentials(path=None): return (client_id, client_secret) + def populate_parser(parser): return parser + if __name__ == '__main__': from argparse import ArgumentParser diff --git a/htrc/hf_vol_load/__init__.py b/htrc/hf_vol_load/__init__.py deleted file mode 100644 index 72f08d5..0000000 --- a/htrc/hf_vol_load/__init__.py +++ /dev/null @@ -1,117 +0,0 @@ -import unittest -from typing import List - -from htrc.models import HtrcPage -from htrc.runningheaders import parse_page_structure, clean_text, levenshtein - - -class TestRunningHeaders(unittest.TestCase): - def test_finding_running_headers(self): - pages = load_vol("data/vol1", num_pages=10) - structured_pages = parse_page_structure(pages) - headers = ["|".join(page.header_lines) for page in structured_pages] - expected = [ - "", - "", - "CHAPTER 1|INTRODUCTION TO RUNNING HEADERS|Lorem Ipsum style", - "1 INTRODUCTION TO RUNNING HEADERS|Lorem Ipsum style", - "INTRODUCTION TO RUNNING HEADERS 1|Lorem Ipsum style", - "1 INTRODUCTION TO RUNNING HEADERS|Lorem Ipsum style", - "CHAPTER 2|EVERYTHING IS RELATIVE", - "2 EVERYTHING IS RELATIVE", - "EVERYTHING IS RELATIVE 2", - "2 EVERYTHING IS RELATIVE" - ] - self.assertListEqual(expected, headers) - - def test_finding_running_footers(self): - pages = load_vol("data/vol1", num_pages=10) - structured_pages = parse_page_structure(pages) - footers = ["|".join(page.footer_lines) for page in structured_pages] - expected = [ - "", - "", - "Page 2", - "Page 3", - "Page 4", - "Page 5", - "Page 6", - "Page 7", - "Page 8", - "Page 9" - ] - self.assertListEqual(expected, footers) - - def test_identify_correct_page_body(self): - pages = load_vol("data/vol1", num_pages=10) - structured_pages = parse_page_structure(pages) - len_body_per_page = [len(page.body_lines) for page in structured_pages] - expected = [0, 7, 43, 28, 26, 30, 31, 27, 28, 15] - self.assertListEqual(expected, len_body_per_page) - - def test_find_footer_with_page_numbers(self): - pages = load_vol("data/vol2", num_pages=10) - structured_pages = parse_page_structure(pages) - footers = ["|".join(page.footer_lines) for page in structured_pages] - expected = [ - "", - "", - "2", - " 3", - "4", - " 5", - "6", - " 7", - "8", - " 9" - ] - self.assertListEqual(expected, footers) - - -class TestUtils(unittest.TestCase): - def test_clean_text(self): - s1 = u"\t На берегу \tпустынных волн \t\n" - s1_expected = u"на берегу пустынных волн" - s2 = u" Pot să mănânc sticlă și ea nu mă rănește. 
" - s2_expected = u"pot să mănânc sticlă și ea nu mă rănește" - s1_clean = clean_text(s1) - s2_clean = clean_text(s2) - - self.assertEqual(s1_expected, s1_clean) - self.assertEqual(s2_expected, s2_clean) - - def test_levenshtein(self): - s1 = "rosettacode" - s2 = "raisethysword" - lev = levenshtein(s1, s2) - self.assertEqual(8, lev) - - s1 = "kitten" - s2 = "sitting" - lev = levenshtein(s1, s2, replace_cost=2) - self.assertEqual(5, lev) - - s1 = "abracadabra" - s2 = "abracadabra" - lev = levenshtein(s1, s2) - self.assertEqual(0, lev) - - s1 = "" - s2 = "abc" - lev = levenshtein(s1, s2) - self.assertEqual(3, lev) - - -def load_vol(path: str, num_pages: int) -> List[HtrcPage]: - pages = [] - for n in range(num_pages): - page_num = str(n+1).zfill(8) - with open('{}/{}.txt'.format(path, page_num), encoding='utf-8') as f: - lines = [line.rstrip() for line in f.readlines()] - pages.append(HtrcPage(lines)) - - return pages - - -if __name__ == '__main__': - unittest.main() diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index d85ec1f..865b826 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -9,43 +9,36 @@ `htrc.mock.volumes` contains Patch objects for testing workflows. """ from __future__ import print_function + from future import standard_library -standard_library.install_aliases() -from builtins import input +from htrc.models import HtrcPage + +standard_library.install_aliases() import http.client -from io import BytesIO # used to stream http response into zipfile. +from io import BytesIO, TextIOWrapper import json -import logging import os.path import progressbar -import re import socket import ssl -import sys -from time import sleep -from urllib.request import urlopen -from urllib.error import HTTPError -from urllib.parse import quote_plus, urlencode -import xml.etree.ElementTree as ET +from urllib.parse import urlencode from zipfile import ZipFile # used to decompress requested zip archives. +from tqdm import tqdm from htrc.runningheaders import parse_page_structure -from htrc.hf_vol_load import load_vol +from functools import partial import pandas as pd -import fnmatch -import glob -from tqdm import tqdm -import shutil -from htrc.lib.cli import bool_prompt from htrc.util import split_items import htrc.config +import multiprocessing import logging from logging import NullHandler logging.getLogger(__name__).addHandler(NullHandler()) -def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, mets=False): + +def get_volumes(data_api_config: htrc.config.HtrcDataApiConfig, volume_ids, concat=False, mets=False, buffer_size=128): """ Returns volumes from the Data API as a raw zip stream. @@ -60,7 +53,7 @@ def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, met if not volume_ids: raise ValueError("volume_ids is empty.") - url = epr + "volumes" + url = data_api_config.epr + "volumes" for id in volume_ids: if ("." 
not in id @@ -77,7 +70,7 @@ def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, met data['mets'] = 'true' # Authorization - headers = {"Authorization": "Bearer " + token, + headers = {"Authorization": "Bearer " + data_api_config.token, "Content-type": "application/x-www-form-urlencoded"} # Create SSL lookup @@ -87,8 +80,12 @@ def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, met ctx.verify_mode = ssl.CERT_NONE # Retrieve the volumes - httpsConnection = http.client.HTTPSConnection(host, port, context=ctx, key_file=key, cert_file=cert) - + httpsConnection = http.client.HTTPSConnection( + data_api_config.host, + data_api_config.port, + context=ctx, + key_file=data_api_config.key, + cert_file=data_api_config.cert) httpsConnection.request("POST", url, urlencode(data), headers) @@ -104,7 +101,7 @@ def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, met ' (', progressbar.FileTransferSpeed(), ')']) while body: - body = response.read(128) + body = response.read(buffer_size) data.write(body) bytes_downloaded += len(body) bar.update(bytes_downloaded) @@ -122,12 +119,12 @@ def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, met return data -def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=False): +def get_pages(data_api_config: htrc.config.HtrcDataApiConfig, page_ids, concat=False, mets=False, buffer_size=128): """ Returns a ZIP file containing specfic pages. Parameters: - :token: An OAuth2 token for the app. + :data_api_config: The configuration data of the DataAPI endpoint. :volume_ids: A list of volume_ids :concat: If True, return a single file per volume. If False, return a single file per page (default). @@ -135,7 +132,7 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa if not page_ids: raise ValueError("page_ids is empty.") - url = epr + "pages" + url = data_api_config.epr + "pages" for id in page_ids: if ("." 
not in id @@ -153,7 +150,7 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa data['mets'] = 'true' # Authorization - headers = {"Authorization": "Bearer " + token, + headers = {"Authorization": "Bearer " + data_api_config.token, "Content-type": "application/x-www-form-urlencoded"} @@ -164,8 +161,13 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa ctx.verify_mode = ssl.CERT_NONE # Retrieve the volumes - httpsConnection = http.client.HTTPSConnection(host, port, context=ctx, key_file=key, cert_file=cert) - + httpsConnection = http.client.HTTPSConnection( + data_api_config.host, + data_api_config.port, + context=ctx, + key_file=data_api_config.key, + cert_file=data_api_config.cert + ) httpsConnection.request("POST", url, urlencode(data), headers) @@ -181,7 +183,7 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa ' (', progressbar.FileTransferSpeed(), ')']) while body: - body = response.read(128) + body = response.read(buffer_size) data.write(body) bytes_downloaded += len(body) bar.update(bytes_downloaded) @@ -198,12 +200,13 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa return data + def get_oauth2_token(username, password): # make sure to set the request content-type as application/x-www-form-urlencoded headers = {"Content-type": "application/x-www-form-urlencoded"} - data = { "grant_type": "client_credentials", - "client_secret": password, - "client_id": username } + data = {"grant_type": "client_credentials", + "client_secret": password, + "client_id": username} data = urlencode(data) # create an SSL context @@ -235,20 +238,21 @@ def get_oauth2_token(username, password): logging.debug("Response Code: {}".format(response.status)) logging.debug("Response: {}".format(response.reason)) logging.debug(response.read()) - raise EnvironmentError("Unable to get token.") + raise EnvironmentError("Unable to get the token.") if httpsConnection is not None: httpsConnection.close() return token + def grep(file_name, output_dir, pattern): na_volume = [] for line in open(file_name): if pattern in line: na_volume.append(line.split()[-1]) if len(na_volume) < 100: - print("\nFollowing volume ids are not available.") + print("\nThe following volume ids are not available:") print("\n".join(str(item) for item in na_volume)) with open(os.path.join(output_dir, "volume_not_available.txt"), "w") as volume_na: volume_na.write("\n".join(str(item) for item in na_volume)) @@ -256,185 +260,154 @@ def grep(file_name, output_dir, pattern): if len(na_volume) == 100: print("\nThere are 100 or more unavailable volumes.\nTo check the validity of volumes in your workset or volume id file go to:\n https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org for assistance.") + def check_error_file(output_dir): file_name = "ERROR.err" if output_dir.endswith("/"): - file_path = output_dir+ file_name + file_path = output_dir + file_name else: - file_path = output_dir+"/"+file_name + file_path = output_dir + "/" + file_name if os.path.isfile(file_path): grep(file_path, output_dir, "KeyNotFoundException") -def remove_hf(output_dir): - os.makedirs(os.path.join(output_dir, "removed_hf_data"), exist_ok = True) - removed_hf = os.path.join(output_dir, "removed_hf_data") - vol_paths = glob.glob(os.path.join(output_dir,'**')) - df = pd.DataFrame() - - - for path in tqdm(vol_paths): - if os.path.isdir(path): - page_paths = sorted(glob.glob(os.path.join(path, '**', '*.txt'), 
recursive=True)) - n = len(page_paths) - num = 1 - - while num <= n: - for pg in page_paths: - parsed_path = str(path).split('/') - clean_path_root = '/'.join(parsed_path) - page_num = str(num).zfill(8) - new_filename = page_num+'.txt' - os.rename(pg, clean_path_root+'/'+new_filename) - num += 1 - - folder = os.path.basename(path) - n_pgs = len(fnmatch.filter(os.listdir(path), "*.txt")) - pages = parse_page_structure(load_vol(path, num_pages=n_pgs)) - - body = [] - for n, page in enumerate(pages): - s = "\nPage {} (has_header: {}, has_body: {}, has_footer: {})".format(n+1, page.has_header, page.has_body, page.has_footer) - - pg_boolean = s + "\n" + "-"*len(s) - pg_header = "Header:\n{}".format(page.header if page.has_header else "N/A") - #pg_body = page.body if page.has_body else "" - pg_footer = "Footer:\n{}".format(page.footer if page.has_footer else "N/A") - - body.append(page.body) - - df = df.append({"Volume":folder, "Page Info":pg_boolean, "Header":pg_header, "Footer":pg_footer}, ignore_index = True) - df.sort_values("Volume") - for i, g in df.groupby("Volume"): - g.to_csv(os.path.join(removed_hf, "removed_hf_data_{}.csv".format(i))) - - count = 1 - for item in body: - pg_n = str(count).zfill(8) - filename = '{}.txt'.format(pg_n) - count += 1 - with open(os.path.join(clean_path_root, filename), "w") as f_out: - f_out.write('{}\n'.format(item)) - -def remove_hf_concat(output_dir): - os.makedirs(os.path.join(output_dir, "removed_hf_data"), exist_ok = True) - removed_hf = os.path.join(output_dir, "removed_hf_data") - vol_paths = glob.glob(os.path.join(output_dir,'**')) - df = pd.DataFrame() - retain = ["removed_hf_data"] - rm_txt = "removed_hf_data.txt" - - - for path in tqdm(vol_paths): - if os.path.isdir(path): - page_paths = sorted(glob.glob(os.path.join(path, '**', '*.txt'), recursive=True)) - n = len(page_paths) - num = 1 - - while num <= n: - for pg in page_paths: - parsed_path = str(path).split('/') - clean_path_root = '/'.join(parsed_path) - page_num = str(num).zfill(8) - new_filename = page_num+'.txt' - os.rename(pg, clean_path_root+'/'+new_filename) - num += 1 - - folder = os.path.basename(path) - n_pgs = len(fnmatch.filter(os.listdir(path), "*.txt")) - pages = parse_page_structure(load_vol(path, num_pages=n_pgs)) - - filename = '{}.txt'.format(folder) - body = [] - for n, page in enumerate(pages): - s = "\nPage {} (has_header: {}, has_body: {}, has_footer: {})".format(n+1, page.has_header, page.has_body, page.has_footer) - - pg_boolean = s + "\n" + "-"*len(s) - pg_header = "Header:\n{}".format(page.header if page.has_header else "N/A") - #pg_body = page.body if page.has_body else "" - pg_footer = "Footer:\n{}".format(page.footer if page.has_footer else "N/A") - - body.append(page.body) - - df = df.append({"Volume":folder, "Page Info":pg_boolean, "Header":pg_header, "Footer":pg_footer}, ignore_index = True) - df.sort_values("Volume") - for i, g in df.groupby("Volume"): - g.to_csv(os.path.join(removed_hf, "removed_hf_data_{}.csv".format(i))) - - - with open(os.path.join(output_dir, filename), "w") as f_out: - f_out.write('\n'.join([str(item) + '\n' for item in body]) + '\n') - if folder not in retain: - shutil.rmtree(os.path.join(output_dir, folder)) - if os.path.exists(os.path.join(output_dir, rm_txt)): - os.remove(os.path.join(output_dir, rm_txt)) - - - -def download_volumes(volume_ids, output_dir, username=None, password=None, - config_path=None, token=None, headfootcon=False, headfoot=False, concat=False, mets=False, pages=False, host=None, port=None, cert=None, key=None, 
epr=None): - # create output_dir folder, if nonexistant - if not os.path.isdir(output_dir): - os.makedirs(output_dir) - - # get token if not specified - if not token: - token = htrc.config.get_jwt_token() - htrc.config.remove_jwt_token() - - if not host: - host= htrc.config.get_dataapi_host() - - if not port: - port = htrc.config.get_dataapi_port() - - if not epr: - epr = htrc.config.get_dataapi_epr() - - if not cert: - cert = htrc.config.get_dataapi_cert() - - if not key: - key = htrc.config.get_dataapi_key() - - if any((token, host, port)) is not None: - logging.info("obtained token: %s\n" % token) + +def _to_htrc_page(page_file, zip): + with TextIOWrapper(BytesIO(zip.read(page_file)), encoding='utf-8') as page: + return HtrcPage([line.rstrip() for line in page.readlines()]) + + +def download_volumes(volume_ids, output_dir, concat=False, mets=False, pages=False, + remove_headers_footers=False, hf_window_size=6, hf_min_similarity=0.7, save_removed_hf=True, + parallelism=multiprocessing.cpu_count(), batch_size=250, data_api_config=None): + if not 0 < parallelism <= multiprocessing.cpu_count(): + raise ValueError("Invalid parallelism level specified") + + remove_hf_fun = partial( + _remove_headers_footers_and_save, + concat=concat, + hf_min_similarity=hf_min_similarity, + hf_window_size=hf_window_size, + save_removed_hf=save_removed_hf, + output_dir=output_dir + ) + + volume_ids = list(set(volume_ids)) # ensure unique volume ids + num_vols = len(volume_ids) + + data_api_config = data_api_config or htrc.config.HtrcDataApiConfig() + + os.makedirs(output_dir, exist_ok=True) + + if any((data_api_config.token, data_api_config.host, data_api_config.port)) is not None: + logging.info("obtained token: %s\n" % data_api_config.token) try: - for ids in split_items(volume_ids, 250): - if pages: - if concat & mets: - raise ValueError("Cannot set both concat and mets with pages.") + errors = [] + rights = [] + + with tqdm(total=num_vols) as progress, multiprocessing.Pool(processes=parallelism) as pool: + for ids in split_items(volume_ids, batch_size): + if pages: + if concat and mets: + raise ValueError("Cannot set both concat and mets with pages.") + else: + data = get_pages(data_api_config, ids, concat and not remove_headers_footers, mets) else: - data = get_pages(token, ids, host, port, cert, key, epr, concat, mets) - else: - data = get_volumes(token, ids, host, port, cert, key, epr, concat, mets) + data = get_volumes(data_api_config, ids, concat and not remove_headers_footers, mets) + + volumes = [] + + with ZipFile(BytesIO(data)) as vols_zip: + zip_list = vols_zip.namelist() + if 'ERROR.err' in zip_list: + errors.append(vols_zip.read('ERROR.err').decode('utf-8')) + zip_list.remove('ERROR.err') + if 'volume-rights.txt' in zip_list: + rights_data = vols_zip.read('volume-rights.txt').decode('utf-8') + zip_list.remove('volume-rights.txt') + if not rights: + rights.append(rights_data) + else: + # due to the format in which 'volume-rights.txt' is created, we have to skip + # the first 4 lines which make up the header of the file, to extract only the + # actual volume rights data for accumulation + rights.append(''.join(rights_data.splitlines(keepends=True)[4:])) + + zip_volume_paths = [zip_vol_path for zip_vol_path in zip_list if zip_vol_path.endswith('/')] + num_vols_in_zip = len(zip_volume_paths) + + if not remove_headers_footers: + vols_zip.extractall(output_dir, members=zip_list) + progress.update(num_vols_in_zip) + else: + for zip_vol_path in zip_volume_paths: + sorted_vol_zip_page_paths = 
sorted(zip_page_path for zip_page_path in zip_list if zip_page_path.startswith(zip_vol_path) and not zip_page_path.endswith('/')) + vol_pages = [_to_htrc_page(page_path, vols_zip) for page_path in sorted_vol_zip_page_paths] + volumes.append((zip_vol_path, sorted_vol_zip_page_paths, vol_pages)) + + del data, vols_zip + + num_missing = batch_size - num_vols_in_zip if num_vols >= batch_size else num_vols - num_vols_in_zip + progress.update(num_missing) # update progress bar state to include the missing volumes also + + # `volumes` will be empty if `remove_headers_footers=False` since the ZIP was extracted + # without further processing + if volumes: + for _ in pool.imap_unordered(remove_hf_fun, volumes): + progress.update() + + if errors: + with open(os.path.join(output_dir, 'ERROR.err'), 'w') as err_file: + err_file.write(''.join(errors)) + check_error_file(output_dir) - myzip = ZipFile(BytesIO(data)) - myzip.extractall(output_dir) - myzip.close() + if rights: + with open(os.path.join(output_dir, 'volume-rights.txt'), 'w') as rights_file: + rights_file.write(''.join(rights)) - check_error_file(output_dir) - d = os.listdir(output_dir) - if headfoot: - if len(d) == 0: - print("This directory is empty") - sys.exit(1) - else: - remove_hf(output_dir) - if headfootcon: - if len(d) == 0: - print("This directory is empty") - sys.exit(1) - else: - remove_hf_concat(output_dir) - except socket.error: raise RuntimeError("Data API request timeout. Is your Data Capsule in Secure Mode?") else: - raise RuntimeError("Failed to obtain jwt token.") + raise RuntimeError("Failed to obtain the JWT token.") + + +def _remove_headers_footers_and_save(vol_data, concat, hf_min_similarity, hf_window_size, save_removed_hf, output_dir): + zip_vol_path, sorted_vol_zip_page_paths, vol_pages = vol_data + clean_volid = zip_vol_path[:-1] + + vol_pages = parse_page_structure(vol_pages, window_size=hf_window_size, min_similarity_ratio=hf_min_similarity) + pages_body = (page.body for page in vol_pages) + + if concat: + with open(os.path.join(output_dir, clean_volid + '.txt'), 'w', encoding='utf-8') as vol_file: + vol_file.write('\n'.join(pages_body)) + else: + vol_path = os.path.join(output_dir, zip_vol_path) + os.mkdir(vol_path) + for vol_page_path, page_body in zip(sorted_vol_zip_page_paths, pages_body): + with open(os.path.join(output_dir, vol_page_path), 'w', encoding='utf-8') as page_file: + page_file.write(page_body) + + if save_removed_hf: + # save the removed headers/footers for user inspection + removed_hf = [] + for vol_page_path, vol_page in zip(sorted_vol_zip_page_paths, vol_pages): + if not (vol_page.has_header or vol_page.has_footer): + # skip reporting pages that don't have an identified header or footer + continue + _, page_name = os.path.split(vol_page_path) + page_name, _ = os.path.splitext(page_name) + removed_hf.append({'page': page_name, 'header': vol_page.header, 'footer': vol_page.footer}) + + if concat: + removed_hf_filename = os.path.join(output_dir, clean_volid + '_removed_hf.csv') + else: + removed_hf_filename = os.path.join(output_dir, clean_volid, 'removed_hf.csv') + + pd.DataFrame(removed_hf, columns=['page', 'header', 'footer']).to_csv(removed_hf_filename, index=False) def download(args): @@ -442,9 +415,22 @@ def download(args): with open(args.file) as IDfile: volumeIDs = [line.strip() for line in IDfile] - return download_volumes(volumeIDs, args.output, - username=args.username, password=args.password, - token=args.token, headfoot=args.headfoot, headfootcon=args.headfootcon, concat=args.concat, 
mets=args.mets, pages=args.pages, host=args.datahost, - port=args.dataport, cert=args.datacert, key=args.datakey, - epr=args.dataepr) + data_api_config = htrc.config.HtrcDataApiConfig( + token=args.token, + host=args.datahost, + port=args.dataport, + epr=args.dataepr, + cert=args.datacert, + key=args.datakey + ) + return download_volumes(volumeIDs, args.output, + remove_headers_footers=args.remove_headers_footers or args.remove_headers_footers_and_concat, + concat=args.concat or args.remove_headers_footers_and_concat, + mets=args.mets, + pages=args.pages, + hf_window_size=args.window_size, + hf_min_similarity=args.min_similarity_ratio, + parallelism=args.parallelism, + batch_size=args.batch_size, + data_api_config=data_api_config) diff --git a/tests/test_htrc_volumes.py b/tests/test_htrc_volumes.py index d4d9abf..752cbf4 100644 --- a/tests/test_htrc_volumes.py +++ b/tests/test_htrc_volumes.py @@ -60,27 +60,53 @@ def test_get_volumes_and_pages(self, https_mock): response_mock.read.return_value =\ ''.encode('utf8') https_mock.return_value.getresponse.return_value = response_mock - - htrc.volumes.get_volumes('1234', self.test_vols, 'data-host', '443', '/home/client-certs/client.pem', '/home/client-certs/client.pem', '/' ) - htrc.volumes.get_pages('1234', self.test_vols, 'data-host', '443', '/home/client-certs/client.pem', '/home/client-certs/client.pem', '/') + data_api_config = htrc.config.HtrcDataApiConfig( + token='1234', + host='data-host', + port=443, + epr='/', + cert='/home/client-certs/client.pem', + key='/home/client-certs/client.pem' + ) + + htrc.volumes.get_volumes(data_api_config, self.test_vols) + htrc.volumes.get_pages(data_api_config, self.test_vols) @patch('htrc.volumes.http.client.HTTPSConnection') def test_get_volumes_and_pages_error(self, https_mock): response_mock = Mock(status=500) https_mock.return_value.getresponse.return_value = response_mock + data_api_config = htrc.config.HtrcDataApiConfig( + token='1234', + host='data-host', + port=443, + epr='/', + cert='/home/client-certs/client.pem', + key='/home/client-certs/client.pem' + ) + with self.assertRaises(EnvironmentError): - htrc.volumes.get_volumes('1234', self.test_vols, 'data-host', '443', '/home/client-certs/client.pem', '/home/client-certs/client.pem', '/' ) + htrc.volumes.get_volumes(data_api_config, self.test_vols) with self.assertRaises(EnvironmentError): - htrc.volumes.get_pages('1234', self.test_vols, 'data-host', '443', '/home/client-certs/client.pem', '/home/client-certs/client.pem', '/') + htrc.volumes.get_pages(data_api_config, self.test_vols) def test_get_volumes_and_pages_empty(self): + data_api_config = htrc.config.HtrcDataApiConfig( + token='1234', + host='data-host', + port=443, + epr='/', + cert='/home/client-certs/client.pem', + key='/home/client-certs/client.pem' + ) + with self.assertRaises(ValueError): - htrc.volumes.get_volumes('1234', [], 'data-host', '443', '/home/client-certs/client.pem', '/home/client-certs/client.pem', '/' ) + htrc.volumes.get_volumes(data_api_config, []) with self.assertRaises(ValueError): - htrc.volumes.get_pages('1234', [], 'data-host', '443', '/home/client-certs/client.pem', '/home/client-certs/client.pem', '/' ) + htrc.volumes.get_pages(data_api_config, []) @patch('htrc.volumes.ZipFile') @patch('htrc.volumes.get_volumes') @@ -93,14 +119,21 @@ def test_download_volumes(self, https_mock, oauth2_mock, volumes_mock, oauth2_mock.return_value = 'a1b2c3d4e5' volumes_mock.return_value = b'' - htrc.volumes.download_volumes(self.test_vols, self.output_path, - username='1234', 
password='1234', token='1234') + data_api_config = htrc.config.HtrcDataApiConfig( + token='1234', + host='data-host', + port=443, + epr='/', + cert='/home/client-certs/client.pem', + key='/home/client-certs/client.pem' + ) + + htrc.volumes.download_volumes(self.test_vols, self.output_path, data_api_config=data_api_config) # test directory creation import shutil shutil.rmtree(self.output_path) - htrc.volumes.download_volumes(self.test_vols, self.output_path, - username='1234', password='1234', token='1234') + htrc.volumes.download_volumes(self.test_vols, self.output_path, data_api_config=data_api_config) # TODO: Fix this test for case where config file exists, but creds not set """ @@ -132,6 +165,7 @@ def test_download_volumes_saved_creds(self, https_mock, oauth2_mock, volumes_moc def test_download(self): pass + suite = unittest.TestLoader().loadTestsFromTestCase(TestVolumes) unittest.TextTestRunner(verbosity=2).run(suite) From 6afff024474db6022d969d7e25ed1b4b0e877112 Mon Sep 17 00:00:00 2001 From: Boris Capitanu Date: Thu, 25 Mar 2021 10:10:35 -0500 Subject: [PATCH 41/49] Added cmd line option for user to specify that they want the removed headers/footers saved for inspection (no longer turned on by default --- user must specify!) --- htrc/__main__.py | 23 ++++++++++++----------- htrc/volumes/__init__.py | 1 + 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/htrc/__main__.py b/htrc/__main__.py index add1f0f..d82cb75 100644 --- a/htrc/__main__.py +++ b/htrc/__main__.py @@ -6,7 +6,6 @@ from future import standard_library standard_library.install_aliases() -import json import os import os.path import shutil @@ -30,15 +29,15 @@ def download_parser(parser=None): parser.add_argument("-u", "--username", help="HTRC username") parser.add_argument("-p", "--password", help="HTRC password") parser.add_argument("file", nargs='?', default=sys.stdin, - help="workset path[s]") + help="Workset path[s]") parser.add_argument("-f", "--force", action='store_true', - help="remove folder if exists") - parser.add_argument("-o", "--output", help="output directory", + help="Remove folder if exists") + parser.add_argument("-o", "--output", help="Output directory", default='/media/secure_volume/workset/') parser.add_argument("-hf", "--remove-headers-footers", action='store_true', - help="remove headers and footers from individual pages") + help="Remove headers and footers from individual pages") parser.add_argument("-hfc", "--remove-headers-footers-and-concat", action='store_true', - help="remove headers and footers from individual pages then concatenate pages") + help="Remove headers and footers from individual pages then concatenate pages") parser.add_argument("-w", "--window-size", required=False, type=int, metavar="N", default=6, help="How many pages ahead does the header/footer extractor algorithm look to find potential " "matching headers/footers (higher value gives potentially more accurate results on lower " @@ -48,14 +47,16 @@ def download_parser(parser=None): "algorithm to declare that two headers are considered 'the same' (the higher the value, up " "to a max of 1.0, the more strict the matching has to be; lower values allow for more " "fuzziness to account for OCR errors)") + parser.add_argument("-s", "--save-removed-hf", action='store_true', + help="Save a report of the removed headers and footers for each page for inspection") parser.add_argument("--parallelism", required=False, type=int, metavar="N", default=os.cpu_count(), help="The max number of concurrent tasks to start when 
downloading or removing headers/footers") parser.add_argument("--batch-size", required=False, type=int, metavar="N", default=250, help="The max number of volumes to download at a time from DataAPI") parser.add_argument("-c", "--concat", action='store_true', - help="concatenate a volume's pages in to a single file") + help="Concatenate a volume's pages in to a single file") parser.add_argument("-m", "--mets", action='store_true', - help="add volume's METS file") + help="Add volume's METS file") parser.add_argument("-pg", "--pages",action='store_true', help="Download given page numbers of a volumes.") parser.add_argument("-t", "--token", help="JWT for volumes download.") @@ -70,13 +71,13 @@ def download_parser(parser=None): def add_workset_path(parser=None): if parser is None: parser = ArgumentParser() - parser.add_argument("path", nargs='+', help="workset path[s]") + parser.add_argument("path", nargs='+', help="Workset path[s]") return parser def main(): parser = ArgumentParser() - parser.add_argument('-d', '--debug', help="print long debug messages", + parser.add_argument('-d', '--debug', help="Print long debug messages", action='store_true') parsers = parser.add_subparsers(help="select a command") @@ -101,7 +102,7 @@ def main(): # Run helper parser_run = parsers.add_parser('run', help="Run a built-in algorithm.") - run_parsers = parser_run.add_subparsers(help="select a command") + run_parsers = parser_run.add_subparsers(help="Select a command") parser_mallet = run_parsers.add_parser('mallet') htrc.tools.mallet.populate_parser(parser_mallet) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index 865b826..9d58a82 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -433,4 +433,5 @@ def download(args): hf_min_similarity=args.min_similarity_ratio, parallelism=args.parallelism, batch_size=args.batch_size, + save_removed_hf=args.save_removed_hf, data_api_config=data_api_config) From bc43c59b28684a14118f6698999f77805ecf6434 Mon Sep 17 00:00:00 2001 From: David K Date: Wed, 31 Mar 2021 17:34:28 -0400 Subject: [PATCH 42/49] Changed defaults for -s flag Changed -s flag to mean skip-removed-hf so that users only call the flag if they do NOT wish the .csv file of removed headers/footers to be saved to the output directory. 
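
In effect, the renamed flag is an opt-out rather than an opt-in. A minimal sketch of the intended semantics, using only the argument names from the patch (the surrounding parser setup is assumed):

    from argparse import ArgumentParser

    parser = ArgumentParser()
    # store_true defaults to False, so the removed-headers/footers report
    # is produced unless the user explicitly passes -s / --skip-removed-hf
    parser.add_argument("-s", "--skip-removed-hf", action='store_true',
                        help="Skip creating a saved report of the removed headers and footers")

    args = parser.parse_args(["-s"])
    assert args.skip_removed_hf is True   # report suppressed
    args = parser.parse_args([])
    assert args.skip_removed_hf is False  # report saved (the default)
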
---
 htrc/__main__.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/htrc/__main__.py b/htrc/__main__.py
index d82cb75..297ac5f 100644
--- a/htrc/__main__.py
+++ b/htrc/__main__.py
@@ -35,9 +35,9 @@ def download_parser(parser=None):
     parser.add_argument("-o", "--output", help="Output directory",
                         default='/media/secure_volume/workset/')
     parser.add_argument("-hf", "--remove-headers-footers", action='store_true',
-                        help="Remove headers and footers from individual pages")
+                        help="Remove headers and footers from individual pages and save in a separate csv file for inspection")
     parser.add_argument("-hfc", "--remove-headers-footers-and-concat", action='store_true',
-                        help="Remove headers and footers from individual pages then concatenate pages")
+                        help="Remove headers and footers from individual pages and save in a separate csv file for inspection then concatenate pages")
     parser.add_argument("-w", "--window-size", required=False, type=int, metavar="N", default=6,
                         help="How many pages ahead does the header/footer extractor algorithm look to find potential "
                              "matching headers/footers (higher value gives potentially more accurate results on lower "
@@ -47,8 +47,8 @@ def download_parser(parser=None):
                              "algorithm to declare that two headers are considered 'the same' (the higher the value, up "
                              "to a max of 1.0, the more strict the matching has to be; lower values allow for more "
                              "fuzziness to account for OCR errors)")
-    parser.add_argument("-s", "--save-removed-hf", action='store_true',
-                        help="Save a report of the removed headers and footers for each page for inspection")
+    parser.add_argument("-s", "--skip-removed-hf", action='store_true',
+                        help="Skip creating a saved report of the removed headers and footers for each page for inspection")
     parser.add_argument("--parallelism", required=False, type=int, metavar="N", default=os.cpu_count(),
                         help="The max number of concurrent tasks to start when downloading or removing headers/footers")
     parser.add_argument("--batch-size", required=False, type=int, metavar="N", default=250,

From ef4dd442e15d267557926cee30aba2f59dd9689c Mon Sep 17 00:00:00 2001
From: David K
Date: Wed, 31 Mar 2021 17:38:17 -0400
Subject: [PATCH 43/49] Changed -s flag default

Changed the -s flag so that, when -hf or -hfc is used, the .csv files
containing the removed headers/footers are saved to the output directory by
default. Passing -s together with -hf or -hfc now skips creating those .csv
files.
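
The change in this patch amounts to gating the report generation on the new skip_removed_hf parameter; the cleaned page bodies are written out either way. A condensed sketch of that control flow (identifiers such as parse_page_structure, hf_window_size, hf_min_similarity, has_header, has_footer, and the CSV columns come from the diff below; write_pages, page_names, and removed_hf_filename are hypothetical stand-ins for the file-writing and path-handling code spelled out in the patch):

    # inside _remove_headers_footers_and_save, after the pages are loaded
    vol_pages = parse_page_structure(vol_pages, window_size=hf_window_size,
                                     min_similarity_ratio=hf_min_similarity)

    # page bodies are always written, concatenated or one file per page
    write_pages(vol_pages, concat=concat)  # hypothetical helper

    if not skip_removed_hf:
        # report only pages where a header or footer was actually detected
        removed_hf = [{'page': name, 'header': p.header, 'footer': p.footer}
                      for name, p in zip(page_names, vol_pages)
                      if p.has_header or p.has_footer]
        pd.DataFrame(removed_hf, columns=['page', 'header', 'footer']) \
            .to_csv(removed_hf_filename, index=False)
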
--- htrc/volumes/__init__.py | 42 +++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index 9d58a82..4b638f3 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -279,7 +279,7 @@ def _to_htrc_page(page_file, zip): def download_volumes(volume_ids, output_dir, concat=False, mets=False, pages=False, - remove_headers_footers=False, hf_window_size=6, hf_min_similarity=0.7, save_removed_hf=True, + remove_headers_footers=False, hf_window_size=6, hf_min_similarity=0.7, skip_removed_hf=False, parallelism=multiprocessing.cpu_count(), batch_size=250, data_api_config=None): if not 0 < parallelism <= multiprocessing.cpu_count(): raise ValueError("Invalid parallelism level specified") @@ -289,7 +289,7 @@ def download_volumes(volume_ids, output_dir, concat=False, mets=False, pages=Fal concat=concat, hf_min_similarity=hf_min_similarity, hf_window_size=hf_window_size, - save_removed_hf=save_removed_hf, + skip_removed_hf=skip_removed_hf, output_dir=output_dir ) @@ -374,25 +374,34 @@ def download_volumes(volume_ids, output_dir, concat=False, mets=False, pages=Fal raise RuntimeError("Failed to obtain the JWT token.") -def _remove_headers_footers_and_save(vol_data, concat, hf_min_similarity, hf_window_size, save_removed_hf, output_dir): +def _remove_headers_footers_and_save(vol_data, concat, hf_min_similarity, hf_window_size, skip_removed_hf, output_dir): zip_vol_path, sorted_vol_zip_page_paths, vol_pages = vol_data clean_volid = zip_vol_path[:-1] vol_pages = parse_page_structure(vol_pages, window_size=hf_window_size, min_similarity_ratio=hf_min_similarity) pages_body = (page.body for page in vol_pages) - - if concat: - with open(os.path.join(output_dir, clean_volid + '.txt'), 'w', encoding='utf-8') as vol_file: - vol_file.write('\n'.join(pages_body)) + # save the removed headers/footers for user inspection + if skip_removed_hf: + if concat: + with open(os.path.join(output_dir, clean_volid + '.txt'), 'w', encoding='utf-8') as vol_file: + vol_file.write('\n'.join(pages_body)) + else: + vol_path = os.path.join(output_dir, zip_vol_path) + os.mkdir(vol_path) + for vol_page_path, page_body in zip(sorted_vol_zip_page_paths, pages_body): + with open(os.path.join(output_dir, vol_page_path), 'w', encoding='utf-8') as page_file: + page_file.write(page_body) else: - vol_path = os.path.join(output_dir, zip_vol_path) - os.mkdir(vol_path) - for vol_page_path, page_body in zip(sorted_vol_zip_page_paths, pages_body): - with open(os.path.join(output_dir, vol_page_path), 'w', encoding='utf-8') as page_file: - page_file.write(page_body) - - if save_removed_hf: - # save the removed headers/footers for user inspection + if concat: + with open(os.path.join(output_dir, clean_volid + '.txt'), 'w', encoding='utf-8') as vol_file: + vol_file.write('\n'.join(pages_body)) + else: + vol_path = os.path.join(output_dir, zip_vol_path) + os.mkdir(vol_path) + for vol_page_path, page_body in zip(sorted_vol_zip_page_paths, pages_body): + with open(os.path.join(output_dir, vol_page_path), 'w', encoding='utf-8') as page_file: + page_file.write(page_body) + removed_hf = [] for vol_page_path, vol_page in zip(sorted_vol_zip_page_paths, vol_pages): if not (vol_page.has_header or vol_page.has_footer): @@ -408,6 +417,7 @@ def _remove_headers_footers_and_save(vol_data, concat, hf_min_similarity, hf_win removed_hf_filename = os.path.join(output_dir, clean_volid, 'removed_hf.csv') pd.DataFrame(removed_hf, columns=['page', 'header', 
'footer']).to_csv(removed_hf_filename, index=False) + def download(args): @@ -433,5 +443,5 @@ def download(args): hf_min_similarity=args.min_similarity_ratio, parallelism=args.parallelism, batch_size=args.batch_size, - save_removed_hf=args.save_removed_hf, + skip_removed_hf=args.skip_removed_hf, data_api_config=data_api_config) From a4832b70f1ef20f1823822f989e7f5833aaa6265 Mon Sep 17 00:00:00 2001 From: David K Date: Thu, 1 Apr 2021 16:26:13 -0400 Subject: [PATCH 44/49] Update cli.rst --- docs/source/cli.rst | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/docs/source/cli.rst b/docs/source/cli.rst index 8101173..2a0ff7c 100644 --- a/docs/source/cli.rst +++ b/docs/source/cli.rst @@ -125,6 +125,23 @@ Following are the use cases and examples of ``htrc`` commands inside the HTRC Da * Download volumes, extract headers/footers from the volume pages then concatenate the pages - (This will concatenate all the pages of the volume into one txt file.) : ``htrc download -hfc /home/dcuser/HTRC/htrc-id`` + +* Download volumes, extract headers/footers from the volumes, skip downloading the .csv files containing removed headers and footers : + + ``htrc download -hf -s /home/dcuser/HTRC/htrc-id`` + +* Download volumes, extract headers/footers from volumes, change window of pages in extractor algorithm (The default is 6, lower numbers increase speed, but are less accurate) : + + ``htrc download -hf -w 3 /home/dcuser/HTRC/htrc-id`` + +* Download volumes, extract headers/footers from volumes, change minimum similarity rate for lines on pages to be considered a header or footer (Default is .7 or 70%, so if a line is 70% the same as other lines on other pages within the window of pages it is labeled a header or footer and removed) : + + ``htrc download -hf -msr .9 /home/dcuser/HTRC/htrc-id`` + +* Download volumes, extract headers/footers from volumes, change the max number of concurrent tasks (note that the only options are 1 or 2): + + ``htrc download -hf --parallelism 2 /home/dcuser/HTRC/htrc-id`` + | +---------------------------------+-----------------------------------------------+ From eb00ece77ba9c0f49ee0750a960eb2a4cb2c1eb5 Mon Sep 17 00:00:00 2001 From: Boris Capitanu Date: Tue, 6 Apr 2021 09:54:05 -0500 Subject: [PATCH 45/49] Fixed formatting issue --- htrc/volumes/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index dc1f035..48de66c 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -368,11 +368,11 @@ def download_volumes(volume_ids, output_dir, concat=False, mets=False, pages=Fal with open(os.path.join(output_dir, 'volumes_not_available.txt'), 'w') as volumes_na: volumes_na.write("\n".join(str(item) for item in na_volumes_all)) - if num_na < 100: - print("\nThe following volume ids are not available. \n Please check volumes_not_available.txt for the " - "complete list. ") - print('\n'.join(str(item) for item in na_volumes_all)) - else: + if num_na < 100: + print("\nThe following volume ids are not available. \n Please check volumes_not_available.txt " + "for the complete list. ") + print('\n'.join(str(item) for item in na_volumes_all)) + else: print("\nThere are {:,} unavailable volumes.\n Please check volumes_not_available.txt " "for the " "complete list. 
\nTo check the validity of volumes in your workset or volume id file go " From aa3b5db6c35f0c51bc16c1c0ffa46503ae9261a0 Mon Sep 17 00:00:00 2001 From: Boris Capitanu Date: Tue, 6 Apr 2021 11:38:24 -0500 Subject: [PATCH 46/49] Fixed another formatting issue --- htrc/volumes/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index 48de66c..c4c11b9 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -432,7 +432,6 @@ def _remove_headers_footers_and_save(vol_data, concat, hf_min_similarity, hf_win removed_hf_filename = os.path.join(output_dir, clean_volid, 'removed_hf.csv') pd.DataFrame(removed_hf, columns=['page', 'header', 'footer']).to_csv(removed_hf_filename, index=False) - def download(args): From 28a5a408f18dd85e5271445d2d17732d0be4d8c2 Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Thu, 8 Apr 2021 10:46:22 -0400 Subject: [PATCH 47/49] Disabled username and password paraser arguments since those are not used. --- htrc/__main__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/htrc/__main__.py b/htrc/__main__.py index 843b9e5..6102ad9 100644 --- a/htrc/__main__.py +++ b/htrc/__main__.py @@ -26,8 +26,8 @@ def download_parser(parser=None): if parser is None: parser = ArgumentParser() - parser.add_argument("-u", "--username", help="HTRC username") - parser.add_argument("-p", "--password", help="HTRC password") + #parser.add_argument("-u", "--username", help="HTRC username") + #parser.add_argument("-p", "--password", help="HTRC password") parser.add_argument("file", nargs='?', default=sys.stdin, help="Workset path[s]") parser.add_argument("-f", "--force", action='store_true', From ca43dd69af3094ef84450f8cee09a44fced2001b Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Mon, 12 Apr 2021 10:03:56 -0400 Subject: [PATCH 48/49] Set final release version. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index bb9bffe..4fa1ed9 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ import atexit import tarfile -__version__ = '0.1.56b0' +__version__ = '0.1.56' install_requires = ['PyLD', 'future', 'prov', 'unicodecsv', 'progressbar2', 'pandas', 'requests', 'argparse==1.1', 'topicexplorer==1.0b226', 'numpy==1.16.2', 'tqdm==4.46.0'] From 5225dc739f0ab5afda5790d48d08e6f451b40f4a Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Mon, 12 Apr 2021 10:05:33 -0400 Subject: [PATCH 49/49] Set pre release version. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4fa1ed9..e2d26f3 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ import atexit import tarfile -__version__ = '0.1.56' +__version__ = '0.1.57b0' install_requires = ['PyLD', 'future', 'prov', 'unicodecsv', 'progressbar2', 'pandas', 'requests', 'argparse==1.1', 'topicexplorer==1.0b226', 'numpy==1.16.2', 'tqdm==4.46.0']
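
The final two patches tag the 0.1.56 release and then immediately move setup.py to the 0.1.57b0 pre-release. Under PEP 440 a pre-release sorts below the final version it precedes, so installs pinned to the release are unaffected by the bump; a quick illustration (the packaging library is not a dependency of this project and is used here only to show the ordering):

    from packaging.version import Version

    # beta builds sort below their final release, which sorts below the next beta
    assert Version("0.1.56b0") < Version("0.1.56") < Version("0.1.57b0")
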