From fefcf5ada39ca8222b2e27d82092440700cdded4 Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 12:12:05 -0500 Subject: [PATCH 01/49] Added -hf and -hfc arguments --- htrc/__main__.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/htrc/__main__.py b/htrc/__main__.py index 04b26b4..bacc652 100644 --- a/htrc/__main__.py +++ b/htrc/__main__.py @@ -12,10 +12,12 @@ import sys from tempfile import NamedTemporaryFile + from htrc.metadata import get_metadata, get_volume_metadata import htrc.volumes import htrc.workset import htrc.tools.mallet + from argparse import ArgumentParser import htrc.tools.topicexplorer from htrc.lib.cli import bool_prompt @@ -33,6 +35,10 @@ def download_parser(parser=None): help="remove folder if exists") parser.add_argument("-o", "--output", help="output directory", default='/media/secure_volume/workset/') + parser.add_argument("-hf", "--headfoot", action = 'store_true', + help="remove headers and footers from individual pages") + parser.add_argument("-hfc", "--headfootcon", action = 'store_true', + help="remove headers and footers from individual pages then concatenate pages") parser.add_argument("-c", "--concat", action='store_true', help="concatenate a volume's pages in to a single file") parser.add_argument("-m", "--mets", action='store_true', @@ -78,7 +84,8 @@ def main(): help="Download HathiTrust volumes to disk [requires auth]") download_parser(parser_download) parser_download.set_defaults(func='download') - + + # Run helper parser_run = parsers.add_parser('run', help="Run a built-in algorithm.") run_parsers = parser_run.add_subparsers(help="select a command") @@ -125,7 +132,14 @@ def main(): else: print("Please choose another output folder and try again.") sys.exit(1) - + d = os.listdir(args.output) + if args.headfoot is True: + if len(d) == 0: + print("This director is empty") + else: + htrc.volumes.remove_hf(args.output) + if args.headfootcon is True: + htrc.volumes.remove_hf_concat(args.output) if args.pages: if args.mets and args.concat: print ("Cannot set both concat and mets with pages") From 187836b65294391333efeae826b74b2df175077c Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 12:13:14 -0500 Subject: [PATCH 02/49] Added code to try and run header/footer extractor --- htrc/volumes/__init__.py | 120 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 116 insertions(+), 4 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index 6ddb9a7..ed22628 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -30,7 +30,13 @@ from urllib.parse import quote_plus, urlencode import xml.etree.ElementTree as ET from zipfile import ZipFile # used to decompress requested zip archives. 
- +from htrc.runningheaders import parse_page_structure +from htrc.hf_vol_load import load_vol +import pandas as pd +import fnmatch +import glob +from tqdm import tqdm +import shutil from htrc.lib.cli import bool_prompt from htrc.util import split_items import htrc.config @@ -63,6 +69,7 @@ def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, met data = {'volumeIDs': '|'.join( [id.replace('+', ':').replace('=', '/') for id in volume_ids])} + if concat: data['concat'] = 'true' @@ -260,9 +267,110 @@ def check_error_file(output_dir): if os.path.isfile(file_path): grep(file_path, output_dir, "KeyNotFoundException") +def remove_hf(output_dir): + if __name__ == '__main__': + os.makedirs(os.path.join(output_dir, "removed_hf_files"), exist_ok = True) + removed_hf = os.path.join(output_dir, "removed_hf_files") + vol_paths = glob.glob(os.path.join(output_dir,'**')) + df = pd.DataFrame() + + + for path in tqdm(vol_paths): + if os.path.isdir(path): + page_paths = sorted(glob.glob(os.path.join(path, '**', '*.txt'), recursive=True)) + n = len(page_paths) + num = 1 + + while num <= n: + for pg in page_paths: + parsed_path = str(path).split('/') + clean_path_root = '/'.join(parsed_path) + page_num = str(num).zfill(8) + new_filename = page_num+'.txt' + os.rename(pg, clean_path_root+'/'+new_filename) + num += 1 + + folder = os.path.basename(path) + n_pgs = len(fnmatch.filter(os.listdir(path), "*.txt")) + pages = parse_page_structure(load_vol(path, num_pages=n_pgs)) + + body = [] + for n, page in enumerate(pages): + s = "\nPage {} (has_header: {}, has_body: {}, has_footer: {})".format(n+1, page.has_header, page.has_body, page.has_footer) + + pg_boolean = s + "\n" + "-"*len(s) + pg_header = "Header:\n{}".format(page.header if page.has_header else "N/A") + #pg_body = page.body if page.has_body else "" + pg_footer = "Footer:\n{}".format(page.footer if page.has_footer else "N/A") + + body.append(page.body) + + df = df.append({"Volume":folder, "Page Info":pg_boolean, "Header":pg_header, "Footer":pg_footer}, ignore_index = True) + df.sort_values("Volume") + for i, g in df.groupby("Volume"): + g.to_csv(os.path.join(removed_hf, "removed_hf_data_{}.csv".format(i))) + + count = 1 + for item in body: + pg_n = str(count).zfill(8) + filename = '{}.txt'.format(pg_n) + count += 1 + with open(os.path.join(clean_path_root, filename), "w") as f_out: + f_out.write('{}\n'.format(item)) + +def remove_hf_concat(output_dir): + if __name__ == '__main__': + os.makedirs(os.path.join(output_dir, "removed_hf_files"), exist_ok = True) + removed_hf = os.path.join(output_dir, "removed_hf_files") + vol_paths = glob.glob(os.path.join(output_dir,'**')) + df = pd.DataFrame() + retain = ["removed_hf_files"] + + + for path in tqdm(vol_paths): + if os.path.isdir(path): + page_paths = sorted(glob.glob(os.path.join(path, '**', '*.txt'), recursive=True)) + n = len(page_paths) + num = 1 + + while num <= n: + for pg in page_paths: + parsed_path = str(path).split('/') + clean_path_root = '/'.join(parsed_path) + page_num = str(num).zfill(8) + new_filename = page_num+'.txt' + os.rename(pg, clean_path_root+'/'+new_filename) + num += 1 + + folder = os.path.basename(path) + n_pgs = len(fnmatch.filter(os.listdir(path), "*.txt")) + pages = parse_page_structure(load_vol(path, num_pages=n_pgs)) + + filename = '{}.txt'.format(folder) + body = [] + for n, page in enumerate(pages): + s = "\nPage {} (has_header: {}, has_body: {}, has_footer: {})".format(n+1, page.has_header, page.has_body, page.has_footer) + + pg_boolean = s + "\n" + 
"-"*len(s) + pg_header = "Header:\n{}".format(page.header if page.has_header else "N/A") + #pg_body = page.body if page.has_body else "" + pg_footer = "Footer:\n{}".format(page.footer if page.has_footer else "N/A") + + body.append(page.body) + + df = df.append({"Volume":folder, "Page Info":pg_boolean, "Header":pg_header, "Footer":pg_footer}, ignore_index = True) + df.sort_values("Volume") + for i, g in df.groupby("Volume"): + g.to_csv(os.path.join(removed_hf, "removed_hf_data_{}.csv".format(i))) + + + with open(os.path.join(output_dir, filename), "w") as f_out: + f_out.write('\n'.join([str(item) + '\n' for item in body]) + '\n') + if folder not in retain: + shutil.rmtree(os.path.join(output_dir, folder)) def download_volumes(volume_ids, output_dir, username=None, password=None, - config_path=None, token=None, concat=False, mets=False, pages=False, host=None, port=None, cert=None, key=None, epr=None): + config_path=None, token=None, headfootcon=False, headfoot=False, concat=False, mets=False, pages=False, host=None, port=None, cert=None, key=None, epr=None): # create output_dir folder, if nonexistant if not os.path.isdir(output_dir): os.makedirs(output_dir) @@ -305,7 +413,11 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, myzip.close() check_error_file(output_dir) - + if headfoot: + remove_hf(output_dir) + if headfootcon: + remove_hf_concat(output_dir) + except socket.error: raise RuntimeError("Data API request timeout. Is your Data Capsule in Secure Mode?") @@ -320,7 +432,7 @@ def download(args): return download_volumes(volumeIDs, args.output, username=args.username, password=args.password, - token=args.token, concat=args.concat, mets=args.mets, pages=args.pages, host=args.datahost, + token=args.token, headfoot=args.headfoot, headfootcon=args.headfootcon, concat=args.concat, mets=args.mets, pages=args.pages, host=args.datahost, port=args.dataport, cert=args.datacert, key=args.datakey, epr=args.dataepr) From fa7ff5ac10864b60787996e2b841eac7bfaefbfa Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 15:53:12 -0500 Subject: [PATCH 03/49] Create na --- htrc/hf_utils/na | 1 + 1 file changed, 1 insertion(+) create mode 100644 htrc/hf_utils/na diff --git a/htrc/hf_utils/na b/htrc/hf_utils/na new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/htrc/hf_utils/na @@ -0,0 +1 @@ + From a546eca9da4908fe23ec03db893f7e6d81bbc997 Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 15:53:53 -0500 Subject: [PATCH 04/49] Create na --- htrc/hf_vol_load/na | 1 + 1 file changed, 1 insertion(+) create mode 100644 htrc/hf_vol_load/na diff --git a/htrc/hf_vol_load/na b/htrc/hf_vol_load/na new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/htrc/hf_vol_load/na @@ -0,0 +1 @@ + From 9ef790adacec73af01a0f20638f6168ec20babfd Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 15:54:45 -0500 Subject: [PATCH 05/49] Create na --- htrc/models/na | 1 + 1 file changed, 1 insertion(+) create mode 100644 htrc/models/na diff --git a/htrc/models/na b/htrc/models/na new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/htrc/models/na @@ -0,0 +1 @@ + From 9094677a8933a213500de0411db0265e7be8dd87 Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 15:55:34 -0500 Subject: [PATCH 06/49] Create na --- htrc/runningheaders/na | 1 + 1 file changed, 1 insertion(+) create mode 100644 htrc/runningheaders/na diff --git a/htrc/runningheaders/na b/htrc/runningheaders/na new file mode 100644 index 0000000..8b13789 --- /dev/null +++ 
b/htrc/runningheaders/na @@ -0,0 +1 @@ + From 9d2520be71d98670a94fd0709e8eb2808d678364 Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 15:56:34 -0500 Subject: [PATCH 07/49] Add files via upload --- htrc/runningheaders/__init__.py | 163 ++++++++++++++++++++++++++++++++ 1 file changed, 163 insertions(+) create mode 100644 htrc/runningheaders/__init__.py diff --git a/htrc/runningheaders/__init__.py b/htrc/runningheaders/__init__.py new file mode 100644 index 0000000..799bf39 --- /dev/null +++ b/htrc/runningheaders/__init__.py @@ -0,0 +1,163 @@ +import re +from collections import defaultdict +from typing import List, TypeVar, Set, Iterator, Optional, Tuple, Dict + +from htrc.models import Page, PageStructure +from htrc.hf_utils import clean_text, levenshtein, pairwise_combine_within_distance, flatten, group_consecutive_when + +T = TypeVar('T', bound=Page) +U = TypeVar('U', bound=PageStructure) + + +class _Line: + def __init__(self, text: str, line_number: int, page: Page) -> None: + self.text = text + self.line_number = line_number + self.page = page + self.cleaned_text = clean_text(text) + + def __eq__(self, o: object) -> bool: + if not isinstance(o, _Line): + raise NotImplemented + + are_equal = self.page is o.page and self.line_number == o.line_number + + return are_equal + + def __ne__(self, o: object) -> bool: + return not self == o + + def __hash__(self) -> int: + line_hash = hash(self.line_number) + page_hash = hash(self.page) + hash_value = 31 * line_hash + page_hash + + return hash_value + + def __str__(self) -> str: + return str((self.line_number, self.cleaned_text)) + + def similarity_ratio(self, line: '_Line') -> float: + ratio = 1 - float(levenshtein(self.cleaned_text, line.cleaned_text)) / max(len(self.cleaned_text), + len(line.cleaned_text)) + + return ratio + + +def parse_page_structure(pages: List[T], + window_size: int = 6, + min_similarity_ratio: float = 0.7, + min_cluster_size: int = 3, + max_header_lines: int = 3, + max_footer_lines: int = 3) -> List[U]: + def _get_page_lines(p: T) -> List[_Line]: + return [_Line(text, line_num, p) for line_num, text in enumerate(p.text_lines)] + + def _cluster_lines(lines: List[Tuple[_Line, _Line]]) -> Set[tuple]: + cluster_map = {} + + for l1, l2 in lines: + c1 = cluster_map.get(l1) + c2 = cluster_map.get(l2) + + if c1 is not None and c2 is not None and c1 is not c2: + smaller, larger = (c1, c2) if len(c1) < len(c2) else (c2, c1) + larger.extend(smaller) + for x in smaller: + cluster_map[x] = larger + elif c1 is not None and c2 is None: + c1.append(l2) + cluster_map[l2] = c1 + elif c1 is None and c2 is not None: + c2.append(l1) + cluster_map[l1] = c2 + elif c1 is None and c2 is None: + c = [l1, l2] + cluster_map[l1] = c + cluster_map[l2] = c + + return set(map(tuple, cluster_map.values())) + + def _group_lines_by_page(lines: Iterator[_Line]) -> Dict[Page, List[_Line]]: + lines_grouped_by_page = defaultdict(list) + for line in lines: + lines_grouped_by_page[line.page].append(line) + + return lines_grouped_by_page + + def _get_last_header_line(lines: List[_Line]) -> Optional[int]: + if not lines: + return None + + return max(l.line_number for l in lines) + + def _get_first_footer_line(lines: List[_Line]) -> Optional[int]: + if not lines: + return None + + return min(l.line_number for l in lines) + + def _extract_line_numbers(line: _Line) -> Tuple[_Line, List[int]]: + numbers = [int(match.group(0)) for match in + re.finditer(r"(?:(?<=^)|(?<=\s))\d{1,4}(?=\s|$)", line.text, flags=re.UNICODE)] + + return line, numbers + + def 
_extract_potential_page_numbers(lines: List[_Line]) -> Tuple[_Line, List[int]]: + assert len(lines) > 0 + line, numbers = _extract_line_numbers(lines[-1]) + if not numbers and not str.strip(line.text) and len(lines) > 1: + line, numbers = _extract_line_numbers(lines[-2]) + + return line, numbers + + candidate_header_lines = [] + candidate_footer_lines = [] + + pages_lines = [_get_page_lines(p) for p in pages] + + for lines in pages_lines: + # ignore lines that are <4 characters long and/or have no alphabetic characters + candidate_header_lines.append([l for l in lines[:max_header_lines] if not len(l.cleaned_text) < 4]) + candidate_footer_lines.append([l for l in lines[-max_footer_lines:] if not len(l.cleaned_text) < 4]) + + headers_for_comparison = pairwise_combine_within_distance(candidate_header_lines, window_size) + footers_for_comparison = pairwise_combine_within_distance(candidate_footer_lines, window_size) + + header_line_similarities = [] + for (lines1, lines2) in headers_for_comparison: + header_line_similarities.extend( + (l1, l2) for l1 in lines1 for l2 in lines2 if l1.similarity_ratio(l2) >= min_similarity_ratio) + + footer_line_similarities = [] + for (lines1, lines2) in footers_for_comparison: + footer_line_similarities.extend( + (l1, l2) for l1 in lines1 for l2 in lines2 if l1.similarity_ratio(l2) >= min_similarity_ratio) + + header_clusters = [cluster for cluster in _cluster_lines(header_line_similarities) if + len(cluster) >= min_cluster_size] + footer_clusters = [cluster for cluster in _cluster_lines(footer_line_similarities) if + len(cluster) >= min_cluster_size] + + if not footer_clusters: + potential_page_numbers = [_extract_potential_page_numbers(lines) for lines in pages_lines if lines] + potential_page_numbers = [(line, numbers[0]) for line, numbers in potential_page_numbers if len(numbers) == 1] + potential_clusters = map(lambda group: tuple(map(lambda t: t[0], group)), + group_consecutive_when(potential_page_numbers, lambda x, y: y[1] - x[1] == 1)) + footer_clusters = [cluster for cluster in potential_clusters if len(cluster) >= min_cluster_size] + + header_lines_grouped_by_page = _group_lines_by_page(flatten(header_clusters)) + footer_lines_grouped_by_page = _group_lines_by_page(flatten(footer_clusters)) + + last_header_line_pages_map = {p: _get_last_header_line(lines) for p, lines in header_lines_grouped_by_page.items()} + first_footer_line_pages_map = {p: _get_first_footer_line(lines) for p, lines in + footer_lines_grouped_by_page.items()} + + for page in pages: + last_header_line = last_header_line_pages_map.get(page) + first_footer_line = first_footer_line_pages_map.get(page) + page.__class__ = type('StructuredPage', (page.__class__, PageStructure), {}) + page.num_header_lines = last_header_line + 1 if last_header_line is not None else 0 + page.num_footer_lines = len(page.text_lines) - first_footer_line if first_footer_line is not None else 0 + + return pages From c526c45de7bf389643488ed7587c058a4cacb40d Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 15:56:59 -0500 Subject: [PATCH 08/49] Delete na --- htrc/runningheaders/na | 1 - 1 file changed, 1 deletion(-) delete mode 100644 htrc/runningheaders/na diff --git a/htrc/runningheaders/na b/htrc/runningheaders/na deleted file mode 100644 index 8b13789..0000000 --- a/htrc/runningheaders/na +++ /dev/null @@ -1 +0,0 @@ - From 2ea0cdc4f49294319952ca18d7ffeb4273a98f20 Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 15:57:34 -0500 Subject: [PATCH 09/49] Add files via upload --- 
htrc/models/__init__.py | 68 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 htrc/models/__init__.py diff --git a/htrc/models/__init__.py b/htrc/models/__init__.py new file mode 100644 index 0000000..e86e115 --- /dev/null +++ b/htrc/models/__init__.py @@ -0,0 +1,68 @@ +import os +from abc import ABC, abstractmethod +from typing import List + + +class Page(ABC): + @property + @abstractmethod + def text_lines(self) -> List[str]: + """ + The lines of text on the page + """ + pass + + @property + def text(self) -> str: + return os.linesep.join(self.text_lines) + + +class PageStructure(Page, ABC): + def __init__(self) -> None: + self.num_header_lines = 0 + self.num_footer_lines = 0 + + @property + def has_header(self) -> bool: + return self.num_header_lines > 0 + + @property + def has_body(self) -> bool: + return len(self.text_lines) - self.num_header_lines - self.num_footer_lines > 0 + + @property + def has_footer(self) -> bool: + return self.num_footer_lines > 0 + + @property + def header_lines(self) -> List[str]: + return self.text_lines[:self.num_header_lines] + + @property + def body_lines(self) -> List[str]: + return self.text_lines[self.num_header_lines:len(self.text_lines) - self.num_footer_lines] + + @property + def footer_lines(self) -> List[str]: + return self.text_lines[-self.num_footer_lines:] if self.has_footer else [] + + @property + def header(self) -> str: + return os.linesep.join(self.header_lines) + + @property + def body(self) -> str: + return os.linesep.join(self.body_lines) + + @property + def footer(self) -> str: + return os.linesep.join(self.footer_lines) + + +class HtrcPage(Page): + def __init__(self, lines: List[str]) -> None: + self._lines = lines + + @property + def text_lines(self) -> List[str]: + return self._lines From ee06a59fe5a2bfdac00a6513d953c4e571943002 Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 15:57:59 -0500 Subject: [PATCH 10/49] Delete na --- htrc/models/na | 1 - 1 file changed, 1 deletion(-) delete mode 100644 htrc/models/na diff --git a/htrc/models/na b/htrc/models/na deleted file mode 100644 index 8b13789..0000000 --- a/htrc/models/na +++ /dev/null @@ -1 +0,0 @@ - From a2479c57a7ee3e74889300e905babf55ae26e5fa Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 15:58:51 -0500 Subject: [PATCH 11/49] Add files via upload --- htrc/hf_vol_load/__init__.py | 117 +++++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 htrc/hf_vol_load/__init__.py diff --git a/htrc/hf_vol_load/__init__.py b/htrc/hf_vol_load/__init__.py new file mode 100644 index 0000000..72f08d5 --- /dev/null +++ b/htrc/hf_vol_load/__init__.py @@ -0,0 +1,117 @@ +import unittest +from typing import List + +from htrc.models import HtrcPage +from htrc.runningheaders import parse_page_structure, clean_text, levenshtein + + +class TestRunningHeaders(unittest.TestCase): + def test_finding_running_headers(self): + pages = load_vol("data/vol1", num_pages=10) + structured_pages = parse_page_structure(pages) + headers = ["|".join(page.header_lines) for page in structured_pages] + expected = [ + "", + "", + "CHAPTER 1|INTRODUCTION TO RUNNING HEADERS|Lorem Ipsum style", + "1 INTRODUCTION TO RUNNING HEADERS|Lorem Ipsum style", + "INTRODUCTION TO RUNNING HEADERS 1|Lorem Ipsum style", + "1 INTRODUCTION TO RUNNING HEADERS|Lorem Ipsum style", + "CHAPTER 2|EVERYTHING IS RELATIVE", + "2 EVERYTHING IS RELATIVE", + "EVERYTHING IS RELATIVE 2", + "2 EVERYTHING IS RELATIVE" + ] + self.assertListEqual(expected, 
headers) + + def test_finding_running_footers(self): + pages = load_vol("data/vol1", num_pages=10) + structured_pages = parse_page_structure(pages) + footers = ["|".join(page.footer_lines) for page in structured_pages] + expected = [ + "", + "", + "Page 2", + "Page 3", + "Page 4", + "Page 5", + "Page 6", + "Page 7", + "Page 8", + "Page 9" + ] + self.assertListEqual(expected, footers) + + def test_identify_correct_page_body(self): + pages = load_vol("data/vol1", num_pages=10) + structured_pages = parse_page_structure(pages) + len_body_per_page = [len(page.body_lines) for page in structured_pages] + expected = [0, 7, 43, 28, 26, 30, 31, 27, 28, 15] + self.assertListEqual(expected, len_body_per_page) + + def test_find_footer_with_page_numbers(self): + pages = load_vol("data/vol2", num_pages=10) + structured_pages = parse_page_structure(pages) + footers = ["|".join(page.footer_lines) for page in structured_pages] + expected = [ + "", + "", + "2", + " 3", + "4", + " 5", + "6", + " 7", + "8", + " 9" + ] + self.assertListEqual(expected, footers) + + +class TestUtils(unittest.TestCase): + def test_clean_text(self): + s1 = u"\t На берегу \tпустынных волн \t\n" + s1_expected = u"на берегу пустынных волн" + s2 = u" Pot să mănânc sticlă și ea nu mă rănește. " + s2_expected = u"pot să mănânc sticlă și ea nu mă rănește" + s1_clean = clean_text(s1) + s2_clean = clean_text(s2) + + self.assertEqual(s1_expected, s1_clean) + self.assertEqual(s2_expected, s2_clean) + + def test_levenshtein(self): + s1 = "rosettacode" + s2 = "raisethysword" + lev = levenshtein(s1, s2) + self.assertEqual(8, lev) + + s1 = "kitten" + s2 = "sitting" + lev = levenshtein(s1, s2, replace_cost=2) + self.assertEqual(5, lev) + + s1 = "abracadabra" + s2 = "abracadabra" + lev = levenshtein(s1, s2) + self.assertEqual(0, lev) + + s1 = "" + s2 = "abc" + lev = levenshtein(s1, s2) + self.assertEqual(3, lev) + + +def load_vol(path: str, num_pages: int) -> List[HtrcPage]: + pages = [] + for n in range(num_pages): + page_num = str(n+1).zfill(8) + with open('{}/{}.txt'.format(path, page_num), encoding='utf-8') as f: + lines = [line.rstrip() for line in f.readlines()] + pages.append(HtrcPage(lines)) + + return pages + + +if __name__ == '__main__': + unittest.main() From 6ff15e617ebb812c6c29bfc32892efb94375129b Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 15:59:33 -0500 Subject: [PATCH 12/49] Add files via upload --- htrc/hf_utils/__init__.py | 110 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 htrc/hf_utils/__init__.py diff --git a/htrc/hf_utils/__init__.py b/htrc/hf_utils/__init__.py new file mode 100644 index 0000000..81553de --- /dev/null +++ b/htrc/hf_utils/__init__.py @@ -0,0 +1,110 @@ +import re +from typing import TypeVar, List, Iterator, Tuple, Callable + +T = TypeVar('T') + + +def clean_text(s: str) -> str: + # replace all characters which aren't letters with whitespaces ([\W\d_] is equivalent of \P{L} which is unsupported) + s = re.sub(r'[\W\d_]+', " ", s, flags=re.UNICODE) + # replace multiple sequential whitespaces with single whitespace + s = re.sub(r'\s{2,}', " ", s, flags=re.UNICODE) + # trim whitespaces at the beginning and end + s = s.strip() + # lowercase + s = s.lower() + + return s + + +def levenshtein(s: str, t: str, insert_cost: int = 1, delete_cost: int = 1, replace_cost: int = 1) -> int: + """ From Wikipedia article; Iterative with two matrix rows. 
""" + # degenerate cases + if s == t: + return 0 + + len0 = len(s) + len1 = len(t) + + if not len0: + return len1 + + if not len1: + return len0 + + # the array of distances + v0 = [0] * (len0 + 1) + v1 = [0] * (len0 + 1) + + # initial cost of skipping prefix in s + for i in range(len(v0)): + v0[i] = i + + # dynamically compute the array of distances + + # transformation cost for each letter in t + for j in range(len1): + # initial cost of skipping prefix in t + v1[0] = j + 1 + + # transformation cost for each letter in s + for i in range(len0): + # matching current letters in both strings + match = 0 if s[i] == t[j] else 1 + + # computing cost for each transformation + cost_insert = v0[i + 1] + insert_cost + cost_delete = v1[i] + delete_cost + cost_replace = v0[i] + match * replace_cost + + # keep minimum cost + v1[i + 1] = min(cost_insert, cost_delete, cost_replace) + + # swap cost arrays + v0, v1 = v1, v0 + + # the distance is the cost for transforming all letters in both strings + return v0[len0] + + +def pairwise_combine_within_distance(xs: List[T], n: int) -> List[Tuple[T, T]]: + if not xs: + return [] + + result = [] + x, xs = xs[0], xs[1:] + + while xs: + result = result + [(x, v) for v in xs[:n - 1]] + x, xs = xs[0], xs[1:] + + return result + + +def group_consecutive_when(xs: List[T], pred: Callable[[T, T], bool]) -> Iterator[List[T]]: + result = [] + _prev, _next = None, None + + while len(xs) > 1: + _prev, _next = xs[0], xs[1] + result.append(_prev) + if not pred(_prev, _next): + yield result + result = [] + xs = xs[1:] + + if len(xs) == 1: + _prev, _next = _next, xs[0] + + if _prev is not None and _next is not None and pred(_prev, _next): + result.extend([_prev, _next]) + elif _next is not None: + result.append(_next) + + yield result + + +def flatten(xss: List[tuple]) -> Iterator[T]: + for xs in xss: + for x in xs: + yield x From f0d26f0c57c2146ffa9db5926ce0d3fa46406824 Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 15:59:51 -0500 Subject: [PATCH 13/49] Delete na --- htrc/hf_vol_load/na | 1 - 1 file changed, 1 deletion(-) delete mode 100644 htrc/hf_vol_load/na diff --git a/htrc/hf_vol_load/na b/htrc/hf_vol_load/na deleted file mode 100644 index 8b13789..0000000 --- a/htrc/hf_vol_load/na +++ /dev/null @@ -1 +0,0 @@ - From 13a099de206e5e355ee57b3d6aec9eba2b6adb6f Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 16:00:08 -0500 Subject: [PATCH 14/49] Delete na --- htrc/hf_utils/na | 1 - 1 file changed, 1 deletion(-) delete mode 100644 htrc/hf_utils/na diff --git a/htrc/hf_utils/na b/htrc/hf_utils/na deleted file mode 100644 index 8b13789..0000000 --- a/htrc/hf_utils/na +++ /dev/null @@ -1 +0,0 @@ - From e7dca21c2698222aa050cbdc04fb4a74b7241a33 Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 16:01:25 -0500 Subject: [PATCH 15/49] Add files via upload --- htrc/__main__.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/htrc/__main__.py b/htrc/__main__.py index bacc652..3f35557 100644 --- a/htrc/__main__.py +++ b/htrc/__main__.py @@ -132,18 +132,26 @@ def main(): else: print("Please choose another output folder and try again.") sys.exit(1) - d = os.listdir(args.output) - if args.headfoot is True: - if len(d) == 0: - print("This director is empty") - else: - htrc.volumes.remove_hf(args.output) - if args.headfootcon is True: - htrc.volumes.remove_hf_concat(args.output) + + if args.concat and args.headfoot: + print("Cannot set both concat and headfoot") + sys.exit(1) + if args.concat and 
args.headfootcon: + print("Cannot set both concat and headfootcon") + sys.exit(1) + if args.headfoot and args.headfootcon: + print("Cannot set both headfoot and headfootcon") + sys.exit(1) + if args.mets and args.headfootcon: + print("Cannot set both mets and headfootcon") + sys.exit(1) if args.pages: if args.mets and args.concat: print ("Cannot set both concat and mets with pages") sys.exit(1) + if args.mets and args.headfootcon: + print("Cannot set both mets and headfootcon with pages") + sys.exit(1) try: resolve_and_download(args) From 87ff2b91ecd24e55bb7944c5f04d898481e32ccb Mon Sep 17 00:00:00 2001 From: David K Date: Fri, 20 Nov 2020 16:02:05 -0500 Subject: [PATCH 16/49] Add files via upload --- htrc/volumes/__init__.py | 169 ++++++++++++++++++++------------------- 1 file changed, 88 insertions(+), 81 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index ed22628..a50ced6 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -268,106 +268,104 @@ def check_error_file(output_dir): grep(file_path, output_dir, "KeyNotFoundException") def remove_hf(output_dir): - if __name__ == '__main__': - os.makedirs(os.path.join(output_dir, "removed_hf_files"), exist_ok = True) - removed_hf = os.path.join(output_dir, "removed_hf_files") - vol_paths = glob.glob(os.path.join(output_dir,'**')) - df = pd.DataFrame() + os.makedirs(os.path.join(output_dir, "removed_hf_files"), exist_ok = True) + removed_hf = os.path.join(output_dir, "removed_hf_files") + vol_paths = glob.glob(os.path.join(output_dir,'**')) + df = pd.DataFrame() - for path in tqdm(vol_paths): - if os.path.isdir(path): - page_paths = sorted(glob.glob(os.path.join(path, '**', '*.txt'), recursive=True)) - n = len(page_paths) - num = 1 + for path in tqdm(vol_paths): + if os.path.isdir(path): + page_paths = sorted(glob.glob(os.path.join(path, '**', '*.txt'), recursive=True)) + n = len(page_paths) + num = 1 - while num <= n: - for pg in page_paths: - parsed_path = str(path).split('/') - clean_path_root = '/'.join(parsed_path) - page_num = str(num).zfill(8) - new_filename = page_num+'.txt' - os.rename(pg, clean_path_root+'/'+new_filename) - num += 1 + while num <= n: + for pg in page_paths: + parsed_path = str(path).split('/') + clean_path_root = '/'.join(parsed_path) + page_num = str(num).zfill(8) + new_filename = page_num+'.txt' + os.rename(pg, clean_path_root+'/'+new_filename) + num += 1 - folder = os.path.basename(path) - n_pgs = len(fnmatch.filter(os.listdir(path), "*.txt")) - pages = parse_page_structure(load_vol(path, num_pages=n_pgs)) + folder = os.path.basename(path) + n_pgs = len(fnmatch.filter(os.listdir(path), "*.txt")) + pages = parse_page_structure(load_vol(path, num_pages=n_pgs)) - body = [] - for n, page in enumerate(pages): - s = "\nPage {} (has_header: {}, has_body: {}, has_footer: {})".format(n+1, page.has_header, page.has_body, page.has_footer) + body = [] + for n, page in enumerate(pages): + s = "\nPage {} (has_header: {}, has_body: {}, has_footer: {})".format(n+1, page.has_header, page.has_body, page.has_footer) - pg_boolean = s + "\n" + "-"*len(s) - pg_header = "Header:\n{}".format(page.header if page.has_header else "N/A") - #pg_body = page.body if page.has_body else "" - pg_footer = "Footer:\n{}".format(page.footer if page.has_footer else "N/A") + pg_boolean = s + "\n" + "-"*len(s) + pg_header = "Header:\n{}".format(page.header if page.has_header else "N/A") + #pg_body = page.body if page.has_body else "" + pg_footer = "Footer:\n{}".format(page.footer if page.has_footer else 
"N/A") - body.append(page.body) + body.append(page.body) - df = df.append({"Volume":folder, "Page Info":pg_boolean, "Header":pg_header, "Footer":pg_footer}, ignore_index = True) - df.sort_values("Volume") - for i, g in df.groupby("Volume"): - g.to_csv(os.path.join(removed_hf, "removed_hf_data_{}.csv".format(i))) + df = df.append({"Volume":folder, "Page Info":pg_boolean, "Header":pg_header, "Footer":pg_footer}, ignore_index = True) + df.sort_values("Volume") + for i, g in df.groupby("Volume"): + g.to_csv(os.path.join(removed_hf, "removed_hf_data_{}.csv".format(i))) - count = 1 - for item in body: - pg_n = str(count).zfill(8) - filename = '{}.txt'.format(pg_n) - count += 1 - with open(os.path.join(clean_path_root, filename), "w") as f_out: - f_out.write('{}\n'.format(item)) + count = 1 + for item in body: + pg_n = str(count).zfill(8) + filename = '{}.txt'.format(pg_n) + count += 1 + with open(os.path.join(clean_path_root, filename), "w") as f_out: + f_out.write('{}\n'.format(item)) def remove_hf_concat(output_dir): - if __name__ == '__main__': - os.makedirs(os.path.join(output_dir, "removed_hf_files"), exist_ok = True) - removed_hf = os.path.join(output_dir, "removed_hf_files") - vol_paths = glob.glob(os.path.join(output_dir,'**')) - df = pd.DataFrame() - retain = ["removed_hf_files"] + os.makedirs(os.path.join(output_dir, "removed_hf_files"), exist_ok = True) + removed_hf = os.path.join(output_dir, "removed_hf_files") + vol_paths = glob.glob(os.path.join(output_dir,'**')) + df = pd.DataFrame() + retain = ["removed_hf_files"] - for path in tqdm(vol_paths): - if os.path.isdir(path): - page_paths = sorted(glob.glob(os.path.join(path, '**', '*.txt'), recursive=True)) - n = len(page_paths) - num = 1 + for path in tqdm(vol_paths): + if os.path.isdir(path): + page_paths = sorted(glob.glob(os.path.join(path, '**', '*.txt'), recursive=True)) + n = len(page_paths) + num = 1 - while num <= n: - for pg in page_paths: - parsed_path = str(path).split('/') - clean_path_root = '/'.join(parsed_path) - page_num = str(num).zfill(8) - new_filename = page_num+'.txt' - os.rename(pg, clean_path_root+'/'+new_filename) - num += 1 + while num <= n: + for pg in page_paths: + parsed_path = str(path).split('/') + clean_path_root = '/'.join(parsed_path) + page_num = str(num).zfill(8) + new_filename = page_num+'.txt' + os.rename(pg, clean_path_root+'/'+new_filename) + num += 1 - folder = os.path.basename(path) - n_pgs = len(fnmatch.filter(os.listdir(path), "*.txt")) - pages = parse_page_structure(load_vol(path, num_pages=n_pgs)) + folder = os.path.basename(path) + n_pgs = len(fnmatch.filter(os.listdir(path), "*.txt")) + pages = parse_page_structure(load_vol(path, num_pages=n_pgs)) - filename = '{}.txt'.format(folder) - body = [] - for n, page in enumerate(pages): - s = "\nPage {} (has_header: {}, has_body: {}, has_footer: {})".format(n+1, page.has_header, page.has_body, page.has_footer) + filename = '{}.txt'.format(folder) + body = [] + for n, page in enumerate(pages): + s = "\nPage {} (has_header: {}, has_body: {}, has_footer: {})".format(n+1, page.has_header, page.has_body, page.has_footer) - pg_boolean = s + "\n" + "-"*len(s) - pg_header = "Header:\n{}".format(page.header if page.has_header else "N/A") - #pg_body = page.body if page.has_body else "" - pg_footer = "Footer:\n{}".format(page.footer if page.has_footer else "N/A") + pg_boolean = s + "\n" + "-"*len(s) + pg_header = "Header:\n{}".format(page.header if page.has_header else "N/A") + #pg_body = page.body if page.has_body else "" + pg_footer = 
"Footer:\n{}".format(page.footer if page.has_footer else "N/A") - body.append(page.body) + body.append(page.body) - df = df.append({"Volume":folder, "Page Info":pg_boolean, "Header":pg_header, "Footer":pg_footer}, ignore_index = True) - df.sort_values("Volume") - for i, g in df.groupby("Volume"): - g.to_csv(os.path.join(removed_hf, "removed_hf_data_{}.csv".format(i))) + df = df.append({"Volume":folder, "Page Info":pg_boolean, "Header":pg_header, "Footer":pg_footer}, ignore_index = True) + df.sort_values("Volume") + for i, g in df.groupby("Volume"): + g.to_csv(os.path.join(removed_hf, "removed_hf_data_{}.csv".format(i))) - with open(os.path.join(output_dir, filename), "w") as f_out: - f_out.write('\n'.join([str(item) + '\n' for item in body]) + '\n') - if folder not in retain: - shutil.rmtree(os.path.join(output_dir, folder)) + with open(os.path.join(output_dir, filename), "w") as f_out: + f_out.write('\n'.join([str(item) + '\n' for item in body]) + '\n') + if folder not in retain: + shutil.rmtree(os.path.join(output_dir, folder)) def download_volumes(volume_ids, output_dir, username=None, password=None, config_path=None, token=None, headfootcon=False, headfoot=False, concat=False, mets=False, pages=False, host=None, port=None, cert=None, key=None, epr=None): @@ -413,10 +411,19 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, myzip.close() check_error_file(output_dir) + d = os.listdir(output_dir) if headfoot: - remove_hf(output_dir) + if len(d) == 0: + print("This directory is empty") + sys.exit(1) + else: + remove_hf(output_dir) if headfootcon: - remove_hf_concat(output_dir) + if len(d) == 0: + print("This directory is empty") + sys.exit(1) + else: + remove_hf_concat(output_dir) except socket.error: raise RuntimeError("Data API request timeout. 
Is your Data Capsule in Secure Mode?") From 29f2e249fc80135046c30315f92c684226f0d845 Mon Sep 17 00:00:00 2001 From: David K Date: Mon, 30 Nov 2020 13:14:28 -0500 Subject: [PATCH 17/49] Pinned tqdm 4.46.0 package to setup.py file --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 300957c..70287f0 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ __version__ = '0.1.54' install_requires = ['PyLD', 'future', 'prov', 'unicodecsv', 'progressbar2', - 'requests', 'argparse==1.1', 'topicexplorer==1.0b226', 'numpy==1.16.2'] + 'requests', 'argparse==1.1', 'topicexplorer==1.0b226', 'numpy==1.16.2', 'tqdm==4.46.0'] # TODO: migrate to docs confix:, 'sphinx-argparse', 'sphinxcontrib-fulltoc'] if sys.version_info.major == 2: install_requires.append('configparser') From b7fa1a2f15aa094af7da3ffa6b6f89369650e8f8 Mon Sep 17 00:00:00 2001 From: David K Date: Mon, 30 Nov 2020 18:06:36 -0500 Subject: [PATCH 18/49] Made changes to hf_remove_concat function --- htrc/volumes/__init__.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index a50ced6..d85ec1f 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -268,8 +268,8 @@ def check_error_file(output_dir): grep(file_path, output_dir, "KeyNotFoundException") def remove_hf(output_dir): - os.makedirs(os.path.join(output_dir, "removed_hf_files"), exist_ok = True) - removed_hf = os.path.join(output_dir, "removed_hf_files") + os.makedirs(os.path.join(output_dir, "removed_hf_data"), exist_ok = True) + removed_hf = os.path.join(output_dir, "removed_hf_data") vol_paths = glob.glob(os.path.join(output_dir,'**')) df = pd.DataFrame() @@ -318,11 +318,12 @@ def remove_hf(output_dir): f_out.write('{}\n'.format(item)) def remove_hf_concat(output_dir): - os.makedirs(os.path.join(output_dir, "removed_hf_files"), exist_ok = True) - removed_hf = os.path.join(output_dir, "removed_hf_files") + os.makedirs(os.path.join(output_dir, "removed_hf_data"), exist_ok = True) + removed_hf = os.path.join(output_dir, "removed_hf_data") vol_paths = glob.glob(os.path.join(output_dir,'**')) df = pd.DataFrame() - retain = ["removed_hf_files"] + retain = ["removed_hf_data"] + rm_txt = "removed_hf_data.txt" for path in tqdm(vol_paths): @@ -366,7 +367,11 @@ def remove_hf_concat(output_dir): f_out.write('\n'.join([str(item) + '\n' for item in body]) + '\n') if folder not in retain: shutil.rmtree(os.path.join(output_dir, folder)) - + if os.path.exists(os.path.join(output_dir, rm_txt)): + os.remove(os.path.join(output_dir, rm_txt)) + + + def download_volumes(volume_ids, output_dir, username=None, password=None, config_path=None, token=None, headfootcon=False, headfoot=False, concat=False, mets=False, pages=False, host=None, port=None, cert=None, key=None, epr=None): # create output_dir folder, if nonexistant From 8b74d1f02ed01066319395e21f394f5d6934dcaa Mon Sep 17 00:00:00 2001 From: David K Date: Wed, 2 Dec 2020 14:02:48 -0500 Subject: [PATCH 19/49] Added documentation for header/footer extractor --- docs/source/cli.rst | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/docs/source/cli.rst b/docs/source/cli.rst index 0d19316..8101173 100644 --- a/docs/source/cli.rst +++ b/docs/source/cli.rst @@ -1,6 +1,6 @@ HTRC Workset Toolkit ====================== -The HTRC Workset Toolkit povides a command line interface for interacting with +The HTRC Workset Toolkit povides a command line interface for interacting 
with and analyzing volumes in the HathiTrust Digital Library: - Volume Download (``htrc download``) @@ -11,7 +11,7 @@ and analyzing volumes in the HathiTrust Digital Library: Workset Path -------------- -Each of these commands takes a *workset path*. Valid types of workset paths +Each of these commands takes a *workset path*. Valid types of workset paths and examples of each are: ================================== ============================================================================== @@ -71,7 +71,7 @@ download`_, the Topic Modeling '''''''''''''''' -There are two implementations of LDA topic modeling supported by the +There are two implementations of LDA topic modeling supported by the Arguments @@ -114,6 +114,18 @@ Following are the use cases and examples of ``htrc`` commands inside the HTRC Da ``htrc download /home/dcuser/HTRC/htrc-id -o /media/secure_volume/my-workset -c`` +* Download specific pages from a single volume : + + ``htrc download -pg coo.31924089593846[5,10,15,20,25,30]`` + +* Download volumes and then extract headers/footers from the volumes : + + ``htrc download -hf /home/dcuser/HTRC/htrc-id`` + +* Download volumes, extract headers/footers from the volume pages then concatenate the pages - (This will concatenate all the pages of the volume into one txt file.) : + + ``htrc download -hfc /home/dcuser/HTRC/htrc-id`` + | +---------------------------------+-----------------------------------------------+ | command: ``htrc metadata`` | capsule mode: **secure** and **maintenance** | @@ -246,7 +258,3 @@ Following are the use cases and examples of ``htrc`` commands inside the HTRC Da * Run topicexplorer on already downloaded volume - (Sample volumes are available in capsules created with ubuntu-16-04-with-sample-volumes image. Those sample volumes are available as zip files. Please unzip before use them because the metadata function gets volume ids from volume directory names). 
``htrc topicexplorer /home/dcuser/unzipped_volumes -k 20`` - - - - From 528e127897495d05c940fd767fa6ac444d072049 Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Thu, 18 Feb 2021 10:39:23 -0500 Subject: [PATCH 20/49] Added volume not found error for pd-only access --- htrc/.htrc.default | 1 + htrc/config.py | 3 +++ htrc/volumes/__init__.py | 10 ++++++---- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/htrc/.htrc.default b/htrc/.htrc.default index bfeadee..3ec2327 100644 --- a/htrc/.htrc.default +++ b/htrc/.htrc.default @@ -8,6 +8,7 @@ port = 443 url = / cert = key = +pd_only = [oauth] host = silvermaple.pti.indiana.edu diff --git a/htrc/config.py b/htrc/config.py index ccd7d54..13ecf80 100644 --- a/htrc/config.py +++ b/htrc/config.py @@ -56,6 +56,9 @@ def get_dataapi_cert(path=None): def get_dataapi_key(path=None): return _get_value('data', 'key', path) +def get_dataapi_access(path=None): + return _get_value('data', 'pd_only', path) + def get_idp_host_port(path=None): host = _get_value('idp', 'host', path) port = _get_value('idp', 'port', path) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index 6ddb9a7..1282159 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -249,8 +249,7 @@ def grep(file_name, output_dir, pattern): if len(na_volume) == 100: print("\nThere are 100 or more unavailable volumes.\nTo check the validity of volumes in your workset or volume id file go to:\n https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org for assistance.") -def check_error_file(output_dir): - file_name = "ERROR.err" +def check_error_file(output_dir,file_name,grep_text): if output_dir.endswith("/"): file_path = output_dir+ file_name @@ -258,7 +257,7 @@ def check_error_file(output_dir): file_path = output_dir+"/"+file_name if os.path.isfile(file_path): - grep(file_path, output_dir, "KeyNotFoundException") + grep(file_path, output_dir, grep_text) def download_volumes(volume_ids, output_dir, username=None, password=None, @@ -304,7 +303,10 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, myzip.extractall(output_dir) myzip.close() - check_error_file(output_dir) + if(htrc.config.get_dataapi_access()): + check_error_file(output_dir,"volume-rights.txt", " 3") + + check_error_file(output_dir,"ERROR.err","KeyNotFoundException") except socket.error: raise RuntimeError("Data API request timeout. 
Is your Data Capsule in Secure Mode?") From 10dd91b8e2879aec9540b9eb05934f01864e0b61 Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Thu, 18 Feb 2021 10:58:22 -0500 Subject: [PATCH 21/49] Added volume not found error for pd-only access --- htrc/volumes/__init__.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index 1282159..b4db7c9 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -235,11 +235,11 @@ def get_oauth2_token(username, password): return token -def grep(file_name, output_dir, pattern): +def grep(file_name, output_dir, pattern, txt_index): na_volume = [] for line in open(file_name): if pattern in line: - na_volume.append(line.split()[-1]) + na_volume.append(line.split()[txt_index]) if len(na_volume) < 100: print("\nFollowing volume ids are not available.") print("\n".join(str(item) for item in na_volume)) @@ -249,7 +249,7 @@ def grep(file_name, output_dir, pattern): if len(na_volume) == 100: print("\nThere are 100 or more unavailable volumes.\nTo check the validity of volumes in your workset or volume id file go to:\n https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org for assistance.") -def check_error_file(output_dir,file_name,grep_text): +def check_error_file(output_dir,file_name,grep_text,txt_index): if output_dir.endswith("/"): file_path = output_dir+ file_name @@ -257,7 +257,7 @@ def check_error_file(output_dir,file_name,grep_text): file_path = output_dir+"/"+file_name if os.path.isfile(file_path): - grep(file_path, output_dir, grep_text) + grep(file_path, output_dir, grep_text,txt_index) def download_volumes(volume_ids, output_dir, username=None, password=None, @@ -304,9 +304,9 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, myzip.close() if(htrc.config.get_dataapi_access()): - check_error_file(output_dir,"volume-rights.txt", " 3") + check_error_file(output_dir,"volume-rights.txt", " 3", 0) - check_error_file(output_dir,"ERROR.err","KeyNotFoundException") + check_error_file(output_dir,"ERROR.err","KeyNotFoundException", -1) except socket.error: raise RuntimeError("Data API request timeout. 
Is your Data Capsule in Secure Mode?") From cfbbf205dbcc47dabd08e7e4ec69307d24efb8f5 Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Thu, 18 Feb 2021 13:12:32 -0500 Subject: [PATCH 22/49] Added volume not found error for pd-only access --- htrc/volumes/__init__.py | 60 ++++++++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index b4db7c9..5796dfe 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -235,29 +235,39 @@ def get_oauth2_token(username, password): return token -def grep(file_name, output_dir, pattern, txt_index): - na_volume = [] - for line in open(file_name): - if pattern in line: - na_volume.append(line.split()[txt_index]) - if len(na_volume) < 100: - print("\nFollowing volume ids are not available.") - print("\n".join(str(item) for item in na_volume)) - with open(os.path.join(output_dir, "volume_not_available.txt"), "w") as volume_na: - volume_na.write("\n".join(str(item) for item in na_volume)) - else: - if len(na_volume) == 100: - print("\nThere are 100 or more unavailable volumes.\nTo check the validity of volumes in your workset or volume id file go to:\n https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org for assistance.") - -def check_error_file(output_dir,file_name,grep_text,txt_index): +def grep_error(file_name, output_dir, pattern, txt_index): if output_dir.endswith("/"): file_path = output_dir+ file_name else: file_path = output_dir+"/"+file_name + na_volume = [] if os.path.isfile(file_path): - grep(file_path, output_dir, grep_text,txt_index) + for line in open(file_name): + if pattern in line: + volume_id = line.split()[txt_index] + na_volume.append(volume_id) + return na_volume + +# def check_error_file(output_dir,file_name,grep_text,txt_index): +# +# if output_dir.endswith("/"): +# file_path = output_dir+ file_name +# else: +# file_path = output_dir+"/"+file_name +# +# if os.path.isfile(file_path): +# grep(file_path, output_dir, grep_text,txt_index) +# +# if len(na_volume) < 100: +# print("\nFollowing volume ids are not available.") +# print("\n".join(str(item) for item in na_volume)) +# with open(os.path.join(output_dir, "volume_not_available.txt"), "w") as volume_na: +# volume_na.write("\n".join(str(item) for item in na_volume)) +# else: +# if len(na_volume) >= 100: +# print("\nThere are 100 or more unavailable volumes.\nTo check the validity of volumes in your workset or volume id file go to:\n https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org for assistance.") def download_volumes(volume_ids, output_dir, username=None, password=None, @@ -303,10 +313,24 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, myzip.extractall(output_dir) myzip.close() + + na_volume = [] if(htrc.config.get_dataapi_access()): - check_error_file(output_dir,"volume-rights.txt", " 3", 0) + na_volume = grep_error("volume-rights.txt",output_dir," 3",0) + + na_volume = na_volume + grep_error("ERROR.err",output_dir,"KeyNotFoundException", -1) + + if len(na_volume) > 0: + with open(os.path.join(output_dir, "volume_not_available.txt"), "w") as volume_na:volume_na.write("\n".join(str(item) for item in na_volume)) + + if len(na_volume) > 0 and len(na_volume) < 100: + print("\nFollowing volume ids are not available.\n Please check volume_not_available.txt for the complete list. 
\nTo check the validity of volumes in your workset or volume id file go to:\n https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org for assistance.") + print("\n".join(str(item) for item in na_volume)) + + else: + if len(na_volume) >= 100: + print("\nThere are 100 or more unavailable volumes. \n Please check volume_not_available.txt for the complete list. \nTo check the validity of volumes in your workset or volume id file go to:\n https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org for assistance.") - check_error_file(output_dir,"ERROR.err","KeyNotFoundException", -1) except socket.error: raise RuntimeError("Data API request timeout. Is your Data Capsule in Secure Mode?") From 560ef6b45f96c2820179b5d339b73d7cce3d36d5 Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Thu, 18 Feb 2021 13:38:37 -0500 Subject: [PATCH 23/49] Added volume not found error for pd-only access --- htrc/volumes/__init__.py | 73 +++++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 31 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index 5796dfe..5453368 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -10,6 +10,7 @@ """ from __future__ import print_function from future import standard_library + standard_library.install_aliases() from builtins import input @@ -37,8 +38,10 @@ import logging from logging import NullHandler + logging.getLogger(__name__).addHandler(NullHandler()) + def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, mets=False): """ Returns volumes from the Data API as a raw zip stream. @@ -58,7 +61,7 @@ def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, met for id in volume_ids: if ("." not in id - or " " in id): + or " " in id): print("Invalid volume id " + id + ". Please correct this volume id and try again.") data = {'volumeIDs': '|'.join( @@ -82,7 +85,6 @@ def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, met # Retrieve the volumes httpsConnection = http.client.HTTPSConnection(host, port, context=ctx, key_file=key, cert_file=cert) - httpsConnection.request("POST", url, urlencode(data), headers) response = httpsConnection.getresponse() @@ -92,9 +94,9 @@ def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, met data = BytesIO() bytes_downloaded = 0 bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength, - widgets=[progressbar.AnimatedMarker(), ' ', - progressbar.DataSize(), - ' (', progressbar.FileTransferSpeed(), ')']) + widgets=[progressbar.AnimatedMarker(), ' ', + progressbar.DataSize(), + ' (', progressbar.FileTransferSpeed(), ')']) while body: body = response.read(128) @@ -132,7 +134,7 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa for id in page_ids: if ("." not in id - or " " in id): + or " " in id): print("Invalid volume id " + id + ". 
Please correct this volume id and try again.") data = {'pageIDs': '|'.join( @@ -149,7 +151,6 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa headers = {"Authorization": "Bearer " + token, "Content-type": "application/x-www-form-urlencoded"} - # Create SSL lookup # TODO: Fix SSL cert verification ctx = ssl.create_default_context() @@ -159,7 +160,6 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa # Retrieve the volumes httpsConnection = http.client.HTTPSConnection(host, port, context=ctx, key_file=key, cert_file=cert) - httpsConnection.request("POST", url, urlencode(data), headers) response = httpsConnection.getresponse() @@ -169,7 +169,7 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa data = BytesIO() bytes_downloaded = 0 bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength, - widgets=[progressbar.AnimatedMarker(), ' ', + widgets=[progressbar.AnimatedMarker(), ' ', progressbar.DataSize(), ' (', progressbar.FileTransferSpeed(), ')']) @@ -191,12 +191,13 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa return data + def get_oauth2_token(username, password): # make sure to set the request content-type as application/x-www-form-urlencoded headers = {"Content-type": "application/x-www-form-urlencoded"} - data = { "grant_type": "client_credentials", - "client_secret": password, - "client_id": username } + data = {"grant_type": "client_credentials", + "client_secret": password, + "client_id": username} data = urlencode(data) # create an SSL context @@ -235,12 +236,12 @@ def get_oauth2_token(username, password): return token -def grep_error(file_name, output_dir, pattern, txt_index): +def grep_error(file_name, output_dir, pattern, txt_index): if output_dir.endswith("/"): - file_path = output_dir+ file_name + file_path = output_dir + file_name else: - file_path = output_dir+"/"+file_name + file_path = output_dir + "/" + file_name na_volume = [] if os.path.isfile(file_path): @@ -250,6 +251,7 @@ def grep_error(file_name, output_dir, pattern, txt_index): na_volume.append(volume_id) return na_volume + # def check_error_file(output_dir,file_name,grep_text,txt_index): # # if output_dir.endswith("/"): @@ -271,7 +273,8 @@ def grep_error(file_name, output_dir, pattern, txt_index): def download_volumes(volume_ids, output_dir, username=None, password=None, - config_path=None, token=None, concat=False, mets=False, pages=False, host=None, port=None, cert=None, key=None, epr=None): + config_path=None, token=None, concat=False, mets=False, pages=False, host=None, port=None, + cert=None, key=None, epr=None): # create output_dir folder, if nonexistant if not os.path.isdir(output_dir): os.makedirs(output_dir) @@ -282,7 +285,7 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, htrc.config.remove_jwt_token() if not host: - host= htrc.config.get_dataapi_host() + host = htrc.config.get_dataapi_host() if not port: port = htrc.config.get_dataapi_port() @@ -313,23 +316,32 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, myzip.extractall(output_dir) myzip.close() - na_volume = [] - if(htrc.config.get_dataapi_access()): - na_volume = grep_error("volume-rights.txt",output_dir," 3",0) + if htrc.config.get_dataapi_access(): + print("PD Access Only") + na_volume = grep_error("volume-rights.txt", output_dir, " 3", 0) - na_volume = na_volume + grep_error("ERROR.err",output_dir,"KeyNotFoundException", -1) + na_volume = 
na_volume + grep_error("ERROR.err", output_dir, "KeyNotFoundException", -1) if len(na_volume) > 0: - with open(os.path.join(output_dir, "volume_not_available.txt"), "w") as volume_na:volume_na.write("\n".join(str(item) for item in na_volume)) - - if len(na_volume) > 0 and len(na_volume) < 100: - print("\nFollowing volume ids are not available.\n Please check volume_not_available.txt for the complete list. \nTo check the validity of volumes in your workset or volume id file go to:\n https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org for assistance.") + with open(os.path.join(output_dir, "volume_not_available.txt"), "w") as volume_na: volume_na.write( + "\n".join(str(item) for item in na_volume)) + + if 0 < len(na_volume) < 100: + print( + "\nFollowing volume ids are not available.\n Please check volume_not_available.txt for the " + "complete list. \nTo check the validity of volumes in your workset or volume id file go to:\n " + "https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org " + "for assistance.") print("\n".join(str(item) for item in na_volume)) else: if len(na_volume) >= 100: - print("\nThere are 100 or more unavailable volumes. \n Please check volume_not_available.txt for the complete list. \nTo check the validity of volumes in your workset or volume id file go to:\n https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org for assistance.") + print( + "\nThere are 100 or more unavailable volumes. \n Please check volume_not_available.txt " + "for the complete list. \nTo check the validity of volumes in your workset or volume id " + "file go to:\n https://analytics.hathitrust.org/validateworkset \n or email us at " + "htrc-help@hathitrust.org for assistance.") except socket.error: @@ -345,8 +357,7 @@ def download(args): volumeIDs = [line.strip() for line in IDfile] return download_volumes(volumeIDs, args.output, - username=args.username, password=args.password, - token=args.token, concat=args.concat, mets=args.mets, pages=args.pages, host=args.datahost, - port=args.dataport, cert=args.datacert, key=args.datakey, - epr=args.dataepr) - + username=args.username, password=args.password, + token=args.token, concat=args.concat, mets=args.mets, pages=args.pages, host=args.datahost, + port=args.dataport, cert=args.datacert, key=args.datakey, + epr=args.dataepr) From d278da00275a3d49fb9477082a8bbfc30b1c89a8 Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Thu, 18 Feb 2021 13:51:45 -0500 Subject: [PATCH 24/49] Added volume not found error for pd-only access --- htrc/volumes/__init__.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index 5453368..240c828 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -316,12 +316,16 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, myzip.extractall(output_dir) myzip.close() + na_volume_rights = [] + na_volume_error = [] na_volume = [] if htrc.config.get_dataapi_access(): + na_volume_rights = grep_error("volume-rights.txt", output_dir, " 3", 0) print("PD Access Only") - na_volume = grep_error("volume-rights.txt", output_dir, " 3", 0) + print(na_volume_rights) - na_volume = na_volume + grep_error("ERROR.err", output_dir, "KeyNotFoundException", -1) + na_volume_error = grep_error("ERROR.err", output_dir, "KeyNotFoundException", -1) + na_volume = na_volume_error + na_volume_rights if len(na_volume) > 0: with 
open(os.path.join(output_dir, "volume_not_available.txt"), "w") as volume_na: volume_na.write( From af681b73ec7c8e775a45059db5d5a90d28d56839 Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Thu, 18 Feb 2021 14:51:26 -0500 Subject: [PATCH 25/49] FIxes errors in message building --- htrc/volumes/__init__.py | 114 ++++++++++++++------------------------- 1 file changed, 41 insertions(+), 73 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index 240c828..da97d48 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -10,7 +10,6 @@ """ from __future__ import print_function from future import standard_library - standard_library.install_aliases() from builtins import input @@ -38,10 +37,8 @@ import logging from logging import NullHandler - logging.getLogger(__name__).addHandler(NullHandler()) - def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, mets=False): """ Returns volumes from the Data API as a raw zip stream. @@ -85,6 +82,7 @@ def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, met # Retrieve the volumes httpsConnection = http.client.HTTPSConnection(host, port, context=ctx, key_file=key, cert_file=cert) + httpsConnection.request("POST", url, urlencode(data), headers) response = httpsConnection.getresponse() @@ -151,6 +149,7 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa headers = {"Authorization": "Bearer " + token, "Content-type": "application/x-www-form-urlencoded"} + # Create SSL lookup # TODO: Fix SSL cert verification ctx = ssl.create_default_context() @@ -160,6 +159,7 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa # Retrieve the volumes httpsConnection = http.client.HTTPSConnection(host, port, context=ctx, key_file=key, cert_file=cert) + httpsConnection.request("POST", url, urlencode(data), headers) response = httpsConnection.getresponse() @@ -191,13 +191,12 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa return data - def get_oauth2_token(username, password): # make sure to set the request content-type as application/x-www-form-urlencoded headers = {"Content-type": "application/x-www-form-urlencoded"} - data = {"grant_type": "client_credentials", - "client_secret": password, - "client_id": username} + data = { "grant_type": "client_credentials", + "client_secret": password, + "client_id": username } data = urlencode(data) # create an SSL context @@ -236,45 +235,40 @@ def get_oauth2_token(username, password): return token +def grep(file_name, output_dir, pattern, txt_index): + na_volume = [] + for line in open(file_name): + if pattern in line: + na_volume.append(line.split()[txt_index]) + if 0 < len(na_volume) < 100: + print("\nFollowing volume ids are not available. \n Please check volume_not_available.txt for the " + "complete list. \nTo check the validity of volumes in your workset or volume id file go to:\n " + "https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org " + "for assistance.") + print("\n".join(str(item) for item in na_volume)) + with open(os.path.join(output_dir, "volume_not_available.txt"), "w") as volume_na: + volume_na.write("\n".join(str(item) for item in na_volume)) + volume_na.write("\n") + else: + if len(na_volume) >= 100: + print("\nThere are 100 or more unavailable volumes.\n Please check volume_not_available.txt for the " + "complete list. 
\nTo check the validity of volumes in your workset or volume id file go to:\n " + "https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org " + "for assistance.") + +def check_error_file(output_dir,file_name,grep_text,txt_index): -def grep_error(file_name, output_dir, pattern, txt_index): if output_dir.endswith("/"): - file_path = output_dir + file_name + file_path = output_dir+ file_name else: - file_path = output_dir + "/" + file_name + file_path = output_dir+"/"+file_name - na_volume = [] if os.path.isfile(file_path): - for line in open(file_name): - if pattern in line: - volume_id = line.split()[txt_index] - na_volume.append(volume_id) - return na_volume - - -# def check_error_file(output_dir,file_name,grep_text,txt_index): -# -# if output_dir.endswith("/"): -# file_path = output_dir+ file_name -# else: -# file_path = output_dir+"/"+file_name -# -# if os.path.isfile(file_path): -# grep(file_path, output_dir, grep_text,txt_index) -# -# if len(na_volume) < 100: -# print("\nFollowing volume ids are not available.") -# print("\n".join(str(item) for item in na_volume)) -# with open(os.path.join(output_dir, "volume_not_available.txt"), "w") as volume_na: -# volume_na.write("\n".join(str(item) for item in na_volume)) -# else: -# if len(na_volume) >= 100: -# print("\nThere are 100 or more unavailable volumes.\nTo check the validity of volumes in your workset or volume id file go to:\n https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org for assistance.") + grep(file_path, output_dir, grep_text,txt_index) def download_volumes(volume_ids, output_dir, username=None, password=None, - config_path=None, token=None, concat=False, mets=False, pages=False, host=None, port=None, - cert=None, key=None, epr=None): + config_path=None, token=None, concat=False, mets=False, pages=False, host=None, port=None, cert=None, key=None, epr=None): # create output_dir folder, if nonexistant if not os.path.isdir(output_dir): os.makedirs(output_dir) @@ -285,7 +279,7 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, htrc.config.remove_jwt_token() if not host: - host = htrc.config.get_dataapi_host() + host= htrc.config.get_dataapi_host() if not port: port = htrc.config.get_dataapi_port() @@ -316,37 +310,10 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, myzip.extractall(output_dir) myzip.close() - na_volume_rights = [] - na_volume_error = [] - na_volume = [] - if htrc.config.get_dataapi_access(): - na_volume_rights = grep_error("volume-rights.txt", output_dir, " 3", 0) - print("PD Access Only") - print(na_volume_rights) - - na_volume_error = grep_error("ERROR.err", output_dir, "KeyNotFoundException", -1) - na_volume = na_volume_error + na_volume_rights - - if len(na_volume) > 0: - with open(os.path.join(output_dir, "volume_not_available.txt"), "w") as volume_na: volume_na.write( - "\n".join(str(item) for item in na_volume)) - - if 0 < len(na_volume) < 100: - print( - "\nFollowing volume ids are not available.\n Please check volume_not_available.txt for the " - "complete list. \nTo check the validity of volumes in your workset or volume id file go to:\n " - "https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org " - "for assistance.") - print("\n".join(str(item) for item in na_volume)) - - else: - if len(na_volume) >= 100: - print( - "\nThere are 100 or more unavailable volumes. \n Please check volume_not_available.txt " - "for the complete list. 
\nTo check the validity of volumes in your workset or volume id " - "file go to:\n https://analytics.hathitrust.org/validateworkset \n or email us at " - "htrc-help@hathitrust.org for assistance.") + if(htrc.config.get_dataapi_access()): + check_error_file(output_dir,"volume-rights.txt", " 3", 0) + check_error_file(output_dir,"ERROR.err","KeyNotFoundException", -1) except socket.error: raise RuntimeError("Data API request timeout. Is your Data Capsule in Secure Mode?") @@ -361,7 +328,8 @@ def download(args): volumeIDs = [line.strip() for line in IDfile] return download_volumes(volumeIDs, args.output, - username=args.username, password=args.password, - token=args.token, concat=args.concat, mets=args.mets, pages=args.pages, host=args.datahost, - port=args.dataport, cert=args.datacert, key=args.datakey, - epr=args.dataepr) + username=args.username, password=args.password, + token=args.token, concat=args.concat, mets=args.mets, pages=args.pages, host=args.datahost, + port=args.dataport, cert=args.datacert, key=args.datakey, + epr=args.dataepr) + From 6264cb1fa21e8c2f94237e68285d52a647aa9fbc Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Thu, 18 Feb 2021 15:15:27 -0500 Subject: [PATCH 26/49] FIxes errors in message building- WIP --- htrc/volumes/__init__.py | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index da97d48..c8e0550 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -240,31 +240,22 @@ def grep(file_name, output_dir, pattern, txt_index): for line in open(file_name): if pattern in line: na_volume.append(line.split()[txt_index]) - if 0 < len(na_volume) < 100: - print("\nFollowing volume ids are not available. \n Please check volume_not_available.txt for the " - "complete list. \nTo check the validity of volumes in your workset or volume id file go to:\n " - "https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org " - "for assistance.") - print("\n".join(str(item) for item in na_volume)) - with open(os.path.join(output_dir, "volume_not_available.txt"), "w") as volume_na: - volume_na.write("\n".join(str(item) for item in na_volume)) - volume_na.write("\n") - else: - if len(na_volume) >= 100: - print("\nThere are 100 or more unavailable volumes.\n Please check volume_not_available.txt for the " - "complete list. 
\nTo check the validity of volumes in your workset or volume id file go to:\n " - "https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org " - "for assistance.") -def check_error_file(output_dir,file_name,grep_text,txt_index): + with open(os.path.join(output_dir, "volume_not_available.txt"), "a") as volume_na: + volume_na.write("\n".join(str(item) for item in na_volume)) + volume_na.write("\n") + + return na_volume + +def check_error_file(output_dir,file_name,grep_text,txt_index): if output_dir.endswith("/"): file_path = output_dir+ file_name else: file_path = output_dir+"/"+file_name if os.path.isfile(file_path): - grep(file_path, output_dir, grep_text,txt_index) + return (grep(file_path, output_dir, grep_text,txt_index)) def download_volumes(volume_ids, output_dir, username=None, password=None, @@ -310,10 +301,13 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, myzip.extractall(output_dir) myzip.close() + if(htrc.config.get_dataapi_access()): - check_error_file(output_dir,"volume-rights.txt", " 3", 0) + na_volumes_rights = check_error_file(output_dir,"volume-rights.txt", " 3", 0) + print(na_volumes_rights) - check_error_file(output_dir,"ERROR.err","KeyNotFoundException", -1) + na_volumes_error = check_error_file(output_dir,"ERROR.err","KeyNotFoundException", -1) + print(na_volumes_error) except socket.error: raise RuntimeError("Data API request timeout. Is your Data Capsule in Secure Mode?") From b4c1d74bf70fde321ad90fec3e659f71f877fe69 Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Thu, 18 Feb 2021 15:20:43 -0500 Subject: [PATCH 27/49] FIxes errors in message building- WIP --- htrc/volumes/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index c8e0550..7beeabb 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -301,13 +301,16 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, myzip.extractall(output_dir) myzip.close() - + na_volumes_all = [] if(htrc.config.get_dataapi_access()): na_volumes_rights = check_error_file(output_dir,"volume-rights.txt", " 3", 0) print(na_volumes_rights) + na_volumes_all = na_volumes_rights na_volumes_error = check_error_file(output_dir,"ERROR.err","KeyNotFoundException", -1) print(na_volumes_error) + na_volumes_all = na_volumes_all + na_volumes_error + print(na_volumes_all) except socket.error: raise RuntimeError("Data API request timeout. 
Is your Data Capsule in Secure Mode?")

From a19d47256aa515447c548229e33aff704f1f88c5 Mon Sep 17 00:00:00 2001
From: Samitha Liyanage
Date: Thu, 18 Feb 2021 15:28:01 -0500
Subject: [PATCH 28/49] Fixes errors in message building - WIP

---
 htrc/volumes/__init__.py | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py
index 7beeabb..666d472 100644
--- a/htrc/volumes/__init__.py
+++ b/htrc/volumes/__init__.py
@@ -304,13 +304,29 @@ def download_volumes(volume_ids, output_dir, username=None, password=None,
             na_volumes_all = []
             if(htrc.config.get_dataapi_access()):
                 na_volumes_rights = check_error_file(output_dir,"volume-rights.txt", " 3", 0)
-                print(na_volumes_rights)
                 na_volumes_all = na_volumes_rights

                 na_volumes_error = check_error_file(output_dir,"ERROR.err","KeyNotFoundException", -1)
-                print(na_volumes_error)
                 na_volumes_all = na_volumes_all + na_volumes_error
-                print(na_volumes_all)
+
+                if len(na_volumes_all) > 0:
+                    with open(os.path.join(output_dir, "volume_not_available.txt"), "a") as volume_na:
+                        volume_na.write("\n".join(str(item) for item in na_volumes_all))
+
+                    if 0 < len(na_volumes_all) < 100:
+                        print("\nFollowing volume ids are not available. \n Please check volume_not_available.txt for the "
+                              "complete list. ")
+                        print("\n".join(str(item) for item in na_volumes_all))
+                    else:
+                        if len(na_volumes_all) >= 100:
+                            print("\nThere are 100 or more unavailable volumes.\n Please check volume_not_available.txt "
+                                  "for the "
+                                  "complete list. \nTo check the validity of volumes in your workset or volume id file go "
+                                  "to:\n "
+                                  "https://analytics.hathitrust.org/validateworkset \n or email us at "
+                                  "htrc-help@hathitrust.org "
+                                  "for assistance.")
+

         except socket.error:
             raise RuntimeError("Data API request timeout. 
Is your Data Capsule in Secure Mode?") From 6f653b8fda1fd0cfd6113de92f66f80de3173e83 Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Thu, 18 Feb 2021 15:31:37 -0500 Subject: [PATCH 29/49] FIxes errors in message building- WIP --- htrc/volumes/__init__.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index 666d472..b69aec5 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -241,10 +241,6 @@ def grep(file_name, output_dir, pattern, txt_index): if pattern in line: na_volume.append(line.split()[txt_index]) - with open(os.path.join(output_dir, "volume_not_available.txt"), "a") as volume_na: - volume_na.write("\n".join(str(item) for item in na_volume)) - volume_na.write("\n") - return na_volume From 768b018cf78fa7df81819edc03ee196afb04bca3 Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Fri, 19 Feb 2021 10:53:46 -0500 Subject: [PATCH 30/49] Added inode error message --- htrc/volumes/__init__.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index b69aec5..ae3dcb7 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -298,15 +298,17 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, myzip.close() na_volumes_all = [] - if(htrc.config.get_dataapi_access()): + if htrc.config.get_dataapi_access() == "true": na_volumes_rights = check_error_file(output_dir,"volume-rights.txt", " 3", 0) na_volumes_all = na_volumes_rights - na_volumes_error = check_error_file(output_dir,"ERROR.err","KeyNotFoundException", -1) - na_volumes_all = na_volumes_all + na_volumes_error + na_volumes_error = check_error_file(output_dir,"volume-rights.txt", " unavailable", 0) + + if len(na_volumes_error) > 0: + na_volumes_all = na_volumes_all + na_volumes_error if len(na_volumes_all) > 0: - with open(os.path.join(output_dir, "volume_not_available.txt"), "a") as volume_na: + with open(os.path.join(output_dir, "volume_not_available.txt"), "w") as volume_na: volume_na.write("\n".join(str(item) for item in na_volumes_all)) if 0 < len(na_volumes_all) < 100: @@ -325,7 +327,8 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, except socket.error: - raise RuntimeError("Data API request timeout. Is your Data Capsule in Secure Mode?") + raise RuntimeError("HTRC Data API time out. Check your inode usage if downloading a large workset. " + "Contact HTRC for further help.") else: raise RuntimeError("Failed to obtain jwt token.") From cf01765c3406dc102fa56679b92aefe31856c177 Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Fri, 19 Feb 2021 11:11:11 -0500 Subject: [PATCH 31/49] FIxes errors in message building- WIP --- htrc/volumes/__init__.py | 62 +++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 27 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index ae3dcb7..5278773 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -10,6 +10,7 @@ """ from __future__ import print_function from future import standard_library + standard_library.install_aliases() from builtins import input @@ -37,8 +38,10 @@ import logging from logging import NullHandler + logging.getLogger(__name__).addHandler(NullHandler()) + def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, mets=False): """ Returns volumes from the Data API as a raw zip stream. 
@@ -82,7 +85,6 @@ def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, met # Retrieve the volumes httpsConnection = http.client.HTTPSConnection(host, port, context=ctx, key_file=key, cert_file=cert) - httpsConnection.request("POST", url, urlencode(data), headers) response = httpsConnection.getresponse() @@ -149,7 +151,6 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa headers = {"Authorization": "Bearer " + token, "Content-type": "application/x-www-form-urlencoded"} - # Create SSL lookup # TODO: Fix SSL cert verification ctx = ssl.create_default_context() @@ -159,7 +160,6 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa # Retrieve the volumes httpsConnection = http.client.HTTPSConnection(host, port, context=ctx, key_file=key, cert_file=cert) - httpsConnection.request("POST", url, urlencode(data), headers) response = httpsConnection.getresponse() @@ -191,12 +191,13 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa return data + def get_oauth2_token(username, password): # make sure to set the request content-type as application/x-www-form-urlencoded headers = {"Content-type": "application/x-www-form-urlencoded"} - data = { "grant_type": "client_credentials", - "client_secret": password, - "client_id": username } + data = {"grant_type": "client_credentials", + "client_secret": password, + "client_id": username} data = urlencode(data) # create an SSL context @@ -235,27 +236,35 @@ def get_oauth2_token(username, password): return token -def grep(file_name, output_dir, pattern, txt_index): - na_volume = [] - for line in open(file_name): - if pattern in line: - na_volume.append(line.split()[txt_index]) - return na_volume - - -def check_error_file(output_dir,file_name,grep_text,txt_index): +def grep_error(file_name, output_dir, pattern, txt_index): + na_volume = [] if output_dir.endswith("/"): - file_path = output_dir+ file_name + file_path = output_dir + file_name else: - file_path = output_dir+"/"+file_name + file_path = output_dir + "/" + file_name if os.path.isfile(file_path): - return (grep(file_path, output_dir, grep_text,txt_index)) + for line in open(file_name): + if pattern in line: + na_volume.append(line.split()[txt_index]) + + return na_volume + + +# def check_error_file(output_dir, file_name, grep_text, txt_index): +# if output_dir.endswith("/"): +# file_path = output_dir + file_name +# else: +# file_path = output_dir + "/" + file_name +# +# if os.path.isfile(file_path): +# return grep(file_path, output_dir, grep_text, txt_index) def download_volumes(volume_ids, output_dir, username=None, password=None, - config_path=None, token=None, concat=False, mets=False, pages=False, host=None, port=None, cert=None, key=None, epr=None): + config_path=None, token=None, concat=False, mets=False, pages=False, host=None, port=None, + cert=None, key=None, epr=None): # create output_dir folder, if nonexistant if not os.path.isdir(output_dir): os.makedirs(output_dir) @@ -266,7 +275,7 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, htrc.config.remove_jwt_token() if not host: - host= htrc.config.get_dataapi_host() + host = htrc.config.get_dataapi_host() if not port: port = htrc.config.get_dataapi_port() @@ -299,10 +308,10 @@ def download_volumes(volume_ids, output_dir, username=None, password=None, na_volumes_all = [] if htrc.config.get_dataapi_access() == "true": - na_volumes_rights = check_error_file(output_dir,"volume-rights.txt", " 3", 0) + 
na_volumes_rights = grep_error("volume-rights.txt", output_dir, " 3", 0) na_volumes_all = na_volumes_rights - na_volumes_error = check_error_file(output_dir,"volume-rights.txt", " unavailable", 0) + na_volumes_error = grep_error("volume-rights.txt", output_dir, " unavailable", 0) if len(na_volumes_error) > 0: na_volumes_all = na_volumes_all + na_volumes_error @@ -340,8 +349,7 @@ def download(args): volumeIDs = [line.strip() for line in IDfile] return download_volumes(volumeIDs, args.output, - username=args.username, password=args.password, - token=args.token, concat=args.concat, mets=args.mets, pages=args.pages, host=args.datahost, - port=args.dataport, cert=args.datacert, key=args.datakey, - epr=args.dataepr) - + username=args.username, password=args.password, + token=args.token, concat=args.concat, mets=args.mets, pages=args.pages, host=args.datahost, + port=args.dataport, cert=args.datacert, key=args.datakey, + epr=args.dataepr) From 5c7780094b751e6a808b128e5139a2f1fa8bd59a Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Fri, 19 Feb 2021 12:04:31 -0500 Subject: [PATCH 32/49] Removed check_error_file method --- htrc/volumes/__init__.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index 5278773..5453fec 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -245,23 +245,12 @@ def grep_error(file_name, output_dir, pattern, txt_index): file_path = output_dir + "/" + file_name if os.path.isfile(file_path): - for line in open(file_name): + for line in open(file_path): if pattern in line: na_volume.append(line.split()[txt_index]) return na_volume - -# def check_error_file(output_dir, file_name, grep_text, txt_index): -# if output_dir.endswith("/"): -# file_path = output_dir + file_name -# else: -# file_path = output_dir + "/" + file_name -# -# if os.path.isfile(file_path): -# return grep(file_path, output_dir, grep_text, txt_index) - - def download_volumes(volume_ids, output_dir, username=None, password=None, config_path=None, token=None, concat=False, mets=False, pages=False, host=None, port=None, cert=None, key=None, epr=None): From 391396d2d7e4d979cab13e4d3d097fbd3d885a51 Mon Sep 17 00:00:00 2001 From: Boris Capitanu Date: Wed, 24 Feb 2021 15:53:20 -0600 Subject: [PATCH 33/49] Fixes #46 --- htrc/__main__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/htrc/__main__.py b/htrc/__main__.py index 04b26b4..bb53626 100644 --- a/htrc/__main__.py +++ b/htrc/__main__.py @@ -94,6 +94,9 @@ def main(): parser_run.set_defaults(func='run') args = parser.parse_args() + if 'func' not in args: + parser.print_help() + sys.exit(1) if args.func in ['metadata', 'export']: volumes = [] @@ -113,6 +116,9 @@ def main(): metadata = get_metadata(volumes) print(json.dumps(metadata)) elif args.func == 'run': + if 'run' not in args: + parser_run.print_help() + sys.exit(1) if args.run == 'mallet': htrc.tools.mallet.main(args.path, args.k, args.iter) if args.run == 'topicexplorer': From 64157ec9d9c2c29a0b3c3e5d13d9de7f93f55c8a Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Tue, 9 Mar 2021 11:18:58 -0500 Subject: [PATCH 34/49] Version bump --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 91f8bf7..c77fcee 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ import atexit import tarfile -__version__ = '0.1.55b0' +__version__ = '0.1.55' install_requires = ['PyLD', 'future', 'prov', 'unicodecsv', 'progressbar2', 'requests', 'argparse==1.1', 
'topicexplorer==1.0b226', 'numpy==1.16.2'] From 3289e4ff5b32b6909966294afa604ad1ce0c1c9e Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Tue, 9 Mar 2021 11:21:58 -0500 Subject: [PATCH 35/49] Added files to gitignore. --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index d77aad6..efb0815 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,5 @@ htrc.egg-info .coverage htmlcov/ .eggs +ssl-cert-trust +venv/ From 1c4c7adaa8af21629a3ae7d20f855e41e875223e Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Tue, 9 Mar 2021 11:25:24 -0500 Subject: [PATCH 36/49] Changed the version in setup.py to 0.1.56b0 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c77fcee..1bd8806 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ import atexit import tarfile -__version__ = '0.1.55' +__version__ = '0.1.56b0' install_requires = ['PyLD', 'future', 'prov', 'unicodecsv', 'progressbar2', 'requests', 'argparse==1.1', 'topicexplorer==1.0b226', 'numpy==1.16.2'] From 8120c7891a793817d907db63475ecdc11ac638fa Mon Sep 17 00:00:00 2001 From: Boris Capitanu Date: Wed, 24 Mar 2021 22:38:56 -0500 Subject: [PATCH 37/49] Formatting changes --- htrc/auth.py | 15 +++++++-------- htrc/lib/cli.py | 2 ++ htrc/tools/mallet.py | 1 + htrc/tools/topicexplorer.py | 1 + htrc/util/__init__.py | 7 ++++--- htrc/util/resolve.py | 20 +++++++++++--------- 6 files changed, 26 insertions(+), 20 deletions(-) diff --git a/htrc/auth.py b/htrc/auth.py index a24255d..c366717 100644 --- a/htrc/auth.py +++ b/htrc/auth.py @@ -1,14 +1,12 @@ -from base64 import b64encode -from getpass import getpass -import http.client -import ssl import time +from getpass import getpass import requests import requests.auth import htrc.config + def get_jwt_token(): # Currently we just store one common jwt token locally at .htrc file for simplicity # Expect to add POST method to query unique jwt token with the combo of username and password @@ -17,10 +15,10 @@ def get_jwt_token(): client_id, client_secret = htrc.config.get_credentials() auth = requests.auth.HTTPBasicAuth(client_id, client_secret) - data = { "grant_type": "password", - "username": username, - "password": password, - "scope" : "openid"} + data = {"grant_type": "password", + "username": username, + "password": password, + "scope": "openid"} url = htrc.config.get_idp_url() r = requests.post(url, data=data, auth=auth) @@ -35,6 +33,7 @@ def get_jwt_token(): else: raise RuntimeError("JWT token retrieval failed: {}".format(data['error'])) + def credential_prompt(): """ A prompt for entering HathiTrust Research Center credentials. 
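# --- Illustrative usage sketch (editor's addition, not part of this patch) ---
# The refactored htrc.auth.get_jwt_token() shown above gathers HathiTrust
# credentials plus the client id/secret from the local config, posts a password
# grant to the identity provider via requests, and returns (token, expiration);
# htrc.config.save_jwt_token() can cache that token in the ~/.htrc config file.
# The direct call sequence below is an assumption for demonstration only; the
# package normally performs these steps internally when a token is needed.
import htrc.auth
import htrc.config

token, expiration = htrc.auth.get_jwt_token()   # interactive credential prompt
htrc.config.save_jwt_token(token, expiration)   # persist for reuse on later runs
# ------------------------------------------------------------------------------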
diff --git a/htrc/lib/cli.py b/htrc/lib/cli.py index 33c378e..11a6e10 100644 --- a/htrc/lib/cli.py +++ b/htrc/lib/cli.py @@ -1,4 +1,6 @@ from builtins import input + + def bool_prompt(prompt_str, default=None): if default is True: default = 'y' diff --git a/htrc/tools/mallet.py b/htrc/tools/mallet.py index a005e93..e82758a 100644 --- a/htrc/tools/mallet.py +++ b/htrc/tools/mallet.py @@ -19,6 +19,7 @@ def install_mallet(): mallet_dir.extractall(path=MALLET_DIR) mallet_dir.close() + def main(path, topics, iterations, output_dir='/media/secure_volume/workset/'): if not os.path.exists(MALLET_DIR): if not os.path.exists('/media/secure_volume/'): diff --git a/htrc/tools/topicexplorer.py b/htrc/tools/topicexplorer.py index 293baca..5149cc3 100644 --- a/htrc/tools/topicexplorer.py +++ b/htrc/tools/topicexplorer.py @@ -6,6 +6,7 @@ from htrc.volumes import download_volumes from htrc.workset import path_to_volumes + def main(path, topics, iterations, output_dir='/media/secure_volume/workset'): if os.path.exists("/media/secure_volume"): # If in secure mode, downlaod the volumes from data api diff --git a/htrc/util/__init__.py b/htrc/util/__init__.py index edbddd1..2b1dd3e 100644 --- a/htrc/util/__init__.py +++ b/htrc/util/__init__.py @@ -4,6 +4,7 @@ from .resolve import ORG_CODES + def split_items(seq, split_size): """ Returns a generator that returns portions of `seq` up to `split_size`. @@ -13,7 +14,7 @@ def split_items(seq, split_size): :param split_size: The maximum size of each split. """ full_segments = int(math.floor(len(seq) / split_size)) - for i in range(1,full_segments+1): - yield seq[(i-1)*split_size:i*split_size] + for i in range(1, full_segments + 1): + yield seq[(i - 1) * split_size:i * split_size] if (full_segments * split_size) < len(seq): - yield seq[full_segments*split_size:] + yield seq[full_segments * split_size:] diff --git a/htrc/util/resolve.py b/htrc/util/resolve.py index e3b2b4f..1d1a7e2 100644 --- a/htrc/util/resolve.py +++ b/htrc/util/resolve.py @@ -94,29 +94,31 @@ def parse_volume_id(string): Organization codes for the volumes can be found in ORG_CODES. ''' - # First extract the volume ID from a URL, fallbck to assume string. + # First extract the volume ID from a URL, fallback to assume string. parsed_url = urlparse(string) if parsed_url.netloc == 'hdl.handle.net': # Parse the Handle ID, ex: # https://hdl.handle.net/2027/uc2.ark:/13960/fk92805m1s' # Note that if the Handle URL contains page info, this is discarded. - id = parsed_url.path.replace('/2027/', '') + htid = parsed_url.path.replace('/2027/', '') elif parsed_url.netloc == 'babel.hathitrust.org': # Parse the HT Digital Library URL, ex: # https://babel.hathitrust.org/cgi/pt?id=uc2.ark:/13960/fk92805m1s;view=1up;seq=7 if parsed_url.query: - id = parse_qs(parsed_url.query).get('id', None) - if id is not None: - id = id[0] + htid = parse_qs(parsed_url.query).get('id', None) + if htid is not None: + htid = htid[0] + if ';' in htid: + htid = htid.split(';')[0] else: - id = string + htid = string # Validate ID against ORG_CODES. - # Won't guarantee volume existance, but is a sanity check. - if id and any(id.startswith(org) for org in ORG_CODES): - return id + # Won't guarantee volume existence, but it is a sanity check. 
+ if htid and any(htid.startswith(org) for org in ORG_CODES): + return htid else: raise ValueError("Invalid Organization Code in HathiTrust ID") From c4b2c01cbf971f99d587584ef0aa2ceb5e312034 Mon Sep 17 00:00:00 2001 From: Boris Capitanu Date: Wed, 24 Mar 2021 22:41:46 -0500 Subject: [PATCH 38/49] Added missing package --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 70287f0..56cfa52 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ __version__ = '0.1.54' -install_requires = ['PyLD', 'future', 'prov', 'unicodecsv', 'progressbar2', +install_requires = ['PyLD', 'future', 'prov', 'unicodecsv', 'progressbar2', 'pandas', 'requests', 'argparse==1.1', 'topicexplorer==1.0b226', 'numpy==1.16.2', 'tqdm==4.46.0'] # TODO: migrate to docs confix:, 'sphinx-argparse', 'sphinxcontrib-fulltoc'] if sys.version_info.major == 2: From fad8fafbe5be46fe94dd9f7fbd686a6196535cd7 Mon Sep 17 00:00:00 2001 From: Boris Capitanu Date: Wed, 24 Mar 2021 22:42:06 -0500 Subject: [PATCH 39/49] Added additional test --- tests/test_htrc_util_resolve.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_htrc_util_resolve.py b/tests/test_htrc_util_resolve.py index 432734d..6bbbfd0 100644 --- a/tests/test_htrc_util_resolve.py +++ b/tests/test_htrc_util_resolve.py @@ -42,6 +42,9 @@ def test_parse_volume_id(self): id = resolve.parse_volume_id('https://babel.hathitrust.org/cgi/pt?id=uc2.ark:/13960/fk92805m1s;view=1up;seq=7') self.assertEqual(id, 'uc2.ark:/13960/fk92805m1s') + id = resolve.parse_volume_id('https://babel.hathitrust.org/cgi/pt?id=uc2.ark:/13960/fk92805m1s&view=1up&seq=7') + self.assertEqual(id, 'uc2.ark:/13960/fk92805m1s') + id = resolve.parse_volume_id('uc2.ark:/13960/fk92805m1s') self.assertEqual(id, 'uc2.ark:/13960/fk92805m1s') From 2a7aa8380d00f098abe8381ea019c70852dc7847 Mon Sep 17 00:00:00 2001 From: Boris Capitanu Date: Wed, 24 Mar 2021 22:42:33 -0500 Subject: [PATCH 40/49] Reduced the amount of I/O necessary for removing headers/footers from volumes; added parallelism to the header/footer removal process --- htrc/__main__.py | 47 +++-- htrc/config.py | 47 ++++- htrc/hf_vol_load/__init__.py | 117 ----------- htrc/volumes/__init__.py | 388 +++++++++++++++++------------------ tests/test_htrc_volumes.py | 56 ++++- 5 files changed, 301 insertions(+), 354 deletions(-) delete mode 100644 htrc/hf_vol_load/__init__.py diff --git a/htrc/__main__.py b/htrc/__main__.py index 3f35557..add1f0f 100644 --- a/htrc/__main__.py +++ b/htrc/__main__.py @@ -7,12 +7,12 @@ standard_library.install_aliases() import json -import os, os.path +import os +import os.path import shutil import sys from tempfile import NamedTemporaryFile - from htrc.metadata import get_metadata, get_volume_metadata import htrc.volumes import htrc.workset @@ -35,10 +35,23 @@ def download_parser(parser=None): help="remove folder if exists") parser.add_argument("-o", "--output", help="output directory", default='/media/secure_volume/workset/') - parser.add_argument("-hf", "--headfoot", action = 'store_true', + parser.add_argument("-hf", "--remove-headers-footers", action='store_true', help="remove headers and footers from individual pages") - parser.add_argument("-hfc", "--headfootcon", action = 'store_true', + parser.add_argument("-hfc", "--remove-headers-footers-and-concat", action='store_true', help="remove headers and footers from individual pages then concatenate pages") + parser.add_argument("-w", "--window-size", required=False, type=int, metavar="N", default=6, + help="How many 
pages ahead does the header/footer extractor algorithm look to find potential " + "matching headers/footers (higher value gives potentially more accurate results on lower " + "quality OCR volumes at the expense of runtime)") + parser.add_argument("-msr", "--min-similarity-ratio", required=False, type=float, metavar="N", default=0.7, + help="The minimum string similarity ratio required for the Levenshtein distance fuzzy-matching " + "algorithm to declare that two headers are considered 'the same' (the higher the value, up " + "to a max of 1.0, the more strict the matching has to be; lower values allow for more " + "fuzziness to account for OCR errors)") + parser.add_argument("--parallelism", required=False, type=int, metavar="N", default=os.cpu_count(), + help="The max number of concurrent tasks to start when downloading or removing headers/footers") + parser.add_argument("--batch-size", required=False, type=int, metavar="N", default=250, + help="The max number of volumes to download at a time from DataAPI") parser.add_argument("-c", "--concat", action='store_true', help="concatenate a volume's pages in to a single file") parser.add_argument("-m", "--mets", action='store_true', @@ -53,13 +66,13 @@ def download_parser(parser=None): parser.add_argument("-dk", "--datakey", help="Client key file for mutual TLS with Data API.") return parser + def add_workset_path(parser=None): if parser is None: parser = ArgumentParser() parser.add_argument("path", nargs='+', help="workset path[s]") return parser - def main(): parser = ArgumentParser() @@ -133,24 +146,24 @@ def main(): print("Please choose another output folder and try again.") sys.exit(1) - if args.concat and args.headfoot: - print("Cannot set both concat and headfoot") + if args.concat and args.remove_headers_footers: + print("Cannot set both concat and remove-headers-footers") sys.exit(1) - if args.concat and args.headfootcon: - print("Cannot set both concat and headfootcon") + if args.concat and args.remove_headers_footers_and_concat: + print("Cannot set both concat and remove-headers-footers-and-concat") sys.exit(1) - if args.headfoot and args.headfootcon: - print("Cannot set both headfoot and headfootcon") + if args.remove_headers_footers and args.remove_headers_footers_and_concat: + print("Cannot set both remove_headers_footers and remove_headers_footers_and_concat") sys.exit(1) - if args.mets and args.headfootcon: - print("Cannot set both mets and headfootcon") + if args.mets and args.remove_headers_footers_and_concat: + print("Cannot set both mets and remove_headers_footers_and_concat") sys.exit(1) if args.pages: if args.mets and args.concat: - print ("Cannot set both concat and mets with pages") + print("Cannot set both concat and mets with pages") sys.exit(1) - if args.mets and args.headfootcon: - print("Cannot set both mets and headfootcon with pages") + if args.mets and args.remove_headers_footers_and_concat: + print("Cannot set both mets and remove_headers_footers_and_concat with pages") sys.exit(1) try: @@ -159,6 +172,7 @@ def main(): print("Invalid identifier:", args.file) sys.exit(1) + def resolve_and_download(args): if args.file == sys.stdin: # For use with UNIX pipes @@ -223,6 +237,7 @@ def download(args): else: raise e + def download_with_tempfile(args, volumes): f = NamedTemporaryFile() for volume in volumes: diff --git a/htrc/config.py b/htrc/config.py index ccd7d54..c09f916 100644 --- a/htrc/config.py +++ b/htrc/config.py @@ -6,18 +6,14 @@ """ from future import standard_library standard_library.install_aliases() -from 
builtins import input - +from typing import Optional from configparser import RawConfigParser as ConfigParser, NoSectionError from codecs import open -from getpass import getpass import logging import os.path import shutil import time -from htrc.lib.cli import bool_prompt - DEFAULT_PATH = os.path.expanduser('~') DEFAULT_PATH = os.path.join(DEFAULT_PATH, '.htrc') if not os.path.exists(DEFAULT_PATH): @@ -26,6 +22,25 @@ logging.info("Copying default config file to home directory.") shutil.copyfile(DEFAULT_FILE, DEFAULT_PATH) + +class HtrcDataApiConfig: + def __init__(self, + token: Optional[str] = None, + host: Optional[str] = None, + port: Optional[int] = None, + epr: Optional[str] = None, + cert: Optional[str] = None, + key: Optional[str] = None) -> None: + super().__init__() + + self.token = token or get_jwt_token(save_new_token=False) + self.host = host or get_dataapi_host() + self.port = port or get_dataapi_port() + self.epr = epr or get_dataapi_epr() + self.cert = cert or get_dataapi_cert() + self.key = key or get_dataapi_key() + + def _get_value(section, key, path=None): if path is None: path = DEFAULT_PATH @@ -38,33 +53,41 @@ def _get_value(section, key, path=None): except NoSectionError: raise EnvironmentError("Config not set for {} {} in {}".format( section, key, path)) - + + def get_dataapi_port(path=None): port = int(_get_value('data', 'port', path)) return (port) + def get_dataapi_host(path=None): host = _get_value('data', 'host', path) return (host) + def get_dataapi_epr(path=None): return _get_value('data', 'url', path) + def get_dataapi_cert(path=None): return _get_value('data', 'cert', path) + def get_dataapi_key(path=None): return _get_value('data', 'key', path) + def get_idp_host_port(path=None): host = _get_value('idp', 'host', path) port = _get_value('idp', 'port', path) return (host, port) + def get_idp_path(path=None): return _get_value('idp', 'url') + def get_idp_url(path=None): host, port = get_idp_host_port(path) path = get_idp_path(path) @@ -76,23 +99,26 @@ def get_idp_url(path=None): # Add jwt credential access methods -def get_jwt_token(path=None): +def get_jwt_token(path=None, save_new_token=True): try: token = _get_value('jwt', 'token', path) # check expiration date expiration = int(_get_value('jwt', 'expiration', path)) if time.time() > expiration: + import htrc + htrc.config.remove_jwt_token() raise RuntimeError("JWT token expired.") except: # This should run on either a missing or expired token. import htrc.auth token, expiration = htrc.auth.get_jwt_token() - htrc.config.save_jwt_token(token, expiration, path) - + if save_new_token: + htrc.config.save_jwt_token(token, expiration, path) return token + def save_jwt_token(token, expiration=None, path=None): """ Saves JWT token in the config file. @@ -121,6 +147,7 @@ def save_jwt_token(token, expiration=None, path=None): return token + def remove_jwt_token(path=None): """ Removes JWT token from the config file. 
@@ -158,9 +185,11 @@ def get_credentials(path=None): return (client_id, client_secret) + def populate_parser(parser): return parser + if __name__ == '__main__': from argparse import ArgumentParser diff --git a/htrc/hf_vol_load/__init__.py b/htrc/hf_vol_load/__init__.py deleted file mode 100644 index 72f08d5..0000000 --- a/htrc/hf_vol_load/__init__.py +++ /dev/null @@ -1,117 +0,0 @@ -import unittest -from typing import List - -from htrc.models import HtrcPage -from htrc.runningheaders import parse_page_structure, clean_text, levenshtein - - -class TestRunningHeaders(unittest.TestCase): - def test_finding_running_headers(self): - pages = load_vol("data/vol1", num_pages=10) - structured_pages = parse_page_structure(pages) - headers = ["|".join(page.header_lines) for page in structured_pages] - expected = [ - "", - "", - "CHAPTER 1|INTRODUCTION TO RUNNING HEADERS|Lorem Ipsum style", - "1 INTRODUCTION TO RUNNING HEADERS|Lorem Ipsum style", - "INTRODUCTION TO RUNNING HEADERS 1|Lorem Ipsum style", - "1 INTRODUCTION TO RUNNING HEADERS|Lorem Ipsum style", - "CHAPTER 2|EVERYTHING IS RELATIVE", - "2 EVERYTHING IS RELATIVE", - "EVERYTHING IS RELATIVE 2", - "2 EVERYTHING IS RELATIVE" - ] - self.assertListEqual(expected, headers) - - def test_finding_running_footers(self): - pages = load_vol("data/vol1", num_pages=10) - structured_pages = parse_page_structure(pages) - footers = ["|".join(page.footer_lines) for page in structured_pages] - expected = [ - "", - "", - "Page 2", - "Page 3", - "Page 4", - "Page 5", - "Page 6", - "Page 7", - "Page 8", - "Page 9" - ] - self.assertListEqual(expected, footers) - - def test_identify_correct_page_body(self): - pages = load_vol("data/vol1", num_pages=10) - structured_pages = parse_page_structure(pages) - len_body_per_page = [len(page.body_lines) for page in structured_pages] - expected = [0, 7, 43, 28, 26, 30, 31, 27, 28, 15] - self.assertListEqual(expected, len_body_per_page) - - def test_find_footer_with_page_numbers(self): - pages = load_vol("data/vol2", num_pages=10) - structured_pages = parse_page_structure(pages) - footers = ["|".join(page.footer_lines) for page in structured_pages] - expected = [ - "", - "", - "2", - " 3", - "4", - " 5", - "6", - " 7", - "8", - " 9" - ] - self.assertListEqual(expected, footers) - - -class TestUtils(unittest.TestCase): - def test_clean_text(self): - s1 = u"\t На берегу \tпустынных волн \t\n" - s1_expected = u"на берегу пустынных волн" - s2 = u" Pot să mănânc sticlă și ea nu mă rănește. 
" - s2_expected = u"pot să mănânc sticlă și ea nu mă rănește" - s1_clean = clean_text(s1) - s2_clean = clean_text(s2) - - self.assertEqual(s1_expected, s1_clean) - self.assertEqual(s2_expected, s2_clean) - - def test_levenshtein(self): - s1 = "rosettacode" - s2 = "raisethysword" - lev = levenshtein(s1, s2) - self.assertEqual(8, lev) - - s1 = "kitten" - s2 = "sitting" - lev = levenshtein(s1, s2, replace_cost=2) - self.assertEqual(5, lev) - - s1 = "abracadabra" - s2 = "abracadabra" - lev = levenshtein(s1, s2) - self.assertEqual(0, lev) - - s1 = "" - s2 = "abc" - lev = levenshtein(s1, s2) - self.assertEqual(3, lev) - - -def load_vol(path: str, num_pages: int) -> List[HtrcPage]: - pages = [] - for n in range(num_pages): - page_num = str(n+1).zfill(8) - with open('{}/{}.txt'.format(path, page_num), encoding='utf-8') as f: - lines = [line.rstrip() for line in f.readlines()] - pages.append(HtrcPage(lines)) - - return pages - - -if __name__ == '__main__': - unittest.main() diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index d85ec1f..865b826 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -9,43 +9,36 @@ `htrc.mock.volumes` contains Patch objects for testing workflows. """ from __future__ import print_function + from future import standard_library -standard_library.install_aliases() -from builtins import input +from htrc.models import HtrcPage + +standard_library.install_aliases() import http.client -from io import BytesIO # used to stream http response into zipfile. +from io import BytesIO, TextIOWrapper import json -import logging import os.path import progressbar -import re import socket import ssl -import sys -from time import sleep -from urllib.request import urlopen -from urllib.error import HTTPError -from urllib.parse import quote_plus, urlencode -import xml.etree.ElementTree as ET +from urllib.parse import urlencode from zipfile import ZipFile # used to decompress requested zip archives. +from tqdm import tqdm from htrc.runningheaders import parse_page_structure -from htrc.hf_vol_load import load_vol +from functools import partial import pandas as pd -import fnmatch -import glob -from tqdm import tqdm -import shutil -from htrc.lib.cli import bool_prompt from htrc.util import split_items import htrc.config +import multiprocessing import logging from logging import NullHandler logging.getLogger(__name__).addHandler(NullHandler()) -def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, mets=False): + +def get_volumes(data_api_config: htrc.config.HtrcDataApiConfig, volume_ids, concat=False, mets=False, buffer_size=128): """ Returns volumes from the Data API as a raw zip stream. @@ -60,7 +53,7 @@ def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, met if not volume_ids: raise ValueError("volume_ids is empty.") - url = epr + "volumes" + url = data_api_config.epr + "volumes" for id in volume_ids: if ("." 
not in id @@ -77,7 +70,7 @@ def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, met data['mets'] = 'true' # Authorization - headers = {"Authorization": "Bearer " + token, + headers = {"Authorization": "Bearer " + data_api_config.token, "Content-type": "application/x-www-form-urlencoded"} # Create SSL lookup @@ -87,8 +80,12 @@ def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, met ctx.verify_mode = ssl.CERT_NONE # Retrieve the volumes - httpsConnection = http.client.HTTPSConnection(host, port, context=ctx, key_file=key, cert_file=cert) - + httpsConnection = http.client.HTTPSConnection( + data_api_config.host, + data_api_config.port, + context=ctx, + key_file=data_api_config.key, + cert_file=data_api_config.cert) httpsConnection.request("POST", url, urlencode(data), headers) @@ -104,7 +101,7 @@ def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, met ' (', progressbar.FileTransferSpeed(), ')']) while body: - body = response.read(128) + body = response.read(buffer_size) data.write(body) bytes_downloaded += len(body) bar.update(bytes_downloaded) @@ -122,12 +119,12 @@ def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, met return data -def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=False): +def get_pages(data_api_config: htrc.config.HtrcDataApiConfig, page_ids, concat=False, mets=False, buffer_size=128): """ Returns a ZIP file containing specfic pages. Parameters: - :token: An OAuth2 token for the app. + :data_api_config: The configuration data of the DataAPI endpoint. :volume_ids: A list of volume_ids :concat: If True, return a single file per volume. If False, return a single file per page (default). @@ -135,7 +132,7 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa if not page_ids: raise ValueError("page_ids is empty.") - url = epr + "pages" + url = data_api_config.epr + "pages" for id in page_ids: if ("." 
not in id @@ -153,7 +150,7 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa data['mets'] = 'true' # Authorization - headers = {"Authorization": "Bearer " + token, + headers = {"Authorization": "Bearer " + data_api_config.token, "Content-type": "application/x-www-form-urlencoded"} @@ -164,8 +161,13 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa ctx.verify_mode = ssl.CERT_NONE # Retrieve the volumes - httpsConnection = http.client.HTTPSConnection(host, port, context=ctx, key_file=key, cert_file=cert) - + httpsConnection = http.client.HTTPSConnection( + data_api_config.host, + data_api_config.port, + context=ctx, + key_file=data_api_config.key, + cert_file=data_api_config.cert + ) httpsConnection.request("POST", url, urlencode(data), headers) @@ -181,7 +183,7 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa ' (', progressbar.FileTransferSpeed(), ')']) while body: - body = response.read(128) + body = response.read(buffer_size) data.write(body) bytes_downloaded += len(body) bar.update(bytes_downloaded) @@ -198,12 +200,13 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa return data + def get_oauth2_token(username, password): # make sure to set the request content-type as application/x-www-form-urlencoded headers = {"Content-type": "application/x-www-form-urlencoded"} - data = { "grant_type": "client_credentials", - "client_secret": password, - "client_id": username } + data = {"grant_type": "client_credentials", + "client_secret": password, + "client_id": username} data = urlencode(data) # create an SSL context @@ -235,20 +238,21 @@ def get_oauth2_token(username, password): logging.debug("Response Code: {}".format(response.status)) logging.debug("Response: {}".format(response.reason)) logging.debug(response.read()) - raise EnvironmentError("Unable to get token.") + raise EnvironmentError("Unable to get the token.") if httpsConnection is not None: httpsConnection.close() return token + def grep(file_name, output_dir, pattern): na_volume = [] for line in open(file_name): if pattern in line: na_volume.append(line.split()[-1]) if len(na_volume) < 100: - print("\nFollowing volume ids are not available.") + print("\nThe following volume ids are not available:") print("\n".join(str(item) for item in na_volume)) with open(os.path.join(output_dir, "volume_not_available.txt"), "w") as volume_na: volume_na.write("\n".join(str(item) for item in na_volume)) @@ -256,185 +260,154 @@ def grep(file_name, output_dir, pattern): if len(na_volume) == 100: print("\nThere are 100 or more unavailable volumes.\nTo check the validity of volumes in your workset or volume id file go to:\n https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org for assistance.") + def check_error_file(output_dir): file_name = "ERROR.err" if output_dir.endswith("/"): - file_path = output_dir+ file_name + file_path = output_dir + file_name else: - file_path = output_dir+"/"+file_name + file_path = output_dir + "/" + file_name if os.path.isfile(file_path): grep(file_path, output_dir, "KeyNotFoundException") -def remove_hf(output_dir): - os.makedirs(os.path.join(output_dir, "removed_hf_data"), exist_ok = True) - removed_hf = os.path.join(output_dir, "removed_hf_data") - vol_paths = glob.glob(os.path.join(output_dir,'**')) - df = pd.DataFrame() - - - for path in tqdm(vol_paths): - if os.path.isdir(path): - page_paths = sorted(glob.glob(os.path.join(path, '**', '*.txt'), 
recursive=True)) - n = len(page_paths) - num = 1 - - while num <= n: - for pg in page_paths: - parsed_path = str(path).split('/') - clean_path_root = '/'.join(parsed_path) - page_num = str(num).zfill(8) - new_filename = page_num+'.txt' - os.rename(pg, clean_path_root+'/'+new_filename) - num += 1 - - folder = os.path.basename(path) - n_pgs = len(fnmatch.filter(os.listdir(path), "*.txt")) - pages = parse_page_structure(load_vol(path, num_pages=n_pgs)) - - body = [] - for n, page in enumerate(pages): - s = "\nPage {} (has_header: {}, has_body: {}, has_footer: {})".format(n+1, page.has_header, page.has_body, page.has_footer) - - pg_boolean = s + "\n" + "-"*len(s) - pg_header = "Header:\n{}".format(page.header if page.has_header else "N/A") - #pg_body = page.body if page.has_body else "" - pg_footer = "Footer:\n{}".format(page.footer if page.has_footer else "N/A") - - body.append(page.body) - - df = df.append({"Volume":folder, "Page Info":pg_boolean, "Header":pg_header, "Footer":pg_footer}, ignore_index = True) - df.sort_values("Volume") - for i, g in df.groupby("Volume"): - g.to_csv(os.path.join(removed_hf, "removed_hf_data_{}.csv".format(i))) - - count = 1 - for item in body: - pg_n = str(count).zfill(8) - filename = '{}.txt'.format(pg_n) - count += 1 - with open(os.path.join(clean_path_root, filename), "w") as f_out: - f_out.write('{}\n'.format(item)) - -def remove_hf_concat(output_dir): - os.makedirs(os.path.join(output_dir, "removed_hf_data"), exist_ok = True) - removed_hf = os.path.join(output_dir, "removed_hf_data") - vol_paths = glob.glob(os.path.join(output_dir,'**')) - df = pd.DataFrame() - retain = ["removed_hf_data"] - rm_txt = "removed_hf_data.txt" - - - for path in tqdm(vol_paths): - if os.path.isdir(path): - page_paths = sorted(glob.glob(os.path.join(path, '**', '*.txt'), recursive=True)) - n = len(page_paths) - num = 1 - - while num <= n: - for pg in page_paths: - parsed_path = str(path).split('/') - clean_path_root = '/'.join(parsed_path) - page_num = str(num).zfill(8) - new_filename = page_num+'.txt' - os.rename(pg, clean_path_root+'/'+new_filename) - num += 1 - - folder = os.path.basename(path) - n_pgs = len(fnmatch.filter(os.listdir(path), "*.txt")) - pages = parse_page_structure(load_vol(path, num_pages=n_pgs)) - - filename = '{}.txt'.format(folder) - body = [] - for n, page in enumerate(pages): - s = "\nPage {} (has_header: {}, has_body: {}, has_footer: {})".format(n+1, page.has_header, page.has_body, page.has_footer) - - pg_boolean = s + "\n" + "-"*len(s) - pg_header = "Header:\n{}".format(page.header if page.has_header else "N/A") - #pg_body = page.body if page.has_body else "" - pg_footer = "Footer:\n{}".format(page.footer if page.has_footer else "N/A") - - body.append(page.body) - - df = df.append({"Volume":folder, "Page Info":pg_boolean, "Header":pg_header, "Footer":pg_footer}, ignore_index = True) - df.sort_values("Volume") - for i, g in df.groupby("Volume"): - g.to_csv(os.path.join(removed_hf, "removed_hf_data_{}.csv".format(i))) - - - with open(os.path.join(output_dir, filename), "w") as f_out: - f_out.write('\n'.join([str(item) + '\n' for item in body]) + '\n') - if folder not in retain: - shutil.rmtree(os.path.join(output_dir, folder)) - if os.path.exists(os.path.join(output_dir, rm_txt)): - os.remove(os.path.join(output_dir, rm_txt)) - - - -def download_volumes(volume_ids, output_dir, username=None, password=None, - config_path=None, token=None, headfootcon=False, headfoot=False, concat=False, mets=False, pages=False, host=None, port=None, cert=None, key=None, 
epr=None): - # create output_dir folder, if nonexistant - if not os.path.isdir(output_dir): - os.makedirs(output_dir) - - # get token if not specified - if not token: - token = htrc.config.get_jwt_token() - htrc.config.remove_jwt_token() - - if not host: - host= htrc.config.get_dataapi_host() - - if not port: - port = htrc.config.get_dataapi_port() - - if not epr: - epr = htrc.config.get_dataapi_epr() - - if not cert: - cert = htrc.config.get_dataapi_cert() - - if not key: - key = htrc.config.get_dataapi_key() - - if any((token, host, port)) is not None: - logging.info("obtained token: %s\n" % token) + +def _to_htrc_page(page_file, zip): + with TextIOWrapper(BytesIO(zip.read(page_file)), encoding='utf-8') as page: + return HtrcPage([line.rstrip() for line in page.readlines()]) + + +def download_volumes(volume_ids, output_dir, concat=False, mets=False, pages=False, + remove_headers_footers=False, hf_window_size=6, hf_min_similarity=0.7, save_removed_hf=True, + parallelism=multiprocessing.cpu_count(), batch_size=250, data_api_config=None): + if not 0 < parallelism <= multiprocessing.cpu_count(): + raise ValueError("Invalid parallelism level specified") + + remove_hf_fun = partial( + _remove_headers_footers_and_save, + concat=concat, + hf_min_similarity=hf_min_similarity, + hf_window_size=hf_window_size, + save_removed_hf=save_removed_hf, + output_dir=output_dir + ) + + volume_ids = list(set(volume_ids)) # ensure unique volume ids + num_vols = len(volume_ids) + + data_api_config = data_api_config or htrc.config.HtrcDataApiConfig() + + os.makedirs(output_dir, exist_ok=True) + + if any((data_api_config.token, data_api_config.host, data_api_config.port)) is not None: + logging.info("obtained token: %s\n" % data_api_config.token) try: - for ids in split_items(volume_ids, 250): - if pages: - if concat & mets: - raise ValueError("Cannot set both concat and mets with pages.") + errors = [] + rights = [] + + with tqdm(total=num_vols) as progress, multiprocessing.Pool(processes=parallelism) as pool: + for ids in split_items(volume_ids, batch_size): + if pages: + if concat and mets: + raise ValueError("Cannot set both concat and mets with pages.") + else: + data = get_pages(data_api_config, ids, concat and not remove_headers_footers, mets) else: - data = get_pages(token, ids, host, port, cert, key, epr, concat, mets) - else: - data = get_volumes(token, ids, host, port, cert, key, epr, concat, mets) + data = get_volumes(data_api_config, ids, concat and not remove_headers_footers, mets) + + volumes = [] + + with ZipFile(BytesIO(data)) as vols_zip: + zip_list = vols_zip.namelist() + if 'ERROR.err' in zip_list: + errors.append(vols_zip.read('ERROR.err').decode('utf-8')) + zip_list.remove('ERROR.err') + if 'volume-rights.txt' in zip_list: + rights_data = vols_zip.read('volume-rights.txt').decode('utf-8') + zip_list.remove('volume-rights.txt') + if not rights: + rights.append(rights_data) + else: + # due to the format in which 'volume-rights.txt' is created, we have to skip + # the first 4 lines which make up the header of the file, to extract only the + # actual volume rights data for accumulation + rights.append(''.join(rights_data.splitlines(keepends=True)[4:])) + + zip_volume_paths = [zip_vol_path for zip_vol_path in zip_list if zip_vol_path.endswith('/')] + num_vols_in_zip = len(zip_volume_paths) + + if not remove_headers_footers: + vols_zip.extractall(output_dir, members=zip_list) + progress.update(num_vols_in_zip) + else: + for zip_vol_path in zip_volume_paths: + sorted_vol_zip_page_paths = 
sorted(zip_page_path for zip_page_path in zip_list if zip_page_path.startswith(zip_vol_path) and not zip_page_path.endswith('/')) + vol_pages = [_to_htrc_page(page_path, vols_zip) for page_path in sorted_vol_zip_page_paths] + volumes.append((zip_vol_path, sorted_vol_zip_page_paths, vol_pages)) + + del data, vols_zip + + num_missing = batch_size - num_vols_in_zip if num_vols >= batch_size else num_vols - num_vols_in_zip + progress.update(num_missing) # update progress bar state to include the missing volumes also + + # `volumes` will be empty if `remove_headers_footers=False` since the ZIP was extracted + # without further processing + if volumes: + for _ in pool.imap_unordered(remove_hf_fun, volumes): + progress.update() + + if errors: + with open(os.path.join(output_dir, 'ERROR.err'), 'w') as err_file: + err_file.write(''.join(errors)) + check_error_file(output_dir) - myzip = ZipFile(BytesIO(data)) - myzip.extractall(output_dir) - myzip.close() + if rights: + with open(os.path.join(output_dir, 'volume-rights.txt'), 'w') as rights_file: + rights_file.write(''.join(rights)) - check_error_file(output_dir) - d = os.listdir(output_dir) - if headfoot: - if len(d) == 0: - print("This directory is empty") - sys.exit(1) - else: - remove_hf(output_dir) - if headfootcon: - if len(d) == 0: - print("This directory is empty") - sys.exit(1) - else: - remove_hf_concat(output_dir) - except socket.error: raise RuntimeError("Data API request timeout. Is your Data Capsule in Secure Mode?") else: - raise RuntimeError("Failed to obtain jwt token.") + raise RuntimeError("Failed to obtain the JWT token.") + + +def _remove_headers_footers_and_save(vol_data, concat, hf_min_similarity, hf_window_size, save_removed_hf, output_dir): + zip_vol_path, sorted_vol_zip_page_paths, vol_pages = vol_data + clean_volid = zip_vol_path[:-1] + + vol_pages = parse_page_structure(vol_pages, window_size=hf_window_size, min_similarity_ratio=hf_min_similarity) + pages_body = (page.body for page in vol_pages) + + if concat: + with open(os.path.join(output_dir, clean_volid + '.txt'), 'w', encoding='utf-8') as vol_file: + vol_file.write('\n'.join(pages_body)) + else: + vol_path = os.path.join(output_dir, zip_vol_path) + os.mkdir(vol_path) + for vol_page_path, page_body in zip(sorted_vol_zip_page_paths, pages_body): + with open(os.path.join(output_dir, vol_page_path), 'w', encoding='utf-8') as page_file: + page_file.write(page_body) + + if save_removed_hf: + # save the removed headers/footers for user inspection + removed_hf = [] + for vol_page_path, vol_page in zip(sorted_vol_zip_page_paths, vol_pages): + if not (vol_page.has_header or vol_page.has_footer): + # skip reporting pages that don't have an identified header or footer + continue + _, page_name = os.path.split(vol_page_path) + page_name, _ = os.path.splitext(page_name) + removed_hf.append({'page': page_name, 'header': vol_page.header, 'footer': vol_page.footer}) + + if concat: + removed_hf_filename = os.path.join(output_dir, clean_volid + '_removed_hf.csv') + else: + removed_hf_filename = os.path.join(output_dir, clean_volid, 'removed_hf.csv') + + pd.DataFrame(removed_hf, columns=['page', 'header', 'footer']).to_csv(removed_hf_filename, index=False) def download(args): @@ -442,9 +415,22 @@ def download(args): with open(args.file) as IDfile: volumeIDs = [line.strip() for line in IDfile] - return download_volumes(volumeIDs, args.output, - username=args.username, password=args.password, - token=args.token, headfoot=args.headfoot, headfootcon=args.headfootcon, concat=args.concat, 
mets=args.mets, pages=args.pages, host=args.datahost, - port=args.dataport, cert=args.datacert, key=args.datakey, - epr=args.dataepr) + data_api_config = htrc.config.HtrcDataApiConfig( + token=args.token, + host=args.datahost, + port=args.dataport, + epr=args.dataepr, + cert=args.datacert, + key=args.datakey + ) + return download_volumes(volumeIDs, args.output, + remove_headers_footers=args.remove_headers_footers or args.remove_headers_footers_and_concat, + concat=args.concat or args.remove_headers_footers_and_concat, + mets=args.mets, + pages=args.pages, + hf_window_size=args.window_size, + hf_min_similarity=args.min_similarity_ratio, + parallelism=args.parallelism, + batch_size=args.batch_size, + data_api_config=data_api_config) diff --git a/tests/test_htrc_volumes.py b/tests/test_htrc_volumes.py index d4d9abf..752cbf4 100644 --- a/tests/test_htrc_volumes.py +++ b/tests/test_htrc_volumes.py @@ -60,27 +60,53 @@ def test_get_volumes_and_pages(self, https_mock): response_mock.read.return_value =\ ''.encode('utf8') https_mock.return_value.getresponse.return_value = response_mock - - htrc.volumes.get_volumes('1234', self.test_vols, 'data-host', '443', '/home/client-certs/client.pem', '/home/client-certs/client.pem', '/' ) - htrc.volumes.get_pages('1234', self.test_vols, 'data-host', '443', '/home/client-certs/client.pem', '/home/client-certs/client.pem', '/') + data_api_config = htrc.config.HtrcDataApiConfig( + token='1234', + host='data-host', + port=443, + epr='/', + cert='/home/client-certs/client.pem', + key='/home/client-certs/client.pem' + ) + + htrc.volumes.get_volumes(data_api_config, self.test_vols) + htrc.volumes.get_pages(data_api_config, self.test_vols) @patch('htrc.volumes.http.client.HTTPSConnection') def test_get_volumes_and_pages_error(self, https_mock): response_mock = Mock(status=500) https_mock.return_value.getresponse.return_value = response_mock + data_api_config = htrc.config.HtrcDataApiConfig( + token='1234', + host='data-host', + port=443, + epr='/', + cert='/home/client-certs/client.pem', + key='/home/client-certs/client.pem' + ) + with self.assertRaises(EnvironmentError): - htrc.volumes.get_volumes('1234', self.test_vols, 'data-host', '443', '/home/client-certs/client.pem', '/home/client-certs/client.pem', '/' ) + htrc.volumes.get_volumes(data_api_config, self.test_vols) with self.assertRaises(EnvironmentError): - htrc.volumes.get_pages('1234', self.test_vols, 'data-host', '443', '/home/client-certs/client.pem', '/home/client-certs/client.pem', '/') + htrc.volumes.get_pages(data_api_config, self.test_vols) def test_get_volumes_and_pages_empty(self): + data_api_config = htrc.config.HtrcDataApiConfig( + token='1234', + host='data-host', + port=443, + epr='/', + cert='/home/client-certs/client.pem', + key='/home/client-certs/client.pem' + ) + with self.assertRaises(ValueError): - htrc.volumes.get_volumes('1234', [], 'data-host', '443', '/home/client-certs/client.pem', '/home/client-certs/client.pem', '/' ) + htrc.volumes.get_volumes(data_api_config, []) with self.assertRaises(ValueError): - htrc.volumes.get_pages('1234', [], 'data-host', '443', '/home/client-certs/client.pem', '/home/client-certs/client.pem', '/' ) + htrc.volumes.get_pages(data_api_config, []) @patch('htrc.volumes.ZipFile') @patch('htrc.volumes.get_volumes') @@ -93,14 +119,21 @@ def test_download_volumes(self, https_mock, oauth2_mock, volumes_mock, oauth2_mock.return_value = 'a1b2c3d4e5' volumes_mock.return_value = b'' - htrc.volumes.download_volumes(self.test_vols, self.output_path, - username='1234', 
password='1234', token='1234') + data_api_config = htrc.config.HtrcDataApiConfig( + token='1234', + host='data-host', + port=443, + epr='/', + cert='/home/client-certs/client.pem', + key='/home/client-certs/client.pem' + ) + + htrc.volumes.download_volumes(self.test_vols, self.output_path, data_api_config=data_api_config) # test directory creation import shutil shutil.rmtree(self.output_path) - htrc.volumes.download_volumes(self.test_vols, self.output_path, - username='1234', password='1234', token='1234') + htrc.volumes.download_volumes(self.test_vols, self.output_path, data_api_config=data_api_config) # TODO: Fix this test for case where config file exists, but creds not set """ @@ -132,6 +165,7 @@ def test_download_volumes_saved_creds(self, https_mock, oauth2_mock, volumes_moc def test_download(self): pass + suite = unittest.TestLoader().loadTestsFromTestCase(TestVolumes) unittest.TextTestRunner(verbosity=2).run(suite) From 6afff024474db6022d969d7e25ed1b4b0e877112 Mon Sep 17 00:00:00 2001 From: Boris Capitanu Date: Thu, 25 Mar 2021 10:10:35 -0500 Subject: [PATCH 41/49] Added cmd line option for user to specify that they want the removed headers/footers saved for inspection (no longer turned on by default --- user must specify!) --- htrc/__main__.py | 23 ++++++++++++----------- htrc/volumes/__init__.py | 1 + 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/htrc/__main__.py b/htrc/__main__.py index add1f0f..d82cb75 100644 --- a/htrc/__main__.py +++ b/htrc/__main__.py @@ -6,7 +6,6 @@ from future import standard_library standard_library.install_aliases() -import json import os import os.path import shutil @@ -30,15 +29,15 @@ def download_parser(parser=None): parser.add_argument("-u", "--username", help="HTRC username") parser.add_argument("-p", "--password", help="HTRC password") parser.add_argument("file", nargs='?', default=sys.stdin, - help="workset path[s]") + help="Workset path[s]") parser.add_argument("-f", "--force", action='store_true', - help="remove folder if exists") - parser.add_argument("-o", "--output", help="output directory", + help="Remove folder if exists") + parser.add_argument("-o", "--output", help="Output directory", default='/media/secure_volume/workset/') parser.add_argument("-hf", "--remove-headers-footers", action='store_true', - help="remove headers and footers from individual pages") + help="Remove headers and footers from individual pages") parser.add_argument("-hfc", "--remove-headers-footers-and-concat", action='store_true', - help="remove headers and footers from individual pages then concatenate pages") + help="Remove headers and footers from individual pages then concatenate pages") parser.add_argument("-w", "--window-size", required=False, type=int, metavar="N", default=6, help="How many pages ahead does the header/footer extractor algorithm look to find potential " "matching headers/footers (higher value gives potentially more accurate results on lower " @@ -48,14 +47,16 @@ def download_parser(parser=None): "algorithm to declare that two headers are considered 'the same' (the higher the value, up " "to a max of 1.0, the more strict the matching has to be; lower values allow for more " "fuzziness to account for OCR errors)") + parser.add_argument("-s", "--save-removed-hf", action='store_true', + help="Save a report of the removed headers and footers for each page for inspection") parser.add_argument("--parallelism", required=False, type=int, metavar="N", default=os.cpu_count(), help="The max number of concurrent tasks to start when 
downloading or removing headers/footers") parser.add_argument("--batch-size", required=False, type=int, metavar="N", default=250, help="The max number of volumes to download at a time from DataAPI") parser.add_argument("-c", "--concat", action='store_true', - help="concatenate a volume's pages in to a single file") + help="Concatenate a volume's pages in to a single file") parser.add_argument("-m", "--mets", action='store_true', - help="add volume's METS file") + help="Add volume's METS file") parser.add_argument("-pg", "--pages",action='store_true', help="Download given page numbers of a volumes.") parser.add_argument("-t", "--token", help="JWT for volumes download.") @@ -70,13 +71,13 @@ def download_parser(parser=None): def add_workset_path(parser=None): if parser is None: parser = ArgumentParser() - parser.add_argument("path", nargs='+', help="workset path[s]") + parser.add_argument("path", nargs='+', help="Workset path[s]") return parser def main(): parser = ArgumentParser() - parser.add_argument('-d', '--debug', help="print long debug messages", + parser.add_argument('-d', '--debug', help="Print long debug messages", action='store_true') parsers = parser.add_subparsers(help="select a command") @@ -101,7 +102,7 @@ def main(): # Run helper parser_run = parsers.add_parser('run', help="Run a built-in algorithm.") - run_parsers = parser_run.add_subparsers(help="select a command") + run_parsers = parser_run.add_subparsers(help="Select a command") parser_mallet = run_parsers.add_parser('mallet') htrc.tools.mallet.populate_parser(parser_mallet) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index 865b826..9d58a82 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -433,4 +433,5 @@ def download(args): hf_min_similarity=args.min_similarity_ratio, parallelism=args.parallelism, batch_size=args.batch_size, + save_removed_hf=args.save_removed_hf, data_api_config=data_api_config) From bc43c59b28684a14118f6698999f77805ecf6434 Mon Sep 17 00:00:00 2001 From: David K Date: Wed, 31 Mar 2021 17:34:28 -0400 Subject: [PATCH 42/49] Changed defaults for -s flag Changed -s flag to mean skip-removed-hf so that users only call the flag if they do NOT wish the .csv file of removed headers/footers to be saved to the output directory. 
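
In effect, the renamed flag is an opt-out rather than an opt-in. A minimal sketch of the intended semantics, using only the argument names from the patch (the surrounding parser setup is assumed):

    from argparse import ArgumentParser

    parser = ArgumentParser()
    # store_true defaults to False, so the removed-headers/footers report
    # is produced unless the user explicitly passes -s / --skip-removed-hf
    parser.add_argument("-s", "--skip-removed-hf", action='store_true',
                        help="Skip creating a saved report of the removed headers and footers")

    args = parser.parse_args(["-s"])
    assert args.skip_removed_hf is True   # report suppressed
    args = parser.parse_args([])
    assert args.skip_removed_hf is False  # report saved (the default)
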
---
 htrc/__main__.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/htrc/__main__.py b/htrc/__main__.py
index d82cb75..297ac5f 100644
--- a/htrc/__main__.py
+++ b/htrc/__main__.py
@@ -35,9 +35,9 @@ def download_parser(parser=None):
     parser.add_argument("-o", "--output", help="Output directory",
                         default='/media/secure_volume/workset/')
     parser.add_argument("-hf", "--remove-headers-footers", action='store_true',
-                        help="Remove headers and footers from individual pages")
+                        help="Remove headers and footers from individual pages and save in a separate csv file for inspection")
     parser.add_argument("-hfc", "--remove-headers-footers-and-concat", action='store_true',
-                        help="Remove headers and footers from individual pages then concatenate pages")
+                        help="Remove headers and footers from individual pages and save in a separate csv file for inspection then concatenate pages")
     parser.add_argument("-w", "--window-size", required=False, type=int, metavar="N", default=6,
                         help="How many pages ahead does the header/footer extractor algorithm look to find potential "
                              "matching headers/footers (higher value gives potentially more accurate results on lower "
@@ -47,8 +47,8 @@ def download_parser(parser=None):
                              "algorithm to declare that two headers are considered 'the same' (the higher the value, up "
                              "to a max of 1.0, the more strict the matching has to be; lower values allow for more "
                              "fuzziness to account for OCR errors)")
-    parser.add_argument("-s", "--save-removed-hf", action='store_true',
-                        help="Save a report of the removed headers and footers for each page for inspection")
+    parser.add_argument("-s", "--skip-removed-hf", action='store_true',
+                        help="Skip creating a saved report of the removed headers and footers for each page for inspection")
     parser.add_argument("--parallelism", required=False, type=int, metavar="N", default=os.cpu_count(),
                         help="The max number of concurrent tasks to start when downloading or removing headers/footers")
     parser.add_argument("--batch-size", required=False, type=int, metavar="N", default=250,

From ef4dd442e15d267557926cee30aba2f59dd9689c Mon Sep 17 00:00:00 2001
From: David K
Date: Wed, 31 Mar 2021 17:38:17 -0400
Subject: [PATCH 43/49] Changed -s flag default

Changed the -s flag so that, when -hf or -hfc is used, the .csv files
containing the removed headers/footers are saved to the output directory by
default. Passing -s together with -hf or -hfc now skips creating those .csv
files.
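
The change in this patch amounts to gating the report generation on the new skip_removed_hf parameter; the cleaned page bodies are written out either way. A condensed sketch of that control flow (identifiers such as parse_page_structure, hf_window_size, hf_min_similarity, has_header, has_footer, and the CSV columns come from the diff below; write_pages, page_names, and removed_hf_filename are hypothetical stand-ins for the file-writing and path-handling code spelled out in the patch):

    # inside _remove_headers_footers_and_save, after the pages are loaded
    vol_pages = parse_page_structure(vol_pages, window_size=hf_window_size,
                                     min_similarity_ratio=hf_min_similarity)

    # page bodies are always written, concatenated or one file per page
    write_pages(vol_pages, concat=concat)  # hypothetical helper

    if not skip_removed_hf:
        # report only pages where a header or footer was actually detected
        removed_hf = [{'page': name, 'header': p.header, 'footer': p.footer}
                      for name, p in zip(page_names, vol_pages)
                      if p.has_header or p.has_footer]
        pd.DataFrame(removed_hf, columns=['page', 'header', 'footer']) \
            .to_csv(removed_hf_filename, index=False)
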
--- htrc/volumes/__init__.py | 42 +++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index 9d58a82..4b638f3 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -279,7 +279,7 @@ def _to_htrc_page(page_file, zip): def download_volumes(volume_ids, output_dir, concat=False, mets=False, pages=False, - remove_headers_footers=False, hf_window_size=6, hf_min_similarity=0.7, save_removed_hf=True, + remove_headers_footers=False, hf_window_size=6, hf_min_similarity=0.7, skip_removed_hf=False, parallelism=multiprocessing.cpu_count(), batch_size=250, data_api_config=None): if not 0 < parallelism <= multiprocessing.cpu_count(): raise ValueError("Invalid parallelism level specified") @@ -289,7 +289,7 @@ def download_volumes(volume_ids, output_dir, concat=False, mets=False, pages=Fal concat=concat, hf_min_similarity=hf_min_similarity, hf_window_size=hf_window_size, - save_removed_hf=save_removed_hf, + skip_removed_hf=skip_removed_hf, output_dir=output_dir ) @@ -374,25 +374,34 @@ def download_volumes(volume_ids, output_dir, concat=False, mets=False, pages=Fal raise RuntimeError("Failed to obtain the JWT token.") -def _remove_headers_footers_and_save(vol_data, concat, hf_min_similarity, hf_window_size, save_removed_hf, output_dir): +def _remove_headers_footers_and_save(vol_data, concat, hf_min_similarity, hf_window_size, skip_removed_hf, output_dir): zip_vol_path, sorted_vol_zip_page_paths, vol_pages = vol_data clean_volid = zip_vol_path[:-1] vol_pages = parse_page_structure(vol_pages, window_size=hf_window_size, min_similarity_ratio=hf_min_similarity) pages_body = (page.body for page in vol_pages) - - if concat: - with open(os.path.join(output_dir, clean_volid + '.txt'), 'w', encoding='utf-8') as vol_file: - vol_file.write('\n'.join(pages_body)) + # save the removed headers/footers for user inspection + if skip_removed_hf: + if concat: + with open(os.path.join(output_dir, clean_volid + '.txt'), 'w', encoding='utf-8') as vol_file: + vol_file.write('\n'.join(pages_body)) + else: + vol_path = os.path.join(output_dir, zip_vol_path) + os.mkdir(vol_path) + for vol_page_path, page_body in zip(sorted_vol_zip_page_paths, pages_body): + with open(os.path.join(output_dir, vol_page_path), 'w', encoding='utf-8') as page_file: + page_file.write(page_body) else: - vol_path = os.path.join(output_dir, zip_vol_path) - os.mkdir(vol_path) - for vol_page_path, page_body in zip(sorted_vol_zip_page_paths, pages_body): - with open(os.path.join(output_dir, vol_page_path), 'w', encoding='utf-8') as page_file: - page_file.write(page_body) - - if save_removed_hf: - # save the removed headers/footers for user inspection + if concat: + with open(os.path.join(output_dir, clean_volid + '.txt'), 'w', encoding='utf-8') as vol_file: + vol_file.write('\n'.join(pages_body)) + else: + vol_path = os.path.join(output_dir, zip_vol_path) + os.mkdir(vol_path) + for vol_page_path, page_body in zip(sorted_vol_zip_page_paths, pages_body): + with open(os.path.join(output_dir, vol_page_path), 'w', encoding='utf-8') as page_file: + page_file.write(page_body) + removed_hf = [] for vol_page_path, vol_page in zip(sorted_vol_zip_page_paths, vol_pages): if not (vol_page.has_header or vol_page.has_footer): @@ -408,6 +417,7 @@ def _remove_headers_footers_and_save(vol_data, concat, hf_min_similarity, hf_win removed_hf_filename = os.path.join(output_dir, clean_volid, 'removed_hf.csv') pd.DataFrame(removed_hf, columns=['page', 'header', 
'footer']).to_csv(removed_hf_filename, index=False) + def download(args): @@ -433,5 +443,5 @@ def download(args): hf_min_similarity=args.min_similarity_ratio, parallelism=args.parallelism, batch_size=args.batch_size, - save_removed_hf=args.save_removed_hf, + skip_removed_hf=args.skip_removed_hf, data_api_config=data_api_config) From a4832b70f1ef20f1823822f989e7f5833aaa6265 Mon Sep 17 00:00:00 2001 From: David K Date: Thu, 1 Apr 2021 16:26:13 -0400 Subject: [PATCH 44/49] Update cli.rst --- docs/source/cli.rst | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/docs/source/cli.rst b/docs/source/cli.rst index 8101173..2a0ff7c 100644 --- a/docs/source/cli.rst +++ b/docs/source/cli.rst @@ -125,6 +125,23 @@ Following are the use cases and examples of ``htrc`` commands inside the HTRC Da * Download volumes, extract headers/footers from the volume pages then concatenate the pages - (This will concatenate all the pages of the volume into one txt file.) : ``htrc download -hfc /home/dcuser/HTRC/htrc-id`` + +* Download volumes, extract headers/footers from the volumes, skip downloading the .csv files containing removed headers and footers : + + ``htrc download -hf -s /home/dcuser/HTRC/htrc-id`` + +* Download volumes, extract headers/footers from volumes, change window of pages in extractor algorithm (The default is 6, lower numbers increase speed, but are less accurate) : + + ``htrc download -hf -w 3 /home/dcuser/HTRC/htrc-id`` + +* Download volumes, extract headers/footers from volumes, change minimum similarity rate for lines on pages to be considered a header or footer (Default is .7 or 70%, so if a line is 70% the same as other lines on other pages within the window of pages it is labeled a header or footer and removed) : + + ``htrc download -hf -msr .9 /home/dcuser/HTRC/htrc-id`` + +* Download volumes, extract headers/footers from volumes, change the max number of concurrent tasks (note that the only options are 1 or 2): + + ``htrc download -hf --parallelism 2 /home/dcuser/HTRC/htrc-id`` + | +---------------------------------+-----------------------------------------------+ From eb00ece77ba9c0f49ee0750a960eb2a4cb2c1eb5 Mon Sep 17 00:00:00 2001 From: Boris Capitanu Date: Tue, 6 Apr 2021 09:54:05 -0500 Subject: [PATCH 45/49] Fixed formatting issue --- htrc/volumes/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index dc1f035..48de66c 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -368,11 +368,11 @@ def download_volumes(volume_ids, output_dir, concat=False, mets=False, pages=Fal with open(os.path.join(output_dir, 'volumes_not_available.txt'), 'w') as volumes_na: volumes_na.write("\n".join(str(item) for item in na_volumes_all)) - if num_na < 100: - print("\nThe following volume ids are not available. \n Please check volumes_not_available.txt for the " - "complete list. ") - print('\n'.join(str(item) for item in na_volumes_all)) - else: + if num_na < 100: + print("\nThe following volume ids are not available. \n Please check volumes_not_available.txt " + "for the complete list. ") + print('\n'.join(str(item) for item in na_volumes_all)) + else: print("\nThere are {:,} unavailable volumes.\n Please check volumes_not_available.txt " "for the " "complete list. 
\nTo check the validity of volumes in your workset or volume id file go " From aa3b5db6c35f0c51bc16c1c0ffa46503ae9261a0 Mon Sep 17 00:00:00 2001 From: Boris Capitanu Date: Tue, 6 Apr 2021 11:38:24 -0500 Subject: [PATCH 46/49] Fixed another formatting issue --- htrc/volumes/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index 48de66c..c4c11b9 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -432,7 +432,6 @@ def _remove_headers_footers_and_save(vol_data, concat, hf_min_similarity, hf_win removed_hf_filename = os.path.join(output_dir, clean_volid, 'removed_hf.csv') pd.DataFrame(removed_hf, columns=['page', 'header', 'footer']).to_csv(removed_hf_filename, index=False) - def download(args): From 28a5a408f18dd85e5271445d2d17732d0be4d8c2 Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Thu, 8 Apr 2021 10:46:22 -0400 Subject: [PATCH 47/49] Disabled username and password paraser arguments since those are not used. --- htrc/__main__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/htrc/__main__.py b/htrc/__main__.py index 843b9e5..6102ad9 100644 --- a/htrc/__main__.py +++ b/htrc/__main__.py @@ -26,8 +26,8 @@ def download_parser(parser=None): if parser is None: parser = ArgumentParser() - parser.add_argument("-u", "--username", help="HTRC username") - parser.add_argument("-p", "--password", help="HTRC password") + #parser.add_argument("-u", "--username", help="HTRC username") + #parser.add_argument("-p", "--password", help="HTRC password") parser.add_argument("file", nargs='?', default=sys.stdin, help="Workset path[s]") parser.add_argument("-f", "--force", action='store_true', From ca43dd69af3094ef84450f8cee09a44fced2001b Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Mon, 12 Apr 2021 10:03:56 -0400 Subject: [PATCH 48/49] Set final release version. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index bb9bffe..4fa1ed9 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ import atexit import tarfile -__version__ = '0.1.56b0' +__version__ = '0.1.56' install_requires = ['PyLD', 'future', 'prov', 'unicodecsv', 'progressbar2', 'pandas', 'requests', 'argparse==1.1', 'topicexplorer==1.0b226', 'numpy==1.16.2', 'tqdm==4.46.0'] From 5225dc739f0ab5afda5790d48d08e6f451b40f4a Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Mon, 12 Apr 2021 10:05:33 -0400 Subject: [PATCH 49/49] Set pre release version. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4fa1ed9..e2d26f3 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ import atexit import tarfile -__version__ = '0.1.56' +__version__ = '0.1.57b0' install_requires = ['PyLD', 'future', 'prov', 'unicodecsv', 'progressbar2', 'pandas', 'requests', 'argparse==1.1', 'topicexplorer==1.0b226', 'numpy==1.16.2', 'tqdm==4.46.0']
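
The final two patches tag the 0.1.56 release and then immediately move setup.py to the 0.1.57b0 pre-release. Under PEP 440 a pre-release sorts below the final version it precedes, so installs pinned to the release are unaffected by the bump; a quick illustration (the packaging library is not a dependency of this project and is used here only to show the ordering):

    from packaging.version import Version

    # beta builds sort below their final release, which sorts below the next beta
    assert Version("0.1.56b0") < Version("0.1.56") < Version("0.1.57b0")
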