From aab01281d04466a81cdc13a2870999c6dbeb2954 Mon Sep 17 00:00:00 2001
From: David K
Date: Mon, 20 Jul 2020 22:06:49 +0000
Subject: [PATCH 1/8] Made changes to auth.py file

Made changes to auth.py so that it gets the token and saves it to the
.htrc file, where it can then be read by htrc/volumes/__init__.py. I am
getting an API timeout error from the download_volumes function, but
cannot figure out why.
---
 htrc/auth.py   | 41 ++++++++++++++++++++++++-----------------
 htrc/config.py | 20 ++++++++++----------
 2 files changed, 34 insertions(+), 27 deletions(-)

diff --git a/htrc/auth.py b/htrc/auth.py
index a24255d..53c2290 100644
--- a/htrc/auth.py
+++ b/htrc/auth.py
@@ -3,32 +3,39 @@
 import http.client
 import ssl
 import time
-
+import subprocess
 import requests
 import requests.auth
+import configparser
 
 import htrc.config
 
 
 def get_jwt_token():
     # Currently we just store one common jwt token locally at .htrc file for simplicity
     # Expect to add POST method to query unique jwt token with the combo of username and password
-    username, password = credential_prompt()
-
-    client_id, client_secret = htrc.config.get_credentials()
-
-    auth = requests.auth.HTTPBasicAuth(client_id, client_secret)
-    data = { "grant_type": "password",
-    "username": username,
-    "password": password,
-    "scope" : "openid"}
-
-    url = htrc.config.get_idp_url()
-    r = requests.post(url, data=data, auth=auth)
+    #username, password = credential_prompt()
+
+    #client_id, client_secret = htrc.config.get_credentials()
+
+    #auth = requests.auth.HTTPBasicAuth(client_id, client_secret)
+    #data = { "grant_type": "password",
+    #"username": username,
+    #"password": password,
+    #"scope" : "openid"}
+
+    url1 = htrc.config.get_idp_url()
+    capsule_id = htrc.config._get_value("jwt", "capsule_id")
+    result = subprocess.check_output(['hostname', '-s', '-I'])
+    result = result.decode('utf-8')
+    result = result[:-1]
+    capsule_ip = result.strip()
+    url = url1 + "/" + capsule_id + "/" + capsule_ip
+    r = requests.get(url, verify = False)
 
     data = r.json()
     if 'error' not in data:
-        expiration = int(time.time()) + data['expires_in']
-        return data['id_token'], expiration
+        #expiration = int(time.time()) + data['expires_in']
+        return data['token']
     elif data['error'] == 'invalid_grant':
         print("Invalid username or password. Please try again.\n")
         return get_jwt_token()
@@ -51,5 +58,5 @@ def credential_prompt():
 
 if __name__ == '__main__':
-    token, expiration = get_jwt_token()
-    htrc.config.save_jwt_token(token, expiration)
+    token = get_jwt_token()
+    htrc.config.save_jwt_token(token)
diff --git a/htrc/config.py b/htrc/config.py
index ccd7d54..deb0ffe 100644
--- a/htrc/config.py
+++ b/htrc/config.py
@@ -81,19 +81,19 @@ def get_jwt_token(path=None):
         token = _get_value('jwt', 'token', path)
 
         # check expiration date
-        expiration = int(_get_value('jwt', 'expiration', path))
-        if time.time() > expiration:
-            raise RuntimeError("JWT token expired.")
+        #expiration = int(_get_value('jwt', 'expiration', path))
+        #if time.time() > expiration:
+            #raise RuntimeError("JWT token expired.")
     except:
         # This should run on either a missing or expired token.
         import htrc.auth
-        token, expiration = htrc.auth.get_jwt_token()
-        htrc.config.save_jwt_token(token, expiration, path)
+        token = htrc.auth.get_jwt_token()
+        htrc.config.save_jwt_token(token, path)
 
     return token
 
-def save_jwt_token(token, expiration=None, path=None):
+def save_jwt_token(token, path=None):
     """
     Saves JWT token in the config file.
""" @@ -102,8 +102,8 @@ def save_jwt_token(token, expiration=None, path=None): path = DEFAULT_PATH # Default to expiration of now - force a new token on next request - if expiration is None: - expiration = time.time() + #if expiration is None: + #expiration = time.time() # Open and modify existing config file, if it exists. config = ConfigParser(allow_no_value=True) @@ -114,7 +114,7 @@ def save_jwt_token(token, expiration=None, path=None): # set token and expiration config.set('jwt', 'token', token) - config.set('jwt', 'expiration', expiration) + #config.set('jwt', 'expiration', expiration) with open(path, 'w') as credential_file: config.write(credential_file) @@ -137,7 +137,7 @@ def remove_jwt_token(path=None): config.add_section('jwt') # set token and expiration config.set('jwt', 'token', " ") - config.set('jwt', 'expiration', " ") + #config.set('jwt', 'expiration', " ") with open(path, 'w') as credential_file: config.write(credential_file) From 46b9710b50a2f5718c4aaf1f2b70fb61cddd2765 Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Fri, 24 Jul 2020 14:15:56 -0400 Subject: [PATCH 2/8] skip config file for JWT. --- htrc/config.py | 14 ++------------ setup.py | 2 +- 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/htrc/config.py b/htrc/config.py index deb0ffe..eb1dbf7 100644 --- a/htrc/config.py +++ b/htrc/config.py @@ -77,19 +77,9 @@ def get_idp_url(path=None): # Add jwt credential access methods def get_jwt_token(path=None): - try: - token = _get_value('jwt', 'token', path) - - # check expiration date - #expiration = int(_get_value('jwt', 'expiration', path)) - #if time.time() > expiration: - #raise RuntimeError("JWT token expired.") - except: - # This should run on either a missing or expired token. - import htrc.auth - token = htrc.auth.get_jwt_token() - htrc.config.save_jwt_token(token, path) + import htrc.auth + token = htrc.auth.get_jwt_token() return token diff --git a/setup.py b/setup.py index 221915a..bd2ea20 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ import atexit import tarfile -__version__ = '0.1.53' +__version__ = '0.1.54b2' install_requires = ['PyLD', 'future', 'prov', 'unicodecsv', 'progressbar2', 'requests', 'argparse==1.1', 'topicexplorer==1.0b226'] From 4618d532e1320343ef5d62bffb6d5276f9c9fca0 Mon Sep 17 00:00:00 2001 From: David K Date: Mon, 27 Jul 2020 20:20:41 +0000 Subject: [PATCH 3/8] Add files via upload --- htrc/auth.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/htrc/auth.py b/htrc/auth.py index 53c2290..cd44c48 100644 --- a/htrc/auth.py +++ b/htrc/auth.py @@ -1,12 +1,12 @@ -from base64 import b64encode +#from base64 import b64encode from getpass import getpass -import http.client -import ssl -import time +#import http.client +#import ssl +#import time import subprocess import requests import requests.auth -import configparser +#import configparser import htrc.config @@ -30,7 +30,7 @@ def get_jwt_token(): result = result[:-1] capsule_ip = result.strip() url = url1 + "/" + capsule_id + "/" + capsule_ip - r = requests.get(url, verify = False) + r = requests.get(url) data = r.json() if 'error' not in data: From cd6e90db3cff8d97d74af2ac7d6641c47667785c Mon Sep 17 00:00:00 2001 From: David K Date: Mon, 27 Jul 2020 20:22:39 +0000 Subject: [PATCH 4/8] Commented out unused functions Commented out unused functions and packages. 
--- htrc/volumes/__init__.py | 85 ++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 43 deletions(-) diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index 6ddb9a7..6c8787a 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -12,30 +12,29 @@ from future import standard_library standard_library.install_aliases() -from builtins import input +#from builtins import input import http.client from io import BytesIO # used to stream http response into zipfile. -import json +#import json import logging import os.path import progressbar -import re +#import re import socket import ssl -import sys -from time import sleep -from urllib.request import urlopen -from urllib.error import HTTPError -from urllib.parse import quote_plus, urlencode -import xml.etree.ElementTree as ET +#import sys +#from time import sleep +#from urllib.request import urlopen +#from urllib.error import HTTPError +from urllib.parse import urlencode +#import xml.etree.ElementTree as ET from zipfile import ZipFile # used to decompress requested zip archives. -from htrc.lib.cli import bool_prompt +#from htrc.lib.cli import bool_prompt from htrc.util import split_items import htrc.config -import logging from logging import NullHandler logging.getLogger(__name__).addHandler(NullHandler()) @@ -77,7 +76,7 @@ def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, met # TODO: Fix SSL cert verification ctx = ssl.create_default_context() ctx.check_hostname = False - ctx.verify_mode = ssl.CERT_NONE + #ctx.verify_mode = ssl.CERT_NONE # Retrieve the volumes httpsConnection = http.client.HTTPSConnection(host, port, context=ctx, key_file=key, cert_file=cert) @@ -154,7 +153,7 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa # TODO: Fix SSL cert verification ctx = ssl.create_default_context() ctx.check_hostname = False - ctx.verify_mode = ssl.CERT_NONE + #ctx.verify_mode = ssl.CERT_NONE # Retrieve the volumes httpsConnection = http.client.HTTPSConnection(host, port, context=ctx, key_file=key, cert_file=cert) @@ -191,49 +190,49 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa return data -def get_oauth2_token(username, password): +#def get_oauth2_token(username, password): # make sure to set the request content-type as application/x-www-form-urlencoded - headers = {"Content-type": "application/x-www-form-urlencoded"} - data = { "grant_type": "client_credentials", - "client_secret": password, - "client_id": username } - data = urlencode(data) + #headers = {"Content-type": "application/x-www-form-urlencoded"} + #data = { "grant_type": "client_credentials", + #"client_secret": password, + #"client_id": username } + #data = urlencode(data) # create an SSL context - ctx = ssl.create_default_context() - ctx.check_hostname = False - ctx.verify_mode = ssl.CERT_NONE + #ctx = ssl.create_default_context() + #ctx.check_hostname = False + #ctx.verify_mode = ssl.CERT_NONE # make sure the request method is POST - host, port = htrc.config.get_oauth2_host_port() - oauth2port = htrc.config.get_oauth2_port() - oauth2EPRurl = htrc.config.get_oauth2_url() - httpsConnection = http.client.HTTPSConnection(host, oauth2port, context=ctx) - httpsConnection.request("POST", oauth2EPRurl + "?" 
+ data, "", headers)
+    #httpsConnection = http.client.HTTPSConnection(host, oauth2port, context=ctx)
+    #httpsConnection.request("POST", oauth2EPRurl + "?" + data, "", headers)
 
-    response = httpsConnection.getresponse()
+    #response = httpsConnection.getresponse()
 
     # if response status is OK
-    if response.status == 200:
-        data = response.read().decode('utf8')
+    #if response.status == 200:
+        #data = response.read().decode('utf8')
 
-        jsonData = json.loads(data)
-        logging.info("*** JSON: {}".format(jsonData))
+        #jsonData = json.loads(data)
+        #logging.info("*** JSON: {}".format(jsonData))
 
-        token = jsonData["access_token"]
-        logging.info("*** parsed token: {}".format(token))
+        #token = jsonData["access_token"]
+        #logging.info("*** parsed token: {}".format(token))
 
-    else:
-        logging.debug("Unable to get token")
-        logging.debug("Response Code: {}".format(response.status))
-        logging.debug("Response: {}".format(response.reason))
-        logging.debug(response.read())
-        raise EnvironmentError("Unable to get token.")
+    #else:
+        #logging.debug("Unable to get token")
+        #logging.debug("Response Code: {}".format(response.status))
+        #logging.debug("Response: {}".format(response.reason))
+        #logging.debug(response.read())
+        #raise EnvironmentError("Unable to get token.")
 
-    if httpsConnection is not None:
-        httpsConnection.close()
+    #if httpsConnection is not None:
+        #httpsConnection.close()
 
-    return token
+    #return token
 
 def grep(file_name, output_dir, pattern):
     na_volume = []

From eac2786e08341911d94a424653dc0656477fbf4d Mon Sep 17 00:00:00 2001
From: David K
Date: Wed, 5 Aug 2020 18:42:55 +0000
Subject: [PATCH 5/8] Changes to work with Docker

Made changes so the IP address can be found while running in Docker.
---
 htrc/auth.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/htrc/auth.py b/htrc/auth.py
index cd44c48..76b9152 100644
--- a/htrc/auth.py
+++ b/htrc/auth.py
@@ -25,7 +25,7 @@ def get_jwt_token():
 
     url1 = htrc.config.get_idp_url()
     capsule_id = htrc.config._get_value("jwt", "capsule_id")
-    result = subprocess.check_output(['hostname', '-s', '-I'])
+    result = subprocess.check_output("hostname -s -I | awk '{print $1}'", shell=True)
     result = result.decode('utf-8')
     result = result[:-1]
     capsule_ip = result.strip()

From 700f7f42522121b12ad69a65f19a584f6bf86cc1 Mon Sep 17 00:00:00 2001
From: Samitha Liyanage
Date: Thu, 7 Oct 2021 12:23:27 -0400
Subject: [PATCH 6/8] Merge develop into dk_test.
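This merge brings in the HtrcDataApiConfig object and the header/footer
removal pipeline. A minimal sketch of the new programmatic entry point,
mirroring the updated tests below (the token, host, certificate paths, and
volume id are placeholder values):

    import htrc.config
    import htrc.volumes

    data_api_config = htrc.config.HtrcDataApiConfig(
        token='1234',                      # JWT; omitted fields fall back to .htrc values
        host='data-host',
        port=443,
        epr='/',
        cert='/home/client-certs/client.pem',
        key='/home/client-certs/client.pem')

    htrc.volumes.download_volumes(
        ['coo.31924089593846'], '/media/secure_volume/workset',
        remove_headers_footers=True,       # run the new extractor on every page
        hf_window_size=6,                  # pages to look ahead for matching lines
        hf_min_similarity=0.7,             # Levenshtein similarity threshold
        data_api_config=data_api_config)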
---
 .gitignore                      |   2 +
 docs/source/cli.rst             |  39 +++-
 htrc/.htrc.default              |   1 +
 htrc/__main__.py                |  76 ++++++--
 htrc/auth.py                    |   3 +-
 htrc/config.py                  |  42 ++++-
 htrc/hf_utils/__init__.py       | 110 ++++++++++++
 htrc/lib/cli.py                 |   2 +
 htrc/models/__init__.py         |  68 +++++++
 htrc/runningheaders/__init__.py | 163 +++++++++++++++++
 htrc/tools/mallet.py            |   1 +
 htrc/tools/topicexplorer.py     |   1 +
 htrc/util/__init__.py           |   7 +-
 htrc/util/resolve.py            |  20 ++-
 htrc/volumes/__init__.py        | 307 +++++++++++++++++++++++---------
 setup.py                        |   6 +-
 tests/test_htrc_util_resolve.py |   3 +
 tests/test_htrc_volumes.py      |  56 ++++--
 18 files changed, 771 insertions(+), 136 deletions(-)
 create mode 100644 htrc/hf_utils/__init__.py
 create mode 100644 htrc/models/__init__.py
 create mode 100644 htrc/runningheaders/__init__.py

diff --git a/.gitignore b/.gitignore
index d77aad6..efb0815 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,3 +17,5 @@ htrc.egg-info
 .coverage
 htmlcov/
 .eggs
+ssl-cert-trust
+venv/
diff --git a/docs/source/cli.rst b/docs/source/cli.rst
index 0d19316..2a0ff7c 100644
--- a/docs/source/cli.rst
+++ b/docs/source/cli.rst
@@ -1,6 +1,6 @@
 HTRC Workset Toolkit
 ======================
-The HTRC Workset Toolkit provides a command line interface for interacting with
+The HTRC Workset Toolkit provides a command line interface for interacting with 
 and analyzing volumes in the HathiTrust Digital Library:
 
 - Volume Download (``htrc download``)
@@ -11,7 +11,7 @@ and analyzing volumes in the HathiTrust Digital Library:
 
 Workset Path
 --------------
-Each of these commands takes a *workset path*. Valid types of workset paths
+Each of these commands takes a *workset path*. Valid types of workset paths 
 and examples of each are:
 
 ================================== ==============================================================================
@@ -71,7 +71,7 @@ download`_, the
 
 Topic Modeling
 ''''''''''''''''
-There are two implementations of LDA topic modeling supported by the
+There are two implementations of LDA topic modeling supported by the 
 
 
 Arguments
@@ -114,6 +114,35 @@ Following are the use cases and examples of ``htrc`` commands inside the HTRC Da
 
   ``htrc download /home/dcuser/HTRC/htrc-id -o /media/secure_volume/my-workset -c``
 
+* Download specific pages from a single volume :
+
+  ``htrc download -pg coo.31924089593846[5,10,15,20,25,30]``
+
+* Download volumes and then extract headers/footers from the volumes :
+
+  ``htrc download -hf /home/dcuser/HTRC/htrc-id``
+
+* Download volumes, extract headers/footers from the volume pages, then concatenate the pages - (This will concatenate all the pages of the volume into one txt file.) :
+
+  ``htrc download -hfc /home/dcuser/HTRC/htrc-id``
+
+* Download volumes, extract headers/footers from the volumes, skip downloading the .csv files containing removed headers and footers :
+
+  ``htrc download -hf -s /home/dcuser/HTRC/htrc-id``
+
+* Download volumes, extract headers/footers from volumes, change the window of pages in the extractor algorithm (The default is 6; lower numbers increase speed, but are less accurate) :
+
+  ``htrc download -hf -w 3 /home/dcuser/HTRC/htrc-id``
+
+* Download volumes, extract headers/footers from volumes, change the minimum similarity ratio for lines on pages to be considered a header or footer (Default is .7 or 70%, so if a line is 70% the same as other lines on other pages within the window of pages, it is labeled a header or footer and removed) :
+
+  ``htrc download -hf -msr .9 /home/dcuser/HTRC/htrc-id``
+
+* Download volumes, extract headers/footers from volumes, change the max number of concurrent tasks (note that the only options are 1 or 2):
+
+  ``htrc download -hf --parallelism 2 /home/dcuser/HTRC/htrc-id``
+
 
 |
 +---------------------------------+-----------------------------------------------+
 | command: ``htrc metadata``      | capsule mode: **secure** and **maintenance**  |
@@ -246,7 +275,3 @@ Following are the use cases and examples of ``htrc`` commands inside the HTRC Da
 * Run topicexplorer on already downloaded volume - (Sample volumes are available in capsules created with ubuntu-16-04-with-sample-volumes image. Those sample volumes are available as zip files. Please unzip before using them because the metadata function gets volume ids from volume directory names).
 
   ``htrc topicexplorer /home/dcuser/unzipped_volumes -k 20``
-
-
-
-
diff --git a/htrc/.htrc.default b/htrc/.htrc.default
index bfeadee..3ec2327 100644
--- a/htrc/.htrc.default
+++ b/htrc/.htrc.default
@@ -8,6 +8,7 @@ port = 443
 url = /
 cert =
 key =
+pd_only =
 
 [oauth]
 host = silvermaple.pti.indiana.edu
diff --git a/htrc/__main__.py b/htrc/__main__.py
index 04b26b4..6102ad9 100644
--- a/htrc/__main__.py
+++ b/htrc/__main__.py
@@ -6,8 +6,8 @@
 from future import standard_library
 standard_library.install_aliases()
 
-import json
-import os, os.path
+import os
+import os.path
 import shutil
 import sys
 from tempfile import NamedTemporaryFile
@@ -16,6 +16,7 @@ import htrc.volumes
 import htrc.workset
 import htrc.tools.mallet
+
 from argparse import ArgumentParser
 import htrc.tools.topicexplorer
 from htrc.lib.cli import bool_prompt
@@ -25,18 +26,37 @@ def download_parser(parser=None):
     if parser is None:
         parser = ArgumentParser()
-    parser.add_argument("-u", "--username", help="HTRC username")
-    parser.add_argument("-p", "--password", help="HTRC password")
+    #parser.add_argument("-u", "--username", help="HTRC username")
+    #parser.add_argument("-p", "--password", help="HTRC password")
     parser.add_argument("file", nargs='?', default=sys.stdin,
-        help="workset path[s]")
+        help="Workset path[s]")
     parser.add_argument("-f", "--force", action='store_true',
-        help="remove folder if exists")
-    parser.add_argument("-o", "--output", help="output directory",
+        help="Remove folder if exists")
+    parser.add_argument("-o", "--output", help="Output directory",
         default='/media/secure_volume/workset/')
+    parser.add_argument("-hf", "--remove-headers-footers", action='store_true',
+        help="Remove headers and footers from individual pages and save in a separate csv file for inspection")
+    parser.add_argument("-hfc", "--remove-headers-footers-and-concat", action='store_true',
+        help="Remove headers and footers from individual pages and save in a separate csv file for inspection then concatenate pages")
+    parser.add_argument("-w", "--window-size", required=False, type=int, metavar="N", default=6,
+        help="How many pages ahead does the header/footer extractor algorithm look to find potential "
+             "matching headers/footers (higher value gives potentially more accurate results on lower "
+             "quality OCR volumes at the expense of runtime)")
+    parser.add_argument("-msr", "--min-similarity-ratio", required=False, type=float, metavar="N", default=0.7,
+        help="The minimum string similarity ratio required for the Levenshtein distance fuzzy-matching "
+             "algorithm to declare that two headers are considered 'the same' (the higher the value, up "
+             "to a max of 1.0, the more strict the matching has to be; lower values allow for more "
+             "fuzziness to account for OCR errors)")
+    parser.add_argument("-s", "--skip-removed-hf", action='store_true',
+        help="Skip creating a saved report of the removed headers and footers for each page for inspection")
+    parser.add_argument("--parallelism", required=False, type=int, metavar="N", default=os.cpu_count(),
+        help="The max number of concurrent tasks to start when downloading or removing headers/footers")
+    parser.add_argument("--batch-size", required=False, type=int, metavar="N", default=250,
+        help="The max number of volumes to download at a time from DataAPI")
     parser.add_argument("-c", "--concat", action='store_true',
-        help="concatenate a volume's pages into a single file")
+        help="Concatenate a volume's pages into a single file")
     parser.add_argument("-m", "--mets", action='store_true',
-        help="add volume's METS file")
+        help="Add volume's METS file")
     parser.add_argument("-pg", "--pages",action='store_true', help="Download given page numbers of a volume.")
     parser.add_argument("-t", "--token", help="JWT for volumes download.")
@@ -47,17 +67,17 @@
     parser.add_argument("-dk", "--datakey", help="Client key file for mutual TLS with Data API.")
     return parser
 
+
 def add_workset_path(parser=None):
     if parser is None:
         parser = ArgumentParser()
-    parser.add_argument("path", nargs='+', help="workset path[s]")
+    parser.add_argument("path", nargs='+', help="Workset path[s]")
     return parser
 
-
 def main():
     parser = ArgumentParser()
-    parser.add_argument('-d', '--debug', help="print long debug messages",
+    parser.add_argument('-d', '--debug', help="Print long debug messages",
         action='store_true')
     parsers = parser.add_subparsers(help="select a command")
@@ -78,10 +98,11 @@
         help="Download HathiTrust volumes to disk [requires auth]")
     download_parser(parser_download)
     parser_download.set_defaults(func='download')
-
+
+    # Run helper
     parser_run = parsers.add_parser('run', help="Run a built-in algorithm.")
 
-    run_parsers = parser_run.add_subparsers(help="select a command")
+    run_parsers = parser_run.add_subparsers(help="Select a command")
 
     parser_mallet = run_parsers.add_parser('mallet')
     htrc.tools.mallet.populate_parser(parser_mallet)
@@ -94,6 +115,9 @@
     parser_run.set_defaults(func='run')
 
     args = parser.parse_args()
+    if 'func' not in args:
+        parser.print_help()
+        sys.exit(1)
 
     if args.func in ['metadata', 'export']:
         volumes = []
@@ -113,6 +137,9 @@
         metadata = get_metadata(volumes)
         print(json.dumps(metadata))
     elif args.func == 'run':
+        if 'run' not in args:
+            parser_run.print_help()
+            sys.exit(1)
         if args.run == 'mallet':
             htrc.tools.mallet.main(args.path, args.k, args.iter)
         if args.run == 'topicexplorer':
@@ -125,10 +152,25 @@
         else:
print("Please choose another output folder and try again.") sys.exit(1) - + + if args.concat and args.remove_headers_footers: + print("Cannot set both concat and remove-headers-footers") + sys.exit(1) + if args.concat and args.remove_headers_footers_and_concat: + print("Cannot set both concat and remove-headers-footers-and-concat") + sys.exit(1) + if args.remove_headers_footers and args.remove_headers_footers_and_concat: + print("Cannot set both remove_headers_footers and remove_headers_footers_and_concat") + sys.exit(1) + if args.mets and args.remove_headers_footers_and_concat: + print("Cannot set both mets and remove_headers_footers_and_concat") + sys.exit(1) if args.pages: if args.mets and args.concat: - print ("Cannot set both concat and mets with pages") + print("Cannot set both concat and mets with pages") + sys.exit(1) + if args.mets and args.remove_headers_footers_and_concat: + print("Cannot set both mets and remove_headers_footers_and_concat with pages") sys.exit(1) try: @@ -137,6 +179,7 @@ def main(): print("Invalid identifier:", args.file) sys.exit(1) + def resolve_and_download(args): if args.file == sys.stdin: # For use with UNIX pipes @@ -201,6 +244,7 @@ def download(args): else: raise e + def download_with_tempfile(args, volumes): f = NamedTemporaryFile() for volume in volumes: diff --git a/htrc/auth.py b/htrc/auth.py index 76b9152..b975e63 100644 --- a/htrc/auth.py +++ b/htrc/auth.py @@ -10,6 +10,7 @@ import htrc.config + def get_jwt_token(): # Currently we just store one common jwt token locally at .htrc file for simplicity # Expect to add POST method to query unique jwt token with the combo of username and password @@ -22,7 +23,7 @@ def get_jwt_token(): #"username": username, #"password": password, #"scope" : "openid"} - + url1 = htrc.config.get_idp_url() capsule_id = htrc.config._get_value("jwt", "capsule_id") result = subprocess.check_output("hostname -s -I | awk '{print $1}'", shell=True) diff --git a/htrc/config.py b/htrc/config.py index eb1dbf7..c6b5743 100644 --- a/htrc/config.py +++ b/htrc/config.py @@ -6,18 +6,14 @@ """ from future import standard_library standard_library.install_aliases() -from builtins import input - +from typing import Optional from configparser import RawConfigParser as ConfigParser, NoSectionError from codecs import open -from getpass import getpass import logging import os.path import shutil import time -from htrc.lib.cli import bool_prompt - DEFAULT_PATH = os.path.expanduser('~') DEFAULT_PATH = os.path.join(DEFAULT_PATH, '.htrc') if not os.path.exists(DEFAULT_PATH): @@ -26,6 +22,25 @@ logging.info("Copying default config file to home directory.") shutil.copyfile(DEFAULT_FILE, DEFAULT_PATH) + +class HtrcDataApiConfig: + def __init__(self, + token: Optional[str] = None, + host: Optional[str] = None, + port: Optional[int] = None, + epr: Optional[str] = None, + cert: Optional[str] = None, + key: Optional[str] = None) -> None: + super().__init__() + + self.token = token or get_jwt_token(save_new_token=False) + self.host = host or get_dataapi_host() + self.port = port or get_dataapi_port() + self.epr = epr or get_dataapi_epr() + self.cert = cert or get_dataapi_cert() + self.key = key or get_dataapi_key() + + def _get_value(section, key, path=None): if path is None: path = DEFAULT_PATH @@ -38,33 +53,45 @@ def _get_value(section, key, path=None): except NoSectionError: raise EnvironmentError("Config not set for {} {} in {}".format( section, key, path)) - + + def get_dataapi_port(path=None): port = int(_get_value('data', 'port', path)) return (port) 
+ def get_dataapi_host(path=None): host = _get_value('data', 'host', path) return (host) + def get_dataapi_epr(path=None): return _get_value('data', 'url', path) + def get_dataapi_cert(path=None): return _get_value('data', 'cert', path) + def get_dataapi_key(path=None): return _get_value('data', 'key', path) + +def get_dataapi_access(path=None): + return _get_value('data', 'pd_only', path) + + def get_idp_host_port(path=None): host = _get_value('idp', 'host', path) port = _get_value('idp', 'port', path) return (host, port) + def get_idp_path(path=None): return _get_value('idp', 'url') + def get_idp_url(path=None): host, port = get_idp_host_port(path) path = get_idp_path(path) @@ -111,6 +138,7 @@ def save_jwt_token(token, path=None): return token + def remove_jwt_token(path=None): """ Removes JWT token from the config file. @@ -148,9 +176,11 @@ def get_credentials(path=None): return (client_id, client_secret) + def populate_parser(parser): return parser + if __name__ == '__main__': from argparse import ArgumentParser diff --git a/htrc/hf_utils/__init__.py b/htrc/hf_utils/__init__.py new file mode 100644 index 0000000..81553de --- /dev/null +++ b/htrc/hf_utils/__init__.py @@ -0,0 +1,110 @@ +import re +from typing import TypeVar, List, Iterator, Tuple, Callable + +T = TypeVar('T') + + +def clean_text(s: str) -> str: + # replace all characters which aren't letters with whitespaces ([\W\d_] is equivalent of \P{L} which is unsupported) + s = re.sub(r'[\W\d_]+', " ", s, flags=re.UNICODE) + # replace multiple sequential whitespaces with single whitespace + s = re.sub(r'\s{2,}', " ", s, flags=re.UNICODE) + # trim whitespaces at the beginning and end + s = s.strip() + # lowercase + s = s.lower() + + return s + + +def levenshtein(s: str, t: str, insert_cost: int = 1, delete_cost: int = 1, replace_cost: int = 1) -> int: + """ From Wikipedia article; Iterative with two matrix rows. 
""" + # degenerate cases + if s == t: + return 0 + + len0 = len(s) + len1 = len(t) + + if not len0: + return len1 + + if not len1: + return len0 + + # the array of distances + v0 = [0] * (len0 + 1) + v1 = [0] * (len0 + 1) + + # initial cost of skipping prefix in s + for i in range(len(v0)): + v0[i] = i + + # dynamically compute the array of distances + + # transformation cost for each letter in t + for j in range(len1): + # initial cost of skipping prefix in t + v1[0] = j + 1 + + # transformation cost for each letter in s + for i in range(len0): + # matching current letters in both strings + match = 0 if s[i] == t[j] else 1 + + # computing cost for each transformation + cost_insert = v0[i + 1] + insert_cost + cost_delete = v1[i] + delete_cost + cost_replace = v0[i] + match * replace_cost + + # keep minimum cost + v1[i + 1] = min(cost_insert, cost_delete, cost_replace) + + # swap cost arrays + v0, v1 = v1, v0 + + # the distance is the cost for transforming all letters in both strings + return v0[len0] + + +def pairwise_combine_within_distance(xs: List[T], n: int) -> List[Tuple[T, T]]: + if not xs: + return [] + + result = [] + x, xs = xs[0], xs[1:] + + while xs: + result = result + [(x, v) for v in xs[:n - 1]] + x, xs = xs[0], xs[1:] + + return result + + +def group_consecutive_when(xs: List[T], pred: Callable[[T, T], bool]) -> Iterator[List[T]]: + result = [] + _prev, _next = None, None + + while len(xs) > 1: + _prev, _next = xs[0], xs[1] + result.append(_prev) + if not pred(_prev, _next): + yield result + result = [] + xs = xs[1:] + + if len(xs) == 1: + _prev, _next = _next, xs[0] + + if _prev is not None and _next is not None and pred(_prev, _next): + result.extend([_prev, _next]) + elif _next is not None: + result.append(_next) + + yield result + + +def flatten(xss: List[tuple]) -> Iterator[T]: + for xs in xss: + for x in xs: + yield x diff --git a/htrc/lib/cli.py b/htrc/lib/cli.py index 33c378e..11a6e10 100644 --- a/htrc/lib/cli.py +++ b/htrc/lib/cli.py @@ -1,4 +1,6 @@ from builtins import input + + def bool_prompt(prompt_str, default=None): if default is True: default = 'y' diff --git a/htrc/models/__init__.py b/htrc/models/__init__.py new file mode 100644 index 0000000..e86e115 --- /dev/null +++ b/htrc/models/__init__.py @@ -0,0 +1,68 @@ +import os +from abc import ABC, abstractmethod +from typing import List + + +class Page(ABC): + @property + @abstractmethod + def text_lines(self) -> List[str]: + """ + The lines of text on the page + """ + pass + + @property + def text(self) -> str: + return os.linesep.join(self.text_lines) + + +class PageStructure(Page, ABC): + def __init__(self) -> None: + self.num_header_lines = 0 + self.num_footer_lines = 0 + + @property + def has_header(self) -> bool: + return self.num_header_lines > 0 + + @property + def has_body(self) -> bool: + return len(self.text_lines) - self.num_header_lines - self.num_footer_lines > 0 + + @property + def has_footer(self) -> bool: + return self.num_footer_lines > 0 + + @property + def header_lines(self) -> List[str]: + return self.text_lines[:self.num_header_lines] + + @property + def body_lines(self) -> List[str]: + return self.text_lines[self.num_header_lines:len(self.text_lines) - self.num_footer_lines] + + @property + def footer_lines(self) -> List[str]: + return self.text_lines[-self.num_footer_lines:] if self.has_footer else [] + + @property + def header(self) -> str: + return os.linesep.join(self.header_lines) + + @property + def body(self) -> str: + return os.linesep.join(self.body_lines) + + @property + def 
footer(self) -> str: + return os.linesep.join(self.footer_lines) + + +class HtrcPage(Page): + def __init__(self, lines: List[str]) -> None: + self._lines = lines + + @property + def text_lines(self) -> List[str]: + return self._lines diff --git a/htrc/runningheaders/__init__.py b/htrc/runningheaders/__init__.py new file mode 100644 index 0000000..799bf39 --- /dev/null +++ b/htrc/runningheaders/__init__.py @@ -0,0 +1,163 @@ +import re +from collections import defaultdict +from typing import List, TypeVar, Set, Iterator, Optional, Tuple, Dict + +from htrc.models import Page, PageStructure +from htrc.hf_utils import clean_text, levenshtein, pairwise_combine_within_distance, flatten, group_consecutive_when + +T = TypeVar('T', bound=Page) +U = TypeVar('U', bound=PageStructure) + + +class _Line: + def __init__(self, text: str, line_number: int, page: Page) -> None: + self.text = text + self.line_number = line_number + self.page = page + self.cleaned_text = clean_text(text) + + def __eq__(self, o: object) -> bool: + if not isinstance(o, _Line): + raise NotImplemented + + are_equal = self.page is o.page and self.line_number == o.line_number + + return are_equal + + def __ne__(self, o: object) -> bool: + return not self == o + + def __hash__(self) -> int: + line_hash = hash(self.line_number) + page_hash = hash(self.page) + hash_value = 31 * line_hash + page_hash + + return hash_value + + def __str__(self) -> str: + return str((self.line_number, self.cleaned_text)) + + def similarity_ratio(self, line: '_Line') -> float: + ratio = 1 - float(levenshtein(self.cleaned_text, line.cleaned_text)) / max(len(self.cleaned_text), + len(line.cleaned_text)) + + return ratio + + +def parse_page_structure(pages: List[T], + window_size: int = 6, + min_similarity_ratio: float = 0.7, + min_cluster_size: int = 3, + max_header_lines: int = 3, + max_footer_lines: int = 3) -> List[U]: + def _get_page_lines(p: T) -> List[_Line]: + return [_Line(text, line_num, p) for line_num, text in enumerate(p.text_lines)] + + def _cluster_lines(lines: List[Tuple[_Line, _Line]]) -> Set[tuple]: + cluster_map = {} + + for l1, l2 in lines: + c1 = cluster_map.get(l1) + c2 = cluster_map.get(l2) + + if c1 is not None and c2 is not None and c1 is not c2: + smaller, larger = (c1, c2) if len(c1) < len(c2) else (c2, c1) + larger.extend(smaller) + for x in smaller: + cluster_map[x] = larger + elif c1 is not None and c2 is None: + c1.append(l2) + cluster_map[l2] = c1 + elif c1 is None and c2 is not None: + c2.append(l1) + cluster_map[l1] = c2 + elif c1 is None and c2 is None: + c = [l1, l2] + cluster_map[l1] = c + cluster_map[l2] = c + + return set(map(tuple, cluster_map.values())) + + def _group_lines_by_page(lines: Iterator[_Line]) -> Dict[Page, List[_Line]]: + lines_grouped_by_page = defaultdict(list) + for line in lines: + lines_grouped_by_page[line.page].append(line) + + return lines_grouped_by_page + + def _get_last_header_line(lines: List[_Line]) -> Optional[int]: + if not lines: + return None + + return max(l.line_number for l in lines) + + def _get_first_footer_line(lines: List[_Line]) -> Optional[int]: + if not lines: + return None + + return min(l.line_number for l in lines) + + def _extract_line_numbers(line: _Line) -> Tuple[_Line, List[int]]: + numbers = [int(match.group(0)) for match in + re.finditer(r"(?:(?<=^)|(?<=\s))\d{1,4}(?=\s|$)", line.text, flags=re.UNICODE)] + + return line, numbers + + def _extract_potential_page_numbers(lines: List[_Line]) -> Tuple[_Line, List[int]]: + assert len(lines) > 0 + line, numbers = 
_extract_line_numbers(lines[-1]) + if not numbers and not str.strip(line.text) and len(lines) > 1: + line, numbers = _extract_line_numbers(lines[-2]) + + return line, numbers + + candidate_header_lines = [] + candidate_footer_lines = [] + + pages_lines = [_get_page_lines(p) for p in pages] + + for lines in pages_lines: + # ignore lines that are <4 characters long and/or have no alphabetic characters + candidate_header_lines.append([l for l in lines[:max_header_lines] if not len(l.cleaned_text) < 4]) + candidate_footer_lines.append([l for l in lines[-max_footer_lines:] if not len(l.cleaned_text) < 4]) + + headers_for_comparison = pairwise_combine_within_distance(candidate_header_lines, window_size) + footers_for_comparison = pairwise_combine_within_distance(candidate_footer_lines, window_size) + + header_line_similarities = [] + for (lines1, lines2) in headers_for_comparison: + header_line_similarities.extend( + (l1, l2) for l1 in lines1 for l2 in lines2 if l1.similarity_ratio(l2) >= min_similarity_ratio) + + footer_line_similarities = [] + for (lines1, lines2) in footers_for_comparison: + footer_line_similarities.extend( + (l1, l2) for l1 in lines1 for l2 in lines2 if l1.similarity_ratio(l2) >= min_similarity_ratio) + + header_clusters = [cluster for cluster in _cluster_lines(header_line_similarities) if + len(cluster) >= min_cluster_size] + footer_clusters = [cluster for cluster in _cluster_lines(footer_line_similarities) if + len(cluster) >= min_cluster_size] + + if not footer_clusters: + potential_page_numbers = [_extract_potential_page_numbers(lines) for lines in pages_lines if lines] + potential_page_numbers = [(line, numbers[0]) for line, numbers in potential_page_numbers if len(numbers) == 1] + potential_clusters = map(lambda group: tuple(map(lambda t: t[0], group)), + group_consecutive_when(potential_page_numbers, lambda x, y: y[1] - x[1] == 1)) + footer_clusters = [cluster for cluster in potential_clusters if len(cluster) >= min_cluster_size] + + header_lines_grouped_by_page = _group_lines_by_page(flatten(header_clusters)) + footer_lines_grouped_by_page = _group_lines_by_page(flatten(footer_clusters)) + + last_header_line_pages_map = {p: _get_last_header_line(lines) for p, lines in header_lines_grouped_by_page.items()} + first_footer_line_pages_map = {p: _get_first_footer_line(lines) for p, lines in + footer_lines_grouped_by_page.items()} + + for page in pages: + last_header_line = last_header_line_pages_map.get(page) + first_footer_line = first_footer_line_pages_map.get(page) + page.__class__ = type('StructuredPage', (page.__class__, PageStructure), {}) + page.num_header_lines = last_header_line + 1 if last_header_line is not None else 0 + page.num_footer_lines = len(page.text_lines) - first_footer_line if first_footer_line is not None else 0 + + return pages diff --git a/htrc/tools/mallet.py b/htrc/tools/mallet.py index a005e93..e82758a 100644 --- a/htrc/tools/mallet.py +++ b/htrc/tools/mallet.py @@ -19,6 +19,7 @@ def install_mallet(): mallet_dir.extractall(path=MALLET_DIR) mallet_dir.close() + def main(path, topics, iterations, output_dir='/media/secure_volume/workset/'): if not os.path.exists(MALLET_DIR): if not os.path.exists('/media/secure_volume/'): diff --git a/htrc/tools/topicexplorer.py b/htrc/tools/topicexplorer.py index 293baca..5149cc3 100644 --- a/htrc/tools/topicexplorer.py +++ b/htrc/tools/topicexplorer.py @@ -6,6 +6,7 @@ from htrc.volumes import download_volumes from htrc.workset import path_to_volumes + def main(path, topics, iterations, 
output_dir='/media/secure_volume/workset'):
     if os.path.exists("/media/secure_volume"):
         # If in secure mode, download the volumes from data api
diff --git a/htrc/util/__init__.py b/htrc/util/__init__.py
index edbddd1..2b1dd3e 100644
--- a/htrc/util/__init__.py
+++ b/htrc/util/__init__.py
@@ -4,6 +4,7 @@
 
 from .resolve import ORG_CODES
 
+
 def split_items(seq, split_size):
     """
     Returns a generator that returns portions of `seq` up to `split_size`.
@@ -13,7 +14,7 @@ def split_items(seq, split_size):
     :param split_size: The maximum size of each split.
     """
     full_segments = int(math.floor(len(seq) / split_size))
-    for i in range(1,full_segments+1):
-        yield seq[(i-1)*split_size:i*split_size]
+    for i in range(1, full_segments + 1):
+        yield seq[(i - 1) * split_size:i * split_size]
     if (full_segments * split_size) < len(seq):
-        yield seq[full_segments*split_size:]
+        yield seq[full_segments * split_size:]
diff --git a/htrc/util/resolve.py b/htrc/util/resolve.py
index e3b2b4f..1d1a7e2 100644
--- a/htrc/util/resolve.py
+++ b/htrc/util/resolve.py
@@ -94,29 +94,31 @@ def parse_volume_id(string):
     Organization codes for the volumes can be found in ORG_CODES.
     '''
-    # First extract the volume ID from a URL, fallbck to assume string.
+    # First extract the volume ID from a URL, fallback to assume string.
     parsed_url = urlparse(string)
     if parsed_url.netloc == 'hdl.handle.net':
         # Parse the Handle ID, ex:
         # https://hdl.handle.net/2027/uc2.ark:/13960/fk92805m1s'
         # Note that if the Handle URL contains page info, this is discarded.
-        id = parsed_url.path.replace('/2027/', '')
+        htid = parsed_url.path.replace('/2027/', '')
 
     elif parsed_url.netloc == 'babel.hathitrust.org':
         # Parse the HT Digital Library URL, ex:
         # https://babel.hathitrust.org/cgi/pt?id=uc2.ark:/13960/fk92805m1s;view=1up;seq=7
         if parsed_url.query:
-            id = parse_qs(parsed_url.query).get('id', None)
-            if id is not None:
-                id = id[0]
+            htid = parse_qs(parsed_url.query).get('id', None)
+            if htid is not None:
+                htid = htid[0]
+                if ';' in htid:
+                    htid = htid.split(';')[0]
 
     else:
-        id = string
+        htid = string
 
     # Validate ID against ORG_CODES.
-    # Won't guarantee volume existance, but is a sanity check.
-    if id and any(id.startswith(org) for org in ORG_CODES):
-        return id
+    # Won't guarantee volume existence, but it is a sanity check.
+    if htid and any(htid.startswith(org) for org in ORG_CODES):
+        return htid
     else:
         raise ValueError("Invalid Organization Code in HathiTrust ID")
diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py
index 6c8787a..aabb171 100644
--- a/htrc/volumes/__init__.py
+++ b/htrc/volumes/__init__.py
@@ -10,13 +10,15 @@
 """
 from __future__ import print_function
 from future import standard_library
+
 standard_library.install_aliases()
 
 #from builtins import input
+from htrc.models import HtrcPage
 
 import http.client
-from io import BytesIO # used to stream http response into zipfile.
-#import json
+from io import BytesIO, TextIOWrapper
+import json
 import logging
 import os.path
 import progressbar
@@ -29,16 +31,25 @@
 #from urllib.error import HTTPError
 from urllib.parse import urlencode
 #import xml.etree.ElementTree as ET
+from urllib.parse import urlencode
 from zipfile import ZipFile # used to decompress requested zip archives.
+from tqdm import tqdm
+from htrc.runningheaders import parse_page_structure
+from functools import partial
+import pandas as pd
 
 #from htrc.lib.cli import bool_prompt
 from htrc.util import split_items
 import htrc.config
+import multiprocessing
+import logging
 
 from logging import NullHandler
+
 logging.getLogger(__name__).addHandler(NullHandler())
 
-def get_volumes(token, volume_ids, host, port, cert, key, epr, concat=False, mets=False):
+
+def get_volumes(data_api_config: htrc.config.HtrcDataApiConfig, volume_ids, concat=False, mets=False, buffer_size=128):
     """
     Returns volumes from the Data API as a raw zip stream.
 
@@ -53,15 +64,16 @@
     if not volume_ids:
         raise ValueError("volume_ids is empty.")
 
-    url = epr + "volumes"
+    url = data_api_config.epr + "volumes"
 
     for id in volume_ids:
         if ("." not in id
-            or " " in id):
+                or " " in id):
             print("Invalid volume id " + id + ". Please correct this volume id and try again.")
 
     data = {'volumeIDs': '|'.join(
         [id.replace('+', ':').replace('=', '/') for id in volume_ids])}
+
     if concat:
         data['concat'] = 'true'
 
@@ -69,7 +81,7 @@
         data['mets'] = 'true'
 
     # Authorization
-    headers = {"Authorization": "Bearer " + token,
+    headers = {"Authorization": "Bearer " + data_api_config.token,
                "Content-type": "application/x-www-form-urlencoded"}
 
     # Create SSL lookup
@@ -79,8 +91,12 @@
     #ctx.verify_mode = ssl.CERT_NONE
 
     # Retrieve the volumes
-    httpsConnection = http.client.HTTPSConnection(host, port, context=ctx, key_file=key, cert_file=cert)
-
+    httpsConnection = http.client.HTTPSConnection(
+        data_api_config.host,
+        data_api_config.port,
+        context=ctx,
+        key_file=data_api_config.key,
+        cert_file=data_api_config.cert)
 
     httpsConnection.request("POST", url, urlencode(data), headers)
 
@@ -91,12 +107,12 @@
     data = BytesIO()
     bytes_downloaded = 0
     bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength,
-                            widgets=[progressbar.AnimatedMarker(), ' ',
-                                     progressbar.DataSize(),
-                                     ' (', progressbar.FileTransferSpeed(), ')'])
+                                  widgets=[progressbar.AnimatedMarker(), ' ',
+                                           progressbar.DataSize(),
+                                           ' (', progressbar.FileTransferSpeed(), ')'])
 
     while body:
-        body = response.read(128)
+        body = response.read(buffer_size)
         data.write(body)
         bytes_downloaded += len(body)
         bar.update(bytes_downloaded)
@@ -114,12 +130,12 @@
     return data
 
-def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=False):
+def get_pages(data_api_config: htrc.config.HtrcDataApiConfig, page_ids, concat=False, mets=False, buffer_size=128):
     """
     Returns a ZIP file containing specific pages.
 
     Parameters:
-    :token: An OAuth2 token for the app.
+    :data_api_config: The configuration data of the DataAPI endpoint.
     :volume_ids: A list of volume_ids
     :concat: If True, return a single file per volume. If False, return a single
     file per page (default).
     """
     if not page_ids:
         raise ValueError("page_ids is empty.")
 
-    url = epr + "pages"
+    url = data_api_config.epr + "pages"
 
     for id in page_ids:
         if ("." not in id
-            or " " in id):
+                or " " in id):
             print("Invalid volume id " + id + ". 
Please correct this volume id and try again.") data = {'pageIDs': '|'.join( @@ -145,10 +161,9 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa data['mets'] = 'true' # Authorization - headers = {"Authorization": "Bearer " + token, + headers = {"Authorization": "Bearer " + data_api_config.token, "Content-type": "application/x-www-form-urlencoded"} - # Create SSL lookup # TODO: Fix SSL cert verification ctx = ssl.create_default_context() @@ -156,8 +171,13 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa #ctx.verify_mode = ssl.CERT_NONE # Retrieve the volumes - httpsConnection = http.client.HTTPSConnection(host, port, context=ctx, key_file=key, cert_file=cert) - + httpsConnection = http.client.HTTPSConnection( + data_api_config.host, + data_api_config.port, + context=ctx, + key_file=data_api_config.key, + cert_file=data_api_config.cert + ) httpsConnection.request("POST", url, urlencode(data), headers) @@ -168,12 +188,12 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa data = BytesIO() bytes_downloaded = 0 bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength, - widgets=[progressbar.AnimatedMarker(), ' ', + widgets=[progressbar.AnimatedMarker(), ' ', progressbar.DataSize(), ' (', progressbar.FileTransferSpeed(), ')']) while body: - body = response.read(128) + body = response.read(buffer_size) data.write(body) bytes_downloaded += len(body) bar.update(bytes_downloaded) @@ -234,82 +254,194 @@ def get_pages(token, page_ids, host, port, cert, key, epr, concat=False, mets=Fa #return token -def grep(file_name, output_dir, pattern): - na_volume = [] - for line in open(file_name): - if pattern in line: - na_volume.append(line.split()[-1]) - if len(na_volume) < 100: - print("\nFollowing volume ids are not available.") - print("\n".join(str(item) for item in na_volume)) - with open(os.path.join(output_dir, "volume_not_available.txt"), "w") as volume_na: - volume_na.write("\n".join(str(item) for item in na_volume)) - else: - if len(na_volume) == 100: - print("\nThere are 100 or more unavailable volumes.\nTo check the validity of volumes in your workset or volume id file go to:\n https://analytics.hathitrust.org/validateworkset \n or email us at htrc-help@hathitrust.org for assistance.") - -def check_error_file(output_dir): - file_name = "ERROR.err" +def grep_error(file_name, output_dir, pattern, txt_index): + na_volume = [] if output_dir.endswith("/"): - file_path = output_dir+ file_name + file_path = output_dir + file_name else: - file_path = output_dir+"/"+file_name + file_path = output_dir + "/" + file_name if os.path.isfile(file_path): - grep(file_path, output_dir, "KeyNotFoundException") + for line in open(file_path): + if pattern in line: + na_volume.append(line.split()[txt_index]) + return na_volume -def download_volumes(volume_ids, output_dir, username=None, password=None, - config_path=None, token=None, concat=False, mets=False, pages=False, host=None, port=None, cert=None, key=None, epr=None): - # create output_dir folder, if nonexistant - if not os.path.isdir(output_dir): - os.makedirs(output_dir) - # get token if not specified - if not token: - token = htrc.config.get_jwt_token() - htrc.config.remove_jwt_token() +def _to_htrc_page(page_file, zip): + with TextIOWrapper(BytesIO(zip.read(page_file)), encoding='utf-8') as page: + return HtrcPage([line.rstrip() for line in page.readlines()]) - if not host: - host= htrc.config.get_dataapi_host() - if not port: - port = 
htrc.config.get_dataapi_port() +def download_volumes(volume_ids, output_dir, concat=False, mets=False, pages=False, + remove_headers_footers=False, hf_window_size=6, hf_min_similarity=0.7, skip_removed_hf=False, + parallelism=multiprocessing.cpu_count(), batch_size=250, data_api_config=None): + if not 0 < parallelism <= multiprocessing.cpu_count(): + raise ValueError("Invalid parallelism level specified") - if not epr: - epr = htrc.config.get_dataapi_epr() + remove_hf_fun = partial( + _remove_headers_footers_and_save, + concat=concat, + hf_min_similarity=hf_min_similarity, + hf_window_size=hf_window_size, + skip_removed_hf=skip_removed_hf, + output_dir=output_dir + ) - if not cert: - cert = htrc.config.get_dataapi_cert() + volume_ids = list(set(volume_ids)) # ensure unique volume ids + num_vols = len(volume_ids) - if not key: - key = htrc.config.get_dataapi_key() + data_api_config = data_api_config or htrc.config.HtrcDataApiConfig() - if any((token, host, port)) is not None: - logging.info("obtained token: %s\n" % token) + os.makedirs(output_dir, exist_ok=True) + + if any((data_api_config.token, data_api_config.host, data_api_config.port)) is not None: + logging.info("obtained token: %s\n" % data_api_config.token) try: - for ids in split_items(volume_ids, 250): - if pages: - if concat & mets: - raise ValueError("Cannot set both concat and mets with pages.") + errors = [] + rights = [] + + with tqdm(total=num_vols) as progress, multiprocessing.Pool(processes=parallelism) as pool: + for ids in split_items(volume_ids, batch_size): + if pages: + if concat and mets: + raise ValueError("Cannot set both concat and mets with pages.") + else: + data = get_pages(data_api_config, ids, concat and not remove_headers_footers, mets) else: - data = get_pages(token, ids, host, port, cert, key, epr, concat, mets) + data = get_volumes(data_api_config, ids, concat and not remove_headers_footers, mets) + + volumes = [] + + with ZipFile(BytesIO(data)) as vols_zip: + zip_list = vols_zip.namelist() + if 'ERROR.err' in zip_list: + errors.append(vols_zip.read('ERROR.err').decode('utf-8')) + zip_list.remove('ERROR.err') + if 'volume-rights.txt' in zip_list: + rights_data = vols_zip.read('volume-rights.txt').decode('utf-8') + zip_list.remove('volume-rights.txt') + if not rights: + rights.append(rights_data) + else: + # due to the format in which 'volume-rights.txt' is created, we have to skip + # the first 4 lines which make up the header of the file, to extract only the + # actual volume rights data for accumulation + rights.append(''.join(rights_data.splitlines(keepends=True)[4:])) + + zip_volume_paths = [zip_vol_path for zip_vol_path in zip_list if zip_vol_path.endswith('/')] + num_vols_in_zip = len(zip_volume_paths) + + if not remove_headers_footers: + vols_zip.extractall(output_dir, members=zip_list) + progress.update(num_vols_in_zip) + else: + for zip_vol_path in zip_volume_paths: + sorted_vol_zip_page_paths = sorted(zip_page_path for zip_page_path in zip_list if zip_page_path.startswith(zip_vol_path) and not zip_page_path.endswith('/')) + vol_pages = [_to_htrc_page(page_path, vols_zip) for page_path in sorted_vol_zip_page_paths] + volumes.append((zip_vol_path, sorted_vol_zip_page_paths, vol_pages)) + + del data, vols_zip + + num_missing = batch_size - num_vols_in_zip if num_vols >= batch_size else num_vols - num_vols_in_zip + progress.update(num_missing) # update progress bar state to include the missing volumes also + + # `volumes` will be empty if `remove_headers_footers=False` since the ZIP was extracted + 
# without further processing + if volumes: + for _ in pool.imap_unordered(remove_hf_fun, volumes): + progress.update() + + na_volumes_all = [] + + if errors: + with open(os.path.join(output_dir, 'ERROR.err'), 'w') as err_file: + err_file.write(''.join(errors)) + + na_volumes_error = grep_error('ERROR.err', output_dir, 'KeyNotFoundException', -1) + na_volumes_all.extend(na_volumes_error) + + if rights: + with open(os.path.join(output_dir, 'volume-rights.txt'), 'w') as rights_file: + rights_file.write(''.join(rights)) + + if htrc.config.get_dataapi_access() == "true": + na_volumes_rights = grep_error('volume-rights.txt', output_dir, ' 3', 0) + na_volumes_all.extend(na_volumes_rights) + + num_na = len(na_volumes_all) + + if num_na > 0: + with open(os.path.join(output_dir, 'volumes_not_available.txt'), 'w') as volumes_na: + volumes_na.write("\n".join(str(item) for item in na_volumes_all)) + + if num_na < 100: + print("\nThe following volume ids are not available. \n Please check volumes_not_available.txt " + "for the complete list. ") + print('\n'.join(str(item) for item in na_volumes_all)) else: - data = get_volumes(token, ids, host, port, cert, key, epr, concat, mets) - - myzip = ZipFile(BytesIO(data)) - myzip.extractall(output_dir) - myzip.close() - - check_error_file(output_dir) + print("\nThere are {:,} unavailable volumes.\n Please check volumes_not_available.txt " + "for the " + "complete list. \nTo check the validity of volumes in your workset or volume id file go " + "to:\n " + "https://analytics.hathitrust.org/validateworkset \n or email us at " + "htrc-help@hathitrust.org " + "for assistance.".format(num_na)) except socket.error: - raise RuntimeError("Data API request timeout. Is your Data Capsule in Secure Mode?") + raise RuntimeError("HTRC Data API time out. Check your inode usage if downloading a large workset. 
" + "Contact HTRC for further help.") else: - raise RuntimeError("Failed to obtain jwt token.") + raise RuntimeError("Failed to obtain the JWT token.") + + +def _remove_headers_footers_and_save(vol_data, concat, hf_min_similarity, hf_window_size, skip_removed_hf, output_dir): + zip_vol_path, sorted_vol_zip_page_paths, vol_pages = vol_data + clean_volid = zip_vol_path[:-1] + + vol_pages = parse_page_structure(vol_pages, window_size=hf_window_size, min_similarity_ratio=hf_min_similarity) + pages_body = (page.body for page in vol_pages) + # save the removed headers/footers for user inspection + if skip_removed_hf: + if concat: + with open(os.path.join(output_dir, clean_volid + '.txt'), 'w', encoding='utf-8') as vol_file: + vol_file.write('\n'.join(pages_body)) + else: + vol_path = os.path.join(output_dir, zip_vol_path) + os.mkdir(vol_path) + for vol_page_path, page_body in zip(sorted_vol_zip_page_paths, pages_body): + with open(os.path.join(output_dir, vol_page_path), 'w', encoding='utf-8') as page_file: + page_file.write(page_body) + else: + if concat: + with open(os.path.join(output_dir, clean_volid + '.txt'), 'w', encoding='utf-8') as vol_file: + vol_file.write('\n'.join(pages_body)) + else: + vol_path = os.path.join(output_dir, zip_vol_path) + os.mkdir(vol_path) + for vol_page_path, page_body in zip(sorted_vol_zip_page_paths, pages_body): + with open(os.path.join(output_dir, vol_page_path), 'w', encoding='utf-8') as page_file: + page_file.write(page_body) + + removed_hf = [] + for vol_page_path, vol_page in zip(sorted_vol_zip_page_paths, vol_pages): + if not (vol_page.has_header or vol_page.has_footer): + # skip reporting pages that don't have an identified header or footer + continue + _, page_name = os.path.split(vol_page_path) + page_name, _ = os.path.splitext(page_name) + removed_hf.append({'page': page_name, 'header': vol_page.header, 'footer': vol_page.footer}) + + if concat: + removed_hf_filename = os.path.join(output_dir, clean_volid + '_removed_hf.csv') + else: + removed_hf_filename = os.path.join(output_dir, clean_volid, 'removed_hf.csv') + + pd.DataFrame(removed_hf, columns=['page', 'header', 'footer']).to_csv(removed_hf_filename, index=False) def download(args): @@ -317,9 +449,24 @@ def download(args): with open(args.file) as IDfile: volumeIDs = [line.strip() for line in IDfile] + data_api_config = htrc.config.HtrcDataApiConfig( + token=args.token, + host=args.datahost, + port=args.dataport, + epr=args.dataepr, + cert=args.datacert, + key=args.datakey + ) + return download_volumes(volumeIDs, args.output, - username=args.username, password=args.password, - token=args.token, concat=args.concat, mets=args.mets, pages=args.pages, host=args.datahost, - port=args.dataport, cert=args.datacert, key=args.datakey, - epr=args.dataepr) + remove_headers_footers=args.remove_headers_footers or args.remove_headers_footers_and_concat, + concat=args.concat or args.remove_headers_footers_and_concat, + mets=args.mets, + pages=args.pages, + hf_window_size=args.window_size, + hf_min_similarity=args.min_similarity_ratio, + parallelism=args.parallelism, + batch_size=args.batch_size, + skip_removed_hf=args.skip_removed_hf, + data_api_config=data_api_config) diff --git a/setup.py b/setup.py index 91f8bf7..e2d26f3 100644 --- a/setup.py +++ b/setup.py @@ -9,10 +9,10 @@ import atexit import tarfile -__version__ = '0.1.55b0' +__version__ = '0.1.57b0' -install_requires = ['PyLD', 'future', 'prov', 'unicodecsv', 'progressbar2', - 'requests', 'argparse==1.1', 'topicexplorer==1.0b226', 'numpy==1.16.2'] 
+install_requires = ['PyLD', 'future', 'prov', 'unicodecsv', 'progressbar2', 'pandas', + 'requests', 'argparse==1.1', 'topicexplorer==1.0b226', 'numpy==1.16.2', 'tqdm==4.46.0'] # TODO: migrate to docs confix:, 'sphinx-argparse', 'sphinxcontrib-fulltoc'] if sys.version_info.major == 2: install_requires.append('configparser') diff --git a/tests/test_htrc_util_resolve.py b/tests/test_htrc_util_resolve.py index 432734d..6bbbfd0 100644 --- a/tests/test_htrc_util_resolve.py +++ b/tests/test_htrc_util_resolve.py @@ -42,6 +42,9 @@ def test_parse_volume_id(self): id = resolve.parse_volume_id('https://babel.hathitrust.org/cgi/pt?id=uc2.ark:/13960/fk92805m1s;view=1up;seq=7') self.assertEqual(id, 'uc2.ark:/13960/fk92805m1s') + id = resolve.parse_volume_id('https://babel.hathitrust.org/cgi/pt?id=uc2.ark:/13960/fk92805m1s&view=1up&seq=7') + self.assertEqual(id, 'uc2.ark:/13960/fk92805m1s') + id = resolve.parse_volume_id('uc2.ark:/13960/fk92805m1s') self.assertEqual(id, 'uc2.ark:/13960/fk92805m1s') diff --git a/tests/test_htrc_volumes.py b/tests/test_htrc_volumes.py index d4d9abf..752cbf4 100644 --- a/tests/test_htrc_volumes.py +++ b/tests/test_htrc_volumes.py @@ -60,27 +60,53 @@ def test_get_volumes_and_pages(self, https_mock): response_mock.read.return_value =\ ''.encode('utf8') https_mock.return_value.getresponse.return_value = response_mock - - htrc.volumes.get_volumes('1234', self.test_vols, 'data-host', '443', '/home/client-certs/client.pem', '/home/client-certs/client.pem', '/' ) - htrc.volumes.get_pages('1234', self.test_vols, 'data-host', '443', '/home/client-certs/client.pem', '/home/client-certs/client.pem', '/') + data_api_config = htrc.config.HtrcDataApiConfig( + token='1234', + host='data-host', + port=443, + epr='/', + cert='/home/client-certs/client.pem', + key='/home/client-certs/client.pem' + ) + + htrc.volumes.get_volumes(data_api_config, self.test_vols) + htrc.volumes.get_pages(data_api_config, self.test_vols) @patch('htrc.volumes.http.client.HTTPSConnection') def test_get_volumes_and_pages_error(self, https_mock): response_mock = Mock(status=500) https_mock.return_value.getresponse.return_value = response_mock + data_api_config = htrc.config.HtrcDataApiConfig( + token='1234', + host='data-host', + port=443, + epr='/', + cert='/home/client-certs/client.pem', + key='/home/client-certs/client.pem' + ) + with self.assertRaises(EnvironmentError): - htrc.volumes.get_volumes('1234', self.test_vols, 'data-host', '443', '/home/client-certs/client.pem', '/home/client-certs/client.pem', '/' ) + htrc.volumes.get_volumes(data_api_config, self.test_vols) with self.assertRaises(EnvironmentError): - htrc.volumes.get_pages('1234', self.test_vols, 'data-host', '443', '/home/client-certs/client.pem', '/home/client-certs/client.pem', '/') + htrc.volumes.get_pages(data_api_config, self.test_vols) def test_get_volumes_and_pages_empty(self): + data_api_config = htrc.config.HtrcDataApiConfig( + token='1234', + host='data-host', + port=443, + epr='/', + cert='/home/client-certs/client.pem', + key='/home/client-certs/client.pem' + ) + with self.assertRaises(ValueError): - htrc.volumes.get_volumes('1234', [], 'data-host', '443', '/home/client-certs/client.pem', '/home/client-certs/client.pem', '/' ) + htrc.volumes.get_volumes(data_api_config, []) with self.assertRaises(ValueError): - htrc.volumes.get_pages('1234', [], 'data-host', '443', '/home/client-certs/client.pem', '/home/client-certs/client.pem', '/' ) + htrc.volumes.get_pages(data_api_config, []) @patch('htrc.volumes.ZipFile') 
@patch('htrc.volumes.get_volumes') @@ -93,14 +119,21 @@ def test_download_volumes(self, https_mock, oauth2_mock, volumes_mock, oauth2_mock.return_value = 'a1b2c3d4e5' volumes_mock.return_value = b'' - htrc.volumes.download_volumes(self.test_vols, self.output_path, - username='1234', password='1234', token='1234') + data_api_config = htrc.config.HtrcDataApiConfig( + token='1234', + host='data-host', + port=443, + epr='/', + cert='/home/client-certs/client.pem', + key='/home/client-certs/client.pem' + ) + + htrc.volumes.download_volumes(self.test_vols, self.output_path, data_api_config=data_api_config) # test directory creation import shutil shutil.rmtree(self.output_path) - htrc.volumes.download_volumes(self.test_vols, self.output_path, - username='1234', password='1234', token='1234') + htrc.volumes.download_volumes(self.test_vols, self.output_path, data_api_config=data_api_config) # TODO: Fix this test for case where config file exists, but creds not set """ @@ -132,6 +165,7 @@ def test_download_volumes_saved_creds(self, https_mock, oauth2_mock, volumes_moc def test_download(self): pass + suite = unittest.TestLoader().loadTestsFromTestCase(TestVolumes) unittest.TextTestRunner(verbosity=2).run(suite) From a06dfbf0b50989d2c42969b140dd5f553d09be24 Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Thu, 7 Oct 2021 12:49:36 -0400 Subject: [PATCH 7/8] Fixed a conflict in config.py --- htrc/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htrc/config.py b/htrc/config.py index c6b5743..fa2815a 100644 --- a/htrc/config.py +++ b/htrc/config.py @@ -33,7 +33,7 @@ def __init__(self, key: Optional[str] = None) -> None: super().__init__() - self.token = token or get_jwt_token(save_new_token=False) + self.token = token or get_jwt_token() self.host = host or get_dataapi_host() self.port = port or get_dataapi_port() self.epr = epr or get_dataapi_epr() From 703de27dfd83db015e301bf61dc8fc5668080809 Mon Sep 17 00:00:00 2001 From: Samitha Liyanage Date: Tue, 18 Jan 2022 21:54:26 -0500 Subject: [PATCH 8/8] Set final version. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e2d26f3..9fa5ac0 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ import atexit import tarfile -__version__ = '0.1.57b0' +__version__ = '0.1.57' install_requires = ['PyLD', 'future', 'prov', 'unicodecsv', 'progressbar2', 'pandas', 'requests', 'argparse==1.1', 'topicexplorer==1.0b226', 'numpy==1.16.2', 'tqdm==4.46.0']
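As an end-to-end illustration of the header/footer detector that this series
ships, the new modules can be exercised as follows (a sketch that uses only
names defined in patch 6; the page text and page numbers are invented):

    from htrc.models import HtrcPage
    from htrc.runningheaders import parse_page_structure

    # Five pages sharing a running header and carrying consecutive page
    # numbers; the clustering step needs at least min_cluster_size (default 3)
    # similar lines within the look-ahead window.
    bodies = [
        ("An account of mosses observed", "near the rocky northern coast."),
        ("Further notes on the ferns", "growing in the inland valleys."),
        ("Remarks on lichen species", "collected at high altitude."),
        ("Seed dispersal by wind across", "the open chalk grassland."),
        ("A catalogue of the specimens", "gathered during early spring."),
    ]
    pages = [HtrcPage(["THE JOURNAL OF BOTANY", first, second, str(17 + n)])
             for n, (first, second) in enumerate(bodies)]

    structured = parse_page_structure(pages)  # window_size=6, min_similarity_ratio=0.7
    for page in structured:
        print(repr(page.header), '->', repr(page.body))

The detector clusters near-identical lines across nearby pages (Levenshtein
similarity at or above min_similarity_ratio) and, when no footer cluster is
found, falls back to runs of consecutive page numbers on the last lines.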