diff --git a/htrc/auth.py b/htrc/auth.py index b975e63..7d4a9df 100644 --- a/htrc/auth.py +++ b/htrc/auth.py @@ -4,6 +4,7 @@ #import ssl #import time import subprocess + import requests import requests.auth #import configparser @@ -43,6 +44,7 @@ def get_jwt_token(): else: raise RuntimeError("JWT token retrieval failed: {}".format(data['error'])) + def credential_prompt(): """ A prompt for entering HathiTrust Research Center credentials. diff --git a/htrc/config.py b/htrc/config.py index fa2815a..e48c3cf 100644 --- a/htrc/config.py +++ b/htrc/config.py @@ -111,6 +111,7 @@ def get_jwt_token(path=None): return token def save_jwt_token(token, path=None): + """ Saves JWT token in the config file. """ diff --git a/htrc/volumes/__init__.py b/htrc/volumes/__init__.py index aabb171..6fa2145 100644 --- a/htrc/volumes/__init__.py +++ b/htrc/volumes/__init__.py @@ -13,15 +13,16 @@ standard_library.install_aliases() + #from builtins import input from htrc.models import HtrcPage import http.client from io import BytesIO, TextIOWrapper import json -import logging import os.path import progressbar + #import re import socket import ssl @@ -37,7 +38,6 @@ from htrc.runningheaders import parse_page_structure from functools import partial import pandas as pd - #from htrc.lib.cli import bool_prompt from htrc.util import split_items import htrc.config @@ -74,6 +74,7 @@ def get_volumes(data_api_config: htrc.config.HtrcDataApiConfig, volume_ids, conc data = {'volumeIDs': '|'.join( [id.replace('+', ':').replace('=', '/') for id in volume_ids])} + if concat: data['concat'] = 'true' @@ -210,6 +211,7 @@ def get_pages(data_api_config: htrc.config.HtrcDataApiConfig, page_ids, concat=F return data + #def get_oauth2_token(username, password): # make sure to set the request content-type as application/x-www-form-urlencoded #headers = {"Content-type": "application/x-www-form-urlencoded"} @@ -242,6 +244,7 @@ def get_pages(data_api_config: htrc.config.HtrcDataApiConfig, page_ids, concat=F #token = jsonData["access_token"] #logging.info("*** parsed token: {}".format(token)) + #else: #logging.debug("Unable to get token") #logging.debug("Response Code: {}".format(response.status)) @@ -252,9 +255,11 @@ def get_pages(data_api_config: htrc.config.HtrcDataApiConfig, page_ids, concat=F #if httpsConnection is not None: #httpsConnection.close() + #return token + def grep_error(file_name, output_dir, pattern, txt_index): na_volume = [] if output_dir.endswith("/"): @@ -270,6 +275,9 @@ def grep_error(file_name, output_dir, pattern, txt_index): return na_volume + return na_volume + + def _to_htrc_page(page_file, zip): with TextIOWrapper(BytesIO(zip.read(page_file)), encoding='utf-8') as page: return HtrcPage([line.rstrip() for line in page.readlines()]) @@ -427,6 +435,8 @@ def _remove_headers_footers_and_save(vol_data, concat, hf_min_similarity, hf_win with open(os.path.join(output_dir, vol_page_path), 'w', encoding='utf-8') as page_file: page_file.write(page_body) + + removed_hf = [] for vol_page_path, vol_page in zip(sorted_vol_zip_page_paths, vol_pages): if not (vol_page.has_header or vol_page.has_footer): diff --git a/setup.py b/setup.py index 9fa5ac0..215fae3 100644 --- a/setup.py +++ b/setup.py @@ -9,8 +9,10 @@ import atexit import tarfile + __version__ = '0.1.57' + install_requires = ['PyLD', 'future', 'prov', 'unicodecsv', 'progressbar2', 'pandas', 'requests', 'argparse==1.1', 'topicexplorer==1.0b226', 'numpy==1.16.2', 'tqdm==4.46.0'] # TODO: migrate to docs confix:, 'sphinx-argparse', 'sphinxcontrib-fulltoc']