Skip to content

Commit

Permalink
Merge branch 'develop' into merge-develop
Browse files Browse the repository at this point in the history
  • Loading branch information
samithaliyanage authored Jan 19, 2022
2 parents 703de27 + 5225dc7 commit c3d311d
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 2 deletions.
2 changes: 2 additions & 0 deletions htrc/auth.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#import ssl
#import time
import subprocess

import requests
import requests.auth
#import configparser
Expand Down Expand Up @@ -43,6 +44,7 @@ def get_jwt_token():
else:
raise RuntimeError("JWT token retrieval failed: {}".format(data['error']))


def credential_prompt():
"""
A prompt for entering HathiTrust Research Center credentials.
Expand Down
1 change: 1 addition & 0 deletions htrc/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ def get_jwt_token(path=None):
return token

def save_jwt_token(token, path=None):

"""
Saves JWT token in the config file.
"""
Expand Down
14 changes: 12 additions & 2 deletions htrc/volumes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,16 @@

standard_library.install_aliases()


#from builtins import input
from htrc.models import HtrcPage

import http.client
from io import BytesIO, TextIOWrapper
import json
import logging
import os.path
import progressbar

#import re
import socket
import ssl
Expand All @@ -37,7 +38,6 @@
from htrc.runningheaders import parse_page_structure
from functools import partial
import pandas as pd

#from htrc.lib.cli import bool_prompt
from htrc.util import split_items
import htrc.config
Expand Down Expand Up @@ -74,6 +74,7 @@ def get_volumes(data_api_config: htrc.config.HtrcDataApiConfig, volume_ids, conc
data = {'volumeIDs': '|'.join(
[id.replace('+', ':').replace('=', '/') for id in volume_ids])}


if concat:
data['concat'] = 'true'

Expand Down Expand Up @@ -210,6 +211,7 @@ def get_pages(data_api_config: htrc.config.HtrcDataApiConfig, page_ids, concat=F

return data


#def get_oauth2_token(username, password):
# make sure to set the request content-type as application/x-www-form-urlencoded
#headers = {"Content-type": "application/x-www-form-urlencoded"}
Expand Down Expand Up @@ -242,6 +244,7 @@ def get_pages(data_api_config: htrc.config.HtrcDataApiConfig, page_ids, concat=F
#token = jsonData["access_token"]
#logging.info("*** parsed token: {}".format(token))


#else:
#logging.debug("Unable to get token")
#logging.debug("Response Code: {}".format(response.status))
Expand All @@ -252,9 +255,11 @@ def get_pages(data_api_config: htrc.config.HtrcDataApiConfig, page_ids, concat=F
#if httpsConnection is not None:
#httpsConnection.close()


#return token



def grep_error(file_name, output_dir, pattern, txt_index):
na_volume = []
if output_dir.endswith("/"):
Expand All @@ -270,6 +275,9 @@ def grep_error(file_name, output_dir, pattern, txt_index):
return na_volume


return na_volume


def _to_htrc_page(page_file, zip):
    """Build an HtrcPage from a single page member of an archive.

    The bytes returned by ``zip.read(page_file)`` are decoded as UTF-8,
    split into lines, and each line has its trailing whitespace (including
    the line terminator) removed before being handed to ``HtrcPage``.

    :param page_file: name of the member to read from ``zip``.
    :param zip: object exposing ``read(name) -> bytes``
        (presumably a ``zipfile.ZipFile`` -- confirm against callers).
    :return: an ``HtrcPage`` constructed from the list of stripped lines.
    """
    raw_bytes = zip.read(page_file)
    with TextIOWrapper(BytesIO(raw_bytes), encoding='utf-8') as reader:
        stripped_lines = []
        for text_line in reader.readlines():
            stripped_lines.append(text_line.rstrip())
        return HtrcPage(stripped_lines)
Expand Down Expand Up @@ -427,6 +435,8 @@ def _remove_headers_footers_and_save(vol_data, concat, hf_min_similarity, hf_win
with open(os.path.join(output_dir, vol_page_path), 'w', encoding='utf-8') as page_file:
page_file.write(page_body)



removed_hf = []
for vol_page_path, vol_page in zip(sorted_vol_zip_page_paths, vol_pages):
if not (vol_page.has_header or vol_page.has_footer):
Expand Down
2 changes: 2 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@
import atexit
import tarfile


__version__ = '0.1.57'


install_requires = ['PyLD', 'future', 'prov', 'unicodecsv', 'progressbar2', 'pandas',
'requests', 'argparse==1.1', 'topicexplorer==1.0b226', 'numpy==1.16.2', 'tqdm==4.46.0']
# TODO: migrate to docs config: 'sphinx-argparse', 'sphinxcontrib-fulltoc'
Expand Down

0 comments on commit c3d311d

Please sign in to comment.