Merge pull request #38 from seasidesparrow/update_config.20240730
Update config.20240730
seasidesparrow authored Jul 30, 2024
2 parents 04691a5 + 717c52f commit 9d21568
Showing 5 changed files with 45 additions and 38 deletions.
8 changes: 4 additions & 4 deletions adsdocmatch/oracle_util.py
@@ -499,8 +499,8 @@ def query(self, output_filename, days=None):
         return 'Got %d records from db.' % count
 
     def dump_oracledb(self):
-        daily_file = config.get('DOCMATCHPIPELINE_ORACLE_DUMP_FILE', '/tmp/oracle_dump.tsv')
-        daily_maxage = config.get('DOCMATCHPIPELINE_ORACLE_DUMP_AGE', 9999)
+        daily_file = config.get("DOCMATCHPIPELINE_PUBLISHED_DIR", "/tmp/") + config.get("DOCMATCHPIPELINE_ORACLE_DUMP_FILE", "oracle_dump.tsv")
+        daily_maxage = config.get("DOCMATCHPIPELINE_ORACLE_DUMP_AGE", 9999)
         result = self.query(daily_file, days=daily_maxage)
         logger.info('Query returns: %s; Oracle db successfully dumped to file: %s' % (result, daily_file))

@@ -616,8 +616,8 @@ def cleanup_db(self):
 
     def load_curated_file(self, input_filename=None, frozen_filename=None, input_score=1.0, do_backup=True):
         if not input_filename:
-            input_filename = config.get("DOCMATCHPIPELINE_USER_SUBMITTED_FILE", "/tmp/user_submitted.list")
-            frozen_filename = config.get("DOCMATCHPIPELINE_USER_SUBMITTED_FROZEN_FILE", "/tmp/user_submitted_frozen.list")
+            input_filename = config.get("DOCMATCHPIPELINE_PUBLISHED_DIR", "/tmp/") + config.get("DOCMATCHPIPELINE_USER_SUBMITTED_FILE", "user_submitted.list")
+            frozen_filename = config.get("DOCMATCHPIPELINE_PUBLISHED_DIR", "/tmp/") + config.get("DOCMATCHPIPELINE_USER_SUBMITTED_FROZEN_FILE", "/tmp/user_submitted_frozen.list")
         input_pairs, failed_lines = utils.read_user_submitted(input_filename)
         if failed_lines:
             logger.warning("read_user_submitted found %s failing lines: %s" % (str(len(failed_lines)), str(failed_lines)))
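
Note on the oracle_util.py hunks above: the dump and user-submitted files are no longer single hard-coded paths; each is now composed as DOCMATCHPIPELINE_PUBLISHED_DIR plus a bare file name, both read from config. A minimal sketch of that composition with the shipped defaults (the config dict below is illustrative only, not the pipeline's real loader):

    # How dump_oracledb() resolves its output path after this change.
    # Values are the defaults from config.py; deployments override the directory.
    config = {
        "DOCMATCHPIPELINE_PUBLISHED_DIR": "/tmp/",
        "DOCMATCHPIPELINE_ORACLE_DUMP_FILE": "oracle_dump.tsv",
    }
    daily_file = config.get("DOCMATCHPIPELINE_PUBLISHED_DIR", "/tmp/") + \
                 config.get("DOCMATCHPIPELINE_ORACLE_DUMP_FILE", "oracle_dump.tsv")
    print(daily_file)  # -> /tmp/oracle_dump.tsv

Because the join is plain string concatenation, the directory value needs its trailing slash.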
22 changes: 11 additions & 11 deletions adsdocmatch/spreadsheet_util.py
@@ -4,9 +4,9 @@
 
 
 proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), "../"))
-conf = load_config(proj_home=proj_home)
+config = load_config(proj_home=proj_home)
 
-logger = setup_logging("docmatching", level=conf.get("LOGGING_LEVEL", "WARN"), proj_home=proj_home, attach_stdout=conf.get("LOG_STDOUT", "FALSE"))
+logger = setup_logging("docmatching", level=config.get("LOGGING_LEVEL", "WARN"), proj_home=proj_home, attach_stdout=config.get("LOG_STDOUT", "FALSE"))
 
 
 class GoogleUploadException(Exception):
@@ -29,9 +29,9 @@ def __init__(self):
         """
         # initially directory is set to top level
-        folderId = conf.get("GOOGLE_BASEDIR_ID", None)
-        secretsPath = conf.get("GOOGLE_SECRETS_FILENAME", None)
-        scopesList = [conf.get("GOOGLE_API_SCOPE", None)]
+        folderId = config.get("GOOGLE_BASEDIR_ID", None)
+        secretsPath = config.get("GOOGLE_SECRETS_FILENAME", None)
+        scopesList = [config.get("GOOGLE_API_SCOPE", None)]
 
         try:
             self.gm = GoogleManager(authtype="service",
@@ -56,7 +56,7 @@ def upload(self, filename):
                       "mtype": "text/csv",
                       "meta_mtype": "application/vnd.google-apps.spreadsheet"}
             # make sure the directory is set to curated
-            self.gm.folderid = conf.get("GOOGLE_BASEDIR_ID", None)
+            self.gm.folderid = config.get("GOOGLE_BASEDIR_ID", None)
             return self.gm.upload_file(**kwargs)
 
         except Exception as err:
@@ -72,7 +72,7 @@ def download(self, metadata):
            kwargs = {"fileId": metadata.get("id", None),
                      "export_type": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"}
            data = self.gm.export_sheet_contents(**kwargs)
-           xls_filename = conf.get("DOCMATCHPIPELINE_DATA_PATH", "./") + metadata.get("name", None) + ".xlsx"
+           xls_filename = config.get("DOCMATCHPIPELINE_DATA_PATH", "./") + metadata.get("name", None) + ".xlsx"
            with open(xls_filename, "wb") as fx:
                fx.write(data)
            return xls_filename
@@ -89,13 +89,13 @@ def archive(self, metadata):
        try:
            # reparent curated to archive on Google Drive, ...
            file_id = metadata.get("id", None)
-           old_parent = conf.get("GOOGLE_CURATED_FOLDER_ID", None)
+           old_parent = config.get("GOOGLE_CURATED_FOLDER_ID", None)
            kwargs = {"fileId": file_id,
                      "removeParents": old_parent,
-                     "addParents": conf.get("GOOGLE_ARCHIVE_FOLDER_ID", None)}
+                     "addParents": config.get("GOOGLE_ARCHIVE_FOLDER_ID", None)}
            if old_parent in metadata.get("parents", []):
                # make sure the directory is set to top level
-               self.gm.folderid = conf.get("GOOGLE_BASEDIR_ID", None)
+               self.gm.folderid = config.get("GOOGLE_BASEDIR_ID", None)
                self.gm.reparent_file(**kwargs)
 
        except Exception as err:
@@ -107,5 +107,5 @@ def get_curated_filenames(self):
        :return:
        """
        # make sure the directory is set to curated
-       self.gm.folderid = conf.get("GOOGLE_CURATED_FOLDER_ID", None)
+       self.gm.folderid = config.get("GOOGLE_CURATED_FOLDER_ID", None)
        return self.gm.list_files()
2 changes: 0 additions & 2 deletions adsdocmatch/utils.py
@@ -3,10 +3,8 @@
 import pwd
 import re
 from datetime import datetime
-from adsputils import load_config
 
 proj_home = os.path.realpath(os.path.dirname(__file__)+ "/../")
-conf = load_config(proj_home=proj_home)
 
 class BackupFileException(Exception):
     pass
41 changes: 25 additions & 16 deletions config.py
@@ -1,29 +1,29 @@
 LOGGING_LEVEL="WARN"
 LOG_STDOUT=True
 
-DOCMATCHPIPELINE_API_TOKEN = 'api token'
-DOCMATCHPIPELINE_API_ORACLE_SERVICE_URL = 'http://0.0.0.0:5000'
-DOCMATCHPIPELINE_API_JOURNALS_SERVICE_URL = 'http://0.0.0.0:5050'
-DOCMATCHPIPELINE_API_MAX_RECORDS_TO_ORACLE = '5000'
-DOCMATCHPIPELINE_API_ORACLE_SERVICE_ATTEMPTS = '10'
-DOCMATCHPIPELINE_API_ORACLE_SERVICE_SLEEP_SEC = '1'
+DOCMATCHPIPELINE_API_TOKEN = "api token"
+DOCMATCHPIPELINE_API_ORACLE_SERVICE_URL = "http://0.0.0.0:5000"
+DOCMATCHPIPELINE_API_JOURNALS_SERVICE_URL = "http://0.0.0.0:5050"
+DOCMATCHPIPELINE_API_MAX_RECORDS_TO_ORACLE = "5000"
+DOCMATCHPIPELINE_API_ORACLE_SERVICE_ATTEMPTS = "10"
+DOCMATCHPIPELINE_API_ORACLE_SERVICE_SLEEP_SEC = "1"
 
 # input filenames
-DOCMATCHPIPELINE_INPUT_FILENAME = '/match_oracle.input'
+DOCMATCHPIPELINE_INPUT_FILENAME = "/match_oracle.input"
 
 # classic match of arxiv to published, or vice versa
-DOCMATCHPIPELINE_CLASSIC_MATCHES_FILENAME = '/match.out'
+DOCMATCHPIPELINE_CLASSIC_MATCHES_FILENAME = "/match.out"
 
 # intermediate step filenames
-DOCMATCHPIPELINE_EPRINT_RESULT_FILENAME = '/matched_eprint.output.csv'
-DOCMATCHPIPELINE_PUB_RESULT_FILENAME = '/matched_pub.output.csv'
+DOCMATCHPIPELINE_EPRINT_RESULT_FILENAME = "/matched_eprint.output.csv"
+DOCMATCHPIPELINE_PUB_RESULT_FILENAME = "/matched_pub.output.csv"
 
 # final filename to be uploaded to google drive
-DOCMATCHPIPELINE_EPRINT_COMBINED_FILENAME = '/compare_eprint.csv'
-DOCMATCHPIPELINE_PUB_COMBINED_FILENAME = '/compare_pub.csv'
+DOCMATCHPIPELINE_EPRINT_COMBINED_FILENAME = "/compare_eprint.csv"
+DOCMATCHPIPELINE_PUB_COMBINED_FILENAME = "/compare_pub.csv"
 
 # filename to log failed metadata filenames
-DOCMATCHPIPELINE_RERUN_FILENAME = '../rerun.input'
+DOCMATCHPIPELINE_RERUN_FILENAME = "../rerun.input"
 
 # Google Drive integration
 GOOGLE_SECRETS_FILENAME = "credentials.txt"
@@ -42,6 +42,15 @@
 # how many months to log the arXiv article that was not matched, among the classes of the arXiv that should have been matched
 DOCMATCHPIPELINE_EPRINT_RERUN_MONTHS = 12
 
-# daily dump of oracledb to text, for use by classic
-DOCMATCHPIPELINE_ORACLE_DUMP_FILE = './oracle_dump.list'
-DOCMATCHPIPELINE_ORACLE_DUMP_AGE = 9999
+# backend maintenance directory and files
+# define the correct DOCMATCHPIPELINE_PUBLISHED_DIR in deployment yamls
+DOCMATCHPIPELINE_PUBLISHED_DIR="/tmp/"
+
+DOCMATCHPIPELINE_MATCHES_KILL_FILE="matches.kill"
+DOCMATCHPIPELINE_MATCHES_KILL_FROZEN_FILE="matches.kill.frozen"
+DOCMATCHPIPELINE_ORACLE_DUMP_FILE="oracle_dump.tsv"
+DOCMATCHPIPELINE_ORACLE_DUMP_AGE=9999
+DOCMATCHPIPELINE_USER_SUBMITTED_FILE="user_submitted.list"
+DOCMATCHPIPELINE_USER_SUBMITTED_FROZEN_FILE="user_submitted_frozen.list"
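
The keys added above are bare file names; every consumer in this diff prefixes them with DOCMATCHPIPELINE_PUBLISHED_DIR, so relocating the whole set of maintenance files only requires overriding that one directory, as the comment about deployment yamls indicates. Purely as an illustration (the /data/published/ path is made up, and the local_config.py override shown is the usual adsputils convention rather than something introduced by this PR):

    # local_config.py -- hypothetical deployment override, example path only
    DOCMATCHPIPELINE_PUBLISHED_DIR = "/data/published/"   # keep the trailing slash
    # With this in place the pipeline would resolve, e.g.:
    #   /data/published/oracle_dump.tsv
    #   /data/published/matches.kill
    #   /data/published/user_submitted.list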


10 changes: 5 additions & 5 deletions run.py
@@ -6,9 +6,9 @@
 from adsputils import load_config, setup_logging
 
 proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), "./"))
-conf = load_config(proj_home=proj_home)
+config = load_config(proj_home=proj_home)
 
-logger = setup_logging("docmatching", level=conf.get("LOGGING_LEVEL", "WARN"), proj_home=proj_home, attach_stdout=conf.get("LOG_STDOUT", "FALSE"))
+logger = setup_logging("docmatching", level=config.get("LOGGING_LEVEL", "WARN"), proj_home=proj_home, attach_stdout=config.get("LOG_STDOUT", "FALSE"))
 
 
 def get_args():
@@ -129,7 +129,7 @@ def main():
     if args.datapath:
         path = args.datapath
     elif args.date:
-        path = conf.get("EPRINT_BASE_DIRECTORY", "./") + "/" + args.date
+        path = config.get("EPRINT_BASE_DIRECTORY", "./") + "/" + args.date
     if path:
         try:
             if args.match_to_pub:
@@ -183,8 +183,8 @@
 
     # daily: process matches.kill without archiving
     elif args.load_matches_kill:
-        input_filename = conf.get("DOCMATCHPIPELINE_MATCHES_KILL_FILE", "")
-        frozen_filename = conf.get("DOCMATCHPIPELINE_MATCHES_KILL_FROZEN_FILE", "")
+        input_filename = config.get("DOCMATCHPIPELINE_PUBLISHED_DIR", "/tmp/") + config.get("DOCMATCHPIPELINE_MATCHES_KILL_FILE", "matches.kill")
+        frozen_filename = config.get("DOCMATCHPIPELINE_PUBLISHED_DIR", "/tmp/") + config.get("DOCMATCHPIPELINE_MATCHES_KILL_FROZEN_FILE", "matches.kill.frozen")
         if input_filename:
             OracleUtil().load_curated_file(input_filename=input_filename, frozen_filename=frozen_filename, input_score=-1.0, do_backup=False)

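The run.py change above also adjusts behavior slightly: the old defaults for the matches.kill paths were empty strings, so the load_matches_kill step only ran when both keys were configured elsewhere; with this change the paths always resolve under DOCMATCHPIPELINE_PUBLISHED_DIR. A minimal sketch of the values the new defaults produce (illustrative dict, not the real config loader):

    # Paths load_matches_kill resolves with the shipped defaults.
    config = {
        "DOCMATCHPIPELINE_PUBLISHED_DIR": "/tmp/",
        "DOCMATCHPIPELINE_MATCHES_KILL_FILE": "matches.kill",
        "DOCMATCHPIPELINE_MATCHES_KILL_FROZEN_FILE": "matches.kill.frozen",
    }
    input_filename = config["DOCMATCHPIPELINE_PUBLISHED_DIR"] + config["DOCMATCHPIPELINE_MATCHES_KILL_FILE"]
    frozen_filename = config["DOCMATCHPIPELINE_PUBLISHED_DIR"] + config["DOCMATCHPIPELINE_MATCHES_KILL_FROZEN_FILE"]
    # input_filename  -> /tmp/matches.kill
    # frozen_filename -> /tmp/matches.kill.frozen

Both paths are then handed to OracleUtil().load_curated_file() with input_score=-1.0 and do_backup=False, exactly as in the hunk above.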
