From d7c921e7b8ea69a4c9875a48468b6e8dee4484d2 Mon Sep 17 00:00:00 2001
From: Matthew Templeton
Date: Tue, 30 Jul 2024 14:14:32 -0400
Subject: [PATCH 1/2] Mods to config variables, added PUBLISHED_DIR

	modified:   adsdocmatch/oracle_util.py
	modified:   adsdocmatch/spreadsheet_util.py
	modified:   adsdocmatch/utils.py
	modified:   config.py
	modified:   run.py
---
 adsdocmatch/oracle_util.py      |  8 +++---
 adsdocmatch/spreadsheet_util.py | 22 ++++++++---------
 adsdocmatch/utils.py            |  2 --
 config.py                       | 43 ++++++++++++++++++++-------------
 run.py                          | 10 ++++----
 5 files changed, 46 insertions(+), 39 deletions(-)

diff --git a/adsdocmatch/oracle_util.py b/adsdocmatch/oracle_util.py
index e9d2fe9..eb1b712 100644
--- a/adsdocmatch/oracle_util.py
+++ b/adsdocmatch/oracle_util.py
@@ -499,8 +499,8 @@ def query(self, output_filename, days=None):
         return 'Got %d records from db.' % count
 
     def dump_oracledb(self):
-        daily_file = config.get('DOCMATCHPIPELINE_ORACLE_DUMP_FILE', '/tmp/oracle_dump.tsv')
-        daily_maxage = config.get('DOCMATCHPIPELINE_ORACLE_DUMP_AGE', 9999)
+        daily_file = config.get("DOCMATCHPIPELINE_PUBLISHED_DIR", "/tmp/") + config.get("DOCMATCHPIPELINE_ORACLE_DUMP_FILE", "oracle_dump.tsv")
+        daily_maxage = config.get("DOCMATCHPIPELINE_ORACLE_DUMP_AGE", 9999)
 
         result = self.query(daily_file, days=daily_maxage)
         logger.info('Query returns: %s; Oracle db successfully dumped to file: %s' % (result, daily_file))
@@ -616,8 +616,8 @@ def cleanup_db(self):
 
     def load_curated_file(self, input_filename=None, frozen_filename=None, input_score=1.0, do_backup=True):
         if not input_filename:
-            input_filename = config.get("DOCMATCHPIPELINE_USER_SUBMITTED_FILE", "/tmp/user_submitted.list")
-            frozen_filename = config.get("DOCMATCHPIPELINE_USER_SUBMITTED_FROZEN_FILE", "/tmp/user_submitted_frozen.list")
+            input_filename = config.get("DOCMATCHPIPELINE_PUBLISHED_DIR", "/tmp/") + config.get("DOCMATCHPIPELINE_USER_SUBMITTED_FILE", "user_submitted.list")
+            frozen_filename = config.get("DOCMATCHPIPELINE_PUBLISHED_DIR", "/tmp/") + config.get("DOCMATCHPIPELINE_USER_SUBMITTED_FROZEN_FILE", "user_submitted_frozen.list")
         input_pairs, failed_lines = utils.read_user_submitted(input_filename)
         if failed_lines:
             logger.warning("read_user_submitted found %s failing lines: %s" % (str(len(failed_lines)), str(failed_lines)))
diff --git a/adsdocmatch/spreadsheet_util.py b/adsdocmatch/spreadsheet_util.py
index c74934e..3425e09 100644
--- a/adsdocmatch/spreadsheet_util.py
+++ b/adsdocmatch/spreadsheet_util.py
@@ -4,9 +4,9 @@
 
 proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), "../"))
-conf = load_config(proj_home=proj_home)
+config = load_config(proj_home=proj_home)
 
-logger = setup_logging("docmatching", level=conf.get("LOGGING_LEVEL", "WARN"), proj_home=proj_home, attach_stdout=conf.get("LOG_STDOUT", "FALSE"))
+logger = setup_logging("docmatching", level=config.get("LOGGING_LEVEL", "WARN"), proj_home=proj_home, attach_stdout=config.get("LOG_STDOUT", "FALSE"))
 
 
 class GoogleUploadException(Exception):
@@ -29,9 +29,9 @@ def __init__(self):
         """
 
         # initially directory is set to top level
-        folderId = conf.get("GOOGLE_BASEDIR_ID", None)
-        secretsPath = conf.get("GOOGLE_SECRETS_FILENAME", None)
-        scopesList = [conf.get("GOOGLE_API_SCOPE", None)]
+        folderId = config.get("GOOGLE_BASEDIR_ID", None)
+        secretsPath = config.get("GOOGLE_SECRETS_FILENAME", None)
+        scopesList = [config.get("GOOGLE_API_SCOPE", None)]
 
         try:
             self.gm = GoogleManager(authtype="service",
@@ -56,7 +56,7 @@ def upload(self, filename):
                       "mtype": "text/csv",
                       "meta_mtype": "application/vnd.google-apps.spreadsheet"}
"application/vnd.google-apps.spreadsheet"} # make sure the directory is set to curated - self.gm.folderid = conf.get("GOOGLE_BASEDIR_ID", None) + self.gm.folderid = config.get("GOOGLE_BASEDIR_ID", None) return self.gm.upload_file(**kwargs) except Exception as err: @@ -72,7 +72,7 @@ def download(self, metadata): kwargs = {"fileId": metadata.get("id", None), "export_type": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"} data = self.gm.export_sheet_contents(**kwargs) - xls_filename = conf.get("DOCMATCHPIPELINE_DATA_PATH", "./") + metadata.get("name", None) + ".xlsx" + xls_filename = config.get("DOCMATCHPIPELINE_DATA_PATH", "./") + metadata.get("name", None) + ".xlsx" with open(xls_filename, "wb") as fx: fx.write(data) return xls_filename @@ -89,13 +89,13 @@ def archive(self, metadata): try: # reparent curated to archive on Google Drive, ... file_id = metadata.get("id", None) - old_parent = conf.get("GOOGLE_CURATED_FOLDER_ID", None) + old_parent = config.get("GOOGLE_CURATED_FOLDER_ID", None) kwargs = {"fileId": file_id, "removeParents": old_parent, - "addParents": conf.get("GOOGLE_ARCHIVE_FOLDER_ID", None)} + "addParents": config.get("GOOGLE_ARCHIVE_FOLDER_ID", None)} if old_parent in metadata.get("parents", []): # make sure the directory is set to top level - self.gm.folderid = conf.get("GOOGLE_BASEDIR_ID", None) + self.gm.folderid = config.get("GOOGLE_BASEDIR_ID", None) self.gm.reparent_file(**kwargs) except Exception as err: @@ -107,5 +107,5 @@ def get_curated_filenames(self): :return: """ # make sure the directory is set to curated - self.gm.folderid = conf.get("GOOGLE_CURATED_FOLDER_ID", None) + self.gm.folderid = config.get("GOOGLE_CURATED_FOLDER_ID", None) return self.gm.list_files() diff --git a/adsdocmatch/utils.py b/adsdocmatch/utils.py index 10b64c6..5c762b3 100644 --- a/adsdocmatch/utils.py +++ b/adsdocmatch/utils.py @@ -3,10 +3,8 @@ import pwd import re from datetime import datetime -from adsputils import load_config proj_home = os.path.realpath(os.path.dirname(__file__)+ "/../") -conf = load_config(proj_home=proj_home) class BackupFileException(Exception): pass diff --git a/config.py b/config.py index 0e8acdb..7f5a785 100644 --- a/config.py +++ b/config.py @@ -1,29 +1,29 @@ LOGGING_LEVEL="WARN" LOG_STDOUT=True -DOCMATCHPIPELINE_API_TOKEN = 'api token' -DOCMATCHPIPELINE_API_ORACLE_SERVICE_URL = 'http://0.0.0.0:5000' -DOCMATCHPIPELINE_API_JOURNALS_SERVICE_URL = 'http://0.0.0.0:5050' -DOCMATCHPIPELINE_API_MAX_RECORDS_TO_ORACLE = '5000' -DOCMATCHPIPELINE_API_ORACLE_SERVICE_ATTEMPTS = '10' -DOCMATCHPIPELINE_API_ORACLE_SERVICE_SLEEP_SEC = '1' +DOCMATCHPIPELINE_API_TOKEN = "api token" +DOCMATCHPIPELINE_API_ORACLE_SERVICE_URL = "http://0.0.0.0:5000" +DOCMATCHPIPELINE_API_JOURNALS_SERVICE_URL = "http://0.0.0.0:5050" +DOCMATCHPIPELINE_API_MAX_RECORDS_TO_ORACLE = "5000" +DOCMATCHPIPELINE_API_ORACLE_SERVICE_ATTEMPTS = "10" +DOCMATCHPIPELINE_API_ORACLE_SERVICE_SLEEP_SEC = "1" # input filenames -DOCMATCHPIPELINE_INPUT_FILENAME = '/match_oracle.input' +DOCMATCHPIPELINE_INPUT_FILENAME = "/match_oracle.input" # classic match of arxiv to published, or vice versa -DOCMATCHPIPELINE_CLASSIC_MATCHES_FILENAME = '/match.out' +DOCMATCHPIPELINE_CLASSIC_MATCHES_FILENAME = "/match.out" # intermediate step filenames -DOCMATCHPIPELINE_EPRINT_RESULT_FILENAME = '/matched_eprint.output.csv' -DOCMATCHPIPELINE_PUB_RESULT_FILENAME = '/matched_pub.output.csv' +DOCMATCHPIPELINE_EPRINT_RESULT_FILENAME = "/matched_eprint.output.csv" +DOCMATCHPIPELINE_PUB_RESULT_FILENAME = "/matched_pub.output.csv" 
 
 # final filename to be uploaded to google drive
-DOCMATCHPIPELINE_EPRINT_COMBINED_FILENAME = '/compare_eprint.csv'
-DOCMATCHPIPELINE_PUB_COMBINED_FILENAME = '/compare_pub.csv'
+DOCMATCHPIPELINE_EPRINT_COMBINED_FILENAME = "/compare_eprint.csv"
+DOCMATCHPIPELINE_PUB_COMBINED_FILENAME = "/compare_pub.csv"
 
 # filename to log failed metadata filenames
-DOCMATCHPIPELINE_RERUN_FILENAME = '../rerun.input'
+DOCMATCHPIPELINE_RERUN_FILENAME = "../rerun.input"
 
 # Google Drive integration
 GOOGLE_SECRETS_FILENAME = "credentials.txt"
@@ -40,8 +40,17 @@ DOCMATCHPIPELINE_SOURCE_INCORRECT = "incorrect"
 
 # how many months to log the arXiv article that was not matched, among the classes of the arXiv that should have been matched
-DOCMATCHPIPELINE_EPRINT_RERUN_MONTHS = 12
+DOCMATCHPIPELINE_EPRINT_RERUN_MONTHS = 1
+
+# backend maintenance directory and files
+# define the correct DOCMATCHPIPELINE_PUBLISHED_DIR in deployment yamls
+DOCMATCHPIPELINE_PUBLISHED_DIR="/tmp/"
+
+DOCMATCHPIPELINE_MATCHES_KILL_FILE="matches.kill"
+DOCMATCHPIPELINE_MATCHES_KILL_FROZEN_FILE="matches.kill.frozen"
+DOCMATCHPIPELINE_ORACLE_DUMP_FILE="oracle_dump.tsv"
+DOCMATCHPIPELINE_ORACLE_DUMP_AGE=9999
+DOCMATCHPIPELINE_USER_SUBMITTED_FILE="user_submitted.list"
+DOCMATCHPIPELINE_USER_SUBMITTED_FROZEN_FILE="user_submitted_frozen.list"
+
-# daily dump of oracledb to text, for use by classic
-DOCMATCHPIPELINE_ORACLE_DUMP_FILE = './oracle_dump.list'
-DOCMATCHPIPELINE_ORACLE_DUMP_AGE = 9999
diff --git a/run.py b/run.py
index 8702249..aeabbc2 100644
--- a/run.py
+++ b/run.py
@@ -6,9 +6,9 @@
 from adsputils import load_config, setup_logging
 
 proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), "./"))
-conf = load_config(proj_home=proj_home)
+config = load_config(proj_home=proj_home)
 
-logger = setup_logging("docmatching", level=conf.get("LOGGING_LEVEL", "WARN"), proj_home=proj_home, attach_stdout=conf.get("LOG_STDOUT", "FALSE"))
+logger = setup_logging("docmatching", level=config.get("LOGGING_LEVEL", "WARN"), proj_home=proj_home, attach_stdout=config.get("LOG_STDOUT", "FALSE"))
 
 
 def get_args():
@@ -129,7 +129,7 @@ def main():
     if args.datapath:
         path = args.datapath
     elif args.date:
-        path = conf.get("EPRINT_BASE_DIRECTORY", "./") + "/" + args.date
+        path = config.get("EPRINT_BASE_DIRECTORY", "./") + "/" + args.date
     if path:
         try:
             if args.match_to_pub:
@@ -183,8 +183,8 @@ def main():
 
     # daily: process matches.kill without archiving
     elif args.load_matches_kill:
-        input_filename = conf.get("DOCMATCHPIPELINE_MATCHES_KILL_FILE", "")
-        frozen_filename = conf.get("DOCMATCHPIPELINE_MATCHES_KILL_FROZEN_FILE", "")
+        input_filename = config.get("DOCMATCHPIPELINE_PUBLISHED_DIR", "/tmp/") + config.get("DOCMATCHPIPELINE_MATCHES_KILL_FILE", "matches.kill")
+        frozen_filename = config.get("DOCMATCHPIPELINE_PUBLISHED_DIR", "/tmp/") + config.get("DOCMATCHPIPELINE_MATCHES_KILL_FROZEN_FILE", "matches.kill.frozen")
         if input_filename:
             OracleUtil().load_curated_file(input_filename=input_filename, frozen_filename=frozen_filename, input_score=-1.0, do_backup=False)
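Note: patch 1 establishes a path-building convention: DOCMATCHPIPELINE_PUBLISHED_DIR holds the shared maintenance directory (ending in a slash) and the per-file settings hold bare filenames, which callers concatenate onto it. A minimal sketch of the pattern, using a plain dict in place of the adsputils config and a hypothetical published_path() helper, neither of which is project code:

```python
# Stand-in for the values the pipeline loads via adsputils.load_config();
# this dict and the helper below are illustrative only.
config = {
    "DOCMATCHPIPELINE_PUBLISHED_DIR": "/tmp/",
    "DOCMATCHPIPELINE_ORACLE_DUMP_FILE": "oracle_dump.tsv",
    "DOCMATCHPIPELINE_MATCHES_KILL_FILE": "matches.kill",
}

def published_path(file_key, default_name):
    # Mirrors the pattern the patch repeats inline: shared directory + bare filename.
    base = config.get("DOCMATCHPIPELINE_PUBLISHED_DIR", "/tmp/")
    # Plain string concatenation, as in the patch; it relies on the directory
    # value ending in "/". os.path.join(base, name) would tolerate a missing slash.
    return base + config.get(file_key, default_name)

print(published_path("DOCMATCHPIPELINE_ORACLE_DUMP_FILE", "oracle_dump.tsv"))
# -> /tmp/oracle_dump.tsv
```

This is also why the file defaults change from absolute paths like "/tmp/oracle_dump.tsv" to bare names: an absolute default appended onto the directory would produce paths like "/tmp//tmp/oracle_dump.tsv".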
From 717c52ff978699a614cb18367c7fa64b0b66def5 Mon Sep 17 00:00:00 2001
From: Matthew Templeton
Date: Tue, 30 Jul 2024 15:57:56 -0400
Subject: [PATCH 2/2] Fixed spurious edit to rerun_months in config

	modified:   config.py
---
 config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config.py b/config.py
index 7f5a785..55d054b 100644
--- a/config.py
+++ b/config.py
@@ -40,7 +40,7 @@ DOCMATCHPIPELINE_SOURCE_INCORRECT = "incorrect"
 
 # how many months to log the arXiv article that was not matched, among the classes of the arXiv that should have been matched
-DOCMATCHPIPELINE_EPRINT_RERUN_MONTHS = 1
+DOCMATCHPIPELINE_EPRINT_RERUN_MONTHS = 12
 
 # backend maintenance directory and files
 # define the correct DOCMATCHPIPELINE_PUBLISHED_DIR in deployment yamls
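Note: per the comment above, deployments are expected to override DOCMATCHPIPELINE_PUBLISHED_DIR rather than rely on the /tmp/ default. Assuming the usual adsputils convention that a local_config.py beside config.py overrides its values at load time, a minimal sketch (the directory path is illustrative):

```python
# local_config.py -- deployment-specific overrides picked up by
# adsputils.load_config(); values here take precedence over config.py.
# Keep the trailing slash: the pipeline builds file paths by plain
# string concatenation with the bare filenames defined above.
DOCMATCHPIPELINE_PUBLISHED_DIR = "/app/docmatch/published/"
```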