From d7c921e7b8ea69a4c9875a48468b6e8dee4484d2 Mon Sep 17 00:00:00 2001
From: Matthew Templeton
Date: Tue, 30 Jul 2024 14:14:32 -0400
Subject: [PATCH 1/2] Mods to config variables, added PUBLISHED_DIR

	modified:   adsdocmatch/oracle_util.py
	modified:   adsdocmatch/spreadsheet_util.py
	modified:   adsdocmatch/utils.py
	modified:   config.py
	modified:   run.py
---
 adsdocmatch/oracle_util.py      |  8 +++---
 adsdocmatch/spreadsheet_util.py | 22 ++++++++---------
 adsdocmatch/utils.py            |  2 --
 config.py                       | 43 ++++++++++++++++++++-------------
 run.py                          | 10 ++++----
 5 files changed, 46 insertions(+), 39 deletions(-)

diff --git a/adsdocmatch/oracle_util.py b/adsdocmatch/oracle_util.py
index e9d2fe9..eb1b712 100644
--- a/adsdocmatch/oracle_util.py
+++ b/adsdocmatch/oracle_util.py
@@ -499,8 +499,8 @@ def query(self, output_filename, days=None):
         return 'Got %d records from db.' % count
 
     def dump_oracledb(self):
-        daily_file = config.get('DOCMATCHPIPELINE_ORACLE_DUMP_FILE', '/tmp/oracle_dump.tsv')
-        daily_maxage = config.get('DOCMATCHPIPELINE_ORACLE_DUMP_AGE', 9999)
+        daily_file = config.get("DOCMATCHPIPELINE_PUBLISHED_DIR", "/tmp/") + config.get("DOCMATCHPIPELINE_ORACLE_DUMP_FILE", "oracle_dump.tsv")
+        daily_maxage = config.get("DOCMATCHPIPELINE_ORACLE_DUMP_AGE", 9999)
 
         result = self.query(daily_file, days=daily_maxage)
         logger.info('Query returns: %s; Oracle db successfully dumped to file: %s' % (result, daily_file))
@@ -616,8 +616,8 @@ def cleanup_db(self):
 
     def load_curated_file(self, input_filename=None, frozen_filename=None, input_score=1.0, do_backup=True):
         if not input_filename:
-            input_filename = config.get("DOCMATCHPIPELINE_USER_SUBMITTED_FILE", "/tmp/user_submitted.list")
-            frozen_filename = config.get("DOCMATCHPIPELINE_USER_SUBMITTED_FROZEN_FILE", "/tmp/user_submitted_frozen.list")
+            input_filename = config.get("DOCMATCHPIPELINE_PUBLISHED_DIR", "/tmp/") + config.get("DOCMATCHPIPELINE_USER_SUBMITTED_FILE", "user_submitted.list")
+            frozen_filename = config.get("DOCMATCHPIPELINE_PUBLISHED_DIR", "/tmp/") + config.get("DOCMATCHPIPELINE_USER_SUBMITTED_FROZEN_FILE", "user_submitted_frozen.list")
         input_pairs, failed_lines = utils.read_user_submitted(input_filename)
         if failed_lines:
             logger.warning("read_user_submitted found %s failing lines: %s" % (str(len(failed_lines)), str(failed_lines)))
diff --git a/adsdocmatch/spreadsheet_util.py b/adsdocmatch/spreadsheet_util.py
index c74934e..3425e09 100644
--- a/adsdocmatch/spreadsheet_util.py
+++ b/adsdocmatch/spreadsheet_util.py
@@ -4,9 +4,9 @@
 
 proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), "../"))
-conf = load_config(proj_home=proj_home)
+config = load_config(proj_home=proj_home)
 
-logger = setup_logging("docmatching", level=conf.get("LOGGING_LEVEL", "WARN"), proj_home=proj_home, attach_stdout=conf.get("LOG_STDOUT", "FALSE"))
+logger = setup_logging("docmatching", level=config.get("LOGGING_LEVEL", "WARN"), proj_home=proj_home, attach_stdout=config.get("LOG_STDOUT", "FALSE"))
 
 
 class GoogleUploadException(Exception):
@@ -29,9 +29,9 @@ def __init__(self):
         """
 
         # initially directory is set to top level
-        folderId = conf.get("GOOGLE_BASEDIR_ID", None)
-        secretsPath = conf.get("GOOGLE_SECRETS_FILENAME", None)
-        scopesList = [conf.get("GOOGLE_API_SCOPE", None)]
+        folderId = config.get("GOOGLE_BASEDIR_ID", None)
+        secretsPath = config.get("GOOGLE_SECRETS_FILENAME", None)
+        scopesList = [config.get("GOOGLE_API_SCOPE", None)]
 
         try:
             self.gm = GoogleManager(authtype="service",
@@ -56,7 +56,7 @@ def upload(self, filename):
                       "mtype": "text/csv",
                       "meta_mtype": "application/vnd.google-apps.spreadsheet"}
"application/vnd.google-apps.spreadsheet"} # make sure the directory is set to curated - self.gm.folderid = conf.get("GOOGLE_BASEDIR_ID", None) + self.gm.folderid = config.get("GOOGLE_BASEDIR_ID", None) return self.gm.upload_file(**kwargs) except Exception as err: @@ -72,7 +72,7 @@ def download(self, metadata): kwargs = {"fileId": metadata.get("id", None), "export_type": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"} data = self.gm.export_sheet_contents(**kwargs) - xls_filename = conf.get("DOCMATCHPIPELINE_DATA_PATH", "./") + metadata.get("name", None) + ".xlsx" + xls_filename = config.get("DOCMATCHPIPELINE_DATA_PATH", "./") + metadata.get("name", None) + ".xlsx" with open(xls_filename, "wb") as fx: fx.write(data) return xls_filename @@ -89,13 +89,13 @@ def archive(self, metadata): try: # reparent curated to archive on Google Drive, ... file_id = metadata.get("id", None) - old_parent = conf.get("GOOGLE_CURATED_FOLDER_ID", None) + old_parent = config.get("GOOGLE_CURATED_FOLDER_ID", None) kwargs = {"fileId": file_id, "removeParents": old_parent, - "addParents": conf.get("GOOGLE_ARCHIVE_FOLDER_ID", None)} + "addParents": config.get("GOOGLE_ARCHIVE_FOLDER_ID", None)} if old_parent in metadata.get("parents", []): # make sure the directory is set to top level - self.gm.folderid = conf.get("GOOGLE_BASEDIR_ID", None) + self.gm.folderid = config.get("GOOGLE_BASEDIR_ID", None) self.gm.reparent_file(**kwargs) except Exception as err: @@ -107,5 +107,5 @@ def get_curated_filenames(self): :return: """ # make sure the directory is set to curated - self.gm.folderid = conf.get("GOOGLE_CURATED_FOLDER_ID", None) + self.gm.folderid = config.get("GOOGLE_CURATED_FOLDER_ID", None) return self.gm.list_files() diff --git a/adsdocmatch/utils.py b/adsdocmatch/utils.py index 10b64c6..5c762b3 100644 --- a/adsdocmatch/utils.py +++ b/adsdocmatch/utils.py @@ -3,10 +3,8 @@ import pwd import re from datetime import datetime -from adsputils import load_config proj_home = os.path.realpath(os.path.dirname(__file__)+ "/../") -conf = load_config(proj_home=proj_home) class BackupFileException(Exception): pass diff --git a/config.py b/config.py index 0e8acdb..7f5a785 100644 --- a/config.py +++ b/config.py @@ -1,29 +1,29 @@ LOGGING_LEVEL="WARN" LOG_STDOUT=True -DOCMATCHPIPELINE_API_TOKEN = 'api token' -DOCMATCHPIPELINE_API_ORACLE_SERVICE_URL = 'http://0.0.0.0:5000' -DOCMATCHPIPELINE_API_JOURNALS_SERVICE_URL = 'http://0.0.0.0:5050' -DOCMATCHPIPELINE_API_MAX_RECORDS_TO_ORACLE = '5000' -DOCMATCHPIPELINE_API_ORACLE_SERVICE_ATTEMPTS = '10' -DOCMATCHPIPELINE_API_ORACLE_SERVICE_SLEEP_SEC = '1' +DOCMATCHPIPELINE_API_TOKEN = "api token" +DOCMATCHPIPELINE_API_ORACLE_SERVICE_URL = "http://0.0.0.0:5000" +DOCMATCHPIPELINE_API_JOURNALS_SERVICE_URL = "http://0.0.0.0:5050" +DOCMATCHPIPELINE_API_MAX_RECORDS_TO_ORACLE = "5000" +DOCMATCHPIPELINE_API_ORACLE_SERVICE_ATTEMPTS = "10" +DOCMATCHPIPELINE_API_ORACLE_SERVICE_SLEEP_SEC = "1" # input filenames -DOCMATCHPIPELINE_INPUT_FILENAME = '/match_oracle.input' +DOCMATCHPIPELINE_INPUT_FILENAME = "/match_oracle.input" # classic match of arxiv to published, or vice versa -DOCMATCHPIPELINE_CLASSIC_MATCHES_FILENAME = '/match.out' +DOCMATCHPIPELINE_CLASSIC_MATCHES_FILENAME = "/match.out" # intermediate step filenames -DOCMATCHPIPELINE_EPRINT_RESULT_FILENAME = '/matched_eprint.output.csv' -DOCMATCHPIPELINE_PUB_RESULT_FILENAME = '/matched_pub.output.csv' +DOCMATCHPIPELINE_EPRINT_RESULT_FILENAME = "/matched_eprint.output.csv" +DOCMATCHPIPELINE_PUB_RESULT_FILENAME = "/matched_pub.output.csv" 
 
 # final filename to be uploaded to google drive
-DOCMATCHPIPELINE_EPRINT_COMBINED_FILENAME = '/compare_eprint.csv'
-DOCMATCHPIPELINE_PUB_COMBINED_FILENAME = '/compare_pub.csv'
+DOCMATCHPIPELINE_EPRINT_COMBINED_FILENAME = "/compare_eprint.csv"
+DOCMATCHPIPELINE_PUB_COMBINED_FILENAME = "/compare_pub.csv"
 
 # filename to log failed metadata filenames
-DOCMATCHPIPELINE_RERUN_FILENAME = '../rerun.input'
+DOCMATCHPIPELINE_RERUN_FILENAME = "../rerun.input"
 
 # Google Drive integration
 GOOGLE_SECRETS_FILENAME = "credentials.txt"
@@ -40,8 +40,17 @@ DOCMATCHPIPELINE_SOURCE_INCORRECT = "incorrect"
 
 # how many months to log the arXiv article that was not matched, among the classes of the arXiv that should have been matched
-DOCMATCHPIPELINE_EPRINT_RERUN_MONTHS = 12
+DOCMATCHPIPELINE_EPRINT_RERUN_MONTHS = 1
+
+# backend maintenance directory and files
+# define the correct DOCMATCHPIPELINE_PUBLISHED_DIR in deployment yamls
+DOCMATCHPIPELINE_PUBLISHED_DIR="/tmp/"
+
+DOCMATCHPIPELINE_MATCHES_KILL_FILE="matches.kill"
+DOCMATCHPIPELINE_MATCHES_KILL_FROZEN_FILE="matches.kill.frozen"
+DOCMATCHPIPELINE_ORACLE_DUMP_FILE="oracle_dump.tsv"
+DOCMATCHPIPELINE_ORACLE_DUMP_AGE=9999
+DOCMATCHPIPELINE_USER_SUBMITTED_FILE="user_submitted.list"
+DOCMATCHPIPELINE_USER_SUBMITTED_FROZEN_FILE="user_submitted_frozen.list"
+
-# daily dump of oracledb to text, for use by classic
-DOCMATCHPIPELINE_ORACLE_DUMP_FILE = './oracle_dump.list'
-DOCMATCHPIPELINE_ORACLE_DUMP_AGE = 9999
diff --git a/run.py b/run.py
index 8702249..aeabbc2 100644
--- a/run.py
+++ b/run.py
@@ -6,9 +6,9 @@
 from adsputils import load_config, setup_logging
 
 proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), "./"))
-conf = load_config(proj_home=proj_home)
+config = load_config(proj_home=proj_home)
 
-logger = setup_logging("docmatching", level=conf.get("LOGGING_LEVEL", "WARN"), proj_home=proj_home, attach_stdout=conf.get("LOG_STDOUT", "FALSE"))
+logger = setup_logging("docmatching", level=config.get("LOGGING_LEVEL", "WARN"), proj_home=proj_home, attach_stdout=config.get("LOG_STDOUT", "FALSE"))
 
 
 def get_args():
@@ -129,7 +129,7 @@ def main():
     if args.datapath:
         path = args.datapath
     elif args.date:
-        path = conf.get("EPRINT_BASE_DIRECTORY", "./") + "/" + args.date
+        path = config.get("EPRINT_BASE_DIRECTORY", "./") + "/" + args.date
     if path:
         try:
             if args.match_to_pub:
@@ -183,8 +183,8 @@ def main():
 
     # daily: process matches.kill without archiving
     elif args.load_matches_kill:
-        input_filename = conf.get("DOCMATCHPIPELINE_MATCHES_KILL_FILE", "")
-        frozen_filename = conf.get("DOCMATCHPIPELINE_MATCHES_KILL_FROZEN_FILE", "")
+        input_filename = config.get("DOCMATCHPIPELINE_PUBLISHED_DIR", "/tmp/") + config.get("DOCMATCHPIPELINE_MATCHES_KILL_FILE", "matches.kill")
+        frozen_filename = config.get("DOCMATCHPIPELINE_PUBLISHED_DIR", "/tmp/") + config.get("DOCMATCHPIPELINE_MATCHES_KILL_FROZEN_FILE", "matches.kill.frozen")
         if input_filename:
             OracleUtil().load_curated_file(input_filename=input_filename, frozen_filename=frozen_filename, input_score=-1.0, do_backup=False)
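Note: patch 1 establishes a path-building convention: DOCMATCHPIPELINE_PUBLISHED_DIR holds the shared maintenance directory (ending in a slash) and the per-file settings hold bare filenames, which callers concatenate onto it. A minimal sketch of the pattern, using a plain dict in place of the adsputils config and a hypothetical published_path() helper, neither of which is project code:

```python
# Stand-in for the values the pipeline loads via adsputils.load_config();
# this dict and the helper below are illustrative only.
config = {
    "DOCMATCHPIPELINE_PUBLISHED_DIR": "/tmp/",
    "DOCMATCHPIPELINE_ORACLE_DUMP_FILE": "oracle_dump.tsv",
    "DOCMATCHPIPELINE_MATCHES_KILL_FILE": "matches.kill",
}

def published_path(file_key, default_name):
    # Mirrors the pattern the patch repeats inline: shared directory + bare filename.
    base = config.get("DOCMATCHPIPELINE_PUBLISHED_DIR", "/tmp/")
    # Plain string concatenation, as in the patch; it relies on the directory
    # value ending in "/". os.path.join(base, name) would tolerate a missing slash.
    return base + config.get(file_key, default_name)

print(published_path("DOCMATCHPIPELINE_ORACLE_DUMP_FILE", "oracle_dump.tsv"))
# -> /tmp/oracle_dump.tsv
```

This is also why the file defaults change from absolute paths like "/tmp/oracle_dump.tsv" to bare names: an absolute default appended onto the directory would produce paths like "/tmp//tmp/oracle_dump.tsv".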
From 717c52ff978699a614cb18367c7fa64b0b66def5 Mon Sep 17 00:00:00 2001
From: Matthew Templeton
Date: Tue, 30 Jul 2024 15:57:56 -0400
Subject: [PATCH 2/2] Fixed spurious edit to rerun_months in config

	modified:   config.py
---
 config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config.py b/config.py
index 7f5a785..55d054b 100644
--- a/config.py
+++ b/config.py
@@ -40,7 +40,7 @@ DOCMATCHPIPELINE_SOURCE_INCORRECT = "incorrect"
 
 # how many months to log the arXiv article that was not matched, among the classes of the arXiv that should have been matched
-DOCMATCHPIPELINE_EPRINT_RERUN_MONTHS = 1
+DOCMATCHPIPELINE_EPRINT_RERUN_MONTHS = 12
 
 # backend maintenance directory and files
 # define the correct DOCMATCHPIPELINE_PUBLISHED_DIR in deployment yamls
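Note: per the comment above, deployments are expected to override DOCMATCHPIPELINE_PUBLISHED_DIR rather than rely on the /tmp/ default. Assuming the usual adsputils convention that a local_config.py beside config.py overrides its values at load time, a minimal sketch (the directory path is illustrative):

```python
# local_config.py -- deployment-specific overrides picked up by
# adsputils.load_config(); values here take precedence over config.py.
# Keep the trailing slash: the pipeline builds file paths by plain
# string concatenation with the bare filenames defined above.
DOCMATCHPIPELINE_PUBLISHED_DIR = "/app/docmatch/published/"
```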