Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update config.20240730 #38

Merged
merged 2 commits into from
Jul 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions adsdocmatch/oracle_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -499,8 +499,8 @@ def query(self, output_filename, days=None):
return 'Got %d records from db.' % count

def dump_oracledb(self):
daily_file = config.get('DOCMATCHPIPELINE_ORACLE_DUMP_FILE', '/tmp/oracle_dump.tsv')
daily_maxage = config.get('DOCMATCHPIPELINE_ORACLE_DUMP_AGE', 9999)
daily_file = config.get("DOCMATCHPIPELINE_PUBLISHED_DIR", "/tmp/") + config.get("DOCMATCHPIPELINE_ORACLE_DUMP_FILE", "oracle_dump.tsv")
daily_maxage = config.get("DOCMATCHPIPELINE_ORACLE_DUMP_AGE", 9999)
result = self.query(daily_file, days=daily_maxage)
logger.info('Query returns: %s; Oracle db successfully dumped to file: %s' % (result, daily_file))

Expand Down Expand Up @@ -616,8 +616,8 @@ def cleanup_db(self):

def load_curated_file(self, input_filename=None, frozen_filename=None, input_score=1.0, do_backup=True):
if not input_filename:
input_filename = config.get("DOCMATCHPIPELINE_USER_SUBMITTED_FILE", "/tmp/user_submitted.list")
frozen_filename = config.get("DOCMATCHPIPELINE_USER_SUBMITTED_FROZEN_FILE", "/tmp/user_submitted_frozen.list")
input_filename = config.get("DOCMATCHPIPELINE_PUBLISHED_DIR", "/tmp/") + config.get("DOCMATCHPIPELINE_USER_SUBMITTED_FILE", "user_submitted.list")
frozen_filename = config.get("DOCMATCHPIPELINE_PUBLISHED_DIR", "/tmp/") + config.get("DOCMATCHPIPELINE_USER_SUBMITTED_FROZEN_FILE", "user_submitted_frozen.list")
input_pairs, failed_lines = utils.read_user_submitted(input_filename)
if failed_lines:
logger.warning("read_user_submitted found %s failing lines: %s" % (str(len(failed_lines)), str(failed_lines)))
Expand Down
22 changes: 11 additions & 11 deletions adsdocmatch/spreadsheet_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@


proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), "../"))
conf = load_config(proj_home=proj_home)
config = load_config(proj_home=proj_home)

logger = setup_logging("docmatching", level=conf.get("LOGGING_LEVEL", "WARN"), proj_home=proj_home, attach_stdout=conf.get("LOG_STDOUT", "FALSE"))
logger = setup_logging("docmatching", level=config.get("LOGGING_LEVEL", "WARN"), proj_home=proj_home, attach_stdout=config.get("LOG_STDOUT", "FALSE"))


class GoogleUploadException(Exception):
Expand All @@ -29,9 +29,9 @@ def __init__(self):

"""
# initially directory is set to top level
folderId = conf.get("GOOGLE_BASEDIR_ID", None)
secretsPath = conf.get("GOOGLE_SECRETS_FILENAME", None)
scopesList = [conf.get("GOOGLE_API_SCOPE", None)]
folderId = config.get("GOOGLE_BASEDIR_ID", None)
secretsPath = config.get("GOOGLE_SECRETS_FILENAME", None)
scopesList = [config.get("GOOGLE_API_SCOPE", None)]

try:
self.gm = GoogleManager(authtype="service",
Expand All @@ -56,7 +56,7 @@ def upload(self, filename):
"mtype": "text/csv",
"meta_mtype": "application/vnd.google-apps.spreadsheet"}
# make sure the directory is set to curated
self.gm.folderid = conf.get("GOOGLE_BASEDIR_ID", None)
self.gm.folderid = config.get("GOOGLE_BASEDIR_ID", None)
return self.gm.upload_file(**kwargs)

except Exception as err:
Expand All @@ -72,7 +72,7 @@ def download(self, metadata):
kwargs = {"fileId": metadata.get("id", None),
"export_type": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"}
data = self.gm.export_sheet_contents(**kwargs)
xls_filename = conf.get("DOCMATCHPIPELINE_DATA_PATH", "./") + metadata.get("name", None) + ".xlsx"
xls_filename = config.get("DOCMATCHPIPELINE_DATA_PATH", "./") + metadata.get("name", None) + ".xlsx"
with open(xls_filename, "wb") as fx:
fx.write(data)
return xls_filename
Expand All @@ -89,13 +89,13 @@ def archive(self, metadata):
try:
# reparent curated to archive on Google Drive, ...
file_id = metadata.get("id", None)
old_parent = conf.get("GOOGLE_CURATED_FOLDER_ID", None)
old_parent = config.get("GOOGLE_CURATED_FOLDER_ID", None)
kwargs = {"fileId": file_id,
"removeParents": old_parent,
"addParents": conf.get("GOOGLE_ARCHIVE_FOLDER_ID", None)}
"addParents": config.get("GOOGLE_ARCHIVE_FOLDER_ID", None)}
if old_parent in metadata.get("parents", []):
# make sure the directory is set to top level
self.gm.folderid = conf.get("GOOGLE_BASEDIR_ID", None)
self.gm.folderid = config.get("GOOGLE_BASEDIR_ID", None)
self.gm.reparent_file(**kwargs)

except Exception as err:
Expand All @@ -107,5 +107,5 @@ def get_curated_filenames(self):
:return:
"""
# make sure the directory is set to curated
self.gm.folderid = conf.get("GOOGLE_CURATED_FOLDER_ID", None)
self.gm.folderid = config.get("GOOGLE_CURATED_FOLDER_ID", None)
return self.gm.list_files()
2 changes: 0 additions & 2 deletions adsdocmatch/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,8 @@
import pwd
import re
from datetime import datetime
from adsputils import load_config

proj_home = os.path.realpath(os.path.dirname(__file__)+ "/../")
conf = load_config(proj_home=proj_home)

class BackupFileException(Exception):
pass
Expand Down
41 changes: 25 additions & 16 deletions config.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,29 @@
LOGGING_LEVEL="WARN"
LOG_STDOUT=True

DOCMATCHPIPELINE_API_TOKEN = 'api token'
DOCMATCHPIPELINE_API_ORACLE_SERVICE_URL = 'http://0.0.0.0:5000'
DOCMATCHPIPELINE_API_JOURNALS_SERVICE_URL = 'http://0.0.0.0:5050'
DOCMATCHPIPELINE_API_MAX_RECORDS_TO_ORACLE = '5000'
DOCMATCHPIPELINE_API_ORACLE_SERVICE_ATTEMPTS = '10'
DOCMATCHPIPELINE_API_ORACLE_SERVICE_SLEEP_SEC = '1'
DOCMATCHPIPELINE_API_TOKEN = "api token"
DOCMATCHPIPELINE_API_ORACLE_SERVICE_URL = "http://0.0.0.0:5000"
DOCMATCHPIPELINE_API_JOURNALS_SERVICE_URL = "http://0.0.0.0:5050"
DOCMATCHPIPELINE_API_MAX_RECORDS_TO_ORACLE = "5000"
DOCMATCHPIPELINE_API_ORACLE_SERVICE_ATTEMPTS = "10"
DOCMATCHPIPELINE_API_ORACLE_SERVICE_SLEEP_SEC = "1"

# input filenames
DOCMATCHPIPELINE_INPUT_FILENAME = '/match_oracle.input'
DOCMATCHPIPELINE_INPUT_FILENAME = "/match_oracle.input"

# classic match of arxiv to published, or vice versa
DOCMATCHPIPELINE_CLASSIC_MATCHES_FILENAME = '/match.out'
DOCMATCHPIPELINE_CLASSIC_MATCHES_FILENAME = "/match.out"

# intermediate step filenames
DOCMATCHPIPELINE_EPRINT_RESULT_FILENAME = '/matched_eprint.output.csv'
DOCMATCHPIPELINE_PUB_RESULT_FILENAME = '/matched_pub.output.csv'
DOCMATCHPIPELINE_EPRINT_RESULT_FILENAME = "/matched_eprint.output.csv"
DOCMATCHPIPELINE_PUB_RESULT_FILENAME = "/matched_pub.output.csv"

# final filename to be uploaded to google drive
DOCMATCHPIPELINE_EPRINT_COMBINED_FILENAME = '/compare_eprint.csv'
DOCMATCHPIPELINE_PUB_COMBINED_FILENAME = '/compare_pub.csv'
DOCMATCHPIPELINE_EPRINT_COMBINED_FILENAME = "/compare_eprint.csv"
DOCMATCHPIPELINE_PUB_COMBINED_FILENAME = "/compare_pub.csv"

# filename to log failed metadata filenames
DOCMATCHPIPELINE_RERUN_FILENAME = '../rerun.input'
DOCMATCHPIPELINE_RERUN_FILENAME = "../rerun.input"

# Google Drive integration
GOOGLE_SECRETS_FILENAME = "credentials.txt"
Expand All @@ -42,6 +42,15 @@
# number of months for which to log arXiv articles that were not matched, limited to the arXiv classes that should have been matched
DOCMATCHPIPELINE_EPRINT_RERUN_MONTHS = 12

# daily dump of oracledb to text, for use by classic
DOCMATCHPIPELINE_ORACLE_DUMP_FILE = './oracle_dump.list'
DOCMATCHPIPELINE_ORACLE_DUMP_AGE = 9999
# backend maintenance directory and files
# define the correct DOCMATCHPIPELINE_PUBLISHED_DIR in deployment yamls
DOCMATCHPIPELINE_PUBLISHED_DIR="/tmp/"

DOCMATCHPIPELINE_MATCHES_KILL_FILE="matches.kill"
DOCMATCHPIPELINE_MATCHES_KILL_FROZEN_FILE="matches.kill.frozen"
DOCMATCHPIPELINE_ORACLE_DUMP_FILE="oracle_dump.tsv"
DOCMATCHPIPELINE_ORACLE_DUMP_AGE=9999
DOCMATCHPIPELINE_USER_SUBMITTED_FILE="user_submitted.list"
DOCMATCHPIPELINE_USER_SUBMITTED_FROZEN_FILE="user_submitted_frozen.list"


10 changes: 5 additions & 5 deletions run.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
from adsputils import load_config, setup_logging

proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), "./"))
conf = load_config(proj_home=proj_home)
config = load_config(proj_home=proj_home)

logger = setup_logging("docmatching", level=conf.get("LOGGING_LEVEL", "WARN"), proj_home=proj_home, attach_stdout=conf.get("LOG_STDOUT", "FALSE"))
logger = setup_logging("docmatching", level=config.get("LOGGING_LEVEL", "WARN"), proj_home=proj_home, attach_stdout=config.get("LOG_STDOUT", "FALSE"))


def get_args():
Expand Down Expand Up @@ -129,7 +129,7 @@ def main():
if args.datapath:
path = args.datapath
elif args.date:
path = conf.get("EPRINT_BASE_DIRECTORY", "./") + "/" + args.date
path = config.get("EPRINT_BASE_DIRECTORY", "./") + "/" + args.date
if path:
try:
if args.match_to_pub:
Expand Down Expand Up @@ -183,8 +183,8 @@ def main():

# daily: process matches.kill without archiving
elif args.load_matches_kill:
input_filename = conf.get("DOCMATCHPIPELINE_MATCHES_KILL_FILE", "")
frozen_filename = conf.get("DOCMATCHPIPELINE_MATCHES_KILL_FROZEN_FILE", "")
input_filename = config.get("DOCMATCHPIPELINE_PUBLISHED_DIR", "/tmp/") + config.get("DOCMATCHPIPELINE_MATCHES_KILL_FILE", "matches.kill")
frozen_filename = config.get("DOCMATCHPIPELINE_PUBLISHED_DIR", "/tmp/") + config.get("DOCMATCHPIPELINE_MATCHES_KILL_FROZEN_FILE", "matches.kill.frozen")
if input_filename:
OracleUtil().load_curated_file(input_filename=input_filename, frozen_filename=frozen_filename, input_score=-1.0, do_backup=False)

Expand Down
Loading