From 6972469abd5c0487788851d214f0ab87b3bde8dc Mon Sep 17 00:00:00 2001
From: PascalEgn
Date: Fri, 2 Aug 2024 14:58:09 +0200
Subject: [PATCH] classifier: fix memory usage

---
 inspire_classifier/api.py | 47 ++++++++++++++++++++++++---------------
 inspire_classifier/app.py |  6 +++--
 scripts/create_dataset.py | 16 +++++++++----
 3 files changed, 45 insertions(+), 24 deletions(-)

diff --git a/inspire_classifier/api.py b/inspire_classifier/api.py
index f22c7a5..8a4fdff 100644
--- a/inspire_classifier/api.py
+++ b/inspire_classifier/api.py
@@ -151,24 +151,30 @@ def train():
     train_and_save_classifier()
 
 
-def predict_coreness(title, abstract):
+def initialize_classifier():
     """
-    Predicts class-wise probabilities given the title and abstract.
+    Initializes the classifier.
     """
-    text = title + " " + abstract
-    categories = ["rejected", "non_core", "core"]
-    try:
-        classifier = Classifier(
-            cuda_device_id=current_app.config["CLASSIFIER_CUDA_DEVICE_ID"]
-        )
-    except IOError as error:
-        raise IOError("Data ITOS not found.") from error
-
+    classifier = Classifier(
+        cuda_device_id=current_app.config["CLASSIFIER_CUDA_DEVICE_ID"]
+    )
     try:
         classifier.load_trained_classifier_weights(path_for("trained_classifier"))
     except IOError as error:
-        raise IOError("Could not load the trained classifier weights.") from error
+        raise IOError(
+            "Could not load the trained classifier weights.",
+            path_for("trained_classifier"),
+        ) from error
+    return classifier
+
+
+def predict_coreness(classifier, title, abstract):
+    """
+    Predicts class-wise probabilities given the title and abstract.
+    """
+    text = title + " " + abstract
+    categories = ["rejected", "non_core", "core"]
     class_probabilities = classifier.predict(
         text, temperature=current_app.config["CLASSIFIER_SOFTMAX_TEMPERATUR"]
     )
@@ -191,17 +197,22 @@ def validate(validation_df):
         raise IOError("There was a problem loading the classifier model") from error
     predictions = []
     validation_df = validation_df.sample(frac=1, random_state=42)
-    for _, row in tqdm(
-        validation_df.iterrows(), total=len(validation_df.label.values)
-    ):
+    for _, row in tqdm(validation_df.iterrows(), total=len(validation_df.label.values)):
         predicted_value = classifier.predict(
             row.text, temperature=current_app.config["CLASSIFIER_SOFTMAX_TEMPERATUR"]
         )
         predicted_class = np.argmax(predicted_value)
         predictions.append(predicted_class)
-    validation_df.insert(2, 'predicted_label', predictions)
+    validation_df.insert(2, "predicted_label", predictions)
     validation_df.to_csv(f"{path_for('data')}/validation_results.csv", index=False)
-    print("f1 score ", f1_score(validation_df["label"], validation_df["predicted_label"], average="micro"))
-    pprint(classification_report(validation_df["label"], validation_df["predicted_label"]))
+    print(
+        "f1 score ",
+        f1_score(
+            validation_df["label"], validation_df["predicted_label"], average="micro"
+        ),
+    )
+    pprint(
+        classification_report(validation_df["label"], validation_df["predicted_label"])
+    )
     pprint(confusion_matrix(validation_df["label"], validation_df["predicted_label"]))

diff --git a/inspire_classifier/app.py b/inspire_classifier/app.py
index eefadb1..a8adda6 100644
--- a/inspire_classifier/app.py
+++ b/inspire_classifier/app.py
@@ -28,7 +28,7 @@
 from prometheus_flask_exporter.multiprocess import GunicornInternalPrometheusMetrics
 from webargs.flaskparser import use_args
 
-from inspire_classifier.api import predict_coreness
+from inspire_classifier.api import initialize_classifier, predict_coreness
 from . import serializers

@@ -55,6 +55,8 @@ def create_app():
     app.config["CLASSIFIER_BASE_PATH"] = app.instance_path
     app.config.from_object("inspire_classifier.config")
     app.config.from_pyfile("classifier.cfg", silent=True)
+    with app.app_context():
+        classifier = initialize_classifier()
 
     @app.route("/api/health")
     def date():
@@ -69,7 +71,7 @@ def date():
     )
     def core_classifier(args):
         """Endpoint for the CORE classifier."""
-        prediction = predict_coreness(args["title"], args["abstract"])
+        prediction = predict_coreness(classifier, args["title"], args["abstract"])
         response = coreness_schema.dump(prediction)
         return response

diff --git a/scripts/create_dataset.py b/scripts/create_dataset.py
index f994d25..bdb0f49 100644
--- a/scripts/create_dataset.py
+++ b/scripts/create_dataset.py
@@ -92,8 +92,13 @@ def __init__(self, index, query_filters, year_from, year_to, month_from, month_t
         self.inspire_categories_field = "inspire_categories.term"
         self.query_filters = [
             query_filters
-            & Q("range", _created={"gte": f"{self.year_from}-{self.month_from}",
-                                   "lt": f"{self.year_to}-{self.month_to}",}),
+            & Q(
+                "range",
+                _created={
+                    "gte": f"{self.year_from}-{self.month_from}",
+                    "lt": f"{self.year_to}-{self.month_to}",
+                },
+            ),
         ]
 
     def _postprocess_record_data(self, record_data):
@@ -143,7 +148,9 @@ def prepare_inspire_classifier_dataset(data, save_data_path):
     inspire_data_df["text"] = (
         inspire_data_df["title"] + " " + inspire_data_df["abstract"]
     )
-    inspire_classifier_data_df = inspire_data_df[["id", "inspire_categories", "label", "text"]]
+    inspire_classifier_data_df = inspire_data_df[
+        ["id", "inspire_categories", "label", "text"]
+    ]
     inspire_classifier_data_df.to_pickle(save_data_path)
 
@@ -163,7 +170,8 @@ def get_inspire_classifier_dataset(year_from, year_to, month_from, month_to):
     month_to = f"{month_to:02d}-31"
     print(f"Fetching {year_from}-{month_from} to {year_to}-{month_to}")
     inspire_classifier_dataset_path = os.path.join(
-        os.getcwd(), f"inspire_classifier_dataset_{year_from}-{month_from}_{year_to}-{month_to}.pkl"
+        os.getcwd(),
+        f"inspire_classifier_dataset_{year_from}-{month_from}_{year_to}-{month_to}.pkl",
     )
     data = get_data_for_decisions(year_from, year_to, month_from, month_to)
     prepare_inspire_classifier_dataset(data, inspire_classifier_dataset_path)
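
Note: the refactoring above loads the trained model a single time (at app startup, inside an application context) and then reuses the same Classifier instance for every prediction, instead of reconstructing it per request. A minimal sketch of the intended call pattern outside the web service, assuming the package is installed and a trained model exists under the instance path; the title and abstract strings are placeholders used only for illustration:

    from inspire_classifier.api import initialize_classifier, predict_coreness
    from inspire_classifier.app import create_app

    # create_app() is used here only to provide the Flask application context
    # and configuration; the web service itself performs the same
    # initialization once inside create_app().
    app = create_app()
    with app.app_context():
        # Load the trained weights once; reuse the instance for every call.
        classifier = initialize_classifier()
        prediction = predict_coreness(
            classifier,
            "A placeholder title",
            "A placeholder abstract of a paper",
        )
        print(prediction)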