diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index d062b05..7b674fb 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -54,4 +54,4 @@ jobs:
         type=ref,event=pr
         type=ref,event=tag
     username: ${{ secrets.HARBOR_USERNAME }}
-    password: ${{ secrets.HARBOR_SECRET }}
\ No newline at end of file
+    password: ${{ secrets.HARBOR_SECRET }}
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index 5e30061..b024d04 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -23,4 +23,4 @@ jobs:
           python-version: "3.11"
 
       - name: Run pre-commit
-        uses: pre-commit/action@v3.0.0
\ No newline at end of file
+        uses: pre-commit/action@v3.0.0
diff --git a/.github/workflows/pull-request-master.yml b/.github/workflows/pull-request-master.yml
index 6487d4e..26fafec 100644
--- a/.github/workflows/pull-request-master.yml
+++ b/.github/workflows/pull-request-master.yml
@@ -9,4 +9,4 @@ jobs:
     uses: ./.github/workflows/pre-commit.yml
     with:
       ref: ${{ github.event.pull_request.head.sha }}
-    secrets: inherit
\ No newline at end of file
+    secrets: inherit
diff --git a/.gitignore b/.gitignore
index 1410c7b..6269b87 100644
--- a/.gitignore
+++ b/.gitignore
@@ -165,4 +165,4 @@ cython_debug/
 
 *.csv
 *.h5
-!/tests/integration/fixtures/inspire_test_data.df
\ No newline at end of file
+!/tests/integration/fixtures/inspire_test_data.df
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 5221851..9c3e966 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,14 +1,17 @@
 repos:
-  - repo: https://github.com/psf/black
-    rev: '24.4.2'
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.6.0
     hooks:
-      - id: black
-  - repo: https://github.com/pycqa/isort
-    rev: '5.13.2'
+      - id: check-yaml
+      - id: end-of-file-fixer
+      - id: trailing-whitespace
+      - id: fix-byte-order-marker
+      - id: mixed-line-ending
+      - id: name-tests-test
+        args: [--pytest-test-first]
+        exclude: '^(?!factories/)'
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.6.9
     hooks:
-      - id: isort
-  - repo: https://github.com/pycqa/flake8
-    rev: '7.1.0'
-    hooks:
-      - id: flake8
-        args: ['--config=setup.cfg']
\ No newline at end of file
+      - id: ruff
+        args: [--fix]
diff --git a/README.md b/README.md
index eec57bc..7fceb4b 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,9 @@
 # Inspire Classifier
 
-## About 
+## About
 
 INSPIRE module aimed at automatically classifying the new papers that are added to INSPIRE, such as if they are core or not, or the arXiv category corresponding to each of them.
 
-The current implemntation uses the ULMfit approach. Universal Language Model Fine-tuning, is a method for training text classifiers by first pretraining a language model on a large corpus to learn general language features (in this case a pre-loaded model, which was trained using the WikiText-103 dataset is used). The pretrained model is then fine-tuned on the title and abstract of the inpsire dataset before training the classifier on top. 
+The current implementation uses the ULMFiT approach. Universal Language Model Fine-tuning (ULMFiT) is a method for training text classifiers that first pretrains a language model on a large corpus to learn general language features (in this case, a pre-trained model built on the WikiText-103 dataset is used). The pretrained language model is then fine-tuned on the titles and abstracts of the INSPIRE dataset before the classifier is trained on top of it.
 
 
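Reviewer note on the README hunk above: the ULMFiT recipe is pretrain, then fine-tune the language model on domain text, then train a classifier on top of the fine-tuned encoder. A minimal sketch of that flow with the fastai v2 API — the dataframe and column names below are illustrative stand-ins, not this repo's actual loading code:

```python
import pandas as pd
from fastai.text.all import (
    AWD_LSTM,
    TextDataLoaders,
    language_model_learner,
    text_classifier_learner,
)

# Illustrative stand-in for the INSPIRE title+abstract dataframe.
df = pd.DataFrame({"text": ["a title and its abstract"] * 100, "label": [0, 1] * 50})

# 1. Fine-tune the WikiText-103-pretrained language model on the domain text.
dls_lm = TextDataLoaders.from_df(df, text_col="text", is_lm=True)
lm_learner = language_model_learner(dls_lm, AWD_LSTM, pretrained=True)
lm_learner.fine_tune(1)
lm_learner.save_encoder("finetuned_encoder")

# 2. Train the classifier on top of the fine-tuned encoder.
dls_clf = TextDataLoaders.from_df(
    df, text_col="text", label_col="label", text_vocab=dls_lm.vocab
)
clf_learner = text_classifier_learner(dls_clf, AWD_LSTM)
clf_learner.load_encoder("finetuned_encoder")
clf_learner.fine_tune(1)
```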
@@ -28,7 +28,7 @@ poetry run python scripts/create_dataset.py --year-from $YEAR_FROM --month-from
 
 ### 2. Run training and validate model
 
-The [`train_classifier.py`](scripts/train_classifier.py) script will run the commands to train and validate a new model. Configurations changes like the amount of training epochs as well as the train-test split can be adjusted here. In short, the script first splits the pkl file from the first step into a training and a test dataset inside the `classifier/data` folder. The training set is then used to train the model, while the test set is used to evaluate the model after the training is finished. The model will be saved into `classifier/models/language_model/finetuned_language_model_encoder.h5` 
+The [`train_classifier.py`](scripts/train_classifier.py) script will run the commands to train and validate a new model. Configuration changes, like the number of training epochs as well as the train-test split, can be adjusted here. In short, the script first splits the pkl file from the first step into a training and a test dataset inside the `classifier/data` folder. The training set is then used to train the model, while the test set is used to evaluate the model after the training is finished. The model will be saved to `classifier/models/language_model/finetuned_language_model_encoder.h5`.
 
 ```
 poetry run python scripts/train_classifier.py
@@ -49,15 +49,13 @@ poetry run python scripts/upload_to_s3.py
 
 1. Build docker image: `docker build -t inspirehep/classifier:<tag> .`
 2. Login with inspirehep user on dockerhub: `docker login`
-3. Push image to dockerhub: `docker push inspirehep/classifier:<tag>` 
+3. Push image to dockerhub: `docker push inspirehep/classifier:<tag>`
 4. Change `newTag` in the `kustomization.yml` file in the [k8s repo](https://github.com/cern-sis/kubernetes/tree/master/classifier).
 
 ## How to run
 
-For testing, the cli of the classifier can be used via `poetry run inspire-classifier 'example title' 'exmaple abstract'`, with the `-b` flag, the basepath to check for the training data, can be passed (which currently should be `-b classifier`). 
+For testing, the CLI of the classifier can be used via `poetry run inspire-classifier 'example title' 'example abstract'`; with the `-b` flag, the base path to check for the training data can be passed (which currently should be `-b classifier`).
 
 In the production, the api is used to predict the 'coreness' of records using the `/api/predict/coreness` endpoint and passing `title` and `abstract` as json fields in a POST request (see [this file](inspire_classifier/app.py) for details).
-
-
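Reviewer note: the production endpoint described above takes a plain JSON body. A minimal sketch of a client call — host and port are illustrative, not taken from the repo's deployment config:

```python
import requests

# POST title and abstract as JSON fields, as the README describes.
response = requests.post(
    "http://localhost:5000/api/predict/coreness",
    json={"title": "example title", "abstract": "example abstract"},
)
# Expected shape, per predict_coreness below: {"prediction": ..., "scores": {...}}
print(response.json())
```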
-"""INSPIRE module aimed at automatically classifying the new papers that are added to INSPIRE, such as if they are core or not, or the arXiv category corresponding to each of them.""" +"""INSPIRE module aimed at automatically classifying the new papers that are added to +INSPIRE, such as if they are core or not, or the arXiv category corresponding to each +of them.""" diff --git a/inspire_classifier/api.py b/inspire_classifier/api.py index 8a4fdff..2c737d0 100644 --- a/inspire_classifier/api.py +++ b/inspire_classifier/api.py @@ -56,9 +56,9 @@ def split_data(): ) except IOError as error: raise IOError( - "Training dataframe not found. Make sure the file is present in the right directory. " - "Please use the path specified in config.py for CLASSIFIER_DATAFRAME_PATH relative to the " - "CLASSIFIER_BASE_PATH." + "Training dataframe not found. Make sure the file is present in the right " + "directory. Please use the path specified in config.py for " + "CLASSIFIER_DATAFRAME_PATH relative to the CLASSIFIER_BASE_PATH." ) from error @@ -93,8 +93,8 @@ def finetune_and_save_language_model(): ) except IOError as error: raise IOError( - "Unable to save the finetuned language model. Please check that the language model data directory " - "exists." + "Unable to save the finetuned language model. Please check that the " + "language model data directory exists." ) from error @@ -182,7 +182,7 @@ def predict_coreness(classifier, title, abstract): predicted_class = categories[np.argmax(class_probabilities)] output_dict = {"prediction": predicted_class} - output_dict["scores"] = dict(zip(categories, class_probabilities)) + output_dict["scores"] = dict(zip(categories, class_probabilities, strict=False)) return output_dict diff --git a/inspire_classifier/app.py b/inspire_classifier/app.py index a8adda6..7cc34a7 100644 --- a/inspire_classifier/app.py +++ b/inspire_classifier/app.py @@ -28,13 +28,13 @@ from prometheus_flask_exporter.multiprocess import GunicornInternalPrometheusMetrics from webargs.flaskparser import use_args +from inspire_classifier import serializers from inspire_classifier.api import initialize_classifier, predict_coreness -from . 
diff --git a/inspire_classifier/app.py b/inspire_classifier/app.py
index a8adda6..7cc34a7 100644
--- a/inspire_classifier/app.py
+++ b/inspire_classifier/app.py
@@ -28,13 +28,13 @@
 from prometheus_flask_exporter.multiprocess import GunicornInternalPrometheusMetrics
 from webargs.flaskparser import use_args
 
+from inspire_classifier import serializers
 from inspire_classifier.api import initialize_classifier, predict_coreness
 
-from . import serializers
-
 
 class JsonResponse(Response):
-    """ "By creaitng this Response class, we force the response to always be in json, getting rid of the jsonify function."""
+    """By creating this Response class, we force the response to always be in JSON,
+    getting rid of the jsonify function."""
 
     @classmethod
     def force_type(cls, rv, environ=None):
@@ -60,7 +60,8 @@ def create_app():
 
     @app.route("/api/health")
     def date():
-        """Basic endpoint that returns the date, used to check if everything is up and working."""
+        """Basic endpoint that returns the date, used to check if everything is up
+        and working."""
         now = datetime.datetime.now()
         return jsonify(now)
 
diff --git a/inspire_classifier/cli.py b/inspire_classifier/cli.py
index 3eda875..0d930fe 100644
--- a/inspire_classifier/cli.py
+++ b/inspire_classifier/cli.py
@@ -43,11 +43,10 @@ def inspire_classifier():
     "-b", "--base-path", type=click.Path(exists=True), required=False, nargs=1
 )
 def predict(title, abstract, base_path):
-    with click_spinner.spinner():
-        with current_app.app_context():
-            if base_path:
-                current_app.config["CLASSIFIER_BASE_PATH"] = base_path
-            click.echo(predict_coreness(title, abstract))
+    with click_spinner.spinner(), current_app.app_context():
+        if base_path:
+            current_app.config["CLASSIFIER_BASE_PATH"] = base_path
+        click.echo(predict_coreness(title, abstract))
 
 
 @inspire_classifier.command("train")
@@ -58,19 +57,18 @@ def predict(title, abstract, base_path):
     "-b", "--base-path", type=click.Path(exists=True), required=False, nargs=1
 )
 def train_classifier(language_model_epochs, classifier_epochs, base_path):
-    with click_spinner.spinner():
-        with current_app.app_context():
-            if language_model_epochs:
-                current_app.config["CLASSIFIER_LANGUAGE_MODEL_CYCLE_LENGTH"] = (
-                    language_model_epochs
-                )
-            if classifier_epochs:
-                current_app.config["CLASSIFIER_CLASSIFIER_CYCLE_LENGTH"] = (
-                    classifier_epochs
-                )
-            if base_path:
-                current_app.config["CLASSIFIER_BASE_PATH"] = base_path
-            train()
+    with click_spinner.spinner(), current_app.app_context():
+        if language_model_epochs:
+            current_app.config["CLASSIFIER_LANGUAGE_MODEL_CYCLE_LENGTH"] = (
+                language_model_epochs
+            )
+        if classifier_epochs:
+            current_app.config["CLASSIFIER_CLASSIFIER_CYCLE_LENGTH"] = (
+                classifier_epochs
+            )
+        if base_path:
+            current_app.config["CLASSIFIER_BASE_PATH"] = base_path
+        train()
 
 
 @inspire_classifier.command("validate")
diff --git a/inspire_classifier/domain/models.py b/inspire_classifier/domain/models.py
index cb57b9f..302b639 100644
--- a/inspire_classifier/domain/models.py
+++ b/inspire_classifier/domain/models.py
@@ -124,8 +124,11 @@ def initialize_learner(
         self,
         dropout_multiplier=0.5,
         weight_decay=1e-6,
-        learning_rates=np.array([1e-4, 1e-4, 1e-4, 1e-3, 1e-2]),
+        learning_rates=None,
     ):
+        if learning_rates is None:
+            learning_rates = np.array([1e-4, 1e-4, 1e-4, 1e-3, 1e-2])
+
         self.learner = text_classifier_learner(
             self.dataloader,
             AWD_LSTM,
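Reviewer note: the `learning_rates=None` change above fixes the classic default-argument pitfall flagged by flake8-bugbear (B006/B008): a default like `np.array([...])` is evaluated once, at function definition time, and then shared across all calls. The sentinel pattern in isolation, as a toy function:

```python
import numpy as np

def initialize_learner(learning_rates=None):
    # Evaluated per call, so each caller gets a fresh array instead of a
    # module-lifetime object shared (and possibly mutated) between calls.
    if learning_rates is None:
        learning_rates = np.array([1e-4, 1e-4, 1e-4, 1e-3, 1e-2])
    return learning_rates
```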
diff --git a/inspire_classifier/domain/preprocessor.py b/inspire_classifier/domain/preprocessor.py
index d867949..d87a2bb 100644
--- a/inspire_classifier/domain/preprocessor.py
+++ b/inspire_classifier/domain/preprocessor.py
@@ -31,9 +31,11 @@ def split_and_save_data_for_training(dataframe_path, dest_dir, val_fraction=0.1):
     """
     Args:
-        dataframe_path: The path to the pandas dataframe containing the records. The dataframe should have one
-                        column containing the title and abstract text appended (title + abstract). The second
-                        column should contain the label as an integer (0: Rejected, 1: Non-Core, 2: Core).
+        dataframe_path: The path to the pandas dataframe containing the records.
+            The dataframe should have one column containing the title and
+            abstract text appended (title + abstract). The second column
+            should contain the label as an integer
+            (0: Rejected, 1: Non-Core, 2: Core).
         dest_dir: Directory to save the training/validation csv.
         val_fraction: the fraction of data to use as the validation set.
     """
 
diff --git a/pyproject.toml b/pyproject.toml
index a18edfb..f385871 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,14 +35,12 @@ mock = "^5.1.0"
 
 
 [tool.poetry.group.dev.dependencies]
-black = "^24.4.2"
 pre-commit = "*"
 elasticsearch-dsl = "^7.4.0"
 elasticsearch = "<7.14.0"
 inspire-utils = "3.0.22"
-isort = "^5.13.2"
 boto3 = "^1.34.130"
 
 
 [build-system]
 requires = ["poetry-core"]
@@ -56,8 +54,3 @@ inspire-classifier = 'inspire_classifier.cli:inspire_classifier'
 testpaths = [
     "tests",
 ]
-
-[tool.isort]
-profile = "black"
-multi_line_output = 3
-atomic = true
\ No newline at end of file
diff --git a/ruff.toml b/ruff.toml
new file mode 100644
index 0000000..41c11f7
--- /dev/null
+++ b/ruff.toml
@@ -0,0 +1,28 @@
+target-version = "py311"
+[lint.flake8-tidy-imports]
+ban-relative-imports = "all"
+
+[lint]
+select = [
+    # pycodestyle
+    "E",
+    # Pyflakes
+    "F",
+    # flake8-bugbear
+    "B",
+    # flake8-simplify
+    "SIM",
+    # isort
+    "I",
+    # flake8-tidy-imports
+    "TID",
+    # flake8-pytest-style
+    "PT",
+]
+ignore = ["B904"]
+
+[lint.pycodestyle]
+ignore-overlong-task-comments = true
+
+[lint.pydocstyle]
+convention = "google"
diff --git a/scripts/train_classifier.py b/scripts/train_classifier.py
index b5e69cb..509e138 100644
--- a/scripts/train_classifier.py
+++ b/scripts/train_classifier.py
@@ -46,7 +46,9 @@ def train_classifier(
     print("-----------------")
 
     os.system(
-        f"inspire-classifier train -b classifier --classifier-epochs {number_of_classifier_epochs} --language-model-epochs {number_of_lanuage_model_epochs}"
+        f"inspire-classifier train -b classifier "
+        f"--classifier-epochs {number_of_classifier_epochs} "
+        f"--language-model-epochs {number_of_lanuage_model_epochs}"
     )
     print("training finished successfully!")
     os.system(
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index d3feb6e..0000000
--- a/setup.cfg
+++ /dev/null
@@ -1,18 +0,0 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2024 CERN.
-#
-# inspirehep is free software; you can redistribute it and/or modify it under
-# the terms of the MIT License; see LICENSE file for more details.
-
-[aliases]
-test = pytest
-
-[bdist_wheel]
-universal = 1
-
-[flake8]
-# E501 black takes care of the line size
-ignore = D401,W503,E501,E265,E203
-max-line-length = 88
-max-complexity = 15
\ No newline at end of file
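Reviewer note: for concreteness, the dataframe layout that `split_and_save_data_for_training` documents in the preprocessor hunk above could be built as follows — the column names here are illustrative, the real ones are resolved through the CLASSIFIER_* settings in config.py:

```python
import pandas as pd

df = pd.DataFrame(
    {
        # one column: title and abstract appended together
        "text": [
            "Some core paper title. Some core paper abstract.",
            "Some rejected paper title. Some rejected paper abstract.",
        ],
        # one column: integer label (0: Rejected, 1: Non-Core, 2: Core)
        "labels": [2, 0],
    }
)
df.to_pickle("classifier/data/inspire_data.df")  # path is illustrative
```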
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
index e81bad1..060feb4 100644
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@@ -46,7 +46,7 @@ def app():
     yield app
 
 
-@pytest.fixture()
+@pytest.fixture
 def app_client(app):
     return app.test_client()
 
@@ -55,9 +55,10 @@ class Mock_Learner(Learner):
     """
     Mocks the fit method of the Learner.
 
-    This is done to reduce the model training time during testing by making the fit run once (as opposed to 2 times and
-    3 times for the LanguageModel and Classifier respectively). It stores the result of the first run and then returns
-    the same result for the other times fit is run.
+    This is done to reduce the model training time during testing by making the fit
+    run once (as opposed to 2 times and 3 times for the LanguageModel and Classifier,
+    respectively). It stores the result of the first run and then returns the same
+    result for the other times fit is run.
     """
 
     def fit(self, *args, **kwargs):
@@ -70,7 +71,7 @@ def fit(self, *args, **kwargs):
 
 @pytest.fixture(scope="session")
 @patch("fastai.text.learner.text_classifier_learner", Mock_Learner)
-def trained_pipeline(app, tmp_path_factory):
+def _trained_pipeline(app, tmp_path_factory):
     app.config["CLASSIFIER_BASE_PATH"] = tmp_path_factory.getbasetemp()
     create_directories()
     shutil.copy(
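Reviewer note: the underscore rename above pairs with the `@pytest.mark.usefixtures` changes in the next file. A fixture that only performs setup and returns nothing is conventionally named with a leading underscore, and tests request it via the marker rather than via an unused argument — the direction ruff's flake8-pytest-style (PT) rules push towards. The pattern in isolation, as a sketch:

```python
import pytest

@pytest.fixture(scope="session")
def _trained_pipeline():
    """Set-up only: train the model once per session; nothing is returned."""

@pytest.mark.usefixtures("_trained_pipeline")
def test_model_artifacts_exist():
    # The fixture ran before this test, but no fixture value is injected here.
    assert True
```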
diff --git a/tests/integration/test_classifier_api.py b/tests/integration/test_classifier_api.py
index 571cf25..f22f4a0 100644
--- a/tests/integration/test_classifier_api.py
+++ b/tests/integration/test_classifier_api.py
@@ -24,25 +24,32 @@
 from math import isclose
 
 import pandas as pd
+import pytest
 
 from inspire_classifier.api import predict_coreness
 from inspire_classifier.utils import path_for
 
 TEST_TITLE = "Pre-images of extreme points of the numerical range, and applications"
-TEST_ABSTRACT = "We extend the pre-image representation of exposed points of the numerical range of a matrix to all \
-extreme points. With that we characterize extreme points which are multiply generated, having at least two linearly \
-independent pre-images, as the extreme points which are Hausdorff limits of flat boundary portions on numerical ranges \
-of a sequence converging to the given matrix. These studies address the inverse numerical range map and the \
-maximum-entropy inference map which are continuous functions on the numerical range except possibly at certain \
-multiply generated extreme points. This work also allows us to describe closures of subsets of 3-by-3 matrices having \
-the same shape of the numerical range."
-
-
-def test_create_directories(trained_pipeline):
+TEST_ABSTRACT = (
+    "We extend the pre-image representation of exposed points of the numerical range "
+    "of a matrix to all extreme points. With that we characterize extreme points which"
+    " are multiply generated, having at least two linearly independent pre-images,"
+    " as the extreme points which are Hausdorff limits of flat boundary portions on"
+    " numerical ranges of a sequence converging to the given matrix."
+    " These studies address the inverse numerical range map and the maximum-entropy "
+    "inference map which are continuous functions on the numerical range except "
+    "possibly at certain multiply generated extreme points. This work also allows us"
+    " to describe closures of subsets of 3-by-3 matrices having the same shape of the"
+    " numerical range.")
+
+
+@pytest.mark.usefixtures("_trained_pipeline")
+def test_create_directories():
     assert path_for("classifier_model").exists()
 
 
-def test_preprocess_and_save_data(app, trained_pipeline):
+@pytest.mark.usefixtures("_trained_pipeline")
+def test_preprocess_and_save_data(app):
     dataframe = pd.read_pickle(path_for("dataframe"))
 
     training_valid__csv = pd.read_csv(path_for("train_valid_data"))
@@ -60,10 +67,12 @@ def test_preprocess_and_save_data(app):
         abs_tol=1,
     )
 
-
-def test_vocab(app, trained_pipeline):
-    data_itos = pickle.load(open(path_for("data_itos"), "rb"))
-    # For performance when using mixed precision, the vocabulary is always made of size a multiple of 8, potentially by adding xxfake tokens.
+@pytest.mark.usefixtures("_trained_pipeline")
+def test_vocab(app):
+    with open(path_for("data_itos"), "rb") as file:
+        data_itos = pickle.load(file)
+    # For performance when using mixed precision, the vocabulary is always made of
+    # size a multiple of 8, potentially by adding xxfake tokens.
     adjusted_max_vocab = (
         app.config["CLASSIFIER_MAXIMUM_VOCABULARY_SIZE"] + 8
@@ -72,15 +81,18 @@ def test_vocab(app, trained_pipeline):
     assert len(data_itos) == adjusted_max_vocab
 
 
-def test_save_language_model(trained_pipeline):
+@pytest.mark.usefixtures("_trained_pipeline")
+def test_save_language_model():
     assert path_for("finetuned_language_model_encoder").exists()
 
 
-def test_train_and_save_classifier(trained_pipeline):
+@pytest.mark.usefixtures("_trained_pipeline")
+def test_train_and_save_classifier():
     assert path_for("trained_classifier").exists()
 
 
-def test_predict_coreness(trained_pipeline):
+@pytest.mark.usefixtures("_trained_pipeline")
+def test_predict_coreness():
     assert path_for("data_itos").exists()
     assert path_for("trained_classifier").exists()
     output_dict = predict_coreness(title=TEST_TITLE, abstract=TEST_ABSTRACT)
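Reviewer note: `test_predict_coreness` could also pin down the payload shape. A sketch of possible extra assertions inside the test above, reusing its `output_dict` and the already-imported `isclose` (category names assumed from the label mapping in preprocessor.py):

```python
assert set(output_dict) == {"prediction", "scores"}
assert output_dict["prediction"] in {"rejected", "non_core", "core"}
assert isclose(sum(output_dict["scores"].values()), 1.0, abs_tol=1e-2)
```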