Template/Example code to get mlflow working with training text classifier #13563

jpmorris · 2024-07-09T18:14:05Z

jpmorris
Jul 9, 2024

This is my first time running spaCy (I currently prefer NOT to use the CLI unless I can be convinced otherwise), and I'm trying to integrate MLflow. I'm currently getting this error:

RESOURCE_DOES_NOT_EXIST: Could not find experiment with ID 895504060992160933

I've been searching around and I can't find a complete example for this use case. It seems to find my tracking URI. Does anyone have any sample code that will do this type of training and tracking with MLflow? This is what I am using currently (pieced together from what I could find on the web):

from spacy.cli.train import train as spacy_train
import constants
import mlflow

# Configure MLflow before training. The tracking URI must be set BEFORE the
# experiment is selected: set_experiment() resolves (or creates) the experiment
# against whichever tracking store is active at call time. Calling it first
# registers "spacy_textcat" in the default local store and leaves an active
# experiment ID that the remote server does not know about, which surfaces as
# "RESOURCE_DOES_NOT_EXIST: Could not find experiment with ID ...".
mlflow.set_tracking_uri("http://192.168.1.30:3274")
mlflow.set_experiment("spacy_textcat")

# Train from the config file. The corpus locations are left null in the
# config's [paths] section and are supplied here as overrides.
config_path = f"{constants.TEXTCAT_DIR}/config.cfg"
output_model_path = "output/spacy_textcat"
spacy_train(
    config_path,
    output_path=output_model_path,
    overrides={
        "paths.train": f"{constants.TEXTCAT_DATA_DIR}/train.spacy",
        "paths.dev": f"{constants.TEXTCAT_DATA_DIR}/valid.spacy",
    },
)

# Data and resource locations. Every entry is null in this file: the training
# script passes "paths.train" and "paths.dev" as overrides, and the values are
# consumed elsewhere in this config through ${paths.*} interpolation.
[paths]
train = null
dev = null
vectors = null
init_tok2vec = null

# System-level settings. [training] reuses both values through
# ${system.seed} and ${system.gpu_allocator}.
[system]
gpu_allocator = null
seed = 0

# Core pipeline definition: an English ("en") pipeline whose only component is
# "textcat", configured under [components.textcat].
[nlp]
lang = "en"
pipeline = ["textcat"]
batch_size = 1000
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
# Registry keys are case-sensitive and lowercase, like every other "@..." key
# in this file; "@Tokenizers" does not name a registry.
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}

# Per-component settings. The pipeline lists a single component, "textcat".
[components]

# Text categorizer created from the "textcat" factory and scored with
# spacy.textcat_scorer.v1.
[components.textcat]
factory = "textcat"
scorer = {"@scorers":"spacy.textcat_scorer.v1"}
threshold = 0.5

# Model architecture for the text categorizer: spacy.TextCatBOW.v2.
# NOTE(review): exclusive_classes = true presumably means each document gets
# exactly one label - confirm this matches the training data.
[components.textcat.model]
@architectures = "spacy.TextCatBOW.v2"
exclusive_classes = true
ngram_size = 1
no_output_layer = false
nO = null

# Corpus readers. The dev and train corpora use the same reader
# (spacy.Corpus.v1) with identical settings; only the path differs.
[corpora]

# Evaluation corpus; its path is interpolated from paths.dev.
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

# Training corpus; its path is interpolated from paths.train.
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

# Training loop settings. The corpora are referenced by the dotted names of
# their [corpora.*] sections; seed and gpu_allocator are interpolated from
# [system].
[training]
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 1600
# NOTE(review): max_epochs = 0 presumably means "no epoch limit", leaving
# max_steps and patience as the stopping criteria - confirm against the
# spaCy training documentation.
max_epochs = 0
max_steps = 20000
eval_frequency = 200
frozen_components = []
annotating_components = []
before_to_disk = null

# Batches are built with spacy.batch_by_words.v1. The batch size is not a
# constant here; it is defined by the schedule in [training.batcher.size].
[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null

# Batch-size schedule: compounding.v1 from start = 100 to stop = 1000 with a
# compound factor of 1.001.
[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0

# Log training to MLflow through the spacy.MLflowLogger.v2 logger.
# The registry key must be lowercase "@loggers" (registry keys are
# case-sensitive), and booleans use the lowercase JSON-style literal.
# NOTE(review): no experiment is named in this section, so the run presumably
# lands in whichever tracking URI / experiment the launching process has made
# active - confirm against the spacy-loggers documentation.
[training.logger]
@loggers = "spacy.MLflowLogger.v2"
run_name = "spacy_text_classifier"
nested = false
remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"]

# Optimizer: Adam.v1 with a fixed learn_rate of 0.001, L2 = 0.01 with
# L2_is_weight_decay enabled, and grad_clip = 1.0.
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
learn_rate = 0.001

# Score weights. Only cats_score carries a weight (1.0); every other listed
# metric is set to null.
[training.score_weights]
cats_score = 1.0
cats_score_desc = null
cats_micro_p = null
cats_micro_r = null
cats_micro_f = null
cats_macro_p = null
cats_macro_r = null
cats_macro_f = null
cats_macro_auc = null
cats_f_per_type = null
cats_macro_auc_per_type = null

# No pretraining settings are defined.
[pretraining]

# Initialization settings. vectors and init_tok2vec are interpolated from
# [paths], where both are null; the remaining hooks and resources are unset.
[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

# No per-component initialization arguments are defined.
[initialize.components]

# No tokenizer initialization arguments are defined.
[initialize.tokenizer]

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Template/Example code to get mlflow working with training text classifier #13563

{{title}}

{{editor}}'s edit

{{editor}}'s edit

Replies: 0 comments

Select a reply

Template/Example code to get mlflow working with training text classifier #13563

jpmorris Jul 9, 2024

Replies: 0 comments

jpmorris
Jul 9, 2024