diff --git a/configs/dataset/drugprot_base.yaml b/configs/dataset/drugprot_base.yaml
new file mode 100644
index 00000000..855d860d
--- /dev/null
+++ b/configs/dataset/drugprot_base.yaml
@@ -0,0 +1,6 @@
+_target_: src.utils.execute_pipeline
+input:
+  _target_: pie_datasets.DatasetDict.load_dataset
+  path: pie/drugprot
+  name: drugprot_source
+  revision: d0a3f1183bb793f81350c8b5fb56743d7af29d8d
diff --git a/configs/dataset/drugprot_prepared.yaml b/configs/dataset/drugprot_prepared.yaml
new file mode 100644
index 00000000..dd94fad8
--- /dev/null
+++ b/configs/dataset/drugprot_prepared.yaml
@@ -0,0 +1,7 @@
+defaults:
+  - drugprot_base
+  - _convert_documents
+  - _add_sentences
+
+convert_documents:
+  document_type: pytorch_ie.documents.TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
diff --git a/configs/experiment/drugprot.yaml b/configs/experiment/drugprot.yaml
new file mode 100644
index 00000000..a1a3eb05
--- /dev/null
+++ b/configs/experiment/drugprot.yaml
@@ -0,0 +1,69 @@
+# @package _global_
+
+# This setup is based on:
+# Leon Weber, Mario Sänger, Samuele Garda, Fabio Barth, Christoph Alt, Ulf Leser,
+# Chemical–protein relation extraction with ensembles of carefully tuned pretrained
+# language models, Database, Volume 2022, 2022, baac098, https://doi.org/10.1093/database/baac098
+# i.e. https://academic.oup.com/database/article/doi/10.1093/database/baac098/6833204
+#
+# The only differences (to our knowledge) are:
+# - we use the NLTK sentence splitter instead of FlairSegtokSentenceSplitter, and
+# - we use a different base model (RoBERTa-base). You can use the original model by downloading
+#   RoBERTa-base-PM-M3-Voc-hf from https://github.com/facebookresearch/bio-lm?tab=readme-ov-file#models
+#   and setting base_model_name to the path where you downloaded the model.
+
+# To test this config and execute a debug run (one batch only), call:
+# python src/train.py experiment=drugprot +trainer.fast_dev_run=true
+
+# To execute the full training run, call:
+# python src/train.py experiment=drugprot
+
+# Imports all configurations from the specified files (the file extension .yaml can be omitted)
+defaults:
+  - override /dataset: drugprot_prepared
+  - override /datamodule: default
+  - override /taskmodule: re_text_classification_with_indices
+  - override /model: sequence_classification_with_pooler
+  - override /callbacks: default
+  # this requires Weights & Biases (the wandb package) to be installed
+  - override /logger: wandb
+  - override /trainer: default
+
+# All parameters below will be merged with parameters from the default configurations set above.
+# This allows you to overwrite only the specified parameters.
+
+# The name of the run determines the folder name in logs.
+name: "drugprot/re_text_classification_with_indices"
+
+base_model_name: "FacebookAI/roberta-base"
+
+tags: ["dataset=drugprot", "model=sequence_classification_with_pooler"]
+
+seed: 12345
+
+monitor_metric: metric/micro/f1/val
+
+trainer:
+  min_epochs: 3
+  max_epochs: 3
+  # gradient_clip_val: 0.5
+
+datamodule:
+  batch_size: 32
+  num_workers: 8
+
+taskmodule:
+  # overwrite default values of the taskmodule (see taskmodule/re_text_classification_with_indices.yaml)
+  max_window: 256
+  add_type_to_marker: false
+  tokenizer_name_or_path: ${base_model_name}
+  add_candidate_relations: true
+  partition_annotation: labeled_partitions
+  collect_statistics: true
+
+model:
+  # overwrite default values of the model (see model/sequence_classification_with_pooler.yaml)
+  learning_rate: 3e-5
+  task_learning_rate: 3e-5
+  model_name_or_path: ${base_model_name}
+  warmup_proportion: 0.1
diff --git a/configs/model/sequence_classification_with_pooler.yaml b/configs/model/sequence_classification_with_pooler.yaml
new file mode 100644
index 00000000..2d12d651
--- /dev/null
+++ b/configs/model/sequence_classification_with_pooler.yaml
@@ -0,0 +1,4 @@
+_target_: pie_modules.models.SequenceClassificationModelWithPooler
+
+model_name_or_path: ???
+# see the actual model implementation for all available options and defaults
diff --git a/configs/taskmodule/re_text_classification_with_indices.yaml b/configs/taskmodule/re_text_classification_with_indices.yaml
new file mode 100644
index 00000000..d5738d09
--- /dev/null
+++ b/configs/taskmodule/re_text_classification_with_indices.yaml
@@ -0,0 +1,7 @@
+_target_: pie_modules.taskmodules.RETextClassificationWithIndicesTaskModule
+tokenizer_name_or_path: ???
+
+## Long sequence handling
+max_window: ??? # Maximum sequence length, depends on the model (e.g. 512 for BERT)
+
+# see the actual taskmodule implementation for all available options and defaults
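For orientation: `drugprot_base.yaml` is a Hydra instantiation target. Hydra resolves nested `_target_` entries recursively, so it first loads the pinned `pie/drugprot` revision via `pie_datasets.DatasetDict.load_dataset` and then passes the result as `input` to `src.utils.execute_pipeline`. A minimal sketch of the equivalent Python, assuming standard `hydra.utils.instantiate` semantics (`execute_pipeline` is project code from `src/utils`, not part of this diff):

```python
from hydra.utils import instantiate
from omegaconf import OmegaConf

# Load the raw config; _target_ entries are not resolved at this point.
cfg = OmegaConf.load("configs/dataset/drugprot_base.yaml")

# instantiate() resolves _target_ entries bottom-up: it calls
# pie_datasets.DatasetDict.load_dataset(path="pie/drugprot", name="drugprot_source",
# revision="d0a3f1183bb793f81350c8b5fb56743d7af29d8d") and then passes the
# result as `input` to src.utils.execute_pipeline.
dataset = instantiate(cfg)
```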
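The `???` entries in the model and taskmodule configs are OmegaConf's marker for mandatory values: accessing them before they are overridden raises an error, which is why the experiment config fills both `tokenizer_name_or_path` and `model_name_or_path` through the `${base_model_name}` interpolation. A small sketch of that behavior (paths as in this diff; the override dict just emulates what the experiment config provides):

```python
from omegaconf import OmegaConf
from omegaconf.errors import MissingMandatoryValue

cfg = OmegaConf.load("configs/taskmodule/re_text_classification_with_indices.yaml")

try:
    _ = cfg.max_window  # declared as ??? and not yet overridden
except MissingMandatoryValue:
    print("max_window must be set, e.g. by an experiment config")

# Emulate the values that configs/experiment/drugprot.yaml supplies:
merged = OmegaConf.merge(
    cfg, {"max_window": 256, "tokenizer_name_or_path": "FacebookAI/roberta-base"}
)
print(merged.max_window)  # 256
```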
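Since `base_model_name` is a plain top-level value of the experiment config, the original bio-lm weights mentioned in its header can be swapped in with an ordinary Hydra command-line override; the path below is a placeholder for wherever RoBERTa-base-PM-M3-Voc-hf was downloaded to:

```bash
python src/train.py experiment=drugprot base_model_name=/path/to/RoBERTa-base-PM-M3-Voc-hf
```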