new example: RE on drugprot (#169)
* add dataset, model, taskmodule and experiment config for drugprot

* add documentation

* fix documentation
ArneBinder authored Jul 26, 2024
1 parent c9ac6f4 commit d1babb2
Showing 5 changed files with 93 additions and 0 deletions.
6 changes: 6 additions & 0 deletions configs/dataset/drugprot_base.yaml
@@ -0,0 +1,6 @@
_target_: src.utils.execute_pipeline
input:
  _target_: pie_datasets.DatasetDict.load_dataset
  path: pie/drugprot
  name: drugprot_source
  revision: d0a3f1183bb793f81350c8b5fb56743d7af29d8d
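
For reference, instantiating this config amounts to a single direct call (a minimal sketch mirroring the _target_ above; it assumes the pie-datasets package is installed):

# Sketch: load the pinned DrugProt revision exactly as the config above does.
from pie_datasets import DatasetDict

dataset = DatasetDict.load_dataset(
    path="pie/drugprot",
    name="drugprot_source",
    revision="d0a3f1183bb793f81350c8b5fb56743d7af29d8d",
)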
7 changes: 7 additions & 0 deletions configs/dataset/drugprot_prepared.yaml
@@ -0,0 +1,7 @@
defaults:
  - drugprot_base
  - _convert_documents
  - _add_sentences

convert_documents:
  document_type: pytorch_ie.documents.TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
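
The `_convert_documents` default (defined elsewhere in the repo) applies the `convert_documents` settings during pipeline execution; below is a minimal sketch of the equivalent direct conversion, assuming pie-datasets exposes a `to_document_type` method on DatasetDict:

# Sketch: convert the loaded documents to the configured target type.
# to_document_type is an assumption about the pie-datasets API.
from pie_datasets import DatasetDict
from pytorch_ie.documents import (
    TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
)

dataset = DatasetDict.load_dataset(path="pie/drugprot", name="drugprot_source")
converted = dataset.to_document_type(
    TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
)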
69 changes: 69 additions & 0 deletions configs/experiment/drugprot.yaml
@@ -0,0 +1,69 @@
# @package _global_

# This setup is based on:
# Leon Weber, Mario Sänger, Samuele Garda, Fabio Barth, Christoph Alt, Ulf Leser,
# Chemical–protein relation extraction with ensembles of carefully tuned pretrained
# language models, Database, Volume 2022, 2022, baac098, https://doi.org/10.1093/database/baac098
# i.e. https://academic.oup.com/database/article/doi/10.1093/database/baac098/6833204
#
# The only differences (to our knowledge) are:
# - we use the NLTK sentence splitter instead of FlairSegtokSentenceSplitter, and
# - we use a different base model (RoBERTa-base). You can use the original model by downloading
# RoBERTa-base-PM-M3-Voc-hf from https://github.com/facebookresearch/bio-lm?tab=readme-ov-file#models
# and setting base_model_name to the path where you downloaded the model.

# To test this config and execute a debug run (one batch only), call:
# python src/train.py experiment=drugprot +trainer.fast_dev_run=true

# To execute the full training run, call:
# python src/train.py experiment=drugprot

# import all configurations from the files specified below (the .yaml file extension can be omitted)
defaults:
  - override /dataset: drugprot_prepared
  - override /datamodule: default
  - override /taskmodule: re_text_classification_with_indices
  - override /model: sequence_classification_with_pooler
  - override /callbacks: default
  # this requires Weights & Biases (wandb package) to be installed
  - override /logger: wandb
  - override /trainer: default

# all parameters below will be merged with parameters from default configurations set above
# this allows you to overwrite only specified parameters

# name of the run determines folder name in logs
name: "drugprot/re_text_classification_with_indices"

base_model_name: "FacebookAI/roberta-base"

tags: ["dataset=drugprot", "model=sequence_classification_with_pooler"]

seed: 12345

monitor_metric: metric/micro/f1/val

trainer:
  min_epochs: 3
  max_epochs: 3
  # gradient_clip_val: 0.5

datamodule:
  batch_size: 32
  num_workers: 8

taskmodule:
  # overwrite default values of the taskmodule (see taskmodule/re_text_classification_with_indices.yaml)
  max_window: 256
  add_type_to_marker: false
  tokenizer_name_or_path: ${base_model_name}
  add_candidate_relations: true
  partition_annotation: labeled_partitions
  collect_statistics: true

model:
  # overwrite default values of the model (see model/sequence_classification_with_pooler.yaml)
  learning_rate: 3e-5
  task_learning_rate: 3e-5
  model_name_or_path: ${base_model_name}
  warmup_proportion: 0.1
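
To inspect the fully merged experiment config without launching a run, Hydra's compose API can be used (a sketch; the top-level config name "train" and the configs/ path are assumptions about the repo layout):

# Sketch: print the merged config for the drugprot experiment.
from hydra import compose, initialize
from omegaconf import OmegaConf

with initialize(version_base=None, config_path="configs"):
    cfg = compose(config_name="train", overrides=["experiment=drugprot"])
print(OmegaConf.to_yaml(cfg))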
4 changes: 4 additions & 0 deletions configs/model/sequence_classification_with_pooler.yaml
@@ -0,0 +1,4 @@
_target_: pie_modules.models.SequenceClassificationModelWithPooler

model_name_or_path: ???
# see the actual model implementation for all available options and defaults
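
The `???` marks `model_name_or_path` as mandatory, so it must be supplied when the config is instantiated; in the experiment above it comes from `${base_model_name}`. A minimal sketch of the direct route, assuming Hydra's instantiate and an installed pie-modules:

# Sketch: materialize the model config, filling the mandatory "???" field.
from hydra.utils import instantiate
from omegaconf import OmegaConf

cfg = OmegaConf.load("configs/model/sequence_classification_with_pooler.yaml")
model = instantiate(cfg, model_name_or_path="FacebookAI/roberta-base")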
7 changes: 7 additions & 0 deletions configs/taskmodule/re_text_classification_with_indices.yaml
@@ -0,0 +1,7 @@
_target_: pie_modules.taskmodules.RETextClassificationWithIndicesTaskModule
tokenizer_name_or_path: ???

## Long sequence handling
max_window: ??? # Maximum sequence length, depends on the model (e.g. 512 for BERT)

# see the actual taskmodule implementation for all available options and defaults
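
Likewise, the taskmodule's two mandatory fields are filled by the experiment config; a sketch of the direct call with the same values, under the same assumptions as the model sketch above:

# Sketch: materialize the taskmodule config with the experiment's values.
from hydra.utils import instantiate
from omegaconf import OmegaConf

cfg = OmegaConf.load("configs/taskmodule/re_text_classification_with_indices.yaml")
taskmodule = instantiate(
    cfg,
    tokenizer_name_or_path="FacebookAI/roberta-base",
    max_window=256,
)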
