Skip to content

Commit

Permalink
add dataset, model, taskmodule and experiment config for drugprot
Browse files Browse the repository at this point in the history
  • Loading branch information
ArneBinder committed Jul 26, 2024
1 parent fc55427 commit 2d31684
Show file tree
Hide file tree
Showing 5 changed files with 90 additions and 0 deletions.
6 changes: 6 additions & 0 deletions configs/dataset/drugprot_base.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Base dataset config for DrugProt.
# The top-level `_target_` is the callable Hydra instantiates; `input` is itself an
# instantiable node (it carries its own `_target_`) that loads the dataset.
_target_: src.utils.execute_pipeline
input:
  _target_: pie_datasets.DatasetDict.load_dataset
  path: pie/drugprot
  name: drugprot_source
  # Pinned dataset revision (commit hash); quoted so it is always read as a string.
  revision: 'd0a3f1183bb793f81350c8b5fb56743d7af29d8d'
7 changes: 7 additions & 0 deletions configs/dataset/drugprot_prepared.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# DrugProt dataset with preparation steps applied on top of the base config.
# The defaults list composes the sibling configs in the order given.
# NOTE(review): `_convert_documents` and `_add_sentences` are defined outside this
# file; presumably they add the conversion / sentence-splitting steps - confirm.
defaults:
  - drugprot_base
  - _convert_documents
  - _add_sentences

# Target document type for the `convert_documents` step.
convert_documents:
  document_type: pytorch_ie.documents.TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
66 changes: 66 additions & 0 deletions configs/experiment/drugprot.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# @package _global_

# This setup is based on:
#   Leon Weber, Mario Sänger, Samuele Garda, Fabio Barth, Christoph Alt, Ulf Leser,
#   Chemical–protein relation extraction with ensembles of carefully tuned pretrained
#   language models, Database, Volume 2022, 2022, baac098, https://doi.org/10.1093/database/baac098
#   i.e. https://academic.oup.com/database/article/doi/10.1093/database/baac098/6833204
#
# The only differences (to our knowledge) are:
#   - we use the NLTK sentence splitter instead of FlairSegtokSentenceSplitter, and
#   - we use a different base model (RoBERTa-base). You can use the original model by downloading
#     RoBERTa-base-PM-M3-Voc-hf from https://github.com/facebookresearch/bio-lm?tab=readme-ov-file#models
#     and setting base_model_name to the path where you downloaded the model.

# To execute this experiment run:
#   python src/train.py experiment=drugprot

# Imports all configurations from the specified files (the file extension .yaml can be omitted)
defaults:
  - override /dataset: drugprot_prepared
  - override /datamodule: default
  - override /taskmodule: re_text_classification_with_indices
  - override /model: sequence_classification_with_pooler
  - override /callbacks: default
  # this requires Weights & Biases (wandb package) to be installed
  - override /logger: wandb
  - override /trainer: default

# all parameters below will be merged with parameters from default configurations set above
# this allows you to overwrite only specified parameters

# name of the run determines folder name in logs
name: "drugprot/re_text_classification_with_indices"

# Single source of truth for the pretrained model; referenced below via
# ${base_model_name} by both the taskmodule (tokenizer) and the model.
base_model_name: "FacebookAI/roberta-base"

tags: ["dataset=drugprot", "model=sequence_classification_with_pooler"]

seed: 12345

monitor_metric: metric/micro/f1/val

trainer:
  # min_epochs == max_epochs, i.e. a fixed number of epochs
  min_epochs: 3
  max_epochs: 3
  # gradient_clip_val: 0.5

datamodule:
  batch_size: 32
  num_workers: 8

taskmodule:
  # overwrite default values of the taskmodule (see taskmodule/re_text_classification_with_indices.yaml)
  max_window: 256
  add_type_to_marker: false
  tokenizer_name_or_path: ${base_model_name}
  add_candidate_relations: true
  partition_annotation: labeled_partitions
  collect_statistics: true

model:
  # overwrite default values of the model (see model/sequence_classification_with_pooler.yaml)
  learning_rate: 3e-5
  task_learning_rate: 3e-5
  model_name_or_path: ${base_model_name}
  warmup_proportion: 0.1
4 changes: 4 additions & 0 deletions configs/model/sequence_classification_with_pooler.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Model config: instantiates SequenceClassificationModelWithPooler from pie_modules.
_target_: 'pie_modules.models.SequenceClassificationModelWithPooler'

# Mandatory value (`???`): has to be provided by the composing config or on the command line.
model_name_or_path: '???'
# All further options and their defaults are defined by the model implementation itself.
7 changes: 7 additions & 0 deletions configs/taskmodule/re_text_classification_with_indices.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Taskmodule config: instantiates RETextClassificationWithIndicesTaskModule from pie_modules.
_target_: 'pie_modules.taskmodules.RETextClassificationWithIndicesTaskModule'

# Mandatory value (`???`): has to be provided by the composing config or on the command line.
tokenizer_name_or_path: '???'

## Long sequence handling
# Mandatory value (`???`): maximum sequence length, depends on the model (e.g. 512 for BERT).
max_window: '???'

# All further options and their defaults are defined by the
# RETextClassificationWithIndicesTaskModule implementation itself.

0 comments on commit 2d31684

Please sign in to comment.