-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add dataset, model, taskmodule and experiment config for drugprot
- Loading branch information
1 parent
fc55427
commit 2d31684
Showing
5 changed files
with
90 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
_target_: src.utils.execute_pipeline | ||
input: | ||
_target_: pie_datasets.DatasetDict.load_dataset | ||
path: pie/drugprot | ||
name: drugprot_source | ||
revision: d0a3f1183bb793f81350c8b5fb56743d7af29d8d |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
defaults: | ||
- drugprot_base | ||
- _convert_documents | ||
- _add_sentences | ||
|
||
convert_documents: | ||
document_type: pytorch_ie.documents.TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
# @package _global_ | ||
|
||
# This setup is based on: | ||
# Leon Weber, Mario Sänger, Samuele Garda, Fabio Barth, Christoph Alt, Ulf Leser, | ||
# Chemical–protein relation extraction with ensembles of carefully tuned pretrained | ||
# language models, Database, Volume 2022, 2022, baac098, https://doi.org/10.1093/database/baac098 | ||
# i.e. https://academic.oup.com/database/article/doi/10.1093/database/baac098/6833204 | ||
# | ||
# The only differences (to our knowledge) are: | ||
# - we use NLTK sentence splitter instead of FlairSegtokSentenceSplitter, and | ||
# - we use a different base model (RoBERTa-base). You can use the original model by downloading | ||
# RoBERTa-base-PM-M3-Voc-hf from https://github.com/facebookresearch/bio-lm?tab=readme-ov-file#models | ||
# and setting base_model_name to the path where you downloaded the model. | ||
|
||
# To execute this experiment, run: | ||
# python src/train.py experiment=drugprot | ||
|
||
# Imports all configurations from the specified files (the file extension .yaml can be omitted) | ||
defaults: | ||
- override /dataset: drugprot_prepared | ||
- override /datamodule: default | ||
- override /taskmodule: re_text_classification_with_indices | ||
- override /model: sequence_classification_with_pooler | ||
- override /callbacks: default | ||
# this requires Weights & Biases (wandb package) to be installed | ||
- override /logger: wandb | ||
- override /trainer: default | ||
|
||
# all parameters below will be merged with parameters from default configurations set above | ||
# this allows you to overwrite only specified parameters | ||
|
||
# name of the run determines folder name in logs | ||
name: "drugprot/re_text_classification_with_indices" | ||
|
||
base_model_name: "FacebookAI/roberta-base" | ||
|
||
tags: ["dataset=drugprot", "model=sequence_classification_with_pooler"] | ||
|
||
seed: 12345 | ||
|
||
monitor_metric: metric/micro/f1/val | ||
|
||
trainer: | ||
min_epochs: 3 | ||
max_epochs: 3 | ||
# gradient_clip_val: 0.5 | ||
|
||
datamodule: | ||
batch_size: 32 | ||
num_workers: 8 | ||
|
||
taskmodule: | ||
# overwrite default values of the taskmodule (see taskmodule/re_text_classification_with_indices.yaml) | ||
max_window: 256 | ||
add_type_to_marker: false | ||
tokenizer_name_or_path: ${base_model_name} | ||
add_candidate_relations: true | ||
partition_annotation: labeled_partitions | ||
collect_statistics: true | ||
|
||
model: | ||
# overwrite default values of the model (see model/sequence_classification_with_pooler.yaml) | ||
learning_rate: 3e-5 | ||
task_learning_rate: 3e-5 | ||
model_name_or_path: ${base_model_name} | ||
warmup_proportion: 0.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
_target_: pie_modules.models.SequenceClassificationModelWithPooler | ||
|
||
model_name_or_path: ??? | ||
# see the actual model implementation for all available options and defaults |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
_target_: pie_modules.taskmodules.RETextClassificationWithIndicesTaskModule | ||
tokenizer_name_or_path: ??? | ||
|
||
## Long sequence handling | ||
max_window: ??? # Maximum sequence length, depends on the model (e.g. 512 for BERT) | ||
|
||
# see the actual taskmodule implementation for all available options and defaults |