Commit

fixed tokenizer upload issue
MoritzLaurer authored Jan 10, 2024
1 parent 064d2b8 commit 43d1fda
Showing 1 changed file with 17 additions and 10 deletions.
27 changes: 17 additions & 10 deletions 4_train_eval.py
@@ -14,7 +14,7 @@
"""

# for versioning of experiments with W&B
-DATE = 20231127
+DATE = 20240109

SEED_GLOBAL = 42

@@ -26,12 +26,12 @@

if USING_COLAB:
# comment this away this if you are not using colab
"""!pip install transformers[sentencepiece]~=4.33.0 -q
!pip install transformers[sentencepiece]~=4.33.0 -q
!pip install datasets~=2.14.0 -q
!pip install accelerate~=0.23.0 -q
!pip install wandb~=0.16.0 -q
!pip install mdutils~=1.6.0 -q
-!pip install scikit-learn~=1.2.0 -q"""
+!pip install scikit-learn~=1.2.0 -q

## load packages
import pandas as pd
@@ -116,7 +116,7 @@ def str2bool(v):

if USING_COLAB:
args = parser.parse_args([
"--dataset_name_heldout", "none" #"all_except_nli",
"--dataset_name_heldout", "none", #"all_except_nli",
# comment following arguments away to set them to False
"--do_train", "True",
"--upload_to_hub", "True"
@@ -174,7 +174,7 @@ def str2bool(v):
### Load model and tokenizer

if args.do_train:
model_name = "microsoft/deberta-v3-base" #"microsoft/deberta-v3-large" #"microsoft/deberta-v3-base"
model_name = "microsoft/xtremedistil-l6-h256-uncased" #"microsoft/deberta-v3-xsmall" #"microsoft/deberta-v3-large" #"microsoft/deberta-v3-base" # microsoft/xtremedistil-l6-h256-uncased
else:
# can only comprehensively test binary NLI models, because NLI test datasets are binarized
model_name = "MoritzLaurer/deberta-v3-base-mnli-fever-anli-ling-wanli-binary" #"facebook/bart-large-mnli" #"sileod/deberta-v3-base-tasksource-nli" #"MoritzLaurer/DeBERTa-v3-base-mnli-fever-docnli-ling-2c"
@@ -371,10 +371,12 @@ def chunks(lst, n): # Yield successive n-sized chunks from lst. https://stackov
per_device_train_batch_size = 8 if "large" in model_name else 32
gradient_accumulation_steps = 4 if "large" in model_name else 1

-if USING_COLAB:
-per_device_train_batch_size = int(per_device_train_batch_size / 4)
-gradient_accumulation_steps = int(gradient_accumulation_steps * 4)
-eval_batch = int(eval_batch / 32) if "large" in model_name else int(eval_batch / 8)
+#if USING_COLAB:
+#per_device_train_batch_size = int(per_device_train_batch_size / 4)
+#gradient_accumulation_steps = int(gradient_accumulation_steps * 4)
+#eval_batch = int(eval_batch / 32) if "large" in model_name else int(eval_batch / 8)

+hub_model_id = f'MoritzLaurer/{model_name.split("/")[-1]}-zeroshot-v1.1-{args.dataset_name_heldout}'

train_args = TrainingArguments(
output_dir=training_directory,
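A side note on the commented-out Colab block in the hunk above: dividing per_device_train_batch_size by 4 while multiplying gradient_accumulation_steps by 4 leaves the effective batch size per device unchanged, since the Trainer's effective batch is per_device_train_batch_size times gradient_accumulation_steps. A minimal sketch of that arithmetic, not part of the commit, using the non-"large" defaults from the lines above (all variable names below are illustrative):

# Sketch of the arithmetic behind the commented-out Colab adjustment.
per_device_train_batch_size = 32  # non-"large" default from the hunk above
gradient_accumulation_steps = 1
effective_batch = per_device_train_batch_size * gradient_accumulation_steps  # 32

# The Colab adjustment trades per-device batch size for accumulation steps.
colab_per_device = per_device_train_batch_size // 4   # 8
colab_accumulation = gradient_accumulation_steps * 4  # 4
assert colab_per_device * colab_accumulation == effective_batch  # still 32 per device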
@@ -406,7 +408,7 @@ def chunks(lst, n): # Yield successive n-sized chunks from lst. https://stackov
report_to="all", # "all"
run_name=run_name,
push_to_hub=True, # does not seem to work if save_strategy="no"
-hub_model_id=f'MoritzLaurer/{model_name.split("/")[-1]}-zeroshot-v1.1-{args.dataset_name_heldout}',
+hub_model_id=hub_model_id,
hub_token=config.HF_ACCESS_TOKEN,
hub_strategy="end",
hub_private_repo=True,
@@ -493,6 +495,11 @@ def chunks(lst, n): # Yield successive n-sized chunks from lst. https://stackov

trainer.push_to_hub(commit_message="End of training")

+# tokenizer needs to be uploaded separately to create tokenizer.json
+# otherwise only tokenizer_config.json is created and pip install sentencepiece is required
+tokenizer.push_to_hub(repo_id=hub_model_id, use_temp_dir=True, private=True, use_auth_token=config.HF_ACCESS_TOKEN)


# to save best model to disk
"""
model_path = f"{training_directory}/best-{model_name.split('/')[-1]}-{DATE}"
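On the fix itself: pushing the tokenizer explicitly after training uploads the fast-tokenizer file (tokenizer.json) alongside tokenizer_config.json, so users can load the model without installing sentencepiece. Below is a minimal sketch, not part of the commit, of how one might check the uploaded repo afterwards; it reuses hub_model_id and config.HF_ACCESS_TOKEN from the script above and huggingface_hub's HfApi.list_repo_files:

# Sketch: confirm the fast-tokenizer file is now present in the Hub repo.
# Assumes hub_model_id and config.HF_ACCESS_TOKEN from the training script above.
from huggingface_hub import HfApi

api = HfApi()
files = api.list_repo_files(repo_id=hub_model_id, token=config.HF_ACCESS_TOKEN)

# tokenizer.json is the serialized fast tokenizer; the original issue was that only
# tokenizer_config.json was uploaded, forcing users onto the slow (sentencepiece) path.
assert "tokenizer.json" in files, "tokenizer.json missing from the uploaded repo"
print([f for f in files if "tokenizer" in f])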
