Commit 74b9b3c — Update from pytorch-transformers to transformers library (huggingface#61)

* updated dependencies, updated ignore_index and ignored values in tensors

* removed idea project files

* set transformers library version, updated additional special tokens to list
andr-ec authored Feb 27, 2020
1 parent f071ee6 commit 74b9b3c
Showing 6 changed files with 12 additions and 12 deletions.
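
The substantive change, beyond the package rename, is the ignored-label index: pytorch-transformers masked language-model label positions with -1, while transformers computes its LM loss with ignore_index=-100. A minimal standalone sketch of that behavior (illustrative, not code from this repo):

import torch
import torch.nn.functional as F

# transformers-era convention: label positions set to -100 contribute nothing
# to the loss; the old -1 sentinel would now be treated as an invalid class id.
logits = torch.randn(1, 4, 10)               # (batch, seq_len, vocab_size)
labels = torch.tensor([[5, 7, -100, -100]])  # last two positions are masked
loss = F.cross_entropy(logits.view(-1, 10), labels.view(-1), ignore_index=-100)
print(loss)  # averaged over the two unmasked positions only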
2 changes: 1 addition & 1 deletion convai_evaluation.py
@@ -17,7 +17,7 @@
 from projects.convai2.eval_f1 import eval_f1, setup_args as setup_args_f1
 from projects.convai2.eval_ppl import eval_ppl, setup_args as setup_args_ppl
 from projects.convai2.build_dict import build_dict
-from pytorch_transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
+from transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
                                  GPT2DoubleHeadsModel, GPT2LMHeadModel, GPT2Tokenizer)
 
 from train import build_input_from_segments, pad_dataset, SPECIAL_TOKENS, add_special_tokens_
2 changes: 1 addition & 1 deletion interact.py
@@ -12,7 +12,7 @@
 import torch
 import torch.nn.functional as F
 
-from pytorch_transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, GPT2LMHeadModel, GPT2Tokenizer
+from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, GPT2LMHeadModel, GPT2Tokenizer
 from train import SPECIAL_TOKENS, build_input_from_segments, add_special_tokens_
 from utils import get_dataset, download_pretrained_model
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,5 +1,5 @@
 torch
 pytorch-ignite
-pytorch-transformers>=1.2
+transformers==2.5.1
 tensorboardX==1.8
 tensorflow # for tensorboardX
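
Pinning an exact version rather than a >= range is deliberate: the library's API was changing quickly across the 2.x releases, and this code targets the 2.5.1 surface. A quick runtime sanity check (a minimal sketch, not part of the repo):

import transformers

# Fail fast if a different version is installed.
assert transformers.__version__ == "2.5.1", transformers.__version__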
2 changes: 1 addition & 1 deletion test_special_tokens.py
@@ -2,7 +2,7 @@
 import shutil
 import unittest
 
-from pytorch_transformers import OpenAIGPTTokenizer, GPT2Tokenizer
+from transformers import OpenAIGPTTokenizer, GPT2Tokenizer
 from train import ATTR_TO_SPECIAL_TOKEN, SPECIAL_TOKENS
 
 class TestSpecialTokenTreatment(unittest.TestCase):
14 changes: 7 additions & 7 deletions train.py
@@ -16,14 +16,14 @@
 from ignite.metrics import Accuracy, Loss, MetricsLambda, RunningAverage
 from ignite.contrib.handlers import ProgressBar, PiecewiseLinear
 from ignite.contrib.handlers.tensorboard_logger import TensorboardLogger, OutputHandler, OptimizerParamsHandler
-from pytorch_transformers import (AdamW, OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
+from transformers import (AdamW, OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
                           GPT2DoubleHeadsModel, GPT2Tokenizer, WEIGHTS_NAME, CONFIG_NAME)
 
 from utils import get_dataset, make_logdir
 
 SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]
 ATTR_TO_SPECIAL_TOKEN = {'bos_token': '<bos>', 'eos_token': '<eos>', 'pad_token': '<pad>',
-                         'additional_special_tokens': ('<speaker1>', '<speaker2>')}
+                         'additional_special_tokens': ['<speaker1>', '<speaker2>']}
 MODEL_INPUTS = ["input_ids", "mc_token_ids", "lm_labels", "mc_labels", "token_type_ids"]
 PADDED_INPUTS = ["input_ids", "lm_labels", "token_type_ids"]
@@ -42,7 +42,7 @@ def pad_dataset(dataset, padding=0):
     """ Pad the dataset. This could be optimized by defining a Dataset class and padding at the batch level, but this is simpler. """
     max_l = max(len(x) for x in dataset["input_ids"])
     for name in PADDED_INPUTS:
-        dataset[name] = [x + [padding if name != "lm_labels" else -1] * (max_l - len(x)) for x in dataset[name]]
+        dataset[name] = [x + [padding if name != "lm_labels" else -100] * (max_l - len(x)) for x in dataset[name]]
     return dataset
@@ -62,9 +62,9 @@ def build_input_from_segments(persona, history, reply, tokenizer, lm_labels=False
     instance["input_ids"] = list(chain(*sequence))
     instance["token_type_ids"] = [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in s]
     instance["mc_token_ids"] = len(instance["input_ids"]) - 1
-    instance["lm_labels"] = [-1] * len(instance["input_ids"])
+    instance["lm_labels"] = [-100] * len(instance["input_ids"])
     if lm_labels:
-        instance["lm_labels"] = ([-1] * sum(len(s) for s in sequence[:-1])) + [-1] + sequence[-1][1:]
+        instance["lm_labels"] = ([-100] * sum(len(s) for s in sequence[:-1])) + [-100] + sequence[-1][1:]
     return instance
@@ -227,7 +227,7 @@ def inference(engine, batch):
     # Prepare metrics - note how we compute distributed metrics
     RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
-    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0][0], x[1][0])),
+    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-100), output_transform=lambda x: (x[0][0], x[1][0])),
                "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))}
     metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args),
                     "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)})
@@ -260,7 +260,7 @@ def inference(engine, batch):
     # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
     if args.local_rank in [-1, 0] and args.n_epochs > 0:
-        os.rename(checkpoint_handler._saved[-1][1][-1], os.path.join(log_dir, WEIGHTS_NAME))  # TODO: PR in ignite to have better access to saved file paths (cleaner)
+        os.rename(os.path.join(log_dir, checkpoint_handler._saved[-1][1]), os.path.join(log_dir, WEIGHTS_NAME))  # TODO: PR in ignite to have better access to saved file paths (cleaner)
         tb_logger.close()
 
 if __name__ == "__main__":
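
The tuple-to-list change in ATTR_TO_SPECIAL_TOKEN matches what tokenizer.add_special_tokens expects for additional_special_tokens in transformers 2.x. A minimal sketch of the registration pattern this dict feeds (the resize step mirrors how the repo's add_special_tokens_ helper is typically written, and is an assumption here):

from transformers import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer

ATTR_TO_SPECIAL_TOKEN = {'bos_token': '<bos>', 'eos_token': '<eos>', 'pad_token': '<pad>',
                         'additional_special_tokens': ['<speaker1>', '<speaker2>']}

tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
model = OpenAIGPTDoubleHeadsModel.from_pretrained("openai-gpt")

num_added = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)  # returns the number of new tokens
if num_added > 0:
    model.resize_token_embeddings(len(tokenizer))  # grow the embedding matrix to cover the new ids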
2 changes: 1 addition & 1 deletion utils.py
@@ -11,7 +11,7 @@
 
 import torch
 
-from pytorch_transformers import cached_path
+from transformers import cached_path
 
 PERSONACHAT_URL = "https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json"
 HF_FINETUNED_MODEL = "https://s3.amazonaws.com/models.huggingface.co/transfer-learning-chatbot/gpt_personachat_cache.tar.gz"
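
cached_path keeps its behavior across the rename: it downloads a remote file on first use and returns the local cache path on later calls. A minimal usage sketch against the dataset URL above:

from transformers import cached_path  # moved here from pytorch_transformers

PERSONACHAT_URL = "https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json"

dataset_path = cached_path(PERSONACHAT_URL)  # download on first call, cache hit afterwards
print(dataset_path)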
