From 0c0f60b4aae39452500ab6f49d60c80038207bf4 Mon Sep 17 00:00:00 2001
From: fh
Date: Fri, 20 Sep 2024 14:47:47 +0200
Subject: [PATCH] get rid of unnecessary decoder_start_table

---
 eole/bin/convert/convert_HF.py | 38 +---------------------------------
 1 file changed, 1 insertion(+), 37 deletions(-)

diff --git a/eole/bin/convert/convert_HF.py b/eole/bin/convert/convert_HF.py
index ee08baf2..b39c0d87 100755
--- a/eole/bin/convert/convert_HF.py
+++ b/eole/bin/convert/convert_HF.py
@@ -190,16 +190,6 @@
     "XLMRobertaXLForMaskedLM": TransformerEncoderModelConfig,
 }
 
-decoder_start_table = {
-    "LlamaForCausalLM": "",
-    "MistralForCausalLM": "",
-    "MixtralForCausalLM": "",
-    "PhiForCausalLM": "",
-    "Phi3ForCausalLM": "",
-    "GPT2LMHeadModel": "",
-    "XLMRobertaXLForMaskedLM": "",
-}
-
 
 class Tokenizer:
     def __init__(self, model_path: str):
@@ -591,7 +581,6 @@ def run(cls, args):
             "n_positions": 0,
         }
         left_pad = True
-        # eos_token = None
         optional_eos = []
         mapped_tokens = []
 
@@ -951,11 +940,6 @@ def get_weight(checkpoint, tensor_name):
                 data["added_tokens_decoder"][str(index)]["content"]
                 for index in eos_token_id[1:]
             ]
-            # eos_token = optional_eos[0]
-        # elif isinstance(eos_token_id, int):
-        #     eos_token = data["added_tokens_decoder"][str(eos_token_id)][
-        #         "content"
-        #     ]
         # Automatically convert added_tokens into mapped_tokens
         mapped_tokens = [
             (
@@ -1008,16 +992,6 @@ def get_weight(checkpoint, tensor_name):
             vocab.extend(newtokens)
             for tok in data["added_tokens"]:
                 vocab[tok["id"]] = tok["content"]
-            # if "<|startoftext|>" in vocab:
-            #     index = vocab.index("<|startoftext|>")
-            #     vocab[index] = DefaultTokens.BOS
-            # if eos_token is not None:
-            #     if eos_token in vocab and "" not in vocab:
-            #         index = vocab.index(eos_token)
-            #         vocab[index] = DefaultTokens.EOS
-            # if "<0x00>" in vocab:
-            #     index = vocab.index("<0x00>")
-            #     vocab[index] = DefaultTokens.PAD
             src_vocab = pyonmttok.build_vocab_from_tokens(
                 vocab,
             )
@@ -1042,16 +1016,6 @@ def get_weight(checkpoint, tensor_name):
                 vocab.append(DefaultTokens.VOCAB_PAD + str(i))
             for tok in data["added_tokens"]:
                 vocab[tok["id"]] = tok["content"]
-            # if "<|startoftext|>" in vocab:
-            #     index = vocab.index("<|startoftext|>")
-            #     vocab[index] = DefaultTokens.BOS
-            # if "<|begin_of_text|>" in vocab:
-            #     index = vocab.index("<|begin_of_text|>")
-            #     vocab[index] = DefaultTokens.BOS
-            # if eos_token is not None:
-            #     if eos_token in vocab and "" not in vocab:
-            #         index = vocab.index(eos_token)
-            #         vocab[index] = DefaultTokens.EOS
             src_vocab = pyonmttok.build_vocab_from_tokens(vocab)
 
             tokenizer_basename = "bpe.model"
@@ -1066,7 +1030,7 @@ def get_weight(checkpoint, tensor_name):
         vocabs["src"] = src_vocab
         vocabs["tgt"] = src_vocab
         if add_bos_token:
-            vocabs["decoder_start_token"] = decoder_start_table[arch]
+            vocabs["decoder_start_token"] = vocabs["specials"]["bos_token"]
        else:
             vocabs["decoder_start_token"] = ""
         vocab_dict = vocabs_to_dict(vocabs)
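
Note (not part of the patch): a minimal sketch of the intended behaviour, assuming
vocabs["specials"]["bos_token"] has been filled earlier in run() from the HF
tokenizer config. The helper name pick_decoder_start_token and the example value
"<s>" are hypothetical; they only illustrate that the decoder start token now
follows the tokenizer's own BOS token instead of the removed per-architecture
decoder_start_table.

    # Hypothetical sketch, not taken from convert_HF.py.
    def pick_decoder_start_token(vocabs: dict, add_bos_token: bool) -> str:
        if add_bos_token:
            # After this patch: reuse whatever BOS token the tokenizer declares,
            # rather than looking the architecture up in a hard-coded table.
            return vocabs["specials"]["bos_token"]
        return ""

    # Example with an assumed Llama-style BOS token.
    vocabs = {"specials": {"bos_token": "<s>"}}
    print(pick_decoder_start_token(vocabs, add_bos_token=True))   # -> <s>
    print(pick_decoder_start_token(vocabs, add_bos_token=False))  # -> (empty string)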