diff --git a/eole/bin/convert/convert_HF.py b/eole/bin/convert/convert_HF.py
index 7836e221..06752cf5 100755
--- a/eole/bin/convert/convert_HF.py
+++ b/eole/bin/convert/convert_HF.py
@@ -190,6 +190,16 @@
     "XLMRobertaXLForMaskedLM": TransformerEncoderModelConfig,
 }
 
+tok_table = {
+    "LlamaForCausalLM": "huggingface_tokenize",
+    "MistralForCausalLM": "mistral_tokenize",
+    "MixtralForCausalLM": "mistral_tokenize",
+    "PhiForCausalLM": "huggingface_tokenize",
+    "Phi3ForCausalLM": "huggingface_tokenize",
+    "GPT2LMHeadModel": "huggingface_tokenize",
+    "XLMRobertaXLForMaskedLM": "huggingface_tokenize",
+}
+
 
 class Tokenizer:
     def __init__(self, model_path: str):
@@ -306,6 +316,7 @@ def run(cls, args):
             else:
                 generation_config_json = None
         else:
+            huggingface_model = args.model_dir
             directory_path = args.output
             os.makedirs(directory_path, exist_ok=True)
             try:
@@ -1053,6 +1064,33 @@ def get_weight(checkpoint, tensor_name):
                 for merge in data["model"]["merges"]:
                     bpemodel.write(merge + "\n")
 
+        transforms = [
+            tok_table[arch]
+        ]  # , "filtertoolong"] # the filtertoolong transform is not plug-n-play with id_tokenize
+        if tok_table[arch] == "huggingface_tokenize":
+            transforms_configs = {
+                tok_table[arch]: {"max_length": 512},
+            }
+        elif tok_table[arch] == "mistral_tokenize":
+            transforms_configs = {
+                tok_table[arch]: {
+                    "path": os.path.join("${MODEL_PATH}", tokenizer_basename)
+                }
+            }
+        else:
+            # not used right now, but keeping for reference
+            transforms_configs = {
+                "filtertoolong": {"src_seq_length": 512, "tgt_seq_length": 512},
+                "onmt_tokenize": {
+                    "src_subword_type": src_subword_type,
+                    "src_subword_model": os.path.join(
+                        "${MODEL_PATH}", tokenizer_basename
+                    ),
+                    "gpt2_pretok": gpt2_pretok,
+                    "mapped_tokens": mapped_tokens,
+                },
+            }
+
         vocabs["src"] = src_vocab
         vocabs["tgt"] = src_vocab
         if add_bos_token:
@@ -1084,18 +1122,8 @@ def get_weight(checkpoint, tensor_name):
             vocab_size_multiple=8,
             decoder_start_token=vocabs["decoder_start_token"],
             **vocabs["specials"],
-            transforms=["onmt_tokenize", "filtertoolong"],
-            transforms_configs={
-                "filtertoolong": {"src_seq_length": 512, "tgt_seq_length": 512},
-                "onmt_tokenize": {
-                    "src_subword_type": src_subword_type,
-                    "src_subword_model": os.path.join(
-                        "${MODEL_PATH}", tokenizer_basename
-                    ),
-                    "gpt2_pretok": gpt2_pretok,
-                    "mapped_tokens": mapped_tokens,
-                },
-            },
+            transforms=transforms,
+            transforms_configs=transforms_configs,
             model=arch_table[arch](
                 layers=n_layers,
                 hidden_size=hidden_size,
@@ -1122,6 +1150,7 @@ def get_weight(checkpoint, tensor_name):
                 num_experts=num_experts,
                 num_experts_per_tok=num_experts_per_tok,
                 left_pad=left_pad,
+                huggingface_model=huggingface_model,
             ),
             training=TrainingConfig(
                 compute_dtype=compute_dtype,
diff --git a/eole/config/data.py b/eole/config/data.py
index 1675b59a..851adba4 100644
--- a/eole/config/data.py
+++ b/eole/config/data.py
@@ -1,6 +1,6 @@
 import os
 from typing import Dict, List, Literal
-from pydantic import Field, field_validator  # , model_validator
+from pydantic import Field, field_validator, model_validator
 from pydantic import create_model
 from eole import constants
 
@@ -332,3 +332,14 @@ def _validate_data_config(self, build_vocab_only=False):
         # TrainConfig without existing files (e.g. inference)
         # self._validate_vocab_config(build_vocab_only=build_vocab_only)
         return self
+
+    @model_validator(mode="after")
+    def _maybe_set_huggingface_model(self):
+        if getattr(self, "model", None) is None:
+            return self
+        if self.model.huggingface_model is not None:
+            if hasattr(self.transforms_configs, "huggingface_tokenize"):
+                self.transforms_configs.huggingface_tokenize.huggingface_model = (
+                    self.model.huggingface_model
+                )
+        return self
diff --git a/eole/config/models.py b/eole/config/models.py
index 51ee8654..3a89ede5 100644
--- a/eole/config/models.py
+++ b/eole/config/models.py
@@ -6,6 +6,7 @@
     computed_field,
 )  # , TypeAdapter
 
+import eole
 from eole.constants import PositionEncodingType, ActivationFunction, ModelType
 from eole.config.config import Config
 
@@ -438,6 +439,13 @@ class BaseModelConfig(Config):
     left_pad: bool = Field(
         default=False, description="Enable left-padding, useful for some LLMs."
     )
+    huggingface_model: str | None = Field(
+        default=None, description="Original huggingface model."
+    )
+    eole_version: str | None = Field(
+        default=eole.__version__,
+        description="Eole version used to convert/train/save the model.",
+    )
 
     # @computed_field()
     # @property
diff --git a/eole/inputters/dynamic_iterator.py b/eole/inputters/dynamic_iterator.py
index a39aabc2..3763cda8 100644
--- a/eole/inputters/dynamic_iterator.py
+++ b/eole/inputters/dynamic_iterator.py
@@ -278,9 +278,13 @@ def _tuple_to_json_with_tokIDs(self, tuple_bucket):
         tuple_bucket = transform_bucket(self.task, tuple_bucket, self.score_threshold)
         for example in tuple_bucket:
             if example is not None:
-                bucket.append(
-                    numericalize(self.vocabs, example, model_type=self.model_type)
+                numericalized = numericalize(
+                    self.vocabs, example, model_type=self.model_type
                 )
+                bucket.append(numericalized)
+                # print(numericalized)
+                # exit()
+
         return bucket
 
     def _add_indice(self, bucket):
diff --git a/eole/inputters/text_utils.py b/eole/inputters/text_utils.py
index bd1ef022..89427932 100644
--- a/eole/inputters/text_utils.py
+++ b/eole/inputters/text_utils.py
@@ -59,34 +59,45 @@ def transform_bucket(task, bucket, threshold=0):
 def numericalize(vocabs, example, model_type=ModelType.ENCODER_DECODER):
     """ """
     decoder_start_token = vocabs["decoder_start_token"]
+    # print("decoder_start_token:", decoder_start_token)
+    # print(example)
     numeric = example
-    numeric["src"]["src_ids"] = []
+    numeric["src"]["src_ids"] = example.get("src_ids", [])
+    maybe_tgt_ids = example.get("tgt_ids", [])
     if model_type == ModelType.ENCODER_DECODER:
         src_text = example["src"]["src"].split(" ")
-        numeric["src"]["src_ids"] = vocabs["src"](src_text)
+        if numeric["src"]["src_ids"] == []:
+            numeric["src"]["src_ids"] = vocabs["src"](src_text)
         if example["tgt"] is not None:
-            numeric["tgt"]["tgt_ids"] = []
-            tgt_text = example["tgt"]["tgt"].split(" ")
-            numeric["tgt"]["tgt_ids"] = vocabs["tgt"](
-                [decoder_start_token]
-                + tgt_text
-                + [vocabs["specials"].get("eos_token", "")]
-            )
+            if maybe_tgt_ids != []:
+                numeric["tgt"]["tgt_ids"] = maybe_tgt_ids
+            else:
+                tgt_text = example["tgt"]["tgt"].split(" ")
+                numeric["tgt"]["tgt_ids"] = vocabs["tgt"](
+                    [decoder_start_token]
+                    + tgt_text
+                    + [vocabs["specials"].get("eos_token", "")]
+                )
     elif model_type == ModelType.DECODER:
-        src_text = example["src"]["src"].split(" ")
-        if decoder_start_token != "":
-            src_text = [decoder_start_token] + src_text
-        numeric["src"]["src_ids"] = vocabs["src"](src_text)
+        if numeric["src"]["src_ids"] == []:
+            src_text = example["src"]["src"].split(" ")
+            if decoder_start_token != "":
+                src_text = [decoder_start_token] + src_text
+            numeric["src"]["src_ids"] = vocabs["src"](src_text)
         if example["tgt"] is not None:
-            numeric["tgt"]["tgt_ids"] = []
-            tgt_text = example["tgt"]["tgt"].split(" ")
-            numeric["tgt"]["tgt_ids"] = vocabs["tgt"](
-                tgt_text + [vocabs["specials"].get("eos_token", "")]
-            )
+            if maybe_tgt_ids != []:
+                # decoder_start_token logic is supposedly handled in the tokenizer
+                numeric["tgt"]["tgt_ids"] = maybe_tgt_ids
+            else:
+                tgt_text = example["tgt"]["tgt"].split(" ")
+                numeric["tgt"]["tgt_ids"] = vocabs["tgt"](
+                    tgt_text + [vocabs["specials"].get("eos_token", "")]
+                )
             if decoder_start_token == "":
                 numeric["tgt"]["tgt_ids"] = numeric["tgt"]["tgt_ids"][1:]
+    # TODO: support id tokenization
     elif model_type == ModelType.ENCODER:
         src_text = example["src"]["src"].split(" ")
         if example["tgt"] is not None:  # TO BE DISCUSSED
diff --git a/eole/predict/inference.py b/eole/predict/inference.py
index aaf86b6e..cb532202 100644
--- a/eole/predict/inference.py
+++ b/eole/predict/inference.py
@@ -7,7 +7,7 @@
 from math import exp
 import codecs
 
-from eole.transforms import TransformPipe
+from eole.transforms import TransformPipe, AVAILABLE_TRANSFORMS
 from eole.constants import DefaultTokens
 from eole.predict.prediction import PredictionBuilder
 from eole.utils.misc import set_random_seed, report_matrix, sequence_mask
@@ -89,6 +89,7 @@ def __init__(
         return_gold_log_probs=False,
         add_estimator=False,
         optional_eos=[],
+        id_tokenization=False,
     ):
         self.model = model
         self.vocabs = vocabs
@@ -170,6 +171,7 @@ def __init__(
 
         self.return_gold_log_probs = return_gold_log_probs
         self.add_estimator = add_estimator
+        self.id_tokenization = id_tokenization
 
     @classmethod
     def from_config(
@@ -204,6 +206,12 @@ def from_config(
         """
 
         # TODO: maybe add dynamic part
+        id_tokenization = False
+        if len(config.transforms) > 0:
+            tail_transform_cls = AVAILABLE_TRANSFORMS.get(config.transforms[-1], None)
+            if getattr(tail_transform_cls, "output_type", None) == "ids":
+                id_tokenization = True
+
         return cls(
             model,
             vocabs,
@@ -238,6 +246,7 @@ def from_config(
             with_score=config.with_score,
             add_estimator=model_config.add_estimator,
             optional_eos=config.optional_eos,
+            id_tokenization=id_tokenization,
         )
 
     def _log(self, msg):
@@ -296,6 +305,7 @@ def _predict(
             self.replace_unk,
             self.phrase_table,
             self._tgt_eos_idx,
+            self.id_tokenization,
         )
 
         # Statistics
@@ -384,9 +394,12 @@ def _process_bucket(bucket_predictions):
                     bucket_gold_score += trans.gold_score
                     bucket_gold_words += len(trans.gold_sent) + 1
 
-                n_best_preds = [
-                    " ".join(pred) for pred in trans.pred_sents[: self.n_best]
-                ]
+                if self.id_tokenization:
+                    n_best_preds = trans.pred_sents[: self.n_best]
+                else:
+                    n_best_preds = [
+                        " ".join(pred) for pred in trans.pred_sents[: self.n_best]
+                    ]
 
                 if self.report_align:
                     align_pharaohs = [
diff --git a/eole/predict/prediction.py b/eole/predict/prediction.py
index e096a8a2..d999b268 100644
--- a/eole/predict/prediction.py
+++ b/eole/predict/prediction.py
@@ -20,13 +20,20 @@ class PredictionBuilder(object):
     """
 
     def __init__(
-        self, vocabs, n_best=1, replace_unk=False, phrase_table="", tgt_eos_idx=None
+        self,
+        vocabs,
+        n_best=1,
+        replace_unk=False,
+        phrase_table="",
+        tgt_eos_idx=None,
+        id_tokenization=False,
     ):
         self.vocabs = vocabs
         self.n_best = n_best
         self.replace_unk = replace_unk
         self.phrase_table_dict = {}
         self.tgt_eos_idx = tgt_eos_idx  # List of IDs here
+        self.id_tokenization = id_tokenization
         if phrase_table != "" and os.path.exists(phrase_table):
             with open(phrase_table) as phrase_table_fd:
                 for line in phrase_table_fd:
@@ -39,7 +46,10 @@ def _build_target_tokens(self, src, srclen, pred, attn, voc, dyn_voc):
         pred_list = pred.tolist()
         if pred_list[-1] in self.tgt_eos_idx:
             pred_list = pred_list[:-1]
-        if dyn_voc is None:
+        if self.id_tokenization:
+            # TODO assert dyn_voc is not compatible with id_tokenization
+            tokens = pred_list
+        elif dyn_voc is None:
             tokens = [voc[tok] for tok in pred_list]
         else:
             tokens = [
@@ -49,15 +59,19 @@ def _build_target_tokens(self, src, srclen, pred, attn, voc, dyn_voc):
                 for tok in pred_list
             ]
 
-        if self.replace_unk and attn is not None and src is not None:
-            for i in range(len(tokens)):
-                if tokens[i] == DefaultTokens.UNK:
-                    _, max_index = attn[i][:srclen].max(0)
-                    src_tok = self.vocabs["src"].ids_to_tokens[src[max_index.item()]]
-                    tokens[i] = src_tok
-                    if self.phrase_table_dict:
-                        if src_tok in self.phrase_table_dict:
-                            tokens[i] = self.phrase_table_dict[src_tok]
+        # TODO: either support this properly or remove?
+        if not self.id_tokenization:
+            if self.replace_unk and attn is not None and src is not None:
+                for i in range(len(tokens)):
+                    if tokens[i] == DefaultTokens.UNK:
+                        _, max_index = attn[i][:srclen].max(0)
+                        src_tok = self.vocabs["src"].ids_to_tokens[
+                            src[max_index.item()]
+                        ]
+                        tokens[i] = src_tok
+                        if self.phrase_table_dict:
+                            if src_tok in self.phrase_table_dict:
+                                tokens[i] = self.phrase_table_dict[src_tok]
         return tokens
 
     def from_batch(self, prediction_batch):
@@ -203,7 +217,9 @@ def log(self, sent_number, src_raw=""):
         best_pred = self.pred_sents[0]
         best_score = self.pred_scores[0]
         best_estim = self.estim[0]
-        pred_sent = " ".join(best_pred)
+        pred_sent = " ".join(
+            [str(x) for x in best_pred]
+        )  # this will display IDs for id_tokenize case
         msg.append("PRED {}: {}\n".format(sent_number, pred_sent))
         msg.append("PRED SCORE: {:.4f}\n".format(best_score))
         msg.append("ESTIM SCORE: {:.4f}\n".format(best_estim))
diff --git a/eole/transforms/__init__.py b/eole/transforms/__init__.py
index 5a48f1f3..88806d5a 100644
--- a/eole/transforms/__init__.py
+++ b/eole/transforms/__init__.py
@@ -35,7 +35,7 @@ def get_transforms_cls(transform_names):
 def register_transform(name):
     """Transform register that can be used to add new transform class."""
 
-    def register_transfrom_cls(cls):
+    def register_transform_cls(cls):
         if name in AVAILABLE_TRANSFORMS:
             raise ValueError("Cannot register duplicate transform ({})".format(name))
         if not issubclass(cls, Transform):
@@ -47,7 +47,7 @@ def register_transfrom_cls(cls):
         cls.name = name
         return cls
 
-    return register_transfrom_cls
+    return register_transform_cls
 
 
 # Auto import python files in this directory
diff --git a/eole/transforms/tokenize_id.py b/eole/transforms/tokenize_id.py
new file mode 100644
index 00000000..da3b323d
--- /dev/null
+++ b/eole/transforms/tokenize_id.py
@@ -0,0 +1,154 @@
+"""Tokenization transforms that also numericalize, e.g. HuggingFace or Mistral tokenizers."""
+import os
+from eole.utils.logging import logger
+from eole.transforms import register_transform
+from .transform import Transform, TransformConfig  # , ObservableStats
+from pydantic import Field
+from eole.constants import DefaultTokens
+
+
+class BaseIdTokenizerConfig(TransformConfig):
+    # model_name: str | None = Field(default=None)  # if tokenizer deduced from model
+    path: str | None = Field(default=None)  # if local tokenizer
+
+
+class HuggingfaceTokenizerConfig(BaseIdTokenizerConfig):
+    huggingface_model: str | None = Field(default=None)  # if retrieved from huggingface
+    max_length: int | None = Field(default=None)
+
+
+class IntTokenizerTransform(Transform):
+    """IntTokenizer transform abstract class."""
+
+    config_model = BaseIdTokenizerConfig
+    output_type = "ids"
+
+    def __init__(self, config):
+        super().__init__(config)
+
+    def _parse_config(self):
+        # self.model_name = self.config.model_name
+        self.path = self.config.path
+
+    def warm_up(self, vocabs=None):
+        # TODO: factorize here
+        raise NotImplementedError
+
+
+# TODO: this requires further investigation, and better support
+# of chat templates across all paths (training+inference)
+
+# @register_transform(name="mistral_tokenize")
+# class MistralTokenizer(IntTokenizerTransform):
+#     def __init__(self, config):
+#         super().__init__(config)
+
+#     def warm_up(self, vocabs=None):
+#         from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
+
+#         if self.model_name is not None:
+#             self.tokenizer = MistralTokenizer.from_model(self.model_name)
+#         elif self.path is not None:
+#             if os.path.exists(self.path):
+#                 self.tokenizer = MistralTokenizer.from_file(self.path)
+#             else:
+#                 raise FileNotFoundError(self.path)
+#         else:
+#             raise RuntimeError(
+#                 f"Either model_name or path must be configured for {self.name} transform"
+#             )
+
+#     def apply(self, example, is_train=False, stats=None, **kwargs):
+#         return example
+
+
+@register_transform(name="huggingface_tokenize")
+class HuggingfaceTokenizer(IntTokenizerTransform):
+    config_model = HuggingfaceTokenizerConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+
+    def _parse_config(self):
+        super()._parse_config()
+        self.huggingface_model = self.config.huggingface_model
+        self.max_length = self.config.max_length
+
+    def warm_up(self, vocabs=None):
+        if self.huggingface_model is not None:
+            from transformers import AutoTokenizer
+            from tokenizers.processors import TemplateProcessing
+
+            self.tokenizers = {}
+
+            self.tokenizers["src"] = AutoTokenizer.from_pretrained(
+                self.huggingface_model
+            )
+            # https://github.com/huggingface/transformers/issues/22794#issuecomment-2092623992
+            # TODO: this needs to be tested and adapted for various models
+            tgt_tokenizer = AutoTokenizer.from_pretrained(self.huggingface_model)
+            # bos = tgt_tokenizer.bos_token
+            eos = tgt_tokenizer.eos_token
+            tgt_tokenizer._tokenizer.post_processor = TemplateProcessing(
+                single=f"$A:0 {eos}:0",
+                pair=f"$A:0 {eos}:0 $B:1 {eos}:1",
+                special_tokens=[
+                    # (f"{bos}", tgt_tokenizer.bos_token_id),
+                    (f"{eos}", tgt_tokenizer.eos_token_id)
+                ],
+            )
+            self.tokenizers["tgt"] = tgt_tokenizer
+
+            logger.info(
+                f"Initialized tokenizer from HF model: {self.huggingface_model}"
+            )
+
+        elif self.path is not None:
+            if os.path.exists(self.path):
+                from tokenizers import Tokenizer
+
+                self.tokenizers = {}
+                self.tokenizers["src"] = Tokenizer.from_file(self.path)
+                # TODO: this is not efficient, we shall have a single tokenizer
+                tgt_tokenizer = Tokenizer.from_file(self.path)
+                # bos = tgt_tokenizer.bos_token
+                eos = tgt_tokenizer.eos_token
+                tgt_tokenizer._tokenizer.post_processor = TemplateProcessing(
+                    single=f"$A:0 {eos}:0",
+                    pair=f"$A:0 {eos}:0 $B:1 {eos}:1",
+                    special_tokens=[
+                        # (f"{bos}", tgt_tokenizer.bos_token_id),
+                        (f"{eos}", tgt_tokenizer.eos_token_id)
+                    ],
+                )
+                self.tokenizers["tgt"] = tgt_tokenizer
+
+                logger.info(f"Initialized tokenizer from local file: {self.path}")
+            else:
+                raise FileNotFoundError(self.path)
+        else:
+            raise RuntimeError(
+                f"Either huggingface_model or path must be configured for {self.name} transform"
+            )
+
+    def tokenize_string(self, string, side="src", is_train=False):
+        if self.max_length is not None and is_train:
+            kwargs = {"max_length": self.max_length, "truncation": True}
+        else:
+            kwargs = {}
+        tokens = self.tokenizers[side].encode(string, **kwargs)
+        return tokens
+
+    def apply(self, example, is_train=False, stats=None, **kwargs):
+        src_tokens = self.tokenize_string(" ".join(example["src"]), side="src")
+        example["src_ids"] = src_tokens
+        if example.get("tgt", None) is not None:
+            tgt_tokens = self.tokenize_string(" ".join(example["tgt"]), side="tgt")
+            example["tgt_ids"] = tgt_tokens
+        return example
+
+    def apply_reverse(self, predicted):
+        detokenized = (
+            self.tokenizers["tgt"].decode(predicted).replace("\n", DefaultTokens.SEP)
+        )
+        return detokenized
diff --git a/eole/transforms/transform.py b/eole/transforms/transform.py
index 90837568..155ecedd 100644
--- a/eole/transforms/transform.py
+++ b/eole/transforms/transform.py
@@ -21,6 +21,7 @@ class Transform(object):
 
     name = None  # set in register_transform wrapper
     type = TransformType.Default
+    output_type = "text"
 
     def __init__(self, config):
         """Initialize Transform by parsing `opts` and add them as attribute."""
@@ -277,7 +278,8 @@ def batch_apply(self, batch, is_train=False, **kwargs):
         return batch
 
     def apply_reverse(self, predicted):
-        for transform in self.transforms:
+        # apply_reverse in reversed order
+        for transform in reversed(self.transforms):
             predicted = transform.apply_reverse(predicted)
         return predicted