various fixes (OpenNMT#2563)
vince62s authored Feb 21, 2024
1 parent a02330a commit 5deb20e
Showing 5 changed files with 138 additions and 26 deletions.
67 changes: 56 additions & 11 deletions eval_llm/WIKITEXT2/run_wikitext-2_benchmark.py
@@ -1,24 +1,70 @@
 import copy
 import numpy as np
 import time
+import re
 from onmt.inference_engine import InferenceEnginePY
 import onmt.opts as opts
 from onmt.utils.logging import init_logger
 from onmt.utils.parse import ArgumentParser
 from onmt.utils.misc import use_gpu, set_random_seed
 
 
+def wikitext_detokenizer(line):
+    string = line
+    # contractions
+    string = string.replace("s '", "s'")
+    string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string)
+    # number separators
+    string = string.replace(" @-@ ", "-")
+    string = string.replace(" @,@ ", ",")
+    string = string.replace(" @.@ ", ".")
+    # punctuation
+    string = string.replace(" : ", ": ")
+    string = string.replace(" ; ", "; ")
+    string = string.replace(" . ", ". ")
+    string = string.replace(" ! ", "! ")
+    string = string.replace(" ? ", "? ")
+    string = string.replace(" , ", ", ")
+    # double brackets
+    string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string)
+    string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string)
+    string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string)
+    string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string)
+    string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string)
+    # miscellaneous
+    string = string.replace("= = = =", "====")
+    string = string.replace("= = =", "===")
+    string = string.replace("= =", "==")
+    string = string.replace(" " + chr(176) + " ", chr(176))
+    string = string.replace(" \n", "\n")
+    string = string.replace("\n ", "\n")
+    string = string.replace(" N ", " 1 ")
+    string = string.replace(" 's", "'s")
+    return string
 
 
 def tokenize_dataset(opt, context_length):
     print("Tokenization...")
     # Clean and Concat the dataset
-    x = open(opt.src, "r").readlines()
-    xx = [_x for _x in x if _x != " \n"]
-    from onmt.transforms.tokenize import SentencePieceTransform
+    xx = open(opt.src, "r").readlines()
+    if "sentencepiece" in opt.transforms:
+        from onmt.transforms.tokenize import SentencePieceTransform
 
-    tokenizer = SentencePieceTransform(opt)
+        tokenizer = SentencePieceTransform(opt)
+    elif "onmt_tokenize" in opt.transforms:
+        from onmt.transforms.tokenize import ONMTTokenizerTransform
+
+        tokenizer = ONMTTokenizerTransform(opt)
+    else:
+        raise ValueError("No valid tokenizer found")
     tokenizer.warm_up()
-    tokens = tokenizer._tokenize(xx)
-    print("Done !")
+    print("warmup done")
+    # joiner = tokenizer._tokenize("\n")
+    tokens = []
+    for x in xx:
+        tokens += tokenizer._tokenize([wikitext_detokenizer(x)])
+        # tokens += tokenizer._tokenize([x])
+    print("Tokenization Done !")
    return tokens
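
A quick sanity check of the new detokenizer, which undoes WikiText-2's spacing and "@" number-separator artifacts (minimal sketch; the sample line is invented):

    line = "The game 's graphics cost 32 @.@ 5 million ( est . )"
    print(wikitext_detokenizer(line))
    # -> The game's graphics cost 32.5 million (est.)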


@@ -38,7 +84,7 @@ def evaluate(opt):
     set_random_seed(opt.seed, use_gpu(opt))
 
     # Tokenize the dataset.
-    opt.src = "wikitext-2-raw-v1/wikitext-2-raw/wiki.test.raw"
+    opt.src = "eval_llm/WIKITEXT2/wikitext-2-raw-v1/wikitext-2-raw/wiki.test.raw"
     tokens = tokenize_dataset(opt, context_length=512)
 
     # Build the translator (along with the model.
@@ -47,8 +93,8 @@ def evaluate(opt):
     engine = InferenceEnginePY(engine_opt)
 
     # Score the dataset.
-    stride = 512
-    max_seq_length = 4096
+    stride = 256
+    max_seq_length = 512
 
     seq_len = len(tokens)
     src = []
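
The scoring loop itself is collapsed in this view; under the new settings it slides a 512-token window forward 256 tokens at a time, so consecutive windows overlap by 256 tokens. A minimal sketch of that windowing arithmetic (variable names hypothetical):

    windows = []
    for begin in range(0, seq_len, stride):
        # each window holds at most max_seq_length tokens
        windows.append(tokens[begin : begin + max_seq_length])
        if begin + max_seq_length >= seq_len:
            break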
@@ -75,8 +121,7 @@ def evaluate(opt):
     end_time = time.time()
     logger.info("total run time %.2f" % (end_time - start_time))
     logger.info(
-        "wikitext-2 perplexity with rolling likelihood and sliding window size 1000 and stride 512 %.2f"  # noqa: E501
-        % (ppl)
+        "wikitext-2 perplexity with rolling likelihood: %.2f" % (ppl)  # noqa: E501
     )
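
For reference, the perplexity reported here is the exponential of the average negative token log-likelihood accumulated over the scored windows; as a one-line sketch (array names hypothetical):

    import numpy as np

    ppl = np.exp(-np.sum(token_log_likelihoods) / num_scored_tokens)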


80 changes: 72 additions & 8 deletions onmt/inputters/text_corpus.py
@@ -38,6 +38,63 @@ def exfile_open(filename, *args, **kwargs):
         _file.close()
 
 
+class BlockwiseCorpus(object):
+    """A corpus class for reading a single file block by block."""
+
+    def __init__(self, name, file_path, block_size=4096):
+        """Initialize file path and block size."""
+        self.id = name
+        self.file_path = file_path
+        self.block_size = block_size
+
+    def load(self, offset=0, stride=1):
+        """
+        Load file and iterate by blocks.
+        `offset` and `stride` allow iterating only on every
+        `stride` block, starting from `offset`.
+        """
+
+        def make_ex(block_content):
+            example = {
+                "src": block_content,
+                "tgt": block_content,
+                "src_original": block_content,
+                "tgt_original": block_content,
+            }
+            return example
+
+        with open(self.file_path, mode="r", encoding="utf-8") as file:
+            block_content = ""
+            block_index = 0
+
+            while True:
+                chunk = file.read(self.block_size)
+                if not chunk:
+                    break
+
+                if (block_index // stride) % stride == offset:
+                    block_content += chunk
+
+                if len(chunk) < self.block_size:
+                    # Reached end of file
+                    yield make_ex(block_content)
+                    break
+
+                if len(block_content) >= self.block_size:
+                    yield make_ex(block_content)
+                    block_content = ""
+                block_index += 1
+
+    def __str__(self):
+        cls_name = type(self).__name__
+        return (
+            f"{cls_name}({self.id}, {self.file_path}, {self.file_path}"
+            f"align={None}, "
+            f"n_src_feats={0}, "
+            f'src_feats_defaults="{None}")'
+        )
+
+
 class ParallelCorpus(object):
     """A parallel corpus file pair that can be loaded to iterate."""
 
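A minimal usage sketch for the new class (path invented): load() is a generator yielding dict examples whose src and tgt fields carry the same raw text block, which matches the language-modeling setting:

    corpus = BlockwiseCorpus("wiki", "data/wiki.train.raw", block_size=8192)
    for ex in corpus.load():
        assert ex["src"] == ex["tgt"]  # LM source and target are the same stream
        print(len(ex["src"]))  # roughly block_size characters per example
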
@@ -117,14 +174,21 @@ def get_corpora(opts, task=CorpusTask.TRAIN, src=None, tgt=None, align=None):
     if task == CorpusTask.TRAIN:
         for corpus_id, corpus_dict in opts.data.items():
             if corpus_id != CorpusName.VALID:
-                corpora_dict[corpus_id] = ParallelCorpus(
-                    corpus_id,
-                    corpus_dict["path_src"],
-                    corpus_dict["path_tgt"],
-                    corpus_dict["path_align"],
-                    n_src_feats=opts.n_src_feats,
-                    src_feats_defaults=opts.src_feats_defaults,
-                )
+                if corpus_dict.get("path_txt", None) is None:
+                    corpora_dict[corpus_id] = ParallelCorpus(
+                        corpus_id,
+                        corpus_dict["path_src"],
+                        corpus_dict["path_tgt"],
+                        corpus_dict["path_align"],
+                        n_src_feats=opts.n_src_feats,
+                        src_feats_defaults=opts.src_feats_defaults,
+                    )
+                else:
+                    corpora_dict[corpus_id] = BlockwiseCorpus(
+                        corpus_id,
+                        corpus_dict["path_txt"],
+                        block_size=8192,  # number of characters
+                    )
     elif task == CorpusTask.VALID:
         if CorpusName.VALID in opts.data.keys():
             corpora_dict[CorpusName.VALID] = ParallelCorpus(
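
With this change a training corpus entry that sets path_txt (instead of path_src/path_tgt) is served by BlockwiseCorpus and read block by block. A hypothetical YAML data entry (corpus name and path invented):

    data:
        wiki_raw:
            path_txt: data/wiki.train.raw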
1 change: 0 additions & 1 deletion onmt/inputters/text_utils.py
@@ -149,7 +149,6 @@ def numericalize(vocabs, example):
         for fv, feat in zip(vocabs["src_feats"], example["src"]["feats"]):
             numeric_feats.append(fv(feat.split(" ")))
         numeric["src"]["feats"] = numeric_feats
-
     return numeric


3 changes: 1 addition & 2 deletions onmt/opts.py
@@ -872,8 +872,7 @@ def model_opts(parser):
     group.add(
         "--rotary_interleave",
         "-rotary_interleave",
-        type=bool,
-        default=True,
+        action="store_true",
         help="Interleave the head dimensions when rotary"
         " embeddings are applied."
         " Otherwise the head dimensions are sliced in half."
Expand Down
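The switch from type=bool to action="store_true" fixes a classic argparse pitfall: type=bool applies bool() to the raw command-line string, so any non-empty value, including "False", parses as True. Note that store_true also makes the option default to False when the flag is absent. A standalone demonstration of the old behavior:

    import argparse

    p = argparse.ArgumentParser()
    p.add_argument("--flag", type=bool, default=True)
    # prints True, because bool("False") is truthy
    print(p.parse_args(["--flag", "False"]).flag)
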
13 changes: 9 additions & 4 deletions onmt/utils/parse.py
@@ -39,9 +39,10 @@ def _validate_data(cls, opt):
             # Check path
             path_src = corpus.get("path_src", None)
             path_tgt = corpus.get("path_tgt", None)
-            if path_src is None:
+            path_txt = corpus.get("path_txt", None)
+            if path_src is None and path_txt is None:
                 raise ValueError(
-                    f"Corpus {cname} src path is required."
+                    f"Corpus {cname} src/txt path is required."
                     "tgt path is also required for non language"
                     " modeling tasks."
                 )
@@ -57,8 +58,12 @@ def _validate_data(cls, opt):
                     corpus["path_tgt"] = path_src
                     corpora[cname] = corpus
                     path_tgt = path_src
-            cls._validate_file(path_src, info=f"{cname}/path_src")
-            cls._validate_file(path_tgt, info=f"{cname}/path_tgt")
+            if path_src is not None:
+                cls._validate_file(path_src, info=f"{cname}/path_src")
+            if path_txt is not None:
+                cls._validate_file(path_txt, info=f"{cname}/path_txt")
+            if path_tgt is not None:
+                cls._validate_file(path_tgt, info=f"{cname}/path_tgt")
             path_align = corpus.get("path_align", None)
             if path_align is None:
                 if hasattr(opt, "lambda_align") and opt.lambda_align > 0.0:
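
After this hunk, validation accepts either a src/tgt pair or a single txt file and only checks paths that are actually declared; a sketch of the accepted corpus shapes (paths invented):

    corpus_parallel = {"path_src": "train.src", "path_tgt": "train.tgt"}  # both files checked
    corpus_blockwise = {"path_txt": "train.raw"}  # only path_txt checked
    corpus_empty = {}  # raises ValueError: "Corpus ... src/txt path is required. ..."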