Address the feedback on the tokenizer's library (dotnet#7024)
* Fix cache when calling EncodeToIds

* Make EnglishRoberta _mergeRanks thread safe

* Delete Trainer

* Remove the setters on the Bpe properties

* Remove Roberta and Tiktoken special casing in the Tokenizer and support the cases in the Model abstraction

* Support text-embedding-3-small/large embedding

* Remove redundant TokenToId abstraction and keep the one with the extra parameters

* Enable creating Tiktoken asynchronously or directly using the tokenizer data

* Add cancellationToken support in CreateAsync APIs

* Rename sequence to text and Tokenize to Encode

* Rename skipSpecialTokens to considerSpecialTokens

* Rename TokenizerResult to EncodingResult

* Make Token publicly immutable

* Change offset tuples from (Index, End) to (Index, Length)

* Rename NormalizedString method's parameters

* Rename Model's methods to start with verb

* Convert Model.GetVocab() method to a Vocab property

* Rename some methods' parameters and variables

* Remove Vocab and VocabSize from the abstraction

* Cleanup normalization support

* Minor Bpe cleanup

* Resolve rebase change

* Address the feedback
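One bullet above changes offset tuples from (Index, End) to (Index, Length). The conversion between the two conventions can be sketched in a few lines; this is an illustrative standalone snippet with hypothetical names (`ToLengthBased` is not part of Microsoft.ML.Tokenizers):

```csharp
using System;

// Hypothetical helper illustrating the (Index, End) -> (Index, Length) change;
// not an API of the tokenizers library.
static (int Index, int Length) ToLengthBased((int Index, int End) offset)
    => (offset.Index, offset.End - offset.Index);

string text = "hello world";
var lengthBased = ToLengthBased((Index: 6, End: 11)); // the token "world"

// A length-based offset slices the input directly with Substring(index, length).
Console.WriteLine(text.Substring(lengthBased.Index, lengthBased.Length)); // prints "world"
```

Length-based offsets pair naturally with `string.Substring(int, int)` and `ReadOnlySpan<char>.Slice(int, int)`, which both take a start index and a length rather than an end index.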
tarekgh authored Feb 26, 2024
1 parent 4b89d98 commit d0aa2c2
Showing 31 changed files with 838 additions and 6,033 deletions.
@@ -11,16 +11,16 @@ namespace Microsoft.ML.Tokenizers
     /// <summary>
     /// The Encoding represents the output of a Tokenizer.
     /// </summary>
-    public sealed class TokenizerResult
+    public sealed class EncodingResult
     {
         /// <summary>
-        /// Create a new object of the TokenizerResult object.
+        /// Create a new object of the EncodingResult object.
         /// </summary>
         /// <param name="originalString">The list of tokens to merge.</param>
         /// <param name="normalizedString">The list of tokens to merge.</param>
         /// <param name="splits">The list of tokens to merge.</param>
         /// <param name="offsetsMappedToOriginalString">Indicate whether the offsets is mapped to the original string or the normalized string.</param>
-        public TokenizerResult(string originalString, string normalizedString, IEnumerable<Split> splits, bool offsetsMappedToOriginalString)
+        public EncodingResult(string originalString, string normalizedString, IEnumerable<Split> splits, bool offsetsMappedToOriginalString)
         {
             OriginalString = originalString;
             NormalizedString = normalizedString;
@@ -47,7 +47,7 @@ public TokenizerResult(string originalString, string normalizedString, IEnumerab
         private List<Token>? _tokens;
         private List<string>? _tokensWords;
         private List<int>? _ids;
-        private List<(int Index, int End)>? _offsets;
+        private List<(int Index, int Length)>? _offsets;

         internal void AddTokens(IReadOnlyList<Token> addedTokens)
         {
@@ -121,10 +121,10 @@ public IReadOnlyList<string> Tokens
         }

         /// <summary>
-        /// Gets The list of offsets. These offsets lets you slice the input string, and thus retrieve
+        /// Gets The list of offsets. These offsets let's you slice the input string, and thus retrieve
         /// the original part that led to producing the corresponding token.
         /// </summary>
-        public IReadOnlyList<(int Index, int End)> Offsets
+        public IReadOnlyList<(int Index, int Length)> Offsets
         {
             get
             {
@@ -138,7 +138,7 @@ public IReadOnlyList<string> Tokens
                 return Array.Empty<(int, int)>();
             }

-            _offsets = new List<(int Index, int End)>(_tokens.Count);
+            _offsets = new List<(int Index, int Length)>(_tokens.Count);

                 foreach (var token in _tokens)
                 {
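The `Offsets` getter in the hunks above builds its backing list lazily from `_tokens` on first access and caches it. A minimal standalone sketch of that build-once-and-reuse idea, here expressed with `Lazy<T>` (which is thread safe by default, echoing the thread-safety bullet in the commit message); the names and data are illustrative, not the library's:

```csharp
using System;
using System.Collections.Generic;

// Illustrative token offsets as (Index, Length) pairs; not the library's data.
var tokens = new List<(int Index, int Length)> { (0, 5), (6, 5) };

// Build the read-only offsets list on first access, then reuse the same instance.
// Lazy<T> defaults to LazyThreadSafetyMode.ExecutionAndPublication, so the
// factory runs at most once even under concurrent access.
Lazy<IReadOnlyList<(int Index, int Length)>> offsets =
    new(() => new List<(int Index, int Length)>(tokens));

Console.WriteLine(offsets.Value.Count);                         // prints 2
Console.WriteLine(ReferenceEquals(offsets.Value, offsets.Value)); // prints True
```

The diff itself uses a hand-rolled nullable-field check rather than `Lazy<T>`; both achieve the same caching, with `Lazy<T>` adding thread safety at the cost of an extra allocation.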
280 changes: 137 additions & 143 deletions src/Microsoft.ML.Tokenizers/Model/BPE.cs

Large diffs are not rendered by default.

