
Commit

Merge branch 'refs/heads/main' into Cjian/ado-zip
jchen351 committed May 3, 2024
2 parents 653f189 + b272ba4 commit 72ab9b8
Showing 20 changed files with 174 additions and 67 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -16,6 +16,7 @@ message("Building onnxruntime-genai for version ${VERSION_INFO}")
# Checking if CUDA is supported
include(CheckLanguage)
add_compile_definitions(BUILDING_ORT_GENAI_C)

if(USE_CUDA)
check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
6 changes: 5 additions & 1 deletion cmake/cxx_standard.cmake
@@ -9,4 +9,8 @@ elseif (USE_CUDA AND CMAKE_CUDA_COMPILER AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS
else ()
message("Test is using C++20")
set(CMAKE_CXX_STANDARD 20)
endif ()
endif ()

if ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 9)
add_compile_definitions(USE_EXPERIMENTAL_FILESYSTEM)
endif()
52 changes: 36 additions & 16 deletions examples/python/model-generate.py
@@ -13,15 +13,30 @@ def main(args):
prompts = args.prompts
else:
prompts = ["I like walking my cute dog",
"What is the best restaurant in town?",
"Hello, how are you today?"]
"What is the best restaurant in town?",
"Hello, how are you today?"]

if args.chat_template:
if args.chat_template.count('{') != 1 or args.chat_template.count('}') != 1:
print("Error, chat template must have exactly one pair of curly braces, e.g. '<|user|>\n{input} <|end|>\n<|assistant|>'")
exit(1)
prompts[:] = [f'{args.chat_template.format(input=text)}' for text in prompts]

input_tokens = tokenizer.encode_batch(prompts)
if args.verbose: print("Prompt(s) encoded")
if args.verbose: print(f'Prompt(s) encoded: {prompts}')

params = og.GeneratorParams(model)
params.set_search_options(max_length=args.max_length, top_p=args.top_p, top_k=args.top_k, temperature=args.temperature, repetition_penalty=args.repetition_penalty)
if args.cuda_graph_with_max_batch_size > 0:
params.try_use_cuda_graph_with_max_batch_size(args.cuda_graph_with_max_batch_size)

search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args}

if (args.verbose): print(f'Args: {args}')
if (args.verbose): print(f'Search options: {search_options}')

params.set_search_options(**search_options)
# Set the batch size for the CUDA graph to the number of prompts if the user didn't specify a batch size
params.try_use_cuda_graph_with_max_batch_size(len(prompts))
if args.batch_size_for_cuda_graph:
params.try_use_cuda_graph_with_max_batch_size(args.batch_size_for_cuda_graph)
params.input_ids = input_tokens
if args.verbose: print("GeneratorParams created")

@@ -37,19 +52,24 @@ def main(args):
print()

print()
print(f"Tokens: {len(output_tokens[0])} Time: {run_time:.2f} Tokens per second: {len(output_tokens[0])/run_time:.2f}")
total_tokens = sum(len(x) for x in output_tokens)
print(f"Tokens: {total_tokens} Time: {run_time:.2f} Tokens per second: {total_tokens/run_time:.2f}")
print()

if __name__ == "__main__":
parser = argparse.ArgumentParser(description="End-to-end token generation loop example for gen-ai")
parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, description="End-to-end token generation loop example for gen-ai")
parser.add_argument('-m', '--model', type=str, required=True, help='Onnx model folder path (must contain config.json and model.onnx)')
parser.add_argument('-pr', '--prompts', nargs='*', required=False, help='Input prompts to generate tokens from')
parser.add_argument('-l', '--max_length', type=int, default=512, help='Max number of tokens to generate after prompt')
parser.add_argument('-p', '--top_p', type=float, default=0.9, help='Top p probability to sample with')
parser.add_argument('-k', '--top_k', type=int, default=50, help='Top k tokens to sample from')
parser.add_argument('-t', '--temperature', type=float, default=1.0, help='Temperature to sample with')
parser.add_argument('-r', '--repetition_penalty', type=float, default=1.0, help='Repetition penalty to sample with')
parser.add_argument('-v', '--verbose', action='store_true', help='Print verbose output')
parser.add_argument('-c', '--cuda_graph_with_max_batch_size', type=int, default=0, help='Max batch size for CUDA graph')
parser.add_argument('-pr', '--prompts', nargs='*', required=False, help='Input prompts to generate tokens from. Provide this parameter multiple times to batch multiple prompts')
parser.add_argument('-i', '--min_length', type=int, help='Min number of tokens to generate including the prompt')
parser.add_argument('-l', '--max_length', type=int, help='Max number of tokens to generate including the prompt')
parser.add_argument('-ds', '--do_random_sampling', action='store_true', help='Do random sampling. When false, greedy or beam search are used to generate the output. Defaults to false')
parser.add_argument('-p', '--top_p', type=float, help='Top p probability to sample with')
parser.add_argument('-k', '--top_k', type=int, help='Top k tokens to sample from')
parser.add_argument('-t', '--temperature', type=float, help='Temperature to sample with')
parser.add_argument('-r', '--repetition_penalty', type=float, help='Repetition penalty to sample with')
parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Print verbose output and timing information. Defaults to false')
parser.add_argument('-b', '--batch_size_for_cuda_graph', type=int, default=1, help='Max batch size for CUDA graph')
parser.add_argument('-c', '--chat_template', type=str, default='', help='Chat template to use for the prompt. User input will be injected into {input}. If not set, the prompt is used as is.')

args = parser.parse_args()
main(args)
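
The switch to `argparse.ArgumentParser(argument_default=argparse.SUPPRESS)` above is what makes the `if name in args` filter meaningful: flags the user never passes are simply absent from the namespace, so `search_options` carries only explicitly chosen values and everything else falls back to the generation settings in the model folder's config. A minimal, self-contained sketch of the same pattern, with example values that are ours rather than the script's:

```python
import argparse

# With SUPPRESS as the default, unspecified flags never appear in the namespace.
parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS)
parser.add_argument('-l', '--max_length', type=int)
parser.add_argument('-p', '--top_p', type=float)
parser.add_argument('-k', '--top_k', type=int)

args = parser.parse_args(['-l', '256'])

# Only flags the user actually passed survive the filter.
search_options = {name: getattr(args, name)
                  for name in ['max_length', 'top_p', 'top_k']
                  if name in args}
print(search_options)  # {'max_length': 256}
```
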
11 changes: 8 additions & 3 deletions examples/python/model-qa.py
@@ -14,10 +14,15 @@ def main(args):
tokenizer_stream = tokenizer.create_stream()
if args.verbose: print("Tokenizer created")
if args.verbose: print()

search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args}
if args.chat_template.count('{') != 1 or args.chat_template.count('}') != 1:
print("Error, chat template must have exactly one pair of curly braces, e.g. '<|user|>\n{input} <|end|>\n<|assistant|>'")
exit(1)

if args.verbose: print(search_options)

if args.chat_template:
if args.chat_template.count('{') != 1 or args.chat_template.count('}') != 1:
print("Error, chat template must have exactly one pair of curly braces, e.g. '<|user|>\n{input} <|end|>\n<|assistant|>'")
exit(1)

# Keep asking for input prompts in a loop
while True:
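
The chat-template guard added to both example scripts is a plain sanity check: the template is applied with `str.format(input=prompt)`, so it must contain exactly one `{input}` placeholder and no stray braces. A short illustration of what the check prevents; the well-formed template is the Phi-3 style one quoted in the error message, while the broken variant is invented for the example:

```python
template = '<|user|>\n{input} <|end|>\n<|assistant|>'
print(template.format(input='Hello, how are you today?'))
# <|user|>
# Hello, how are you today? <|end|>
# <|assistant|>

# A stray unnamed placeholder only surfaces as an error at run time,
# which is why the scripts count braces up front and exit early.
bad_template = '<|user|>\n{input} {} <|end|>\n<|assistant|>'
try:
    bad_template.format(input='Hello')
except IndexError as err:
    print('rejected:', err)
```
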
11 changes: 8 additions & 3 deletions examples/python/phi-3-tutorial.md
@@ -9,6 +9,12 @@

Download either or both of the [short](https://aka.ms/phi3-mini-4k-instruct-onnx) and [long](https://aka.ms/phi3-mini-128k-instruct-onnx) context Phi-3 mini models from Hugging Face.

To download the Phi-3 mini models, you will need to have git-lfs installed.
* MacOS: `brew install git-lfs`
* Linux: `apt-get install git-lfs`
* Windows: `winget install -e --id GitHub.GitLFS` (If you don't have winget, download and run the `exe` from the [official source](https://docs.github.com/en/repositories/working-with-files/managing-large-files/installing-git-large-file-storage?platform=windows))

Then run `git lfs install`

For the short context model.

@@ -68,15 +74,14 @@ Run the model with [phi3-qa.py](https://github.com/microsoft/onnxruntime-genai/b

The script accepts a model folder and takes the generation parameters from the config in that model folder. You can also override the parameters on the command line.

This example is using the long context model running with DirectML on Windows.
<!--This example is using the long context model running with DirectML on Windows.-->

The `-m` argument is the path to the model you downloaded from HuggingFace above.
The `-l` argument is the length of output you would like to generate with the model.

```bash
curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/phi3-qa.py -o phi3-qa.py
model_path="./Phi-3-mini-128k-instruct-onnx/directml/directml-int4-awq-block-128"
python phi3-qa.py -m $model_path -l 2048
python phi3-qa.py -m *replace your relative model_path here* -l 2048
```

Once the script has loaded the model, it will ask you for input in a loop, streaming the output as it is produced by the model. For example:
6 changes: 6 additions & 0 deletions examples/python/phi3-qa.py
@@ -15,6 +15,12 @@ def main(args):
if args.verbose: print("Tokenizer created")
if args.verbose: print()
search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args}

# Set the max length to something sensible by default, unless it is specified by the user,
# since otherwise it will be set to the entire context length
if 'max_length' not in search_options:
search_options['max_length'] = 2048

chat_template = '<|user|>\n{input} <|end|>\n<|assistant|>'

# Keep asking for input prompts in a loop
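
The `max_length` default above matters because, as the comment says, an unset value would otherwise fall back to the model's entire context length (128K tokens for the long-context Phi-3 mini), which an interactive demo rarely wants. As a sketch only, `dict.setdefault` is an equivalent spelling of the same guard:

```python
search_options = {'top_p': 0.9}                # user passed -p but not -l/--max_length
search_options.setdefault('max_length', 2048)  # same effect as the membership check
print(search_options)  # {'top_p': 0.9, 'max_length': 2048}
```
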
4 changes: 2 additions & 2 deletions src/config.cpp
@@ -397,7 +397,7 @@ struct RootObject_Element : JSON::Element {
JSON::Element& t_;
};

void ParseConfig(const std::filesystem::path& filename, Config& config) {
void ParseConfig(const fs::path& filename, Config& config) {
std::ifstream file(filename, std::ios::binary | std::ios::ate);
if (!file.is_open()) {
throw std::runtime_error("Error opening " + filename.string());
@@ -421,7 +421,7 @@ void ParseConfig(const std::filesystem::path& filename, Config& config) {
}
}

Config::Config(const std::filesystem::path& path) : config_path{path} {
Config::Config(const fs::path& path) : config_path{path} {
ParseConfig(path / "genai_config.json", *this);

if (model.context_length == 0)
4 changes: 2 additions & 2 deletions src/config.h
@@ -6,9 +6,9 @@ namespace Generators {

struct Config {
Config() = default;
Config(const std::filesystem::path& path);
Config(const fs::path& path);

std::filesystem::path config_path; // Path of the config directory
fs::path config_path; // Path of the config directory

using ProviderOption = std::pair<std::string, std::string>;
struct ProviderOptions {
11 changes: 11 additions & 0 deletions src/filesystem.h
@@ -0,0 +1,11 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

// TODO(baijumeswani): Remove experimental when packaging pipeline can use GCC > 8
#ifdef USE_EXPERIMENTAL_FILESYSTEM
#include <experimental/filesystem>
namespace fs = std::experimental::filesystem;
#else
#include <filesystem>
namespace fs = std::filesystem;
#endif
3 changes: 2 additions & 1 deletion src/generators.h
@@ -5,8 +5,9 @@
#include <assert.h>
#include <cmath>
#include <cstring>
#include <filesystem>
#include "filesystem.h"
#include <functional>
#include <iostream>
#include "span.h"
#include <memory>
#include <numeric>
2 changes: 1 addition & 1 deletion src/logging.cpp
@@ -44,7 +44,7 @@ void SetLogString(std::string_view name, std::string_view value) {
if (value.empty())
gp_logfile.reset();
else {
std::filesystem::path filename{value};
fs::path filename{std::string(value)};
gp_logfile = std::make_unique<std::ofstream>(filename);
}

2 changes: 1 addition & 1 deletion src/models/model.cpp
@@ -297,7 +297,7 @@ void Model::CreateSessionOptions() {
}

if (options.enable_profiling.has_value()) {
std::filesystem::path profile_file_prefix{options.enable_profiling.value()};
fs::path profile_file_prefix{options.enable_profiling.value()};
ort_options.EnableProfiling(profile_file_prefix.c_str());
}

4 changes: 4 additions & 0 deletions src/python/CMakeLists.txt
@@ -64,8 +64,12 @@ if(BUILD_WHEEL)
"libcufft.so.11"
"libcurand.so.10"
"libnvinfer.so.8"
"libnvinfer.so.10"
"libnvinfer_plugin.so.8"
"libnvinfer_plugin.so.10"
"libnvonnxparser.so.8"
"libnvonnxparser.so.10"

)
set(modified_exclude_list)
foreach(item IN LISTS auditwheel_exclude_list)
4 changes: 2 additions & 2 deletions src/python/py/models/builder.py
@@ -171,7 +171,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
if (self.ep in {"cuda", "dml"} and self.io_dtype == TensorProto.FLOAT16) or (enable_GQA_on_CPU and self.ep == "cpu" and self.io_dtype == TensorProto.FLOAT):
# Change model settings for GroupQueryAttention
self.attention_attrs["op_type"] = "GroupQueryAttention"
print("GroupQueryAttention (GQA) is used in this model. GQA is currently supported only for INT4 and FP16 on the CUDA and DML execution providers.")
print("GroupQueryAttention (GQA) is used in this model.")

# DML doesn't support packed Q/K/V for GQA yet
self.attention_attrs["use_packed_matmul"] = self.ep != "dml" and self.num_attn_heads == self.num_kv_heads
@@ -228,7 +228,7 @@ def make_genai_config(self, model_name_or_path, extra_kwargs, out_dir):
"num_key_value_heads": self.num_kv_heads,
},
"eos_token_id": config.eos_token_id,
"pad_token_id": config.pad_token_id if hasattr(config, "pad_token_id") and config.pad_token_id is not None else config.eos_token_id,
"pad_token_id": config.pad_token_id if hasattr(config, "pad_token_id") and config.pad_token_id is not None else config.eos_token_id[0] if isinstance(config.eos_token_id, list) else config.eos_token_id,
"type": self.model_type[ : self.model_type.find("For")].lower(),
"vocab_size": self.vocab_size,
},
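
The reworked `pad_token_id` expression packs three cases into one line: an explicit `pad_token_id` on the config, a list-valued `eos_token_id` (first entry wins), or a scalar `eos_token_id`. A readable restatement of the same fallback order, offered as a sketch only; the helper name is ours, not the builder's:

```python
def resolve_pad_token_id(config):
    """Sketch of the fallback chain used when writing genai_config.json."""
    pad = getattr(config, "pad_token_id", None)
    if pad is not None:
        return pad
    eos = config.eos_token_id
    # Some Hugging Face configs publish a list of end-of-sequence ids;
    # in that case the first one is used as the padding id.
    return eos[0] if isinstance(eos, list) else eos
```
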
4 changes: 2 additions & 2 deletions src/tokenizer/c_api/tfmtok_c.cc
@@ -2,7 +2,7 @@
// Licensed under the MIT License.

#include <cstdarg>
#include <filesystem>
#include "../filesystem.h"
#include <algorithm>

#include "tfmtok.h"
@@ -117,7 +117,7 @@ tfmError_t TFM_API_CALL TfmCreateTokenizer(TfmTokenizer** tokenizer,
return kTfmErrorInvalidArgument;
}

if (!std::filesystem::is_directory(tokenizer_path)) {
if (!fs::is_directory(tokenizer_path)) {
last_error_message = std::string("Cannot find the directory of ") + tokenizer_path;
return kTfmErrorInvalidArgument;
}
5 changes: 2 additions & 3 deletions src/tokenizer/config.cc
@@ -4,7 +4,7 @@
#include <string>
#include <fstream>
#include <streambuf>
#include <filesystem>
#include "../filesystem.h"

#include "config.h"

@@ -68,8 +68,7 @@ TfmStatus TokenConfig::LoadJson(const std::string& json_path) {
simdjson::dom::parser parser;
simdjson::dom::element root;

if (!std::filesystem::exists(
std::filesystem::path(json_path).lexically_normal())) {
if (!fs::exists(fs::path(json_path))) {
return {kTfmErrorInvalidFile, std::string(json_path) + " not found"};
}
std::string json_text = PatchJsonText(json_path);
33 changes: 25 additions & 8 deletions src/tokenizer/token_bpe.cc
@@ -237,15 +237,17 @@ std::vector<tfmTokenId_t> BPETokenizer::Encode(std::string_view sv_input, int64_
text = text.strip()
*/
std::u32string str = RemoveConsecutiveSpaces(input);
if (IsUnicodeSpace(str.front())) {
str.erase(str.begin());
}
if (IsUnicodeSpace(str.back())) {
str.pop_back();
if (!str.empty()) {
if (IsUnicodeSpace(str.front())) {
str.erase(str.begin());
}
if (IsUnicodeSpace(str.back())) {
str.pop_back();
}
// remove newlines as CLIP ignores them (treats them as whitespace which is then cleaned)
str.erase(std::remove(str.begin(), str.end(), U'\n'), str.end());
str.erase(std::remove(str.begin(), str.end(), U'\r'), str.end());
}
// remove newlines as CLIP ignores them (treats them as whitespace which is then cleaned)
str.erase(std::remove(str.begin(), str.end(), U'\n'), str.end());
str.erase(std::remove(str.begin(), str.end(), U'\r'), str.end());
input = str;
}

@@ -592,6 +594,21 @@ TfmStatus BPETokenizer::Id2Token(tfmTokenId_t id, std::string& token, DecoderSta
token.push_back(' ');
}
} // end case of whitespace_token_

bpe_state->incomplete_utf8_ += token;
token.clear();
std::string& s_utf8 = bpe_state->incomplete_utf8_;
size_t utf8_len = 1;
size_t utf8_all_len = 0;
for (size_t i = 0; i < s_utf8.size(); i += utf8_len) {
utf8_len = UTF8Len(s_utf8[i]);
if (utf8_len <= s_utf8.size() - i) {
utf8_all_len += utf8_len;
auto _t = s_utf8.substr(i, utf8_len);
token += ValidateUTF8(_t) ? _t : "";
}
}
s_utf8 = s_utf8.substr(utf8_all_len);
}

return status;
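
The new `incomplete_utf8_` buffer in `Id2Token` handles streaming decode splitting a multi-byte UTF-8 character across two tokens: partial bytes are held back and only complete, valid sequences are emitted. A Python sketch of the same buffering invariant, assuming input arrives as raw UTF-8 byte chunks; it is not a translation of the C++ above:

```python
class Utf8StreamBuffer:
    """Hold back trailing bytes until they form a complete UTF-8 character."""

    def __init__(self):
        self.pending = b""

    def push(self, chunk: bytes) -> str:
        self.pending += chunk
        complete = b""
        i = 0
        while i < len(self.pending):
            lead = self.pending[i]
            # Sequence length from the UTF-8 lead byte.
            if lead < 0x80:
                length = 1
            elif lead >> 5 == 0b110:
                length = 2
            elif lead >> 4 == 0b1110:
                length = 3
            elif lead >> 3 == 0b11110:
                length = 4
            else:
                length = 1  # invalid lead byte, let decode() drop it below
            if i + length > len(self.pending):
                break  # incomplete tail: keep it for the next token
            complete += self.pending[i:i + length]
            i += length
        self.pending = self.pending[i:]
        # Invalid sequences are silently dropped, mirroring the ValidateUTF8 check.
        return complete.decode("utf-8", errors="ignore")


buf = Utf8StreamBuffer()
euro = "€".encode("utf-8")       # three bytes: e2 82 ac
print(repr(buf.push(euro[:2])))  # '' : first two bytes are held back
print(repr(buf.push(euro[2:])))  # '€' : emitted once the character is complete
```
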
1 change: 1 addition & 0 deletions src/tokenizer/token_bpe.h
@@ -28,6 +28,7 @@ class BPETokenizer : public TokenizerImpl {
BPEDeocerState() = default;
~BPEDeocerState() override = default;
bool f_special_last;
std::string incomplete_utf8_;
};

public:
8 changes: 4 additions & 4 deletions src/tokenizer/tokenizer.cc
@@ -1,7 +1,7 @@
#include "token_bpe.h"
#include "token_rwkv.h"

#include <filesystem>
#include "../filesystem.h"
#include <memory>

namespace tfm {
@@ -30,10 +30,10 @@ TfmStatus CreateBPETokenizer(const std::string& tokenizer_path,
if (type.empty()) {
if (BPETokenizer::IsSupportedModel(GetModelName(token_cfg->tokenizer_class_))) {
type = "BPE";
} else if (std::filesystem::exists(tokenizer_path + "/tokenizer.model")) {
} /* else if (fs::exists(tokenizer_path + "/tokenizer.model")) {
// if 'tokenizer.model exists in the tokenizer_path, then it is a sentencepiece model
type = "SPM";
} else {
} */ else {
status = TfmStatus(kTfmErrorInvalidArgument, "Cannot determine the tokenizer type from tokenizer_path argument");
}
}
@@ -43,7 +43,7 @@
} /* else if (type == "SPM") {
token_ptr = std::make_unique<SpmTokenizer>();
} */ else {
status = TfmStatus(kTfmErrorInvalidArgument, "Unknown tokenizer_type, (BPE, SPM, RKWV) are supported.");
status = TfmStatus(kTfmErrorInvalidArgument, "Unknown tokenizer_type, (BPE, RKWV) are supported.");
}

if (status.ok()) {