From 43cd5e335916e67f627d4e47306a425563dae5d4 Mon Sep 17 00:00:00 2001
From: Ben Epstein
Date: Thu, 2 May 2024 18:16:35 -0400
Subject: [PATCH 1/8] Update phi-3-tutorial.md (#378)

relates to https://github.com/microsoft/onnxruntime-genai/issues/376

---------

Co-authored-by: kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
---
 examples/python/phi-3-tutorial.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/examples/python/phi-3-tutorial.md b/examples/python/phi-3-tutorial.md
index aed02ea42..e1b4456ab 100644
--- a/examples/python/phi-3-tutorial.md
+++ b/examples/python/phi-3-tutorial.md
@@ -9,6 +9,12 @@
 
 Download either or both of the [short](https://aka.ms/phi3-mini-4k-instruct-onnx) and [long](https://aka.ms/phi3-mini-128k-instruct-onnx) context Phi-3 mini models from Hugging Face.
 
+To download the Phi-3 mini models, you will need to have git-lfs installed.
+* MacOS: `brew install git-lfs`
+* Linux: `apt-get install git-lfs`
+* Windows: `winget install -e --id GitHub.GitLFS` (If you don't have winget, download and run the `exe` from the [official source](https://docs.github.com/en/repositories/working-with-files/managing-large-files/installing-git-large-file-storage?platform=windows))
+
+Then run `git lfs install`
 
 For the short context model.

From edfee65e47cde1f332a0b5be112274d0c95a6432 Mon Sep 17 00:00:00 2001
From: "Nat Kershaw (MSFT)"
Date: Thu, 2 May 2024 16:49:45 -0700
Subject: [PATCH 2/8] Model generate template (#389)

---
 examples/python/model-generate.py | 52 +++++++++++++++++++++----------
 examples/python/model-qa.py       | 11 +++++--
 examples/python/phi3-qa.py        |  6 ++++
 3 files changed, 50 insertions(+), 19 deletions(-)

diff --git a/examples/python/model-generate.py b/examples/python/model-generate.py
index 9b5af011a..3e2b38d90 100644
--- a/examples/python/model-generate.py
+++ b/examples/python/model-generate.py
@@ -13,15 +13,30 @@ def main(args):
         prompts = args.prompts
     else:
         prompts = ["I like walking my cute dog",
-           "What is the best restaurant in town?",
-           "Hello, how are you today?"]
+                   "What is the best restaurant in town?",
+                   "Hello, how are you today?"]
+
+    if args.chat_template:
+        if args.chat_template.count('{') != 1 or args.chat_template.count('}') != 1:
+            print("Error, chat template must have exactly one pair of curly braces, e.g. '<|user|>\n{input} <|end|>\n<|assistant|>'")
+            exit(1)
+        prompts[:] = [f'{args.chat_template.format(input=text)}' for text in prompts]
+
     input_tokens = tokenizer.encode_batch(prompts)
-    if args.verbose: print("Prompt(s) encoded")
+    if args.verbose: print(f'Prompt(s) encoded: {prompts}')
 
     params = og.GeneratorParams(model)
-    params.set_search_options(max_length=args.max_length, top_p=args.top_p, top_k=args.top_k, temperature=args.temperature, repetition_penalty=args.repetition_penalty)
-    if args.cuda_graph_with_max_batch_size > 0:
-        params.try_use_cuda_graph_with_max_batch_size(args.cuda_graph_with_max_batch_size)
+
+    search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args}
+
+    if (args.verbose): print(f'Args: {args}')
+    if (args.verbose): print(f'Search options: {search_options}')
+
+    params.set_search_options(**search_options)
+    # Set the batch size for the CUDA graph to the number of prompts if the user didn't specify a batch size
+    params.try_use_cuda_graph_with_max_batch_size(len(prompts))
+    if args.batch_size_for_cuda_graph:
+        params.try_use_cuda_graph_with_max_batch_size(args.batch_size_for_cuda_graph)
     params.input_ids = input_tokens
     if args.verbose: print("GeneratorParams created")
@@ -37,19 +52,24 @@ def main(args):
     print()
     print()
-    print(f"Tokens: {len(output_tokens[0])} Time: {run_time:.2f} Tokens per second: {len(output_tokens[0])/run_time:.2f}")
+    total_tokens = sum(len(x) for x in output_tokens)
+    print(f"Tokens: {total_tokens} Time: {run_time:.2f} Tokens per second: {total_tokens/run_time:.2f}")
     print()
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="End-to-end token generation loop example for gen-ai")
+    parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, description="End-to-end token generation loop example for gen-ai")
     parser.add_argument('-m', '--model', type=str, required=True, help='Onnx model folder path (must contain config.json and model.onnx)')
-    parser.add_argument('-pr', '--prompts', nargs='*', required=False, help='Input prompts to generate tokens from')
-    parser.add_argument('-l', '--max_length', type=int, default=512, help='Max number of tokens to generate after prompt')
-    parser.add_argument('-p', '--top_p', type=float, default=0.9, help='Top p probability to sample with')
-    parser.add_argument('-k', '--top_k', type=int, default=50, help='Top k tokens to sample from')
-    parser.add_argument('-t', '--temperature', type=float, default=1.0, help='Temperature to sample with')
-    parser.add_argument('-r', '--repetition_penalty', type=float, default=1.0, help='Repetition penalty to sample with')
-    parser.add_argument('-v', '--verbose', action='store_true', help='Print verbose output')
-    parser.add_argument('-c', '--cuda_graph_with_max_batch_size', type=int, default=0, help='Max batch size for CUDA graph')
+    parser.add_argument('-pr', '--prompts', nargs='*', required=False, help='Input prompts to generate tokens from. Provide this parameter multiple times to batch multiple prompts')
+    parser.add_argument('-i', '--min_length', type=int, help='Min number of tokens to generate including the prompt')
+    parser.add_argument('-l', '--max_length', type=int, help='Max number of tokens to generate including the prompt')
+    parser.add_argument('-ds', '--do_random_sampling', action='store_true', help='Do random sampling. When false, greedy or beam search are used to generate the output. Defaults to false')
+    parser.add_argument('-p', '--top_p', type=float, help='Top p probability to sample with')
+    parser.add_argument('-k', '--top_k', type=int, help='Top k tokens to sample from')
+    parser.add_argument('-t', '--temperature', type=float, help='Temperature to sample with')
+    parser.add_argument('-r', '--repetition_penalty', type=float, help='Repetition penalty to sample with')
+    parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Print verbose output and timing information. Defaults to false')
+    parser.add_argument('-b', '--batch_size_for_cuda_graph', type=int, default=1, help='Max batch size for CUDA graph')
+    parser.add_argument('-c', '--chat_template', type=str, default='', help='Chat template to use for the prompt. User input will be injected into {input}. If not set, the prompt is used as is.')
+
     args = parser.parse_args()
     main(args)
\ No newline at end of file

diff --git a/examples/python/model-qa.py b/examples/python/model-qa.py
index 57ec9f6db..c1e628eb5 100644
--- a/examples/python/model-qa.py
+++ b/examples/python/model-qa.py
@@ -14,10 +14,15 @@ def main(args):
     tokenizer_stream = tokenizer.create_stream()
     if args.verbose: print("Tokenizer created")
     if args.verbose: print()
+    search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args}
 
-    if args.chat_template.count('{') != 1 or args.chat_template.count('}') != 1:
-        print("Error, chat template must have exactly one pair of curly braces, e.g. '<|user|>\n{input} <|end|>\n<|assistant|>'")
-        exit(1)
+
+    if args.verbose: print(search_options)
+
+    if args.chat_template:
+        if args.chat_template.count('{') != 1 or args.chat_template.count('}') != 1:
+            print("Error, chat template must have exactly one pair of curly braces, e.g. '<|user|>\n{input} <|end|>\n<|assistant|>'")
+            exit(1)
 
     # Keep asking for input prompts in a loop
     while True:

diff --git a/examples/python/phi3-qa.py b/examples/python/phi3-qa.py
index 4f175c102..977a47589 100644
--- a/examples/python/phi3-qa.py
+++ b/examples/python/phi3-qa.py
@@ -15,6 +15,12 @@ def main(args):
     if args.verbose: print("Tokenizer created")
     if args.verbose: print()
     search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args}
+
+    # Set the max length to something sensible by default, unless it is specified by the user,
+    # since otherwise it will be set to the entire context length
+    if 'max_length' not in search_options:
+        search_options['max_length'] = 2048
+
     chat_template = '<|user|>\n{input} <|end|>\n<|assistant|>'
 
     # Keep asking for input prompts in a loop
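The heart of PATCH 2/8 is that generation options are only forwarded when the user actually sets them (via `argparse.SUPPRESS`), and that a chat template containing a single `{input}` placeholder is applied to every prompt before tokenization. The standalone sketch below (not part of the patch; the flag values are illustrative) shows both patterns in isolation:

```python
import argparse

# With argument_default=argparse.SUPPRESS, options the user never passed do not
# appear in the namespace at all, so the `name in args` test filters them out.
parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS)
parser.add_argument('-l', '--max_length', type=int)
parser.add_argument('-p', '--top_p', type=float)
parser.add_argument('-c', '--chat_template', type=str, default='')
args = parser.parse_args(['-l', '256', '-c', '<|user|>\n{input} <|end|>\n<|assistant|>'])

search_options = {name: getattr(args, name)
                  for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k',
                               'temperature', 'repetition_penalty'] if name in args}
print(search_options)  # {'max_length': 256} -- top_p was never supplied, so it is absent

# The template must contain exactly one pair of curly braces; the user text is
# substituted for {input} before the prompts are encoded.
prompts = ["I like walking my cute dog"]
if args.chat_template:
    if args.chat_template.count('{') != 1 or args.chat_template.count('}') != 1:
        raise SystemExit("chat template must have exactly one pair of curly braces")
    prompts = [args.chat_template.format(input=text) for text in prompts]
print(prompts[0])  # <|user|> ... <|end|> ... <|assistant|> wrapped around the user text
```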
From b79467e68d3638fd7c46bd5b228092765da4e387 Mon Sep 17 00:00:00 2001
From: Jian Chen
Date: Fri, 3 May 2024 00:01:52 -0400
Subject: [PATCH 3/8] Exclude libnvinfer_plugin.so.10 (#391)

---
 src/python/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt
index bc85d8bce..6dbf82371 100644
--- a/src/python/CMakeLists.txt
+++ b/src/python/CMakeLists.txt
@@ -64,8 +64,12 @@ if(BUILD_WHEEL)
       "libcufft.so.11"
       "libcurand.so.10"
       "libnvinfer.so.8"
+      "libnvinfer.so.10"
       "libnvinfer_plugin.so.8"
+      "libnvinfer_plugin.so.10"
       "libnvonnxparser.so.8"
+      "libnvonnxparser.so.10"
+
   )
   set(modified_exclude_list)
   foreach(item IN LISTS auditwheel_exclude_list)

From 42bd7152eccf1c24c10b6d2531183f3d6777fa2f Mon Sep 17 00:00:00 2001
From: Parinita Rahi <101819959+parinitarahi@users.noreply.github.com>
Date: Thu, 2 May 2024 21:03:23 -0700
Subject: [PATCH 4/8] Update phi-3-tutorial.md (#386)

Replacing the variable with generic text, so users can replace with model path for appropriate models.

---
 examples/python/phi-3-tutorial.md | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/examples/python/phi-3-tutorial.md b/examples/python/phi-3-tutorial.md
index e1b4456ab..7c5c689cb 100644
--- a/examples/python/phi-3-tutorial.md
+++ b/examples/python/phi-3-tutorial.md
@@ -74,15 +74,14 @@ Run the model with [phi3-qa.py](https://github.com/microsoft/onnxruntime-genai/b
 
 The script accepts a model folder and takes the generation parameters from the config in that model folder. You can also override the parameters on the command line.
 
-This example is using the long context model running with DirectML on Windows.
+
 The `-m` argument is the path to the model you downloaded from HuggingFace above. The `-l` argument is the length of output you would like to generate with the model.
 
 ```bash
 curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/phi3-qa.py -o phi3-qa.py
-model_path="./Phi-3-mini-128k-instruct-onnx/directml/directml-int4-awq-block-128"
-python phi3-qa.py -m $model_path -l 2048
+python phi3-qa.py -m *replace your relative model_path here* -l 2048
 ```
 
 Once the script has loaded the model, it will ask you for input in a loop, streaming the output as it is produced the model. For example:

From e82ab3d2e6be48c799b0eab097399c92403973a6 Mon Sep 17 00:00:00 2001
From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com>
Date: Thu, 2 May 2024 23:02:14 -0700
Subject: [PATCH 5/8] Merge tokenizer library invalid UTF-8 fix (#390)

Port over a fix from the onnxruntime-extensions tokenizer library to fix an invalid UTF-8 issue.
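When token ids are detokenized one at a time during streaming, a multi-byte UTF-8 character can be split across two ids, so decoding each piece on its own produces invalid text. The C++ change below buffers the incomplete bytes in the decoder state and only emits validated sequences. This Python sketch (illustrative only, not the library's implementation) shows the same buffering idea:

```python
def stream_decode(chunks):
    # Hold back bytes until they form complete UTF-8; emit the longest valid prefix.
    # A real implementation also has to discard bytes that can never become valid.
    pending = b""
    for chunk in chunks:
        pending += chunk
        for cut in range(len(pending), -1, -1):
            try:
                text = pending[:cut].decode("utf-8")
                break
            except UnicodeDecodeError:
                continue
        pending = pending[cut:]
        if text:
            yield text

# '€' is 0xE2 0x82 0xAC; split across two chunks it only decodes once complete.
print(list(stream_decode([b"a\xe2\x82", b"\xacb"])))   # ['a', '€b']
```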
---
 src/tokenizer/token_bpe.cc      | 33 ++++++++++++----
 src/tokenizer/token_bpe.h       |  1 +
 src/tokenizer/tokenizer.cc      |  6 +--
 src/tokenizer/utils/unescape.cc | 69 ++++++++++++++++++++++++---------
 4 files changed, 80 insertions(+), 29 deletions(-)

diff --git a/src/tokenizer/token_bpe.cc b/src/tokenizer/token_bpe.cc
index 93c897eea..80ac9d5bf 100644
--- a/src/tokenizer/token_bpe.cc
+++ b/src/tokenizer/token_bpe.cc
@@ -237,15 +237,17 @@ std::vector BPETokenizer::Encode(std::string_view sv_input, int64_
       text = text.strip()
     */
     std::u32string str = RemoveConsecutiveSpaces(input);
-    if (IsUnicodeSpace(str.front())) {
-      str.erase(str.begin());
-    }
-    if (IsUnicodeSpace(str.back())) {
-      str.pop_back();
+    if (!str.empty()) {
+      if (IsUnicodeSpace(str.front())) {
+        str.erase(str.begin());
+      }
+      if (IsUnicodeSpace(str.back())) {
+        str.pop_back();
+      }
+      // remove newlines as CLIP ignores them (treats them as whitespace which is then cleaned)
+      str.erase(std::remove(str.begin(), str.end(), U'\n'), str.end());
+      str.erase(std::remove(str.begin(), str.end(), U'\r'), str.end());
     }
-    // remove newlines as CLIP ignores them (treats them as whitespace which is then cleaned)
-    str.erase(std::remove(str.begin(), str.end(), U'\n'), str.end());
-    str.erase(std::remove(str.begin(), str.end(), U'\r'), str.end());
     input = str;
   }
 
@@ -592,6 +594,21 @@ TfmStatus BPETokenizer::Id2Token(tfmTokenId_t id, std::string& token, DecoderSta
         token.push_back(' ');
       }
     }  // end case of whitespace_token_
+
+    bpe_state->incomplete_utf8_ += token;
+    token.clear();
+    std::string& s_utf8 = bpe_state->incomplete_utf8_;
+    size_t utf8_len = 1;
+    size_t utf8_all_len = 0;
+    for (size_t i = 0; i < s_utf8.size(); i += utf8_len) {
+      utf8_len = UTF8Len(s_utf8[i]);
+      if (utf8_len <= s_utf8.size() - i) {
+        utf8_all_len += utf8_len;
+        auto _t = s_utf8.substr(i, utf8_len);
+        token += ValidateUTF8(_t) ? _t : "";
+      }
+    }
+    s_utf8 = s_utf8.substr(utf8_all_len);
   }
 
   return status;

diff --git a/src/tokenizer/token_bpe.h b/src/tokenizer/token_bpe.h
index ed5f1f23c..2327b3a60 100644
--- a/src/tokenizer/token_bpe.h
+++ b/src/tokenizer/token_bpe.h
@@ -28,6 +28,7 @@ class BPETokenizer : public TokenizerImpl {
     BPEDeocerState() = default;
     ~BPEDeocerState() override = default;
     bool f_special_last;
+    std::string incomplete_utf8_;
   };
 
  public:

diff --git a/src/tokenizer/tokenizer.cc b/src/tokenizer/tokenizer.cc
index b2a0622e7..251595856 100644
--- a/src/tokenizer/tokenizer.cc
+++ b/src/tokenizer/tokenizer.cc
@@ -30,10 +30,10 @@ TfmStatus CreateBPETokenizer(const std::string& tokenizer_path,
   if (type.empty()) {
     if (BPETokenizer::IsSupportedModel(GetModelName(token_cfg->tokenizer_class_))) {
       type = "BPE";
-    } else if (std::filesystem::exists(tokenizer_path + "/tokenizer.model")) {
+    } /* else if (std::filesystem::exists(tokenizer_path + "/tokenizer.model")) {
       // if 'tokenizer.model exists in the tokenizer_path, then it is a sentencepiece model
       type = "SPM";
-    } else {
+    } */ else {
       status = TfmStatus(kTfmErrorInvalidArgument,
                          "Cannot determine the tokenizer type from tokenizer_path argument");
     }
   }
@@ -43,7 +43,7 @@ TfmStatus CreateBPETokenizer(const std::string& tokenizer_path,
   } /* else if (type == "SPM") {
     token_ptr = std::make_unique();
   } */ else {
-    status = TfmStatus(kTfmErrorInvalidArgument, "Unknown tokenizer_type, (BPE, SPM, RKWV) are supported.");
+    status = TfmStatus(kTfmErrorInvalidArgument, "Unknown tokenizer_type, (BPE, RKWV) are supported.");
   }
 
   if (status.ok()) {

diff --git a/src/tokenizer/utils/unescape.cc b/src/tokenizer/utils/unescape.cc
index f42e962f9..f94a1f192 100644
--- a/src/tokenizer/utils/unescape.cc
+++ b/src/tokenizer/utils/unescape.cc
@@ -41,27 +41,60 @@ std::string EncodeUTF8Char(char32_t utf8_char) {
   return {utf8_buf};
 }
 
-bool ValidateUTF8(const std::string& data) {
-  int cnt = 0;
-  for (size_t i = 0; i < data.size(); i++) {
-    int x = data[i];
-    if (!cnt) {
-      if ((x >> 5) == 0b110) {
-        cnt = 1;
-      } else if ((x >> 4) == 0b1110) {
-        cnt = 2;
-      } else if ((x >> 3) == 0b11110) {
-        cnt = 3;
-      } else if ((x >> 7) != 0) {
+  bool ValidateUTF8(const std::string& data) {
+    const unsigned char* s = reinterpret_cast<const unsigned char*>(data.c_str());
+    const unsigned char* s_end = s + data.size();
+    if (*s_end != '\0')
+      return false;
+
+    while (*s) {
+      if (*s < 0x80)
+        /* 0xxxxxxx */
+        s++;
+      else if ((s[0] & 0xe0) == 0xc0) {
+        /* 110XXXXx 10xxxxxx */
+        if (s + 1 >= s_end) {
+          return false;
+        }
+        if ((s[1] & 0xc0) != 0x80 ||
+            (s[0] & 0xfe) == 0xc0) /* overlong? */
+          return false;
+        else
+          s += 2;
+      } else if ((s[0] & 0xf0) == 0xe0) {
+        /* 1110XXXX 10Xxxxxx 10xxxxxx */
+        if (s + 2 >= s_end) {
+          return false;
+        }
+        if ((s[1] & 0xc0) != 0x80 ||
+            (s[2] & 0xc0) != 0x80 ||
+            (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) || /* overlong? */
+            (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) || /* surrogate? */
+            (s[0] == 0xef && s[1] == 0xbf &&
+             (s[2] & 0xfe) == 0xbe)) /* U+FFFE or U+FFFF? */
+          return false;
+        else
+          s += 3;
+      } else if ((s[0] & 0xf8) == 0xf0) {
+        /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */
+        if (s + 3 >= s_end) {
+          return false;
+        }
+        if ((s[1] & 0xc0) != 0x80 ||
+            (s[2] & 0xc0) != 0x80 ||
+            (s[3] & 0xc0) != 0x80 ||
+            (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) || /* overlong? */
+            (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) /* > U+10FFFF? */
+          return false;
+        else
+          s += 4;
+      } else
         return false;
-      }
-    } else {
-      if ((x >> 6) != 0b10) return false;
-      cnt--;
     }
+
+    return true;
   }
-  return cnt == 0;
-}
+
 bool IsDigit(char c) { return c >= '0' && c <= '9'; }
 bool IsHexDigit(char c) { return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); }

From 1f3776d425afbd2e8f83f126f1c02f0d13633ea0 Mon Sep 17 00:00:00 2001
From: kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
Date: Thu, 2 May 2024 23:23:58 -0700
Subject: [PATCH 6/8] Fix pad token id in config (#394)

### Description
This PR sets `pad_token_id` in `genai_config.json` to a single value when a model does not specify a pad token id but it specifies a list of EOS token ids.

### Motivation and Context
When the pad token id is not specified, `pad_token_id` in `genai_config.json` stores the same value that `eos_token_id` in `genai_config.json` contains. When `eos_token_id` has a list of EOS token ids, then `pad_token_id` also has a list of pad token ids. This causes a parsing issue in ONNX Runtime GenAI because it expects only one pad token id.

This PR also fixes [this issue](https://github.com/microsoft/onnxruntime-genai/issues/384).
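Read in isolation, the fallback introduced in builder.py behaves like the following sketch (the token ids are illustrative, not taken from a real config):

```python
# Prefer an explicit pad_token_id; otherwise fall back to the EOS id, taking the
# first element when eos_token_id is a list, so the config always holds one value.
eos_token_id = [32000, 32007]   # illustrative multi-EOS setup
pad_token_id = None             # model config does not define a pad token

resolved = (pad_token_id
            if pad_token_id is not None
            else eos_token_id[0] if isinstance(eos_token_id, list)
            else eos_token_id)
print(resolved)  # 32000 -- a single id, which is what the GenAI config parser expects
```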
---
 src/python/py/models/builder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py
index fd9c90d53..2f320b729 100644
--- a/src/python/py/models/builder.py
+++ b/src/python/py/models/builder.py
@@ -228,7 +228,7 @@ def make_genai_config(self, model_name_or_path, extra_kwargs, out_dir):
                 "num_key_value_heads": self.num_kv_heads,
             },
             "eos_token_id": config.eos_token_id,
-            "pad_token_id": config.pad_token_id if hasattr(config, "pad_token_id") and config.pad_token_id is not None else config.eos_token_id,
+            "pad_token_id": config.pad_token_id if hasattr(config, "pad_token_id") and config.pad_token_id is not None else config.eos_token_id[0] if isinstance(config.eos_token_id, list) else config.eos_token_id,
             "type": self.model_type[ : self.model_type.find("For")].lower(),
             "vocab_size": self.vocab_size,
         },

From 88d46dd30c34af12f31c2d587fd959e8534ade8c Mon Sep 17 00:00:00 2001
From: Baiju Meswani
Date: Fri, 3 May 2024 09:26:06 -0700
Subject: [PATCH 7/8] Use std::experimental::filesystem instead of std::filesystem (#393)

---
 CMakeLists.txt                  |  1 +
 cmake/cxx_standard.cmake        |  6 +++++-
 src/config.cpp                  |  4 ++--
 src/config.h                    |  4 ++--
 src/filesystem.h                | 11 +++++++++++
 src/generators.h                |  3 ++-
 src/logging.cpp                 |  2 +-
 src/models/model.cpp            |  2 +-
 src/tokenizer/c_api/tfmtok_c.cc |  4 ++--
 src/tokenizer/config.cc         |  5 ++---
 src/tokenizer/tokenizer.cc      |  4 ++--
 11 files changed, 31 insertions(+), 15 deletions(-)
 create mode 100644 src/filesystem.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0ac13d590..8926e8a6e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,6 +16,7 @@ message("Building onnxruntime-genai for version ${VERSION_INFO}")
 # Checking if CUDA is supported
 include(CheckLanguage)
 add_compile_definitions(BUILDING_ORT_GENAI_C)
+
 if(USE_CUDA)
   check_language(CUDA)
   if(CMAKE_CUDA_COMPILER)

diff --git a/cmake/cxx_standard.cmake b/cmake/cxx_standard.cmake
index 7e752d40b..52732c2f2 100644
--- a/cmake/cxx_standard.cmake
+++ b/cmake/cxx_standard.cmake
@@ -9,4 +9,8 @@ elseif (USE_CUDA AND CMAKE_CUDA_COMPILER AND CMAKE_CUDA_COMPILER_VERSION VERSION
 else ()
   message("Test is using C++20")
   set(CMAKE_CXX_STANDARD 20)
-endif ()
\ No newline at end of file
+endif ()
+
+if ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 9)
+  add_compile_definitions(USE_EXPERIMENTAL_FILESYSTEM)
+endif()
\ No newline at end of file

diff --git a/src/config.cpp b/src/config.cpp
index 39341f5b5..7dc3133ec 100644
--- a/src/config.cpp
+++ b/src/config.cpp
@@ -397,7 +397,7 @@ struct RootObject_Element : JSON::Element {
   JSON::Element& t_;
 };
 
-void ParseConfig(const std::filesystem::path& filename, Config& config) {
+void ParseConfig(const fs::path& filename, Config& config) {
   std::ifstream file(filename, std::ios::binary | std::ios::ate);
   if (!file.is_open()) {
     throw std::runtime_error("Error opening " + filename.string());
@@ -421,7 +421,7 @@ void ParseConfig(const std::filesystem::path& filename, Config& config) {
   }
 }
 
-Config::Config(const std::filesystem::path& path) : config_path{path} {
+Config::Config(const fs::path& path) : config_path{path} {
   ParseConfig(path / "genai_config.json", *this);
 
   if (model.context_length == 0)

diff --git a/src/config.h b/src/config.h
index b94e05ca0..8fb5debdc 100644
--- a/src/config.h
+++ b/src/config.h
@@ -6,9 +6,9 @@ namespace Generators {
 
 struct Config {
   Config() = default;
-  Config(const std::filesystem::path& path);
+  Config(const fs::path& path);
 
-  std::filesystem::path config_path;  // Path of the config directory
+  fs::path config_path;  // Path of the config directory
 
   using ProviderOption = std::pair;
   struct ProviderOptions {

diff --git a/src/filesystem.h b/src/filesystem.h
new file mode 100644
index 000000000..45c4c7015
--- /dev/null
+++ b/src/filesystem.h
@@ -0,0 +1,11 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+// TODO(baijumeswani): Remove experimental when packaging pipeline can use GCC > 8
+#ifdef USE_EXPERIMENTAL_FILESYSTEM
+#include <experimental/filesystem>
+namespace fs = std::experimental::filesystem;
+#else
+#include <filesystem>
+namespace fs = std::filesystem;
+#endif

diff --git a/src/generators.h b/src/generators.h
index e6ad6f0e1..3f7c1d8e0 100644
--- a/src/generators.h
+++ b/src/generators.h
@@ -5,8 +5,9 @@
 #include
 #include
 #include
-#include <filesystem>
+#include "filesystem.h"
 #include
+#include
 #include "span.h"
 #include
 #include

diff --git a/src/logging.cpp b/src/logging.cpp
index 6c334f50a..edd698168 100644
--- a/src/logging.cpp
+++ b/src/logging.cpp
@@ -44,7 +44,7 @@ void SetLogString(std::string_view name, std::string_view value) {
   if (value.empty())
     gp_logfile.reset();
   else {
-    std::filesystem::path filename{value};
+    fs::path filename{std::string(value)};
     gp_logfile = std::make_unique(filename);
   }

diff --git a/src/models/model.cpp b/src/models/model.cpp
index 35a9b4ad4..d760824b3 100644
--- a/src/models/model.cpp
+++ b/src/models/model.cpp
@@ -297,7 +297,7 @@ void Model::CreateSessionOptions() {
   }
 
   if (options.enable_profiling.has_value()) {
-    std::filesystem::path profile_file_prefix{options.enable_profiling.value()};
+    fs::path profile_file_prefix{options.enable_profiling.value()};
     ort_options.EnableProfiling(profile_file_prefix.c_str());
   }

diff --git a/src/tokenizer/c_api/tfmtok_c.cc b/src/tokenizer/c_api/tfmtok_c.cc
index 02c57ce65..3dc9be009 100644
--- a/src/tokenizer/c_api/tfmtok_c.cc
+++ b/src/tokenizer/c_api/tfmtok_c.cc
@@ -2,7 +2,7 @@
 // Licensed under the MIT License.
 
 #include
-#include <filesystem>
+#include "../filesystem.h"
 
 #include "tfmtok.h"
@@ -117,7 +117,7 @@ tfmError_t TFM_API_CALL TfmCreateTokenizer(TfmTokenizer** tokenizer,
     return kTfmErrorInvalidArgument;
   }
 
-  if (!std::filesystem::is_directory(tokenizer_path)) {
+  if (!fs::is_directory(tokenizer_path)) {
    last_error_message = std::string("Cannot find the directory of ") + tokenizer_path;
     return kTfmErrorInvalidArgument;
   }

diff --git a/src/tokenizer/config.cc b/src/tokenizer/config.cc
index dbc0908cf..a40b7d7db 100644
--- a/src/tokenizer/config.cc
+++ b/src/tokenizer/config.cc
@@ -4,7 +4,7 @@
 #include
 #include
 #include
-#include <filesystem>
+#include "../filesystem.h"
 
 #include "config.h"
 
@@ -68,8 +68,7 @@ TfmStatus TokenConfig::LoadJson(const std::string& json_path) {
   simdjson::dom::parser parser;
   simdjson::dom::element root;
 
-  if (!std::filesystem::exists(
-          std::filesystem::path(json_path).lexically_normal())) {
+  if (!fs::exists(fs::path(json_path))) {
     return {kTfmErrorInvalidFile, std::string(json_path) + " not found"};
   }
   std::string json_text = PatchJsonText(json_path);

diff --git a/src/tokenizer/tokenizer.cc b/src/tokenizer/tokenizer.cc
index 251595856..4f52acd72 100644
--- a/src/tokenizer/tokenizer.cc
+++ b/src/tokenizer/tokenizer.cc
@@ -1,7 +1,7 @@
 #include "token_bpe.h"
 #include "token_rwkv.h"
 
-#include <filesystem>
+#include "../filesystem.h"
 #include
 
 namespace tfm {
@@ -30,7 +30,7 @@ TfmStatus CreateBPETokenizer(const std::string& tokenizer_path,
   if (type.empty()) {
     if (BPETokenizer::IsSupportedModel(GetModelName(token_cfg->tokenizer_class_))) {
       type = "BPE";
-    } /* else if (std::filesystem::exists(tokenizer_path + "/tokenizer.model")) {
+    } /* else if (fs::exists(tokenizer_path + "/tokenizer.model")) {
       // if 'tokenizer.model exists in the tokenizer_path, then it is a sentencepiece model
       type = "SPM";
     } */ else {

From b272ba45ca0c1dc62cfb35f4675b6683b421ebd5 Mon Sep 17 00:00:00 2001
From: Yufeng Li
Date: Fri, 3 May 2024 10:43:18 -0700
Subject: [PATCH 8/8] update GQA message (#396)

---
 src/python/py/models/builder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py
index 2f320b729..940f76e55 100644
--- a/src/python/py/models/builder.py
+++ b/src/python/py/models/builder.py
@@ -171,7 +171,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
         if (self.ep in {"cuda", "dml"} and self.io_dtype == TensorProto.FLOAT16) or (enable_GQA_on_CPU and self.ep == "cpu" and self.io_dtype == TensorProto.FLOAT):
             # Change model settings for GroupQueryAttention
             self.attention_attrs["op_type"] = "GroupQueryAttention"
-            print("GroupQueryAttention (GQA) is used in this model. GQA is currently supported only for INT4 and FP16 on the CUDA and DML execution providers.")
+            print("GroupQueryAttention (GQA) is used in this model.")
 
             # DML doesn't support packed Q/K/V for GQA yet
             self.attention_attrs["use_packed_matmul"] = self.ep != "dml" and self.num_attn_heads == self.num_kv_heads