
Commit

Merge branch 'refs/heads/main' into Cjian/ado-zip
jchen351 committed May 3, 2024
2 parents 653f189 + b272ba4 commit 72ab9b8
Showing 20 changed files with 174 additions and 67 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -16,6 +16,7 @@ message("Building onnxruntime-genai for version ${VERSION_INFO}")
# Checking if CUDA is supported
include(CheckLanguage)
add_compile_definitions(BUILDING_ORT_GENAI_C)

if(USE_CUDA)
check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
6 changes: 5 additions & 1 deletion cmake/cxx_standard.cmake
@@ -9,4 +9,8 @@ elseif (USE_CUDA AND CMAKE_CUDA_COMPILER AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS
else ()
message("Test is using C++20")
set(CMAKE_CXX_STANDARD 20)
endif ()
endif ()

if ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 9)
add_compile_definitions(USE_EXPERIMENTAL_FILESYSTEM)
endif()
52 changes: 36 additions & 16 deletions examples/python/model-generate.py
@@ -13,15 +13,30 @@ def main(args):
prompts = args.prompts
else:
prompts = ["I like walking my cute dog",
"What is the best restaurant in town?",
"Hello, how are you today?"]
"What is the best restaurant in town?",
"Hello, how are you today?"]

if args.chat_template:
if args.chat_template.count('{') != 1 or args.chat_template.count('}') != 1:
print("Error, chat template must have exactly one pair of curly braces, e.g. '<|user|>\n{input} <|end|>\n<|assistant|>'")
exit(1)
prompts[:] = [f'{args.chat_template.format(input=text)}' for text in prompts]

input_tokens = tokenizer.encode_batch(prompts)
if args.verbose: print("Prompt(s) encoded")
if args.verbose: print(f'Prompt(s) encoded: {prompts}')

params = og.GeneratorParams(model)
params.set_search_options(max_length=args.max_length, top_p=args.top_p, top_k=args.top_k, temperature=args.temperature, repetition_penalty=args.repetition_penalty)
if args.cuda_graph_with_max_batch_size > 0:
params.try_use_cuda_graph_with_max_batch_size(args.cuda_graph_with_max_batch_size)

search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args}

if (args.verbose): print(f'Args: {args}')
if (args.verbose): print(f'Search options: {search_options}')

params.set_search_options(**search_options)
# Set the batch size for the CUDA graph to the number of prompts if the user didn't specify a batch size
params.try_use_cuda_graph_with_max_batch_size(len(prompts))
if args.batch_size_for_cuda_graph:
params.try_use_cuda_graph_with_max_batch_size(args.batch_size_for_cuda_graph)
params.input_ids = input_tokens
if args.verbose: print("GeneratorParams created")

@@ -37,19 +52,24 @@ def main(args):
print()

print()
print(f"Tokens: {len(output_tokens[0])} Time: {run_time:.2f} Tokens per second: {len(output_tokens[0])/run_time:.2f}")
total_tokens = sum(len(x) for x in output_tokens)
print(f"Tokens: {total_tokens} Time: {run_time:.2f} Tokens per second: {total_tokens/run_time:.2f}")
print()

if __name__ == "__main__":
parser = argparse.ArgumentParser(description="End-to-end token generation loop example for gen-ai")
parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, description="End-to-end token generation loop example for gen-ai")
parser.add_argument('-m', '--model', type=str, required=True, help='Onnx model folder path (must contain config.json and model.onnx)')
parser.add_argument('-pr', '--prompts', nargs='*', required=False, help='Input prompts to generate tokens from')
parser.add_argument('-l', '--max_length', type=int, default=512, help='Max number of tokens to generate after prompt')
parser.add_argument('-p', '--top_p', type=float, default=0.9, help='Top p probability to sample with')
parser.add_argument('-k', '--top_k', type=int, default=50, help='Top k tokens to sample from')
parser.add_argument('-t', '--temperature', type=float, default=1.0, help='Temperature to sample with')
parser.add_argument('-r', '--repetition_penalty', type=float, default=1.0, help='Repetition penalty to sample with')
parser.add_argument('-v', '--verbose', action='store_true', help='Print verbose output')
parser.add_argument('-c', '--cuda_graph_with_max_batch_size', type=int, default=0, help='Max batch size for CUDA graph')
parser.add_argument('-pr', '--prompts', nargs='*', required=False, help='Input prompts to generate tokens from. Provide this parameter multiple times to batch multiple prompts')
parser.add_argument('-i', '--min_length', type=int, help='Min number of tokens to generate including the prompt')
parser.add_argument('-l', '--max_length', type=int, help='Max number of tokens to generate including the prompt')
parser.add_argument('-ds', '--do_random_sampling', action='store_true', help='Do random sampling. When false, greedy or beam search are used to generate the output. Defaults to false')
parser.add_argument('-p', '--top_p', type=float, help='Top p probability to sample with')
parser.add_argument('-k', '--top_k', type=int, help='Top k tokens to sample from')
parser.add_argument('-t', '--temperature', type=float, help='Temperature to sample with')
parser.add_argument('-r', '--repetition_penalty', type=float, help='Repetition penalty to sample with')
parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Print verbose output and timing information. Defaults to false')
parser.add_argument('-b', '--batch_size_for_cuda_graph', type=int, default=1, help='Max batch size for CUDA graph')
parser.add_argument('-c', '--chat_template', type=str, default='', help='Chat template to use for the prompt. User input will be injected into {input}. If not set, the prompt is used as is.')

args = parser.parse_args()
main(args)
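
The switch to `argparse.ArgumentParser(argument_default=argparse.SUPPRESS)` above is what makes the `if name in args` filter meaningful: flags the user never passes are simply absent from the namespace, so `search_options` carries only explicitly chosen values and everything else falls back to the generation settings in the model folder's config. A minimal, self-contained sketch of the same pattern, with example values that are ours rather than the script's:

```python
import argparse

# With SUPPRESS as the default, unspecified flags never appear in the namespace.
parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS)
parser.add_argument('-l', '--max_length', type=int)
parser.add_argument('-p', '--top_p', type=float)
parser.add_argument('-k', '--top_k', type=int)

args = parser.parse_args(['-l', '256'])

# Only flags the user actually passed survive the filter.
search_options = {name: getattr(args, name)
                  for name in ['max_length', 'top_p', 'top_k']
                  if name in args}
print(search_options)  # {'max_length': 256}
```
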
11 changes: 8 additions & 3 deletions examples/python/model-qa.py
@@ -14,10 +14,15 @@ def main(args):
tokenizer_stream = tokenizer.create_stream()
if args.verbose: print("Tokenizer created")
if args.verbose: print()

search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args}
if args.chat_template.count('{') != 1 or args.chat_template.count('}') != 1:
print("Error, chat template must have exactly one pair of curly braces, e.g. '<|user|>\n{input} <|end|>\n<|assistant|>'")
exit(1)

if args.verbose: print(search_options)

if args.chat_template:
if args.chat_template.count('{') != 1 or args.chat_template.count('}') != 1:
print("Error, chat template must have exactly one pair of curly braces, e.g. '<|user|>\n{input} <|end|>\n<|assistant|>'")
exit(1)

# Keep asking for input prompts in a loop
while True:
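
The chat-template guard added to both example scripts is a plain sanity check: the template is applied with `str.format(input=prompt)`, so it must contain exactly one `{input}` placeholder and no stray braces. A short illustration of what the check prevents; the well-formed template is the Phi-3 style one quoted in the error message, while the broken variant is invented for the example:

```python
template = '<|user|>\n{input} <|end|>\n<|assistant|>'
print(template.format(input='Hello, how are you today?'))
# <|user|>
# Hello, how are you today? <|end|>
# <|assistant|>

# A stray unnamed placeholder only surfaces as an error at run time,
# which is why the scripts count braces up front and exit early.
bad_template = '<|user|>\n{input} {} <|end|>\n<|assistant|>'
try:
    bad_template.format(input='Hello')
except IndexError as err:
    print('rejected:', err)
```
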
11 changes: 8 additions & 3 deletions examples/python/phi-3-tutorial.md
@@ -9,6 +9,12 @@

Download either or both of the [short](https://aka.ms/phi3-mini-4k-instruct-onnx) and [long](https://aka.ms/phi3-mini-128k-instruct-onnx) context Phi-3 mini models from Hugging Face.

To download the Phi-3 mini models, you will need to have git-lfs installed.
* MacOS: `brew install git-lfs`
* Linux: `apt-get install git-lfs`
* Windows: `winget install -e --id GitHub.GitLFS` (If you don't have winget, download and run the `exe` from the [official source](https://docs.github.com/en/repositories/working-with-files/managing-large-files/installing-git-large-file-storage?platform=windows))

Then run `git lfs install`

For the short context model.

@@ -68,15 +74,14 @@ Run the model with [phi3-qa.py](https://github.com/microsoft/onnxruntime-genai/b

The script accepts a model folder and takes the generation parameters from the config in that model folder. You can also override the parameters on the command line.

This example is using the long context model running with DirectML on Windows.
<!--This example is using the long context model running with DirectML on Windows.-->

The `-m` argument is the path to the model you downloaded from HuggingFace above.
The `-l` argument is the length of output you would like to generate with the model.

```bash
curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/phi3-qa.py -o phi3-qa.py
model_path="./Phi-3-mini-128k-instruct-onnx/directml/directml-int4-awq-block-128"
python phi3-qa.py -m $model_path -l 2048
python phi3-qa.py -m *replace your relative model_path here* -l 2048
```

Once the script has loaded the model, it will ask you for input in a loop, streaming the output as it is produced by the model. For example:
6 changes: 6 additions & 0 deletions examples/python/phi3-qa.py
@@ -15,6 +15,12 @@ def main(args):
if args.verbose: print("Tokenizer created")
if args.verbose: print()
search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args}

# Set the max length to something sensible by default, unless it is specified by the user,
# since otherwise it will be set to the entire context length
if 'max_length' not in search_options:
search_options['max_length'] = 2048

chat_template = '<|user|>\n{input} <|end|>\n<|assistant|>'

# Keep asking for input prompts in a loop
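
The `max_length` default above matters because, as the comment says, an unset value would otherwise fall back to the model's entire context length (128K tokens for the long-context Phi-3 mini), which an interactive demo rarely wants. As a sketch only, `dict.setdefault` is an equivalent spelling of the same guard:

```python
search_options = {'top_p': 0.9}                # user passed -p but not -l/--max_length
search_options.setdefault('max_length', 2048)  # same effect as the membership check
print(search_options)  # {'top_p': 0.9, 'max_length': 2048}
```
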
4 changes: 2 additions & 2 deletions src/config.cpp
@@ -397,7 +397,7 @@ struct RootObject_Element : JSON::Element {
JSON::Element& t_;
};

void ParseConfig(const std::filesystem::path& filename, Config& config) {
void ParseConfig(const fs::path& filename, Config& config) {
std::ifstream file(filename, std::ios::binary | std::ios::ate);
if (!file.is_open()) {
throw std::runtime_error("Error opening " + filename.string());
@@ -421,7 +421,7 @@ void ParseConfig(const std::filesystem::path& filename, Config& config) {
}
}

Config::Config(const std::filesystem::path& path) : config_path{path} {
Config::Config(const fs::path& path) : config_path{path} {
ParseConfig(path / "genai_config.json", *this);

if (model.context_length == 0)
4 changes: 2 additions & 2 deletions src/config.h
@@ -6,9 +6,9 @@ namespace Generators {

struct Config {
Config() = default;
Config(const std::filesystem::path& path);
Config(const fs::path& path);

std::filesystem::path config_path; // Path of the config directory
fs::path config_path; // Path of the config directory

using ProviderOption = std::pair<std::string, std::string>;
struct ProviderOptions {
11 changes: 11 additions & 0 deletions src/filesystem.h
@@ -0,0 +1,11 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

// TODO(baijumeswani): Remove experimental when packaging pipeline can use GCC > 8
#ifdef USE_EXPERIMENTAL_FILESYSTEM
#include <experimental/filesystem>
namespace fs = std::experimental::filesystem;
#else
#include <filesystem>
namespace fs = std::filesystem;
#endif
3 changes: 2 additions & 1 deletion src/generators.h
@@ -5,8 +5,9 @@
#include <assert.h>
#include <cmath>
#include <cstring>
#include <filesystem>
#include "filesystem.h"
#include <functional>
#include <iostream>
#include "span.h"
#include <memory>
#include <numeric>
2 changes: 1 addition & 1 deletion src/logging.cpp
@@ -44,7 +44,7 @@ void SetLogString(std::string_view name, std::string_view value) {
if (value.empty())
gp_logfile.reset();
else {
std::filesystem::path filename{value};
fs::path filename{std::string(value)};
gp_logfile = std::make_unique<std::ofstream>(filename);
}

2 changes: 1 addition & 1 deletion src/models/model.cpp
@@ -297,7 +297,7 @@ void Model::CreateSessionOptions() {
}

if (options.enable_profiling.has_value()) {
std::filesystem::path profile_file_prefix{options.enable_profiling.value()};
fs::path profile_file_prefix{options.enable_profiling.value()};
ort_options.EnableProfiling(profile_file_prefix.c_str());
}

4 changes: 4 additions & 0 deletions src/python/CMakeLists.txt
@@ -64,8 +64,12 @@ if(BUILD_WHEEL)
"libcufft.so.11"
"libcurand.so.10"
"libnvinfer.so.8"
"libnvinfer.so.10"
"libnvinfer_plugin.so.8"
"libnvinfer_plugin.so.10"
"libnvonnxparser.so.8"
"libnvonnxparser.so.10"

)
set(modified_exclude_list)
foreach(item IN LISTS auditwheel_exclude_list)
4 changes: 2 additions & 2 deletions src/python/py/models/builder.py
@@ -171,7 +171,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
if (self.ep in {"cuda", "dml"} and self.io_dtype == TensorProto.FLOAT16) or (enable_GQA_on_CPU and self.ep == "cpu" and self.io_dtype == TensorProto.FLOAT):
# Change model settings for GroupQueryAttention
self.attention_attrs["op_type"] = "GroupQueryAttention"
print("GroupQueryAttention (GQA) is used in this model. GQA is currently supported only for INT4 and FP16 on the CUDA and DML execution providers.")
print("GroupQueryAttention (GQA) is used in this model.")

# DML doesn't support packed Q/K/V for GQA yet
self.attention_attrs["use_packed_matmul"] = self.ep != "dml" and self.num_attn_heads == self.num_kv_heads
@@ -228,7 +228,7 @@ def make_genai_config(self, model_name_or_path, extra_kwargs, out_dir):
"num_key_value_heads": self.num_kv_heads,
},
"eos_token_id": config.eos_token_id,
"pad_token_id": config.pad_token_id if hasattr(config, "pad_token_id") and config.pad_token_id is not None else config.eos_token_id,
"pad_token_id": config.pad_token_id if hasattr(config, "pad_token_id") and config.pad_token_id is not None else config.eos_token_id[0] if isinstance(config.eos_token_id, list) else config.eos_token_id,
"type": self.model_type[ : self.model_type.find("For")].lower(),
"vocab_size": self.vocab_size,
},
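
The reworked `pad_token_id` expression packs three cases into one line: an explicit `pad_token_id` on the config, a list-valued `eos_token_id` (first entry wins), or a scalar `eos_token_id`. A readable restatement of the same fallback order, offered as a sketch only; the helper name is ours, not the builder's:

```python
def resolve_pad_token_id(config):
    """Sketch of the fallback chain used when writing genai_config.json."""
    pad = getattr(config, "pad_token_id", None)
    if pad is not None:
        return pad
    eos = config.eos_token_id
    # Some Hugging Face configs publish a list of end-of-sequence ids;
    # in that case the first one is used as the padding id.
    return eos[0] if isinstance(eos, list) else eos
```
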
4 changes: 2 additions & 2 deletions src/tokenizer/c_api/tfmtok_c.cc
@@ -2,7 +2,7 @@
// Licensed under the MIT License.

#include <cstdarg>
#include <filesystem>
#include "../filesystem.h"
#include <algorithm>

#include "tfmtok.h"
@@ -117,7 +117,7 @@ tfmError_t TFM_API_CALL TfmCreateTokenizer(TfmTokenizer** tokenizer,
return kTfmErrorInvalidArgument;
}

if (!std::filesystem::is_directory(tokenizer_path)) {
if (!fs::is_directory(tokenizer_path)) {
last_error_message = std::string("Cannot find the directory of ") + tokenizer_path;
return kTfmErrorInvalidArgument;
}
5 changes: 2 additions & 3 deletions src/tokenizer/config.cc
@@ -4,7 +4,7 @@
#include <string>
#include <fstream>
#include <streambuf>
#include <filesystem>
#include "../filesystem.h"

#include "config.h"

@@ -68,8 +68,7 @@ TfmStatus TokenConfig::LoadJson(const std::string& json_path) {
simdjson::dom::parser parser;
simdjson::dom::element root;

if (!std::filesystem::exists(
std::filesystem::path(json_path).lexically_normal())) {
if (!fs::exists(fs::path(json_path))) {
return {kTfmErrorInvalidFile, std::string(json_path) + " not found"};
}
std::string json_text = PatchJsonText(json_path);
33 changes: 25 additions & 8 deletions src/tokenizer/token_bpe.cc
@@ -237,15 +237,17 @@ std::vector<tfmTokenId_t> BPETokenizer::Encode(std::string_view sv_input, int64_
text = text.strip()
*/
std::u32string str = RemoveConsecutiveSpaces(input);
if (IsUnicodeSpace(str.front())) {
str.erase(str.begin());
}
if (IsUnicodeSpace(str.back())) {
str.pop_back();
if (!str.empty()) {
if (IsUnicodeSpace(str.front())) {
str.erase(str.begin());
}
if (IsUnicodeSpace(str.back())) {
str.pop_back();
}
// remove newlines as CLIP ignores them (treats them as whitespace which is then cleaned)
str.erase(std::remove(str.begin(), str.end(), U'\n'), str.end());
str.erase(std::remove(str.begin(), str.end(), U'\r'), str.end());
}
// remove newlines as CLIP ignores them (treats them as whitespace which is then cleaned)
str.erase(std::remove(str.begin(), str.end(), U'\n'), str.end());
str.erase(std::remove(str.begin(), str.end(), U'\r'), str.end());
input = str;
}

@@ -592,6 +594,21 @@ TfmStatus BPETokenizer::Id2Token(tfmTokenId_t id, std::string& token, DecoderSta
token.push_back(' ');
}
} // end case of whitespace_token_

bpe_state->incomplete_utf8_ += token;
token.clear();
std::string& s_utf8 = bpe_state->incomplete_utf8_;
size_t utf8_len = 1;
size_t utf8_all_len = 0;
for (size_t i = 0; i < s_utf8.size(); i += utf8_len) {
utf8_len = UTF8Len(s_utf8[i]);
if (utf8_len <= s_utf8.size() - i) {
utf8_all_len += utf8_len;
auto _t = s_utf8.substr(i, utf8_len);
token += ValidateUTF8(_t) ? _t : "";
}
}
s_utf8 = s_utf8.substr(utf8_all_len);
}

return status;
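
The new `incomplete_utf8_` buffer in `Id2Token` handles streaming decode splitting a multi-byte UTF-8 character across two tokens: partial bytes are held back and only complete, valid sequences are emitted. A Python sketch of the same buffering invariant, assuming input arrives as raw UTF-8 byte chunks; it is not a translation of the C++ above:

```python
class Utf8StreamBuffer:
    """Hold back trailing bytes until they form a complete UTF-8 character."""

    def __init__(self):
        self.pending = b""

    def push(self, chunk: bytes) -> str:
        self.pending += chunk
        complete = b""
        i = 0
        while i < len(self.pending):
            lead = self.pending[i]
            # Sequence length from the UTF-8 lead byte.
            if lead < 0x80:
                length = 1
            elif lead >> 5 == 0b110:
                length = 2
            elif lead >> 4 == 0b1110:
                length = 3
            elif lead >> 3 == 0b11110:
                length = 4
            else:
                length = 1  # invalid lead byte, let decode() drop it below
            if i + length > len(self.pending):
                break  # incomplete tail: keep it for the next token
            complete += self.pending[i:i + length]
            i += length
        self.pending = self.pending[i:]
        # Invalid sequences are silently dropped, mirroring the ValidateUTF8 check.
        return complete.decode("utf-8", errors="ignore")


buf = Utf8StreamBuffer()
euro = "€".encode("utf-8")       # three bytes: e2 82 ac
print(repr(buf.push(euro[:2])))  # '' : first two bytes are held back
print(repr(buf.push(euro[2:])))  # '€' : emitted once the character is complete
```
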
1 change: 1 addition & 0 deletions src/tokenizer/token_bpe.h
@@ -28,6 +28,7 @@ class BPETokenizer : public TokenizerImpl {
BPEDeocerState() = default;
~BPEDeocerState() override = default;
bool f_special_last;
std::string incomplete_utf8_;
};

public:
8 changes: 4 additions & 4 deletions src/tokenizer/tokenizer.cc
@@ -1,7 +1,7 @@
#include "token_bpe.h"
#include "token_rwkv.h"

#include <filesystem>
#include "../filesystem.h"
#include <memory>

namespace tfm {
@@ -30,10 +30,10 @@ TfmStatus CreateBPETokenizer(const std::string& tokenizer_path,
if (type.empty()) {
if (BPETokenizer::IsSupportedModel(GetModelName(token_cfg->tokenizer_class_))) {
type = "BPE";
} else if (std::filesystem::exists(tokenizer_path + "/tokenizer.model")) {
} /* else if (fs::exists(tokenizer_path + "/tokenizer.model")) {
// if 'tokenizer.model exists in the tokenizer_path, then it is a sentencepiece model
type = "SPM";
} else {
} */ else {
status = TfmStatus(kTfmErrorInvalidArgument, "Cannot determine the tokenizer type from tokenizer_path argument");
}
}
@@ -43,7 +43,7 @@
} /* else if (type == "SPM") {
token_ptr = std::make_unique<SpmTokenizer>();
} */ else {
status = TfmStatus(kTfmErrorInvalidArgument, "Unknown tokenizer_type, (BPE, SPM, RKWV) are supported.");
status = TfmStatus(kTfmErrorInvalidArgument, "Unknown tokenizer_type, (BPE, RKWV) are supported.");
}

if (status.ok()) {