Commit
…into ryanunderhill/span_fix
RyanUnderhill committed Feb 15, 2024
2 parents 357d9c3 + b20feff commit ad0399b
Showing 11 changed files with 90 additions and 95 deletions.
55 changes: 55 additions & 0 deletions .github/workflows/mac-cpu-arm64-build.yml
@@ -0,0 +1,55 @@
name: "MacOS CPU ARM64 Build"
on: [ workflow_dispatch, pull_request ]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
env:
ort_dir: "onnxruntime-osx-arm64-1.17.0"
ort_zip: "onnxruntime-osx-arm64-1.17.0.tgz"
ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.0/onnxruntime-osx-arm64-1.17.0.tgz"
jobs:
job:
runs-on: macos-latest
steps:
- name: Checkout OnnxRuntime GenAI repo
uses: actions/checkout@v4
with:
submodules: true

- name: Install ninja
run: |
brew install ninja
- name: Download OnnxRuntime
run: |
curl -L -o ${{ env.ort_zip }} ${{ env.ort_url }}
- name: Unzip OnnxRuntime
run: |
tar -xzf ${{ env.ort_zip }}
rm ${{ env.ort_zip }}
- name: Rename OnnxRuntime to ort
run: |
mv ${{ env.ort_dir }} ort
- name: Build with CMake
run: |
cmake -G "Ninja" -B build . -DCMAKE_BUILD_TYPE=Release -DUSE_CUDA=OFF
cmake --build build --config Release --parallel
continue-on-error: true

- name: Verify Build Artifacts
if: always()
run: |
ls -l ${{ github.workspace }}/build
- name: Upload Build Artifacts
uses: actions/upload-artifact@v3
with:
name: onnxruntime-genai-mac-cpu-arm64
path: ${{ github.workspace }}/build/**/*.a


18 changes: 8 additions & 10 deletions CMakeLists.txt
@@ -95,10 +95,15 @@ else()
set(ONNXRUNTIME_EXTENSIONS_LIB "tfmtok_c.so")
endif()

if(USE_TOKENIZER)
if(NO_TOKENIZER)
add_compile_definitions(NO_TOKENIZER=1)
message("----------------Tokenizer Disabled------------------")
else()
add_subdirectory("${CMAKE_SOURCE_DIR}/src/tokenizer")
add_compile_definitions(USE_TOKENIZER=1)
message("------------------Using Tokenizer------------------")
target_include_directories(onnxruntime-genai PRIVATE ${TOKENIZER_ROOT})
target_include_directories(onnxruntime-genai-static PUBLIC ${TOKENIZER_ROOT})
target_link_libraries(onnxruntime-genai PRIVATE tokenizer)
target_link_libraries(onnxruntime-genai-static PUBLIC tokenizer)
endif()

if(ENABLE_TESTS)
@@ -153,12 +158,5 @@ foreach(DLL_FILE ${onnxruntime_libs})
)
endforeach()

if(USE_TOKENIZER)
target_include_directories(onnxruntime-genai PRIVATE ${TOKENIZER_ROOT})
target_include_directories(onnxruntime-genai-static PUBLIC ${TOKENIZER_ROOT})
target_link_libraries(onnxruntime-genai PRIVATE tokenizer)
target_link_libraries(onnxruntime-genai-static PUBLIC tokenizer)
endif()

# Have visual studio put all files into one single folder vs the default split of header files into a separate folder
source_group(TREE ${GENERATORS_ROOT} FILES ${generator_srcs})
4 changes: 2 additions & 2 deletions README.md
@@ -117,13 +117,13 @@ To source `microsoft/phi-2` optimized for your target, download and run the foll


```bash
wget https://raw.githubusercontent.com/microsoft/onnxruntime-genai/kvaishnavi/models/src/python/models/export.py
wget https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/src/python/models/export.py
```

Export int4 CPU version
```bash
huggingface-cli login --token <your HuggingFace token>
python export.py python models/export.py -m microsoft/phi-2 -p int4 -e cpu -o phi2-int4-cpu.onnx
python export.py -m microsoft/phi-2 -p int4 -e cpu -o phi2-int4-cpu.onnx
```


2 changes: 1 addition & 1 deletion cmake/options.cmake
@@ -1,7 +1,7 @@
include(CMakeDependentOption)

option(USE_CUDA "Build with CUDA support" ON)
option(USE_TOKENIZER "Build with Tokenizer support" ON)
option(NO_TOKENIZER "Don't include the Tokenizer" OFF)
option(ENABLE_PYTHON "Enable python buildings" ON)
option(ENABLE_TESTS "Enable tests" ON)

1 change: 1 addition & 0 deletions src/config.cpp
@@ -1,6 +1,7 @@
#include "generators.h"
#include "json.h"
#include <fstream>
#include <sstream>

namespace Generators {

1 change: 1 addition & 0 deletions src/json.cpp
@@ -3,6 +3,7 @@

#include <cmath>
#include <charconv>
#include <sstream>

namespace JSON {

15 changes: 12 additions & 3 deletions src/models/model.cpp
@@ -31,7 +31,18 @@ void State::ClearIO() {
outputs_.clear();
}

#if USE_TOKENIZER
#ifdef NO_TOKENIZER
Tokenizer::Tokenizer(Config& config) {
}

std::vector<int32_t> Tokenizer::Encode(const char* text) const {
throw std::runtime_error("Tokenizer not enabled");
}

std::string Tokenizer::Decode(std::span<int32_t> tokens) const {
throw std::runtime_error("Tokenizer not enabled");
}
#else
void CheckResult(tfmError_t error) {
if (error != kTfmOK)
throw std::runtime_error(TfmGetLastErrorMessage());
@@ -88,11 +99,9 @@ void Model::InitDeviceAllocator([[maybe_unused]] OrtSession& session) {
#endif
}

#if USE_TOKENIZER
std::unique_ptr<Tokenizer> Model::CreateTokenizer() const {
return std::make_unique<Tokenizer>(*config_);
}
#endif

std::unique_ptr<Model> CreateModel(OrtEnv& ort_env, const char* config_path, const ProviderOptions* provider_options) {
auto config = std::make_unique<Config>(config_path);
14 changes: 10 additions & 4 deletions src/models/model.h
@@ -1,5 +1,5 @@
#pragma once
#if USE_TOKENIZER
#ifndef NO_TOKENIZER
#include "tfmtok_c.h"
#endif

@@ -23,6 +23,15 @@ struct State {
void ClearIO(); // Clear all inputs/outputs
};

#ifdef NO_TOKENIZER
struct Tokenizer {
Tokenizer(Config& config);

std::vector<int32_t> Encode(const char* text) const;
std::string Decode(std::span<int32_t> tokens) const;
};
#else

template <typename T>
struct TfmPtr {
~TfmPtr() { TfmDispose(&p_); }
@@ -36,7 +45,6 @@ struct TfmPtr {
T* p_{};
};

#if USE_TOKENIZER
struct Tokenizer {
Tokenizer(Config& config);

@@ -51,9 +59,7 @@ struct Model {
Model(std::unique_ptr<Config> config, const ProviderOptions* provider_options);
virtual ~Model();

#if USE_TOKENIZER
std::unique_ptr<Tokenizer> CreateTokenizer() const;
#endif

virtual std::unique_ptr<State> CreateState(RoamingArray<int32_t> sequence_lengths, const GeneratorParams& params) const = 0;

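For context (not part of the diff): because the `NO_TOKENIZER` build now compiles a stub `Tokenizer` instead of hiding the type behind `#if USE_TOKENIZER`, callers no longer need compile-time guards around tokenizer use. A minimal sketch of such a caller, assuming the types live in the `Generators` namespace and a `models/model.h` include path (neither is confirmed by the hunks shown here):

```cpp
#include <iostream>
#include <stdexcept>

#include "models/model.h"  // assumed include path for Generators::Model / Tokenizer

// Sketch only: compiles unchanged whether or not the project was configured
// with -DNO_TOKENIZER=ON. In a NO_TOKENIZER build, CreateTokenizer() still
// returns the stub object, and Encode/Decode throw "Tokenizer not enabled".
void TryEncode(const Generators::Model& model, const char* text) {
  try {
    auto tokenizer = model.CreateTokenizer();  // std::unique_ptr<Tokenizer>
    auto tokens = tokenizer->Encode(text);     // std::vector<int32_t>
    std::cout << "Encoded " << tokens.size() << " tokens\n";
  } catch (const std::runtime_error& e) {
    std::cout << "Tokenizer unavailable: " << e.what() << "\n";
  }
}
```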
15 changes: 0 additions & 15 deletions src/python/python.cpp
@@ -84,17 +84,6 @@ std::string ToString(const GeneratorParams& v) {
return oss.str();
}

#if 0
std::string ToString(const Gpt_Model& v) {
std::ostringstream oss;
oss << "Gpt_Model("
"vocab_size="
<< v.vocab_size_ << ", head_count=" << v.head_count_ << ", hidden_size=" << v.hidden_size_ << ", layer_count=" << v.layer_count_ << ")";

return oss.str();
}
#endif

std::unique_ptr<OrtEnv> g_ort_env;

OrtEnv& GetOrtEnv() {
@@ -254,11 +243,9 @@ PYBIND11_MODULE(onnxruntime_genai, m) {
m.def("print", &TestFP32, "Test float32");
m.def("print", &TestFP16, "Test float16");

#if USE_TOKENIZER
pybind11::class_<Tokenizer>(m, "Tokenizer")
.def("encode", &Tokenizer::Encode)
.def("decode", [](const Tokenizer& t, pybind11::array_t<int32_t> tokens) { return t.Decode(ToSpan(tokens)); });
#endif

pybind11::class_<Model>(m, "Model")
.def(pybind11::init([](const std::string& config_path, DeviceType device_type) {
@@ -267,9 +254,7 @@
}),
"str"_a, "device_type"_a = DeviceType::Auto)
.def("generate", [](Model& model, PySearchParams& search_params) { search_params.Prepare(); return Generate(model, search_params); })
#if USE_TOKENIZER
.def("create_tokenizer", [](Model& model) { return model.CreateTokenizer(); })
#endif
.def_property_readonly("device_type", [](const Model& s) { return s.device_type_; });

pybind11::class_<PyGenerator>(m, "Generator")
2 changes: 0 additions & 2 deletions test/main.cpp
@@ -11,7 +11,6 @@ void Test_GreedySearch_Gpt_Fp32();
void Test_BeamSearch_Gpt_Fp32();

#if USE_CUDA
void Test_Phi2_Cuda();
void Test_GreedySearch_Gpt_Cuda();
void Test_BeamSearch_Gpt_Cuda();
#endif
@@ -34,7 +33,6 @@ int main() {
#if USE_CUDA
Test_GreedySearch_Gpt_Cuda();
Test_BeamSearch_Gpt_Cuda();
Test_Phi2_Cuda();
#endif
} catch (const std::exception& e) {
std::cout << "Fatal Exception: " << e.what() << std::endl;
58 changes: 0 additions & 58 deletions test/tests.cpp
@@ -226,62 +226,4 @@ void Test_BeamSearch_Gpt_Cuda() {
for (auto model_path : c_tiny_gpt2_model_paths)
Test_BeamSearch_Gpt_Cuda(model_path.first, model_path.second);
}

void Test_Phi2_Cuda() {
#if TEST_PHI2
std::cout << "Testing_Phi2\r\n";
#if USE_TOKENIZER

auto prompt = R"(
def print_prime(n):
'''
Print all primes between 1 and n
'''
)";

std::cout << "With prompt:" << prompt << "\r\n";

auto provider_options = Generators::GetDefaultProviderOptions(Generators::DeviceType::CUDA);
auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "phi-2", &provider_options);
auto tokenizer = model->CreateTokenizer();
auto tokens = tokenizer->Encode(prompt);

Generators::SearchParams params{*model};
params.batch_size = 1;
params.sequence_length = static_cast<int>(tokens.size());
params.input_ids = tokens;
params.max_length = 128;

// Original version
auto search = params.CreateSearch();
auto state = model->CreateState(search->GetSequenceLengths(), params);

while (!search->IsDone()) {
search->SetLogits(state->Run(search->GetSequenceLength(), search->GetNextTokens()));
search->SelectTop();
}

auto result = search->GetSequence(0);

// Generator version
auto generator = model->CreateGenerator();
while (!generator->IsDone()) {
auto logits = generator->RunStep();

generator->SelectTop();
}

auto result = generator->GetSequence(0);

// High level version
auto result = model->Generate(params);

std::cout << tokenizer->Decode(result) << "\r\n";
std::cout << "Test complete\r\n";
#else
std::cout << "Test skipped - not built with onnxruntime extensions\r\n";
#endif
#endif
}

#endif
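For reference (not part of the commit): the deleted `Test_Phi2_Cuda` walked through three equivalent generation paths and, as written, declared `result` three times in one scope. Below is a hedged sketch of only the high-level `Model::Generate` path, using names that appear in this diff; `MODEL_PATH`, the `generators.h` umbrella header, and the CUDA provider options are carried over from the deleted test and should be treated as assumptions rather than documented API.

```cpp
#include <iostream>

#include "generators.h"    // assumed umbrella header, as included by src/config.cpp
#include "models/model.h"  // assumed include path for Generators::Model / Tokenizer

// Hedged sketch of the high-level path the deleted test exercised:
// tokenize a prompt, run Model::Generate, then decode the result.
void RunPhi2HighLevel(OrtEnv& ort_env) {
  auto provider_options = Generators::GetDefaultProviderOptions(Generators::DeviceType::CUDA);
  auto model = Generators::CreateModel(ort_env, MODEL_PATH "phi-2", &provider_options);
  auto tokenizer = model->CreateTokenizer();
  auto tokens = tokenizer->Encode("def print_prime(n):");

  Generators::SearchParams params{*model};
  params.batch_size = 1;
  params.sequence_length = static_cast<int>(tokens.size());
  params.input_ids = tokens;
  params.max_length = 128;

  auto result = model->Generate(params);           // high-level generation API
  std::cout << tokenizer->Decode(result) << "\n";  // token ids back to text
}
```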
