Commit
…into ryanunderhill/span_fix
RyanUnderhill committed Feb 15, 2024
2 parents 357d9c3 + b20feff commit ad0399b
Showing 11 changed files with 90 additions and 95 deletions.
55 changes: 55 additions & 0 deletions .github/workflows/mac-cpu-arm64-build.yml
@@ -0,0 +1,55 @@
name: "MacOS CPU ARM64 Build"
on: [ workflow_dispatch, pull_request ]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
env:
ort_dir: "onnxruntime-osx-arm64-1.17.0"
ort_zip: "onnxruntime-osx-arm64-1.17.0.tgz"
ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.0/onnxruntime-osx-arm64-1.17.0.tgz"
jobs:
job:
runs-on: macos-latest
steps:
- name: Checkout OnnxRuntime GenAI repo
uses: actions/checkout@v4
with:
submodules: true

- name: Install ninja
run: |
brew install ninja
- name: Download OnnxRuntime
run: |
curl -L -o ${{ env.ort_zip }} ${{ env.ort_url }}
- name: Unzip OnnxRuntime
run: |
tar -xzf ${{ env.ort_zip }}
rm ${{ env.ort_zip }}
- name: Rename OnnxRuntime to ort
run: |
mv ${{ env.ort_dir }} ort
- name: Build with CMake
run: |
cmake -G "Ninja" -B build . -DCMAKE_BUILD_TYPE=Release -DUSE_CUDA=OFF
cmake --build build --config Release --parallel
continue-on-error: true

- name: Verify Build Artifacts
if: always()
run: |
ls -l ${{ github.workspace }}/build
- name: Upload Build Artifacts
uses: actions/upload-artifact@v3
with:
name: onnxruntime-genai-mac-cpu-arm64
path: ${{ github.workspace }}/build/**/*.a


18 changes: 8 additions & 10 deletions CMakeLists.txt
@@ -95,10 +95,15 @@ else()
set(ONNXRUNTIME_EXTENSIONS_LIB "tfmtok_c.so")
endif()

if(USE_TOKENIZER)
if(NO_TOKENIZER)
add_compile_definitions(NO_TOKENIZER=1)
message("----------------Tokenizer Disabled------------------")
else()
add_subdirectory("${CMAKE_SOURCE_DIR}/src/tokenizer")
add_compile_definitions(USE_TOKENIZER=1)
message("------------------Using Tokenizer------------------")
target_include_directories(onnxruntime-genai PRIVATE ${TOKENIZER_ROOT})
target_include_directories(onnxruntime-genai-static PUBLIC ${TOKENIZER_ROOT})
target_link_libraries(onnxruntime-genai PRIVATE tokenizer)
target_link_libraries(onnxruntime-genai-static PUBLIC tokenizer)
endif()

if(ENABLE_TESTS)
@@ -153,12 +158,5 @@ foreach(DLL_FILE ${onnxruntime_libs})
)
endforeach()

if(USE_TOKENIZER)
target_include_directories(onnxruntime-genai PRIVATE ${TOKENIZER_ROOT})
target_include_directories(onnxruntime-genai-static PUBLIC ${TOKENIZER_ROOT})
target_link_libraries(onnxruntime-genai PRIVATE tokenizer)
target_link_libraries(onnxruntime-genai-static PUBLIC tokenizer)
endif()

# Have visual studio put all files into one single folder vs the default split of header files into a separate folder
source_group(TREE ${GENERATORS_ROOT} FILES ${generator_srcs})
4 changes: 2 additions & 2 deletions README.md
@@ -117,13 +117,13 @@ To source `microsoft/phi-2` optimized for your target, download and run the foll


```bash
wget https://raw.githubusercontent.com/microsoft/onnxruntime-genai/kvaishnavi/models/src/python/models/export.py
wget https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/src/python/models/export.py
```

Export int4 CPU version
```bash
huggingface-cli login --token <your HuggingFace token>
python export.py python models/export.py -m microsoft/phi-2 -p int4 -e cpu -o phi2-int4-cpu.onnx
python export.py -m microsoft/phi-2 -p int4 -e cpu -o phi2-int4-cpu.onnx
```


2 changes: 1 addition & 1 deletion cmake/options.cmake
@@ -1,7 +1,7 @@
include(CMakeDependentOption)

option(USE_CUDA "Build with CUDA support" ON)
option(USE_TOKENIZER "Build with Tokenizer support" ON)
option(NO_TOKENIZER "Don't include the Tokenizer" OFF)
option(ENABLE_PYTHON "Enable python buildings" ON)
option(ENABLE_TESTS "Enable tests" ON)

1 change: 1 addition & 0 deletions src/config.cpp
@@ -1,6 +1,7 @@
#include "generators.h"
#include "json.h"
#include <fstream>
#include <sstream>

namespace Generators {

1 change: 1 addition & 0 deletions src/json.cpp
@@ -3,6 +3,7 @@

#include <cmath>
#include <charconv>
#include <sstream>

namespace JSON {

15 changes: 12 additions & 3 deletions src/models/model.cpp
@@ -31,7 +31,18 @@ void State::ClearIO() {
outputs_.clear();
}

#if USE_TOKENIZER
#ifdef NO_TOKENIZER
Tokenizer::Tokenizer(Config& config) {
}

std::vector<int32_t> Tokenizer::Encode(const char* text) const {
throw std::runtime_error("Tokenizer not enabled");
}

std::string Tokenizer::Decode(std::span<int32_t> tokens) const {
throw std::runtime_error("Tokenizer not enabled");
}
#else
void CheckResult(tfmError_t error) {
if (error != kTfmOK)
throw std::runtime_error(TfmGetLastErrorMessage());
@@ -88,11 +99,9 @@ void Model::InitDeviceAllocator([[maybe_unused]] OrtSession& session) {
#endif
}

#if USE_TOKENIZER
std::unique_ptr<Tokenizer> Model::CreateTokenizer() const {
return std::make_unique<Tokenizer>(*config_);
}
#endif

std::unique_ptr<Model> CreateModel(OrtEnv& ort_env, const char* config_path, const ProviderOptions* provider_options) {
auto config = std::make_unique<Config>(config_path);
14 changes: 10 additions & 4 deletions src/models/model.h
@@ -1,5 +1,5 @@
#pragma once
#if USE_TOKENIZER
#ifndef NO_TOKENIZER
#include "tfmtok_c.h"
#endif

@@ -23,6 +23,15 @@ struct State {
void ClearIO(); // Clear all inputs/outputs
};

#ifdef NO_TOKENIZER
struct Tokenizer {
Tokenizer(Config& config);

std::vector<int32_t> Encode(const char* text) const;
std::string Decode(std::span<int32_t> tokens) const;
};
#else

template <typename T>
struct TfmPtr {
~TfmPtr() { TfmDispose(&p_); }
@@ -36,7 +45,6 @@ struct TfmPtr {
T* p_{};
};

#if USE_TOKENIZER
struct Tokenizer {
Tokenizer(Config& config);

@@ -51,9 +59,7 @@ struct Model {
Model(std::unique_ptr<Config> config, const ProviderOptions* provider_options);
virtual ~Model();

#if USE_TOKENIZER
std::unique_ptr<Tokenizer> CreateTokenizer() const;
#endif

virtual std::unique_ptr<State> CreateState(RoamingArray<int32_t> sequence_lengths, const GeneratorParams& params) const = 0;

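For context (not part of the diff): because the `NO_TOKENIZER` build now compiles a stub `Tokenizer` instead of hiding the type behind `#if USE_TOKENIZER`, callers no longer need compile-time guards around tokenizer use. A minimal sketch of such a caller, assuming the types live in the `Generators` namespace and a `models/model.h` include path (neither is confirmed by the hunks shown here):

```cpp
#include <iostream>
#include <stdexcept>

#include "models/model.h"  // assumed include path for Generators::Model / Tokenizer

// Sketch only: compiles unchanged whether or not the project was configured
// with -DNO_TOKENIZER=ON. In a NO_TOKENIZER build, CreateTokenizer() still
// returns the stub object, and Encode/Decode throw "Tokenizer not enabled".
void TryEncode(const Generators::Model& model, const char* text) {
  try {
    auto tokenizer = model.CreateTokenizer();  // std::unique_ptr<Tokenizer>
    auto tokens = tokenizer->Encode(text);     // std::vector<int32_t>
    std::cout << "Encoded " << tokens.size() << " tokens\n";
  } catch (const std::runtime_error& e) {
    std::cout << "Tokenizer unavailable: " << e.what() << "\n";
  }
}
```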
15 changes: 0 additions & 15 deletions src/python/python.cpp
@@ -84,17 +84,6 @@ std::string ToString(const GeneratorParams& v) {
return oss.str();
}

#if 0
std::string ToString(const Gpt_Model& v) {
std::ostringstream oss;
oss << "Gpt_Model("
"vocab_size="
<< v.vocab_size_ << ", head_count=" << v.head_count_ << ", hidden_size=" << v.hidden_size_ << ", layer_count=" << v.layer_count_ << ")";

return oss.str();
}
#endif

std::unique_ptr<OrtEnv> g_ort_env;

OrtEnv& GetOrtEnv() {
@@ -254,11 +243,9 @@ PYBIND11_MODULE(onnxruntime_genai, m) {
m.def("print", &TestFP32, "Test float32");
m.def("print", &TestFP16, "Test float16");

#if USE_TOKENIZER
pybind11::class_<Tokenizer>(m, "Tokenizer")
.def("encode", &Tokenizer::Encode)
.def("decode", [](const Tokenizer& t, pybind11::array_t<int32_t> tokens) { return t.Decode(ToSpan(tokens)); });
#endif

pybind11::class_<Model>(m, "Model")
.def(pybind11::init([](const std::string& config_path, DeviceType device_type) {
@@ -267,9 +254,7 @@
}),
"str"_a, "device_type"_a = DeviceType::Auto)
.def("generate", [](Model& model, PySearchParams& search_params) { search_params.Prepare(); return Generate(model, search_params); })
#if USE_TOKENIZER
.def("create_tokenizer", [](Model& model) { return model.CreateTokenizer(); })
#endif
.def_property_readonly("device_type", [](const Model& s) { return s.device_type_; });

pybind11::class_<PyGenerator>(m, "Generator")
2 changes: 0 additions & 2 deletions test/main.cpp
@@ -11,7 +11,6 @@ void Test_GreedySearch_Gpt_Fp32();
void Test_BeamSearch_Gpt_Fp32();

#if USE_CUDA
void Test_Phi2_Cuda();
void Test_GreedySearch_Gpt_Cuda();
void Test_BeamSearch_Gpt_Cuda();
#endif
@@ -34,7 +33,6 @@ int main() {
#if USE_CUDA
Test_GreedySearch_Gpt_Cuda();
Test_BeamSearch_Gpt_Cuda();
Test_Phi2_Cuda();
#endif
} catch (const std::exception& e) {
std::cout << "Fatal Exception: " << e.what() << std::endl;
58 changes: 0 additions & 58 deletions test/tests.cpp
@@ -226,62 +226,4 @@ void Test_BeamSearch_Gpt_Cuda() {
for (auto model_path : c_tiny_gpt2_model_paths)
Test_BeamSearch_Gpt_Cuda(model_path.first, model_path.second);
}

void Test_Phi2_Cuda() {
#if TEST_PHI2
std::cout << "Testing_Phi2\r\n";
#if USE_TOKENIZER

auto prompt = R"(
def print_prime(n):
'''
Print all primes between 1 and n
'''
)";

std::cout << "With prompt:" << prompt << "\r\n";

auto provider_options = Generators::GetDefaultProviderOptions(Generators::DeviceType::CUDA);
auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "phi-2", &provider_options);
auto tokenizer = model->CreateTokenizer();
auto tokens = tokenizer->Encode(prompt);

Generators::SearchParams params{*model};
params.batch_size = 1;
params.sequence_length = static_cast<int>(tokens.size());
params.input_ids = tokens;
params.max_length = 128;

// Original version
auto search = params.CreateSearch();
auto state = model->CreateState(search->GetSequenceLengths(), params);

while (!search->IsDone()) {
search->SetLogits(state->Run(search->GetSequenceLength(), search->GetNextTokens()));
search->SelectTop();
}

auto result = search->GetSequence(0);

// Generator version
auto generator = model->CreateGenerator();
while (!generator->IsDone()) {
auto logits = generator->RunStep();

generator->SelectTop();
}

auto result = generator->GetSequence(0);

// High level version
auto result = model->Generate(params);

std::cout << tokenizer->Decode(result) << "\r\n";
std::cout << "Test complete\r\n";
#else
std::cout << "Test skipped - not built with onnxruntime extensions\r\n";
#endif
#endif
}

#endif
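For reference (not part of the commit): the deleted `Test_Phi2_Cuda` walked through three equivalent generation paths and, as written, declared `result` three times in one scope. Below is a hedged sketch of only the high-level `Model::Generate` path, using names that appear in this diff; `MODEL_PATH`, the `generators.h` umbrella header, and the CUDA provider options are carried over from the deleted test and should be treated as assumptions rather than documented API.

```cpp
#include <iostream>

#include "generators.h"    // assumed umbrella header, as included by src/config.cpp
#include "models/model.h"  // assumed include path for Generators::Model / Tokenizer

// Hedged sketch of the high-level path the deleted test exercised:
// tokenize a prompt, run Model::Generate, then decode the result.
void RunPhi2HighLevel(OrtEnv& ort_env) {
  auto provider_options = Generators::GetDefaultProviderOptions(Generators::DeviceType::CUDA);
  auto model = Generators::CreateModel(ort_env, MODEL_PATH "phi-2", &provider_options);
  auto tokenizer = model->CreateTokenizer();
  auto tokens = tokenizer->Encode("def print_prime(n):");

  Generators::SearchParams params{*model};
  params.batch_size = 1;
  params.sequence_length = static_cast<int>(tokens.size());
  params.input_ids = tokens;
  params.max_length = 128;

  auto result = model->Generate(params);           // high-level generation API
  std::cout << tokenizer->Decode(result) << "\n";  // token ids back to text
}
```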
