diff --git a/.github/workflows/llama_cpp_plugin_build_and_test.yml b/.github/workflows/llama_cpp_plugin_build_and_test.yml new file mode 100644 index 000000000..4d0af3bdf --- /dev/null +++ b/.github/workflows/llama_cpp_plugin_build_and_test.yml @@ -0,0 +1,76 @@ +name: llama_cpp_plugin_build_and_test + +on: + pull_request: + paths: + - 'modules/llama_cpp_plugin/**' + +jobs: + build_ubuntu20: + runs-on: ubuntu-20.04-8-cores + steps: + - name: Setup cmake + uses: jwlawson/actions-setup-cmake@v1.14 + with: + cmake-version: '3.24.x' + + - name: Checkout openvino_contrib + uses: actions/checkout@v4 + with: + submodules: recursive + path: openvino_contrib + + - name: Checkout openvino + uses: actions/checkout@v4 + with: + submodules: recursive + repository: openvinotoolkit/openvino + path: openvino + + - name: CMake - configure + run: cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=${{ github.workspace }}/openvino_contrib/modules/llama_cpp_plugin -DENABLE_TESTS=ON -DENABLE_FUNCTIONAL_TESTS=ON -DENABLE_PLUGINS_XML=ON -DENABLE_LLAMA_CPP_PLUGIN_REGISTRATION=ON openvino + + - name: CMake - build + run: cmake --build build -j`nproc` -- llama_cpp_plugin llama_cpp_e2e_tests + + + - name: Upload build artifacts + uses: actions/upload-artifact@v4 + with: + name: build_artifacts + path: ${{ github.workspace }}/openvino/bin/intel64/Release/ + + test_ubuntu20: + needs: build_ubuntu20 + runs-on: ubuntu-20.04 + steps: + - name: Download build artifacts + uses: actions/download-artifact@v4 + with: + name: build_artifacts + path: ${{ github.workspace }}/binaries + + - name: Prepare test data - checkout llama.cpp repo + uses: actions/checkout@v4 + with: + repository: ggerganov/llama.cpp + path: llama.cpp + + - name: Prepare test data - convert test model files + run: | + pip install -r llama.cpp/requirements/requirements-convert-hf-to-gguf.txt + huggingface-cli download gpt2 model.safetensors tokenizer.json tokenizer_config.json vocab.json config.json merges.txt --local-dir hf_gpt2 + mkdir -p ${{ github.workspace }}/test_data + python3 llama.cpp/convert-hf-to-gguf.py hf_gpt2 --outtype f32 --outfile ${{ github.workspace }}/test_data/gpt2.gguf + + - name: Install libtbb2 + run: | + wget https://storage.openvinotoolkit.org/dependencies/thirdparty/linux/oneapi-tbb-2021.2.4-lin.tgz + mkdir -p tbb + tar xvzf oneapi-tbb-2021.2.4-lin.tgz + + - name: Run E2E tests + run: | + chmod +x ${{ github.workspace }}/binaries/llama_cpp_e2e_tests + export LD_LIBRARY_PATH=${{ github.workspace }}/binaries:${{ github.workspace }}/tbb/lib + ${{ github.workspace }}/binaries/llama_cpp_e2e_tests diff --git a/modules/llama_cpp_plugin/.clang-format b/modules/llama_cpp_plugin/.clang-format new file mode 100644 index 000000000..ebe747b78 --- /dev/null +++ b/modules/llama_cpp_plugin/.clang-format @@ -0,0 +1,28 @@ +BasedOnStyle: Google +IndentWidth: 4 +UseTab: Never +ColumnLimit: 120 + +Language: Cpp +Standard: Cpp11 + +AccessModifierOffset: -4 +AlignConsecutiveMacros: true +AllowAllArgumentsOnNextLine: false +AllowAllConstructorInitializersOnNextLine: false +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortFunctionsOnASingleLine: Empty +AllowShortIfStatementsOnASingleLine: Never +AllowShortLambdasOnASingleLine: Empty +AllowShortLoopsOnASingleLine: false +AlwaysBreakBeforeMultilineStrings: false +BinPackArguments: false +BinPackParameters: false +CommentPragmas: '^#' +DerivePointerAlignment: false +FixNamespaceComments: true +IndentCaseLabels: false +IndentPPDirectives: AfterHash +ForEachMacros: + - foreach + - 
FOREACH_CHILD
diff --git a/modules/llama_cpp_plugin/CMakeLists.txt b/modules/llama_cpp_plugin/CMakeLists.txt
new file mode 100644
index 000000000..8c9939eab
--- /dev/null
+++ b/modules/llama_cpp_plugin/CMakeLists.txt
@@ -0,0 +1,32 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+cmake_minimum_required(VERSION 3.13)
+
+project(LlamaCppPlugin)
+
+find_package(OpenVINODeveloperPackage REQUIRED)
+
+ov_option(ENABLE_LLAMA_CPP_PLUGIN_REGISTRATION "Enables registration of LLAMA_CPP plugin" OFF)
+
+add_subdirectory(src)
+
+include(FetchContent)
+
+FetchContent_Declare(
+    llama_cpp
+    GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+    GIT_TAG b2417
+    )
+
+FetchContent_MakeAvailable(llama_cpp)
+
+if(ENABLE_TESTS)
+    include(CTest)
+    enable_testing()
+    add_subdirectory(tests/e2e)
+endif()
+
+# install
+
+if(OpenVINODeveloperPackage_FOUND)
+    ov_cpack(LlamaCppPlugin)
+endif()
diff --git a/modules/llama_cpp_plugin/README.md b/modules/llama_cpp_plugin/README.md
new file mode 100644
index 000000000..df20db7d3
--- /dev/null
+++ b/modules/llama_cpp_plugin/README.md
@@ -0,0 +1,52 @@
+### Build instructions
+
+This plugin should be built in the same fashion as the rest of the modules:
+
+1. Check out the OpenVINO repository proper (https://github.com/openvinotoolkit/openvino)
+2. Configure the CMake build of the OpenVINO repository, making sure to point the corresponding CMake option to the location of the `openvino_contrib` repository. The command below, executed in the `openvino` repo root, points `OPENVINO_EXTRA_MODULES` directly at the `llama_cpp_plugin` module directory, so the modules other than the `llama_cpp_plugin` module will not be built, which saves build time - point it at the whole `modules` directory and adjust the `-DBUILD_*` options if you need the other modules as well.
+
+```bash
+cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=<openvino_contrib_root>/modules/llama_cpp_plugin -DENABLE_PLUGINS_XML=ON -DENABLE_LLAMA_CPP_PLUGIN_REGISTRATION=ON .
+```
+
+3. Build the plugin either as part of the complete openvino build by executing:
+
+```bash
+cmake --build build --parallel
+```
+
+or separately by specifying only the `llama_cpp_plugin` target:
+
+```bash
+cmake --build build --parallel -- llama_cpp_plugin
+```
+
+4. Now you can use the built `libllama_cpp_plugin.so` as a regular OV plugin with the device name `"LLAMA_CPP"` to load GGUF files directly and infer them through the OV API, with llama.cpp execution under the hood. Make sure that the plugin is discoverable by the OV runtime (e.g. by copying the built `libllama_cpp_plugin.so`, `libllama.so` and the autogenerated `plugins.xml` from the build location into your OV binaries location, or by setting `LD_LIBRARY_PATH` appropriately).
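+
+As an alternative to copying `plugins.xml` into the OV binaries location, the plugin can be registered with an `ov::Core` instance explicitly. The snippet below is a minimal sketch - the shared library path is an assumption, substitute the actual location of your built `libllama_cpp_plugin.so` (the `libllama.so` built alongside it must still be resolvable, e.g. via `LD_LIBRARY_PATH`):
+
+```C++
+#include "openvino/runtime/core.hpp"
+
+ov::Core core;
+// Associates the "LLAMA_CPP" device name with the plugin library for this Core instance;
+// afterwards core.compile_model("model.gguf", "LLAMA_CPP") works as usual.
+core.register_plugin("/path/to/libllama_cpp_plugin.so", "LLAMA_CPP");
+```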
+
+#### Example of LLM inference code
+
+```C++
+#include <algorithm>  // std::max_element
+#include <numeric>    // std::iota
+
+#include "openvino/openvino.hpp"
+
+ov::Core core;
+auto model = core.compile_model("model.gguf", "LLAMA_CPP");
+
+// The plugin expects i64 token IDs and matching position IDs.
+auto input_ids = ov::Tensor(ov::element::Type_t::i64, {1, 128});
+auto position_ids = ov::Tensor(ov::element::Type_t::i64, {1, 128});
+// ... fill input_ids with the tokenized prompt ...
+std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), 0);
+
+auto infer_request = model.create_infer_request();
+infer_request.set_tensor("input_ids", input_ids);
+infer_request.set_tensor("position_ids", position_ids);
+infer_request.infer();
+
+// Greedily pick the next token from the logits of the last prompt position.
+size_t vocab_size = infer_request.get_tensor("logits").get_shape().back();
+float* logits = infer_request.get_tensor("logits").data<float>() + (input_ids.get_size() - 1) * vocab_size;
+int64_t out_token = std::max_element(logits, logits + vocab_size) - logits;
+```
+
+The models obtained by the `.compile_model` call with the `LLAMA_CPP` plugin expose two inputs (`input_ids` and `position_ids`) and a single output (`logits`) with the same meaning as the corresponding arguments of the LLM model representations in the huggingface `transformers` repository. The `attention_mask` and `beam_idx` inputs may be set as well, but will have no effect on the execution.
+
+Only a batch size of 1 is currently supported.
+
+
+
diff --git a/modules/llama_cpp_plugin/include/compiled_model.hpp b/modules/llama_cpp_plugin/include/compiled_model.hpp
new file mode 100644
index 000000000..4dce17819
--- /dev/null
+++ b/modules/llama_cpp_plugin/include/compiled_model.hpp
@@ -0,0 +1,78 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef LLAMA_CPP_COMPILED_MODEL_HPP
+#define LLAMA_CPP_COMPILED_MODEL_HPP
+
+#include "llama.h"
+#include "openvino/runtime/icompiled_model.hpp"
+#include "openvino/runtime/isync_infer_request.hpp"
+
+namespace ov {
+namespace llama_cpp_plugin {
+class LlamaCppSyncInferRequest;
+class LlamaCppPlugin;
+class LlamaCppState;
+class LlamaCppModel : public ICompiledModel {
+public:
+    LlamaCppModel(const std::string& gguf_fname, const std::shared_ptr<const ov::IPlugin>& plugin);
+    /**
+     * @brief Export compiled model to stream
+     *
+     * @param model output stream
+     */
+    virtual void export_model(std::ostream& model) const override;
+
+    /**
+     * @brief Returns runtime model
+     *
+     * @return OpenVINO Model which represents runtime graph
+     */
+    virtual std::shared_ptr<const ov::Model> get_runtime_model() const override;
+
+    /**
+     * @brief Allows to set property
+     *
+     * @param properties new plugin properties
+     */
+    virtual void set_property(const ov::AnyMap& properties) override;
+
+    /**
+     * @brief Returns property
+     *
+     * @param name Property name
+     *
+     * @return Property value
+     **/
+    virtual ov::Any get_property(const std::string& name) const override;
+    virtual const std::vector<ov::Output<const ov::Node>>& inputs() const override;
+    virtual const std::vector<ov::Output<const ov::Node>>& outputs() const override;
+    virtual ~LlamaCppModel();
+
+protected:
+    /**
+     * @brief Method creates infer request implementation
+     *
+     * @return Sync infer request
+     */
+    virtual std::shared_ptr<ov::ISyncInferRequest> create_sync_infer_request() const override;
+
+private:
+    gguf_context* m_gguf_ctx = nullptr;
+    std::string m_gguf_fname;
+
+    llama_model* m_llama_model_ptr = nullptr;
+    llama_context* m_llama_ctx = nullptr;
+    std::shared_ptr<ov::Model> m_fake_model;
+
+    std::vector<ov::Output<const ov::Node>> m_fake_inputs;
+    std::vector<ov::Output<const ov::Node>> m_fake_outputs;
+
+    friend class ov::llama_cpp_plugin::LlamaCppSyncInferRequest;
+    friend class ov::llama_cpp_plugin::LlamaCppState;
+};
+}  // namespace llama_cpp_plugin
+}  // namespace ov
+
+#endif  // LLAMA_CPP_COMPILED_MODEL_HPP
diff --git a/modules/llama_cpp_plugin/include/infer_request.hpp b/modules/llama_cpp_plugin/include/infer_request.hpp
new file mode 100644
index 000000000..8f298ab57
--- /dev/null
+++ b/modules/llama_cpp_plugin/include/infer_request.hpp
@@ -0,0 +1,32 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef LLAMA_CPP_INFER_REQUEST_HPP
+#define LLAMA_CPP_INFER_REQUEST_HPP
+
+#include "compiled_model.hpp"
+#include "openvino/openvino.hpp"
+
+namespace ov {
+namespace llama_cpp_plugin {
+
+class LlamaCppSyncInferRequest : public ISyncInferRequest {
+public:
+    explicit LlamaCppSyncInferRequest(const std::shared_ptr<const LlamaCppModel>& compiled_model);
+    virtual ~LlamaCppSyncInferRequest(){};
+
+    virtual void set_tensors_impl(const ov::Output<const ov::Node> port,
+                                  const std::vector<ov::SoPtr<ov::ITensor>>& tensors) override;
+
+    virtual void infer() override;
+    virtual std::vector<ov::ProfilingInfo> get_profiling_info() const override;
+    virtual std::vector<ov::SoPtr<ov::IVariableState>> query_state() const override;
+
+private:
+    std::shared_ptr<const LlamaCppModel> m_compiled_model_ptr;
+};
+
+}  // namespace llama_cpp_plugin
+}  // namespace ov
+
+#endif /* LLAMA_CPP_INFER_REQUEST_HPP */
diff --git a/modules/llama_cpp_plugin/include/plugin.hpp b/modules/llama_cpp_plugin/include/plugin.hpp
new file mode 100644
index 000000000..1bcb6abbd
--- /dev/null
+++ b/modules/llama_cpp_plugin/include/plugin.hpp
@@ -0,0 +1,46 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef LLAMA_CPP_PLUGIN_HPP
+#define LLAMA_CPP_PLUGIN_HPP
+
+#include "openvino/runtime/iplugin.hpp"
+
+namespace ov {
+namespace llama_cpp_plugin {
+class LlamaCppPlugin : public IPlugin {
+public:
+    LlamaCppPlugin();
+    virtual std::shared_ptr<ov::ICompiledModel> compile_model(const std::shared_ptr<const ov::Model>& model,
+                                                              const ov::AnyMap& properties) const override;
+
+    virtual std::shared_ptr<ov::ICompiledModel> compile_model(
+        const std::shared_ptr<const ov::Model>& model,
+        const ov::AnyMap& properties,
+        const ov::SoPtr<ov::IRemoteContext>& context) const override;
+
+    virtual void set_property(const ov::AnyMap& properties) override;
+
+    virtual ov::Any get_property(const std::string& name, const ov::AnyMap& arguments) const override;
+
+    virtual ov::SoPtr<ov::IRemoteContext> create_context(const ov::AnyMap& remote_properties) const override;
+
+    virtual ov::SoPtr<ov::IRemoteContext> get_default_context(const ov::AnyMap& remote_properties) const override;
+
+    virtual std::shared_ptr<ov::ICompiledModel> import_model(std::istream& model,
+                                                             const ov::AnyMap& properties) const override;
+
+    virtual std::shared_ptr<ov::ICompiledModel> compile_model(const std::string& fname,
+                                                              const ov::AnyMap& properties) const override;
+
+    virtual std::shared_ptr<ov::ICompiledModel> import_model(std::istream& model,
+                                                             const ov::SoPtr<ov::IRemoteContext>& context,
+                                                             const ov::AnyMap& properties) const override;
+
+    virtual ov::SupportedOpsMap query_model(const std::shared_ptr<const ov::Model>& model,
+                                            const ov::AnyMap& properties) const override;
+};
+}  // namespace llama_cpp_plugin
+}  // namespace ov
+
+#endif  // LLAMA_CPP_PLUGIN_HPP
diff --git a/modules/llama_cpp_plugin/include/state.hpp b/modules/llama_cpp_plugin/include/state.hpp
new file mode 100644
index 000000000..229970894
--- /dev/null
+++ b/modules/llama_cpp_plugin/include/state.hpp
@@ -0,0 +1,27 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef LLAMA_CPP_STATE_HPP
+#define LLAMA_CPP_STATE_HPP
+
+#include "compiled_model.hpp"
+#include "openvino/runtime/ivariable_state.hpp"
+
+namespace ov {
+namespace llama_cpp_plugin {
+class LlamaCppState : public IVariableState {
+public:
+    LlamaCppState() = delete;
+    LlamaCppState(const std::shared_ptr<const LlamaCppModel>& model_ptr)
+        : IVariableState("llama_cpp_state"),
+          m_model_ptr(model_ptr) {}
+    void reset() override {
+        llama_kv_cache_clear(m_model_ptr->m_llama_ctx);
+    }
+
+private:
+    const std::shared_ptr<const LlamaCppModel>& m_model_ptr;
+};
+}  // namespace llama_cpp_plugin
+}  // namespace ov
+#endif  // LLAMA_CPP_STATE_HPP
diff --git a/modules/llama_cpp_plugin/src/CMakeLists.txt b/modules/llama_cpp_plugin/src/CMakeLists.txt
new file mode 100644
index 000000000..258df852f
--- /dev/null
+++ b/modules/llama_cpp_plugin/src/CMakeLists.txt
@@ -0,0 +1,53 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+set(PLUGIN_LIBRARY_NAME CACHE STRING "Library name for the generated plugin")
+if(NOT PLUGIN_LIBRARY_NAME)
+    set(PLUGIN_LIBRARY_NAME "llama_cpp_plugin")
+endif()
+
+set(PLUGIN_DEVICE_NAME CACHE STRING "Device name for the resulting plugin")
+if(NOT PLUGIN_DEVICE_NAME)
+    set(PLUGIN_DEVICE_NAME "LLAMA_CPP")
+endif()
+
+set(TARGET_NAME ${PLUGIN_LIBRARY_NAME})
+
+file(GLOB_RECURSE SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
+file(GLOB_RECURSE HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp)
+
+if(NOT ENABLE_LLAMA_CPP_PLUGIN_REGISTRATION)
+    # Skip install and registration of the plugin component
+    set(skip_plugin SKIP_INSTALL SKIP_REGISTRATION)
+endif()
+
+# adds a shared library with plugin
+ov_add_plugin(NAME ${TARGET_NAME}
+              DEVICE_NAME ${PLUGIN_DEVICE_NAME}
+              SOURCES ${SOURCES} ${HEADERS}
+              ${skip_plugin}
+              VERSION_DEFINES_FOR plugin.cpp
+              ADD_CLANG_FORMAT)
+
+target_include_directories(${TARGET_NAME} PRIVATE
+    "${CMAKE_CURRENT_SOURCE_DIR}"
+    "${LlamaCppPlugin_SOURCE_DIR}/include")
+
+set(LLAMA_TARGET_NAME CACHE STRING "Exact target exposed by llama.cpp to link against as the main llama.cpp library")
+if(NOT LLAMA_TARGET_NAME)
+    set(LLAMA_TARGET_NAME "llama")
+endif()
+
+# include and link llama.cpp and ggml code
+target_link_libraries(${TARGET_NAME} PRIVATE ${LLAMA_TARGET_NAME})
+target_link_libraries(${TARGET_NAME} PRIVATE ggml)
+
+set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO})
+
+if(ENABLE_LLAMA_CPP_PLUGIN_REGISTRATION)
+    # Update the plugins.xml file
+    ov_register_plugins(MAIN_TARGET ${TARGET_NAME})
+endif()
diff --git a/modules/llama_cpp_plugin/src/compiled_model.cpp b/modules/llama_cpp_plugin/src/compiled_model.cpp
new file mode 100644
index 000000000..adf9e17cf
--- /dev/null
+++ b/modules/llama_cpp_plugin/src/compiled_model.cpp
@@ -0,0 +1,106 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "compiled_model.hpp"
+
+#include <fstream>
+#include <memory>
+#include <string>
+#include <thread>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "infer_request.hpp"
+#include "plugin.hpp"
+
+namespace ov {
+namespace llama_cpp_plugin {
+
+LlamaCppModel::~LlamaCppModel() {
+    llama_free(m_llama_ctx);
+    llama_free_model(m_llama_model_ptr);
+    llama_backend_free();
+}
+
+LlamaCppModel::LlamaCppModel(const std::string& gguf_fname, const std::shared_ptr<const ov::IPlugin>& plugin)
+    : ICompiledModel(nullptr, plugin),
+      m_gguf_fname(gguf_fname) {
+    OPENVINO_DEBUG << "llama_cpp_plugin: loading llama model directly from GGUF... 
" << std::endl; + llama_model_params mparams = llama_model_default_params(); + mparams.n_gpu_layers = 99; + m_llama_model_ptr = llama_load_model_from_file(gguf_fname.c_str(), mparams); + llama_context_params cparams = llama_context_default_params(); + cparams.n_threads = + std::thread::hardware_concurrency(); // TODO (vshampor): reuse equivalent setting defined by OV API + cparams.n_ctx = 0; // this means that the actual n_ctx will be taken equal to the model's train-time value + m_llama_ctx = llama_new_context_with_model(m_llama_model_ptr, cparams); + OPENVINO_DEBUG << "llama_cpp_plugin: llama model loaded successfully from GGUF..." << std::endl; + + auto input_ids = std::make_shared(ov::element::Type_t::i64, ov::PartialShape({-1, -1})); + auto fake_convert = std::make_shared(input_ids->output(0), ov::element::Type_t::f32); + auto logits = std::make_shared(fake_convert->output(0)); + + ov::ParameterVector inputs{input_ids}; + + std::vector> additional_inputs_in_order = { + {"attention_mask", ov::element::Type_t::i64, {-1, -1}}, + {"position_ids", ov::element::Type_t::i64, {-1, -1}}, + {"beam_idx", ov::element::Type_t::i32, {-1, -1}}}; + + for (const auto& descr : additional_inputs_in_order) { + auto unused_inp = std::make_shared(std::get<1>(descr), std::get<2>(descr)); + inputs.push_back(unused_inp); + } + + m_fake_model = std::make_shared(logits, inputs, "fake_ov_model_for_io_specification"); + + m_fake_model->inputs()[0].set_names({"input_ids"}); + for (size_t i = 0; i < additional_inputs_in_order.size(); i++) { + m_fake_model->inputs()[i + 1].set_names({std::get<0>(additional_inputs_in_order[i])}); + } + + m_fake_model->outputs()[0].set_names({"logits"}); + + for (auto input : m_fake_model->inputs()) { + m_fake_inputs.emplace_back(input); + } + for (auto output : m_fake_model->outputs()) { + m_fake_outputs.emplace_back(output); + } +} + +std::shared_ptr LlamaCppModel::get_runtime_model() const { + OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: Not Implemented"); +} + +void LlamaCppModel::set_property(const ov::AnyMap& properties) { + OPENVINO_DEBUG << "llama_cpp_plugin: attempted to set_property (did nothing)"; +} + +ov::Any LlamaCppModel::get_property(const std::string& name) const { + if (ov::supported_properties == name) { + return decltype(ov::supported_properties)::value_type(std::vector()); + } + OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: Not Implemented"); +} + +std::shared_ptr LlamaCppModel::create_sync_infer_request() const { + return std::make_shared( + std::static_pointer_cast(shared_from_this())); +} + +const std::vector>& LlamaCppModel::inputs() const { + return m_fake_inputs; +}; +const std::vector>& LlamaCppModel::outputs() const { + return m_fake_outputs; +}; + +void LlamaCppModel::export_model(std::ostream& output_stream) const { + std::ifstream in(m_gguf_fname, std::ios::binary); + output_stream << in.rdbuf(); +} + +} // namespace llama_cpp_plugin +} // namespace ov diff --git a/modules/llama_cpp_plugin/src/infer_request.cpp b/modules/llama_cpp_plugin/src/infer_request.cpp new file mode 100644 index 000000000..5efd868d8 --- /dev/null +++ b/modules/llama_cpp_plugin/src/infer_request.cpp @@ -0,0 +1,129 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "infer_request.hpp" + +#include +#include + +#include "llama.h" +#include "openvino/runtime/make_tensor.hpp" +#include "openvino/util/log.hpp" +#include "state.hpp" + +namespace ov { +namespace llama_cpp_plugin { + +void allocate_tensor_impl(ov::SoPtr& tensor, + const 
ov::element::Type& element_type, + const ov::Shape& shape) { + if (!tensor || tensor->get_element_type() != element_type) { + tensor = ov::make_tensor(element_type, shape); + } else { + tensor->set_shape(shape); + } +} + +LlamaCppSyncInferRequest::LlamaCppSyncInferRequest(const std::shared_ptr& compiled_model) + : ov::ISyncInferRequest(compiled_model) { + OPENVINO_DEBUG << "llama_cpp_plugin: infer request ctor called\n"; + m_compiled_model_ptr = compiled_model; + for (const auto& input : get_inputs()) { + allocate_tensor(input, [input](ov::SoPtr& tensor) { + allocate_tensor_impl(tensor, + input.get_element_type(), + input.get_partial_shape().is_dynamic() ? ov::Shape{0} : input.get_shape()); + }); + } + for (const auto& output : get_outputs()) { + allocate_tensor(output, [output](ov::SoPtr& tensor) { + allocate_tensor_impl(tensor, + output.get_element_type(), + output.get_partial_shape().is_dynamic() ? ov::Shape{0} : output.get_shape()); + }); + } +} +void LlamaCppSyncInferRequest::set_tensors_impl(const ov::Output port, + const std::vector>& tensors) { + OPENVINO_DEBUG << "llama_cpp_plugin: set_tensors_impl called\n"; +} + +void llama_batch_add_reimpl(struct llama_batch& batch, + llama_token id, + llama_pos pos, + const std::vector& seq_ids, + bool logits) { + batch.token[batch.n_tokens] = id; + batch.pos[batch.n_tokens] = pos; + batch.n_seq_id[batch.n_tokens] = seq_ids.size(); + for (size_t i = 0; i < seq_ids.size(); ++i) { + batch.seq_id[batch.n_tokens][i] = seq_ids[i]; + } + batch.logits[batch.n_tokens] = logits; + + batch.n_tokens++; +} + +void LlamaCppSyncInferRequest::infer() { + auto input_ids_tensor_ptr = get_tensor(get_inputs()[0]); // TODO (vshampor) correctly identify input_ids among + // all inputs without hardcode + // + auto position_ids_tensor_ptr = get_tensor(get_inputs()[2]); // TODO (vshampor) correctly identify input_ids among + // all inputs without hardcode + OPENVINO_ASSERT(input_ids_tensor_ptr->get_element_type() == ov::element::Type_t::i64); + OPENVINO_ASSERT(input_ids_tensor_ptr->get_shape().size() == 2); + size_t sequence_length = input_ids_tensor_ptr->get_shape()[1]; + + // llama_batch actually contains one sequence + llama_batch batch = llama_batch_init(sequence_length, /* embd = */ 0, /* n_seq_max = */ 1); + const int64_t* data_ptr = input_ids_tensor_ptr->data(); + + const int64_t* sequence_start_ptr = data_ptr /* + seq_idx */; + + const int64_t* position_idx_ptr = position_ids_tensor_ptr->data(); + + for (size_t tok_idx = 0; tok_idx < sequence_length; ++tok_idx) { + const int64_t token_id = sequence_start_ptr[tok_idx]; + const int64_t position_id = position_idx_ptr[tok_idx]; + llama_batch_add_reimpl(batch, + token_id, + position_id, + {0}, + true); // the last `true` here is a marker that the logits for this + // token should be computed and returned + } + + llama_context* ctx = m_compiled_model_ptr->m_llama_ctx; + int32_t sts = llama_decode(ctx, batch); + + if (sts != 0) { + OPENVINO_THROW("llama_decode failed with code ", sts); + } + + size_t n_vocab = llama_n_vocab(m_compiled_model_ptr->m_llama_model_ptr); + + ov::Tensor output_tensor{ov::element::Type_t::f32, {1, sequence_length, n_vocab}}; + float* output_tensor_data_ptr = output_tensor.data(); + + for (size_t pos = 0; pos < sequence_length; pos++) { + float* logits_from_llama = llama_get_logits_ith(ctx, pos); + std::copy(logits_from_llama, logits_from_llama + n_vocab, output_tensor_data_ptr + pos * n_vocab); + } + + auto& logit_output = get_outputs()[0]; + allocate_tensor(logit_output, 
[&output_tensor](ov::SoPtr& tensor) { + allocate_tensor_impl(tensor, output_tensor.get_element_type(), output_tensor.get_shape()); + output_tensor.copy_to(ov::make_tensor(tensor)); + }); +}; +std::vector LlamaCppSyncInferRequest::get_profiling_info() const { + OPENVINO_DEBUG << "llama_cpp_plugin: get_profiling_info() called\n"; + return std::vector{}; +}; + +std::vector> LlamaCppSyncInferRequest::query_state() const { + OPENVINO_DEBUG << "llama_cpp_plugin: query_state() called\n"; + return {std::static_pointer_cast(std::make_shared(m_compiled_model_ptr))}; +} +} // namespace llama_cpp_plugin +} // namespace ov diff --git a/modules/llama_cpp_plugin/src/plugin.cpp b/modules/llama_cpp_plugin/src/plugin.cpp new file mode 100644 index 000000000..52536130c --- /dev/null +++ b/modules/llama_cpp_plugin/src/plugin.cpp @@ -0,0 +1,97 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "plugin.hpp" + +#include + +#include "compiled_model.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/runtime/internal_properties.hpp" +#include "openvino/util/log.hpp" + +namespace { +static constexpr const char* wait_executor_name = "LlamaCppWaitExecutor"; +static constexpr const char* stream_executor_name = "LlamaCppStreamsExecutor"; +static constexpr const char* template_exclusive_executor = "LlamaCppExecutor"; +} // namespace + +namespace ov { +namespace llama_cpp_plugin { +LlamaCppPlugin::LlamaCppPlugin() : IPlugin() { + set_device_name("LLAMA_CPP"); +} +std::shared_ptr LlamaCppPlugin::compile_model(const std::shared_ptr& model, + const ov::AnyMap& properties) const { + OPENVINO_THROW_NOT_IMPLEMENTED("Currently only direct GGUF file loading is " + "supported for the LLAMA_CPP* plugins"); +} + +std::shared_ptr LlamaCppPlugin::compile_model(const std::shared_ptr& model, + const ov::AnyMap& properties, + const ov::SoPtr& context) const { + OPENVINO_THROW_NOT_IMPLEMENTED("Currently only direct GGUF file loading is " + "supported for the LLAMA_CPP* plugins"); +} +std::shared_ptr LlamaCppPlugin::compile_model(const std::string& fname, + const ov::AnyMap& properties) const { + return std::make_shared(fname, shared_from_this()); +} + +void LlamaCppPlugin::set_property(const ov::AnyMap& properties) { + for (const auto& map_entry : properties) { + OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: setting property ", map_entry.first, "not implemented"); + } +} + +ov::Any LlamaCppPlugin::get_property(const std::string& name, const ov::AnyMap& arguments) const { + if (ov::supported_properties == name) { + return decltype(ov::supported_properties)::value_type( + std::vector({ov::device::capabilities, ov::device::full_name})); + } + if (ov::device::capabilities == name) { + return decltype(ov::device::capabilities)::value_type( + std::vector({ov::device::capability::EXPORT_IMPORT})); + } + if (ov::internal::supported_properties == name) { + return decltype(ov::internal::supported_properties)::value_type( + std::vector({ov::internal::caching_properties})); + } + + if (ov::internal::caching_properties == name) { + return std::vector{ov::device::full_name}; + } + + if (ov::device::full_name == name) { + return std::string("LLAMA_CPP"); + } + + OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: getting property ", name, "not implemented"); +} + +ov::SoPtr LlamaCppPlugin::create_context(const ov::AnyMap& remote_properties) const { + OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: Not Implemented"); +} +ov::SoPtr LlamaCppPlugin::get_default_context(const ov::AnyMap& 
remote_properties) const { + OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: Not Implemented"); +} +std::shared_ptr LlamaCppPlugin::import_model(std::istream& model_file_stream, + const ov::AnyMap& properties) const { + OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: model importing not implemented"); +} + +std::shared_ptr LlamaCppPlugin::import_model(std::istream& model, + const ov::SoPtr& context, + const ov::AnyMap& properties) const { + OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: model importing not implemented"); +} + +ov::SupportedOpsMap LlamaCppPlugin::query_model(const std::shared_ptr& model, + const ov::AnyMap& properties) const { + OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: model importing not implemented"); +} +} // namespace llama_cpp_plugin +} // namespace ov + +static const ov::Version version = {CI_BUILD_NUMBER, "llama_cpp_plugin"}; +OV_DEFINE_PLUGIN_CREATE_FUNCTION(ov::llama_cpp_plugin::LlamaCppPlugin, version) diff --git a/modules/llama_cpp_plugin/tests/e2e/CMakeLists.txt b/modules/llama_cpp_plugin/tests/e2e/CMakeLists.txt new file mode 100644 index 000000000..096ad46ad --- /dev/null +++ b/modules/llama_cpp_plugin/tests/e2e/CMakeLists.txt @@ -0,0 +1,20 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set(TARGET_NAME llama_cpp_e2e_tests) + +ov_add_test_target( + NAME ${TARGET_NAME} + ROOT ${CMAKE_CURRENT_SOURCE_DIR} + DEPENDENCIES + llama_cpp_plugin + LINK_LIBRARIES + openvino::runtime::dev + openvino::funcSharedTests + INCLUDES + "${LlamaCppPlugin_SOURCE_DIR}/include" + ADD_CLANG_FORMAT + LABELS + OV UNIT TEMPLATE + ) + diff --git a/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp b/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp new file mode 100644 index 000000000..351104bf1 --- /dev/null +++ b/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp @@ -0,0 +1,69 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "common_test_utils/file_utils.hpp" +#include "openvino/openvino.hpp" + +const std::string TEST_FILES_DIR = "test_data"; + +// "Why is the Sun yellow?" +const std::vector GPT2_PROMPT_TOKEN_IDS = {5195, 318, 262, 3825, 7872, 30}; +// "The Sun is a bright red, which means it is a bright red. The Sun is a bright +// red because it is a bright red." 
+const std::vector<int64_t> GPT2_REFERENCE_RESPONSE_TOKEN_IDS = {
+    198, 464, 3825, 318, 257, 6016, 2266, 11,  543, 1724, 340,  318, 257, 6016, 2266, 13,
+    383, 3825, 318, 257, 6016, 2266, 780, 340, 318, 257,  6016, 2266, 13, 198, 198, 464};
+
+const auto SEP = ov::util::FileTraits<char>::file_separator;
+
+TEST(PromptResponseTest, TestGPT2) {
+    const std::string plugin_name = "LLAMA_CPP";
+    ov::Core core;
+
+    const std::string model_file_name = "gpt2.gguf";
+    const std::string model_file =
+        ov::test::utils::getCurrentWorkingDir() + SEP + TEST_FILES_DIR + SEP + model_file_name;
+    ov::InferRequest lm = core.compile_model(model_file, plugin_name).create_infer_request();
+    auto input_ids_tensor = ov::Tensor(ov::element::Type_t::i64, {1, GPT2_PROMPT_TOKEN_IDS.size()});
+    std::copy(GPT2_PROMPT_TOKEN_IDS.begin(), GPT2_PROMPT_TOKEN_IDS.end(), input_ids_tensor.data<int64_t>());
+    lm.set_tensor("input_ids", input_ids_tensor);
+    lm.set_tensor("attention_mask", ov::Tensor(ov::element::Type_t::i64, {1, GPT2_PROMPT_TOKEN_IDS.size()}));
+    ov::Tensor position_ids = lm.get_tensor("position_ids");
+    position_ids.set_shape(input_ids_tensor.get_shape());
+    std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), 0);
+
+    constexpr size_t BATCH_SIZE = 1;
+    lm.get_tensor("beam_idx").set_shape({BATCH_SIZE});
+    lm.get_tensor("beam_idx").data<int32_t>()[0] = 0;
+
+    lm.infer();
+
+    size_t vocab_size = lm.get_tensor("logits").get_shape().back();
+    float* logits = lm.get_tensor("logits").data<float>() + (input_ids_tensor.get_size() - 1) * vocab_size;
+    int64_t out_token = std::max_element(logits, logits + vocab_size) - logits;
+
+    lm.get_tensor("input_ids").set_shape({BATCH_SIZE, 1});
+    position_ids.set_shape({BATCH_SIZE, 1});
+
+    size_t cnt = 0;
+    std::vector<int64_t> out_token_ids;
+
+    while (cnt < GPT2_REFERENCE_RESPONSE_TOKEN_IDS.size()) {
+        lm.get_tensor("input_ids").data<int64_t>()[0] = out_token;
+        lm.get_tensor("attention_mask").set_shape({BATCH_SIZE, lm.get_tensor("attention_mask").get_shape().at(1) + 1});
+        std::fill_n(lm.get_tensor("attention_mask").data<int64_t>(), lm.get_tensor("attention_mask").get_size(), 1);
+        position_ids.data<int64_t>()[0] = int64_t(lm.get_tensor("attention_mask").get_size() - 2);
+        lm.start_async();
+        lm.wait();
+        logits = lm.get_tensor("logits").data<float>();
+        out_token = std::max_element(logits, logits + vocab_size) - logits;
+        out_token_ids.push_back(out_token);
+        cnt++;
+    }
+
+    lm.reset_state();
+
+    ASSERT_EQ(out_token_ids, GPT2_REFERENCE_RESPONSE_TOKEN_IDS);
+}
diff --git a/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp b/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp
new file mode 100644
index 000000000..7577f1673
--- /dev/null
+++ b/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp
@@ -0,0 +1,15 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include <stdexcept>
+#include <string>
+
+namespace ov {
+namespace test {
+void set_device_suffix(const std::string& suffix) {
+    if (!suffix.empty()) {
+        throw std::runtime_error("The suffix can't be used for LLAMA_CPP device!");
+    }
+}
+}  // namespace test
+}  // namespace ov
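
A note on the E2E test above: the llama.cpp KV cache lives inside the compiled model's `llama_context` and persists across `infer()` calls, which is why the test finishes with `lm.reset_state()`. A minimal sketch of the same pattern in application code (the model path is an assumption):

```C++
#include "openvino/openvino.hpp"

ov::Core core;
auto lm = core.compile_model("model.gguf", "LLAMA_CPP").create_infer_request();

// ... feed the prompt and run the generation loop via lm.infer() ...

// reset_state() reaches LlamaCppState::reset(), which calls llama_kv_cache_clear(),
// so the next prompt starts from an empty llama.cpp context.
lm.reset_state();
```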