Skip to content

Commit

Permalink
Revert "Revert "LLAMA_CPP plugin - basic version with direct file loa…
Browse files Browse the repository at this point in the history
…ding (#8…" (#897)

This reverts commit 3eeb232.
  • Loading branch information
ilya-lavrenov authored Mar 29, 2024
1 parent 3eeb232 commit 2ba8532
Show file tree
Hide file tree
Showing 15 changed files with 860 additions and 0 deletions.
76 changes: 76 additions & 0 deletions .github/workflows/llama_cpp_plugin_build_and_test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
name: llama_cpp_plugin_build_and_test

# Runs only for pull requests that touch the llama_cpp_plugin module.
on:
  pull_request:
    paths:
      - 'modules/llama_cpp_plugin/**'

jobs:
  # Builds the plugin and the E2E test binary as an extra module of an OpenVINO
  # checkout and uploads the Release binaries directory for the test job.
  build_ubuntu20:
    runs-on: ubuntu-20.04-8-cores
    steps:
      - name: Setup cmake
        uses: jwlawson/actions-setup-cmake@v1.14
        with:
          cmake-version: '3.24.x'

      - name: Checkout openvino_contrib
        uses: actions/checkout@v4
        with:
          submodules: recursive
          path: openvino_contrib

      - name: Checkout openvino
        uses: actions/checkout@v4
        with:
          submodules: recursive
          repository: openvinotoolkit/openvino
          path: openvino

      - name: CMake - configure
        run: cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=${{ github.workspace }}/openvino_contrib/modules/llama_cpp_plugin -DENABLE_TESTS=ON -DENABLE_FUNCTIONAL_TESTS=ON -DENABLE_PLUGINS_XML=ON -DENABLE_LLAMA_CPP_PLUGIN_REGISTRATION=ON openvino

      - name: CMake - build
        run: cmake --build build -j`nproc` -- llama_cpp_plugin llama_cpp_e2e_tests

      - name: Upload build artifacts
        uses: actions/upload-artifact@v4
        with:
          name: build_artifacts
          path: ${{ github.workspace }}/openvino/bin/intel64/Release/

  # Downloads the binaries built above, prepares a GPT-2 GGUF test model and
  # runs the E2E test executable against it.
  test_ubuntu20:
    needs: build_ubuntu20
    runs-on: ubuntu-20.04
    steps:
      - name: Download build artifacts
        uses: actions/download-artifact@v4
        with:
          name: build_artifacts
          path: ${{ github.workspace }}/binaries

      - name: Prepare test data - checkout llama.cpp repo
        uses: actions/checkout@v4
        with:
          repository: ggerganov/llama.cpp
          # Pinned to the same tag the plugin is built against (GIT_TAG in
          # modules/llama_cpp_plugin/CMakeLists.txt) so that the conversion
          # script and its requirements cannot drift from the built library.
          ref: b2417
          path: llama.cpp

      - name: Prepare test data - convert test model files
        run: |
          pip install -r llama.cpp/requirements/requirements-convert-hf-to-gguf.txt
          huggingface-cli download gpt2 model.safetensors tokenizer.json tokenizer_config.json vocab.json config.json merges.txt --local-dir hf_gpt2
          mkdir -p ${{ github.workspace }}/test_data
          python3 llama.cpp/convert-hf-to-gguf.py hf_gpt2 --outtype f32 --outfile ${{ github.workspace }}/test_data/gpt2.gguf

      - name: Install libtbb2
        run: |
          wget https://storage.openvinotoolkit.org/dependencies/thirdparty/linux/oneapi-tbb-2021.2.4-lin.tgz
          mkdir -p tbb
          tar xvzf oneapi-tbb-2021.2.4-lin.tgz

      - name: Run E2E tests
        run: |
          # The executable bit is re-applied here before running the downloaded binary.
          chmod +x ${{ github.workspace }}/binaries/llama_cpp_e2e_tests
          export LD_LIBRARY_PATH=${{ github.workspace }}/binaries:${{ github.workspace }}/tbb/lib
          ${{ github.workspace }}/binaries/llama_cpp_e2e_tests
28 changes: 28 additions & 0 deletions modules/llama_cpp_plugin/.clang-format
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
BasedOnStyle: Google
IndentWidth: 4
UseTab: Never
ColumnLimit: 120

Language: Cpp
Standard: Cpp11

AccessModifierOffset: -4
AlignConsecutiveMacros: true
AllowAllArgumentsOnNextLine: false
AllowAllConstructorInitializersOnNextLine: false
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortFunctionsOnASingleLine: Empty
AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Empty
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: false
BinPackArguments: false
BinPackParameters: false
CommentPragmas: '^#'
DerivePointerAlignment: false
FixNamespaceComments: true
IndentCaseLabels: false
IndentPPDirectives: AfterHash
ForEachMacros:
- foreach
- FOREACH_CHILD
32 changes: 32 additions & 0 deletions modules/llama_cpp_plugin/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# FetchContent_MakeAvailable(), used below, was introduced in CMake 3.14.
cmake_minimum_required(VERSION 3.14)

project(LlamaCppPlugin)

find_package(OpenVINODeveloperPackage REQUIRED)

ov_option(ENABLE_LLAMA_CPP_PLUGIN_REGISTRATION "Enables registration of LLAMA_CPP plugin" OFF)

add_subdirectory(src)

# Load the FetchContent commands explicitly instead of relying on another
# project file having included the module already.
include(FetchContent)

# llama.cpp sources are fetched at configure time, pinned to a fixed tag.
FetchContent_Declare(
    llama_cpp
    GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
    GIT_TAG b2417
)

FetchContent_MakeAvailable(llama_cpp)

if(ENABLE_TESTS)
    include(CTest)
    enable_testing()
    add_subdirectory(tests/e2e)
endif()

# install

if(OpenVINODeveloperPackage_FOUND)
    ov_cpack(LlamaCppPlugin)
endif()
52 changes: 52 additions & 0 deletions modules/llama_cpp_plugin/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
### Build instructions

This plugin should be built in the same fashion as the rest of the modules:

1. Check out the OpenVINO repository proper (https://github.com/openvinotoolkit/openvino)
2. Configure the CMake build of the OpenVINO repository, making sure to point the corresponding CMake option to the location of the `openvino_contrib` repository. The command below, executed in the `openvino` repo root, will configure the build so that modules other than the `llama_cpp_plugin` module will not be built, which saves build time - adjust the `-DBUILD_*` options if you need the other modules as well.

```bash
cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=<PATH_TO_YOUR_CHECKED_OUT_OPENVINO_CONTRIB>/modules/llama_cpp_plugin -DENABLE_PLUGINS_XML=ON -DENABLE_LLAMA_CPP_PLUGIN_REGISTRATION=ON .
```

3. Build the plugin either as part of the complete openvino build by executing:

```bash
cmake --build build --parallel
```

or separately by specifying only the `llama_cpp_plugin` target:

```bash
cmake --build build --parallel -- llama_cpp_plugin
```

4. Now you can utilize the built `libllama_cpp_plugin.so` as a regular OV plugin with the device name `"LLAMA_CPP"` to directly load GGUF files and infer them using OV API with llama.cpp execution under the hood. Make sure that the plugin is discoverable by the OV runtime (e.g. by putting the built `libllama_cpp_plugin.so`, `libllama.so` and the autogenerated `plugins.xml` from the built location to your OV binaries location, or by setting `LD_LIBRARY_PATH` appropriately).

#### Example of LLM inference code

```C++

ov::Core core;
auto model = core.compile_model("model.gguf", "LLAMA_CPP");
auto input_ids = ov::Tensor(ov::element::Type_t::i64, {1, 128});
auto position_ids = ov::Tensor(ov::element::Type_t::i64, {1, 128});
std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), 0);

auto infer_request = model.create_infer_request();
infer_request.set_tensor("input_ids", input_ids);
infer_request.set_tensor("position_ids", position_ids);
infer_request.infer();

size_t vocab_size = infer_request.get_tensor("logits").get_shape().back();
float* logits = infer_request.get_tensor("logits").data<float>() + (input_ids.get_size() - 1) * vocab_size;
int64_t out_token = std::max_element(logits, logits + vocab_size) - logits;
```
The models obtained by the `.compile_model` call with the `LLAMA_CPP` plugin expose two inputs (`input_ids` and `position_ids`) and a single output (`logits`) with equivalent meaning to the corresponding arguments in the LLM model representations in the huggingface `transformers` repository. The `attention_mask` and `beam_idx` inputs may be set as well, but will have no effect on the execution.
Only batch size of 1 is currently supported.
78 changes: 78 additions & 0 deletions modules/llama_cpp_plugin/include/compiled_model.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#ifndef LLAMA_CPP_COMPILED_MODEL_HPP
#define LLAMA_CPP_COMPILED_MODEL_HPP

#include "llama.h"
#include "openvino/runtime/icompiled_model.hpp"
#include "openvino/runtime/isync_infer_request.hpp"

namespace ov {
namespace llama_cpp_plugin {
class LlamaCppSyncInferRequest;
class LlamaCppPlugin;
class LlamaCppState;
/**
 * @brief ICompiledModel implementation of the LLAMA_CPP plugin.
 *
 * Constructed from a GGUF file name and the plugin that creates it. The class keeps raw
 * gguf/llama.cpp handles plus a placeholder ov::Model with its input/output ports.
 * LlamaCppSyncInferRequest and LlamaCppState are friends and access the private members directly.
 */
class LlamaCppModel : public ICompiledModel {
public:
/**
 * @brief Creates a compiled model for the given GGUF file
 *
 * @param gguf_fname name of the GGUF file
 * @param plugin plugin instance this compiled model belongs to
 */
LlamaCppModel(const std::string& gguf_fname, const std::shared_ptr<const IPlugin>& plugin);
/**
 * @brief Export compiled model to stream
 *
 * @param model output stream
 */
virtual void export_model(std::ostream& model) const override;

/**
 * @brief Returns runtime model
 *
 * @return OpenVINO Model which represents runtime graph
 */
virtual std::shared_ptr<const ov::Model> get_runtime_model() const override;

/**
 * @brief Sets properties
 *
 * @param properties new property values
 */
virtual void set_property(const ov::AnyMap& properties) override;

/**
 * @brief Returns property
 *
 * @param name Property name
 *
 * @return Property value
 */
virtual ov::Any get_property(const std::string& name) const override;
/**
 * @brief Returns the input ports of the compiled model
 *
 * NOTE(review): presumably backed by m_fake_inputs - confirm in the implementation.
 */
virtual const std::vector<ov::Output<const ov::Node>>& inputs() const override;
/**
 * @brief Returns the output ports of the compiled model
 *
 * NOTE(review): presumably backed by m_fake_outputs - confirm in the implementation.
 */
virtual const std::vector<ov::Output<const ov::Node>>& outputs() const override;
// NOTE(review): defined out of line; presumably releases the raw handles declared below - confirm.
virtual ~LlamaCppModel();

protected:
/**
 * @brief Method creates infer request implementation
 *
 * @return Sync infer request
 */
virtual std::shared_ptr<ov::ISyncInferRequest> create_sync_infer_request() const override;

private:
// Raw gguf handle; null until assigned by the implementation.
gguf_context* m_gguf_ctx = nullptr;
// NOTE(review): presumably the file name passed to the constructor - confirm.
std::string m_gguf_fname;

// Raw llama.cpp model and context handles; null until assigned by the implementation.
// m_llama_ctx is the context LlamaCppState::reset() clears the KV cache on.
llama_model* m_llama_model_ptr = nullptr;
llama_context* m_llama_ctx = nullptr;
// Placeholder ov::Model (the "fake" prefix suggests it is not the executed graph - confirm).
std::shared_ptr<ov::Model> m_fake_model;

// Port lists matching the return types of inputs() / outputs().
std::vector<ov::Output<const ov::Node>> m_fake_inputs;
std::vector<ov::Output<const ov::Node>> m_fake_outputs;

// These classes access the private handles above directly.
friend class ov::llama_cpp_plugin::LlamaCppSyncInferRequest;
friend class ov::llama_cpp_plugin::LlamaCppState;
};
} // namespace llama_cpp_plugin
} // namespace ov

#endif // LLAMA_CPP_COMPILED_MODEL_HPP
32 changes: 32 additions & 0 deletions modules/llama_cpp_plugin/include/infer_request.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#ifndef LLAMA_CPP_INFER_REQUEST_HPP
#define LLAMA_CPP_INFER_REQUEST_HPP

#include "compiled_model.hpp"
#include "openvino/openvino.hpp"

namespace ov {
namespace llama_cpp_plugin {

class LlamaCppSyncInferRequest : public ISyncInferRequest {
public:
explicit LlamaCppSyncInferRequest(const std::shared_ptr<const LlamaCppModel>& compiled_model);
virtual ~LlamaCppSyncInferRequest(){};

virtual void set_tensors_impl(const ov::Output<const ov::Node> port,
const std::vector<ov::SoPtr<ov::ITensor>>& tensors) override;

virtual void infer() override;
virtual std::vector<ov::ProfilingInfo> get_profiling_info() const override;
virtual std::vector<ov::SoPtr<ov::IVariableState>> query_state() const override;

private:
std::shared_ptr<const LlamaCppModel> m_compiled_model_ptr;
};

} // namespace llama_cpp_plugin
}; // namespace ov

#endif /* LLAMA_CPP_INFER_REQUEST_HPP */
46 changes: 46 additions & 0 deletions modules/llama_cpp_plugin/include/plugin.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#ifndef LLAMA_CPP_PLUGIN_HPP
#define LLAMA_CPP_PLUGIN_HPP

#include "openvino/runtime/iplugin.hpp"

namespace ov {
namespace llama_cpp_plugin {
class LlamaCppPlugin : public IPlugin {
public:
LlamaCppPlugin();
virtual std::shared_ptr<ov::ICompiledModel> compile_model(const std::shared_ptr<const ov::Model>& model,
const ov::AnyMap& properties) const override;

virtual std::shared_ptr<ov::ICompiledModel> compile_model(
const std::shared_ptr<const ov::Model>& model,
const ov::AnyMap& properties,
const ov::SoPtr<ov::IRemoteContext>& context) const override;

virtual void set_property(const ov::AnyMap& properties) override;

virtual ov::Any get_property(const std::string& name, const ov::AnyMap& arguments) const override;

virtual ov::SoPtr<ov::IRemoteContext> create_context(const ov::AnyMap& remote_properties) const override;

virtual ov::SoPtr<ov::IRemoteContext> get_default_context(const ov::AnyMap& remote_properties) const override;

virtual std::shared_ptr<ov::ICompiledModel> import_model(std::istream& model,
const ov::AnyMap& properties) const override;

virtual std::shared_ptr<ov::ICompiledModel> compile_model(const std::string& fname,
const ov::AnyMap& properties) const override;

virtual std::shared_ptr<ov::ICompiledModel> import_model(std::istream& model,
const ov::SoPtr<ov::IRemoteContext>& context,
const ov::AnyMap& properties) const override;

virtual ov::SupportedOpsMap query_model(const std::shared_ptr<const ov::Model>& model,
const ov::AnyMap& properties) const override;
};
} // namespace llama_cpp_plugin
} // namespace ov

#endif // LLAMA_CPP_PLUGIN_HPP
27 changes: 27 additions & 0 deletions modules/llama_cpp_plugin/include/state.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

// The guard must be unique to this header: reusing the guard macro of plugin.hpp would make
// whichever of the two headers is included second expand to nothing.
#ifndef LLAMA_CPP_STATE_HPP
#define LLAMA_CPP_STATE_HPP

#include "compiled_model.hpp"
#include "openvino/runtime/ivariable_state.hpp"

namespace ov {
namespace llama_cpp_plugin {
/**
 * @brief Variable state of the LLAMA_CPP plugin, registered under the name "llama_cpp_state".
 *
 * Resetting the state clears the KV cache of the llama.cpp context owned by the compiled model.
 */
class LlamaCppState : public IVariableState {
public:
    LlamaCppState() = delete;

    /**
     * @brief Creates the state for the given compiled model
     *
     * @param model_ptr compiled model whose llama.cpp context this state resets
     */
    LlamaCppState(const std::shared_ptr<const LlamaCppModel>& model_ptr)
        : IVariableState("llama_cpp_state"),  // base first, matching the actual initialization order
          m_model_ptr(model_ptr) {}

    /// @brief Clears the KV cache of the compiled model's llama.cpp context
    void reset() override {
        llama_kv_cache_clear(m_model_ptr->m_llama_ctx);
    }

private:
    // Held by value so the state shares ownership of the model; a reference member would
    // dangle as soon as the caller's shared_ptr object goes out of scope.
    std::shared_ptr<const LlamaCppModel> m_model_ptr;
};
}  // namespace llama_cpp_plugin
}  // namespace ov
#endif  // LLAMA_CPP_STATE_HPP
Loading

0 comments on commit 2ba8532

Please sign in to comment.