From f38c12e0b6a2101a436a9a8bb8d596ff746327ed Mon Sep 17 00:00:00 2001 From: Tao He Date: Mon, 17 Jun 2024 13:24:15 +0800 Subject: [PATCH] Enhance the implementation of llm cache Signed-off-by: Tao He --- CMakeLists.txt | 96 ++-- modules/basic/ds/arrow_utils.cc | 34 +- modules/llm-cache/CMakeLists.txt | 7 +- modules/llm-cache/README.md | 16 +- modules/llm-cache/ds/config.h | 22 +- .../ds/{kv_state_cache.cc => kv_cache.cc} | 218 ++++----- .../ds/{kv_state_cache.h => kv_cache.h} | 52 +- ...state_cache_block.cc => kv_cache_block.cc} | 142 +++--- ...v_state_cache_block.h => kv_cache_block.h} | 50 +- ...e_cache_manager.cc => kv_cache_manager.cc} | 235 ++++----- ...ate_cache_manager.h => kv_cache_manager.h} | 34 +- modules/llm-cache/hash/hasher.h | 22 +- modules/llm-cache/storage/blob_storage.cc | 311 ++++++------ modules/llm-cache/storage/blob_storage.h | 29 +- modules/llm-cache/storage/file_storage.cc | 450 ++++++++++-------- modules/llm-cache/storage/file_storage.h | 21 +- .../llm-cache/storage/local_file_storage.h | 14 +- modules/llm-cache/storage/storage.h | 15 +- modules/llm-cache/tests/k8s-test/worker.py | 20 +- .../tests/k8s-test/yamls/worker.yaml | 4 +- ...ark_test.cc => kv_cache_benchmark_test.cc} | 16 +- ...che_hash_test.cc => kv_cache_hash_test.cc} | 22 +- ...le_test.cc => kv_cache_local_file_test.cc} | 57 ++- ...ee_test.cc => kv_cache_radix_tree_test.cc} | 2 +- ...v_state_cache_test.cc => kv_cache_test.cc} | 63 ++- modules/llm-cache/tests/refcnt_map_test.cc | 77 ++- modules/llm-cache/thread_group.h | 11 +- python/vineyard/llm/__init__.py | 168 +------ python/vineyard/llm/cache.cc | 193 ++++++++ python/vineyard/llm/cache.py | 378 +++++++++++++++ python/vineyard/llm/config.py | 113 ----- python/vineyard/llm/kv_state_cache.cc | 181 ------- python/vineyard/llm/tests/test_llm.py | 10 +- setup_llm.py | 2 +- src/common/util/functions.h | 8 +- test/runner.py | 8 +- 36 files changed, 1632 insertions(+), 1469 deletions(-) rename modules/llm-cache/ds/{kv_state_cache.cc => kv_cache.cc} (61%) rename modules/llm-cache/ds/{kv_state_cache.h => kv_cache.h} (66%) rename modules/llm-cache/ds/{kv_state_cache_block.cc => kv_cache_block.cc} (70%) rename modules/llm-cache/ds/{kv_state_cache_block.h => kv_cache_block.h} (77%) rename modules/llm-cache/ds/{kv_state_cache_manager.cc => kv_cache_manager.cc} (81%) rename modules/llm-cache/ds/{kv_state_cache_manager.h => kv_cache_manager.h} (72%) rename modules/llm-cache/tests/{kv_state_cache_benchmark_test.cc => kv_cache_benchmark_test.cc} (92%) rename modules/llm-cache/tests/{kv_state_cache_hash_test.cc => kv_cache_hash_test.cc} (86%) rename modules/llm-cache/tests/{kv_state_cache_local_file_test.cc => kv_cache_local_file_test.cc} (83%) rename modules/llm-cache/tests/{kv_state_cache_radix_tree_test.cc => kv_cache_radix_tree_test.cc} (99%) rename modules/llm-cache/tests/{kv_state_cache_test.cc => kv_cache_test.cc} (84%) create mode 100644 python/vineyard/llm/cache.cc create mode 100644 python/vineyard/llm/cache.py delete mode 100644 python/vineyard/llm/config.py delete mode 100644 python/vineyard/llm/kv_state_cache.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 115aab1019..592c4e35ce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -856,49 +856,54 @@ if(BUILD_VINEYARD_CLIENT) list(APPEND VINEYARD_INSTALL_LIBS vineyard_client) endif() -if(BUILD_VINEYARD_PYTHON_BINDINGS) +if (BUILD_VINEYARD_PYTHON_BINDINGS) set(PYBIND11_PYTHON_VERSION 3) if(NOT (CMAKE_VERSION VERSION_LESS "3.27")) set(PYBIND11_FINDPYTHON ON) endif() 
add_subdirectory_static(thirdparty/pybind11) - set(PYTHON_BIND_FILES "python/client.cc" - "python/core.cc" - "python/error.cc" - "python/pybind11_docs.cc" - "python/pybind11_utils.cc" - "python/vineyard.cc") - pybind11_add_module(_C MODULE ${PYTHON_BIND_FILES}) - target_add_debuginfo(_C) - target_link_libraries(_C PRIVATE vineyard_client) - target_include_directories(_C PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/pybind11/include") - target_compile_options(_C PRIVATE -Wno-unused-value) - set_target_properties(_C PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/shared-lib") +endif() + +macro(setup_pybind11_module target relpath) + target_add_debuginfo(${target}) + target_link_libraries(${target} PRIVATE vineyard_client) + target_include_directories(${target} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/pybind11/include") + target_compile_options(${target} PRIVATE -Wno-unused-value) + set_target_properties(${target} PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/shared-lib") if(UNIX AND NOT APPLE) - target_add_link_options(_C PRIVATE OPTIONS -Wl,--exclude-libs=ALL) + target_add_link_options(${target} PRIVATE OPTIONS -Wl,--exclude-libs=ALL) endif() if(BUILD_VINEYARD_PYPI_PACKAGES AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - target_compile_options(_C PRIVATE -static) - target_add_link_options(_C PRIVATE OPTIONS -static) + target_compile_options(${target} PRIVATE -static) + target_add_link_options(${target} PRIVATE OPTIONS -static) else() - target_compile_options(_C PRIVATE -Os) - target_add_link_options(_C PRIVATE OPTIONS -Os) + target_compile_options(${target} PRIVATE -Os) + target_add_link_options(${target} PRIVATE OPTIONS -Os) endif() - file(RELATIVE_PATH RELATIVE_BUILD_PATH "${PROJECT_SOURCE_DIR}/python/vineyard" "${CMAKE_BINARY_DIR}/shared-lib") if(UNIX AND NOT APPLE) - set_target_properties(_C PROPERTIES + set_target_properties(${target} PROPERTIES BUILD_WITH_INSTALL_RPATH TRUE INSTALL_RPATH_USE_LINK_PATH TRUE - INSTALL_RPATH ".:\$ORIGIN:\$ORIGIN/${RELATIVE_BUILD_PATH}/:${CMAKE_INSTALL_PREFIX}/lib:${CMAKE_INSTALL_PREFIX}/lib64:${INSTALL_RPATH}") + INSTALL_RPATH ".:\$ORIGIN:\$ORIGIN/${relpath}/:${CMAKE_INSTALL_PREFIX}/lib:${CMAKE_INSTALL_PREFIX}/lib64:${INSTALL_RPATH}") endif() if(APPLE) - set_target_properties(_C PROPERTIES + set_target_properties(${target} PROPERTIES BUILD_WITH_INSTALL_RPATH TRUE INSTALL_RPATH_USE_LINK_PATH TRUE - INSTALL_RPATH ".;@loader_path;@loader_path/${RELATIVE_BUILD_PATH}/;${CMAKE_INSTALL_PREFIX}/lib;${CMAKE_INSTALL_PREFIX}/lib64;${INSTALL_RPATH}") + INSTALL_RPATH ".;@loader_path;@loader_path/${relpath}/;${CMAKE_INSTALL_PREFIX}/lib;${CMAKE_INSTALL_PREFIX}/lib64;${INSTALL_RPATH}") endif() +endmacro() +if(BUILD_VINEYARD_PYTHON_BINDINGS) + pybind11_add_module(_C MODULE "python/client.cc" + "python/core.cc" + "python/error.cc" + "python/pybind11_docs.cc" + "python/pybind11_utils.cc" + "python/vineyard.cc") + file(RELATIVE_PATH RELATIVE_BUILD_PATH "${PROJECT_SOURCE_DIR}/python/vineyard" "${CMAKE_BINARY_DIR}/shared-lib") + setup_pybind11_module(_C ${RELATIVE_BUILD_PATH}) add_custom_target(vineyard_client_python ALL COMMAND cp "$" "${PROJECT_SOURCE_DIR}/python/vineyard/" @@ -909,47 +914,16 @@ if(BUILD_VINEYARD_PYTHON_BINDINGS) endif() if(BUILD_VINEYARD_PYTHON_BINDINGS AND BUILD_VINEYARD_LLM_CACHE) - set(PYBIND11_PYTHON_VERSION 3) - if(NOT (CMAKE_VERSION VERSION_LESS "3.27")) - set(PYBIND11_FINDPYTHON ON) - endif() - file(GLOB PYTHON_BIND_FILES "python/vineyard/llm/kv_state_cache.cc") - pybind11_add_module(llm_C MODULE ${PYTHON_BIND_FILES}) - # 
make sure `vineyard_llm_cache` been built.
-    add_dependencies(llm_C vineyard_llm_cache)
-    target_link_libraries(llm_C PRIVATE vineyard_client vineyard_llm_cache)
-    target_include_directories(llm_C PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/pybind11/include")
-    target_compile_options(llm_C PRIVATE -Wno-unused-value)
-    set_target_properties(llm_C PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/shared-lib")
-    if(UNIX AND NOT APPLE)
-        target_add_link_options(llm_C PRIVATE OPTIONS -Wl,--exclude-libs=ALL)
-    endif()
-    if(BUILD_VINEYARD_PYPI_PACKAGES AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
-        target_compile_options(llm_C PRIVATE -static)
-        target_add_link_options(llm_C PRIVATE OPTIONS -static)
-    else()
-        target_compile_options(llm_C PRIVATE -Os)
-        target_add_link_options(llm_C PRIVATE OPTIONS -Os)
-    endif()
-
+    pybind11_add_module(_llm_C MODULE "python/vineyard/llm/cache.cc")
     file(RELATIVE_PATH RELATIVE_BUILD_PATH "${PROJECT_SOURCE_DIR}/python/vineyard/llm" "${CMAKE_BINARY_DIR}/shared-lib")
-    if(UNIX AND NOT APPLE)
-        set_target_properties(llm_C PROPERTIES
-                              BUILD_WITH_INSTALL_RPATH TRUE
-                              INSTALL_RPATH_USE_LINK_PATH TRUE
-                              INSTALL_RPATH ".:\$ORIGIN:\$ORIGIN/${RELATIVE_BUILD_PATH}/:${CMAKE_INSTALL_PREFIX}/lib:${CMAKE_INSTALL_PREFIX}/lib64:${INSTALL_RPATH}")
-    endif()
-    if(APPLE)
-        set_target_properties(llm_C PROPERTIES
-                              BUILD_WITH_INSTALL_RPATH TRUE
-                              INSTALL_RPATH_USE_LINK_PATH TRUE
-                              INSTALL_RPATH ".;@loader_path;@loader_path/${RELATIVE_BUILD_PATH}/;${CMAKE_INSTALL_PREFIX}/lib;${CMAKE_INSTALL_PREFIX}/lib64;${INSTALL_RPATH}")
-    endif()
-
+    setup_pybind11_module(_llm_C ${RELATIVE_BUILD_PATH})
+    # make sure `vineyard_llm_cache` is built.
+    add_dependencies(_llm_C vineyard_llm_cache)
+    target_link_libraries(_llm_C PRIVATE vineyard_client vineyard_llm_cache)
     add_custom_target(vineyard_llm_python ALL
-            COMMAND cp "$<TARGET_FILE:llm_C>" "${PROJECT_SOURCE_DIR}/python/vineyard/llm/"
-            DEPENDS llm_C
+            COMMAND cp "$<TARGET_FILE:_llm_C>" "${PROJECT_SOURCE_DIR}/python/vineyard/llm/"
+            DEPENDS _llm_C
             COMMENT "Copying llm kv cache python extensions."
VERBATIM) add_dependencies(vineyard_llm_python vineyard_client_python) diff --git a/modules/basic/ds/arrow_utils.cc b/modules/basic/ds/arrow_utils.cc index 55f7ec91b2..ab4785f0c3 100644 --- a/modules/basic/ds/arrow_utils.cc +++ b/modules/basic/ds/arrow_utils.cc @@ -34,7 +34,7 @@ namespace vineyard { namespace detail { static inline std::string string_join(std::vector const& srcs, - std::string const& sep) { + std::string const& sep) { std::stringstream ss; if (!srcs.empty()) { ss << srcs[0]; @@ -45,21 +45,23 @@ static inline std::string string_join(std::vector const& srcs, return ss.str(); } -static inline void string_split(std::vector &rs, std::string const &content, std::string const &patterns) { - size_t i = 0, k = 0; - while (i < content.size()) { - while (k < content.size()) { - char c = content[k]; - if (patterns.find_first_of(c) != std::string::npos) { - break; - } - k += 1; - } - if (i < k) { - rs.emplace_back(content.substr(i, k - i)); - } - i = k; - } +static inline void string_split(std::vector& rs, + std::string const& content, + std::string const& patterns) { + size_t i = 0, k = 0; + while (i < content.size()) { + while (k < content.size()) { + char c = content[k]; + if (patterns.find_first_of(c) != std::string::npos) { + break; + } + k += 1; + } + if (i < k) { + rs.emplace_back(content.substr(i, k - i)); + } + i = k; + } } } // namespace detail diff --git a/modules/llm-cache/CMakeLists.txt b/modules/llm-cache/CMakeLists.txt index a4e9d686d2..15f43a7cf6 100644 --- a/modules/llm-cache/CMakeLists.txt +++ b/modules/llm-cache/CMakeLists.txt @@ -20,10 +20,9 @@ target_link_libraries(vineyard_llm_cache PRIVATE libzstd_static ${GLOG_LIBRARIES target_link_libraries(vineyard_llm_cache PUBLIC vineyard_client) # install bundled thirdparty: rax and MurmurHash3 -install(DIRECTORY - ${PROJECT_SOURCE_DIR}/thirdparty/rax - ${PROJECT_SOURCE_DIR}/thirdparty/MurmurHash3 - ${PROJECT_SOURCE_DIR}/thirdparty/cityhash +install(DIRECTORY ${PROJECT_SOURCE_DIR}/thirdparty/rax + ${PROJECT_SOURCE_DIR}/thirdparty/MurmurHash3 + ${PROJECT_SOURCE_DIR}/thirdparty/cityhash DESTINATION include/vineyard/contrib # target directory FILES_MATCHING # install only matched files PATTERN "*.h" # select header files diff --git a/modules/llm-cache/README.md b/modules/llm-cache/README.md index 6616f67be2..6259728da8 100644 --- a/modules/llm-cache/README.md +++ b/modules/llm-cache/README.md @@ -31,7 +31,7 @@ In this section, we will compare the two methods in terms of latency and suitabl ## Usage -We provide [C++](https://github.com/v6d-io/v6d/blob/main/modules/llm-cache/ds/kv_state_cache_manager.h) and [Python](https://github.com/v6d-io/v6d/blob/main/python/vineyard/llm/__init__.py) APIs for Vineyard LLM KV Cache. Based on the inference framework, you can use the corresponding API to integrate the Vineyard LLM KV Cache. +We provide [C++](https://github.com/v6d-io/v6d/blob/main/modules/llm-cache/ds/kv_cache_manager.h) and [Python](https://github.com/v6d-io/v6d/blob/main/python/vineyard/llm/__init__.py) APIs for Vineyard LLM KV Cache. Based on the inference framework, you can use the corresponding API to integrate the Vineyard LLM KV Cache. ### C++ API @@ -90,10 +90,10 @@ $ ./build/bin/vineyardd --socket /tmp/vineyard_test.sock Then open another terminal to run the vineyard llm kv cache test. 
```bash
-$ ./bin/kv_state_cache_test --client-num 1 --vineyard-ipc-sockets /tmp/vineyard_test.sock
+$ ./bin/kv_cache_test --client-num 1 --vineyard-ipc-sockets /tmp/vineyard_test.sock
 ```
 
-For more information about how to use the C++ API, you can refer to the the [C++ API implementation](https://github.com/v6d-io/v6d/blob/main/modules/llm-cache/ds/kv_state_cache_manager.cc) and the [related tests](https://github.com/v6d-io/v6d/tree/main/modules/llm-cache/tests).
+For more information about how to use the C++ API, you can refer to the [C++ API implementation](https://github.com/v6d-io/v6d/blob/main/modules/llm-cache/ds/kv_cache_manager.cc) and the [related tests](https://github.com/v6d-io/v6d/tree/main/modules/llm-cache/tests).
 
 ### Python API
 
@@ -165,8 +165,8 @@ vineyard_cache_config = VineyardCacheConfig(
 )
 cache = KVCache(
     cache_config=vineyard_cache_config,
-    tensor_bytes=16,  # should be the same as the nbytes of the tensor
-    cache_capacity=10,
+    tensor_nbytes=16,  # should be the same as the nbytes of the tensor
+    cache_capacity=1024,
     layer=2,
 )
 
@@ -248,13 +248,13 @@ from vineyard.llm.config import VineyardCacheConfig
 
 file_cache_config = FileCacheConfig(
     chunk_size=2,
-    split_number=2,
+    hash_chunk_size=2,
     root="/tmp/vineyard/llm_cache",
 )
 cache = KVCache(
     cache_config=file_cache_config,
-    tensor_bytes=16,  # should be the same as the nbytes of the tensor
-    cache_capacity=10,
+    tensor_nbytes=16,  # should be the same as the nbytes of the tensor
+    cache_capacity=1024,
     layer=2,
 )
 
diff --git a/modules/llm-cache/ds/config.h b/modules/llm-cache/ds/config.h
index 356d0bc507..cbafe4d433 100644
--- a/modules/llm-cache/ds/config.h
+++ b/modules/llm-cache/ds/config.h
@@ -49,32 +49,32 @@ struct VineyardCacheConfig : public KVCacheConfig {
 };
 
 struct FileCacheConfig : public KVCacheConfig {
-  int batchSize;
-  int splitNumber;
+  int chunkSize;
+  int hashChunkSize;
   std::string root;
   FilesystemType filesystemType;
-  int clientGCInterval;  // second
-  int ttl;               // second
+  int gcInterval;  // in seconds
+  int ttl;         // in seconds
   bool enbaleGlobalGC;
-  int globalGCInterval;  // second
-  int globalTTL;         // second
+  int globalGCInterval;  // in seconds
+  int globalTTL;         // in seconds
 
   // Default gc interval is 30 minutes and default global gc interval is 3
   // hours.
   FileCacheConfig(int tensorByte = 10, int cacheCapacity = 10, int layer = 1,
-                  int batchSize = 4, int splitNumber = 2,
+                  int chunkSize = 4, int hashChunkSize = 2,
                   std::string root = "/tmp/llm_cache/",
                   FilesystemType filesystemType = LOCAL,
-                  int clientGCInterval = 30 * 60, int ttl = 30 * 60,
+                  int gcInterval = 30 * 60, int ttl = 30 * 60,
                   bool enbaleGlobalGC = false,
                   int globalGCInterval = 3 * 60 * 60,
                   int globalTTL = 3 * 60 * 60)
       : KVCacheConfig{tensorByte, cacheCapacity, layer} {
     this->root = root;
-    this->batchSize = batchSize;
-    this->splitNumber = splitNumber;
+    this->chunkSize = chunkSize;
+    this->hashChunkSize = hashChunkSize;
     this->filesystemType = filesystemType;
-    this->clientGCInterval = clientGCInterval;
+    this->gcInterval = gcInterval;
     this->ttl = ttl;
     this->enbaleGlobalGC = enbaleGlobalGC;
     this->globalGCInterval = globalGCInterval;
diff --git a/modules/llm-cache/ds/kv_state_cache.cc b/modules/llm-cache/ds/kv_cache.cc
similarity index 61%
rename from modules/llm-cache/ds/kv_state_cache.cc
rename to modules/llm-cache/ds/kv_cache.cc
index f6ba7bb829..29a207de3c 100644
--- a/modules/llm-cache/ds/kv_state_cache.cc
+++ b/modules/llm-cache/ds/kv_cache.cc
@@ -25,18 +25,18 @@ limitations under the License.
#include "common/util/base64.h" #include "common/util/logging.h" #include "common/util/status.h" -#include "llm-cache/ds/kv_state_cache.h" +#include "llm-cache/ds/kv_cache.h" #include "llm-cache/radix-tree/radix-tree.h" namespace vineyard { -void KVStateCache::Construct(const ObjectMeta& meta) { +void KVCache::Construct(const ObjectMeta& meta) { Object::Construct(meta); Resolve(); } -void KVStateCache::Resolve() { - std::string typeName = type_name(); +void KVCache::Resolve() { + std::string typeName = type_name(); VINEYARD_ASSERT(this->meta_.GetTypeName() == typeName, "Expect typename '" + typeName + "', but got '" + @@ -50,15 +50,15 @@ void KVStateCache::Resolve() { } // 2. construct the member field - this->tensorBytes = this->meta_.GetKeyValue("tensorBytes"); + this->tensorNBytes = this->meta_.GetKeyValue("tensorNBytes"); this->version = this->meta_.GetKeyValue("version"); this->layer = this->meta_.GetKeyValue("layer"); - VLOG(100) << "construct the member field success, with tensorBytes:" - << this->tensorBytes << " version:" << this->version + VLOG(100) << "construct the member field success, with tensorNBytes:" + << this->tensorNBytes << " version:" << this->version << " layer:" << this->layer; } -void KVStateCache::GetCurrentBlockIDSet(std::set& objectIDSet) { +void KVCache::GetCurrentBlockIDSet(std::set& objectIDSet) { std::set subTreeData = rootTree->GetSubTreeDataSet(); for (auto iter = subTreeData.begin(); iter != subTreeData.end(); ++iter) { TreeData* treeData = reinterpret_cast(*iter); @@ -68,29 +68,29 @@ void KVStateCache::GetCurrentBlockIDSet(std::set& objectIDSet) { } } -KVStateCache::~KVStateCache() {} +KVCache::~KVCache() {} -KVStateCacheBuilder::KVStateCacheBuilder(Client& client, int tensorBytes, - int layer, - std::shared_ptr& rootTree) +KVCacheBuilder::KVCacheBuilder(Client& client, int tensorNBytes, int layer, + std::shared_ptr& rootTree) : client(client) { - this->tensorBytes = tensorBytes; + this->tensorNBytes = tensorNBytes; this->version = 0; this->layer = layer; this->rootTree = rootTree; } -Status KVStateCacheBuilder::Make( - Client& client, std::shared_ptr& kvStateCacheBuilder, - int tensorBytes, int cacheCapacity, int layer, int blockSize) { - KVStateCacheBlockBuilder* builder = - new KVStateCacheBlockBuilder(client, tensorBytes, layer, blockSize); +Status KVCacheBuilder::Make(Client& client, + std::shared_ptr& kvCacheBuilder, + int tensorNBytes, int cacheCapacity, int layer, + int blockSize) { + KVCacheBlockBuilder* builder = + new KVCacheBlockBuilder(client, tensorNBytes, layer, blockSize); std::shared_ptr rootTree = std::make_shared(cacheCapacity); TreeData* treeData = new TreeData(); - treeData->kvStateCacheBlockBuilder = builder; + treeData->kvCacheBlockBuilder = builder; treeData->isPtr = true; std::shared_ptr rootTreeHeader = rootTree->GetRootNode(); @@ -98,28 +98,28 @@ Status KVStateCacheBuilder::Make( rootTreeHeader->treeData->dataLength = sizeof(TreeData); rootTree->SetSubtreeData(treeData); - kvStateCacheBuilder = std::shared_ptr( - new KVStateCacheBuilder(client, tensorBytes, layer, rootTree)); + kvCacheBuilder = std::shared_ptr( + new KVCacheBuilder(client, tensorNBytes, layer, rootTree)); return Status::OK(); } -Status KVStateCacheBuilder::Make( - Client& client, std::shared_ptr& kvStateCacheBuilder, - std::shared_ptr& cache) { - kvStateCacheBuilder = std::make_shared( - client, cache->GetTensorBytes(), cache->GetLayer(), cache->rootTree); +Status KVCacheBuilder::Make(Client& client, + std::shared_ptr& kvCacheBuilder, + std::shared_ptr& cache) 
{ + kvCacheBuilder = std::make_shared( + client, cache->GetTensorNBytes(), cache->GetLayer(), cache->rootTree); return Status::OK(); } -Status KVStateCacheBuilder::Split( - KVStateCacheBlockBuilder* kvStateCacheBlockBuilder, +Status KVCacheBuilder::Split( + KVCacheBlockBuilder* kvCacheBlockBuilder, std::vector> nodeDataList, - KVStateCacheBlockBuilder*& childKVStateCacheBlockBuilder) { + KVCacheBlockBuilder*& childKVCacheBlockBuilder) { // Split the tree if the list of kvState is full. - childKVStateCacheBlockBuilder = - new KVStateCacheBlockBuilder(client, this->tensorBytes, this->layer, - kvStateCacheBlockBuilder->GetBlockSize()); - VINEYARD_ASSERT(childKVStateCacheBlockBuilder != nullptr, + childKVCacheBlockBuilder = + new KVCacheBlockBuilder(client, this->tensorNBytes, this->layer, + kvCacheBlockBuilder->GetBlockSize()); + VINEYARD_ASSERT(childKVCacheBlockBuilder != nullptr, "Not enough memory for new block builder."); for (size_t i = 0; i < nodeDataList.size(); i++) { @@ -130,17 +130,16 @@ Status KVStateCacheBuilder::Split( int index = data->offset; // Transfer the data from this builder to the child builder. - data->offset = - kvStateCacheBlockBuilder->Split(childKVStateCacheBlockBuilder, index); + data->offset = kvCacheBlockBuilder->Split(childKVCacheBlockBuilder, index); } - VLOG(100) << "builder:" << kvStateCacheBlockBuilder - << " bitmap:" << kvStateCacheBlockBuilder->GetBitmapStr(); - VLOG(100) << "child_builder:" << childKVStateCacheBlockBuilder - << " bitmap:" << childKVStateCacheBlockBuilder->GetBitmapStr(); + VLOG(100) << "builder:" << kvCacheBlockBuilder + << " bitmap:" << kvCacheBlockBuilder->GetBitmapStr(); + VLOG(100) << "child_builder:" << childKVCacheBlockBuilder + << " bitmap:" << childKVCacheBlockBuilder->GetBitmapStr(); return Status::OK(); } -Status KVStateCacheBuilder::Update( +Status KVCacheBuilder::Update( const std::vector& tokenList, int nextToken, const std::vector>& kvState) { std::vector tokenListCopy = tokenList; @@ -152,16 +151,16 @@ Status KVStateCacheBuilder::Update( this->rootTree->Insert(tokenListCopy, evictedNodeData); RETURN_ON_ASSERT(nodeData != nullptr, "Update llm cache failed."); - KVStateCacheBlockBuilder* kvStateCacheBlockBuilder; + KVCacheBlockBuilder* kvCacheBlockBuilder; TreeData* treeData = reinterpret_cast(nodeData->treeData->data); if (treeData->isPtr) { - kvStateCacheBlockBuilder = reinterpret_cast( - treeData->kvStateCacheBlockBuilder); + kvCacheBlockBuilder = + reinterpret_cast(treeData->kvCacheBlockBuilder); } else { ObjectID blockObjectID = treeData->builderObjectID; - RETURN_ON_ERROR(KVStateCacheBlockBuilder::Make(client, treeData, - kvStateCacheBlockBuilder)); - treeData->kvStateCacheBlockBuilder = kvStateCacheBlockBuilder; + RETURN_ON_ERROR( + KVCacheBlockBuilder::Make(client, treeData, kvCacheBlockBuilder)); + treeData->kvCacheBlockBuilder = kvCacheBlockBuilder; treeData->isPtr = true; blockIDSetToDelete.insert(blockObjectID); } @@ -170,7 +169,7 @@ Status KVStateCacheBuilder::Update( Delete(evictedNodeData); } - if (kvStateCacheBlockBuilder->IsFull()) { + if (kvCacheBlockBuilder->IsFull()) { /** * If the kv-state cache of the tree is full, trigger split. Delete the * empty node from the radix tree and split the tree. 
Then, kv-state cache @@ -184,14 +183,14 @@ Status KVStateCacheBuilder::Update( std::vector> nodeDataList = rootTree->Split(tokenListCopy, subTreeHeader); RETURN_ON_ASSERT(nodeDataList.size() != 0, "Split llm cache failed."); - KVStateCacheBlockBuilder* newKVStateCacheBlockBuilder; - Status status = Split(kvStateCacheBlockBuilder, nodeDataList, - newKVStateCacheBlockBuilder); + KVCacheBlockBuilder* newKVCacheBlockBuilder; + Status status = + Split(kvCacheBlockBuilder, nodeDataList, newKVCacheBlockBuilder); RETURN_ON_ERROR(status); TreeData* newTreeData = new TreeData(); RETURN_ON_ASSERT(newTreeData != nullptr, "Split llm cache failed."); - newTreeData->kvStateCacheBlockBuilder = newKVStateCacheBlockBuilder; + newTreeData->kvCacheBlockBuilder = newKVCacheBlockBuilder; newTreeData->isPtr = true; subTreeHeader->treeData->data = newTreeData; @@ -199,7 +198,7 @@ Status KVStateCacheBuilder::Update( rootTree->SetSubtreeData(newTreeData); VLOG(100) << "block split success"; - // kv_state_cache_builder->UnLock(); + // kv_cache_builder->UnLock(); status = Update(tokenList, nextToken, kvState); RETURN_ON_ERROR(status); } else { @@ -207,19 +206,18 @@ Status KVStateCacheBuilder::Update( OffsetData* data = new OffsetData(); RETURN_ON_ASSERT(data != nullptr, "Not enough memory for new offset data."); - RETURN_ON_ERROR(kvStateCacheBlockBuilder->Update(kvState, data)); + RETURN_ON_ERROR(kvCacheBlockBuilder->Update(kvState, data)); nodeData->nodeData->data = data; nodeData->nodeData->dataLength = sizeof(OffsetData); } - VLOG(100) << "builder:" << kvStateCacheBlockBuilder - << " bitmap:" << kvStateCacheBlockBuilder->GetBitmapStr(); + VLOG(100) << "builder:" << kvCacheBlockBuilder + << " bitmap:" << kvCacheBlockBuilder->GetBitmapStr(); return Status::OK(); } -Status KVStateCacheBuilder::Query( - const std::vector& tokenList, int token, - std::vector>& kvState) { +Status KVCacheBuilder::Query(const std::vector& tokenList, int token, + std::vector>& kvState) { std::vector tokenListCopy = tokenList; tokenListCopy.push_back(token); @@ -231,40 +229,40 @@ Status KVStateCacheBuilder::Query( int offset = data->offset; TreeData* treeData = reinterpret_cast(nodeData->treeData->data); - KVStateCacheBlockBuilder* kvStateCacheBlockBuilder; + KVCacheBlockBuilder* kvCacheBlockBuilder; if (treeData->isPtr) { - kvStateCacheBlockBuilder = reinterpret_cast( - treeData->kvStateCacheBlockBuilder); + kvCacheBlockBuilder = + reinterpret_cast(treeData->kvCacheBlockBuilder); } else { ObjectID blockObjectID = treeData->builderObjectID; - RETURN_ON_ERROR(KVStateCacheBlockBuilder::Make(client, treeData, - kvStateCacheBlockBuilder)); - treeData->kvStateCacheBlockBuilder = kvStateCacheBlockBuilder; + RETURN_ON_ERROR( + KVCacheBlockBuilder::Make(client, treeData, kvCacheBlockBuilder)); + treeData->kvCacheBlockBuilder = kvCacheBlockBuilder; treeData->isPtr = true; blockIDSetToDelete.insert(blockObjectID); } - return kvStateCacheBlockBuilder->Query(offset, kvState); + return kvCacheBlockBuilder->Query(offset, kvState); } -void KVStateCacheBuilder::Delete(std::shared_ptr evictedNodeData) { +void KVCacheBuilder::Delete(std::shared_ptr evictedNodeData) { TreeData* treeData = reinterpret_cast(evictedNodeData->treeData->data); - KVStateCacheBlockBuilder* kvStateCacheBlockBuilder; + KVCacheBlockBuilder* kvCacheBlockBuilder; if (treeData->isPtr) { - kvStateCacheBlockBuilder = reinterpret_cast( - treeData->kvStateCacheBlockBuilder); + kvCacheBlockBuilder = + reinterpret_cast(treeData->kvCacheBlockBuilder); } else { ObjectID blockObjectID = 
treeData->builderObjectID; - Status status = KVStateCacheBlockBuilder::Make(client, treeData, - kvStateCacheBlockBuilder); + Status status = + KVCacheBlockBuilder::Make(client, treeData, kvCacheBlockBuilder); if (!status.ok()) { // Not a deadly error, just log it and return. - LOG(FATAL) << "Failed to make kvStateCacheBlockBuilder. It may cause " + LOG(FATAL) << "Failed to make kvCacheBlockBuilder. It may cause " "memory leak."; return; } - treeData->kvStateCacheBlockBuilder = kvStateCacheBlockBuilder; + treeData->kvCacheBlockBuilder = kvCacheBlockBuilder; treeData->isPtr = true; blockIDSetToDelete.insert(blockObjectID); @@ -272,36 +270,34 @@ void KVStateCacheBuilder::Delete(std::shared_ptr evictedNodeData) { OffsetData* data = reinterpret_cast(evictedNodeData->nodeData->data); - kvStateCacheBlockBuilder->DeleteKVCache(data->offset); + kvCacheBlockBuilder->DeleteKVCache(data->offset); delete data; // TBD // Refactor this code. The data should be deleted by the RadixTree // delete (DataWrapper*) evictedNodeData->nodeData; if (evictedNodeData->cleanTreeData) { this->rootTree->ClearSubtreeData(treeData); - std::shared_ptr blockObject = - kvStateCacheBlockBuilder->_Seal(client); + std::shared_ptr blockObject = kvCacheBlockBuilder->_Seal(client); Status status = client.DelData(blockObject->id()); if (!status.ok()) { LOG(ERROR) << "Delete object failed: " << status.ToString() << " It may cause memory leak."; } - delete kvStateCacheBlockBuilder; + delete kvCacheBlockBuilder; } evictedNodeData->RecycleSource(); } -Status KVStateCacheBuilder::Merge(std::shared_ptr kvStateCache) { - if (kvStateCache == nullptr) { +Status KVCacheBuilder::Merge(std::shared_ptr kvCache) { + if (kvCache == nullptr) { return Status::OK(); } - std::shared_ptr globalCacheBuilder; - Status status = - KVStateCacheBuilder::Make(client, globalCacheBuilder, kvStateCache); + std::shared_ptr globalCacheBuilder; + Status status = KVCacheBuilder::Make(client, globalCacheBuilder, kvCache); RETURN_ON_ERROR(status); - std::shared_ptr globalCacheTree = kvStateCache->GetRootTree(); + std::shared_ptr globalCacheTree = kvCache->GetRootTree(); std::set> insertTokenList; std::vector> evicted_token_list; @@ -350,8 +346,7 @@ Status KVStateCacheBuilder::Merge(std::shared_ptr kvStateCache) { return Status::OK(); } -void KVStateCacheBuilder::GetCurrentBlockIDSet( - std::set& objectIDSet) { +void KVCacheBuilder::GetCurrentBlockIDSet(std::set& objectIDSet) { std::set subTreeData = rootTree->GetSubTreeDataSet(); for (auto iter = subTreeData.begin(); iter != subTreeData.end(); ++iter) { TreeData* treeData = reinterpret_cast(*iter); @@ -361,17 +356,17 @@ void KVStateCacheBuilder::GetCurrentBlockIDSet( } } -Status KVStateCacheBuilder::Build(Client& client) { return Status::OK(); } +Status KVCacheBuilder::Build(Client& client) { return Status::OK(); } -std::shared_ptr KVStateCacheBuilder::_Seal(Client& client) { +std::shared_ptr KVCacheBuilder::_Seal(Client& client) { VINEYARD_CHECK_OK(this->Build(client)); - std::shared_ptr kvStateCache = std::make_shared(); + std::shared_ptr kvCache = std::make_shared(); // 1. store the member variables to cache object meta - kvStateCache->meta_.AddKeyValue("tensorBytes", this->tensorBytes); - kvStateCache->meta_.AddKeyValue("version", this->version); - kvStateCache->meta_.AddKeyValue("layer", this->layer); + kvCache->meta_.AddKeyValue("tensorNBytes", this->tensorNBytes); + kvCache->meta_.AddKeyValue("version", this->version); + kvCache->meta_.AddKeyValue("layer", this->layer); // 2. 
seal all the block and put object id to cache object and // change the tree data from pointer to object id @@ -384,31 +379,28 @@ std::shared_ptr KVStateCacheBuilder::_Seal(Client& client) { continue; } - KVStateCacheBlockBuilder* kvStateCacheBlockBuilder = - reinterpret_cast( - treeData->kvStateCacheBlockBuilder); - std::shared_ptr kvStateCacheBlock = - kvStateCacheBlockBuilder->_Seal(client); - VINEYARD_CHECK_OK(client.Persist(kvStateCacheBlock->id())); - treeData->builderObjectID = kvStateCacheBlock->id(); + KVCacheBlockBuilder* kvCacheBlockBuilder = + reinterpret_cast(treeData->kvCacheBlockBuilder); + std::shared_ptr kvCacheBlock = kvCacheBlockBuilder->_Seal(client); + VINEYARD_CHECK_OK(client.Persist(kvCacheBlock->id())); + treeData->builderObjectID = kvCacheBlock->id(); treeData->isPtr = false; } // 3. put the serialized sequence radix tree to cache object meta - kvStateCache->meta_.AddKeyValue("radix_tree", - base64_encode(this->rootTree->Serialize())); + kvCache->meta_.AddKeyValue("radix_tree", + base64_encode(this->rootTree->Serialize())); // 4. put the object type to the meta - kvStateCache->meta_.SetTypeName(type_name()); + kvCache->meta_.SetTypeName(type_name()); - VINEYARD_CHECK_OK( - client.CreateMetaData(kvStateCache->meta_, kvStateCache->id_)); - VLOG(100) << "KVStateCacheBuilder::_Seal: " << kvStateCache->id_; + VINEYARD_CHECK_OK(client.CreateMetaData(kvCache->meta_, kvCache->id_)); + VLOG(100) << "KVCacheBuilder::_Seal: " << kvCache->id_; this->set_sealed(true); - return kvStateCache; + return kvCache; } -KVStateCacheBuilder::~KVStateCacheBuilder() { +KVCacheBuilder::~KVCacheBuilder() { // get all subtree data and node data std::set subTreeDataSet = rootTree->GetSubTreeDataSet(); std::set nodeDataSet = rootTree->GetAllNodeData(); @@ -416,10 +408,9 @@ KVStateCacheBuilder::~KVStateCacheBuilder() { for (auto iter = subTreeDataSet.begin(); iter != subTreeDataSet.end(); ++iter) { TreeData* treeData = reinterpret_cast(*iter); - if (treeData->isPtr == true && - treeData->kvStateCacheBlockBuilder != nullptr) { - delete reinterpret_cast( - treeData->kvStateCacheBlockBuilder); + if (treeData->isPtr == true && treeData->kvCacheBlockBuilder != nullptr) { + delete reinterpret_cast( + treeData->kvCacheBlockBuilder); delete treeData; } } @@ -431,15 +422,14 @@ KVStateCacheBuilder::~KVStateCacheBuilder() { } } -void KVStateCacheBuilder::Close() { +void KVCacheBuilder::Close() { std::set subTreeDataSet = rootTree->GetSubTreeDataSet(); for (auto iter = subTreeDataSet.begin(); iter != subTreeDataSet.end(); ++iter) { TreeData* treeData = reinterpret_cast(*iter); - if (treeData->isPtr && treeData->kvStateCacheBlockBuilder != nullptr) { + if (treeData->isPtr && treeData->kvCacheBlockBuilder != nullptr) { std::shared_ptr object = - reinterpret_cast( - treeData->kvStateCacheBlockBuilder) + reinterpret_cast(treeData->kvCacheBlockBuilder) ->_Seal(client); Status status = client.DelData(object->id()); if (!status.ok()) { diff --git a/modules/llm-cache/ds/kv_state_cache.h b/modules/llm-cache/ds/kv_cache.h similarity index 66% rename from modules/llm-cache/ds/kv_state_cache.h rename to modules/llm-cache/ds/kv_cache.h index 0c65bb43cd..b275b651c4 100644 --- a/modules/llm-cache/ds/kv_state_cache.h +++ b/modules/llm-cache/ds/kv_cache.h @@ -22,19 +22,19 @@ limitations under the License. 
#include "client/client.h" #include "common/util/logging.h" #include "common/util/status.h" -#include "llm-cache/ds/kv_state_cache_block.h" +#include "llm-cache/ds/kv_cache_block.h" #include "llm-cache/radix-tree/radix-tree.h" -#ifndef MODULES_LLM_CACHE_DS_KV_STATE_CACHE_H_ -#define MODULES_LLM_CACHE_DS_KV_STATE_CACHE_H_ +#ifndef MODULES_LLM_CACHE_DS_KV_CACHE_H_ +#define MODULES_LLM_CACHE_DS_KV_CACHE_H_ namespace vineyard { -class KVStateCache : public vineyard::Registered { +class KVCache : public vineyard::Registered { private: - std::vector> kvStateCacheBlockList; + std::vector> kvCacheBlockList; std::shared_ptr rootTree; - int tensorBytes; + int tensorNBytes; int cacheCapacity; int layer; uint64_t version; @@ -42,7 +42,7 @@ class KVStateCache : public vineyard::Registered { public: static std::unique_ptr Create() __attribute__((used)) { return std::static_pointer_cast( - std::unique_ptr{new KVStateCache()}); + std::unique_ptr{new KVCache()}); } void Construct(const ObjectMeta& meta) override; @@ -50,11 +50,11 @@ class KVStateCache : public vineyard::Registered { void Resolve(); // for test - std::vector>& GetKVStateCacheBlockList() { - return this->kvStateCacheBlockList; + std::vector>& GetKVCacheBlockList() { + return this->kvCacheBlockList; } - int GetTensorBytes() { return this->tensorBytes; } + int GetTensorNBytes() { return this->tensorNBytes; } int GetCacheCapacity() { return this->cacheCapacity; } @@ -66,37 +66,37 @@ class KVStateCache : public vineyard::Registered { void GetCurrentBlockIDSet(std::set& objectIDSet); - ~KVStateCache(); + ~KVCache(); - friend class KVStateCacheBuilder; + friend class KVCacheBuilder; }; -class KVStateCacheBuilder : public vineyard::ObjectBuilder { +class KVCacheBuilder : public vineyard::ObjectBuilder { Client& client; std::shared_ptr rootTree; std::set blockIDSetToDelete; - int tensorBytes; + int tensorNBytes; int layer; uint64_t version; int blockSize; int cacheCapacity; public: - KVStateCacheBuilder(Client& client, int tensorBytes, int layer, - std::shared_ptr& rootTree); + KVCacheBuilder(Client& client, int tensorNBytes, int layer, + std::shared_ptr& rootTree); static Status Make(Client& client, - std::shared_ptr& kvStateCacheBuilder, - int tensorBytes = 10, int cacheCapacity = 10, + std::shared_ptr& kvCacheBuilder, + int tensorNBytes = 10, int cacheCapacity = 10, int layer = 1, int blockSize = DEFAULT_BLOCK_SIZE); static Status Make(Client& client, - std::shared_ptr& kvStateCacheBuilder, - std::shared_ptr& cache); + std::shared_ptr& kvCacheBuilder, + std::shared_ptr& cache); - Status Split(KVStateCacheBlockBuilder* kvStateCacheBlockBuilder, + Status Split(KVCacheBlockBuilder* kvCacheBlockBuilder, std::vector> nodeDataList, - KVStateCacheBlockBuilder*& childKVStateCacheBlockBuilder); + KVCacheBlockBuilder*& childKVCacheBlockBuilder); Status Update(const std::vector& token_list, int next_token, const std::vector>& kv_state); @@ -106,7 +106,7 @@ class KVStateCacheBuilder : public vineyard::ObjectBuilder { void Delete(std::shared_ptr evicted_node); - Status Merge(std::shared_ptr kv_state_cache); + Status Merge(std::shared_ptr kv_cache); uint64_t GetVersion() { return this->version; } @@ -118,7 +118,7 @@ class KVStateCacheBuilder : public vineyard::ObjectBuilder { std::shared_ptr _Seal(Client& client) override; - uint64_t GetTensorBytes() { return this->tensorBytes; } + uint64_t GetTensorNBytes() { return this->tensorNBytes; } std::shared_ptr GetRootTree() { return this->rootTree; } @@ -134,9 +134,9 @@ class KVStateCacheBuilder : public 
vineyard::ObjectBuilder { void ClearBlockIDSetToDelete() { this->blockIDSetToDelete.clear(); } - ~KVStateCacheBuilder(); + ~KVCacheBuilder(); }; } // namespace vineyard -#endif // MODULES_LLM_CACHE_DS_KV_STATE_CACHE_H_ +#endif // MODULES_LLM_CACHE_DS_KV_CACHE_H_ diff --git a/modules/llm-cache/ds/kv_state_cache_block.cc b/modules/llm-cache/ds/kv_cache_block.cc similarity index 70% rename from modules/llm-cache/ds/kv_state_cache_block.cc rename to modules/llm-cache/ds/kv_cache_block.cc index c82b0453ed..4429b309de 100644 --- a/modules/llm-cache/ds/kv_state_cache_block.cc +++ b/modules/llm-cache/ds/kv_cache_block.cc @@ -20,12 +20,12 @@ limitations under the License. #include "client/client.h" #include "common/memory/memcpy.h" #include "common/util/logging.h" -#include "llm-cache/ds/kv_state_cache_block.h" +#include "llm-cache/ds/kv_cache_block.h" namespace vineyard { // this function will be removed in the future -std::string KVStateCacheBlock::GetBitmapStr() { +std::string KVCacheBlock::GetBitmapStr() { std::string result; const int bits = 8 * sizeof(uint64_t); for (int i = 0; i < this->bitmapSize; i++) { @@ -36,7 +36,7 @@ std::string KVStateCacheBlock::GetBitmapStr() { return result; } -std::string KVStateCacheBlockBuilder::GetBitmapStr() { +std::string KVCacheBlockBuilder::GetBitmapStr() { std::string result; const int bits = 8 * sizeof(uint64_t); for (int i = 0; i < this->bitmapSize; i++) { @@ -47,10 +47,10 @@ std::string KVStateCacheBlockBuilder::GetBitmapStr() { return result; } -void KVStateCacheBlock::Construct(const ObjectMeta& meta) { +void KVCacheBlock::Construct(const ObjectMeta& meta) { Object::Construct(meta); - std::string typeName = type_name(); + std::string typeName = type_name(); VINEYARD_ASSERT(meta.GetTypeName() == typeName, "Expect typename '" + typeName + "', but got '" + @@ -75,45 +75,44 @@ void KVStateCacheBlock::Construct(const ObjectMeta& meta) { this->bitmap[i] = this->meta_.GetKeyValue("bitmap_" + std::to_string(i)); } - this->tensorBytes = this->meta_.GetKeyValue("tensorBytes"); + this->tensorNBytes = this->meta_.GetKeyValue("tensorNBytes"); this->blockSize = this->meta_.GetKeyValue("block_size"); } -KVStateCacheBlock::~KVStateCacheBlock() { delete this->bitmap; } +KVCacheBlock::~KVCacheBlock() { delete this->bitmap; } -KVStateCacheBlockBuilder::KVStateCacheBlockBuilder(Client& client, - int tensorBytes, int layer, - int blockSize) +KVCacheBlockBuilder::KVCacheBlockBuilder(Client& client, int tensorNBytes, + int layer, int blockSize) : client(client) { this->blockSize = blockSize; this->bitmapSize = (blockSize + 63) / 64; this->bitmap = new uint64_t[this->bitmapSize]; memset(this->bitmap, UINT8_MAX, this->bitmapSize * sizeof(uint64_t)); - std::vector shape = {(int64_t)(blockSize), tensorBytes}; + std::vector shape = {(int64_t)(blockSize), tensorNBytes}; for (int i = 0; i < layer; i++) { this->keyStateTensorBuilderList.push_back( std::make_shared(client, shape)); this->valueStateTensorBuilderList.push_back( std::make_shared(client, shape)); } - this->tensorBytes = tensorBytes; + this->tensorNBytes = tensorNBytes; this->layer = layer; } -KVStateCacheBlockBuilder::KVStateCacheBlockBuilder( - Client& client, std::shared_ptr kvStateCacheBlock) +KVCacheBlockBuilder::KVCacheBlockBuilder( + Client& client, std::shared_ptr kvCacheBlock) : client(client) { - this->bitmapSize = kvStateCacheBlock->bitmapSize; - this->blockSize = kvStateCacheBlock->blockSize; + this->bitmapSize = kvCacheBlock->bitmapSize; + this->blockSize = kvCacheBlock->blockSize; VLOG(100) << "create 
builder from block object, bitmap size:" << this->bitmapSize << " block size:" << blockSize; this->bitmap = new uint64_t[this->bitmapSize]; for (int i = 0; i < this->bitmapSize; i++) { - this->bitmap[i] = kvStateCacheBlock->bitmap[i]; + this->bitmap[i] = kvCacheBlock->bitmap[i]; } - this->tensorBytes = kvStateCacheBlock->tensorBytes; - this->layer = kvStateCacheBlock->layer; - std::vector shape = {(int64_t)(blockSize), this->tensorBytes}; + this->tensorNBytes = kvCacheBlock->tensorNBytes; + this->layer = kvCacheBlock->layer; + std::vector shape = {(int64_t)(blockSize), this->tensorNBytes}; for (int currentLayer = 0; currentLayer < this->layer; currentLayer++) { this->keyStateTensorBuilderList.push_back( std::make_shared(client, shape)); @@ -124,24 +123,23 @@ KVStateCacheBlockBuilder::KVStateCacheBlockBuilder( for (int currentLayer = 0; currentLayer < this->layer; currentLayer++) { vineyard::memory::concurrent_memcpy( this->keyStateTensorBuilderList[currentLayer]->data(), - kvStateCacheBlock->keyStateTensorList[currentLayer]->data(), - (int64_t)(blockSize) * this->tensorBytes); + kvCacheBlock->keyStateTensorList[currentLayer]->data(), + (int64_t)(blockSize) * this->tensorNBytes); vineyard::memory::concurrent_memcpy( this->valueStateTensorBuilderList[currentLayer]->data(), - kvStateCacheBlock->valueStateTensorList[currentLayer]->data(), - (int64_t)(blockSize) * this->tensorBytes); + kvCacheBlock->valueStateTensorList[currentLayer]->data(), + (int64_t)(blockSize) * this->tensorNBytes); } } -Status KVStateCacheBlockBuilder::Make( - Client& client, TreeData* treeData, - KVStateCacheBlockBuilder*& kvStateCacheBlockBuilder) { +Status KVCacheBlockBuilder::Make(Client& client, TreeData* treeData, + KVCacheBlockBuilder*& kvCacheBlockBuilder) { RETURN_ON_ASSERT(treeData != nullptr && treeData->isPtr == false); ObjectID blockObjectID = treeData->builderObjectID; - std::shared_ptr blockObject; + std::shared_ptr blockObject; RETURN_ON_ERROR(client.FetchAndGetObject(blockObjectID, blockObject)); - kvStateCacheBlockBuilder = new KVStateCacheBlockBuilder(client, blockObject); + kvCacheBlockBuilder = new KVCacheBlockBuilder(client, blockObject); if (blockObjectID != blockObject->id()) { // If the object is migrated, we should delete the copied object. 
Status status = client.DelData(blockObject->id()); @@ -153,7 +151,7 @@ Status KVStateCacheBlockBuilder::Make( return Status::OK(); } -Status KVStateCacheBlockBuilder::Query( +Status KVCacheBlockBuilder::Query( int index, std::vector>& kvState) { RETURN_ON_ASSERT((index >= 0 && index < this->blockSize), "Index out of range: " + std::to_string(index)); @@ -164,16 +162,16 @@ Status KVStateCacheBlockBuilder::Query( LLMKV& valueState = kvState[currentLayer].second; VINEYARD_ASSERT(keyState.data == nullptr && valueState.data == nullptr); keyState.data = - keyStateTensorBuilderList[currentLayer]->data() + index * tensorBytes; - keyState.length = tensorBytes; - valueState.data = - valueStateTensorBuilderList[currentLayer]->data() + index * tensorBytes; - valueState.length = tensorBytes; + keyStateTensorBuilderList[currentLayer]->data() + index * tensorNBytes; + keyState.length = tensorNBytes; + valueState.data = valueStateTensorBuilderList[currentLayer]->data() + + index * tensorNBytes; + valueState.length = tensorNBytes; } return Status::OK(); } -int KVStateCacheBlockBuilder::FindEmptySlot() { +int KVCacheBlockBuilder::FindEmptySlot() { for (int i = 0; i < this->bitmapSize; i++) { if (this->bitmap[i] != 0) { int index = ffsll(this->bitmap[i]) - 1; @@ -183,7 +181,7 @@ int KVStateCacheBlockBuilder::FindEmptySlot() { return -1; } -bool KVStateCacheBlockBuilder::IsFull() { +bool KVCacheBlockBuilder::IsFull() { int left = this->blockSize; for (int i = 0; i < this->bitmapSize; i++) { if (this->bitmap[i] != 0 && ffsll(this->bitmap[i]) - 1 < left) { @@ -194,7 +192,7 @@ bool KVStateCacheBlockBuilder::IsFull() { return true; } -Status KVStateCacheBlockBuilder::Update( +Status KVCacheBlockBuilder::Update( const std::vector>& kvState, OffsetData* data) { int index = this->FindEmptySlot(); RETURN_ON_ASSERT((index >= 0 && index < this->blockSize), @@ -205,15 +203,15 @@ Status KVStateCacheBlockBuilder::Update( for (int currentLayer = 0; currentLayer < this->layer; currentLayer++) { LLMKV keyState = kvState[currentLayer].first; LLMKV valueState = kvState[currentLayer].second; - RETURN_ON_ASSERT((keyState.length == (size_t) this->tensorBytes && - valueState.length == (size_t) this->tensorBytes)); + RETURN_ON_ASSERT((keyState.length == (size_t) this->tensorNBytes && + valueState.length == (size_t) this->tensorNBytes)); uint8_t* keyData = keyStateTensorBuilderList[currentLayer]->data(); uint8_t* valueData = valueStateTensorBuilderList[currentLayer]->data(); - vineyard::memory::concurrent_memcpy(keyData + index * this->tensorBytes, - keyState.data, this->tensorBytes); - vineyard::memory::concurrent_memcpy(valueData + index * this->tensorBytes, - valueState.data, this->tensorBytes); + vineyard::memory::concurrent_memcpy(keyData + index * this->tensorNBytes, + keyState.data, this->tensorNBytes); + vineyard::memory::concurrent_memcpy(valueData + index * this->tensorNBytes, + valueState.data, this->tensorNBytes); } data->offset = index; @@ -221,8 +219,7 @@ Status KVStateCacheBlockBuilder::Update( return Status::OK(); } -int16_t KVStateCacheBlockBuilder::Split(KVStateCacheBlockBuilder* child, - int index) { +int16_t KVCacheBlockBuilder::Split(KVCacheBlockBuilder* child, int index) { // Child builder must be empty. 
int childIndex = child->FindEmptySlot(); for (int currentLayer = 0; currentLayer < this->layer; currentLayer++) { @@ -236,62 +233,61 @@ int16_t KVStateCacheBlockBuilder::Split(KVStateCacheBlockBuilder* child, child->valueStateTensorBuilderList[currentLayer]; uint8_t* keyState = - keyStateTensorBuilder->data() + index * this->tensorBytes; + keyStateTensorBuilder->data() + index * this->tensorNBytes; uint8_t* valueState = - valueStateTensorBuilder->data() + index * this->tensorBytes; + valueStateTensorBuilder->data() + index * this->tensorNBytes; uint8_t* childKeyState = - childKeyStateTensorBuilder->data() + childIndex * this->tensorBytes; + childKeyStateTensorBuilder->data() + childIndex * this->tensorNBytes; uint8_t* childValueState = - childValueStateTensorBuilder->data() + childIndex * this->tensorBytes; + childValueStateTensorBuilder->data() + childIndex * this->tensorNBytes; vineyard::memory::concurrent_memcpy(childKeyState, keyState, - this->tensorBytes); + this->tensorNBytes); vineyard::memory::concurrent_memcpy(childValueState, valueState, - this->tensorBytes); + this->tensorNBytes); } ACQUIRE_BIT_RESOURCE(child->bitmap[childIndex / 64], childIndex % 64); FREE_BIT_RESOURCE(this->bitmap[index / 64], index % 64); return childIndex; } -Status KVStateCacheBlockBuilder::Build(Client& client) { return Status::OK(); } +Status KVCacheBlockBuilder::Build(Client& client) { return Status::OK(); } -std::shared_ptr KVStateCacheBlockBuilder::_Seal(Client& client) { +std::shared_ptr KVCacheBlockBuilder::_Seal(Client& client) { VINEYARD_CHECK_OK(this->Build(client)); - std::shared_ptr kvStateCacheBlock = - std::make_shared(); + std::shared_ptr kvCacheBlock = std::make_shared(); // 1. seal keyStateTensorBuilder and valueStateTensorBuilder for (int currentLayer = 0; currentLayer < this->layer; currentLayer++) { - kvStateCacheBlock->meta_.AddMember( + kvCacheBlock->meta_.AddMember( "keyStateTensorBuilder_" + std::to_string(currentLayer), keyStateTensorBuilderList[currentLayer]->Seal(client)); - kvStateCacheBlock->meta_.AddMember( + kvCacheBlock->meta_.AddMember( "valueStateTensorBuilder_" + std::to_string(currentLayer), valueStateTensorBuilderList[currentLayer]->Seal(client)); } // 2. store the member field to meta - kvStateCacheBlock->meta_.AddKeyValue("bitmap_size", this->bitmapSize); + kvCacheBlock->meta_.AddKeyValue("bitmap_size", this->bitmapSize); for (int i = 0; i < this->bitmapSize; i++) { - kvStateCacheBlock->meta_.AddKeyValue("bitmap_" + std::to_string(i), - this->bitmap[i]); + kvCacheBlock->meta_.AddKeyValue("bitmap_" + std::to_string(i), + this->bitmap[i]); } - kvStateCacheBlock->meta_.AddKeyValue("block_size", this->blockSize); - kvStateCacheBlock->meta_.AddKeyValue("tensorBytes", this->tensorBytes); - kvStateCacheBlock->meta_.AddKeyValue("layer", this->layer); + kvCacheBlock->meta_.AddKeyValue("block_size", this->blockSize); + kvCacheBlock->meta_.AddKeyValue("tensorNBytes", this->tensorNBytes); + kvCacheBlock->meta_.AddKeyValue("layer", this->layer); // 3. 
set the object type to meta - kvStateCacheBlock->meta_.SetTypeName(type_name()); + kvCacheBlock->meta_.SetTypeName(type_name()); VINEYARD_CHECK_OK( - client.CreateMetaData(kvStateCacheBlock->meta_, kvStateCacheBlock->id_)); + client.CreateMetaData(kvCacheBlock->meta_, kvCacheBlock->id_)); this->set_sealed(true); - return kvStateCacheBlock; + return kvCacheBlock; } -void KVStateCacheBlockBuilder::PrintKVStateCacheBlock() { +void KVCacheBlockBuilder::PrintKVCacheBlock() { LOG(INFO) << "builder:" << this; for (int i = 0; i < this->blockSize; i++) { LOG(INFO) << "index:" << i << " bitmap:" << this->GetBitmapStr(); @@ -304,13 +300,13 @@ void KVStateCacheBlockBuilder::PrintKVStateCacheBlock() { uint8_t* key_state_data = keyStateTensorBuilderList[currentLayer]->data(); uint8_t* value_state_data = valueStateTensorBuilderList[currentLayer]->data(); - // print the first tensorBytes bytes + // print the first tensorNBytes bytes std::string keyState = ""; std::string valueState = ""; - for (int j = 0; j < this->tensorBytes; j++) { - keyState += std::to_string(key_state_data[i * tensorBytes + j]) + " "; + for (int j = 0; j < this->tensorNBytes; j++) { + keyState += std::to_string(key_state_data[i * tensorNBytes + j]) + " "; valueState += - std::to_string(value_state_data[i * tensorBytes + j]) + " "; + std::to_string(value_state_data[i * tensorNBytes + j]) + " "; } LOG(INFO) << "keyState:" << keyState; LOG(INFO) << "valueState:" << valueState; @@ -320,6 +316,6 @@ void KVStateCacheBlockBuilder::PrintKVStateCacheBlock() { LOG(INFO) << "=========================="; } -KVStateCacheBlockBuilder::~KVStateCacheBlockBuilder() { delete this->bitmap; } +KVCacheBlockBuilder::~KVCacheBlockBuilder() { delete this->bitmap; } } // namespace vineyard diff --git a/modules/llm-cache/ds/kv_state_cache_block.h b/modules/llm-cache/ds/kv_cache_block.h similarity index 77% rename from modules/llm-cache/ds/kv_state_cache_block.h rename to modules/llm-cache/ds/kv_cache_block.h index 808d2cbc45..4d88281083 100644 --- a/modules/llm-cache/ds/kv_state_cache_block.h +++ b/modules/llm-cache/ds/kv_cache_block.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ -#ifndef MODULES_LLM_CACHE_DS_KV_STATE_CACHE_BLOCK_H_ -#define MODULES_LLM_CACHE_DS_KV_STATE_CACHE_BLOCK_H_ +#ifndef MODULES_LLM_CACHE_DS_KV_CACHE_BLOCK_H_ +#define MODULES_LLM_CACHE_DS_KV_CACHE_BLOCK_H_ #include #include @@ -48,26 +48,26 @@ struct OffsetData { struct TreeData { union { - void* kvStateCacheBlockBuilder; + void* kvCacheBlockBuilder; uint64_t builderObjectID; }; bool isPtr = true; }; /** - * @brief KVStateCacheBlock is a cache for kv-cache of LLM. When a new prompt - * comes, LLM can query KVStateCacheBlock to get the state of the kv-cache to + * @brief KVCacheBlock is a cache for kv-cache of LLM. When a new prompt + * comes, LLM can query KVCacheBlock to get the state of the kv-cache to * avoid calculate the kv-cache again if the new prompt is similar to the * previous one. * - * KVStateCacheBlock is stored in vineyard as a vineyard object which contains a + * KVCacheBlock is stored in vineyard as a vineyard object which contains a * radix tree. The token sequence is the key of the radix tree and the value * point out the offset of the kv-cache in the tensor list. * - * KVStateCacheBlock can be shared by multiple machines. + * KVCacheBlock can be shared by multiple machines. 
*/ -class KVStateCacheBlock : public vineyard::Registered { +class KVCacheBlock : public vineyard::Registered { private: std::vector> keyStateTensorList; std::vector> valueStateTensorList; @@ -75,19 +75,19 @@ class KVStateCacheBlock : public vineyard::Registered { int blockSize; int bitmapSize; int layer; - int tensorBytes; + int tensorNBytes; public: static std::unique_ptr Create() __attribute__((used)) { return std::static_pointer_cast( - std::unique_ptr{new KVStateCacheBlock()}); + std::unique_ptr{new KVCacheBlock()}); } void Construct(const ObjectMeta& meta) override; std::string GetBitmapStr(); - uint64_t GetTensorBytes() { return this->tensorBytes; } + uint64_t GetTensorNBytes() { return this->tensorNBytes; } uint64_t* GetBitmap() { return this->bitmap; } @@ -109,12 +109,12 @@ class KVStateCacheBlock : public vineyard::Registered { return this->valueStateTensorList; } - ~KVStateCacheBlock(); + ~KVCacheBlock(); - friend class KVStateCacheBlockBuilder; + friend class KVCacheBlockBuilder; }; -class KVStateCacheBlockBuilder : public ObjectBuilder { +class KVCacheBlockBuilder : public ObjectBuilder { private: Client& client; std::vector> keyStateTensorBuilderList; @@ -124,20 +124,20 @@ class KVStateCacheBlockBuilder : public ObjectBuilder { uint64_t* bitmap; int blockSize; int bitmapSize; - int tensorBytes; + int tensorNBytes; int layer; int FindEmptySlot(); public: - KVStateCacheBlockBuilder(Client& client, int tensorBytes, int layer, - int blockSize); + KVCacheBlockBuilder(Client& client, int tensorNBytes, int layer, + int blockSize); - KVStateCacheBlockBuilder( - Client& client, std::shared_ptr kv_state_cache_block); + KVCacheBlockBuilder(Client& client, + std::shared_ptr kv_cache_block); static Status Make(Client& client, TreeData* treeData, - KVStateCacheBlockBuilder*& kvStateCacheBlockBuilder); + KVCacheBlockBuilder*& kvCacheBlockBuilder); /** * @brief Update the kv-state using next token. @@ -165,7 +165,7 @@ class KVStateCacheBlockBuilder : public ObjectBuilder { std::shared_ptr _Seal(Client& client) override; - int16_t Split(KVStateCacheBlockBuilder* child, int index); + int16_t Split(KVCacheBlockBuilder* child, int index); const std::shared_ptr& GetKeyStateBuilder(int layer) { return keyStateTensorBuilderList[layer]; @@ -193,15 +193,15 @@ class KVStateCacheBlockBuilder : public ObjectBuilder { uint64_t* GetBitmap() { return this->bitmap; } - uint64_t GetTensorBytes() { return this->tensorBytes; } + uint64_t GetTensorNBytes() { return this->tensorNBytes; } int GetBlockSize() { return this->blockSize; } - void PrintKVStateCacheBlock(); + void PrintKVCacheBlock(); - ~KVStateCacheBlockBuilder(); + ~KVCacheBlockBuilder(); }; } // namespace vineyard -#endif // MODULES_LLM_CACHE_DS_KV_STATE_CACHE_BLOCK_H_ +#endif // MODULES_LLM_CACHE_DS_KV_CACHE_BLOCK_H_ diff --git a/modules/llm-cache/ds/kv_state_cache_manager.cc b/modules/llm-cache/ds/kv_cache_manager.cc similarity index 81% rename from modules/llm-cache/ds/kv_state_cache_manager.cc rename to modules/llm-cache/ds/kv_cache_manager.cc index 2023eadc8c..659db8c090 100644 --- a/modules/llm-cache/ds/kv_state_cache_manager.cc +++ b/modules/llm-cache/ds/kv_cache_manager.cc @@ -22,22 +22,21 @@ limitations under the License. 
#include "client/client.h" #include "common/util/logging.h" #include "common/util/status.h" -#include "llm-cache/ds/kv_state_cache.h" -#include "llm-cache/ds/kv_state_cache_manager.h" +#include "llm-cache/ds/kv_cache.h" +#include "llm-cache/ds/kv_cache_manager.h" #include "llm-cache/storage/blob_storage.h" #include "llm-cache/storage/local_file_storage.h" namespace vineyard { -KVStateCacheManager::KVStateCacheManager( - std::shared_ptr storageImpl) { +KVCacheManager::KVCacheManager(std::shared_ptr storageImpl) { storage = storageImpl; } // use the memory storage for manager -Status KVStateCacheManager::Make(Client& client, - std::shared_ptr& manager, - VineyardCacheConfig& config) { +Status KVCacheManager::Make(Client& client, + std::shared_ptr& manager, + VineyardCacheConfig& config) { if (config.tensorByte <= 0 || config.cacheCapacity <= 0 || config.layer <= 0) { return Status::Invalid("Invalid tensor byte, cache capacity or layer."); @@ -58,15 +57,15 @@ Status KVStateCacheManager::Make(Client& client, config.layer, config.blockSize, config.syncInterval, config.llmCacheSyncLock, config.llmCacheObjectName, config.llmRefcntObjectName)); - manager = std::make_shared(blob_storage); + manager = std::make_shared(blob_storage); manager->config = std::make_shared(config); return Status::OK(); } // use the file storage for manager -Status KVStateCacheManager::Make(std::shared_ptr& manager, - FileCacheConfig& config) { - if (config.batchSize <= 0 || config.splitNumber <= 0) { +Status KVCacheManager::Make(std::shared_ptr& manager, + FileCacheConfig& config) { + if (config.chunkSize <= 0 || config.hashChunkSize <= 0) { return Status::Invalid("Invalid batch size or split number."); } if (config.tensorByte <= 0 || config.cacheCapacity <= 0 || @@ -77,13 +76,13 @@ Status KVStateCacheManager::Make(std::shared_ptr& manager, std::shared_ptr file_storage; if (config.filesystemType == FilesystemType::LOCAL) { file_storage = std::make_shared( - config.tensorByte, config.cacheCapacity, config.layer, config.batchSize, - config.splitNumber, config.root, config.clientGCInterval, config.ttl, + config.tensorByte, config.cacheCapacity, config.layer, config.chunkSize, + config.hashChunkSize, config.root, config.gcInterval, config.ttl, config.enbaleGlobalGC, config.globalGCInterval, config.globalTTL); } else { return Status::Invalid("Unsupported filesystem type"); } - manager = std::make_shared(file_storage); + manager = std::make_shared(file_storage); RETURN_ON_ERROR(file_storage->Init()); manager->config = std::make_shared(config); return Status::OK(); @@ -94,7 +93,7 @@ Status KVStateCacheManager::Make(std::shared_ptr& manager, * manager. * * @param tokenList The token list to be updated. - * @param kvStateList The kv state list of the token list. + * @param kvCacheList The kv state list of the token list. * It's a 2D vector, the first dimension is the token index, * and the second dimension is the layer index. 
* The kv state is a pair of LLMKV, the first is the K tensor @@ -111,43 +110,43 @@ Status KVStateCacheManager::Make(std::shared_ptr& manager, * * * * * Assume the layer is 2, and the token list is [1,2] you should * * * allocate the memory for the kv state like this: * - * * std::vector>> kvStateList;* + * * std::vector>> kvCacheList;* * * for (int i = 0; i < 2; i++) { * * * std::vector> kvState; * * * for (int j = 0; j < 2; j++) { * * * LLMKV key_state; * * * LLMKV value_state; * - * * key_state.data = malloc(tensorBytes); * - * * value_state.data = malloc(tensorBytes) * + * * key_state.data = malloc(tensorNBytes); * + * * value_state.data = malloc(tensorNBytes) * * * // Copy the k_state of LLM KV Cache to key_state.data * * * // Copy the v_state of LLM KV Cache to value_state.data * - * * key_state.length = tensorBytes; * - * * value_state.length = tensorBytes; * + * * key_state.length = tensorNBytes; * + * * value_state.length = tensorNBytes; * * * kvState.emplace_back(key_state, value_state); * * * } * - * * kvStateList.push_back(kvState); * + * * kvCacheList.push_back(kvState); * * *} * * * * * * After calling this function, you must release(free) the * - * * kv buffer of the kvStateList manually * + * * kv buffer of the kvCacheList manually * * * * * ***************************************************************** * * * @note The length of the token list should be as same as the length of the - * kvStateList. + * kvCacheList. * * * @return Status */ -Status KVStateCacheManager::Update( +Status KVCacheManager::Update( const std::vector& tokenList, - const std::vector>>& kvStateList, + const std::vector>>& kvCacheList, size_t& updated) { - if (kvStateList.size() != tokenList.size()) { + if (kvCacheList.size() != tokenList.size()) { return Status::Invalid("Token list size not match kv state list size"); } - return storage->Update(tokenList, kvStateList, updated); + return storage->Update(tokenList, kvCacheList, updated); } /** @@ -156,7 +155,7 @@ Status KVStateCacheManager::Update( * * @param prefix The prefix of the token list. * @param tokenList The token list to be updated. - * @param kvStateList The kv state list of the token list. + * @param kvCacheList The kv state list of the token list. * It's a 2D vector, the first dimension is the token index, * and the second dimension is the layer index. 
* The kv state is a pair of LLMKV, the first is the K tensor @@ -173,41 +172,41 @@ Status KVStateCacheManager::Update( * * * * * Assume the layer is 2, and the token list is [1,2] you should * * * allocate the memory for the kv state like this: * - * * std::vector>> kvStateList;* + * * std::vector>> kvCacheList;* * * for (int i = 0; i < 2; i++) { * * * std::vector> kvState; * * * for (int j = 0; j < 2; j++) { * * * LLMKV key_state; * * * LLMKV value_state; * - * * key_state.data = malloc(tensorBytes); * - * * value_state.data = malloc(tensorBytes) * + * * key_state.data = malloc(tensorNBytes); * + * * value_state.data = malloc(tensorNBytes) * * * // Copy the k_state of LLM KV Cache to key_state.data * * * // Copy the v_state of LLM KV Cache to value_state.data * - * * key_state.length = tensorBytes; * - * * value_state.length = tensorBytes; * + * * key_state.length = tensorNBytes; * + * * value_state.length = tensorNBytes; * * * kvState.emplace_back(key_state, value_state); * * * } * - * * kvStateList.push_back(kvState); * + * * kvCacheList.push_back(kvState); * * *} * * * * * * After calling this function, you must release(free) the * - * * kv buffer of the kvStateList manually * + * * kv buffer of the kvCacheList manually * * * * * ***************************************************************** * * @note The length of the token list should be as same as the length of the - * kvStateList. + * kvCacheList. * * @return Status */ -Status KVStateCacheManager::Update( +Status KVCacheManager::Update( const std::vector& prefix, const std::vector& tokenList, - const std::vector>>& kvStateList, + const std::vector>>& kvCacheList, size_t& updated) { - if (kvStateList.size() != tokenList.size()) { + if (kvCacheList.size() != tokenList.size()) { return Status::Invalid("Token list size not match kv state list size"); } - return storage->Update(prefix, tokenList, kvStateList, updated); + return storage->Update(prefix, tokenList, kvCacheList, updated); } /** @@ -228,12 +227,12 @@ Status KVStateCacheManager::Update( * * for (int i = 0; i < 2; i++) { * * * LLMKV key_state; * * * LLMKV value_state; * - * * key_state.data = malloc(tensorBytes); * - * * value_state.data = malloc(tensorBytes) * + * * key_state.data = malloc(tensorNBytes); * + * * value_state.data = malloc(tensorNBytes) * * * // Copy the k_state of LLM KV Cache to key_state.data * * * // Copy the v_state of LLM KV Cache to value_state.data * - * * key_state.length = tensorBytes; * - * * value_state.length = tensorBytes; * + * * key_state.length = tensorNBytes; * + * * value_state.length = tensorNBytes; * * * kvState.emplace_back(key_state, value_state); * * *} * * * * @@ -245,67 +244,18 @@ Status KVStateCacheManager::Update( * @return Status to indicate whether the kv state has been updated * successfully. */ -Status KVStateCacheManager::Update( +Status KVCacheManager::Update( const std::vector& tokenList, int nextToken, const std::vector>& kvState) { return storage->Update(tokenList, nextToken, kvState); } -/** - * @brief Query the kv state with the given token and its prefix in the kv state - * cache manager. - * - * @param tokenList The token list as the prefix of the next token. - * @param nextToken The next token to be queried. - * @param kvState The kv state of the next token. It must be initialized before - * calling this function, including the data and length of the kv - * tensor. Also, the length of the kvState should be as same as - * the layer of the kv state. 
- * - * ***************************************************************** - * * Only support for blob storage, the kv state is managed by the * - * * kv state cache manager, the caller does not need to malloc * - * * and free the memory of the kv state. Besides, the data * - * * pointer should be nullptr and the length should be 0. * - * * * - * * Assume the layer is 2, you should allocate the memory for the * - * * kv state like this: * - * * std::vector> kvState; * - * * for (int i = 0; i < 2; i++) { * - * * LLMKV key_state; * - * * LLMKV value_state; * - * * key_state.data = nullptr * - * * value_state.data = nullptr * - * * key_state.length = 0; * - * * value_state.length = 0; * - * * kvState.emplace_back(key_state, value_state); * - * *} * - * * * - * * After calling this function, the key_state's data is pointing * - * * to the K tensor data stored in vineyard blob, and the * - * * value_state's data is pointing to the V tensor data stored in * - * * vineyard blob. All the length of the kv state is the size of * - * * the tensor data. Then you can copy the kv state to the LLM KV * - * * Cache. The memory of the kv state will be freed when calling * - * * the close function of the kv state cache manager. * - * * * - * ***************************************************************** - * - * @return Status to indicate whether the kv state has been queried - * successfully. - */ -Status KVStateCacheManager::Query( - const std::vector& tokenList, int nextToken, - std::vector>& kvState) { - return storage->Query(tokenList, nextToken, kvState); -} - /** * @brief Query the kv state with the given token list in the kv state cache * manager. * * @param tokenList The token list to be queried. - * @param kvStateList The kv state list of the token list. + * @param kvCacheList The kv state list of the token list. * It must be initialized before calling this function, * including the data and length of the kv tensor. 
* It's a 2D vector, the first dimension is the token index, @@ -327,7 +277,7 @@ Status KVStateCacheManager::Query( * * * * * Assume the layer is 2, and the token list is [1,2] you should * * * allocate the memory for the kv state like this: * - * * std::vector>> kvStateList;* + * * std::vector>> kvCacheList;* * * for (int i = 0; i < 2; i++) { * * * std::vector> kvState; * * * for (int j = 0; j < 2; j++) { * @@ -339,7 +289,7 @@ Status KVStateCacheManager::Query( * * value_state.length = 0; * * * kvState.emplace_back(key_state, value_state); * * * } * - * * kvStateList.push_back(kvState); * + * * kvCacheList.push_back(kvState); * * *} * * * * * * After calling this function, the key_state's data is pointing * @@ -360,19 +310,19 @@ Status KVStateCacheManager::Query( * * * * * Assume the layer is 2, and the token list is [1,2] you should * * * allocate the memory for the kv state like this: * - * * std::vector>> kvStateList;* + * * std::vector>> kvCacheList;* * * for (int i = 0; i < 2; i++) { * * * std::vector> kvState; * * * for (int j = 0; j < 2; j++) { * * * LLMKV key_state; * * * LLMKV value_state; * - * * key_state.data = malloc(tensorBytes); * - * * value_state.data = malloc(tensorBytes) * - * * key_state.length = tensorBytes; * - * * value_state.length = tensorBytes; * + * * key_state.data = malloc(tensorNBytes); * + * * value_state.data = malloc(tensorNBytes) * + * * key_state.length = tensorNBytes; * + * * value_state.length = tensorNBytes; * * * kvState.emplace_back(key_state, value_state); * * * } * - * * kvStateList.push_back(kvState); * + * * kvCacheList.push_back(kvState); * * *} * * * * * * After calling this function, the key_state and value_state * @@ -383,35 +333,86 @@ Status KVStateCacheManager::Query( * ***************************************************************** * * @note The length of the token list should be as same as the length of the - * kvStateList. and the second dimension of the kvStateList should be as same as + * kvCacheList. and the second dimension of the kvCacheList should be as same as * the layer of the kv state. * * @return Status */ -Status KVStateCacheManager::Query( +Status KVCacheManager::Query( const std::vector& tokenList, - std::vector>>& kvStateList, + std::vector>>& kvCacheList, size_t& matched) { - return storage->Query(tokenList, kvStateList, matched); + return storage->Query(tokenList, kvCacheList, matched); } -Status KVStateCacheManager::ClearGlobalCache(Client& client, - VineyardCacheConfig& config) { +/** + * @brief Query the kv state with the given token and its prefix in the kv state + * cache manager. + * + * @param tokenList The token list as the prefix of the next token. + * @param nextToken The next token to be queried. + * @param kvState The kv state of the next token. It must be initialized before + * calling this function, including the data and length of the kv + * tensor. Also, the length of the kvState should be as same as + * the layer of the kv state. + * + * ***************************************************************** + * * Only support for blob storage, the kv state is managed by the * + * * kv state cache manager, the caller does not need to malloc * + * * and free the memory of the kv state. Besides, the data * + * * pointer should be nullptr and the length should be 0. 
* + * * * + * * Assume the layer is 2, you should allocate the memory for the * + * * kv state like this: * + * * std::vector> kvState; * + * * for (int i = 0; i < 2; i++) { * + * * LLMKV key_state; * + * * LLMKV value_state; * + * * key_state.data = nullptr * + * * value_state.data = nullptr * + * * key_state.length = 0; * + * * value_state.length = 0; * + * * kvState.emplace_back(key_state, value_state); * + * *} * + * * * + * * After calling this function, the key_state's data is pointing * + * * to the K tensor data stored in vineyard blob, and the * + * * value_state's data is pointing to the V tensor data stored in * + * * vineyard blob. All the length of the kv state is the size of * + * * the tensor data. Then you can copy the kv state to the LLM KV * + * * Cache. The memory of the kv state will be freed when calling * + * * the close function of the kv state cache manager. * + * * * + * ***************************************************************** + * + * @return Status to indicate whether the kv state has been queried + * successfully. + */ +Status KVCacheManager::Query(const std::vector& prefix, int nextToken, + std::vector>& kvState) { + return storage->Query(prefix, nextToken, kvState); +} + +Status KVCacheManager::Query( + const std::vector& prefix, const std::vector& tokenList, + std::vector>>& kvCacheList, + size_t& matched) { + return storage->Query(prefix, tokenList, kvCacheList, matched); +} + +Status KVCacheManager::ClearGlobalCache(Client& client, + VineyardCacheConfig& config) { return BlobStorage::ClearGlobalCache(client, config.llmCacheSyncLock, config.llmCacheObjectName, config.llmRefcntObjectName); } -void KVStateCacheManager::Close() { storage->CloseCache(); } +void KVCacheManager::Close() { storage->CloseCache(); } -void KVStateCacheManager::StopGlobalGCThread() { - storage->StopGlobalGCThread(); -} +void KVCacheManager::StopGlobalGCThread() { storage->StopGlobalGCThread(); } -void KVStateCacheManager::StartGlobalGCThread() { - storage->StartGlobalGCThread(); -} +void KVCacheManager::StartGlobalGCThread() { storage->StartGlobalGCThread(); } -KVStateCacheManager::~KVStateCacheManager() {} +KVCacheManager::~KVCacheManager() {} } // namespace vineyard diff --git a/modules/llm-cache/ds/kv_state_cache_manager.h b/modules/llm-cache/ds/kv_cache_manager.h similarity index 72% rename from modules/llm-cache/ds/kv_state_cache_manager.h rename to modules/llm-cache/ds/kv_cache_manager.h index 9bf513da28..8cccabc8f4 100644 --- a/modules/llm-cache/ds/kv_state_cache_manager.h +++ b/modules/llm-cache/ds/kv_cache_manager.h @@ -20,26 +20,25 @@ limitations under the License. 
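The manager implementation above pairs with the declarations that follow, and together they fix the typical caller flow: Make a manager from a config, Update with caller-owned buffers, then Query back into buffers of the same shape. The sketch below follows the allocation pattern the doc comments prescribe for the file-backed cache; the sizes are illustrative, the token count is assumed to be a multiple of config.chunkSize, LLMKV is assumed to be the {data, length} pair used throughout this module, and RETURN_ON_ERROR is the module's status-check macro:

```cpp
// End-to-end sketch of the renamed KVCacheManager API (file-backed path).
#include <cstdlib>
#include <memory>
#include <utility>
#include <vector>

#include "llm-cache/ds/config.h"
#include "llm-cache/ds/kv_cache_manager.h"

vineyard::Status UpdateThenQuery(vineyard::FileCacheConfig& config) {
  std::shared_ptr<vineyard::KVCacheManager> manager;
  RETURN_ON_ERROR(vineyard::KVCacheManager::Make(manager, config));

  const int layer = 2;                 // illustrative
  const size_t tensorNBytes = 4096;    // illustrative
  std::vector<int> tokens = {1, 2, 3, 4};  // assumed multiple of chunkSize

  // One (K, V) pair per token per layer, caller-allocated as the doc
  // comments above prescribe.
  std::vector<std::vector<std::pair<vineyard::LLMKV, vineyard::LLMKV>>>
      kvCacheList;
  for (size_t i = 0; i < tokens.size(); ++i) {
    std::vector<std::pair<vineyard::LLMKV, vineyard::LLMKV>> kvState;
    for (int j = 0; j < layer; ++j) {
      vineyard::LLMKV k{malloc(tensorNBytes), tensorNBytes};
      vineyard::LLMKV v{malloc(tensorNBytes), tensorNBytes};
      // ... copy the model's K/V tensors into k.data and v.data ...
      kvState.emplace_back(k, v);
    }
    kvCacheList.push_back(kvState);
  }

  size_t updated = 0, matched = 0;
  RETURN_ON_ERROR(manager->Update(tokens, kvCacheList, updated));
  RETURN_ON_ERROR(manager->Query(tokens, kvCacheList, matched));

  // The caller owns these buffers and must free them.
  for (auto& kvState : kvCacheList) {
    for (auto& kv : kvState) {
      free(kv.first.data);
      free(kv.second.data);
    }
  }
  manager->Close();
  return vineyard::Status::OK();
}
```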
 #include 

 #include "llm-cache/ds/config.h"
-#include "llm-cache/ds/kv_state_cache.h"
+#include "llm-cache/ds/kv_cache.h"
 #include "llm-cache/storage/blob_storage.h"
 #include "llm-cache/storage/file_storage.h"

-#ifndef MODULES_LLM_CACHE_DS_KV_STATE_CACHE_MANAGER_H_
-#define MODULES_LLM_CACHE_DS_KV_STATE_CACHE_MANAGER_H_
+#ifndef MODULES_LLM_CACHE_DS_KV_CACHE_MANAGER_H_
+#define MODULES_LLM_CACHE_DS_KV_CACHE_MANAGER_H_

 namespace vineyard {

-class KVStateCacheManager {
+class KVCacheManager {
  public:
-  explicit KVStateCacheManager(std::shared_ptr storageImpl);
+  explicit KVCacheManager(std::shared_ptr storageImpl);

-  ~KVStateCacheManager();
+  ~KVCacheManager();

-  static Status Make(Client& client,
-                     std::shared_ptr& manager,
+  static Status Make(Client& client, std::shared_ptr& manager,
                      VineyardCacheConfig& config);

-  static Status Make(std::shared_ptr& manager,
+  static Status Make(std::shared_ptr& manager,
                      FileCacheConfig& config);

   Status Update(const std::vector& tokenList, int nextToken,
@@ -47,19 +46,24 @@ class KVStateCacheManager {

   Status Update(
       const std::vector& tokenList,
-      const std::vector>>& kvStateList,
+      const std::vector>>& kvCacheList,
       size_t& updated);

   Status Update(
       const std::vector& prefix, const std::vector& tokenList,
-      const std::vector>>& kvStateList,
+      const std::vector>>& kvCacheList,
       size_t& updated);

-  Status Query(const std::vector& tokenList, int token,
+  Status Query(const std::vector& tokenList,
+               std::vector>>& kvCacheList,
+               size_t& matched);
+
+  Status Query(const std::vector& prefix, int nextToken,
                std::vector>& kvState);

-  Status Query(const std::vector& tokenList,
-               std::vector>>& kvStateList,
+  Status Query(const std::vector& prefix,
+               const std::vector& tokenList,
+               std::vector>>& kvCacheList,
                size_t& matched);

   void Close();
@@ -79,4 +83,4 @@

 } // namespace vineyard

-#endif // MODULES_LLM_CACHE_DS_KV_STATE_CACHE_MANAGER_H_
+#endif // MODULES_LLM_CACHE_DS_KV_CACHE_MANAGER_H_
diff --git a/modules/llm-cache/hash/hasher.h b/modules/llm-cache/hash/hasher.h
index 0b4c9b47a1..0d936df3fc 100644
--- a/modules/llm-cache/hash/hasher.h
+++ b/modules/llm-cache/hash/hasher.h
@@ -32,8 +32,8 @@ class Hasher {
  * @brief Compute the path list for the token list
  *
  * @param tokenList The list of tokens
- * @param batchSize The size of the batch
- * @param splitNumber The number of splits
+ * @param chunkSize The number of tokens per hash chunk
+ * @param hashChunkSize The number of hash characters per path segment
  * @param pathList The Relative path list of the token list
  *
  * @return Status
@@ -54,29 +54,29 @@
  * hashValue3(4c90a490) -> 4c/90/a4/90
  *
  */
-  Status computePathForTokens(const std::vector& tokenList, int batchSize,
-                              int splitNumber,
+  Status computePathForTokens(const std::vector& tokenList, int chunkSize,
+                              int hashChunkSize,
                               std::vector& pathList) {
     char hashBuffer[9];
-    int tokenSize = tokenList.size() - tokenList.size() % batchSize;
+    int tokenSize = tokenList.size() - tokenList.size() % chunkSize;
     // if the token list (upper_bound) is less than the batch size, then return
     // directly
-    if (tokenSize < batchSize) {
+    if (tokenSize < chunkSize) {
       return Status::OK();
     }
     // split the token list into batches
-    for (int i = 0; i < tokenSize; i += batchSize) {
+    for (int i = 0; i < tokenSize; i += chunkSize) {
       int hashValue =
           hashAlgorithm->hash(reinterpret_cast(tokenList.data()),
-                              (i + batchSize) * sizeof(int));
+                              (i + chunkSize) * sizeof(int));
       // split the hash value into paths
       std::snprintf(hashBuffer, sizeof(hashBuffer), "%08x", hashValue);
       int index =
0; std::string path; - while (index + splitNumber < 8) { - path += std::string(hashBuffer + index, splitNumber) + "/"; - index += splitNumber; + while (index + hashChunkSize < 8) { + path += std::string(hashBuffer + index, hashChunkSize) + "/"; + index += hashChunkSize; } path += std::string(hashBuffer + index, 8 - index); pathList.push_back(path); diff --git a/modules/llm-cache/storage/blob_storage.cc b/modules/llm-cache/storage/blob_storage.cc index 1c4b8beb00..fed7bab297 100644 --- a/modules/llm-cache/storage/blob_storage.cc +++ b/modules/llm-cache/storage/blob_storage.cc @@ -23,14 +23,13 @@ limitations under the License. namespace vineyard { -BlobStorage::BlobStorage(Client& client, - std::shared_ptr& cache, +BlobStorage::BlobStorage(Client& client, std::shared_ptr& cache, int syncInterval, std::string& llmCacheSyncLock, std::string& llmCacheObjectName, std::string& llmRefcntObjectName) : client(client) { this->syncInterval = syncInterval; - this->kvStateCacheBuilder = cache; + this->kvCacheBuilder = cache; this->llmCacheSyncLock = llmCacheSyncLock; this->llmCacheObjectName = llmCacheObjectName; this->llmRefcntObjectName = llmRefcntObjectName; @@ -38,7 +37,7 @@ BlobStorage::BlobStorage(Client& client, } Status BlobStorage::Make(Client& client, std::shared_ptr& storage, - int tensorBytes, int cacheCapacity, int layer, + int tensorNBytes, int cacheCapacity, int layer, int blockSize, int syncInterval, std::string llmCacheSyncLock, std::string llmCacheObjectName, @@ -50,40 +49,37 @@ Status BlobStorage::Make(Client& client, std::shared_ptr& storage, AcquireServerLock(client, llmCacheSyncLock, actualKey); // sync global cache object with vineyard - ObjectID globalKVStateCacheID; + ObjectID globalKVCacheID; std::set blockIDSetToAdd; std::set blockIDSetToDelete; - Status status = client.GetName(llmCacheObjectName, globalKVStateCacheID); - std::shared_ptr kvStateCacheBuilder; + Status status = client.GetName(llmCacheObjectName, globalKVCacheID); + std::shared_ptr kvCacheBuilder; if (status.ok()) { // if success, pull the cache object - std::shared_ptr globalKVStateCache = - std::dynamic_pointer_cast( - client.FetchAndGetObject(globalKVStateCacheID)); - Status status = KVStateCacheBuilder::Make(client, kvStateCacheBuilder, - globalKVStateCache); + std::shared_ptr globalKVCache = std::dynamic_pointer_cast( + client.FetchAndGetObject(globalKVCacheID)); + Status status = KVCacheBuilder::Make(client, kvCacheBuilder, globalKVCache); if (!status.ok()) { ReleaseServerLock(client, actualKey); return Status::Invalid( "Failed to make the cache object from global cache object."); } - if (globalKVStateCache->id() != globalKVStateCacheID) { + if (globalKVCache->id() != globalKVCacheID) { VLOG(100) << "Del migrate object"; - Status status = client.DelData(globalKVStateCache->id()); + Status status = client.DelData(globalKVCache->id()); if (!status.ok()) { LOG(ERROR) << "Delete object failed: " << status.ToString() << " It may cause memory leak."; } } - kvStateCacheBuilder->GetCurrentBlockIDSet(blockIDSetToAdd); - blockIDSetToDelete = kvStateCacheBuilder->GetBlockIDSetToDelete(); + kvCacheBuilder->GetCurrentBlockIDSet(blockIDSetToAdd); + blockIDSetToDelete = kvCacheBuilder->GetBlockIDSetToDelete(); } else { // if failed, create a new cache object LOG(INFO) << "failed to get the cache object, create a new one."; - Status status = - KVStateCacheBuilder::Make(client, kvStateCacheBuilder, tensorBytes, - cacheCapacity, layer, blockSize); + Status status = KVCacheBuilder::Make(client, kvCacheBuilder, tensorNBytes, + 
cacheCapacity, layer, blockSize); if (!status.ok()) { ReleaseServerLock(client, actualKey); return Status::Invalid("Failed to make new cache object."); @@ -92,9 +88,9 @@ Status BlobStorage::Make(Client& client, std::shared_ptr& storage, // TBD // use lease to prevent the deadlock if the client is down - storage = std::make_shared( - client, kvStateCacheBuilder, syncInterval, llmCacheSyncLock, - llmCacheObjectName, llmRefcntObjectName); + storage = std::make_shared(client, kvCacheBuilder, syncInterval, + llmCacheSyncLock, llmCacheObjectName, + llmRefcntObjectName); VINEYARD_CHECK_OK(storage->SetRefcntMap(blockIDSetToDelete, blockIDSetToAdd)); // release the lock ReleaseServerLock(client, actualKey); @@ -104,13 +100,13 @@ Status BlobStorage::Make(Client& client, std::shared_ptr& storage, Status BlobStorage::UpdateInternal( const std::vector& tokenList, int nextToken, const std::vector>& kvState) { - return kvStateCacheBuilder->Update(tokenList, nextToken, kvState); + return kvCacheBuilder->Update(tokenList, nextToken, kvState); } Status BlobStorage::QueryInternal( const std::vector& tokenList, int token, std::vector>& kvState) { - return kvStateCacheBuilder->Query(tokenList, token, kvState); + return kvCacheBuilder->Query(tokenList, token, kvState); } /** @@ -132,12 +128,12 @@ Status BlobStorage::QueryInternal( * * for (int i = 0; i < 2; i++) { * * * LLMKV key_state; * * * LLMKV value_state; * - * * key_state.data = malloc(tensorBytes); * - * * value_state.data = malloc(tensorBytes) * + * * key_state.data = malloc(tensorNBytes); * + * * value_state.data = malloc(tensorNBytes) * * * // Copy the k_state of LLM KV Cache to key_state.data * * * // Copy the v_state of LLM KV Cache to value_state.data * - * * key_state.length = tensorBytes; * - * * value_state.length = tensorBytes; * + * * key_state.length = tensorNBytes; * + * * value_state.length = tensorNBytes; * * * kvState.emplace_back(key_state, value_state); * * *} * * * * @@ -169,7 +165,7 @@ Status BlobStorage::Update( * manager. * * @param tokenList The token list to be updated. - * @param kvStateList The kv state list of the token list. + * @param kvCacheList The kv state list of the token list. * It's a 2D vector, the first dimension is the token index, * and the second dimension is the layer index. 
* The kv state is a pair of LLMKV, the first is the K tensor @@ -185,38 +181,38 @@ Status BlobStorage::Update( * * * * * Assume the layer is 2, and the token list is [1,2] you should * * * allocate the memory for the kv state like this: * - * * std::vector>> kvStateList;* + * * std::vector>> kvCacheList;* * * for (int i = 0; i < 2; i++) { * * * std::vector> kvState; * * * for (int j = 0; j < 2; j++) { * * * LLMKV key_state; * * * LLMKV value_state; * - * * key_state.data = malloc(tensorBytes); * - * * value_state.data = malloc(tensorBytes) * + * * key_state.data = malloc(tensorNBytes); * + * * value_state.data = malloc(tensorNBytes) * * * // Copy the k_state of LLM KV Cache to key_state.data * * * // Copy the v_state of LLM KV Cache to value_state.data * - * * key_state.length = tensorBytes; * - * * value_state.length = tensorBytes; * + * * key_state.length = tensorNBytes; * + * * value_state.length = tensorNBytes; * * * kvState.emplace_back(key_state, value_state); * * * } * - * * kvStateList.push_back(kvState); * + * * kvCacheList.push_back(kvState); * * *} * * * * * * After calling this function, you must release(free) the * - * * kv buffer of the kvStateList manually * + * * kv buffer of the kvCacheList manually * * * * * ***************************************************************** * * * @note The length of the token list should be as same as the length of the - * kvStateList. and the second dimension of the kvStateList should be as same as + * kvCacheList. and the second dimension of the kvCacheList should be as same as * the layer of the kv state. * * @return Status */ Status BlobStorage::Update( const std::vector& tokenList, - const std::vector>>& kvStateList, + const std::vector>>& kvCacheList, size_t& updated) { std::unique_lock lock(cacheAccessMutex, std::defer_lock); if (!lock.try_lock()) { @@ -227,7 +223,7 @@ Status BlobStorage::Update( } std::vector tokenListCopy; for (size_t i = 0; i < tokenList.size(); i++) { - Status result = UpdateInternal(tokenListCopy, tokenList[i], kvStateList[i]); + Status result = UpdateInternal(tokenListCopy, tokenList[i], kvCacheList[i]); if (!result.ok()) { break; } @@ -244,7 +240,7 @@ Status BlobStorage::Update( * * @param prefix The prefix of the token list. * @param tokenList The token list to be updated. - * @param kvStateList The kv state list of the token list. + * @param kvCacheList The kv state list of the token list. * It's a 2D vector, the first dimension is the token index, * and the second dimension is the layer index. 
* The kv state is a pair of LLMKV, the first is the K tensor @@ -258,25 +254,25 @@ Status BlobStorage::Update( * * * * * Assume the layer is 2, and the token list is [1,2] you should * * * allocate the memory for the kv state like this: * - * * std::vector>> kvStateList;* + * * std::vector>> kvCacheList;* * * for (int i = 0; i < 2; i++) { * * * std::vector> kvState; * * * for (int j = 0; j < 2; j++) { * * * LLMKV key_state; * * * LLMKV value_state; * - * * key_state.data = malloc(tensorBytes); * - * * value_state.data = malloc(tensorBytes) * + * * key_state.data = malloc(tensorNBytes); * + * * value_state.data = malloc(tensorNBytes) * * * // Copy the k_state of LLM KV Cache to key_state.data * * * // Copy the v_state of LLM KV Cache to value_state.data * - * * key_state.length = tensorBytes; * - * * value_state.length = tensorBytes; * + * * key_state.length = tensorNBytes; * + * * value_state.length = tensorNBytes; * * * kvState.emplace_back(key_state, value_state); * * * } * - * * kvStateList.push_back(kvState); * + * * kvCacheList.push_back(kvState); * * *} * * * * * * After calling this function, you must release(free) the * - * * kv buffer of the kvStateList * + * * kv buffer of the kvCacheList * * * * * ***************************************************************** * @@ -287,7 +283,7 @@ Status BlobStorage::Update( */ Status BlobStorage::Update( const std::vector& prefix, const std::vector& tokenList, - const std::vector>>& kvStateList, + const std::vector>>& kvCacheList, size_t& updated) { std::unique_lock lock(cacheAccessMutex, std::defer_lock); if (!lock.try_lock()) { @@ -298,7 +294,7 @@ Status BlobStorage::Update( } std::vector tokenListCopy(prefix.begin(), prefix.end()); for (size_t i = 0; i < tokenList.size(); i++) { - Status result = UpdateInternal(tokenListCopy, tokenList[i], kvStateList[i]); + Status result = UpdateInternal(tokenListCopy, tokenList[i], kvCacheList[i]); if (!result.ok()) { break; } @@ -309,68 +305,12 @@ Status BlobStorage::Update( return Status::OK(); } -/** - * @brief Query the kv state with the given token and its prefix in the kv state - * cache manager. - * - * @param tokenList The token list as the prefix of the updated token. - * @param token The token to be queried. - * @param kvState The kv state of the token. It must be initialized(allocated) - * before calling this function, including the data and length - * of the kv state. The length of the kv state should be as same - * as the layer of the kv state cache manager. - * - * ***************************************************************** - * * Important, the kv state is managed by the kv state cache * - * * manager, the caller does not need to malloc and free the * - * * memory of the kv state. Besides, the data pointer should be * - * * nullptr and the length should be 0. * - * * * - * * Assume the layer is 2, you should allocate the memory for the * - * * kv state like this: * - * * std::vector> kvState; * - * * for (int i = 0; i < 2; i++) { * - * * LLMKV key_state; * - * * LLMKV value_state; * - * * key_state.data = nullptr * - * * value_state.data = nullptr * - * * key_state.length = 0; * - * * value_state.length = 0; * - * * kvState.emplace_back(key_state, value_state); * - * *} * - * * * - * * After calling this function, the key_state's data is pointing * - * * to the K tensor data stored in vineyard blob, and the * - * * value_state's data is pointing to the V tensor data stored in * - * * vineyard blob. All the length of the kv state is the size of * - * * the tensor data. 
Then you can copy the kv state to the LLM KV * - * * Cache. The memory of the kv state will be freed when calling * - * * the close function of the kv state cache manager. * - * * * - * ***************************************************************** - * - * @return Status - */ -Status BlobStorage::Query(const std::vector& tokenList, int token, - std::vector>& kvState) { - std::unique_lock lock(cacheAccessMutex, std::defer_lock); - if (!lock.try_lock()) { - // If failed to gain the lock, return OK and wait for next time - return Status::OK(); - } - if (isClosed) { - return Status::Invalid("The memory storage is closed."); - } - - return QueryInternal(tokenList, token, kvState); -} - /** * @brief Query the kv state with the given token list and its prefix in the kv * state cache manager. * * @param tokenList The token list as the prefix of the updated token. - * @param kvStateList The kv state list of the token list. It must be + * @param kvCacheList The kv state list of the token list. It must be * initialized before calling this function, including the * data and length of the kv tensor. * The kv state list is a 2D vector, the first dimension is @@ -391,7 +331,7 @@ Status BlobStorage::Query(const std::vector& tokenList, int token, * * * * * Assume the layer is 2, and the token list is [1,2] you should * * * allocate the memory for the kv state like this: * - * * std::vector>> kvStateList;* + * * std::vector>> kvCacheList;* * * for (int i = 0; i < 2; i++) { * * * std::vector> kvState; * * * for (int j = 0; j < 2; j++) { * @@ -403,7 +343,7 @@ Status BlobStorage::Query(const std::vector& tokenList, int token, * * value_state.length = 0; * * * kvState.emplace_back(key_state, value_state); * * * } * - * * kvStateList.push_back(kvState); * + * * kvCacheList.push_back(kvState); * * *} * * * * * * After calling this function, the key_state's data is pointing * @@ -420,7 +360,7 @@ Status BlobStorage::Query(const std::vector& tokenList, int token, */ Status BlobStorage::Query( const std::vector& tokenList, - std::vector>>& kvStateList, + std::vector>>& kvCacheList, size_t& matched) { std::unique_lock lock(cacheAccessMutex, std::defer_lock); if (!lock.try_lock()) { @@ -434,9 +374,9 @@ Status BlobStorage::Query( // copy the token list and query the cache one token by one token matched = 0; std::vector tokenListPrefix; - for (size_t i = 0; i < tokenList.size() && i < kvStateList.size(); i++) { + for (size_t i = 0; i < tokenList.size() && i < kvCacheList.size(); i++) { Status result = - QueryInternal(tokenListPrefix, tokenList[i], kvStateList[i]); + QueryInternal(tokenListPrefix, tokenList[i], kvCacheList[i]); if (!result.ok()) { return Status::OK(); } @@ -447,6 +387,68 @@ Status BlobStorage::Query( return Status::OK(); } +/** + * @brief Query the kv state with the given token and its prefix in the kv state + * cache manager. + * + * @param prefix The token list as the prefix of the updated token. + * @param token The token to be queried. + * @param kvState The kv state of the token. It must be initialized(allocated) + * before calling this function, including the data and length + * of the kv state. The length of the kv state should be as same + * as the layer of the kv state cache manager. + * + * ***************************************************************** + * * Important, the kv state is managed by the kv state cache * + * * manager, the caller does not need to malloc and free the * + * * memory of the kv state. Besides, the data pointer should be * + * * nullptr and the length should be 0. 
* + * * * + * * Assume the layer is 2, you should allocate the memory for the * + * * kv state like this: * + * * std::vector> kvState; * + * * for (int i = 0; i < 2; i++) { * + * * LLMKV key_state; * + * * LLMKV value_state; * + * * key_state.data = nullptr * + * * value_state.data = nullptr * + * * key_state.length = 0; * + * * value_state.length = 0; * + * * kvState.emplace_back(key_state, value_state); * + * *} * + * * * + * * After calling this function, the key_state's data is pointing * + * * to the K tensor data stored in vineyard blob, and the * + * * value_state's data is pointing to the V tensor data stored in * + * * vineyard blob. All the length of the kv state is the size of * + * * the tensor data. Then you can copy the kv state to the LLM KV * + * * Cache. The memory of the kv state will be freed when calling * + * * the close function of the kv state cache manager. * + * * * + * ***************************************************************** + * + * @return Status + */ +Status BlobStorage::Query(const std::vector& prefix, int token, + std::vector>& kvState) { + std::unique_lock lock(cacheAccessMutex, std::defer_lock); + if (!lock.try_lock()) { + // If failed to gain the lock, return OK and wait for next time + return Status::OK(); + } + if (isClosed) { + return Status::Invalid("The memory storage is closed."); + } + return QueryInternal(prefix, token, kvState); +} + +Status BlobStorage::Query( + const std::vector& prefix, const std::vector& tokenList, + std::vector>>& kvCacheList, + size_t& matched) { + return Status::NotImplemented(); +} + BlobStorage::~BlobStorage() { StopSync(); LOG(INFO) << "BlobStorage exit."; @@ -455,10 +457,10 @@ BlobStorage::~BlobStorage() { // This function is used for testing void BlobStorage::Delete(std::vector& token) { std::shared_ptr evictedNode; - kvStateCacheBuilder->GetRootTree()->Delete(token, evictedNode); - kvStateCacheBuilder->Delete(evictedNode); + kvCacheBuilder->GetRootTree()->Delete(token, evictedNode); + kvCacheBuilder->Delete(evictedNode); if (VLOG_IS_ON(100)) { - VLOG(100) << raxShow(kvStateCacheBuilder->GetRootTree()->tree); + VLOG(100) << raxShow(kvCacheBuilder->GetRootTree()->tree); } } @@ -468,16 +470,16 @@ Status BlobStorage::Sync() { std::set blockIDSetToDelete; std::set globalBlockIDSet; // 1. pull the cache object - ObjectID globalKVStateCacheID; + ObjectID globalKVCacheID; std::vector deleteList; - std::shared_ptr globalKVStateCache = nullptr; - status = client.GetName(llmCacheObjectName, globalKVStateCacheID); + std::shared_ptr globalKVCache = nullptr; + status = client.GetName(llmCacheObjectName, globalKVCacheID); if (status.ok()) { - deleteList.push_back(globalKVStateCacheID); - globalKVStateCache = std::dynamic_pointer_cast( - client.FetchAndGetObject(globalKVStateCacheID)); - globalKVStateCache->GetCurrentBlockIDSet(globalBlockIDSet); + deleteList.push_back(globalKVCacheID); + globalKVCache = std::dynamic_pointer_cast( + client.FetchAndGetObject(globalKVCacheID)); + globalKVCache->GetCurrentBlockIDSet(globalBlockIDSet); } else { // Not an error. VLOG(100) << "There is no cache object in the meta server."; @@ -485,48 +487,47 @@ Status BlobStorage::Sync() { // 2. merge the cache object // only the global cache object with higher version will be merged - VLOG(100) << "Current builder version:" << kvStateCacheBuilder->GetVersion() + VLOG(100) << "Current builder version:" << kvCacheBuilder->GetVersion() << " global version:" - << (globalKVStateCache == nullptr + << (globalKVCache == nullptr ? 
"null" - : std::to_string(globalKVStateCache->GetVersion())); - if (globalKVStateCache != nullptr && - kvStateCacheBuilder->GetVersion() < globalKVStateCache->GetVersion()) { - status = kvStateCacheBuilder->Merge(globalKVStateCache); + : std::to_string(globalKVCache->GetVersion())); + if (globalKVCache != nullptr && + kvCacheBuilder->GetVersion() < globalKVCache->GetVersion()) { + status = kvCacheBuilder->Merge(globalKVCache); RETURN_ON_ERROR(status); - if (globalKVStateCache->id() != globalKVStateCacheID) { + if (globalKVCache->id() != globalKVCacheID) { VLOG(100) << "Del migrate object"; - Status status = client.DelData(globalKVStateCache->id()); + Status status = client.DelData(globalKVCache->id()); if (!status.ok()) { LOG(ERROR) << "Delete object failed: " << status.ToString() << " It may cause memory leak."; } } } - kvStateCacheBuilder->UpdateVersion(); + kvCacheBuilder->UpdateVersion(); /** * 3. get the current block id set, which stores the block id(instead of block * ptr) and the block id set to delete. */ std::set currentObjectIDSet; - kvStateCacheBuilder->GetCurrentBlockIDSet(currentObjectIDSet); - blockIDSetToDelete = kvStateCacheBuilder->GetBlockIDSetToDelete(); + kvCacheBuilder->GetCurrentBlockIDSet(currentObjectIDSet); + blockIDSetToDelete = kvCacheBuilder->GetBlockIDSetToDelete(); // 4. push the cache object to the vineyardd - kvStateCache = std::dynamic_pointer_cast( - kvStateCacheBuilder->_Seal(client)); + kvCache = std::dynamic_pointer_cast(kvCacheBuilder->_Seal(client)); std::set currentGlobalBlockIDSet; - kvStateCacheBuilder->GetCurrentBlockIDSet(currentGlobalBlockIDSet); + kvCacheBuilder->GetCurrentBlockIDSet(currentGlobalBlockIDSet); - status = client.Persist(kvStateCache->id()); + status = client.Persist(kvCache->id()); RETURN_ON_ERROR(status); // 5. put the name of the new cache object to the meta server status = client.DropName(llmCacheObjectName); RETURN_ON_ERROR(status); - status = client.PutName(kvStateCache->id(), llmCacheObjectName); + status = client.PutName(kvCache->id(), llmCacheObjectName); RETURN_ON_ERROR(status); // 6. delete old cache object @@ -537,11 +538,10 @@ Status BlobStorage::Sync() { } // 7. create a global cache object replica - kvStateCache->Resolve(); - RETURN_ON_ERROR( - KVStateCacheBuilder::Make(client, kvStateCacheBuilder, kvStateCache)); + kvCache->Resolve(); + RETURN_ON_ERROR(KVCacheBuilder::Make(client, kvCacheBuilder, kvCache)); - kvStateCacheBuilder->GetCurrentBlockIDSet(blockIDSetToAdd); + kvCacheBuilder->GetCurrentBlockIDSet(blockIDSetToAdd); /** * 8. get the add set, which contains the block id in the new cache object @@ -625,31 +625,30 @@ Status BlobStorage::AfterSyncFailed() { * If there exists a global cache object, recover from the global object * and delete the cache object if the builder is sealed. 
*/ - ObjectID globalKVStateCacheID; - std::shared_ptr globalKVStateCache = nullptr; - Status status = client.GetName(llmCacheObjectName, globalKVStateCacheID); + ObjectID globalKVCacheID; + std::shared_ptr globalKVCache = nullptr; + Status status = client.GetName(llmCacheObjectName, globalKVCacheID); if (status.ok()) { - globalKVStateCache = std::dynamic_pointer_cast( - client.FetchAndGetObject(globalKVStateCacheID)); + globalKVCache = std::dynamic_pointer_cast( + client.FetchAndGetObject(globalKVCacheID)); } else { VLOG(100) << "There is no cache object in the meta server."; return Status::OK(); } - status = KVStateCacheBuilder::Make(client, kvStateCacheBuilder, - globalKVStateCache); + status = KVCacheBuilder::Make(client, kvCacheBuilder, globalKVCache); RETURN_ON_ERROR(status); - if (kvStateCache != nullptr && kvStateCache->id() != globalKVStateCacheID) { + if (kvCache != nullptr && kvCache->id() != globalKVCacheID) { // It means the builder is sealed but not pushed to the vineyardd - deleteList.push_back(kvStateCache->id()); - deleteList.push_back(globalKVStateCache->id()); + deleteList.push_back(kvCache->id()); + deleteList.push_back(globalKVCache->id()); } status = client.DelData(deleteList, false, true); if (!status.ok()) { LOG(ERROR) << "Delete object failed: " << status.ToString() << " It may cause memory leak."; } - kvStateCache = nullptr; + kvCache = nullptr; return Status::OK(); } @@ -696,8 +695,8 @@ Status BlobStorage::ClearGlobalCache(Client& client, RETURN_ON_ERROR(client.GetName(llmRefcntObjectName, globalRefcntMapId)); RETURN_ON_ERROR(client.DropName(llmRefcntObjectName)); - std::shared_ptr globalCacheObject = - std::dynamic_pointer_cast( + std::shared_ptr globalCacheObject = + std::dynamic_pointer_cast( client.FetchAndGetObject(globalCacheObjectID)); std::set blockIDSetToDelete; globalCacheObject->GetCurrentBlockIDSet(blockIDSetToDelete); @@ -719,7 +718,7 @@ void BlobStorage::CloseCache() { LOG(INFO) << "Clear block set and recycle blob."; std::lock_guard cacheLock(cacheAccessMutex); - this->kvStateCacheBuilder->Close(); + this->kvCacheBuilder->Close(); this->isClosed = true; RefreshRefcnt(); } @@ -785,7 +784,7 @@ Status BlobStorage::SetRefcntMap(std::set& blockIDSetToDelete, void BlobStorage::RefreshRefcnt() { std::set blockIDSetToDelete = - this->kvStateCacheBuilder->GetBlockIDSetToDelete(); + this->kvCacheBuilder->GetBlockIDSetToDelete(); std::set blockIDSetToAdd; std::string actualKey; AcquireServerLock(client, llmCacheSyncLock, actualKey); diff --git a/modules/llm-cache/storage/blob_storage.h b/modules/llm-cache/storage/blob_storage.h index 0f66aa41a1..c7cb182edd 100644 --- a/modules/llm-cache/storage/blob_storage.h +++ b/modules/llm-cache/storage/blob_storage.h @@ -28,7 +28,7 @@ limitations under the License. 
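Note the ownership difference on the blob path: Query hands back pointers into vineyard blobs rather than filling caller-allocated buffers, which is why the single-token overload asks for nullptr, zero-length slots. A sketch under the Make() defaults declared in blob_storage.h below (layer = 1; the prefix and token values are illustrative):

```cpp
// Sketch of the zero-copy query path on BlobStorage: slots go in empty
// (nullptr/0) and come back pointing at K/V tensors inside vineyard blobs.
#include <memory>
#include <utility>
#include <vector>

#include "client/client.h"
#include "llm-cache/storage/blob_storage.h"

vineyard::Status QueryNextToken(vineyard::Client& client) {
  std::shared_ptr<vineyard::BlobStorage> storage;
  RETURN_ON_ERROR(vineyard::BlobStorage::Make(client, storage));

  std::vector<int> prefix = {1, 2};
  std::vector<std::pair<vineyard::LLMKV, vineyard::LLMKV>> kvState;
  kvState.emplace_back(vineyard::LLMKV{nullptr, 0},
                       vineyard::LLMKV{nullptr, 0});  // one slot per layer

  RETURN_ON_ERROR(storage->Query(prefix, /*token=*/3, kvState));
  // kvState[0].first.data / .second.data now reference blob memory owned by
  // the cache; copy the tensors out before CloseCache() releases them.
  storage->CloseCache();
  return vineyard::Status::OK();
}
```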
#include "client/client.h" #include "common/util/logging.h" -#include "llm-cache/ds/kv_state_cache.h" +#include "llm-cache/ds/kv_cache.h" #include "llm-cache/ds/refcnt_map.h" #include "llm-cache/storage/storage.h" @@ -37,8 +37,8 @@ namespace vineyard { class BlobStorage : public IStorage { private: Client& client; - std::shared_ptr kvStateCacheBuilder = nullptr; - std::shared_ptr kvStateCache = nullptr; + std::shared_ptr kvCacheBuilder = nullptr; + std::shared_ptr kvCache = nullptr; std::shared_ptr refcntMapObjectBuilder = nullptr; std::string llmCacheSyncLock; std::string llmCacheObjectName; @@ -52,13 +52,13 @@ class BlobStorage : public IStorage { bool isClosed = false; public: - BlobStorage(Client& client, std::shared_ptr& cache, + BlobStorage(Client& client, std::shared_ptr& cache, int syncInterval, std::string& llmCacheSyncLock, std::string& llmCacheObjectName, std::string& llmRefcntObjectName); static Status Make(Client& client, std::shared_ptr& storage, - int tensorBytes = 10, int cacheCapacity = 10, + int tensorNBytes = 10, int cacheCapacity = 10, int layer = 1, int blockSize = 5, int syncInterval = 3, std::string llmCacheSyncLock = "llmCacheSyncLock", std::string llmCacheObjectName = "llm_cache_object", @@ -69,25 +69,30 @@ class BlobStorage : public IStorage { Status Update( const std::vector& tokenList, - const std::vector>>& kvStateList, + const std::vector>>& kvCacheList, size_t& updated) override; Status Update( const std::vector& prefix, const std::vector& tokenList, - const std::vector>>& kvStateList, + const std::vector>>& kvCacheList, size_t& updated) override; - Status Query(const std::vector& tokenList, int token, + Status Query(const std::vector& tokenList, + std::vector>>& kvCacheList, + size_t& matched) override; + + Status Query(const std::vector& prefix, int token, std::vector>& kvState) override; - Status Query(const std::vector& tokenList, - std::vector>>& kvStateList, + Status Query(const std::vector& prefix, + const std::vector& tokenList, + std::vector>>& kvCacheList, size_t& matched) override; void CloseCache() override; - std::shared_ptr& GetKVStateCacheBuilder() { - return this->kvStateCacheBuilder; + std::shared_ptr& GetKVCacheBuilder() { + return this->kvCacheBuilder; } std::shared_ptr& GetRefcntMapObjectBuilder() { diff --git a/modules/llm-cache/storage/file_storage.cc b/modules/llm-cache/storage/file_storage.cc index 85efa3601c..7774801b71 100644 --- a/modules/llm-cache/storage/file_storage.cc +++ b/modules/llm-cache/storage/file_storage.cc @@ -30,29 +30,13 @@ limitations under the License. #include "llm-cache/storage/file_storage.h" #include "llm-cache/thread_group.h" -#define RETURN_ON_ERROR_WITH_PATH_INDEX(index, status) \ - do { \ - auto _ret = (status); \ - if (!_ret.ok()) { \ - return std::pair(index, _ret); \ - } \ - } while (0) - -#define RETURN_ON_ASSERT_WITH_PATH_INDEX(index, condition, message) \ - do { \ - if (!(condition)) { \ - return std::pair(index, vineyard::Status::AssertionFailed( \ - std::string(#condition ": ") + message)); \ - } \ - } while (0) - namespace vineyard { /** * @brief Update the kv state with the given token list in the file storage. * * @param tokenList The token list to be updated. - * @param kvStateList The kv state list of the token list. + * @param kvCacheList The kv state list of the token list. * It's a 2D vector, the first dimension is the token index, * and the second dimension is the layer index. 
* The kv state is a pair of LLMKV, the first is the K tensor @@ -68,58 +52,63 @@ namespace vineyard { * * * * * Assume the layer is 2, and the token list is [1,2] you should * * * allocate the memory for the kv state like this: * - * * std::vector>> kvStateList;* + * * std::vector>> kvCacheList;* * * for (int i = 0; i < 2; i++) { * * * std::vector> kvState; * * * for (int j = 0; j < 2; j++) { * * * LLMKV key_state; * * * LLMKV value_state; * - * * key_state.data = malloc(tensorBytes); * - * * value_state.data = malloc(tensorBytes) * + * * key_state.data = malloc(tensorNBytes); * + * * value_state.data = malloc(tensorNBytes) * * * // Copy the k_state of LLM KV Cache to key_state.data * * * // Copy the v_state of LLM KV Cache to value_state.data * - * * key_state.length = tensorBytes; * - * * value_state.length = tensorBytes; * + * * key_state.length = tensorNBytes; * + * * value_state.length = tensorNBytes; * * * kvState.emplace_back(key_state, value_state); * * * } * - * * kvStateList.push_back(kvState); * + * * kvCacheList.push_back(kvState); * * *} * * * * * * After calling this function, you must release(free) the * - * * kv buffer of the kvStateList manually * + * * kv buffer of the kvCacheList manually * * * * * ***************************************************************** * * @note The length of the token list should be as same as the length of the - * kvStateList. + * kvCacheList. * * * @example Suppose the token list is [1, 2, 3, 4], the layer is 2, - * then the kvStateList should be a 2D vector with size 4 * 2. + * then the kvCacheList should be a 2D vector with size 4 * 2. * * @return Status */ Status FileStorage::Update( const std::vector& tokenList, - const std::vector>>& kvStateList, + const std::vector>>& kvCacheList, size_t& updated) { if (this->exitFlag) { return Status::Invalid("The file storage has been closed!"); } + if (tokenList.size() % chunkSize != 0) { + return Status::Invalid("Tokens size " + std::to_string(tokenList.size()) + + " should be multiple of batch size " + + std::to_string(chunkSize) + "!"); + } + std::vector pathList; std::set createFileSet; std::mutex createFileSetMutex; - RETURN_ON_ERROR(hasher->computePathForTokens(tokenList, batchSize, - splitNumber, pathList)); + RETURN_ON_ERROR(hasher->computePathForTokens(tokenList, chunkSize, + hashChunkSize, pathList)); if (pathList.size() == 0) { return Status::OK(); } std::vector tempFilePaths(pathList.size()); - auto fn = [this, &tempFilePaths, &pathList, &tokenList, &kvStateList, - &createFileSet, - &createFileSetMutex](int i) -> std::pair { - int tokenLength = (i + 1) * batchSize; + auto fn = [this, &tempFilePaths, &pathList, &tokenList, &kvCacheList, + &createFileSet, &createFileSetMutex](int i) -> Status { + int tokenLength = (i + 1) * chunkSize; std::shared_ptr fd = CreateFileDescriptor(); std::string tmpPathStr = GetTmpFileDir() + "-" + std::to_string(i); tempFilePaths[i] = tmpPathStr; @@ -127,47 +116,42 @@ Status FileStorage::Update( std::string pathStr = this->rootPath + pathList[i]; ghc::filesystem::path path(pathStr); - RETURN_ON_ERROR_WITH_PATH_INDEX(i, Mkdir(path.parent_path().string())); + RETURN_ON_ERROR(Mkdir(path.parent_path().string())); if (Open(pathStr, fd, FileOperationType::READ).ok()) { int tokenLengthInFile; - RETURN_ON_ERROR_WITH_PATH_INDEX( - i, Read(fd, &tokenLengthInFile, sizeof(int))); + RETURN_ON_ERROR(Read(fd, &tokenLengthInFile, sizeof(int))); std::vector tokens; tokens.resize(tokenLengthInFile); - RETURN_ON_ERROR_WITH_PATH_INDEX( - i, Read(fd, tokens.data(), 
tokenLengthInFile * sizeof(int))); + RETURN_ON_ERROR(Read(fd, tokens.data(), tokenLengthInFile * sizeof(int))); if (!CompareTokenList(tokenList, tokens, tokenLengthInFile)) { // Token list not match VINEYARD_DISCARD(Close(fd)); - return std::pair( - i, Status::ObjectExists("File exists for another token sequence")); + return Status::ObjectExists("File exists for another token sequence"); } // Skip this kv state VINEYARD_DISCARD(Close(fd)); - return std::pair(i, Status::OK()); + return Status::OK(); } - RETURN_ON_ERROR_WITH_PATH_INDEX(i, Mkdir(tmpPath.parent_path().string())); + RETURN_ON_ERROR(Mkdir(tmpPath.parent_path().string())); auto status = Open(tmpPathStr, fd, FileOperationType::WRITE); if (!status.ok()) { LOG(WARNING) << "Failed to create temporary cache entry: " << status.ToString(); - return std::pair( - i, Status::Wrap(status, "Failed to create temporary cache entry")); + return Status::Wrap(status, "Failed to create temporary cache entry"); } // Currently we do not consider delete. - RETURN_ON_ERROR_WITH_PATH_INDEX(i, Write(fd, &tokenLength, sizeof(int))); - RETURN_ON_ERROR_WITH_PATH_INDEX( - i, Write(fd, tokenList.data(), tokenLength * sizeof(int))); - for (int currentTokenIndex = i * batchSize; - currentTokenIndex < (i + 1) * batchSize; currentTokenIndex++) { + RETURN_ON_ERROR(Write(fd, &tokenLength, sizeof(int))); + RETURN_ON_ERROR(Write(fd, tokenList.data(), tokenLength * sizeof(int))); + for (int currentTokenIndex = i * chunkSize; + currentTokenIndex < (i + 1) * chunkSize; currentTokenIndex++) { for (int currentLayer = 0; currentLayer < layer; currentLayer++) { - const LLMKV& k = kvStateList[currentTokenIndex][currentLayer].first; - const LLMKV& v = kvStateList[currentTokenIndex][currentLayer].second; - RETURN_ON_ERROR_WITH_PATH_INDEX(i, Write(fd, k.data, k.length)); - RETURN_ON_ERROR_WITH_PATH_INDEX(i, Write(fd, v.data, k.length)); + const LLMKV& k = kvCacheList[currentTokenIndex][currentLayer].first; + const LLMKV& v = kvCacheList[currentTokenIndex][currentLayer].second; + RETURN_ON_ERROR(Write(fd, k.data, k.length)); + RETURN_ON_ERROR(Write(fd, v.data, k.length)); } } @@ -178,34 +162,31 @@ Status FileStorage::Update( // Move failed. There exists a file with the same name. 
LOG(WARNING) << "Failed to move cache entry: " << status.ToString(); VINEYARD_SUPPRESS(Delete(tmpPathStr)); - return std::pair(i, Status::Wrap(status, "Failed to move cache entry")); + return Status::Wrap(status, "Failed to move cache entry"); } std::lock_guard lock(createFileSetMutex); createFileSet.insert(pathStr); - return std::pair(i, Status::OK()); + return Status::OK(); }; parallel::ThreadGroup tg( std::min(pathList.size(), static_cast(std::thread::hardware_concurrency()))); - for (size_t i = 0; i < pathList.size(); i++) { - tg.AddTask(fn, i); + std::vector tids(pathList.size()); + for (size_t i = 0; i < pathList.size(); ++i) { + tids[i] = tg.AddTask(fn, i); } - - std::vector> ss = tg.TakeResults(); - std::map pathIndexMap; - for (size_t i = 0; i < pathList.size(); i++) { - if (ss[i].second.ok()) { - pathIndexMap[ss[i].first] = true; - } + std::vector taskResults(pathList.size(), Status::OK()); + for (size_t i = 0; i < pathList.size(); ++i) { + taskResults[i] = tg.TaskResult(tids[i]); } - int j = 0; + size_t upper_bound = 0; { std::lock_guard lock(gcMutex); for (size_t i = 0; i < pathList.size(); i++) { - if (pathIndexMap.find(i) != pathIndexMap.end()) { - j += 1; + if (taskResults[i].ok()) { + upper_bound += 1; if (createFileSet.find(this->rootPath + pathList[i]) != createFileSet.end()) { TouchFile(this->rootPath + pathList[i]); @@ -216,12 +197,11 @@ Status FileStorage::Update( } } } - updated = ((size_t) j) * batchSize; - for (size_t i = j; i < pathList.size(); i++) { + updated = upper_bound * chunkSize; + for (size_t i = upper_bound; i < pathList.size(); i++) { VINEYARD_SUPPRESS(Delete(this->rootPath + pathList[i])); VINEYARD_SUPPRESS(Delete(tempFilePaths[i])); } - return Status::OK(); } @@ -232,7 +212,7 @@ Status FileStorage::Update( * @param prefix The prefix token list. It should be a multiple of the batch * size. * @param tokenList The token list to be updated. - * @param kvStateList The kv state list of the token list. + * @param kvCacheList The kv state list of the token list. * It's a 2D vector, the first dimension is the token index, * and the second dimension is the layer index. * The kv state is a pair of LLMKV, the first is the K tensor @@ -248,47 +228,52 @@ Status FileStorage::Update( * * * * * Assume the layer is 2, and the token list is [1,2] you should * * * allocate the memory for the kv state like this: * - * * std::vector>> kvStateList;* + * * std::vector>> kvCacheList;* * * for (int i = 0; i < 2; i++) { * * * std::vector> kvState; * * * for (int j = 0; j < 2; j++) { * * * LLMKV key_state; * * * LLMKV value_state; * - * * key_state.data = malloc(tensorBytes); * - * * value_state.data = malloc(tensorBytes) * + * * key_state.data = malloc(tensorNBytes); * + * * value_state.data = malloc(tensorNBytes) * * * // Copy the k_state of LLM KV Cache to key_state.data * * * // Copy the v_state of LLM KV Cache to value_state.data * - * * key_state.length = tensorBytes; * - * * value_state.length = tensorBytes; * + * * key_state.length = tensorNBytes; * + * * value_state.length = tensorNBytes; * * * kvState.emplace_back(key_state, value_state); * * * } * - * * kvStateList.push_back(kvState); * + * * kvCacheList.push_back(kvState); * * *} * * * * * * After calling this function, you must release(free) the * - * * kv buffer of the kvStateList manually * + * * kv buffer of the kvCacheList manually * * * * * ***************************************************************** * * @note The length of the token list should be as same as the length of the - * kvStateList. 
+ * kvCacheList. * * @example Suppose the prefix is [1, 2], the token list is [3, 4], the layer is - * 2, then the kvStateList should be a 2D vector with size 2 * 2. + * 2, then the kvCacheList should be a 2D vector with size 2 * 2. * * @return Status */ Status FileStorage::Update( const std::vector& prefix, const std::vector& tokenList, - const std::vector>>& kvStateList, + const std::vector>>& kvCacheList, size_t& updated) { if (this->exitFlag) { return Status::Invalid("The file storage has been closed!"); } - if (prefix.size() % batchSize != 0) { + if (prefix.size() % chunkSize != 0) { return Status::Invalid("Prefix size " + std::to_string(prefix.size()) + " should be multiple of batch size " + - std::to_string(batchSize) + "!"); + std::to_string(chunkSize) + "!"); + } + if (tokenList.size() % chunkSize != 0) { + return Status::Invalid("Tokens size " + std::to_string(tokenList.size()) + + " should be multiple of batch size " + + std::to_string(chunkSize) + "!"); } std::vector pathList; @@ -298,17 +283,17 @@ Status FileStorage::Update( totalTokenList.insert(totalTokenList.end(), tokenList.begin(), tokenList.end()); - RETURN_ON_ERROR(hasher->computePathForTokens(totalTokenList, batchSize, - splitNumber, pathList)); + RETURN_ON_ERROR(hasher->computePathForTokens(totalTokenList, chunkSize, + hashChunkSize, pathList)); if (pathList.size() == 0) { return Status::OK(); } std::vector tempFilePaths(pathList.size()); auto fn = [this, &tempFilePaths, &pathList, &prefix, &totalTokenList, - &kvStateList, &createFileSet, - &createFileSetMutex](size_t i) -> std::pair { - int tokenLength = (i + 1) * batchSize; + &kvCacheList, &createFileSet, + &createFileSetMutex](size_t i) -> Status { + int tokenLength = (i + 1) * chunkSize; std::shared_ptr fd = CreateFileDescriptor(); std::string tmpPathStr = GetTmpFileDir() + "-" + std::to_string(i); tempFilePaths[i] = tmpPathStr; @@ -316,52 +301,47 @@ Status FileStorage::Update( std::string pathStr = this->rootPath + pathList[i]; ghc::filesystem::path path(pathStr); - RETURN_ON_ERROR_WITH_PATH_INDEX(i, Mkdir(path.parent_path().string())); + RETURN_ON_ERROR(Mkdir(path.parent_path().string())); if (Open(pathStr, fd, FileOperationType::READ).ok()) { int tokenLength; - RETURN_ON_ERROR_WITH_PATH_INDEX(i, Read(fd, &tokenLength, sizeof(int))); - std::vector tokens; - tokens.resize(tokenLength); - RETURN_ON_ERROR_WITH_PATH_INDEX( - i, Read(fd, tokens.data(), tokenLength * sizeof(int))); + RETURN_ON_ERROR(Read(fd, &tokenLength, sizeof(int))); + std::vector tokens(tokenLength, -1); + RETURN_ON_ERROR(Read(fd, tokens.data(), tokenLength * sizeof(int))); if (!CompareTokenList(totalTokenList, tokens, tokenLength)) { // Token list not match VINEYARD_DISCARD(Close(fd)); - return std::pair( - i, Status::ObjectExists("File exists for another token sequence")); + return Status::ObjectExists("File exists for another token sequence"); } // Skip this kv state VINEYARD_DISCARD(Close(fd)); - return std::pair(i, Status::OK()); + return Status::OK(); } - if ((i + 1) * batchSize <= prefix.size()) { - return std::pair( - i, Status::ObjectNotExists("The prefix is not in the file cache")); + if ((i + 1) * chunkSize <= prefix.size()) { + return Status::ObjectNotExists("The prefix is not in the file cache"); } - RETURN_ON_ERROR_WITH_PATH_INDEX(i, Mkdir(tmpPath.parent_path().string())); + RETURN_ON_ERROR(Mkdir(tmpPath.parent_path().string())); auto status = Open(tmpPathStr, fd, FileOperationType::WRITE); if (!status.ok()) { - return std::pair( - i, Status::Wrap(status, "Failed to create temporary 
cache entry")); + return Status::Wrap(status, "Failed to create temporary cache entry"); } // Currently we do not consider delete. - RETURN_ON_ERROR_WITH_PATH_INDEX(i, Write(fd, &tokenLength, sizeof(int))); - RETURN_ON_ERROR_WITH_PATH_INDEX( - i, Write(fd, totalTokenList.data(), tokenLength * sizeof(int))); + RETURN_ON_ERROR(Write(fd, &tokenLength, sizeof(int))); + RETURN_ON_ERROR( + Write(fd, totalTokenList.data(), tokenLength * sizeof(int))); size_t kvStatePos = - (i * batchSize) < prefix.size() ? 0 : (i * batchSize) - prefix.size(); + (i * chunkSize) < prefix.size() ? 0 : (i * chunkSize) - prefix.size(); for (size_t currentTokenIndex = kvStatePos; - currentTokenIndex < kvStatePos + batchSize; currentTokenIndex++) { + currentTokenIndex < kvStatePos + chunkSize; currentTokenIndex++) { for (int currentLayer = 0; currentLayer < layer; currentLayer++) { - const LLMKV& k = kvStateList[currentTokenIndex][currentLayer].first; - const LLMKV& v = kvStateList[currentTokenIndex][currentLayer].second; - RETURN_ON_ERROR_WITH_PATH_INDEX(i, Write(fd, k.data, k.length)); - RETURN_ON_ERROR_WITH_PATH_INDEX(i, Write(fd, v.data, k.length)); + const LLMKV& k = kvCacheList[currentTokenIndex][currentLayer].first; + const LLMKV& v = kvCacheList[currentTokenIndex][currentLayer].second; + RETURN_ON_ERROR(Write(fd, k.data, k.length)); + RETURN_ON_ERROR(Write(fd, v.data, k.length)); } } @@ -370,35 +350,32 @@ Status FileStorage::Update( if (!MoveFileAtomic(tmpPathStr, pathStr).ok()) { // Move failed. There exists a file with the same name. VINEYARD_SUPPRESS(Delete(tmpPathStr)); - return std::pair(i, Status::Wrap(status, "Failed to move cache entry")); + return Status::Wrap(status, "Failed to move cache entry"); } std::lock_guard lock(createFileSetMutex); createFileSet.insert(pathStr); - return std::pair(i, Status::OK()); + return Status::OK(); }; parallel::ThreadGroup tg( std::min(pathList.size(), static_cast(std::thread::hardware_concurrency()))); - for (size_t i = 0; i < pathList.size(); i++) { - tg.AddTask(fn, i); + std::vector tids(pathList.size()); + for (size_t i = 0; i < pathList.size(); ++i) { + tids[i] = tg.AddTask(fn, i); } - - std::vector> ss = tg.TakeResults(); - std::map pathIndexMap; - for (size_t i = 0; i < pathList.size(); i++) { - if (ss[i].second.ok()) { - pathIndexMap[ss[i].first] = true; - } + std::vector taskResults(pathList.size(), Status::OK()); + for (size_t i = 0; i < pathList.size(); ++i) { + taskResults[i] = tg.TaskResult(tids[i]); } - int j = 0; + size_t upper_bound = 0; { std::lock_guard lock(gcMutex); for (size_t i = 0; i < pathList.size(); i++) { - if (pathIndexMap.find(i) != pathIndexMap.end()) { - j += 1; - if (((size_t) j) * batchSize > prefix.size() && + if (taskResults[i].ok()) { + upper_bound += 1; + if (upper_bound * chunkSize > prefix.size() && createFileSet.find(this->rootPath + pathList[i]) != createFileSet.end()) { // Only this part is created. @@ -410,13 +387,13 @@ Status FileStorage::Update( } } } - updated = - size_t(j * batchSize) < prefix.size() ? 0 : j * batchSize - prefix.size(); - for (size_t i = j; i < pathList.size(); i++) { + updated = upper_bound * chunkSize <= prefix.size() + ? 0 + : upper_bound * chunkSize - prefix.size(); + for (size_t i = upper_bound; i < pathList.size(); i++) { VINEYARD_SUPPRESS(Delete(this->rootPath + pathList[i])); VINEYARD_SUPPRESS(Delete(tempFilePaths[i])); } - return Status::OK(); } @@ -431,7 +408,7 @@ Status FileStorage::Update( * @brief Query the kv state with the given token list in the file storage. 
@@ -567,20 +537,120 @@ Status FileStorage::Query(
   return Status::OK();
 }
 
-Status FileStorage::Query(const std::vector<int>& tokenList, int nextToken,
+Status FileStorage::Query(const std::vector<int>& prefix, int nextToken,
                           std::vector<std::pair<LLMKV, LLMKV>>& kvState) {
   // TBD
   return Status::NotImplemented();
 }
 
-bool FileStorage::CompareTokenList(const std::vector<int>& tokenList,
+Status FileStorage::Query(
+    const std::vector<int>& prefix, const std::vector<int>& tokenList,
+    std::vector<std::vector<std::pair<LLMKV, LLMKV>>>& kvCacheList,
+    size_t& matched) {
+  if (this->exitFlag) {
+    return Status::Invalid("The file storage has been closed!");
+  }
+  if (prefix.size() % chunkSize != 0) {
+    return Status::Invalid("Prefix size " + std::to_string(prefix.size()) +
+                           " should be a multiple of the chunk size " +
+                           std::to_string(chunkSize) + "!");
+  }
+
+  size_t numPrefixChunks = prefix.size() / chunkSize;
+  std::vector<int> totalTokenList(prefix.begin(), prefix.end());
+  totalTokenList.insert(totalTokenList.end(), tokenList.begin(),
+                        tokenList.end());
+
+  std::vector<std::string> paths;
+  std::string dir = rootPath;
+  RETURN_ON_ERROR(hasher->computePathForTokens(totalTokenList, chunkSize,
+                                               hashChunkSize, paths));
+
+  auto fn = [&](size_t i, size_t matched_start) -> Status {
+    ghc::filesystem::path filePath(dir + paths[i]);
+    std::shared_ptr<FileDescriptor> fd = CreateFileDescriptor();
+
+    // If open fails, the kv state is not in the cache (file does not exist)
+    if (!Open(filePath.string(), fd, FileOperationType::READ).ok()) {
+      return Status::ObjectNotExists("Failed to open file '" +
+                                     filePath.string() + "'");
+    }
+    size_t file_size = 0;
+    auto s = GetFileSize(fd, file_size);
+    if (!s.ok()) {
+      VINEYARD_DISCARD(Close(fd));
+      return Status::ObjectNotExists("Cannot get file size");
+    }
+    if (file_size == 0) {
+      VINEYARD_DISCARD(Close(fd));
+      VINEYARD_DISCARD(Delete(filePath.string()));
+      return Status::ObjectNotExists("The target file is empty");
+    }
+
+    int tokenLength = 0;
+    RETURN_ON_ERROR(Read(fd, &tokenLength, sizeof(int)));
+    std::vector<int> blockTokenList(tokenLength, -1);
+    RETURN_ON_ERROR(Read(fd, blockTokenList.data(), tokenLength * sizeof(int)));
+
+    if (!CompareTokenList(totalTokenList, blockTokenList, tokenLength)) {
+      VINEYARD_DISCARD(Close(fd));
+      return Status::ObjectNotExists("Token mismatch");
+    }
+    for (int j = 0; j < chunkSize; j++) {
+      if (matched_start + j >= totalTokenList.size() ||
+          matched_start + j >= kvCacheList.size()) {
+        break;
+      }
+      auto& kvState = kvCacheList[matched_start + j];
+      for (int currentLayer = 0; currentLayer < layer; currentLayer++) {
+        RETURN_ON_ASSERT(static_cast<int>(kvState.size()) == layer,
+                         "The size of kvState is not equal to layer");
+        LLMKV& k = kvState[currentLayer].first;
+        LLMKV& v = kvState[currentLayer].second;
+        RETURN_ON_ASSERT(
+            k.length == tensorNBytes && v.length == tensorNBytes,
+            "The size of kv tensor doesn't match with the tensorNBytes");
+        RETURN_ON_ERROR(Read(fd, k.data, k.length));
+        RETURN_ON_ERROR(Read(fd, v.data, v.length));
+      }
+    }
+
+    VINEYARD_DISCARD(Close(fd));
+    return Status::OK();
+  };
+
+  parallel::ThreadGroup tg(std::min(
+      paths.size(), static_cast<size_t>(std::thread::hardware_concurrency())));
+  std::vector<parallel::ThreadGroup::tid_t> tids(paths.size() -
+                                                 numPrefixChunks);
+  for (size_t i = numPrefixChunks; i < paths.size(); i++) {
+    tids[i - numPrefixChunks] =
+        tg.AddTask(fn, i, (i - numPrefixChunks) * chunkSize);
+  }
+  std::vector<Status> taskResults(paths.size() - numPrefixChunks, Status::OK());
+  for (size_t i = numPrefixChunks; i < paths.size(); i++) {
+    taskResults[i - numPrefixChunks] = tg.TaskResult(tids[i - numPrefixChunks]);
+  }
+
+  matched = 0;
+  for (size_t i = numPrefixChunks; i < paths.size(); i++) {
+    if (taskResults[i - numPrefixChunks].ok()) {
+      matched += chunkSize;
+    } else {
+      break;
+    }
+  }
+  return Status::OK();
+}
+
+bool FileStorage::CompareTokenList(const std::vector<int>& tokenList1,
                                    const std::vector<int>& tokenList2,
                                    size_t length) {
-  if (tokenList.size() < length || tokenList2.size() < length) {
+  if (tokenList1.size() < length || tokenList2.size() < length) {
     return false;
   }
   for (size_t i = 0; i < length; i++) {
-    if (tokenList[i] != tokenList2[i]) {
+    if (tokenList1[i] != tokenList2[i]) {
       return false;
     }
   }
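A worked example of the chunk bookkeeping in the prefix-aware Query above (all numbers are illustrative):

    // With chunkSize == 4, prefix.size() == 8 and tokenList.size() == 8:
    //   numPrefixChunks == 8 / 4 == 2 and paths.size() == (8 + 8) / 4 == 4,
    //   so tasks are scheduled only for i == 2 and i == 3, with matched_start
    //   offsets 0 and 4; `matched` therefore counts new tokens only, up to 8.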
diff --git a/modules/llm-cache/storage/file_storage.h b/modules/llm-cache/storage/file_storage.h
index e3223180a9..a3be5c1f80 100644
--- a/modules/llm-cache/storage/file_storage.h
+++ b/modules/llm-cache/storage/file_storage.h
@@ -50,7 +50,7 @@ enum FileOperationType {
 class FileStorage : public IStorage,
                     public std::enable_shared_from_this<FileStorage> {
  private:
-  bool CompareTokenList(const std::vector<int>& tokenList,
+  bool CompareTokenList(const std::vector<int>& tokenList1,
                         const std::vector<int>& tokenList2, size_t length);
 
   virtual std::shared_ptr<FileDescriptor> CreateFileDescriptor() = 0;
@@ -117,7 +117,7 @@ class FileStorage : public IStorage,
   Status Update(
       const std::vector<int>& tokenList,
-      const std::vector<std::vector<std::pair<LLMKV, LLMKV>>>& kvStateList,
+      const std::vector<std::vector<std::pair<LLMKV, LLMKV>>>& kvCacheList,
       size_t& updated) override;
 
   Status Update(const std::vector<int>& tokenList, int nextToken,
@@ -125,16 +125,21 @@ class FileStorage : public IStorage,
   Status Update(
       const std::vector<int>& prefix, const std::vector<int>& tokenList,
-      const std::vector<std::vector<std::pair<LLMKV, LLMKV>>>& kvStateList,
+      const std::vector<std::vector<std::pair<LLMKV, LLMKV>>>& kvCacheList,
       size_t& updated) override;
 
   Status Query(const std::vector<int>& tokenList,
-               std::vector<std::vector<std::pair<LLMKV, LLMKV>>>& kvStateList,
+               std::vector<std::vector<std::pair<LLMKV, LLMKV>>>& kvCacheList,
                size_t& matched) override;
 
-  Status Query(const std::vector<int>& tokenList, int nextToken,
+  Status Query(const std::vector<int>& prefix, int nextToken,
                std::vector<std::pair<LLMKV, LLMKV>>& kvState) override;
 
+  Status Query(const std::vector<int>& prefix,
+               const std::vector<int>& tokenList,
+               std::vector<std::vector<std::pair<LLMKV, LLMKV>>>& kvCacheList,
+               size_t& matched) override;
+
   void CloseCache() override;
 
   virtual Status Init() = 0;
@@ -144,11 +149,11 @@ class FileStorage : public IStorage,
   void StartGlobalGCThread() override { this->enableGlobalGC = true; }
 
  protected:
-  size_t tensorBytes;
+  size_t tensorNBytes;
   size_t cacheCapacity;
   int layer;
-  int batchSize;
-  int splitNumber;
+  int chunkSize;
+  int hashChunkSize;
   std::string rootPath;
   std::string tempFileDir;
   std::shared_ptr<IHashAlgorithm> hashAlgorithm;
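For orientation, a rough sketch of what a custom backend derived from this class could look like after the renaming. `MyFileDescriptor` is hypothetical, and the remaining pure-virtual I/O hooks (Open/Read/Write and friends, not visible in this hunk) would need overrides as well:

    class MyFileStorage : public FileStorage {
     public:
      Status Init() override { return Status::OK(); }

     private:
      std::shared_ptr<FileDescriptor> CreateFileDescriptor() override {
        return std::make_shared<MyFileDescriptor>();  // hypothetical type
      }
    };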
diff --git a/modules/llm-cache/storage/local_file_storage.h b/modules/llm-cache/storage/local_file_storage.h
index 5abd9a0b8b..1396373a9d 100644
--- a/modules/llm-cache/storage/local_file_storage.h
+++ b/modules/llm-cache/storage/local_file_storage.h
@@ -35,21 +35,21 @@ struct LocalFileDescriptor : public FileDescriptor {
 
 class LocalFileStorage : public FileStorage {
  public:
-  LocalFileStorage(int tensorBytes, int cacheCapacity, int layer, int batchSize,
-                   int splitNumber, std::string rootPath,
-                   int64_t clientGCInterval, int64_t ttl, bool enableGlobalGC,
+  LocalFileStorage(int tensorNBytes, int cacheCapacity, int layer,
+                   int chunkSize, int hashChunkSize, std::string rootPath,
+                   int64_t gcInterval, int64_t ttl, bool enableGlobalGC,
                    int64_t globalGCInterval, int64_t globalTTL) {
     this->hashAlgorithm = std::make_shared<MurmurHash3Algorithm>();
     this->hasher = std::make_shared<Hasher>(hashAlgorithm.get());
-    this->tensorBytes = tensorBytes;
+    this->tensorNBytes = tensorNBytes;
     this->cacheCapacity = cacheCapacity;
     this->layer = layer;
-    this->batchSize = batchSize;
-    this->splitNumber = splitNumber;
+    this->chunkSize = chunkSize;
+    this->hashChunkSize = hashChunkSize;
     this->rootPath = std::regex_replace(rootPath + "/", std::regex("/+"), "/");
     this->tempFileDir =
         std::regex_replace(rootPath + "/__temp/", std::regex("/+"), "/");
-    this->gcInterval = std::chrono::seconds(clientGCInterval);
+    this->gcInterval = std::chrono::seconds(gcInterval);
     this->fileTTL = std::chrono::seconds(ttl);
     this->globalGCInterval = std::chrono::seconds(globalGCInterval);
     this->globalFileTTL = std::chrono::seconds(globalTTL);
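An illustrative construction with the renamed knobs (all values are examples only; Init() comes from the FileStorage interface and is implemented by this subclass):

    auto storage = std::make_shared<LocalFileStorage>(
        /*tensorNBytes=*/80, /*cacheCapacity=*/1024, /*layer=*/2,
        /*chunkSize=*/16, /*hashChunkSize=*/4, "/tmp/llm_cache/",
        /*gcInterval=*/1800, /*ttl=*/1800, /*enableGlobalGC=*/false,
        /*globalGCInterval=*/10800, /*globalTTL=*/10800);
    VINEYARD_CHECK_OK(storage->Init());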
diff --git a/modules/llm-cache/storage/storage.h b/modules/llm-cache/storage/storage.h
index abbe79e640..ce4344edfa 100644
--- a/modules/llm-cache/storage/storage.h
+++ b/modules/llm-cache/storage/storage.h
@@ -21,7 +21,7 @@ limitations under the License.
 #include <vector>
 
 #include "common/util/status.h"
-#include "llm-cache/ds/kv_state_cache_block.h"
+#include "llm-cache/ds/kv_cache_block.h"
 
 namespace vineyard {
 
@@ -31,7 +31,7 @@ class IStorage {
   virtual Status Update(
       const std::vector<int>& tokenList,
-      const std::vector<std::vector<std::pair<LLMKV, LLMKV>>>& kvStateList,
+      const std::vector<std::vector<std::pair<LLMKV, LLMKV>>>& kvCacheList,
       size_t& updated) = 0;
 
   virtual Status Update(
@@ -40,17 +40,22 @@ class IStorage {
   virtual Status Update(
       const std::vector<int>& prefix, const std::vector<int>& tokenList,
-      const std::vector<std::vector<std::pair<LLMKV, LLMKV>>>& kvStateList,
+      const std::vector<std::vector<std::pair<LLMKV, LLMKV>>>& kvCacheList,
       size_t& updated) = 0;
 
   virtual Status Query(
       const std::vector<int>& tokenList,
-      std::vector<std::vector<std::pair<LLMKV, LLMKV>>>& kvStateList,
+      std::vector<std::vector<std::pair<LLMKV, LLMKV>>>& kvCacheList,
       size_t& matched) = 0;
 
-  virtual Status Query(const std::vector<int>& tokenList, int nextToken,
+  virtual Status Query(const std::vector<int>& prefix, int nextToken,
                        std::vector<std::pair<LLMKV, LLMKV>>& kvState) = 0;
 
+  virtual Status Query(
+      const std::vector<int>& prefix, const std::vector<int>& tokenList,
+      std::vector<std::vector<std::pair<LLMKV, LLMKV>>>& kvCacheList,
+      size_t& matched) = 0;
+
   virtual void CloseCache() = 0;
 
   virtual void StartGlobalGCThread() {}
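Putting the extended interface together, the intended round-trip mirrors the worker script below: query what is already cached, then persist the recomputed tail with the matched part as the new prefix. A sketch, where `cache` is assumed to be any IStorage implementation and the kv buffers are set up as in the earlier examples:

    size_t matched = 0, updated = 0;
    VINEYARD_DISCARD(cache->Query(tokens, kvCacheList, matched));
    std::vector<int> prefix(tokens.begin(), tokens.begin() + matched);
    std::vector<int> remaining(tokens.begin() + matched, tokens.end());
    // ... run the model over `remaining` and fill remainingKvCacheList, then:
    VINEYARD_DISCARD(
        cache->Update(prefix, remaining, remainingKvCacheList, updated));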
diff --git a/modules/llm-cache/tests/k8s-test/worker.py b/modules/llm-cache/tests/k8s-test/worker.py
index 3d689212ca..f46da83c81 100644
--- a/modules/llm-cache/tests/k8s-test/worker.py
+++ b/modules/llm-cache/tests/k8s-test/worker.py
@@ -13,9 +13,9 @@ def start_server(port=8888):
     ip = os.environ.get('POD_IP', 'localhost')
     layer = int(os.environ.get('LAYER', 96))
     batch_size = int(os.environ.get('BATCH_SIZE', 16))
-    split_number = int(os.environ.get('SPLIT_NUMBER', 2))
+    hash_chunk_size = int(os.environ.get('HASH_CHUNK_SIZE', 2))
     cache_path = os.environ.get('CACHE_PATH', '/mnt/llm_cache')
-    client_gc_interval = int(os.environ.get('CLIENT_GC_INTERVAL', 30 * 60))
+    gc_interval = int(os.environ.get('GC_INTERVAL', 30 * 60))
     ttl = int(os.environ.get('TTL', 30 * 60))
     enable_global_gc = os.environ.get('ENABLE_GLOBAL_GC', 'false').lower() in ['true', '1']
     global_gc_interval = int(os.environ.get('GLOBAL_GC_INTERVAL', 3 * 60 * 60))
@@ -29,9 +29,9 @@ def start_server(port=8888):
 
     file_cache_config = FileCacheConfig(
         chunk_size = int(batch_size),
-        split_number = int(split_number),
+        hash_chunk_size = int(hash_chunk_size),
         root = cache_path,
-        client_gc_interval = client_gc_interval,
+        gc_interval = gc_interval,
         ttl = ttl,
         enable_global_gc = enable_global_gc,
         global_gc_interval = global_gc_interval,
@@ -39,7 +39,7 @@ def start_server(port=8888):
     )
     cache = KVCache(
         cache_config = file_cache_config,
-        tensor_bytes=kv_tensor.nbytes,  # should be the same as the nbytes of the tensor
+        tensor_nbytes=kv_tensor.nbytes,  # should be the same as the nbytes of the tensor
         cache_capacity=1024,
         layer=int(layer),
     )
@@ -64,7 +64,7 @@ def reserve_kv_tensors(kv_tensors, num_tokens, kv_tensor):
         return kv_tensors
 
     # used to hold the query results
-    kv_state_list = []
+    kv_cache_list = []
 
     while True:
         clientsocket, _ = serversocket.accept()
@@ -80,10 +80,10 @@ def reserve_kv_tensors(kv_tensors, num_tokens, kv_tensor):
             tokens = tokens.replace('\n', '').split(' ')
             tokens = [int(token) for token in tokens]
 
-            kv_state_list = reserve_kv_tensors(kv_state_list, len(tokens), kv_tensor)
+            kv_cache_list = reserve_kv_tensors(kv_cache_list, len(tokens), kv_tensor)
 
             query_start_time = time.time()
-            matched = cache.query(tokens, kv_state_list)
+            matched = cache.query(tokens, kv_cache_list)
             query_end_time = time.time()
             if matched > 0:
                 total_query_time += query_end_time - query_start_time
@@ -92,14 +92,14 @@ def reserve_kv_tensors(kv_tensors, num_tokens, kv_tensor):
             total_tokens += len(tokens)
remaining = tokens[matched:] - kv_state_list_remaining = [ + kv_cache_list_remaining = [ [ (KVTensor(kv_tensor.ctypes.data, kv_tensor.nbytes), KVTensor(kv_tensor.ctypes.data, kv_tensor.nbytes)) for _ in range(layer) ] for _ in remaining ] update_start_time = time.time() - updated = cache.update(tokens[:matched], remaining, kv_state_list_remaining) + updated = cache.update(tokens[:matched], remaining, kv_cache_list_remaining) total_updated_tokens += updated update_end_time = time.time() if updated > 0: diff --git a/modules/llm-cache/tests/k8s-test/yamls/worker.yaml b/modules/llm-cache/tests/k8s-test/yamls/worker.yaml index 603fb72f0b..32cc86212f 100644 --- a/modules/llm-cache/tests/k8s-test/yamls/worker.yaml +++ b/modules/llm-cache/tests/k8s-test/yamls/worker.yaml @@ -35,9 +35,9 @@ spec: value: "96" - name: BATCH_SIZE value: "16" - - name: SPLIT_NUMBER + - name: HASH_CHUNK_SIZE value: "2" - - name: CLIENT_GC_INTERVAL + - name: GC_INTERVAL value: "1800" - name: TTL value: "1800" diff --git a/modules/llm-cache/tests/kv_state_cache_benchmark_test.cc b/modules/llm-cache/tests/kv_cache_benchmark_test.cc similarity index 92% rename from modules/llm-cache/tests/kv_state_cache_benchmark_test.cc rename to modules/llm-cache/tests/kv_cache_benchmark_test.cc index 8dea2d4041..4d6ea93477 100644 --- a/modules/llm-cache/tests/kv_state_cache_benchmark_test.cc +++ b/modules/llm-cache/tests/kv_cache_benchmark_test.cc @@ -23,7 +23,7 @@ limitations under the License. #include "client/ds/object_meta.h" #include "common/util/logging.h" -#include "llm-cache/ds/kv_state_cache_manager.h" +#include "llm-cache/ds/kv_cache_manager.h" using namespace vineyard; // NOLINT(build/namespaces) @@ -32,13 +32,13 @@ constexpr int CAPACITY = 8000; constexpr int LAYER = 64; constexpr int BLOCK_SIZE = 100; -std::shared_ptr manager; +std::shared_ptr manager; VineyardCacheConfig config(TENSORBYTES, CAPACITY, LAYER, BLOCK_SIZE, 300); Client client; void init(std::string socket) { VINEYARD_CHECK_OK(client.Connect(socket)); - VINEYARD_CHECK_OK(KVStateCacheManager::Make(client, manager, config)); + VINEYARD_CHECK_OK(KVCacheManager::Make(client, manager, config)); } std::vector generate_unique_tokens(size_t max_length) { @@ -83,7 +83,7 @@ void benchmark_inference(std::vector>& tokens) { std::chrono::duration query_duration(0); std::vector inference_tokens; - std::vector> kv_state_list; + std::vector> kv_cache_list; void* key_state = malloc(TENSORBYTES); void* value_state = malloc(TENSORBYTES); @@ -106,15 +106,15 @@ void benchmark_inference(std::vector>& tokens) { // query time for (size_t i = 0; i < tokens.size(); ++i) { inference_tokens.clear(); - kv_state_list.clear(); + kv_cache_list.clear(); for (size_t j = 0; j < tokens[i].size(); ++j) { start = std::chrono::steady_clock::now(); Status status = - manager->Query(inference_tokens, tokens[i][j], kv_state_list); + manager->Query(inference_tokens, tokens[i][j], kv_cache_list); if (!status.ok()) { VLOG(100) << "KV state is not in the cache."; } - for (auto& kv : kv_state_list) { + for (auto& kv : kv_cache_list) { for (int currentLayer = 0; currentLayer < LAYER; currentLayer++) { memcpy(key_state, kv.first.data, kv.first.length); memcpy(value_state, kv.second.data, kv.second.length); @@ -138,7 +138,7 @@ void benchmark_inference(std::vector>& tokens) { int main(int argc, char** argv) { if (argc < 2) { - printf("usage ./kv_state_cache_benchmark "); + printf("usage ./kv_cache_benchmark "); return 1; } std::string ipc_socket = std::string(argv[1]); diff --git 
a/modules/llm-cache/tests/kv_state_cache_hash_test.cc b/modules/llm-cache/tests/kv_cache_hash_test.cc similarity index 86% rename from modules/llm-cache/tests/kv_state_cache_hash_test.cc rename to modules/llm-cache/tests/kv_cache_hash_test.cc index 63bcc6e37b..48a1b9ed77 100644 --- a/modules/llm-cache/tests/kv_state_cache_hash_test.cc +++ b/modules/llm-cache/tests/kv_cache_hash_test.cc @@ -24,7 +24,7 @@ limitations under the License. using namespace vineyard; // NOLINT(build/namespaces) constexpr int BATCHSIZE = 16; -constexpr int SPLITNUMBER = 2; +constexpr int HASH_CHUNK_SIZE = 2; constexpr int TOKENLISTSIZE = 100000; std::vector generate_random_tokens(size_t max_length) { @@ -49,7 +49,7 @@ void test_with_tokens(IHashAlgorithm* hash_algorithm, std::vector tokens = generate_random_tokens(10); std::vector paths; VINEYARD_CHECK_OK( - hasher.computePathForTokens(tokens, BATCHSIZE, SPLITNUMBER, paths)); + hasher.computePathForTokens(tokens, BATCHSIZE, HASH_CHUNK_SIZE, paths)); VINEYARD_ASSERT(paths.size() == 0); // test the hash with the tokens more than the batch size @@ -57,15 +57,15 @@ void test_with_tokens(IHashAlgorithm* hash_algorithm, std::vector tokens1 = generate_random_tokens(17); std::vector tokens2 = generate_random_tokens(18); VINEYARD_CHECK_OK( - hasher.computePathForTokens(tokens1, BATCHSIZE, SPLITNUMBER, paths1)); + hasher.computePathForTokens(tokens1, BATCHSIZE, HASH_CHUNK_SIZE, paths1)); VINEYARD_CHECK_OK( - hasher.computePathForTokens(tokens2, BATCHSIZE, SPLITNUMBER, paths2)); + hasher.computePathForTokens(tokens2, BATCHSIZE, HASH_CHUNK_SIZE, paths2)); VINEYARD_ASSERT(paths1.size() == paths1.size()); paths.clear(); tokens = generate_random_tokens(100); VINEYARD_CHECK_OK( - hasher.computePathForTokens(tokens, BATCHSIZE, SPLITNUMBER, paths)); + hasher.computePathForTokens(tokens, BATCHSIZE, HASH_CHUNK_SIZE, paths)); VINEYARD_ASSERT(paths.size() == size_t(100 / 16)); LOG(INFO) << "Passed the " << hash_name << " test of tokens"; } @@ -79,10 +79,10 @@ void test_accuracy(IHashAlgorithm* hash_algorithm, std::vector paths2; for (int i = 0; i < 100; i++) { std::vector tokens = generate_random_tokens(100); - VINEYARD_CHECK_OK( - hasher.computePathForTokens(tokens, BATCHSIZE, SPLITNUMBER, paths1)); - VINEYARD_CHECK_OK( - hasher.computePathForTokens(tokens, BATCHSIZE, SPLITNUMBER, paths2)); + VINEYARD_CHECK_OK(hasher.computePathForTokens(tokens, BATCHSIZE, + HASH_CHUNK_SIZE, paths1)); + VINEYARD_CHECK_OK(hasher.computePathForTokens(tokens, BATCHSIZE, + HASH_CHUNK_SIZE, paths2)); } VINEYARD_ASSERT(paths1.size() == paths2.size()); @@ -119,9 +119,9 @@ void test_hash_conflict() { tokens_map[tokens]++; token_size += tokens.size(); VINEYARD_CHECK_OK(murmur_hasher.computePathForTokens( - tokens, BATCHSIZE, SPLITNUMBER, murmur_hash_paths)); + tokens, BATCHSIZE, HASH_CHUNK_SIZE, murmur_hash_paths)); VINEYARD_CHECK_OK(city_hasher.computePathForTokens( - tokens, BATCHSIZE, SPLITNUMBER, city_hash_paths)); + tokens, BATCHSIZE, HASH_CHUNK_SIZE, city_hash_paths)); } for (size_t i = 0; i < murmur_hash_paths.size(); i++) { diff --git a/modules/llm-cache/tests/kv_state_cache_local_file_test.cc b/modules/llm-cache/tests/kv_cache_local_file_test.cc similarity index 83% rename from modules/llm-cache/tests/kv_state_cache_local_file_test.cc rename to modules/llm-cache/tests/kv_cache_local_file_test.cc index 30ae068b4d..ee9e4f4d9a 100644 --- a/modules/llm-cache/tests/kv_state_cache_local_file_test.cc +++ b/modules/llm-cache/tests/kv_cache_local_file_test.cc @@ -20,12 +20,12 @@ limitations under the License. 
#include "gulrak/filesystem.hpp" #include "llm-cache/ds/config.h" -#include "llm-cache/ds/kv_state_cache_manager.h" +#include "llm-cache/ds/kv_cache_manager.h" #include "rax/radix.h" using namespace vineyard; // NOLINT(build/namespaces) -int tensorBytes = 80; +int tensorNBytes = 80; int capacity = 20; int layer = 3; @@ -43,10 +43,10 @@ std::vector round_4_tokens = {1, 2, 3, 4, 5, 6}; std::vector> tokens_list = {round_1_tokens, round_2_tokens, round_3_tokens, round_4_tokens}; -std::shared_ptr init() { - std::shared_ptr kv_state_cache_manager; - VINEYARD_CHECK_OK(KVStateCacheManager::Make(kv_state_cache_manager, config)); - return kv_state_cache_manager; +std::shared_ptr init() { + std::shared_ptr kv_cache_manager; + VINEYARD_CHECK_OK(KVCacheManager::Make(kv_cache_manager, config)); + return kv_cache_manager; } void print_current_tokens(const std::vector& prefix, int next_token) { @@ -65,10 +65,10 @@ void print_kv_state(const std::vector>& kv_state) { reinterpret_cast(kv_state[i].first.data); uint8_t* value_state_data = reinterpret_cast(kv_state[i].second.data); - // print the first tensorBytes bytes + // print the first tensorNBytes bytes std::string key_state_str = ""; std::string value_state_str = ""; - for (int j = 0; j < tensorBytes; j++) { + for (int j = 0; j < tensorNBytes; j++) { key_state_str += std::to_string(key_state_data[j]) + " "; value_state_str += std::to_string(value_state_data[j]) + " "; } @@ -85,13 +85,13 @@ std::vector> generate_kv_state(int token) { for (int currentLayer = 0; currentLayer < layer; currentLayer++) { LLMKV key_state; LLMKV value_state; - key_state.data = malloc(tensorBytes); - value_state.data = malloc(tensorBytes); + key_state.data = malloc(tensorNBytes); + value_state.data = malloc(tensorNBytes); - key_state.length = tensorBytes; - value_state.length = tensorBytes; + key_state.length = tensorNBytes; + value_state.length = tensorNBytes; - for (int i = 0; i < tensorBytes; ++i) { + for (int i = 0; i < tensorNBytes; ++i) { (reinterpret_cast(key_state.data))[i] = (static_cast(token)) + i + currentLayer; (reinterpret_cast(value_state.data))[i] = @@ -108,13 +108,13 @@ void check_kv_state(const std::vector>& kv_state, VINEYARD_ASSERT(kv_state.size() == (size_t) layer); for (size_t index = 0; index < kv_state.size(); ++index) { LOG(INFO) << "kv_state length: " << kv_state[index].first.length - << "tensorBytes: " << tensorBytes << "layer: " << layer; - VINEYARD_ASSERT(kv_state[index].first.length == (size_t) tensorBytes); - VINEYARD_ASSERT(kv_state[index].second.length == (size_t) tensorBytes); - for (int i = 0; i < tensorBytes; ++i) { + << "tensorNBytes: " << tensorNBytes << "layer: " << layer; + VINEYARD_ASSERT(kv_state[index].first.length == (size_t) tensorNBytes); + VINEYARD_ASSERT(kv_state[index].second.length == (size_t) tensorNBytes); + for (int i = 0; i < tensorNBytes; ++i) { if ((reinterpret_cast(kv_state[index].first.data))[i] != (static_cast(token)) + i + index) { - LOG(INFO) << "token:" << token << " tensorBytes" << tensorBytes + LOG(INFO) << "token:" << token << " tensorNBytes" << tensorNBytes << " layer:" << index; LOG(INFO) << "key_state[" << i << "]: " << (reinterpret_cast(kv_state[index].first.data))[i] @@ -124,7 +124,7 @@ void check_kv_state(const std::vector>& kv_state, } if (reinterpret_cast(kv_state[index].second.data)[i] != (static_cast(token)) + i + index) { - LOG(INFO) << "token:" << token << " tensorBytes" << tensorBytes + LOG(INFO) << "token:" << token << " tensorNBytes" << tensorNBytes << " layer:" << index; LOG(INFO) << "value_state[" 
<< i << "]: " << (reinterpret_cast( @@ -137,7 +137,7 @@ void check_kv_state(const std::vector>& kv_state, } } -void inference(std::shared_ptr& kv_state_cache_manager, +void inference(std::shared_ptr& kv_cache_manager, std::vector tokens, bool block = false) { std::vector inference_tokens; std::vector>> kv_state; @@ -150,8 +150,7 @@ void inference(std::shared_ptr& kv_state_cache_manager, } size_t updated = 0; - Status result = - kv_state_cache_manager->Update(inference_tokens, kv_state, updated); + Status result = kv_cache_manager->Update(inference_tokens, kv_state, updated); std::vector>> kv_state_to_query; for (size_t i = 0; i < tokens.size(); ++i) { @@ -160,8 +159,8 @@ void inference(std::shared_ptr& kv_state_cache_manager, kv_state_to_query.push_back(current_kv_state); } size_t matched = 0; - Status query_result = kv_state_cache_manager->Query( - inference_tokens, kv_state_to_query, matched); + Status query_result = + kv_cache_manager->Query(inference_tokens, kv_state_to_query, matched); if (!query_result.ok()) { LOG(INFO) << "Query failed!"; } @@ -180,7 +179,7 @@ void checkFilesNotExist(std::string dir) { } void threadFunc(int sleep_time) { - std::shared_ptr manager = init(); + std::shared_ptr manager = init(); for (size_t i = 0; i < tokens_list.size(); i++) { LOG(INFO) << "Round " << i << " :"; @@ -193,10 +192,10 @@ void threadFunc(int sleep_time) { } int main(int argc, char** argv) { - LOG(INFO) << "Test KVStateCache with tensorBytes: " << tensorBytes + LOG(INFO) << "Test KVCache with tensorNBytes: " << tensorNBytes << ", capacity: " << capacity << ", layer: " << layer; - config = FileCacheConfig(tensorBytes, capacity, layer, 4, 2, + config = FileCacheConfig(tensorNBytes, capacity, layer, 4, 2, "/tmp/llm_cache/", LOCAL, 1, 1, false, 3, 5); std::vector threads; @@ -211,7 +210,7 @@ int main(int argc, char** argv) { checkFilesNotExist("/tmp/llm_cache/"); - config = FileCacheConfig(tensorBytes, capacity, layer, 4, 2, + config = FileCacheConfig(tensorNBytes, capacity, layer, 4, 2, "/tmp/llm_cache/", LOCAL, 10, 20, true, 1, 2); threads.clear(); @@ -227,6 +226,6 @@ int main(int argc, char** argv) { sleep(3); checkFilesNotExist("/tmp/llm_cache/"); - LOG(INFO) << "Passed KVStateCache tests..."; + LOG(INFO) << "Passed KVCache tests..."; return 0; } diff --git a/modules/llm-cache/tests/kv_state_cache_radix_tree_test.cc b/modules/llm-cache/tests/kv_cache_radix_tree_test.cc similarity index 99% rename from modules/llm-cache/tests/kv_state_cache_radix_tree_test.cc rename to modules/llm-cache/tests/kv_cache_radix_tree_test.cc index 235e344405..723c9b15a2 100644 --- a/modules/llm-cache/tests/kv_state_cache_radix_tree_test.cc +++ b/modules/llm-cache/tests/kv_cache_radix_tree_test.cc @@ -21,7 +21,7 @@ limitations under the License. #include "rax/radix.h" #include "common/util/logging.h" -#include "llm-cache/ds/kv_state_cache_manager.h" +#include "llm-cache/ds/kv_cache_manager.h" using namespace vineyard; // NOLINT(build/namespaces) diff --git a/modules/llm-cache/tests/kv_state_cache_test.cc b/modules/llm-cache/tests/kv_cache_test.cc similarity index 84% rename from modules/llm-cache/tests/kv_state_cache_test.cc rename to modules/llm-cache/tests/kv_cache_test.cc index e936a8afed..25b39060cc 100644 --- a/modules/llm-cache/tests/kv_state_cache_test.cc +++ b/modules/llm-cache/tests/kv_cache_test.cc @@ -22,11 +22,11 @@ limitations under the License. 
#include "common/util/logging.h" #include "llm-cache/ds/config.h" -#include "llm-cache/ds/kv_state_cache_manager.h" +#include "llm-cache/ds/kv_cache_manager.h" using namespace vineyard; // NOLINT(build/namespaces) -int tensorBytes = 80; +int tensorNBytes = 80; int capacity = 20; int layer = 3; int block_size = 5; @@ -48,11 +48,10 @@ std::vector round_4_tokens = {1, 2, 3, 4, 5, 6}; std::vector> tokens_list = {round_1_tokens, round_2_tokens, round_3_tokens, round_4_tokens}; -std::shared_ptr init(Client& client) { - std::shared_ptr kv_state_cache_manager; - VINEYARD_CHECK_OK( - KVStateCacheManager::Make(client, kv_state_cache_manager, config)); - return kv_state_cache_manager; +std::shared_ptr init(Client& client) { + std::shared_ptr kv_cache_manager; + VINEYARD_CHECK_OK(KVCacheManager::Make(client, kv_cache_manager, config)); + return kv_cache_manager; } void print_current_tokens(const std::vector& prefix, int next_token) { @@ -71,10 +70,10 @@ void print_kv_state(const std::vector>& kv_state) { reinterpret_cast(kv_state[i].first.data); uint8_t* value_state_data = reinterpret_cast(kv_state[i].second.data); - // print the first tensorBytes bytes + // print the first tensorNBytes bytes std::string key_state_str = ""; std::string value_state_str = ""; - for (int j = 0; j < tensorBytes; j++) { + for (int j = 0; j < tensorNBytes; j++) { key_state_str += std::to_string(key_state_data[j]) + " "; value_state_str += std::to_string(value_state_data[j]) + " "; } @@ -91,13 +90,13 @@ std::vector> generate_kv_state(int token) { for (int currentLayer = 0; currentLayer < layer; currentLayer++) { LLMKV key_state; LLMKV value_state; - key_state.data = malloc(tensorBytes); - value_state.data = malloc(tensorBytes); + key_state.data = malloc(tensorNBytes); + value_state.data = malloc(tensorNBytes); - key_state.length = tensorBytes; - value_state.length = tensorBytes; + key_state.length = tensorNBytes; + value_state.length = tensorNBytes; - for (int i = 0; i < tensorBytes; ++i) { + for (int i = 0; i < tensorNBytes; ++i) { (reinterpret_cast(key_state.data))[i] = (static_cast(token)) + i + currentLayer; (reinterpret_cast(value_state.data))[i] = @@ -112,12 +111,12 @@ void check_kv_state(const std::vector>& kv_state, int& token) { VINEYARD_ASSERT(kv_state.size() == (size_t) layer); for (size_t index = 0; index < kv_state.size(); ++index) { - VINEYARD_ASSERT(kv_state[index].first.length == (size_t) tensorBytes); - VINEYARD_ASSERT(kv_state[index].second.length == (size_t) tensorBytes); - for (int i = 0; i < tensorBytes; ++i) { + VINEYARD_ASSERT(kv_state[index].first.length == (size_t) tensorNBytes); + VINEYARD_ASSERT(kv_state[index].second.length == (size_t) tensorNBytes); + for (int i = 0; i < tensorNBytes; ++i) { if ((reinterpret_cast(kv_state[index].first.data))[i] != (static_cast(token)) + i + index) { - LOG(INFO) << "token:" << token << " tensorBytes" << tensorBytes + LOG(INFO) << "token:" << token << " tensorNBytes" << tensorNBytes << " layer:" << index; LOG(INFO) << "key_state[" << i << "]: " << (reinterpret_cast(kv_state[index].first.data))[i] @@ -127,7 +126,7 @@ void check_kv_state(const std::vector>& kv_state, } if (reinterpret_cast(kv_state[index].second.data)[i] != (static_cast(token)) + i + index) { - LOG(INFO) << "token:" << token << " tensorBytes" << tensorBytes + LOG(INFO) << "token:" << token << " tensorNBytes" << tensorNBytes << " layer:" << index; LOG(INFO) << "value_state[" << i << "]: " << (reinterpret_cast( @@ -140,7 +139,7 @@ void check_kv_state(const std::vector>& kv_state, } } -void 
inference(std::shared_ptr& kv_state_cache_manager, +void inference(std::shared_ptr& kv_cache_manager, std::vector tokens, bool block = false) { std::vector inference_tokens; std::vector> kv_state; @@ -151,13 +150,13 @@ void inference(std::shared_ptr& kv_state_cache_manager, LOG(INFO) << "before query"; LOG(INFO) << "kv_state_to_query size: " << kv_state_to_query.size() - << "layer" << layer << "tensorBytes" << tensorBytes; + << "layer" << layer << "tensorNBytes" << tensorNBytes; kv_state_to_query.clear(); for (int currentLayer = 0; currentLayer < layer; currentLayer++) { kv_state_to_query.emplace_back(LLMKV{nullptr, 0}, LLMKV{nullptr, 0}); } - Status result = kv_state_cache_manager->Query(inference_tokens, tokens[i], - kv_state_to_query); + Status result = + kv_cache_manager->Query(inference_tokens, tokens[i], kv_state_to_query); if (!result.ok()) { LOG(INFO) << "Can not find the kv_state from cache:"; print_current_tokens(inference_tokens, tokens[i]); @@ -165,7 +164,7 @@ void inference(std::shared_ptr& kv_state_cache_manager, kv_state = generate_kv_state(tokens[i]); print_kv_state(kv_state); Status status = - kv_state_cache_manager->Update(inference_tokens, tokens[i], kv_state); + kv_cache_manager->Update(inference_tokens, tokens[i], kv_state); if (!status.ok()) { // Not a error. May be the cache is full. VLOG(100) << "Put kv state into cache failed."; @@ -183,7 +182,7 @@ void inference(std::shared_ptr& kv_state_cache_manager, void threadFunc(std::string socket) { Client client; VINEYARD_CHECK_OK(client.Connect(socket)); - std::shared_ptr manager = init(client); + std::shared_ptr manager = init(client); for (size_t i = 0; i < tokens_list.size(); i++) { inference(manager, tokens_list[i]); @@ -205,7 +204,7 @@ void clearGlobalObject(std::vector& sockets) { Client client; VINEYARD_CHECK_OK(client.Connect(sockets[0])); - VINEYARD_CHECK_OK(KVStateCacheManager::ClearGlobalCache(client, config)); + VINEYARD_CHECK_OK(KVCacheManager::ClearGlobalCache(client, config)); client.Disconnect(); for (size_t i = 0; i < sockets.size(); i++) { @@ -231,9 +230,9 @@ int main(int argc, char** argv) { std::vector sockets; if (argc < 2) { printf( - "usage ./kv_state_cache_test --client-num " + "usage ./kv_cache_test --client-num " "--vineyard-ipc-sockets ... 
-d " - " -c -l -b \n"); + " -c -l -b \n"); return 1; } @@ -245,7 +244,7 @@ int main(int argc, char** argv) { for (int i = 3; i < argc; i++) { if (strcmp(argv[i], "-d") == 0) { - tensorBytes = atoi(argv[i + 1]); + tensorNBytes = atoi(argv[i + 1]); } else if (strcmp(argv[i], "-c") == 0) { capacity = atoi(argv[i + 1]); } else if (strcmp(argv[i], "-l") == 0) { @@ -273,12 +272,12 @@ int main(int argc, char** argv) { } } - LOG(INFO) << "Test KVStateCache with tensorBytes: " << tensorBytes + LOG(INFO) << "Test KVCache with tensorNBytes: " << tensorNBytes << ", capacity: " << capacity << ", layer: " << layer << ", block_size: " << block_size << " and use " << client_num << " client."; - config = VineyardCacheConfig(tensorBytes, capacity, layer, block_size, 3, + config = VineyardCacheConfig(tensorNBytes, capacity, layer, block_size, 3, llmCacheSyncLock, llmCacheObjectName, llmRefcntObjectName); @@ -308,6 +307,6 @@ int main(int argc, char** argv) { } LOG(INFO) << "Total memory usage:" << total_memory_usage; - LOG(INFO) << "Passed KVStateCache tests..."; + LOG(INFO) << "Passed KVCache tests..."; return 0; } diff --git a/modules/llm-cache/tests/refcnt_map_test.cc b/modules/llm-cache/tests/refcnt_map_test.cc index 0cf02d4b42..d2fc92ccea 100644 --- a/modules/llm-cache/tests/refcnt_map_test.cc +++ b/modules/llm-cache/tests/refcnt_map_test.cc @@ -20,11 +20,11 @@ limitations under the License. #include "rax/radix.h" #include "common/util/logging.h" -#include "llm-cache/ds/kv_state_cache_manager.h" +#include "llm-cache/ds/kv_cache_manager.h" using namespace vineyard; // NOLINT(build/namespaces) -constexpr int tensorBytes = 80; +constexpr int tensorNBytes = 80; constexpr int capacity = 5; constexpr int layer = 3; constexpr int block_size = 4; @@ -34,7 +34,7 @@ std::vector round_2_tokens = {1, 2, 4, 9, 10}; // split to two blocks std::vector> round_token_list = {round_1_tokens, round_2_tokens}; -std::vector> kv_state_cache_managers; +std::vector> kv_cache_managers; std::vector> blob_storages; std::string llmCacheObjectName = "refcnt_map_test_cache_object"; std::string llmCacheSyncLock = "refcnt_map_test_cache_lock"; @@ -58,10 +58,10 @@ void print_kv_state(const std::vector>& kv_state) { reinterpret_cast(kv_state[i].first.data); uint8_t* value_state_data = reinterpret_cast(kv_state[i].second.data); - // print the first tensorBytes bytes + // print the first tensorNBytes bytes std::string key_state_str = ""; std::string value_state_str = ""; - for (int j = 0; j < tensorBytes; j++) { + for (int j = 0; j < tensorNBytes; j++) { key_state_str += std::to_string(key_state_data[j]) + " "; value_state_str += std::to_string(value_state_data[j]) + " "; } @@ -78,13 +78,13 @@ std::vector> generate_kv_state(int token) { for (int currentLayer = 0; currentLayer < layer; currentLayer++) { LLMKV key_state; LLMKV value_state; - key_state.data = malloc(tensorBytes); - value_state.data = malloc(tensorBytes); + key_state.data = malloc(tensorNBytes); + value_state.data = malloc(tensorNBytes); - key_state.length = tensorBytes; - value_state.length = tensorBytes; + key_state.length = tensorNBytes; + value_state.length = tensorNBytes; - for (int i = 0; i < tensorBytes; ++i) { + for (int i = 0; i < tensorNBytes; ++i) { (reinterpret_cast(key_state.data))[i] = (static_cast(token)) + i + currentLayer; (reinterpret_cast(value_state.data))[i] = @@ -100,12 +100,12 @@ void check_kv_state(const std::vector>& kv_state, int& token) { VINEYARD_ASSERT(kv_state.size() == (size_t) layer); for (size_t index = 0; index < kv_state.size(); ++index) { - 
VINEYARD_ASSERT(kv_state[index].first.length == (size_t) tensorBytes); - VINEYARD_ASSERT(kv_state[index].second.length == (size_t) tensorBytes); - for (int i = 0; i < tensorBytes; ++i) { + VINEYARD_ASSERT(kv_state[index].first.length == (size_t) tensorNBytes); + VINEYARD_ASSERT(kv_state[index].second.length == (size_t) tensorNBytes); + for (int i = 0; i < tensorNBytes; ++i) { if ((reinterpret_cast(kv_state[index].first.data))[i] != (static_cast(token)) + i + index) { - LOG(INFO) << "token:" << token << " tensorBytes" << tensorBytes + LOG(INFO) << "token:" << token << " tensorNBytes" << tensorNBytes << " layer:" << index; LOG(INFO) << "key_state[" << i << "]: " << (reinterpret_cast(kv_state[index].first.data))[i] @@ -115,7 +115,7 @@ void check_kv_state(const std::vector>& kv_state, } if (reinterpret_cast(kv_state[index].second.data)[i] != (static_cast(token)) + i + index) { - LOG(INFO) << "token:" << token << " tensorBytes" << tensorBytes + LOG(INFO) << "token:" << token << " tensorNBytes" << tensorNBytes << " layer:" << index; LOG(INFO) << "value_state[" << i << "]: " << (reinterpret_cast( @@ -128,7 +128,7 @@ void check_kv_state(const std::vector>& kv_state, } } -void inference(std::shared_ptr& kv_state_cache_manager, +void inference(std::shared_ptr& kv_cache_manager, std::vector tokens, size_t begin = 0) { std::vector inference_tokens; std::vector> kv_state; @@ -140,16 +140,16 @@ void inference(std::shared_ptr& kv_state_cache_manager, for (int current_layer = 0; current_layer < layer; current_layer++) { kv_state_to_query.emplace_back(LLMKV{nullptr, 0}, LLMKV{nullptr, 0}); } - Status result = kv_state_cache_manager->Query(inference_tokens, tokens[i], - kv_state_to_query); + Status result = kv_cache_manager->Query(inference_tokens, tokens[i], + kv_state_to_query); if (!result.ok()) { LOG(INFO) << "Can not find the kv_state from cache:"; print_current_tokens(inference_tokens, tokens[i]); LOG(INFO) << "Generate the kv_state and update the cache."; kv_state = generate_kv_state(tokens[i]); // print_kv_state(kv_state); - Status status = kv_state_cache_manager->Update(inference_tokens, - tokens[i], kv_state); + Status status = + kv_cache_manager->Update(inference_tokens, tokens[i], kv_state); if (!status.ok()) { // Not a error. May be the cache is full. 
LOG(INFO) << "Put kv state into cache failed:" << status.ToString(); @@ -180,18 +180,18 @@ Status checkRefCnt(std::string ipc_socket) { std::vector> treeDataSets; treeDataSets.push_back(blob_storages[0] - ->GetKVStateCacheBuilder() + ->GetKVCacheBuilder() ->GetRootTree() ->GetSubTreeDataSet()); treeDataSets.push_back(blob_storages[1] - ->GetKVStateCacheBuilder() + ->GetKVCacheBuilder() ->GetRootTree() ->GetSubTreeDataSet()); LOG(INFO) << raxShow( - blob_storages[0]->GetKVStateCacheBuilder()->GetRootTree()->GetRootTree()); + blob_storages[0]->GetKVCacheBuilder()->GetRootTree()->GetRootTree()); LOG(INFO) << "------------------------------------------"; LOG(INFO) << raxShow( - blob_storages[1]->GetKVStateCacheBuilder()->GetRootTree()->GetRootTree()); + blob_storages[1]->GetKVCacheBuilder()->GetRootTree()->GetRootTree()); std::shared_ptr refcnt_map = std::make_shared(client[0]); @@ -211,13 +211,12 @@ Status checkRefCnt(std::string ipc_socket) { ObjectID globalCacheObjectID; blockIDSetToAdd.clear(); VINEYARD_CHECK_OK(client[0].GetName(llmCacheObjectName, globalCacheObjectID)); - std::shared_ptr kvStateCache = - std::dynamic_pointer_cast( - client[0].FetchAndGetObject(globalCacheObjectID)); - kvStateCache->GetCurrentBlockIDSet(blockIDSetToAdd); + std::shared_ptr kvCache = std::dynamic_pointer_cast( + client[0].FetchAndGetObject(globalCacheObjectID)); + kvCache->GetCurrentBlockIDSet(blockIDSetToAdd); refcnt_map->IncSetRefcnt(blockIDSetToAdd); - if (kvStateCache->id() != globalCacheObjectID) { - client[0].DelData(kvStateCache->id()); + if (kvCache->id() != globalCacheObjectID) { + client[0].DelData(kvCache->id()); } LOG(INFO) << "Prepare refcnt done"; @@ -244,20 +243,20 @@ void threadFunc(std::string socket, int threadId) { sleep(4); } - std::shared_ptr kv_state_cache_manager; + std::shared_ptr kv_cache_manager; std::shared_ptr blob_storage; VINEYARD_CHECK_OK(BlobStorage::Make( - client[threadId], blob_storage, tensorBytes, capacity, layer, block_size, + client[threadId], blob_storage, tensorNBytes, capacity, layer, block_size, 3, llmCacheSyncLock, llmCacheObjectName, llmRefcntObjectName)); blob_storages.push_back(blob_storage); - kv_state_cache_manager = std::make_shared(blob_storage); - kv_state_cache_managers.push_back(kv_state_cache_manager); + kv_cache_manager = std::make_shared(blob_storage); + kv_cache_managers.push_back(kv_cache_manager); std::vector tokenList = round_token_list[threadId]; if (threadId == 1) { - inference(kv_state_cache_manager, tokenList, 2); + inference(kv_cache_manager, tokenList, 2); } else { - inference(kv_state_cache_manager, tokenList); + inference(kv_cache_manager, tokenList); } sleep(5); @@ -292,7 +291,7 @@ int main(int argc, char** argv) { std::string sockets[2]; if (argc < 2) { printf( - "Usage ./kv_state_cache_test " + "Usage ./kv_cache_test " " -d \n"); return 1; } @@ -315,8 +314,8 @@ int main(int argc, char** argv) { VINEYARD_CHECK_OK(checkRefCnt(sockets[0])); for (int i = 0; i < 2; i++) { - kv_state_cache_managers[i]->Close(); - kv_state_cache_managers[i] = nullptr; + kv_cache_managers[i]->Close(); + kv_cache_managers[i] = nullptr; } LOG(INFO) << "Clear global object"; diff --git a/modules/llm-cache/thread_group.h b/modules/llm-cache/thread_group.h index 335f96fdf9..3e7cc8be70 100644 --- a/modules/llm-cache/thread_group.h +++ b/modules/llm-cache/thread_group.h @@ -42,13 +42,10 @@ namespace vineyard { namespace parallel { class ThreadGroup { + public: using tid_t = uint32_t; - // Returns the path index and task status for parallel execution. 
- // The path index is used to identify and delete results of unsuccessful - // tasks. - using return_t = std::pair; + using return_t = Status; - public: explicit ThreadGroup( uint32_t parallelism = std::thread::hardware_concurrency()); @@ -67,7 +64,7 @@ class ThreadGroup { try { return std::move(_f(std::forward(_args)...)); } catch (std::exception& e) { - return std::pair(-1, Status(StatusCode::kUnknownError, e.what())); + return Status(StatusCode::kUnknownError, e.what()); } }; @@ -114,10 +111,10 @@ class ThreadGroup { * @AddTask@ will be blocked until there are spare thread resources. */ class DynamicThreadGroup { + public: using tid_t = uint32_t; using return_t = Status; - public: explicit DynamicThreadGroup( tid_t parallelism = std::thread::hardware_concurrency()); diff --git a/python/vineyard/llm/__init__.py b/python/vineyard/llm/__init__.py index 99a994518e..07511e257b 100644 --- a/python/vineyard/llm/__init__.py +++ b/python/vineyard/llm/__init__.py @@ -16,167 +16,7 @@ # limitations under the License. # -from typing import Dict -from typing import List -from typing import Optional -from typing import Tuple -from typing import Union - -import numpy as np - -from .config import FileCacheConfig -from .config import VineyardCacheConfig -from .llm_C import KVTensor -from .llm_C import _generate - - -class KVCache: # pylint: disable=too-many-instance-attributes - """KVCache is a class that manages the llm kv cache in vineyard.""" - - def __init__( - self, - cache_config: Union[VineyardCacheConfig, FileCacheConfig], - tensor_bytes: int = 10, - cache_capacity: int = 10, - layer: int = 1, - **kwargs - ): - """Create a llm kv cache manager based on vineyard blob. - - Args: - cache_config (Union[VineyardCacheConfig, FileCacheConfig]): - The config of the kv cache, including vineyard cache and file cache. - tensor_bytes (int, optional): - The size of the kv cache tensor. - Defaults to 10. - cache_capacity (int, optional): - The capacity of the KV cache refers to the maximum number of - tokens it can hold. Defaults to 10. - layer (int, optional): - The number of layers of the kv cache. Defaults to 1. - """ - self.kv_cache_manager = None - if not isinstance(cache_config, VineyardCacheConfig) and not isinstance( - cache_config, FileCacheConfig - ): - raise ValueError( - "The cache_config should be VineyardCacheConfig or FileCacheConfig." - ) - self.tensor_bytes = tensor_bytes - self.cache_capacity = cache_capacity - self.layer = layer - - self.kv_cache_manager = _generate( - tensor_bytes=tensor_bytes, - cache_capacity=cache_capacity, - layer=layer, - **cache_config.__dict__, - **kwargs - ) - - def update( - self, - prefix: Optional[List[int]], - tokens: List[int], - kv_state_list: List[List[Tuple[KVTensor, KVTensor]]], - ) -> int: - """Update the kv cache stored in vineyard. - - Args: - prefix (list): the prefix of the tokens - For FileCacheConfig, the length of the prefix should be - multiple of the chunk size. - tokens (list): the tokens of the kv cache - e,g, [1 2 3 4] - kv_cache_list (List[List[Tuple[KVTensor, KVTensor]]]): - the kv tensors list of the related tokens including all layers, and - its length should be the same as the length of tokens. - - The k, v tensor for i-th token at the j-th layer is: kv_state_list[i][j] - - Whether the underlying kv cache is vineyard or file, the - kv_state_list is managed by the caller. - Assume the layer is 2, the tokens is [1, 2], then you should allocate - the kv_state_list as follows: - - .. 
code:: python - - kv_state_list = [] - for _ in range(2): # the number of tokens - k_tensor = np.random.rand(2,2).astype(np.float32) - v_tensor = np.random.rand(2,2).astype(np.float32) - kv_state_list.append( - [ - ( - KVTensor(k_tensor.ctypes.data, k_tensor.nbytes), - KVTensor(v_tensor.ctypes.data, v_tensor.nbytes), - ) - for _ in range(2) # the number of layers - ] - ) - - """ - if prefix: - return self.kv_cache_manager.update(prefix, tokens, kv_state_list) - else: - return self.kv_cache_manager.update(tokens, kv_state_list) - - def query( - self, - tokens: List[int], - kv_state_list: List[List[Tuple[KVTensor, KVTensor]]], - ) -> int: - """Query the kv cache stored in vineyard. - - Args: - tokens (list): the tokens of the kv cache - e,g, [1 2 3 4] - kv_state_list: (List[List[Tuple[KVTensor, KVTensor]]]): - the kv tensors list of the related tokens including all layers, and its - length should be the same as the length of tokens. - - The k, v tensor for i-th token at the j-th layer is: kv_state_list[i][j] - - For VineyardConfigCache, the kv_state_list is managed by vineyard. - The caller does not need to malloc and free the memory of the kv state. - Assume the layer is 2, the tokens is [1, 2], then you should allocate - the kv_state_list as follows: - - .. code:: python - - kv_state_list = [ - ( - KVTensor(0, 0), - KVTensor(0, 0), - ) for _ in range(2) # the number of layers - ] * 2 # the number of tokens - - For FileCacheConfig, the kv_state_list is managed by the caller. - The caller needs to malloc and free the memory of the kv state. - Assume the layer is 2, the tokens is [1, 2], then you should allocate - the kv_state_list as follows: - - .. code:: python - - kv_state_list = [] - for _ in range(2): # the number of tokens - k_tensor = np.empty((2,2), dtype=np.float32) - v_tensor = np.empty((2,2), dtype=np.float32) - kv_state_list.append( - [ - ( - KVTensor(k_tensor.ctypes.data, k_tensor.nbytes), - KVTensor(v_tensor.ctypes.data, v_tensor.nbytes), - ) - for _ in range(2) # the number of layers - ] - ) - - Returns: - int: The number of matched tokens. - """ - return self.kv_cache_manager.query(tokens, kv_state_list) - - def __del__(self): - if self.kv_cache_manager: - self.kv_cache_manager.close() +from vineyard.llm.cache import FileCacheConfig +from vineyard.llm.cache import KVCache +from vineyard.llm.cache import KVTensor +from vineyard.llm.cache import VineyardCacheConfig diff --git a/python/vineyard/llm/cache.cc b/python/vineyard/llm/cache.cc new file mode 100644 index 0000000000..b3de645bbf --- /dev/null +++ b/python/vineyard/llm/cache.cc @@ -0,0 +1,193 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/python/vineyard/llm/cache.cc b/python/vineyard/llm/cache.cc
new file mode 100644
index 0000000000..b3de645bbf
--- /dev/null
+++ b/python/vineyard/llm/cache.cc
@@ -0,0 +1,193 @@
+/** Copyright 2020-2023 Alibaba Group Holding Limited.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include <memory>
+
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+
+#include "client/client.h"
+
+#include "llm-cache/ds/config.h"
+#include "llm-cache/ds/kv_cache_block.h"
+#include "llm-cache/ds/kv_cache_manager.h"
+
+namespace py = pybind11;
+
+namespace vineyard {
+
+PYBIND11_MODULE(_llm_C, m) {
+  m.doc() = "vineyard llm kv cache manager module";
+
+  pybind11::enum_<FilesystemType>(m, "FilesystemType")
+      .value("LOCAL", FilesystemType::LOCAL)
+      .export_values();
+
+  py::class_<LLMKV, std::shared_ptr<LLMKV>>(m, "KVTensor",
+                                            py::buffer_protocol())
+      .def(py::init([](uintptr_t data, size_t length) {
+             return LLMKV{reinterpret_cast<void*>(data), length};
+           }),
+           py::arg("data"), py::arg("length"))
+      .def_property(
+          "data",
+          [](LLMKV& self) -> uintptr_t {  // getter
+            return reinterpret_cast<uintptr_t>(self.data);
+          },
+          [](LLMKV& self, uintptr_t new_ptr) {  // setter
+            self.data = reinterpret_cast<void*>(new_ptr);
+          })
+      .def_property(
+          "length",
+          [](LLMKV& self) -> size_t {  // getter
+            return self.length;
+          },
+          [](LLMKV& self, size_t new_length) {  // setter
+            self.length = new_length;
+          })
+      .def_buffer([](LLMKV& self) -> py::buffer_info {
+        return py::buffer_info(self.data, sizeof(char),
+                               py::format_descriptor<char>::value, 1,
+                               {self.length}, {sizeof(char)});
+      });
+
+  py::class_<KVCacheManager, std::shared_ptr<KVCacheManager>>(m,
+                                                              "KVCacheManager")
+      .def(py::init([](py::object ipc_client, int tensor_nbytes,
+                       int cache_capacity, int layer, int block_size,
+                       int sync_interval, std::string llm_cache_sync_lock,
+                       std::string llm_cache_object_name,
+                       std::string llm_ref_cnt_object_name)
+                        -> std::shared_ptr<KVCacheManager> {
+             VineyardCacheConfig config(
+                 tensor_nbytes, cache_capacity, layer, block_size,
+                 sync_interval, llm_cache_sync_lock, llm_cache_object_name,
+                 llm_ref_cnt_object_name);
+             Client& client = ipc_client.cast<Client&>();
+             std::shared_ptr<KVCacheManager> manager;
+             VINEYARD_CHECK_OK(
+                 vineyard::KVCacheManager::Make(client, manager, config));
+             return manager;
+           }),
+           py::arg("ipc_client"), py::arg("tensor_nbytes") = 1024,
+           py::arg("cache_capacity") = 1024, py::arg("layer") = 1,
+           py::arg("block_size") = 16, py::arg("sync_interval") = 3,
+           py::arg("llm_cache_sync_lock") = "llmCacheSyncLock",
+           py::arg("llm_cache_object_name") = "llm_cache_object",
+           py::arg("llm_ref_cnt_object_name") = "llm_refcnt_object")
+      .def(py::init([](int tensor_nbytes, int cache_capacity, int layer,
+                       int chunk_size, int hash_chunk_size, std::string root,
+                       FilesystemType filesystemType, int gc_interval, int ttl,
+                       bool enable_global_gc, int global_gc_interval,
+                       int global_ttl) -> std::shared_ptr<KVCacheManager> {
+             FileCacheConfig config(
+                 tensor_nbytes, cache_capacity, layer, chunk_size,
+                 hash_chunk_size, root, filesystemType, gc_interval, ttl,
+                 enable_global_gc, global_gc_interval, global_ttl);
+             std::shared_ptr<KVCacheManager> manager;
+             VINEYARD_CHECK_OK(vineyard::KVCacheManager::Make(manager, config));
+             return manager;
+           }),
+           py::arg("tensor_nbytes") = 1024, py::arg("cache_capacity") = 1024,
+           py::arg("layer") = 1, py::arg("chunk_size") = 16,
+           py::arg("hash_chunk_size") = 4, py::arg("root") = "root",
+           py::arg("filesystem_type") = FilesystemType::LOCAL,
+           py::arg("gc_interval") = 30 * 60, py::arg("ttl") = 30 * 60,
+           py::arg("enable_global_gc") = false,
+           py::arg("global_gc_interval") = 30 * 60,
+           py::arg("global_ttl") = 30 * 60)
+      .def(
+          "update",
+          [](KVCacheManager* self, const std::vector<int>& tokenList,
+             int& next_token,
+             const std::vector<std::pair<LLMKV, LLMKV>>& kv_state) {
+            VINEYARD_CHECK_OK(self->Update(tokenList, next_token, kv_state));
+          },
+          py::arg("tokens"), py::arg("next_token"), py::arg("kv_state"))
+      .def(
+          "update",
+          [](KVCacheManager* self, const std::vector<int>& tokens,
+             const std::vector<std::vector<std::pair<LLMKV, LLMKV>>>&
+                 kv_states) -> size_t {
+            size_t updated = 0;
+            VINEYARD_CHECK_OK(self->Update(tokens, kv_states, updated));
+            return updated;
+          },
+          py::arg("tokens"), py::arg("kv_states"))
+      .def(
+          "update",
+          [](KVCacheManager* self, const std::vector<int>& prefix,
+             std::vector<int>& tokens,
+             const std::vector<std::vector<std::pair<LLMKV, LLMKV>>>&
+                 kv_states) -> size_t {
+            size_t updated = 0;
+            VINEYARD_CHECK_OK(self->Update(prefix, tokens, kv_states, updated));
+            return updated;
+          },
+          py::arg("prefix"), py::arg("tokens"), py::arg("kv_states"))
+      .def(
+          "query",
+          [](KVCacheManager* self, const std::vector<int>& tokens,
+             py::list& kv_cache_list) -> size_t {
+            std::vector<std::vector<std::pair<LLMKV, LLMKV>>> kv_state_vec =
+                kv_cache_list
+                    .cast<std::vector<std::vector<std::pair<LLMKV, LLMKV>>>>();
+            size_t matched = 0;
+            VINEYARD_CHECK_OK(self->Query(tokens, kv_state_vec, matched));
+            for (size_t i = 0; i < kv_state_vec.size() && i < matched; ++i) {
+              for (size_t j = 0; j < kv_state_vec[i].size(); ++j) {
+                kv_cache_list[i].cast<py::list>()[j] =
+                    py::cast(kv_state_vec[i][j]);
+              }
+            }
+            return matched;
+          },
+          py::arg("tokens"), py::arg("kv_states"))
+      .def(
+          "query",
+          [](KVCacheManager* self, const std::vector<int>& prefix,
+             int& next_token, py::list& kv_state) {
+            std::vector<std::pair<LLMKV, LLMKV>> kv_state_vec =
+                kv_state.cast<std::vector<std::pair<LLMKV, LLMKV>>>();
+            VINEYARD_CHECK_OK(self->Query(prefix, next_token, kv_state_vec));
+            for (size_t i = 0; i < kv_state_vec.size(); ++i) {
+              kv_state[i] = py::cast(kv_state_vec[i]);
+            }
+          },
+          py::arg("prefix"), py::arg("next_token"), py::arg("kv_states"))
+      .def(
+          "query",
+          [](KVCacheManager* self, const std::vector<int>& prefix,
+             const std::vector<int>& tokens,
+             py::list& kv_cache_list) -> size_t {
+            std::vector<std::vector<std::pair<LLMKV, LLMKV>>> kv_state_vec =
+                kv_cache_list
+                    .cast<std::vector<std::vector<std::pair<LLMKV, LLMKV>>>>();
+            size_t matched = 0;
+            VINEYARD_CHECK_OK(
+                self->Query(prefix, tokens, kv_state_vec, matched));
+            for (size_t i = 0; i < kv_state_vec.size() && i < matched; ++i) {
+              for (size_t j = 0; j < kv_state_vec[i].size(); ++j) {
+                kv_cache_list[i].cast<py::list>()[j] =
+                    py::cast(kv_state_vec[i][j]);
+              }
+            }
+            return matched;
+          },
+          py::arg("prefix"), py::arg("tokens"), py::arg("kv_states"))
+      .def("close", [](KVCacheManager* self) { self->Close(); });
+}
+
+}  // namespace vineyard
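The `KVTensor` binding wraps `LLMKV` as a non-owning (pointer, length) view and
exposes it through the buffer protocol. A minimal sketch of how that behaves from
Python (it assumes the extension is importable and `KVTensor` is re-exported from
`vineyard.llm`, as the `__init__.py` change above shows):

.. code:: python

    import numpy as np

    from vineyard.llm import KVTensor

    k_tensor = np.random.rand(2, 2).astype(np.float32)
    # KVTensor wraps the existing buffer; it does not copy or own the memory,
    # so the numpy array must stay alive while the tensor is in use.
    tensor = KVTensor(k_tensor.ctypes.data, k_tensor.nbytes)
    assert tensor.length == k_tensor.nbytes
    # The buffer protocol exposes the same bytes back to Python.
    assert bytes(memoryview(tensor)) == k_tensor.tobytes()
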
diff --git a/python/vineyard/llm/cache.py b/python/vineyard/llm/cache.py
new file mode 100644
index 0000000000..3e8859afd9
--- /dev/null
+++ b/python/vineyard/llm/cache.py
@@ -0,0 +1,378 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright 2020-2023 Alibaba Group Holding Limited.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import contextlib
+import logging
+import os
+from typing import Any
+from typing import Dict
+from typing import List
+from typing import Optional
+from typing import Tuple
+from typing import Union
+
+from ._llm_C import FilesystemType
+from ._llm_C import KVCacheManager
+from ._llm_C import KVTensor
+
+logger = logging.getLogger('vineyard')
+
+
+def _argument_from_env(
+    kwargs: Dict[str, Any],
+    envprefix: str,
+    name: str,
+    dtype=None,
+):
+    envname = f'{envprefix}_{name.upper()}'
+    if envname in os.environ:
+        value = os.environ.get(envname)
+        if dtype:
+            value = dtype(value)
+        kwargs[name] = value
+
+
+class VineyardCacheConfig:
+    """VineyardCacheConfig is a class to configure the llm kv cache in vineyard."""
+
+    def __init__(
+        self,
+        socket: str,
+        block_size: int = 5,
+        sync_interval: int = 3,
+        llm_cache_sync_lock: str = "llmCacheSyncLock",
+        llm_cache_object_name: str = "llm_cache_object",
+        llm_ref_cnt_object_name: str = "llm_refcnt_object",
+    ):
+        """Create a vineyard cache config.
+
+        Args:
+            socket (str):
+                The ipc socket of the vineyardd instance.
+            block_size (int, optional):
+                The block size of the kv cache. Defaults to 5.
+            sync_interval (int, optional):
+                The sync interval of the kv cache. Defaults to 3.
+            llm_cache_sync_lock (str, optional):
+                The name of the kv cache sync lock. Defaults to "llmCacheSyncLock".
+            llm_cache_object_name (str, optional):
+                The name of the kv cache object. Defaults to "llm_cache_object".
+            llm_ref_cnt_object_name (str, optional):
+                The name of the kv cache ref cnt object.
+                Defaults to "llm_refcnt_object".
+        """
+        import vineyard
+
+        self.socket = socket
+        self.block_size = block_size
+        self.sync_interval = sync_interval
+        self.llm_cache_sync_lock = llm_cache_sync_lock
+        self.llm_cache_object_name = llm_cache_object_name
+        self.llm_ref_cnt_object_name = llm_ref_cnt_object_name
+
+        # Connecting to vineyardd
+        self.ipc_client = vineyard.connect(socket).ipc_client
+
+    def __repr__(self):
+        return (
+            f'VineyardCacheConfig('
+            f'socket={self.socket}, '
+            f'block_size={self.block_size}, '
+            f'sync_interval={self.sync_interval}, '
+            f'llm_cache_sync_lock={self.llm_cache_sync_lock}, '
+            f'llm_cache_object_name={self.llm_cache_object_name}, '
+            f'llm_ref_cnt_object_name={self.llm_ref_cnt_object_name})'
+        )
+
+
+class FileCacheConfig:
+    """FileCacheConfig is a class to configure the llm kv cache on filesystem."""
+
+    def __init__(
+        self,
+        chunk_size: int = 16,
+        hash_chunk_size: int = 2,
+        root: str = "/tmp/vineyard/llm_cache",
+        filesystem_type: FilesystemType = FilesystemType.LOCAL,
+        gc_interval: int = 30 * 60,
+        ttl: int = 30 * 60,
+        enable_global_gc: bool = False,
+        global_gc_interval: int = 3 * 60 * 60,
+        global_ttl: int = 3 * 60 * 60,
+    ):
+        """Create a file cache config.
+
+        Args:
+            chunk_size (int):
+                Divide the token list into batches, each batch
+                containing chunk_size tokens. Defaults to 16.
+            hash_chunk_size (int):
+                Split the hash value into multiple directory levels,
+                hash_chunk_size characters per level; e.g., with
+                hash_chunk_size=2 and hash value 123456, the file path
+                is 12/34/56.
+            root (str):
+                The root directory of the kv state files.
+                Defaults to "/tmp/vineyard/llm_cache".
+            filesystem_type (FilesystemType):
+                The type of the filesystem. Defaults to FilesystemType.LOCAL.
+            gc_interval (int):
+                The interval of the client gc (seconds).
+                Defaults to 30 * 60 seconds.
+            ttl (int):
+                The time to live of the kv state files (seconds).
+                Defaults to 30 * 60 seconds.
+            enable_global_gc (bool):
+                Enable the global gc or not. Defaults to False.
+            global_gc_interval (int):
+                The interval of the global gc (seconds).
+                Defaults to 3 * 60 * 60 seconds.
+            global_ttl (int):
+                The time to live of the global gc files (seconds).
+                Defaults to 3 * 60 * 60 seconds.
+        """
+        self.chunk_size = chunk_size
+        self.hash_chunk_size = hash_chunk_size
+        self.root = root
+        self.filesystem_type = filesystem_type
+        self.gc_interval = gc_interval
+        self.ttl = ttl
+        self.enable_global_gc = enable_global_gc
+        self.global_gc_interval = global_gc_interval
+        self.global_ttl = global_ttl
+
+    def __repr__(self):
+        return (
+            f'FileCacheConfig('
+            f'chunk_size={self.chunk_size}, '
+            f'hash_chunk_size={self.hash_chunk_size}, '
+            f'root={self.root}, '
+            f'filesystem_type={self.filesystem_type}, '
+            f'gc_interval={self.gc_interval}, '
+            f'ttl={self.ttl}, '
+            f'enable_global_gc={self.enable_global_gc}, '
+            f'global_gc_interval={self.global_gc_interval}, '
+            f'global_ttl={self.global_ttl})'
+        )
+
+
+class KVCache:  # pylint: disable=too-many-instance-attributes
+    """KVCache is a class that manages the llm kv cache in vineyard."""
+
+    def __init__(
+        self,
+        cache_config: Optional[Union[VineyardCacheConfig, FileCacheConfig]] = None,
+        tensor_nbytes: int = 1024,
+        cache_capacity: int = 1024,
+        layer: int = 1,
+        rank: Optional[int] = None,
+        world_size: Optional[int] = None,
+        **kwargs,
+    ):
+        """Create an llm kv cache manager backed by vineyard blobs or files.
+
+        Args:
+            cache_config (Union[VineyardCacheConfig, FileCacheConfig]):
+                The config of the KV cache, including vineyard cache and file cache.
+            tensor_nbytes (int, optional):
+                The size of the k/v cache tensor for each token at each layer.
+                Defaults to 1024.
+            cache_capacity (int, optional):
+                The capacity of the KV cache refers to the maximum number of
+                tokens it can hold. Defaults to 1024.
+            layer (int, optional):
+                The number of layers of the kv cache. Defaults to 1.
+            rank (int, optional):
+                The rank of the current worker. Defaults to None.
+            world_size (int, optional):
+                The total number of workers. Defaults to None.
+        """
+        self.kv_cache_manager = None
+
+        if cache_config is None:
+            if 'VINEYARD_LLM_CACHE_SHARED_MEMORY' in os.environ:
+                config = {}
+                _argument_from_env(
+                    config, 'VINEYARD_LLM_CACHE_SHARED_MEMORY', 'socket', str
+                )
+                _argument_from_env(
+                    config, 'VINEYARD_LLM_CACHE_SHARED_MEMORY', 'block_size', int
+                )
+                _argument_from_env(
+                    config, 'VINEYARD_LLM_CACHE_SHARED_MEMORY', 'sync_interval', int
+                )
+                cache_config = VineyardCacheConfig(**config)
+            if 'VINEYARD_LLM_CACHE_FILESYSTEM' in os.environ:
+                config = {}
+                _argument_from_env(
+                    config, 'VINEYARD_LLM_CACHE_FILESYSTEM', 'chunk_size', int
+                )
+                _argument_from_env(
+                    config, 'VINEYARD_LLM_CACHE_FILESYSTEM', 'hash_chunk_size', int
+                )
+                _argument_from_env(
+                    config, 'VINEYARD_LLM_CACHE_FILESYSTEM', 'root', dtype=str
+                )
+                cache_config = FileCacheConfig(**config)
+
+        if rank is not None and world_size is not None:
+            if isinstance(cache_config, FileCacheConfig):
+                cache_config.root = os.path.join(
+                    cache_config.root, f'{world_size}-{rank}'
+                )
+
+        logger.info("Initializing vineyard llm cache with config: %r", cache_config)
+        if not isinstance(cache_config, VineyardCacheConfig) and not isinstance(
+            cache_config, FileCacheConfig
+        ):
+            raise ValueError(
+                "The cache_config should be VineyardCacheConfig or FileCacheConfig."
+            )
+        self.cache_config = cache_config
+        self.tensor_nbytes = tensor_nbytes
+        self.cache_capacity = cache_capacity
+        self.layer = layer
+
+        self.kv_cache_manager = KVCacheManager(
+            tensor_nbytes=tensor_nbytes,
+            cache_capacity=cache_capacity,
+            layer=layer,
+            **cache_config.__dict__,
+            **kwargs,
+        )
+        if isinstance(cache_config, VineyardCacheConfig):
+            self.chunk_size = cache_config.block_size
+        else:
+            self.chunk_size = cache_config.chunk_size
+
+    def __repr__(self):
+        return (
+            'KVCache('
+            f'cache_config={self.cache_config}, '
+            f'tensor_nbytes={self.tensor_nbytes}, '
+            f'cache_capacity={self.cache_capacity}, '
+            f'layer={self.layer})'
+        )
+
+    def update(
+        self,
+        prefix: List[int],
+        tokens: List[int],
+        kv_cache_list: List[List[Tuple[KVTensor, KVTensor]]],
+    ) -> int:
+        """Update the kv cache stored in vineyard.
+
+        Args:
+            prefix (list): the prefix of the tokens.
+                For FileCacheConfig, the length of the prefix should be
+                a multiple of the chunk size.
+            tokens (list): the tokens of the kv cache,
+                e.g., [1, 2, 3, 4]
+            kv_cache_list (List[List[Tuple[KVTensor, KVTensor]]]):
+                the kv tensors list of the related tokens including all layers, and
+                its length should be the same as the length of tokens.
+
+                The k, v tensor for i-th token at the j-th layer is: kv_cache_list[i][j]
+
+                Whether the underlying kv cache is vineyard or file, the
+                kv_cache_list is managed by the caller.
+                Assume the layer is 2, the tokens is [1, 2], then you should allocate
+                the kv_cache_list as follows:
+
+                .. code:: python
+
+                    kv_cache_list = []
+                    for _ in range(2): # the number of tokens
+                        k_tensor = np.random.rand(2,2).astype(np.float32)
+                        v_tensor = np.random.rand(2,2).astype(np.float32)
+                        kv_cache_list.append(
+                            [
+                                (
+                                    KVTensor(k_tensor.ctypes.data, k_tensor.nbytes),
+                                    KVTensor(v_tensor.ctypes.data, v_tensor.nbytes),
+                                )
+                                for _ in range(2) # the number of layers
+                            ]
+                        )
+
+        """
+        if prefix:
+            return self.kv_cache_manager.update(prefix, tokens, kv_cache_list)
+        else:
+            return self.kv_cache_manager.update(tokens, kv_cache_list)
+
+    def query(
+        self,
+        prefix: List[int],
+        tokens: List[int],
+        kv_cache_list: List[List[Tuple[KVTensor, KVTensor]]],
+    ) -> int:
+        """Query the kv cache stored in vineyard.
+
+        Args:
+            prefix (list): the prefix of the tokens.
+            tokens (list): the tokens of the kv cache,
+                e.g., [1, 2, 3, 4]
+            kv_cache_list (List[List[Tuple[KVTensor, KVTensor]]]):
+                the kv tensors list of the related tokens including all layers, and its
+                length should be the same as the length of tokens.
+
+                The k, v tensor for i-th token at the j-th layer is: kv_cache_list[i][j]
+
+                For VineyardCacheConfig, the kv_cache_list is managed by vineyard.
+                The caller does not need to malloc and free the memory of the kv state.
+                Assume the layer is 2, the tokens is [1, 2], then you should allocate
+                the kv_cache_list as follows:
+
+                .. code:: python
+
+                    kv_cache_list = [
+                        (
+                            KVTensor(0, 0),
+                            KVTensor(0, 0),
+                        ) for _ in range(2) # the number of layers
+                    ] * 2 # the number of tokens
+
+                For FileCacheConfig, the kv_cache_list is managed by the caller.
+                The caller needs to malloc and free the memory of the kv state.
+                Assume the layer is 2, the tokens is [1, 2], then you should allocate
+                the kv_cache_list as follows:
+
+                .. code:: python
+
+                    kv_cache_list = []
+                    for _ in range(2): # the number of tokens
+                        k_tensor = np.empty((2,2), dtype=np.float32)
+                        v_tensor = np.empty((2,2), dtype=np.float32)
+                        kv_cache_list.append(
+                            [
+                                (
+                                    KVTensor(k_tensor.ctypes.data, k_tensor.nbytes),
+                                    KVTensor(v_tensor.ctypes.data, v_tensor.nbytes),
+                                )
+                                for _ in range(2) # the number of layers
+                            ]
+                        )
+
+        Returns:
+            int: The number of matched tokens.
+        """
+        if prefix:
+            return self.kv_cache_manager.query(prefix, tokens, kv_cache_list)
+        else:
+            return self.kv_cache_manager.query(tokens, kv_cache_list)
+
+    def __del__(self):
+        if self.kv_cache_manager:
+            with contextlib.suppress(Exception):
+                self.kv_cache_manager.close()
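Taken together, the new `cache.py` gives the following end-to-end flow. The sketch
below mirrors the patterns in `test_llm.py` further down in this patch; the chunk
sizes, tensor shapes, and layer count are illustrative only:

.. code:: python

    import numpy as np

    from vineyard.llm import FileCacheConfig
    from vineyard.llm import KVCache
    from vineyard.llm import KVTensor

    cache = KVCache(
        cache_config=FileCacheConfig(
            chunk_size=2,
            hash_chunk_size=2,
            root="/tmp/vineyard/llm_cache",
        ),
        tensor_nbytes=16,  # must match the nbytes of each k/v tensor
        cache_capacity=1024,
        layer=2,
    )

    tokens = [1, 2, 3, 4]  # length is a multiple of chunk_size

    # build the caller-managed kv tensors for update; keep the numpy
    # arrays alive, since KVTensor only wraps their buffers
    source_buffers = []
    kv_cache_list = []
    for _ in tokens:
        k_tensor = np.random.rand(2, 2).astype(np.float32)
        v_tensor = np.random.rand(2, 2).astype(np.float32)
        source_buffers.append((k_tensor, v_tensor))
        kv_cache_list.append(
            [
                (
                    KVTensor(k_tensor.ctypes.data, k_tensor.nbytes),
                    KVTensor(v_tensor.ctypes.data, v_tensor.nbytes),
                )
                for _ in range(2)  # the number of layers
            ]
        )
    cache.update(None, tokens, kv_cache_list)

    # for the file cache, query also reads into caller-allocated buffers
    query_buffers = [
        [
            (np.empty((2, 2), dtype=np.float32), np.empty((2, 2), dtype=np.float32))
            for _ in range(2)  # the number of layers
        ]
        for _ in tokens
    ]
    queried = [
        [
            (KVTensor(k.ctypes.data, k.nbytes), KVTensor(v.ctypes.data, v.nbytes))
            for k, v in per_token
        ]
        for per_token in query_buffers
    ]
    matched = cache.query(None, tokens, queried)

When `cache_config` is omitted, the constructor instead reads the
`VINEYARD_LLM_CACHE_FILESYSTEM_*` (or `VINEYARD_LLM_CACHE_SHARED_MEMORY_*`)
environment variables through `_argument_from_env`, which makes the cache
configurable from deployment manifests without code changes.
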
diff --git a/python/vineyard/llm/config.py b/python/vineyard/llm/config.py
deleted file mode 100644
index 1305d22602..0000000000
--- a/python/vineyard/llm/config.py
+++ /dev/null
@@ -1,113 +0,0 @@
-#! /usr/bin/env python
-# -*- coding: utf-8 -*-
-#
-# Copyright 2020-2023 Alibaba Group Holding Limited.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import vineyard
-
-from .llm_C import FilesystemType
-
-
-class VineyardCacheConfig:
-    """VineyardCacheConfig is a class to configure the llm kv cache in vineyard."""
-
-    def __init__(
-        self,
-        socket: str,
-        block_size: int = 5,
-        sync_interval: int = 3,
-        llm_cache_sync_lock: str = "llmCacheSyncLock",
-        llm_cache_object_name: str = "llm_cache_object",
-        llm_ref_cnt_object_name: str = "llm_refcnt_object",
-    ):
-        """Create a vineyard cache config.
-
-        Args:
-            socket (str):
-                The ipc socket of the vineyardd instance.
-            block_size (int, optional):
-                The block size of the kv cache. Defaults to 5.
-            sync_interval (int, optional):
-                The sync interval of the kv cache. Defaults to 3.
-            llm_cache_sync_lock (str, optional):
-                The name of the kv cache sync lock. Defaults to "llmCacheSyncLock".
-            llm_cache_object_name (str, optional):
-                The name of the kv cache object. Defaults to "llm_cache_object".
-            llm_ref_cnt_object_name (str, optional):
-                The name of the kv cache ref cnt object.
-                Defaults to "llm_refcnt_object".
-        """
-        self.ipc_client = vineyard.connect(socket).ipc_client
-        self.block_size = block_size
-        self.sync_interval = sync_interval
-        self.llm_cache_sync_lock = llm_cache_sync_lock
-        self.llm_cache_object_name = llm_cache_object_name
-        self.llm_ref_cnt_object_name = llm_ref_cnt_object_name
-
-
-class FileCacheConfig:
-    """FileCacheConfig is a class to configure the llm kv cache on filesystem."""
-
-    def __init__(
-        self,
-        chunk_size: int = 16,
-        split_number: int = 2,
-        root: str = "/tmp/vineyard/llm_cache",
-        filesystem_type: FilesystemType = FilesystemType.LOCAL,
-        client_gc_interval: int = 30 * 60,
-        ttl: int = 30 * 60,
-        enable_global_gc: bool = False,
-        global_gc_interval: int = 3 * 60 * 60,
-        global_ttl: int = 3 * 60 * 60,
-    ):
-        """Create a file cache config.
-
-        Args:
-            chunk_size (int):
-                Divide the token list into batches, each batch
-                contains batchSize tokens. Defaults to 16.
-            split_number (int):
-                Split the hash value into the file with multiple directories.
-                e.g, splitNumber=2, hash value=123456, the file path is 12/34/56.
-            root (str):
-                The root directory of the kv state files.
-                Defaults to "/tmp/vineyard/llm_cache".
-            filesystem_type (str):
-                The type of the filesystem. Defaults to "local".
-            client_gc_interval (int):
-                The interval of the client gc (seconds).
-                Defaults to 30 * 60 seconds.
-            ttl (int):
-                The time to live of the kv state files (seconds).
-                Defaults to 30 * 60 seconds.
-            enable_global_gc (bool):
-                Enable the global gc or not. Defaults to False.
-            global_gc_interval (int):
-                The interval of the global gc (seconds).
-                Defaults to 3 * 60 * 60 seconds.
-            global_ttl (int):
-                The time to live of the global gc files (seconds).
-                Defaults to 3 * 60 * 60 seconds.
-        """
-        self.chunk_size = chunk_size
-        self.split_number = split_number
-        self.root = root
-        self.filesystem_type = filesystem_type
-        self.client_gc_interval = client_gc_interval
-        self.ttl = ttl
-        self.enable_global_gc = enable_global_gc
-        self.global_gc_interval = global_gc_interval
-        self.global_ttl = global_ttl
diff --git a/python/vineyard/llm/kv_state_cache.cc b/python/vineyard/llm/kv_state_cache.cc
deleted file mode 100644
index 88e333fc9d..0000000000
--- a/python/vineyard/llm/kv_state_cache.cc
+++ /dev/null
@@ -1,181 +0,0 @@
-/** Copyright 2020-2023 Alibaba Group Holding Limited.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-#include <memory>
-
-#include "pybind11/pybind11.h"
-#include "pybind11/stl.h"
-
-#include "client/client.h"
-
-#include "llm-cache/ds/config.h"
-#include "llm-cache/ds/kv_state_cache_block.h"
-#include "llm-cache/ds/kv_state_cache_manager.h"
-
-namespace py = pybind11;
-
-namespace vineyard {
-
-PYBIND11_MODULE(llm_C, m) {
-  m.doc() = "vineyard llm kv cache manager module";
-
-  pybind11::enum_<FilesystemType>(m, "FilesystemType")
-      .value("LOCAL", FilesystemType::LOCAL)
-      .export_values();
-
-  py::class_<LLMKV, std::shared_ptr<LLMKV>>(m, "KVTensor",
-                                            py::buffer_protocol())
-      .def(py::init([](uintptr_t data, size_t length) {
-             return LLMKV{reinterpret_cast<void*>(data), length};
-           }),
-           py::arg("data"), py::arg("length"))
-      .def_property(
-          "data",
-          [](LLMKV& self) -> uintptr_t {  // getter
-            return reinterpret_cast<uintptr_t>(self.data);
-          },
-          [](LLMKV& self, uintptr_t new_ptr) {  // setter
-            self.data = reinterpret_cast<void*>(new_ptr);
-          })
-      .def_property(
-          "length",
-          [](LLMKV& self) -> size_t {  // getter
-            return self.length;
-          },
-          [](LLMKV& self, size_t new_length) {  // setter
-            self.length = new_length;
-          })
-      .def_buffer([](LLMKV& self) -> py::buffer_info {
-        return py::buffer_info(self.data, sizeof(char),
-                               py::format_descriptor<char>::value, 1,
-                               {self.length}, {sizeof(char)});
-      });
-
-  py::class_<KVStateCacheManager, std::shared_ptr<KVStateCacheManager>>(
-      m, "KVStateCacheManager")
-      .def(
-          "update",
-          [](KVStateCacheManager* self, const std::vector<int>& tokenList,
-             int& next_token,
-             const std::vector<std::pair<LLMKV, LLMKV>>& kv_state) {
-            VINEYARD_CHECK_OK(self->Update(tokenList, next_token, kv_state));
-          },
-          py::arg("tokens"), py::arg("next_token"), py::arg("kv_state"))
-      .def(
-          "update",
-          [](KVStateCacheManager* self, const std::vector<int>& tokens,
-             const std::vector<std::vector<std::pair<LLMKV, LLMKV>>>&
-                 kv_states) -> size_t {
-            size_t updated = 0;
-            VINEYARD_CHECK_OK(self->Update(tokens, kv_states, updated));
-            return updated;
-          },
-          py::arg("tokens"), py::arg("kv_states"))
-      .def(
-          "update",
-          [](KVStateCacheManager* self, const std::vector<int>& prefix,
-             std::vector<int>& tokens,
-             const std::vector<std::vector<std::pair<LLMKV, LLMKV>>>&
-                 kv_states) -> size_t {
-            size_t updated = 0;
-            VINEYARD_CHECK_OK(self->Update(prefix, tokens, kv_states, updated));
-            return updated;
-          },
-          py::arg("prefix"), py::arg("tokens"), py::arg("kv_states"))
-      .def(
-          "query",
-          [](KVStateCacheManager* self, const std::vector<int>& tokens,
-             int& next_token, py::list& kv_state) {
-            std::vector<std::pair<LLMKV, LLMKV>> kv_state_vec =
-                kv_state.cast<std::vector<std::pair<LLMKV, LLMKV>>>();
-            VINEYARD_CHECK_OK(self->Query(tokens, next_token, kv_state_vec));
-            for (size_t i = 0; i < kv_state_vec.size(); ++i) {
-              kv_state[i] = py::cast(kv_state_vec[i]);
-            }
-          },
-          py::arg("tokens"), py::arg("next_token"), py::arg("kv_states"))
-      .def(
-          "query",
-          [](KVStateCacheManager* self, const std::vector<int>& tokens,
-             py::list& kv_state_list) -> size_t {
-            std::vector<std::vector<std::pair<LLMKV, LLMKV>>> kv_state_vec =
-                kv_state_list
-                    .cast<std::vector<std::vector<std::pair<LLMKV, LLMKV>>>>();
-            size_t matched = 0;
-            VINEYARD_CHECK_OK(self->Query(tokens, kv_state_vec, matched));
-            for (size_t i = 0; i < kv_state_vec.size() && i < matched; ++i) {
-              for (size_t j = 0; j < kv_state_vec[i].size(); ++j) {
-                kv_state_list[i].cast<py::list>()[j] =
-                    py::cast(kv_state_vec[i][j]);
-              }
-            }
-            return matched;
-          },
-          py::arg("tokens"), py::arg("kv_states"))
-      .def("close", [](KVStateCacheManager* self) { self->Close(); });
-
-  m.def(
-       "_generate",
-       [](py::object ipc_client, int tensor_bytes, int cache_capacity,
-          int layer, int block_size, int sync_interval,
-          std::string llm_cache_sync_lock, std::string llm_cache_object_name,
-          std::string llm_ref_cnt_object_name)
-           -> std::shared_ptr<KVStateCacheManager> {
-         std::shared_ptr<KVStateCacheManager> manager;
-         VineyardCacheConfig config(tensor_bytes, cache_capacity, layer,
-                                    block_size, sync_interval,
-                                    llm_cache_sync_lock, llm_cache_object_name,
-                                    llm_ref_cnt_object_name);
-         Client& client = ipc_client.cast<Client&>();
-         vineyard::Status status =
-             vineyard::KVStateCacheManager::Make(client, manager, config);
-         if (!status.ok()) {
-           throw std::runtime_error(status.ToString());
-         }
-         return manager;
-       },
-       py::arg("ipc_client"), py::arg("tensor_bytes") = 10,
-       py::arg("cache_capacity") = 10, py::arg("layer") = 1,
-       py::arg("block_size") = 5, py::arg("sync_interval") = 3,
-       py::arg("llm_cache_sync_lock") = "llmCacheSyncLock",
-       py::arg("llm_cache_object_name") = "llm_cache_object",
-       py::arg("llm_ref_cnt_object_name") = "llm_refcnt_object")
-      .def(
-          "_generate",
-          [](int tensor_bytes, int cache_capacity, int layer, int chunk_size,
-             int split_number, std::string root, FilesystemType filesystemType,
-             int client_gc_interval, int ttl, bool enable_global_gc,
-             int global_gc_interval,
-             int global_ttl) -> std::shared_ptr<KVStateCacheManager> {
-            std::shared_ptr<KVStateCacheManager> manager;
-            FileCacheConfig config(
-                tensor_bytes, cache_capacity, layer, chunk_size, split_number,
-                root, filesystemType, client_gc_interval, ttl, enable_global_gc,
-                global_gc_interval, global_ttl);
-            VINEYARD_CHECK_OK(
-                vineyard::KVStateCacheManager::Make(manager, config));
-            return manager;
-          },
-          py::arg("tensor_bytes") = 10, py::arg("cache_capacity") = 10,
-          py::arg("layer") = 1, py::arg("chunk_size") = 5,
-          py::arg("split_number") = 3, py::arg("root") = "root",
-          py::arg("filesystem_type") = FilesystemType::LOCAL,
-          py::arg("client_gc_interval") = 30 * 60, py::arg("ttl") = 30 * 60,
-          py::arg("enable_global_gc") = false,
-          py::arg("global_gc_interval") = 30 * 60,
-          py::arg("global_ttl") = 30 * 60);
-}
-
-}  // namespace vineyard
diff --git a/python/vineyard/llm/tests/test_llm.py b/python/vineyard/llm/tests/test_llm.py
index 014fef6e3e..57549e4c7f 100644
--- a/python/vineyard/llm/tests/test_llm.py
+++ b/python/vineyard/llm/tests/test_llm.py
@@ -35,8 +35,8 @@ def test_kv_cache_update_and_query_on_blob(vineyard_ipc_sockets):
     )
     cache = KVCache(
         cache_config=vineyard_cache_config,
-        tensor_bytes=16,  # should be the same as the nbytes of the tensor
-        cache_capacity=10,
+        tensor_nbytes=16,  # should be the same as the nbytes of the tensor
+        cache_capacity=1024,
         layer=2,
     )
 
@@ -100,13 +100,13 @@ def test_kv_cache_update_and_query_on_blob(vineyard_ipc_sockets):
 def test_kv_cache_update_and_query_on_fs():
     file_cache_config = FileCacheConfig(
         chunk_size=2,
-        split_number=2,
+        hash_chunk_size=2,
         root="/tmp/vineyard/llm_cache",
     )
     cache = KVCache(
         cache_config=file_cache_config,
-        tensor_bytes=16,  # should be the same as the nbytes of the tensor
-        cache_capacity=10,
+        tensor_nbytes=16,  # should be the same as the nbytes of the tensor
+        cache_capacity=1024,
         layer=2,
     )
 
diff --git a/setup_llm.py b/setup_llm.py
index 829730cdd0..a27e8f3e90 100644
--- a/setup_llm.py
+++ b/setup_llm.py
@@ -101,7 +101,7 @@ def find_llm_packages(root):
     package_dir={'vineyard.llm': 'python/vineyard/llm'},
     packages=find_llm_packages('python'),
     ext_modules=[
-        CopyCMakeExtension('vineyard.llm.llm_C'),
+        CopyCMakeExtension('vineyard.llm._llm_C'),
     ],
     cmdclass={
         'build_ext': build_ext_with_precompiled,
diff --git a/src/common/util/functions.h b/src/common/util/functions.h
index 1d245038af..eef3d6a610 100644
--- a/src/common/util/functions.h
+++ b/src/common/util/functions.h
@@ -23,8 +23,6 @@ limitations under the License.
 #include <regex>
 #include <string>
 
-#include "boost/algorithm/string/replace.hpp"
-
 #include "common/util/env.h"
 
 namespace vineyard {
@@ -37,7 +35,11 @@ inline std::string ExpandEnvironmentVariables(const std::string& text) {
     std::smatch match;
     while (std::regex_search(text_copy, match, env)) {
       std::string var = read_env(match.str(1).c_str());
-      boost::replace_first(text_copy, match[0].str(), var);
+      const std::string& matched = match[0].str();
+      size_t pos = text_copy.find(matched);
+      if (pos != std::string::npos) {
+        text_copy.replace(pos, matched.size(), var, 0, std::string::npos);
+      }
     }
     return text_copy;
   } catch (std::exception& e) {
diff --git a/test/runner.py b/test/runner.py
index 9250e55ab8..70615e1675 100755
--- a/test/runner.py
+++ b/test/runner.py
@@ -474,9 +474,9 @@ def run_vineyard_cpp_tests(meta, allocator, endpoints, tests):
     run_test(tests, 'tensor_test')
     run_test(tests, 'typename_test')
     run_test(tests, 'version_test')
-    run_test(tests, 'kv_state_cache_radix_tree_test')
-    run_test(tests, 'kv_state_cache_hash_test')
-    run_test(tests, 'kv_state_cache_local_file_test')
+    run_test(tests, 'kv_cache_radix_tree_test')
+    run_test(tests, 'kv_cache_hash_test')
+    run_test(tests, 'kv_cache_local_file_test')
     run_test(tests, 'local_file_storage_gc_test')
 
 
@@ -711,7 +711,7 @@ def run_llm_tests(meta, allocator, endpoints):
     subprocess.check_call(
         [
-            './build/bin/kv_state_cache_test',
+            './build/bin/kv_cache_test',
             '--client-num',
            '2',
            '--vineyard-ipc-sockets',