From f38c12e0b6a2101a436a9a8bb8d596ff746327ed Mon Sep 17 00:00:00 2001 From: Tao He Date: Mon, 17 Jun 2024 13:24:15 +0800 Subject: [PATCH] Enhance the implementation of llm cache Signed-off-by: Tao He --- CMakeLists.txt | 96 ++-- modules/basic/ds/arrow_utils.cc | 34 +- modules/llm-cache/CMakeLists.txt | 7 +- modules/llm-cache/README.md | 16 +- modules/llm-cache/ds/config.h | 22 +- .../ds/{kv_state_cache.cc => kv_cache.cc} | 218 ++++----- .../ds/{kv_state_cache.h => kv_cache.h} | 52 +- ...state_cache_block.cc => kv_cache_block.cc} | 142 +++--- ...v_state_cache_block.h => kv_cache_block.h} | 50 +- ...e_cache_manager.cc => kv_cache_manager.cc} | 235 ++++----- ...ate_cache_manager.h => kv_cache_manager.h} | 34 +- modules/llm-cache/hash/hasher.h | 22 +- modules/llm-cache/storage/blob_storage.cc | 311 ++++++------ modules/llm-cache/storage/blob_storage.h | 29 +- modules/llm-cache/storage/file_storage.cc | 450 ++++++++++-------- modules/llm-cache/storage/file_storage.h | 21 +- .../llm-cache/storage/local_file_storage.h | 14 +- modules/llm-cache/storage/storage.h | 15 +- modules/llm-cache/tests/k8s-test/worker.py | 20 +- .../tests/k8s-test/yamls/worker.yaml | 4 +- ...ark_test.cc => kv_cache_benchmark_test.cc} | 16 +- ...che_hash_test.cc => kv_cache_hash_test.cc} | 22 +- ...le_test.cc => kv_cache_local_file_test.cc} | 57 ++- ...ee_test.cc => kv_cache_radix_tree_test.cc} | 2 +- ...v_state_cache_test.cc => kv_cache_test.cc} | 63 ++- modules/llm-cache/tests/refcnt_map_test.cc | 77 ++- modules/llm-cache/thread_group.h | 11 +- python/vineyard/llm/__init__.py | 168 +------ python/vineyard/llm/cache.cc | 193 ++++++++ python/vineyard/llm/cache.py | 378 +++++++++++++++ python/vineyard/llm/config.py | 113 ----- python/vineyard/llm/kv_state_cache.cc | 181 ------- python/vineyard/llm/tests/test_llm.py | 10 +- setup_llm.py | 2 +- src/common/util/functions.h | 8 +- test/runner.py | 8 +- 36 files changed, 1632 insertions(+), 1469 deletions(-) rename modules/llm-cache/ds/{kv_state_cache.cc => kv_cache.cc} (61%) rename modules/llm-cache/ds/{kv_state_cache.h => kv_cache.h} (66%) rename modules/llm-cache/ds/{kv_state_cache_block.cc => kv_cache_block.cc} (70%) rename modules/llm-cache/ds/{kv_state_cache_block.h => kv_cache_block.h} (77%) rename modules/llm-cache/ds/{kv_state_cache_manager.cc => kv_cache_manager.cc} (81%) rename modules/llm-cache/ds/{kv_state_cache_manager.h => kv_cache_manager.h} (72%) rename modules/llm-cache/tests/{kv_state_cache_benchmark_test.cc => kv_cache_benchmark_test.cc} (92%) rename modules/llm-cache/tests/{kv_state_cache_hash_test.cc => kv_cache_hash_test.cc} (86%) rename modules/llm-cache/tests/{kv_state_cache_local_file_test.cc => kv_cache_local_file_test.cc} (83%) rename modules/llm-cache/tests/{kv_state_cache_radix_tree_test.cc => kv_cache_radix_tree_test.cc} (99%) rename modules/llm-cache/tests/{kv_state_cache_test.cc => kv_cache_test.cc} (84%) create mode 100644 python/vineyard/llm/cache.cc create mode 100644 python/vineyard/llm/cache.py delete mode 100644 python/vineyard/llm/config.py delete mode 100644 python/vineyard/llm/kv_state_cache.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 115aab1019..592c4e35ce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -856,49 +856,54 @@ if(BUILD_VINEYARD_CLIENT) list(APPEND VINEYARD_INSTALL_LIBS vineyard_client) endif() -if(BUILD_VINEYARD_PYTHON_BINDINGS) +if (BUILD_VINEYARD_PYTHON_BINDINGS) set(PYBIND11_PYTHON_VERSION 3) if(NOT (CMAKE_VERSION VERSION_LESS "3.27")) set(PYBIND11_FINDPYTHON ON) endif() 
add_subdirectory_static(thirdparty/pybind11) - set(PYTHON_BIND_FILES "python/client.cc" - "python/core.cc" - "python/error.cc" - "python/pybind11_docs.cc" - "python/pybind11_utils.cc" - "python/vineyard.cc") - pybind11_add_module(_C MODULE ${PYTHON_BIND_FILES}) - target_add_debuginfo(_C) - target_link_libraries(_C PRIVATE vineyard_client) - target_include_directories(_C PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/pybind11/include") - target_compile_options(_C PRIVATE -Wno-unused-value) - set_target_properties(_C PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/shared-lib") +endif() + +macro(setup_pybind11_module target relpath) + target_add_debuginfo(${target}) + target_link_libraries(${target} PRIVATE vineyard_client) + target_include_directories(${target} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/pybind11/include") + target_compile_options(${target} PRIVATE -Wno-unused-value) + set_target_properties(${target} PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/shared-lib") if(UNIX AND NOT APPLE) - target_add_link_options(_C PRIVATE OPTIONS -Wl,--exclude-libs=ALL) + target_add_link_options(${target} PRIVATE OPTIONS -Wl,--exclude-libs=ALL) endif() if(BUILD_VINEYARD_PYPI_PACKAGES AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - target_compile_options(_C PRIVATE -static) - target_add_link_options(_C PRIVATE OPTIONS -static) + target_compile_options(${target} PRIVATE -static) + target_add_link_options(${target} PRIVATE OPTIONS -static) else() - target_compile_options(_C PRIVATE -Os) - target_add_link_options(_C PRIVATE OPTIONS -Os) + target_compile_options(${target} PRIVATE -Os) + target_add_link_options(${target} PRIVATE OPTIONS -Os) endif() - file(RELATIVE_PATH RELATIVE_BUILD_PATH "${PROJECT_SOURCE_DIR}/python/vineyard" "${CMAKE_BINARY_DIR}/shared-lib") if(UNIX AND NOT APPLE) - set_target_properties(_C PROPERTIES + set_target_properties(${target} PROPERTIES BUILD_WITH_INSTALL_RPATH TRUE INSTALL_RPATH_USE_LINK_PATH TRUE - INSTALL_RPATH ".:\$ORIGIN:\$ORIGIN/${RELATIVE_BUILD_PATH}/:${CMAKE_INSTALL_PREFIX}/lib:${CMAKE_INSTALL_PREFIX}/lib64:${INSTALL_RPATH}") + INSTALL_RPATH ".:\$ORIGIN:\$ORIGIN/${relpath}/:${CMAKE_INSTALL_PREFIX}/lib:${CMAKE_INSTALL_PREFIX}/lib64:${INSTALL_RPATH}") endif() if(APPLE) - set_target_properties(_C PROPERTIES + set_target_properties(${target} PROPERTIES BUILD_WITH_INSTALL_RPATH TRUE INSTALL_RPATH_USE_LINK_PATH TRUE - INSTALL_RPATH ".;@loader_path;@loader_path/${RELATIVE_BUILD_PATH}/;${CMAKE_INSTALL_PREFIX}/lib;${CMAKE_INSTALL_PREFIX}/lib64;${INSTALL_RPATH}") + INSTALL_RPATH ".;@loader_path;@loader_path/${relpath}/;${CMAKE_INSTALL_PREFIX}/lib;${CMAKE_INSTALL_PREFIX}/lib64;${INSTALL_RPATH}") endif() +endmacro() +if(BUILD_VINEYARD_PYTHON_BINDINGS) + pybind11_add_module(_C MODULE "python/client.cc" + "python/core.cc" + "python/error.cc" + "python/pybind11_docs.cc" + "python/pybind11_utils.cc" + "python/vineyard.cc") + file(RELATIVE_PATH RELATIVE_BUILD_PATH "${PROJECT_SOURCE_DIR}/python/vineyard" "${CMAKE_BINARY_DIR}/shared-lib") + setup_pybind11_module(_C ${RELATIVE_BUILD_PATH}) add_custom_target(vineyard_client_python ALL COMMAND cp "$" "${PROJECT_SOURCE_DIR}/python/vineyard/" @@ -909,47 +914,16 @@ if(BUILD_VINEYARD_PYTHON_BINDINGS) endif() if(BUILD_VINEYARD_PYTHON_BINDINGS AND BUILD_VINEYARD_LLM_CACHE) - set(PYBIND11_PYTHON_VERSION 3) - if(NOT (CMAKE_VERSION VERSION_LESS "3.27")) - set(PYBIND11_FINDPYTHON ON) - endif() - file(GLOB PYTHON_BIND_FILES "python/vineyard/llm/kv_state_cache.cc") - pybind11_add_module(llm_C MODULE ${PYTHON_BIND_FILES}) - # 
make sure `vineyard_llm_cache` been built.
-    add_dependencies(llm_C vineyard_llm_cache)
-    target_link_libraries(llm_C PRIVATE vineyard_client vineyard_llm_cache)
-    target_include_directories(llm_C PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/pybind11/include")
-    target_compile_options(llm_C PRIVATE -Wno-unused-value)
-    set_target_properties(llm_C PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/shared-lib")
-    if(UNIX AND NOT APPLE)
-        target_add_link_options(llm_C PRIVATE OPTIONS -Wl,--exclude-libs=ALL)
-    endif()
-    if(BUILD_VINEYARD_PYPI_PACKAGES AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
-        target_compile_options(llm_C PRIVATE -static)
-        target_add_link_options(llm_C PRIVATE OPTIONS -static)
-    else()
-        target_compile_options(llm_C PRIVATE -Os)
-        target_add_link_options(llm_C PRIVATE OPTIONS -Os)
-    endif()
-
+    pybind11_add_module(_llm_C MODULE "python/vineyard/llm/cache.cc")
     file(RELATIVE_PATH RELATIVE_BUILD_PATH "${PROJECT_SOURCE_DIR}/python/vineyard/llm" "${CMAKE_BINARY_DIR}/shared-lib")
-    if(UNIX AND NOT APPLE)
-        set_target_properties(llm_C PROPERTIES
-                              BUILD_WITH_INSTALL_RPATH TRUE
-                              INSTALL_RPATH_USE_LINK_PATH TRUE
-                              INSTALL_RPATH ".:\$ORIGIN:\$ORIGIN/${RELATIVE_BUILD_PATH}/:${CMAKE_INSTALL_PREFIX}/lib:${CMAKE_INSTALL_PREFIX}/lib64:${INSTALL_RPATH}")
-    endif()
-    if(APPLE)
-        set_target_properties(llm_C PROPERTIES
-                              BUILD_WITH_INSTALL_RPATH TRUE
-                              INSTALL_RPATH_USE_LINK_PATH TRUE
-                              INSTALL_RPATH ".;@loader_path;@loader_path/${RELATIVE_BUILD_PATH}/;${CMAKE_INSTALL_PREFIX}/lib;${CMAKE_INSTALL_PREFIX}/lib64;${INSTALL_RPATH}")
-    endif()
-
+    setup_pybind11_module(_llm_C ${RELATIVE_BUILD_PATH})
+    # make sure `vineyard_llm_cache` is built.
+    add_dependencies(_llm_C vineyard_llm_cache)
+    target_link_libraries(_llm_C PRIVATE vineyard_client vineyard_llm_cache)
     add_custom_target(vineyard_llm_python ALL
-            COMMAND cp "$<TARGET_FILE:llm_C>" "${PROJECT_SOURCE_DIR}/python/vineyard/llm/"
-            DEPENDS llm_C
+            COMMAND cp "$<TARGET_FILE:_llm_C>" "${PROJECT_SOURCE_DIR}/python/vineyard/llm/"
+            DEPENDS _llm_C
             COMMENT "Copying llm kv cache python extensions."
VERBATIM) add_dependencies(vineyard_llm_python vineyard_client_python) diff --git a/modules/basic/ds/arrow_utils.cc b/modules/basic/ds/arrow_utils.cc index 55f7ec91b2..ab4785f0c3 100644 --- a/modules/basic/ds/arrow_utils.cc +++ b/modules/basic/ds/arrow_utils.cc @@ -34,7 +34,7 @@ namespace vineyard { namespace detail { static inline std::string string_join(std::vector const& srcs, - std::string const& sep) { + std::string const& sep) { std::stringstream ss; if (!srcs.empty()) { ss << srcs[0]; @@ -45,21 +45,23 @@ static inline std::string string_join(std::vector const& srcs, return ss.str(); } -static inline void string_split(std::vector &rs, std::string const &content, std::string const &patterns) { - size_t i = 0, k = 0; - while (i < content.size()) { - while (k < content.size()) { - char c = content[k]; - if (patterns.find_first_of(c) != std::string::npos) { - break; - } - k += 1; - } - if (i < k) { - rs.emplace_back(content.substr(i, k - i)); - } - i = k; - } +static inline void string_split(std::vector& rs, + std::string const& content, + std::string const& patterns) { + size_t i = 0, k = 0; + while (i < content.size()) { + while (k < content.size()) { + char c = content[k]; + if (patterns.find_first_of(c) != std::string::npos) { + break; + } + k += 1; + } + if (i < k) { + rs.emplace_back(content.substr(i, k - i)); + } + i = k; + } } } // namespace detail diff --git a/modules/llm-cache/CMakeLists.txt b/modules/llm-cache/CMakeLists.txt index a4e9d686d2..15f43a7cf6 100644 --- a/modules/llm-cache/CMakeLists.txt +++ b/modules/llm-cache/CMakeLists.txt @@ -20,10 +20,9 @@ target_link_libraries(vineyard_llm_cache PRIVATE libzstd_static ${GLOG_LIBRARIES target_link_libraries(vineyard_llm_cache PUBLIC vineyard_client) # install bundled thirdparty: rax and MurmurHash3 -install(DIRECTORY - ${PROJECT_SOURCE_DIR}/thirdparty/rax - ${PROJECT_SOURCE_DIR}/thirdparty/MurmurHash3 - ${PROJECT_SOURCE_DIR}/thirdparty/cityhash +install(DIRECTORY ${PROJECT_SOURCE_DIR}/thirdparty/rax + ${PROJECT_SOURCE_DIR}/thirdparty/MurmurHash3 + ${PROJECT_SOURCE_DIR}/thirdparty/cityhash DESTINATION include/vineyard/contrib # target directory FILES_MATCHING # install only matched files PATTERN "*.h" # select header files diff --git a/modules/llm-cache/README.md b/modules/llm-cache/README.md index 6616f67be2..6259728da8 100644 --- a/modules/llm-cache/README.md +++ b/modules/llm-cache/README.md @@ -31,7 +31,7 @@ In this section, we will compare the two methods in terms of latency and suitabl ## Usage -We provide [C++](https://github.com/v6d-io/v6d/blob/main/modules/llm-cache/ds/kv_state_cache_manager.h) and [Python](https://github.com/v6d-io/v6d/blob/main/python/vineyard/llm/__init__.py) APIs for Vineyard LLM KV Cache. Based on the inference framework, you can use the corresponding API to integrate the Vineyard LLM KV Cache. +We provide [C++](https://github.com/v6d-io/v6d/blob/main/modules/llm-cache/ds/kv_cache_manager.h) and [Python](https://github.com/v6d-io/v6d/blob/main/python/vineyard/llm/__init__.py) APIs for Vineyard LLM KV Cache. Based on the inference framework, you can use the corresponding API to integrate the Vineyard LLM KV Cache. ### C++ API @@ -90,10 +90,10 @@ $ ./build/bin/vineyardd --socket /tmp/vineyard_test.sock Then open another terminal to run the vineyard llm kv cache test. 
```bash
-$ ./bin/kv_state_cache_test --client-num 1 --vineyard-ipc-sockets /tmp/vineyard_test.sock
+$ ./bin/kv_cache_test --client-num 1 --vineyard-ipc-sockets /tmp/vineyard_test.sock
 ```
 
-For more information about how to use the C++ API, you can refer to the the [C++ API implementation](https://github.com/v6d-io/v6d/blob/main/modules/llm-cache/ds/kv_state_cache_manager.cc) and the [related tests](https://github.com/v6d-io/v6d/tree/main/modules/llm-cache/tests).
+For more information about how to use the C++ API, you can refer to the [C++ API implementation](https://github.com/v6d-io/v6d/blob/main/modules/llm-cache/ds/kv_cache_manager.cc) and the [related tests](https://github.com/v6d-io/v6d/tree/main/modules/llm-cache/tests).
 
 ### Python API
 
@@ -165,8 +165,8 @@ vineyard_cache_config = VineyardCacheConfig(
 )
 cache = KVCache(
     cache_config=vineyard_cache_config,
-    tensor_bytes=16,  # should be the same as the nbytes of the tensor
-    cache_capacity=10,
+    tensor_nbytes=16,  # should be the same as the nbytes of the tensor
+    cache_capacity=1024,
     layer=2,
 )
 
@@ -248,13 +248,13 @@ from vineyard.llm.config import VineyardCacheConfig
 
 file_cache_config = FileCacheConfig(
     chunk_size=2,
-    split_number=2,
+    hash_chunk_size=2,
     root="/tmp/vineyard/llm_cache",
 )
 cache = KVCache(
     cache_config=file_cache_config,
-    tensor_bytes=16,  # should be the same as the nbytes of the tensor
-    cache_capacity=10,
+    tensor_nbytes=16,  # should be the same as the nbytes of the tensor
+    cache_capacity=1024,
     layer=2,
 )
 
diff --git a/modules/llm-cache/ds/config.h b/modules/llm-cache/ds/config.h
index 356d0bc507..cbafe4d433 100644
--- a/modules/llm-cache/ds/config.h
+++ b/modules/llm-cache/ds/config.h
@@ -49,32 +49,32 @@ struct VineyardCacheConfig : public KVCacheConfig {
 };
 
 struct FileCacheConfig : public KVCacheConfig {
-  int batchSize;
-  int splitNumber;
+  int chunkSize;
+  int hashChunkSize;
   std::string root;
   FilesystemType filesystemType;
-  int clientGCInterval;  // second
-  int ttl;               // second
+  int gcInterval;  // in seconds
+  int ttl;         // in seconds
   bool enbaleGlobalGC;
-  int globalGCInterval;  // second
-  int globalTTL;         // second
+  int globalGCInterval;  // in seconds
+  int globalTTL;         // in seconds
 
   // Default gc interval is 30 minutes and default global gc interval is 3
   // hours.
   FileCacheConfig(int tensorByte = 10, int cacheCapacity = 10, int layer = 1,
-                  int batchSize = 4, int splitNumber = 2,
+                  int chunkSize = 4, int hashChunkSize = 2,
                   std::string root = "/tmp/llm_cache/",
                   FilesystemType filesystemType = LOCAL,
-                  int clientGCInterval = 30 * 60, int ttl = 30 * 60,
+                  int gcInterval = 30 * 60, int ttl = 30 * 60,
                   bool enbaleGlobalGC = false,
                   int globalGCInterval = 3 * 60 * 60,
                   int globalTTL = 3 * 60 * 60)
       : KVCacheConfig{tensorByte, cacheCapacity, layer} {
     this->root = root;
-    this->batchSize = batchSize;
-    this->splitNumber = splitNumber;
+    this->chunkSize = chunkSize;
+    this->hashChunkSize = hashChunkSize;
     this->filesystemType = filesystemType;
-    this->clientGCInterval = clientGCInterval;
+    this->gcInterval = gcInterval;
     this->ttl = ttl;
     this->enbaleGlobalGC = enbaleGlobalGC;
     this->globalGCInterval = globalGCInterval;
diff --git a/modules/llm-cache/ds/kv_state_cache.cc b/modules/llm-cache/ds/kv_cache.cc
similarity index 61%
rename from modules/llm-cache/ds/kv_state_cache.cc
rename to modules/llm-cache/ds/kv_cache.cc
index f6ba7bb829..29a207de3c 100644
--- a/modules/llm-cache/ds/kv_state_cache.cc
+++ b/modules/llm-cache/ds/kv_cache.cc
@@ -25,18 +25,18 @@ limitations under the License.
#include "common/util/base64.h" #include "common/util/logging.h" #include "common/util/status.h" -#include "llm-cache/ds/kv_state_cache.h" +#include "llm-cache/ds/kv_cache.h" #include "llm-cache/radix-tree/radix-tree.h" namespace vineyard { -void KVStateCache::Construct(const ObjectMeta& meta) { +void KVCache::Construct(const ObjectMeta& meta) { Object::Construct(meta); Resolve(); } -void KVStateCache::Resolve() { - std::string typeName = type_name(); +void KVCache::Resolve() { + std::string typeName = type_name(); VINEYARD_ASSERT(this->meta_.GetTypeName() == typeName, "Expect typename '" + typeName + "', but got '" + @@ -50,15 +50,15 @@ void KVStateCache::Resolve() { } // 2. construct the member field - this->tensorBytes = this->meta_.GetKeyValue("tensorBytes"); + this->tensorNBytes = this->meta_.GetKeyValue("tensorNBytes"); this->version = this->meta_.GetKeyValue("version"); this->layer = this->meta_.GetKeyValue("layer"); - VLOG(100) << "construct the member field success, with tensorBytes:" - << this->tensorBytes << " version:" << this->version + VLOG(100) << "construct the member field success, with tensorNBytes:" + << this->tensorNBytes << " version:" << this->version << " layer:" << this->layer; } -void KVStateCache::GetCurrentBlockIDSet(std::set& objectIDSet) { +void KVCache::GetCurrentBlockIDSet(std::set& objectIDSet) { std::set subTreeData = rootTree->GetSubTreeDataSet(); for (auto iter = subTreeData.begin(); iter != subTreeData.end(); ++iter) { TreeData* treeData = reinterpret_cast(*iter); @@ -68,29 +68,29 @@ void KVStateCache::GetCurrentBlockIDSet(std::set& objectIDSet) { } } -KVStateCache::~KVStateCache() {} +KVCache::~KVCache() {} -KVStateCacheBuilder::KVStateCacheBuilder(Client& client, int tensorBytes, - int layer, - std::shared_ptr& rootTree) +KVCacheBuilder::KVCacheBuilder(Client& client, int tensorNBytes, int layer, + std::shared_ptr& rootTree) : client(client) { - this->tensorBytes = tensorBytes; + this->tensorNBytes = tensorNBytes; this->version = 0; this->layer = layer; this->rootTree = rootTree; } -Status KVStateCacheBuilder::Make( - Client& client, std::shared_ptr& kvStateCacheBuilder, - int tensorBytes, int cacheCapacity, int layer, int blockSize) { - KVStateCacheBlockBuilder* builder = - new KVStateCacheBlockBuilder(client, tensorBytes, layer, blockSize); +Status KVCacheBuilder::Make(Client& client, + std::shared_ptr& kvCacheBuilder, + int tensorNBytes, int cacheCapacity, int layer, + int blockSize) { + KVCacheBlockBuilder* builder = + new KVCacheBlockBuilder(client, tensorNBytes, layer, blockSize); std::shared_ptr rootTree = std::make_shared(cacheCapacity); TreeData* treeData = new TreeData(); - treeData->kvStateCacheBlockBuilder = builder; + treeData->kvCacheBlockBuilder = builder; treeData->isPtr = true; std::shared_ptr rootTreeHeader = rootTree->GetRootNode(); @@ -98,28 +98,28 @@ Status KVStateCacheBuilder::Make( rootTreeHeader->treeData->dataLength = sizeof(TreeData); rootTree->SetSubtreeData(treeData); - kvStateCacheBuilder = std::shared_ptr( - new KVStateCacheBuilder(client, tensorBytes, layer, rootTree)); + kvCacheBuilder = std::shared_ptr( + new KVCacheBuilder(client, tensorNBytes, layer, rootTree)); return Status::OK(); } -Status KVStateCacheBuilder::Make( - Client& client, std::shared_ptr& kvStateCacheBuilder, - std::shared_ptr& cache) { - kvStateCacheBuilder = std::make_shared( - client, cache->GetTensorBytes(), cache->GetLayer(), cache->rootTree); +Status KVCacheBuilder::Make(Client& client, + std::shared_ptr& kvCacheBuilder, + std::shared_ptr& cache) 
{ + kvCacheBuilder = std::make_shared( + client, cache->GetTensorNBytes(), cache->GetLayer(), cache->rootTree); return Status::OK(); } -Status KVStateCacheBuilder::Split( - KVStateCacheBlockBuilder* kvStateCacheBlockBuilder, +Status KVCacheBuilder::Split( + KVCacheBlockBuilder* kvCacheBlockBuilder, std::vector> nodeDataList, - KVStateCacheBlockBuilder*& childKVStateCacheBlockBuilder) { + KVCacheBlockBuilder*& childKVCacheBlockBuilder) { // Split the tree if the list of kvState is full. - childKVStateCacheBlockBuilder = - new KVStateCacheBlockBuilder(client, this->tensorBytes, this->layer, - kvStateCacheBlockBuilder->GetBlockSize()); - VINEYARD_ASSERT(childKVStateCacheBlockBuilder != nullptr, + childKVCacheBlockBuilder = + new KVCacheBlockBuilder(client, this->tensorNBytes, this->layer, + kvCacheBlockBuilder->GetBlockSize()); + VINEYARD_ASSERT(childKVCacheBlockBuilder != nullptr, "Not enough memory for new block builder."); for (size_t i = 0; i < nodeDataList.size(); i++) { @@ -130,17 +130,16 @@ Status KVStateCacheBuilder::Split( int index = data->offset; // Transfer the data from this builder to the child builder. - data->offset = - kvStateCacheBlockBuilder->Split(childKVStateCacheBlockBuilder, index); + data->offset = kvCacheBlockBuilder->Split(childKVCacheBlockBuilder, index); } - VLOG(100) << "builder:" << kvStateCacheBlockBuilder - << " bitmap:" << kvStateCacheBlockBuilder->GetBitmapStr(); - VLOG(100) << "child_builder:" << childKVStateCacheBlockBuilder - << " bitmap:" << childKVStateCacheBlockBuilder->GetBitmapStr(); + VLOG(100) << "builder:" << kvCacheBlockBuilder + << " bitmap:" << kvCacheBlockBuilder->GetBitmapStr(); + VLOG(100) << "child_builder:" << childKVCacheBlockBuilder + << " bitmap:" << childKVCacheBlockBuilder->GetBitmapStr(); return Status::OK(); } -Status KVStateCacheBuilder::Update( +Status KVCacheBuilder::Update( const std::vector& tokenList, int nextToken, const std::vector>& kvState) { std::vector tokenListCopy = tokenList; @@ -152,16 +151,16 @@ Status KVStateCacheBuilder::Update( this->rootTree->Insert(tokenListCopy, evictedNodeData); RETURN_ON_ASSERT(nodeData != nullptr, "Update llm cache failed."); - KVStateCacheBlockBuilder* kvStateCacheBlockBuilder; + KVCacheBlockBuilder* kvCacheBlockBuilder; TreeData* treeData = reinterpret_cast(nodeData->treeData->data); if (treeData->isPtr) { - kvStateCacheBlockBuilder = reinterpret_cast( - treeData->kvStateCacheBlockBuilder); + kvCacheBlockBuilder = + reinterpret_cast(treeData->kvCacheBlockBuilder); } else { ObjectID blockObjectID = treeData->builderObjectID; - RETURN_ON_ERROR(KVStateCacheBlockBuilder::Make(client, treeData, - kvStateCacheBlockBuilder)); - treeData->kvStateCacheBlockBuilder = kvStateCacheBlockBuilder; + RETURN_ON_ERROR( + KVCacheBlockBuilder::Make(client, treeData, kvCacheBlockBuilder)); + treeData->kvCacheBlockBuilder = kvCacheBlockBuilder; treeData->isPtr = true; blockIDSetToDelete.insert(blockObjectID); } @@ -170,7 +169,7 @@ Status KVStateCacheBuilder::Update( Delete(evictedNodeData); } - if (kvStateCacheBlockBuilder->IsFull()) { + if (kvCacheBlockBuilder->IsFull()) { /** * If the kv-state cache of the tree is full, trigger split. Delete the * empty node from the radix tree and split the tree. 
Then, kv-state cache @@ -184,14 +183,14 @@ Status KVStateCacheBuilder::Update( std::vector> nodeDataList = rootTree->Split(tokenListCopy, subTreeHeader); RETURN_ON_ASSERT(nodeDataList.size() != 0, "Split llm cache failed."); - KVStateCacheBlockBuilder* newKVStateCacheBlockBuilder; - Status status = Split(kvStateCacheBlockBuilder, nodeDataList, - newKVStateCacheBlockBuilder); + KVCacheBlockBuilder* newKVCacheBlockBuilder; + Status status = + Split(kvCacheBlockBuilder, nodeDataList, newKVCacheBlockBuilder); RETURN_ON_ERROR(status); TreeData* newTreeData = new TreeData(); RETURN_ON_ASSERT(newTreeData != nullptr, "Split llm cache failed."); - newTreeData->kvStateCacheBlockBuilder = newKVStateCacheBlockBuilder; + newTreeData->kvCacheBlockBuilder = newKVCacheBlockBuilder; newTreeData->isPtr = true; subTreeHeader->treeData->data = newTreeData; @@ -199,7 +198,7 @@ Status KVStateCacheBuilder::Update( rootTree->SetSubtreeData(newTreeData); VLOG(100) << "block split success"; - // kv_state_cache_builder->UnLock(); + // kv_cache_builder->UnLock(); status = Update(tokenList, nextToken, kvState); RETURN_ON_ERROR(status); } else { @@ -207,19 +206,18 @@ Status KVStateCacheBuilder::Update( OffsetData* data = new OffsetData(); RETURN_ON_ASSERT(data != nullptr, "Not enough memory for new offset data."); - RETURN_ON_ERROR(kvStateCacheBlockBuilder->Update(kvState, data)); + RETURN_ON_ERROR(kvCacheBlockBuilder->Update(kvState, data)); nodeData->nodeData->data = data; nodeData->nodeData->dataLength = sizeof(OffsetData); } - VLOG(100) << "builder:" << kvStateCacheBlockBuilder - << " bitmap:" << kvStateCacheBlockBuilder->GetBitmapStr(); + VLOG(100) << "builder:" << kvCacheBlockBuilder + << " bitmap:" << kvCacheBlockBuilder->GetBitmapStr(); return Status::OK(); } -Status KVStateCacheBuilder::Query( - const std::vector& tokenList, int token, - std::vector>& kvState) { +Status KVCacheBuilder::Query(const std::vector& tokenList, int token, + std::vector>& kvState) { std::vector tokenListCopy = tokenList; tokenListCopy.push_back(token); @@ -231,40 +229,40 @@ Status KVStateCacheBuilder::Query( int offset = data->offset; TreeData* treeData = reinterpret_cast(nodeData->treeData->data); - KVStateCacheBlockBuilder* kvStateCacheBlockBuilder; + KVCacheBlockBuilder* kvCacheBlockBuilder; if (treeData->isPtr) { - kvStateCacheBlockBuilder = reinterpret_cast( - treeData->kvStateCacheBlockBuilder); + kvCacheBlockBuilder = + reinterpret_cast(treeData->kvCacheBlockBuilder); } else { ObjectID blockObjectID = treeData->builderObjectID; - RETURN_ON_ERROR(KVStateCacheBlockBuilder::Make(client, treeData, - kvStateCacheBlockBuilder)); - treeData->kvStateCacheBlockBuilder = kvStateCacheBlockBuilder; + RETURN_ON_ERROR( + KVCacheBlockBuilder::Make(client, treeData, kvCacheBlockBuilder)); + treeData->kvCacheBlockBuilder = kvCacheBlockBuilder; treeData->isPtr = true; blockIDSetToDelete.insert(blockObjectID); } - return kvStateCacheBlockBuilder->Query(offset, kvState); + return kvCacheBlockBuilder->Query(offset, kvState); } -void KVStateCacheBuilder::Delete(std::shared_ptr evictedNodeData) { +void KVCacheBuilder::Delete(std::shared_ptr evictedNodeData) { TreeData* treeData = reinterpret_cast(evictedNodeData->treeData->data); - KVStateCacheBlockBuilder* kvStateCacheBlockBuilder; + KVCacheBlockBuilder* kvCacheBlockBuilder; if (treeData->isPtr) { - kvStateCacheBlockBuilder = reinterpret_cast( - treeData->kvStateCacheBlockBuilder); + kvCacheBlockBuilder = + reinterpret_cast(treeData->kvCacheBlockBuilder); } else { ObjectID blockObjectID = 
treeData->builderObjectID; - Status status = KVStateCacheBlockBuilder::Make(client, treeData, - kvStateCacheBlockBuilder); + Status status = + KVCacheBlockBuilder::Make(client, treeData, kvCacheBlockBuilder); if (!status.ok()) { // Not a deadly error, just log it and return. - LOG(FATAL) << "Failed to make kvStateCacheBlockBuilder. It may cause " + LOG(FATAL) << "Failed to make kvCacheBlockBuilder. It may cause " "memory leak."; return; } - treeData->kvStateCacheBlockBuilder = kvStateCacheBlockBuilder; + treeData->kvCacheBlockBuilder = kvCacheBlockBuilder; treeData->isPtr = true; blockIDSetToDelete.insert(blockObjectID); @@ -272,36 +270,34 @@ void KVStateCacheBuilder::Delete(std::shared_ptr evictedNodeData) { OffsetData* data = reinterpret_cast(evictedNodeData->nodeData->data); - kvStateCacheBlockBuilder->DeleteKVCache(data->offset); + kvCacheBlockBuilder->DeleteKVCache(data->offset); delete data; // TBD // Refactor this code. The data should be deleted by the RadixTree // delete (DataWrapper*) evictedNodeData->nodeData; if (evictedNodeData->cleanTreeData) { this->rootTree->ClearSubtreeData(treeData); - std::shared_ptr blockObject = - kvStateCacheBlockBuilder->_Seal(client); + std::shared_ptr blockObject = kvCacheBlockBuilder->_Seal(client); Status status = client.DelData(blockObject->id()); if (!status.ok()) { LOG(ERROR) << "Delete object failed: " << status.ToString() << " It may cause memory leak."; } - delete kvStateCacheBlockBuilder; + delete kvCacheBlockBuilder; } evictedNodeData->RecycleSource(); } -Status KVStateCacheBuilder::Merge(std::shared_ptr kvStateCache) { - if (kvStateCache == nullptr) { +Status KVCacheBuilder::Merge(std::shared_ptr kvCache) { + if (kvCache == nullptr) { return Status::OK(); } - std::shared_ptr globalCacheBuilder; - Status status = - KVStateCacheBuilder::Make(client, globalCacheBuilder, kvStateCache); + std::shared_ptr globalCacheBuilder; + Status status = KVCacheBuilder::Make(client, globalCacheBuilder, kvCache); RETURN_ON_ERROR(status); - std::shared_ptr globalCacheTree = kvStateCache->GetRootTree(); + std::shared_ptr globalCacheTree = kvCache->GetRootTree(); std::set> insertTokenList; std::vector> evicted_token_list; @@ -350,8 +346,7 @@ Status KVStateCacheBuilder::Merge(std::shared_ptr kvStateCache) { return Status::OK(); } -void KVStateCacheBuilder::GetCurrentBlockIDSet( - std::set& objectIDSet) { +void KVCacheBuilder::GetCurrentBlockIDSet(std::set& objectIDSet) { std::set subTreeData = rootTree->GetSubTreeDataSet(); for (auto iter = subTreeData.begin(); iter != subTreeData.end(); ++iter) { TreeData* treeData = reinterpret_cast(*iter); @@ -361,17 +356,17 @@ void KVStateCacheBuilder::GetCurrentBlockIDSet( } } -Status KVStateCacheBuilder::Build(Client& client) { return Status::OK(); } +Status KVCacheBuilder::Build(Client& client) { return Status::OK(); } -std::shared_ptr KVStateCacheBuilder::_Seal(Client& client) { +std::shared_ptr KVCacheBuilder::_Seal(Client& client) { VINEYARD_CHECK_OK(this->Build(client)); - std::shared_ptr kvStateCache = std::make_shared(); + std::shared_ptr kvCache = std::make_shared(); // 1. store the member variables to cache object meta - kvStateCache->meta_.AddKeyValue("tensorBytes", this->tensorBytes); - kvStateCache->meta_.AddKeyValue("version", this->version); - kvStateCache->meta_.AddKeyValue("layer", this->layer); + kvCache->meta_.AddKeyValue("tensorNBytes", this->tensorNBytes); + kvCache->meta_.AddKeyValue("version", this->version); + kvCache->meta_.AddKeyValue("layer", this->layer); // 2. 
seal all the block and put object id to cache object and // change the tree data from pointer to object id @@ -384,31 +379,28 @@ std::shared_ptr KVStateCacheBuilder::_Seal(Client& client) { continue; } - KVStateCacheBlockBuilder* kvStateCacheBlockBuilder = - reinterpret_cast( - treeData->kvStateCacheBlockBuilder); - std::shared_ptr kvStateCacheBlock = - kvStateCacheBlockBuilder->_Seal(client); - VINEYARD_CHECK_OK(client.Persist(kvStateCacheBlock->id())); - treeData->builderObjectID = kvStateCacheBlock->id(); + KVCacheBlockBuilder* kvCacheBlockBuilder = + reinterpret_cast(treeData->kvCacheBlockBuilder); + std::shared_ptr kvCacheBlock = kvCacheBlockBuilder->_Seal(client); + VINEYARD_CHECK_OK(client.Persist(kvCacheBlock->id())); + treeData->builderObjectID = kvCacheBlock->id(); treeData->isPtr = false; } // 3. put the serialized sequence radix tree to cache object meta - kvStateCache->meta_.AddKeyValue("radix_tree", - base64_encode(this->rootTree->Serialize())); + kvCache->meta_.AddKeyValue("radix_tree", + base64_encode(this->rootTree->Serialize())); // 4. put the object type to the meta - kvStateCache->meta_.SetTypeName(type_name()); + kvCache->meta_.SetTypeName(type_name()); - VINEYARD_CHECK_OK( - client.CreateMetaData(kvStateCache->meta_, kvStateCache->id_)); - VLOG(100) << "KVStateCacheBuilder::_Seal: " << kvStateCache->id_; + VINEYARD_CHECK_OK(client.CreateMetaData(kvCache->meta_, kvCache->id_)); + VLOG(100) << "KVCacheBuilder::_Seal: " << kvCache->id_; this->set_sealed(true); - return kvStateCache; + return kvCache; } -KVStateCacheBuilder::~KVStateCacheBuilder() { +KVCacheBuilder::~KVCacheBuilder() { // get all subtree data and node data std::set subTreeDataSet = rootTree->GetSubTreeDataSet(); std::set nodeDataSet = rootTree->GetAllNodeData(); @@ -416,10 +408,9 @@ KVStateCacheBuilder::~KVStateCacheBuilder() { for (auto iter = subTreeDataSet.begin(); iter != subTreeDataSet.end(); ++iter) { TreeData* treeData = reinterpret_cast(*iter); - if (treeData->isPtr == true && - treeData->kvStateCacheBlockBuilder != nullptr) { - delete reinterpret_cast( - treeData->kvStateCacheBlockBuilder); + if (treeData->isPtr == true && treeData->kvCacheBlockBuilder != nullptr) { + delete reinterpret_cast( + treeData->kvCacheBlockBuilder); delete treeData; } } @@ -431,15 +422,14 @@ KVStateCacheBuilder::~KVStateCacheBuilder() { } } -void KVStateCacheBuilder::Close() { +void KVCacheBuilder::Close() { std::set subTreeDataSet = rootTree->GetSubTreeDataSet(); for (auto iter = subTreeDataSet.begin(); iter != subTreeDataSet.end(); ++iter) { TreeData* treeData = reinterpret_cast(*iter); - if (treeData->isPtr && treeData->kvStateCacheBlockBuilder != nullptr) { + if (treeData->isPtr && treeData->kvCacheBlockBuilder != nullptr) { std::shared_ptr object = - reinterpret_cast( - treeData->kvStateCacheBlockBuilder) + reinterpret_cast(treeData->kvCacheBlockBuilder) ->_Seal(client); Status status = client.DelData(object->id()); if (!status.ok()) { diff --git a/modules/llm-cache/ds/kv_state_cache.h b/modules/llm-cache/ds/kv_cache.h similarity index 66% rename from modules/llm-cache/ds/kv_state_cache.h rename to modules/llm-cache/ds/kv_cache.h index 0c65bb43cd..b275b651c4 100644 --- a/modules/llm-cache/ds/kv_state_cache.h +++ b/modules/llm-cache/ds/kv_cache.h @@ -22,19 +22,19 @@ limitations under the License. 
#include "client/client.h" #include "common/util/logging.h" #include "common/util/status.h" -#include "llm-cache/ds/kv_state_cache_block.h" +#include "llm-cache/ds/kv_cache_block.h" #include "llm-cache/radix-tree/radix-tree.h" -#ifndef MODULES_LLM_CACHE_DS_KV_STATE_CACHE_H_ -#define MODULES_LLM_CACHE_DS_KV_STATE_CACHE_H_ +#ifndef MODULES_LLM_CACHE_DS_KV_CACHE_H_ +#define MODULES_LLM_CACHE_DS_KV_CACHE_H_ namespace vineyard { -class KVStateCache : public vineyard::Registered { +class KVCache : public vineyard::Registered { private: - std::vector> kvStateCacheBlockList; + std::vector> kvCacheBlockList; std::shared_ptr rootTree; - int tensorBytes; + int tensorNBytes; int cacheCapacity; int layer; uint64_t version; @@ -42,7 +42,7 @@ class KVStateCache : public vineyard::Registered { public: static std::unique_ptr Create() __attribute__((used)) { return std::static_pointer_cast( - std::unique_ptr{new KVStateCache()}); + std::unique_ptr{new KVCache()}); } void Construct(const ObjectMeta& meta) override; @@ -50,11 +50,11 @@ class KVStateCache : public vineyard::Registered { void Resolve(); // for test - std::vector>& GetKVStateCacheBlockList() { - return this->kvStateCacheBlockList; + std::vector>& GetKVCacheBlockList() { + return this->kvCacheBlockList; } - int GetTensorBytes() { return this->tensorBytes; } + int GetTensorNBytes() { return this->tensorNBytes; } int GetCacheCapacity() { return this->cacheCapacity; } @@ -66,37 +66,37 @@ class KVStateCache : public vineyard::Registered { void GetCurrentBlockIDSet(std::set& objectIDSet); - ~KVStateCache(); + ~KVCache(); - friend class KVStateCacheBuilder; + friend class KVCacheBuilder; }; -class KVStateCacheBuilder : public vineyard::ObjectBuilder { +class KVCacheBuilder : public vineyard::ObjectBuilder { Client& client; std::shared_ptr rootTree; std::set blockIDSetToDelete; - int tensorBytes; + int tensorNBytes; int layer; uint64_t version; int blockSize; int cacheCapacity; public: - KVStateCacheBuilder(Client& client, int tensorBytes, int layer, - std::shared_ptr& rootTree); + KVCacheBuilder(Client& client, int tensorNBytes, int layer, + std::shared_ptr& rootTree); static Status Make(Client& client, - std::shared_ptr& kvStateCacheBuilder, - int tensorBytes = 10, int cacheCapacity = 10, + std::shared_ptr& kvCacheBuilder, + int tensorNBytes = 10, int cacheCapacity = 10, int layer = 1, int blockSize = DEFAULT_BLOCK_SIZE); static Status Make(Client& client, - std::shared_ptr& kvStateCacheBuilder, - std::shared_ptr& cache); + std::shared_ptr& kvCacheBuilder, + std::shared_ptr& cache); - Status Split(KVStateCacheBlockBuilder* kvStateCacheBlockBuilder, + Status Split(KVCacheBlockBuilder* kvCacheBlockBuilder, std::vector> nodeDataList, - KVStateCacheBlockBuilder*& childKVStateCacheBlockBuilder); + KVCacheBlockBuilder*& childKVCacheBlockBuilder); Status Update(const std::vector& token_list, int next_token, const std::vector>& kv_state); @@ -106,7 +106,7 @@ class KVStateCacheBuilder : public vineyard::ObjectBuilder { void Delete(std::shared_ptr evicted_node); - Status Merge(std::shared_ptr kv_state_cache); + Status Merge(std::shared_ptr kv_cache); uint64_t GetVersion() { return this->version; } @@ -118,7 +118,7 @@ class KVStateCacheBuilder : public vineyard::ObjectBuilder { std::shared_ptr _Seal(Client& client) override; - uint64_t GetTensorBytes() { return this->tensorBytes; } + uint64_t GetTensorNBytes() { return this->tensorNBytes; } std::shared_ptr GetRootTree() { return this->rootTree; } @@ -134,9 +134,9 @@ class KVStateCacheBuilder : public 
vineyard::ObjectBuilder { void ClearBlockIDSetToDelete() { this->blockIDSetToDelete.clear(); } - ~KVStateCacheBuilder(); + ~KVCacheBuilder(); }; } // namespace vineyard -#endif // MODULES_LLM_CACHE_DS_KV_STATE_CACHE_H_ +#endif // MODULES_LLM_CACHE_DS_KV_CACHE_H_ diff --git a/modules/llm-cache/ds/kv_state_cache_block.cc b/modules/llm-cache/ds/kv_cache_block.cc similarity index 70% rename from modules/llm-cache/ds/kv_state_cache_block.cc rename to modules/llm-cache/ds/kv_cache_block.cc index c82b0453ed..4429b309de 100644 --- a/modules/llm-cache/ds/kv_state_cache_block.cc +++ b/modules/llm-cache/ds/kv_cache_block.cc @@ -20,12 +20,12 @@ limitations under the License. #include "client/client.h" #include "common/memory/memcpy.h" #include "common/util/logging.h" -#include "llm-cache/ds/kv_state_cache_block.h" +#include "llm-cache/ds/kv_cache_block.h" namespace vineyard { // this function will be removed in the future -std::string KVStateCacheBlock::GetBitmapStr() { +std::string KVCacheBlock::GetBitmapStr() { std::string result; const int bits = 8 * sizeof(uint64_t); for (int i = 0; i < this->bitmapSize; i++) { @@ -36,7 +36,7 @@ std::string KVStateCacheBlock::GetBitmapStr() { return result; } -std::string KVStateCacheBlockBuilder::GetBitmapStr() { +std::string KVCacheBlockBuilder::GetBitmapStr() { std::string result; const int bits = 8 * sizeof(uint64_t); for (int i = 0; i < this->bitmapSize; i++) { @@ -47,10 +47,10 @@ std::string KVStateCacheBlockBuilder::GetBitmapStr() { return result; } -void KVStateCacheBlock::Construct(const ObjectMeta& meta) { +void KVCacheBlock::Construct(const ObjectMeta& meta) { Object::Construct(meta); - std::string typeName = type_name(); + std::string typeName = type_name(); VINEYARD_ASSERT(meta.GetTypeName() == typeName, "Expect typename '" + typeName + "', but got '" + @@ -75,45 +75,44 @@ void KVStateCacheBlock::Construct(const ObjectMeta& meta) { this->bitmap[i] = this->meta_.GetKeyValue("bitmap_" + std::to_string(i)); } - this->tensorBytes = this->meta_.GetKeyValue("tensorBytes"); + this->tensorNBytes = this->meta_.GetKeyValue("tensorNBytes"); this->blockSize = this->meta_.GetKeyValue("block_size"); } -KVStateCacheBlock::~KVStateCacheBlock() { delete this->bitmap; } +KVCacheBlock::~KVCacheBlock() { delete this->bitmap; } -KVStateCacheBlockBuilder::KVStateCacheBlockBuilder(Client& client, - int tensorBytes, int layer, - int blockSize) +KVCacheBlockBuilder::KVCacheBlockBuilder(Client& client, int tensorNBytes, + int layer, int blockSize) : client(client) { this->blockSize = blockSize; this->bitmapSize = (blockSize + 63) / 64; this->bitmap = new uint64_t[this->bitmapSize]; memset(this->bitmap, UINT8_MAX, this->bitmapSize * sizeof(uint64_t)); - std::vector shape = {(int64_t)(blockSize), tensorBytes}; + std::vector shape = {(int64_t)(blockSize), tensorNBytes}; for (int i = 0; i < layer; i++) { this->keyStateTensorBuilderList.push_back( std::make_shared(client, shape)); this->valueStateTensorBuilderList.push_back( std::make_shared(client, shape)); } - this->tensorBytes = tensorBytes; + this->tensorNBytes = tensorNBytes; this->layer = layer; } -KVStateCacheBlockBuilder::KVStateCacheBlockBuilder( - Client& client, std::shared_ptr kvStateCacheBlock) +KVCacheBlockBuilder::KVCacheBlockBuilder( + Client& client, std::shared_ptr kvCacheBlock) : client(client) { - this->bitmapSize = kvStateCacheBlock->bitmapSize; - this->blockSize = kvStateCacheBlock->blockSize; + this->bitmapSize = kvCacheBlock->bitmapSize; + this->blockSize = kvCacheBlock->blockSize; VLOG(100) << "create 
builder from block object, bitmap size:" << this->bitmapSize << " block size:" << blockSize; this->bitmap = new uint64_t[this->bitmapSize]; for (int i = 0; i < this->bitmapSize; i++) { - this->bitmap[i] = kvStateCacheBlock->bitmap[i]; + this->bitmap[i] = kvCacheBlock->bitmap[i]; } - this->tensorBytes = kvStateCacheBlock->tensorBytes; - this->layer = kvStateCacheBlock->layer; - std::vector shape = {(int64_t)(blockSize), this->tensorBytes}; + this->tensorNBytes = kvCacheBlock->tensorNBytes; + this->layer = kvCacheBlock->layer; + std::vector shape = {(int64_t)(blockSize), this->tensorNBytes}; for (int currentLayer = 0; currentLayer < this->layer; currentLayer++) { this->keyStateTensorBuilderList.push_back( std::make_shared(client, shape)); @@ -124,24 +123,23 @@ KVStateCacheBlockBuilder::KVStateCacheBlockBuilder( for (int currentLayer = 0; currentLayer < this->layer; currentLayer++) { vineyard::memory::concurrent_memcpy( this->keyStateTensorBuilderList[currentLayer]->data(), - kvStateCacheBlock->keyStateTensorList[currentLayer]->data(), - (int64_t)(blockSize) * this->tensorBytes); + kvCacheBlock->keyStateTensorList[currentLayer]->data(), + (int64_t)(blockSize) * this->tensorNBytes); vineyard::memory::concurrent_memcpy( this->valueStateTensorBuilderList[currentLayer]->data(), - kvStateCacheBlock->valueStateTensorList[currentLayer]->data(), - (int64_t)(blockSize) * this->tensorBytes); + kvCacheBlock->valueStateTensorList[currentLayer]->data(), + (int64_t)(blockSize) * this->tensorNBytes); } } -Status KVStateCacheBlockBuilder::Make( - Client& client, TreeData* treeData, - KVStateCacheBlockBuilder*& kvStateCacheBlockBuilder) { +Status KVCacheBlockBuilder::Make(Client& client, TreeData* treeData, + KVCacheBlockBuilder*& kvCacheBlockBuilder) { RETURN_ON_ASSERT(treeData != nullptr && treeData->isPtr == false); ObjectID blockObjectID = treeData->builderObjectID; - std::shared_ptr blockObject; + std::shared_ptr blockObject; RETURN_ON_ERROR(client.FetchAndGetObject(blockObjectID, blockObject)); - kvStateCacheBlockBuilder = new KVStateCacheBlockBuilder(client, blockObject); + kvCacheBlockBuilder = new KVCacheBlockBuilder(client, blockObject); if (blockObjectID != blockObject->id()) { // If the object is migrated, we should delete the copied object. 
Status status = client.DelData(blockObject->id()); @@ -153,7 +151,7 @@ Status KVStateCacheBlockBuilder::Make( return Status::OK(); } -Status KVStateCacheBlockBuilder::Query( +Status KVCacheBlockBuilder::Query( int index, std::vector>& kvState) { RETURN_ON_ASSERT((index >= 0 && index < this->blockSize), "Index out of range: " + std::to_string(index)); @@ -164,16 +162,16 @@ Status KVStateCacheBlockBuilder::Query( LLMKV& valueState = kvState[currentLayer].second; VINEYARD_ASSERT(keyState.data == nullptr && valueState.data == nullptr); keyState.data = - keyStateTensorBuilderList[currentLayer]->data() + index * tensorBytes; - keyState.length = tensorBytes; - valueState.data = - valueStateTensorBuilderList[currentLayer]->data() + index * tensorBytes; - valueState.length = tensorBytes; + keyStateTensorBuilderList[currentLayer]->data() + index * tensorNBytes; + keyState.length = tensorNBytes; + valueState.data = valueStateTensorBuilderList[currentLayer]->data() + + index * tensorNBytes; + valueState.length = tensorNBytes; } return Status::OK(); } -int KVStateCacheBlockBuilder::FindEmptySlot() { +int KVCacheBlockBuilder::FindEmptySlot() { for (int i = 0; i < this->bitmapSize; i++) { if (this->bitmap[i] != 0) { int index = ffsll(this->bitmap[i]) - 1; @@ -183,7 +181,7 @@ int KVStateCacheBlockBuilder::FindEmptySlot() { return -1; } -bool KVStateCacheBlockBuilder::IsFull() { +bool KVCacheBlockBuilder::IsFull() { int left = this->blockSize; for (int i = 0; i < this->bitmapSize; i++) { if (this->bitmap[i] != 0 && ffsll(this->bitmap[i]) - 1 < left) { @@ -194,7 +192,7 @@ bool KVStateCacheBlockBuilder::IsFull() { return true; } -Status KVStateCacheBlockBuilder::Update( +Status KVCacheBlockBuilder::Update( const std::vector>& kvState, OffsetData* data) { int index = this->FindEmptySlot(); RETURN_ON_ASSERT((index >= 0 && index < this->blockSize), @@ -205,15 +203,15 @@ Status KVStateCacheBlockBuilder::Update( for (int currentLayer = 0; currentLayer < this->layer; currentLayer++) { LLMKV keyState = kvState[currentLayer].first; LLMKV valueState = kvState[currentLayer].second; - RETURN_ON_ASSERT((keyState.length == (size_t) this->tensorBytes && - valueState.length == (size_t) this->tensorBytes)); + RETURN_ON_ASSERT((keyState.length == (size_t) this->tensorNBytes && + valueState.length == (size_t) this->tensorNBytes)); uint8_t* keyData = keyStateTensorBuilderList[currentLayer]->data(); uint8_t* valueData = valueStateTensorBuilderList[currentLayer]->data(); - vineyard::memory::concurrent_memcpy(keyData + index * this->tensorBytes, - keyState.data, this->tensorBytes); - vineyard::memory::concurrent_memcpy(valueData + index * this->tensorBytes, - valueState.data, this->tensorBytes); + vineyard::memory::concurrent_memcpy(keyData + index * this->tensorNBytes, + keyState.data, this->tensorNBytes); + vineyard::memory::concurrent_memcpy(valueData + index * this->tensorNBytes, + valueState.data, this->tensorNBytes); } data->offset = index; @@ -221,8 +219,7 @@ Status KVStateCacheBlockBuilder::Update( return Status::OK(); } -int16_t KVStateCacheBlockBuilder::Split(KVStateCacheBlockBuilder* child, - int index) { +int16_t KVCacheBlockBuilder::Split(KVCacheBlockBuilder* child, int index) { // Child builder must be empty. 
int childIndex = child->FindEmptySlot(); for (int currentLayer = 0; currentLayer < this->layer; currentLayer++) { @@ -236,62 +233,61 @@ int16_t KVStateCacheBlockBuilder::Split(KVStateCacheBlockBuilder* child, child->valueStateTensorBuilderList[currentLayer]; uint8_t* keyState = - keyStateTensorBuilder->data() + index * this->tensorBytes; + keyStateTensorBuilder->data() + index * this->tensorNBytes; uint8_t* valueState = - valueStateTensorBuilder->data() + index * this->tensorBytes; + valueStateTensorBuilder->data() + index * this->tensorNBytes; uint8_t* childKeyState = - childKeyStateTensorBuilder->data() + childIndex * this->tensorBytes; + childKeyStateTensorBuilder->data() + childIndex * this->tensorNBytes; uint8_t* childValueState = - childValueStateTensorBuilder->data() + childIndex * this->tensorBytes; + childValueStateTensorBuilder->data() + childIndex * this->tensorNBytes; vineyard::memory::concurrent_memcpy(childKeyState, keyState, - this->tensorBytes); + this->tensorNBytes); vineyard::memory::concurrent_memcpy(childValueState, valueState, - this->tensorBytes); + this->tensorNBytes); } ACQUIRE_BIT_RESOURCE(child->bitmap[childIndex / 64], childIndex % 64); FREE_BIT_RESOURCE(this->bitmap[index / 64], index % 64); return childIndex; } -Status KVStateCacheBlockBuilder::Build(Client& client) { return Status::OK(); } +Status KVCacheBlockBuilder::Build(Client& client) { return Status::OK(); } -std::shared_ptr KVStateCacheBlockBuilder::_Seal(Client& client) { +std::shared_ptr KVCacheBlockBuilder::_Seal(Client& client) { VINEYARD_CHECK_OK(this->Build(client)); - std::shared_ptr kvStateCacheBlock = - std::make_shared(); + std::shared_ptr kvCacheBlock = std::make_shared(); // 1. seal keyStateTensorBuilder and valueStateTensorBuilder for (int currentLayer = 0; currentLayer < this->layer; currentLayer++) { - kvStateCacheBlock->meta_.AddMember( + kvCacheBlock->meta_.AddMember( "keyStateTensorBuilder_" + std::to_string(currentLayer), keyStateTensorBuilderList[currentLayer]->Seal(client)); - kvStateCacheBlock->meta_.AddMember( + kvCacheBlock->meta_.AddMember( "valueStateTensorBuilder_" + std::to_string(currentLayer), valueStateTensorBuilderList[currentLayer]->Seal(client)); } // 2. store the member field to meta - kvStateCacheBlock->meta_.AddKeyValue("bitmap_size", this->bitmapSize); + kvCacheBlock->meta_.AddKeyValue("bitmap_size", this->bitmapSize); for (int i = 0; i < this->bitmapSize; i++) { - kvStateCacheBlock->meta_.AddKeyValue("bitmap_" + std::to_string(i), - this->bitmap[i]); + kvCacheBlock->meta_.AddKeyValue("bitmap_" + std::to_string(i), + this->bitmap[i]); } - kvStateCacheBlock->meta_.AddKeyValue("block_size", this->blockSize); - kvStateCacheBlock->meta_.AddKeyValue("tensorBytes", this->tensorBytes); - kvStateCacheBlock->meta_.AddKeyValue("layer", this->layer); + kvCacheBlock->meta_.AddKeyValue("block_size", this->blockSize); + kvCacheBlock->meta_.AddKeyValue("tensorNBytes", this->tensorNBytes); + kvCacheBlock->meta_.AddKeyValue("layer", this->layer); // 3. 
set the object type to meta - kvStateCacheBlock->meta_.SetTypeName(type_name()); + kvCacheBlock->meta_.SetTypeName(type_name()); VINEYARD_CHECK_OK( - client.CreateMetaData(kvStateCacheBlock->meta_, kvStateCacheBlock->id_)); + client.CreateMetaData(kvCacheBlock->meta_, kvCacheBlock->id_)); this->set_sealed(true); - return kvStateCacheBlock; + return kvCacheBlock; } -void KVStateCacheBlockBuilder::PrintKVStateCacheBlock() { +void KVCacheBlockBuilder::PrintKVCacheBlock() { LOG(INFO) << "builder:" << this; for (int i = 0; i < this->blockSize; i++) { LOG(INFO) << "index:" << i << " bitmap:" << this->GetBitmapStr(); @@ -304,13 +300,13 @@ void KVStateCacheBlockBuilder::PrintKVStateCacheBlock() { uint8_t* key_state_data = keyStateTensorBuilderList[currentLayer]->data(); uint8_t* value_state_data = valueStateTensorBuilderList[currentLayer]->data(); - // print the first tensorBytes bytes + // print the first tensorNBytes bytes std::string keyState = ""; std::string valueState = ""; - for (int j = 0; j < this->tensorBytes; j++) { - keyState += std::to_string(key_state_data[i * tensorBytes + j]) + " "; + for (int j = 0; j < this->tensorNBytes; j++) { + keyState += std::to_string(key_state_data[i * tensorNBytes + j]) + " "; valueState += - std::to_string(value_state_data[i * tensorBytes + j]) + " "; + std::to_string(value_state_data[i * tensorNBytes + j]) + " "; } LOG(INFO) << "keyState:" << keyState; LOG(INFO) << "valueState:" << valueState; @@ -320,6 +316,6 @@ void KVStateCacheBlockBuilder::PrintKVStateCacheBlock() { LOG(INFO) << "=========================="; } -KVStateCacheBlockBuilder::~KVStateCacheBlockBuilder() { delete this->bitmap; } +KVCacheBlockBuilder::~KVCacheBlockBuilder() { delete this->bitmap; } } // namespace vineyard diff --git a/modules/llm-cache/ds/kv_state_cache_block.h b/modules/llm-cache/ds/kv_cache_block.h similarity index 77% rename from modules/llm-cache/ds/kv_state_cache_block.h rename to modules/llm-cache/ds/kv_cache_block.h index 808d2cbc45..4d88281083 100644 --- a/modules/llm-cache/ds/kv_state_cache_block.h +++ b/modules/llm-cache/ds/kv_cache_block.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ -#ifndef MODULES_LLM_CACHE_DS_KV_STATE_CACHE_BLOCK_H_ -#define MODULES_LLM_CACHE_DS_KV_STATE_CACHE_BLOCK_H_ +#ifndef MODULES_LLM_CACHE_DS_KV_CACHE_BLOCK_H_ +#define MODULES_LLM_CACHE_DS_KV_CACHE_BLOCK_H_ #include #include @@ -48,26 +48,26 @@ struct OffsetData { struct TreeData { union { - void* kvStateCacheBlockBuilder; + void* kvCacheBlockBuilder; uint64_t builderObjectID; }; bool isPtr = true; }; /** - * @brief KVStateCacheBlock is a cache for kv-cache of LLM. When a new prompt - * comes, LLM can query KVStateCacheBlock to get the state of the kv-cache to + * @brief KVCacheBlock is a cache for kv-cache of LLM. When a new prompt + * comes, LLM can query KVCacheBlock to get the state of the kv-cache to * avoid calculate the kv-cache again if the new prompt is similar to the * previous one. * - * KVStateCacheBlock is stored in vineyard as a vineyard object which contains a + * KVCacheBlock is stored in vineyard as a vineyard object which contains a * radix tree. The token sequence is the key of the radix tree and the value * point out the offset of the kv-cache in the tensor list. * - * KVStateCacheBlock can be shared by multiple machines. + * KVCacheBlock can be shared by multiple machines. 
*/ -class KVStateCacheBlock : public vineyard::Registered { +class KVCacheBlock : public vineyard::Registered { private: std::vector> keyStateTensorList; std::vector> valueStateTensorList; @@ -75,19 +75,19 @@ class KVStateCacheBlock : public vineyard::Registered { int blockSize; int bitmapSize; int layer; - int tensorBytes; + int tensorNBytes; public: static std::unique_ptr Create() __attribute__((used)) { return std::static_pointer_cast( - std::unique_ptr{new KVStateCacheBlock()}); + std::unique_ptr{new KVCacheBlock()}); } void Construct(const ObjectMeta& meta) override; std::string GetBitmapStr(); - uint64_t GetTensorBytes() { return this->tensorBytes; } + uint64_t GetTensorNBytes() { return this->tensorNBytes; } uint64_t* GetBitmap() { return this->bitmap; } @@ -109,12 +109,12 @@ class KVStateCacheBlock : public vineyard::Registered { return this->valueStateTensorList; } - ~KVStateCacheBlock(); + ~KVCacheBlock(); - friend class KVStateCacheBlockBuilder; + friend class KVCacheBlockBuilder; }; -class KVStateCacheBlockBuilder : public ObjectBuilder { +class KVCacheBlockBuilder : public ObjectBuilder { private: Client& client; std::vector> keyStateTensorBuilderList; @@ -124,20 +124,20 @@ class KVStateCacheBlockBuilder : public ObjectBuilder { uint64_t* bitmap; int blockSize; int bitmapSize; - int tensorBytes; + int tensorNBytes; int layer; int FindEmptySlot(); public: - KVStateCacheBlockBuilder(Client& client, int tensorBytes, int layer, - int blockSize); + KVCacheBlockBuilder(Client& client, int tensorNBytes, int layer, + int blockSize); - KVStateCacheBlockBuilder( - Client& client, std::shared_ptr kv_state_cache_block); + KVCacheBlockBuilder(Client& client, + std::shared_ptr kv_cache_block); static Status Make(Client& client, TreeData* treeData, - KVStateCacheBlockBuilder*& kvStateCacheBlockBuilder); + KVCacheBlockBuilder*& kvCacheBlockBuilder); /** * @brief Update the kv-state using next token. @@ -165,7 +165,7 @@ class KVStateCacheBlockBuilder : public ObjectBuilder { std::shared_ptr _Seal(Client& client) override; - int16_t Split(KVStateCacheBlockBuilder* child, int index); + int16_t Split(KVCacheBlockBuilder* child, int index); const std::shared_ptr& GetKeyStateBuilder(int layer) { return keyStateTensorBuilderList[layer]; @@ -193,15 +193,15 @@ class KVStateCacheBlockBuilder : public ObjectBuilder { uint64_t* GetBitmap() { return this->bitmap; } - uint64_t GetTensorBytes() { return this->tensorBytes; } + uint64_t GetTensorNBytes() { return this->tensorNBytes; } int GetBlockSize() { return this->blockSize; } - void PrintKVStateCacheBlock(); + void PrintKVCacheBlock(); - ~KVStateCacheBlockBuilder(); + ~KVCacheBlockBuilder(); }; } // namespace vineyard -#endif // MODULES_LLM_CACHE_DS_KV_STATE_CACHE_BLOCK_H_ +#endif // MODULES_LLM_CACHE_DS_KV_CACHE_BLOCK_H_ diff --git a/modules/llm-cache/ds/kv_state_cache_manager.cc b/modules/llm-cache/ds/kv_cache_manager.cc similarity index 81% rename from modules/llm-cache/ds/kv_state_cache_manager.cc rename to modules/llm-cache/ds/kv_cache_manager.cc index 2023eadc8c..659db8c090 100644 --- a/modules/llm-cache/ds/kv_state_cache_manager.cc +++ b/modules/llm-cache/ds/kv_cache_manager.cc @@ -22,22 +22,21 @@ limitations under the License. 
#include "client/client.h" #include "common/util/logging.h" #include "common/util/status.h" -#include "llm-cache/ds/kv_state_cache.h" -#include "llm-cache/ds/kv_state_cache_manager.h" +#include "llm-cache/ds/kv_cache.h" +#include "llm-cache/ds/kv_cache_manager.h" #include "llm-cache/storage/blob_storage.h" #include "llm-cache/storage/local_file_storage.h" namespace vineyard { -KVStateCacheManager::KVStateCacheManager( - std::shared_ptr storageImpl) { +KVCacheManager::KVCacheManager(std::shared_ptr storageImpl) { storage = storageImpl; } // use the memory storage for manager -Status KVStateCacheManager::Make(Client& client, - std::shared_ptr& manager, - VineyardCacheConfig& config) { +Status KVCacheManager::Make(Client& client, + std::shared_ptr& manager, + VineyardCacheConfig& config) { if (config.tensorByte <= 0 || config.cacheCapacity <= 0 || config.layer <= 0) { return Status::Invalid("Invalid tensor byte, cache capacity or layer."); @@ -58,15 +57,15 @@ Status KVStateCacheManager::Make(Client& client, config.layer, config.blockSize, config.syncInterval, config.llmCacheSyncLock, config.llmCacheObjectName, config.llmRefcntObjectName)); - manager = std::make_shared(blob_storage); + manager = std::make_shared(blob_storage); manager->config = std::make_shared(config); return Status::OK(); } // use the file storage for manager -Status KVStateCacheManager::Make(std::shared_ptr& manager, - FileCacheConfig& config) { - if (config.batchSize <= 0 || config.splitNumber <= 0) { +Status KVCacheManager::Make(std::shared_ptr& manager, + FileCacheConfig& config) { + if (config.chunkSize <= 0 || config.hashChunkSize <= 0) { return Status::Invalid("Invalid batch size or split number."); } if (config.tensorByte <= 0 || config.cacheCapacity <= 0 || @@ -77,13 +76,13 @@ Status KVStateCacheManager::Make(std::shared_ptr& manager, std::shared_ptr file_storage; if (config.filesystemType == FilesystemType::LOCAL) { file_storage = std::make_shared( - config.tensorByte, config.cacheCapacity, config.layer, config.batchSize, - config.splitNumber, config.root, config.clientGCInterval, config.ttl, + config.tensorByte, config.cacheCapacity, config.layer, config.chunkSize, + config.hashChunkSize, config.root, config.gcInterval, config.ttl, config.enbaleGlobalGC, config.globalGCInterval, config.globalTTL); } else { return Status::Invalid("Unsupported filesystem type"); } - manager = std::make_shared(file_storage); + manager = std::make_shared(file_storage); RETURN_ON_ERROR(file_storage->Init()); manager->config = std::make_shared(config); return Status::OK(); @@ -94,7 +93,7 @@ Status KVStateCacheManager::Make(std::shared_ptr& manager, * manager. * * @param tokenList The token list to be updated. - * @param kvStateList The kv state list of the token list. + * @param kvCacheList The kv state list of the token list. * It's a 2D vector, the first dimension is the token index, * and the second dimension is the layer index. 
* The kv state is a pair of LLMKV, the first is the K tensor @@ -111,43 +110,43 @@ Status KVStateCacheManager::Make(std::shared_ptr& manager, * * * * * Assume the layer is 2, and the token list is [1,2] you should * * * allocate the memory for the kv state like this: * - * * std::vector>> kvStateList;* + * * std::vector>> kvCacheList;* * * for (int i = 0; i < 2; i++) { * * * std::vector> kvState; * * * for (int j = 0; j < 2; j++) { * * * LLMKV key_state; * * * LLMKV value_state; * - * * key_state.data = malloc(tensorBytes); * - * * value_state.data = malloc(tensorBytes) * + * * key_state.data = malloc(tensorNBytes); * + * * value_state.data = malloc(tensorNBytes) * * * // Copy the k_state of LLM KV Cache to key_state.data * * * // Copy the v_state of LLM KV Cache to value_state.data * - * * key_state.length = tensorBytes; * - * * value_state.length = tensorBytes; * + * * key_state.length = tensorNBytes; * + * * value_state.length = tensorNBytes; * * * kvState.emplace_back(key_state, value_state); * * * } * - * * kvStateList.push_back(kvState); * + * * kvCacheList.push_back(kvState); * * *} * * * * * * After calling this function, you must release(free) the * - * * kv buffer of the kvStateList manually * + * * kv buffer of the kvCacheList manually * * * * * ***************************************************************** * * * @note The length of the token list should be as same as the length of the - * kvStateList. + * kvCacheList. * * * @return Status */ -Status KVStateCacheManager::Update( +Status KVCacheManager::Update( const std::vector& tokenList, - const std::vector>>& kvStateList, + const std::vector>>& kvCacheList, size_t& updated) { - if (kvStateList.size() != tokenList.size()) { + if (kvCacheList.size() != tokenList.size()) { return Status::Invalid("Token list size not match kv state list size"); } - return storage->Update(tokenList, kvStateList, updated); + return storage->Update(tokenList, kvCacheList, updated); } /** @@ -156,7 +155,7 @@ Status KVStateCacheManager::Update( * * @param prefix The prefix of the token list. * @param tokenList The token list to be updated. - * @param kvStateList The kv state list of the token list. + * @param kvCacheList The kv state list of the token list. * It's a 2D vector, the first dimension is the token index, * and the second dimension is the layer index. 
* The kv state is a pair of LLMKV, the first is the K tensor @@ -173,41 +172,41 @@ Status KVStateCacheManager::Update( * * * * * Assume the layer is 2, and the token list is [1,2] you should * * * allocate the memory for the kv state like this: * - * * std::vector>> kvStateList;* + * * std::vector>> kvCacheList;* * * for (int i = 0; i < 2; i++) { * * * std::vector> kvState; * * * for (int j = 0; j < 2; j++) { * * * LLMKV key_state; * * * LLMKV value_state; * - * * key_state.data = malloc(tensorBytes); * - * * value_state.data = malloc(tensorBytes) * + * * key_state.data = malloc(tensorNBytes); * + * * value_state.data = malloc(tensorNBytes) * * * // Copy the k_state of LLM KV Cache to key_state.data * * * // Copy the v_state of LLM KV Cache to value_state.data * - * * key_state.length = tensorBytes; * - * * value_state.length = tensorBytes; * + * * key_state.length = tensorNBytes; * + * * value_state.length = tensorNBytes; * * * kvState.emplace_back(key_state, value_state); * * * } * - * * kvStateList.push_back(kvState); * + * * kvCacheList.push_back(kvState); * * *} * * * * * * After calling this function, you must release(free) the * - * * kv buffer of the kvStateList manually * + * * kv buffer of the kvCacheList manually * * * * * ***************************************************************** * * @note The length of the token list should be as same as the length of the - * kvStateList. + * kvCacheList. * * @return Status */ -Status KVStateCacheManager::Update( +Status KVCacheManager::Update( const std::vector& prefix, const std::vector& tokenList, - const std::vector>>& kvStateList, + const std::vector>>& kvCacheList, size_t& updated) { - if (kvStateList.size() != tokenList.size()) { + if (kvCacheList.size() != tokenList.size()) { return Status::Invalid("Token list size not match kv state list size"); } - return storage->Update(prefix, tokenList, kvStateList, updated); + return storage->Update(prefix, tokenList, kvCacheList, updated); } /** @@ -228,12 +227,12 @@ Status KVStateCacheManager::Update( * * for (int i = 0; i < 2; i++) { * * * LLMKV key_state; * * * LLMKV value_state; * - * * key_state.data = malloc(tensorBytes); * - * * value_state.data = malloc(tensorBytes) * + * * key_state.data = malloc(tensorNBytes); * + * * value_state.data = malloc(tensorNBytes) * * * // Copy the k_state of LLM KV Cache to key_state.data * * * // Copy the v_state of LLM KV Cache to value_state.data * - * * key_state.length = tensorBytes; * - * * value_state.length = tensorBytes; * + * * key_state.length = tensorNBytes; * + * * value_state.length = tensorNBytes; * * * kvState.emplace_back(key_state, value_state); * * *} * * * * @@ -245,67 +244,18 @@ Status KVStateCacheManager::Update( * @return Status to indicate whether the kv state has been updated * successfully. */ -Status KVStateCacheManager::Update( +Status KVCacheManager::Update( const std::vector& tokenList, int nextToken, const std::vector>& kvState) { return storage->Update(tokenList, nextToken, kvState); } -/** - * @brief Query the kv state with the given token and its prefix in the kv state - * cache manager. - * - * @param tokenList The token list as the prefix of the next token. - * @param nextToken The next token to be queried. - * @param kvState The kv state of the next token. It must be initialized before - * calling this function, including the data and length of the kv - * tensor. Also, the length of the kvState should be as same as - * the layer of the kv state. 
- * - * ***************************************************************** - * * Only support for blob storage, the kv state is managed by the * - * * kv state cache manager, the caller does not need to malloc * - * * and free the memory of the kv state. Besides, the data * - * * pointer should be nullptr and the length should be 0. * - * * * - * * Assume the layer is 2, you should allocate the memory for the * - * * kv state like this: * - * * std::vector> kvState; * - * * for (int i = 0; i < 2; i++) { * - * * LLMKV key_state; * - * * LLMKV value_state; * - * * key_state.data = nullptr * - * * value_state.data = nullptr * - * * key_state.length = 0; * - * * value_state.length = 0; * - * * kvState.emplace_back(key_state, value_state); * - * *} * - * * * - * * After calling this function, the key_state's data is pointing * - * * to the K tensor data stored in vineyard blob, and the * - * * value_state's data is pointing to the V tensor data stored in * - * * vineyard blob. All the length of the kv state is the size of * - * * the tensor data. Then you can copy the kv state to the LLM KV * - * * Cache. The memory of the kv state will be freed when calling * - * * the close function of the kv state cache manager. * - * * * - * ***************************************************************** - * - * @return Status to indicate whether the kv state has been queried - * successfully. - */ -Status KVStateCacheManager::Query( - const std::vector& tokenList, int nextToken, - std::vector>& kvState) { - return storage->Query(tokenList, nextToken, kvState); -} - /** * @brief Query the kv state with the given token list in the kv state cache * manager. * * @param tokenList The token list to be queried. - * @param kvStateList The kv state list of the token list. + * @param kvCacheList The kv state list of the token list. * It must be initialized before calling this function, * including the data and length of the kv tensor. 
* It's a 2D vector, the first dimension is the token index, @@ -327,7 +277,7 @@ Status KVStateCacheManager::Query( * * * * * Assume the layer is 2, and the token list is [1,2] you should * * * allocate the memory for the kv state like this: * - * * std::vector>> kvStateList;* + * * std::vector>> kvCacheList;* * * for (int i = 0; i < 2; i++) { * * * std::vector> kvState; * * * for (int j = 0; j < 2; j++) { * @@ -339,7 +289,7 @@ Status KVStateCacheManager::Query( * * value_state.length = 0; * * * kvState.emplace_back(key_state, value_state); * * * } * - * * kvStateList.push_back(kvState); * + * * kvCacheList.push_back(kvState); * * *} * * * * * * After calling this function, the key_state's data is pointing * @@ -360,19 +310,19 @@ Status KVStateCacheManager::Query( * * * * * Assume the layer is 2, and the token list is [1,2] you should * * * allocate the memory for the kv state like this: * - * * std::vector>> kvStateList;* + * * std::vector>> kvCacheList;* * * for (int i = 0; i < 2; i++) { * * * std::vector> kvState; * * * for (int j = 0; j < 2; j++) { * * * LLMKV key_state; * * * LLMKV value_state; * - * * key_state.data = malloc(tensorBytes); * - * * value_state.data = malloc(tensorBytes) * - * * key_state.length = tensorBytes; * - * * value_state.length = tensorBytes; * + * * key_state.data = malloc(tensorNBytes); * + * * value_state.data = malloc(tensorNBytes) * + * * key_state.length = tensorNBytes; * + * * value_state.length = tensorNBytes; * * * kvState.emplace_back(key_state, value_state); * * * } * - * * kvStateList.push_back(kvState); * + * * kvCacheList.push_back(kvState); * * *} * * * * * * After calling this function, the key_state and value_state * @@ -383,35 +333,86 @@ Status KVStateCacheManager::Query( * ***************************************************************** * * @note The length of the token list should be as same as the length of the - * kvStateList. and the second dimension of the kvStateList should be as same as + * kvCacheList. and the second dimension of the kvCacheList should be as same as * the layer of the kv state. * * @return Status */ -Status KVStateCacheManager::Query( +Status KVCacheManager::Query( const std::vector& tokenList, - std::vector>>& kvStateList, + std::vector>>& kvCacheList, size_t& matched) { - return storage->Query(tokenList, kvStateList, matched); + return storage->Query(tokenList, kvCacheList, matched); } -Status KVStateCacheManager::ClearGlobalCache(Client& client, - VineyardCacheConfig& config) { +/** + * @brief Query the kv state with the given token and its prefix in the kv state + * cache manager. + * + * @param tokenList The token list as the prefix of the next token. + * @param nextToken The next token to be queried. + * @param kvState The kv state of the next token. It must be initialized before + * calling this function, including the data and length of the kv + * tensor. Also, the length of the kvState should be as same as + * the layer of the kv state. + * + * ***************************************************************** + * * Only support for blob storage, the kv state is managed by the * + * * kv state cache manager, the caller does not need to malloc * + * * and free the memory of the kv state. Besides, the data * + * * pointer should be nullptr and the length should be 0. 
* + * * * + * * Assume the layer is 2, you should allocate the memory for the * + * * kv state like this: * + * * std::vector> kvState; * + * * for (int i = 0; i < 2; i++) { * + * * LLMKV key_state; * + * * LLMKV value_state; * + * * key_state.data = nullptr * + * * value_state.data = nullptr * + * * key_state.length = 0; * + * * value_state.length = 0; * + * * kvState.emplace_back(key_state, value_state); * + * *} * + * * * + * * After calling this function, the key_state's data is pointing * + * * to the K tensor data stored in vineyard blob, and the * + * * value_state's data is pointing to the V tensor data stored in * + * * vineyard blob. All the length of the kv state is the size of * + * * the tensor data. Then you can copy the kv state to the LLM KV * + * * Cache. The memory of the kv state will be freed when calling * + * * the close function of the kv state cache manager. * + * * * + * ***************************************************************** + * + * @return Status to indicate whether the kv state has been queried + * successfully. + */ +Status KVCacheManager::Query(const std::vector& prefix, int nextToken, + std::vector>& kvState) { + return storage->Query(prefix, nextToken, kvState); +} + +Status KVCacheManager::Query( + const std::vector& prefix, const std::vector& tokenList, + std::vector>>& kvCacheList, + size_t& matched) { + return storage->Query(prefix, tokenList, kvCacheList, matched); +} + +Status KVCacheManager::ClearGlobalCache(Client& client, + VineyardCacheConfig& config) { return BlobStorage::ClearGlobalCache(client, config.llmCacheSyncLock, config.llmCacheObjectName, config.llmRefcntObjectName); } -void KVStateCacheManager::Close() { storage->CloseCache(); } +void KVCacheManager::Close() { storage->CloseCache(); } -void KVStateCacheManager::StopGlobalGCThread() { - storage->StopGlobalGCThread(); -} +void KVCacheManager::StopGlobalGCThread() { storage->StopGlobalGCThread(); } -void KVStateCacheManager::StartGlobalGCThread() { - storage->StartGlobalGCThread(); -} +void KVCacheManager::StartGlobalGCThread() { storage->StartGlobalGCThread(); } -KVStateCacheManager::~KVStateCacheManager() {} +KVCacheManager::~KVCacheManager() {} } // namespace vineyard diff --git a/modules/llm-cache/ds/kv_state_cache_manager.h b/modules/llm-cache/ds/kv_cache_manager.h similarity index 72% rename from modules/llm-cache/ds/kv_state_cache_manager.h rename to modules/llm-cache/ds/kv_cache_manager.h index 9bf513da28..8cccabc8f4 100644 --- a/modules/llm-cache/ds/kv_state_cache_manager.h +++ b/modules/llm-cache/ds/kv_cache_manager.h @@ -20,26 +20,25 @@ limitations under the License. 
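The manager implementation above pairs with the declarations that follow, and together they fix the typical caller flow: Make a manager from a config, Update with caller-owned buffers, then Query back into buffers of the same shape. The sketch below follows the allocation pattern the doc comments prescribe for the file-backed cache; the sizes are illustrative, the token count is assumed to be a multiple of config.chunkSize, LLMKV is assumed to be the {data, length} pair used throughout this module, and RETURN_ON_ERROR is the module's status-check macro:

```cpp
// End-to-end sketch of the renamed KVCacheManager API (file-backed path).
#include <cstdlib>
#include <memory>
#include <utility>
#include <vector>

#include "llm-cache/ds/config.h"
#include "llm-cache/ds/kv_cache_manager.h"

vineyard::Status UpdateThenQuery(vineyard::FileCacheConfig& config) {
  std::shared_ptr<vineyard::KVCacheManager> manager;
  RETURN_ON_ERROR(vineyard::KVCacheManager::Make(manager, config));

  const int layer = 2;                 // illustrative
  const size_t tensorNBytes = 4096;    // illustrative
  std::vector<int> tokens = {1, 2, 3, 4};  // assumed multiple of chunkSize

  // One (K, V) pair per token per layer, caller-allocated as the doc
  // comments above prescribe.
  std::vector<std::vector<std::pair<vineyard::LLMKV, vineyard::LLMKV>>>
      kvCacheList;
  for (size_t i = 0; i < tokens.size(); ++i) {
    std::vector<std::pair<vineyard::LLMKV, vineyard::LLMKV>> kvState;
    for (int j = 0; j < layer; ++j) {
      vineyard::LLMKV k{malloc(tensorNBytes), tensorNBytes};
      vineyard::LLMKV v{malloc(tensorNBytes), tensorNBytes};
      // ... copy the model's K/V tensors into k.data and v.data ...
      kvState.emplace_back(k, v);
    }
    kvCacheList.push_back(kvState);
  }

  size_t updated = 0, matched = 0;
  RETURN_ON_ERROR(manager->Update(tokens, kvCacheList, updated));
  RETURN_ON_ERROR(manager->Query(tokens, kvCacheList, matched));

  // The caller owns these buffers and must free them.
  for (auto& kvState : kvCacheList) {
    for (auto& kv : kvState) {
      free(kv.first.data);
      free(kv.second.data);
    }
  }
  manager->Close();
  return vineyard::Status::OK();
}
```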
 #include 

 #include "llm-cache/ds/config.h"
-#include "llm-cache/ds/kv_state_cache.h"
+#include "llm-cache/ds/kv_cache.h"
 #include "llm-cache/storage/blob_storage.h"
 #include "llm-cache/storage/file_storage.h"

-#ifndef MODULES_LLM_CACHE_DS_KV_STATE_CACHE_MANAGER_H_
-#define MODULES_LLM_CACHE_DS_KV_STATE_CACHE_MANAGER_H_
+#ifndef MODULES_LLM_CACHE_DS_KV_CACHE_MANAGER_H_
+#define MODULES_LLM_CACHE_DS_KV_CACHE_MANAGER_H_

 namespace vineyard {

-class KVStateCacheManager {
+class KVCacheManager {
  public:
-  explicit KVStateCacheManager(std::shared_ptr storageImpl);
+  explicit KVCacheManager(std::shared_ptr storageImpl);

-  ~KVStateCacheManager();
+  ~KVCacheManager();

-  static Status Make(Client& client,
-                     std::shared_ptr& manager,
+  static Status Make(Client& client, std::shared_ptr& manager,
                      VineyardCacheConfig& config);

-  static Status Make(std::shared_ptr& manager,
+  static Status Make(std::shared_ptr& manager,
                      FileCacheConfig& config);

   Status Update(const std::vector& tokenList, int nextToken,
@@ -47,19 +46,24 @@ class KVStateCacheManager {

   Status Update(
       const std::vector& tokenList,
-      const std::vector>>& kvStateList,
+      const std::vector>>& kvCacheList,
       size_t& updated);

   Status Update(
       const std::vector& prefix, const std::vector& tokenList,
-      const std::vector>>& kvStateList,
+      const std::vector>>& kvCacheList,
       size_t& updated);

-  Status Query(const std::vector& tokenList, int token,
+  Status Query(const std::vector& tokenList,
+               std::vector>>& kvCacheList,
+               size_t& matched);
+
+  Status Query(const std::vector& prefix, int nextToken,
                std::vector>& kvState);

-  Status Query(const std::vector& tokenList,
-               std::vector>>& kvStateList,
+  Status Query(const std::vector& prefix,
+               const std::vector& tokenList,
+               std::vector>>& kvCacheList,
                size_t& matched);

   void Close();
@@ -79,4 +83,4 @@

 } // namespace vineyard

-#endif // MODULES_LLM_CACHE_DS_KV_STATE_CACHE_MANAGER_H_
+#endif // MODULES_LLM_CACHE_DS_KV_CACHE_MANAGER_H_
diff --git a/modules/llm-cache/hash/hasher.h b/modules/llm-cache/hash/hasher.h
index 0b4c9b47a1..0d936df3fc 100644
--- a/modules/llm-cache/hash/hasher.h
+++ b/modules/llm-cache/hash/hasher.h
@@ -32,8 +32,8 @@ class Hasher {
  * @brief Compute the path list for the token list
  *
  * @param tokenList The list of tokens
- * @param batchSize The size of the batch
- * @param splitNumber The number of splits
+ * @param chunkSize The number of tokens per hash chunk
+ * @param hashChunkSize The number of hash characters per path segment
  * @param pathList The Relative path list of the token list
  *
  * @return Status
@@ -54,29 +54,29 @@
  * hashValue3(4c90a490) -> 4c/90/a4/90
  *
  */
-  Status computePathForTokens(const std::vector& tokenList, int batchSize,
-                              int splitNumber,
+  Status computePathForTokens(const std::vector& tokenList, int chunkSize,
+                              int hashChunkSize,
                               std::vector& pathList) {
     char hashBuffer[9];
-    int tokenSize = tokenList.size() - tokenList.size() % batchSize;
+    int tokenSize = tokenList.size() - tokenList.size() % chunkSize;
     // if the token list (upper_bound) is less than the batch size, then return
     // directly
-    if (tokenSize < batchSize) {
+    if (tokenSize < chunkSize) {
       return Status::OK();
     }
     // split the token list into batches
-    for (int i = 0; i < tokenSize; i += batchSize) {
+    for (int i = 0; i < tokenSize; i += chunkSize) {
       int hashValue =
           hashAlgorithm->hash(reinterpret_cast(tokenList.data()),
-                              (i + batchSize) * sizeof(int));
+                              (i + chunkSize) * sizeof(int));
       // split the hash value into paths
       std::snprintf(hashBuffer, sizeof(hashBuffer), "%08x", hashValue);
       int index =
0; std::string path; - while (index + splitNumber < 8) { - path += std::string(hashBuffer + index, splitNumber) + "/"; - index += splitNumber; + while (index + hashChunkSize < 8) { + path += std::string(hashBuffer + index, hashChunkSize) + "/"; + index += hashChunkSize; } path += std::string(hashBuffer + index, 8 - index); pathList.push_back(path); diff --git a/modules/llm-cache/storage/blob_storage.cc b/modules/llm-cache/storage/blob_storage.cc index 1c4b8beb00..fed7bab297 100644 --- a/modules/llm-cache/storage/blob_storage.cc +++ b/modules/llm-cache/storage/blob_storage.cc @@ -23,14 +23,13 @@ limitations under the License. namespace vineyard { -BlobStorage::BlobStorage(Client& client, - std::shared_ptr& cache, +BlobStorage::BlobStorage(Client& client, std::shared_ptr& cache, int syncInterval, std::string& llmCacheSyncLock, std::string& llmCacheObjectName, std::string& llmRefcntObjectName) : client(client) { this->syncInterval = syncInterval; - this->kvStateCacheBuilder = cache; + this->kvCacheBuilder = cache; this->llmCacheSyncLock = llmCacheSyncLock; this->llmCacheObjectName = llmCacheObjectName; this->llmRefcntObjectName = llmRefcntObjectName; @@ -38,7 +37,7 @@ BlobStorage::BlobStorage(Client& client, } Status BlobStorage::Make(Client& client, std::shared_ptr& storage, - int tensorBytes, int cacheCapacity, int layer, + int tensorNBytes, int cacheCapacity, int layer, int blockSize, int syncInterval, std::string llmCacheSyncLock, std::string llmCacheObjectName, @@ -50,40 +49,37 @@ Status BlobStorage::Make(Client& client, std::shared_ptr& storage, AcquireServerLock(client, llmCacheSyncLock, actualKey); // sync global cache object with vineyard - ObjectID globalKVStateCacheID; + ObjectID globalKVCacheID; std::set blockIDSetToAdd; std::set blockIDSetToDelete; - Status status = client.GetName(llmCacheObjectName, globalKVStateCacheID); - std::shared_ptr kvStateCacheBuilder; + Status status = client.GetName(llmCacheObjectName, globalKVCacheID); + std::shared_ptr kvCacheBuilder; if (status.ok()) { // if success, pull the cache object - std::shared_ptr globalKVStateCache = - std::dynamic_pointer_cast( - client.FetchAndGetObject(globalKVStateCacheID)); - Status status = KVStateCacheBuilder::Make(client, kvStateCacheBuilder, - globalKVStateCache); + std::shared_ptr globalKVCache = std::dynamic_pointer_cast( + client.FetchAndGetObject(globalKVCacheID)); + Status status = KVCacheBuilder::Make(client, kvCacheBuilder, globalKVCache); if (!status.ok()) { ReleaseServerLock(client, actualKey); return Status::Invalid( "Failed to make the cache object from global cache object."); } - if (globalKVStateCache->id() != globalKVStateCacheID) { + if (globalKVCache->id() != globalKVCacheID) { VLOG(100) << "Del migrate object"; - Status status = client.DelData(globalKVStateCache->id()); + Status status = client.DelData(globalKVCache->id()); if (!status.ok()) { LOG(ERROR) << "Delete object failed: " << status.ToString() << " It may cause memory leak."; } } - kvStateCacheBuilder->GetCurrentBlockIDSet(blockIDSetToAdd); - blockIDSetToDelete = kvStateCacheBuilder->GetBlockIDSetToDelete(); + kvCacheBuilder->GetCurrentBlockIDSet(blockIDSetToAdd); + blockIDSetToDelete = kvCacheBuilder->GetBlockIDSetToDelete(); } else { // if failed, create a new cache object LOG(INFO) << "failed to get the cache object, create a new one."; - Status status = - KVStateCacheBuilder::Make(client, kvStateCacheBuilder, tensorBytes, - cacheCapacity, layer, blockSize); + Status status = KVCacheBuilder::Make(client, kvCacheBuilder, tensorNBytes, + 
cacheCapacity, layer, blockSize); if (!status.ok()) { ReleaseServerLock(client, actualKey); return Status::Invalid("Failed to make new cache object."); @@ -92,9 +88,9 @@ Status BlobStorage::Make(Client& client, std::shared_ptr& storage, // TBD // use lease to prevent the deadlock if the client is down - storage = std::make_shared( - client, kvStateCacheBuilder, syncInterval, llmCacheSyncLock, - llmCacheObjectName, llmRefcntObjectName); + storage = std::make_shared(client, kvCacheBuilder, syncInterval, + llmCacheSyncLock, llmCacheObjectName, + llmRefcntObjectName); VINEYARD_CHECK_OK(storage->SetRefcntMap(blockIDSetToDelete, blockIDSetToAdd)); // release the lock ReleaseServerLock(client, actualKey); @@ -104,13 +100,13 @@ Status BlobStorage::Make(Client& client, std::shared_ptr& storage, Status BlobStorage::UpdateInternal( const std::vector& tokenList, int nextToken, const std::vector>& kvState) { - return kvStateCacheBuilder->Update(tokenList, nextToken, kvState); + return kvCacheBuilder->Update(tokenList, nextToken, kvState); } Status BlobStorage::QueryInternal( const std::vector& tokenList, int token, std::vector>& kvState) { - return kvStateCacheBuilder->Query(tokenList, token, kvState); + return kvCacheBuilder->Query(tokenList, token, kvState); } /** @@ -132,12 +128,12 @@ Status BlobStorage::QueryInternal( * * for (int i = 0; i < 2; i++) { * * * LLMKV key_state; * * * LLMKV value_state; * - * * key_state.data = malloc(tensorBytes); * - * * value_state.data = malloc(tensorBytes) * + * * key_state.data = malloc(tensorNBytes); * + * * value_state.data = malloc(tensorNBytes) * * * // Copy the k_state of LLM KV Cache to key_state.data * * * // Copy the v_state of LLM KV Cache to value_state.data * - * * key_state.length = tensorBytes; * - * * value_state.length = tensorBytes; * + * * key_state.length = tensorNBytes; * + * * value_state.length = tensorNBytes; * * * kvState.emplace_back(key_state, value_state); * * *} * * * * @@ -169,7 +165,7 @@ Status BlobStorage::Update( * manager. * * @param tokenList The token list to be updated. - * @param kvStateList The kv state list of the token list. + * @param kvCacheList The kv state list of the token list. * It's a 2D vector, the first dimension is the token index, * and the second dimension is the layer index. 
* The kv state is a pair of LLMKV, the first is the K tensor @@ -185,38 +181,38 @@ Status BlobStorage::Update( * * * * * Assume the layer is 2, and the token list is [1,2] you should * * * allocate the memory for the kv state like this: * - * * std::vector>> kvStateList;* + * * std::vector>> kvCacheList;* * * for (int i = 0; i < 2; i++) { * * * std::vector> kvState; * * * for (int j = 0; j < 2; j++) { * * * LLMKV key_state; * * * LLMKV value_state; * - * * key_state.data = malloc(tensorBytes); * - * * value_state.data = malloc(tensorBytes) * + * * key_state.data = malloc(tensorNBytes); * + * * value_state.data = malloc(tensorNBytes) * * * // Copy the k_state of LLM KV Cache to key_state.data * * * // Copy the v_state of LLM KV Cache to value_state.data * - * * key_state.length = tensorBytes; * - * * value_state.length = tensorBytes; * + * * key_state.length = tensorNBytes; * + * * value_state.length = tensorNBytes; * * * kvState.emplace_back(key_state, value_state); * * * } * - * * kvStateList.push_back(kvState); * + * * kvCacheList.push_back(kvState); * * *} * * * * * * After calling this function, you must release(free) the * - * * kv buffer of the kvStateList manually * + * * kv buffer of the kvCacheList manually * * * * * ***************************************************************** * * * @note The length of the token list should be as same as the length of the - * kvStateList. and the second dimension of the kvStateList should be as same as + * kvCacheList. and the second dimension of the kvCacheList should be as same as * the layer of the kv state. * * @return Status */ Status BlobStorage::Update( const std::vector& tokenList, - const std::vector>>& kvStateList, + const std::vector>>& kvCacheList, size_t& updated) { std::unique_lock lock(cacheAccessMutex, std::defer_lock); if (!lock.try_lock()) { @@ -227,7 +223,7 @@ Status BlobStorage::Update( } std::vector tokenListCopy; for (size_t i = 0; i < tokenList.size(); i++) { - Status result = UpdateInternal(tokenListCopy, tokenList[i], kvStateList[i]); + Status result = UpdateInternal(tokenListCopy, tokenList[i], kvCacheList[i]); if (!result.ok()) { break; } @@ -244,7 +240,7 @@ Status BlobStorage::Update( * * @param prefix The prefix of the token list. * @param tokenList The token list to be updated. - * @param kvStateList The kv state list of the token list. + * @param kvCacheList The kv state list of the token list. * It's a 2D vector, the first dimension is the token index, * and the second dimension is the layer index. 
* The kv state is a pair of LLMKV, the first is the K tensor @@ -258,25 +254,25 @@ Status BlobStorage::Update( * * * * * Assume the layer is 2, and the token list is [1,2] you should * * * allocate the memory for the kv state like this: * - * * std::vector>> kvStateList;* + * * std::vector>> kvCacheList;* * * for (int i = 0; i < 2; i++) { * * * std::vector> kvState; * * * for (int j = 0; j < 2; j++) { * * * LLMKV key_state; * * * LLMKV value_state; * - * * key_state.data = malloc(tensorBytes); * - * * value_state.data = malloc(tensorBytes) * + * * key_state.data = malloc(tensorNBytes); * + * * value_state.data = malloc(tensorNBytes) * * * // Copy the k_state of LLM KV Cache to key_state.data * * * // Copy the v_state of LLM KV Cache to value_state.data * - * * key_state.length = tensorBytes; * - * * value_state.length = tensorBytes; * + * * key_state.length = tensorNBytes; * + * * value_state.length = tensorNBytes; * * * kvState.emplace_back(key_state, value_state); * * * } * - * * kvStateList.push_back(kvState); * + * * kvCacheList.push_back(kvState); * * *} * * * * * * After calling this function, you must release(free) the * - * * kv buffer of the kvStateList * + * * kv buffer of the kvCacheList * * * * * ***************************************************************** * @@ -287,7 +283,7 @@ Status BlobStorage::Update( */ Status BlobStorage::Update( const std::vector& prefix, const std::vector& tokenList, - const std::vector>>& kvStateList, + const std::vector>>& kvCacheList, size_t& updated) { std::unique_lock lock(cacheAccessMutex, std::defer_lock); if (!lock.try_lock()) { @@ -298,7 +294,7 @@ Status BlobStorage::Update( } std::vector tokenListCopy(prefix.begin(), prefix.end()); for (size_t i = 0; i < tokenList.size(); i++) { - Status result = UpdateInternal(tokenListCopy, tokenList[i], kvStateList[i]); + Status result = UpdateInternal(tokenListCopy, tokenList[i], kvCacheList[i]); if (!result.ok()) { break; } @@ -309,68 +305,12 @@ Status BlobStorage::Update( return Status::OK(); } -/** - * @brief Query the kv state with the given token and its prefix in the kv state - * cache manager. - * - * @param tokenList The token list as the prefix of the updated token. - * @param token The token to be queried. - * @param kvState The kv state of the token. It must be initialized(allocated) - * before calling this function, including the data and length - * of the kv state. The length of the kv state should be as same - * as the layer of the kv state cache manager. - * - * ***************************************************************** - * * Important, the kv state is managed by the kv state cache * - * * manager, the caller does not need to malloc and free the * - * * memory of the kv state. Besides, the data pointer should be * - * * nullptr and the length should be 0. * - * * * - * * Assume the layer is 2, you should allocate the memory for the * - * * kv state like this: * - * * std::vector> kvState; * - * * for (int i = 0; i < 2; i++) { * - * * LLMKV key_state; * - * * LLMKV value_state; * - * * key_state.data = nullptr * - * * value_state.data = nullptr * - * * key_state.length = 0; * - * * value_state.length = 0; * - * * kvState.emplace_back(key_state, value_state); * - * *} * - * * * - * * After calling this function, the key_state's data is pointing * - * * to the K tensor data stored in vineyard blob, and the * - * * value_state's data is pointing to the V tensor data stored in * - * * vineyard blob. All the length of the kv state is the size of * - * * the tensor data. 
Then you can copy the kv state to the LLM KV * - * * Cache. The memory of the kv state will be freed when calling * - * * the close function of the kv state cache manager. * - * * * - * ***************************************************************** - * - * @return Status - */ -Status BlobStorage::Query(const std::vector& tokenList, int token, - std::vector>& kvState) { - std::unique_lock lock(cacheAccessMutex, std::defer_lock); - if (!lock.try_lock()) { - // If failed to gain the lock, return OK and wait for next time - return Status::OK(); - } - if (isClosed) { - return Status::Invalid("The memory storage is closed."); - } - - return QueryInternal(tokenList, token, kvState); -} - /** * @brief Query the kv state with the given token list and its prefix in the kv * state cache manager. * * @param tokenList The token list as the prefix of the updated token. - * @param kvStateList The kv state list of the token list. It must be + * @param kvCacheList The kv state list of the token list. It must be * initialized before calling this function, including the * data and length of the kv tensor. * The kv state list is a 2D vector, the first dimension is @@ -391,7 +331,7 @@ Status BlobStorage::Query(const std::vector& tokenList, int token, * * * * * Assume the layer is 2, and the token list is [1,2] you should * * * allocate the memory for the kv state like this: * - * * std::vector>> kvStateList;* + * * std::vector>> kvCacheList;* * * for (int i = 0; i < 2; i++) { * * * std::vector> kvState; * * * for (int j = 0; j < 2; j++) { * @@ -403,7 +343,7 @@ Status BlobStorage::Query(const std::vector& tokenList, int token, * * value_state.length = 0; * * * kvState.emplace_back(key_state, value_state); * * * } * - * * kvStateList.push_back(kvState); * + * * kvCacheList.push_back(kvState); * * *} * * * * * * After calling this function, the key_state's data is pointing * @@ -420,7 +360,7 @@ Status BlobStorage::Query(const std::vector& tokenList, int token, */ Status BlobStorage::Query( const std::vector& tokenList, - std::vector>>& kvStateList, + std::vector>>& kvCacheList, size_t& matched) { std::unique_lock lock(cacheAccessMutex, std::defer_lock); if (!lock.try_lock()) { @@ -434,9 +374,9 @@ Status BlobStorage::Query( // copy the token list and query the cache one token by one token matched = 0; std::vector tokenListPrefix; - for (size_t i = 0; i < tokenList.size() && i < kvStateList.size(); i++) { + for (size_t i = 0; i < tokenList.size() && i < kvCacheList.size(); i++) { Status result = - QueryInternal(tokenListPrefix, tokenList[i], kvStateList[i]); + QueryInternal(tokenListPrefix, tokenList[i], kvCacheList[i]); if (!result.ok()) { return Status::OK(); } @@ -447,6 +387,68 @@ Status BlobStorage::Query( return Status::OK(); } +/** + * @brief Query the kv state with the given token and its prefix in the kv state + * cache manager. + * + * @param prefix The token list as the prefix of the updated token. + * @param token The token to be queried. + * @param kvState The kv state of the token. It must be initialized(allocated) + * before calling this function, including the data and length + * of the kv state. The length of the kv state should be as same + * as the layer of the kv state cache manager. + * + * ***************************************************************** + * * Important, the kv state is managed by the kv state cache * + * * manager, the caller does not need to malloc and free the * + * * memory of the kv state. Besides, the data pointer should be * + * * nullptr and the length should be 0. 
* + * * * + * * Assume the layer is 2, you should allocate the memory for the * + * * kv state like this: * + * * std::vector> kvState; * + * * for (int i = 0; i < 2; i++) { * + * * LLMKV key_state; * + * * LLMKV value_state; * + * * key_state.data = nullptr * + * * value_state.data = nullptr * + * * key_state.length = 0; * + * * value_state.length = 0; * + * * kvState.emplace_back(key_state, value_state); * + * *} * + * * * + * * After calling this function, the key_state's data is pointing * + * * to the K tensor data stored in vineyard blob, and the * + * * value_state's data is pointing to the V tensor data stored in * + * * vineyard blob. All the length of the kv state is the size of * + * * the tensor data. Then you can copy the kv state to the LLM KV * + * * Cache. The memory of the kv state will be freed when calling * + * * the close function of the kv state cache manager. * + * * * + * ***************************************************************** + * + * @return Status + */ +Status BlobStorage::Query(const std::vector& prefix, int token, + std::vector>& kvState) { + std::unique_lock lock(cacheAccessMutex, std::defer_lock); + if (!lock.try_lock()) { + // If failed to gain the lock, return OK and wait for next time + return Status::OK(); + } + if (isClosed) { + return Status::Invalid("The memory storage is closed."); + } + return QueryInternal(prefix, token, kvState); +} + +Status BlobStorage::Query( + const std::vector& prefix, const std::vector& tokenList, + std::vector>>& kvCacheList, + size_t& matched) { + return Status::NotImplemented(); +} + BlobStorage::~BlobStorage() { StopSync(); LOG(INFO) << "BlobStorage exit."; @@ -455,10 +457,10 @@ BlobStorage::~BlobStorage() { // This function is used for testing void BlobStorage::Delete(std::vector& token) { std::shared_ptr evictedNode; - kvStateCacheBuilder->GetRootTree()->Delete(token, evictedNode); - kvStateCacheBuilder->Delete(evictedNode); + kvCacheBuilder->GetRootTree()->Delete(token, evictedNode); + kvCacheBuilder->Delete(evictedNode); if (VLOG_IS_ON(100)) { - VLOG(100) << raxShow(kvStateCacheBuilder->GetRootTree()->tree); + VLOG(100) << raxShow(kvCacheBuilder->GetRootTree()->tree); } } @@ -468,16 +470,16 @@ Status BlobStorage::Sync() { std::set blockIDSetToDelete; std::set globalBlockIDSet; // 1. pull the cache object - ObjectID globalKVStateCacheID; + ObjectID globalKVCacheID; std::vector deleteList; - std::shared_ptr globalKVStateCache = nullptr; - status = client.GetName(llmCacheObjectName, globalKVStateCacheID); + std::shared_ptr globalKVCache = nullptr; + status = client.GetName(llmCacheObjectName, globalKVCacheID); if (status.ok()) { - deleteList.push_back(globalKVStateCacheID); - globalKVStateCache = std::dynamic_pointer_cast( - client.FetchAndGetObject(globalKVStateCacheID)); - globalKVStateCache->GetCurrentBlockIDSet(globalBlockIDSet); + deleteList.push_back(globalKVCacheID); + globalKVCache = std::dynamic_pointer_cast( + client.FetchAndGetObject(globalKVCacheID)); + globalKVCache->GetCurrentBlockIDSet(globalBlockIDSet); } else { // Not an error. VLOG(100) << "There is no cache object in the meta server."; @@ -485,48 +487,47 @@ Status BlobStorage::Sync() { // 2. merge the cache object // only the global cache object with higher version will be merged - VLOG(100) << "Current builder version:" << kvStateCacheBuilder->GetVersion() + VLOG(100) << "Current builder version:" << kvCacheBuilder->GetVersion() << " global version:" - << (globalKVStateCache == nullptr + << (globalKVCache == nullptr ? 
"null" - : std::to_string(globalKVStateCache->GetVersion())); - if (globalKVStateCache != nullptr && - kvStateCacheBuilder->GetVersion() < globalKVStateCache->GetVersion()) { - status = kvStateCacheBuilder->Merge(globalKVStateCache); + : std::to_string(globalKVCache->GetVersion())); + if (globalKVCache != nullptr && + kvCacheBuilder->GetVersion() < globalKVCache->GetVersion()) { + status = kvCacheBuilder->Merge(globalKVCache); RETURN_ON_ERROR(status); - if (globalKVStateCache->id() != globalKVStateCacheID) { + if (globalKVCache->id() != globalKVCacheID) { VLOG(100) << "Del migrate object"; - Status status = client.DelData(globalKVStateCache->id()); + Status status = client.DelData(globalKVCache->id()); if (!status.ok()) { LOG(ERROR) << "Delete object failed: " << status.ToString() << " It may cause memory leak."; } } } - kvStateCacheBuilder->UpdateVersion(); + kvCacheBuilder->UpdateVersion(); /** * 3. get the current block id set, which stores the block id(instead of block * ptr) and the block id set to delete. */ std::set currentObjectIDSet; - kvStateCacheBuilder->GetCurrentBlockIDSet(currentObjectIDSet); - blockIDSetToDelete = kvStateCacheBuilder->GetBlockIDSetToDelete(); + kvCacheBuilder->GetCurrentBlockIDSet(currentObjectIDSet); + blockIDSetToDelete = kvCacheBuilder->GetBlockIDSetToDelete(); // 4. push the cache object to the vineyardd - kvStateCache = std::dynamic_pointer_cast( - kvStateCacheBuilder->_Seal(client)); + kvCache = std::dynamic_pointer_cast(kvCacheBuilder->_Seal(client)); std::set currentGlobalBlockIDSet; - kvStateCacheBuilder->GetCurrentBlockIDSet(currentGlobalBlockIDSet); + kvCacheBuilder->GetCurrentBlockIDSet(currentGlobalBlockIDSet); - status = client.Persist(kvStateCache->id()); + status = client.Persist(kvCache->id()); RETURN_ON_ERROR(status); // 5. put the name of the new cache object to the meta server status = client.DropName(llmCacheObjectName); RETURN_ON_ERROR(status); - status = client.PutName(kvStateCache->id(), llmCacheObjectName); + status = client.PutName(kvCache->id(), llmCacheObjectName); RETURN_ON_ERROR(status); // 6. delete old cache object @@ -537,11 +538,10 @@ Status BlobStorage::Sync() { } // 7. create a global cache object replica - kvStateCache->Resolve(); - RETURN_ON_ERROR( - KVStateCacheBuilder::Make(client, kvStateCacheBuilder, kvStateCache)); + kvCache->Resolve(); + RETURN_ON_ERROR(KVCacheBuilder::Make(client, kvCacheBuilder, kvCache)); - kvStateCacheBuilder->GetCurrentBlockIDSet(blockIDSetToAdd); + kvCacheBuilder->GetCurrentBlockIDSet(blockIDSetToAdd); /** * 8. get the add set, which contains the block id in the new cache object @@ -625,31 +625,30 @@ Status BlobStorage::AfterSyncFailed() { * If there exists a global cache object, recover from the global object * and delete the cache object if the builder is sealed. 
*/ - ObjectID globalKVStateCacheID; - std::shared_ptr globalKVStateCache = nullptr; - Status status = client.GetName(llmCacheObjectName, globalKVStateCacheID); + ObjectID globalKVCacheID; + std::shared_ptr globalKVCache = nullptr; + Status status = client.GetName(llmCacheObjectName, globalKVCacheID); if (status.ok()) { - globalKVStateCache = std::dynamic_pointer_cast( - client.FetchAndGetObject(globalKVStateCacheID)); + globalKVCache = std::dynamic_pointer_cast( + client.FetchAndGetObject(globalKVCacheID)); } else { VLOG(100) << "There is no cache object in the meta server."; return Status::OK(); } - status = KVStateCacheBuilder::Make(client, kvStateCacheBuilder, - globalKVStateCache); + status = KVCacheBuilder::Make(client, kvCacheBuilder, globalKVCache); RETURN_ON_ERROR(status); - if (kvStateCache != nullptr && kvStateCache->id() != globalKVStateCacheID) { + if (kvCache != nullptr && kvCache->id() != globalKVCacheID) { // It means the builder is sealed but not pushed to the vineyardd - deleteList.push_back(kvStateCache->id()); - deleteList.push_back(globalKVStateCache->id()); + deleteList.push_back(kvCache->id()); + deleteList.push_back(globalKVCache->id()); } status = client.DelData(deleteList, false, true); if (!status.ok()) { LOG(ERROR) << "Delete object failed: " << status.ToString() << " It may cause memory leak."; } - kvStateCache = nullptr; + kvCache = nullptr; return Status::OK(); } @@ -696,8 +695,8 @@ Status BlobStorage::ClearGlobalCache(Client& client, RETURN_ON_ERROR(client.GetName(llmRefcntObjectName, globalRefcntMapId)); RETURN_ON_ERROR(client.DropName(llmRefcntObjectName)); - std::shared_ptr globalCacheObject = - std::dynamic_pointer_cast( + std::shared_ptr globalCacheObject = + std::dynamic_pointer_cast( client.FetchAndGetObject(globalCacheObjectID)); std::set blockIDSetToDelete; globalCacheObject->GetCurrentBlockIDSet(blockIDSetToDelete); @@ -719,7 +718,7 @@ void BlobStorage::CloseCache() { LOG(INFO) << "Clear block set and recycle blob."; std::lock_guard cacheLock(cacheAccessMutex); - this->kvStateCacheBuilder->Close(); + this->kvCacheBuilder->Close(); this->isClosed = true; RefreshRefcnt(); } @@ -785,7 +784,7 @@ Status BlobStorage::SetRefcntMap(std::set& blockIDSetToDelete, void BlobStorage::RefreshRefcnt() { std::set blockIDSetToDelete = - this->kvStateCacheBuilder->GetBlockIDSetToDelete(); + this->kvCacheBuilder->GetBlockIDSetToDelete(); std::set blockIDSetToAdd; std::string actualKey; AcquireServerLock(client, llmCacheSyncLock, actualKey); diff --git a/modules/llm-cache/storage/blob_storage.h b/modules/llm-cache/storage/blob_storage.h index 0f66aa41a1..c7cb182edd 100644 --- a/modules/llm-cache/storage/blob_storage.h +++ b/modules/llm-cache/storage/blob_storage.h @@ -28,7 +28,7 @@ limitations under the License. 
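Note the ownership difference on the blob path: Query hands back pointers into vineyard blobs rather than filling caller-allocated buffers, which is why the single-token overload asks for nullptr, zero-length slots. A sketch under the Make() defaults declared in blob_storage.h below (layer = 1; the prefix and token values are illustrative):

```cpp
// Sketch of the zero-copy query path on BlobStorage: slots go in empty
// (nullptr/0) and come back pointing at K/V tensors inside vineyard blobs.
#include <memory>
#include <utility>
#include <vector>

#include "client/client.h"
#include "llm-cache/storage/blob_storage.h"

vineyard::Status QueryNextToken(vineyard::Client& client) {
  std::shared_ptr<vineyard::BlobStorage> storage;
  RETURN_ON_ERROR(vineyard::BlobStorage::Make(client, storage));

  std::vector<int> prefix = {1, 2};
  std::vector<std::pair<vineyard::LLMKV, vineyard::LLMKV>> kvState;
  kvState.emplace_back(vineyard::LLMKV{nullptr, 0},
                       vineyard::LLMKV{nullptr, 0});  // one slot per layer

  RETURN_ON_ERROR(storage->Query(prefix, /*token=*/3, kvState));
  // kvState[0].first.data / .second.data now reference blob memory owned by
  // the cache; copy the tensors out before CloseCache() releases them.
  storage->CloseCache();
  return vineyard::Status::OK();
}
```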
#include "client/client.h" #include "common/util/logging.h" -#include "llm-cache/ds/kv_state_cache.h" +#include "llm-cache/ds/kv_cache.h" #include "llm-cache/ds/refcnt_map.h" #include "llm-cache/storage/storage.h" @@ -37,8 +37,8 @@ namespace vineyard { class BlobStorage : public IStorage { private: Client& client; - std::shared_ptr kvStateCacheBuilder = nullptr; - std::shared_ptr kvStateCache = nullptr; + std::shared_ptr kvCacheBuilder = nullptr; + std::shared_ptr kvCache = nullptr; std::shared_ptr refcntMapObjectBuilder = nullptr; std::string llmCacheSyncLock; std::string llmCacheObjectName; @@ -52,13 +52,13 @@ class BlobStorage : public IStorage { bool isClosed = false; public: - BlobStorage(Client& client, std::shared_ptr& cache, + BlobStorage(Client& client, std::shared_ptr& cache, int syncInterval, std::string& llmCacheSyncLock, std::string& llmCacheObjectName, std::string& llmRefcntObjectName); static Status Make(Client& client, std::shared_ptr& storage, - int tensorBytes = 10, int cacheCapacity = 10, + int tensorNBytes = 10, int cacheCapacity = 10, int layer = 1, int blockSize = 5, int syncInterval = 3, std::string llmCacheSyncLock = "llmCacheSyncLock", std::string llmCacheObjectName = "llm_cache_object", @@ -69,25 +69,30 @@ class BlobStorage : public IStorage { Status Update( const std::vector& tokenList, - const std::vector>>& kvStateList, + const std::vector>>& kvCacheList, size_t& updated) override; Status Update( const std::vector& prefix, const std::vector& tokenList, - const std::vector>>& kvStateList, + const std::vector>>& kvCacheList, size_t& updated) override; - Status Query(const std::vector& tokenList, int token, + Status Query(const std::vector& tokenList, + std::vector>>& kvCacheList, + size_t& matched) override; + + Status Query(const std::vector& prefix, int token, std::vector>& kvState) override; - Status Query(const std::vector& tokenList, - std::vector>>& kvStateList, + Status Query(const std::vector& prefix, + const std::vector& tokenList, + std::vector>>& kvCacheList, size_t& matched) override; void CloseCache() override; - std::shared_ptr& GetKVStateCacheBuilder() { - return this->kvStateCacheBuilder; + std::shared_ptr& GetKVCacheBuilder() { + return this->kvCacheBuilder; } std::shared_ptr& GetRefcntMapObjectBuilder() { diff --git a/modules/llm-cache/storage/file_storage.cc b/modules/llm-cache/storage/file_storage.cc index 85efa3601c..7774801b71 100644 --- a/modules/llm-cache/storage/file_storage.cc +++ b/modules/llm-cache/storage/file_storage.cc @@ -30,29 +30,13 @@ limitations under the License. #include "llm-cache/storage/file_storage.h" #include "llm-cache/thread_group.h" -#define RETURN_ON_ERROR_WITH_PATH_INDEX(index, status) \ - do { \ - auto _ret = (status); \ - if (!_ret.ok()) { \ - return std::pair(index, _ret); \ - } \ - } while (0) - -#define RETURN_ON_ASSERT_WITH_PATH_INDEX(index, condition, message) \ - do { \ - if (!(condition)) { \ - return std::pair(index, vineyard::Status::AssertionFailed( \ - std::string(#condition ": ") + message)); \ - } \ - } while (0) - namespace vineyard { /** * @brief Update the kv state with the given token list in the file storage. * * @param tokenList The token list to be updated. - * @param kvStateList The kv state list of the token list. + * @param kvCacheList The kv state list of the token list. * It's a 2D vector, the first dimension is the token index, * and the second dimension is the layer index. 
* The kv state is a pair of LLMKV, the first is the K tensor @@ -68,58 +52,63 @@ namespace vineyard { * * * * * Assume the layer is 2, and the token list is [1,2] you should * * * allocate the memory for the kv state like this: * - * * std::vector>> kvStateList;* + * * std::vector>> kvCacheList;* * * for (int i = 0; i < 2; i++) { * * * std::vector> kvState; * * * for (int j = 0; j < 2; j++) { * * * LLMKV key_state; * * * LLMKV value_state; * - * * key_state.data = malloc(tensorBytes); * - * * value_state.data = malloc(tensorBytes) * + * * key_state.data = malloc(tensorNBytes); * + * * value_state.data = malloc(tensorNBytes) * * * // Copy the k_state of LLM KV Cache to key_state.data * * * // Copy the v_state of LLM KV Cache to value_state.data * - * * key_state.length = tensorBytes; * - * * value_state.length = tensorBytes; * + * * key_state.length = tensorNBytes; * + * * value_state.length = tensorNBytes; * * * kvState.emplace_back(key_state, value_state); * * * } * - * * kvStateList.push_back(kvState); * + * * kvCacheList.push_back(kvState); * * *} * * * * * * After calling this function, you must release(free) the * - * * kv buffer of the kvStateList manually * + * * kv buffer of the kvCacheList manually * * * * * ***************************************************************** * * @note The length of the token list should be as same as the length of the - * kvStateList. + * kvCacheList. * * * @example Suppose the token list is [1, 2, 3, 4], the layer is 2, - * then the kvStateList should be a 2D vector with size 4 * 2. + * then the kvCacheList should be a 2D vector with size 4 * 2. * * @return Status */ Status FileStorage::Update( const std::vector& tokenList, - const std::vector>>& kvStateList, + const std::vector>>& kvCacheList, size_t& updated) { if (this->exitFlag) { return Status::Invalid("The file storage has been closed!"); } + if (tokenList.size() % chunkSize != 0) { + return Status::Invalid("Tokens size " + std::to_string(tokenList.size()) + + " should be multiple of batch size " + + std::to_string(chunkSize) + "!"); + } + std::vector pathList; std::set createFileSet; std::mutex createFileSetMutex; - RETURN_ON_ERROR(hasher->computePathForTokens(tokenList, batchSize, - splitNumber, pathList)); + RETURN_ON_ERROR(hasher->computePathForTokens(tokenList, chunkSize, + hashChunkSize, pathList)); if (pathList.size() == 0) { return Status::OK(); } std::vector tempFilePaths(pathList.size()); - auto fn = [this, &tempFilePaths, &pathList, &tokenList, &kvStateList, - &createFileSet, - &createFileSetMutex](int i) -> std::pair { - int tokenLength = (i + 1) * batchSize; + auto fn = [this, &tempFilePaths, &pathList, &tokenList, &kvCacheList, + &createFileSet, &createFileSetMutex](int i) -> Status { + int tokenLength = (i + 1) * chunkSize; std::shared_ptr fd = CreateFileDescriptor(); std::string tmpPathStr = GetTmpFileDir() + "-" + std::to_string(i); tempFilePaths[i] = tmpPathStr; @@ -127,47 +116,42 @@ Status FileStorage::Update( std::string pathStr = this->rootPath + pathList[i]; ghc::filesystem::path path(pathStr); - RETURN_ON_ERROR_WITH_PATH_INDEX(i, Mkdir(path.parent_path().string())); + RETURN_ON_ERROR(Mkdir(path.parent_path().string())); if (Open(pathStr, fd, FileOperationType::READ).ok()) { int tokenLengthInFile; - RETURN_ON_ERROR_WITH_PATH_INDEX( - i, Read(fd, &tokenLengthInFile, sizeof(int))); + RETURN_ON_ERROR(Read(fd, &tokenLengthInFile, sizeof(int))); std::vector tokens; tokens.resize(tokenLengthInFile); - RETURN_ON_ERROR_WITH_PATH_INDEX( - i, Read(fd, tokens.data(), 
tokenLengthInFile * sizeof(int))); + RETURN_ON_ERROR(Read(fd, tokens.data(), tokenLengthInFile * sizeof(int))); if (!CompareTokenList(tokenList, tokens, tokenLengthInFile)) { // Token list not match VINEYARD_DISCARD(Close(fd)); - return std::pair( - i, Status::ObjectExists("File exists for another token sequence")); + return Status::ObjectExists("File exists for another token sequence"); } // Skip this kv state VINEYARD_DISCARD(Close(fd)); - return std::pair(i, Status::OK()); + return Status::OK(); } - RETURN_ON_ERROR_WITH_PATH_INDEX(i, Mkdir(tmpPath.parent_path().string())); + RETURN_ON_ERROR(Mkdir(tmpPath.parent_path().string())); auto status = Open(tmpPathStr, fd, FileOperationType::WRITE); if (!status.ok()) { LOG(WARNING) << "Failed to create temporary cache entry: " << status.ToString(); - return std::pair( - i, Status::Wrap(status, "Failed to create temporary cache entry")); + return Status::Wrap(status, "Failed to create temporary cache entry"); } // Currently we do not consider delete. - RETURN_ON_ERROR_WITH_PATH_INDEX(i, Write(fd, &tokenLength, sizeof(int))); - RETURN_ON_ERROR_WITH_PATH_INDEX( - i, Write(fd, tokenList.data(), tokenLength * sizeof(int))); - for (int currentTokenIndex = i * batchSize; - currentTokenIndex < (i + 1) * batchSize; currentTokenIndex++) { + RETURN_ON_ERROR(Write(fd, &tokenLength, sizeof(int))); + RETURN_ON_ERROR(Write(fd, tokenList.data(), tokenLength * sizeof(int))); + for (int currentTokenIndex = i * chunkSize; + currentTokenIndex < (i + 1) * chunkSize; currentTokenIndex++) { for (int currentLayer = 0; currentLayer < layer; currentLayer++) { - const LLMKV& k = kvStateList[currentTokenIndex][currentLayer].first; - const LLMKV& v = kvStateList[currentTokenIndex][currentLayer].second; - RETURN_ON_ERROR_WITH_PATH_INDEX(i, Write(fd, k.data, k.length)); - RETURN_ON_ERROR_WITH_PATH_INDEX(i, Write(fd, v.data, k.length)); + const LLMKV& k = kvCacheList[currentTokenIndex][currentLayer].first; + const LLMKV& v = kvCacheList[currentTokenIndex][currentLayer].second; + RETURN_ON_ERROR(Write(fd, k.data, k.length)); + RETURN_ON_ERROR(Write(fd, v.data, k.length)); } } @@ -178,34 +162,31 @@ Status FileStorage::Update( // Move failed. There exists a file with the same name. 
LOG(WARNING) << "Failed to move cache entry: " << status.ToString(); VINEYARD_SUPPRESS(Delete(tmpPathStr)); - return std::pair(i, Status::Wrap(status, "Failed to move cache entry")); + return Status::Wrap(status, "Failed to move cache entry"); } std::lock_guard lock(createFileSetMutex); createFileSet.insert(pathStr); - return std::pair(i, Status::OK()); + return Status::OK(); }; parallel::ThreadGroup tg( std::min(pathList.size(), static_cast(std::thread::hardware_concurrency()))); - for (size_t i = 0; i < pathList.size(); i++) { - tg.AddTask(fn, i); + std::vector tids(pathList.size()); + for (size_t i = 0; i < pathList.size(); ++i) { + tids[i] = tg.AddTask(fn, i); } - - std::vector> ss = tg.TakeResults(); - std::map pathIndexMap; - for (size_t i = 0; i < pathList.size(); i++) { - if (ss[i].second.ok()) { - pathIndexMap[ss[i].first] = true; - } + std::vector taskResults(pathList.size(), Status::OK()); + for (size_t i = 0; i < pathList.size(); ++i) { + taskResults[i] = tg.TaskResult(tids[i]); } - int j = 0; + size_t upper_bound = 0; { std::lock_guard lock(gcMutex); for (size_t i = 0; i < pathList.size(); i++) { - if (pathIndexMap.find(i) != pathIndexMap.end()) { - j += 1; + if (taskResults[i].ok()) { + upper_bound += 1; if (createFileSet.find(this->rootPath + pathList[i]) != createFileSet.end()) { TouchFile(this->rootPath + pathList[i]); @@ -216,12 +197,11 @@ Status FileStorage::Update( } } } - updated = ((size_t) j) * batchSize; - for (size_t i = j; i < pathList.size(); i++) { + updated = upper_bound * chunkSize; + for (size_t i = upper_bound; i < pathList.size(); i++) { VINEYARD_SUPPRESS(Delete(this->rootPath + pathList[i])); VINEYARD_SUPPRESS(Delete(tempFilePaths[i])); } - return Status::OK(); } @@ -232,7 +212,7 @@ Status FileStorage::Update( * @param prefix The prefix token list. It should be a multiple of the batch * size. * @param tokenList The token list to be updated. - * @param kvStateList The kv state list of the token list. + * @param kvCacheList The kv state list of the token list. * It's a 2D vector, the first dimension is the token index, * and the second dimension is the layer index. * The kv state is a pair of LLMKV, the first is the K tensor @@ -248,47 +228,52 @@ Status FileStorage::Update( * * * * * Assume the layer is 2, and the token list is [1,2] you should * * * allocate the memory for the kv state like this: * - * * std::vector>> kvStateList;* + * * std::vector>> kvCacheList;* * * for (int i = 0; i < 2; i++) { * * * std::vector> kvState; * * * for (int j = 0; j < 2; j++) { * * * LLMKV key_state; * * * LLMKV value_state; * - * * key_state.data = malloc(tensorBytes); * - * * value_state.data = malloc(tensorBytes) * + * * key_state.data = malloc(tensorNBytes); * + * * value_state.data = malloc(tensorNBytes) * * * // Copy the k_state of LLM KV Cache to key_state.data * * * // Copy the v_state of LLM KV Cache to value_state.data * - * * key_state.length = tensorBytes; * - * * value_state.length = tensorBytes; * + * * key_state.length = tensorNBytes; * + * * value_state.length = tensorNBytes; * * * kvState.emplace_back(key_state, value_state); * * * } * - * * kvStateList.push_back(kvState); * + * * kvCacheList.push_back(kvState); * * *} * * * * * * After calling this function, you must release(free) the * - * * kv buffer of the kvStateList manually * + * * kv buffer of the kvCacheList manually * * * * * ***************************************************************** * * @note The length of the token list should be as same as the length of the - * kvStateList. 
+ * kvCacheList. * * @example Suppose the prefix is [1, 2], the token list is [3, 4], the layer is - * 2, then the kvStateList should be a 2D vector with size 2 * 2. + * 2, then the kvCacheList should be a 2D vector with size 2 * 2. * * @return Status */ Status FileStorage::Update( const std::vector& prefix, const std::vector& tokenList, - const std::vector>>& kvStateList, + const std::vector>>& kvCacheList, size_t& updated) { if (this->exitFlag) { return Status::Invalid("The file storage has been closed!"); } - if (prefix.size() % batchSize != 0) { + if (prefix.size() % chunkSize != 0) { return Status::Invalid("Prefix size " + std::to_string(prefix.size()) + " should be multiple of batch size " + - std::to_string(batchSize) + "!"); + std::to_string(chunkSize) + "!"); + } + if (tokenList.size() % chunkSize != 0) { + return Status::Invalid("Tokens size " + std::to_string(tokenList.size()) + + " should be multiple of batch size " + + std::to_string(chunkSize) + "!"); } std::vector pathList; @@ -298,17 +283,17 @@ Status FileStorage::Update( totalTokenList.insert(totalTokenList.end(), tokenList.begin(), tokenList.end()); - RETURN_ON_ERROR(hasher->computePathForTokens(totalTokenList, batchSize, - splitNumber, pathList)); + RETURN_ON_ERROR(hasher->computePathForTokens(totalTokenList, chunkSize, + hashChunkSize, pathList)); if (pathList.size() == 0) { return Status::OK(); } std::vector tempFilePaths(pathList.size()); auto fn = [this, &tempFilePaths, &pathList, &prefix, &totalTokenList, - &kvStateList, &createFileSet, - &createFileSetMutex](size_t i) -> std::pair { - int tokenLength = (i + 1) * batchSize; + &kvCacheList, &createFileSet, + &createFileSetMutex](size_t i) -> Status { + int tokenLength = (i + 1) * chunkSize; std::shared_ptr fd = CreateFileDescriptor(); std::string tmpPathStr = GetTmpFileDir() + "-" + std::to_string(i); tempFilePaths[i] = tmpPathStr; @@ -316,52 +301,47 @@ Status FileStorage::Update( std::string pathStr = this->rootPath + pathList[i]; ghc::filesystem::path path(pathStr); - RETURN_ON_ERROR_WITH_PATH_INDEX(i, Mkdir(path.parent_path().string())); + RETURN_ON_ERROR(Mkdir(path.parent_path().string())); if (Open(pathStr, fd, FileOperationType::READ).ok()) { int tokenLength; - RETURN_ON_ERROR_WITH_PATH_INDEX(i, Read(fd, &tokenLength, sizeof(int))); - std::vector tokens; - tokens.resize(tokenLength); - RETURN_ON_ERROR_WITH_PATH_INDEX( - i, Read(fd, tokens.data(), tokenLength * sizeof(int))); + RETURN_ON_ERROR(Read(fd, &tokenLength, sizeof(int))); + std::vector tokens(tokenLength, -1); + RETURN_ON_ERROR(Read(fd, tokens.data(), tokenLength * sizeof(int))); if (!CompareTokenList(totalTokenList, tokens, tokenLength)) { // Token list not match VINEYARD_DISCARD(Close(fd)); - return std::pair( - i, Status::ObjectExists("File exists for another token sequence")); + return Status::ObjectExists("File exists for another token sequence"); } // Skip this kv state VINEYARD_DISCARD(Close(fd)); - return std::pair(i, Status::OK()); + return Status::OK(); } - if ((i + 1) * batchSize <= prefix.size()) { - return std::pair( - i, Status::ObjectNotExists("The prefix is not in the file cache")); + if ((i + 1) * chunkSize <= prefix.size()) { + return Status::ObjectNotExists("The prefix is not in the file cache"); } - RETURN_ON_ERROR_WITH_PATH_INDEX(i, Mkdir(tmpPath.parent_path().string())); + RETURN_ON_ERROR(Mkdir(tmpPath.parent_path().string())); auto status = Open(tmpPathStr, fd, FileOperationType::WRITE); if (!status.ok()) { - return std::pair( - i, Status::Wrap(status, "Failed to create temporary 
cache entry")); + return Status::Wrap(status, "Failed to create temporary cache entry"); } // Currently we do not consider delete. - RETURN_ON_ERROR_WITH_PATH_INDEX(i, Write(fd, &tokenLength, sizeof(int))); - RETURN_ON_ERROR_WITH_PATH_INDEX( - i, Write(fd, totalTokenList.data(), tokenLength * sizeof(int))); + RETURN_ON_ERROR(Write(fd, &tokenLength, sizeof(int))); + RETURN_ON_ERROR( + Write(fd, totalTokenList.data(), tokenLength * sizeof(int))); size_t kvStatePos = - (i * batchSize) < prefix.size() ? 0 : (i * batchSize) - prefix.size(); + (i * chunkSize) < prefix.size() ? 0 : (i * chunkSize) - prefix.size(); for (size_t currentTokenIndex = kvStatePos; - currentTokenIndex < kvStatePos + batchSize; currentTokenIndex++) { + currentTokenIndex < kvStatePos + chunkSize; currentTokenIndex++) { for (int currentLayer = 0; currentLayer < layer; currentLayer++) { - const LLMKV& k = kvStateList[currentTokenIndex][currentLayer].first; - const LLMKV& v = kvStateList[currentTokenIndex][currentLayer].second; - RETURN_ON_ERROR_WITH_PATH_INDEX(i, Write(fd, k.data, k.length)); - RETURN_ON_ERROR_WITH_PATH_INDEX(i, Write(fd, v.data, k.length)); + const LLMKV& k = kvCacheList[currentTokenIndex][currentLayer].first; + const LLMKV& v = kvCacheList[currentTokenIndex][currentLayer].second; + RETURN_ON_ERROR(Write(fd, k.data, k.length)); + RETURN_ON_ERROR(Write(fd, v.data, k.length)); } } @@ -370,35 +350,32 @@ Status FileStorage::Update( if (!MoveFileAtomic(tmpPathStr, pathStr).ok()) { // Move failed. There exists a file with the same name. VINEYARD_SUPPRESS(Delete(tmpPathStr)); - return std::pair(i, Status::Wrap(status, "Failed to move cache entry")); + return Status::Wrap(status, "Failed to move cache entry"); } std::lock_guard lock(createFileSetMutex); createFileSet.insert(pathStr); - return std::pair(i, Status::OK()); + return Status::OK(); }; parallel::ThreadGroup tg( std::min(pathList.size(), static_cast(std::thread::hardware_concurrency()))); - for (size_t i = 0; i < pathList.size(); i++) { - tg.AddTask(fn, i); + std::vector tids(pathList.size()); + for (size_t i = 0; i < pathList.size(); ++i) { + tids[i] = tg.AddTask(fn, i); } - - std::vector> ss = tg.TakeResults(); - std::map pathIndexMap; - for (size_t i = 0; i < pathList.size(); i++) { - if (ss[i].second.ok()) { - pathIndexMap[ss[i].first] = true; - } + std::vector taskResults(pathList.size(), Status::OK()); + for (size_t i = 0; i < pathList.size(); ++i) { + taskResults[i] = tg.TaskResult(tids[i]); } - int j = 0; + size_t upper_bound = 0; { std::lock_guard lock(gcMutex); for (size_t i = 0; i < pathList.size(); i++) { - if (pathIndexMap.find(i) != pathIndexMap.end()) { - j += 1; - if (((size_t) j) * batchSize > prefix.size() && + if (taskResults[i].ok()) { + upper_bound += 1; + if (upper_bound * chunkSize > prefix.size() && createFileSet.find(this->rootPath + pathList[i]) != createFileSet.end()) { // Only this part is created. @@ -410,13 +387,13 @@ Status FileStorage::Update( } } } - updated = - size_t(j * batchSize) < prefix.size() ? 0 : j * batchSize - prefix.size(); - for (size_t i = j; i < pathList.size(); i++) { + updated = upper_bound * chunkSize <= prefix.size() + ? 0 + : upper_bound * chunkSize - prefix.size(); + for (size_t i = upper_bound; i < pathList.size(); i++) { VINEYARD_SUPPRESS(Delete(this->rootPath + pathList[i])); VINEYARD_SUPPRESS(Delete(tempFilePaths[i])); } - return Status::OK(); } @@ -431,7 +408,7 @@ Status FileStorage::Update( * @brief Query the kv state with the given token list in the file storage. 
@@ -567,20 +537,120 @@ Status FileStorage::Query(
   return Status::OK();
 }
 
-Status FileStorage::Query(const std::vector<int>& tokenList, int nextToken,
+Status FileStorage::Query(const std::vector<int>& prefix, int nextToken,
                           std::vector<std::pair<LLMKV, LLMKV>>& kvState) {
   // TBD
   return Status::NotImplemented();
 }
 
-bool FileStorage::CompareTokenList(const std::vector<int>& tokenList,
+Status FileStorage::Query(
+    const std::vector<int>& prefix, const std::vector<int>& tokenList,
+    std::vector<std::vector<std::pair<LLMKV, LLMKV>>>& kvCacheList,
+    size_t& matched) {
+  if (this->exitFlag) {
+    return Status::Invalid("The file storage has been closed!");
+  }
+  if (prefix.size() % chunkSize != 0) {
+    return Status::Invalid("Prefix size " + std::to_string(prefix.size()) +
+                           " should be a multiple of the chunk size " +
+                           std::to_string(chunkSize) + "!");
+  }
+
+  size_t numPrefixChunks = prefix.size() / chunkSize;
+  std::vector<int> totalTokenList(prefix.begin(), prefix.end());
+  totalTokenList.insert(totalTokenList.end(), tokenList.begin(),
+                        tokenList.end());
+
+  std::vector<std::string> paths;
+  std::string dir = rootPath;
+  RETURN_ON_ERROR(hasher->computePathForTokens(totalTokenList, chunkSize,
+                                               hashChunkSize, paths));
+
+  auto fn = [&](size_t i, size_t matched_start) -> Status {
+    ghc::filesystem::path filePath(dir + paths[i]);
+    std::shared_ptr<FileDescriptor> fd = CreateFileDescriptor();
+
+    // If open fails, the kv state is not in the cache (file does not exist)
+    if (!Open(filePath.string(), fd, FileOperationType::READ).ok()) {
+      return Status::ObjectNotExists("Failed to open file '" +
+                                     filePath.string() + "'");
+    }
+    size_t file_size = 0;
+    auto s = GetFileSize(fd, file_size);
+    if (!s.ok()) {
+      VINEYARD_DISCARD(Close(fd));
+      return Status::ObjectNotExists("Cannot get file size");
+    }
+    if (file_size == 0) {
+      VINEYARD_DISCARD(Close(fd));
+      VINEYARD_DISCARD(Delete(filePath.string()));
+      return Status::ObjectNotExists("The target file is empty");
+    }
+
+    int tokenLength = 0;
+    RETURN_ON_ERROR(Read(fd, &tokenLength, sizeof(int)));
+    std::vector<int> blockTokenList(tokenLength, -1);
+    RETURN_ON_ERROR(Read(fd, blockTokenList.data(), tokenLength * sizeof(int)));
+
+    if (!CompareTokenList(totalTokenList, blockTokenList, tokenLength)) {
+      VINEYARD_DISCARD(Close(fd));
+      return Status::ObjectNotExists("Token mismatch");
+    }
+    for (int j = 0; j < chunkSize; j++) {
+      if (matched_start + j >= totalTokenList.size() ||
+          matched_start + j >= kvCacheList.size()) {
+        break;
+      }
+      auto& kvState = kvCacheList[matched_start + j];
+      for (int currentLayer = 0; currentLayer < layer; currentLayer++) {
+        RETURN_ON_ASSERT(static_cast<int>(kvState.size()) == layer,
+                         "The size of kvState is not equal to layer");
+        LLMKV& k = kvState[currentLayer].first;
+        LLMKV& v = kvState[currentLayer].second;
+        RETURN_ON_ASSERT(
+            k.length == tensorNBytes && v.length == tensorNBytes,
+            "The size of kv tensor doesn't match with the tensorNBytes");
+        RETURN_ON_ERROR(Read(fd, k.data, k.length));
+        RETURN_ON_ERROR(Read(fd, v.data, v.length));
+      }
+    }
+
+    VINEYARD_DISCARD(Close(fd));
+    return Status::OK();
+  };
+
+  parallel::ThreadGroup tg(std::min(
+      paths.size(), static_cast<size_t>(std::thread::hardware_concurrency())));
+  std::vector<parallel::ThreadGroup::tid_t> tids(paths.size() -
+                                                 numPrefixChunks);
+  for (size_t i = numPrefixChunks; i < paths.size(); i++) {
+    tids[i - numPrefixChunks] =
+        tg.AddTask(fn, i, (i - numPrefixChunks) * chunkSize);
+  }
+  std::vector<Status> taskResults(paths.size() - numPrefixChunks, Status::OK());
+  for (size_t i = numPrefixChunks; i < paths.size(); i++) {
+    taskResults[i - numPrefixChunks] = tg.TaskResult(tids[i - numPrefixChunks]);
+  }
+
+  matched = 0;
+  for (size_t i = numPrefixChunks; i < paths.size(); i++) {
+    if (taskResults[i - numPrefixChunks].ok()) {
+      matched += chunkSize;
+    } else {
+      break;
+    }
+  }
+  return Status::OK();
+}
+
+bool FileStorage::CompareTokenList(const std::vector<int>& tokenList1,
                                    const std::vector<int>& tokenList2,
                                    size_t length) {
-  if (tokenList.size() < length || tokenList2.size() < length) {
+  if (tokenList1.size() < length || tokenList2.size() < length) {
     return false;
   }
   for (size_t i = 0; i < length; i++) {
-    if (tokenList[i] != tokenList2[i]) {
+    if (tokenList1[i] != tokenList2[i]) {
       return false;
     }
   }
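A worked example of the chunk bookkeeping in the prefix-aware Query above (all numbers are illustrative):

    // With chunkSize == 4, prefix.size() == 8 and tokenList.size() == 8:
    //   numPrefixChunks == 8 / 4 == 2 and paths.size() == (8 + 8) / 4 == 4,
    //   so tasks are scheduled only for i == 2 and i == 3, with matched_start
    //   offsets 0 and 4; `matched` therefore counts new tokens only, up to 8.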
diff --git a/modules/llm-cache/storage/file_storage.h b/modules/llm-cache/storage/file_storage.h
index e3223180a9..a3be5c1f80 100644
--- a/modules/llm-cache/storage/file_storage.h
+++ b/modules/llm-cache/storage/file_storage.h
@@ -50,7 +50,7 @@ enum FileOperationType {
 class FileStorage : public IStorage,
                     public std::enable_shared_from_this<FileStorage> {
  private:
-  bool CompareTokenList(const std::vector<int>& tokenList,
+  bool CompareTokenList(const std::vector<int>& tokenList1,
                         const std::vector<int>& tokenList2, size_t length);
 
   virtual std::shared_ptr<FileDescriptor> CreateFileDescriptor() = 0;
@@ -117,7 +117,7 @@ class FileStorage : public IStorage,
   Status Update(
       const std::vector<int>& tokenList,
-      const std::vector<std::vector<std::pair<LLMKV, LLMKV>>>& kvStateList,
+      const std::vector<std::vector<std::pair<LLMKV, LLMKV>>>& kvCacheList,
       size_t& updated) override;
 
   Status Update(const std::vector<int>& tokenList, int nextToken,
@@ -125,16 +125,21 @@ class FileStorage : public IStorage,
   Status Update(
       const std::vector<int>& prefix, const std::vector<int>& tokenList,
-      const std::vector<std::vector<std::pair<LLMKV, LLMKV>>>& kvStateList,
+      const std::vector<std::vector<std::pair<LLMKV, LLMKV>>>& kvCacheList,
       size_t& updated) override;
 
   Status Query(const std::vector<int>& tokenList,
-               std::vector<std::vector<std::pair<LLMKV, LLMKV>>>& kvStateList,
+               std::vector<std::vector<std::pair<LLMKV, LLMKV>>>& kvCacheList,
                size_t& matched) override;
 
-  Status Query(const std::vector<int>& tokenList, int nextToken,
+  Status Query(const std::vector<int>& prefix, int nextToken,
                std::vector<std::pair<LLMKV, LLMKV>>& kvState) override;
 
+  Status Query(const std::vector<int>& prefix,
+               const std::vector<int>& tokenList,
+               std::vector<std::vector<std::pair<LLMKV, LLMKV>>>& kvCacheList,
+               size_t& matched) override;
+
   void CloseCache() override;
 
   virtual Status Init() = 0;
@@ -144,11 +149,11 @@ class FileStorage : public IStorage,
   void StartGlobalGCThread() override { this->enableGlobalGC = true; }
 
  protected:
-  size_t tensorBytes;
+  size_t tensorNBytes;
   size_t cacheCapacity;
   int layer;
-  int batchSize;
-  int splitNumber;
+  int chunkSize;
+  int hashChunkSize;
   std::string rootPath;
   std::string tempFileDir;
   std::shared_ptr<IHashAlgorithm> hashAlgorithm;
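For orientation, a rough sketch of what a custom backend derived from this class could look like after the renaming. `MyFileDescriptor` is hypothetical, and the remaining pure-virtual I/O hooks (Open/Read/Write and friends, not visible in this hunk) would need overrides as well:

    class MyFileStorage : public FileStorage {
     public:
      Status Init() override { return Status::OK(); }

     private:
      std::shared_ptr<FileDescriptor> CreateFileDescriptor() override {
        return std::make_shared<MyFileDescriptor>();  // hypothetical type
      }
    };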
diff --git a/modules/llm-cache/storage/local_file_storage.h b/modules/llm-cache/storage/local_file_storage.h
index 5abd9a0b8b..1396373a9d 100644
--- a/modules/llm-cache/storage/local_file_storage.h
+++ b/modules/llm-cache/storage/local_file_storage.h
@@ -35,21 +35,21 @@ struct LocalFileDescriptor : public FileDescriptor {
 
 class LocalFileStorage : public FileStorage {
  public:
-  LocalFileStorage(int tensorBytes, int cacheCapacity, int layer, int batchSize,
-                   int splitNumber, std::string rootPath,
-                   int64_t clientGCInterval, int64_t ttl, bool enableGlobalGC,
+  LocalFileStorage(int tensorNBytes, int cacheCapacity, int layer,
+                   int chunkSize, int hashChunkSize, std::string rootPath,
+                   int64_t gcInterval, int64_t ttl, bool enableGlobalGC,
                    int64_t globalGCInterval, int64_t globalTTL) {
     this->hashAlgorithm = std::make_shared<MurmurHash3Algorithm>();
     this->hasher = std::make_shared<Hasher>(hashAlgorithm.get());
-    this->tensorBytes = tensorBytes;
+    this->tensorNBytes = tensorNBytes;
     this->cacheCapacity = cacheCapacity;
     this->layer = layer;
-    this->batchSize = batchSize;
-    this->splitNumber = splitNumber;
+    this->chunkSize = chunkSize;
+    this->hashChunkSize = hashChunkSize;
     this->rootPath = std::regex_replace(rootPath + "/", std::regex("/+"), "/");
     this->tempFileDir =
         std::regex_replace(rootPath + "/__temp/", std::regex("/+"), "/");
-    this->gcInterval = std::chrono::seconds(clientGCInterval);
+    this->gcInterval = std::chrono::seconds(gcInterval);
     this->fileTTL = std::chrono::seconds(ttl);
     this->globalGCInterval = std::chrono::seconds(globalGCInterval);
     this->globalFileTTL = std::chrono::seconds(globalTTL);
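An illustrative construction with the renamed knobs (all values are examples only; Init() comes from the FileStorage interface and is implemented by this subclass):

    auto storage = std::make_shared<LocalFileStorage>(
        /*tensorNBytes=*/80, /*cacheCapacity=*/1024, /*layer=*/2,
        /*chunkSize=*/16, /*hashChunkSize=*/4, "/tmp/llm_cache/",
        /*gcInterval=*/1800, /*ttl=*/1800, /*enableGlobalGC=*/false,
        /*globalGCInterval=*/10800, /*globalTTL=*/10800);
    VINEYARD_CHECK_OK(storage->Init());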
diff --git a/modules/llm-cache/storage/storage.h b/modules/llm-cache/storage/storage.h
index abbe79e640..ce4344edfa 100644
--- a/modules/llm-cache/storage/storage.h
+++ b/modules/llm-cache/storage/storage.h
@@ -21,7 +21,7 @@ limitations under the License.
 #include <vector>
 
 #include "common/util/status.h"
-#include "llm-cache/ds/kv_state_cache_block.h"
+#include "llm-cache/ds/kv_cache_block.h"
 
 namespace vineyard {
 
@@ -31,7 +31,7 @@ class IStorage {
   virtual Status Update(
       const std::vector<int>& tokenList,
-      const std::vector<std::vector<std::pair<LLMKV, LLMKV>>>& kvStateList,
+      const std::vector<std::vector<std::pair<LLMKV, LLMKV>>>& kvCacheList,
       size_t& updated) = 0;
 
   virtual Status Update(
@@ -40,17 +40,22 @@ class IStorage {
   virtual Status Update(
       const std::vector<int>& prefix, const std::vector<int>& tokenList,
-      const std::vector<std::vector<std::pair<LLMKV, LLMKV>>>& kvStateList,
+      const std::vector<std::vector<std::pair<LLMKV, LLMKV>>>& kvCacheList,
       size_t& updated) = 0;
 
   virtual Status Query(
       const std::vector<int>& tokenList,
-      std::vector<std::vector<std::pair<LLMKV, LLMKV>>>& kvStateList,
+      std::vector<std::vector<std::pair<LLMKV, LLMKV>>>& kvCacheList,
       size_t& matched) = 0;
 
-  virtual Status Query(const std::vector<int>& tokenList, int nextToken,
+  virtual Status Query(const std::vector<int>& prefix, int nextToken,
                        std::vector<std::pair<LLMKV, LLMKV>>& kvState) = 0;
 
+  virtual Status Query(
+      const std::vector<int>& prefix, const std::vector<int>& tokenList,
+      std::vector<std::vector<std::pair<LLMKV, LLMKV>>>& kvCacheList,
+      size_t& matched) = 0;
+
   virtual void CloseCache() = 0;
 
   virtual void StartGlobalGCThread() {}
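Putting the extended interface together, the intended round-trip mirrors the worker script below: query what is already cached, then persist the recomputed tail with the matched part as the new prefix. A sketch, where `cache` is assumed to be any IStorage implementation and the kv buffers are set up as in the earlier examples:

    size_t matched = 0, updated = 0;
    VINEYARD_DISCARD(cache->Query(tokens, kvCacheList, matched));
    std::vector<int> prefix(tokens.begin(), tokens.begin() + matched);
    std::vector<int> remaining(tokens.begin() + matched, tokens.end());
    // ... run the model over `remaining` and fill remainingKvCacheList, then:
    VINEYARD_DISCARD(
        cache->Update(prefix, remaining, remainingKvCacheList, updated));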
diff --git a/modules/llm-cache/tests/k8s-test/worker.py b/modules/llm-cache/tests/k8s-test/worker.py
index 3d689212ca..f46da83c81 100644
--- a/modules/llm-cache/tests/k8s-test/worker.py
+++ b/modules/llm-cache/tests/k8s-test/worker.py
@@ -13,9 +13,9 @@ def start_server(port=8888):
     ip = os.environ.get('POD_IP', 'localhost')
     layer = int(os.environ.get('LAYER', 96))
     batch_size = int(os.environ.get('BATCH_SIZE', 16))
-    split_number = int(os.environ.get('SPLIT_NUMBER', 2))
+    hash_chunk_size = int(os.environ.get('HASH_CHUNK_SIZE', 2))
     cache_path = os.environ.get('CACHE_PATH', '/mnt/llm_cache')
-    client_gc_interval = int(os.environ.get('CLIENT_GC_INTERVAL', 30 * 60))
+    gc_interval = int(os.environ.get('GC_INTERVAL', 30 * 60))
     ttl = int(os.environ.get('TTL', 30 * 60))
     enable_global_gc = os.environ.get('ENABLE_GLOBAL_GC', 'false').lower() in ['true', '1']
     global_gc_interval = int(os.environ.get('GLOBAL_GC_INTERVAL', 3 * 60 * 60))
@@ -29,9 +29,9 @@ def start_server(port=8888):
 
     file_cache_config = FileCacheConfig(
         chunk_size = int(batch_size),
-        split_number = int(split_number),
+        hash_chunk_size = int(hash_chunk_size),
         root = cache_path,
-        client_gc_interval = client_gc_interval,
+        gc_interval = gc_interval,
         ttl = ttl,
         enable_global_gc = enable_global_gc,
         global_gc_interval = global_gc_interval,
@@ -39,7 +39,7 @@ def start_server(port=8888):
     )
     cache = KVCache(
         cache_config = file_cache_config,
-        tensor_bytes=kv_tensor.nbytes,  # should be the same as the nbytes of the tensor
+        tensor_nbytes=kv_tensor.nbytes,  # should be the same as the nbytes of the tensor
         cache_capacity=1024,
         layer=int(layer),
     )
@@ -64,7 +64,7 @@ def reserve_kv_tensors(kv_tensors, num_tokens, kv_tensor):
         return kv_tensors
 
     # used to hold the query results
-    kv_state_list = []
+    kv_cache_list = []
 
     while True:
         clientsocket, _ = serversocket.accept()
@@ -80,10 +80,10 @@ def reserve_kv_tensors(kv_tensors, num_tokens, kv_tensor):
             tokens = tokens.replace('\n', '').split(' ')
             tokens = [int(token) for token in tokens]
 
-            kv_state_list = reserve_kv_tensors(kv_state_list, len(tokens), kv_tensor)
+            kv_cache_list = reserve_kv_tensors(kv_cache_list, len(tokens), kv_tensor)
 
             query_start_time = time.time()
-            matched = cache.query(tokens, kv_state_list)
+            matched = cache.query(tokens, kv_cache_list)
             query_end_time = time.time()
             if matched > 0:
                 total_query_time += query_end_time - query_start_time
@@ -92,14 +92,14 @@ def reserve_kv_tensors(kv_tensors, num_tokens, kv_tensor):
             total_tokens += len(tokens)
remaining = tokens[matched:] - kv_state_list_remaining = [ + kv_cache_list_remaining = [ [ (KVTensor(kv_tensor.ctypes.data, kv_tensor.nbytes), KVTensor(kv_tensor.ctypes.data, kv_tensor.nbytes)) for _ in range(layer) ] for _ in remaining ] update_start_time = time.time() - updated = cache.update(tokens[:matched], remaining, kv_state_list_remaining) + updated = cache.update(tokens[:matched], remaining, kv_cache_list_remaining) total_updated_tokens += updated update_end_time = time.time() if updated > 0: diff --git a/modules/llm-cache/tests/k8s-test/yamls/worker.yaml b/modules/llm-cache/tests/k8s-test/yamls/worker.yaml index 603fb72f0b..32cc86212f 100644 --- a/modules/llm-cache/tests/k8s-test/yamls/worker.yaml +++ b/modules/llm-cache/tests/k8s-test/yamls/worker.yaml @@ -35,9 +35,9 @@ spec: value: "96" - name: BATCH_SIZE value: "16" - - name: SPLIT_NUMBER + - name: HASH_CHUNK_SIZE value: "2" - - name: CLIENT_GC_INTERVAL + - name: GC_INTERVAL value: "1800" - name: TTL value: "1800" diff --git a/modules/llm-cache/tests/kv_state_cache_benchmark_test.cc b/modules/llm-cache/tests/kv_cache_benchmark_test.cc similarity index 92% rename from modules/llm-cache/tests/kv_state_cache_benchmark_test.cc rename to modules/llm-cache/tests/kv_cache_benchmark_test.cc index 8dea2d4041..4d6ea93477 100644 --- a/modules/llm-cache/tests/kv_state_cache_benchmark_test.cc +++ b/modules/llm-cache/tests/kv_cache_benchmark_test.cc @@ -23,7 +23,7 @@ limitations under the License. #include "client/ds/object_meta.h" #include "common/util/logging.h" -#include "llm-cache/ds/kv_state_cache_manager.h" +#include "llm-cache/ds/kv_cache_manager.h" using namespace vineyard; // NOLINT(build/namespaces) @@ -32,13 +32,13 @@ constexpr int CAPACITY = 8000; constexpr int LAYER = 64; constexpr int BLOCK_SIZE = 100; -std::shared_ptr manager; +std::shared_ptr manager; VineyardCacheConfig config(TENSORBYTES, CAPACITY, LAYER, BLOCK_SIZE, 300); Client client; void init(std::string socket) { VINEYARD_CHECK_OK(client.Connect(socket)); - VINEYARD_CHECK_OK(KVStateCacheManager::Make(client, manager, config)); + VINEYARD_CHECK_OK(KVCacheManager::Make(client, manager, config)); } std::vector generate_unique_tokens(size_t max_length) { @@ -83,7 +83,7 @@ void benchmark_inference(std::vector>& tokens) { std::chrono::duration query_duration(0); std::vector inference_tokens; - std::vector> kv_state_list; + std::vector> kv_cache_list; void* key_state = malloc(TENSORBYTES); void* value_state = malloc(TENSORBYTES); @@ -106,15 +106,15 @@ void benchmark_inference(std::vector>& tokens) { // query time for (size_t i = 0; i < tokens.size(); ++i) { inference_tokens.clear(); - kv_state_list.clear(); + kv_cache_list.clear(); for (size_t j = 0; j < tokens[i].size(); ++j) { start = std::chrono::steady_clock::now(); Status status = - manager->Query(inference_tokens, tokens[i][j], kv_state_list); + manager->Query(inference_tokens, tokens[i][j], kv_cache_list); if (!status.ok()) { VLOG(100) << "KV state is not in the cache."; } - for (auto& kv : kv_state_list) { + for (auto& kv : kv_cache_list) { for (int currentLayer = 0; currentLayer < LAYER; currentLayer++) { memcpy(key_state, kv.first.data, kv.first.length); memcpy(value_state, kv.second.data, kv.second.length); @@ -138,7 +138,7 @@ void benchmark_inference(std::vector>& tokens) { int main(int argc, char** argv) { if (argc < 2) { - printf("usage ./kv_state_cache_benchmark "); + printf("usage ./kv_cache_benchmark "); return 1; } std::string ipc_socket = std::string(argv[1]); diff --git 
a/modules/llm-cache/tests/kv_state_cache_hash_test.cc b/modules/llm-cache/tests/kv_cache_hash_test.cc similarity index 86% rename from modules/llm-cache/tests/kv_state_cache_hash_test.cc rename to modules/llm-cache/tests/kv_cache_hash_test.cc index 63bcc6e37b..48a1b9ed77 100644 --- a/modules/llm-cache/tests/kv_state_cache_hash_test.cc +++ b/modules/llm-cache/tests/kv_cache_hash_test.cc @@ -24,7 +24,7 @@ limitations under the License. using namespace vineyard; // NOLINT(build/namespaces) constexpr int BATCHSIZE = 16; -constexpr int SPLITNUMBER = 2; +constexpr int HASH_CHUNK_SIZE = 2; constexpr int TOKENLISTSIZE = 100000; std::vector generate_random_tokens(size_t max_length) { @@ -49,7 +49,7 @@ void test_with_tokens(IHashAlgorithm* hash_algorithm, std::vector tokens = generate_random_tokens(10); std::vector paths; VINEYARD_CHECK_OK( - hasher.computePathForTokens(tokens, BATCHSIZE, SPLITNUMBER, paths)); + hasher.computePathForTokens(tokens, BATCHSIZE, HASH_CHUNK_SIZE, paths)); VINEYARD_ASSERT(paths.size() == 0); // test the hash with the tokens more than the batch size @@ -57,15 +57,15 @@ void test_with_tokens(IHashAlgorithm* hash_algorithm, std::vector tokens1 = generate_random_tokens(17); std::vector tokens2 = generate_random_tokens(18); VINEYARD_CHECK_OK( - hasher.computePathForTokens(tokens1, BATCHSIZE, SPLITNUMBER, paths1)); + hasher.computePathForTokens(tokens1, BATCHSIZE, HASH_CHUNK_SIZE, paths1)); VINEYARD_CHECK_OK( - hasher.computePathForTokens(tokens2, BATCHSIZE, SPLITNUMBER, paths2)); + hasher.computePathForTokens(tokens2, BATCHSIZE, HASH_CHUNK_SIZE, paths2)); VINEYARD_ASSERT(paths1.size() == paths1.size()); paths.clear(); tokens = generate_random_tokens(100); VINEYARD_CHECK_OK( - hasher.computePathForTokens(tokens, BATCHSIZE, SPLITNUMBER, paths)); + hasher.computePathForTokens(tokens, BATCHSIZE, HASH_CHUNK_SIZE, paths)); VINEYARD_ASSERT(paths.size() == size_t(100 / 16)); LOG(INFO) << "Passed the " << hash_name << " test of tokens"; } @@ -79,10 +79,10 @@ void test_accuracy(IHashAlgorithm* hash_algorithm, std::vector paths2; for (int i = 0; i < 100; i++) { std::vector tokens = generate_random_tokens(100); - VINEYARD_CHECK_OK( - hasher.computePathForTokens(tokens, BATCHSIZE, SPLITNUMBER, paths1)); - VINEYARD_CHECK_OK( - hasher.computePathForTokens(tokens, BATCHSIZE, SPLITNUMBER, paths2)); + VINEYARD_CHECK_OK(hasher.computePathForTokens(tokens, BATCHSIZE, + HASH_CHUNK_SIZE, paths1)); + VINEYARD_CHECK_OK(hasher.computePathForTokens(tokens, BATCHSIZE, + HASH_CHUNK_SIZE, paths2)); } VINEYARD_ASSERT(paths1.size() == paths2.size()); @@ -119,9 +119,9 @@ void test_hash_conflict() { tokens_map[tokens]++; token_size += tokens.size(); VINEYARD_CHECK_OK(murmur_hasher.computePathForTokens( - tokens, BATCHSIZE, SPLITNUMBER, murmur_hash_paths)); + tokens, BATCHSIZE, HASH_CHUNK_SIZE, murmur_hash_paths)); VINEYARD_CHECK_OK(city_hasher.computePathForTokens( - tokens, BATCHSIZE, SPLITNUMBER, city_hash_paths)); + tokens, BATCHSIZE, HASH_CHUNK_SIZE, city_hash_paths)); } for (size_t i = 0; i < murmur_hash_paths.size(); i++) { diff --git a/modules/llm-cache/tests/kv_state_cache_local_file_test.cc b/modules/llm-cache/tests/kv_cache_local_file_test.cc similarity index 83% rename from modules/llm-cache/tests/kv_state_cache_local_file_test.cc rename to modules/llm-cache/tests/kv_cache_local_file_test.cc index 30ae068b4d..ee9e4f4d9a 100644 --- a/modules/llm-cache/tests/kv_state_cache_local_file_test.cc +++ b/modules/llm-cache/tests/kv_cache_local_file_test.cc @@ -20,12 +20,12 @@ limitations under the License. 
#include "gulrak/filesystem.hpp" #include "llm-cache/ds/config.h" -#include "llm-cache/ds/kv_state_cache_manager.h" +#include "llm-cache/ds/kv_cache_manager.h" #include "rax/radix.h" using namespace vineyard; // NOLINT(build/namespaces) -int tensorBytes = 80; +int tensorNBytes = 80; int capacity = 20; int layer = 3; @@ -43,10 +43,10 @@ std::vector round_4_tokens = {1, 2, 3, 4, 5, 6}; std::vector> tokens_list = {round_1_tokens, round_2_tokens, round_3_tokens, round_4_tokens}; -std::shared_ptr init() { - std::shared_ptr kv_state_cache_manager; - VINEYARD_CHECK_OK(KVStateCacheManager::Make(kv_state_cache_manager, config)); - return kv_state_cache_manager; +std::shared_ptr init() { + std::shared_ptr kv_cache_manager; + VINEYARD_CHECK_OK(KVCacheManager::Make(kv_cache_manager, config)); + return kv_cache_manager; } void print_current_tokens(const std::vector& prefix, int next_token) { @@ -65,10 +65,10 @@ void print_kv_state(const std::vector>& kv_state) { reinterpret_cast(kv_state[i].first.data); uint8_t* value_state_data = reinterpret_cast(kv_state[i].second.data); - // print the first tensorBytes bytes + // print the first tensorNBytes bytes std::string key_state_str = ""; std::string value_state_str = ""; - for (int j = 0; j < tensorBytes; j++) { + for (int j = 0; j < tensorNBytes; j++) { key_state_str += std::to_string(key_state_data[j]) + " "; value_state_str += std::to_string(value_state_data[j]) + " "; } @@ -85,13 +85,13 @@ std::vector> generate_kv_state(int token) { for (int currentLayer = 0; currentLayer < layer; currentLayer++) { LLMKV key_state; LLMKV value_state; - key_state.data = malloc(tensorBytes); - value_state.data = malloc(tensorBytes); + key_state.data = malloc(tensorNBytes); + value_state.data = malloc(tensorNBytes); - key_state.length = tensorBytes; - value_state.length = tensorBytes; + key_state.length = tensorNBytes; + value_state.length = tensorNBytes; - for (int i = 0; i < tensorBytes; ++i) { + for (int i = 0; i < tensorNBytes; ++i) { (reinterpret_cast(key_state.data))[i] = (static_cast(token)) + i + currentLayer; (reinterpret_cast(value_state.data))[i] = @@ -108,13 +108,13 @@ void check_kv_state(const std::vector>& kv_state, VINEYARD_ASSERT(kv_state.size() == (size_t) layer); for (size_t index = 0; index < kv_state.size(); ++index) { LOG(INFO) << "kv_state length: " << kv_state[index].first.length - << "tensorBytes: " << tensorBytes << "layer: " << layer; - VINEYARD_ASSERT(kv_state[index].first.length == (size_t) tensorBytes); - VINEYARD_ASSERT(kv_state[index].second.length == (size_t) tensorBytes); - for (int i = 0; i < tensorBytes; ++i) { + << "tensorNBytes: " << tensorNBytes << "layer: " << layer; + VINEYARD_ASSERT(kv_state[index].first.length == (size_t) tensorNBytes); + VINEYARD_ASSERT(kv_state[index].second.length == (size_t) tensorNBytes); + for (int i = 0; i < tensorNBytes; ++i) { if ((reinterpret_cast(kv_state[index].first.data))[i] != (static_cast(token)) + i + index) { - LOG(INFO) << "token:" << token << " tensorBytes" << tensorBytes + LOG(INFO) << "token:" << token << " tensorNBytes" << tensorNBytes << " layer:" << index; LOG(INFO) << "key_state[" << i << "]: " << (reinterpret_cast(kv_state[index].first.data))[i] @@ -124,7 +124,7 @@ void check_kv_state(const std::vector>& kv_state, } if (reinterpret_cast(kv_state[index].second.data)[i] != (static_cast(token)) + i + index) { - LOG(INFO) << "token:" << token << " tensorBytes" << tensorBytes + LOG(INFO) << "token:" << token << " tensorNBytes" << tensorNBytes << " layer:" << index; LOG(INFO) << "value_state[" 
<< i << "]: " << (reinterpret_cast( @@ -137,7 +137,7 @@ void check_kv_state(const std::vector>& kv_state, } } -void inference(std::shared_ptr& kv_state_cache_manager, +void inference(std::shared_ptr& kv_cache_manager, std::vector tokens, bool block = false) { std::vector inference_tokens; std::vector>> kv_state; @@ -150,8 +150,7 @@ void inference(std::shared_ptr& kv_state_cache_manager, } size_t updated = 0; - Status result = - kv_state_cache_manager->Update(inference_tokens, kv_state, updated); + Status result = kv_cache_manager->Update(inference_tokens, kv_state, updated); std::vector>> kv_state_to_query; for (size_t i = 0; i < tokens.size(); ++i) { @@ -160,8 +159,8 @@ void inference(std::shared_ptr& kv_state_cache_manager, kv_state_to_query.push_back(current_kv_state); } size_t matched = 0; - Status query_result = kv_state_cache_manager->Query( - inference_tokens, kv_state_to_query, matched); + Status query_result = + kv_cache_manager->Query(inference_tokens, kv_state_to_query, matched); if (!query_result.ok()) { LOG(INFO) << "Query failed!"; } @@ -180,7 +179,7 @@ void checkFilesNotExist(std::string dir) { } void threadFunc(int sleep_time) { - std::shared_ptr manager = init(); + std::shared_ptr manager = init(); for (size_t i = 0; i < tokens_list.size(); i++) { LOG(INFO) << "Round " << i << " :"; @@ -193,10 +192,10 @@ void threadFunc(int sleep_time) { } int main(int argc, char** argv) { - LOG(INFO) << "Test KVStateCache with tensorBytes: " << tensorBytes + LOG(INFO) << "Test KVCache with tensorNBytes: " << tensorNBytes << ", capacity: " << capacity << ", layer: " << layer; - config = FileCacheConfig(tensorBytes, capacity, layer, 4, 2, + config = FileCacheConfig(tensorNBytes, capacity, layer, 4, 2, "/tmp/llm_cache/", LOCAL, 1, 1, false, 3, 5); std::vector threads; @@ -211,7 +210,7 @@ int main(int argc, char** argv) { checkFilesNotExist("/tmp/llm_cache/"); - config = FileCacheConfig(tensorBytes, capacity, layer, 4, 2, + config = FileCacheConfig(tensorNBytes, capacity, layer, 4, 2, "/tmp/llm_cache/", LOCAL, 10, 20, true, 1, 2); threads.clear(); @@ -227,6 +226,6 @@ int main(int argc, char** argv) { sleep(3); checkFilesNotExist("/tmp/llm_cache/"); - LOG(INFO) << "Passed KVStateCache tests..."; + LOG(INFO) << "Passed KVCache tests..."; return 0; } diff --git a/modules/llm-cache/tests/kv_state_cache_radix_tree_test.cc b/modules/llm-cache/tests/kv_cache_radix_tree_test.cc similarity index 99% rename from modules/llm-cache/tests/kv_state_cache_radix_tree_test.cc rename to modules/llm-cache/tests/kv_cache_radix_tree_test.cc index 235e344405..723c9b15a2 100644 --- a/modules/llm-cache/tests/kv_state_cache_radix_tree_test.cc +++ b/modules/llm-cache/tests/kv_cache_radix_tree_test.cc @@ -21,7 +21,7 @@ limitations under the License. #include "rax/radix.h" #include "common/util/logging.h" -#include "llm-cache/ds/kv_state_cache_manager.h" +#include "llm-cache/ds/kv_cache_manager.h" using namespace vineyard; // NOLINT(build/namespaces) diff --git a/modules/llm-cache/tests/kv_state_cache_test.cc b/modules/llm-cache/tests/kv_cache_test.cc similarity index 84% rename from modules/llm-cache/tests/kv_state_cache_test.cc rename to modules/llm-cache/tests/kv_cache_test.cc index e936a8afed..25b39060cc 100644 --- a/modules/llm-cache/tests/kv_state_cache_test.cc +++ b/modules/llm-cache/tests/kv_cache_test.cc @@ -22,11 +22,11 @@ limitations under the License. 
#include "common/util/logging.h" #include "llm-cache/ds/config.h" -#include "llm-cache/ds/kv_state_cache_manager.h" +#include "llm-cache/ds/kv_cache_manager.h" using namespace vineyard; // NOLINT(build/namespaces) -int tensorBytes = 80; +int tensorNBytes = 80; int capacity = 20; int layer = 3; int block_size = 5; @@ -48,11 +48,10 @@ std::vector round_4_tokens = {1, 2, 3, 4, 5, 6}; std::vector> tokens_list = {round_1_tokens, round_2_tokens, round_3_tokens, round_4_tokens}; -std::shared_ptr init(Client& client) { - std::shared_ptr kv_state_cache_manager; - VINEYARD_CHECK_OK( - KVStateCacheManager::Make(client, kv_state_cache_manager, config)); - return kv_state_cache_manager; +std::shared_ptr init(Client& client) { + std::shared_ptr kv_cache_manager; + VINEYARD_CHECK_OK(KVCacheManager::Make(client, kv_cache_manager, config)); + return kv_cache_manager; } void print_current_tokens(const std::vector& prefix, int next_token) { @@ -71,10 +70,10 @@ void print_kv_state(const std::vector>& kv_state) { reinterpret_cast(kv_state[i].first.data); uint8_t* value_state_data = reinterpret_cast(kv_state[i].second.data); - // print the first tensorBytes bytes + // print the first tensorNBytes bytes std::string key_state_str = ""; std::string value_state_str = ""; - for (int j = 0; j < tensorBytes; j++) { + for (int j = 0; j < tensorNBytes; j++) { key_state_str += std::to_string(key_state_data[j]) + " "; value_state_str += std::to_string(value_state_data[j]) + " "; } @@ -91,13 +90,13 @@ std::vector> generate_kv_state(int token) { for (int currentLayer = 0; currentLayer < layer; currentLayer++) { LLMKV key_state; LLMKV value_state; - key_state.data = malloc(tensorBytes); - value_state.data = malloc(tensorBytes); + key_state.data = malloc(tensorNBytes); + value_state.data = malloc(tensorNBytes); - key_state.length = tensorBytes; - value_state.length = tensorBytes; + key_state.length = tensorNBytes; + value_state.length = tensorNBytes; - for (int i = 0; i < tensorBytes; ++i) { + for (int i = 0; i < tensorNBytes; ++i) { (reinterpret_cast(key_state.data))[i] = (static_cast(token)) + i + currentLayer; (reinterpret_cast(value_state.data))[i] = @@ -112,12 +111,12 @@ void check_kv_state(const std::vector>& kv_state, int& token) { VINEYARD_ASSERT(kv_state.size() == (size_t) layer); for (size_t index = 0; index < kv_state.size(); ++index) { - VINEYARD_ASSERT(kv_state[index].first.length == (size_t) tensorBytes); - VINEYARD_ASSERT(kv_state[index].second.length == (size_t) tensorBytes); - for (int i = 0; i < tensorBytes; ++i) { + VINEYARD_ASSERT(kv_state[index].first.length == (size_t) tensorNBytes); + VINEYARD_ASSERT(kv_state[index].second.length == (size_t) tensorNBytes); + for (int i = 0; i < tensorNBytes; ++i) { if ((reinterpret_cast(kv_state[index].first.data))[i] != (static_cast(token)) + i + index) { - LOG(INFO) << "token:" << token << " tensorBytes" << tensorBytes + LOG(INFO) << "token:" << token << " tensorNBytes" << tensorNBytes << " layer:" << index; LOG(INFO) << "key_state[" << i << "]: " << (reinterpret_cast(kv_state[index].first.data))[i] @@ -127,7 +126,7 @@ void check_kv_state(const std::vector>& kv_state, } if (reinterpret_cast(kv_state[index].second.data)[i] != (static_cast(token)) + i + index) { - LOG(INFO) << "token:" << token << " tensorBytes" << tensorBytes + LOG(INFO) << "token:" << token << " tensorNBytes" << tensorNBytes << " layer:" << index; LOG(INFO) << "value_state[" << i << "]: " << (reinterpret_cast( @@ -140,7 +139,7 @@ void check_kv_state(const std::vector>& kv_state, } } -void 
inference(std::shared_ptr& kv_state_cache_manager, +void inference(std::shared_ptr& kv_cache_manager, std::vector tokens, bool block = false) { std::vector inference_tokens; std::vector> kv_state; @@ -151,13 +150,13 @@ void inference(std::shared_ptr& kv_state_cache_manager, LOG(INFO) << "before query"; LOG(INFO) << "kv_state_to_query size: " << kv_state_to_query.size() - << "layer" << layer << "tensorBytes" << tensorBytes; + << "layer" << layer << "tensorNBytes" << tensorNBytes; kv_state_to_query.clear(); for (int currentLayer = 0; currentLayer < layer; currentLayer++) { kv_state_to_query.emplace_back(LLMKV{nullptr, 0}, LLMKV{nullptr, 0}); } - Status result = kv_state_cache_manager->Query(inference_tokens, tokens[i], - kv_state_to_query); + Status result = + kv_cache_manager->Query(inference_tokens, tokens[i], kv_state_to_query); if (!result.ok()) { LOG(INFO) << "Can not find the kv_state from cache:"; print_current_tokens(inference_tokens, tokens[i]); @@ -165,7 +164,7 @@ void inference(std::shared_ptr& kv_state_cache_manager, kv_state = generate_kv_state(tokens[i]); print_kv_state(kv_state); Status status = - kv_state_cache_manager->Update(inference_tokens, tokens[i], kv_state); + kv_cache_manager->Update(inference_tokens, tokens[i], kv_state); if (!status.ok()) { // Not a error. May be the cache is full. VLOG(100) << "Put kv state into cache failed."; @@ -183,7 +182,7 @@ void inference(std::shared_ptr& kv_state_cache_manager, void threadFunc(std::string socket) { Client client; VINEYARD_CHECK_OK(client.Connect(socket)); - std::shared_ptr manager = init(client); + std::shared_ptr manager = init(client); for (size_t i = 0; i < tokens_list.size(); i++) { inference(manager, tokens_list[i]); @@ -205,7 +204,7 @@ void clearGlobalObject(std::vector& sockets) { Client client; VINEYARD_CHECK_OK(client.Connect(sockets[0])); - VINEYARD_CHECK_OK(KVStateCacheManager::ClearGlobalCache(client, config)); + VINEYARD_CHECK_OK(KVCacheManager::ClearGlobalCache(client, config)); client.Disconnect(); for (size_t i = 0; i < sockets.size(); i++) { @@ -231,9 +230,9 @@ int main(int argc, char** argv) { std::vector sockets; if (argc < 2) { printf( - "usage ./kv_state_cache_test --client-num " + "usage ./kv_cache_test --client-num " "--vineyard-ipc-sockets ... 
-d " - " -c -l -b \n"); + " -c -l -b \n"); return 1; } @@ -245,7 +244,7 @@ int main(int argc, char** argv) { for (int i = 3; i < argc; i++) { if (strcmp(argv[i], "-d") == 0) { - tensorBytes = atoi(argv[i + 1]); + tensorNBytes = atoi(argv[i + 1]); } else if (strcmp(argv[i], "-c") == 0) { capacity = atoi(argv[i + 1]); } else if (strcmp(argv[i], "-l") == 0) { @@ -273,12 +272,12 @@ int main(int argc, char** argv) { } } - LOG(INFO) << "Test KVStateCache with tensorBytes: " << tensorBytes + LOG(INFO) << "Test KVCache with tensorNBytes: " << tensorNBytes << ", capacity: " << capacity << ", layer: " << layer << ", block_size: " << block_size << " and use " << client_num << " client."; - config = VineyardCacheConfig(tensorBytes, capacity, layer, block_size, 3, + config = VineyardCacheConfig(tensorNBytes, capacity, layer, block_size, 3, llmCacheSyncLock, llmCacheObjectName, llmRefcntObjectName); @@ -308,6 +307,6 @@ int main(int argc, char** argv) { } LOG(INFO) << "Total memory usage:" << total_memory_usage; - LOG(INFO) << "Passed KVStateCache tests..."; + LOG(INFO) << "Passed KVCache tests..."; return 0; } diff --git a/modules/llm-cache/tests/refcnt_map_test.cc b/modules/llm-cache/tests/refcnt_map_test.cc index 0cf02d4b42..d2fc92ccea 100644 --- a/modules/llm-cache/tests/refcnt_map_test.cc +++ b/modules/llm-cache/tests/refcnt_map_test.cc @@ -20,11 +20,11 @@ limitations under the License. #include "rax/radix.h" #include "common/util/logging.h" -#include "llm-cache/ds/kv_state_cache_manager.h" +#include "llm-cache/ds/kv_cache_manager.h" using namespace vineyard; // NOLINT(build/namespaces) -constexpr int tensorBytes = 80; +constexpr int tensorNBytes = 80; constexpr int capacity = 5; constexpr int layer = 3; constexpr int block_size = 4; @@ -34,7 +34,7 @@ std::vector round_2_tokens = {1, 2, 4, 9, 10}; // split to two blocks std::vector> round_token_list = {round_1_tokens, round_2_tokens}; -std::vector> kv_state_cache_managers; +std::vector> kv_cache_managers; std::vector> blob_storages; std::string llmCacheObjectName = "refcnt_map_test_cache_object"; std::string llmCacheSyncLock = "refcnt_map_test_cache_lock"; @@ -58,10 +58,10 @@ void print_kv_state(const std::vector>& kv_state) { reinterpret_cast(kv_state[i].first.data); uint8_t* value_state_data = reinterpret_cast(kv_state[i].second.data); - // print the first tensorBytes bytes + // print the first tensorNBytes bytes std::string key_state_str = ""; std::string value_state_str = ""; - for (int j = 0; j < tensorBytes; j++) { + for (int j = 0; j < tensorNBytes; j++) { key_state_str += std::to_string(key_state_data[j]) + " "; value_state_str += std::to_string(value_state_data[j]) + " "; } @@ -78,13 +78,13 @@ std::vector> generate_kv_state(int token) { for (int currentLayer = 0; currentLayer < layer; currentLayer++) { LLMKV key_state; LLMKV value_state; - key_state.data = malloc(tensorBytes); - value_state.data = malloc(tensorBytes); + key_state.data = malloc(tensorNBytes); + value_state.data = malloc(tensorNBytes); - key_state.length = tensorBytes; - value_state.length = tensorBytes; + key_state.length = tensorNBytes; + value_state.length = tensorNBytes; - for (int i = 0; i < tensorBytes; ++i) { + for (int i = 0; i < tensorNBytes; ++i) { (reinterpret_cast(key_state.data))[i] = (static_cast(token)) + i + currentLayer; (reinterpret_cast(value_state.data))[i] = @@ -100,12 +100,12 @@ void check_kv_state(const std::vector>& kv_state, int& token) { VINEYARD_ASSERT(kv_state.size() == (size_t) layer); for (size_t index = 0; index < kv_state.size(); ++index) { - 
VINEYARD_ASSERT(kv_state[index].first.length == (size_t) tensorBytes); - VINEYARD_ASSERT(kv_state[index].second.length == (size_t) tensorBytes); - for (int i = 0; i < tensorBytes; ++i) { + VINEYARD_ASSERT(kv_state[index].first.length == (size_t) tensorNBytes); + VINEYARD_ASSERT(kv_state[index].second.length == (size_t) tensorNBytes); + for (int i = 0; i < tensorNBytes; ++i) { if ((reinterpret_cast(kv_state[index].first.data))[i] != (static_cast(token)) + i + index) { - LOG(INFO) << "token:" << token << " tensorBytes" << tensorBytes + LOG(INFO) << "token:" << token << " tensorNBytes" << tensorNBytes << " layer:" << index; LOG(INFO) << "key_state[" << i << "]: " << (reinterpret_cast(kv_state[index].first.data))[i] @@ -115,7 +115,7 @@ void check_kv_state(const std::vector>& kv_state, } if (reinterpret_cast(kv_state[index].second.data)[i] != (static_cast(token)) + i + index) { - LOG(INFO) << "token:" << token << " tensorBytes" << tensorBytes + LOG(INFO) << "token:" << token << " tensorNBytes" << tensorNBytes << " layer:" << index; LOG(INFO) << "value_state[" << i << "]: " << (reinterpret_cast( @@ -128,7 +128,7 @@ void check_kv_state(const std::vector>& kv_state, } } -void inference(std::shared_ptr& kv_state_cache_manager, +void inference(std::shared_ptr& kv_cache_manager, std::vector tokens, size_t begin = 0) { std::vector inference_tokens; std::vector> kv_state; @@ -140,16 +140,16 @@ void inference(std::shared_ptr& kv_state_cache_manager, for (int current_layer = 0; current_layer < layer; current_layer++) { kv_state_to_query.emplace_back(LLMKV{nullptr, 0}, LLMKV{nullptr, 0}); } - Status result = kv_state_cache_manager->Query(inference_tokens, tokens[i], - kv_state_to_query); + Status result = kv_cache_manager->Query(inference_tokens, tokens[i], + kv_state_to_query); if (!result.ok()) { LOG(INFO) << "Can not find the kv_state from cache:"; print_current_tokens(inference_tokens, tokens[i]); LOG(INFO) << "Generate the kv_state and update the cache."; kv_state = generate_kv_state(tokens[i]); // print_kv_state(kv_state); - Status status = kv_state_cache_manager->Update(inference_tokens, - tokens[i], kv_state); + Status status = + kv_cache_manager->Update(inference_tokens, tokens[i], kv_state); if (!status.ok()) { // Not a error. May be the cache is full. 
LOG(INFO) << "Put kv state into cache failed:" << status.ToString(); @@ -180,18 +180,18 @@ Status checkRefCnt(std::string ipc_socket) { std::vector> treeDataSets; treeDataSets.push_back(blob_storages[0] - ->GetKVStateCacheBuilder() + ->GetKVCacheBuilder() ->GetRootTree() ->GetSubTreeDataSet()); treeDataSets.push_back(blob_storages[1] - ->GetKVStateCacheBuilder() + ->GetKVCacheBuilder() ->GetRootTree() ->GetSubTreeDataSet()); LOG(INFO) << raxShow( - blob_storages[0]->GetKVStateCacheBuilder()->GetRootTree()->GetRootTree()); + blob_storages[0]->GetKVCacheBuilder()->GetRootTree()->GetRootTree()); LOG(INFO) << "------------------------------------------"; LOG(INFO) << raxShow( - blob_storages[1]->GetKVStateCacheBuilder()->GetRootTree()->GetRootTree()); + blob_storages[1]->GetKVCacheBuilder()->GetRootTree()->GetRootTree()); std::shared_ptr refcnt_map = std::make_shared(client[0]); @@ -211,13 +211,12 @@ Status checkRefCnt(std::string ipc_socket) { ObjectID globalCacheObjectID; blockIDSetToAdd.clear(); VINEYARD_CHECK_OK(client[0].GetName(llmCacheObjectName, globalCacheObjectID)); - std::shared_ptr kvStateCache = - std::dynamic_pointer_cast( - client[0].FetchAndGetObject(globalCacheObjectID)); - kvStateCache->GetCurrentBlockIDSet(blockIDSetToAdd); + std::shared_ptr kvCache = std::dynamic_pointer_cast( + client[0].FetchAndGetObject(globalCacheObjectID)); + kvCache->GetCurrentBlockIDSet(blockIDSetToAdd); refcnt_map->IncSetRefcnt(blockIDSetToAdd); - if (kvStateCache->id() != globalCacheObjectID) { - client[0].DelData(kvStateCache->id()); + if (kvCache->id() != globalCacheObjectID) { + client[0].DelData(kvCache->id()); } LOG(INFO) << "Prepare refcnt done"; @@ -244,20 +243,20 @@ void threadFunc(std::string socket, int threadId) { sleep(4); } - std::shared_ptr kv_state_cache_manager; + std::shared_ptr kv_cache_manager; std::shared_ptr blob_storage; VINEYARD_CHECK_OK(BlobStorage::Make( - client[threadId], blob_storage, tensorBytes, capacity, layer, block_size, + client[threadId], blob_storage, tensorNBytes, capacity, layer, block_size, 3, llmCacheSyncLock, llmCacheObjectName, llmRefcntObjectName)); blob_storages.push_back(blob_storage); - kv_state_cache_manager = std::make_shared(blob_storage); - kv_state_cache_managers.push_back(kv_state_cache_manager); + kv_cache_manager = std::make_shared(blob_storage); + kv_cache_managers.push_back(kv_cache_manager); std::vector tokenList = round_token_list[threadId]; if (threadId == 1) { - inference(kv_state_cache_manager, tokenList, 2); + inference(kv_cache_manager, tokenList, 2); } else { - inference(kv_state_cache_manager, tokenList); + inference(kv_cache_manager, tokenList); } sleep(5); @@ -292,7 +291,7 @@ int main(int argc, char** argv) { std::string sockets[2]; if (argc < 2) { printf( - "Usage ./kv_state_cache_test " + "Usage ./kv_cache_test " " -d \n"); return 1; } @@ -315,8 +314,8 @@ int main(int argc, char** argv) { VINEYARD_CHECK_OK(checkRefCnt(sockets[0])); for (int i = 0; i < 2; i++) { - kv_state_cache_managers[i]->Close(); - kv_state_cache_managers[i] = nullptr; + kv_cache_managers[i]->Close(); + kv_cache_managers[i] = nullptr; } LOG(INFO) << "Clear global object"; diff --git a/modules/llm-cache/thread_group.h b/modules/llm-cache/thread_group.h index 335f96fdf9..3e7cc8be70 100644 --- a/modules/llm-cache/thread_group.h +++ b/modules/llm-cache/thread_group.h @@ -42,13 +42,10 @@ namespace vineyard { namespace parallel { class ThreadGroup { + public: using tid_t = uint32_t; - // Returns the path index and task status for parallel execution. 
- // The path index is used to identify and delete results of unsuccessful - // tasks. - using return_t = std::pair; + using return_t = Status; - public: explicit ThreadGroup( uint32_t parallelism = std::thread::hardware_concurrency()); @@ -67,7 +64,7 @@ class ThreadGroup { try { return std::move(_f(std::forward(_args)...)); } catch (std::exception& e) { - return std::pair(-1, Status(StatusCode::kUnknownError, e.what())); + return Status(StatusCode::kUnknownError, e.what()); } }; @@ -114,10 +111,10 @@ class ThreadGroup { * @AddTask@ will be blocked until there are spare thread resources. */ class DynamicThreadGroup { + public: using tid_t = uint32_t; using return_t = Status; - public: explicit DynamicThreadGroup( tid_t parallelism = std::thread::hardware_concurrency()); diff --git a/python/vineyard/llm/__init__.py b/python/vineyard/llm/__init__.py index 99a994518e..07511e257b 100644 --- a/python/vineyard/llm/__init__.py +++ b/python/vineyard/llm/__init__.py @@ -16,167 +16,7 @@ # limitations under the License. # -from typing import Dict -from typing import List -from typing import Optional -from typing import Tuple -from typing import Union - -import numpy as np - -from .config import FileCacheConfig -from .config import VineyardCacheConfig -from .llm_C import KVTensor -from .llm_C import _generate - - -class KVCache: # pylint: disable=too-many-instance-attributes - """KVCache is a class that manages the llm kv cache in vineyard.""" - - def __init__( - self, - cache_config: Union[VineyardCacheConfig, FileCacheConfig], - tensor_bytes: int = 10, - cache_capacity: int = 10, - layer: int = 1, - **kwargs - ): - """Create a llm kv cache manager based on vineyard blob. - - Args: - cache_config (Union[VineyardCacheConfig, FileCacheConfig]): - The config of the kv cache, including vineyard cache and file cache. - tensor_bytes (int, optional): - The size of the kv cache tensor. - Defaults to 10. - cache_capacity (int, optional): - The capacity of the KV cache refers to the maximum number of - tokens it can hold. Defaults to 10. - layer (int, optional): - The number of layers of the kv cache. Defaults to 1. - """ - self.kv_cache_manager = None - if not isinstance(cache_config, VineyardCacheConfig) and not isinstance( - cache_config, FileCacheConfig - ): - raise ValueError( - "The cache_config should be VineyardCacheConfig or FileCacheConfig." - ) - self.tensor_bytes = tensor_bytes - self.cache_capacity = cache_capacity - self.layer = layer - - self.kv_cache_manager = _generate( - tensor_bytes=tensor_bytes, - cache_capacity=cache_capacity, - layer=layer, - **cache_config.__dict__, - **kwargs - ) - - def update( - self, - prefix: Optional[List[int]], - tokens: List[int], - kv_state_list: List[List[Tuple[KVTensor, KVTensor]]], - ) -> int: - """Update the kv cache stored in vineyard. - - Args: - prefix (list): the prefix of the tokens - For FileCacheConfig, the length of the prefix should be - multiple of the chunk size. - tokens (list): the tokens of the kv cache - e,g, [1 2 3 4] - kv_cache_list (List[List[Tuple[KVTensor, KVTensor]]]): - the kv tensors list of the related tokens including all layers, and - its length should be the same as the length of tokens. - - The k, v tensor for i-th token at the j-th layer is: kv_state_list[i][j] - - Whether the underlying kv cache is vineyard or file, the - kv_state_list is managed by the caller. - Assume the layer is 2, the tokens is [1, 2], then you should allocate - the kv_state_list as follows: - - .. 
code:: python - - kv_state_list = [] - for _ in range(2): # the number of tokens - k_tensor = np.random.rand(2,2).astype(np.float32) - v_tensor = np.random.rand(2,2).astype(np.float32) - kv_state_list.append( - [ - ( - KVTensor(k_tensor.ctypes.data, k_tensor.nbytes), - KVTensor(v_tensor.ctypes.data, v_tensor.nbytes), - ) - for _ in range(2) # the number of layers - ] - ) - - """ - if prefix: - return self.kv_cache_manager.update(prefix, tokens, kv_state_list) - else: - return self.kv_cache_manager.update(tokens, kv_state_list) - - def query( - self, - tokens: List[int], - kv_state_list: List[List[Tuple[KVTensor, KVTensor]]], - ) -> int: - """Query the kv cache stored in vineyard. - - Args: - tokens (list): the tokens of the kv cache - e,g, [1 2 3 4] - kv_state_list: (List[List[Tuple[KVTensor, KVTensor]]]): - the kv tensors list of the related tokens including all layers, and its - length should be the same as the length of tokens. - - The k, v tensor for i-th token at the j-th layer is: kv_state_list[i][j] - - For VineyardConfigCache, the kv_state_list is managed by vineyard. - The caller does not need to malloc and free the memory of the kv state. - Assume the layer is 2, the tokens is [1, 2], then you should allocate - the kv_state_list as follows: - - .. code:: python - - kv_state_list = [ - ( - KVTensor(0, 0), - KVTensor(0, 0), - ) for _ in range(2) # the number of layers - ] * 2 # the number of tokens - - For FileCacheConfig, the kv_state_list is managed by the caller. - The caller needs to malloc and free the memory of the kv state. - Assume the layer is 2, the tokens is [1, 2], then you should allocate - the kv_state_list as follows: - - .. code:: python - - kv_state_list = [] - for _ in range(2): # the number of tokens - k_tensor = np.empty((2,2), dtype=np.float32) - v_tensor = np.empty((2,2), dtype=np.float32) - kv_state_list.append( - [ - ( - KVTensor(k_tensor.ctypes.data, k_tensor.nbytes), - KVTensor(v_tensor.ctypes.data, v_tensor.nbytes), - ) - for _ in range(2) # the number of layers - ] - ) - - Returns: - int: The number of matched tokens. - """ - return self.kv_cache_manager.query(tokens, kv_state_list) - - def __del__(self): - if self.kv_cache_manager: - self.kv_cache_manager.close() +from vineyard.llm.cache import FileCacheConfig +from vineyard.llm.cache import KVCache +from vineyard.llm.cache import KVTensor +from vineyard.llm.cache import VineyardCacheConfig diff --git a/python/vineyard/llm/cache.cc b/python/vineyard/llm/cache.cc new file mode 100644 index 0000000000..b3de645bbf --- /dev/null +++ b/python/vineyard/llm/cache.cc @@ -0,0 +1,193 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/python/vineyard/llm/cache.cc b/python/vineyard/llm/cache.cc
new file mode 100644
index 0000000000..b3de645bbf
--- /dev/null
+++ b/python/vineyard/llm/cache.cc
@@ -0,0 +1,193 @@
+/** Copyright 2020-2023 Alibaba Group Holding Limited.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include <memory>
+
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+
+#include "client/client.h"
+
+#include "llm-cache/ds/config.h"
+#include "llm-cache/ds/kv_cache_block.h"
+#include "llm-cache/ds/kv_cache_manager.h"
+
+namespace py = pybind11;
+
+namespace vineyard {
+
+PYBIND11_MODULE(_llm_C, m) {
+  m.doc() = "vineyard llm kv cache manager module";
+
+  pybind11::enum_<FilesystemType>(m, "FilesystemType")
+      .value("LOCAL", FilesystemType::LOCAL)
+      .export_values();
+
+  py::class_<LLMKV, std::shared_ptr<LLMKV>>(m, "KVTensor",
+                                            py::buffer_protocol())
+      .def(py::init([](uintptr_t data, size_t length) {
+             return LLMKV{reinterpret_cast<void*>(data), length};
+           }),
+           py::arg("data"), py::arg("length"))
+      .def_property(
+          "data",
+          [](LLMKV& self) -> uintptr_t {  // getter
+            return reinterpret_cast<uintptr_t>(self.data);
+          },
+          [](LLMKV& self, uintptr_t new_ptr) {  // setter
+            self.data = reinterpret_cast<void*>(new_ptr);
+          })
+      .def_property(
+          "length",
+          [](LLMKV& self) -> size_t {  // getter
+            return self.length;
+          },
+          [](LLMKV& self, size_t new_length) {  // setter
+            self.length = new_length;
+          })
+      .def_buffer([](LLMKV& self) -> py::buffer_info {
+        return py::buffer_info(self.data, sizeof(char),
+                               py::format_descriptor<char>::value, 1,
+                               {self.length}, {sizeof(char)});
+      });
+
+  py::class_<KVCacheManager, std::shared_ptr<KVCacheManager>>(m,
+                                                              "KVCacheManager")
+      .def(py::init([](py::object ipc_client, int tensor_nbytes,
+                       int cache_capacity, int layer, int block_size,
+                       int sync_interval, std::string llm_cache_sync_lock,
+                       std::string llm_cache_object_name,
+                       std::string llm_ref_cnt_object_name)
+                        -> std::shared_ptr<KVCacheManager> {
+             VineyardCacheConfig config(
+                 tensor_nbytes, cache_capacity, layer, block_size,
+                 sync_interval, llm_cache_sync_lock, llm_cache_object_name,
+                 llm_ref_cnt_object_name);
+             Client& client = ipc_client.cast<Client&>();
+             std::shared_ptr<KVCacheManager> manager;
+             VINEYARD_CHECK_OK(
+                 vineyard::KVCacheManager::Make(client, manager, config));
+             return manager;
+           }),
+           py::arg("ipc_client"), py::arg("tensor_nbytes") = 1024,
+           py::arg("cache_capacity") = 1024, py::arg("layer") = 1,
+           py::arg("block_size") = 16, py::arg("sync_interval") = 3,
+           py::arg("llm_cache_sync_lock") = "llmCacheSyncLock",
+           py::arg("llm_cache_object_name") = "llm_cache_object",
+           py::arg("llm_ref_cnt_object_name") = "llm_refcnt_object")
+      .def(py::init([](int tensor_nbytes, int cache_capacity, int layer,
+                       int chunk_size, int hash_chunk_size, std::string root,
+                       FilesystemType filesystemType, int gc_interval, int ttl,
+                       bool enable_global_gc, int global_gc_interval,
+                       int global_ttl) -> std::shared_ptr<KVCacheManager> {
+             FileCacheConfig config(
+                 tensor_nbytes, cache_capacity, layer, chunk_size,
+                 hash_chunk_size, root, filesystemType, gc_interval, ttl,
+                 enable_global_gc, global_gc_interval, global_ttl);
+             std::shared_ptr<KVCacheManager> manager;
+             VINEYARD_CHECK_OK(vineyard::KVCacheManager::Make(manager, config));
+             return manager;
+           }),
+           py::arg("tensor_nbytes") = 1024, py::arg("cache_capacity") = 1024,
+           py::arg("layer") = 1, py::arg("chunk_size") = 16,
+           py::arg("hash_chunk_size") = 4, py::arg("root") = "root",
+           py::arg("filesystem_type") = FilesystemType::LOCAL,
+           py::arg("gc_interval") = 30 * 60, py::arg("ttl") = 30 * 60,
+           py::arg("enable_global_gc") = false,
+           py::arg("global_gc_interval") = 30 * 60,
+           py::arg("global_ttl") = 30 * 60)
+      .def(
+          "update",
+          [](KVCacheManager* self, const std::vector<int>& tokenList,
+             int& next_token,
+             const std::vector<std::pair<LLMKV, LLMKV>>& kv_state) {
+            VINEYARD_CHECK_OK(self->Update(tokenList, next_token, kv_state));
+          },
+          py::arg("tokens"), py::arg("next_token"), py::arg("kv_state"))
+      .def(
+          "update",
+          [](KVCacheManager* self, const std::vector<int>& tokens,
+             const std::vector<std::vector<std::pair<LLMKV, LLMKV>>>&
+                 kv_states) -> size_t {
+            size_t updated = 0;
+            VINEYARD_CHECK_OK(self->Update(tokens, kv_states, updated));
+            return updated;
+          },
+          py::arg("tokens"), py::arg("kv_states"))
+      .def(
+          "update",
+          [](KVCacheManager* self, const std::vector<int>& prefix,
+             std::vector<int>& tokens,
+             const std::vector<std::vector<std::pair<LLMKV, LLMKV>>>&
+                 kv_states) -> size_t {
+            size_t updated = 0;
+            VINEYARD_CHECK_OK(self->Update(prefix, tokens, kv_states, updated));
+            return updated;
+          },
+          py::arg("prefix"), py::arg("tokens"), py::arg("kv_states"))
+      .def(
+          "query",
+          [](KVCacheManager* self, const std::vector<int>& tokens,
+             py::list& kv_cache_list) -> size_t {
+            std::vector<std::vector<std::pair<LLMKV, LLMKV>>> kv_state_vec =
+                kv_cache_list
+                    .cast<std::vector<std::vector<std::pair<LLMKV, LLMKV>>>>();
+            size_t matched = 0;
+            VINEYARD_CHECK_OK(self->Query(tokens, kv_state_vec, matched));
+            for (size_t i = 0; i < kv_state_vec.size() && i < matched; ++i) {
+              for (size_t j = 0; j < kv_state_vec[i].size(); ++j) {
+                kv_cache_list[i].cast<py::list>()[j] =
+                    py::cast(kv_state_vec[i][j]);
+              }
+            }
+            return matched;
+          },
+          py::arg("tokens"), py::arg("kv_states"))
+      .def(
+          "query",
+          [](KVCacheManager* self, const std::vector<int>& prefix,
+             int& next_token, py::list& kv_state) {
+            std::vector<std::pair<LLMKV, LLMKV>> kv_state_vec =
+                kv_state.cast<std::vector<std::pair<LLMKV, LLMKV>>>();
+            VINEYARD_CHECK_OK(self->Query(prefix, next_token, kv_state_vec));
+            for (size_t i = 0; i < kv_state_vec.size(); ++i) {
+              kv_state[i] = py::cast(kv_state_vec[i]);
+            }
+          },
+          py::arg("prefix"), py::arg("next_token"), py::arg("kv_states"))
+      .def(
+          "query",
+          [](KVCacheManager* self, const std::vector<int>& prefix,
+             const std::vector<int>& tokens,
+             py::list& kv_cache_list) -> size_t {
+            std::vector<std::vector<std::pair<LLMKV, LLMKV>>> kv_state_vec =
+                kv_cache_list
+                    .cast<std::vector<std::vector<std::pair<LLMKV, LLMKV>>>>();
+            size_t matched = 0;
+            VINEYARD_CHECK_OK(
+                self->Query(prefix, tokens, kv_state_vec, matched));
+            for (size_t i = 0; i < kv_state_vec.size() && i < matched; ++i) {
+              for (size_t j = 0; j < kv_state_vec[i].size(); ++j) {
+                kv_cache_list[i].cast<py::list>()[j] =
+                    py::cast(kv_state_vec[i][j]);
+              }
+            }
+            return matched;
+          },
+          py::arg("prefix"), py::arg("tokens"), py::arg("kv_states"))
+      .def("close", [](KVCacheManager* self) { self->Close(); });
+}
+
+}  // namespace vineyard
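The `KVTensor` binding wraps `LLMKV` as a non-owning (pointer, length) view and
exposes it through the buffer protocol. A minimal sketch of how that behaves from
Python (it assumes the extension is importable and `KVTensor` is re-exported from
`vineyard.llm`, as the `__init__.py` change above shows):

.. code:: python

    import numpy as np

    from vineyard.llm import KVTensor

    k_tensor = np.random.rand(2, 2).astype(np.float32)
    # KVTensor wraps the existing buffer; it does not copy or own the memory,
    # so the numpy array must stay alive while the tensor is in use.
    tensor = KVTensor(k_tensor.ctypes.data, k_tensor.nbytes)
    assert tensor.length == k_tensor.nbytes
    # The buffer protocol exposes the same bytes back to Python.
    assert bytes(memoryview(tensor)) == k_tensor.tobytes()
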
diff --git a/python/vineyard/llm/cache.py b/python/vineyard/llm/cache.py
new file mode 100644
index 0000000000..3e8859afd9
--- /dev/null
+++ b/python/vineyard/llm/cache.py
@@ -0,0 +1,378 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright 2020-2023 Alibaba Group Holding Limited.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import contextlib
+import logging
+import os
+from typing import Any
+from typing import Dict
+from typing import List
+from typing import Optional
+from typing import Tuple
+from typing import Union
+
+from ._llm_C import FilesystemType
+from ._llm_C import KVCacheManager
+from ._llm_C import KVTensor
+
+logger = logging.getLogger('vineyard')
+
+
+def _argument_from_env(
+    kwargs: Dict[str, Any],
+    envprefix: str,
+    name: str,
+    dtype=None,
+):
+    envname = f'{envprefix}_{name.upper()}'
+    if envname in os.environ:
+        value = os.environ.get(envname)
+        if dtype:
+            value = dtype(value)
+        kwargs[name] = value
+
+
+class VineyardCacheConfig:
+    """VineyardCacheConfig is a class to configure the llm kv cache in vineyard."""
+
+    def __init__(
+        self,
+        socket: str,
+        block_size: int = 5,
+        sync_interval: int = 3,
+        llm_cache_sync_lock: str = "llmCacheSyncLock",
+        llm_cache_object_name: str = "llm_cache_object",
+        llm_ref_cnt_object_name: str = "llm_refcnt_object",
+    ):
+        """Create a vineyard cache config.
+
+        Args:
+            socket (str):
+                The ipc socket of the vineyardd instance.
+            block_size (int, optional):
+                The block size of the kv cache. Defaults to 5.
+            sync_interval (int, optional):
+                The sync interval of the kv cache. Defaults to 3.
+            llm_cache_sync_lock (str, optional):
+                The name of the kv cache sync lock. Defaults to "llmCacheSyncLock".
+            llm_cache_object_name (str, optional):
+                The name of the kv cache object. Defaults to "llm_cache_object".
+            llm_ref_cnt_object_name (str, optional):
+                The name of the kv cache ref cnt object.
+                Defaults to "llm_refcnt_object".
+        """
+        import vineyard
+
+        self.socket = socket
+        self.block_size = block_size
+        self.sync_interval = sync_interval
+        self.llm_cache_sync_lock = llm_cache_sync_lock
+        self.llm_cache_object_name = llm_cache_object_name
+        self.llm_ref_cnt_object_name = llm_ref_cnt_object_name
+
+        # Connecting to vineyardd
+        self.ipc_client = vineyard.connect(socket).ipc_client
+
+    def __repr__(self):
+        return (
+            f'VineyardCacheConfig('
+            f'socket={self.socket}, '
+            f'block_size={self.block_size}, '
+            f'sync_interval={self.sync_interval}, '
+            f'llm_cache_sync_lock={self.llm_cache_sync_lock}, '
+            f'llm_cache_object_name={self.llm_cache_object_name}, '
+            f'llm_ref_cnt_object_name={self.llm_ref_cnt_object_name})'
+        )
+
+
+class FileCacheConfig:
+    """FileCacheConfig is a class to configure the llm kv cache on filesystem."""
+
+    def __init__(
+        self,
+        chunk_size: int = 16,
+        hash_chunk_size: int = 2,
+        root: str = "/tmp/vineyard/llm_cache",
+        filesystem_type: FilesystemType = FilesystemType.LOCAL,
+        gc_interval: int = 30 * 60,
+        ttl: int = 30 * 60,
+        enable_global_gc: bool = False,
+        global_gc_interval: int = 3 * 60 * 60,
+        global_ttl: int = 3 * 60 * 60,
+    ):
+        """Create a file cache config.
+
+        Args:
+            chunk_size (int):
+                Divide the token list into batches, each batch
+                containing chunk_size tokens. Defaults to 16.
+            hash_chunk_size (int):
+                Split the hash value into multiple directory levels,
+                hash_chunk_size characters per level; e.g., with
+                hash_chunk_size=2 and hash value 123456, the file path
+                is 12/34/56.
+            root (str):
+                The root directory of the kv state files.
+                Defaults to "/tmp/vineyard/llm_cache".
+            filesystem_type (FilesystemType):
+                The type of the filesystem. Defaults to FilesystemType.LOCAL.
+            gc_interval (int):
+                The interval of the client gc (seconds).
+                Defaults to 30 * 60 seconds.
+            ttl (int):
+                The time to live of the kv state files (seconds).
+                Defaults to 30 * 60 seconds.
+            enable_global_gc (bool):
+                Enable the global gc or not. Defaults to False.
+            global_gc_interval (int):
+                The interval of the global gc (seconds).
+                Defaults to 3 * 60 * 60 seconds.
+            global_ttl (int):
+                The time to live of the global gc files (seconds).
+                Defaults to 3 * 60 * 60 seconds.
+        """
+        self.chunk_size = chunk_size
+        self.hash_chunk_size = hash_chunk_size
+        self.root = root
+        self.filesystem_type = filesystem_type
+        self.gc_interval = gc_interval
+        self.ttl = ttl
+        self.enable_global_gc = enable_global_gc
+        self.global_gc_interval = global_gc_interval
+        self.global_ttl = global_ttl
+
+    def __repr__(self):
+        return (
+            f'FileCacheConfig('
+            f'chunk_size={self.chunk_size}, '
+            f'hash_chunk_size={self.hash_chunk_size}, '
+            f'root={self.root}, '
+            f'filesystem_type={self.filesystem_type}, '
+            f'gc_interval={self.gc_interval}, '
+            f'ttl={self.ttl}, '
+            f'enable_global_gc={self.enable_global_gc}, '
+            f'global_gc_interval={self.global_gc_interval}, '
+            f'global_ttl={self.global_ttl})'
+        )
+
+
+class KVCache:  # pylint: disable=too-many-instance-attributes
+    """KVCache is a class that manages the llm kv cache in vineyard."""
+
+    def __init__(
+        self,
+        cache_config: Optional[Union[VineyardCacheConfig, FileCacheConfig]] = None,
+        tensor_nbytes: int = 1024,
+        cache_capacity: int = 1024,
+        layer: int = 1,
+        rank: Optional[int] = None,
+        world_size: Optional[int] = None,
+        **kwargs,
+    ):
+        """Create an llm kv cache manager backed by vineyard blobs or files.
+
+        Args:
+            cache_config (Union[VineyardCacheConfig, FileCacheConfig]):
+                The config of the KV cache, including vineyard cache and file cache.
+            tensor_nbytes (int, optional):
+                The size of the k/v cache tensor for each token at each layer.
+                Defaults to 1024.
+            cache_capacity (int, optional):
+                The capacity of the KV cache refers to the maximum number of
+                tokens it can hold. Defaults to 1024.
+            layer (int, optional):
+                The number of layers of the kv cache. Defaults to 1.
+            rank (int, optional):
+                The rank of the current worker. Defaults to None.
+            world_size (int, optional):
+                The total number of workers. Defaults to None.
+        """
+        self.kv_cache_manager = None
+
+        if cache_config is None:
+            if 'VINEYARD_LLM_CACHE_SHARED_MEMORY' in os.environ:
+                config = {}
+                _argument_from_env(
+                    config, 'VINEYARD_LLM_CACHE_SHARED_MEMORY', 'socket', str
+                )
+                _argument_from_env(
+                    config, 'VINEYARD_LLM_CACHE_SHARED_MEMORY', 'block_size', int
+                )
+                _argument_from_env(
+                    config, 'VINEYARD_LLM_CACHE_SHARED_MEMORY', 'sync_interval', int
+                )
+                cache_config = VineyardCacheConfig(**config)
+            if 'VINEYARD_LLM_CACHE_FILESYSTEM' in os.environ:
+                config = {}
+                _argument_from_env(
+                    config, 'VINEYARD_LLM_CACHE_FILESYSTEM', 'chunk_size', int
+                )
+                _argument_from_env(
+                    config, 'VINEYARD_LLM_CACHE_FILESYSTEM', 'hash_chunk_size', int
+                )
+                _argument_from_env(
+                    config, 'VINEYARD_LLM_CACHE_FILESYSTEM', 'root', dtype=str
+                )
+                cache_config = FileCacheConfig(**config)
+
+        if rank is not None and world_size is not None:
+            if isinstance(cache_config, FileCacheConfig):
+                cache_config.root = os.path.join(
+                    cache_config.root, f'{world_size}-{rank}'
+                )
+
+        logger.info("Initializing vineyard llm cache with config: %r", cache_config)
+        if not isinstance(cache_config, VineyardCacheConfig) and not isinstance(
+            cache_config, FileCacheConfig
+        ):
+            raise ValueError(
+                "The cache_config should be VineyardCacheConfig or FileCacheConfig."
+            )
+        self.cache_config = cache_config
+        self.tensor_nbytes = tensor_nbytes
+        self.cache_capacity = cache_capacity
+        self.layer = layer
+
+        self.kv_cache_manager = KVCacheManager(
+            tensor_nbytes=tensor_nbytes,
+            cache_capacity=cache_capacity,
+            layer=layer,
+            **cache_config.__dict__,
+            **kwargs,
+        )
+        if isinstance(cache_config, VineyardCacheConfig):
+            self.chunk_size = cache_config.block_size
+        else:
+            self.chunk_size = cache_config.chunk_size
+
+    def __repr__(self):
+        return (
+            'KVCache('
+            f'cache_config={self.cache_config}, '
+            f'tensor_nbytes={self.tensor_nbytes}, '
+            f'cache_capacity={self.cache_capacity}, '
+            f'layer={self.layer})'
+        )
+
+    def update(
+        self,
+        prefix: List[int],
+        tokens: List[int],
+        kv_cache_list: List[List[Tuple[KVTensor, KVTensor]]],
+    ) -> int:
+        """Update the kv cache stored in vineyard.
+
+        Args:
+            prefix (list): the prefix of the tokens.
+                For FileCacheConfig, the length of the prefix should be
+                a multiple of the chunk size.
+            tokens (list): the tokens of the kv cache,
+                e.g., [1, 2, 3, 4]
+            kv_cache_list (List[List[Tuple[KVTensor, KVTensor]]]):
+                the kv tensors list of the related tokens including all layers, and
+                its length should be the same as the length of tokens.
+
+                The k, v tensor for i-th token at the j-th layer is: kv_cache_list[i][j]
+
+                Whether the underlying kv cache is vineyard or file, the
+                kv_cache_list is managed by the caller.
+                Assume the layer is 2, the tokens is [1, 2], then you should allocate
+                the kv_cache_list as follows:
+
+                .. code:: python
+
+                    kv_cache_list = []
+                    for _ in range(2): # the number of tokens
+                        k_tensor = np.random.rand(2,2).astype(np.float32)
+                        v_tensor = np.random.rand(2,2).astype(np.float32)
+                        kv_cache_list.append(
+                            [
+                                (
+                                    KVTensor(k_tensor.ctypes.data, k_tensor.nbytes),
+                                    KVTensor(v_tensor.ctypes.data, v_tensor.nbytes),
+                                )
+                                for _ in range(2) # the number of layers
+                            ]
+                        )
+
+        """
+        if prefix:
+            return self.kv_cache_manager.update(prefix, tokens, kv_cache_list)
+        else:
+            return self.kv_cache_manager.update(tokens, kv_cache_list)
+
+    def query(
+        self,
+        prefix: List[int],
+        tokens: List[int],
+        kv_cache_list: List[List[Tuple[KVTensor, KVTensor]]],
+    ) -> int:
+        """Query the kv cache stored in vineyard.
+
+        Args:
+            prefix (list): the prefix of the tokens.
+            tokens (list): the tokens of the kv cache,
+                e.g., [1, 2, 3, 4]
+            kv_cache_list (List[List[Tuple[KVTensor, KVTensor]]]):
+                the kv tensors list of the related tokens including all layers, and its
+                length should be the same as the length of tokens.
+
+                The k, v tensor for i-th token at the j-th layer is: kv_cache_list[i][j]
+
+                For VineyardCacheConfig, the kv_cache_list is managed by vineyard.
+                The caller does not need to malloc and free the memory of the kv state.
+                Assume the layer is 2, the tokens is [1, 2], then you should allocate
+                the kv_cache_list as follows:
+
+                .. code:: python
+
+                    kv_cache_list = [
+                        (
+                            KVTensor(0, 0),
+                            KVTensor(0, 0),
+                        ) for _ in range(2) # the number of layers
+                    ] * 2 # the number of tokens
+
+                For FileCacheConfig, the kv_cache_list is managed by the caller.
+                The caller needs to malloc and free the memory of the kv state.
+                Assume the layer is 2, the tokens is [1, 2], then you should allocate
+                the kv_cache_list as follows:
+
+                .. code:: python
+
+                    kv_cache_list = []
+                    for _ in range(2): # the number of tokens
+                        k_tensor = np.empty((2,2), dtype=np.float32)
+                        v_tensor = np.empty((2,2), dtype=np.float32)
+                        kv_cache_list.append(
+                            [
+                                (
+                                    KVTensor(k_tensor.ctypes.data, k_tensor.nbytes),
+                                    KVTensor(v_tensor.ctypes.data, v_tensor.nbytes),
+                                )
+                                for _ in range(2) # the number of layers
+                            ]
+                        )
+
+        Returns:
+            int: The number of matched tokens.
+        """
+        if prefix:
+            return self.kv_cache_manager.query(prefix, tokens, kv_cache_list)
+        else:
+            return self.kv_cache_manager.query(tokens, kv_cache_list)
+
+    def __del__(self):
+        if self.kv_cache_manager:
+            with contextlib.suppress(Exception):
+                self.kv_cache_manager.close()
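Taken together, the new `cache.py` gives the following end-to-end flow. The sketch
below mirrors the patterns in `test_llm.py` further down in this patch; the chunk
sizes, tensor shapes, and layer count are illustrative only:

.. code:: python

    import numpy as np

    from vineyard.llm import FileCacheConfig
    from vineyard.llm import KVCache
    from vineyard.llm import KVTensor

    cache = KVCache(
        cache_config=FileCacheConfig(
            chunk_size=2,
            hash_chunk_size=2,
            root="/tmp/vineyard/llm_cache",
        ),
        tensor_nbytes=16,  # must match the nbytes of each k/v tensor
        cache_capacity=1024,
        layer=2,
    )

    tokens = [1, 2, 3, 4]  # length is a multiple of chunk_size

    # build the caller-managed kv tensors for update; keep the numpy
    # arrays alive, since KVTensor only wraps their buffers
    source_buffers = []
    kv_cache_list = []
    for _ in tokens:
        k_tensor = np.random.rand(2, 2).astype(np.float32)
        v_tensor = np.random.rand(2, 2).astype(np.float32)
        source_buffers.append((k_tensor, v_tensor))
        kv_cache_list.append(
            [
                (
                    KVTensor(k_tensor.ctypes.data, k_tensor.nbytes),
                    KVTensor(v_tensor.ctypes.data, v_tensor.nbytes),
                )
                for _ in range(2)  # the number of layers
            ]
        )
    cache.update(None, tokens, kv_cache_list)

    # for the file cache, query also reads into caller-allocated buffers
    query_buffers = [
        [
            (np.empty((2, 2), dtype=np.float32), np.empty((2, 2), dtype=np.float32))
            for _ in range(2)  # the number of layers
        ]
        for _ in tokens
    ]
    queried = [
        [
            (KVTensor(k.ctypes.data, k.nbytes), KVTensor(v.ctypes.data, v.nbytes))
            for k, v in per_token
        ]
        for per_token in query_buffers
    ]
    matched = cache.query(None, tokens, queried)

When `cache_config` is omitted, the constructor instead reads the
`VINEYARD_LLM_CACHE_FILESYSTEM_*` (or `VINEYARD_LLM_CACHE_SHARED_MEMORY_*`)
environment variables through `_argument_from_env`, which makes the cache
configurable from deployment manifests without code changes.
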
diff --git a/python/vineyard/llm/config.py b/python/vineyard/llm/config.py
deleted file mode 100644
index 1305d22602..0000000000
--- a/python/vineyard/llm/config.py
+++ /dev/null
@@ -1,113 +0,0 @@
-#! /usr/bin/env python
-# -*- coding: utf-8 -*-
-#
-# Copyright 2020-2023 Alibaba Group Holding Limited.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import vineyard
-
-from .llm_C import FilesystemType
-
-
-class VineyardCacheConfig:
-    """VineyardCacheConfig is a class to configure the llm kv cache in vineyard."""
-
-    def __init__(
-        self,
-        socket: str,
-        block_size: int = 5,
-        sync_interval: int = 3,
-        llm_cache_sync_lock: str = "llmCacheSyncLock",
-        llm_cache_object_name: str = "llm_cache_object",
-        llm_ref_cnt_object_name: str = "llm_refcnt_object",
-    ):
-        """Create a vineyard cache config.
-
-        Args:
-            socket (str):
-                The ipc socket of the vineyardd instance.
-            block_size (int, optional):
-                The block size of the kv cache. Defaults to 5.
-            sync_interval (int, optional):
-                The sync interval of the kv cache. Defaults to 3.
-            llm_cache_sync_lock (str, optional):
-                The name of the kv cache sync lock. Defaults to "llmCacheSyncLock".
-            llm_cache_object_name (str, optional):
-                The name of the kv cache object. Defaults to "llm_cache_object".
-            llm_ref_cnt_object_name (str, optional):
-                The name of the kv cache ref cnt object.
-                Defaults to "llm_refcnt_object".
-        """
-        self.ipc_client = vineyard.connect(socket).ipc_client
-        self.block_size = block_size
-        self.sync_interval = sync_interval
-        self.llm_cache_sync_lock = llm_cache_sync_lock
-        self.llm_cache_object_name = llm_cache_object_name
-        self.llm_ref_cnt_object_name = llm_ref_cnt_object_name
-
-
-class FileCacheConfig:
-    """FileCacheConfig is a class to configure the llm kv cache on filesystem."""
-
-    def __init__(
-        self,
-        chunk_size: int = 16,
-        split_number: int = 2,
-        root: str = "/tmp/vineyard/llm_cache",
-        filesystem_type: FilesystemType = FilesystemType.LOCAL,
-        client_gc_interval: int = 30 * 60,
-        ttl: int = 30 * 60,
-        enable_global_gc: bool = False,
-        global_gc_interval: int = 3 * 60 * 60,
-        global_ttl: int = 3 * 60 * 60,
-    ):
-        """Create a file cache config.
-
-        Args:
-            chunk_size (int):
-                Divide the token list into batches, each batch
-                contains batchSize tokens. Defaults to 16.
-            split_number (int):
-                Split the hash value into the file with multiple directories.
-                e.g, splitNumber=2, hash value=123456, the file path is 12/34/56.
-            root (str):
-                The root directory of the kv state files.
-                Defaults to "/tmp/vineyard/llm_cache".
-            filesystem_type (str):
-                The type of the filesystem. Defaults to "local".
-            client_gc_interval (int):
-                The interval of the client gc (seconds).
-                Defaults to 30 * 60 seconds.
-            ttl (int):
-                The time to live of the kv state files (seconds).
-                Defaults to 30 * 60 seconds.
-            enable_global_gc (bool):
-                Enable the global gc or not. Defaults to False.
-            global_gc_interval (int):
-                The interval of the global gc (seconds).
-                Defaults to 3 * 60 * 60 seconds.
-            global_ttl (int):
-                The time to live of the global gc files (seconds).
-                Defaults to 3 * 60 * 60 seconds.
-        """
-        self.chunk_size = chunk_size
-        self.split_number = split_number
-        self.root = root
-        self.filesystem_type = filesystem_type
-        self.client_gc_interval = client_gc_interval
-        self.ttl = ttl
-        self.enable_global_gc = enable_global_gc
-        self.global_gc_interval = global_gc_interval
-        self.global_ttl = global_ttl
diff --git a/python/vineyard/llm/kv_state_cache.cc b/python/vineyard/llm/kv_state_cache.cc
deleted file mode 100644
index 88e333fc9d..0000000000
--- a/python/vineyard/llm/kv_state_cache.cc
+++ /dev/null
@@ -1,181 +0,0 @@
-/** Copyright 2020-2023 Alibaba Group Holding Limited.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-#include <memory>
-
-#include "pybind11/pybind11.h"
-#include "pybind11/stl.h"
-
-#include "client/client.h"
-
-#include "llm-cache/ds/config.h"
-#include "llm-cache/ds/kv_state_cache_block.h"
-#include "llm-cache/ds/kv_state_cache_manager.h"
-
-namespace py = pybind11;
-
-namespace vineyard {
-
-PYBIND11_MODULE(llm_C, m) {
-  m.doc() = "vineyard llm kv cache manager module";
-
-  pybind11::enum_<FilesystemType>(m, "FilesystemType")
-      .value("LOCAL", FilesystemType::LOCAL)
-      .export_values();
-
-  py::class_<LLMKV, std::shared_ptr<LLMKV>>(m, "KVTensor",
-                                            py::buffer_protocol())
-      .def(py::init([](uintptr_t data, size_t length) {
-             return LLMKV{reinterpret_cast<void*>(data), length};
-           }),
-           py::arg("data"), py::arg("length"))
-      .def_property(
-          "data",
-          [](LLMKV& self) -> uintptr_t {  // getter
-            return reinterpret_cast<uintptr_t>(self.data);
-          },
-          [](LLMKV& self, uintptr_t new_ptr) {  // setter
-            self.data = reinterpret_cast<void*>(new_ptr);
-          })
-      .def_property(
-          "length",
-          [](LLMKV& self) -> size_t {  // getter
-            return self.length;
-          },
-          [](LLMKV& self, size_t new_length) {  // setter
-            self.length = new_length;
-          })
-      .def_buffer([](LLMKV& self) -> py::buffer_info {
-        return py::buffer_info(self.data, sizeof(char),
-                               py::format_descriptor<char>::value, 1,
-                               {self.length}, {sizeof(char)});
-      });
-
-  py::class_<KVStateCacheManager, std::shared_ptr<KVStateCacheManager>>(
-      m, "KVStateCacheManager")
-      .def(
-          "update",
-          [](KVStateCacheManager* self, const std::vector<int>& tokenList,
-             int& next_token,
-             const std::vector<std::pair<LLMKV, LLMKV>>& kv_state) {
-            VINEYARD_CHECK_OK(self->Update(tokenList, next_token, kv_state));
-          },
-          py::arg("tokens"), py::arg("next_token"), py::arg("kv_state"))
-      .def(
-          "update",
-          [](KVStateCacheManager* self, const std::vector<int>& tokens,
-             const std::vector<std::vector<std::pair<LLMKV, LLMKV>>>&
-                 kv_states) -> size_t {
-            size_t updated = 0;
-            VINEYARD_CHECK_OK(self->Update(tokens, kv_states, updated));
-            return updated;
-          },
-          py::arg("tokens"), py::arg("kv_states"))
-      .def(
-          "update",
-          [](KVStateCacheManager* self, const std::vector<int>& prefix,
-             std::vector<int>& tokens,
-             const std::vector<std::vector<std::pair<LLMKV, LLMKV>>>&
-                 kv_states) -> size_t {
-            size_t updated = 0;
-            VINEYARD_CHECK_OK(self->Update(prefix, tokens, kv_states, updated));
-            return updated;
-          },
-          py::arg("prefix"), py::arg("tokens"), py::arg("kv_states"))
-      .def(
-          "query",
-          [](KVStateCacheManager* self, const std::vector<int>& tokens,
-             int& next_token, py::list& kv_state) {
-            std::vector<std::pair<LLMKV, LLMKV>> kv_state_vec =
-                kv_state.cast<std::vector<std::pair<LLMKV, LLMKV>>>();
-            VINEYARD_CHECK_OK(self->Query(tokens, next_token, kv_state_vec));
-            for (size_t i = 0; i < kv_state_vec.size(); ++i) {
-              kv_state[i] = py::cast(kv_state_vec[i]);
-            }
-          },
-          py::arg("tokens"), py::arg("next_token"), py::arg("kv_states"))
-      .def(
-          "query",
-          [](KVStateCacheManager* self, const std::vector<int>& tokens,
-             py::list& kv_state_list) -> size_t {
-            std::vector<std::vector<std::pair<LLMKV, LLMKV>>> kv_state_vec =
-                kv_state_list
-                    .cast<std::vector<std::vector<std::pair<LLMKV, LLMKV>>>>();
-            size_t matched = 0;
-            VINEYARD_CHECK_OK(self->Query(tokens, kv_state_vec, matched));
-            for (size_t i = 0; i < kv_state_vec.size() && i < matched; ++i) {
-              for (size_t j = 0; j < kv_state_vec[i].size(); ++j) {
-                kv_state_list[i].cast<py::list>()[j] =
-                    py::cast(kv_state_vec[i][j]);
-              }
-            }
-            return matched;
-          },
-          py::arg("tokens"), py::arg("kv_states"))
-      .def("close", [](KVStateCacheManager* self) { self->Close(); });
-
-  m.def(
-       "_generate",
-       [](py::object ipc_client, int tensor_bytes, int cache_capacity,
-          int layer, int block_size, int sync_interval,
-          std::string llm_cache_sync_lock, std::string llm_cache_object_name,
-          std::string llm_ref_cnt_object_name)
-           -> std::shared_ptr<KVStateCacheManager> {
-         std::shared_ptr<KVStateCacheManager> manager;
-         VineyardCacheConfig config(tensor_bytes, cache_capacity, layer,
-                                    block_size, sync_interval,
-                                    llm_cache_sync_lock, llm_cache_object_name,
-                                    llm_ref_cnt_object_name);
-         Client& client = ipc_client.cast<Client&>();
-         vineyard::Status status =
-             vineyard::KVStateCacheManager::Make(client, manager, config);
-         if (!status.ok()) {
-           throw std::runtime_error(status.ToString());
-         }
-         return manager;
-       },
-       py::arg("ipc_client"), py::arg("tensor_bytes") = 10,
-       py::arg("cache_capacity") = 10, py::arg("layer") = 1,
-       py::arg("block_size") = 5, py::arg("sync_interval") = 3,
-       py::arg("llm_cache_sync_lock") = "llmCacheSyncLock",
-       py::arg("llm_cache_object_name") = "llm_cache_object",
-       py::arg("llm_ref_cnt_object_name") = "llm_refcnt_object")
-      .def(
-          "_generate",
-          [](int tensor_bytes, int cache_capacity, int layer, int chunk_size,
-             int split_number, std::string root, FilesystemType filesystemType,
-             int client_gc_interval, int ttl, bool enable_global_gc,
-             int global_gc_interval,
-             int global_ttl) -> std::shared_ptr<KVStateCacheManager> {
-            std::shared_ptr<KVStateCacheManager> manager;
-            FileCacheConfig config(
-                tensor_bytes, cache_capacity, layer, chunk_size, split_number,
-                root, filesystemType, client_gc_interval, ttl, enable_global_gc,
-                global_gc_interval, global_ttl);
-            VINEYARD_CHECK_OK(
-                vineyard::KVStateCacheManager::Make(manager, config));
-            return manager;
-          },
-          py::arg("tensor_bytes") = 10, py::arg("cache_capacity") = 10,
-          py::arg("layer") = 1, py::arg("chunk_size") = 5,
-          py::arg("split_number") = 3, py::arg("root") = "root",
-          py::arg("filesystem_type") = FilesystemType::LOCAL,
-          py::arg("client_gc_interval") = 30 * 60, py::arg("ttl") = 30 * 60,
-          py::arg("enable_global_gc") = false,
-          py::arg("global_gc_interval") = 30 * 60,
-          py::arg("global_ttl") = 30 * 60);
-}
-
-}  // namespace vineyard
diff --git a/python/vineyard/llm/tests/test_llm.py b/python/vineyard/llm/tests/test_llm.py
index 014fef6e3e..57549e4c7f 100644
--- a/python/vineyard/llm/tests/test_llm.py
+++ b/python/vineyard/llm/tests/test_llm.py
@@ -35,8 +35,8 @@ def test_kv_cache_update_and_query_on_blob(vineyard_ipc_sockets):
     )
     cache = KVCache(
         cache_config=vineyard_cache_config,
-        tensor_bytes=16,  # should be the same as the nbytes of the tensor
-        cache_capacity=10,
+        tensor_nbytes=16,  # should be the same as the nbytes of the tensor
+        cache_capacity=1024,
         layer=2,
     )
 
@@ -100,13 +100,13 @@ def test_kv_cache_update_and_query_on_blob(vineyard_ipc_sockets):
 def test_kv_cache_update_and_query_on_fs():
     file_cache_config = FileCacheConfig(
         chunk_size=2,
-        split_number=2,
+        hash_chunk_size=2,
         root="/tmp/vineyard/llm_cache",
     )
     cache = KVCache(
         cache_config=file_cache_config,
-        tensor_bytes=16,  # should be the same as the nbytes of the tensor
-        cache_capacity=10,
+        tensor_nbytes=16,  # should be the same as the nbytes of the tensor
+        cache_capacity=1024,
         layer=2,
     )
 
diff --git a/setup_llm.py b/setup_llm.py
index 829730cdd0..a27e8f3e90 100644
--- a/setup_llm.py
+++ b/setup_llm.py
@@ -101,7 +101,7 @@ def find_llm_packages(root):
     package_dir={'vineyard.llm': 'python/vineyard/llm'},
     packages=find_llm_packages('python'),
     ext_modules=[
-        CopyCMakeExtension('vineyard.llm.llm_C'),
+        CopyCMakeExtension('vineyard.llm._llm_C'),
     ],
     cmdclass={
         'build_ext': build_ext_with_precompiled,
diff --git a/src/common/util/functions.h b/src/common/util/functions.h
index 1d245038af..eef3d6a610 100644
--- a/src/common/util/functions.h
+++ b/src/common/util/functions.h
@@ -23,8 +23,6 @@ limitations under the License.
 #include <regex>
 #include <string>
 
-#include "boost/algorithm/string/replace.hpp"
-
 #include "common/util/env.h"
 
 namespace vineyard {
@@ -37,7 +35,11 @@ inline std::string ExpandEnvironmentVariables(const std::string& text) {
     std::smatch match;
     while (std::regex_search(text_copy, match, env)) {
       std::string var = read_env(match.str(1).c_str());
-      boost::replace_first(text_copy, match[0].str(), var);
+      const std::string& matched = match[0].str();
+      size_t pos = text_copy.find(matched);
+      if (pos != std::string::npos) {
+        text_copy.replace(pos, matched.size(), var, 0, std::string::npos);
+      }
     }
     return text_copy;
   } catch (std::exception& e) {
diff --git a/test/runner.py b/test/runner.py
index 9250e55ab8..70615e1675 100755
--- a/test/runner.py
+++ b/test/runner.py
@@ -474,9 +474,9 @@ def run_vineyard_cpp_tests(meta, allocator, endpoints, tests):
     run_test(tests, 'tensor_test')
     run_test(tests, 'typename_test')
     run_test(tests, 'version_test')
-    run_test(tests, 'kv_state_cache_radix_tree_test')
-    run_test(tests, 'kv_state_cache_hash_test')
-    run_test(tests, 'kv_state_cache_local_file_test')
+    run_test(tests, 'kv_cache_radix_tree_test')
+    run_test(tests, 'kv_cache_hash_test')
+    run_test(tests, 'kv_cache_local_file_test')
     run_test(tests, 'local_file_storage_gc_test')
 
 
@@ -711,7 +711,7 @@ def run_llm_tests(meta, allocator, endpoints):
     subprocess.check_call(
         [
-            './build/bin/kv_state_cache_test',
+            './build/bin/kv_cache_test',
             '--client-num',
            '2',
            '--vineyard-ipc-sockets',