diff --git a/python/vineyard/llm/__init__.py b/python/vineyard/llm/__init__.py
index 0907d0a93e..4b842b64a3 100644
--- a/python/vineyard/llm/__init__.py
+++ b/python/vineyard/llm/__init__.py
@@ -18,14 +18,15 @@
 
 from typing import List
 from typing import Tuple
+from typing import Union
 
 import numpy as np
 
 import torch
 from torch import dtype
 
-import vineyard
-
+from .config import FileCacheConfig
+from .config import VineyardCacheConfig
 from .llm_C import KVTensor
 from .llm_C import _generate
 
@@ -35,24 +36,19 @@
 class KV_Cache:  # pylint: disable=too-many-instance-attributes
     def __init__(
         self,
-        socket: str,
+        cache_config: Union[VineyardCacheConfig, FileCacheConfig],
         tensor_bytes: int = 10,
         cache_capacity: int = 10,
         layer: int = 1,
         torch_size: torch.Size = None,
         dtype: dtype = None,
-        block_size: int = 5,
-        sync_interval: int = 3,
-        llm_cache_sync_lock: str = "llmCacheSyncLock",
-        llm_cache_object_name: str = "llm_cache_object",
-        llm_ref_cnt_object_name: str = "llm_refcnt_object",
         **kwargs
     ):
-        """Create a llm kv cache manager based on vineyard blob.
+        """Create a llm kv cache manager based on vineyard blob or filesystem.
 
         Args:
-            socket (str):
-                The vineyard socket path.
+            cache_config (Union[VineyardCacheConfig, FileCacheConfig]):
+                The config of the kv cache, including vineyard cache and file cache.
             tensor_bytes (int, optional):
                 The size of the kv cache tensor.
                 Defaults to 10.
@@ -67,19 +63,11 @@ def __init__(
             dtype (dtype, optional):
                 The dtype of the tensor. Defaults to None.
                 e.g., torch.float32, torch.float64.
-            block_size (int, optional):
-                The block size of the kv cache. Defaults to 5.
-            sync_interval (int, optional):
-                The sync interval of the kv cache. Defaults to 3.
-            llm_cache_sync_lock (str, optional):
-                The name of the kv cache sync lock. Defaults to "llmCacheSyncLock".
-            llm_cache_object_name (str, optional):
-                The name of the kv cache object. Defaults to "llm_cache_object".
-            llm_ref_cnt_object_name (str, optional):
-                The name of the kv cache ref cnt object.
-                Defaults to "llm_refcnt_object".
         """
-        self.client = vineyard.connect(socket)
+        if not isinstance(cache_config, (VineyardCacheConfig, FileCacheConfig)):
+            raise ValueError(
+                "The cache_config should be VineyardCacheConfig or FileCacheConfig."
+            )
         self.tensor_bytes = tensor_bytes
         self.cache_capacity = cache_capacity
         self.layer = layer
@@ -88,21 +76,12 @@ def __init__(
         self.tensor_dtype = dtype
         # the dtype of the numpy array of the tensor
         self.numpy_dtype = None
-        self.block_size = block_size
-        self.sync_interval = sync_interval
-        self.llm_cache_sync_lock = llm_cache_sync_lock
-        self.llm_cache_object_name = llm_cache_object_name
-        self.llm_ref_cnt_object_name = llm_ref_cnt_object_name
+
         self.kv_cache_manager = _generate(
-            ipc_client=self.client.ipc_client,
             tensor_bytes=tensor_bytes,
             cache_capacity=cache_capacity,
             layer=layer,
-            block_size=block_size,
-            sync_interval=sync_interval,
-            llm_cache_sync_lock=llm_cache_sync_lock,
-            llm_cache_object_name=llm_cache_object_name,
-            llm_ref_cnt_object_name=llm_ref_cnt_object_name,
+            **vars(cache_config),
             **kwargs
         )
 
diff --git a/python/vineyard/llm/config.py b/python/vineyard/llm/config.py
new file mode 100644
index 0000000000..f8b4cbfc4b
--- /dev/null
+++ b/python/vineyard/llm/config.py
@@ -0,0 +1,89 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright 2020-2023 Alibaba Group Holding Limited.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import vineyard
+
+from .llm_C import FilesystemType
+
+
+class VineyardCacheConfig:
+    """VineyardCacheConfig is a class to configure the llm kv cache in vineyard."""
+
+    def __init__(
+        self,
+        socket: str,
+        block_size: int = 5,
+        sync_interval: int = 3,
+        llm_cache_sync_lock: str = "llmCacheSyncLock",
+        llm_cache_object_name: str = "llm_cache_object",
+        llm_ref_cnt_object_name: str = "llm_refcnt_object",
+    ):
+        """Create a vineyard cache config.
+
+        Args:
+            socket (str):
+                The ipc socket of the vineyardd instance.
+            block_size (int, optional):
+                The block size of the kv cache. Defaults to 5.
+            sync_interval (int, optional):
+                The sync interval of the kv cache. Defaults to 3.
+            llm_cache_sync_lock (str, optional):
+                The name of the kv cache sync lock. Defaults to "llmCacheSyncLock".
+            llm_cache_object_name (str, optional):
+                The name of the kv cache object. Defaults to "llm_cache_object".
+            llm_ref_cnt_object_name (str, optional):
+                The name of the kv cache ref cnt object.
+                Defaults to "llm_refcnt_object".
+        """
+        self.ipc_client = vineyard.connect(socket).ipc_client
+        self.block_size = block_size
+        self.sync_interval = sync_interval
+        self.llm_cache_sync_lock = llm_cache_sync_lock
+        self.llm_cache_object_name = llm_cache_object_name
+        self.llm_ref_cnt_object_name = llm_ref_cnt_object_name
+
+
+class FileCacheConfig:
+    """FileCacheConfig is a class to configure the llm kv cache on filesystem."""
+
+    def __init__(
+        self,
+        batch_size: int = 16,
+        split_number: int = 2,
+        root: str = "/tmp/vineyard/llm_cache",
+        filesystem_type: FilesystemType = FilesystemType.LOCAL,
+    ):
+        """Create a file cache config.
+
+        Args:
+            batch_size (int, optional):
+                Divide the token list into batches, each batch
+                contains batch_size tokens. Defaults to 16.
+            split_number (int, optional):
+                Split the hash value into the file with multiple directories.
+                e.g., split_number=2, hash value=123456, the file path is 12/34/56.
+            root (str, optional):
+                The root directory of the kv state files.
+                Defaults to "/tmp/vineyard/llm_cache".
+            filesystem_type (FilesystemType, optional):
+                The type of the filesystem. Defaults to FilesystemType.LOCAL.
+        """
+        self.batch_size = batch_size
+        self.split_number = split_number
+        self.root = root
+        self.filesystem_type = filesystem_type
diff --git a/python/vineyard/llm/kv_state_cache.cc b/python/vineyard/llm/kv_state_cache.cc
index d4e3a860e0..8d0dec7904 100644
--- a/python/vineyard/llm/kv_state_cache.cc
+++ b/python/vineyard/llm/kv_state_cache.cc
@@ -34,6 +34,10 @@ LLMKV create_llmkv_from_buffer(py::buffer buffer, size_t size) {
 PYBIND11_MODULE(llm_C, m) {
   m.doc() = "vineyard llm kv cache manager module";
 
+  pybind11::enum_<FilesystemType>(m, "FilesystemType")
+      .value("LOCAL", FilesystemType::LOCAL)
+      .export_values();
+
   py::class_(m, "KVTensor")
       .def(py::init(&create_llmkv_from_buffer), py::arg("buffer"),
            py::arg("size"))
@@ -81,30 +85,51 @@ PYBIND11_MODULE(llm_C, m) {
       .def("close", [](KVStateCacheManager* self) { self->Close(); });
 
   m.def(
-      "_generate",
-      [](py::object ipc_client, int tensor_bytes, int cache_capacity, int layer,
-         int block_size, int sync_interval, std::string llm_cache_sync_lock,
-         std::string llm_cache_object_name,
-         std::string llm_ref_cnt_object_name) -> py::object {
-        std::shared_ptr<KVStateCacheManager> manager;
-        VineyardCacheConfig config(tensor_bytes, cache_capacity, layer,
-                                   block_size, sync_interval,
-                                   llm_cache_sync_lock, llm_cache_object_name,
-                                   llm_ref_cnt_object_name);
-        Client& client = ipc_client.cast<Client&>();
-        vineyard::Status status =
-            vineyard::KVStateCacheManager::Make(client, manager, config);
-        if (!status.ok()) {
-          throw std::runtime_error(status.ToString());
-        }
-        return py::cast(manager);
-      },
-      py::arg("ipc_client"), py::arg("tensor_bytes") = 10,
-      py::arg("cache_capacity") = 10, py::arg("layer") = 1,
-      py::arg("block_size") = 5, py::arg("sync_interval") = 3,
-      py::arg("llm_cache_sync_lock") = "llmCacheSyncLock",
-      py::arg("llm_cache_object_name") = "llm_cache_object",
-      py::arg("llm_ref_cnt_object_name") = "llm_refcnt_object");
+       "_generate",
+       [](py::object ipc_client, int tensor_bytes, int cache_capacity,
+          int layer, int block_size, int sync_interval,
+          std::string llm_cache_sync_lock, std::string llm_cache_object_name,
+          std::string llm_ref_cnt_object_name) -> py::object {
+         std::shared_ptr<KVStateCacheManager> manager;
+         VineyardCacheConfig config(tensor_bytes, cache_capacity, layer,
+                                    block_size, sync_interval,
+                                    llm_cache_sync_lock, llm_cache_object_name,
+                                    llm_ref_cnt_object_name);
+         Client& client = ipc_client.cast<Client&>();
+         vineyard::Status status =
+             vineyard::KVStateCacheManager::Make(client, manager, config);
+         if (!status.ok()) {
+           throw std::runtime_error(status.ToString());
+         }
+         return py::cast(manager);
+       },
+       py::arg("ipc_client"), py::arg("tensor_bytes") = 10,
+       py::arg("cache_capacity") = 10, py::arg("layer") = 1,
+       py::arg("block_size") = 5, py::arg("sync_interval") = 3,
+       py::arg("llm_cache_sync_lock") = "llmCacheSyncLock",
+       py::arg("llm_cache_object_name") = "llm_cache_object",
+       py::arg("llm_ref_cnt_object_name") = "llm_refcnt_object")
+      .def(
+          "_generate",
+          [](int tensor_bytes, int cache_capacity, int layer, int batch_size,
+             int split_number, std::string root,
+             FilesystemType filesystemType) -> py::object {
+            std::shared_ptr<KVStateCacheManager> manager;
+            FileCacheConfig config(tensor_bytes, cache_capacity, layer,
+                                   batch_size, split_number, root,
+                                   filesystemType);
+            vineyard::Status status =
+                vineyard::KVStateCacheManager::Make(manager, config);
+            if (!status.ok()) {
+              throw std::runtime_error(status.ToString());
+            }
+            return py::cast(manager);
+          },
+          py::arg("tensor_bytes") = 10, py::arg("cache_capacity") = 10,
+          py::arg("layer") = 1, py::arg("batch_size") = 16,
+          py::arg("split_number") = 2,
+          py::arg("root") = "/tmp/vineyard/llm_cache",
+          py::arg("filesystem_type") = FilesystemType::LOCAL);
 }
 
 }  // namespace vineyard
diff --git a/python/vineyard/llm/tests/test_llm.py b/python/vineyard/llm/tests/test_llm.py
index 0d536975a0..019e0008f6 100644
--- a/python/vineyard/llm/tests/test_llm.py
+++ b/python/vineyard/llm/tests/test_llm.py
@@ -19,18 +19,26 @@
 import torch
 
 from vineyard.llm import KV_Cache
+from vineyard.llm.config import FileCacheConfig
+from vineyard.llm.config import VineyardCacheConfig
 
 
-def test_kv_cache_update_and_query(vineyard_ipc_sockets):
-    cache = KV_Cache(
+def test_kv_cache_update_and_query_on_blob(vineyard_ipc_sockets):
+    vineyard_cache_config = VineyardCacheConfig(
         socket=vineyard_ipc_sockets[0],
+        block_size=5,
+        sync_interval=3,
+        llm_cache_sync_lock="llmCacheSyncLock",
+        llm_cache_object_name="llm_cache_object",
+        llm_ref_cnt_object_name="llm_refcnt_object",
+    )
+    cache = KV_Cache(
+        cache_config=vineyard_cache_config,
         tensor_bytes=16,  # should be the same as the nbytes of the tensor
         cache_capacity=10,
         layer=1,
         torch_size=torch.Size([2, 2]),
         dtype=torch.float32,
-        block_size=5,
-        sync_interval=3,
     )
 
     kv_cache_list = [
@@ -52,3 +60,43 @@ def test_kv_cache_update_and_query(vineyard_ipc_sockets):
         assert torch.equal(k_tensor, queried_k_tensor) and torch.equal(
             v_tensor, queried_v_tensor
         )
+
+
+def test_kv_cache_update_and_query_on_fs():
+    file_cache_config = FileCacheConfig(
+        batch_size=2,
+        split_number=2,
+        root="/tmp/vineyard/llm_cache",
+    )
+    cache = KV_Cache(
+        cache_config=file_cache_config,
+        tensor_bytes=10000,  # should be the same as the nbytes of the tensor
+        cache_capacity=10,
+        layer=2,
+        torch_size=torch.Size([50, 50]),
+        dtype=torch.float32,
+    )
+
+    kv_cache_list = [
+        (torch.rand(50, 50), torch.rand(50, 50)),
+        (torch.rand(50, 50), torch.rand(50, 50)),
+        (torch.rand(50, 50), torch.rand(50, 50)),
+        (torch.rand(50, 50), torch.rand(50, 50)),
+        (torch.rand(50, 50), torch.rand(50, 50)),
+        (torch.rand(50, 50), torch.rand(50, 50)),
+        (torch.rand(50, 50), torch.rand(50, 50)),
+        (torch.rand(50, 50), torch.rand(50, 50)),
+    ]
+
+    tokens = [1, 2, 3, 4]
+    # insert the token list and the related kv cache list
+    cache.update(tokens, kv_cache_list)
+
+    queried_kv_cache_list = cache.query(tokens)
+
+    for (k_tensor, v_tensor), (queried_k_tensor, queried_v_tensor) in zip(
+        kv_cache_list, queried_kv_cache_list
+    ):
+        assert torch.equal(k_tensor, queried_k_tensor) and torch.equal(
+            v_tensor, queried_v_tensor
+        )