Skip to content

Commit

Permalink
Add the file storage to the python api of llm kv cache. (#1848)
Browse files Browse the repository at this point in the history
Fix #1845

Signed-off-by: Ye Cao <[email protected]>
  • Loading branch information
dashanji authored Mar 26, 2024
1 parent 6359ec9 commit 4cfbaa7
Show file tree
Hide file tree
Showing 4 changed files with 203 additions and 61 deletions.
47 changes: 14 additions & 33 deletions python/vineyard/llm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,15 @@

from typing import List
from typing import Tuple
from typing import Union

import numpy as np

import torch
from torch import dtype

import vineyard

from .config import FileCacheConfig
from .config import VineyardCacheConfig
from .llm_C import KVTensor
from .llm_C import _generate

Expand All @@ -35,24 +36,19 @@ class KV_Cache: # pylint: disable=too-many-instance-attributes

def __init__(
self,
socket: str,
cache_config: Union[VineyardCacheConfig, FileCacheConfig],
tensor_bytes: int = 10,
cache_capacity: int = 10,
layer: int = 1,
torch_size: torch.Size = None,
dtype: dtype = None,
block_size: int = 5,
sync_interval: int = 3,
llm_cache_sync_lock: str = "llmCacheSyncLock",
llm_cache_object_name: str = "llm_cache_object",
llm_ref_cnt_object_name: str = "llm_refcnt_object",
**kwargs
):
"""Create a llm kv cache manager based on vineyard blob.
Args:
socket (str):
The vineyard socket path.
cache_config (Union[VineyardCacheConfig, FileCacheConfig]):
The config of the kv cache, including vineyard cache and file cache.
tensor_bytes (int, optional):
The size of the kv cache tensor.
Defaults to 10.
Expand All @@ -67,19 +63,13 @@ def __init__(
dtype (dtype, optional):
The dtype of the tensor. Defaults to None.
e.g., torch.float32, torch.float64.
block_size (int, optional):
The block size of the kv cache. Defaults to 5.
sync_interval (int, optional):
The sync interval of the kv cache. Defaults to 3.
llm_cache_sync_lock (str, optional):
The name of the kv cache sync lock. Defaults to "llmCacheSyncLock".
llm_cache_object_name (str, optional):
The name of the kv cache object. Defaults to "llm_cache_object".
llm_ref_cnt_object_name (str, optional):
The name of the kv cache ref cnt object.
Defaults to "llm_refcnt_object".
"""
self.client = vineyard.connect(socket)
if not isinstance(cache_config, VineyardCacheConfig) and not isinstance(
cache_config, FileCacheConfig
):
raise ValueError(
"The cache_config should be VineyardCacheConfig or FileCacheConfig."
)
self.tensor_bytes = tensor_bytes
self.cache_capacity = cache_capacity
self.layer = layer
Expand All @@ -88,21 +78,12 @@ def __init__(
self.tensor_dtype = dtype
# the dtype of the numpy array of the tensor
self.numpy_dtype = None
self.block_size = block_size
self.sync_interval = sync_interval
self.llm_cache_sync_lock = llm_cache_sync_lock
self.llm_cache_object_name = llm_cache_object_name
self.llm_ref_cnt_object_name = llm_ref_cnt_object_name

self.kv_cache_manager = _generate(
ipc_client=self.client.ipc_client,
tensor_bytes=tensor_bytes,
cache_capacity=cache_capacity,
layer=layer,
block_size=block_size,
sync_interval=sync_interval,
llm_cache_sync_lock=llm_cache_sync_lock,
llm_cache_object_name=llm_cache_object_name,
llm_ref_cnt_object_name=llm_ref_cnt_object_name,
**cache_config.__dict__,
**kwargs
)

Expand Down
89 changes: 89 additions & 0 deletions python/vineyard/llm/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2020-2023 Alibaba Group Holding Limited.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import vineyard

from .llm_C import FilesystemType


class VineyardCacheConfig:
    """Settings for an llm kv cache that is stored in vineyard."""

    def __init__(
        self,
        socket: str,
        block_size: int = 5,
        sync_interval: int = 3,
        llm_cache_sync_lock: str = "llmCacheSyncLock",
        llm_cache_object_name: str = "llm_cache_object",
        llm_ref_cnt_object_name: str = "llm_refcnt_object",
    ):
        """Connect to vineyardd and record the kv cache settings.

        Args:
            socket (str):
                Path of the ipc socket exposed by the vineyardd instance.
            block_size (int, optional):
                Block size of the kv cache. Defaults to 5.
            sync_interval (int, optional):
                Sync interval of the kv cache. Defaults to 3.
            llm_cache_sync_lock (str, optional):
                Name of the lock used when syncing the kv cache.
                Defaults to "llmCacheSyncLock".
            llm_cache_object_name (str, optional):
                Name of the kv cache object.
                Defaults to "llm_cache_object".
            llm_ref_cnt_object_name (str, optional):
                Name of the kv cache reference-count object.
                Defaults to "llm_refcnt_object".
        """
        # NOTE: KV_Cache expands this object's ``__dict__`` into keyword
        # arguments of ``_generate``, so the attributes set below must stay
        # in sync with the argument names of that binding. Do not add,
        # rename or drop an attribute here without updating the binding.
        connection = vineyard.connect(socket)
        self.ipc_client = connection.ipc_client

        # Names of the shared objects that back the cache.
        self.llm_cache_sync_lock = llm_cache_sync_lock
        self.llm_cache_object_name = llm_cache_object_name
        self.llm_ref_cnt_object_name = llm_ref_cnt_object_name

        # Tuning knobs.
        self.block_size = block_size
        self.sync_interval = sync_interval


class FileCacheConfig:
    """Settings for an llm kv cache that is stored on a filesystem."""

    def __init__(
        self,
        batch_size: int = 16,
        split_number: int = 2,
        root: str = "/tmp/vineyard/llm_cache",
        filesystem_type: FilesystemType = FilesystemType.LOCAL,
    ):
        """Record the settings of a file-backed kv cache.

        Args:
            batch_size (int, optional):
                The token list is divided into batches of ``batch_size``
                tokens each. Defaults to 16.
            split_number (int, optional):
                Controls how the hash value is split into nested
                directories, e.g. with ``split_number=2`` and hash value
                123456 the file path is 12/34/56. Defaults to 2.
            root (str, optional):
                Root directory of the kv state files.
                Defaults to "/tmp/vineyard/llm_cache".
            filesystem_type (FilesystemType, optional):
                Kind of filesystem that holds the files.
                Defaults to ``FilesystemType.LOCAL``.
        """
        # NOTE: KV_Cache expands this object's ``__dict__`` into keyword
        # arguments of ``_generate``, so the attributes set below must stay
        # in sync with the argument names of that binding.

        # Where the kv state files live.
        self.root = root
        self.filesystem_type = filesystem_type

        # How tokens are grouped and how file paths are derived.
        self.batch_size = batch_size
        self.split_number = split_number
72 changes: 48 additions & 24 deletions python/vineyard/llm/kv_state_cache.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ LLMKV create_llmkv_from_buffer(py::buffer buffer, size_t size) {
PYBIND11_MODULE(llm_C, m) {
m.doc() = "vineyard llm kv cache manager module";

pybind11::enum_<FilesystemType>(m, "FilesystemType")
.value("LOCAL", FilesystemType::LOCAL)
.export_values();

py::class_<LLMKV>(m, "KVTensor")
.def(py::init(&create_llmkv_from_buffer), py::arg("buffer"),
py::arg("size"))
Expand Down Expand Up @@ -81,30 +85,50 @@ PYBIND11_MODULE(llm_C, m) {
.def("close", [](KVStateCacheManager* self) { self->Close(); });

m.def(
"_generate",
[](py::object ipc_client, int tensor_bytes, int cache_capacity, int layer,
int block_size, int sync_interval, std::string llm_cache_sync_lock,
std::string llm_cache_object_name,
std::string llm_ref_cnt_object_name) -> py::object {
std::shared_ptr<KVStateCacheManager> manager;
VineyardCacheConfig config(tensor_bytes, cache_capacity, layer,
block_size, sync_interval,
llm_cache_sync_lock, llm_cache_object_name,
llm_ref_cnt_object_name);
Client& client = ipc_client.cast<Client&>();
vineyard::Status status =
vineyard::KVStateCacheManager::Make(client, manager, config);
if (!status.ok()) {
throw std::runtime_error(status.ToString());
}
return py::cast(manager);
},
py::arg("ipc_client"), py::arg("tensor_bytes") = 10,
py::arg("cache_capacity") = 10, py::arg("layer") = 1,
py::arg("block_size") = 5, py::arg("sync_interval") = 3,
py::arg("llm_cache_sync_lock") = "llmCacheSyncLock",
py::arg("llm_cache_object_name") = "llm_cache_object",
py::arg("llm_ref_cnt_object_name") = "llm_refcnt_object");
"_generate",
[](py::object ipc_client, int tensor_bytes, int cache_capacity,
int layer, int block_size, int sync_interval,
std::string llm_cache_sync_lock, std::string llm_cache_object_name,
std::string llm_ref_cnt_object_name) -> py::object {
std::shared_ptr<KVStateCacheManager> manager;
VineyardCacheConfig config(tensor_bytes, cache_capacity, layer,
block_size, sync_interval,
llm_cache_sync_lock, llm_cache_object_name,
llm_ref_cnt_object_name);
Client& client = ipc_client.cast<Client&>();
vineyard::Status status =
vineyard::KVStateCacheManager::Make(client, manager, config);
if (!status.ok()) {
throw std::runtime_error(status.ToString());
}
return py::cast(manager);
},
py::arg("ipc_client"), py::arg("tensor_bytes") = 10,
py::arg("cache_capacity") = 10, py::arg("layer") = 1,
py::arg("block_size") = 5, py::arg("sync_interval") = 3,
py::arg("llm_cache_sync_lock") = "llmCacheSyncLock",
py::arg("llm_cache_object_name") = "llm_cache_object",
py::arg("llm_ref_cnt_object_name") = "llm_refcnt_object")
.def(
"_generate",
[](int tensor_bytes, int cache_capacity, int layer, int batch_size,
int split_number, std::string root,
FilesystemType filesystemType) -> py::object {
std::shared_ptr<KVStateCacheManager> manager;
FileCacheConfig config(tensor_bytes, cache_capacity, layer,
batch_size, split_number, root,
filesystemType);
vineyard::Status status =
vineyard::KVStateCacheManager::Make(manager, config);
if (!status.ok()) {
throw std::runtime_error(status.ToString());
}
return py::cast(manager);
},
py::arg("tensor_bytes") = 10, py::arg("cache_capacity") = 10,
py::arg("layer") = 1, py::arg("batch_size") = 5,
py::arg("split_number") = 3, py::arg("root") = "root",
py::arg("filesystem_type") = FilesystemType::LOCAL);
}

} // namespace vineyard
56 changes: 52 additions & 4 deletions python/vineyard/llm/tests/test_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,26 @@
import torch

from vineyard.llm import KV_Cache
from vineyard.llm.config import FileCacheConfig
from vineyard.llm.config import VineyardCacheConfig


def test_kv_cache_update_and_query(vineyard_ipc_sockets):
cache = KV_Cache(
def test_kv_cache_update_and_query_on_blob(vineyard_ipc_sockets):
vineyard_cache_config = VineyardCacheConfig(
socket=vineyard_ipc_sockets[0],
block_size=5,
sync_interval=3,
llm_cache_sync_lock="llmCacheSyncLock",
llm_cache_object_name="llm_cache_object",
llm_ref_cnt_object_name="llm_refcnt_object",
)
cache = KV_Cache(
cache_config=vineyard_cache_config,
tensor_bytes=16, # should be the same as the nbytes of the tensor
cache_capacity=10,
layer=1,
torch_size=torch.Size([2, 2]),
dtype=torch.float32,
block_size=5,
sync_interval=3,
)

kv_cache_list = [
Expand All @@ -52,3 +60,43 @@ def test_kv_cache_update_and_query(vineyard_ipc_sockets):
assert torch.equal(k_tensor, queried_k_tensor) and torch.equal(
v_tensor, queried_v_tensor
)


def test_kv_cache_update_and_query_on_fs():
    """Round-trip kv tensors through the file-backed cache.

    Stores a list of (k, v) tensor pairs for a token list, queries the same
    tokens back and checks that every returned pair equals the stored one.
    """
    fs_config = FileCacheConfig(
        batch_size=2,
        split_number=2,
        root="/tmp/vineyard/llm_cache",
    )
    cache = KV_Cache(
        cache_config=fs_config,
        # 50 * 50 elements * 4 bytes (float32): must equal the tensor nbytes
        tensor_bytes=10000,
        cache_capacity=10,
        layer=2,
        torch_size=torch.Size([50, 50]),
        dtype=torch.float32,
    )

    # eight (k, v) pairs of random 50x50 tensors
    kv_cache_list = [
        (torch.rand(50, 50), torch.rand(50, 50)) for _ in range(8)
    ]
    tokens = [1, 2, 3, 4]

    # insert the token list and the related kv cache list, then read it back
    cache.update(tokens, kv_cache_list)
    queried_kv_cache_list = cache.query(tokens)

    # NOTE(review): zip stops at the shorter list, so a short query result
    # would not be detected here.
    for expected_pair, queried_pair in zip(kv_cache_list, queried_kv_cache_list):
        expected_k, expected_v = expected_pair
        queried_k, queried_v = queried_pair
        assert torch.equal(expected_k, queried_k) and torch.equal(
            expected_v, queried_v
        )

0 comments on commit 4cfbaa7

Please sign in to comment.