NVIDIA · kaiyux · Feb 21, 2024 · Feb 21, 2024 · Feb 21, 2024
diff --git a/benchmarks/cpp/README.md b/benchmarks/cpp/README.md
@@ -154,3 +154,5 @@ Take GPT-350M as an example for single GPU with static batching
     --static_emulated_timeout 100 \
     --dataset ../../benchmarks/cpp/preprocessed_dataset.json
 ```
+
+`gptManagerBenchmark` can also be used with the high-level C++ API defined by the `executor::Executor` class (see `cpp/include/tensorrt_llm/executor/executor.h`) by passing the argument `--api executor`. Note that the `Executor` class is still under development and currently does not support models with tensor parallelism (tp) or pipeline parallelism (pp) greater than 1.
diff --git a/benchmarks/cpp/gptManagerBenchmark.cpp b/benchmarks/cpp/gptManagerBenchmark.cpp
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -60,6 +60,20 @@ else()
   message(STATUS "Importing batch manager")
 endif()
 
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/tensorrt_llm/executor/CMakeLists.txt") # are the executor sources present in this tree?
+  set(BUILD_EXECUTOR_DEFAULT ON) # yes: default to building the executor from source
+else()
+  set(BUILD_EXECUTOR_DEFAULT OFF) # no: default to not building it from source
+endif()
+
+option(BUILD_EXECUTOR "Build executor from source" ${BUILD_EXECUTOR_DEFAULT}) # user-overridable switch; default computed above
+
+if(BUILD_EXECUTOR) # this block only logs the choice; the build/import steps themselves are not in this hunk
+  message(STATUS "Building executor")
+else()
+  message(STATUS "Importing executor")
+endif()
+
 if(BUILD_PYT)
   message(STATUS "Building PyTorch")
 else()

diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheConfig.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheConfig.h
@@ -17,6 +17,7 @@
 
 #pragma once
 
+#include "tensorrt_llm/executor/executor.h"
 #include "tensorrt_llm/runtime/common.h"
 
 #include <optional>
@@ -42,6 +43,13 @@ class KvCacheConfig
     {
     }
 
+    explicit KvCacheConfig(executor::KvCacheConfig const& kvCacheConfig) // build from the executor-side config; explicit, so never implicit
+        : KvCacheConfig(kvCacheConfig.getMaxTokens(), kvCacheConfig.getMaxAttentionWindow(), // delegates to another ctor of this class
+            kvCacheConfig.getSinkTokenLength(), kvCacheConfig.getFreeGpuMemoryFraction(),
+            kvCacheConfig.getEnableBlockReuse(), kvCacheConfig.getUseUvm()) // NOTE(review): confirm order matches the delegated ctor
+    {
+    } // body intentionally empty: all initialization is done by the delegated constructor
+
     std::optional<SizeType> maxTokens;
     std::optional<SizeType> maxAttentionWindow;
     std::optional<SizeType> sinkTokenLength;