From 8d24ce4e8d556f4f593d834fb6233be4729407de Mon Sep 17 00:00:00 2001 From: Viet Anh Nguyen Date: Sat, 29 Jul 2023 19:42:56 +0700 Subject: [PATCH 1/3] Add video capture support --- CMakeLists.txt | 14 ++++--- customchar/main.cpp | 46 ++++++++++++++++------- customchar/vision/video_capture.h | 62 +++++++++++++++++++++++++++++++ docs/architecture.svg | 2 +- 4 files changed, 104 insertions(+), 20 deletions(-) create mode 100644 customchar/vision/video_capture.h diff --git a/CMakeLists.txt b/CMakeLists.txt index cc93eda..f3f433c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,6 +18,9 @@ string(STRIP "${SDL2_LIBRARIES}" SDL2_LIBRARIES) message(STATUS "SDL2_INCLUDE_DIRS = ${SDL2_INCLUDE_DIRS}") message(STATUS "SDL2_LIBRARIES = ${SDL2_LIBRARIES}") +# OpenCV for perception module +find_package(OpenCV REQUIRED) + # Add whisper-cpp add_subdirectory(libs/whisper-cpp) @@ -42,7 +45,7 @@ target_include_directories( libs . ) -target_link_libraries(${TARGET} PUBLIC ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} whisper) +target_link_libraries(${TARGET} PUBLIC ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} ${OpenCV_LIBS} whisper) # CustomChar - cli add_executable( @@ -51,9 +54,9 @@ add_executable( ) target_link_libraries(customchar-cli customchar-core) - option(BUILD_GUI "Build GUI" ON) -if (BUILD_GUI) + +if(BUILD_GUI) find_package(OpenGL REQUIRED) find_package(GLEW REQUIRED) find_package(glfw3 REQUIRED) @@ -72,12 +75,12 @@ if (BUILD_GUI) add_library(imgui STATIC ${IMGUI_SRCS}) target_include_directories(imgui PUBLIC ${IMGUI_DIR} ${IMGUI_DIR}/backends) - if (UNIX AND NOT APPLE) + if(UNIX AND NOT APPLE) message(STATUS "Building for Linux") set(LINUX_GL_LIBS GL GLEW) target_link_libraries(${TARGET} PUBLIC ${LINUX_GL_LIBS} glfw) target_compile_definitions(${TARGET} PUBLIC LINUX) - elseif (APPLE) + elseif(APPLE) message(STATUS "Building for Mac OS X") target_link_libraries(${TARGET} PUBLIC "-framework OpenGL" "-framework Cocoa" "-framework IOKit" "-framework CoreVideo" glfw) 
target_compile_definitions(${TARGET} PUBLIC APPLE) @@ -100,5 +103,4 @@ if (BUILD_GUI) add_custom_command(TARGET customchar POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/fonts $/fonts) - endif() diff --git a/customchar/main.cpp b/customchar/main.cpp index dbfe83e..dbb49e2 100644 --- a/customchar/main.cpp +++ b/customchar/main.cpp @@ -20,6 +20,7 @@ #include "customchar/common/helpers.h" #include "customchar/session/chat_history.h" #include "customchar/session/chat_message.h" +#include "customchar/vision/video_capture.h" #include "imgui_internal.h" #include "imspinner/imspinner.h" @@ -27,6 +28,8 @@ using namespace CC; using namespace CC::character; +vision::VideoCapture video_capture; + // [Win32] Our example includes a copy of glfw3.lib pre-compiled with VS2010 to // maximize ease of testing and compatibility with old VS compilers. To link // with VS2010-era libraries, VS2015+ requires linking with @@ -43,8 +46,8 @@ using namespace CC::character; // everytime user sends message, IMGUI sets global variable to message // signal client server ... lock/unlock mutex constexpr int TEXT_MESSAGE_SIZE = 1024 * 8; -constexpr int INIT_WINDOW_WIDTH = 450; -constexpr int INIT_WINDOW_HEIGHT = 400; +constexpr int INIT_WINDOW_WIDTH = 600; +constexpr int INIT_WINDOW_HEIGHT = 600; static void GLFWErrorCallback(int error, const char* description) { fprintf(stderr, "Glfw Error %d: %s\n", error, description); @@ -129,17 +132,6 @@ void runImgui(std::shared_ptr history) { // Main loop while (!glfwWindowShouldClose(window)) { - // Poll and handle events (inputs, window resize, etc.) - // You can read the io.WantCaptureMouse, io.WantCaptureKeyboard - // flags to tell if dear imgui wants to use your inputs. - // - When io.WantCaptureMouse is true, do not dispatch mouse input - // data to your main application, or clear/overwrite your copy of - // the mouse data. 
- // - When io.WantCaptureKeyboard is true, do not dispatch keyboard - // input data to your main application, or clear/overwrite your copy - // of the keyboard data. Generally you may always pass all inputs to - // dear imgui, and hide them from your application based on those - // two flags glfwPollEvents(); // Start the Dear ImGui frame @@ -158,6 +150,32 @@ void runImgui(std::shared_ptr history) { ImGuiWindowFlags_NoResize | ImGuiWindowFlags_NoMove | ImGuiWindowFlags_NoCollapse | ImGuiWindowFlags_NoTitleBar); + // Get window size + ImVec2 windowSize = ImGui::GetWindowSize(); + + // Resize image to fit window + cv::Mat image = video_capture.GetFrame(); + cv::Mat resized_image; + float ratio = (float)image.cols / (float)image.rows; + int new_width = windowSize.x; + int new_height = new_width / ratio; + cv::resize(image, resized_image, cv::Size(new_width, new_height)); + cv::cvtColor(resized_image, resized_image, cv::COLOR_BGR2RGBA); + + GLuint texture; + glGenTextures(1, &texture); + glBindTexture(GL_TEXTURE_2D, texture); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + glPixelStorei(GL_UNPACK_ROW_LENGTH, 0); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, resized_image.cols, + resized_image.rows, 0, GL_RGBA, GL_UNSIGNED_BYTE, + resized_image.data); + ImGui::Image(reinterpret_cast(static_cast(texture)), + ImVec2(resized_image.cols, resized_image.rows), ImVec2(0, 0), + ImVec2(1, 1), ImColor(255, 255, 255, 255), + ImColor(255, 255, 255, 128)); + // Child window scrollable area ImGuiWindowFlags window_flags = ImGuiWindowFlags_None; @@ -252,6 +270,8 @@ int main(int argc, char** argv) { // Create character Character character(params); + video_capture.Start(); + // Set message callbacks character.SetOnUserMessage( std::bind(OnNewMessage, std::placeholders::_1, "User", history)); diff --git a/customchar/vision/video_capture.h b/customchar/vision/video_capture.h new file mode 100644 index 
0000000..8f27ee4 --- /dev/null +++ b/customchar/vision/video_capture.h @@ -0,0 +1,62 @@ +#ifndef CUSTOMCHAR_VISION_VIDEO_CAPTURE_H_ +#define CUSTOMCHAR_VISION_VIDEO_CAPTURE_H_ + +#include +#include +#include +#include + +#include + +namespace CC { +namespace vision { + +class VideoCapture { + private: + cv::VideoCapture capture_; + cv::Mat frame_; + std::mutex frame_mutex_; + std::thread capture_thread_; + + public: + VideoCapture() { + capture_ = cv::VideoCapture(0); + if (!capture_.isOpened()) { + std::cout << "Error opening video stream or file" << std::endl; + exit(-1); + } + } + + /// @brief Capture frames from camera + void Capture() { + cv::Mat frame; + while (true) { + capture_ >> frame; + if (frame.empty()) { + break; + } + std::lock_guard lock(frame_mutex_); + frame_ = frame.clone(); + } + } + + /// @brief Start capturing frames + void Start() { capture_thread_ = std::thread(&VideoCapture::Capture, this); } + + /// @brief Stop capturing frames + void Stop() { capture_thread_.join(); } + + /// @brief Get frame from queue + /// @return cv::Mat. Empty if queue is empty + cv::Mat GetFrame() { + cv::Mat frame; + std::lock_guard lock(frame_mutex_); + frame = frame_.clone(); + return frame; + } +}; + +} // namespace vision +} // namespace CC + +#endif // CUSTOMCHAR_VISION_VIDEO_CAPTURE_H_ diff --git a/docs/architecture.svg b/docs/architecture.svg index 073f4b5..8599a31 100644 --- a/docs/architecture.svg +++ b/docs/architecture.svg @@ -1,4 +1,4 @@ -
Character Data
Character Data
Text Data
Text Data
Voice Sample
Voice Sample
CC::EmbedDB
(HNSW + SQLite)
CC::EmbedDB...
CC::VoiceSynthesizer
CC::VoiceSynthesizer
Voice
Voice
CC::SpeechRecognizer
(Whisper)
CC::SpeechRecognizer...
CC::WakeWordDetector
CC::WakeWordDetector
Vector DB
Vector DB
Document
Document
CC::LLM
(Llama V2)
CC::LLM...
CC::LLM
(Llama V2)
CC::LLM...
Response Voice
Response Voice
Response
Response
Chat History
Chat History
Data Preparation
Data Preparation
Inference
Inference
CustomChar
Architecture
CustomChar...
Prompt
Prompt
Perception Prompt
Perception Prompt
CC::Perception
CC::Perception
CC::LLM
(Llama V2)
CC::LLM...
Image
Image
CC::PluginExecutor
CC::PluginExecutor
Text Prompt
Text Prompt
External Services
External Services
Prompt Template
Prompt Template
Text is not SVG - cannot display
+
Character Data
Character Data
Text Data
Text Data
Voice Sample
Voice Sample
CC::EmbedDB
(HNSW + SQLite)
CC::EmbedDB...
CC::VoiceSynthesizer
CC::VoiceSynthesizer
Voice
Voice
CC::SpeechRecognizer
(Whisper)
CC::SpeechRecognizer...
CC::WakeWordDetector
CC::WakeWordDetector
Vector DB
Vector DB
Document
Document
CC::LLM
(Llama V2)
CC::LLM...
CC::LLM
(Llama V2)
CC::LLM...
Response Voice
Response Voice
Response
Response
Chat History
Chat History
Data Preparation
Data Preparation
Inference
Inference
CustomChar
Architecture
CustomChar...
Prompt
Prompt
Perception Prompt
Perception Prompt
CC::Vision
CC::Vision
CC::LLM
(Llama V2)
CC::LLM...
Image
Image
CC::PluginExecutor
CC::PluginExecutor
Text Prompt
Text Prompt
External Services
External Services
Prompt Template
Prompt Template
Text is not SVG - cannot display
From c5c4fbc676c02fa5b02986bb15b227c999bbe293 Mon Sep 17 00:00:00 2001 From: Viet Anh Nguyen Date: Sat, 29 Jul 2023 22:14:27 +0700 Subject: [PATCH 2/3] Fix memory leak --- customchar/audio/speech_recognizer.cpp | 1 - customchar/audio/voice_synthesizer.cpp | 2 +- customchar/main.cpp | 18 ++++++++++-------- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/customchar/audio/speech_recognizer.cpp b/customchar/audio/speech_recognizer.cpp index 7c5868f..3c05322 100644 --- a/customchar/audio/speech_recognizer.cpp +++ b/customchar/audio/speech_recognizer.cpp @@ -143,7 +143,6 @@ std::string SpeechRecognizer::Transcribe(const std::vector& pcmf32, const int n_segments = whisper_full_n_segments(context_); for (int i = 0; i < n_segments; ++i) { const char* text = whisper_full_get_segment_text(context_, i); - result += text; const int n_tokens = whisper_full_n_tokens(context_, i); diff --git a/customchar/audio/voice_synthesizer.cpp b/customchar/audio/voice_synthesizer.cpp index fd39efa..322cf86 100644 --- a/customchar/audio/voice_synthesizer.cpp +++ b/customchar/audio/voice_synthesizer.cpp @@ -6,7 +6,7 @@ using namespace CC::audio; VoiceSynthesizer::VoiceSynthesizer() { // Check if the Say command is supported - std::string command = "Say --version"; + std::string command = "which say"; FILE* pipe = popen(command.c_str(), "r"); if (pipe == nullptr) { printf("Failed to run command: %s\n", command.c_str()); diff --git a/customchar/main.cpp b/customchar/main.cpp index dbb49e2..6d2d77c 100644 --- a/customchar/main.cpp +++ b/customchar/main.cpp @@ -130,6 +130,13 @@ void runImgui(std::shared_ptr history) { // Initial text char text[TEXT_MESSAGE_SIZE] = ""; + GLuint texture; + glGenTextures(1, &texture); + glBindTexture(GL_TEXTURE_2D, texture); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + glPixelStorei(GL_UNPACK_ROW_LENGTH, 0); + // Main loop while (!glfwWindowShouldClose(window)) 
{ glfwPollEvents(); @@ -162,12 +169,7 @@ void runImgui(std::shared_ptr history) { cv::resize(image, resized_image, cv::Size(new_width, new_height)); cv::cvtColor(resized_image, resized_image, cv::COLOR_BGR2RGBA); - GLuint texture; - glGenTextures(1, &texture); - glBindTexture(GL_TEXTURE_2D, texture); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); - glPixelStorei(GL_UNPACK_ROW_LENGTH, 0); + // Display image glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, resized_image.cols, resized_image.rows, 0, GL_RGBA, GL_UNSIGNED_BYTE, resized_image.data); @@ -227,6 +229,7 @@ void runImgui(std::shared_ptr history) { // Put the cursor of InputTextMultiline at the end of the text ImGui::SetKeyboardFocusHere(); + ImGui::End(); // Rendering ImGui::Render(); @@ -241,8 +244,7 @@ void runImgui(std::shared_ptr history) { glfwSwapBuffers(window); } - std::cout << "Main ImGUI loop ended" << std::endl; - + glDeleteTextures(1, &texture); ImGui_ImplOpenGL3_Shutdown(); ImGui_ImplGlfw_Shutdown(); ImGui::DestroyContext(); From 794cef13bc5d2a98565d446686896a4ed01d4f6a Mon Sep 17 00:00:00 2001 From: Viet Anh Nguyen Date: Sat, 29 Jul 2023 23:23:39 +0700 Subject: [PATCH 3/3] Switch camera on/off --- customchar/main.cpp | 83 +++++++++++++++++++------------ customchar/vision/video_capture.h | 72 +++++++++++++++++++++++---- 2 files changed, 115 insertions(+), 40 deletions(-) diff --git a/customchar/main.cpp b/customchar/main.cpp index 6d2d77c..85145c7 100644 --- a/customchar/main.cpp +++ b/customchar/main.cpp @@ -47,7 +47,7 @@ vision::VideoCapture video_capture; // signal client server ... 
lock/unlock mutex constexpr int TEXT_MESSAGE_SIZE = 1024 * 8; constexpr int INIT_WINDOW_WIDTH = 600; -constexpr int INIT_WINDOW_HEIGHT = 600; +constexpr int INIT_WINDOW_HEIGHT = 400; static void GLFWErrorCallback(int error, const char* description) { fprintf(stderr, "Glfw Error %d: %s\n", error, description); @@ -105,9 +105,6 @@ void runImgui(std::shared_ptr history) { ImGui::CreateContext(); ImGuiIO& io = ImGui::GetIO(); (void)io; - // io.ConfigFlags |= ImGuiConfigFlags_NavEnableKeyboard; // Enable - // Keyboard Controls io.ConfigFlags |= ImGuiConfigFlags_NavEnableGamepad; // - // Enable Gamepad Controls // Setup Dear ImGui style // ImGui::StyleColorsDark(); @@ -125,7 +122,7 @@ void runImgui(std::shared_ptr history) { // Our state ImVec4 clear_color = ImVec4(0.45f, 0.55f, 0.60f, 1.00f); - bool justSent = true; + bool just_sent = true; // Initial text char text[TEXT_MESSAGE_SIZE] = ""; @@ -137,6 +134,9 @@ void runImgui(std::shared_ptr history) { glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); glPixelStorei(GL_UNPACK_ROW_LENGTH, 0); + bool last_enable_camera = false; + bool enable_camera = true; + // Main loop while (!glfwWindowShouldClose(window)) { glfwPollEvents(); @@ -157,26 +157,49 @@ void runImgui(std::shared_ptr history) { ImGuiWindowFlags_NoResize | ImGuiWindowFlags_NoMove | ImGuiWindowFlags_NoCollapse | ImGuiWindowFlags_NoTitleBar); - // Get window size - ImVec2 windowSize = ImGui::GetWindowSize(); - - // Resize image to fit window - cv::Mat image = video_capture.GetFrame(); - cv::Mat resized_image; - float ratio = (float)image.cols / (float)image.rows; - int new_width = windowSize.x; - int new_height = new_width / ratio; - cv::resize(image, resized_image, cv::Size(new_width, new_height)); - cv::cvtColor(resized_image, resized_image, cv::COLOR_BGR2RGBA); - - // Display image - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, resized_image.cols, - resized_image.rows, 0, GL_RGBA, GL_UNSIGNED_BYTE, - resized_image.data); - 
ImGui::Image(reinterpret_cast(static_cast(texture)), - ImVec2(resized_image.cols, resized_image.rows), ImVec2(0, 0), - ImVec2(1, 1), ImColor(255, 255, 255, 255), - ImColor(255, 255, 255, 128)); + ImVec2 window_size = ImGui::GetWindowSize(); + + // Check and start/stop camera + if (last_enable_camera != enable_camera) { + if (enable_camera) { + video_capture.Start(); + // Adapt window height to camera aspect ratio + int window_width = window_size.x; + int window_height = window_width * video_capture.GetFrameHeight() / + video_capture.GetFrameWidth() + + 200; + glfwSetWindowSize(window, window_width, window_height); + } else { + video_capture.Stop(); + glfwSetWindowSize(window, INIT_WINDOW_WIDTH, INIT_WINDOW_HEIGHT); + } + last_enable_camera = enable_camera; + } + + // Render camera + if (enable_camera) { + // Resize image to fit window + cv::Mat image = video_capture.GetFrame(); + if (!image.empty()) { + cv::Mat resized_image; + float ratio = (float)image.cols / (float)image.rows; + int new_width = window_size.x - 20; + int new_height = new_width / ratio; + cv::resize(image, resized_image, cv::Size(new_width, new_height)); + cv::cvtColor(resized_image, resized_image, cv::COLOR_BGR2RGBA); + + // Display image + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, resized_image.cols, + resized_image.rows, 0, GL_RGBA, GL_UNSIGNED_BYTE, + resized_image.data); + ImGui::Image(reinterpret_cast(static_cast(texture)), + ImVec2(resized_image.cols, resized_image.rows), + ImVec2(0, 0), ImVec2(1, 1), ImColor(255, 255, 255, 255), + ImColor(255, 255, 255, 128)); + } + } + + ImGui::Checkbox("Enable Camera", &enable_camera); // Child window scrollable area ImGuiWindowFlags window_flags = ImGuiWindowFlags_None; @@ -196,7 +219,7 @@ void runImgui(std::shared_ptr history) { ImGui::TextWrapped("> %s: %s", message.GetSender().c_str(), message.GetMessage().c_str()); } - if (history->HasNewMessage() || justSent) { + if (history->HasNewMessage() || just_sent) { ImGui::SetScrollHereY(1.0f); } @@ -208,9 
+231,9 @@ void runImgui(std::shared_ptr history) { ImGuiInputTextFlags input_flags = ImGuiInputTextFlags_ReadOnly; // Refocus text area if text was just sent - if (justSent) { + if (just_sent) { ImGui::SetKeyboardFocusHere(); - justSent = false; + just_sent = false; } // Create a spinner and text input in the same line @@ -224,7 +247,7 @@ void runImgui(std::shared_ptr history) { strcpy(text, "Say something..."); ImGui::PushItemWidth(ImGui::GetContentRegionAvail().x); if (ImGui::InputText("##source", text, IM_ARRAYSIZE(text), input_flags)) { - justSent = OnNewMessage(text, "User", history); + just_sent = OnNewMessage(text, "User", history); }; // Put the cursor of InputTextMultiline at the end of the text @@ -272,8 +295,6 @@ int main(int argc, char** argv) { // Create character Character character(params); - video_capture.Start(); - // Set message callbacks character.SetOnUserMessage( std::bind(OnNewMessage, std::placeholders::_1, "User", history)); diff --git a/customchar/vision/video_capture.h b/customchar/vision/video_capture.h index 8f27ee4..42fdd10 100644 --- a/customchar/vision/video_capture.h +++ b/customchar/vision/video_capture.h @@ -13,25 +13,60 @@ namespace vision { class VideoCapture { private: + int device_id_; + bool is_capturing_; + cv::VideoCapture capture_; + std::mutex capture_mutex_; + cv::Mat frame_; std::mutex frame_mutex_; std::thread capture_thread_; - public: - VideoCapture() { - capture_ = cv::VideoCapture(0); + /// @brief Start capturing frames from device + bool StartDevice() { + capture_ = cv::VideoCapture(device_id_); if (!capture_.isOpened()) { - std::cout << "Error opening video stream or file" << std::endl; - exit(-1); + std::cerr << "Error opening video stream or file" << std::endl; + return false; + } else { + return true; } } + /// @brief Stop capturing frames from device + void StopDevice() { + if (capture_.isOpened()) capture_.release(); + } + + public: + VideoCapture() {} + + /// @brief Get frame width + /// @return int + int 
GetFrameWidth() { + std::lock_guard lock(frame_mutex_); + return frame_.cols; + } + + /// @brief Get frame height + /// @return int + int GetFrameHeight() { + std::lock_guard lock(frame_mutex_); + return frame_.rows; + } + /// @brief Capture frames from camera void Capture() { cv::Mat frame; while (true) { - capture_ >> frame; + { + std::lock_guard lock(capture_mutex_); + if (!is_capturing_) { + break; + } + capture_ >> frame; + } if (frame.empty()) { break; } @@ -41,10 +76,29 @@ class VideoCapture { } /// @brief Start capturing frames - void Start() { capture_thread_ = std::thread(&VideoCapture::Capture, this); } + void Start() { + { + std::lock_guard lock(capture_mutex_); + if (is_capturing_) { + return; + } + if (!StartDevice()) return; + // Get first frame to initialize frame size + std::lock_guard frame_lock(frame_mutex_); + capture_ >> frame_; + is_capturing_ = true; + } + capture_thread_ = std::thread(&VideoCapture::Capture, this); + } - /// @brief Stop capturing frames - void Stop() { capture_thread_.join(); } + /// @brief Stop capturing frames + void Stop() { + std::lock_guard lock(capture_mutex_); + if (!is_capturing_) { + return; + } + StopDevice(); + } /// @brief Get frame from queue /// @return cv::Mat. Empty if queue is empty