Merge pull request #4 from vietanhdev/refactor/clean_audio_llama_apis
Clean up APIs
vietanhdev authored Jul 30, 2023
2 parents 83b8aa3 + 7077559 commit 8c5d607
Showing 6 changed files with 61 additions and 32 deletions.
README.md (7 changes: 5 additions & 2 deletions)
@@ -67,22 +67,25 @@ Install dependencies:
- [CMake](https://cmake.org/download/)
- C++ 14 compiler

On macOS:
On macOS: Using [Homebrew](https://brew.sh/)

```bash
brew install sdl2 glew glfw3
brew install opencv
```

On Ubuntu:

```bash
sudo apt-get install libsdl2-dev libglew-dev libglfw3-dev
sudo apt-get install libopencv-dev
```

On Windows:
On Windows: Using [vcpkg](https://github.com/microsoft/vcpkg) and [Git Bash](https://git-scm.com/downloads):

```bash
vcpkg install sdl2:x64-windows glew:x64-windows glfw3:x64-windows
vcpkg install opencv[contrib,nonfree,ffmpeg,ipp]:x64-windows --recurse
```

Build the **CustomChar** executable:
customchar/audio/voice_recorder.cpp (27 changes: 27 additions & 0 deletions)
@@ -32,3 +32,30 @@ void VoiceRecorder::GetAudio(std::vector<float>& result) {
audio_->Get(voice_ms, pcmf32_cur_);
result = pcmf32_cur_;
}

std::vector<float> VoiceRecorder::RecordSpeech() {
bool is_running;
std::vector<float> audio_buff;
while (true) {
// Handle Ctrl + C
is_running = audio::SDLPollEvents();
if (!is_running) {
break;
}

// Delay
std::this_thread::sleep_for(std::chrono::milliseconds(100));

// Sample audio
SampleAudio();
if (!FinishedTalking()) {
continue;
}

// Get recorded audio
GetAudio(audio_buff);
break;
}

return audio_buff;
}
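
For context, a minimal usage sketch of the new one-call recording API. Only `RecordSpeech` itself appears in this diff, so the constructor, the enclosing `customchar::audio` namespace, and any audio setup are assumptions here:

```cpp
#include <vector>

#include "customchar/audio/voice_recorder.h"

// Hypothetical caller of the new API. Assumes VoiceRecorder is
// default-constructible and that the `audio` namespace shown in
// voice_recorder.h nests under `customchar`; neither is confirmed by
// this diff.
int main() {
  customchar::audio::VoiceRecorder recorder;

  // Blocks until the speaker finishes talking (or an SDL quit event,
  // e.g. Ctrl + C, arrives), then returns the captured PCM samples.
  std::vector<float> audio_buff = recorder.RecordSpeech();

  // An empty buffer means recording was interrupted before speech ended.
  return audio_buff.empty() ? 1 : 0;
}
```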
customchar/audio/voice_recorder.h (4 changes: 4 additions & 0 deletions)
@@ -35,6 +35,10 @@ class VoiceRecorder {

/// @brief Get final audio_
void GetAudio(std::vector<float>& result);

/// @brief Record speech from user
/// @return Audio buffer from user
std::vector<float> RecordSpeech();
}; // class VoiceRecorder

} // namespace audio
customchar/character/character.cpp (27 changes: 6 additions & 21 deletions)
@@ -57,31 +57,20 @@ void Character::Run() {
break;
}

// Delay
std::this_thread::sleep_for(std::chrono::milliseconds(100));
float prob = 0.0f;
int64_t t_ms = 0;

// Sample audio
voice_recoder_->SampleAudio();
if (!voice_recoder_->FinishedTalking()) {
continue;
}

// Get recorded audio
std::vector<float> audio_buff;
voice_recoder_->GetAudio(audio_buff);
// Record speech from user
std::vector<float> audio_buff = voice_recoder_->RecordSpeech();

// Recognize speech
float prob;
int64_t t_ms;
std::string text_heard =
speech_recognizer_->Recognize(audio_buff, prob, t_ms);

// Tokenize user input
auto tokens = llm_->Tokenize(text_heard, false);

// Skip if nothing was heard
// Start over if nothing was heard
if (text_heard.empty() || tokens.empty()) {
printf("Heard nothing, skipping ...\n");
voice_recoder_->ClearAudioBuffer();
continue;
}
@@ -103,12 +92,8 @@ void Character::Run() {
// Otherwise, LLM will handle
std::string response;
if (!plugin_executor_->ParseAndExecute(text_heard, response)) {
// Append the new input tokens to the session_tokens vector
llm_->AddTokensToCurrentSession(tokens);
// Get answer from LLM
embd = llm_->Tokenize(formated_text_heard, false);
// Get answer from LLM
response = llm_->GetAnswer(embd);
response = llm_->GetAnswer(text_heard);
} else {
// TODO: Add plugin executor response to LLM session
}
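
The `else` branch above leaves a TODO: a plugin-handled response never reaches the LLM session. A possible sketch, using only the `Tokenize` and `AddTokensToCurrentSession` calls that appear elsewhere in this diff; the helper function and its wiring are hypothetical and untested against the real project:

```cpp
#include <string>

// Sketch for the TODO above: fold a plugin-handled exchange back into the
// LLM session so the model sees it on later turns. LLM is a template
// parameter only to keep this snippet self-contained; in the codebase it
// would be the llm::LLM class from llm.h.
template <typename LLM>
void AddPluginExchangeToSession(LLM& llm, const std::string& user_text,
                                const std::string& plugin_response) {
  // Store the user's utterance first, then the plugin's answer,
  // preserving conversational order.
  llm.AddTokensToCurrentSession(llm.Tokenize(user_text, /*add_bos=*/false));
  llm.AddTokensToCurrentSession(
      llm.Tokenize(plugin_response, /*add_bos=*/false));
}
```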
customchar/llm/llm.cpp (26 changes: 18 additions & 8 deletions)
@@ -181,11 +181,21 @@ void LLM::EvalModel() {
n_matching_session_tokens < (embd_inp_.size() * 3 / 4);
}

std::string LLM::GetAnswer(std::vector<llama_token>& embd) {
std::string LLM::GetAnswer(const std::string& user_input) {
// Tokenize and add the unformatted tokens to the session store
AddTokensToCurrentSession(Tokenize(user_input, false));

// Format the input and tokenize
// TODO: Do this more efficiently (reuse the tokens computed above)
std::string formated_input = user_input;
formated_input.insert(0, 1, ' ');
formated_input += "\n" + bot_name_ + chat_symb_;
std::vector<llama_token> embd = Tokenize(formated_input, false);

bool done = false;
int last_length = 0;
int loop_count = 0;
std::string text_to_speak;
std::string output_text;
while (true) {
if (embd.size() > 0) {
if (n_past_ + (int)embd.size() > n_ctx_) {
@@ -297,7 +307,7 @@ std::string LLM::GetAnswer(std::vector<llama_token>& embd) {
if (id != llama_token_eos()) {
// add it to the context
embd.push_back(id);
text_to_speak += llama_token_to_str(ctx_llama_, id);
output_text += llama_token_to_str(ctx_llama_, id);
printf("%s", llama_token_to_str(ctx_llama_, id));
}
}
@@ -314,7 +324,7 @@
last_output.length() - antiprompt.length(),
antiprompt.length()) != std::string::npos) {
done = true;
text_to_speak = common::Replace(text_to_speak, antiprompt, "");
output_text = common::Replace(output_text, antiprompt, "");
fflush(stdout);
need_to_save_session_ = true;
break;

// Break to avoid infinite loop
// TODO: Fix this bug
if ((int)text_to_speak.length() == last_length + 1 &&
text_to_speak[text_to_speak.length() - 1] == '\n') {
if ((int)output_text.length() == last_length + 1 &&
output_text[output_text.length() - 1] == '\n') {
++loop_count;
} else {
loop_count = 0;
}
if (loop_count > 5) {
break;
}
last_length = text_to_speak.length();
last_length = output_text.length();
}

return text_to_speak;
return output_text;
}
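
To make the formatting step at the top of `GetAnswer` concrete, here is a standalone illustration with hypothetical member values (`bot_name_ = "CustomChar"`, `chat_symb_ = ": "`; neither value is shown in this diff):

```cpp
#include <cassert>
#include <string>

int main() {
  // Hypothetical values; the real bot_name_ and chat_symb_ are set
  // elsewhere in llm.cpp and are not part of this diff.
  const std::string bot_name = "CustomChar";
  const std::string chat_symb = ": ";

  std::string user_input = "What time is it?";
  std::string formated_input = user_input;
  formated_input.insert(0, 1, ' ');  // prepend a single space
  formated_input += "\n" + bot_name + chat_symb;

  // The prompt ends with the bot's name, cueing the model to
  // continue the conversation as the bot.
  assert(formated_input == " What time is it?\nCustomChar: ");
  return 0;
}
```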
customchar/llm/llm.h (2 changes: 1 addition & 1 deletion)
@@ -87,7 +87,7 @@ class LLM {
std::vector<llama_token> Tokenize(const std::string& text, bool add_bos);

/// @brief Get answer from LLM
std::string GetAnswer(std::vector<llama_token>& embd);
std::string GetAnswer(const std::string& user_input);
};

} // namespace llm
