From 2fbd4d368cc7ce2363c2510d84d066d3cea6df17 Mon Sep 17 00:00:00 2001 From: Viet Anh Nguyen Date: Thu, 3 Aug 2023 00:49:09 +0700 Subject: [PATCH] Search JARVIS's quotes --- CMakeLists.txt | 11 +++-- customchar/llm/llm.cpp | 26 +++++++++++ customchar/llm/llm.h | 29 +++++++++++- data/jarvis.txt | 6 +++ examples/search_doc.cpp | 100 +++++++++++++++++++++++++++++----------- 5 files changed, 138 insertions(+), 34 deletions(-) create mode 100644 data/jarvis.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 395c982..17d6ee1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,11 +43,6 @@ target_link_libraries( pthread dl ) -add_executable( - search_doc - examples/search_doc.cpp -) -target_link_libraries(search_doc embeddb) # Build CustomChar-core set(TARGET customchar-core) @@ -71,6 +66,12 @@ target_include_directories( ) target_link_libraries(${TARGET} PUBLIC ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} ${OpenCV_LIBS} whisper subprocess embeddb) +add_executable( + search_doc + examples/search_doc.cpp +) +target_link_libraries(search_doc PUBLIC customchar-core) + # CustomChar - cli add_executable( customchar-cli diff --git a/customchar/llm/llm.cpp b/customchar/llm/llm.cpp index c215649..92c0804 100644 --- a/customchar/llm/llm.cpp +++ b/customchar/llm/llm.cpp @@ -20,6 +20,7 @@ LLM::LLM(const std::string& model_path, const std::string& path_session, lparams_.n_ctx = 2048; lparams_.seed = 1; lparams_.f16_kv = true; + lparams_.embedding = true; // Load model to ram ctx_llama_ = llama_init_from_file(model_path_.c_str(), lparams_); @@ -349,3 +350,28 @@ std::string LLM::get_answer(const std::string& user_input) { return output_text; } + +std::vector<float> LLM::get_embedding(const std::string& text) { + std::vector<llama_token> embd(text.size()); + llama_tokenize(ctx_llama_, text.c_str(), embd.data(), embd.size(), true); + llama_eval(ctx_llama_, embd.data(), embd.size(), n_past_, n_threads_); + const int n_embd = llama_n_embd(ctx_llama_); + const auto embeddings = 
llama_get_embeddings(ctx_llama_); + std::vector<float> result; + result.reserve(n_embd); + for (int i = 0; i < n_embd; ++i) { + result.push_back(embeddings[i]); + } + + // Normalize + float norm = 0; + for (int i = 0; i < n_embd; ++i) { + norm += result[i] * result[i]; + } + norm = sqrt(norm); + for (int i = 0; i < n_embd; ++i) { + result[i] /= norm; + } + + return result; +} diff --git a/customchar/llm/llm.h b/customchar/llm/llm.h index 0a41865..9523691 100644 --- a/customchar/llm/llm.h +++ b/customchar/llm/llm.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -65,7 +66,7 @@ class LLM { void init_prompt(); public: - /// @brief Constructor + /// @brief Constructor /// @param model_path Path to the model /// @param path_session Path to the session LLM(const std::string& model_path, const std::string& path_session = "", @@ -88,6 +89,9 @@ class LLM { /// @brief Get answer from LLM std::string get_answer(const std::string& user_input); + + /// @brief Get embedding from LLM + std::vector<float> get_embedding(const std::string& text); }; } // namespace llm diff --git a/data/jarvis.txt b/data/jarvis.txt new file mode 100644 index 0000000..ed26b67 --- /dev/null +++ b/data/jarvis.txt @@ -0,0 +1,6 @@ +Good morning. It's 7 A.M. The weather in Malibu is 72 degrees with scattered clouds. 
The surf conditions are fair with waist to shoulder highlines, high tide will be at 10:52 a.m. ―J.A.R.V.I.S +We are now running on emergency backup power. ―J.A.R.V.I.S +I don't want this winding up in the wrong hands. Maybe in mine, it could actually do some good. +Wake up. Daddy's home. +Welcome home, sir. Congratulations on the opening ceremonies. They were such a success, as was your senate hearing. And may I say how refreshing it is to finally see you in a video with your clothing on, sir. +Sir, we will lose power before we penetrate that shell. diff --git a/examples/search_doc.cpp b/examples/search_doc.cpp index 6a49dd7..ee55a80 100644 --- a/examples/search_doc.cpp +++ b/examples/search_doc.cpp @@ -5,44 +5,88 @@ #include "customchar/embeddb/document.h" #include "customchar/embeddb/embed_search.h" #include "customchar/embeddb/types.h" +#include "customchar/llm/llm.h" +using namespace CC; using namespace CC::embeddb; +std::vector<std::string> read_lines(const std::string& file_path) { + std::vector<std::string> lines; + + // Open the file + std::ifstream file(file_path); + + // Check if the file was opened successfully + if (!file.is_open()) { + std::cerr << "Error: Unable to open the file." 
<< std::endl; + return lines; + } + + // Read line by line and add non-empty lines to the vector + std::string line; + while (std::getline(file, line)) { + if (!line.empty()) { + lines.push_back(line); + } + } + + // Close the file after reading + file.close(); + + return lines; +} + int main() { + std::string model_path = "../models/llama-2-7b-chat.ggmlv3.q4_0.bin"; + llm::LLM embedding_model(model_path); + embedding_model.eval_model(); + std::string connection_name = "test_collection"; std::string path = "test_collection"; - int dim = 10; + int dim = 4096; int max_size = 1000; Collection* collection = new Collection(connection_name, path, dim, max_size); - // Test Insert - std::vector<float> embed; - for (int i = 0; i < 10; i++) { - embed.push_back(i); - } - std::string content = "test content"; - std::string meta = "test meta"; - u_int32_t id = collection->insert_doc(embed, content, meta, 1, 1, 1); - std::cout << "Inserted document id: " << id << std::endl; - - // Test Get Doc From Ids - std::vector<u_int32_t> ids{0, 1}; - std::vector<Document> docs = collection->get_docs_by_ids(ids, 2); - std::cout << docs.size() << std::endl; - - // Test Search - std::vector<float> query; - for (int i = 0; i < 10; i++) { - query.push_back(i); - } - std::vector<u_int32_t> doc_ids; - std::vector<float> distances; - int top_k = 2; - float threshold = 100; - collection->search(query, top_k, threshold, doc_ids, distances); - for (int i = 0; i < doc_ids.size(); i++) { - std::cout << doc_ids[i] << " " << distances[i] << std::endl; + // Read the document from file + std::string file_path = "../data/jarvis.txt"; + std::vector<std::string> lines = read_lines(file_path); + + // Insert all documents + for (int i = 0; i < lines.size(); i++) { + std::string content = lines[i]; + std::vector<float> embed = embedding_model.get_embedding(lines[i]); + std::string meta = "test meta"; + std::cout << "Inserting document " << i << std::endl; + std::cout << "Embedding size " << embed.size() << std::endl; + std::cout << "Content size " << content.size() << std::endl; + 
std::cout << "Content: " << content << std::endl; + u_int32_t id = collection->insert_doc(embed, content, meta, 1, 1, 1); } + while (true) { + // Test Search + std::string query_str; + std::cout << "Enter query: "; + std::getline(std::cin, query_str); + std::vector<float> query = embedding_model.get_embedding(query_str); + + std::vector<u_int32_t> doc_ids; + std::vector<float> distances; + int top_k = 2; + float threshold = 100000000; + collection->search(query, top_k, threshold, doc_ids, distances); + for (int i = 0; i < 10; ++i) { + std::cout << query[i] << " "; + } + std::cout << std::endl; + + std::cout << "Search result: " << std::endl; + for (int i = 0; i < doc_ids.size(); i++) { + std::cout << "Doc id: " << doc_ids[i] << std::endl; + std::cout << "Distance: " << distances[i] << std::endl; + Document doc = collection->get_doc(doc_ids[i]); + std::cout << "Content: " << doc.get_content() << std::endl; + } + } return 0; }