From 2fbd4d368cc7ce2363c2510d84d066d3cea6df17 Mon Sep 17 00:00:00 2001 From: Viet Anh Nguyen Date: Thu, 3 Aug 2023 00:49:09 +0700 Subject: [PATCH] Search JARVIS's quotes --- CMakeLists.txt | 11 +++-- customchar/llm/llm.cpp | 26 +++++++++++ customchar/llm/llm.h | 29 +++++++++++- data/jarvis.txt | 6 +++ examples/search_doc.cpp | 100 +++++++++++++++++++++++++++++----------- 5 files changed, 138 insertions(+), 34 deletions(-) create mode 100644 data/jarvis.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 395c982..17d6ee1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,11 +43,6 @@ target_link_libraries( pthread dl ) -add_executable( - search_doc - examples/search_doc.cpp -) -target_link_libraries(search_doc embeddb) # Build CustomChar-core set(TARGET customchar-core) @@ -71,6 +66,12 @@ target_include_directories( ) target_link_libraries(${TARGET} PUBLIC ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} ${OpenCV_LIBS} whisper subprocess embeddb) +add_executable( + search_doc + examples/search_doc.cpp +) +target_link_libraries(search_doc PUBLIC customchar-core) + # CustomChar - cli add_executable( customchar-cli diff --git a/customchar/llm/llm.cpp b/customchar/llm/llm.cpp index c215649..92c0804 100644 --- a/customchar/llm/llm.cpp +++ b/customchar/llm/llm.cpp @@ -20,6 +20,7 @@ LLM::LLM(const std::string& model_path, const std::string& path_session, lparams_.n_ctx = 2048; lparams_.seed = 1; lparams_.f16_kv = true; + lparams_.embedding = true; // Load model to ram ctx_llama_ = llama_init_from_file(model_path_.c_str(), lparams_); @@ -349,3 +350,28 @@ std::string LLM::get_answer(const std::string& user_input) { return output_text; } + +std::vector<float> LLM::get_embedding(const std::string& text) { + std::vector<llama_token> embd(text.size()); + llama_tokenize(ctx_llama_, text.c_str(), embd.data(), embd.size(), true); + llama_eval(ctx_llama_, embd.data(), embd.size(), n_past_, n_threads_); + const int n_embd = llama_n_embd(ctx_llama_); + const auto embeddings = 
llama_get_embeddings(ctx_llama_); + std::vector<float> result; + result.reserve(n_embd); + for (int i = 0; i < n_embd; ++i) { + result.push_back(embeddings[i]); + } + + // Normalize + float norm = 0; + for (int i = 0; i < n_embd; ++i) { + norm += result[i] * result[i]; + } + norm = sqrt(norm); + for (int i = 0; i < n_embd; ++i) { + result[i] /= norm; + } + + return result; +} diff --git a/customchar/llm/llm.h b/customchar/llm/llm.h index 0a41865..9523691 100644 --- a/customchar/llm/llm.h +++ b/customchar/llm/llm.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -65,7 +66,7 @@ class LLM { void init_prompt(); public: - /// @brief Constructor + /// @brief Constructor /// @param model_path Path to the model /// @param path_session Path to the session LLM(const std::string& model_path, const std::string& path_session = "", @@ -88,6 +89,9 @@ class LLM { /// @brief Get answer from LLM std::string get_answer(const std::string& user_input); + + /// @brief Get embedding from LLM + std::vector<float> get_embedding(const std::string& text); }; } // namespace llm diff --git a/data/jarvis.txt b/data/jarvis.txt new file mode 100644 index 0000000..ed26b67 --- /dev/null +++ b/data/jarvis.txt @@ -0,0 +1,6 @@ +Good morning. It's 7 A.M. The weather in Malibu is 72 degrees with scattered clouds. 
The surf conditions are fair with waist to shoulder highlines, high tide will be at 10:52 a.m. ―J.A.R.V.I.S +We are now running on emergency backup power. ―J.A.R.V.I.S +I don't want this winding up in the wrong hands. Maybe in mine, it could actually do some good. +Wake up. Daddy's home. +Welcome home, sir. Congratulations on the opening ceremonies. They were such a success, as was your senate hearing. And may I say how refreshing it is to finally see you in a video with your clothing on, sir. +Sir, we will lose power before we penetrate that shell. diff --git a/examples/search_doc.cpp b/examples/search_doc.cpp index 6a49dd7..ee55a80 100644 --- a/examples/search_doc.cpp +++ b/examples/search_doc.cpp @@ -5,44 +5,88 @@ #include "customchar/embeddb/document.h" #include "customchar/embeddb/embed_search.h" #include "customchar/embeddb/types.h" +#include "customchar/llm/llm.h" +using namespace CC; using namespace CC::embeddb; +std::vector<std::string> read_lines(const std::string& file_path) { + std::vector<std::string> lines; + + // Open the file + std::ifstream file(file_path); + + // Check if the file was opened successfully + if (!file.is_open()) { + std::cerr << "Error: Unable to open the file." 
<< std::endl; + return lines; + } + + // Read line by line and add non-empty lines to the vector + std::string line; + while (std::getline(file, line)) { + if (!line.empty()) { + lines.push_back(line); + } + } + + // Close the file after reading + file.close(); + + return lines; +} + int main() { + std::string model_path = "../models/llama-2-7b-chat.ggmlv3.q4_0.bin"; + llm::LLM embedding_model(model_path); + embedding_model.eval_model(); + std::string connection_name = "test_collection"; std::string path = "test_collection"; - int dim = 10; + int dim = 4096; int max_size = 1000; Collection* collection = new Collection(connection_name, path, dim, max_size); - // Test Insert - std::vector<float> embed; - for (int i = 0; i < 10; i++) { - embed.push_back(i); - } - std::string content = "test content"; - std::string meta = "test meta"; - u_int32_t id = collection->insert_doc(embed, content, meta, 1, 1, 1); - std::cout << "Inserted document id: " << id << std::endl; - - // Test Get Doc From Ids - std::vector<u_int32_t> ids{0, 1}; - std::vector<Document> docs = collection->get_docs_by_ids(ids, 2); - std::cout << docs.size() << std::endl; - - // Test Search - std::vector<float> query; - for (int i = 0; i < 10; i++) { - query.push_back(i); - } - std::vector<u_int32_t> doc_ids; - std::vector<float> distances; - int top_k = 2; - float threshold = 100; - collection->search(query, top_k, threshold, doc_ids, distances); - for (int i = 0; i < doc_ids.size(); i++) { - std::cout << doc_ids[i] << " " << distances[i] << std::endl; + // Read the document from file + std::string file_path = "../data/jarvis.txt"; + std::vector<std::string> lines = read_lines(file_path); + + // Insert all documents + for (int i = 0; i < lines.size(); i++) { + std::string content = lines[i]; + std::vector<float> embed = embedding_model.get_embedding(lines[i]); + std::string meta = "test meta"; + std::cout << "Inserting document " << i << std::endl; + std::cout << "Embedding size " << embed.size() << std::endl; + std::cout << "Content size " << content.size() << std::endl; + 
std::cout << "Content: " << content << std::endl; + u_int32_t id = collection->insert_doc(embed, content, meta, 1, 1, 1); } + while (true) { + // Test Search + std::string query_str; + std::cout << "Enter query: "; + std::getline(std::cin, query_str); + std::vector<float> query = embedding_model.get_embedding(query_str); + + std::vector<u_int32_t> doc_ids; + std::vector<float> distances; + int top_k = 2; + float threshold = 100000000; + collection->search(query, top_k, threshold, doc_ids, distances); + for (int i = 0; i < 10; ++i) { + std::cout << query[i] << " "; + } + std::cout << std::endl; + + std::cout << "Search result: " << std::endl; + for (int i = 0; i < doc_ids.size(); i++) { + std::cout << "Doc id: " << doc_ids[i] << std::endl; + std::cout << "Distance: " << distances[i] << std::endl; + Document doc = collection->get_doc(doc_ids[i]); + std::cout << "Content: " << doc.get_content() << std::endl; + } + } return 0; }