Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
MaybeShewill-CV committed Nov 26, 2024
1 parent cf0fdd3 commit 7dba7a0
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 14 deletions.
8 changes: 8 additions & 0 deletions conf/model/llm/llama/llama-3.2-1B-instruct.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Config for the Llama-3.2-1B-Instruct model (NOTE(review): assumes the project's
# INI parser accepts '#' comments — confirm; classic INI uses ';').
[LLAMA3]
# Path (relative to the binary's working dir) to the Q4_K_M-quantized GGUF weights.
model_file_path="../weights/llm/llama/Llama-3.2-1B-Instruct/llama-3.2-1B-instruct-Q4_K_M.gguf"
# Number of layers to offload to the GPU; 300 exceeds the 1B model's layer count,
# which presumably means "offload everything" — verify against the loader.
n_gpu_layers=300
# CUDA device index used as the main GPU.
main_gpu_device=0
# Sampling temperature for token generation (0.7 = mildly creative).
sampler_temp=0.7

[CONTEXT]
# Context window size in tokens for the llama context.
context_size=4096
6 changes: 3 additions & 3 deletions src/apps/model_benchmark/llm/llama3_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,11 @@ int main(int argc, char** argv) {
return -1;
}

std::string input = "<user>\n"
"Can you recommend some beginner-friendly programming languages for someone new to coding?\n"
"</user>";
std::string input = "\n<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|>\n<|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n\nHi there<|eot_id|>\n<|start_header_id|>user<|end_header_id|>\n\nWho are you<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n\nI am an assistant<|eot_id|>\n<|start_header_id|>user<|end_header_id|>\n\nWho creates you?<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n\n";
LOG(INFO) << "input prompt text: " << input;
std::string out;
model.run(input, out);
LOG(INFO) << "generated output: " << out;

return 0;
}
27 changes: 16 additions & 11 deletions src/models/llm/llama/llama3.inl
Original file line number Diff line number Diff line change
Expand Up @@ -157,9 +157,10 @@ private:
/***
*
* @param prompt_tokens
* @param generate_out
* @return
*/
StatusCode llama_generate(std::vector<llama_token>& prompt_tokens);
StatusCode llama_generate(std::vector<llama_token>& prompt_tokens, std::string& generate_out);
};

/***
Expand Down Expand Up @@ -248,9 +249,16 @@ StatusCode Llama3<INPUT, OUTPUT>::Impl::run(const INPUT& in, OUTPUT& out) {
// tokenize input prompt
std::vector<llama_token> prompt_tokens;
auto status = tokenize_prompt(prompt, prompt_tokens);
if (status != StatusCode::OK) {
return status;
}

// run llama3 generate
status = llama_generate(prompt_tokens);
std::string generate_out;
status = llama_generate(prompt_tokens, generate_out);

// transform output
out = llama_impl::transform_output<OUTPUT>(generate_out);

return status;
}
Expand Down Expand Up @@ -294,8 +302,7 @@ StatusCode Llama3<INPUT, OUTPUT>::Impl::tokenize_prompt(const std::string &promp
* @return
*/
template <typename INPUT, typename OUTPUT>
StatusCode Llama3<INPUT, OUTPUT>::Impl::llama_generate(std::vector<llama_token> &prompt_tokens) {
std::string response;
StatusCode Llama3<INPUT, OUTPUT>::Impl::llama_generate(std::vector<llama_token> &prompt_tokens, std::string& generate_out) {
// prepare a batch for the prompt
llama_batch batch = llama_batch_get_one(prompt_tokens.data(), static_cast<int32_t>(prompt_tokens.size()));
llama_token new_token_id;
Expand All @@ -304,9 +311,8 @@ StatusCode Llama3<INPUT, OUTPUT>::Impl::llama_generate(std::vector<llama_token>
int n_ctx = llama_n_ctx(_m_ctx);
int n_ctx_used = llama_get_kv_cache_used_cells(_m_ctx);
if (n_ctx_used + batch.n_tokens > n_ctx) {
printf("\033[0m\n");
fprintf(stderr, "context size exceeded\n");
exit(0);
LOG(ERROR) << "context size exceeded";
return StatusCode::MODEL_RUN_SESSION_FAILED;
}

if (llama_decode(_m_ctx, batch)) {
Expand All @@ -330,14 +336,13 @@ StatusCode Llama3<INPUT, OUTPUT>::Impl::llama_generate(std::vector<llama_token>
return StatusCode::MODEL_RUN_SESSION_FAILED;
}
std::string piece(buf, n);
printf("%s", piece.c_str());
fflush(stdout);
response += piece;
// printf("%s", piece.c_str());
// fflush(stdout);
generate_out += piece;

// prepare the next batch with the sampled token
batch = llama_batch_get_one(&new_token_id, 1);
}
LOG(INFO) << "generate: " << response;

return StatusCode::OK;
}
Expand Down

0 comments on commit 7dba7a0

Please sign in to comment.