diff --git a/.nojekyll b/.nojekyll index 2640a25..f7a8443 100644 --- a/.nojekyll +++ b/.nojekyll @@ -1 +1 @@ -61f03ba7 \ No newline at end of file +cc4d3b70 \ No newline at end of file diff --git a/images/profile.jpeg b/images/profile.jpeg index 3f56d27..6ef0667 100644 Binary files a/images/profile.jpeg and b/images/profile.jpeg differ diff --git a/listings.json b/listings.json index afacafb..f7703ce 100644 --- a/listings.json +++ b/listings.json @@ -1,7 +1,15 @@ [ + { + "listing": "/news.html", + "items": [ + "/news/Research news/researchnews.html", + "/news/Personal thoughts/myread.html" + ] + }, { "listing": "/notes.html", "items": [ + "/notes/Large Language Model/inference_optimize.html", "/notes/Math Theories/ml_optimizer.html", "/notes/Large Language Model/llm_train.html", "/notes/Math Theories/complexanalysis.html", @@ -10,12 +18,5 @@ "/notes/Diffusion Model/sd.html", "/notes/Large Language Model/rl_llm.html" ] - }, - { - "listing": "/news.html", - "items": [ - "/news/Research news/researchnews.html", - "/news/Personal thoughts/myread.html" - ] } ] \ No newline at end of file diff --git a/notes.html b/notes.html index 60aa72c..0e267e3 100644 --- a/notes.html +++ b/notes.html @@ -220,7 +220,37 @@

Research notes

-
+
+
+

+
 
+

+
+ + +
+
-
+
-
+
-
+
-
+
-
+
-
+
+
+

Overview of the optimization

+

To optimize inference for a language model, we mainly have the following methods:

+
    +
  1. Quantization: Quantization reduces the precision of the model’s weights and activations, which shrinks the memory footprint and speeds up inference.
  2. +
  3. Pruning: Pruning removes weights that are close to zero, which reduces the number of parameters and speeds up inference.
  4. +
  5. Lower-level implementation: Implementing the model in a lower-level language such as C++ or Rust can speed up inference.
  6. +
  7. KV cache: The key-value cache stores the attention keys and values of already-processed tokens so they are not recomputed at every decoding step, which reduces computation and speeds up generation. On certain devices, the KV cache may need dedicated support. A minimal sketch of the idea follows after this list.
  8. +
  9. Optimization based on hardware: Like FlashAttention for NVIDIA GPUs, we can optimize the model for the specific hardware it runs on, mainly by improving its memory-access patterns.
  10. +
+
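As mentioned in the KV cache item above, here is a minimal NumPy sketch of single-head attention with a cache. The toy dimensions, weight names, and unbatched single-head setup are illustrative assumptions, not a production implementation:

```python
import numpy as np

def attention(q, K, V):
    """Scaled dot-product attention for a single query vector."""
    scores = q @ K.T / np.sqrt(q.shape[-1])   # (t,)
    weights = np.exp(scores - scores.max())
    weights /= weights.sum()                   # softmax over past tokens
    return weights @ V                         # (d,)

class KVCache:
    """Keep the keys/values of past tokens so each step only projects the new token."""
    def __init__(self):
        self.keys, self.values = [], []

    def step(self, x, Wq, Wk, Wv):
        q, k, v = x @ Wq, x @ Wk, x @ Wv       # project only the current token
        self.keys.append(k)
        self.values.append(v)
        return attention(q, np.stack(self.keys), np.stack(self.values))

# Toy decoding loop: hidden size 8, five generated tokens.
rng = np.random.default_rng(0)
d = 8
Wq, Wk, Wv = (rng.normal(size=(d, d)) for _ in range(3))
cache = KVCache()
for _ in range(5):
    x_t = rng.normal(size=d)                   # embedding of the newest token
    out = cache.step(x_t, Wq, Wk, Wv)
print(out.shape)                               # (8,)
```

Without the cache, each decoding step would recompute the keys and values of every previous token; with it, each step only projects the newest token.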
+
+

Quantization

+
+

Let’s first revisit how data is represented in a computer. We mainly study the float32, float16, and bfloat16 types.

+
    +
  • float32: 32 bits. We have 1 bit for the sign, 8 bits for the exponent and 23 bits for the mantissa. To form a floating-point number, we combine the sign, the mantissa, and the power of 2 given by the exponent. For example, we have \(6.75=+1.1011\times 2^2\). Thus, we can conclude that the range of the representation is between \(10^{-38}\) and \(3\times 10^{38}\) (you can add the sign freely, though).
  • +
  • float16: 16 bits. We have 1 bit for the sign, 5 bits for the exponent and 10 bits for the mantissa. The range of the representation is between \(6\times 10^{-8}\) and \(6\times 10^{4}\).
  • +
  • bfloat16: 16 bits. We have 1 bit for the sign, 8 bits for the exponent and 7 bits for the mantissa. The range of the representation is between \(10^{-38}\) and \(3\times 10^{38}\).
  • +
+

We can see that float16 and bfloat16 take up the same memory space, but they differ in how the bits are allocated. float16 has better precision than bfloat16, while bfloat16 has a wider range than float16. For deep neural networks, we may prefer the bfloat16 type, since range matters more than precision there. The common quantization types are INT8 and INT4. Note that INT8 and INT4 can only represent integers, not floating-point numbers. Thus, INT8 can only represent the numbers between \(-128\) and \(127\), and INT4 can only represent the numbers between \(-8\) and \(7\).

+
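These ranges can be checked directly. The small sketch below assumes PyTorch is installed and simply prints what `torch.finfo` / `torch.iinfo` report for the types discussed above:

```python
import torch

# Dynamic range and precision of the floating-point types discussed above.
for dtype in (torch.float32, torch.float16, torch.bfloat16):
    info = torch.finfo(dtype)
    print(f"{str(dtype):16s} max={info.max:.3e}  min_normal={info.tiny:.3e}  eps={info.eps:.3e}")

# INT8, a common quantization target, covers only a small range of integers.
info = torch.iinfo(torch.int8)
print(f"{str(torch.int8):16s} range=[{info.min}, {info.max}]")
```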

We use the affine quantization scheme to convert the model:

+

+

\[ x_q = \operatorname{round}\left(x/S + Z\right) \]
where \(x_q\) is the quantized value, \(x\) is the original value, \(S\) is the scale factor, \(Z\) is the zero point, and \(\operatorname{round}\) is the rounding function.

+

Usually, we quantize the model in multiple blocks, which means we need multiple scale factors and zero points. Note that not all layers are quantized; for some important layers, we still keep the float32 type.

+

For LLM quantization, we have two different methods: post-training quantization and quantization-aware training. If we ultimately deploy the quantized model, quantization-aware training generally gives better accuracy.

+
+

Existing solutions

+

We can use the quantization utilities provided in Hugging Face Transformers. For more fundamental optimization, we should consider GGML (GPT-Generated Model Language) and GGUF (GPT-Generated Unified Format). For on-device deployment, we should consider GGUF, since it is more efficient; refer to its GitHub repository to use it. We can also consider another library called ollama, which is built on top of llama.cpp.

+ + +
+
+ + ]]> + Large Language Models + https://alexchen4ai.github.io/blog/notes/Large Language Model/inference_optimize.html + Sat, 20 Apr 2024 07:00:00 GMT + Optimization in machine learning Alex Chen @@ -1473,8 +1547,12 @@ Tip

📝 Paper: https://arxiv.org/abs/2212.09748

+
+

Introduction

+

Coming soon

+
]]> Diffusion Model diff --git a/notes/Diffusion Model/sd.html b/notes/Diffusion Model/sd.html index 32d003f..c6734dc 100644 --- a/notes/Diffusion Model/sd.html +++ b/notes/Diffusion Model/sd.html @@ -180,6 +180,12 @@

Scalable diffusion models with transformers

📝 Paper: https://arxiv.org/abs/2212.09748

+
+

Introduction

+

Coming soon

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+
+
+

The inference optimization

+
+
Large Language Models
+
+
+
+ + +
+ +
+
Author
+
+

Alex Chen

+
+
+ +
+
Published
+
+

April 20, 2024

+
+
+ + +
+ + +
+ + +
+ + + +
+ + + + + +
+
+
+ +
+
+Tip +
+
+
+

To run a language model faster, especially on edge devices, we need to optimize the model. This can be done in different ways; in this blog, I will introduce different optimization methods and the existing solutions that make language models run faster. Note that my focus is on on-device language models.

+
+
+
+

Overview of the optimization

+

To optimize inference for a language model, we mainly have the following methods:

+
    +
  1. Quantization: Quantization reduces the precision of the model’s weights and activations, which shrinks the memory footprint and speeds up inference.
  2. +
  3. Pruning: Pruning removes weights that are close to zero, which reduces the number of parameters and speeds up inference. A minimal magnitude-pruning sketch follows after this list.
  4. +
  5. Lower-level implementation: Implementing the model in a lower-level language such as C++ or Rust can speed up inference.
  6. +
  7. KV cache: The key-value cache stores the attention keys and values of already-processed tokens so they are not recomputed at every decoding step, which reduces computation and speeds up generation. On certain devices, the KV cache may need dedicated support.
  8. +
  9. Optimization based on hardware: Like FlashAttention for NVIDIA GPUs, we can optimize the model for the specific hardware it runs on, mainly by improving its memory-access patterns.
  10. +
+
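As referenced in the pruning item above, here is a minimal sketch of unstructured magnitude pruning on a toy linear layer, using plain PyTorch tensor operations; the 50% sparsity level and the function name are illustrative:

```python
import torch

def magnitude_prune(weight: torch.Tensor, sparsity: float = 0.5) -> torch.Tensor:
    """Zero out the smallest-magnitude entries so roughly `sparsity` of them are removed."""
    k = int(weight.numel() * sparsity)
    if k == 0:
        return weight.clone()
    threshold = weight.abs().flatten().kthvalue(k).values
    return weight * (weight.abs() > threshold)

# Prune a toy linear layer and check the resulting sparsity.
layer = torch.nn.Linear(16, 16)
with torch.no_grad():
    layer.weight.copy_(magnitude_prune(layer.weight, sparsity=0.5))
print((layer.weight == 0).float().mean())   # roughly 0.5
```

In practice, pruning is usually followed by fine-tuning to recover accuracy, and the resulting sparsity only translates into speed if the runtime can exploit it.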
+
+

Quantization

+
+
+
+ +
+
+Tip +
+
+
+

Quantization is a model compression technique that converts the weights and activations within an LLM from a high-precision data representation to a lower-precision one, i.e., from a data type that can hold more information to one that holds less. A typical example is converting data from a 32-bit floating-point number (FP32) to an 8-bit or 4-bit integer (INT8 or INT4). A good blog post from the internet is linked here.

+
+
+

Let’s first revisit how data is represented in a computer. We mainly study the float32, float16, and bfloat16 types.

+
    +
  • float32: 32 bits. We have 1 bit for the sign, 8 bits for the exponent and 23 bits for the mantissa. To form a floating-point number, we combine the sign, the mantissa, and the power of 2 given by the exponent. For example, we have \(6.75=+1.1011\times 2^2\). Thus, we can conclude that the range of the representation is between \(10^{-38}\) and \(3\times 10^{38}\) (you can add the sign freely, though).
  • +
  • float16: 16 bits. We have 1 bit for the sign, 5 bits for the exponent and 10 bits for the mantissa. The range of the representation is between \(6\times 10^{-8}\) and \(6\times 10^{4}\).
  • +
  • bfloat16: 16 bits. We have 1 bit for the sign, 8 bits for the exponent and 7 bits for the mantissa. The range of the representation is between \(10^{-38}\) and \(3\times 10^{38}\).
  • +
+

We can see that float16 and bfloat16 take up the same memory space, but they differ in how the bits are allocated. float16 has better precision than bfloat16, while bfloat16 has a wider range than float16. For deep neural networks, we may prefer the bfloat16 type, since range matters more than precision there. The common quantization types are INT8 and INT4. Note that INT8 and INT4 can only represent integers, not floating-point numbers. Thus, INT8 can only represent the numbers between \(-128\) and \(127\), and INT4 can only represent the numbers between \(-8\) and \(7\).

+
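To see this range-versus-precision trade-off in action, here is a tiny PyTorch sketch; the specific numbers are chosen only to trigger the effects described above:

```python
import torch

# Range: 70000 overflows float16 (max ~65504) but fits comfortably in bfloat16.
x = torch.tensor(70000.0)
print(x.to(torch.float16))    # inf
print(x.to(torch.bfloat16))   # ~70144 (representable, just coarsely rounded)

# Precision: bfloat16 keeps only 7 mantissa bits, so a small increment is lost.
y = torch.tensor(1.0 + 1e-3)
print(y.to(torch.float16))    # ~1.0010
print(y.to(torch.bfloat16))   # 1.0
```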

We use the affine quantization scheme to convert the model:

+

\[ x_q = \operatorname{round}\left(x/S + Z\right) \]

+

where \(x_q\) is the quantized value, \(x\) is the original value, \(S\) is the scale factor, \(Z\) is the zero point, and \(\operatorname{round}\) is the rounding function.

+

Usually, we quantize the model in multiple blocks, which means we need multiple scale factors and zero points. Note that not all layers are quantized; for some important layers, we still keep the float32 type.

+
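Putting the formula and the block idea together, here is a minimal NumPy sketch of block-wise affine INT8 quantization; the block size of 64 and the function names are illustrative assumptions, not a reference to any particular library:

```python
import numpy as np

def quantize_blockwise(x, block_size=64, qmin=-128, qmax=127):
    """Affine INT8 quantization with one scale and zero point per block.
    Assumes x.size is a multiple of block_size (illustrative simplification)."""
    blocks = x.reshape(-1, block_size)
    x_min = blocks.min(axis=1, keepdims=True)
    x_max = blocks.max(axis=1, keepdims=True)
    scale = (x_max - x_min) / (qmax - qmin)
    scale = np.where(scale == 0, 1.0, scale)          # guard against constant blocks
    zero_point = np.round(qmin - x_min / scale)
    x_q = np.clip(np.round(blocks / scale + zero_point), qmin, qmax).astype(np.int8)
    return x_q, scale, zero_point

def dequantize_blockwise(x_q, scale, zero_point):
    return (x_q.astype(np.float32) - zero_point) * scale

# Quantize a toy weight matrix and check the reconstruction error.
rng = np.random.default_rng(0)
w = rng.normal(size=(4, 64)).astype(np.float32)
w_q, s, z = quantize_blockwise(w)
w_hat = dequantize_blockwise(w_q, s, z).reshape(w.shape)
print(np.abs(w - w_hat).max())   # on the order of half a quantization step
```

Each block stores its own \(S\) and \(Z\), so an outlier in one block does not stretch the scale used for the rest of the weights.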

For LLM quantization, we have two different methods: post-training quantization and quantization-aware training. If we ultimately deploy the quantized model, quantization-aware training generally gives better accuracy.

+
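The core trick behind quantization-aware training is fake quantization with a straight-through estimator: the forward pass sees the quantize-dequantize error, while gradients bypass the non-differentiable rounding. A minimal PyTorch sketch, with the scale and zero point fixed arbitrarily for illustration:

```python
import torch

def fake_quantize(x, scale, zero_point, qmin=-128, qmax=127):
    """Quantize-dequantize in the forward pass; the straight-through estimator
    lets gradients skip the non-differentiable round()."""
    q = torch.clamp(torch.round(x / scale + zero_point), qmin, qmax)
    x_dq = (q - zero_point) * scale
    return x + (x_dq - x).detach()    # forward: x_dq, backward: identity w.r.t. x

# The quantization error shows up in the loss, yet gradients still reach the weights.
w = torch.randn(8, requires_grad=True)
loss = fake_quantize(w, scale=0.05, zero_point=0.0).pow(2).sum()
loss.backward()
print(w.grad)
```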
+

Existing solutions

+

We can use the quantization utilities provided in Hugging Face Transformers. For more fundamental optimization, we should consider GGML (GPT-Generated Model Language) and GGUF (GPT-Generated Unified Format). For on-device deployment, we should consider GGUF, since it is more efficient; refer to its GitHub repository to use it. We can also consider another library called ollama, which is built on top of llama.cpp.

+ + +
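For reference, one possible way to load a model in 4 bits through the Transformers + bitsandbytes stack is sketched below. The model id is a placeholder, a CUDA GPU and the bitsandbytes package are assumed, and exact argument names may vary across library versions:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "your-org/your-model"            # placeholder checkpoint name

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # store the weights in 4-bit blocks
    bnb_4bit_compute_dtype=torch.bfloat16,  # run matmuls in bfloat16, per the range argument above
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)

inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=16)[0]))
```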
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/notes/Large Language Model/llm_eval.html b/notes/Large Language Model/llm_eval.html index 980bba6..1813f85 100644 --- a/notes/Large Language Model/llm_eval.html +++ b/notes/Large Language Model/llm_eval.html @@ -180,6 +180,12 @@

Large language model evaluation