
Built site for gh-pages
alexchen4ai committed Apr 20, 2024
1 parent d2d0327 commit 5736a6c
Showing 16 changed files with 1,725 additions and 92 deletions.
2 changes: 1 addition & 1 deletion .nojekyll
Original file line number Diff line number Diff line change
@@ -1 +1 @@
61f03ba7
cc4d3b70
Binary file modified images/profile.jpeg
15 changes: 8 additions & 7 deletions listings.json
@@ -1,7 +1,15 @@
[
{
"listing": "/news.html",
"items": [
"/news/Research news/researchnews.html",
"/news/Personal thoughts/myread.html"
]
},
{
"listing": "/notes.html",
"items": [
"/notes/Large Language Model/inference_optimize.html",
"/notes/Math Theories/ml_optimizer.html",
"/notes/Large Language Model/llm_train.html",
"/notes/Math Theories/complexanalysis.html",
@@ -10,12 +18,5 @@
"/notes/Diffusion Model/sd.html",
"/notes/Large Language Model/rl_llm.html"
]
},
{
"listing": "/news.html",
"items": [
"/news/Research news/researchnews.html",
"/news/Personal thoughts/myread.html"
]
}
]
44 changes: 37 additions & 7 deletions notes.html
Expand Up @@ -220,7 +220,37 @@ <h1 class="title">Research notes</h1>

<div class="quarto-listing quarto-listing-container-default" id="listing-listing">
<div class="list quarto-listing-default">
<div class="quarto-post image-right" data-index="0" data-categories="Math Theories" data-listing-date-sort="1710486000000" data-listing-file-modified-sort="1710739348216" data-listing-date-modified-sort="NaN" data-listing-reading-time-sort="13" data-listing-word-count-sort="2402">
<div class="quarto-post image-right" data-index="0" data-categories="Large Language Models" data-listing-date-sort="1713596400000" data-listing-file-modified-sort="1713637598695" data-listing-date-modified-sort="NaN" data-listing-reading-time-sort="4" data-listing-word-count-sort="734">
<div class="thumbnail">
<p><a href="./notes/Large Language Model/inference_optimize.html" class="no-external"></a></p><a href="./notes/Large Language Model/inference_optimize.html" class="no-external">
<div class="listing-item-img-placeholder card-img-top" >&nbsp;</div>
</a><p><a href="./notes/Large Language Model/inference_optimize.html" class="no-external"></a></p>
</div>
<div class="body">
<h3 class="no-anchor listing-title">
<a href="./notes/Large Language Model/inference_optimize.html" class="no-external">The inference optimization</a>
</h3>
<div class="listing-subtitle">
<a href="./notes/Large Language Model/inference_optimize.html" class="no-external"></a>
</div>
<div class="listing-categories">
<div class="listing-category" onclick="window.quartoListingCategory('Large Language Models'); return false;">
Large Language Models
</div>
</div>
<div class="listing-description">
<a href="./notes/Large Language Model/inference_optimize.html" class="no-external">To run the language model faster and especially on the edge devices, we need to optimize the model. This…</a>
</div>
</div>
<div class="metadata">
<p><a href="./notes/Large Language Model/inference_optimize.html" class="no-external"></a></p><a href="./notes/Large Language Model/inference_optimize.html" class="no-external">
<div class="listing-reading-time">
4 min
</div>
</a>
</div>
</div>
<div class="quarto-post image-right" data-index="1" data-categories="Math Theories" data-listing-date-sort="1710486000000" data-listing-file-modified-sort="1710739348216" data-listing-date-modified-sort="NaN" data-listing-reading-time-sort="13" data-listing-word-count-sort="2402">
<div class="thumbnail">
<p><a href="./notes/Math Theories/ml_optimizer.html" class="no-external"></a></p><a href="./notes/Math Theories/ml_optimizer.html" class="no-external">
<div class="listing-item-img-placeholder card-img-top" >&nbsp;</div>
@@ -250,7 +280,7 @@ <h3 class="no-anchor listing-title">
</a>
</div>
</div>
<div class="quarto-post image-right" data-index="1" data-categories="Large Language Models" data-listing-date-sort="1709366400000" data-listing-file-modified-sort="1709442246902" data-listing-date-modified-sort="NaN" data-listing-reading-time-sort="5" data-listing-word-count-sort="911">
<div class="quarto-post image-right" data-index="2" data-categories="Large Language Models" data-listing-date-sort="1709366400000" data-listing-file-modified-sort="1709442246902" data-listing-date-modified-sort="NaN" data-listing-reading-time-sort="5" data-listing-word-count-sort="911">
<div class="thumbnail">
<p><a href="./notes/Large Language Model/llm_train.html" class="no-external"></a></p><a href="./notes/Large Language Model/llm_train.html" class="no-external">
<div class="listing-item-img-placeholder card-img-top" >&nbsp;</div>
@@ -280,7 +310,7 @@ <h3 class="no-anchor listing-title">
</a>
</div>
</div>
<div class="quarto-post image-right" data-index="2" data-categories="Math Theories" data-listing-date-sort="1708848000000" data-listing-file-modified-sort="1710200821871" data-listing-date-modified-sort="NaN" data-listing-reading-time-sort="5" data-listing-word-count-sort="849">
<div class="quarto-post image-right" data-index="3" data-categories="Math Theories" data-listing-date-sort="1708848000000" data-listing-file-modified-sort="1710200821871" data-listing-date-modified-sort="NaN" data-listing-reading-time-sort="5" data-listing-word-count-sort="849">
<div class="thumbnail">
<p><a href="./notes/Math Theories/complexanalysis.html" class="no-external"></a></p><a href="./notes/Math Theories/complexanalysis.html" class="no-external">
<div class="listing-item-img-placeholder card-img-top" >&nbsp;</div>
@@ -310,7 +340,7 @@ <h3 class="no-anchor listing-title">
</a>
</div>
</div>
<div class="quarto-post image-right" data-index="3" data-categories="Large Language Models" data-listing-date-sort="1708588800000" data-listing-file-modified-sort="1709437533641" data-listing-date-modified-sort="NaN" data-listing-reading-time-sort="11" data-listing-word-count-sort="2127">
<div class="quarto-post image-right" data-index="4" data-categories="Large Language Models" data-listing-date-sort="1708588800000" data-listing-file-modified-sort="1709437533641" data-listing-date-modified-sort="NaN" data-listing-reading-time-sort="11" data-listing-word-count-sort="2127">
<div class="thumbnail">
<p><a href="./notes/Large Language Model/llm_eval.html" class="no-external"></a></p><a href="./notes/Large Language Model/llm_eval.html" class="no-external">
<p class="card-img-top"><img src="images/LLM_eval.png" class="thumbnail-image card-img"/></p>
@@ -340,7 +370,7 @@ <h3 class="no-anchor listing-title">
</a>
</div>
</div>
<div class="quarto-post image-right" data-index="4" data-categories="Large Language Models" data-listing-date-sort="1708502400000" data-listing-file-modified-sort="1708927783466" data-listing-date-modified-sort="NaN" data-listing-reading-time-sort="5" data-listing-word-count-sort="942">
<div class="quarto-post image-right" data-index="5" data-categories="Large Language Models" data-listing-date-sort="1708502400000" data-listing-file-modified-sort="1708927783466" data-listing-date-modified-sort="NaN" data-listing-reading-time-sort="5" data-listing-word-count-sort="942">
<div class="thumbnail">
<p><a href="./notes/Large Language Model/moe.html" class="no-external"></a></p><a href="./notes/Large Language Model/moe.html" class="no-external">
<p class="card-img-top"><img src="images/llama2.png" class="thumbnail-image card-img"/></p>
@@ -370,7 +400,7 @@ <h3 class="no-anchor listing-title">
</a>
</div>
</div>
<div class="quarto-post image-right" data-index="5" data-categories="Diffusion Model" data-listing-date-sort="1708329600000" data-listing-file-modified-sort="1708330283513" data-listing-date-modified-sort="NaN" data-listing-reading-time-sort="1" data-listing-word-count-sort="13">
<div class="quarto-post image-right" data-index="6" data-categories="Diffusion Model" data-listing-date-sort="1708329600000" data-listing-file-modified-sort="1713632215422" data-listing-date-modified-sort="NaN" data-listing-reading-time-sort="1" data-listing-word-count-sort="14">
<div class="thumbnail">
<p><a href="./notes/Diffusion Model/sd.html" class="no-external"></a></p><a href="./notes/Diffusion Model/sd.html" class="no-external">
<div class="listing-item-img-placeholder card-img-top" >&nbsp;</div>
@@ -400,7 +430,7 @@ <h3 class="no-anchor listing-title">
</a>
</div>
</div>
<div class="quarto-post image-right" data-index="6" data-categories="Large Language Models" data-listing-date-sort="1707984000000" data-listing-file-modified-sort="1708587186166" data-listing-date-modified-sort="NaN" data-listing-reading-time-sort="19" data-listing-word-count-sort="3751">
<div class="quarto-post image-right" data-index="7" data-categories="Large Language Models" data-listing-date-sort="1707984000000" data-listing-file-modified-sort="1708587186166" data-listing-date-modified-sort="NaN" data-listing-reading-time-sort="19" data-listing-word-count-sort="3751">
<div class="thumbnail">
<p><a href="./notes/Large Language Model/rl_llm.html" class="no-external"></a></p><a href="./notes/Large Language Model/rl_llm.html" class="no-external">
<p class="card-img-top"><img src="images/RL_basic.png" class="thumbnail-image card-img"/></p>
80 changes: 79 additions & 1 deletion notes.xml
@@ -10,7 +10,81 @@
<atom:link href="https://alexchen4ai.github.io/blog/notes.xml" rel="self" type="application/rss+xml"/>
<description>Personal summaries and insights gathered from reading various research papers and articles.</description>
<generator>quarto-1.4.549</generator>
<lastBuildDate>Fri, 15 Mar 2024 07:00:00 GMT</lastBuildDate>
<lastBuildDate>Sat, 20 Apr 2024 07:00:00 GMT</lastBuildDate>
<item>
<title>The inference optimization</title>
<dc:creator>Alex Chen</dc:creator>
<link>https://alexchen4ai.github.io/blog/notes/Large Language Model/inference_optimize.html</link>
<description><![CDATA[
<div class="callout callout-style-default callout-tip callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
Tip
</div>
</div>
<div class="callout-body-container callout-body">
<p>To run a language model faster, especially on edge devices, we need to optimize it. In this blog, I introduce the main optimization methods and the existing solutions that make language models run faster, with a focus on on-device language models.</p>
</div>
</div>
<section id="overview-of-the-optimization" class="level2">
<h2 class="anchored" data-anchor-id="overview-of-the-optimization">Overview of the optimization</h2>
<p>To optimize inference for a language model, we mainly have the following methods:</p>
<ol type="1">
<li><strong>Quantization</strong>: reducing the precision of the model’s weights and activations. This shrinks the memory footprint and speeds up inference.</li>
<li><strong>Pruning</strong>: removing weights that are close to zero. This reduces the number of parameters and speeds up inference.</li>
<li><strong>Lower-level implementation</strong>: implementing the model in a lower-level language such as C++ or Rust can speed up inference.</li>
<li><strong>KV cache</strong>: the key-value cache stores the intermediate attention keys and values so they are not recomputed at every decoding step. On certain devices, the KV cache may need dedicated support.</li>
<li><strong>Hardware-based optimization</strong>: techniques such as FlashAttention for NVIDIA GPUs exploit the hardware’s memory-access patterns to speed up the model.</li>
</ol>
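<p>As a side note, the KV-cache idea in item 4 can be sketched in a few lines of numpy. This is a toy single-head example for illustration only, not any particular library’s implementation:</p>

```python
import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

class KVCache:
    """Toy single-head KV cache: past keys/values are stored so each
    decode step only contributes one new key/value instead of
    recomputing the whole prefix."""
    def __init__(self, d):
        self.keys = np.zeros((0, d))
        self.values = np.zeros((0, d))

    def step(self, q, k, v):
        # Append this token's key/value, then attend over the full cache.
        self.keys = np.vstack([self.keys, k[None, :]])
        self.values = np.vstack([self.values, v[None, :]])
        scores = self.keys @ q / np.sqrt(q.shape[-1])   # shape (t,)
        return softmax(scores) @ self.values            # shape (d,)

rng = np.random.default_rng(0)
d, T = 8, 5
Q, K, V = rng.normal(size=(3, T, d))

cache = KVCache(d)
cached_out = [cache.step(Q[t], K[t], V[t]) for t in range(T)]

# Sanity check: causal attention recomputed from scratch at the last
# step matches the incrementally cached result.
scores = K @ Q[-1] / np.sqrt(d)
full_out = softmax(scores) @ V
assert np.allclose(cached_out[-1], full_out)
```

<p>The point of the cache is the cost model: with it, each decoding step does O(t) attention work against stored keys instead of reprojecting all t prefix tokens.</p>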
</section>
<section id="quantization" class="level2">
<h2 class="anchored" data-anchor-id="quantization">Quantization</h2>
<div class="callout callout-style-default callout-tip callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
Tip
</div>
</div>
<div class="callout-body-container callout-body">
<p>Quantization is a model compression technique that converts the weights and activations of an LLM from a high-precision data representation to a lower-precision one, i.e., from a data type that can hold more information to one that holds less. A typical example is converting 32-bit floating-point numbers (FP32) to 8-bit or 4-bit integers (INT8 or INT4). A good blog post on this topic is <a href="https://symbl.ai/developers/blog/a-guide-to-quantization-in-llms/">here</a>.</p>
</div>
</div>
<p>Let’s first revisit how data is represented in computers. We mainly study the <code>float32</code>, <code>float16</code> and <code>bfloat16</code> types.</p>
<ul>
<li><strong>float32</strong>: 32 bits: 1 bit for the sign, 8 bits for the exponent and 23 bits for the mantissa. A float is assembled from the sign, the mantissa and a power-of-two exponent; for example, <img src="https://latex.codecogs.com/png.latex?6.75=+1.1011%5Ctimes%202%5E2">. The representable magnitudes therefore range roughly from <img src="https://latex.codecogs.com/png.latex?1e%5E%7B-38%7D"> to <img src="https://latex.codecogs.com/png.latex?3e%5E%7B38%7D"> (either sign is allowed).</li>
<li><strong>float16</strong>: 16 bits: 1 bit for the sign, 5 bits for the exponent and 10 bits for the mantissa. The representable magnitudes range roughly from <img src="https://latex.codecogs.com/png.latex?6e%5E%7B-8%7D"> to <img src="https://latex.codecogs.com/png.latex?6e%5E%7B4%7D">.</li>
<li><strong>bfloat16</strong>: 16 bits: 1 bit for the sign, 8 bits for the exponent and 7 bits for the mantissa. The representable magnitudes range roughly from <img src="https://latex.codecogs.com/png.latex?1e%5E%7B-38%7D"> to <img src="https://latex.codecogs.com/png.latex?3e%5E%7B38%7D">.</li>
</ul>
<p>We can see that <code>float16</code> and <code>bfloat16</code> occupy the same amount of memory but allocate their bits differently: <code>float16</code> has finer precision, while <code>bfloat16</code> has a wider range. For deep neural networks, <code>bfloat16</code> is often preferred, since range matters more than precision there. The common quantization types are <code>INT8</code> and <code>INT4</code>. Note that these types represent only integers, not floating-point numbers: <code>INT8</code> covers the integers from <img src="https://latex.codecogs.com/png.latex?-128"> to <img src="https://latex.codecogs.com/png.latex?127">, and <code>INT4</code> covers <img src="https://latex.codecogs.com/png.latex?-8"> to <img src="https://latex.codecogs.com/png.latex?7">.</p>
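<p>The range-versus-precision trade-off can be checked directly with numpy. Note that numpy ships <code>float16</code> but not <code>bfloat16</code>; a third-party package such as <code>ml_dtypes</code> would be needed for the latter, so this sketch only contrasts <code>float16</code> with <code>float32</code>:</p>

```python
import numpy as np

# float16 overflows where float32 (and bfloat16, with its 8 exponent
# bits) would not: 70000 exceeds float16's maximum of 65504.
print(np.float16(70000.0))        # inf
print(np.float32(70000.0))        # 70000.0
print(np.finfo(np.float16).max)   # 65504.0

# The other side of the trade-off: float16's 10 mantissa bits give a
# smaller machine epsilon (step size near 1.0) than bfloat16's 7 bits.
print(np.finfo(np.float16).eps)   # 2**-10 ~ 0.000977
print(np.finfo(np.float32).eps)   # 2**-23 ~ 1.19e-07
```
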
<p>We use the <em>affine quantization scheme</em> to convert the model:</p>
<p><img src="https://latex.codecogs.com/png.latex?%0Ax_q%20=%20%5Coperatorname%7Bround%7D%5Cleft(x/S%20+%20Z%5Cright)%0A"></p>
<p>where:</p>
<ul>
<li><img src="https://latex.codecogs.com/png.latex?x_q">: the quantized value</li>
<li><img src="https://latex.codecogs.com/png.latex?x">: the original value</li>
<li><img src="https://latex.codecogs.com/png.latex?S">: the scale factor</li>
<li><img src="https://latex.codecogs.com/png.latex?Z">: the zero point</li>
<li><img src="https://latex.codecogs.com/png.latex?%5Coperatorname%7Bround%7D">: the rounding function</li>
</ul>
<p>In practice, the model is quantized in blocks, each block with its own scale factor and zero point. Note that not all layers are quantized: some sensitive layers are kept in <code>float32</code>.</p>
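<p>A minimal numpy sketch of the affine scheme above (per-tensor rather than per-block, for brevity; real quantizers add block structure and calibration on top of this):</p>

```python
import numpy as np

def affine_quantize(x, n_bits=8):
    """Asymmetric affine quantization: x_q = round(x / S + Z), with S
    and Z chosen so [x.min(), x.max()] maps onto the integer range."""
    qmin, qmax = -2 ** (n_bits - 1), 2 ** (n_bits - 1) - 1   # e.g. -128..127
    S = (x.max() - x.min()) / (qmax - qmin)                  # scale factor
    Z = round(qmin - x.min() / S)                            # zero point
    x_q = np.clip(np.round(x / S + Z), qmin, qmax).astype(np.int8)
    return x_q, S, Z

def dequantize(x_q, S, Z):
    return S * (x_q.astype(np.float32) - Z)

rng = np.random.default_rng(0)
w = rng.normal(scale=0.5, size=256).astype(np.float32)
w_q, S, Z = affine_quantize(w)
err = np.abs(w - dequantize(w_q, S, Z)).max()
print(w_q.dtype, err)   # dtype int8; max reconstruction error on the order of S
```

<p>The quantize-dequantize round trip makes the cost visible: the reconstruction error is bounded by the scale factor, which is why per-block scales (smaller ranges per block, hence smaller <code>S</code>) reduce the error.</p>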
<p>For LLM quantization, there are two approaches: post-training quantization (PTQ) and quantization-aware training (QAT). If the quantized model is what will actually be deployed, QAT generally gives better accuracy, at the cost of extra training.</p>
<section id="exisiting-solutions" class="level3">
<h3 class="anchored" data-anchor-id="exisiting-solutions">Exisiting solutions</h3>
<p>We can use the quantization libraries provided in Hugging Face Transformers. For more fundamental optimization, consider <code>GGML</code> (GPT-Generated Model Language) and <code>GGUF</code> (GPT-Generated Unified Format). For on-device deployment, <code>GGUF</code> is preferable since it is more efficient; refer to the <a href="https://github.com/ggerganov/llama.cpp">llama.cpp repository</a> for usage. Another option is <a href="https://github.com/ollama/ollama">ollama</a>, which is built on top of llama.cpp.</p>
</section>
</section>
]]></description>
<category>Large Language Models</category>
<guid>https://alexchen4ai.github.io/blog/notes/Large Language Model/inference_optimize.html</guid>
<pubDate>Sat, 20 Apr 2024 07:00:00 GMT</pubDate>
</item>
<item>
<title>Optimization in machine learning</title>
<dc:creator>Alex Chen</dc:creator>
@@ -1473,8 +1547,12 @@
</div>
</div>
<p>📝 <strong>Paper</strong>: <a href="https://arxiv.org/abs/2212.09748">https://arxiv.org/abs/2212.09748</a></p>
<section id="introduction" class="level2">
<h2 class="anchored" data-anchor-id="introduction">Introduction</h2>
<p>Coming soon</p>
</section>
]]></description>
<category>Diffusion Model</category>
10 changes: 10 additions & 0 deletions notes/Diffusion Model/sd.html
@@ -180,6 +180,12 @@ <h1 class="title d-none d-lg-block">Scalable diffusion models with transformers<
<ul id="quarto-sidebar-section-1" class="collapse list-unstyled sidebar-section depth1 show">
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../notes/Large Language Model/inference_optimize.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">The inference optimization</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../notes/Large Language Model/llm_eval.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Large language model evaluation</span></a>
</div>
@@ -272,8 +278,12 @@ <h1 class="title d-none d-lg-block">Scalable diffusion models with transformers<
</div>
</div>
<p>📝 <strong>Paper</strong>: <a href="https://arxiv.org/abs/2212.09748">https://arxiv.org/abs/2212.09748</a></p>
<section id="introduction" class="level2">
<h2 class="anchored" data-anchor-id="introduction">Introduction</h2>
<p>Coming soon</p>


</section>

</main> <!-- /main -->
<script id="quarto-html-after-body" type="application/javascript">
