From a4f7dbf4f62a72aaf5b40895e23fb12d776c4e79 Mon Sep 17 00:00:00 2001
From: Vyacheslav Bazhenov <v.bazhenov@partner.samsung.com>
Date: Mon, 15 Jul 2024 11:57:53 +0300
Subject: [PATCH] [luci] Introduce Compress weights pass

This commit introduces CompressWeightsPass for Conv2D

ONE-DCO-1.0-Signed-off-by: Vyacheslav Bazhenov <slavikmipt@gmail.com>
---
 compiler/circle2circle/src/Circle2Circle.cpp  |   3 +
 .../include/luci_interpreter/core/Tensor.h    |  15 +
 .../pal/linux/HuffmanDecoder.h                | 357 +++++++++++++++++
 .../luci-interpreter/pal/linux/PALConv2d.h    | 135 ++++++-
 .../src/SimpleMemoryManager.cpp               |  17 +-
 compiler/luci-interpreter/src/core/Tensor.cpp |  26 +-
 .../luci-interpreter/src/kernels/Conv2D.cpp   | 147 ++++++-
 .../luci-interpreter/src/kernels/Conv2D.h     |   2 +
 compiler/luci-interpreter/src/kernels/Utils.h |   3 +-
 .../src/loader/GraphLoader.cpp                |   3 +
 compiler/luci-pass-value-py-test/test.lst     |   1 +
 .../luci/export/src/CircleExporterUtils.cpp   |  15 +
 .../luci/export/src/CircleExporterUtils.h     |   1 +
 .../luci/export/src/CircleTensorExporter.cpp  |  28 +-
 .../include/luci/Import/CircleImporterUtils.h |  33 ++
 .../luci/import/src/CircleImporterUtils.cpp   |  38 ++
 compiler/luci/import/src/CircleReader.cpp     |   6 +
 .../luci/import/src/Nodes/CircleConst.cpp     |   2 +-
 .../luci/import/src/Nodes/CircleConv2D.cpp    |   1 +
 .../include/luci/IR/AttrWeightCompression.h   |  33 ++
 .../lang/include/luci/IR/Nodes/CircleConst.h  |   5 +
 compiler/luci/lang/src/Nodes/CircleConst.cpp  |   4 +
 .../luci/pass/include/luci/CircleOptimizer.h  |   1 +
 .../include/luci/Pass/CompressWeightsPass.h   |  39 ++
 compiler/luci/pass/src/CircleOptimizer.cpp    |   8 +-
 .../luci/pass/src/CompressWeightsPass.cpp     | 109 ++++++
 .../luci/pass/src/helpers/HuffmanCommon.h     |  48 +++
 .../luci/pass/src/helpers/HuffmanDecoder.h    | 362 ++++++++++++++++++
 .../luci/pass/src/helpers/HuffmanEncoder.cpp  | 172 +++++++++
 .../luci/pass/src/helpers/HuffmanEncoder.h    |  67 ++++
 .../pass/src/helpers/HuffmanEncoder.test.cpp  |  63 +++
 31 files changed, 1720 insertions(+), 24 deletions(-)
 create mode 100644 compiler/luci-interpreter/pal/linux/HuffmanDecoder.h
 create mode 100644 compiler/luci/import/include/luci/Import/CircleImporterUtils.h
 create mode 100644 compiler/luci/import/src/CircleImporterUtils.cpp
 create mode 100644 compiler/luci/lang/include/luci/IR/AttrWeightCompression.h
 create mode 100644 compiler/luci/pass/include/luci/Pass/CompressWeightsPass.h
 create mode 100644 compiler/luci/pass/src/CompressWeightsPass.cpp
 create mode 100644 compiler/luci/pass/src/helpers/HuffmanCommon.h
 create mode 100644 compiler/luci/pass/src/helpers/HuffmanDecoder.h
 create mode 100644 compiler/luci/pass/src/helpers/HuffmanEncoder.cpp
 create mode 100644 compiler/luci/pass/src/helpers/HuffmanEncoder.h
 create mode 100644 compiler/luci/pass/src/helpers/HuffmanEncoder.test.cpp

diff --git a/compiler/circle2circle/src/Circle2Circle.cpp b/compiler/circle2circle/src/Circle2Circle.cpp
index c99ec8b69a3..adb22b1d79b 100644
--- a/compiler/circle2circle/src/Circle2Circle.cpp
+++ b/compiler/circle2circle/src/Circle2Circle.cpp
@@ -193,6 +193,8 @@ int entry(int argc, char **argv)
              "This will convert single input Transpose to Reshape");
   add_switch(arser, "--expand_broadcast_const", "This will expand broadcastable constant inputs");
   add_switch(arser, "--unroll_unidirseqlstm", "Unroll UnidirectionalSequenceLSTM operator.");
+  add_switch(arser, "--compress_weights_huffman",
+             "Lossless weights compression with Huffman encoding.");
   add_switch(arser, "--convert_nchw_to_nhwc",
              "Experimental: This will convert NCHW operators to NHWC under the assumption that "
              "input model is NCHW.");
@@ -343,6 +345,7 @@ int entry(int argc, char **argv)
   option_str_to_enum["decompose_softmax"] = Algorithms::DecomposeSoftmaxPass;
   option_str_to_enum["expand_broadcast_const"] = Algorithms::ExpandBroadcastConst;
   option_str_to_enum["unroll_unidirseqlstm"] = Algorithms::UnrollUnidirSeqLSTM;
+  option_str_to_enum["compress_weights_huffman"] = Algorithms::CompressWeightsHuffman;
   // clang-format on
 
   if (arser.get<bool>("--verbose"))
diff --git a/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h b/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h
index f118ee22c24..91ca85380c4 100644
--- a/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h
+++ b/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h
@@ -18,6 +18,7 @@
 #define LUCI_INTERPRETER_CORE_TENSOR_H
 
 #include "luci_interpreter/core/DataType.h"
+#include <luci/IR/AttrWeightCompression.h>
 
 #include <cassert>
 #include <cstddef>
@@ -146,6 +147,8 @@ class Tensor
 
   void resize(const Shape &new_shape);
 
+  void resize(const Shape &new_shape, size_t raw_size);
+
   void set_data_buffer(uint8_t *buffer)
   {
     if (buffer == nullptr)
@@ -173,11 +176,21 @@ class Tensor
 
   void set_offset(int32_t offset) { _offset = offset; }
 
+  luci::CompressionType get_compression() const { return _compression; }
+
+  void set_compression(luci::CompressionType compression) { _compression = compression; }
+
+  size_t get_raw_size(void) const { return _raw_size; }
+  void set_raw_size(size_t size) { _raw_size = size; }
+
 private:
   DataType _element_type;
   Shape _shape;
   AffineQuantization _quantization;
   uint8_t *_data = nullptr;
+  // Used for compressed/sparsed tensors when size != WxHxLxD
+  size_t _raw_size{0};
+
   std::string _name;
   bool _data_allocated = false;
   // Write of tensor is reported to registered Observers only if this tensor is observable
@@ -190,6 +203,8 @@ class Tensor
   // Used by static memory manager.
   // Stores the offset from the beginning of the allocated memory buffer.
   int32_t _offset = -1;
+
+  luci::CompressionType _compression{luci::CompressionType::NONE};
 };
 
 } // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/pal/linux/HuffmanDecoder.h b/compiler/luci-interpreter/pal/linux/HuffmanDecoder.h
new file mode 100644
index 00000000000..6a8dd712b7c
--- /dev/null
+++ b/compiler/luci-interpreter/pal/linux/HuffmanDecoder.h
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_INTERPRETER_PAL_HUFFMAN_DECODER_H__
+#define __LUCI_INTERPRETER_PAL_HUFFMAN_DECODER_H__
+
+#include <iostream>
+#include <unordered_map>
+#include <vector>
+#include <tuple>
+#include <queue>
+#include <string>
+#include <bitset>
+#include <climits>
+
+namespace luci_interpreter_pal
+{
+
+namespace huffman
+{
+template <typename T> struct Node // Node of the binary Huffman tree rebuilt by HuffmanDecoder
+{
+  Node *p_left = nullptr;  // child followed on a 0 bit; null together with p_right for a leaf
+  Node *p_right = nullptr; // child followed on a 1 bit
+  T data;                  // decoded symbol; meaningful for leaf nodes only
+  unsigned int freq;       // always set to 0 by the decoder in this file
+};
+
+template <typename T> class HuffmanDecoder
+{
+private:
+  Node<T> *root = nullptr;
+  std::unordered_map<T, std::string> huffmanCode;
+  std::vector<bool> encoded_bitset{};
+  std::size_t nodes_count = 0;
+
+private:
+  Node<T> *allocateNode(T data, unsigned int freq, Node<T> *p_left, Node<T> *p_right)
+  { // Creates a tree node on the heap, fills in its fields and bumps nodes_count.
+    Node<T> *node = new Node<T>; // NOTE(review): no matching delete is visible in this class
+    node->data = data;
+    node->freq = freq;
+    node->p_left = p_left;
+    node->p_right = p_right;
+    nodes_count++; // running total of nodes created by this decoder
+    return node;
+  }
+
+  std::string exportHuffmanTreeToString(Node<T> *node)
+  { // Serializes the subtree rooted at `node` in pre-order as a string of '0'/'1' characters.
+    if (node == nullptr)
+      return "";
+    if (!node->p_left && !node->p_right)
+    { // leaf: '0' followed by the symbol bits
+      return "0" + std::bitset<sizeof(T) * CHAR_BIT>(node->data).to_string();
+    }
+    std::string tmp = "1"; // internal node: '1', then the left subtree, then the right subtree
+    tmp += exportHuffmanTreeToString(node->p_left);
+    tmp += exportHuffmanTreeToString(node->p_right);
+    return tmp;
+  }
+
+  Node<T> *importHuffmanTreeFromBoolVec(std::vector<bool> &vec, size_t &index)
+  { // Rebuilds the tree from its pre-order bit encoding, advancing `index` past consumed bits.
+    // Stop on an empty or truncated stream instead of reading past the end of `vec`.
+    if (index >= vec.size())
+      return nullptr;
+    if (vec[index++])
+    {
+      // A 1 bit marks an internal node: the left subtree follows, then the right subtree.
+      Node<T> *p_left = importHuffmanTreeFromBoolVec(vec, index);
+      Node<T> *p_right = importHuffmanTreeFromBoolVec(vec, index);
+      return allocateNode(0, 0, p_left, p_right);
+    }
+    // A 0 bit marks a leaf; its symbol follows, most significant bit first.
+    constexpr size_t kSymbolBits = sizeof(T) * CHAR_BIT;
+    if (vec.size() - index < kSymbolBits)
+      return nullptr;
+    T tmp = 0;
+    for (size_t i = 0; i < kSymbolBits; ++i)
+    {
+      if (vec[index++])
+        tmp |= static_cast<T>(T{1} << (kSymbolBits - 1 - i));
+    }
+
+    return allocateNode(tmp, 0, nullptr, nullptr);
+  }
+
+  Node<T> *importHuffmanTreeFromString(std::string &str)
+  { // Rebuilds the tree from a '0'/'1' string, removing the parsed prefix from `str`.
+    if (str.substr(0, 1) == "1")
+    {
+      str = str.substr(1);
+      Node<T> *p_left = importHuffmanTreeFromString(str);
+      Node<T> *p_right = importHuffmanTreeFromString(str);
+      return allocateNode(0, 0, p_left, p_right);
+    }
+    else if (str.substr(0, 1) == "0")
+    {
+      str = str.substr(1);
+      std::bitset<sizeof(T) * CHAR_BIT> tmp(str.substr(0, sizeof(T) * CHAR_BIT));
+      str = str.substr(sizeof(T) * CHAR_BIT);
+      return allocateNode(static_cast<T>(tmp.to_ullong()), 0, nullptr, nullptr);
+    }
+    return nullptr; // empty input or unexpected character: there is no node to build
+  }
+
+  void buildHuffmanTable(Node<T> *node, const std::string str = "")
+  { // Fills huffmanCode with the root-to-leaf path of every symbol ('0' = left, '1' = right).
+    if (node == nullptr)
+      return;
+
+    if (!node->p_left && !node->p_right)
+    {
+      huffmanCode[node->data] = str; // `str` is the path accumulated on the way to this leaf
+    }
+
+    buildHuffmanTable(node->p_left, str + "0");
+    buildHuffmanTable(node->p_right, str + "1");
+  }
+
+  void decode(Node<T> *node, std::string &str, std::vector<T> &out_vec, size_t &index)
+  { // Decodes one symbol: walks down from `node` using the characters of `str` at `index`.
+    if (node == nullptr)
+    {
+      return;
+    }
+
+    if (!node->p_left && !node->p_right)
+    {
+      out_vec.push_back(node->data); // reached a leaf: emit its symbol
+      return;
+    }
+
+    if (str.size() == index) // input exhausted before a leaf was reached: nothing is emitted
+      return;
+    if (str[index] == '0')
+    {
+      decode(node->p_left, str, out_vec, ++index);
+    }
+    else
+    {
+      decode(node->p_right, str, out_vec, ++index);
+    }
+  }
+
+  struct EncodedTreeAndData
+  {
+    std::vector<bool> tree_vec{};
+    std::vector<bool> data_vec{};
+  };
+
+  EncodedTreeAndData unpackArrayToEncodedTreeAndData(const uint8_t *pack_ptr)
+  { // Splits a packed buffer into the tree bit stream and the data bit stream.
+    constexpr auto kTreeSizeBytesN = sizeof(size_t); // header field width follows host size_t
+    constexpr auto kDataSizeBytesN = sizeof(size_t);
+
+    const std::bitset<CHAR_BIT * kTreeSizeBytesN> tree_size_bitset(
+      *static_cast<const size_t *>(static_cast<const void *>(pack_ptr)));
+    const std::bitset<CHAR_BIT * kDataSizeBytesN> data_size_bitset(
+      *static_cast<const size_t *>(static_cast<const void *>(pack_ptr + kTreeSizeBytesN)));
+
+    const size_t kTreeSizeInBits = static_cast<size_t>(tree_size_bitset.to_ullong());
+    const size_t kDataSizeInBits = static_cast<size_t>(data_size_bitset.to_ullong());
+
+    auto start_pos = kTreeSizeBytesN + kDataSizeBytesN; // payload follows the two size fields
+    EncodedTreeAndData tree_and_data;
+
+    const auto kTreeSizeInBytes =
+      kTreeSizeInBits % CHAR_BIT ? kTreeSizeInBits / CHAR_BIT + 1 : kTreeSizeInBits / CHAR_BIT;
+
+    for (size_t i = 0; i < kTreeSizeInBytes; ++i)
+    {
+      const auto kNumOfBits =
+        kTreeSizeInBits - i * CHAR_BIT < CHAR_BIT ? kTreeSizeInBits - i * CHAR_BIT : CHAR_BIT;
+      for (size_t j = 0; j < kNumOfBits; ++j)
+      { // bits are taken most significant first
+        if (*(pack_ptr + start_pos + i) & ((1 << 7) >> j))
+          tree_and_data.tree_vec.push_back(true);
+        else
+          tree_and_data.tree_vec.push_back(false);
+      }
+    }
+    const auto kDataSizeInBytes =
+      kDataSizeInBits % CHAR_BIT ? kDataSizeInBits / CHAR_BIT + 1 : kDataSizeInBits / CHAR_BIT;
+    const auto kOffsetInBits = kTreeSizeInBits % CHAR_BIT; // tree bits used in its last byte
+    start_pos += kOffsetInBits ? kTreeSizeInBytes - 1 : kTreeSizeInBytes; // data may start mid-byte
+
+    for (size_t i = 0; i < kDataSizeInBytes; ++i)
+    {
+      const auto kNumOfBits =
+        kDataSizeInBits - i * CHAR_BIT < CHAR_BIT ? kDataSizeInBits - i * CHAR_BIT : CHAR_BIT;
+      const auto kBitsInFirstByteToRead =
+        kNumOfBits < CHAR_BIT - kOffsetInBits ? kNumOfBits : CHAR_BIT - kOffsetInBits;
+      for (size_t j = kOffsetInBits; j < kOffsetInBits + kBitsInFirstByteToRead; ++j)
+      { // first part of this group: bits after the offset in byte start_pos + i
+
+        if (*(pack_ptr + start_pos + i) & ((1 << 7) >> j))
+          tree_and_data.data_vec.push_back(true);
+        else
+          tree_and_data.data_vec.push_back(false);
+      }
+      if (kNumOfBits < CHAR_BIT - kOffsetInBits) // remaining bits all fit in the current byte
+        break;
+      const auto kBitsLeft = kNumOfBits - (CHAR_BIT - kOffsetInBits) < kOffsetInBits
+                               ? kNumOfBits - (CHAR_BIT - kOffsetInBits)
+                               : kOffsetInBits;
+      for (size_t j = 0; j < kBitsLeft; ++j)
+      { // second part of this group: leading bits of the following byte
+
+        if (*(pack_ptr + start_pos + i + 1) & ((1 << 7) >> j))
+          tree_and_data.data_vec.push_back(true);
+        else
+          tree_and_data.data_vec.push_back(false);
+      }
+    }
+    return tree_and_data;
+  }
+
+  EncodedTreeAndData unpackArrayToEncodedTreeAndData(const std::vector<uint8_t> &packed_vec)
+  { // Splits a packed byte vector into the tree bit stream and the data bit stream.
+    constexpr auto kTreeSizeBytesN = sizeof(size_t); // header field width follows host size_t
+    constexpr auto kDataSizeBytesN = sizeof(size_t);
+    const uint8_t *pack_ptr = packed_vec.data();
+    const std::bitset<CHAR_BIT * kTreeSizeBytesN> tree_size_bitset(
+      *static_cast<const size_t *>(static_cast<const void *>(pack_ptr)));
+    const std::bitset<CHAR_BIT * kDataSizeBytesN> data_size_bitset(
+      *static_cast<const size_t *>(static_cast<const void *>(pack_ptr + kTreeSizeBytesN)));
+
+    const size_t kTreeSizeInBits = static_cast<size_t>(tree_size_bitset.to_ullong());
+    const size_t kDataSizeInBits = static_cast<size_t>(data_size_bitset.to_ullong());
+
+    auto start_pos = kTreeSizeBytesN + kDataSizeBytesN; // payload follows the two size fields
+    EncodedTreeAndData tree_and_data;
+
+    const auto kTreeSizeInBytes =
+      kTreeSizeInBits % CHAR_BIT ? kTreeSizeInBits / CHAR_BIT + 1 : kTreeSizeInBits / CHAR_BIT;
+
+    for (size_t i = 0; i < kTreeSizeInBytes; ++i)
+    {
+      const auto kNumOfBits =
+        kTreeSizeInBits - i * CHAR_BIT < CHAR_BIT ? kTreeSizeInBits - i * CHAR_BIT : CHAR_BIT;
+      for (size_t j = 0; j < kNumOfBits; ++j)
+      { // tree bits go to tree_vec so that data_vec holds the payload bits only
+        if (*(pack_ptr + start_pos + i) & ((1 << 7) >> j))
+          tree_and_data.tree_vec.push_back(true);
+        else
+          tree_and_data.tree_vec.push_back(false);
+      }
+    }
+    const auto kDataSizeInBytes =
+      kDataSizeInBits % CHAR_BIT ? kDataSizeInBits / CHAR_BIT + 1 : kDataSizeInBits / CHAR_BIT;
+    const auto kOffsetInBits = kTreeSizeInBits % CHAR_BIT; // tree bits used in its last byte
+    start_pos += kOffsetInBits ? kTreeSizeInBytes - 1 : kTreeSizeInBytes; // data may start mid-byte
+
+    for (size_t i = 0; i < kDataSizeInBytes; ++i)
+    {
+      const auto kNumOfBits =
+        kDataSizeInBits - i * CHAR_BIT < CHAR_BIT ? kDataSizeInBits - i * CHAR_BIT : CHAR_BIT;
+      const auto kBitsInFirstByteToRead =
+        kNumOfBits < CHAR_BIT - kOffsetInBits ? kNumOfBits : CHAR_BIT - kOffsetInBits;
+      for (size_t j = kOffsetInBits; j < kOffsetInBits + kBitsInFirstByteToRead; ++j)
+      { // first part of this group: bits after the offset in byte start_pos + i
+
+        if (*(pack_ptr + start_pos + i) & ((1 << 7) >> j))
+          tree_and_data.data_vec.push_back(true);
+        else
+          tree_and_data.data_vec.push_back(false);
+      }
+      if (kNumOfBits < CHAR_BIT - kOffsetInBits) // remaining bits all fit in the current byte
+        break;
+      const auto kBitsLeft = kNumOfBits - (CHAR_BIT - kOffsetInBits) < kOffsetInBits
+                               ? kNumOfBits - (CHAR_BIT - kOffsetInBits)
+                               : kOffsetInBits;
+      for (size_t j = 0; j < kBitsLeft; ++j)
+      { // second part of this group: leading bits of the following byte
+
+        if (*(pack_ptr + start_pos + i + 1) & ((1 << 7) >> j))
+          tree_and_data.data_vec.push_back(true);
+        else
+          tree_and_data.data_vec.push_back(false);
+      }
+    }
+    return tree_and_data;
+  }
+
+public:
+  void decode(Node<T> *node, std::vector<bool> &vec, T *dst_ptr)
+  { // Decodes one symbol into *dst_ptr, reading bits of `vec` from the member _decode_idx.
+    if (node == nullptr)
+    {
+      return;
+    }
+
+    if (!node->p_left && !node->p_right)
+    {
+      *dst_ptr = node->data; // reached a leaf: store its symbol
+      return;
+    }
+
+    if (vec.size() == _decode_idx) // bits exhausted before a leaf: *dst_ptr is left untouched
+      return;
+    if (vec[_decode_idx] == false)
+    {
+      ++_decode_idx;
+      decode(node->p_left, vec, dst_ptr); // 0 bit selects the left child
+    }
+    else
+    {
+      ++_decode_idx;
+      decode(node->p_right, vec, dst_ptr); // 1 bit selects the right child
+    }
+  }
+
+private:
+  size_t _decode_idx = 0;
+  EncodedTreeAndData _encoded_tree_and_data;
+
+public:
+  void init_decoder(const uint8_t *input)
+  { // Unpacks `input` and rebuilds the Huffman tree from its tree bits.
+    size_t index = 0; // read position inside tree_vec
+    _encoded_tree_and_data = unpackArrayToEncodedTreeAndData(input);
+    root = importHuffmanTreeFromBoolVec(_encoded_tree_and_data.tree_vec, index);
+  }
+
+  void reset_decode_idx(void) { _decode_idx = 0; }
+
+  int decode_n(uint8_t *dst_ptr, size_t num)
+  { // Decodes up to `num` symbols into dst_ptr, continuing from the current _decode_idx.
+    size_t bytes_decoded = 0;
+    for (size_t i = 0; i < num && _decode_idx < _encoded_tree_and_data.data_vec.size(); ++i)
+    {
+      decode(root, _encoded_tree_and_data.data_vec, dst_ptr + bytes_decoded);
+      bytes_decoded++; // counted once per decode() call, one destination element each
+    }
+    return bytes_decoded; // number of decode() calls made; less than `num` if the bits ran out
+  }
+
+  HuffmanDecoder() = default;
+};
+} // namespace huffman
+} // namespace luci_interpreter_pal
+#endif // __LUCI_INTERPRETER_PAL_HUFFMAN_DECODER_H__
diff --git a/compiler/luci-interpreter/pal/linux/PALConv2d.h b/compiler/luci-interpreter/pal/linux/PALConv2d.h
index 0ce83fc6e35..4d0f3a37774 100644
--- a/compiler/luci-interpreter/pal/linux/PALConv2d.h
+++ b/compiler/luci-interpreter/pal/linux/PALConv2d.h
@@ -19,6 +19,7 @@
 
 #include <tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h>
 #include <tensorflow/lite/kernels/internal/reference/integer_ops/conv.h>
+#include "HuffmanDecoder.h"
 
 namespace luci_interpreter_pal
 {
@@ -84,6 +85,135 @@ static inline void Conv(const tflite::ConvParams &params, const tflite::RuntimeS
                               scratchpad_data, gemmlowp_context.get());
 }
 
+template <typename T>
+void ConvPerChannelHuffman(const tflite::ConvParams &params, const int32_t *mult,
+                           const int32_t *shifts, const tflite::RuntimeShape &input_shape,
+                           const T *input_data, const tflite::RuntimeShape &filter_shape,
+                           const T *filter_data, const tflite::RuntimeShape &bias_shape,
+                           const int32 *bias_data, const tflite::RuntimeShape &output_shape,
+                           T *output_data, const tflite::RuntimeShape &scratchpad_shape,
+                           T *scratchpad_data)
+{
+  // Per-channel quantized Conv2D whose filter_data is a Huffman-packed buffer: the weights of
+  // each output channel are decoded into scratchpad_data (scratchpad_shape.FlatSize() values).
+  // Get parameters.
+  const int32_t input_offset = params.input_offset; // r = s(q - Z)
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int32_t output_offset = params.output_offset;
+  const int32_t filter_offset = params.weights_offset;
+
+  // Set min and max value of the output.
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+
+  // Consistency check.
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = input_shape.Dims(3);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  if (bias_data)
+  {
+    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+  }
+
+  // Check dimensions of the tensors.
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int filter_input_depth = filter_shape.Dims(3);
+  const int groups = input_depth / filter_input_depth;
+  TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0);
+  const int filters_per_group = output_depth / groups;
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+
+  huffman::HuffmanDecoder<uint8_t> decoder;
+  decoder.init_decoder(reinterpret_cast<const uint8_t *>(filter_data));
+  decoder.reset_decode_idx();
+  for (int out_channel = 0; out_channel < output_depth; ++out_channel)
+  {
+    auto group = out_channel / filters_per_group;
+
+    // extract compressed filter
+    decoder.decode_n(reinterpret_cast<uint8_t *>(&scratchpad_data[0]), scratchpad_shape.FlatSize());
+
+    for (int batch = 0; batch < batches; ++batch)
+    {
+      for (int out_y = 0; out_y < output_height; ++out_y)
+      {
+        const int in_y_origin = (out_y * stride_height) - pad_height;
+        for (int out_x = 0; out_x < output_width; ++out_x)
+        {
+          const int in_x_origin = (out_x * stride_width) - pad_width;
+          int32_t acc = 0;
+
+          for (int in_channel = 0; in_channel < filter_input_depth; ++in_channel)
+          {
+            for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+            {
+              const int in_y = in_y_origin + dilation_height_factor * filter_y;
+              for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+              {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+
+                // Zero padding by omitting the areas outside the image.
+                const bool is_point_inside_image =
+                  (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height);
+
+                if (!is_point_inside_image)
+                {
+                  continue;
+                }
+
+                int32_t input_val = input_data[Offset(input_shape, batch, in_y, in_x,
+                                                      in_channel + group * filter_input_depth)];
+                int32_t filter_val =
+                  scratchpad_data[(filter_y * filter_width + filter_x) * filter_input_depth +
+                                  in_channel]; // decoded weights are laid out [H, W, I]
+                // Accumulate with 32 bits accumulator.
+                // In the nudging process during model quantization, we force
+                // real value of 0.0 be represented by a quantized value. This
+                // guarantees that the input_offset is a int8_t, even though
+                // it is represented using int32_t. int32_t += int8_t *
+                // (int8_t - int8_t) so the highest value we can get from each
+                // accumulation is [-127, 127] * ([-128, 127] -
+                // [-128, 127]), which is [-32512, 32512]. log2(32512)
+                // = 14.98, which means we can accumulate at least 2^16
+                // multiplications without overflow. The accumulator is
+                // applied to a filter so the accumulation logic will hold as
+                // long as the filter size (filter_y * filter_x * in_channel)
+                // does not exceed 2^16, which is the case in all the models
+                // we have seen so far.
+                // accumulator depth is smaller than 2^16.
+                acc += (filter_val + filter_offset) * (input_val + input_offset);
+              }
+            }
+          }
+
+          if (bias_data)
+          {
+            acc += bias_data[out_channel];
+          }
+          acc = tflite::MultiplyByQuantizedMultiplier(acc, mult[out_channel], shifts[out_channel]);
+          acc += output_offset;
+          acc = std::max(acc, output_activation_min);
+          acc = std::min(acc, output_activation_max);
+          output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] = static_cast<T>(acc);
+        }
+      }
+    }
+  }
+}
+
 static inline void ConvPerChannel(const tflite::ConvParams &params, const int32_t *mult,
                                   const int32_t *shifts, const tflite::RuntimeShape &input_shape,
                                   const int8 *input_data, const tflite::RuntimeShape &filter_shape,
@@ -105,7 +235,8 @@ static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad,
                                          const tflite::ConvParams &params,
                                          const tflite::RuntimeShape &input_shape,
                                          const tflite::RuntimeShape &filter_shape,
-                                         const tflite::RuntimeShape &output_shape)
+                                         const tflite::RuntimeShape &output_shape,
+                                         bool is_compressed = false)
 {
   const int32_t filter_height = filter_shape.Dims(1);
   const int32_t filter_width = filter_shape.Dims(2);
@@ -117,7 +248,7 @@ static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad,
   const bool need_non_dilated_scratchpad = params.stride_height != 1 || params.stride_width != 1 ||
                                            filter_height != 1 || filter_width != 1;
   auto _need_scratchpad = input_data_type != luci_interpreter::DataType::S16 &&
-                          (need_dilated_scratchpad || need_non_dilated_scratchpad);
+                          (need_dilated_scratchpad || need_non_dilated_scratchpad || is_compressed);
 
   if (_need_scratchpad)
   {
diff --git a/compiler/luci-interpreter/src/SimpleMemoryManager.cpp b/compiler/luci-interpreter/src/SimpleMemoryManager.cpp
index a39c34a0ad8..bf13b0cc9a8 100644
--- a/compiler/luci-interpreter/src/SimpleMemoryManager.cpp
+++ b/compiler/luci-interpreter/src/SimpleMemoryManager.cpp
@@ -29,12 +29,21 @@ void SimpleMemoryManager::allocate_memory(luci_interpreter::Tensor &tensor)
   {
     release_memory(tensor);
   }
-  const auto element_size = getDataTypeSize(tensor.element_type());
+  size_t bytes_to_allocate = 0;
+  if (tensor.get_raw_size() > 0)
+  {
+    bytes_to_allocate = tensor.get_raw_size();
+  }
+  else
+  {
+    const auto element_size = getDataTypeSize(tensor.element_type());
 
-  // Use large_num_elements to avoid overflow
-  const auto num_elements = tensor.shape().large_num_elements();
+    // Use large_num_elements to avoid overflow
+    const auto num_elements = tensor.shape().large_num_elements();
+    bytes_to_allocate = num_elements * element_size;
+  }
 
-  auto *data = new uint8_t[num_elements * element_size];
+  auto *data = new uint8_t[bytes_to_allocate];
   tensor.set_data_buffer(data);
 }
 
diff --git a/compiler/luci-interpreter/src/core/Tensor.cpp b/compiler/luci-interpreter/src/core/Tensor.cpp
index 3c3c5ffffe8..b7769174e23 100644
--- a/compiler/luci-interpreter/src/core/Tensor.cpp
+++ b/compiler/luci-interpreter/src/core/Tensor.cpp
@@ -45,14 +45,34 @@ void Tensor::writeData(const void *data_ptr, size_t data_size)
 {
   const size_t element_size = getDataTypeSize(element_type());
   const int32_t num_elements = shape().num_elements();
-  if (data_size != num_elements * element_size)
+  if (_raw_size > 0)
   {
-    throw std::invalid_argument("Invalid data size.");
+    if (data_size != _raw_size)
+    {
+      throw std::invalid_argument("Invalid data size.");
+    }
+  }
+  else
+  {
+    if (data_size != num_elements * element_size)
+    {
+      throw std::invalid_argument("Invalid data size.");
+    }
   }
   assert(data_ptr != nullptr);
   std::memcpy(data<void>(), data_ptr, data_size);
 }
 
-void Tensor::resize(const Shape &new_shape) { _shape = new_shape; }
+void Tensor::resize(const Shape &new_shape)
+{ // Sets the shape and clears any explicit raw byte size.
+  _shape = new_shape;
+  _raw_size = 0; // 0 makes writeData() validate against the shape-derived size
+}
+
+void Tensor::resize(const Shape &new_shape, size_t raw_size)
+{ // Sets the shape together with an explicit byte size for the stored data.
+  _shape = new_shape;
+  _raw_size = raw_size; // when > 0, writeData() requires exactly this many bytes
+}
 
 } // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Conv2D.cpp b/compiler/luci-interpreter/src/kernels/Conv2D.cpp
index 9aae9da2644..a5377408adc 100644
--- a/compiler/luci-interpreter/src/kernels/Conv2D.cpp
+++ b/compiler/luci-interpreter/src/kernels/Conv2D.cpp
@@ -117,9 +117,10 @@ void Conv2D::configure()
   params.dilation_height_factor = _params.dilation_height_factor;
   params.dilation_width_factor = _params.dilation_width_factor;
   auto scratchpad = getOutputTensors()[1];
+  bool is_compressed = filter()->get_compression() != luci::CompressionType::NONE;
   luci_interpreter_pal::SetupScratchpadTensor(scratchpad, input()->element_type(), params,
                                               getTensorShape(input()), getTensorShape(filter()),
-                                              getTensorShape(output()));
+                                              getTensorShape(output()), is_compressed);
 
   switch (_params.activation)
   {
@@ -145,20 +146,34 @@ void Conv2D::execute() const
       }
       throw std::runtime_error("luci-intp Conv2D(2) Unsupported type.");
     case DataType::U8:
-      if (filter()->scales().size() == 1)
+      if (filter()->get_compression() == luci::CompressionType::HUFFMAN)
       {
-        evalQuantized();
+        evalQuantizedU8PerChannelHuffman();
       }
-      else if (filter()->scales().size() > 1)
+      else
       {
-        LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4);
-        LUCI_INTERPRETER_CHECK(filter()->scales().size() ==
-                               static_cast<size_t>(filter()->shape().dim(0)));
-        evalQuantizedPerChannel();
+        if (filter()->scales().size() == 1)
+        {
+          evalQuantized();
+        }
+        else if (filter()->scales().size() > 1)
+        {
+          LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4);
+          LUCI_INTERPRETER_CHECK(filter()->scales().size() ==
+                                 static_cast<size_t>(filter()->shape().dim(0)));
+          evalQuantizedPerChannel();
+        }
       }
       break;
     case DataType::S8:
-      evalQuantizedS8PerChannel();
+      if (filter()->get_compression() == luci::CompressionType::HUFFMAN)
+      {
+        evalQuantizedS8PerChannelHuffman();
+      }
+      else
+      {
+        evalQuantizedS8PerChannel();
+      }
       break;
     case DataType::S16:
       evalQuantizedS16();
@@ -321,6 +336,120 @@ void Conv2D::evalQuantizedPerChannel() const
   }
 }
 
+// Conv2D for U8 tensors with a Huffman-compressed filter. TODO: remove code duplication with S8
+void Conv2D::evalQuantizedU8PerChannelHuffman() const
+{
+  int32_t activation_min{};
+  int32_t activation_max{};
+  calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
+  // Convolution geometry and quantization offsets handed to the PAL kernel below.
+  tflite::ConvParams params{};
+  params.padding_values.height = _padding_height;
+  params.padding_values.width = _padding_width;
+  params.stride_height = _params.stride_height;
+  params.stride_width = _params.stride_width;
+  params.dilation_height_factor = _params.dilation_height_factor;
+  params.dilation_width_factor = _params.dilation_width_factor;
+  // Input and filter zero points are passed negated, as the kernel expects.
+  params.input_offset = -input()->zero_point();    // Note the '-'.
+  params.weights_offset = -filter()->zero_point(); // Unused in tflite code
+  params.output_offset = output()->zero_point();
+  params.quantized_activation_min = activation_min;
+  params.quantized_activation_max = activation_max;
+
+  const std::vector<double> effective_output_scales =
+    getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
+
+  std::vector<ChannelQuantMultipliers> quant_multipliers =
+    quantizeMultipliers(effective_output_scales);
+  // Unzip the quantized pairs into separate shift / multiplier arrays passed to the kernel.
+  std::vector<int32_t> shifts;
+  std::transform(quant_multipliers.begin(), quant_multipliers.end(), std::back_inserter(shifts),
+                 [](ChannelQuantMultipliers cm) { return cm.shift; });
+  std::vector<int32_t> multipliers;
+  std::transform(quant_multipliers.begin(), quant_multipliers.end(),
+                 std::back_inserter(multipliers),
+                 [](ChannelQuantMultipliers cm) { return cm.multiplier; });
+
+  auto scratchpad = getOutputTensors()[1];
+  uint8_t *scratchpad_data = nullptr;
+
+  // Scratchpad used for decompression; shaped as filter dims 1..3 (height, width, input depth)
+  const auto filter_shape = getTensorShape(filter());
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int filter_input_depth = filter_shape.Dims(3);
+  auto scratchpad_shape = Shape({filter_height, filter_width, filter_input_depth});
+  // When the scratchpad is not allocatable, scratchpad_data stays nullptr.
+  if (scratchpad->is_allocatable())
+  {
+    scratchpad->resize(scratchpad_shape);
+    scratchpad_data = scratchpad->data<uint8_t>();
+  }
+  luci_interpreter_pal::ConvPerChannelHuffman<uint8_t>(
+    params, multipliers.data(), shifts.data(), getTensorShape(input()),
+    getTensorData<uint8_t>(input()), getTensorShape(filter()), getTensorData<uint8_t>(filter()),
+    getTensorShape(bias()), getTensorData<int32_t>(bias()), getTensorShape(output()),
+    getTensorData<uint8_t>(output()), getTensorShape(scratchpad), scratchpad_data);
+}
+
+void Conv2D::evalQuantizedS8PerChannelHuffman() const
+{
+  int32_t activation_min{};
+  int32_t activation_max{};
+  calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
+  // S8 Conv2D with a Huffman-compressed filter: fill the conv params handed to the PAL kernel.
+  tflite::ConvParams params{};
+  params.padding_values.height = _padding_height;
+  params.padding_values.width = _padding_width;
+  params.stride_height = _params.stride_height;
+  params.stride_width = _params.stride_width;
+  params.dilation_height_factor = _params.dilation_height_factor;
+  params.dilation_width_factor = _params.dilation_width_factor;
+  // The input zero point is passed negated; the filter offset is fixed to 0 for S8.
+  params.input_offset = -input()->zero_point(); // Note the '-'.
+  params.weights_offset = 0;                    // Unused in tflite code
+  params.output_offset = output()->zero_point();
+  params.quantized_activation_min = activation_min;
+  params.quantized_activation_max = activation_max;
+
+  const std::vector<double> effective_output_scales =
+    getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
+
+  std::vector<ChannelQuantMultipliers> quant_multipliers =
+    quantizeMultipliers(effective_output_scales);
+  // Unzip the quantized pairs into separate shift / multiplier arrays passed to the kernel.
+  std::vector<int32_t> shifts;
+  std::transform(quant_multipliers.begin(), quant_multipliers.end(), std::back_inserter(shifts),
+                 [](ChannelQuantMultipliers cm) { return cm.shift; });
+  std::vector<int32_t> multipliers;
+  std::transform(quant_multipliers.begin(), quant_multipliers.end(),
+                 std::back_inserter(multipliers),
+                 [](ChannelQuantMultipliers cm) { return cm.multiplier; });
+
+  auto scratchpad = getOutputTensors()[1];
+  int8_t *scratchpad_data = nullptr;
+
+  // Scratchpad used for decompression; shaped as filter dims 1..3 (height, width, input depth)
+  const auto filter_shape = getTensorShape(filter());
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int filter_input_depth = filter_shape.Dims(3);
+  auto scratchpad_shape = Shape({filter_height, filter_width, filter_input_depth});
+  // When the scratchpad is not allocatable, scratchpad_data stays nullptr.
+  if (scratchpad->is_allocatable())
+  {
+    scratchpad->resize(scratchpad_shape);
+    scratchpad_data = scratchpad->data<int8_t>();
+  }
+
+  luci_interpreter_pal::ConvPerChannelHuffman<int8_t>(
+    params, multipliers.data(), shifts.data(), getTensorShape(input()),
+    getTensorData<int8_t>(input()), getTensorShape(filter()), getTensorData<int8_t>(filter()),
+    getTensorShape(bias()), getTensorData<int32_t>(bias()), getTensorShape(output()),
+    getTensorData<int8_t>(output()), getTensorShape(scratchpad), scratchpad_data);
+}
+
 void Conv2D::evalQuantizedS8PerChannel() const
 {
   int32_t activation_min{};
diff --git a/compiler/luci-interpreter/src/kernels/Conv2D.h b/compiler/luci-interpreter/src/kernels/Conv2D.h
index 330bf3a2a69..096bd85f4db 100644
--- a/compiler/luci-interpreter/src/kernels/Conv2D.h
+++ b/compiler/luci-interpreter/src/kernels/Conv2D.h
@@ -47,6 +47,8 @@ class Conv2D : public KernelWithParams<Conv2DParams>
   void evalQuantizedPerChannel() const;
   void evalQuantizedS8PerChannel() const;
   void evalQuantizedS16() const;
+  void evalQuantizedS8PerChannelHuffman() const;
+  void evalQuantizedU8PerChannelHuffman() const;
 
 private:
   int32_t _padding_height{};
diff --git a/compiler/luci-interpreter/src/kernels/Utils.h b/compiler/luci-interpreter/src/kernels/Utils.h
index e975585cdf3..422c0b4d7d8 100644
--- a/compiler/luci-interpreter/src/kernels/Utils.h
+++ b/compiler/luci-interpreter/src/kernels/Utils.h
@@ -137,7 +137,9 @@ Shape calculateShapeForBroadcast(const Shape &input1_shape, const Shape &input2_
 inline double getQuantizedConvolutionMultipler(float input_scale, float filter_scale,
                                                float output_scale)
 {
-  const double input_product_scale = static_cast<double>(input_scale * filter_scale);
+  // Widen each scale before multiplying so the product is not rounded to float precision.
+  const double input_product_scale =
+    static_cast<double>(input_scale) * static_cast<double>(filter_scale);
   LUCI_INTERPRETER_CHECK(input_product_scale >= 0);
   return input_product_scale / static_cast<double>(output_scale);
 }
diff --git a/compiler/luci-interpreter/src/loader/GraphLoader.cpp b/compiler/luci-interpreter/src/loader/GraphLoader.cpp
index cf83713d906..6e1399dd467 100644
--- a/compiler/luci-interpreter/src/loader/GraphLoader.cpp
+++ b/compiler/luci-interpreter/src/loader/GraphLoader.cpp
@@ -243,9 +243,11 @@ void GraphLoader::loadTensors()
       const void *const_data = getNodeData(const_node, &data_size);
       if (const_data != nullptr)
       {
+        tensor->set_raw_size(data_size);
         _memory_manager->allocate_memory(*tensor);
         tensor->writeData(const_data, data_size);
       }
+      tensor->set_compression(const_node->compression());
     }
     else if (const auto *custom_out_node = dynamic_cast<const luci::CircleCustomOut *>(node))
     {
@@ -258,6 +260,7 @@ void GraphLoader::loadTensors()
         const void *const_data = getNodeData(custom_node, &data_size);
         if (const_data != nullptr)
         {
+          tensor->set_raw_size(data_size);
           _memory_manager->allocate_memory(*tensor);
           tensor->writeData(const_data, data_size);
         }
diff --git a/compiler/luci-pass-value-py-test/test.lst b/compiler/luci-pass-value-py-test/test.lst
index 8328948b937..d2ad2d41742 100644
--- a/compiler/luci-pass-value-py-test/test.lst
+++ b/compiler/luci-pass-value-py-test/test.lst
@@ -7,6 +7,7 @@
 
 # eval(Net_Preactivation_BN_000 fuse_preactivation_batchnorm) : value diff exist
 # --> https://github.com/Samsung/ONE/issues/5782
+eval(Conv2D_U8_000 compress_weights_huffman)
 eval(FullyConnected_007 replace_non_const_fc_with_batch_matmul)
 eval(HardSwish_001 decompose_hardswish)
 eval(Net_Add_FloorMod_Gather_000 remove_gather_guard)
diff --git a/compiler/luci/export/src/CircleExporterUtils.cpp b/compiler/luci/export/src/CircleExporterUtils.cpp
index f6e380d7872..13889f17f89 100644
--- a/compiler/luci/export/src/CircleExporterUtils.cpp
+++ b/compiler/luci/export/src/CircleExporterUtils.cpp
@@ -25,6 +25,21 @@
 namespace luci
 {
 
+// Map luci::CompressionType to its circle schema value; UNDEFINED is exported as NONE
+circle::CompressionType to_circle_compressiontype(luci::CompressionType type)
+{
+  switch (type)
+  {
+    case luci::CompressionType::UNDEFINED:
+    case luci::CompressionType::NONE:
+      return circle::CompressionType_NONE;
+    case luci::CompressionType::HUFFMAN:
+      return circle::CompressionType_HUFFMAN;
+    default:
+      INTERNAL_EXN_V("trying to convert unsupported luci::CompressionType", oops::to_uint32(type));
+  }
+}
+
 circle::ActivationFunctionType to_circle_actfunc(luci::FusedActFunc func)
 {
   switch (func)
diff --git a/compiler/luci/export/src/CircleExporterUtils.h b/compiler/luci/export/src/CircleExporterUtils.h
index 6d0ebd6cb29..970c7555a3f 100644
--- a/compiler/luci/export/src/CircleExporterUtils.h
+++ b/compiler/luci/export/src/CircleExporterUtils.h
@@ -32,6 +32,7 @@ inline constexpr uint64_t FLATBUFFERS_SIZE_MAX = 2147483648UL; // 2GB
 namespace luci
 {
 
+circle::CompressionType to_circle_compressiontype(luci::CompressionType type);
 circle::ActivationFunctionType to_circle_actfunc(luci::FusedActFunc func);
 circle::TensorType to_circle_tensortype(loco::DataType type);
 circle::MirrorPadMode to_circle_mirrorpadmode(luci::MirrorPadMode mode);
diff --git a/compiler/luci/export/src/CircleTensorExporter.cpp b/compiler/luci/export/src/CircleTensorExporter.cpp
index fa585d06e81..8d8ab851c74 100644
--- a/compiler/luci/export/src/CircleTensorExporter.cpp
+++ b/compiler/luci/export/src/CircleTensorExporter.cpp
@@ -578,30 +578,48 @@ bool has_same_values(luci::CircleConst *lhs, luci::CircleConst *rhs)
   switch (lhs->dtype())
   {
     case loco::DataType::FLOAT32:
+      if (lhs->size<loco::DataType::FLOAT32>() != rhs->size<loco::DataType::FLOAT32>())
+        return false;
       return has_same_elements<loco::DataType::FLOAT32>(lhs, rhs);
 
     case loco::DataType::S4:
+      if (lhs->size<loco::DataType::S4>() != rhs->size<loco::DataType::S4>())
+        return false;
       return has_same_elements<loco::DataType::S4>(lhs, rhs);
 
     case loco::DataType::S8:
+      if (lhs->size<loco::DataType::S8>() != rhs->size<loco::DataType::S8>())
+        return false;
       return has_same_elements<loco::DataType::S8>(lhs, rhs);
 
     case loco::DataType::S16:
+      if (lhs->size<loco::DataType::S16>() != rhs->size<loco::DataType::S16>())
+        return false;
       return has_same_elements<loco::DataType::S16>(lhs, rhs);
 
     case loco::DataType::S32:
+      if (lhs->size<loco::DataType::S32>() != rhs->size<loco::DataType::S32>())
+        return false;
       return has_same_elements<loco::DataType::S32>(lhs, rhs);
 
     case loco::DataType::S64:
+      if (lhs->size<loco::DataType::S64>() != rhs->size<loco::DataType::S64>())
+        return false;
       return has_same_elements<loco::DataType::S64>(lhs, rhs);
 
     case loco::DataType::U4:
+      if (lhs->size<loco::DataType::U4>() != rhs->size<loco::DataType::U4>())
+        return false;
       return has_same_elements<loco::DataType::U4>(lhs, rhs);
 
     case loco::DataType::U8:
+      if (lhs->size<loco::DataType::U8>() != rhs->size<loco::DataType::U8>())
+        return false;
       return has_same_elements<loco::DataType::U8>(lhs, rhs);
 
     case loco::DataType::BOOL:
+      if (lhs->size<loco::DataType::BOOL>() != rhs->size<loco::DataType::BOOL>())
+        return false;
       return has_same_elements<loco::DataType::BOOL>(lhs, rhs);
 
     default:
@@ -668,8 +686,14 @@ void exportOpDefinedTensor(const CircleTensorInfo &info, FlatBufferBuilder &buil
 
   auto is_variable = info.is_variable();
 
-  auto tensor_offset = CreateTensor(builder, shape_offset, info.dtype(), buffer_id, name_offset,
-                                    quantparam, is_variable, sparsityparam, shape_signature_offset);
+  luci::CircleConst *content = info.content();
+  auto compression_type = circle::CompressionType_NONE;
+  if (content)
+    compression_type = to_circle_compressiontype(info.content()->compression());
+
+  auto tensor_offset =
+    CreateTensor(builder, shape_offset, info.dtype(), buffer_id, name_offset, quantparam,
+                 is_variable, sparsityparam, shape_signature_offset, false, 0, compression_type);
   gd._tensors.push_back(tensor_offset);
 }
 
diff --git a/compiler/luci/import/include/luci/Import/CircleImporterUtils.h b/compiler/luci/import/include/luci/Import/CircleImporterUtils.h
new file mode 100644
index 00000000000..f96ec210747
--- /dev/null
+++ b/compiler/luci/import/include/luci/Import/CircleImporterUtils.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CIRCLE_IMPORTER_UTILS_H__
+#define __CIRCLE_IMPORTER_UTILS_H__
+
+#include <luci/IR/CircleNodes.h>
+
+#include <loco.h>
+
+#include <mio/circle/schema_generated.h>
+
+namespace luci
+{
+
+luci::CompressionType from_circle_compressiontype(circle::CompressionType type);
+
+} // namespace luci
+
+#endif // __CIRCLE_IMPORTER_UTILS_H__
diff --git a/compiler/luci/import/src/CircleImporterUtils.cpp b/compiler/luci/import/src/CircleImporterUtils.cpp
new file mode 100644
index 00000000000..2e4f97ef27d
--- /dev/null
+++ b/compiler/luci/import/src/CircleImporterUtils.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Import/CircleImporterUtils.h"
+
+#include <oops/InternalExn.h>
+
+namespace luci
+{
+
+luci::CompressionType from_circle_compressiontype(circle::CompressionType type)
+{
+  switch (type)
+  {
+    case circle::CompressionType_NONE:
+      return luci::CompressionType::NONE;
+    case circle::CompressionType_HUFFMAN:
+      return luci::CompressionType::HUFFMAN;
+    default: // schema value that has no luci::CompressionType counterpart here
+      INTERNAL_EXN_V("trying to convert unsupported circle::CompressionType",
+                     oops::to_uint32(type));
+  }
+}
+
+} // namespace luci
diff --git a/compiler/luci/import/src/CircleReader.cpp b/compiler/luci/import/src/CircleReader.cpp
index 392f0c2a5b9..ccf3e0f7e5b 100644
--- a/compiler/luci/import/src/CircleReader.cpp
+++ b/compiler/luci/import/src/CircleReader.cpp
@@ -15,6 +15,7 @@
  */
 
 #include "luci/Import/CircleReader.h"
+#include <luci/Import/CircleImporterUtils.h>
 
 #include <mio_circle/Helper.h>
 
@@ -289,6 +290,11 @@ void copy_tensor_attributes(const circle::Tensor *tensor, CircleNode *node)
     if (sparsityparam)
       node->sparsityparam(std::move(sparsityparam));
   }
+  auto const_node = dynamic_cast<CircleConst *>(node);
+  if (const_node)
+  {
+    const_node->compression(luci::from_circle_compressiontype(tensor->compression_type()));
+  }
 }
 
 std::string fb_string2std_string(const flatbuffers::String *fb_str)
diff --git a/compiler/luci/import/src/Nodes/CircleConst.cpp b/compiler/luci/import/src/Nodes/CircleConst.cpp
index 945a8dc98b7..83639ae2d59 100644
--- a/compiler/luci/import/src/Nodes/CircleConst.cpp
+++ b/compiler/luci/import/src/Nodes/CircleConst.cpp
@@ -57,7 +57,7 @@ void copy_data(const VectorWrapper<uint8_t> &raw_data, uint32_t num_elements,
   using T = typename loco::DataTypeImpl<DT>::Type;
 
   // TODO calculate the exact buffer size of sparse tensor
-  if (const_node->sparsityparam())
+  if (const_node->sparsityparam() or const_node->compression() != luci::CompressionType::NONE)
   {
     num_elements = raw_data.size() / sizeof(T);
   }
diff --git a/compiler/luci/import/src/Nodes/CircleConv2D.cpp b/compiler/luci/import/src/Nodes/CircleConv2D.cpp
index 8cbecdc003b..35b59e48b4a 100644
--- a/compiler/luci/import/src/Nodes/CircleConv2D.cpp
+++ b/compiler/luci/import/src/Nodes/CircleConv2D.cpp
@@ -15,6 +15,7 @@
  */
 
 #include "luci/Import/Nodes/CircleConv2D.h"
+#include "luci/Import/CircleImporterUtils.h"
 
 #include <luci/IR/Nodes/CircleConv2D.h>
 
diff --git a/compiler/luci/lang/include/luci/IR/AttrWeightCompression.h b/compiler/luci/lang/include/luci/IR/AttrWeightCompression.h
new file mode 100644
index 00000000000..e1a83b01908
--- /dev/null
+++ b/compiler/luci/lang/include/luci/IR/AttrWeightCompression.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_IR_ATTRWEIGHTCOMPRESSION_H__
+#define __LUCI_IR_ATTRWEIGHTCOMPRESSION_H__
+
+namespace luci
+{
+
+enum class CompressionType
+{
+  UNDEFINED, // This is not defined by TFLite or Circle. This was added to
+             // prevent programming error.
+  NONE,      // Constant data is stored as is (default of CircleConst)
+  HUFFMAN    // Constant data holds Huffman-encoded bytes
+};
+
+} // namespace luci
+
+#endif // __LUCI_IR_ATTRWEIGHTCOMPRESSION_H__
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleConst.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleConst.h
index 3e9a274e0cd..2f59b73b3fa 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleConst.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleConst.h
@@ -17,6 +17,7 @@
 #ifndef __LUCI_IR_CIRCLECONST_H__
 #define __LUCI_IR_CIRCLECONST_H__
 
+#include "luci/IR/AttrWeightCompression.h"
 #include "luci/IR/CircleNodeDecl.h"
 #include "luci/IR/CircleOpcode.h"
 
@@ -42,10 +43,14 @@ class CircleConst final : public FixedArityNode<0, CircleNodeImpl<CircleOpcode::
   template <loco::DataType DT> const typename loco::DataTypeImpl<DT>::Type &scalar(void) const;
   template <loco::DataType DT> typename loco::DataTypeImpl<DT>::Type &scalar(void);
 
+  CompressionType compression(void) const;
+  void compression(CompressionType c);
+
 private:
   std::vector<uint8_t> _data;
   // TODO use _data for STRING and remove _strings
   std::vector<std::string> _strings; // for STRING type
+  CompressionType _compression{CompressionType::NONE};
 };
 
 } // namespace luci
diff --git a/compiler/luci/lang/src/Nodes/CircleConst.cpp b/compiler/luci/lang/src/Nodes/CircleConst.cpp
index c17a4e2c36d..54f23fbf175 100644
--- a/compiler/luci/lang/src/Nodes/CircleConst.cpp
+++ b/compiler/luci/lang/src/Nodes/CircleConst.cpp
@@ -21,6 +21,10 @@
 namespace luci
 {
 
+CompressionType CircleConst::compression(void) const { return _compression; }
+
+void CircleConst::compression(luci::CompressionType c) { _compression = c; }
+
 template <loco::DataType DT> uint32_t CircleConst::size(void) const
 {
   assert(dtype() == DT);
diff --git a/compiler/luci/pass/include/luci/CircleOptimizer.h b/compiler/luci/pass/include/luci/CircleOptimizer.h
index d4f675f36fe..8c90230252c 100644
--- a/compiler/luci/pass/include/luci/CircleOptimizer.h
+++ b/compiler/luci/pass/include/luci/CircleOptimizer.h
@@ -114,6 +114,7 @@ class CircleOptimizer final
       UnrollUnidirSeqLSTM,
       XpSepActFromTransposeConv,
       RemoveGatherGuard,
+      CompressWeightsHuffman
     };
 
     enum AlgorithmParameters
diff --git a/compiler/luci/pass/include/luci/Pass/CompressWeightsPass.h b/compiler/luci/pass/include/luci/Pass/CompressWeightsPass.h
new file mode 100644
index 00000000000..f9f97791914
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/CompressWeightsPass.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_COMPRESS_WEIGHTS_PASS_H__
+#define __LUCI_COMPRESS_WEIGHTS_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief  Pass to Huffman-compress constant S8/U8 filters (weights) of Conv2D
+ *
+ * To see the target Op pattern, please visit implementation.
+ */
+struct CompressWeightsPass final : public logo::Pass
+{
+  const char *name(void) const final { return "luci::CompressWeightsPass"; }
+
+  bool run(loco::Graph *g) final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_COMPRESS_WEIGHTS_PASS_H__
diff --git a/compiler/luci/pass/src/CircleOptimizer.cpp b/compiler/luci/pass/src/CircleOptimizer.cpp
index bf18b973d6d..9884d18de65 100644
--- a/compiler/luci/pass/src/CircleOptimizer.cpp
+++ b/compiler/luci/pass/src/CircleOptimizer.cpp
@@ -98,6 +98,7 @@
 #include "luci/Pass/DecomposeSoftmaxPass.h"
 #include "luci/Pass/UnrollUnidirectionalSequenceLSTMPass.h"
 #include "luci/Pass/XpSepActFromTransposeConvPass.h"
+#include "luci/Pass/CompressWeightsPass.h"
 // TODO add more passes
 
 #include "luci/Pass/CircleShapeInferencePass.h"
@@ -313,7 +314,10 @@ void CircleOptimizer::optimize(loco::Graph *g) const
   {
     phase.emplace_back(std::make_unique<luci::RemoveRedundantTransposePass>());
   }
-
+  if (_options->query(Options::Algorithm::CompressWeightsHuffman))
+  {
+    phase.emplace_back(std::make_unique<luci::CompressWeightsPass>());
+  }
   // clang-format off
   std::map<Options::Algorithm, std::unique_ptr<logo::Pass> (*)(void)> option_to_pass;
 
@@ -389,7 +393,7 @@ void CircleOptimizer::optimize(loco::Graph *g) const
   option_to_pass[Options::Algorithm::XpSepActFromTransposeConv] = &createPassInstance<luci::XpSepActFromTransposeConvPass>;
   option_to_pass[Options::Algorithm::ForwardReshapeToUnaryOp] = &createPassInstance<luci::ForwardReshapeToUnaryOpPass>;
   option_to_pass[Options::Algorithm::ForwardTransposeOp] = &createPassInstance<luci::ForwardTransposeOpPass>;
-  // clang-format on 
+  // clang-format on
 
   for (auto const &m : option_to_pass)
   {
diff --git a/compiler/luci/pass/src/CompressWeightsPass.cpp b/compiler/luci/pass/src/CompressWeightsPass.cpp
new file mode 100644
index 00000000000..ba5204e2f6c
--- /dev/null
+++ b/compiler/luci/pass/src/CompressWeightsPass.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/CompressWeightsPass.h"
+#include "helpers/HuffmanEncoder.h"
+#include "helpers/NodeFiller.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/Service/Nodes/CircleConst.h>
+
+#include <cmath>
+#include <cassert>
+
+namespace
+{
+
+template <loco::DataType T> class TypeSelector;
+
+template <> class TypeSelector<loco::DataType::U8>
+{
+public:
+  using Type = uint8_t;
+};
+template <> class TypeSelector<loco::DataType::S8>
+{
+public:
+  using Type = int8_t;
+};
+
+// Replace the constant filter of 'conv2d' with a Huffman-encoded clone.
+// Returns false, leaving the graph untouched, when the filter is already compressed.
+template <loco::DataType DT> bool compress_weights_huffman(luci::CircleConv2D *conv2d)
+{
+  using T = typename TypeSelector<DT>::Type;
+  assert(conv2d);
+
+  auto src = loco::must_cast<luci::CircleConst *>(conv2d->filter());
+  if (src->compression() != luci::CompressionType::NONE)
+    return false;
+
+  luci::huffman::HuffmanEncoder<T> encoder;
+  // Work on a clone so that the original constant node is not modified
+  auto dst = luci::clone(src);
+
+  // Gather the raw filter values in element order for the encoder
+  const uint32_t raw_count = src->size<DT>();
+  std::vector<T> raw;
+  raw.reserve(raw_count);
+  for (uint32_t idx = 0; idx < raw_count; ++idx)
+    raw.push_back(src->at<DT>(idx));
+
+  std::vector<uint8_t> packed = encoder.encode(raw);
+
+  // The clone keeps dtype DT but now holds one element per encoded byte
+  dst->dtype(DT);
+  dst->size<DT>(packed.size());
+  dst->compression(luci::CompressionType::HUFFMAN);
+  for (uint32_t idx = 0; idx < dst->size<DT>(); ++idx)
+    dst->at<DT>(idx) = packed[idx];
+  conv2d->filter(dst);
+  return true;
+}
+
+} // namespace
+
+namespace luci
+{
+
+// Huffman-compress the constant S8/U8 filter of every active Conv2D in 'g'.
+// Returns true when at least one filter was replaced.
+bool CompressWeightsPass::run(loco::Graph *g)
+{
+  bool changed = false;
+
+  for (auto node : loco::active_nodes(loco::output_nodes(g)))
+  {
+    auto conv2d = dynamic_cast<luci::CircleConv2D *>(node);
+    if (not conv2d)
+      continue;
+
+    // A filter that is not a CircleConst (e.g. produced by another op) cannot be compressed
+    auto filter = dynamic_cast<luci::CircleConst *>(conv2d->filter());
+    if (not filter)
+      continue;
+
+    // Filters of any other dtype are left untouched
+    if (filter->dtype() == loco::DataType::S8)
+      changed |= compress_weights_huffman<loco::DataType::S8>(conv2d);
+    else if (filter->dtype() == loco::DataType::U8)
+      changed |= compress_weights_huffman<loco::DataType::U8>(conv2d);
+  }
+
+  return changed;
+}
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/helpers/HuffmanCommon.h b/compiler/luci/pass/src/helpers/HuffmanCommon.h
new file mode 100644
index 00000000000..d457b2f11e8
--- /dev/null
+++ b/compiler/luci/pass/src/helpers/HuffmanCommon.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_PASS_HELPERS_HUFFMAN_COMMON_H__
+#define __LUCI_PASS_HELPERS_HUFFMAN_COMMON_H__
+
+#include <memory>
+
+namespace luci
+{
+namespace huffman
+{
+
+// Node of prefix tree; a node without children is a leaf
+template <typename T> struct Node
+{
+  std::shared_ptr<Node<T>> p_left;
+  std::shared_ptr<Node<T>> p_right;
+  T data{};             // value carried by the node
+  unsigned int freq{0}; // weight compared by CompareNodes
+};
+
+// Compare functor for priority queue: a node with a lower 'freq' is served first
+template <typename T> struct CompareNodes
+{
+  bool operator()(const std::shared_ptr<Node<T>> &l, const std::shared_ptr<Node<T>> &r) const
+  {
+    return l->freq > r->freq;
+  }
+};
+
+} // namespace huffman
+} // namespace luci
+
+#endif // __LUCI_PASS_HELPERS_HUFFMAN_COMMON_H__
diff --git a/compiler/luci/pass/src/helpers/HuffmanDecoder.h b/compiler/luci/pass/src/helpers/HuffmanDecoder.h
new file mode 100644
index 00000000000..b5246041079
--- /dev/null
+++ b/compiler/luci/pass/src/helpers/HuffmanDecoder.h
@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_PASS_HELPERS_HUFFMAN_DECODER_H__
+#define __LUCI_PASS_HELPERS_HUFFMAN_DECODER_H__
+
+#include <bitset>
+#include <climits>
+#include <iostream>
+#include <queue>
+#include <string>
+#include <tuple>
+#include <unordered_map>
+#include <vector>
+#include "HuffmanCommon.h"
+namespace luci
+{
+
+namespace huffman
+{
+
+template <typename T> class HuffmanDecoder
+{
+private:
+  std::shared_ptr<Node<T>> root = nullptr;        // prefix tree rebuilt by init_decoder()
+  std::unordered_map<T, std::string> huffmanCode; // leaf value -> path (buildHuffmanTable)
+  std::vector<bool> encoded_bitset{};             // NOTE(review): not referenced in this header
+  std::size_t nodes_count = 0;                    // nodes created by allocateNode() so far
+
+  // Creates a tree node with the given payload and children and counts it in nodes_count.
+  std::shared_ptr<Node<T>> allocateNode(T data, unsigned int freq, std::shared_ptr<Node<T>> p_left,
+                                        std::shared_ptr<Node<T>> p_right)
+  {
+    auto node = std::make_shared<Node<T>>(); // single allocation for node + control block
+    node->data = data;
+    node->freq = freq;
+    node->p_left = p_left;
+    node->p_right = p_right;
+    nodes_count++;
+    return node;
+  }
+
+  // Pre-order dump of the subtree: inner node -> "1" <left> <right>, leaf -> "0" <value bits>.
+  std::string exportHuffmanTreeToString(std::shared_ptr<Node<T>> node)
+  {
+    if (!node)
+      return "";
+
+    const bool is_leaf = !node->p_left && !node->p_right;
+    if (is_leaf)
+      return "0" + std::bitset<sizeof(T) * CHAR_BIT>(node->data).to_string();
+
+    const std::string left_bits = exportHuffmanTreeToString(node->p_left);
+    return "1" + left_bits + exportHuffmanTreeToString(node->p_right);
+  }
+
+  // Rebuilds the prefix tree from its pre-order bit dump: a 1 bit opens an inner node (left
+  // subtree, then right subtree), a 0 bit a leaf followed by its value bits, MSB first. `index`
+  // is advanced past the consumed bits. Returns nullptr for an empty or truncated dump.
+  std::shared_ptr<Node<T>> importHuffmanTreeFromBoolVec(std::vector<bool> &vec, size_t &index)
+  {
+    constexpr size_t kValueBits = sizeof(T) * CHAR_BIT;
+    if (index >= vec.size())
+      return nullptr; // nothing left to read (this used to index past the end of `vec`)
+
+    if (vec[index++])
+    {
+      std::shared_ptr<Node<T>> p_left = importHuffmanTreeFromBoolVec(vec, index);
+      std::shared_ptr<Node<T>> p_right = importHuffmanTreeFromBoolVec(vec, index);
+      return allocateNode(0, 0, p_left, p_right);
+    }
+
+    if (vec.size() - index < kValueBits)
+      return nullptr; // leaf marker without a complete value
+
+    T tmp = 0;
+    for (size_t i = 0; i < kValueBits; ++i)
+      if (vec[index++])
+        tmp |= static_cast<T>(1ULL << (kValueBits - 1 - i)); // 1ULL: no int overflow for wide T
+    return allocateNode(tmp, 0, nullptr, nullptr);
+  }
+
+  // Parses the textual pre-order dump of a tree, removing the parsed prefix from `str`.
+  std::shared_ptr<Node<T>> importHuffmanTreeFromString(std::string &str)
+  {
+    if (str.substr(0, 1) == "1")
+    {
+      str = str.substr(1);
+      std::shared_ptr<Node<T>> p_left = importHuffmanTreeFromString(str);
+      return allocateNode(0, 0, p_left, importHuffmanTreeFromString(str));
+    }
+    else if (str.substr(0, 1) == "0")
+    {
+      str = str.substr(1);
+      std::bitset<sizeof(T) * CHAR_BIT> tmp(str.substr(0, sizeof(T) * CHAR_BIT));
+      str = str.substr(sizeof(T) * CHAR_BIT);
+      return allocateNode(static_cast<T>(tmp.to_ullong()), 0, nullptr, nullptr);
+    }
+    return nullptr; // empty or malformed input used to fall off the end (undefined behavior)
+  }
+
+  // Walks the subtree and records in huffmanCode, for every leaf, the '0'/'1' path leading to it.
+  void buildHuffmanTable(std::shared_ptr<Node<T>> node, const std::string str = "")
+  {
+    if (!node)
+      return;
+
+    const bool is_leaf = !node->p_left && !node->p_right;
+    if (is_leaf)
+      huffmanCode[node->data] = str;
+
+    buildHuffmanTable(node->p_left, str + "0");  // a left edge appends '0'
+    buildHuffmanTable(node->p_right, str + "1"); // a right edge appends '1'
+  }
+
+  // Decodes one symbol from the '0'/'1' characters of `str` starting at `index`: walks down from
+  // `node` ('0' goes left, any other character goes right), advancing `index` once per edge, and
+  // appends the value of the reached leaf to `out_vec`. Nothing is appended when `node` is null
+  // or when `str` ends before a leaf is reached.
+  void decode(std::shared_ptr<Node<T>> node, std::string &str, std::vector<T> &out_vec,
+              size_t &index)
+  {
+    // Iterative descent; `node` is a by-value copy, so it can double as the tree cursor.
+    while (node != nullptr)
+    {
+      const bool is_leaf = !node->p_left && !node->p_right;
+      if (is_leaf)
+      {
+        out_vec.push_back(node->data);
+        return;
+      }
+
+      if (index == str.size())
+        return; // input exhausted in the middle of a code
+
+      const char bit = str[index];
+      ++index;
+      node = (bit == '0') ? node->p_left : node->p_right;
+    }
+  }
+
+  struct EncodedTreeAndData // bit streams returned by unpackArrayToEncodedTreeAndData()
+  {
+    std::vector<bool> tree_vec{}; // serialized prefix tree, one element per bit
+    std::vector<bool> data_vec{}; // encoded symbols, one element per bit
+  };
+
+  // Splits a packed buffer into the serialized tree bits and the encoded data bits.
+  // Layout read here: [tree size in bits : size_t][data size in bits : size_t][tree bits][data
+  // bits]. Bits are stored MSB-first and the data bits follow the tree bits immediately, i.e.
+  // they may start in the middle of a byte. The buffer length is unknown to this overload, so
+  // both size fields are trusted as they are.
+  EncodedTreeAndData unpackArrayToEncodedTreeAndData(const uint8_t *pack_ptr)
+  {
+    constexpr auto kTreeSizeBytesN = sizeof(size_t);
+    constexpr auto kDataSizeBytesN = sizeof(size_t);
+
+    // Assemble the size fields byte by byte: `pack_ptr` has no alignment guarantee, so reading
+    // through a `const size_t *` (as before) is undefined behavior on a misaligned buffer.
+    const auto read_size = [](const uint8_t *src) {
+      size_t value = 0;
+      auto *dst = static_cast<uint8_t *>(static_cast<void *>(&value));
+      for (size_t i = 0; i < sizeof(size_t); ++i)
+        dst[i] = src[i];
+      return value;
+    };
+    const size_t kTreeSizeInBits = read_size(pack_ptr);
+    const size_t kDataSizeInBits = read_size(pack_ptr + kTreeSizeBytesN);
+
+    auto start_pos = kTreeSizeBytesN + kDataSizeBytesN;
+    EncodedTreeAndData tree_and_data;
+
+    const auto kTreeSizeInBytes =
+      kTreeSizeInBits % CHAR_BIT ? kTreeSizeInBits / CHAR_BIT + 1 : kTreeSizeInBits / CHAR_BIT;
+
+    for (size_t i = 0; i < kTreeSizeInBytes; ++i)
+    {
+      // Only the last tree byte can hold fewer than CHAR_BIT tree bits.
+      const auto kNumOfBits =
+        kTreeSizeInBits - i * CHAR_BIT < CHAR_BIT ? kTreeSizeInBits - i * CHAR_BIT : CHAR_BIT;
+      for (size_t j = 0; j < kNumOfBits; ++j)
+        tree_and_data.tree_vec.push_back((pack_ptr[start_pos + i] & ((1 << 7) >> j)) != 0);
+    }
+
+    const auto kDataSizeInBytes =
+      kDataSizeInBits % CHAR_BIT ? kDataSizeInBits / CHAR_BIT + 1 : kDataSizeInBits / CHAR_BIT;
+    // The data starts kOffsetInBits bits into the byte that holds the last tree bits.
+    const auto kOffsetInBits = kTreeSizeInBits % CHAR_BIT;
+    start_pos += kOffsetInBits ? kTreeSizeInBytes - 1 : kTreeSizeInBytes;
+
+    for (size_t i = 0; i < kDataSizeInBytes; ++i)
+    {
+      const auto kNumOfBits =
+        kDataSizeInBits - i * CHAR_BIT < CHAR_BIT ? kDataSizeInBits - i * CHAR_BIT : CHAR_BIT;
+      // Each group of up to CHAR_BIT data bits spans the tail of byte i ...
+      const auto kBitsInFirstByteToRead =
+        kNumOfBits < CHAR_BIT - kOffsetInBits ? kNumOfBits : CHAR_BIT - kOffsetInBits;
+      for (size_t j = kOffsetInBits; j < kOffsetInBits + kBitsInFirstByteToRead; ++j)
+        tree_and_data.data_vec.push_back((pack_ptr[start_pos + i] & ((1 << 7) >> j)) != 0);
+      if (kNumOfBits < CHAR_BIT - kOffsetInBits)
+        break;
+
+      // ... and, when the data is not byte aligned, the head of byte i + 1.
+      const auto kBitsLeft = kNumOfBits - (CHAR_BIT - kOffsetInBits) < kOffsetInBits
+                               ? kNumOfBits - (CHAR_BIT - kOffsetInBits)
+                               : kOffsetInBits;
+      for (size_t j = 0; j < kBitsLeft; ++j)
+        tree_and_data.data_vec.push_back((pack_ptr[start_pos + i + 1] & ((1 << 7) >> j)) != 0);
+    }
+    return tree_and_data;
+  }
+
+  // Splits a packed vector into the serialized tree bits and the encoded data bits.
+  // Layout read here: [tree size in bits : size_t][data size in bits : size_t][tree bits][data
+  // bits], bits stored MSB-first with the data bits directly after the tree bits.
+  // Returns an empty result when the vector cannot even hold the two size fields.
+  EncodedTreeAndData unpackArrayToEncodedTreeAndData(const std::vector<uint8_t> &packed_vec)
+  {
+    constexpr auto kTreeSizeBytesN = sizeof(size_t);
+    constexpr auto kDataSizeBytesN = sizeof(size_t);
+    if (packed_vec.size() < kTreeSizeBytesN + kDataSizeBytesN)
+      return {};
+
+    const uint8_t *pack_ptr = packed_vec.data();
+    // Assemble the size fields byte by byte instead of dereferencing a `const size_t *`.
+    const auto read_size = [](const uint8_t *src) {
+      size_t value = 0;
+      auto *dst = static_cast<uint8_t *>(static_cast<void *>(&value));
+      for (size_t i = 0; i < sizeof(size_t); ++i)
+        dst[i] = src[i];
+      return value;
+    };
+    const size_t kTreeSizeInBits = read_size(pack_ptr);
+    const size_t kDataSizeInBits = read_size(pack_ptr + kTreeSizeBytesN);
+
+    auto start_pos = kTreeSizeBytesN + kDataSizeBytesN;
+    EncodedTreeAndData tree_and_data;
+
+    const auto kTreeSizeInBytes =
+      kTreeSizeInBits % CHAR_BIT ? kTreeSizeInBits / CHAR_BIT + 1 : kTreeSizeInBits / CHAR_BIT;
+
+    for (size_t i = 0; i < kTreeSizeInBytes; ++i)
+    {
+      const auto kNumOfBits =
+        kTreeSizeInBits - i * CHAR_BIT < CHAR_BIT ? kTreeSizeInBits - i * CHAR_BIT : CHAR_BIT;
+      // Tree bits belong in tree_vec; they used to be appended to data_vec by mistake.
+      for (size_t j = 0; j < kNumOfBits; ++j)
+        tree_and_data.tree_vec.push_back((pack_ptr[start_pos + i] & ((1 << 7) >> j)) != 0);
+    }
+
+    const auto kDataSizeInBytes =
+      kDataSizeInBits % CHAR_BIT ? kDataSizeInBits / CHAR_BIT + 1 : kDataSizeInBits / CHAR_BIT;
+    const auto kOffsetInBits = kTreeSizeInBits % CHAR_BIT;
+    start_pos += kOffsetInBits ? kTreeSizeInBytes - 1 : kTreeSizeInBytes;
+
+    for (size_t i = 0; i < kDataSizeInBytes; ++i)
+    {
+      const auto kNumOfBits =
+        kDataSizeInBits - i * CHAR_BIT < CHAR_BIT ? kDataSizeInBits - i * CHAR_BIT : CHAR_BIT;
+      // First the bits that share byte i with the previous group (or with the tree) ...
+      const auto kBitsInFirstByteToRead =
+        kNumOfBits < CHAR_BIT - kOffsetInBits ? kNumOfBits : CHAR_BIT - kOffsetInBits;
+      for (size_t j = kOffsetInBits; j < kOffsetInBits + kBitsInFirstByteToRead; ++j)
+        tree_and_data.data_vec.push_back((pack_ptr[start_pos + i] & ((1 << 7) >> j)) != 0);
+      if (kNumOfBits < CHAR_BIT - kOffsetInBits)
+        break;
+
+      // ... then the remaining bits of the group from the head of byte i + 1.
+      const auto kBitsLeft = kNumOfBits - (CHAR_BIT - kOffsetInBits) < kOffsetInBits
+                               ? kNumOfBits - (CHAR_BIT - kOffsetInBits)
+                               : kOffsetInBits;
+      for (size_t j = 0; j < kBitsLeft; ++j)
+        tree_and_data.data_vec.push_back((pack_ptr[start_pos + i + 1] & ((1 << 7) >> j)) != 0);
+    }
+    return tree_and_data;
+  }
+
+public:
+  // Decodes one symbol. Starting at `node`, consumes one bit of `vec` per tree edge (false goes
+  // left, true goes right) from the current read position `_decode_idx` and stores the value of
+  // the leaf it ends on in `*dst_ptr`. Nothing is written when `node` is null or when the bits
+  // run out before a leaf is reached.
+  // NOTE exactly one byte is stored per symbol, whatever sizeof(T) is.
+  void decode(std::shared_ptr<Node<T>> node, std::vector<bool> &vec, uint8_t *dst_ptr)
+  {
+    // Iterative descent; `node` is a by-value copy, so it can double as the tree cursor.
+    while (node != nullptr)
+    {
+      const bool is_leaf = !node->p_left && !node->p_right;
+      if (is_leaf)
+      {
+        *dst_ptr = node->data;
+        return;
+      }
+
+      if (_decode_idx == vec.size())
+        return; // input exhausted in the middle of a code
+
+      // Consume the bit and step to the matching child.
+      const bool go_right = vec[_decode_idx];
+      ++_decode_idx;
+      node = go_right ? node->p_right : node->p_left;
+    }
+  }
+
+private:
+  size_t _decode_idx = 0;                    // index of the next unread bit of data_vec
+  EncodedTreeAndData _encoded_tree_and_data; // bit streams unpacked by init_decoder()
+
+public:
+  void init_decoder(const uint8_t *input)
+  {
+    size_t index = _decode_idx = 0; // also rewind the data cursor so a decoder can be re-used
+    _encoded_tree_and_data = unpackArrayToEncodedTreeAndData(input); // split tree / data bits
+    root = importHuffmanTreeFromBoolVec(_encoded_tree_and_data.tree_vec, index); // rebuild tree
+  }
+
+  void reset_decode_idx(void) { _decode_idx = 0; } // next decode starts at the first data bit
+
+  // Decodes up to `num` symbols, one output byte each, into dst_ptr[0], dst_ptr[1], ... and
+  // returns how many were produced; stops early once all data bits have been consumed.
+  int decode_n(uint8_t *dst_ptr, size_t num)
+  {
+    size_t bytes_decoded = 0; // also the loop counter: no signed `int` compared against size_t
+    for (; bytes_decoded < num && _decode_idx < _encoded_tree_and_data.data_vec.size();
+         ++bytes_decoded)
+      decode(root, _encoded_tree_and_data.data_vec, dst_ptr + bytes_decoded);
+    return static_cast<int>(bytes_decoded);
+  }
+
+  HuffmanDecoder() = default;
+
+  std::vector<T> decode(const std::vector<uint8_t> &input)
+  {
+    if (input.size() < 2 * sizeof(size_t))
+      return {}; // shorter than the two size fields that init_decoder() reads unconditionally
+    init_decoder(input.data());
+    std::vector<T> decoded{};
+    T tmp{}; // NOTE(review): decode_n emits one byte per symbol - only 1-byte T round-trips
+    while (decode_n(reinterpret_cast<uint8_t *>(&tmp), sizeof(T)))
+      decoded.push_back(tmp);
+    return decoded;
+  }
+};
+
+} // namespace huffman
+} // namespace luci
+
+#endif // __LUCI_PASS_HELPERS_HUFFMAN_DECODER_H__
diff --git a/compiler/luci/pass/src/helpers/HuffmanEncoder.cpp b/compiler/luci/pass/src/helpers/HuffmanEncoder.cpp
new file mode 100644
index 00000000000..a60b4e18591
--- /dev/null
+++ b/compiler/luci/pass/src/helpers/HuffmanEncoder.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "HuffmanEncoder.h"
+
+namespace luci
+{
+namespace huffman
+{
+
+template <typename T>
+std::shared_ptr<Node<T>> HuffmanEncoder<T>::allocateNode(T data, unsigned int freq,
+                                                         std::shared_ptr<Node<T>> p_left,
+                                                         std::shared_ptr<Node<T>> p_right)
+{
+  auto node = std::make_shared<Node<T>>(); // single allocation for node + control block
+  node->data = data;                       // symbol (0 for inner nodes)
+  node->freq = freq;                       // occurrence count of the subtree
+  node->p_left = p_left;
+  node->p_right = p_right;
+  return node;
+}
+
+template <typename T>
+std::unordered_map<T, unsigned int>
+HuffmanEncoder<T>::calculateFrequencyMap(const std::vector<T> &input)
+{
+  std::unordered_map<T, unsigned int> out_map;
+  // One lookup per element: operator[] value-initializes a missing counter to 0 before the ++
+  for (const auto &item : input)
+    ++out_map[item];
+  return out_map;
+}
+
+template <typename T>
+std::string HuffmanEncoder<T>::exportHuffmanTreeToString(std::shared_ptr<Node<T>> node)
+{
+  // Pre-order serialization of the subtree as '0'/'1' characters:
+  //   inner node -> "1" <left subtree> <right subtree>
+  //   leaf       -> "0" <sizeof(T) * CHAR_BIT value bits, MSB first>
+  if (!node)
+    return "";
+
+  const bool is_leaf = !node->p_left && !node->p_right;
+  if (is_leaf)
+    return "0" + std::bitset<sizeof(T) * CHAR_BIT>(node->data).to_string();
+
+  const std::string left_bits = exportHuffmanTreeToString(node->p_left);
+  return "1" + left_bits + exportHuffmanTreeToString(node->p_right);
+}
+
+template <typename T>
+void HuffmanEncoder<T>::buildHuffmanTable(std::shared_ptr<Node<T>> node, const std::string str)
+{
+  // Depth-first walk storing, for each leaf, the '0'/'1' path to it (`str` is the path so far)
+  if (!node)
+    return;
+
+  const bool is_leaf = !node->p_left && !node->p_right;
+  if (is_leaf)
+    _huffman_table[node->data] = str;
+
+  buildHuffmanTable(node->p_left, str + "0");
+  buildHuffmanTable(node->p_right, str + "1");
+}
+
+template <typename T>
+std::shared_ptr<Node<T>> HuffmanEncoder<T>::buildHuffmanTree(const std::vector<T> &input)
+{
+  auto freq_map = calculateFrequencyMap(input);
+  if (freq_map.empty())
+    return nullptr; // no symbols: the merge loop below would pop from an empty queue
+
+  std::priority_queue<std::shared_ptr<Node<T>>, std::vector<std::shared_ptr<Node<T>>>,
+                      CompareNodes<T>>
+    pq;
+  for (auto &item : freq_map)
+    pq.push(allocateNode(item.first, item.second, nullptr, nullptr));
+
+  // A single distinct symbol would leave a lone leaf as root: its code would be the empty
+  // string and no data bits would be written at all. Add a twin leaf so it gets a 1-bit code.
+  if (pq.size() == 1)
+    pq.push(allocateNode(pq.top()->data, 0, nullptr, nullptr));
+  while (pq.size() != 1)
+  {
+    // Merge the two least frequent nodes under a new inner node.
+    std::shared_ptr<Node<T>> left = pq.top();
+    pq.pop();
+    std::shared_ptr<Node<T>> right = pq.top();
+    pq.pop();
+    pq.push(allocateNode(0, left->freq + right->freq, left, right));
+  }
+  return pq.top();
+}
+
+template <typename T>
+std::vector<uint8_t> HuffmanEncoder<T>::packEncodedDataToArray(const std::string &tree_str,
+                                                               const std::string &encoded_data)
+{
+  // Output layout: [tree size in bits : size_t][data size in bits : size_t][tree bits][data bits]
+  // Both strings hold one '0'/'1' character per bit. The bits are packed MSB-first, the data bits
+  // follow the tree bits without byte alignment and the last byte is zero-padded.
+  // NOTE(review): the size fields are written in the host's size_t width and byte order, so the
+  // reader must run with the same ABI - confirm this holds for every consumer.
+  const size_t kTreeSizeInBits = tree_str.size();
+  const size_t kDataSizeInBits = encoded_data.size();
+  const size_t kMergedSizeInBits = kTreeSizeInBits + kDataSizeInBits;
+  const size_t kMergedSizeInBytes = (kMergedSizeInBits + CHAR_BIT - 1) / CHAR_BIT;
+
+  std::vector<uint8_t> arr;
+  arr.reserve(2 * sizeof(size_t) + kMergedSizeInBytes); // final size is known up front
+
+  const auto append_size = [&arr](const size_t &value) {
+    const auto *bytes = static_cast<const uint8_t *>(static_cast<const void *>(&value));
+    arr.insert(arr.end(), bytes, bytes + sizeof(size_t));
+  };
+  append_size(kTreeSizeInBits);
+  append_size(kDataSizeInBits);
+
+  const auto merged_str = tree_str + encoded_data;
+  for (size_t i = 0; i < kMergedSizeInBytes; ++i)
+  {
+    const auto kNumOfBits =
+      kMergedSizeInBits - i * CHAR_BIT < CHAR_BIT ? kMergedSizeInBits - i * CHAR_BIT : CHAR_BIT;
+
+    // Left-align a short final chunk by appending zeros (no inner loop shadowing `i` any more).
+    std::string tmp_str = merged_str.substr(i * CHAR_BIT, kNumOfBits);
+    tmp_str.append(CHAR_BIT - kNumOfBits, '0');
+
+    const std::bitset<CHAR_BIT> tmp_bitset(tmp_str);
+    arr.push_back(static_cast<uint8_t>(tmp_bitset.to_ullong()));
+  }
+  return arr;
+}
+
+template <typename T> std::vector<uint8_t> HuffmanEncoder<T>::encode(const std::vector<T> &input)
+{
+  // 1. Derive the prefix tree and the per-symbol code table from the input itself.
+  const std::shared_ptr<Node<T>> tree_root = buildHuffmanTree(input);
+  buildHuffmanTable(tree_root);
+  // 2. Serialize the tree so that it travels together with the encoded data.
+  const std::string tree_bits = exportHuffmanTreeToString(tree_root);
+
+  // 3. Concatenate the codes of all input symbols, one '0'/'1' character per bit.
+  std::string data_bits;
+  for (const T &symbol : input)
+    data_bits += _huffman_table[symbol];
+
+  // 4. Pack both bit strings together with their lengths into the output byte array.
+  return packEncodedDataToArray(tree_bits, data_bits);
+}
+
+// Explicit template instantiation
+template class HuffmanEncoder<uint8_t>;
+template class HuffmanEncoder<int8_t>;
+template class HuffmanEncoder<uint16_t>;
+template class HuffmanEncoder<int16_t>;
+
+} // namespace huffman
+} // namespace luci
\ No newline at end of file
diff --git a/compiler/luci/pass/src/helpers/HuffmanEncoder.h b/compiler/luci/pass/src/helpers/HuffmanEncoder.h
new file mode 100644
index 00000000000..6ba715ce8c7
--- /dev/null
+++ b/compiler/luci/pass/src/helpers/HuffmanEncoder.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_PASS_HELPERS_HUFFMAN_ENCODER_H__
+#define __LUCI_PASS_HELPERS_HUFFMAN_ENCODER_H__
+
+#include <bitset>
+#include <climits>
+#include <memory>
+#include <queue>
+#include <string>
+#include <tuple>
+#include <unordered_map>
+#include <vector>
+
+#include "HuffmanCommon.h"
+
+namespace luci
+{
+namespace huffman
+{
+
+// Huffman coder for a sequence of T values; the code table is derived from the input itself
+template <typename T> class HuffmanEncoder
+{
+public:
+  HuffmanEncoder() = default;
+
+  // Encodes input vector of values of type T and returns encoded vector of uint8_t
+  std::vector<uint8_t> encode(const std::vector<T> &input);
+
+private:
+  // Creates a tree node holding the given value, frequency and children
+  std::shared_ptr<Node<T>> allocateNode(T data, unsigned int freq, std::shared_ptr<Node<T>> p_left,
+                                        std::shared_ptr<Node<T>> p_right);
+  // Counts how many times each distinct value occurs in input
+  std::unordered_map<T, unsigned int> calculateFrequencyMap(const std::vector<T> &input);
+  // Serializes the (sub)tree rooted at node as a string of '0'/'1' characters
+  std::string exportHuffmanTreeToString(std::shared_ptr<Node<T>> node);
+  // Fills _huffman_table with the code of every leaf below node; str is the path so far
+  void buildHuffmanTable(std::shared_ptr<Node<T>> node, const std::string str = "");
+  // Builds the prefix tree for input and returns its root
+  std::shared_ptr<Node<T>> buildHuffmanTree(const std::vector<T> &input);
+  // Packs the serialized tree and the encoded data, prefixed by their bit lengths, into bytes
+  std::vector<uint8_t> packEncodedDataToArray(const std::string &tree_str,
+                                              const std::string &encoded_data);
+
+  std::unordered_map<T, std::string> _huffman_table; // value -> code as '0'/'1' characters
+};
+
+} // namespace huffman
+} // namespace luci
+
+#endif // __LUCI_PASS_HELPERS_HUFFMAN_ENCODER_H__
diff --git a/compiler/luci/pass/src/helpers/HuffmanEncoder.test.cpp b/compiler/luci/pass/src/helpers/HuffmanEncoder.test.cpp
new file mode 100644
index 00000000000..3ceec56c656
--- /dev/null
+++ b/compiler/luci/pass/src/helpers/HuffmanEncoder.test.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "HuffmanEncoder.h"
+#include "HuffmanDecoder.h"
+
+#include <algorithm>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+// Fixed 16-element inputs with a few repeated values; the s8 set also contains negative values
+std::vector<int8_t> input_s8{13, 17, -8, -8, 1, 84, 33, 53, -26, 26, -14, -1, 23, 59, 28, -8};
+std::vector<uint8_t> input_u8{13, 17, 218, 8, 1, 84, 33, 53, 26, 26, 14, 1, 23, 59, 28, 8};
+std::vector<uint16_t> input_u16{13, 17, 218, 8, 1, 84, 33, 53, 26, 26, 14, 1, 23, 59, 28, 8};
+} // namespace
+
+TEST(HuffmanEncodeDecodeTest, simple_test_s8)
+{
+  // Round trip: decoding the encoder's output must reproduce the signed 8-bit input exactly
+  luci::huffman::HuffmanEncoder<int8_t> encoder;
+  const std::vector<uint8_t> encoded = encoder.encode(input_s8);
+
+  luci::huffman::HuffmanDecoder<int8_t> decoder;
+  const std::vector<int8_t> decoded = decoder.decode(encoded);
+  EXPECT_EQ(input_s8, decoded);
+}
+
+TEST(HuffmanEncodeDecodeTest, simple_test_u8)
+{
+  // Round trip: decoding the encoder's output must reproduce the unsigned 8-bit input exactly
+  luci::huffman::HuffmanEncoder<uint8_t> encoder;
+  const std::vector<uint8_t> encoded = encoder.encode(input_u8);
+
+  luci::huffman::HuffmanDecoder<uint8_t> decoder;
+  const std::vector<uint8_t> decoded = decoder.decode(encoded);
+  EXPECT_EQ(input_u8, decoded);
+}
+
+//TEST(HuffmanEncodeDecodeTest, simple_test_u16)
+//{
+//  luci::huffman::HuffmanEncoder<uint16_t> encoder;
+//  luci::huffman::HuffmanDecoder<uint16_t> decoder;
+//  // NOTE presumably disabled because HuffmanDecoder stores one byte per symbol (16-bit T fails)
+//  std::vector<uint8_t> encoded = encoder.encode(input_u16);
+//  std::vector<uint16_t> decoded = decoder.decode(encoded);
+//
+//  EXPECT_EQ(input_u16, decoded);
+//}
\ No newline at end of file