Samsung · hseok-oh · Aug 19, 2024 · Aug 20, 2024 · Aug 20, 2024 · Aug 22, 2024
diff --git a/nnpackage/schema/circle_schema.fbs b/nnpackage/schema/circle_schema.fbs
@@ -45,8 +45,8 @@ file_extension "circle";
 // end to ensure backwards compatibility.
 
 // The type of data stored in a tensor.
+// Q4_0, Q4_1, Q8_0, Q8_1 follow the ggml quantization spec (https://github.com/ggerganov/ggml)
 enum TensorType : byte {
-  UINT4 = -1,
   FLOAT32 = 0,
   FLOAT16 = 1,
   INT32 = 2,
@@ -68,6 +68,12 @@ enum TensorType : byte {
   UINT32 = 15,
   UINT16 = 16,
   INT4 = 17,
+  // The values below are negative to represent TensorTypes absent from the TensorFlow Lite schema
+  UINT4 = -1,
+  Q4_0 = -2,
+  Q4_1 = -3,
+  Q8_0 = -4,
+  Q8_1 = -5,
 }
 
 // Custom quantization parameters for experimenting with new quantization
@@ -78,7 +84,7 @@ table CustomQuantization {
 
 // Represents a specific quantization technique's parameters.
 union QuantizationDetails {
-  CustomQuantization,
+  CustomQuantization
 }
 
 // Parameters for converting a quantized tensor back to float.

diff --git a/runtime/libs/circle-schema/include/circle_schema_generated.h b/runtime/libs/circle-schema/include/circle_schema_generated.h
@@ -701,6 +701,10 @@ struct ModelT;
 
 enum TensorType : int8_t
 {
+  TensorType_Q8_1 = -5,
+  TensorType_Q8_0 = -4,
+  TensorType_Q4_1 = -3,
+  TensorType_Q4_0 = -2,
   TensorType_UINT4 = -1,
   TensorType_FLOAT32 = 0,
   TensorType_FLOAT16 = 1,
@@ -720,13 +724,14 @@ enum TensorType : int8_t
   TensorType_UINT32 = 15,
   TensorType_UINT16 = 16,
   TensorType_INT4 = 17,
-  TensorType_MIN = TensorType_UINT4,
+  TensorType_MIN = TensorType_Q8_1,
   TensorType_MAX = TensorType_INT4
 };
 
-inline const TensorType (&EnumValuesTensorType())[19]
+inline const TensorType (&EnumValuesTensorType())[23]
 {
   static const TensorType values[] = {
+    TensorType_Q8_1,       TensorType_Q8_0,      TensorType_Q4_1,     TensorType_Q4_0,
     TensorType_UINT4,      TensorType_FLOAT32,   TensorType_FLOAT16,  TensorType_INT32,
     TensorType_UINT8,      TensorType_INT64,     TensorType_STRING,   TensorType_BOOL,
     TensorType_INT16,      TensorType_COMPLEX64, TensorType_INT8,     TensorType_FLOAT64,
@@ -737,18 +742,18 @@ inline const TensorType (&EnumValuesTensorType())[19]
 
 inline const char *const *EnumNamesTensorType()
 {
-  static const char *const names[20] = {"UINT4",   "FLOAT32", "FLOAT16",    "INT32",  "UINT8",
-                                        "INT64",   "STRING",  "BOOL",       "INT16",  "COMPLEX64",
-                                        "INT8",    "FLOAT64", "COMPLEX128", "UINT64", "RESOURCE",
-                                        "VARIANT", "UINT32",  "UINT16",     "INT4",   nullptr};
+  static const char *const names[24] = {
+    "Q8_1",       "Q8_0",   "Q4_1",     "Q4_0",    "UINT4",  "FLOAT32",   "FLOAT16", "INT32",
+    "UINT8",      "INT64",  "STRING",   "BOOL",    "INT16",  "COMPLEX64", "INT8",    "FLOAT64",
+    "COMPLEX128", "UINT64", "RESOURCE", "VARIANT", "UINT32", "UINT16",    "INT4",    nullptr};
   return names;
 }
 
 inline const char *EnumNameTensorType(TensorType e)
 {
-  if (::flatbuffers::IsOutRange(e, TensorType_UINT4, TensorType_INT4))
+  if (::flatbuffers::IsOutRange(e, TensorType_Q8_1, TensorType_INT4))
     return "";
-  const size_t index = static_cast<size_t>(e) - static_cast<size_t>(TensorType_UINT4);
+  const size_t index = static_cast<size_t>(e) - static_cast<size_t>(TensorType_Q8_1);
   return EnumNamesTensorType()[index];
 }
 

diff --git a/runtime/onert/core/include/ir/DataType.h b/runtime/onert/core/include/ir/DataType.h
@@ -39,6 +39,8 @@ enum class DataType
   QUANT_INT16_ASYMM = 10,
   QUANT_INT8_SYMM_PER_CHANNEL = 11,
   QUANT_INT16_SYMM = 12,
+  QUANT_GGML_Q4_0 = 13,
+  QUANT_GGML_Q8_0 = 14
 };
 
 size_t sizeOfDataType(DataType data_type);

diff --git a/runtime/onert/core/include/ir/OperandInfo.h b/runtime/onert/core/include/ir/OperandInfo.h
@@ -120,7 +120,7 @@ class OperandInfo
    * @brief   Return size of tensor (bytes)
    * @return  Tensor size
    */
-  size_t total_size() const { return _shape.num_elements() * sizeOfDataType(_typeInfo.type()); }
+  size_t total_size() const;
 
   MemAllocType memAllocType() const { return _alloc_type; }
   void setAsConstant() { _const = true; }

diff --git a/runtime/onert/core/src/ir/DataType.cc b/runtime/onert/core/src/ir/DataType.cc
@@ -52,6 +52,9 @@ size_t sizeOfDataType(DataType data_type)
       return sizeof(int16_t);
     case DataType::QUANT_INT16_SYMM:
       return sizeof(int16_t);
+    // Chunk-based types have no per-element size, so they are not supported here
+    // case DataType::QUANT_GGML_Q4_0:
+    // case DataType::QUANT_GGML_Q8_0
     default:
       throw std::runtime_error{"Unsupported type size"};
   }

diff --git a/runtime/onert/core/src/ir/OperandInfo.cc b/runtime/onert/core/src/ir/OperandInfo.cc
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ir/OperandInfo.h"
+
+#include <cassert>
+
+namespace onert
+{
+namespace ir
+{
+
+size_t OperandInfo::total_size() const
+{
+  // Returns the operand's buffer size in bytes, derived from its shape and data type.
+  const auto data_type = _typeInfo.type();
+  const bool is_q4_0 = (data_type == DataType::QUANT_GGML_Q4_0);
+  const bool is_q8_0 = (data_type == DataType::QUANT_GGML_Q8_0);
+
+  // Element-wise types: element count * element size. sizeOfDataType() throws
+  // std::runtime_error for unsupported types; that exception propagates untouched.
+  if (!is_q4_0 && !is_q8_0)
+    return _shape.num_elements() * sizeOfDataType(data_type);
+
+  // GGML chunk types pack 32 elements per chunk, so size is counted per chunk.
+  // Assume last dim is multiple of chunk size (32)
+  assert(_shape.rank() > 0 && _shape.dim(_shape.rank() - 1) % 32 == 0);
+  const auto num_chunks = _shape.num_elements() / 32;
+  // Chunk bytes: 32 packed 4-bit (Q4_0) or 8-bit (Q8_0) values plus a 16-bit header
+  const auto chunk_size = is_q4_0 ? (sizeof(uint8_t) * 32 / 2 + sizeof(uint16_t))
+                                  : (sizeof(uint8_t) * 32 + sizeof(uint16_t));
+  return num_chunks * chunk_size;
+}
+
+} // namespace ir
+} // namespace onert
diff --git a/runtime/onert/core/src/loader/BaseLoader.h b/runtime/onert/core/src/loader/BaseLoader.h
@@ -94,6 +94,7 @@ template <typename LoaderDomain> class BaseLoader
 
   // Helper functions
   ir::Activation convertActivation(ActivationFunctionType type);
+  virtual ir::DataType getTensorDataType(const Tensor *tensor);
   ir::DataType tensorTypeToDataType(TensorType type);
   ir::OperandIndex tensorIdxToOperandIdx(int32_t tensorIdx);
   flexbuffers::Map getCustomOpAttrMap(const Operator *op);
@@ -295,6 +296,12 @@ BaseLoader<LoaderDomain>::BaseLoader::convertActivation(const ActivationFunction
   }
 }
 
+template <typename LoaderDomain>
+ir::DataType BaseLoader<LoaderDomain>::BaseLoader::getTensorDataType(const Tensor *tensor)
+{
+  return tensorTypeToDataType(tensor->type()); // default mapping; virtual, loaders may override
+}
+
 template <typename LoaderDomain>
 ir::DataType BaseLoader<LoaderDomain>::BaseLoader::tensorTypeToDataType(const TensorType type)
 {
@@ -381,7 +388,7 @@ ir::OperandIndex BaseLoader<LoaderDomain>::loadOperand(const Tensor *tensor, ir:
   //       be used.
 
   // TypeInfo
-  ir::TypeInfo type_info(tensorTypeToDataType(tensor->type()));
+  ir::TypeInfo type_info(getTensorDataType(tensor));
   loadQuantization(tensor, type_info);
   loadSparsity(tensor, type_info);
 

diff --git a/runtime/onert/core/src/loader/CircleLoader.cc b/runtime/onert/core/src/loader/CircleLoader.cc
@@ -87,6 +87,17 @@ class CircleLoader final : public loader::BaseLoader<LoaderDomain>
     }
   }
 
+protected:
+  // GGML chunk types are circle-only; every other type goes through the shared mapping
+  ir::DataType getTensorDataType(const Tensor *tensor) override
+  {
+    const auto tensor_type = tensor->type();
+    if (tensor_type == TensorType::TensorType_Q4_0)
+      return ir::DataType::QUANT_GGML_Q4_0;
+    return (tensor_type == TensorType::TensorType_Q8_0) ? ir::DataType::QUANT_GGML_Q8_0
+                                                        : tensorTypeToDataType(tensor_type);
+  }
+
 private:
   std::unique_ptr<ir::Graph> loadSubgraph(const circle::SubGraph *circle_subg) override
   {