Skip to content

Commit

Permalink
Added reference matmul weights decompression kernel
Browse files Browse the repository at this point in the history
  • Loading branch information
dmitry-gorokhov committed Jul 21, 2023
1 parent f2fc37b commit a95cfa3
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 2 deletions.
25 changes: 24 additions & 1 deletion src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -232,11 +232,17 @@ void FullyConnected::getSupportedDescriptors() {
if (getParentEdges().size() != 2 && getParentEdges().size() != 3)
IE_THROW() << errorPrefix << " has incorrect number of input edges";
if (getChildEdges().empty())
IE_THROW()<< errorPrefix << " has incorrect number of output edges";
IE_THROW() << errorPrefix << " has incorrect number of output edges";

withBiases = getOriginalInputsNumber() == 3;

useSparseWeights = useSparseWeightsDecompression();
useWeightsDecompression = canUseWeightsDecompression();
if (!useWeightsDecompression) {
if (!decompressionSubtract.empty() || !decompressionMultiply.empty()) {
IE_THROW() << errorPrefix << " doesn't support weights decompression feature";
}
}

auto inputDataType = DnnlExtensionUtils::IEPrecisionToDataType(getOriginalInputPrecisionAtPort(DATA_ID));
outputDataType = DnnlExtensionUtils::IEPrecisionToDataType(getOriginalOutputPrecisionAtPort(DATA_ID));
Expand Down Expand Up @@ -839,6 +845,10 @@ bool FullyConnected::canBeExecutedInConv1x1() const {
bool retVal = false;
const auto inRank = getInputShapeAtPort(DATA_ID).getRank();
const auto weightRank = getInputShapeAtPort(WEIGHTS_ID).getRank();

if (useWeightsDecompression)
return false;

// disable rank=4:
// if layout is nhwc:
// A matrix: N * IC * H * W --> N * (IC*H*W), the M, N', K of matrix multiply will be:
Expand Down Expand Up @@ -953,6 +963,19 @@ void FullyConnected::fuseDecompressionMultiply(const NodePtr& constData) {
elementsCount);
}

// todo: reuse the method in fusion pass for limitations check
bool FullyConnected::canUseWeightsDecompression() {
    // The decompression kernel requires at least AVX2 on the host CPU.
    const bool isaSupported = impl::cpu::x64::mayiuse(impl::cpu::x64::avx2);
    if (!isaSupported)
        return false;

    // Only FP32 activations combined with U8 (compressed) weights are handled.
    const bool precisionsSupported =
        getOriginalInputPrecisionAtPort(DATA_ID) == Precision::FP32 &&
        getOriginalInputPrecisionAtPort(WEIGHTS_ID) == Precision::U8;

    return precisionsSupported;
}

void FullyConnected::fuseDecompressionSubtract(const NodePtr& constData) {
auto *constInputNode = dynamic_cast<node::Input *>(constData.get());
if (!constInputNode) {
Expand Down
2 changes: 2 additions & 0 deletions src/plugins/intel_cpu/src/nodes/fullyconnected.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ class FullyConnected : public Node {
bool useSparseWeightsDecompression();
VectorDims expectedBiasDims {};

bool canUseWeightsDecompression();
bool useWeightsDecompression = false;
std::vector<float> decompressionSubtract;
std::vector<float> decompressionMultiply;
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,14 @@ TEST_P(MatmulWeightsDecompressionTest, CompareWithRefs) {
namespace {

// Each entry is {activations shape, weights shape}; covers several K (inner
// dimension) and N (output channel) sizes, plus batched activations, to
// exercise different tail/blocking paths of the decompression kernel.
// NOTE(review): entry order is part of the generated parametrized test names —
// keep appends at the end rather than reordering.
std::vector<std::vector<ngraph::Shape>> inputShapes = {
    {{1, 4, 16}, {32, 16}},
    {{1, 4, 16}, {256, 16}},
    {{1, 4, 32}, {256, 32}},
    {{1, 4, 48}, {256, 48}},
    {{1, 4, 512}, {256, 512}},
    {{1, 16, 32}, {64, 32}},
    {{10, 4, 16}, {32, 16}},
    {{10, 40, 496}, {240, 496}},
};

std::vector<size_t> patternTypes = {
Expand Down

0 comments on commit a95cfa3

Please sign in to comment.