[CPU] FullyConnected acceleration with 8bit weights decompression
dmitry-gorokhov committed Aug 2, 2023
1 parent b44f915 commit 402760c
Showing 9 changed files with 67 additions and 11 deletions.
28 changes: 28 additions & 0 deletions src/plugins/intel_cpu/src/dnnl_postops_composer.cpp
@@ -251,5 +251,33 @@ void DnnlPostOpsComposer::appendClip(const std::vector<float>& low, const std::v
}
}

void DnnlPostOpsComposer::appendDecompressionScales(const std::vector<float>& scales) {
if (scales.empty())
return;

int mask = scales.size() > 1 ? weightScaleMaskPerChannel : 0;
DEBUG_LOG("Set weights scales mask ", "DNNL_ARG: ", DNNL_ARG_WEIGHTS, " mask: ", mask);
attr.set_scales_mask(DNNL_ARG_WEIGHTS, mask);

DnnlBlockedMemoryDesc memoryDesc(InferenceEngine::Precision::FP32, Shape({scales.size()}));
auto mem = std::make_shared<Memory>(engine, memoryDesc);
memcpy(mem->getData(), scales.data(), scales.size() * sizeof(float));
args[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] = mem;
}

void DnnlPostOpsComposer::appendDecompressionZeroPoints(const std::vector<float>& zero_points) {
if (zero_points.empty())
return;

int mask = zero_points.size() > 1 ? weightScaleMaskPerChannel : 0;
DEBUG_LOG("Set weights zero points mask ", "DNNL_ARG: ", DNNL_ARG_WEIGHTS, " mask: ", mask);
attr.set_zero_points_mask(DNNL_ARG_WEIGHTS, mask);

DnnlBlockedMemoryDesc memoryDesc(InferenceEngine::Precision::FP32, Shape({zero_points.size()}));
auto mem = std::make_shared<Memory>(engine, memoryDesc);
memcpy(mem->getData(), zero_points.data(), zero_points.size() * sizeof(float));
args[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS] = mem;
}

} // namespace intel_cpu
} // namespace ov
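
Note: the scales and zero points registered above encode a standard asymmetric dequantization of the u8 weights. A minimal host-side sketch of that math, with illustrative names and an OC x IC row-major layout (not the plugin's actual kernel code):

#include <cstddef>
#include <cstdint>
#include <vector>

// w_f32[oc][ic] = scale[oc] * (float(w_u8[oc][ic]) - zero_point[oc])
// A single-element scales/zero_points vector acts as a per-tensor value (mask 0),
// a vector of size OC as per-channel values (weightScaleMaskPerChannel).
std::vector<float> dequantize_weights(const std::vector<uint8_t>& w_u8,
                                      const std::vector<float>& scales,
                                      const std::vector<float>& zero_points,
                                      size_t OC, size_t IC) {
    std::vector<float> w_f32(OC * IC);
    for (size_t oc = 0; oc < OC; ++oc) {
        const float scale = scales.size() > 1 ? scales[oc] : scales[0];
        const float zp = zero_points.empty() ? 0.0f
                       : (zero_points.size() > 1 ? zero_points[oc] : zero_points[0]);
        for (size_t ic = 0; ic < IC; ++ic)
            w_f32[oc * IC + ic] = scale * (static_cast<float>(w_u8[oc * IC + ic]) - zp);
    }
    return w_f32;
}

The fused implementation is expected to apply this conversion on the fly inside the FullyConnected primitive rather than materializing an FP32 copy of the weights.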
3 changes: 3 additions & 0 deletions src/plugins/intel_cpu/src/dnnl_postops_composer.h
@@ -42,6 +42,9 @@ class DnnlPostOpsComposer {
bool appendLinear(const std::vector<float>& scale, const std::vector<float>& shift, bool isLastPostOp, bool allowBinary = true);
void appendClip(const std::vector<float>& low, const std::vector<float>& high);

void appendDecompressionScales(const std::vector<float>& scales);
void appendDecompressionZeroPoints(const std::vector<float>& zero_points);

const VectorDims& getOutputDims() {
return outputDims;
}
9 changes: 9 additions & 0 deletions src/plugins/intel_cpu/src/graph_optimizer.cpp
@@ -75,6 +75,10 @@ void GraphOptimizer::ApplyCommonGraphOptimizations(Graph &graph) {
FuseConvMatmulFCDeconvAndDQScales(graph);
graph.RemoveDroppedNodes();

OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseFCAndWeightsDecompression");
FuseFCAndWeightsDecompression(graph);
graph.RemoveDroppedNodes();

OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseConvolutionAndBias");
FuseConvolutionMatMulDeconvAndBias(graph);
graph.RemoveDroppedNodes();
@@ -289,6 +293,9 @@ void GraphOptimizer::FuseFCAndWeightsDecompression(Graph &graph) {
if (fcNode == nullptr)
continue;

if (!impl::cpu::x64::mayiuse(impl::cpu::x64::avx2))
continue;

const auto parent = fcNode->getParentEdgesAtPort(1)[0]->getParent();
const bool withTranspose = parent->getType() == Type::Transpose;
const NodePtr transposeNode = withTranspose ? parent : nullptr;
Expand Down Expand Up @@ -323,6 +330,8 @@ void GraphOptimizer::FuseFCAndWeightsDecompression(Graph &graph) {
continue;

// Precision limitations
if (fcNode->getOriginalInputPrecisionAtPort(0) != Precision::FP32)
continue;
if (multiplyConstNode->getOriginalOutputPrecisionAtPort(0) != Precision::FP32)
continue;
if (supportedWeightsPrecisions.find(weightsNode->getOriginalOutputPrecisionAtPort(0)) == supportedWeightsPrecisions.end())
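Note: the pattern targeted by FuseFCAndWeightsDecompression is a u8 weights constant followed by Convert, an optional Subtract (zero point), a Multiply (scale) and an optional Transpose feeding the FullyConnected weights input. A hypothetical model fragment of that shape, built with the public OpenVINO opset (op versions, shapes and constant values are illustrative, not taken from the plugin's tests):

#include <openvino/openvino.hpp>
#include <openvino/opsets/opset10.hpp>

std::shared_ptr<ov::Model> make_compressed_weights_matmul() {
    using namespace ov::opset10;
    // Activations stay in FP32; weights are stored as u8 plus f32 scale/zero point.
    auto data = std::make_shared<Parameter>(ov::element::f32, ov::Shape{1, 4, 512});
    auto weights = Constant::create(ov::element::u8, ov::Shape{256, 512},
                                    std::vector<uint8_t>(256 * 512, 1));
    auto convert = std::make_shared<Convert>(weights, ov::element::f32);
    auto zero_point = Constant::create(ov::element::f32, ov::Shape{256, 1}, {128.0f});
    auto subtract = std::make_shared<Subtract>(convert, zero_point);
    auto scale = Constant::create(ov::element::f32, ov::Shape{256, 1}, {0.01f});
    auto multiply = std::make_shared<Multiply>(subtract, scale);
    // In the CPU plugin a MatMul over constant weights becomes the internal
    // FullyConnected node, which is what the fusion pass inspects.
    auto matmul = std::make_shared<MatMul>(data, multiply, false, true);
    return std::make_shared<ov::Model>(ov::OutputVector{matmul}, ov::ParameterVector{data});
}
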
28 changes: 21 additions & 7 deletions src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
@@ -238,17 +238,20 @@ void FullyConnected::getSupportedDescriptors() {
if (getChildEdges().empty())
IE_THROW()<< errorPrefix << " has incorrect number of output edges";

withBiases = getOriginalInputsNumber() == 3;

useSparseWeights = useSparseWeightsDecompression();

auto inputDataType = DnnlExtensionUtils::IEPrecisionToDataType(getOriginalInputPrecisionAtPort(DATA_ID));
outputDataType = DnnlExtensionUtils::IEPrecisionToDataType(getOriginalOutputPrecisionAtPort(DATA_ID));

if (!fusedWith.empty()) {
outputDataType = DnnlExtensionUtils::IEPrecisionToDataType(fusedWith[fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0));
}
auto weightsDataType = DnnlExtensionUtils::IEPrecisionToDataType(getOriginalInputPrecisionAtPort(WEIGHTS_ID));

withBiases = getOriginalInputsNumber() == 3;

useSparseWeights = useSparseWeightsDecompression();
useWeightsDecompressionImpl = dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2) &&
inputDataType == memory::data_type::f32 && weightsDataType == memory::data_type::u8;

// revert back outputDataType on special cases
if (inputDataType == memory::data_type::f32) {
// oneDNN only support f32 output when input is f32, even if FQ is fused
@@ -286,7 +289,7 @@ void FullyConnected::getSupportedDescriptors() {
#ifdef OV_CPU_WITH_MLAS
// MLAS doesn't support post-ops fusing and only supports FP32. INT8 is not enabled yet
// Disable MLAS when FC could fuse post-ops
useMlas = !useSparseWeights &&
useMlas = !useSparseWeights && !useWeightsDecompressionImpl &&
(inputDataType == memory::data_type::f32 && weightsDataType == memory::data_type::f32) &&
fusedWith.empty();
auto wgtDims = getInputShapeAtPort(WEIGHTS_ID).getStaticDims();
@@ -633,6 +636,10 @@ void FullyConnected::setPostOps(dnnl::primitive_attr& attr, const VectorDims& di
DnnlPostOpsComposer dnnlpoc(getEngine(), attr, ops, postOpsArgs, dims, dims.size() - 1, canBeExecutedInInt8(),
1 << 0, getDQScales(), withBiases);

dnnlpoc.appendDecompressionScales(decompressionMultiply);
if (!decompressionSubtract.empty())
dnnlpoc.appendDecompressionZeroPoints(decompressionSubtract);

for (size_t i = 0; i < fusedWith.size(); ++i) {
auto& node = fusedWith[i];
bool isLastPostOp = (i == (fusedWith.size() - 1));
@@ -719,11 +726,15 @@ void FullyConnected::createDescriptorInternal(const dnnl::memory::desc &inputDes
dnnl::memory::data_type wdt = indt;
dnnl::memory::data_type bdt = outdt;

if (one_of(indt, dnnl::memory::data_type::bf16, dnnl::memory::data_type::f16)) {
//oneDNN ARM InnerProduct primitive supports only identical in/out data types
dnnl::memory::data_type original_wdt = DnnlExtensionUtils::IEPrecisionToDataType(getOriginalInputPrecisionAtPort(WEIGHTS_ID));
if (indt == dnnl::memory::data_type::f32 && original_wdt == dnnl::memory::data_type::u8) {
// Weights decompression case
wdt = original_wdt;
} else if (one_of(indt, dnnl::memory::data_type::bf16, dnnl::memory::data_type::f16)) {
#if defined(OPENVINO_ARCH_X86_64)
bdt = dnnl::memory::data_type::f32;
#else
// oneDNN ARM InnerProduct primitive supports only identical in/out data types
bdt = dnnl::memory::data_type::f16;
#endif
} else if (indt == dnnl::memory::data_type::u8 || indt == dnnl::memory::data_type::s8) {
@@ -985,6 +996,9 @@ bool FullyConnected::canBeExecutedInConv1x1() const {
bool retVal = false;
const auto inRank = getInputShapeAtPort(DATA_ID).getRank();
const auto weightRank = getInputShapeAtPort(WEIGHTS_ID).getRank();
if (useWeightsDecompressionImpl) {
return false;
}
// disable rank=4:
// if layout is nhwc:
// A matrix: N * IC * H * W --> N * (IC*H*W), the M, N', K of matrix multiply will be:
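Note: in oneDNN terms, setPostOps above attaches per-output-channel weight scales and zero points to the primitive attributes, and the actual values are provided at execution time as extra memory arguments. A minimal standalone sketch of those mechanics, assuming oneDNN 3.x and the f32 parameter storage used in dnnl_postops_composer.cpp above (primitive creation and execution are elided):

#include <oneapi/dnnl/dnnl.hpp>
#include <unordered_map>

int main() {
    const dnnl::memory::dim OC = 256;  // illustrative output-channel count
    dnnl::engine eng(dnnl::engine::kind::cpu, 0);

    dnnl::primitive_attr attr;
    attr.set_scales_mask(DNNL_ARG_WEIGHTS, 1 << 0);       // one scale per output channel
    attr.set_zero_points_mask(DNNL_ARG_WEIGHTS, 1 << 0);  // one zero point per output channel

    // Runtime arguments are keyed the same way as in
    // DnnlPostOpsComposer::appendDecompressionScales/appendDecompressionZeroPoints.
    dnnl::memory::desc vec_md({OC}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::a);
    std::unordered_map<int, dnnl::memory> args;
    args[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] = dnnl::memory(vec_md, eng);
    args[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS] = dnnl::memory(vec_md, eng);
    // attr would then go into the inner_product primitive_desc, and args (together
    // with the DNNL_ARG_SRC/WEIGHTS/DST memories) into primitive::execute().
    return 0;
}
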
1 change: 1 addition & 0 deletions src/plugins/intel_cpu/src/nodes/fullyconnected.h
@@ -115,6 +115,7 @@ class FullyConnected : public Node {
void prepackMLASWeight();
#endif

bool useWeightsDecompressionImpl = false;
std::vector<float> decompressionSubtract;
std::vector<float> decompressionMultiply;
};
@@ -188,8 +188,6 @@ std::vector<std::string> disabledTestPatterns() {
// New plugin API doesn't support changes of pre-processing
R"(.*(Auto|Multi|Hetero).*InferRequestPreprocessTest.*SetPreProcessToInputInfo.*)",
R"(.*(Auto|Multi|Hetero).*InferRequestPreprocessTest.*SetPreProcessToInferRequest.*)",
// Issue: 113727
R"(.*MatMulCompressedWeights.*)",
};

#if defined(OPENVINO_ARCH_X86)
@@ -220,6 +220,9 @@ const std::vector<std::vector<InputShape>> input_shapes_basic = {
{{{}, {{1, 4, 48}}}, {{}, {{48, 256}}}},
{{{}, {{1, 4, 512}}}, {{}, {{512, 256}}}},
{{{}, {{1, 16, 32}}}, {{}, {{32, 64}}}},
{{{}, {{2, 4, 32}}}, {{}, {{32, 65}}}},
{{{}, {{11, 339, 377}}}, {{}, {{377, 335}}}},
{{{}, {{3, 12, 768}}}, {{}, {{768, 1024}}}},
};
const std::vector<fusingSpecificParams> fusingParamsSet {
emptyFusingSpec,
@@ -87,7 +87,7 @@ function(create_target_per_test_for_directory TEST_DIR TARGET_PREFIX)
endfunction()

if(ENABLE_CPU_SPECIFIC_TARGET_PER_TEST)
create_target_per_test_for_directory(${CMAKE_CURRENT_SOURCE_DIR}/subgraph_tests/src/arm ov_cpu_func_subgraph)
create_target_per_test_for_directory(${CMAKE_CURRENT_SOURCE_DIR}/subgraph_tests/src ov_cpu_func_subgraph)
create_target_per_test_for_directory(${CMAKE_CURRENT_SOURCE_DIR}/single_layer_tests ov_cpu_func_slt)
endif()

Expand Down
