From 4c5d97920e36f330fa6e466822b7579834dfe15b Mon Sep 17 00:00:00 2001
From: Tixxx <tix@microsoft.com>
Date: Tue, 11 May 2021 10:24:32 -0700
Subject: [PATCH] fixed correctness for both depthToSpace and new packed matmul
 migrating this to the new repo...

---
 .../super_resolution_model_zoo/karma.conf.js  |   1 +
 .../super_resolution_model_zoo/package.json   |   4 +-
 .../super_resolution_model_zoo/src/index.js   |   6 +-
 lib/backends/webgl/ops/conv-pack.ts           |  11 +-
 lib/backends/webgl/ops/matmul-pack.ts         |  73 +++---
 .../backends/webgl/test_matmul_packed.ts      | 247 ++++++++++++++++++
 test/unittests/index.ts                       |   1 +
 7 files changed, 302 insertions(+), 41 deletions(-)
 create mode 100644 test/unittests/backends/webgl/test_matmul_packed.ts

diff --git a/benchmark/super_resolution_model_zoo/karma.conf.js b/benchmark/super_resolution_model_zoo/karma.conf.js
index 92c2c1fc..1daabad6 100644
--- a/benchmark/super_resolution_model_zoo/karma.conf.js
+++ b/benchmark/super_resolution_model_zoo/karma.conf.js
@@ -59,6 +59,7 @@ module.exports = function(config) {
       printMatches: false,
       // To enable pack, run 'PACK=1 npm run test'
       usePackedGlTexture: config.usePackedGlTexture==1 ? true : false,
+      runIteration: config.runIteration ? config.runIteration : 10,
       profile: config.profile
     },
     browsers: ['ChromeTest', 'ChromeDebug', 'Edge', 'Safari'],
diff --git a/benchmark/super_resolution_model_zoo/package.json b/benchmark/super_resolution_model_zoo/package.json
index 76867407..f5a9b000 100644
--- a/benchmark/super_resolution_model_zoo/package.json
+++ b/benchmark/super_resolution_model_zoo/package.json
@@ -6,8 +6,8 @@
   "scripts": {
     "build": "webpack --config ./webpack.conf.js --mode production",
     "build-debug": "webpack --config ./webpack.conf.js --mode development",
-    "test": "karma start --browsers ChromeTest --single-run --usePackedGlTexture=$PACK",
-    "profile": "karma start --browsers ChromeTest --single-run --profile --usePackedGlTexture=$PACK",
+    "test": "karma start --browsers ChromeTest --single-run --usePackedGlTexture=$PACK --runIteration=$RUNCOUNT",
+    "profile": "karma start --browsers ChromeTest --single-run --profile --usePackedGlTexture=$PACK --runIteration=$RUNCOUNT",
     "test-debug": "karma start --browsers ChromeDebug",
     "test-edge": "karma start --browsers Edge --single-run",
     "test-safari": "karma start --browsers Safari --single-run"
diff --git a/benchmark/super_resolution_model_zoo/src/index.js b/benchmark/super_resolution_model_zoo/src/index.js
index 776cd7e1..725ed246 100644
--- a/benchmark/super_resolution_model_zoo/src/index.js
+++ b/benchmark/super_resolution_model_zoo/src/index.js
@@ -99,9 +99,9 @@ async function runBenchmark(benchmarkData, backend, imageSize) {
     const imageLoader = new ImageLoader(imageSize, imageSize);
     const durations = [];
     for(const input of benchmarkData.inputs) {
-        console.log(`Running ${input.name}`)
+        console.log(`Running ${input.name} for ${runIteration} iterations.`)
         const imageData = await imageLoader.getImageData(input.url);
-        for(let i = 0 ; i < 10; i++) {
+        for(let i = 0 ; i < runIteration; i++) {
           const outputData = await impl.runModel(imageData.data);
           durations.push(impl.duration);
         }
@@ -125,6 +125,7 @@ class TensorFlowResnetBenchmark {
         this.imageSize = imageSize;
         tf.disposeVariables();
         tf.env().set('WEBGL_PACK', pack_texture);
+
         console.log(`Pack mode enabled: ${tf.env().getBool('WEBGL_PACK')}`);
         if(backend) {
             console.log(`Setting the backend to ${backend}`);
@@ -261,6 +262,7 @@ const results = [];
 const browser = __karma__.config.browser[0];
 const profile = __karma__.config.profile;
 const pack_texture = __karma__.config.usePackedGlTexture;
+const runIteration = __karma__.config.runIteration;
 
 console.log(`browser: ${browser}`)
 describe('super resolution Tests', ()=> {
diff --git a/lib/backends/webgl/ops/conv-pack.ts b/lib/backends/webgl/ops/conv-pack.ts
index 414bf64d..cb130279 100644
--- a/lib/backends/webgl/ops/conv-pack.ts
+++ b/lib/backends/webgl/ops/conv-pack.ts
@@ -15,7 +15,7 @@ import {WebGLReshapePacked} from './reshape-packed';
 export class WebGLConvPacked extends Conv {
   protected artifacts: Artifact[];
   protected programInfo: ProgramInfo[];
-
+  protected outputShape: number[];
   run(inferenceHandler: WebGLInferenceHandler, inputs: Tensor[]): Tensor[] {
     const programManager = inferenceHandler.session.programManager;
     const xshape = inputs[0].dims.slice();
@@ -33,8 +33,8 @@ export class WebGLConvPacked extends Conv {
         `autpPad:${this.autoPad}, dilations:${this.dilations}, group:${this.group}, kernelShape:${
             this.kernelShape}, pads:${this.pads}, strides:${this.strides}`);
 
-    const outputShape = WebGLConv.calcOutputShape(xshape, kshape, this.dilations, this.pads, this.strides);
-    const im2col = new WebGLIm2ColPacked(outputShape, kshape, this.dilations, this.pads, this.strides);
+    this.outputShape = WebGLConv.calcOutputShape(xshape, kshape, this.dilations, this.pads, this.strides);
+    const im2col = new WebGLIm2ColPacked(this.outputShape, kshape, this.dilations, this.pads, this.strides);
     const matmul = new WebGLMatMulPacked();
     const reshape = new WebGLReshapePacked();
     // shape for kernel reshape
@@ -76,11 +76,10 @@ export class WebGLConvPacked extends Conv {
     inferenceHandler.checkAndUpdateTextureForm(this.artifacts[2], runDataMatmul);
     programManager.run(this.artifacts[2], runDataMatmul);
     const matmulOutput = runDataMatmul.outputTextureData.tensor;
-
     // reshape output
     const outputShapeTensor = new Tensor(
-        [outputShape.length], 'int32', undefined, undefined,
-        new Int32Array([outputShape[0], outputShape[1], outputShape[2], outputShape[3]]));
+        [this.outputShape.length], 'int32', undefined, undefined,
+        new Int32Array([this.outputShape[0], this.outputShape[1], this.outputShape[2], this.outputShape[3]]));
 
     assert(this.artifacts.length > 2, () => 'expect at least 3 artifacts created');
     if (this.artifacts.length === 3) {
diff --git a/lib/backends/webgl/ops/matmul-pack.ts b/lib/backends/webgl/ops/matmul-pack.ts
index 2f9e714e..eeefa90b 100644
--- a/lib/backends/webgl/ops/matmul-pack.ts
+++ b/lib/backends/webgl/ops/matmul-pack.ts
@@ -4,8 +4,10 @@
 import {MatMul} from '../../../ops/matmul';
 import {Tensor} from '../../../tensor';
 import {BroadcastUtil} from '../../../util';
+import {getGlsl} from '../glsl-source';
 import {WebGLInferenceHandler} from '../inference-handler';
 import {ProgramInfo, RunData, WebGLOperator} from '../types';
+import {getCoordsDataType} from '../utils';
 
 export class WebGLMatMulPacked extends MatMul implements WebGLOperator {
   run(inferenceHandler: WebGLInferenceHandler, inputs: Tensor[]): Tensor[] {
@@ -13,7 +15,7 @@ export class WebGLMatMulPacked extends MatMul implements WebGLOperator {
   }
   createProgramInfo(handler: WebGLInferenceHandler, inputs: Tensor[]): ProgramInfo {
     const hasBias = inputs.length > 2;
-    const processBias = hasBias ? `value += vec4(getBias(a[0]*2).xx, getBias(a[0]*2).yy);` : ``;
+    const processBias = hasBias ? 'result += getBiasAtOutCoords();' : '';
     const aShape = inputs[0].dims;
     const bShape = inputs[1].dims;
     const outputShape = BroadcastUtil.calcShape(aShape, bShape, true);
@@ -21,34 +23,43 @@ export class WebGLMatMulPacked extends MatMul implements WebGLOperator {
     if (!outputShape) {
       throw new Error('Can\'t use matmul on the given tensors');
     }
-    const rank = outputShape.length;
+
+    const sharedDim = aShape[aShape.length - 1];
+    const sharedDimIndex = Math.ceil(sharedDim / 2);
+
     const aRank = aShape.length;
     const bRank = bShape.length;
-    const sharedDim = aShape[aShape.length - 1];
-    // TODO:fix broadcasting
+
+    const glsl = getGlsl(handler.session.backend.glContext.version);
+    const coordsDataType = getCoordsDataType(outputShape.length);
+    const allGlChannels = ['x', 'y', 'z', 'w', 'u', 'v'];
+
     const shaderSource = `
-      vec4 process(int indices[${rank}]) {
-          int a[${aRank}];
-          int b[${bRank}];
-          bcastMatmulIndices_A(indices, a);
-          bcastMatmulIndices_B(indices, b);
+    void main() {
+      ${coordsDataType} rc = getOutputCoords();
+
+      vec4 result = vec4(0);
+
+      for (int i = 0; i < ${sharedDimIndex}; i++) {
+        vec4 a = getA(${getA(allGlChannels, aRank)});
+        vec4 b = getB(${getB(allGlChannels, bRank)});
+
+        result += (a.rrbb * b.rgrg);
+        result += (a.ggaa * b.baba);
+      }
+
+      ${processBias}
+
+      ${glsl.output} = result;
+    }`;
 
-          vec4 value;
-          for (int k=0; k<((${sharedDim}+1)/2); ++k) {
-              a[${aRank - 1}] = k;
-              b[${bRank - 2}] = k;
-              value += ${getA(aRank)}.rrbb * ${getB(bRank)}.rgrg;
-              value += ${getA(aRank)}.ggaa * ${getB(bRank)}.baba;
-          }
-          ${processBias}
-          return value;
-      }`;
     return {
       inputLayouts: inputs.map((t, i) => handler.getOrCreateTextureLayout(t, 4, true, inputs[i].dims, true)),
       outputLayout:
           handler.createTextureLayoutFromShape(outputShape, 4, outputShape, {isPacked: true, reverseWH: true}),
       samplers: hasBias ? ['A', 'B', 'Bias'] : ['A', 'B'],
       shaderSource,
+      hasMain: true,
       expectPackedInputs: true,
       expectPackedOutputs: true,
     };
@@ -64,22 +75,44 @@ export class WebGLMatMulPacked extends MatMul implements WebGLOperator {
   }
 }
 
-function getA(outputRank: number): string {
-  let res = 'getA(';
-  for (let i = 0; i < outputRank - 2; i++) {
-    res += `a[${i}], `;
+/**
+ * Builds the comma-separated argument list for the shader's `getA(...)` call.
+ * Every dimension of A except the last is read from the output coords `rc`; the last (shared)
+ * dimension is `i*2`, where `i` is the shader's loop counter.
+ * `i*2` rather than `i<<1`: bitwise operators are reserved in GLSL ES 1.00 (WebGL1).
+ * NOTE(review): channels are chosen by A's own rank; confirm they still line up with `rc` when A is
+ * broadcast to a higher-rank output.
+ * @param allGlChannels component names used to index `rc`, one per dimension.
+ * @param rank number of dimensions of A.
+ * @returns the GLSL argument expressions, without the enclosing call.
+ */
+function getA(allGlChannels: string[], rank: number): string {
+  let res = '';
+  for (let i = 0; i < rank - 2; i++) {
+    res += `rc.${allGlChannels[i]}, `;
   }
-  res += `a[${outputRank - 2}]*2, ` +
-      'k*2)';
+  res += `rc.${allGlChannels[rank - 2]}, ` +
+      'i*2';
   return res;
 }
 
-function getB(outputRank: number): string {
-  let res = 'getB(';
-  for (let i = 0; i < outputRank - 2; i++) {
-    res += `b[${i}], `;
+/**
+ * Builds the comma-separated argument list for the shader's `getB(...)` call.
+ * The leading dimensions and the last dimension of B are read from the output coords `rc`; the
+ * second-to-last (shared) dimension is `i*2`, where `i` is the shader's loop counter.
+ * `i*2` rather than `i<<1`: bitwise operators are reserved in GLSL ES 1.00 (WebGL1).
+ * NOTE(review): channels are chosen by B's own rank; confirm they still line up with `rc` when B is
+ * broadcast to a higher-rank output.
+ * @param allGlChannels component names used to index `rc`, one per dimension.
+ * @param rank number of dimensions of B.
+ * @returns the GLSL argument expressions, without the enclosing call.
+ */
+function getB(allGlChannels: string[], rank: number): string {
+  let res = '';
+  for (let i = 0; i < rank - 2; i++) {
+    res += `rc.${allGlChannels[i]}, `;
   }
-  res += 'k*2, ' +
-      `b[${outputRank - 1}]*2)`;
+  res += 'i*2, ' +
+      `rc.${allGlChannels[rank - 1]}`;
   return res;
 }
diff --git a/test/unittests/backends/webgl/test_matmul_packed.ts b/test/unittests/backends/webgl/test_matmul_packed.ts
new file mode 100644
index 00000000..f4eeab9b
--- /dev/null
+++ b/test/unittests/backends/webgl/test_matmul_packed.ts
@@ -0,0 +1,247 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT license.
+import {expect} from 'chai';
+
+import {Backend, InferenceHandler, SessionHandler} from '../../../../lib/backend';
+import {WebGLBackend} from '../../../../lib/backends/backend-webgl';
+import {WebGLInferenceHandler} from '../../../../lib/backends/webgl/inference-handler';
+import {WebGLMatMulPacked} from '../../../../lib/backends/webgl/ops/matmul-pack';
+import {Profiler} from '../../../../lib/instrument';
+import {Tensor} from '../../../../lib/tensor';
+import {ShapeUtil} from '../../../../lib/util';
+
+import {createAscendingArray} from './test_utils';
+import {createTextureFromArray} from './test_utils';
+
+let backend: Backend|undefined;
+let sessionhandler: SessionHandler|undefined;
+let inferenceHandler: InferenceHandler|undefined;
+
+describe('#UnitTest# - packed matmul - Tensor matmul', () => {
+  before('Initialize Context', async () => {
+    const profiler = Profiler.create();
+    backend = await Backend('webgl');
+    // Explicitly set to true to trigger packed version
+    (backend as WebGLBackend).pack = true;
+    sessionhandler = backend.createSessionHandler({profiler});
+    inferenceHandler = sessionhandler.createInferenceHandler();
+  });
+
+  // Set it back to false; apparently this state is sticky across all tests running in the same browser session.
+  after('Resetting Context', () => {
+    (backend as WebGLBackend).pack = false;
+  });
+
+  const testDataSet = getTestData();
+  for (let k = 0; k < testDataSet.length; ++k) {
+    const testData = testDataSet[k];
+    describe(`Test matmul ${JSON.stringify(testData)}`, () => {});
+    it(`Test packed matmul kernel `, () => {
+      const webglInferenceHandler = inferenceHandler as WebGLInferenceHandler;
+
+      // TODO support WebGl 1.0
+      if (webglInferenceHandler.session.textureManager.glContext.version === 1) {
+        console.log('Running packed matmul with webgl1 is not supported. Skipping.');
+        return;
+      }
+
+      const op = new WebGLMatMulPacked();
+
+      const elementCountA = testData.elementCountA;
+      const elementCountB = testData.elementCountB;
+
+      const inputTensorShapeA = testData.inputShapeA;
+      const inputTextureShapeA = testData.inputTextureShapeA;
+
+      const inputTensorShapeB = testData.inputShapeB;
+      const inputTextureShapeB = testData.inputTextureShapeB;
+
+      // create input data and tensors. Unless the test case sets rawInputA/rawInputB, this data is also
+      // used as the contents of the packed input textures.
+      const inputDataA = createAscendingArray(elementCountA);
+      const inputDataB = createAscendingArray(elementCountB);
+      const inputTensorA = new Tensor(inputTensorShapeA, 'float32', undefined, undefined, inputDataA);
+      const inputTensorB = new Tensor(inputTensorShapeB, 'float32', undefined, undefined, inputDataB);
+
+      // manually create packed textures from the input tensors, and insert them in the cache
+      const gl = webglInferenceHandler.session.textureManager.glContext.gl;
+
+      webglInferenceHandler.session.textureManager.glContext.checkError();
+      const webglTextureA = createTextureFromArray(
+          webglInferenceHandler.session.textureManager.glContext, testData.rawInputA ? testData.rawInputA : inputDataA,
+          gl.RGBA, inputTextureShapeA[0], inputTextureShapeA[1]);
+      const webglTextureB = createTextureFromArray(
+          webglInferenceHandler.session.textureManager.glContext, testData.rawInputB ? testData.rawInputB : inputDataB,
+          gl.RGBA, inputTextureShapeB[0], inputTextureShapeB[1]);
+
+      webglInferenceHandler.session.textureManager.glContext.checkError();
+      const packedShapeA = inputTextureShapeA;
+      const textureDataA = {
+        width: inputTextureShapeA[0],
+        height: inputTextureShapeA[1],
+        channels: 4 as const,
+        isPacked: true,
+        shape: packedShapeA,
+        strides: ShapeUtil.computeStrides(packedShapeA),
+        unpackedShape: inputTensorShapeA,
+        tensor: inputTensorA,
+        texture: webglTextureA!
+      };
+
+      const packedShapeB = inputTextureShapeB;
+      const textureDataB = {
+        width: inputTextureShapeB[0],
+        height: inputTextureShapeB[1],
+        channels: 4 as const,
+        isPacked: true,
+        shape: packedShapeB,
+        strides: ShapeUtil.computeStrides(packedShapeB),
+        unpackedShape: inputTensorShapeB,
+        tensor: inputTensorB,
+        texture: webglTextureB!
+      };
+
+      webglInferenceHandler.setTextureData(inputTensorA.dataId, textureDataA, true);
+      webglInferenceHandler.setTextureData(inputTensorB.dataId, textureDataB, true);
+
+      const inputList = testData.biasValue ?
+          [
+            inputTensorA, inputTensorB,
+            new Tensor([1], 'float32', undefined, undefined, new Float32Array([testData.biasValue]))
+          ] :
+          [inputTensorA, inputTensorB];
+
+      // compile shader code
+      const programInfo = op.createProgramInfo(inferenceHandler! as WebGLInferenceHandler, inputList);
+
+      const artifact = webglInferenceHandler.session.programManager.build(programInfo);
+      webglInferenceHandler.session.programManager.setArtifact(op, artifact);
+
+      // run kernel and get output
+      const runData = op.createRunData(webglInferenceHandler, artifact.programInfo, inputList);
+      webglInferenceHandler.session.programManager.run(artifact, runData);
+      const result = runData.outputTextureData.tensor.data;
+
+      webglInferenceHandler.session.textureManager.glContext.checkError();
+      // verify result.
+      const expectedOutput = testData.expectedOutput;
+      expect(result).to.not.equal(null);
+      let batchMultiplier = 1;
+      if (testData.inputShapeA.length > 2) {
+        batchMultiplier = testData.inputShapeA[0];
+      }
+      if (testData.inputShapeB.length > 2) {
+        batchMultiplier = Math.max(batchMultiplier, testData.inputShapeB[0]);
+      }
+
+      expect(result).to.have.lengthOf(
+          batchMultiplier * testData.inputShapeA[testData.inputShapeA.length - 2] *
+          testData.inputShapeB[testData.inputShapeB.length - 1]);
+      expect(result).to.deep.equal(expectedOutput);
+    });
+  }
+});
+interface TestData {
+  elementCountA: number;
+  elementCountB: number;
+  inputShapeA: number[];
+  inputShapeB: number[];
+  outputShape: number[];
+  inputTextureShapeA: number[];
+  inputTextureShapeB: number[];
+  outputTextureShape: number[];
+  expectedOutput: Float32Array;
+  // The value of the bias matrix, which is broadcast to the shape of the matmul result.
+  // e.g. if biasValue = 1, the bias matrix is [1]; when added to a 2x2 matmul result it is broadcast to
+  // [1, 1]
+  // [1, 1]
+  biasValue?: number;
+  // If empty, the test will use auto-generated data.
+  rawInputA?: Float32Array;
+  // If empty, the test will use auto-generated data.
+  rawInputB?: Float32Array;
+}
+function getTestData(): TestData[] {
+  return [
+    // test 2D tensor
+    {
+      elementCountA: 4,
+      elementCountB: 4,
+      inputShapeA: [2, 2],
+      inputShapeB: [2, 2],
+      outputShape: [2, 2],
+      inputTextureShapeA: [1, 1],
+      inputTextureShapeB: [1, 1],
+      outputTextureShape: [1, 1],
+      expectedOutput: new Float32Array([7, 10, 15, 22]),
+    },
+    {
+      elementCountA: 4,
+      elementCountB: 4,
+      inputShapeA: [2, 2],
+      inputShapeB: [2, 2],
+      outputShape: [2, 2],
+      inputTextureShapeA: [1, 1],
+      inputTextureShapeB: [1, 1],
+      outputTextureShape: [1, 1],
+      biasValue: 1,
+      expectedOutput: new Float32Array([8, 11, 16, 23]),
+    },
+    {
+      elementCountA: 6,
+      elementCountB: 6,
+      inputShapeA: [2, 3],
+      inputShapeB: [3, 2],
+      outputShape: [2, 2],
+      inputTextureShapeA: [2, 1],
+      inputTextureShapeB: [1, 2],
+      outputTextureShape: [1, 1],
+      expectedOutput: new Float32Array([22, 28, 49, 64]),
+      rawInputA: new Float32Array([1, 2, 4, 5, 3, 0, 6, 0]),
+      rawInputB: new Float32Array([1, 2, 3, 4, 5, 6, 0, 0]),
+    },
+    {
+      elementCountA: 6,
+      elementCountB: 6,
+      inputShapeA: [2, 3],
+      inputShapeB: [3, 2],
+      outputShape: [2, 2],
+      inputTextureShapeA: [2, 1],
+      inputTextureShapeB: [1, 2],
+      outputTextureShape: [1, 1],
+      expectedOutput: new Float32Array([23, 29, 50, 65]),
+      biasValue: 1,
+      rawInputA: new Float32Array([1, 2, 4, 5, 3, 0, 6, 0]),
+      rawInputB: new Float32Array([1, 2, 3, 4, 5, 6, 0, 0]),
+    },
+    {
+      elementCountA: 12,
+      elementCountB: 12,
+      inputShapeA: [2, 2, 3],
+      inputShapeB: [2, 3, 2],
+      outputShape: [2, 2, 2],
+      inputTextureShapeA: [2, 2],
+      inputTextureShapeB: [1, 4],
+      outputTextureShape: [2, 1],
+      expectedOutput: new Float32Array([23, 29, 50, 65, 23, 29, 50, 65]),
+      biasValue: 1,
+      rawInputA: new Float32Array([1, 2, 4, 5, 3, 0, 6, 0, 1, 2, 4, 5, 3, 0, 6, 0]),
+      rawInputB: new Float32Array([1, 2, 3, 4, 5, 6, 0, 0, 1, 2, 3, 4, 5, 6, 0, 0]),
+    },
+    // test bcast
+    {
+      elementCountA: 12,
+      elementCountB: 6,
+      inputShapeA: [2, 2, 3],
+      inputShapeB: [3, 2],
+      outputShape: [2, 2, 2],
+      inputTextureShapeA: [2, 2],
+      inputTextureShapeB: [1, 2],
+      outputTextureShape: [2, 1],
+      expectedOutput: new Float32Array([23, 29, 50, 65, 23, 29, 50, 65]),
+      biasValue: 1,
+      rawInputA: new Float32Array([1, 2, 4, 5, 3, 0, 6, 0, 1, 2, 4, 5, 3, 0, 6, 0]),
+      rawInputB: new Float32Array([1, 2, 3, 4, 5, 6, 0, 0]),
+    },
+  ];
+}
diff --git a/test/unittests/index.ts b/test/unittests/index.ts
index 13a5575c..3c26b870 100644
--- a/test/unittests/index.ts
+++ b/test/unittests/index.ts
@@ -10,6 +10,7 @@ if (typeof window !== 'undefined' && !onnx.backend.webgl.disabled) {
   require('./backends/webgl/test_concat_packed');
   require('./backends/webgl/test_depth_to_space');
   require('./backends/webgl/test_reshape_packed');
+  require('./backends/webgl/test_matmul_packed');
 }
 
 // require('./api/onnx');