experiments/dot4U8packed.html

<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Wasm Dot Product</title>

  <script type="module">
    import { perf } from 'https://cdn.jsdelivr.net/npm/@jsheaven/perf@1.0.4/+esm'
    import { generateSampleData } from "./lib/samples.mjs"
    import getWasmModule from "./.gen/dot_product.mjs";

    // 2 x 1024 float32 vectors with 1024 dimensions, seeded random.
    // The benchmark reads the input pairs from `sampleData.vectorsA[i]` and
    // `sampleData.vectorsB[i]`; the fixed seed keeps the inputs reproducible across runs.
    const sampleData = generateSampleData(31337 /* seed */)

    // Cached WASM module instance. It stays undefined until the first
    // runBenchmark() call awaits getWasmModule(); later runs reuse the same instance.
    let Module;

    /**
     * Tells whether `value` is an integer greater than zero.
     *
     * @param {unknown} value - Candidate value (typically parsed from a form field).
     * @returns {boolean} `true` only for positive integers.
     */
    function isPositiveInteger(value) {
      return Number.isInteger(value) && value > 0;
    }

    /**
     * Reads the form field with the given `name` attribute and parses its value as JSON.
     *
     * @param {string} name - Value of the field's `name` attribute.
     * @returns {unknown} The parsed JSON value.
     * @throws {Error} When the field content is not valid JSON.
     */
    function readJsonInput(name) {
      const raw = document.querySelector(`[name=${name}]`).value;
      try {
        return JSON.parse(raw);
      } catch (err) {
        throw new Error(`"${name}" is not valid JSON: ${raw}`, { cause: err });
      }
    }

    /**
     * Runs the WASM dot-product benchmark for every dimensionality entered in the
     * form and renders the timings into the `#results` element.
     *
     * The WASM module is instantiated on the first call and reused afterwards.
     * Any failure (invalid input, WASM error, ...) is shown in `#results` instead
     * of leaving the "Benchmarking..." placeholder on screen, and is then rethrown
     * so the returned promise still rejects.
     *
     * @returns {Promise<void>} Resolves once the results have been rendered.
     * @throws {TypeError} When "dimensions" is not an array of positive integers
     *   or "iterations" is not a positive integer.
     */
    async function runBenchmark() {
      const resultsEl = document.getElementById('results');
      resultsEl.innerHTML = `<br /><b>Benchmarking... (might take a few secs.)</b>`;

      try {
        if (!Module) {
          Module = await getWasmModule();
        }

        // using SIMD and threads as both have > 90% coverage, even in browsers (browsers should use WebGL)
        // usually, this implementation should be chosen for when running in a JS server-side environment,
        // such as V8 in Node.js https://v8.dev/features/simd

        // Get the exported dotProduct function
        const dotProduct = Module._dotProduct;

        const dimensions = readJsonInput('dimensions');
        const iterations = readJsonInput('iterations');

        // Validating up front also guarantees that only integers are interpolated
        // into the results markup below.
        if (!Array.isArray(dimensions) || !dimensions.every(isPositiveInteger)) {
          throw new TypeError('"dimensions" must be a JSON array of positive integers');
        }
        if (!isPositiveInteger(iterations)) {
          throw new TypeError('"iterations" must be a positive integer');
        }

        // Start timestamps and finished durations are kept apart so an unfinished
        // run can never be reported as if its start timestamp were a duration.
        const startedAt = {};
        const elapsedMs = {};
        const results = [];

        await perf([{
            name: 'WASM',
            fn: async (dims, i) => {
              if (startedAt[dims] === undefined) {
                startedAt[dims] = performance.now();
              }
              const vectorA = sampleData.vectorsA[i].slice(0, dims);
              const vectorB = sampleData.vectorsB[i].slice(0, dims);

              let ptrA = 0;
              let ptrB = 0;
              try {
                // Allocate memory in the WASM heap and copy the input vectors
                ptrA = Module._malloc(vectorA.length * vectorA.BYTES_PER_ELEMENT);
                ptrB = Module._malloc(vectorB.length * vectorB.BYTES_PER_ELEMENT);

                // HEAPF32 is indexed in float32 elements, hence byte offset / element size
                Module.HEAPF32.set(vectorA, ptrA / vectorA.BYTES_PER_ELEMENT);
                Module.HEAPF32.set(vectorB, ptrB / vectorB.BYTES_PER_ELEMENT);

                // Call the dotProduct function
                results.push(dotProduct(ptrA, ptrB, dims));
              } finally {
                // Free the allocated memory, even when the copy or the call throws
                if (ptrA) {
                  Module._free(ptrA);
                }
                if (ptrB) {
                  Module._free(ptrB);
                }
              }

              if (i === iterations - 1) {
                elapsedMs[dims] = performance.now() - startedAt[dims];
              }
            },
          }],
          dimensions /* sizes (dimensionality) */,
          true /* warmup*/,
          iterations /* iterations */,
          30000 /* maxExecutionTime */,
          1 /* chunk size, one call at a time */
        );

        // Sanity check of the first result against a hard-coded reference value.
        // Compared with a tolerance because exact equality on floats is brittle.
        // NOTE(review): the reference presumably belongs to the default seed and
        // the default first dimension (4) - confirm.
        const expectedFirstResult = -0.018422827124595642;
        const tolerance = 1e-6;
        const testFailed = !(Math.abs(results[0] - expectedFirstResult) <= tolerance);
        console.log('testFailed', testFailed, 'result', results[0]);

        const timings = dimensions
          .map((d) => {
            // No duration is recorded when the last iteration was never reached.
            const took = elapsedMs[d] === undefined
              ? 'n/a (did not complete)'
              : `${elapsedMs[d].toFixed()} ms`;
            return `<b>${took}</b> for <b>${d} dimensions</b>`;
          })
          .join(", <br />");

        resultsEl.innerHTML = `
          <h2>Results:</h2>
          <h3>WebAssembly, using SIMD vector instruction set:</h3>
          <b>Runs:</b> <b>${iterations}</b> single dot product calculations / pairs of n-dimensional vectors<br />
          <b>Took:</b> <br />${timings}<br />`;
      } catch (err) {
        // textContent (not innerHTML): the message may echo raw user input.
        resultsEl.textContent = `Benchmark failed: ${err.message}`;
        throw err;
      }
    }
      // Declarations inside a module script are not globals, so runBenchmark is
      // attached to `window` to make it reachable from the button's inline onclick handler.
      window.runBenchmark = runBenchmark;
      
    </script>
</head>
<body>

  <h1>Fast Dot Product - WebAssembly using SIMD</h1>


  Iterations: <input name="iterations" value="20000" type="number" />
  Dimensions (JSON): <input name="dimensions" value="[4, 384, 1024]" type="text" />

  <button onclick="javascript:runBenchmark();">Run Benchmark</button>

  <br />
  <i>Note: This implementation currently suffers from invocation and memory-management overhead and from limited parallelism. Instead of one operation per call, n operations should be processed at once inside WASM.</i>

  <div id="results"></div>

  <h2>Implementation</h2>

  <h3>C, using WASM v128 instruction set (emscripten):</h3>
  <pre>
    #include &lt;stddef.h&gt;
    #include &lt;stdint.h&gt;
    #include &lt;wasm_simd128.h&gt;
    
    // limitation: aligns to min. 4 dimensions only
    float dotProduct(const float *a, const float *b, size_t dims) {
        v128_t sum = wasm_f32x4_splat(0.0f);
    
        // Process in chunks of 4
        for (size_t i = 0; i &lt; dims; i += 4) {
            v128_t vecA = wasm_v128_load(&amp;a[i]);
            v128_t vecB = wasm_v128_load(&amp;b[i]);
            v128_t product = wasm_f32x4_mul(vecA, vecB);
            sum = wasm_f32x4_add(sum, product);
        }
    
        // Extract the results from the SIMD register and sum them
        float result[4];
        wasm_v128_store(result, sum);
        return result[0] + result[1] + result[2] + result[3];
    }
  </pre>


  <h3>JS:</h3>
  <pre>
    // Allocate memory in the WASM heap
    const ptrA = Module._malloc(vectorA.length * vectorA.BYTES_PER_ELEMENT);
    const ptrB = Module._malloc(vectorB.length * vectorB.BYTES_PER_ELEMENT);

    // Copy the input Float32Arrays into the WASM heap
    Module.HEAPF32.set(vectorA, ptrA / vectorA.BYTES_PER_ELEMENT);
    Module.HEAPF32.set(vectorB, ptrB / vectorB.BYTES_PER_ELEMENT);

    // Call the dotProduct function (WASM module export)
    results.push(dotProduct(ptrA, ptrB, dims));

    // Free the allocated memory
    Module._free(ptrA);
    Module._free(ptrB);
  </pre>

</body>
</html>