-
Notifications
You must be signed in to change notification settings - Fork 0
/
dot4U8packed.html
153 lines (117 loc) · 5.33 KB
/
dot4U8packed.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Wasm Dot Product</title>
<script type="module">
import { perf } from 'https://cdn.jsdelivr.net/npm/@jsheaven/[email protected]/+esm'
import { generateSampleData } from "./lib/samples.mjs"
import getWasmModule from "./.gen/dot_product.mjs";
// 2 x 1024 float32 vectors with 1024 dimensions, seeded random
const sampleData = generateSampleData(31337 /* seed */)
// Cached WASM module instance: stays undefined until the first benchmark run
// instantiates it via getWasmModule(); later runs reuse the same instance.
let Module;
/**
 * Runs the WASM dot product benchmark and renders the timings into #results.
 *
 * Reads the iteration count and the JSON list of dimensionalities from the
 * form inputs, lazily instantiates the WASM module on first use, and issues
 * one `_dotProduct` call per sample vector pair through `perf`.
 *
 * @returns {Promise<void>} Resolves once the timings (or an input error
 *   message) have been written to the #results element.
 */
async function runBenchmark() {
  const resultsElement = document.getElementById('results');
  resultsElement.innerHTML = `<br /><b>Benchmarking... (might take a few secs.)</b>`;

  // Parse and validate the form input up front so a typo produces a visible
  // message instead of leaving the "Benchmarking..." text on screen forever.
  let dimensions;
  let iterations;
  try {
    dimensions = JSON.parse(document.querySelector('[name=dimensions]').value);
    iterations = JSON.parse(document.querySelector('[name=iterations]').value);
  } catch (err) {
    dimensions = undefined;
    iterations = undefined;
  }
  // The C kernel shown on this page consumes 4 floats per step, so a
  // dimensionality that is not a multiple of 4 would read past the buffers.
  // Restricting the values to integers also keeps arbitrary text out of the
  // innerHTML written below.
  const isValidInput =
    Number.isInteger(iterations) &&
    iterations > 0 &&
    Array.isArray(dimensions) &&
    dimensions.length > 0 &&
    dimensions.every((d) => Number.isInteger(d) && d > 0 && d % 4 === 0);
  if (!isValidInput) {
    resultsElement.innerHTML = `<br /><b>Invalid input: iterations must be a positive integer and dimensions a JSON array of positive multiples of 4.</b>`;
    return;
  }

  if (!Module) {
    Module = await getWasmModule();
  }

  // using SIMD and threads as both have > 90% coverage, even in browsers (browsers should use WebGL)
  // usually, this implementation should be chosen for when running in a JS server-side environment,
  // such as V8 in Node.js https://v8.dev/features/simd

  // Get the exported dotProduct function
  const dotProduct = Module._dotProduct;

  // Start timestamps and final durations are kept apart so that a pass which
  // never reaches its last iteration is reported as "n/a" rather than as a
  // raw timestamp.
  const startedAt = new Map();
  const elapsed = new Map();
  const results = [];

  await perf([{
    name: 'WASM',
    fn: async (dims, i) => {
      if (!startedAt.has(dims)) {
        startedAt.set(dims, performance.now());
      }

      const vectorA = sampleData.vectorsA[i].slice(0, dims);
      const vectorB = sampleData.vectorsB[i].slice(0, dims);

      // Allocate memory in the WASM heap and copy the input vectors
      const ptrA = Module._malloc(vectorA.length * vectorA.BYTES_PER_ELEMENT);
      const ptrB = Module._malloc(vectorB.length * vectorB.BYTES_PER_ELEMENT);
      try {
        Module.HEAPF32.set(vectorA, ptrA / vectorA.BYTES_PER_ELEMENT);
        Module.HEAPF32.set(vectorB, ptrB / vectorB.BYTES_PER_ELEMENT);

        // Call the dotProduct function
        results.push(dotProduct(ptrA, ptrB, dims));
      } finally {
        // Free the allocated memory even if the copy or the call threw,
        // otherwise every failed iteration leaks two heap buffers.
        Module._free(ptrA);
        Module._free(ptrB);
      }

      if (i === iterations - 1) {
        elapsed.set(dims, performance.now() - startedAt.get(dims));
      }
    },
  }],
  dimensions /* sizes (dimensionality) */,
  true /* warmup*/,
  iterations /* iterations */,
  30000 /* maxExecutionTime */,
  1 /* chunk size, one call at a time */
  );

  // NOTE(review): the reference value presumably belongs to the first sample
  // pair at the default first dimensionality (4) - confirm before relying on
  // this check for other inputs.
  const testFailed = results[0] !== -0.018422827124595642;
  console.log('testFailed', testFailed, 'result', results[0]);

  const timings = dimensions
    .map((d) => {
      const duration = elapsed.get(d);
      const took = duration === undefined ? 'n/a' : `${duration.toFixed()} ms`;
      return `<b>${took}</b> for <b>${d} dimensions</b>`;
    })
    .join(", <br />");

  resultsElement.innerHTML = `
<h2>Results:</h2>
<h3>WebAssembly, using SIMD vector instruction set:</h3>
<b>Runs:</b> <b>${iterations}</b> single dot product calculations / pairs of n-dimensional vectors<br />
<b>Took:</b> <br />${timings}<br />`;
}
// Bindings declared in a module script are not global, but the button's inline
// onclick handler resolves runBenchmark globally, so expose it on window.
window.runBenchmark = runBenchmark;
</script>
</head>
<body>
<h1>Fast Dot Product - WebAssembly using SIMD</h1>
Iterations: <input name="iterations" value="20000" type="number" />
Dimensions (JSON): <input name="dimensions" value="[4, 384, 1024]" type="text" />
<button onclick="javascript:runBenchmark();">Run Benchmark</button>
<br />
<i>Notes: This implementation currently suffers from invocation/memory-management overhead and limited parallelism. Instead of single operations, n operations should be processed at once in WASM.</i>
<div id="results"></div>
<h2>Implementation</h2>
<h3>C, using WASM v128 instruction set (emscripten):</h3>
<pre>
#include &lt;stddef.h&gt;
#include &lt;stdint.h&gt;
#include &lt;wasm_simd128.h&gt;
// Dot product of two float vectors using 128-bit WASM SIMD (4 float lanes per step).
// Limitation: elements are always consumed 4 at a time, so dims is expected to be a
// multiple of 4; otherwise the final iteration loads elements beyond index dims - 1.
float dotProduct(const float *a, const float *b, size_t dims) {
// Four running partial sums, one per lane, all starting at zero
v128_t sum = wasm_f32x4_splat(0.0f);
// Process in chunks of 4
for (size_t i = 0; i < dims; i += 4) {
v128_t vecA = wasm_v128_load(&a[i]);
v128_t vecB = wasm_v128_load(&b[i]);
// Lane-wise multiply, then accumulate into the per-lane sums
v128_t product = wasm_f32x4_mul(vecA, vecB);
sum = wasm_f32x4_add(sum, product);
}
// Extract the results from the SIMD register and sum them
float result[4];
wasm_v128_store(result, sum);
return result[0] + result[1] + result[2] + result[3];
}
</pre>
<h3>JS:</h3>
<pre>
// Allocate memory in the WASM heap
const ptrA = Module._malloc(vectorA.length * vectorA.BYTES_PER_ELEMENT);
const ptrB = Module._malloc(vectorB.length * vectorB.BYTES_PER_ELEMENT);
// copy the Float32Array vectors into the WASM heap
Module.HEAPF32.set(vectorA, ptrA / vectorA.BYTES_PER_ELEMENT);
Module.HEAPF32.set(vectorB, ptrB / vectorB.BYTES_PER_ELEMENT);
// Call the dotProduct function (WASM module export)
results.push(dotProduct(ptrA, ptrB, dims));
// Free the allocated memory
Module._free(ptrA);
Module._free(ptrB);
</pre>
</body>
</html>