From 48b695b3cd9bf4908a4d1af97ff285c8753b16b5 Mon Sep 17 00:00:00 2001
From: Nicholas Gates <nick@nickgates.com>
Date: Thu, 29 Feb 2024 13:47:14 +0000
Subject: [PATCH] Add GitHub Actions

---
 .github/workflows/ci.yml | 30 ++++++++++++++++++++++++++++++
 README.md                |  9 +++------
 2 files changed, 33 insertions(+), 6 deletions(-)
 create mode 100644 .github/workflows/ci.yml

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..c767c8c
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,30 @@
+name: CI  # workflow display name
+
+on:  # events that start this workflow
+  push:
+    branches: [ "develop" ]  # only pushes to the develop branch
+  pull_request:
+    branches: [ "develop" ]  # only pull requests whose base branch is develop
+  workflow_dispatch: { }  # also allow manual runs; no inputs are declared
+
+permissions:
+  contents: read  # grant the workflow token read access to repository contents
+
+jobs:
+  build:  # job id; the single job in this workflow
+    name: 'test'  # display name for the job (differs from the job id)
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4  # check out the repository so later steps can see the sources
+
+      - name: Zig Setup
+        uses: goto-bus-stop/setup-zig@v2  # third-party action that installs the Zig toolchain
+        with:
+          version: "0.12.0-dev.2541+894493549"  # pinned Zig development build; quoted so it stays a string
+
+      - name: Zig Lint - Fmt
+        run: zig fmt --check .  # --check only reports formatting problems; it does not rewrite files
+
+      - name: Zig Build
+        shell: bash
+        run: zig build test  # despite the step name, this invokes the `test` step of the build
diff --git a/README.md b/README.md
index 6a4fa9d..7762ba4 100644
--- a/README.md
+++ b/README.md
@@ -46,19 +46,19 @@ With this order it is possible to make adjacent cells appear in adjacent SIMD wo
 register is. For example, a vector of u32s will be rearranged and then iterated such that adjacent words look like this:
 
 ```
-u32 with 64-bit SIMD
+2xu32 64-bit SIMD
 { 0, 64 }
 { 1, 65 }
 { 2, 66 }
 ...
 
-u32 with 128-bit SIMD
+4xu32 128-bit SIMD
 { 0, 64, 128, 192 }
 { 1, 65, 129, 193 }
 { 2, 66, 130, 194 }
 ...
 
-u32 with 512-bit SIMD
+16xu32 512-bit SIMD
 { 0, 64, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960 }
 { 1, 65, 129, 193, 257, 321, 385, 449, 513, 577, 641, 705, 769, 833, 897, 961 }
 { 2, 66, 130, 194, 258, 322, 386, 450, 514, 578, 642, 706, 770, 834, 898, 962 }
@@ -177,6 +177,3 @@ As with all benchmarks, take the results with a pinch of salt.
 > I found the performance of benchmarks varies greatly depending on whether the inputs and outputs are stack allocated or
   heap allocated. I was surprised to find that often heap allocation was significantly faster than stack allocation.
   If anyone happens to know why, please do let me know!
-
-The following plot shows the performance vs the original FastLanes repository for all bit unpacking kernels on an M2 Mac:
-