From 00181883c49ad836b015a608f2974bec0f1fd271 Mon Sep 17 00:00:00 2001
From: Claude <github@claude.nl>
Date: Sun, 3 Dec 2023 18:47:45 +0100
Subject: [PATCH] using webcodecs for decoding

---
 .github/workflows/deploy.yml |   2 -
 Makefile                     |   7 +-
 package-lock.json            |  73 +++++++++++++
 package.json                 |   2 +
 src/infer/tfjs.ts            |  24 ++--
 src/lib/video.ts             | 205 +++++++++++++++++++++++++----------
 6 files changed, 243 insertions(+), 70 deletions(-)

diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
index e0237a3..094a575 100644
--- a/.github/workflows/deploy.yml
+++ b/.github/workflows/deploy.yml
@@ -28,8 +28,6 @@ jobs:
         # However because of the hash, cache only gets restored if the Dockerfile
         # is unchanged.
         run: make -t public/app/bundled/libavjs/version.txt || true
-      - name: install packages
-        run: npm install .
       - name: Build
         run: DOCKER="docker buildx" DOCKER_TMPDIR=/tmp make all
       - name: Set AWS credentials
diff --git a/Makefile b/Makefile
index a4a29ee..1830234 100644
--- a/Makefile
+++ b/Makefile
@@ -38,7 +38,12 @@ $(LIBAVJS_TARGET_FILES): libav.js/Dockerfile libav.js/commit.txt
 	@cp -R $(OUTDIR)/dist public/app/bundled/libavjs-$(LIBAVJS_COMMIT)
 	@rm -r "$(OUTDIR)"
 
-public/app/tsc: tsconfig.json $(shell find src) public/app/bundled/libavjs-$(LIBAVJS_COMMIT)/version.txt
+node_modules/tag: package.json package-lock.json
+	@npm install --no-save .
+	@cd node_modules/libavjs-webcodecs-bridge && make all
+	@touch $@
+
+public/app/tsc: tsconfig.json $(shell find src) public/app/bundled/libavjs-$(LIBAVJS_COMMIT)/version.txt node_modules/tag
 	@tsc --noEmit
 	@./node_modules/esbuild/bin/esbuild $(ENTRYPOINTS) --sourcemap --bundle --format=esm --outbase=src --outdir=public/app/
 	@(cd public $(foreach ext,js css,$(foreach outfilebase,$(OUTFILESBASE),&& MD5=$$(md5sum "$(outfilebase).$(ext)" | cut -c-10) && mv "$(outfilebase).$(ext)" "$(outfilebase).$${MD5}.$(ext)" && echo "s|$(outfilebase).$(ext)|$(outfilebase).$${MD5}.$(ext)|g"))) > $@
diff --git a/package-lock.json b/package-lock.json
index d39e70d..1b3dddb 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -8,9 +8,11 @@
         "@tensorflow/tfjs": "^4.12.0",
         "@tensorflow/tfjs-backend-wasm": "^4.12.0",
         "@tensorflow/tfjs-backend-webgpu": "^4.12.0",
+        "@types/dom-webcodecs": "^0.1.11",
         "@types/react": "^18.2.38",
         "@types/wicg-file-system-access": "^2023.10.4",
         "esbuild": "^0.19.5",
+        "libavjs-webcodecs-bridge": "github:Yahweasel/libavjs-webcodecs-bridge#98fb137f2b06029e376dc252c31d2f0b540a5919",
         "preact": "^10.19.2"
       }
     },
@@ -472,6 +474,11 @@
         "@tensorflow/tfjs-core": "4.12.0"
       }
     },
+    "node_modules/@types/dom-webcodecs": {
+      "version": "0.1.11",
+      "resolved": "https://registry.npmjs.org/@types/dom-webcodecs/-/dom-webcodecs-0.1.11.tgz",
+      "integrity": "sha512-yPEZ3z7EohrmOxbk/QTAa0yonMFkNkjnVXqbGb7D4rMr+F1dGQ8ZUFxXkyLLJuiICPejZ0AZE9Rrk9wUCczx4A=="
+    },
     "node_modules/@types/emscripten": {
       "version": "0.0.34",
       "resolved": "https://registry.npmjs.org/@types/emscripten/-/emscripten-0.0.34.tgz",
@@ -534,6 +541,11 @@
       "resolved": "https://registry.npmjs.org/@types/wicg-file-system-access/-/wicg-file-system-access-2023.10.4.tgz",
       "integrity": "sha512-ewOj7hWhsUTS2+aY6zY+7BwlgqGBj5ZXxKuHt3TAWpIJH0bDW/6bO1N1SdUDAzV8r0Nc+/ZtpAEETYTwrehBMw=="
     },
+    "node_modules/@ungap/global-this": {
+      "version": "0.4.4",
+      "resolved": "https://registry.npmjs.org/@ungap/global-this/-/global-this-0.4.4.tgz",
+      "integrity": "sha512-mHkm6FvepJECMNthFuIgpAEFmPOk71UyXuIxYfjytvFTnSDBIz7jmViO+LfHI/AjrazWije0PnSP3+/NlwzqtA=="
+    },
     "node_modules/@webgpu/types": {
       "version": "0.1.30",
       "resolved": "https://registry.npmjs.org/@webgpu/types/-/types-0.1.30.tgz",
@@ -698,6 +710,27 @@
         "node": ">=6"
       }
     },
+    "node_modules/fast-xml-parser": {
+      "version": "4.3.2",
+      "resolved": "https://registry.npmjs.org/fast-xml-parser/-/fast-xml-parser-4.3.2.tgz",
+      "integrity": "sha512-rmrXUXwbJedoXkStenj1kkljNF7ugn5ZjR9FJcwmCfcCbtOMDghPajbc+Tck6vE6F5XsDmx+Pr2le9fw8+pXBg==",
+      "funding": [
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/NaturalIntelligence"
+        },
+        {
+          "type": "paypal",
+          "url": "https://paypal.me/naturalintelligence"
+        }
+      ],
+      "dependencies": {
+        "strnum": "^1.0.5"
+      },
+      "bin": {
+        "fxparser": "src/cli/cli.js"
+      }
+    },
     "node_modules/form-data": {
       "version": "4.0.0",
       "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz",
@@ -735,6 +768,41 @@
         "node": ">=8"
       }
     },
+    "node_modules/libav.js": {
+      "version": "4.6.6",
+      "resolved": "https://registry.npmjs.org/libav.js/-/libav.js-4.6.6.tgz",
+      "integrity": "sha512-p+rgPnn8ZSRYE9s2z3sn0oeQAq1CvNl21ZML8ZSvgtLCs3EzaQNFWAczfoxNjHMPzssnvu3o+fiEIhcZIlpunQ==",
+      "dependencies": {
+        "fast-xml-parser": "^4.2.2"
+      }
+    },
+    "node_modules/libavjs-webcodecs-bridge": {
+      "version": "0.0.6",
+      "resolved": "git+ssh://git@github.com/Yahweasel/libavjs-webcodecs-bridge.git#98fb137f2b06029e376dc252c31d2f0b540a5919",
+      "integrity": "sha512-oMgMq/sCSsZmJ5es44Yj+vs7fiyjv5Oo/iQ7DKASnOZPVuZHkejUY9LmOMfglzmhCA3p8VvT6Vah26iSaSM0OQ==",
+      "license": "ISC",
+      "dependencies": {
+        "libav.js": "^4.4.6",
+        "libavjs-webcodecs-polyfill": "^0.4.2"
+      }
+    },
+    "node_modules/libavjs-webcodecs-polyfill": {
+      "version": "0.4.3",
+      "resolved": "https://registry.npmjs.org/libavjs-webcodecs-polyfill/-/libavjs-webcodecs-polyfill-0.4.3.tgz",
+      "integrity": "sha512-XyykrwB5L0hqvCLEPz+4bwG1uZ3MugkkNeJd94B1MscyWTlW9VQzjxXsh5ihgjVzUJGabBSag2QjMgpkCQzc3g==",
+      "dependencies": {
+        "@ungap/global-this": "^0.4.4",
+        "libav.js": "=4.3.6"
+      }
+    },
+    "node_modules/libavjs-webcodecs-polyfill/node_modules/libav.js": {
+      "version": "4.3.6",
+      "resolved": "https://registry.npmjs.org/libav.js/-/libav.js-4.3.6.tgz",
+      "integrity": "sha512-xyUh1ZgltbJ241BSgR52LqL/d6BCFoPJAcf97qjGe3l38OMSfze/VLMlh2ijBN2oNcjZvRj1AUdsJugLdMhc7Q==",
+      "dependencies": {
+        "fast-xml-parser": "^4.2.2"
+      }
+    },
     "node_modules/long": {
       "version": "4.0.0",
       "resolved": "https://registry.npmjs.org/long/-/long-4.0.0.tgz",
@@ -861,6 +929,11 @@
         "node": ">=8"
       }
     },
+    "node_modules/strnum": {
+      "version": "1.0.5",
+      "resolved": "https://registry.npmjs.org/strnum/-/strnum-1.0.5.tgz",
+      "integrity": "sha512-J8bbNyKKXl5qYcR36TIO8W3mVGVHrmmxsd5PAItGkmyzwJvybiw2IVq5nqd0i4LSNSkB/sx9VHllbfFdr9k1JA=="
+    },
     "node_modules/supports-color": {
       "version": "7.2.0",
       "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
diff --git a/package.json b/package.json
index 2c99d25..c4884a2 100644
--- a/package.json
+++ b/package.json
@@ -3,9 +3,11 @@
     "@tensorflow/tfjs": "^4.12.0",
     "@tensorflow/tfjs-backend-wasm": "^4.12.0",
     "@tensorflow/tfjs-backend-webgpu": "^4.12.0",
+    "@types/dom-webcodecs": "^0.1.11",
     "@types/react": "^18.2.38",
     "@types/wicg-file-system-access": "^2023.10.4",
     "esbuild": "^0.19.5",
+    "libavjs-webcodecs-bridge": "github:Yahweasel/libavjs-webcodecs-bridge#98fb137f2b06029e376dc252c31d2f0b540a5919",
     "preact": "^10.19.2"
   }
 }
diff --git a/src/infer/tfjs.ts b/src/infer/tfjs.ts
index 5f9c206..1ca7159 100644
--- a/src/infer/tfjs.ts
+++ b/src/infer/tfjs.ts
@@ -58,10 +58,10 @@ export async function convert(
       + `# all coordinates are on frame where left-top = (0, 0) and right-bottom is (1, 1)\n`
   ))
   const MODEL_DIMENSION = 640
-  for await (const imageData of getFrames(file, MODEL_DIMENSION, MODEL_DIMENSION)) {
-    const [boxes, scores, classes] = await infer(model, yoloVersion, imageData)
+  for await (const videoFrame of getFrames(file)) {
+    const [boxes, scores, classes] = await infer(model, yoloVersion, videoFrame)
     if (ctx) {
-      ctx.putImageData(imageData, 0, 0)
+      ctx.drawImage(videoFrame, 0, 0)
       ctx.strokeStyle = "red"
       ctx.lineWidth = 5;
     }
@@ -77,26 +77,28 @@ export async function convert(
       await outputstream.write(textEncoder.encode(line))
       const [cx, cy, w, h] = box
       if (ctx) {
+        const scale = Math.max(videoFrame.displayWidth, videoFrame.displayHeight)
         ctx.strokeRect(
-          (cx - w / 2) * Math.max(imageData.width, imageData.height),
-          (cy - h / 2) * Math.max(imageData.width, imageData.height),
-          w * Math.max(imageData.width, imageData.height),
-          h * Math.max(imageData.width, imageData.height),
+          (cx - w / 2) * scale, (cy - h / 2) * scale, w * scale, h * scale,
         )
       }
     }
+    videoFrame.close()
     onProgress({"converting": Math.min(framenr / numberOfFrames, 1)})
     framenr++
   }
 }
 
 export function preprocess(
-  imageData: ImageData,
+  videoFrame: VideoFrame,
   modelWidth: number,
   modelHeight: number
 ): [tf.Tensor<tf.Rank>, number, number] {
+  const offScreenCanvas = new OffscreenCanvas(videoFrame.displayWidth, videoFrame.displayHeight)
+  const ctx = offScreenCanvas.getContext("2d")!
+  ctx.drawImage(videoFrame, 0, 0)
 
-  const img = tf.browser.fromPixels(imageData);
+  const img = tf.browser.fromPixels(offScreenCanvas.transferToImageBitmap())
 
   const [h, w] = img.shape.slice(0, 2); // get source width and height
   const maxSize = Math.max(w, h); // get max size
@@ -151,9 +153,9 @@ function getBoxesAndScoresAndClassesFromResult(
 export async function infer(
   model: Model,
   yoloVersion: YoloVersion,
-  imageData: ImageData,
+  videoFrame: VideoFrame,
 ): Promise<[Float32Array, Float32Array, Float32Array]> {
-  const [img_tensor, xRatio, yRatio] = tf.tidy(() => preprocess(imageData, 640, 640))
+  const [img_tensor, xRatio, yRatio] = tf.tidy(() => preprocess(videoFrame, 640, 640))
   if (yoloVersion === "v5") {
     const res = await  model.executeAsync(img_tensor)
     const [boxes, scores, classes] = (res as tf.Tensor<tf.Rank>[]).slice(0, 3)
diff --git a/src/lib/video.ts b/src/lib/video.ts
index 1715016..2edddb7 100644
--- a/src/lib/video.ts
+++ b/src/lib/video.ts
@@ -1,4 +1,5 @@
 import type * as LibAVTypes from "../../public/app/bundled/libavjs/dist/libav.types";
+import * as LibAVWebcodecsBridge from "libavjs-webcodecs-bridge"
 declare global {
   interface Window {
     LibAV: LibAVTypes.LibAVWrapper;
@@ -37,7 +38,6 @@ export async function getNumberOfFrames(input: File): Promise<number> {
     }
     libav.unlink(input.name);
     libav.unlink(FFPROBEOUTPUT);
-    // should we destroy libavjs? // TODO
     const outputjson = new TextDecoder("utf-8").decode(writtenData);
     try {
       const videostreams = JSON.parse(outputjson).streams.filter(
@@ -68,15 +68,143 @@ export async function getNumberOfFrames(input: File): Promise<number> {
   }
 }
 
+/** Promise plus its settle callbacks; stand-in for Promise.withResolvers(). */
+function promiseWithResolve<T>(): {
+  promise: Promise<T>,
+  resolve: (value: T) => void,
+  reject: (error: any) => void
+} {
+  // The executor runs synchronously, so both callbacks are assigned before we return; `!` tells tsc.
+  let resolve!: (value: T) => void
+  let reject!: (error: any) => void
+  const promise = new Promise<T>((res, rej) => {
+    resolve = res
+    reject = rej
+  })
+  return {promise, resolve, reject}
+}
+
+/**
+  * Encodes one blank frame with an encoder configured like `decoderConfig` and resolves to the
+  * resulting chunk, which the caller feeds to the decoder as a throw-away first key frame. See
+  * https://github.com/Yahweasel/libavjs-webcodecs-bridge/issues/3#issuecomment-1837189047
+  */
+async function createFakeKeyFrameChunk(
+  decoderConfig: VideoDecoderConfig
+): Promise<EncodedVideoChunk> {
+  const {promise, resolve, reject} = promiseWithResolve<EncodedVideoChunk>()
+  // width/height are required by the encoder; per the original author's tests they need
+  // not match the real video dimensions, so arbitrary values are used
+  const encoderConfig = {...decoderConfig, width: 640, height: 360} as VideoEncoderConfig
+  encoderConfig.avc = {format: decoderConfig.description ? "avc" : "annexb"}
+  const videoEncoder = new VideoEncoder({
+    output: (chunk, _metadata) => resolve(chunk),
+    error: e => reject(e)
+  })
+  try {
+    videoEncoder.configure(encoderConfig)
+    const oscanvas = new OffscreenCanvas(encoderConfig.width, encoderConfig.height)
+    // getting context seems to be minimal needed before it can be used as VideoFrame source
+    oscanvas.getContext("2d")
+    const videoFrame = new VideoFrame(oscanvas, {timestamp: Number.MIN_SAFE_INTEGER})
+    try {
+      videoEncoder.encode(videoFrame)
+      // awaiting both together means an encoder error cannot leave `promise` rejected
+      // without a handler (flush() is expected to reject as well in that case)
+      const [, chunk] = await Promise.all([videoEncoder.flush(), promise])
+      return chunk
+    } finally {
+      videoFrame.close()
+    }
+  } finally {
+    // an encoder error already closes the codec; close() would then throw and mask that error
+    if (videoEncoder.state !== "closed") videoEncoder.close()
+  }
+}
+
+/** Queues frames decoded by a WebCodecs VideoDecoder and hands them out one at a time. */
+class VideoDecoderWrapper {
+  private frames: VideoFrame[] = []
+  private nextFrameNumber: number
+  private nextIsDummyFrame = true
+  private end_of_stream = false
+  private decoderError: DOMException | null = null
+  private videoDecoder: VideoDecoder
+
+  private constructor(
+    startFrameNumber: number,
+    videoDecoderConfig: VideoDecoderConfig,
+    private getMoreEncodedChunks: () => Promise<{chunks: EncodedVideoChunk[], end_of_stream: boolean}>,
+  ) {
+    this.nextFrameNumber = startFrameNumber
+    this.videoDecoder = new VideoDecoder({
+      output: this.addFrames.bind(this),
+      // remember the error so getNextFrame() can rethrow it instead of only logging it
+      error: error => {
+        console.log("Video decoder error", {error})
+        this.decoderError = error
+      }
+    })
+    this.videoDecoder.configure(videoDecoderConfig)
+  }
+
+  /** Creates a wrapper and primes its decoder with a fake key frame (dropped by getNextFrame). */
+  public static async getVideoDecoderWrapper(
+    startFrameNumber: number,
+    videoDecoderConfig: VideoDecoderConfig,
+    getMoreEncodedChunks: () => Promise<{chunks: EncodedVideoChunk[], end_of_stream: boolean}>,
+  ): Promise<VideoDecoderWrapper> {
+    const videoDecoderWrapper = new VideoDecoderWrapper(startFrameNumber, videoDecoderConfig, getMoreEncodedChunks)
+    try {
+      videoDecoderWrapper.videoDecoder.decode(await createFakeKeyFrameChunk(videoDecoderConfig))
+    } catch (e) {
+      // do not leak the decoder when priming fails
+      if (videoDecoderWrapper.videoDecoder.state !== "closed") videoDecoderWrapper.videoDecoder.close()
+      throw e
+    }
+    return videoDecoderWrapper
+  }
+
+  public addFrames(videoFrame: VideoFrame) {
+    this.frames.push(videoFrame)
+  }
+
+  public availableFrames(): number {
+    return this.frames.length
+  }
+
+  /** Resolves to the next decoded frame (caller must close() it), or null after the last one. */
+  public async getNextFrame(): Promise<VideoFrame | null> {
+    while (this.availableFrames() || !this.end_of_stream) {
+      const frame = this.frames.shift()
+      if (frame !== undefined) {
+        if (!this.nextIsDummyFrame) return frame
+        frame.close() // the first output is the decoded fake key frame: drop it
+        this.nextIsDummyFrame = false
+        continue
+      }
+      if (this.decoderError) throw this.decoderError
+      const {chunks, end_of_stream} = await this.getMoreEncodedChunks()
+      chunks.forEach(chunk => this.videoDecoder.decode(chunk))
+      if (end_of_stream) {
+        await this.videoDecoder.flush()
+        this.videoDecoder.close()
+        this.end_of_stream = true
+      }
+      // make sure there is time to run async code (probably not necessary but doesn't hurt)
+      await new Promise(resolve => window.setTimeout(resolve, 0))
+    }
+    return null
+  }
+}
+
+/**
+  * Gets video frames from a file.
+  * Make sure to call frame.close() when done with a frame
+  */
 export async function* getFrames(
   input: File,
-  width: number,
-  height: number
-): AsyncGenerator<ImageData, void, void> {
-  if (!Number.isInteger(width) || !Number.isInteger(height)) {
-    throw new Error("Not ints");
-  }
-  let scale_ctx: null | number = null;
+): AsyncGenerator<VideoFrame, void, void> {
   const libav = await window.LibAV.LibAV({ noworker: false, nothreads: true });
   await libav.av_log_set_level(libav.AV_LOG_ERROR);
   await libav.mkreadaheadfile(input.name, input);
@@ -91,63 +219,28 @@ export async function* getFrames(
       );
     }
     const [stream] = video_streams;
-    const [, c, pkt, frameptr] = await libav.ff_init_decoder(
-      stream.codec_id,
-      stream.codecpar
-    );
-    while (true) {
+    const pkt = await libav.av_packet_alloc()
+    const decoderConfig = (await LibAVWebcodecsBridge.videoStreamToConfig(libav, stream)) as VideoDecoderConfig
+    decoderConfig.hardwareAcceleration = "prefer-software"
+    const videoDecoderWrapper = await VideoDecoderWrapper.getVideoDecoderWrapper(0, decoderConfig, async () => {
       const [result, packets] = await libav.ff_read_multi(
         fmt_ctx,
         pkt,
         undefined,
-        { limit: 100 * 1024, copyoutPacket: "ptr" }
+        { limit: 100 * 1024}
       );
       const end_of_stream = result === libav.AVERROR_EOF;
-      const framePointers = await libav.ff_decode_multi(
-        c,
-        pkt,
-        frameptr,
-        packets[stream.index],
-        { fin: end_of_stream, copyoutFrame: "ptr" }
-      )
+      const chunks = packets[stream.index].map(p => LibAVWebcodecsBridge.packetToEncodedVideoChunk(p, stream) as EncodedVideoChunk)
+      return {chunks, end_of_stream}
+    })
 
-      for (const fp of framePointers) {
-        if (scale_ctx === null) {
-          const frameWidth = await libav.AVFrame_width(fp);
-          const frameHeight = await libav.AVFrame_height(fp);
-          const frameFormat = await libav.AVFrame_format(fp);
-          const scaleFactor = Math.min(
-            1,
-            width / frameWidth,
-            height / frameHeight
-          );
-          const targetWidth = Math.round(frameWidth * scaleFactor);
-          const targetHeight = Math.round(frameHeight * scaleFactor);
-
-          scale_ctx = await libav.sws_getContext(
-            frameWidth,
-            frameHeight,
-            frameFormat,
-            targetWidth,
-            targetHeight,
-            libav.AV_PIX_FMT_RGBA,
-            2,
-            0,
-            0,
-            0
-          );
-        }
-        await libav.sws_scale_frame(scale_ctx, frameptr, fp);
-        const imageData = await libav.ff_copyout_frame_video_imagedata(
-          frameptr
-        );
-        await libav.av_frame_unref(fp);
-        await libav.av_frame_unref(frameptr);
-        yield imageData;
-      }
-      if (end_of_stream) {
+    while (true) {
+      const frame = await videoDecoderWrapper.getNextFrame()
+      if (frame === null) {
+        console.log("done -- break")
         break;
       }
+      yield frame;
     }
   } finally {
     await libav.unlink(input.name);