From 00181883c49ad836b015a608f2974bec0f1fd271 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 3 Dec 2023 18:47:45 +0100 Subject: [PATCH] using webcodecs for decoding --- .github/workflows/deploy.yml | 2 - Makefile | 7 +- package-lock.json | 73 +++++++++++++ package.json | 2 + src/infer/tfjs.ts | 24 ++-- src/lib/video.ts | 205 +++++++++++++++++++++++++---------- 6 files changed, 243 insertions(+), 70 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index e0237a3..094a575 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -28,8 +28,6 @@ jobs: # However because of the hash, cache only gets restored if the Dockerfile # is unchanged. run: make -t public/app/bundled/libavjs/version.txt || true - - name: install packages - run: npm install . - name: Build run: DOCKER="docker buildx" DOCKER_TMPDIR=/tmp make all - name: Set AWS credentials diff --git a/Makefile b/Makefile index a4a29ee..1830234 100644 --- a/Makefile +++ b/Makefile @@ -38,7 +38,12 @@ $(LIBAVJS_TARGET_FILES): libav.js/Dockerfile libav.js/commit.txt @cp -R $(OUTDIR)/dist public/app/bundled/libavjs-$(LIBAVJS_COMMIT) @rm -r "$(OUTDIR)" -public/app/tsc: tsconfig.json $(shell find src) public/app/bundled/libavjs-$(LIBAVJS_COMMIT)/version.txt +node_modules/tag: package.json + @npm install --no-save . + @cd node_modules/libavjs-webcodecs-bridge && make all + @touch $@ + +public/app/tsc: tsconfig.json $(shell find src) public/app/bundled/libavjs-$(LIBAVJS_COMMIT)/version.txt node_modules/tag @tsc --noEmit @./node_modules/esbuild/bin/esbuild $(ENTRYPOINTS) --sourcemap --bundle --format=esm --outbase=src --outdir=public/app/ @(cd public $(foreach ext,js css,$(foreach outfilebase,$(OUTFILESBASE),&& MD5=$$(md5sum "$(outfilebase).$(ext)" | cut -c-10) && mv "$(outfilebase).$(ext)" "$(outfilebase).$${MD5}.$(ext)" && echo "s|$(outfilebase).$(ext)|$(outfilebase).$${MD5}.$(ext)|g"))) > $@ diff --git a/package-lock.json b/package-lock.json index d39e70d..1b3dddb 100644 --- a/package-lock.json +++ b/package-lock.json @@ -8,9 +8,11 @@ "@tensorflow/tfjs": "^4.12.0", "@tensorflow/tfjs-backend-wasm": "^4.12.0", "@tensorflow/tfjs-backend-webgpu": "^4.12.0", + "@types/dom-webcodecs": "^0.1.11", "@types/react": "^18.2.38", "@types/wicg-file-system-access": "^2023.10.4", "esbuild": "^0.19.5", + "libavjs-webcodecs-bridge": "github:Yahweasel/libavjs-webcodecs-bridge#98fb137f2b06029e376dc252c31d2f0b540a5919", "preact": "^10.19.2" } }, @@ -472,6 +474,11 @@ "@tensorflow/tfjs-core": "4.12.0" } }, + "node_modules/@types/dom-webcodecs": { + "version": "0.1.11", + "resolved": "https://registry.npmjs.org/@types/dom-webcodecs/-/dom-webcodecs-0.1.11.tgz", + "integrity": "sha512-yPEZ3z7EohrmOxbk/QTAa0yonMFkNkjnVXqbGb7D4rMr+F1dGQ8ZUFxXkyLLJuiICPejZ0AZE9Rrk9wUCczx4A==" + }, "node_modules/@types/emscripten": { "version": "0.0.34", "resolved": "https://registry.npmjs.org/@types/emscripten/-/emscripten-0.0.34.tgz", @@ -534,6 +541,11 @@ "resolved": "https://registry.npmjs.org/@types/wicg-file-system-access/-/wicg-file-system-access-2023.10.4.tgz", "integrity": "sha512-ewOj7hWhsUTS2+aY6zY+7BwlgqGBj5ZXxKuHt3TAWpIJH0bDW/6bO1N1SdUDAzV8r0Nc+/ZtpAEETYTwrehBMw==" }, + "node_modules/@ungap/global-this": { + "version": "0.4.4", + "resolved": "https://registry.npmjs.org/@ungap/global-this/-/global-this-0.4.4.tgz", + "integrity": "sha512-mHkm6FvepJECMNthFuIgpAEFmPOk71UyXuIxYfjytvFTnSDBIz7jmViO+LfHI/AjrazWije0PnSP3+/NlwzqtA==" + }, "node_modules/@webgpu/types": { "version": "0.1.30", "resolved": "https://registry.npmjs.org/@webgpu/types/-/types-0.1.30.tgz", @@ -698,6 +710,27 @@ "node": ">=6" } }, + "node_modules/fast-xml-parser": { + "version": "4.3.2", + "resolved": "https://registry.npmjs.org/fast-xml-parser/-/fast-xml-parser-4.3.2.tgz", + "integrity": "sha512-rmrXUXwbJedoXkStenj1kkljNF7ugn5ZjR9FJcwmCfcCbtOMDghPajbc+Tck6vE6F5XsDmx+Pr2le9fw8+pXBg==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/NaturalIntelligence" + }, + { + "type": "paypal", + "url": "https://paypal.me/naturalintelligence" + } + ], + "dependencies": { + "strnum": "^1.0.5" + }, + "bin": { + "fxparser": "src/cli/cli.js" + } + }, "node_modules/form-data": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz", @@ -735,6 +768,41 @@ "node": ">=8" } }, + "node_modules/libav.js": { + "version": "4.6.6", + "resolved": "https://registry.npmjs.org/libav.js/-/libav.js-4.6.6.tgz", + "integrity": "sha512-p+rgPnn8ZSRYE9s2z3sn0oeQAq1CvNl21ZML8ZSvgtLCs3EzaQNFWAczfoxNjHMPzssnvu3o+fiEIhcZIlpunQ==", + "dependencies": { + "fast-xml-parser": "^4.2.2" + } + }, + "node_modules/libavjs-webcodecs-bridge": { + "version": "0.0.6", + "resolved": "git+ssh://git@github.com/Yahweasel/libavjs-webcodecs-bridge.git#98fb137f2b06029e376dc252c31d2f0b540a5919", + "integrity": "sha512-oMgMq/sCSsZmJ5es44Yj+vs7fiyjv5Oo/iQ7DKASnOZPVuZHkejUY9LmOMfglzmhCA3p8VvT6Vah26iSaSM0OQ==", + "license": "ISC", + "dependencies": { + "libav.js": "^4.4.6", + "libavjs-webcodecs-polyfill": "^0.4.2" + } + }, + "node_modules/libavjs-webcodecs-polyfill": { + "version": "0.4.3", + "resolved": "https://registry.npmjs.org/libavjs-webcodecs-polyfill/-/libavjs-webcodecs-polyfill-0.4.3.tgz", + "integrity": "sha512-XyykrwB5L0hqvCLEPz+4bwG1uZ3MugkkNeJd94B1MscyWTlW9VQzjxXsh5ihgjVzUJGabBSag2QjMgpkCQzc3g==", + "dependencies": { + "@ungap/global-this": "^0.4.4", + "libav.js": "=4.3.6" + } + }, + "node_modules/libavjs-webcodecs-polyfill/node_modules/libav.js": { + "version": "4.3.6", + "resolved": "https://registry.npmjs.org/libav.js/-/libav.js-4.3.6.tgz", + "integrity": "sha512-xyUh1ZgltbJ241BSgR52LqL/d6BCFoPJAcf97qjGe3l38OMSfze/VLMlh2ijBN2oNcjZvRj1AUdsJugLdMhc7Q==", + "dependencies": { + "fast-xml-parser": "^4.2.2" + } + }, "node_modules/long": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/long/-/long-4.0.0.tgz", @@ -861,6 +929,11 @@ "node": ">=8" } }, + "node_modules/strnum": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/strnum/-/strnum-1.0.5.tgz", + "integrity": "sha512-J8bbNyKKXl5qYcR36TIO8W3mVGVHrmmxsd5PAItGkmyzwJvybiw2IVq5nqd0i4LSNSkB/sx9VHllbfFdr9k1JA==" + }, "node_modules/supports-color": { "version": "7.2.0", "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", diff --git a/package.json b/package.json index 2c99d25..c4884a2 100644 --- a/package.json +++ b/package.json @@ -3,9 +3,11 @@ "@tensorflow/tfjs": "^4.12.0", "@tensorflow/tfjs-backend-wasm": "^4.12.0", "@tensorflow/tfjs-backend-webgpu": "^4.12.0", + "@types/dom-webcodecs": "^0.1.11", "@types/react": "^18.2.38", "@types/wicg-file-system-access": "^2023.10.4", "esbuild": "^0.19.5", + "libavjs-webcodecs-bridge": "github:Yahweasel/libavjs-webcodecs-bridge#98fb137f2b06029e376dc252c31d2f0b540a5919", "preact": "^10.19.2" } } diff --git a/src/infer/tfjs.ts b/src/infer/tfjs.ts index 5f9c206..1ca7159 100644 --- a/src/infer/tfjs.ts +++ b/src/infer/tfjs.ts @@ -58,10 +58,10 @@ export async function convert( + `# all coordinates are on frame where left-top = (0, 0) and right-bottom is (1, 1)\n` )) const MODEL_DIMENSION = 640 - for await (const imageData of getFrames(file, MODEL_DIMENSION, MODEL_DIMENSION)) { - const [boxes, scores, classes] = await infer(model, yoloVersion, imageData) + for await (const videoFrame of getFrames(file)) { + const [boxes, scores, classes] = await infer(model, yoloVersion, videoFrame) if (ctx) { - ctx.putImageData(imageData, 0, 0) + ctx.drawImage(videoFrame, 0, 0) ctx.strokeStyle = "red" ctx.lineWidth = 5; } @@ -77,26 +77,28 @@ export async function convert( await outputstream.write(textEncoder.encode(line)) const [cx, cy, w, h] = box if (ctx) { + const scale = Math.max(videoFrame.displayWidth, videoFrame.displayHeight) ctx.strokeRect( - (cx - w / 2) * Math.max(imageData.width, imageData.height), - (cy - h / 2) * Math.max(imageData.width, imageData.height), - w * Math.max(imageData.width, imageData.height), - h * Math.max(imageData.width, imageData.height), + (cx - w / 2) * scale, (cy - h / 2) * scale, w * scale, h * scale, ) } } + videoFrame.close() onProgress({"converting": Math.min(framenr / numberOfFrames, 1)}) framenr++ } } export function preprocess( - imageData: ImageData, + videoFrame: VideoFrame, modelWidth: number, modelHeight: number ): [tf.Tensor, number, number] { + const offScreenCanvas = new OffscreenCanvas(videoFrame.displayWidth, videoFrame.displayHeight) + const ctx = offScreenCanvas.getContext("2d")! + ctx.drawImage(videoFrame, 0, 0) - const img = tf.browser.fromPixels(imageData); + const img = tf.browser.fromPixels(offScreenCanvas.transferToImageBitmap()) const [h, w] = img.shape.slice(0, 2); // get source width and height const maxSize = Math.max(w, h); // get max size @@ -151,9 +153,9 @@ function getBoxesAndScoresAndClassesFromResult( export async function infer( model: Model, yoloVersion: YoloVersion, - imageData: ImageData, + videoFrame: VideoFrame, ): Promise<[Float32Array, Float32Array, Float32Array]> { - const [img_tensor, xRatio, yRatio] = tf.tidy(() => preprocess(imageData, 640, 640)) + const [img_tensor, xRatio, yRatio] = tf.tidy(() => preprocess(videoFrame, 640, 640)) if (yoloVersion === "v5") { const res = await model.executeAsync(img_tensor) const [boxes, scores, classes] = (res as tf.Tensor[]).slice(0, 3) diff --git a/src/lib/video.ts b/src/lib/video.ts index 1715016..2edddb7 100644 --- a/src/lib/video.ts +++ b/src/lib/video.ts @@ -1,4 +1,5 @@ import type * as LibAVTypes from "../../public/app/bundled/libavjs/dist/libav.types"; +import * as LibAVWebcodecsBridge from "libavjs-webcodecs-bridge" declare global { interface Window { LibAV: LibAVTypes.LibAVWrapper; @@ -37,7 +38,6 @@ export async function getNumberOfFrames(input: File): Promise { } libav.unlink(input.name); libav.unlink(FFPROBEOUTPUT); - // should we destroy libavjs? // TODO const outputjson = new TextDecoder("utf-8").decode(writtenData); try { const videostreams = JSON.parse(outputjson).streams.filter( @@ -68,15 +68,143 @@ export async function getNumberOfFrames(input: File): Promise { } } +function promiseWithResolve(): { + promise: Promise, + resolve: (value: T) => void, + reject: (error: any) => void +} { + // next 6 lines could be made in one on platforms that support Promise.withResolvers() + let resolve: (value: T) => void + let reject: (error: any) => void + const promise = new Promise((res, rej) => { + resolve = res + reject = rej + }) + // @ts-ignore + return {promise, resolve, reject} +} + +/** + * See https://github.com/Yahweasel/libavjs-webcodecs-bridge/issues/3#issuecomment-1837189047 for more info + */ +async function createFakeKeyFrameChunk( + decoderConfig: VideoDecoderConfig +): Promise { + const {promise, resolve, reject} = promiseWithResolve() + const encoderConfig = {...decoderConfig} as VideoEncoderConfig + // encoderConfig needs a width and height set; in my tests these dimensions + // do not have to match the actual video dimensions, so I'm just using something + // random for them + encoderConfig.width = 640 + encoderConfig.height = 360 + encoderConfig.avc = {format: decoderConfig.description ? "avc" : "annexb"} + const videoEncoder = new VideoEncoder({ + output: (chunk, _metadata) => resolve(chunk), + error: e => reject(e) + }) + try { + videoEncoder.configure(encoderConfig) + const oscanvas = new OffscreenCanvas(encoderConfig.width, encoderConfig.height) + // getting context seems to be minimal needed before it can be used as VideoFrame source + oscanvas.getContext("2d") + const videoFrame = new VideoFrame( + oscanvas, {timestamp: Number.MIN_SAFE_INTEGER}) + try { + videoEncoder.encode(videoFrame) + await videoEncoder.flush() + const chunk = await promise + return chunk + } finally { + videoFrame.close() + } + } finally { + videoEncoder.close() + } +} + +class VideoDecoderWrapper { + private frames: VideoFrame[] + private nextFrameNumber: number + private nextIsDummyFrame: boolean + private end_of_stream: boolean + private videoDecoder: VideoDecoder + + private constructor( + startFrameNumber: number, + videoDecoderConfig: VideoDecoderConfig, + private getMoreEncodedChunks: () => Promise<{chunks: EncodedVideoChunk[], end_of_stream: boolean}>, + ) { + this.frames = [] + this.end_of_stream = false + this.nextIsDummyFrame = true + this.nextFrameNumber = startFrameNumber + this.videoDecoder = new VideoDecoder({ + output: this.addFrames.bind(this), + error: error => console.log("Video decoder error", {error}) + }) + this.videoDecoder.configure(videoDecoderConfig) + } + + public static async getVideoDecoderWrapper( + startFrameNumber: number, + videoDecoderConfig: VideoDecoderConfig, + getMoreEncodedChunks: () => Promise<{chunks: EncodedVideoChunk[], end_of_stream: boolean}>, +): Promise { + const videoDecoderWrapper = new VideoDecoderWrapper(startFrameNumber, videoDecoderConfig, getMoreEncodedChunks) + const chunk = await createFakeKeyFrameChunk(videoDecoderConfig) + videoDecoderWrapper.videoDecoder.decode(chunk) + return videoDecoderWrapper + } + + + public addFrames(videoFrame: VideoFrame) { + this.frames.push(videoFrame) + } + + public availableFrames(): number { + return this.frames.length + } + + public async getNextFrame(): Promise { + while (this.availableFrames() || !this.end_of_stream) { + if (this.frames.length) { + let frame: VideoFrame = this.frames.splice(0, 1)[0] + if (this.nextIsDummyFrame) { + frame.close() + this.nextIsDummyFrame = false + } else { + return frame + } + } else { + const {chunks, end_of_stream} = await this.getMoreEncodedChunks() + try { + chunks.forEach(chunk => this.videoDecoder.decode(chunk)) + } catch (e) { + console.log("my error", e) + throw e + } + + if (end_of_stream) { + console.log("Closing") + await this.videoDecoder.flush() + this.videoDecoder.close() + this.end_of_stream = true + } + // make sure there is time to run async code (probably not necessary but doesn't hurt) + await new Promise(resolve => window.setTimeout(resolve, 0)) + } + } + return null + } +} + +/** + * Gets video frames from a file. + * Make sure to call frame.close() when done with a frame + */ export async function* getFrames( input: File, - width: number, - height: number -): AsyncGenerator { - if (!Number.isInteger(width) || !Number.isInteger(height)) { - throw new Error("Not ints"); - } - let scale_ctx: null | number = null; +): AsyncGenerator { const libav = await window.LibAV.LibAV({ noworker: false, nothreads: true }); await libav.av_log_set_level(libav.AV_LOG_ERROR); await libav.mkreadaheadfile(input.name, input); @@ -91,63 +219,28 @@ export async function* getFrames( ); } const [stream] = video_streams; - const [, c, pkt, frameptr] = await libav.ff_init_decoder( - stream.codec_id, - stream.codecpar - ); - while (true) { + const pkt = await libav.av_packet_alloc() + const decoderConfig = (await LibAVWebcodecsBridge.videoStreamToConfig(libav, stream)) as VideoDecoderConfig + decoderConfig.hardwareAcceleration = "prefer-software" + const videoDecoderWrapper = await VideoDecoderWrapper.getVideoDecoderWrapper(0, decoderConfig, async () => { const [result, packets] = await libav.ff_read_multi( fmt_ctx, pkt, undefined, - { limit: 100 * 1024, copyoutPacket: "ptr" } + { limit: 100 * 1024} ); const end_of_stream = result === libav.AVERROR_EOF; - const framePointers = await libav.ff_decode_multi( - c, - pkt, - frameptr, - packets[stream.index], - { fin: end_of_stream, copyoutFrame: "ptr" } - ) + const chunks = packets[stream.index].map(p => LibAVWebcodecsBridge.packetToEncodedVideoChunk(p, stream) as EncodedVideoChunk) + return {chunks, end_of_stream} + }) - for (const fp of framePointers) { - if (scale_ctx === null) { - const frameWidth = await libav.AVFrame_width(fp); - const frameHeight = await libav.AVFrame_height(fp); - const frameFormat = await libav.AVFrame_format(fp); - const scaleFactor = Math.min( - 1, - width / frameWidth, - height / frameHeight - ); - const targetWidth = Math.round(frameWidth * scaleFactor); - const targetHeight = Math.round(frameHeight * scaleFactor); - - scale_ctx = await libav.sws_getContext( - frameWidth, - frameHeight, - frameFormat, - targetWidth, - targetHeight, - libav.AV_PIX_FMT_RGBA, - 2, - 0, - 0, - 0 - ); - } - await libav.sws_scale_frame(scale_ctx, frameptr, fp); - const imageData = await libav.ff_copyout_frame_video_imagedata( - frameptr - ); - await libav.av_frame_unref(fp); - await libav.av_frame_unref(frameptr); - yield imageData; - } - if (end_of_stream) { + while (true) { + const frame = await videoDecoderWrapper.getNextFrame() + if (frame === null) { + console.log("done -- break") break; } + yield frame; } } finally { await libav.unlink(input.name);