microsoft · fs-eire · Jan 13, 2024 · Dec 20, 2023 · Dec 27, 2023 · Dec 27, 2023
diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts
@@ -3,12 +3,24 @@
 
 import {Env, Tensor} from 'onnxruntime-common';
 
+import {tensorDataTypeEnumToString} from '../wasm-common';
+
 import {configureLogger, LOG_DEBUG} from './log';
 import {createView, TensorView} from './tensor-view';
 import {createGpuDataManager, downloadGpuData, GpuDataManager} from './webgpu/gpu-data-manager';
 import {RunFunction, WEBGPU_OP_RESOLVE_RULES} from './webgpu/op-resolve-rules';
 import {ProgramManager} from './webgpu/program-manager';
-import {ComputeContext, GpuData, ProgramInfo, ProgramInputTensorInfoDependency} from './webgpu/types';
+import {ComputeContext, GpuData, ProgramInfo, ProgramInputTensorInfoDependency, QueryType} from './webgpu/types';
+
+interface KernelInfo {  // value type of WebGpuBackend.kernels (kernelId -> KernelInfo)
+  opType: string;
+  nodeName: string;
+  kernelEntry: RunFunction;
+  attributes: [((attribute: unknown) => unknown)|undefined, unknown];  // [preprocess-once fn | undefined, attribute]
+  programName: string;  // '' at creation; overwritten on every run of the kernel
+  inputTensorViews: readonly TensorView[];  // inputs of the most recent run
+  outputTensorViews: readonly TensorView[];  // outputs of the most recent run
+}
 
 const getProgramInputTensorInfoDependencyKey =
     (inputTensors: readonly TensorView[], inputDependencies: readonly ProgramInputTensorInfoDependency[]): string => {
@@ -122,20 +134,21 @@ export class WebGpuBackend {
     return data;
   }
 
-  /**
-   * a KernelID -> kernel info mapping. value is
-   * [ op_type, name, run function, [optional] preprocess_attribute_once function ]
-   */
-  kernels: Map<number, [string, string, RunFunction, [((attribute: unknown) => unknown) | undefined, unknown]]>;
-
+  kernels: Map<number, KernelInfo>;  // kernelId -> kernel info
   private commandEncoder: GPUCommandEncoder|null = null;
   private computePassEncoder: GPUComputePassEncoder|null = null;
+  maxDispatchNumber = 16;  // also sizes the timestamp query set (2 queries per dispatch)
   pendingDispatchNumber = 0;
 
-  queryData?: GpuData;
-  querySet?: GPUQuerySet;
-  querySetCount = 2;
-  queryTimeBase?: bigint;
+
+  // ids of the kernels dispatched (while profiling is on) since the last flush, in dispatch order
+  private pendingKernels: number[] = [];
+  // id of a query read-back buffer -> kernel ids of the batch whose timestamps that buffer receives
+  private pendingQueries: Map<number, number[]> = new Map();
+  private queryResolveData?: GpuData;  // destination buffer of resolveQuerySet()
+  private querySet?: GPUQuerySet;
+  private queryTimeBase?: bigint;  // first start timestamp read back; reported times are relative to it
+  queryType: QueryType;
 
   env: Env;
 
@@ -161,7 +174,9 @@ export class WebGpuBackend {
       requiredFeatures,
     };
 
-    if (adapter.features.has('timestamp-query')) {
+    if (adapter.features.has('chromium-experimental-timestamp-query-inside-passes')) {
+      requiredFeatures.push('chromium-experimental-timestamp-query-inside-passes' as GPUFeatureName);
+    } else if (adapter.features.has('timestamp-query')) {
       requiredFeatures.push('timestamp-query');
     }
     if (adapter.features.has('shader-f16')) {
@@ -188,36 +203,47 @@ export class WebGpuBackend {
     };
 
     Object.defineProperty(this.env.webgpu, 'device', {value: this.device});
+
+    // init queryType, which is necessary for createKernel
+    this.setQueryType();
   }
 
   dispose(): void {
     if (typeof this.querySet !== 'undefined') {
       this.querySet.destroy();
+      this.gpuDataManager.release(this.queryResolveData!.id);  // allocated together with querySet
     }
     this.gpuDataManager.dispose();
   }
 
   getCommandEncoder(): GPUCommandEncoder {
     if (!this.commandEncoder) {
       this.commandEncoder = this.device.createCommandEncoder();
+
+      // re-evaluate queryType for every new command encoder, as profiling may be enabled for a specific run only
+      this.setQueryType();
+      if (this.queryType !== QueryType.none && typeof this.querySet === 'undefined') {  // created lazily, once
+        this.querySet = this.device.createQuerySet({
+          type: 'timestamp',
+          count: this.maxDispatchNumber * 2,  // a begin and an end timestamp per dispatch
+        });
+        this.queryResolveData = this.gpuDataManager.create(  // 8 bytes per 64-bit timestamp
+            // eslint-disable-next-line no-bitwise
+            this.maxDispatchNumber * 2 * 8, GPUBufferUsage.COPY_SRC | GPUBufferUsage.QUERY_RESOLVE);
+      }
     }
     return this.commandEncoder;
   }
 
   getComputePassEncoder(): GPUComputePassEncoder {
     if (!this.computePassEncoder) {
       const computePassDescriptor: GPUComputePassDescriptor = {};
-      if (this.isQueryEnabled()) {
-        if (typeof this.querySet === 'undefined') {
-          this.querySet = this.device.createQuerySet({
-            type: 'timestamp',
-            count: this.querySetCount,
-          });
-        }
+
+      if (this.queryType === QueryType.atPasses) {
         computePassDescriptor.timestampWrites = {
-          querySet: this.querySet,
-          beginningOfPassWriteIndex: 0,
-          endOfPassWriteIndex: 1,
+          querySet: this.querySet!,
+          beginningOfPassWriteIndex: this.pendingDispatchNumber * 2,
+          endOfPassWriteIndex: this.pendingDispatchNumber * 2 + 1,
         };
       }
 
@@ -234,19 +260,87 @@ export class WebGpuBackend {
   }
 
   flush(): void {
-    if (this.commandEncoder) {
-      this.endComputePass();
-      this.device.queue.submit([this.getCommandEncoder().finish()]);
-      this.gpuDataManager.refreshPendingBuffers();
-      this.commandEncoder = null;
-      this.pendingDispatchNumber = 0;
+    if (!this.commandEncoder) {
+      return;
     }
-  }
 
-  isQueryEnabled(): boolean {
-    return this.device.features.has('timestamp-query') &&
-        (this.env.webgpu.profiling?.mode === 'default' ||
-         (!this.env.webgpu.profiling?.mode && this.env.webgpu.profilingMode === 'default'));
+    // Submit everything recorded so far; when profiling is on, also read back the timestamps of this batch.
+    this.endComputePass();  // an open compute pass would make finish() below fail validation
+
+    let queryReadData: GpuData;
+    if (this.queryType !== QueryType.none) {
+      this.commandEncoder.resolveQuerySet(
+          this.querySet!, 0, this.pendingDispatchNumber * 2, this.queryResolveData!.buffer, 0);
+      queryReadData = this.gpuDataManager.create(
+          // eslint-disable-next-line no-bitwise
+          this.pendingDispatchNumber * 2 * 8, GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST);
+      this.pendingQueries.set(queryReadData.id, this.pendingKernels);
+      this.pendingKernels = [];
+      this.commandEncoder.copyBufferToBuffer(
+          this.queryResolveData!.buffer, 0, queryReadData.buffer, 0, this.pendingDispatchNumber * 2 * 8);
+    }
+
+    this.device.queue.submit([this.commandEncoder.finish()]);
+    this.gpuDataManager.refreshPendingBuffers();
+    this.commandEncoder = null;
+    this.pendingDispatchNumber = 0;
+
+    if (this.queryType !== QueryType.none) {
+      void queryReadData!.buffer.mapAsync(GPUMapMode.READ).then(() => {
+        try {
+          const mappedData = new BigUint64Array(queryReadData.buffer.getMappedRange());
+          const pendingKernels = this.pendingQueries.get(queryReadData.id);
+          for (let i = 0; i < mappedData.length / 2; i++) {
+            const kernelId = pendingKernels![i];
+            const kernelInfo = this.kernels.get(kernelId);
+            if (!kernelInfo) {
+              continue;  // no such kernel any more (e.g. released before its timestamps were readable)
+            }
+            const {opType, nodeName, programName, inputTensorViews, outputTensorViews} = kernelInfo;
+            if (typeof this.queryTimeBase === 'undefined') {
+              this.queryTimeBase = mappedData[i * 2];
+            }
+            const startTime = Number(mappedData[i * 2] - this.queryTimeBase);
+            const endTime = Number(mappedData[i * 2 + 1] - this.queryTimeBase);
+            if (!Number.isSafeInteger(startTime) || !Number.isSafeInteger(endTime)) {
+              throw new RangeError('incorrect timestamp range');
+            }
+
+            if (this.env.webgpu.profiling?.ondata) {
+              this.env.webgpu.profiling.ondata({
+                version: 1,
+                inputsMetadata: inputTensorViews.map(
+                    value => ({dims: value.dims, dataType: tensorDataTypeEnumToString(value.dataType)})),
+                outputsMetadata: outputTensorViews.map(
+                    value => ({dims: value.dims, dataType: tensorDataTypeEnumToString(value.dataType)})),
+                kernelId,
+                kernelType: opType,
+                kernelName: nodeName,
+                startTime,
+                endTime,
+              });
+            } else {
+              // if no callback is provided, print the profiling message to console
+              let inputShapes = '';
+              inputTensorViews.forEach((value, i) => {
+                inputShapes += `input[${i}]: [${value.dims}] | ${tensorDataTypeEnumToString(value.dataType)}, `;
+              });
+              let outputShapes = '';
+              outputTensorViews.forEach((value, i) => {
+                outputShapes += `output[${i}]: [${value.dims}] | ${tensorDataTypeEnumToString(value.dataType)}, `;
+              });
+              // eslint-disable-next-line no-console
+              console.log(`[profiling] kernel "${kernelId}|${opType}|${nodeName}|${programName}" ${inputShapes}${
+                  outputShapes}execution time: ${endTime - startTime} ns`);
+            }
+          }
+        } finally {  // always hand the read-back buffer back, even when reporting above throws
+          queryReadData.buffer.unmap();
+          this.gpuDataManager.release(queryReadData.id);
+          this.pendingQueries.delete(queryReadData.id);
+        }
+      });
+    }
   }
 
   /**
@@ -378,14 +472,22 @@ export class WebGpuBackend {
       this.programManager.setArtifact(key, artifact);
       LOG_DEBUG('info', () => `[artifact] key: ${key}, programName: ${program.name}`);
     }
+    // update kernels
+    const kernelInfo = this.kernels.get(this.currentKernelId!)!;
+    kernelInfo.programName = artifact.programInfo.name;
+    kernelInfo.inputTensorViews = inputTensorViews;
+    kernelInfo.outputTensorViews = outputTensorViews;
 
     LOG_DEBUG(
         'info',
         () => `[ProgramManager] run "${program.name}" (key=${key}) with ${normalizedDispatchGroup[0]}x${
             normalizedDispatchGroup[1]}x${normalizedDispatchGroup[2]}`);
-    this.programManager.run(
-        artifact, inputTensorViews, outputTensorViews, inputDatas, outputDatas, normalizedDispatchGroup,
-        uniformBufferBinding);
+
+    if (this.queryType !== QueryType.none) {
+      this.pendingKernels.push(this.currentKernelId!);
+    }
+
+    this.programManager.run(artifact, inputDatas, outputDatas, normalizedDispatchGroup, uniformBufferBinding);
 
     return outputTensorViews;
   }
@@ -418,7 +520,16 @@ export class WebGpuBackend {
       throw new Error(`kernel not implemented: ${opType}`);
     }
 
-    this.kernels.set(kernelId, [opType, nodeName, op[0], [op[1], attribute]]);
+    const kernelInfo: KernelInfo = {
+      opType,
+      nodeName,
+      kernelEntry: op[0],
+      attributes: [op[1], attribute],
+      programName: '',
+      inputTensorViews: [],
+      outputTensorViews: [],
+    };
+    this.kernels.set(kernelId, kernelInfo);
   }
 
   releaseKernel(kernelId: number): void {
@@ -439,7 +550,10 @@ export class WebGpuBackend {
     if (!kernel) {
       throw new Error(`kernel not created: ${kernelId}`);
     }
-    const [opType, nodeName, kernelEntry, attributes] = kernel;
+    const opType = kernel.opType;
+    const nodeName = kernel.nodeName;
+    const kernelEntry = kernel.kernelEntry;
+    const attributes = kernel.attributes;
     if (this.currentKernelId !== null) {
       throw new Error(`kernel "[${opType}] ${nodeName}" is not allowed to be called recursively`);
     }
@@ -514,5 +628,23 @@ export class WebGpuBackend {
       return createView(data.buffer, type);
     };
   }
+  // Writes a timestamp into slot `index` of querySet on the current compute pass encoder.
+  // Does nothing unless queryType is insidePasses. writeTimestamp is reached through an `any` cast.
+  writeTimeStamp(index: number): void {
+    if (this.queryType === QueryType.insidePasses) {
+      // eslint-disable-next-line @typescript-eslint/no-explicit-any
+      (this.computePassEncoder as any).writeTimestamp(this.querySet, index);
+    }
+  }
+  // Re-derives queryType from the profiling mode and from the timestamp features enabled on the device.
+  setQueryType(): void {
+    const mode = this.env.webgpu.profiling?.mode ?? this.env.webgpu.profilingMode;  // deprecated flag as fallback
+    this.queryType = QueryType.none;
+    if (mode === 'default' && this.device.features.has('chromium-experimental-timestamp-query-inside-passes')) {
+      this.queryType = QueryType.insidePasses;
+    } else if (mode === 'default' && this.device.features.has('timestamp-query')) {
+      this.queryType = QueryType.atPasses;
+    }
+  }
   // #endregion
 }
diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts
@@ -10,7 +10,7 @@ import {WebGpuBackend} from './backend-webgpu';
 import {LOG_DEBUG} from './log';
 import {TensorView} from './tensor-view';
 import {ShapeUtil} from './util';
-import {ComputeContext, ComputeContextInputsOutputsMapping, ProgramInfo} from './webgpu/types';
+import {ComputeContext, ComputeContextInputsOutputsMapping, ProgramInfo, QueryType} from './webgpu/types';
 
 /* eslint-disable no-bitwise */
 
@@ -186,9 +186,10 @@ export const init = async(module: OrtWasmModule, env: Env, gpuAdapter: GPUAdapte
           },
 
       // jsepCreateKernel
-      (name: string, kernel: number, attribute: unknown) => backend.createKernel(
-          name, kernel, attribute,
-          env.debug || backend.isQueryEnabled() ? module.UTF8ToString(module._JsepGetNodeName(kernel)) : `${kernel}`),
+      (opType: string, kernelId: number, attribute: unknown) => backend.createKernel(
+          opType, kernelId, attribute,  // NOTE(review): the node name below is chosen once, at kernel creation;
+          env.debug || backend.queryType !== QueryType.none ? module.UTF8ToString(module._JsepGetNodeName(kernelId)) :
+                                                              `${opType}`),  // later profiling keeps opType
 
       // jsepReleaseKernel
       (kernel: number) => backend.releaseKernel(kernel),