microsoft · fs-eire · Jan 13, 2024 · Dec 20, 2023 · Dec 27, 2023 · Dec 27, 2023
diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts
@@ -3,12 +3,14 @@
 
 import {Env, Tensor} from 'onnxruntime-common';
 
+import {tensorDataTypeEnumToString} from '../wasm-common';
+
 import {configureLogger, LOG_DEBUG} from './log';
 import {createView, TensorView} from './tensor-view';
 import {createGpuDataManager, downloadGpuData, GpuDataManager} from './webgpu/gpu-data-manager';
 import {RunFunction, WEBGPU_OP_RESOLVE_RULES} from './webgpu/op-resolve-rules';
 import {ProgramManager} from './webgpu/program-manager';
-import {ComputeContext, GpuData, ProgramInfo, ProgramInputTensorInfoDependency} from './webgpu/types';
+import {ComputeContext, GpuData, PendingKernelInfo, ProgramInfo, ProgramInputTensorInfoDependency, QueryType} from './webgpu/types';
 
 const getProgramInputTensorInfoDependencyKey =
     (inputTensors: readonly TensorView[], inputDependencies: readonly ProgramInputTensorInfoDependency[]): string => {
@@ -130,12 +132,18 @@ export class WebGpuBackend {
 
   private commandEncoder: GPUCommandEncoder|null = null;
   private computePassEncoder: GPUComputePassEncoder|null = null;
+  maxDispatchNumber = 16;
   pendingDispatchNumber = 0;
 
-  queryData?: GpuData;
-  querySet?: GPUQuerySet;
-  querySetCount = 2;
-  queryTimeBase?: bigint;
+
+  // info of kernels pending submission for a single batch
+  pendingKernels: PendingKernelInfo[] = [];
+  // queryReadData -> pendingKernels mapping for all the batches
+  private pendingQueries: Map<number, PendingKernelInfo[]> = new Map();
+  private queryResolveData?: GpuData;
+  private querySet?: GPUQuerySet;
+  private queryTimeBase?: bigint;
+  queryType: QueryType;
 
   env: Env;
 
@@ -161,7 +169,9 @@ export class WebGpuBackend {
       requiredFeatures,
     };
 
-    if (adapter.features.has('timestamp-query')) {
+    if (adapter.features.has('chromium-experimental-timestamp-query-inside-passes')) {
+      requiredFeatures.push('chromium-experimental-timestamp-query-inside-passes' as GPUFeatureName);
+    } else if (adapter.features.has('timestamp-query')) {
       requiredFeatures.push('timestamp-query');
     }
     if (adapter.features.has('shader-f16')) {
@@ -200,24 +210,35 @@ export class WebGpuBackend {
   getCommandEncoder(): GPUCommandEncoder {
     if (!this.commandEncoder) {
       this.commandEncoder = this.device.createCommandEncoder();
+
+      // Re-evaluate the query type for every new command encoder, then lazily create the query set and its
+      // resolve buffer the first time timestamp queries are in use.
+      this.setQueryType();
+      if (this.queryType !== QueryType.none && typeof this.querySet === 'undefined') {
+        this.querySet = this.device.createQuerySet({
+          type: 'timestamp',
+          // two entries (start and end) for each dispatch of a batch
+          count: this.maxDispatchNumber * 2,
+        });
+        this.queryResolveData = this.gpuDataManager.create(
+            // eslint-disable-next-line no-bitwise
+            this.maxDispatchNumber * 2 * 8, GPUBufferUsage.COPY_SRC | GPUBufferUsage.QUERY_RESOLVE);
+      }
     }
     return this.commandEncoder;
   }
 
   getComputePassEncoder(): GPUComputePassEncoder {
     if (!this.computePassEncoder) {
       const computePassDescriptor: GPUComputePassDescriptor = {};
-      if (this.isQueryEnabled()) {
-        if (typeof this.querySet === 'undefined') {
-          this.querySet = this.device.createQuerySet({
-            type: 'timestamp',
-            count: this.querySetCount,
-          });
-        }
+
+      // In `atPasses` mode the pass itself records its begin/end timestamps into the pair of query set entries
+      // that belongs to the next pending dispatch.
+      if (this.queryType === QueryType.atPasses) {
         computePassDescriptor.timestampWrites = {
-          querySet: this.querySet,
-          beginningOfPassWriteIndex: 0,
-          endOfPassWriteIndex: 1,
+          querySet: this.querySet!,
+          beginningOfPassWriteIndex: this.pendingDispatchNumber * 2,
+          endOfPassWriteIndex: this.pendingDispatchNumber * 2 + 1,
         };
       }
 
@@ -234,19 +250,105 @@ export class WebGpuBackend {
   }
 
+  /**
+   * Submits the commands recorded so far to the GPU queue and resets the per-batch state.
+   *
+   * Does nothing when no command encoder is active. When timestamp queries are in use, the timestamps of the
+   * batch are resolved and copied into a mappable buffer before submission; once that buffer has been mapped,
+   * one profiling record per pending kernel is passed to `env.webgpu.profiling.ondata`, or printed to the
+   * console when no callback is set.
+   */
   flush(): void {
-    if (this.commandEncoder) {
-      this.endComputePass();
-      this.device.queue.submit([this.getCommandEncoder().finish()]);
-      this.gpuDataManager.refreshPendingBuffers();
-      this.commandEncoder = null;
-      this.pendingDispatchNumber = 0;
+    if (!this.commandEncoder) {
+      return;
     }
-  }
 
-  isQueryEnabled(): boolean {
-    return this.device.features.has('timestamp-query') &&
-        (this.env.webgpu.profiling?.mode === 'default' ||
-         (!this.env.webgpu.profiling?.mode && this.env.webgpu.profilingMode === 'default'));
+    // A compute pass may still be open at this point; it has to be ended before the queries are resolved and
+    // before the command encoder is finished.
+    this.endComputePass();
+
+    let queryReadData: GpuData;
+    if (this.queryType !== QueryType.none) {
+      this.commandEncoder.resolveQuerySet(
+          this.querySet!, 0, this.pendingDispatchNumber * 2, this.queryResolveData!.buffer, 0);
+      queryReadData = this.gpuDataManager.create(
+          // eslint-disable-next-line no-bitwise
+          this.pendingDispatchNumber * 2 * 8, GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST);
+      this.pendingQueries.set(queryReadData.id, this.pendingKernels);
+      this.pendingKernels = [];
+      this.commandEncoder.copyBufferToBuffer(
+          this.queryResolveData!.buffer, 0, queryReadData.buffer, 0, this.pendingDispatchNumber * 2 * 8);
+    }
+
+    this.device.queue.submit([this.commandEncoder.finish()]);
+    this.gpuDataManager.refreshPendingBuffers();
+    this.commandEncoder = null;
+    this.pendingDispatchNumber = 0;
+
+    if (this.queryType !== QueryType.none) {
+      void queryReadData!.buffer.mapAsync(GPUMapMode.READ).then(() => {
+        // Copy the timestamps out of the mapped range so that the read-back buffer and its bookkeeping entry
+        // can be released right away, even if reporting below throws.
+        const mappedData = new BigUint64Array(queryReadData.buffer.getMappedRange().slice(0));
+        const pendingKernels = this.pendingQueries.get(queryReadData.id) ?? [];
+        queryReadData.buffer.unmap();
+        this.gpuDataManager.release(queryReadData.id);
+        this.pendingQueries.delete(queryReadData.id);
+
+        // never index past either the recorded kernels or the timestamps that were read back
+        const kernelCount = Math.min(pendingKernels.length, Math.floor(mappedData.length / 2));
+        for (let i = 0; i < kernelCount; i++) {
+          const pendingKernel = pendingKernels[i];
+          const kernelId = pendingKernel.id;
+          const kernelInfo = this.kernels.get(kernelId)!;
+          const kernelType = kernelInfo[0];
+          const kernelName = pendingKernel.name;
+          const inputTensorViews = pendingKernel.inputTensorViews;
+          const outputTensorViews = pendingKernel.outputTensorViews;
+          const startTimeU64 = mappedData[i * 2];
+          const endTimeU64 = mappedData[i * 2 + 1];
+
+          // the first start timestamp that is read back becomes the base all reported times are relative to
+          if (typeof this.queryTimeBase === 'undefined') {
+            this.queryTimeBase = startTimeU64;
+          }
+
+          const startTime = Number(startTimeU64 - this.queryTimeBase);
+          const endTime = Number(endTimeU64 - this.queryTimeBase);
+
+          if (!Number.isSafeInteger(startTime) || !Number.isSafeInteger(endTime)) {
+            throw new RangeError('incorrect timestamp range');
+          }
+
+          if (this.env.webgpu.profiling?.ondata) {
+            this.env.webgpu.profiling.ondata({
+              version: 1,
+              inputsMetadata: inputTensorViews.map(
+                  value => ({dims: value.dims, dataType: tensorDataTypeEnumToString(value.dataType)})),
+              outputsMetadata: outputTensorViews.map(
+                  value => ({dims: value.dims, dataType: tensorDataTypeEnumToString(value.dataType)})),
+              kernelId,
+              kernelType,
+              kernelName,
+              startTime,
+              endTime,
+            });
+          } else {
+            // if no callback is provided, print the profiling message to console
+            let inputShapes = '';
+            inputTensorViews.forEach((value, i) => {
+              inputShapes += `input[${i}]: [${value.dims}] | ${tensorDataTypeEnumToString(value.dataType)}, `;
+            });
+            let outputShapes = '';
+            outputTensorViews.forEach((value, i) => {
+              outputShapes += `output[${i}]: [${value.dims}] | ${tensorDataTypeEnumToString(value.dataType)}, `;
+            });
+            // eslint-disable-next-line no-console
+            console.log(`[profiling] kernel "${kernelId}|${kernelName}" ${inputShapes}${outputShapes}execution time: ${
+                endTime - startTime} ns`);
+          }
+        }
+      });
+    }
   }
 
   /**
@@ -514,5 +596,44 @@ export class WebGpuBackend {
       return createView(data.buffer, type);
     };
   }
+
+  /**
+   * Writes a timestamp into the query set from within the current compute pass.
+   *
+   * Only has an effect when the query type is `insidePasses`; otherwise it returns without doing anything.
+   *
+   * @param index - index of the query set entry to write the timestamp to.
+   */
+  writeTimeStamp(index: number): void {
+    if (this.queryType !== QueryType.insidePasses) {
+      return;
+    }
+
+    // cast to `any`: presumably `writeTimestamp` is missing from the WebGPU typings in use - TODO confirm
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any
+    (this.computePassEncoder as any).writeTimestamp(this.querySet, index);
+  }
+
+  /**
+   * Re-evaluates `queryType` from the profiling settings in `env` and the features of the device.
+   *
+   * Profiling is on when `env.webgpu.profiling.mode` is 'default', or when that setting is absent and
+   * `env.webgpu.profilingMode` is 'default'. With profiling on, `insidePasses` is chosen when the device has the
+   * 'chromium-experimental-timestamp-query-inside-passes' feature, otherwise `atPasses` when it has
+   * 'timestamp-query'; in every other case the query type is `none`.
+   */
+  setQueryType(): void {
+    this.queryType = QueryType.none;
+    // fall back to `env.webgpu.profilingMode` only when `profiling.mode` is not set, as `isQueryEnabled()` did
+    const profilingEnabled = this.env.webgpu.profiling?.mode === 'default' ||
+        (!this.env.webgpu.profiling?.mode && this.env.webgpu.profilingMode === 'default');
+    if (profilingEnabled) {
+      if (this.device.features.has('chromium-experimental-timestamp-query-inside-passes')) {
+        this.queryType = QueryType.insidePasses;
+      } else if (this.device.features.has('timestamp-query')) {
+        this.queryType = QueryType.atPasses;
+      }
+    }
+  }
   // #endregion
 }
diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts
@@ -10,7 +10,7 @@ import {WebGpuBackend} from './backend-webgpu';
 import {LOG_DEBUG} from './log';
 import {TensorView} from './tensor-view';
 import {ShapeUtil} from './util';
-import {ComputeContext, ComputeContextInputsOutputsMapping, ProgramInfo} from './webgpu/types';
+import {ComputeContext, ComputeContextInputsOutputsMapping, ProgramInfo, QueryType} from './webgpu/types';
 
 /* eslint-disable no-bitwise */
 
@@ -188,7 +188,8 @@ export const init = async(module: OrtWasmModule, env: Env, gpuAdapter: GPUAdapte
       // jsepCreateKernel
       (name: string, kernel: number, attribute: unknown) => backend.createKernel(
           name, kernel, attribute,
-          env.debug || backend.isQueryEnabled() ? module.UTF8ToString(module._JsepGetNodeName(kernel)) : `${kernel}`),
+          env.debug || backend.queryType !== QueryType.none ? module.UTF8ToString(module._JsepGetNodeName(kernel)) :
+                                                              `${kernel}`),
 
       // jsepReleaseKernel
       (kernel: number) => backend.releaseKernel(kernel),

diff --git a/js/web/lib/wasm/jsep/webgpu/program-manager.ts b/js/web/lib/wasm/jsep/webgpu/program-manager.ts
@@ -1,13 +1,12 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-import {tensorDataTypeEnumToString} from '../../wasm-common';
 import {WebGpuBackend} from '../backend-webgpu';
 import {LOG_DEBUG} from '../log';
 import {TensorView} from '../tensor-view';
 
 import {createShaderHelper} from './ops/common';
-import {Artifact, GpuData, ProgramInfo} from './types';
+import {Artifact, GpuData, PendingKernelInfo, ProgramInfo, QueryType} from './types';
 
 /**
  * ProgramManager is the main class behind running computations
@@ -36,8 +35,8 @@ export class ProgramManager {
       inputs: GpuData[], outputs: GpuData[], dispatchGroup: [number, number, number],
       uniformBufferBinding: GPUBindingResource|undefined): void {
     const device = this.backend.device;
-
     const computePassEncoder = this.backend.getComputePassEncoder();
+    this.backend.writeTimeStamp(this.backend.pendingDispatchNumber * 2);
     computePassEncoder.setPipeline(buildArtifact.computePipeline);
     const entries = [];
     for (const input of inputs) {
@@ -55,77 +54,29 @@ export class ProgramManager {
 
     computePassEncoder.dispatchWorkgroups(...dispatchGroup);
 
-    this.backend.pendingDispatchNumber++;
-
-    if (this.backend.isQueryEnabled()) {
-      if (typeof this.backend.queryData === 'undefined') {
-        this.backend.queryData = this.backend.gpuDataManager.create(
-            // eslint-disable-next-line no-bitwise
-            this.backend.querySetCount * 8, GPUBufferUsage.COPY_SRC | GPUBufferUsage.QUERY_RESOLVE);
-      }
-      const syncData = this.backend.gpuDataManager.create(
-          // eslint-disable-next-line no-bitwise
-          this.backend.querySetCount * 8, GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST);
-
-      this.backend.endComputePass();
-      this.backend.getCommandEncoder().resolveQuerySet(this.backend.querySet!, 0, 2, this.backend.queryData.buffer, 0);
-      this.backend.getCommandEncoder().copyBufferToBuffer(
-          this.backend.queryData.buffer, 0, syncData.buffer, 0, this.backend.querySetCount * 8);
-      this.backend.flush();
-
+    if (this.backend.queryType !== QueryType.none) {
       const kernelId = this.backend.currentKernelId!;
       const kernelInfo = this.backend.kernels.get(kernelId)!;
-
-      void syncData.buffer.mapAsync(GPUMapMode.READ).then(() => {
-        const mappedData = new BigUint64Array(syncData.buffer.getMappedRange());
-        const [startTimeU64, endTimeU64] = mappedData;
-        const [kernelType, kernelName] = kernelInfo;
-
-        syncData.buffer.unmap();
-
-        if (typeof this.backend.queryTimeBase === 'undefined') {
-          this.backend.queryTimeBase = startTimeU64;
-        }
-
-        const startTime = Number(startTimeU64 - this.backend.queryTimeBase);
-        const endTime = Number(endTimeU64 - this.backend.queryTimeBase);
-
-        if (!Number.isSafeInteger(startTime) || !Number.isSafeInteger(endTime)) {
-          throw new RangeError('incorrect timestamp range');
-        }
-
-        this.backend.gpuDataManager.release(syncData.id);
-        if (this.backend.env.webgpu.profiling?.ondata) {
-          this.backend.env.webgpu.profiling.ondata({
-            version: 1,
-            inputsMetadata: inputTensorViews.map(
-                value => ({dims: value.dims, dataType: tensorDataTypeEnumToString(value.dataType)})),
-            outputsMetadata: outputTensorViews.map(
-                value => ({dims: value.dims, dataType: tensorDataTypeEnumToString(value.dataType)})),
-            kernelId,
-            kernelType,
-            kernelName,
-            startTime,
-            endTime,
-          });
-        } else {
-          // if no callback is provided, print the profiling message to console
-          let inputShapes = '';
-          inputTensorViews.forEach((value, i) => {
-            inputShapes += `input[${i}]: [${value.dims}] | ${tensorDataTypeEnumToString(value.dataType)}, `;
-          });
-          let outputShapes = '';
-          outputTensorViews.forEach((value, i) => {
-            outputShapes += `output[${i}]: [${value.dims}] | ${tensorDataTypeEnumToString(value.dataType)}, `;
-          });
-          // eslint-disable-next-line no-console
-          console.log(`[profiling] kernel "${kernelId}|${kernelName}|${buildArtifact.programInfo.name}" ${inputShapes}${
-              outputShapes}execution time: ${endTime - startTime} ns`);
-        }
-      });
+      let kernelName = kernelInfo[0];
+      if (buildArtifact.programInfo.name !== kernelName) {
+        kernelName = `${kernelName}/${buildArtifact.programInfo.name}`;
+      }
+      const pendingKernelInfo: PendingKernelInfo = {
+        id: kernelId,
+        name: kernelName,
+        inputTensorViews,
+        outputTensorViews,
+      };
+      this.backend.pendingKernels.push(pendingKernelInfo);
+      this.backend.writeTimeStamp(this.backend.pendingDispatchNumber * 2 + 1);
     }
 
-    if (this.backend.pendingDispatchNumber >= 16) {
+    this.backend.pendingDispatchNumber++;
+    if (this.backend.pendingDispatchNumber >= this.backend.maxDispatchNumber ||
+        this.backend.queryType === QueryType.atPasses) {
+      this.backend.endComputePass();
+    }
+    if (this.backend.pendingDispatchNumber >= this.backend.maxDispatchNumber) {
       this.backend.flush();
     }
   }

diff --git a/js/web/lib/wasm/jsep/webgpu/types.ts b/js/web/lib/wasm/jsep/webgpu/types.ts
@@ -12,6 +12,18 @@ export enum GpuDataType {
 }
 export type GpuDataId = number;
 
+/**
+ * How GPU timestamp queries are collected for profiling.
+ */
+export enum QueryType {
+  /** timestamp queries are not used */
+  none,
+  /** timestamps are written from inside a compute pass, before and after each dispatch */
+  insidePasses,
+  /** timestamps are written at the beginning and at the end of a compute pass */
+  atPasses,
+}
+
 export interface GpuData {
   type: GpuDataType;
   id: GpuDataId;
@@ -23,12 +29,25 @@ export interface TensorInfo {
   dataType: number;
 }
 
-
 export interface ProgramUniform {
   type: 'int32'|'float32'|'uint32';
   data: number|readonly number[];
 }
 
+/**
+ * Describes one dispatched kernel whose timestamps have not been read back yet.
+ */
+export interface PendingKernelInfo {
+  /** ID of the kernel that issued the dispatch */
+  id: number;
+  /** name reported for the kernel in the profiling output */
+  name: string;
+  /** tensor views of the inputs; their dims and data types are reported as profiling metadata */
+  inputTensorViews: readonly TensorView[];
+  /** tensor views of the outputs; their dims and data types are reported as profiling metadata */
+  outputTensorViews: readonly TensorView[];
+}
+
 /**
  * Represent the dependency of a program on a specific input tensor.
  *