From baf5d065ad2324199e7cdf1732128a1483ebf1c1 Mon Sep 17 00:00:00 2001 From: HDCharles Date: Wed, 7 Feb 2024 19:25:02 -0800 Subject: [PATCH] [not for land] other torchbench torchao testing stuff Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: [ghstack-poisoned] --- log.log | 34676 +--------------- log2.log | 199 + log_acc.log | 29420 +++++++++++++ torchao_benchmarks.sh | 37 +- userbenchmark/dynamo/dynamobench/common.py | 231 +- .../dynamo/dynamobench/torchbench.py | 9 +- 6 files changed, 29991 insertions(+), 34581 deletions(-) create mode 100644 log2.log create mode 100644 log_acc.log diff --git a/log.log b/log.log index 39ac878584..700cc49d56 100644 --- a/log.log +++ b/log.log @@ -1,8732 +1,118 @@ -start dynamic - loading model: 0it [00:00, ?it/s] loading model: 0it [00:00, ?it/s] -torchrec_dlrm -/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/fbgemm_gpu_py.so: undefined symbol: _ZNK5torch8autograd4Node4nameEv -Traceback (most recent call last): - File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 36, in - run() - File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run - benchmark.run(bm_args) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run - main(TorchBenchmarkRunner(), original_dir, args) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main - process_entry(0, runner, original_dir, args) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry - return maybe_fresh_cache( - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner - return fn(*args, **kwargs) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run - ) = runner.load_model( - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 383, in load_model - module = importlib.import_module(c) - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/importlib/__init__.py", line 126, in import_module - return _bootstrap._gcd_import(name[level:], package, level) - File "", line 1050, in _gcd_import - File "", line 1027, in _find_and_load - File "", line 1006, in _find_and_load_unlocked - File "", line 688, in _load_unlocked - File "", line 883, in exec_module - File "", line 241, in _call_with_frames_removed - File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/__init__.py", line 7, in - from .data.dlrm_dataloader import get_dataloader - File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/data/dlrm_dataloader.py", line 13, in - from torchrec.datasets.criteo import ( - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/__init__.py", line 8, in - import torchrec.distributed # noqa - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/__init__.py", line 36, in - from torchrec.distributed.model_parallel import DistributedModelParallel # noqa - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 21, in - from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/__init__.py", line 22, in - from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/planners.py", line 19, in - from torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/constants.py", line 10, in - from torchrec.distributed.embedding_types import EmbeddingComputeKernel - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/embedding_types.py", line 14, in - from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 23, in - from . import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py", line 19, in - torch.ops.fbgemm.jagged_2d_to_dense, - File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 822, in __getattr__ - raise AttributeError( -AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense' -Run failed with return code: 1 -Output: None -Error: None - loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] BERT_pytorch -cuda eval BERT_pytorch int8dynamic - running benchmark: 0%| | 0/30 [00:00= MINSIZE && prev_inuse (old_top) && ((unsigned long) old_end & (pagesize - 1)) == 0) -Run failed with return code: -6 -Output: None -Error: None - loading model: 0it [00:00, ?it/s]WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead - loading model: 0it [00:06, ?it/s] -WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead -doctr_reco_predictor -cuda eval doctr_reco_predictor int8dynamic -WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead - running benchmark: 0%| | 0/30 [00:00 /home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/diffusers/models/attention_processor.py(1236)__call__() --> hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) -(Pdb) TIMEOUT - loading model: 0it [00:00, ?it/s] loading model: 0it [00:05, ?it/s] -timm_efficientdet -cuda eval timm_efficientdet int8dynamic - running benchmark: 0%| | 0/30 [00:00 - run() - File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run - benchmark.run(bm_args) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run - main(TorchBenchmarkRunner(), original_dir, args) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main - process_entry(0, runner, original_dir, args) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry - return maybe_fresh_cache( - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner - return fn(*args, **kwargs) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run - ) = runner.load_model( - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 383, in load_model - module = importlib.import_module(c) - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/importlib/__init__.py", line 126, in import_module - return _bootstrap._gcd_import(name[level:], package, level) - File "", line 1050, in _gcd_import - File "", line 1027, in _find_and_load - File "", line 1006, in _find_and_load_unlocked - File "", line 688, in _load_unlocked - File "", line 883, in exec_module - File "", line 241, in _call_with_frames_removed - File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/__init__.py", line 7, in - from .data.dlrm_dataloader import get_dataloader - File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/data/dlrm_dataloader.py", line 13, in - from torchrec.datasets.criteo import ( - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/__init__.py", line 8, in - import torchrec.distributed # noqa - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/__init__.py", line 36, in - from torchrec.distributed.model_parallel import DistributedModelParallel # noqa - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 21, in - from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/__init__.py", line 22, in - from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/planners.py", line 19, in - from torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/constants.py", line 10, in - from torchrec.distributed.embedding_types import EmbeddingComputeKernel - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/embedding_types.py", line 14, in - from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 23, in - from . import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py", line 19, in - torch.ops.fbgemm.jagged_2d_to_dense, - File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 822, in __getattr__ - raise AttributeError( -AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense' -Run failed with return code: 1 -Output: None -Error: None - loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] -BERT_pytorch -cuda eval BERT_pytorch int8weightonly - running benchmark: 0%| | 0/30 [00:00 - run() - File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run - benchmark.run(bm_args) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run - main(TorchBenchmarkRunner(), original_dir, args) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main - process_entry(0, runner, original_dir, args) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry - return maybe_fresh_cache( - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner - return fn(*args, **kwargs) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run - ) = runner.load_model( - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 383, in load_model - module = importlib.import_module(c) - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/importlib/__init__.py", line 126, in import_module - return _bootstrap._gcd_import(name[level:], package, level) - File "", line 1050, in _gcd_import - File "", line 1027, in _find_and_load - File "", line 1006, in _find_and_load_unlocked - File "", line 688, in _load_unlocked - File "", line 883, in exec_module - File "", line 241, in _call_with_frames_removed - File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/__init__.py", line 7, in - from .data.dlrm_dataloader import get_dataloader - File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/data/dlrm_dataloader.py", line 13, in - from torchrec.datasets.criteo import ( - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/__init__.py", line 8, in - import torchrec.distributed # noqa - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/__init__.py", line 36, in - from torchrec.distributed.model_parallel import DistributedModelParallel # noqa - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 21, in - from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/__init__.py", line 22, in - from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/planners.py", line 19, in - from torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/constants.py", line 10, in - from torchrec.distributed.embedding_types import EmbeddingComputeKernel - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/embedding_types.py", line 14, in - from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 23, in - from . import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py", line 19, in - torch.ops.fbgemm.jagged_2d_to_dense, - File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 822, in __getattr__ - raise AttributeError( -AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense' -Run failed with return code: 1 -Output: None -Error: None - loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] -BERT_pytorch -cuda eval BERT_pytorch int4weightonly - running benchmark: 0%| | 0/30 [00:00 - run() - File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run - benchmark.run(bm_args) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run - main(TorchBenchmarkRunner(), original_dir, args) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main - process_entry(0, runner, original_dir, args) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry - return maybe_fresh_cache( - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner - return fn(*args, **kwargs) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run - ) = runner.load_model( - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 383, in load_model - module = importlib.import_module(c) - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/importlib/__init__.py", line 126, in import_module - return _bootstrap._gcd_import(name[level:], package, level) - File "", line 1050, in _gcd_import - File "", line 1027, in _find_and_load - File "", line 1006, in _find_and_load_unlocked - File "", line 688, in _load_unlocked - File "", line 883, in exec_module - File "", line 241, in _call_with_frames_removed - File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/__init__.py", line 7, in - from .data.dlrm_dataloader import get_dataloader - File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/data/dlrm_dataloader.py", line 13, in - from torchrec.datasets.criteo import ( - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/__init__.py", line 8, in - import torchrec.distributed # noqa - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/__init__.py", line 36, in - from torchrec.distributed.model_parallel import DistributedModelParallel # noqa - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 21, in - from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/__init__.py", line 22, in - from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/planners.py", line 19, in - from torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/constants.py", line 10, in - from torchrec.distributed.embedding_types import EmbeddingComputeKernel - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/embedding_types.py", line 14, in - from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 23, in - from . import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py", line 19, in - torch.ops.fbgemm.jagged_2d_to_dense, - File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 822, in __getattr__ - raise AttributeError( -AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense' -Run failed with return code: 1 -Output: None -Error: None - loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] -BERT_pytorch -cuda eval BERT_pytorch baseline - running benchmark: 0%| | 0/30 [00:00 will be ignored -[rank0]:[2023-12-12 03:31:06,215] [1/0_1] torch._dynamo.backends.distributed: [WARNING] Some buckets were extended beyond their requested parameter capacities in order to ensure each subgraph has an output node, required for fx graph partitioning. This can be the case when a subgraph would have only contained nodes performing inplace mutation, and returning no logical outputs. This should not be a problem, unless it results in too few graph partitions for optimal DDP performance. -[rank0]:[2023-12-12 03:31:06,238] [1/0_1] torch._dynamo.backends.distributed: [WARNING] DDPOptimizer extended these buckets to ensure per-subgraph output nodes: -[rank0]:[2023-12-12 03:31:06,238] [1/0_1] torch._dynamo.backends.distributed: [WARNING] ┌─────────┬─────────────┬────────────────────────┐ -[rank0]:[2023-12-12 03:31:06,238] [1/0_1] torch._dynamo.backends.distributed: [WARNING] │ Index │ Extra Ops │ Extra Param Size (b) │ -[rank0]:[2023-12-12 03:31:06,238] [1/0_1] torch._dynamo.backends.distributed: [WARNING] ├─────────┼─────────────┼────────────────────────┤ -[rank0]:[2023-12-12 03:31:06,238] [1/0_1] torch._dynamo.backends.distributed: [WARNING] │ 0 │ 157 │ 44910720 │ -[rank0]:[2023-12-12 03:31:06,238] [1/0_1] torch._dynamo.backends.distributed: [WARNING] └─────────┴─────────────┴────────────────────────┘ -skipping cudagraphs due to ['mutated inputs'] -[rank0]:[2023-12-12 03:31:29,846] [5/0_1] torch._inductor.utils: [WARNING] DeviceCopy in input program -skipping cudagraphs due to ['non-cuda device in graph'] -[rank0]:[W CUDAGraph.cpp:145] Warning: Waiting for pending NCCL work to finish before starting graph capture. (function operator()) - running benchmark: 0%| | 0/30 [00:00 - run() - File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run - benchmark.run(bm_args) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run - main(TorchBenchmarkRunner(), original_dir, args) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main - process_entry(0, runner, original_dir, args) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry - return maybe_fresh_cache( - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner - return fn(*args, **kwargs) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run - ) = runner.load_model( - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 383, in load_model - module = importlib.import_module(c) - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/importlib/__init__.py", line 126, in import_module - return _bootstrap._gcd_import(name[level:], package, level) - File "", line 1050, in _gcd_import - File "", line 1027, in _find_and_load - File "", line 1006, in _find_and_load_unlocked - File "", line 688, in _load_unlocked - File "", line 883, in exec_module - File "", line 241, in _call_with_frames_removed - File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/__init__.py", line 7, in - from .data.dlrm_dataloader import get_dataloader - File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/data/dlrm_dataloader.py", line 13, in - from torchrec.datasets.criteo import ( - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/__init__.py", line 8, in - import torchrec.distributed # noqa - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/__init__.py", line 36, in - from torchrec.distributed.model_parallel import DistributedModelParallel # noqa - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 21, in - from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/__init__.py", line 22, in - from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/planners.py", line 19, in - from torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/constants.py", line 10, in - from torchrec.distributed.embedding_types import EmbeddingComputeKernel - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/embedding_types.py", line 14, in - from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 23, in - from . import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py", line 19, in - torch.ops.fbgemm.jagged_2d_to_dense, - File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 822, in __getattr__ - raise AttributeError( -AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense' -Run failed with return code: 1 -Output: None -Error: None - loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] -BERT_pytorch -cuda eval BERT_pytorch int8weightonly-bs1 - running benchmark: 0%| | 0/30 [00:00= MINSIZE && prev_inuse (old_top) && ((unsigned long) old_end & (pagesize - 1)) == 0) -Run failed with return code: -6 -Output: None -Error: None - loading model: 0it [00:00, ?it/s]WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead - loading model: 0it [00:05, ?it/s] -WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead -doctr_reco_predictor -cuda eval doctr_reco_predictor int8weightonly-bs1 -WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead - running benchmark: 0%| | 0/30 [00:00 - run() - File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run - benchmark.run(bm_args) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run - main(TorchBenchmarkRunner(), original_dir, args) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main - process_entry(0, runner, original_dir, args) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry - return maybe_fresh_cache( - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner - return fn(*args, **kwargs) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run - ) = runner.load_model( - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 383, in load_model - module = importlib.import_module(c) - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/importlib/__init__.py", line 126, in import_module - return _bootstrap._gcd_import(name[level:], package, level) - File "", line 1050, in _gcd_import - File "", line 1027, in _find_and_load - File "", line 1006, in _find_and_load_unlocked - File "", line 688, in _load_unlocked - File "", line 883, in exec_module - File "", line 241, in _call_with_frames_removed - File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/__init__.py", line 7, in - from .data.dlrm_dataloader import get_dataloader - File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/data/dlrm_dataloader.py", line 13, in - from torchrec.datasets.criteo import ( - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/__init__.py", line 8, in - import torchrec.distributed # noqa - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/__init__.py", line 36, in - from torchrec.distributed.model_parallel import DistributedModelParallel # noqa - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 21, in - from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/__init__.py", line 22, in - from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/planners.py", line 19, in - from torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/constants.py", line 10, in - from torchrec.distributed.embedding_types import EmbeddingComputeKernel - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/embedding_types.py", line 14, in - from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 23, in - from . import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py", line 19, in - torch.ops.fbgemm.jagged_2d_to_dense, - File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 822, in __getattr__ - raise AttributeError( -AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense' -Run failed with return code: 1 -Output: None -Error: None - loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] -BERT_pytorch -cuda eval BERT_pytorch int4weightonly-bs1 - running benchmark: 0%| | 0/30 [00:00 - run() - File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run - benchmark.run(bm_args) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run - main(TorchBenchmarkRunner(), original_dir, args) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main - process_entry(0, runner, original_dir, args) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry - return maybe_fresh_cache( - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner - return fn(*args, **kwargs) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run - ) = runner.load_model( - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 383, in load_model - module = importlib.import_module(c) - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/importlib/__init__.py", line 126, in import_module - return _bootstrap._gcd_import(name[level:], package, level) - File "", line 1050, in _gcd_import - File "", line 1027, in _find_and_load - File "", line 1006, in _find_and_load_unlocked - File "", line 688, in _load_unlocked - File "", line 883, in exec_module - File "", line 241, in _call_with_frames_removed - File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/__init__.py", line 7, in - from .data.dlrm_dataloader import get_dataloader - File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/data/dlrm_dataloader.py", line 13, in - from torchrec.datasets.criteo import ( - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/__init__.py", line 8, in - import torchrec.distributed # noqa - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/__init__.py", line 36, in - from torchrec.distributed.model_parallel import DistributedModelParallel # noqa - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 21, in - from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/__init__.py", line 22, in - from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/planners.py", line 19, in - from torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/constants.py", line 10, in - from torchrec.distributed.embedding_types import EmbeddingComputeKernel - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/embedding_types.py", line 14, in - from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 23, in - from . import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py", line 19, in - torch.ops.fbgemm.jagged_2d_to_dense, - File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 822, in __getattr__ - raise AttributeError( -AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense' -Run failed with return code: 1 -Output: None -Error: None - loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] BERT_pytorch -cuda eval BERT_pytorch baseline-bs1 -AUTOTUNE addmm(128x768, 128x768, 768x768) - bias_addmm 0.0127 ms 100.0% - triton_mm_5 0.0131 ms 97.5% - triton_mm_9 0.0131 ms 97.3% - triton_mm_6 0.0135 ms 94.4% - triton_mm_8 0.0147 ms 86.7% - addmm 0.0165 ms 77.0% - triton_mm_3 0.0170 ms 75.0% - triton_mm_4 0.0171 ms 74.3% - triton_mm_2 0.0205 ms 62.1% - triton_mm_1 0.0205 ms 62.0% -SingleProcess AUTOTUNE takes 5.5990 seconds -AUTOTUNE mm(128x768, 768x768) - mm 0.0119 ms 100.0% - triton_mm_65 0.0125 ms 94.6% - triton_mm_66 0.0131 ms 90.7% - triton_mm_69 0.0133 ms 89.0% - triton_mm_68 0.0136 ms 87.5% - triton_mm_64 0.0158 ms 74.9% - triton_mm_63 0.0162 ms 73.4% - triton_mm_62 0.0192 ms 61.8% - triton_mm_61 0.0197 ms 60.1% - triton_mm_60 0.0279 ms 42.5% -SingleProcess AUTOTUNE takes 5.1503 seconds -AUTOTUNE mm(128x768, 768x3072) - mm 0.0145 ms 100.0% - triton_mm_80 0.0150 ms 96.8% - triton_mm_76 0.0168 ms 86.6% - triton_mm_78 0.0170 ms 85.5% - triton_mm_75 0.0173 ms 84.1% - triton_mm_77 0.0178 ms 81.7% - triton_mm_81 0.0187 ms 77.6% - triton_mm_74 0.0202 ms 72.1% - triton_mm_73 0.0203 ms 71.7% - triton_mm_72 0.0300 ms 48.5% -SingleProcess AUTOTUNE takes 4.9015 seconds -AUTOTUNE mm(128x3072, 3072x768) - mm 0.0179 ms 100.0% - triton_mm_90 0.0298 ms 59.9% - triton_mm_89 0.0300 ms 59.4% - triton_mm_93 0.0310 ms 57.6% - triton_mm_92 0.0343 ms 52.1% - triton_mm_88 0.0411 ms 43.5% - triton_mm_87 0.0415 ms 43.0% - triton_mm_86 0.0557 ms 32.1% - triton_mm_85 0.0557 ms 32.0% - triton_mm_84 0.0747 ms 23.9% -SingleProcess AUTOTUNE takes 4.8119 seconds - running benchmark: 0%| | 0/30 [00:00= MINSIZE && prev_inuse (old_top) && ((unsigned long) old_end & (pagesize - 1)) == 0) -Run failed with return code: -6 -Output: None -Error: None - loading model: 0it [00:00, ?it/s]WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead - loading model: 0it [00:05, ?it/s] -WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead -doctr_reco_predictor -cuda eval doctr_reco_predictor baseline-bs1 -WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead - running benchmark: 0%| | 0/30 [00:00 will be ignored -[rank0]:[2023-12-12 09:46:53,460] [1/0_1] torch._dynamo.backends.distributed: [WARNING] Some buckets were extended beyond their requested parameter capacities in order to ensure each subgraph has an output node, required for fx graph partitioning. This can be the case when a subgraph would have only contained nodes performing inplace mutation, and returning no logical outputs. This should not be a problem, unless it results in too few graph partitions for optimal DDP performance. -[rank0]:[2023-12-12 09:46:53,486] [1/0_1] torch._dynamo.backends.distributed: [WARNING] DDPOptimizer extended these buckets to ensure per-subgraph output nodes: -[rank0]:[2023-12-12 09:46:53,486] [1/0_1] torch._dynamo.backends.distributed: [WARNING] ┌─────────┬─────────────┬────────────────────────┐ -[rank0]:[2023-12-12 09:46:53,486] [1/0_1] torch._dynamo.backends.distributed: [WARNING] │ Index │ Extra Ops │ Extra Param Size (b) │ -[rank0]:[2023-12-12 09:46:53,486] [1/0_1] torch._dynamo.backends.distributed: [WARNING] ├─────────┼─────────────┼────────────────────────┤ -[rank0]:[2023-12-12 09:46:53,486] [1/0_1] torch._dynamo.backends.distributed: [WARNING] │ 0 │ 157 │ 44910720 │ -[rank0]:[2023-12-12 09:46:53,486] [1/0_1] torch._dynamo.backends.distributed: [WARNING] └─────────┴─────────────┴────────────────────────┘ -AUTOTUNE addmm(1x128, 1x2048, 2048x128) - bias_addmm 0.0112 ms 100.0% - addmm 0.0112 ms 100.0% - triton_mm_540 0.0187 ms 60.1% - triton_mm_541 0.0198 ms 56.8% - triton_mm_543 0.0201 ms 55.9% - triton_mm_544 0.0210 ms 53.5% - triton_mm_539 0.0217 ms 51.7% - triton_mm_538 0.0241 ms 46.7% - triton_mm_537 0.0308 ms 36.4% - triton_mm_536 0.0331 ms 34.0% -SingleProcess AUTOTUNE takes 4.3309 seconds -skipping cudagraphs due to ['mutated inputs'] -[rank0]:[2023-12-12 09:47:26,228] [5/0_1] torch._inductor.utils: [WARNING] DeviceCopy in input program -skipping cudagraphs due to ['non-cuda device in graph'] -AUTOTUNE mm(1x2048, 2048x128) - mm 0.0083 ms 100.0% - triton_mm_1087 0.0177 ms 46.8% - triton_mm_1088 0.0188 ms 44.0% - triton_mm_1090 0.0196 ms 42.2% - triton_mm_1091 0.0205 ms 40.3% - triton_mm_1086 0.0207 ms 40.0% - triton_mm_1085 0.0236 ms 35.0% - triton_mm_1084 0.0308 ms 26.9% - triton_mm_1083 0.0326 ms 25.4% - triton_mm_1082 0.0543 ms 15.3% -SingleProcess AUTOTUNE takes 4.4818 seconds -AUTOTUNE bmm(1x1x128, 1x128x1) - triton_bmm_1096 0.0061 ms 100.0% - triton_bmm_1098 0.0061 ms 100.0% - bmm 0.0065 ms 93.1% - triton_bmm_1097 0.0066 ms 91.8% - triton_bmm_1099 0.0066 ms 91.8% - triton_bmm_1095 0.0070 ms 86.4% - triton_bmm_1094 0.0074 ms 82.3% - triton_bmm_1100 0.0081 ms 75.4% - triton_bmm_1101 0.0086 ms 70.6% -SingleProcess AUTOTUNE takes 2.5842 seconds -AUTOTUNE bmm(1x1x128, 1x128x32000) - triton_bmm_1102 0.0140 ms 100.0% - triton_bmm_1104 0.0145 ms 96.6% - triton_bmm_1103 0.0150 ms 93.2% - triton_bmm_1106 0.0152 ms 91.8% - triton_bmm_1108 0.0154 ms 90.6% - triton_bmm_1105 0.0157 ms 89.2% - triton_bmm_1111 0.0158 ms 88.6% - triton_bmm_1112 0.0158 ms 88.6% - triton_bmm_1113 0.0162 ms 86.4% - triton_bmm_1109 0.0181 ms 77.1% -SingleProcess AUTOTUNE takes 3.8806 seconds -[rank0]:[W CUDAGraph.cpp:145] Warning: Waiting for pending NCCL work to finish before starting graph capture. (function operator()) - running benchmark: 0%| | 0/30 [00:00 - run() - File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run - benchmark.run(bm_args) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run - main(TorchBenchmarkRunner(), original_dir, args) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main - process_entry(0, runner, original_dir, args) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry - return maybe_fresh_cache( - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner - return fn(*args, **kwargs) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run - ) = runner.load_model( - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 383, in load_model - module = importlib.import_module(c) - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/importlib/__init__.py", line 126, in import_module - return _bootstrap._gcd_import(name[level:], package, level) - File "", line 1050, in _gcd_import - File "", line 1027, in _find_and_load - File "", line 1006, in _find_and_load_unlocked - File "", line 688, in _load_unlocked - File "", line 883, in exec_module - File "", line 241, in _call_with_frames_removed - File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/__init__.py", line 7, in - from .data.dlrm_dataloader import get_dataloader - File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/data/dlrm_dataloader.py", line 13, in - from torchrec.datasets.criteo import ( - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/__init__.py", line 8, in - import torchrec.distributed # noqa - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/__init__.py", line 36, in - from torchrec.distributed.model_parallel import DistributedModelParallel # noqa - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 21, in - from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/__init__.py", line 22, in - from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/planners.py", line 19, in - from torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/constants.py", line 10, in - from torchrec.distributed.embedding_types import EmbeddingComputeKernel - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/embedding_types.py", line 14, in - from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 23, in - from . import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py", line 19, in - torch.ops.fbgemm.jagged_2d_to_dense, - File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 822, in __getattr__ - raise AttributeError( -AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense' -Run failed with return code: 1 -Output: None -Error: None - loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] -BERT_pytorch -cuda eval BERT_pytorch int8dynamic-bs32 -AUTOTUNE bmm(384x128x64, 384x64x128) - triton_bmm_30 0.0256 ms 100.0% - triton_bmm_23 0.0266 ms 96.0% - triton_bmm_24 0.0267 ms 95.9% - triton_bmm_25 0.0274 ms 93.3% - triton_bmm_26 0.0275 ms 93.1% - triton_bmm_32 0.0281 ms 91.1% - triton_bmm_22 0.0282 ms 90.6% - bmm 0.0292 ms 87.5% - triton_bmm_29 0.0296 ms 86.4% - triton_bmm_31 0.0316 ms 80.9% -SingleProcess AUTOTUNE takes 1.8196 seconds -AUTOTUNE bmm(384x128x128, 384x128x64) - triton_bmm_47 0.0282 ms 100.0% - triton_bmm_46 0.0296 ms 95.3% - triton_bmm_53 0.0299 ms 94.2% - triton_bmm_45 0.0300 ms 93.9% - triton_bmm_49 0.0301 ms 93.6% - triton_bmm_52 0.0301 ms 93.6% - triton_bmm_48 0.0313 ms 90.1% - triton_bmm_51 0.0322 ms 87.5% - triton_bmm_55 0.0325 ms 86.7% - triton_bmm_50 0.0334 ms 84.4% -SingleProcess AUTOTUNE takes 1.8928 seconds - running benchmark: 0%| | 0/30 [00:00 - async_compile.wait(globals()) - File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 2470, in wait - scope[key] = result.result() - File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 2313, in result - self.future.result() - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 458, in result - return self.__get_result() - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result - raise self._exception -torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised: -CompilationError: at 14:40: xnumel = 196 - yoffset = tl.program_id(1).to(tl.int64) * YBLOCK - yindex = yoffset + tl.arange(0, YBLOCK)[None, :].to(tl.int64) - ymask = yindex < ynumel - xoffset = tl.program_id(0).to(tl.int64) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:, None].to(tl.int64) - xmask = xindex < xnumel - x2 = xindex - y3 = yindex - y0 = yindex % 1024 - y1 = (yindex // 1024) - tmp0 = tl.load(in_ptr0 + (x2 + (196*y3)), xmask, eviction_policy='evict_last').to(tl.float32) - ^ -ValueError('numel (262144) exceeds triton maximum tensor numel (131072)') - -Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information - - -You can suppress this exception and fall back to eager by setting: - import torch._dynamo - torch._dynamo.config.suppress_errors = True - -Run failed with return code: 255 -Output: None -Error: None - loading model: 0it [00:00, ?it/s]detectron2_fasterrcnn_r_101_dc5 - loading model: 0it [00:08, ?it/s] -WARNING:root:detectron2_fasterrcnn_r_101_dc5 failed to load -Original Error: "roi_align_forward_kernel" not implemented for 'BFloat16' -Eager model failed to run -Traceback (most recent call last): - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model - self.model_iter_fn(model, example_inputs) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass - return mod(*inputs) - File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl - return self._call_impl(*args, **kwargs) - File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl - return forward_call(*args, **kwargs) - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward - return self.inference(batched_inputs) - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference - results, _ = self.roi_heads(images, features, proposals, None) - File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl - return self._call_impl(*args, **kwargs) - File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl - return forward_call(*args, **kwargs) - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 747, in forward - pred_instances = self._forward_box(features, proposals) - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 798, in _forward_box - box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) - File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl - return self._call_impl(*args, **kwargs) - File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl - return forward_call(*args, **kwargs) - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 246, in forward - return self.level_poolers[0](x[0], pooler_fmt_boxes) - File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl - return self._call_impl(*args, **kwargs) - File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl - return forward_call(*args, **kwargs) - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward - return roi_align( - File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 238, in roi_align - return torch.ops.torchvision.roi_align( - File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 755, in __call__ - return self._op(*args, **(kwargs or {})) -RuntimeError: "roi_align_forward_kernel" not implemented for 'BFloat16' - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run - ) = runner.load_model( - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model - self.validate_model(model, example_inputs) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model - raise NotImplementedError("Eager model failed to run") from e -NotImplementedError: Eager model failed to run - - loading model: 0it [00:00, ?it/s]detectron2_fasterrcnn_r_101_fpn -WARNING:common:Model detectron2_fasterrcnn_r_101_fpn does not support bfloat16, running with amp instead - loading model: 0it [00:07, ?it/s] -WARNING:common:Model detectron2_fasterrcnn_r_101_fpn does not support bfloat16, running with amp instead -cuda eval detectron2_fasterrcnn_r_101_fpn int8dynamic-bs32 -WARNING:common:Model detectron2_fasterrcnn_r_101_fpn does not support bfloat16, running with amp instead -skipping cudagraphs due to ['mutated inputs'] -AUTOTUNE convolution(32x3x1216x1344, 64x3x7x7) - convolution 3.7106 ms 100.0% - triton_convolution_3 22.1062 ms 16.8% - triton_convolution_4 24.2973 ms 15.3% - triton_convolution_5 26.9378 ms 13.8% - triton_convolution_0 30.0738 ms 12.3% - triton_convolution_2 32.5176 ms 11.4% - triton_convolution_1 81.4506 ms 4.6% -SingleProcess AUTOTUNE takes 5.0160 seconds -AUTOTUNE mm(3268608x64, 64x64) - triton_mm_14 0.5186 ms 100.0% - triton_mm_8 0.5193 ms 99.9% - triton_mm_7 0.5283 ms 98.2% - triton_mm_10 0.5368 ms 96.6% - triton_mm_6 0.5460 ms 95.0% - triton_mm_13 0.5492 ms 94.4% - triton_mm_9 0.5513 ms 94.1% - mm 0.5668 ms 91.5% - triton_mm_15 0.6987 ms 74.2% - triton_mm_16 0.7410 ms 70.0% -SingleProcess AUTOTUNE takes 4.1927 seconds -AUTOTUNE convolution(32x64x304x336, 64x64x3x3) - convolution 1.4962 ms 100.0% - triton_convolution_18 7.1458 ms 20.9% - triton_convolution_23 8.0892 ms 18.5% - triton_convolution_24 9.7211 ms 15.4% - triton_convolution_19 11.9942 ms 12.5% - triton_convolution_21 12.1926 ms 12.3% - triton_convolution_22 12.5636 ms 11.9% - triton_convolution_20 28.5195 ms 5.2% -SingleProcess AUTOTUNE takes 4.7321 seconds -AUTOTUNE mm(3268608x64, 64x256) - triton_mm_27 1.5259 ms 100.0% - triton_mm_26 1.5455 ms 98.7% - triton_mm_28 1.7398 ms 87.7% - mm 1.7660 ms 86.4% - triton_mm_29 1.7692 ms 86.2% - triton_mm_33 1.8433 ms 82.8% - triton_mm_25 1.8774 ms 81.3% - triton_mm_32 1.9772 ms 77.2% - triton_mm_35 2.4140 ms 63.2% - triton_mm_34 2.8250 ms 54.0% -SingleProcess AUTOTUNE takes 4.9396 seconds -AUTOTUNE mm(3268608x256, 256x64) - triton_mm_51 1.3316 ms 100.0% - triton_mm_53 1.3624 ms 97.7% - triton_mm_56 1.3661 ms 97.5% - mm 1.4040 ms 94.8% - triton_mm_57 1.4285 ms 93.2% - triton_mm_50 1.4361 ms 92.7% - triton_mm_49 1.4596 ms 91.2% - triton_mm_52 1.4744 ms 90.3% - triton_mm_54 2.1015 ms 63.4% - triton_mm_55 2.2606 ms 58.9% -SingleProcess AUTOTUNE takes 4.7929 seconds -AUTOTUNE convolution(32x256x304x336, 128x256x1x1) - convolution 0.4567 ms 100.0% - triton_convolution_114 1.0757 ms 42.5% - triton_convolution_111 1.2215 ms 37.4% - triton_convolution_117 1.3653 ms 33.5% - triton_convolution_116 1.5024 ms 30.4% - triton_convolution_115 1.6693 ms 27.4% - triton_convolution_112 3.1440 ms 14.5% - triton_convolution_113 6.8288 ms 6.7% -SingleProcess AUTOTUNE takes 4.5671 seconds -AUTOTUNE convolution(32x128x152x168, 128x128x3x3) - convolution 1.1805 ms 100.0% - triton_convolution_121 7.0887 ms 16.7% - triton_convolution_118 8.0229 ms 14.7% - triton_convolution_123 8.3772 ms 14.1% - triton_convolution_124 10.6232 ms 11.1% - triton_convolution_122 12.4481 ms 9.5% - triton_convolution_119 13.7375 ms 8.6% - triton_convolution_120 28.6058 ms 4.1% -SingleProcess AUTOTUNE takes 4.7865 seconds -AUTOTUNE mm(817152x128, 128x512) - triton_mm_127 0.9996 ms 100.0% - triton_mm_126 1.0023 ms 99.7% - triton_mm_132 1.0768 ms 92.8% - triton_mm_125 1.1161 ms 89.6% - mm 1.1354 ms 88.0% - triton_mm_128 1.1937 ms 83.7% - triton_mm_129 1.2079 ms 82.8% - triton_mm_133 1.3932 ms 71.8% - triton_mm_135 1.6393 ms 61.0% - triton_mm_134 2.2798 ms 43.8% -SingleProcess AUTOTUNE takes 4.8697 seconds -AUTOTUNE convolution(32x256x304x336, 512x256x1x1) - convolution 1.5260 ms 100.0% - triton_convolution_140 4.2531 ms 35.9% - triton_convolution_142 4.5226 ms 33.7% - triton_convolution_143 5.3815 ms 28.4% - triton_convolution_141 6.6562 ms 22.9% - triton_convolution_137 8.2852 ms 18.4% - triton_convolution_138 12.5198 ms 12.2% - triton_convolution_139 27.1271 ms 5.6% -SingleProcess AUTOTUNE takes 5.0886 seconds -AUTOTUNE mm(817152x512, 512x128) - mm 0.7224 ms 100.0% - triton_mm_146 0.8024 ms 90.0% - triton_mm_145 0.8341 ms 86.6% - triton_mm_148 0.8952 ms 80.7% - triton_mm_147 0.9011 ms 80.2% - triton_mm_151 0.9721 ms 74.3% - triton_mm_152 1.0261 ms 70.4% - triton_mm_144 1.0666 ms 67.7% - triton_mm_154 1.5977 ms 45.2% - triton_mm_149 1.7777 ms 40.6% -SingleProcess AUTOTUNE takes 5.0220 seconds -AUTOTUNE convolution(32x512x152x168, 256x512x1x1) - convolution 0.3106 ms 100.0% - triton_convolution_240 1.0496 ms 29.6% - triton_convolution_242 1.0762 ms 28.9% - triton_convolution_243 1.2981 ms 23.9% - triton_convolution_241 1.7032 ms 18.2% - triton_convolution_237 2.1030 ms 14.8% - triton_convolution_238 3.1476 ms 9.9% - triton_convolution_239 6.7274 ms 4.6% -SingleProcess AUTOTUNE takes 4.7450 seconds -AUTOTUNE convolution(32x256x76x84, 256x256x3x3) - convolution 1.0683 ms 100.0% - triton_convolution_249 6.4305 ms 16.6% - triton_convolution_247 7.2643 ms 14.7% - triton_convolution_244 7.9726 ms 13.4% - triton_convolution_250 11.5396 ms 9.3% - triton_convolution_248 16.4705 ms 6.5% - triton_convolution_245 19.3214 ms 5.5% - triton_convolution_246 29.2912 ms 3.6% -SingleProcess AUTOTUNE takes 5.4721 seconds -AUTOTUNE mm(204288x256, 256x1024) - mm 0.6795 ms 100.0% - triton_mm_253 0.7678 ms 88.5% - triton_mm_252 0.7709 ms 88.1% - triton_mm_258 0.7911 ms 85.9% - triton_mm_251 0.9138 ms 74.4% - triton_mm_254 0.9144 ms 74.3% - triton_mm_255 0.9386 ms 72.4% - triton_mm_259 1.0814 ms 62.8% - triton_mm_261 1.3090 ms 51.9% - triton_mm_260 1.8635 ms 36.5% -SingleProcess AUTOTUNE takes 4.9279 seconds -AUTOTUNE convolution(32x512x152x168, 1024x512x1x1) - convolution 1.2123 ms 100.0% - triton_convolution_266 4.1210 ms 29.4% - triton_convolution_268 4.2024 ms 28.8% - triton_convolution_269 5.0803 ms 23.9% - triton_convolution_267 6.5498 ms 18.5% - triton_convolution_263 8.1485 ms 14.9% - triton_convolution_264 12.4510 ms 9.7% - triton_convolution_265 26.7531 ms 4.5% -SingleProcess AUTOTUNE takes 5.0502 seconds -AUTOTUNE mm(204288x1024, 1024x256) - mm 0.5419 ms 100.0% - triton_mm_271 0.6533 ms 82.9% - triton_mm_272 0.6558 ms 82.6% - triton_mm_273 0.7579 ms 71.5% - triton_mm_274 0.7680 ms 70.6% - triton_mm_270 0.8710 ms 62.2% - triton_mm_278 0.8870 ms 61.1% - triton_mm_277 0.9553 ms 56.7% - triton_mm_280 1.3468 ms 40.2% - triton_mm_279 1.5510 ms 34.9% -SingleProcess AUTOTUNE takes 5.6453 seconds -AUTOTUNE convolution(32x1024x76x84, 512x1024x1x1) - convolution 0.2594 ms 100.0% - triton_convolution_955 1.0005 ms 25.9% - triton_convolution_957 1.1321 ms 22.9% - triton_convolution_958 1.2459 ms 20.8% - triton_convolution_956 1.9279 ms 13.5% - triton_convolution_952 2.2604 ms 11.5% - triton_convolution_953 3.1723 ms 8.2% - triton_convolution_954 6.5568 ms 4.0% -SingleProcess AUTOTUNE takes 5.2660 seconds -AUTOTUNE convolution(32x512x38x42, 512x512x3x3) - convolution 1.0310 ms 100.0% - triton_convolution_964 6.9642 ms 14.8% - triton_convolution_959 9.8914 ms 10.4% - triton_convolution_962 10.1237 ms 10.2% - triton_convolution_965 17.1603 ms 6.0% - triton_convolution_963 19.1972 ms 5.4% - triton_convolution_960 21.0584 ms 4.9% - triton_convolution_961 28.6043 ms 3.6% -SingleProcess AUTOTUNE takes 5.4783 seconds -AUTOTUNE mm(51072x512, 512x2048) - mm 0.5481 ms 100.0% - triton_mm_968 0.6581 ms 83.3% - triton_mm_967 0.6594 ms 83.1% - triton_mm_973 0.6983 ms 78.5% - triton_mm_970 0.7820 ms 70.1% - triton_mm_969 0.7847 ms 69.9% - triton_mm_966 0.8211 ms 66.8% - triton_mm_974 0.9289 ms 59.0% - triton_mm_976 1.2285 ms 44.6% - triton_mm_972 1.6656 ms 32.9% -SingleProcess AUTOTUNE takes 5.7071 seconds -AUTOTUNE convolution(32x1024x76x84, 2048x1024x1x1) - convolution 1.0465 ms 100.0% - triton_convolution_981 3.9368 ms 26.6% - triton_convolution_983 4.3779 ms 23.9% - triton_convolution_984 4.9079 ms 21.3% - triton_convolution_982 7.6491 ms 13.7% - triton_convolution_978 8.3042 ms 12.6% - triton_convolution_979 12.2804 ms 8.5% - triton_convolution_980 26.2045 ms 4.0% -SingleProcess AUTOTUNE takes 5.1454 seconds -AUTOTUNE mm(51072x2048, 2048x512) - mm 0.4900 ms 100.0% - triton_mm_987 0.6065 ms 80.8% - triton_mm_986 0.6077 ms 80.6% - triton_mm_988 0.7009 ms 69.9% - triton_mm_989 0.7034 ms 69.7% - triton_mm_985 0.8206 ms 59.7% - triton_mm_993 0.8374 ms 58.5% - triton_mm_992 0.8790 ms 55.7% - triton_mm_995 1.2860 ms 38.1% - triton_mm_991 1.5045 ms 32.6% -SingleProcess AUTOTUNE takes 5.4583 seconds -AUTOTUNE addmm(51072x256, 51072x2048, 2048x256) - bias_addmm 0.3029 ms 100.0% - addmm 0.3089 ms 98.1% - triton_mm_1049 0.3235 ms 93.6% - triton_mm_1048 0.3275 ms 92.5% - triton_mm_1050 0.3550 ms 85.3% - triton_mm_1051 0.3622 ms 83.6% - triton_mm_1055 0.4242 ms 71.4% - triton_mm_1047 0.4364 ms 69.4% - triton_mm_1054 0.5085 ms 59.6% - triton_mm_1057 0.7001 ms 43.3% -SingleProcess AUTOTUNE takes 5.7065 seconds -AUTOTUNE convolution(32x256x38x42, 256x256x3x3) - convolution 0.2750 ms 100.0% - triton_convolution_1064 1.7089 ms 16.1% - triton_convolution_1062 1.8415 ms 14.9% - triton_convolution_1059 2.1425 ms 12.8% - triton_convolution_1065 2.9491 ms 9.3% - triton_convolution_1063 3.9602 ms 6.9% - triton_convolution_1060 4.6976 ms 5.9% - triton_convolution_1061 7.4853 ms 3.7% -SingleProcess AUTOTUNE takes 5.0653 seconds -AUTOTUNE addmm(3268608x256, 3268608x256, 256x256) - triton_mm_1067 3.5880 ms 100.0% - triton_mm_1068 3.6040 ms 99.6% - triton_mm_1073 3.8683 ms 92.8% - triton_mm_1066 4.2427 ms 84.6% - triton_mm_1070 4.2668 ms 84.1% - triton_mm_1069 4.2703 ms 84.0% - triton_mm_1074 4.8278 ms 74.3% - bias_addmm 5.9413 ms 60.4% - addmm 5.9656 ms 60.1% - triton_mm_1076 6.2184 ms 57.7% -SingleProcess AUTOTUNE takes 5.7881 seconds -AUTOTUNE addmm(817152x256, 817152x512, 512x256) - bias_addmm 1.3238 ms 100.0% - triton_mm_1080 1.5291 ms 86.6% - triton_mm_1079 1.5313 ms 86.4% - triton_mm_1081 1.7488 ms 75.7% - triton_mm_1082 1.7947 ms 73.8% - addmm 1.8729 ms 70.7% - triton_mm_1078 1.9184 ms 69.0% - triton_mm_1086 2.0476 ms 64.6% - triton_mm_1085 2.6087 ms 50.7% - triton_mm_1088 2.8319 ms 46.7% -SingleProcess AUTOTUNE takes 6.2909 seconds -AUTOTUNE addmm(204288x256, 204288x1024, 1024x256) - bias_addmm 0.5711 ms 100.0% - triton_mm_1091 0.6686 ms 85.4% - triton_mm_1092 0.6850 ms 83.4% - addmm 0.7367 ms 77.5% - triton_mm_1093 0.7751 ms 73.7% - triton_mm_1094 0.7860 ms 72.7% - triton_mm_1090 0.8937 ms 63.9% - triton_mm_1098 0.9115 ms 62.7% - triton_mm_1097 0.9857 ms 57.9% - triton_mm_1100 1.3505 ms 42.3% -SingleProcess AUTOTUNE takes 5.8394 seconds -AUTOTUNE convolution(32x256x304x336, 256x256x3x3) - convolution 17.6540 ms 100.0% - triton_convolution_1107 100.5644 ms 17.6% - triton_convolution_1105 118.3672 ms 14.9% - triton_convolution_1102 122.4366 ms 14.4% - triton_convolution_1108 186.0839 ms 9.5% - triton_convolution_1106 265.2757 ms 6.7% - triton_convolution_1103 300.7162 ms 5.9% - triton_convolution_1104 466.2684 ms 3.8% -SingleProcess AUTOTUNE takes 17.1328 seconds -AUTOTUNE convolution(32x256x152x168, 256x256x3x3) - convolution 4.3941 ms 100.0% - triton_convolution_1114 25.2578 ms 17.4% - triton_convolution_1112 29.7074 ms 14.8% - triton_convolution_1109 31.0365 ms 14.2% - triton_convolution_1115 46.3984 ms 9.5% - triton_convolution_1113 66.1080 ms 6.6% - triton_convolution_1110 76.1160 ms 5.8% - triton_convolution_1111 116.5012 ms 3.8% -SingleProcess AUTOTUNE takes 7.3893 seconds -AUTOTUNE addmm(3268608x3, 3268608x256, 256x3) - triton_mm_1132 1.0489 ms 100.0% - triton_mm_1131 1.0684 ms 98.2% - triton_mm_1135 1.0694 ms 98.1% - triton_mm_1133 1.0721 ms 97.8% - triton_mm_1134 1.0732 ms 97.7% - triton_mm_1137 1.0832 ms 96.8% - triton_mm_1130 1.0861 ms 96.6% - triton_mm_1141 1.1324 ms 92.6% - triton_mm_1140 1.1654 ms 90.0% - triton_mm_1138 1.6334 ms 64.2% -SingleProcess AUTOTUNE takes 4.4295 seconds -AUTOTUNE addmm(817152x3, 817152x256, 256x3) - triton_mm_1151 0.2790 ms 100.0% - triton_mm_1150 0.2827 ms 98.7% - triton_mm_1154 0.2834 ms 98.4% - triton_mm_1152 0.2838 ms 98.3% - triton_mm_1153 0.2846 ms 98.0% - triton_mm_1149 0.2883 ms 96.8% - triton_mm_1156 0.2890 ms 96.5% - triton_mm_1160 0.3026 ms 92.2% - triton_mm_1159 0.3090 ms 90.3% - bias_addmm 0.4003 ms 69.7% -SingleProcess AUTOTUNE takes 4.7481 seconds -AUTOTUNE addmm(204288x3, 204288x256, 256x3) - triton_mm_1169 0.0863 ms 100.0% - triton_mm_1171 0.0867 ms 99.5% - triton_mm_1173 0.0870 ms 99.2% - triton_mm_1172 0.0878 ms 98.3% - triton_mm_1170 0.0879 ms 98.2% - triton_mm_1168 0.0897 ms 96.2% - triton_mm_1175 0.0902 ms 95.7% - triton_mm_1178 0.0952 ms 90.6% - triton_mm_1179 0.1020 ms 84.6% - triton_mm_1176 0.1132 ms 76.3% -SingleProcess AUTOTUNE takes 4.5108 seconds -AUTOTUNE addmm(51072x3, 51072x256, 256x3) - triton_mm_1189 0.0340 ms 100.0% - triton_mm_1191 0.0343 ms 99.0% - triton_mm_1190 0.0344 ms 98.7% - triton_mm_1188 0.0347 ms 97.9% - triton_mm_1187 0.0351 ms 96.9% - triton_mm_1194 0.0351 ms 96.8% - triton_mm_1192 0.0352 ms 96.6% - triton_mm_1197 0.0364 ms 93.4% - triton_mm_1198 0.0364 ms 93.3% - bias_addmm 0.0380 ms 89.4% -SingleProcess AUTOTUNE takes 4.2754 seconds -AUTOTUNE convolution(32x256x19x21, 256x256x3x3) - convolution 0.0683 ms 100.0% - triton_convolution_1202 0.4277 ms 16.0% - triton_convolution_1204 0.4415 ms 15.5% - triton_convolution_1199 0.5705 ms 12.0% - triton_convolution_1205 0.6938 ms 9.8% - triton_convolution_1200 0.7713 ms 8.9% - triton_convolution_1203 0.7883 ms 8.7% - triton_convolution_1201 1.8232 ms 3.7% -SingleProcess AUTOTUNE takes 4.8825 seconds -AUTOTUNE addmm(12768x3, 12768x256, 256x3) - triton_mm_1209 0.0132 ms 100.0% - triton_mm_1207 0.0133 ms 99.8% - triton_mm_1211 0.0133 ms 99.5% - triton_mm_1208 0.0135 ms 97.9% - triton_mm_1210 0.0141 ms 94.1% - triton_mm_1214 0.0149 ms 89.0% - triton_mm_1206 0.0153 ms 86.6% - bias_addmm 0.0161 ms 82.5% - triton_mm_1213 0.0161 ms 82.3% - triton_mm_1216 0.0173 ms 76.7% -SingleProcess AUTOTUNE takes 4.5147 seconds -AUTOTUNE addmm(3268608x12, 3268608x256, 256x12) - triton_mm_1226 1.0755 ms 100.0% - triton_mm_1220 1.1086 ms 97.0% - triton_mm_1219 1.1316 ms 95.0% - triton_mm_1221 1.1340 ms 94.8% - triton_mm_1223 1.1346 ms 94.8% - triton_mm_1222 1.1365 ms 94.6% - triton_mm_1225 1.1634 ms 92.4% - triton_mm_1218 1.1695 ms 92.0% - triton_mm_1227 1.2061 ms 89.2% - triton_mm_1229 1.2184 ms 88.3% -SingleProcess AUTOTUNE takes 4.3601 seconds -AUTOTUNE addmm(817152x12, 817152x256, 256x12) - triton_mm_1238 0.2854 ms 100.0% - triton_mm_1232 0.2928 ms 97.5% - triton_mm_1231 0.2960 ms 96.4% - triton_mm_1233 0.2968 ms 96.2% - triton_mm_1235 0.2970 ms 96.1% - triton_mm_1234 0.2985 ms 95.6% - triton_mm_1237 0.3063 ms 93.2% - triton_mm_1230 0.3072 ms 92.9% - triton_mm_1239 0.3164 ms 90.2% - triton_mm_1236 0.3207 ms 89.0% -SingleProcess AUTOTUNE takes 4.1640 seconds -AUTOTUNE addmm(204288x12, 204288x256, 256x12) - triton_mm_1250 0.0884 ms 100.0% - triton_mm_1243 0.0885 ms 99.8% - triton_mm_1247 0.0889 ms 99.4% - triton_mm_1245 0.0892 ms 99.1% - triton_mm_1244 0.0893 ms 98.9% - triton_mm_1246 0.0901 ms 98.1% - triton_mm_1242 0.0923 ms 95.8% - triton_mm_1249 0.0923 ms 95.8% - triton_mm_1248 0.0940 ms 94.0% - triton_mm_1251 0.0940 ms 94.0% -SingleProcess AUTOTUNE takes 4.0702 seconds -AUTOTUNE addmm(51072x12, 51072x256, 256x12) - triton_mm_1256 0.0333 ms 100.0% - triton_mm_1262 0.0336 ms 99.0% - triton_mm_1263 0.0343 ms 97.2% - triton_mm_1255 0.0343 ms 97.1% - triton_mm_1257 0.0344 ms 96.8% - triton_mm_1258 0.0344 ms 96.7% - triton_mm_1254 0.0350 ms 95.1% - triton_mm_1261 0.0350 ms 95.1% - triton_mm_1259 0.0352 ms 94.5% - triton_mm_1260 0.0361 ms 92.2% -SingleProcess AUTOTUNE takes 4.3500 seconds -AUTOTUNE addmm(12768x12, 12768x256, 256x12) - triton_mm_1267 0.0132 ms 100.0% - triton_mm_1275 0.0132 ms 100.0% - triton_mm_1271 0.0134 ms 98.6% - triton_mm_1270 0.0135 ms 97.9% - triton_mm_1274 0.0136 ms 97.4% - triton_mm_1272 0.0137 ms 96.7% - triton_mm_1269 0.0138 ms 95.6% - triton_mm_1268 0.0139 ms 95.2% - bias_addmm 0.0147 ms 90.2% - triton_mm_1266 0.0152 ms 87.1% -SingleProcess AUTOTUNE takes 4.2287 seconds -skipping cudagraphs due to ['non-cuda device in graph'] -[2023-12-12 11:30:21,351] [30/0_1] torch._inductor.utils: [WARNING] DeviceCopy in input program -skipping cudagraphs due to ['non-cuda device in graph'] -AUTOTUNE int_mm(32000x12544, 12544x1024, 32000x1024) - triton_mm_1288 1.6268 ms 100.0% - triton_mm_1287 1.6641 ms 97.8% - triton_mm_1286 3.2028 ms 50.8% - triton_mm_1279 3.3561 ms 48.5% - triton_mm_1280 3.3804 ms 48.1% - triton_mm_1281 3.5864 ms 45.4% - triton_mm_1282 3.6539 ms 44.5% - triton_mm_1278 4.6130 ms 35.3% - triton_mm_1285 6.7362 ms 24.2% - triton_mm_1284 10.3430 ms 15.7% -SingleProcess AUTOTUNE takes 7.9031 seconds -AUTOTUNE int_mm(32000x1024, 1024x1024, 32000x1024) - triton_mm_1299 0.3092 ms 100.0% - triton_mm_1298 0.3122 ms 99.0% - triton_mm_1291 0.3562 ms 86.8% - triton_mm_1290 0.3630 ms 85.2% - triton_mm_1297 0.3685 ms 83.9% - triton_mm_1292 0.4132 ms 74.8% - triton_mm_1293 0.4229 ms 73.1% - triton_mm_1289 0.4459 ms 69.3% - triton_mm_1296 0.6285 ms 49.2% - triton_mm_1295 0.9780 ms 31.6% -SingleProcess AUTOTUNE takes 7.8710 seconds -AUTOTUNE int_mm(32000x1024, 1024x81, 32000x81) - triton_mm_1309 0.0622 ms 100.0% - triton_mm_1308 0.0670 ms 92.8% - triton_mm_1304 0.0760 ms 81.8% - triton_mm_1302 0.0789 ms 78.8% - triton_mm_1310 0.0896 ms 69.4% - triton_mm_1301 0.0958 ms 64.9% - triton_mm_1303 0.0958 ms 64.9% - triton_mm_1305 0.1015 ms 61.2% - triton_mm_1300 0.1186 ms 52.4% - triton_mm_1307 0.1248 ms 49.8% -SingleProcess AUTOTUNE takes 8.0148 seconds -AUTOTUNE int_mm(32000x1024, 1024x320, 32000x320) - triton_mm_1319 0.1247 ms 100.0% - triton_mm_1321 0.1270 ms 98.2% - triton_mm_1313 0.1348 ms 92.5% - triton_mm_1315 0.1486 ms 83.9% - triton_mm_1320 0.1552 ms 80.4% - triton_mm_1312 0.1607 ms 77.6% - triton_mm_1311 0.1653 ms 75.4% - triton_mm_1314 0.1705 ms 73.1% - triton_mm_1318 0.2664 ms 46.8% - triton_mm_1317 0.3146 ms 39.6% -SingleProcess AUTOTUNE takes 7.6002 seconds -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] - running benchmark: 0%| | 0/30 [00:00 - async_compile.wait(globals()) - File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 2470, in wait - scope[key] = result.result() - File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 2313, in result - self.future.result() - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 458, in result - return self.__get_result() - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result - raise self._exception -torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised: -CompilationError: at 14:40: xnumel = 196 - yoffset = tl.program_id(1).to(tl.int64) * YBLOCK - yindex = yoffset + tl.arange(0, YBLOCK)[None, :].to(tl.int64) - ymask = yindex < ynumel - xoffset = tl.program_id(0).to(tl.int64) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:, None].to(tl.int64) - xmask = xindex < xnumel - x2 = xindex - y3 = yindex - y0 = yindex % 1024 - y1 = (yindex // 1024) - tmp0 = tl.load(in_ptr0 + (x2 + (196*y3)), xmask, eviction_policy='evict_last').to(tl.float32) - ^ -ValueError('numel (262144) exceeds triton maximum tensor numel (131072)') - -Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information - - -You can suppress this exception and fall back to eager by setting: - import torch._dynamo - torch._dynamo.config.suppress_errors = True - -Run failed with return code: 255 -Output: None -Error: None - loading model: 0it [00:00, ?it/s]detectron2_maskrcnn_r_101_fpn -WARNING:common:Model detectron2_maskrcnn_r_101_fpn does not support bfloat16, running with amp instead - loading model: 0it [00:08, ?it/s] -WARNING:common:Model detectron2_maskrcnn_r_101_fpn does not support bfloat16, running with amp instead -cuda eval detectron2_maskrcnn_r_101_fpn int8dynamic-bs32 -WARNING:common:Model detectron2_maskrcnn_r_101_fpn does not support bfloat16, running with amp instead -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['non-cuda device in graph'] -[2023-12-12 12:04:47,676] [30/0_1] torch._inductor.utils: [WARNING] DeviceCopy in input program -skipping cudagraphs due to ['non-cuda device in graph'] -skipping cudagraphs due to ['non-cuda device in graph'] -[2023-12-12 12:06:04,086] [30/1_1] torch._inductor.utils: [WARNING] DeviceCopy in input program -skipping cudagraphs due to ['non-cuda device in graph'] -AUTOTUNE convolution(967x256x14x14, 256x256x3x3) - convolution 0.9932 ms 100.0% - triton_convolution_1327 5.8403 ms 17.0% - triton_convolution_1325 6.4791 ms 15.3% - triton_convolution_1322 7.3768 ms 13.5% - triton_convolution_1328 10.8547 ms 9.1% - triton_convolution_1326 14.0171 ms 7.1% - triton_convolution_1323 18.6382 ms 5.3% - triton_convolution_1324 25.5688 ms 3.9% -SingleProcess AUTOTUNE takes 5.9567 seconds -AUTOTUNE addmm(758128x80, 758128x256, 256x80) - bias_addmm 0.3601 ms 100.0% - triton_mm_1352 0.4278 ms 84.2% - triton_mm_1357 0.4576 ms 78.7% - triton_mm_1351 0.4592 ms 78.4% - triton_mm_1354 0.4899 ms 73.5% - triton_mm_1353 0.4958 ms 72.6% - triton_mm_1350 0.5550 ms 64.9% - triton_mm_1358 0.5671 ms 63.5% - addmm 0.6006 ms 60.0% - triton_mm_1355 0.7203 ms 50.0% -SingleProcess AUTOTUNE takes 5.8245 seconds -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] - running benchmark: 0%| | 0/30 [00:00 - async_compile.wait(globals()) - File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 2470, in wait - scope[key] = result.result() - File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 2313, in result - self.future.result() - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 458, in result - return self.__get_result() - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result - raise self._exception -torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised: -CompilationError: at 14:40: xnumel = 196 - yoffset = tl.program_id(1).to(tl.int64) * YBLOCK - yindex = yoffset + tl.arange(0, YBLOCK)[None, :].to(tl.int64) - ymask = yindex < ynumel - xoffset = tl.program_id(0).to(tl.int64) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:, None].to(tl.int64) - xmask = xindex < xnumel - x2 = xindex - y3 = yindex - y0 = yindex % 1024 - y1 = (yindex // 1024) - tmp0 = tl.load(in_ptr0 + (x2 + (196*y3)), xmask, eviction_policy='evict_last').to(tl.float32) - ^ -ValueError('numel (262144) exceeds triton maximum tensor numel (131072)') - -Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information - - -You can suppress this exception and fall back to eager by setting: - import torch._dynamo - torch._dynamo.config.suppress_errors = True - -Run failed with return code: 255 -Output: None -Error: None - loading model: 0it [00:00, ?it/s]detectron2_maskrcnn_r_50_fpn -WARNING:common:Model detectron2_maskrcnn_r_50_fpn does not support bfloat16, running with amp instead - loading model: 0it [00:06, ?it/s] -WARNING:common:Model detectron2_maskrcnn_r_50_fpn does not support bfloat16, running with amp instead -cuda eval detectron2_maskrcnn_r_50_fpn int8dynamic-bs32 -WARNING:common:Model detectron2_maskrcnn_r_50_fpn does not support bfloat16, running with amp instead -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['non-cuda device in graph'] -[2023-12-12 12:17:51,708] [30/0_1] torch._inductor.utils: [WARNING] DeviceCopy in input program -skipping cudagraphs due to ['non-cuda device in graph'] -skipping cudagraphs due to ['non-cuda device in graph'] -[2023-12-12 12:18:56,679] [30/1_1] torch._inductor.utils: [WARNING] DeviceCopy in input program -skipping cudagraphs due to ['non-cuda device in graph'] -AUTOTUNE convolution(1154x256x14x14, 256x256x3x3) - convolution 1.1834 ms 100.0% - triton_convolution_800 6.9411 ms 17.0% - triton_convolution_798 7.7173 ms 15.3% - triton_convolution_795 8.7726 ms 13.5% - triton_convolution_801 12.9288 ms 9.2% - triton_convolution_799 16.7297 ms 7.1% - triton_convolution_796 22.5466 ms 5.2% - triton_convolution_797 30.0653 ms 3.9% -SingleProcess AUTOTUNE takes 5.5937 seconds -AUTOTUNE addmm(904736x80, 904736x256, 256x80) - bias_addmm 0.4253 ms 100.0% - triton_mm_830 0.5515 ms 77.1% - triton_mm_825 0.5585 ms 76.2% - triton_mm_827 0.5972 ms 71.2% - triton_mm_824 0.7048 ms 60.3% - triton_mm_826 0.7101 ms 59.9% - addmm 0.7138 ms 59.6% - triton_mm_831 0.7736 ms 55.0% - triton_mm_823 0.8249 ms 51.6% - triton_mm_828 0.8686 ms 49.0% -SingleProcess AUTOTUNE takes 5.7222 seconds -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] - running benchmark: 0%| | 0/30 [00:00bhts", q, k * softmax_scale) - File "/home/cdhernandez/local/pytorch/torch/functional.py", line 380, in einsum - return _VF.einsum(equation, operands) # type: ignore[attr-defined] -torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU 0 has a total capacity of 79.15 GiB of which 589.69 MiB is free. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 77.73 GiB is allocated by PyTorch, and 336.30 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run - ) = runner.load_model( - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model - self.validate_model(model, example_inputs) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model - raise NotImplementedError("Eager model failed to run") from e -NotImplementedError: Eager model failed to run - - loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] -phlippe_densenet -cuda eval phlippe_densenet int8dynamic-bs32 -AUTOTUNE convolution(32x3x32x32, 32x3x3x3) - triton_convolution_0 0.0162 ms 100.0% - triton_convolution_4 0.0166 ms 97.9% - convolution 0.0184 ms 88.2% - triton_convolution_3 0.0198 ms 81.8% - triton_convolution_2 0.0214 ms 75.8% - triton_convolution_5 0.0251 ms 64.7% - triton_convolution_1 0.0260 ms 62.4% -SingleProcess AUTOTUNE takes 2.9950 seconds -AUTOTUNE mm(32768x32, 32x32) - triton_mm_9 0.0096 ms 100.0% - triton_mm_11 0.0096 ms 100.0% - triton_mm_14 0.0096 ms 100.0% - triton_mm_6 0.0098 ms 98.0% - triton_mm_7 0.0098 ms 98.0% - triton_mm_17 0.0098 ms 98.0% - triton_mm_8 0.0099 ms 97.1% - triton_mm_16 0.0100 ms 96.1% - triton_mm_12 0.0100 ms 95.5% - triton_mm_15 0.0100 ms 95.5% -SingleProcess AUTOTUNE takes 3.3572 seconds -AUTOTUNE convolution(32x32x32x32, 16x32x3x3) - convolution 0.0164 ms 100.0% - triton_convolution_21 0.0302 ms 54.4% - triton_convolution_22 0.0314 ms 52.4% - triton_convolution_18 0.0315 ms 52.2% - triton_convolution_23 0.0425 ms 38.7% - triton_convolution_19 0.0427 ms 38.6% - triton_convolution_20 0.0780 ms 21.1% -SingleProcess AUTOTUNE takes 3.1045 seconds -AUTOTUNE mm(32768x48, 48x32) - triton_mm_34 0.0109 ms 100.0% - triton_mm_35 0.0113 ms 96.0% - triton_mm_26 0.0116 ms 94.2% - triton_mm_33 0.0116 ms 93.4% - triton_mm_31 0.0117 ms 92.9% - triton_mm_24 0.0117 ms 92.6% - triton_mm_32 0.0118 ms 91.9% - triton_mm_27 0.0121 ms 89.7% - triton_mm_28 0.0122 ms 89.1% - triton_mm_25 0.0126 ms 86.5% -SingleProcess AUTOTUNE takes 4.0981 seconds -AUTOTUNE mm(32768x128, 128x64) - triton_mm_114 0.0159 ms 100.0% - triton_mm_115 0.0166 ms 96.0% - triton_mm_121 0.0168 ms 94.7% - triton_mm_116 0.0170 ms 94.0% - triton_mm_117 0.0175 ms 90.9% - triton_mm_122 0.0181 ms 88.0% - triton_mm_118 0.0191 ms 83.4% - triton_mm_124 0.0204 ms 78.1% - mm 0.0205 ms 77.8% - triton_mm_120 0.0205 ms 77.6% -SingleProcess AUTOTUNE takes 4.1403 seconds -AUTOTUNE mm(8192x64, 64x32) - triton_mm_128 0.0079 ms 100.0% - triton_mm_126 0.0084 ms 94.6% - triton_mm_133 0.0084 ms 94.6% - triton_mm_132 0.0084 ms 94.3% - triton_mm_129 0.0085 ms 92.9% - triton_mm_135 0.0086 ms 92.2% - triton_mm_127 0.0086 ms 91.5% - triton_mm_130 0.0087 ms 90.8% - triton_mm_134 0.0088 ms 90.1% - triton_mm_137 0.0088 ms 89.5% -SingleProcess AUTOTUNE takes 3.7232 seconds -AUTOTUNE convolution(32x32x16x16, 16x32x3x3) - convolution 0.0109 ms 100.0% - triton_convolution_141 0.0175 ms 62.3% - triton_convolution_142 0.0184 ms 59.4% - triton_convolution_138 0.0193 ms 56.5% - triton_convolution_143 0.0242 ms 45.2% - triton_convolution_139 0.0316 ms 34.5% - triton_convolution_140 0.0779 ms 14.0% -SingleProcess AUTOTUNE takes 2.6783 seconds -AUTOTUNE mm(8192x160, 160x80) - triton_mm_235 0.0125 ms 100.0% - triton_mm_236 0.0125 ms 99.7% - triton_mm_234 0.0129 ms 97.0% - triton_mm_237 0.0130 ms 96.3% - triton_mm_238 0.0130 ms 96.3% - mm 0.0132 ms 94.7% - triton_mm_239 0.0137 ms 91.1% - triton_mm_242 0.0140 ms 89.5% - triton_mm_245 0.0142 ms 88.1% - triton_mm_241 0.0146 ms 85.7% -SingleProcess AUTOTUNE takes 5.1406 seconds -AUTOTUNE mm(2048x80, 80x32) - triton_mm_252 0.0071 ms 100.0% - triton_mm_249 0.0072 ms 98.2% - triton_mm_251 0.0072 ms 98.2% - triton_mm_254 0.0074 ms 96.1% - triton_mm_247 0.0076 ms 92.9% - triton_mm_250 0.0076 ms 92.5% - triton_mm_248 0.0079 ms 89.8% - triton_mm_255 0.0079 ms 89.8% - triton_mm_246 0.0084 ms 84.7% - mm 0.0084 ms 83.7% -SingleProcess AUTOTUNE takes 4.4931 seconds -AUTOTUNE convolution(32x32x8x8, 16x32x3x3) - convolution 0.0106 ms 100.0% - triton_convolution_262 0.0164 ms 64.6% - triton_convolution_258 0.0171 ms 62.2% - triton_convolution_261 0.0179 ms 59.3% - triton_convolution_263 0.0257 ms 41.4% - triton_convolution_259 0.0326 ms 32.6% - triton_convolution_260 0.0773 ms 13.7% -SingleProcess AUTOTUNE takes 2.7801 seconds -AUTOTUNE mm(2048x96, 96x32) - triton_mm_270 0.0070 ms 100.0% - triton_mm_267 0.0072 ms 97.8% - triton_mm_266 0.0074 ms 94.4% - triton_mm_265 0.0076 ms 91.6% - triton_mm_268 0.0076 ms 91.6% - triton_mm_269 0.0078 ms 90.3% - triton_mm_272 0.0079 ms 89.0% - triton_mm_273 0.0079 ms 89.0% - triton_mm_264 0.0084 ms 83.9% - mm 0.0084 ms 83.0% -SingleProcess AUTOTUNE takes 3.7179 seconds -AUTOTUNE mm(2048x112, 112x32) - triton_mm_288 0.0074 ms 100.0% - triton_mm_285 0.0076 ms 97.1% - triton_mm_287 0.0076 ms 97.1% - triton_mm_283 0.0079 ms 94.5% - triton_mm_290 0.0079 ms 94.3% - triton_mm_291 0.0079 ms 94.3% - triton_mm_286 0.0081 ms 91.7% - triton_mm_284 0.0083 ms 89.2% - mm 0.0084 ms 87.9% - triton_mm_282 0.0095 ms 78.4% -SingleProcess AUTOTUNE takes 3.7757 seconds -AUTOTUNE mm(2048x128, 128x32) - triton_mm_308 0.0074 ms 100.0% - triton_mm_305 0.0076 ms 96.7% - triton_mm_306 0.0078 ms 94.7% - triton_mm_303 0.0080 ms 92.0% - triton_mm_302 0.0081 ms 91.3% - triton_mm_304 0.0081 ms 91.3% - triton_mm_309 0.0081 ms 91.3% - mm 0.0081 ms 90.9% - triton_mm_301 0.0083 ms 88.8% - triton_mm_300 0.0092 ms 79.9% -SingleProcess AUTOTUNE takes 3.7442 seconds -AUTOTUNE mm(2048x144, 144x32) - triton_mm_324 0.0080 ms 100.0% - triton_mm_326 0.0083 ms 96.2% - triton_mm_327 0.0084 ms 95.8% - triton_mm_321 0.0084 ms 95.4% - triton_mm_323 0.0084 ms 94.7% - triton_mm_322 0.0085 ms 94.0% - triton_mm_320 0.0085 ms 93.6% - triton_mm_319 0.0086 ms 93.3% - mm 0.0092 ms 86.5% - triton_mm_318 0.0099 ms 80.9% -SingleProcess AUTOTUNE takes 3.8789 seconds -AUTOTUNE mm(2048x160, 160x32) - triton_mm_342 0.0076 ms 100.0% - triton_mm_341 0.0079 ms 95.9% - triton_mm_344 0.0079 ms 95.9% - triton_mm_345 0.0084 ms 90.4% - triton_mm_338 0.0085 ms 88.4% - triton_mm_337 0.0086 ms 88.1% - triton_mm_339 0.0086 ms 88.1% - triton_mm_340 0.0090 ms 83.7% - mm 0.0100 ms 75.4% - triton_mm_336 0.0104 ms 72.8% -SingleProcess AUTOTUNE takes 4.1559 seconds -AUTOTUNE mm(2048x176, 176x88) - triton_mm_359 0.0086 ms 100.0% - triton_mm_362 0.0088 ms 97.8% - triton_mm_357 0.0092 ms 92.7% - mm 0.0095 ms 90.5% - triton_mm_355 0.0095 ms 90.5% - triton_mm_363 0.0096 ms 89.6% - triton_mm_356 0.0097 ms 88.4% - triton_mm_358 0.0101 ms 84.8% - triton_mm_360 0.0106 ms 80.7% - triton_mm_354 0.0115 ms 74.4% -SingleProcess AUTOTUNE takes 4.7225 seconds -AUTOTUNE mm(512x88, 88x32) - triton_mm_367 0.0069 ms 100.0% - triton_mm_369 0.0071 ms 96.9% - triton_mm_372 0.0071 ms 96.9% - triton_mm_374 0.0071 ms 96.9% - triton_mm_368 0.0074 ms 93.5% - triton_mm_371 0.0076 ms 91.3% - triton_mm_370 0.0076 ms 90.8% - triton_mm_375 0.0081 ms 85.7% - triton_mm_366 0.0083 ms 83.4% - mm 0.0088 ms 78.8% -SingleProcess AUTOTUNE takes 4.2952 seconds -AUTOTUNE convolution(32x32x4x4, 16x32x3x3) - convolution 0.0109 ms 100.0% - triton_convolution_378 0.0168 ms 65.3% - triton_convolution_382 0.0184 ms 59.6% - triton_convolution_381 0.0189 ms 58.0% - triton_convolution_383 0.0280 ms 39.0% - triton_convolution_379 0.0321 ms 34.1% - triton_convolution_380 0.0421 ms 26.0% -SingleProcess AUTOTUNE takes 2.5215 seconds -AUTOTUNE mm(512x104, 104x32) - triton_mm_390 0.0068 ms 100.0% - triton_mm_385 0.0071 ms 95.7% - triton_mm_392 0.0071 ms 95.7% - triton_mm_389 0.0074 ms 92.8% - triton_mm_387 0.0074 ms 92.4% - triton_mm_388 0.0078 ms 87.1% - triton_mm_386 0.0079 ms 86.1% - triton_mm_393 0.0080 ms 84.9% - mm 0.0088 ms 77.9% - triton_mm_384 0.0090 ms 76.0% -SingleProcess AUTOTUNE takes 3.7611 seconds -AUTOTUNE mm(512x120, 120x32) - triton_mm_403 0.0072 ms 100.0% - triton_mm_408 0.0074 ms 97.4% - triton_mm_404 0.0078 ms 91.8% - triton_mm_410 0.0078 ms 91.6% - triton_mm_407 0.0078 ms 91.4% - triton_mm_405 0.0079 ms 91.1% - triton_mm_411 0.0081 ms 88.9% - triton_mm_406 0.0085 ms 84.7% - triton_mm_402 0.0092 ms 78.0% - mm 0.0095 ms 75.2% -SingleProcess AUTOTUNE takes 3.8015 seconds -AUTOTUNE mm(512x136, 136x32) - triton_mm_426 0.0074 ms 100.0% - triton_mm_421 0.0077 ms 96.7% - triton_mm_429 0.0079 ms 94.3% - triton_mm_428 0.0081 ms 91.9% - triton_mm_423 0.0081 ms 91.7% - triton_mm_425 0.0081 ms 91.7% - mm 0.0086 ms 86.6% - triton_mm_422 0.0087 ms 85.1% - triton_mm_424 0.0088 ms 84.7% - triton_mm_420 0.0099 ms 74.8% -SingleProcess AUTOTUNE takes 4.1494 seconds -AUTOTUNE mm(512x152, 152x32) - triton_mm_444 0.0071 ms 100.0% - triton_mm_441 0.0076 ms 93.7% - triton_mm_447 0.0078 ms 91.4% - triton_mm_446 0.0081 ms 88.5% - triton_mm_443 0.0081 ms 88.1% - triton_mm_439 0.0083 ms 85.8% - triton_mm_440 0.0083 ms 85.8% - triton_mm_442 0.0083 ms 85.8% - mm 0.0091 ms 78.2% - triton_mm_438 0.0097 ms 73.8% -SingleProcess AUTOTUNE takes 4.0948 seconds -AUTOTUNE mm(512x168, 168x32) - triton_mm_465 0.0074 ms 100.0% - triton_mm_464 0.0076 ms 97.5% - triton_mm_462 0.0076 ms 97.1% - triton_mm_461 0.0079 ms 94.3% - triton_mm_459 0.0085 ms 87.2% - triton_mm_457 0.0085 ms 86.9% - triton_mm_458 0.0090 ms 82.6% - mm 0.0091 ms 81.4% - triton_mm_460 0.0092 ms 80.8% - triton_mm_456 0.0104 ms 71.6% -SingleProcess AUTOTUNE takes 4.0597 seconds -AUTOTUNE int_mm(32x184, 184x10, 32x10) - triton_mm_479 0.0069 ms 100.0% - triton_mm_477 0.0071 ms 96.9% - triton_mm_478 0.0076 ms 90.8% - triton_mm_476 0.0078 ms 88.2% - triton_mm_475 0.0083 ms 83.4% - triton_mm_474 0.0092 ms 75.3% -SingleProcess AUTOTUNE takes 2.5471 seconds - running benchmark: 0%| | 0/30 [00:00 /home/cdhernandez/local/pytorch/torch/_ops.py(759)__call__() --> return self._op(*args, **(kwargs or {})) -(Pdb) TIMEOUT - loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] -timm_efficientnet -cuda eval timm_efficientnet int8dynamic-bs32 -AUTOTUNE convolution(32x3x224x224, 32x3x3x3) - convolution 0.1094 ms 100.0% - triton_convolution_4 0.1242 ms 88.1% - triton_convolution_0 0.1343 ms 81.5% - triton_convolution_3 0.1370 ms 79.9% - triton_convolution_2 0.1484 ms 73.7% - triton_convolution_5 0.1879 ms 58.2% - triton_convolution_1 0.2244 ms 48.8% -SingleProcess AUTOTUNE takes 0.9131 seconds -AUTOTUNE addmm(32x8, 32x32, 32x8) - triton_mm_6 0.0065 ms 100.0% - triton_mm_7 0.0070 ms 91.8% - triton_mm_8 0.0070 ms 91.8% - triton_mm_9 0.0070 ms 91.8% - triton_mm_11 0.0071 ms 91.0% - triton_mm_10 0.0074 ms 87.3% - bias_addmm 0.0076 ms 84.5% - addmm 0.0117 ms 55.3% -SingleProcess AUTOTUNE takes 1.0004 seconds -AUTOTUNE addmm(32x32, 32x8, 8x32) - triton_mm_13 0.0060 ms 100.0% - triton_mm_16 0.0062 ms 96.4% - triton_mm_14 0.0066 ms 90.8% - triton_mm_15 0.0066 ms 90.8% - triton_mm_12 0.0066 ms 90.6% - bias_addmm 0.0074 ms 81.0% - addmm 0.0120 ms 50.1% -SingleProcess AUTOTUNE takes 0.8723 seconds -AUTOTUNE mm(401408x32, 32x16) - triton_mm_17 0.0348 ms 100.0% - triton_mm_21 0.0348 ms 99.8% - triton_mm_24 0.0348 ms 99.7% - triton_mm_18 0.0351 ms 99.0% - triton_mm_20 0.0351 ms 98.9% - triton_mm_22 0.0351 ms 98.9% - triton_mm_19 0.0352 ms 98.8% - triton_mm_25 0.0354 ms 98.2% - triton_mm_23 0.0354 ms 98.1% - triton_mm_27 0.0386 ms 90.1% -SingleProcess AUTOTUNE takes 1.4750 seconds -AUTOTUNE mm(401408x16, 16x96) - triton_mm_33 0.0646 ms 100.0% - triton_mm_37 0.0647 ms 99.9% - triton_mm_38 0.0661 ms 97.8% - triton_mm_28 0.0673 ms 96.1% - triton_mm_32 0.0678 ms 95.4% - triton_mm_29 0.0685 ms 94.3% - triton_mm_30 0.0685 ms 94.3% - triton_mm_35 0.0686 ms 94.2% - triton_mm_31 0.0689 ms 93.9% - triton_mm_36 0.0690 ms 93.7% -SingleProcess AUTOTUNE takes 1.6201 seconds -AUTOTUNE addmm(32x4, 32x96, 96x4) - triton_mm_42 0.0067 ms 100.0% - triton_mm_41 0.0073 ms 92.5% - triton_mm_40 0.0073 ms 92.1% - triton_mm_43 0.0074 ms 91.3% - triton_mm_44 0.0078 ms 85.7% - bias_addmm 0.0081 ms 82.7% - triton_mm_39 0.0082 ms 82.2% - triton_mm_46 0.0083 ms 80.8% - triton_mm_45 0.0089 ms 75.8% - addmm 0.0112 ms 59.8% -SingleProcess AUTOTUNE takes 1.2656 seconds -AUTOTUNE addmm(32x96, 32x4, 4x96) - triton_mm_52 0.0062 ms 100.0% - triton_mm_51 0.0063 ms 99.5% - triton_mm_53 0.0063 ms 99.5% - triton_mm_57 0.0065 ms 96.1% - triton_mm_56 0.0067 ms 93.1% - triton_mm_48 0.0067 ms 92.9% - triton_mm_50 0.0069 ms 90.3% - triton_mm_55 0.0069 ms 90.1% - triton_mm_49 0.0070 ms 89.4% - triton_mm_47 0.0070 ms 89.0% -SingleProcess AUTOTUNE takes 1.9945 seconds -AUTOTUNE mm(100352x96, 96x24) - triton_mm_65 0.0292 ms 100.0% - triton_mm_61 0.0296 ms 98.7% - triton_mm_62 0.0299 ms 97.7% - triton_mm_58 0.0300 ms 97.5% - triton_mm_63 0.0304 ms 96.3% - triton_mm_66 0.0304 ms 96.1% - triton_mm_60 0.0309 ms 94.6% - triton_mm_59 0.0310 ms 94.2% - triton_mm_69 0.0318 ms 91.9% - triton_mm_68 0.0320 ms 91.3% -SingleProcess AUTOTUNE takes 1.6133 seconds -AUTOTUNE mm(100352x24, 24x144) - triton_mm_72 0.0318 ms 100.0% - triton_mm_70 0.0332 ms 95.8% - triton_mm_81 0.0342 ms 93.1% - triton_mm_74 0.0343 ms 92.9% - triton_mm_75 0.0355 ms 89.7% - triton_mm_78 0.0355 ms 89.6% - triton_mm_79 0.0375 ms 85.0% - triton_mm_71 0.0380 ms 83.8% - triton_mm_77 0.0390 ms 81.6% - triton_mm_76 0.0396 ms 80.3% -SingleProcess AUTOTUNE takes 1.6121 seconds -AUTOTUNE addmm(32x6, 32x144, 144x6) - triton_mm_85 0.0069 ms 100.0% - triton_mm_86 0.0076 ms 90.8% - triton_mm_84 0.0077 ms 90.0% - triton_mm_87 0.0078 ms 88.6% - triton_mm_83 0.0079 ms 87.5% - bias_addmm 0.0091 ms 76.7% - triton_mm_82 0.0091 ms 76.1% - triton_mm_88 0.0108 ms 64.0% - triton_mm_89 0.0111 ms 62.5% - addmm 0.0123 ms 56.7% -SingleProcess AUTOTUNE takes 1.2720 seconds -AUTOTUNE addmm(32x144, 32x6, 6x144) - triton_mm_90 0.0065 ms 100.0% - triton_mm_96 0.0065 ms 100.0% - triton_mm_93 0.0067 ms 96.2% - triton_mm_97 0.0067 ms 96.2% - triton_mm_95 0.0068 ms 95.7% - triton_mm_99 0.0068 ms 95.3% - triton_mm_92 0.0070 ms 92.2% - triton_mm_94 0.0070 ms 92.2% - triton_mm_98 0.0070 ms 92.2% - triton_mm_100 0.0070 ms 92.2% -SingleProcess AUTOTUNE takes 1.7632 seconds -AUTOTUNE mm(100352x144, 144x24) - triton_mm_105 0.0410 ms 100.0% - triton_mm_103 0.0410 ms 99.8% - triton_mm_106 0.0416 ms 98.5% - triton_mm_108 0.0416 ms 98.5% - triton_mm_104 0.0421 ms 97.3% - triton_mm_112 0.0421 ms 97.3% - triton_mm_109 0.0422 ms 97.0% - triton_mm_111 0.0432 ms 94.8% - triton_mm_102 0.0435 ms 94.1% - mm 0.0449 ms 91.3% -SingleProcess AUTOTUNE takes 1.5944 seconds -AUTOTUNE mm(25088x144, 144x40) - triton_mm_146 0.0155 ms 100.0% - triton_mm_151 0.0162 ms 95.5% - triton_mm_148 0.0166 ms 93.1% - triton_mm_155 0.0177 ms 87.5% - triton_mm_145 0.0180 ms 86.1% - triton_mm_152 0.0181 ms 85.3% - triton_mm_144 0.0182 ms 85.0% - triton_mm_147 0.0188 ms 82.4% - triton_mm_149 0.0196 ms 79.1% - mm 0.0201 ms 76.8% -SingleProcess AUTOTUNE takes 4.1262 seconds -AUTOTUNE mm(25088x40, 40x240) - triton_mm_164 0.0207 ms 100.0% - triton_mm_158 0.0208 ms 99.7% - mm 0.0216 ms 95.9% - triton_mm_167 0.0223 ms 93.1% - triton_mm_160 0.0226 ms 91.8% - triton_mm_163 0.0240 ms 86.3% - triton_mm_157 0.0243 ms 85.3% - triton_mm_159 0.0249 ms 83.4% - triton_mm_165 0.0258 ms 80.5% - triton_mm_156 0.0261 ms 79.4% -SingleProcess AUTOTUNE takes 1.6583 seconds -AUTOTUNE addmm(32x10, 32x240, 240x10) - triton_mm_171 0.0078 ms 100.0% - triton_mm_172 0.0078 ms 99.2% - triton_mm_173 0.0081 ms 96.4% - triton_mm_170 0.0084 ms 92.4% - triton_mm_169 0.0085 ms 91.0% - bias_addmm 0.0095 ms 81.8% - triton_mm_168 0.0109 ms 71.1% - addmm 0.0128 ms 60.6% - triton_mm_175 0.0148 ms 52.5% - triton_mm_174 0.0150 ms 51.8% -SingleProcess AUTOTUNE takes 1.3745 seconds -AUTOTUNE addmm(32x240, 32x10, 10x240) - triton_mm_181 0.0062 ms 100.0% - triton_mm_185 0.0062 ms 100.0% - triton_mm_178 0.0065 ms 96.5% - triton_mm_180 0.0065 ms 96.5% - triton_mm_182 0.0065 ms 96.5% - triton_mm_184 0.0065 ms 96.5% - triton_mm_186 0.0065 ms 96.5% - triton_mm_176 0.0070 ms 89.0% - triton_mm_183 0.0075 ms 83.7% - triton_mm_179 0.0075 ms 83.3% -SingleProcess AUTOTUNE takes 1.6535 seconds -AUTOTUNE mm(25088x240, 240x40) - triton_mm_189 0.0208 ms 100.0% - triton_mm_191 0.0210 ms 99.2% - triton_mm_194 0.0216 ms 96.2% - triton_mm_195 0.0231 ms 90.2% - triton_mm_190 0.0233 ms 89.2% - mm 0.0240 ms 86.6% - triton_mm_187 0.0247 ms 84.1% - triton_mm_188 0.0248 ms 84.0% - triton_mm_198 0.0259 ms 80.3% - triton_mm_192 0.0260 ms 80.0% -SingleProcess AUTOTUNE takes 4.6825 seconds -AUTOTUNE mm(6272x240, 240x80) - triton_mm_232 0.0122 ms 100.0% - triton_mm_234 0.0124 ms 99.0% - triton_mm_233 0.0125 ms 97.7% - triton_mm_238 0.0126 ms 97.2% - triton_mm_231 0.0128 ms 95.7% - triton_mm_235 0.0140 ms 87.6% - mm 0.0141 ms 86.8% - triton_mm_239 0.0145 ms 84.1% - triton_mm_230 0.0156 ms 78.6% - triton_mm_236 0.0170 ms 71.9% -SingleProcess AUTOTUNE takes 1.6571 seconds -AUTOTUNE mm(6272x80, 80x480) - triton_mm_249 0.0139 ms 100.0% - mm 0.0148 ms 94.0% - triton_mm_245 0.0148 ms 94.0% - triton_mm_243 0.0149 ms 93.1% - triton_mm_242 0.0152 ms 91.4% - triton_mm_246 0.0154 ms 90.6% - triton_mm_244 0.0154 ms 90.4% - triton_mm_250 0.0182 ms 76.4% - triton_mm_252 0.0185 ms 75.4% - triton_mm_253 0.0209 ms 66.7% -SingleProcess AUTOTUNE takes 1.7384 seconds -AUTOTUNE addmm(32x20, 32x480, 480x20) - triton_mm_257 0.0092 ms 100.0% - triton_mm_258 0.0100 ms 92.0% - triton_mm_259 0.0101 ms 90.9% - triton_mm_256 0.0103 ms 89.4% - triton_mm_255 0.0113 ms 81.4% - bias_addmm 0.0119 ms 77.2% - addmm 0.0155 ms 59.5% - triton_mm_254 0.0170 ms 54.3% - triton_mm_261 0.0198 ms 46.5% - triton_mm_260 0.0207 ms 44.6% -SingleProcess AUTOTUNE takes 1.2630 seconds -AUTOTUNE addmm(32x480, 32x20, 20x480) - triton_mm_267 0.0065 ms 100.0% - triton_mm_262 0.0065 ms 99.5% - triton_mm_268 0.0065 ms 99.5% - triton_mm_273 0.0069 ms 93.1% - triton_mm_271 0.0070 ms 92.2% - triton_mm_264 0.0071 ms 91.4% - triton_mm_266 0.0071 ms 90.6% - triton_mm_270 0.0071 ms 90.6% - triton_mm_263 0.0074 ms 87.4% - triton_mm_269 0.0075 ms 86.3% -SingleProcess AUTOTUNE takes 1.7823 seconds -AUTOTUNE mm(6272x480, 480x80) - triton_mm_278 0.0151 ms 100.0% - mm 0.0153 ms 98.7% - triton_mm_277 0.0154 ms 98.0% - triton_mm_282 0.0157 ms 95.9% - triton_mm_275 0.0164 ms 91.8% - triton_mm_276 0.0165 ms 91.5% - triton_mm_279 0.0174 ms 86.7% - triton_mm_283 0.0205 ms 73.4% - triton_mm_280 0.0209 ms 72.1% - triton_mm_274 0.0231 ms 65.2% -SingleProcess AUTOTUNE takes 1.6409 seconds -AUTOTUNE mm(6272x480, 480x112) - triton_mm_366 0.0156 ms 100.0% - mm 0.0159 ms 98.4% - triton_mm_365 0.0159 ms 98.4% - triton_mm_364 0.0166 ms 94.0% - triton_mm_370 0.0171 ms 91.4% - triton_mm_363 0.0174 ms 89.9% - triton_mm_367 0.0202 ms 77.4% - triton_mm_362 0.0233 ms 67.3% - triton_mm_371 0.0234 ms 67.0% - triton_mm_368 0.0244 ms 64.3% -SingleProcess AUTOTUNE takes 1.6373 seconds -AUTOTUNE mm(6272x112, 112x672) - triton_mm_376 0.0191 ms 100.0% - triton_mm_381 0.0207 ms 92.6% - triton_mm_375 0.0217 ms 88.2% - triton_mm_378 0.0221 ms 86.4% - triton_mm_382 0.0223 ms 85.7% - triton_mm_377 0.0226 ms 84.6% - triton_mm_374 0.0231 ms 82.8% - mm 0.0250 ms 76.7% - triton_mm_384 0.0277 ms 69.0% - triton_mm_383 0.0313 ms 61.1% -SingleProcess AUTOTUNE takes 1.6365 seconds -AUTOTUNE addmm(32x28, 32x672, 672x28) - triton_mm_389 0.0107 ms 100.0% - triton_mm_390 0.0112 ms 96.0% - triton_mm_388 0.0118 ms 90.5% - bias_addmm 0.0121 ms 88.6% - triton_mm_391 0.0128 ms 84.0% - triton_mm_387 0.0139 ms 77.0% - addmm 0.0156 ms 68.8% - triton_mm_386 0.0213 ms 50.4% - triton_mm_393 0.0257 ms 41.7% - triton_mm_392 0.0261 ms 41.1% -SingleProcess AUTOTUNE takes 1.2713 seconds -AUTOTUNE addmm(32x672, 32x28, 28x672) - triton_mm_403 0.0065 ms 100.0% - triton_mm_398 0.0067 ms 96.7% - triton_mm_397 0.0069 ms 93.1% - triton_mm_399 0.0070 ms 92.2% - triton_mm_396 0.0072 ms 90.0% - triton_mm_402 0.0072 ms 89.8% - triton_mm_394 0.0072 ms 89.4% - triton_mm_400 0.0072 ms 89.4% - triton_mm_395 0.0074 ms 87.8% - triton_mm_401 0.0075 ms 86.3% -SingleProcess AUTOTUNE takes 1.7761 seconds -AUTOTUNE mm(6272x672, 672x112) - mm 0.0181 ms 100.0% - triton_mm_410 0.0182 ms 99.6% - triton_mm_409 0.0190 ms 95.1% - triton_mm_408 0.0203 ms 89.1% - triton_mm_414 0.0205 ms 88.3% - triton_mm_407 0.0210 ms 86.3% - triton_mm_411 0.0250 ms 72.5% - triton_mm_412 0.0282 ms 64.3% - triton_mm_415 0.0295 ms 61.4% - triton_mm_406 0.0298 ms 60.7% -SingleProcess AUTOTUNE takes 1.6350 seconds -AUTOTUNE mm(1568x672, 672x192) - mm 0.0125 ms 100.0% - triton_mm_502 0.0135 ms 92.4% - triton_mm_498 0.0149 ms 83.9% - triton_mm_497 0.0154 ms 80.9% - triton_mm_499 0.0157 ms 79.3% - triton_mm_500 0.0157 ms 79.3% - triton_mm_503 0.0160 ms 78.0% - triton_mm_495 0.0177 ms 70.4% - triton_mm_496 0.0179 ms 69.8% - triton_mm_494 0.0256 ms 48.8% -SingleProcess AUTOTUNE takes 5.0328 seconds -AUTOTUNE mm(1568x192, 192x1152) - triton_mm_507 0.0138 ms 100.0% - triton_mm_508 0.0142 ms 97.1% - triton_mm_506 0.0144 ms 95.3% - mm 0.0149 ms 92.1% - triton_mm_514 0.0154 ms 89.2% - triton_mm_509 0.0156 ms 87.9% - triton_mm_510 0.0158 ms 87.2% - triton_mm_513 0.0166 ms 82.9% - triton_mm_516 0.0175 ms 78.6% - triton_mm_512 0.0215 ms 63.9% -SingleProcess AUTOTUNE takes 1.6465 seconds -AUTOTUNE addmm(32x48, 32x1152, 1152x48) - bias_addmm 0.0122 ms 100.0% - triton_mm_521 0.0134 ms 90.7% - triton_mm_522 0.0151 ms 80.3% - triton_mm_524 0.0153 ms 79.2% - addmm 0.0155 ms 78.7% - triton_mm_525 0.0158 ms 77.2% - triton_mm_520 0.0161 ms 75.4% - triton_mm_519 0.0209 ms 58.2% - triton_mm_518 0.0346 ms 35.1% - triton_mm_523 0.0355 ms 34.2% -SingleProcess AUTOTUNE takes 1.5143 seconds -AUTOTUNE addmm(32x1152, 32x48, 48x1152) - triton_mm_536 0.0072 ms 100.0% - triton_mm_533 0.0074 ms 97.0% - triton_mm_530 0.0076 ms 94.1% - triton_mm_537 0.0077 ms 93.0% - triton_mm_529 0.0079 ms 91.5% - triton_mm_534 0.0079 ms 91.5% - triton_mm_531 0.0080 ms 89.6% - triton_mm_532 0.0082 ms 87.9% - triton_mm_528 0.0082 ms 87.5% - triton_mm_538 0.0083 ms 86.5% -SingleProcess AUTOTUNE takes 1.7876 seconds -AUTOTUNE mm(1568x1152, 1152x192) - mm 0.0146 ms 100.0% - triton_mm_548 0.0175 ms 83.2% - triton_mm_549 0.0197 ms 73.9% - triton_mm_543 0.0203 ms 72.0% - triton_mm_544 0.0207 ms 70.4% - triton_mm_545 0.0210 ms 69.4% - triton_mm_546 0.0218 ms 67.1% - triton_mm_541 0.0252 ms 57.8% - triton_mm_542 0.0261 ms 55.9% - triton_mm_540 0.0390 ms 37.4% -SingleProcess AUTOTUNE takes 1.8321 seconds -AUTOTUNE mm(1568x1152, 1152x320) - mm 0.0172 ms 100.0% - triton_mm_681 0.0209 ms 82.4% - triton_mm_682 0.0209 ms 82.4% - triton_mm_686 0.0223 ms 77.3% - triton_mm_680 0.0257 ms 66.9% - triton_mm_679 0.0260 ms 66.2% - triton_mm_683 0.0284 ms 60.6% - triton_mm_684 0.0289 ms 59.5% - triton_mm_687 0.0314 ms 54.8% - triton_mm_678 0.0392 ms 44.0% -SingleProcess AUTOTUNE takes 1.6326 seconds -AUTOTUNE mm(1568x320, 320x1280) - triton_mm_691 0.0172 ms 100.0% - triton_mm_692 0.0173 ms 99.4% - mm 0.0180 ms 95.6% - triton_mm_690 0.0184 ms 93.4% - triton_mm_693 0.0193 ms 89.4% - triton_mm_694 0.0195 ms 88.5% - triton_mm_698 0.0204 ms 84.3% - triton_mm_697 0.0211 ms 81.8% - triton_mm_700 0.0258 ms 66.8% - triton_mm_696 0.0310 ms 55.5% -SingleProcess AUTOTUNE takes 1.6535 seconds -AUTOTUNE int_mm(32x1280, 1280x1000, 32x1000) - triton_mm_712 0.0139 ms 100.0% - triton_mm_707 0.0156 ms 89.3% - triton_mm_710 0.0162 ms 85.8% - triton_mm_708 0.0169 ms 82.2% - triton_mm_711 0.0172 ms 80.8% - triton_mm_706 0.0182 ms 76.1% - triton_mm_705 0.0208 ms 66.7% - triton_mm_704 0.0236 ms 58.9% - triton_mm_703 0.0267 ms 52.0% - triton_mm_702 0.0346 ms 40.1% -SingleProcess AUTOTUNE takes 1.5528 seconds - running benchmark: 0%| | 0/30 [00:00 - run() - File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run - benchmark.run(bm_args) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run - main(TorchBenchmarkRunner(), original_dir, args) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main - process_entry(0, runner, original_dir, args) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry - return maybe_fresh_cache( - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner - return fn(*args, **kwargs) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run - ) = runner.load_model( - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 383, in load_model - module = importlib.import_module(c) - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/importlib/__init__.py", line 126, in import_module - return _bootstrap._gcd_import(name[level:], package, level) - File "", line 1050, in _gcd_import - File "", line 1027, in _find_and_load - File "", line 1006, in _find_and_load_unlocked - File "", line 688, in _load_unlocked - File "", line 883, in exec_module - File "", line 241, in _call_with_frames_removed - File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/__init__.py", line 7, in - from .data.dlrm_dataloader import get_dataloader - File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/data/dlrm_dataloader.py", line 13, in - from torchrec.datasets.criteo import ( - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/__init__.py", line 8, in - import torchrec.distributed # noqa - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/__init__.py", line 36, in - from torchrec.distributed.model_parallel import DistributedModelParallel # noqa - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 21, in - from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/__init__.py", line 22, in - from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/planners.py", line 19, in - from torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/constants.py", line 10, in - from torchrec.distributed.embedding_types import EmbeddingComputeKernel - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/embedding_types.py", line 14, in - from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 23, in - from . import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py", line 19, in - torch.ops.fbgemm.jagged_2d_to_dense, - File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 822, in __getattr__ - raise AttributeError( -AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense' -Run failed with return code: 1 -Output: None -Error: None - loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] -BERT_pytorch -cuda eval BERT_pytorch baseline-bs32 -AUTOTUNE mm(4096x768, 768x768) - mm 0.0306 ms 100.0% - triton_mm_2 0.0371 ms 82.5% - triton_mm_1 0.0403 ms 75.8% - triton_mm_3 0.0411 ms 74.4% - triton_mm_4 0.0413 ms 73.9% - triton_mm_7 0.0438 ms 69.8% - triton_mm_8 0.0481 ms 63.6% - triton_mm_0 0.0570 ms 53.7% - triton_mm_10 0.0712 ms 42.9% - triton_mm_9 0.0790 ms 38.7% -SingleProcess AUTOTUNE takes 1.6112 seconds -AUTOTUNE bmm(384x128x64, 384x64x128) - triton_bmm_32 0.0257 ms 100.0% - triton_bmm_26 0.0265 ms 97.2% - triton_bmm_25 0.0267 ms 96.5% - triton_bmm_27 0.0272 ms 94.8% - triton_bmm_28 0.0281 ms 91.7% - triton_bmm_34 0.0281 ms 91.6% - triton_bmm_24 0.0282 ms 91.2% - bmm 0.0284 ms 90.8% - triton_bmm_31 0.0296 ms 87.0% - triton_bmm_33 0.0316 ms 81.6% -SingleProcess AUTOTUNE takes 1.7496 seconds -AUTOTUNE bmm(384x128x128, 384x128x64) - triton_bmm_50 0.0285 ms 100.0% - triton_bmm_49 0.0293 ms 97.4% - triton_bmm_52 0.0297 ms 96.1% - triton_bmm_48 0.0300 ms 95.1% - triton_bmm_55 0.0300 ms 94.9% - triton_bmm_56 0.0300 ms 94.9% - triton_bmm_51 0.0312 ms 91.6% - triton_bmm_54 0.0319 ms 89.5% - triton_bmm_58 0.0326 ms 87.5% - triton_bmm_53 0.0332 ms 85.9% -SingleProcess AUTOTUNE takes 1.6227 seconds -AUTOTUNE mm(4096x768, 768x3072) - mm 0.0995 ms 100.0% - triton_mm_74 0.1117 ms 89.0% - triton_mm_73 0.1118 ms 89.0% - triton_mm_79 0.1285 ms 77.4% - triton_mm_75 0.1327 ms 75.0% - triton_mm_76 0.1349 ms 73.8% - triton_mm_72 0.1421 ms 70.0% - triton_mm_80 0.1557 ms 63.9% - triton_mm_82 0.2340 ms 42.5% - triton_mm_77 0.2836 ms 35.1% -SingleProcess AUTOTUNE takes 1.6582 seconds -AUTOTUNE mm(4096x3072, 3072x768) - mm 0.0851 ms 100.0% - triton_mm_86 0.1235 ms 68.9% - triton_mm_85 0.1276 ms 66.7% - triton_mm_87 0.1304 ms 65.3% - triton_mm_88 0.1315 ms 64.7% - triton_mm_92 0.1557 ms 54.7% - triton_mm_84 0.1623 ms 52.4% - triton_mm_91 0.1916 ms 44.4% - triton_mm_89 0.2732 ms 31.2% - triton_mm_90 0.2747 ms 31.0% -SingleProcess AUTOTUNE takes 1.6729 seconds - running benchmark: 0%| | 0/30 [00:00 - async_compile.wait(globals()) - File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 2470, in wait - scope[key] = result.result() - File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 2313, in result - self.future.result() - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 458, in result - return self.__get_result() - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result - raise self._exception -torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised: -CompilationError: at 14:40: xnumel = 196 - yoffset = tl.program_id(1).to(tl.int64) * YBLOCK - yindex = yoffset + tl.arange(0, YBLOCK)[None, :].to(tl.int64) - ymask = yindex < ynumel - xoffset = tl.program_id(0).to(tl.int64) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:, None].to(tl.int64) - xmask = xindex < xnumel - x2 = xindex - y3 = yindex - y0 = yindex % 1024 - y1 = (yindex // 1024) - tmp0 = tl.load(in_ptr0 + (x2 + (196*y3)), xmask, eviction_policy='evict_last').to(tl.float32) - ^ -ValueError('numel (262144) exceeds triton maximum tensor numel (131072)') - -Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information - - -You can suppress this exception and fall back to eager by setting: - import torch._dynamo - torch._dynamo.config.suppress_errors = True - -Run failed with return code: 255 -Output: None -Error: None - loading model: 0it [00:00, ?it/s]detectron2_fasterrcnn_r_101_dc5 - loading model: 0it [00:09, ?it/s] -WARNING:root:detectron2_fasterrcnn_r_101_dc5 failed to load -Original Error: "roi_align_forward_kernel" not implemented for 'BFloat16' -Eager model failed to run -Traceback (most recent call last): - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model - self.model_iter_fn(model, example_inputs) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass - return mod(*inputs) - File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl - return self._call_impl(*args, **kwargs) - File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl - return forward_call(*args, **kwargs) - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward - return self.inference(batched_inputs) - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference - results, _ = self.roi_heads(images, features, proposals, None) - File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl - return self._call_impl(*args, **kwargs) - File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl - return forward_call(*args, **kwargs) - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 747, in forward - pred_instances = self._forward_box(features, proposals) - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 798, in _forward_box - box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) - File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl - return self._call_impl(*args, **kwargs) - File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl - return forward_call(*args, **kwargs) - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 246, in forward - return self.level_poolers[0](x[0], pooler_fmt_boxes) - File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl - return self._call_impl(*args, **kwargs) - File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl - return forward_call(*args, **kwargs) - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward - return roi_align( - File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 238, in roi_align - return torch.ops.torchvision.roi_align( - File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 755, in __call__ - return self._op(*args, **(kwargs or {})) -RuntimeError: "roi_align_forward_kernel" not implemented for 'BFloat16' - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run - ) = runner.load_model( - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model - self.validate_model(model, example_inputs) - File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model - raise NotImplementedError("Eager model failed to run") from e -NotImplementedError: Eager model failed to run - - loading model: 0it [00:00, ?it/s]detectron2_fasterrcnn_r_101_fpn -WARNING:common:Model detectron2_fasterrcnn_r_101_fpn does not support bfloat16, running with amp instead - loading model: 0it [00:05, ?it/s] -WARNING:common:Model detectron2_fasterrcnn_r_101_fpn does not support bfloat16, running with amp instead -cuda eval detectron2_fasterrcnn_r_101_fpn baseline-bs32 -WARNING:common:Model detectron2_fasterrcnn_r_101_fpn does not support bfloat16, running with amp instead -skipping cudagraphs due to ['mutated inputs'] -AUTOTUNE convolution(32x3x1216x1344, 64x3x7x7) - convolution 3.7276 ms 100.0% - triton_convolution_3 22.1441 ms 16.8% - triton_convolution_4 24.2946 ms 15.3% - triton_convolution_5 26.9298 ms 13.8% - triton_convolution_0 30.0927 ms 12.4% - triton_convolution_2 32.5115 ms 11.5% - triton_convolution_1 81.4334 ms 4.6% -SingleProcess AUTOTUNE takes 4.6148 seconds -AUTOTUNE mm(3268608x64, 64x64) - triton_mm_14 0.5204 ms 100.0% - triton_mm_8 0.5205 ms 100.0% - triton_mm_7 0.5292 ms 98.3% - triton_mm_10 0.5374 ms 96.8% - triton_mm_6 0.5463 ms 95.2% - triton_mm_13 0.5497 ms 94.7% - triton_mm_9 0.5542 ms 93.9% - mm 0.5696 ms 91.4% - triton_mm_15 0.7099 ms 73.3% - triton_mm_16 0.7413 ms 70.2% -SingleProcess AUTOTUNE takes 3.9672 seconds -AUTOTUNE convolution(32x64x304x336, 64x64x3x3) - convolution 1.4512 ms 100.0% - triton_convolution_18 7.1300 ms 20.4% - triton_convolution_23 8.1302 ms 17.8% - triton_convolution_24 9.7394 ms 14.9% - triton_convolution_19 12.0317 ms 12.1% - triton_convolution_21 12.1762 ms 11.9% - triton_convolution_22 12.6389 ms 11.5% - triton_convolution_20 28.5142 ms 5.1% -SingleProcess AUTOTUNE takes 4.5511 seconds -AUTOTUNE mm(3268608x64, 64x256) - triton_mm_27 1.5106 ms 100.0% - triton_mm_26 1.5171 ms 99.6% - triton_mm_28 1.7556 ms 86.0% - triton_mm_29 1.7616 ms 85.8% - mm 1.7827 ms 84.7% - triton_mm_33 1.8637 ms 81.1% - triton_mm_25 1.8785 ms 80.4% - triton_mm_32 1.9761 ms 76.4% - triton_mm_35 2.2260 ms 67.9% - triton_mm_34 2.8418 ms 53.2% -SingleProcess AUTOTUNE takes 4.4901 seconds -AUTOTUNE mm(3268608x256, 256x64) - triton_mm_51 1.3317 ms 100.0% - triton_mm_50 1.3587 ms 98.0% - triton_mm_53 1.3629 ms 97.7% - triton_mm_56 1.3673 ms 97.4% - triton_mm_57 1.3967 ms 95.3% - triton_mm_49 1.4020 ms 95.0% - mm 1.4044 ms 94.8% - triton_mm_52 1.4348 ms 92.8% - triton_mm_59 2.0760 ms 64.1% - triton_mm_55 2.1260 ms 62.6% -SingleProcess AUTOTUNE takes 4.6271 seconds -AUTOTUNE convolution(32x256x304x336, 128x256x1x1) - convolution 0.4530 ms 100.0% - triton_convolution_114 1.0770 ms 42.1% - triton_convolution_111 1.2264 ms 36.9% - triton_convolution_117 1.3645 ms 33.2% - triton_convolution_116 1.5053 ms 30.1% - triton_convolution_115 1.6691 ms 27.1% - triton_convolution_112 3.1430 ms 14.4% - triton_convolution_113 6.8266 ms 6.6% -SingleProcess AUTOTUNE takes 4.1577 seconds -AUTOTUNE convolution(32x128x152x168, 128x128x3x3) - convolution 1.2069 ms 100.0% - triton_convolution_121 7.0850 ms 17.0% - triton_convolution_118 8.0108 ms 15.1% - triton_convolution_123 8.3698 ms 14.4% - triton_convolution_124 10.6358 ms 11.3% - triton_convolution_122 12.3811 ms 9.7% - triton_convolution_119 13.8907 ms 8.7% - triton_convolution_120 28.6004 ms 4.2% -SingleProcess AUTOTUNE takes 4.5632 seconds -AUTOTUNE mm(817152x128, 128x512) - triton_mm_127 1.0106 ms 100.0% - triton_mm_126 1.0184 ms 99.2% - triton_mm_132 1.0308 ms 98.0% - mm 1.1343 ms 89.1% - triton_mm_125 1.1611 ms 87.0% - triton_mm_128 1.1932 ms 84.7% - triton_mm_129 1.2067 ms 83.7% - triton_mm_133 1.3923 ms 72.6% - triton_mm_135 1.7286 ms 58.5% - triton_mm_134 2.2943 ms 44.0% -SingleProcess AUTOTUNE takes 4.4943 seconds -AUTOTUNE convolution(32x256x304x336, 512x256x1x1) - convolution 1.5192 ms 100.0% - triton_convolution_140 4.2484 ms 35.8% - triton_convolution_142 4.5286 ms 33.5% - triton_convolution_143 5.4167 ms 28.0% - triton_convolution_141 6.6385 ms 22.9% - triton_convolution_137 8.3060 ms 18.3% - triton_convolution_138 12.5158 ms 12.1% - triton_convolution_139 27.1067 ms 5.6% -SingleProcess AUTOTUNE takes 5.2291 seconds -AUTOTUNE mm(817152x512, 512x128) - mm 0.7270 ms 100.0% - triton_mm_146 0.8278 ms 87.8% - triton_mm_145 0.8757 ms 83.0% - triton_mm_148 0.9058 ms 80.3% - triton_mm_147 0.9367 ms 77.6% - triton_mm_151 0.9769 ms 74.4% - triton_mm_152 1.0830 ms 67.1% - triton_mm_144 1.1040 ms 65.9% - triton_mm_154 1.6869 ms 43.1% - triton_mm_149 1.7984 ms 40.4% -SingleProcess AUTOTUNE takes 5.0176 seconds -AUTOTUNE convolution(32x512x152x168, 256x512x1x1) - convolution 0.3219 ms 100.0% - triton_convolution_240 1.0508 ms 30.6% - triton_convolution_242 1.0755 ms 29.9% - triton_convolution_243 1.2962 ms 24.8% - triton_convolution_241 1.6665 ms 19.3% - triton_convolution_237 2.1070 ms 15.3% - triton_convolution_238 3.1459 ms 10.2% - triton_convolution_239 6.7243 ms 4.8% -SingleProcess AUTOTUNE takes 4.9713 seconds -AUTOTUNE convolution(32x256x76x84, 256x256x3x3) - convolution 1.0804 ms 100.0% - triton_convolution_249 6.4292 ms 16.8% - triton_convolution_247 7.2717 ms 14.9% - triton_convolution_244 7.9900 ms 13.5% - triton_convolution_250 11.5452 ms 9.4% - triton_convolution_248 16.4734 ms 6.6% - triton_convolution_245 19.3684 ms 5.6% - triton_convolution_246 29.2683 ms 3.7% -SingleProcess AUTOTUNE takes 5.1353 seconds -AUTOTUNE mm(204288x256, 256x1024) - mm 0.6786 ms 100.0% - triton_mm_253 0.7821 ms 86.8% - triton_mm_252 0.7883 ms 86.1% - triton_mm_258 0.8225 ms 82.5% - triton_mm_251 0.9310 ms 72.9% - triton_mm_255 0.9324 ms 72.8% - triton_mm_254 0.9355 ms 72.5% - triton_mm_259 1.0938 ms 62.0% - triton_mm_261 1.3321 ms 50.9% - triton_mm_260 1.9005 ms 35.7% -SingleProcess AUTOTUNE takes 4.6551 seconds -AUTOTUNE convolution(32x512x152x168, 1024x512x1x1) - convolution 1.2104 ms 100.0% - triton_convolution_266 4.1278 ms 29.3% - triton_convolution_268 4.2156 ms 28.7% - triton_convolution_269 5.0886 ms 23.8% - triton_convolution_267 6.5420 ms 18.5% - triton_convolution_263 8.1750 ms 14.8% - triton_convolution_264 12.4580 ms 9.7% - triton_convolution_265 26.7420 ms 4.5% -SingleProcess AUTOTUNE takes 5.2265 seconds -AUTOTUNE mm(204288x1024, 1024x256) - mm 0.5437 ms 100.0% - triton_mm_271 0.6632 ms 82.0% - triton_mm_272 0.6646 ms 81.8% - triton_mm_274 0.7669 ms 70.9% - triton_mm_273 0.7675 ms 70.8% - triton_mm_270 0.8797 ms 61.8% - triton_mm_278 0.8978 ms 60.6% - triton_mm_277 0.9712 ms 56.0% - triton_mm_280 1.3469 ms 40.4% - triton_mm_279 1.5811 ms 34.4% -SingleProcess AUTOTUNE takes 5.2192 seconds -AUTOTUNE convolution(32x1024x76x84, 512x1024x1x1) - convolution 0.2622 ms 100.0% - triton_convolution_955 1.0030 ms 26.1% - triton_convolution_957 1.1227 ms 23.4% - triton_convolution_958 1.2446 ms 21.1% - triton_convolution_956 1.9245 ms 13.6% - triton_convolution_952 2.2700 ms 11.6% - triton_convolution_953 3.1720 ms 8.3% - triton_convolution_954 6.5570 ms 4.0% -SingleProcess AUTOTUNE takes 4.5819 seconds -AUTOTUNE convolution(32x512x38x42, 512x512x3x3) - convolution 1.0515 ms 100.0% - triton_convolution_964 6.9621 ms 15.1% - triton_convolution_959 9.8507 ms 10.7% - triton_convolution_962 10.1274 ms 10.4% - triton_convolution_965 17.0127 ms 6.2% - triton_convolution_963 19.1787 ms 5.5% - triton_convolution_960 21.0096 ms 5.0% - triton_convolution_961 28.6011 ms 3.7% -SingleProcess AUTOTUNE takes 5.3532 seconds -AUTOTUNE mm(51072x512, 512x2048) - mm 0.5559 ms 100.0% - triton_mm_968 0.6594 ms 84.3% - triton_mm_967 0.6638 ms 83.7% - triton_mm_973 0.7133 ms 77.9% - triton_mm_970 0.7841 ms 70.9% - triton_mm_969 0.7904 ms 70.3% - triton_mm_966 0.8376 ms 66.4% - triton_mm_974 0.9407 ms 59.1% - triton_mm_976 1.2370 ms 44.9% - triton_mm_971 1.7099 ms 32.5% -SingleProcess AUTOTUNE takes 4.7618 seconds -AUTOTUNE convolution(32x1024x76x84, 2048x1024x1x1) - convolution 1.0470 ms 100.0% - triton_convolution_981 3.9419 ms 26.6% - triton_convolution_983 4.3415 ms 24.1% - triton_convolution_984 4.9420 ms 21.2% - triton_convolution_982 7.6665 ms 13.7% - triton_convolution_978 8.2708 ms 12.7% - triton_convolution_979 12.2919 ms 8.5% - triton_convolution_980 26.1428 ms 4.0% -SingleProcess AUTOTUNE takes 4.8388 seconds -AUTOTUNE mm(51072x2048, 2048x512) - mm 0.4948 ms 100.0% - triton_mm_987 0.6086 ms 81.3% - triton_mm_986 0.6168 ms 80.2% - triton_mm_988 0.7104 ms 69.6% - triton_mm_989 0.7249 ms 68.3% - triton_mm_985 0.8283 ms 59.7% - triton_mm_993 0.8379 ms 59.1% - triton_mm_992 0.8847 ms 55.9% - triton_mm_995 1.3007 ms 38.0% - triton_mm_990 1.5264 ms 32.4% -SingleProcess AUTOTUNE takes 5.0192 seconds -AUTOTUNE addmm(51072x256, 51072x2048, 2048x256) - bias_addmm 0.3045 ms 100.0% - addmm 0.3104 ms 98.1% - triton_mm_1049 0.3329 ms 91.5% - triton_mm_1048 0.3370 ms 90.4% - triton_mm_1050 0.3632 ms 83.8% - triton_mm_1051 0.3691 ms 82.5% - triton_mm_1055 0.4297 ms 70.9% - triton_mm_1047 0.4396 ms 69.3% - triton_mm_1054 0.5082 ms 59.9% - triton_mm_1057 0.7069 ms 43.1% -SingleProcess AUTOTUNE takes 5.5123 seconds -AUTOTUNE convolution(32x256x38x42, 256x256x3x3) - convolution 0.2800 ms 100.0% - triton_convolution_1064 1.7070 ms 16.4% - triton_convolution_1062 1.8459 ms 15.2% - triton_convolution_1059 2.1395 ms 13.1% - triton_convolution_1065 2.9374 ms 9.5% - triton_convolution_1063 3.9718 ms 7.0% - triton_convolution_1060 4.7213 ms 5.9% - triton_convolution_1061 7.4863 ms 3.7% -SingleProcess AUTOTUNE takes 4.7817 seconds -AUTOTUNE addmm(3268608x256, 3268608x256, 256x256) - triton_mm_1068 3.6515 ms 100.0% - triton_mm_1067 3.7399 ms 97.6% - triton_mm_1073 3.9075 ms 93.4% - triton_mm_1069 4.2690 ms 85.5% - triton_mm_1070 4.3191 ms 84.5% - triton_mm_1066 4.4729 ms 81.6% - triton_mm_1074 4.8846 ms 74.8% - bias_addmm 5.9405 ms 61.5% - addmm 5.9812 ms 61.0% - triton_mm_1076 6.6301 ms 55.1% -SingleProcess AUTOTUNE takes 5.7035 seconds -AUTOTUNE addmm(817152x256, 817152x512, 512x256) - bias_addmm 1.3548 ms 100.0% - triton_mm_1080 1.4332 ms 94.5% - triton_mm_1079 1.5655 ms 86.5% - triton_mm_1081 1.8005 ms 75.2% - triton_mm_1082 1.8064 ms 75.0% - addmm 1.9001 ms 71.3% - triton_mm_1078 1.9506 ms 69.5% - triton_mm_1086 2.0710 ms 65.4% - triton_mm_1085 2.6391 ms 51.3% - triton_mm_1088 2.8551 ms 47.5% -SingleProcess AUTOTUNE takes 5.6193 seconds -AUTOTUNE addmm(204288x256, 204288x1024, 1024x256) - bias_addmm 0.5761 ms 100.0% - triton_mm_1091 0.6856 ms 84.0% - triton_mm_1092 0.6867 ms 83.9% - addmm 0.7476 ms 77.1% - triton_mm_1093 0.7855 ms 73.3% - triton_mm_1094 0.7955 ms 72.4% - triton_mm_1090 0.9032 ms 63.8% - triton_mm_1098 0.9230 ms 62.4% - triton_mm_1097 0.9917 ms 58.1% - triton_mm_1100 1.3640 ms 42.2% -SingleProcess AUTOTUNE takes 5.7268 seconds -AUTOTUNE convolution(32x256x304x336, 256x256x3x3) - convolution 17.6539 ms 100.0% - triton_convolution_1107 100.5318 ms 17.6% - triton_convolution_1105 118.5658 ms 14.9% - triton_convolution_1102 122.7876 ms 14.4% - triton_convolution_1108 186.2668 ms 9.5% - triton_convolution_1106 265.2579 ms 6.7% - triton_convolution_1103 300.6816 ms 5.9% - triton_convolution_1104 465.4240 ms 3.8% -SingleProcess AUTOTUNE takes 16.9654 seconds -AUTOTUNE convolution(32x256x152x168, 256x256x3x3) - convolution 4.4577 ms 100.0% - triton_convolution_1114 25.2454 ms 17.7% - triton_convolution_1112 29.8068 ms 15.0% - triton_convolution_1109 31.0213 ms 14.4% - triton_convolution_1115 46.3662 ms 9.6% - triton_convolution_1113 66.0892 ms 6.7% - triton_convolution_1110 76.3744 ms 5.8% - triton_convolution_1111 116.2610 ms 3.8% -SingleProcess AUTOTUNE takes 7.4362 seconds -AUTOTUNE addmm(3268608x3, 3268608x256, 256x3) - triton_mm_1132 1.0499 ms 100.0% - triton_mm_1131 1.0695 ms 98.2% - triton_mm_1135 1.0697 ms 98.1% - triton_mm_1133 1.0720 ms 97.9% - triton_mm_1134 1.0749 ms 97.7% - triton_mm_1137 1.0843 ms 96.8% - triton_mm_1130 1.0863 ms 96.6% - triton_mm_1141 1.1345 ms 92.5% - triton_mm_1140 1.1662 ms 90.0% - triton_mm_1138 1.2461 ms 84.3% -SingleProcess AUTOTUNE takes 5.0383 seconds -AUTOTUNE addmm(817152x3, 817152x256, 256x3) - triton_mm_1151 0.2790 ms 100.0% - triton_mm_1150 0.2831 ms 98.6% - triton_mm_1154 0.2835 ms 98.4% - triton_mm_1152 0.2840 ms 98.3% - triton_mm_1153 0.2846 ms 98.1% - triton_mm_1149 0.2888 ms 96.6% - triton_mm_1156 0.2890 ms 96.5% - triton_mm_1160 0.3040 ms 91.8% - triton_mm_1159 0.3087 ms 90.4% - triton_mm_1157 0.3271 ms 85.3% -SingleProcess AUTOTUNE takes 4.3400 seconds -AUTOTUNE addmm(204288x3, 204288x256, 256x3) - triton_mm_1169 0.0862 ms 100.0% - triton_mm_1171 0.0866 ms 99.5% - triton_mm_1173 0.0869 ms 99.2% - triton_mm_1172 0.0875 ms 98.5% - triton_mm_1170 0.0877 ms 98.2% - triton_mm_1168 0.0899 ms 95.9% - triton_mm_1175 0.0902 ms 95.5% - triton_mm_1178 0.0952 ms 90.6% - triton_mm_1176 0.0966 ms 89.2% - triton_mm_1179 0.1013 ms 85.0% -SingleProcess AUTOTUNE takes 3.9785 seconds -AUTOTUNE addmm(51072x3, 51072x256, 256x3) - triton_mm_1189 0.0339 ms 100.0% - triton_mm_1191 0.0343 ms 98.8% - triton_mm_1188 0.0344 ms 98.7% - triton_mm_1190 0.0345 ms 98.3% - triton_mm_1187 0.0353 ms 96.2% - triton_mm_1194 0.0353 ms 96.2% - triton_mm_1192 0.0355 ms 95.7% - triton_mm_1198 0.0364 ms 93.3% - triton_mm_1197 0.0368 ms 92.3% - triton_mm_1195 0.0371 ms 91.4% -SingleProcess AUTOTUNE takes 4.3372 seconds -AUTOTUNE convolution(32x256x19x21, 256x256x3x3) - convolution 0.0686 ms 100.0% - triton_convolution_1202 0.4279 ms 16.0% - triton_convolution_1204 0.4406 ms 15.6% - triton_convolution_1199 0.5708 ms 12.0% - triton_convolution_1205 0.6937 ms 9.9% - triton_convolution_1200 0.7684 ms 8.9% - triton_convolution_1203 0.7883 ms 8.7% - triton_convolution_1201 1.8229 ms 3.8% -SingleProcess AUTOTUNE takes 4.5925 seconds -AUTOTUNE addmm(12768x3, 12768x256, 256x3) - triton_mm_1209 0.0132 ms 100.0% - triton_mm_1207 0.0133 ms 99.5% - triton_mm_1211 0.0133 ms 99.3% - triton_mm_1208 0.0140 ms 94.7% - triton_mm_1210 0.0140 ms 94.7% - triton_mm_1214 0.0143 ms 92.2% - bias_addmm 0.0155 ms 85.2% - triton_mm_1206 0.0157 ms 84.3% - triton_mm_1213 0.0161 ms 82.3% - triton_mm_1216 0.0176 ms 74.9% -SingleProcess AUTOTUNE takes 4.0712 seconds -AUTOTUNE addmm(3268608x12, 3268608x256, 256x12) - triton_mm_1226 1.0775 ms 100.0% - triton_mm_1220 1.1099 ms 97.1% - triton_mm_1219 1.1344 ms 95.0% - triton_mm_1223 1.1381 ms 94.7% - triton_mm_1222 1.1384 ms 94.7% - triton_mm_1221 1.1386 ms 94.6% - triton_mm_1225 1.1644 ms 92.5% - triton_mm_1227 1.1674 ms 92.3% - triton_mm_1224 1.1682 ms 92.2% - triton_mm_1218 1.1714 ms 92.0% -SingleProcess AUTOTUNE takes 4.1467 seconds -AUTOTUNE addmm(817152x12, 817152x256, 256x12) - triton_mm_1238 0.2854 ms 100.0% - triton_mm_1232 0.2924 ms 97.6% - triton_mm_1231 0.2959 ms 96.5% - triton_mm_1233 0.2972 ms 96.0% - triton_mm_1235 0.2973 ms 96.0% - triton_mm_1234 0.2988 ms 95.5% - triton_mm_1236 0.3048 ms 93.6% - triton_mm_1237 0.3065 ms 93.1% - triton_mm_1239 0.3069 ms 93.0% - triton_mm_1230 0.3071 ms 92.9% -SingleProcess AUTOTUNE takes 3.9259 seconds -AUTOTUNE addmm(204288x12, 204288x256, 256x12) - triton_mm_1250 0.0881 ms 100.0% - triton_mm_1243 0.0881 ms 99.9% - triton_mm_1245 0.0890 ms 98.9% - triton_mm_1247 0.0893 ms 98.6% - triton_mm_1246 0.0896 ms 98.3% - triton_mm_1244 0.0896 ms 98.3% - triton_mm_1248 0.0911 ms 96.6% - triton_mm_1249 0.0923 ms 95.4% - triton_mm_1242 0.0924 ms 95.3% - triton_mm_1251 0.0925 ms 95.2% -SingleProcess AUTOTUNE takes 4.2935 seconds -AUTOTUNE addmm(51072x12, 51072x256, 256x12) - triton_mm_1256 0.0333 ms 100.0% - triton_mm_1262 0.0336 ms 99.3% - triton_mm_1258 0.0342 ms 97.5% - triton_mm_1263 0.0343 ms 97.2% - triton_mm_1255 0.0344 ms 97.0% - triton_mm_1257 0.0345 ms 96.8% - triton_mm_1261 0.0346 ms 96.3% - triton_mm_1259 0.0348 ms 95.7% - triton_mm_1254 0.0352 ms 94.6% - triton_mm_1265 0.0362 ms 92.1% -SingleProcess AUTOTUNE takes 4.0971 seconds -AUTOTUNE addmm(12768x12, 12768x256, 256x12) - triton_mm_1274 0.0129 ms 100.0% - triton_mm_1267 0.0132 ms 98.1% - triton_mm_1268 0.0134 ms 96.7% - triton_mm_1271 0.0134 ms 96.7% - triton_mm_1275 0.0135 ms 96.0% - triton_mm_1269 0.0137 ms 94.4% - triton_mm_1272 0.0138 ms 94.0% - triton_mm_1270 0.0139 ms 92.9% - bias_addmm 0.0149 ms 86.9% - triton_mm_1266 0.0152 ms 85.2% -SingleProcess AUTOTUNE takes 3.8293 seconds -skipping cudagraphs due to ['non-cuda device in graph'] -[2023-12-12 17:13:02,525] [30/0_1] torch._inductor.utils: [WARNING] DeviceCopy in input program -skipping cudagraphs due to ['non-cuda device in graph'] -AUTOTUNE mm(32000x12544, 12544x1024) - mm 3.6433 ms 100.0% - triton_mm_1279 4.8504 ms 75.1% - triton_mm_1280 4.9807 ms 73.1% - triton_mm_1281 5.4958 ms 66.3% - triton_mm_1282 5.5499 ms 65.6% - triton_mm_1286 6.4978 ms 56.1% - triton_mm_1278 6.5740 ms 55.4% - triton_mm_1285 6.9910 ms 52.1% - triton_mm_1288 10.0610 ms 36.2% - triton_mm_1283 11.3868 ms 32.0% -SingleProcess AUTOTUNE takes 5.8258 seconds -AUTOTUNE mm(32000x1024, 1024x1024) - mm 0.3167 ms 100.0% - triton_mm_1292 0.3782 ms 83.7% - triton_mm_1291 0.3793 ms 83.5% - triton_mm_1293 0.4526 ms 70.0% - triton_mm_1294 0.4528 ms 69.9% - triton_mm_1290 0.4931 ms 64.2% - triton_mm_1298 0.5370 ms 59.0% - triton_mm_1297 0.5453 ms 58.1% - triton_mm_1300 0.7575 ms 41.8% - triton_mm_1299 0.9494 ms 33.4% -SingleProcess AUTOTUNE takes 5.4527 seconds -AUTOTUNE addmm(32000x81, 32000x1024, 1024x81) - triton_mm_1303 0.0913 ms 100.0% - triton_mm_1305 0.0941 ms 97.0% - triton_mm_1310 0.1024 ms 89.2% - triton_mm_1306 0.1032 ms 88.5% - triton_mm_1304 0.1063 ms 85.9% - triton_mm_1307 0.1226 ms 74.5% - triton_mm_1311 0.1240 ms 73.7% - triton_mm_1309 0.1309 ms 69.7% - triton_mm_1302 0.1377 ms 66.3% - triton_mm_1308 0.1441 ms 63.4% -SingleProcess AUTOTUNE takes 6.6387 seconds -AUTOTUNE addmm(32000x320, 32000x1024, 1024x320) - triton_mm_1316 0.1288 ms 100.0% - bias_addmm 0.1451 ms 88.8% - triton_mm_1318 0.1492 ms 86.3% - triton_mm_1315 0.1502 ms 85.7% - addmm 0.1558 ms 82.6% - triton_mm_1314 0.1653 ms 77.9% - triton_mm_1317 0.1708 ms 75.4% - triton_mm_1322 0.1740 ms 74.0% - triton_mm_1321 0.2398 ms 53.7% - triton_mm_1324 0.2614 ms 49.3% -SingleProcess AUTOTUNE takes 5.5531 seconds -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -[2023-12-12 17:14:54,209] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (8) -[2023-12-12 17:14:54,209] torch._dynamo.convert_frame: [WARNING] function: 'resume_in_detector_postprocess' (/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/postprocessing.py:45) -[2023-12-12 17:14:54,209] torch._dynamo.convert_frame: [WARNING] last reason: L['scale_x'] == 0.5337781484570475 # self.tensor[:, 0::2] *= scale_x # miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/structures/boxes.py:275 in scale -[2023-12-12 17:14:54,209] torch._dynamo.convert_frame: [WARNING] To log all recompilation reasons, use TORCH_LOGS="recompiles". -[2023-12-12 17:14:54,209] torch._dynamo.convert_frame: [WARNING] To diagnose recompilation issues, see https://pytorch.org/docs/master/compile/troubleshooting.html. - running benchmark: 0%| | 0/30 [00:00 - async_compile.wait(globals()) - File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 2470, in wait - scope[key] = result.result() - File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 2313, in result - self.future.result() - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 458, in result - return self.__get_result() - File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result - raise self._exception -torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised: -CompilationError: at 14:40: xnumel = 196 - yoffset = tl.program_id(1).to(tl.int64) * YBLOCK - yindex = yoffset + tl.arange(0, YBLOCK)[None, :].to(tl.int64) - ymask = yindex < ynumel - xoffset = tl.program_id(0).to(tl.int64) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:, None].to(tl.int64) - xmask = xindex < xnumel - x2 = xindex - y3 = yindex - y0 = yindex % 1024 - y1 = (yindex // 1024) - tmp0 = tl.load(in_ptr0 + (x2 + (196*y3)), xmask, eviction_policy='evict_last').to(tl.float32) - ^ -ValueError('numel (262144) exceeds triton maximum tensor numel (131072)') - -Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information - - -You can suppress this exception and fall back to eager by setting: - import torch._dynamo - torch._dynamo.config.suppress_errors = True - -Run failed with return code: 255 -Output: None -Error: None - loading model: 0it [00:00, ?it/s]detectron2_maskrcnn_r_101_fpn -WARNING:common:Model detectron2_maskrcnn_r_101_fpn does not support bfloat16, running with amp instead - loading model: 0it [00:06, ?it/s] -WARNING:common:Model detectron2_maskrcnn_r_101_fpn does not support bfloat16, running with amp instead -cuda eval detectron2_maskrcnn_r_101_fpn baseline-bs32 -WARNING:common:Model detectron2_maskrcnn_r_101_fpn does not support bfloat16, running with amp instead -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['non-cuda device in graph'] -[2023-12-12 17:40:18,830] [30/0_1] torch._inductor.utils: [WARNING] DeviceCopy in input program -skipping cudagraphs due to ['non-cuda device in graph'] -skipping cudagraphs due to ['non-cuda device in graph'] -[2023-12-12 17:41:27,948] [30/1_1] torch._inductor.utils: [WARNING] DeviceCopy in input program -skipping cudagraphs due to ['non-cuda device in graph'] -AUTOTUNE convolution(957x256x14x14, 256x256x3x3) - convolution 1.2820 ms 100.0% - triton_convolution_1332 4.0781 ms 31.4% - triton_convolution_1327 4.1190 ms 31.1% - triton_convolution_1330 4.6714 ms 27.4% - triton_convolution_1329 5.5829 ms 23.0% - triton_convolution_1326 6.5548 ms 19.6% - triton_convolution_1331 8.8159 ms 14.5% - triton_convolution_1328 12.4215 ms 10.3% -SingleProcess AUTOTUNE takes 5.6736 seconds -AUTOTUNE convolution(957x256x28x28, 80x256x1x1) - triton_convolution_1354 0.8452 ms 100.0% - triton_convolution_1355 0.9540 ms 88.6% - triton_convolution_1359 0.9582 ms 88.2% - triton_convolution_1357 1.0138 ms 83.4% - convolution 1.0945 ms 77.2% - triton_convolution_1358 1.1035 ms 76.6% - triton_convolution_1360 1.1451 ms 73.8% - triton_convolution_1356 1.7224 ms 49.1% - conv1x1_via_mm 3.6812 ms 23.0% -SingleProcess AUTOTUNE takes 4.8280 seconds -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['non-cuda device in graph'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['non-cuda device in graph'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['non-cuda device in graph'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -skipping cudagraphs due to ['mutated inputs'] -[2023-12-12 17:43:12,768] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (8) -[2023-12-12 17:43:12,768] torch._dynamo.convert_frame: [WARNING] function: 'resume_in_paste_masks_in_image' (/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/mask_ops.py:123) -[2023-12-12 17:43:12,768] torch._dynamo.convert_frame: [WARNING] last reason: L['N'] == 36 # num_chunks <= N # miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/mask_ops.py:125 in resume_in_paste_masks_in_image -[2023-12-12 17:43:12,768] torch._dynamo.convert_frame: [WARNING] To log all recompilation reasons, use TORCH_LOGS="recompiles". -[2023-12-12 17:43:12,768] torch._dynamo.convert_frame: [WARNING] To diagnose recompilation issues, see https://pytorch.org/docs/master/compile/troubleshooting.html. -skipping cudagraphs due to ['mutated inputs'] -[2023-12-12 17:43:22,978] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (8) -[2023-12-12 17:43:22,978] torch._dynamo.convert_frame: [WARNING] function: 'resume_in_detector_postprocess' (/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/postprocessing.py:45) -[2023-12-12 17:43:22,978] torch._dynamo.convert_frame: [WARNING] last reason: L['scale_x'] == 0.5337781484570475 # self.tensor[:, 0::2] *= scale_x # miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/structures/boxes.py:275 in scale -[2023-12-12 17:43:22,978] torch._dynamo.convert_frame: [WARNING] To log all recompilation reasons, use TORCH_LOGS="recompiles". -[2023-12-12 17:43:22,978] torch._dynamo.convert_frame: [WARNING] To diagnose recompilation issues, see https://pytorch.org/docs/master/compile/troubleshooting.html. - running benchmark: 0%| | 0/30 [00:00 /home/cdhernandez/local/pytorch/torch/_inductor/decomposition.py(221)mm() --> if config.coordinate_descent_tuning: -(Pdb) \ No newline at end of file diff --git a/log2.log b/log2.log new file mode 100644 index 0000000000..d3a93dc3c5 --- /dev/null +++ b/log2.log @@ -0,0 +1,199 @@ +BERT_pytorch + loading model: 0it [00:00, ?it/s][W Module.cpp:156] symbolizing C++ stack trace for exception; if this hangs, rerun with TORCH_DISABLE_ADDR2LINE=1... + + loading model: 0it [00:34, ?it/s] +SQNR ['35.0'] +BERT_pytorch batchsize 128 +cuda eval BERT_pytorch int8dynamic-epi +AUTOTUNE int_mm(16384x768, 768x768, 16384x768) + triton_mm_9 0.1026 ms 100.0% + triton_mm_8 0.1160 ms 88.4% + triton_mm_1 0.1164 ms 88.1% + triton_mm_2 0.1371 ms 74.8% + triton_mm_3 0.1390 ms 73.8% + triton_mm_0 0.1524 ms 67.3% + triton_mm_7 0.1589 ms 64.5% + triton_mm_4 0.1595 ms 64.3% + triton_mm_10 0.1731 ms 59.3% + triton_mm_6 0.3018 ms 34.0% +SingleProcess AUTOTUNE takes 2.0636 seconds +AUTOTUNE bmm(1536x128x64, 1536x64x128) + triton_bmm_30 0.0725 ms 100.0% + triton_bmm_23 0.0739 ms 98.2% + triton_bmm_24 0.0754 ms 96.2% + triton_bmm_25 0.0771 ms 94.1% + triton_bmm_26 0.0790 ms 91.9% + bmm 0.0824 ms 88.0% + triton_bmm_22 0.0836 ms 86.8% + triton_bmm_32 0.0843 ms 86.1% + triton_bmm_29 0.0916 ms 79.2% + triton_bmm_33 0.0951 ms 76.3% +SingleProcess AUTOTUNE takes 1.6065 seconds +AUTOTUNE bmm(1536x128x128, 1536x128x64) + triton_bmm_47 0.0775 ms 100.0% + triton_bmm_46 0.0787 ms 98.4% + triton_bmm_49 0.0789 ms 98.2% + triton_bmm_53 0.0791 ms 97.9% + triton_bmm_48 0.0797 ms 97.1% + triton_bmm_52 0.0814 ms 95.1% + triton_bmm_45 0.0816 ms 95.0% + triton_bmm_51 0.0831 ms 93.2% + bmm 0.0863 ms 89.7% + triton_bmm_55 0.0877 ms 88.4% +SingleProcess AUTOTUNE takes 1.5922 seconds +AUTOTUNE int_mm(16384x768, 768x3072, 16384x3072) + triton_mm_77 0.3461 ms 100.0% + triton_mm_69 0.4061 ms 85.2% + triton_mm_76 0.4309 ms 80.3% + triton_mm_70 0.4879 ms 70.9% + triton_mm_71 0.5020 ms 69.0% + triton_mm_68 0.5176 ms 66.9% + triton_mm_75 0.5216 ms 66.4% + triton_mm_72 0.5839 ms 59.3% + triton_mm_78 0.6185 ms 56.0% + triton_mm_74 1.1828 ms 29.3% +SingleProcess AUTOTUNE takes 1.5006 seconds +AUTOTUNE int_mm(16384x3072, 3072x768, 16384x768) + triton_mm_88 0.2087 ms 100.0% + triton_mm_89 0.2860 ms 73.0% + triton_mm_87 0.3379 ms 61.8% + triton_mm_80 0.3466 ms 60.2% + triton_mm_81 0.3612 ms 57.8% + triton_mm_82 0.3898 ms 53.5% + triton_mm_83 0.4113 ms 50.7% + triton_mm_86 0.4154 ms 50.2% + triton_mm_79 0.4978 ms 41.9% + triton_mm_85 1.0247 ms 20.4% +SingleProcess AUTOTUNE takes 1.4745 seconds + running benchmark: 0%| | 0/30 [00:00 + run() + File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run + benchmark.run(bm_args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run + main(TorchBenchmarkRunner(), original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main + process_entry(0, runner, original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry + return maybe_fresh_cache( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner + return fn(*args, **kwargs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 383, in load_model + module = importlib.import_module(c) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/importlib/__init__.py", line 126, in import_module + return _bootstrap._gcd_import(name[level:], package, level) + File "", line 1050, in _gcd_import + File "", line 1027, in _find_and_load + File "", line 1006, in _find_and_load_unlocked + File "", line 688, in _load_unlocked + File "", line 883, in exec_module + File "", line 241, in _call_with_frames_removed + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/__init__.py", line 7, in + from .data.dlrm_dataloader import get_dataloader + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/data/dlrm_dataloader.py", line 13, in + from torchrec.datasets.criteo import ( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/__init__.py", line 8, in + import torchrec.distributed # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/__init__.py", line 36, in + from torchrec.distributed.model_parallel import DistributedModelParallel # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 21, in + from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/__init__.py", line 22, in + from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/planners.py", line 19, in + from torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/constants.py", line 10, in + from torchrec.distributed.embedding_types import EmbeddingComputeKernel + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/embedding_types.py", line 14, in + from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 23, in + from . import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py", line 19, in + torch.ops.fbgemm.jagged_2d_to_dense, + File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 822, in __getattr__ + raise AttributeError( +AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense' +Run failed with return code: 1 +Output: None +Error: None + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +BERT_pytorch +cuda eval BERT_pytorch int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for BERT_pytorch. Setting accuracy check to cosine +AUTOTUNE int_mm(128x768, 768x768, 128x768) + triton_mm_6 0.0127 ms 100.0% + triton_mm_8 0.0127 ms 100.0% + triton_mm_5 0.0130 ms 97.4% + triton_mm_10 0.0150 ms 84.6% + triton_mm_3 0.0173 ms 73.1% + triton_mm_4 0.0173 ms 73.1% + triton_mm_1 0.0196 ms 64.6% + triton_mm_2 0.0196 ms 64.5% + triton_mm_0 0.0228 ms 55.5% + triton_mm_9 0.0245 ms 51.8% +SingleProcess AUTOTUNE takes 1.4679 seconds +AUTOTUNE bmm(12x128x64, 12x64x128) + triton_bmm_31 0.0073 ms 100.0% + triton_bmm_27 0.0076 ms 96.2% + triton_bmm_22 0.0076 ms 95.8% + triton_bmm_24 0.0078 ms 93.5% + triton_bmm_30 0.0080 ms 92.0% + triton_bmm_28 0.0081 ms 90.9% + triton_bmm_25 0.0081 ms 90.5% + triton_bmm_26 0.0081 ms 90.5% + triton_bmm_33 0.0082 ms 89.8% + bmm 0.0082 ms 89.1% +SingleProcess AUTOTUNE takes 1.9117 seconds +AUTOTUNE bmm(12x128x128, 12x128x64) + triton_bmm_48 0.0081 ms 100.0% + triton_bmm_51 0.0082 ms 98.8% + triton_bmm_53 0.0083 ms 97.9% + triton_bmm_50 0.0084 ms 96.9% + triton_bmm_49 0.0085 ms 94.8% + triton_bmm_54 0.0085 ms 94.8% + triton_bmm_46 0.0086 ms 93.7% + triton_bmm_47 0.0087 ms 92.7% + bmm 0.0091 ms 89.1% + triton_bmm_45 0.0097 ms 83.5% +SingleProcess AUTOTUNE takes 1.7105 seconds +AUTOTUNE int_mm(128x768, 768x3072, 128x3072) + triton_mm_76 0.0141 ms 100.0% + triton_mm_78 0.0161 ms 87.9% + triton_mm_73 0.0172 ms 82.5% + triton_mm_72 0.0175 ms 81.0% + triton_mm_74 0.0180 ms 78.5% + triton_mm_71 0.0181 ms 78.1% + triton_mm_70 0.0201 ms 70.3% + triton_mm_69 0.0205 ms 69.0% + triton_mm_68 0.0228 ms 61.9% + triton_mm_77 0.0246 ms 57.6% +SingleProcess AUTOTUNE takes 1.4610 seconds +AUTOTUNE int_mm(128x3072, 3072x768, 128x768) + triton_mm_84 0.0287 ms 100.0% + triton_mm_85 0.0290 ms 98.9% + triton_mm_89 0.0295 ms 97.3% + triton_mm_87 0.0304 ms 94.3% + triton_mm_82 0.0427 ms 67.1% + triton_mm_83 0.0432 ms 66.4% + triton_mm_88 0.0518 ms 55.3% + triton_mm_81 0.0541 ms 53.0% + triton_mm_80 0.0546 ms 52.5% + triton_mm_79 0.0732 ms 39.2% +SingleProcess AUTOTUNE takes 1.4071 seconds +pass-sqnr-37.176 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:05, ?it/s] +Background_Matting +cuda eval Background_Matting int8dynamic-bs1-acc +pass_due_to_skip + loading model: 0it [00:00, ?it/s]WARNING:common:Model DALLE2_pytorch does not support bfloat16, running with amp instead + loading model: 0it [00:12, ?it/s] +WARNING:common:Model DALLE2_pytorch does not support bfloat16, running with amp instead +DALLE2_pytorch +cuda eval DALLE2_pytorch int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for DALLE2_pytorch. Setting accuracy check to cosine +WARNING:common:Model DALLE2_pytorch does not support bfloat16, running with amp instead +AUTOTUNE mm(154x512, 512x1536) + triton_mm_8 0.0116 ms 100.0% + mm 0.0116 ms 99.7% + triton_mm_9 0.0132 ms 87.6% + triton_mm_3 0.0134 ms 86.4% + triton_mm_4 0.0135 ms 85.5% + triton_mm_6 0.0141 ms 81.9% + triton_mm_5 0.0142 ms 81.1% + triton_mm_1 0.0152 ms 76.0% + triton_mm_2 0.0157 ms 73.4% + triton_mm_0 0.0205 ms 56.2% +SingleProcess AUTOTUNE takes 1.9569 seconds +[2023-12-12 21:12:05,984] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +AUTOTUNE int_mm(154x512, 512x512, 154x512) + triton_mm_18 0.0110 ms 100.0% + triton_mm_17 0.0110 ms 99.7% + triton_mm_20 0.0115 ms 95.6% + triton_mm_15 0.0152 ms 72.6% + triton_mm_16 0.0152 ms 72.4% + triton_mm_13 0.0164 ms 67.1% + triton_mm_12 0.0169 ms 65.2% + triton_mm_14 0.0170 ms 64.9% + triton_mm_22 0.0260 ms 42.4% + triton_mm_19 0.0272 ms 40.4% +SingleProcess AUTOTUNE takes 1.4754 seconds +AUTOTUNE int_mm(154x512, 512x2048, 154x2048) + triton_mm_31 0.0119 ms 100.0% + triton_mm_26 0.0148 ms 80.0% + triton_mm_29 0.0149 ms 79.6% + triton_mm_28 0.0151 ms 78.9% + triton_mm_27 0.0156 ms 76.1% + triton_mm_24 0.0172 ms 69.0% + triton_mm_25 0.0174 ms 68.2% + triton_mm_23 0.0177 ms 67.0% + triton_mm_33 0.0267 ms 44.5% + triton_mm_30 0.0275 ms 43.2% +SingleProcess AUTOTUNE takes 1.5254 seconds +AUTOTUNE int_mm(154x2048, 2048x512, 154x512) + triton_mm_40 0.0213 ms 100.0% + triton_mm_39 0.0218 ms 97.8% + triton_mm_42 0.0223 ms 95.6% + triton_mm_37 0.0302 ms 70.6% + triton_mm_38 0.0303 ms 70.3% + triton_mm_36 0.0395 ms 53.9% + triton_mm_35 0.0396 ms 53.9% + triton_mm_44 0.0436 ms 48.9% + triton_mm_43 0.0468 ms 45.5% + triton_mm_34 0.0483 ms 44.1% +SingleProcess AUTOTUNE takes 1.4480 seconds +[2023-12-12 21:12:10,942] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-12 21:12:11,254] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-12 21:12:11,558] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-12 21:12:11,866] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-12 21:12:12,169] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-12 21:12:12,469] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-12 21:12:13,113] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-12 21:12:13,416] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-12 21:12:13,724] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-12 21:12:14,027] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-12 21:12:14,324] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +AUTOTUNE mm(2x512, 512x512) + triton_mm_545 0.0099 ms 100.0% + triton_mm_546 0.0100 ms 98.4% + mm 0.0103 ms 96.0% + triton_mm_548 0.0103 ms 96.0% + triton_mm_549 0.0106 ms 92.8% + triton_mm_544 0.0110 ms 89.3% + triton_mm_543 0.0113 ms 87.5% + triton_mm_542 0.0123 ms 80.0% + triton_mm_541 0.0130 ms 75.9% + triton_mm_540 0.0188 ms 52.4% +SingleProcess AUTOTUNE takes 1.7053 seconds +AUTOTUNE int_mm(2x512, 512x1024, 2x1024) + triton_mm_562 0.0090 ms 100.0% + triton_mm_560 0.0097 ms 92.1% + triton_mm_558 0.0100 ms 89.5% + triton_mm_557 0.0102 ms 88.1% + triton_mm_556 0.0105 ms 85.4% + triton_mm_561 0.0105 ms 85.4% + triton_mm_554 0.0128 ms 69.8% + triton_mm_553 0.0133 ms 67.1% + triton_mm_552 0.0164 ms 54.5% + triton_mm_555 0.0170 ms 52.7% +SingleProcess AUTOTUNE takes 1.4649 seconds +AUTOTUNE int_mm(2x1024, 1024x1024, 2x1024) + triton_mm_573 0.0118 ms 100.0% + triton_mm_572 0.0131 ms 90.7% + triton_mm_568 0.0136 ms 86.9% + triton_mm_571 0.0139 ms 85.3% + triton_mm_569 0.0144 ms 82.2% + triton_mm_567 0.0156 ms 75.8% + triton_mm_565 0.0200 ms 59.2% + triton_mm_564 0.0213 ms 55.6% + triton_mm_563 0.0287 ms 41.2% + triton_mm_566 0.0300 ms 39.4% +SingleProcess AUTOTUNE takes 1.4571 seconds +AUTOTUNE int_mm(2x1024, 1024x512, 2x512) + triton_mm_584 0.0118 ms 100.0% + triton_mm_582 0.0128 ms 92.2% + triton_mm_583 0.0130 ms 91.1% + triton_mm_579 0.0133 ms 88.5% + triton_mm_580 0.0133 ms 88.5% + triton_mm_578 0.0144 ms 81.8% + triton_mm_576 0.0198 ms 59.6% + triton_mm_575 0.0208 ms 56.8% + triton_mm_574 0.0275 ms 43.0% + triton_mm_577 0.0291 ms 40.6% +SingleProcess AUTOTUNE takes 1.4587 seconds +AUTOTUNE int_mm(520x512, 512x128, 520x128) + triton_mm_590 0.0110 ms 100.0% + triton_mm_591 0.0112 ms 98.6% + triton_mm_593 0.0115 ms 96.1% + triton_mm_588 0.0152 ms 72.8% + triton_mm_589 0.0154 ms 71.6% + triton_mm_587 0.0164 ms 67.3% + triton_mm_585 0.0169 ms 65.2% + triton_mm_594 0.0172 ms 64.2% + triton_mm_586 0.0172 ms 64.1% + triton_mm_592 0.0269 ms 41.0% +SingleProcess AUTOTUNE takes 1.4739 seconds +AUTOTUNE int_mm(520x512, 512x512, 520x512) + triton_mm_604 0.0120 ms 100.0% + triton_mm_601 0.0142 ms 84.8% + triton_mm_602 0.0142 ms 84.8% + triton_mm_599 0.0150 ms 80.1% + triton_mm_600 0.0154 ms 78.1% + triton_mm_598 0.0164 ms 73.4% + triton_mm_597 0.0164 ms 73.2% + triton_mm_596 0.0167 ms 72.1% + triton_mm_603 0.0273 ms 44.2% + triton_mm_606 0.0289 ms 41.6% +SingleProcess AUTOTUNE takes 1.4827 seconds +AUTOTUNE bmm(2x2080x64, 2x64x261) + triton_bmm_609 0.0106 ms 100.0% + triton_bmm_608 0.0108 ms 97.9% + triton_bmm_607 0.0109 ms 97.6% + triton_bmm_611 0.0111 ms 96.0% + triton_bmm_616 0.0116 ms 91.5% + triton_bmm_610 0.0118 ms 90.0% + triton_bmm_614 0.0118 ms 90.0% + triton_bmm_617 0.0119 ms 89.2% + triton_bmm_618 0.0121 ms 88.1% + triton_bmm_613 0.0145 ms 73.5% +SingleProcess AUTOTUNE takes 1.7109 seconds +AUTOTUNE bmm(2x2080x261, 2x261x64) + triton_bmm_628 0.0151 ms 100.0% + triton_bmm_625 0.0165 ms 91.3% + triton_bmm_627 0.0165 ms 91.3% + triton_bmm_622 0.0167 ms 90.1% + triton_bmm_623 0.0187 ms 80.7% + triton_bmm_630 0.0200 ms 75.2% + bmm 0.0202 ms 74.8% + triton_bmm_626 0.0205 ms 73.6% + triton_bmm_620 0.0213 ms 70.6% + triton_bmm_624 0.0213 ms 70.6% +SingleProcess AUTOTUNE takes 1.7092 seconds +AUTOTUNE int_mm(520x512, 512x4096, 520x4096) + triton_mm_650 0.0263 ms 100.0% + triton_mm_643 0.0266 ms 99.0% + triton_mm_645 0.0284 ms 92.9% + triton_mm_644 0.0305 ms 86.4% + triton_mm_642 0.0312 ms 84.3% + triton_mm_651 0.0313 ms 84.2% + triton_mm_646 0.0325 ms 81.0% + triton_mm_652 0.0332 ms 79.4% + triton_mm_648 0.0482 ms 54.6% + triton_mm_647 0.0487 ms 54.0% +SingleProcess AUTOTUNE takes 1.4592 seconds +AUTOTUNE int_mm(520x2048, 2048x512, 520x512) + triton_mm_661 0.0229 ms 100.0% + triton_mm_656 0.0303 ms 75.5% + triton_mm_657 0.0308 ms 74.2% + triton_mm_659 0.0309 ms 73.9% + triton_mm_658 0.0315 ms 72.6% + triton_mm_654 0.0396 ms 57.8% + triton_mm_655 0.0398 ms 57.5% + triton_mm_653 0.0460 ms 49.8% + triton_mm_663 0.0465 ms 49.2% + triton_mm_662 0.0478 ms 47.8% +SingleProcess AUTOTUNE takes 1.4595 seconds +AUTOTUNE bmm(2x1x512, 2x512x1) + triton_bmm_1073 0.0082 ms 100.0% + triton_bmm_1072 0.0084 ms 97.0% + triton_bmm_1075 0.0084 ms 97.0% + triton_bmm_1074 0.0088 ms 93.3% + triton_bmm_1071 0.0107 ms 76.9% + triton_bmm_1070 0.0144 ms 56.8% + triton_bmm_1077 0.0175 ms 46.9% + triton_bmm_1076 0.0177 ms 46.2% + bmm 0.0934 ms 8.8% +SingleProcess AUTOTUNE takes 1.2156 seconds +skipping cudagraphs due to ['non-cuda device in graph'] +AUTOTUNE mm(77x512, 512x1536) + triton_mm_1084 0.0108 ms 100.0% + mm 0.0113 ms 95.8% + triton_mm_1087 0.0113 ms 95.8% + triton_mm_1083 0.0115 ms 94.2% + triton_mm_1086 0.0115 ms 94.2% + triton_mm_1081 0.0134 ms 81.1% + triton_mm_1082 0.0134 ms 80.9% + triton_mm_1079 0.0153 ms 70.8% + triton_mm_1080 0.0157 ms 69.2% + triton_mm_1078 0.0209 ms 52.0% +SingleProcess AUTOTUNE takes 4.9112 seconds +[2023-12-12 21:13:13,919] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +AUTOTUNE int_mm(77x512, 512x512, 77x512) + triton_mm_1095 0.0113 ms 100.0% + triton_mm_1098 0.0114 ms 99.0% + triton_mm_1096 0.0115 ms 98.6% + triton_mm_1093 0.0149 ms 75.7% + triton_mm_1094 0.0152 ms 74.6% + triton_mm_1091 0.0164 ms 68.9% + triton_mm_1092 0.0168 ms 67.5% + triton_mm_1090 0.0169 ms 67.1% + triton_mm_1100 0.0172 ms 65.6% + triton_mm_1097 0.0265 ms 42.7% +SingleProcess AUTOTUNE takes 6.7791 seconds +AUTOTUNE int_mm(77x512, 512x2048, 77x2048) + triton_mm_1107 0.0120 ms 100.0% + triton_mm_1109 0.0120 ms 100.0% + triton_mm_1104 0.0149 ms 80.4% + triton_mm_1105 0.0157 ms 76.2% + triton_mm_1106 0.0159 ms 75.4% + triton_mm_1111 0.0167 ms 71.8% + triton_mm_1103 0.0168 ms 71.2% + triton_mm_1102 0.0169 ms 70.7% + triton_mm_1101 0.0178 ms 67.3% + triton_mm_1108 0.0278 ms 43.1% +SingleProcess AUTOTUNE takes 6.7169 seconds +AUTOTUNE int_mm(77x2048, 2048x512, 77x512) + triton_mm_1117 0.0218 ms 100.0% + triton_mm_1118 0.0221 ms 98.8% + triton_mm_1120 0.0231 ms 94.5% + triton_mm_1122 0.0262 ms 83.3% + triton_mm_1115 0.0298 ms 73.3% + triton_mm_1116 0.0311 ms 70.2% + triton_mm_1114 0.0396 ms 55.2% + triton_mm_1113 0.0401 ms 54.4% + triton_mm_1121 0.0465 ms 46.9% + triton_mm_1112 0.0503 ms 43.4% +SingleProcess AUTOTUNE takes 6.4149 seconds +[2023-12-12 21:13:34,260] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-12 21:13:34,535] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-12 21:13:34,818] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-12 21:13:35,104] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-12 21:13:35,392] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-12 21:13:35,672] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-12 21:13:35,953] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-12 21:13:36,241] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-12 21:13:36,520] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-12 21:13:36,803] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-12 21:13:37,086] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +AUTOTUNE mm(1x512, 512x512) + mm 0.0077 ms 100.0% + triton_mm_1624 0.0095 ms 81.2% + triton_mm_1626 0.0098 ms 79.3% + triton_mm_1623 0.0100 ms 77.3% + triton_mm_1622 0.0105 ms 73.8% + triton_mm_1627 0.0105 ms 73.8% + triton_mm_1621 0.0118 ms 65.6% + triton_mm_1620 0.0129 ms 60.2% + triton_mm_1619 0.0134 ms 57.8% + triton_mm_1618 0.0182 ms 42.5% +SingleProcess AUTOTUNE takes 4.0715 seconds +skipping cudagraphs due to ['non-cuda device in graph'] +AUTOTUNE int_mm(1x128, 128x512, 1x512) + triton_mm_1634 0.0074 ms 100.0% + triton_mm_1636 0.0074 ms 100.0% + triton_mm_1638 0.0077 ms 96.7% + triton_mm_1640 0.0077 ms 96.7% + triton_mm_1635 0.0077 ms 95.9% + triton_mm_1631 0.0082 ms 90.3% + triton_mm_1630 0.0084 ms 88.5% + triton_mm_1632 0.0084 ms 87.9% + triton_mm_1633 0.0089 ms 83.8% + triton_mm_1639 0.0090 ms 82.7% +SingleProcess AUTOTUNE takes 3.2873 seconds +AUTOTUNE int_mm(1x512, 512x256, 1x256) + triton_mm_1646 0.0092 ms 100.0% + triton_mm_1651 0.0097 ms 94.7% + triton_mm_1647 0.0099 ms 92.9% + triton_mm_1649 0.0102 ms 90.0% + triton_mm_1650 0.0105 ms 87.8% + triton_mm_1645 0.0105 ms 87.5% + triton_mm_1643 0.0123 ms 75.0% + triton_mm_1642 0.0133 ms 69.2% + triton_mm_1641 0.0151 ms 60.9% + triton_mm_1644 0.0164 ms 56.3% +SingleProcess AUTOTUNE takes 3.6591 seconds +AUTOTUNE int_mm(1x512, 512x512, 1x512) + triton_mm_1662 0.0092 ms 100.0% + triton_mm_1658 0.0100 ms 92.0% + triton_mm_1657 0.0100 ms 91.7% + triton_mm_1656 0.0102 ms 89.7% + triton_mm_1660 0.0103 ms 89.4% + triton_mm_1661 0.0108 ms 85.0% + triton_mm_1654 0.0123 ms 74.5% + triton_mm_1653 0.0136 ms 67.7% + triton_mm_1652 0.0156 ms 58.8% + triton_mm_1655 0.0167 ms 55.1% +SingleProcess AUTOTUNE takes 3.4849 seconds +AUTOTUNE int_mm(77x512, 512x128, 77x128) + triton_mm_1668 0.0107 ms 100.0% + triton_mm_1671 0.0113 ms 94.6% + triton_mm_1669 0.0113 ms 94.2% + triton_mm_1666 0.0149 ms 71.5% + triton_mm_1667 0.0152 ms 70.3% + triton_mm_1665 0.0167 ms 63.7% + triton_mm_1663 0.0170 ms 62.8% + triton_mm_1664 0.0170 ms 62.8% + triton_mm_1672 0.0172 ms 61.8% + triton_mm_1670 0.0267 ms 39.9% +SingleProcess AUTOTUNE takes 4.8761 seconds +AUTOTUNE convolution(1x3x128x128, 64x3x3x3) + convolution 0.0115 ms 100.0% + triton_convolution_1677 0.0155 ms 74.3% + triton_convolution_1676 0.0170 ms 67.6% + triton_convolution_1673 0.0195 ms 58.9% + triton_convolution_1678 0.0197 ms 58.2% + triton_convolution_1675 0.0206 ms 55.8% + triton_convolution_1674 0.0427 ms 26.9% +SingleProcess AUTOTUNE takes 3.2879 seconds +AUTOTUNE convolution(1x3x128x128, 32x3x7x7) + convolution 0.0228 ms 100.0% + triton_convolution_1683 0.0425 ms 53.7% + triton_convolution_1679 0.0469 ms 48.7% + triton_convolution_1682 0.0481 ms 47.4% + triton_convolution_1684 0.0591 ms 38.6% + triton_convolution_1681 0.0719 ms 31.7% + triton_convolution_1680 0.0832 ms 27.4% +SingleProcess AUTOTUNE takes 2.4553 seconds +AUTOTUNE convolution(1x3x128x128, 32x3x15x15) + convolution 0.0733 ms 100.0% + triton_convolution_1689 0.1702 ms 43.1% + triton_convolution_1688 0.1862 ms 39.4% + triton_convolution_1685 0.1890 ms 38.8% + triton_convolution_1690 0.2446 ms 30.0% + triton_convolution_1687 0.3008 ms 24.4% + triton_convolution_1686 0.3492 ms 21.0% +SingleProcess AUTOTUNE takes 2.6112 seconds +AUTOTUNE convolution(1x128x128x128, 128x128x3x3) + convolution 0.0375 ms 100.0% + triton_convolution_1696 0.2068 ms 18.2% + triton_convolution_1697 0.2168 ms 17.3% + triton_convolution_1694 0.2292 ms 16.4% + triton_convolution_1695 0.2587 ms 14.5% + triton_convolution_1691 0.2658 ms 14.1% + triton_convolution_1692 0.3224 ms 11.6% + triton_convolution_1693 0.9626 ms 3.9% +SingleProcess AUTOTUNE takes 4.0887 seconds +AUTOTUNE addmm(4096x128, 4096x512, 512x128) + triton_mm_1792 0.0141 ms 100.0% + triton_mm_1791 0.0142 ms 99.3% + triton_mm_1796 0.0149 ms 94.4% + triton_mm_1790 0.0166 ms 84.8% + triton_mm_1789 0.0169 ms 83.7% + triton_mm_1793 0.0181 ms 77.8% + addmm 0.0187 ms 75.4% + triton_mm_1794 0.0194 ms 72.7% + triton_mm_1797 0.0208 ms 68.0% + triton_mm_1788 0.0227 ms 62.2% +SingleProcess AUTOTUNE takes 5.3874 seconds +AUTOTUNE convolution(1x128x64x64, 128x128x3x3) + convolution 0.0183 ms 100.0% + triton_convolution_1805 0.0756 ms 24.2% + triton_convolution_1804 0.1057 ms 17.3% + triton_convolution_1803 0.1122 ms 16.3% + triton_convolution_1806 0.1228 ms 14.9% + triton_convolution_1800 0.1355 ms 13.5% + triton_convolution_1801 0.2376 ms 7.7% + triton_convolution_1802 0.4860 ms 3.8% +SingleProcess AUTOTUNE takes 3.8363 seconds +AUTOTUNE int_mm(6x128, 128x1024, 6x1024) + triton_mm_1851 0.0070 ms 100.0% + triton_mm_1847 0.0074 ms 94.4% + triton_mm_1849 0.0074 ms 94.4% + triton_mm_1853 0.0076 ms 91.6% + triton_mm_1848 0.0077 ms 90.8% + triton_mm_1845 0.0079 ms 88.6% + triton_mm_1844 0.0082 ms 85.5% + triton_mm_1852 0.0082 ms 85.2% + triton_mm_1843 0.0090 ms 77.9% + triton_mm_1846 0.0095 ms 73.6% +SingleProcess AUTOTUNE takes 3.8395 seconds +AUTOTUNE int_mm(4096x128, 128x512, 4096x512) + triton_mm_1862 0.0166 ms 100.0% + triton_mm_1854 0.0180 ms 92.4% + triton_mm_1856 0.0180 ms 92.4% + triton_mm_1855 0.0194 ms 85.8% + triton_mm_1857 0.0199 ms 83.6% + triton_mm_1858 0.0201 ms 82.8% + triton_mm_1860 0.0242 ms 68.9% + triton_mm_1859 0.0245 ms 67.8% + triton_mm_1861 0.0272 ms 61.3% + triton_mm_1864 0.0295 ms 56.5% +SingleProcess AUTOTUNE takes 5.5841 seconds +AUTOTUNE bmm(8x4096x64, 8x64x7) + triton_bmm_1866 0.0113 ms 100.0% + triton_bmm_1868 0.0113 ms 99.7% + triton_bmm_1873 0.0113 ms 99.7% + triton_bmm_1870 0.0114 ms 98.9% + triton_bmm_1872 0.0115 ms 98.1% + triton_bmm_1865 0.0117 ms 96.6% + triton_bmm_1867 0.0118 ms 95.7% + triton_bmm_1869 0.0118 ms 95.1% + triton_bmm_1876 0.0120 ms 93.9% + triton_bmm_1871 0.0120 ms 93.6% +SingleProcess AUTOTUNE takes 3.8358 seconds +AUTOTUNE bmm(8x4096x7, 8x7x64) + triton_bmm_1885 0.0092 ms 100.0% + triton_bmm_1878 0.0097 ms 95.1% + triton_bmm_1877 0.0098 ms 94.8% + triton_bmm_1884 0.0098 ms 94.8% + triton_bmm_1880 0.0098 ms 94.4% + triton_bmm_1883 0.0100 ms 92.9% + triton_bmm_1879 0.0102 ms 90.3% + triton_bmm_1881 0.0103 ms 89.8% + triton_bmm_1882 0.0103 ms 89.8% + triton_bmm_1887 0.0108 ms 85.5% +SingleProcess AUTOTUNE takes 3.2867 seconds +AUTOTUNE int_mm(4096x512, 512x128, 4096x128) + triton_mm_1896 0.0143 ms 100.0% + triton_mm_1892 0.0157 ms 91.0% + triton_mm_1891 0.0160 ms 89.4% + triton_mm_1897 0.0177 ms 81.0% + triton_mm_1889 0.0177 ms 80.8% + triton_mm_1890 0.0179 ms 79.8% + triton_mm_1893 0.0183 ms 78.1% + triton_mm_1894 0.0193 ms 74.3% + triton_mm_1888 0.0198 ms 72.1% + triton_mm_1895 0.0271 ms 52.8% +SingleProcess AUTOTUNE takes 6.3205 seconds +AUTOTUNE addmm(1024x256, 1024x512, 512x256) + triton_mm_1995 0.0122 ms 100.0% + triton_mm_1996 0.0131 ms 93.2% + triton_mm_1990 0.0134 ms 91.4% + triton_mm_1991 0.0139 ms 87.8% + triton_mm_1993 0.0140 ms 87.2% + triton_mm_1992 0.0145 ms 84.4% + triton_mm_1989 0.0152 ms 80.6% + triton_mm_1988 0.0155 ms 79.1% + addmm 0.0158 ms 77.2% + triton_mm_1987 0.0213 ms 57.4% +SingleProcess AUTOTUNE takes 5.4855 seconds +AUTOTUNE convolution(1x256x32x32, 256x256x3x3) + convolution 0.0196 ms 100.0% + triton_convolution_2003 0.1675 ms 11.7% + triton_convolution_2004 0.2201 ms 8.9% + triton_convolution_2002 0.2294 ms 8.5% + triton_convolution_2005 0.2941 ms 6.6% + triton_convolution_1999 0.5184 ms 3.8% + triton_convolution_2000 0.5746 ms 3.4% + triton_convolution_2001 0.9630 ms 2.0% +SingleProcess AUTOTUNE takes 4.3531 seconds +AUTOTUNE int_mm(1024x256, 256x512, 1024x512) + triton_mm_2061 0.0118 ms 100.0% + triton_mm_2055 0.0126 ms 93.9% + triton_mm_2056 0.0126 ms 93.7% + triton_mm_2057 0.0129 ms 91.7% + triton_mm_2053 0.0134 ms 88.3% + triton_mm_2054 0.0134 ms 88.1% + triton_mm_2058 0.0145 ms 81.5% + triton_mm_2059 0.0151 ms 78.0% + triton_mm_2060 0.0194 ms 61.0% + triton_mm_2062 0.0262 ms 45.1% +SingleProcess AUTOTUNE takes 7.8649 seconds +AUTOTUNE bmm(8x1024x64, 8x64x7) + triton_bmm_2065 0.0079 ms 100.0% + triton_bmm_2066 0.0079 ms 99.6% + triton_bmm_2069 0.0079 ms 99.6% + triton_bmm_2072 0.0079 ms 99.6% + triton_bmm_2071 0.0084 ms 93.9% + triton_bmm_2075 0.0084 ms 93.6% + triton_bmm_2067 0.0085 ms 93.2% + triton_bmm_2068 0.0085 ms 93.2% + triton_bmm_2070 0.0085 ms 93.2% + triton_bmm_2073 0.0085 ms 92.9% +SingleProcess AUTOTUNE takes 4.1193 seconds +AUTOTUNE bmm(8x1024x7, 8x7x64) + triton_bmm_2085 0.0072 ms 100.0% + triton_bmm_2077 0.0074 ms 97.0% + triton_bmm_2083 0.0074 ms 97.0% + triton_bmm_2086 0.0074 ms 97.0% + triton_bmm_2076 0.0078 ms 91.8% + triton_bmm_2078 0.0078 ms 91.8% + triton_bmm_2081 0.0079 ms 90.7% + triton_bmm_2079 0.0079 ms 90.3% + triton_bmm_2082 0.0079 ms 90.3% + triton_bmm_2084 0.0079 ms 90.3% +SingleProcess AUTOTUNE takes 3.4447 seconds +AUTOTUNE int_mm(1024x512, 512x256, 1024x256) + triton_mm_2095 0.0116 ms 100.0% + triton_mm_2092 0.0145 ms 80.0% + triton_mm_2093 0.0146 ms 79.6% + triton_mm_2090 0.0146 ms 79.2% + triton_mm_2091 0.0156 ms 74.0% + triton_mm_2089 0.0164 ms 70.6% + triton_mm_2087 0.0169 ms 68.4% + triton_mm_2088 0.0172 ms 67.3% + triton_mm_2094 0.0273 ms 42.5% + triton_mm_2096 0.0290 ms 40.0% +SingleProcess AUTOTUNE takes 7.5561 seconds +AUTOTUNE addmm(256x512, 256x1024, 1024x512) + triton_mm_2191 0.0145 ms 100.0% + triton_mm_2192 0.0147 ms 98.7% + triton_mm_2194 0.0165 ms 88.0% + triton_mm_2195 0.0170 ms 85.3% + triton_mm_2190 0.0185 ms 78.2% + triton_mm_2189 0.0186 ms 78.0% + addmm 0.0200 ms 72.5% + triton_mm_2188 0.0232 ms 62.6% + triton_mm_2187 0.0234 ms 62.0% + triton_mm_2186 0.0356 ms 40.7% +SingleProcess AUTOTUNE takes 5.8412 seconds +AUTOTUNE convolution(1x512x16x16, 512x512x3x3) + convolution 0.0244 ms 100.0% + triton_convolution_2202 0.4849 ms 5.0% + triton_convolution_2203 0.5004 ms 4.9% + triton_convolution_2204 0.6577 ms 3.7% + triton_convolution_2200 0.6579 ms 3.7% + triton_convolution_2201 0.6769 ms 3.6% + triton_convolution_2199 1.1457 ms 2.1% + triton_convolution_2198 1.2210 ms 2.0% +SingleProcess AUTOTUNE takes 4.2305 seconds +AUTOTUNE int_mm(1x512, 512x1024, 1x1024) + triton_mm_2215 0.0098 ms 100.0% + triton_mm_2210 0.0102 ms 95.6% + triton_mm_2213 0.0106 ms 92.4% + triton_mm_2211 0.0108 ms 90.5% + triton_mm_2209 0.0110 ms 88.7% + triton_mm_2214 0.0110 ms 88.7% + triton_mm_2207 0.0126 ms 77.6% + triton_mm_2206 0.0138 ms 70.6% + triton_mm_2205 0.0156 ms 62.4% + triton_mm_2208 0.0174 ms 56.0% +SingleProcess AUTOTUNE takes 3.5712 seconds +AUTOTUNE int_mm(256x512, 512x512, 256x512) + triton_mm_2258 0.0111 ms 100.0% + triton_mm_2257 0.0116 ms 95.3% + triton_mm_2260 0.0119 ms 92.9% + triton_mm_2256 0.0148 ms 75.1% + triton_mm_2255 0.0149 ms 74.2% + triton_mm_2252 0.0167 ms 66.4% + triton_mm_2254 0.0167 ms 66.4% + triton_mm_2253 0.0169 ms 65.3% + triton_mm_2259 0.0269 ms 41.1% + triton_mm_2262 0.0287 ms 38.5% +SingleProcess AUTOTUNE takes 7.3540 seconds +AUTOTUNE bmm(8x256x64, 8x64x7) + triton_bmm_2269 0.0067 ms 100.0% + triton_bmm_2264 0.0067 ms 99.5% + triton_bmm_2267 0.0072 ms 92.9% + triton_bmm_2271 0.0072 ms 92.9% + triton_bmm_2263 0.0074 ms 89.7% + triton_bmm_2268 0.0074 ms 89.7% + triton_bmm_2266 0.0075 ms 89.3% + triton_bmm_2265 0.0075 ms 88.5% + triton_bmm_2270 0.0077 ms 86.7% + triton_bmm_2274 0.0077 ms 86.7% +SingleProcess AUTOTUNE takes 3.8703 seconds +AUTOTUNE bmm(8x256x7, 8x7x64) + triton_bmm_2278 0.0067 ms 100.0% + triton_bmm_2280 0.0067 ms 100.0% + triton_bmm_2281 0.0067 ms 100.0% + triton_bmm_2285 0.0067 ms 100.0% + triton_bmm_2276 0.0067 ms 99.5% + triton_bmm_2283 0.0067 ms 99.5% + triton_bmm_2284 0.0072 ms 92.4% + triton_bmm_2275 0.0074 ms 90.2% + triton_bmm_2279 0.0074 ms 89.7% + triton_bmm_2282 0.0074 ms 89.7% +SingleProcess AUTOTUNE takes 3.3912 seconds +AUTOTUNE addmm(256x1024, 256x512, 512x1024) + triton_mm_2393 0.0120 ms 100.0% + triton_mm_2394 0.0129 ms 93.5% + triton_mm_2388 0.0136 ms 88.5% + triton_mm_2389 0.0136 ms 88.5% + triton_mm_2390 0.0140 ms 85.6% + triton_mm_2391 0.0140 ms 85.6% + triton_mm_2387 0.0161 ms 74.9% + triton_mm_2386 0.0162 ms 74.2% + addmm 0.0171 ms 70.5% + triton_mm_2385 0.0219 ms 55.0% +SingleProcess AUTOTUNE takes 5.3527 seconds +AUTOTUNE convolution(1x1024x16x16, 1024x1024x3x3) + convolution 0.0511 ms 100.0% + triton_convolution_2402 1.0552 ms 4.8% + triton_convolution_2401 1.2348 ms 4.1% + triton_convolution_2399 1.3037 ms 3.9% + triton_convolution_2403 1.4461 ms 3.5% + triton_convolution_2400 1.4536 ms 3.5% + triton_convolution_2398 2.5367 ms 2.0% + triton_convolution_2397 2.6322 ms 1.9% +SingleProcess AUTOTUNE takes 4.4800 seconds +AUTOTUNE int_mm(1x512, 512x2048, 1x2048) + triton_mm_2414 0.0100 ms 100.0% + triton_mm_2412 0.0102 ms 98.1% + triton_mm_2409 0.0105 ms 95.4% + triton_mm_2413 0.0105 ms 95.4% + triton_mm_2410 0.0110 ms 91.0% + triton_mm_2408 0.0113 ms 89.0% + triton_mm_2406 0.0135 ms 74.6% + triton_mm_2405 0.0147 ms 68.6% + triton_mm_2404 0.0164 ms 61.3% + triton_mm_2407 0.0180 ms 55.8% +SingleProcess AUTOTUNE takes 3.4746 seconds +AUTOTUNE int_mm(262x128, 128x1024, 262x1024) + triton_mm_2423 0.0095 ms 100.0% + triton_mm_2421 0.0103 ms 92.2% + triton_mm_2415 0.0105 ms 90.3% + triton_mm_2416 0.0108 ms 88.1% + triton_mm_2420 0.0109 ms 87.0% + triton_mm_2419 0.0110 ms 86.1% + triton_mm_2418 0.0113 ms 83.9% + triton_mm_2417 0.0116 ms 82.0% + triton_mm_2422 0.0166 ms 57.1% + triton_mm_2425 0.0292 ms 32.5% +SingleProcess AUTOTUNE takes 6.0614 seconds +AUTOTUNE int_mm(256x1024, 1024x512, 256x512) + triton_mm_2431 0.0146 ms 100.0% + triton_mm_2432 0.0151 ms 96.7% + triton_mm_2434 0.0155 ms 94.6% + triton_mm_2430 0.0200 ms 73.0% + triton_mm_2429 0.0201 ms 72.8% + triton_mm_2428 0.0245 ms 59.6% + triton_mm_2427 0.0248 ms 58.9% + triton_mm_2426 0.0272 ms 53.8% + triton_mm_2436 0.0352 ms 41.6% + triton_mm_2435 0.0355 ms 41.2% +SingleProcess AUTOTUNE takes 7.2916 seconds +AUTOTUNE bmm(8x256x64, 8x64x263) + triton_bmm_2439 0.0088 ms 100.0% + triton_bmm_2446 0.0090 ms 97.2% + triton_bmm_2438 0.0091 ms 96.5% + triton_bmm_2440 0.0094 ms 93.5% + triton_bmm_2437 0.0095 ms 92.6% + triton_bmm_2441 0.0095 ms 91.9% + triton_bmm_2447 0.0096 ms 91.6% + triton_bmm_2443 0.0096 ms 91.3% + triton_bmm_2448 0.0101 ms 86.4% + triton_bmm_2444 0.0109 ms 80.1% +SingleProcess AUTOTUNE takes 5.0196 seconds +AUTOTUNE bmm(8x256x263, 8x263x64) + triton_bmm_2458 0.0129 ms 100.0% + bmm 0.0149 ms 86.5% + triton_bmm_2455 0.0151 ms 85.6% + triton_bmm_2452 0.0156 ms 82.6% + triton_bmm_2457 0.0159 ms 81.1% + triton_bmm_2460 0.0182 ms 70.9% + triton_bmm_2453 0.0184 ms 70.3% + triton_bmm_2454 0.0192 ms 67.2% + triton_bmm_2450 0.0208 ms 62.2% + triton_bmm_2456 0.0213 ms 60.6% +SingleProcess AUTOTUNE takes 4.6948 seconds +AUTOTUNE int_mm(256x512, 512x1024, 256x1024) + triton_mm_2469 0.0116 ms 100.0% + triton_mm_2467 0.0142 ms 81.1% + triton_mm_2466 0.0143 ms 80.9% + triton_mm_2464 0.0151 ms 76.4% + triton_mm_2465 0.0154 ms 75.1% + triton_mm_2463 0.0164 ms 70.6% + triton_mm_2461 0.0167 ms 69.0% + triton_mm_2462 0.0170 ms 68.1% + triton_mm_2468 0.0266 ms 43.4% + triton_mm_2470 0.0290 ms 39.8% +SingleProcess AUTOTUNE takes 7.5199 seconds +AUTOTUNE int_mm(256x1024, 1024x64, 256x64) + triton_mm_2484 0.0146 ms 100.0% + triton_mm_2485 0.0146 ms 100.0% + triton_mm_2487 0.0151 ms 96.6% + triton_mm_2488 0.0152 ms 96.4% + triton_mm_2482 0.0180 ms 81.5% + triton_mm_2489 0.0196 ms 74.8% + triton_mm_2483 0.0203 ms 72.2% + triton_mm_2480 0.0213 ms 68.5% + triton_mm_2481 0.0241 ms 60.6% + triton_mm_2479 0.0267 ms 54.8% +SingleProcess AUTOTUNE takes 5.2344 seconds +AUTOTUNE bmm(1x4096x32, 1x32x257) + triton_bmm_2507 0.0098 ms 100.0% + triton_bmm_2501 0.0098 ms 99.7% + triton_bmm_2509 0.0101 ms 97.1% + triton_bmm_2512 0.0101 ms 97.1% + triton_bmm_2511 0.0102 ms 96.2% + triton_bmm_2504 0.0105 ms 93.0% + triton_bmm_2503 0.0107 ms 91.6% + triton_bmm_2508 0.0108 ms 90.8% + triton_bmm_2505 0.0110 ms 88.7% + triton_bmm_2502 0.0111 ms 88.4% +SingleProcess AUTOTUNE takes 4.7754 seconds +AUTOTUNE bmm(1x4096x257, 1x257x32) + triton_bmm_2522 0.0141 ms 100.0% + bmm 0.0146 ms 96.5% + triton_bmm_2521 0.0151 ms 93.4% + triton_bmm_2519 0.0152 ms 93.0% + triton_bmm_2513 0.0180 ms 78.2% + triton_bmm_2524 0.0192 ms 73.6% + triton_bmm_2516 0.0197 ms 71.6% + triton_bmm_2514 0.0198 ms 71.4% + triton_bmm_2518 0.0203 ms 69.7% + triton_bmm_2517 0.0216 ms 65.4% +SingleProcess AUTOTUNE takes 4.0355 seconds +AUTOTUNE convolution(1x1536x16x16, 1024x1536x3x3) + convolution 0.0708 ms 100.0% + triton_convolution_2623 1.5814 ms 4.5% + triton_convolution_2620 1.8215 ms 3.9% + triton_convolution_2622 1.8618 ms 3.8% + triton_convolution_2624 2.1451 ms 3.3% + triton_convolution_2621 2.1885 ms 3.2% + triton_convolution_2618 3.5822 ms 2.0% + triton_convolution_2619 3.6348 ms 1.9% +SingleProcess AUTOTUNE takes 4.2930 seconds +AUTOTUNE addmm(256x1024, 256x1536, 1536x1024) + triton_mm_2707 0.0212 ms 100.0% + addmm 0.0217 ms 97.8% + triton_mm_2708 0.0236 ms 89.6% + triton_mm_2703 0.0246 ms 86.0% + triton_mm_2702 0.0252 ms 84.1% + triton_mm_2704 0.0258 ms 82.2% + triton_mm_2705 0.0266 ms 79.6% + triton_mm_2701 0.0318 ms 66.5% + triton_mm_2700 0.0321 ms 66.0% + bias_addmm 0.0420 ms 50.4% +SingleProcess AUTOTUNE takes 6.0336 seconds +AUTOTUNE addmm(256x2048, 256x1024, 1024x2048) + triton_mm_2900 0.0201 ms 100.0% + triton_mm_2901 0.0202 ms 99.2% + triton_mm_2905 0.0217 ms 92.3% + addmm 0.0229 ms 87.7% + triton_mm_2898 0.0247 ms 81.3% + triton_mm_2899 0.0247 ms 81.1% + triton_mm_2903 0.0270 ms 74.4% + triton_mm_2902 0.0274 ms 73.3% + triton_mm_2906 0.0319 ms 63.0% + bias_addmm 0.0382 ms 52.6% +SingleProcess AUTOTUNE takes 5.4137 seconds +AUTOTUNE convolution(1x768x32x32, 512x768x3x3) + convolution 0.0608 ms 100.0% + triton_convolution_2914 0.7372 ms 8.3% + triton_convolution_2915 0.9193 ms 6.6% + triton_convolution_2913 0.9273 ms 6.6% + triton_convolution_2912 1.0188 ms 6.0% + triton_convolution_2909 1.5727 ms 3.9% + triton_convolution_2910 1.6945 ms 3.6% + triton_convolution_2911 2.8783 ms 2.1% +SingleProcess AUTOTUNE takes 4.5485 seconds +AUTOTUNE int_mm(1024x512, 512x512, 1024x512) + triton_mm_2946 0.0131 ms 100.0% + triton_mm_2941 0.0149 ms 88.2% + triton_mm_2942 0.0151 ms 86.7% + triton_mm_2940 0.0172 ms 76.2% + triton_mm_2939 0.0174 ms 75.5% + triton_mm_2944 0.0180 ms 73.0% + triton_mm_2938 0.0182 ms 71.9% + triton_mm_2943 0.0185 ms 70.8% + triton_mm_2945 0.0275 ms 47.7% + triton_mm_2948 0.0292 ms 44.9% +SingleProcess AUTOTUNE takes 7.5632 seconds +AUTOTUNE convolution(1x512x32x32, 512x512x3x3) + convolution 0.0450 ms 100.0% + triton_convolution_2988 0.4812 ms 9.3% + triton_convolution_2987 0.5840 ms 7.7% + triton_convolution_2989 0.6228 ms 7.2% + triton_convolution_2986 0.6782 ms 6.6% + triton_convolution_2983 1.1561 ms 3.9% + triton_convolution_2984 1.2221 ms 3.7% + triton_convolution_2985 1.9798 ms 2.3% +SingleProcess AUTOTUNE takes 4.7999 seconds +AUTOTUNE addmm(1024x512, 1024x768, 768x512) + triton_mm_2994 0.0164 ms 100.0% + triton_mm_2993 0.0170 ms 96.1% + triton_mm_2998 0.0180 ms 90.6% + triton_mm_2991 0.0200 ms 81.9% + triton_mm_2992 0.0201 ms 81.5% + addmm 0.0204 ms 80.0% + triton_mm_2995 0.0218 ms 74.9% + triton_mm_2996 0.0229 ms 71.3% + triton_mm_2999 0.0238 ms 68.7% + triton_mm_2990 0.0304 ms 53.8% +SingleProcess AUTOTUNE takes 5.6965 seconds +AUTOTUNE addmm(1024x1024, 1024x512, 512x1024) + triton_mm_3189 0.0172 ms 100.0% + triton_mm_3190 0.0180 ms 95.4% + triton_mm_3196 0.0192 ms 89.4% + triton_mm_3191 0.0194 ms 88.7% + triton_mm_3192 0.0194 ms 88.5% + triton_mm_3188 0.0234 ms 73.5% + addmm 0.0246 ms 69.9% + triton_mm_3193 0.0248 ms 69.2% + triton_mm_3194 0.0256 ms 67.0% + triton_mm_3197 0.0262 ms 65.6% +SingleProcess AUTOTUNE takes 5.3627 seconds +AUTOTUNE convolution(1x384x64x64, 256x384x3x3) + convolution 0.0568 ms 100.0% + triton_convolution_3205 0.3607 ms 15.7% + triton_convolution_3206 0.4088 ms 13.9% + triton_convolution_3203 0.4787 ms 11.9% + triton_convolution_3204 0.7431 ms 7.6% + triton_convolution_3200 0.7572 ms 7.5% + triton_convolution_3201 0.7977 ms 7.1% + triton_convolution_3202 1.4604 ms 3.9% +SingleProcess AUTOTUNE takes 4.5428 seconds +AUTOTUNE int_mm(4096x256, 256x512, 4096x512) + triton_mm_3237 0.0191 ms 100.0% + triton_mm_3231 0.0218 ms 87.5% + triton_mm_3229 0.0223 ms 85.6% + triton_mm_3230 0.0228 ms 83.7% + triton_mm_3232 0.0245 ms 77.9% + triton_mm_3233 0.0248 ms 77.0% + triton_mm_3239 0.0279 ms 68.3% + triton_mm_3238 0.0282 ms 67.6% + triton_mm_3234 0.0308 ms 61.8% + triton_mm_3235 0.0311 ms 61.4% +SingleProcess AUTOTUNE takes 7.4841 seconds +AUTOTUNE int_mm(4096x512, 512x256, 4096x256) + triton_mm_3271 0.0172 ms 100.0% + triton_mm_3265 0.0204 ms 84.0% + triton_mm_3263 0.0213 ms 80.5% + triton_mm_3264 0.0218 ms 79.0% + triton_mm_3267 0.0220 ms 78.3% + triton_mm_3266 0.0236 ms 72.8% + triton_mm_3268 0.0258 ms 66.7% + triton_mm_3269 0.0262 ms 65.5% + triton_mm_3270 0.0278 ms 61.7% + triton_mm_3273 0.0300 ms 57.3% +SingleProcess AUTOTUNE takes 7.3802 seconds +AUTOTUNE convolution(1x256x64x64, 256x256x3x3) + convolution 0.0422 ms 100.0% + triton_convolution_3279 0.2186 ms 19.3% + triton_convolution_3277 0.2312 ms 18.3% + triton_convolution_3280 0.2661 ms 15.9% + triton_convolution_3278 0.3990 ms 10.6% + triton_convolution_3274 0.5000 ms 8.4% + triton_convolution_3275 0.5327 ms 7.9% + triton_convolution_3276 0.9654 ms 4.4% +SingleProcess AUTOTUNE takes 4.7397 seconds +AUTOTUNE addmm(4096x256, 4096x384, 384x256) + triton_mm_3283 0.0155 ms 100.0% + triton_mm_3282 0.0162 ms 95.6% + triton_mm_3284 0.0169 ms 91.7% + triton_mm_3285 0.0170 ms 90.8% + triton_mm_3289 0.0175 ms 88.5% + triton_mm_3281 0.0201 ms 76.9% + triton_mm_3286 0.0213 ms 72.6% + addmm 0.0221 ms 69.9% + triton_mm_3287 0.0231 ms 66.8% + triton_mm_3288 0.0237 ms 65.2% +SingleProcess AUTOTUNE takes 5.4951 seconds +AUTOTUNE addmm(4096x512, 4096x256, 256x512) + triton_mm_3481 0.0159 ms 100.0% + triton_mm_3480 0.0168 ms 95.0% + triton_mm_3479 0.0180 ms 88.3% + triton_mm_3483 0.0183 ms 87.2% + triton_mm_3482 0.0192 ms 82.9% + triton_mm_3487 0.0197 ms 80.8% + triton_mm_3486 0.0203 ms 78.4% + triton_mm_3489 0.0226 ms 70.5% + addmm 0.0273 ms 58.4% + triton_mm_3484 0.0279 ms 57.1% +SingleProcess AUTOTUNE takes 5.4799 seconds +AUTOTUNE convolution(1x256x128x128, 128x256x3x3) + convolution 0.0609 ms 100.0% + triton_convolution_3496 0.4120 ms 14.8% + triton_convolution_3494 0.4741 ms 12.8% + triton_convolution_3497 0.4790 ms 12.7% + triton_convolution_3495 0.6027 ms 10.1% + triton_convolution_3492 0.6341 ms 9.6% + triton_convolution_3491 0.6524 ms 9.3% + triton_convolution_3493 1.9135 ms 3.2% +SingleProcess AUTOTUNE takes 4.0367 seconds +AUTOTUNE addmm(16384x128, 16384x256, 256x128) + triton_mm_3518 0.0184 ms 100.0% + triton_mm_3517 0.0208 ms 88.6% + triton_mm_3516 0.0212 ms 86.9% + triton_mm_3520 0.0215 ms 85.8% + triton_mm_3523 0.0220 ms 84.0% + triton_mm_3519 0.0226 ms 81.6% + triton_mm_3524 0.0229 ms 80.4% + triton_mm_3526 0.0287 ms 64.2% + triton_mm_3521 0.0289 ms 63.7% + addmm 0.0290 ms 63.6% +SingleProcess AUTOTUNE takes 5.6594 seconds +AUTOTUNE addmm(16384x6, 16384x128, 128x6) + triton_mm_3641 0.0113 ms 100.0% + triton_mm_3642 0.0113 ms 99.4% + triton_mm_3648 0.0116 ms 97.0% + triton_mm_3644 0.0116 ms 96.7% + triton_mm_3639 0.0118 ms 95.7% + triton_mm_3643 0.0118 ms 95.7% + triton_mm_3640 0.0118 ms 95.4% + triton_mm_3646 0.0120 ms 93.9% + triton_mm_3647 0.0120 ms 93.6% + triton_mm_3649 0.0141 ms 80.0% +SingleProcess AUTOTUNE takes 3.9677 seconds +AUTOTUNE mm(1x16, 16x64) + triton_mm_3651 0.0059 ms 100.0% + triton_mm_3652 0.0059 ms 100.0% + triton_mm_3655 0.0059 ms 100.0% + mm 0.0061 ms 95.3% + triton_mm_3654 0.0061 ms 95.3% + triton_mm_3656 0.0061 ms 95.3% + triton_mm_3657 0.0061 ms 95.3% + triton_mm_3653 0.0063 ms 93.6% +SingleProcess AUTOTUNE takes 1.8684 seconds +AUTOTUNE int_mm(1x64, 64x256, 1x256) + triton_mm_3660 0.0069 ms 100.0% + triton_mm_3664 0.0071 ms 96.9% + triton_mm_3663 0.0074 ms 93.7% + triton_mm_3666 0.0074 ms 93.7% + triton_mm_3668 0.0076 ms 91.5% + triton_mm_3661 0.0077 ms 90.0% + triton_mm_3662 0.0077 ms 90.0% + triton_mm_3658 0.0077 ms 89.6% + triton_mm_3659 0.0077 ms 89.6% + triton_mm_3667 0.0080 ms 86.6% +SingleProcess AUTOTUNE takes 3.2951 seconds +AUTOTUNE convolution(1x6x256x256, 8x6x3x3) + triton_convolution_3672 0.0152 ms 100.0% + triton_convolution_3673 0.0171 ms 88.9% + triton_convolution_3669 0.0173 ms 87.5% + convolution 0.0208 ms 72.8% + triton_convolution_3670 0.0219 ms 69.3% + triton_convolution_3671 0.0244 ms 62.2% +SingleProcess AUTOTUNE takes 1.8102 seconds +AUTOTUNE convolution(1x6x256x256, 4x6x7x7) + convolution 0.0293 ms 100.0% + triton_convolution_3677 0.0668 ms 43.9% + triton_convolution_3675 0.0741 ms 39.5% + triton_convolution_3674 0.0764 ms 38.4% + triton_convolution_3678 0.0767 ms 38.2% + triton_convolution_3676 0.1073 ms 27.3% +SingleProcess AUTOTUNE takes 1.9426 seconds +AUTOTUNE convolution(1x6x256x256, 4x6x15x15) + convolution 0.1554 ms 100.0% + triton_convolution_3682 0.2589 ms 60.0% + triton_convolution_3680 0.2849 ms 54.5% + triton_convolution_3679 0.2990 ms 52.0% + triton_convolution_3683 0.3071 ms 50.6% + triton_convolution_3681 0.4260 ms 36.5% +SingleProcess AUTOTUNE takes 1.9797 seconds +AUTOTUNE convolution(1x16x256x256, 16x16x3x3) + convolution 0.0158 ms 100.0% + triton_convolution_3687 0.0224 ms 70.5% + triton_convolution_3684 0.0231 ms 68.3% + triton_convolution_3688 0.0231 ms 68.3% + triton_convolution_3685 0.0234 ms 67.3% + triton_convolution_3686 0.0298 ms 52.9% +SingleProcess AUTOTUNE takes 1.8039 seconds +AUTOTUNE int_mm(1x64, 64x64, 1x64) + triton_mm_3694 0.0063 ms 100.0% + triton_mm_3693 0.0064 ms 98.5% + triton_mm_3689 0.0067 ms 94.7% + triton_mm_3690 0.0069 ms 90.8% + triton_mm_3691 0.0069 ms 90.8% + triton_mm_3692 0.0074 ms 84.7% +SingleProcess AUTOTUNE takes 1.6829 seconds +AUTOTUNE int_mm(1x64, 64x32, 1x32) + triton_mm_3696 0.0061 ms 100.0% + triton_mm_3695 0.0064 ms 96.0% + triton_mm_3698 0.0068 ms 89.7% + triton_mm_3699 0.0069 ms 89.3% + triton_mm_3697 0.0069 ms 88.7% +SingleProcess AUTOTUNE takes 1.3927 seconds +AUTOTUNE addmm(16384x16, 16384x64, 64x16) + triton_mm_3735 0.0095 ms 100.0% + triton_mm_3743 0.0095 ms 100.0% + triton_mm_3736 0.0095 ms 99.7% + triton_mm_3738 0.0095 ms 99.7% + triton_mm_3737 0.0097 ms 97.4% + triton_mm_3739 0.0098 ms 97.0% + triton_mm_3744 0.0100 ms 94.3% + triton_mm_3742 0.0102 ms 92.5% + triton_mm_3740 0.0103 ms 92.2% + triton_mm_3741 0.0103 ms 92.2% +SingleProcess AUTOTUNE takes 3.7683 seconds +AUTOTUNE convolution(1x16x128x128, 16x16x3x3) + convolution 0.0114 ms 100.0% + triton_convolution_3747 0.0126 ms 90.6% + triton_convolution_3750 0.0137 ms 83.4% + triton_convolution_3751 0.0151 ms 75.3% + triton_convolution_3748 0.0169 ms 67.3% + triton_convolution_3749 0.0283 ms 40.3% +SingleProcess AUTOTUNE takes 1.8217 seconds +AUTOTUNE int_mm(2x128, 128x1024, 2x1024) + triton_mm_3780 0.0069 ms 100.0% + triton_mm_3776 0.0074 ms 93.5% + triton_mm_3782 0.0074 ms 93.5% + triton_mm_3777 0.0077 ms 90.0% + triton_mm_3778 0.0079 ms 87.1% + triton_mm_3773 0.0082 ms 84.7% + triton_mm_3774 0.0084 ms 81.8% + triton_mm_3781 0.0087 ms 79.4% + triton_mm_3772 0.0089 ms 77.8% + triton_mm_3775 0.0092 ms 75.0% +SingleProcess AUTOTUNE takes 3.5196 seconds +AUTOTUNE bmm(1x16384x16, 1x16x512) + triton_bmm_3787 0.0163 ms 100.0% + triton_bmm_3786 0.0164 ms 99.3% + triton_bmm_3783 0.0164 ms 99.1% + triton_bmm_3784 0.0165 ms 98.4% + triton_bmm_3790 0.0167 ms 97.4% + triton_bmm_3785 0.0167 ms 97.2% + triton_bmm_3791 0.0172 ms 94.3% + triton_bmm_3789 0.0185 ms 88.1% + triton_bmm_3788 0.0193 ms 84.3% + triton_bmm_3792 0.0195 ms 83.5% +SingleProcess AUTOTUNE takes 3.3490 seconds +AUTOTUNE bmm(8x16384x64, 8x64x3) + triton_bmm_3802 0.0245 ms 100.0% + triton_bmm_3803 0.0254 ms 96.6% + triton_bmm_3794 0.0262 ms 93.5% + triton_bmm_3795 0.0266 ms 92.3% + triton_bmm_3796 0.0266 ms 92.2% + triton_bmm_3804 0.0267 ms 92.1% + triton_bmm_3801 0.0268 ms 91.6% + triton_bmm_3805 0.0268 ms 91.6% + triton_bmm_3797 0.0273 ms 90.0% + triton_bmm_3798 0.0275 ms 89.2% +SingleProcess AUTOTUNE takes 3.8005 seconds +AUTOTUNE bmm(8x16384x3, 8x3x64) + triton_bmm_3808 0.0170 ms 100.0% + triton_bmm_3806 0.0171 ms 99.4% + triton_bmm_3809 0.0171 ms 99.4% + triton_bmm_3807 0.0171 ms 99.1% + triton_bmm_3810 0.0175 ms 96.7% + triton_bmm_3813 0.0175 ms 96.7% + triton_bmm_3814 0.0177 ms 96.0% + triton_bmm_3812 0.0192 ms 88.3% + triton_bmm_3811 0.0206 ms 82.4% + triton_bmm_3816 0.0208 ms 81.7% +SingleProcess AUTOTUNE takes 3.6020 seconds +AUTOTUNE int_mm(16384x512, 512x16, 16384x16) + triton_mm_3825 0.0158 ms 100.0% + triton_mm_3827 0.0164 ms 96.7% + triton_mm_3823 0.0168 ms 94.1% + triton_mm_3826 0.0168 ms 93.9% + triton_mm_3822 0.0174 ms 90.8% + triton_mm_3818 0.0177 ms 89.3% + triton_mm_3820 0.0177 ms 89.3% + triton_mm_3819 0.0187 ms 84.4% + triton_mm_3817 0.0202 ms 78.3% + triton_mm_3821 0.0227 ms 69.6% +SingleProcess AUTOTUNE takes 4.7479 seconds +AUTOTUNE addmm(4096x32, 4096x64, 64x32) + triton_mm_3905 0.0076 ms 100.0% + triton_mm_3912 0.0076 ms 100.0% + triton_mm_3907 0.0077 ms 98.8% + triton_mm_3909 0.0077 ms 98.8% + triton_mm_3908 0.0079 ms 95.6% + triton_mm_3913 0.0079 ms 95.6% + triton_mm_3910 0.0081 ms 93.7% + triton_mm_3904 0.0082 ms 92.6% + triton_mm_3906 0.0083 ms 91.5% + triton_mm_3911 0.0086 ms 87.8% +SingleProcess AUTOTUNE takes 4.5098 seconds +AUTOTUNE convolution(1x32x64x64, 32x32x3x3) + convolution 0.0106 ms 100.0% + triton_convolution_3921 0.0146 ms 72.7% + triton_convolution_3919 0.0167 ms 63.5% + triton_convolution_3920 0.0176 ms 60.2% + triton_convolution_3916 0.0185 ms 57.4% + triton_convolution_3922 0.0233 ms 45.4% + triton_convolution_3917 0.0393 ms 27.0% + triton_convolution_3918 0.0749 ms 14.1% +SingleProcess AUTOTUNE takes 2.8654 seconds +AUTOTUNE int_mm(4096x32, 32x512, 4096x512) + triton_mm_3966 0.0130 ms 100.0% + triton_mm_3968 0.0145 ms 89.4% + triton_mm_3960 0.0147 ms 88.3% + triton_mm_3965 0.0151 ms 85.8% + triton_mm_3961 0.0163 ms 79.5% + triton_mm_3963 0.0166 ms 78.4% + triton_mm_3962 0.0205 ms 63.3% + triton_mm_3964 0.0214 ms 60.6% + triton_mm_3970 0.0268 ms 48.4% + triton_mm_3969 0.0272 ms 47.8% +SingleProcess AUTOTUNE takes 5.0872 seconds +AUTOTUNE bmm(8x4096x64, 8x64x3) + triton_bmm_3978 0.0113 ms 100.0% + triton_bmm_3974 0.0114 ms 99.4% + triton_bmm_3981 0.0116 ms 97.3% + triton_bmm_3973 0.0117 ms 97.0% + triton_bmm_3971 0.0117 ms 96.7% + triton_bmm_3982 0.0117 ms 96.5% + triton_bmm_3975 0.0118 ms 95.7% + triton_bmm_3979 0.0119 ms 95.4% + triton_bmm_3972 0.0119 ms 95.2% + triton_bmm_3976 0.0121 ms 93.4% +SingleProcess AUTOTUNE takes 3.6936 seconds +AUTOTUNE bmm(8x4096x3, 8x3x64) + triton_bmm_3983 0.0092 ms 100.0% + triton_bmm_3991 0.0092 ms 99.7% + triton_bmm_3985 0.0095 ms 97.0% + triton_bmm_3984 0.0097 ms 95.0% + triton_bmm_3987 0.0097 ms 94.7% + triton_bmm_3990 0.0097 ms 94.7% + triton_bmm_3986 0.0098 ms 94.1% + triton_bmm_3993 0.0100 ms 91.4% + triton_bmm_3988 0.0102 ms 89.7% + triton_bmm_3989 0.0103 ms 89.1% +SingleProcess AUTOTUNE takes 3.9562 seconds +AUTOTUNE int_mm(4096x512, 512x32, 4096x32) + triton_mm_4003 0.0115 ms 100.0% + triton_mm_4002 0.0122 ms 94.7% + triton_mm_4000 0.0127 ms 90.5% + triton_mm_3999 0.0128 ms 89.8% + triton_mm_3997 0.0129 ms 89.3% + triton_mm_4004 0.0133 ms 86.7% + triton_mm_3998 0.0139 ms 83.1% + triton_mm_3995 0.0150 ms 76.9% + triton_mm_3996 0.0161 ms 71.4% + triton_mm_3994 0.0177 ms 65.2% +SingleProcess AUTOTUNE takes 3.9628 seconds +AUTOTUNE addmm(1024x64, 1024x128, 128x64) + triton_mm_4093 0.0079 ms 100.0% + triton_mm_4097 0.0082 ms 96.9% + triton_mm_4094 0.0083 ms 95.4% + triton_mm_4096 0.0083 ms 95.4% + triton_mm_4091 0.0084 ms 93.9% + triton_mm_4090 0.0092 ms 86.4% + triton_mm_4092 0.0092 ms 86.4% + triton_mm_4089 0.0092 ms 86.1% + triton_mm_4088 0.0102 ms 77.5% + triton_mm_4095 0.0113 ms 70.5% +SingleProcess AUTOTUNE takes 4.5906 seconds +AUTOTUNE convolution(1x64x32x32, 64x64x3x3) + convolution 0.0118 ms 100.0% + triton_convolution_4105 0.0347 ms 34.1% + triton_convolution_4104 0.0484 ms 24.4% + triton_convolution_4100 0.0488 ms 24.2% + triton_convolution_4103 0.0526 ms 22.5% + triton_convolution_4106 0.0685 ms 17.3% + triton_convolution_4101 0.1051 ms 11.3% + triton_convolution_4102 0.2429 ms 4.9% +SingleProcess AUTOTUNE takes 3.7550 seconds +AUTOTUNE int_mm(1x64, 64x128, 1x128) + triton_mm_4115 0.0067 ms 100.0% + triton_mm_4116 0.0069 ms 96.3% + triton_mm_4107 0.0071 ms 93.3% + triton_mm_4109 0.0074 ms 89.8% + triton_mm_4112 0.0074 ms 89.7% + triton_mm_4110 0.0074 ms 89.7% + triton_mm_4111 0.0075 ms 89.3% + triton_mm_4113 0.0075 ms 89.3% + triton_mm_4114 0.0077 ms 86.7% + triton_mm_4108 0.0077 ms 86.3% +SingleProcess AUTOTUNE takes 2.8359 seconds +AUTOTUNE int_mm(1024x64, 64x512, 1024x512) + triton_mm_4158 0.0100 ms 100.0% + triton_mm_4155 0.0100 ms 99.4% + triton_mm_4154 0.0103 ms 96.6% + triton_mm_4157 0.0105 ms 95.1% + triton_mm_4152 0.0105 ms 94.8% + triton_mm_4153 0.0106 ms 93.7% + triton_mm_4160 0.0108 ms 92.3% + triton_mm_4156 0.0110 ms 90.4% + triton_mm_4159 0.0144 ms 69.3% + triton_mm_4162 0.0268 ms 37.2% +SingleProcess AUTOTUNE takes 5.4085 seconds +AUTOTUNE bmm(8x1024x64, 8x64x3) + triton_bmm_4172 0.0079 ms 100.0% + triton_bmm_4170 0.0082 ms 96.9% + triton_bmm_4164 0.0085 ms 93.6% + triton_bmm_4165 0.0085 ms 93.6% + triton_bmm_4166 0.0085 ms 93.6% + triton_bmm_4167 0.0085 ms 93.6% + triton_bmm_4168 0.0085 ms 93.6% + triton_bmm_4171 0.0085 ms 93.6% + triton_bmm_4169 0.0085 ms 93.4% + triton_bmm_4163 0.0087 ms 90.8% +SingleProcess AUTOTUNE takes 3.7610 seconds +AUTOTUNE bmm(8x1024x3, 8x3x64) + triton_bmm_4184 0.0072 ms 100.0% + triton_bmm_4175 0.0074 ms 97.0% + triton_bmm_4179 0.0074 ms 97.0% + triton_bmm_4182 0.0074 ms 97.0% + triton_bmm_4181 0.0079 ms 91.6% + triton_bmm_4176 0.0079 ms 91.3% + triton_bmm_4178 0.0079 ms 91.3% + triton_bmm_4177 0.0079 ms 91.1% + triton_bmm_4183 0.0079 ms 91.1% + triton_bmm_4180 0.0079 ms 90.9% +SingleProcess AUTOTUNE takes 3.2990 seconds +AUTOTUNE int_mm(1024x512, 512x64, 1024x64) + triton_mm_4191 0.0113 ms 100.0% + triton_mm_4194 0.0118 ms 95.7% + triton_mm_4192 0.0120 ms 94.1% + triton_mm_4195 0.0125 ms 89.8% + triton_mm_4189 0.0133 ms 84.4% + triton_mm_4187 0.0148 ms 75.9% + triton_mm_4190 0.0157 ms 71.8% + triton_mm_4196 0.0162 ms 69.7% + triton_mm_4188 0.0167 ms 67.6% + triton_mm_4186 0.0180 ms 62.7% +SingleProcess AUTOTUNE takes 5.3642 seconds +AUTOTUNE addmm(256x128, 256x256, 256x128) + triton_mm_4289 0.0089 ms 100.0% + triton_mm_4290 0.0089 ms 100.0% + triton_mm_4293 0.0091 ms 98.2% + triton_mm_4292 0.0092 ms 97.2% + triton_mm_4288 0.0104 ms 85.3% + triton_mm_4287 0.0110 ms 80.8% + triton_mm_4286 0.0115 ms 77.2% + triton_mm_4285 0.0116 ms 77.0% + addmm 0.0130 ms 68.5% + triton_mm_4284 0.0142 ms 62.5% +SingleProcess AUTOTUNE takes 5.2154 seconds +AUTOTUNE convolution(1x128x16x16, 128x128x3x3) + convolution 0.0129 ms 100.0% + triton_convolution_4301 0.0761 ms 16.9% + triton_convolution_4300 0.0861 ms 14.9% + triton_convolution_4299 0.1161 ms 11.1% + triton_convolution_4302 0.1297 ms 9.9% + triton_convolution_4296 0.1310 ms 9.8% + triton_convolution_4298 0.1456 ms 8.8% + triton_convolution_4297 0.2596 ms 5.0% +SingleProcess AUTOTUNE takes 3.9072 seconds +AUTOTUNE int_mm(256x128, 128x512, 256x512) + triton_mm_4355 0.0089 ms 100.0% + triton_mm_4356 0.0090 ms 98.9% + triton_mm_4358 0.0090 ms 98.6% + triton_mm_4350 0.0095 ms 93.6% + triton_mm_4352 0.0105 ms 84.8% + triton_mm_4353 0.0105 ms 84.8% + triton_mm_4351 0.0107 ms 83.2% + triton_mm_4354 0.0108 ms 82.7% + triton_mm_4357 0.0159 ms 55.8% + triton_mm_4359 0.0280 ms 31.8% +SingleProcess AUTOTUNE takes 5.6652 seconds +AUTOTUNE bmm(8x256x64, 8x64x3) + triton_bmm_4364 0.0067 ms 100.0% + triton_bmm_4362 0.0068 ms 99.1% + triton_bmm_4366 0.0068 ms 99.1% + triton_bmm_4367 0.0068 ms 98.6% + triton_bmm_4363 0.0072 ms 93.7% + triton_bmm_4370 0.0072 ms 93.7% + triton_bmm_4372 0.0076 ms 88.6% + triton_bmm_4365 0.0077 ms 87.1% + triton_bmm_4369 0.0077 ms 87.1% + triton_bmm_4361 0.0079 ms 84.8% +SingleProcess AUTOTUNE takes 3.6718 seconds +AUTOTUNE bmm(8x256x3, 8x3x64) + triton_bmm_4378 0.0066 ms 100.0% + triton_bmm_4379 0.0066 ms 100.0% + triton_bmm_4373 0.0067 ms 99.0% + triton_bmm_4374 0.0067 ms 99.0% + triton_bmm_4376 0.0067 ms 99.0% + triton_bmm_4381 0.0067 ms 99.0% + triton_bmm_4383 0.0067 ms 99.0% + triton_bmm_4382 0.0072 ms 92.2% + triton_bmm_4375 0.0072 ms 92.0% + triton_bmm_4380 0.0078 ms 84.4% +SingleProcess AUTOTUNE takes 3.5552 seconds +AUTOTUNE int_mm(256x512, 512x128, 256x128) + triton_mm_4389 0.0110 ms 100.0% + triton_mm_4390 0.0110 ms 99.7% + triton_mm_4392 0.0118 ms 93.2% + triton_mm_4387 0.0150 ms 73.1% + triton_mm_4388 0.0151 ms 72.5% + triton_mm_4393 0.0164 ms 67.0% + triton_mm_4386 0.0166 ms 66.0% + triton_mm_4385 0.0166 ms 66.0% + triton_mm_4384 0.0173 ms 63.3% + triton_mm_4391 0.0260 ms 42.3% +SingleProcess AUTOTUNE takes 6.6632 seconds +AUTOTUNE addmm(256x256, 256x128, 128x256) + triton_mm_4492 0.0076 ms 100.0% + triton_mm_4489 0.0082 ms 93.7% + triton_mm_4491 0.0082 ms 93.4% + triton_mm_4488 0.0086 ms 88.5% + triton_mm_4485 0.0092 ms 83.3% + triton_mm_4484 0.0094 ms 81.0% + triton_mm_4486 0.0094 ms 81.0% + triton_mm_4487 0.0097 ms 78.6% + triton_mm_4483 0.0106 ms 72.2% + triton_mm_4494 0.0113 ms 67.5% +SingleProcess AUTOTUNE takes 5.0343 seconds +AUTOTUNE convolution(1x256x16x16, 256x256x3x3) + convolution 0.0175 ms 100.0% + triton_convolution_4499 0.1700 ms 10.3% + triton_convolution_4500 0.2273 ms 7.7% + triton_convolution_4498 0.2507 ms 7.0% + triton_convolution_4501 0.2927 ms 6.0% + triton_convolution_4497 0.2964 ms 5.9% + triton_convolution_4495 0.5385 ms 3.3% + triton_convolution_4496 0.6048 ms 2.9% +SingleProcess AUTOTUNE takes 4.3093 seconds +AUTOTUNE int_mm(1x64, 64x512, 1x512) + triton_mm_4510 0.0069 ms 100.0% + triton_mm_4504 0.0071 ms 96.9% + triton_mm_4512 0.0071 ms 96.9% + triton_mm_4503 0.0074 ms 93.5% + triton_mm_4507 0.0075 ms 92.7% + triton_mm_4506 0.0076 ms 90.4% + triton_mm_4508 0.0077 ms 89.6% + triton_mm_4502 0.0079 ms 87.4% + triton_mm_4505 0.0082 ms 84.2% + triton_mm_4511 0.0083 ms 83.7% +SingleProcess AUTOTUNE takes 3.2241 seconds +AUTOTUNE int_mm(256x256, 256x512, 256x512) + triton_mm_4529 0.0098 ms 100.0% + triton_mm_4530 0.0098 ms 99.7% + triton_mm_4532 0.0100 ms 97.8% + triton_mm_4524 0.0118 ms 82.9% + triton_mm_4527 0.0118 ms 82.9% + triton_mm_4528 0.0122 ms 80.3% + triton_mm_4526 0.0128 ms 76.1% + triton_mm_4525 0.0131 ms 74.7% + triton_mm_4531 0.0196 ms 49.9% + triton_mm_4534 0.0259 ms 37.7% +SingleProcess AUTOTUNE takes 7.6704 seconds +AUTOTUNE int_mm(256x512, 512x256, 256x256) + triton_mm_4563 0.0115 ms 100.0% + triton_mm_4564 0.0115 ms 100.0% + triton_mm_4566 0.0118 ms 97.6% + triton_mm_4561 0.0146 ms 79.1% + triton_mm_4562 0.0146 ms 78.9% + triton_mm_4559 0.0165 ms 69.8% + triton_mm_4558 0.0166 ms 69.2% + triton_mm_4560 0.0167 ms 69.1% + triton_mm_4565 0.0269 ms 42.9% + triton_mm_4567 0.0292 ms 39.4% +SingleProcess AUTOTUNE takes 7.5762 seconds +AUTOTUNE int_mm(256x256, 256x64, 256x64) + triton_mm_4582 0.0094 ms 100.0% + triton_mm_4581 0.0097 ms 96.6% + triton_mm_4584 0.0100 ms 93.9% + triton_mm_4579 0.0105 ms 89.6% + triton_mm_4577 0.0111 ms 85.0% + triton_mm_4585 0.0116 ms 81.4% + triton_mm_4576 0.0120 ms 78.2% + triton_mm_4580 0.0123 ms 76.8% + triton_mm_4578 0.0125 ms 75.0% + triton_mm_4586 0.0145 ms 64.8% +SingleProcess AUTOTUNE takes 5.2590 seconds +AUTOTUNE convolution(1x384x16x16, 256x384x3x3) + convolution 0.0228 ms 100.0% + triton_convolution_4718 0.2994 ms 7.6% + triton_convolution_4719 0.3832 ms 6.0% + triton_convolution_4720 0.4574 ms 5.0% + triton_convolution_4717 0.4632 ms 4.9% + triton_convolution_4716 0.4662 ms 4.9% + triton_convolution_4715 0.7809 ms 2.9% + triton_convolution_4714 0.8032 ms 2.8% +SingleProcess AUTOTUNE takes 4.6019 seconds +AUTOTUNE addmm(256x256, 256x384, 384x256) + triton_mm_4801 0.0101 ms 100.0% + triton_mm_4804 0.0103 ms 97.8% + triton_mm_4800 0.0105 ms 96.0% + triton_mm_4803 0.0105 ms 96.0% + triton_mm_4799 0.0118 ms 85.4% + triton_mm_4798 0.0126 ms 79.9% + triton_mm_4797 0.0130 ms 77.4% + triton_mm_4796 0.0133 ms 75.7% + triton_mm_4795 0.0177 ms 57.1% + triton_mm_4806 0.0202 ms 49.8% +SingleProcess AUTOTUNE takes 5.4593 seconds +AUTOTUNE addmm(256x512, 256x256, 256x512) + triton_mm_4998 0.0092 ms 100.0% + triton_mm_4999 0.0092 ms 100.0% + triton_mm_5001 0.0094 ms 97.6% + triton_mm_5002 0.0098 ms 94.1% + triton_mm_4997 0.0106 ms 87.0% + triton_mm_4996 0.0111 ms 82.6% + triton_mm_4994 0.0114 ms 80.6% + triton_mm_4995 0.0118 ms 77.8% + triton_mm_4993 0.0143 ms 64.1% + addmm 0.0156 ms 58.7% +SingleProcess AUTOTUNE takes 5.7385 seconds +AUTOTUNE convolution(1x192x32x32, 128x192x3x3) + convolution 0.0151 ms 100.0% + triton_convolution_5010 0.1101 ms 13.7% + triton_convolution_5009 0.1373 ms 11.0% + triton_convolution_5008 0.1690 ms 8.9% + triton_convolution_5011 0.1944 ms 7.8% + triton_convolution_5005 0.2067 ms 7.3% + triton_convolution_5006 0.3858 ms 3.9% + triton_convolution_5007 0.7159 ms 2.1% +SingleProcess AUTOTUNE takes 3.9093 seconds +AUTOTUNE int_mm(1024x128, 128x512, 1024x512) + triton_mm_5042 0.0102 ms 100.0% + triton_mm_5037 0.0108 ms 95.2% + triton_mm_5036 0.0108 ms 95.0% + triton_mm_5034 0.0110 ms 93.0% + triton_mm_5038 0.0110 ms 92.8% + triton_mm_5035 0.0113 ms 90.3% + triton_mm_5039 0.0120 ms 85.1% + triton_mm_5040 0.0128 ms 80.0% + triton_mm_5041 0.0156 ms 65.6% + triton_mm_5043 0.0282 ms 36.3% +SingleProcess AUTOTUNE takes 5.9396 seconds +AUTOTUNE int_mm(1024x512, 512x128, 1024x128) + triton_mm_5073 0.0113 ms 100.0% + triton_mm_5074 0.0120 ms 94.1% + triton_mm_5076 0.0121 ms 92.9% + triton_mm_5071 0.0149 ms 75.7% + triton_mm_5072 0.0151 ms 74.4% + triton_mm_5070 0.0167 ms 67.6% + triton_mm_5077 0.0172 ms 65.7% + triton_mm_5069 0.0175 ms 64.5% + triton_mm_5068 0.0177 ms 63.7% + triton_mm_5075 0.0262 ms 43.0% +SingleProcess AUTOTUNE takes 6.7080 seconds +AUTOTUNE convolution(1x128x32x32, 128x128x3x3) + convolution 0.0137 ms 100.0% + triton_convolution_5084 0.0730 ms 18.8% + triton_convolution_5083 0.0887 ms 15.4% + triton_convolution_5082 0.1092 ms 12.5% + triton_convolution_5079 0.1278 ms 10.7% + triton_convolution_5085 0.1289 ms 10.6% + triton_convolution_5080 0.2448 ms 5.6% + triton_convolution_5081 0.4841 ms 2.8% +SingleProcess AUTOTUNE takes 4.0367 seconds +AUTOTUNE addmm(1024x128, 1024x192, 192x128) + triton_mm_5094 0.0089 ms 100.0% + triton_mm_5095 0.0090 ms 99.3% + triton_mm_5091 0.0091 ms 98.6% + triton_mm_5092 0.0092 ms 97.4% + triton_mm_5089 0.0098 ms 91.5% + triton_mm_5088 0.0102 ms 87.2% + triton_mm_5087 0.0103 ms 86.9% + triton_mm_5090 0.0103 ms 86.8% + triton_mm_5086 0.0121 ms 74.0% + triton_mm_5097 0.0142 ms 63.0% +SingleProcess AUTOTUNE takes 5.1963 seconds +AUTOTUNE addmm(1024x256, 1024x128, 128x256) + triton_mm_5293 0.0088 ms 100.0% + triton_mm_5292 0.0090 ms 97.5% + triton_mm_5289 0.0092 ms 94.8% + triton_mm_5286 0.0093 ms 94.5% + triton_mm_5287 0.0098 ms 89.8% + triton_mm_5288 0.0098 ms 89.5% + triton_mm_5285 0.0100 ms 87.8% + triton_mm_5290 0.0102 ms 85.9% + triton_mm_5284 0.0102 ms 85.6% + triton_mm_5295 0.0116 ms 75.9% +SingleProcess AUTOTUNE takes 5.3030 seconds +AUTOTUNE convolution(1x96x64x64, 64x96x3x3) + convolution 0.0157 ms 100.0% + triton_convolution_5301 0.0519 ms 30.3% + triton_convolution_5296 0.0770 ms 20.4% + triton_convolution_5299 0.0777 ms 20.2% + triton_convolution_5300 0.0793 ms 19.8% + triton_convolution_5302 0.0945 ms 16.6% + triton_convolution_5297 0.1656 ms 9.5% + triton_convolution_5298 0.2607 ms 6.0% +SingleProcess AUTOTUNE takes 4.3653 seconds +AUTOTUNE int_mm(4096x64, 64x512, 4096x512) + triton_mm_5332 0.0156 ms 100.0% + triton_mm_5326 0.0160 ms 97.2% + triton_mm_5324 0.0161 ms 96.8% + triton_mm_5325 0.0169 ms 91.9% + triton_mm_5328 0.0171 ms 91.2% + triton_mm_5327 0.0172 ms 90.3% + triton_mm_5330 0.0179 ms 86.8% + triton_mm_5329 0.0187 ms 83.4% + triton_mm_5331 0.0234 ms 66.6% + triton_mm_5334 0.0279 ms 55.8% +SingleProcess AUTOTUNE takes 6.4286 seconds +AUTOTUNE int_mm(4096x512, 512x64, 4096x64) + triton_mm_5366 0.0126 ms 100.0% + triton_mm_5361 0.0139 ms 90.8% + triton_mm_5367 0.0139 ms 90.6% + triton_mm_5359 0.0152 ms 82.9% + triton_mm_5363 0.0153 ms 82.0% + triton_mm_5364 0.0158 ms 79.7% + triton_mm_5362 0.0163 ms 77.1% + triton_mm_5368 0.0171 ms 73.6% + triton_mm_5360 0.0173 ms 72.6% + triton_mm_5358 0.0187 ms 67.2% +SingleProcess AUTOTUNE takes 6.1846 seconds +AUTOTUNE convolution(1x64x64x64, 64x64x3x3) + convolution 0.0124 ms 100.0% + triton_convolution_5374 0.0362 ms 34.3% + triton_convolution_5369 0.0535 ms 23.2% + triton_convolution_5373 0.0549 ms 22.6% + triton_convolution_5372 0.0555 ms 22.4% + triton_convolution_5375 0.0652 ms 19.0% + triton_convolution_5370 0.0995 ms 12.5% + triton_convolution_5371 0.2487 ms 5.0% +SingleProcess AUTOTUNE takes 3.7092 seconds +AUTOTUNE addmm(4096x64, 4096x96, 96x64) + triton_mm_5379 0.0089 ms 100.0% + triton_mm_5377 0.0091 ms 97.9% + triton_mm_5384 0.0091 ms 97.9% + triton_mm_5385 0.0091 ms 97.9% + triton_mm_5382 0.0092 ms 96.5% + triton_mm_5381 0.0095 ms 93.9% + triton_mm_5380 0.0097 ms 91.3% + triton_mm_5378 0.0098 ms 91.1% + triton_mm_5376 0.0100 ms 88.5% + triton_mm_5383 0.0104 ms 85.5% +SingleProcess AUTOTUNE takes 4.9149 seconds +AUTOTUNE addmm(4096x128, 4096x64, 64x128) + triton_mm_5574 0.0087 ms 100.0% + triton_mm_5576 0.0088 ms 99.6% + triton_mm_5573 0.0088 ms 99.3% + triton_mm_5572 0.0090 ms 96.8% + triton_mm_5577 0.0091 ms 96.1% + triton_mm_5580 0.0093 ms 93.8% + triton_mm_5581 0.0094 ms 93.2% + triton_mm_5575 0.0094 ms 92.5% + triton_mm_5578 0.0095 ms 91.6% + triton_mm_5583 0.0097 ms 90.4% +SingleProcess AUTOTUNE takes 4.9904 seconds +AUTOTUNE convolution(1x48x128x128, 32x48x3x3) + convolution 0.0203 ms 100.0% + triton_convolution_5589 0.0291 ms 69.7% + triton_convolution_5587 0.0324 ms 62.6% + triton_convolution_5584 0.0324 ms 62.5% + triton_convolution_5588 0.0350 ms 58.0% + triton_convolution_5590 0.0488 ms 41.6% + triton_convolution_5585 0.0623 ms 32.5% + triton_convolution_5586 0.0767 ms 26.4% +SingleProcess AUTOTUNE takes 2.8968 seconds +AUTOTUNE int_mm(16384x32, 32x512, 16384x512) + triton_mm_5614 0.0310 ms 100.0% + triton_mm_5608 0.0352 ms 88.0% + triton_mm_5609 0.0364 ms 85.1% + triton_mm_5613 0.0369 ms 83.9% + triton_mm_5616 0.0370 ms 83.7% + triton_mm_5611 0.0396 ms 78.1% + triton_mm_5610 0.0509 ms 60.9% + triton_mm_5612 0.0540 ms 57.4% + triton_mm_5615 0.0617 ms 50.2% + triton_mm_5617 0.0729 ms 42.5% +SingleProcess AUTOTUNE takes 5.1999 seconds +AUTOTUNE int_mm(16384x512, 512x32, 16384x32) + triton_mm_5650 0.0176 ms 100.0% + triton_mm_5652 0.0182 ms 97.0% + triton_mm_5643 0.0183 ms 96.5% + triton_mm_5651 0.0188 ms 93.8% + triton_mm_5645 0.0195 ms 90.5% + triton_mm_5648 0.0199 ms 88.4% + triton_mm_5647 0.0200 ms 88.0% + triton_mm_5644 0.0203 ms 87.0% + triton_mm_5646 0.0211 ms 83.7% + triton_mm_5642 0.0216 ms 81.4% +SingleProcess AUTOTUNE takes 4.2028 seconds +AUTOTUNE convolution(1x32x128x128, 32x32x3x3) + convolution 0.0135 ms 100.0% + triton_convolution_5658 0.0183 ms 73.5% + triton_convolution_5656 0.0198 ms 68.0% + triton_convolution_5657 0.0199 ms 67.7% + triton_convolution_5653 0.0209 ms 64.4% + triton_convolution_5659 0.0226 ms 59.5% + triton_convolution_5654 0.0405 ms 33.2% + triton_convolution_5655 0.0743 ms 18.1% +SingleProcess AUTOTUNE takes 3.9787 seconds +AUTOTUNE addmm(16384x32, 16384x48, 48x32) + triton_mm_5661 0.0094 ms 100.0% + triton_mm_5668 0.0095 ms 99.7% + triton_mm_5669 0.0095 ms 99.0% + triton_mm_5671 0.0097 ms 97.4% + triton_mm_5664 0.0100 ms 94.6% + triton_mm_5662 0.0102 ms 92.2% + triton_mm_5660 0.0103 ms 91.9% + triton_mm_5670 0.0103 ms 91.3% + triton_mm_5667 0.0105 ms 90.2% + triton_mm_5663 0.0105 ms 89.9% +SingleProcess AUTOTUNE takes 4.4488 seconds +AUTOTUNE addmm(16384x64, 16384x32, 32x64) + triton_mm_5849 0.0092 ms 100.0% + triton_mm_5853 0.0092 ms 100.0% + triton_mm_5856 0.0095 ms 97.6% + triton_mm_5857 0.0096 ms 96.3% + triton_mm_5859 0.0096 ms 96.3% + triton_mm_5848 0.0098 ms 94.1% + triton_mm_5850 0.0100 ms 92.6% + triton_mm_5851 0.0100 ms 92.3% + triton_mm_5854 0.0100 ms 92.3% + triton_mm_5855 0.0100 ms 92.3% +SingleProcess AUTOTUNE takes 4.4521 seconds +AUTOTUNE convolution(1x32x256x256, 16x32x3x3) + convolution 0.0207 ms 100.0% + triton_convolution_5863 0.0439 ms 47.1% + triton_convolution_5864 0.0544 ms 38.0% + triton_convolution_5860 0.0561 ms 36.9% + triton_convolution_5861 0.0586 ms 35.3% + triton_convolution_5865 0.0592 ms 35.0% + triton_convolution_5862 0.0791 ms 26.2% +SingleProcess AUTOTUNE takes 2.6616 seconds +AUTOTUNE addmm(65536x16, 65536x32, 32x16) + triton_mm_5880 0.0116 ms 100.0% + triton_mm_5883 0.0116 ms 100.0% + triton_mm_5876 0.0118 ms 98.6% + triton_mm_5879 0.0118 ms 98.6% + triton_mm_5882 0.0122 ms 95.3% + triton_mm_5884 0.0122 ms 95.3% + triton_mm_5877 0.0123 ms 94.5% + triton_mm_5881 0.0124 ms 94.3% + triton_mm_5886 0.0125 ms 93.1% + triton_mm_5885 0.0125 ms 92.9% +SingleProcess AUTOTUNE takes 3.4271 seconds +AUTOTUNE addmm(65536x3, 65536x19, 19x3) + triton_mm_5972 0.0110 ms 100.0% + triton_mm_5971 0.0111 ms 99.1% + triton_mm_5970 0.0111 ms 98.8% + triton_mm_5978 0.0113 ms 96.9% + triton_mm_5977 0.0116 ms 95.0% + triton_mm_5976 0.0116 ms 94.8% + triton_mm_5975 0.0116 ms 94.5% + triton_mm_5968 0.0116 ms 94.2% + triton_mm_5969 0.0116 ms 94.2% + triton_mm_5973 0.0116 ms 94.2% +SingleProcess AUTOTUNE takes 3.8037 seconds +pass-sqnr-inf + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +LearningToPaint +cuda eval LearningToPaint int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for LearningToPaint. Setting accuracy check to cosine +AUTOTUNE convolution(1x9x128x128, 64x9x3x3) + triton_convolution_4 0.0139 ms 100.0% + triton_convolution_3 0.0164 ms 84.4% + convolution 0.0170 ms 81.4% + triton_convolution_0 0.0220 ms 62.8% + triton_convolution_5 0.0223 ms 62.1% + triton_convolution_2 0.0354 ms 39.1% + triton_convolution_1 0.0558 ms 24.8% +SingleProcess AUTOTUNE takes 1.1020 seconds +AUTOTUNE convolution(1x64x64x64, 64x64x3x3) + convolution 0.0115 ms 100.0% + triton_convolution_11 0.0572 ms 20.0% + triton_convolution_6 0.0803 ms 14.3% + triton_convolution_9 0.0848 ms 13.5% + triton_convolution_10 0.0929 ms 12.3% + triton_convolution_12 0.0943 ms 12.2% + triton_convolution_7 0.1628 ms 7.0% + triton_convolution_8 0.2531 ms 4.5% +SingleProcess AUTOTUNE takes 1.0448 seconds +AUTOTUNE convolution(1x64x32x32, 64x64x3x3) + convolution 0.0110 ms 100.0% + triton_convolution_18 0.0346 ms 31.7% + triton_convolution_13 0.0490 ms 22.4% + triton_convolution_17 0.0494 ms 22.2% + triton_convolution_16 0.0521 ms 21.1% + triton_convolution_19 0.0680 ms 16.1% + triton_convolution_14 0.1043 ms 10.5% + triton_convolution_15 0.2434 ms 4.5% +SingleProcess AUTOTUNE takes 1.1529 seconds +AUTOTUNE convolution(1x64x64x64, 64x64x1x1) + triton_convolution_25 0.0084 ms 100.0% + triton_convolution_24 0.0087 ms 97.1% + convolution 0.0095 ms 89.2% + triton_convolution_23 0.0095 ms 89.2% + triton_convolution_20 0.0107 ms 79.3% + triton_convolution_26 0.0128 ms 66.0% + triton_convolution_21 0.0149 ms 56.8% + triton_convolution_22 0.0352 ms 24.0% +SingleProcess AUTOTUNE takes 1.0762 seconds +AUTOTUNE convolution(1x64x32x32, 128x64x3x3) + convolution 0.0115 ms 100.0% + triton_convolution_46 0.0583 ms 19.8% + triton_convolution_45 0.0840 ms 13.7% + triton_convolution_41 0.0844 ms 13.6% + triton_convolution_47 0.0943 ms 12.2% + triton_convolution_43 0.1027 ms 11.2% + triton_convolution_44 0.1038 ms 11.1% + triton_convolution_42 0.1516 ms 7.6% +SingleProcess AUTOTUNE takes 1.0519 seconds +AUTOTUNE convolution(1x128x16x16, 128x128x3x3) + convolution 0.0133 ms 100.0% + triton_convolution_53 0.0762 ms 17.5% + triton_convolution_52 0.0866 ms 15.4% + triton_convolution_51 0.1158 ms 11.5% + triton_convolution_54 0.1294 ms 10.3% + triton_convolution_48 0.1307 ms 10.2% + triton_convolution_50 0.1451 ms 9.2% + triton_convolution_49 0.2573 ms 5.2% +SingleProcess AUTOTUNE takes 1.0493 seconds +AUTOTUNE convolution(1x64x32x32, 128x64x1x1) + convolution 0.0082 ms 100.0% + triton_convolution_59 0.0087 ms 94.5% + triton_convolution_60 0.0101 ms 81.1% + triton_convolution_55 0.0102 ms 80.6% + triton_convolution_58 0.0110 ms 74.7% + triton_convolution_61 0.0128 ms 64.1% + triton_convolution_56 0.0141 ms 58.4% + triton_convolution_57 0.0188 ms 43.7% +SingleProcess AUTOTUNE takes 1.0776 seconds +AUTOTUNE convolution(1x128x16x16, 256x128x3x3) + convolution 0.0136 ms 100.0% + triton_convolution_78 0.0886 ms 15.3% + triton_convolution_81 0.1620 ms 8.4% + triton_convolution_79 0.1640 ms 8.3% + triton_convolution_82 0.1682 ms 8.1% + triton_convolution_80 0.1784 ms 7.6% + triton_convolution_77 0.1964 ms 6.9% + triton_convolution_76 0.3199 ms 4.3% +SingleProcess AUTOTUNE takes 2.1863 seconds +AUTOTUNE convolution(1x256x8x8, 256x256x3x3) + convolution 0.0175 ms 100.0% + triton_convolution_89 0.1402 ms 12.5% + triton_convolution_85 0.1422 ms 12.3% + triton_convolution_86 0.1659 ms 10.6% + triton_convolution_87 0.1787 ms 9.8% + triton_convolution_84 0.1980 ms 8.8% + triton_convolution_88 0.2550 ms 6.9% + triton_convolution_83 0.5753 ms 3.0% +SingleProcess AUTOTUNE takes 3.6506 seconds +AUTOTUNE convolution(1x128x16x16, 256x128x1x1) + convolution 0.0087 ms 100.0% + triton_convolution_94 0.0115 ms 75.8% + triton_convolution_92 0.0136 ms 64.4% + triton_convolution_93 0.0157 ms 55.5% + triton_convolution_96 0.0162 ms 54.1% + triton_convolution_95 0.0169 ms 51.6% + triton_convolution_91 0.0198 ms 44.2% + triton_convolution_90 0.0227 ms 38.4% +SingleProcess AUTOTUNE takes 4.3443 seconds +AUTOTUNE convolution(1x256x8x8, 512x256x3x3) + convolution 0.0189 ms 100.0% + triton_convolution_113 0.1296 ms 14.6% + triton_convolution_117 0.1451 ms 13.0% + triton_convolution_114 0.1645 ms 11.5% + triton_convolution_115 0.1897 ms 10.0% + triton_convolution_112 0.2001 ms 9.5% + triton_convolution_116 0.2213 ms 8.6% + triton_convolution_111 0.5017 ms 3.8% +SingleProcess AUTOTUNE takes 3.5988 seconds +AUTOTUNE convolution(1x512x4x4, 512x512x3x3) + convolution 0.0225 ms 100.0% + triton_convolution_122 0.2108 ms 10.7% + triton_convolution_120 0.3178 ms 7.1% + triton_convolution_123 0.3263 ms 6.9% + triton_convolution_119 0.3455 ms 6.5% + triton_convolution_121 0.3556 ms 6.3% + triton_convolution_118 0.3774 ms 6.0% +SingleProcess AUTOTUNE takes 2.3202 seconds +AUTOTUNE convolution(1x256x8x8, 512x256x1x1) + convolution 0.0109 ms 100.0% + triton_convolution_126 0.0157 ms 69.1% + triton_convolution_128 0.0162 ms 67.2% + triton_convolution_127 0.0177 ms 61.5% + triton_convolution_130 0.0190 ms 57.3% + triton_convolution_125 0.0203 ms 53.6% + triton_convolution_129 0.0252 ms 43.3% + triton_convolution_124 0.0370 ms 29.4% +SingleProcess AUTOTUNE takes 3.4069 seconds +AUTOTUNE int_mm(1x512, 512x65, 1x65) + triton_mm_152 0.0091 ms 100.0% + triton_mm_151 0.0098 ms 93.1% + triton_mm_148 0.0098 ms 92.4% + triton_mm_149 0.0100 ms 90.7% + triton_mm_147 0.0104 ms 87.1% + triton_mm_145 0.0123 ms 73.8% + triton_mm_144 0.0125 ms 72.4% + triton_mm_143 0.0146 ms 62.1% + triton_mm_146 0.0158 ms 57.6% + triton_mm_150 0.0203 ms 44.9% +SingleProcess AUTOTUNE takes 3.0051 seconds +pass-sqnr-55.632 + loading model: 0it [00:00, ?it/s]WARNING:common:Model Super_SloMo does not support bfloat16, running with amp instead + loading model: 0it [00:03, ?it/s] +WARNING:common:Model Super_SloMo does not support bfloat16, running with amp instead +Super_SloMo +cuda eval Super_SloMo int8dynamic-bs1-acc +WARNING:common:Model Super_SloMo does not support bfloat16, running with amp instead +AUTOTUNE convolution(1x256x44x44, 512x256x3x3) + convolution 0.0382 ms 100.0% + triton_convolution_420 0.2179 ms 17.5% + triton_convolution_418 0.2303 ms 16.6% + triton_convolution_421 0.2826 ms 13.5% + triton_convolution_419 0.3583 ms 10.7% + triton_convolution_415 0.5132 ms 7.4% + triton_convolution_416 0.5866 ms 6.5% + triton_convolution_417 0.9438 ms 4.0% +SingleProcess AUTOTUNE takes 1.3161 seconds +AUTOTUNE convolution(1x512x44x44, 512x512x3x3) + convolution 0.0660 ms 100.0% + triton_convolution_427 0.4709 ms 14.0% + triton_convolution_428 0.5663 ms 11.7% + triton_convolution_425 0.6668 ms 9.9% + triton_convolution_422 1.0076 ms 6.5% + triton_convolution_426 1.0143 ms 6.5% + triton_convolution_423 1.1858 ms 5.6% + triton_convolution_424 1.8958 ms 3.5% +SingleProcess AUTOTUNE takes 1.1083 seconds +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +alexnet +cuda eval alexnet int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for alexnet. Setting accuracy check to cosine +pass-sqnr-39.586 + loading model: 0it [00:00, ?it/s]basic_gnn_edgecnn + loading model: 0it [00:03, ?it/s] +cuda eval basic_gnn_edgecnn int8dynamic-bs1-acc +skipping cudagraphs due to ['incompatible ops'] +pass-sqnr-50.445 + loading model: 0it [00:00, ?it/s]basic_gnn_gcn + loading model: 0it [00:03, ?it/s] +cuda eval basic_gnn_gcn int8dynamic-bs1-acc +skipping cudagraphs due to ['incompatible ops'] +skipping cudagraphs due to ['incompatible ops'] +skipping cudagraphs due to ['incompatible ops'] +skipping cudagraphs due to ['incompatible ops'] +pass-sqnr-49.672 + loading model: 0it [00:00, ?it/s]basic_gnn_gin + loading model: 0it [00:04, ?it/s] +cuda eval basic_gnn_gin int8dynamic-bs1-acc +skipping cudagraphs due to ['incompatible ops'] +pass-sqnr-44.907 + loading model: 0it [00:00, ?it/s]basic_gnn_sage + loading model: 0it [00:03, ?it/s] +cuda eval basic_gnn_sage int8dynamic-bs1-acc +skipping cudagraphs due to ['incompatible ops'] +pass-sqnr-48.436 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:09, ?it/s] +cm3leon_generate +cuda eval cm3leon_generate int8dynamic-bs1-acc +AUTOTUNE bmm(16x1x13, 16x13x96) + triton_bmm_1126 0.0063 ms 100.0% + triton_bmm_1129 0.0066 ms 96.1% + triton_bmm_1130 0.0069 ms 91.2% + triton_bmm_1124 0.0071 ms 89.1% + triton_bmm_1127 0.0071 ms 89.1% + triton_bmm_1128 0.0071 ms 89.1% + triton_bmm_1131 0.0071 ms 89.1% + triton_bmm_1132 0.0071 ms 88.7% + triton_bmm_1133 0.0071 ms 88.7% + triton_bmm_1125 0.0072 ms 87.0% +SingleProcess AUTOTUNE takes 1.6010 seconds +AUTOTUNE bmm(16x1x96, 16x96x14) + triton_bmm_1195 0.0066 ms 100.0% + triton_bmm_1197 0.0070 ms 93.6% + triton_bmm_1196 0.0071 ms 92.8% + triton_bmm_1198 0.0071 ms 92.8% + triton_bmm_1199 0.0071 ms 92.8% + triton_bmm_1194 0.0080 ms 81.8% + triton_bmm_1200 0.0084 ms 78.5% + triton_bmm_1201 0.0089 ms 74.0% + bmm 0.0531 ms 12.4% +SingleProcess AUTOTUNE takes 1.3269 seconds +AUTOTUNE bmm(16x1x14, 16x14x96) + triton_bmm_1223 0.0063 ms 100.0% + triton_bmm_1221 0.0066 ms 96.1% + triton_bmm_1216 0.0068 ms 92.1% + triton_bmm_1219 0.0068 ms 92.1% + triton_bmm_1222 0.0068 ms 92.1% + triton_bmm_1220 0.0069 ms 91.8% + triton_bmm_1214 0.0070 ms 90.0% + triton_bmm_1218 0.0070 ms 89.5% + triton_bmm_1217 0.0071 ms 89.1% + triton_bmm_1215 0.0073 ms 86.0% +SingleProcess AUTOTUNE takes 1.4733 seconds +AUTOTUNE bmm(16x1x96, 16x96x15) + triton_bmm_1287 0.0066 ms 100.0% + triton_bmm_1288 0.0066 ms 100.0% + triton_bmm_1286 0.0071 ms 92.8% + triton_bmm_1285 0.0073 ms 89.9% + triton_bmm_1289 0.0078 ms 83.7% + triton_bmm_1284 0.0081 ms 81.0% + triton_bmm_1291 0.0084 ms 78.2% + triton_bmm_1290 0.0085 ms 76.9% + bmm 0.0597 ms 11.0% +SingleProcess AUTOTUNE takes 1.2291 seconds +AUTOTUNE bmm(16x1x15, 16x15x96) + triton_bmm_1306 0.0063 ms 100.0% + triton_bmm_1308 0.0063 ms 100.0% + triton_bmm_1310 0.0063 ms 100.0% + triton_bmm_1307 0.0066 ms 96.1% + triton_bmm_1309 0.0066 ms 96.1% + triton_bmm_1311 0.0066 ms 96.1% + triton_bmm_1312 0.0066 ms 96.1% + triton_bmm_1304 0.0070 ms 90.4% + triton_bmm_1313 0.0071 ms 88.5% + triton_bmm_1305 0.0073 ms 86.6% +SingleProcess AUTOTUNE takes 1.4606 seconds +AUTOTUNE bmm(16x1x96, 16x96x16) + triton_bmm_1376 0.0065 ms 100.0% + triton_bmm_1378 0.0066 ms 99.0% + triton_bmm_1377 0.0071 ms 91.9% + triton_bmm_1375 0.0073 ms 89.4% + triton_bmm_1379 0.0078 ms 82.9% + triton_bmm_1374 0.0081 ms 80.6% + triton_bmm_1381 0.0083 ms 78.1% + triton_bmm_1380 0.0088 ms 73.8% + bmm 0.0559 ms 11.6% +SingleProcess AUTOTUNE takes 1.2995 seconds +AUTOTUNE bmm(16x1x16, 16x16x96) + triton_bmm_1394 0.0063 ms 100.0% + triton_bmm_1396 0.0063 ms 100.0% + triton_bmm_1398 0.0063 ms 100.0% + triton_bmm_1399 0.0063 ms 100.0% + triton_bmm_1402 0.0063 ms 100.0% + triton_bmm_1403 0.0063 ms 100.0% + triton_bmm_1395 0.0063 ms 99.5% + triton_bmm_1397 0.0066 ms 96.1% + triton_bmm_1401 0.0066 ms 96.1% + bmm 0.0068 ms 92.5% +SingleProcess AUTOTUNE takes 1.4498 seconds +AUTOTUNE bmm(16x1x96, 16x96x17) + triton_bmm_1466 0.0066 ms 100.0% + triton_bmm_1467 0.0066 ms 100.0% + triton_bmm_1468 0.0066 ms 100.0% + triton_bmm_1465 0.0073 ms 89.5% + triton_bmm_1464 0.0078 ms 84.4% + triton_bmm_1469 0.0086 ms 76.5% + triton_bmm_1470 0.0088 ms 74.3% + triton_bmm_1471 0.0089 ms 74.0% + bmm 0.0541 ms 12.1% +SingleProcess AUTOTUNE takes 1.2320 seconds +AUTOTUNE bmm(16x1x17, 16x17x96) + triton_bmm_1485 0.0066 ms 100.0% + triton_bmm_1491 0.0066 ms 99.5% + triton_bmm_1484 0.0068 ms 96.7% + triton_bmm_1490 0.0068 ms 96.7% + bmm 0.0068 ms 96.2% + triton_bmm_1489 0.0068 ms 96.2% + triton_bmm_1492 0.0068 ms 96.2% + triton_bmm_1494 0.0070 ms 93.2% + triton_bmm_1487 0.0072 ms 90.5% + triton_bmm_1486 0.0073 ms 89.5% +SingleProcess AUTOTUNE takes 1.6116 seconds +AUTOTUNE bmm(16x1x96, 16x96x18) + triton_bmm_1559 0.0066 ms 100.0% + triton_bmm_1557 0.0071 ms 92.8% + triton_bmm_1558 0.0071 ms 92.8% + triton_bmm_1556 0.0073 ms 89.5% + triton_bmm_1555 0.0082 ms 79.8% + triton_bmm_1561 0.0083 ms 78.8% + triton_bmm_1562 0.0083 ms 78.8% + triton_bmm_1560 0.0093 ms 70.4% + bmm 0.0583 ms 11.2% +SingleProcess AUTOTUNE takes 1.2974 seconds +AUTOTUNE bmm(16x1x18, 16x18x96) + triton_bmm_1579 0.0066 ms 100.0% + triton_bmm_1581 0.0066 ms 100.0% + triton_bmm_1580 0.0068 ms 96.2% + triton_bmm_1584 0.0070 ms 93.2% + triton_bmm_1576 0.0071 ms 92.8% + triton_bmm_1577 0.0071 ms 92.8% + triton_bmm_1585 0.0071 ms 92.8% + triton_bmm_1575 0.0071 ms 92.3% + triton_bmm_1583 0.0074 ms 89.1% + triton_bmm_1578 0.0076 ms 86.5% +SingleProcess AUTOTUNE takes 1.6988 seconds +AUTOTUNE bmm(16x1x96, 16x96x19) + triton_bmm_1647 0.0068 ms 100.0% + triton_bmm_1648 0.0071 ms 95.9% + triton_bmm_1649 0.0071 ms 95.5% + triton_bmm_1650 0.0073 ms 93.0% + triton_bmm_1646 0.0078 ms 86.9% + triton_bmm_1651 0.0086 ms 79.1% + triton_bmm_1653 0.0086 ms 79.1% + triton_bmm_1652 0.0091 ms 74.6% + bmm 0.0550 ms 12.3% +SingleProcess AUTOTUNE takes 1.2436 seconds +AUTOTUNE bmm(16x1x19, 16x19x96) + triton_bmm_1667 0.0066 ms 100.0% + triton_bmm_1670 0.0068 ms 97.2% + triton_bmm_1671 0.0068 ms 96.7% + triton_bmm_1676 0.0071 ms 93.2% + triton_bmm_1672 0.0073 ms 90.4% + triton_bmm_1673 0.0073 ms 90.4% + triton_bmm_1666 0.0073 ms 90.0% + triton_bmm_1668 0.0073 ms 90.0% + triton_bmm_1669 0.0073 ms 90.0% + triton_bmm_1674 0.0073 ms 90.0% +SingleProcess AUTOTUNE takes 1.6032 seconds +AUTOTUNE bmm(16x1x96, 16x96x20) + triton_bmm_1739 0.0066 ms 100.0% + triton_bmm_1740 0.0066 ms 100.0% + triton_bmm_1741 0.0066 ms 100.0% + triton_bmm_1738 0.0073 ms 89.5% + triton_bmm_1742 0.0078 ms 84.0% + triton_bmm_1737 0.0083 ms 79.3% + triton_bmm_1743 0.0084 ms 77.9% + triton_bmm_1744 0.0089 ms 74.0% + bmm 0.0537 ms 12.2% +SingleProcess AUTOTUNE takes 1.2302 seconds +AUTOTUNE bmm(16x1x20, 16x20x96) + triton_bmm_1757 0.0066 ms 100.0% + triton_bmm_1758 0.0066 ms 100.0% + triton_bmm_1761 0.0066 ms 100.0% + triton_bmm_1765 0.0068 ms 96.2% + triton_bmm_1760 0.0071 ms 92.8% + triton_bmm_1763 0.0071 ms 92.8% + triton_bmm_1764 0.0071 ms 92.8% + triton_bmm_1759 0.0071 ms 92.3% + bmm 0.0072 ms 90.7% + triton_bmm_1762 0.0074 ms 88.4% +SingleProcess AUTOTUNE takes 1.6008 seconds +AUTOTUNE bmm(16x1x208, 16x208x96) + triton_bmm_19665 0.0080 ms 100.0% + triton_bmm_19664 0.0081 ms 99.6% + triton_bmm_19668 0.0081 ms 99.6% + triton_bmm_19669 0.0083 ms 96.5% + triton_bmm_19666 0.0084 ms 96.2% + bmm 0.0086 ms 93.3% + triton_bmm_19662 0.0088 ms 90.9% + triton_bmm_19663 0.0089 ms 90.0% + triton_bmm_19661 0.0091 ms 88.4% + triton_bmm_19660 0.0109 ms 73.8% +SingleProcess AUTOTUNE takes 3.2607 seconds +AUTOTUNE bmm(16x1x96, 16x96x209) + triton_bmm_19734 0.0073 ms 100.0% + triton_bmm_19733 0.0076 ms 96.6% + triton_bmm_19735 0.0076 ms 96.6% + triton_bmm_19738 0.0078 ms 93.9% + triton_bmm_19736 0.0078 ms 93.5% + triton_bmm_19740 0.0081 ms 90.7% + triton_bmm_19737 0.0081 ms 90.0% + triton_bmm_19732 0.0083 ms 88.1% + triton_bmm_19739 0.0088 ms 83.0% + triton_bmm_19741 0.0096 ms 76.7% +SingleProcess AUTOTUNE takes 3.8095 seconds +AUTOTUNE bmm(16x1x209, 16x209x96) + triton_bmm_19765 0.0093 ms 100.0% + triton_bmm_19764 0.0100 ms 93.6% + bmm 0.0111 ms 84.5% + triton_bmm_19760 0.0113 ms 82.5% + triton_bmm_19762 0.0113 ms 82.5% + triton_bmm_19758 0.0116 ms 80.7% + triton_bmm_19761 0.0116 ms 80.4% + triton_bmm_19757 0.0124 ms 75.3% + triton_bmm_19759 0.0128 ms 72.8% + triton_bmm_19756 0.0141 ms 66.1% +SingleProcess AUTOTUNE takes 3.7730 seconds +AUTOTUNE bmm(16x1x96, 16x96x210) + triton_bmm_19832 0.0071 ms 100.0% + triton_bmm_19834 0.0071 ms 100.0% + triton_bmm_19829 0.0076 ms 93.7% + triton_bmm_19833 0.0078 ms 91.0% + triton_bmm_19830 0.0079 ms 90.2% + triton_bmm_19836 0.0081 ms 88.1% + triton_bmm_19831 0.0081 ms 87.7% + triton_bmm_19835 0.0088 ms 80.4% + triton_bmm_19828 0.0089 ms 80.1% + triton_bmm_19838 0.0091 ms 78.2% +SingleProcess AUTOTUNE takes 3.9563 seconds +AUTOTUNE bmm(16x1x210, 16x210x96) + triton_bmm_19857 0.0078 ms 100.0% + triton_bmm_19858 0.0085 ms 91.7% + triton_bmm_19855 0.0086 ms 91.0% + triton_bmm_19860 0.0086 ms 90.9% + triton_bmm_19861 0.0086 ms 90.4% + triton_bmm_19856 0.0088 ms 88.4% + bmm 0.0091 ms 85.9% + triton_bmm_19854 0.0096 ms 81.7% + triton_bmm_19853 0.0096 ms 81.3% + triton_bmm_19852 0.0116 ms 67.2% +SingleProcess AUTOTUNE takes 4.0140 seconds +AUTOTUNE bmm(16x1x96, 16x96x211) + triton_bmm_19928 0.0071 ms 100.0% + triton_bmm_19930 0.0071 ms 99.6% + triton_bmm_19932 0.0076 ms 94.1% + triton_bmm_19929 0.0078 ms 91.0% + triton_bmm_19926 0.0079 ms 90.2% + triton_bmm_19927 0.0082 ms 86.4% + triton_bmm_19925 0.0083 ms 85.4% + triton_bmm_19924 0.0089 ms 80.1% + triton_bmm_19933 0.0091 ms 78.4% + triton_bmm_19931 0.0093 ms 76.0% +SingleProcess AUTOTUNE takes 3.6439 seconds +AUTOTUNE bmm(16x1x211, 16x211x96) + triton_bmm_19957 0.0093 ms 100.0% + triton_bmm_19956 0.0104 ms 89.5% + bmm 0.0111 ms 83.9% + triton_bmm_19952 0.0113 ms 82.2% + triton_bmm_19950 0.0116 ms 80.4% + triton_bmm_19954 0.0116 ms 80.2% + triton_bmm_19953 0.0116 ms 79.9% + triton_bmm_19949 0.0124 ms 75.4% + triton_bmm_19951 0.0126 ms 73.9% + triton_bmm_19948 0.0141 ms 66.0% +SingleProcess AUTOTUNE takes 4.2061 seconds +AUTOTUNE bmm(16x1x96, 16x96x212) + triton_bmm_20024 0.0071 ms 100.0% + triton_bmm_20028 0.0076 ms 94.1% + triton_bmm_20021 0.0076 ms 93.7% + triton_bmm_20026 0.0078 ms 90.8% + triton_bmm_20022 0.0078 ms 90.6% + triton_bmm_20023 0.0081 ms 87.7% + triton_bmm_20025 0.0081 ms 87.7% + triton_bmm_20020 0.0089 ms 80.1% + triton_bmm_20030 0.0091 ms 78.2% + triton_bmm_20031 0.0091 ms 78.2% +SingleProcess AUTOTUNE takes 3.6144 seconds +AUTOTUNE bmm(16x1x212, 16x212x96) + triton_bmm_20049 0.0078 ms 100.0% + triton_bmm_20050 0.0078 ms 99.6% + triton_bmm_20052 0.0081 ms 96.8% + bmm 0.0085 ms 91.4% + triton_bmm_20048 0.0086 ms 90.7% + triton_bmm_20053 0.0086 ms 90.7% + triton_bmm_20047 0.0091 ms 85.9% + triton_bmm_20046 0.0094 ms 83.3% + triton_bmm_20045 0.0096 ms 81.3% + triton_bmm_20044 0.0111 ms 70.5% +SingleProcess AUTOTUNE takes 4.1415 seconds +AUTOTUNE bmm(16x1x96, 16x96x213) + triton_bmm_20118 0.0073 ms 100.0% + triton_bmm_20119 0.0076 ms 96.6% + triton_bmm_20122 0.0078 ms 94.2% + triton_bmm_20120 0.0078 ms 93.5% + triton_bmm_20124 0.0080 ms 91.6% + triton_bmm_20121 0.0081 ms 90.2% + triton_bmm_20117 0.0082 ms 89.6% + triton_bmm_20116 0.0083 ms 88.1% + triton_bmm_20123 0.0088 ms 83.0% + triton_bmm_20126 0.0091 ms 80.6% +SingleProcess AUTOTUNE takes 3.7273 seconds +AUTOTUNE bmm(16x1x213, 16x213x96) + triton_bmm_20149 0.0099 ms 100.0% + triton_bmm_20148 0.0104 ms 94.5% + bmm 0.0112 ms 87.7% + triton_bmm_20144 0.0113 ms 87.0% + triton_bmm_20145 0.0116 ms 84.8% + triton_bmm_20146 0.0117 ms 84.0% + triton_bmm_20141 0.0118 ms 83.2% + triton_bmm_20142 0.0121 ms 81.5% + triton_bmm_20143 0.0126 ms 78.2% + triton_bmm_20140 0.0138 ms 71.3% +SingleProcess AUTOTUNE takes 3.6867 seconds +AUTOTUNE bmm(16x1x96, 16x96x214) + triton_bmm_20218 0.0071 ms 100.0% + triton_bmm_20216 0.0071 ms 99.6% + triton_bmm_20213 0.0076 ms 93.7% + triton_bmm_20217 0.0077 ms 92.5% + triton_bmm_20214 0.0079 ms 90.2% + triton_bmm_20220 0.0081 ms 87.7% + triton_bmm_20215 0.0081 ms 87.2% + triton_bmm_20212 0.0088 ms 80.4% + triton_bmm_20222 0.0091 ms 78.2% + triton_bmm_20219 0.0093 ms 76.0% +SingleProcess AUTOTUNE takes 3.6644 seconds +AUTOTUNE bmm(16x1x214, 16x214x96) + triton_bmm_20244 0.0081 ms 100.0% + triton_bmm_20241 0.0084 ms 96.6% + triton_bmm_20242 0.0085 ms 94.4% + triton_bmm_20240 0.0088 ms 91.3% + triton_bmm_20245 0.0088 ms 91.3% + triton_bmm_20239 0.0091 ms 88.7% + bmm 0.0094 ms 86.0% + triton_bmm_20238 0.0096 ms 84.0% + triton_bmm_20237 0.0096 ms 83.7% + triton_bmm_20236 0.0111 ms 72.6% +SingleProcess AUTOTUNE takes 3.8951 seconds +AUTOTUNE bmm(16x1x96, 16x96x215) + triton_bmm_20310 0.0073 ms 100.0% + triton_bmm_20309 0.0076 ms 96.6% + triton_bmm_20312 0.0078 ms 93.5% + triton_bmm_20314 0.0078 ms 93.5% + triton_bmm_20316 0.0081 ms 90.5% + triton_bmm_20311 0.0081 ms 90.2% + triton_bmm_20308 0.0083 ms 88.1% + triton_bmm_20313 0.0083 ms 88.1% + triton_bmm_20315 0.0088 ms 83.0% + triton_bmm_20317 0.0096 ms 76.3% +SingleProcess AUTOTUNE takes 3.7790 seconds +AUTOTUNE bmm(16x1x215, 16x215x96) + triton_bmm_20341 0.0093 ms 100.0% + triton_bmm_20340 0.0104 ms 89.6% + triton_bmm_20338 0.0113 ms 82.7% + bmm 0.0114 ms 82.3% + triton_bmm_20334 0.0116 ms 80.7% + triton_bmm_20337 0.0116 ms 80.4% + triton_bmm_20336 0.0119 ms 78.7% + triton_bmm_20333 0.0121 ms 77.5% + triton_bmm_20335 0.0123 ms 75.8% + triton_bmm_20332 0.0138 ms 67.6% +SingleProcess AUTOTUNE takes 3.6628 seconds +AUTOTUNE bmm(16x1x96, 16x96x216) + triton_bmm_20405 0.0076 ms 100.0% + triton_bmm_20412 0.0076 ms 100.0% + triton_bmm_20410 0.0078 ms 96.9% + triton_bmm_20409 0.0078 ms 96.7% + triton_bmm_20408 0.0078 ms 96.3% + triton_bmm_20406 0.0080 ms 94.0% + triton_bmm_20407 0.0081 ms 93.3% + triton_bmm_20413 0.0085 ms 88.4% + triton_bmm_20404 0.0089 ms 85.2% + triton_bmm_20414 0.0091 ms 83.1% +SingleProcess AUTOTUNE takes 4.3108 seconds +AUTOTUNE bmm(16x1x216, 16x216x96) + triton_bmm_20433 0.0081 ms 100.0% + triton_bmm_20432 0.0083 ms 97.3% + triton_bmm_20436 0.0083 ms 96.9% + triton_bmm_20434 0.0083 ms 96.7% + triton_bmm_20437 0.0084 ms 96.6% + bmm 0.0086 ms 93.3% + triton_bmm_20430 0.0090 ms 89.7% + triton_bmm_20431 0.0091 ms 88.7% + triton_bmm_20429 0.0096 ms 84.0% + triton_bmm_20428 0.0114 ms 70.6% +SingleProcess AUTOTUNE takes 3.7572 seconds +AUTOTUNE bmm(16x1x96, 16x96x217) + triton_bmm_20504 0.0072 ms 100.0% + triton_bmm_20502 0.0073 ms 98.7% + triton_bmm_20506 0.0078 ms 92.6% + triton_bmm_20508 0.0081 ms 89.3% + triton_bmm_20501 0.0083 ms 87.3% + triton_bmm_20503 0.0083 ms 87.3% + triton_bmm_20505 0.0083 ms 87.3% + triton_bmm_20500 0.0083 ms 86.9% + triton_bmm_20509 0.0091 ms 79.9% + triton_bmm_20511 0.0091 ms 79.6% +SingleProcess AUTOTUNE takes 3.7814 seconds +AUTOTUNE bmm(16x1x217, 16x217x96) + triton_bmm_20533 0.0093 ms 100.0% + triton_bmm_20532 0.0101 ms 92.7% + triton_bmm_20529 0.0113 ms 82.7% + triton_bmm_20528 0.0113 ms 82.5% + triton_bmm_20530 0.0113 ms 82.5% + bmm 0.0116 ms 80.7% + triton_bmm_20526 0.0116 ms 80.7% + triton_bmm_20525 0.0121 ms 77.5% + triton_bmm_20527 0.0129 ms 72.6% + triton_bmm_20524 0.0138 ms 67.6% +SingleProcess AUTOTUNE takes 3.7624 seconds +AUTOTUNE bmm(16x1x96, 16x96x218) + triton_bmm_20598 0.0073 ms 100.0% + triton_bmm_20597 0.0076 ms 96.6% + triton_bmm_20599 0.0076 ms 96.6% + triton_bmm_20601 0.0077 ms 94.6% + triton_bmm_20602 0.0078 ms 93.7% + triton_bmm_20600 0.0078 ms 93.5% + triton_bmm_20604 0.0081 ms 90.5% + triton_bmm_20596 0.0088 ms 83.0% + triton_bmm_20606 0.0092 ms 79.8% + triton_bmm_20603 0.0093 ms 78.4% +SingleProcess AUTOTUNE takes 3.6637 seconds +AUTOTUNE bmm(16x1x218, 16x218x96) + triton_bmm_20625 0.0078 ms 100.0% + triton_bmm_20626 0.0081 ms 96.8% + triton_bmm_20628 0.0081 ms 96.4% + triton_bmm_20624 0.0083 ms 93.8% + triton_bmm_20623 0.0086 ms 91.0% + triton_bmm_20629 0.0088 ms 88.7% + triton_bmm_20622 0.0090 ms 86.8% + bmm 0.0091 ms 85.9% + triton_bmm_20621 0.0096 ms 81.1% + triton_bmm_20620 0.0118 ms 65.9% +SingleProcess AUTOTUNE takes 3.9297 seconds +AUTOTUNE bmm(16x1x96, 16x96x219) + triton_bmm_20694 0.0073 ms 100.0% + triton_bmm_20695 0.0076 ms 96.6% + triton_bmm_20700 0.0076 ms 96.6% + triton_bmm_20693 0.0077 ms 95.0% + triton_bmm_20697 0.0078 ms 93.9% + triton_bmm_20696 0.0078 ms 93.5% + triton_bmm_20698 0.0078 ms 93.5% + triton_bmm_20692 0.0083 ms 88.1% + triton_bmm_20699 0.0088 ms 83.0% + triton_bmm_20701 0.0091 ms 80.9% +SingleProcess AUTOTUNE takes 3.9577 seconds +AUTOTUNE bmm(16x1x219, 16x219x96) + triton_bmm_20725 0.0093 ms 100.0% + triton_bmm_20724 0.0101 ms 92.7% + triton_bmm_20721 0.0112 ms 83.2% + triton_bmm_20722 0.0113 ms 82.5% + triton_bmm_20720 0.0114 ms 82.3% + triton_bmm_20718 0.0116 ms 80.7% + bmm 0.0117 ms 79.8% + triton_bmm_20719 0.0123 ms 75.8% + triton_bmm_20717 0.0126 ms 74.3% + triton_bmm_20716 0.0144 ms 65.0% +SingleProcess AUTOTUNE takes 4.1489 seconds +AUTOTUNE bmm(16x1x96, 16x96x220) + triton_bmm_20792 0.0071 ms 100.0% + triton_bmm_20789 0.0076 ms 93.7% + triton_bmm_20793 0.0077 ms 91.7% + triton_bmm_20794 0.0078 ms 90.8% + triton_bmm_20790 0.0080 ms 89.2% + triton_bmm_20796 0.0080 ms 88.4% + triton_bmm_20791 0.0081 ms 87.7% + triton_bmm_20788 0.0083 ms 85.4% + triton_bmm_20795 0.0088 ms 80.7% + triton_bmm_20797 0.0088 ms 80.4% +SingleProcess AUTOTUNE takes 4.1295 seconds +AUTOTUNE bmm(16x1x220, 16x220x96) + triton_bmm_20817 0.0078 ms 100.0% + triton_bmm_20818 0.0078 ms 99.6% + triton_bmm_20820 0.0081 ms 96.8% + triton_bmm_20816 0.0081 ms 96.4% + bmm 0.0085 ms 91.4% + triton_bmm_20815 0.0086 ms 91.0% + triton_bmm_20821 0.0087 ms 89.4% + triton_bmm_20814 0.0089 ms 87.8% + triton_bmm_20813 0.0093 ms 84.1% + triton_bmm_20812 0.0116 ms 67.4% +SingleProcess AUTOTUNE takes 4.0625 seconds +AUTOTUNE bmm(16x1x96, 16x96x221) + triton_bmm_20890 0.0073 ms 100.0% + triton_bmm_20885 0.0077 ms 94.2% + triton_bmm_20888 0.0078 ms 93.1% + triton_bmm_20886 0.0080 ms 91.4% + triton_bmm_20892 0.0081 ms 90.1% + triton_bmm_20884 0.0083 ms 87.7% + triton_bmm_20889 0.0083 ms 87.7% + triton_bmm_20887 0.0083 ms 87.5% + triton_bmm_20893 0.0091 ms 80.6% + triton_bmm_20895 0.0091 ms 80.3% +SingleProcess AUTOTUNE takes 3.7300 seconds +AUTOTUNE bmm(16x1x221, 16x221x96) + triton_bmm_20916 0.0101 ms 100.0% + triton_bmm_20917 0.0101 ms 100.0% + triton_bmm_20914 0.0113 ms 89.0% + triton_bmm_20912 0.0114 ms 88.7% + triton_bmm_20913 0.0117 ms 86.1% + bmm 0.0119 ms 84.9% + triton_bmm_20910 0.0121 ms 83.3% + triton_bmm_20909 0.0126 ms 80.2% + triton_bmm_20911 0.0129 ms 78.4% + triton_bmm_20908 0.0138 ms 72.9% +SingleProcess AUTOTUNE takes 3.9156 seconds +AUTOTUNE bmm(16x1x96, 16x96x222) + triton_bmm_20988 0.0076 ms 100.0% + triton_bmm_20981 0.0076 ms 99.6% + triton_bmm_20984 0.0077 ms 97.7% + triton_bmm_20985 0.0078 ms 96.7% + triton_bmm_20986 0.0078 ms 96.7% + triton_bmm_20982 0.0081 ms 93.7% + triton_bmm_20983 0.0083 ms 91.3% + triton_bmm_20987 0.0088 ms 85.5% + triton_bmm_20980 0.0089 ms 85.2% + triton_bmm_20991 0.0091 ms 83.4% +SingleProcess AUTOTUNE takes 3.7538 seconds +AUTOTUNE bmm(16x1x222, 16x222x96) + triton_bmm_21009 0.0078 ms 100.0% + triton_bmm_21010 0.0078 ms 99.6% + triton_bmm_21007 0.0086 ms 91.0% + triton_bmm_21012 0.0086 ms 90.7% + triton_bmm_21008 0.0088 ms 88.7% + triton_bmm_21013 0.0088 ms 88.7% + bmm 0.0091 ms 85.9% + triton_bmm_21006 0.0096 ms 81.6% + triton_bmm_21005 0.0097 ms 80.8% + triton_bmm_21004 0.0116 ms 67.0% +SingleProcess AUTOTUNE takes 4.0953 seconds +AUTOTUNE bmm(16x1x96, 16x96x223) + triton_bmm_21080 0.0073 ms 100.0% + triton_bmm_21078 0.0073 ms 99.6% + triton_bmm_21084 0.0076 ms 96.6% + triton_bmm_21077 0.0076 ms 96.2% + triton_bmm_21082 0.0078 ms 93.1% + triton_bmm_21079 0.0082 ms 89.1% + triton_bmm_21076 0.0083 ms 87.7% + triton_bmm_21081 0.0083 ms 87.7% + triton_bmm_21087 0.0091 ms 80.3% + triton_bmm_21083 0.0094 ms 77.8% +SingleProcess AUTOTUNE takes 3.9699 seconds +AUTOTUNE bmm(16x1x223, 16x223x96) + triton_bmm_21108 0.0101 ms 100.0% + triton_bmm_21109 0.0101 ms 99.8% + triton_bmm_21104 0.0108 ms 92.9% + triton_bmm_21102 0.0111 ms 91.0% + triton_bmm_21106 0.0113 ms 89.0% + triton_bmm_21105 0.0117 ms 85.8% + bmm 0.0121 ms 83.3% + triton_bmm_21103 0.0123 ms 82.2% + triton_bmm_21101 0.0126 ms 79.9% + triton_bmm_21100 0.0149 ms 67.7% +SingleProcess AUTOTUNE takes 3.6340 seconds +AUTOTUNE bmm(16x1x96, 16x96x224) + triton_bmm_21176 0.0073 ms 100.0% + triton_bmm_21178 0.0073 ms 100.0% + triton_bmm_21180 0.0076 ms 96.6% + triton_bmm_21173 0.0076 ms 96.2% + triton_bmm_21175 0.0076 ms 96.2% + triton_bmm_21177 0.0076 ms 96.2% + triton_bmm_21174 0.0081 ms 90.5% + triton_bmm_21181 0.0086 ms 85.1% + triton_bmm_21179 0.0088 ms 82.6% + triton_bmm_21172 0.0089 ms 82.3% +SingleProcess AUTOTUNE takes 3.6840 seconds +AUTOTUNE bmm(16x1x224, 16x224x96) + triton_bmm_21202 0.0078 ms 100.0% + triton_bmm_21204 0.0081 ms 97.2% + triton_bmm_21201 0.0084 ms 93.5% + triton_bmm_21199 0.0086 ms 91.4% + triton_bmm_21200 0.0088 ms 88.8% + triton_bmm_21205 0.0088 ms 88.8% + triton_bmm_21198 0.0090 ms 87.5% + triton_bmm_21197 0.0091 ms 86.3% + bmm 0.0096 ms 81.9% + triton_bmm_21196 0.0111 ms 70.8% +SingleProcess AUTOTUNE takes 3.8002 seconds +AUTOTUNE bmm(16x1x96, 16x96x225) + triton_bmm_21270 0.0073 ms 100.0% + triton_bmm_21272 0.0073 ms 100.0% + triton_bmm_21274 0.0073 ms 100.0% + triton_bmm_21271 0.0076 ms 96.6% + triton_bmm_21273 0.0078 ms 93.9% + triton_bmm_21276 0.0080 ms 91.2% + triton_bmm_21269 0.0082 ms 89.1% + triton_bmm_21275 0.0088 ms 83.0% + triton_bmm_21268 0.0089 ms 82.7% + triton_bmm_21277 0.0091 ms 80.6% +SingleProcess AUTOTUNE takes 3.8735 seconds +AUTOTUNE bmm(16x1x225, 16x225x96) + triton_bmm_21301 0.0100 ms 100.0% + bmm 0.0103 ms 96.9% + triton_bmm_21300 0.0106 ms 94.6% + triton_bmm_21298 0.0116 ms 86.5% + triton_bmm_21294 0.0121 ms 83.0% + triton_bmm_21297 0.0121 ms 82.6% + triton_bmm_21293 0.0124 ms 81.1% + triton_bmm_21296 0.0124 ms 81.1% + triton_bmm_21295 0.0133 ms 75.1% + triton_bmm_21292 0.0144 ms 69.7% +SingleProcess AUTOTUNE takes 3.7598 seconds +AUTOTUNE bmm(16x1x96, 16x96x226) + triton_bmm_21366 0.0073 ms 100.0% + triton_bmm_21370 0.0073 ms 100.0% + triton_bmm_21372 0.0076 ms 97.0% + triton_bmm_21367 0.0076 ms 96.6% + triton_bmm_21368 0.0078 ms 93.5% + triton_bmm_21364 0.0083 ms 88.1% + triton_bmm_21365 0.0083 ms 88.1% + triton_bmm_21369 0.0083 ms 88.1% + triton_bmm_21371 0.0088 ms 83.0% + triton_bmm_21375 0.0096 ms 76.3% +SingleProcess AUTOTUNE takes 4.0708 seconds +AUTOTUNE bmm(16x1x226, 16x226x96) + triton_bmm_21393 0.0078 ms 100.0% + triton_bmm_21394 0.0081 ms 97.2% + triton_bmm_21392 0.0086 ms 91.4% + bmm 0.0086 ms 91.1% + triton_bmm_21396 0.0086 ms 91.1% + triton_bmm_21397 0.0088 ms 88.8% + triton_bmm_21390 0.0091 ms 86.0% + triton_bmm_21391 0.0094 ms 83.6% + triton_bmm_21389 0.0099 ms 79.5% + triton_bmm_21388 0.0118 ms 66.4% +SingleProcess AUTOTUNE takes 3.8769 seconds +AUTOTUNE bmm(16x1x96, 16x96x227) + triton_bmm_21462 0.0073 ms 100.0% + triton_bmm_21466 0.0073 ms 100.0% + triton_bmm_21468 0.0076 ms 97.0% + triton_bmm_21463 0.0076 ms 96.6% + triton_bmm_21464 0.0078 ms 93.5% + triton_bmm_21461 0.0083 ms 88.4% + triton_bmm_21465 0.0083 ms 88.1% + triton_bmm_21460 0.0089 ms 82.1% + triton_bmm_21469 0.0091 ms 80.6% + triton_bmm_21467 0.0095 ms 77.4% +SingleProcess AUTOTUNE takes 3.8254 seconds +AUTOTUNE bmm(16x1x227, 16x227x96) + triton_bmm_21493 0.0096 ms 100.0% + triton_bmm_21492 0.0101 ms 94.9% + bmm 0.0108 ms 88.5% + triton_bmm_21489 0.0116 ms 82.6% + triton_bmm_21488 0.0118 ms 80.8% + triton_bmm_21490 0.0121 ms 79.1% + triton_bmm_21485 0.0126 ms 76.1% + triton_bmm_21486 0.0126 ms 75.9% + triton_bmm_21487 0.0128 ms 74.6% + triton_bmm_21484 0.0144 ms 66.3% +SingleProcess AUTOTUNE takes 4.0795 seconds +AUTOTUNE bmm(16x1x96, 16x96x228) + triton_bmm_21560 0.0073 ms 100.0% + triton_bmm_21558 0.0073 ms 99.6% + triton_bmm_21562 0.0078 ms 93.1% + triton_bmm_21557 0.0080 ms 91.6% + triton_bmm_21564 0.0080 ms 91.2% + triton_bmm_21559 0.0081 ms 90.1% + triton_bmm_21556 0.0083 ms 87.7% + triton_bmm_21561 0.0083 ms 87.7% + triton_bmm_21567 0.0091 ms 80.3% + triton_bmm_21566 0.0091 ms 80.0% +SingleProcess AUTOTUNE takes 3.7486 seconds +AUTOTUNE bmm(16x1x228, 16x228x96) + triton_bmm_21586 0.0081 ms 100.0% + triton_bmm_21589 0.0081 ms 100.0% + triton_bmm_21584 0.0084 ms 96.6% + triton_bmm_21585 0.0086 ms 94.0% + triton_bmm_21588 0.0086 ms 93.3% + bmm 0.0088 ms 91.3% + triton_bmm_21583 0.0088 ms 91.3% + triton_bmm_21582 0.0096 ms 84.4% + triton_bmm_21581 0.0100 ms 80.6% + triton_bmm_21580 0.0116 ms 69.6% +SingleProcess AUTOTUNE takes 3.9119 seconds +AUTOTUNE bmm(16x1x96, 16x96x229) + triton_bmm_21654 0.0073 ms 100.0% + triton_bmm_21658 0.0073 ms 100.0% + triton_bmm_21655 0.0076 ms 96.6% + triton_bmm_21660 0.0076 ms 96.6% + triton_bmm_21653 0.0076 ms 95.8% + triton_bmm_21656 0.0078 ms 93.5% + triton_bmm_21652 0.0083 ms 88.1% + triton_bmm_21657 0.0083 ms 87.9% + triton_bmm_21659 0.0088 ms 83.0% + triton_bmm_21663 0.0091 ms 80.6% +SingleProcess AUTOTUNE takes 3.9164 seconds +AUTOTUNE bmm(16x1x229, 16x229x96) + triton_bmm_21685 0.0100 ms 100.0% + triton_bmm_21684 0.0101 ms 99.7% + bmm 0.0111 ms 90.2% + triton_bmm_21681 0.0118 ms 85.1% + triton_bmm_21678 0.0121 ms 83.1% + triton_bmm_21682 0.0122 ms 82.6% + triton_bmm_21680 0.0123 ms 81.8% + triton_bmm_21677 0.0126 ms 79.9% + triton_bmm_21679 0.0133 ms 75.3% + triton_bmm_21676 0.0146 ms 68.9% +SingleProcess AUTOTUNE takes 3.8453 seconds +AUTOTUNE bmm(16x1x96, 16x96x230) + triton_bmm_21752 0.0073 ms 100.0% + triton_bmm_21756 0.0076 ms 97.0% + triton_bmm_21751 0.0076 ms 96.6% + triton_bmm_21753 0.0078 ms 93.9% + triton_bmm_21754 0.0078 ms 93.7% + triton_bmm_21750 0.0080 ms 92.0% + triton_bmm_21749 0.0082 ms 89.1% + triton_bmm_21748 0.0091 ms 80.8% + triton_bmm_21755 0.0093 ms 78.4% + triton_bmm_21759 0.0096 ms 76.3% +SingleProcess AUTOTUNE takes 3.8551 seconds +AUTOTUNE bmm(16x1x230, 16x230x96) + triton_bmm_21777 0.0078 ms 100.0% + triton_bmm_21778 0.0081 ms 97.2% + triton_bmm_21780 0.0081 ms 97.2% + triton_bmm_21781 0.0082 ms 95.3% + triton_bmm_21775 0.0088 ms 88.8% + triton_bmm_21776 0.0091 ms 86.6% + triton_bmm_21774 0.0091 ms 86.3% + bmm 0.0097 ms 80.6% + triton_bmm_21773 0.0099 ms 79.3% + triton_bmm_21772 0.0124 ms 63.5% +SingleProcess AUTOTUNE takes 3.8956 seconds +AUTOTUNE bmm(16x1x96, 16x96x231) + triton_bmm_21848 0.0073 ms 100.0% + triton_bmm_21846 0.0075 ms 98.3% + triton_bmm_21850 0.0078 ms 93.7% + triton_bmm_21852 0.0081 ms 90.7% + triton_bmm_21847 0.0082 ms 89.8% + triton_bmm_21844 0.0083 ms 88.1% + triton_bmm_21845 0.0083 ms 88.1% + triton_bmm_21849 0.0083 ms 87.9% + triton_bmm_21851 0.0088 ms 83.0% + triton_bmm_21853 0.0091 ms 80.6% +SingleProcess AUTOTUNE takes 3.7668 seconds +AUTOTUNE bmm(16x1x231, 16x231x96) + triton_bmm_21877 0.0096 ms 100.0% + triton_bmm_21876 0.0107 ms 89.8% + bmm 0.0108 ms 88.2% + triton_bmm_21874 0.0118 ms 80.8% + triton_bmm_21872 0.0121 ms 79.3% + triton_bmm_21873 0.0122 ms 78.3% + triton_bmm_21870 0.0126 ms 75.7% + triton_bmm_21871 0.0129 ms 74.4% + triton_bmm_21869 0.0131 ms 72.9% + triton_bmm_21868 0.0154 ms 62.3% +SingleProcess AUTOTUNE takes 4.1333 seconds +AUTOTUNE bmm(16x1x96, 16x96x232) + triton_bmm_21946 0.0073 ms 100.0% + triton_bmm_21941 0.0076 ms 96.6% + triton_bmm_21943 0.0076 ms 96.6% + triton_bmm_21945 0.0078 ms 93.9% + triton_bmm_21944 0.0078 ms 93.5% + triton_bmm_21942 0.0081 ms 90.9% + triton_bmm_21948 0.0081 ms 90.5% + triton_bmm_21940 0.0083 ms 88.1% + triton_bmm_21947 0.0088 ms 83.0% + triton_bmm_21950 0.0091 ms 80.4% +SingleProcess AUTOTUNE takes 3.6485 seconds +AUTOTUNE bmm(16x1x232, 16x232x96) + triton_bmm_21972 0.0083 ms 100.0% + triton_bmm_21968 0.0084 ms 99.6% + triton_bmm_21970 0.0086 ms 97.0% + triton_bmm_21969 0.0086 ms 96.3% + triton_bmm_21973 0.0091 ms 91.5% + bmm 0.0093 ms 89.0% + triton_bmm_21967 0.0093 ms 89.0% + triton_bmm_21966 0.0096 ms 86.7% + triton_bmm_21965 0.0099 ms 84.0% + triton_bmm_21964 0.0121 ms 68.8% +SingleProcess AUTOTUNE takes 3.9831 seconds +AUTOTUNE bmm(16x1x96, 16x96x233) + triton_bmm_22042 0.0073 ms 100.0% + triton_bmm_22037 0.0077 ms 95.0% + triton_bmm_22041 0.0078 ms 93.9% + triton_bmm_22040 0.0078 ms 93.5% + triton_bmm_22038 0.0081 ms 90.9% + triton_bmm_22044 0.0081 ms 90.5% + triton_bmm_22039 0.0083 ms 88.1% + triton_bmm_22036 0.0090 ms 81.8% + triton_bmm_22046 0.0093 ms 78.7% + triton_bmm_22043 0.0094 ms 78.3% +SingleProcess AUTOTUNE takes 4.2113 seconds +AUTOTUNE bmm(16x1x233, 16x233x96) + triton_bmm_22069 0.0096 ms 100.0% + triton_bmm_22068 0.0106 ms 90.1% + bmm 0.0108 ms 88.2% + triton_bmm_22066 0.0118 ms 81.0% + triton_bmm_22065 0.0118 ms 80.8% + triton_bmm_22062 0.0123 ms 78.1% + triton_bmm_22064 0.0124 ms 77.3% + triton_bmm_22061 0.0126 ms 76.1% + triton_bmm_22063 0.0134 ms 71.2% + triton_bmm_22060 0.0151 ms 63.2% +SingleProcess AUTOTUNE takes 4.6859 seconds +AUTOTUNE bmm(16x1x96, 16x96x234) + triton_bmm_22136 0.0073 ms 100.0% + triton_bmm_22135 0.0076 ms 96.6% + triton_bmm_22137 0.0078 ms 93.9% + triton_bmm_22138 0.0078 ms 93.5% + triton_bmm_22134 0.0080 ms 91.1% + triton_bmm_22140 0.0081 ms 90.5% + triton_bmm_22133 0.0083 ms 88.8% + triton_bmm_22132 0.0083 ms 88.1% + triton_bmm_22139 0.0088 ms 83.0% + triton_bmm_22143 0.0091 ms 80.6% +SingleProcess AUTOTUNE takes 4.3995 seconds +AUTOTUNE bmm(16x1x234, 16x234x96) + triton_bmm_22164 0.0081 ms 100.0% + triton_bmm_22161 0.0086 ms 94.0% + triton_bmm_22162 0.0086 ms 93.5% + triton_bmm_22165 0.0088 ms 91.6% + triton_bmm_22160 0.0091 ms 88.7% + triton_bmm_22158 0.0092 ms 87.2% + triton_bmm_22157 0.0093 ms 86.3% + triton_bmm_22159 0.0093 ms 86.3% + bmm 0.0094 ms 85.6% + triton_bmm_22156 0.0118 ms 68.1% +SingleProcess AUTOTUNE takes 4.1167 seconds +AUTOTUNE bmm(16x1x96, 16x96x235) + triton_bmm_22232 0.0073 ms 100.0% + triton_bmm_22234 0.0073 ms 100.0% + triton_bmm_22231 0.0076 ms 96.6% + triton_bmm_22230 0.0081 ms 90.7% + triton_bmm_22236 0.0081 ms 90.5% + triton_bmm_22229 0.0082 ms 88.9% + triton_bmm_22233 0.0083 ms 88.4% + triton_bmm_22235 0.0088 ms 83.0% + triton_bmm_22228 0.0089 ms 81.9% + triton_bmm_22238 0.0093 ms 78.7% +SingleProcess AUTOTUNE takes 4.0055 seconds +AUTOTUNE bmm(16x1x235, 16x235x96) + triton_bmm_22261 0.0096 ms 100.0% + triton_bmm_22260 0.0107 ms 89.8% + bmm 0.0113 ms 84.5% + triton_bmm_22258 0.0120 ms 79.5% + triton_bmm_22256 0.0121 ms 79.1% + triton_bmm_22257 0.0124 ms 77.5% + triton_bmm_22253 0.0126 ms 76.1% + triton_bmm_22254 0.0128 ms 74.8% + triton_bmm_22255 0.0128 ms 74.6% + triton_bmm_22252 0.0154 ms 62.3% +SingleProcess AUTOTUNE takes 3.7388 seconds +AUTOTUNE bmm(16x1x96, 16x96x236) + triton_bmm_22328 0.0073 ms 100.0% + triton_bmm_22330 0.0073 ms 100.0% + triton_bmm_22332 0.0076 ms 96.6% + triton_bmm_22325 0.0076 ms 96.2% + triton_bmm_22327 0.0076 ms 96.2% + triton_bmm_22326 0.0080 ms 90.7% + triton_bmm_22329 0.0082 ms 88.9% + triton_bmm_22333 0.0088 ms 82.6% + triton_bmm_22324 0.0089 ms 82.3% + triton_bmm_22331 0.0093 ms 78.1% +SingleProcess AUTOTUNE takes 3.9085 seconds +AUTOTUNE bmm(16x1x236, 16x236x96) + triton_bmm_22353 0.0078 ms 100.0% + triton_bmm_22357 0.0081 ms 97.2% + triton_bmm_22356 0.0081 ms 96.8% + bmm 0.0084 ms 93.9% + triton_bmm_22354 0.0086 ms 91.4% + triton_bmm_22351 0.0088 ms 88.8% + triton_bmm_22352 0.0090 ms 87.2% + triton_bmm_22350 0.0091 ms 86.3% + triton_bmm_22349 0.0101 ms 77.8% + triton_bmm_22348 0.0117 ms 66.8% +SingleProcess AUTOTUNE takes 3.8862 seconds +AUTOTUNE bmm(16x1x96, 16x96x237) + triton_bmm_22426 0.0073 ms 100.0% + triton_bmm_22428 0.0076 ms 96.6% + triton_bmm_22421 0.0078 ms 93.9% + triton_bmm_22424 0.0078 ms 93.5% + triton_bmm_22422 0.0081 ms 90.5% + triton_bmm_22423 0.0084 ms 87.7% + triton_bmm_22425 0.0084 ms 87.7% + triton_bmm_22427 0.0088 ms 83.0% + triton_bmm_22420 0.0091 ms 80.6% + triton_bmm_22429 0.0091 ms 80.6% +SingleProcess AUTOTUNE takes 4.0295 seconds +AUTOTUNE bmm(16x1x237, 16x237x96) + triton_bmm_22453 0.0096 ms 100.0% + triton_bmm_22452 0.0108 ms 88.5% + triton_bmm_22448 0.0121 ms 79.4% + triton_bmm_22449 0.0124 ms 77.7% + triton_bmm_22450 0.0125 ms 76.7% + triton_bmm_22445 0.0128 ms 74.8% + triton_bmm_22446 0.0129 ms 74.6% + triton_bmm_22447 0.0136 ms 70.6% + triton_bmm_22444 0.0154 ms 62.5% + triton_bmm_22455 0.0156 ms 61.3% +SingleProcess AUTOTUNE takes 3.7974 seconds +AUTOTUNE bmm(16x1x96, 16x96x238) + triton_bmm_22518 0.0073 ms 100.0% + triton_bmm_22520 0.0073 ms 100.0% + triton_bmm_22522 0.0073 ms 100.0% + triton_bmm_22521 0.0078 ms 93.9% + triton_bmm_22524 0.0081 ms 90.5% + triton_bmm_22517 0.0082 ms 89.1% + triton_bmm_22519 0.0082 ms 88.9% + triton_bmm_22516 0.0083 ms 88.1% + triton_bmm_22523 0.0088 ms 83.0% + triton_bmm_22527 0.0091 ms 80.6% +SingleProcess AUTOTUNE takes 3.8530 seconds +AUTOTUNE bmm(16x1x238, 16x238x96) + triton_bmm_22546 0.0081 ms 100.0% + triton_bmm_22548 0.0081 ms 100.0% + triton_bmm_22545 0.0084 ms 95.8% + triton_bmm_22544 0.0086 ms 94.4% + triton_bmm_22543 0.0088 ms 91.7% + triton_bmm_22549 0.0088 ms 91.7% + triton_bmm_22542 0.0093 ms 86.9% + triton_bmm_22541 0.0101 ms 80.3% + triton_bmm_22540 0.0118 ms 68.4% + triton_bmm_22547 0.0126 ms 64.4% +SingleProcess AUTOTUNE takes 4.0389 seconds +AUTOTUNE bmm(16x1x96, 16x96x239) + triton_bmm_22615 0.0076 ms 100.0% + triton_bmm_22620 0.0076 ms 100.0% + triton_bmm_22617 0.0078 ms 97.1% + triton_bmm_22616 0.0078 ms 96.9% + triton_bmm_22618 0.0078 ms 96.7% + triton_bmm_22614 0.0081 ms 94.0% + triton_bmm_22613 0.0084 ms 90.8% + triton_bmm_22619 0.0088 ms 85.9% + triton_bmm_22612 0.0089 ms 85.3% + triton_bmm_22621 0.0091 ms 83.5% +SingleProcess AUTOTUNE takes 3.6767 seconds +AUTOTUNE bmm(16x1x239, 16x239x96) + triton_bmm_22645 0.0096 ms 100.0% + triton_bmm_22644 0.0101 ms 94.6% + bmm 0.0113 ms 84.5% + triton_bmm_22642 0.0118 ms 80.8% + triton_bmm_22641 0.0119 ms 80.7% + triton_bmm_22640 0.0121 ms 79.1% + triton_bmm_22638 0.0128 ms 74.6% + triton_bmm_22637 0.0131 ms 72.9% + triton_bmm_22639 0.0134 ms 71.5% + triton_bmm_22636 0.0148 ms 64.4% +SingleProcess AUTOTUNE takes 3.6398 seconds +AUTOTUNE bmm(16x1x96, 16x96x240) + triton_bmm_22714 0.0073 ms 100.0% + triton_bmm_22709 0.0076 ms 96.2% + triton_bmm_22711 0.0076 ms 96.2% + triton_bmm_22713 0.0078 ms 93.4% + triton_bmm_22712 0.0078 ms 93.1% + triton_bmm_22710 0.0081 ms 90.5% + triton_bmm_22716 0.0081 ms 90.1% + triton_bmm_22715 0.0088 ms 82.6% + triton_bmm_22717 0.0088 ms 82.6% + triton_bmm_22708 0.0089 ms 82.3% +SingleProcess AUTOTUNE takes 3.8065 seconds +AUTOTUNE bmm(16x1x240, 16x240x96) + triton_bmm_22738 0.0081 ms 100.0% + triton_bmm_22736 0.0084 ms 96.6% + triton_bmm_22741 0.0085 ms 94.7% + triton_bmm_22737 0.0086 ms 93.3% + triton_bmm_22740 0.0087 ms 92.3% + triton_bmm_22735 0.0088 ms 91.3% + triton_bmm_22734 0.0091 ms 88.7% + triton_bmm_22733 0.0093 ms 86.3% + bmm 0.0095 ms 84.6% + triton_bmm_22732 0.0116 ms 69.4% +SingleProcess AUTOTUNE takes 4.0736 seconds +AUTOTUNE bmm(16x1x96, 16x96x241) + triton_bmm_22806 0.0073 ms 100.0% + triton_bmm_22808 0.0073 ms 100.0% + triton_bmm_22810 0.0073 ms 100.0% + triton_bmm_22807 0.0076 ms 95.8% + triton_bmm_22805 0.0078 ms 93.9% + triton_bmm_22809 0.0078 ms 93.9% + triton_bmm_22812 0.0081 ms 90.9% + triton_bmm_22811 0.0088 ms 83.0% + triton_bmm_22804 0.0090 ms 81.5% + triton_bmm_22815 0.0091 ms 80.6% +SingleProcess AUTOTUNE takes 3.8046 seconds +AUTOTUNE bmm(16x1x241, 16x241x96) + triton_bmm_22837 0.0093 ms 100.0% + triton_bmm_22836 0.0108 ms 86.1% + bmm 0.0116 ms 80.7% + triton_bmm_22833 0.0118 ms 78.9% + triton_bmm_22834 0.0121 ms 77.2% + triton_bmm_22832 0.0126 ms 73.9% + triton_bmm_22829 0.0128 ms 72.8% + triton_bmm_22830 0.0129 ms 72.6% + triton_bmm_22831 0.0131 ms 71.2% + triton_bmm_22828 0.0151 ms 62.0% +SingleProcess AUTOTUNE takes 3.8613 seconds +AUTOTUNE bmm(16x1x96, 16x96x242) + triton_bmm_22902 0.0073 ms 100.0% + triton_bmm_22904 0.0073 ms 100.0% + triton_bmm_22906 0.0073 ms 100.0% + triton_bmm_22908 0.0076 ms 97.0% + triton_bmm_22901 0.0076 ms 96.6% + triton_bmm_22903 0.0076 ms 96.6% + triton_bmm_22905 0.0082 ms 89.1% + triton_bmm_22900 0.0089 ms 82.7% + triton_bmm_22907 0.0093 ms 78.4% + triton_bmm_22911 0.0095 ms 76.8% +SingleProcess AUTOTUNE takes 3.7015 seconds +AUTOTUNE bmm(16x1x242, 16x242x96) + triton_bmm_22930 0.0081 ms 100.0% + triton_bmm_22932 0.0081 ms 99.8% + triton_bmm_22933 0.0081 ms 99.2% + triton_bmm_22929 0.0085 ms 94.4% + triton_bmm_22928 0.0086 ms 94.0% + triton_bmm_22927 0.0088 ms 91.3% + bmm 0.0089 ms 91.0% + triton_bmm_22926 0.0093 ms 86.6% + triton_bmm_22925 0.0101 ms 80.0% + triton_bmm_22924 0.0118 ms 68.1% +SingleProcess AUTOTUNE takes 3.8509 seconds +AUTOTUNE bmm(16x1x96, 16x96x243) + triton_bmm_23000 0.0073 ms 100.0% + triton_bmm_23004 0.0076 ms 96.6% + triton_bmm_22997 0.0078 ms 93.9% + triton_bmm_23001 0.0078 ms 93.9% + triton_bmm_23002 0.0078 ms 93.5% + triton_bmm_22998 0.0081 ms 90.9% + triton_bmm_22996 0.0083 ms 88.1% + triton_bmm_22999 0.0083 ms 88.1% + triton_bmm_23003 0.0094 ms 78.0% + triton_bmm_23005 0.0096 ms 76.3% +SingleProcess AUTOTUNE takes 4.0832 seconds +AUTOTUNE bmm(16x1x243, 16x243x96) + triton_bmm_23029 0.0099 ms 100.0% + triton_bmm_23028 0.0108 ms 92.0% + bmm 0.0120 ms 82.2% + triton_bmm_23026 0.0121 ms 82.0% + triton_bmm_23024 0.0121 ms 81.7% + triton_bmm_23025 0.0124 ms 80.1% + triton_bmm_23022 0.0129 ms 76.9% + triton_bmm_23021 0.0133 ms 74.1% + triton_bmm_23023 0.0136 ms 72.5% + triton_bmm_23020 0.0150 ms 65.9% +SingleProcess AUTOTUNE takes 3.7646 seconds +AUTOTUNE bmm(16x1x96, 16x96x244) + triton_bmm_23096 0.0073 ms 100.0% + triton_bmm_23095 0.0076 ms 96.6% + triton_bmm_23097 0.0078 ms 93.9% + triton_bmm_23098 0.0078 ms 93.5% + triton_bmm_23100 0.0080 ms 91.4% + triton_bmm_23094 0.0080 ms 91.2% + triton_bmm_23092 0.0083 ms 88.1% + triton_bmm_23093 0.0083 ms 88.1% + triton_bmm_23099 0.0093 ms 78.7% + triton_bmm_23102 0.0093 ms 78.7% +SingleProcess AUTOTUNE takes 5.3749 seconds +AUTOTUNE bmm(16x1x244, 16x244x96) + triton_bmm_23121 0.0078 ms 100.0% + bmm 0.0084 ms 93.9% + triton_bmm_23122 0.0086 ms 91.1% + triton_bmm_23125 0.0087 ms 89.7% + triton_bmm_23124 0.0088 ms 89.4% + triton_bmm_23119 0.0088 ms 88.8% + triton_bmm_23120 0.0090 ms 86.7% + triton_bmm_23118 0.0098 ms 79.9% + triton_bmm_23117 0.0100 ms 78.5% + triton_bmm_23116 0.0124 ms 63.5% +SingleProcess AUTOTUNE takes 4.5362 seconds +AUTOTUNE bmm(16x1x96, 16x96x245) + triton_bmm_23194 0.0073 ms 100.0% + triton_bmm_23192 0.0078 ms 93.9% + triton_bmm_23193 0.0078 ms 93.9% + triton_bmm_23190 0.0081 ms 90.9% + triton_bmm_23196 0.0081 ms 90.5% + triton_bmm_23188 0.0083 ms 88.1% + triton_bmm_23191 0.0083 ms 88.1% + triton_bmm_23189 0.0084 ms 87.7% + triton_bmm_23195 0.0088 ms 83.0% + triton_bmm_23198 0.0093 ms 78.7% +SingleProcess AUTOTUNE takes 3.7784 seconds +AUTOTUNE bmm(16x1x245, 16x245x96) + triton_bmm_23221 0.0100 ms 100.0% + triton_bmm_23220 0.0103 ms 96.9% + triton_bmm_23217 0.0118 ms 84.6% + triton_bmm_23218 0.0121 ms 82.8% + bmm 0.0122 ms 82.2% + triton_bmm_23216 0.0126 ms 79.4% + triton_bmm_23213 0.0129 ms 77.9% + triton_bmm_23214 0.0129 ms 77.9% + triton_bmm_23215 0.0133 ms 75.1% + triton_bmm_23212 0.0156 ms 64.3% +SingleProcess AUTOTUNE takes 4.0647 seconds +AUTOTUNE bmm(16x1x96, 16x96x246) + triton_bmm_23290 0.0073 ms 100.0% + triton_bmm_23292 0.0076 ms 97.0% + triton_bmm_23285 0.0076 ms 96.6% + triton_bmm_23287 0.0076 ms 96.6% + triton_bmm_23289 0.0078 ms 93.9% + triton_bmm_23288 0.0078 ms 93.5% + triton_bmm_23286 0.0080 ms 91.1% + triton_bmm_23284 0.0091 ms 80.9% + triton_bmm_23295 0.0091 ms 80.9% + triton_bmm_23294 0.0093 ms 78.7% +SingleProcess AUTOTUNE takes 3.8868 seconds +AUTOTUNE bmm(16x1x246, 16x246x96) + triton_bmm_23314 0.0081 ms 100.0% + triton_bmm_23316 0.0081 ms 100.0% + triton_bmm_23313 0.0085 ms 95.1% + triton_bmm_23312 0.0086 ms 94.0% + triton_bmm_23311 0.0088 ms 91.3% + triton_bmm_23317 0.0088 ms 91.3% + bmm 0.0092 ms 87.8% + triton_bmm_23310 0.0093 ms 86.6% + triton_bmm_23309 0.0094 ms 86.0% + triton_bmm_23308 0.0124 ms 65.1% +SingleProcess AUTOTUNE takes 4.0111 seconds +AUTOTUNE bmm(16x1x96, 16x96x247) + triton_bmm_23386 0.0073 ms 100.0% + triton_bmm_23382 0.0076 ms 97.0% + triton_bmm_23388 0.0076 ms 96.6% + triton_bmm_23383 0.0078 ms 94.2% + triton_bmm_23384 0.0078 ms 94.0% + triton_bmm_23381 0.0078 ms 93.9% + triton_bmm_23385 0.0078 ms 93.9% + triton_bmm_23387 0.0088 ms 83.0% + triton_bmm_23380 0.0091 ms 80.9% + triton_bmm_23390 0.0093 ms 78.7% +SingleProcess AUTOTUNE takes 4.1476 seconds +AUTOTUNE bmm(16x1x247, 16x247x96) + triton_bmm_23413 0.0096 ms 100.0% + triton_bmm_23412 0.0108 ms 88.2% + triton_bmm_23409 0.0119 ms 80.6% + bmm 0.0119 ms 80.4% + triton_bmm_23410 0.0121 ms 79.1% + triton_bmm_23406 0.0124 ms 77.5% + triton_bmm_23408 0.0127 ms 75.6% + triton_bmm_23405 0.0128 ms 74.6% + triton_bmm_23407 0.0136 ms 70.2% + triton_bmm_23404 0.0151 ms 63.3% +SingleProcess AUTOTUNE takes 3.8267 seconds +AUTOTUNE bmm(16x1x96, 16x96x248) + triton_bmm_23480 0.0073 ms 100.0% + triton_bmm_23478 0.0076 ms 97.0% + triton_bmm_23481 0.0078 ms 93.9% + triton_bmm_23482 0.0078 ms 93.9% + triton_bmm_23477 0.0081 ms 90.5% + triton_bmm_23484 0.0081 ms 90.5% + triton_bmm_23479 0.0083 ms 88.2% + triton_bmm_23476 0.0083 ms 88.1% + triton_bmm_23483 0.0088 ms 83.0% + triton_bmm_23485 0.0093 ms 78.4% +SingleProcess AUTOTUNE takes 3.7830 seconds +AUTOTUNE bmm(16x1x248, 16x248x96) + triton_bmm_23506 0.0081 ms 100.0% + triton_bmm_23509 0.0084 ms 95.5% + triton_bmm_23505 0.0087 ms 93.2% + triton_bmm_23508 0.0088 ms 91.3% + triton_bmm_23502 0.0091 ms 88.7% + triton_bmm_23504 0.0091 ms 88.7% + bmm 0.0093 ms 86.3% + triton_bmm_23503 0.0094 ms 86.0% + triton_bmm_23501 0.0101 ms 80.0% + triton_bmm_23500 0.0121 ms 66.5% +SingleProcess AUTOTUNE takes 4.0902 seconds +AUTOTUNE bmm(16x1x96, 16x96x249) + triton_bmm_23576 0.0073 ms 100.0% + triton_bmm_23580 0.0076 ms 96.6% + triton_bmm_23575 0.0077 ms 95.0% + triton_bmm_23573 0.0078 ms 93.9% + triton_bmm_23578 0.0078 ms 93.5% + triton_bmm_23574 0.0081 ms 90.5% + triton_bmm_23577 0.0083 ms 88.1% + triton_bmm_23572 0.0084 ms 87.7% + triton_bmm_23579 0.0088 ms 83.0% + triton_bmm_23581 0.0091 ms 80.6% +SingleProcess AUTOTUNE takes 3.7547 seconds +AUTOTUNE bmm(16x1x249, 16x249x96) + triton_bmm_23605 0.0101 ms 100.0% + triton_bmm_23604 0.0103 ms 97.5% + bmm 0.0118 ms 85.1% + triton_bmm_23601 0.0119 ms 84.7% + triton_bmm_23600 0.0126 ms 79.9% + triton_bmm_23602 0.0126 ms 79.9% + triton_bmm_23598 0.0128 ms 78.5% + triton_bmm_23597 0.0130 ms 77.6% + triton_bmm_23599 0.0133 ms 75.5% + triton_bmm_23596 0.0151 ms 66.9% +SingleProcess AUTOTUNE takes 3.7115 seconds +AUTOTUNE bmm(16x1x96, 16x96x250) + triton_bmm_23672 0.0073 ms 100.0% + triton_bmm_23670 0.0074 ms 99.1% + triton_bmm_23671 0.0076 ms 96.6% + triton_bmm_23674 0.0078 ms 93.5% + triton_bmm_23676 0.0081 ms 90.5% + triton_bmm_23669 0.0082 ms 88.9% + triton_bmm_23668 0.0083 ms 88.1% + triton_bmm_23673 0.0083 ms 88.1% + triton_bmm_23675 0.0093 ms 78.4% + triton_bmm_23679 0.0096 ms 76.5% +SingleProcess AUTOTUNE takes 4.2908 seconds +AUTOTUNE bmm(16x1x250, 16x250x96) + triton_bmm_23697 0.0081 ms 100.0% + triton_bmm_23698 0.0088 ms 91.6% + triton_bmm_23700 0.0088 ms 91.6% + triton_bmm_23701 0.0088 ms 91.5% + triton_bmm_23696 0.0091 ms 88.7% + bmm 0.0094 ms 86.0% + triton_bmm_23695 0.0094 ms 85.7% + triton_bmm_23693 0.0096 ms 84.3% + triton_bmm_23694 0.0098 ms 82.1% + triton_bmm_23692 0.0118 ms 68.1% +SingleProcess AUTOTUNE takes 3.9647 seconds +AUTOTUNE bmm(16x1x96, 16x96x251) + triton_bmm_23768 0.0073 ms 100.0% + triton_bmm_23770 0.0073 ms 100.0% + triton_bmm_23766 0.0076 ms 97.0% + triton_bmm_23772 0.0081 ms 90.5% + triton_bmm_23769 0.0083 ms 88.2% + triton_bmm_23765 0.0084 ms 87.7% + triton_bmm_23767 0.0084 ms 87.7% + triton_bmm_23771 0.0088 ms 83.0% + triton_bmm_23764 0.0090 ms 81.5% + triton_bmm_23773 0.0091 ms 80.6% +SingleProcess AUTOTUNE takes 4.2841 seconds +AUTOTUNE bmm(16x1x251, 16x251x96) + triton_bmm_23797 0.0101 ms 100.0% + triton_bmm_23796 0.0103 ms 97.5% + triton_bmm_23793 0.0121 ms 83.6% + triton_bmm_23794 0.0121 ms 83.3% + bmm 0.0124 ms 81.6% + triton_bmm_23790 0.0124 ms 81.6% + triton_bmm_23792 0.0127 ms 79.1% + triton_bmm_23789 0.0130 ms 77.8% + triton_bmm_23791 0.0133 ms 75.5% + triton_bmm_23788 0.0156 ms 64.5% +SingleProcess AUTOTUNE takes 4.0227 seconds +AUTOTUNE bmm(16x1x96, 16x96x252) + triton_bmm_23862 0.0073 ms 100.0% + triton_bmm_23866 0.0073 ms 100.0% + triton_bmm_23863 0.0076 ms 96.6% + triton_bmm_23865 0.0078 ms 93.9% + triton_bmm_23864 0.0078 ms 93.5% + triton_bmm_23868 0.0080 ms 91.2% + triton_bmm_23861 0.0083 ms 88.1% + triton_bmm_23867 0.0088 ms 83.0% + triton_bmm_23860 0.0089 ms 82.1% + triton_bmm_23870 0.0093 ms 78.7% +SingleProcess AUTOTUNE takes 3.8624 seconds +AUTOTUNE bmm(16x1x252, 16x252x96) + triton_bmm_23890 0.0081 ms 100.0% + bmm 0.0084 ms 95.5% + triton_bmm_23888 0.0086 ms 94.0% + triton_bmm_23889 0.0086 ms 94.0% + triton_bmm_23892 0.0088 ms 92.1% + triton_bmm_23893 0.0088 ms 91.3% + triton_bmm_23885 0.0096 ms 84.3% + triton_bmm_23887 0.0096 ms 84.0% + triton_bmm_23886 0.0099 ms 81.8% + triton_bmm_23884 0.0124 ms 65.3% +SingleProcess AUTOTUNE takes 3.7931 seconds +AUTOTUNE bmm(16x1x96, 16x96x253) + triton_bmm_23960 0.0073 ms 100.0% + triton_bmm_23958 0.0076 ms 97.0% + triton_bmm_23964 0.0076 ms 97.0% + triton_bmm_23962 0.0080 ms 92.0% + triton_bmm_23957 0.0083 ms 88.1% + triton_bmm_23959 0.0083 ms 88.1% + triton_bmm_23956 0.0084 ms 87.7% + triton_bmm_23961 0.0084 ms 87.7% + triton_bmm_23963 0.0088 ms 83.0% + triton_bmm_23965 0.0096 ms 76.1% +SingleProcess AUTOTUNE takes 4.1166 seconds +AUTOTUNE bmm(16x1x253, 16x253x96) + triton_bmm_23989 0.0096 ms 100.0% + triton_bmm_23988 0.0108 ms 88.2% + triton_bmm_23985 0.0121 ms 79.3% + triton_bmm_23986 0.0121 ms 79.1% + triton_bmm_23984 0.0123 ms 77.7% + bmm 0.0126 ms 75.9% + triton_bmm_23982 0.0129 ms 74.4% + triton_bmm_23981 0.0134 ms 71.5% + triton_bmm_23983 0.0138 ms 69.4% + triton_bmm_23980 0.0156 ms 61.3% +SingleProcess AUTOTUNE takes 3.7565 seconds +AUTOTUNE bmm(16x1x96, 16x96x254) + triton_bmm_24056 0.0073 ms 100.0% + triton_bmm_24058 0.0073 ms 100.0% + triton_bmm_24054 0.0075 ms 97.9% + triton_bmm_24053 0.0078 ms 93.9% + triton_bmm_24060 0.0080 ms 91.1% + triton_bmm_24055 0.0082 ms 89.8% + triton_bmm_24057 0.0082 ms 89.5% + triton_bmm_24052 0.0084 ms 87.7% + triton_bmm_24063 0.0091 ms 80.6% + triton_bmm_24062 0.0093 ms 78.7% +SingleProcess AUTOTUNE takes 3.7148 seconds +AUTOTUNE bmm(16x1x254, 16x254x96) + triton_bmm_24081 0.0081 ms 100.0% + triton_bmm_24082 0.0081 ms 100.0% + triton_bmm_24085 0.0083 ms 96.9% + triton_bmm_24079 0.0088 ms 91.3% + triton_bmm_24084 0.0088 ms 91.3% + triton_bmm_24080 0.0091 ms 89.0% + triton_bmm_24078 0.0093 ms 86.6% + bmm 0.0093 ms 86.3% + triton_bmm_24077 0.0101 ms 79.7% + triton_bmm_24076 0.0125 ms 64.3% +SingleProcess AUTOTUNE takes 3.9518 seconds +AUTOTUNE bmm(16x1x96, 16x96x255) + triton_bmm_24152 0.0073 ms 100.0% + triton_bmm_24150 0.0076 ms 97.0% + triton_bmm_24156 0.0076 ms 96.6% + triton_bmm_24151 0.0078 ms 93.9% + triton_bmm_24154 0.0078 ms 93.5% + triton_bmm_24153 0.0083 ms 88.1% + triton_bmm_24149 0.0084 ms 87.7% + triton_bmm_24148 0.0085 ms 86.4% + triton_bmm_24159 0.0091 ms 80.6% + triton_bmm_24158 0.0093 ms 78.7% +SingleProcess AUTOTUNE takes 3.8394 seconds +AUTOTUNE bmm(16x1x255, 16x255x96) + triton_bmm_24181 0.0096 ms 100.0% + triton_bmm_24180 0.0108 ms 88.2% + triton_bmm_24177 0.0111 ms 86.2% + triton_bmm_24176 0.0116 ms 82.6% + triton_bmm_24174 0.0118 ms 80.8% + triton_bmm_24178 0.0119 ms 80.2% + bmm 0.0124 ms 77.5% + triton_bmm_24175 0.0131 ms 72.9% + triton_bmm_24173 0.0136 ms 70.4% + triton_bmm_24172 0.0156 ms 61.3% +SingleProcess AUTOTUNE takes 3.7502 seconds +AUTOTUNE bmm(16x1x96, 16x96x256) + triton_bmm_24246 0.0076 ms 100.0% + triton_bmm_24247 0.0076 ms 99.6% + triton_bmm_24249 0.0078 ms 96.7% + triton_bmm_24250 0.0078 ms 96.3% + triton_bmm_24248 0.0079 ms 95.9% + triton_bmm_24252 0.0081 ms 93.3% + triton_bmm_24245 0.0081 ms 92.9% + triton_bmm_24244 0.0083 ms 90.8% + triton_bmm_24255 0.0091 ms 83.1% + triton_bmm_24253 0.0091 ms 82.8% +SingleProcess AUTOTUNE takes 3.9395 seconds +AUTOTUNE bmm(16x1x256, 16x256x96) + triton_bmm_24274 0.0081 ms 100.0% + triton_bmm_24277 0.0083 ms 97.7% + triton_bmm_24272 0.0086 ms 94.0% + triton_bmm_24273 0.0086 ms 94.0% + triton_bmm_24276 0.0087 ms 92.8% + triton_bmm_24271 0.0089 ms 91.0% + triton_bmm_24270 0.0093 ms 86.6% + bmm 0.0093 ms 86.3% + triton_bmm_24269 0.0096 ms 84.3% + triton_bmm_24268 0.0123 ms 65.4% +SingleProcess AUTOTUNE takes 3.7277 seconds +AUTOTUNE bmm(16x1x96, 16x96x257) + triton_bmm_24346 0.0074 ms 100.0% + triton_bmm_24343 0.0077 ms 95.0% + triton_bmm_24341 0.0078 ms 94.3% + triton_bmm_24345 0.0078 ms 93.9% + triton_bmm_24344 0.0079 ms 93.3% + triton_bmm_24342 0.0081 ms 90.9% + triton_bmm_24348 0.0081 ms 90.4% + triton_bmm_24347 0.0088 ms 83.3% + triton_bmm_24340 0.0090 ms 81.6% + triton_bmm_24349 0.0093 ms 79.0% +SingleProcess AUTOTUNE takes 3.9481 seconds +AUTOTUNE bmm(16x1x257, 16x257x96) + triton_bmm_24373 0.0104 ms 100.0% + bmm 0.0111 ms 93.4% + triton_bmm_24372 0.0113 ms 91.8% + triton_bmm_24369 0.0124 ms 83.9% + triton_bmm_24370 0.0125 ms 83.1% + triton_bmm_24368 0.0126 ms 82.4% + triton_bmm_24366 0.0129 ms 80.6% + triton_bmm_24365 0.0133 ms 77.7% + triton_bmm_24367 0.0137 ms 75.5% + triton_bmm_24364 0.0161 ms 64.3% +SingleProcess AUTOTUNE takes 4.1173 seconds +AUTOTUNE bmm(16x1x96, 16x96x258) + triton_bmm_24439 0.0076 ms 100.0% + triton_bmm_24444 0.0076 ms 100.0% + triton_bmm_24437 0.0077 ms 98.3% + triton_bmm_24441 0.0078 ms 96.9% + triton_bmm_24440 0.0079 ms 96.5% + triton_bmm_24442 0.0079 ms 96.3% + triton_bmm_24438 0.0081 ms 93.7% + triton_bmm_24443 0.0088 ms 85.9% + triton_bmm_24436 0.0091 ms 83.7% + triton_bmm_24447 0.0096 ms 79.0% +SingleProcess AUTOTUNE takes 4.0132 seconds +AUTOTUNE bmm(16x1x258, 16x258x96) + triton_bmm_24468 0.0086 ms 100.0% + triton_bmm_24465 0.0087 ms 98.9% + triton_bmm_24466 0.0089 ms 96.8% + triton_bmm_24464 0.0093 ms 92.6% + triton_bmm_24469 0.0093 ms 91.8% + bmm 0.0094 ms 91.5% + triton_bmm_24462 0.0096 ms 89.3% + triton_bmm_24463 0.0096 ms 89.0% + triton_bmm_24461 0.0104 ms 82.7% + triton_bmm_24460 0.0126 ms 68.2% +SingleProcess AUTOTUNE takes 4.2596 seconds +AUTOTUNE bmm(16x1x96, 16x96x259) + triton_bmm_24534 0.0076 ms 100.0% + triton_bmm_24533 0.0078 ms 97.1% + triton_bmm_24537 0.0078 ms 96.7% + triton_bmm_24536 0.0079 ms 96.0% + triton_bmm_24538 0.0079 ms 96.0% + triton_bmm_24540 0.0082 ms 92.9% + triton_bmm_24535 0.0084 ms 90.8% + triton_bmm_24539 0.0088 ms 85.9% + triton_bmm_24532 0.0090 ms 84.0% + triton_bmm_24541 0.0093 ms 81.4% +SingleProcess AUTOTUNE takes 4.0705 seconds +AUTOTUNE bmm(16x1x259, 16x259x96) + triton_bmm_24565 0.0104 ms 100.0% + bmm 0.0108 ms 96.2% + triton_bmm_24564 0.0108 ms 96.2% + triton_bmm_24562 0.0125 ms 83.2% + triton_bmm_24561 0.0129 ms 80.9% + triton_bmm_24558 0.0130 ms 80.3% + triton_bmm_24560 0.0131 ms 79.5% + triton_bmm_24557 0.0139 ms 75.3% + triton_bmm_24559 0.0141 ms 73.8% + triton_bmm_24556 0.0157 ms 66.5% +SingleProcess AUTOTUNE takes 3.8456 seconds +AUTOTUNE bmm(16x1x96, 16x96x260) + triton_bmm_24632 0.0074 ms 100.0% + triton_bmm_24629 0.0076 ms 96.6% + triton_bmm_24634 0.0079 ms 93.5% + triton_bmm_24630 0.0081 ms 91.3% + triton_bmm_24636 0.0081 ms 90.6% + triton_bmm_24631 0.0084 ms 88.1% + triton_bmm_24633 0.0084 ms 88.1% + triton_bmm_24635 0.0088 ms 83.3% + triton_bmm_24637 0.0091 ms 81.3% + triton_bmm_24628 0.0091 ms 81.0% +SingleProcess AUTOTUNE takes 3.8518 seconds +AUTOTUNE bmm(16x1x260, 16x260x96) + triton_bmm_24658 0.0083 ms 100.0% + triton_bmm_24657 0.0086 ms 96.7% + bmm 0.0091 ms 91.5% + triton_bmm_24660 0.0091 ms 91.2% + triton_bmm_24656 0.0091 ms 91.1% + triton_bmm_24661 0.0096 ms 87.0% + triton_bmm_24655 0.0096 ms 86.7% + triton_bmm_24653 0.0098 ms 84.7% + triton_bmm_24654 0.0101 ms 82.5% + triton_bmm_24652 0.0129 ms 64.7% +SingleProcess AUTOTUNE takes 3.8411 seconds +AUTOTUNE bmm(16x1x96, 16x96x261) + triton_bmm_24728 0.0074 ms 100.0% + triton_bmm_24730 0.0074 ms 100.0% + triton_bmm_24726 0.0076 ms 97.0% + triton_bmm_24732 0.0077 ms 95.8% + triton_bmm_24725 0.0078 ms 94.3% + triton_bmm_24727 0.0078 ms 94.3% + triton_bmm_24729 0.0084 ms 88.1% + triton_bmm_24724 0.0085 ms 86.8% + triton_bmm_24731 0.0095 ms 77.8% + triton_bmm_24735 0.0096 ms 76.4% +SingleProcess AUTOTUNE takes 3.7042 seconds +AUTOTUNE bmm(16x1x261, 16x261x96) + triton_bmm_24757 0.0099 ms 100.0% + bmm 0.0112 ms 88.0% + triton_bmm_24756 0.0114 ms 86.8% + triton_bmm_24753 0.0124 ms 79.6% + triton_bmm_24754 0.0125 ms 78.6% + triton_bmm_24752 0.0126 ms 78.2% + triton_bmm_24749 0.0133 ms 73.9% + triton_bmm_24750 0.0134 ms 73.7% + triton_bmm_24751 0.0138 ms 71.6% + triton_bmm_24748 0.0161 ms 61.1% +SingleProcess AUTOTUNE takes 3.8582 seconds +AUTOTUNE bmm(16x1x96, 16x96x262) + triton_bmm_24824 0.0073 ms 100.0% + triton_bmm_24822 0.0076 ms 97.0% + triton_bmm_24825 0.0078 ms 93.5% + triton_bmm_24826 0.0079 ms 93.1% + triton_bmm_24828 0.0081 ms 90.5% + triton_bmm_24821 0.0083 ms 88.2% + triton_bmm_24820 0.0084 ms 87.7% + triton_bmm_24823 0.0084 ms 87.7% + triton_bmm_24827 0.0088 ms 83.0% + triton_bmm_24831 0.0091 ms 80.6% +SingleProcess AUTOTUNE takes 3.8462 seconds +AUTOTUNE bmm(16x1x262, 16x262x96) + triton_bmm_24849 0.0082 ms 100.0% + triton_bmm_24852 0.0086 ms 95.9% + triton_bmm_24848 0.0088 ms 93.5% + triton_bmm_24850 0.0089 ms 92.8% + triton_bmm_24847 0.0091 ms 90.5% + triton_bmm_24853 0.0099 ms 82.9% + bmm 0.0100 ms 81.8% + triton_bmm_24846 0.0101 ms 81.1% + triton_bmm_24845 0.0104 ms 79.3% + triton_bmm_24844 0.0126 ms 65.4% +SingleProcess AUTOTUNE takes 3.9348 seconds +AUTOTUNE bmm(16x1x96, 16x96x263) + triton_bmm_24922 0.0074 ms 100.0% + triton_bmm_24918 0.0076 ms 97.0% + triton_bmm_24924 0.0076 ms 96.6% + triton_bmm_24917 0.0078 ms 94.3% + triton_bmm_24919 0.0078 ms 94.3% + triton_bmm_24920 0.0081 ms 90.9% + triton_bmm_24921 0.0084 ms 87.8% + triton_bmm_24923 0.0088 ms 83.3% + triton_bmm_24916 0.0091 ms 81.0% + triton_bmm_24927 0.0091 ms 81.0% +SingleProcess AUTOTUNE takes 3.8135 seconds +AUTOTUNE bmm(16x1x263, 16x263x96) + triton_bmm_24949 0.0106 ms 100.0% + bmm 0.0108 ms 97.3% + triton_bmm_24948 0.0108 ms 97.3% + triton_bmm_24946 0.0130 ms 81.1% + triton_bmm_24945 0.0131 ms 80.7% + triton_bmm_24944 0.0132 ms 80.1% + triton_bmm_24942 0.0134 ms 78.6% + triton_bmm_24941 0.0135 ms 78.2% + triton_bmm_24943 0.0144 ms 73.5% + triton_bmm_24940 0.0156 ms 67.5% +SingleProcess AUTOTUNE takes 3.7103 seconds +AUTOTUNE bmm(16x1x96, 16x96x264) + triton_bmm_25016 0.0073 ms 100.0% + triton_bmm_25014 0.0076 ms 96.6% + triton_bmm_25020 0.0076 ms 95.8% + triton_bmm_25018 0.0079 ms 93.1% + triton_bmm_25013 0.0081 ms 90.2% + triton_bmm_25015 0.0083 ms 88.1% + triton_bmm_25017 0.0084 ms 87.7% + triton_bmm_25012 0.0084 ms 87.1% + triton_bmm_25019 0.0088 ms 83.0% + triton_bmm_25023 0.0091 ms 80.6% +SingleProcess AUTOTUNE takes 3.6071 seconds +AUTOTUNE bmm(16x1x264, 16x264x96) + triton_bmm_25042 0.0083 ms 100.0% + triton_bmm_25041 0.0088 ms 94.5% + triton_bmm_25039 0.0091 ms 91.5% + triton_bmm_25044 0.0092 ms 90.9% + triton_bmm_25040 0.0092 ms 90.3% + triton_bmm_25045 0.0093 ms 89.3% + triton_bmm_25038 0.0096 ms 87.0% + bmm 0.0098 ms 84.7% + triton_bmm_25037 0.0099 ms 84.4% + triton_bmm_25036 0.0124 ms 67.4% +SingleProcess AUTOTUNE takes 4.3683 seconds +AUTOTUNE bmm(16x1x96, 16x96x265) + triton_bmm_25111 0.0078 ms 100.0% + triton_bmm_25116 0.0078 ms 100.0% + triton_bmm_25114 0.0080 ms 98.2% + triton_bmm_25113 0.0080 ms 98.0% + triton_bmm_25112 0.0080 ms 97.6% + triton_bmm_25110 0.0080 ms 97.0% + triton_bmm_25109 0.0084 ms 93.5% + triton_bmm_25108 0.0089 ms 87.5% + triton_bmm_25119 0.0091 ms 85.6% + triton_bmm_25118 0.0093 ms 83.8% +SingleProcess AUTOTUNE takes 3.9081 seconds +AUTOTUNE bmm(16x1x265, 16x265x96) + triton_bmm_25141 0.0100 ms 100.0% + bmm 0.0110 ms 91.0% + triton_bmm_25140 0.0116 ms 86.3% + triton_bmm_25137 0.0126 ms 79.4% + triton_bmm_25136 0.0128 ms 77.8% + triton_bmm_25138 0.0131 ms 76.1% + triton_bmm_25134 0.0136 ms 73.6% + triton_bmm_25135 0.0139 ms 72.1% + triton_bmm_25133 0.0139 ms 71.7% + triton_bmm_25132 0.0161 ms 61.9% +SingleProcess AUTOTUNE takes 3.8896 seconds +AUTOTUNE bmm(16x1x96, 16x96x266) + triton_bmm_25207 0.0077 ms 100.0% + triton_bmm_25205 0.0078 ms 99.2% + triton_bmm_25209 0.0078 ms 98.8% + triton_bmm_25210 0.0079 ms 98.4% + triton_bmm_25208 0.0079 ms 98.0% + triton_bmm_25206 0.0081 ms 95.7% + triton_bmm_25212 0.0082 ms 94.2% + triton_bmm_25204 0.0085 ms 90.6% + triton_bmm_25211 0.0093 ms 82.9% + triton_bmm_25215 0.0096 ms 80.7% +SingleProcess AUTOTUNE takes 3.7498 seconds +AUTOTUNE bmm(16x1x266, 16x266x96) + triton_bmm_25233 0.0083 ms 100.0% + triton_bmm_25234 0.0083 ms 100.0% + triton_bmm_25236 0.0087 ms 95.9% + triton_bmm_25232 0.0088 ms 94.2% + triton_bmm_25231 0.0091 ms 91.5% + triton_bmm_25230 0.0097 ms 85.5% + bmm 0.0099 ms 84.1% + triton_bmm_25237 0.0099 ms 84.1% + triton_bmm_25229 0.0104 ms 80.2% + triton_bmm_25228 0.0126 ms 66.2% +SingleProcess AUTOTUNE takes 3.8927 seconds +AUTOTUNE bmm(16x1x96, 16x96x267) + triton_bmm_25308 0.0076 ms 100.0% + triton_bmm_25301 0.0078 ms 97.5% + triton_bmm_25303 0.0078 ms 97.5% + triton_bmm_25304 0.0079 ms 96.0% + triton_bmm_25306 0.0079 ms 96.0% + triton_bmm_25305 0.0080 ms 95.4% + triton_bmm_25302 0.0081 ms 94.1% + triton_bmm_25300 0.0091 ms 83.8% + triton_bmm_25310 0.0093 ms 81.8% + triton_bmm_25307 0.0094 ms 80.7% +SingleProcess AUTOTUNE takes 4.1015 seconds +AUTOTUNE bmm(16x1x267, 16x267x96) + triton_bmm_25333 0.0104 ms 100.0% + triton_bmm_25332 0.0115 ms 90.6% + triton_bmm_25326 0.0131 ms 79.7% + triton_bmm_25329 0.0131 ms 79.5% + triton_bmm_25325 0.0133 ms 78.2% + triton_bmm_25330 0.0133 ms 78.2% + triton_bmm_25328 0.0134 ms 78.0% + triton_bmm_25327 0.0143 ms 72.8% + triton_bmm_25335 0.0161 ms 64.7% + triton_bmm_25324 0.0164 ms 63.5% +SingleProcess AUTOTUNE takes 3.8870 seconds +AUTOTUNE bmm(16x1x96, 16x96x268) + triton_bmm_25398 0.0076 ms 100.0% + triton_bmm_25404 0.0077 ms 98.3% + triton_bmm_25397 0.0078 ms 97.1% + triton_bmm_25400 0.0079 ms 96.3% + triton_bmm_25402 0.0079 ms 96.3% + triton_bmm_25399 0.0083 ms 91.5% + triton_bmm_25396 0.0084 ms 90.8% + triton_bmm_25401 0.0084 ms 90.8% + triton_bmm_25403 0.0088 ms 85.9% + triton_bmm_25407 0.0091 ms 83.5% +SingleProcess AUTOTUNE takes 4.2294 seconds +AUTOTUNE bmm(16x1x268, 16x268x96) + triton_bmm_25425 0.0082 ms 100.0% + triton_bmm_25428 0.0086 ms 95.9% + triton_bmm_25424 0.0087 ms 94.1% + triton_bmm_25426 0.0088 ms 93.1% + triton_bmm_25423 0.0092 ms 89.5% + triton_bmm_25429 0.0096 ms 86.0% + triton_bmm_25422 0.0096 ms 85.4% + bmm 0.0097 ms 85.1% + triton_bmm_25421 0.0100 ms 82.6% + triton_bmm_25420 0.0129 ms 63.9% +SingleProcess AUTOTUNE takes 4.3445 seconds +AUTOTUNE bmm(16x1x96, 16x96x269) + triton_bmm_25496 0.0073 ms 100.0% + triton_bmm_25494 0.0076 ms 96.6% + triton_bmm_25495 0.0078 ms 93.9% + triton_bmm_25500 0.0078 ms 93.9% + triton_bmm_25498 0.0081 ms 90.9% + triton_bmm_25493 0.0084 ms 87.7% + triton_bmm_25497 0.0084 ms 87.7% + triton_bmm_25499 0.0088 ms 83.0% + triton_bmm_25492 0.0091 ms 80.6% + triton_bmm_25503 0.0092 ms 79.2% +SingleProcess AUTOTUNE takes 4.1292 seconds +AUTOTUNE bmm(16x1x269, 16x269x96) + triton_bmm_25525 0.0106 ms 100.0% + triton_bmm_25524 0.0108 ms 97.6% + bmm 0.0116 ms 91.4% + triton_bmm_25521 0.0126 ms 84.2% + triton_bmm_25520 0.0128 ms 82.5% + triton_bmm_25522 0.0131 ms 80.6% + triton_bmm_25518 0.0136 ms 77.9% + triton_bmm_25519 0.0139 ms 76.4% + triton_bmm_25517 0.0139 ms 76.4% + triton_bmm_25516 0.0164 ms 64.7% +SingleProcess AUTOTUNE takes 3.7844 seconds +AUTOTUNE bmm(16x1x96, 16x96x270) + triton_bmm_25594 0.0073 ms 100.0% + triton_bmm_25590 0.0076 ms 96.6% + triton_bmm_25591 0.0078 ms 93.9% + triton_bmm_25593 0.0078 ms 93.5% + triton_bmm_25592 0.0081 ms 90.5% + triton_bmm_25589 0.0083 ms 88.1% + triton_bmm_25596 0.0083 ms 88.1% + triton_bmm_25588 0.0085 ms 85.8% + triton_bmm_25595 0.0094 ms 78.2% + triton_bmm_25599 0.0096 ms 76.3% +SingleProcess AUTOTUNE takes 3.7730 seconds +AUTOTUNE bmm(16x1x270, 16x270x96) + triton_bmm_25617 0.0081 ms 100.0% + triton_bmm_25620 0.0086 ms 94.4% + triton_bmm_25616 0.0088 ms 91.7% + triton_bmm_25618 0.0089 ms 91.3% + triton_bmm_25615 0.0091 ms 89.1% + triton_bmm_25621 0.0093 ms 86.6% + triton_bmm_25614 0.0098 ms 83.0% + triton_bmm_25613 0.0106 ms 76.6% + bmm 0.0106 ms 76.1% + triton_bmm_25612 0.0126 ms 64.2% +SingleProcess AUTOTUNE takes 4.0085 seconds +AUTOTUNE bmm(16x1x96, 16x96x271) + triton_bmm_25688 0.0074 ms 100.0% + triton_bmm_25692 0.0077 ms 95.0% + triton_bmm_25687 0.0078 ms 94.3% + triton_bmm_25685 0.0078 ms 93.9% + triton_bmm_25690 0.0080 ms 92.0% + triton_bmm_25686 0.0081 ms 91.3% + triton_bmm_25689 0.0084 ms 88.1% + triton_bmm_25691 0.0088 ms 83.3% + triton_bmm_25684 0.0090 ms 81.6% + triton_bmm_25695 0.0092 ms 79.6% +SingleProcess AUTOTUNE takes 3.9639 seconds +AUTOTUNE bmm(16x1x271, 16x271x96) + triton_bmm_25717 0.0101 ms 100.0% + triton_bmm_25716 0.0111 ms 91.0% + bmm 0.0116 ms 87.0% + triton_bmm_25710 0.0131 ms 77.0% + triton_bmm_25713 0.0131 ms 76.8% + triton_bmm_25714 0.0132 ms 76.6% + triton_bmm_25712 0.0134 ms 75.4% + triton_bmm_25709 0.0136 ms 74.3% + triton_bmm_25711 0.0139 ms 72.7% + triton_bmm_25708 0.0164 ms 61.5% +SingleProcess AUTOTUNE takes 3.8458 seconds +AUTOTUNE bmm(16x1x96, 16x96x272) + triton_bmm_25783 0.0076 ms 100.0% + triton_bmm_25788 0.0077 ms 98.8% + triton_bmm_25785 0.0078 ms 96.7% + triton_bmm_25784 0.0079 ms 96.3% + triton_bmm_25786 0.0079 ms 96.3% + triton_bmm_25782 0.0081 ms 93.7% + triton_bmm_25781 0.0082 ms 92.2% + triton_bmm_25780 0.0090 ms 84.3% + triton_bmm_25789 0.0093 ms 81.7% + triton_bmm_25790 0.0093 ms 81.4% +SingleProcess AUTOTUNE takes 3.6346 seconds +AUTOTUNE bmm(16x1x272, 16x272x96) + triton_bmm_25809 0.0083 ms 100.0% + triton_bmm_25810 0.0083 ms 100.0% + triton_bmm_25808 0.0086 ms 97.0% + triton_bmm_25812 0.0086 ms 97.0% + triton_bmm_25806 0.0096 ms 86.7% + triton_bmm_25807 0.0097 ms 85.5% + triton_bmm_25813 0.0098 ms 84.7% + triton_bmm_25805 0.0100 ms 82.8% + bmm 0.0101 ms 82.3% + triton_bmm_25804 0.0129 ms 64.7% +SingleProcess AUTOTUNE takes 3.8950 seconds +AUTOTUNE bmm(16x1x96, 16x96x273) + triton_bmm_25882 0.0073 ms 100.0% + triton_bmm_25878 0.0076 ms 96.6% + triton_bmm_25879 0.0078 ms 93.9% + triton_bmm_25884 0.0078 ms 93.9% + triton_bmm_25881 0.0079 ms 93.1% + triton_bmm_25880 0.0081 ms 90.9% + triton_bmm_25877 0.0084 ms 87.7% + triton_bmm_25876 0.0089 ms 82.1% + triton_bmm_25887 0.0092 ms 79.2% + triton_bmm_25885 0.0093 ms 78.7% +SingleProcess AUTOTUNE takes 3.8267 seconds +AUTOTUNE bmm(16x1x273, 16x273x96) + triton_bmm_25909 0.0101 ms 100.0% + triton_bmm_25908 0.0111 ms 91.0% + bmm 0.0118 ms 85.4% + triton_bmm_25902 0.0131 ms 76.8% + triton_bmm_25905 0.0132 ms 76.6% + triton_bmm_25906 0.0133 ms 75.5% + triton_bmm_25904 0.0134 ms 75.4% + triton_bmm_25901 0.0138 ms 73.3% + triton_bmm_25903 0.0146 ms 68.9% + triton_bmm_25900 0.0161 ms 62.6% +SingleProcess AUTOTUNE takes 3.8586 seconds +AUTOTUNE bmm(16x1x96, 16x96x274) + triton_bmm_25978 0.0073 ms 100.0% + triton_bmm_25976 0.0074 ms 99.6% + triton_bmm_25974 0.0076 ms 96.6% + triton_bmm_25973 0.0078 ms 93.9% + triton_bmm_25975 0.0082 ms 89.3% + triton_bmm_25980 0.0083 ms 88.1% + triton_bmm_25977 0.0084 ms 87.7% + triton_bmm_25979 0.0088 ms 83.0% + triton_bmm_25972 0.0090 ms 81.2% + triton_bmm_25982 0.0093 ms 78.7% +SingleProcess AUTOTUNE takes 4.5288 seconds +AUTOTUNE bmm(16x1x274, 16x274x96) + triton_bmm_26001 0.0083 ms 100.0% + triton_bmm_26002 0.0083 ms 100.0% + triton_bmm_26004 0.0086 ms 97.0% + triton_bmm_25999 0.0093 ms 89.3% + triton_bmm_26000 0.0093 ms 89.0% + bmm 0.0100 ms 83.3% + triton_bmm_25997 0.0101 ms 82.5% + triton_bmm_26005 0.0101 ms 82.5% + triton_bmm_25998 0.0101 ms 82.0% + triton_bmm_25996 0.0131 ms 63.4% +SingleProcess AUTOTUNE takes 4.1856 seconds +AUTOTUNE bmm(16x1x96, 16x96x275) + triton_bmm_26072 0.0074 ms 100.0% + triton_bmm_26074 0.0074 ms 100.0% + triton_bmm_26070 0.0076 ms 97.0% + triton_bmm_26071 0.0078 ms 94.3% + triton_bmm_26069 0.0084 ms 88.1% + triton_bmm_26073 0.0084 ms 88.1% + triton_bmm_26076 0.0084 ms 88.1% + triton_bmm_26068 0.0086 ms 85.8% + triton_bmm_26079 0.0093 ms 79.2% + triton_bmm_26077 0.0093 ms 78.8% +SingleProcess AUTOTUNE takes 4.2297 seconds +AUTOTUNE bmm(16x1x275, 16x275x96) + triton_bmm_26101 0.0106 ms 100.0% + triton_bmm_26100 0.0116 ms 91.2% + bmm 0.0121 ms 87.3% + triton_bmm_26094 0.0131 ms 80.9% + triton_bmm_26097 0.0132 ms 80.3% + triton_bmm_26098 0.0133 ms 79.4% + triton_bmm_26096 0.0134 ms 79.0% + triton_bmm_26093 0.0143 ms 73.9% + triton_bmm_26095 0.0146 ms 72.4% + triton_bmm_26092 0.0166 ms 63.8% +SingleProcess AUTOTUNE takes 3.8171 seconds +AUTOTUNE bmm(16x1x96, 16x96x276) + triton_bmm_26168 0.0073 ms 100.0% + triton_bmm_26170 0.0073 ms 100.0% + triton_bmm_26166 0.0076 ms 96.6% + triton_bmm_26167 0.0076 ms 96.6% + triton_bmm_26172 0.0078 ms 93.9% + triton_bmm_26165 0.0083 ms 88.1% + triton_bmm_26164 0.0084 ms 87.7% + triton_bmm_26169 0.0084 ms 87.7% + triton_bmm_26174 0.0093 ms 78.7% + triton_bmm_26171 0.0093 ms 78.4% +SingleProcess AUTOTUNE takes 4.2247 seconds +AUTOTUNE bmm(16x1x276, 16x276x96) + triton_bmm_26193 0.0083 ms 100.0% + triton_bmm_26194 0.0083 ms 100.0% + triton_bmm_26192 0.0086 ms 96.7% + triton_bmm_26191 0.0091 ms 91.5% + triton_bmm_26197 0.0091 ms 91.2% + triton_bmm_26196 0.0092 ms 90.9% + bmm 0.0093 ms 89.0% + triton_bmm_26190 0.0098 ms 85.2% + triton_bmm_26189 0.0106 ms 78.5% + triton_bmm_26188 0.0129 ms 64.5% +SingleProcess AUTOTUNE takes 3.8647 seconds +AUTOTUNE bmm(16x1x96, 16x96x277) + triton_bmm_26266 0.0074 ms 100.0% + triton_bmm_26262 0.0076 ms 97.0% + triton_bmm_26261 0.0078 ms 94.1% + triton_bmm_26264 0.0079 ms 93.1% + triton_bmm_26268 0.0083 ms 88.5% + triton_bmm_26263 0.0084 ms 88.1% + triton_bmm_26265 0.0084 ms 87.8% + triton_bmm_26267 0.0088 ms 83.3% + triton_bmm_26260 0.0090 ms 81.7% + triton_bmm_26271 0.0091 ms 81.0% +SingleProcess AUTOTUNE takes 4.4796 seconds +AUTOTUNE bmm(16x1x277, 16x277x96) + triton_bmm_26293 0.0106 ms 100.0% + triton_bmm_26292 0.0111 ms 96.0% + bmm 0.0122 ms 86.9% + triton_bmm_26289 0.0128 ms 83.0% + triton_bmm_26290 0.0128 ms 82.8% + triton_bmm_26288 0.0129 ms 82.4% + triton_bmm_26286 0.0132 ms 80.4% + triton_bmm_26285 0.0138 ms 76.9% + triton_bmm_26287 0.0146 ms 72.6% + triton_bmm_26284 0.0164 ms 64.8% +SingleProcess AUTOTUNE takes 3.9688 seconds +AUTOTUNE bmm(16x1x96, 16x96x278) + triton_bmm_26360 0.0074 ms 100.0% + triton_bmm_26362 0.0074 ms 100.0% + triton_bmm_26358 0.0076 ms 97.0% + triton_bmm_26357 0.0078 ms 94.3% + triton_bmm_26359 0.0083 ms 88.6% + triton_bmm_26361 0.0084 ms 88.1% + triton_bmm_26364 0.0084 ms 88.1% + triton_bmm_26356 0.0091 ms 81.0% + triton_bmm_26367 0.0091 ms 81.0% + triton_bmm_26366 0.0093 ms 79.0% +SingleProcess AUTOTUNE takes 3.9876 seconds +AUTOTUNE bmm(16x1x278, 16x278x96) + triton_bmm_26388 0.0088 ms 100.0% + triton_bmm_26385 0.0088 ms 99.6% + triton_bmm_26386 0.0089 ms 98.9% + triton_bmm_26384 0.0093 ms 94.2% + triton_bmm_26389 0.0096 ms 92.0% + triton_bmm_26383 0.0099 ms 89.3% + bmm 0.0100 ms 88.1% + triton_bmm_26382 0.0103 ms 85.1% + triton_bmm_26381 0.0106 ms 83.3% + triton_bmm_26380 0.0131 ms 67.1% +SingleProcess AUTOTUNE takes 3.8977 seconds +AUTOTUNE bmm(16x1x96, 16x96x279) + triton_bmm_26458 0.0074 ms 100.0% + triton_bmm_26455 0.0078 ms 94.3% + triton_bmm_26460 0.0078 ms 94.3% + triton_bmm_26454 0.0081 ms 90.9% + triton_bmm_26456 0.0081 ms 90.9% + triton_bmm_26453 0.0084 ms 88.1% + triton_bmm_26457 0.0084 ms 87.1% + triton_bmm_26459 0.0088 ms 83.3% + triton_bmm_26452 0.0091 ms 81.0% + triton_bmm_26461 0.0093 ms 79.0% +SingleProcess AUTOTUNE takes 3.9041 seconds +AUTOTUNE bmm(16x1x279, 16x279x96) + triton_bmm_26485 0.0101 ms 100.0% + triton_bmm_26484 0.0111 ms 91.0% + bmm 0.0121 ms 83.6% + triton_bmm_26481 0.0128 ms 78.6% + triton_bmm_26482 0.0129 ms 78.2% + triton_bmm_26480 0.0136 ms 74.3% + triton_bmm_26478 0.0136 ms 73.9% + triton_bmm_26477 0.0138 ms 72.9% + triton_bmm_26479 0.0143 ms 70.5% + triton_bmm_26476 0.0166 ms 60.6% +SingleProcess AUTOTUNE takes 3.9862 seconds +AUTOTUNE bmm(16x1x96, 16x96x280) + triton_bmm_26552 0.0074 ms 100.0% + triton_bmm_26554 0.0074 ms 100.0% + triton_bmm_26556 0.0078 ms 94.3% + triton_bmm_26553 0.0078 ms 93.9% + triton_bmm_26550 0.0081 ms 90.9% + triton_bmm_26551 0.0083 ms 89.1% + triton_bmm_26549 0.0083 ms 88.6% + triton_bmm_26555 0.0088 ms 83.3% + triton_bmm_26548 0.0091 ms 81.0% + triton_bmm_26558 0.0093 ms 79.0% +SingleProcess AUTOTUNE takes 3.7191 seconds +AUTOTUNE bmm(16x1x280, 16x280x96) + triton_bmm_26577 0.0085 ms 100.0% + triton_bmm_26576 0.0088 ms 96.7% + triton_bmm_26580 0.0088 ms 96.7% + triton_bmm_26578 0.0089 ms 96.0% + triton_bmm_26575 0.0093 ms 91.4% + bmm 0.0096 ms 88.7% + triton_bmm_26581 0.0100 ms 84.7% + triton_bmm_26574 0.0103 ms 82.6% + triton_bmm_26573 0.0106 ms 80.4% + triton_bmm_26572 0.0125 ms 68.0% +SingleProcess AUTOTUNE takes 3.7898 seconds +AUTOTUNE bmm(16x1x96, 16x96x281) + triton_bmm_26648 0.0074 ms 100.0% + triton_bmm_26650 0.0074 ms 100.0% + triton_bmm_26646 0.0076 ms 97.0% + triton_bmm_26652 0.0078 ms 94.3% + triton_bmm_26649 0.0080 ms 92.4% + triton_bmm_26647 0.0083 ms 88.8% + triton_bmm_26645 0.0084 ms 88.1% + triton_bmm_26644 0.0086 ms 85.8% + triton_bmm_26654 0.0093 ms 78.8% + triton_bmm_26651 0.0096 ms 76.9% +SingleProcess AUTOTUNE takes 4.3317 seconds +AUTOTUNE bmm(16x1x281, 16x281x96) + triton_bmm_26677 0.0101 ms 100.0% + triton_bmm_26676 0.0111 ms 90.8% + bmm 0.0121 ms 83.6% + triton_bmm_26673 0.0133 ms 75.5% + triton_bmm_26674 0.0134 ms 75.4% + triton_bmm_26672 0.0134 ms 75.1% + triton_bmm_26670 0.0136 ms 73.9% + triton_bmm_26669 0.0143 ms 70.3% + triton_bmm_26671 0.0146 ms 68.9% + triton_bmm_26668 0.0166 ms 60.7% +SingleProcess AUTOTUNE takes 3.9476 seconds +AUTOTUNE bmm(16x1x96, 16x96x282) + triton_bmm_26746 0.0074 ms 100.0% + triton_bmm_26743 0.0076 ms 96.6% + triton_bmm_26741 0.0078 ms 94.3% + triton_bmm_26748 0.0078 ms 93.9% + triton_bmm_26745 0.0079 ms 93.5% + triton_bmm_26744 0.0079 ms 93.1% + triton_bmm_26742 0.0081 ms 90.9% + triton_bmm_26740 0.0085 ms 86.1% + triton_bmm_26747 0.0088 ms 83.3% + triton_bmm_26750 0.0093 ms 79.0% +SingleProcess AUTOTUNE takes 4.2483 seconds +AUTOTUNE bmm(16x1x282, 16x282x96) + triton_bmm_26769 0.0084 ms 100.0% + triton_bmm_26770 0.0084 ms 100.0% + triton_bmm_26768 0.0088 ms 94.6% + triton_bmm_26772 0.0088 ms 94.6% + triton_bmm_26767 0.0093 ms 89.7% + triton_bmm_26766 0.0098 ms 85.0% + triton_bmm_26773 0.0099 ms 84.1% + bmm 0.0100 ms 83.7% + triton_bmm_26765 0.0106 ms 78.9% + triton_bmm_26764 0.0132 ms 63.5% +SingleProcess AUTOTUNE takes 4.2323 seconds +AUTOTUNE bmm(16x1x96, 16x96x283) + triton_bmm_26842 0.0074 ms 100.0% + triton_bmm_26837 0.0078 ms 94.7% + triton_bmm_26839 0.0078 ms 94.7% + triton_bmm_26844 0.0078 ms 94.3% + triton_bmm_26840 0.0079 ms 93.5% + triton_bmm_26838 0.0081 ms 91.1% + triton_bmm_26841 0.0084 ms 87.8% + triton_bmm_26836 0.0086 ms 86.2% + triton_bmm_26847 0.0091 ms 81.1% + triton_bmm_26843 0.0096 ms 77.1% +SingleProcess AUTOTUNE takes 4.0090 seconds +AUTOTUNE bmm(16x1x283, 16x283x96) + triton_bmm_26869 0.0101 ms 100.0% + triton_bmm_26868 0.0116 ms 86.8% + bmm 0.0126 ms 80.2% + triton_bmm_26864 0.0131 ms 77.0% + triton_bmm_26862 0.0132 ms 76.1% + triton_bmm_26865 0.0133 ms 75.5% + triton_bmm_26866 0.0134 ms 75.4% + triton_bmm_26863 0.0141 ms 71.4% + triton_bmm_26861 0.0141 ms 71.3% + triton_bmm_26860 0.0161 ms 62.6% +SingleProcess AUTOTUNE takes 3.7533 seconds +AUTOTUNE bmm(16x1x96, 16x96x284) + triton_bmm_26938 0.0074 ms 100.0% + triton_bmm_26934 0.0076 ms 97.0% + triton_bmm_26933 0.0078 ms 94.3% + triton_bmm_26936 0.0079 ms 93.7% + triton_bmm_26937 0.0080 ms 91.6% + triton_bmm_26935 0.0084 ms 88.1% + triton_bmm_26940 0.0084 ms 88.1% + triton_bmm_26932 0.0085 ms 86.5% + triton_bmm_26939 0.0094 ms 78.5% + triton_bmm_26941 0.0096 ms 76.7% +SingleProcess AUTOTUNE takes 4.0850 seconds +AUTOTUNE bmm(16x1x284, 16x284x96) + triton_bmm_26962 0.0084 ms 100.0% + triton_bmm_26961 0.0088 ms 94.6% + triton_bmm_26964 0.0088 ms 94.6% + bmm 0.0093 ms 89.4% + triton_bmm_26960 0.0093 ms 89.4% + triton_bmm_26965 0.0097 ms 85.9% + triton_bmm_26959 0.0098 ms 85.2% + triton_bmm_26958 0.0098 ms 85.0% + triton_bmm_26957 0.0106 ms 78.9% + triton_bmm_26956 0.0126 ms 66.4% +SingleProcess AUTOTUNE takes 4.1886 seconds +AUTOTUNE bmm(16x1x96, 16x96x285) + triton_bmm_27031 0.0078 ms 100.0% + triton_bmm_27033 0.0081 ms 96.8% + triton_bmm_27032 0.0081 ms 96.4% + triton_bmm_27034 0.0081 ms 96.4% + triton_bmm_27030 0.0081 ms 96.1% + triton_bmm_27029 0.0084 ms 93.5% + triton_bmm_27036 0.0084 ms 93.5% + triton_bmm_27028 0.0091 ms 85.9% + triton_bmm_27037 0.0093 ms 83.8% + triton_bmm_27038 0.0093 ms 83.6% +SingleProcess AUTOTUNE takes 3.9143 seconds +AUTOTUNE bmm(16x1x285, 16x285x96) + triton_bmm_27061 0.0101 ms 100.0% + triton_bmm_27060 0.0111 ms 91.0% + bmm 0.0126 ms 79.9% + triton_bmm_27058 0.0128 ms 78.6% + triton_bmm_27056 0.0131 ms 77.0% + triton_bmm_27054 0.0131 ms 76.8% + triton_bmm_27057 0.0133 ms 75.5% + triton_bmm_27053 0.0141 ms 71.4% + triton_bmm_27055 0.0141 ms 71.4% + triton_bmm_27052 0.0166 ms 60.6% +SingleProcess AUTOTUNE takes 3.7373 seconds +AUTOTUNE bmm(16x1x96, 16x96x286) + triton_bmm_27128 0.0074 ms 100.0% + triton_bmm_27130 0.0074 ms 100.0% + triton_bmm_27125 0.0078 ms 94.3% + triton_bmm_27126 0.0081 ms 90.9% + triton_bmm_27127 0.0083 ms 88.6% + triton_bmm_27132 0.0084 ms 88.1% + triton_bmm_27129 0.0084 ms 87.5% + triton_bmm_27131 0.0088 ms 83.3% + triton_bmm_27124 0.0091 ms 81.0% + triton_bmm_27135 0.0091 ms 81.0% +SingleProcess AUTOTUNE takes 3.9877 seconds +AUTOTUNE bmm(16x1x286, 16x286x96) + triton_bmm_27154 0.0083 ms 100.0% + triton_bmm_27152 0.0088 ms 94.2% + triton_bmm_27153 0.0088 ms 94.2% + triton_bmm_27156 0.0093 ms 89.0% + triton_bmm_27150 0.0098 ms 84.7% + triton_bmm_27151 0.0098 ms 84.7% + triton_bmm_27149 0.0101 ms 82.5% + triton_bmm_27157 0.0101 ms 82.5% + bmm 0.0103 ms 80.5% + triton_bmm_27148 0.0126 ms 66.0% +SingleProcess AUTOTUNE takes 4.1058 seconds +AUTOTUNE bmm(16x1x96, 16x96x287) + triton_bmm_27222 0.0081 ms 100.0% + triton_bmm_27224 0.0081 ms 99.8% + triton_bmm_27226 0.0081 ms 99.8% + triton_bmm_27223 0.0083 ms 97.5% + triton_bmm_27221 0.0084 ms 96.7% + triton_bmm_27228 0.0084 ms 96.7% + triton_bmm_27225 0.0086 ms 94.2% + triton_bmm_27227 0.0088 ms 91.5% + triton_bmm_27220 0.0090 ms 89.4% + triton_bmm_27230 0.0093 ms 86.5% +SingleProcess AUTOTUNE takes 3.8029 seconds +AUTOTUNE bmm(16x1x287, 16x287x96) + triton_bmm_27253 0.0101 ms 100.0% + triton_bmm_27252 0.0111 ms 90.8% + triton_bmm_27249 0.0118 ms 85.1% + triton_bmm_27250 0.0121 ms 83.3% + bmm 0.0129 ms 78.4% + triton_bmm_27248 0.0129 ms 78.4% + triton_bmm_27246 0.0131 ms 76.8% + triton_bmm_27247 0.0133 ms 75.5% + triton_bmm_27245 0.0139 ms 72.7% + triton_bmm_27244 0.0169 ms 59.8% +SingleProcess AUTOTUNE takes 3.6912 seconds +AUTOTUNE bmm(16x1x96, 16x96x288) + triton_bmm_27322 0.0073 ms 100.0% + triton_bmm_27319 0.0077 ms 94.6% + triton_bmm_27317 0.0078 ms 93.9% + triton_bmm_27321 0.0081 ms 90.9% + triton_bmm_27318 0.0081 ms 90.5% + triton_bmm_27320 0.0081 ms 90.5% + triton_bmm_27324 0.0084 ms 87.7% + triton_bmm_27316 0.0091 ms 80.6% + triton_bmm_27327 0.0091 ms 80.6% + triton_bmm_27325 0.0093 ms 78.7% +SingleProcess AUTOTUNE takes 4.1091 seconds +AUTOTUNE bmm(16x1x288, 16x288x96) + triton_bmm_27346 0.0083 ms 100.0% + triton_bmm_27345 0.0084 ms 98.5% + triton_bmm_27344 0.0088 ms 94.5% + triton_bmm_27343 0.0091 ms 91.5% + triton_bmm_27348 0.0093 ms 89.0% + bmm 0.0098 ms 85.2% + triton_bmm_27342 0.0098 ms 84.7% + triton_bmm_27349 0.0099 ms 84.4% + triton_bmm_27341 0.0106 ms 78.5% + triton_bmm_27340 0.0131 ms 63.4% +SingleProcess AUTOTUNE takes 3.8476 seconds +AUTOTUNE bmm(16x1x96, 16x96x289) + triton_bmm_27418 0.0074 ms 100.0% + triton_bmm_27413 0.0078 ms 94.3% + triton_bmm_27416 0.0079 ms 93.1% + triton_bmm_27417 0.0081 ms 91.7% + triton_bmm_27414 0.0081 ms 91.3% + triton_bmm_27415 0.0082 ms 90.1% + triton_bmm_27420 0.0083 ms 88.8% + triton_bmm_27419 0.0088 ms 83.7% + triton_bmm_27412 0.0091 ms 81.3% + triton_bmm_27421 0.0093 ms 79.4% +SingleProcess AUTOTUNE takes 3.8003 seconds +AUTOTUNE bmm(16x1x289, 16x289x96) + triton_bmm_27445 0.0106 ms 100.0% + bmm 0.0111 ms 95.7% + triton_bmm_27444 0.0116 ms 91.5% + triton_bmm_27441 0.0131 ms 81.2% + triton_bmm_27438 0.0136 ms 78.1% + triton_bmm_27442 0.0136 ms 77.9% + triton_bmm_27440 0.0139 ms 76.7% + triton_bmm_27439 0.0146 ms 72.8% + triton_bmm_27437 0.0146 ms 72.6% + triton_bmm_27436 0.0166 ms 63.8% +SingleProcess AUTOTUNE takes 3.9757 seconds +AUTOTUNE bmm(16x1x96, 16x96x290) + triton_bmm_27512 0.0074 ms 100.0% + triton_bmm_27514 0.0074 ms 99.6% + triton_bmm_27510 0.0076 ms 97.0% + triton_bmm_27516 0.0078 ms 93.9% + triton_bmm_27509 0.0084 ms 88.1% + triton_bmm_27511 0.0084 ms 88.1% + triton_bmm_27513 0.0086 ms 86.0% + triton_bmm_27508 0.0091 ms 81.3% + triton_bmm_27518 0.0093 ms 78.8% + triton_bmm_27515 0.0094 ms 78.2% +SingleProcess AUTOTUNE takes 3.7317 seconds +AUTOTUNE bmm(16x1x290, 16x290x96) + triton_bmm_27537 0.0084 ms 100.0% + triton_bmm_27538 0.0086 ms 97.4% + triton_bmm_27540 0.0093 ms 89.7% + bmm 0.0096 ms 87.3% + triton_bmm_27541 0.0096 ms 87.3% + triton_bmm_27536 0.0096 ms 87.0% + triton_bmm_27534 0.0101 ms 82.9% + triton_bmm_27535 0.0101 ms 82.9% + triton_bmm_27533 0.0108 ms 77.0% + triton_bmm_27532 0.0139 ms 60.3% +SingleProcess AUTOTUNE takes 4.1591 seconds +AUTOTUNE bmm(16x1x96, 16x96x291) + triton_bmm_27610 0.0074 ms 100.0% + triton_bmm_27606 0.0076 ms 97.5% + triton_bmm_27607 0.0078 ms 94.7% + triton_bmm_27612 0.0078 ms 94.3% + triton_bmm_27608 0.0080 ms 92.4% + triton_bmm_27609 0.0081 ms 91.7% + triton_bmm_27605 0.0084 ms 88.5% + triton_bmm_27604 0.0086 ms 86.2% + triton_bmm_27611 0.0088 ms 83.7% + triton_bmm_27615 0.0093 ms 79.4% +SingleProcess AUTOTUNE takes 4.2873 seconds +AUTOTUNE bmm(16x1x291, 16x291x96) + triton_bmm_27637 0.0107 ms 100.0% + triton_bmm_27636 0.0111 ms 96.3% + bmm 0.0116 ms 92.3% + triton_bmm_27633 0.0137 ms 78.0% + triton_bmm_27634 0.0138 ms 77.5% + triton_bmm_27630 0.0138 ms 77.3% + triton_bmm_27632 0.0139 ms 77.1% + triton_bmm_27629 0.0142 ms 75.2% + triton_bmm_27631 0.0146 ms 73.2% + triton_bmm_27628 0.0168 ms 63.5% +SingleProcess AUTOTUNE takes 3.9043 seconds +AUTOTUNE bmm(16x1x96, 16x96x292) + triton_bmm_27706 0.0074 ms 100.0% + triton_bmm_27708 0.0078 ms 93.9% + triton_bmm_27704 0.0079 ms 92.7% + triton_bmm_27702 0.0081 ms 90.9% + triton_bmm_27701 0.0084 ms 88.1% + triton_bmm_27703 0.0084 ms 88.1% + triton_bmm_27705 0.0085 ms 86.1% + triton_bmm_27700 0.0091 ms 81.0% + triton_bmm_27709 0.0091 ms 81.0% + triton_bmm_27707 0.0094 ms 78.5% +SingleProcess AUTOTUNE takes 3.7510 seconds +AUTOTUNE bmm(16x1x292, 16x292x96) + triton_bmm_27730 0.0086 ms 100.0% + triton_bmm_27732 0.0088 ms 97.1% + triton_bmm_27729 0.0089 ms 96.4% + triton_bmm_27733 0.0093 ms 92.1% + triton_bmm_27727 0.0096 ms 89.6% + triton_bmm_27728 0.0096 ms 89.3% + bmm 0.0096 ms 89.0% + triton_bmm_27726 0.0104 ms 82.5% + triton_bmm_27725 0.0108 ms 79.1% + triton_bmm_27724 0.0131 ms 65.5% +SingleProcess AUTOTUNE takes 4.2315 seconds +AUTOTUNE bmm(16x1x96, 16x96x293) + triton_bmm_27802 0.0074 ms 100.0% + triton_bmm_27797 0.0078 ms 94.3% + triton_bmm_27799 0.0078 ms 94.3% + triton_bmm_27804 0.0078 ms 94.3% + triton_bmm_27800 0.0081 ms 91.7% + triton_bmm_27801 0.0081 ms 91.7% + triton_bmm_27798 0.0083 ms 88.8% + triton_bmm_27796 0.0086 ms 86.2% + triton_bmm_27803 0.0090 ms 82.2% + triton_bmm_27805 0.0092 ms 79.9% +SingleProcess AUTOTUNE takes 3.9936 seconds +AUTOTUNE bmm(16x1x293, 16x293x96) + triton_bmm_27829 0.0108 ms 100.0% + triton_bmm_27828 0.0111 ms 97.4% + bmm 0.0116 ms 93.4% + triton_bmm_27825 0.0133 ms 81.1% + triton_bmm_27826 0.0133 ms 81.1% + triton_bmm_27822 0.0138 ms 78.2% + triton_bmm_27824 0.0139 ms 77.9% + triton_bmm_27823 0.0146 ms 74.1% + triton_bmm_27821 0.0147 ms 73.8% + triton_bmm_27820 0.0168 ms 64.3% +SingleProcess AUTOTUNE takes 4.2716 seconds +AUTOTUNE bmm(16x1x96, 16x96x294) + triton_bmm_27900 0.0078 ms 100.0% + triton_bmm_27893 0.0078 ms 99.6% + triton_bmm_27897 0.0081 ms 96.8% + triton_bmm_27894 0.0081 ms 96.4% + triton_bmm_27896 0.0081 ms 96.4% + triton_bmm_27898 0.0081 ms 96.4% + triton_bmm_27895 0.0084 ms 93.5% + triton_bmm_27892 0.0086 ms 91.0% + triton_bmm_27903 0.0091 ms 85.9% + triton_bmm_27902 0.0093 ms 83.6% +SingleProcess AUTOTUNE takes 3.7405 seconds +AUTOTUNE bmm(16x1x294, 16x294x96) + triton_bmm_27921 0.0084 ms 100.0% + triton_bmm_27922 0.0086 ms 97.4% + triton_bmm_27924 0.0088 ms 94.6% + triton_bmm_27920 0.0091 ms 92.2% + triton_bmm_27925 0.0096 ms 87.3% + triton_bmm_27918 0.0101 ms 82.9% + triton_bmm_27919 0.0101 ms 82.9% + bmm 0.0102 ms 81.8% + triton_bmm_27917 0.0103 ms 80.8% + triton_bmm_27916 0.0133 ms 62.6% +SingleProcess AUTOTUNE takes 4.0815 seconds +AUTOTUNE bmm(16x1x96, 16x96x295) + triton_bmm_27992 0.0074 ms 100.0% + triton_bmm_27996 0.0078 ms 94.3% + triton_bmm_27994 0.0080 ms 92.6% + triton_bmm_27990 0.0084 ms 88.5% + triton_bmm_27991 0.0084 ms 88.5% + triton_bmm_27989 0.0084 ms 88.2% + triton_bmm_27988 0.0086 ms 86.2% + triton_bmm_27993 0.0086 ms 85.9% + triton_bmm_27998 0.0093 ms 79.1% + triton_bmm_27995 0.0096 ms 77.0% +SingleProcess AUTOTUNE takes 4.0060 seconds +AUTOTUNE bmm(16x1x295, 16x295x96) + triton_bmm_28021 0.0102 ms 100.0% + triton_bmm_28020 0.0118 ms 87.0% + triton_bmm_28017 0.0133 ms 76.7% + triton_bmm_28018 0.0139 ms 73.9% + triton_bmm_28016 0.0140 ms 73.4% + triton_bmm_28014 0.0143 ms 71.5% + triton_bmm_28013 0.0148 ms 69.0% + triton_bmm_28015 0.0148 ms 69.0% + triton_bmm_28012 0.0169 ms 60.7% + bmm 0.0172 ms 59.7% +SingleProcess AUTOTUNE takes 3.7695 seconds +AUTOTUNE bmm(16x1x96, 16x96x296) + triton_bmm_28090 0.0074 ms 100.0% + triton_bmm_28085 0.0078 ms 94.3% + triton_bmm_28087 0.0078 ms 94.3% + triton_bmm_28088 0.0081 ms 91.3% + triton_bmm_28086 0.0081 ms 90.6% + triton_bmm_28092 0.0084 ms 88.1% + triton_bmm_28089 0.0084 ms 87.5% + triton_bmm_28084 0.0091 ms 81.0% + triton_bmm_28094 0.0093 ms 78.8% + triton_bmm_28091 0.0094 ms 78.5% +SingleProcess AUTOTUNE takes 3.7662 seconds +AUTOTUNE bmm(16x1x296, 16x296x96) + triton_bmm_28114 0.0086 ms 100.0% + triton_bmm_28113 0.0088 ms 97.8% + triton_bmm_28112 0.0088 ms 97.1% + triton_bmm_28116 0.0093 ms 91.8% + bmm 0.0098 ms 87.6% + triton_bmm_28110 0.0098 ms 87.3% + triton_bmm_28111 0.0101 ms 85.1% + triton_bmm_28117 0.0101 ms 84.8% + triton_bmm_28109 0.0108 ms 79.1% + triton_bmm_28108 0.0136 ms 63.1% +SingleProcess AUTOTUNE takes 3.8394 seconds +AUTOTUNE bmm(16x1x96, 16x96x297) + triton_bmm_28186 0.0074 ms 100.0% + triton_bmm_28181 0.0078 ms 94.3% + triton_bmm_28188 0.0078 ms 94.3% + triton_bmm_28185 0.0081 ms 91.7% + triton_bmm_28182 0.0081 ms 91.3% + triton_bmm_28184 0.0081 ms 91.3% + triton_bmm_28183 0.0084 ms 88.5% + triton_bmm_28187 0.0089 ms 83.4% + triton_bmm_28180 0.0091 ms 81.3% + triton_bmm_28191 0.0092 ms 79.9% +SingleProcess AUTOTUNE takes 3.7413 seconds +AUTOTUNE bmm(16x1x297, 16x297x96) + triton_bmm_28213 0.0102 ms 100.0% + triton_bmm_28212 0.0113 ms 90.4% + bmm 0.0116 ms 88.2% + triton_bmm_28210 0.0133 ms 76.7% + triton_bmm_28208 0.0136 ms 75.3% + triton_bmm_28209 0.0138 ms 74.1% + triton_bmm_28206 0.0139 ms 73.9% + triton_bmm_28205 0.0143 ms 71.4% + triton_bmm_28207 0.0148 ms 69.0% + triton_bmm_28204 0.0174 ms 58.9% +SingleProcess AUTOTUNE takes 3.7202 seconds +AUTOTUNE bmm(16x1x96, 16x96x298) + triton_bmm_28280 0.0074 ms 100.0% + triton_bmm_28284 0.0078 ms 94.7% + triton_bmm_28281 0.0081 ms 92.1% + triton_bmm_28278 0.0081 ms 91.7% + triton_bmm_28282 0.0081 ms 91.3% + triton_bmm_28277 0.0084 ms 88.9% + triton_bmm_28279 0.0084 ms 88.9% + triton_bmm_28276 0.0086 ms 86.6% + triton_bmm_28283 0.0094 ms 79.2% + triton_bmm_28287 0.0096 ms 77.3% +SingleProcess AUTOTUNE takes 3.7710 seconds +AUTOTUNE bmm(16x1x298, 16x298x96) + triton_bmm_28305 0.0084 ms 100.0% + triton_bmm_28306 0.0086 ms 97.4% + triton_bmm_28308 0.0088 ms 94.6% + triton_bmm_28309 0.0096 ms 87.3% + triton_bmm_28304 0.0096 ms 87.0% + triton_bmm_28303 0.0101 ms 82.9% + triton_bmm_28301 0.0103 ms 80.8% + triton_bmm_28302 0.0106 ms 78.9% + bmm 0.0106 ms 78.6% + triton_bmm_28300 0.0133 ms 62.6% +SingleProcess AUTOTUNE takes 3.8559 seconds +AUTOTUNE bmm(16x1x96, 16x96x299) + triton_bmm_28376 0.0074 ms 100.0% + triton_bmm_28374 0.0076 ms 97.5% + triton_bmm_28380 0.0080 ms 92.4% + triton_bmm_28377 0.0081 ms 91.7% + triton_bmm_28378 0.0081 ms 90.9% + triton_bmm_28373 0.0084 ms 88.5% + triton_bmm_28375 0.0084 ms 88.5% + triton_bmm_28372 0.0086 ms 86.2% + triton_bmm_28379 0.0089 ms 83.1% + triton_bmm_28383 0.0092 ms 80.2% +SingleProcess AUTOTUNE takes 4.3408 seconds +AUTOTUNE bmm(16x1x299, 16x299x96) + triton_bmm_28405 0.0103 ms 100.0% + triton_bmm_28404 0.0118 ms 87.0% + bmm 0.0121 ms 85.0% + triton_bmm_28402 0.0136 ms 75.8% + triton_bmm_28400 0.0138 ms 74.9% + triton_bmm_28398 0.0139 ms 74.4% + triton_bmm_28401 0.0139 ms 74.4% + triton_bmm_28397 0.0143 ms 71.9% + triton_bmm_28399 0.0154 ms 67.1% + triton_bmm_28396 0.0176 ms 58.4% +SingleProcess AUTOTUNE takes 3.8279 seconds +AUTOTUNE bmm(16x1x96, 16x96x300) + triton_bmm_28472 0.0074 ms 100.0% + triton_bmm_28474 0.0081 ms 90.9% + triton_bmm_28470 0.0081 ms 90.6% + triton_bmm_28469 0.0084 ms 88.1% + triton_bmm_28471 0.0084 ms 88.1% + triton_bmm_28476 0.0084 ms 88.1% + triton_bmm_28468 0.0086 ms 85.8% + triton_bmm_28473 0.0086 ms 85.5% + triton_bmm_28477 0.0091 ms 81.0% + triton_bmm_28479 0.0092 ms 80.3% +SingleProcess AUTOTUNE takes 3.8661 seconds +AUTOTUNE bmm(16x1x300, 16x300x96) + triton_bmm_28497 0.0084 ms 100.0% + triton_bmm_28498 0.0086 ms 97.4% + bmm 0.0091 ms 91.6% + triton_bmm_28500 0.0093 ms 89.4% + triton_bmm_28495 0.0096 ms 87.3% + triton_bmm_28496 0.0096 ms 87.0% + triton_bmm_28501 0.0098 ms 85.3% + triton_bmm_28494 0.0099 ms 84.7% + triton_bmm_28493 0.0108 ms 77.0% + triton_bmm_28492 0.0136 ms 61.4% +SingleProcess AUTOTUNE takes 3.7978 seconds +AUTOTUNE bmm(16x1x96, 16x96x301) + triton_bmm_28568 0.0076 ms 100.0% + triton_bmm_28570 0.0076 ms 100.0% + triton_bmm_28565 0.0078 ms 96.3% + triton_bmm_28567 0.0078 ms 96.3% + triton_bmm_28566 0.0081 ms 92.9% + triton_bmm_28572 0.0084 ms 90.4% + triton_bmm_28564 0.0086 ms 88.1% + triton_bmm_28569 0.0086 ms 88.1% + triton_bmm_28571 0.0089 ms 85.2% + triton_bmm_28575 0.0093 ms 81.1% +SingleProcess AUTOTUNE takes 4.0762 seconds +AUTOTUNE bmm(16x1x301, 16x301x96) + triton_bmm_28597 0.0108 ms 100.0% + triton_bmm_28596 0.0113 ms 95.8% + bmm 0.0124 ms 87.8% + triton_bmm_28593 0.0133 ms 81.3% + triton_bmm_28592 0.0136 ms 79.8% + triton_bmm_28590 0.0139 ms 78.3% + triton_bmm_28594 0.0139 ms 78.1% + triton_bmm_28589 0.0148 ms 73.1% + triton_bmm_28591 0.0152 ms 71.5% + triton_bmm_28588 0.0171 ms 63.5% +SingleProcess AUTOTUNE takes 3.8236 seconds +AUTOTUNE bmm(16x1x96, 16x96x302) + triton_bmm_28664 0.0076 ms 100.0% + triton_bmm_28662 0.0076 ms 99.6% + triton_bmm_28661 0.0078 ms 96.3% + triton_bmm_28663 0.0078 ms 96.3% + triton_bmm_28668 0.0078 ms 96.3% + triton_bmm_28665 0.0081 ms 93.7% + triton_bmm_28666 0.0081 ms 93.3% + triton_bmm_28667 0.0088 ms 85.5% + triton_bmm_28660 0.0091 ms 83.1% + triton_bmm_28670 0.0093 ms 80.8% +SingleProcess AUTOTUNE takes 3.9659 seconds +AUTOTUNE bmm(16x1x302, 16x302x96) + triton_bmm_28689 0.0091 ms 100.0% + triton_bmm_28690 0.0091 ms 99.8% + triton_bmm_28692 0.0093 ms 97.1% + triton_bmm_28693 0.0096 ms 94.8% + triton_bmm_28688 0.0096 ms 94.5% + triton_bmm_28686 0.0101 ms 90.0% + triton_bmm_28687 0.0101 ms 90.0% + triton_bmm_28685 0.0103 ms 87.8% + bmm 0.0104 ms 87.5% + triton_bmm_28684 0.0133 ms 68.0% +SingleProcess AUTOTUNE takes 3.9956 seconds +AUTOTUNE bmm(16x1x96, 16x96x303) + triton_bmm_28760 0.0076 ms 100.0% + triton_bmm_28762 0.0076 ms 100.0% + triton_bmm_28758 0.0076 ms 99.6% + triton_bmm_28759 0.0078 ms 96.3% + triton_bmm_28761 0.0081 ms 93.7% + triton_bmm_28764 0.0084 ms 90.4% + triton_bmm_28756 0.0086 ms 88.1% + triton_bmm_28757 0.0086 ms 87.7% + triton_bmm_28763 0.0089 ms 84.4% + triton_bmm_28767 0.0093 ms 81.1% +SingleProcess AUTOTUNE takes 3.6854 seconds +AUTOTUNE bmm(16x1x303, 16x303x96) + triton_bmm_28789 0.0103 ms 100.0% + triton_bmm_28788 0.0113 ms 91.0% + bmm 0.0124 ms 83.0% + triton_bmm_28785 0.0134 ms 76.8% + triton_bmm_28786 0.0136 ms 75.9% + triton_bmm_28782 0.0139 ms 74.4% + triton_bmm_28784 0.0141 ms 72.9% + triton_bmm_28783 0.0147 ms 70.0% + triton_bmm_28781 0.0149 ms 69.2% + triton_bmm_28780 0.0175 ms 59.0% +SingleProcess AUTOTUNE takes 4.9126 seconds +AUTOTUNE bmm(16x1x96, 16x96x304) + triton_bmm_28854 0.0076 ms 100.0% + triton_bmm_28853 0.0078 ms 97.1% + triton_bmm_28860 0.0078 ms 97.1% + triton_bmm_28858 0.0079 ms 96.2% + triton_bmm_28857 0.0081 ms 94.4% + triton_bmm_28856 0.0081 ms 94.1% + triton_bmm_28855 0.0084 ms 91.2% + triton_bmm_28852 0.0086 ms 88.8% + triton_bmm_28861 0.0088 ms 86.5% + triton_bmm_28859 0.0088 ms 86.2% +SingleProcess AUTOTUNE takes 3.8198 seconds +AUTOTUNE bmm(16x1x304, 16x304x96) + triton_bmm_28881 0.0085 ms 100.0% + triton_bmm_28884 0.0088 ms 96.7% + triton_bmm_28880 0.0091 ms 94.0% + triton_bmm_28882 0.0091 ms 94.0% + triton_bmm_28885 0.0094 ms 90.5% + triton_bmm_28879 0.0096 ms 89.3% + triton_bmm_28877 0.0103 ms 82.7% + triton_bmm_28878 0.0104 ms 81.9% + triton_bmm_28876 0.0136 ms 62.8% + triton_bmm_28883 0.0143 ms 59.6% +SingleProcess AUTOTUNE takes 3.9009 seconds +AUTOTUNE bmm(16x1x96, 16x96x305) + triton_bmm_28952 0.0076 ms 100.0% + triton_bmm_28954 0.0076 ms 100.0% + triton_bmm_28951 0.0078 ms 96.7% + triton_bmm_28950 0.0082 ms 92.9% + triton_bmm_28956 0.0084 ms 90.5% + triton_bmm_28949 0.0084 ms 90.1% + triton_bmm_28953 0.0086 ms 88.1% + triton_bmm_28948 0.0091 ms 83.5% + triton_bmm_28959 0.0093 ms 81.4% + triton_bmm_28955 0.0096 ms 79.1% +SingleProcess AUTOTUNE takes 4.1321 seconds +AUTOTUNE bmm(16x1x305, 16x305x96) + bmm 0.0104 ms 100.0% + triton_bmm_28981 0.0108 ms 95.9% + triton_bmm_28980 0.0118 ms 87.8% + triton_bmm_28978 0.0136 ms 76.5% + triton_bmm_28977 0.0139 ms 74.9% + triton_bmm_28976 0.0141 ms 73.5% + triton_bmm_28974 0.0144 ms 72.2% + triton_bmm_28973 0.0146 ms 71.3% + triton_bmm_28975 0.0156 ms 66.6% + triton_bmm_28972 0.0172 ms 60.6% +SingleProcess AUTOTUNE takes 3.8133 seconds +AUTOTUNE bmm(16x1x96, 16x96x306) + triton_bmm_29048 0.0076 ms 100.0% + triton_bmm_29052 0.0078 ms 96.7% + triton_bmm_29049 0.0081 ms 94.0% + triton_bmm_29050 0.0081 ms 93.3% + triton_bmm_29046 0.0083 ms 91.9% + triton_bmm_29047 0.0084 ms 90.8% + triton_bmm_29045 0.0084 ms 90.5% + triton_bmm_29051 0.0088 ms 85.9% + triton_bmm_29044 0.0091 ms 83.5% + triton_bmm_29055 0.0091 ms 83.5% +SingleProcess AUTOTUNE takes 4.0444 seconds +AUTOTUNE bmm(16x1x306, 16x306x96) + triton_bmm_29073 0.0091 ms 100.0% + triton_bmm_29074 0.0091 ms 100.0% + triton_bmm_29076 0.0093 ms 97.3% + triton_bmm_29071 0.0096 ms 95.0% + triton_bmm_29077 0.0096 ms 95.0% + triton_bmm_29072 0.0096 ms 94.7% + triton_bmm_29070 0.0101 ms 90.2% + triton_bmm_29069 0.0103 ms 87.9% + bmm 0.0128 ms 70.7% + triton_bmm_29068 0.0139 ms 65.6% +SingleProcess AUTOTUNE takes 4.1983 seconds +AUTOTUNE bmm(16x1x96, 16x96x307) + triton_bmm_29144 0.0081 ms 100.0% + triton_bmm_29146 0.0081 ms 100.0% + triton_bmm_29142 0.0083 ms 97.3% + triton_bmm_29143 0.0084 ms 96.9% + triton_bmm_29148 0.0084 ms 96.4% + triton_bmm_29141 0.0084 ms 96.2% + triton_bmm_29140 0.0086 ms 94.4% + triton_bmm_29145 0.0086 ms 94.1% + triton_bmm_29147 0.0091 ms 89.4% + triton_bmm_29151 0.0093 ms 86.9% +SingleProcess AUTOTUNE takes 3.9736 seconds +AUTOTUNE bmm(16x1x307, 16x307x96) + triton_bmm_29173 0.0108 ms 100.0% + bmm 0.0111 ms 98.0% + triton_bmm_29172 0.0113 ms 95.8% + triton_bmm_29170 0.0136 ms 79.8% + triton_bmm_29166 0.0139 ms 77.9% + triton_bmm_29169 0.0141 ms 77.0% + triton_bmm_29168 0.0143 ms 75.8% + triton_bmm_29165 0.0146 ms 74.3% + triton_bmm_29167 0.0151 ms 72.0% + triton_bmm_29164 0.0173 ms 62.8% +SingleProcess AUTOTUNE takes 3.7480 seconds +AUTOTUNE bmm(16x1x96, 16x96x308) + triton_bmm_29238 0.0077 ms 100.0% + triton_bmm_29239 0.0078 ms 98.0% + triton_bmm_29244 0.0078 ms 98.0% + triton_bmm_29240 0.0080 ms 96.0% + triton_bmm_29242 0.0081 ms 94.9% + triton_bmm_29237 0.0084 ms 92.0% + triton_bmm_29241 0.0086 ms 89.2% + triton_bmm_29236 0.0091 ms 84.5% + triton_bmm_29246 0.0093 ms 82.2% + triton_bmm_29243 0.0094 ms 81.9% +SingleProcess AUTOTUNE takes 4.2004 seconds +AUTOTUNE bmm(16x1x308, 16x308x96) + triton_bmm_29265 0.0084 ms 100.0% + triton_bmm_29266 0.0086 ms 97.8% + triton_bmm_29268 0.0088 ms 94.9% + triton_bmm_29264 0.0091 ms 92.6% + triton_bmm_29263 0.0096 ms 87.6% + triton_bmm_29269 0.0098 ms 85.3% + bmm 0.0099 ms 84.8% + triton_bmm_29262 0.0100 ms 84.0% + triton_bmm_29261 0.0103 ms 81.1% + triton_bmm_29260 0.0131 ms 63.9% +SingleProcess AUTOTUNE takes 4.0688 seconds +AUTOTUNE bmm(16x1x96, 16x96x309) + triton_bmm_29336 0.0076 ms 100.0% + triton_bmm_29335 0.0078 ms 96.7% + triton_bmm_29340 0.0078 ms 96.7% + triton_bmm_29333 0.0079 ms 96.3% + triton_bmm_29338 0.0081 ms 93.9% + triton_bmm_29337 0.0081 ms 93.7% + triton_bmm_29334 0.0081 ms 93.3% + triton_bmm_29332 0.0086 ms 88.4% + triton_bmm_29339 0.0091 ms 83.7% + triton_bmm_29343 0.0093 ms 81.4% +SingleProcess AUTOTUNE takes 3.8295 seconds +AUTOTUNE bmm(16x1x309, 16x309x96) + triton_bmm_29365 0.0108 ms 100.0% + bmm 0.0111 ms 98.0% + triton_bmm_29364 0.0118 ms 91.6% + triton_bmm_29362 0.0136 ms 80.0% + triton_bmm_29360 0.0136 ms 79.8% + triton_bmm_29358 0.0139 ms 78.1% + triton_bmm_29361 0.0139 ms 78.1% + triton_bmm_29357 0.0151 ms 71.8% + triton_bmm_29359 0.0154 ms 70.3% + triton_bmm_29356 0.0174 ms 62.2% +SingleProcess AUTOTUNE takes 4.0059 seconds +AUTOTUNE bmm(16x1x96, 16x96x310) + triton_bmm_29432 0.0076 ms 100.0% + triton_bmm_29430 0.0077 ms 98.3% + triton_bmm_29429 0.0078 ms 96.7% + triton_bmm_29434 0.0081 ms 94.0% + triton_bmm_29433 0.0081 ms 93.7% + triton_bmm_29431 0.0084 ms 90.8% + triton_bmm_29436 0.0084 ms 90.8% + triton_bmm_29428 0.0090 ms 83.9% + triton_bmm_29438 0.0093 ms 81.2% + triton_bmm_29435 0.0096 ms 79.3% +SingleProcess AUTOTUNE takes 3.8083 seconds +AUTOTUNE bmm(16x1x310, 16x310x96) + triton_bmm_29457 0.0084 ms 100.0% + triton_bmm_29458 0.0086 ms 97.4% + triton_bmm_29460 0.0088 ms 94.6% + triton_bmm_29456 0.0091 ms 91.9% + bmm 0.0101 ms 82.9% + triton_bmm_29455 0.0101 ms 82.7% + triton_bmm_29461 0.0101 ms 82.6% + triton_bmm_29453 0.0108 ms 77.0% + triton_bmm_29452 0.0133 ms 62.6% + triton_bmm_29459 0.0143 ms 58.3% +SingleProcess AUTOTUNE takes 4.1280 seconds +AUTOTUNE bmm(16x1x96, 16x96x311) + triton_bmm_29530 0.0076 ms 100.0% + triton_bmm_29527 0.0078 ms 96.7% + triton_bmm_29532 0.0078 ms 96.7% + triton_bmm_29525 0.0079 ms 95.6% + triton_bmm_29528 0.0081 ms 94.0% + triton_bmm_29526 0.0084 ms 90.8% + triton_bmm_29524 0.0086 ms 88.4% + triton_bmm_29529 0.0086 ms 88.1% + triton_bmm_29533 0.0093 ms 81.2% + triton_bmm_29534 0.0093 ms 81.2% +SingleProcess AUTOTUNE takes 3.7681 seconds +AUTOTUNE bmm(16x1x311, 16x311x96) + triton_bmm_29557 0.0108 ms 100.0% + triton_bmm_29556 0.0113 ms 95.8% + bmm 0.0116 ms 93.3% + triton_bmm_29553 0.0141 ms 77.1% + triton_bmm_29554 0.0141 ms 76.9% + triton_bmm_29552 0.0141 ms 76.7% + triton_bmm_29550 0.0145 ms 75.0% + triton_bmm_29549 0.0146 ms 74.2% + triton_bmm_29551 0.0156 ms 69.5% + triton_bmm_29548 0.0177 ms 61.4% +SingleProcess AUTOTUNE takes 3.8615 seconds +AUTOTUNE bmm(16x1x96, 16x96x312) + triton_bmm_29626 0.0076 ms 100.0% + triton_bmm_29622 0.0077 ms 97.9% + triton_bmm_29621 0.0078 ms 96.7% + triton_bmm_29623 0.0078 ms 96.7% + triton_bmm_29624 0.0081 ms 93.7% + triton_bmm_29628 0.0084 ms 90.8% + triton_bmm_29625 0.0086 ms 88.1% + triton_bmm_29620 0.0091 ms 83.5% + triton_bmm_29630 0.0093 ms 81.2% + triton_bmm_29629 0.0094 ms 80.6% +SingleProcess AUTOTUNE takes 4.0122 seconds +AUTOTUNE bmm(16x1x312, 16x312x96) + triton_bmm_29650 0.0091 ms 100.0% + triton_bmm_29649 0.0093 ms 97.9% + triton_bmm_29652 0.0095 ms 95.8% + bmm 0.0096 ms 94.7% + triton_bmm_29648 0.0096 ms 94.7% + triton_bmm_29647 0.0101 ms 90.2% + triton_bmm_29653 0.0101 ms 89.9% + triton_bmm_29646 0.0106 ms 86.1% + triton_bmm_29645 0.0109 ms 83.5% + triton_bmm_29644 0.0136 ms 66.7% +SingleProcess AUTOTUNE takes 4.6317 seconds +AUTOTUNE bmm(16x1x96, 16x96x313) + triton_bmm_29720 0.0076 ms 100.0% + triton_bmm_29717 0.0081 ms 94.0% + triton_bmm_29722 0.0081 ms 93.7% + triton_bmm_29718 0.0084 ms 90.8% + triton_bmm_29719 0.0084 ms 90.5% + triton_bmm_29724 0.0085 ms 88.9% + triton_bmm_29716 0.0086 ms 88.4% + triton_bmm_29721 0.0086 ms 88.1% + triton_bmm_29725 0.0093 ms 81.4% + triton_bmm_29726 0.0093 ms 81.2% +SingleProcess AUTOTUNE takes 4.0652 seconds +AUTOTUNE bmm(16x1x313, 16x313x96) + triton_bmm_29749 0.0108 ms 100.0% + bmm 0.0116 ms 93.1% + triton_bmm_29748 0.0118 ms 91.6% + triton_bmm_29745 0.0136 ms 80.0% + triton_bmm_29746 0.0136 ms 79.8% + triton_bmm_29744 0.0139 ms 78.3% + triton_bmm_29742 0.0144 ms 75.2% + triton_bmm_29741 0.0146 ms 74.3% + triton_bmm_29743 0.0156 ms 69.6% + triton_bmm_29740 0.0173 ms 62.5% +SingleProcess AUTOTUNE takes 3.9209 seconds +AUTOTUNE bmm(16x1x96, 16x96x314) + triton_bmm_29816 0.0076 ms 100.0% + triton_bmm_29814 0.0078 ms 97.1% + triton_bmm_29815 0.0078 ms 96.7% + triton_bmm_29820 0.0078 ms 96.7% + triton_bmm_29817 0.0081 ms 93.7% + triton_bmm_29818 0.0081 ms 93.7% + triton_bmm_29813 0.0084 ms 89.8% + triton_bmm_29812 0.0086 ms 88.4% + triton_bmm_29819 0.0095 ms 79.9% + triton_bmm_29823 0.0096 ms 79.0% +SingleProcess AUTOTUNE takes 3.7519 seconds +AUTOTUNE bmm(16x1x314, 16x314x96) + triton_bmm_29841 0.0086 ms 100.0% + triton_bmm_29842 0.0086 ms 100.0% + triton_bmm_29844 0.0088 ms 97.1% + triton_bmm_29840 0.0091 ms 94.4% + triton_bmm_29839 0.0096 ms 89.6% + bmm 0.0101 ms 85.1% + triton_bmm_29845 0.0101 ms 85.1% + triton_bmm_29837 0.0103 ms 83.0% + triton_bmm_29838 0.0106 ms 81.0% + triton_bmm_29836 0.0133 ms 64.3% +SingleProcess AUTOTUNE takes 4.0166 seconds +AUTOTUNE bmm(16x1x96, 16x96x315) + triton_bmm_29910 0.0078 ms 100.0% + triton_bmm_29911 0.0078 ms 99.6% + triton_bmm_29916 0.0078 ms 99.6% + triton_bmm_29912 0.0081 ms 96.8% + triton_bmm_29914 0.0081 ms 96.4% + triton_bmm_29908 0.0086 ms 91.0% + triton_bmm_29909 0.0086 ms 91.0% + triton_bmm_29913 0.0086 ms 90.7% + triton_bmm_29917 0.0093 ms 83.8% + triton_bmm_29915 0.0096 ms 81.3% +SingleProcess AUTOTUNE takes 4.0843 seconds +AUTOTUNE bmm(16x1x315, 16x315x96) + triton_bmm_29941 0.0109 ms 100.0% + triton_bmm_29940 0.0119 ms 91.6% + bmm 0.0124 ms 88.0% + triton_bmm_29937 0.0136 ms 80.0% + triton_bmm_29938 0.0138 ms 78.7% + triton_bmm_29936 0.0143 ms 75.9% + triton_bmm_29934 0.0145 ms 75.2% + triton_bmm_29933 0.0146 ms 74.4% + triton_bmm_29935 0.0156 ms 69.7% + triton_bmm_29932 0.0179 ms 60.9% +SingleProcess AUTOTUNE takes 3.9705 seconds +AUTOTUNE bmm(16x1x96, 16x96x316) + triton_bmm_30008 0.0076 ms 100.0% + triton_bmm_30010 0.0076 ms 100.0% + triton_bmm_30006 0.0078 ms 97.1% + triton_bmm_30007 0.0084 ms 90.8% + triton_bmm_30012 0.0084 ms 90.8% + triton_bmm_30005 0.0084 ms 90.5% + triton_bmm_30009 0.0086 ms 88.1% + triton_bmm_30004 0.0091 ms 83.5% + triton_bmm_30014 0.0093 ms 81.2% + triton_bmm_30011 0.0094 ms 80.6% +SingleProcess AUTOTUNE takes 4.4021 seconds +AUTOTUNE bmm(16x1x316, 16x316x96) + triton_bmm_30033 0.0091 ms 100.0% + triton_bmm_30034 0.0091 ms 100.0% + triton_bmm_30036 0.0094 ms 96.9% + bmm 0.0095 ms 95.9% + triton_bmm_30031 0.0096 ms 94.7% + triton_bmm_30032 0.0096 ms 94.7% + triton_bmm_30037 0.0098 ms 92.5% + triton_bmm_30030 0.0101 ms 90.2% + triton_bmm_30029 0.0109 ms 83.5% + triton_bmm_30028 0.0138 ms 65.9% +SingleProcess AUTOTUNE takes 3.9966 seconds +AUTOTUNE bmm(16x1x96, 16x96x317) + triton_bmm_30106 0.0076 ms 100.0% + triton_bmm_30102 0.0078 ms 97.1% + triton_bmm_30108 0.0078 ms 96.7% + triton_bmm_30103 0.0079 ms 96.3% + triton_bmm_30101 0.0081 ms 94.0% + triton_bmm_30104 0.0081 ms 94.0% + triton_bmm_30100 0.0086 ms 88.4% + triton_bmm_30105 0.0086 ms 88.1% + triton_bmm_30111 0.0093 ms 81.4% + triton_bmm_30109 0.0093 ms 81.2% +SingleProcess AUTOTUNE takes 3.8003 seconds +AUTOTUNE bmm(16x1x317, 16x317x96) + triton_bmm_30133 0.0109 ms 100.0% + triton_bmm_30132 0.0114 ms 96.1% + bmm 0.0121 ms 90.2% + triton_bmm_30129 0.0136 ms 80.2% + triton_bmm_30130 0.0138 ms 78.9% + triton_bmm_30126 0.0141 ms 77.5% + triton_bmm_30128 0.0144 ms 75.9% + triton_bmm_30125 0.0152 ms 71.9% + triton_bmm_30127 0.0153 ms 71.2% + triton_bmm_30124 0.0174 ms 62.8% +SingleProcess AUTOTUNE takes 4.3027 seconds +AUTOTUNE bmm(16x1x96, 16x96x318) + triton_bmm_30200 0.0076 ms 100.0% + triton_bmm_30202 0.0076 ms 100.0% + triton_bmm_30204 0.0078 ms 96.7% + triton_bmm_30197 0.0080 ms 95.2% + triton_bmm_30201 0.0081 ms 94.0% + triton_bmm_30198 0.0084 ms 90.8% + triton_bmm_30199 0.0084 ms 90.8% + triton_bmm_30203 0.0088 ms 85.9% + triton_bmm_30196 0.0090 ms 83.9% + triton_bmm_30207 0.0091 ms 83.5% +SingleProcess AUTOTUNE takes 3.9112 seconds +AUTOTUNE bmm(16x1x318, 16x318x96) + triton_bmm_30225 0.0091 ms 100.0% + triton_bmm_30226 0.0091 ms 99.6% + triton_bmm_30228 0.0093 ms 97.3% + triton_bmm_30229 0.0096 ms 95.0% + triton_bmm_30224 0.0096 ms 94.4% + triton_bmm_30222 0.0101 ms 90.2% + triton_bmm_30223 0.0101 ms 89.9% + bmm 0.0103 ms 88.2% + triton_bmm_30221 0.0108 ms 83.8% + triton_bmm_30220 0.0136 ms 67.0% +SingleProcess AUTOTUNE takes 3.9085 seconds +AUTOTUNE bmm(16x1x96, 16x96x319) + triton_bmm_30296 0.0076 ms 100.0% + triton_bmm_30295 0.0078 ms 96.7% + triton_bmm_30300 0.0078 ms 96.7% + triton_bmm_30293 0.0081 ms 94.0% + triton_bmm_30297 0.0081 ms 93.7% + triton_bmm_30298 0.0081 ms 93.7% + triton_bmm_30294 0.0084 ms 90.8% + triton_bmm_30299 0.0091 ms 83.7% + triton_bmm_30292 0.0091 ms 83.5% + triton_bmm_30301 0.0093 ms 81.2% +SingleProcess AUTOTUNE takes 3.7732 seconds +AUTOTUNE bmm(16x1x319, 16x319x96) + triton_bmm_30325 0.0114 ms 100.0% + triton_bmm_30324 0.0119 ms 96.0% + bmm 0.0126 ms 90.1% + triton_bmm_30320 0.0131 ms 86.8% + triton_bmm_30321 0.0131 ms 86.8% + triton_bmm_30318 0.0133 ms 85.4% + triton_bmm_30322 0.0134 ms 85.1% + triton_bmm_30319 0.0143 ms 79.5% + triton_bmm_30317 0.0154 ms 74.2% + triton_bmm_30316 0.0186 ms 61.2% +SingleProcess AUTOTUNE takes 3.7524 seconds +AUTOTUNE bmm(16x1x96, 16x96x320) + triton_bmm_30392 0.0076 ms 100.0% + triton_bmm_30390 0.0078 ms 97.1% + triton_bmm_30391 0.0078 ms 96.7% + triton_bmm_30396 0.0078 ms 96.7% + triton_bmm_30394 0.0081 ms 93.7% + triton_bmm_30389 0.0084 ms 90.8% + triton_bmm_30393 0.0086 ms 88.1% + triton_bmm_30397 0.0088 ms 85.9% + triton_bmm_30388 0.0091 ms 83.5% + triton_bmm_30395 0.0095 ms 79.8% +SingleProcess AUTOTUNE takes 3.8822 seconds +AUTOTUNE bmm(16x1x320, 16x320x96) + triton_bmm_30418 0.0086 ms 100.0% + triton_bmm_30420 0.0088 ms 97.1% + triton_bmm_30416 0.0091 ms 94.4% + triton_bmm_30417 0.0091 ms 94.4% + bmm 0.0098 ms 87.3% + triton_bmm_30421 0.0098 ms 87.3% + triton_bmm_30415 0.0101 ms 85.1% + triton_bmm_30413 0.0103 ms 83.0% + triton_bmm_30414 0.0106 ms 81.0% + triton_bmm_30412 0.0139 ms 61.9% +SingleProcess AUTOTUNE takes 3.7279 seconds +AUTOTUNE bmm(16x1x96, 16x96x321) + triton_bmm_30490 0.0076 ms 100.0% + triton_bmm_30487 0.0078 ms 96.7% + triton_bmm_30485 0.0081 ms 94.0% + triton_bmm_30488 0.0081 ms 93.3% + triton_bmm_30486 0.0084 ms 90.8% + triton_bmm_30484 0.0086 ms 88.4% + triton_bmm_30492 0.0086 ms 88.4% + triton_bmm_30489 0.0086 ms 87.8% + triton_bmm_30491 0.0091 ms 83.7% + triton_bmm_30495 0.0093 ms 81.4% +SingleProcess AUTOTUNE takes 3.7568 seconds +AUTOTUNE bmm(16x1x321, 16x321x96) + bmm 0.0098 ms 100.0% + triton_bmm_30517 0.0106 ms 92.7% + triton_bmm_30516 0.0118 ms 83.0% + triton_bmm_30514 0.0141 ms 69.8% + triton_bmm_30512 0.0141 ms 69.5% + triton_bmm_30513 0.0144 ms 68.3% + triton_bmm_30510 0.0150 ms 65.4% + triton_bmm_30509 0.0151 ms 65.0% + triton_bmm_30511 0.0161 ms 60.9% + triton_bmm_30508 0.0179 ms 54.9% +SingleProcess AUTOTUNE takes 3.7277 seconds +AUTOTUNE bmm(16x1x96, 16x96x322) + triton_bmm_30583 0.0078 ms 100.0% + triton_bmm_30586 0.0080 ms 97.8% + triton_bmm_30581 0.0080 ms 97.4% + triton_bmm_30584 0.0081 ms 96.8% + triton_bmm_30585 0.0081 ms 96.5% + triton_bmm_30582 0.0084 ms 93.9% + triton_bmm_30588 0.0084 ms 93.9% + triton_bmm_30580 0.0091 ms 86.3% + triton_bmm_30587 0.0095 ms 82.5% + triton_bmm_30591 0.0096 ms 81.7% +SingleProcess AUTOTUNE takes 4.1063 seconds +AUTOTUNE bmm(16x1x322, 16x322x96) + triton_bmm_30608 0.0093 ms 100.0% + triton_bmm_30609 0.0093 ms 99.8% + triton_bmm_30610 0.0093 ms 99.7% + triton_bmm_30613 0.0096 ms 97.3% + triton_bmm_30612 0.0096 ms 96.7% + bmm 0.0102 ms 91.2% + triton_bmm_30606 0.0103 ms 90.1% + triton_bmm_30607 0.0104 ms 89.8% + triton_bmm_30605 0.0108 ms 86.1% + triton_bmm_30604 0.0146 ms 64.0% +SingleProcess AUTOTUNE takes 3.8298 seconds +AUTOTUNE bmm(16x1x96, 16x96x323) + triton_bmm_30678 0.0078 ms 100.0% + triton_bmm_30684 0.0078 ms 99.6% + triton_bmm_30680 0.0081 ms 96.1% + triton_bmm_30682 0.0081 ms 96.1% + triton_bmm_30679 0.0084 ms 93.1% + triton_bmm_30677 0.0086 ms 91.0% + triton_bmm_30681 0.0086 ms 90.4% + triton_bmm_30676 0.0091 ms 85.9% + triton_bmm_30683 0.0096 ms 81.3% + triton_bmm_30687 0.0098 ms 79.5% +SingleProcess AUTOTUNE takes 4.6069 seconds +AUTOTUNE bmm(16x1x323, 16x323x96) + bmm 0.0101 ms 100.0% + triton_bmm_30709 0.0111 ms 90.8% + triton_bmm_30708 0.0123 ms 81.7% + triton_bmm_30705 0.0141 ms 71.6% + triton_bmm_30704 0.0142 ms 70.8% + triton_bmm_30702 0.0146 ms 69.1% + triton_bmm_30706 0.0146 ms 69.0% + triton_bmm_30701 0.0156 ms 64.5% + triton_bmm_30703 0.0156 ms 64.5% + triton_bmm_30700 0.0186 ms 54.3% +SingleProcess AUTOTUNE takes 3.8868 seconds +AUTOTUNE bmm(16x1x96, 16x96x324) + triton_bmm_30774 0.0078 ms 100.0% + triton_bmm_30775 0.0078 ms 100.0% + triton_bmm_30773 0.0078 ms 99.6% + triton_bmm_30776 0.0081 ms 96.4% + triton_bmm_30777 0.0081 ms 96.4% + triton_bmm_30778 0.0081 ms 96.4% + triton_bmm_30780 0.0084 ms 93.5% + triton_bmm_30772 0.0086 ms 91.0% + triton_bmm_30779 0.0088 ms 88.4% + triton_bmm_30781 0.0091 ms 85.9% +SingleProcess AUTOTUNE takes 4.0342 seconds +AUTOTUNE bmm(16x1x324, 16x324x96) + bmm 0.0091 ms 100.0% + triton_bmm_30801 0.0091 ms 100.0% + triton_bmm_30805 0.0093 ms 97.9% + triton_bmm_30802 0.0093 ms 97.6% + triton_bmm_30804 0.0096 ms 95.0% + triton_bmm_30799 0.0098 ms 92.8% + triton_bmm_30800 0.0098 ms 92.8% + triton_bmm_30797 0.0108 ms 84.3% + triton_bmm_30798 0.0108 ms 84.1% + triton_bmm_30796 0.0139 ms 65.8% +SingleProcess AUTOTUNE takes 3.8465 seconds +AUTOTUNE bmm(16x1x96, 16x96x325) + triton_bmm_30872 0.0076 ms 100.0% + triton_bmm_30874 0.0076 ms 100.0% + triton_bmm_30871 0.0078 ms 96.7% + triton_bmm_30876 0.0078 ms 96.7% + triton_bmm_30869 0.0081 ms 94.0% + triton_bmm_30870 0.0084 ms 90.8% + triton_bmm_30868 0.0086 ms 88.4% + triton_bmm_30873 0.0086 ms 87.8% + triton_bmm_30879 0.0093 ms 81.4% + triton_bmm_30875 0.0095 ms 79.5% +SingleProcess AUTOTUNE takes 3.8961 seconds +AUTOTUNE bmm(16x1x325, 16x325x96) + bmm 0.0101 ms 100.0% + triton_bmm_30901 0.0111 ms 91.1% + triton_bmm_30900 0.0124 ms 81.9% + triton_bmm_30897 0.0141 ms 71.7% + triton_bmm_30894 0.0146 ms 69.3% + triton_bmm_30898 0.0146 ms 69.1% + triton_bmm_30896 0.0147 ms 69.0% + triton_bmm_30893 0.0151 ms 66.9% + triton_bmm_30895 0.0156 ms 64.9% + triton_bmm_30892 0.0181 ms 55.9% +SingleProcess AUTOTUNE takes 3.8263 seconds +AUTOTUNE bmm(16x1x96, 16x96x326) + triton_bmm_30968 0.0076 ms 100.0% + triton_bmm_30970 0.0076 ms 100.0% + triton_bmm_30967 0.0078 ms 96.7% + triton_bmm_30966 0.0084 ms 90.8% + triton_bmm_30972 0.0084 ms 90.5% + triton_bmm_30965 0.0086 ms 88.4% + triton_bmm_30969 0.0086 ms 88.1% + triton_bmm_30971 0.0088 ms 85.9% + triton_bmm_30964 0.0090 ms 84.6% + triton_bmm_30975 0.0096 ms 78.7% +SingleProcess AUTOTUNE takes 3.8482 seconds +AUTOTUNE bmm(16x1x326, 16x326x96) + triton_bmm_30993 0.0086 ms 100.0% + triton_bmm_30992 0.0093 ms 92.1% + triton_bmm_30994 0.0093 ms 91.8% + triton_bmm_30997 0.0096 ms 89.6% + triton_bmm_30996 0.0096 ms 89.3% + bmm 0.0101 ms 85.1% + triton_bmm_30991 0.0103 ms 83.0% + triton_bmm_30990 0.0104 ms 82.3% + triton_bmm_30989 0.0113 ms 75.6% + triton_bmm_30988 0.0146 ms 58.8% +SingleProcess AUTOTUNE takes 3.9953 seconds +AUTOTUNE bmm(16x1x96, 16x96x327) + triton_bmm_31068 0.0078 ms 100.0% + triton_bmm_31063 0.0079 ms 99.4% + triton_bmm_31065 0.0081 ms 96.8% + triton_bmm_31064 0.0081 ms 96.5% + triton_bmm_31066 0.0082 ms 96.1% + triton_bmm_31062 0.0084 ms 93.9% + triton_bmm_31060 0.0086 ms 91.4% + triton_bmm_31061 0.0086 ms 91.1% + triton_bmm_31071 0.0093 ms 84.2% + triton_bmm_31067 0.0096 ms 81.8% +SingleProcess AUTOTUNE takes 3.9231 seconds +AUTOTUNE bmm(16x1x327, 16x327x96) + bmm 0.0106 ms 100.0% + triton_bmm_31093 0.0111 ms 95.4% + triton_bmm_31092 0.0124 ms 85.6% + triton_bmm_31089 0.0141 ms 75.1% + triton_bmm_31090 0.0146 ms 72.6% + triton_bmm_31088 0.0148 ms 71.3% + triton_bmm_31085 0.0151 ms 70.1% + triton_bmm_31086 0.0151 ms 70.0% + triton_bmm_31087 0.0161 ms 65.7% + triton_bmm_31084 0.0181 ms 58.6% +SingleProcess AUTOTUNE takes 3.7911 seconds +AUTOTUNE bmm(16x1x96, 16x96x328) + triton_bmm_31157 0.0078 ms 100.0% + triton_bmm_31159 0.0078 ms 100.0% + triton_bmm_31164 0.0078 ms 100.0% + triton_bmm_31161 0.0081 ms 97.2% + triton_bmm_31160 0.0081 ms 96.5% + triton_bmm_31162 0.0081 ms 96.5% + triton_bmm_31158 0.0084 ms 93.9% + triton_bmm_31163 0.0088 ms 88.8% + triton_bmm_31165 0.0088 ms 88.8% + triton_bmm_31156 0.0091 ms 86.3% +SingleProcess AUTOTUNE takes 3.7037 seconds +AUTOTUNE bmm(16x1x328, 16x328x96) + triton_bmm_31186 0.0092 ms 100.0% + triton_bmm_31188 0.0092 ms 100.0% + triton_bmm_31185 0.0094 ms 98.0% + triton_bmm_31183 0.0098 ms 93.5% + triton_bmm_31184 0.0098 ms 93.3% + bmm 0.0099 ms 93.2% + triton_bmm_31189 0.0101 ms 90.8% + triton_bmm_31182 0.0103 ms 88.9% + triton_bmm_31181 0.0112 ms 82.2% + triton_bmm_31180 0.0143 ms 64.1% +SingleProcess AUTOTUNE takes 3.8078 seconds +AUTOTUNE bmm(16x1x96, 16x96x329) + triton_bmm_31258 0.0076 ms 100.0% + triton_bmm_31254 0.0078 ms 97.1% + triton_bmm_31260 0.0078 ms 96.7% + triton_bmm_31255 0.0079 ms 95.6% + triton_bmm_31253 0.0081 ms 94.0% + triton_bmm_31257 0.0081 ms 93.7% + triton_bmm_31256 0.0081 ms 93.3% + triton_bmm_31252 0.0090 ms 84.0% + triton_bmm_31259 0.0091 ms 83.7% + triton_bmm_31261 0.0093 ms 81.2% +SingleProcess AUTOTUNE takes 4.3200 seconds +AUTOTUNE bmm(16x1x329, 16x329x96) + triton_bmm_31285 0.0106 ms 100.0% + triton_bmm_31284 0.0125 ms 84.4% + bmm 0.0137 ms 77.4% + triton_bmm_31281 0.0141 ms 75.1% + triton_bmm_31282 0.0141 ms 75.1% + triton_bmm_31280 0.0148 ms 71.3% + triton_bmm_31278 0.0151 ms 70.0% + triton_bmm_31279 0.0156 ms 67.8% + triton_bmm_31277 0.0156 ms 67.7% + triton_bmm_31276 0.0184 ms 57.5% +SingleProcess AUTOTUNE takes 3.8810 seconds +AUTOTUNE bmm(16x1x96, 16x96x330) + triton_bmm_31352 0.0076 ms 100.0% + triton_bmm_31350 0.0078 ms 97.1% + triton_bmm_31351 0.0078 ms 96.7% + triton_bmm_31356 0.0078 ms 96.7% + triton_bmm_31349 0.0081 ms 94.0% + triton_bmm_31353 0.0081 ms 93.7% + triton_bmm_31354 0.0081 ms 93.3% + triton_bmm_31348 0.0086 ms 88.4% + triton_bmm_31355 0.0096 ms 79.0% + triton_bmm_31359 0.0096 ms 78.7% +SingleProcess AUTOTUNE takes 4.0102 seconds +AUTOTUNE bmm(16x1x330, 16x330x96) + triton_bmm_31380 0.0091 ms 100.0% + triton_bmm_31377 0.0092 ms 99.0% + triton_bmm_31376 0.0093 ms 97.6% + triton_bmm_31378 0.0094 ms 96.9% + triton_bmm_31375 0.0098 ms 92.5% + triton_bmm_31381 0.0101 ms 89.9% + triton_bmm_31374 0.0111 ms 81.8% + bmm 0.0111 ms 81.6% + triton_bmm_31373 0.0113 ms 80.2% + triton_bmm_31372 0.0141 ms 64.5% +SingleProcess AUTOTUNE takes 4.6442 seconds +AUTOTUNE bmm(16x1x96, 16x96x331) + triton_bmm_31448 0.0076 ms 100.0% + triton_bmm_31450 0.0076 ms 100.0% + triton_bmm_31447 0.0080 ms 94.8% + triton_bmm_31449 0.0081 ms 93.7% + triton_bmm_31446 0.0084 ms 90.8% + triton_bmm_31452 0.0084 ms 90.5% + triton_bmm_31445 0.0086 ms 88.1% + triton_bmm_31444 0.0091 ms 83.2% + triton_bmm_31455 0.0093 ms 81.4% + triton_bmm_31453 0.0093 ms 81.2% +SingleProcess AUTOTUNE takes 3.8030 seconds +AUTOTUNE bmm(16x1x331, 16x331x96) + bmm 0.0106 ms 100.0% + triton_bmm_31477 0.0111 ms 95.7% + triton_bmm_31476 0.0126 ms 84.0% + triton_bmm_31473 0.0141 ms 75.1% + triton_bmm_31474 0.0144 ms 73.7% + triton_bmm_31470 0.0146 ms 72.4% + triton_bmm_31472 0.0149 ms 71.0% + triton_bmm_31469 0.0156 ms 67.7% + triton_bmm_31471 0.0161 ms 65.7% + triton_bmm_31468 0.0189 ms 56.0% +SingleProcess AUTOTUNE takes 3.8276 seconds +AUTOTUNE bmm(16x1x96, 16x96x332) + triton_bmm_31542 0.0078 ms 100.0% + triton_bmm_31548 0.0078 ms 100.0% + triton_bmm_31544 0.0081 ms 96.5% + triton_bmm_31546 0.0081 ms 96.5% + triton_bmm_31543 0.0084 ms 93.5% + triton_bmm_31541 0.0086 ms 91.4% + triton_bmm_31545 0.0086 ms 90.7% + triton_bmm_31547 0.0088 ms 88.8% + triton_bmm_31540 0.0091 ms 86.3% + triton_bmm_31549 0.0091 ms 86.3% +SingleProcess AUTOTUNE takes 4.1499 seconds +AUTOTUNE bmm(16x1x332, 16x332x96) + triton_bmm_31572 0.0091 ms 100.0% + triton_bmm_31569 0.0091 ms 99.6% + triton_bmm_31570 0.0093 ms 97.3% + bmm 0.0096 ms 94.7% + triton_bmm_31568 0.0098 ms 93.0% + triton_bmm_31573 0.0098 ms 92.4% + triton_bmm_31567 0.0103 ms 87.9% + triton_bmm_31566 0.0104 ms 87.7% + triton_bmm_31565 0.0108 ms 83.8% + triton_bmm_31564 0.0144 ms 63.3% +SingleProcess AUTOTUNE takes 3.9563 seconds +AUTOTUNE bmm(16x1x96, 16x96x333) + triton_bmm_31638 0.0078 ms 100.0% + triton_bmm_31639 0.0081 ms 97.2% + triton_bmm_31642 0.0082 ms 96.1% + triton_bmm_31640 0.0082 ms 95.7% + triton_bmm_31644 0.0085 ms 92.6% + triton_bmm_31636 0.0086 ms 91.4% + triton_bmm_31637 0.0086 ms 91.1% + triton_bmm_31641 0.0086 ms 90.7% + triton_bmm_31647 0.0093 ms 84.2% + triton_bmm_31645 0.0093 ms 83.9% +SingleProcess AUTOTUNE takes 4.2933 seconds +AUTOTUNE bmm(16x1x333, 16x333x96) + bmm 0.0108 ms 100.0% + triton_bmm_31669 0.0111 ms 97.4% + triton_bmm_31668 0.0124 ms 87.0% + triton_bmm_31664 0.0144 ms 75.3% + triton_bmm_31665 0.0146 ms 74.0% + triton_bmm_31666 0.0147 ms 73.5% + triton_bmm_31662 0.0151 ms 71.5% + triton_bmm_31661 0.0153 ms 70.6% + triton_bmm_31663 0.0164 ms 66.0% + triton_bmm_31660 0.0183 ms 59.1% +SingleProcess AUTOTUNE takes 3.7655 seconds +AUTOTUNE bmm(16x1x96, 16x96x334) + triton_bmm_31736 0.0076 ms 100.0% + triton_bmm_31735 0.0078 ms 97.1% + triton_bmm_31740 0.0078 ms 97.1% + triton_bmm_31733 0.0081 ms 94.4% + triton_bmm_31737 0.0081 ms 94.1% + triton_bmm_31738 0.0082 ms 93.0% + triton_bmm_31734 0.0084 ms 91.2% + triton_bmm_31739 0.0088 ms 86.2% + triton_bmm_31732 0.0091 ms 83.8% + triton_bmm_31743 0.0091 ms 83.5% +SingleProcess AUTOTUNE takes 3.8656 seconds +AUTOTUNE bmm(16x1x334, 16x334x96) + triton_bmm_31761 0.0086 ms 100.0% + triton_bmm_31762 0.0088 ms 97.1% + triton_bmm_31760 0.0093 ms 92.1% + triton_bmm_31764 0.0096 ms 89.3% + triton_bmm_31765 0.0101 ms 84.8% + triton_bmm_31759 0.0104 ms 82.7% + bmm 0.0104 ms 82.5% + triton_bmm_31758 0.0110 ms 77.9% + triton_bmm_31757 0.0114 ms 75.5% + triton_bmm_31756 0.0141 ms 60.8% +SingleProcess AUTOTUNE takes 4.0573 seconds +AUTOTUNE bmm(16x1x96, 16x96x335) + triton_bmm_31834 0.0076 ms 100.0% + triton_bmm_31830 0.0078 ms 97.1% + triton_bmm_31836 0.0079 ms 96.7% + triton_bmm_31832 0.0082 ms 93.0% + triton_bmm_31828 0.0086 ms 88.8% + triton_bmm_31831 0.0086 ms 88.8% + triton_bmm_31829 0.0086 ms 88.5% + triton_bmm_31833 0.0086 ms 88.5% + triton_bmm_31835 0.0091 ms 83.8% + triton_bmm_31839 0.0093 ms 81.8% +SingleProcess AUTOTUNE takes 3.7250 seconds +AUTOTUNE bmm(16x1x335, 16x335x96) + bmm 0.0111 ms 100.0% + triton_bmm_31861 0.0111 ms 99.7% + triton_bmm_31860 0.0121 ms 91.5% + triton_bmm_31857 0.0147 ms 75.4% + triton_bmm_31854 0.0147 ms 75.2% + triton_bmm_31858 0.0148 ms 74.6% + triton_bmm_31856 0.0149 ms 74.4% + triton_bmm_31853 0.0157 ms 70.6% + triton_bmm_31855 0.0163 ms 68.1% + triton_bmm_31852 0.0184 ms 60.3% +SingleProcess AUTOTUNE takes 3.7257 seconds +AUTOTUNE bmm(16x1x96, 16x96x336) + triton_bmm_31928 0.0076 ms 100.0% + triton_bmm_31930 0.0076 ms 100.0% + triton_bmm_31932 0.0078 ms 97.1% + triton_bmm_31926 0.0084 ms 90.8% + triton_bmm_31927 0.0084 ms 90.8% + triton_bmm_31924 0.0086 ms 88.8% + triton_bmm_31925 0.0086 ms 88.8% + triton_bmm_31929 0.0086 ms 88.5% + triton_bmm_31931 0.0088 ms 86.2% + triton_bmm_31933 0.0088 ms 86.2% +SingleProcess AUTOTUNE takes 4.1144 seconds +AUTOTUNE bmm(16x1x336, 16x336x96) + triton_bmm_31953 0.0088 ms 100.0% + triton_bmm_31954 0.0088 ms 99.3% + triton_bmm_31956 0.0090 ms 97.2% + triton_bmm_31952 0.0093 ms 94.2% + triton_bmm_31957 0.0096 ms 91.6% + bmm 0.0099 ms 88.7% + triton_bmm_31951 0.0103 ms 85.0% + triton_bmm_31950 0.0103 ms 84.8% + triton_bmm_31949 0.0114 ms 77.1% + triton_bmm_31948 0.0144 ms 61.0% +SingleProcess AUTOTUNE takes 3.8276 seconds +AUTOTUNE bmm(16x1x96, 16x96x337) + triton_bmm_32024 0.0076 ms 100.0% + triton_bmm_32022 0.0078 ms 97.1% + triton_bmm_32028 0.0078 ms 97.1% + triton_bmm_32021 0.0081 ms 94.4% + triton_bmm_32023 0.0081 ms 94.4% + triton_bmm_32026 0.0084 ms 90.8% + triton_bmm_32025 0.0086 ms 88.5% + triton_bmm_32020 0.0091 ms 83.8% + triton_bmm_32029 0.0093 ms 81.5% + triton_bmm_32030 0.0093 ms 81.5% +SingleProcess AUTOTUNE takes 4.3731 seconds +AUTOTUNE bmm(16x1x337, 16x337x96) + bmm 0.0099 ms 100.0% + triton_bmm_32053 0.0106 ms 93.1% + triton_bmm_32052 0.0121 ms 81.5% + triton_bmm_32050 0.0144 ms 68.6% + triton_bmm_32049 0.0147 ms 67.2% + triton_bmm_32048 0.0150 ms 65.8% + triton_bmm_32046 0.0151 ms 65.3% + triton_bmm_32045 0.0156 ms 63.2% + triton_bmm_32047 0.0160 ms 61.6% + triton_bmm_32044 0.0189 ms 52.2% +SingleProcess AUTOTUNE takes 4.1069 seconds +AUTOTUNE bmm(16x1x96, 16x96x338) + triton_bmm_32120 0.0076 ms 100.0% + triton_bmm_32118 0.0078 ms 97.1% + triton_bmm_32124 0.0078 ms 97.1% + triton_bmm_32119 0.0080 ms 94.8% + triton_bmm_32117 0.0081 ms 94.4% + triton_bmm_32122 0.0083 ms 91.4% + triton_bmm_32121 0.0086 ms 88.5% + triton_bmm_32116 0.0091 ms 83.8% + triton_bmm_32127 0.0092 ms 83.2% + triton_bmm_32126 0.0093 ms 81.5% +SingleProcess AUTOTUNE takes 3.6865 seconds +AUTOTUNE bmm(16x1x338, 16x338x96) + triton_bmm_32145 0.0086 ms 100.0% + triton_bmm_32146 0.0088 ms 97.1% + triton_bmm_32149 0.0096 ms 89.6% + bmm 0.0096 ms 89.3% + triton_bmm_32148 0.0096 ms 89.3% + triton_bmm_32144 0.0099 ms 87.0% + triton_bmm_32143 0.0104 ms 82.7% + triton_bmm_32142 0.0106 ms 81.2% + triton_bmm_32141 0.0114 ms 75.5% + triton_bmm_32140 0.0146 ms 58.6% +SingleProcess AUTOTUNE takes 3.9965 seconds +AUTOTUNE bmm(16x1x96, 16x96x339) + triton_bmm_32218 0.0076 ms 100.0% + triton_bmm_32216 0.0076 ms 99.6% + triton_bmm_32220 0.0079 ms 96.7% + triton_bmm_32213 0.0081 ms 94.4% + triton_bmm_32212 0.0086 ms 88.8% + triton_bmm_32214 0.0086 ms 88.8% + triton_bmm_32215 0.0086 ms 88.5% + triton_bmm_32217 0.0086 ms 88.5% + triton_bmm_32223 0.0093 ms 81.8% + triton_bmm_32221 0.0093 ms 81.5% +SingleProcess AUTOTUNE takes 4.0297 seconds +AUTOTUNE bmm(16x1x339, 16x339x96) + bmm 0.0104 ms 100.0% + triton_bmm_32245 0.0111 ms 93.4% + triton_bmm_32244 0.0125 ms 83.3% + triton_bmm_32241 0.0143 ms 72.5% + triton_bmm_32242 0.0144 ms 72.4% + triton_bmm_32238 0.0148 ms 70.2% + triton_bmm_32240 0.0151 ms 69.0% + triton_bmm_32237 0.0156 ms 66.7% + triton_bmm_32239 0.0164 ms 63.5% + triton_bmm_32236 0.0188 ms 55.4% +SingleProcess AUTOTUNE takes 4.0687 seconds +AUTOTUNE bmm(16x1x96, 16x96x340) + triton_bmm_32312 0.0076 ms 100.0% + triton_bmm_32310 0.0078 ms 97.1% + triton_bmm_32309 0.0081 ms 94.4% + triton_bmm_32313 0.0081 ms 93.7% + triton_bmm_32314 0.0082 ms 92.6% + triton_bmm_32316 0.0085 ms 89.8% + triton_bmm_32308 0.0086 ms 88.8% + triton_bmm_32311 0.0086 ms 88.8% + triton_bmm_32315 0.0088 ms 86.2% + triton_bmm_32318 0.0093 ms 81.5% +SingleProcess AUTOTUNE takes 4.0666 seconds +AUTOTUNE bmm(16x1x340, 16x340x96) + triton_bmm_32337 0.0086 ms 100.0% + triton_bmm_32338 0.0088 ms 97.1% + triton_bmm_32340 0.0091 ms 94.4% + bmm 0.0094 ms 91.2% + triton_bmm_32335 0.0098 ms 87.3% + triton_bmm_32336 0.0099 ms 87.0% + triton_bmm_32341 0.0099 ms 87.0% + triton_bmm_32333 0.0108 ms 79.1% + triton_bmm_32334 0.0109 ms 78.8% + triton_bmm_32332 0.0144 ms 59.7% +SingleProcess AUTOTUNE takes 3.9864 seconds +AUTOTUNE bmm(16x1x96, 16x96x341) + triton_bmm_32410 0.0076 ms 100.0% + triton_bmm_32407 0.0081 ms 94.4% + triton_bmm_32408 0.0084 ms 91.2% + triton_bmm_32406 0.0086 ms 88.8% + triton_bmm_32412 0.0086 ms 88.5% + triton_bmm_32405 0.0086 ms 88.1% + triton_bmm_32409 0.0087 ms 87.8% + triton_bmm_32404 0.0091 ms 83.8% + triton_bmm_32413 0.0093 ms 81.5% + triton_bmm_32411 0.0096 ms 79.6% +SingleProcess AUTOTUNE takes 3.8534 seconds +AUTOTUNE bmm(16x1x341, 16x341x96) + bmm 0.0102 ms 100.0% + triton_bmm_32437 0.0106 ms 96.4% + triton_bmm_32436 0.0121 ms 84.4% + triton_bmm_32433 0.0143 ms 71.2% + triton_bmm_32434 0.0143 ms 71.2% + triton_bmm_32430 0.0148 ms 68.8% + triton_bmm_32432 0.0149 ms 68.3% + triton_bmm_32429 0.0156 ms 65.5% + triton_bmm_32431 0.0164 ms 62.2% + triton_bmm_32428 0.0187 ms 54.7% +SingleProcess AUTOTUNE takes 3.9197 seconds +AUTOTUNE bmm(16x1x96, 16x96x342) + triton_bmm_32506 0.0076 ms 100.0% + triton_bmm_32502 0.0078 ms 97.1% + triton_bmm_32508 0.0079 ms 96.0% + triton_bmm_32503 0.0080 ms 95.6% + triton_bmm_32501 0.0081 ms 94.4% + triton_bmm_32504 0.0082 ms 93.0% + triton_bmm_32500 0.0086 ms 88.8% + triton_bmm_32505 0.0086 ms 88.1% + triton_bmm_32507 0.0088 ms 86.2% + triton_bmm_32511 0.0091 ms 83.8% +SingleProcess AUTOTUNE takes 3.7706 seconds +AUTOTUNE bmm(16x1x342, 16x342x96) + triton_bmm_32529 0.0086 ms 100.0% + triton_bmm_32528 0.0093 ms 91.8% + triton_bmm_32530 0.0093 ms 91.8% + triton_bmm_32533 0.0096 ms 89.6% + triton_bmm_32532 0.0096 ms 89.3% + triton_bmm_32527 0.0098 ms 87.3% + bmm 0.0103 ms 83.5% + triton_bmm_32526 0.0111 ms 77.2% + triton_bmm_32525 0.0114 ms 75.5% + triton_bmm_32524 0.0141 ms 60.8% +SingleProcess AUTOTUNE takes 4.1403 seconds +AUTOTUNE bmm(16x1x96, 16x96x343) + triton_bmm_32600 0.0076 ms 100.0% + triton_bmm_32598 0.0079 ms 97.2% + triton_bmm_32597 0.0081 ms 94.8% + triton_bmm_32602 0.0082 ms 93.7% + triton_bmm_32599 0.0086 ms 88.8% + triton_bmm_32604 0.0086 ms 88.5% + triton_bmm_32601 0.0087 ms 88.2% + triton_bmm_32603 0.0091 ms 84.5% + triton_bmm_32596 0.0091 ms 83.9% + triton_bmm_32606 0.0093 ms 81.8% +SingleProcess AUTOTUNE takes 5.2090 seconds +AUTOTUNE bmm(16x1x343, 16x343x96) + bmm 0.0109 ms 100.0% + triton_bmm_32629 0.0111 ms 98.1% + triton_bmm_32628 0.0126 ms 86.7% + triton_bmm_32626 0.0145 ms 75.2% + triton_bmm_32622 0.0148 ms 73.6% + triton_bmm_32625 0.0149 ms 73.4% + triton_bmm_32624 0.0151 ms 72.4% + triton_bmm_32621 0.0156 ms 70.3% + triton_bmm_32623 0.0164 ms 66.6% + triton_bmm_32620 0.0186 ms 58.8% +SingleProcess AUTOTUNE takes 3.7819 seconds +AUTOTUNE bmm(16x1x96, 16x96x344) + triton_bmm_32698 0.0076 ms 100.0% + triton_bmm_32694 0.0079 ms 97.2% + triton_bmm_32696 0.0082 ms 93.4% + triton_bmm_32700 0.0084 ms 90.5% + triton_bmm_32693 0.0085 ms 89.8% + triton_bmm_32695 0.0086 ms 88.8% + triton_bmm_32697 0.0087 ms 88.2% + triton_bmm_32701 0.0088 ms 86.6% + triton_bmm_32692 0.0094 ms 81.6% + triton_bmm_32699 0.0096 ms 80.1% +SingleProcess AUTOTUNE takes 3.8499 seconds +AUTOTUNE bmm(16x1x344, 16x344x96) + triton_bmm_32722 0.0093 ms 100.0% + triton_bmm_32724 0.0093 ms 100.0% + triton_bmm_32721 0.0094 ms 99.0% + bmm 0.0099 ms 94.8% + triton_bmm_32720 0.0099 ms 94.8% + triton_bmm_32725 0.0099 ms 94.5% + triton_bmm_32719 0.0104 ms 90.1% + triton_bmm_32717 0.0108 ms 86.1% + triton_bmm_32718 0.0111 ms 84.1% + triton_bmm_32716 0.0144 ms 64.7% +SingleProcess AUTOTUNE takes 3.9452 seconds +AUTOTUNE bmm(16x1x96, 16x96x345) + triton_bmm_32790 0.0078 ms 100.0% + triton_bmm_32791 0.0081 ms 97.2% + triton_bmm_32794 0.0082 ms 95.7% + triton_bmm_32792 0.0082 ms 95.5% + triton_bmm_32796 0.0085 ms 92.1% + triton_bmm_32789 0.0086 ms 91.1% + triton_bmm_32793 0.0087 ms 90.4% + triton_bmm_32788 0.0091 ms 86.3% + triton_bmm_32795 0.0096 ms 81.7% + triton_bmm_32799 0.0098 ms 79.8% +SingleProcess AUTOTUNE takes 3.9605 seconds +AUTOTUNE bmm(16x1x345, 16x345x96) + triton_bmm_32821 0.0107 ms 100.0% + bmm 0.0110 ms 97.8% + triton_bmm_32820 0.0126 ms 84.8% + triton_bmm_32817 0.0144 ms 74.6% + triton_bmm_32818 0.0146 ms 73.5% + triton_bmm_32814 0.0148 ms 72.2% + triton_bmm_32816 0.0151 ms 70.8% + triton_bmm_32813 0.0156 ms 68.8% + triton_bmm_32815 0.0164 ms 65.4% + triton_bmm_32812 0.0186 ms 57.6% +SingleProcess AUTOTUNE takes 4.4060 seconds +AUTOTUNE bmm(16x1x96, 16x96x346) + triton_bmm_32888 0.0076 ms 100.0% + triton_bmm_32892 0.0079 ms 96.8% + triton_bmm_32885 0.0081 ms 94.8% + triton_bmm_32890 0.0082 ms 93.4% + triton_bmm_32886 0.0084 ms 91.6% + triton_bmm_32887 0.0084 ms 90.5% + triton_bmm_32884 0.0086 ms 89.2% + triton_bmm_32889 0.0086 ms 88.5% + triton_bmm_32891 0.0088 ms 86.6% + triton_bmm_32894 0.0093 ms 81.8% +SingleProcess AUTOTUNE takes 4.2686 seconds +AUTOTUNE bmm(16x1x346, 16x346x96) + triton_bmm_32914 0.0088 ms 100.0% + triton_bmm_32913 0.0093 ms 94.8% + triton_bmm_32912 0.0093 ms 94.5% + triton_bmm_32917 0.0096 ms 92.0% + triton_bmm_32916 0.0096 ms 91.7% + triton_bmm_32911 0.0098 ms 89.9% + bmm 0.0102 ms 86.8% + triton_bmm_32909 0.0108 ms 81.4% + triton_bmm_32910 0.0111 ms 79.5% + triton_bmm_32908 0.0143 ms 61.6% +SingleProcess AUTOTUNE takes 4.3520 seconds +AUTOTUNE bmm(16x1x96, 16x96x347) + triton_bmm_32988 0.0079 ms 100.0% + triton_bmm_32983 0.0081 ms 98.0% + triton_bmm_32986 0.0082 ms 96.5% + triton_bmm_32984 0.0083 ms 95.4% + triton_bmm_32982 0.0084 ms 93.9% + triton_bmm_32981 0.0086 ms 91.8% + triton_bmm_32985 0.0087 ms 91.0% + triton_bmm_32980 0.0091 ms 86.8% + triton_bmm_32990 0.0093 ms 84.6% + triton_bmm_32987 0.0096 ms 82.3% +SingleProcess AUTOTUNE takes 3.8819 seconds +AUTOTUNE bmm(16x1x347, 16x347x96) + triton_bmm_33013 0.0113 ms 100.0% + bmm 0.0114 ms 99.2% + triton_bmm_33012 0.0126 ms 89.8% + triton_bmm_33010 0.0146 ms 77.6% + triton_bmm_33006 0.0148 ms 76.3% + triton_bmm_33009 0.0149 ms 76.1% + triton_bmm_33008 0.0151 ms 74.8% + triton_bmm_33005 0.0156 ms 72.5% + triton_bmm_33007 0.0161 ms 70.4% + triton_bmm_33004 0.0191 ms 59.3% +SingleProcess AUTOTUNE takes 4.0045 seconds +AUTOTUNE bmm(16x1x96, 16x96x348) + triton_bmm_33082 0.0076 ms 100.0% + triton_bmm_33078 0.0078 ms 97.6% + triton_bmm_33084 0.0079 ms 96.8% + triton_bmm_33080 0.0082 ms 93.5% + triton_bmm_33076 0.0086 ms 89.2% + triton_bmm_33077 0.0086 ms 89.2% + triton_bmm_33079 0.0086 ms 88.8% + triton_bmm_33081 0.0086 ms 88.5% + triton_bmm_33087 0.0093 ms 82.1% + triton_bmm_33083 0.0096 ms 79.7% +SingleProcess AUTOTUNE takes 3.7362 seconds +AUTOTUNE bmm(16x1x348, 16x348x96) + triton_bmm_33105 0.0086 ms 100.0% + triton_bmm_33106 0.0088 ms 97.5% + bmm 0.0093 ms 92.1% + triton_bmm_33104 0.0093 ms 92.1% + triton_bmm_33108 0.0096 ms 89.4% + triton_bmm_33109 0.0099 ms 87.1% + triton_bmm_33103 0.0104 ms 83.0% + triton_bmm_33101 0.0110 ms 78.2% + triton_bmm_33102 0.0111 ms 77.5% + triton_bmm_33100 0.0146 ms 59.0% +SingleProcess AUTOTUNE takes 4.0488 seconds +AUTOTUNE bmm(16x1x96, 16x96x349) + triton_bmm_33178 0.0077 ms 100.0% + triton_bmm_33174 0.0079 ms 97.6% + triton_bmm_33175 0.0081 ms 95.2% + triton_bmm_33177 0.0081 ms 94.5% + triton_bmm_33176 0.0082 ms 93.7% + triton_bmm_33180 0.0084 ms 90.9% + triton_bmm_33173 0.0086 ms 89.2% + triton_bmm_33179 0.0091 ms 84.8% + triton_bmm_33172 0.0091 ms 84.5% + triton_bmm_33183 0.0093 ms 82.5% +SingleProcess AUTOTUNE takes 3.9157 seconds +AUTOTUNE bmm(16x1x349, 16x349x96) + triton_bmm_33205 0.0113 ms 100.0% + bmm 0.0114 ms 99.7% + triton_bmm_33204 0.0123 ms 91.9% + triton_bmm_33201 0.0144 ms 78.8% + triton_bmm_33202 0.0151 ms 74.8% + triton_bmm_33200 0.0152 ms 74.4% + triton_bmm_33198 0.0154 ms 73.6% + triton_bmm_33199 0.0161 ms 70.4% + triton_bmm_33197 0.0161 ms 70.2% + triton_bmm_33196 0.0188 ms 60.2% +SingleProcess AUTOTUNE takes 4.1535 seconds +AUTOTUNE bmm(16x1x96, 16x96x350) + triton_bmm_33272 0.0076 ms 100.0% + triton_bmm_33274 0.0076 ms 100.0% + triton_bmm_33270 0.0079 ms 97.2% + triton_bmm_33276 0.0086 ms 89.3% + triton_bmm_33268 0.0086 ms 89.2% + triton_bmm_33271 0.0086 ms 89.0% + triton_bmm_33269 0.0086 ms 88.8% + triton_bmm_33273 0.0086 ms 88.5% + triton_bmm_33278 0.0093 ms 81.8% + triton_bmm_33275 0.0096 ms 79.9% +SingleProcess AUTOTUNE takes 3.8544 seconds +AUTOTUNE bmm(16x1x350, 16x350x96) + triton_bmm_33297 0.0087 ms 100.0% + triton_bmm_33298 0.0088 ms 98.9% + triton_bmm_33301 0.0096 ms 91.0% + triton_bmm_33300 0.0097 ms 90.4% + triton_bmm_33295 0.0099 ms 88.6% + triton_bmm_33296 0.0100 ms 87.5% + bmm 0.0103 ms 84.5% + triton_bmm_33294 0.0111 ms 78.4% + triton_bmm_33293 0.0114 ms 76.9% + triton_bmm_33292 0.0147 ms 59.5% +SingleProcess AUTOTUNE takes 3.8383 seconds +AUTOTUNE bmm(16x1x96, 16x96x351) + triton_bmm_33368 0.0077 ms 100.0% + triton_bmm_33370 0.0077 ms 100.0% + triton_bmm_33372 0.0079 ms 96.8% + triton_bmm_33365 0.0081 ms 94.9% + triton_bmm_33366 0.0085 ms 90.1% + triton_bmm_33367 0.0086 ms 89.2% + triton_bmm_33369 0.0087 ms 88.6% + triton_bmm_33364 0.0091 ms 84.2% + triton_bmm_33374 0.0093 ms 82.2% + triton_bmm_33371 0.0096 ms 80.0% +SingleProcess AUTOTUNE takes 3.8424 seconds +AUTOTUNE bmm(16x1x351, 16x351x96) + triton_bmm_33397 0.0108 ms 100.0% + bmm 0.0121 ms 89.4% + triton_bmm_33396 0.0123 ms 87.8% + triton_bmm_33393 0.0139 ms 78.1% + triton_bmm_33394 0.0139 ms 78.1% + triton_bmm_33392 0.0145 ms 74.8% + triton_bmm_33390 0.0147 ms 73.8% + triton_bmm_33391 0.0156 ms 69.3% + triton_bmm_33389 0.0162 ms 66.8% + triton_bmm_33388 0.0196 ms 55.1% +SingleProcess AUTOTUNE takes 3.9395 seconds +AUTOTUNE bmm(16x1x96, 16x96x352) + triton_bmm_33461 0.0078 ms 100.0% + triton_bmm_33462 0.0078 ms 100.0% + triton_bmm_33463 0.0081 ms 97.2% + triton_bmm_33464 0.0082 ms 95.3% + triton_bmm_33466 0.0082 ms 95.3% + triton_bmm_33468 0.0084 ms 93.0% + triton_bmm_33460 0.0086 ms 91.4% + triton_bmm_33465 0.0086 ms 91.1% + triton_bmm_33469 0.0088 ms 88.8% + triton_bmm_33471 0.0091 ms 86.3% +SingleProcess AUTOTUNE takes 3.9342 seconds +AUTOTUNE bmm(16x1x352, 16x352x96) + triton_bmm_33490 0.0088 ms 100.0% + triton_bmm_33492 0.0091 ms 97.2% + triton_bmm_33489 0.0093 ms 94.5% + triton_bmm_33487 0.0098 ms 89.9% + triton_bmm_33488 0.0099 ms 89.6% + triton_bmm_33493 0.0100 ms 88.5% + bmm 0.0105 ms 84.4% + triton_bmm_33486 0.0109 ms 80.7% + triton_bmm_33485 0.0114 ms 77.7% + triton_bmm_33484 0.0146 ms 60.4% +SingleProcess AUTOTUNE takes 3.9848 seconds +AUTOTUNE bmm(16x1x96, 16x96x353) + triton_bmm_33562 0.0076 ms 100.0% + triton_bmm_33558 0.0078 ms 97.6% + triton_bmm_33564 0.0080 ms 96.0% + triton_bmm_33557 0.0081 ms 94.8% + triton_bmm_33560 0.0084 ms 91.6% + triton_bmm_33556 0.0086 ms 89.2% + triton_bmm_33559 0.0086 ms 88.8% + triton_bmm_33561 0.0087 ms 88.2% + triton_bmm_33563 0.0091 ms 84.2% + triton_bmm_33566 0.0093 ms 81.8% +SingleProcess AUTOTUNE takes 4.8047 seconds +AUTOTUNE bmm(16x1x353, 16x353x96) + bmm 0.0104 ms 100.0% + triton_bmm_33589 0.0108 ms 96.2% + triton_bmm_33588 0.0123 ms 84.4% + triton_bmm_33585 0.0148 ms 70.0% + triton_bmm_33586 0.0148 ms 70.0% + triton_bmm_33584 0.0156 ms 66.6% + triton_bmm_33582 0.0159 ms 65.5% + triton_bmm_33581 0.0160 ms 64.8% + triton_bmm_33583 0.0169 ms 61.4% + triton_bmm_33580 0.0196 ms 52.9% +SingleProcess AUTOTUNE takes 4.0796 seconds +AUTOTUNE bmm(16x1x96, 16x96x354) + triton_bmm_33656 0.0076 ms 100.0% + triton_bmm_33658 0.0076 ms 100.0% + triton_bmm_33660 0.0079 ms 97.2% + triton_bmm_33653 0.0081 ms 94.8% + triton_bmm_33655 0.0081 ms 94.8% + triton_bmm_33657 0.0081 ms 94.1% + triton_bmm_33652 0.0086 ms 89.2% + triton_bmm_33654 0.0086 ms 89.2% + triton_bmm_33659 0.0091 ms 84.5% + triton_bmm_33662 0.0093 ms 81.8% +SingleProcess AUTOTUNE takes 4.2596 seconds +AUTOTUNE bmm(16x1x354, 16x354x96) + triton_bmm_33682 0.0091 ms 100.0% + triton_bmm_33684 0.0091 ms 99.6% + triton_bmm_33681 0.0093 ms 96.9% + triton_bmm_33680 0.0101 ms 89.8% + triton_bmm_33685 0.0102 ms 89.0% + triton_bmm_33679 0.0106 ms 85.2% + bmm 0.0107 ms 84.5% + triton_bmm_33678 0.0108 ms 83.7% + triton_bmm_33677 0.0111 ms 81.6% + triton_bmm_33676 0.0148 ms 61.0% +SingleProcess AUTOTUNE takes 4.1009 seconds +AUTOTUNE bmm(16x1x96, 16x96x355) + triton_bmm_33754 0.0076 ms 100.0% + triton_bmm_33751 0.0081 ms 94.8% + triton_bmm_33752 0.0084 ms 91.2% + triton_bmm_33750 0.0085 ms 89.7% + triton_bmm_33756 0.0086 ms 88.8% + triton_bmm_33749 0.0086 ms 88.5% + triton_bmm_33753 0.0087 ms 88.2% + triton_bmm_33748 0.0091 ms 83.9% + triton_bmm_33759 0.0093 ms 82.1% + triton_bmm_33757 0.0096 ms 79.9% +SingleProcess AUTOTUNE takes 3.9443 seconds +AUTOTUNE bmm(16x1x355, 16x355x96) + bmm 0.0106 ms 100.0% + triton_bmm_33781 0.0114 ms 93.2% + triton_bmm_33780 0.0129 ms 82.3% + triton_bmm_33774 0.0154 ms 69.0% + triton_bmm_33777 0.0154 ms 69.0% + triton_bmm_33778 0.0154 ms 68.8% + triton_bmm_33776 0.0156 ms 67.8% + triton_bmm_33773 0.0164 ms 64.6% + triton_bmm_33775 0.0164 ms 64.6% + triton_bmm_33772 0.0193 ms 54.8% +SingleProcess AUTOTUNE takes 4.1173 seconds +AUTOTUNE bmm(16x1x96, 16x96x356) + triton_bmm_33848 0.0076 ms 100.0% + triton_bmm_33850 0.0076 ms 100.0% + triton_bmm_33847 0.0080 ms 96.0% + triton_bmm_33849 0.0081 ms 94.1% + triton_bmm_33846 0.0086 ms 89.2% + triton_bmm_33845 0.0086 ms 88.8% + triton_bmm_33852 0.0086 ms 88.8% + triton_bmm_33844 0.0090 ms 84.8% + triton_bmm_33853 0.0092 ms 83.0% + triton_bmm_33851 0.0096 ms 79.9% +SingleProcess AUTOTUNE takes 3.8354 seconds +AUTOTUNE bmm(16x1x356, 16x356x96) + triton_bmm_33874 0.0088 ms 100.0% + triton_bmm_33876 0.0091 ms 97.2% + bmm 0.0093 ms 94.5% + triton_bmm_33873 0.0093 ms 94.5% + triton_bmm_33877 0.0099 ms 89.6% + triton_bmm_33872 0.0100 ms 88.2% + triton_bmm_33871 0.0101 ms 87.6% + triton_bmm_33870 0.0108 ms 81.7% + triton_bmm_33869 0.0113 ms 78.2% + triton_bmm_33868 0.0146 ms 60.5% +SingleProcess AUTOTUNE takes 3.9691 seconds +AUTOTUNE bmm(16x1x96, 16x96x357) + triton_bmm_33944 0.0076 ms 100.0% + triton_bmm_33943 0.0081 ms 94.8% + triton_bmm_33946 0.0084 ms 91.2% + triton_bmm_33942 0.0086 ms 89.2% + triton_bmm_33948 0.0086 ms 89.2% + triton_bmm_33941 0.0086 ms 88.8% + triton_bmm_33945 0.0089 ms 86.3% + triton_bmm_33940 0.0091 ms 84.2% + triton_bmm_33951 0.0093 ms 81.8% + triton_bmm_33949 0.0094 ms 81.3% +SingleProcess AUTOTUNE takes 3.8829 seconds +AUTOTUNE bmm(16x1x357, 16x357x96) + bmm 0.0106 ms 100.0% + triton_bmm_33973 0.0108 ms 97.9% + triton_bmm_33972 0.0126 ms 84.0% + triton_bmm_33969 0.0148 ms 71.3% + triton_bmm_33970 0.0148 ms 71.3% + triton_bmm_33968 0.0151 ms 70.1% + triton_bmm_33966 0.0154 ms 69.0% + triton_bmm_33965 0.0161 ms 65.8% + triton_bmm_33967 0.0166 ms 63.8% + triton_bmm_33964 0.0191 ms 55.4% +SingleProcess AUTOTUNE takes 3.8057 seconds +AUTOTUNE bmm(16x1x96, 16x96x358) + triton_bmm_34039 0.0081 ms 100.0% + triton_bmm_34041 0.0081 ms 99.2% + triton_bmm_34040 0.0084 ms 96.6% + triton_bmm_34042 0.0084 ms 96.6% + triton_bmm_34044 0.0085 ms 95.1% + triton_bmm_34038 0.0085 ms 94.7% + triton_bmm_34037 0.0086 ms 93.7% + triton_bmm_34036 0.0091 ms 88.4% + triton_bmm_34047 0.0092 ms 87.2% + triton_bmm_34046 0.0093 ms 86.3% +SingleProcess AUTOTUNE takes 4.0200 seconds +AUTOTUNE bmm(16x1x358, 16x358x96) + triton_bmm_34066 0.0091 ms 100.0% + triton_bmm_34065 0.0093 ms 96.9% + triton_bmm_34068 0.0096 ms 94.2% + triton_bmm_34063 0.0101 ms 89.6% + triton_bmm_34064 0.0101 ms 89.6% + triton_bmm_34069 0.0103 ms 88.0% + bmm 0.0106 ms 85.2% + triton_bmm_34062 0.0108 ms 83.7% + triton_bmm_34061 0.0116 ms 77.9% + triton_bmm_34060 0.0148 ms 61.0% +SingleProcess AUTOTUNE takes 3.9678 seconds +AUTOTUNE bmm(16x1x96, 16x96x359) + triton_bmm_34136 0.0076 ms 100.0% + triton_bmm_34134 0.0078 ms 97.1% + triton_bmm_34140 0.0079 ms 96.7% + triton_bmm_34133 0.0081 ms 94.4% + triton_bmm_34137 0.0083 ms 91.5% + triton_bmm_34138 0.0084 ms 91.2% + triton_bmm_34135 0.0086 ms 88.5% + triton_bmm_34132 0.0091 ms 83.8% + triton_bmm_34141 0.0093 ms 81.5% + triton_bmm_34143 0.0093 ms 81.5% +SingleProcess AUTOTUNE takes 3.7591 seconds +AUTOTUNE bmm(16x1x359, 16x359x96) + bmm 0.0111 ms 100.0% + triton_bmm_34165 0.0113 ms 97.7% + triton_bmm_34164 0.0123 ms 89.9% + triton_bmm_34162 0.0149 ms 74.4% + triton_bmm_34158 0.0154 ms 72.1% + triton_bmm_34161 0.0154 ms 71.9% + triton_bmm_34160 0.0156 ms 70.9% + triton_bmm_34157 0.0166 ms 66.6% + triton_bmm_34159 0.0171 ms 64.7% + triton_bmm_34156 0.0192 ms 57.8% +SingleProcess AUTOTUNE takes 3.8555 seconds +AUTOTUNE bmm(16x1x96, 16x96x360) + triton_bmm_34232 0.0076 ms 100.0% + triton_bmm_34234 0.0076 ms 100.0% + triton_bmm_34230 0.0078 ms 97.6% + triton_bmm_34231 0.0081 ms 94.8% + triton_bmm_34233 0.0083 ms 92.6% + triton_bmm_34229 0.0086 ms 89.2% + triton_bmm_34236 0.0086 ms 88.8% + triton_bmm_34228 0.0091 ms 84.2% + triton_bmm_34235 0.0096 ms 79.9% + triton_bmm_34237 0.0096 ms 79.7% +SingleProcess AUTOTUNE takes 4.5444 seconds +AUTOTUNE bmm(16x1x360, 16x360x96) + triton_bmm_34257 0.0090 ms 100.0% + triton_bmm_34258 0.0095 ms 94.6% + triton_bmm_34256 0.0096 ms 94.3% + triton_bmm_34260 0.0099 ms 91.6% + bmm 0.0101 ms 89.5% + triton_bmm_34255 0.0101 ms 89.5% + triton_bmm_34261 0.0102 ms 88.3% + triton_bmm_34254 0.0111 ms 81.3% + triton_bmm_34253 0.0116 ms 77.5% + triton_bmm_34252 0.0151 ms 59.9% +SingleProcess AUTOTUNE takes 4.1167 seconds +AUTOTUNE bmm(16x1x96, 16x96x361) + triton_bmm_34326 0.0078 ms 100.0% + triton_bmm_34327 0.0081 ms 97.2% + triton_bmm_34332 0.0081 ms 97.2% + triton_bmm_34330 0.0084 ms 93.9% + triton_bmm_34328 0.0084 ms 93.5% + triton_bmm_34324 0.0086 ms 91.4% + triton_bmm_34329 0.0088 ms 89.6% + triton_bmm_34325 0.0088 ms 88.8% + triton_bmm_34331 0.0091 ms 86.3% + triton_bmm_34335 0.0093 ms 83.9% +SingleProcess AUTOTUNE takes 4.0999 seconds +AUTOTUNE bmm(16x1x361, 16x361x96) + triton_bmm_34357 0.0113 ms 100.0% + bmm 0.0114 ms 98.9% + triton_bmm_34356 0.0128 ms 87.9% + triton_bmm_34353 0.0154 ms 73.3% + triton_bmm_34350 0.0156 ms 72.5% + triton_bmm_34354 0.0156 ms 72.4% + triton_bmm_34352 0.0156 ms 72.1% + triton_bmm_34349 0.0161 ms 70.2% + triton_bmm_34351 0.0166 ms 68.1% + triton_bmm_34348 0.0194 ms 58.3% +SingleProcess AUTOTUNE takes 3.8660 seconds +AUTOTUNE bmm(16x1x96, 16x96x362) + triton_bmm_34424 0.0076 ms 100.0% + triton_bmm_34422 0.0078 ms 97.1% + triton_bmm_34428 0.0080 ms 94.8% + triton_bmm_34423 0.0081 ms 94.4% + triton_bmm_34425 0.0083 ms 92.2% + triton_bmm_34426 0.0084 ms 91.2% + triton_bmm_34420 0.0086 ms 88.8% + triton_bmm_34421 0.0086 ms 88.5% + triton_bmm_34431 0.0092 ms 82.4% + triton_bmm_34427 0.0096 ms 79.3% +SingleProcess AUTOTUNE takes 3.7084 seconds +AUTOTUNE bmm(16x1x362, 16x362x96) + triton_bmm_34449 0.0088 ms 100.0% + triton_bmm_34450 0.0091 ms 97.2% + triton_bmm_34452 0.0091 ms 97.2% + triton_bmm_34448 0.0096 ms 92.3% + triton_bmm_34453 0.0096 ms 92.0% + bmm 0.0104 ms 84.9% + triton_bmm_34447 0.0106 ms 83.1% + triton_bmm_34446 0.0108 ms 81.7% + triton_bmm_34445 0.0116 ms 75.8% + triton_bmm_34444 0.0154 ms 57.5% +SingleProcess AUTOTUNE takes 4.1773 seconds +AUTOTUNE bmm(16x1x96, 16x96x363) + triton_bmm_34520 0.0077 ms 100.0% + triton_bmm_34519 0.0081 ms 95.2% + triton_bmm_34521 0.0083 ms 92.3% + triton_bmm_34522 0.0084 ms 92.0% + triton_bmm_34518 0.0086 ms 89.6% + triton_bmm_34524 0.0086 ms 89.2% + triton_bmm_34517 0.0088 ms 87.0% + triton_bmm_34516 0.0091 ms 84.5% + triton_bmm_34523 0.0091 ms 84.5% + triton_bmm_34526 0.0093 ms 82.2% +SingleProcess AUTOTUNE takes 3.9439 seconds +AUTOTUNE bmm(16x1x363, 16x363x96) + triton_bmm_34549 0.0113 ms 100.0% + bmm 0.0121 ms 93.5% + triton_bmm_34548 0.0129 ms 87.9% + triton_bmm_34545 0.0149 ms 76.0% + triton_bmm_34544 0.0154 ms 73.6% + triton_bmm_34542 0.0156 ms 72.6% + triton_bmm_34546 0.0157 ms 72.1% + triton_bmm_34543 0.0166 ms 68.1% + triton_bmm_34541 0.0166 ms 68.0% + triton_bmm_34540 0.0202 ms 56.1% +SingleProcess AUTOTUNE takes 4.1606 seconds +TIMEOUT + loading model: 0it [00:00, ?it/s] loading model: 0it [00:01, ?it/s] +dcgan +cuda eval dcgan int8dynamic-bs1-acc +pass-sqnr-53.240 + loading model: 0it [00:00, ?it/s]WARNING:common:Model demucs does not support bfloat16, running with amp instead + loading model: 0it [00:09, ?it/s] +WARNING:common:Model demucs does not support bfloat16, running with amp instead +demucs +cuda eval demucs int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for demucs. Setting accuracy check to cosine +WARNING:common:Model demucs does not support bfloat16, running with amp instead +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:06, ?it/s] +densenet121 +cuda eval densenet121 int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for densenet121. Setting accuracy check to cosine +pass-sqnr-29.354 + loading model: 0it [00:00, ?it/s]detectron2_fasterrcnn_r_101_c4 +WARNING:common:Model detectron2_fasterrcnn_r_101_c4 does not support bfloat16, running with amp instead + loading model: 0it [00:08, ?it/s] +WARNING:root:detectron2_fasterrcnn_r_101_c4 failed to load +Original Error: CUDA out of memory. Tried to allocate 5607.61 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.40 GiB is free. Including non-PyTorch memory, this process has 1.75 GiB memory in use. Of the allocated memory 1.14 GiB is allocated by PyTorch, and 73.83 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 477, in forward + box_features = self._shared_roi_transform( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 456, in _shared_roi_transform + x = self.pooler(features, boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 246, in forward + return self.level_poolers[0](x[0], pooler_fmt_boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 236, in roi_align + return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 168, in _roi_align + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) # [K, C, PH, PW, IY, IX] + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 62, in _bilinear_interpolate + v1 = masked_index(y_low, x_low) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 55, in masked_index + return input[ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 5607.61 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.40 GiB is free. Including non-PyTorch memory, this process has 1.75 GiB memory in use. Of the allocated memory 1.14 GiB is allocated by PyTorch, and 73.83 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s]detectron2_fasterrcnn_r_101_dc5 + loading model: 0it [00:09, ?it/s] +WARNING:root:detectron2_fasterrcnn_r_101_dc5 failed to load +Original Error: CUDA out of memory. Tried to allocate 2803.80 GiB. GPU 0 has a total capacity of 79.15 GiB of which 76.85 GiB is free. Including non-PyTorch memory, this process has 2.30 GiB memory in use. Of the allocated memory 1.68 GiB is allocated by PyTorch, and 44.00 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 747, in forward + pred_instances = self._forward_box(features, proposals) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 798, in _forward_box + box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 246, in forward + return self.level_poolers[0](x[0], pooler_fmt_boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 236, in roi_align + return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 168, in _roi_align + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) # [K, C, PH, PW, IY, IX] + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 62, in _bilinear_interpolate + v1 = masked_index(y_low, x_low) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 55, in masked_index + return input[ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2803.80 GiB. GPU 0 has a total capacity of 79.15 GiB of which 76.85 GiB is free. Including non-PyTorch memory, this process has 2.30 GiB memory in use. Of the allocated memory 1.68 GiB is allocated by PyTorch, and 44.00 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s]detectron2_fasterrcnn_r_101_fpn +WARNING:common:Model detectron2_fasterrcnn_r_101_fpn does not support bfloat16, running with amp instead + loading model: 0it [00:06, ?it/s] +WARNING:root:detectron2_fasterrcnn_r_101_fpn failed to load +Original Error: CUDA out of memory. Tried to allocate 3653.77 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.22 GiB is free. Including non-PyTorch memory, this process has 1.93 GiB memory in use. Of the allocated memory 1.35 GiB is allocated by PyTorch, and 37.84 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 747, in forward + pred_instances = self._forward_box(features, proposals) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 798, in _forward_box + box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 261, in forward + output.index_put_((inds,), pooler(x[level], pooler_fmt_boxes_level)) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 236, in roi_align + return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 168, in _roi_align + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) # [K, C, PH, PW, IY, IX] + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 62, in _bilinear_interpolate + v1 = masked_index(y_low, x_low) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 55, in masked_index + return input[ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3653.77 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.22 GiB is free. Including non-PyTorch memory, this process has 1.93 GiB memory in use. Of the allocated memory 1.35 GiB is allocated by PyTorch, and 37.84 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s]detectron2_fasterrcnn_r_50_c4 + loading model: 0it [00:06, ?it/s] +WARNING:root:detectron2_fasterrcnn_r_50_c4 failed to load +Original Error: CUDA out of memory. Tried to allocate 5607.61 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.73 GiB is free. Including non-PyTorch memory, this process has 1.42 GiB memory in use. Of the allocated memory 884.81 MiB is allocated by PyTorch, and 15.19 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 477, in forward + box_features = self._shared_roi_transform( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 456, in _shared_roi_transform + x = self.pooler(features, boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 246, in forward + return self.level_poolers[0](x[0], pooler_fmt_boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 236, in roi_align + return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 168, in _roi_align + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) # [K, C, PH, PW, IY, IX] + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 62, in _bilinear_interpolate + v1 = masked_index(y_low, x_low) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 55, in masked_index + return input[ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 5607.61 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.73 GiB is free. Including non-PyTorch memory, this process has 1.42 GiB memory in use. Of the allocated memory 884.81 MiB is allocated by PyTorch, and 15.19 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s]detectron2_fasterrcnn_r_50_dc5 +WARNING:common:Model detectron2_fasterrcnn_r_50_dc5 does not support bfloat16, running with amp instead + loading model: 0it [00:08, ?it/s] +WARNING:root:detectron2_fasterrcnn_r_50_dc5 failed to load +Original Error: CUDA out of memory. Tried to allocate 2803.80 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.04 GiB is free. Including non-PyTorch memory, this process has 2.11 GiB memory in use. Of the allocated memory 1.46 GiB is allocated by PyTorch, and 89.64 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 747, in forward + pred_instances = self._forward_box(features, proposals) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 798, in _forward_box + box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 246, in forward + return self.level_poolers[0](x[0], pooler_fmt_boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 236, in roi_align + return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 168, in _roi_align + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) # [K, C, PH, PW, IY, IX] + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 62, in _bilinear_interpolate + v1 = masked_index(y_low, x_low) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 55, in masked_index + return input[ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2803.80 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.04 GiB is free. Including non-PyTorch memory, this process has 2.11 GiB memory in use. Of the allocated memory 1.46 GiB is allocated by PyTorch, and 89.64 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s]detectron2_fasterrcnn_r_50_fpn +WARNING:common:Model detectron2_fasterrcnn_r_50_fpn does not support bfloat16, running with amp instead + loading model: 0it [00:07, ?it/s] +WARNING:root:detectron2_fasterrcnn_r_50_fpn failed to load +Original Error: CUDA out of memory. Tried to allocate 4125.40 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.46 GiB is free. Including non-PyTorch memory, this process has 1.69 GiB memory in use. Of the allocated memory 1.10 GiB is allocated by PyTorch, and 47.89 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 747, in forward + pred_instances = self._forward_box(features, proposals) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 798, in _forward_box + box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 261, in forward + output.index_put_((inds,), pooler(x[level], pooler_fmt_boxes_level)) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 236, in roi_align + return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 168, in _roi_align + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) # [K, C, PH, PW, IY, IX] + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 62, in _bilinear_interpolate + v1 = masked_index(y_low, x_low) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 55, in masked_index + return input[ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4125.40 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.46 GiB is free. Including non-PyTorch memory, this process has 1.69 GiB memory in use. Of the allocated memory 1.10 GiB is allocated by PyTorch, and 47.89 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s]detectron2_fcos_r_50_fpn +WARNING:common:Model detectron2_fcos_r_50_fpn does not support bfloat16, running with amp instead + loading model: 0it [00:06, ?it/s] +WARNING:common:Model detectron2_fcos_r_50_fpn does not support bfloat16, running with amp instead +cuda eval detectron2_fcos_r_50_fpn int8dynamic-bs1-acc +WARNING:common:Model detectron2_fcos_r_50_fpn does not support bfloat16, running with amp instead +AUTOTUNE convolution(1x128x100x152, 128x128x3x3) + convolution 0.0365 ms 100.0% + triton_convolution_123 0.2064 ms 17.7% + triton_convolution_124 0.2125 ms 17.2% + triton_convolution_121 0.2346 ms 15.6% + triton_convolution_118 0.2411 ms 15.2% + triton_convolution_122 0.2638 ms 13.9% + triton_convolution_119 0.3414 ms 10.7% + triton_convolution_120 0.9397 ms 3.9% +SingleProcess AUTOTUNE takes 2.5107 seconds +AUTOTUNE mm(15200x128, 128x512) + triton_mm_127 0.0263 ms 100.0% + triton_mm_126 0.0289 ms 91.0% + mm 0.0291 ms 90.2% + triton_mm_132 0.0299 ms 88.0% + triton_mm_129 0.0303 ms 86.6% + triton_mm_125 0.0316 ms 83.2% + triton_mm_128 0.0324 ms 81.0% + triton_mm_133 0.0329 ms 79.9% + triton_mm_135 0.0418 ms 62.9% + triton_mm_131 0.0507 ms 51.9% +SingleProcess AUTOTUNE takes 4.7620 seconds +AUTOTUNE convolution(1x256x200x304, 512x256x1x1) + convolution 0.0372 ms 100.0% + triton_convolution_140 0.0978 ms 38.1% + triton_convolution_142 0.1094 ms 34.0% + triton_convolution_143 0.1128 ms 33.0% + triton_convolution_141 0.1410 ms 26.4% + triton_convolution_137 0.1983 ms 18.8% + triton_convolution_138 0.2241 ms 16.6% + triton_convolution_139 0.5520 ms 6.7% +SingleProcess AUTOTUNE takes 4.3199 seconds +AUTOTUNE mm(15200x512, 512x128) + triton_mm_146 0.0284 ms 100.0% + triton_mm_147 0.0297 ms 95.9% + triton_mm_145 0.0311 ms 91.6% + triton_mm_148 0.0314 ms 90.6% + triton_mm_144 0.0323 ms 88.1% + triton_mm_152 0.0325 ms 87.6% + mm 0.0332 ms 85.6% + triton_mm_151 0.0340 ms 83.8% + triton_mm_149 0.0389 ms 73.2% + triton_mm_150 0.0439 ms 64.8% +SingleProcess AUTOTUNE takes 4.9120 seconds +AUTOTUNE convolution(1x512x100x152, 256x512x1x1) + convolution 0.0164 ms 100.0% + triton_convolution_240 0.0396 ms 41.5% + triton_convolution_243 0.0452 ms 36.4% + triton_convolution_242 0.0460 ms 35.7% + triton_convolution_241 0.0509 ms 32.3% + triton_convolution_237 0.1389 ms 11.8% + triton_convolution_238 0.1543 ms 10.7% + triton_convolution_239 0.2241 ms 7.3% +SingleProcess AUTOTUNE takes 4.4658 seconds +AUTOTUNE convolution(1x256x50x76, 256x256x3x3) + convolution 0.0380 ms 100.0% + triton_convolution_249 0.2245 ms 16.9% + triton_convolution_247 0.2461 ms 15.4% + triton_convolution_250 0.2758 ms 13.8% + triton_convolution_248 0.3974 ms 9.6% + triton_convolution_244 0.5125 ms 7.4% + triton_convolution_245 0.5816 ms 6.5% + triton_convolution_246 0.9586 ms 4.0% +SingleProcess AUTOTUNE takes 5.0221 seconds +AUTOTUNE mm(3800x256, 256x1024) + triton_mm_252 0.0234 ms 100.0% + mm 0.0244 ms 95.9% + triton_mm_253 0.0244 ms 95.7% + triton_mm_254 0.0259 ms 90.4% + triton_mm_255 0.0267 ms 87.8% + triton_mm_258 0.0289 ms 80.9% + triton_mm_259 0.0292 ms 80.0% + triton_mm_251 0.0308 ms 76.1% + triton_mm_261 0.0393 ms 59.6% + triton_mm_257 0.0442 ms 53.0% +SingleProcess AUTOTUNE takes 4.6660 seconds +AUTOTUNE convolution(1x512x100x152, 1024x512x1x1) + convolution 0.0336 ms 100.0% + triton_convolution_266 0.0993 ms 33.9% + triton_convolution_269 0.1162 ms 29.0% + triton_convolution_268 0.1200 ms 28.0% + triton_convolution_267 0.1416 ms 23.8% + triton_convolution_263 0.2281 ms 14.7% + triton_convolution_264 0.2526 ms 13.3% + triton_convolution_265 0.6037 ms 5.6% +SingleProcess AUTOTUNE takes 4.3400 seconds +AUTOTUNE mm(3800x1024, 1024x256) + mm 0.0233 ms 100.0% + triton_mm_272 0.0274 ms 85.0% + triton_mm_271 0.0275 ms 84.7% + triton_mm_273 0.0299 ms 77.9% + triton_mm_278 0.0302 ms 77.0% + triton_mm_274 0.0303 ms 77.0% + triton_mm_275 0.0408 ms 57.1% + triton_mm_276 0.0412 ms 56.5% + triton_mm_270 0.0413 ms 56.3% + triton_mm_277 0.0458 ms 50.8% +SingleProcess AUTOTUNE takes 4.6486 seconds +AUTOTUNE convolution(1x1024x50x76, 512x1024x1x1) + convolution 0.0177 ms 100.0% + triton_convolution_429 0.0584 ms 30.3% + triton_convolution_428 0.0690 ms 25.6% + triton_convolution_431 0.0795 ms 22.3% + triton_convolution_430 0.0811 ms 21.8% + triton_convolution_425 0.2644 ms 6.7% + triton_convolution_426 0.2885 ms 6.1% + triton_convolution_427 0.4139 ms 4.3% +SingleProcess AUTOTUNE takes 4.6566 seconds +AUTOTUNE convolution(1x512x25x38, 512x512x3x3) + convolution 0.0451 ms 100.0% + triton_convolution_437 0.4804 ms 9.4% + triton_convolution_436 0.5893 ms 7.7% + triton_convolution_438 0.6044 ms 7.5% + triton_convolution_435 0.6832 ms 6.6% + triton_convolution_432 1.0699 ms 4.2% + triton_convolution_433 1.2391 ms 3.6% + triton_convolution_434 1.8037 ms 2.5% +SingleProcess AUTOTUNE takes 4.6780 seconds +AUTOTUNE mm(950x512, 512x2048) + triton_mm_440 0.0239 ms 100.0% + triton_mm_441 0.0284 ms 84.1% + triton_mm_442 0.0287 ms 83.3% + triton_mm_443 0.0331 ms 72.1% + triton_mm_447 0.0331 ms 72.1% + triton_mm_446 0.0340 ms 70.2% + triton_mm_439 0.0356 ms 67.1% + triton_mm_449 0.0356 ms 67.0% + triton_mm_445 0.0417 ms 57.3% + mm 0.0443 ms 53.9% +SingleProcess AUTOTUNE takes 6.0963 seconds +AUTOTUNE convolution(1x1024x50x76, 2048x1024x1x1) + convolution 0.0369 ms 100.0% + triton_convolution_454 0.1322 ms 27.9% + triton_convolution_456 0.1450 ms 25.4% + triton_convolution_457 0.1524 ms 24.2% + triton_convolution_455 0.1678 ms 22.0% + triton_convolution_451 0.2643 ms 14.0% + triton_convolution_452 0.2792 ms 13.2% + triton_convolution_453 0.8156 ms 4.5% +SingleProcess AUTOTUNE takes 4.6970 seconds +AUTOTUNE mm(950x2048, 2048x512) + triton_mm_461 0.0338 ms 100.0% + mm 0.0359 ms 94.1% + triton_mm_462 0.0375 ms 90.2% + triton_mm_460 0.0416 ms 81.2% + triton_mm_459 0.0418 ms 80.9% + triton_mm_466 0.0444 ms 76.1% + triton_mm_464 0.0482 ms 70.1% + triton_mm_467 0.0524 ms 64.5% + triton_mm_463 0.0559 ms 60.4% + triton_mm_458 0.0622 ms 54.3% +SingleProcess AUTOTUNE takes 5.2684 seconds +AUTOTUNE addmm(15200x256, 15200x512, 512x256) + triton_mm_522 0.0374 ms 100.0% + triton_mm_521 0.0393 ms 95.3% + triton_mm_523 0.0406 ms 92.2% + triton_mm_524 0.0408 ms 91.6% + triton_mm_528 0.0442 ms 84.7% + triton_mm_520 0.0471 ms 79.4% + addmm 0.0503 ms 74.4% + bias_addmm 0.0541 ms 69.1% + triton_mm_527 0.0546 ms 68.6% + triton_mm_526 0.0688 ms 54.4% +SingleProcess AUTOTUNE takes 5.8164 seconds +AUTOTUNE addmm(3800x256, 3800x1024, 1024x256) + triton_mm_533 0.0278 ms 100.0% + triton_mm_534 0.0286 ms 97.4% + addmm 0.0297 ms 93.8% + triton_mm_535 0.0303 ms 91.8% + triton_mm_540 0.0305 ms 91.2% + triton_mm_536 0.0312 ms 89.1% + bias_addmm 0.0402 ms 69.2% + triton_mm_538 0.0413 ms 67.3% + triton_mm_537 0.0414 ms 67.1% + triton_mm_532 0.0417 ms 66.7% +SingleProcess AUTOTUNE takes 5.4729 seconds +AUTOTUNE addmm(950x256, 950x2048, 2048x256) + triton_mm_552 0.0277 ms 100.0% + triton_mm_553 0.0307 ms 90.3% + triton_mm_547 0.0331 ms 83.8% + addmm 0.0333 ms 83.3% + triton_mm_550 0.0347 ms 79.9% + triton_mm_548 0.0376 ms 73.8% + triton_mm_545 0.0405 ms 68.5% + triton_mm_549 0.0415 ms 66.9% + triton_mm_546 0.0418 ms 66.4% + triton_mm_544 0.0618 ms 44.9% +SingleProcess AUTOTUNE takes 5.7053 seconds +AUTOTUNE convolution(1x256x100x152, 256x256x3x3) + convolution 0.0951 ms 100.0% + triton_convolution_561 0.6191 ms 15.4% + triton_convolution_559 0.6605 ms 14.4% + triton_convolution_562 0.7043 ms 13.5% + triton_convolution_560 0.9832 ms 9.7% + triton_convolution_556 1.0477 ms 9.1% + triton_convolution_557 1.1932 ms 8.0% + triton_convolution_558 2.8248 ms 3.4% +SingleProcess AUTOTUNE takes 4.5594 seconds +AUTOTUNE convolution(1x256x25x38, 256x256x3x3) + convolution 0.0189 ms 100.0% + triton_convolution_574 0.1764 ms 10.7% + triton_convolution_575 0.2252 ms 8.4% + triton_convolution_573 0.2300 ms 8.2% + triton_convolution_576 0.2909 ms 6.5% + triton_convolution_570 0.5394 ms 3.5% + triton_convolution_571 0.6001 ms 3.1% + triton_convolution_572 0.8995 ms 2.1% +SingleProcess AUTOTUNE takes 4.5806 seconds +AUTOTUNE convolution(1x256x25x38, 256x256x3x3) + convolution 0.0170 ms 100.0% + triton_convolution_582 0.2979 ms 5.7% + triton_convolution_581 0.3504 ms 4.8% + triton_convolution_579 0.3898 ms 4.4% + triton_convolution_580 0.4249 ms 4.0% + triton_convolution_583 0.4727 ms 3.6% + triton_convolution_577 0.6132 ms 2.8% + triton_convolution_578 0.6936 ms 2.4% +SingleProcess AUTOTUNE takes 4.7868 seconds +AUTOTUNE convolution(1x256x13x19, 256x256x3x3) + convolution 0.0176 ms 100.0% + triton_convolution_586 0.1852 ms 9.5% + triton_convolution_589 0.2708 ms 6.5% + triton_convolution_590 0.2766 ms 6.4% + triton_convolution_588 0.3073 ms 5.7% + triton_convolution_587 0.3128 ms 5.6% + triton_convolution_585 0.4448 ms 3.9% + triton_convolution_584 0.5243 ms 3.4% +SingleProcess AUTOTUNE takes 4.6710 seconds +skipping cudagraphs due to ['mutated inputs'] +pass-sqnr-error + loading model: 0it [00:00, ?it/s]detectron2_maskrcnn_r_101_c4 +WARNING:common:Model detectron2_maskrcnn_r_101_c4 does not support bfloat16, running with amp instead + loading model: 0it [00:06, ?it/s] +WARNING:root:detectron2_maskrcnn_r_101_c4 failed to load +Original Error: CUDA out of memory. Tried to allocate 5607.61 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.38 GiB is free. Including non-PyTorch memory, this process has 1.77 GiB memory in use. Of the allocated memory 1.15 GiB is allocated by PyTorch, and 84.27 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 477, in forward + box_features = self._shared_roi_transform( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 456, in _shared_roi_transform + x = self.pooler(features, boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 246, in forward + return self.level_poolers[0](x[0], pooler_fmt_boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 236, in roi_align + return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 168, in _roi_align + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) # [K, C, PH, PW, IY, IX] + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 62, in _bilinear_interpolate + v1 = masked_index(y_low, x_low) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 55, in masked_index + return input[ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 5607.61 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.38 GiB is free. Including non-PyTorch memory, this process has 1.77 GiB memory in use. Of the allocated memory 1.15 GiB is allocated by PyTorch, and 84.27 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s]detectron2_maskrcnn_r_101_fpn +WARNING:common:Model detectron2_maskrcnn_r_101_fpn does not support bfloat16, running with amp instead + loading model: 0it [00:08, ?it/s] +WARNING:root:detectron2_maskrcnn_r_101_fpn failed to load +Original Error: CUDA out of memory. Tried to allocate 3682.18 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.23 GiB is free. Including non-PyTorch memory, this process has 1.92 GiB memory in use. Of the allocated memory 1.36 GiB is allocated by PyTorch, and 20.94 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 747, in forward + pred_instances = self._forward_box(features, proposals) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 798, in _forward_box + box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 261, in forward + output.index_put_((inds,), pooler(x[level], pooler_fmt_boxes_level)) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 236, in roi_align + return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 168, in _roi_align + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) # [K, C, PH, PW, IY, IX] + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 62, in _bilinear_interpolate + v1 = masked_index(y_low, x_low) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 55, in masked_index + return input[ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3682.18 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.23 GiB is free. Including non-PyTorch memory, this process has 1.92 GiB memory in use. Of the allocated memory 1.36 GiB is allocated by PyTorch, and 20.94 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s]detectron2_maskrcnn_r_50_c4 +WARNING:common:Model detectron2_maskrcnn_r_50_c4 does not support bfloat16, running with amp instead + loading model: 0it [00:06, ?it/s] +WARNING:root:detectron2_maskrcnn_r_50_c4 failed to load +Original Error: CUDA out of memory. Tried to allocate 5433.77 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.69 GiB is free. Including non-PyTorch memory, this process has 1.46 GiB memory in use. Of the allocated memory 910.65 MiB is allocated by PyTorch, and 33.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 477, in forward + box_features = self._shared_roi_transform( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 456, in _shared_roi_transform + x = self.pooler(features, boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 246, in forward + return self.level_poolers[0](x[0], pooler_fmt_boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 236, in roi_align + return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 168, in _roi_align + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) # [K, C, PH, PW, IY, IX] + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 62, in _bilinear_interpolate + v1 = masked_index(y_low, x_low) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 55, in masked_index + return input[ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 5433.77 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.69 GiB is free. Including non-PyTorch memory, this process has 1.46 GiB memory in use. Of the allocated memory 910.65 MiB is allocated by PyTorch, and 33.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s]detectron2_maskrcnn_r_50_fpn +WARNING:common:Model detectron2_maskrcnn_r_50_fpn does not support bfloat16, running with amp instead + loading model: 0it [00:07, ?it/s] +WARNING:root:detectron2_maskrcnn_r_50_fpn failed to load +Original Error: CUDA out of memory. Tried to allocate 3966.30 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.40 GiB is free. Including non-PyTorch memory, this process has 1.75 GiB memory in use. Of the allocated memory 1.11 GiB is allocated by PyTorch, and 99.07 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 747, in forward + pred_instances = self._forward_box(features, proposals) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 798, in _forward_box + box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 261, in forward + output.index_put_((inds,), pooler(x[level], pooler_fmt_boxes_level)) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 236, in roi_align + return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 168, in _roi_align + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) # [K, C, PH, PW, IY, IX] + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 62, in _bilinear_interpolate + v1 = masked_index(y_low, x_low) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 55, in masked_index + return input[ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3966.30 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.40 GiB is free. Including non-PyTorch memory, this process has 1.75 GiB memory in use. Of the allocated memory 1.11 GiB is allocated by PyTorch, and 99.07 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s] loading model: 0it [00:11, ?it/s] +dlrm +cuda eval dlrm int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for dlrm. Setting accuracy check to cosine +AUTOTUNE int_mm(1x512, 512x512, 1x512) + triton_mm_10 0.0095 ms 100.0% + triton_mm_9 0.0100 ms 95.8% + triton_mm_5 0.0102 ms 93.4% + triton_mm_8 0.0103 ms 93.0% + triton_mm_4 0.0104 ms 91.4% + triton_mm_6 0.0105 ms 90.6% + triton_mm_2 0.0131 ms 72.9% + triton_mm_1 0.0137 ms 69.8% + triton_mm_0 0.0165 ms 57.6% + triton_mm_3 0.0166 ms 57.4% +SingleProcess AUTOTUNE takes 3.8999 seconds +AUTOTUNE int_mm(1x512, 512x64, 1x64) + triton_mm_17 0.0080 ms 100.0% + triton_mm_16 0.0092 ms 86.8% + triton_mm_14 0.0096 ms 83.1% + triton_mm_13 0.0099 ms 80.9% + triton_mm_15 0.0100 ms 79.6% + triton_mm_12 0.0121 ms 66.1% + triton_mm_11 0.0150 ms 53.3% +SingleProcess AUTOTUNE takes 2.2859 seconds +AUTOTUNE bmm(1x9x64, 1x64x9) + triton_bmm_19 0.0058 ms 100.0% + triton_bmm_20 0.0058 ms 100.0% + triton_bmm_21 0.0058 ms 100.0% + triton_bmm_22 0.0060 ms 95.7% + triton_bmm_18 0.0062 ms 92.3% + triton_bmm_23 0.0066 ms 87.8% + bmm 0.0070 ms 81.8% + triton_bmm_25 0.0072 ms 80.4% + triton_bmm_24 0.0072 ms 79.6% +SingleProcess AUTOTUNE takes 2.4247 seconds +AUTOTUNE int_mm(1x100, 100x1024, 1x1024) + triton_mm_31 0.0075 ms 100.0% + triton_mm_34 0.0075 ms 100.0% + triton_mm_32 0.0077 ms 97.1% + triton_mm_28 0.0077 ms 96.7% + triton_mm_30 0.0077 ms 96.7% + triton_mm_26 0.0085 ms 87.6% + triton_mm_36 0.0086 ms 87.3% + triton_mm_27 0.0086 ms 86.8% + triton_mm_29 0.0092 ms 81.2% + triton_mm_35 0.0095 ms 79.1% +SingleProcess AUTOTUNE takes 3.9097 seconds +AUTOTUNE int_mm(1x1024, 1024x1024, 1x1024) + triton_mm_47 0.0112 ms 100.0% + triton_mm_46 0.0131 ms 85.3% + triton_mm_42 0.0132 ms 85.0% + triton_mm_45 0.0140 ms 80.1% + triton_mm_43 0.0144 ms 77.8% + triton_mm_41 0.0147 ms 76.1% + triton_mm_39 0.0198 ms 56.5% + triton_mm_38 0.0213 ms 52.5% + triton_mm_37 0.0291 ms 38.5% + triton_mm_40 0.0305 ms 36.7% +SingleProcess AUTOTUNE takes 4.1208 seconds +AUTOTUNE int_mm(1x1024, 1024x1, 1x1) + triton_mm_64 0.0088 ms 100.0% + triton_mm_62 0.0103 ms 85.4% + triton_mm_63 0.0106 ms 83.3% + triton_mm_61 0.0109 ms 80.8% + triton_mm_60 0.0148 ms 59.7% + triton_mm_59 0.0176 ms 49.9% +SingleProcess AUTOTUNE takes 1.9201 seconds +pass-sqnr-nan + loading model: 0it [00:00, ?it/s]WARNING:common:Model doctr_det_predictor does not support bfloat16, running with amp instead + loading model: 0it [00:07, ?it/s] +WARNING:common:Model doctr_det_predictor does not support bfloat16, running with amp instead +doctr_det_predictor +cuda eval doctr_det_predictor int8dynamic-bs1-acc +WARNING:common:Model doctr_det_predictor does not support bfloat16, running with amp instead +AUTOTUNE convolution(1x3x1024x1024, 64x3x7x7) + convolution 0.1112 ms 100.0% + triton_convolution_3 0.4625 ms 24.0% + triton_convolution_4 0.5113 ms 21.8% + triton_convolution_5 0.5883 ms 18.9% + triton_convolution_0 0.5889 ms 18.9% + triton_convolution_2 0.7713 ms 14.4% + triton_convolution_1 0.9546 ms 11.6% +SingleProcess AUTOTUNE takes 2.9890 seconds +AUTOTUNE mm(65536x64, 64x64) + triton_mm_14 0.0193 ms 100.0% + triton_mm_8 0.0193 ms 99.8% + triton_mm_6 0.0196 ms 98.5% + triton_mm_7 0.0203 ms 95.1% + mm 0.0214 ms 90.1% + triton_mm_10 0.0215 ms 89.5% + triton_mm_9 0.0217 ms 88.8% + triton_mm_13 0.0221 ms 87.1% + triton_mm_16 0.0229 ms 84.0% + triton_mm_15 0.0243 ms 79.3% +SingleProcess AUTOTUNE takes 4.0114 seconds +AUTOTUNE convolution(1x64x256x256, 64x64x3x3) + convolution 0.0474 ms 100.0% + triton_convolution_18 0.1759 ms 26.9% + triton_convolution_23 0.1767 ms 26.8% + triton_convolution_24 0.1817 ms 26.1% + triton_convolution_21 0.2564 ms 18.5% + triton_convolution_22 0.2668 ms 17.7% + triton_convolution_19 0.3187 ms 14.9% + triton_convolution_20 0.7628 ms 6.2% +SingleProcess AUTOTUNE takes 3.9643 seconds +AUTOTUNE mm(65536x64, 64x256) + triton_mm_27 0.0370 ms 100.0% + triton_mm_26 0.0372 ms 99.6% + triton_mm_33 0.0415 ms 89.1% + triton_mm_29 0.0429 ms 86.1% + triton_mm_28 0.0440 ms 84.1% + mm 0.0456 ms 81.1% + triton_mm_25 0.0461 ms 80.2% + triton_mm_35 0.0485 ms 76.3% + triton_mm_32 0.0537 ms 68.9% + triton_mm_36 0.0646 ms 57.3% +SingleProcess AUTOTUNE takes 4.2940 seconds +AUTOTUNE mm(65536x256, 256x64) + mm 0.0430 ms 100.0% + triton_mm_51 0.0433 ms 99.5% + triton_mm_50 0.0442 ms 97.5% + triton_mm_52 0.0446 ms 96.4% + triton_mm_53 0.0449 ms 95.8% + triton_mm_56 0.0463 ms 93.0% + triton_mm_57 0.0466 ms 92.4% + triton_mm_49 0.0467 ms 92.2% + triton_mm_54 0.0502 ms 85.7% + triton_mm_55 0.0505 ms 85.2% +SingleProcess AUTOTUNE takes 4.3039 seconds +AUTOTUNE mm(65536x256, 256x128) + triton_mm_112 0.0524 ms 100.0% + mm 0.0529 ms 99.0% + triton_mm_113 0.0532 ms 98.4% + triton_mm_114 0.0543 ms 96.5% + triton_mm_115 0.0547 ms 95.7% + triton_mm_118 0.0553 ms 94.7% + triton_mm_111 0.0572 ms 91.6% + triton_mm_119 0.0581 ms 90.1% + triton_mm_121 0.0777 ms 67.4% + triton_mm_116 0.0826 ms 63.4% +SingleProcess AUTOTUNE takes 4.7226 seconds +AUTOTUNE convolution(1x128x256x256, 128x128x3x3) + convolution 0.0459 ms 100.0% + triton_convolution_129 0.3180 ms 14.4% + triton_convolution_128 0.3446 ms 13.3% + triton_convolution_124 0.3704 ms 12.4% + triton_convolution_123 0.3838 ms 12.0% + triton_convolution_126 0.4179 ms 11.0% + triton_convolution_127 0.4281 ms 10.7% + triton_convolution_125 0.9810 ms 4.7% +SingleProcess AUTOTUNE takes 3.9033 seconds +AUTOTUNE mm(16384x128, 128x512) + triton_mm_132 0.0263 ms 100.0% + triton_mm_131 0.0273 ms 96.4% + mm 0.0276 ms 95.2% + triton_mm_130 0.0286 ms 91.8% + triton_mm_137 0.0302 ms 87.1% + triton_mm_134 0.0306 ms 86.1% + triton_mm_133 0.0312 ms 84.4% + triton_mm_138 0.0335 ms 78.6% + triton_mm_140 0.0364 ms 72.4% + triton_mm_136 0.0548 ms 48.0% +SingleProcess AUTOTUNE takes 4.5479 seconds +AUTOTUNE convolution(1x256x256x256, 512x256x1x1) + convolution 0.0375 ms 100.0% + triton_convolution_147 0.1030 ms 36.5% + triton_convolution_145 0.1057 ms 35.5% + triton_convolution_143 0.1069 ms 35.1% + triton_convolution_148 0.1095 ms 34.3% + triton_convolution_142 0.1128 ms 33.3% + triton_convolution_146 0.1160 ms 32.3% + triton_convolution_144 0.5533 ms 6.8% +SingleProcess AUTOTUNE takes 4.4628 seconds +AUTOTUNE mm(16384x512, 512x128) + triton_mm_150 0.0269 ms 100.0% + triton_mm_151 0.0296 ms 90.8% + triton_mm_152 0.0304 ms 88.5% + triton_mm_157 0.0304 ms 88.5% + triton_mm_149 0.0314 ms 85.6% + mm 0.0324 ms 83.0% + triton_mm_153 0.0336 ms 80.2% + triton_mm_156 0.0342 ms 78.6% + triton_mm_159 0.0394 ms 68.3% + triton_mm_154 0.0413 ms 65.1% +SingleProcess AUTOTUNE takes 4.6439 seconds +AUTOTUNE mm(16384x512, 512x256) + mm 0.0356 ms 100.0% + triton_mm_244 0.0362 ms 98.4% + triton_mm_243 0.0375 ms 95.0% + triton_mm_245 0.0386 ms 92.2% + triton_mm_246 0.0390 ms 91.3% + triton_mm_250 0.0455 ms 78.2% + triton_mm_242 0.0472 ms 75.5% + triton_mm_249 0.0524 ms 67.9% + triton_mm_252 0.0690 ms 51.6% + triton_mm_247 0.0714 ms 49.9% +SingleProcess AUTOTUNE takes 4.9959 seconds +AUTOTUNE convolution(1x256x128x128, 256x256x3x3) + convolution 0.0404 ms 100.0% + triton_convolution_259 0.3148 ms 12.8% + triton_convolution_260 0.3548 ms 11.4% + triton_convolution_257 0.4509 ms 8.9% + triton_convolution_254 0.5947 ms 6.8% + triton_convolution_258 0.6044 ms 6.7% + triton_convolution_255 0.6442 ms 6.3% + triton_convolution_256 0.9831 ms 4.1% +SingleProcess AUTOTUNE takes 4.4404 seconds +AUTOTUNE mm(4096x256, 256x1024) + triton_mm_263 0.0217 ms 100.0% + triton_mm_262 0.0224 ms 96.9% + mm 0.0235 ms 92.2% + triton_mm_264 0.0247 ms 87.6% + triton_mm_265 0.0251 ms 86.4% + triton_mm_261 0.0276 ms 78.6% + triton_mm_268 0.0278 ms 78.0% + triton_mm_269 0.0284 ms 76.3% + triton_mm_271 0.0372 ms 58.2% + triton_mm_266 0.0446 ms 48.5% +SingleProcess AUTOTUNE takes 4.8160 seconds +AUTOTUNE convolution(1x512x128x128, 1024x512x1x1) + convolution 0.0343 ms 100.0% + triton_convolution_276 0.1078 ms 31.9% + triton_convolution_278 0.1121 ms 30.6% + triton_convolution_277 0.1136 ms 30.2% + triton_convolution_279 0.1174 ms 29.3% + triton_convolution_274 0.1218 ms 28.2% + triton_convolution_273 0.1340 ms 25.6% + triton_convolution_275 0.6561 ms 5.2% +SingleProcess AUTOTUNE takes 4.3187 seconds +AUTOTUNE mm(4096x1024, 1024x256) + mm 0.0228 ms 100.0% + triton_mm_282 0.0252 ms 90.4% + triton_mm_281 0.0267 ms 85.5% + triton_mm_288 0.0272 ms 83.8% + triton_mm_284 0.0280 ms 81.4% + triton_mm_283 0.0284 ms 80.4% + triton_mm_285 0.0385 ms 59.3% + triton_mm_280 0.0395 ms 57.7% + triton_mm_286 0.0395 ms 57.7% + triton_mm_289 0.0433 ms 52.7% +SingleProcess AUTOTUNE takes 4.6823 seconds +AUTOTUNE mm(4096x1024, 1024x512) + triton_mm_437 0.0314 ms 100.0% + triton_mm_436 0.0323 ms 97.2% + mm 0.0356 ms 88.4% + triton_mm_439 0.0387 ms 81.2% + triton_mm_438 0.0400 ms 78.6% + triton_mm_443 0.0409 ms 76.9% + triton_mm_435 0.0433 ms 72.5% + triton_mm_445 0.0642 ms 48.9% + triton_mm_440 0.0682 ms 46.1% + triton_mm_442 0.0685 ms 45.8% +SingleProcess AUTOTUNE takes 4.8440 seconds +AUTOTUNE convolution(1x512x64x64, 512x512x3x3) + convolution 0.0470 ms 100.0% + triton_convolution_452 0.5858 ms 8.0% + triton_convolution_453 0.7347 ms 6.4% + triton_convolution_451 0.7811 ms 6.0% + triton_convolution_450 0.8387 ms 5.6% + triton_convolution_447 1.2869 ms 3.7% + triton_convolution_448 1.3758 ms 3.4% + triton_convolution_449 2.0305 ms 2.3% +SingleProcess AUTOTUNE takes 4.4927 seconds +AUTOTUNE mm(1024x512, 512x2048) + triton_mm_455 0.0212 ms 100.0% + triton_mm_456 0.0216 ms 98.2% + triton_mm_457 0.0250 ms 85.0% + triton_mm_458 0.0250 ms 85.0% + triton_mm_462 0.0252 ms 84.4% + triton_mm_454 0.0269 ms 79.0% + triton_mm_461 0.0295 ms 72.0% + mm 0.0301 ms 70.5% + triton_mm_464 0.0335 ms 63.5% + triton_mm_463 0.0400 ms 53.1% +SingleProcess AUTOTUNE takes 4.9576 seconds +AUTOTUNE convolution(1x1024x64x64, 2048x1024x1x1) + convolution 0.0367 ms 100.0% + triton_convolution_467 0.1244 ms 29.5% + triton_convolution_470 0.1289 ms 28.5% + triton_convolution_466 0.1438 ms 25.5% + triton_convolution_472 0.1460 ms 25.1% + triton_convolution_469 0.1461 ms 25.1% + triton_convolution_471 0.1461 ms 25.1% + triton_convolution_468 0.8724 ms 4.2% +SingleProcess AUTOTUNE takes 4.3313 seconds +AUTOTUNE mm(1024x2048, 2048x512) + mm 0.0220 ms 100.0% + triton_mm_477 0.0294 ms 74.8% + triton_mm_476 0.0300 ms 73.2% + triton_mm_481 0.0326 ms 67.4% + triton_mm_475 0.0378 ms 58.2% + triton_mm_474 0.0395 ms 55.7% + triton_mm_478 0.0426 ms 51.6% + triton_mm_479 0.0442 ms 49.7% + triton_mm_482 0.0477 ms 46.1% + triton_mm_473 0.0587 ms 37.5% +SingleProcess AUTOTUNE takes 4.6671 seconds +AUTOTUNE mm(1024x2048, 2048x256) + mm 0.0174 ms 100.0% + triton_mm_543 0.0244 ms 71.3% + triton_mm_544 0.0264 ms 66.0% + triton_mm_539 0.0288 ms 60.6% + triton_mm_538 0.0297 ms 58.7% + triton_mm_540 0.0314 ms 55.5% + triton_mm_541 0.0320 ms 54.5% + triton_mm_537 0.0374 ms 46.6% + triton_mm_536 0.0382 ms 45.6% + triton_mm_535 0.0578 ms 30.2% +SingleProcess AUTOTUNE takes 4.6817 seconds +AUTOTUNE mm(65536x256, 256x256) + mm 0.0648 ms 100.0% + triton_mm_572 0.0697 ms 92.9% + triton_mm_573 0.0697 ms 92.9% + triton_mm_575 0.0769 ms 84.2% + triton_mm_574 0.0791 ms 81.8% + triton_mm_578 0.0792 ms 81.8% + triton_mm_571 0.0827 ms 78.4% + triton_mm_579 0.0898 ms 72.1% + triton_mm_581 0.1016 ms 63.8% + triton_mm_576 0.1561 ms 41.5% +SingleProcess AUTOTUNE takes 4.5370 seconds +AUTOTUNE convolution(1x256x256x256, 64x256x3x3) + convolution 0.1154 ms 100.0% + triton_convolution_589 0.7918 ms 14.6% + triton_convolution_588 0.8203 ms 14.1% + triton_convolution_583 0.9000 ms 12.8% + triton_convolution_586 1.0417 ms 11.1% + triton_convolution_587 1.2828 ms 9.0% + triton_convolution_584 1.6458 ms 7.0% + triton_convolution_585 3.0834 ms 3.7% +SingleProcess AUTOTUNE takes 3.8932 seconds +AUTOTUNE convolution(1x256x128x128, 64x256x3x3) + convolution 0.0481 ms 100.0% + triton_convolution_596 0.2750 ms 17.5% + triton_convolution_595 0.2845 ms 16.9% + triton_convolution_590 0.3049 ms 15.8% + triton_convolution_593 0.4036 ms 11.9% + triton_convolution_594 0.4265 ms 11.3% + triton_convolution_591 0.5780 ms 8.3% + triton_convolution_592 0.9668 ms 5.0% +SingleProcess AUTOTUNE takes 3.6314 seconds +AUTOTUNE convolution(1x256x64x64, 64x256x3x3) + convolution 0.0215 ms 100.0% + triton_convolution_602 0.1294 ms 16.6% + triton_convolution_601 0.1996 ms 10.8% + triton_convolution_600 0.2058 ms 10.4% + triton_convolution_597 0.2062 ms 10.4% + triton_convolution_603 0.2768 ms 7.8% + triton_convolution_598 0.5508 ms 3.9% + triton_convolution_599 0.9634 ms 2.2% +SingleProcess AUTOTUNE takes 3.6313 seconds +AUTOTUNE convolution(1x256x32x32, 64x256x3x3) + convolution 0.0168 ms 100.0% + triton_convolution_609 0.1228 ms 13.7% + triton_convolution_608 0.1787 ms 9.4% + triton_convolution_607 0.1901 ms 8.9% + triton_convolution_604 0.1942 ms 8.7% + triton_convolution_610 0.2794 ms 6.0% + triton_convolution_605 0.5451 ms 3.1% + triton_convolution_606 0.9589 ms 1.8% +SingleProcess AUTOTUNE takes 3.7063 seconds +[2023-12-12 22:14:27,078] [2/0_1] torch._inductor.utils: [WARNING] DeviceCopy in input program +skipping cudagraphs due to ['non-cuda device in graph'] +Fatal glibc error: malloc.c:2496 (sysmalloc): assertion failed: (old_top == initial_top (av) && old_size == 0) || ((unsigned long) (old_size) >= MINSIZE && prev_inuse (old_top) && ((unsigned long) old_end & (pagesize - 1)) == 0) +Run failed with return code: -6 +Output: None +Error: None + loading model: 0it [00:00, ?it/s]WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead + loading model: 0it [00:05, ?it/s] +WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead +doctr_reco_predictor +cuda eval doctr_reco_predictor int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for doctr_reco_predictor. Setting accuracy check to cosine +WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead +AUTOTUNE convolution(1x3x32x128, 64x3x3x3) + convolution 0.0097 ms 100.0% + triton_convolution_4 0.0116 ms 83.9% + triton_convolution_3 0.0144 ms 67.5% + triton_convolution_0 0.0177 ms 54.9% + triton_convolution_5 0.0188 ms 51.7% + triton_convolution_2 0.0221 ms 43.9% + triton_convolution_1 0.0417 ms 23.3% +SingleProcess AUTOTUNE takes 2.9928 seconds +AUTOTUNE convolution(1x64x32x128, 64x64x3x3) + convolution 0.0148 ms 100.0% + triton_convolution_11 0.0387 ms 38.2% + triton_convolution_6 0.0586 ms 25.2% + triton_convolution_9 0.0592 ms 24.9% + triton_convolution_10 0.0593 ms 24.9% + triton_convolution_12 0.0620 ms 23.8% + triton_convolution_7 0.1007 ms 14.7% + triton_convolution_8 0.2502 ms 5.9% +SingleProcess AUTOTUNE takes 4.0690 seconds +AUTOTUNE convolution(1x64x16x64, 128x64x3x3) + convolution 0.0112 ms 100.0% + triton_convolution_18 0.0419 ms 26.7% + triton_convolution_17 0.0516 ms 21.7% + triton_convolution_16 0.0607 ms 18.5% + triton_convolution_19 0.0646 ms 17.3% + triton_convolution_13 0.0712 ms 15.7% + triton_convolution_14 0.0994 ms 11.3% + triton_convolution_15 0.2484 ms 4.5% +SingleProcess AUTOTUNE takes 4.3217 seconds +AUTOTUNE convolution(1x128x16x64, 128x128x3x3) + convolution 0.0132 ms 100.0% + triton_convolution_25 0.0758 ms 17.4% + triton_convolution_24 0.0952 ms 13.9% + triton_convolution_23 0.1118 ms 11.8% + triton_convolution_26 0.1222 ms 10.8% + triton_convolution_20 0.1371 ms 9.6% + triton_convolution_21 0.2382 ms 5.5% + triton_convolution_22 0.4980 ms 2.7% +SingleProcess AUTOTUNE takes 3.9880 seconds +AUTOTUNE convolution(1x128x8x32, 256x128x3x3) + convolution 0.0133 ms 100.0% + triton_convolution_31 0.0873 ms 15.2% + triton_convolution_32 0.1026 ms 12.9% + triton_convolution_30 0.1101 ms 12.1% + triton_convolution_33 0.1301 ms 10.2% + triton_convolution_29 0.1464 ms 9.1% + triton_convolution_28 0.2415 ms 5.5% + triton_convolution_27 0.2552 ms 5.2% +SingleProcess AUTOTUNE takes 4.2246 seconds +AUTOTUNE convolution(1x256x8x32, 256x256x3x3) + convolution 0.0173 ms 100.0% + triton_convolution_38 0.1765 ms 9.8% + triton_convolution_39 0.2219 ms 7.8% + triton_convolution_37 0.2371 ms 7.3% + triton_convolution_40 0.2935 ms 5.9% + triton_convolution_36 0.3012 ms 5.7% + triton_convolution_35 0.5448 ms 3.2% + triton_convolution_34 0.5450 ms 3.2% +SingleProcess AUTOTUNE takes 4.4146 seconds +AUTOTUNE convolution(1x256x4x32, 512x256x3x3) + convolution 0.0177 ms 100.0% + triton_convolution_54 0.1730 ms 10.2% + triton_convolution_52 0.1742 ms 10.2% + triton_convolution_50 0.1848 ms 9.6% + triton_convolution_53 0.2230 ms 7.9% + triton_convolution_51 0.2344 ms 7.5% + triton_convolution_49 0.3187 ms 5.6% + triton_convolution_48 0.5652 ms 3.1% +SingleProcess AUTOTUNE takes 3.7205 seconds +AUTOTUNE convolution(1x512x4x32, 512x512x3x3) + convolution 0.0219 ms 100.0% + triton_convolution_57 0.3379 ms 6.5% + triton_convolution_61 0.4467 ms 4.9% + triton_convolution_59 0.4513 ms 4.8% + triton_convolution_60 0.4741 ms 4.6% + triton_convolution_58 0.6437 ms 3.4% + triton_convolution_56 1.0122 ms 2.2% + triton_convolution_55 1.1620 ms 1.9% +SingleProcess AUTOTUNE takes 3.7802 seconds +AUTOTUNE convolution(1x512x2x32, 512x512x3x3) + convolution 0.0217 ms 100.0% + triton_convolution_71 0.2483 ms 8.7% + triton_convolution_75 0.2965 ms 7.3% + triton_convolution_72 0.3376 ms 6.4% + triton_convolution_73 0.3989 ms 5.4% + triton_convolution_74 0.4661 ms 4.6% + triton_convolution_70 0.4705 ms 4.6% + triton_convolution_69 1.1652 ms 1.9% +SingleProcess AUTOTUNE takes 3.2267 seconds +AUTOTUNE int_mm(32x256, 256x124, 32x124) + triton_mm_95 0.0085 ms 100.0% + triton_mm_98 0.0088 ms 96.7% + triton_mm_94 0.0091 ms 93.3% + triton_mm_96 0.0091 ms 93.3% + triton_mm_99 0.0092 ms 93.0% + triton_mm_92 0.0096 ms 89.0% + triton_mm_93 0.0098 ms 87.2% + triton_mm_91 0.0107 ms 79.6% + triton_mm_90 0.0113 ms 75.6% + triton_mm_97 0.0120 ms 71.1% +SingleProcess AUTOTUNE takes 3.2138 seconds +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +drq +cuda eval drq int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for drq. Setting accuracy check to cosine +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 36, in + run() + File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run + benchmark.run(bm_args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run + main(TorchBenchmarkRunner(), original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main + process_entry(0, runner, original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry + return maybe_fresh_cache( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner + return fn(*args, **kwargs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3642, in run + runner.run_one_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 2518, in run_one_model + status = self.check_accuracy( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 2147, in check_accuracy + model, example_inputs = self.maybe_cast(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1937, in maybe_cast + model = self.deepcopy_model(model) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1887, in deepcopy_model + return copy.deepcopy(model) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 172, in deepcopy + y = _reconstruct(x, memo, *rv) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 271, in _reconstruct + state = deepcopy(state, memo) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 146, in deepcopy + y = copier(x, memo) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 231, in _deepcopy_dict + y[deepcopy(key, memo)] = deepcopy(value, memo) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 172, in deepcopy + y = _reconstruct(x, memo, *rv) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 297, in _reconstruct + value = deepcopy(value, memo) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 172, in deepcopy + y = _reconstruct(x, memo, *rv) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 271, in _reconstruct + state = deepcopy(state, memo) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 146, in deepcopy + y = copier(x, memo) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 231, in _deepcopy_dict + y[deepcopy(key, memo)] = deepcopy(value, memo) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 146, in deepcopy + y = copier(x, memo) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 231, in _deepcopy_dict + y[deepcopy(key, memo)] = deepcopy(value, memo) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 153, in deepcopy + y = copier(memo) + File "/home/cdhernandez/local/pytorch/torch/_tensor.py", line 86, in __deepcopy__ + raise RuntimeError( +RuntimeError: Only Tensors created explicitly by the user (graph leaves) support the deepcopy protocol at the moment. If you were attempting to deepcopy a module, this may be because of a torch.nn.utils.weight_norm usage, see https://github.com/pytorch/pytorch/pull/103001 +Run failed with return code: 1 +Output: None +Error: None + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +fastNLP_Bert +cuda eval fastNLP_Bert int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for fastNLP_Bert. Setting accuracy check to cosine +AUTOTUNE int_mm(475x768, 768x768, 475x768) + triton_mm_8 0.0138 ms 100.0% + triton_mm_3 0.0162 ms 85.0% + triton_mm_4 0.0167 ms 82.4% + triton_mm_5 0.0170 ms 81.2% + triton_mm_6 0.0181 ms 76.3% + triton_mm_1 0.0195 ms 70.7% + triton_mm_2 0.0196 ms 70.2% + triton_mm_0 0.0219 ms 63.0% + triton_mm_9 0.0272 ms 50.6% + triton_mm_10 0.0281 ms 49.1% +SingleProcess AUTOTUNE takes 7.6872 seconds +AUTOTUNE int_mm(475x768, 768x3072, 475x3072) + triton_mm_45 0.0218 ms 100.0% + triton_mm_52 0.0221 ms 98.4% + triton_mm_47 0.0232 ms 93.9% + triton_mm_46 0.0242 ms 89.9% + triton_mm_48 0.0242 ms 89.9% + triton_mm_44 0.0272 ms 80.1% + triton_mm_53 0.0275 ms 79.1% + triton_mm_54 0.0283 ms 77.0% + triton_mm_51 0.0341 ms 63.8% + triton_mm_50 0.0449 ms 48.4% +SingleProcess AUTOTUNE takes 7.6998 seconds +AUTOTUNE int_mm(475x3072, 3072x768, 475x768) + triton_mm_63 0.0305 ms 100.0% + triton_mm_58 0.0383 ms 79.7% + triton_mm_59 0.0404 ms 75.7% + triton_mm_60 0.0431 ms 70.9% + triton_mm_61 0.0433 ms 70.5% + triton_mm_57 0.0531 ms 57.5% + triton_mm_56 0.0538 ms 56.8% + triton_mm_64 0.0542 ms 56.3% + triton_mm_65 0.0545 ms 56.0% + triton_mm_55 0.0649 ms 47.0% +SingleProcess AUTOTUNE takes 7.6732 seconds +AUTOTUNE int_mm(1x768, 768x768, 1x768) + triton_mm_802 0.0107 ms 100.0% + triton_mm_801 0.0113 ms 94.5% + triton_mm_797 0.0118 ms 90.4% + triton_mm_798 0.0118 ms 90.4% + triton_mm_800 0.0120 ms 88.7% + triton_mm_796 0.0124 ms 86.4% + triton_mm_794 0.0164 ms 65.1% + triton_mm_793 0.0170 ms 62.8% + triton_mm_792 0.0224 ms 47.6% + triton_mm_795 0.0224 ms 47.6% +SingleProcess AUTOTUNE takes 3.9529 seconds +AUTOTUNE int_mm(473x768, 768x2, 473x2) + triton_mm_812 0.0103 ms 100.0% + triton_mm_809 0.0116 ms 88.7% + triton_mm_811 0.0119 ms 86.5% + triton_mm_808 0.0120 ms 85.8% + triton_mm_813 0.0123 ms 83.4% + triton_mm_806 0.0131 ms 78.3% + triton_mm_804 0.0162 ms 63.3% + triton_mm_805 0.0175 ms 58.7% + triton_mm_807 0.0205 ms 50.2% + triton_mm_803 0.0219 ms 47.0% +SingleProcess AUTOTUNE takes 3.7576 seconds +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:01, ?it/s] +functorch_dp_cifar10 +cuda eval functorch_dp_cifar10 int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for functorch_dp_cifar10. Setting accuracy check to cosine +AUTOTUNE convolution(1x3x32x32, 64x3x7x7) + convolution 0.0201 ms 100.0% + triton_convolution_4 0.0453 ms 44.3% + triton_convolution_3 0.0510 ms 39.3% + triton_convolution_0 0.0561 ms 35.8% + triton_convolution_5 0.0712 ms 28.2% + triton_convolution_2 0.0780 ms 25.7% + triton_convolution_1 0.1437 ms 14.0% +SingleProcess AUTOTUNE takes 2.9392 seconds +AUTOTUNE convolution(1x64x8x8, 64x64x3x3) + convolution 0.0110 ms 100.0% + triton_convolution_7 0.0365 ms 30.2% + triton_convolution_8 0.0388 ms 28.4% + triton_convolution_9 0.0483 ms 22.9% + triton_convolution_6 0.0513 ms 21.5% +SingleProcess AUTOTUNE takes 1.6139 seconds +AUTOTUNE convolution(1x64x8x8, 128x64x3x3) + convolution 0.0115 ms 100.0% + triton_convolution_24 0.0323 ms 35.7% + triton_convolution_27 0.0398 ms 28.9% + triton_convolution_25 0.0446 ms 25.8% + triton_convolution_26 0.0499 ms 23.1% + triton_convolution_23 0.0526 ms 21.9% + triton_convolution_22 0.0576 ms 20.0% +SingleProcess AUTOTUNE takes 2.6443 seconds +AUTOTUNE convolution(1x128x4x4, 128x128x3x3) + convolution 0.0133 ms 100.0% + triton_convolution_32 0.0506 ms 26.3% + triton_convolution_29 0.0599 ms 22.2% + triton_convolution_28 0.0699 ms 19.0% + triton_convolution_30 0.0747 ms 17.8% + triton_convolution_31 0.0888 ms 15.0% +SingleProcess AUTOTUNE takes 1.6554 seconds +AUTOTUNE convolution(1x64x8x8, 128x64x1x1) + triton_convolution_37 0.0085 ms 100.0% + triton_convolution_35 0.0085 ms 99.3% + convolution 0.0088 ms 96.7% + triton_convolution_38 0.0088 ms 96.7% + triton_convolution_34 0.0089 ms 95.0% + triton_convolution_33 0.0092 ms 92.7% + triton_convolution_36 0.0092 ms 92.7% +SingleProcess AUTOTUNE takes 2.3924 seconds +AUTOTUNE convolution(1x128x4x4, 256x128x3x3) + convolution 0.0133 ms 100.0% + triton_convolution_53 0.0541 ms 24.6% + triton_convolution_50 0.0714 ms 18.7% + triton_convolution_51 0.0764 ms 17.4% + triton_convolution_54 0.0936 ms 14.2% + triton_convolution_52 0.0958 ms 13.9% + triton_convolution_49 0.1120 ms 11.9% +SingleProcess AUTOTUNE takes 2.0832 seconds +AUTOTUNE convolution(1x256x2x2, 256x256x3x3) + convolution 0.0188 ms 100.0% + triton_convolution_59 0.0907 ms 20.8% + triton_convolution_56 0.1254 ms 15.0% + triton_convolution_57 0.1466 ms 12.9% + triton_convolution_60 0.1630 ms 11.6% + triton_convolution_58 0.1722 ms 10.9% + triton_convolution_55 0.2004 ms 9.4% +SingleProcess AUTOTUNE takes 2.2051 seconds +AUTOTUNE convolution(1x128x4x4, 256x128x1x1) + convolution 0.0090 ms 100.0% + triton_convolution_63 0.0123 ms 73.2% + triton_convolution_62 0.0137 ms 65.8% + triton_convolution_66 0.0139 ms 64.7% + triton_convolution_65 0.0170 ms 53.0% + triton_convolution_64 0.0182 ms 49.5% + triton_convolution_61 0.0218 ms 41.2% +SingleProcess AUTOTUNE takes 2.6498 seconds +AUTOTUNE convolution(1x256x2x2, 512x256x3x3) + convolution 0.0206 ms 100.0% + triton_convolution_83 0.0924 ms 22.3% + triton_convolution_80 0.1345 ms 15.3% + triton_convolution_81 0.1580 ms 13.0% + triton_convolution_84 0.1581 ms 13.0% + triton_convolution_82 0.1728 ms 11.9% + triton_convolution_79 0.4706 ms 4.4% +SingleProcess AUTOTUNE takes 2.3403 seconds +AUTOTUNE convolution(1x512x1x1, 512x512x3x3) + convolution 0.0224 ms 100.0% + triton_convolution_89 0.2222 ms 10.1% + triton_convolution_86 0.2350 ms 9.5% + triton_convolution_87 0.3114 ms 7.2% + triton_convolution_90 0.3138 ms 7.1% + triton_convolution_88 0.9679 ms 2.3% + triton_convolution_85 1.1571 ms 1.9% +SingleProcess AUTOTUNE takes 1.9863 seconds +AUTOTUNE convolution(1x256x2x2, 512x256x1x1) + convolution 0.0092 ms 100.0% + triton_convolution_95 0.0136 ms 67.7% + triton_convolution_93 0.0162 ms 56.8% + triton_convolution_92 0.0211 ms 43.5% + triton_convolution_96 0.0229 ms 40.0% + triton_convolution_94 0.0251 ms 36.6% + triton_convolution_91 0.0399 ms 23.0% +SingleProcess AUTOTUNE takes 2.0351 seconds +AUTOTUNE int_mm(1x512, 512x1000, 1x1000) + triton_mm_119 0.0096 ms 100.0% + triton_mm_114 0.0097 ms 99.0% + triton_mm_117 0.0099 ms 97.1% + triton_mm_118 0.0101 ms 94.9% + triton_mm_115 0.0101 ms 94.6% + triton_mm_113 0.0111 ms 86.4% + triton_mm_111 0.0133 ms 71.7% + triton_mm_110 0.0137 ms 69.9% + triton_mm_109 0.0169 ms 56.7% + triton_mm_112 0.0169 ms 56.7% +SingleProcess AUTOTUNE takes 3.8031 seconds +pass-sqnr-36.648 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:01, ?it/s] +functorch_maml_omniglot +cuda eval functorch_maml_omniglot int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for functorch_maml_omniglot. Setting accuracy check to cosine +AUTOTUNE convolution(5x1x28x28, 64x1x3x3) + convolution 0.0102 ms 100.0% + triton_convolution_0 0.0106 ms 96.1% + triton_convolution_4 0.0106 ms 95.8% + triton_convolution_3 0.0129 ms 78.9% + triton_convolution_5 0.0154 ms 66.3% + triton_convolution_1 0.0159 ms 64.0% + triton_convolution_2 0.0174 ms 58.3% +SingleProcess AUTOTUNE takes 3.3881 seconds +AUTOTUNE convolution(5x64x13x13, 64x64x3x3) + convolution 0.0109 ms 100.0% + triton_convolution_11 0.0379 ms 28.8% + triton_convolution_10 0.0522 ms 20.9% + triton_convolution_6 0.0536 ms 20.4% + triton_convolution_9 0.0550 ms 19.8% + triton_convolution_12 0.0697 ms 15.7% + triton_convolution_7 0.1162 ms 9.4% + triton_convolution_8 0.1780 ms 6.1% +SingleProcess AUTOTUNE takes 3.7166 seconds +AUTOTUNE convolution(5x64x5x5, 64x64x3x3) + convolution 0.0110 ms 100.0% + triton_convolution_15 0.0353 ms 31.1% + triton_convolution_18 0.0463 ms 23.7% + triton_convolution_16 0.0531 ms 20.7% + triton_convolution_17 0.0606 ms 18.1% + triton_convolution_13 0.0688 ms 16.0% + triton_convolution_14 0.0796 ms 13.8% +SingleProcess AUTOTUNE takes 2.5114 seconds +AUTOTUNE int_mm(5x64, 64x5, 5x5) + triton_mm_21 0.0066 ms 100.0% + triton_mm_22 0.0066 ms 100.0% + triton_mm_20 0.0072 ms 92.2% + triton_mm_19 0.0074 ms 89.6% + triton_mm_23 0.0074 ms 89.6% +SingleProcess AUTOTUNE takes 1.5834 seconds +pass-sqnr-38.834 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +hf_Albert +cuda eval hf_Albert int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for hf_Albert. Setting accuracy check to cosine +AUTOTUNE int_mm(512x128, 128x768, 512x768) + triton_mm_8 0.0089 ms 100.0% + triton_mm_0 0.0092 ms 96.9% + triton_mm_1 0.0097 ms 91.4% + triton_mm_2 0.0099 ms 89.9% + triton_mm_4 0.0102 ms 86.6% + triton_mm_5 0.0104 ms 85.5% + triton_mm_3 0.0105 ms 84.7% + triton_mm_6 0.0109 ms 81.0% + triton_mm_7 0.0134 ms 66.1% + triton_mm_9 0.0276 ms 32.1% +SingleProcess AUTOTUNE takes 5.8088 seconds +AUTOTUNE int_mm(512x768, 768x768, 512x768) + triton_mm_19 0.0132 ms 100.0% + triton_mm_16 0.0169 ms 77.8% + triton_mm_15 0.0169 ms 77.7% + triton_mm_14 0.0172 ms 76.3% + triton_mm_17 0.0174 ms 75.4% + triton_mm_13 0.0193 ms 68.0% + triton_mm_12 0.0198 ms 66.4% + triton_mm_11 0.0221 ms 59.6% + triton_mm_20 0.0246 ms 53.5% + triton_mm_21 0.0248 ms 53.0% +SingleProcess AUTOTUNE takes 1.5004 seconds +AUTOTUNE int_mm(512x768, 768x3072, 512x3072) + triton_mm_63 0.0219 ms 100.0% + triton_mm_56 0.0219 ms 99.7% + triton_mm_57 0.0220 ms 99.1% + triton_mm_59 0.0226 ms 96.6% + triton_mm_58 0.0229 ms 95.3% + triton_mm_65 0.0246 ms 88.9% + triton_mm_64 0.0250 ms 87.6% + triton_mm_55 0.0264 ms 82.6% + triton_mm_62 0.0324 ms 67.6% + triton_mm_60 0.0479 ms 45.6% +SingleProcess AUTOTUNE takes 1.4887 seconds +AUTOTUNE int_mm(512x3072, 3072x768, 512x768) + triton_mm_74 0.0301 ms 100.0% + triton_mm_69 0.0421 ms 71.5% + triton_mm_70 0.0426 ms 70.6% + triton_mm_71 0.0429 ms 70.2% + triton_mm_72 0.0436 ms 69.0% + triton_mm_75 0.0515 ms 58.5% + triton_mm_76 0.0520 ms 57.9% + triton_mm_67 0.0532 ms 56.7% + triton_mm_68 0.0534 ms 56.4% + triton_mm_66 0.0661 ms 45.5% +SingleProcess AUTOTUNE takes 1.4677 seconds +AUTOTUNE int_mm(512x768, 768x128, 512x128) + triton_mm_808 0.0127 ms 100.0% + triton_mm_811 0.0129 ms 98.0% + triton_mm_809 0.0135 ms 94.1% + triton_mm_812 0.0152 ms 83.2% + triton_mm_807 0.0173 ms 73.2% + triton_mm_806 0.0175 ms 72.5% + triton_mm_804 0.0194 ms 65.5% + triton_mm_805 0.0198 ms 63.9% + triton_mm_803 0.0224 ms 56.5% + triton_mm_813 0.0244 ms 51.8% +SingleProcess AUTOTUNE takes 6.3844 seconds +AUTOTUNE int_mm(512x128, 128x30000, 512x30000) + triton_mm_815 0.0488 ms 100.0% + triton_mm_816 0.0537 ms 91.0% + triton_mm_814 0.0564 ms 86.7% + triton_mm_821 0.0568 ms 85.9% + triton_mm_822 0.0577 ms 84.7% + triton_mm_818 0.0578 ms 84.5% + triton_mm_817 0.0600 ms 81.3% + triton_mm_819 0.1187 ms 41.1% + triton_mm_823 0.1202 ms 40.6% + triton_mm_820 0.1295 ms 37.7% +SingleProcess AUTOTUNE takes 5.8059 seconds +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:09, ?it/s] +hf_Bart +cuda eval hf_Bart int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for hf_Bart. Setting accuracy check to cosine +AUTOTUNE bmm(12x512x64, 12x64x512) + triton_bmm_420 0.0135 ms 100.0% + triton_bmm_426 0.0135 ms 99.5% + bmm 0.0136 ms 99.3% + triton_bmm_419 0.0136 ms 98.9% + triton_bmm_421 0.0141 ms 95.5% + triton_bmm_422 0.0146 ms 92.5% + triton_bmm_418 0.0148 ms 90.7% + triton_bmm_425 0.0152 ms 88.6% + triton_bmm_428 0.0160 ms 84.2% + triton_bmm_427 0.0169 ms 79.6% +SingleProcess AUTOTUNE takes 4.6559 seconds +AUTOTUNE bmm(12x512x512, 12x512x64) + triton_bmm_449 0.0144 ms 100.0% + triton_bmm_444 0.0145 ms 99.3% + triton_bmm_447 0.0148 ms 97.4% + bmm 0.0150 ms 95.5% + triton_bmm_450 0.0151 ms 94.9% + triton_bmm_445 0.0153 ms 93.9% + triton_bmm_446 0.0154 ms 93.3% + triton_bmm_442 0.0162 ms 88.7% + triton_bmm_443 0.0173 ms 82.8% + triton_bmm_441 0.0236 ms 60.8% +SingleProcess AUTOTUNE takes 5.1954 seconds +AUTOTUNE int_mm(512x768, 768x50265, 512x50265) + triton_mm_1209 0.1855 ms 100.0% + triton_mm_1201 0.2068 ms 89.7% + triton_mm_1210 0.2143 ms 86.6% + triton_mm_1202 0.2379 ms 78.0% + triton_mm_1208 0.2542 ms 73.0% + triton_mm_1204 0.2607 ms 71.1% + triton_mm_1203 0.2613 ms 71.0% + triton_mm_1200 0.2795 ms 66.4% + triton_mm_1207 0.4106 ms 45.2% + triton_mm_1205 0.6068 ms 30.6% +SingleProcess AUTOTUNE takes 8.8618 seconds +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:05, ?it/s] +hf_BigBird +cuda eval hf_BigBird int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for hf_BigBird. Setting accuracy check to cosine +[2023-12-12 22:25:33,424] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +AUTOTUNE int_mm(4096x768, 768x768, 4096x768) + triton_mm_9 0.0265 ms 100.0% + triton_mm_10 0.0270 ms 98.2% + triton_mm_7 0.0353 ms 75.1% + triton_mm_8 0.0361 ms 73.3% + triton_mm_2 0.0362 ms 73.2% + triton_mm_1 0.0367 ms 72.2% + triton_mm_4 0.0378 ms 70.2% + triton_mm_3 0.0408 ms 64.9% + triton_mm_0 0.0431 ms 61.5% + triton_mm_5 0.0841 ms 31.5% +SingleProcess AUTOTUNE takes 7.4791 seconds +AUTOTUNE bmm(12x64x64, 12x64x4096) + triton_bmm_30 0.0156 ms 100.0% + triton_bmm_23 0.0170 ms 91.5% + triton_bmm_24 0.0177 ms 88.2% + triton_bmm_25 0.0179 ms 87.3% + triton_bmm_26 0.0183 ms 85.0% + triton_bmm_31 0.0185 ms 84.1% + triton_bmm_22 0.0194 ms 80.2% + triton_bmm_29 0.0197 ms 78.9% + triton_bmm_32 0.0204 ms 76.5% + bmm 0.0211 ms 73.9% +SingleProcess AUTOTUNE takes 4.2582 seconds +AUTOTUNE bmm(12x64x4096, 12x4096x64) + bmm 0.0264 ms 100.0% + triton_bmm_49 0.0381 ms 69.3% + triton_bmm_48 0.0404 ms 65.3% + triton_bmm_52 0.0438 ms 60.2% + triton_bmm_51 0.0448 ms 59.0% + triton_bmm_47 0.0492 ms 53.7% + triton_bmm_46 0.0665 ms 39.7% + triton_bmm_45 0.1191 ms 22.2% + triton_bmm_50 0.1235 ms 21.4% + triton_bmm_54 0.1842 ms 14.3% +SingleProcess AUTOTUNE takes 3.8551 seconds +AUTOTUNE bmm(12x64x64, 12x64x448) + triton_bmm_57 0.0078 ms 100.0% + triton_bmm_56 0.0081 ms 96.8% + triton_bmm_64 0.0081 ms 96.8% + triton_bmm_58 0.0086 ms 91.0% + triton_bmm_59 0.0086 ms 91.0% + triton_bmm_63 0.0086 ms 91.0% + triton_bmm_55 0.0087 ms 90.0% + triton_bmm_65 0.0088 ms 89.1% + triton_bmm_60 0.0088 ms 88.6% + triton_bmm_61 0.0089 ms 88.1% +SingleProcess AUTOTUNE takes 4.1568 seconds +AUTOTUNE bmm(12x64x448, 12x448x64) + triton_bmm_70 0.0104 ms 100.0% + triton_bmm_71 0.0106 ms 98.5% + triton_bmm_73 0.0109 ms 95.3% + bmm 0.0110 ms 94.6% + triton_bmm_74 0.0112 ms 93.1% + triton_bmm_69 0.0112 ms 92.9% + triton_bmm_68 0.0133 ms 78.3% + triton_bmm_72 0.0193 ms 53.8% + triton_bmm_67 0.0196 ms 53.1% + triton_bmm_76 0.0228 ms 45.7% +SingleProcess AUTOTUNE takes 3.5317 seconds +AUTOTUNE bmm(12x3840x64, 12x64x64) + triton_bmm_85 0.0147 ms 100.0% + triton_bmm_79 0.0171 ms 86.3% + triton_bmm_78 0.0171 ms 86.1% + triton_bmm_80 0.0172 ms 85.5% + triton_bmm_86 0.0177 ms 83.0% + triton_bmm_81 0.0180 ms 81.6% + triton_bmm_77 0.0184 ms 80.0% + triton_bmm_87 0.0193 ms 76.4% + triton_bmm_84 0.0197 ms 74.7% + bmm 0.0198 ms 74.3% +SingleProcess AUTOTUNE takes 4.1491 seconds +AUTOTUNE bmm(720x64x64, 720x64x192) + triton_bmm_97 0.0365 ms 100.0% + triton_bmm_91 0.0391 ms 93.4% + triton_bmm_93 0.0403 ms 90.8% + triton_bmm_90 0.0407 ms 89.8% + triton_bmm_98 0.0416 ms 87.8% + triton_bmm_89 0.0418 ms 87.4% + triton_bmm_92 0.0425 ms 86.1% + triton_bmm_99 0.0431 ms 84.7% + triton_bmm_96 0.0461 ms 79.2% + triton_bmm_100 0.0465 ms 78.6% +SingleProcess AUTOTUNE takes 4.0983 seconds +AUTOTUNE bmm(12x3840x64, 12x64x64) + triton_bmm_121 0.0151 ms 100.0% + triton_bmm_114 0.0171 ms 88.4% + triton_bmm_115 0.0171 ms 88.0% + triton_bmm_117 0.0175 ms 86.1% + triton_bmm_116 0.0178 ms 84.7% + triton_bmm_122 0.0182 ms 82.8% + triton_bmm_113 0.0184 ms 81.8% + triton_bmm_123 0.0192 ms 78.4% + bmm 0.0197 ms 76.6% + triton_bmm_120 0.0197 ms 76.5% +SingleProcess AUTOTUNE takes 4.3019 seconds +AUTOTUNE bmm(720x64x192, 720x192x64) + triton_bmm_126 0.0431 ms 100.0% + triton_bmm_131 0.0436 ms 99.0% + triton_bmm_127 0.0443 ms 97.4% + triton_bmm_125 0.0466 ms 92.6% + triton_bmm_128 0.0471 ms 91.6% + triton_bmm_133 0.0475 ms 90.8% + triton_bmm_130 0.0476 ms 90.7% + bmm 0.0482 ms 89.5% + triton_bmm_129 0.0484 ms 89.2% + triton_bmm_132 0.0507 ms 85.1% +SingleProcess AUTOTUNE takes 3.6210 seconds +AUTOTUNE bmm(720x64x192, 720x192x64) + triton_bmm_136 0.0435 ms 100.0% + triton_bmm_141 0.0435 ms 100.0% + triton_bmm_137 0.0447 ms 97.2% + triton_bmm_135 0.0461 ms 94.4% + triton_bmm_138 0.0472 ms 92.2% + triton_bmm_143 0.0478 ms 91.0% + triton_bmm_140 0.0478 ms 91.0% + bmm 0.0483 ms 90.0% + triton_bmm_139 0.0489 ms 88.9% + triton_bmm_142 0.0511 ms 85.2% +SingleProcess AUTOTUNE takes 3.4316 seconds +AUTOTUNE bmm(12x3840x64, 12x64x64) + triton_bmm_153 0.0148 ms 100.0% + triton_bmm_146 0.0163 ms 90.8% + triton_bmm_147 0.0167 ms 88.5% + triton_bmm_149 0.0180 ms 82.4% + triton_bmm_148 0.0181 ms 81.5% + triton_bmm_155 0.0182 ms 81.2% + triton_bmm_145 0.0185 ms 80.1% + triton_bmm_152 0.0187 ms 79.2% + triton_bmm_154 0.0196 ms 75.6% + triton_bmm_151 0.0204 ms 72.6% +SingleProcess AUTOTUNE takes 4.1785 seconds +AUTOTUNE bmm(12x3840x64, 12x64x64) + triton_bmm_165 0.0153 ms 100.0% + triton_bmm_158 0.0164 ms 92.8% + triton_bmm_159 0.0171 ms 89.5% + triton_bmm_161 0.0172 ms 88.5% + triton_bmm_160 0.0177 ms 86.4% + triton_bmm_167 0.0179 ms 85.5% + triton_bmm_164 0.0181 ms 84.3% + triton_bmm_157 0.0185 ms 82.5% + triton_bmm_166 0.0195 ms 78.2% + triton_bmm_163 0.0207 ms 73.8% +SingleProcess AUTOTUNE takes 4.6577 seconds +AUTOTUNE bmm(12x64x64, 12x64x448) + triton_bmm_173 0.0078 ms 100.0% + triton_bmm_169 0.0081 ms 96.8% + triton_bmm_170 0.0081 ms 96.8% + triton_bmm_178 0.0081 ms 96.8% + triton_bmm_171 0.0083 ms 93.8% + triton_bmm_174 0.0083 ms 93.8% + triton_bmm_175 0.0084 ms 93.5% + triton_bmm_177 0.0085 ms 92.1% + triton_bmm_176 0.0088 ms 88.4% + triton_bmm_172 0.0091 ms 85.9% +SingleProcess AUTOTUNE takes 4.0885 seconds +AUTOTUNE bmm(12x64x64, 12x64x4096) + triton_bmm_199 0.0156 ms 100.0% + triton_bmm_192 0.0174 ms 89.4% + triton_bmm_193 0.0176 ms 88.7% + triton_bmm_194 0.0179 ms 87.3% + triton_bmm_195 0.0184 ms 84.8% + triton_bmm_200 0.0190 ms 82.1% + triton_bmm_191 0.0194 ms 80.4% + triton_bmm_201 0.0199 ms 78.2% + triton_bmm_198 0.0202 ms 77.2% + bmm 0.0206 ms 75.6% +SingleProcess AUTOTUNE takes 4.0668 seconds +AUTOTUNE int_mm(4096x768, 768x3072, 4096x3072) + triton_mm_233 0.0841 ms 100.0% + triton_mm_234 0.0856 ms 98.2% + triton_mm_225 0.0982 ms 85.7% + triton_mm_226 0.1006 ms 83.6% + triton_mm_231 0.1071 ms 78.5% + triton_mm_232 0.1109 ms 75.9% + triton_mm_228 0.1168 ms 72.0% + triton_mm_227 0.1194 ms 70.5% + triton_mm_224 0.1337 ms 62.9% + triton_mm_229 0.3009 ms 28.0% +SingleProcess AUTOTUNE takes 7.5792 seconds +AUTOTUNE int_mm(4096x3072, 3072x768, 4096x768) + triton_mm_244 0.0544 ms 100.0% + triton_mm_245 0.0549 ms 99.2% + triton_mm_243 0.0977 ms 55.7% + triton_mm_242 0.1049 ms 51.9% + triton_mm_239 0.1050 ms 51.9% + triton_mm_237 0.1056 ms 51.6% + triton_mm_236 0.1072 ms 50.8% + triton_mm_238 0.1138 ms 47.9% + triton_mm_235 0.1441 ms 37.8% + triton_mm_240 0.2748 ms 19.8% +SingleProcess AUTOTUNE takes 7.7067 seconds +[2023-12-12 22:26:51,360] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-12 22:26:52,918] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-12 22:26:54,469] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-12 22:26:56,022] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-12 22:26:57,568] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-12 22:26:59,510] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-12 22:27:01,073] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-12 22:27:02,653] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-12 22:27:04,217] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-12 22:27:05,753] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-12 22:27:07,299] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +AUTOTUNE int_mm(4096x768, 768x50358, 4096x50358) + triton_mm_2973 1.2365 ms 100.0% + triton_mm_2972 1.3005 ms 95.1% + triton_mm_2964 1.5030 ms 82.3% + triton_mm_2965 1.5183 ms 81.4% + triton_mm_2971 1.7909 ms 69.0% + triton_mm_2970 1.7980 ms 68.8% + triton_mm_2966 1.8714 ms 66.1% + triton_mm_2967 1.9231 ms 64.3% + triton_mm_2963 2.0007 ms 61.8% + triton_mm_2968 4.8017 ms 25.8% +SingleProcess AUTOTUNE takes 8.5686 seconds +skipping cudagraphs due to ['non-cuda device in graph'] +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +hf_DistilBert +cuda eval hf_DistilBert int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for hf_DistilBert. Setting accuracy check to cosine +AUTOTUNE bmm(12x512x64, 12x64x512) + triton_bmm_24 0.0131 ms 100.0% + triton_bmm_23 0.0133 ms 98.3% + bmm 0.0134 ms 97.6% + triton_bmm_30 0.0144 ms 90.7% + triton_bmm_25 0.0145 ms 90.3% + triton_bmm_26 0.0146 ms 89.7% + triton_bmm_22 0.0149 ms 87.6% + triton_bmm_32 0.0157 ms 83.1% + triton_bmm_29 0.0158 ms 82.8% + triton_bmm_31 0.0170 ms 77.2% +SingleProcess AUTOTUNE takes 4.2588 seconds +AUTOTUNE bmm(12x512x512, 12x512x64) + triton_bmm_48 0.0143 ms 100.0% + triton_bmm_54 0.0144 ms 98.9% + triton_bmm_53 0.0146 ms 97.7% + triton_bmm_51 0.0148 ms 96.3% + bmm 0.0151 ms 94.5% + triton_bmm_50 0.0153 ms 93.1% + triton_bmm_49 0.0156 ms 91.8% + triton_bmm_46 0.0164 ms 86.9% + triton_bmm_47 0.0172 ms 83.1% + triton_bmm_45 0.0239 ms 59.6% +SingleProcess AUTOTUNE takes 4.3605 seconds +AUTOTUNE int_mm(512x768, 768x30522, 512x30522) + triton_mm_560 0.1216 ms 100.0% + triton_mm_561 0.1230 ms 98.8% + triton_mm_552 0.1293 ms 94.0% + triton_mm_553 0.1353 ms 89.9% + triton_mm_559 0.1461 ms 83.2% + triton_mm_554 0.1538 ms 79.0% + triton_mm_558 0.1556 ms 78.1% + triton_mm_555 0.1573 ms 77.3% + triton_mm_551 0.1748 ms 69.6% + triton_mm_556 0.3711 ms 32.8% +SingleProcess AUTOTUNE takes 7.8405 seconds +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:06, ?it/s] +hf_GPT2 +cuda eval hf_GPT2 int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for hf_GPT2. Setting accuracy check to cosine +AUTOTUNE addmm(1024x2304, 1024x768, 768x2304) + triton_mm_1 0.0285 ms 100.0% + triton_mm_2 0.0289 ms 98.9% + triton_mm_3 0.0329 ms 86.8% + triton_mm_4 0.0338 ms 84.4% + triton_mm_0 0.0383 ms 74.5% + addmm 0.0395 ms 72.3% + triton_mm_8 0.0396 ms 72.0% + triton_mm_10 0.0468 ms 61.1% + triton_mm_6 0.0602 ms 47.4% + triton_mm_5 0.0629 ms 45.4% +SingleProcess AUTOTUNE takes 5.4727 seconds +AUTOTUNE bmm(12x1024x64, 12x64x1024) + triton_bmm_13 0.0284 ms 100.0% + triton_bmm_14 0.0291 ms 97.8% + triton_bmm_20 0.0323 ms 88.0% + triton_bmm_15 0.0326 ms 87.2% + triton_bmm_16 0.0327 ms 87.1% + triton_bmm_12 0.0358 ms 79.5% + triton_bmm_22 0.0380 ms 74.8% + triton_bmm_19 0.0401 ms 71.0% + triton_bmm_21 0.0465 ms 61.2% + bmm 0.0550 ms 51.7% +SingleProcess AUTOTUNE takes 4.5887 seconds +AUTOTUNE bmm(12x1024x1024, 12x1024x64) + triton_bmm_32 0.0355 ms 100.0% + bmm 0.0358 ms 99.2% + triton_bmm_28 0.0362 ms 98.1% + triton_bmm_27 0.0367 ms 96.7% + triton_bmm_25 0.0382 ms 93.0% + triton_bmm_26 0.0393 ms 90.5% + triton_bmm_30 0.0403 ms 88.1% + triton_bmm_33 0.0415 ms 85.6% + triton_bmm_29 0.0431 ms 82.5% + triton_bmm_24 0.0539 ms 65.9% +SingleProcess AUTOTUNE takes 4.9336 seconds +AUTOTUNE mm(1024x768, 768x768) + mm 0.0155 ms 100.0% + triton_mm_40 0.0161 ms 96.0% + triton_mm_39 0.0164 ms 94.2% + triton_mm_44 0.0180 ms 85.9% + triton_mm_37 0.0188 ms 82.1% + triton_mm_38 0.0194 ms 79.7% + triton_mm_42 0.0266 ms 58.1% + triton_mm_45 0.0266 ms 58.1% + triton_mm_41 0.0272 ms 56.8% + triton_mm_36 0.0295 ms 52.4% +SingleProcess AUTOTUNE takes 4.7526 seconds +AUTOTUNE mm(1024x768, 768x3072) + mm 0.0291 ms 100.0% + triton_mm_49 0.0384 ms 75.8% + triton_mm_51 0.0394 ms 73.9% + triton_mm_50 0.0404 ms 72.1% + triton_mm_52 0.0405 ms 71.8% + triton_mm_56 0.0482 ms 60.4% + triton_mm_48 0.0516 ms 56.4% + triton_mm_55 0.0622 ms 46.8% + triton_mm_58 0.0645 ms 45.2% + triton_mm_54 0.0772 ms 37.7% +SingleProcess AUTOTUNE takes 4.7170 seconds +AUTOTUNE mm(1024x3072, 3072x768) + mm 0.0335 ms 100.0% + triton_mm_63 0.0421 ms 79.6% + triton_mm_64 0.0428 ms 78.3% + triton_mm_68 0.0449 ms 74.6% + triton_mm_61 0.0547 ms 61.3% + triton_mm_62 0.0576 ms 58.3% + triton_mm_69 0.0766 ms 43.8% + triton_mm_66 0.0784 ms 42.8% + triton_mm_65 0.0795 ms 42.2% + triton_mm_60 0.0898 ms 37.3% +SingleProcess AUTOTUNE takes 4.6375 seconds +AUTOTUNE int_mm(1024x768, 768x50257, 1024x50257) + triton_mm_873 0.3407 ms 100.0% + triton_mm_874 0.3936 ms 86.6% + triton_mm_865 0.3978 ms 85.7% + triton_mm_866 0.4408 ms 77.3% + triton_mm_872 0.4978 ms 68.4% + triton_mm_867 0.5047 ms 67.5% + triton_mm_868 0.5089 ms 67.0% + triton_mm_864 0.5328 ms 64.0% + triton_mm_871 0.7737 ms 44.0% + triton_mm_869 1.2033 ms 28.3% +SingleProcess AUTOTUNE takes 8.7797 seconds +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:19, ?it/s] +hf_GPT2_large +cuda eval hf_GPT2_large int8dynamic-bs1-acc +pass_due_to_skip + loading model: 0it [00:00, ?it/s] loading model: 0it [00:06, ?it/s] +hf_Longformer +cuda eval hf_Longformer int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for hf_Longformer. Setting accuracy check to cosine +AUTOTUNE bmm(180x512x64, 180x64x512) + triton_bmm_23 0.0888 ms 100.0% + triton_bmm_24 0.0921 ms 96.4% + bmm 0.0978 ms 90.8% + triton_bmm_30 0.1016 ms 87.4% + triton_bmm_25 0.1051 ms 84.6% + triton_bmm_26 0.1081 ms 82.2% + triton_bmm_32 0.1141 ms 77.9% + triton_bmm_22 0.1157 ms 76.8% + triton_bmm_29 0.1344 ms 66.1% + triton_bmm_33 0.1654 ms 53.7% +SingleProcess AUTOTUNE takes 4.3214 seconds +AUTOTUNE bmm(192x256x768, 192x768x64) + triton_bmm_46 0.1107 ms 100.0% + triton_bmm_48 0.1139 ms 97.2% + triton_bmm_49 0.1140 ms 97.1% + triton_bmm_53 0.1166 ms 94.9% + triton_bmm_51 0.1259 ms 87.9% + triton_bmm_45 0.1285 ms 86.2% + triton_bmm_52 0.1352 ms 81.9% + triton_bmm_47 0.1420 ms 78.0% + bmm 0.1542 ms 71.8% + triton_bmm_54 0.1562 ms 70.9% +SingleProcess AUTOTUNE takes 4.2080 seconds +AUTOTUNE int_mm(4096x768, 768x50265, 4096x50265) + triton_mm_1100 1.2870 ms 100.0% + triton_mm_1101 1.4776 ms 87.1% + triton_mm_1092 1.5354 ms 83.8% + triton_mm_1093 1.6773 ms 76.7% + triton_mm_1099 1.9561 ms 65.8% + triton_mm_1094 1.9654 ms 65.5% + triton_mm_1095 1.9971 ms 64.4% + triton_mm_1091 2.0363 ms 63.2% + triton_mm_1098 3.0153 ms 42.7% + triton_mm_1096 4.7810 ms 26.9% +SingleProcess AUTOTUNE takes 8.6112 seconds +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +hf_Reformer +cuda eval hf_Reformer int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for hf_Reformer. Setting accuracy check to cosine +AUTOTUNE int_mm(4096x256, 256x768, 4096x768) + triton_mm_10 0.0203 ms 100.0% + triton_mm_9 0.0209 ms 97.4% + triton_mm_7 0.0210 ms 96.6% + triton_mm_2 0.0213 ms 95.5% + triton_mm_4 0.0220 ms 92.6% + triton_mm_8 0.0221 ms 91.9% + triton_mm_0 0.0223 ms 91.1% + triton_mm_1 0.0223 ms 91.1% + triton_mm_3 0.0254 ms 80.0% + triton_mm_5 0.0419 ms 48.5% +SingleProcess AUTOTUNE takes 7.3487 seconds +AUTOTUNE bmm(768x64x64, 768x64x128) + triton_bmm_30 0.0305 ms 100.0% + triton_bmm_23 0.0316 ms 96.7% + triton_bmm_25 0.0318 ms 95.9% + triton_bmm_24 0.0322 ms 94.6% + triton_bmm_29 0.0328 ms 93.1% + triton_bmm_22 0.0328 ms 93.0% + triton_bmm_26 0.0330 ms 92.4% + triton_bmm_32 0.0332 ms 91.7% + bmm 0.0335 ms 91.0% + triton_bmm_33 0.0356 ms 85.7% +SingleProcess AUTOTUNE takes 4.2838 seconds +AUTOTUNE bmm(768x64x128, 768x128x64) + triton_bmm_51 0.0340 ms 100.0% + triton_bmm_46 0.0343 ms 99.2% + triton_bmm_50 0.0352 ms 96.6% + triton_bmm_47 0.0354 ms 96.0% + triton_bmm_45 0.0356 ms 95.7% + triton_bmm_49 0.0371 ms 91.6% + bmm 0.0374 ms 90.9% + triton_bmm_48 0.0379 ms 89.9% + triton_bmm_53 0.0389 ms 87.5% + triton_bmm_52 0.0414 ms 82.1% +SingleProcess AUTOTUNE takes 3.6929 seconds +AUTOTUNE int_mm(4096x768, 768x256, 4096x256) + triton_mm_63 0.0181 ms 100.0% + triton_mm_59 0.0219 ms 82.5% + triton_mm_56 0.0222 ms 81.5% + triton_mm_57 0.0224 ms 80.8% + triton_mm_58 0.0242 ms 74.8% + triton_mm_65 0.0246 ms 73.4% + triton_mm_55 0.0253 ms 71.5% + triton_mm_64 0.0253 ms 71.5% + triton_mm_62 0.0326 ms 55.4% + triton_mm_60 0.0332 ms 54.4% +SingleProcess AUTOTUNE takes 6.9953 seconds +AUTOTUNE int_mm(4096x256, 256x512, 4096x512) + triton_mm_67 0.0168 ms 100.0% + triton_mm_74 0.0171 ms 98.1% + triton_mm_68 0.0175 ms 95.8% + triton_mm_66 0.0176 ms 95.5% + triton_mm_70 0.0188 ms 89.6% + triton_mm_69 0.0194 ms 86.5% + triton_mm_76 0.0195 ms 86.3% + triton_mm_75 0.0196 ms 85.5% + triton_mm_73 0.0205 ms 82.0% + triton_mm_71 0.0304 ms 55.2% +SingleProcess AUTOTUNE takes 7.2575 seconds +AUTOTUNE int_mm(4096x512, 512x256, 4096x256) + triton_mm_85 0.0153 ms 100.0% + triton_mm_79 0.0178 ms 86.1% + triton_mm_78 0.0181 ms 84.5% + triton_mm_81 0.0188 ms 81.6% + triton_mm_80 0.0196 ms 78.0% + triton_mm_77 0.0200 ms 76.6% + triton_mm_87 0.0215 ms 71.1% + triton_mm_86 0.0216 ms 70.7% + triton_mm_84 0.0252 ms 60.7% + triton_mm_82 0.0253 ms 60.4% +SingleProcess AUTOTUNE takes 7.2848 seconds +AUTOTUNE bmm(12x4096x64, 12x64x64) + triton_bmm_107 0.0160 ms 100.0% + triton_bmm_101 0.0170 ms 94.0% + triton_bmm_100 0.0175 ms 91.1% + triton_bmm_103 0.0184 ms 86.8% + triton_bmm_106 0.0184 ms 86.6% + triton_bmm_99 0.0190 ms 84.1% + triton_bmm_102 0.0192 ms 83.2% + triton_bmm_108 0.0194 ms 82.2% + triton_bmm_109 0.0196 ms 81.7% + triton_bmm_110 0.0214 ms 74.5% +SingleProcess AUTOTUNE takes 4.4524 seconds +AUTOTUNE bmm(768x64x64, 768x64x128) + triton_bmm_119 0.0300 ms 100.0% + triton_bmm_112 0.0317 ms 94.5% + triton_bmm_113 0.0324 ms 92.6% + triton_bmm_115 0.0332 ms 90.2% + triton_bmm_114 0.0335 ms 89.5% + triton_bmm_120 0.0338 ms 88.7% + triton_bmm_111 0.0344 ms 87.0% + bmm 0.0347 ms 86.3% + triton_bmm_118 0.0347 ms 86.3% + triton_bmm_121 0.0348 ms 86.0% +SingleProcess AUTOTUNE takes 4.1536 seconds +skipping cudagraphs due to ['incompatible ops'] +AUTOTUNE int_mm(4096x512, 512x320, 4096x320) + triton_mm_539 0.0155 ms 100.0% + triton_mm_533 0.0179 ms 86.8% + triton_mm_532 0.0184 ms 84.3% + triton_mm_535 0.0188 ms 82.5% + triton_mm_531 0.0202 ms 77.0% + triton_mm_534 0.0203 ms 76.5% + triton_mm_540 0.0218 ms 71.1% + triton_mm_541 0.0223 ms 69.5% + triton_mm_538 0.0251 ms 61.9% + triton_mm_536 0.0297 ms 52.2% +SingleProcess AUTOTUNE takes 7.3928 seconds +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +hf_T5 +cuda eval hf_T5 int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for hf_T5. Setting accuracy check to cosine +AUTOTUNE int_mm(2048x512, 512x512, 2048x512) + triton_mm_8 0.0150 ms 100.0% + triton_mm_1 0.0175 ms 85.6% + triton_mm_2 0.0176 ms 84.9% + triton_mm_4 0.0186 ms 80.6% + triton_mm_3 0.0187 ms 80.3% + triton_mm_0 0.0191 ms 78.4% + triton_mm_10 0.0213 ms 70.3% + triton_mm_9 0.0215 ms 69.7% + triton_mm_7 0.0241 ms 62.1% + triton_mm_5 0.0252 ms 59.4% +SingleProcess AUTOTUNE takes 7.1491 seconds +AUTOTUNE bmm(8x2048x64, 8x64x2048) + triton_bmm_23 0.0587 ms 100.0% + triton_bmm_24 0.0589 ms 99.6% + triton_bmm_25 0.0692 ms 84.7% + triton_bmm_26 0.0697 ms 84.2% + triton_bmm_30 0.0729 ms 80.4% + triton_bmm_22 0.0772 ms 76.0% + triton_bmm_29 0.0839 ms 69.9% + triton_bmm_32 0.0848 ms 69.2% + triton_bmm_31 0.1096 ms 53.5% + bmm 0.1308 ms 44.8% +SingleProcess AUTOTUNE takes 4.3748 seconds +AUTOTUNE bmm(8x2048x2048, 8x2048x64) + triton_bmm_53 0.0686 ms 100.0% + bmm 0.0698 ms 98.3% + triton_bmm_46 0.0701 ms 97.9% + triton_bmm_47 0.0708 ms 96.9% + triton_bmm_48 0.0720 ms 95.3% + triton_bmm_49 0.0736 ms 93.2% + triton_bmm_51 0.0779 ms 88.1% + triton_bmm_50 0.0804 ms 85.4% + triton_bmm_45 0.0880 ms 78.0% + triton_bmm_54 0.0890 ms 77.1% +SingleProcess AUTOTUNE takes 4.2810 seconds +AUTOTUNE int_mm(2048x512, 512x2048, 2048x2048) + triton_mm_148 0.0319 ms 100.0% + triton_mm_149 0.0330 ms 96.7% + triton_mm_155 0.0339 ms 94.1% + triton_mm_151 0.0367 ms 86.9% + triton_mm_150 0.0380 ms 84.0% + triton_mm_156 0.0386 ms 82.5% + triton_mm_157 0.0389 ms 81.8% + triton_mm_147 0.0400 ms 79.6% + triton_mm_154 0.0424 ms 75.2% + triton_mm_152 0.0771 ms 41.3% +SingleProcess AUTOTUNE takes 7.3824 seconds +AUTOTUNE int_mm(2048x2048, 2048x512, 2048x512) + triton_mm_166 0.0306 ms 100.0% + triton_mm_168 0.0396 ms 77.1% + triton_mm_167 0.0403 ms 75.9% + triton_mm_159 0.0408 ms 75.0% + triton_mm_160 0.0414 ms 73.9% + triton_mm_161 0.0424 ms 72.1% + triton_mm_162 0.0434 ms 70.4% + triton_mm_158 0.0508 ms 60.2% + triton_mm_163 0.0668 ms 45.8% + triton_mm_165 0.0678 ms 45.1% +SingleProcess AUTOTUNE takes 7.3242 seconds +AUTOTUNE int_mm(2048x512, 512x32128, 2048x32128) + triton_mm_1497 0.3118 ms 100.0% + triton_mm_1498 0.3160 ms 98.7% + triton_mm_1489 0.3540 ms 88.1% + triton_mm_1490 0.3578 ms 87.1% + triton_mm_1496 0.4071 ms 76.6% + triton_mm_1495 0.4097 ms 76.1% + triton_mm_1492 0.4248 ms 73.4% + triton_mm_1491 0.4358 ms 71.5% + triton_mm_1488 0.4514 ms 69.1% + triton_mm_1493 1.0957 ms 28.5% +SingleProcess AUTOTUNE takes 7.5926 seconds +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:07, ?it/s] +hf_T5_base +cuda eval hf_T5_base int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for hf_T5_base. Setting accuracy check to cosine +AUTOTUNE int_mm(2048x768, 768x768, 2048x768) + triton_mm_2 0.0213 ms 100.0% + triton_mm_8 0.0214 ms 99.9% + triton_mm_1 0.0219 ms 97.6% + triton_mm_4 0.0225 ms 94.7% + triton_mm_3 0.0232 ms 91.9% + triton_mm_9 0.0243 ms 87.8% + triton_mm_10 0.0250 ms 85.5% + triton_mm_0 0.0257 ms 83.1% + triton_mm_7 0.0316 ms 67.6% + triton_mm_5 0.0482 ms 44.3% +SingleProcess AUTOTUNE takes 7.3086 seconds +AUTOTUNE bmm(12x2048x64, 12x64x2048) + triton_bmm_23 0.0860 ms 100.0% + triton_bmm_24 0.0868 ms 99.0% + triton_bmm_25 0.1002 ms 85.8% + triton_bmm_26 0.1005 ms 85.5% + triton_bmm_30 0.1060 ms 81.1% + triton_bmm_22 0.1146 ms 75.0% + triton_bmm_32 0.1227 ms 70.1% + triton_bmm_29 0.1258 ms 68.3% + triton_bmm_31 0.1612 ms 53.3% + bmm 0.1919 ms 44.8% +SingleProcess AUTOTUNE takes 4.3234 seconds +AUTOTUNE bmm(12x2048x2048, 12x2048x64) + bmm 0.0887 ms 100.0% + triton_bmm_49 0.0893 ms 99.4% + triton_bmm_47 0.0894 ms 99.2% + triton_bmm_46 0.0898 ms 98.8% + triton_bmm_48 0.0952 ms 93.1% + triton_bmm_53 0.0980 ms 90.5% + triton_bmm_52 0.1014 ms 87.5% + triton_bmm_45 0.1023 ms 86.7% + triton_bmm_51 0.1108 ms 80.1% + triton_bmm_50 0.1150 ms 77.2% +SingleProcess AUTOTUNE takes 4.5213 seconds +AUTOTUNE int_mm(2048x768, 768x3072, 2048x3072) + triton_mm_156 0.0471 ms 100.0% + triton_mm_157 0.0474 ms 99.4% + triton_mm_148 0.0556 ms 84.8% + triton_mm_149 0.0571 ms 82.5% + triton_mm_154 0.0600 ms 78.5% + triton_mm_155 0.0605 ms 77.8% + triton_mm_151 0.0661 ms 71.2% + triton_mm_150 0.0674 ms 69.9% + triton_mm_147 0.0727 ms 64.8% + triton_mm_152 0.1562 ms 30.2% +SingleProcess AUTOTUNE takes 7.6661 seconds +AUTOTUNE int_mm(2048x3072, 3072x768, 2048x768) + triton_mm_167 0.0522 ms 100.0% + triton_mm_168 0.0523 ms 99.8% + triton_mm_166 0.0531 ms 98.4% + triton_mm_160 0.0568 ms 91.9% + triton_mm_159 0.0571 ms 91.4% + triton_mm_162 0.0581 ms 89.9% + triton_mm_161 0.0598 ms 87.3% + triton_mm_158 0.0767 ms 68.0% + triton_mm_165 0.0969 ms 53.9% + triton_mm_163 0.1498 ms 34.8% +SingleProcess AUTOTUNE takes 7.5175 seconds +AUTOTUNE int_mm(2048x768, 768x32128, 2048x32128) + triton_mm_2985 0.3764 ms 100.0% + triton_mm_2986 0.3816 ms 98.7% + triton_mm_2978 0.4657 ms 80.8% + triton_mm_2977 0.4678 ms 80.5% + triton_mm_2984 0.5402 ms 69.7% + triton_mm_2983 0.5504 ms 68.4% + triton_mm_2980 0.5611 ms 67.1% + triton_mm_2979 0.5694 ms 66.1% + triton_mm_2976 0.6280 ms 59.9% + triton_mm_2981 1.5333 ms 24.6% +SingleProcess AUTOTUNE takes 7.3481 seconds +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:13, ?it/s] +hf_T5_generate +cuda eval hf_T5_generate int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for hf_T5_generate. Setting accuracy check to cosine +AUTOTUNE int_mm(1x512, 512x512, 1x512) + triton_mm_550 0.0090 ms 100.0% + triton_mm_549 0.0100 ms 89.7% + triton_mm_545 0.0100 ms 89.2% + triton_mm_544 0.0102 ms 87.5% + triton_mm_548 0.0103 ms 87.4% + triton_mm_546 0.0105 ms 85.1% + triton_mm_542 0.0131 ms 68.5% + triton_mm_541 0.0136 ms 66.0% + triton_mm_540 0.0162 ms 55.2% + triton_mm_543 0.0164 ms 54.6% +SingleProcess AUTOTUNE takes 3.7205 seconds +AUTOTUNE bmm(8x1x64, 8x64x1) + triton_bmm_564 0.0061 ms 100.0% + triton_bmm_565 0.0061 ms 100.0% + triton_bmm_566 0.0069 ms 89.5% + triton_bmm_563 0.0069 ms 89.3% + triton_bmm_567 0.0069 ms 89.3% + triton_bmm_569 0.0069 ms 88.9% + triton_bmm_562 0.0072 ms 85.3% + triton_bmm_568 0.0077 ms 79.7% + bmm 0.0543 ms 11.3% +SingleProcess AUTOTUNE takes 2.7393 seconds +AUTOTUNE bmm(8x1x1, 8x1x64) + triton_bmm_581 0.0061 ms 100.0% + triton_bmm_584 0.0061 ms 100.0% + triton_bmm_585 0.0061 ms 100.0% + triton_bmm_586 0.0061 ms 100.0% + triton_bmm_587 0.0061 ms 100.0% + triton_bmm_582 0.0068 ms 90.6% + triton_bmm_583 0.0068 ms 90.4% + bmm 0.0540 ms 11.4% +SingleProcess AUTOTUNE takes 2.0376 seconds +AUTOTUNE bmm(8x1x64, 8x64x2048) + triton_bmm_630 0.0092 ms 100.0% + triton_bmm_623 0.0093 ms 99.0% + triton_bmm_629 0.0094 ms 97.3% + triton_bmm_621 0.0095 ms 97.0% + triton_bmm_627 0.0095 ms 96.6% + triton_bmm_624 0.0097 ms 94.4% + triton_bmm_628 0.0097 ms 94.4% + triton_bmm_622 0.0100 ms 91.4% + triton_bmm_625 0.0100 ms 91.4% + triton_bmm_632 0.0103 ms 89.1% +SingleProcess AUTOTUNE takes 3.4023 seconds +AUTOTUNE bmm(8x1x2048, 8x2048x64) + triton_bmm_648 0.0193 ms 100.0% + triton_bmm_647 0.0196 ms 98.4% + triton_bmm_649 0.0205 ms 94.1% + triton_bmm_650 0.0234 ms 82.6% + triton_bmm_646 0.0236 ms 81.8% + triton_bmm_645 0.0308 ms 62.7% + bmm 0.0482 ms 40.1% + triton_bmm_644 0.0562 ms 34.3% + triton_bmm_652 0.0879 ms 22.0% + triton_bmm_651 0.0934 ms 20.7% +SingleProcess AUTOTUNE takes 2.9576 seconds +AUTOTUNE int_mm(1x512, 512x2048, 1x2048) + triton_mm_674 0.0100 ms 100.0% + triton_mm_669 0.0100 ms 99.7% + triton_mm_672 0.0102 ms 97.8% + triton_mm_673 0.0102 ms 97.8% + triton_mm_670 0.0105 ms 95.4% + triton_mm_668 0.0113 ms 88.4% + triton_mm_666 0.0136 ms 73.5% + triton_mm_665 0.0141 ms 71.0% + triton_mm_664 0.0172 ms 58.4% + triton_mm_667 0.0179 ms 55.9% +SingleProcess AUTOTUNE takes 3.6624 seconds +AUTOTUNE int_mm(1x2048, 2048x512, 1x512) + triton_mm_685 0.0154 ms 100.0% + triton_mm_684 0.0172 ms 89.4% + triton_mm_680 0.0196 ms 78.3% + triton_mm_683 0.0197 ms 78.0% + triton_mm_681 0.0206 ms 74.7% + triton_mm_679 0.0225 ms 68.4% + triton_mm_677 0.0324 ms 47.5% + triton_mm_676 0.0355 ms 43.4% + triton_mm_675 0.0522 ms 29.5% + triton_mm_678 0.0563 ms 27.4% +SingleProcess AUTOTUNE takes 3.4805 seconds +AUTOTUNE int_mm(1x512, 512x32128, 1x32128) + triton_mm_1424 0.0253 ms 100.0% + triton_mm_1426 0.0255 ms 99.2% + triton_mm_1418 0.0258 ms 98.4% + triton_mm_1425 0.0259 ms 97.8% + triton_mm_1417 0.0260 ms 97.3% + triton_mm_1421 0.0261 ms 97.2% + triton_mm_1422 0.0261 ms 96.9% + triton_mm_1420 0.0262 ms 96.7% + triton_mm_1416 0.0269 ms 94.1% + triton_mm_1419 0.0273 ms 92.8% +SingleProcess AUTOTUNE takes 3.5759 seconds +AUTOTUNE bmm(8x1x64, 8x64x2) + triton_bmm_1454 0.0061 ms 100.0% + triton_bmm_1452 0.0062 ms 98.5% + triton_bmm_1450 0.0064 ms 96.5% + triton_bmm_1451 0.0064 ms 96.5% + triton_bmm_1453 0.0069 ms 88.9% + triton_bmm_1455 0.0072 ms 85.7% + triton_bmm_1449 0.0072 ms 85.5% + triton_bmm_1456 0.0077 ms 79.7% + bmm 0.0544 ms 11.3% +SingleProcess AUTOTUNE takes 2.4231 seconds +AUTOTUNE bmm(8x1x2, 8x2x64) + triton_bmm_1472 0.0061 ms 100.0% + triton_bmm_1470 0.0067 ms 91.9% + triton_bmm_1468 0.0068 ms 90.6% + triton_bmm_1469 0.0068 ms 89.7% + triton_bmm_1471 0.0069 ms 88.7% + triton_bmm_1473 0.0069 ms 88.7% + triton_bmm_1474 0.0069 ms 88.5% + bmm 0.0535 ms 11.5% +SingleProcess AUTOTUNE takes 2.0614 seconds +AUTOTUNE bmm(8x1x64, 8x64x3) + triton_bmm_2209 0.0062 ms 100.0% + triton_bmm_2208 0.0062 ms 99.7% + triton_bmm_2207 0.0064 ms 97.0% + triton_bmm_2206 0.0064 ms 96.7% + triton_bmm_2204 0.0067 ms 92.5% + triton_bmm_2205 0.0068 ms 90.0% + triton_bmm_2210 0.0072 ms 85.9% + triton_bmm_2211 0.0077 ms 79.9% + bmm 0.0546 ms 11.3% +SingleProcess AUTOTUNE takes 2.8028 seconds +AUTOTUNE bmm(8x1x3, 8x3x64) + triton_bmm_2223 0.0061 ms 100.0% + triton_bmm_2228 0.0064 ms 96.0% + triton_bmm_2229 0.0064 ms 96.0% + triton_bmm_2227 0.0069 ms 89.5% + triton_bmm_2224 0.0069 ms 88.9% + triton_bmm_2225 0.0069 ms 88.9% + triton_bmm_2226 0.0069 ms 88.7% + bmm 0.0571 ms 10.8% +SingleProcess AUTOTUNE takes 1.9078 seconds +AUTOTUNE bmm(8x1x64, 8x64x4) + triton_bmm_2963 0.0061 ms 100.0% + triton_bmm_2961 0.0064 ms 96.5% + triton_bmm_2960 0.0067 ms 91.4% + triton_bmm_2964 0.0068 ms 89.9% + triton_bmm_2962 0.0069 ms 88.5% + triton_bmm_2965 0.0071 ms 86.1% + triton_bmm_2966 0.0072 ms 85.7% + triton_bmm_2959 0.0074 ms 82.8% + bmm 0.0572 ms 10.7% +SingleProcess AUTOTUNE takes 2.3596 seconds +AUTOTUNE bmm(8x1x4, 8x4x64) + triton_bmm_2978 0.0061 ms 100.0% + triton_bmm_2979 0.0061 ms 100.0% + triton_bmm_2980 0.0061 ms 100.0% + triton_bmm_2982 0.0061 ms 100.0% + triton_bmm_2981 0.0064 ms 96.0% + triton_bmm_2983 0.0064 ms 96.0% + triton_bmm_2984 0.0064 ms 96.0% + bmm 0.0557 ms 11.0% +SingleProcess AUTOTUNE takes 1.8849 seconds +AUTOTUNE bmm(8x1x64, 8x64x5) + triton_bmm_3715 0.0064 ms 100.0% + triton_bmm_3716 0.0064 ms 100.0% + triton_bmm_3717 0.0064 ms 100.0% + triton_bmm_3719 0.0067 ms 94.8% + triton_bmm_3718 0.0069 ms 91.7% + triton_bmm_3714 0.0075 ms 85.4% + triton_bmm_3720 0.0077 ms 82.6% + triton_bmm_3721 0.0077 ms 82.6% + bmm 0.0550 ms 11.6% +SingleProcess AUTOTUNE takes 2.3931 seconds +AUTOTUNE bmm(8x1x5, 8x5x64) + triton_bmm_3733 0.0061 ms 100.0% + triton_bmm_3737 0.0069 ms 89.5% + triton_bmm_3734 0.0069 ms 88.9% + triton_bmm_3735 0.0069 ms 88.5% + triton_bmm_3739 0.0071 ms 86.1% + triton_bmm_3736 0.0072 ms 85.7% + triton_bmm_3738 0.0072 ms 85.7% + bmm 0.0556 ms 11.0% +SingleProcess AUTOTUNE takes 1.9002 seconds +AUTOTUNE bmm(8x1x64, 8x64x6) + triton_bmm_4473 0.0064 ms 100.0% + triton_bmm_4470 0.0068 ms 93.0% + triton_bmm_4471 0.0068 ms 93.0% + triton_bmm_4472 0.0068 ms 93.0% + triton_bmm_4469 0.0069 ms 92.6% + triton_bmm_4474 0.0069 ms 91.7% + triton_bmm_4475 0.0077 ms 82.6% + triton_bmm_4476 0.0077 ms 82.6% + bmm 0.0550 ms 11.6% +SingleProcess AUTOTUNE takes 2.6198 seconds +AUTOTUNE bmm(8x1x6, 8x6x64) + triton_bmm_4490 0.0061 ms 100.0% + triton_bmm_4492 0.0061 ms 100.0% + triton_bmm_4488 0.0068 ms 90.6% + triton_bmm_4489 0.0069 ms 88.5% + triton_bmm_4491 0.0071 ms 86.3% + triton_bmm_4493 0.0072 ms 85.3% + triton_bmm_4494 0.0072 ms 85.3% + bmm 0.0541 ms 11.3% +SingleProcess AUTOTUNE takes 2.0426 seconds +AUTOTUNE bmm(8x1x64, 8x64x7) + triton_bmm_5225 0.0064 ms 100.0% + triton_bmm_5226 0.0064 ms 100.0% + triton_bmm_5227 0.0064 ms 100.0% + triton_bmm_5229 0.0064 ms 100.0% + triton_bmm_5224 0.0067 ms 95.2% + triton_bmm_5228 0.0069 ms 92.1% + triton_bmm_5231 0.0072 ms 88.8% + triton_bmm_5230 0.0077 ms 82.6% + bmm 0.0564 ms 11.3% +SingleProcess AUTOTUNE takes 2.5657 seconds +AUTOTUNE bmm(8x1x7, 8x7x64) + triton_bmm_5243 0.0061 ms 100.0% + triton_bmm_5246 0.0064 ms 96.0% + triton_bmm_5244 0.0069 ms 89.5% + triton_bmm_5247 0.0069 ms 89.5% + triton_bmm_5245 0.0069 ms 88.5% + triton_bmm_5248 0.0070 ms 87.3% + triton_bmm_5249 0.0071 ms 86.9% + bmm 0.0542 ms 11.3% +SingleProcess AUTOTUNE takes 2.1123 seconds +AUTOTUNE bmm(8x1x64, 8x64x8) + triton_bmm_5980 0.0063 ms 100.0% + triton_bmm_5982 0.0069 ms 91.6% + triton_bmm_5981 0.0069 ms 90.8% + triton_bmm_5983 0.0069 ms 90.8% + triton_bmm_5984 0.0069 ms 90.8% + triton_bmm_5986 0.0072 ms 87.9% + triton_bmm_5979 0.0074 ms 85.7% + triton_bmm_5985 0.0077 ms 81.7% + bmm 0.0546 ms 11.6% +SingleProcess AUTOTUNE takes 2.4499 seconds +AUTOTUNE bmm(8x1x8, 8x8x64) + triton_bmm_5999 0.0061 ms 100.0% + triton_bmm_6000 0.0061 ms 100.0% + triton_bmm_6002 0.0066 ms 92.8% + triton_bmm_5998 0.0068 ms 89.7% + triton_bmm_6003 0.0070 ms 88.1% + triton_bmm_6001 0.0072 ms 85.3% + triton_bmm_6004 0.0072 ms 85.3% + bmm 0.0507 ms 12.1% +SingleProcess AUTOTUNE takes 1.9195 seconds +AUTOTUNE bmm(8x1x64, 8x64x9) + triton_bmm_6739 0.0064 ms 100.0% + triton_bmm_6737 0.0067 ms 95.0% + triton_bmm_6734 0.0068 ms 93.4% + triton_bmm_6736 0.0069 ms 92.3% + triton_bmm_6735 0.0069 ms 91.7% + triton_bmm_6738 0.0069 ms 91.7% + triton_bmm_6740 0.0077 ms 82.6% + triton_bmm_6741 0.0077 ms 82.2% + bmm 0.0534 ms 11.9% +SingleProcess AUTOTUNE takes 2.4029 seconds +AUTOTUNE bmm(8x1x9, 8x9x64) + triton_bmm_6754 0.0061 ms 100.0% + triton_bmm_6755 0.0061 ms 100.0% + triton_bmm_6753 0.0069 ms 88.9% + triton_bmm_6757 0.0069 ms 88.7% + triton_bmm_6759 0.0070 ms 88.1% + triton_bmm_6756 0.0072 ms 85.7% + triton_bmm_6758 0.0072 ms 85.7% + bmm 0.0549 ms 11.2% +SingleProcess AUTOTUNE takes 2.3224 seconds +AUTOTUNE bmm(8x1x64, 8x64x10) + triton_bmm_7490 0.0064 ms 100.0% + triton_bmm_7492 0.0064 ms 100.0% + triton_bmm_7494 0.0068 ms 93.9% + triton_bmm_7491 0.0069 ms 92.6% + triton_bmm_7493 0.0069 ms 91.7% + triton_bmm_7495 0.0072 ms 88.8% + triton_bmm_7496 0.0072 ms 88.8% + triton_bmm_7489 0.0075 ms 85.4% + bmm 0.0517 ms 12.3% +SingleProcess AUTOTUNE takes 2.5250 seconds +AUTOTUNE bmm(8x1x10, 8x10x64) + triton_bmm_7512 0.0061 ms 100.0% + triton_bmm_7511 0.0064 ms 96.0% + triton_bmm_7513 0.0064 ms 96.0% + triton_bmm_7508 0.0069 ms 89.5% + triton_bmm_7509 0.0069 ms 88.9% + triton_bmm_7510 0.0069 ms 88.7% + triton_bmm_7514 0.0072 ms 85.7% + bmm 0.0564 ms 10.9% +SingleProcess AUTOTUNE takes 1.9291 seconds +AUTOTUNE bmm(8x1x64, 8x64x11) + triton_bmm_8247 0.0064 ms 100.0% + triton_bmm_8248 0.0064 ms 100.0% + triton_bmm_8245 0.0069 ms 91.7% + triton_bmm_8246 0.0069 ms 91.7% + triton_bmm_8249 0.0069 ms 91.7% + triton_bmm_8250 0.0072 ms 88.8% + triton_bmm_8251 0.0072 ms 88.8% + triton_bmm_8244 0.0075 ms 85.4% + bmm 0.0544 ms 11.7% +SingleProcess AUTOTUNE takes 2.4684 seconds +AUTOTUNE bmm(8x1x11, 8x11x64) + triton_bmm_8265 0.0062 ms 100.0% + triton_bmm_8266 0.0065 ms 95.5% + triton_bmm_8264 0.0068 ms 90.6% + triton_bmm_8263 0.0069 ms 89.8% + triton_bmm_8267 0.0069 ms 89.1% + triton_bmm_8268 0.0071 ms 86.9% + triton_bmm_8269 0.0072 ms 86.2% + bmm 0.0556 ms 11.1% +SingleProcess AUTOTUNE takes 2.2911 seconds +AUTOTUNE bmm(8x1x64, 8x64x12) + triton_bmm_9004 0.0068 ms 100.0% + triton_bmm_8999 0.0069 ms 97.7% + triton_bmm_9003 0.0069 ms 97.7% + triton_bmm_9000 0.0069 ms 97.5% + triton_bmm_9001 0.0069 ms 97.2% + triton_bmm_9002 0.0069 ms 97.2% + triton_bmm_9006 0.0072 ms 94.2% + triton_bmm_9005 0.0077 ms 87.6% + bmm 0.0543 ms 12.4% +SingleProcess AUTOTUNE takes 2.3958 seconds +AUTOTUNE bmm(8x1x12, 8x12x64) + triton_bmm_9018 0.0061 ms 100.0% + triton_bmm_9019 0.0061 ms 100.0% + triton_bmm_9020 0.0069 ms 88.5% + triton_bmm_9022 0.0069 ms 88.5% + triton_bmm_9024 0.0071 ms 86.9% + triton_bmm_9023 0.0071 ms 86.7% + triton_bmm_9021 0.0071 ms 86.1% + bmm 0.0548 ms 11.2% +SingleProcess AUTOTUNE takes 1.8970 seconds +AUTOTUNE bmm(8x1x64, 8x64x13) + triton_bmm_9755 0.0064 ms 100.0% + triton_bmm_9758 0.0064 ms 100.0% + triton_bmm_9759 0.0064 ms 100.0% + triton_bmm_9754 0.0069 ms 92.6% + triton_bmm_9756 0.0069 ms 91.9% + triton_bmm_9757 0.0069 ms 91.7% + triton_bmm_9761 0.0072 ms 88.4% + triton_bmm_9760 0.0079 ms 80.2% + bmm 0.0527 ms 12.1% +SingleProcess AUTOTUNE takes 2.4566 seconds +AUTOTUNE bmm(8x1x13, 8x13x64) + triton_bmm_9777 0.0064 ms 100.0% + triton_bmm_9775 0.0069 ms 92.1% + triton_bmm_9773 0.0069 ms 91.9% + triton_bmm_9774 0.0069 ms 91.7% + triton_bmm_9779 0.0071 ms 89.6% + triton_bmm_9778 0.0072 ms 88.8% + triton_bmm_9776 0.0072 ms 88.4% + bmm 0.0543 ms 11.7% +SingleProcess AUTOTUNE takes 1.9612 seconds +AUTOTUNE bmm(8x1x64, 8x64x14) + triton_bmm_10511 0.0064 ms 100.0% + triton_bmm_10512 0.0064 ms 100.0% + triton_bmm_10510 0.0064 ms 99.5% + triton_bmm_10513 0.0064 ms 99.5% + triton_bmm_10514 0.0069 ms 92.8% + triton_bmm_10509 0.0075 ms 85.4% + triton_bmm_10515 0.0077 ms 82.2% + triton_bmm_10516 0.0077 ms 82.2% + bmm 0.0604 ms 10.5% +SingleProcess AUTOTUNE takes 2.4353 seconds +AUTOTUNE bmm(8x1x14, 8x14x64) + triton_bmm_10528 0.0061 ms 100.0% + triton_bmm_10529 0.0061 ms 100.0% + triton_bmm_10530 0.0062 ms 98.5% + triton_bmm_10532 0.0063 ms 98.0% + triton_bmm_10531 0.0066 ms 92.8% + triton_bmm_10534 0.0072 ms 85.9% + triton_bmm_10533 0.0072 ms 85.3% + bmm 0.0605 ms 10.2% +SingleProcess AUTOTUNE takes 1.9941 seconds +AUTOTUNE bmm(8x1x64, 8x64x15) + triton_bmm_11266 0.0064 ms 100.0% + triton_bmm_11267 0.0064 ms 100.0% + triton_bmm_11265 0.0064 ms 99.5% + triton_bmm_11269 0.0069 ms 92.8% + triton_bmm_11268 0.0069 ms 92.1% + triton_bmm_11271 0.0072 ms 88.8% + triton_bmm_11264 0.0074 ms 86.3% + triton_bmm_11270 0.0079 ms 81.1% + bmm 0.0532 ms 12.0% +SingleProcess AUTOTUNE takes 2.6008 seconds +AUTOTUNE bmm(8x1x15, 8x15x64) + triton_bmm_11287 0.0064 ms 100.0% + triton_bmm_11285 0.0068 ms 94.3% + triton_bmm_11283 0.0069 ms 91.7% + triton_bmm_11284 0.0069 ms 91.7% + triton_bmm_11286 0.0071 ms 89.6% + triton_bmm_11288 0.0072 ms 88.4% + triton_bmm_11289 0.0072 ms 88.4% + bmm 0.0611 ms 10.4% +SingleProcess AUTOTUNE takes 2.0584 seconds +AUTOTUNE bmm(8x1x64, 8x64x16) + triton_bmm_12020 0.0064 ms 100.0% + triton_bmm_12021 0.0064 ms 100.0% + triton_bmm_12022 0.0064 ms 100.0% + triton_bmm_12023 0.0064 ms 99.5% + triton_bmm_12024 0.0069 ms 91.7% + triton_bmm_12026 0.0072 ms 88.8% + triton_bmm_12019 0.0073 ms 86.9% + triton_bmm_12025 0.0079 ms 80.9% + bmm 0.0611 ms 10.4% +SingleProcess AUTOTUNE takes 2.7473 seconds +AUTOTUNE bmm(8x1x16, 8x16x64) + triton_bmm_12038 0.0061 ms 100.0% + triton_bmm_12039 0.0061 ms 100.0% + triton_bmm_12040 0.0061 ms 100.0% + triton_bmm_12043 0.0064 ms 96.5% + triton_bmm_12042 0.0068 ms 89.9% + triton_bmm_12041 0.0069 ms 89.3% + triton_bmm_12044 0.0069 ms 88.5% + bmm 0.0574 ms 10.7% +SingleProcess AUTOTUNE takes 2.1602 seconds +AUTOTUNE bmm(8x1x64, 8x64x17) + triton_bmm_12776 0.0064 ms 100.0% + triton_bmm_12777 0.0064 ms 100.0% + triton_bmm_12774 0.0069 ms 92.6% + triton_bmm_12779 0.0069 ms 92.6% + triton_bmm_12775 0.0070 ms 90.9% + triton_bmm_12778 0.0074 ms 86.2% + triton_bmm_12780 0.0074 ms 86.2% + triton_bmm_12781 0.0080 ms 80.3% + bmm 0.0683 ms 9.4% +SingleProcess AUTOTUNE takes 2.4726 seconds +AUTOTUNE bmm(8x1x17, 8x17x64) + triton_bmm_12793 0.0067 ms 100.0% + triton_bmm_12795 0.0067 ms 100.0% + triton_bmm_12798 0.0067 ms 100.0% + triton_bmm_12800 0.0069 ms 96.3% + triton_bmm_12794 0.0072 ms 92.4% + triton_bmm_12797 0.0072 ms 92.4% + triton_bmm_12799 0.0077 ms 86.3% + triton_bmm_12796 0.0126 ms 52.7% + bmm 0.0702 ms 9.5% +SingleProcess AUTOTUNE takes 2.6256 seconds +AUTOTUNE bmm(8x1x64, 8x64x18) + triton_bmm_13536 0.0064 ms 100.0% + triton_bmm_13537 0.0064 ms 100.0% + triton_bmm_13539 0.0069 ms 92.6% + triton_bmm_13538 0.0069 ms 92.2% + triton_bmm_13542 0.0073 ms 87.7% + triton_bmm_13541 0.0074 ms 86.6% + triton_bmm_13535 0.0075 ms 85.8% + triton_bmm_13540 0.0075 ms 85.8% + bmm 0.0549 ms 11.7% +SingleProcess AUTOTUNE takes 2.3849 seconds +AUTOTUNE bmm(8x1x18, 8x18x64) + triton_bmm_13560 0.0069 ms 100.0% + triton_bmm_13554 0.0071 ms 97.1% + triton_bmm_13556 0.0071 ms 96.4% + triton_bmm_13558 0.0072 ms 96.0% + triton_bmm_13555 0.0072 ms 96.0% + triton_bmm_13557 0.0075 ms 92.3% + triton_bmm_13559 0.0075 ms 92.3% + triton_bmm_13561 0.0075 ms 92.3% + bmm 0.0645 ms 10.7% +SingleProcess AUTOTUNE takes 2.3995 seconds +AUTOTUNE bmm(8x1x64, 8x64x19) + triton_bmm_14297 0.0064 ms 100.0% + triton_bmm_14298 0.0064 ms 100.0% + triton_bmm_14299 0.0064 ms 100.0% + triton_bmm_14301 0.0069 ms 92.6% + triton_bmm_14300 0.0074 ms 86.8% + triton_bmm_14303 0.0074 ms 86.2% + triton_bmm_14296 0.0075 ms 85.1% + triton_bmm_14302 0.0080 ms 80.5% + bmm 0.0666 ms 9.6% +SingleProcess AUTOTUNE takes 2.4872 seconds +AUTOTUNE bmm(8x1x19, 8x19x64) + triton_bmm_14315 0.0067 ms 100.0% + triton_bmm_14316 0.0067 ms 100.0% + triton_bmm_14320 0.0067 ms 100.0% + triton_bmm_14317 0.0071 ms 94.1% + triton_bmm_14318 0.0072 ms 92.4% + triton_bmm_14319 0.0072 ms 92.4% + triton_bmm_14322 0.0074 ms 89.5% + triton_bmm_14321 0.0077 ms 86.3% + bmm 0.0542 ms 12.3% +SingleProcess AUTOTUNE takes 2.3681 seconds +AUTOTUNE bmm(8x1x64, 8x64x20) + triton_bmm_15058 0.0064 ms 100.0% + triton_bmm_15059 0.0064 ms 100.0% + triton_bmm_15060 0.0064 ms 100.0% + triton_bmm_15057 0.0069 ms 92.6% + triton_bmm_15064 0.0073 ms 87.3% + triton_bmm_15061 0.0074 ms 87.0% + triton_bmm_15062 0.0075 ms 85.8% + triton_bmm_15063 0.0080 ms 80.3% + bmm 0.0708 ms 9.0% +SingleProcess AUTOTUNE takes 2.7433 seconds +AUTOTUNE bmm(8x1x20, 8x20x64) + triton_bmm_15078 0.0066 ms 100.0% + triton_bmm_15079 0.0066 ms 100.0% + triton_bmm_15080 0.0066 ms 100.0% + triton_bmm_15081 0.0066 ms 100.0% + triton_bmm_15082 0.0069 ms 96.3% + triton_bmm_15077 0.0070 ms 94.5% + triton_bmm_15076 0.0072 ms 92.4% + triton_bmm_15083 0.0075 ms 88.8% + bmm 0.0553 ms 12.0% +SingleProcess AUTOTUNE takes 2.3576 seconds +AUTOTUNE bmm(8x1x64, 8x64x21) + triton_bmm_15823 0.0069 ms 100.0% + triton_bmm_15820 0.0069 ms 99.5% + triton_bmm_15819 0.0070 ms 99.1% + triton_bmm_15821 0.0070 ms 99.1% + triton_bmm_15822 0.0074 ms 93.5% + triton_bmm_15825 0.0074 ms 93.1% + triton_bmm_15818 0.0076 ms 91.5% + triton_bmm_15824 0.0080 ms 86.7% + bmm 0.0548 ms 12.6% +SingleProcess AUTOTUNE takes 2.4216 seconds +AUTOTUNE bmm(8x1x21, 8x21x64) + triton_bmm_15837 0.0067 ms 100.0% + triton_bmm_15840 0.0067 ms 100.0% + triton_bmm_15841 0.0067 ms 100.0% + triton_bmm_15842 0.0067 ms 99.5% + triton_bmm_15838 0.0072 ms 92.4% + triton_bmm_15839 0.0072 ms 92.4% + triton_bmm_15844 0.0075 ms 88.7% + triton_bmm_15843 0.0079 ms 83.9% + bmm 0.0543 ms 12.3% +SingleProcess AUTOTUNE takes 2.5202 seconds +AUTOTUNE bmm(8x1x64, 8x64x22) + triton_bmm_16582 0.0064 ms 100.0% + triton_bmm_16583 0.0069 ms 92.6% + triton_bmm_16584 0.0069 ms 92.6% + triton_bmm_16581 0.0069 ms 92.2% + triton_bmm_16580 0.0070 ms 91.3% + triton_bmm_16579 0.0075 ms 85.5% + triton_bmm_16586 0.0078 ms 82.1% + triton_bmm_16585 0.0080 ms 80.5% + bmm 0.0535 ms 12.0% +SingleProcess AUTOTUNE takes 2.3824 seconds +AUTOTUNE bmm(8x1x22, 8x22x64) + triton_bmm_16600 0.0065 ms 100.0% + triton_bmm_16598 0.0066 ms 99.0% + triton_bmm_16602 0.0066 ms 99.0% + triton_bmm_16601 0.0069 ms 94.4% + triton_bmm_16605 0.0069 ms 94.4% + triton_bmm_16599 0.0072 ms 91.1% + triton_bmm_16603 0.0075 ms 87.6% + triton_bmm_16604 0.0075 ms 87.6% + bmm 0.0529 ms 12.3% +SingleProcess AUTOTUNE takes 2.4667 seconds +AUTOTUNE bmm(8x1x64, 8x64x23) + triton_bmm_17341 0.0064 ms 100.0% + triton_bmm_17342 0.0064 ms 100.0% + triton_bmm_17343 0.0064 ms 100.0% + triton_bmm_17340 0.0069 ms 92.6% + triton_bmm_17347 0.0074 ms 86.2% + triton_bmm_17344 0.0075 ms 85.8% + triton_bmm_17345 0.0075 ms 85.8% + triton_bmm_17346 0.0080 ms 80.3% + bmm 0.0546 ms 11.7% +SingleProcess AUTOTUNE takes 2.4293 seconds +AUTOTUNE bmm(8x1x23, 8x23x64) + triton_bmm_17359 0.0067 ms 100.0% + triton_bmm_17360 0.0067 ms 100.0% + triton_bmm_17364 0.0067 ms 100.0% + triton_bmm_17361 0.0072 ms 92.4% + triton_bmm_17363 0.0072 ms 92.4% + triton_bmm_17362 0.0075 ms 89.3% + triton_bmm_17366 0.0075 ms 88.9% + triton_bmm_17365 0.0080 ms 83.5% + bmm 0.0559 ms 11.9% +SingleProcess AUTOTUNE takes 2.5012 seconds +AUTOTUNE bmm(8x1x64, 8x64x24) + triton_bmm_18102 0.0064 ms 100.0% + triton_bmm_18103 0.0064 ms 100.0% + triton_bmm_18104 0.0064 ms 100.0% + triton_bmm_18101 0.0069 ms 92.6% + triton_bmm_18105 0.0069 ms 92.6% + triton_bmm_18106 0.0069 ms 92.6% + triton_bmm_18108 0.0072 ms 88.9% + triton_bmm_18107 0.0079 ms 81.0% + bmm 0.0622 ms 10.3% +SingleProcess AUTOTUNE takes 2.3647 seconds +AUTOTUNE bmm(8x1x24, 8x24x64) + triton_bmm_18122 0.0065 ms 100.0% + triton_bmm_18126 0.0069 ms 94.7% + triton_bmm_18127 0.0069 ms 94.2% + triton_bmm_18124 0.0070 ms 92.9% + triton_bmm_18120 0.0072 ms 90.8% + triton_bmm_18121 0.0072 ms 90.8% + triton_bmm_18123 0.0072 ms 90.6% + triton_bmm_18125 0.0072 ms 90.4% + bmm 0.0642 ms 10.1% +SingleProcess AUTOTUNE takes 2.2809 seconds +AUTOTUNE bmm(8x1x64, 8x64x25) + triton_bmm_18864 0.0064 ms 100.0% + triton_bmm_18865 0.0064 ms 100.0% + triton_bmm_18862 0.0069 ms 92.6% + triton_bmm_18866 0.0069 ms 92.6% + triton_bmm_18863 0.0070 ms 91.3% + triton_bmm_18868 0.0074 ms 86.2% + triton_bmm_18869 0.0074 ms 86.2% + triton_bmm_18867 0.0075 ms 85.8% + bmm 0.0869 ms 7.4% +SingleProcess AUTOTUNE takes 2.5338 seconds +AUTOTUNE bmm(8x1x25, 8x25x64) + triton_bmm_18881 0.0067 ms 100.0% + triton_bmm_18882 0.0067 ms 100.0% + triton_bmm_18883 0.0067 ms 100.0% + triton_bmm_18884 0.0067 ms 100.0% + triton_bmm_18888 0.0071 ms 93.3% + triton_bmm_18885 0.0072 ms 92.4% + triton_bmm_18886 0.0073 ms 90.8% + triton_bmm_18887 0.0082 ms 80.9% + bmm 0.0580 ms 11.5% +SingleProcess AUTOTUNE takes 2.3010 seconds +AUTOTUNE bmm(8x1x64, 8x64x26) + triton_bmm_19624 0.0064 ms 100.0% + triton_bmm_19625 0.0069 ms 92.2% + triton_bmm_19626 0.0069 ms 92.2% + triton_bmm_19630 0.0072 ms 89.3% + triton_bmm_19629 0.0074 ms 86.6% + triton_bmm_19623 0.0075 ms 85.8% + triton_bmm_19627 0.0075 ms 85.8% + triton_bmm_19628 0.0075 ms 85.8% + bmm 0.0583 ms 11.0% +SingleProcess AUTOTUNE takes 2.5562 seconds +AUTOTUNE bmm(8x1x26, 8x26x64) + triton_bmm_19642 0.0064 ms 100.0% + triton_bmm_19643 0.0064 ms 100.0% + triton_bmm_19644 0.0064 ms 100.0% + triton_bmm_19648 0.0069 ms 92.6% + triton_bmm_19646 0.0072 ms 88.9% + triton_bmm_19647 0.0073 ms 87.1% + triton_bmm_19645 0.0074 ms 86.6% + triton_bmm_19649 0.0077 ms 83.0% + bmm 0.0615 ms 10.4% +SingleProcess AUTOTUNE takes 2.3276 seconds +AUTOTUNE bmm(8x1x64, 8x64x27) + triton_bmm_20385 0.0069 ms 100.0% + triton_bmm_20386 0.0069 ms 100.0% + triton_bmm_20387 0.0069 ms 100.0% + triton_bmm_20389 0.0074 ms 94.3% + triton_bmm_20388 0.0074 ms 93.5% + triton_bmm_20384 0.0075 ms 93.1% + triton_bmm_20391 0.0079 ms 87.5% + triton_bmm_20390 0.0080 ms 87.1% + bmm 0.0618 ms 11.2% +SingleProcess AUTOTUNE takes 2.6889 seconds +AUTOTUNE bmm(8x1x27, 8x27x64) + triton_bmm_20403 0.0067 ms 100.0% + triton_bmm_20404 0.0067 ms 100.0% + triton_bmm_20405 0.0067 ms 100.0% + triton_bmm_20407 0.0067 ms 100.0% + triton_bmm_20408 0.0067 ms 100.0% + triton_bmm_20406 0.0072 ms 92.4% + triton_bmm_20410 0.0077 ms 86.3% + triton_bmm_20409 0.0080 ms 83.5% + bmm 0.0552 ms 12.1% +SingleProcess AUTOTUNE takes 2.4148 seconds +AUTOTUNE bmm(8x1x64, 8x64x28) + triton_bmm_21146 0.0064 ms 100.0% + triton_bmm_21147 0.0064 ms 100.0% + triton_bmm_21148 0.0064 ms 100.0% + triton_bmm_21149 0.0069 ms 93.0% + triton_bmm_21150 0.0069 ms 93.0% + triton_bmm_21145 0.0069 ms 92.6% + triton_bmm_21151 0.0079 ms 80.6% + triton_bmm_21152 0.0080 ms 80.3% + bmm 0.0563 ms 11.4% +SingleProcess AUTOTUNE takes 2.4006 seconds +AUTOTUNE bmm(8x1x28, 8x28x64) + triton_bmm_21165 0.0066 ms 100.0% + triton_bmm_21168 0.0066 ms 100.0% + triton_bmm_21167 0.0067 ms 99.5% + triton_bmm_21166 0.0071 ms 93.0% + triton_bmm_21164 0.0072 ms 92.0% + triton_bmm_21169 0.0072 ms 92.0% + triton_bmm_21170 0.0072 ms 91.6% + triton_bmm_21171 0.0076 ms 87.0% + bmm 0.0547 ms 12.1% +SingleProcess AUTOTUNE takes 2.2356 seconds +AUTOTUNE bmm(8x1x64, 8x64x29) + triton_bmm_21910 0.0069 ms 100.0% + triton_bmm_21911 0.0069 ms 100.0% + triton_bmm_21908 0.0069 ms 99.5% + triton_bmm_21909 0.0069 ms 99.5% + triton_bmm_21907 0.0070 ms 99.3% + triton_bmm_21913 0.0074 ms 93.1% + triton_bmm_21906 0.0075 ms 92.7% + triton_bmm_21912 0.0079 ms 87.1% + bmm 0.0547 ms 12.6% +SingleProcess AUTOTUNE takes 2.9729 seconds +AUTOTUNE bmm(8x1x29, 8x29x64) + triton_bmm_21926 0.0067 ms 100.0% + triton_bmm_21930 0.0067 ms 100.0% + triton_bmm_21932 0.0072 ms 92.9% + triton_bmm_21925 0.0072 ms 92.4% + triton_bmm_21927 0.0072 ms 92.4% + triton_bmm_21928 0.0072 ms 92.4% + triton_bmm_21929 0.0072 ms 92.4% + triton_bmm_21931 0.0082 ms 80.9% + bmm 0.0642 ms 10.4% +SingleProcess AUTOTUNE takes 2.6468 seconds +AUTOTUNE bmm(8x1x64, 8x64x30) + triton_bmm_22669 0.0064 ms 100.0% + triton_bmm_22668 0.0068 ms 94.3% + triton_bmm_22671 0.0069 ms 93.0% + triton_bmm_22670 0.0069 ms 92.2% + triton_bmm_22667 0.0075 ms 85.8% + triton_bmm_22672 0.0075 ms 85.8% + triton_bmm_22674 0.0077 ms 82.6% + triton_bmm_22673 0.0079 ms 80.6% + bmm 0.0713 ms 9.0% +SingleProcess AUTOTUNE takes 2.6770 seconds +AUTOTUNE bmm(8x1x30, 8x30x64) + triton_bmm_22686 0.0066 ms 100.0% + triton_bmm_22687 0.0066 ms 100.0% + triton_bmm_22688 0.0066 ms 100.0% + triton_bmm_22690 0.0066 ms 100.0% + triton_bmm_22689 0.0067 ms 99.5% + triton_bmm_22692 0.0069 ms 96.3% + triton_bmm_22691 0.0072 ms 91.8% + triton_bmm_22693 0.0077 ms 85.9% + bmm 0.0691 ms 9.6% +SingleProcess AUTOTUNE takes 2.2873 seconds +AUTOTUNE bmm(8x1x64, 8x64x31) + triton_bmm_23430 0.0069 ms 100.0% + triton_bmm_23432 0.0069 ms 99.5% + triton_bmm_23431 0.0069 ms 99.1% + triton_bmm_23429 0.0070 ms 98.9% + triton_bmm_23434 0.0074 ms 92.7% + triton_bmm_23435 0.0074 ms 92.7% + triton_bmm_23433 0.0074 ms 92.5% + triton_bmm_23428 0.0075 ms 92.3% + bmm 0.0603 ms 11.4% +SingleProcess AUTOTUNE takes 2.8880 seconds +AUTOTUNE bmm(8x1x31, 8x31x64) + triton_bmm_23449 0.0067 ms 100.0% + triton_bmm_23450 0.0067 ms 100.0% + triton_bmm_23451 0.0067 ms 100.0% + triton_bmm_23447 0.0072 ms 92.4% + triton_bmm_23448 0.0072 ms 92.4% + triton_bmm_23452 0.0072 ms 92.2% + triton_bmm_23454 0.0076 ms 87.0% + triton_bmm_23453 0.0080 ms 83.7% + bmm 0.0548 ms 12.1% +SingleProcess AUTOTUNE takes 2.2545 seconds +AUTOTUNE bmm(8x1x64, 8x64x32) + triton_bmm_24192 0.0064 ms 100.0% + triton_bmm_24190 0.0068 ms 93.5% + triton_bmm_24193 0.0069 ms 93.0% + triton_bmm_24191 0.0069 ms 92.2% + triton_bmm_24195 0.0073 ms 87.7% + triton_bmm_24194 0.0074 ms 87.0% + triton_bmm_24189 0.0075 ms 85.8% + triton_bmm_24196 0.0078 ms 82.3% + bmm 0.0603 ms 10.6% +SingleProcess AUTOTUNE takes 2.4183 seconds +AUTOTUNE bmm(8x1x32, 8x32x64) + triton_bmm_24208 0.0064 ms 100.0% + triton_bmm_24210 0.0064 ms 100.0% + triton_bmm_24212 0.0064 ms 100.0% + triton_bmm_24209 0.0069 ms 92.6% + triton_bmm_24211 0.0069 ms 92.3% + triton_bmm_24215 0.0069 ms 92.1% + triton_bmm_24213 0.0069 ms 91.7% + triton_bmm_24214 0.0074 ms 86.5% + bmm 0.0691 ms 9.2% +SingleProcess AUTOTUNE takes 2.3092 seconds +AUTOTUNE bmm(8x1x64, 8x64x33) + triton_bmm_24952 0.0066 ms 100.0% + triton_bmm_24954 0.0066 ms 100.0% + triton_bmm_24956 0.0069 ms 94.9% + triton_bmm_24950 0.0071 ms 92.8% + triton_bmm_24951 0.0071 ms 92.3% + triton_bmm_24953 0.0071 ms 92.1% + triton_bmm_24955 0.0074 ms 88.6% + triton_bmm_24957 0.0074 ms 88.4% + triton_bmm_24958 0.0080 ms 82.2% + bmm 0.0627 ms 10.5% +SingleProcess AUTOTUNE takes 2.9092 seconds +AUTOTUNE bmm(8x1x33, 8x33x64) + triton_bmm_24971 0.0069 ms 100.0% + triton_bmm_24972 0.0069 ms 100.0% + triton_bmm_24970 0.0073 ms 94.7% + triton_bmm_24976 0.0074 ms 93.1% + triton_bmm_24974 0.0075 ms 92.7% + triton_bmm_24973 0.0077 ms 90.0% + triton_bmm_24975 0.0077 ms 90.0% + triton_bmm_24978 0.0080 ms 86.2% + triton_bmm_24977 0.0085 ms 81.5% + bmm 0.0555 ms 12.5% +SingleProcess AUTOTUNE takes 2.8985 seconds +AUTOTUNE bmm(8x1x64, 8x64x34) + triton_bmm_25724 0.0064 ms 100.0% + triton_bmm_25725 0.0064 ms 100.0% + triton_bmm_25729 0.0069 ms 92.6% + triton_bmm_25723 0.0069 ms 92.2% + triton_bmm_25727 0.0071 ms 90.7% + triton_bmm_25726 0.0071 ms 89.7% + triton_bmm_25728 0.0074 ms 86.2% + triton_bmm_25730 0.0074 ms 86.2% + triton_bmm_25731 0.0080 ms 80.3% + bmm 0.0571 ms 11.2% +SingleProcess AUTOTUNE takes 2.7193 seconds +AUTOTUNE bmm(8x1x34, 8x34x64) + triton_bmm_25744 0.0064 ms 100.0% + triton_bmm_25747 0.0064 ms 100.0% + triton_bmm_25748 0.0069 ms 93.0% + triton_bmm_25743 0.0069 ms 92.6% + triton_bmm_25745 0.0069 ms 92.2% + triton_bmm_25746 0.0071 ms 89.9% + triton_bmm_25750 0.0072 ms 89.3% + triton_bmm_25751 0.0074 ms 86.2% + triton_bmm_25749 0.0077 ms 83.0% + bmm 0.0564 ms 11.3% +SingleProcess AUTOTUNE takes 2.8074 seconds +AUTOTUNE bmm(8x1x64, 8x64x35) + triton_bmm_26499 0.0066 ms 100.0% + triton_bmm_26497 0.0066 ms 99.0% + triton_bmm_26498 0.0066 ms 99.0% + triton_bmm_26500 0.0066 ms 99.0% + triton_bmm_26501 0.0069 ms 95.3% + triton_bmm_26496 0.0072 ms 91.5% + triton_bmm_26502 0.0075 ms 87.6% + triton_bmm_26503 0.0076 ms 86.5% + triton_bmm_26504 0.0080 ms 82.3% + bmm 0.0631 ms 10.4% +SingleProcess AUTOTUNE takes 2.7431 seconds +AUTOTUNE bmm(8x1x35, 8x35x64) + triton_bmm_26518 0.0069 ms 100.0% + triton_bmm_26520 0.0069 ms 100.0% + triton_bmm_26519 0.0072 ms 96.4% + triton_bmm_26521 0.0072 ms 96.4% + triton_bmm_26524 0.0074 ms 93.1% + triton_bmm_26517 0.0075 ms 92.7% + triton_bmm_26522 0.0078 ms 88.5% + triton_bmm_26516 0.0079 ms 87.1% + triton_bmm_26523 0.0079 ms 87.1% + bmm 0.0558 ms 12.4% +SingleProcess AUTOTUNE takes 2.6814 seconds +AUTOTUNE bmm(8x1x64, 8x64x36) + triton_bmm_27270 0.0064 ms 100.0% + triton_bmm_27271 0.0064 ms 100.0% + triton_bmm_27273 0.0064 ms 100.0% + triton_bmm_27275 0.0069 ms 92.6% + triton_bmm_27269 0.0070 ms 90.9% + triton_bmm_27272 0.0071 ms 89.7% + triton_bmm_27277 0.0074 ms 86.2% + triton_bmm_27274 0.0075 ms 85.8% + triton_bmm_27276 0.0080 ms 80.0% + bmm 0.0650 ms 9.8% +SingleProcess AUTOTUNE takes 2.7604 seconds +AUTOTUNE bmm(8x1x36, 8x36x64) + triton_bmm_27291 0.0064 ms 100.0% + triton_bmm_27292 0.0069 ms 92.2% + triton_bmm_27290 0.0070 ms 91.7% + triton_bmm_27293 0.0070 ms 91.7% + triton_bmm_27295 0.0072 ms 89.3% + triton_bmm_27294 0.0074 ms 86.2% + triton_bmm_27289 0.0075 ms 85.8% + triton_bmm_27297 0.0076 ms 84.7% + triton_bmm_27296 0.0079 ms 80.6% + bmm 0.0557 ms 11.5% +SingleProcess AUTOTUNE takes 2.8668 seconds +AUTOTUNE bmm(8x1x64, 8x64x37) + triton_bmm_28045 0.0066 ms 100.0% + triton_bmm_28043 0.0067 ms 98.6% + triton_bmm_28044 0.0072 ms 91.1% + triton_bmm_28046 0.0072 ms 91.1% + triton_bmm_28050 0.0074 ms 88.4% + triton_bmm_28047 0.0075 ms 88.0% + triton_bmm_28048 0.0075 ms 87.6% + triton_bmm_28042 0.0077 ms 85.1% + triton_bmm_28049 0.0082 ms 79.8% + bmm 0.0659 ms 10.0% +SingleProcess AUTOTUNE takes 2.7964 seconds +AUTOTUNE bmm(8x1x37, 8x37x64) + triton_bmm_28063 0.0072 ms 100.0% + triton_bmm_28064 0.0072 ms 100.0% + triton_bmm_28065 0.0072 ms 100.0% + triton_bmm_28067 0.0072 ms 99.1% + triton_bmm_28068 0.0074 ms 96.6% + triton_bmm_28070 0.0074 ms 96.6% + triton_bmm_28066 0.0076 ms 93.7% + triton_bmm_28062 0.0082 ms 87.3% + triton_bmm_28069 0.0087 ms 82.1% + bmm 0.0628 ms 11.4% +SingleProcess AUTOTUNE takes 2.6767 seconds +AUTOTUNE bmm(8x1x64, 8x64x38) + triton_bmm_28817 0.0064 ms 100.0% + triton_bmm_28818 0.0067 ms 96.2% + triton_bmm_28820 0.0069 ms 92.6% + triton_bmm_28821 0.0069 ms 92.6% + triton_bmm_28815 0.0072 ms 89.3% + triton_bmm_28819 0.0072 ms 89.3% + triton_bmm_28816 0.0072 ms 88.9% + triton_bmm_28823 0.0074 ms 86.2% + triton_bmm_28822 0.0076 ms 84.7% + bmm 0.0663 ms 9.7% +SingleProcess AUTOTUNE takes 2.6534 seconds +AUTOTUNE bmm(8x1x38, 8x38x64) + triton_bmm_28836 0.0064 ms 100.0% + triton_bmm_28839 0.0064 ms 100.0% + triton_bmm_28835 0.0069 ms 92.6% + triton_bmm_28837 0.0071 ms 89.9% + triton_bmm_28841 0.0072 ms 89.3% + triton_bmm_28838 0.0072 ms 88.9% + triton_bmm_28842 0.0074 ms 86.2% + triton_bmm_28843 0.0074 ms 86.2% + triton_bmm_28840 0.0075 ms 85.8% + bmm 0.0550 ms 11.6% +SingleProcess AUTOTUNE takes 2.7074 seconds +AUTOTUNE bmm(8x1x64, 8x64x39) + triton_bmm_29591 0.0066 ms 100.0% + triton_bmm_29589 0.0066 ms 99.5% + triton_bmm_29590 0.0067 ms 99.0% + triton_bmm_29592 0.0067 ms 99.0% + triton_bmm_29594 0.0069 ms 95.4% + triton_bmm_29588 0.0072 ms 92.0% + triton_bmm_29596 0.0074 ms 88.8% + triton_bmm_29593 0.0074 ms 88.6% + triton_bmm_29595 0.0082 ms 80.2% + bmm 0.0556 ms 11.9% +SingleProcess AUTOTUNE takes 2.8038 seconds +AUTOTUNE bmm(8x1x39, 8x39x64) + triton_bmm_29612 0.0069 ms 100.0% + triton_bmm_29610 0.0070 ms 98.2% + triton_bmm_29611 0.0073 ms 94.7% + triton_bmm_29613 0.0074 ms 93.5% + triton_bmm_29616 0.0074 ms 93.1% + triton_bmm_29608 0.0076 ms 90.4% + triton_bmm_29609 0.0077 ms 89.6% + triton_bmm_29614 0.0080 ms 86.7% + triton_bmm_29615 0.0088 ms 78.8% + bmm 0.0508 ms 13.6% +SingleProcess AUTOTUNE takes 2.8029 seconds +AUTOTUNE bmm(8x1x64, 8x64x40) + triton_bmm_30362 0.0064 ms 100.0% + triton_bmm_30363 0.0064 ms 100.0% + triton_bmm_30365 0.0064 ms 100.0% + triton_bmm_30364 0.0067 ms 96.2% + triton_bmm_30366 0.0069 ms 93.0% + triton_bmm_30361 0.0071 ms 89.7% + triton_bmm_30367 0.0074 ms 86.8% + triton_bmm_30369 0.0079 ms 80.6% + triton_bmm_30368 0.0082 ms 77.8% + bmm 0.0552 ms 11.6% +SingleProcess AUTOTUNE takes 2.8752 seconds +AUTOTUNE bmm(8x1x40, 8x40x64) + triton_bmm_30382 0.0064 ms 100.0% + triton_bmm_30385 0.0064 ms 100.0% + triton_bmm_30384 0.0067 ms 96.2% + triton_bmm_30386 0.0069 ms 92.6% + triton_bmm_30383 0.0072 ms 89.3% + triton_bmm_30381 0.0075 ms 85.8% + triton_bmm_30387 0.0077 ms 82.6% + triton_bmm_30388 0.0079 ms 81.0% + triton_bmm_30389 0.0081 ms 79.4% + bmm 0.0557 ms 11.5% +SingleProcess AUTOTUNE takes 2.8486 seconds +AUTOTUNE bmm(8x1x64, 8x64x41) + triton_bmm_31137 0.0066 ms 100.0% + triton_bmm_31135 0.0067 ms 98.6% + triton_bmm_31136 0.0067 ms 98.6% + triton_bmm_31138 0.0067 ms 98.6% + triton_bmm_31139 0.0069 ms 94.9% + triton_bmm_31140 0.0069 ms 94.9% + triton_bmm_31142 0.0074 ms 88.4% + triton_bmm_31134 0.0077 ms 85.1% + triton_bmm_31141 0.0082 ms 79.8% + bmm 0.0555 ms 11.8% +SingleProcess AUTOTUNE takes 2.7032 seconds +AUTOTUNE bmm(8x1x41, 8x41x64) + triton_bmm_31155 0.0072 ms 100.0% + triton_bmm_31156 0.0072 ms 100.0% + triton_bmm_31157 0.0074 ms 96.6% + triton_bmm_31162 0.0074 ms 96.6% + triton_bmm_31154 0.0077 ms 93.3% + triton_bmm_31158 0.0077 ms 93.1% + triton_bmm_31159 0.0080 ms 90.0% + triton_bmm_31160 0.0080 ms 90.0% + triton_bmm_31161 0.0088 ms 81.5% + bmm 0.0563 ms 12.7% +SingleProcess AUTOTUNE takes 2.9419 seconds +AUTOTUNE bmm(8x1x64, 8x64x42) + triton_bmm_31908 0.0066 ms 100.0% + triton_bmm_31909 0.0066 ms 100.0% + triton_bmm_31907 0.0072 ms 91.5% + triton_bmm_31910 0.0072 ms 91.5% + triton_bmm_31911 0.0072 ms 91.1% + triton_bmm_31912 0.0075 ms 88.0% + triton_bmm_31913 0.0075 ms 88.0% + triton_bmm_31914 0.0077 ms 85.4% + triton_bmm_31915 0.0080 ms 82.0% + bmm 0.0588 ms 11.1% +SingleProcess AUTOTUNE takes 3.4794 seconds +AUTOTUNE bmm(8x1x42, 8x42x64) + triton_bmm_31929 0.0065 ms 100.0% + triton_bmm_31930 0.0066 ms 98.6% + triton_bmm_31931 0.0071 ms 91.5% + triton_bmm_31928 0.0072 ms 91.1% + triton_bmm_31934 0.0074 ms 87.9% + triton_bmm_31932 0.0075 ms 87.6% + triton_bmm_31935 0.0076 ms 85.4% + triton_bmm_31927 0.0077 ms 84.6% + triton_bmm_31933 0.0077 ms 84.3% + bmm 0.0552 ms 11.8% +SingleProcess AUTOTUNE takes 2.9121 seconds +AUTOTUNE bmm(8x1x64, 8x64x43) + triton_bmm_32681 0.0067 ms 100.0% + triton_bmm_32686 0.0069 ms 96.3% + triton_bmm_32680 0.0072 ms 92.9% + triton_bmm_32683 0.0072 ms 92.9% + triton_bmm_32684 0.0072 ms 92.7% + triton_bmm_32682 0.0072 ms 92.4% + triton_bmm_32685 0.0075 ms 89.3% + triton_bmm_32687 0.0077 ms 86.7% + triton_bmm_32688 0.0080 ms 83.2% + bmm 0.0552 ms 12.1% +SingleProcess AUTOTUNE takes 2.7214 seconds +AUTOTUNE bmm(8x1x43, 8x43x64) + triton_bmm_32702 0.0072 ms 100.0% + triton_bmm_32706 0.0074 ms 96.6% + triton_bmm_32704 0.0077 ms 93.3% + triton_bmm_32701 0.0077 ms 92.9% + triton_bmm_32703 0.0077 ms 92.9% + triton_bmm_32705 0.0080 ms 90.0% + triton_bmm_32708 0.0080 ms 89.6% + triton_bmm_32700 0.0082 ms 87.2% + triton_bmm_32707 0.0084 ms 84.8% + bmm 0.0553 ms 13.0% +SingleProcess AUTOTUNE takes 2.6661 seconds +AUTOTUNE bmm(8x1x64, 8x64x44) + triton_bmm_33457 0.0065 ms 100.0% + triton_bmm_33456 0.0067 ms 97.1% + triton_bmm_33458 0.0069 ms 93.5% + triton_bmm_33459 0.0069 ms 93.5% + triton_bmm_33455 0.0071 ms 91.2% + triton_bmm_33454 0.0072 ms 90.0% + triton_bmm_33460 0.0077 ms 84.2% + triton_bmm_33453 0.0077 ms 83.8% + triton_bmm_33461 0.0080 ms 81.1% + bmm 0.0587 ms 11.0% +SingleProcess AUTOTUNE takes 2.9092 seconds +AUTOTUNE bmm(8x1x44, 8x44x64) + triton_bmm_33475 0.0064 ms 100.0% + triton_bmm_33477 0.0064 ms 100.0% + triton_bmm_33476 0.0064 ms 99.5% + triton_bmm_33473 0.0069 ms 92.6% + triton_bmm_33474 0.0072 ms 89.3% + triton_bmm_33479 0.0072 ms 89.3% + triton_bmm_33478 0.0075 ms 85.1% + triton_bmm_33481 0.0076 ms 84.4% + triton_bmm_33480 0.0080 ms 80.3% + bmm 0.0571 ms 11.2% +SingleProcess AUTOTUNE takes 2.9440 seconds +AUTOTUNE bmm(8x1x64, 8x64x45) + triton_bmm_34229 0.0065 ms 100.0% + triton_bmm_34231 0.0069 ms 94.4% + triton_bmm_34232 0.0069 ms 94.4% + triton_bmm_34228 0.0072 ms 91.3% + triton_bmm_34227 0.0072 ms 90.7% + triton_bmm_34230 0.0072 ms 90.7% + triton_bmm_34234 0.0074 ms 87.9% + triton_bmm_34226 0.0077 ms 84.6% + triton_bmm_34233 0.0082 ms 79.4% + bmm 0.0595 ms 11.0% +SingleProcess AUTOTUNE takes 2.8799 seconds +AUTOTUNE bmm(8x1x45, 8x45x64) + triton_bmm_34248 0.0072 ms 100.0% + triton_bmm_34252 0.0074 ms 96.6% + triton_bmm_34254 0.0074 ms 96.6% + triton_bmm_34246 0.0077 ms 92.9% + triton_bmm_34250 0.0077 ms 92.9% + triton_bmm_34247 0.0077 ms 92.8% + triton_bmm_34249 0.0079 ms 90.7% + triton_bmm_34251 0.0079 ms 90.3% + triton_bmm_34253 0.0090 ms 79.7% + bmm 0.0567 ms 12.6% +SingleProcess AUTOTUNE takes 2.6515 seconds +AUTOTUNE bmm(8x1x64, 8x64x46) + triton_bmm_35000 0.0066 ms 100.0% + triton_bmm_35003 0.0066 ms 100.0% + triton_bmm_35001 0.0066 ms 99.5% + triton_bmm_35002 0.0067 ms 99.0% + triton_bmm_35004 0.0069 ms 95.4% + triton_bmm_34999 0.0072 ms 92.0% + triton_bmm_35005 0.0075 ms 88.4% + triton_bmm_35006 0.0077 ms 85.8% + triton_bmm_35007 0.0080 ms 82.7% + bmm 0.0663 ms 9.9% +SingleProcess AUTOTUNE takes 2.7092 seconds +AUTOTUNE bmm(8x1x46, 8x46x64) + triton_bmm_35021 0.0065 ms 100.0% + triton_bmm_35023 0.0066 ms 99.5% + triton_bmm_35022 0.0066 ms 98.6% + triton_bmm_35024 0.0069 ms 94.4% + triton_bmm_35020 0.0071 ms 91.7% + triton_bmm_35019 0.0077 ms 84.6% + triton_bmm_35025 0.0078 ms 83.6% + triton_bmm_35026 0.0080 ms 81.9% + triton_bmm_35027 0.0080 ms 81.9% + bmm 0.0624 ms 10.5% +SingleProcess AUTOTUNE takes 3.1512 seconds +AUTOTUNE bmm(8x1x64, 8x64x47) + triton_bmm_35773 0.0067 ms 100.0% + triton_bmm_35774 0.0067 ms 100.0% + triton_bmm_35777 0.0069 ms 96.7% + triton_bmm_35778 0.0069 ms 96.3% + triton_bmm_35772 0.0072 ms 92.9% + triton_bmm_35775 0.0072 ms 92.4% + triton_bmm_35776 0.0072 ms 92.4% + triton_bmm_35780 0.0082 ms 81.6% + triton_bmm_35779 0.0082 ms 81.2% + bmm 0.0553 ms 12.0% +SingleProcess AUTOTUNE takes 3.0715 seconds +AUTOTUNE bmm(8x1x47, 8x47x64) + triton_bmm_35796 0.0072 ms 100.0% + triton_bmm_35797 0.0074 ms 97.0% + triton_bmm_35793 0.0077 ms 92.9% + triton_bmm_35794 0.0077 ms 92.9% + triton_bmm_35795 0.0078 ms 92.4% + triton_bmm_35798 0.0080 ms 90.0% + triton_bmm_35800 0.0080 ms 90.0% + triton_bmm_35792 0.0082 ms 87.2% + triton_bmm_35799 0.0088 ms 81.0% + bmm 0.0695 ms 10.3% +SingleProcess AUTOTUNE takes 2.7215 seconds +AUTOTUNE bmm(8x1x64, 8x64x48) + triton_bmm_36546 0.0065 ms 100.0% + triton_bmm_36549 0.0065 ms 99.5% + triton_bmm_36550 0.0069 ms 94.0% + triton_bmm_36551 0.0069 ms 94.0% + triton_bmm_36545 0.0071 ms 91.0% + triton_bmm_36547 0.0072 ms 90.4% + triton_bmm_36548 0.0072 ms 90.2% + triton_bmm_36553 0.0074 ms 87.5% + triton_bmm_36552 0.0082 ms 79.0% + bmm 0.0556 ms 11.7% +SingleProcess AUTOTUNE takes 2.9374 seconds +AUTOTUNE bmm(8x1x48, 8x48x64) + triton_bmm_36566 0.0064 ms 100.0% + triton_bmm_36567 0.0064 ms 100.0% + triton_bmm_36568 0.0066 ms 96.6% + triton_bmm_36571 0.0068 ms 93.9% + triton_bmm_36565 0.0069 ms 92.6% + triton_bmm_36569 0.0070 ms 91.3% + triton_bmm_36570 0.0072 ms 88.9% + triton_bmm_36572 0.0074 ms 86.2% + triton_bmm_36573 0.0080 ms 80.3% + bmm 0.0580 ms 11.0% +SingleProcess AUTOTUNE takes 2.9326 seconds +AUTOTUNE bmm(8x1x64, 8x64x49) + triton_bmm_37320 0.0066 ms 100.0% + triton_bmm_37321 0.0066 ms 100.0% + triton_bmm_37319 0.0067 ms 99.5% + triton_bmm_37323 0.0069 ms 95.8% + triton_bmm_37324 0.0069 ms 95.8% + triton_bmm_37322 0.0071 ms 92.8% + triton_bmm_37318 0.0072 ms 92.4% + triton_bmm_37326 0.0074 ms 89.2% + triton_bmm_37325 0.0082 ms 80.5% + bmm 0.0540 ms 12.3% +SingleProcess AUTOTUNE takes 2.8578 seconds +AUTOTUNE bmm(8x1x49, 8x49x64) + triton_bmm_37342 0.0072 ms 100.0% + triton_bmm_37338 0.0077 ms 93.3% + triton_bmm_37339 0.0077 ms 92.9% + triton_bmm_37340 0.0077 ms 92.9% + triton_bmm_37344 0.0079 ms 90.3% + triton_bmm_37341 0.0080 ms 90.0% + triton_bmm_37343 0.0080 ms 90.0% + triton_bmm_37346 0.0081 ms 88.5% + triton_bmm_37345 0.0093 ms 77.0% + bmm 0.0696 ms 10.3% +SingleProcess AUTOTUNE takes 2.8549 seconds +AUTOTUNE bmm(8x1x64, 8x64x50) + triton_bmm_38092 0.0066 ms 100.0% + triton_bmm_38093 0.0070 ms 94.1% + triton_bmm_38091 0.0072 ms 92.0% + triton_bmm_38094 0.0072 ms 91.6% + triton_bmm_38095 0.0072 ms 91.6% + triton_bmm_38096 0.0074 ms 89.4% + triton_bmm_38099 0.0074 ms 88.8% + triton_bmm_38097 0.0076 ms 86.2% + triton_bmm_38098 0.0076 ms 86.2% + bmm 0.0581 ms 11.3% +SingleProcess AUTOTUNE takes 3.0207 seconds +AUTOTUNE bmm(8x1x50, 8x50x64) + triton_bmm_38112 0.0064 ms 100.0% + triton_bmm_38113 0.0064 ms 100.0% + triton_bmm_38115 0.0064 ms 100.0% + triton_bmm_38114 0.0067 ms 96.2% + triton_bmm_38111 0.0071 ms 89.7% + triton_bmm_38116 0.0076 ms 84.7% + triton_bmm_38117 0.0082 ms 77.8% + triton_bmm_38118 0.0084 ms 76.3% + triton_bmm_38119 0.0087 ms 73.3% + bmm 0.0551 ms 11.6% +SingleProcess AUTOTUNE takes 2.9178 seconds +AUTOTUNE bmm(8x1x64, 8x64x51) + triton_bmm_38867 0.0072 ms 100.0% + triton_bmm_38869 0.0072 ms 100.0% + triton_bmm_38865 0.0072 ms 99.8% + triton_bmm_38866 0.0072 ms 99.6% + triton_bmm_38868 0.0072 ms 99.6% + triton_bmm_38872 0.0074 ms 96.6% + triton_bmm_38870 0.0075 ms 96.1% + triton_bmm_38864 0.0077 ms 92.9% + triton_bmm_38871 0.0082 ms 87.5% + bmm 0.0553 ms 13.0% +SingleProcess AUTOTUNE takes 2.7252 seconds +AUTOTUNE bmm(8x1x51, 8x51x64) + triton_bmm_38886 0.0072 ms 100.0% + triton_bmm_38888 0.0072 ms 100.0% + triton_bmm_38889 0.0074 ms 97.0% + triton_bmm_38884 0.0076 ms 93.7% + triton_bmm_38885 0.0077 ms 92.9% + triton_bmm_38887 0.0079 ms 90.9% + triton_bmm_38890 0.0079 ms 90.3% + triton_bmm_38892 0.0087 ms 82.7% + triton_bmm_38891 0.0093 ms 77.2% + bmm 0.0538 ms 13.3% +SingleProcess AUTOTUNE takes 2.8034 seconds +AUTOTUNE bmm(8x1x64, 8x64x52) + triton_bmm_39641 0.0064 ms 100.0% + triton_bmm_39639 0.0065 ms 98.5% + triton_bmm_39640 0.0067 ms 96.6% + triton_bmm_39642 0.0069 ms 93.1% + triton_bmm_39643 0.0069 ms 93.1% + triton_bmm_39638 0.0071 ms 90.5% + triton_bmm_39645 0.0074 ms 86.6% + triton_bmm_39637 0.0077 ms 83.4% + triton_bmm_39644 0.0081 ms 79.4% + bmm 0.0564 ms 11.4% +SingleProcess AUTOTUNE takes 2.6401 seconds +AUTOTUNE bmm(8x1x52, 8x52x64) + triton_bmm_39658 0.0064 ms 100.0% + triton_bmm_39659 0.0069 ms 92.6% + triton_bmm_39657 0.0069 ms 92.6% + triton_bmm_39662 0.0069 ms 92.6% + triton_bmm_39660 0.0071 ms 90.1% + triton_bmm_39661 0.0072 ms 89.3% + triton_bmm_39663 0.0077 ms 83.3% + triton_bmm_39664 0.0085 ms 75.5% + triton_bmm_39665 0.0088 ms 72.7% + bmm 0.0631 ms 10.1% +SingleProcess AUTOTUNE takes 3.3242 seconds +TIMEOUT + loading model: 0it [00:00, ?it/s] loading model: 0it [00:16, ?it/s] +hf_T5_large +cuda eval hf_T5_large int8dynamic-bs1-acc +pass_due_to_skip + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +hf_Whisper +cuda eval hf_Whisper int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for hf_Whisper. Setting accuracy check to cosine +AUTOTUNE int_mm(1500x256, 256x256, 1500x256) + triton_mm_8 0.0101 ms 100.0% + triton_mm_3 0.0113 ms 89.0% + triton_mm_4 0.0116 ms 86.8% + triton_mm_5 0.0120 ms 84.2% + triton_mm_0 0.0120 ms 84.0% + triton_mm_2 0.0121 ms 83.3% + triton_mm_6 0.0122 ms 82.7% + triton_mm_1 0.0125 ms 80.6% + triton_mm_7 0.0168 ms 59.9% + triton_mm_10 0.0186 ms 54.3% +SingleProcess AUTOTUNE takes 7.1614 seconds +AUTOTUNE int_mm(1500x256, 256x1536, 1500x1536) + triton_mm_45 0.0170 ms 100.0% + triton_mm_46 0.0179 ms 94.8% + triton_mm_44 0.0183 ms 92.8% + triton_mm_52 0.0185 ms 91.9% + triton_mm_48 0.0190 ms 89.4% + triton_mm_53 0.0195 ms 87.2% + triton_mm_54 0.0197 ms 85.9% + triton_mm_47 0.0198 ms 85.8% + triton_mm_51 0.0204 ms 83.1% + triton_mm_49 0.0329 ms 51.5% +SingleProcess AUTOTUNE takes 7.4789 seconds +AUTOTUNE int_mm(1500x1536, 1536x256, 1500x256) + triton_mm_63 0.0196 ms 100.0% + triton_mm_58 0.0240 ms 81.9% + triton_mm_59 0.0250 ms 78.5% + triton_mm_60 0.0262 ms 75.0% + triton_mm_61 0.0262 ms 74.9% + triton_mm_56 0.0310 ms 63.4% + triton_mm_57 0.0310 ms 63.4% + triton_mm_65 0.0343 ms 57.3% + triton_mm_64 0.0343 ms 57.3% + triton_mm_55 0.0366 ms 53.6% +SingleProcess AUTOTUNE takes 7.3714 seconds +AUTOTUNE int_mm(1x256, 256x2, 1x2) + triton_mm_411 0.0072 ms 100.0% + triton_mm_412 0.0073 ms 98.7% + triton_mm_410 0.0075 ms 96.6% + triton_mm_409 0.0082 ms 88.2% + triton_mm_408 0.0083 ms 87.2% + triton_mm_407 0.0098 ms 73.5% +SingleProcess AUTOTUNE takes 1.8473 seconds +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:05, ?it/s] +WARNING:root:hf_clip failed to load +hf_clip +Original Error: 'str' object has no attribute 'shape' +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/transformers/models/clip/modeling_clip.py", line 1142, in forward + vision_outputs = self.vision_model( + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/transformers/models/clip/modeling_clip.py", line 883, in forward + hidden_states = self.embeddings(pixel_values) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/transformers/models/clip/modeling_clip.py", line 194, in forward + batch_size = pixel_values.shape[0] +AttributeError: 'str' object has no attribute 'shape' + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s] loading model: 0it [00:00, ?it/s] +lennard_jones +cuda eval lennard_jones int8dynamic-bs1-acc +AUTOTUNE mm(1x1, 1x16) + triton_mm_0 0.0058 ms 100.0% + triton_mm_1 0.0058 ms 100.0% + triton_mm_2 0.0058 ms 100.0% + triton_mm_4 0.0058 ms 100.0% + mm 0.0064 ms 91.0% + triton_mm_3 0.0065 ms 90.1% +SingleProcess AUTOTUNE takes 1.4291 seconds +AUTOTUNE mm(1x16, 16x16) + triton_mm_5 0.0058 ms 100.0% + triton_mm_6 0.0058 ms 100.0% + triton_mm_8 0.0058 ms 100.0% + mm 0.0062 ms 93.3% + triton_mm_7 0.0064 ms 91.0% + triton_mm_9 0.0065 ms 90.1% +SingleProcess AUTOTUNE takes 1.3339 seconds +AUTOTUNE addmm(1x1, 1x16, 16x1) + triton_mm_20 0.0056 ms 100.0% + triton_mm_21 0.0056 ms 100.0% + triton_mm_24 0.0056 ms 100.0% + triton_mm_22 0.0063 ms 89.3% + triton_mm_23 0.0063 ms 89.3% + addmm 0.0120 ms 46.5% +SingleProcess AUTOTUNE takes 1.3734 seconds +pass-sqnr-53.975 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +llama +cuda eval llama int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for llama. Setting accuracy check to cosine +pass-sqnr-37.083 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:52, ?it/s] +llama_v2_7b_16h +cuda eval llama_v2_7b_16h int8dynamic-bs1-acc +pass_due_to_skip + loading model: 0it [00:00, ?it/s] loading model: 0it [00:01, ?it/s] +maml_omniglot +cuda eval maml_omniglot int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for maml_omniglot. Setting accuracy check to cosine +pass-sqnr-46.636 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +mnasnet1_0 +cuda eval mnasnet1_0 int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for mnasnet1_0. Setting accuracy check to cosine +AUTOTUNE convolution(1x3x224x224, 32x3x3x3) + triton_convolution_4 0.0115 ms 100.0% + convolution 0.0131 ms 87.8% + triton_convolution_3 0.0138 ms 83.5% + triton_convolution_0 0.0146 ms 78.9% + triton_convolution_5 0.0187 ms 61.6% + triton_convolution_2 0.0252 ms 45.6% + triton_convolution_1 0.0304 ms 37.7% +SingleProcess AUTOTUNE takes 2.9246 seconds +AUTOTUNE mm(12544x32, 32x16) + triton_mm_6 0.0075 ms 100.0% + triton_mm_7 0.0075 ms 100.0% + triton_mm_8 0.0075 ms 100.0% + triton_mm_10 0.0075 ms 100.0% + triton_mm_11 0.0075 ms 100.0% + triton_mm_13 0.0080 ms 94.0% + triton_mm_9 0.0080 ms 93.6% + triton_mm_14 0.0082 ms 91.4% + triton_mm_12 0.0083 ms 91.1% + triton_mm_16 0.0085 ms 88.3% +SingleProcess AUTOTUNE takes 3.1633 seconds +AUTOTUNE mm(12544x16, 16x48) + triton_mm_18 0.0076 ms 100.0% + triton_mm_21 0.0076 ms 100.0% + triton_mm_24 0.0076 ms 100.0% + triton_mm_22 0.0078 ms 98.0% + triton_mm_26 0.0079 ms 96.8% + triton_mm_23 0.0079 ms 96.4% + triton_mm_27 0.0079 ms 96.4% + triton_mm_17 0.0081 ms 94.1% + triton_mm_25 0.0082 ms 93.7% + triton_mm_20 0.0082 ms 93.0% +SingleProcess AUTOTUNE takes 3.1635 seconds +AUTOTUNE mm(3136x48, 48x24) + triton_mm_36 0.0070 ms 100.0% + triton_mm_29 0.0072 ms 96.5% + triton_mm_32 0.0073 ms 96.0% + triton_mm_37 0.0073 ms 96.0% + triton_mm_31 0.0077 ms 90.6% + triton_mm_33 0.0077 ms 90.1% + triton_mm_34 0.0077 ms 90.1% + triton_mm_30 0.0078 ms 89.7% + triton_mm_28 0.0082 ms 85.5% + triton_mm_35 0.0085 ms 82.1% +SingleProcess AUTOTUNE takes 3.6912 seconds +AUTOTUNE mm(3136x24, 24x72) + triton_mm_40 0.0073 ms 100.0% + triton_mm_49 0.0074 ms 99.6% + triton_mm_51 0.0075 ms 97.4% + triton_mm_46 0.0076 ms 95.8% + triton_mm_41 0.0078 ms 94.2% + triton_mm_44 0.0078 ms 94.2% + triton_mm_42 0.0078 ms 93.9% + triton_mm_45 0.0078 ms 93.7% + triton_mm_48 0.0079 ms 92.7% + triton_mm_43 0.0080 ms 91.8% +SingleProcess AUTOTUNE takes 4.1231 seconds +AUTOTUNE mm(3136x72, 72x24) + triton_mm_57 0.0075 ms 100.0% + triton_mm_60 0.0077 ms 97.9% + triton_mm_56 0.0078 ms 96.7% + triton_mm_54 0.0079 ms 95.5% + triton_mm_55 0.0079 ms 94.8% + triton_mm_53 0.0081 ms 92.5% + triton_mm_52 0.0085 ms 88.3% + triton_mm_61 0.0086 ms 87.4% + triton_mm_58 0.0086 ms 87.0% + triton_mm_59 0.0096 ms 78.6% +SingleProcess AUTOTUNE takes 4.0099 seconds +AUTOTUNE mm(784x72, 72x40) + triton_mm_105 0.0072 ms 100.0% + triton_mm_103 0.0075 ms 96.8% + triton_mm_104 0.0078 ms 92.6% + triton_mm_106 0.0080 ms 90.4% + triton_mm_101 0.0080 ms 90.0% + triton_mm_108 0.0081 ms 89.2% + triton_mm_109 0.0083 ms 87.3% + triton_mm_102 0.0085 ms 85.4% + triton_mm_100 0.0085 ms 84.6% + mm 0.0087 ms 83.4% +SingleProcess AUTOTUNE takes 4.2482 seconds +AUTOTUNE mm(784x40, 40x120) + triton_mm_118 0.0070 ms 100.0% + triton_mm_120 0.0072 ms 96.5% + triton_mm_113 0.0075 ms 93.2% + mm 0.0076 ms 92.0% + triton_mm_117 0.0076 ms 91.6% + triton_mm_116 0.0078 ms 89.7% + triton_mm_121 0.0078 ms 89.7% + triton_mm_112 0.0081 ms 86.5% + triton_mm_115 0.0083 ms 84.5% + triton_mm_114 0.0083 ms 84.2% +SingleProcess AUTOTUNE takes 4.3814 seconds +AUTOTUNE mm(784x120, 120x40) + triton_mm_132 0.0077 ms 100.0% + triton_mm_127 0.0078 ms 99.6% + triton_mm_129 0.0080 ms 96.4% + triton_mm_130 0.0083 ms 93.8% + triton_mm_128 0.0083 ms 93.4% + triton_mm_133 0.0084 ms 91.7% + triton_mm_125 0.0085 ms 90.6% + mm 0.0086 ms 90.3% + triton_mm_126 0.0089 ms 87.1% + triton_mm_124 0.0102 ms 76.2% +SingleProcess AUTOTUNE takes 4.3142 seconds +AUTOTUNE mm(784x40, 40x240) + triton_mm_162 0.0075 ms 100.0% + triton_mm_169 0.0076 ms 99.6% + triton_mm_164 0.0078 ms 96.7% + triton_mm_165 0.0078 ms 96.7% + triton_mm_168 0.0078 ms 96.5% + triton_mm_160 0.0080 ms 93.4% + triton_mm_166 0.0081 ms 92.9% + triton_mm_161 0.0082 ms 91.4% + triton_mm_170 0.0083 ms 90.7% + triton_mm_163 0.0084 ms 90.0% +SingleProcess AUTOTUNE takes 4.5122 seconds +AUTOTUNE mm(196x240, 240x80) + triton_mm_181 0.0081 ms 100.0% + triton_mm_178 0.0088 ms 91.6% + triton_mm_177 0.0091 ms 88.7% + triton_mm_180 0.0091 ms 88.7% + triton_mm_174 0.0102 ms 78.8% + mm 0.0103 ms 78.3% + triton_mm_175 0.0103 ms 78.0% + triton_mm_176 0.0106 ms 75.9% + triton_mm_173 0.0108 ms 74.3% + triton_mm_172 0.0127 ms 63.3% +SingleProcess AUTOTUNE takes 4.8399 seconds +AUTOTUNE mm(196x80, 80x480) + triton_mm_190 0.0073 ms 100.0% + triton_mm_189 0.0080 ms 91.6% + triton_mm_185 0.0080 ms 90.8% + triton_mm_186 0.0080 ms 90.8% + triton_mm_187 0.0083 ms 88.0% + triton_mm_188 0.0083 ms 88.0% + triton_mm_192 0.0083 ms 88.0% + triton_mm_193 0.0086 ms 85.1% + triton_mm_195 0.0088 ms 83.2% + triton_mm_184 0.0091 ms 80.3% +SingleProcess AUTOTUNE takes 4.9695 seconds +AUTOTUNE mm(196x480, 480x80) + triton_mm_202 0.0096 ms 100.0% + triton_mm_205 0.0104 ms 92.9% + triton_mm_201 0.0106 ms 91.2% + triton_mm_204 0.0112 ms 86.2% + mm 0.0128 ms 75.4% + triton_mm_199 0.0129 ms 74.5% + triton_mm_200 0.0132 ms 72.7% + triton_mm_197 0.0140 ms 68.6% + triton_mm_198 0.0140 ms 68.6% + triton_mm_196 0.0190 ms 50.6% +SingleProcess AUTOTUNE takes 5.0042 seconds +AUTOTUNE mm(196x480, 480x96) + triton_mm_250 0.0104 ms 100.0% + triton_mm_249 0.0106 ms 98.2% + triton_mm_253 0.0109 ms 95.0% + triton_mm_252 0.0112 ms 92.8% + triton_mm_247 0.0123 ms 84.6% + mm 0.0131 ms 79.4% + triton_mm_248 0.0133 ms 78.1% + triton_mm_245 0.0135 ms 76.6% + triton_mm_246 0.0135 ms 76.6% + triton_mm_244 0.0193 ms 53.8% +SingleProcess AUTOTUNE takes 4.8429 seconds +AUTOTUNE mm(196x96, 96x576) + triton_mm_262 0.0078 ms 100.0% + triton_mm_261 0.0080 ms 97.8% + triton_mm_257 0.0080 ms 97.6% + triton_mm_258 0.0081 ms 97.2% + triton_mm_259 0.0083 ms 94.6% + triton_mm_260 0.0083 ms 94.6% + triton_mm_264 0.0084 ms 93.0% + triton_mm_256 0.0086 ms 91.4% + triton_mm_265 0.0091 ms 86.4% + triton_mm_267 0.0096 ms 81.4% +SingleProcess AUTOTUNE takes 4.5964 seconds +AUTOTUNE mm(196x576, 576x96) + triton_mm_274 0.0112 ms 100.0% + triton_mm_273 0.0114 ms 97.6% + triton_mm_276 0.0117 ms 95.4% + triton_mm_277 0.0119 ms 93.6% + mm 0.0134 ms 83.5% + triton_mm_271 0.0140 ms 79.5% + triton_mm_272 0.0148 ms 75.4% + triton_mm_269 0.0153 ms 73.0% + triton_mm_270 0.0157 ms 71.2% + triton_mm_268 0.0216 ms 51.6% +SingleProcess AUTOTUNE takes 5.0321 seconds +AUTOTUNE mm(49x576, 576x192) + mm 0.0140 ms 100.0% + triton_mm_301 0.0169 ms 82.4% + triton_mm_298 0.0203 ms 68.7% + triton_mm_300 0.0213 ms 65.4% + triton_mm_297 0.0217 ms 64.4% + triton_mm_296 0.0235 ms 59.5% + triton_mm_293 0.0253 ms 55.2% + triton_mm_295 0.0255 ms 54.6% + triton_mm_303 0.0273 ms 51.2% + triton_mm_292 0.0303 ms 46.0% +SingleProcess AUTOTUNE takes 4.4412 seconds +AUTOTUNE mm(49x192, 192x1152) + triton_mm_313 0.0109 ms 100.0% + triton_mm_309 0.0113 ms 96.5% + triton_mm_310 0.0115 ms 95.4% + triton_mm_312 0.0119 ms 91.6% + mm 0.0123 ms 88.7% + triton_mm_308 0.0123 ms 88.6% + triton_mm_307 0.0130 ms 84.1% + triton_mm_315 0.0130 ms 84.1% + triton_mm_305 0.0132 ms 82.5% + triton_mm_304 0.0142 ms 76.8% +SingleProcess AUTOTUNE takes 4.2583 seconds +AUTOTUNE mm(49x1152, 1152x192) + mm 0.0150 ms 100.0% + triton_mm_325 0.0269 ms 55.8% + triton_mm_322 0.0352 ms 42.7% + triton_mm_324 0.0365 ms 41.1% + triton_mm_321 0.0373 ms 40.2% + triton_mm_320 0.0410 ms 36.6% + triton_mm_317 0.0438 ms 34.2% + triton_mm_319 0.0449 ms 33.4% + triton_mm_327 0.0491 ms 30.6% + triton_mm_318 0.0569 ms 26.4% +SingleProcess AUTOTUNE takes 4.1256 seconds +AUTOTUNE mm(49x1152, 1152x320) + mm 0.0162 ms 100.0% + triton_mm_397 0.0268 ms 60.4% + triton_mm_394 0.0357 ms 45.3% + triton_mm_396 0.0365 ms 44.3% + triton_mm_393 0.0378 ms 42.8% + triton_mm_392 0.0414 ms 39.1% + triton_mm_389 0.0436 ms 37.2% + triton_mm_391 0.0454 ms 35.7% + triton_mm_399 0.0488 ms 33.2% + triton_mm_390 0.0561 ms 28.9% +SingleProcess AUTOTUNE takes 4.0931 seconds +AUTOTUNE mm(49x320, 320x1280) + triton_mm_409 0.0125 ms 100.0% + triton_mm_406 0.0148 ms 84.4% + triton_mm_408 0.0151 ms 82.8% + triton_mm_405 0.0152 ms 82.1% + triton_mm_404 0.0163 ms 76.5% + mm 0.0165 ms 75.7% + triton_mm_403 0.0173 ms 72.1% + triton_mm_401 0.0177 ms 70.7% + triton_mm_411 0.0180 ms 69.1% + triton_mm_400 0.0195 ms 64.0% +SingleProcess AUTOTUNE takes 4.2409 seconds +AUTOTUNE int_mm(1x1280, 1280x1000, 1x1000) + triton_mm_422 0.0127 ms 100.0% + triton_mm_421 0.0140 ms 90.4% + triton_mm_417 0.0154 ms 82.7% + triton_mm_418 0.0156 ms 81.4% + triton_mm_420 0.0157 ms 80.9% + triton_mm_416 0.0167 ms 76.2% + triton_mm_414 0.0232 ms 54.7% + triton_mm_413 0.0247 ms 51.5% + triton_mm_412 0.0350 ms 36.3% + triton_mm_415 0.0366 ms 34.7% +SingleProcess AUTOTUNE takes 3.8065 seconds +pass-sqnr-28.804 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +mobilenet_v2 +cuda eval mobilenet_v2 int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for mobilenet_v2. Setting accuracy check to cosine +AUTOTUNE mm(12544x16, 16x96) + triton_mm_26 0.0084 ms 100.0% + triton_mm_27 0.0084 ms 99.2% + triton_mm_20 0.0087 ms 96.0% + triton_mm_21 0.0087 ms 96.0% + triton_mm_18 0.0088 ms 95.6% + triton_mm_24 0.0089 ms 94.4% + triton_mm_22 0.0090 ms 93.6% + triton_mm_17 0.0092 ms 91.0% + triton_mm_19 0.0092 ms 91.0% + triton_mm_25 0.0095 ms 88.5% +SingleProcess AUTOTUNE takes 3.3252 seconds +AUTOTUNE mm(3136x96, 96x24) + triton_mm_31 0.0075 ms 100.0% + triton_mm_33 0.0076 ms 98.7% + triton_mm_36 0.0076 ms 98.3% + triton_mm_29 0.0079 ms 95.1% + triton_mm_30 0.0081 ms 92.5% + triton_mm_34 0.0083 ms 90.0% + triton_mm_37 0.0084 ms 89.7% + triton_mm_32 0.0084 ms 89.0% + triton_mm_35 0.0093 ms 80.6% + mm 0.0094 ms 79.6% +SingleProcess AUTOTUNE takes 3.8683 seconds +AUTOTUNE mm(3136x24, 24x144) + triton_mm_49 0.0078 ms 100.0% + triton_mm_44 0.0079 ms 98.8% + triton_mm_48 0.0079 ms 98.8% + triton_mm_45 0.0084 ms 92.4% + triton_mm_46 0.0084 ms 92.4% + triton_mm_51 0.0084 ms 92.4% + triton_mm_40 0.0084 ms 92.2% + triton_mm_43 0.0086 ms 90.2% + triton_mm_42 0.0087 ms 89.7% + triton_mm_41 0.0089 ms 87.1% +SingleProcess AUTOTUNE takes 3.9458 seconds +AUTOTUNE mm(3136x144, 144x24) + triton_mm_57 0.0081 ms 100.0% + triton_mm_55 0.0084 ms 96.9% + triton_mm_58 0.0084 ms 96.9% + triton_mm_60 0.0084 ms 96.9% + triton_mm_56 0.0086 ms 94.4% + triton_mm_53 0.0087 ms 93.7% + triton_mm_54 0.0091 ms 89.1% + triton_mm_61 0.0091 ms 89.1% + mm 0.0102 ms 79.6% + triton_mm_52 0.0110 ms 74.2% +SingleProcess AUTOTUNE takes 3.8103 seconds +AUTOTUNE mm(784x144, 144x32) + triton_mm_82 0.0073 ms 100.0% + triton_mm_81 0.0076 ms 95.8% + triton_mm_85 0.0079 ms 92.7% + triton_mm_84 0.0083 ms 88.4% + triton_mm_77 0.0083 ms 87.7% + triton_mm_79 0.0084 ms 86.7% + triton_mm_78 0.0086 ms 84.4% + triton_mm_80 0.0089 ms 81.7% + mm 0.0092 ms 78.9% + triton_mm_76 0.0104 ms 69.9% +SingleProcess AUTOTUNE takes 3.9510 seconds +AUTOTUNE mm(784x32, 32x192) + triton_mm_96 0.0068 ms 100.0% + triton_mm_97 0.0068 ms 100.0% + triton_mm_99 0.0073 ms 93.4% + triton_mm_91 0.0074 ms 93.0% + triton_mm_93 0.0074 ms 92.2% + triton_mm_94 0.0074 ms 92.2% + triton_mm_88 0.0076 ms 90.7% + triton_mm_89 0.0076 ms 89.9% + triton_mm_98 0.0079 ms 87.0% + mm 0.0080 ms 85.9% +SingleProcess AUTOTUNE takes 4.1696 seconds +AUTOTUNE mm(784x192, 192x32) + triton_mm_103 0.0081 ms 100.0% + triton_mm_108 0.0081 ms 100.0% + triton_mm_109 0.0081 ms 100.0% + triton_mm_106 0.0081 ms 99.4% + triton_mm_101 0.0086 ms 94.1% + triton_mm_104 0.0086 ms 94.1% + triton_mm_105 0.0086 ms 93.7% + triton_mm_102 0.0095 ms 85.5% + mm 0.0098 ms 83.0% + triton_mm_100 0.0113 ms 71.5% +SingleProcess AUTOTUNE takes 4.0790 seconds +AUTOTUNE mm(196x192, 192x64) + triton_mm_154 0.0079 ms 100.0% + triton_mm_157 0.0079 ms 100.0% + triton_mm_156 0.0084 ms 94.3% + triton_mm_153 0.0086 ms 91.1% + triton_mm_151 0.0091 ms 86.9% + triton_mm_152 0.0094 ms 84.0% + triton_mm_149 0.0094 ms 84.0% + triton_mm_150 0.0094 ms 83.7% + mm 0.0097 ms 81.2% + triton_mm_148 0.0117 ms 67.0% +SingleProcess AUTOTUNE takes 4.8166 seconds +AUTOTUNE mm(196x64, 64x384) + triton_mm_169 0.0073 ms 100.0% + triton_mm_160 0.0076 ms 95.8% + triton_mm_161 0.0076 ms 95.8% + triton_mm_162 0.0076 ms 95.8% + triton_mm_166 0.0076 ms 95.4% + triton_mm_165 0.0078 ms 93.1% + triton_mm_168 0.0079 ms 91.9% + triton_mm_171 0.0081 ms 89.8% + triton_mm_163 0.0084 ms 87.0% + triton_mm_164 0.0086 ms 84.9% +SingleProcess AUTOTUNE takes 4.7913 seconds +AUTOTUNE mm(196x384, 384x64) + triton_mm_178 0.0092 ms 100.0% + triton_mm_177 0.0100 ms 92.0% + triton_mm_181 0.0100 ms 92.0% + triton_mm_175 0.0104 ms 88.0% + triton_mm_180 0.0105 ms 87.2% + mm 0.0115 ms 79.9% + triton_mm_173 0.0119 ms 76.9% + triton_mm_176 0.0119 ms 76.9% + triton_mm_174 0.0129 ms 71.1% + triton_mm_172 0.0167 ms 54.7% +SingleProcess AUTOTUNE takes 4.4517 seconds +AUTOTUNE mm(196x384, 384x96) + triton_mm_253 0.0094 ms 100.0% + triton_mm_250 0.0098 ms 95.3% + triton_mm_249 0.0101 ms 93.2% + triton_mm_252 0.0106 ms 88.3% + mm 0.0117 ms 80.1% + triton_mm_247 0.0119 ms 78.8% + triton_mm_246 0.0124 ms 75.5% + triton_mm_248 0.0125 ms 75.1% + triton_mm_245 0.0127 ms 73.6% + triton_mm_244 0.0162 ms 57.8% +SingleProcess AUTOTUNE takes 5.3224 seconds +AUTOTUNE mm(49x576, 576x160) + mm 0.0128 ms 100.0% + triton_mm_325 0.0170 ms 75.7% + triton_mm_322 0.0209 ms 61.5% + triton_mm_324 0.0213 ms 60.3% + triton_mm_321 0.0215 ms 59.6% + triton_mm_320 0.0235 ms 54.7% + triton_mm_317 0.0253 ms 50.6% + triton_mm_319 0.0256 ms 50.2% + triton_mm_327 0.0276 ms 46.5% + triton_mm_316 0.0304 ms 42.3% +SingleProcess AUTOTUNE takes 4.5732 seconds +AUTOTUNE mm(49x160, 160x960) + triton_mm_334 0.0101 ms 100.0% + triton_mm_333 0.0104 ms 97.2% + triton_mm_332 0.0109 ms 92.7% + triton_mm_336 0.0114 ms 88.8% + mm 0.0114 ms 88.5% + triton_mm_337 0.0115 ms 88.3% + triton_mm_331 0.0119 ms 84.9% + triton_mm_339 0.0120 ms 84.0% + triton_mm_329 0.0122 ms 82.7% + triton_mm_328 0.0124 ms 81.4% +SingleProcess AUTOTUNE takes 4.3177 seconds +AUTOTUNE mm(49x960, 960x160) + mm 0.0146 ms 100.0% + triton_mm_349 0.0258 ms 56.5% + triton_mm_346 0.0300 ms 48.6% + triton_mm_348 0.0319 ms 45.7% + triton_mm_345 0.0322 ms 45.3% + triton_mm_344 0.0352 ms 41.5% + triton_mm_341 0.0375 ms 38.9% + triton_mm_343 0.0388 ms 37.6% + triton_mm_351 0.0418 ms 34.9% + triton_mm_342 0.0486 ms 30.0% +SingleProcess AUTOTUNE takes 4.2567 seconds +AUTOTUNE mm(49x960, 960x320) + mm 0.0158 ms 100.0% + triton_mm_397 0.0256 ms 61.9% + triton_mm_394 0.0307 ms 51.7% + triton_mm_396 0.0314 ms 50.4% + triton_mm_393 0.0324 ms 48.9% + triton_mm_392 0.0354 ms 44.7% + triton_mm_389 0.0375 ms 42.2% + triton_mm_391 0.0391 ms 40.5% + triton_mm_399 0.0422 ms 37.5% + triton_mm_390 0.0489 ms 32.4% +SingleProcess AUTOTUNE takes 4.0725 seconds +pass-sqnr-28.894 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:00, ?it/s] +WARNING:root:mobilenet_v2_quantized_qat failed to load +mobilenet_v2_quantized_qat +The eval test only supports CPU. +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 443, in load_model + benchmark = benchmark_cls( + File "/home/cdhernandez/local/benchmark/torchbenchmark/util/model.py", line 24, in __call__ + obj = type.__call__(cls, *args, **kwargs) + File "/home/cdhernandez/local/benchmark/torchbenchmark/models/mobilenet_v2_quantized_qat/__init__.py", line 21, in __init__ + raise NotImplementedError("The eval test only supports CPU.") +NotImplementedError: The eval test only supports CPU. + + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +mobilenet_v3_large +cuda eval mobilenet_v3_large int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for mobilenet_v3_large. Setting accuracy check to cosine +AUTOTUNE convolution(1x3x224x224, 16x3x3x3) + triton_convolution_0 0.0099 ms 100.0% + triton_convolution_3 0.0106 ms 93.2% + triton_convolution_4 0.0119 ms 83.3% + convolution 0.0124 ms 79.5% + triton_convolution_1 0.0136 ms 72.5% + triton_convolution_2 0.0250 ms 39.6% +SingleProcess AUTOTUNE takes 1.9581 seconds +AUTOTUNE mm(12544x16, 16x16) + triton_mm_5 0.0071 ms 100.0% + triton_mm_9 0.0071 ms 100.0% + triton_mm_10 0.0071 ms 100.0% + triton_mm_11 0.0071 ms 100.0% + triton_mm_8 0.0075 ms 94.4% + triton_mm_6 0.0076 ms 93.2% + triton_mm_7 0.0076 ms 93.2% + triton_mm_12 0.0076 ms 93.2% + triton_mm_14 0.0076 ms 92.9% + triton_mm_13 0.0076 ms 92.5% +SingleProcess AUTOTUNE takes 2.5650 seconds +AUTOTUNE mm(12544x16, 16x64) + triton_mm_16 0.0078 ms 100.0% + triton_mm_18 0.0078 ms 100.0% + triton_mm_22 0.0078 ms 100.0% + triton_mm_24 0.0079 ms 99.6% + triton_mm_15 0.0083 ms 95.0% + triton_mm_21 0.0083 ms 94.6% + triton_mm_23 0.0083 ms 94.0% + triton_mm_17 0.0084 ms 93.9% + triton_mm_19 0.0084 ms 93.9% + triton_mm_20 0.0084 ms 93.5% +SingleProcess AUTOTUNE takes 3.4918 seconds +AUTOTUNE mm(3136x64, 64x24) + triton_mm_29 0.0071 ms 100.0% + triton_mm_31 0.0071 ms 100.0% + triton_mm_28 0.0073 ms 96.9% + triton_mm_32 0.0073 ms 96.5% + triton_mm_30 0.0075 ms 94.4% + triton_mm_27 0.0076 ms 93.2% + triton_mm_34 0.0076 ms 92.5% + triton_mm_35 0.0076 ms 92.5% + triton_mm_26 0.0077 ms 91.3% + triton_mm_33 0.0081 ms 87.7% +SingleProcess AUTOTUNE takes 3.5267 seconds +AUTOTUNE addmm(1x24, 1x72, 72x24) + triton_mm_76 0.0073 ms 100.0% + triton_mm_77 0.0073 ms 99.6% + triton_mm_78 0.0074 ms 99.1% + triton_mm_74 0.0076 ms 96.6% + triton_mm_75 0.0076 ms 95.8% + triton_mm_79 0.0079 ms 92.7% + triton_mm_80 0.0087 ms 84.1% + triton_mm_81 0.0095 ms 77.0% + addmm 0.0114 ms 63.9% + bias_addmm 0.0381 ms 19.1% +SingleProcess AUTOTUNE takes 3.0739 seconds +AUTOTUNE addmm(1x72, 1x24, 24x72) + triton_mm_87 0.0063 ms 100.0% + triton_mm_90 0.0063 ms 100.0% + triton_mm_82 0.0063 ms 99.5% + triton_mm_88 0.0063 ms 99.5% + triton_mm_85 0.0070 ms 90.4% + triton_mm_84 0.0070 ms 90.0% + triton_mm_89 0.0070 ms 89.5% + triton_mm_86 0.0071 ms 89.1% + triton_mm_83 0.0071 ms 88.7% + triton_mm_92 0.0074 ms 85.7% +SingleProcess AUTOTUNE takes 3.9717 seconds +AUTOTUNE addmm(1x32, 1x120, 120x32) + triton_mm_119 0.0074 ms 100.0% + triton_mm_120 0.0074 ms 100.0% + triton_mm_121 0.0074 ms 100.0% + triton_mm_118 0.0076 ms 96.6% + triton_mm_122 0.0078 ms 94.7% + triton_mm_117 0.0081 ms 90.9% + triton_mm_123 0.0099 ms 74.4% + addmm 0.0105 ms 70.3% + triton_mm_124 0.0124 ms 59.3% + bias_addmm 0.0398 ms 18.5% +SingleProcess AUTOTUNE takes 2.7010 seconds +AUTOTUNE addmm(1x120, 1x32, 32x120) + triton_mm_129 0.0063 ms 100.0% + triton_mm_130 0.0063 ms 100.0% + triton_mm_133 0.0063 ms 100.0% + triton_mm_135 0.0068 ms 92.5% + triton_mm_125 0.0070 ms 90.4% + triton_mm_132 0.0070 ms 90.2% + triton_mm_127 0.0070 ms 90.0% + triton_mm_128 0.0070 ms 90.0% + triton_mm_134 0.0071 ms 89.6% + triton_mm_126 0.0071 ms 89.4% +SingleProcess AUTOTUNE takes 4.1532 seconds +AUTOTUNE mm(196x80, 80x200) + triton_mm_221 0.0073 ms 100.0% + triton_mm_220 0.0076 ms 96.6% + triton_mm_224 0.0078 ms 93.5% + triton_mm_216 0.0081 ms 90.5% + triton_mm_223 0.0083 ms 88.2% + triton_mm_217 0.0083 ms 88.1% + triton_mm_218 0.0084 ms 87.7% + triton_mm_219 0.0084 ms 87.7% + mm 0.0091 ms 80.4% + triton_mm_215 0.0091 ms 80.4% +SingleProcess AUTOTUNE takes 4.7826 seconds +AUTOTUNE mm(196x200, 200x80) + triton_mm_233 0.0081 ms 100.0% + triton_mm_236 0.0081 ms 100.0% + triton_mm_232 0.0084 ms 96.9% + triton_mm_231 0.0096 ms 84.3% + triton_mm_235 0.0096 ms 84.2% + triton_mm_229 0.0099 ms 82.1% + triton_mm_230 0.0101 ms 80.1% + triton_mm_228 0.0104 ms 78.1% + mm 0.0107 ms 76.0% + triton_mm_227 0.0119 ms 68.0% +SingleProcess AUTOTUNE takes 4.7992 seconds +AUTOTUNE mm(196x80, 80x184) + triton_mm_245 0.0078 ms 100.0% + triton_mm_248 0.0078 ms 99.4% + triton_mm_240 0.0081 ms 96.2% + triton_mm_244 0.0081 ms 96.2% + triton_mm_241 0.0084 ms 93.3% + triton_mm_247 0.0084 ms 92.9% + triton_mm_242 0.0088 ms 88.7% + mm 0.0089 ms 87.9% + triton_mm_243 0.0091 ms 85.4% + triton_mm_239 0.0092 ms 85.1% +SingleProcess AUTOTUNE takes 4.6633 seconds +AUTOTUNE mm(196x184, 184x80) + triton_mm_257 0.0081 ms 100.0% + triton_mm_260 0.0081 ms 100.0% + triton_mm_256 0.0084 ms 96.9% + triton_mm_259 0.0086 ms 94.1% + triton_mm_254 0.0094 ms 86.3% + mm 0.0100 ms 80.8% + triton_mm_252 0.0101 ms 80.3% + triton_mm_253 0.0101 ms 80.1% + triton_mm_255 0.0101 ms 79.9% + triton_mm_251 0.0116 ms 69.5% +SingleProcess AUTOTUNE takes 4.7957 seconds +AUTOTUNE addmm(1x120, 1x480, 480x120) + triton_mm_305 0.0096 ms 100.0% + triton_mm_304 0.0098 ms 98.4% + triton_mm_303 0.0104 ms 92.6% + triton_mm_307 0.0104 ms 92.6% + triton_mm_308 0.0107 ms 90.1% + triton_mm_302 0.0114 ms 84.0% + triton_mm_301 0.0119 ms 80.9% + triton_mm_300 0.0129 ms 74.4% + addmm 0.0130 ms 73.9% + triton_mm_299 0.0179 ms 53.8% +SingleProcess AUTOTUNE takes 4.0999 seconds +AUTOTUNE addmm(1x480, 1x120, 120x480) + triton_mm_319 0.0074 ms 100.0% + triton_mm_316 0.0076 ms 97.5% + triton_mm_317 0.0080 ms 92.6% + triton_mm_315 0.0081 ms 91.7% + triton_mm_313 0.0083 ms 89.2% + triton_mm_320 0.0086 ms 86.2% + triton_mm_312 0.0086 ms 86.1% + triton_mm_314 0.0087 ms 85.0% + triton_mm_311 0.0089 ms 83.8% + triton_mm_318 0.0101 ms 73.2% +SingleProcess AUTOTUNE takes 4.1146 seconds +AUTOTUNE mm(196x480, 480x112) + triton_mm_329 0.0099 ms 100.0% + triton_mm_328 0.0106 ms 92.9% + triton_mm_332 0.0109 ms 90.3% + triton_mm_331 0.0118 ms 83.5% + triton_mm_326 0.0124 ms 79.4% + triton_mm_327 0.0129 ms 76.4% + mm 0.0131 ms 75.5% + triton_mm_324 0.0137 ms 72.1% + triton_mm_325 0.0137 ms 72.1% + triton_mm_323 0.0189 ms 52.0% +SingleProcess AUTOTUNE takes 4.8155 seconds +AUTOTUNE mm(196x112, 112x672) + triton_mm_341 0.0077 ms 100.0% + triton_mm_343 0.0084 ms 92.0% + triton_mm_336 0.0086 ms 89.2% + triton_mm_340 0.0086 ms 88.9% + triton_mm_344 0.0086 ms 88.9% + triton_mm_339 0.0088 ms 87.0% + triton_mm_337 0.0094 ms 81.9% + triton_mm_338 0.0094 ms 81.9% + triton_mm_335 0.0100 ms 77.2% + mm 0.0100 ms 76.7% +SingleProcess AUTOTUNE takes 4.6048 seconds +AUTOTUNE addmm(1x168, 1x672, 672x168) + triton_mm_352 0.0109 ms 100.0% + triton_mm_353 0.0114 ms 95.4% + addmm 0.0115 ms 94.6% + triton_mm_355 0.0117 ms 93.0% + triton_mm_356 0.0121 ms 89.6% + triton_mm_351 0.0124 ms 87.5% + triton_mm_350 0.0129 ms 84.2% + triton_mm_349 0.0141 ms 76.8% + triton_mm_348 0.0150 ms 72.2% + triton_mm_347 0.0222 ms 48.8% +SingleProcess AUTOTUNE takes 4.0443 seconds +AUTOTUNE addmm(1x672, 1x168, 168x672) + triton_mm_365 0.0079 ms 100.0% + triton_mm_364 0.0081 ms 97.2% + triton_mm_368 0.0081 ms 97.2% + triton_mm_367 0.0086 ms 91.4% + triton_mm_363 0.0089 ms 88.8% + triton_mm_361 0.0091 ms 86.3% + triton_mm_362 0.0094 ms 84.0% + triton_mm_360 0.0096 ms 82.0% + triton_mm_359 0.0111 ms 71.1% + triton_mm_366 0.0119 ms 66.1% +SingleProcess AUTOTUNE takes 4.0570 seconds +AUTOTUNE mm(196x672, 672x112) + triton_mm_377 0.0111 ms 100.0% + triton_mm_376 0.0114 ms 97.8% + triton_mm_380 0.0131 ms 84.9% + mm 0.0135 ms 82.3% + triton_mm_379 0.0136 ms 81.7% + triton_mm_374 0.0147 ms 76.0% + triton_mm_375 0.0152 ms 73.4% + triton_mm_373 0.0169 ms 65.8% + triton_mm_372 0.0170 ms 65.7% + triton_mm_371 0.0247 ms 45.1% +SingleProcess AUTOTUNE takes 5.2522 seconds +AUTOTUNE mm(49x672, 672x160) + mm 0.0138 ms 100.0% + triton_mm_425 0.0235 ms 58.8% + triton_mm_428 0.0236 ms 58.5% + triton_mm_424 0.0244 ms 56.8% + triton_mm_427 0.0261 ms 53.0% + triton_mm_423 0.0265 ms 52.1% + triton_mm_420 0.0280 ms 49.3% + triton_mm_422 0.0290 ms 47.6% + triton_mm_430 0.0318 ms 43.5% + triton_mm_419 0.0354 ms 39.1% +SingleProcess AUTOTUNE takes 4.8817 seconds +AUTOTUNE addmm(1x240, 1x960, 960x240) + addmm 0.0127 ms 100.0% + triton_mm_448 0.0129 ms 98.5% + triton_mm_449 0.0134 ms 95.0% + triton_mm_451 0.0139 ms 91.7% + triton_mm_452 0.0141 ms 89.8% + triton_mm_447 0.0144 ms 88.2% + triton_mm_446 0.0159 ms 79.7% + triton_mm_445 0.0187 ms 68.0% + triton_mm_444 0.0196 ms 64.8% + triton_mm_443 0.0298 ms 42.6% +SingleProcess AUTOTUNE takes 4.9714 seconds +AUTOTUNE addmm(1x960, 1x240, 240x960) + triton_mm_461 0.0086 ms 100.0% + triton_mm_460 0.0088 ms 97.5% + triton_mm_463 0.0088 ms 97.5% + triton_mm_459 0.0093 ms 92.1% + triton_mm_464 0.0094 ms 91.8% + triton_mm_457 0.0101 ms 85.1% + triton_mm_458 0.0101 ms 85.1% + triton_mm_456 0.0106 ms 81.3% + triton_mm_455 0.0124 ms 69.5% + addmm 0.0133 ms 64.5% +SingleProcess AUTOTUNE takes 4.4898 seconds +AUTOTUNE int_mm(1x960, 960x1280, 1x1280) + triton_mm_549 0.0121 ms 100.0% + triton_mm_548 0.0129 ms 93.8% + triton_mm_547 0.0133 ms 90.9% + triton_mm_544 0.0134 ms 90.7% + triton_mm_545 0.0139 ms 87.3% + triton_mm_543 0.0147 ms 82.8% + triton_mm_541 0.0189 ms 64.0% + triton_mm_540 0.0204 ms 59.3% + triton_mm_539 0.0282 ms 43.0% + triton_mm_542 0.0287 ms 42.3% +SingleProcess AUTOTUNE takes 3.5884 seconds +pass-sqnr-27.481 + loading model: 0it [00:00, ?it/s]NCCL version 2.19.3+cuda12.0 + loading model: 0it [00:03, ?it/s] +moco +cuda eval moco int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for moco. Setting accuracy check to cosine +ERROR:common:add_(): argument 'other' (position 1) must be Tensor, not NoneType +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 2156, in check_accuracy + correct_result = self.run_n_iterations( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1972, in run_n_iterations + self.model_iter_fn(mod, inputs, collect_outputs=False) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/parallel/distributed.py", line 1523, in forward + else self._run_ddp_forward(*inputs, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/parallel/distributed.py", line 1359, in _run_ddp_forward + return self.module(*inputs, **kwargs) # type: ignore[index] + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/benchmark/torchbenchmark/models/moco/moco/builder.py", line 130, in forward + self._momentum_update_key_encoder() # update the key encoder + File "/home/cdhernandez/local/pytorch/torch/utils/_contextlib.py", line 115, in decorate_context + return func(*args, **kwargs) + File "/home/cdhernandez/local/benchmark/torchbenchmark/models/moco/moco/builder.py", line 50, in _momentum_update_key_encoder + param_k.mul_(self.m).add_(param_q.mul(1. - self.m)) +TypeError: add_(): argument 'other' (position 1) must be Tensor, not NoneType +eager_1st_run_fail + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +nanogpt +number of parameters: 123.69M +num decayed parameter tensors: 50, with 124,354,560 parameters +num non-decayed parameter tensors: 98, with 121,344 parameters +using fused AdamW: True +cuda eval nanogpt int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for nanogpt. Setting accuracy check to cosine +AUTOTUNE int_mm(64x768, 768x2304, 64x2304) + triton_mm_10 0.0128 ms 100.0% + triton_mm_6 0.0135 ms 94.5% + triton_mm_8 0.0138 ms 92.6% + triton_mm_5 0.0142 ms 90.1% + triton_mm_4 0.0158 ms 80.8% + triton_mm_9 0.0158 ms 80.8% + triton_mm_3 0.0174 ms 73.2% + triton_mm_2 0.0183 ms 69.6% + triton_mm_1 0.0199 ms 64.1% + triton_mm_0 0.0243 ms 52.5% +SingleProcess AUTOTUNE takes 4.8751 seconds +AUTOTUNE int_mm(64x768, 768x768, 64x768) + triton_mm_21 0.0122 ms 100.0% + triton_mm_16 0.0130 ms 93.8% + triton_mm_19 0.0132 ms 92.0% + triton_mm_17 0.0133 ms 91.4% + triton_mm_15 0.0148 ms 82.3% + triton_mm_20 0.0157 ms 77.6% + triton_mm_14 0.0168 ms 72.4% + triton_mm_13 0.0177 ms 69.0% + triton_mm_12 0.0192 ms 63.4% + triton_mm_11 0.0236 ms 51.6% +SingleProcess AUTOTUNE takes 4.8643 seconds +AUTOTUNE int_mm(64x768, 768x3072, 64x3072) + triton_mm_32 0.0136 ms 100.0% + triton_mm_30 0.0140 ms 97.1% + triton_mm_27 0.0143 ms 95.1% + triton_mm_28 0.0145 ms 93.8% + triton_mm_31 0.0159 ms 85.5% + triton_mm_26 0.0161 ms 84.7% + triton_mm_25 0.0177 ms 77.0% + triton_mm_24 0.0187 ms 72.8% + triton_mm_23 0.0206 ms 65.9% + triton_mm_22 0.0239 ms 57.0% +SingleProcess AUTOTUNE takes 5.4456 seconds +AUTOTUNE int_mm(64x3072, 3072x768, 64x768) + triton_mm_43 0.0241 ms 100.0% + triton_mm_38 0.0295 ms 81.6% + triton_mm_39 0.0300 ms 80.2% + triton_mm_41 0.0301 ms 79.8% + triton_mm_42 0.0306 ms 78.7% + triton_mm_37 0.0379 ms 63.5% + triton_mm_36 0.0428 ms 56.2% + triton_mm_35 0.0498 ms 48.4% + triton_mm_34 0.0540 ms 44.5% + triton_mm_33 0.0780 ms 30.9% +SingleProcess AUTOTUNE takes 4.9506 seconds +AUTOTUNE int_mm(1x768, 768x50304, 1x50304) + triton_mm_538 0.0451 ms 100.0% + triton_mm_537 0.0454 ms 99.5% + triton_mm_536 0.0468 ms 96.4% + triton_mm_533 0.0534 ms 84.5% + triton_mm_529 0.0542 ms 83.2% + triton_mm_534 0.0544 ms 82.9% + triton_mm_532 0.0547 ms 82.5% + triton_mm_531 0.0556 ms 81.2% + triton_mm_530 0.0564 ms 80.0% + triton_mm_528 0.0568 ms 79.5% +SingleProcess AUTOTUNE takes 4.0906 seconds +pass-sqnr-29.506 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +nvidia_deeprecommender +cuda eval nvidia_deeprecommender int8dynamic-bs1-acc +AUTOTUNE mm(1x197951, 197951x512) + mm 0.2257 ms 100.0% + triton_mm_8 3.5181 ms 6.4% + triton_mm_9 3.6999 ms 6.1% + triton_mm_5 5.3036 ms 4.3% + triton_mm_6 5.7490 ms 3.9% + triton_mm_2 5.7502 ms 3.9% + triton_mm_4 5.7510 ms 3.9% + triton_mm_0 5.7522 ms 3.9% + triton_mm_1 6.8611 ms 3.3% + triton_mm_3 6.9066 ms 3.3% +SingleProcess AUTOTUNE takes 4.8850 seconds +AUTOTUNE mm(1x512, 512x512) + mm 0.0092 ms 100.0% + triton_mm_17 0.0097 ms 94.7% + triton_mm_18 0.0097 ms 94.7% + triton_mm_16 0.0102 ms 89.9% + triton_mm_20 0.0104 ms 87.7% + triton_mm_21 0.0106 ms 86.1% + triton_mm_15 0.0112 ms 81.7% + triton_mm_14 0.0130 ms 70.4% + triton_mm_13 0.0138 ms 66.5% + triton_mm_12 0.0189 ms 48.5% +SingleProcess AUTOTUNE takes 3.7706 seconds +AUTOTUNE mm(1x512, 512x1024) + mm 0.0092 ms 100.0% + triton_mm_29 0.0101 ms 90.9% + triton_mm_30 0.0104 ms 88.3% + triton_mm_32 0.0104 ms 88.3% + triton_mm_33 0.0107 ms 86.2% + triton_mm_28 0.0109 ms 84.2% + triton_mm_27 0.0115 ms 80.4% + triton_mm_26 0.0130 ms 70.9% + triton_mm_25 0.0140 ms 65.6% + triton_mm_24 0.0194 ms 47.4% +SingleProcess AUTOTUNE takes 3.6148 seconds +AUTOTUNE mm(1x1024, 1024x512) + mm 0.0094 ms 100.0% + triton_mm_41 0.0133 ms 70.8% + triton_mm_44 0.0137 ms 68.9% + triton_mm_42 0.0138 ms 68.4% + triton_mm_45 0.0142 ms 66.1% + triton_mm_40 0.0148 ms 63.6% + triton_mm_39 0.0160 ms 58.8% + triton_mm_38 0.0196 ms 48.0% + triton_mm_37 0.0207 ms 45.4% + triton_mm_36 0.0314 ms 30.0% +SingleProcess AUTOTUNE takes 3.9268 seconds +AUTOTUNE mm(1x512, 512x197951) + triton_mm_61 0.1446 ms 100.0% + triton_mm_64 0.1448 ms 99.9% + triton_mm_66 0.1451 ms 99.7% + triton_mm_62 0.1455 ms 99.4% + triton_mm_63 0.1459 ms 99.1% + mm 0.1469 ms 98.4% + triton_mm_60 0.1897 ms 76.3% + triton_mm_67 0.1903 ms 76.0% + triton_mm_71 0.1923 ms 75.2% + triton_mm_70 0.1978 ms 73.1% +SingleProcess AUTOTUNE takes 3.9520 seconds +pass-sqnr-41.873 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:01, ?it/s] +opacus_cifar10 +cuda eval opacus_cifar10 int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for opacus_cifar10. Setting accuracy check to cosine +AUTOTUNE int_mm(1x512, 512x10, 1x10) + triton_mm_114 0.0074 ms 100.0% + triton_mm_112 0.0085 ms 86.8% + triton_mm_113 0.0088 ms 83.6% + triton_mm_111 0.0092 ms 79.6% + triton_mm_110 0.0105 ms 69.9% + triton_mm_109 0.0129 ms 57.1% +SingleProcess AUTOTUNE takes 2.2391 seconds +pass-sqnr-36.409 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:24, ?it/s] +phi_1_5 +cuda eval phi_1_5 int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for phi_1_5. Setting accuracy check to cosine +AUTOTUNE int_mm(512x2048, 2048x6144, 512x6144) + triton_mm_9 0.0426 ms 100.0% + triton_mm_10 0.0433 ms 98.4% + triton_mm_8 0.0712 ms 59.9% + triton_mm_1 0.0761 ms 56.0% + triton_mm_7 0.0771 ms 55.3% + triton_mm_2 0.0794 ms 53.7% + triton_mm_4 0.0804 ms 53.0% + triton_mm_3 0.0806 ms 52.9% + triton_mm_0 0.1170 ms 36.4% + triton_mm_5 0.1873 ms 22.8% +SingleProcess AUTOTUNE takes 7.3026 seconds +AUTOTUNE bmm(32x512x64, 32x64x512) + bmm 0.0797 ms 100.0% + triton_bmm_13 0.1091 ms 73.0% + triton_bmm_12 0.1211 ms 65.8% + triton_bmm_18 0.1300 ms 61.3% + triton_bmm_11 0.1477 ms 54.0% + triton_bmm_21 0.1524 ms 52.3% + triton_bmm_14 0.1788 ms 44.6% + triton_bmm_15 0.1791 ms 44.5% + triton_bmm_19 0.2580 ms 30.9% + triton_bmm_20 0.2829 ms 28.2% +SingleProcess AUTOTUNE takes 10.6880 seconds +AUTOTUNE bmm(32x512x512, 32x512x64) + triton_bmm_24 0.0260 ms 100.0% + triton_bmm_25 0.0267 ms 97.5% + triton_bmm_31 0.0281 ms 92.7% + triton_bmm_27 0.0286 ms 91.1% + triton_bmm_26 0.0286 ms 90.9% + triton_bmm_29 0.0292 ms 89.3% + triton_bmm_28 0.0297 ms 87.6% + bmm 0.0304 ms 85.6% + triton_bmm_32 0.0330 ms 78.8% + triton_bmm_23 0.0333 ms 78.2% +SingleProcess AUTOTUNE takes 4.6111 seconds +AUTOTUNE int_mm(512x2048, 2048x2048, 512x2048) + triton_mm_43 0.0316 ms 100.0% + triton_mm_44 0.0402 ms 78.6% + triton_mm_45 0.0410 ms 77.0% + triton_mm_36 0.0414 ms 76.1% + triton_mm_37 0.0420 ms 75.0% + triton_mm_38 0.0420 ms 75.0% + triton_mm_39 0.0452 ms 69.8% + triton_mm_35 0.0524 ms 60.3% + triton_mm_40 0.0675 ms 46.8% + triton_mm_41 0.0678 ms 46.6% +SingleProcess AUTOTUNE takes 7.4139 seconds +AUTOTUNE int_mm(512x2048, 2048x8192, 512x8192) + triton_mm_56 0.0775 ms 100.0% + triton_mm_55 0.0779 ms 99.6% + triton_mm_54 0.0862 ms 89.9% + triton_mm_47 0.0999 ms 77.6% + triton_mm_50 0.1005 ms 77.1% + triton_mm_49 0.1012 ms 76.6% + triton_mm_48 0.1016 ms 76.3% + triton_mm_46 0.1347 ms 57.6% + triton_mm_53 0.1395 ms 55.6% + triton_mm_51 0.2359 ms 32.9% +SingleProcess AUTOTUNE takes 7.2124 seconds +AUTOTUNE int_mm(512x8192, 8192x2048, 512x2048) + triton_mm_65 0.0965 ms 100.0% + triton_mm_66 0.1144 ms 84.4% + triton_mm_67 0.1155 ms 83.6% + triton_mm_60 0.1382 ms 69.8% + triton_mm_58 0.1387 ms 69.6% + triton_mm_59 0.1403 ms 68.8% + triton_mm_61 0.1476 ms 65.4% + triton_mm_57 0.1874 ms 51.5% + triton_mm_62 0.2335 ms 41.3% + triton_mm_63 0.2339 ms 41.3% +SingleProcess AUTOTUNE takes 7.2106 seconds +AUTOTUNE int_mm(512x2048, 2048x51200, 512x51200) + triton_mm_1641 0.2900 ms 100.0% + triton_mm_1642 0.2913 ms 99.5% + triton_mm_1640 0.4703 ms 61.7% + triton_mm_1633 0.4884 ms 59.4% + triton_mm_1634 0.5009 ms 57.9% + triton_mm_1635 0.5342 ms 54.3% + triton_mm_1636 0.5392 ms 53.8% + triton_mm_1639 0.5481 ms 52.9% + triton_mm_1632 0.6509 ms 44.5% + triton_mm_1637 1.4326 ms 20.2% +SingleProcess AUTOTUNE takes 7.4048 seconds +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:01, ?it/s] +phlippe_densenet +cuda eval phlippe_densenet int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for phlippe_densenet. Setting accuracy check to cosine +AUTOTUNE convolution(1x3x32x32, 32x3x3x3) + convolution 0.0103 ms 100.0% + triton_convolution_4 0.0108 ms 95.5% + triton_convolution_0 0.0114 ms 90.2% + triton_convolution_3 0.0119 ms 86.5% + triton_convolution_5 0.0148 ms 69.3% + triton_convolution_2 0.0198 ms 51.9% + triton_convolution_1 0.0224 ms 45.9% +SingleProcess AUTOTUNE takes 2.4786 seconds +AUTOTUNE mm(1024x32, 32x32) + triton_mm_9 0.0067 ms 100.0% + triton_mm_14 0.0067 ms 100.0% + triton_mm_15 0.0068 ms 97.2% + triton_mm_8 0.0069 ms 96.7% + triton_mm_13 0.0069 ms 96.7% + triton_mm_12 0.0070 ms 95.4% + triton_mm_6 0.0070 ms 94.5% + triton_mm_11 0.0072 ms 93.1% + triton_mm_7 0.0072 ms 92.4% + triton_mm_10 0.0074 ms 89.7% +SingleProcess AUTOTUNE takes 3.5111 seconds +AUTOTUNE convolution(1x32x32x32, 16x32x3x3) + convolution 0.0110 ms 100.0% + triton_convolution_22 0.0170 ms 65.0% + triton_convolution_21 0.0170 ms 64.9% + triton_convolution_18 0.0177 ms 62.3% + triton_convolution_23 0.0239 ms 46.1% + triton_convolution_19 0.0310 ms 35.6% + triton_convolution_20 0.0756 ms 14.6% +SingleProcess AUTOTUNE takes 2.6112 seconds +AUTOTUNE mm(1024x48, 48x32) + triton_mm_30 0.0067 ms 100.0% + triton_mm_25 0.0067 ms 99.5% + triton_mm_26 0.0069 ms 96.7% + triton_mm_32 0.0069 ms 96.7% + triton_mm_28 0.0073 ms 91.2% + triton_mm_33 0.0074 ms 89.7% + triton_mm_29 0.0075 ms 89.3% + triton_mm_27 0.0075 ms 88.7% + triton_mm_24 0.0079 ms 84.4% + triton_mm_31 0.0080 ms 83.2% +SingleProcess AUTOTUNE takes 3.7552 seconds +AUTOTUNE mm(1024x64, 64x32) + triton_mm_48 0.0067 ms 100.0% + triton_mm_51 0.0067 ms 100.0% + triton_mm_47 0.0069 ms 96.7% + triton_mm_50 0.0069 ms 96.7% + triton_mm_43 0.0074 ms 89.7% + triton_mm_45 0.0075 ms 89.3% + mm 0.0076 ms 87.8% + triton_mm_44 0.0076 ms 87.2% + triton_mm_46 0.0078 ms 85.6% + triton_mm_42 0.0080 ms 82.7% +SingleProcess AUTOTUNE takes 3.6139 seconds +AUTOTUNE mm(1024x80, 80x32) + triton_mm_65 0.0071 ms 100.0% + triton_mm_69 0.0073 ms 96.9% + triton_mm_66 0.0074 ms 95.7% + triton_mm_64 0.0076 ms 94.1% + triton_mm_68 0.0077 ms 91.7% + triton_mm_63 0.0078 ms 91.5% + triton_mm_61 0.0079 ms 89.9% + mm 0.0080 ms 88.4% + triton_mm_60 0.0083 ms 86.0% + triton_mm_62 0.0083 ms 85.4% +SingleProcess AUTOTUNE takes 4.1125 seconds +AUTOTUNE mm(1024x96, 96x32) + triton_mm_84 0.0068 ms 100.0% + triton_mm_81 0.0071 ms 96.4% + triton_mm_86 0.0073 ms 93.9% + triton_mm_79 0.0076 ms 90.7% + triton_mm_83 0.0076 ms 89.5% + triton_mm_87 0.0081 ms 84.8% + triton_mm_82 0.0081 ms 84.6% + triton_mm_80 0.0083 ms 82.3% + triton_mm_78 0.0085 ms 80.8% + triton_mm_85 0.0089 ms 76.7% +SingleProcess AUTOTUNE takes 3.8938 seconds +AUTOTUNE mm(1024x112, 112x32) + triton_mm_104 0.0071 ms 100.0% + triton_mm_102 0.0075 ms 95.7% + triton_mm_97 0.0078 ms 91.8% + triton_mm_105 0.0080 ms 89.2% + triton_mm_101 0.0081 ms 88.1% + triton_mm_98 0.0082 ms 86.8% + mm 0.0083 ms 86.1% + triton_mm_99 0.0083 ms 86.1% + triton_mm_100 0.0085 ms 83.7% + triton_mm_96 0.0092 ms 78.0% +SingleProcess AUTOTUNE takes 4.5198 seconds +AUTOTUNE mm(1024x128, 128x64) + triton_mm_120 0.0076 ms 100.0% + triton_mm_122 0.0076 ms 100.0% + triton_mm_115 0.0080 ms 94.4% + triton_mm_119 0.0081 ms 93.3% + mm 0.0083 ms 91.5% + triton_mm_123 0.0083 ms 90.8% + triton_mm_117 0.0084 ms 90.4% + triton_mm_118 0.0089 ms 84.7% + triton_mm_116 0.0090 ms 83.7% + triton_mm_114 0.0104 ms 72.8% +SingleProcess AUTOTUNE takes 4.4307 seconds +AUTOTUNE mm(256x64, 64x32) + triton_mm_127 0.0067 ms 100.0% + triton_mm_132 0.0070 ms 95.4% + triton_mm_135 0.0070 ms 95.4% + triton_mm_134 0.0072 ms 92.4% + triton_mm_126 0.0073 ms 90.8% + triton_mm_131 0.0073 ms 90.6% + triton_mm_129 0.0074 ms 90.0% + triton_mm_128 0.0074 ms 89.7% + triton_mm_130 0.0076 ms 87.6% + triton_mm_133 0.0083 ms 80.3% +SingleProcess AUTOTUNE takes 3.9367 seconds +AUTOTUNE convolution(1x32x16x16, 16x32x3x3) + convolution 0.0110 ms 100.0% + triton_convolution_142 0.0167 ms 65.6% + triton_convolution_141 0.0175 ms 62.6% + triton_convolution_138 0.0178 ms 61.8% + triton_convolution_143 0.0244 ms 45.0% + triton_convolution_140 0.0287 ms 38.2% + triton_convolution_139 0.0319 ms 34.4% +SingleProcess AUTOTUNE takes 2.2573 seconds +AUTOTUNE mm(256x80, 80x32) + triton_mm_149 0.0071 ms 100.0% + triton_mm_145 0.0071 ms 99.5% + triton_mm_152 0.0071 ms 99.5% + triton_mm_150 0.0072 ms 98.2% + triton_mm_153 0.0073 ms 96.5% + triton_mm_147 0.0076 ms 93.4% + triton_mm_146 0.0080 ms 88.8% + triton_mm_148 0.0081 ms 87.4% + triton_mm_144 0.0083 ms 85.2% + triton_mm_151 0.0085 ms 83.4% +SingleProcess AUTOTUNE takes 4.0959 seconds +AUTOTUNE mm(256x96, 96x32) + triton_mm_167 0.0069 ms 100.0% + triton_mm_168 0.0072 ms 96.6% + triton_mm_165 0.0074 ms 93.1% + triton_mm_166 0.0075 ms 91.9% + triton_mm_164 0.0076 ms 91.5% + triton_mm_170 0.0076 ms 90.4% + triton_mm_163 0.0077 ms 89.6% + triton_mm_171 0.0079 ms 87.8% + triton_mm_162 0.0086 ms 80.3% + mm 0.0087 ms 79.4% +SingleProcess AUTOTUNE takes 3.7826 seconds +AUTOTUNE mm(256x112, 112x32) + triton_mm_186 0.0072 ms 100.0% + triton_mm_183 0.0073 ms 98.7% + triton_mm_188 0.0076 ms 94.6% + triton_mm_184 0.0078 ms 93.0% + triton_mm_189 0.0078 ms 93.0% + triton_mm_185 0.0078 ms 92.2% + triton_mm_181 0.0079 ms 91.5% + triton_mm_182 0.0079 ms 91.1% + mm 0.0089 ms 81.0% + triton_mm_180 0.0092 ms 78.2% +SingleProcess AUTOTUNE takes 3.9557 seconds +AUTOTUNE mm(256x128, 128x32) + triton_mm_206 0.0071 ms 100.0% + triton_mm_204 0.0072 ms 98.2% + triton_mm_203 0.0073 ms 96.9% + triton_mm_199 0.0076 ms 94.1% + triton_mm_201 0.0076 ms 94.1% + triton_mm_202 0.0078 ms 91.4% + triton_mm_207 0.0081 ms 87.7% + triton_mm_200 0.0083 ms 85.4% + mm 0.0092 ms 76.8% + triton_mm_198 0.0096 ms 74.4% +SingleProcess AUTOTUNE takes 3.9459 seconds +AUTOTUNE mm(256x144, 144x32) + triton_mm_222 0.0071 ms 100.0% + triton_mm_224 0.0076 ms 93.6% + triton_mm_225 0.0076 ms 93.6% + triton_mm_219 0.0078 ms 90.9% + triton_mm_217 0.0080 ms 88.4% + triton_mm_220 0.0080 ms 88.4% + triton_mm_221 0.0081 ms 87.4% + triton_mm_218 0.0084 ms 83.7% + mm 0.0092 ms 77.3% + triton_mm_216 0.0096 ms 73.9% +SingleProcess AUTOTUNE takes 3.8555 seconds +AUTOTUNE mm(256x160, 160x80) + triton_mm_239 0.0076 ms 100.0% + triton_mm_240 0.0076 ms 100.0% + triton_mm_243 0.0083 ms 91.1% + triton_mm_237 0.0084 ms 89.4% + triton_mm_242 0.0086 ms 88.1% + triton_mm_238 0.0090 ms 84.0% + triton_mm_235 0.0093 ms 81.4% + triton_mm_236 0.0093 ms 81.4% + triton_mm_234 0.0108 ms 69.8% + triton_mm_245 0.0120 ms 63.2% +SingleProcess AUTOTUNE takes 4.8890 seconds +AUTOTUNE mm(64x80, 80x32) + triton_mm_250 0.0064 ms 100.0% + triton_mm_249 0.0069 ms 93.5% + triton_mm_253 0.0071 ms 90.5% + triton_mm_247 0.0074 ms 86.8% + triton_mm_248 0.0074 ms 86.6% + triton_mm_252 0.0074 ms 86.6% + triton_mm_246 0.0079 ms 81.4% + triton_mm_251 0.0081 ms 79.1% + triton_mm_254 0.0087 ms 74.2% + triton_mm_255 0.0088 ms 73.1% +SingleProcess AUTOTUNE takes 3.0935 seconds +AUTOTUNE convolution(1x32x8x8, 16x32x3x3) + convolution 0.0101 ms 100.0% + triton_convolution_258 0.0166 ms 60.7% + triton_convolution_256 0.0184 ms 54.7% + triton_convolution_257 0.0184 ms 54.7% +SingleProcess AUTOTUNE takes 1.2554 seconds +AUTOTUNE mm(64x96, 96x32) + triton_mm_263 0.0064 ms 100.0% + triton_mm_261 0.0067 ms 96.6% + triton_mm_262 0.0067 ms 96.2% + triton_mm_266 0.0073 ms 88.2% + triton_mm_260 0.0074 ms 86.6% + triton_mm_265 0.0074 ms 86.6% + triton_mm_259 0.0078 ms 82.7% + triton_mm_264 0.0083 ms 77.8% + mm 0.0089 ms 71.9% + triton_mm_268 0.0092 ms 69.6% +SingleProcess AUTOTUNE takes 2.9956 seconds +AUTOTUNE mm(64x112, 112x32) + triton_mm_275 0.0071 ms 100.0% + triton_mm_276 0.0071 ms 99.8% + triton_mm_278 0.0074 ms 95.7% + triton_mm_279 0.0076 ms 94.1% + triton_mm_273 0.0076 ms 92.9% + triton_mm_274 0.0077 ms 92.5% + triton_mm_272 0.0083 ms 86.0% + triton_mm_277 0.0087 ms 81.9% + mm 0.0090 ms 78.7% + triton_mm_281 0.0094 ms 75.3% +SingleProcess AUTOTUNE takes 3.0599 seconds +AUTOTUNE mm(64x128, 128x32) + triton_mm_289 0.0067 ms 100.0% + triton_mm_291 0.0069 ms 96.7% + triton_mm_288 0.0075 ms 88.5% + triton_mm_286 0.0076 ms 87.0% + triton_mm_287 0.0077 ms 86.7% + triton_mm_292 0.0079 ms 84.6% + mm 0.0084 ms 78.8% + triton_mm_285 0.0090 ms 73.8% + triton_mm_290 0.0090 ms 73.8% + triton_mm_294 0.0104 ms 64.2% +SingleProcess AUTOTUNE takes 3.3893 seconds +AUTOTUNE mm(64x144, 144x32) + triton_mm_301 0.0073 ms 100.0% + triton_mm_304 0.0073 ms 100.0% + triton_mm_302 0.0074 ms 98.9% + triton_mm_299 0.0076 ms 97.0% + triton_mm_305 0.0079 ms 93.1% + triton_mm_300 0.0079 ms 92.7% + triton_mm_298 0.0089 ms 82.1% + mm 0.0092 ms 79.2% + triton_mm_303 0.0099 ms 74.1% + triton_mm_307 0.0107 ms 68.4% +SingleProcess AUTOTUNE takes 3.4012 seconds +AUTOTUNE mm(64x160, 160x32) + triton_mm_314 0.0071 ms 100.0% + triton_mm_317 0.0073 ms 97.4% + triton_mm_315 0.0074 ms 96.3% + triton_mm_318 0.0079 ms 90.7% + triton_mm_313 0.0079 ms 90.3% + triton_mm_312 0.0081 ms 88.1% + mm 0.0089 ms 79.9% + triton_mm_311 0.0094 ms 76.1% + triton_mm_316 0.0094 ms 76.1% + triton_mm_320 0.0115 ms 61.9% +SingleProcess AUTOTUNE takes 3.3648 seconds +AUTOTUNE mm(64x176, 176x88) + triton_mm_332 0.0079 ms 100.0% + triton_mm_333 0.0081 ms 97.2% + triton_mm_329 0.0082 ms 96.5% + triton_mm_326 0.0083 ms 95.0% + triton_mm_330 0.0084 ms 94.3% + triton_mm_328 0.0086 ms 91.8% + triton_mm_327 0.0087 ms 90.8% + mm 0.0089 ms 88.2% + triton_mm_325 0.0095 ms 83.1% + triton_mm_324 0.0107 ms 73.4% +SingleProcess AUTOTUNE takes 5.0935 seconds +AUTOTUNE mm(16x88, 88x32) + triton_mm_338 0.0062 ms 100.0% + triton_mm_337 0.0064 ms 96.5% + triton_mm_339 0.0068 ms 90.9% + triton_mm_340 0.0070 ms 89.0% + triton_mm_336 0.0071 ms 87.4% + triton_mm_341 0.0077 ms 80.2% + mm 0.0086 ms 72.5% + triton_mm_342 0.0090 ms 68.8% + triton_mm_343 0.0095 ms 65.5% +SingleProcess AUTOTUNE takes 2.8154 seconds +AUTOTUNE convolution(1x32x4x4, 16x32x3x3) + convolution 0.0100 ms 100.0% + triton_convolution_344 0.0209 ms 48.1% + triton_convolution_345 0.0209 ms 48.1% + triton_convolution_346 0.0216 ms 46.5% +SingleProcess AUTOTUNE takes 1.3691 seconds +AUTOTUNE mm(16x104, 104x32) + triton_mm_350 0.0064 ms 100.0% + triton_mm_349 0.0067 ms 96.2% + triton_mm_351 0.0070 ms 91.7% + triton_mm_348 0.0074 ms 86.2% + triton_mm_352 0.0078 ms 82.3% + triton_mm_347 0.0084 ms 76.6% + mm 0.0084 ms 76.2% + triton_mm_353 0.0093 ms 68.5% + triton_mm_354 0.0096 ms 66.7% +SingleProcess AUTOTUNE takes 2.5535 seconds +AUTOTUNE mm(16x120, 120x32) + triton_mm_362 0.0064 ms 100.0% + triton_mm_360 0.0067 ms 96.2% + triton_mm_359 0.0068 ms 94.3% + triton_mm_361 0.0068 ms 94.3% + triton_mm_363 0.0078 ms 82.3% + triton_mm_358 0.0084 ms 76.0% + mm 0.0084 ms 75.8% + triton_mm_364 0.0104 ms 61.5% + triton_mm_365 0.0108 ms 59.0% +SingleProcess AUTOTUNE takes 2.7011 seconds +AUTOTUNE mm(16x136, 136x32) + triton_mm_372 0.0072 ms 100.0% + triton_mm_371 0.0074 ms 97.0% + triton_mm_373 0.0074 ms 97.0% + triton_mm_374 0.0076 ms 94.1% + triton_mm_370 0.0077 ms 93.8% + triton_mm_369 0.0090 ms 79.8% + mm 0.0099 ms 72.6% + triton_mm_375 0.0111 ms 65.0% + triton_mm_376 0.0115 ms 62.5% +SingleProcess AUTOTUNE takes 2.7369 seconds +AUTOTUNE mm(16x152, 152x32) + triton_mm_383 0.0067 ms 100.0% + triton_mm_382 0.0068 ms 97.2% + triton_mm_381 0.0073 ms 91.2% + triton_mm_384 0.0074 ms 89.7% + triton_mm_385 0.0076 ms 87.0% + triton_mm_380 0.0086 ms 77.0% + mm 0.0094 ms 70.7% + triton_mm_386 0.0112 ms 59.4% + triton_mm_387 0.0118 ms 56.2% +SingleProcess AUTOTUNE takes 2.6624 seconds +AUTOTUNE mm(16x168, 168x32) + triton_mm_395 0.0068 ms 100.0% + triton_mm_393 0.0069 ms 99.5% + triton_mm_396 0.0071 ms 96.4% + triton_mm_392 0.0073 ms 93.4% + triton_mm_394 0.0074 ms 92.2% + mm 0.0094 ms 72.8% + triton_mm_391 0.0097 ms 70.6% + triton_mm_397 0.0118 ms 57.8% + triton_mm_398 0.0125 ms 54.7% +SingleProcess AUTOTUNE takes 2.7642 seconds +AUTOTUNE int_mm(1x184, 184x10, 1x10) + triton_mm_405 0.0068 ms 100.0% + triton_mm_404 0.0069 ms 99.5% + triton_mm_407 0.0072 ms 95.1% + triton_mm_406 0.0073 ms 93.9% + triton_mm_403 0.0079 ms 87.0% + triton_mm_402 0.0082 ms 83.3% +SingleProcess AUTOTUNE takes 2.0640 seconds +pass-sqnr-39.189 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:00, ?it/s] +phlippe_resnet +cuda eval phlippe_resnet int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for phlippe_resnet. Setting accuracy check to cosine +AUTOTUNE convolution(1x3x32x32, 16x3x3x3) + triton_convolution_0 0.0086 ms 100.0% + triton_convolution_3 0.0091 ms 94.4% + convolution 0.0099 ms 86.8% + triton_convolution_4 0.0104 ms 82.8% + triton_convolution_1 0.0142 ms 60.7% + triton_convolution_2 0.0203 ms 42.4% +SingleProcess AUTOTUNE takes 2.0849 seconds +AUTOTUNE convolution(1x16x32x32, 16x16x3x3) + convolution 0.0109 ms 100.0% + triton_convolution_5 0.0114 ms 95.9% + triton_convolution_8 0.0122 ms 89.6% + triton_convolution_9 0.0151 ms 72.2% + triton_convolution_6 0.0163 ms 67.1% + triton_convolution_7 0.0286 ms 38.2% +SingleProcess AUTOTUNE takes 1.9362 seconds +AUTOTUNE convolution(1x16x32x32, 32x16x3x3) + convolution 0.0120 ms 100.0% + triton_convolution_39 0.0127 ms 94.7% + triton_convolution_38 0.0152 ms 78.9% + triton_convolution_35 0.0155 ms 77.6% + triton_convolution_40 0.0218 ms 55.1% + triton_convolution_37 0.0250 ms 48.0% + triton_convolution_36 0.0333 ms 36.1% +SingleProcess AUTOTUNE takes 2.4642 seconds +AUTOTUNE convolution(1x32x16x16, 32x32x3x3) + convolution 0.0105 ms 100.0% + triton_convolution_46 0.0144 ms 72.5% + triton_convolution_44 0.0180 ms 58.0% + triton_convolution_41 0.0185 ms 56.6% + triton_convolution_45 0.0186 ms 56.2% + triton_convolution_47 0.0246 ms 42.5% + triton_convolution_43 0.0289 ms 36.3% + triton_convolution_42 0.0385 ms 27.2% +SingleProcess AUTOTUNE takes 2.8977 seconds +AUTOTUNE convolution(1x16x32x32, 32x16x1x1) + triton_convolution_48 0.0068 ms 100.0% + triton_convolution_52 0.0069 ms 98.6% + triton_convolution_51 0.0076 ms 89.1% + triton_convolution_53 0.0084 ms 81.6% + triton_convolution_50 0.0096 ms 71.0% + convolution 0.0097 ms 70.3% + triton_convolution_49 0.0104 ms 65.4% +SingleProcess AUTOTUNE takes 2.4159 seconds +AUTOTUNE convolution(1x32x16x16, 64x32x3x3) + convolution 0.0112 ms 100.0% + triton_convolution_84 0.0264 ms 42.4% + triton_convolution_87 0.0332 ms 33.8% + triton_convolution_85 0.0393 ms 28.5% + triton_convolution_82 0.0432 ms 25.9% + triton_convolution_88 0.0468 ms 23.9% + triton_convolution_86 0.0496 ms 22.6% + triton_convolution_83 0.0534 ms 21.0% +SingleProcess AUTOTUNE takes 3.5156 seconds +AUTOTUNE convolution(1x32x16x16, 64x32x1x1) + triton_convolution_93 0.0077 ms 100.0% + triton_convolution_98 0.0079 ms 97.2% + triton_convolution_95 0.0084 ms 92.0% + triton_convolution_97 0.0084 ms 91.6% + triton_convolution_96 0.0086 ms 89.2% + convolution 0.0097 ms 79.5% + triton_convolution_99 0.0104 ms 73.6% + triton_convolution_94 0.0108 ms 70.8% +SingleProcess AUTOTUNE takes 3.3256 seconds +AUTOTUNE int_mm(1x64, 64x10, 1x10) + triton_mm_117 0.0066 ms 100.0% + triton_mm_118 0.0066 ms 100.0% + triton_mm_116 0.0067 ms 98.1% + triton_mm_120 0.0071 ms 92.6% + triton_mm_119 0.0071 ms 92.4% +SingleProcess AUTOTUNE takes 1.4472 seconds +pass-sqnr-46.131 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:00, ?it/s] +pyhpc_equation_of_state +cuda eval pyhpc_equation_of_state int8dynamic-bs1-acc +pass-sqnr-40.034 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:00, ?it/s] +pyhpc_isoneutral_mixing +cuda eval pyhpc_isoneutral_mixing int8dynamic-bs1-acc +skipping cudagraphs due to ['mutated inputs'] +pass-sqnr-error + loading model: 0it [00:00, ?it/s]WARNING:common:Model pyhpc_turbulent_kinetic_energy does not support bfloat16, running with amp instead + loading model: 0it [00:01, ?it/s] +WARNING:common:Model pyhpc_turbulent_kinetic_energy does not support bfloat16, running with amp instead +pyhpc_turbulent_kinetic_energy +cuda eval pyhpc_turbulent_kinetic_energy int8dynamic-bs1-acc +WARNING:common:Model pyhpc_turbulent_kinetic_energy does not support bfloat16, running with amp instead +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +pytorch_CycleGAN_and_pix2pix +cuda eval pytorch_CycleGAN_and_pix2pix int8dynamic-bs1-acc +AUTOTUNE convolution(1x3x262x262, 64x3x7x7) + convolution 0.1012 ms 100.0% + triton_convolution_3 0.1373 ms 73.7% + triton_convolution_4 0.1458 ms 69.4% + triton_convolution_0 0.1723 ms 58.8% + triton_convolution_5 0.1900 ms 53.3% + triton_convolution_2 0.2150 ms 47.1% + triton_convolution_1 0.3280 ms 30.9% +SingleProcess AUTOTUNE takes 3.1580 seconds +AUTOTUNE convolution(1x64x256x256, 128x64x3x3) + convolution 0.0299 ms 100.0% + triton_convolution_12 0.1467 ms 20.4% + triton_convolution_11 0.1733 ms 17.2% + triton_convolution_6 0.1769 ms 16.9% + triton_convolution_7 0.1857 ms 16.1% + triton_convolution_9 0.2071 ms 14.4% + triton_convolution_10 0.2117 ms 14.1% + triton_convolution_8 0.4953 ms 6.0% +SingleProcess AUTOTUNE takes 4.1808 seconds +AUTOTUNE convolution(1x128x128x128, 256x128x3x3) + convolution 0.0283 ms 100.0% + triton_convolution_18 0.1538 ms 18.4% + triton_convolution_19 0.1761 ms 16.0% + triton_convolution_16 0.2016 ms 14.0% + triton_convolution_17 0.2695 ms 10.5% + triton_convolution_13 0.2889 ms 9.8% + triton_convolution_14 0.3233 ms 8.7% + triton_convolution_15 0.4956 ms 5.7% +SingleProcess AUTOTUNE takes 4.4345 seconds +AUTOTUNE convolution(1x256x66x66, 256x256x3x3) + convolution 0.0418 ms 100.0% + triton_convolution_25 0.2447 ms 17.1% + triton_convolution_23 0.2538 ms 16.5% + triton_convolution_26 0.2808 ms 14.9% + triton_convolution_24 0.3725 ms 11.2% + triton_convolution_20 0.5383 ms 7.8% + triton_convolution_21 0.5404 ms 7.7% + triton_convolution_22 0.9729 ms 4.3% +SingleProcess AUTOTUNE takes 4.3840 seconds +AUTOTUNE convolution(1x64x262x262, 3x64x7x7) + convolution 0.1183 ms 100.0% + triton_convolution_150 0.4190 ms 28.2% + triton_convolution_146 0.4308 ms 27.5% + triton_convolution_151 0.4893 ms 24.2% + triton_convolution_149 0.4936 ms 24.0% + triton_convolution_147 0.5683 ms 20.8% + triton_convolution_148 8.5109 ms 1.4% +SingleProcess AUTOTUNE takes 3.2442 seconds +pass-sqnr-33.538 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +pytorch_stargan +cuda eval pytorch_stargan int8dynamic-bs1-acc +AUTOTUNE convolution(16x8x128x128, 64x8x7x7) + convolution 0.2432 ms 100.0% + triton_convolution_3 0.4733 ms 51.4% + triton_convolution_4 0.5168 ms 47.1% + triton_convolution_5 0.5551 ms 43.8% + triton_convolution_0 0.5860 ms 41.5% + triton_convolution_2 0.8516 ms 28.6% + triton_convolution_1 0.8625 ms 28.2% +SingleProcess AUTOTUNE takes 3.4649 seconds +AUTOTUNE convolution(16x64x128x128, 128x64x4x4) + convolution 0.0942 ms 100.0% + triton_convolution_6 0.6442 ms 14.6% + triton_convolution_12 0.6479 ms 14.5% + triton_convolution_9 0.6612 ms 14.2% + triton_convolution_11 0.7654 ms 12.3% + triton_convolution_7 1.0560 ms 8.9% + triton_convolution_10 1.1906 ms 7.9% + triton_convolution_8 2.2395 ms 4.2% +SingleProcess AUTOTUNE takes 4.0679 seconds +AUTOTUNE convolution(16x128x64x64, 256x128x4x4) + convolution 0.0929 ms 100.0% + triton_convolution_18 0.6336 ms 14.7% + triton_convolution_19 0.7338 ms 12.7% + triton_convolution_16 0.8280 ms 11.2% + triton_convolution_13 0.9571 ms 9.7% + triton_convolution_17 1.0931 ms 8.5% + triton_convolution_14 1.2739 ms 7.3% + triton_convolution_15 2.6092 ms 3.6% +SingleProcess AUTOTUNE takes 5.2136 seconds +AUTOTUNE convolution(16x256x32x32, 256x256x3x3) + convolution 0.0950 ms 100.0% + triton_convolution_23 0.6000 ms 15.8% + triton_convolution_25 0.6112 ms 15.5% + triton_convolution_26 0.7212 ms 13.2% + triton_convolution_24 0.9867 ms 9.6% + triton_convolution_20 1.0125 ms 9.4% + triton_convolution_21 1.1915 ms 8.0% + triton_convolution_22 2.9134 ms 3.3% +SingleProcess AUTOTUNE takes 5.2395 seconds +AUTOTUNE convolution(16x64x128x128, 3x64x7x7) + convolution 0.3772 ms 100.0% + triton_convolution_108 1.3290 ms 28.4% + triton_convolution_109 1.4593 ms 25.8% + triton_convolution_105 1.4972 ms 25.2% + triton_convolution_104 1.5608 ms 24.2% + triton_convolution_107 1.7312 ms 21.8% + triton_convolution_106 24.9830 ms 1.5% +SingleProcess AUTOTUNE takes 3.5547 seconds +pass-sqnr-41.851 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:01, ?it/s] +pytorch_unet +cuda eval pytorch_unet int8dynamic-bs1-acc +AUTOTUNE convolution(1x3x640x959, 64x3x3x3) + triton_convolution_4 0.2393 ms 100.0% + triton_convolution_3 0.2478 ms 96.6% + convolution 0.2989 ms 80.1% + triton_convolution_5 0.3256 ms 73.5% + triton_convolution_0 0.3362 ms 71.2% + triton_convolution_2 0.4469 ms 53.6% + triton_convolution_1 0.5437 ms 44.0% +SingleProcess AUTOTUNE takes 3.0560 seconds +AUTOTUNE convolution(1x64x640x959, 64x64x3x3) + convolution 0.2793 ms 100.0% + triton_convolution_6 1.3391 ms 20.9% + triton_convolution_12 1.3459 ms 20.8% + triton_convolution_11 1.5441 ms 18.1% + triton_convolution_7 2.0020 ms 14.0% + triton_convolution_9 2.2442 ms 12.4% + triton_convolution_10 2.3698 ms 11.8% + triton_convolution_8 5.5314 ms 5.0% +SingleProcess AUTOTUNE takes 4.0341 seconds +AUTOTUNE convolution(1x64x320x479, 128x64x3x3) + convolution 0.1235 ms 100.0% + triton_convolution_13 0.6796 ms 18.2% + triton_convolution_16 0.6963 ms 17.7% + triton_convolution_19 0.6970 ms 17.7% + triton_convolution_18 0.7692 ms 16.1% + triton_convolution_14 1.0236 ms 12.1% + triton_convolution_17 1.1986 ms 10.3% + triton_convolution_15 2.8668 ms 4.3% +SingleProcess AUTOTUNE takes 4.2977 seconds +AUTOTUNE convolution(1x128x320x479, 128x128x3x3) + convolution 0.2164 ms 100.0% + triton_convolution_23 1.3702 ms 15.8% + triton_convolution_26 1.3869 ms 15.6% + triton_convolution_20 1.4336 ms 15.1% + triton_convolution_25 1.5249 ms 14.2% + triton_convolution_21 2.1281 ms 10.2% + triton_convolution_24 2.3389 ms 9.3% + triton_convolution_22 5.7275 ms 3.8% +SingleProcess AUTOTUNE takes 4.3857 seconds +AUTOTUNE convolution(1x128x160x239, 256x128x3x3) + convolution 0.1096 ms 100.0% + triton_convolution_32 0.5930 ms 18.5% + triton_convolution_30 0.6725 ms 16.3% + triton_convolution_27 0.8132 ms 13.5% + triton_convolution_33 0.9582 ms 11.4% + triton_convolution_31 0.9708 ms 11.3% + triton_convolution_28 1.2226 ms 9.0% + triton_convolution_29 2.8964 ms 3.8% +SingleProcess AUTOTUNE takes 4.9857 seconds +AUTOTUNE convolution(1x256x160x239, 256x256x3x3) + convolution 0.2014 ms 100.0% + triton_convolution_39 1.3112 ms 15.4% + triton_convolution_37 1.4998 ms 13.4% + triton_convolution_34 1.6421 ms 12.3% + triton_convolution_40 2.2002 ms 9.2% + triton_convolution_38 2.7117 ms 7.4% + triton_convolution_35 3.0480 ms 6.6% + triton_convolution_36 5.7767 ms 3.5% +SingleProcess AUTOTUNE takes 4.8388 seconds +AUTOTUNE convolution(1x256x80x119, 512x256x3x3) + convolution 0.0991 ms 100.0% + triton_convolution_46 0.6072 ms 16.3% + triton_convolution_44 0.6590 ms 15.0% + triton_convolution_47 0.7395 ms 13.4% + triton_convolution_41 1.0579 ms 9.4% + triton_convolution_45 1.2031 ms 8.2% + triton_convolution_42 1.2186 ms 8.1% + triton_convolution_43 2.8597 ms 3.5% +SingleProcess AUTOTUNE takes 4.6320 seconds +AUTOTUNE convolution(1x512x80x119, 512x512x3x3) + convolution 0.1901 ms 100.0% + triton_convolution_53 1.3096 ms 14.5% + triton_convolution_54 1.4978 ms 12.7% + triton_convolution_51 1.7931 ms 10.6% + triton_convolution_48 2.1105 ms 9.0% + triton_convolution_49 2.4575 ms 7.7% + triton_convolution_52 2.9947 ms 6.3% + triton_convolution_50 5.7681 ms 3.3% +SingleProcess AUTOTUNE takes 4.6545 seconds +AUTOTUNE convolution(1x512x40x59, 512x512x3x3) + convolution 0.0683 ms 100.0% + triton_convolution_60 0.4803 ms 14.2% + triton_convolution_58 0.6710 ms 10.2% + triton_convolution_61 0.7616 ms 9.0% + triton_convolution_59 1.0415 ms 6.6% + triton_convolution_55 1.1431 ms 6.0% + triton_convolution_56 1.2347 ms 5.5% + triton_convolution_57 1.9248 ms 3.5% +SingleProcess AUTOTUNE takes 4.6955 seconds +AUTOTUNE convolution(1x1024x80x119, 512x1024x3x3) + convolution 0.3761 ms 100.0% + triton_convolution_74 2.7438 ms 13.7% + triton_convolution_75 3.7220 ms 10.1% + triton_convolution_72 4.0789 ms 9.2% + triton_convolution_69 4.4687 ms 8.4% + triton_convolution_73 6.5496 ms 5.7% + triton_convolution_70 6.6495 ms 5.7% + triton_convolution_71 11.5287 ms 3.3% +SingleProcess AUTOTUNE takes 4.7998 seconds +AUTOTUNE convolution(1x512x80x119, 256x512x3x3) + convolution 0.1220 ms 100.0% + triton_convolution_81 0.8992 ms 13.6% + triton_convolution_82 0.9972 ms 12.2% + triton_convolution_76 1.1336 ms 10.8% + triton_convolution_79 1.2013 ms 10.2% + triton_convolution_77 1.3028 ms 9.4% + triton_convolution_80 1.5917 ms 7.7% + triton_convolution_78 3.8487 ms 3.2% +SingleProcess AUTOTUNE takes 4.6261 seconds +AUTOTUNE convolution(1x512x160x239, 256x512x3x3) + convolution 0.3853 ms 100.0% + triton_convolution_88 2.7532 ms 14.0% + triton_convolution_83 3.9055 ms 9.9% + triton_convolution_86 3.9936 ms 9.6% + triton_convolution_89 6.5570 ms 5.9% + triton_convolution_87 7.1805 ms 5.4% + triton_convolution_84 7.4789 ms 5.2% + triton_convolution_85 11.7883 ms 3.3% +SingleProcess AUTOTUNE takes 4.9068 seconds +AUTOTUNE convolution(1x256x160x239, 128x256x3x3) + convolution 0.1042 ms 100.0% + triton_convolution_93 0.7740 ms 13.5% + triton_convolution_95 0.8464 ms 12.3% + triton_convolution_90 0.9361 ms 11.1% + triton_convolution_96 1.1379 ms 9.2% + triton_convolution_94 1.3997 ms 7.4% + triton_convolution_91 1.7843 ms 5.8% + triton_convolution_92 2.8963 ms 3.6% +SingleProcess AUTOTUNE takes 4.1963 seconds +AUTOTUNE convolution(1x256x320x479, 128x256x3x3) + convolution 0.4064 ms 100.0% + triton_convolution_100 2.8816 ms 14.1% + triton_convolution_102 3.0574 ms 13.3% + triton_convolution_103 3.2248 ms 12.6% + triton_convolution_97 3.4970 ms 11.6% + triton_convolution_101 6.0468 ms 6.7% + triton_convolution_98 6.0965 ms 6.7% + triton_convolution_99 11.6215 ms 3.5% +SingleProcess AUTOTUNE takes 4.4075 seconds +AUTOTUNE convolution(1x128x320x479, 64x128x3x3) + convolution 0.1241 ms 100.0% + triton_convolution_110 0.7150 ms 17.4% + triton_convolution_104 0.7783 ms 15.9% + triton_convolution_109 0.8001 ms 15.5% + triton_convolution_105 1.0695 ms 11.6% + triton_convolution_108 1.1950 ms 10.4% + triton_convolution_107 1.2026 ms 10.3% + triton_convolution_106 2.8889 ms 4.3% +SingleProcess AUTOTUNE takes 4.2336 seconds +AUTOTUNE convolution(1x128x640x959, 64x128x3x3) + convolution 0.4624 ms 100.0% + triton_convolution_117 2.7076 ms 17.1% + triton_convolution_116 3.0765 ms 15.0% + triton_convolution_111 3.3640 ms 13.7% + triton_convolution_112 4.2114 ms 11.0% + triton_convolution_115 4.6104 ms 10.0% + triton_convolution_114 4.6313 ms 10.0% + triton_convolution_113 11.0388 ms 4.2% +SingleProcess AUTOTUNE takes 4.2507 seconds +AUTOTUNE addmm(613760x2, 613760x64, 64x2) + triton_mm_127 0.0758 ms 100.0% + triton_mm_133 0.0761 ms 99.6% + triton_mm_136 0.0781 ms 97.0% + triton_mm_129 0.0784 ms 96.6% + triton_mm_132 0.0794 ms 95.4% + triton_mm_126 0.0802 ms 94.5% + triton_mm_125 0.0808 ms 93.8% + triton_mm_128 0.0812 ms 93.3% + triton_mm_130 0.0824 ms 92.0% + triton_mm_134 0.0846 ms 89.6% +SingleProcess AUTOTUNE takes 4.0681 seconds +pass-sqnr-49.327 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +resnet152 +cuda eval resnet152 int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for resnet152. Setting accuracy check to cosine +AUTOTUNE mm(3136x64, 64x64) + triton_mm_7 0.0074 ms 100.0% + triton_mm_12 0.0076 ms 97.0% + triton_mm_14 0.0076 ms 97.0% + triton_mm_15 0.0076 ms 97.0% + triton_mm_8 0.0078 ms 93.9% + triton_mm_9 0.0080 ms 92.4% + triton_mm_10 0.0081 ms 91.3% + triton_mm_6 0.0081 ms 90.9% + triton_mm_11 0.0081 ms 90.9% + mm 0.0091 ms 80.7% +SingleProcess AUTOTUNE takes 4.2283 seconds +AUTOTUNE convolution(1x64x56x56, 64x64x3x3) + convolution 0.0125 ms 100.0% + triton_convolution_23 0.0376 ms 33.3% + triton_convolution_21 0.0542 ms 23.1% + triton_convolution_22 0.0564 ms 22.2% + triton_convolution_18 0.0596 ms 21.0% + triton_convolution_24 0.1002 ms 12.5% + triton_convolution_19 0.1456 ms 8.6% + triton_convolution_20 0.2483 ms 5.0% +SingleProcess AUTOTUNE takes 4.4481 seconds +AUTOTUNE mm(3136x64, 64x256) + triton_mm_27 0.0079 ms 100.0% + triton_mm_29 0.0082 ms 97.3% + triton_mm_26 0.0083 ms 96.1% + triton_mm_28 0.0083 ms 95.4% + triton_mm_33 0.0094 ms 84.6% + triton_mm_25 0.0094 ms 84.4% + triton_mm_34 0.0097 ms 81.8% + triton_mm_30 0.0099 ms 80.0% + triton_mm_35 0.0100 ms 79.7% + triton_mm_36 0.0100 ms 79.5% +SingleProcess AUTOTUNE takes 4.3005 seconds +AUTOTUNE mm(3136x256, 256x64) + triton_mm_57 0.0095 ms 100.0% + triton_mm_55 0.0097 ms 97.4% + triton_mm_54 0.0098 ms 96.7% + triton_mm_58 0.0100 ms 94.9% + triton_mm_52 0.0102 ms 93.1% + mm 0.0107 ms 88.6% + triton_mm_50 0.0108 ms 87.8% + triton_mm_53 0.0108 ms 87.6% + triton_mm_51 0.0114 ms 83.1% + triton_mm_49 0.0147 ms 64.6% +SingleProcess AUTOTUNE takes 4.5493 seconds +AUTOTUNE convolution(1x128x56x56, 128x128x3x3) + convolution 0.0137 ms 100.0% + triton_convolution_128 0.1212 ms 11.3% + triton_convolution_127 0.1703 ms 8.0% + triton_convolution_123 0.1761 ms 7.8% + triton_convolution_129 0.2021 ms 6.8% + triton_convolution_126 0.2068 ms 6.6% + triton_convolution_124 0.3941 ms 3.5% + triton_convolution_125 0.4316 ms 3.2% +SingleProcess AUTOTUNE takes 3.9890 seconds +AUTOTUNE mm(784x128, 128x512) + triton_mm_138 0.0088 ms 100.0% + triton_mm_131 0.0090 ms 98.6% + triton_mm_133 0.0092 ms 95.8% + triton_mm_134 0.0092 ms 95.8% + triton_mm_135 0.0093 ms 94.8% + triton_mm_132 0.0094 ms 93.9% + triton_mm_139 0.0097 ms 91.1% + mm 0.0100 ms 88.5% + triton_mm_136 0.0101 ms 87.3% + triton_mm_130 0.0106 ms 83.6% +SingleProcess AUTOTUNE takes 4.6596 seconds +AUTOTUNE convolution(1x256x56x56, 512x256x1x1) + convolution 0.0107 ms 100.0% + triton_convolution_146 0.0169 ms 63.1% + triton_convolution_145 0.0252 ms 42.3% + triton_convolution_147 0.0255 ms 42.0% + triton_convolution_148 0.0260 ms 41.1% + triton_convolution_143 0.0354 ms 30.2% + triton_convolution_142 0.0371 ms 28.8% + triton_convolution_144 0.0958 ms 11.2% +SingleProcess AUTOTUNE takes 4.4682 seconds +AUTOTUNE mm(784x512, 512x128) + triton_mm_154 0.0104 ms 100.0% + mm 0.0107 ms 97.3% + triton_mm_155 0.0111 ms 93.9% + triton_mm_158 0.0113 ms 91.9% + triton_mm_157 0.0117 ms 89.0% + triton_mm_153 0.0126 ms 82.3% + triton_mm_152 0.0132 ms 78.9% + triton_mm_151 0.0152 ms 68.6% + triton_mm_150 0.0152 ms 68.4% + triton_mm_149 0.0215 ms 48.3% +SingleProcess AUTOTUNE takes 4.7809 seconds +AUTOTUNE convolution(1x128x28x28, 128x128x3x3) + convolution 0.0135 ms 100.0% + triton_convolution_166 0.0732 ms 18.4% + triton_convolution_165 0.0899 ms 15.0% + triton_convolution_161 0.1098 ms 12.3% + triton_convolution_164 0.1108 ms 12.2% + triton_convolution_167 0.1674 ms 8.0% + triton_convolution_162 0.3528 ms 3.8% + triton_convolution_163 0.3844 ms 3.5% +SingleProcess AUTOTUNE takes 4.1259 seconds +AUTOTUNE convolution(1x256x28x28, 256x256x3x3) + convolution 0.0189 ms 100.0% + triton_convolution_383 0.2890 ms 6.5% + triton_convolution_382 0.3478 ms 5.4% + triton_convolution_380 0.3727 ms 5.1% + triton_convolution_384 0.3775 ms 5.0% + triton_convolution_381 0.4120 ms 4.6% + triton_convolution_378 0.5629 ms 3.4% + triton_convolution_379 0.5925 ms 3.2% +SingleProcess AUTOTUNE takes 4.4577 seconds +AUTOTUNE mm(196x256, 256x1024) + triton_mm_393 0.0095 ms 100.0% + triton_mm_394 0.0106 ms 90.0% + triton_mm_389 0.0107 ms 89.2% + triton_mm_388 0.0109 ms 87.6% + triton_mm_386 0.0109 ms 87.1% + triton_mm_387 0.0110 ms 86.6% + triton_mm_391 0.0114 ms 83.5% + triton_mm_390 0.0123 ms 77.5% + mm 0.0128 ms 74.7% + triton_mm_385 0.0142 ms 67.1% +SingleProcess AUTOTUNE takes 4.5840 seconds +AUTOTUNE convolution(1x512x28x28, 1024x512x1x1) + convolution 0.0107 ms 100.0% + triton_convolution_401 0.0258 ms 41.5% + triton_convolution_402 0.0427 ms 25.1% + triton_convolution_400 0.0432 ms 24.7% + triton_convolution_403 0.0434 ms 24.6% + triton_convolution_398 0.0624 ms 17.1% + triton_convolution_397 0.0631 ms 16.9% + triton_convolution_399 0.0896 ms 11.9% +SingleProcess AUTOTUNE takes 4.5062 seconds +AUTOTUNE mm(196x1024, 1024x256) + triton_mm_410 0.0146 ms 100.0% + triton_mm_409 0.0152 ms 96.0% + mm 0.0156 ms 93.6% + triton_mm_413 0.0156 ms 93.4% + triton_mm_412 0.0164 ms 88.9% + triton_mm_407 0.0193 ms 75.5% + triton_mm_408 0.0203 ms 71.7% + triton_mm_405 0.0222 ms 65.6% + triton_mm_406 0.0227 ms 64.1% + triton_mm_404 0.0340 ms 42.8% +SingleProcess AUTOTUNE takes 5.4897 seconds +AUTOTUNE convolution(1x256x14x14, 256x256x3x3) + convolution 0.0182 ms 100.0% + triton_convolution_420 0.1640 ms 11.1% + triton_convolution_421 0.2228 ms 8.2% + triton_convolution_419 0.2275 ms 8.0% + triton_convolution_418 0.2720 ms 6.7% + triton_convolution_422 0.3605 ms 5.1% + triton_convolution_416 0.4917 ms 3.7% + triton_convolution_417 0.5792 ms 3.1% +SingleProcess AUTOTUNE takes 4.6042 seconds +AUTOTUNE convolution(1x512x14x14, 512x512x3x3) + convolution 0.0224 ms 100.0% + triton_convolution_1503 0.2861 ms 7.8% + triton_convolution_1507 0.4889 ms 4.6% + triton_convolution_1504 0.5690 ms 3.9% + triton_convolution_1506 0.5868 ms 3.8% + triton_convolution_1505 0.6540 ms 3.4% + triton_convolution_1502 0.7608 ms 2.9% + triton_convolution_1501 1.2512 ms 1.8% +SingleProcess AUTOTUNE takes 4.6250 seconds +AUTOTUNE mm(49x512, 512x2048) + triton_mm_1517 0.0167 ms 100.0% + triton_mm_1516 0.0202 ms 83.0% + mm 0.0204 ms 82.2% + triton_mm_1514 0.0205 ms 81.5% + triton_mm_1513 0.0210 ms 79.7% + triton_mm_1512 0.0225 ms 74.4% + triton_mm_1509 0.0245 ms 68.3% + triton_mm_1511 0.0248 ms 67.5% + triton_mm_1519 0.0270 ms 62.0% + triton_mm_1508 0.0285 ms 58.6% +SingleProcess AUTOTUNE takes 4.5823 seconds +AUTOTUNE convolution(1x1024x14x14, 2048x1024x1x1) + convolution 0.0138 ms 100.0% + triton_convolution_1524 0.0470 ms 29.3% + triton_convolution_1526 0.0778 ms 17.7% + triton_convolution_1525 0.0778 ms 17.7% + triton_convolution_1523 0.0788 ms 17.5% + triton_convolution_1522 0.0859 ms 16.1% + triton_convolution_1521 0.1081 ms 12.8% + triton_convolution_1520 0.1300 ms 10.6% +SingleProcess AUTOTUNE takes 4.4598 seconds +AUTOTUNE mm(49x2048, 2048x512) + mm 0.0187 ms 100.0% + triton_mm_1536 0.0468 ms 39.9% + triton_mm_1535 0.0596 ms 31.3% + triton_mm_1533 0.0607 ms 30.7% + triton_mm_1532 0.0639 ms 29.2% + triton_mm_1531 0.0689 ms 27.1% + triton_mm_1528 0.0734 ms 25.4% + triton_mm_1530 0.0759 ms 24.6% + triton_mm_1538 0.0823 ms 22.7% + triton_mm_1529 0.0962 ms 19.4% +SingleProcess AUTOTUNE takes 4.2007 seconds +AUTOTUNE convolution(1x512x7x7, 512x512x3x3) + convolution 0.0221 ms 100.0% + triton_convolution_1541 0.2511 ms 8.8% + triton_convolution_1545 0.3120 ms 7.1% + triton_convolution_1542 0.3563 ms 6.2% + triton_convolution_1543 0.3972 ms 5.6% + triton_convolution_1540 0.5099 ms 4.3% + triton_convolution_1544 0.5195 ms 4.3% + triton_convolution_1539 1.2984 ms 1.7% +SingleProcess AUTOTUNE takes 3.3503 seconds +AUTOTUNE int_mm(1x2048, 2048x1000, 1x1000) + triton_mm_1599 0.0156 ms 100.0% + triton_mm_1598 0.0180 ms 86.9% + triton_mm_1594 0.0199 ms 78.5% + triton_mm_1597 0.0208 ms 75.3% + triton_mm_1595 0.0210 ms 74.7% + triton_mm_1593 0.0227 ms 68.9% + triton_mm_1591 0.0333 ms 47.0% + triton_mm_1590 0.0358 ms 43.7% + triton_mm_1589 0.0532 ms 29.4% + triton_mm_1592 0.0570 ms 27.4% +SingleProcess AUTOTUNE takes 4.0742 seconds +pass-sqnr-36.094 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:01, ?it/s] +resnet18 +cuda eval resnet18 int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for resnet18. Setting accuracy check to cosine +AUTOTUNE convolution(1x64x56x56, 128x64x3x3) + convolution 0.0115 ms 100.0% + triton_convolution_39 0.0606 ms 19.0% + triton_convolution_38 0.0825 ms 13.9% + triton_convolution_34 0.0846 ms 13.6% + triton_convolution_37 0.0999 ms 11.5% + triton_convolution_40 0.1032 ms 11.1% + triton_convolution_35 0.1990 ms 5.8% + triton_convolution_36 0.2194 ms 5.2% +SingleProcess AUTOTUNE takes 3.9521 seconds +AUTOTUNE convolution(1x64x56x56, 128x64x1x1) + triton_convolution_52 0.0088 ms 100.0% + convolution 0.0095 ms 92.6% + triton_convolution_48 0.0099 ms 89.3% + triton_convolution_53 0.0104 ms 84.6% + triton_convolution_51 0.0119 ms 73.8% + triton_convolution_54 0.0132 ms 66.7% + triton_convolution_49 0.0146 ms 60.4% + triton_convolution_50 0.0302 ms 29.2% +SingleProcess AUTOTUNE takes 3.9464 seconds +AUTOTUNE convolution(1x128x28x28, 256x128x3x3) + convolution 0.0139 ms 100.0% + triton_convolution_74 0.1440 ms 9.6% + triton_convolution_73 0.1578 ms 8.8% + triton_convolution_71 0.1876 ms 7.4% + triton_convolution_75 0.1909 ms 7.3% + triton_convolution_72 0.1915 ms 7.3% + triton_convolution_69 0.2853 ms 4.9% + triton_convolution_70 0.2977 ms 4.7% +SingleProcess AUTOTUNE takes 4.4813 seconds +AUTOTUNE convolution(1x128x28x28, 256x128x1x1) + convolution 0.0092 ms 100.0% + triton_convolution_87 0.0115 ms 79.4% + triton_convolution_86 0.0164 ms 55.9% + triton_convolution_88 0.0164 ms 55.9% + triton_convolution_89 0.0172 ms 53.3% + triton_convolution_83 0.0214 ms 42.8% + triton_convolution_84 0.0214 ms 42.8% + triton_convolution_85 0.0274 ms 33.5% +SingleProcess AUTOTUNE takes 4.5067 seconds +AUTOTUNE convolution(1x256x14x14, 512x256x3x3) + convolution 0.0185 ms 100.0% + triton_convolution_106 0.1469 ms 12.6% + triton_convolution_110 0.2439 ms 7.6% + triton_convolution_107 0.2819 ms 6.6% + triton_convolution_109 0.2970 ms 6.2% + triton_convolution_108 0.2992 ms 6.2% + triton_convolution_105 0.3650 ms 5.1% + triton_convolution_104 0.6510 ms 2.8% +SingleProcess AUTOTUNE takes 4.4468 seconds +AUTOTUNE convolution(1x256x14x14, 512x256x1x1) + convolution 0.0107 ms 100.0% + triton_convolution_122 0.0164 ms 65.0% + triton_convolution_120 0.0206 ms 51.9% + triton_convolution_121 0.0248 ms 43.2% + triton_convolution_123 0.0253 ms 42.3% + triton_convolution_124 0.0253 ms 42.3% + triton_convolution_119 0.0329 ms 32.5% + triton_convolution_118 0.0378 ms 28.3% +SingleProcess AUTOTUNE takes 4.5790 seconds +pass-sqnr-33.109 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +resnet50 +cuda eval resnet50 int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for resnet50. Setting accuracy check to cosine +pass-sqnr-36.070 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:00, ?it/s] +WARNING:root:resnet50_quantized_qat failed to load +resnet50_quantized_qat +The eval test only supports CPU. +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 443, in load_model + benchmark = benchmark_cls( + File "/home/cdhernandez/local/benchmark/torchbenchmark/util/model.py", line 24, in __call__ + obj = type.__call__(cls, *args, **kwargs) + File "/home/cdhernandez/local/benchmark/torchbenchmark/models/resnet50_quantized_qat/__init__.py", line 21, in __init__ + raise NotImplementedError("The eval test only supports CPU.") +NotImplementedError: The eval test only supports CPU. + + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +resnext50_32x4d +cuda eval resnext50_32x4d int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for resnext50_32x4d. Setting accuracy check to cosine +AUTOTUNE mm(3136x128, 128x256) + triton_mm_20 0.0091 ms 100.0% + triton_mm_21 0.0091 ms 100.0% + triton_mm_19 0.0093 ms 98.3% + triton_mm_22 0.0096 ms 95.3% + triton_mm_26 0.0098 ms 93.1% + mm 0.0108 ms 84.8% + triton_mm_18 0.0112 ms 81.4% + triton_mm_25 0.0119 ms 76.8% + triton_mm_23 0.0124 ms 73.6% + triton_mm_24 0.0126 ms 72.3% +SingleProcess AUTOTUNE takes 4.3659 seconds +AUTOTUNE mm(3136x256, 256x256) + triton_mm_94 0.0106 ms 100.0% + triton_mm_93 0.0111 ms 95.4% + triton_mm_92 0.0113 ms 93.8% + triton_mm_98 0.0113 ms 93.5% + triton_mm_91 0.0115 ms 91.9% + mm 0.0120 ms 88.2% + triton_mm_95 0.0147 ms 71.7% + triton_mm_90 0.0152 ms 69.5% + triton_mm_96 0.0155 ms 68.3% + triton_mm_99 0.0159 ms 66.3% +SingleProcess AUTOTUNE takes 4.4751 seconds +AUTOTUNE mm(784x256, 256x512) + triton_mm_110 0.0094 ms 100.0% + triton_mm_106 0.0100 ms 93.9% + mm 0.0102 ms 91.8% + triton_mm_105 0.0104 ms 89.9% + triton_mm_107 0.0107 ms 87.5% + triton_mm_108 0.0107 ms 87.5% + triton_mm_104 0.0109 ms 86.2% + triton_mm_111 0.0110 ms 85.4% + triton_mm_103 0.0111 ms 84.2% + triton_mm_102 0.0143 ms 65.4% +SingleProcess AUTOTUNE takes 4.7596 seconds +AUTOTUNE mm(784x512, 512x512) + mm 0.0118 ms 100.0% + triton_mm_201 0.0122 ms 96.9% + triton_mm_196 0.0128 ms 92.5% + triton_mm_197 0.0129 ms 91.3% + triton_mm_198 0.0137 ms 86.2% + triton_mm_199 0.0138 ms 85.4% + triton_mm_202 0.0144 ms 82.0% + triton_mm_194 0.0149 ms 79.4% + triton_mm_195 0.0151 ms 78.3% + triton_mm_193 0.0215 ms 55.0% +SingleProcess AUTOTUNE takes 4.6867 seconds +AUTOTUNE mm(196x512, 512x1024) + triton_mm_213 0.0119 ms 100.0% + triton_mm_214 0.0131 ms 90.9% + triton_mm_208 0.0132 ms 90.0% + triton_mm_209 0.0141 ms 84.1% + triton_mm_211 0.0142 ms 83.7% + mm 0.0153 ms 77.5% + triton_mm_206 0.0155 ms 76.8% + triton_mm_207 0.0155 ms 76.5% + triton_mm_210 0.0161 ms 73.9% + triton_mm_205 0.0210 ms 56.6% +SingleProcess AUTOTUNE takes 4.9112 seconds +AUTOTUNE mm(196x1024, 1024x1024) + triton_mm_352 0.0167 ms 100.0% + triton_mm_353 0.0185 ms 90.3% + triton_mm_347 0.0200 ms 83.7% + triton_mm_348 0.0204 ms 81.8% + triton_mm_350 0.0207 ms 80.6% + mm 0.0214 ms 77.9% + triton_mm_345 0.0231 ms 72.3% + triton_mm_349 0.0235 ms 71.0% + triton_mm_346 0.0239 ms 69.9% + triton_mm_344 0.0353 ms 47.4% +SingleProcess AUTOTUNE takes 4.9042 seconds +AUTOTUNE mm(49x1024, 1024x2048) + mm 0.0250 ms 100.0% + triton_mm_365 0.0271 ms 92.2% + triton_mm_364 0.0339 ms 73.7% + triton_mm_362 0.0348 ms 71.7% + triton_mm_361 0.0360 ms 69.4% + triton_mm_360 0.0395 ms 63.3% + triton_mm_357 0.0412 ms 60.5% + triton_mm_359 0.0427 ms 58.5% + triton_mm_367 0.0473 ms 52.8% + triton_mm_358 0.0518 ms 48.2% +SingleProcess AUTOTUNE takes 4.0966 seconds +AUTOTUNE mm(49x2048, 2048x1024) + mm 0.0242 ms 100.0% + triton_mm_384 0.0474 ms 51.0% + triton_mm_383 0.0596 ms 40.6% + triton_mm_381 0.0614 ms 39.4% + triton_mm_380 0.0646 ms 37.4% + triton_mm_379 0.0704 ms 34.3% + triton_mm_376 0.0736 ms 32.9% + triton_mm_378 0.0761 ms 31.8% + triton_mm_386 0.0837 ms 28.9% + triton_mm_377 0.0956 ms 25.3% +SingleProcess AUTOTUNE takes 3.9778 seconds +pass-sqnr-35.405 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:11, ?it/s] +sam +cuda eval sam int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for sam. Setting accuracy check to cosine +AUTOTUNE convolution(1x3x1024x1024, 1280x3x16x16) + triton_convolution_6 0.5005 ms 100.0% + convolution 0.5163 ms 96.9% + triton_convolution_1 0.5200 ms 96.3% + triton_convolution_3 0.5724 ms 87.4% + triton_convolution_5 0.8185 ms 61.1% + triton_convolution_4 0.8870 ms 56.4% + triton_convolution_0 0.9256 ms 54.1% + triton_convolution_2 2.2645 ms 22.1% +SingleProcess AUTOTUNE takes 4.6738 seconds +AUTOTUNE int_mm(4900x1280, 1280x3840, 4900x3840) + triton_mm_16 0.1613 ms 100.0% + triton_mm_17 0.1643 ms 98.2% + triton_mm_8 0.2109 ms 76.5% + triton_mm_9 0.2167 ms 74.4% + triton_mm_15 0.2371 ms 68.0% + triton_mm_14 0.2496 ms 64.6% + triton_mm_10 0.2568 ms 62.8% + triton_mm_11 0.2570 ms 62.8% + triton_mm_7 0.2899 ms 55.6% + triton_mm_12 0.7001 ms 23.0% +SingleProcess AUTOTUNE takes 7.1098 seconds +AUTOTUNE bmm(400x196x80, 400x80x196) + triton_bmm_19 0.0622 ms 100.0% + triton_bmm_20 0.0633 ms 98.2% + triton_bmm_21 0.0668 ms 93.1% + triton_bmm_22 0.0682 ms 91.2% + triton_bmm_18 0.0700 ms 88.8% + triton_bmm_25 0.0704 ms 88.3% + triton_bmm_28 0.0787 ms 79.0% + triton_bmm_29 0.0917 ms 67.8% + bmm 0.0935 ms 66.5% + triton_bmm_26 0.1005 ms 61.9% +SingleProcess AUTOTUNE takes 4.5812 seconds +AUTOTUNE bmm(14x5600x80, 14x80x14) + triton_bmm_40 0.0206 ms 100.0% + triton_bmm_33 0.0218 ms 94.4% + triton_bmm_41 0.0221 ms 93.3% + triton_bmm_34 0.0223 ms 92.5% + triton_bmm_35 0.0223 ms 92.3% + triton_bmm_38 0.0223 ms 92.3% + triton_bmm_39 0.0224 ms 91.9% + triton_bmm_32 0.0226 ms 91.2% + triton_bmm_37 0.0226 ms 91.2% + triton_bmm_30 0.0228 ms 90.6% +SingleProcess AUTOTUNE takes 3.6038 seconds +AUTOTUNE bmm(14x5600x80, 14x80x14) + triton_bmm_42 0.0216 ms 100.0% + triton_bmm_52 0.0221 ms 98.0% + triton_bmm_49 0.0221 ms 97.8% + triton_bmm_45 0.0221 ms 97.7% + triton_bmm_47 0.0224 ms 96.7% + triton_bmm_46 0.0230 ms 94.0% + triton_bmm_50 0.0232 ms 93.1% + triton_bmm_43 0.0237 ms 91.4% + triton_bmm_53 0.0240 ms 90.1% + triton_bmm_48 0.0242 ms 89.4% +SingleProcess AUTOTUNE takes 3.6184 seconds +AUTOTUNE bmm(400x196x196, 400x196x80) + triton_bmm_55 0.0657 ms 100.0% + triton_bmm_57 0.0688 ms 95.4% + triton_bmm_56 0.0700 ms 93.9% + triton_bmm_58 0.0733 ms 89.6% + bmm 0.0775 ms 84.8% + triton_bmm_54 0.0848 ms 77.5% + triton_bmm_64 0.0878 ms 74.8% + triton_bmm_62 0.0917 ms 71.7% + triton_bmm_61 0.0952 ms 69.0% + triton_bmm_63 0.0998 ms 65.8% +SingleProcess AUTOTUNE takes 4.9943 seconds +AUTOTUNE int_mm(4900x1280, 1280x1280, 4900x1280) + triton_mm_75 0.0594 ms 100.0% + triton_mm_76 0.0601 ms 98.9% + triton_mm_67 0.0818 ms 72.7% + triton_mm_68 0.0826 ms 71.9% + triton_mm_74 0.0875 ms 67.9% + triton_mm_73 0.0937 ms 63.4% + triton_mm_70 0.0982 ms 60.5% + triton_mm_69 0.0992 ms 59.9% + triton_mm_66 0.1156 ms 51.4% + triton_mm_71 0.2408 ms 24.7% +SingleProcess AUTOTUNE takes 7.0909 seconds +AUTOTUNE int_mm(4096x1280, 1280x5120, 4096x5120) + triton_mm_86 0.1623 ms 100.0% + triton_mm_87 0.1636 ms 99.2% + triton_mm_78 0.2345 ms 69.2% + triton_mm_79 0.2367 ms 68.6% + triton_mm_84 0.2585 ms 62.8% + triton_mm_85 0.2617 ms 62.0% + triton_mm_81 0.2719 ms 59.7% + triton_mm_80 0.2731 ms 59.4% + triton_mm_77 0.3187 ms 50.9% + triton_mm_82 0.7734 ms 21.0% +SingleProcess AUTOTUNE takes 7.4914 seconds +AUTOTUNE int_mm(4096x5120, 5120x1280, 4096x1280) + triton_mm_97 0.1503 ms 100.0% + triton_mm_98 0.1508 ms 99.7% + triton_mm_96 0.2305 ms 65.2% + triton_mm_89 0.2440 ms 61.6% + triton_mm_90 0.2461 ms 61.1% + triton_mm_92 0.2550 ms 59.0% + triton_mm_91 0.2589 ms 58.1% + triton_mm_95 0.3098 ms 48.5% + triton_mm_88 0.3484 ms 43.1% + triton_mm_93 0.7162 ms 21.0% +SingleProcess AUTOTUNE takes 7.2234 seconds +AUTOTUNE int_mm(4096x1280, 1280x3840, 4096x3840) + triton_mm_660 0.1345 ms 100.0% + triton_mm_661 0.1361 ms 98.8% + triton_mm_652 0.1783 ms 75.4% + triton_mm_653 0.1813 ms 74.1% + triton_mm_659 0.1990 ms 67.6% + triton_mm_655 0.2070 ms 65.0% + triton_mm_654 0.2107 ms 63.8% + triton_mm_658 0.2108 ms 63.8% + triton_mm_651 0.2460 ms 54.6% + triton_mm_656 0.5862 ms 22.9% +SingleProcess AUTOTUNE takes 7.6128 seconds +AUTOTUNE bmm(16x4096x80, 16x80x4096) + triton_bmm_663 0.5210 ms 100.0% + triton_bmm_664 0.5221 ms 99.8% + triton_bmm_669 0.5245 ms 99.3% + triton_bmm_666 0.5859 ms 88.9% + triton_bmm_665 0.5941 ms 87.7% + triton_bmm_662 0.6019 ms 86.6% + bmm 0.6484 ms 80.4% + triton_bmm_672 0.7048 ms 73.9% + triton_bmm_670 0.8706 ms 59.8% + triton_bmm_673 1.2096 ms 43.1% +SingleProcess AUTOTUNE takes 4.5382 seconds +AUTOTUNE bmm(64x1024x80, 64x80x64) + triton_bmm_678 0.0231 ms 100.0% + triton_bmm_677 0.0235 ms 98.4% + triton_bmm_674 0.0235 ms 98.2% + triton_bmm_681 0.0238 ms 97.1% + triton_bmm_676 0.0240 ms 96.4% + triton_bmm_675 0.0240 ms 96.2% + triton_bmm_682 0.0243 ms 95.1% + triton_bmm_684 0.0249 ms 92.8% + bmm 0.0277 ms 83.4% + triton_bmm_680 0.0277 ms 83.4% +SingleProcess AUTOTUNE takes 3.9932 seconds +AUTOTUNE bmm(16x4096x4096, 16x4096x80) + bmm 0.3941 ms 100.0% + triton_bmm_699 0.4543 ms 86.8% + triton_bmm_700 0.4739 ms 83.2% + triton_bmm_701 0.4819 ms 81.8% + triton_bmm_702 0.4995 ms 78.9% + triton_bmm_706 0.5690 ms 69.3% + triton_bmm_698 0.6052 ms 65.1% + triton_bmm_707 0.7676 ms 51.3% + triton_bmm_703 0.7716 ms 51.1% + triton_bmm_705 0.7788 ms 50.6% +SingleProcess AUTOTUNE takes 4.9549 seconds +AUTOTUNE int_mm(4096x1280, 1280x1280, 4096x1280) + triton_mm_720 0.0581 ms 100.0% + triton_mm_719 0.0584 ms 99.6% + triton_mm_711 0.0721 ms 80.6% + triton_mm_712 0.0725 ms 80.2% + triton_mm_718 0.0728 ms 79.9% + triton_mm_714 0.0750 ms 77.4% + triton_mm_713 0.0788 ms 73.7% + triton_mm_717 0.0889 ms 65.4% + triton_mm_710 0.0944 ms 61.5% + triton_mm_715 0.2004 ms 29.0% +SingleProcess AUTOTUNE takes 7.2816 seconds +AUTOTUNE int_mm(5x256, 256x256, 5x256) + triton_mm_2934 0.0076 ms 100.0% + triton_mm_2937 0.0078 ms 96.7% + triton_mm_2933 0.0081 ms 93.7% + triton_mm_2935 0.0081 ms 93.7% + triton_mm_2939 0.0087 ms 86.8% + triton_mm_2931 0.0091 ms 83.5% + triton_mm_2938 0.0093 ms 81.2% + triton_mm_2930 0.0101 ms 75.2% + triton_mm_2929 0.0106 ms 71.4% + triton_mm_2932 0.0111 ms 68.3% +SingleProcess AUTOTUNE takes 3.8448 seconds +AUTOTUNE int_mm(5x256, 256x128, 5x128) + triton_mm_2978 0.0076 ms 100.0% + triton_mm_2979 0.0081 ms 93.7% + triton_mm_2982 0.0081 ms 93.7% + triton_mm_2981 0.0084 ms 90.8% + triton_mm_2977 0.0087 ms 87.1% + triton_mm_2975 0.0091 ms 83.5% + triton_mm_2973 0.0101 ms 75.0% + triton_mm_2974 0.0106 ms 71.4% + triton_mm_2976 0.0118 ms 64.1% + triton_mm_2980 0.0139 ms 54.5% +SingleProcess AUTOTUNE takes 3.0724 seconds +AUTOTUNE convolution(1x1280x64x64, 256x1280x1x1) + convolution 0.0255 ms 100.0% + triton_convolution_3009 0.0689 ms 37.1% + triton_convolution_3008 0.0900 ms 28.4% + triton_convolution_3011 0.0939 ms 27.2% + triton_convolution_3006 0.1191 ms 21.4% + triton_convolution_3010 0.1516 ms 16.8% + triton_convolution_3005 0.2024 ms 12.6% + triton_convolution_3007 0.2262 ms 11.3% + conv1x1_via_mm 0.3048 ms 8.4% +SingleProcess AUTOTUNE takes 5.0502 seconds +AUTOTUNE convolution(1x256x64x64, 256x256x3x3) + convolution 0.0623 ms 100.0% + triton_convolution_3018 0.1505 ms 41.4% + triton_convolution_3015 0.1726 ms 36.1% + triton_convolution_3016 0.1802 ms 34.6% + triton_convolution_3013 0.2284 ms 27.3% + triton_convolution_3017 0.2445 ms 25.5% + triton_convolution_3012 0.3610 ms 17.3% + triton_convolution_3014 0.3760 ms 16.6% +SingleProcess AUTOTUNE takes 4.7490 seconds +AUTOTUNE mm(4096x2, 2x128) + triton_mm_3028 0.0071 ms 100.0% + triton_mm_3025 0.0072 ms 97.8% + triton_mm_3022 0.0073 ms 96.9% + triton_mm_3021 0.0073 ms 96.5% + triton_mm_3027 0.0073 ms 96.5% + triton_mm_3023 0.0076 ms 93.2% + triton_mm_3024 0.0076 ms 92.9% + triton_mm_3029 0.0076 ms 92.9% + triton_mm_3020 0.0078 ms 90.9% + triton_mm_3019 0.0078 ms 90.2% +SingleProcess AUTOTUNE takes 3.4219 seconds +AUTOTUNE int_mm(4096x256, 256x128, 4096x128) + triton_mm_3038 0.0114 ms 100.0% + triton_mm_3034 0.0120 ms 95.3% + triton_mm_3032 0.0123 ms 93.1% + triton_mm_3031 0.0125 ms 91.2% + triton_mm_3033 0.0125 ms 91.2% + triton_mm_3039 0.0126 ms 90.7% + triton_mm_3030 0.0138 ms 82.9% + triton_mm_3035 0.0140 ms 82.0% + triton_mm_3036 0.0153 ms 74.9% + triton_mm_3037 0.0174 ms 65.6% +SingleProcess AUTOTUNE takes 6.2572 seconds +AUTOTUNE int_mm(5x128, 128x256, 5x256) + triton_mm_3057 0.0068 ms 100.0% + triton_mm_3056 0.0073 ms 93.4% + triton_mm_3060 0.0076 ms 90.3% + triton_mm_3058 0.0078 ms 87.3% + triton_mm_3054 0.0081 ms 84.3% + triton_mm_3062 0.0084 ms 82.0% + triton_mm_3052 0.0086 ms 79.6% + triton_mm_3055 0.0088 ms 77.5% + triton_mm_3053 0.0089 ms 77.3% + triton_mm_3061 0.0089 ms 77.3% +SingleProcess AUTOTUNE takes 3.5449 seconds +AUTOTUNE int_mm(5x256, 256x2048, 5x2048) + triton_mm_3069 0.0083 ms 100.0% + triton_mm_3067 0.0085 ms 98.1% + triton_mm_3068 0.0086 ms 96.7% + triton_mm_3071 0.0088 ms 94.9% + triton_mm_3073 0.0089 ms 93.9% + triton_mm_3065 0.0094 ms 88.4% + triton_mm_3072 0.0100 ms 83.6% + triton_mm_3064 0.0103 ms 80.5% + triton_mm_3063 0.0108 ms 77.4% + triton_mm_3066 0.0118 ms 70.4% +SingleProcess AUTOTUNE takes 3.7613 seconds +AUTOTUNE int_mm(5x2048, 2048x256, 5x256) + triton_mm_3084 0.0156 ms 100.0% + triton_mm_3079 0.0181 ms 86.4% + triton_mm_3083 0.0181 ms 86.1% + triton_mm_3082 0.0188 ms 82.9% + triton_mm_3080 0.0194 ms 80.7% + triton_mm_3078 0.0209 ms 74.7% + triton_mm_3076 0.0305 ms 51.3% + triton_mm_3075 0.0348 ms 44.9% + triton_mm_3074 0.0428 ms 36.5% + triton_mm_3077 0.0450 ms 34.7% +SingleProcess AUTOTUNE takes 3.7453 seconds +AUTOTUNE int_mm(4096x128, 128x256, 4096x256) + triton_mm_3124 0.0117 ms 100.0% + triton_mm_3116 0.0118 ms 99.5% + triton_mm_3117 0.0120 ms 97.9% + triton_mm_3118 0.0123 ms 95.1% + triton_mm_3120 0.0127 ms 92.2% + triton_mm_3123 0.0135 ms 86.5% + triton_mm_3119 0.0140 ms 83.6% + triton_mm_3121 0.0151 ms 77.4% + triton_mm_3122 0.0167 ms 70.0% + triton_mm_3125 0.0282 ms 41.5% +SingleProcess AUTOTUNE takes 5.7546 seconds +AUTOTUNE int_mm(1x256, 256x256, 1x256) + triton_mm_3331 0.0078 ms 100.0% + triton_mm_3327 0.0081 ms 96.4% + triton_mm_3326 0.0084 ms 93.5% + triton_mm_3329 0.0084 ms 93.1% + triton_mm_3330 0.0086 ms 91.0% + triton_mm_3325 0.0089 ms 88.1% + triton_mm_3323 0.0097 ms 80.8% + triton_mm_3322 0.0098 ms 79.5% + triton_mm_3321 0.0106 ms 73.7% + triton_mm_3324 0.0117 ms 66.8% +SingleProcess AUTOTUNE takes 3.7154 seconds +AUTOTUNE int_mm(1x256, 256x32, 1x32) + triton_mm_3348 0.0068 ms 100.0% + triton_mm_3347 0.0074 ms 92.6% + triton_mm_3346 0.0076 ms 89.1% + triton_mm_3345 0.0084 ms 81.6% + triton_mm_3344 0.0088 ms 77.7% + triton_mm_3343 0.0098 ms 69.4% +SingleProcess AUTOTUNE takes 1.8093 seconds +AUTOTUNE bmm(1x4x32, 1x32x65536) + triton_bmm_3442 0.0111 ms 100.0% + triton_bmm_3443 0.0111 ms 100.0% + triton_bmm_3435 0.0113 ms 97.7% + triton_bmm_3437 0.0113 ms 97.7% + triton_bmm_3434 0.0114 ms 97.2% + triton_bmm_3436 0.0114 ms 96.9% + triton_bmm_3440 0.0114 ms 96.9% + triton_bmm_3433 0.0119 ms 93.0% + triton_bmm_3439 0.0119 ms 93.0% + triton_bmm_3438 0.0124 ms 89.3% +SingleProcess AUTOTUNE takes 3.0116 seconds +AUTOTUNE int_mm(1x256, 256x4, 1x4) + triton_mm_3471 0.0068 ms 100.0% + triton_mm_3470 0.0071 ms 95.9% + triton_mm_3469 0.0073 ms 93.0% + triton_mm_3468 0.0076 ms 90.3% + triton_mm_3467 0.0083 ms 81.9% + triton_mm_3466 0.0092 ms 74.2% +SingleProcess AUTOTUNE takes 1.7779 seconds +[2023-12-12 23:50:50,844] torch._dynamo.utils: [WARNING] Similarity score=0.9880209565162659 +[2023-12-12 23:50:50,844] torch._dynamo.utils: [ERROR] Accuracy failed for key name low_res_logits +fail_accuracy-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +shufflenet_v2_x1_0 +cuda eval shufflenet_v2_x1_0 int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for shufflenet_v2_x1_0. Setting accuracy check to cosine +AUTOTUNE convolution(1x3x224x224, 24x3x3x3) + triton_convolution_4 0.0114 ms 100.0% + triton_convolution_3 0.0131 ms 86.8% + convolution 0.0132 ms 85.7% + triton_convolution_0 0.0144 ms 78.9% + triton_convolution_5 0.0184 ms 61.9% + triton_convolution_2 0.0253 ms 44.9% + triton_convolution_1 0.0305 ms 37.3% +SingleProcess AUTOTUNE takes 2.5298 seconds +AUTOTUNE mm(784x24, 24x58) + triton_mm_12 0.0067 ms 100.0% + triton_mm_6 0.0069 ms 96.8% + triton_mm_15 0.0071 ms 95.0% + triton_mm_9 0.0072 ms 93.7% + triton_mm_11 0.0072 ms 93.7% + triton_mm_14 0.0072 ms 93.7% + triton_mm_7 0.0075 ms 89.4% + triton_mm_17 0.0075 ms 89.4% + triton_mm_16 0.0076 ms 88.2% + triton_mm_10 0.0078 ms 86.1% +SingleProcess AUTOTUNE takes 3.9131 seconds +AUTOTUNE mm(3136x24, 24x58) + triton_mm_29 0.0073 ms 100.0% + triton_mm_24 0.0075 ms 97.0% + triton_mm_28 0.0076 ms 95.8% + triton_mm_18 0.0077 ms 94.6% + triton_mm_19 0.0077 ms 94.6% + triton_mm_21 0.0077 ms 94.6% + triton_mm_26 0.0077 ms 94.6% + triton_mm_27 0.0077 ms 94.6% + triton_mm_23 0.0077 ms 94.2% + mm 0.0078 ms 93.8% +SingleProcess AUTOTUNE takes 4.0271 seconds +AUTOTUNE mm(784x58, 58x58) + triton_mm_35 0.0070 ms 100.0% + triton_mm_31 0.0074 ms 94.8% + triton_mm_33 0.0074 ms 94.4% + triton_mm_39 0.0074 ms 94.4% + mm 0.0079 ms 87.9% + triton_mm_36 0.0080 ms 87.6% + triton_mm_30 0.0080 ms 86.9% + triton_mm_38 0.0081 ms 86.5% + triton_mm_34 0.0083 ms 84.5% + triton_mm_32 0.0084 ms 83.2% +SingleProcess AUTOTUNE takes 4.5812 seconds +ERROR:common:backend='inductor' raised: +LoweringException: AttributeError: 'SliceView' object has no attribute 'freeze_layout' + target: aten.convolution.default + args[0]: TensorBox( + SliceView( + View( + StorageBox( + ComputedBuffer(name='buf14', layout=FlexibleLayout('cuda', torch.bfloat16, size=[1, 58, 2, 28, 28], stride=[90944, 1568, 784, 28, 1]), data=Pointwise( + 'cuda', + torch.bfloat16, + def inner_fn(index): + _, i1, i2, i3, i4 = index + tmp0 = ops.load(buf13, i4 + 28 * i3 + 784 * i1 + 45472 * i2) + return tmp0 + , + ranges=[1, 58, 2, 28, 28], + origin_node=clone, + origins={clone} + )) + ), + size=[1, 116, 28, 28], + reindex=lambda i0, i1, i2, i3: [0, ModularIndexing(i1, 2, 58), ModularIndexing(i1, 1, 2), i2, i3], + origins={clone, view_1} + ), + size=[1, 58, 28, 28], + reindex=lambda i0, i1, i2, i3: [i0, i1 + 58, i2, i3], + origins={split} + ) + ) + args[1]: TensorBox(StorageBox( + InputBuffer(name='arg18_1', layout=FixedLayout('cuda', torch.bfloat16, size=[58, 58, 1, 1], stride=[58, 1, 1, 1])) + )) + args[2]: None + args[3]: [1, 1] + args[4]: [0, 0] + args[5]: [1, 1] + args[6]: False + args[7]: [0, 0] + args[8]: 1 + +Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information + + +You can suppress this exception and fall back to eager by setting: + import torch._dynamo + torch._dynamo.config.suppress_errors = True +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 2232, in check_accuracy + new_result = optimized_model_iter_fn(model_copy, example_inputs) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/eval_frame.py", line 488, in _fn + return fn(*args, **kwargs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1972, in run_n_iterations + self.model_iter_fn(mod, inputs, collect_outputs=False) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/eval_frame.py", line 654, in catch_errors + return callback(frame, cache_entry, hooks, frame_state) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/convert_frame.py", line 727, in _convert_frame + result = inner_convert(frame, cache_entry, hooks, frame_state) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/convert_frame.py", line 383, in _convert_frame_assert + compiled_product = _compile( + File "/home/cdhernandez/local/pytorch/torch/_dynamo/convert_frame.py", line 646, in _compile + guarded_code = compile_inner(code, one_graph, hooks, transform) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/utils.py", line 244, in time_wrapper + r = func(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/convert_frame.py", line 562, in compile_inner + out_code = transform_code_object(code, transform) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/bytecode_transformation.py", line 1033, in transform_code_object + transformations(instructions, code_options) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/convert_frame.py", line 151, in _fn + return fn(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/convert_frame.py", line 527, in transform + tracer.run() + File "/home/cdhernandez/local/pytorch/torch/_dynamo/symbolic_convert.py", line 2144, in run + super().run() + File "/home/cdhernandez/local/pytorch/torch/_dynamo/symbolic_convert.py", line 818, in run + and self.step() + File "/home/cdhernandez/local/pytorch/torch/_dynamo/symbolic_convert.py", line 781, in step + getattr(self, inst.opname)(inst) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/symbolic_convert.py", line 2259, in RETURN_VALUE + self.output.compile_subgraph( + File "/home/cdhernandez/local/pytorch/torch/_dynamo/output_graph.py", line 919, in compile_subgraph + self.compile_and_call_fx_graph(tx, list(reversed(stack_values)), root) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/contextlib.py", line 79, in inner + return func(*args, **kwds) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/output_graph.py", line 1087, in compile_and_call_fx_graph + compiled_fn = self.call_user_compiler(gm) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/utils.py", line 244, in time_wrapper + r = func(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/output_graph.py", line 1159, in call_user_compiler + raise BackendCompilerFailed(self.compiler_fn, e).with_traceback( + File "/home/cdhernandez/local/pytorch/torch/_dynamo/output_graph.py", line 1140, in call_user_compiler + compiled_fn = compiler_fn(gm, self.example_inputs()) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/repro/after_dynamo.py", line 117, in debug_wrapper + compiled_gm = compiler_fn(gm, example_inputs) + File "/home/cdhernandez/local/pytorch/torch/__init__.py", line 1672, in __call__ + return compile_fx(model_, inputs_, config_patches=self.config) + File "/home/cdhernandez/local/pytorch/torch/_inductor/compile_fx.py", line 952, in compile_fx + return compile_fx( + File "/home/cdhernandez/local/pytorch/torch/_inductor/compile_fx.py", line 1168, in compile_fx + return aot_autograd( + File "/home/cdhernandez/local/pytorch/torch/_dynamo/backends/common.py", line 55, in compiler_fn + cg = aot_module_simplified(gm, example_inputs, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_functorch/aot_autograd.py", line 887, in aot_module_simplified + compiled_fn = create_aot_dispatcher_function( + File "/home/cdhernandez/local/pytorch/torch/_dynamo/utils.py", line 244, in time_wrapper + r = func(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_functorch/aot_autograd.py", line 600, in create_aot_dispatcher_function + compiled_fn = compiler_fn(flat_fn, fake_flat_args, aot_config, fw_metadata=fw_metadata) + File "/home/cdhernandez/local/pytorch/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 425, in aot_wrapper_dedupe + return compiler_fn(flat_fn, leaf_flat_args, aot_config, fw_metadata=fw_metadata) + File "/home/cdhernandez/local/pytorch/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 630, in aot_wrapper_synthetic_base + return compiler_fn(flat_fn, flat_args, aot_config, fw_metadata=fw_metadata) + File "/home/cdhernandez/local/pytorch/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py", line 97, in aot_dispatch_base + compiled_fw = compiler(fw_module, updated_flat_args) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/utils.py", line 244, in time_wrapper + r = func(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_inductor/compile_fx.py", line 1100, in fw_compiler_base + return inner_compile( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/contextlib.py", line 79, in inner + return func(*args, **kwds) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/repro/after_aot.py", line 83, in debug_wrapper + inner_compiled_fn = compiler_fn(gm, example_inputs) + File "/home/cdhernandez/local/pytorch/torch/_inductor/debug.py", line 305, in inner + return fn(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/contextlib.py", line 79, in inner + return func(*args, **kwds) + File "/home/cdhernandez/local/pytorch/torch/_inductor/compile_fx.py", line 320, in compile_fx_inner + compiled_graph = fx_codegen_and_compile( + File "/home/cdhernandez/local/pytorch/torch/_inductor/compile_fx.py", line 535, in fx_codegen_and_compile + graph.run(*example_inputs) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/utils.py", line 244, in time_wrapper + r = func(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_inductor/graph.py", line 518, in run + return super().run(*args) + File "/home/cdhernandez/local/pytorch/torch/fx/interpreter.py", line 138, in run + self.env[node] = self.run_node(node) + File "/home/cdhernandez/local/pytorch/torch/_inductor/graph.py", line 815, in run_node + result = self.call_function(n.target, args, kwargs) + File "/home/cdhernandez/local/pytorch/torch/_inductor/graph.py", line 695, in call_function + raise LoweringException(e, target, args, kwargs).with_traceback( + File "/home/cdhernandez/local/pytorch/torch/_inductor/graph.py", line 692, in call_function + out = lowerings[target](*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_inductor/lowering.py", line 291, in wrapped + out = decomp_fn(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_inductor/kernel/conv.py", line 363, in convolution + return convert_1x1_conv_to_mm(x, weight, bias) + File "/home/cdhernandez/local/pytorch/torch/_inductor/kernel/conv.py", line 280, in convert_1x1_conv_to_mm + x.freeze_layout() + File "/home/cdhernandez/local/pytorch/torch/_inductor/ir.py", line 6264, in __getattr__ + fn = getattr(self.data, name) +torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised: +LoweringException: AttributeError: 'SliceView' object has no attribute 'freeze_layout' + target: aten.convolution.default + args[0]: TensorBox( + SliceView( + View( + StorageBox( + ComputedBuffer(name='buf14', layout=FlexibleLayout('cuda', torch.bfloat16, size=[1, 58, 2, 28, 28], stride=[90944, 1568, 784, 28, 1]), data=Pointwise( + 'cuda', + torch.bfloat16, + def inner_fn(index): + _, i1, i2, i3, i4 = index + tmp0 = ops.load(buf13, i4 + 28 * i3 + 784 * i1 + 45472 * i2) + return tmp0 + , + ranges=[1, 58, 2, 28, 28], + origin_node=clone, + origins={clone} + )) + ), + size=[1, 116, 28, 28], + reindex=lambda i0, i1, i2, i3: [0, ModularIndexing(i1, 2, 58), ModularIndexing(i1, 1, 2), i2, i3], + origins={clone, view_1} + ), + size=[1, 58, 28, 28], + reindex=lambda i0, i1, i2, i3: [i0, i1 + 58, i2, i3], + origins={split} + ) + ) + args[1]: TensorBox(StorageBox( + InputBuffer(name='arg18_1', layout=FixedLayout('cuda', torch.bfloat16, size=[58, 58, 1, 1], stride=[58, 1, 1, 1])) + )) + args[2]: None + args[3]: [1, 1] + args[4]: [0, 0] + args[5]: [1, 1] + args[6]: False + args[7]: [0, 0] + args[8]: 1 + +Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information + + +You can suppress this exception and fall back to eager by setting: + import torch._dynamo + torch._dynamo.config.suppress_errors = True + +TorchDynamo optimized model failed to run because of following error +fail_to_run + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +soft_actor_critic +cuda eval soft_actor_critic int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for soft_actor_critic. Setting accuracy check to cosine +AUTOTUNE mm(256x3, 3x1024) + triton_mm_8 0.0068 ms 100.0% + triton_mm_5 0.0070 ms 96.8% + triton_mm_10 0.0073 ms 93.4% + triton_mm_0 0.0074 ms 91.8% + triton_mm_6 0.0075 ms 90.8% + triton_mm_2 0.0075 ms 90.2% + triton_mm_3 0.0075 ms 90.2% + triton_mm_9 0.0076 ms 89.8% + mm 0.0078 ms 86.5% + triton_mm_1 0.0081 ms 84.1% +SingleProcess AUTOTUNE takes 4.0144 seconds +AUTOTUNE int_mm(256x1024, 1024x1024, 256x1024) + triton_mm_19 0.0155 ms 100.0% + triton_mm_17 0.0198 ms 77.9% + triton_mm_16 0.0202 ms 76.5% + triton_mm_15 0.0206 ms 74.9% + triton_mm_14 0.0207 ms 74.7% + triton_mm_13 0.0236 ms 65.6% + triton_mm_12 0.0240 ms 64.4% + triton_mm_20 0.0274 ms 56.4% + triton_mm_11 0.0278 ms 55.6% + triton_mm_21 0.0278 ms 55.6% +SingleProcess AUTOTUNE takes 7.4752 seconds +AUTOTUNE int_mm(256x1024, 1024x2, 256x2) + triton_mm_31 0.0115 ms 100.0% + triton_mm_32 0.0123 ms 93.7% + triton_mm_28 0.0128 ms 89.8% + triton_mm_30 0.0134 ms 86.1% + triton_mm_27 0.0140 ms 82.5% + triton_mm_25 0.0149 ms 77.1% + triton_mm_23 0.0205 ms 56.3% + triton_mm_24 0.0209 ms 55.0% + triton_mm_26 0.0249 ms 46.2% + triton_mm_22 0.0254 ms 45.3% +SingleProcess AUTOTUNE takes 4.2352 seconds +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +speech_transformer +cuda eval speech_transformer int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for speech_transformer. Setting accuracy check to cosine +AUTOTUNE int_mm(2040x320, 320x512, 2040x512) + triton_mm_8 0.0135 ms 100.0% + triton_mm_1 0.0138 ms 98.1% + triton_mm_2 0.0139 ms 97.5% + triton_mm_0 0.0146 ms 93.0% + triton_mm_3 0.0148 ms 91.8% + triton_mm_4 0.0148 ms 91.2% + triton_mm_7 0.0184 ms 73.4% + triton_mm_5 0.0185 ms 73.2% + triton_mm_6 0.0192 ms 70.5% + triton_mm_9 0.0199 ms 68.0% +SingleProcess AUTOTUNE takes 7.1859 seconds +AUTOTUNE int_mm(2040x512, 512x512, 2040x512) + triton_mm_19 0.0154 ms 100.0% + triton_mm_12 0.0175 ms 88.3% + triton_mm_13 0.0175 ms 88.0% + triton_mm_15 0.0181 ms 85.3% + triton_mm_14 0.0188 ms 82.1% + triton_mm_11 0.0191 ms 80.7% + triton_mm_21 0.0215 ms 71.6% + triton_mm_20 0.0219 ms 70.6% + triton_mm_18 0.0236 ms 65.2% + triton_mm_16 0.0256 ms 60.3% +SingleProcess AUTOTUNE takes 6.9939 seconds +AUTOTUNE bmm(80x204x64, 80x64x204) + triton_bmm_34 0.0176 ms 100.0% + triton_bmm_35 0.0177 ms 99.8% + triton_bmm_37 0.0197 ms 89.4% + triton_bmm_36 0.0197 ms 89.3% + triton_bmm_42 0.0198 ms 88.9% + triton_bmm_41 0.0203 ms 86.9% + triton_bmm_40 0.0205 ms 86.1% + triton_bmm_43 0.0210 ms 84.0% + triton_bmm_33 0.0211 ms 83.7% + triton_bmm_44 0.0239 ms 73.7% +SingleProcess AUTOTUNE takes 4.6349 seconds +AUTOTUNE bmm(80x204x204, 80x204x64) + triton_bmm_57 0.0180 ms 100.0% + triton_bmm_58 0.0181 ms 99.3% + triton_bmm_56 0.0182 ms 98.6% + triton_bmm_59 0.0193 ms 93.0% + triton_bmm_63 0.0194 ms 92.7% + triton_bmm_60 0.0195 ms 92.1% + triton_bmm_64 0.0196 ms 91.4% + triton_bmm_62 0.0201 ms 89.3% + triton_bmm_66 0.0214 ms 84.0% + bmm 0.0226 ms 79.3% +SingleProcess AUTOTUNE takes 4.3850 seconds +AUTOTUNE int_mm(2040x512, 512x2048, 2040x2048) + triton_mm_80 0.0317 ms 100.0% + triton_mm_81 0.0328 ms 96.7% + triton_mm_87 0.0343 ms 92.4% + triton_mm_83 0.0362 ms 87.6% + triton_mm_82 0.0380 ms 83.5% + triton_mm_89 0.0387 ms 81.8% + triton_mm_88 0.0390 ms 81.3% + triton_mm_79 0.0401 ms 79.1% + triton_mm_86 0.0417 ms 75.9% + triton_mm_84 0.0769 ms 41.2% +SingleProcess AUTOTUNE takes 7.2311 seconds +AUTOTUNE int_mm(2040x2048, 2048x512, 2040x512) + triton_mm_98 0.0306 ms 100.0% + triton_mm_100 0.0395 ms 77.4% + triton_mm_99 0.0400 ms 76.5% + triton_mm_91 0.0408 ms 74.9% + triton_mm_92 0.0409 ms 74.7% + triton_mm_93 0.0425 ms 72.0% + triton_mm_94 0.0436 ms 70.1% + triton_mm_90 0.0506 ms 60.4% + triton_mm_97 0.0666 ms 45.9% + triton_mm_95 0.0671 ms 45.6% +SingleProcess AUTOTUNE takes 7.1812 seconds +skipping cudagraphs due to ['non-cuda device in graph'] +AUTOTUNE int_mm(220x512, 512x512, 220x512) + triton_mm_559 0.0109 ms 100.0% + triton_mm_556 0.0114 ms 95.8% + triton_mm_557 0.0116 ms 93.9% + triton_mm_555 0.0135 ms 80.8% + triton_mm_554 0.0135 ms 80.6% + triton_mm_552 0.0157 ms 69.2% + triton_mm_553 0.0159 ms 68.4% + triton_mm_551 0.0165 ms 65.8% + triton_mm_560 0.0212 ms 51.4% + triton_mm_561 0.0214 ms 50.9% +SingleProcess AUTOTUNE takes 7.3139 seconds +AUTOTUNE bmm(80x22x64, 80x64x22) + triton_bmm_575 0.0069 ms 100.0% + triton_bmm_576 0.0069 ms 100.0% + triton_bmm_577 0.0069 ms 100.0% + triton_bmm_578 0.0069 ms 100.0% + triton_bmm_574 0.0074 ms 93.9% + bmm 0.0078 ms 88.6% + triton_bmm_573 0.0078 ms 88.6% + triton_bmm_580 0.0078 ms 88.6% + triton_bmm_579 0.0084 ms 82.2% +SingleProcess AUTOTUNE takes 2.6404 seconds +AUTOTUNE bmm(80x22x22, 80x22x64) + triton_bmm_593 0.0068 ms 100.0% + triton_bmm_596 0.0069 ms 98.1% + triton_bmm_599 0.0069 ms 97.2% + triton_bmm_592 0.0073 ms 92.7% + triton_bmm_601 0.0074 ms 91.7% + triton_bmm_594 0.0074 ms 91.3% + triton_bmm_597 0.0074 ms 91.3% + triton_bmm_598 0.0074 ms 91.3% + triton_bmm_595 0.0074 ms 90.9% + triton_bmm_600 0.0078 ms 86.1% +SingleProcess AUTOTUNE takes 3.1549 seconds +AUTOTUNE bmm(80x22x64, 80x64x204) + triton_bmm_642 0.0101 ms 100.0% + triton_bmm_637 0.0102 ms 99.4% + triton_bmm_644 0.0102 ms 99.4% + triton_bmm_640 0.0102 ms 99.1% + triton_bmm_646 0.0102 ms 99.1% + triton_bmm_643 0.0102 ms 98.7% + triton_bmm_645 0.0102 ms 98.7% + triton_bmm_641 0.0103 ms 98.1% + triton_bmm_636 0.0103 ms 97.8% + triton_bmm_639 0.0104 ms 97.5% +SingleProcess AUTOTUNE takes 3.8252 seconds +AUTOTUNE bmm(80x22x204, 80x204x64) + triton_bmm_664 0.0103 ms 100.0% + triton_bmm_659 0.0103 ms 99.7% + triton_bmm_662 0.0104 ms 99.1% + triton_bmm_660 0.0105 ms 97.6% + triton_bmm_665 0.0107 ms 95.8% + triton_bmm_661 0.0111 ms 92.8% + triton_bmm_658 0.0125 ms 82.3% + triton_bmm_663 0.0129 ms 79.5% + bmm 0.0134 ms 76.6% + triton_bmm_666 0.0164 ms 62.6% +SingleProcess AUTOTUNE takes 3.3785 seconds +AUTOTUNE int_mm(220x512, 512x2048, 220x2048) + triton_mm_687 0.0127 ms 100.0% + triton_mm_682 0.0142 ms 89.8% + triton_mm_683 0.0142 ms 89.4% + triton_mm_681 0.0158 ms 80.8% + triton_mm_680 0.0160 ms 79.6% + triton_mm_679 0.0175 ms 72.9% + triton_mm_685 0.0183 ms 69.7% + triton_mm_684 0.0183 ms 69.6% + triton_mm_688 0.0217 ms 58.6% + triton_mm_689 0.0219 ms 58.2% +SingleProcess AUTOTUNE takes 7.3541 seconds +AUTOTUNE int_mm(220x2048, 2048x512, 220x512) + triton_mm_695 0.0213 ms 100.0% + triton_mm_698 0.0218 ms 97.5% + triton_mm_696 0.0220 ms 96.7% + triton_mm_693 0.0280 ms 76.0% + triton_mm_694 0.0294 ms 72.3% + triton_mm_691 0.0384 ms 55.4% + triton_mm_692 0.0389 ms 54.6% + triton_mm_699 0.0400 ms 53.2% + triton_mm_700 0.0404 ms 52.7% + triton_mm_690 0.0465 ms 45.8% +SingleProcess AUTOTUNE takes 7.5515 seconds +AUTOTUNE int_mm(220x512, 512x1014, 220x1014) + triton_mm_1459 0.0116 ms 100.0% + triton_mm_1456 0.0142 ms 81.9% + triton_mm_1455 0.0142 ms 81.8% + triton_mm_1457 0.0143 ms 81.4% + triton_mm_1454 0.0151 ms 77.1% + triton_mm_1452 0.0157 ms 74.1% + triton_mm_1451 0.0164 ms 71.0% + triton_mm_1453 0.0164 ms 70.9% + triton_mm_1461 0.0234 ms 49.7% + triton_mm_1460 0.0246 ms 47.3% +SingleProcess AUTOTUNE takes 7.9013 seconds +skipping cudagraphs due to ['non-cuda device in graph'] +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:01, ?it/s] +squeezenet1_1 +cuda eval squeezenet1_1 int8dynamic-bs1-acc +AUTOTUNE convolution(1x3x224x224, 64x3x3x3) + triton_convolution_4 0.0141 ms 100.0% + triton_convolution_3 0.0155 ms 90.9% + convolution 0.0162 ms 87.2% + triton_convolution_5 0.0222 ms 63.6% + triton_convolution_0 0.0237 ms 59.6% + triton_convolution_2 0.0248 ms 56.8% + triton_convolution_1 0.0611 ms 23.1% +SingleProcess AUTOTUNE takes 3.2720 seconds +AUTOTUNE addmm(3025x16, 3025x64, 64x16) + triton_mm_15 0.0082 ms 100.0% + triton_mm_14 0.0083 ms 98.1% + triton_mm_17 0.0092 ms 88.9% + triton_mm_12 0.0097 ms 84.2% + triton_mm_7 0.0099 ms 82.3% + triton_mm_11 0.0099 ms 82.3% + triton_mm_6 0.0102 ms 80.2% + triton_mm_9 0.0103 ms 78.9% + triton_mm_13 0.0105 ms 78.0% + triton_mm_16 0.0105 ms 78.0% +SingleProcess AUTOTUNE takes 4.2614 seconds +AUTOTUNE addmm(3025x64, 3025x16, 16x64) + triton_mm_26 0.0071 ms 100.0% + triton_mm_19 0.0074 ms 96.5% + triton_mm_28 0.0074 ms 96.5% + triton_mm_27 0.0074 ms 96.1% + triton_mm_24 0.0076 ms 94.5% + triton_mm_21 0.0076 ms 93.9% + triton_mm_23 0.0076 ms 93.3% + triton_mm_18 0.0079 ms 89.9% + triton_mm_20 0.0084 ms 84.8% + triton_mm_22 0.0087 ms 82.0% +SingleProcess AUTOTUNE takes 4.1096 seconds +AUTOTUNE convolution(1x16x55x55, 64x16x3x3) + convolution 0.0109 ms 100.0% + triton_convolution_33 0.0138 ms 79.1% + triton_convolution_32 0.0158 ms 68.9% + triton_convolution_29 0.0204 ms 53.4% + triton_convolution_34 0.0209 ms 52.1% + triton_convolution_31 0.0304 ms 35.9% + triton_convolution_30 0.0350 ms 31.2% +SingleProcess AUTOTUNE takes 3.1970 seconds +AUTOTUNE addmm(3025x16, 3025x128, 128x16) + triton_mm_44 0.0100 ms 100.0% + triton_mm_40 0.0115 ms 86.6% + triton_mm_41 0.0115 ms 86.6% + triton_mm_46 0.0123 ms 81.2% + triton_mm_35 0.0123 ms 81.0% + triton_mm_43 0.0127 ms 78.1% + triton_mm_38 0.0130 ms 76.6% + triton_mm_36 0.0134 ms 74.4% + triton_mm_42 0.0135 ms 73.5% + triton_mm_39 0.0138 ms 72.3% +SingleProcess AUTOTUNE takes 4.4521 seconds +AUTOTUNE addmm(729x32, 729x128, 128x32) + triton_mm_73 0.0084 ms 100.0% + triton_mm_70 0.0099 ms 84.5% + triton_mm_69 0.0105 ms 79.9% + triton_mm_67 0.0107 ms 78.2% + triton_mm_72 0.0110 ms 76.5% + triton_mm_75 0.0117 ms 71.4% + triton_mm_64 0.0120 ms 70.1% + triton_mm_65 0.0129 ms 65.2% + triton_mm_66 0.0138 ms 60.8% + triton_mm_71 0.0138 ms 60.6% +SingleProcess AUTOTUNE takes 4.2887 seconds +AUTOTUNE addmm(729x128, 729x32, 32x128) + triton_mm_81 0.0069 ms 100.0% + triton_mm_76 0.0074 ms 93.1% + triton_mm_82 0.0074 ms 92.5% + triton_mm_85 0.0074 ms 92.5% + triton_mm_84 0.0078 ms 88.5% + triton_mm_77 0.0081 ms 85.3% + triton_mm_87 0.0081 ms 84.6% + triton_mm_78 0.0082 ms 84.3% + triton_mm_80 0.0084 ms 81.7% + triton_mm_79 0.0089 ms 77.3% +SingleProcess AUTOTUNE takes 5.1150 seconds +AUTOTUNE convolution(1x32x27x27, 128x32x3x3) + convolution 0.0110 ms 100.0% + triton_convolution_93 0.0235 ms 46.8% + triton_convolution_92 0.0279 ms 39.5% + triton_convolution_88 0.0314 ms 35.0% + triton_convolution_91 0.0326 ms 33.8% + triton_convolution_94 0.0375 ms 29.3% + triton_convolution_89 0.0551 ms 20.0% + triton_convolution_90 0.0623 ms 17.7% +SingleProcess AUTOTUNE takes 4.1971 seconds +AUTOTUNE addmm(729x32, 729x256, 256x32) + triton_mm_104 0.0117 ms 100.0% + triton_mm_101 0.0137 ms 85.9% + triton_mm_103 0.0143 ms 82.1% + triton_mm_100 0.0148 ms 79.4% + triton_mm_98 0.0151 ms 77.8% + triton_mm_106 0.0176 ms 66.7% + addmm 0.0178 ms 66.1% + triton_mm_96 0.0191 ms 61.4% + triton_mm_97 0.0204 ms 57.5% + triton_mm_99 0.0207 ms 56.9% +SingleProcess AUTOTUNE takes 4.5602 seconds +AUTOTUNE addmm(169x48, 169x256, 256x48) + triton_mm_135 0.0113 ms 100.0% + triton_mm_131 0.0138 ms 81.7% + triton_mm_132 0.0138 ms 81.5% + triton_mm_134 0.0146 ms 77.4% + triton_mm_129 0.0153 ms 73.5% + triton_mm_137 0.0161 ms 70.1% + triton_mm_128 0.0171 ms 65.8% + triton_mm_130 0.0171 ms 65.8% + triton_mm_126 0.0181 ms 62.3% + triton_mm_127 0.0186 ms 60.5% +SingleProcess AUTOTUNE takes 4.9880 seconds +AUTOTUNE addmm(169x192, 169x48, 48x192) + triton_mm_147 0.0077 ms 100.0% + triton_mm_146 0.0083 ms 92.8% + triton_mm_144 0.0083 ms 92.7% + triton_mm_143 0.0086 ms 89.6% + triton_mm_149 0.0087 ms 88.6% + triton_mm_141 0.0097 ms 79.2% + triton_mm_139 0.0099 ms 77.9% + triton_mm_140 0.0100 ms 77.2% + triton_mm_142 0.0100 ms 77.2% + triton_mm_148 0.0100 ms 76.8% +SingleProcess AUTOTUNE takes 5.5186 seconds +AUTOTUNE convolution(1x48x13x13, 192x48x3x3) + convolution 0.0120 ms 100.0% + triton_convolution_152 0.0368 ms 32.7% + triton_convolution_154 0.0457 ms 26.3% + triton_convolution_153 0.0558 ms 21.6% + triton_convolution_156 0.0624 ms 19.3% + triton_convolution_155 0.0638 ms 18.9% + triton_convolution_150 0.0751 ms 16.0% + triton_convolution_151 0.0881 ms 13.7% +SingleProcess AUTOTUNE takes 4.4411 seconds +AUTOTUNE addmm(169x48, 169x384, 384x48) + triton_mm_166 0.0124 ms 100.0% + triton_mm_163 0.0168 ms 74.0% + triton_mm_162 0.0173 ms 71.8% + triton_mm_165 0.0181 ms 68.7% + addmm 0.0181 ms 68.6% + triton_mm_160 0.0191 ms 65.1% + triton_mm_168 0.0212 ms 58.8% + triton_mm_161 0.0217 ms 57.4% + triton_mm_159 0.0219 ms 56.8% + triton_mm_157 0.0233 ms 53.5% +SingleProcess AUTOTUNE takes 5.7157 seconds +AUTOTUNE addmm(169x64, 169x384, 384x64) + triton_mm_197 0.0125 ms 100.0% + triton_mm_194 0.0168 ms 74.1% + triton_mm_193 0.0179 ms 69.9% + triton_mm_196 0.0181 ms 68.9% + triton_mm_191 0.0192 ms 65.0% + triton_mm_199 0.0212 ms 58.9% + triton_mm_192 0.0218 ms 57.4% + triton_mm_190 0.0220 ms 56.8% + addmm 0.0220 ms 56.6% + triton_mm_188 0.0237 ms 52.6% +SingleProcess AUTOTUNE takes 5.2951 seconds +AUTOTUNE addmm(169x256, 169x64, 64x256) + triton_mm_209 0.0079 ms 100.0% + triton_mm_211 0.0087 ms 91.1% + triton_mm_208 0.0089 ms 88.5% + triton_mm_206 0.0089 ms 88.4% + triton_mm_203 0.0097 ms 81.2% + triton_mm_205 0.0097 ms 81.2% + triton_mm_202 0.0100 ms 79.4% + triton_mm_200 0.0104 ms 75.8% + triton_mm_204 0.0105 ms 75.5% + triton_mm_210 0.0107 ms 74.0% +SingleProcess AUTOTUNE takes 5.4531 seconds +AUTOTUNE convolution(1x64x13x13, 256x64x3x3) + convolution 0.0108 ms 100.0% + triton_convolution_216 0.0439 ms 24.5% + triton_convolution_215 0.0539 ms 20.0% + triton_convolution_217 0.0556 ms 19.3% + triton_convolution_214 0.0641 ms 16.8% + triton_convolution_218 0.0661 ms 16.3% + triton_convolution_213 0.1106 ms 9.7% + triton_convolution_212 0.1272 ms 8.5% +SingleProcess AUTOTUNE takes 4.3827 seconds +AUTOTUNE addmm(169x64, 169x512, 512x64) + triton_mm_228 0.0152 ms 100.0% + addmm 0.0169 ms 90.3% + triton_mm_225 0.0208 ms 73.1% + triton_mm_224 0.0214 ms 71.0% + triton_mm_227 0.0217 ms 70.1% + triton_mm_222 0.0230 ms 66.1% + triton_mm_223 0.0265 ms 57.6% + triton_mm_230 0.0268 ms 56.8% + triton_mm_221 0.0271 ms 56.3% + triton_mm_219 0.0296 ms 51.5% +SingleProcess AUTOTUNE takes 5.2284 seconds +AUTOTUNE addmm(169x1000, 169x512, 512x1000) + triton_mm_259 0.0170 ms 100.0% + triton_mm_256 0.0206 ms 82.6% + triton_mm_258 0.0212 ms 80.1% + triton_mm_255 0.0219 ms 77.8% + triton_mm_251 0.0252 ms 67.4% + triton_mm_253 0.0253 ms 67.3% + triton_mm_254 0.0267 ms 63.7% + addmm 0.0271 ms 62.7% + triton_mm_261 0.0272 ms 62.7% + triton_mm_252 0.0276 ms 61.7% +SingleProcess AUTOTUNE takes 5.3328 seconds +pass-sqnr-43.374 + loading model: 0it [00:00, ?it/s]stable_diffusion_text_encoder + + Loading pipeline components...: 0%| | 0/6 [00:00 +AUTOTUNE convolution(1x3x224x224, 768x3x32x32) + convolution 0.2530 ms 100.0% + triton_convolution_4 0.4998 ms 50.6% + triton_convolution_6 0.7181 ms 35.2% + triton_convolution_1 0.8088 ms 31.3% + triton_convolution_3 0.8430 ms 30.0% + triton_convolution_2 0.9885 ms 25.6% + triton_convolution_5 1.1838 ms 21.4% + triton_convolution_0 1.3892 ms 18.2% +SingleProcess AUTOTUNE takes 4.7463 seconds +AUTOTUNE mm(50x768, 768x2304) + mm 0.0128 ms 100.0% + triton_mm_13 0.0136 ms 94.1% + triton_mm_12 0.0142 ms 89.9% + triton_mm_15 0.0144 ms 88.9% + triton_mm_16 0.0146 ms 87.7% + triton_mm_11 0.0148 ms 86.2% + triton_mm_10 0.0162 ms 78.9% + triton_mm_9 0.0184 ms 69.5% + triton_mm_8 0.0203 ms 62.8% + triton_mm_7 0.0294 ms 43.4% +SingleProcess AUTOTUNE takes 4.9880 seconds +AUTOTUNE int_mm(50x768, 768x768, 50x768) + triton_mm_29 0.0123 ms 100.0% + triton_mm_24 0.0126 ms 97.2% + triton_mm_25 0.0129 ms 95.3% + triton_mm_27 0.0129 ms 95.3% + triton_mm_23 0.0149 ms 82.4% + triton_mm_28 0.0164 ms 75.0% + triton_mm_21 0.0169 ms 72.6% + triton_mm_22 0.0169 ms 72.6% + triton_mm_20 0.0198 ms 62.1% + triton_mm_19 0.0238 ms 51.5% +SingleProcess AUTOTUNE takes 5.0110 seconds +AUTOTUNE int_mm(50x768, 768x3072, 50x3072) + triton_mm_40 0.0137 ms 100.0% + triton_mm_38 0.0143 ms 95.6% + triton_mm_36 0.0147 ms 93.4% + triton_mm_35 0.0150 ms 91.3% + triton_mm_34 0.0158 ms 86.8% + triton_mm_39 0.0162 ms 84.8% + triton_mm_33 0.0177 ms 77.4% + triton_mm_32 0.0180 ms 75.9% + triton_mm_31 0.0207 ms 66.0% + triton_mm_30 0.0237 ms 57.7% +SingleProcess AUTOTUNE takes 5.3626 seconds +AUTOTUNE int_mm(50x3072, 3072x768, 50x768) + triton_mm_51 0.0240 ms 100.0% + triton_mm_46 0.0291 ms 82.6% + triton_mm_47 0.0301 ms 79.7% + triton_mm_49 0.0306 ms 78.4% + triton_mm_50 0.0313 ms 76.7% + triton_mm_45 0.0376 ms 63.8% + triton_mm_44 0.0428 ms 56.0% + triton_mm_43 0.0484 ms 49.5% + triton_mm_42 0.0545 ms 44.0% + triton_mm_41 0.0779 ms 30.8% +SingleProcess AUTOTUNE takes 5.1156 seconds +AUTOTUNE mm(2464x512, 512x1536) + triton_mm_548 0.0314 ms 100.0% + triton_mm_549 0.0320 ms 98.0% + triton_mm_550 0.0362 ms 86.7% + triton_mm_551 0.0364 ms 86.2% + triton_mm_555 0.0401 ms 78.4% + mm 0.0402 ms 78.1% + triton_mm_547 0.0411 ms 76.4% + triton_mm_554 0.0442 ms 71.0% + triton_mm_557 0.0627 ms 50.1% + triton_mm_556 0.0666 ms 47.1% +SingleProcess AUTOTUNE takes 5.0998 seconds +AUTOTUNE int_mm(2464x512, 512x512, 2464x512) + triton_mm_567 0.0153 ms 100.0% + triton_mm_561 0.0175 ms 87.0% + triton_mm_560 0.0180 ms 84.7% + triton_mm_563 0.0189 ms 80.6% + triton_mm_562 0.0192 ms 79.4% + triton_mm_559 0.0193 ms 79.2% + triton_mm_568 0.0215 ms 71.1% + triton_mm_569 0.0217 ms 70.3% + triton_mm_566 0.0246 ms 62.1% + triton_mm_564 0.0294 ms 51.8% +SingleProcess AUTOTUNE takes 7.3853 seconds +AUTOTUNE int_mm(2464x512, 512x2048, 2464x2048) + triton_mm_571 0.0365 ms 100.0% + triton_mm_572 0.0380 ms 96.3% + triton_mm_580 0.0389 ms 93.9% + triton_mm_578 0.0393 ms 93.1% + triton_mm_579 0.0395 ms 92.6% + triton_mm_574 0.0412 ms 88.7% + triton_mm_573 0.0434 ms 84.2% + triton_mm_570 0.0440 ms 83.0% + triton_mm_577 0.0460 ms 79.4% + triton_mm_576 0.0946 ms 38.6% +SingleProcess AUTOTUNE takes 7.3797 seconds +AUTOTUNE int_mm(2464x2048, 2048x512, 2464x512) + triton_mm_589 0.0314 ms 100.0% + triton_mm_590 0.0399 ms 78.7% + triton_mm_591 0.0404 ms 77.8% + triton_mm_583 0.0420 ms 74.7% + triton_mm_582 0.0421 ms 74.5% + triton_mm_584 0.0431 ms 72.9% + triton_mm_585 0.0435 ms 72.1% + triton_mm_581 0.0526 ms 59.7% + triton_mm_588 0.0696 ms 45.1% + triton_mm_586 0.0793 ms 39.6% +SingleProcess AUTOTUNE takes 7.2199 seconds +AUTOTUNE mm(1x768, 768x512) + mm 0.0089 ms 100.0% + triton_mm_1092 0.0115 ms 77.4% + triton_mm_1095 0.0116 ms 76.1% + triton_mm_1093 0.0117 ms 75.9% + triton_mm_1091 0.0130 ms 68.1% + triton_mm_1096 0.0131 ms 67.6% + triton_mm_1090 0.0141 ms 62.7% + triton_mm_1089 0.0154 ms 57.5% + triton_mm_1088 0.0162 ms 54.7% + triton_mm_1087 0.0245 ms 36.2% +SingleProcess AUTOTUNE takes 3.9875 seconds +skipping cudagraphs due to ['non-cuda device in graph'] +pass-sqnr-error + loading model: 0it [00:00, ?it/s]WARNING:common:Model tts_angular does not support bfloat16, running with amp instead + loading model: 0it [00:01, ?it/s] +WARNING:common:Model tts_angular does not support bfloat16, running with amp instead +tts_angular +cuda eval tts_angular int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for tts_angular. Setting accuracy check to cosine +WARNING:common:Model tts_angular does not support bfloat16, running with amp instead +AUTOTUNE int_mm(50x768, 768x256, 50x256) + triton_mm_5 0.0124 ms 100.0% + triton_mm_6 0.0128 ms 96.8% + triton_mm_10 0.0129 ms 95.7% + triton_mm_8 0.0134 ms 92.4% + triton_mm_4 0.0146 ms 84.9% + triton_mm_3 0.0165 ms 74.9% + triton_mm_2 0.0173 ms 71.4% + triton_mm_9 0.0180 ms 68.7% + triton_mm_1 0.0205 ms 60.4% + triton_mm_0 0.0214 ms 57.9% +SingleProcess AUTOTUNE takes 5.6383 seconds +pass-sqnr-36.419 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +vgg16 +cuda eval vgg16 int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for vgg16. Setting accuracy check to cosine +AUTOTUNE convolution(1x3x224x224, 64x3x3x3) + triton_convolution_3 0.0277 ms 100.0% + triton_convolution_4 0.0287 ms 96.7% + convolution 0.0337 ms 82.3% + triton_convolution_5 0.0354 ms 78.3% + triton_convolution_0 0.0375 ms 74.1% + triton_convolution_2 0.0387 ms 71.7% + triton_convolution_1 0.0941 ms 29.5% +SingleProcess AUTOTUNE takes 3.1405 seconds +AUTOTUNE convolution(1x64x224x224, 64x64x3x3) + convolution 0.0339 ms 100.0% + triton_convolution_6 0.1276 ms 26.5% + triton_convolution_11 0.1441 ms 23.5% + triton_convolution_12 0.1850 ms 18.3% + triton_convolution_9 0.2120 ms 16.0% + triton_convolution_10 0.2151 ms 15.8% + triton_convolution_7 0.2389 ms 14.2% + triton_convolution_8 0.4951 ms 6.8% +SingleProcess AUTOTUNE takes 3.8029 seconds +AUTOTUNE convolution(1x64x112x112, 128x64x3x3) + convolution 0.0202 ms 100.0% + triton_convolution_16 0.0668 ms 30.2% + triton_convolution_13 0.0678 ms 29.8% + triton_convolution_18 0.0792 ms 25.5% + triton_convolution_17 0.1001 ms 20.2% + triton_convolution_19 0.1056 ms 19.1% + triton_convolution_14 0.1523 ms 13.3% + triton_convolution_15 0.2499 ms 8.1% +SingleProcess AUTOTUNE takes 4.1363 seconds +AUTOTUNE convolution(1x128x112x112, 128x128x3x3) + convolution 0.0297 ms 100.0% + triton_convolution_23 0.1268 ms 23.4% + triton_convolution_20 0.1350 ms 22.0% + triton_convolution_25 0.1499 ms 19.8% + triton_convolution_24 0.1891 ms 15.7% + triton_convolution_26 0.1999 ms 14.9% + triton_convolution_21 0.3741 ms 7.9% + triton_convolution_22 0.4889 ms 6.1% +SingleProcess AUTOTUNE takes 4.1108 seconds +AUTOTUNE convolution(1x128x56x56, 256x128x3x3) + convolution 0.0184 ms 100.0% + triton_convolution_32 0.1071 ms 17.2% + triton_convolution_31 0.1080 ms 17.0% + triton_convolution_30 0.1137 ms 16.2% + triton_convolution_33 0.1885 ms 9.8% + triton_convolution_27 0.2404 ms 7.7% + triton_convolution_28 0.3695 ms 5.0% + triton_convolution_29 0.4851 ms 3.8% +SingleProcess AUTOTUNE takes 4.5493 seconds +AUTOTUNE convolution(1x256x56x56, 256x256x3x3) + convolution 0.0280 ms 100.0% + triton_convolution_39 0.2347 ms 11.9% + triton_convolution_37 0.2439 ms 11.5% + triton_convolution_38 0.2535 ms 11.0% + triton_convolution_40 0.3829 ms 7.3% + triton_convolution_34 0.5168 ms 5.4% + triton_convolution_35 0.7414 ms 3.8% + triton_convolution_36 0.9621 ms 2.9% +SingleProcess AUTOTUNE takes 5.1384 seconds +AUTOTUNE convolution(1x256x28x28, 512x256x3x3) + convolution 0.0193 ms 100.0% + triton_convolution_52 0.1733 ms 11.2% + triton_convolution_53 0.2345 ms 8.2% + triton_convolution_51 0.2426 ms 8.0% + triton_convolution_54 0.3954 ms 4.9% + triton_convolution_48 0.5175 ms 3.7% + triton_convolution_50 0.7628 ms 2.5% + triton_convolution_49 0.7751 ms 2.5% +SingleProcess AUTOTUNE takes 4.6666 seconds +AUTOTUNE convolution(1x512x28x28, 512x512x3x3) + convolution 0.0295 ms 100.0% + triton_convolution_59 0.4670 ms 6.3% + triton_convolution_60 0.5042 ms 5.9% + triton_convolution_58 0.6984 ms 4.2% + triton_convolution_61 0.7930 ms 3.7% + triton_convolution_55 1.1882 ms 2.5% + triton_convolution_57 1.5142 ms 2.0% + triton_convolution_56 1.6066 ms 1.8% +SingleProcess AUTOTUNE takes 4.9398 seconds +AUTOTUNE convolution(1x512x14x14, 512x512x3x3) + convolution 0.0244 ms 100.0% + triton_convolution_73 0.4603 ms 5.3% + triton_convolution_74 0.4944 ms 4.9% + triton_convolution_71 0.5028 ms 4.8% + triton_convolution_72 0.6874 ms 3.5% + triton_convolution_75 0.8235 ms 3.0% + triton_convolution_69 1.1755 ms 2.1% + triton_convolution_70 1.2216 ms 2.0% +SingleProcess AUTOTUNE takes 4.8287 seconds +AUTOTUNE int_mm(1x25088, 25088x4096, 1x4096) + triton_mm_100 0.1270 ms 100.0% + triton_mm_99 0.1426 ms 89.1% + triton_mm_98 0.1801 ms 70.5% + triton_mm_95 0.1862 ms 68.2% + triton_mm_96 0.1947 ms 65.3% + triton_mm_94 0.2127 ms 59.7% + triton_mm_92 0.3380 ms 37.6% + triton_mm_91 0.3747 ms 33.9% + triton_mm_90 0.5241 ms 24.2% + triton_mm_93 0.5913 ms 21.5% +SingleProcess AUTOTUNE takes 3.7734 seconds +pass-sqnr-41.484 + loading model: 0it [00:00, ?it/s]WARNING:common:Model vision_maskrcnn does not support bfloat16, running with float16 instead + loading model: 0it [00:04, ?it/s] +WARNING:common:Model vision_maskrcnn does not support bfloat16, running with float16 instead +vision_maskrcnn +cuda eval vision_maskrcnn int8dynamic-bs1-acc +WARNING:common:fp64 golden ref were not generated for vision_maskrcnn. Setting accuracy check to cosine +WARNING:common:Model vision_maskrcnn does not support bfloat16, running with float16 instead +AUTOTUNE mm(60800x256, 256x128) + mm 0.0478 ms 100.0% + triton_mm_112 0.0480 ms 99.7% + triton_mm_113 0.0489 ms 97.9% + triton_mm_114 0.0520 ms 92.0% + triton_mm_115 0.0529 ms 90.5% + triton_mm_119 0.0542 ms 88.2% + triton_mm_111 0.0545 ms 87.8% + triton_mm_118 0.0545 ms 87.8% + triton_mm_121 0.0757 ms 63.2% + triton_mm_116 0.0784 ms 61.0% +SingleProcess AUTOTUNE takes 5.0811 seconds +AUTOTUNE convolution(1x128x200x304, 128x128x3x3) + convolution 0.0464 ms 100.0% + triton_convolution_129 0.3076 ms 15.1% + triton_convolution_128 0.3440 ms 13.5% + triton_convolution_124 0.3723 ms 12.5% + triton_convolution_123 0.3832 ms 12.1% + triton_convolution_126 0.4029 ms 11.5% + triton_convolution_127 0.4209 ms 11.0% + triton_convolution_125 0.9983 ms 4.7% +SingleProcess AUTOTUNE takes 4.5278 seconds +AUTOTUNE mm(15200x512, 512x256) + triton_mm_244 0.0357 ms 100.0% + mm 0.0364 ms 98.2% + triton_mm_243 0.0374 ms 95.5% + triton_mm_246 0.0388 ms 91.9% + triton_mm_245 0.0398 ms 89.6% + triton_mm_250 0.0424 ms 84.2% + triton_mm_242 0.0471 ms 75.8% + triton_mm_249 0.0538 ms 66.3% + triton_mm_252 0.0680 ms 52.5% + triton_mm_248 0.0680 ms 52.5% +SingleProcess AUTOTUNE takes 5.0421 seconds +AUTOTUNE convolution(1x256x100x152, 256x256x3x3) + convolution 0.0405 ms 100.0% + triton_convolution_259 0.3056 ms 13.3% + triton_convolution_260 0.3481 ms 11.6% + triton_convolution_257 0.4397 ms 9.2% + triton_convolution_254 0.5898 ms 6.9% + triton_convolution_258 0.5984 ms 6.8% + triton_convolution_255 0.6599 ms 6.1% + triton_convolution_256 1.0213 ms 4.0% +SingleProcess AUTOTUNE takes 4.8855 seconds +AUTOTUNE mm(3800x1024, 1024x512) + triton_mm_436 0.0358 ms 100.0% + triton_mm_437 0.0380 ms 94.3% + mm 0.0397 ms 90.2% + triton_mm_438 0.0423 ms 84.6% + triton_mm_439 0.0443 ms 80.8% + triton_mm_443 0.0449 ms 79.7% + triton_mm_435 0.0460 ms 77.8% + triton_mm_445 0.0652 ms 54.9% + triton_mm_441 0.0664 ms 53.9% + triton_mm_440 0.0671 ms 53.4% +SingleProcess AUTOTUNE takes 4.6894 seconds +AUTOTUNE convolution(1x512x50x76, 512x512x3x3) + convolution 0.0465 ms 100.0% + triton_convolution_452 0.5966 ms 7.8% + triton_convolution_453 0.6855 ms 6.8% + triton_convolution_451 0.7745 ms 6.0% + triton_convolution_450 0.8244 ms 5.6% + triton_convolution_448 1.2252 ms 3.8% + triton_convolution_447 1.2768 ms 3.6% + triton_convolution_449 1.9313 ms 2.4% +SingleProcess AUTOTUNE takes 4.3919 seconds +AUTOTUNE addmm(60800x256, 60800x256, 256x256) + bias_addmm 0.0690 ms 100.0% + triton_mm_556 0.0721 ms 95.6% + triton_mm_555 0.0791 ms 87.2% + triton_mm_558 0.0793 ms 87.0% + triton_mm_561 0.0810 ms 85.1% + triton_mm_557 0.0862 ms 80.0% + triton_mm_554 0.0878 ms 78.5% + triton_mm_562 0.0918 ms 75.1% + triton_mm_564 0.1196 ms 57.6% + addmm 0.1225 ms 56.3% +SingleProcess AUTOTUNE takes 5.1635 seconds +AUTOTUNE convolution(1x256x200x304, 256x256x3x3) + convolution 0.3142 ms 100.0% + triton_convolution_595 1.8956 ms 16.6% + triton_convolution_593 2.3172 ms 13.6% + triton_convolution_596 2.5791 ms 12.2% + triton_convolution_590 2.7150 ms 11.6% + triton_convolution_594 4.8677 ms 6.5% + triton_convolution_591 4.9803 ms 6.3% + triton_convolution_592 8.7186 ms 3.6% +SingleProcess AUTOTUNE takes 4.6822 seconds +AUTOTUNE addmm(60800x12, 60800x256, 256x12) + triton_mm_620 0.0367 ms 100.0% + triton_mm_619 0.0376 ms 97.5% + triton_mm_618 0.0389 ms 94.4% + triton_mm_625 0.0391 ms 93.9% + triton_mm_626 0.0394 ms 93.2% + triton_mm_621 0.0396 ms 92.6% + triton_mm_622 0.0397 ms 92.5% + triton_mm_623 0.0407 ms 90.2% + triton_mm_628 0.0408 ms 90.0% + triton_mm_624 0.0426 ms 86.2% +SingleProcess AUTOTUNE takes 4.3083 seconds +AUTOTUNE addmm(15200x12, 15200x256, 256x12) + triton_mm_639 0.0142 ms 100.0% + triton_mm_638 0.0143 ms 99.8% + triton_mm_641 0.0148 ms 96.1% + triton_mm_640 0.0149 ms 95.3% + triton_mm_642 0.0149 ms 95.3% + triton_mm_645 0.0151 ms 94.5% + triton_mm_646 0.0155 ms 92.1% + triton_mm_643 0.0158 ms 90.0% + triton_mm_637 0.0168 ms 85.0% + triton_mm_644 0.0171 ms 83.2% +SingleProcess AUTOTUNE takes 4.3255 seconds +AUTOTUNE addmm(3800x12, 3800x256, 256x12) + triton_mm_659 0.0096 ms 100.0% + triton_mm_662 0.0096 ms 100.0% + triton_mm_661 0.0098 ms 98.7% + triton_mm_664 0.0099 ms 97.1% + triton_mm_660 0.0101 ms 95.6% + triton_mm_665 0.0102 ms 94.4% + triton_mm_657 0.0103 ms 93.2% + triton_mm_658 0.0114 ms 84.3% + triton_mm_656 0.0140 ms 68.6% + triton_mm_663 0.0145 ms 66.3% +SingleProcess AUTOTUNE takes 4.0208 seconds +AUTOTUNE addmm(950x12, 950x256, 256x12) + triton_mm_681 0.0084 ms 100.0% + triton_mm_683 0.0088 ms 95.3% + triton_mm_684 0.0088 ms 95.3% + triton_mm_680 0.0090 ms 93.2% + triton_mm_678 0.0096 ms 87.0% + triton_mm_676 0.0099 ms 85.1% + triton_mm_679 0.0108 ms 77.7% + triton_mm_677 0.0109 ms 77.1% + triton_mm_675 0.0135 ms 61.9% + addmm 0.0138 ms 60.9% +SingleProcess AUTOTUNE takes 4.4240 seconds +AUTOTUNE convolution(1x256x13x19, 256x256x3x3) + convolution 0.0169 ms 100.0% + triton_convolution_691 0.1676 ms 10.1% + triton_convolution_692 0.2361 ms 7.1% + triton_convolution_690 0.2409 ms 7.0% + triton_convolution_689 0.3028 ms 5.6% + triton_convolution_693 0.3882 ms 4.3% + triton_convolution_687 0.5306 ms 3.2% + triton_convolution_688 0.6054 ms 2.8% +SingleProcess AUTOTUNE takes 4.3348 seconds +AUTOTUNE addmm(247x12, 247x256, 256x12) + triton_mm_703 0.0141 ms 100.0% + addmm 0.0151 ms 93.4% + triton_mm_705 0.0162 ms 87.0% + triton_mm_702 0.0169 ms 83.3% + triton_mm_700 0.0170 ms 83.0% + triton_mm_697 0.0172 ms 81.9% + triton_mm_695 0.0175 ms 80.4% + triton_mm_699 0.0177 ms 79.4% + triton_mm_694 0.0181 ms 77.6% + triton_mm_701 0.0185 ms 76.3% +SingleProcess AUTOTUNE takes 4.5044 seconds +AUTOTUNE addmm(60800x3, 60800x256, 256x3) + triton_mm_708 0.0373 ms 100.0% + triton_mm_707 0.0381 ms 98.1% + triton_mm_713 0.0386 ms 96.8% + triton_mm_710 0.0389 ms 96.0% + triton_mm_706 0.0391 ms 95.6% + triton_mm_709 0.0395 ms 94.5% + triton_mm_714 0.0398 ms 93.8% + triton_mm_711 0.0407 ms 91.7% + triton_mm_716 0.0411 ms 91.0% + triton_mm_717 0.0430 ms 86.8% +SingleProcess AUTOTUNE takes 4.1615 seconds +AUTOTUNE addmm(15200x3, 15200x256, 256x3) + triton_mm_719 0.0143 ms 100.0% + triton_mm_720 0.0143 ms 99.8% + triton_mm_721 0.0143 ms 99.6% + triton_mm_723 0.0148 ms 96.1% + triton_mm_722 0.0154 ms 92.9% + triton_mm_726 0.0162 ms 87.9% + triton_mm_718 0.0168 ms 84.8% + triton_mm_725 0.0171 ms 83.4% + triton_mm_728 0.0208 ms 68.7% + triton_mm_729 0.0208 ms 68.7% +SingleProcess AUTOTUNE takes 4.2508 seconds +AUTOTUNE addmm(3800x3, 3800x256, 256x3) + triton_mm_733 0.0096 ms 100.0% + triton_mm_738 0.0096 ms 99.7% + triton_mm_735 0.0099 ms 97.4% + triton_mm_736 0.0104 ms 92.0% + triton_mm_734 0.0107 ms 90.1% + triton_mm_732 0.0108 ms 88.5% + triton_mm_731 0.0109 ms 88.2% + triton_mm_739 0.0112 ms 85.5% + triton_mm_730 0.0141 ms 68.2% + triton_mm_737 0.0145 ms 66.1% +SingleProcess AUTOTUNE takes 4.3623 seconds +AUTOTUNE addmm(950x3, 950x256, 256x3) + triton_mm_748 0.0089 ms 100.0% + triton_mm_751 0.0091 ms 98.8% + triton_mm_747 0.0094 ms 94.7% + triton_mm_750 0.0095 ms 93.9% + triton_mm_745 0.0096 ms 92.9% + triton_mm_743 0.0099 ms 90.7% + triton_mm_746 0.0108 ms 82.9% + triton_mm_744 0.0114 ms 78.3% + triton_mm_742 0.0130 ms 68.8% + triton_mm_749 0.0140 ms 63.7% +SingleProcess AUTOTUNE takes 4.4793 seconds +AUTOTUNE addmm(247x3, 247x256, 256x3) + triton_mm_763 0.0140 ms 100.0% + addmm 0.0146 ms 96.1% + triton_mm_765 0.0156 ms 90.0% + triton_mm_760 0.0164 ms 85.6% + triton_mm_755 0.0171 ms 82.4% + triton_mm_757 0.0171 ms 82.4% + triton_mm_762 0.0172 ms 81.6% + triton_mm_759 0.0177 ms 79.4% + triton_mm_761 0.0177 ms 79.4% + triton_mm_754 0.0180 ms 78.1% +SingleProcess AUTOTUNE takes 4.3377 seconds +AUTOTUNE int_mm(0x12544, 12544x1024, 0x1024) + triton_mm_766 0.0028 ms 100.0% + triton_mm_767 0.0028 ms 100.0% + triton_mm_768 0.0028 ms 100.0% + triton_mm_769 0.0028 ms 100.0% + triton_mm_770 0.0028 ms 100.0% + triton_mm_771 0.0028 ms 100.0% + triton_mm_772 0.0028 ms 100.0% + triton_mm_773 0.0028 ms 100.0% + triton_mm_774 0.0028 ms 100.0% + triton_mm_775 0.0028 ms 100.0% +SingleProcess AUTOTUNE takes 2.2396 seconds +AUTOTUNE int_mm(0x1024, 1024x1024, 0x1024) + triton_mm_777 0.0028 ms 100.0% + triton_mm_778 0.0028 ms 100.0% + triton_mm_779 0.0028 ms 100.0% + triton_mm_780 0.0028 ms 100.0% + triton_mm_781 0.0028 ms 100.0% + triton_mm_782 0.0028 ms 100.0% + triton_mm_783 0.0028 ms 100.0% + triton_mm_784 0.0028 ms 100.0% + triton_mm_785 0.0028 ms 100.0% + triton_mm_786 0.0028 ms 100.0% +SingleProcess AUTOTUNE takes 2.2929 seconds +AUTOTUNE int_mm(0x1024, 1024x91, 0x91) + triton_mm_788 0.0028 ms 100.0% + triton_mm_789 0.0028 ms 100.0% + triton_mm_790 0.0028 ms 100.0% + triton_mm_791 0.0028 ms 100.0% + triton_mm_792 0.0028 ms 100.0% + triton_mm_793 0.0028 ms 100.0% + triton_mm_794 0.0028 ms 100.0% + triton_mm_795 0.0028 ms 100.0% + triton_mm_796 0.0028 ms 100.0% + triton_mm_797 0.0028 ms 100.0% +SingleProcess AUTOTUNE takes 2.0265 seconds +AUTOTUNE int_mm(0x1024, 1024x364, 0x364) + triton_mm_798 0.0028 ms 100.0% + triton_mm_799 0.0028 ms 100.0% + triton_mm_800 0.0028 ms 100.0% + triton_mm_801 0.0028 ms 100.0% + triton_mm_802 0.0028 ms 100.0% + triton_mm_803 0.0028 ms 100.0% + triton_mm_804 0.0028 ms 100.0% + triton_mm_805 0.0028 ms 100.0% + triton_mm_806 0.0028 ms 100.0% + triton_mm_807 0.0028 ms 100.0% +SingleProcess AUTOTUNE takes 2.2097 seconds +ERROR:common:backend='inductor' raised: +LoweringException: AssertionError: Incorrect result from choice ExternKernelCaller(extern_kernels.convolution) + +expected size 256==256, stride 196==1 at dim=1 + target: aten.convolution.default + args[0]: TensorBox(StorageBox( + InputBuffer(name='arg12_1', layout=FixedLayout('cuda', torch.float16, size=[0, 256, 14, 14], stride=[50176, 196, 14, 1])) + )) + args[1]: TensorBox(StorageBox( + InputBuffer(name='arg0_1', layout=FixedLayout('cuda', torch.float16, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1])) + )) + args[2]: TensorBox(StorageBox( + InputBuffer(name='arg1_1', layout=FixedLayout('cuda', torch.float16, size=[256], stride=[1])) + )) + args[3]: [1, 1] + args[4]: [1, 1] + args[5]: [1, 1] + args[6]: False + args[7]: [0, 0] + args[8]: 1 + +Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information + + +You can suppress this exception and fall back to eager by setting: + import torch._dynamo + torch._dynamo.config.suppress_errors = True +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 2232, in check_accuracy + new_result = optimized_model_iter_fn(model_copy, example_inputs) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/eval_frame.py", line 488, in _fn + return fn(*args, **kwargs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1972, in run_n_iterations + self.model_iter_fn(mod, inputs, collect_outputs=False) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/vision/torchvision/models/detection/generalized_rcnn.py", line 104, in forward + proposals, proposal_losses = self.rpn(images, features, targets) + File "/home/cdhernandez/local/vision/torchvision/models/detection/generalized_rcnn.py", line 105, in resume_in_forward + detections, detector_losses = self.roi_heads(features, proposals, images.image_sizes, targets) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/vision/torchvision/models/detection/roi_heads.py", line 761, in forward + box_features = self.box_roi_pool(features, proposals, image_shapes) + File "/home/cdhernandez/local/vision/torchvision/models/detection/roi_heads.py", line 775, in resume_in_forward + boxes, scores, labels = self.postprocess_detections(class_logits, box_regression, proposals, image_shapes) + File "/home/cdhernandez/local/vision/torchvision/models/detection/roi_heads.py", line 804, in resume_in_forward + mask_features = self.mask_roi_pool(features, mask_proposals, image_shapes) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/eval_frame.py", line 654, in catch_errors + return callback(frame, cache_entry, hooks, frame_state) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/convert_frame.py", line 727, in _convert_frame + result = inner_convert(frame, cache_entry, hooks, frame_state) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/convert_frame.py", line 383, in _convert_frame_assert + compiled_product = _compile( + File "/home/cdhernandez/local/pytorch/torch/_dynamo/convert_frame.py", line 646, in _compile + guarded_code = compile_inner(code, one_graph, hooks, transform) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/utils.py", line 244, in time_wrapper + r = func(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/convert_frame.py", line 562, in compile_inner + out_code = transform_code_object(code, transform) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/bytecode_transformation.py", line 1033, in transform_code_object + transformations(instructions, code_options) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/convert_frame.py", line 151, in _fn + return fn(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/convert_frame.py", line 527, in transform + tracer.run() + File "/home/cdhernandez/local/pytorch/torch/_dynamo/symbolic_convert.py", line 2144, in run + super().run() + File "/home/cdhernandez/local/pytorch/torch/_dynamo/symbolic_convert.py", line 818, in run + and self.step() + File "/home/cdhernandez/local/pytorch/torch/_dynamo/symbolic_convert.py", line 781, in step + getattr(self, inst.opname)(inst) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/symbolic_convert.py", line 2259, in RETURN_VALUE + self.output.compile_subgraph( + File "/home/cdhernandez/local/pytorch/torch/_dynamo/output_graph.py", line 945, in compile_subgraph + self.compile_and_call_fx_graph(tx, pass2.graph_output_vars(), root) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/contextlib.py", line 79, in inner + return func(*args, **kwds) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/output_graph.py", line 1087, in compile_and_call_fx_graph + compiled_fn = self.call_user_compiler(gm) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/utils.py", line 244, in time_wrapper + r = func(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/output_graph.py", line 1159, in call_user_compiler + raise BackendCompilerFailed(self.compiler_fn, e).with_traceback( + File "/home/cdhernandez/local/pytorch/torch/_dynamo/output_graph.py", line 1140, in call_user_compiler + compiled_fn = compiler_fn(gm, self.example_inputs()) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/repro/after_dynamo.py", line 117, in debug_wrapper + compiled_gm = compiler_fn(gm, example_inputs) + File "/home/cdhernandez/local/pytorch/torch/__init__.py", line 1672, in __call__ + return compile_fx(model_, inputs_, config_patches=self.config) + File "/home/cdhernandez/local/pytorch/torch/_inductor/compile_fx.py", line 952, in compile_fx + return compile_fx( + File "/home/cdhernandez/local/pytorch/torch/_inductor/compile_fx.py", line 1168, in compile_fx + return aot_autograd( + File "/home/cdhernandez/local/pytorch/torch/_dynamo/backends/common.py", line 55, in compiler_fn + cg = aot_module_simplified(gm, example_inputs, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_functorch/aot_autograd.py", line 887, in aot_module_simplified + compiled_fn = create_aot_dispatcher_function( + File "/home/cdhernandez/local/pytorch/torch/_dynamo/utils.py", line 244, in time_wrapper + r = func(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_functorch/aot_autograd.py", line 600, in create_aot_dispatcher_function + compiled_fn = compiler_fn(flat_fn, fake_flat_args, aot_config, fw_metadata=fw_metadata) + File "/home/cdhernandez/local/pytorch/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 425, in aot_wrapper_dedupe + return compiler_fn(flat_fn, leaf_flat_args, aot_config, fw_metadata=fw_metadata) + File "/home/cdhernandez/local/pytorch/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 630, in aot_wrapper_synthetic_base + return compiler_fn(flat_fn, flat_args, aot_config, fw_metadata=fw_metadata) + File "/home/cdhernandez/local/pytorch/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py", line 97, in aot_dispatch_base + compiled_fw = compiler(fw_module, updated_flat_args) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/utils.py", line 244, in time_wrapper + r = func(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_inductor/compile_fx.py", line 1100, in fw_compiler_base + return inner_compile( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/contextlib.py", line 79, in inner + return func(*args, **kwds) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/repro/after_aot.py", line 83, in debug_wrapper + inner_compiled_fn = compiler_fn(gm, example_inputs) + File "/home/cdhernandez/local/pytorch/torch/_inductor/debug.py", line 305, in inner + return fn(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/contextlib.py", line 79, in inner + return func(*args, **kwds) + File "/home/cdhernandez/local/pytorch/torch/_inductor/compile_fx.py", line 320, in compile_fx_inner + compiled_graph = fx_codegen_and_compile( + File "/home/cdhernandez/local/pytorch/torch/_inductor/compile_fx.py", line 535, in fx_codegen_and_compile + graph.run(*example_inputs) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/utils.py", line 244, in time_wrapper + r = func(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_inductor/graph.py", line 518, in run + return super().run(*args) + File "/home/cdhernandez/local/pytorch/torch/fx/interpreter.py", line 138, in run + self.env[node] = self.run_node(node) + File "/home/cdhernandez/local/pytorch/torch/_inductor/graph.py", line 815, in run_node + result = self.call_function(n.target, args, kwargs) + File "/home/cdhernandez/local/pytorch/torch/_inductor/graph.py", line 695, in call_function + raise LoweringException(e, target, args, kwargs).with_traceback( + File "/home/cdhernandez/local/pytorch/torch/_inductor/graph.py", line 692, in call_function + out = lowerings[target](*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_inductor/lowering.py", line 291, in wrapped + out = decomp_fn(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_inductor/kernel/conv.py", line 367, in convolution + result = convolution(x, weight, None, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_inductor/lowering.py", line 291, in wrapped + out = decomp_fn(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_inductor/kernel/conv.py", line 457, in convolution + return autotune_select_algorithm("convolution", choices, args, layout) + File "/home/cdhernandez/local/pytorch/torch/_inductor/select_algorithm.py", line 991, in autotune_select_algorithm + return _ALGORITHM_SELECTOR_CACHE(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_inductor/select_algorithm.py", line 748, in __call__ + timings = self.lookup( + File "/home/cdhernandez/local/pytorch/torch/_inductor/codecache.py", line 291, in lookup + timings = benchmark(choices) + File "/home/cdhernandez/local/pytorch/torch/_inductor/select_algorithm.py", line 739, in autotune + return make_benchmark_fn()(choices) + File "/home/cdhernandez/local/pytorch/torch/_inductor/select_algorithm.py", line 865, in benchmark_in_current_process + raise AssertionError( # noqa: TRY200 +torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised: +LoweringException: AssertionError: Incorrect result from choice ExternKernelCaller(extern_kernels.convolution) + +expected size 256==256, stride 196==1 at dim=1 + target: aten.convolution.default + args[0]: TensorBox(StorageBox( + InputBuffer(name='arg12_1', layout=FixedLayout('cuda', torch.float16, size=[0, 256, 14, 14], stride=[50176, 196, 14, 1])) + )) + args[1]: TensorBox(StorageBox( + InputBuffer(name='arg0_1', layout=FixedLayout('cuda', torch.float16, size=[256, 256, 3, 3], stride=[2304, 9, 3, 1])) + )) + args[2]: TensorBox(StorageBox( + InputBuffer(name='arg1_1', layout=FixedLayout('cuda', torch.float16, size=[256], stride=[1])) + )) + args[3]: [1, 1] + args[4]: [1, 1] + args[5]: [1, 1] + args[6]: False + args[7]: [0, 0] + args[8]: 1 + +Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information + + +You can suppress this exception and fall back to eager by setting: + import torch._dynamo + torch._dynamo.config.suppress_errors = True + +TorchDynamo optimized model failed to run because of following error +fail_to_run + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +yolov3 +cuda eval yolov3 int8dynamic-bs1-acc +AUTOTUNE convolution(1x3x384x512, 32x3x3x3) + convolution 0.0588 ms 100.0% + triton_convolution_4 0.0634 ms 92.8% + triton_convolution_0 0.0664 ms 88.5% + triton_convolution_3 0.0691 ms 85.2% + triton_convolution_2 0.0692 ms 85.0% + triton_convolution_5 0.0902 ms 65.2% + triton_convolution_1 0.0939 ms 62.6% +SingleProcess AUTOTUNE takes 2.5874 seconds +AUTOTUNE convolution(1x32x384x512, 64x32x3x3) + convolution 0.0314 ms 100.0% + triton_convolution_6 0.0966 ms 32.5% + triton_convolution_12 0.0983 ms 31.9% + triton_convolution_7 0.1173 ms 26.8% + triton_convolution_11 0.1196 ms 26.2% + triton_convolution_9 0.1892 ms 16.6% + triton_convolution_10 0.2024 ms 15.5% + triton_convolution_8 0.2647 ms 11.9% +SingleProcess AUTOTUNE takes 3.7895 seconds +AUTOTUNE mm(49152x64, 64x32) + triton_mm_23 0.0135 ms 100.0% + triton_mm_24 0.0136 ms 99.3% + triton_mm_20 0.0137 ms 98.8% + triton_mm_13 0.0137 ms 98.4% + triton_mm_21 0.0140 ms 96.6% + triton_mm_14 0.0144 ms 94.0% + triton_mm_15 0.0144 ms 94.0% + triton_mm_17 0.0150 ms 90.2% + triton_mm_16 0.0155 ms 87.0% + triton_mm_22 0.0156 ms 86.3% +SingleProcess AUTOTUNE takes 3.7509 seconds +AUTOTUNE convolution(1x32x192x256, 64x32x3x3) + convolution 0.0226 ms 100.0% + triton_convolution_25 0.0646 ms 34.9% + triton_convolution_31 0.0661 ms 34.1% + triton_convolution_30 0.0739 ms 30.5% + triton_convolution_26 0.0910 ms 24.8% + triton_convolution_28 0.1099 ms 20.5% + triton_convolution_29 0.1113 ms 20.3% + triton_convolution_27 0.1468 ms 15.4% +SingleProcess AUTOTUNE takes 4.0916 seconds +AUTOTUNE convolution(1x64x192x256, 128x64x3x3) + convolution 0.0217 ms 100.0% + triton_convolution_38 0.0913 ms 23.8% + triton_convolution_32 0.0990 ms 22.0% + triton_convolution_35 0.1156 ms 18.8% + triton_convolution_37 0.1221 ms 17.8% + triton_convolution_36 0.1548 ms 14.0% + triton_convolution_33 0.1598 ms 13.6% + triton_convolution_34 0.2525 ms 8.6% +SingleProcess AUTOTUNE takes 3.9398 seconds +AUTOTUNE mm(12288x128, 128x64) + triton_mm_43 0.0107 ms 100.0% + triton_mm_41 0.0111 ms 96.2% + triton_mm_40 0.0112 ms 95.1% + triton_mm_42 0.0116 ms 92.2% + triton_mm_39 0.0119 ms 89.8% + triton_mm_47 0.0119 ms 89.5% + triton_mm_46 0.0128 ms 83.1% + triton_mm_45 0.0132 ms 80.6% + mm 0.0135 ms 78.7% + triton_mm_44 0.0137 ms 78.0% +SingleProcess AUTOTUNE takes 4.2455 seconds +AUTOTUNE convolution(1x64x96x128, 128x64x3x3) + convolution 0.0202 ms 100.0% + triton_convolution_57 0.0631 ms 32.0% + triton_convolution_54 0.0686 ms 29.4% + triton_convolution_51 0.0754 ms 26.8% + triton_convolution_56 0.0763 ms 26.4% + triton_convolution_55 0.0990 ms 20.4% + triton_convolution_52 0.1081 ms 18.7% + triton_convolution_53 0.2497 ms 8.1% +SingleProcess AUTOTUNE takes 4.0077 seconds +AUTOTUNE convolution(1x128x96x128, 256x128x3x3) + convolution 0.0200 ms 100.0% + triton_convolution_82 0.1523 ms 13.1% + triton_convolution_83 0.1740 ms 11.5% + triton_convolution_81 0.1918 ms 10.4% + triton_convolution_80 0.1987 ms 10.0% + triton_convolution_77 0.2878 ms 6.9% + triton_convolution_78 0.3203 ms 6.2% + triton_convolution_79 0.4956 ms 4.0% +SingleProcess AUTOTUNE takes 4.5199 seconds +AUTOTUNE mm(3072x256, 256x128) + triton_mm_87 0.0103 ms 100.0% + triton_mm_92 0.0104 ms 98.8% + mm 0.0107 ms 95.8% + triton_mm_88 0.0108 ms 95.3% + triton_mm_89 0.0113 ms 90.9% + triton_mm_90 0.0114 ms 89.9% + triton_mm_86 0.0115 ms 89.4% + triton_mm_85 0.0117 ms 87.9% + triton_mm_93 0.0118 ms 87.2% + triton_mm_84 0.0148 ms 69.6% +SingleProcess AUTOTUNE takes 4.8861 seconds +AUTOTUNE convolution(1x128x48x64, 256x128x3x3) + convolution 0.0184 ms 100.0% + triton_convolution_101 0.1032 ms 17.8% + triton_convolution_100 0.1113 ms 16.5% + triton_convolution_99 0.1122 ms 16.4% + triton_convolution_102 0.1220 ms 15.1% + triton_convolution_97 0.2475 ms 7.4% + triton_convolution_96 0.2481 ms 7.4% + triton_convolution_98 0.4992 ms 3.7% +SingleProcess AUTOTUNE takes 4.6222 seconds +AUTOTUNE convolution(1x256x48x64, 512x256x3x3) + convolution 0.0220 ms 100.0% + triton_convolution_241 0.2915 ms 7.5% + triton_convolution_240 0.3545 ms 6.2% + triton_convolution_242 0.3702 ms 5.9% + triton_convolution_239 0.4304 ms 5.1% + triton_convolution_236 0.5869 ms 3.7% + triton_convolution_237 0.6838 ms 3.2% + triton_convolution_238 0.7921 ms 2.8% +SingleProcess AUTOTUNE takes 4.6344 seconds +AUTOTUNE mm(768x512, 512x256) + mm 0.0107 ms 100.0% + triton_mm_249 0.0107 ms 100.0% + triton_mm_251 0.0111 ms 96.3% + triton_mm_248 0.0113 ms 95.2% + triton_mm_252 0.0124 ms 86.3% + triton_mm_246 0.0127 ms 84.4% + triton_mm_247 0.0130 ms 82.3% + triton_mm_245 0.0143 ms 74.9% + triton_mm_244 0.0146 ms 73.6% + triton_mm_243 0.0207 ms 51.7% +SingleProcess AUTOTUNE takes 4.7352 seconds +AUTOTUNE convolution(1x256x24x32, 512x256x3x3) + convolution 0.0197 ms 100.0% + triton_convolution_259 0.1709 ms 11.5% + triton_convolution_260 0.2213 ms 8.9% + triton_convolution_258 0.2428 ms 8.1% + triton_convolution_261 0.3062 ms 6.4% + triton_convolution_255 0.5509 ms 3.6% + triton_convolution_256 0.5803 ms 3.4% + triton_convolution_257 0.8345 ms 2.4% +SingleProcess AUTOTUNE takes 4.5293 seconds +AUTOTUNE convolution(1x512x24x32, 1024x512x3x3) + convolution 0.0296 ms 100.0% + triton_convolution_400 0.5815 ms 5.1% + triton_convolution_401 0.7320 ms 4.0% + triton_convolution_397 0.7438 ms 4.0% + triton_convolution_399 0.7620 ms 3.9% + triton_convolution_398 0.7924 ms 3.7% + triton_convolution_396 1.2406 ms 2.4% + triton_convolution_395 1.3357 ms 2.2% +SingleProcess AUTOTUNE takes 4.8568 seconds +AUTOTUNE mm(192x1024, 1024x512) + mm 0.0127 ms 100.0% + triton_mm_407 0.0139 ms 91.3% + triton_mm_408 0.0141 ms 90.2% + triton_mm_410 0.0156 ms 81.2% + triton_mm_411 0.0159 ms 80.0% + triton_mm_405 0.0178 ms 71.5% + triton_mm_406 0.0182 ms 69.8% + triton_mm_404 0.0217 ms 58.6% + triton_mm_403 0.0225 ms 56.6% + triton_mm_402 0.0355 ms 35.8% +SingleProcess AUTOTUNE takes 5.1834 seconds +AUTOTUNE convolution(1x512x12x16, 1024x512x3x3) + convolution 0.0285 ms 100.0% + triton_convolution_418 0.5090 ms 5.6% + triton_convolution_419 0.5236 ms 5.5% + triton_convolution_416 0.5397 ms 5.3% + triton_convolution_420 0.6704 ms 4.3% + triton_convolution_417 0.6952 ms 4.1% + triton_convolution_415 1.1640 ms 2.5% + triton_convolution_414 1.2940 ms 2.2% +SingleProcess AUTOTUNE takes 4.5735 seconds +AUTOTUNE mm(192x2048, 2048x512) + mm 0.0151 ms 100.0% + triton_mm_514 0.0216 ms 69.8% + triton_mm_515 0.0219 ms 69.0% + triton_mm_518 0.0240 ms 62.9% + triton_mm_517 0.0244 ms 61.9% + triton_mm_513 0.0284 ms 53.2% + triton_mm_512 0.0286 ms 52.7% + triton_mm_511 0.0372 ms 40.5% + triton_mm_510 0.0374 ms 40.4% + triton_mm_509 0.0543 ms 27.8% +SingleProcess AUTOTUNE takes 4.6016 seconds +AUTOTUNE addmm(192x255, 192x1024, 1024x255) + triton_mm_552 0.0160 ms 100.0% + triton_mm_553 0.0160 ms 99.8% + triton_mm_556 0.0163 ms 98.0% + triton_mm_555 0.0182 ms 87.5% + triton_mm_550 0.0230 ms 69.4% + addmm 0.0236 ms 67.6% + triton_mm_551 0.0238 ms 67.2% + triton_mm_549 0.0270 ms 59.1% + triton_mm_548 0.0276 ms 57.8% + triton_mm_547 0.0391 ms 40.8% +SingleProcess AUTOTUNE takes 6.0296 seconds +AUTOTUNE mm(192x512, 512x256) + triton_mm_568 0.0100 ms 100.0% + triton_mm_564 0.0102 ms 97.8% + mm 0.0104 ms 95.7% + triton_mm_565 0.0107 ms 93.0% + triton_mm_567 0.0113 ms 88.2% + triton_mm_562 0.0128 ms 77.9% + triton_mm_563 0.0129 ms 77.0% + triton_mm_561 0.0147 ms 67.6% + triton_mm_560 0.0148 ms 67.3% + triton_mm_559 0.0196 ms 50.7% +SingleProcess AUTOTUNE takes 4.8666 seconds +AUTOTUNE mm(768x768, 768x256) + mm 0.0114 ms 100.0% + triton_mm_576 0.0130 ms 87.7% + triton_mm_577 0.0131 ms 86.8% + triton_mm_579 0.0140 ms 81.7% + triton_mm_580 0.0148 ms 76.9% + triton_mm_574 0.0155 ms 73.6% + triton_mm_575 0.0156 ms 73.0% + triton_mm_573 0.0184 ms 62.0% + triton_mm_572 0.0190 ms 59.8% + triton_mm_571 0.0282 ms 40.5% +SingleProcess AUTOTUNE takes 5.2033 seconds +AUTOTUNE addmm(768x255, 768x512, 512x255) + triton_mm_634 0.0132 ms 100.0% + triton_mm_633 0.0136 ms 97.2% + triton_mm_637 0.0144 ms 92.0% + triton_mm_636 0.0148 ms 89.2% + triton_mm_632 0.0182 ms 72.6% + triton_mm_631 0.0183 ms 72.3% + triton_mm_629 0.0199 ms 66.7% + triton_mm_630 0.0201 ms 65.9% + addmm 0.0238 ms 55.6% + triton_mm_628 0.0244 ms 54.3% +SingleProcess AUTOTUNE takes 6.2084 seconds +AUTOTUNE mm(768x256, 256x128) + triton_mm_649 0.0084 ms 100.0% + triton_mm_645 0.0090 ms 93.6% + triton_mm_646 0.0090 ms 92.9% + triton_mm_648 0.0093 ms 90.0% + triton_mm_644 0.0097 ms 86.2% + mm 0.0098 ms 85.6% + triton_mm_643 0.0102 ms 82.1% + triton_mm_642 0.0111 ms 75.3% + triton_mm_641 0.0112 ms 75.0% + triton_mm_640 0.0132 ms 63.6% +SingleProcess AUTOTUNE takes 4.7710 seconds +AUTOTUNE mm(3072x384, 384x128) + triton_mm_660 0.0111 ms 100.0% + triton_mm_656 0.0116 ms 95.3% + mm 0.0122 ms 91.1% + triton_mm_655 0.0123 ms 90.4% + triton_mm_657 0.0129 ms 86.1% + triton_mm_654 0.0130 ms 85.5% + triton_mm_653 0.0132 ms 83.8% + triton_mm_658 0.0139 ms 80.0% + triton_mm_661 0.0144 ms 76.9% + triton_mm_652 0.0188 ms 58.9% +SingleProcess AUTOTUNE takes 4.6901 seconds +AUTOTUNE addmm(3072x255, 3072x256, 256x255) + triton_mm_712 0.0201 ms 100.0% + triton_mm_717 0.0202 ms 99.2% + triton_mm_713 0.0208 ms 96.6% + triton_mm_711 0.0221 ms 90.9% + triton_mm_710 0.0222 ms 90.3% + triton_mm_715 0.0226 ms 88.7% + triton_mm_714 0.0234 ms 85.8% + triton_mm_709 0.0235 ms 85.3% + triton_mm_718 0.0240 ms 83.5% + triton_mm_716 0.0279 ms 71.8% +SingleProcess AUTOTUNE takes 6.1139 seconds +pass-sqnr-error + +Summary for tag=0.000000: +abs_latency gmean=0.00x mean=0.000x +compilation_latency mean=0.000 seconds +compression_ratio mean=0.000x +eager_peak_mem gmean=0.00x mean=0.000x +dynamo_peak_mem gmean=0.00x mean=0.000x +calls_captured gmean=0.00x mean=0.000x +unique_graphs gmean=0.00x mean=0.000x +graph_breaks gmean=0.00x mean=0.000x +unique_graph_breaks gmean=0.00x mean=0.000x + +Summary for tag=int8dynamic: +abs_latency gmean=4.24x mean=10.510x +compilation_latency mean=34.839 seconds +compression_ratio mean=1.263x +eager_peak_mem gmean=0.38x mean=0.878x +dynamo_peak_mem gmean=0.36x mean=0.844x +calls_captured gmean=233.44x mean=564.988x +unique_graphs gmean=1.86x mean=7.136x +graph_breaks gmean=0.00x mean=5.160x +unique_graph_breaks gmean=0.00x mean=1.333x + +Summary for tag=int8weightonly: +abs_latency gmean=4.51x mean=11.782x +compilation_latency mean=31.136 seconds +compression_ratio mean=1.098x +eager_peak_mem gmean=0.38x mean=0.871x +dynamo_peak_mem gmean=0.46x mean=0.896x +calls_captured gmean=233.16x mean=563.963x +unique_graphs gmean=1.85x mean=7.183x +graph_breaks gmean=0.00x mean=5.220x +unique_graph_breaks gmean=0.00x mean=1.317x + +Summary for tag=int4weightonly: +abs_latency gmean=6.14x mean=33.943x +compilation_latency mean=27.431 seconds +compression_ratio mean=1.140x +eager_peak_mem gmean=0.33x mean=0.696x +dynamo_peak_mem gmean=0.37x mean=0.739x +calls_captured gmean=219.02x mean=494.800x +unique_graphs gmean=1.83x mean=7.125x +graph_breaks gmean=0.00x mean=5.088x +unique_graph_breaks gmean=0.00x mean=1.312x + +Summary for tag=baseline: +abs_latency gmean=4.22x mean=13.273x +compilation_latency mean=36.647 seconds +compression_ratio mean=1.125x +eager_peak_mem gmean=0.42x mean=1.075x +dynamo_peak_mem gmean=0.45x mean=1.120x +calls_captured gmean=240.73x mean=595.060x +unique_graphs gmean=1.89x mean=6.619x +graph_breaks gmean=0.00x mean=5.071x +unique_graph_breaks gmean=0.00x mean=1.333x + +Summary for tag=int8weightonly-bs1: +abs_latency gmean=2.99x mean=8.375x +compilation_latency mean=35.067 seconds +compression_ratio mean=0.937x +eager_peak_mem gmean=0.24x mean=0.786x +dynamo_peak_mem gmean=0.38x mean=0.886x +calls_captured gmean=232.72x mean=567.580x +unique_graphs gmean=1.87x mean=7.259x +graph_breaks gmean=0.00x mean=5.284x +unique_graph_breaks gmean=0.00x mean=1.333x + +Summary for tag=int4weightonly-bs1: +abs_latency gmean=3.59x mean=15.921x +compilation_latency mean=27.963 seconds +compression_ratio mean=0.986x +eager_peak_mem gmean=0.20x mean=0.605x +dynamo_peak_mem gmean=0.30x mean=0.703x +calls_captured gmean=218.43x mean=497.633x +unique_graphs gmean=1.84x mean=7.203x +graph_breaks gmean=0.00x mean=5.152x +unique_graph_breaks gmean=0.00x mean=1.329x + +Summary for tag=baseline-bs1: +abs_latency gmean=2.67x mean=9.869x +compilation_latency mean=37.506 seconds +compression_ratio mean=1.159x +eager_peak_mem gmean=0.27x mean=0.992x +dynamo_peak_mem gmean=0.30x mean=1.025x +calls_captured gmean=240.37x mean=598.928x +unique_graphs gmean=1.90x mean=6.687x +graph_breaks gmean=0.00x mean=5.133x +unique_graph_breaks gmean=0.00x mean=1.349x + +Summary for tag=int8dynamic-bs32: +abs_latency gmean=5.10x mean=53.884x +compilation_latency mean=111.120 seconds +compression_ratio mean=0.887x +eager_peak_mem gmean=0.32x mean=2.090x +dynamo_peak_mem gmean=0.61x mean=1.711x +calls_captured gmean=188.71x mean=645.288x +unique_graphs gmean=1.96x mean=41.712x +graph_breaks gmean=0.00x mean=37.424x +unique_graph_breaks gmean=0.00x mean=0.864x + +Summary for tag=baseline-bs32: +abs_latency gmean=6.00x mean=63.020x +compilation_latency mean=99.524 seconds +compression_ratio mean=0.775x +eager_peak_mem gmean=0.40x mean=2.403x +dynamo_peak_mem gmean=0.83x mean=2.203x +calls_captured gmean=202.02x mean=564.403x +unique_graphs gmean=1.84x mean=8.486x +graph_breaks gmean=0.00x mean=6.167x +unique_graph_breaks gmean=0.00x mean=0.986x + +Summary for tag=infra_error: +abs_latency gmean=0.00x mean=0.000x +compilation_latency mean=0.000 seconds +compression_ratio mean=0.000x +eager_peak_mem gmean=0.00x mean=0.000x +dynamo_peak_mem gmean=0.00x mean=0.000x +calls_captured gmean=0.00x mean=0.000x +unique_graphs gmean=0.00x mean=0.000x +graph_breaks gmean=0.00x mean=0.000x +unique_graph_breaks gmean=0.00x mean=0.000x + +Summary for tag=int8dynamic-bs1-acc: +abs_latency gmean=0.00x mean=451.387x +compilation_latency mean=2.800 seconds +compression_ratio mean=1.320x +eager_peak_mem gmean=0.00x mean=0.333x +dynamo_peak_mem gmean=0.00x mean=0.000x +calls_captured gmean=0.00x mean=0.000x +unique_graphs gmean=0.00x mean=0.000x +graph_breaks gmean=0.00x mean=0.000x +unique_graph_breaks gmean=0.00x mean=0.000x + +Summary for tag=timeout: +abs_latency gmean=0.00x mean=0.000x +compilation_latency mean=0.000 seconds +compression_ratio mean=0.000x +eager_peak_mem gmean=0.00x mean=0.000x +dynamo_peak_mem gmean=0.00x mean=0.000x +calls_captured gmean=0.00x mean=0.000x +unique_graphs gmean=0.00x mean=0.000x +graph_breaks gmean=0.00x mean=0.000x +unique_graph_breaks gmean=0.00x mean=0.000x + loading model: 0it [00:00, ?it/s] loading model: 0it [00:00, ?it/s] +torchrec_dlrm +/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/fbgemm_gpu_py.so: undefined symbol: _ZNK5torch8autograd4Node4nameEv +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 36, in + run() + File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run + benchmark.run(bm_args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run + main(TorchBenchmarkRunner(), original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main + process_entry(0, runner, original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry + return maybe_fresh_cache( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner + return fn(*args, **kwargs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 383, in load_model + module = importlib.import_module(c) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/importlib/__init__.py", line 126, in import_module + return _bootstrap._gcd_import(name[level:], package, level) + File "", line 1050, in _gcd_import + File "", line 1027, in _find_and_load + File "", line 1006, in _find_and_load_unlocked + File "", line 688, in _load_unlocked + File "", line 883, in exec_module + File "", line 241, in _call_with_frames_removed + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/__init__.py", line 7, in + from .data.dlrm_dataloader import get_dataloader + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/data/dlrm_dataloader.py", line 13, in + from torchrec.datasets.criteo import ( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/__init__.py", line 8, in + import torchrec.distributed # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/__init__.py", line 36, in + from torchrec.distributed.model_parallel import DistributedModelParallel # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 21, in + from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/__init__.py", line 22, in + from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/planners.py", line 19, in + from torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/constants.py", line 10, in + from torchrec.distributed.embedding_types import EmbeddingComputeKernel + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/embedding_types.py", line 14, in + from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 23, in + from . import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py", line 19, in + torch.ops.fbgemm.jagged_2d_to_dense, + File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 822, in __getattr__ + raise AttributeError( +AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense' +Run failed with return code: 1 +Output: None +Error: None + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +BERT_pytorch +cuda eval BERT_pytorch int8weightonly-bs1-acc +AUTOTUNE mixed_mm(128x768, 768x768) + triton_mm_9 0.0167 ms 100.0% + triton_mm_5 0.0172 ms 97.4% + triton_mm_6 0.0175 ms 95.8% + triton_mm_8 0.0214 ms 78.2% + triton_mm_4 0.0241 ms 69.5% + triton_mm_3 0.0256 ms 65.4% + triton_mm_2 0.0266 ms 63.0% + triton_mm_1 0.0268 ms 62.5% + triton_mm_0 0.0337 ms 49.7% + triton_mm_10 0.0340 ms 49.3% +SingleProcess AUTOTUNE takes 5.0684 seconds +AUTOTUNE mixed_mm(128x768, 768x3072) + triton_mm_76 0.0227 ms 100.0% + triton_mm_72 0.0242 ms 94.0% + triton_mm_74 0.0249 ms 91.3% + triton_mm_73 0.0254 ms 89.4% + triton_mm_71 0.0255 ms 89.1% + triton_mm_70 0.0270 ms 84.2% + triton_mm_77 0.0271 ms 83.9% + triton_mm_69 0.0271 ms 83.8% + triton_mm_68 0.0352 ms 64.5% + triton_mm_78 0.0382 ms 59.4% +SingleProcess AUTOTUNE takes 5.0346 seconds +AUTOTUNE mixed_mm(128x3072, 3072x768) + triton_mm_88 0.0470 ms 100.0% + triton_mm_85 0.0487 ms 96.6% + triton_mm_84 0.0494 ms 95.2% + triton_mm_87 0.0635 ms 74.1% + triton_mm_83 0.0740 ms 63.6% + fallback_mixed_mm 0.0748 ms 62.8% + triton_mm_82 0.0803 ms 58.6% + triton_mm_80 0.0846 ms 55.6% + triton_mm_81 0.0853 ms 55.1% + triton_mm_79 0.1136 ms 41.4% +SingleProcess AUTOTUNE takes 5.5239 seconds +pass-sqnr-41.682 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +Background_Matting +cuda eval Background_Matting int8weightonly-bs1-acc +pass_due_to_skip + loading model: 0it [00:00, ?it/s]WARNING:common:Model DALLE2_pytorch does not support bfloat16, running with amp instead + loading model: 0it [00:15, ?it/s] +WARNING:common:Model DALLE2_pytorch does not support bfloat16, running with amp instead +DALLE2_pytorch +cuda eval DALLE2_pytorch int8weightonly-bs1-acc +WARNING:common:fp64 golden ref were not generated for DALLE2_pytorch. Setting accuracy check to cosine +WARNING:common:Model DALLE2_pytorch does not support bfloat16, running with amp instead +[2023-12-13 00:29:35,222] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +AUTOTUNE mixed_mm(154x512, 512x512) + triton_mm_21 0.0133 ms 100.0% + triton_mm_17 0.0136 ms 98.1% + triton_mm_18 0.0138 ms 96.8% + triton_mm_20 0.0166 ms 80.2% + triton_mm_15 0.0184 ms 72.5% + triton_mm_13 0.0187 ms 71.3% + triton_mm_14 0.0189 ms 70.4% + triton_mm_16 0.0197 ms 67.6% + triton_mm_22 0.0223 ms 59.7% + triton_mm_12 0.0240 ms 55.7% +SingleProcess AUTOTUNE takes 5.7069 seconds +AUTOTUNE mixed_mm(154x512, 512x2048) + triton_mm_31 0.0170 ms 100.0% + triton_mm_32 0.0175 ms 97.3% + triton_mm_26 0.0181 ms 93.7% + triton_mm_29 0.0182 ms 93.6% + triton_mm_28 0.0188 ms 90.5% + triton_mm_24 0.0195 ms 87.1% + triton_mm_25 0.0198 ms 85.7% + triton_mm_27 0.0200 ms 84.9% + triton_mm_33 0.0247 ms 68.8% + triton_mm_23 0.0252 ms 67.4% +SingleProcess AUTOTUNE takes 5.0024 seconds +AUTOTUNE mixed_mm(154x2048, 2048x512) + triton_mm_39 0.0332 ms 100.0% + triton_mm_40 0.0343 ms 96.8% + triton_mm_43 0.0344 ms 96.7% + triton_mm_42 0.0437 ms 76.1% + triton_mm_37 0.0504 ms 65.9% + triton_mm_35 0.0539 ms 61.7% + triton_mm_36 0.0548 ms 60.7% + triton_mm_38 0.0580 ms 57.4% + triton_mm_44 0.0706 ms 47.1% + fallback_mixed_mm 0.0723 ms 46.0% +SingleProcess AUTOTUNE takes 5.8073 seconds +[2023-12-13 00:29:52,179] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 00:29:52,449] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 00:29:52,717] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 00:29:52,988] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 00:29:53,264] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 00:29:53,533] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 00:29:53,802] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 00:29:54,080] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 00:29:54,350] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 00:29:54,620] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 00:29:54,886] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +AUTOTUNE mixed_mm(2x512, 512x1024) + triton_mm_557 0.0117 ms 100.0% + triton_mm_560 0.0127 ms 92.4% + triton_mm_558 0.0132 ms 88.4% + triton_mm_556 0.0136 ms 86.3% + triton_mm_561 0.0140 ms 83.8% + triton_mm_554 0.0169 ms 69.3% + triton_mm_555 0.0171 ms 68.5% + triton_mm_553 0.0174 ms 67.4% + triton_mm_562 0.0228 ms 51.3% + triton_mm_552 0.0252 ms 46.6% +SingleProcess AUTOTUNE takes 4.2160 seconds +AUTOTUNE mixed_mm(2x1024, 1024x1024) + triton_mm_568 0.0159 ms 100.0% + triton_mm_571 0.0187 ms 85.1% + triton_mm_567 0.0197 ms 80.8% + triton_mm_569 0.0207 ms 77.0% + triton_mm_572 0.0218 ms 73.0% + triton_mm_565 0.0265 ms 60.2% + triton_mm_564 0.0273 ms 58.4% + triton_mm_566 0.0281 ms 56.7% + triton_mm_573 0.0403 ms 39.5% + triton_mm_563 0.0436 ms 36.5% +SingleProcess AUTOTUNE takes 3.9697 seconds +AUTOTUNE mixed_mm(2x1024, 1024x512) + triton_mm_579 0.0155 ms 100.0% + triton_mm_582 0.0176 ms 88.0% + triton_mm_578 0.0200 ms 77.6% + triton_mm_580 0.0205 ms 75.5% + triton_mm_583 0.0210 ms 73.8% + triton_mm_576 0.0252 ms 61.7% + triton_mm_575 0.0266 ms 58.4% + triton_mm_577 0.0280 ms 55.4% + triton_mm_584 0.0385 ms 40.3% + triton_mm_574 0.0414 ms 37.5% +SingleProcess AUTOTUNE takes 3.9197 seconds +AUTOTUNE mixed_mm(520x512, 512x128) + triton_mm_594 0.0137 ms 100.0% + triton_mm_590 0.0138 ms 99.7% + triton_mm_591 0.0140 ms 97.9% + triton_mm_593 0.0161 ms 85.4% + triton_mm_588 0.0185 ms 74.4% + triton_mm_586 0.0194 ms 70.9% + triton_mm_587 0.0195 ms 70.5% + triton_mm_589 0.0197 ms 69.6% + triton_mm_595 0.0226 ms 60.7% + triton_mm_585 0.0247 ms 55.6% +SingleProcess AUTOTUNE takes 5.2099 seconds +AUTOTUNE mixed_mm(520x512, 512x512) + triton_mm_604 0.0164 ms 100.0% + triton_mm_605 0.0172 ms 95.7% + triton_mm_601 0.0181 ms 90.7% + triton_mm_599 0.0185 ms 88.9% + triton_mm_602 0.0187 ms 88.0% + triton_mm_597 0.0187 ms 87.9% + triton_mm_598 0.0195 ms 84.4% + triton_mm_600 0.0204 ms 80.8% + triton_mm_606 0.0236 ms 69.8% + triton_mm_596 0.0241 ms 68.3% +SingleProcess AUTOTUNE takes 4.8879 seconds +AUTOTUNE mixed_mm(520x512, 512x4096) + triton_mm_644 0.0361 ms 100.0% + triton_mm_643 0.0361 ms 99.9% + triton_mm_646 0.0379 ms 95.2% + triton_mm_645 0.0380 ms 95.0% + triton_mm_642 0.0448 ms 80.5% + triton_mm_650 0.0603 ms 59.8% + triton_mm_652 0.0604 ms 59.7% + triton_mm_647 0.0626 ms 57.6% + triton_mm_648 0.0636 ms 56.7% + triton_mm_651 0.0676 ms 53.4% +SingleProcess AUTOTUNE takes 4.8283 seconds +AUTOTUNE mixed_mm(520x2048, 2048x512) + triton_mm_662 0.0444 ms 100.0% + triton_mm_661 0.0448 ms 99.2% + triton_mm_658 0.0499 ms 89.0% + triton_mm_656 0.0501 ms 88.7% + triton_mm_659 0.0514 ms 86.5% + triton_mm_654 0.0547 ms 81.3% + triton_mm_655 0.0571 ms 77.9% + triton_mm_657 0.0582 ms 76.4% + fallback_mixed_mm 0.0634 ms 70.1% + triton_mm_663 0.0719 ms 61.8% +SingleProcess AUTOTUNE takes 4.9408 seconds +skipping cudagraphs due to ['non-cuda device in graph'] +[2023-12-13 00:30:58,308] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +AUTOTUNE mixed_mm(77x512, 512x512) + triton_mm_1095 0.0132 ms 100.0% + triton_mm_1096 0.0132 ms 99.8% + triton_mm_1099 0.0135 ms 97.6% + triton_mm_1098 0.0164 ms 80.8% + triton_mm_1093 0.0178 ms 74.1% + triton_mm_1091 0.0193 ms 68.6% + triton_mm_1092 0.0198 ms 66.8% + triton_mm_1094 0.0201 ms 65.8% + triton_mm_1100 0.0226 ms 58.6% + triton_mm_1090 0.0236 ms 55.9% +SingleProcess AUTOTUNE takes 4.8134 seconds +AUTOTUNE mixed_mm(77x512, 512x2048) + triton_mm_1107 0.0143 ms 100.0% + triton_mm_1110 0.0148 ms 96.1% + triton_mm_1109 0.0171 ms 83.4% + triton_mm_1104 0.0181 ms 78.7% + triton_mm_1106 0.0188 ms 75.9% + triton_mm_1102 0.0192 ms 74.5% + triton_mm_1103 0.0196 ms 72.8% + triton_mm_1105 0.0198 ms 72.1% + triton_mm_1111 0.0242 ms 59.1% + triton_mm_1101 0.0249 ms 57.3% +SingleProcess AUTOTUNE takes 5.0907 seconds +AUTOTUNE mixed_mm(77x2048, 2048x512) + triton_mm_1117 0.0325 ms 100.0% + triton_mm_1121 0.0334 ms 97.2% + triton_mm_1118 0.0337 ms 96.3% + triton_mm_1120 0.0448 ms 72.5% + triton_mm_1115 0.0499 ms 65.1% + triton_mm_1113 0.0545 ms 59.6% + triton_mm_1114 0.0567 ms 57.2% + triton_mm_1116 0.0578 ms 56.2% + triton_mm_1122 0.0698 ms 46.5% + triton_mm_1112 0.0752 ms 43.2% +SingleProcess AUTOTUNE takes 5.0308 seconds +[2023-12-13 00:31:13,603] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 00:31:13,860] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 00:31:14,123] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 00:31:14,382] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 00:31:14,642] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 00:31:14,901] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 00:31:15,163] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 00:31:15,426] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 00:31:15,690] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 00:31:15,953] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 00:31:16,218] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +skipping cudagraphs due to ['non-cuda device in graph'] +AUTOTUNE mixed_mm(1x128, 128x512) + triton_mm_1639 0.0077 ms 100.0% + triton_mm_1636 0.0080 ms 97.2% + triton_mm_1634 0.0082 ms 94.5% + triton_mm_1632 0.0085 ms 91.3% + triton_mm_1631 0.0086 ms 90.0% + triton_mm_1635 0.0088 ms 88.3% + triton_mm_1638 0.0089 ms 87.1% + triton_mm_1640 0.0098 ms 79.3% + triton_mm_1633 0.0098 ms 79.1% + triton_mm_1630 0.0100 ms 77.6% +SingleProcess AUTOTUNE takes 4.8986 seconds +AUTOTUNE mixed_mm(1x512, 512x256) + triton_mm_1646 0.0114 ms 100.0% + triton_mm_1650 0.0127 ms 89.9% + triton_mm_1649 0.0127 ms 89.7% + triton_mm_1645 0.0130 ms 87.9% + triton_mm_1647 0.0130 ms 87.9% + triton_mm_1643 0.0151 ms 75.6% + triton_mm_1642 0.0153 ms 74.3% + triton_mm_1644 0.0177 ms 64.4% + triton_mm_1651 0.0210 ms 54.2% + triton_mm_1641 0.0222 ms 51.3% +SingleProcess AUTOTUNE takes 3.9064 seconds +AUTOTUNE mixed_mm(1x512, 512x512) + triton_mm_1657 0.0122 ms 100.0% + triton_mm_1660 0.0130 ms 94.1% + triton_mm_1656 0.0132 ms 92.3% + triton_mm_1658 0.0137 ms 89.2% + triton_mm_1661 0.0138 ms 88.6% + triton_mm_1654 0.0156 ms 78.1% + triton_mm_1653 0.0159 ms 76.8% + triton_mm_1655 0.0178 ms 68.4% + triton_mm_1662 0.0224 ms 54.4% + triton_mm_1652 0.0232 ms 52.6% +SingleProcess AUTOTUNE takes 3.9448 seconds +AUTOTUNE mixed_mm(77x512, 512x128) + triton_mm_1672 0.0126 ms 100.0% + triton_mm_1669 0.0132 ms 96.1% + triton_mm_1668 0.0135 ms 93.6% + triton_mm_1671 0.0163 ms 77.8% + triton_mm_1666 0.0176 ms 71.8% + triton_mm_1664 0.0184 ms 68.7% + triton_mm_1667 0.0194 ms 65.1% + triton_mm_1665 0.0196 ms 64.3% + triton_mm_1673 0.0218 ms 58.1% + triton_mm_1663 0.0230 ms 54.9% +SingleProcess AUTOTUNE takes 5.0118 seconds +AUTOTUNE mixed_mm(6x128, 128x1024) + triton_mm_1849 0.0077 ms 100.0% + triton_mm_1850 0.0085 ms 90.6% + triton_mm_1853 0.0085 ms 90.6% + triton_mm_1848 0.0087 ms 89.1% + triton_mm_1852 0.0089 ms 86.7% + triton_mm_1845 0.0090 ms 85.8% + triton_mm_1846 0.0092 ms 84.0% + triton_mm_1847 0.0097 ms 79.6% + triton_mm_1854 0.0099 ms 78.6% + triton_mm_1844 0.0101 ms 76.8% +SingleProcess AUTOTUNE takes 4.1647 seconds +AUTOTUNE mixed_mm(4096x128, 128x512) + triton_mm_1857 0.0154 ms 100.0% + triton_mm_1855 0.0155 ms 99.3% + triton_mm_1856 0.0159 ms 96.6% + triton_mm_1859 0.0162 ms 94.9% + triton_mm_1858 0.0164 ms 93.8% + triton_mm_1864 0.0208 ms 74.0% + triton_mm_1865 0.0208 ms 74.0% + triton_mm_1863 0.0213 ms 72.1% + triton_mm_1860 0.0229 ms 67.3% + triton_mm_1861 0.0231 ms 66.6% +SingleProcess AUTOTUNE takes 4.9801 seconds +AUTOTUNE mixed_mm(4096x512, 512x128) + triton_mm_1892 0.0195 ms 100.0% + triton_mm_1890 0.0201 ms 96.7% + triton_mm_1893 0.0206 ms 94.3% + triton_mm_1891 0.0211 ms 92.1% + triton_mm_1898 0.0228 ms 85.2% + triton_mm_1894 0.0234 ms 83.2% + triton_mm_1895 0.0240 ms 81.2% + triton_mm_1897 0.0253 ms 77.0% + triton_mm_1899 0.0270 ms 72.0% + triton_mm_1889 0.0280 ms 69.5% +SingleProcess AUTOTUNE takes 4.9455 seconds +AUTOTUNE mixed_mm(1024x256, 256x512) + triton_mm_2057 0.0127 ms 100.0% + triton_mm_2055 0.0132 ms 96.4% + triton_mm_2056 0.0140 ms 90.8% + triton_mm_2058 0.0142 ms 89.2% + triton_mm_2063 0.0154 ms 82.7% + triton_mm_2054 0.0156 ms 81.5% + triton_mm_2059 0.0157 ms 80.8% + triton_mm_2060 0.0158 ms 80.5% + triton_mm_2064 0.0162 ms 78.5% + triton_mm_2062 0.0168 ms 75.5% +SingleProcess AUTOTUNE takes 4.8803 seconds +AUTOTUNE mixed_mm(1024x512, 512x256) + triton_mm_2096 0.0169 ms 100.0% + triton_mm_2097 0.0177 ms 95.3% + triton_mm_2094 0.0182 ms 92.8% + triton_mm_2091 0.0187 ms 90.6% + triton_mm_2093 0.0187 ms 90.6% + triton_mm_2089 0.0191 ms 88.3% + triton_mm_2090 0.0197 ms 85.9% + triton_mm_2092 0.0199 ms 84.8% + triton_mm_2098 0.0239 ms 70.6% + triton_mm_2088 0.0244 ms 69.3% +SingleProcess AUTOTUNE takes 4.8255 seconds +AUTOTUNE mixed_mm(1x512, 512x1024) + triton_mm_2211 0.0130 ms 100.0% + triton_mm_2210 0.0137 ms 94.6% + triton_mm_2212 0.0137 ms 94.4% + triton_mm_2214 0.0140 ms 92.5% + triton_mm_2215 0.0151 ms 85.8% + triton_mm_2208 0.0158 ms 82.0% + triton_mm_2207 0.0164 ms 79.3% + triton_mm_2209 0.0173 ms 74.7% + triton_mm_2216 0.0226 ms 57.4% + triton_mm_2206 0.0236 ms 54.9% +SingleProcess AUTOTUNE takes 4.3536 seconds +AUTOTUNE mixed_mm(256x512, 512x512) + triton_mm_2259 0.0135 ms 100.0% + triton_mm_2258 0.0138 ms 97.9% + triton_mm_2262 0.0150 ms 90.3% + triton_mm_2261 0.0167 ms 81.0% + triton_mm_2256 0.0185 ms 73.1% + triton_mm_2254 0.0187 ms 72.1% + triton_mm_2255 0.0193 ms 69.9% + triton_mm_2257 0.0203 ms 66.6% + triton_mm_2263 0.0226 ms 59.7% + triton_mm_2253 0.0237 ms 57.0% +SingleProcess AUTOTUNE takes 5.1969 seconds +AUTOTUNE mixed_mm(1x512, 512x2048) + triton_mm_2409 0.0132 ms 100.0% + triton_mm_2411 0.0137 ms 96.2% + triton_mm_2413 0.0142 ms 93.0% + triton_mm_2407 0.0163 ms 81.0% + triton_mm_2410 0.0166 ms 79.6% + triton_mm_2406 0.0169 ms 78.2% + triton_mm_2408 0.0174 ms 76.1% + triton_mm_2414 0.0197 ms 67.2% + triton_mm_2415 0.0237 ms 55.8% + triton_mm_2405 0.0250 ms 52.9% +SingleProcess AUTOTUNE takes 4.5985 seconds +AUTOTUNE mixed_mm(262x128, 128x1024) + triton_mm_2422 0.0096 ms 100.0% + triton_mm_2417 0.0098 ms 98.0% + triton_mm_2419 0.0098 ms 98.0% + triton_mm_2424 0.0098 ms 98.0% + triton_mm_2425 0.0101 ms 95.3% + triton_mm_2418 0.0101 ms 95.0% + triton_mm_2421 0.0105 ms 91.5% + triton_mm_2426 0.0107 ms 90.4% + triton_mm_2416 0.0107 ms 89.9% + triton_mm_2420 0.0108 ms 89.6% +SingleProcess AUTOTUNE takes 4.9473 seconds +AUTOTUNE mixed_mm(256x1024, 1024x512) + triton_mm_2432 0.0197 ms 100.0% + triton_mm_2433 0.0209 ms 94.3% + triton_mm_2436 0.0219 ms 89.9% + triton_mm_2435 0.0252 ms 78.4% + triton_mm_2430 0.0291 ms 67.8% + triton_mm_2428 0.0309 ms 63.8% + triton_mm_2429 0.0311 ms 63.3% + triton_mm_2431 0.0324 ms 60.7% + triton_mm_2437 0.0382 ms 51.6% + triton_mm_2427 0.0413 ms 47.8% +SingleProcess AUTOTUNE takes 4.9364 seconds +AUTOTUNE mixed_mm(256x512, 512x1024) + triton_mm_2470 0.0164 ms 100.0% + triton_mm_2471 0.0170 ms 96.2% + triton_mm_2467 0.0180 ms 90.8% + triton_mm_2465 0.0184 ms 88.9% + triton_mm_2468 0.0187 ms 87.4% + triton_mm_2463 0.0189 ms 86.6% + triton_mm_2464 0.0190 ms 86.2% + triton_mm_2466 0.0204 ms 80.0% + triton_mm_2472 0.0236 ms 69.4% + triton_mm_2462 0.0240 ms 68.2% +SingleProcess AUTOTUNE takes 4.9202 seconds +AUTOTUNE mixed_mm(256x1024, 1024x64) + triton_mm_2489 0.0187 ms 100.0% + triton_mm_2485 0.0194 ms 96.2% + triton_mm_2486 0.0204 ms 91.4% + triton_mm_2481 0.0246 ms 75.9% + triton_mm_2488 0.0246 ms 75.9% + triton_mm_2483 0.0252 ms 74.0% + triton_mm_2482 0.0306 ms 61.0% + triton_mm_2484 0.0327 ms 57.1% + triton_mm_2490 0.0387 ms 48.3% + triton_mm_2480 0.0416 ms 45.0% +SingleProcess AUTOTUNE takes 4.7209 seconds +AUTOTUNE mixed_mm(1024x512, 512x512) + triton_mm_2942 0.0183 ms 100.0% + triton_mm_2940 0.0192 ms 95.2% + triton_mm_2941 0.0203 ms 90.1% + triton_mm_2943 0.0205 ms 88.9% + triton_mm_2948 0.0227 ms 80.4% + triton_mm_2945 0.0231 ms 79.0% + triton_mm_2944 0.0233 ms 78.5% + triton_mm_2939 0.0247 ms 73.9% + triton_mm_2949 0.0252 ms 72.5% + triton_mm_2947 0.0256 ms 71.4% +SingleProcess AUTOTUNE takes 4.8825 seconds +AUTOTUNE mixed_mm(4096x256, 256x512) + triton_mm_3230 0.0222 ms 100.0% + triton_mm_3231 0.0230 ms 96.7% + triton_mm_3232 0.0230 ms 96.4% + triton_mm_3234 0.0235 ms 94.6% + triton_mm_3233 0.0243 ms 91.3% + triton_mm_3238 0.0311 ms 71.4% + triton_mm_3240 0.0338 ms 65.7% + triton_mm_3235 0.0352 ms 63.1% + triton_mm_3236 0.0352 ms 63.0% + triton_mm_3237 0.0376 ms 59.1% +SingleProcess AUTOTUNE takes 4.9502 seconds +AUTOTUNE mixed_mm(4096x512, 512x256) + triton_mm_3265 0.0268 ms 100.0% + triton_mm_3266 0.0271 ms 98.9% + triton_mm_3267 0.0279 ms 96.3% + triton_mm_3264 0.0283 ms 94.8% + triton_mm_3268 0.0288 ms 93.1% + triton_mm_3269 0.0334 ms 80.3% + triton_mm_3270 0.0339 ms 79.2% + triton_mm_3272 0.0346 ms 77.5% + triton_mm_3274 0.0347 ms 77.3% + triton_mm_3273 0.0376 ms 71.4% +SingleProcess AUTOTUNE takes 5.5918 seconds +AUTOTUNE mixed_mm(1x16, 16x64) + triton_mm_3652 0.0062 ms 100.0% + triton_mm_3653 0.0062 ms 100.0% + triton_mm_3654 0.0062 ms 100.0% + triton_mm_3655 0.0062 ms 100.0% + triton_mm_3656 0.0062 ms 100.0% + triton_mm_3658 0.0062 ms 100.0% + triton_mm_3657 0.0068 ms 91.5% + triton_mm_3659 0.0068 ms 90.6% + triton_mm_3660 0.0069 ms 89.4% + fallback_mixed_mm 0.0650 ms 9.5% +SingleProcess AUTOTUNE takes 3.1516 seconds +AUTOTUNE mixed_mm(1x64, 64x256) + triton_mm_3670 0.0067 ms 100.0% + triton_mm_3666 0.0070 ms 95.9% + triton_mm_3669 0.0070 ms 95.9% + triton_mm_3663 0.0078 ms 86.0% + triton_mm_3665 0.0078 ms 86.0% + triton_mm_3667 0.0078 ms 86.0% + triton_mm_3664 0.0078 ms 85.7% + triton_mm_3662 0.0078 ms 85.5% + triton_mm_3671 0.0080 ms 83.3% + triton_mm_3661 0.0083 ms 80.7% +SingleProcess AUTOTUNE takes 3.7745 seconds +AUTOTUNE mixed_mm(1x64, 64x64) + triton_mm_3676 0.0070 ms 100.0% + triton_mm_3679 0.0070 ms 99.5% + triton_mm_3680 0.0072 ms 96.5% + triton_mm_3675 0.0074 ms 94.4% + triton_mm_3673 0.0074 ms 94.4% + triton_mm_3678 0.0075 ms 93.2% + triton_mm_3674 0.0075 ms 92.8% + triton_mm_3672 0.0080 ms 86.9% + triton_mm_3677 0.0080 ms 86.9% + fallback_mixed_mm 0.0662 ms 10.5% +SingleProcess AUTOTUNE takes 3.2627 seconds +AUTOTUNE mixed_mm(1x64, 64x32) + triton_mm_3686 0.0062 ms 100.0% + triton_mm_3682 0.0064 ms 96.0% + triton_mm_3685 0.0069 ms 89.8% + triton_mm_3683 0.0070 ms 87.7% + triton_mm_3684 0.0072 ms 85.4% + triton_mm_3681 0.0075 ms 82.1% + triton_mm_3687 0.0075 ms 82.1% + fallback_mixed_mm 0.0648 ms 9.5% +SingleProcess AUTOTUNE takes 2.2571 seconds +AUTOTUNE mixed_mm(2x128, 128x1024) + triton_mm_3792 0.0080 ms 100.0% + triton_mm_3797 0.0080 ms 100.0% + triton_mm_3793 0.0083 ms 96.5% + triton_mm_3794 0.0085 ms 94.0% + triton_mm_3796 0.0085 ms 94.0% + triton_mm_3790 0.0088 ms 91.3% + triton_mm_3789 0.0091 ms 88.7% + triton_mm_3791 0.0099 ms 81.5% + triton_mm_3798 0.0104 ms 77.5% + triton_mm_3788 0.0107 ms 75.4% +SingleProcess AUTOTUNE takes 4.1222 seconds +AUTOTUNE mixed_mm(16384x16, 16x512) + triton_mm_3799 0.0189 ms 100.0% + triton_mm_3800 0.0189 ms 99.7% + triton_mm_3801 0.0192 ms 98.3% + triton_mm_3802 0.0198 ms 95.4% + triton_mm_3806 0.0201 ms 94.1% + triton_mm_3803 0.0208 ms 90.9% + triton_mm_3807 0.0210 ms 89.9% + triton_mm_3809 0.0241 ms 78.2% + triton_mm_3805 0.0252 ms 75.0% + triton_mm_3804 0.0255 ms 74.0% +SingleProcess AUTOTUNE takes 4.1652 seconds +AUTOTUNE mixed_mm(16384x512, 512x16) + triton_mm_3842 0.0252 ms 100.0% + triton_mm_3839 0.0260 ms 97.0% + triton_mm_3841 0.0260 ms 96.7% + triton_mm_3834 0.0268 ms 94.1% + triton_mm_3836 0.0271 ms 92.8% + triton_mm_3835 0.0273 ms 92.4% + triton_mm_3838 0.0285 ms 88.2% + triton_mm_3837 0.0288 ms 87.5% + triton_mm_3833 0.0322 ms 78.2% + triton_mm_3843 0.0337 ms 74.8% +SingleProcess AUTOTUNE takes 4.0097 seconds +AUTOTUNE mixed_mm(4096x32, 32x512) + triton_mm_3984 0.0101 ms 100.0% + triton_mm_3985 0.0105 ms 96.3% + triton_mm_3988 0.0106 ms 95.5% + triton_mm_3986 0.0107 ms 94.0% + triton_mm_3987 0.0110 ms 91.3% + triton_mm_3992 0.0111 ms 91.0% + triton_mm_3990 0.0117 ms 86.1% + triton_mm_3994 0.0120 ms 83.9% + triton_mm_3993 0.0121 ms 83.6% + triton_mm_3989 0.0121 ms 83.0% +SingleProcess AUTOTUNE takes 4.5156 seconds +AUTOTUNE mixed_mm(4096x512, 512x32) + triton_mm_4024 0.0130 ms 100.0% + triton_mm_4026 0.0135 ms 96.2% + triton_mm_4021 0.0143 ms 91.1% + triton_mm_4023 0.0143 ms 91.1% + triton_mm_4027 0.0150 ms 86.6% + triton_mm_4019 0.0165 ms 78.7% + triton_mm_4022 0.0179 ms 72.9% + triton_mm_4020 0.0181 ms 71.8% + triton_mm_4018 0.0238 ms 54.6% + triton_mm_4028 0.0253 ms 51.5% +SingleProcess AUTOTUNE takes 4.0877 seconds +AUTOTUNE mixed_mm(1x64, 64x128) + triton_mm_4124 0.0066 ms 100.0% + triton_mm_4120 0.0068 ms 96.7% + triton_mm_4119 0.0070 ms 95.0% + triton_mm_4123 0.0073 ms 90.4% + triton_mm_4118 0.0075 ms 88.5% + triton_mm_4116 0.0077 ms 85.9% + triton_mm_4115 0.0077 ms 85.5% + triton_mm_4117 0.0077 ms 85.5% + triton_mm_4121 0.0077 ms 85.5% + triton_mm_4122 0.0080 ms 82.5% +SingleProcess AUTOTUNE takes 3.6973 seconds +AUTOTUNE mixed_mm(1024x64, 64x512) + triton_mm_4183 0.0086 ms 100.0% + triton_mm_4185 0.0088 ms 97.5% + triton_mm_4182 0.0091 ms 94.2% + triton_mm_4191 0.0092 ms 93.4% + triton_mm_4184 0.0093 ms 92.1% + triton_mm_4189 0.0094 ms 91.5% + triton_mm_4181 0.0094 ms 91.2% + triton_mm_4190 0.0096 ms 90.1% + triton_mm_4187 0.0100 ms 86.1% + triton_mm_4186 0.0102 ms 84.6% +SingleProcess AUTOTUNE takes 4.9533 seconds +AUTOTUNE mixed_mm(1024x512, 512x64) + triton_mm_4220 0.0130 ms 100.0% + triton_mm_4224 0.0130 ms 100.0% + triton_mm_4221 0.0134 ms 96.9% + triton_mm_4223 0.0158 ms 82.4% + triton_mm_4218 0.0161 ms 81.1% + triton_mm_4216 0.0167 ms 78.2% + triton_mm_4217 0.0197 ms 66.2% + triton_mm_4219 0.0197 ms 66.1% + triton_mm_4225 0.0233 ms 55.9% + triton_mm_4215 0.0250 ms 52.2% +SingleProcess AUTOTUNE takes 4.4978 seconds +AUTOTUNE mixed_mm(256x128, 128x512) + triton_mm_4385 0.0086 ms 100.0% + triton_mm_4386 0.0088 ms 97.5% + triton_mm_4389 0.0091 ms 94.7% + triton_mm_4381 0.0098 ms 87.6% + triton_mm_4383 0.0098 ms 87.6% + triton_mm_4390 0.0099 ms 86.5% + triton_mm_4388 0.0101 ms 85.1% + triton_mm_4384 0.0103 ms 83.2% + triton_mm_4380 0.0104 ms 82.7% + triton_mm_4382 0.0106 ms 81.1% +SingleProcess AUTOTUNE takes 5.2370 seconds +AUTOTUNE mixed_mm(256x512, 512x128) + triton_mm_4423 0.0129 ms 100.0% + triton_mm_4420 0.0134 ms 96.4% + triton_mm_4419 0.0137 ms 94.4% + triton_mm_4422 0.0166 ms 78.0% + triton_mm_4417 0.0179 ms 72.4% + triton_mm_4415 0.0192 ms 67.4% + triton_mm_4416 0.0192 ms 67.4% + triton_mm_4418 0.0197 ms 65.7% + triton_mm_4424 0.0223 ms 58.0% + triton_mm_4414 0.0243 ms 53.2% +SingleProcess AUTOTUNE takes 4.7911 seconds +AUTOTUNE mixed_mm(1x64, 64x512) + triton_mm_4521 0.0070 ms 100.0% + triton_mm_4522 0.0073 ms 96.0% + triton_mm_4515 0.0073 ms 95.2% + triton_mm_4517 0.0078 ms 89.7% + triton_mm_4518 0.0078 ms 89.7% + triton_mm_4519 0.0078 ms 89.7% + triton_mm_4513 0.0080 ms 87.2% + triton_mm_4516 0.0080 ms 87.2% + triton_mm_4514 0.0080 ms 86.9% + triton_mm_4523 0.0083 ms 83.8% +SingleProcess AUTOTUNE takes 4.0460 seconds +AUTOTUNE mixed_mm(256x256, 256x512) + triton_mm_4560 0.0100 ms 100.0% + triton_mm_4559 0.0106 ms 94.6% + triton_mm_4563 0.0108 ms 92.9% + triton_mm_4562 0.0124 ms 80.9% + triton_mm_4557 0.0132 ms 76.2% + triton_mm_4555 0.0135 ms 74.6% + triton_mm_4556 0.0135 ms 74.4% + triton_mm_4558 0.0135 ms 74.4% + triton_mm_4564 0.0143 ms 70.1% + triton_mm_4554 0.0154 ms 65.4% +SingleProcess AUTOTUNE takes 4.8456 seconds +AUTOTUNE mixed_mm(256x512, 512x256) + triton_mm_4593 0.0130 ms 100.0% + triton_mm_4597 0.0130 ms 100.0% + triton_mm_4594 0.0140 ms 93.1% + triton_mm_4596 0.0161 ms 80.7% + triton_mm_4591 0.0179 ms 72.5% + triton_mm_4589 0.0187 ms 69.6% + triton_mm_4590 0.0193 ms 67.4% + triton_mm_4592 0.0203 ms 64.1% + triton_mm_4598 0.0227 ms 57.3% + triton_mm_4588 0.0240 ms 54.2% +SingleProcess AUTOTUNE takes 5.2245 seconds +AUTOTUNE mixed_mm(256x256, 256x64) + triton_mm_4615 0.0098 ms 100.0% + triton_mm_4611 0.0098 ms 99.5% + triton_mm_4612 0.0098 ms 99.5% + triton_mm_4609 0.0114 ms 85.8% + triton_mm_4614 0.0114 ms 85.8% + triton_mm_4607 0.0115 ms 85.3% + triton_mm_4608 0.0135 ms 72.4% + triton_mm_4610 0.0138 ms 71.0% + triton_mm_4616 0.0143 ms 68.5% + triton_mm_4606 0.0148 ms 66.1% +SingleProcess AUTOTUNE takes 4.4853 seconds +AUTOTUNE mixed_mm(1024x128, 128x512) + triton_mm_5065 0.0101 ms 100.0% + triton_mm_5066 0.0102 ms 98.7% + triton_mm_5067 0.0106 ms 94.9% + triton_mm_5068 0.0110 ms 91.6% + triton_mm_5064 0.0111 ms 90.5% + triton_mm_5074 0.0112 ms 89.7% + triton_mm_5073 0.0114 ms 88.5% + triton_mm_5070 0.0115 ms 88.0% + triton_mm_5069 0.0122 ms 82.6% + triton_mm_5072 0.0129 ms 78.2% +SingleProcess AUTOTUNE takes 5.3335 seconds +AUTOTUNE mixed_mm(1024x512, 512x128) + triton_mm_5103 0.0132 ms 100.0% + triton_mm_5104 0.0136 ms 97.4% + triton_mm_5107 0.0151 ms 87.8% + triton_mm_5106 0.0167 ms 79.5% + triton_mm_5101 0.0181 ms 73.3% + triton_mm_5099 0.0197 ms 67.3% + triton_mm_5102 0.0199 ms 66.7% + triton_mm_5100 0.0202 ms 65.6% + triton_mm_5108 0.0231 ms 57.3% + triton_mm_5098 0.0246 ms 53.8% +SingleProcess AUTOTUNE takes 4.8769 seconds +AUTOTUNE mixed_mm(4096x64, 64x512) + triton_mm_5355 0.0120 ms 100.0% + triton_mm_5356 0.0121 ms 99.7% + triton_mm_5357 0.0127 ms 94.5% + triton_mm_5359 0.0131 ms 92.2% + triton_mm_5358 0.0132 ms 91.5% + triton_mm_5363 0.0132 ms 91.0% + triton_mm_5365 0.0155 ms 77.8% + triton_mm_5364 0.0156 ms 76.9% + triton_mm_5362 0.0167 ms 72.2% + triton_mm_5361 0.0174 ms 69.2% +SingleProcess AUTOTUNE takes 5.0763 seconds +AUTOTUNE mixed_mm(4096x512, 512x64) + triton_mm_5398 0.0165 ms 100.0% + triton_mm_5397 0.0167 ms 99.2% + triton_mm_5392 0.0173 ms 95.4% + triton_mm_5390 0.0179 ms 92.7% + triton_mm_5394 0.0187 ms 88.5% + triton_mm_5395 0.0188 ms 88.0% + triton_mm_5391 0.0204 ms 80.9% + triton_mm_5393 0.0208 ms 79.5% + triton_mm_5389 0.0254 ms 65.0% + triton_mm_5399 0.0255 ms 64.9% +SingleProcess AUTOTUNE takes 4.8824 seconds +AUTOTUNE mixed_mm(16384x32, 32x512) + triton_mm_5645 0.0193 ms 100.0% + triton_mm_5646 0.0194 ms 99.3% + triton_mm_5644 0.0194 ms 99.2% + triton_mm_5648 0.0199 ms 96.8% + triton_mm_5647 0.0201 ms 95.9% + triton_mm_5651 0.0217 ms 88.9% + triton_mm_5652 0.0219 ms 87.9% + triton_mm_5650 0.0244 ms 78.9% + triton_mm_5649 0.0255 ms 75.4% + triton_mm_5654 0.0262 ms 73.4% +SingleProcess AUTOTUNE takes 4.5230 seconds +AUTOTUNE mixed_mm(16384x512, 512x32) + triton_mm_5687 0.0253 ms 100.0% + triton_mm_5686 0.0262 ms 96.3% + triton_mm_5684 0.0263 ms 96.0% + triton_mm_5680 0.0268 ms 94.5% + triton_mm_5679 0.0268 ms 94.4% + triton_mm_5681 0.0268 ms 94.3% + triton_mm_5682 0.0291 ms 87.0% + triton_mm_5683 0.0295 ms 85.8% + triton_mm_5678 0.0319 ms 79.2% + triton_mm_5688 0.0329 ms 76.8% +SingleProcess AUTOTUNE takes 4.2633 seconds +pass-sqnr-inf + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +LearningToPaint +cuda eval LearningToPaint int8weightonly-bs1-acc +AUTOTUNE mixed_mm(1x512, 512x65) + triton_mm_152 0.0140 ms 100.0% + triton_mm_151 0.0178 ms 78.4% + triton_mm_148 0.0189 ms 73.6% + triton_mm_147 0.0227 ms 61.4% + triton_mm_149 0.0229 ms 61.0% + triton_mm_145 0.0266 ms 52.5% + triton_mm_144 0.0269 ms 51.9% + triton_mm_153 0.0271 ms 51.5% + triton_mm_146 0.0297 ms 47.0% + triton_mm_143 0.0326 ms 42.7% +SingleProcess AUTOTUNE takes 4.3691 seconds +pass-sqnr-55.632 + loading model: 0it [00:00, ?it/s]WARNING:common:Model Super_SloMo does not support bfloat16, running with amp instead + loading model: 0it [00:03, ?it/s] +WARNING:common:Model Super_SloMo does not support bfloat16, running with amp instead +Super_SloMo +cuda eval Super_SloMo int8weightonly-bs1-acc +WARNING:common:Model Super_SloMo does not support bfloat16, running with amp instead +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +alexnet +cuda eval alexnet int8weightonly-bs1-acc +AUTOTUNE mixed_mm(1x9216, 9216x4096) + triton_mm_40 0.1219 ms 100.0% + triton_mm_38 0.1226 ms 99.5% + triton_mm_42 0.1277 ms 95.5% + triton_mm_39 0.1553 ms 78.5% + triton_mm_43 0.1735 ms 70.3% + triton_mm_36 0.1804 ms 67.6% + triton_mm_35 0.1867 ms 65.3% + triton_mm_37 0.2190 ms 55.7% + fallback_mixed_mm 0.2413 ms 50.5% + triton_mm_34 0.3243 ms 37.6% +SingleProcess AUTOTUNE takes 3.9469 seconds +AUTOTUNE mixed_mm(1x4096, 4096x4096) + triton_mm_51 0.0578 ms 100.0% + triton_mm_49 0.0588 ms 98.2% + triton_mm_53 0.0678 ms 85.3% + triton_mm_46 0.0866 ms 66.7% + triton_mm_47 0.0884 ms 65.3% + triton_mm_50 0.0929 ms 62.2% + triton_mm_48 0.1014 ms 57.0% + triton_mm_54 0.1024 ms 56.4% + fallback_mixed_mm 0.1178 ms 49.0% + triton_mm_45 0.1518 ms 38.1% +SingleProcess AUTOTUNE takes 3.7202 seconds +AUTOTUNE mixed_mm(1x4096, 4096x1000) + triton_mm_61 0.0429 ms 100.0% + triton_mm_64 0.0509 ms 84.2% + triton_mm_65 0.0563 ms 76.2% + triton_mm_62 0.0587 ms 73.1% + triton_mm_60 0.0587 ms 73.1% + fallback_mixed_mm 0.0604 ms 71.0% + triton_mm_58 0.0799 ms 53.6% + triton_mm_57 0.0826 ms 51.9% + triton_mm_59 0.1018 ms 42.1% + triton_mm_56 0.1464 ms 29.3% +SingleProcess AUTOTUNE takes 3.7075 seconds +pass-sqnr-41.226 + loading model: 0it [00:00, ?it/s]basic_gnn_edgecnn + loading model: 0it [00:04, ?it/s] +cuda eval basic_gnn_edgecnn int8weightonly-bs1-acc +skipping cudagraphs due to ['incompatible ops'] +pass-sqnr-50.445 + loading model: 0it [00:00, ?it/s]basic_gnn_gcn + loading model: 0it [00:03, ?it/s] +cuda eval basic_gnn_gcn int8weightonly-bs1-acc +skipping cudagraphs due to ['incompatible ops'] +skipping cudagraphs due to ['incompatible ops'] +skipping cudagraphs due to ['incompatible ops'] +skipping cudagraphs due to ['incompatible ops'] +pass-sqnr-49.672 + loading model: 0it [00:00, ?it/s]basic_gnn_gin + loading model: 0it [00:03, ?it/s] +cuda eval basic_gnn_gin int8weightonly-bs1-acc +skipping cudagraphs due to ['incompatible ops'] +pass-sqnr-44.907 + loading model: 0it [00:00, ?it/s]basic_gnn_sage + loading model: 0it [00:03, ?it/s] +cuda eval basic_gnn_sage int8weightonly-bs1-acc +skipping cudagraphs due to ['incompatible ops'] +pass-sqnr-48.436 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:08, ?it/s] +cm3leon_generate +cuda eval cm3leon_generate int8weightonly-bs1-acc +AUTOTUNE bmm(16x1x96, 16x96x364) + triton_bmm_34616 0.0077 ms 100.0% + triton_bmm_34618 0.0077 ms 99.6% + triton_bmm_34614 0.0080 ms 96.8% + triton_bmm_34613 0.0080 ms 96.4% + triton_bmm_34615 0.0080 ms 96.0% + triton_bmm_34620 0.0080 ms 96.0% + triton_bmm_34612 0.0085 ms 90.6% + triton_bmm_34617 0.0088 ms 87.6% + triton_bmm_34619 0.0090 ms 85.5% + triton_bmm_34622 0.0093 ms 82.8% +SingleProcess AUTOTUNE takes 2.0357 seconds +AUTOTUNE bmm(16x1x364, 16x364x96) + triton_bmm_34641 0.0088 ms 100.0% + triton_bmm_34644 0.0092 ms 94.8% + triton_bmm_34640 0.0096 ms 91.6% + triton_bmm_34642 0.0096 ms 91.6% + bmm 0.0096 ms 91.3% + triton_bmm_34645 0.0100 ms 87.8% + triton_bmm_34639 0.0101 ms 87.0% + triton_bmm_34638 0.0112 ms 78.3% + triton_bmm_34637 0.0117 ms 75.1% + triton_bmm_34636 0.0151 ms 58.2% +SingleProcess AUTOTUNE takes 4.1367 seconds +AUTOTUNE bmm(16x1x96, 16x96x365) + triton_bmm_34712 0.0078 ms 100.0% + triton_bmm_34714 0.0078 ms 100.0% + triton_bmm_34710 0.0080 ms 97.6% + triton_bmm_34711 0.0080 ms 97.2% + triton_bmm_34709 0.0082 ms 95.7% + triton_bmm_34713 0.0083 ms 94.6% + triton_bmm_34708 0.0086 ms 91.0% + triton_bmm_34716 0.0086 ms 91.0% + triton_bmm_34715 0.0090 ms 86.5% + triton_bmm_34717 0.0093 ms 83.6% +SingleProcess AUTOTUNE takes 3.9199 seconds +AUTOTUNE bmm(16x1x365, 16x365x96) + triton_bmm_34741 0.0108 ms 100.0% + bmm 0.0116 ms 93.1% + triton_bmm_34740 0.0128 ms 84.5% + triton_bmm_34737 0.0150 ms 72.3% + triton_bmm_34736 0.0153 ms 71.1% + triton_bmm_34738 0.0156 ms 69.6% + triton_bmm_34734 0.0161 ms 67.5% + triton_bmm_34733 0.0161 ms 67.4% + triton_bmm_34735 0.0167 ms 64.8% + triton_bmm_34732 0.0199 ms 54.4% +SingleProcess AUTOTUNE takes 3.8314 seconds +AUTOTUNE bmm(16x1x96, 16x96x366) + triton_bmm_34808 0.0078 ms 100.0% + triton_bmm_34806 0.0080 ms 97.2% + triton_bmm_34807 0.0080 ms 96.8% + triton_bmm_34812 0.0080 ms 96.8% + triton_bmm_34809 0.0083 ms 94.2% + triton_bmm_34810 0.0083 ms 93.3% + triton_bmm_34804 0.0087 ms 89.3% + triton_bmm_34805 0.0088 ms 88.4% + triton_bmm_34815 0.0093 ms 83.8% + triton_bmm_34814 0.0094 ms 82.9% +SingleProcess AUTOTUNE takes 3.9335 seconds +AUTOTUNE bmm(16x1x366, 16x366x96) + triton_bmm_34833 0.0088 ms 100.0% + triton_bmm_34832 0.0096 ms 92.0% + triton_bmm_34834 0.0096 ms 91.7% + triton_bmm_34837 0.0097 ms 90.5% + triton_bmm_34836 0.0098 ms 89.7% + bmm 0.0101 ms 87.0% + triton_bmm_34831 0.0103 ms 85.4% + triton_bmm_34830 0.0114 ms 77.2% + triton_bmm_34829 0.0117 ms 75.3% + triton_bmm_34828 0.0153 ms 57.4% +SingleProcess AUTOTUNE takes 4.2731 seconds +AUTOTUNE bmm(16x1x96, 16x96x367) + triton_bmm_34904 0.0078 ms 100.0% + triton_bmm_34906 0.0078 ms 99.6% + triton_bmm_34903 0.0080 ms 96.8% + triton_bmm_34905 0.0083 ms 93.8% + triton_bmm_34902 0.0085 ms 91.0% + triton_bmm_34908 0.0086 ms 90.7% + triton_bmm_34901 0.0088 ms 88.4% + triton_bmm_34907 0.0090 ms 86.2% + triton_bmm_34900 0.0092 ms 84.4% + triton_bmm_34909 0.0095 ms 81.5% +SingleProcess AUTOTUNE takes 3.8992 seconds +AUTOTUNE bmm(16x1x367, 16x367x96) + triton_bmm_34933 0.0108 ms 100.0% + bmm 0.0122 ms 89.2% + triton_bmm_34932 0.0128 ms 84.7% + triton_bmm_34930 0.0152 ms 71.2% + triton_bmm_34926 0.0156 ms 69.8% + triton_bmm_34929 0.0156 ms 69.8% + triton_bmm_34928 0.0158 ms 68.6% + triton_bmm_34925 0.0163 ms 66.6% + triton_bmm_34927 0.0167 ms 64.9% + triton_bmm_34924 0.0200 ms 54.3% +SingleProcess AUTOTUNE takes 3.6975 seconds +AUTOTUNE bmm(16x1x96, 16x96x368) + triton_bmm_35000 0.0078 ms 100.0% + triton_bmm_34998 0.0080 ms 97.6% + triton_bmm_34999 0.0080 ms 97.2% + triton_bmm_35001 0.0083 ms 94.6% + triton_bmm_35002 0.0084 ms 93.5% + triton_bmm_34996 0.0085 ms 91.4% + triton_bmm_34997 0.0085 ms 91.4% + triton_bmm_35004 0.0086 ms 90.7% + triton_bmm_35003 0.0090 ms 86.5% + triton_bmm_35005 0.0091 ms 86.2% +SingleProcess AUTOTUNE takes 3.6876 seconds +AUTOTUNE bmm(16x1x368, 16x368x96) + triton_bmm_35025 0.0090 ms 100.0% + triton_bmm_35026 0.0090 ms 100.0% + triton_bmm_35028 0.0092 ms 97.9% + triton_bmm_35024 0.0096 ms 94.3% + triton_bmm_35029 0.0101 ms 89.2% + triton_bmm_35023 0.0102 ms 88.7% + triton_bmm_35022 0.0113 ms 79.7% + triton_bmm_35021 0.0119 ms 75.8% + triton_bmm_35020 0.0150 ms 60.0% + triton_bmm_35027 0.0161 ms 56.2% +SingleProcess AUTOTUNE takes 4.1789 seconds +AUTOTUNE bmm(16x1x96, 16x96x369) + triton_bmm_35096 0.0078 ms 100.0% + triton_bmm_35098 0.0078 ms 100.0% + triton_bmm_35094 0.0080 ms 97.2% + triton_bmm_35093 0.0083 ms 94.2% + triton_bmm_35100 0.0085 ms 91.0% + triton_bmm_35095 0.0086 ms 90.0% + triton_bmm_35097 0.0088 ms 88.0% + triton_bmm_35092 0.0093 ms 83.6% + triton_bmm_35101 0.0095 ms 82.1% + triton_bmm_35102 0.0095 ms 81.5% +SingleProcess AUTOTUNE takes 3.7702 seconds +AUTOTUNE bmm(16x1x369, 16x369x96) + bmm 0.0111 ms 100.0% + triton_bmm_35125 0.0114 ms 97.5% + triton_bmm_35124 0.0128 ms 86.5% + triton_bmm_35122 0.0153 ms 72.7% + triton_bmm_35120 0.0153 ms 72.6% + triton_bmm_35118 0.0155 ms 71.5% + triton_bmm_35121 0.0156 ms 71.0% + triton_bmm_35117 0.0169 ms 65.8% + triton_bmm_35119 0.0171 ms 65.1% + triton_bmm_35116 0.0200 ms 55.6% +SingleProcess AUTOTUNE takes 4.5986 seconds +AUTOTUNE bmm(16x1x96, 16x96x370) + triton_bmm_35192 0.0078 ms 100.0% + triton_bmm_35189 0.0080 ms 96.8% + triton_bmm_35191 0.0080 ms 96.8% + triton_bmm_35194 0.0083 ms 93.5% + triton_bmm_35190 0.0085 ms 91.7% + triton_bmm_35196 0.0086 ms 90.7% + triton_bmm_35193 0.0088 ms 88.0% + triton_bmm_35188 0.0093 ms 83.5% + triton_bmm_35198 0.0095 ms 82.1% + triton_bmm_35195 0.0096 ms 81.3% +SingleProcess AUTOTUNE takes 3.9994 seconds +AUTOTUNE bmm(16x1x370, 16x370x96) + triton_bmm_35218 0.0091 ms 100.0% + triton_bmm_35217 0.0096 ms 94.8% + triton_bmm_35216 0.0096 ms 94.6% + triton_bmm_35221 0.0098 ms 92.5% + triton_bmm_35220 0.0098 ms 92.2% + bmm 0.0099 ms 91.3% + triton_bmm_35215 0.0102 ms 89.0% + triton_bmm_35214 0.0108 ms 83.5% + triton_bmm_35213 0.0114 ms 79.7% + triton_bmm_35212 0.0155 ms 58.6% +SingleProcess AUTOTUNE takes 3.9756 seconds +AUTOTUNE bmm(16x1x96, 16x96x371) + triton_bmm_35288 0.0078 ms 100.0% + triton_bmm_35290 0.0078 ms 100.0% + triton_bmm_35286 0.0080 ms 97.2% + triton_bmm_35287 0.0080 ms 96.8% + triton_bmm_35285 0.0083 ms 94.2% + triton_bmm_35292 0.0086 ms 90.5% + triton_bmm_35284 0.0086 ms 90.3% + triton_bmm_35289 0.0088 ms 88.0% + triton_bmm_35291 0.0090 ms 86.2% + triton_bmm_35294 0.0094 ms 82.4% +SingleProcess AUTOTUNE takes 3.8850 seconds +AUTOTUNE bmm(16x1x371, 16x371x96) + triton_bmm_35317 0.0114 ms 100.0% + bmm 0.0116 ms 98.3% + triton_bmm_35316 0.0130 ms 87.9% + triton_bmm_35313 0.0152 ms 74.8% + triton_bmm_35314 0.0153 ms 74.6% + triton_bmm_35312 0.0155 ms 73.4% + triton_bmm_35310 0.0161 ms 70.8% + triton_bmm_35309 0.0169 ms 67.6% + triton_bmm_35311 0.0171 ms 66.8% + triton_bmm_35308 0.0202 ms 56.4% +SingleProcess AUTOTUNE takes 4.0858 seconds +AUTOTUNE bmm(16x1x96, 16x96x372) + triton_bmm_35382 0.0080 ms 100.0% + triton_bmm_35383 0.0080 ms 99.6% + triton_bmm_35384 0.0083 ms 96.2% + triton_bmm_35386 0.0083 ms 96.2% + triton_bmm_35380 0.0085 ms 93.6% + triton_bmm_35388 0.0086 ms 93.3% + triton_bmm_35381 0.0088 ms 90.9% + triton_bmm_35385 0.0088 ms 90.6% + triton_bmm_35391 0.0093 ms 85.9% + triton_bmm_35387 0.0096 ms 83.6% +SingleProcess AUTOTUNE takes 4.0670 seconds +AUTOTUNE bmm(16x1x372, 16x372x96) + triton_bmm_35410 0.0090 ms 100.0% + triton_bmm_35412 0.0093 ms 97.2% + triton_bmm_35409 0.0095 ms 94.6% + bmm 0.0097 ms 93.1% + triton_bmm_35408 0.0101 ms 89.2% + triton_bmm_35413 0.0101 ms 89.2% + triton_bmm_35407 0.0103 ms 87.6% + triton_bmm_35405 0.0114 ms 79.4% + triton_bmm_35406 0.0114 ms 79.2% + triton_bmm_35404 0.0148 ms 61.2% +SingleProcess AUTOTUNE takes 3.8838 seconds +AUTOTUNE bmm(16x1x96, 16x96x373) + triton_bmm_35482 0.0078 ms 100.0% + triton_bmm_35478 0.0080 ms 97.2% + triton_bmm_35477 0.0083 ms 94.2% + triton_bmm_35480 0.0083 ms 93.5% + triton_bmm_35479 0.0086 ms 90.0% + triton_bmm_35484 0.0086 ms 90.0% + triton_bmm_35476 0.0088 ms 88.7% + triton_bmm_35481 0.0091 ms 85.9% + triton_bmm_35483 0.0096 ms 81.0% + triton_bmm_35487 0.0099 ms 78.9% +SingleProcess AUTOTUNE takes 3.7844 seconds +AUTOTUNE bmm(16x1x373, 16x373x96) + triton_bmm_35509 0.0114 ms 100.0% + bmm 0.0118 ms 96.7% + triton_bmm_35508 0.0124 ms 91.8% + triton_bmm_35506 0.0153 ms 74.6% + triton_bmm_35505 0.0156 ms 73.0% + triton_bmm_35504 0.0158 ms 71.9% + triton_bmm_35502 0.0161 ms 70.8% + triton_bmm_35501 0.0165 ms 68.9% + triton_bmm_35503 0.0175 ms 65.0% + triton_bmm_35500 0.0200 ms 57.1% +SingleProcess AUTOTUNE takes 3.7954 seconds +AUTOTUNE bmm(16x1x96, 16x96x374) + triton_bmm_35576 0.0078 ms 100.0% + triton_bmm_35574 0.0080 ms 97.2% + triton_bmm_35575 0.0080 ms 96.8% + triton_bmm_35580 0.0081 ms 96.4% + triton_bmm_35578 0.0083 ms 93.3% + triton_bmm_35577 0.0085 ms 91.7% + triton_bmm_35572 0.0088 ms 88.7% + triton_bmm_35573 0.0088 ms 88.4% + triton_bmm_35579 0.0096 ms 81.3% + triton_bmm_35583 0.0098 ms 79.2% +SingleProcess AUTOTUNE takes 4.0671 seconds +AUTOTUNE bmm(16x1x374, 16x374x96) + triton_bmm_35602 0.0091 ms 100.0% + triton_bmm_35604 0.0093 ms 97.6% + triton_bmm_35601 0.0095 ms 95.0% + triton_bmm_35600 0.0101 ms 89.6% + bmm 0.0101 ms 89.3% + triton_bmm_35599 0.0103 ms 88.3% + triton_bmm_35605 0.0103 ms 88.3% + triton_bmm_35598 0.0108 ms 83.5% + triton_bmm_35597 0.0117 ms 77.1% + triton_bmm_35596 0.0155 ms 58.4% +SingleProcess AUTOTUNE takes 3.8637 seconds +AUTOTUNE bmm(16x1x96, 16x96x375) + triton_bmm_35674 0.0078 ms 100.0% + triton_bmm_35670 0.0080 ms 97.2% + triton_bmm_35676 0.0081 ms 96.4% + triton_bmm_35672 0.0083 ms 93.5% + triton_bmm_35671 0.0088 ms 88.7% + triton_bmm_35669 0.0088 ms 88.4% + triton_bmm_35673 0.0088 ms 88.0% + triton_bmm_35668 0.0092 ms 84.2% + triton_bmm_35679 0.0093 ms 83.5% + triton_bmm_35678 0.0095 ms 81.5% +SingleProcess AUTOTUNE takes 3.8397 seconds +AUTOTUNE bmm(16x1x375, 16x375x96) + triton_bmm_35701 0.0108 ms 100.0% + bmm 0.0119 ms 91.1% + triton_bmm_35700 0.0124 ms 87.4% + triton_bmm_35698 0.0154 ms 70.6% + triton_bmm_35697 0.0156 ms 69.5% + triton_bmm_35696 0.0161 ms 67.5% + triton_bmm_35694 0.0161 ms 67.3% + triton_bmm_35693 0.0169 ms 64.3% + triton_bmm_35695 0.0171 ms 63.6% + triton_bmm_35692 0.0203 ms 53.6% +SingleProcess AUTOTUNE takes 3.9739 seconds +AUTOTUNE bmm(16x1x96, 16x96x376) + triton_bmm_35772 0.0080 ms 100.0% + triton_bmm_35770 0.0083 ms 96.9% + triton_bmm_35768 0.0083 ms 96.5% + triton_bmm_35769 0.0083 ms 96.5% + triton_bmm_35766 0.0085 ms 94.0% + triton_bmm_35767 0.0086 ms 93.3% + triton_bmm_35765 0.0087 ms 92.3% + triton_bmm_35773 0.0091 ms 88.7% + triton_bmm_35764 0.0093 ms 86.6% + triton_bmm_35771 0.0096 ms 83.9% +SingleProcess AUTOTUNE takes 3.8841 seconds +AUTOTUNE bmm(16x1x376, 16x376x96) + triton_bmm_35792 0.0096 ms 100.0% + triton_bmm_35794 0.0096 ms 100.0% + triton_bmm_35793 0.0096 ms 99.7% + triton_bmm_35796 0.0099 ms 97.1% + triton_bmm_35797 0.0100 ms 96.1% + triton_bmm_35791 0.0101 ms 94.9% + bmm 0.0103 ms 92.6% + triton_bmm_35790 0.0108 ms 88.2% + triton_bmm_35789 0.0114 ms 84.2% + triton_bmm_35788 0.0148 ms 64.9% +SingleProcess AUTOTUNE takes 4.7505 seconds +AUTOTUNE bmm(16x1x96, 16x96x377) + triton_bmm_35864 0.0078 ms 100.0% + triton_bmm_35862 0.0080 ms 97.2% + triton_bmm_35866 0.0084 ms 93.1% + triton_bmm_35865 0.0085 ms 91.7% + triton_bmm_35861 0.0088 ms 88.4% + triton_bmm_35863 0.0088 ms 88.4% + triton_bmm_35868 0.0088 ms 88.4% + triton_bmm_35867 0.0091 ms 85.9% + triton_bmm_35860 0.0093 ms 83.8% + triton_bmm_35869 0.0095 ms 82.1% +SingleProcess AUTOTUNE takes 4.3252 seconds +AUTOTUNE bmm(16x1x377, 16x377x96) + triton_bmm_35893 0.0108 ms 100.0% + bmm 0.0119 ms 91.1% + triton_bmm_35892 0.0130 ms 83.7% + triton_bmm_35889 0.0156 ms 69.5% + triton_bmm_35890 0.0158 ms 68.6% + triton_bmm_35888 0.0161 ms 67.5% + triton_bmm_35886 0.0162 ms 67.1% + triton_bmm_35885 0.0165 ms 65.6% + triton_bmm_35887 0.0176 ms 61.6% + triton_bmm_35884 0.0199 ms 54.4% +SingleProcess AUTOTUNE takes 3.7985 seconds +AUTOTUNE bmm(16x1x96, 16x96x378) + triton_bmm_35960 0.0083 ms 100.0% + triton_bmm_35962 0.0084 ms 99.2% + triton_bmm_35958 0.0085 ms 97.4% + triton_bmm_35964 0.0086 ms 97.2% + triton_bmm_35959 0.0088 ms 94.9% + triton_bmm_35957 0.0088 ms 94.5% + triton_bmm_35961 0.0088 ms 94.2% + triton_bmm_35956 0.0093 ms 89.7% + triton_bmm_35967 0.0093 ms 89.7% + triton_bmm_35966 0.0095 ms 87.2% +SingleProcess AUTOTUNE takes 3.6937 seconds +AUTOTUNE bmm(16x1x378, 16x378x96) + triton_bmm_35985 0.0090 ms 100.0% + triton_bmm_35986 0.0090 ms 100.0% + triton_bmm_35988 0.0099 ms 91.6% + bmm 0.0101 ms 89.2% + triton_bmm_35984 0.0102 ms 88.4% + triton_bmm_35983 0.0103 ms 87.3% + triton_bmm_35989 0.0104 ms 87.2% + triton_bmm_35982 0.0108 ms 83.2% + triton_bmm_35981 0.0114 ms 79.4% + triton_bmm_35980 0.0150 ms 60.1% +SingleProcess AUTOTUNE takes 4.0454 seconds +AUTOTUNE bmm(16x1x96, 16x96x379) + triton_bmm_36056 0.0078 ms 100.0% + triton_bmm_36055 0.0080 ms 96.8% + triton_bmm_36060 0.0080 ms 96.8% + triton_bmm_36053 0.0083 ms 94.2% + triton_bmm_36058 0.0083 ms 93.5% + triton_bmm_36054 0.0085 ms 91.0% + triton_bmm_36057 0.0088 ms 88.0% + triton_bmm_36052 0.0092 ms 84.4% + triton_bmm_36061 0.0096 ms 81.3% + triton_bmm_36059 0.0096 ms 81.0% +SingleProcess AUTOTUNE takes 4.0898 seconds +AUTOTUNE bmm(16x1x379, 16x379x96) + triton_bmm_36085 0.0114 ms 100.0% + bmm 0.0124 ms 91.8% + triton_bmm_36084 0.0124 ms 91.8% + triton_bmm_36081 0.0153 ms 74.6% + triton_bmm_36082 0.0153 ms 74.5% + triton_bmm_36078 0.0157 ms 72.4% + triton_bmm_36080 0.0161 ms 70.9% + triton_bmm_36077 0.0171 ms 66.7% + triton_bmm_36079 0.0171 ms 66.7% + triton_bmm_36076 0.0202 ms 56.3% +SingleProcess AUTOTUNE takes 3.7726 seconds +AUTOTUNE bmm(16x1x96, 16x96x380) + triton_bmm_36152 0.0078 ms 100.0% + triton_bmm_36154 0.0078 ms 100.0% + triton_bmm_36149 0.0080 ms 97.2% + triton_bmm_36150 0.0080 ms 97.2% + triton_bmm_36156 0.0080 ms 97.2% + triton_bmm_36153 0.0083 ms 93.8% + triton_bmm_36148 0.0085 ms 91.0% + triton_bmm_36151 0.0086 ms 90.3% + triton_bmm_36155 0.0096 ms 81.0% + triton_bmm_36157 0.0099 ms 78.9% +SingleProcess AUTOTUNE takes 3.8027 seconds +AUTOTUNE bmm(16x1x380, 16x380x96) + triton_bmm_36177 0.0089 ms 100.0% + bmm 0.0093 ms 95.5% + triton_bmm_36176 0.0096 ms 93.3% + triton_bmm_36178 0.0096 ms 93.0% + triton_bmm_36180 0.0098 ms 90.9% + triton_bmm_36181 0.0101 ms 88.6% + triton_bmm_36175 0.0103 ms 86.4% + triton_bmm_36174 0.0114 ms 78.4% + triton_bmm_36173 0.0119 ms 75.0% + triton_bmm_36172 0.0153 ms 58.4% +SingleProcess AUTOTUNE takes 3.9518 seconds +AUTOTUNE bmm(16x1x96, 16x96x381) + triton_bmm_36250 0.0078 ms 100.0% + triton_bmm_36247 0.0080 ms 97.2% + triton_bmm_36245 0.0083 ms 94.2% + triton_bmm_36248 0.0083 ms 93.5% + triton_bmm_36249 0.0085 ms 91.4% + triton_bmm_36246 0.0085 ms 91.0% + triton_bmm_36244 0.0086 ms 90.0% + triton_bmm_36252 0.0088 ms 88.4% + triton_bmm_36255 0.0093 ms 83.5% + triton_bmm_36253 0.0096 ms 81.3% +SingleProcess AUTOTUNE takes 3.9949 seconds +AUTOTUNE bmm(16x1x381, 16x381x96) + triton_bmm_36277 0.0114 ms 100.0% + bmm 0.0124 ms 91.8% + triton_bmm_36276 0.0130 ms 88.1% + triton_bmm_36273 0.0153 ms 74.8% + triton_bmm_36272 0.0155 ms 73.6% + triton_bmm_36270 0.0157 ms 72.9% + triton_bmm_36274 0.0158 ms 72.3% + triton_bmm_36269 0.0169 ms 67.6% + triton_bmm_36271 0.0171 ms 66.9% + triton_bmm_36268 0.0203 ms 56.4% +SingleProcess AUTOTUNE takes 3.7894 seconds +AUTOTUNE bmm(16x1x96, 16x96x382) + triton_bmm_36344 0.0078 ms 100.0% + triton_bmm_36346 0.0078 ms 100.0% + triton_bmm_36342 0.0080 ms 97.2% + triton_bmm_36343 0.0080 ms 97.2% + triton_bmm_36341 0.0080 ms 96.8% + triton_bmm_36348 0.0080 ms 96.8% + triton_bmm_36340 0.0087 ms 89.0% + triton_bmm_36345 0.0088 ms 88.0% + triton_bmm_36347 0.0090 ms 86.2% + triton_bmm_36350 0.0095 ms 81.5% +SingleProcess AUTOTUNE takes 3.6920 seconds +AUTOTUNE bmm(16x1x382, 16x382x96) + triton_bmm_36369 0.0096 ms 100.0% + triton_bmm_36370 0.0096 ms 99.7% + triton_bmm_36368 0.0097 ms 98.4% + triton_bmm_36372 0.0098 ms 97.4% + triton_bmm_36373 0.0098 ms 97.4% + bmm 0.0101 ms 94.6% + triton_bmm_36367 0.0107 ms 89.1% + triton_bmm_36365 0.0114 ms 84.2% + triton_bmm_36366 0.0114 ms 84.0% + triton_bmm_36364 0.0155 ms 61.6% +SingleProcess AUTOTUNE takes 4.2012 seconds +AUTOTUNE bmm(16x1x96, 16x96x383) + triton_bmm_36442 0.0078 ms 100.0% + triton_bmm_36440 0.0084 ms 93.1% + triton_bmm_36438 0.0085 ms 91.0% + triton_bmm_36436 0.0087 ms 89.3% + triton_bmm_36437 0.0088 ms 88.4% + triton_bmm_36444 0.0088 ms 88.4% + triton_bmm_36439 0.0088 ms 88.0% + triton_bmm_36441 0.0089 ms 87.1% + triton_bmm_36443 0.0090 ms 86.2% + triton_bmm_36447 0.0093 ms 83.5% +SingleProcess AUTOTUNE takes 3.9185 seconds +AUTOTUNE bmm(16x1x383, 16x383x96) + triton_bmm_36469 0.0117 ms 100.0% + triton_bmm_36468 0.0129 ms 90.3% + triton_bmm_36466 0.0145 ms 80.8% + triton_bmm_36465 0.0145 ms 80.4% + triton_bmm_36462 0.0149 ms 78.5% + bmm 0.0149 ms 78.2% + triton_bmm_36464 0.0151 ms 77.5% + triton_bmm_36463 0.0160 ms 72.9% + triton_bmm_36461 0.0171 ms 68.2% + triton_bmm_36460 0.0212 ms 55.1% +SingleProcess AUTOTUNE takes 4.1183 seconds +AUTOTUNE bmm(16x1x96, 16x96x384) + triton_bmm_36536 0.0078 ms 100.0% + triton_bmm_36533 0.0080 ms 97.2% + triton_bmm_36534 0.0080 ms 97.2% + triton_bmm_36535 0.0080 ms 97.2% + triton_bmm_36538 0.0083 ms 93.5% + triton_bmm_36532 0.0086 ms 90.7% + triton_bmm_36540 0.0087 ms 89.0% + triton_bmm_36537 0.0089 ms 87.7% + triton_bmm_36542 0.0094 ms 82.4% + triton_bmm_36541 0.0095 ms 81.5% +SingleProcess AUTOTUNE takes 3.9877 seconds +AUTOTUNE bmm(16x1x384, 16x384x96) + triton_bmm_36564 0.0091 ms 100.0% + triton_bmm_36565 0.0095 ms 95.0% + triton_bmm_36561 0.0096 ms 94.6% + triton_bmm_36562 0.0096 ms 94.6% + triton_bmm_36559 0.0101 ms 89.8% + triton_bmm_36560 0.0101 ms 89.6% + bmm 0.0105 ms 86.3% + triton_bmm_36558 0.0108 ms 83.5% + triton_bmm_36557 0.0114 ms 79.7% + triton_bmm_36556 0.0153 ms 59.2% +SingleProcess AUTOTUNE takes 3.8555 seconds +AUTOTUNE bmm(16x1x96, 16x96x385) + triton_bmm_36631 0.0080 ms 100.0% + triton_bmm_36632 0.0081 ms 99.6% + triton_bmm_36634 0.0081 ms 99.6% + triton_bmm_36630 0.0081 ms 98.8% + triton_bmm_36629 0.0083 ms 97.3% + triton_bmm_36636 0.0084 ms 96.2% + triton_bmm_36633 0.0089 ms 90.3% + triton_bmm_36628 0.0093 ms 86.3% + triton_bmm_36639 0.0093 ms 86.3% + triton_bmm_36638 0.0095 ms 84.2% +SingleProcess AUTOTUNE takes 4.0255 seconds +AUTOTUNE bmm(16x1x385, 16x385x96) + bmm 0.0101 ms 100.0% + triton_bmm_36661 0.0127 ms 79.5% + triton_bmm_36660 0.0129 ms 78.0% + triton_bmm_36657 0.0155 ms 64.9% + triton_bmm_36656 0.0158 ms 63.8% + triton_bmm_36658 0.0161 ms 62.6% + triton_bmm_36654 0.0166 ms 60.7% + triton_bmm_36653 0.0168 ms 59.9% + triton_bmm_36655 0.0176 ms 57.4% + triton_bmm_36652 0.0209 ms 48.1% +SingleProcess AUTOTUNE takes 3.7510 seconds +AUTOTUNE bmm(16x1x96, 16x96x386) + triton_bmm_36726 0.0082 ms 100.0% + triton_bmm_36729 0.0083 ms 98.5% + triton_bmm_36728 0.0086 ms 94.8% + triton_bmm_36727 0.0086 ms 94.4% + triton_bmm_36730 0.0086 ms 94.4% + triton_bmm_36724 0.0088 ms 93.1% + triton_bmm_36725 0.0088 ms 92.7% + triton_bmm_36732 0.0089 ms 91.7% + triton_bmm_36731 0.0090 ms 90.4% + triton_bmm_36734 0.0095 ms 85.6% +SingleProcess AUTOTUNE takes 3.7823 seconds +AUTOTUNE bmm(16x1x386, 16x386x96) + triton_bmm_36753 0.0091 ms 100.0% + triton_bmm_36752 0.0098 ms 92.2% + triton_bmm_36754 0.0098 ms 92.2% + triton_bmm_36756 0.0101 ms 89.6% + triton_bmm_36751 0.0104 ms 87.1% + bmm 0.0107 ms 85.0% + triton_bmm_36757 0.0110 ms 82.3% + triton_bmm_36750 0.0113 ms 79.9% + triton_bmm_36749 0.0122 ms 74.3% + triton_bmm_36748 0.0159 ms 56.8% +SingleProcess AUTOTUNE takes 4.2591 seconds +AUTOTUNE bmm(16x1x96, 16x96x387) + triton_bmm_36826 0.0081 ms 100.0% + triton_bmm_36822 0.0082 ms 98.4% + triton_bmm_36821 0.0083 ms 98.1% + triton_bmm_36825 0.0085 ms 95.1% + triton_bmm_36824 0.0086 ms 93.7% + triton_bmm_36820 0.0088 ms 92.3% + triton_bmm_36823 0.0088 ms 91.7% + triton_bmm_36828 0.0089 ms 91.0% + triton_bmm_36831 0.0093 ms 86.9% + triton_bmm_36830 0.0095 ms 84.9% +SingleProcess AUTOTUNE takes 4.2896 seconds +AUTOTUNE bmm(16x1x387, 16x387x96) + bmm 0.0106 ms 100.0% + triton_bmm_36853 0.0132 ms 80.2% + triton_bmm_36852 0.0135 ms 78.7% + triton_bmm_36849 0.0158 ms 67.3% + triton_bmm_36850 0.0158 ms 67.3% + triton_bmm_36848 0.0164 ms 65.0% + triton_bmm_36846 0.0167 ms 63.6% + triton_bmm_36845 0.0169 ms 63.0% + triton_bmm_36847 0.0176 ms 60.5% + triton_bmm_36844 0.0210 ms 50.7% +SingleProcess AUTOTUNE takes 3.8622 seconds +AUTOTUNE bmm(16x1x96, 16x96x388) + triton_bmm_36919 0.0080 ms 100.0% + triton_bmm_36918 0.0080 ms 99.6% + triton_bmm_36920 0.0081 ms 99.2% + triton_bmm_36922 0.0081 ms 99.2% + triton_bmm_36921 0.0085 ms 94.0% + triton_bmm_36916 0.0088 ms 91.2% + triton_bmm_36917 0.0088 ms 90.9% + triton_bmm_36924 0.0089 ms 89.9% + triton_bmm_36923 0.0090 ms 88.7% + triton_bmm_36925 0.0093 ms 85.9% +SingleProcess AUTOTUNE takes 3.8646 seconds +AUTOTUNE bmm(16x1x388, 16x388x96) + triton_bmm_36945 0.0091 ms 100.0% + triton_bmm_36946 0.0098 ms 92.5% + triton_bmm_36944 0.0098 ms 92.2% + triton_bmm_36949 0.0101 ms 89.8% + triton_bmm_36948 0.0101 ms 89.6% + bmm 0.0102 ms 89.0% + triton_bmm_36943 0.0111 ms 81.7% + triton_bmm_36942 0.0111 ms 81.3% + triton_bmm_36941 0.0116 ms 77.7% + triton_bmm_36940 0.0158 ms 57.2% +SingleProcess AUTOTUNE takes 3.8329 seconds +AUTOTUNE bmm(16x1x96, 16x96x389) + triton_bmm_37016 0.0081 ms 100.0% + triton_bmm_37020 0.0084 ms 96.6% + triton_bmm_37018 0.0088 ms 92.0% + triton_bmm_37013 0.0088 ms 91.6% + triton_bmm_37014 0.0088 ms 91.6% + triton_bmm_37015 0.0088 ms 91.6% + triton_bmm_37017 0.0091 ms 89.0% + triton_bmm_37019 0.0091 ms 89.0% + triton_bmm_37012 0.0093 ms 86.6% + triton_bmm_37021 0.0095 ms 84.6% +SingleProcess AUTOTUNE takes 3.9663 seconds +AUTOTUNE bmm(16x1x389, 16x389x96) + bmm 0.0104 ms 100.0% + triton_bmm_37044 0.0131 ms 79.5% + triton_bmm_37045 0.0132 ms 78.5% + triton_bmm_37042 0.0158 ms 65.9% + triton_bmm_37040 0.0160 ms 64.9% + triton_bmm_37041 0.0164 ms 63.6% + triton_bmm_37038 0.0168 ms 62.0% + triton_bmm_37037 0.0169 ms 61.7% + triton_bmm_37039 0.0179 ms 58.0% + triton_bmm_37036 0.0205 ms 50.8% +SingleProcess AUTOTUNE takes 3.9057 seconds +AUTOTUNE bmm(16x1x96, 16x96x390) + triton_bmm_37111 0.0080 ms 100.0% + triton_bmm_37112 0.0081 ms 99.6% + triton_bmm_37114 0.0081 ms 99.2% + triton_bmm_37109 0.0088 ms 91.3% + triton_bmm_37110 0.0088 ms 91.3% + triton_bmm_37116 0.0089 ms 90.1% + triton_bmm_37113 0.0091 ms 88.7% + triton_bmm_37108 0.0093 ms 86.7% + triton_bmm_37119 0.0093 ms 86.3% + triton_bmm_37115 0.0096 ms 83.9% +SingleProcess AUTOTUNE takes 3.6899 seconds +AUTOTUNE bmm(16x1x390, 16x390x96) + triton_bmm_37138 0.0093 ms 100.0% + triton_bmm_37137 0.0096 ms 96.7% + triton_bmm_37140 0.0101 ms 91.9% + triton_bmm_37136 0.0104 ms 89.5% + triton_bmm_37135 0.0106 ms 87.9% + bmm 0.0108 ms 85.8% + triton_bmm_37141 0.0111 ms 83.6% + triton_bmm_37134 0.0114 ms 82.0% + triton_bmm_37133 0.0116 ms 79.9% + triton_bmm_37132 0.0161 ms 58.0% +SingleProcess AUTOTUNE takes 4.5306 seconds +AUTOTUNE bmm(16x1x96, 16x96x391) + triton_bmm_37205 0.0083 ms 100.0% + triton_bmm_37206 0.0083 ms 100.0% + triton_bmm_37212 0.0085 ms 96.6% + triton_bmm_37204 0.0088 ms 94.2% + triton_bmm_37207 0.0088 ms 93.5% + triton_bmm_37208 0.0088 ms 93.5% + triton_bmm_37210 0.0089 ms 93.1% + triton_bmm_37209 0.0091 ms 91.2% + triton_bmm_37211 0.0091 ms 91.2% + triton_bmm_37213 0.0096 ms 86.3% +SingleProcess AUTOTUNE takes 3.8048 seconds +AUTOTUNE bmm(16x1x391, 16x391x96) + bmm 0.0102 ms 100.0% + triton_bmm_37237 0.0127 ms 80.6% + triton_bmm_37236 0.0137 ms 74.7% + triton_bmm_37234 0.0158 ms 64.8% + triton_bmm_37230 0.0163 ms 62.7% + triton_bmm_37233 0.0163 ms 62.7% + triton_bmm_37232 0.0166 ms 61.8% + triton_bmm_37229 0.0176 ms 58.1% + triton_bmm_37231 0.0177 ms 58.0% + triton_bmm_37228 0.0207 ms 49.5% +SingleProcess AUTOTUNE takes 3.8617 seconds +AUTOTUNE bmm(16x1x96, 16x96x392) + triton_bmm_37301 0.0080 ms 100.0% + triton_bmm_37303 0.0081 ms 99.2% + triton_bmm_37306 0.0081 ms 99.2% + triton_bmm_37308 0.0084 ms 95.1% + triton_bmm_37304 0.0086 ms 92.9% + triton_bmm_37300 0.0088 ms 91.2% + triton_bmm_37305 0.0089 ms 90.3% + triton_bmm_37307 0.0090 ms 88.7% + triton_bmm_37309 0.0096 ms 83.6% + triton_bmm_37310 0.0096 ms 83.6% +SingleProcess AUTOTUNE takes 4.3917 seconds +AUTOTUNE bmm(16x1x392, 16x392x96) + triton_bmm_37330 0.0093 ms 100.0% + triton_bmm_37329 0.0096 ms 97.0% + triton_bmm_37332 0.0098 ms 94.5% + triton_bmm_37328 0.0104 ms 89.5% + bmm 0.0104 ms 89.2% + triton_bmm_37327 0.0106 ms 87.6% + triton_bmm_37333 0.0106 ms 87.6% + triton_bmm_37326 0.0111 ms 83.6% + triton_bmm_37325 0.0119 ms 78.2% + triton_bmm_37324 0.0158 ms 58.6% +SingleProcess AUTOTUNE takes 3.8758 seconds +AUTOTUNE bmm(16x1x96, 16x96x393) + triton_bmm_37399 0.0081 ms 100.0% + triton_bmm_37397 0.0088 ms 91.6% + triton_bmm_37398 0.0088 ms 91.6% + triton_bmm_37400 0.0088 ms 91.3% + triton_bmm_37402 0.0089 ms 91.0% + triton_bmm_37401 0.0091 ms 89.0% + triton_bmm_37403 0.0091 ms 89.0% + triton_bmm_37404 0.0091 ms 88.4% + triton_bmm_37396 0.0093 ms 86.6% + triton_bmm_37406 0.0096 ms 84.3% +SingleProcess AUTOTUNE takes 4.3353 seconds +AUTOTUNE bmm(16x1x393, 16x393x96) + bmm 0.0104 ms 100.0% + triton_bmm_37429 0.0132 ms 78.5% + triton_bmm_37428 0.0137 ms 75.8% + triton_bmm_37425 0.0158 ms 65.8% + triton_bmm_37422 0.0163 ms 63.7% + triton_bmm_37426 0.0164 ms 63.6% + triton_bmm_37424 0.0166 ms 62.7% + triton_bmm_37421 0.0175 ms 59.4% + triton_bmm_37423 0.0176 ms 59.1% + triton_bmm_37420 0.0209 ms 49.8% +SingleProcess AUTOTUNE takes 4.2555 seconds +AUTOTUNE bmm(16x1x96, 16x96x394) + triton_bmm_37495 0.0080 ms 100.0% + triton_bmm_37494 0.0083 ms 97.3% + triton_bmm_37496 0.0086 ms 93.0% + triton_bmm_37498 0.0088 ms 91.6% + triton_bmm_37493 0.0088 ms 91.3% + triton_bmm_37497 0.0089 ms 90.3% + triton_bmm_37500 0.0091 ms 88.4% + triton_bmm_37492 0.0093 ms 86.3% + triton_bmm_37503 0.0093 ms 86.3% + triton_bmm_37502 0.0096 ms 83.9% +SingleProcess AUTOTUNE takes 3.7627 seconds +AUTOTUNE bmm(16x1x394, 16x394x96) + triton_bmm_37521 0.0091 ms 100.0% + triton_bmm_37522 0.0093 ms 97.3% + triton_bmm_37524 0.0098 ms 92.5% + triton_bmm_37520 0.0098 ms 92.2% + triton_bmm_37519 0.0110 ms 82.3% + bmm 0.0110 ms 82.0% + triton_bmm_37518 0.0114 ms 79.7% + triton_bmm_37517 0.0122 ms 74.3% + triton_bmm_37516 0.0160 ms 56.5% + triton_bmm_37523 0.0169 ms 53.6% +SingleProcess AUTOTUNE takes 3.9982 seconds +AUTOTUNE bmm(16x1x96, 16x96x395) + triton_bmm_37591 0.0081 ms 100.0% + triton_bmm_37594 0.0081 ms 99.6% + triton_bmm_37590 0.0083 ms 97.7% + triton_bmm_37593 0.0085 ms 94.7% + triton_bmm_37596 0.0085 ms 94.4% + triton_bmm_37589 0.0088 ms 91.6% + triton_bmm_37595 0.0091 ms 89.0% + triton_bmm_37588 0.0093 ms 86.6% + triton_bmm_37598 0.0095 ms 84.6% + triton_bmm_37599 0.0099 ms 81.2% +SingleProcess AUTOTUNE takes 4.3499 seconds +AUTOTUNE bmm(16x1x395, 16x395x96) + bmm 0.0109 ms 100.0% + triton_bmm_37621 0.0132 ms 82.2% + triton_bmm_37620 0.0137 ms 79.7% + triton_bmm_37618 0.0160 ms 67.9% + triton_bmm_37616 0.0163 ms 66.9% + triton_bmm_37614 0.0163 ms 66.8% + triton_bmm_37617 0.0163 ms 66.8% + triton_bmm_37613 0.0171 ms 63.8% + triton_bmm_37615 0.0176 ms 61.9% + triton_bmm_37612 0.0210 ms 52.0% +SingleProcess AUTOTUNE takes 5.4967 seconds +AUTOTUNE bmm(16x1x96, 16x96x396) + triton_bmm_37690 0.0081 ms 100.0% + triton_bmm_37686 0.0083 ms 97.7% + triton_bmm_37689 0.0085 ms 94.7% + triton_bmm_37688 0.0086 ms 93.7% + triton_bmm_37685 0.0088 ms 91.6% + triton_bmm_37687 0.0088 ms 91.5% + triton_bmm_37692 0.0090 ms 89.4% + triton_bmm_37684 0.0093 ms 86.6% + triton_bmm_37691 0.0096 ms 84.3% + triton_bmm_37694 0.0096 ms 84.3% +SingleProcess AUTOTUNE takes 4.1369 seconds +AUTOTUNE bmm(16x1x396, 16x396x96) + triton_bmm_37714 0.0093 ms 100.0% + triton_bmm_37713 0.0096 ms 96.7% + triton_bmm_37716 0.0098 ms 94.8% + bmm 0.0100 ms 92.7% + triton_bmm_37712 0.0103 ms 89.8% + triton_bmm_37711 0.0106 ms 87.6% + triton_bmm_37717 0.0107 ms 87.1% + triton_bmm_37710 0.0112 ms 83.1% + triton_bmm_37709 0.0119 ms 78.2% + triton_bmm_37708 0.0158 ms 58.6% +SingleProcess AUTOTUNE takes 4.1954 seconds +AUTOTUNE bmm(16x1x96, 16x96x397) + triton_bmm_37783 0.0080 ms 100.0% + triton_bmm_37784 0.0081 ms 99.6% + triton_bmm_37786 0.0082 ms 98.4% + triton_bmm_37781 0.0083 ms 97.3% + triton_bmm_37782 0.0083 ms 97.3% + triton_bmm_37785 0.0085 ms 94.4% + triton_bmm_37780 0.0088 ms 91.6% + triton_bmm_37787 0.0091 ms 88.7% + triton_bmm_37788 0.0091 ms 88.4% + triton_bmm_37791 0.0094 ms 85.7% +SingleProcess AUTOTUNE takes 3.9837 seconds +AUTOTUNE bmm(16x1x397, 16x397x96) + bmm 0.0108 ms 100.0% + triton_bmm_37813 0.0133 ms 81.0% + triton_bmm_37812 0.0137 ms 78.3% + triton_bmm_37806 0.0163 ms 65.9% + triton_bmm_37809 0.0164 ms 65.8% + triton_bmm_37810 0.0164 ms 65.5% + triton_bmm_37808 0.0166 ms 64.7% + triton_bmm_37805 0.0171 ms 62.9% + triton_bmm_37807 0.0178 ms 60.5% + triton_bmm_37804 0.0207 ms 51.9% +SingleProcess AUTOTUNE takes 4.0886 seconds +AUTOTUNE bmm(16x1x96, 16x96x398) + triton_bmm_37884 0.0085 ms 100.0% + triton_bmm_37880 0.0088 ms 97.4% + triton_bmm_37877 0.0088 ms 97.1% + triton_bmm_37878 0.0088 ms 97.1% + triton_bmm_37879 0.0088 ms 97.1% + triton_bmm_37882 0.0089 ms 96.4% + triton_bmm_37883 0.0090 ms 94.7% + triton_bmm_37881 0.0090 ms 94.5% + triton_bmm_37876 0.0093 ms 91.8% + triton_bmm_37887 0.0093 ms 91.8% +SingleProcess AUTOTUNE takes 3.8999 seconds +AUTOTUNE bmm(16x1x398, 16x398x96) + triton_bmm_37906 0.0093 ms 100.0% + triton_bmm_37905 0.0096 ms 96.7% + triton_bmm_37908 0.0098 ms 95.1% + triton_bmm_37904 0.0100 ms 93.3% + triton_bmm_37903 0.0111 ms 83.7% + bmm 0.0111 ms 83.6% + triton_bmm_37909 0.0111 ms 83.6% + triton_bmm_37902 0.0119 ms 78.0% + triton_bmm_37901 0.0122 ms 76.4% + triton_bmm_37900 0.0161 ms 58.0% +SingleProcess AUTOTUNE takes 4.0938 seconds +AUTOTUNE bmm(16x1x96, 16x96x399) + triton_bmm_37976 0.0087 ms 100.0% + triton_bmm_37972 0.0088 ms 98.9% + triton_bmm_37973 0.0088 ms 98.5% + triton_bmm_37975 0.0088 ms 98.5% + triton_bmm_37974 0.0088 ms 98.2% + triton_bmm_37978 0.0088 ms 98.2% + triton_bmm_37977 0.0091 ms 95.8% + triton_bmm_37979 0.0091 ms 95.8% + triton_bmm_37980 0.0091 ms 95.4% + triton_bmm_37981 0.0095 ms 90.9% +SingleProcess AUTOTUNE takes 4.1258 seconds +AUTOTUNE bmm(16x1x399, 16x399x96) + bmm 0.0114 ms 100.0% + triton_bmm_38004 0.0132 ms 86.4% + triton_bmm_38005 0.0133 ms 85.4% + triton_bmm_38002 0.0160 ms 71.1% + triton_bmm_38000 0.0161 ms 70.9% + triton_bmm_38001 0.0164 ms 69.7% + triton_bmm_37998 0.0169 ms 67.6% + triton_bmm_37997 0.0174 ms 65.3% + triton_bmm_37999 0.0176 ms 64.7% + triton_bmm_37996 0.0212 ms 53.6% +SingleProcess AUTOTUNE takes 3.7542 seconds +AUTOTUNE bmm(16x1x96, 16x96x400) + triton_bmm_38069 0.0080 ms 100.0% + triton_bmm_38072 0.0081 ms 99.2% + triton_bmm_38074 0.0081 ms 99.2% + triton_bmm_38073 0.0085 ms 94.0% + triton_bmm_38071 0.0086 ms 92.9% + triton_bmm_38068 0.0088 ms 91.2% + triton_bmm_38070 0.0088 ms 90.6% + triton_bmm_38075 0.0090 ms 88.7% + triton_bmm_38076 0.0091 ms 88.3% + triton_bmm_38079 0.0093 ms 86.2% +SingleProcess AUTOTUNE takes 4.0108 seconds +AUTOTUNE bmm(16x1x400, 16x400x96) + triton_bmm_38097 0.0093 ms 100.0% + triton_bmm_38098 0.0093 ms 100.0% + triton_bmm_38100 0.0097 ms 95.4% + triton_bmm_38096 0.0098 ms 94.5% + bmm 0.0103 ms 90.1% + triton_bmm_38101 0.0103 ms 89.8% + triton_bmm_38095 0.0111 ms 83.3% + triton_bmm_38094 0.0117 ms 79.5% + triton_bmm_38093 0.0122 ms 76.1% + triton_bmm_38092 0.0153 ms 60.5% +SingleProcess AUTOTUNE takes 4.0474 seconds +AUTOTUNE bmm(16x1x96, 16x96x401) + triton_bmm_38168 0.0081 ms 100.0% + triton_bmm_38170 0.0081 ms 100.0% + triton_bmm_38167 0.0082 ms 98.8% + triton_bmm_38169 0.0085 ms 94.7% + triton_bmm_38166 0.0088 ms 91.6% + triton_bmm_38165 0.0088 ms 91.3% + triton_bmm_38172 0.0091 ms 88.7% + triton_bmm_38164 0.0093 ms 86.6% + triton_bmm_38175 0.0093 ms 86.6% + triton_bmm_38171 0.0096 ms 83.7% +SingleProcess AUTOTUNE takes 3.9014 seconds +AUTOTUNE bmm(16x1x401, 16x401x96) + bmm 0.0104 ms 100.0% + triton_bmm_38197 0.0129 ms 80.3% + triton_bmm_38196 0.0132 ms 78.8% + triton_bmm_38194 0.0161 ms 64.6% + triton_bmm_38192 0.0163 ms 63.8% + triton_bmm_38190 0.0164 ms 63.2% + triton_bmm_38193 0.0166 ms 62.6% + triton_bmm_38189 0.0179 ms 58.2% + triton_bmm_38191 0.0184 ms 56.3% + triton_bmm_38188 0.0210 ms 49.5% +SingleProcess AUTOTUNE takes 3.8519 seconds +AUTOTUNE bmm(16x1x96, 16x96x402) + triton_bmm_38263 0.0080 ms 100.0% + triton_bmm_38264 0.0081 ms 99.6% + triton_bmm_38268 0.0085 ms 94.0% + triton_bmm_38261 0.0088 ms 91.3% + triton_bmm_38262 0.0088 ms 91.3% + triton_bmm_38266 0.0088 ms 90.9% + triton_bmm_38265 0.0089 ms 90.6% + triton_bmm_38260 0.0093 ms 86.3% + triton_bmm_38270 0.0096 ms 83.9% + triton_bmm_38267 0.0096 ms 83.7% +SingleProcess AUTOTUNE takes 3.8155 seconds +AUTOTUNE bmm(16x1x402, 16x402x96) + triton_bmm_38290 0.0093 ms 100.0% + triton_bmm_38289 0.0096 ms 97.3% + triton_bmm_38288 0.0100 ms 92.7% + triton_bmm_38292 0.0103 ms 90.4% + triton_bmm_38287 0.0106 ms 87.9% + triton_bmm_38293 0.0106 ms 87.9% + bmm 0.0108 ms 86.1% + triton_bmm_38286 0.0119 ms 78.0% + triton_bmm_38285 0.0122 ms 76.2% + triton_bmm_38284 0.0161 ms 57.9% +SingleProcess AUTOTUNE takes 4.0234 seconds +AUTOTUNE bmm(16x1x96, 16x96x403) + triton_bmm_38360 0.0081 ms 100.0% + triton_bmm_38357 0.0083 ms 97.7% + triton_bmm_38359 0.0083 ms 97.7% + triton_bmm_38358 0.0083 ms 97.3% + triton_bmm_38361 0.0085 ms 94.7% + triton_bmm_38364 0.0085 ms 94.4% + triton_bmm_38356 0.0088 ms 92.0% + triton_bmm_38362 0.0088 ms 91.3% + triton_bmm_38363 0.0091 ms 89.0% + triton_bmm_38367 0.0093 ms 86.6% +SingleProcess AUTOTUNE takes 3.8878 seconds +AUTOTUNE bmm(16x1x403, 16x403x96) + bmm 0.0106 ms 100.0% + triton_bmm_38389 0.0129 ms 82.2% + triton_bmm_38388 0.0138 ms 77.2% + triton_bmm_38385 0.0166 ms 64.1% + triton_bmm_38386 0.0166 ms 64.0% + triton_bmm_38384 0.0167 ms 63.5% + triton_bmm_38382 0.0169 ms 63.0% + triton_bmm_38381 0.0179 ms 59.5% + triton_bmm_38383 0.0184 ms 57.7% + triton_bmm_38380 0.0215 ms 49.4% +SingleProcess AUTOTUNE takes 3.8404 seconds +AUTOTUNE bmm(16x1x96, 16x96x404) + triton_bmm_38453 0.0080 ms 100.0% + triton_bmm_38454 0.0082 ms 97.7% + triton_bmm_38460 0.0085 ms 94.0% + triton_bmm_38455 0.0086 ms 93.3% + triton_bmm_38456 0.0086 ms 93.0% + triton_bmm_38452 0.0088 ms 91.6% + triton_bmm_38458 0.0088 ms 90.9% + triton_bmm_38459 0.0090 ms 89.0% + triton_bmm_38457 0.0091 ms 88.7% + triton_bmm_38461 0.0093 ms 86.6% +SingleProcess AUTOTUNE takes 3.9112 seconds +AUTOTUNE bmm(16x1x404, 16x404x96) + triton_bmm_38482 0.0093 ms 100.0% + triton_bmm_38481 0.0096 ms 97.0% + bmm 0.0100 ms 92.7% + triton_bmm_38484 0.0103 ms 89.8% + triton_bmm_38480 0.0104 ms 89.4% + triton_bmm_38479 0.0106 ms 87.6% + triton_bmm_38485 0.0109 ms 85.4% + triton_bmm_38478 0.0114 ms 81.7% + triton_bmm_38477 0.0124 ms 74.7% + triton_bmm_38476 0.0155 ms 59.8% +SingleProcess AUTOTUNE takes 4.0892 seconds +AUTOTUNE bmm(16x1x96, 16x96x405) + triton_bmm_38552 0.0081 ms 100.0% + triton_bmm_38554 0.0083 ms 97.7% + triton_bmm_38553 0.0085 ms 94.4% + triton_bmm_38556 0.0085 ms 94.4% + triton_bmm_38550 0.0088 ms 91.6% + triton_bmm_38549 0.0088 ms 91.3% + triton_bmm_38551 0.0088 ms 91.3% + triton_bmm_38548 0.0093 ms 86.3% + triton_bmm_38555 0.0096 ms 83.9% + triton_bmm_38559 0.0100 ms 80.5% +SingleProcess AUTOTUNE takes 3.8003 seconds +AUTOTUNE bmm(16x1x405, 16x405x96) + bmm 0.0107 ms 100.0% + triton_bmm_38580 0.0132 ms 81.1% + triton_bmm_38581 0.0135 ms 79.3% + triton_bmm_38577 0.0160 ms 66.7% + triton_bmm_38578 0.0165 ms 64.6% + triton_bmm_38576 0.0166 ms 64.4% + triton_bmm_38574 0.0169 ms 63.4% + triton_bmm_38573 0.0177 ms 60.5% + triton_bmm_38575 0.0179 ms 59.9% + triton_bmm_38572 0.0212 ms 50.3% +SingleProcess AUTOTUNE takes 3.7276 seconds +AUTOTUNE bmm(16x1x96, 16x96x406) + triton_bmm_38648 0.0080 ms 100.0% + triton_bmm_38644 0.0088 ms 91.6% + triton_bmm_38645 0.0088 ms 91.3% + triton_bmm_38647 0.0088 ms 91.3% + triton_bmm_38646 0.0088 ms 90.9% + triton_bmm_38650 0.0088 ms 90.9% + triton_bmm_38649 0.0091 ms 88.7% + triton_bmm_38652 0.0091 ms 88.4% + triton_bmm_38655 0.0093 ms 86.3% + triton_bmm_38651 0.0096 ms 83.7% +SingleProcess AUTOTUNE takes 4.3603 seconds +AUTOTUNE bmm(16x1x406, 16x406x96) + triton_bmm_38673 0.0096 ms 100.0% + triton_bmm_38674 0.0099 ms 97.4% + triton_bmm_38672 0.0100 ms 95.9% + triton_bmm_38676 0.0104 ms 92.9% + triton_bmm_38671 0.0111 ms 86.7% + triton_bmm_38677 0.0112 ms 86.2% + bmm 0.0114 ms 84.6% + triton_bmm_38669 0.0118 ms 81.4% + triton_bmm_38670 0.0119 ms 80.9% + triton_bmm_38668 0.0156 ms 61.9% +SingleProcess AUTOTUNE takes 4.1373 seconds +AUTOTUNE bmm(16x1x96, 16x96x407) + triton_bmm_38744 0.0081 ms 100.0% + triton_bmm_38741 0.0083 ms 97.7% + triton_bmm_38742 0.0083 ms 97.7% + triton_bmm_38746 0.0083 ms 97.7% + triton_bmm_38745 0.0085 ms 94.7% + triton_bmm_38748 0.0085 ms 94.4% + triton_bmm_38740 0.0088 ms 92.0% + triton_bmm_38743 0.0088 ms 91.3% + triton_bmm_38747 0.0091 ms 89.0% + triton_bmm_38750 0.0096 ms 84.3% +SingleProcess AUTOTUNE takes 4.2500 seconds +AUTOTUNE bmm(16x1x407, 16x407x96) + bmm 0.0109 ms 100.0% + triton_bmm_38773 0.0129 ms 84.2% + triton_bmm_38772 0.0138 ms 79.1% + triton_bmm_38769 0.0161 ms 67.7% + triton_bmm_38768 0.0163 ms 66.7% + triton_bmm_38766 0.0165 ms 65.8% + triton_bmm_38770 0.0167 ms 65.1% + triton_bmm_38765 0.0173 ms 62.7% + triton_bmm_38767 0.0184 ms 59.0% + triton_bmm_38764 0.0210 ms 51.8% +SingleProcess AUTOTUNE takes 4.7012 seconds +AUTOTUNE bmm(16x1x96, 16x96x408) + triton_bmm_38839 0.0080 ms 100.0% + triton_bmm_38842 0.0081 ms 99.6% + triton_bmm_38844 0.0085 ms 94.4% + triton_bmm_38840 0.0087 ms 92.6% + triton_bmm_38837 0.0088 ms 91.3% + triton_bmm_38838 0.0088 ms 91.3% + triton_bmm_38841 0.0091 ms 88.7% + triton_bmm_38843 0.0091 ms 88.7% + triton_bmm_38845 0.0091 ms 88.7% + triton_bmm_38836 0.0093 ms 86.3% +SingleProcess AUTOTUNE takes 3.6403 seconds +AUTOTUNE bmm(16x1x408, 16x408x96) + triton_bmm_38866 0.0093 ms 100.0% + triton_bmm_38864 0.0100 ms 92.4% + triton_bmm_38865 0.0101 ms 91.8% + bmm 0.0103 ms 89.8% + triton_bmm_38868 0.0104 ms 89.5% + triton_bmm_38863 0.0111 ms 83.3% + triton_bmm_38869 0.0112 ms 83.1% + triton_bmm_38862 0.0118 ms 78.6% + triton_bmm_38861 0.0119 ms 78.2% + triton_bmm_38860 0.0155 ms 59.8% +SingleProcess AUTOTUNE takes 4.0012 seconds +AUTOTUNE bmm(16x1x96, 16x96x409) + triton_bmm_38936 0.0081 ms 100.0% + triton_bmm_38938 0.0082 ms 98.8% + triton_bmm_38932 0.0088 ms 92.0% + triton_bmm_38935 0.0088 ms 91.6% + triton_bmm_38933 0.0088 ms 91.3% + triton_bmm_38934 0.0090 ms 89.8% + triton_bmm_38937 0.0091 ms 89.0% + triton_bmm_38939 0.0091 ms 89.0% + triton_bmm_38940 0.0091 ms 89.0% + triton_bmm_38943 0.0094 ms 85.4% +SingleProcess AUTOTUNE takes 3.8524 seconds +AUTOTUNE bmm(16x1x409, 16x409x96) + bmm 0.0111 ms 100.0% + triton_bmm_38965 0.0130 ms 85.7% + triton_bmm_38964 0.0133 ms 83.2% + triton_bmm_38961 0.0161 ms 69.1% + triton_bmm_38962 0.0163 ms 68.2% + triton_bmm_38960 0.0163 ms 68.0% + triton_bmm_38958 0.0171 ms 65.0% + triton_bmm_38957 0.0178 ms 62.4% + triton_bmm_38959 0.0184 ms 60.2% + triton_bmm_38956 0.0215 ms 51.7% +SingleProcess AUTOTUNE takes 4.3573 seconds +AUTOTUNE bmm(16x1x96, 16x96x410) + triton_bmm_39032 0.0080 ms 100.0% + triton_bmm_39034 0.0082 ms 97.7% + triton_bmm_39030 0.0083 ms 97.3% + triton_bmm_39033 0.0085 ms 94.4% + triton_bmm_39036 0.0085 ms 94.4% + triton_bmm_39028 0.0088 ms 91.6% + triton_bmm_39029 0.0088 ms 91.3% + triton_bmm_39031 0.0088 ms 90.9% + triton_bmm_39035 0.0091 ms 88.7% + triton_bmm_39039 0.0093 ms 86.3% +SingleProcess AUTOTUNE takes 4.4455 seconds +AUTOTUNE bmm(16x1x410, 16x410x96) + triton_bmm_39057 0.0093 ms 100.0% + triton_bmm_39060 0.0098 ms 94.5% + triton_bmm_39058 0.0099 ms 94.2% + triton_bmm_39056 0.0101 ms 92.1% + triton_bmm_39055 0.0106 ms 87.6% + bmm 0.0111 ms 83.8% + triton_bmm_39061 0.0112 ms 82.9% + triton_bmm_39053 0.0119 ms 78.2% + triton_bmm_39054 0.0119 ms 77.7% + triton_bmm_39052 0.0161 ms 57.7% +SingleProcess AUTOTUNE takes 4.0958 seconds +AUTOTUNE bmm(16x1x96, 16x96x411) + triton_bmm_39128 0.0081 ms 100.0% + triton_bmm_39127 0.0082 ms 98.1% + triton_bmm_39126 0.0083 ms 97.7% + triton_bmm_39130 0.0083 ms 97.7% + triton_bmm_39129 0.0085 ms 94.7% + triton_bmm_39132 0.0085 ms 94.4% + triton_bmm_39125 0.0088 ms 91.3% + triton_bmm_39131 0.0091 ms 88.7% + triton_bmm_39124 0.0093 ms 86.3% + triton_bmm_39135 0.0095 ms 84.6% +SingleProcess AUTOTUNE takes 3.8491 seconds +AUTOTUNE bmm(16x1x411, 16x411x96) + bmm 0.0119 ms 100.0% + triton_bmm_39156 0.0133 ms 89.4% + triton_bmm_39157 0.0135 ms 88.2% + triton_bmm_39153 0.0161 ms 74.3% + triton_bmm_39150 0.0165 ms 72.1% + triton_bmm_39154 0.0166 ms 71.9% + triton_bmm_39152 0.0168 ms 70.9% + triton_bmm_39149 0.0179 ms 66.7% + triton_bmm_39151 0.0184 ms 64.8% + triton_bmm_39155 0.0228 ms 52.3% +SingleProcess AUTOTUNE takes 3.8318 seconds +AUTOTUNE bmm(16x1x96, 16x96x412) + triton_bmm_39226 0.0080 ms 100.0% + triton_bmm_39222 0.0083 ms 97.3% + triton_bmm_39228 0.0085 ms 94.4% + triton_bmm_39224 0.0086 ms 93.0% + triton_bmm_39223 0.0087 ms 92.6% + triton_bmm_39221 0.0088 ms 91.3% + triton_bmm_39225 0.0091 ms 88.7% + triton_bmm_39231 0.0093 ms 86.3% + triton_bmm_39220 0.0093 ms 86.1% + triton_bmm_39227 0.0096 ms 83.7% +SingleProcess AUTOTUNE takes 3.8953 seconds +AUTOTUNE bmm(16x1x412, 16x412x96) + triton_bmm_39250 0.0093 ms 100.0% + triton_bmm_39249 0.0098 ms 94.8% + triton_bmm_39248 0.0100 ms 92.7% + triton_bmm_39253 0.0103 ms 90.1% + bmm 0.0104 ms 89.8% + triton_bmm_39252 0.0104 ms 89.8% + triton_bmm_39247 0.0111 ms 83.6% + triton_bmm_39246 0.0119 ms 78.2% + triton_bmm_39245 0.0124 ms 74.8% + triton_bmm_39244 0.0161 ms 58.0% +SingleProcess AUTOTUNE takes 4.1497 seconds +AUTOTUNE bmm(16x1x96, 16x96x413) + triton_bmm_39319 0.0082 ms 100.0% + triton_bmm_39318 0.0083 ms 99.2% + triton_bmm_39317 0.0083 ms 98.8% + triton_bmm_39324 0.0085 ms 95.9% + triton_bmm_39316 0.0088 ms 93.4% + triton_bmm_39320 0.0088 ms 92.8% + triton_bmm_39322 0.0088 ms 92.8% + triton_bmm_39323 0.0091 ms 90.5% + triton_bmm_39321 0.0091 ms 90.1% + triton_bmm_39325 0.0096 ms 85.6% +SingleProcess AUTOTUNE takes 3.8502 seconds +AUTOTUNE bmm(16x1x413, 16x413x96) + bmm 0.0118 ms 100.0% + triton_bmm_39349 0.0135 ms 87.4% + triton_bmm_39348 0.0139 ms 84.8% + triton_bmm_39346 0.0163 ms 72.5% + triton_bmm_39344 0.0163 ms 72.4% + triton_bmm_39345 0.0166 ms 71.1% + triton_bmm_39342 0.0171 ms 69.2% + triton_bmm_39341 0.0173 ms 68.1% + triton_bmm_39343 0.0184 ms 64.1% + triton_bmm_39340 0.0210 ms 56.3% +SingleProcess AUTOTUNE takes 4.0176 seconds +AUTOTUNE bmm(16x1x96, 16x96x414) + triton_bmm_39415 0.0080 ms 100.0% + triton_bmm_39416 0.0081 ms 99.6% + triton_bmm_39418 0.0082 ms 97.7% + triton_bmm_39417 0.0085 ms 94.0% + triton_bmm_39413 0.0088 ms 91.3% + triton_bmm_39414 0.0088 ms 90.9% + triton_bmm_39420 0.0091 ms 88.4% + triton_bmm_39412 0.0093 ms 86.0% + triton_bmm_39419 0.0096 ms 83.7% + triton_bmm_39423 0.0099 ms 81.5% +SingleProcess AUTOTUNE takes 3.9523 seconds +AUTOTUNE bmm(16x1x414, 16x414x96) + triton_bmm_39441 0.0093 ms 100.0% + triton_bmm_39442 0.0093 ms 99.7% + triton_bmm_39444 0.0098 ms 94.5% + triton_bmm_39440 0.0101 ms 92.1% + bmm 0.0112 ms 82.9% + triton_bmm_39445 0.0113 ms 82.0% + triton_bmm_39439 0.0115 ms 81.0% + triton_bmm_39437 0.0119 ms 78.0% + triton_bmm_39438 0.0119 ms 77.7% + triton_bmm_39436 0.0161 ms 57.7% +SingleProcess AUTOTUNE takes 4.1417 seconds +AUTOTUNE bmm(16x1x96, 16x96x415) + triton_bmm_39514 0.0083 ms 100.0% + triton_bmm_39513 0.0085 ms 97.0% + triton_bmm_39509 0.0088 ms 93.8% + triton_bmm_39510 0.0088 ms 93.8% + triton_bmm_39511 0.0088 ms 93.8% + triton_bmm_39512 0.0089 ms 93.5% + triton_bmm_39516 0.0091 ms 90.9% + triton_bmm_39508 0.0093 ms 88.7% + triton_bmm_39517 0.0096 ms 86.6% + triton_bmm_39518 0.0096 ms 86.6% +SingleProcess AUTOTUNE takes 3.7904 seconds +AUTOTUNE bmm(16x1x415, 16x415x96) + bmm 0.0124 ms 100.0% + triton_bmm_39541 0.0131 ms 95.1% + triton_bmm_39540 0.0134 ms 92.6% + triton_bmm_39537 0.0153 ms 81.4% + triton_bmm_39538 0.0158 ms 78.7% + triton_bmm_39536 0.0161 ms 77.5% + triton_bmm_39534 0.0161 ms 77.2% + triton_bmm_39535 0.0174 ms 71.6% + triton_bmm_39533 0.0181 ms 68.8% + triton_bmm_39532 0.0220 ms 56.5% +SingleProcess AUTOTUNE takes 4.2710 seconds +AUTOTUNE bmm(16x1x96, 16x96x416) + triton_bmm_39605 0.0080 ms 100.0% + triton_bmm_39606 0.0083 ms 97.3% + triton_bmm_39609 0.0085 ms 94.4% + triton_bmm_39607 0.0086 ms 93.0% + triton_bmm_39610 0.0086 ms 93.0% + triton_bmm_39608 0.0087 ms 92.6% + triton_bmm_39604 0.0088 ms 91.6% + triton_bmm_39613 0.0090 ms 89.0% + triton_bmm_39612 0.0091 ms 88.4% + triton_bmm_39615 0.0093 ms 86.3% +SingleProcess AUTOTUNE takes 4.1787 seconds +AUTOTUNE bmm(16x1x416, 16x416x96) + triton_bmm_39636 0.0098 ms 100.0% + triton_bmm_39633 0.0099 ms 99.7% + triton_bmm_39634 0.0099 ms 99.7% + triton_bmm_39632 0.0101 ms 97.5% + bmm 0.0106 ms 93.0% + triton_bmm_39631 0.0106 ms 92.7% + triton_bmm_39637 0.0106 ms 92.7% + triton_bmm_39629 0.0119 ms 82.7% + triton_bmm_39630 0.0119 ms 82.5% + triton_bmm_39628 0.0161 ms 61.0% +SingleProcess AUTOTUNE takes 4.7174 seconds +AUTOTUNE bmm(16x1x96, 16x96x417) + triton_bmm_39700 0.0088 ms 100.0% + triton_bmm_39702 0.0088 ms 99.6% + triton_bmm_39701 0.0088 ms 99.3% + triton_bmm_39703 0.0088 ms 99.3% + triton_bmm_39704 0.0088 ms 99.3% + triton_bmm_39706 0.0089 ms 98.9% + triton_bmm_39708 0.0091 ms 96.5% + triton_bmm_39705 0.0093 ms 94.5% + triton_bmm_39709 0.0096 ms 91.6% + triton_bmm_39707 0.0098 ms 89.3% +SingleProcess AUTOTUNE takes 4.0654 seconds +AUTOTUNE bmm(16x1x417, 16x417x96) + bmm 0.0103 ms 100.0% + triton_bmm_39733 0.0131 ms 78.9% + triton_bmm_39732 0.0134 ms 76.7% + triton_bmm_39729 0.0169 ms 61.0% + triton_bmm_39726 0.0171 ms 60.4% + triton_bmm_39730 0.0171 ms 60.2% + triton_bmm_39728 0.0172 ms 60.1% + triton_bmm_39725 0.0182 ms 56.7% + triton_bmm_39727 0.0184 ms 56.0% + triton_bmm_39724 0.0215 ms 48.0% +SingleProcess AUTOTUNE takes 4.1247 seconds +AUTOTUNE bmm(16x1x96, 16x96x418) + triton_bmm_39800 0.0081 ms 100.0% + triton_bmm_39799 0.0081 ms 99.6% + triton_bmm_39798 0.0083 ms 97.7% + triton_bmm_39797 0.0083 ms 97.3% + triton_bmm_39804 0.0085 ms 94.4% + triton_bmm_39796 0.0088 ms 92.0% + triton_bmm_39802 0.0088 ms 91.6% + triton_bmm_39801 0.0091 ms 89.0% + triton_bmm_39803 0.0091 ms 89.0% + triton_bmm_39807 0.0093 ms 86.6% +SingleProcess AUTOTUNE takes 3.9011 seconds +AUTOTUNE bmm(16x1x418, 16x418x96) + triton_bmm_39825 0.0093 ms 100.0% + triton_bmm_39826 0.0101 ms 91.8% + triton_bmm_39828 0.0104 ms 89.5% + bmm 0.0108 ms 86.4% + triton_bmm_39823 0.0108 ms 85.8% + triton_bmm_39824 0.0108 ms 85.8% + triton_bmm_39829 0.0113 ms 82.2% + triton_bmm_39821 0.0122 ms 76.6% + triton_bmm_39822 0.0122 ms 76.5% + triton_bmm_39820 0.0166 ms 56.1% +SingleProcess AUTOTUNE takes 4.0697 seconds +AUTOTUNE bmm(16x1x96, 16x96x419) + triton_bmm_39896 0.0081 ms 100.0% + triton_bmm_39898 0.0081 ms 99.2% + triton_bmm_39894 0.0083 ms 97.7% + triton_bmm_39893 0.0083 ms 97.3% + triton_bmm_39900 0.0085 ms 94.4% + triton_bmm_39897 0.0087 ms 92.3% + triton_bmm_39892 0.0088 ms 92.0% + triton_bmm_39895 0.0088 ms 91.1% + triton_bmm_39899 0.0091 ms 89.0% + triton_bmm_39901 0.0096 ms 84.3% +SingleProcess AUTOTUNE takes 4.0887 seconds +AUTOTUNE bmm(16x1x419, 16x419x96) + bmm 0.0110 ms 100.0% + triton_bmm_39925 0.0131 ms 84.4% + triton_bmm_39924 0.0139 ms 79.6% + triton_bmm_39922 0.0166 ms 66.6% + triton_bmm_39920 0.0168 ms 65.7% + triton_bmm_39921 0.0171 ms 64.7% + triton_bmm_39918 0.0172 ms 64.2% + triton_bmm_39919 0.0184 ms 60.1% + triton_bmm_39917 0.0209 ms 52.9% + triton_bmm_39916 0.0220 ms 50.1% +SingleProcess AUTOTUNE takes 3.9934 seconds +AUTOTUNE bmm(16x1x96, 16x96x420) + triton_bmm_39994 0.0081 ms 100.0% + triton_bmm_39989 0.0082 ms 98.4% + triton_bmm_39996 0.0085 ms 94.4% + triton_bmm_39991 0.0086 ms 93.3% + triton_bmm_39992 0.0086 ms 93.3% + triton_bmm_39990 0.0088 ms 91.6% + triton_bmm_39993 0.0091 ms 88.7% + triton_bmm_39988 0.0093 ms 86.6% + triton_bmm_39999 0.0095 ms 84.6% + triton_bmm_39998 0.0096 ms 84.3% +SingleProcess AUTOTUNE takes 3.7673 seconds +AUTOTUNE bmm(16x1x420, 16x420x96) + triton_bmm_40018 0.0095 ms 100.0% + triton_bmm_40017 0.0099 ms 96.4% + triton_bmm_40016 0.0102 ms 93.9% + triton_bmm_40020 0.0104 ms 92.0% + bmm 0.0106 ms 90.3% + triton_bmm_40015 0.0108 ms 87.9% + triton_bmm_40021 0.0109 ms 87.6% + triton_bmm_40013 0.0122 ms 78.4% + triton_bmm_40014 0.0122 ms 78.4% + triton_bmm_40012 0.0161 ms 59.4% +SingleProcess AUTOTUNE takes 4.2084 seconds +AUTOTUNE bmm(16x1x96, 16x96x421) + triton_bmm_40087 0.0082 ms 100.0% + triton_bmm_40088 0.0087 ms 95.0% + triton_bmm_40084 0.0088 ms 93.8% + triton_bmm_40089 0.0088 ms 93.8% + triton_bmm_40085 0.0088 ms 93.1% + triton_bmm_40090 0.0088 ms 93.1% + triton_bmm_40086 0.0088 ms 92.9% + triton_bmm_40091 0.0091 ms 90.5% + triton_bmm_40092 0.0091 ms 90.5% + triton_bmm_40093 0.0096 ms 86.0% +SingleProcess AUTOTUNE takes 3.8791 seconds +AUTOTUNE bmm(16x1x421, 16x421x96) + bmm 0.0113 ms 100.0% + triton_bmm_40116 0.0134 ms 84.3% + triton_bmm_40117 0.0137 ms 82.7% + triton_bmm_40114 0.0166 ms 68.3% + triton_bmm_40113 0.0167 ms 67.8% + triton_bmm_40112 0.0168 ms 67.3% + triton_bmm_40110 0.0175 ms 64.7% + triton_bmm_40109 0.0179 ms 63.4% + triton_bmm_40111 0.0184 ms 61.6% + triton_bmm_40108 0.0220 ms 51.4% +SingleProcess AUTOTUNE takes 3.8457 seconds +AUTOTUNE bmm(16x1x96, 16x96x422) + triton_bmm_40183 0.0081 ms 100.0% + triton_bmm_40184 0.0081 ms 100.0% + triton_bmm_40181 0.0083 ms 97.7% + triton_bmm_40182 0.0083 ms 97.7% + triton_bmm_40185 0.0086 ms 94.0% + triton_bmm_40180 0.0088 ms 91.6% + triton_bmm_40186 0.0088 ms 91.3% + triton_bmm_40188 0.0091 ms 88.7% + triton_bmm_40191 0.0093 ms 86.6% + triton_bmm_40190 0.0096 ms 84.3% +SingleProcess AUTOTUNE takes 3.8195 seconds +AUTOTUNE bmm(16x1x422, 16x422x96) + triton_bmm_40209 0.0093 ms 100.0% + triton_bmm_40210 0.0096 ms 97.3% + triton_bmm_40212 0.0098 ms 94.8% + triton_bmm_40208 0.0108 ms 86.4% + triton_bmm_40207 0.0108 ms 85.8% + triton_bmm_40213 0.0113 ms 82.6% + bmm 0.0114 ms 81.5% + triton_bmm_40205 0.0122 ms 76.6% + triton_bmm_40206 0.0122 ms 76.4% + triton_bmm_40204 0.0163 ms 57.2% +SingleProcess AUTOTUNE takes 4.6934 seconds +AUTOTUNE bmm(16x1x96, 16x96x423) + triton_bmm_40284 0.0085 ms 100.0% + triton_bmm_40280 0.0087 ms 98.5% + triton_bmm_40278 0.0088 ms 97.1% + triton_bmm_40277 0.0088 ms 96.7% + triton_bmm_40279 0.0088 ms 96.7% + triton_bmm_40282 0.0089 ms 96.4% + triton_bmm_40281 0.0091 ms 94.0% + triton_bmm_40276 0.0093 ms 91.4% + triton_bmm_40286 0.0096 ms 89.3% + triton_bmm_40283 0.0098 ms 87.3% +SingleProcess AUTOTUNE takes 4.1683 seconds +AUTOTUNE bmm(16x1x423, 16x423x96) + bmm 0.0114 ms 100.0% + triton_bmm_40309 0.0132 ms 86.4% + triton_bmm_40308 0.0140 ms 81.5% + triton_bmm_40306 0.0166 ms 68.7% + triton_bmm_40305 0.0171 ms 66.7% + triton_bmm_40304 0.0174 ms 65.6% + triton_bmm_40301 0.0179 ms 63.7% + triton_bmm_40303 0.0190 ms 59.8% + triton_bmm_40300 0.0221 ms 51.6% + triton_bmm_40307 0.0236 ms 48.2% +SingleProcess AUTOTUNE takes 4.1802 seconds +AUTOTUNE bmm(16x1x96, 16x96x424) + triton_bmm_40373 0.0081 ms 100.0% + triton_bmm_40376 0.0081 ms 100.0% + triton_bmm_40378 0.0081 ms 100.0% + triton_bmm_40374 0.0083 ms 97.7% + triton_bmm_40380 0.0084 ms 96.6% + triton_bmm_40377 0.0085 ms 94.7% + triton_bmm_40375 0.0087 ms 93.0% + triton_bmm_40379 0.0090 ms 89.4% + triton_bmm_40372 0.0093 ms 86.3% + triton_bmm_40381 0.0096 ms 84.1% +SingleProcess AUTOTUNE takes 4.0884 seconds +AUTOTUNE bmm(16x1x424, 16x424x96) + triton_bmm_40402 0.0095 ms 100.0% + triton_bmm_40401 0.0096 ms 99.7% + bmm 0.0103 ms 92.3% + triton_bmm_40404 0.0104 ms 91.7% + triton_bmm_40400 0.0106 ms 89.8% + triton_bmm_40399 0.0108 ms 87.9% + triton_bmm_40405 0.0108 ms 87.9% + triton_bmm_40398 0.0120 ms 79.3% + triton_bmm_40397 0.0122 ms 78.4% + triton_bmm_40396 0.0166 ms 57.4% +SingleProcess AUTOTUNE takes 4.1293 seconds +AUTOTUNE bmm(16x1x96, 16x96x425) + triton_bmm_40473 0.0085 ms 100.0% + triton_bmm_40472 0.0087 ms 98.5% + triton_bmm_40470 0.0088 ms 97.1% + triton_bmm_40469 0.0088 ms 96.7% + triton_bmm_40471 0.0089 ms 96.4% + triton_bmm_40474 0.0089 ms 96.4% + triton_bmm_40476 0.0091 ms 93.7% + triton_bmm_40468 0.0093 ms 91.4% + triton_bmm_40478 0.0096 ms 89.3% + triton_bmm_40475 0.0098 ms 87.0% +SingleProcess AUTOTUNE takes 3.9816 seconds +AUTOTUNE bmm(16x1x425, 16x425x96) + bmm 0.0116 ms 100.0% + triton_bmm_40501 0.0130 ms 89.3% + triton_bmm_40500 0.0134 ms 86.1% + triton_bmm_40497 0.0166 ms 69.8% + triton_bmm_40498 0.0171 ms 67.6% + triton_bmm_40496 0.0174 ms 66.6% + triton_bmm_40494 0.0176 ms 65.7% + triton_bmm_40493 0.0182 ms 63.5% + triton_bmm_40495 0.0184 ms 62.9% + triton_bmm_40492 0.0220 ms 52.5% +SingleProcess AUTOTUNE takes 3.9222 seconds +AUTOTUNE bmm(16x1x96, 16x96x426) + triton_bmm_40568 0.0081 ms 100.0% + triton_bmm_40567 0.0082 ms 98.8% + triton_bmm_40566 0.0083 ms 97.7% + triton_bmm_40565 0.0088 ms 91.3% + triton_bmm_40570 0.0089 ms 91.0% + triton_bmm_40569 0.0091 ms 89.0% + triton_bmm_40572 0.0091 ms 88.7% + triton_bmm_40564 0.0093 ms 86.3% + triton_bmm_40574 0.0096 ms 84.3% + triton_bmm_40571 0.0097 ms 83.2% +SingleProcess AUTOTUNE takes 3.8758 seconds +AUTOTUNE bmm(16x1x426, 16x426x96) + triton_bmm_40593 0.0093 ms 100.0% + triton_bmm_40596 0.0098 ms 94.8% + triton_bmm_40594 0.0101 ms 92.1% + triton_bmm_40592 0.0108 ms 86.6% + bmm 0.0111 ms 83.6% + triton_bmm_40597 0.0112 ms 83.1% + triton_bmm_40591 0.0114 ms 81.7% + triton_bmm_40590 0.0116 ms 79.9% + triton_bmm_40589 0.0122 ms 76.6% + triton_bmm_40588 0.0168 ms 55.3% +SingleProcess AUTOTUNE takes 4.2324 seconds +AUTOTUNE bmm(16x1x96, 16x96x427) + triton_bmm_40666 0.0082 ms 100.0% + triton_bmm_40663 0.0083 ms 98.8% + triton_bmm_40668 0.0084 ms 96.6% + triton_bmm_40665 0.0087 ms 93.4% + triton_bmm_40662 0.0088 ms 92.7% + triton_bmm_40661 0.0088 ms 92.4% + triton_bmm_40664 0.0088 ms 92.4% + triton_bmm_40660 0.0094 ms 87.0% + triton_bmm_40670 0.0096 ms 85.3% + triton_bmm_40667 0.0098 ms 83.1% +SingleProcess AUTOTUNE takes 3.8198 seconds +AUTOTUNE bmm(16x1x427, 16x427x96) + bmm 0.0119 ms 100.0% + triton_bmm_40693 0.0136 ms 87.2% + triton_bmm_40692 0.0140 ms 85.0% + triton_bmm_40689 0.0166 ms 71.6% + triton_bmm_40686 0.0171 ms 69.3% + triton_bmm_40690 0.0173 ms 68.5% + triton_bmm_40688 0.0174 ms 68.3% + triton_bmm_40685 0.0184 ms 64.5% + triton_bmm_40687 0.0189 ms 62.7% + triton_bmm_40684 0.0220 ms 53.9% +SingleProcess AUTOTUNE takes 4.2488 seconds +AUTOTUNE bmm(16x1x96, 16x96x428) + triton_bmm_40757 0.0083 ms 100.0% + triton_bmm_40758 0.0083 ms 100.0% + triton_bmm_40764 0.0084 ms 98.9% + triton_bmm_40761 0.0085 ms 96.6% + triton_bmm_40760 0.0086 ms 95.6% + triton_bmm_40762 0.0087 ms 95.2% + triton_bmm_40756 0.0088 ms 94.2% + triton_bmm_40759 0.0088 ms 93.6% + triton_bmm_40763 0.0091 ms 91.2% + triton_bmm_40766 0.0096 ms 86.3% +SingleProcess AUTOTUNE takes 5.2237 seconds +AUTOTUNE bmm(16x1x428, 16x428x96) + triton_bmm_40786 0.0096 ms 100.0% + triton_bmm_40785 0.0099 ms 97.1% + triton_bmm_40788 0.0104 ms 92.3% + triton_bmm_40784 0.0107 ms 89.3% + triton_bmm_40789 0.0109 ms 87.9% + triton_bmm_40783 0.0114 ms 84.0% + triton_bmm_40782 0.0116 ms 82.1% + triton_bmm_40781 0.0127 ms 75.3% + bmm 0.0139 ms 68.7% + triton_bmm_40780 0.0162 ms 59.1% +SingleProcess AUTOTUNE takes 3.9908 seconds +AUTOTUNE bmm(16x1x96, 16x96x429) + triton_bmm_40856 0.0081 ms 100.0% + triton_bmm_40858 0.0082 ms 98.8% + triton_bmm_40853 0.0083 ms 97.7% + triton_bmm_40860 0.0085 ms 94.8% + triton_bmm_40857 0.0087 ms 93.4% + triton_bmm_40854 0.0088 ms 92.0% + triton_bmm_40855 0.0088 ms 91.7% + triton_bmm_40859 0.0092 ms 88.2% + triton_bmm_40852 0.0095 ms 85.5% + triton_bmm_40862 0.0097 ms 83.5% +SingleProcess AUTOTUNE takes 4.3413 seconds +AUTOTUNE bmm(16x1x429, 16x429x96) + bmm 0.0119 ms 100.0% + triton_bmm_40884 0.0134 ms 88.8% + triton_bmm_40885 0.0137 ms 86.9% + triton_bmm_40880 0.0170 ms 70.4% + triton_bmm_40878 0.0171 ms 69.7% + triton_bmm_40881 0.0171 ms 69.7% + triton_bmm_40882 0.0172 ms 69.6% + triton_bmm_40877 0.0179 ms 66.7% + triton_bmm_40879 0.0189 ms 63.0% + triton_bmm_40876 0.0218 ms 54.9% +SingleProcess AUTOTUNE takes 4.1257 seconds +AUTOTUNE bmm(16x1x96, 16x96x430) + triton_bmm_40951 0.0081 ms 100.0% + triton_bmm_40952 0.0081 ms 100.0% + triton_bmm_40950 0.0083 ms 97.3% + triton_bmm_40956 0.0084 ms 96.0% + triton_bmm_40953 0.0085 ms 94.4% + triton_bmm_40954 0.0088 ms 91.5% + triton_bmm_40949 0.0088 ms 91.3% + triton_bmm_40959 0.0093 ms 86.6% + triton_bmm_40948 0.0093 ms 86.3% + triton_bmm_40955 0.0096 ms 84.0% +SingleProcess AUTOTUNE takes 4.6735 seconds +AUTOTUNE bmm(16x1x430, 16x430x96) + triton_bmm_40978 0.0096 ms 100.0% + triton_bmm_40977 0.0099 ms 97.1% + triton_bmm_40980 0.0104 ms 92.3% + triton_bmm_40975 0.0108 ms 88.2% + triton_bmm_40976 0.0109 ms 87.9% + triton_bmm_40981 0.0113 ms 84.8% + triton_bmm_40973 0.0122 ms 78.7% + triton_bmm_40974 0.0122 ms 78.5% + triton_bmm_40972 0.0163 ms 58.6% + triton_bmm_40979 0.0182 ms 52.6% +SingleProcess AUTOTUNE takes 4.3520 seconds +AUTOTUNE bmm(16x1x96, 16x96x431) + triton_bmm_41046 0.0083 ms 100.0% + triton_bmm_41048 0.0088 ms 94.5% + triton_bmm_41044 0.0088 ms 94.2% + triton_bmm_41047 0.0088 ms 93.8% + triton_bmm_41050 0.0089 ms 93.5% + triton_bmm_41045 0.0090 ms 92.5% + triton_bmm_41049 0.0091 ms 91.2% + triton_bmm_41052 0.0091 ms 91.2% + triton_bmm_41051 0.0093 ms 89.3% + triton_bmm_41054 0.0096 ms 86.6% +SingleProcess AUTOTUNE takes 4.2066 seconds +AUTOTUNE bmm(16x1x431, 16x431x96) + bmm 0.0127 ms 100.0% + triton_bmm_41076 0.0134 ms 94.3% + triton_bmm_41077 0.0137 ms 92.5% + triton_bmm_41072 0.0171 ms 74.3% + triton_bmm_41073 0.0172 ms 73.7% + triton_bmm_41074 0.0172 ms 73.5% + triton_bmm_41070 0.0173 ms 73.1% + triton_bmm_41069 0.0184 ms 68.9% + triton_bmm_41071 0.0186 ms 68.0% + triton_bmm_41068 0.0218 ms 58.1% +SingleProcess AUTOTUNE takes 4.1206 seconds +AUTOTUNE bmm(16x1x96, 16x96x432) + triton_bmm_41143 0.0081 ms 100.0% + triton_bmm_41144 0.0081 ms 100.0% + triton_bmm_41141 0.0082 ms 98.4% + triton_bmm_41142 0.0083 ms 97.3% + triton_bmm_41148 0.0085 ms 94.7% + triton_bmm_41140 0.0088 ms 92.0% + triton_bmm_41146 0.0088 ms 91.8% + triton_bmm_41145 0.0091 ms 89.0% + triton_bmm_41147 0.0091 ms 89.0% + triton_bmm_41149 0.0091 ms 89.0% +SingleProcess AUTOTUNE takes 3.9421 seconds +AUTOTUNE bmm(16x1x432, 16x432x96) + triton_bmm_41169 0.0095 ms 100.0% + triton_bmm_41170 0.0095 ms 100.0% + triton_bmm_41172 0.0104 ms 92.0% + bmm 0.0106 ms 90.0% + triton_bmm_41168 0.0108 ms 88.2% + triton_bmm_41173 0.0111 ms 85.6% + triton_bmm_41167 0.0114 ms 83.7% + triton_bmm_41166 0.0116 ms 82.1% + triton_bmm_41165 0.0127 ms 75.3% + triton_bmm_41164 0.0166 ms 57.4% +SingleProcess AUTOTUNE takes 4.0189 seconds +AUTOTUNE bmm(16x1x96, 16x96x433) + triton_bmm_41242 0.0083 ms 100.0% + triton_bmm_41236 0.0088 ms 94.2% + triton_bmm_41238 0.0088 ms 93.8% + triton_bmm_41239 0.0088 ms 93.8% + triton_bmm_41240 0.0089 ms 93.5% + triton_bmm_41237 0.0091 ms 91.5% + triton_bmm_41241 0.0091 ms 91.2% + triton_bmm_41244 0.0091 ms 91.2% + triton_bmm_41245 0.0096 ms 86.6% + triton_bmm_41247 0.0096 ms 86.6% +SingleProcess AUTOTUNE takes 4.2036 seconds +AUTOTUNE bmm(16x1x433, 16x433x96) + bmm 0.0111 ms 100.0% + triton_bmm_41269 0.0132 ms 84.5% + triton_bmm_41268 0.0140 ms 79.5% + triton_bmm_41262 0.0173 ms 64.3% + triton_bmm_41265 0.0173 ms 64.2% + triton_bmm_41266 0.0174 ms 64.1% + triton_bmm_41264 0.0174 ms 64.0% + triton_bmm_41261 0.0184 ms 60.4% + triton_bmm_41263 0.0187 ms 59.7% + triton_bmm_41260 0.0224 ms 49.8% +SingleProcess AUTOTUNE takes 3.9051 seconds +AUTOTUNE bmm(16x1x96, 16x96x434) + triton_bmm_41335 0.0083 ms 100.0% + triton_bmm_41336 0.0083 ms 100.0% + triton_bmm_41333 0.0083 ms 99.6% + triton_bmm_41334 0.0083 ms 99.6% + triton_bmm_41338 0.0083 ms 99.6% + triton_bmm_41337 0.0085 ms 97.0% + triton_bmm_41332 0.0088 ms 93.8% + triton_bmm_41340 0.0091 ms 90.8% + triton_bmm_41343 0.0095 ms 86.6% + triton_bmm_41339 0.0098 ms 84.3% +SingleProcess AUTOTUNE takes 3.7510 seconds +AUTOTUNE bmm(16x1x434, 16x434x96) + triton_bmm_41364 0.0098 ms 100.0% + triton_bmm_41361 0.0099 ms 99.7% + triton_bmm_41362 0.0101 ms 97.2% + bmm 0.0106 ms 93.0% + triton_bmm_41365 0.0106 ms 92.5% + triton_bmm_41360 0.0109 ms 90.3% + triton_bmm_41359 0.0114 ms 86.2% + triton_bmm_41358 0.0116 ms 84.3% + triton_bmm_41357 0.0127 ms 77.3% + triton_bmm_41356 0.0163 ms 60.2% +SingleProcess AUTOTUNE takes 4.0838 seconds +AUTOTUNE bmm(16x1x96, 16x96x435) + triton_bmm_41432 0.0083 ms 100.0% + triton_bmm_41429 0.0083 ms 99.6% + triton_bmm_41434 0.0083 ms 99.6% + triton_bmm_41431 0.0088 ms 93.8% + triton_bmm_41430 0.0088 ms 93.7% + triton_bmm_41428 0.0090 ms 91.8% + triton_bmm_41436 0.0091 ms 90.9% + triton_bmm_41433 0.0092 ms 90.6% + triton_bmm_41438 0.0097 ms 85.2% + triton_bmm_41435 0.0098 ms 84.4% +SingleProcess AUTOTUNE takes 4.0030 seconds +AUTOTUNE bmm(16x1x435, 16x435x96) + bmm 0.0116 ms 100.0% + triton_bmm_41461 0.0138 ms 84.4% + triton_bmm_41460 0.0140 ms 83.3% + triton_bmm_41458 0.0168 ms 69.0% + triton_bmm_41457 0.0174 ms 66.9% + triton_bmm_41456 0.0176 ms 66.1% + triton_bmm_41454 0.0177 ms 65.8% + triton_bmm_41453 0.0179 ms 64.8% + triton_bmm_41455 0.0187 ms 62.3% + triton_bmm_41452 0.0224 ms 51.9% +SingleProcess AUTOTUNE takes 3.8941 seconds +AUTOTUNE bmm(16x1x96, 16x96x436) + triton_bmm_41527 0.0083 ms 100.0% + triton_bmm_41525 0.0083 ms 99.6% + triton_bmm_41526 0.0083 ms 99.6% + triton_bmm_41532 0.0085 ms 97.0% + triton_bmm_41528 0.0087 ms 94.9% + triton_bmm_41524 0.0088 ms 94.2% + triton_bmm_41530 0.0089 ms 93.1% + triton_bmm_41529 0.0091 ms 90.8% + triton_bmm_41531 0.0092 ms 89.9% + triton_bmm_41534 0.0097 ms 84.9% +SingleProcess AUTOTUNE takes 4.0483 seconds +AUTOTUNE bmm(16x1x436, 16x436x96) + triton_bmm_41553 0.0093 ms 100.0% + triton_bmm_41554 0.0095 ms 97.7% + triton_bmm_41556 0.0098 ms 94.8% + bmm 0.0103 ms 90.1% + triton_bmm_41552 0.0107 ms 87.4% + triton_bmm_41551 0.0108 ms 85.8% + triton_bmm_41557 0.0109 ms 85.6% + triton_bmm_41549 0.0122 ms 76.6% + triton_bmm_41550 0.0122 ms 76.6% + triton_bmm_41548 0.0166 ms 56.0% +SingleProcess AUTOTUNE takes 4.2264 seconds +AUTOTUNE bmm(16x1x96, 16x96x437) + triton_bmm_41623 0.0083 ms 100.0% + triton_bmm_41626 0.0083 ms 99.6% + triton_bmm_41621 0.0084 ms 99.2% + triton_bmm_41628 0.0085 ms 97.0% + triton_bmm_41624 0.0086 ms 95.9% + triton_bmm_41622 0.0091 ms 91.5% + triton_bmm_41625 0.0091 ms 91.2% + triton_bmm_41620 0.0096 ms 86.6% + triton_bmm_41631 0.0096 ms 86.6% + triton_bmm_41630 0.0098 ms 84.6% +SingleProcess AUTOTUNE takes 4.3106 seconds +AUTOTUNE bmm(16x1x437, 16x437x96) + bmm 0.0119 ms 100.0% + triton_bmm_41653 0.0132 ms 90.1% + triton_bmm_41652 0.0134 ms 88.6% + triton_bmm_41650 0.0168 ms 70.9% + triton_bmm_41649 0.0168 ms 70.7% + triton_bmm_41648 0.0169 ms 70.6% + triton_bmm_41646 0.0173 ms 68.9% + triton_bmm_41645 0.0186 ms 64.0% + triton_bmm_41647 0.0187 ms 63.8% + triton_bmm_41644 0.0222 ms 53.6% +SingleProcess AUTOTUNE takes 4.2318 seconds +AUTOTUNE bmm(16x1x96, 16x96x438) + triton_bmm_41722 0.0083 ms 100.0% + triton_bmm_41724 0.0085 ms 97.4% + triton_bmm_41721 0.0085 ms 97.0% + triton_bmm_41716 0.0088 ms 93.8% + triton_bmm_41718 0.0088 ms 93.8% + triton_bmm_41719 0.0088 ms 93.8% + triton_bmm_41720 0.0089 ms 93.5% + triton_bmm_41717 0.0091 ms 91.5% + triton_bmm_41723 0.0093 ms 89.3% + triton_bmm_41727 0.0095 ms 86.9% +SingleProcess AUTOTUNE takes 3.7793 seconds +AUTOTUNE bmm(16x1x438, 16x438x96) + triton_bmm_41745 0.0093 ms 100.0% + triton_bmm_41748 0.0098 ms 94.8% + triton_bmm_41746 0.0101 ms 92.4% + triton_bmm_41744 0.0103 ms 90.1% + triton_bmm_41743 0.0108 ms 85.8% + bmm 0.0111 ms 83.9% + triton_bmm_41749 0.0113 ms 82.4% + triton_bmm_41742 0.0117 ms 79.3% + triton_bmm_41741 0.0127 ms 73.4% + triton_bmm_41740 0.0163 ms 57.1% +SingleProcess AUTOTUNE takes 4.2819 seconds +AUTOTUNE bmm(16x1x96, 16x96x439) + triton_bmm_41814 0.0083 ms 100.0% + triton_bmm_41815 0.0083 ms 100.0% + triton_bmm_41816 0.0083 ms 99.6% + triton_bmm_41813 0.0084 ms 98.9% + triton_bmm_41817 0.0085 ms 97.0% + triton_bmm_41818 0.0089 ms 93.2% + triton_bmm_41820 0.0091 ms 91.2% + triton_bmm_41812 0.0096 ms 86.6% + triton_bmm_41821 0.0096 ms 86.6% + triton_bmm_41819 0.0098 ms 84.6% +SingleProcess AUTOTUNE takes 3.9725 seconds +AUTOTUNE bmm(16x1x439, 16x439x96) + bmm 0.0122 ms 100.0% + triton_bmm_41845 0.0138 ms 88.6% + triton_bmm_41844 0.0140 ms 87.2% + triton_bmm_41841 0.0168 ms 72.4% + triton_bmm_41842 0.0168 ms 72.4% + triton_bmm_41838 0.0173 ms 70.3% + triton_bmm_41840 0.0176 ms 69.4% + triton_bmm_41837 0.0181 ms 67.3% + triton_bmm_41839 0.0187 ms 65.4% + triton_bmm_41836 0.0225 ms 54.1% +SingleProcess AUTOTUNE takes 4.4576 seconds +AUTOTUNE bmm(16x1x96, 16x96x440) + triton_bmm_41909 0.0083 ms 100.0% + triton_bmm_41911 0.0083 ms 100.0% + triton_bmm_41914 0.0083 ms 99.6% + triton_bmm_41908 0.0088 ms 93.8% + triton_bmm_41912 0.0089 ms 93.1% + triton_bmm_41910 0.0090 ms 91.5% + triton_bmm_41913 0.0091 ms 91.2% + triton_bmm_41916 0.0091 ms 91.0% + triton_bmm_41915 0.0092 ms 89.9% + triton_bmm_41917 0.0096 ms 86.0% +SingleProcess AUTOTUNE takes 4.0044 seconds +AUTOTUNE bmm(16x1x440, 16x440x96) + triton_bmm_41938 0.0095 ms 100.0% + triton_bmm_41937 0.0098 ms 97.1% + triton_bmm_41940 0.0101 ms 94.6% + triton_bmm_41936 0.0103 ms 92.5% + bmm 0.0109 ms 87.4% + triton_bmm_41935 0.0114 ms 83.8% + triton_bmm_41941 0.0114 ms 83.7% + triton_bmm_41933 0.0122 ms 78.4% + triton_bmm_41934 0.0122 ms 78.4% + triton_bmm_41932 0.0168 ms 56.7% +SingleProcess AUTOTUNE takes 3.9934 seconds +AUTOTUNE bmm(16x1x96, 16x96x441) + triton_bmm_42008 0.0083 ms 100.0% + triton_bmm_42010 0.0083 ms 100.0% + triton_bmm_42005 0.0084 ms 98.9% + triton_bmm_42007 0.0088 ms 94.2% + triton_bmm_42006 0.0090 ms 92.5% + triton_bmm_42004 0.0090 ms 92.2% + triton_bmm_42009 0.0091 ms 91.5% + triton_bmm_42012 0.0091 ms 91.5% + triton_bmm_42015 0.0096 ms 87.0% + triton_bmm_42011 0.0098 ms 84.7% +SingleProcess AUTOTUNE takes 3.9611 seconds +AUTOTUNE bmm(16x1x441, 16x441x96) + bmm 0.0120 ms 100.0% + triton_bmm_42037 0.0132 ms 90.8% + triton_bmm_42036 0.0140 ms 85.8% + triton_bmm_42034 0.0169 ms 71.2% + triton_bmm_42032 0.0171 ms 70.2% + triton_bmm_42030 0.0173 ms 69.2% + triton_bmm_42033 0.0174 ms 69.1% + triton_bmm_42029 0.0184 ms 65.1% + triton_bmm_42031 0.0187 ms 64.3% + triton_bmm_42028 0.0220 ms 54.5% +SingleProcess AUTOTUNE takes 3.9848 seconds +AUTOTUNE bmm(16x1x96, 16x96x442) + triton_bmm_42101 0.0083 ms 100.0% + triton_bmm_42102 0.0083 ms 100.0% + triton_bmm_42108 0.0085 ms 97.0% + triton_bmm_42103 0.0088 ms 93.7% + triton_bmm_42104 0.0089 ms 93.5% + triton_bmm_42106 0.0089 ms 93.5% + triton_bmm_42105 0.0091 ms 91.2% + triton_bmm_42107 0.0093 ms 89.3% + triton_bmm_42100 0.0095 ms 87.2% + triton_bmm_42110 0.0098 ms 84.4% +SingleProcess AUTOTUNE takes 3.8366 seconds +AUTOTUNE bmm(16x1x442, 16x442x96) + triton_bmm_42132 0.0098 ms 100.0% + triton_bmm_42129 0.0099 ms 99.7% + triton_bmm_42130 0.0101 ms 97.3% + triton_bmm_42128 0.0103 ms 95.0% + triton_bmm_42133 0.0106 ms 92.7% + triton_bmm_42127 0.0108 ms 90.6% + bmm 0.0111 ms 88.2% + triton_bmm_42125 0.0122 ms 80.8% + triton_bmm_42126 0.0122 ms 80.4% + triton_bmm_42124 0.0168 ms 58.4% +SingleProcess AUTOTUNE takes 4.1174 seconds +AUTOTUNE bmm(16x1x96, 16x96x443) + triton_bmm_42198 0.0083 ms 100.0% + triton_bmm_42199 0.0083 ms 100.0% + triton_bmm_42202 0.0083 ms 99.6% + triton_bmm_42204 0.0085 ms 97.0% + triton_bmm_42200 0.0089 ms 93.5% + triton_bmm_42197 0.0091 ms 91.5% + triton_bmm_42201 0.0093 ms 89.5% + triton_bmm_42196 0.0095 ms 86.9% + triton_bmm_42205 0.0096 ms 86.6% + triton_bmm_42207 0.0096 ms 86.6% +SingleProcess AUTOTUNE takes 3.8832 seconds +AUTOTUNE bmm(16x1x443, 16x443x96) + bmm 0.0126 ms 100.0% + triton_bmm_42229 0.0137 ms 91.6% + triton_bmm_42228 0.0140 ms 89.9% + triton_bmm_42225 0.0168 ms 74.7% + triton_bmm_42224 0.0171 ms 73.6% + triton_bmm_42226 0.0174 ms 72.3% + triton_bmm_42222 0.0179 ms 70.4% + triton_bmm_42221 0.0184 ms 68.2% + triton_bmm_42223 0.0187 ms 67.4% + triton_bmm_42220 0.0225 ms 56.0% +SingleProcess AUTOTUNE takes 3.8484 seconds +AUTOTUNE bmm(16x1x96, 16x96x444) + triton_bmm_42296 0.0081 ms 100.0% + triton_bmm_42294 0.0083 ms 98.1% + triton_bmm_42293 0.0088 ms 91.7% + triton_bmm_42295 0.0088 ms 91.7% + triton_bmm_42298 0.0088 ms 91.7% + triton_bmm_42297 0.0091 ms 89.1% + triton_bmm_42300 0.0091 ms 89.1% + triton_bmm_42292 0.0093 ms 86.6% + triton_bmm_42299 0.0096 ms 84.1% + triton_bmm_42302 0.0098 ms 82.7% +SingleProcess AUTOTUNE takes 3.9609 seconds +AUTOTUNE bmm(16x1x444, 16x444x96) + triton_bmm_42321 0.0093 ms 100.0% + triton_bmm_42324 0.0098 ms 94.8% + triton_bmm_42322 0.0101 ms 92.4% + bmm 0.0101 ms 92.1% + triton_bmm_42320 0.0103 ms 90.4% + triton_bmm_42325 0.0103 ms 90.1% + triton_bmm_42319 0.0114 ms 81.7% + triton_bmm_42318 0.0122 ms 76.4% + triton_bmm_42317 0.0127 ms 73.4% + triton_bmm_42316 0.0166 ms 56.1% +SingleProcess AUTOTUNE takes 3.9054 seconds +AUTOTUNE bmm(16x1x96, 16x96x445) + triton_bmm_42394 0.0083 ms 100.0% + triton_bmm_42389 0.0085 ms 97.7% + triton_bmm_42396 0.0085 ms 97.4% + triton_bmm_42393 0.0088 ms 94.9% + triton_bmm_42391 0.0088 ms 94.2% + triton_bmm_42392 0.0089 ms 93.9% + triton_bmm_42388 0.0090 ms 92.2% + triton_bmm_42390 0.0091 ms 91.9% + triton_bmm_42399 0.0096 ms 87.0% + triton_bmm_42395 0.0098 ms 84.8% +SingleProcess AUTOTUNE takes 3.8697 seconds +AUTOTUNE bmm(16x1x445, 16x445x96) + bmm 0.0125 ms 100.0% + triton_bmm_42421 0.0132 ms 94.9% + triton_bmm_42420 0.0140 ms 89.3% + triton_bmm_42417 0.0168 ms 74.3% + triton_bmm_42418 0.0169 ms 74.1% + triton_bmm_42416 0.0171 ms 73.2% + triton_bmm_42414 0.0179 ms 70.1% + triton_bmm_42413 0.0184 ms 67.9% + triton_bmm_42415 0.0187 ms 67.1% + triton_bmm_42412 0.0220 ms 56.8% +SingleProcess AUTOTUNE takes 4.8723 seconds +AUTOTUNE bmm(16x1x96, 16x96x446) + triton_bmm_42485 0.0083 ms 100.0% + triton_bmm_42488 0.0083 ms 100.0% + triton_bmm_42489 0.0085 ms 97.0% + triton_bmm_42492 0.0085 ms 97.0% + triton_bmm_42487 0.0088 ms 93.8% + triton_bmm_42490 0.0089 ms 93.5% + triton_bmm_42484 0.0090 ms 91.8% + triton_bmm_42486 0.0091 ms 91.5% + triton_bmm_42495 0.0095 ms 86.9% + triton_bmm_42491 0.0098 ms 84.8% +SingleProcess AUTOTUNE takes 3.8271 seconds +AUTOTUNE bmm(16x1x446, 16x446x96) + triton_bmm_42513 0.0093 ms 100.0% + triton_bmm_42514 0.0101 ms 92.1% + triton_bmm_42516 0.0104 ms 89.8% + triton_bmm_42512 0.0108 ms 86.1% + triton_bmm_42517 0.0112 ms 83.4% + triton_bmm_42511 0.0114 ms 81.7% + triton_bmm_42510 0.0117 ms 79.7% + triton_bmm_42509 0.0122 ms 76.6% + triton_bmm_42508 0.0168 ms 55.3% + triton_bmm_42515 0.0182 ms 51.1% +SingleProcess AUTOTUNE takes 4.3676 seconds +AUTOTUNE bmm(16x1x96, 16x96x447) + triton_bmm_42583 0.0083 ms 100.0% + triton_bmm_42586 0.0083 ms 99.6% + triton_bmm_42581 0.0085 ms 97.4% + triton_bmm_42585 0.0087 ms 95.2% + triton_bmm_42584 0.0089 ms 93.5% + triton_bmm_42580 0.0090 ms 91.8% + triton_bmm_42582 0.0091 ms 91.5% + triton_bmm_42588 0.0091 ms 91.2% + triton_bmm_42589 0.0096 ms 86.6% + triton_bmm_42587 0.0098 ms 84.2% +SingleProcess AUTOTUNE takes 4.4981 seconds +AUTOTUNE bmm(16x1x447, 16x447x96) + bmm 0.0129 ms 100.0% + triton_bmm_42613 0.0132 ms 97.8% + triton_bmm_42612 0.0139 ms 92.9% + triton_bmm_42608 0.0161 ms 80.5% + triton_bmm_42609 0.0161 ms 80.3% + triton_bmm_42606 0.0163 ms 79.2% + triton_bmm_42610 0.0164 ms 79.1% + triton_bmm_42607 0.0174 ms 74.3% + triton_bmm_42605 0.0187 ms 69.3% + triton_bmm_42604 0.0231 ms 56.0% +SingleProcess AUTOTUNE takes 3.8294 seconds +AUTOTUNE bmm(16x1x96, 16x96x448) + triton_bmm_42679 0.0083 ms 100.0% + triton_bmm_42678 0.0083 ms 99.6% + triton_bmm_42684 0.0086 ms 96.3% + triton_bmm_42677 0.0088 ms 93.8% + triton_bmm_42680 0.0089 ms 93.1% + triton_bmm_42682 0.0089 ms 93.1% + triton_bmm_42681 0.0091 ms 90.8% + triton_bmm_42685 0.0093 ms 89.0% + triton_bmm_42676 0.0095 ms 87.3% + triton_bmm_42683 0.0098 ms 84.0% +SingleProcess AUTOTUNE takes 3.7704 seconds +AUTOTUNE bmm(16x1x448, 16x448x96) + triton_bmm_42708 0.0098 ms 100.0% + triton_bmm_42706 0.0099 ms 99.5% + triton_bmm_42704 0.0101 ms 97.5% + triton_bmm_42705 0.0101 ms 97.5% + bmm 0.0106 ms 92.7% + triton_bmm_42703 0.0108 ms 90.6% + triton_bmm_42709 0.0109 ms 90.3% + triton_bmm_42702 0.0122 ms 80.8% + triton_bmm_42701 0.0127 ms 77.3% + triton_bmm_42700 0.0168 ms 58.4% +SingleProcess AUTOTUNE takes 3.8050 seconds +AUTOTUNE bmm(16x1x96, 16x96x449) + triton_bmm_42775 0.0083 ms 100.0% + triton_bmm_42773 0.0085 ms 97.4% + triton_bmm_42774 0.0085 ms 97.0% + triton_bmm_42780 0.0088 ms 94.2% + triton_bmm_42776 0.0089 ms 93.2% + triton_bmm_42778 0.0091 ms 90.9% + triton_bmm_42777 0.0093 ms 89.0% + triton_bmm_42772 0.0096 ms 86.3% + triton_bmm_42779 0.0098 ms 84.4% + triton_bmm_42782 0.0098 ms 84.4% +SingleProcess AUTOTUNE takes 3.7764 seconds +AUTOTUNE bmm(16x1x449, 16x449x96) + bmm 0.0104 ms 100.0% + triton_bmm_42805 0.0127 ms 81.9% + triton_bmm_42804 0.0140 ms 74.1% + triton_bmm_42800 0.0173 ms 59.8% + triton_bmm_42801 0.0177 ms 58.7% + triton_bmm_42802 0.0177 ms 58.7% + triton_bmm_42798 0.0182 ms 57.0% + triton_bmm_42797 0.0185 ms 56.1% + triton_bmm_42799 0.0192 ms 54.1% + triton_bmm_42796 0.0226 ms 46.0% +SingleProcess AUTOTUNE takes 3.9205 seconds +AUTOTUNE bmm(16x1x96, 16x96x450) + triton_bmm_42871 0.0083 ms 100.0% + triton_bmm_42869 0.0083 ms 99.6% + triton_bmm_42874 0.0084 ms 99.2% + triton_bmm_42872 0.0089 ms 93.2% + triton_bmm_42868 0.0090 ms 91.8% + triton_bmm_42870 0.0091 ms 91.5% + triton_bmm_42876 0.0091 ms 91.0% + triton_bmm_42873 0.0092 ms 89.8% + triton_bmm_42875 0.0098 ms 84.6% + triton_bmm_42878 0.0098 ms 84.4% +SingleProcess AUTOTUNE takes 3.7845 seconds +AUTOTUNE bmm(16x1x450, 16x450x96) + triton_bmm_42897 0.0095 ms 100.0% + triton_bmm_42900 0.0101 ms 94.6% + triton_bmm_42898 0.0102 ms 93.4% + triton_bmm_42901 0.0106 ms 90.0% + triton_bmm_42896 0.0109 ms 87.4% + triton_bmm_42895 0.0116 ms 81.9% + triton_bmm_42894 0.0124 ms 76.6% + triton_bmm_42893 0.0130 ms 73.6% + bmm 0.0143 ms 66.7% + triton_bmm_42892 0.0174 ms 54.9% +SingleProcess AUTOTUNE takes 3.9967 seconds +AUTOTUNE bmm(16x1x96, 16x96x451) + triton_bmm_42967 0.0088 ms 100.0% + triton_bmm_42968 0.0089 ms 98.9% + triton_bmm_42965 0.0091 ms 97.5% + triton_bmm_42966 0.0091 ms 97.5% + triton_bmm_42970 0.0091 ms 97.2% + triton_bmm_42972 0.0091 ms 96.8% + triton_bmm_42969 0.0093 ms 94.8% + triton_bmm_42971 0.0093 ms 94.8% + triton_bmm_42964 0.0096 ms 92.3% + triton_bmm_42975 0.0096 ms 92.0% +SingleProcess AUTOTUNE takes 3.9018 seconds +AUTOTUNE bmm(16x1x451, 16x451x96) + bmm 0.0103 ms 100.0% + triton_bmm_42997 0.0122 ms 85.0% + triton_bmm_42996 0.0140 ms 73.9% + triton_bmm_42994 0.0173 ms 59.6% + triton_bmm_42992 0.0174 ms 59.5% + triton_bmm_42993 0.0177 ms 58.5% + triton_bmm_42990 0.0182 ms 56.9% + triton_bmm_42989 0.0184 ms 56.2% + triton_bmm_42991 0.0197 ms 52.5% + triton_bmm_42988 0.0231 ms 44.7% +SingleProcess AUTOTUNE takes 3.7654 seconds +AUTOTUNE bmm(16x1x96, 16x96x452) + triton_bmm_43061 0.0083 ms 100.0% + triton_bmm_43062 0.0084 ms 98.5% + triton_bmm_43065 0.0088 ms 94.5% + triton_bmm_43063 0.0088 ms 93.8% + triton_bmm_43064 0.0089 ms 93.5% + triton_bmm_43066 0.0089 ms 93.5% + triton_bmm_43060 0.0090 ms 91.8% + triton_bmm_43068 0.0091 ms 91.2% + triton_bmm_43069 0.0095 ms 86.9% + triton_bmm_43071 0.0096 ms 86.6% +SingleProcess AUTOTUNE takes 3.7099 seconds +AUTOTUNE bmm(16x1x452, 16x452x96) + triton_bmm_43090 0.0096 ms 100.0% + triton_bmm_43089 0.0101 ms 95.1% + triton_bmm_43092 0.0101 ms 94.9% + triton_bmm_43088 0.0109 ms 87.9% + triton_bmm_43093 0.0109 ms 87.9% + triton_bmm_43087 0.0116 ms 82.1% + triton_bmm_43086 0.0119 ms 80.4% + triton_bmm_43085 0.0126 ms 75.7% + bmm 0.0148 ms 64.4% + triton_bmm_43084 0.0174 ms 55.1% +SingleProcess AUTOTUNE takes 4.2284 seconds +AUTOTUNE bmm(16x1x96, 16x96x453) + triton_bmm_43157 0.0085 ms 100.0% + triton_bmm_43158 0.0085 ms 99.6% + triton_bmm_43164 0.0088 ms 96.4% + triton_bmm_43159 0.0089 ms 96.0% + triton_bmm_43160 0.0090 ms 95.0% + triton_bmm_43162 0.0091 ms 93.7% + triton_bmm_43163 0.0093 ms 91.4% + triton_bmm_43161 0.0093 ms 91.1% + triton_bmm_43167 0.0096 ms 89.0% + triton_bmm_43156 0.0096 ms 88.7% +SingleProcess AUTOTUNE takes 3.8011 seconds +AUTOTUNE bmm(16x1x453, 16x453x96) + bmm 0.0104 ms 100.0% + triton_bmm_43189 0.0127 ms 82.2% + triton_bmm_43188 0.0145 ms 71.9% + triton_bmm_43186 0.0173 ms 60.2% + triton_bmm_43185 0.0179 ms 58.4% + triton_bmm_43184 0.0179 ms 58.4% + triton_bmm_43182 0.0182 ms 57.5% + triton_bmm_43181 0.0186 ms 56.1% + triton_bmm_43183 0.0192 ms 54.5% + triton_bmm_43180 0.0225 ms 46.4% +SingleProcess AUTOTUNE takes 4.3055 seconds +AUTOTUNE bmm(16x1x96, 16x96x454) + triton_bmm_43256 0.0083 ms 100.0% + triton_bmm_43254 0.0085 ms 97.7% + triton_bmm_43255 0.0088 ms 94.2% + triton_bmm_43258 0.0089 ms 93.5% + triton_bmm_43253 0.0090 ms 92.2% + triton_bmm_43257 0.0091 ms 91.2% + triton_bmm_43260 0.0091 ms 91.2% + triton_bmm_43259 0.0093 ms 89.7% + triton_bmm_43263 0.0096 ms 87.0% + triton_bmm_43252 0.0096 ms 86.7% +SingleProcess AUTOTUNE takes 4.1308 seconds +AUTOTUNE bmm(16x1x454, 16x454x96) + triton_bmm_43282 0.0098 ms 100.0% + triton_bmm_43284 0.0101 ms 97.1% + triton_bmm_43281 0.0101 ms 96.8% + triton_bmm_43280 0.0106 ms 92.7% + triton_bmm_43285 0.0107 ms 91.9% + triton_bmm_43279 0.0111 ms 88.2% + triton_bmm_43278 0.0122 ms 80.5% + triton_bmm_43277 0.0130 ms 75.6% + bmm 0.0144 ms 68.2% + triton_bmm_43276 0.0174 ms 56.2% +SingleProcess AUTOTUNE takes 3.9412 seconds +AUTOTUNE bmm(16x1x96, 16x96x455) + triton_bmm_43351 0.0083 ms 100.0% + triton_bmm_43352 0.0083 ms 99.6% + triton_bmm_43354 0.0084 ms 99.2% + triton_bmm_43349 0.0085 ms 97.7% + triton_bmm_43353 0.0088 ms 94.5% + triton_bmm_43356 0.0088 ms 94.2% + triton_bmm_43350 0.0091 ms 91.2% + triton_bmm_43355 0.0093 ms 89.0% + triton_bmm_43348 0.0096 ms 86.3% + triton_bmm_43359 0.0097 ms 85.5% +SingleProcess AUTOTUNE takes 3.8143 seconds +AUTOTUNE bmm(16x1x455, 16x455x96) + bmm 0.0103 ms 100.0% + triton_bmm_43381 0.0122 ms 85.0% + triton_bmm_43380 0.0141 ms 73.2% + triton_bmm_43376 0.0176 ms 58.7% + triton_bmm_43377 0.0179 ms 57.8% + triton_bmm_43378 0.0179 ms 57.8% + triton_bmm_43374 0.0184 ms 56.2% + triton_bmm_43373 0.0192 ms 53.8% + triton_bmm_43375 0.0194 ms 53.2% + triton_bmm_43379 0.0254 ms 40.6% +SingleProcess AUTOTUNE takes 3.9267 seconds +AUTOTUNE bmm(16x1x96, 16x96x456) + triton_bmm_43450 0.0083 ms 100.0% + triton_bmm_43449 0.0086 ms 97.0% + triton_bmm_43445 0.0088 ms 94.5% + triton_bmm_43447 0.0088 ms 94.2% + triton_bmm_43448 0.0089 ms 93.9% + triton_bmm_43444 0.0090 ms 92.2% + triton_bmm_43446 0.0091 ms 91.9% + triton_bmm_43452 0.0093 ms 89.3% + triton_bmm_43455 0.0096 ms 87.0% + triton_bmm_43451 0.0098 ms 84.7% +SingleProcess AUTOTUNE takes 4.1575 seconds +AUTOTUNE bmm(16x1x456, 16x456x96) + triton_bmm_43474 0.0096 ms 100.0% + triton_bmm_43472 0.0103 ms 92.6% + bmm 0.0106 ms 90.6% + triton_bmm_43473 0.0106 ms 90.3% + triton_bmm_43476 0.0107 ms 89.7% + triton_bmm_43477 0.0109 ms 87.9% + triton_bmm_43471 0.0111 ms 86.2% + triton_bmm_43470 0.0119 ms 80.4% + triton_bmm_43469 0.0132 ms 72.7% + triton_bmm_43468 0.0168 ms 56.8% +SingleProcess AUTOTUNE takes 4.1430 seconds +AUTOTUNE bmm(16x1x96, 16x96x457) + triton_bmm_43543 0.0083 ms 100.0% + triton_bmm_43544 0.0084 ms 99.2% + triton_bmm_43546 0.0085 ms 97.4% + triton_bmm_43542 0.0085 ms 97.0% + triton_bmm_43545 0.0088 ms 94.5% + triton_bmm_43548 0.0088 ms 93.8% + triton_bmm_43541 0.0091 ms 91.5% + triton_bmm_43547 0.0093 ms 89.3% + triton_bmm_43540 0.0096 ms 86.6% + triton_bmm_43549 0.0098 ms 84.4% +SingleProcess AUTOTUNE takes 3.9424 seconds +AUTOTUNE bmm(16x1x457, 16x457x96) + bmm 0.0104 ms 100.0% + triton_bmm_43573 0.0122 ms 85.3% + triton_bmm_43572 0.0140 ms 74.1% + triton_bmm_43569 0.0173 ms 59.8% + triton_bmm_43570 0.0173 ms 59.8% + triton_bmm_43568 0.0176 ms 58.9% + triton_bmm_43566 0.0184 ms 56.4% + triton_bmm_43565 0.0187 ms 55.6% + triton_bmm_43567 0.0198 ms 52.5% + triton_bmm_43564 0.0230 ms 45.0% +SingleProcess AUTOTUNE takes 3.8607 seconds +AUTOTUNE bmm(16x1x96, 16x96x458) + triton_bmm_43639 0.0083 ms 100.0% + triton_bmm_43640 0.0083 ms 99.6% + triton_bmm_43641 0.0088 ms 94.4% + triton_bmm_43644 0.0088 ms 93.8% + triton_bmm_43642 0.0089 ms 93.2% + triton_bmm_43637 0.0091 ms 91.5% + triton_bmm_43638 0.0091 ms 91.2% + triton_bmm_43636 0.0096 ms 86.6% + triton_bmm_43643 0.0097 ms 85.5% + triton_bmm_43647 0.0101 ms 82.0% +SingleProcess AUTOTUNE takes 4.0844 seconds +AUTOTUNE bmm(16x1x458, 16x458x96) + triton_bmm_43666 0.0098 ms 100.0% + triton_bmm_43665 0.0101 ms 96.8% + triton_bmm_43668 0.0106 ms 92.2% + triton_bmm_43669 0.0108 ms 90.5% + triton_bmm_43664 0.0109 ms 89.6% + triton_bmm_43663 0.0111 ms 88.2% + triton_bmm_43662 0.0121 ms 80.7% + triton_bmm_43661 0.0130 ms 75.1% + bmm 0.0147 ms 66.5% + triton_bmm_43660 0.0168 ms 58.2% +SingleProcess AUTOTUNE takes 4.1896 seconds +AUTOTUNE bmm(16x1x96, 16x96x459) + triton_bmm_43738 0.0084 ms 100.0% + triton_bmm_43737 0.0088 ms 96.0% + triton_bmm_43735 0.0089 ms 95.3% + triton_bmm_43736 0.0090 ms 94.3% + triton_bmm_43732 0.0090 ms 93.6% + triton_bmm_43733 0.0091 ms 93.3% + triton_bmm_43734 0.0091 ms 93.3% + triton_bmm_43740 0.0094 ms 89.8% + triton_bmm_43742 0.0098 ms 86.0% + triton_bmm_43739 0.0099 ms 85.7% +SingleProcess AUTOTUNE takes 4.1090 seconds +AUTOTUNE bmm(16x1x459, 16x459x96) + bmm 0.0110 ms 100.0% + triton_bmm_43765 0.0127 ms 86.4% + triton_bmm_43764 0.0142 ms 77.3% + triton_bmm_43760 0.0176 ms 62.4% + triton_bmm_43761 0.0179 ms 61.4% + triton_bmm_43762 0.0179 ms 61.2% + triton_bmm_43758 0.0184 ms 59.7% + triton_bmm_43757 0.0192 ms 57.2% + triton_bmm_43759 0.0194 ms 56.5% + triton_bmm_43756 0.0228 ms 48.1% +SingleProcess AUTOTUNE takes 3.9311 seconds +AUTOTUNE bmm(16x1x96, 16x96x460) + triton_bmm_43832 0.0084 ms 100.0% + triton_bmm_43831 0.0088 ms 94.9% + triton_bmm_43834 0.0089 ms 93.9% + triton_bmm_43829 0.0090 ms 92.9% + triton_bmm_43830 0.0091 ms 91.9% + triton_bmm_43835 0.0093 ms 90.0% + triton_bmm_43836 0.0093 ms 89.8% + triton_bmm_43833 0.0093 ms 89.4% + triton_bmm_43828 0.0096 ms 87.3% + triton_bmm_43838 0.0098 ms 85.0% +SingleProcess AUTOTUNE takes 3.9290 seconds +AUTOTUNE bmm(16x1x460, 16x460x96) + triton_bmm_43857 0.0096 ms 100.0% + triton_bmm_43858 0.0096 ms 100.0% + triton_bmm_43860 0.0101 ms 94.9% + triton_bmm_43856 0.0108 ms 88.3% + triton_bmm_43861 0.0109 ms 87.7% + triton_bmm_43855 0.0116 ms 82.1% + triton_bmm_43854 0.0119 ms 80.4% + triton_bmm_43853 0.0132 ms 72.4% + bmm 0.0147 ms 65.3% + triton_bmm_43852 0.0168 ms 56.8% +SingleProcess AUTOTUNE takes 4.2656 seconds +AUTOTUNE bmm(16x1x96, 16x96x461) + triton_bmm_43927 0.0083 ms 100.0% + triton_bmm_43928 0.0084 ms 99.2% + triton_bmm_43930 0.0084 ms 98.7% + triton_bmm_43925 0.0085 ms 97.7% + triton_bmm_43926 0.0085 ms 97.0% + triton_bmm_43932 0.0088 ms 93.8% + triton_bmm_43924 0.0090 ms 91.8% + triton_bmm_43929 0.0093 ms 88.7% + triton_bmm_43935 0.0097 ms 85.1% + triton_bmm_43933 0.0098 ms 84.4% +SingleProcess AUTOTUNE takes 4.0463 seconds +AUTOTUNE bmm(16x1x461, 16x461x96) + bmm 0.0113 ms 100.0% + triton_bmm_43957 0.0127 ms 88.7% + triton_bmm_43956 0.0146 ms 77.1% + triton_bmm_43953 0.0175 ms 64.5% + triton_bmm_43950 0.0179 ms 63.1% + triton_bmm_43954 0.0179 ms 63.0% + triton_bmm_43952 0.0181 ms 62.1% + triton_bmm_43949 0.0189 ms 59.7% + triton_bmm_43951 0.0195 ms 57.9% + triton_bmm_43948 0.0228 ms 49.4% +SingleProcess AUTOTUNE takes 3.8348 seconds +AUTOTUNE bmm(16x1x96, 16x96x462) + triton_bmm_44023 0.0083 ms 100.0% + triton_bmm_44024 0.0083 ms 99.6% + triton_bmm_44026 0.0084 ms 99.2% + triton_bmm_44028 0.0088 ms 94.5% + triton_bmm_44025 0.0088 ms 94.2% + triton_bmm_44021 0.0091 ms 91.5% + triton_bmm_44022 0.0091 ms 91.2% + triton_bmm_44020 0.0096 ms 86.6% + triton_bmm_44031 0.0096 ms 86.6% + triton_bmm_44027 0.0098 ms 84.5% +SingleProcess AUTOTUNE takes 4.2569 seconds +AUTOTUNE bmm(16x1x462, 16x462x96) + triton_bmm_44049 0.0096 ms 100.0% + triton_bmm_44050 0.0103 ms 92.6% + triton_bmm_44052 0.0107 ms 89.3% + triton_bmm_44048 0.0110 ms 87.3% + triton_bmm_44053 0.0114 ms 84.0% + triton_bmm_44047 0.0116 ms 82.1% + triton_bmm_44046 0.0126 ms 76.1% + triton_bmm_44045 0.0127 ms 75.5% + bmm 0.0145 ms 66.0% + triton_bmm_44044 0.0171 ms 56.1% +SingleProcess AUTOTUNE takes 4.2187 seconds +AUTOTUNE bmm(16x1x96, 16x96x463) + triton_bmm_44120 0.0084 ms 100.0% + triton_bmm_44122 0.0084 ms 99.2% + triton_bmm_44119 0.0089 ms 94.2% + triton_bmm_44116 0.0090 ms 92.6% + triton_bmm_44117 0.0091 ms 92.2% + triton_bmm_44118 0.0091 ms 92.2% + triton_bmm_44121 0.0093 ms 89.4% + triton_bmm_44124 0.0094 ms 89.1% + triton_bmm_44127 0.0098 ms 85.3% + triton_bmm_44123 0.0099 ms 84.7% +SingleProcess AUTOTUNE takes 4.1488 seconds +AUTOTUNE bmm(16x1x463, 16x463x96) + bmm 0.0119 ms 100.0% + triton_bmm_44149 0.0122 ms 97.9% + triton_bmm_44148 0.0148 ms 80.7% + triton_bmm_44145 0.0175 ms 68.0% + triton_bmm_44146 0.0179 ms 66.5% + triton_bmm_44144 0.0181 ms 65.6% + triton_bmm_44142 0.0184 ms 64.7% + triton_bmm_44141 0.0192 ms 61.9% + triton_bmm_44143 0.0200 ms 59.6% + triton_bmm_44140 0.0229 ms 52.0% +SingleProcess AUTOTUNE takes 4.1019 seconds +AUTOTUNE bmm(16x1x96, 16x96x464) + triton_bmm_44216 0.0083 ms 100.0% + triton_bmm_44214 0.0085 ms 97.4% + triton_bmm_44215 0.0088 ms 94.5% + triton_bmm_44213 0.0088 ms 94.2% + triton_bmm_44220 0.0088 ms 94.2% + triton_bmm_44218 0.0089 ms 93.5% + triton_bmm_44212 0.0090 ms 92.2% + triton_bmm_44219 0.0093 ms 89.7% + triton_bmm_44217 0.0093 ms 89.3% + triton_bmm_44221 0.0093 ms 89.3% +SingleProcess AUTOTUNE takes 4.4167 seconds +AUTOTUNE bmm(16x1x464, 16x464x96) + triton_bmm_44241 0.0098 ms 100.0% + triton_bmm_44244 0.0101 ms 97.5% + triton_bmm_44242 0.0101 ms 96.8% + bmm 0.0107 ms 91.6% + triton_bmm_44245 0.0108 ms 90.8% + triton_bmm_44240 0.0109 ms 90.3% + triton_bmm_44239 0.0116 ms 84.3% + triton_bmm_44238 0.0124 ms 78.9% + triton_bmm_44237 0.0127 ms 77.5% + triton_bmm_44236 0.0172 ms 57.0% +SingleProcess AUTOTUNE takes 4.0805 seconds +AUTOTUNE bmm(16x1x96, 16x96x465) + triton_bmm_44311 0.0083 ms 100.0% + triton_bmm_44312 0.0089 ms 92.7% + triton_bmm_44309 0.0091 ms 91.5% + triton_bmm_44310 0.0091 ms 91.5% + triton_bmm_44314 0.0091 ms 90.9% + triton_bmm_44315 0.0093 ms 89.0% + triton_bmm_44313 0.0093 ms 88.7% + triton_bmm_44316 0.0094 ms 88.4% + triton_bmm_44308 0.0096 ms 86.3% + triton_bmm_44317 0.0098 ms 84.4% +SingleProcess AUTOTUNE takes 4.0771 seconds +AUTOTUNE bmm(16x1x465, 16x465x96) + bmm 0.0105 ms 100.0% + triton_bmm_44341 0.0122 ms 86.1% + triton_bmm_44340 0.0148 ms 70.8% + triton_bmm_44338 0.0176 ms 59.5% + triton_bmm_44336 0.0177 ms 59.0% + triton_bmm_44337 0.0182 ms 57.6% + triton_bmm_44334 0.0184 ms 56.8% + triton_bmm_44333 0.0194 ms 53.8% + triton_bmm_44335 0.0200 ms 52.3% + triton_bmm_44332 0.0234 ms 44.8% +SingleProcess AUTOTUNE takes 4.0831 seconds +AUTOTUNE bmm(16x1x96, 16x96x466) + triton_bmm_44405 0.0085 ms 100.0% + triton_bmm_44412 0.0088 ms 96.4% + triton_bmm_44407 0.0089 ms 95.7% + triton_bmm_44408 0.0089 ms 95.3% + triton_bmm_44404 0.0090 ms 94.0% + triton_bmm_44406 0.0091 ms 93.3% + triton_bmm_44410 0.0091 ms 93.0% + triton_bmm_44409 0.0093 ms 90.8% + triton_bmm_44415 0.0096 ms 88.6% + triton_bmm_44411 0.0098 ms 86.3% +SingleProcess AUTOTUNE takes 3.8363 seconds +AUTOTUNE bmm(16x1x466, 16x466x96) + triton_bmm_44434 0.0098 ms 100.0% + triton_bmm_44433 0.0101 ms 97.2% + triton_bmm_44436 0.0101 ms 97.2% + triton_bmm_44437 0.0108 ms 90.6% + triton_bmm_44432 0.0111 ms 88.2% + triton_bmm_44431 0.0116 ms 84.3% + triton_bmm_44430 0.0122 ms 80.8% + triton_bmm_44429 0.0132 ms 74.3% + bmm 0.0144 ms 68.4% + triton_bmm_44428 0.0171 ms 57.5% +SingleProcess AUTOTUNE takes 3.9591 seconds +AUTOTUNE bmm(16x1x96, 16x96x467) + triton_bmm_44503 0.0083 ms 100.0% + triton_bmm_44504 0.0084 ms 99.2% + triton_bmm_44502 0.0085 ms 97.0% + triton_bmm_44508 0.0088 ms 93.8% + triton_bmm_44501 0.0091 ms 91.2% + triton_bmm_44506 0.0091 ms 90.9% + triton_bmm_44507 0.0093 ms 89.0% + triton_bmm_44505 0.0093 ms 88.7% + triton_bmm_44500 0.0096 ms 86.2% + triton_bmm_44511 0.0101 ms 81.7% +SingleProcess AUTOTUNE takes 4.1361 seconds +AUTOTUNE bmm(16x1x467, 16x467x96) + bmm 0.0106 ms 100.0% + triton_bmm_44533 0.0127 ms 83.4% + triton_bmm_44532 0.0148 ms 71.6% + triton_bmm_44530 0.0176 ms 60.2% + triton_bmm_44528 0.0176 ms 60.1% + triton_bmm_44529 0.0179 ms 59.3% + triton_bmm_44526 0.0179 ms 59.2% + triton_bmm_44525 0.0189 ms 56.0% + triton_bmm_44527 0.0200 ms 53.0% + triton_bmm_44524 0.0234 ms 45.3% +SingleProcess AUTOTUNE takes 3.9435 seconds +AUTOTUNE bmm(16x1x96, 16x96x468) + triton_bmm_44598 0.0085 ms 100.0% + triton_bmm_44599 0.0088 ms 97.1% + triton_bmm_44601 0.0088 ms 97.1% + triton_bmm_44600 0.0089 ms 96.0% + triton_bmm_44602 0.0089 ms 95.7% + triton_bmm_44596 0.0090 ms 94.7% + triton_bmm_44597 0.0091 ms 94.3% + triton_bmm_44604 0.0093 ms 91.4% + triton_bmm_44605 0.0095 ms 89.6% + triton_bmm_44603 0.0098 ms 87.0% +SingleProcess AUTOTUNE takes 3.9421 seconds +AUTOTUNE bmm(16x1x468, 16x468x96) + triton_bmm_44625 0.0096 ms 100.0% + triton_bmm_44628 0.0101 ms 94.9% + triton_bmm_44626 0.0102 ms 94.2% + triton_bmm_44624 0.0105 ms 91.4% + triton_bmm_44629 0.0110 ms 86.7% + triton_bmm_44623 0.0111 ms 85.9% + triton_bmm_44622 0.0121 ms 79.3% + triton_bmm_44621 0.0127 ms 75.5% + bmm 0.0161 ms 59.5% + triton_bmm_44620 0.0174 ms 55.1% +SingleProcess AUTOTUNE takes 4.1999 seconds +AUTOTUNE bmm(16x1x96, 16x96x469) + triton_bmm_44693 0.0085 ms 100.0% + triton_bmm_44700 0.0088 ms 96.4% + triton_bmm_44695 0.0089 ms 96.0% + triton_bmm_44694 0.0091 ms 93.7% + triton_bmm_44696 0.0091 ms 93.7% + triton_bmm_44698 0.0091 ms 93.3% + triton_bmm_44697 0.0093 ms 91.1% + triton_bmm_44692 0.0096 ms 88.4% + triton_bmm_44703 0.0098 ms 86.9% + triton_bmm_44702 0.0098 ms 86.6% +SingleProcess AUTOTUNE takes 4.2203 seconds +AUTOTUNE bmm(16x1x469, 16x469x96) + bmm 0.0108 ms 100.0% + triton_bmm_44725 0.0122 ms 88.2% + triton_bmm_44724 0.0148 ms 72.7% + triton_bmm_44721 0.0177 ms 60.8% + triton_bmm_44722 0.0180 ms 59.9% + triton_bmm_44720 0.0181 ms 59.3% + triton_bmm_44718 0.0184 ms 58.3% + triton_bmm_44717 0.0189 ms 56.8% + triton_bmm_44719 0.0202 ms 53.2% + triton_bmm_44716 0.0231 ms 46.5% +SingleProcess AUTOTUNE takes 4.4915 seconds +AUTOTUNE bmm(16x1x96, 16x96x470) + triton_bmm_44791 0.0083 ms 100.0% + triton_bmm_44792 0.0083 ms 99.6% + triton_bmm_44794 0.0084 ms 98.7% + triton_bmm_44789 0.0084 ms 98.1% + triton_bmm_44793 0.0088 ms 94.2% + triton_bmm_44788 0.0091 ms 91.5% + triton_bmm_44790 0.0091 ms 91.2% + triton_bmm_44796 0.0092 ms 90.2% + triton_bmm_44798 0.0098 ms 84.4% + triton_bmm_44795 0.0099 ms 84.1% +SingleProcess AUTOTUNE takes 4.2180 seconds +AUTOTUNE bmm(16x1x470, 16x470x96) + triton_bmm_44818 0.0098 ms 100.0% + triton_bmm_44817 0.0101 ms 96.8% + triton_bmm_44820 0.0101 ms 96.8% + triton_bmm_44816 0.0106 ms 92.4% + triton_bmm_44821 0.0108 ms 90.3% + triton_bmm_44815 0.0111 ms 87.9% + triton_bmm_44814 0.0127 ms 77.3% + triton_bmm_44813 0.0132 ms 74.1% + bmm 0.0145 ms 67.7% + triton_bmm_44812 0.0175 ms 56.1% +SingleProcess AUTOTUNE takes 4.6416 seconds +AUTOTUNE bmm(16x1x96, 16x96x471) + triton_bmm_44885 0.0085 ms 100.0% + triton_bmm_44892 0.0088 ms 97.1% + triton_bmm_44887 0.0089 ms 96.0% + triton_bmm_44886 0.0091 ms 94.0% + triton_bmm_44888 0.0091 ms 94.0% + triton_bmm_44890 0.0091 ms 93.7% + triton_bmm_44889 0.0093 ms 91.8% + triton_bmm_44884 0.0096 ms 89.0% + triton_bmm_44894 0.0098 ms 87.0% + triton_bmm_44891 0.0099 ms 86.7% +SingleProcess AUTOTUNE takes 3.8881 seconds +AUTOTUNE bmm(16x1x471, 16x471x96) + bmm 0.0111 ms 100.0% + triton_bmm_44917 0.0122 ms 91.3% + triton_bmm_44916 0.0142 ms 78.0% + triton_bmm_44912 0.0179 ms 62.2% + triton_bmm_44913 0.0179 ms 62.2% + triton_bmm_44914 0.0181 ms 61.2% + triton_bmm_44910 0.0184 ms 60.2% + triton_bmm_44909 0.0189 ms 58.7% + triton_bmm_44911 0.0202 ms 54.9% + triton_bmm_44908 0.0235 ms 47.2% +SingleProcess AUTOTUNE takes 5.8403 seconds +AUTOTUNE bmm(16x1x96, 16x96x472) + triton_bmm_44983 0.0083 ms 100.0% + triton_bmm_44986 0.0083 ms 99.6% + triton_bmm_44982 0.0085 ms 97.4% + triton_bmm_44985 0.0088 ms 94.5% + triton_bmm_44988 0.0088 ms 94.2% + triton_bmm_44981 0.0088 ms 93.8% + triton_bmm_44984 0.0089 ms 93.5% + triton_bmm_44980 0.0090 ms 91.8% + triton_bmm_44987 0.0093 ms 89.3% + triton_bmm_44989 0.0093 ms 89.3% +SingleProcess AUTOTUNE takes 3.9889 seconds +AUTOTUNE bmm(16x1x472, 16x472x96) + triton_bmm_45010 0.0103 ms 100.0% + triton_bmm_45008 0.0105 ms 98.3% + bmm 0.0106 ms 97.4% + triton_bmm_45009 0.0106 ms 97.1% + triton_bmm_45012 0.0109 ms 94.9% + triton_bmm_45007 0.0111 ms 92.7% + triton_bmm_45013 0.0116 ms 88.6% + triton_bmm_45006 0.0121 ms 85.1% + triton_bmm_45005 0.0132 ms 78.1% + triton_bmm_45004 0.0168 ms 61.3% +SingleProcess AUTOTUNE takes 4.1778 seconds +AUTOTUNE bmm(16x1x96, 16x96x473) + triton_bmm_45079 0.0083 ms 100.0% + triton_bmm_45080 0.0084 ms 98.1% + triton_bmm_45082 0.0085 ms 97.0% + triton_bmm_45081 0.0088 ms 94.2% + triton_bmm_45076 0.0091 ms 91.5% + triton_bmm_45077 0.0091 ms 91.2% + triton_bmm_45078 0.0091 ms 91.2% + triton_bmm_45084 0.0094 ms 88.1% + triton_bmm_45086 0.0098 ms 84.4% + triton_bmm_45083 0.0099 ms 84.1% +SingleProcess AUTOTUNE takes 3.9344 seconds +AUTOTUNE bmm(16x1x473, 16x473x96) + bmm 0.0111 ms 100.0% + triton_bmm_45109 0.0127 ms 87.7% + triton_bmm_45108 0.0142 ms 78.2% + triton_bmm_45104 0.0179 ms 62.3% + triton_bmm_45102 0.0181 ms 61.4% + triton_bmm_45106 0.0182 ms 61.3% + triton_bmm_45105 0.0184 ms 60.6% + triton_bmm_45101 0.0195 ms 57.2% + triton_bmm_45103 0.0201 ms 55.5% + triton_bmm_45100 0.0231 ms 48.3% +SingleProcess AUTOTUNE takes 3.9913 seconds +AUTOTUNE bmm(16x1x96, 16x96x474) + triton_bmm_45173 0.0085 ms 100.0% + triton_bmm_45178 0.0085 ms 100.0% + triton_bmm_45174 0.0085 ms 99.6% + triton_bmm_45175 0.0089 ms 95.9% + triton_bmm_45176 0.0089 ms 95.3% + triton_bmm_45172 0.0091 ms 94.0% + triton_bmm_45177 0.0093 ms 91.6% + triton_bmm_45180 0.0094 ms 90.8% + triton_bmm_45183 0.0096 ms 89.0% + triton_bmm_45182 0.0098 ms 86.6% +SingleProcess AUTOTUNE takes 3.8908 seconds +AUTOTUNE bmm(16x1x474, 16x474x96) + triton_bmm_45202 0.0098 ms 100.0% + triton_bmm_45201 0.0101 ms 97.2% + triton_bmm_45200 0.0106 ms 92.7% + triton_bmm_45204 0.0107 ms 92.1% + triton_bmm_45205 0.0113 ms 87.2% + triton_bmm_45199 0.0117 ms 84.1% + triton_bmm_45198 0.0122 ms 80.8% + triton_bmm_45197 0.0132 ms 74.3% + bmm 0.0145 ms 67.6% + triton_bmm_45196 0.0171 ms 57.5% +SingleProcess AUTOTUNE takes 4.0842 seconds +AUTOTUNE bmm(16x1x96, 16x96x475) + triton_bmm_45274 0.0086 ms 100.0% + triton_bmm_45273 0.0088 ms 97.5% + triton_bmm_45276 0.0088 ms 97.1% + triton_bmm_45271 0.0089 ms 96.1% + triton_bmm_45269 0.0091 ms 94.7% + triton_bmm_45270 0.0091 ms 94.4% + triton_bmm_45272 0.0091 ms 94.4% + triton_bmm_45275 0.0093 ms 92.1% + triton_bmm_45268 0.0096 ms 89.3% + triton_bmm_45279 0.0098 ms 87.6% +SingleProcess AUTOTUNE takes 3.9780 seconds +AUTOTUNE bmm(16x1x475, 16x475x96) + bmm 0.0117 ms 100.0% + triton_bmm_45301 0.0122 ms 96.6% + triton_bmm_45300 0.0142 ms 82.5% + triton_bmm_45297 0.0176 ms 66.6% + triton_bmm_45296 0.0179 ms 65.8% + triton_bmm_45298 0.0182 ms 64.6% + triton_bmm_45294 0.0186 ms 63.3% + triton_bmm_45293 0.0189 ms 62.1% + triton_bmm_45295 0.0202 ms 58.1% + triton_bmm_45292 0.0230 ms 51.0% +SingleProcess AUTOTUNE takes 4.2328 seconds +AUTOTUNE bmm(16x1x96, 16x96x476) + triton_bmm_45367 0.0083 ms 100.0% + triton_bmm_45368 0.0084 ms 99.2% + triton_bmm_45370 0.0084 ms 99.2% + triton_bmm_45364 0.0090 ms 91.8% + triton_bmm_45365 0.0091 ms 91.5% + triton_bmm_45366 0.0091 ms 91.2% + triton_bmm_45371 0.0093 ms 89.3% + triton_bmm_45369 0.0093 ms 88.7% + triton_bmm_45372 0.0094 ms 88.4% + triton_bmm_45373 0.0096 ms 86.6% +SingleProcess AUTOTUNE takes 3.8170 seconds +AUTOTUNE bmm(16x1x476, 16x476x96) + triton_bmm_45394 0.0098 ms 100.0% + triton_bmm_45393 0.0101 ms 97.1% + triton_bmm_45392 0.0104 ms 94.2% + triton_bmm_45397 0.0106 ms 92.7% + triton_bmm_45396 0.0108 ms 90.8% + triton_bmm_45391 0.0117 ms 83.8% + triton_bmm_45390 0.0127 ms 77.1% + triton_bmm_45389 0.0127 ms 76.9% + bmm 0.0148 ms 66.2% + triton_bmm_45388 0.0174 ms 56.2% +SingleProcess AUTOTUNE takes 4.7981 seconds +AUTOTUNE bmm(16x1x96, 16x96x477) + triton_bmm_45464 0.0085 ms 100.0% + triton_bmm_45461 0.0085 ms 99.8% + triton_bmm_45462 0.0086 ms 99.4% + triton_bmm_45466 0.0086 ms 99.4% + triton_bmm_45468 0.0088 ms 96.9% + triton_bmm_45463 0.0089 ms 96.0% + triton_bmm_45465 0.0095 ms 90.0% + triton_bmm_45460 0.0096 ms 88.5% + triton_bmm_45470 0.0098 ms 86.8% + triton_bmm_45467 0.0099 ms 86.5% +SingleProcess AUTOTUNE takes 4.6660 seconds +AUTOTUNE bmm(16x1x477, 16x477x96) + bmm 0.0118 ms 100.0% + triton_bmm_45493 0.0124 ms 95.6% + triton_bmm_45492 0.0148 ms 80.1% + triton_bmm_45489 0.0179 ms 66.3% + triton_bmm_45486 0.0181 ms 65.4% + triton_bmm_45490 0.0182 ms 65.1% + triton_bmm_45488 0.0184 ms 64.3% + triton_bmm_45485 0.0189 ms 62.6% + triton_bmm_45487 0.0197 ms 60.2% + triton_bmm_45484 0.0236 ms 50.2% +SingleProcess AUTOTUNE takes 3.8403 seconds +AUTOTUNE bmm(16x1x96, 16x96x478) + triton_bmm_45557 0.0085 ms 100.0% + triton_bmm_45558 0.0085 ms 100.0% + triton_bmm_45561 0.0090 ms 95.4% + triton_bmm_45559 0.0091 ms 94.3% + triton_bmm_45560 0.0091 ms 94.0% + triton_bmm_45562 0.0091 ms 93.7% + triton_bmm_45564 0.0094 ms 91.1% + triton_bmm_45556 0.0096 ms 89.3% + triton_bmm_45563 0.0098 ms 87.0% + triton_bmm_45566 0.0098 ms 87.0% +SingleProcess AUTOTUNE takes 4.1107 seconds +AUTOTUNE bmm(16x1x478, 16x478x96) + triton_bmm_45585 0.0101 ms 100.0% + triton_bmm_45588 0.0102 ms 99.7% + triton_bmm_45586 0.0104 ms 97.8% + triton_bmm_45589 0.0108 ms 93.5% + triton_bmm_45584 0.0111 ms 91.1% + triton_bmm_45583 0.0117 ms 86.8% + triton_bmm_45582 0.0122 ms 83.4% + triton_bmm_45581 0.0132 ms 76.8% + bmm 0.0146 ms 69.5% + triton_bmm_45580 0.0171 ms 59.4% +SingleProcess AUTOTUNE takes 4.1074 seconds +AUTOTUNE bmm(16x1x96, 16x96x479) + triton_bmm_45655 0.0084 ms 100.0% + triton_bmm_45658 0.0086 ms 97.4% + triton_bmm_45660 0.0088 ms 94.6% + triton_bmm_45653 0.0091 ms 92.2% + triton_bmm_45654 0.0091 ms 91.6% + triton_bmm_45656 0.0091 ms 91.6% + triton_bmm_45659 0.0093 ms 89.7% + triton_bmm_45657 0.0094 ms 89.2% + triton_bmm_45652 0.0096 ms 86.9% + triton_bmm_45662 0.0098 ms 85.0% +SingleProcess AUTOTUNE takes 4.0834 seconds +AUTOTUNE bmm(16x1x479, 16x479x96) + triton_bmm_45685 0.0124 ms 100.0% + bmm 0.0124 ms 99.7% + triton_bmm_45684 0.0148 ms 83.8% + triton_bmm_45681 0.0163 ms 75.9% + triton_bmm_45682 0.0171 ms 72.3% + triton_bmm_45680 0.0173 ms 71.5% + triton_bmm_45678 0.0176 ms 70.5% + triton_bmm_45679 0.0184 ms 67.3% + triton_bmm_45677 0.0197 ms 62.9% + triton_bmm_45676 0.0244 ms 50.9% +SingleProcess AUTOTUNE takes 4.0325 seconds +AUTOTUNE bmm(16x1x96, 16x96x480) + triton_bmm_45749 0.0085 ms 100.0% + triton_bmm_45752 0.0085 ms 99.6% + triton_bmm_45754 0.0085 ms 99.6% + triton_bmm_45751 0.0089 ms 96.0% + triton_bmm_45756 0.0090 ms 94.3% + triton_bmm_45750 0.0091 ms 93.7% + triton_bmm_45753 0.0093 ms 91.1% + triton_bmm_45748 0.0096 ms 89.0% + triton_bmm_45759 0.0096 ms 89.0% + triton_bmm_45755 0.0098 ms 86.6% +SingleProcess AUTOTUNE takes 3.8242 seconds +AUTOTUNE bmm(16x1x480, 16x480x96) + triton_bmm_45777 0.0098 ms 100.0% + triton_bmm_45780 0.0101 ms 97.5% + triton_bmm_45778 0.0102 ms 95.9% + bmm 0.0108 ms 90.6% + triton_bmm_45776 0.0109 ms 90.0% + triton_bmm_45781 0.0114 ms 86.2% + triton_bmm_45775 0.0117 ms 83.9% + triton_bmm_45774 0.0125 ms 78.4% + triton_bmm_45773 0.0127 ms 77.5% + triton_bmm_45772 0.0171 ms 57.5% +SingleProcess AUTOTUNE takes 3.9401 seconds +AUTOTUNE bmm(16x1x96, 16x96x481) + triton_bmm_45847 0.0084 ms 100.0% + triton_bmm_45845 0.0086 ms 98.5% + triton_bmm_45849 0.0090 ms 93.6% + triton_bmm_45852 0.0091 ms 93.3% + triton_bmm_45848 0.0091 ms 92.6% + triton_bmm_45850 0.0091 ms 92.6% + triton_bmm_45844 0.0093 ms 91.0% + triton_bmm_45846 0.0093 ms 90.7% + triton_bmm_45855 0.0098 ms 86.3% + triton_bmm_45854 0.0098 ms 86.0% +SingleProcess AUTOTUNE takes 3.9986 seconds +AUTOTUNE bmm(16x1x481, 16x481x96) + bmm 0.0106 ms 100.0% + triton_bmm_45877 0.0124 ms 85.3% + triton_bmm_45876 0.0148 ms 71.6% + triton_bmm_45874 0.0180 ms 58.9% + triton_bmm_45872 0.0181 ms 58.4% + triton_bmm_45870 0.0184 ms 57.6% + triton_bmm_45873 0.0187 ms 56.7% + triton_bmm_45869 0.0199 ms 53.1% + triton_bmm_45871 0.0205 ms 51.6% + triton_bmm_45868 0.0238 ms 44.4% +SingleProcess AUTOTUNE takes 3.9723 seconds +AUTOTUNE bmm(16x1x96, 16x96x482) + triton_bmm_45943 0.0084 ms 100.0% + triton_bmm_45944 0.0085 ms 97.8% + triton_bmm_45946 0.0085 ms 97.8% + triton_bmm_45948 0.0088 ms 94.6% + triton_bmm_45941 0.0091 ms 91.9% + triton_bmm_45942 0.0091 ms 91.6% + triton_bmm_45945 0.0093 ms 89.4% + triton_bmm_45951 0.0096 ms 87.3% + triton_bmm_45940 0.0096 ms 86.7% + triton_bmm_45947 0.0099 ms 84.7% +SingleProcess AUTOTUNE takes 3.7449 seconds +AUTOTUNE bmm(16x1x482, 16x482x96) + triton_bmm_45972 0.0102 ms 100.0% + triton_bmm_45969 0.0102 ms 99.4% + triton_bmm_45970 0.0106 ms 96.1% + triton_bmm_45968 0.0106 ms 95.8% + triton_bmm_45973 0.0108 ms 93.8% + triton_bmm_45967 0.0114 ms 89.3% + triton_bmm_45965 0.0129 ms 78.7% + triton_bmm_45966 0.0130 ms 78.5% + bmm 0.0161 ms 63.1% + triton_bmm_45964 0.0181 ms 56.1% +SingleProcess AUTOTUNE takes 4.0386 seconds +AUTOTUNE bmm(16x1x96, 16x96x483) + triton_bmm_46037 0.0086 ms 100.0% + triton_bmm_46040 0.0086 ms 100.0% + triton_bmm_46042 0.0086 ms 100.0% + triton_bmm_46039 0.0089 ms 96.4% + triton_bmm_46041 0.0090 ms 95.0% + triton_bmm_46038 0.0091 ms 94.0% + triton_bmm_46043 0.0093 ms 92.1% + triton_bmm_46044 0.0094 ms 91.5% + triton_bmm_46036 0.0096 ms 89.3% + triton_bmm_46045 0.0098 ms 87.3% +SingleProcess AUTOTUNE takes 4.0123 seconds +AUTOTUNE bmm(16x1x483, 16x483x96) + bmm 0.0111 ms 100.0% + triton_bmm_46069 0.0130 ms 85.8% + triton_bmm_46068 0.0145 ms 76.9% + triton_bmm_46066 0.0181 ms 61.3% + triton_bmm_46064 0.0183 ms 60.6% + triton_bmm_46062 0.0186 ms 59.7% + triton_bmm_46065 0.0187 ms 59.5% + triton_bmm_46061 0.0194 ms 57.2% + triton_bmm_46063 0.0205 ms 54.2% + triton_bmm_46060 0.0238 ms 46.6% +SingleProcess AUTOTUNE takes 3.9565 seconds +AUTOTUNE bmm(16x1x96, 16x96x484) + triton_bmm_46135 0.0083 ms 100.0% + triton_bmm_46136 0.0085 ms 98.1% + triton_bmm_46134 0.0085 ms 97.4% + triton_bmm_46138 0.0086 ms 97.0% + triton_bmm_46137 0.0089 ms 93.2% + triton_bmm_46132 0.0091 ms 91.9% + triton_bmm_46133 0.0091 ms 91.5% + triton_bmm_46140 0.0094 ms 88.6% + triton_bmm_46139 0.0098 ms 84.7% + triton_bmm_46142 0.0098 ms 84.7% +SingleProcess AUTOTUNE takes 3.8626 seconds +AUTOTUNE bmm(16x1x484, 16x484x96) + triton_bmm_46161 0.0104 ms 100.0% + triton_bmm_46162 0.0104 ms 99.7% + triton_bmm_46164 0.0108 ms 96.1% + triton_bmm_46165 0.0111 ms 93.2% + triton_bmm_46160 0.0111 ms 93.1% + triton_bmm_46159 0.0119 ms 86.9% + triton_bmm_46158 0.0128 ms 81.2% + triton_bmm_46157 0.0135 ms 76.8% + bmm 0.0156 ms 66.4% + triton_bmm_46156 0.0176 ms 58.9% +SingleProcess AUTOTUNE takes 3.9486 seconds +AUTOTUNE bmm(16x1x96, 16x96x485) + triton_bmm_46231 0.0085 ms 100.0% + triton_bmm_46229 0.0086 ms 99.3% + triton_bmm_46232 0.0086 ms 99.3% + triton_bmm_46234 0.0086 ms 99.3% + triton_bmm_46228 0.0093 ms 91.7% + triton_bmm_46230 0.0093 ms 91.4% + triton_bmm_46236 0.0094 ms 90.8% + triton_bmm_46233 0.0096 ms 89.0% + triton_bmm_46237 0.0098 ms 86.6% + triton_bmm_46238 0.0098 ms 86.6% +SingleProcess AUTOTUNE takes 4.3985 seconds +AUTOTUNE bmm(16x1x485, 16x485x96) + bmm 0.0112 ms 100.0% + triton_bmm_46261 0.0129 ms 86.7% + triton_bmm_46260 0.0149 ms 74.9% + triton_bmm_46257 0.0181 ms 61.7% + triton_bmm_46258 0.0187 ms 60.0% + triton_bmm_46256 0.0188 ms 59.5% + triton_bmm_46254 0.0192 ms 58.4% + triton_bmm_46253 0.0195 ms 57.5% + triton_bmm_46255 0.0202 ms 55.5% + triton_bmm_46252 0.0242 ms 46.4% +SingleProcess AUTOTUNE takes 4.1967 seconds +AUTOTUNE bmm(16x1x96, 16x96x486) + triton_bmm_46325 0.0085 ms 100.0% + triton_bmm_46326 0.0085 ms 100.0% + triton_bmm_46330 0.0086 ms 99.6% + triton_bmm_46329 0.0090 ms 94.7% + triton_bmm_46327 0.0091 ms 94.0% + triton_bmm_46328 0.0091 ms 93.7% + triton_bmm_46332 0.0094 ms 91.1% + triton_bmm_46335 0.0096 ms 89.3% + triton_bmm_46324 0.0096 ms 89.0% + triton_bmm_46331 0.0098 ms 87.0% +SingleProcess AUTOTUNE takes 4.4690 seconds +AUTOTUNE bmm(16x1x486, 16x486x96) + triton_bmm_46353 0.0098 ms 100.0% + triton_bmm_46354 0.0106 ms 92.9% + triton_bmm_46356 0.0108 ms 90.6% + triton_bmm_46352 0.0112 ms 88.0% + triton_bmm_46357 0.0114 ms 86.0% + triton_bmm_46351 0.0121 ms 81.2% + triton_bmm_46350 0.0124 ms 79.1% + triton_bmm_46349 0.0135 ms 72.9% + bmm 0.0156 ms 62.8% + triton_bmm_46348 0.0176 ms 55.7% +SingleProcess AUTOTUNE takes 4.6871 seconds +AUTOTUNE bmm(16x1x96, 16x96x487) + triton_bmm_46423 0.0084 ms 100.0% + triton_bmm_46421 0.0086 ms 98.5% + triton_bmm_46422 0.0086 ms 98.5% + triton_bmm_46424 0.0086 ms 98.5% + triton_bmm_46426 0.0086 ms 98.5% + triton_bmm_46420 0.0091 ms 93.3% + triton_bmm_46428 0.0094 ms 89.8% + triton_bmm_46425 0.0095 ms 88.6% + triton_bmm_46431 0.0098 ms 86.3% + triton_bmm_46429 0.0098 ms 86.0% +SingleProcess AUTOTUNE takes 3.7539 seconds +AUTOTUNE bmm(16x1x487, 16x487x96) + bmm 0.0116 ms 100.0% + triton_bmm_46453 0.0124 ms 93.8% + triton_bmm_46452 0.0145 ms 80.4% + triton_bmm_46450 0.0183 ms 63.5% + triton_bmm_46448 0.0184 ms 63.4% + triton_bmm_46446 0.0187 ms 62.4% + triton_bmm_46449 0.0187 ms 62.3% + triton_bmm_46445 0.0197 ms 59.2% + triton_bmm_46447 0.0204 ms 57.0% + triton_bmm_46444 0.0244 ms 47.8% +SingleProcess AUTOTUNE takes 4.0279 seconds +AUTOTUNE bmm(16x1x96, 16x96x488) + triton_bmm_46522 0.0085 ms 100.0% + triton_bmm_46517 0.0085 ms 99.6% + triton_bmm_46518 0.0085 ms 99.3% + triton_bmm_46519 0.0089 ms 95.7% + triton_bmm_46520 0.0089 ms 95.2% + triton_bmm_46521 0.0090 ms 94.6% + triton_bmm_46523 0.0093 ms 91.4% + triton_bmm_46524 0.0094 ms 90.4% + triton_bmm_46516 0.0096 ms 88.6% + triton_bmm_46525 0.0098 ms 86.5% +SingleProcess AUTOTUNE takes 3.6795 seconds +AUTOTUNE bmm(16x1x488, 16x488x96) + triton_bmm_46545 0.0103 ms 100.0% + triton_bmm_46546 0.0104 ms 99.7% + triton_bmm_46544 0.0106 ms 97.6% + triton_bmm_46548 0.0109 ms 95.0% + bmm 0.0111 ms 92.8% + triton_bmm_46549 0.0117 ms 88.5% + triton_bmm_46543 0.0119 ms 86.6% + triton_bmm_46542 0.0124 ms 83.5% + triton_bmm_46541 0.0135 ms 76.5% + triton_bmm_46540 0.0180 ms 57.6% +SingleProcess AUTOTUNE takes 4.1392 seconds +TIMEOUT + loading model: 0it [00:00, ?it/s] loading model: 0it [00:01, ?it/s] +dcgan +cuda eval dcgan int8weightonly-bs1-acc +pass-sqnr-53.240 + loading model: 0it [00:00, ?it/s]WARNING:common:Model demucs does not support bfloat16, running with amp instead + loading model: 0it [00:10, ?it/s] +WARNING:common:Model demucs does not support bfloat16, running with amp instead +demucs +cuda eval demucs int8weightonly-bs1-acc +WARNING:common:Model demucs does not support bfloat16, running with amp instead +AUTOTUNE mixed_mm(736x4096, 4096x2048) + fallback_mixed_mm 0.1142 ms 100.0% + triton_mm_2 0.1572 ms 72.6% + triton_mm_1 0.1587 ms 71.9% + triton_mm_3 0.1641 ms 69.6% + triton_mm_4 0.1690 ms 67.6% + triton_mm_0 0.1830 ms 62.4% + triton_mm_7 0.2509 ms 45.5% + triton_mm_8 0.2814 ms 40.6% + triton_mm_6 0.2841 ms 40.2% + triton_mm_9 0.3043 ms 37.5% +SingleProcess AUTOTUNE takes 4.9388 seconds +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +densenet121 +cuda eval densenet121 int8weightonly-bs1-acc +AUTOTUNE mixed_mm(1x1024, 1024x1000) + triton_mm_1101 0.0167 ms 100.0% + triton_mm_1104 0.0180 ms 92.6% + triton_mm_1102 0.0195 ms 85.6% + triton_mm_1100 0.0200 ms 83.5% + triton_mm_1105 0.0201 ms 83.0% + triton_mm_1098 0.0253 ms 66.1% + triton_mm_1097 0.0261 ms 63.9% + triton_mm_1099 0.0309 ms 54.0% + triton_mm_1096 0.0421 ms 39.7% + triton_mm_1106 0.0433 ms 38.6% +SingleProcess AUTOTUNE takes 4.1622 seconds +pass-sqnr-29.470 + loading model: 0it [00:00, ?it/s]detectron2_fasterrcnn_r_101_c4 +WARNING:common:Model detectron2_fasterrcnn_r_101_c4 does not support bfloat16, running with amp instead + loading model: 0it [00:06, ?it/s] +WARNING:root:detectron2_fasterrcnn_r_101_c4 failed to load +Original Error: CUDA out of memory. Tried to allocate 5607.61 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.40 GiB is free. Including non-PyTorch memory, this process has 1.75 GiB memory in use. Of the allocated memory 1.14 GiB is allocated by PyTorch, and 73.83 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 477, in forward + box_features = self._shared_roi_transform( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 456, in _shared_roi_transform + x = self.pooler(features, boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 246, in forward + return self.level_poolers[0](x[0], pooler_fmt_boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 236, in roi_align + return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 168, in _roi_align + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) # [K, C, PH, PW, IY, IX] + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 62, in _bilinear_interpolate + v1 = masked_index(y_low, x_low) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 55, in masked_index + return input[ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 5607.61 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.40 GiB is free. Including non-PyTorch memory, this process has 1.75 GiB memory in use. Of the allocated memory 1.14 GiB is allocated by PyTorch, and 73.83 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s]detectron2_fasterrcnn_r_101_dc5 + loading model: 0it [00:08, ?it/s] +WARNING:root:detectron2_fasterrcnn_r_101_dc5 failed to load +Original Error: CUDA out of memory. Tried to allocate 2803.80 GiB. GPU 0 has a total capacity of 79.15 GiB of which 76.85 GiB is free. Including non-PyTorch memory, this process has 2.30 GiB memory in use. Of the allocated memory 1.68 GiB is allocated by PyTorch, and 44.00 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 747, in forward + pred_instances = self._forward_box(features, proposals) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 798, in _forward_box + box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 246, in forward + return self.level_poolers[0](x[0], pooler_fmt_boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 236, in roi_align + return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 168, in _roi_align + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) # [K, C, PH, PW, IY, IX] + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 62, in _bilinear_interpolate + v1 = masked_index(y_low, x_low) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 55, in masked_index + return input[ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2803.80 GiB. GPU 0 has a total capacity of 79.15 GiB of which 76.85 GiB is free. Including non-PyTorch memory, this process has 2.30 GiB memory in use. Of the allocated memory 1.68 GiB is allocated by PyTorch, and 44.00 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s]detectron2_fasterrcnn_r_101_fpn +WARNING:common:Model detectron2_fasterrcnn_r_101_fpn does not support bfloat16, running with amp instead + loading model: 0it [00:08, ?it/s] +WARNING:root:detectron2_fasterrcnn_r_101_fpn failed to load +Original Error: CUDA out of memory. Tried to allocate 3653.77 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.22 GiB is free. Including non-PyTorch memory, this process has 1.93 GiB memory in use. Of the allocated memory 1.35 GiB is allocated by PyTorch, and 37.84 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 747, in forward + pred_instances = self._forward_box(features, proposals) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 798, in _forward_box + box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 261, in forward + output.index_put_((inds,), pooler(x[level], pooler_fmt_boxes_level)) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 236, in roi_align + return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 168, in _roi_align + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) # [K, C, PH, PW, IY, IX] + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 62, in _bilinear_interpolate + v1 = masked_index(y_low, x_low) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 55, in masked_index + return input[ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3653.77 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.22 GiB is free. Including non-PyTorch memory, this process has 1.93 GiB memory in use. Of the allocated memory 1.35 GiB is allocated by PyTorch, and 37.84 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s]detectron2_fasterrcnn_r_50_c4 + loading model: 0it [00:06, ?it/s] +WARNING:root:detectron2_fasterrcnn_r_50_c4 failed to load +Original Error: CUDA out of memory. Tried to allocate 5607.61 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.73 GiB is free. Including non-PyTorch memory, this process has 1.42 GiB memory in use. Of the allocated memory 884.81 MiB is allocated by PyTorch, and 15.19 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 477, in forward + box_features = self._shared_roi_transform( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 456, in _shared_roi_transform + x = self.pooler(features, boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 246, in forward + return self.level_poolers[0](x[0], pooler_fmt_boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 236, in roi_align + return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 168, in _roi_align + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) # [K, C, PH, PW, IY, IX] + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 62, in _bilinear_interpolate + v1 = masked_index(y_low, x_low) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 55, in masked_index + return input[ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 5607.61 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.73 GiB is free. Including non-PyTorch memory, this process has 1.42 GiB memory in use. Of the allocated memory 884.81 MiB is allocated by PyTorch, and 15.19 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s]detectron2_fasterrcnn_r_50_dc5 +WARNING:common:Model detectron2_fasterrcnn_r_50_dc5 does not support bfloat16, running with amp instead + loading model: 0it [00:08, ?it/s] +WARNING:root:detectron2_fasterrcnn_r_50_dc5 failed to load +Original Error: CUDA out of memory. Tried to allocate 2803.80 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.04 GiB is free. Including non-PyTorch memory, this process has 2.11 GiB memory in use. Of the allocated memory 1.46 GiB is allocated by PyTorch, and 89.64 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 747, in forward + pred_instances = self._forward_box(features, proposals) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 798, in _forward_box + box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 246, in forward + return self.level_poolers[0](x[0], pooler_fmt_boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 236, in roi_align + return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 168, in _roi_align + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) # [K, C, PH, PW, IY, IX] + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 62, in _bilinear_interpolate + v1 = masked_index(y_low, x_low) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 55, in masked_index + return input[ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2803.80 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.04 GiB is free. Including non-PyTorch memory, this process has 2.11 GiB memory in use. Of the allocated memory 1.46 GiB is allocated by PyTorch, and 89.64 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s]detectron2_fasterrcnn_r_50_fpn +WARNING:common:Model detectron2_fasterrcnn_r_50_fpn does not support bfloat16, running with amp instead + loading model: 0it [00:05, ?it/s] +WARNING:root:detectron2_fasterrcnn_r_50_fpn failed to load +Original Error: CUDA out of memory. Tried to allocate 4125.40 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.46 GiB is free. Including non-PyTorch memory, this process has 1.69 GiB memory in use. Of the allocated memory 1.10 GiB is allocated by PyTorch, and 47.89 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 747, in forward + pred_instances = self._forward_box(features, proposals) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 798, in _forward_box + box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 261, in forward + output.index_put_((inds,), pooler(x[level], pooler_fmt_boxes_level)) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 236, in roi_align + return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 168, in _roi_align + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) # [K, C, PH, PW, IY, IX] + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 62, in _bilinear_interpolate + v1 = masked_index(y_low, x_low) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 55, in masked_index + return input[ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4125.40 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.46 GiB is free. Including non-PyTorch memory, this process has 1.69 GiB memory in use. Of the allocated memory 1.10 GiB is allocated by PyTorch, and 47.89 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s]detectron2_fcos_r_50_fpn +WARNING:common:Model detectron2_fcos_r_50_fpn does not support bfloat16, running with amp instead + loading model: 0it [00:06, ?it/s] +WARNING:common:Model detectron2_fcos_r_50_fpn does not support bfloat16, running with amp instead +cuda eval detectron2_fcos_r_50_fpn int8weightonly-bs1-acc +WARNING:common:Model detectron2_fcos_r_50_fpn does not support bfloat16, running with amp instead +skipping cudagraphs due to ['mutated inputs'] +pass-sqnr-error + loading model: 0it [00:00, ?it/s]detectron2_maskrcnn_r_101_c4 +WARNING:common:Model detectron2_maskrcnn_r_101_c4 does not support bfloat16, running with amp instead + loading model: 0it [00:06, ?it/s] +WARNING:root:detectron2_maskrcnn_r_101_c4 failed to load +Original Error: CUDA out of memory. Tried to allocate 5607.61 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.38 GiB is free. Including non-PyTorch memory, this process has 1.77 GiB memory in use. Of the allocated memory 1.15 GiB is allocated by PyTorch, and 84.27 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 477, in forward + box_features = self._shared_roi_transform( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 456, in _shared_roi_transform + x = self.pooler(features, boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 246, in forward + return self.level_poolers[0](x[0], pooler_fmt_boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 236, in roi_align + return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 168, in _roi_align + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) # [K, C, PH, PW, IY, IX] + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 62, in _bilinear_interpolate + v1 = masked_index(y_low, x_low) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 55, in masked_index + return input[ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 5607.61 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.38 GiB is free. Including non-PyTorch memory, this process has 1.77 GiB memory in use. Of the allocated memory 1.15 GiB is allocated by PyTorch, and 84.27 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s]detectron2_maskrcnn_r_101_fpn +WARNING:common:Model detectron2_maskrcnn_r_101_fpn does not support bfloat16, running with amp instead + loading model: 0it [00:07, ?it/s] +WARNING:root:detectron2_maskrcnn_r_101_fpn failed to load +Original Error: CUDA out of memory. Tried to allocate 3682.18 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.23 GiB is free. Including non-PyTorch memory, this process has 1.92 GiB memory in use. Of the allocated memory 1.36 GiB is allocated by PyTorch, and 20.94 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 747, in forward + pred_instances = self._forward_box(features, proposals) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 798, in _forward_box + box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 261, in forward + output.index_put_((inds,), pooler(x[level], pooler_fmt_boxes_level)) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 236, in roi_align + return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 168, in _roi_align + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) # [K, C, PH, PW, IY, IX] + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 62, in _bilinear_interpolate + v1 = masked_index(y_low, x_low) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 55, in masked_index + return input[ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3682.18 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.23 GiB is free. Including non-PyTorch memory, this process has 1.92 GiB memory in use. Of the allocated memory 1.36 GiB is allocated by PyTorch, and 20.94 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s]detectron2_maskrcnn_r_50_c4 +WARNING:common:Model detectron2_maskrcnn_r_50_c4 does not support bfloat16, running with amp instead + loading model: 0it [00:05, ?it/s] +WARNING:root:detectron2_maskrcnn_r_50_c4 failed to load +Original Error: CUDA out of memory. Tried to allocate 5433.77 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.69 GiB is free. Including non-PyTorch memory, this process has 1.46 GiB memory in use. Of the allocated memory 910.65 MiB is allocated by PyTorch, and 33.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 477, in forward + box_features = self._shared_roi_transform( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 456, in _shared_roi_transform + x = self.pooler(features, boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 246, in forward + return self.level_poolers[0](x[0], pooler_fmt_boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 236, in roi_align + return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 168, in _roi_align + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) # [K, C, PH, PW, IY, IX] + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 62, in _bilinear_interpolate + v1 = masked_index(y_low, x_low) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 55, in masked_index + return input[ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 5433.77 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.69 GiB is free. Including non-PyTorch memory, this process has 1.46 GiB memory in use. Of the allocated memory 910.65 MiB is allocated by PyTorch, and 33.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s]detectron2_maskrcnn_r_50_fpn +WARNING:common:Model detectron2_maskrcnn_r_50_fpn does not support bfloat16, running with amp instead + loading model: 0it [00:07, ?it/s] +WARNING:root:detectron2_maskrcnn_r_50_fpn failed to load +Original Error: CUDA out of memory. Tried to allocate 3966.30 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.40 GiB is free. Including non-PyTorch memory, this process has 1.75 GiB memory in use. Of the allocated memory 1.11 GiB is allocated by PyTorch, and 99.07 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 747, in forward + pred_instances = self._forward_box(features, proposals) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 798, in _forward_box + box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 261, in forward + output.index_put_((inds,), pooler(x[level], pooler_fmt_boxes_level)) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 236, in roi_align + return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 168, in _roi_align + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) # [K, C, PH, PW, IY, IX] + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 62, in _bilinear_interpolate + v1 = masked_index(y_low, x_low) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 55, in masked_index + return input[ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3966.30 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.40 GiB is free. Including non-PyTorch memory, this process has 1.75 GiB memory in use. Of the allocated memory 1.11 GiB is allocated by PyTorch, and 99.07 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s] loading model: 0it [00:11, ?it/s] +dlrm +cuda eval dlrm int8weightonly-bs1-acc +AUTOTUNE mixed_mm(1x512, 512x512) + triton_mm_5 0.0112 ms 100.0% + triton_mm_8 0.0125 ms 89.3% + triton_mm_6 0.0129 ms 86.6% + triton_mm_4 0.0134 ms 83.7% + triton_mm_9 0.0137 ms 81.6% + triton_mm_2 0.0156 ms 71.9% + triton_mm_1 0.0162 ms 69.2% + triton_mm_3 0.0184 ms 61.0% + triton_mm_0 0.0231 ms 48.4% + triton_mm_10 0.0243 ms 46.1% +SingleProcess AUTOTUNE takes 3.7067 seconds +AUTOTUNE mixed_mm(1x512, 512x64) + triton_mm_14 0.0102 ms 100.0% + triton_mm_18 0.0120 ms 85.3% + triton_mm_17 0.0120 ms 84.8% + triton_mm_15 0.0127 ms 80.4% + triton_mm_13 0.0132 ms 77.1% + triton_mm_12 0.0154 ms 66.2% + triton_mm_16 0.0201 ms 50.7% + triton_mm_19 0.0220 ms 46.3% + triton_mm_11 0.0222 ms 46.0% + fallback_mixed_mm 0.0718 ms 14.2% +SingleProcess AUTOTUNE takes 3.0239 seconds +AUTOTUNE mixed_mm(1x100, 100x1024) + triton_mm_32 0.0083 ms 100.0% + triton_mm_37 0.0083 ms 99.6% + triton_mm_33 0.0084 ms 98.9% + triton_mm_34 0.0086 ms 95.6% + triton_mm_30 0.0092 ms 89.6% + triton_mm_29 0.0093 ms 89.0% + triton_mm_36 0.0093 ms 89.0% + triton_mm_31 0.0097 ms 84.9% + triton_mm_28 0.0107 ms 77.0% + triton_mm_38 0.0108 ms 76.3% +SingleProcess AUTOTUNE takes 3.7915 seconds +AUTOTUNE mixed_mm(1x1024, 1024x1024) + triton_mm_44 0.0167 ms 100.0% + triton_mm_47 0.0192 ms 87.0% + triton_mm_43 0.0194 ms 86.1% + triton_mm_45 0.0194 ms 86.1% + triton_mm_48 0.0214 ms 77.9% + triton_mm_41 0.0253 ms 65.8% + triton_mm_40 0.0264 ms 63.2% + triton_mm_42 0.0302 ms 55.1% + triton_mm_39 0.0414 ms 40.3% + triton_mm_49 0.0447 ms 37.3% +SingleProcess AUTOTUNE takes 3.9345 seconds +AUTOTUNE mixed_mm(1x1024, 1024x1) + triton_mm_65 0.0125 ms 100.0% + triton_mm_63 0.0139 ms 89.7% + triton_mm_64 0.0145 ms 86.3% + triton_mm_66 0.0147 ms 85.0% + triton_mm_62 0.0161 ms 77.4% + triton_mm_61 0.0251 ms 49.8% + triton_mm_67 0.0303 ms 41.2% + fallback_mixed_mm 0.0628 ms 19.9% +SingleProcess AUTOTUNE takes 2.1651 seconds +pass-sqnr-nan + loading model: 0it [00:00, ?it/s]WARNING:common:Model doctr_det_predictor does not support bfloat16, running with amp instead + loading model: 0it [00:05, ?it/s] +WARNING:common:Model doctr_det_predictor does not support bfloat16, running with amp instead +doctr_det_predictor +cuda eval doctr_det_predictor int8weightonly-bs1-acc +WARNING:common:Model doctr_det_predictor does not support bfloat16, running with amp instead +[2023-12-13 01:21:07,274] [2/0_1] torch._inductor.utils: [WARNING] DeviceCopy in input program +skipping cudagraphs due to ['non-cuda device in graph'] +Fatal glibc error: malloc.c:2496 (sysmalloc): assertion failed: (old_top == initial_top (av) && old_size == 0) || ((unsigned long) (old_size) >= MINSIZE && prev_inuse (old_top) && ((unsigned long) old_end & (pagesize - 1)) == 0) +Run failed with return code: -6 +Output: None +Error: None + loading model: 0it [00:00, ?it/s]WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead + loading model: 0it [00:06, ?it/s] +WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead +doctr_reco_predictor +cuda eval doctr_reco_predictor int8weightonly-bs1-acc +WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead +AUTOTUNE mixed_mm(32x256, 256x124) + triton_mm_95 0.0091 ms 100.0% + triton_mm_98 0.0094 ms 97.4% + triton_mm_99 0.0095 ms 96.5% + triton_mm_92 0.0100 ms 90.9% + triton_mm_94 0.0104 ms 87.6% + triton_mm_91 0.0105 ms 86.8% + triton_mm_96 0.0106 ms 86.5% + triton_mm_93 0.0121 ms 75.3% + triton_mm_100 0.0135 ms 67.7% + triton_mm_90 0.0137 ms 66.9% +SingleProcess AUTOTUNE takes 4.3295 seconds +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +drq +cuda eval drq int8weightonly-bs1-acc +WARNING:common:fp64 golden ref were not generated for drq. Setting accuracy check to cosine +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 36, in + run() + File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run + benchmark.run(bm_args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run + main(TorchBenchmarkRunner(), original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main + process_entry(0, runner, original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry + return maybe_fresh_cache( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner + return fn(*args, **kwargs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3642, in run + runner.run_one_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 2518, in run_one_model + status = self.check_accuracy( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 2147, in check_accuracy + model, example_inputs = self.maybe_cast(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1937, in maybe_cast + model = self.deepcopy_model(model) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1887, in deepcopy_model + return copy.deepcopy(model) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 172, in deepcopy + y = _reconstruct(x, memo, *rv) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 271, in _reconstruct + state = deepcopy(state, memo) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 146, in deepcopy + y = copier(x, memo) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 231, in _deepcopy_dict + y[deepcopy(key, memo)] = deepcopy(value, memo) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 172, in deepcopy + y = _reconstruct(x, memo, *rv) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 297, in _reconstruct + value = deepcopy(value, memo) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 172, in deepcopy + y = _reconstruct(x, memo, *rv) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 271, in _reconstruct + state = deepcopy(state, memo) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 146, in deepcopy + y = copier(x, memo) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 231, in _deepcopy_dict + y[deepcopy(key, memo)] = deepcopy(value, memo) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 146, in deepcopy + y = copier(x, memo) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 231, in _deepcopy_dict + y[deepcopy(key, memo)] = deepcopy(value, memo) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 153, in deepcopy + y = copier(memo) + File "/home/cdhernandez/local/pytorch/torch/_tensor.py", line 86, in __deepcopy__ + raise RuntimeError( +RuntimeError: Only Tensors created explicitly by the user (graph leaves) support the deepcopy protocol at the moment. If you were attempting to deepcopy a module, this may be because of a torch.nn.utils.weight_norm usage, see https://github.com/pytorch/pytorch/pull/103001 +Run failed with return code: 1 +Output: None +Error: None + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +fastNLP_Bert +cuda eval fastNLP_Bert int8weightonly-bs1-acc +AUTOTUNE mixed_mm(475x768, 768x768) + triton_mm_8 0.0219 ms 100.0% + triton_mm_4 0.0244 ms 89.7% + triton_mm_6 0.0247 ms 88.6% + triton_mm_5 0.0253 ms 86.5% + triton_mm_3 0.0261 ms 83.9% + triton_mm_2 0.0267 ms 82.2% + triton_mm_9 0.0269 ms 81.5% + triton_mm_1 0.0270 ms 81.3% + triton_mm_0 0.0334 ms 65.6% + triton_mm_10 0.0365 ms 60.1% +SingleProcess AUTOTUNE takes 5.3742 seconds +AUTOTUNE mixed_mm(475x768, 768x3072) + triton_mm_45 0.0378 ms 100.0% + triton_mm_46 0.0381 ms 99.2% + triton_mm_48 0.0381 ms 99.2% + triton_mm_44 0.0409 ms 92.3% + triton_mm_47 0.0424 ms 89.1% + triton_mm_51 0.0608 ms 62.2% + triton_mm_54 0.0615 ms 61.4% + triton_mm_52 0.0625 ms 60.5% + triton_mm_50 0.0632 ms 59.8% + fallback_mixed_mm 0.0668 ms 56.6% +SingleProcess AUTOTUNE takes 4.6660 seconds +AUTOTUNE mixed_mm(475x3072, 3072x768) + triton_mm_63 0.0644 ms 100.0% + fallback_mixed_mm 0.0689 ms 93.4% + triton_mm_59 0.0753 ms 85.5% + triton_mm_60 0.0774 ms 83.1% + triton_mm_61 0.0781 ms 82.4% + triton_mm_64 0.0795 ms 81.0% + triton_mm_58 0.0831 ms 77.4% + triton_mm_56 0.0851 ms 75.7% + triton_mm_57 0.0860 ms 74.9% + triton_mm_55 0.1131 ms 56.9% +SingleProcess AUTOTUNE takes 5.1943 seconds +AUTOTUNE mixed_mm(1x768, 768x768) + triton_mm_797 0.0144 ms 100.0% + triton_mm_800 0.0159 ms 90.7% + triton_mm_798 0.0161 ms 89.3% + triton_mm_796 0.0167 ms 86.0% + triton_mm_801 0.0184 ms 78.1% + triton_mm_794 0.0208 ms 69.1% + triton_mm_793 0.0216 ms 66.6% + triton_mm_795 0.0247 ms 58.4% + triton_mm_792 0.0325 ms 44.2% + triton_mm_802 0.0346 ms 41.6% +SingleProcess AUTOTUNE takes 3.9563 seconds +AUTOTUNE mixed_mm(473x768, 768x2) + triton_mm_812 0.0182 ms 100.0% + triton_mm_811 0.0224 ms 81.4% + triton_mm_804 0.0285 ms 63.8% + triton_mm_809 0.0294 ms 61.9% + triton_mm_808 0.0298 ms 61.1% + triton_mm_806 0.0301 ms 60.5% + triton_mm_813 0.0342 ms 53.3% + triton_mm_805 0.0344 ms 52.9% + triton_mm_807 0.0357 ms 51.0% + triton_mm_803 0.0401 ms 45.4% +SingleProcess AUTOTUNE takes 4.1810 seconds +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:01, ?it/s] +functorch_dp_cifar10 +cuda eval functorch_dp_cifar10 int8weightonly-bs1-acc +AUTOTUNE mixed_mm(1x512, 512x1000) + triton_mm_114 0.0119 ms 100.0% + triton_mm_117 0.0128 ms 93.0% + triton_mm_115 0.0130 ms 91.4% + triton_mm_113 0.0134 ms 88.9% + triton_mm_118 0.0141 ms 84.2% + triton_mm_111 0.0162 ms 73.4% + triton_mm_110 0.0166 ms 71.9% + triton_mm_112 0.0188 ms 63.2% + triton_mm_109 0.0239 ms 49.8% + triton_mm_119 0.0244 ms 48.9% +SingleProcess AUTOTUNE takes 3.9432 seconds +pass-sqnr-38.030 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:00, ?it/s] +functorch_maml_omniglot +cuda eval functorch_maml_omniglot int8weightonly-bs1-acc +AUTOTUNE mixed_mm(5x64, 64x5) + triton_mm_23 0.0067 ms 100.0% + triton_mm_24 0.0068 ms 99.5% + triton_mm_20 0.0072 ms 93.7% + triton_mm_22 0.0072 ms 93.7% + triton_mm_21 0.0076 ms 88.2% + triton_mm_19 0.0079 ms 85.4% + triton_mm_25 0.0084 ms 79.5% + fallback_mixed_mm 0.0625 ms 10.7% +SingleProcess AUTOTUNE takes 2.5989 seconds +pass-sqnr-38.578 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +hf_Albert +cuda eval hf_Albert int8weightonly-bs1-acc +AUTOTUNE mixed_mm(512x128, 128x768) + triton_mm_6 0.0098 ms 100.0% + triton_mm_4 0.0099 ms 99.0% + triton_mm_3 0.0101 ms 96.8% + triton_mm_8 0.0102 ms 96.2% + triton_mm_9 0.0102 ms 95.9% + triton_mm_5 0.0103 ms 95.0% + triton_mm_2 0.0108 ms 90.5% + triton_mm_0 0.0108 ms 90.3% + triton_mm_1 0.0108 ms 90.3% + triton_mm_10 0.0118 ms 83.2% +SingleProcess AUTOTUNE takes 4.9389 seconds +AUTOTUNE mixed_mm(512x768, 768x768) + triton_mm_19 0.0225 ms 100.0% + triton_mm_15 0.0240 ms 93.7% + triton_mm_17 0.0249 ms 90.3% + triton_mm_16 0.0254 ms 88.5% + triton_mm_14 0.0255 ms 88.4% + triton_mm_20 0.0261 ms 86.3% + triton_mm_12 0.0265 ms 85.0% + triton_mm_13 0.0266 ms 84.6% + triton_mm_11 0.0336 ms 67.0% + triton_mm_21 0.0366 ms 61.5% +SingleProcess AUTOTUNE takes 5.0914 seconds +AUTOTUNE mixed_mm(512x768, 768x3072) + triton_mm_56 0.0374 ms 100.0% + triton_mm_57 0.0381 ms 98.1% + triton_mm_59 0.0382 ms 97.8% + triton_mm_58 0.0416 ms 89.9% + triton_mm_55 0.0417 ms 89.7% + triton_mm_62 0.0610 ms 61.4% + triton_mm_63 0.0619 ms 60.4% + fallback_mixed_mm 0.0651 ms 57.4% + triton_mm_60 0.0680 ms 55.0% + triton_mm_61 0.0701 ms 53.3% +SingleProcess AUTOTUNE takes 4.7835 seconds +AUTOTUNE mixed_mm(512x3072, 3072x768) + triton_mm_74 0.0647 ms 100.0% + fallback_mixed_mm 0.0700 ms 92.4% + triton_mm_70 0.0740 ms 87.3% + triton_mm_71 0.0774 ms 83.5% + triton_mm_72 0.0784 ms 82.5% + triton_mm_75 0.0796 ms 81.3% + triton_mm_69 0.0826 ms 78.3% + triton_mm_68 0.0849 ms 76.2% + triton_mm_67 0.0854 ms 75.7% + triton_mm_66 0.1131 ms 57.2% +SingleProcess AUTOTUNE takes 5.1298 seconds +AUTOTUNE mixed_mm(512x768, 768x128) + triton_mm_812 0.0167 ms 100.0% + triton_mm_809 0.0171 ms 98.1% + triton_mm_808 0.0178 ms 94.1% + triton_mm_811 0.0214 ms 78.1% + triton_mm_807 0.0243 ms 69.0% + triton_mm_806 0.0259 ms 64.7% + triton_mm_805 0.0266 ms 62.9% + triton_mm_804 0.0269 ms 62.2% + triton_mm_803 0.0336 ms 49.8% + triton_mm_813 0.0341 ms 49.0% +SingleProcess AUTOTUNE takes 5.5908 seconds +AUTOTUNE mixed_mm(512x128, 128x30000) + triton_mm_815 0.0608 ms 100.0% + triton_mm_816 0.0620 ms 98.1% + triton_mm_814 0.0665 ms 91.5% + triton_mm_818 0.0697 ms 87.3% + triton_mm_821 0.0728 ms 83.5% + triton_mm_817 0.0732 ms 83.1% + fallback_mixed_mm 0.0886 ms 68.6% + triton_mm_823 0.0997 ms 61.0% + triton_mm_824 0.1010 ms 60.3% + triton_mm_819 0.1155 ms 52.7% +SingleProcess AUTOTUNE takes 4.9492 seconds +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:07, ?it/s] +hf_Bart +cuda eval hf_Bart int8weightonly-bs1-acc +WARNING:common:fp64 golden ref were not generated for hf_Bart. Setting accuracy check to cosine +AUTOTUNE mm(512x768, 768x50272) + mm 0.1905 ms 100.0% + triton_mm_1201 0.2281 ms 83.5% + triton_mm_1202 0.2318 ms 82.2% + triton_mm_1203 0.2669 ms 71.4% + triton_mm_1204 0.2733 ms 69.7% + triton_mm_1200 0.3125 ms 61.0% + triton_mm_1208 0.3228 ms 59.0% + triton_mm_1210 0.3991 ms 47.7% + triton_mm_1207 0.4509 ms 42.2% + triton_mm_1205 0.5708 ms 33.4% +SingleProcess AUTOTUNE takes 5.2384 seconds +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:06, ?it/s] +hf_BigBird +cuda eval hf_BigBird int8weightonly-bs1-acc +[2023-12-13 01:27:01,871] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +AUTOTUNE mixed_mm(4096x768, 768x768) + triton_mm_1 0.0675 ms 100.0% + triton_mm_2 0.0680 ms 99.2% + triton_mm_4 0.0695 ms 97.1% + fallback_mixed_mm 0.0706 ms 95.5% + triton_mm_7 0.0748 ms 90.2% + triton_mm_0 0.0751 ms 89.9% + triton_mm_3 0.0759 ms 88.8% + triton_mm_8 0.1147 ms 58.8% + triton_mm_5 0.1174 ms 57.4% + triton_mm_6 0.1224 ms 55.1% +SingleProcess AUTOTUNE takes 4.9808 seconds +AUTOTUNE mixed_mm(4096x768, 768x3072) + fallback_mixed_mm 0.1132 ms 100.0% + triton_mm_225 0.2185 ms 51.8% + triton_mm_226 0.2204 ms 51.4% + triton_mm_224 0.2309 ms 49.0% + triton_mm_228 0.2357 ms 48.0% + triton_mm_227 0.2583 ms 43.8% + triton_mm_231 0.2636 ms 42.9% + triton_mm_232 0.3951 ms 28.6% + triton_mm_229 0.4188 ms 27.0% + triton_mm_234 0.4244 ms 26.7% +SingleProcess AUTOTUNE takes 4.9460 seconds +AUTOTUNE mixed_mm(4096x3072, 3072x768) + fallback_mixed_mm 0.1041 ms 100.0% + triton_mm_236 0.2408 ms 43.2% + triton_mm_237 0.2440 ms 42.7% + triton_mm_239 0.2460 ms 42.3% + triton_mm_235 0.2652 ms 39.2% + triton_mm_242 0.2701 ms 38.5% + triton_mm_238 0.2732 ms 38.1% + triton_mm_243 0.4149 ms 25.1% + triton_mm_240 0.4306 ms 24.2% + triton_mm_241 0.4518 ms 23.0% +SingleProcess AUTOTUNE takes 4.8512 seconds +[2023-12-13 01:27:19,248] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 01:27:20,819] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 01:27:22,429] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 01:27:24,394] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 01:27:25,999] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 01:27:27,638] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 01:27:29,195] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 01:27:30,764] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 01:27:32,323] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 01:27:33,892] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 01:27:35,803] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +AUTOTUNE mm(4096x768, 768x50360) + mm 1.6335 ms 100.0% + triton_mm_2965 2.0719 ms 78.8% + triton_mm_2964 2.2810 ms 71.6% + triton_mm_2967 2.3612 ms 69.2% + triton_mm_2966 2.4274 ms 67.3% + triton_mm_2963 2.8826 ms 56.7% + triton_mm_2971 2.9188 ms 56.0% + triton_mm_2970 3.4341 ms 47.6% + triton_mm_2973 3.4409 ms 47.5% + triton_mm_2969 4.9185 ms 33.2% +SingleProcess AUTOTUNE takes 5.5039 seconds +skipping cudagraphs due to ['non-cuda device in graph'] +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:05, ?it/s] +hf_DistilBert +cuda eval hf_DistilBert int8weightonly-bs1-acc +AUTOTUNE mixed_mm(512x768, 768x30522) + triton_mm_553 0.3457 ms 100.0% + fallback_mixed_mm 0.3552 ms 97.3% + triton_mm_555 0.3875 ms 89.2% + triton_mm_551 0.3951 ms 87.5% + triton_mm_554 0.3963 ms 87.2% + triton_mm_552 0.3977 ms 86.9% + triton_mm_558 0.5228 ms 66.1% + triton_mm_560 0.5654 ms 61.1% + triton_mm_559 0.5891 ms 58.7% + triton_mm_561 0.5935 ms 58.2% +SingleProcess AUTOTUNE takes 5.3925 seconds +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:05, ?it/s] +hf_GPT2 +cuda eval hf_GPT2 int8weightonly-bs1-acc +AUTOTUNE mm(1024x768, 768x50264) + mm 0.4031 ms 100.0% + triton_mm_866 0.5013 ms 80.4% + triton_mm_865 0.5538 ms 72.8% + triton_mm_868 0.5661 ms 71.2% + triton_mm_867 0.5942 ms 67.8% + triton_mm_864 0.7043 ms 57.2% + triton_mm_872 0.7119 ms 56.6% + triton_mm_874 0.8612 ms 46.8% + triton_mm_871 0.8826 ms 45.7% + triton_mm_870 1.2103 ms 33.3% +SingleProcess AUTOTUNE takes 4.9893 seconds +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:18, ?it/s] +hf_GPT2_large +cuda eval hf_GPT2_large int8weightonly-bs1-acc +pass_due_to_skip + loading model: 0it [00:00, ?it/s] loading model: 0it [00:06, ?it/s] +hf_Longformer +cuda eval hf_Longformer int8weightonly-bs1-acc +AUTOTUNE mm(4096x768, 768x50272) + mm 1.5226 ms 100.0% + triton_mm_1092 1.8732 ms 81.3% + triton_mm_1093 1.8884 ms 80.6% + triton_mm_1094 2.2286 ms 68.3% + triton_mm_1095 2.2553 ms 67.5% + triton_mm_1091 2.5500 ms 59.7% + triton_mm_1099 2.6541 ms 57.4% + triton_mm_1101 3.1172 ms 48.8% + triton_mm_1098 3.3088 ms 46.0% + triton_mm_1096 4.6139 ms 33.0% +SingleProcess AUTOTUNE takes 5.0703 seconds +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +hf_Reformer +cuda eval hf_Reformer int8weightonly-bs1-acc +AUTOTUNE mixed_mm(4096x256, 256x768) + triton_mm_1 0.0287 ms 100.0% + triton_mm_2 0.0290 ms 99.1% + triton_mm_4 0.0300 ms 95.6% + triton_mm_0 0.0314 ms 91.6% + triton_mm_7 0.0316 ms 91.0% + triton_mm_3 0.0319 ms 90.0% + triton_mm_8 0.0480 ms 59.9% + triton_mm_10 0.0481 ms 59.8% + triton_mm_5 0.0487 ms 59.0% + triton_mm_6 0.0490 ms 58.7% +SingleProcess AUTOTUNE takes 4.8995 seconds +AUTOTUNE mixed_mm(4096x768, 768x256) + triton_mm_56 0.0373 ms 100.0% + triton_mm_57 0.0386 ms 96.6% + triton_mm_59 0.0390 ms 95.6% + triton_mm_55 0.0400 ms 93.3% + triton_mm_58 0.0424 ms 87.9% + triton_mm_60 0.0467 ms 79.9% + triton_mm_61 0.0477 ms 78.2% + triton_mm_63 0.0492 ms 75.9% + triton_mm_65 0.0507 ms 73.6% + triton_mm_64 0.0515 ms 72.5% +SingleProcess AUTOTUNE takes 5.1317 seconds +AUTOTUNE mixed_mm(4096x256, 256x512) + triton_mm_66 0.0223 ms 100.0% + triton_mm_67 0.0234 ms 95.1% + triton_mm_68 0.0235 ms 95.0% + triton_mm_70 0.0248 ms 89.9% + triton_mm_69 0.0257 ms 86.6% + triton_mm_73 0.0305 ms 73.0% + triton_mm_74 0.0339 ms 65.7% + triton_mm_76 0.0351 ms 63.5% + triton_mm_71 0.0353 ms 63.0% + triton_mm_72 0.0358 ms 62.3% +SingleProcess AUTOTUNE takes 4.8928 seconds +AUTOTUNE mixed_mm(4096x512, 512x256) + triton_mm_78 0.0279 ms 100.0% + triton_mm_79 0.0280 ms 99.7% + triton_mm_81 0.0284 ms 98.1% + triton_mm_77 0.0291 ms 95.9% + triton_mm_80 0.0307 ms 90.8% + triton_mm_82 0.0334 ms 83.4% + triton_mm_83 0.0348 ms 80.2% + triton_mm_85 0.0361 ms 77.3% + triton_mm_87 0.0367 ms 76.0% + triton_mm_86 0.0377 ms 74.1% +SingleProcess AUTOTUNE takes 4.7122 seconds +skipping cudagraphs due to ['incompatible ops'] +AUTOTUNE mixed_mm(4096x512, 512x320) + triton_mm_532 0.0280 ms 100.0% + triton_mm_531 0.0285 ms 98.4% + triton_mm_533 0.0285 ms 98.3% + triton_mm_535 0.0288 ms 97.4% + triton_mm_534 0.0317 ms 88.4% + triton_mm_539 0.0357 ms 78.6% + triton_mm_536 0.0387 ms 72.5% + triton_mm_537 0.0390 ms 71.8% + triton_mm_541 0.0429 ms 65.4% + triton_mm_540 0.0433 ms 64.7% +SingleProcess AUTOTUNE takes 5.4091 seconds +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:06, ?it/s] +hf_T5 +cuda eval hf_T5 int8weightonly-bs1-acc +AUTOTUNE mixed_mm(2048x512, 512x512) + triton_mm_1 0.0278 ms 100.0% + triton_mm_2 0.0282 ms 98.4% + triton_mm_4 0.0287 ms 96.8% + triton_mm_0 0.0289 ms 96.1% + triton_mm_3 0.0304 ms 91.3% + triton_mm_5 0.0334 ms 83.1% + triton_mm_6 0.0347 ms 80.0% + triton_mm_8 0.0361 ms 77.0% + triton_mm_10 0.0362 ms 76.7% + triton_mm_9 0.0373 ms 74.4% +SingleProcess AUTOTUNE takes 4.8069 seconds +AUTOTUNE mixed_mm(2048x512, 512x2048) + triton_mm_148 0.0563 ms 100.0% + triton_mm_149 0.0575 ms 97.9% + triton_mm_151 0.0600 ms 93.8% + triton_mm_147 0.0620 ms 90.8% + triton_mm_150 0.0657 ms 85.7% + fallback_mixed_mm 0.0784 ms 71.8% + triton_mm_154 0.0809 ms 69.6% + triton_mm_155 0.1007 ms 55.9% + triton_mm_152 0.1030 ms 54.7% + triton_mm_153 0.1067 ms 52.8% +SingleProcess AUTOTUNE takes 4.8737 seconds +AUTOTUNE mixed_mm(2048x2048, 2048x512) + fallback_mixed_mm 0.0709 ms 100.0% + triton_mm_159 0.0849 ms 83.5% + triton_mm_160 0.0881 ms 80.4% + triton_mm_162 0.0890 ms 79.6% + triton_mm_158 0.0918 ms 77.2% + triton_mm_161 0.0981 ms 72.2% + triton_mm_163 0.1074 ms 66.0% + triton_mm_164 0.1116 ms 63.5% + triton_mm_166 0.1124 ms 63.0% + triton_mm_167 0.1156 ms 61.3% +SingleProcess AUTOTUNE takes 4.7793 seconds +AUTOTUNE mixed_mm(2048x512, 512x32128) + fallback_mixed_mm 0.3977 ms 100.0% + triton_mm_1489 0.7151 ms 55.6% + triton_mm_1490 0.7208 ms 55.2% + triton_mm_1492 0.7767 ms 51.2% + triton_mm_1488 0.8018 ms 49.6% + triton_mm_1495 0.8398 ms 47.4% + triton_mm_1491 0.8551 ms 46.5% + triton_mm_1496 1.3933 ms 28.5% + triton_mm_1493 1.4680 ms 27.1% + triton_mm_1498 1.4773 ms 26.9% +SingleProcess AUTOTUNE takes 4.9429 seconds +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:07, ?it/s] +hf_T5_base +cuda eval hf_T5_base int8weightonly-bs1-acc +AUTOTUNE mixed_mm(2048x768, 768x768) + triton_mm_1 0.0380 ms 100.0% + triton_mm_2 0.0383 ms 99.2% + triton_mm_4 0.0392 ms 97.0% + triton_mm_0 0.0410 ms 92.9% + triton_mm_3 0.0419 ms 90.8% + triton_mm_7 0.0613 ms 62.1% + triton_mm_8 0.0624 ms 61.0% + fallback_mixed_mm 0.0649 ms 58.7% + triton_mm_5 0.0676 ms 56.3% + triton_mm_6 0.0698 ms 54.5% +SingleProcess AUTOTUNE takes 4.9006 seconds +AUTOTUNE mixed_mm(2048x768, 768x3072) + fallback_mixed_mm 0.0890 ms 100.0% + triton_mm_148 0.1209 ms 73.6% + triton_mm_149 0.1224 ms 72.7% + triton_mm_147 0.1267 ms 70.2% + triton_mm_151 0.1291 ms 68.9% + triton_mm_154 0.1388 ms 64.1% + triton_mm_150 0.1417 ms 62.8% + triton_mm_155 0.2086 ms 42.7% + triton_mm_152 0.2179 ms 40.8% + triton_mm_157 0.2213 ms 40.2% +SingleProcess AUTOTUNE takes 5.1590 seconds +AUTOTUNE mixed_mm(2048x3072, 3072x768) + fallback_mixed_mm 0.0902 ms 100.0% + triton_mm_159 0.1245 ms 72.5% + triton_mm_160 0.1289 ms 70.0% + triton_mm_162 0.1302 ms 69.3% + triton_mm_158 0.1416 ms 63.7% + triton_mm_161 0.1428 ms 63.2% + triton_mm_166 0.2147 ms 42.0% + triton_mm_165 0.2202 ms 41.0% + triton_mm_163 0.2396 ms 37.7% + triton_mm_167 0.2498 ms 36.1% +SingleProcess AUTOTUNE takes 5.0287 seconds +AUTOTUNE mixed_mm(2048x768, 768x32128) + fallback_mixed_mm 0.5653 ms 100.0% + triton_mm_2977 1.0501 ms 53.8% + triton_mm_2978 1.0560 ms 53.5% + triton_mm_2980 1.1348 ms 49.8% + triton_mm_2976 1.1724 ms 48.2% + triton_mm_2983 1.2327 ms 45.9% + triton_mm_2979 1.2503 ms 45.2% + triton_mm_2984 2.0000 ms 28.3% + triton_mm_2981 2.1374 ms 26.4% + triton_mm_2986 2.1792 ms 25.9% +SingleProcess AUTOTUNE takes 5.1252 seconds +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:13, ?it/s] +hf_T5_generate +cuda eval hf_T5_generate int8weightonly-bs1-acc +AUTOTUNE mixed_mm(1x512, 512x512) + triton_mm_545 0.0115 ms 100.0% + triton_mm_546 0.0129 ms 88.6% + triton_mm_548 0.0132 ms 86.7% + triton_mm_544 0.0135 ms 85.0% + triton_mm_549 0.0138 ms 83.1% + triton_mm_542 0.0157 ms 72.8% + triton_mm_541 0.0161 ms 71.0% + triton_mm_543 0.0184 ms 62.4% + triton_mm_540 0.0231 ms 49.7% + triton_mm_550 0.0251 ms 45.7% +SingleProcess AUTOTUNE takes 4.5849 seconds +AUTOTUNE mixed_mm(1x512, 512x2048) + triton_mm_668 0.0130 ms 100.0% + triton_mm_670 0.0130 ms 100.0% + triton_mm_672 0.0144 ms 89.8% + triton_mm_666 0.0164 ms 79.1% + triton_mm_669 0.0164 ms 78.8% + triton_mm_665 0.0172 ms 75.3% + triton_mm_667 0.0186 ms 69.6% + triton_mm_673 0.0196 ms 66.0% + triton_mm_664 0.0246 ms 52.7% + triton_mm_674 0.0258 ms 50.2% +SingleProcess AUTOTUNE takes 4.0798 seconds +AUTOTUNE mixed_mm(1x2048, 2048x512) + triton_mm_680 0.0256 ms 100.0% + triton_mm_683 0.0300 ms 85.1% + triton_mm_684 0.0322 ms 79.3% + triton_mm_679 0.0327 ms 78.2% + triton_mm_681 0.0328 ms 78.0% + triton_mm_677 0.0414 ms 61.8% + triton_mm_676 0.0444 ms 57.6% + triton_mm_678 0.0535 ms 47.8% + triton_mm_675 0.0730 ms 35.0% + fallback_mixed_mm 0.0760 ms 33.7% +SingleProcess AUTOTUNE takes 4.0467 seconds +AUTOTUNE mixed_mm(1x512, 512x32128) + triton_mm_1420 0.0346 ms 100.0% + triton_mm_1422 0.0349 ms 99.3% + triton_mm_1419 0.0374 ms 92.6% + triton_mm_1426 0.0403 ms 86.0% + triton_mm_1423 0.0407 ms 85.0% + triton_mm_1417 0.0523 ms 66.3% + triton_mm_1424 0.0832 ms 41.6% + triton_mm_1418 0.1004 ms 34.5% + triton_mm_1416 0.1031 ms 33.6% + triton_mm_1425 0.1407 ms 24.6% +SingleProcess AUTOTUNE takes 4.1166 seconds +AUTOTUNE bmm(8x1x64, 8x64x53) + triton_bmm_40413 0.0065 ms 100.0% + triton_bmm_40414 0.0065 ms 100.0% + triton_bmm_40411 0.0065 ms 99.5% + triton_bmm_40412 0.0065 ms 99.5% + triton_bmm_40415 0.0070 ms 92.7% + triton_bmm_40416 0.0070 ms 92.7% + triton_bmm_40410 0.0072 ms 89.8% + triton_bmm_40418 0.0075 ms 86.8% + triton_bmm_40417 0.0077 ms 84.2% + bmm 0.0849 ms 7.7% +SingleProcess AUTOTUNE takes 2.9278 seconds +AUTOTUNE bmm(8x1x53, 8x53x64) + triton_bmm_40435 0.0075 ms 100.0% + triton_bmm_40434 0.0077 ms 96.5% + triton_bmm_40430 0.0077 ms 96.3% + triton_bmm_40431 0.0078 ms 95.9% + triton_bmm_40432 0.0078 ms 95.9% + triton_bmm_40436 0.0080 ms 93.6% + triton_bmm_40433 0.0080 ms 92.8% + triton_bmm_40438 0.0082 ms 91.0% + triton_bmm_40437 0.0095 ms 78.5% + bmm 0.0551 ms 13.5% +SingleProcess AUTOTUNE takes 2.6898 seconds +AUTOTUNE bmm(8x1x64, 8x64x54) + triton_bmm_41185 0.0065 ms 100.0% + triton_bmm_41187 0.0065 ms 100.0% + triton_bmm_41186 0.0065 ms 99.5% + triton_bmm_41188 0.0070 ms 92.7% + triton_bmm_41189 0.0070 ms 92.7% + triton_bmm_41184 0.0070 ms 92.3% + triton_bmm_41191 0.0075 ms 86.8% + triton_bmm_41183 0.0077 ms 84.8% + triton_bmm_41190 0.0080 ms 80.9% + bmm 0.0592 ms 11.0% +SingleProcess AUTOTUNE takes 2.8482 seconds +AUTOTUNE bmm(8x1x54, 8x54x64) + triton_bmm_41204 0.0065 ms 100.0% + triton_bmm_41207 0.0065 ms 100.0% + triton_bmm_41205 0.0070 ms 92.3% + triton_bmm_41206 0.0072 ms 89.8% + triton_bmm_41208 0.0076 ms 86.0% + triton_bmm_41203 0.0076 ms 85.7% + triton_bmm_41209 0.0077 ms 83.9% + triton_bmm_41210 0.0085 ms 76.3% + triton_bmm_41211 0.0088 ms 74.1% + bmm 0.0577 ms 11.3% +SingleProcess AUTOTUNE takes 3.0910 seconds +AUTOTUNE bmm(8x1x64, 8x64x55) + triton_bmm_41959 0.0065 ms 100.0% + triton_bmm_41960 0.0065 ms 100.0% + triton_bmm_41957 0.0065 ms 99.5% + triton_bmm_41958 0.0065 ms 99.5% + triton_bmm_41961 0.0070 ms 92.7% + triton_bmm_41962 0.0070 ms 92.7% + triton_bmm_41956 0.0072 ms 89.8% + triton_bmm_41964 0.0075 ms 86.8% + triton_bmm_41963 0.0077 ms 83.9% + bmm 0.0560 ms 11.6% +SingleProcess AUTOTUNE takes 3.2948 seconds +AUTOTUNE bmm(8x1x55, 8x55x64) + triton_bmm_41977 0.0072 ms 100.0% + triton_bmm_41978 0.0072 ms 100.0% + triton_bmm_41980 0.0072 ms 100.0% + triton_bmm_41976 0.0077 ms 93.4% + triton_bmm_41979 0.0080 ms 90.8% + triton_bmm_41981 0.0080 ms 90.0% + triton_bmm_41984 0.0082 ms 87.9% + triton_bmm_41982 0.0085 ms 85.3% + triton_bmm_41983 0.0095 ms 76.4% + bmm 0.0567 ms 12.8% +SingleProcess AUTOTUNE takes 2.9982 seconds +AUTOTUNE bmm(8x1x64, 8x64x56) + triton_bmm_42730 0.0065 ms 100.0% + triton_bmm_42731 0.0065 ms 100.0% + triton_bmm_42733 0.0065 ms 100.0% + triton_bmm_42732 0.0065 ms 99.5% + triton_bmm_42729 0.0072 ms 90.2% + triton_bmm_42735 0.0075 ms 86.4% + triton_bmm_42736 0.0076 ms 84.9% + triton_bmm_42734 0.0078 ms 83.5% + triton_bmm_42737 0.0080 ms 80.9% + bmm 0.0556 ms 11.7% +SingleProcess AUTOTUNE takes 2.6679 seconds +AUTOTUNE bmm(8x1x56, 8x56x64) + triton_bmm_42751 0.0065 ms 100.0% + triton_bmm_42752 0.0065 ms 100.0% + triton_bmm_42753 0.0065 ms 100.0% + triton_bmm_42750 0.0070 ms 93.1% + triton_bmm_42754 0.0070 ms 93.1% + triton_bmm_42749 0.0075 ms 86.2% + triton_bmm_42755 0.0083 ms 78.4% + triton_bmm_42757 0.0085 ms 76.6% + triton_bmm_42756 0.0085 ms 76.0% + bmm 0.0558 ms 11.6% +SingleProcess AUTOTUNE takes 2.9177 seconds +AUTOTUNE bmm(8x1x64, 8x64x57) + triton_bmm_43506 0.0065 ms 100.0% + triton_bmm_43505 0.0071 ms 92.3% + triton_bmm_43504 0.0071 ms 91.5% + triton_bmm_43503 0.0073 ms 89.9% + triton_bmm_43508 0.0076 ms 86.4% + triton_bmm_43502 0.0078 ms 84.0% + triton_bmm_43507 0.0078 ms 84.0% + triton_bmm_43510 0.0080 ms 81.3% + triton_bmm_43509 0.0083 ms 79.1% + bmm 0.0572 ms 11.4% +SingleProcess AUTOTUNE takes 3.0085 seconds +AUTOTUNE bmm(8x1x57, 8x57x64) + triton_bmm_43524 0.0072 ms 100.0% + triton_bmm_43526 0.0072 ms 100.0% + triton_bmm_43522 0.0077 ms 93.4% + triton_bmm_43523 0.0078 ms 93.0% + triton_bmm_43527 0.0078 ms 92.6% + triton_bmm_43525 0.0079 ms 91.9% + triton_bmm_43530 0.0082 ms 87.9% + triton_bmm_43528 0.0085 ms 84.6% + triton_bmm_43529 0.0092 ms 78.7% + bmm 0.0888 ms 8.1% +SingleProcess AUTOTUNE takes 2.7337 seconds +AUTOTUNE bmm(8x1x64, 8x64x58) + triton_bmm_44276 0.0065 ms 100.0% + triton_bmm_44277 0.0065 ms 100.0% + triton_bmm_44281 0.0069 ms 94.0% + triton_bmm_44279 0.0071 ms 91.9% + triton_bmm_44275 0.0072 ms 89.8% + triton_bmm_44280 0.0072 ms 89.8% + triton_bmm_44278 0.0073 ms 89.0% + triton_bmm_44283 0.0075 ms 86.8% + triton_bmm_44282 0.0082 ms 79.0% + bmm 0.0582 ms 11.2% +SingleProcess AUTOTUNE takes 2.9755 seconds +AUTOTUNE bmm(8x1x58, 8x58x64) + triton_bmm_44297 0.0065 ms 100.0% + triton_bmm_44299 0.0065 ms 100.0% + triton_bmm_44296 0.0069 ms 94.0% + triton_bmm_44295 0.0072 ms 89.8% + triton_bmm_44298 0.0073 ms 89.4% + triton_bmm_44300 0.0078 ms 83.5% + triton_bmm_44302 0.0080 ms 81.2% + triton_bmm_44301 0.0085 ms 76.7% + triton_bmm_44303 0.0085 ms 76.6% + bmm 0.0578 ms 11.2% +SingleProcess AUTOTUNE takes 3.0329 seconds +AUTOTUNE bmm(8x1x64, 8x64x59) + triton_bmm_45051 0.0065 ms 100.0% + triton_bmm_45049 0.0065 ms 99.5% + triton_bmm_45053 0.0070 ms 92.7% + triton_bmm_45050 0.0072 ms 90.4% + triton_bmm_45052 0.0072 ms 90.4% + triton_bmm_45048 0.0072 ms 89.8% + triton_bmm_45054 0.0076 ms 86.0% + triton_bmm_45055 0.0077 ms 83.9% + triton_bmm_45056 0.0080 ms 80.9% + bmm 0.0551 ms 11.8% +SingleProcess AUTOTUNE takes 2.7796 seconds +AUTOTUNE bmm(8x1x59, 8x59x64) + triton_bmm_45069 0.0072 ms 100.0% + triton_bmm_45072 0.0072 ms 100.0% + triton_bmm_45070 0.0078 ms 93.0% + triton_bmm_45073 0.0078 ms 92.6% + triton_bmm_45071 0.0080 ms 90.2% + triton_bmm_45068 0.0083 ms 87.3% + triton_bmm_45074 0.0085 ms 84.6% + triton_bmm_45076 0.0088 ms 82.5% + triton_bmm_45075 0.0095 ms 75.8% + bmm 0.0560 ms 12.9% +SingleProcess AUTOTUNE takes 2.9211 seconds +AUTOTUNE bmm(8x1x64, 8x64x60) + triton_bmm_45822 0.0065 ms 100.0% + triton_bmm_45823 0.0070 ms 92.7% + triton_bmm_45827 0.0070 ms 92.7% + triton_bmm_45825 0.0071 ms 92.1% + triton_bmm_45824 0.0072 ms 90.2% + triton_bmm_45826 0.0072 ms 89.8% + triton_bmm_45829 0.0075 ms 86.8% + triton_bmm_45828 0.0076 ms 84.9% + triton_bmm_45821 0.0078 ms 83.5% + bmm 0.0614 ms 10.6% +SingleProcess AUTOTUNE takes 2.7332 seconds +AUTOTUNE bmm(8x1x60, 8x60x64) + triton_bmm_45843 0.0065 ms 100.0% + triton_bmm_45845 0.0065 ms 100.0% + triton_bmm_45846 0.0070 ms 92.7% + triton_bmm_45842 0.0070 ms 92.3% + triton_bmm_45844 0.0070 ms 92.3% + triton_bmm_45841 0.0076 ms 85.3% + triton_bmm_45847 0.0083 ms 78.1% + triton_bmm_45848 0.0085 ms 76.3% + triton_bmm_45849 0.0090 ms 72.0% + bmm 0.0577 ms 11.3% +SingleProcess AUTOTUNE takes 2.7150 seconds +AUTOTUNE bmm(8x1x64, 8x64x61) + triton_bmm_46597 0.0065 ms 100.0% + triton_bmm_46598 0.0065 ms 99.5% + triton_bmm_46600 0.0070 ms 92.7% + triton_bmm_46594 0.0072 ms 89.8% + triton_bmm_46595 0.0073 ms 89.4% + triton_bmm_46596 0.0073 ms 89.4% + triton_bmm_46602 0.0075 ms 86.8% + triton_bmm_46599 0.0075 ms 86.4% + triton_bmm_46601 0.0083 ms 78.4% + bmm 0.0650 ms 10.0% +SingleProcess AUTOTUNE takes 3.0747 seconds +AUTOTUNE bmm(8x1x61, 8x61x64) + triton_bmm_46616 0.0072 ms 100.0% + triton_bmm_46615 0.0073 ms 99.8% + triton_bmm_46614 0.0077 ms 93.6% + triton_bmm_46618 0.0078 ms 93.2% + triton_bmm_46619 0.0078 ms 92.8% + triton_bmm_46617 0.0079 ms 91.7% + triton_bmm_46620 0.0080 ms 91.0% + triton_bmm_46622 0.0088 ms 82.7% + triton_bmm_46621 0.0098 ms 74.3% + bmm 0.0565 ms 12.8% +SingleProcess AUTOTUNE takes 2.8254 seconds +AUTOTUNE bmm(8x1x64, 8x64x62) + triton_bmm_47368 0.0065 ms 100.0% + triton_bmm_47371 0.0065 ms 100.0% + triton_bmm_47370 0.0065 ms 99.5% + triton_bmm_47373 0.0070 ms 93.1% + triton_bmm_47369 0.0070 ms 92.3% + triton_bmm_47367 0.0072 ms 89.8% + triton_bmm_47374 0.0077 ms 83.9% + triton_bmm_47372 0.0078 ms 83.5% + triton_bmm_47375 0.0080 ms 80.9% + bmm 0.0598 ms 10.9% +SingleProcess AUTOTUNE takes 3.0708 seconds +AUTOTUNE bmm(8x1x62, 8x62x64) + triton_bmm_47388 0.0065 ms 100.0% + triton_bmm_47389 0.0065 ms 100.0% + triton_bmm_47390 0.0065 ms 99.5% + triton_bmm_47391 0.0070 ms 92.7% + triton_bmm_47392 0.0070 ms 92.7% + triton_bmm_47387 0.0072 ms 89.8% + triton_bmm_47393 0.0083 ms 78.4% + triton_bmm_47395 0.0085 ms 76.6% + triton_bmm_47394 0.0085 ms 76.0% + bmm 0.0571 ms 11.4% +SingleProcess AUTOTUNE takes 2.9381 seconds +AUTOTUNE bmm(8x1x64, 8x64x63) + triton_bmm_48141 0.0065 ms 100.0% + triton_bmm_48143 0.0070 ms 93.4% + triton_bmm_48142 0.0071 ms 91.5% + triton_bmm_48144 0.0072 ms 90.5% + triton_bmm_48145 0.0072 ms 90.3% + triton_bmm_48148 0.0075 ms 87.2% + triton_bmm_48146 0.0075 ms 86.8% + triton_bmm_48147 0.0077 ms 84.3% + triton_bmm_48140 0.0078 ms 84.0% + bmm 0.0877 ms 7.4% +SingleProcess AUTOTUNE takes 3.2981 seconds +AUTOTUNE bmm(8x1x63, 8x63x64) + triton_bmm_48161 0.0073 ms 100.0% + triton_bmm_48164 0.0073 ms 100.0% + triton_bmm_48162 0.0078 ms 93.0% + triton_bmm_48165 0.0078 ms 93.0% + triton_bmm_48163 0.0080 ms 91.3% + triton_bmm_48160 0.0083 ms 87.6% + triton_bmm_48166 0.0085 ms 85.3% + triton_bmm_48168 0.0092 ms 78.5% + triton_bmm_48167 0.0095 ms 76.7% + bmm 0.0566 ms 12.8% +SingleProcess AUTOTUNE takes 2.9754 seconds +AUTOTUNE bmm(8x1x64, 8x64x64) + triton_bmm_48914 0.0065 ms 100.0% + triton_bmm_48917 0.0069 ms 93.5% + triton_bmm_48919 0.0070 ms 93.1% + triton_bmm_48915 0.0070 ms 92.3% + triton_bmm_48913 0.0072 ms 90.6% + triton_bmm_48916 0.0073 ms 89.4% + triton_bmm_48918 0.0075 ms 86.8% + triton_bmm_48920 0.0077 ms 84.6% + triton_bmm_48921 0.0080 ms 81.0% + bmm 0.0548 ms 11.9% +SingleProcess AUTOTUNE takes 2.7239 seconds +AUTOTUNE bmm(8x1x64, 8x64x64) + triton_bmm_48937 0.0065 ms 100.0% + triton_bmm_48938 0.0070 ms 93.1% + triton_bmm_48933 0.0070 ms 92.7% + triton_bmm_48934 0.0070 ms 92.3% + triton_bmm_48935 0.0070 ms 92.3% + triton_bmm_48936 0.0071 ms 91.2% + triton_bmm_48939 0.0075 ms 86.4% + triton_bmm_48940 0.0080 ms 81.2% + triton_bmm_48941 0.0080 ms 81.2% + bmm 0.0551 ms 11.8% +SingleProcess AUTOTUNE takes 2.7779 seconds +[2023-12-13 01:55:10,759] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (1000) +[2023-12-13 01:55:10,759] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py:1645) +[2023-12-13 01:55:10,759] torch._dynamo.convert_frame: [WARNING] last reason: ___check_obj_id(L['past_key_values'], 7628576) # mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length # miniconda3/envs/pytorch/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py:1026 in forward +[2023-12-13 01:55:10,759] torch._dynamo.convert_frame: [WARNING] To log all recompilation reasons, use TORCH_LOGS="recompiles". +[2023-12-13 01:55:10,759] torch._dynamo.convert_frame: [WARNING] To diagnose recompilation issues, see https://pytorch.org/docs/master/compile/troubleshooting.html. +AUTOTUNE bmm(8x1x64, 8x64x65) + triton_bmm_49692 0.0065 ms 100.0% + triton_bmm_49689 0.0070 ms 93.2% + triton_bmm_49694 0.0070 ms 93.2% + triton_bmm_49688 0.0072 ms 90.7% + triton_bmm_49690 0.0073 ms 89.9% + triton_bmm_49691 0.0073 ms 89.5% + triton_bmm_49687 0.0074 ms 88.1% + triton_bmm_49695 0.0075 ms 86.8% + triton_bmm_49697 0.0075 ms 86.8% + triton_bmm_49696 0.0077 ms 84.3% +SingleProcess AUTOTUNE takes 3.6674 seconds +AUTOTUNE bmm(8x1x65, 8x65x64) + triton_bmm_49713 0.0075 ms 100.0% + triton_bmm_49714 0.0075 ms 100.0% + triton_bmm_49711 0.0079 ms 94.4% + triton_bmm_49715 0.0080 ms 93.6% + triton_bmm_49712 0.0080 ms 93.2% + triton_bmm_49710 0.0083 ms 90.3% + triton_bmm_49709 0.0088 ms 85.4% + triton_bmm_49717 0.0092 ms 81.0% + triton_bmm_49716 0.0103 ms 72.9% + bmm 0.0560 ms 13.4% +SingleProcess AUTOTUNE takes 2.8901 seconds +AUTOTUNE mixed_mm(1x512, 512x32128) + triton_mm_50448 0.0341 ms 100.0% + triton_mm_50450 0.0349 ms 97.7% + triton_mm_50447 0.0375 ms 90.9% + triton_mm_50454 0.0402 ms 84.9% + triton_mm_50451 0.0407 ms 83.7% + triton_mm_50445 0.0525 ms 64.9% + triton_mm_50452 0.0862 ms 39.6% + triton_mm_50446 0.1006 ms 33.9% + triton_mm_50444 0.1037 ms 32.9% + triton_mm_50453 0.1394 ms 24.5% +SingleProcess AUTOTUNE takes 4.4100 seconds +AUTOTUNE bmm(8x1x64, 8x64x66) + triton_bmm_50479 0.0065 ms 100.0% + triton_bmm_50478 0.0068 ms 96.2% + triton_bmm_50483 0.0072 ms 90.7% + triton_bmm_50477 0.0072 ms 90.3% + triton_bmm_50481 0.0073 ms 89.9% + triton_bmm_50482 0.0073 ms 89.9% + triton_bmm_50484 0.0075 ms 87.2% + triton_bmm_50480 0.0075 ms 86.8% + triton_bmm_50485 0.0076 ms 86.4% + triton_bmm_50486 0.0076 ms 86.1% +SingleProcess AUTOTUNE takes 4.0134 seconds +AUTOTUNE bmm(8x1x66, 8x66x64) + triton_bmm_50504 0.0073 ms 100.0% + triton_bmm_50502 0.0073 ms 99.6% + triton_bmm_50503 0.0073 ms 99.6% + triton_bmm_50505 0.0073 ms 99.6% + triton_bmm_50501 0.0075 ms 96.6% + triton_bmm_50500 0.0081 ms 90.1% + triton_bmm_50506 0.0085 ms 85.3% + triton_bmm_50508 0.0088 ms 82.8% + triton_bmm_50507 0.0090 ms 80.5% + bmm 0.0576 ms 12.6% +SingleProcess AUTOTUNE takes 2.8687 seconds +AUTOTUNE bmm(8x1x64, 8x64x67) + triton_bmm_51263 0.0065 ms 100.0% + triton_bmm_51262 0.0067 ms 98.1% + triton_bmm_51260 0.0070 ms 93.2% + triton_bmm_51266 0.0070 ms 93.2% + triton_bmm_51259 0.0072 ms 91.1% + triton_bmm_51261 0.0072 ms 91.1% + triton_bmm_51258 0.0073 ms 89.1% + triton_bmm_51268 0.0075 ms 86.8% + triton_bmm_51265 0.0076 ms 86.4% + triton_bmm_51257 0.0077 ms 84.3% +SingleProcess AUTOTUNE takes 3.7416 seconds +AUTOTUNE bmm(8x1x67, 8x67x64) + triton_bmm_51283 0.0075 ms 100.0% + triton_bmm_51284 0.0075 ms 100.0% + triton_bmm_51285 0.0075 ms 100.0% + triton_bmm_51281 0.0077 ms 96.7% + triton_bmm_51282 0.0080 ms 94.0% + triton_bmm_51286 0.0080 ms 93.6% + triton_bmm_51280 0.0083 ms 90.7% + triton_bmm_51288 0.0093 ms 80.7% + triton_bmm_51287 0.0102 ms 73.1% + bmm 0.0563 ms 13.3% +SingleProcess AUTOTUNE takes 2.8644 seconds +AUTOTUNE bmm(8x1x64, 8x64x68) + triton_bmm_52039 0.0065 ms 100.0% + triton_bmm_52041 0.0065 ms 100.0% + triton_bmm_52042 0.0066 ms 99.0% + triton_bmm_52040 0.0068 ms 96.2% + triton_bmm_52046 0.0070 ms 92.7% + triton_bmm_52037 0.0072 ms 90.2% + triton_bmm_52043 0.0072 ms 90.2% + triton_bmm_52038 0.0073 ms 89.2% + triton_bmm_52044 0.0075 ms 86.8% + triton_bmm_52045 0.0075 ms 86.4% +SingleProcess AUTOTUNE takes 3.5563 seconds +AUTOTUNE bmm(8x1x68, 8x68x64) + triton_bmm_52065 0.0068 ms 100.0% + triton_bmm_52062 0.0070 ms 95.9% + triton_bmm_52064 0.0071 ms 95.5% + triton_bmm_52063 0.0073 ms 92.5% + triton_bmm_52061 0.0075 ms 89.8% + triton_bmm_52060 0.0076 ms 88.3% + triton_bmm_52066 0.0085 ms 79.6% + triton_bmm_52067 0.0085 ms 79.6% + triton_bmm_52068 0.0087 ms 77.3% + bmm 0.0563 ms 12.0% +SingleProcess AUTOTUNE takes 3.1920 seconds +AUTOTUNE bmm(8x1x64, 8x64x69) + triton_bmm_52823 0.0065 ms 100.0% + triton_bmm_52822 0.0071 ms 92.3% + triton_bmm_52821 0.0072 ms 91.1% + triton_bmm_52817 0.0072 ms 90.3% + triton_bmm_52819 0.0073 ms 89.9% + triton_bmm_52818 0.0075 ms 87.6% + triton_bmm_52824 0.0075 ms 87.2% + triton_bmm_52820 0.0075 ms 86.8% + triton_bmm_52825 0.0076 ms 86.4% + triton_bmm_52826 0.0076 ms 86.4% +SingleProcess AUTOTUNE takes 3.7835 seconds +AUTOTUNE bmm(8x1x69, 8x69x64) + triton_bmm_52843 0.0075 ms 100.0% + triton_bmm_52844 0.0077 ms 97.1% + triton_bmm_52841 0.0080 ms 94.0% + triton_bmm_52845 0.0081 ms 92.9% + triton_bmm_52842 0.0083 ms 90.7% + triton_bmm_52840 0.0085 ms 88.7% + triton_bmm_52846 0.0085 ms 88.3% + triton_bmm_52848 0.0087 ms 86.1% + triton_bmm_52847 0.0102 ms 73.4% + bmm 0.0586 ms 12.8% +SingleProcess AUTOTUNE takes 2.8348 seconds +AUTOTUNE bmm(8x1x64, 8x64x70) + triton_bmm_53599 0.0065 ms 100.0% + triton_bmm_53601 0.0065 ms 99.5% + triton_bmm_53603 0.0065 ms 99.5% + triton_bmm_53602 0.0067 ms 97.6% + triton_bmm_53598 0.0068 ms 95.8% + triton_bmm_53600 0.0070 ms 93.1% + triton_bmm_53604 0.0075 ms 86.8% + triton_bmm_53606 0.0075 ms 86.4% + triton_bmm_53608 0.0075 ms 86.4% + triton_bmm_53597 0.0078 ms 83.5% +SingleProcess AUTOTUNE takes 3.7421 seconds +AUTOTUNE bmm(8x1x70, 8x70x64) + triton_bmm_53624 0.0065 ms 100.0% + triton_bmm_53623 0.0068 ms 96.7% + triton_bmm_53622 0.0073 ms 89.7% + triton_bmm_53625 0.0073 ms 89.5% + triton_bmm_53621 0.0075 ms 86.8% + triton_bmm_53626 0.0080 ms 81.9% + triton_bmm_53620 0.0083 ms 78.9% + triton_bmm_53627 0.0085 ms 77.0% + triton_bmm_53628 0.0093 ms 70.3% + bmm 0.0563 ms 11.6% +SingleProcess AUTOTUNE takes 2.9418 seconds +AUTOTUNE bmm(8x1x64, 8x64x71) + triton_bmm_54382 0.0065 ms 100.0% + triton_bmm_54381 0.0065 ms 99.5% + triton_bmm_54386 0.0070 ms 92.7% + triton_bmm_54377 0.0072 ms 89.8% + triton_bmm_54383 0.0073 ms 89.4% + triton_bmm_54379 0.0073 ms 89.0% + triton_bmm_54384 0.0075 ms 86.8% + triton_bmm_54378 0.0075 ms 86.4% + triton_bmm_54380 0.0075 ms 86.4% + triton_bmm_54387 0.0077 ms 83.9% +SingleProcess AUTOTUNE takes 3.7997 seconds +AUTOTUNE bmm(8x1x71, 8x71x64) + triton_bmm_54405 0.0075 ms 100.0% + triton_bmm_54403 0.0077 ms 96.7% + triton_bmm_54401 0.0080 ms 94.0% + triton_bmm_54406 0.0080 ms 93.6% + triton_bmm_54402 0.0080 ms 93.2% + triton_bmm_54404 0.0080 ms 93.2% + triton_bmm_54408 0.0087 ms 85.7% + triton_bmm_54400 0.0090 ms 83.6% + triton_bmm_54407 0.0103 ms 72.9% + bmm 0.0556 ms 13.5% +SingleProcess AUTOTUNE takes 3.2618 seconds +AUTOTUNE bmm(8x1x64, 8x64x72) + triton_bmm_55162 0.0065 ms 100.0% + triton_bmm_55161 0.0065 ms 99.5% + triton_bmm_55158 0.0068 ms 96.2% + triton_bmm_55166 0.0070 ms 92.7% + triton_bmm_55163 0.0073 ms 89.4% + triton_bmm_55159 0.0073 ms 89.0% + triton_bmm_55160 0.0074 ms 87.5% + triton_bmm_55164 0.0075 ms 86.8% + triton_bmm_55168 0.0075 ms 86.8% + triton_bmm_55165 0.0075 ms 86.4% +SingleProcess AUTOTUNE takes 3.6055 seconds +AUTOTUNE bmm(8x1x72, 8x72x64) + triton_bmm_55185 0.0068 ms 100.0% + triton_bmm_55182 0.0070 ms 96.3% + triton_bmm_55184 0.0071 ms 94.6% + triton_bmm_55183 0.0072 ms 93.2% + triton_bmm_55181 0.0075 ms 90.6% + triton_bmm_55186 0.0075 ms 90.2% + triton_bmm_55180 0.0083 ms 81.8% + triton_bmm_55187 0.0085 ms 79.6% + triton_bmm_55188 0.0093 ms 72.8% + bmm 0.0569 ms 11.9% +SingleProcess AUTOTUNE takes 2.9594 seconds +AUTOTUNE bmm(8x1x64, 8x64x73) + triton_bmm_55939 0.0065 ms 100.0% + triton_bmm_55943 0.0066 ms 99.5% + triton_bmm_55942 0.0067 ms 97.1% + triton_bmm_55938 0.0068 ms 96.7% + triton_bmm_55940 0.0070 ms 93.2% + triton_bmm_55946 0.0070 ms 93.2% + triton_bmm_55941 0.0071 ms 91.9% + triton_bmm_55945 0.0072 ms 90.3% + triton_bmm_55948 0.0075 ms 86.8% + triton_bmm_55937 0.0078 ms 84.0% +SingleProcess AUTOTUNE takes 3.9463 seconds +AUTOTUNE bmm(8x1x73, 8x73x64) + triton_bmm_55965 0.0077 ms 100.0% + triton_bmm_55963 0.0077 ms 99.6% + triton_bmm_55961 0.0080 ms 96.4% + triton_bmm_55966 0.0080 ms 96.4% + triton_bmm_55964 0.0082 ms 94.5% + triton_bmm_55962 0.0082 ms 93.6% + triton_bmm_55960 0.0087 ms 88.9% + triton_bmm_55968 0.0096 ms 80.6% + triton_bmm_55967 0.0105 ms 73.5% + bmm 0.0615 ms 12.5% +SingleProcess AUTOTUNE takes 2.9664 seconds +AUTOTUNE bmm(8x1x64, 8x64x74) + triton_bmm_56723 0.0065 ms 100.0% + triton_bmm_56722 0.0067 ms 97.1% + triton_bmm_56718 0.0069 ms 94.0% + triton_bmm_56719 0.0071 ms 91.5% + triton_bmm_56721 0.0072 ms 90.3% + triton_bmm_56725 0.0072 ms 90.3% + triton_bmm_56720 0.0075 ms 86.8% + triton_bmm_56726 0.0075 ms 86.8% + triton_bmm_56728 0.0075 ms 86.8% + triton_bmm_56717 0.0078 ms 84.0% +SingleProcess AUTOTUNE takes 3.9023 seconds +AUTOTUNE bmm(8x1x74, 8x74x64) + triton_bmm_56744 0.0067 ms 100.0% + triton_bmm_56743 0.0068 ms 99.1% + triton_bmm_56745 0.0068 ms 99.1% + triton_bmm_56742 0.0072 ms 92.9% + triton_bmm_56741 0.0076 ms 88.6% + triton_bmm_56740 0.0077 ms 86.4% + triton_bmm_56746 0.0080 ms 83.6% + triton_bmm_56747 0.0086 ms 77.4% + triton_bmm_56748 0.0090 ms 74.6% + bmm 0.0555 ms 12.0% +SingleProcess AUTOTUNE takes 3.0973 seconds +AUTOTUNE bmm(8x1x64, 8x64x75) + triton_bmm_57501 0.0066 ms 100.0% + triton_bmm_57503 0.0066 ms 100.0% + triton_bmm_57500 0.0070 ms 94.1% + triton_bmm_57506 0.0070 ms 94.1% + triton_bmm_57502 0.0072 ms 91.4% + triton_bmm_57499 0.0073 ms 90.4% + triton_bmm_57498 0.0074 ms 89.2% + triton_bmm_57504 0.0075 ms 87.7% + triton_bmm_57508 0.0075 ms 87.7% + triton_bmm_57505 0.0075 ms 87.5% +SingleProcess AUTOTUNE takes 3.7366 seconds +AUTOTUNE bmm(8x1x75, 8x75x64) + triton_bmm_57523 0.0075 ms 100.0% + triton_bmm_57522 0.0077 ms 97.1% + triton_bmm_57524 0.0083 ms 90.7% + triton_bmm_57525 0.0083 ms 90.7% + triton_bmm_57526 0.0085 ms 88.2% + triton_bmm_57521 0.0086 ms 87.9% + triton_bmm_57520 0.0091 ms 83.0% + triton_bmm_57528 0.0093 ms 81.0% + triton_bmm_57527 0.0100 ms 75.3% + bmm 0.0555 ms 13.6% +SingleProcess AUTOTUNE takes 2.8559 seconds +AUTOTUNE bmm(8x1x64, 8x64x76) + triton_bmm_58283 0.0065 ms 100.0% + triton_bmm_58281 0.0065 ms 99.5% + triton_bmm_58285 0.0070 ms 92.7% + triton_bmm_58286 0.0070 ms 92.7% + triton_bmm_58278 0.0071 ms 91.0% + triton_bmm_58279 0.0072 ms 89.8% + triton_bmm_58282 0.0073 ms 89.4% + triton_bmm_58280 0.0074 ms 87.9% + triton_bmm_58284 0.0075 ms 86.8% + triton_bmm_58288 0.0075 ms 86.8% +SingleProcess AUTOTUNE takes 4.2566 seconds +AUTOTUNE bmm(8x1x76, 8x76x64) + triton_bmm_58302 0.0065 ms 100.0% + triton_bmm_58301 0.0070 ms 93.2% + triton_bmm_58303 0.0072 ms 90.7% + triton_bmm_58304 0.0073 ms 89.7% + triton_bmm_58305 0.0073 ms 89.5% + triton_bmm_58300 0.0077 ms 84.3% + triton_bmm_58306 0.0085 ms 76.7% + triton_bmm_58307 0.0087 ms 75.3% + triton_bmm_58308 0.0095 ms 68.7% + bmm 0.0590 ms 11.1% +SingleProcess AUTOTUNE takes 2.8716 seconds +AUTOTUNE bmm(8x1x64, 8x64x77) + triton_bmm_59061 0.0067 ms 100.0% + triton_bmm_59063 0.0067 ms 99.5% + triton_bmm_59058 0.0068 ms 99.1% + triton_bmm_59060 0.0070 ms 95.4% + triton_bmm_59066 0.0070 ms 95.4% + triton_bmm_59057 0.0072 ms 92.5% + triton_bmm_59062 0.0072 ms 92.5% + triton_bmm_59059 0.0073 ms 92.1% + triton_bmm_59065 0.0076 ms 88.6% + triton_bmm_59068 0.0076 ms 88.2% +SingleProcess AUTOTUNE takes 3.7591 seconds +AUTOTUNE bmm(8x1x77, 8x77x64) + triton_bmm_59082 0.0077 ms 100.0% + triton_bmm_59084 0.0077 ms 100.0% + triton_bmm_59085 0.0077 ms 100.0% + triton_bmm_59083 0.0080 ms 97.2% + triton_bmm_59086 0.0085 ms 90.6% + triton_bmm_59081 0.0086 ms 90.0% + triton_bmm_59080 0.0093 ms 83.4% + triton_bmm_59088 0.0094 ms 82.5% + triton_bmm_59087 0.0102 ms 75.9% + bmm 0.0564 ms 13.7% +SingleProcess AUTOTUNE takes 3.0166 seconds +AUTOTUNE bmm(8x1x64, 8x64x78) + triton_bmm_59841 0.0068 ms 100.0% + triton_bmm_59843 0.0068 ms 100.0% + triton_bmm_59838 0.0070 ms 96.3% + triton_bmm_59840 0.0070 ms 96.3% + triton_bmm_59842 0.0072 ms 93.4% + triton_bmm_59839 0.0072 ms 93.2% + triton_bmm_59837 0.0073 ms 93.0% + triton_bmm_59844 0.0075 ms 90.2% + triton_bmm_59846 0.0075 ms 89.8% + triton_bmm_59847 0.0077 ms 87.2% +SingleProcess AUTOTUNE takes 3.7350 seconds +AUTOTUNE bmm(8x1x78, 8x78x64) + triton_bmm_59862 0.0068 ms 100.0% + triton_bmm_59864 0.0068 ms 100.0% + triton_bmm_59865 0.0068 ms 99.1% + triton_bmm_59861 0.0070 ms 96.3% + triton_bmm_59863 0.0073 ms 93.0% + triton_bmm_59860 0.0077 ms 87.2% + triton_bmm_59866 0.0081 ms 83.1% + triton_bmm_59868 0.0090 ms 75.1% + triton_bmm_59867 0.0092 ms 73.3% + bmm 0.0889 ms 7.6% +SingleProcess AUTOTUNE takes 3.0923 seconds +AUTOTUNE bmm(8x1x64, 8x64x79) + triton_bmm_60619 0.0068 ms 100.0% + triton_bmm_60621 0.0068 ms 100.0% + triton_bmm_60620 0.0070 ms 96.3% + triton_bmm_60625 0.0070 ms 96.3% + triton_bmm_60626 0.0070 ms 96.3% + triton_bmm_60623 0.0073 ms 92.5% + triton_bmm_60618 0.0075 ms 89.8% + triton_bmm_60622 0.0075 ms 89.8% + triton_bmm_60624 0.0077 ms 87.6% + triton_bmm_60617 0.0078 ms 86.8% +SingleProcess AUTOTUNE takes 4.1098 seconds +AUTOTUNE bmm(8x1x79, 8x79x64) + triton_bmm_60643 0.0080 ms 100.0% + triton_bmm_60644 0.0080 ms 100.0% + triton_bmm_60641 0.0082 ms 96.9% + triton_bmm_60645 0.0083 ms 96.1% + triton_bmm_60642 0.0085 ms 93.6% + triton_bmm_60646 0.0085 ms 93.3% + triton_bmm_60640 0.0090 ms 88.9% + triton_bmm_60648 0.0105 ms 76.1% + triton_bmm_60647 0.0110 ms 72.4% + bmm 0.0576 ms 13.8% +SingleProcess AUTOTUNE takes 3.1665 seconds +AUTOTUNE bmm(8x1x64, 8x64x80) + triton_bmm_61398 0.0068 ms 100.0% + triton_bmm_61399 0.0068 ms 100.0% + triton_bmm_61401 0.0068 ms 100.0% + triton_bmm_61400 0.0070 ms 96.8% + triton_bmm_61406 0.0070 ms 96.3% + triton_bmm_61397 0.0072 ms 93.4% + triton_bmm_61402 0.0073 ms 92.5% + triton_bmm_61403 0.0073 ms 92.5% + triton_bmm_61404 0.0075 ms 90.2% + triton_bmm_61405 0.0076 ms 89.4% +SingleProcess AUTOTUNE takes 3.7115 seconds +AUTOTUNE bmm(8x1x80, 8x80x64) + triton_bmm_61423 0.0068 ms 100.0% + triton_bmm_61421 0.0070 ms 96.3% + triton_bmm_61422 0.0073 ms 93.0% + triton_bmm_61425 0.0073 ms 92.7% + triton_bmm_61424 0.0073 ms 92.5% + triton_bmm_61426 0.0075 ms 90.2% + triton_bmm_61420 0.0083 ms 81.2% + triton_bmm_61427 0.0087 ms 77.6% + triton_bmm_61428 0.0091 ms 74.6% + bmm 0.0571 ms 11.8% +SingleProcess AUTOTUNE takes 3.1161 seconds +AUTOTUNE bmm(8x1x64, 8x64x81) + triton_bmm_62179 0.0068 ms 100.0% + triton_bmm_62183 0.0068 ms 100.0% + triton_bmm_62178 0.0070 ms 96.8% + triton_bmm_62180 0.0070 ms 96.3% + triton_bmm_62181 0.0072 ms 93.6% + triton_bmm_62177 0.0073 ms 93.0% + triton_bmm_62182 0.0073 ms 92.5% + triton_bmm_62186 0.0075 ms 89.8% + triton_bmm_62185 0.0077 ms 88.1% + triton_bmm_62184 0.0077 ms 87.6% +SingleProcess AUTOTUNE takes 4.0217 seconds +AUTOTUNE bmm(8x1x81, 8x81x64) + triton_bmm_62202 0.0077 ms 100.0% + triton_bmm_62205 0.0081 ms 96.0% + triton_bmm_62206 0.0082 ms 94.2% + triton_bmm_62203 0.0083 ms 93.4% + triton_bmm_62204 0.0083 ms 93.4% + triton_bmm_62201 0.0085 ms 90.6% + triton_bmm_62200 0.0087 ms 88.6% + triton_bmm_62208 0.0098 ms 79.3% + triton_bmm_62207 0.0107 ms 72.2% + bmm 0.0585 ms 13.2% +SingleProcess AUTOTUNE takes 3.1220 seconds +AUTOTUNE bmm(8x1x64, 8x64x82) + triton_bmm_62958 0.0070 ms 100.0% + triton_bmm_62965 0.0070 ms 100.0% + triton_bmm_62966 0.0070 ms 100.0% + triton_bmm_62961 0.0070 ms 99.5% + triton_bmm_62963 0.0072 ms 97.3% + triton_bmm_62957 0.0072 ms 96.9% + triton_bmm_62959 0.0073 ms 96.1% + triton_bmm_62964 0.0075 ms 93.6% + triton_bmm_62960 0.0075 ms 93.2% + triton_bmm_62962 0.0075 ms 93.2% +SingleProcess AUTOTUNE takes 3.7320 seconds +AUTOTUNE bmm(8x1x82, 8x82x64) + triton_bmm_62983 0.0072 ms 100.0% + triton_bmm_62982 0.0073 ms 99.1% + triton_bmm_62984 0.0073 ms 98.7% + triton_bmm_62985 0.0074 ms 97.4% + triton_bmm_62981 0.0076 ms 94.9% + triton_bmm_62980 0.0077 ms 93.0% + triton_bmm_62986 0.0088 ms 82.1% + triton_bmm_62987 0.0092 ms 78.1% + triton_bmm_62988 0.0100 ms 72.3% + bmm 0.0643 ms 11.2% +SingleProcess AUTOTUNE takes 2.9102 seconds +AUTOTUNE bmm(8x1x64, 8x64x83) + triton_bmm_63741 0.0068 ms 100.0% + triton_bmm_63743 0.0068 ms 100.0% + triton_bmm_63738 0.0070 ms 96.3% + triton_bmm_63740 0.0070 ms 96.3% + triton_bmm_63745 0.0070 ms 96.3% + triton_bmm_63739 0.0073 ms 92.5% + triton_bmm_63742 0.0073 ms 92.5% + triton_bmm_63746 0.0076 ms 89.4% + triton_bmm_63744 0.0077 ms 87.2% + triton_bmm_63747 0.0077 ms 87.2% +SingleProcess AUTOTUNE takes 3.7566 seconds +AUTOTUNE bmm(8x1x83, 8x83x64) + triton_bmm_63762 0.0080 ms 100.0% + triton_bmm_63764 0.0080 ms 100.0% + triton_bmm_63765 0.0083 ms 96.7% + triton_bmm_63763 0.0083 ms 96.2% + triton_bmm_63761 0.0088 ms 91.2% + triton_bmm_63766 0.0088 ms 91.2% + triton_bmm_63760 0.0090 ms 89.0% + triton_bmm_63768 0.0098 ms 82.0% + triton_bmm_63767 0.0113 ms 70.8% + bmm 0.0654 ms 12.2% +SingleProcess AUTOTUNE takes 3.1141 seconds +AUTOTUNE bmm(8x1x64, 8x64x84) + triton_bmm_64522 0.0067 ms 100.0% + triton_bmm_64521 0.0067 ms 99.3% + triton_bmm_64518 0.0068 ms 98.8% + triton_bmm_64520 0.0070 ms 95.6% + triton_bmm_64526 0.0070 ms 95.2% + triton_bmm_64519 0.0072 ms 93.1% + triton_bmm_64517 0.0072 ms 92.3% + triton_bmm_64523 0.0073 ms 91.9% + triton_bmm_64524 0.0075 ms 89.1% + triton_bmm_64525 0.0076 ms 88.0% +SingleProcess AUTOTUNE takes 3.7557 seconds +AUTOTUNE bmm(8x1x84, 8x84x64) + triton_bmm_64541 0.0070 ms 100.0% + triton_bmm_64544 0.0073 ms 96.5% + triton_bmm_64542 0.0073 ms 96.1% + triton_bmm_64543 0.0073 ms 96.1% + triton_bmm_64545 0.0074 ms 94.4% + triton_bmm_64546 0.0082 ms 85.2% + triton_bmm_64540 0.0083 ms 84.6% + triton_bmm_64547 0.0092 ms 76.3% + triton_bmm_64548 0.0100 ms 70.4% + bmm 0.0637 ms 11.0% +SingleProcess AUTOTUNE takes 3.3426 seconds +AUTOTUNE bmm(8x1x64, 8x64x85) + triton_bmm_65301 0.0068 ms 100.0% + triton_bmm_65303 0.0068 ms 100.0% + triton_bmm_65298 0.0070 ms 96.3% + triton_bmm_65306 0.0070 ms 96.3% + triton_bmm_65299 0.0073 ms 92.5% + triton_bmm_65302 0.0075 ms 89.6% + triton_bmm_65305 0.0076 ms 89.4% + triton_bmm_65304 0.0077 ms 87.6% + triton_bmm_65307 0.0077 ms 87.2% + triton_bmm_65300 0.0078 ms 86.8% +SingleProcess AUTOTUNE takes 3.6445 seconds +AUTOTUNE bmm(8x1x85, 8x85x64) + triton_bmm_65321 0.0082 ms 100.0% + triton_bmm_65325 0.0083 ms 99.2% + triton_bmm_65322 0.0085 ms 96.6% + triton_bmm_65323 0.0085 ms 96.6% + triton_bmm_65324 0.0085 ms 96.6% + triton_bmm_65326 0.0088 ms 93.8% + triton_bmm_65320 0.0090 ms 91.8% + triton_bmm_65328 0.0092 ms 89.2% + triton_bmm_65327 0.0109 ms 75.1% + bmm 0.0633 ms 13.0% +SingleProcess AUTOTUNE takes 2.9543 seconds +AUTOTUNE bmm(8x1x64, 8x64x86) + triton_bmm_66081 0.0068 ms 100.0% + triton_bmm_66080 0.0070 ms 96.3% + triton_bmm_66085 0.0070 ms 96.3% + triton_bmm_66086 0.0070 ms 96.3% + triton_bmm_66083 0.0072 ms 93.4% + triton_bmm_66079 0.0073 ms 92.5% + triton_bmm_66082 0.0073 ms 92.5% + triton_bmm_66078 0.0075 ms 89.8% + triton_bmm_66088 0.0076 ms 88.7% + triton_bmm_66077 0.0078 ms 86.8% +SingleProcess AUTOTUNE takes 3.7751 seconds +AUTOTUNE bmm(8x1x86, 8x86x64) + triton_bmm_66103 0.0068 ms 100.0% + triton_bmm_66102 0.0072 ms 93.2% + triton_bmm_66104 0.0073 ms 92.5% + triton_bmm_66105 0.0075 ms 89.8% + triton_bmm_66101 0.0077 ms 87.9% + triton_bmm_66106 0.0082 ms 82.1% + triton_bmm_66100 0.0083 ms 81.5% + triton_bmm_66107 0.0092 ms 73.3% + triton_bmm_66108 0.0095 ms 71.3% + bmm 0.0872 ms 7.7% +SingleProcess AUTOTUNE takes 2.9093 seconds +AUTOTUNE bmm(8x1x64, 8x64x87) + triton_bmm_66861 0.0068 ms 100.0% + triton_bmm_66863 0.0068 ms 100.0% + triton_bmm_66860 0.0070 ms 96.3% + triton_bmm_66857 0.0073 ms 93.0% + triton_bmm_66859 0.0073 ms 92.5% + triton_bmm_66862 0.0073 ms 92.5% + triton_bmm_66858 0.0075 ms 90.2% + triton_bmm_66865 0.0076 ms 89.4% + triton_bmm_66866 0.0076 ms 89.4% + triton_bmm_66867 0.0077 ms 87.2% +SingleProcess AUTOTUNE takes 4.0973 seconds +AUTOTUNE bmm(8x1x87, 8x87x64) + triton_bmm_66885 0.0077 ms 100.0% + triton_bmm_66882 0.0080 ms 97.2% + triton_bmm_66884 0.0080 ms 97.2% + triton_bmm_66881 0.0080 ms 96.4% + triton_bmm_66883 0.0085 ms 91.0% + triton_bmm_66880 0.0088 ms 88.3% + triton_bmm_66886 0.0088 ms 88.3% + triton_bmm_66888 0.0098 ms 79.3% + triton_bmm_66887 0.0115 ms 67.4% + bmm 0.0612 ms 12.7% +SingleProcess AUTOTUNE takes 2.9708 seconds +AUTOTUNE bmm(8x1x64, 8x64x88) + triton_bmm_67638 0.0068 ms 100.0% + triton_bmm_67642 0.0069 ms 97.2% + triton_bmm_67640 0.0070 ms 96.8% + triton_bmm_67645 0.0070 ms 96.3% + triton_bmm_67646 0.0070 ms 96.3% + triton_bmm_67639 0.0072 ms 93.8% + triton_bmm_67637 0.0072 ms 93.4% + triton_bmm_67641 0.0073 ms 92.5% + triton_bmm_67643 0.0073 ms 92.5% + triton_bmm_67644 0.0075 ms 90.2% +SingleProcess AUTOTUNE takes 3.6584 seconds +AUTOTUNE bmm(8x1x88, 8x88x64) + triton_bmm_67665 0.0068 ms 100.0% + triton_bmm_67661 0.0070 ms 96.3% + triton_bmm_67662 0.0073 ms 92.5% + triton_bmm_67664 0.0073 ms 92.5% + triton_bmm_67663 0.0075 ms 89.8% + triton_bmm_67666 0.0081 ms 83.7% + triton_bmm_67660 0.0083 ms 81.8% + triton_bmm_67667 0.0092 ms 73.5% + triton_bmm_67668 0.0098 ms 68.7% + bmm 0.0567 ms 11.9% +SingleProcess AUTOTUNE takes 2.8883 seconds +AUTOTUNE bmm(8x1x64, 8x64x89) + triton_bmm_68419 0.0068 ms 100.0% + triton_bmm_68421 0.0068 ms 100.0% + triton_bmm_68423 0.0068 ms 100.0% + triton_bmm_68418 0.0070 ms 96.8% + triton_bmm_68420 0.0070 ms 96.3% + triton_bmm_68426 0.0072 ms 94.2% + triton_bmm_68417 0.0073 ms 93.0% + triton_bmm_68422 0.0073 ms 93.0% + triton_bmm_68424 0.0075 ms 89.8% + triton_bmm_68428 0.0077 ms 87.2% +SingleProcess AUTOTUNE takes 3.9829 seconds +AUTOTUNE bmm(8x1x89, 8x89x64) + triton_bmm_68442 0.0080 ms 100.0% + triton_bmm_68444 0.0080 ms 100.0% + triton_bmm_68445 0.0083 ms 96.5% + triton_bmm_68443 0.0085 ms 94.3% + triton_bmm_68441 0.0088 ms 91.2% + triton_bmm_68446 0.0088 ms 91.2% + triton_bmm_68448 0.0092 ms 86.5% + triton_bmm_68440 0.0093 ms 85.6% + triton_bmm_68447 0.0112 ms 71.1% + bmm 0.0578 ms 13.8% +SingleProcess AUTOTUNE takes 2.8389 seconds +AUTOTUNE bmm(8x1x64, 8x64x90) + triton_bmm_69203 0.0067 ms 100.0% + triton_bmm_69199 0.0068 ms 99.5% + triton_bmm_69202 0.0068 ms 99.5% + triton_bmm_69200 0.0070 ms 95.9% + triton_bmm_69201 0.0072 ms 92.9% + triton_bmm_69205 0.0073 ms 92.5% + triton_bmm_69204 0.0075 ms 89.7% + triton_bmm_69198 0.0075 ms 89.4% + triton_bmm_69206 0.0076 ms 89.0% + triton_bmm_69208 0.0077 ms 87.1% +SingleProcess AUTOTUNE takes 3.6421 seconds +AUTOTUNE bmm(8x1x90, 8x90x64) + triton_bmm_69223 0.0068 ms 100.0% + triton_bmm_69224 0.0068 ms 100.0% + triton_bmm_69225 0.0068 ms 100.0% + triton_bmm_69221 0.0070 ms 96.3% + triton_bmm_69222 0.0073 ms 92.5% + triton_bmm_69220 0.0083 ms 81.8% + triton_bmm_69226 0.0088 ms 77.0% + triton_bmm_69227 0.0092 ms 73.3% + triton_bmm_69228 0.0100 ms 67.4% + bmm 0.0633 ms 10.7% +SingleProcess AUTOTUNE takes 2.9547 seconds +AUTOTUNE bmm(8x1x64, 8x64x91) + triton_bmm_69983 0.0068 ms 100.0% + triton_bmm_69986 0.0071 ms 95.5% + triton_bmm_69981 0.0071 ms 94.6% + triton_bmm_69979 0.0072 ms 93.8% + triton_bmm_69977 0.0073 ms 93.0% + triton_bmm_69985 0.0073 ms 93.0% + triton_bmm_69982 0.0075 ms 89.8% + triton_bmm_69978 0.0076 ms 89.4% + triton_bmm_69980 0.0076 ms 88.7% + triton_bmm_69987 0.0077 ms 87.2% +SingleProcess AUTOTUNE takes 3.9454 seconds +AUTOTUNE bmm(8x1x91, 8x91x64) + triton_bmm_70005 0.0077 ms 100.0% + triton_bmm_70003 0.0080 ms 96.8% + triton_bmm_70004 0.0083 ms 93.1% + triton_bmm_70002 0.0085 ms 91.0% + triton_bmm_70001 0.0086 ms 90.3% + triton_bmm_70006 0.0088 ms 88.3% + triton_bmm_70000 0.0093 ms 83.2% + triton_bmm_70008 0.0098 ms 79.3% + triton_bmm_70007 0.0112 ms 68.8% + bmm 0.0705 ms 11.0% +SingleProcess AUTOTUNE takes 3.1561 seconds +AUTOTUNE bmm(8x1x64, 8x64x92) + triton_bmm_70763 0.0067 ms 100.0% + triton_bmm_70762 0.0068 ms 99.1% + triton_bmm_70766 0.0070 ms 95.4% + triton_bmm_70757 0.0072 ms 92.5% + triton_bmm_70765 0.0072 ms 92.5% + triton_bmm_70759 0.0073 ms 92.1% + triton_bmm_70761 0.0073 ms 92.1% + triton_bmm_70758 0.0073 ms 91.5% + triton_bmm_70764 0.0075 ms 89.3% + triton_bmm_70760 0.0075 ms 88.9% +SingleProcess AUTOTUNE takes 3.8149 seconds +AUTOTUNE bmm(8x1x92, 8x92x64) + triton_bmm_70785 0.0068 ms 100.0% + triton_bmm_70784 0.0072 ms 94.0% + triton_bmm_70783 0.0073 ms 93.0% + triton_bmm_70782 0.0073 ms 92.5% + triton_bmm_70781 0.0076 ms 89.4% + triton_bmm_70780 0.0077 ms 87.2% + triton_bmm_70786 0.0082 ms 82.1% + triton_bmm_70788 0.0094 ms 71.5% + triton_bmm_70787 0.0095 ms 70.8% + bmm 0.0614 ms 11.0% +SingleProcess AUTOTUNE takes 3.1373 seconds +AUTOTUNE bmm(8x1x64, 8x64x93) + triton_bmm_71541 0.0068 ms 100.0% + triton_bmm_71540 0.0070 ms 96.3% + triton_bmm_71545 0.0070 ms 96.3% + triton_bmm_71539 0.0072 ms 93.6% + triton_bmm_71542 0.0073 ms 92.5% + triton_bmm_71543 0.0073 ms 92.5% + triton_bmm_71538 0.0075 ms 89.8% + triton_bmm_71546 0.0076 ms 89.4% + triton_bmm_71544 0.0077 ms 87.6% + triton_bmm_71548 0.0077 ms 87.2% +SingleProcess AUTOTUNE takes 4.0797 seconds +AUTOTUNE bmm(8x1x93, 8x93x64) + triton_bmm_71565 0.0077 ms 100.0% + triton_bmm_71562 0.0080 ms 97.2% + triton_bmm_71564 0.0080 ms 97.2% + triton_bmm_71561 0.0082 ms 94.2% + triton_bmm_71563 0.0085 ms 91.0% + triton_bmm_71566 0.0088 ms 88.3% + triton_bmm_71568 0.0092 ms 84.0% + triton_bmm_71560 0.0095 ms 81.5% + triton_bmm_71567 0.0113 ms 68.6% + bmm 0.0633 ms 12.2% +SingleProcess AUTOTUNE takes 3.1433 seconds +AUTOTUNE bmm(8x1x64, 8x64x94) + triton_bmm_72319 0.0067 ms 100.0% + triton_bmm_72321 0.0068 ms 99.5% + triton_bmm_72322 0.0068 ms 99.5% + triton_bmm_72326 0.0070 ms 95.9% + triton_bmm_72323 0.0072 ms 92.9% + triton_bmm_72320 0.0074 ms 91.1% + triton_bmm_72318 0.0075 ms 90.1% + triton_bmm_72328 0.0077 ms 87.1% + triton_bmm_72327 0.0077 ms 86.8% + triton_bmm_72317 0.0078 ms 86.4% +SingleProcess AUTOTUNE takes 3.6754 seconds +AUTOTUNE bmm(8x1x94, 8x94x64) + triton_bmm_72344 0.0068 ms 100.0% + triton_bmm_72345 0.0068 ms 99.5% + triton_bmm_72342 0.0073 ms 93.0% + triton_bmm_72343 0.0073 ms 93.0% + triton_bmm_72341 0.0077 ms 87.2% + triton_bmm_72340 0.0083 ms 81.5% + triton_bmm_72346 0.0088 ms 77.0% + triton_bmm_72348 0.0095 ms 71.3% + triton_bmm_72347 0.0095 ms 70.8% + bmm 0.0570 ms 11.8% +SingleProcess AUTOTUNE takes 3.4737 seconds +AUTOTUNE bmm(8x1x64, 8x64x95) + triton_bmm_73099 0.0068 ms 100.0% + triton_bmm_73102 0.0068 ms 100.0% + triton_bmm_73103 0.0068 ms 100.0% + triton_bmm_73100 0.0070 ms 96.3% + triton_bmm_73106 0.0071 ms 95.0% + triton_bmm_73101 0.0072 ms 93.8% + triton_bmm_73097 0.0073 ms 93.0% + triton_bmm_73098 0.0075 ms 90.0% + triton_bmm_73108 0.0077 ms 87.2% + triton_bmm_73105 0.0078 ms 86.8% +SingleProcess AUTOTUNE takes 3.9493 seconds +AUTOTUNE bmm(8x1x95, 8x95x64) + triton_bmm_73121 0.0082 ms 100.0% + triton_bmm_73123 0.0082 ms 100.0% + triton_bmm_73124 0.0082 ms 100.0% + triton_bmm_73126 0.0083 ms 99.6% + triton_bmm_73125 0.0083 ms 99.2% + triton_bmm_73122 0.0088 ms 93.8% + triton_bmm_73120 0.0092 ms 89.2% + triton_bmm_73128 0.0115 ms 71.6% + triton_bmm_73127 0.0117 ms 70.0% + bmm 0.0581 ms 14.2% +SingleProcess AUTOTUNE takes 2.9865 seconds +AUTOTUNE bmm(8x1x64, 8x64x96) + triton_bmm_73878 0.0068 ms 100.0% + triton_bmm_73882 0.0068 ms 100.0% + triton_bmm_73883 0.0068 ms 100.0% + triton_bmm_73880 0.0070 ms 96.3% + triton_bmm_73881 0.0073 ms 93.0% + triton_bmm_73879 0.0073 ms 92.5% + triton_bmm_73886 0.0076 ms 89.4% + triton_bmm_73885 0.0077 ms 88.1% + triton_bmm_73877 0.0078 ms 86.8% + triton_bmm_73884 0.0080 ms 84.4% +SingleProcess AUTOTUNE takes 3.7692 seconds +AUTOTUNE bmm(8x1x96, 8x96x64) + triton_bmm_73903 0.0068 ms 100.0% + triton_bmm_73904 0.0068 ms 100.0% + triton_bmm_73905 0.0068 ms 100.0% + triton_bmm_73901 0.0070 ms 96.3% + triton_bmm_73902 0.0073 ms 92.5% + triton_bmm_73900 0.0077 ms 87.2% + triton_bmm_73906 0.0080 ms 84.1% + triton_bmm_73907 0.0090 ms 75.1% + triton_bmm_73908 0.0090 ms 75.1% + bmm 0.0705 ms 9.6% +SingleProcess AUTOTUNE takes 2.9407 seconds +AUTOTUNE bmm(8x1x64, 8x64x97) + triton_bmm_74663 0.0068 ms 100.0% + triton_bmm_74658 0.0070 ms 96.3% + triton_bmm_74660 0.0070 ms 96.3% + triton_bmm_74657 0.0073 ms 93.0% + triton_bmm_74659 0.0073 ms 92.5% + triton_bmm_74661 0.0073 ms 92.5% + triton_bmm_74662 0.0074 ms 91.5% + triton_bmm_74668 0.0077 ms 87.6% + triton_bmm_74665 0.0078 ms 86.8% + triton_bmm_74666 0.0078 ms 86.8% +SingleProcess AUTOTUNE takes 4.0462 seconds +AUTOTUNE bmm(8x1x97, 8x97x64) + triton_bmm_74685 0.0077 ms 100.0% + triton_bmm_74682 0.0085 ms 91.3% + triton_bmm_74684 0.0087 ms 89.1% + triton_bmm_74683 0.0088 ms 88.3% + triton_bmm_74686 0.0090 ms 86.1% + triton_bmm_74681 0.0090 ms 85.8% + triton_bmm_74680 0.0095 ms 81.5% + triton_bmm_74688 0.0102 ms 75.6% + triton_bmm_74687 0.0117 ms 66.3% + bmm 0.0610 ms 12.7% +SingleProcess AUTOTUNE takes 3.1821 seconds +AUTOTUNE bmm(8x1x64, 8x64x98) + triton_bmm_75441 0.0068 ms 100.0% + triton_bmm_75442 0.0068 ms 99.5% + triton_bmm_75438 0.0070 ms 96.3% + triton_bmm_75445 0.0072 ms 94.4% + triton_bmm_75439 0.0072 ms 93.6% + triton_bmm_75443 0.0073 ms 93.0% + triton_bmm_75444 0.0075 ms 90.2% + triton_bmm_75440 0.0076 ms 89.4% + triton_bmm_75446 0.0076 ms 89.0% + triton_bmm_75447 0.0077 ms 87.2% +SingleProcess AUTOTUNE takes 3.8005 seconds +AUTOTUNE bmm(8x1x98, 8x98x64) + triton_bmm_75465 0.0069 ms 100.0% + triton_bmm_75463 0.0070 ms 99.1% + triton_bmm_75464 0.0070 ms 99.1% + triton_bmm_75462 0.0070 ms 98.6% + triton_bmm_75461 0.0078 ms 88.9% + triton_bmm_75466 0.0085 ms 81.5% + triton_bmm_75460 0.0088 ms 78.5% + triton_bmm_75467 0.0097 ms 71.1% + triton_bmm_75468 0.0102 ms 67.5% + bmm 0.0632 ms 10.9% +SingleProcess AUTOTUNE takes 2.9221 seconds +AUTOTUNE bmm(8x1x64, 8x64x99) + triton_bmm_76218 0.0070 ms 100.0% + triton_bmm_76226 0.0072 ms 96.5% + triton_bmm_76217 0.0073 ms 96.0% + triton_bmm_76219 0.0073 ms 95.6% + triton_bmm_76221 0.0073 ms 95.6% + triton_bmm_76223 0.0073 ms 95.6% + triton_bmm_76222 0.0073 ms 95.2% + triton_bmm_76220 0.0076 ms 91.4% + triton_bmm_76224 0.0077 ms 90.5% + triton_bmm_76225 0.0078 ms 89.3% +SingleProcess AUTOTUNE takes 3.6981 seconds +AUTOTUNE bmm(8x1x99, 8x99x64) + triton_bmm_76243 0.0082 ms 100.0% + triton_bmm_76244 0.0082 ms 99.2% + triton_bmm_76245 0.0083 ms 98.5% + triton_bmm_76241 0.0090 ms 90.4% + triton_bmm_76242 0.0090 ms 90.4% + triton_bmm_76246 0.0090 ms 90.4% + triton_bmm_76240 0.0097 ms 84.2% + triton_bmm_76248 0.0098 ms 83.6% + triton_bmm_76247 0.0120 ms 67.8% + bmm 0.0576 ms 14.2% +SingleProcess AUTOTUNE takes 3.1807 seconds +TIMEOUT + loading model: 0it [00:00, ?it/s] loading model: 0it [00:17, ?it/s] +hf_T5_large +cuda eval hf_T5_large int8weightonly-bs1-acc +pass_due_to_skip + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +hf_Whisper +cuda eval hf_Whisper int8weightonly-bs1-acc +AUTOTUNE mixed_mm(1500x256, 256x256) + triton_mm_8 0.0126 ms 100.0% + triton_mm_4 0.0128 ms 98.3% + triton_mm_5 0.0132 ms 95.2% + triton_mm_3 0.0135 ms 93.6% + triton_mm_6 0.0135 ms 93.5% + triton_mm_1 0.0137 ms 92.3% + triton_mm_9 0.0141 ms 89.5% + triton_mm_2 0.0142 ms 89.0% + triton_mm_0 0.0158 ms 79.8% + triton_mm_10 0.0165 ms 76.2% +SingleProcess AUTOTUNE takes 5.3183 seconds +AUTOTUNE mixed_mm(1500x256, 256x1536) + triton_mm_45 0.0231 ms 100.0% + triton_mm_46 0.0231 ms 99.9% + triton_mm_48 0.0243 ms 95.3% + triton_mm_47 0.0264 ms 87.5% + triton_mm_44 0.0277 ms 83.5% + triton_mm_51 0.0304 ms 75.9% + triton_mm_54 0.0369 ms 62.7% + triton_mm_49 0.0379 ms 61.0% + triton_mm_52 0.0379 ms 61.0% + triton_mm_50 0.0383 ms 60.3% +SingleProcess AUTOTUNE takes 5.2682 seconds +AUTOTUNE mixed_mm(1500x1536, 1536x256) + triton_mm_63 0.0359 ms 100.0% + triton_mm_59 0.0407 ms 88.2% + triton_mm_60 0.0422 ms 85.1% + triton_mm_61 0.0423 ms 84.9% + triton_mm_64 0.0429 ms 83.7% + triton_mm_58 0.0442 ms 81.3% + triton_mm_56 0.0461 ms 77.9% + triton_mm_57 0.0462 ms 77.8% + triton_mm_55 0.0612 ms 58.7% + fallback_mixed_mm 0.0658 ms 54.6% +SingleProcess AUTOTUNE takes 5.4920 seconds +AUTOTUNE mixed_mm(1x256, 256x2) + triton_mm_412 0.0094 ms 100.0% + triton_mm_411 0.0097 ms 97.0% + triton_mm_409 0.0111 ms 84.5% + triton_mm_410 0.0111 ms 84.5% + triton_mm_408 0.0113 ms 83.3% + triton_mm_413 0.0127 ms 74.2% + triton_mm_407 0.0135 ms 69.8% + fallback_mixed_mm 0.0658 ms 14.3% +SingleProcess AUTOTUNE takes 2.5011 seconds +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:05, ?it/s] +WARNING:root:hf_clip failed to load +hf_clip +Original Error: 'str' object has no attribute 'shape' +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/transformers/models/clip/modeling_clip.py", line 1142, in forward + vision_outputs = self.vision_model( + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/transformers/models/clip/modeling_clip.py", line 883, in forward + hidden_states = self.embeddings(pixel_values) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/transformers/models/clip/modeling_clip.py", line 194, in forward + batch_size = pixel_values.shape[0] +AttributeError: 'str' object has no attribute 'shape' + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s] loading model: 0it [00:01, ?it/s] +lennard_jones +cuda eval lennard_jones int8weightonly-bs1-acc +AUTOTUNE mixed_mm(1x1, 1x16) + triton_mm_2 0.0064 ms 100.0% + triton_mm_3 0.0064 ms 100.0% + triton_mm_4 0.0070 ms 91.3% + triton_mm_1 0.0071 ms 89.7% + triton_mm_0 0.0072 ms 88.9% + fallback_mixed_mm 0.0644 ms 9.9% +SingleProcess AUTOTUNE takes 1.5918 seconds +AUTOTUNE mixed_mm(1x16, 16x16) + triton_mm_5 0.0061 ms 100.0% + triton_mm_6 0.0067 ms 91.9% + triton_mm_8 0.0068 ms 89.7% + triton_mm_7 0.0069 ms 89.3% + triton_mm_9 0.0069 ms 88.5% + fallback_mixed_mm 0.0644 ms 9.5% +SingleProcess AUTOTUNE takes 1.8487 seconds +AUTOTUNE mixed_mm(1x16, 16x1) + triton_mm_20 0.0059 ms 100.0% + triton_mm_22 0.0059 ms 100.0% + triton_mm_24 0.0061 ms 95.8% + triton_mm_23 0.0065 ms 90.2% + triton_mm_21 0.0066 ms 89.8% + fallback_mixed_mm 0.0595 ms 9.9% +SingleProcess AUTOTUNE takes 1.7266 seconds +pass-sqnr-40.954 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:01, ?it/s] +llama +cuda eval llama int8weightonly-bs1-acc +AUTOTUNE mixed_mm(32x512, 512x512) + triton_mm_5 0.0115 ms 100.0% + triton_mm_9 0.0126 ms 91.3% + triton_mm_8 0.0129 ms 89.3% + triton_mm_4 0.0134 ms 85.7% + triton_mm_6 0.0137 ms 83.7% + triton_mm_2 0.0148 ms 77.7% + triton_mm_1 0.0160 ms 71.8% + triton_mm_3 0.0173 ms 66.2% + triton_mm_0 0.0225 ms 51.0% + triton_mm_10 0.0246 ms 46.7% +SingleProcess AUTOTUNE takes 4.1468 seconds +AUTOTUNE mixed_mm(32x512, 512x1536) + triton_mm_49 0.0115 ms 100.0% + triton_mm_52 0.0130 ms 88.2% + triton_mm_53 0.0133 ms 86.0% + triton_mm_50 0.0134 ms 85.2% + triton_mm_48 0.0137 ms 83.8% + triton_mm_46 0.0153 ms 74.7% + triton_mm_45 0.0164 ms 69.8% + triton_mm_47 0.0177 ms 64.9% + triton_mm_44 0.0234 ms 48.9% + triton_mm_54 0.0252 ms 45.4% +SingleProcess AUTOTUNE takes 4.1124 seconds +AUTOTUNE mixed_mm(32x1536, 1536x512) + triton_mm_71 0.0213 ms 100.0% + triton_mm_74 0.0241 ms 88.2% + triton_mm_75 0.0258 ms 82.4% + triton_mm_70 0.0275 ms 77.5% + triton_mm_72 0.0275 ms 77.4% + triton_mm_68 0.0315 ms 67.5% + triton_mm_67 0.0352 ms 60.4% + triton_mm_69 0.0389 ms 54.7% + triton_mm_66 0.0554 ms 38.4% + triton_mm_76 0.0608 ms 35.0% +SingleProcess AUTOTUNE takes 3.9708 seconds +AUTOTUNE mixed_mm(1x512, 512x32000) + triton_mm_620 0.0340 ms 100.0% + triton_mm_622 0.0344 ms 98.9% + triton_mm_619 0.0373 ms 91.2% + triton_mm_626 0.0391 ms 87.0% + triton_mm_623 0.0403 ms 84.4% + triton_mm_617 0.0517 ms 65.8% + triton_mm_624 0.0832 ms 40.9% + triton_mm_618 0.1001 ms 34.0% + triton_mm_616 0.1024 ms 33.2% + triton_mm_625 0.1392 ms 24.4% +SingleProcess AUTOTUNE takes 4.5855 seconds +pass-sqnr-41.408 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:54, ?it/s] +llama_v2_7b_16h +cuda eval llama_v2_7b_16h int8weightonly-bs1-acc +pass_due_to_skip + loading model: 0it [00:00, ?it/s] loading model: 0it [00:00, ?it/s] +maml_omniglot +cuda eval maml_omniglot int8weightonly-bs1-acc +pass-sqnr-49.712 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +mnasnet1_0 +cuda eval mnasnet1_0 int8weightonly-bs1-acc +AUTOTUNE mixed_mm(1x1280, 1280x1000) + triton_mm_417 0.0185 ms 100.0% + triton_mm_420 0.0216 ms 85.5% + triton_mm_418 0.0228 ms 81.0% + triton_mm_416 0.0233 ms 79.3% + triton_mm_421 0.0238 ms 77.6% + triton_mm_414 0.0297 ms 62.1% + triton_mm_413 0.0312 ms 59.2% + triton_mm_415 0.0364 ms 50.7% + triton_mm_412 0.0508 ms 36.4% + triton_mm_422 0.0520 ms 35.5% +SingleProcess AUTOTUNE takes 4.1438 seconds +pass-sqnr-29.382 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +mobilenet_v2 +cuda eval mobilenet_v2 int8weightonly-bs1-acc +AUTOTUNE mixed_mm(1x1280, 1280x1000) + triton_mm_417 0.0181 ms 100.0% + triton_mm_420 0.0208 ms 87.2% + triton_mm_421 0.0230 ms 78.6% + triton_mm_416 0.0232 ms 78.2% + triton_mm_418 0.0232 ms 78.2% + triton_mm_414 0.0293 ms 61.9% + triton_mm_413 0.0304 ms 59.5% + triton_mm_415 0.0364 ms 49.7% + triton_mm_412 0.0504 ms 35.9% + triton_mm_422 0.0522 ms 34.7% +SingleProcess AUTOTUNE takes 4.0498 seconds +pass-sqnr-28.901 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:00, ?it/s] +WARNING:root:mobilenet_v2_quantized_qat failed to load +mobilenet_v2_quantized_qat +The eval test only supports CPU. +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 443, in load_model + benchmark = benchmark_cls( + File "/home/cdhernandez/local/benchmark/torchbenchmark/util/model.py", line 24, in __call__ + obj = type.__call__(cls, *args, **kwargs) + File "/home/cdhernandez/local/benchmark/torchbenchmark/models/mobilenet_v2_quantized_qat/__init__.py", line 21, in __init__ + raise NotImplementedError("The eval test only supports CPU.") +NotImplementedError: The eval test only supports CPU. + + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +mobilenet_v3_large +cuda eval mobilenet_v3_large int8weightonly-bs1-acc +AUTOTUNE mixed_mm(1x960, 960x1280) + triton_mm_544 0.0178 ms 100.0% + triton_mm_547 0.0186 ms 95.7% + triton_mm_543 0.0190 ms 93.6% + triton_mm_545 0.0190 ms 93.6% + triton_mm_541 0.0239 ms 74.4% + triton_mm_548 0.0248 ms 71.7% + triton_mm_540 0.0252 ms 70.6% + triton_mm_542 0.0286 ms 62.1% + triton_mm_539 0.0398 ms 44.7% + triton_mm_549 0.0420 ms 42.3% +SingleProcess AUTOTUNE takes 4.1103 seconds +pass-sqnr-28.569 + loading model: 0it [00:00, ?it/s]NCCL version 2.19.3+cuda12.0 + loading model: 0it [00:04, ?it/s] +moco +cuda eval moco int8weightonly-bs1-acc +WARNING:common:fp64 golden ref were not generated for moco. Setting accuracy check to cosine +ERROR:common:add_(): argument 'other' (position 1) must be Tensor, not NoneType +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 2156, in check_accuracy + correct_result = self.run_n_iterations( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1972, in run_n_iterations + self.model_iter_fn(mod, inputs, collect_outputs=False) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/parallel/distributed.py", line 1523, in forward + else self._run_ddp_forward(*inputs, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/parallel/distributed.py", line 1359, in _run_ddp_forward + return self.module(*inputs, **kwargs) # type: ignore[index] + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/benchmark/torchbenchmark/models/moco/moco/builder.py", line 130, in forward + self._momentum_update_key_encoder() # update the key encoder + File "/home/cdhernandez/local/pytorch/torch/utils/_contextlib.py", line 115, in decorate_context + return func(*args, **kwargs) + File "/home/cdhernandez/local/benchmark/torchbenchmark/models/moco/moco/builder.py", line 50, in _momentum_update_key_encoder + param_k.mul_(self.m).add_(param_q.mul(1. - self.m)) +TypeError: add_(): argument 'other' (position 1) must be Tensor, not NoneType +eager_1st_run_fail + loading model: 0it [00:00, ?it/s] loading model: 0it [00:06, ?it/s] +nanogpt +number of parameters: 123.69M +num decayed parameter tensors: 50, with 124,354,560 parameters +num non-decayed parameter tensors: 98, with 121,344 parameters +using fused AdamW: True +cuda eval nanogpt int8weightonly-bs1-acc +AUTOTUNE mixed_mm(64x768, 768x2304) + triton_mm_6 0.0178 ms 100.0% + triton_mm_5 0.0180 ms 98.4% + triton_mm_9 0.0188 ms 94.4% + triton_mm_2 0.0213 ms 83.2% + triton_mm_8 0.0216 ms 82.1% + triton_mm_4 0.0230 ms 77.3% + triton_mm_3 0.0258 ms 68.9% + triton_mm_1 0.0271 ms 65.6% + triton_mm_0 0.0348 ms 51.1% + triton_mm_10 0.0354 ms 50.2% +SingleProcess AUTOTUNE takes 4.5823 seconds +AUTOTUNE mixed_mm(64x768, 768x768) + triton_mm_20 0.0164 ms 100.0% + triton_mm_17 0.0169 ms 97.2% + triton_mm_16 0.0171 ms 96.1% + triton_mm_13 0.0204 ms 80.3% + triton_mm_19 0.0214 ms 76.8% + triton_mm_15 0.0233 ms 70.6% + triton_mm_14 0.0255 ms 64.4% + triton_mm_12 0.0267 ms 61.4% + triton_mm_11 0.0337 ms 48.6% + triton_mm_21 0.0341 ms 48.1% +SingleProcess AUTOTUNE takes 4.8111 seconds +AUTOTUNE mixed_mm(64x768, 768x3072) + triton_mm_27 0.0179 ms 100.0% + triton_mm_28 0.0179 ms 99.6% + triton_mm_31 0.0198 ms 90.0% + triton_mm_24 0.0214 ms 83.4% + triton_mm_30 0.0222 ms 80.3% + triton_mm_26 0.0232 ms 77.0% + triton_mm_25 0.0258 ms 69.1% + triton_mm_23 0.0266 ms 67.2% + triton_mm_22 0.0353 ms 50.5% + triton_mm_32 0.0364 ms 49.1% +SingleProcess AUTOTUNE takes 4.4465 seconds +AUTOTUNE mixed_mm(64x3072, 3072x768) + triton_mm_42 0.0466 ms 100.0% + triton_mm_39 0.0484 ms 96.2% + triton_mm_38 0.0492 ms 94.5% + triton_mm_35 0.0631 ms 73.7% + triton_mm_41 0.0635 ms 73.4% + triton_mm_37 0.0713 ms 65.3% + fallback_mixed_mm 0.0738 ms 63.1% + triton_mm_36 0.0800 ms 58.2% + triton_mm_34 0.0852 ms 54.6% + triton_mm_33 0.1147 ms 40.6% +SingleProcess AUTOTUNE takes 4.8492 seconds +AUTOTUNE mixed_mm(1x768, 768x50304) + triton_mm_531 0.0655 ms 100.0% + triton_mm_532 0.0679 ms 96.5% + triton_mm_534 0.0693 ms 94.5% + triton_mm_538 0.0706 ms 92.8% + triton_mm_535 0.0844 ms 77.6% + triton_mm_529 0.0997 ms 65.7% + triton_mm_536 0.1283 ms 51.0% + triton_mm_530 0.1643 ms 39.9% + triton_mm_537 0.2135 ms 30.7% + triton_mm_528 0.2201 ms 29.8% +SingleProcess AUTOTUNE takes 3.9041 seconds +pass-sqnr-32.277 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +nvidia_deeprecommender +cuda eval nvidia_deeprecommender int8weightonly-bs1-acc +pass-sqnr-41.873 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:01, ?it/s] +opacus_cifar10 +cuda eval opacus_cifar10 int8weightonly-bs1-acc +AUTOTUNE mixed_mm(1x512, 512x10) + triton_mm_114 0.0122 ms 100.0% + triton_mm_113 0.0134 ms 90.7% + triton_mm_111 0.0168 ms 72.4% + triton_mm_112 0.0168 ms 72.4% + triton_mm_110 0.0169 ms 72.2% + triton_mm_109 0.0217 ms 56.3% + triton_mm_115 0.0268 ms 45.5% + fallback_mixed_mm 0.0990 ms 12.3% +SingleProcess AUTOTUNE takes 2.5548 seconds +pass-sqnr-37.928 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:26, ?it/s] +phi_1_5 +cuda eval phi_1_5 int8weightonly-bs1-acc +AUTOTUNE mixed_mm(512x2048, 2048x6144) + fallback_mixed_mm 0.1409 ms 100.0% + triton_mm_2 0.1638 ms 86.0% + triton_mm_1 0.1642 ms 85.8% + triton_mm_4 0.1667 ms 84.6% + triton_mm_0 0.1837 ms 76.7% + triton_mm_3 0.1845 ms 76.4% + triton_mm_7 0.1864 ms 75.6% + triton_mm_8 0.2810 ms 50.2% + triton_mm_5 0.2893 ms 48.7% + triton_mm_6 0.3056 ms 46.1% +SingleProcess AUTOTUNE takes 5.0424 seconds +AUTOTUNE mixed_mm(512x2048, 2048x2048) + fallback_mixed_mm 0.0714 ms 100.0% + triton_mm_36 0.0857 ms 83.4% + triton_mm_39 0.0866 ms 82.5% + triton_mm_37 0.0871 ms 82.0% + triton_mm_35 0.0920 ms 77.6% + triton_mm_38 0.0976 ms 73.2% + triton_mm_40 0.1070 ms 66.7% + triton_mm_41 0.1114 ms 64.1% + triton_mm_43 0.1127 ms 63.4% + triton_mm_44 0.1151 ms 62.1% +SingleProcess AUTOTUNE takes 4.7856 seconds +AUTOTUNE mixed_mm(512x2048, 2048x8192) + triton_mm_47 0.1948 ms 100.0% + triton_mm_48 0.1967 ms 99.0% + triton_mm_50 0.2065 ms 94.3% + fallback_mixed_mm 0.2179 ms 89.4% + triton_mm_46 0.2250 ms 86.5% + triton_mm_49 0.2286 ms 85.2% + triton_mm_53 0.3075 ms 63.3% + triton_mm_54 0.3499 ms 55.7% + triton_mm_51 0.3635 ms 53.6% + triton_mm_52 0.3859 ms 50.5% +SingleProcess AUTOTUNE takes 5.2291 seconds +AUTOTUNE mixed_mm(512x8192, 8192x2048) + fallback_mixed_mm 0.1959 ms 100.0% + triton_mm_58 0.3172 ms 61.8% + triton_mm_61 0.3211 ms 61.0% + triton_mm_59 0.3244 ms 60.4% + triton_mm_57 0.3447 ms 56.8% + triton_mm_60 0.3638 ms 53.8% + triton_mm_62 0.3991 ms 49.1% + triton_mm_63 0.4188 ms 46.8% + triton_mm_65 0.4195 ms 46.7% + triton_mm_66 0.4309 ms 45.5% +SingleProcess AUTOTUNE takes 4.8151 seconds +AUTOTUNE mixed_mm(512x2048, 2048x51200) + fallback_mixed_mm 1.0547 ms 100.0% + triton_mm_1633 1.0957 ms 96.3% + triton_mm_1634 1.0999 ms 95.9% + triton_mm_1636 1.1650 ms 90.5% + triton_mm_1632 1.2259 ms 86.0% + triton_mm_1639 1.2969 ms 81.3% + triton_mm_1635 1.2970 ms 81.3% + triton_mm_1640 2.0421 ms 51.6% + triton_mm_1637 2.1838 ms 48.3% + triton_mm_1642 2.3112 ms 45.6% +SingleProcess AUTOTUNE takes 5.1552 seconds +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +phlippe_densenet +cuda eval phlippe_densenet int8weightonly-bs1-acc +AUTOTUNE mixed_mm(1x184, 184x10) + triton_mm_406 0.0085 ms 100.0% + triton_mm_403 0.0098 ms 86.9% + triton_mm_405 0.0098 ms 86.9% + triton_mm_407 0.0100 ms 84.9% + triton_mm_404 0.0103 ms 82.3% + triton_mm_408 0.0108 ms 78.6% + triton_mm_402 0.0119 ms 71.4% + fallback_mixed_mm 0.0923 ms 9.2% +SingleProcess AUTOTUNE takes 2.7021 seconds +pass-sqnr-41.252 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:00, ?it/s] +phlippe_resnet +cuda eval phlippe_resnet int8weightonly-bs1-acc +AUTOTUNE mixed_mm(1x64, 64x10) + triton_mm_120 0.0070 ms 100.0% + triton_mm_121 0.0071 ms 98.6% + triton_mm_118 0.0075 ms 93.6% + triton_mm_119 0.0075 ms 93.6% + triton_mm_116 0.0079 ms 89.0% + triton_mm_117 0.0080 ms 87.3% + triton_mm_122 0.0083 ms 84.2% + fallback_mixed_mm 0.1003 ms 7.0% +SingleProcess AUTOTUNE takes 2.5204 seconds +pass-sqnr-45.684 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:01, ?it/s] +pyhpc_equation_of_state +cuda eval pyhpc_equation_of_state int8weightonly-bs1-acc +pass-sqnr-40.034 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:00, ?it/s] +pyhpc_isoneutral_mixing +cuda eval pyhpc_isoneutral_mixing int8weightonly-bs1-acc +skipping cudagraphs due to ['mutated inputs'] +pass-sqnr-error + loading model: 0it [00:00, ?it/s]WARNING:common:Model pyhpc_turbulent_kinetic_energy does not support bfloat16, running with amp instead + loading model: 0it [00:01, ?it/s] +WARNING:common:Model pyhpc_turbulent_kinetic_energy does not support bfloat16, running with amp instead +pyhpc_turbulent_kinetic_energy +cuda eval pyhpc_turbulent_kinetic_energy int8weightonly-bs1-acc +WARNING:common:Model pyhpc_turbulent_kinetic_energy does not support bfloat16, running with amp instead +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +pytorch_CycleGAN_and_pix2pix +cuda eval pytorch_CycleGAN_and_pix2pix int8weightonly-bs1-acc +pass-sqnr-33.538 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +pytorch_stargan +cuda eval pytorch_stargan int8weightonly-bs1-acc +pass-sqnr-41.851 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:01, ?it/s] +pytorch_unet +cuda eval pytorch_unet int8weightonly-bs1-acc +pass-sqnr-49.327 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +resnet152 +cuda eval resnet152 int8weightonly-bs1-acc +AUTOTUNE mixed_mm(1x2048, 2048x1000) + triton_mm_1594 0.0252 ms 100.0% + triton_mm_1597 0.0295 ms 85.2% + triton_mm_1598 0.0323 ms 78.0% + triton_mm_1595 0.0324 ms 77.5% + triton_mm_1593 0.0330 ms 76.3% + triton_mm_1591 0.0436 ms 57.7% + triton_mm_1590 0.0448 ms 56.1% + triton_mm_1592 0.0542 ms 46.4% + fallback_mixed_mm 0.0658 ms 38.2% + triton_mm_1589 0.0770 ms 32.7% +SingleProcess AUTOTUNE takes 3.8777 seconds +pass-sqnr-37.621 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:01, ?it/s] +resnet18 +cuda eval resnet18 int8weightonly-bs1-acc +pass-sqnr-34.056 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:05, ?it/s] +resnet50 +cuda eval resnet50 int8weightonly-bs1-acc +pass-sqnr-36.759 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:00, ?it/s] +WARNING:root:resnet50_quantized_qat failed to load +resnet50_quantized_qat +The eval test only supports CPU. +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 443, in load_model + benchmark = benchmark_cls( + File "/home/cdhernandez/local/benchmark/torchbenchmark/util/model.py", line 24, in __call__ + obj = type.__call__(cls, *args, **kwargs) + File "/home/cdhernandez/local/benchmark/torchbenchmark/models/resnet50_quantized_qat/__init__.py", line 21, in __init__ + raise NotImplementedError("The eval test only supports CPU.") +NotImplementedError: The eval test only supports CPU. + + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +resnext50_32x4d +cuda eval resnext50_32x4d int8weightonly-bs1-acc +pass-sqnr-35.597 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:10, ?it/s] +sam +cuda eval sam int8weightonly-bs1-acc +AUTOTUNE mixed_mm(4900x1280, 1280x3840) + fallback_mixed_mm 0.2220 ms 100.0% + triton_mm_8 0.5146 ms 43.2% + triton_mm_9 0.5200 ms 42.7% + triton_mm_7 0.5483 ms 40.5% + triton_mm_11 0.5537 ms 40.1% + triton_mm_14 0.5950 ms 37.3% + triton_mm_10 0.6075 ms 36.6% + triton_mm_15 0.9441 ms 23.5% + triton_mm_12 1.0184 ms 21.8% + triton_mm_17 1.0307 ms 21.5% +SingleProcess AUTOTUNE takes 5.0206 seconds +AUTOTUNE mixed_mm(4900x1280, 1280x1280) + fallback_mixed_mm 0.0871 ms 100.0% + triton_mm_67 0.1936 ms 45.0% + triton_mm_68 0.1964 ms 44.4% + triton_mm_66 0.2033 ms 42.9% + triton_mm_70 0.2067 ms 42.2% + triton_mm_73 0.2257 ms 38.6% + triton_mm_69 0.2272 ms 38.4% + triton_mm_74 0.3340 ms 26.1% + triton_mm_71 0.3519 ms 24.8% + triton_mm_76 0.3640 ms 23.9% +SingleProcess AUTOTUNE takes 5.0077 seconds +AUTOTUNE mixed_mm(4096x1280, 1280x5120) + fallback_mixed_mm 0.2489 ms 100.0% + triton_mm_78 0.5609 ms 44.4% + triton_mm_79 0.5657 ms 44.0% + triton_mm_81 0.6024 ms 41.3% + triton_mm_77 0.6051 ms 41.1% + triton_mm_84 0.6341 ms 39.2% + triton_mm_80 0.6637 ms 37.5% + triton_mm_85 1.0487 ms 23.7% + triton_mm_82 1.1258 ms 22.1% + triton_mm_87 1.1346 ms 21.9% +SingleProcess AUTOTUNE takes 4.9822 seconds +AUTOTUNE mixed_mm(4096x5120, 5120x1280) + fallback_mixed_mm 0.2896 ms 100.0% + triton_mm_89 0.5665 ms 51.1% + triton_mm_90 0.5752 ms 50.3% + triton_mm_92 0.5992 ms 48.3% + triton_mm_91 0.6629 ms 43.7% + triton_mm_88 0.6689 ms 43.3% + triton_mm_95 0.7676 ms 37.7% + triton_mm_96 1.0209 ms 28.4% + triton_mm_93 1.1230 ms 25.8% + triton_mm_94 1.1808 ms 24.5% +SingleProcess AUTOTUNE takes 5.0007 seconds +AUTOTUNE mixed_mm(4096x1280, 1280x3840) + fallback_mixed_mm 0.1887 ms 100.0% + triton_mm_652 0.4243 ms 44.5% + triton_mm_653 0.4288 ms 44.0% + triton_mm_655 0.4549 ms 41.5% + triton_mm_651 0.4617 ms 40.9% + triton_mm_654 0.5019 ms 37.6% + triton_mm_658 0.5252 ms 35.9% + triton_mm_659 0.7902 ms 23.9% + triton_mm_656 0.8526 ms 22.1% + triton_mm_661 0.8606 ms 21.9% +SingleProcess AUTOTUNE takes 5.0035 seconds +AUTOTUNE mixed_mm(4096x1280, 1280x1280) + fallback_mixed_mm 0.0956 ms 100.0% + triton_mm_711 0.1500 ms 63.7% + triton_mm_712 0.1524 ms 62.7% + triton_mm_714 0.1594 ms 59.9% + triton_mm_713 0.1749 ms 54.6% + triton_mm_710 0.1773 ms 53.9% + triton_mm_717 0.1988 ms 48.1% + triton_mm_718 0.2696 ms 35.4% + triton_mm_715 0.2944 ms 32.5% + triton_mm_720 0.3064 ms 31.2% +SingleProcess AUTOTUNE takes 5.2980 seconds +AUTOTUNE mixed_mm(5x256, 256x256) + triton_mm_2934 0.0091 ms 100.0% + triton_mm_2938 0.0095 ms 95.3% + triton_mm_2931 0.0101 ms 89.9% + triton_mm_2935 0.0101 ms 89.7% + triton_mm_2937 0.0101 ms 89.6% + triton_mm_2933 0.0102 ms 89.0% + triton_mm_2930 0.0113 ms 80.5% + triton_mm_2932 0.0124 ms 73.4% + triton_mm_2929 0.0138 ms 66.0% + triton_mm_2939 0.0148 ms 61.2% +SingleProcess AUTOTUNE takes 4.1403 seconds +AUTOTUNE mixed_mm(5x256, 256x128) + triton_mm_2978 0.0085 ms 100.0% + triton_mm_2981 0.0094 ms 90.2% + triton_mm_2982 0.0095 ms 89.3% + triton_mm_2975 0.0099 ms 86.4% + triton_mm_2979 0.0101 ms 84.2% + triton_mm_2977 0.0102 ms 83.8% + triton_mm_2974 0.0111 ms 77.0% + triton_mm_2976 0.0120 ms 71.1% + triton_mm_2973 0.0134 ms 63.6% + triton_mm_2983 0.0138 ms 61.6% +SingleProcess AUTOTUNE takes 4.1063 seconds +AUTOTUNE mixed_mm(4096x256, 256x128) + triton_mm_3035 0.0132 ms 100.0% + triton_mm_3033 0.0141 ms 94.1% + triton_mm_3034 0.0144 ms 92.3% + triton_mm_3032 0.0145 ms 91.6% + triton_mm_3040 0.0158 ms 83.6% + triton_mm_3037 0.0161 ms 82.1% + triton_mm_3036 0.0166 ms 79.9% + triton_mm_3031 0.0168 ms 79.0% + triton_mm_3039 0.0177 ms 75.0% + triton_mm_3041 0.0181 ms 73.1% +SingleProcess AUTOTUNE takes 4.8763 seconds +AUTOTUNE mixed_mm(4096x256, 256x128) + triton_mm_3043 0.0151 ms 100.0% + triton_mm_3044 0.0151 ms 99.9% + triton_mm_3045 0.0172 ms 87.7% + triton_mm_3046 0.0172 ms 87.6% + triton_mm_3042 0.0200 ms 75.6% + triton_mm_3050 0.0207 ms 73.0% + triton_mm_3047 0.0216 ms 69.9% + triton_mm_3051 0.0219 ms 69.0% + triton_mm_3052 0.0222 ms 68.0% + triton_mm_3048 0.0222 ms 68.0% +SingleProcess AUTOTUNE takes 5.3159 seconds +AUTOTUNE mixed_mm(5x128, 128x256) + triton_mm_3062 0.0075 ms 100.0% + triton_mm_3058 0.0078 ms 95.5% + triton_mm_3059 0.0082 ms 90.5% + triton_mm_3057 0.0084 ms 88.3% + triton_mm_3055 0.0086 ms 86.3% + triton_mm_3061 0.0089 ms 84.1% + triton_mm_3054 0.0091 ms 82.0% + triton_mm_3056 0.0097 ms 77.2% + triton_mm_3053 0.0101 ms 73.5% + triton_mm_3063 0.0105 ms 70.8% +SingleProcess AUTOTUNE takes 4.0176 seconds +AUTOTUNE mixed_mm(5x256, 256x2048) + triton_mm_3068 0.0099 ms 100.0% + triton_mm_3069 0.0100 ms 98.7% + triton_mm_3070 0.0104 ms 94.8% + triton_mm_3072 0.0107 ms 92.8% + triton_mm_3073 0.0114 ms 86.8% + triton_mm_3066 0.0116 ms 85.6% + triton_mm_3065 0.0116 ms 85.1% + triton_mm_3067 0.0121 ms 82.0% + triton_mm_3064 0.0149 ms 66.3% + triton_mm_3074 0.0160 ms 61.7% +SingleProcess AUTOTUNE takes 3.9766 seconds +AUTOTUNE mixed_mm(5x2048, 2048x256) + triton_mm_3080 0.0263 ms 100.0% + triton_mm_3083 0.0298 ms 88.3% + triton_mm_3084 0.0310 ms 85.0% + triton_mm_3079 0.0337 ms 78.2% + triton_mm_3081 0.0340 ms 77.3% + triton_mm_3077 0.0390 ms 67.6% + triton_mm_3076 0.0440 ms 59.9% + triton_mm_3078 0.0495 ms 53.2% + triton_mm_3075 0.0672 ms 39.2% + fallback_mixed_mm 0.0675 ms 39.0% +SingleProcess AUTOTUNE takes 3.9061 seconds +AUTOTUNE mixed_mm(4096x128, 128x256) + triton_mm_3120 0.0128 ms 100.0% + triton_mm_3119 0.0129 ms 99.3% + triton_mm_3121 0.0133 ms 96.5% + triton_mm_3123 0.0138 ms 92.8% + triton_mm_3122 0.0143 ms 89.7% + triton_mm_3128 0.0145 ms 88.7% + triton_mm_3129 0.0145 ms 88.7% + triton_mm_3125 0.0152 ms 84.4% + triton_mm_3124 0.0157 ms 81.8% + triton_mm_3127 0.0164 ms 78.5% +SingleProcess AUTOTUNE takes 5.2285 seconds +AUTOTUNE mixed_mm(1x256, 256x256) + triton_mm_3333 0.0089 ms 100.0% + triton_mm_3332 0.0094 ms 93.9% + triton_mm_3336 0.0094 ms 93.9% + triton_mm_3337 0.0098 ms 90.1% + triton_mm_3334 0.0100 ms 88.8% + triton_mm_3330 0.0108 ms 81.7% + triton_mm_3329 0.0113 ms 78.5% + triton_mm_3331 0.0123 ms 72.3% + triton_mm_3328 0.0140 ms 63.4% + triton_mm_3338 0.0145 ms 61.1% +SingleProcess AUTOTUNE takes 4.3059 seconds +AUTOTUNE mixed_mm(1x256, 256x256) + triton_mm_3344 0.0083 ms 100.0% + triton_mm_3348 0.0092 ms 90.6% + triton_mm_3343 0.0094 ms 88.1% + triton_mm_3345 0.0094 ms 88.1% + triton_mm_3347 0.0095 ms 87.5% + triton_mm_3341 0.0108 ms 76.7% + triton_mm_3340 0.0113 ms 73.7% + triton_mm_3342 0.0128 ms 65.2% + triton_mm_3339 0.0136 ms 61.2% + triton_mm_3349 0.0148 ms 56.0% +SingleProcess AUTOTUNE takes 4.0901 seconds +AUTOTUNE mixed_mm(1x256, 256x32) + triton_mm_3353 0.0079 ms 100.0% + triton_mm_3354 0.0080 ms 98.4% + triton_mm_3351 0.0083 ms 95.0% + triton_mm_3355 0.0083 ms 94.6% + triton_mm_3352 0.0085 ms 93.0% + triton_mm_3350 0.0108 ms 72.8% + triton_mm_3356 0.0138 ms 57.1% + fallback_mixed_mm 0.0641 ms 12.3% +SingleProcess AUTOTUNE takes 2.5497 seconds +AUTOTUNE mixed_mm(1x256, 256x256) + triton_mm_3362 0.0089 ms 100.0% + triton_mm_3366 0.0092 ms 96.5% + triton_mm_3361 0.0094 ms 94.2% + triton_mm_3363 0.0094 ms 93.9% + triton_mm_3365 0.0100 ms 88.4% + triton_mm_3358 0.0108 ms 82.4% + triton_mm_3359 0.0108 ms 81.7% + triton_mm_3360 0.0123 ms 72.3% + triton_mm_3357 0.0140 ms 63.2% + triton_mm_3367 0.0146 ms 60.9% +SingleProcess AUTOTUNE takes 4.1169 seconds +AUTOTUNE mixed_mm(1x256, 256x256) + triton_mm_3391 0.0089 ms 100.0% + triton_mm_3392 0.0095 ms 93.6% + triton_mm_3395 0.0097 ms 91.4% + triton_mm_3390 0.0099 ms 89.2% + triton_mm_3394 0.0100 ms 88.4% + triton_mm_3388 0.0103 ms 86.0% + triton_mm_3387 0.0108 ms 82.4% + triton_mm_3389 0.0123 ms 72.3% + triton_mm_3386 0.0136 ms 65.2% + triton_mm_3396 0.0146 ms 60.7% +SingleProcess AUTOTUNE takes 3.9645 seconds +AUTOTUNE mixed_mm(1x256, 256x256) + triton_mm_3420 0.0089 ms 100.0% + triton_mm_3421 0.0094 ms 93.9% + triton_mm_3424 0.0098 ms 90.8% + triton_mm_3419 0.0100 ms 88.8% + triton_mm_3423 0.0100 ms 88.8% + triton_mm_3417 0.0108 ms 81.7% + triton_mm_3416 0.0113 ms 78.5% + triton_mm_3418 0.0123 ms 72.3% + triton_mm_3415 0.0136 ms 65.2% + triton_mm_3425 0.0141 ms 63.0% +SingleProcess AUTOTUNE takes 4.1161 seconds +AUTOTUNE mixed_mm(1x256, 256x4) + triton_mm_3481 0.0081 ms 100.0% + triton_mm_3478 0.0083 ms 97.3% + triton_mm_3479 0.0086 ms 93.7% + triton_mm_3480 0.0086 ms 93.7% + triton_mm_3482 0.0089 ms 91.3% + triton_mm_3477 0.0111 ms 72.9% + triton_mm_3483 0.0129 ms 62.8% + fallback_mixed_mm 0.0543 ms 14.9% +SingleProcess AUTOTUNE takes 2.4657 seconds +[2023-12-13 02:28:46,929] torch._dynamo.utils: [ERROR] RMSE (res-fp64): 0.00147, (ref-fp64): 0.00048 and shape=torch.Size([1, 1]) +[2023-12-13 02:28:46,929] torch._dynamo.utils: [ERROR] Accuracy failed for key name iou_predictions +fail_accuracy-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +shufflenet_v2_x1_0 +cuda eval shufflenet_v2_x1_0 int8weightonly-bs1-acc +ERROR:common:backend='inductor' raised: +LoweringException: AttributeError: 'SliceView' object has no attribute 'freeze_layout' + target: aten.convolution.default + args[0]: TensorBox( + SliceView( + View( + StorageBox( + ComputedBuffer(name='buf14', layout=FlexibleLayout('cuda', torch.bfloat16, size=[1, 58, 2, 28, 28], stride=[90944, 1568, 784, 28, 1]), data=Pointwise( + 'cuda', + torch.bfloat16, + def inner_fn(index): + _, i1, i2, i3, i4 = index + tmp0 = ops.load(buf13, i4 + 28 * i3 + 784 * i1 + 45472 * i2) + return tmp0 + , + ranges=[1, 58, 2, 28, 28], + origin_node=clone, + origins={clone} + )) + ), + size=[1, 116, 28, 28], + reindex=lambda i0, i1, i2, i3: [0, ModularIndexing(i1, 2, 58), ModularIndexing(i1, 1, 2), i2, i3], + origins={view_1, clone} + ), + size=[1, 58, 28, 28], + reindex=lambda i0, i1, i2, i3: [i0, i1 + 58, i2, i3], + origins={split} + ) + ) + args[1]: TensorBox(StorageBox( + InputBuffer(name='arg18_1', layout=FixedLayout('cuda', torch.bfloat16, size=[58, 58, 1, 1], stride=[58, 1, 1, 1])) + )) + args[2]: None + args[3]: [1, 1] + args[4]: [0, 0] + args[5]: [1, 1] + args[6]: False + args[7]: [0, 0] + args[8]: 1 + +Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information + + +You can suppress this exception and fall back to eager by setting: + import torch._dynamo + torch._dynamo.config.suppress_errors = True +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 2232, in check_accuracy + new_result = optimized_model_iter_fn(model_copy, example_inputs) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/eval_frame.py", line 488, in _fn + return fn(*args, **kwargs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1972, in run_n_iterations + self.model_iter_fn(mod, inputs, collect_outputs=False) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/eval_frame.py", line 654, in catch_errors + return callback(frame, cache_entry, hooks, frame_state) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/convert_frame.py", line 727, in _convert_frame + result = inner_convert(frame, cache_entry, hooks, frame_state) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/convert_frame.py", line 383, in _convert_frame_assert + compiled_product = _compile( + File "/home/cdhernandez/local/pytorch/torch/_dynamo/convert_frame.py", line 646, in _compile + guarded_code = compile_inner(code, one_graph, hooks, transform) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/utils.py", line 244, in time_wrapper + r = func(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/convert_frame.py", line 562, in compile_inner + out_code = transform_code_object(code, transform) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/bytecode_transformation.py", line 1033, in transform_code_object + transformations(instructions, code_options) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/convert_frame.py", line 151, in _fn + return fn(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/convert_frame.py", line 527, in transform + tracer.run() + File "/home/cdhernandez/local/pytorch/torch/_dynamo/symbolic_convert.py", line 2144, in run + super().run() + File "/home/cdhernandez/local/pytorch/torch/_dynamo/symbolic_convert.py", line 818, in run + and self.step() + File "/home/cdhernandez/local/pytorch/torch/_dynamo/symbolic_convert.py", line 781, in step + getattr(self, inst.opname)(inst) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/symbolic_convert.py", line 2259, in RETURN_VALUE + self.output.compile_subgraph( + File "/home/cdhernandez/local/pytorch/torch/_dynamo/output_graph.py", line 919, in compile_subgraph + self.compile_and_call_fx_graph(tx, list(reversed(stack_values)), root) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/contextlib.py", line 79, in inner + return func(*args, **kwds) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/output_graph.py", line 1087, in compile_and_call_fx_graph + compiled_fn = self.call_user_compiler(gm) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/utils.py", line 244, in time_wrapper + r = func(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/output_graph.py", line 1159, in call_user_compiler + raise BackendCompilerFailed(self.compiler_fn, e).with_traceback( + File "/home/cdhernandez/local/pytorch/torch/_dynamo/output_graph.py", line 1140, in call_user_compiler + compiled_fn = compiler_fn(gm, self.example_inputs()) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/repro/after_dynamo.py", line 117, in debug_wrapper + compiled_gm = compiler_fn(gm, example_inputs) + File "/home/cdhernandez/local/pytorch/torch/__init__.py", line 1672, in __call__ + return compile_fx(model_, inputs_, config_patches=self.config) + File "/home/cdhernandez/local/pytorch/torch/_inductor/compile_fx.py", line 952, in compile_fx + return compile_fx( + File "/home/cdhernandez/local/pytorch/torch/_inductor/compile_fx.py", line 1168, in compile_fx + return aot_autograd( + File "/home/cdhernandez/local/pytorch/torch/_dynamo/backends/common.py", line 55, in compiler_fn + cg = aot_module_simplified(gm, example_inputs, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_functorch/aot_autograd.py", line 887, in aot_module_simplified + compiled_fn = create_aot_dispatcher_function( + File "/home/cdhernandez/local/pytorch/torch/_dynamo/utils.py", line 244, in time_wrapper + r = func(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_functorch/aot_autograd.py", line 600, in create_aot_dispatcher_function + compiled_fn = compiler_fn(flat_fn, fake_flat_args, aot_config, fw_metadata=fw_metadata) + File "/home/cdhernandez/local/pytorch/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 425, in aot_wrapper_dedupe + return compiler_fn(flat_fn, leaf_flat_args, aot_config, fw_metadata=fw_metadata) + File "/home/cdhernandez/local/pytorch/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 630, in aot_wrapper_synthetic_base + return compiler_fn(flat_fn, flat_args, aot_config, fw_metadata=fw_metadata) + File "/home/cdhernandez/local/pytorch/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py", line 97, in aot_dispatch_base + compiled_fw = compiler(fw_module, updated_flat_args) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/utils.py", line 244, in time_wrapper + r = func(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_inductor/compile_fx.py", line 1100, in fw_compiler_base + return inner_compile( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/contextlib.py", line 79, in inner + return func(*args, **kwds) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/repro/after_aot.py", line 83, in debug_wrapper + inner_compiled_fn = compiler_fn(gm, example_inputs) + File "/home/cdhernandez/local/pytorch/torch/_inductor/debug.py", line 305, in inner + return fn(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/contextlib.py", line 79, in inner + return func(*args, **kwds) + File "/home/cdhernandez/local/pytorch/torch/_inductor/compile_fx.py", line 320, in compile_fx_inner + compiled_graph = fx_codegen_and_compile( + File "/home/cdhernandez/local/pytorch/torch/_inductor/compile_fx.py", line 535, in fx_codegen_and_compile + graph.run(*example_inputs) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/utils.py", line 244, in time_wrapper + r = func(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_inductor/graph.py", line 518, in run + return super().run(*args) + File "/home/cdhernandez/local/pytorch/torch/fx/interpreter.py", line 138, in run + self.env[node] = self.run_node(node) + File "/home/cdhernandez/local/pytorch/torch/_inductor/graph.py", line 815, in run_node + result = self.call_function(n.target, args, kwargs) + File "/home/cdhernandez/local/pytorch/torch/_inductor/graph.py", line 695, in call_function + raise LoweringException(e, target, args, kwargs).with_traceback( + File "/home/cdhernandez/local/pytorch/torch/_inductor/graph.py", line 692, in call_function + out = lowerings[target](*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_inductor/lowering.py", line 291, in wrapped + out = decomp_fn(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_inductor/kernel/conv.py", line 363, in convolution + return convert_1x1_conv_to_mm(x, weight, bias) + File "/home/cdhernandez/local/pytorch/torch/_inductor/kernel/conv.py", line 280, in convert_1x1_conv_to_mm + x.freeze_layout() + File "/home/cdhernandez/local/pytorch/torch/_inductor/ir.py", line 6264, in __getattr__ + fn = getattr(self.data, name) +torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised: +LoweringException: AttributeError: 'SliceView' object has no attribute 'freeze_layout' + target: aten.convolution.default + args[0]: TensorBox( + SliceView( + View( + StorageBox( + ComputedBuffer(name='buf14', layout=FlexibleLayout('cuda', torch.bfloat16, size=[1, 58, 2, 28, 28], stride=[90944, 1568, 784, 28, 1]), data=Pointwise( + 'cuda', + torch.bfloat16, + def inner_fn(index): + _, i1, i2, i3, i4 = index + tmp0 = ops.load(buf13, i4 + 28 * i3 + 784 * i1 + 45472 * i2) + return tmp0 + , + ranges=[1, 58, 2, 28, 28], + origin_node=clone, + origins={clone} + )) + ), + size=[1, 116, 28, 28], + reindex=lambda i0, i1, i2, i3: [0, ModularIndexing(i1, 2, 58), ModularIndexing(i1, 1, 2), i2, i3], + origins={view_1, clone} + ), + size=[1, 58, 28, 28], + reindex=lambda i0, i1, i2, i3: [i0, i1 + 58, i2, i3], + origins={split} + ) + ) + args[1]: TensorBox(StorageBox( + InputBuffer(name='arg18_1', layout=FixedLayout('cuda', torch.bfloat16, size=[58, 58, 1, 1], stride=[58, 1, 1, 1])) + )) + args[2]: None + args[3]: [1, 1] + args[4]: [0, 0] + args[5]: [1, 1] + args[6]: False + args[7]: [0, 0] + args[8]: 1 + +Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information + + +You can suppress this exception and fall back to eager by setting: + import torch._dynamo + torch._dynamo.config.suppress_errors = True + +TorchDynamo optimized model failed to run because of following error +fail_to_run + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +soft_actor_critic +cuda eval soft_actor_critic int8weightonly-bs1-acc +AUTOTUNE mixed_mm(256x3, 3x1024) + triton_mm_9 0.0069 ms 100.0% + triton_mm_6 0.0074 ms 93.9% + triton_mm_5 0.0077 ms 89.7% + triton_mm_0 0.0078 ms 88.6% + triton_mm_10 0.0079 ms 87.9% + triton_mm_8 0.0080 ms 86.8% + triton_mm_3 0.0087 ms 79.5% + triton_mm_1 0.0088 ms 78.9% + triton_mm_4 0.0090 ms 77.5% + triton_mm_2 0.0090 ms 77.2% +SingleProcess AUTOTUNE takes 4.4565 seconds +AUTOTUNE mixed_mm(256x1024, 1024x1024) + triton_mm_20 0.0256 ms 100.0% + triton_mm_19 0.0263 ms 97.6% + triton_mm_15 0.0300 ms 85.3% + triton_mm_17 0.0302 ms 84.9% + triton_mm_16 0.0307 ms 83.4% + triton_mm_14 0.0321 ms 79.9% + triton_mm_12 0.0326 ms 78.7% + triton_mm_13 0.0334 ms 76.8% + triton_mm_11 0.0427 ms 60.1% + triton_mm_21 0.0449 ms 57.1% +SingleProcess AUTOTUNE takes 5.1529 seconds +AUTOTUNE mixed_mm(256x1024, 1024x2) + triton_mm_31 0.0218 ms 100.0% + triton_mm_30 0.0273 ms 79.9% + triton_mm_23 0.0348 ms 62.6% + triton_mm_28 0.0359 ms 60.6% + triton_mm_27 0.0370 ms 58.9% + triton_mm_25 0.0373 ms 58.5% + triton_mm_32 0.0424 ms 51.4% + triton_mm_24 0.0426 ms 51.2% + triton_mm_26 0.0449 ms 48.5% + triton_mm_22 0.0508 ms 42.9% +SingleProcess AUTOTUNE takes 4.0941 seconds +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +speech_transformer +cuda eval speech_transformer int8weightonly-bs1-acc +AUTOTUNE mixed_mm(2040x320, 320x512) + triton_mm_1 0.0203 ms 100.0% + triton_mm_0 0.0206 ms 98.4% + triton_mm_2 0.0208 ms 97.7% + triton_mm_4 0.0212 ms 95.8% + triton_mm_3 0.0222 ms 91.2% + triton_mm_5 0.0246 ms 82.4% + triton_mm_6 0.0247 ms 82.0% + triton_mm_10 0.0254 ms 79.7% + triton_mm_8 0.0266 ms 76.3% + triton_mm_7 0.0296 ms 68.5% +SingleProcess AUTOTUNE takes 5.4775 seconds +AUTOTUNE mixed_mm(2040x512, 512x512) + triton_mm_12 0.0274 ms 100.0% + triton_mm_13 0.0277 ms 98.6% + triton_mm_15 0.0286 ms 95.6% + triton_mm_11 0.0287 ms 95.3% + triton_mm_14 0.0312 ms 87.7% + triton_mm_16 0.0342 ms 80.0% + triton_mm_17 0.0343 ms 79.8% + triton_mm_19 0.0363 ms 75.4% + triton_mm_21 0.0368 ms 74.4% + triton_mm_20 0.0369 ms 74.2% +SingleProcess AUTOTUNE takes 5.1231 seconds +AUTOTUNE mixed_mm(2040x512, 512x2048) + triton_mm_80 0.0567 ms 100.0% + triton_mm_81 0.0575 ms 98.7% + triton_mm_83 0.0604 ms 94.0% + triton_mm_79 0.0614 ms 92.4% + triton_mm_82 0.0658 ms 86.2% + triton_mm_86 0.0809 ms 70.1% + fallback_mixed_mm 0.0848 ms 66.9% + triton_mm_87 0.1016 ms 55.9% + triton_mm_84 0.1031 ms 55.0% + triton_mm_85 0.1072 ms 52.9% +SingleProcess AUTOTUNE takes 4.9603 seconds +AUTOTUNE mixed_mm(2040x2048, 2048x512) + fallback_mixed_mm 0.0760 ms 100.0% + triton_mm_91 0.0856 ms 88.7% + triton_mm_92 0.0876 ms 86.7% + triton_mm_94 0.0884 ms 85.9% + triton_mm_90 0.0916 ms 83.0% + triton_mm_93 0.0982 ms 77.4% + triton_mm_95 0.1084 ms 70.1% + triton_mm_96 0.1111 ms 68.4% + triton_mm_98 0.1151 ms 66.0% + triton_mm_99 0.1161 ms 65.5% +SingleProcess AUTOTUNE takes 5.1542 seconds +skipping cudagraphs due to ['non-cuda device in graph'] +AUTOTUNE mixed_mm(220x512, 512x512) + triton_mm_556 0.0136 ms 100.0% + triton_mm_557 0.0140 ms 97.3% + triton_mm_560 0.0147 ms 93.0% + triton_mm_559 0.0172 ms 79.0% + triton_mm_555 0.0186 ms 73.2% + triton_mm_554 0.0192 ms 71.0% + triton_mm_552 0.0196 ms 69.4% + triton_mm_553 0.0201 ms 67.8% + triton_mm_551 0.0237 ms 57.5% + triton_mm_561 0.0245 ms 55.6% +SingleProcess AUTOTUNE takes 4.9287 seconds +AUTOTUNE mixed_mm(220x512, 512x2048) + triton_mm_683 0.0189 ms 100.0% + triton_mm_682 0.0194 ms 97.4% + triton_mm_681 0.0203 ms 92.9% + triton_mm_680 0.0204 ms 92.6% + triton_mm_688 0.0230 ms 82.1% + triton_mm_685 0.0243 ms 77.8% + triton_mm_684 0.0243 ms 77.8% + triton_mm_687 0.0257 ms 73.4% + triton_mm_679 0.0258 ms 73.2% + triton_mm_689 0.0280 ms 67.5% +SingleProcess AUTOTUNE takes 5.1043 seconds +AUTOTUNE mixed_mm(220x2048, 2048x512) + triton_mm_699 0.0342 ms 100.0% + triton_mm_695 0.0347 ms 98.5% + triton_mm_696 0.0352 ms 97.2% + triton_mm_698 0.0448 ms 76.3% + triton_mm_694 0.0519 ms 65.8% + triton_mm_693 0.0559 ms 61.1% + triton_mm_691 0.0579 ms 59.1% + triton_mm_692 0.0595 ms 57.5% + fallback_mixed_mm 0.0728 ms 46.9% + triton_mm_690 0.0769 ms 44.5% +SingleProcess AUTOTUNE takes 5.0910 seconds +AUTOTUNE mixed_mm(220x512, 512x1014) + triton_mm_1460 0.0171 ms 100.0% + triton_mm_1459 0.0239 ms 71.7% + triton_mm_1457 0.0241 ms 71.1% + triton_mm_1456 0.0260 ms 65.9% + triton_mm_1461 0.0293 ms 58.3% + triton_mm_1455 0.0311 ms 55.0% + triton_mm_1454 0.0320 ms 53.6% + triton_mm_1451 0.0326 ms 52.6% + triton_mm_1453 0.0357 ms 47.9% + triton_mm_1452 0.0405 ms 42.2% +SingleProcess AUTOTUNE takes 5.5352 seconds +skipping cudagraphs due to ['non-cuda device in graph'] +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +squeezenet1_1 +cuda eval squeezenet1_1 int8weightonly-bs1-acc +pass-sqnr-43.374 + loading model: 0it [00:00, ?it/s]stable_diffusion_text_encoder + + Loading pipeline components...: 0%| | 0/6 [00:00 + run() + File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run + benchmark.run(bm_args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run + main(TorchBenchmarkRunner(), original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main + process_entry(0, runner, original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry + return maybe_fresh_cache( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner + return fn(*args, **kwargs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 383, in load_model + module = importlib.import_module(c) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/importlib/__init__.py", line 126, in import_module + return _bootstrap._gcd_import(name[level:], package, level) + File "", line 1050, in _gcd_import + File "", line 1027, in _find_and_load + File "", line 1006, in _find_and_load_unlocked + File "", line 688, in _load_unlocked + File "", line 883, in exec_module + File "", line 241, in _call_with_frames_removed + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/__init__.py", line 7, in + from .data.dlrm_dataloader import get_dataloader + File "/home/cdhernandez/local/benchmark/torchbenchmark/canary_models/torchrec_dlrm/data/dlrm_dataloader.py", line 13, in + from torchrec.datasets.criteo import ( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/__init__.py", line 8, in + import torchrec.distributed # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/__init__.py", line 36, in + from torchrec.distributed.model_parallel import DistributedModelParallel # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/model_parallel.py", line 21, in + from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/__init__.py", line 22, in + from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/planners.py", line 19, in + from torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/planner/constants.py", line 10, in + from torchrec.distributed.embedding_types import EmbeddingComputeKernel + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/torchrec/distributed/embedding_types.py", line 14, in + from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/__init__.py", line 23, in + from . import _fbgemm_gpu_docs, sparse_ops # noqa: F401, E402 # noqa: F401, E402 + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py", line 19, in + torch.ops.fbgemm.jagged_2d_to_dense, + File "/home/cdhernandez/local/pytorch/torch/_ops.py", line 822, in __getattr__ + raise AttributeError( +AttributeError: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense' +Run failed with return code: 1 +Output: None +Error: None + loading model: 0it [00:00, ?it/s] loading model: 0it [00:05, ?it/s] +BERT_pytorch +cuda eval BERT_pytorch int4weightonly-bs1-acc +pass-sqnr-20.916 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:05, ?it/s] +Background_Matting +cuda eval Background_Matting int4weightonly-bs1-acc +pass_due_to_skip + loading model: 0it [00:00, ?it/s]WARNING:common:Model DALLE2_pytorch does not support bfloat16, running with amp instead + loading model: 0it [00:12, ?it/s] +WARNING:common:Model DALLE2_pytorch does not support bfloat16, running with amp instead +DALLE2_pytorch +cuda eval DALLE2_pytorch int4weightonly-bs1-acc +WARNING:common:fp64 golden ref were not generated for DALLE2_pytorch. Setting accuracy check to cosine +WARNING:common:Model DALLE2_pytorch does not support bfloat16, running with amp instead +[2023-12-13 02:46:16,237] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 02:46:16,471] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 02:46:16,658] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 02:46:16,841] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 02:46:17,026] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 02:46:17,212] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 02:46:17,397] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 02:46:17,582] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 02:46:17,763] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 02:46:17,943] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 02:46:18,133] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 02:46:18,316] [3/1_2] torch._inductor.utils: [WARNING] DeviceCopy in input program +skipping cudagraphs due to ['non-cuda device in graph'] +[2023-12-13 02:46:50,103] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 02:46:50,276] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 02:46:50,447] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 02:46:50,615] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 02:46:50,784] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 02:46:50,952] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 02:46:51,125] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 02:46:51,295] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 02:46:51,464] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 02:46:51,632] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 02:46:51,798] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 02:46:51,965] [9/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +skipping cudagraphs due to ['non-cuda device in graph'] +pass-sqnr-inf + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +LearningToPaint +cuda eval LearningToPaint int4weightonly-bs1-acc +pass-sqnr-55.632 + loading model: 0it [00:00, ?it/s]WARNING:common:Model Super_SloMo does not support bfloat16, running with amp instead + loading model: 0it [00:04, ?it/s] +WARNING:common:Model Super_SloMo does not support bfloat16, running with amp instead +Super_SloMo +cuda eval Super_SloMo int4weightonly-bs1-acc +WARNING:common:Model Super_SloMo does not support bfloat16, running with amp instead +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +alexnet +cuda eval alexnet int4weightonly-bs1-acc +pass-sqnr-23.561 + loading model: 0it [00:00, ?it/s]basic_gnn_edgecnn + loading model: 0it [00:04, ?it/s] +cuda eval basic_gnn_edgecnn int4weightonly-bs1-acc +skipping cudagraphs due to ['incompatible ops'] +pass-sqnr-50.445 + loading model: 0it [00:00, ?it/s]basic_gnn_gcn + loading model: 0it [00:03, ?it/s] +cuda eval basic_gnn_gcn int4weightonly-bs1-acc +skipping cudagraphs due to ['incompatible ops'] +skipping cudagraphs due to ['incompatible ops'] +skipping cudagraphs due to ['incompatible ops'] +skipping cudagraphs due to ['incompatible ops'] +pass-sqnr-49.672 + loading model: 0it [00:00, ?it/s]basic_gnn_gin + loading model: 0it [00:03, ?it/s] +cuda eval basic_gnn_gin int4weightonly-bs1-acc +skipping cudagraphs due to ['incompatible ops'] +pass-sqnr-44.907 + loading model: 0it [00:00, ?it/s]basic_gnn_sage + loading model: 0it [00:03, ?it/s] +cuda eval basic_gnn_sage int4weightonly-bs1-acc +skipping cudagraphs due to ['incompatible ops'] +pass-sqnr-48.436 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:09, ?it/s] +cm3leon_generate +cuda eval cm3leon_generate int4weightonly-bs1-acc +AUTOTUNE bmm(16x1x96, 16x96x489) + triton_bmm_46616 0.0084 ms 100.0% + triton_bmm_46613 0.0085 ms 99.2% + triton_bmm_46615 0.0089 ms 95.0% + triton_bmm_46620 0.0089 ms 95.0% + triton_bmm_46618 0.0089 ms 94.8% + triton_bmm_46612 0.0091 ms 93.0% + triton_bmm_46614 0.0091 ms 93.0% + triton_bmm_46619 0.0093 ms 91.0% + triton_bmm_46617 0.0093 ms 90.4% + triton_bmm_46621 0.0097 ms 86.8% +SingleProcess AUTOTUNE takes 3.8982 seconds +AUTOTUNE bmm(16x1x489, 16x489x96) + bmm 0.0116 ms 100.0% + triton_bmm_46645 0.0123 ms 94.5% + triton_bmm_46644 0.0148 ms 79.0% + triton_bmm_46641 0.0182 ms 64.1% + triton_bmm_46638 0.0186 ms 62.7% + triton_bmm_46642 0.0186 ms 62.7% + triton_bmm_46640 0.0188 ms 62.0% + triton_bmm_46637 0.0199 ms 58.6% + triton_bmm_46639 0.0206 ms 56.6% + triton_bmm_46636 0.0238 ms 49.0% +SingleProcess AUTOTUNE takes 4.1799 seconds +AUTOTUNE bmm(16x1x96, 16x96x490) + triton_bmm_46712 0.0084 ms 100.0% + triton_bmm_46716 0.0087 ms 97.4% + triton_bmm_46711 0.0087 ms 97.1% + triton_bmm_46709 0.0089 ms 94.6% + triton_bmm_46714 0.0089 ms 94.6% + triton_bmm_46710 0.0090 ms 94.3% + triton_bmm_46715 0.0093 ms 91.0% + triton_bmm_46713 0.0093 ms 90.7% + triton_bmm_46708 0.0095 ms 88.6% + triton_bmm_46719 0.0100 ms 84.6% +SingleProcess AUTOTUNE takes 3.8193 seconds +AUTOTUNE bmm(16x1x490, 16x490x96) + triton_bmm_46737 0.0097 ms 100.0% + triton_bmm_46738 0.0104 ms 93.5% + triton_bmm_46740 0.0106 ms 91.6% + triton_bmm_46736 0.0111 ms 87.6% + triton_bmm_46741 0.0112 ms 86.6% + triton_bmm_46735 0.0115 ms 84.9% + triton_bmm_46734 0.0125 ms 77.7% + triton_bmm_46733 0.0130 ms 75.1% + bmm 0.0157 ms 61.8% + triton_bmm_46732 0.0176 ms 55.4% +SingleProcess AUTOTUNE takes 4.5039 seconds +AUTOTUNE bmm(16x1x96, 16x96x491) + triton_bmm_46805 0.0085 ms 100.0% + triton_bmm_46806 0.0086 ms 98.1% + triton_bmm_46807 0.0088 ms 96.7% + triton_bmm_46812 0.0089 ms 95.7% + triton_bmm_46808 0.0089 ms 95.0% + triton_bmm_46810 0.0089 ms 95.0% + triton_bmm_46804 0.0091 ms 93.6% + triton_bmm_46809 0.0093 ms 90.8% + triton_bmm_46815 0.0097 ms 87.5% + triton_bmm_46811 0.0097 ms 87.2% +SingleProcess AUTOTUNE takes 3.9143 seconds +AUTOTUNE bmm(16x1x491, 16x491x96) + bmm 0.0122 ms 100.0% + triton_bmm_46837 0.0123 ms 99.2% + triton_bmm_46836 0.0149 ms 82.2% + triton_bmm_46833 0.0183 ms 66.7% + triton_bmm_46834 0.0187 ms 65.5% + triton_bmm_46832 0.0188 ms 64.9% + triton_bmm_46830 0.0190 ms 64.2% + triton_bmm_46829 0.0195 ms 62.8% + triton_bmm_46831 0.0206 ms 59.4% + triton_bmm_46828 0.0244 ms 50.0% +SingleProcess AUTOTUNE takes 3.7980 seconds +AUTOTUNE bmm(16x1x96, 16x96x492) + triton_bmm_46901 0.0084 ms 100.0% + triton_bmm_46906 0.0084 ms 100.0% + triton_bmm_46908 0.0087 ms 97.4% + triton_bmm_46903 0.0087 ms 97.1% + triton_bmm_46904 0.0088 ms 95.7% + triton_bmm_46905 0.0089 ms 95.3% + triton_bmm_46902 0.0089 ms 94.6% + triton_bmm_46907 0.0093 ms 91.0% + triton_bmm_46900 0.0095 ms 89.2% + triton_bmm_46909 0.0095 ms 88.9% +SingleProcess AUTOTUNE takes 5.6501 seconds +AUTOTUNE bmm(16x1x492, 16x492x96) + triton_bmm_46929 0.0097 ms 100.0% + triton_bmm_46932 0.0102 ms 95.6% + triton_bmm_46930 0.0104 ms 93.5% + triton_bmm_46933 0.0105 ms 92.4% + triton_bmm_46928 0.0111 ms 87.9% + triton_bmm_46927 0.0119 ms 81.7% + triton_bmm_46926 0.0123 ms 79.0% + triton_bmm_46925 0.0134 ms 72.6% + bmm 0.0158 ms 61.4% + triton_bmm_46924 0.0180 ms 54.2% +SingleProcess AUTOTUNE takes 4.5446 seconds +AUTOTUNE bmm(16x1x96, 16x96x493) + triton_bmm_46999 0.0084 ms 100.0% + triton_bmm_47002 0.0085 ms 98.9% + triton_bmm_46997 0.0086 ms 98.3% + triton_bmm_46998 0.0086 ms 97.4% + triton_bmm_47000 0.0089 ms 94.3% + triton_bmm_47001 0.0090 ms 93.3% + triton_bmm_46996 0.0091 ms 92.9% + triton_bmm_47003 0.0093 ms 90.7% + triton_bmm_47004 0.0093 ms 90.1% + triton_bmm_47007 0.0097 ms 86.7% +SingleProcess AUTOTUNE takes 3.9997 seconds +AUTOTUNE bmm(16x1x493, 16x493x96) + bmm 0.0121 ms 100.0% + triton_bmm_47029 0.0128 ms 94.7% + triton_bmm_47028 0.0145 ms 83.6% + triton_bmm_47026 0.0182 ms 66.5% + triton_bmm_47024 0.0184 ms 65.9% + triton_bmm_47022 0.0186 ms 65.1% + triton_bmm_47025 0.0186 ms 64.9% + triton_bmm_47021 0.0199 ms 60.7% + triton_bmm_47023 0.0203 ms 59.5% + triton_bmm_47020 0.0243 ms 49.9% +SingleProcess AUTOTUNE takes 4.0475 seconds +AUTOTUNE bmm(16x1x96, 16x96x494) + triton_bmm_47098 0.0085 ms 100.0% + triton_bmm_47096 0.0088 ms 97.1% + triton_bmm_47100 0.0089 ms 96.0% + triton_bmm_47093 0.0089 ms 95.7% + triton_bmm_47095 0.0089 ms 95.7% + triton_bmm_47094 0.0090 ms 95.0% + triton_bmm_47099 0.0093 ms 91.7% + triton_bmm_47097 0.0093 ms 91.1% + triton_bmm_47092 0.0095 ms 89.3% + triton_bmm_47102 0.0099 ms 85.8% +SingleProcess AUTOTUNE takes 3.7389 seconds +AUTOTUNE bmm(16x1x494, 16x494x96) + triton_bmm_47121 0.0098 ms 100.0% + triton_bmm_47122 0.0104 ms 93.8% + triton_bmm_47124 0.0107 ms 91.6% + triton_bmm_47120 0.0108 ms 90.5% + triton_bmm_47125 0.0108 ms 90.2% + triton_bmm_47119 0.0115 ms 85.2% + triton_bmm_47118 0.0125 ms 78.0% + triton_bmm_47117 0.0134 ms 72.8% + bmm 0.0156 ms 62.5% + triton_bmm_47116 0.0177 ms 55.2% +SingleProcess AUTOTUNE takes 4.3331 seconds +AUTOTUNE bmm(16x1x96, 16x96x495) + triton_bmm_47191 0.0084 ms 100.0% + triton_bmm_47192 0.0085 ms 98.9% + triton_bmm_47190 0.0086 ms 97.4% + triton_bmm_47189 0.0089 ms 94.3% + triton_bmm_47193 0.0090 ms 93.9% + triton_bmm_47194 0.0090 ms 93.9% + triton_bmm_47188 0.0091 ms 92.6% + triton_bmm_47196 0.0093 ms 90.1% + triton_bmm_47199 0.0097 ms 86.7% + triton_bmm_47197 0.0097 ms 86.5% +SingleProcess AUTOTUNE takes 4.1087 seconds +AUTOTUNE bmm(16x1x495, 16x495x96) + triton_bmm_47221 0.0123 ms 100.0% + bmm 0.0126 ms 98.0% + triton_bmm_47220 0.0149 ms 82.6% + triton_bmm_47217 0.0184 ms 67.1% + triton_bmm_47218 0.0186 ms 66.2% + triton_bmm_47214 0.0187 ms 66.0% + triton_bmm_47216 0.0188 ms 65.4% + triton_bmm_47213 0.0200 ms 61.7% + triton_bmm_47215 0.0207 ms 59.4% + triton_bmm_47212 0.0240 ms 51.3% +SingleProcess AUTOTUNE takes 3.8697 seconds +AUTOTUNE bmm(16x1x96, 16x96x496) + triton_bmm_47287 0.0083 ms 100.0% + triton_bmm_47288 0.0083 ms 99.6% + triton_bmm_47285 0.0084 ms 98.5% + triton_bmm_47290 0.0085 ms 97.7% + triton_bmm_47286 0.0090 ms 92.2% + triton_bmm_47292 0.0092 ms 90.2% + triton_bmm_47291 0.0093 ms 89.3% + triton_bmm_47289 0.0093 ms 88.7% + triton_bmm_47284 0.0095 ms 86.9% + triton_bmm_47293 0.0098 ms 84.9% +SingleProcess AUTOTUNE takes 3.9100 seconds +AUTOTUNE bmm(16x1x496, 16x496x96) + triton_bmm_47316 0.0102 ms 100.0% + triton_bmm_47314 0.0104 ms 98.1% + triton_bmm_47313 0.0105 ms 97.2% + triton_bmm_47312 0.0106 ms 96.1% + triton_bmm_47317 0.0108 ms 94.1% + bmm 0.0110 ms 92.4% + triton_bmm_47311 0.0118 ms 86.4% + triton_bmm_47310 0.0126 ms 80.7% + triton_bmm_47309 0.0130 ms 78.5% + triton_bmm_47308 0.0175 ms 58.1% +SingleProcess AUTOTUNE takes 3.9837 seconds +AUTOTUNE bmm(16x1x96, 16x96x497) + triton_bmm_47383 0.0084 ms 100.0% + triton_bmm_47384 0.0085 ms 99.2% + triton_bmm_47382 0.0086 ms 97.8% + triton_bmm_47385 0.0089 ms 95.3% + triton_bmm_47388 0.0089 ms 95.3% + triton_bmm_47381 0.0089 ms 94.6% + triton_bmm_47386 0.0090 ms 94.3% + triton_bmm_47380 0.0095 ms 88.6% + triton_bmm_47387 0.0098 ms 86.6% + triton_bmm_47390 0.0098 ms 86.6% +SingleProcess AUTOTUNE takes 4.0163 seconds +AUTOTUNE bmm(16x1x497, 16x497x96) + bmm 0.0113 ms 100.0% + triton_bmm_47413 0.0128 ms 88.2% + triton_bmm_47412 0.0145 ms 77.9% + triton_bmm_47408 0.0186 ms 60.7% + triton_bmm_47406 0.0188 ms 60.0% + triton_bmm_47410 0.0188 ms 59.9% + triton_bmm_47409 0.0189 ms 59.7% + triton_bmm_47405 0.0201 ms 56.1% + triton_bmm_47407 0.0208 ms 54.2% + triton_bmm_47404 0.0245 ms 46.0% +SingleProcess AUTOTUNE takes 3.8540 seconds +AUTOTUNE bmm(16x1x96, 16x96x498) + triton_bmm_47479 0.0084 ms 100.0% + triton_bmm_47477 0.0084 ms 99.6% + triton_bmm_47480 0.0085 ms 98.9% + triton_bmm_47482 0.0085 ms 98.9% + triton_bmm_47484 0.0087 ms 96.7% + triton_bmm_47478 0.0090 ms 93.9% + triton_bmm_47481 0.0092 ms 91.6% + triton_bmm_47483 0.0093 ms 90.7% + triton_bmm_47476 0.0095 ms 88.3% + triton_bmm_47486 0.0099 ms 84.8% +SingleProcess AUTOTUNE takes 3.7941 seconds +AUTOTUNE bmm(16x1x498, 16x498x96) + triton_bmm_47506 0.0100 ms 100.0% + triton_bmm_47505 0.0102 ms 97.5% + triton_bmm_47508 0.0106 ms 93.7% + triton_bmm_47504 0.0108 ms 92.0% + triton_bmm_47509 0.0113 ms 88.1% + triton_bmm_47503 0.0115 ms 86.9% + triton_bmm_47502 0.0128 ms 77.8% + triton_bmm_47501 0.0134 ms 74.0% + bmm 0.0158 ms 62.8% + triton_bmm_47500 0.0181 ms 54.9% +SingleProcess AUTOTUNE takes 4.0899 seconds +AUTOTUNE bmm(16x1x96, 16x96x499) + triton_bmm_47576 0.0085 ms 100.0% + triton_bmm_47575 0.0087 ms 97.8% + triton_bmm_47573 0.0090 ms 95.4% + triton_bmm_47574 0.0090 ms 95.0% + triton_bmm_47578 0.0090 ms 95.0% + triton_bmm_47580 0.0092 ms 92.7% + triton_bmm_47577 0.0093 ms 91.4% + triton_bmm_47572 0.0095 ms 89.6% + triton_bmm_47583 0.0097 ms 87.8% + triton_bmm_47579 0.0098 ms 87.5% +SingleProcess AUTOTUNE takes 3.9206 seconds +AUTOTUNE bmm(16x1x499, 16x499x96) + bmm 0.0116 ms 100.0% + triton_bmm_47605 0.0128 ms 91.2% + triton_bmm_47604 0.0149 ms 77.9% + triton_bmm_47602 0.0184 ms 63.4% + triton_bmm_47600 0.0186 ms 62.7% + triton_bmm_47598 0.0188 ms 61.9% + triton_bmm_47601 0.0188 ms 61.8% + triton_bmm_47597 0.0201 ms 58.0% + triton_bmm_47599 0.0208 ms 56.0% + triton_bmm_47596 0.0242 ms 48.1% +SingleProcess AUTOTUNE takes 4.1716 seconds +AUTOTUNE bmm(16x1x96, 16x96x500) + triton_bmm_47671 0.0087 ms 100.0% + triton_bmm_47676 0.0087 ms 100.0% + triton_bmm_47672 0.0088 ms 99.3% + triton_bmm_47674 0.0089 ms 98.2% + triton_bmm_47669 0.0089 ms 97.8% + triton_bmm_47670 0.0090 ms 97.5% + triton_bmm_47668 0.0091 ms 96.5% + triton_bmm_47673 0.0093 ms 93.5% + triton_bmm_47677 0.0095 ms 91.9% + triton_bmm_47675 0.0098 ms 89.5% +SingleProcess AUTOTUNE takes 3.9632 seconds +AUTOTUNE bmm(16x1x500, 16x500x96) + triton_bmm_47697 0.0097 ms 100.0% + triton_bmm_47698 0.0099 ms 98.1% + triton_bmm_47701 0.0106 ms 92.1% + triton_bmm_47700 0.0106 ms 91.6% + triton_bmm_47696 0.0111 ms 88.0% + triton_bmm_47695 0.0115 ms 84.9% + triton_bmm_47694 0.0123 ms 79.0% + triton_bmm_47693 0.0134 ms 72.4% + bmm 0.0156 ms 62.2% + triton_bmm_47692 0.0175 ms 55.5% +SingleProcess AUTOTUNE takes 4.2161 seconds +AUTOTUNE bmm(16x1x96, 16x96x501) + triton_bmm_47767 0.0084 ms 100.0% + triton_bmm_47765 0.0085 ms 99.6% + triton_bmm_47768 0.0085 ms 98.9% + triton_bmm_47770 0.0085 ms 98.9% + triton_bmm_47766 0.0086 ms 97.8% + triton_bmm_47769 0.0089 ms 95.3% + triton_bmm_47772 0.0089 ms 95.0% + triton_bmm_47764 0.0091 ms 93.0% + triton_bmm_47775 0.0097 ms 86.8% + triton_bmm_47771 0.0098 ms 86.6% +SingleProcess AUTOTUNE takes 4.0713 seconds +AUTOTUNE bmm(16x1x501, 16x501x96) + bmm 0.0120 ms 100.0% + triton_bmm_47797 0.0123 ms 97.7% + triton_bmm_47796 0.0149 ms 80.7% + triton_bmm_47794 0.0184 ms 65.5% + triton_bmm_47792 0.0186 ms 64.7% + triton_bmm_47790 0.0188 ms 63.9% + triton_bmm_47793 0.0189 ms 63.6% + triton_bmm_47789 0.0199 ms 60.5% + triton_bmm_47791 0.0208 ms 57.8% + triton_bmm_47788 0.0243 ms 49.6% +SingleProcess AUTOTUNE takes 4.0804 seconds +AUTOTUNE bmm(16x1x96, 16x96x502) + triton_bmm_47864 0.0085 ms 100.0% + triton_bmm_47863 0.0088 ms 97.1% + triton_bmm_47868 0.0088 ms 97.1% + triton_bmm_47865 0.0089 ms 96.0% + triton_bmm_47861 0.0089 ms 95.3% + triton_bmm_47862 0.0090 ms 94.7% + triton_bmm_47866 0.0090 ms 94.7% + triton_bmm_47860 0.0091 ms 94.0% + triton_bmm_47867 0.0093 ms 91.7% + triton_bmm_47871 0.0100 ms 85.3% +SingleProcess AUTOTUNE takes 3.8830 seconds +AUTOTUNE bmm(16x1x502, 16x502x96) + triton_bmm_47889 0.0097 ms 100.0% + triton_bmm_47892 0.0102 ms 95.3% + triton_bmm_47890 0.0104 ms 93.5% + triton_bmm_47893 0.0108 ms 89.9% + triton_bmm_47888 0.0112 ms 86.9% + triton_bmm_47887 0.0115 ms 84.9% + triton_bmm_47886 0.0129 ms 75.6% + triton_bmm_47885 0.0131 ms 74.3% + bmm 0.0156 ms 62.3% + triton_bmm_47884 0.0177 ms 54.9% +SingleProcess AUTOTUNE takes 4.0694 seconds +AUTOTUNE bmm(16x1x96, 16x96x503) + triton_bmm_47959 0.0084 ms 100.0% + triton_bmm_47960 0.0085 ms 98.5% + triton_bmm_47962 0.0085 ms 98.5% + triton_bmm_47957 0.0086 ms 97.4% + triton_bmm_47958 0.0087 ms 97.0% + triton_bmm_47964 0.0089 ms 94.3% + triton_bmm_47963 0.0093 ms 90.7% + triton_bmm_47961 0.0093 ms 90.1% + triton_bmm_47956 0.0096 ms 88.0% + triton_bmm_47966 0.0099 ms 84.8% +SingleProcess AUTOTUNE takes 3.8889 seconds +AUTOTUNE bmm(16x1x503, 16x503x96) + bmm 0.0121 ms 100.0% + triton_bmm_47989 0.0128 ms 94.7% + triton_bmm_47988 0.0145 ms 83.6% + triton_bmm_47986 0.0184 ms 65.9% + triton_bmm_47982 0.0188 ms 64.4% + triton_bmm_47984 0.0190 ms 63.6% + triton_bmm_47985 0.0190 ms 63.6% + triton_bmm_47981 0.0201 ms 60.2% + triton_bmm_47983 0.0205 ms 59.0% + triton_bmm_47980 0.0245 ms 49.3% +SingleProcess AUTOTUNE takes 3.7394 seconds +AUTOTUNE bmm(16x1x96, 16x96x504) + triton_bmm_48055 0.0087 ms 100.0% + triton_bmm_48056 0.0088 ms 99.3% + triton_bmm_48058 0.0088 ms 98.9% + triton_bmm_48057 0.0089 ms 98.6% + triton_bmm_48053 0.0089 ms 97.8% + triton_bmm_48054 0.0090 ms 97.5% + triton_bmm_48052 0.0091 ms 96.5% + triton_bmm_48060 0.0092 ms 94.8% + triton_bmm_48059 0.0093 ms 94.1% + triton_bmm_48061 0.0093 ms 93.8% +SingleProcess AUTOTUNE takes 3.7575 seconds +AUTOTUNE bmm(16x1x504, 16x504x96) + triton_bmm_48081 0.0102 ms 100.0% + triton_bmm_48082 0.0102 ms 99.4% + triton_bmm_48080 0.0108 ms 94.4% + triton_bmm_48084 0.0109 ms 93.7% + bmm 0.0109 ms 93.4% + triton_bmm_48085 0.0115 ms 88.3% + triton_bmm_48079 0.0119 ms 85.5% + triton_bmm_48078 0.0128 ms 79.7% + triton_bmm_48077 0.0134 ms 75.7% + triton_bmm_48076 0.0177 ms 57.5% +SingleProcess AUTOTUNE takes 4.2334 seconds +AUTOTUNE bmm(16x1x96, 16x96x505) + triton_bmm_48151 0.0084 ms 100.0% + triton_bmm_48152 0.0085 ms 99.2% + triton_bmm_48149 0.0085 ms 98.9% + triton_bmm_48150 0.0087 ms 97.4% + triton_bmm_48154 0.0090 ms 94.3% + triton_bmm_48156 0.0091 ms 93.3% + triton_bmm_48148 0.0091 ms 93.0% + triton_bmm_48153 0.0093 ms 90.4% + triton_bmm_48157 0.0097 ms 86.8% + triton_bmm_48155 0.0098 ms 86.3% +SingleProcess AUTOTUNE takes 4.2207 seconds +AUTOTUNE bmm(16x1x505, 16x505x96) + triton_bmm_48181 0.0123 ms 100.0% + bmm 0.0124 ms 99.7% + triton_bmm_48180 0.0145 ms 85.2% + triton_bmm_48177 0.0186 ms 66.4% + triton_bmm_48176 0.0186 ms 66.3% + triton_bmm_48174 0.0188 ms 65.5% + triton_bmm_48178 0.0188 ms 65.5% + triton_bmm_48173 0.0197 ms 62.6% + triton_bmm_48175 0.0208 ms 59.2% + triton_bmm_48172 0.0244 ms 50.5% +SingleProcess AUTOTUNE takes 4.4129 seconds +AUTOTUNE bmm(16x1x96, 16x96x506) + triton_bmm_48247 0.0088 ms 100.0% + triton_bmm_48248 0.0088 ms 99.6% + triton_bmm_48250 0.0089 ms 98.9% + triton_bmm_48245 0.0090 ms 97.9% + triton_bmm_48246 0.0090 ms 97.9% + triton_bmm_48249 0.0093 ms 93.8% + triton_bmm_48252 0.0093 ms 93.8% + triton_bmm_48244 0.0095 ms 92.1% + triton_bmm_48251 0.0098 ms 89.5% + triton_bmm_48254 0.0099 ms 88.4% +SingleProcess AUTOTUNE takes 3.7559 seconds +AUTOTUNE bmm(16x1x506, 16x506x96) + triton_bmm_48273 0.0097 ms 100.0% + triton_bmm_48274 0.0104 ms 93.5% + triton_bmm_48276 0.0106 ms 91.6% + triton_bmm_48272 0.0113 ms 86.4% + triton_bmm_48277 0.0113 ms 86.4% + triton_bmm_48271 0.0119 ms 81.7% + triton_bmm_48270 0.0125 ms 77.7% + triton_bmm_48269 0.0130 ms 74.9% + bmm 0.0159 ms 61.3% + triton_bmm_48268 0.0177 ms 54.9% +SingleProcess AUTOTUNE takes 4.1990 seconds +AUTOTUNE bmm(16x1x96, 16x96x507) + triton_bmm_48346 0.0085 ms 100.0% + triton_bmm_48343 0.0088 ms 97.4% + triton_bmm_48342 0.0090 ms 95.4% + triton_bmm_48344 0.0090 ms 95.4% + triton_bmm_48341 0.0090 ms 95.0% + triton_bmm_48347 0.0093 ms 91.8% + triton_bmm_48348 0.0093 ms 91.8% + triton_bmm_48345 0.0093 ms 91.4% + triton_bmm_48340 0.0095 ms 89.6% + triton_bmm_48349 0.0097 ms 87.8% +SingleProcess AUTOTUNE takes 3.9849 seconds +AUTOTUNE bmm(16x1x507, 16x507x96) + triton_bmm_48373 0.0124 ms 100.0% + bmm 0.0128 ms 97.0% + triton_bmm_48372 0.0145 ms 85.4% + triton_bmm_48370 0.0186 ms 66.7% + triton_bmm_48369 0.0188 ms 65.7% + triton_bmm_48368 0.0190 ms 65.2% + triton_bmm_48366 0.0192 ms 64.4% + triton_bmm_48365 0.0201 ms 61.5% + triton_bmm_48367 0.0209 ms 59.4% + triton_bmm_48364 0.0242 ms 51.1% +SingleProcess AUTOTUNE takes 3.8208 seconds +AUTOTUNE bmm(16x1x96, 16x96x508) + triton_bmm_48440 0.0085 ms 100.0% + triton_bmm_48442 0.0085 ms 99.6% + triton_bmm_48439 0.0088 ms 96.7% + triton_bmm_48444 0.0089 ms 95.7% + triton_bmm_48441 0.0089 ms 95.3% + triton_bmm_48437 0.0089 ms 95.0% + triton_bmm_48438 0.0090 ms 94.6% + triton_bmm_48436 0.0091 ms 93.6% + triton_bmm_48445 0.0095 ms 89.2% + triton_bmm_48447 0.0097 ms 87.5% +SingleProcess AUTOTUNE takes 4.3291 seconds +AUTOTUNE bmm(16x1x508, 16x508x96) + triton_bmm_48465 0.0102 ms 100.0% + triton_bmm_48466 0.0102 ms 99.8% + triton_bmm_48469 0.0106 ms 96.7% + triton_bmm_48468 0.0106 ms 96.1% + triton_bmm_48464 0.0108 ms 94.7% + triton_bmm_48463 0.0115 ms 89.1% + triton_bmm_48462 0.0123 ms 82.9% + triton_bmm_48461 0.0135 ms 75.8% + bmm 0.0158 ms 64.4% + triton_bmm_48460 0.0177 ms 57.7% +SingleProcess AUTOTUNE takes 4.5624 seconds +AUTOTUNE bmm(16x1x96, 16x96x509) + triton_bmm_48538 0.0085 ms 100.0% + triton_bmm_48535 0.0088 ms 97.1% + triton_bmm_48536 0.0089 ms 95.3% + triton_bmm_48533 0.0090 ms 94.7% + triton_bmm_48537 0.0091 ms 94.0% + triton_bmm_48532 0.0091 ms 93.7% + triton_bmm_48534 0.0091 ms 93.3% + triton_bmm_48540 0.0096 ms 89.0% + triton_bmm_48541 0.0097 ms 87.5% + triton_bmm_48543 0.0097 ms 87.5% +SingleProcess AUTOTUNE takes 4.1281 seconds +AUTOTUNE bmm(16x1x509, 16x509x96) + triton_bmm_48565 0.0128 ms 100.0% + bmm 0.0129 ms 99.3% + triton_bmm_48564 0.0146 ms 87.9% + triton_bmm_48561 0.0186 ms 68.8% + triton_bmm_48558 0.0188 ms 68.0% + triton_bmm_48562 0.0188 ms 67.9% + triton_bmm_48560 0.0190 ms 67.2% + triton_bmm_48557 0.0199 ms 64.4% + triton_bmm_48559 0.0205 ms 62.3% + triton_bmm_48556 0.0242 ms 52.8% +SingleProcess AUTOTUNE takes 4.8158 seconds +AUTOTUNE bmm(16x1x96, 16x96x510) + triton_bmm_48630 0.0086 ms 100.0% + triton_bmm_48631 0.0088 ms 98.5% + triton_bmm_48636 0.0089 ms 97.5% + triton_bmm_48633 0.0089 ms 97.1% + triton_bmm_48632 0.0089 ms 96.8% + triton_bmm_48634 0.0089 ms 96.8% + triton_bmm_48629 0.0090 ms 96.4% + triton_bmm_48628 0.0095 ms 90.6% + triton_bmm_48639 0.0095 ms 90.6% + triton_bmm_48635 0.0098 ms 88.5% +SingleProcess AUTOTUNE takes 4.0208 seconds +AUTOTUNE bmm(16x1x510, 16x510x96) + triton_bmm_48658 0.0100 ms 100.0% + triton_bmm_48660 0.0102 ms 97.8% + triton_bmm_48657 0.0102 ms 97.5% + triton_bmm_48656 0.0108 ms 92.0% + triton_bmm_48661 0.0113 ms 88.1% + triton_bmm_48655 0.0115 ms 86.9% + triton_bmm_48654 0.0125 ms 79.5% + triton_bmm_48653 0.0130 ms 76.6% + bmm 0.0159 ms 62.6% + triton_bmm_48652 0.0182 ms 54.8% +SingleProcess AUTOTUNE takes 3.9300 seconds +AUTOTUNE bmm(16x1x96, 16x96x511) + triton_bmm_48727 0.0084 ms 100.0% + triton_bmm_48725 0.0085 ms 98.1% + triton_bmm_48728 0.0085 ms 98.1% + triton_bmm_48730 0.0085 ms 97.8% + triton_bmm_48726 0.0087 ms 96.3% + triton_bmm_48732 0.0091 ms 91.9% + triton_bmm_48729 0.0093 ms 89.4% + triton_bmm_48724 0.0096 ms 87.4% + triton_bmm_48733 0.0097 ms 85.9% + triton_bmm_48731 0.0098 ms 85.6% +SingleProcess AUTOTUNE takes 3.8744 seconds +AUTOTUNE bmm(16x1x511, 16x511x96) + triton_bmm_48757 0.0132 ms 100.0% + bmm 0.0136 ms 97.2% + triton_bmm_48756 0.0149 ms 88.6% + triton_bmm_48753 0.0175 ms 75.4% + triton_bmm_48750 0.0178 ms 74.3% + triton_bmm_48754 0.0178 ms 74.3% + triton_bmm_48752 0.0180 ms 73.5% + triton_bmm_48751 0.0197 ms 67.2% + triton_bmm_48749 0.0203 ms 65.0% + triton_bmm_48748 0.0255 ms 51.8% +SingleProcess AUTOTUNE takes 3.8664 seconds +AUTOTUNE bmm(16x1x96, 16x96x512) + triton_bmm_48821 0.0084 ms 100.0% + triton_bmm_48822 0.0086 ms 97.4% + triton_bmm_48828 0.0087 ms 96.7% + triton_bmm_48823 0.0087 ms 96.3% + triton_bmm_48824 0.0088 ms 95.6% + triton_bmm_48826 0.0089 ms 94.9% + triton_bmm_48820 0.0091 ms 92.9% + triton_bmm_48829 0.0091 ms 92.6% + triton_bmm_48825 0.0093 ms 90.7% + triton_bmm_48831 0.0095 ms 88.3% +SingleProcess AUTOTUNE takes 4.4005 seconds +AUTOTUNE bmm(16x1x512, 16x512x96) + triton_bmm_48849 0.0101 ms 100.0% + triton_bmm_48850 0.0104 ms 97.5% + triton_bmm_48853 0.0106 ms 95.8% + triton_bmm_48852 0.0106 ms 95.5% + bmm 0.0110 ms 92.2% + triton_bmm_48848 0.0112 ms 90.3% + triton_bmm_48847 0.0115 ms 88.5% + triton_bmm_48846 0.0123 ms 82.3% + triton_bmm_48845 0.0130 ms 78.3% + triton_bmm_48844 0.0181 ms 55.9% +SingleProcess AUTOTUNE takes 4.2135 seconds +AUTOTUNE bmm(16x1x96, 16x96x513) + triton_bmm_48924 0.0087 ms 100.0% + triton_bmm_48919 0.0088 ms 98.6% + triton_bmm_48920 0.0090 ms 97.1% + triton_bmm_48917 0.0090 ms 96.8% + triton_bmm_48922 0.0090 ms 96.8% + triton_bmm_48918 0.0091 ms 95.3% + triton_bmm_48923 0.0093 ms 93.5% + triton_bmm_48921 0.0093 ms 93.2% + triton_bmm_48916 0.0096 ms 91.0% + triton_bmm_48925 0.0097 ms 89.5% +SingleProcess AUTOTUNE takes 3.8709 seconds +AUTOTUNE bmm(16x1x513, 16x513x96) + bmm 0.0116 ms 100.0% + triton_bmm_48948 0.0149 ms 77.7% + triton_bmm_48949 0.0149 ms 77.7% + triton_bmm_48946 0.0191 ms 60.7% + triton_bmm_48945 0.0192 ms 60.2% + triton_bmm_48944 0.0193 ms 60.1% + triton_bmm_48942 0.0196 ms 59.0% + triton_bmm_48941 0.0201 ms 57.6% + triton_bmm_48943 0.0214 ms 54.2% + triton_bmm_48940 0.0252 ms 46.1% +SingleProcess AUTOTUNE takes 3.8464 seconds +AUTOTUNE bmm(16x1x96, 16x96x514) + triton_bmm_49013 0.0085 ms 100.0% + triton_bmm_49016 0.0085 ms 100.0% + triton_bmm_49014 0.0087 ms 98.2% + triton_bmm_49015 0.0088 ms 96.7% + triton_bmm_49017 0.0089 ms 96.0% + triton_bmm_49018 0.0089 ms 96.0% + triton_bmm_49012 0.0091 ms 93.7% + triton_bmm_49020 0.0092 ms 93.0% + triton_bmm_49019 0.0097 ms 87.5% + triton_bmm_49022 0.0099 ms 85.8% +SingleProcess AUTOTUNE takes 3.7791 seconds +AUTOTUNE bmm(16x1x514, 16x514x96) + triton_bmm_49041 0.0099 ms 100.0% + triton_bmm_49042 0.0101 ms 97.8% + triton_bmm_49044 0.0106 ms 93.7% + triton_bmm_49040 0.0114 ms 87.3% + triton_bmm_49039 0.0117 ms 84.9% + triton_bmm_49045 0.0119 ms 83.1% + triton_bmm_49038 0.0128 ms 77.7% + triton_bmm_49037 0.0134 ms 74.2% + bmm 0.0173 ms 57.4% + triton_bmm_49036 0.0184 ms 54.0% +SingleProcess AUTOTUNE takes 3.9706 seconds +AUTOTUNE bmm(16x1x96, 16x96x515) + triton_bmm_49112 0.0085 ms 100.0% + triton_bmm_49109 0.0085 ms 99.6% + triton_bmm_49114 0.0085 ms 99.6% + triton_bmm_49110 0.0086 ms 98.5% + triton_bmm_49111 0.0088 ms 96.6% + triton_bmm_49116 0.0089 ms 96.0% + triton_bmm_49113 0.0093 ms 91.1% + triton_bmm_49108 0.0095 ms 89.3% + triton_bmm_49118 0.0098 ms 87.2% + triton_bmm_49115 0.0098 ms 86.9% +SingleProcess AUTOTUNE takes 4.3300 seconds +AUTOTUNE bmm(16x1x515, 16x515x96) + bmm 0.0119 ms 100.0% + triton_bmm_49141 0.0145 ms 82.3% + triton_bmm_49140 0.0151 ms 79.0% + triton_bmm_49137 0.0193 ms 62.0% + triton_bmm_49138 0.0193 ms 62.0% + triton_bmm_49134 0.0194 ms 61.4% + triton_bmm_49136 0.0195 ms 61.2% + triton_bmm_49133 0.0203 ms 58.8% + triton_bmm_49135 0.0210 ms 56.9% + triton_bmm_49132 0.0253 ms 47.1% +SingleProcess AUTOTUNE takes 3.8777 seconds +AUTOTUNE bmm(16x1x96, 16x96x516) + triton_bmm_49205 0.0085 ms 100.0% + triton_bmm_49208 0.0085 ms 100.0% + triton_bmm_49212 0.0087 ms 97.8% + triton_bmm_49207 0.0088 ms 97.1% + triton_bmm_49210 0.0089 ms 95.3% + triton_bmm_49204 0.0091 ms 94.0% + triton_bmm_49206 0.0091 ms 94.0% + triton_bmm_49209 0.0093 ms 91.1% + triton_bmm_49211 0.0096 ms 88.4% + triton_bmm_49213 0.0100 ms 85.3% +SingleProcess AUTOTUNE takes 4.1953 seconds +AUTOTUNE bmm(16x1x516, 16x516x96) + triton_bmm_49233 0.0099 ms 100.0% + triton_bmm_49234 0.0100 ms 99.7% + triton_bmm_49232 0.0108 ms 91.7% + triton_bmm_49236 0.0110 ms 89.9% + triton_bmm_49237 0.0111 ms 89.6% + bmm 0.0120 ms 82.4% + triton_bmm_49231 0.0121 ms 81.8% + triton_bmm_49230 0.0127 ms 77.9% + triton_bmm_49229 0.0134 ms 74.0% + triton_bmm_49228 0.0182 ms 54.6% +SingleProcess AUTOTUNE takes 4.3457 seconds +AUTOTUNE bmm(16x1x96, 16x96x517) + triton_bmm_49304 0.0085 ms 100.0% + triton_bmm_49306 0.0085 ms 99.6% + triton_bmm_49303 0.0089 ms 96.0% + triton_bmm_49305 0.0091 ms 94.0% + triton_bmm_49302 0.0091 ms 93.3% + triton_bmm_49301 0.0092 ms 93.0% + triton_bmm_49308 0.0093 ms 91.1% + triton_bmm_49300 0.0096 ms 89.0% + triton_bmm_49309 0.0097 ms 87.5% + triton_bmm_49307 0.0098 ms 86.9% +SingleProcess AUTOTUNE takes 3.8904 seconds +AUTOTUNE bmm(16x1x517, 16x517x96) + bmm 0.0119 ms 100.0% + triton_bmm_49333 0.0145 ms 82.1% + triton_bmm_49332 0.0154 ms 77.3% + triton_bmm_49329 0.0188 ms 63.3% + triton_bmm_49330 0.0188 ms 63.3% + triton_bmm_49326 0.0193 ms 61.8% + triton_bmm_49328 0.0194 ms 61.3% + triton_bmm_49325 0.0203 ms 58.6% + triton_bmm_49327 0.0214 ms 55.5% + triton_bmm_49324 0.0253 ms 47.0% +SingleProcess AUTOTUNE takes 3.8149 seconds +AUTOTUNE bmm(16x1x96, 16x96x518) + triton_bmm_49399 0.0084 ms 100.0% + triton_bmm_49397 0.0085 ms 98.1% + triton_bmm_49400 0.0085 ms 98.1% + triton_bmm_49398 0.0087 ms 96.3% + triton_bmm_49404 0.0089 ms 94.2% + triton_bmm_49402 0.0090 ms 93.2% + triton_bmm_49403 0.0093 ms 90.0% + triton_bmm_49401 0.0093 ms 89.4% + triton_bmm_49396 0.0095 ms 87.6% + triton_bmm_49407 0.0100 ms 83.7% +SingleProcess AUTOTUNE takes 3.8680 seconds +AUTOTUNE bmm(16x1x518, 16x518x96) + triton_bmm_49426 0.0101 ms 100.0% + triton_bmm_49425 0.0104 ms 98.0% + triton_bmm_49428 0.0106 ms 95.8% + triton_bmm_49424 0.0110 ms 92.2% + triton_bmm_49429 0.0116 ms 87.1% + triton_bmm_49423 0.0117 ms 86.8% + bmm 0.0120 ms 84.5% + triton_bmm_49422 0.0132 ms 76.9% + triton_bmm_49421 0.0139 ms 73.2% + triton_bmm_49420 0.0188 ms 53.9% +SingleProcess AUTOTUNE takes 4.0022 seconds +AUTOTUNE bmm(16x1x96, 16x96x519) + triton_bmm_49495 0.0085 ms 100.0% + triton_bmm_49496 0.0085 ms 100.0% + triton_bmm_49494 0.0087 ms 98.2% + triton_bmm_49498 0.0090 ms 94.7% + triton_bmm_49497 0.0091 ms 94.0% + triton_bmm_49493 0.0091 ms 93.5% + triton_bmm_49500 0.0093 ms 91.1% + triton_bmm_49492 0.0095 ms 89.3% + triton_bmm_49501 0.0097 ms 87.5% + triton_bmm_49503 0.0097 ms 87.5% +SingleProcess AUTOTUNE takes 3.8647 seconds +AUTOTUNE bmm(16x1x519, 16x519x96) + bmm 0.0121 ms 100.0% + triton_bmm_49525 0.0149 ms 80.9% + triton_bmm_49524 0.0151 ms 80.1% + triton_bmm_49522 0.0190 ms 63.6% + triton_bmm_49521 0.0193 ms 62.8% + triton_bmm_49518 0.0194 ms 62.3% + triton_bmm_49520 0.0195 ms 62.1% + triton_bmm_49517 0.0203 ms 59.5% + triton_bmm_49519 0.0216 ms 56.1% + triton_bmm_49516 0.0255 ms 47.4% +SingleProcess AUTOTUNE takes 3.8171 seconds +AUTOTUNE bmm(16x1x96, 16x96x520) + triton_bmm_49589 0.0085 ms 100.0% + triton_bmm_49591 0.0088 ms 96.0% + triton_bmm_49592 0.0088 ms 95.8% + triton_bmm_49594 0.0090 ms 94.3% + triton_bmm_49590 0.0091 ms 93.0% + triton_bmm_49596 0.0092 ms 91.7% + triton_bmm_49595 0.0093 ms 91.4% + triton_bmm_49597 0.0093 ms 91.4% + triton_bmm_49593 0.0093 ms 90.8% + triton_bmm_49588 0.0095 ms 88.9% +SingleProcess AUTOTUNE takes 4.0839 seconds +AUTOTUNE bmm(16x1x520, 16x520x96) + triton_bmm_49618 0.0100 ms 100.0% + triton_bmm_49617 0.0106 ms 94.0% + bmm 0.0108 ms 91.7% + triton_bmm_49616 0.0110 ms 90.7% + triton_bmm_49620 0.0111 ms 89.6% + triton_bmm_49621 0.0119 ms 83.8% + triton_bmm_49615 0.0121 ms 82.1% + triton_bmm_49614 0.0130 ms 76.4% + triton_bmm_49613 0.0134 ms 74.2% + triton_bmm_49612 0.0186 ms 53.5% +SingleProcess AUTOTUNE takes 4.4156 seconds +AUTOTUNE bmm(16x1x96, 16x96x521) + triton_bmm_49685 0.0085 ms 100.0% + triton_bmm_49688 0.0085 ms 100.0% + triton_bmm_49687 0.0089 ms 96.4% + triton_bmm_49692 0.0089 ms 96.4% + triton_bmm_49690 0.0090 ms 94.7% + triton_bmm_49689 0.0091 ms 94.3% + triton_bmm_49684 0.0091 ms 94.0% + triton_bmm_49686 0.0091 ms 93.5% + triton_bmm_49691 0.0093 ms 91.8% + triton_bmm_49693 0.0097 ms 87.8% +SingleProcess AUTOTUNE takes 3.7282 seconds +AUTOTUNE bmm(16x1x521, 16x521x96) + bmm 0.0124 ms 100.0% + triton_bmm_49717 0.0149 ms 83.2% + triton_bmm_49716 0.0151 ms 82.1% + triton_bmm_49713 0.0190 ms 65.5% + triton_bmm_49712 0.0192 ms 64.6% + triton_bmm_49714 0.0193 ms 64.4% + triton_bmm_49710 0.0199 ms 62.6% + triton_bmm_49709 0.0208 ms 59.8% + triton_bmm_49711 0.0214 ms 58.2% + triton_bmm_49708 0.0251 ms 49.6% +SingleProcess AUTOTUNE takes 3.7316 seconds +AUTOTUNE bmm(16x1x96, 16x96x522) + triton_bmm_49784 0.0085 ms 100.0% + triton_bmm_49786 0.0085 ms 100.0% + triton_bmm_49783 0.0088 ms 96.4% + triton_bmm_49781 0.0090 ms 94.7% + triton_bmm_49785 0.0091 ms 94.0% + triton_bmm_49782 0.0092 ms 93.0% + triton_bmm_49787 0.0093 ms 91.7% + triton_bmm_49788 0.0093 ms 91.1% + triton_bmm_49780 0.0096 ms 89.0% + triton_bmm_49791 0.0100 ms 85.0% +SingleProcess AUTOTUNE takes 4.2802 seconds +AUTOTUNE bmm(16x1x522, 16x522x96) + triton_bmm_49809 0.0099 ms 100.0% + triton_bmm_49810 0.0101 ms 97.8% + triton_bmm_49812 0.0106 ms 93.4% + triton_bmm_49808 0.0115 ms 86.4% + triton_bmm_49807 0.0121 ms 81.8% + triton_bmm_49813 0.0121 ms 81.8% + bmm 0.0123 ms 80.9% + triton_bmm_49806 0.0129 ms 76.7% + triton_bmm_49805 0.0134 ms 74.0% + triton_bmm_49804 0.0184 ms 54.0% +SingleProcess AUTOTUNE takes 4.0564 seconds +AUTOTUNE bmm(16x1x96, 16x96x523) + triton_bmm_49879 0.0085 ms 100.0% + triton_bmm_49880 0.0085 ms 100.0% + triton_bmm_49882 0.0085 ms 99.6% + triton_bmm_49877 0.0087 ms 98.2% + triton_bmm_49878 0.0087 ms 98.2% + triton_bmm_49884 0.0089 ms 95.7% + triton_bmm_49876 0.0091 ms 93.7% + triton_bmm_49881 0.0091 ms 93.7% + triton_bmm_49883 0.0093 ms 91.4% + triton_bmm_49885 0.0098 ms 87.2% +SingleProcess AUTOTUNE takes 4.3962 seconds +AUTOTUNE bmm(16x1x523, 16x523x96) + bmm 0.0127 ms 100.0% + triton_bmm_49909 0.0150 ms 84.9% + triton_bmm_49908 0.0156 ms 81.7% + triton_bmm_49906 0.0192 ms 66.4% + triton_bmm_49905 0.0193 ms 66.0% + triton_bmm_49902 0.0195 ms 65.5% + triton_bmm_49904 0.0197 ms 64.7% + triton_bmm_49901 0.0205 ms 62.0% + triton_bmm_49903 0.0217 ms 58.8% + triton_bmm_49900 0.0258 ms 49.3% +SingleProcess AUTOTUNE takes 4.0164 seconds +AUTOTUNE bmm(16x1x96, 16x96x524) + triton_bmm_49973 0.0085 ms 100.0% + triton_bmm_49976 0.0085 ms 100.0% + triton_bmm_49978 0.0085 ms 100.0% + triton_bmm_49974 0.0086 ms 98.5% + triton_bmm_49975 0.0088 ms 96.7% + triton_bmm_49977 0.0091 ms 94.0% + triton_bmm_49979 0.0093 ms 91.7% + triton_bmm_49980 0.0093 ms 91.1% + triton_bmm_49972 0.0095 ms 89.3% + triton_bmm_49982 0.0097 ms 87.5% +SingleProcess AUTOTUNE takes 3.7335 seconds +AUTOTUNE bmm(16x1x524, 16x524x96) + triton_bmm_50001 0.0099 ms 100.0% + triton_bmm_50002 0.0104 ms 95.1% + triton_bmm_50004 0.0106 ms 93.7% + triton_bmm_50000 0.0110 ms 90.1% + triton_bmm_50005 0.0112 ms 88.3% + triton_bmm_49999 0.0116 ms 85.2% + bmm 0.0123 ms 80.7% + triton_bmm_49998 0.0131 ms 76.0% + triton_bmm_49997 0.0139 ms 71.6% + triton_bmm_49996 0.0183 ms 54.1% +SingleProcess AUTOTUNE takes 3.8920 seconds +AUTOTUNE bmm(16x1x96, 16x96x525) + triton_bmm_50071 0.0085 ms 100.0% + triton_bmm_50072 0.0085 ms 100.0% + triton_bmm_50069 0.0087 ms 98.2% + triton_bmm_50076 0.0089 ms 95.7% + triton_bmm_50074 0.0090 ms 94.7% + triton_bmm_50068 0.0091 ms 93.7% + triton_bmm_50073 0.0091 ms 93.7% + triton_bmm_50070 0.0092 ms 93.0% + triton_bmm_50075 0.0093 ms 91.4% + triton_bmm_50079 0.0101 ms 84.0% +SingleProcess AUTOTUNE takes 3.9928 seconds +AUTOTUNE bmm(16x1x525, 16x525x96) + bmm 0.0128 ms 100.0% + triton_bmm_50101 0.0147 ms 87.4% + triton_bmm_50100 0.0151 ms 84.8% + triton_bmm_50097 0.0190 ms 67.5% + triton_bmm_50098 0.0190 ms 67.4% + triton_bmm_50096 0.0192 ms 66.7% + triton_bmm_50094 0.0198 ms 64.8% + triton_bmm_50093 0.0208 ms 61.6% + triton_bmm_50095 0.0213 ms 60.1% + triton_bmm_50092 0.0254 ms 50.6% +SingleProcess AUTOTUNE takes 3.9441 seconds +AUTOTUNE bmm(16x1x96, 16x96x526) + triton_bmm_50168 0.0085 ms 100.0% + triton_bmm_50170 0.0085 ms 100.0% + triton_bmm_50166 0.0087 ms 98.2% + triton_bmm_50167 0.0088 ms 96.4% + triton_bmm_50165 0.0090 ms 94.3% + triton_bmm_50164 0.0091 ms 93.7% + triton_bmm_50172 0.0093 ms 91.1% + triton_bmm_50169 0.0094 ms 90.8% + triton_bmm_50175 0.0095 ms 89.3% + triton_bmm_50171 0.0098 ms 86.9% +SingleProcess AUTOTUNE takes 3.9074 seconds +AUTOTUNE bmm(16x1x526, 16x526x96) + triton_bmm_50194 0.0101 ms 100.0% + triton_bmm_50193 0.0104 ms 97.5% + triton_bmm_50196 0.0106 ms 95.5% + triton_bmm_50192 0.0110 ms 92.2% + triton_bmm_50191 0.0118 ms 85.7% + bmm 0.0120 ms 84.3% + triton_bmm_50197 0.0121 ms 83.6% + triton_bmm_50190 0.0132 ms 76.8% + triton_bmm_50189 0.0134 ms 75.7% + triton_bmm_50188 0.0188 ms 53.8% +SingleProcess AUTOTUNE takes 4.0339 seconds +AUTOTUNE bmm(16x1x96, 16x96x527) + triton_bmm_50266 0.0087 ms 100.0% + triton_bmm_50261 0.0087 ms 99.6% + triton_bmm_50268 0.0089 ms 97.5% + triton_bmm_50263 0.0090 ms 96.8% + triton_bmm_50264 0.0090 ms 96.4% + triton_bmm_50265 0.0091 ms 95.4% + triton_bmm_50262 0.0092 ms 94.8% + triton_bmm_50267 0.0093 ms 93.1% + triton_bmm_50260 0.0096 ms 90.6% + triton_bmm_50269 0.0097 ms 89.1% +SingleProcess AUTOTUNE takes 3.7870 seconds +AUTOTUNE bmm(16x1x527, 16x527x96) + bmm 0.0132 ms 100.0% + triton_bmm_50293 0.0151 ms 87.3% + triton_bmm_50292 0.0156 ms 84.4% + triton_bmm_50290 0.0192 ms 68.7% + triton_bmm_50289 0.0195 ms 67.7% + triton_bmm_50288 0.0197 ms 66.9% + triton_bmm_50286 0.0199 ms 66.3% + triton_bmm_50285 0.0205 ms 64.2% + triton_bmm_50287 0.0218 ms 60.4% + triton_bmm_50284 0.0258 ms 51.2% +SingleProcess AUTOTUNE takes 3.7870 seconds +AUTOTUNE bmm(16x1x96, 16x96x528) + triton_bmm_50359 0.0084 ms 100.0% + triton_bmm_50357 0.0085 ms 99.2% + triton_bmm_50360 0.0085 ms 98.9% + triton_bmm_50362 0.0085 ms 98.9% + triton_bmm_50358 0.0086 ms 97.4% + triton_bmm_50364 0.0089 ms 94.9% + triton_bmm_50361 0.0091 ms 92.9% + triton_bmm_50363 0.0093 ms 90.7% + triton_bmm_50356 0.0095 ms 88.3% + triton_bmm_50367 0.0095 ms 88.3% +SingleProcess AUTOTUNE takes 3.9807 seconds +AUTOTUNE bmm(16x1x528, 16x528x96) + triton_bmm_50385 0.0104 ms 100.0% + triton_bmm_50386 0.0104 ms 99.4% + triton_bmm_50388 0.0110 ms 93.9% + bmm 0.0112 ms 92.3% + triton_bmm_50384 0.0113 ms 92.0% + triton_bmm_50389 0.0116 ms 89.0% + triton_bmm_50383 0.0119 ms 87.3% + triton_bmm_50382 0.0130 ms 79.8% + triton_bmm_50381 0.0139 ms 74.8% + triton_bmm_50380 0.0181 ms 57.1% +SingleProcess AUTOTUNE takes 3.8765 seconds +AUTOTUNE bmm(16x1x96, 16x96x529) + triton_bmm_50455 0.0085 ms 100.0% + triton_bmm_50458 0.0085 ms 99.6% + triton_bmm_50453 0.0087 ms 97.8% + triton_bmm_50456 0.0090 ms 94.7% + triton_bmm_50457 0.0091 ms 93.7% + triton_bmm_50454 0.0092 ms 93.0% + triton_bmm_50460 0.0094 ms 90.8% + triton_bmm_50452 0.0096 ms 89.0% + triton_bmm_50463 0.0097 ms 87.5% + triton_bmm_50461 0.0098 ms 87.2% +SingleProcess AUTOTUNE takes 3.8993 seconds +AUTOTUNE bmm(16x1x529, 16x529x96) + bmm 0.0118 ms 100.0% + triton_bmm_50485 0.0151 ms 78.3% + triton_bmm_50484 0.0156 ms 75.8% + triton_bmm_50482 0.0190 ms 62.0% + triton_bmm_50480 0.0192 ms 61.4% + triton_bmm_50481 0.0194 ms 60.8% + triton_bmm_50478 0.0195 ms 60.6% + triton_bmm_50477 0.0207 ms 56.9% + triton_bmm_50479 0.0216 ms 54.7% + triton_bmm_50476 0.0261 ms 45.2% +SingleProcess AUTOTUNE takes 4.0054 seconds +AUTOTUNE bmm(16x1x96, 16x96x530) + triton_bmm_50549 0.0085 ms 100.0% + triton_bmm_50551 0.0088 ms 96.4% + triton_bmm_50556 0.0089 ms 96.0% + triton_bmm_50552 0.0090 ms 94.7% + triton_bmm_50554 0.0090 ms 94.7% + triton_bmm_50550 0.0092 ms 93.0% + triton_bmm_50555 0.0093 ms 91.7% + triton_bmm_50553 0.0095 ms 89.6% + triton_bmm_50548 0.0096 ms 89.0% + triton_bmm_50559 0.0096 ms 89.0% +SingleProcess AUTOTUNE takes 3.8640 seconds +AUTOTUNE bmm(16x1x530, 16x530x96) + triton_bmm_50578 0.0101 ms 100.0% + triton_bmm_50577 0.0104 ms 97.5% + triton_bmm_50580 0.0106 ms 95.5% + triton_bmm_50576 0.0110 ms 92.2% + triton_bmm_50575 0.0119 ms 85.4% + triton_bmm_50581 0.0121 ms 83.6% + bmm 0.0124 ms 81.9% + triton_bmm_50574 0.0132 ms 76.8% + triton_bmm_50573 0.0134 ms 75.5% + triton_bmm_50572 0.0184 ms 55.2% +SingleProcess AUTOTUNE takes 3.9052 seconds +AUTOTUNE bmm(16x1x96, 16x96x531) + triton_bmm_50647 0.0085 ms 100.0% + triton_bmm_50652 0.0089 ms 95.7% + triton_bmm_50648 0.0090 ms 94.7% + triton_bmm_50650 0.0090 ms 94.3% + triton_bmm_50649 0.0091 ms 93.7% + triton_bmm_50645 0.0092 ms 93.0% + triton_bmm_50646 0.0092 ms 93.0% + triton_bmm_50644 0.0096 ms 89.0% + triton_bmm_50655 0.0097 ms 87.5% + triton_bmm_50651 0.0098 ms 86.6% +SingleProcess AUTOTUNE takes 3.9640 seconds +AUTOTUNE bmm(16x1x531, 16x531x96) + bmm 0.0123 ms 100.0% + triton_bmm_50677 0.0151 ms 81.4% + triton_bmm_50676 0.0152 ms 81.2% + triton_bmm_50673 0.0192 ms 64.1% + triton_bmm_50674 0.0195 ms 63.2% + triton_bmm_50672 0.0197 ms 62.6% + triton_bmm_50670 0.0199 ms 61.8% + triton_bmm_50669 0.0207 ms 59.4% + triton_bmm_50671 0.0219 ms 56.3% + triton_bmm_50668 0.0260 ms 47.5% +SingleProcess AUTOTUNE takes 4.1491 seconds +AUTOTUNE bmm(16x1x96, 16x96x532) + triton_bmm_50743 0.0085 ms 100.0% + triton_bmm_50746 0.0085 ms 99.3% + triton_bmm_50742 0.0087 ms 97.8% + triton_bmm_50744 0.0090 ms 94.5% + triton_bmm_50741 0.0091 ms 93.6% + triton_bmm_50745 0.0091 ms 93.6% + triton_bmm_50747 0.0093 ms 91.4% + triton_bmm_50748 0.0093 ms 90.8% + triton_bmm_50740 0.0096 ms 88.6% + triton_bmm_50749 0.0097 ms 87.7% +SingleProcess AUTOTUNE takes 3.8872 seconds +AUTOTUNE bmm(16x1x532, 16x532x96) + triton_bmm_50770 0.0101 ms 100.0% + triton_bmm_50769 0.0104 ms 97.5% + triton_bmm_50772 0.0106 ms 95.5% + triton_bmm_50768 0.0110 ms 92.2% + triton_bmm_50773 0.0116 ms 87.6% + triton_bmm_50767 0.0119 ms 85.4% + bmm 0.0121 ms 83.9% + triton_bmm_50766 0.0127 ms 79.6% + triton_bmm_50765 0.0136 ms 74.8% + triton_bmm_50764 0.0186 ms 54.5% +SingleProcess AUTOTUNE takes 4.7626 seconds +AUTOTUNE bmm(16x1x96, 16x96x533) + triton_bmm_50839 0.0085 ms 100.0% + triton_bmm_50840 0.0085 ms 99.6% + triton_bmm_50844 0.0089 ms 95.7% + triton_bmm_50842 0.0091 ms 94.0% + triton_bmm_50836 0.0091 ms 93.7% + triton_bmm_50837 0.0092 ms 93.0% + triton_bmm_50838 0.0092 ms 93.0% + triton_bmm_50841 0.0095 ms 89.3% + triton_bmm_50843 0.0098 ms 86.6% + triton_bmm_50846 0.0099 ms 85.8% +SingleProcess AUTOTUNE takes 3.8133 seconds +AUTOTUNE bmm(16x1x533, 16x533x96) + bmm 0.0123 ms 100.0% + triton_bmm_50869 0.0151 ms 81.4% + triton_bmm_50868 0.0153 ms 80.5% + triton_bmm_50864 0.0194 ms 63.4% + triton_bmm_50865 0.0195 ms 63.2% + triton_bmm_50866 0.0197 ms 62.5% + triton_bmm_50862 0.0200 ms 61.7% + triton_bmm_50861 0.0207 ms 59.4% + triton_bmm_50863 0.0219 ms 56.3% + triton_bmm_50860 0.0262 ms 47.1% +SingleProcess AUTOTUNE takes 3.9215 seconds +AUTOTUNE bmm(16x1x96, 16x96x534) + triton_bmm_50938 0.0085 ms 100.0% + triton_bmm_50933 0.0086 ms 98.9% + triton_bmm_50935 0.0088 ms 96.7% + triton_bmm_50940 0.0089 ms 96.4% + triton_bmm_50936 0.0090 ms 95.0% + triton_bmm_50934 0.0092 ms 93.4% + triton_bmm_50937 0.0094 ms 90.5% + triton_bmm_50943 0.0095 ms 89.6% + triton_bmm_50932 0.0096 ms 89.3% + triton_bmm_50939 0.0097 ms 87.8% +SingleProcess AUTOTUNE takes 3.8605 seconds +AUTOTUNE bmm(16x1x534, 16x534x96) + triton_bmm_50962 0.0101 ms 100.0% + triton_bmm_50961 0.0104 ms 97.5% + triton_bmm_50964 0.0108 ms 94.3% + triton_bmm_50960 0.0110 ms 91.9% + triton_bmm_50959 0.0121 ms 83.6% + triton_bmm_50965 0.0121 ms 83.6% + triton_bmm_50958 0.0129 ms 78.5% + triton_bmm_50957 0.0139 ms 73.2% + bmm 0.0161 ms 63.1% + triton_bmm_50956 0.0185 ms 54.8% +SingleProcess AUTOTUNE takes 4.1496 seconds +AUTOTUNE bmm(16x1x96, 16x96x535) + triton_bmm_51031 0.0085 ms 100.0% + triton_bmm_51032 0.0085 ms 99.6% + triton_bmm_51030 0.0087 ms 98.2% + triton_bmm_51036 0.0089 ms 95.7% + triton_bmm_51034 0.0091 ms 94.0% + triton_bmm_51029 0.0092 ms 93.0% + triton_bmm_51033 0.0095 ms 89.6% + triton_bmm_51028 0.0096 ms 89.0% + triton_bmm_51039 0.0097 ms 87.5% + triton_bmm_51037 0.0098 ms 87.2% +SingleProcess AUTOTUNE takes 3.9996 seconds +AUTOTUNE bmm(16x1x535, 16x535x96) + bmm 0.0128 ms 100.0% + triton_bmm_51061 0.0152 ms 84.2% + triton_bmm_51060 0.0157 ms 81.3% + triton_bmm_51057 0.0192 ms 66.6% + triton_bmm_51058 0.0192 ms 66.4% + triton_bmm_51056 0.0195 ms 65.6% + triton_bmm_51054 0.0196 ms 65.0% + triton_bmm_51053 0.0207 ms 61.6% + triton_bmm_51055 0.0216 ms 59.1% + triton_bmm_51052 0.0262 ms 48.7% +SingleProcess AUTOTUNE takes 3.8382 seconds +AUTOTUNE bmm(16x1x96, 16x96x536) + triton_bmm_51125 0.0084 ms 100.0% + triton_bmm_51127 0.0085 ms 99.6% + triton_bmm_51128 0.0085 ms 98.9% + triton_bmm_51130 0.0090 ms 94.0% + triton_bmm_51124 0.0091 ms 93.0% + triton_bmm_51126 0.0092 ms 92.3% + triton_bmm_51129 0.0093 ms 90.4% + triton_bmm_51132 0.0093 ms 90.4% + triton_bmm_51135 0.0095 ms 88.6% + triton_bmm_51131 0.0098 ms 86.6% +SingleProcess AUTOTUNE takes 3.8061 seconds +AUTOTUNE bmm(16x1x536, 16x536x96) + triton_bmm_51154 0.0106 ms 100.0% + triton_bmm_51153 0.0106 ms 99.8% + bmm 0.0108 ms 97.8% + triton_bmm_51156 0.0112 ms 94.2% + triton_bmm_51152 0.0115 ms 92.3% + triton_bmm_51157 0.0119 ms 89.1% + triton_bmm_51151 0.0121 ms 87.2% + triton_bmm_51150 0.0132 ms 80.2% + triton_bmm_51149 0.0136 ms 77.9% + triton_bmm_51148 0.0184 ms 57.6% +SingleProcess AUTOTUNE takes 3.9422 seconds +AUTOTUNE bmm(16x1x96, 16x96x537) + triton_bmm_51224 0.0085 ms 100.0% + triton_bmm_51226 0.0085 ms 100.0% + triton_bmm_51223 0.0089 ms 95.7% + triton_bmm_51220 0.0091 ms 94.0% + triton_bmm_51221 0.0091 ms 93.7% + triton_bmm_51222 0.0092 ms 93.4% + triton_bmm_51228 0.0094 ms 91.1% + triton_bmm_51225 0.0095 ms 90.2% + triton_bmm_51227 0.0098 ms 87.3% + triton_bmm_51229 0.0098 ms 87.3% +SingleProcess AUTOTUNE takes 4.4318 seconds +AUTOTUNE bmm(16x1x537, 16x537x96) + bmm 0.0128 ms 100.0% + triton_bmm_51253 0.0150 ms 85.7% + triton_bmm_51252 0.0153 ms 83.7% + triton_bmm_51249 0.0195 ms 66.0% + triton_bmm_51250 0.0196 ms 65.3% + triton_bmm_51246 0.0197 ms 65.2% + triton_bmm_51248 0.0199 ms 64.6% + triton_bmm_51245 0.0208 ms 61.8% + triton_bmm_51247 0.0220 ms 58.2% + triton_bmm_51244 0.0262 ms 49.0% +SingleProcess AUTOTUNE takes 3.8341 seconds +AUTOTUNE bmm(16x1x96, 16x96x538) + triton_bmm_51322 0.0085 ms 100.0% + triton_bmm_51319 0.0089 ms 96.4% + triton_bmm_51324 0.0089 ms 96.0% + triton_bmm_51320 0.0090 ms 95.0% + triton_bmm_51321 0.0091 ms 94.3% + triton_bmm_51316 0.0091 ms 94.0% + triton_bmm_51317 0.0091 ms 93.7% + triton_bmm_51318 0.0092 ms 93.4% + triton_bmm_51323 0.0098 ms 87.3% + triton_bmm_51327 0.0100 ms 85.3% +SingleProcess AUTOTUNE takes 3.8470 seconds +AUTOTUNE bmm(16x1x538, 16x538x96) + triton_bmm_51345 0.0104 ms 100.0% + triton_bmm_51346 0.0106 ms 98.2% + triton_bmm_51348 0.0111 ms 93.2% + triton_bmm_51344 0.0115 ms 90.3% + triton_bmm_51349 0.0116 ms 89.0% + triton_bmm_51343 0.0122 ms 85.3% + bmm 0.0124 ms 83.7% + triton_bmm_51342 0.0134 ms 77.3% + triton_bmm_51341 0.0139 ms 74.8% + triton_bmm_51340 0.0186 ms 55.9% +SingleProcess AUTOTUNE takes 4.5105 seconds +AUTOTUNE bmm(16x1x96, 16x96x539) + triton_bmm_51416 0.0085 ms 100.0% + triton_bmm_51415 0.0090 ms 95.0% + triton_bmm_51418 0.0090 ms 94.7% + triton_bmm_51417 0.0091 ms 94.0% + triton_bmm_51413 0.0092 ms 93.4% + triton_bmm_51414 0.0092 ms 93.0% + triton_bmm_51419 0.0093 ms 91.8% + triton_bmm_51420 0.0094 ms 91.1% + triton_bmm_51412 0.0096 ms 89.3% + triton_bmm_51422 0.0099 ms 86.1% +SingleProcess AUTOTUNE takes 6.3350 seconds +AUTOTUNE bmm(16x1x539, 16x539x96) + bmm 0.0138 ms 100.0% + triton_bmm_51445 0.0147 ms 93.7% + triton_bmm_51444 0.0153 ms 89.8% + triton_bmm_51441 0.0195 ms 70.6% + triton_bmm_51442 0.0197 ms 69.7% + triton_bmm_51440 0.0199 ms 69.1% + triton_bmm_51438 0.0201 ms 68.4% + triton_bmm_51437 0.0208 ms 66.3% + triton_bmm_51439 0.0216 ms 63.7% + triton_bmm_51436 0.0264 ms 52.2% +SingleProcess AUTOTUNE takes 4.4094 seconds +AUTOTUNE bmm(16x1x96, 16x96x540) + triton_bmm_51509 0.0085 ms 100.0% + triton_bmm_51514 0.0085 ms 99.6% + triton_bmm_51516 0.0089 ms 95.7% + triton_bmm_51511 0.0090 ms 95.0% + triton_bmm_51512 0.0090 ms 94.7% + triton_bmm_51508 0.0091 ms 93.7% + triton_bmm_51510 0.0092 ms 93.0% + triton_bmm_51515 0.0093 ms 91.7% + triton_bmm_51513 0.0093 ms 91.1% + triton_bmm_51518 0.0097 ms 87.5% +SingleProcess AUTOTUNE takes 3.8254 seconds +AUTOTUNE bmm(16x1x540, 16x540x96) + triton_bmm_51537 0.0099 ms 100.0% + triton_bmm_51538 0.0101 ms 97.8% + triton_bmm_51540 0.0108 ms 92.0% + triton_bmm_51541 0.0112 ms 88.6% + triton_bmm_51536 0.0114 ms 86.8% + triton_bmm_51535 0.0118 ms 84.0% + bmm 0.0122 ms 81.4% + triton_bmm_51534 0.0129 ms 76.7% + triton_bmm_51533 0.0139 ms 71.4% + triton_bmm_51532 0.0184 ms 53.8% +SingleProcess AUTOTUNE takes 4.2027 seconds +AUTOTUNE bmm(16x1x96, 16x96x541) + triton_bmm_51607 0.0085 ms 100.0% + triton_bmm_51606 0.0087 ms 98.5% + triton_bmm_51608 0.0090 ms 95.0% + triton_bmm_51604 0.0091 ms 94.0% + triton_bmm_51610 0.0091 ms 94.0% + triton_bmm_51605 0.0092 ms 93.4% + triton_bmm_51609 0.0093 ms 91.4% + triton_bmm_51612 0.0094 ms 91.1% + triton_bmm_51615 0.0097 ms 87.8% + triton_bmm_51611 0.0098 ms 87.0% +SingleProcess AUTOTUNE takes 3.8737 seconds +AUTOTUNE bmm(16x1x541, 16x541x96) + bmm 0.0134 ms 100.0% + triton_bmm_51637 0.0151 ms 88.4% + triton_bmm_51636 0.0156 ms 85.7% + triton_bmm_51633 0.0192 ms 69.8% + triton_bmm_51634 0.0194 ms 68.9% + triton_bmm_51632 0.0195 ms 68.8% + triton_bmm_51630 0.0197 ms 68.0% + triton_bmm_51629 0.0212 ms 63.1% + triton_bmm_51631 0.0216 ms 61.9% + triton_bmm_51628 0.0260 ms 51.5% +SingleProcess AUTOTUNE takes 3.8812 seconds +AUTOTUNE bmm(16x1x96, 16x96x542) + triton_bmm_51701 0.0087 ms 100.0% + triton_bmm_51703 0.0089 ms 97.5% + triton_bmm_51704 0.0090 ms 96.4% + triton_bmm_51705 0.0090 ms 96.1% + triton_bmm_51706 0.0090 ms 96.1% + triton_bmm_51700 0.0091 ms 95.4% + triton_bmm_51702 0.0092 ms 94.8% + triton_bmm_51707 0.0093 ms 93.1% + triton_bmm_51708 0.0093 ms 92.8% + triton_bmm_51711 0.0101 ms 85.8% +SingleProcess AUTOTUNE takes 4.0343 seconds +AUTOTUNE bmm(16x1x542, 16x542x96) + triton_bmm_51729 0.0099 ms 100.0% + triton_bmm_51730 0.0101 ms 97.8% + triton_bmm_51732 0.0108 ms 92.0% + triton_bmm_51728 0.0115 ms 86.4% + bmm 0.0121 ms 81.8% + triton_bmm_51727 0.0121 ms 81.8% + triton_bmm_51733 0.0121 ms 81.8% + triton_bmm_51726 0.0130 ms 76.5% + triton_bmm_51725 0.0139 ms 71.4% + triton_bmm_51724 0.0189 ms 52.5% +SingleProcess AUTOTUNE takes 4.0996 seconds +AUTOTUNE bmm(16x1x96, 16x96x543) + triton_bmm_51797 0.0087 ms 100.0% + triton_bmm_51799 0.0090 ms 96.4% + triton_bmm_51801 0.0091 ms 95.8% + triton_bmm_51798 0.0092 ms 94.8% + triton_bmm_51800 0.0092 ms 94.8% + triton_bmm_51802 0.0092 ms 94.8% + triton_bmm_51804 0.0094 ms 92.5% + triton_bmm_51796 0.0096 ms 90.6% + triton_bmm_51803 0.0098 ms 88.3% + triton_bmm_51805 0.0098 ms 88.3% +SingleProcess AUTOTUNE takes 3.8961 seconds +AUTOTUNE bmm(16x1x543, 16x543x96) + bmm 0.0138 ms 100.0% + triton_bmm_51829 0.0147 ms 94.1% + triton_bmm_51828 0.0156 ms 88.3% + triton_bmm_51825 0.0180 ms 76.9% + triton_bmm_51826 0.0182 ms 76.1% + triton_bmm_51822 0.0186 ms 74.4% + triton_bmm_51824 0.0187 ms 74.0% + triton_bmm_51823 0.0205 ms 67.3% + triton_bmm_51821 0.0213 ms 65.0% + triton_bmm_51820 0.0274 ms 50.5% +SingleProcess AUTOTUNE takes 3.9381 seconds +AUTOTUNE bmm(16x1x96, 16x96x544) + triton_bmm_51893 0.0085 ms 100.0% + triton_bmm_51896 0.0085 ms 99.6% + triton_bmm_51898 0.0085 ms 99.6% + triton_bmm_51895 0.0089 ms 95.7% + triton_bmm_51897 0.0089 ms 95.7% + triton_bmm_51900 0.0089 ms 95.3% + triton_bmm_51892 0.0091 ms 93.3% + triton_bmm_51894 0.0092 ms 92.7% + triton_bmm_51903 0.0095 ms 88.9% + triton_bmm_51899 0.0098 ms 86.9% +SingleProcess AUTOTUNE takes 3.7000 seconds +AUTOTUNE bmm(16x1x544, 16x544x96) + triton_bmm_51922 0.0100 ms 100.0% + triton_bmm_51921 0.0104 ms 96.3% + bmm 0.0110 ms 91.0% + triton_bmm_51920 0.0110 ms 90.7% + triton_bmm_51924 0.0111 ms 90.2% + triton_bmm_51925 0.0115 ms 87.2% + triton_bmm_51919 0.0117 ms 85.5% + triton_bmm_51918 0.0132 ms 75.5% + triton_bmm_51917 0.0138 ms 72.2% + triton_bmm_51916 0.0190 ms 52.5% +SingleProcess AUTOTUNE takes 3.8369 seconds +AUTOTUNE bmm(16x1x96, 16x96x545) + triton_bmm_51996 0.0089 ms 100.0% + triton_bmm_51991 0.0090 ms 98.6% + triton_bmm_51989 0.0092 ms 97.2% + triton_bmm_51992 0.0092 ms 97.2% + triton_bmm_51994 0.0092 ms 97.2% + triton_bmm_51990 0.0093 ms 95.4% + triton_bmm_51988 0.0096 ms 93.0% + triton_bmm_51993 0.0096 ms 93.0% + triton_bmm_51999 0.0097 ms 91.4% + triton_bmm_51995 0.0099 ms 90.0% +SingleProcess AUTOTUNE takes 3.8439 seconds +AUTOTUNE bmm(16x1x545, 16x545x96) + bmm 0.0121 ms 100.0% + triton_bmm_52021 0.0151 ms 79.9% + triton_bmm_52020 0.0158 ms 76.7% + triton_bmm_52017 0.0195 ms 62.2% + triton_bmm_52018 0.0196 ms 61.6% + triton_bmm_52016 0.0203 ms 59.6% + triton_bmm_52014 0.0206 ms 58.8% + triton_bmm_52013 0.0212 ms 57.2% + triton_bmm_52015 0.0224 ms 54.0% + triton_bmm_52012 0.0270 ms 44.8% +SingleProcess AUTOTUNE takes 3.9560 seconds +AUTOTUNE bmm(16x1x96, 16x96x546) + triton_bmm_52087 0.0085 ms 100.0% + triton_bmm_52088 0.0085 ms 100.0% + triton_bmm_52085 0.0087 ms 98.2% + triton_bmm_52090 0.0090 ms 94.7% + triton_bmm_52084 0.0091 ms 93.7% + triton_bmm_52089 0.0091 ms 93.7% + triton_bmm_52086 0.0092 ms 93.0% + triton_bmm_52092 0.0093 ms 91.1% + triton_bmm_52095 0.0096 ms 88.4% + triton_bmm_52091 0.0098 ms 86.9% +SingleProcess AUTOTUNE takes 4.1318 seconds +AUTOTUNE bmm(16x1x546, 16x546x96) + triton_bmm_52114 0.0104 ms 100.0% + triton_bmm_52113 0.0104 ms 99.4% + triton_bmm_52116 0.0111 ms 93.6% + triton_bmm_52112 0.0112 ms 92.3% + triton_bmm_52117 0.0120 ms 86.4% + triton_bmm_52111 0.0124 ms 83.9% + bmm 0.0130 ms 79.6% + triton_bmm_52110 0.0136 ms 76.1% + triton_bmm_52109 0.0137 ms 75.5% + triton_bmm_52108 0.0195 ms 53.2% +SingleProcess AUTOTUNE takes 4.1015 seconds +AUTOTUNE bmm(16x1x96, 16x96x547) + triton_bmm_52183 0.0085 ms 100.0% + triton_bmm_52181 0.0087 ms 98.5% + triton_bmm_52186 0.0087 ms 98.2% + triton_bmm_52188 0.0089 ms 95.7% + triton_bmm_52184 0.0091 ms 94.3% + triton_bmm_52185 0.0091 ms 94.0% + triton_bmm_52182 0.0092 ms 93.4% + triton_bmm_52180 0.0096 ms 89.3% + triton_bmm_52187 0.0098 ms 87.3% + triton_bmm_52190 0.0099 ms 86.1% +SingleProcess AUTOTUNE takes 4.3250 seconds +AUTOTUNE bmm(16x1x547, 16x547x96) + bmm 0.0125 ms 100.0% + triton_bmm_52213 0.0152 ms 82.7% + triton_bmm_52212 0.0153 ms 81.8% + triton_bmm_52209 0.0196 ms 63.8% + triton_bmm_52210 0.0197 ms 63.7% + triton_bmm_52208 0.0203 ms 61.7% + triton_bmm_52206 0.0206 ms 61.0% + triton_bmm_52205 0.0215 ms 58.4% + triton_bmm_52207 0.0223 ms 56.2% + triton_bmm_52204 0.0266 ms 47.2% +SingleProcess AUTOTUNE takes 4.2888 seconds +AUTOTUNE bmm(16x1x96, 16x96x548) + triton_bmm_52278 0.0087 ms 100.0% + triton_bmm_52279 0.0089 ms 97.0% + triton_bmm_52280 0.0090 ms 96.4% + triton_bmm_52282 0.0090 ms 96.4% + triton_bmm_52277 0.0091 ms 95.1% + triton_bmm_52283 0.0093 ms 93.4% + triton_bmm_52284 0.0093 ms 92.8% + triton_bmm_52281 0.0094 ms 92.5% + triton_bmm_52276 0.0096 ms 90.8% + triton_bmm_52285 0.0097 ms 89.4% +SingleProcess AUTOTUNE takes 3.8069 seconds +AUTOTUNE bmm(16x1x548, 16x548x96) + triton_bmm_52305 0.0100 ms 100.0% + triton_bmm_52306 0.0102 ms 97.8% + triton_bmm_52308 0.0112 ms 89.1% + triton_bmm_52309 0.0112 ms 88.6% + triton_bmm_52304 0.0115 ms 86.4% + triton_bmm_52303 0.0121 ms 82.3% + triton_bmm_52302 0.0134 ms 74.0% + triton_bmm_52301 0.0138 ms 72.0% + bmm 0.0165 ms 60.4% + triton_bmm_52300 0.0190 ms 52.4% +SingleProcess AUTOTUNE takes 4.0915 seconds +AUTOTUNE bmm(16x1x96, 16x96x549) + triton_bmm_52375 0.0090 ms 100.0% + triton_bmm_52376 0.0091 ms 98.9% + triton_bmm_52377 0.0091 ms 98.9% + triton_bmm_52378 0.0091 ms 98.9% + triton_bmm_52373 0.0092 ms 98.3% + triton_bmm_52374 0.0092 ms 97.9% + triton_bmm_52380 0.0094 ms 95.9% + triton_bmm_52372 0.0096 ms 94.0% + triton_bmm_52379 0.0098 ms 91.5% + triton_bmm_52381 0.0102 ms 88.1% +SingleProcess AUTOTUNE takes 3.7830 seconds +AUTOTUNE bmm(16x1x549, 16x549x96) + bmm 0.0127 ms 100.0% + triton_bmm_52405 0.0151 ms 84.1% + triton_bmm_52404 0.0158 ms 80.7% + triton_bmm_52402 0.0199 ms 64.1% + triton_bmm_52401 0.0201 ms 63.5% + triton_bmm_52400 0.0204 ms 62.6% + triton_bmm_52398 0.0206 ms 61.9% + triton_bmm_52397 0.0212 ms 60.1% + triton_bmm_52399 0.0223 ms 57.1% + triton_bmm_52396 0.0270 ms 47.2% +SingleProcess AUTOTUNE takes 3.7605 seconds +AUTOTUNE bmm(16x1x96, 16x96x550) + triton_bmm_52476 0.0089 ms 100.0% + triton_bmm_52471 0.0090 ms 99.3% + triton_bmm_52472 0.0090 ms 98.9% + triton_bmm_52469 0.0091 ms 97.5% + triton_bmm_52474 0.0092 ms 97.2% + triton_bmm_52470 0.0092 ms 96.9% + triton_bmm_52475 0.0093 ms 95.5% + triton_bmm_52473 0.0094 ms 94.2% + triton_bmm_52468 0.0096 ms 93.0% + triton_bmm_52479 0.0097 ms 91.7% +SingleProcess AUTOTUNE takes 3.7363 seconds +AUTOTUNE bmm(16x1x550, 16x550x96) + triton_bmm_52497 0.0101 ms 100.0% + triton_bmm_52498 0.0104 ms 97.8% + triton_bmm_52500 0.0108 ms 94.1% + triton_bmm_52501 0.0117 ms 86.8% + triton_bmm_52496 0.0117 ms 86.6% + triton_bmm_52495 0.0125 ms 81.4% + bmm 0.0129 ms 78.9% + triton_bmm_52494 0.0132 ms 76.9% + triton_bmm_52493 0.0142 ms 71.2% + triton_bmm_52492 0.0191 ms 53.1% +SingleProcess AUTOTUNE takes 4.1646 seconds +AUTOTUNE bmm(16x1x96, 16x96x551) + triton_bmm_52567 0.0085 ms 100.0% + triton_bmm_52565 0.0087 ms 98.5% + triton_bmm_52566 0.0087 ms 98.5% + triton_bmm_52572 0.0089 ms 95.7% + triton_bmm_52568 0.0091 ms 94.3% + triton_bmm_52564 0.0091 ms 93.7% + triton_bmm_52570 0.0092 ms 93.4% + triton_bmm_52569 0.0094 ms 90.8% + triton_bmm_52575 0.0097 ms 87.8% + triton_bmm_52571 0.0098 ms 87.0% +SingleProcess AUTOTUNE takes 3.7983 seconds +AUTOTUNE bmm(16x1x551, 16x551x96) + bmm 0.0132 ms 100.0% + triton_bmm_52597 0.0148 ms 89.8% + triton_bmm_52596 0.0158 ms 83.8% + triton_bmm_52594 0.0199 ms 66.6% + triton_bmm_52592 0.0201 ms 65.9% + triton_bmm_52593 0.0201 ms 65.8% + triton_bmm_52590 0.0207 ms 64.1% + triton_bmm_52589 0.0216 ms 61.2% + triton_bmm_52591 0.0225 ms 58.9% + triton_bmm_52588 0.0268 ms 49.4% +SingleProcess AUTOTUNE takes 3.7589 seconds +AUTOTUNE bmm(16x1x96, 16x96x552) + triton_bmm_52661 0.0085 ms 100.0% + triton_bmm_52663 0.0085 ms 99.6% + triton_bmm_52664 0.0085 ms 99.3% + triton_bmm_52662 0.0087 ms 97.8% + triton_bmm_52668 0.0089 ms 95.3% + triton_bmm_52666 0.0090 ms 94.0% + triton_bmm_52660 0.0091 ms 93.3% + triton_bmm_52667 0.0093 ms 91.4% + triton_bmm_52665 0.0094 ms 90.4% + triton_bmm_52669 0.0098 ms 86.9% +SingleProcess AUTOTUNE takes 3.9899 seconds +AUTOTUNE bmm(16x1x552, 16x552x96) + triton_bmm_52690 0.0106 ms 100.0% + triton_bmm_52692 0.0110 ms 96.5% + triton_bmm_52689 0.0114 ms 93.0% + triton_bmm_52688 0.0116 ms 91.7% + bmm 0.0116 ms 91.5% + triton_bmm_52693 0.0124 ms 86.0% + triton_bmm_52687 0.0124 ms 85.9% + triton_bmm_52686 0.0130 ms 81.8% + triton_bmm_52685 0.0143 ms 74.4% + triton_bmm_52684 0.0193 ms 55.0% +SingleProcess AUTOTUNE takes 4.0244 seconds +AUTOTUNE bmm(16x1x96, 16x96x553) + triton_bmm_52759 0.0085 ms 100.0% + triton_bmm_52762 0.0087 ms 98.2% + triton_bmm_52758 0.0088 ms 96.7% + triton_bmm_52761 0.0091 ms 94.0% + triton_bmm_52757 0.0092 ms 93.4% + triton_bmm_52760 0.0092 ms 93.4% + triton_bmm_52764 0.0094 ms 91.1% + triton_bmm_52756 0.0096 ms 89.3% + triton_bmm_52765 0.0097 ms 87.7% + triton_bmm_52763 0.0098 ms 87.0% +SingleProcess AUTOTUNE takes 4.0865 seconds +AUTOTUNE bmm(16x1x553, 16x553x96) + bmm 0.0134 ms 100.0% + triton_bmm_52789 0.0149 ms 90.1% + triton_bmm_52788 0.0155 ms 86.6% + triton_bmm_52785 0.0197 ms 68.2% + triton_bmm_52786 0.0199 ms 67.6% + triton_bmm_52784 0.0201 ms 66.9% + triton_bmm_52782 0.0207 ms 64.8% + triton_bmm_52781 0.0217 ms 62.0% + triton_bmm_52783 0.0221 ms 60.9% + triton_bmm_52780 0.0268 ms 50.2% +SingleProcess AUTOTUNE takes 3.9729 seconds +AUTOTUNE bmm(16x1x96, 16x96x554) + triton_bmm_52856 0.0085 ms 100.0% + triton_bmm_52857 0.0089 ms 95.7% + triton_bmm_52855 0.0090 ms 95.0% + triton_bmm_52858 0.0091 ms 94.0% + triton_bmm_52853 0.0092 ms 93.0% + triton_bmm_52854 0.0092 ms 92.7% + triton_bmm_52860 0.0093 ms 91.1% + triton_bmm_52852 0.0096 ms 89.0% + triton_bmm_52859 0.0098 ms 87.1% + triton_bmm_52863 0.0100 ms 85.0% +SingleProcess AUTOTUNE takes 3.8945 seconds +AUTOTUNE bmm(16x1x554, 16x554x96) + triton_bmm_52882 0.0104 ms 100.0% + triton_bmm_52881 0.0105 ms 98.8% + triton_bmm_52884 0.0108 ms 96.1% + triton_bmm_52880 0.0117 ms 88.5% + triton_bmm_52885 0.0121 ms 85.5% + triton_bmm_52879 0.0124 ms 83.7% + triton_bmm_52878 0.0132 ms 78.6% + bmm 0.0132 ms 78.5% + triton_bmm_52877 0.0138 ms 75.0% + triton_bmm_52876 0.0192 ms 53.9% +SingleProcess AUTOTUNE takes 4.4384 seconds +AUTOTUNE bmm(16x1x96, 16x96x555) + triton_bmm_52951 0.0090 ms 100.0% + triton_bmm_52948 0.0091 ms 99.3% + triton_bmm_52952 0.0091 ms 99.3% + triton_bmm_52949 0.0092 ms 98.6% + triton_bmm_52954 0.0092 ms 98.6% + triton_bmm_52950 0.0093 ms 96.6% + triton_bmm_52956 0.0094 ms 96.2% + triton_bmm_52953 0.0095 ms 95.3% + triton_bmm_52955 0.0095 ms 95.3% + triton_bmm_52959 0.0097 ms 92.8% +SingleProcess AUTOTUNE takes 4.1684 seconds +AUTOTUNE bmm(16x1x555, 16x555x96) + bmm 0.0136 ms 100.0% + triton_bmm_52981 0.0154 ms 88.1% + triton_bmm_52980 0.0156 ms 87.2% + triton_bmm_52976 0.0201 ms 67.5% + triton_bmm_52977 0.0201 ms 67.4% + triton_bmm_52978 0.0204 ms 66.7% + triton_bmm_52974 0.0207 ms 65.5% + triton_bmm_52973 0.0212 ms 64.0% + triton_bmm_52975 0.0220 ms 61.5% + triton_bmm_52972 0.0266 ms 51.0% +SingleProcess AUTOTUNE takes 4.3016 seconds +AUTOTUNE bmm(16x1x96, 16x96x556) + triton_bmm_53047 0.0085 ms 100.0% + triton_bmm_53048 0.0085 ms 99.6% + triton_bmm_53050 0.0085 ms 99.6% + triton_bmm_53045 0.0086 ms 98.5% + triton_bmm_53049 0.0089 ms 95.7% + triton_bmm_53052 0.0089 ms 95.7% + triton_bmm_53046 0.0092 ms 93.0% + triton_bmm_53051 0.0093 ms 91.7% + triton_bmm_53044 0.0096 ms 89.0% + triton_bmm_53053 0.0097 ms 87.5% +SingleProcess AUTOTUNE takes 4.1575 seconds +AUTOTUNE bmm(16x1x556, 16x556x96) + triton_bmm_53073 0.0104 ms 100.0% + triton_bmm_53074 0.0106 ms 98.2% + triton_bmm_53076 0.0111 ms 94.2% + triton_bmm_53072 0.0116 ms 89.6% + triton_bmm_53077 0.0117 ms 89.1% + triton_bmm_53071 0.0121 ms 86.5% + bmm 0.0130 ms 80.5% + triton_bmm_53070 0.0135 ms 77.4% + triton_bmm_53069 0.0142 ms 73.3% + triton_bmm_53068 0.0190 ms 54.9% +SingleProcess AUTOTUNE takes 4.1320 seconds +AUTOTUNE bmm(16x1x96, 16x96x557) + triton_bmm_53146 0.0087 ms 100.0% + triton_bmm_53148 0.0089 ms 97.8% + triton_bmm_53143 0.0090 ms 96.8% + triton_bmm_53144 0.0091 ms 96.5% + triton_bmm_53145 0.0091 ms 96.5% + triton_bmm_53140 0.0091 ms 96.1% + triton_bmm_53141 0.0092 ms 95.5% + triton_bmm_53142 0.0092 ms 95.1% + triton_bmm_53147 0.0093 ms 93.5% + triton_bmm_53151 0.0097 ms 89.8% +SingleProcess AUTOTUNE takes 3.8190 seconds +AUTOTUNE bmm(16x1x557, 16x557x96) + bmm 0.0138 ms 100.0% + triton_bmm_53173 0.0150 ms 92.3% + triton_bmm_53172 0.0159 ms 87.1% + triton_bmm_53169 0.0199 ms 69.6% + triton_bmm_53170 0.0201 ms 68.9% + triton_bmm_53166 0.0203 ms 68.0% + triton_bmm_53168 0.0205 ms 67.3% + triton_bmm_53165 0.0212 ms 65.2% + triton_bmm_53167 0.0225 ms 61.5% + triton_bmm_53164 0.0270 ms 51.2% +SingleProcess AUTOTUNE takes 4.0094 seconds +AUTOTUNE bmm(16x1x96, 16x96x558) + triton_bmm_53239 0.0085 ms 100.0% + triton_bmm_53240 0.0085 ms 99.6% + triton_bmm_53241 0.0091 ms 94.0% + triton_bmm_53242 0.0091 ms 94.0% + triton_bmm_53236 0.0091 ms 93.3% + triton_bmm_53237 0.0091 ms 93.3% + triton_bmm_53238 0.0092 ms 92.7% + triton_bmm_53243 0.0093 ms 91.4% + triton_bmm_53244 0.0094 ms 90.8% + triton_bmm_53247 0.0097 ms 87.8% +SingleProcess AUTOTUNE takes 3.8161 seconds +AUTOTUNE bmm(16x1x558, 16x558x96) + triton_bmm_53265 0.0106 ms 100.0% + triton_bmm_53268 0.0108 ms 98.5% + triton_bmm_53266 0.0108 ms 98.4% + triton_bmm_53264 0.0112 ms 94.6% + triton_bmm_53263 0.0121 ms 87.8% + triton_bmm_53269 0.0122 ms 87.4% + bmm 0.0131 ms 81.4% + triton_bmm_53262 0.0136 ms 77.9% + triton_bmm_53261 0.0141 ms 75.3% + triton_bmm_53260 0.0192 ms 55.2% +SingleProcess AUTOTUNE takes 4.0745 seconds +AUTOTUNE bmm(16x1x96, 16x96x559) + triton_bmm_53333 0.0087 ms 100.0% + triton_bmm_53338 0.0087 ms 100.0% + triton_bmm_53335 0.0090 ms 96.5% + triton_bmm_53332 0.0091 ms 95.4% + triton_bmm_53336 0.0092 ms 95.1% + triton_bmm_53334 0.0092 ms 94.8% + triton_bmm_53340 0.0094 ms 92.8% + triton_bmm_53337 0.0094 ms 92.5% + triton_bmm_53339 0.0098 ms 88.6% + triton_bmm_53341 0.0099 ms 87.7% +SingleProcess AUTOTUNE takes 4.2524 seconds +AUTOTUNE bmm(16x1x559, 16x559x96) + bmm 0.0140 ms 100.0% + triton_bmm_53365 0.0154 ms 91.5% + triton_bmm_53364 0.0159 ms 88.2% + triton_bmm_53362 0.0200 ms 70.4% + triton_bmm_53360 0.0201 ms 69.9% + triton_bmm_53361 0.0202 ms 69.6% + triton_bmm_53358 0.0203 ms 69.1% + triton_bmm_53357 0.0216 ms 64.9% + triton_bmm_53359 0.0220 ms 63.7% + triton_bmm_53356 0.0266 ms 52.8% +SingleProcess AUTOTUNE takes 3.8328 seconds +AUTOTUNE bmm(16x1x96, 16x96x560) + triton_bmm_53434 0.0085 ms 100.0% + triton_bmm_53436 0.0089 ms 96.0% + triton_bmm_53431 0.0090 ms 95.2% + triton_bmm_53432 0.0090 ms 94.7% + triton_bmm_53429 0.0091 ms 94.3% + triton_bmm_53430 0.0092 ms 93.4% + triton_bmm_53433 0.0093 ms 91.4% + triton_bmm_53439 0.0095 ms 89.6% + triton_bmm_53428 0.0096 ms 89.3% + triton_bmm_53435 0.0098 ms 87.3% +SingleProcess AUTOTUNE takes 3.8891 seconds +AUTOTUNE bmm(16x1x560, 16x560x96) + triton_bmm_53457 0.0106 ms 100.0% + triton_bmm_53458 0.0106 ms 99.7% + triton_bmm_53460 0.0108 ms 98.2% + bmm 0.0112 ms 94.3% + triton_bmm_53456 0.0116 ms 91.4% + triton_bmm_53461 0.0116 ms 90.9% + triton_bmm_53455 0.0121 ms 87.8% + triton_bmm_53454 0.0130 ms 81.3% + triton_bmm_53453 0.0143 ms 74.2% + triton_bmm_53452 0.0193 ms 55.0% +SingleProcess AUTOTUNE takes 3.9282 seconds +AUTOTUNE bmm(16x1x96, 16x96x561) + triton_bmm_53525 0.0087 ms 100.0% + triton_bmm_53526 0.0089 ms 98.2% + triton_bmm_53532 0.0089 ms 97.8% + triton_bmm_53527 0.0090 ms 96.8% + triton_bmm_53524 0.0091 ms 95.8% + triton_bmm_53529 0.0091 ms 95.8% + triton_bmm_53528 0.0092 ms 95.1% + triton_bmm_53530 0.0092 ms 95.1% + triton_bmm_53531 0.0098 ms 88.6% + triton_bmm_53534 0.0099 ms 87.7% +SingleProcess AUTOTUNE takes 3.7555 seconds +AUTOTUNE bmm(16x1x561, 16x561x96) + bmm 0.0125 ms 100.0% + triton_bmm_53557 0.0154 ms 81.5% + triton_bmm_53556 0.0158 ms 79.4% + triton_bmm_53552 0.0201 ms 62.4% + triton_bmm_53554 0.0203 ms 61.8% + triton_bmm_53553 0.0204 ms 61.6% + triton_bmm_53550 0.0207 ms 60.5% + triton_bmm_53549 0.0214 ms 58.7% + triton_bmm_53551 0.0225 ms 55.7% + triton_bmm_53548 0.0264 ms 47.6% +SingleProcess AUTOTUNE takes 4.5294 seconds +AUTOTUNE bmm(16x1x96, 16x96x562) + triton_bmm_53623 0.0085 ms 100.0% + triton_bmm_53624 0.0085 ms 100.0% + triton_bmm_53622 0.0088 ms 96.7% + triton_bmm_53628 0.0089 ms 96.0% + triton_bmm_53625 0.0091 ms 94.3% + triton_bmm_53621 0.0091 ms 93.7% + triton_bmm_53626 0.0092 ms 93.4% + triton_bmm_53627 0.0093 ms 91.8% + triton_bmm_53620 0.0096 ms 89.3% + triton_bmm_53630 0.0100 ms 85.9% +SingleProcess AUTOTUNE takes 3.8980 seconds +AUTOTUNE bmm(16x1x562, 16x562x96) + triton_bmm_53649 0.0101 ms 100.0% + triton_bmm_53650 0.0108 ms 93.8% + triton_bmm_53648 0.0112 ms 90.3% + triton_bmm_53652 0.0113 ms 90.1% + triton_bmm_53653 0.0117 ms 86.8% + triton_bmm_53647 0.0121 ms 83.9% + triton_bmm_53646 0.0132 ms 76.9% + bmm 0.0133 ms 76.2% + triton_bmm_53645 0.0138 ms 73.4% + triton_bmm_53644 0.0195 ms 52.0% +SingleProcess AUTOTUNE takes 4.7453 seconds +AUTOTUNE bmm(16x1x96, 16x96x563) + triton_bmm_53719 0.0085 ms 100.0% + triton_bmm_53717 0.0087 ms 98.2% + triton_bmm_53720 0.0087 ms 98.2% + triton_bmm_53722 0.0092 ms 93.4% + triton_bmm_53718 0.0092 ms 92.5% + triton_bmm_53716 0.0092 ms 92.4% + triton_bmm_53724 0.0094 ms 91.1% + triton_bmm_53721 0.0094 ms 90.8% + triton_bmm_53723 0.0095 ms 90.2% + triton_bmm_53726 0.0099 ms 86.1% +SingleProcess AUTOTUNE takes 4.0178 seconds +AUTOTUNE bmm(16x1x563, 16x563x96) + bmm 0.0132 ms 100.0% + triton_bmm_53749 0.0151 ms 87.5% + triton_bmm_53748 0.0158 ms 83.4% + triton_bmm_53746 0.0199 ms 66.3% + triton_bmm_53745 0.0204 ms 64.8% + triton_bmm_53744 0.0205 ms 64.2% + triton_bmm_53742 0.0208 ms 63.5% + triton_bmm_53741 0.0217 ms 60.7% + triton_bmm_53743 0.0225 ms 58.5% + triton_bmm_53740 0.0264 ms 50.0% +SingleProcess AUTOTUNE takes 4.0121 seconds +AUTOTUNE bmm(16x1x96, 16x96x564) + triton_bmm_53815 0.0085 ms 100.0% + triton_bmm_53816 0.0085 ms 99.6% + triton_bmm_53813 0.0086 ms 98.5% + triton_bmm_53814 0.0087 ms 98.2% + triton_bmm_53820 0.0089 ms 95.7% + triton_bmm_53818 0.0092 ms 93.0% + triton_bmm_53817 0.0094 ms 90.8% + triton_bmm_53812 0.0096 ms 89.0% + triton_bmm_53819 0.0098 ms 87.2% + triton_bmm_53822 0.0099 ms 85.8% +SingleProcess AUTOTUNE takes 3.8087 seconds +AUTOTUNE bmm(16x1x564, 16x564x96) + triton_bmm_53842 0.0104 ms 100.0% + triton_bmm_53841 0.0106 ms 97.6% + triton_bmm_53844 0.0108 ms 95.9% + triton_bmm_53840 0.0112 ms 92.3% + triton_bmm_53845 0.0113 ms 92.0% + triton_bmm_53839 0.0121 ms 85.7% + bmm 0.0134 ms 77.3% + triton_bmm_53838 0.0135 ms 76.6% + triton_bmm_53837 0.0139 ms 74.8% + triton_bmm_53836 0.0190 ms 54.5% +SingleProcess AUTOTUNE takes 4.0687 seconds +AUTOTUNE bmm(16x1x96, 16x96x565) + triton_bmm_53911 0.0085 ms 100.0% + triton_bmm_53914 0.0087 ms 97.8% + triton_bmm_53910 0.0088 ms 96.7% + triton_bmm_53916 0.0089 ms 96.0% + triton_bmm_53913 0.0091 ms 94.0% + triton_bmm_53909 0.0092 ms 93.4% + triton_bmm_53912 0.0092 ms 93.0% + triton_bmm_53908 0.0096 ms 89.3% + triton_bmm_53915 0.0098 ms 87.0% + triton_bmm_53918 0.0099 ms 86.1% +SingleProcess AUTOTUNE takes 4.0692 seconds +AUTOTUNE bmm(16x1x565, 16x565x96) + bmm 0.0132 ms 100.0% + triton_bmm_53941 0.0155 ms 85.2% + triton_bmm_53940 0.0158 ms 83.4% + triton_bmm_53938 0.0199 ms 66.2% + triton_bmm_53936 0.0201 ms 65.6% + triton_bmm_53934 0.0203 ms 64.9% + triton_bmm_53937 0.0204 ms 64.8% + triton_bmm_53933 0.0214 ms 61.6% + triton_bmm_53935 0.0226 ms 58.4% + triton_bmm_53932 0.0267 ms 49.4% +SingleProcess AUTOTUNE takes 3.8095 seconds +AUTOTUNE bmm(16x1x96, 16x96x566) + triton_bmm_54006 0.0088 ms 100.0% + triton_bmm_54012 0.0089 ms 98.9% + triton_bmm_54007 0.0090 ms 97.5% + triton_bmm_54009 0.0091 ms 97.2% + triton_bmm_54004 0.0091 ms 96.8% + triton_bmm_54005 0.0092 ms 96.2% + triton_bmm_54008 0.0092 ms 96.2% + triton_bmm_54010 0.0092 ms 95.8% + triton_bmm_54015 0.0097 ms 90.8% + triton_bmm_54011 0.0098 ms 90.0% +SingleProcess AUTOTUNE takes 3.9818 seconds +AUTOTUNE bmm(16x1x566, 16x566x96) + triton_bmm_54033 0.0102 ms 100.0% + triton_bmm_54036 0.0108 ms 94.1% + triton_bmm_54034 0.0108 ms 93.8% + triton_bmm_54032 0.0117 ms 86.9% + triton_bmm_54037 0.0123 ms 82.8% + triton_bmm_54031 0.0125 ms 81.1% + bmm 0.0131 ms 77.8% + triton_bmm_54030 0.0137 ms 74.5% + triton_bmm_54029 0.0138 ms 73.6% + triton_bmm_54028 0.0192 ms 52.9% +SingleProcess AUTOTUNE takes 3.9304 seconds +AUTOTUNE bmm(16x1x96, 16x96x567) + triton_bmm_54106 0.0087 ms 100.0% + triton_bmm_54102 0.0088 ms 99.3% + triton_bmm_54108 0.0089 ms 97.8% + triton_bmm_54103 0.0090 ms 97.2% + triton_bmm_54105 0.0091 ms 96.1% + triton_bmm_54101 0.0092 ms 95.1% + triton_bmm_54104 0.0092 ms 95.1% + triton_bmm_54100 0.0093 ms 94.1% + triton_bmm_54107 0.0095 ms 91.9% + triton_bmm_54111 0.0097 ms 89.8% +SingleProcess AUTOTUNE takes 3.9076 seconds +AUTOTUNE bmm(16x1x567, 16x567x96) + bmm 0.0138 ms 100.0% + triton_bmm_54133 0.0156 ms 88.9% + triton_bmm_54132 0.0158 ms 87.4% + triton_bmm_54130 0.0201 ms 68.9% + triton_bmm_54128 0.0201 ms 68.8% + triton_bmm_54126 0.0203 ms 68.0% + triton_bmm_54129 0.0204 ms 67.8% + triton_bmm_54125 0.0214 ms 64.6% + triton_bmm_54127 0.0223 ms 62.1% + triton_bmm_54124 0.0266 ms 52.0% +SingleProcess AUTOTUNE takes 3.7477 seconds +AUTOTUNE bmm(16x1x96, 16x96x568) + triton_bmm_54199 0.0085 ms 100.0% + triton_bmm_54197 0.0086 ms 99.6% + triton_bmm_54202 0.0087 ms 98.5% + triton_bmm_54200 0.0091 ms 94.2% + triton_bmm_54198 0.0092 ms 93.0% + triton_bmm_54203 0.0093 ms 91.8% + triton_bmm_54204 0.0094 ms 91.1% + triton_bmm_54201 0.0094 ms 90.8% + triton_bmm_54205 0.0095 ms 89.9% + triton_bmm_54196 0.0096 ms 89.3% +SingleProcess AUTOTUNE takes 3.6932 seconds +AUTOTUNE bmm(16x1x568, 16x568x96) + triton_bmm_54226 0.0104 ms 100.0% + triton_bmm_54224 0.0112 ms 92.3% + triton_bmm_54228 0.0114 ms 90.8% + triton_bmm_54225 0.0116 ms 89.0% + triton_bmm_54223 0.0121 ms 85.7% + triton_bmm_54229 0.0123 ms 84.4% + triton_bmm_54222 0.0132 ms 78.6% + triton_bmm_54221 0.0143 ms 72.3% + bmm 0.0161 ms 64.2% + triton_bmm_54220 0.0190 ms 54.5% +SingleProcess AUTOTUNE takes 4.7586 seconds +AUTOTUNE bmm(16x1x96, 16x96x569) + triton_bmm_54293 0.0087 ms 100.0% + triton_bmm_54298 0.0087 ms 99.6% + triton_bmm_54300 0.0089 ms 97.5% + triton_bmm_54295 0.0090 ms 96.5% + triton_bmm_54297 0.0091 ms 95.8% + triton_bmm_54296 0.0092 ms 94.8% + triton_bmm_54294 0.0093 ms 93.5% + triton_bmm_54299 0.0095 ms 91.6% + triton_bmm_54292 0.0096 ms 90.4% + triton_bmm_54303 0.0097 ms 89.5% +SingleProcess AUTOTUNE takes 3.7911 seconds +AUTOTUNE bmm(16x1x569, 16x569x96) + bmm 0.0140 ms 100.0% + triton_bmm_54325 0.0156 ms 90.1% + triton_bmm_54324 0.0160 ms 87.8% + triton_bmm_54322 0.0200 ms 70.2% + triton_bmm_54321 0.0201 ms 69.9% + triton_bmm_54320 0.0206 ms 68.3% + triton_bmm_54318 0.0208 ms 67.5% + triton_bmm_54317 0.0219 ms 64.2% + triton_bmm_54319 0.0223 ms 62.9% + triton_bmm_54316 0.0270 ms 52.0% +SingleProcess AUTOTUNE takes 4.3519 seconds +AUTOTUNE bmm(16x1x96, 16x96x570) + triton_bmm_54392 0.0086 ms 100.0% + triton_bmm_54389 0.0087 ms 98.9% + triton_bmm_54390 0.0087 ms 98.9% + triton_bmm_54394 0.0087 ms 98.9% + triton_bmm_54391 0.0090 ms 95.4% + triton_bmm_54393 0.0091 ms 94.7% + triton_bmm_54396 0.0094 ms 91.8% + triton_bmm_54388 0.0096 ms 90.0% + triton_bmm_54399 0.0097 ms 88.5% + triton_bmm_54395 0.0098 ms 87.9% +SingleProcess AUTOTUNE takes 4.1323 seconds +AUTOTUNE bmm(16x1x570, 16x570x96) + triton_bmm_54417 0.0103 ms 100.0% + triton_bmm_54418 0.0108 ms 95.3% + triton_bmm_54420 0.0110 ms 93.9% + triton_bmm_54416 0.0118 ms 87.5% + triton_bmm_54415 0.0121 ms 85.4% + triton_bmm_54421 0.0124 ms 83.7% + bmm 0.0131 ms 79.0% + triton_bmm_54414 0.0134 ms 77.3% + triton_bmm_54413 0.0143 ms 72.3% + triton_bmm_54412 0.0192 ms 53.7% +SingleProcess AUTOTUNE takes 4.1355 seconds +AUTOTUNE bmm(16x1x96, 16x96x571) + triton_bmm_54485 0.0087 ms 100.0% + triton_bmm_54486 0.0087 ms 99.6% + triton_bmm_54487 0.0090 ms 96.5% + triton_bmm_54488 0.0092 ms 95.1% + triton_bmm_54490 0.0092 ms 94.4% + triton_bmm_54484 0.0093 ms 93.8% + triton_bmm_54492 0.0094 ms 92.5% + triton_bmm_54489 0.0095 ms 91.6% + triton_bmm_54491 0.0098 ms 88.6% + triton_bmm_54493 0.0099 ms 87.7% +SingleProcess AUTOTUNE takes 4.1455 seconds +AUTOTUNE bmm(16x1x571, 16x571x96) + bmm 0.0145 ms 100.0% + triton_bmm_54517 0.0151 ms 96.0% + triton_bmm_54516 0.0156 ms 93.4% + triton_bmm_54513 0.0204 ms 71.3% + triton_bmm_54514 0.0204 ms 71.3% + triton_bmm_54512 0.0206 ms 70.6% + triton_bmm_54510 0.0208 ms 69.8% + triton_bmm_54509 0.0216 ms 67.3% + triton_bmm_54511 0.0228 ms 63.9% + triton_bmm_54508 0.0269 ms 54.0% +SingleProcess AUTOTUNE takes 4.0027 seconds +AUTOTUNE bmm(16x1x96, 16x96x572) + triton_bmm_54586 0.0086 ms 100.0% + triton_bmm_54588 0.0089 ms 96.1% + triton_bmm_54583 0.0090 ms 95.4% + triton_bmm_54584 0.0090 ms 95.0% + triton_bmm_54580 0.0091 ms 94.4% + triton_bmm_54581 0.0091 ms 94.0% + triton_bmm_54582 0.0092 ms 93.4% + triton_bmm_54587 0.0093 ms 92.1% + triton_bmm_54585 0.0094 ms 91.5% + triton_bmm_54591 0.0097 ms 88.2% +SingleProcess AUTOTUNE takes 3.8186 seconds +AUTOTUNE bmm(16x1x572, 16x572x96) + triton_bmm_54610 0.0104 ms 100.0% + triton_bmm_54609 0.0106 ms 97.6% + triton_bmm_54612 0.0110 ms 94.2% + triton_bmm_54608 0.0112 ms 92.3% + triton_bmm_54613 0.0119 ms 87.1% + triton_bmm_54607 0.0121 ms 85.7% + bmm 0.0131 ms 79.0% + triton_bmm_54606 0.0132 ms 78.6% + triton_bmm_54605 0.0140 ms 74.0% + triton_bmm_54604 0.0191 ms 54.4% +SingleProcess AUTOTUNE takes 3.9284 seconds +AUTOTUNE bmm(16x1x96, 16x96x573) + triton_bmm_54684 0.0089 ms 100.0% + triton_bmm_54679 0.0091 ms 98.6% + triton_bmm_54677 0.0092 ms 97.2% + triton_bmm_54680 0.0092 ms 97.2% + triton_bmm_54682 0.0092 ms 96.9% + triton_bmm_54678 0.0092 ms 96.5% + triton_bmm_54676 0.0093 ms 96.2% + triton_bmm_54681 0.0095 ms 93.9% + triton_bmm_54686 0.0100 ms 89.7% + triton_bmm_54683 0.0100 ms 89.4% +SingleProcess AUTOTUNE takes 4.0469 seconds +AUTOTUNE bmm(16x1x573, 16x573x96) + bmm 0.0147 ms 100.0% + triton_bmm_54709 0.0156 ms 94.5% + triton_bmm_54708 0.0160 ms 91.9% + triton_bmm_54706 0.0205 ms 71.9% + triton_bmm_54702 0.0205 ms 71.8% + triton_bmm_54705 0.0205 ms 71.7% + triton_bmm_54704 0.0206 ms 71.4% + triton_bmm_54701 0.0219 ms 67.3% + triton_bmm_54703 0.0225 ms 65.5% + triton_bmm_54700 0.0271 ms 54.4% +SingleProcess AUTOTUNE takes 4.0380 seconds +AUTOTUNE bmm(16x1x96, 16x96x574) + triton_bmm_54773 0.0087 ms 100.0% + triton_bmm_54775 0.0090 ms 96.8% + triton_bmm_54777 0.0091 ms 96.1% + triton_bmm_54776 0.0091 ms 95.8% + triton_bmm_54778 0.0092 ms 94.4% + triton_bmm_54772 0.0093 ms 93.8% + triton_bmm_54774 0.0093 ms 93.2% + triton_bmm_54780 0.0094 ms 92.8% + triton_bmm_54779 0.0098 ms 88.9% + triton_bmm_54782 0.0100 ms 87.5% +SingleProcess AUTOTUNE takes 3.9023 seconds +AUTOTUNE bmm(16x1x574, 16x574x96) + triton_bmm_54801 0.0107 ms 100.0% + triton_bmm_54802 0.0108 ms 98.5% + triton_bmm_54804 0.0110 ms 97.1% + triton_bmm_54800 0.0114 ms 93.6% + triton_bmm_54805 0.0119 ms 89.8% + triton_bmm_54799 0.0126 ms 85.0% + bmm 0.0132 ms 80.9% + triton_bmm_54798 0.0137 ms 78.2% + triton_bmm_54797 0.0140 ms 76.1% + triton_bmm_54796 0.0197 ms 54.3% +SingleProcess AUTOTUNE takes 4.1005 seconds +AUTOTUNE bmm(16x1x96, 16x96x575) + triton_bmm_54871 0.0085 ms 100.0% + triton_bmm_54869 0.0087 ms 97.8% + triton_bmm_54870 0.0089 ms 96.4% + triton_bmm_54872 0.0092 ms 93.0% + triton_bmm_54874 0.0092 ms 92.7% + triton_bmm_54868 0.0093 ms 92.1% + triton_bmm_54876 0.0094 ms 90.8% + triton_bmm_54873 0.0095 ms 89.9% + triton_bmm_54875 0.0099 ms 86.4% + triton_bmm_54877 0.0099 ms 86.1% +SingleProcess AUTOTUNE takes 3.7872 seconds +AUTOTUNE bmm(16x1x575, 16x575x96) + bmm 0.0149 ms 100.0% + triton_bmm_54901 0.0156 ms 95.5% + triton_bmm_54900 0.0158 ms 94.3% + triton_bmm_54897 0.0184 ms 81.2% + triton_bmm_54898 0.0188 ms 79.3% + triton_bmm_54896 0.0190 ms 78.3% + triton_bmm_54894 0.0197 ms 75.6% + triton_bmm_54895 0.0210 ms 71.1% + triton_bmm_54893 0.0221 ms 67.5% + triton_bmm_54892 0.0284 ms 52.5% +SingleProcess AUTOTUNE takes 4.1639 seconds +AUTOTUNE bmm(16x1x96, 16x96x576) + triton_bmm_54967 0.0085 ms 100.0% + triton_bmm_54968 0.0085 ms 100.0% + triton_bmm_54965 0.0086 ms 98.9% + triton_bmm_54966 0.0087 ms 97.8% + triton_bmm_54972 0.0089 ms 95.7% + triton_bmm_54964 0.0091 ms 94.0% + triton_bmm_54970 0.0091 ms 93.7% + triton_bmm_54971 0.0093 ms 91.8% + triton_bmm_54969 0.0093 ms 91.4% + triton_bmm_54975 0.0097 ms 88.1% +SingleProcess AUTOTUNE takes 3.8468 seconds +AUTOTUNE bmm(16x1x576, 16x576x96) + triton_bmm_54994 0.0104 ms 100.0% + triton_bmm_54993 0.0112 ms 92.6% + triton_bmm_54996 0.0113 ms 92.0% + bmm 0.0115 ms 90.5% + triton_bmm_54992 0.0117 ms 88.5% + triton_bmm_54997 0.0121 ms 85.7% + triton_bmm_54991 0.0121 ms 85.5% + triton_bmm_54990 0.0136 ms 76.1% + triton_bmm_54989 0.0143 ms 72.6% + triton_bmm_54988 0.0192 ms 53.9% +SingleProcess AUTOTUNE takes 4.5007 seconds +AUTOTUNE bmm(16x1x96, 16x96x577) + triton_bmm_55063 0.0086 ms 100.0% + triton_bmm_55062 0.0088 ms 97.8% + triton_bmm_55068 0.0090 ms 95.7% + triton_bmm_55061 0.0092 ms 93.4% + triton_bmm_55064 0.0092 ms 93.4% + triton_bmm_55066 0.0092 ms 92.7% + triton_bmm_55060 0.0093 ms 92.4% + triton_bmm_55065 0.0095 ms 90.2% + triton_bmm_55067 0.0095 ms 90.2% + triton_bmm_55071 0.0097 ms 88.2% +SingleProcess AUTOTUNE takes 3.9612 seconds +AUTOTUNE bmm(16x1x577, 16x577x96) + bmm 0.0123 ms 100.0% + triton_bmm_55093 0.0138 ms 88.9% + triton_bmm_55092 0.0165 ms 74.4% + triton_bmm_55090 0.0205 ms 59.9% + triton_bmm_55088 0.0207 ms 59.3% + triton_bmm_55089 0.0208 ms 59.1% + triton_bmm_55086 0.0210 ms 58.5% + triton_bmm_55085 0.0220 ms 55.7% + triton_bmm_55087 0.0234 ms 52.6% + triton_bmm_55084 0.0281 ms 43.7% +SingleProcess AUTOTUNE takes 3.9340 seconds +AUTOTUNE bmm(16x1x96, 16x96x578) + triton_bmm_55157 0.0087 ms 100.0% + triton_bmm_55162 0.0087 ms 99.6% + triton_bmm_55158 0.0088 ms 99.3% + triton_bmm_55159 0.0090 ms 96.5% + triton_bmm_55161 0.0091 ms 95.8% + triton_bmm_55160 0.0091 ms 95.4% + triton_bmm_55156 0.0093 ms 93.8% + triton_bmm_55164 0.0094 ms 92.5% + triton_bmm_55163 0.0098 ms 88.9% + triton_bmm_55166 0.0100 ms 87.5% +SingleProcess AUTOTUNE takes 4.4998 seconds +AUTOTUNE bmm(16x1x578, 16x578x96) + triton_bmm_55185 0.0104 ms 100.0% + triton_bmm_55186 0.0111 ms 93.6% + triton_bmm_55188 0.0116 ms 89.6% + triton_bmm_55184 0.0120 ms 86.6% + triton_bmm_55189 0.0124 ms 83.9% + triton_bmm_55183 0.0128 ms 81.0% + bmm 0.0134 ms 77.5% + triton_bmm_55182 0.0136 ms 76.2% + triton_bmm_55181 0.0147 ms 70.6% + triton_bmm_55180 0.0199 ms 52.2% +SingleProcess AUTOTUNE takes 4.2836 seconds +AUTOTUNE bmm(16x1x96, 16x96x579) + triton_bmm_55255 0.0087 ms 100.0% + triton_bmm_55253 0.0087 ms 99.3% + triton_bmm_55258 0.0087 ms 99.3% + triton_bmm_55257 0.0091 ms 95.4% + triton_bmm_55256 0.0092 ms 94.4% + triton_bmm_55254 0.0094 ms 92.5% + triton_bmm_55260 0.0094 ms 92.2% + triton_bmm_55259 0.0095 ms 91.2% + triton_bmm_55252 0.0097 ms 89.7% + triton_bmm_55263 0.0097 ms 89.1% +SingleProcess AUTOTUNE takes 4.5100 seconds +AUTOTUNE bmm(16x1x579, 16x579x96) + bmm 0.0123 ms 100.0% + triton_bmm_55285 0.0143 ms 86.1% + triton_bmm_55284 0.0165 ms 74.8% + triton_bmm_55281 0.0205 ms 60.1% + triton_bmm_55282 0.0208 ms 59.2% + triton_bmm_55280 0.0210 ms 58.6% + triton_bmm_55278 0.0212 ms 58.0% + triton_bmm_55277 0.0225 ms 54.8% + triton_bmm_55279 0.0234 ms 52.7% + triton_bmm_55276 0.0281 ms 43.8% +SingleProcess AUTOTUNE takes 4.4821 seconds +AUTOTUNE bmm(16x1x96, 16x96x580) + triton_bmm_55351 0.0085 ms 100.0% + triton_bmm_55352 0.0091 ms 93.8% + triton_bmm_55349 0.0092 ms 93.4% + triton_bmm_55354 0.0092 ms 93.0% + triton_bmm_55355 0.0093 ms 91.8% + triton_bmm_55350 0.0093 ms 91.4% + triton_bmm_55353 0.0093 ms 91.4% + triton_bmm_55356 0.0094 ms 90.8% + triton_bmm_55348 0.0096 ms 89.3% + triton_bmm_55359 0.0097 ms 87.8% +SingleProcess AUTOTUNE takes 3.8749 seconds +AUTOTUNE bmm(16x1x580, 16x580x96) + triton_bmm_55377 0.0108 ms 100.0% + triton_bmm_55378 0.0110 ms 98.3% + triton_bmm_55380 0.0112 ms 96.6% + triton_bmm_55381 0.0115 ms 94.7% + triton_bmm_55376 0.0119 ms 90.9% + triton_bmm_55375 0.0125 ms 86.7% + bmm 0.0132 ms 82.3% + triton_bmm_55374 0.0140 ms 77.4% + triton_bmm_55373 0.0147 ms 73.7% + triton_bmm_55372 0.0198 ms 54.9% +SingleProcess AUTOTUNE takes 3.9653 seconds +AUTOTUNE bmm(16x1x96, 16x96x581) + triton_bmm_55445 0.0087 ms 100.0% + triton_bmm_55448 0.0087 ms 100.0% + triton_bmm_55450 0.0088 ms 99.6% + triton_bmm_55446 0.0089 ms 98.6% + triton_bmm_55447 0.0091 ms 96.5% + triton_bmm_55451 0.0095 ms 91.9% + triton_bmm_55449 0.0095 ms 91.6% + triton_bmm_55452 0.0095 ms 91.6% + triton_bmm_55444 0.0098 ms 89.5% + triton_bmm_55453 0.0100 ms 87.8% +SingleProcess AUTOTUNE takes 3.8977 seconds +AUTOTUNE bmm(16x1x581, 16x581x96) + bmm 0.0123 ms 100.0% + triton_bmm_55477 0.0143 ms 86.1% + triton_bmm_55476 0.0162 ms 76.1% + triton_bmm_55474 0.0207 ms 59.5% + triton_bmm_55472 0.0208 ms 59.3% + triton_bmm_55473 0.0208 ms 59.2% + triton_bmm_55470 0.0212 ms 58.2% + triton_bmm_55469 0.0225 ms 54.8% + triton_bmm_55471 0.0234 ms 52.6% + triton_bmm_55468 0.0279 ms 44.2% +SingleProcess AUTOTUNE takes 4.3259 seconds +AUTOTUNE bmm(16x1x96, 16x96x582) + triton_bmm_55543 0.0085 ms 100.0% + triton_bmm_55541 0.0087 ms 98.2% + triton_bmm_55544 0.0087 ms 97.8% + triton_bmm_55546 0.0087 ms 97.8% + triton_bmm_55542 0.0093 ms 91.4% + triton_bmm_55548 0.0094 ms 90.8% + triton_bmm_55545 0.0095 ms 90.2% + triton_bmm_55540 0.0097 ms 87.8% + triton_bmm_55547 0.0098 ms 87.0% + triton_bmm_55550 0.0100 ms 85.9% +SingleProcess AUTOTUNE takes 4.3398 seconds +AUTOTUNE bmm(16x1x582, 16x582x96) + triton_bmm_55569 0.0105 ms 100.0% + triton_bmm_55570 0.0106 ms 98.5% + triton_bmm_55572 0.0112 ms 93.2% + triton_bmm_55568 0.0119 ms 87.7% + triton_bmm_55573 0.0124 ms 84.7% + triton_bmm_55567 0.0128 ms 81.5% + bmm 0.0133 ms 78.8% + triton_bmm_55566 0.0141 ms 74.3% + triton_bmm_55565 0.0147 ms 71.2% + triton_bmm_55564 0.0199 ms 52.7% +SingleProcess AUTOTUNE takes 4.0240 seconds +AUTOTUNE bmm(16x1x96, 16x96x583) + triton_bmm_55642 0.0088 ms 100.0% + triton_bmm_55638 0.0089 ms 98.9% + triton_bmm_55639 0.0091 ms 96.5% + triton_bmm_55641 0.0091 ms 96.5% + triton_bmm_55640 0.0092 ms 95.1% + triton_bmm_55637 0.0093 ms 94.6% + triton_bmm_55636 0.0093 ms 94.5% + triton_bmm_55643 0.0095 ms 92.3% + triton_bmm_55644 0.0095 ms 91.9% + triton_bmm_55647 0.0097 ms 90.1% +SingleProcess AUTOTUNE takes 4.1645 seconds +AUTOTUNE bmm(16x1x583, 16x583x96) + bmm 0.0125 ms 100.0% + triton_bmm_55669 0.0139 ms 90.3% + triton_bmm_55668 0.0162 ms 77.3% + triton_bmm_55664 0.0210 ms 59.7% + triton_bmm_55665 0.0210 ms 59.6% + triton_bmm_55662 0.0212 ms 59.2% + triton_bmm_55666 0.0212 ms 59.0% + triton_bmm_55661 0.0227 ms 55.1% + triton_bmm_55663 0.0231 ms 54.1% + triton_bmm_55660 0.0281 ms 44.5% +SingleProcess AUTOTUNE takes 3.9819 seconds +AUTOTUNE bmm(16x1x96, 16x96x584) + triton_bmm_55733 0.0087 ms 100.0% + triton_bmm_55734 0.0087 ms 99.3% + triton_bmm_55740 0.0089 ms 97.1% + triton_bmm_55735 0.0091 ms 95.8% + triton_bmm_55736 0.0091 ms 95.4% + triton_bmm_55738 0.0091 ms 95.3% + triton_bmm_55732 0.0093 ms 93.4% + triton_bmm_55737 0.0094 ms 92.2% + triton_bmm_55741 0.0095 ms 90.9% + triton_bmm_55739 0.0098 ms 88.3% +SingleProcess AUTOTUNE takes 3.8256 seconds +AUTOTUNE bmm(16x1x584, 16x584x96) + triton_bmm_55762 0.0106 ms 100.0% + triton_bmm_55761 0.0110 ms 95.9% + bmm 0.0113 ms 94.0% + triton_bmm_55760 0.0119 ms 88.7% + triton_bmm_55764 0.0119 ms 88.7% + triton_bmm_55765 0.0123 ms 86.2% + triton_bmm_55759 0.0128 ms 82.7% + triton_bmm_55758 0.0139 ms 76.4% + triton_bmm_55757 0.0147 ms 72.0% + triton_bmm_55756 0.0198 ms 53.4% +SingleProcess AUTOTUNE takes 4.1810 seconds +AUTOTUNE bmm(16x1x96, 16x96x585) + triton_bmm_55832 0.0087 ms 100.0% + triton_bmm_55834 0.0087 ms 100.0% + triton_bmm_55836 0.0090 ms 96.8% + triton_bmm_55831 0.0091 ms 96.0% + triton_bmm_55828 0.0093 ms 94.1% + triton_bmm_55829 0.0093 ms 93.5% + triton_bmm_55830 0.0094 ms 93.2% + triton_bmm_55833 0.0095 ms 91.6% + triton_bmm_55839 0.0098 ms 89.5% + triton_bmm_55835 0.0099 ms 88.3% +SingleProcess AUTOTUNE takes 4.0054 seconds +AUTOTUNE bmm(16x1x585, 16x585x96) + bmm 0.0125 ms 100.0% + triton_bmm_55861 0.0139 ms 90.5% + triton_bmm_55860 0.0162 ms 77.3% + triton_bmm_55857 0.0206 ms 60.8% + triton_bmm_55858 0.0210 ms 59.7% + triton_bmm_55854 0.0212 ms 59.2% + triton_bmm_55856 0.0212 ms 59.1% + triton_bmm_55853 0.0222 ms 56.4% + triton_bmm_55855 0.0233 ms 53.8% + triton_bmm_55852 0.0283 ms 44.3% +SingleProcess AUTOTUNE takes 3.9794 seconds +AUTOTUNE bmm(16x1x96, 16x96x586) + triton_bmm_55927 0.0087 ms 100.0% + triton_bmm_55925 0.0087 ms 99.6% + triton_bmm_55930 0.0087 ms 99.3% + triton_bmm_55929 0.0091 ms 95.8% + triton_bmm_55928 0.0092 ms 94.1% + triton_bmm_55926 0.0094 ms 92.5% + triton_bmm_55931 0.0095 ms 91.6% + triton_bmm_55932 0.0095 ms 91.1% + triton_bmm_55924 0.0097 ms 89.7% + triton_bmm_55934 0.0100 ms 87.1% +SingleProcess AUTOTUNE takes 4.1053 seconds +AUTOTUNE bmm(16x1x586, 16x586x96) + triton_bmm_55953 0.0108 ms 100.0% + triton_bmm_55954 0.0111 ms 98.0% + triton_bmm_55956 0.0112 ms 96.6% + triton_bmm_55952 0.0117 ms 92.9% + triton_bmm_55957 0.0119 ms 91.1% + triton_bmm_55951 0.0128 ms 84.7% + bmm 0.0133 ms 81.7% + triton_bmm_55950 0.0142 ms 76.2% + triton_bmm_55949 0.0143 ms 76.0% + triton_bmm_55948 0.0204 ms 53.3% +SingleProcess AUTOTUNE takes 4.6060 seconds +AUTOTUNE bmm(16x1x96, 16x96x587) + triton_bmm_56024 0.0087 ms 100.0% + triton_bmm_56021 0.0088 ms 99.6% + triton_bmm_56022 0.0089 ms 97.8% + triton_bmm_56025 0.0091 ms 96.1% + triton_bmm_56023 0.0092 ms 95.1% + triton_bmm_56026 0.0092 ms 94.5% + triton_bmm_56020 0.0093 ms 94.1% + triton_bmm_56027 0.0095 ms 91.9% + triton_bmm_56028 0.0096 ms 91.3% + triton_bmm_56030 0.0100 ms 87.8% +SingleProcess AUTOTUNE takes 3.9130 seconds +AUTOTUNE bmm(16x1x587, 16x587x96) + bmm 0.0130 ms 100.0% + triton_bmm_56053 0.0138 ms 93.7% + triton_bmm_56052 0.0167 ms 77.7% + triton_bmm_56049 0.0207 ms 62.5% + triton_bmm_56050 0.0212 ms 61.1% + triton_bmm_56048 0.0214 ms 60.6% + triton_bmm_56046 0.0216 ms 59.9% + triton_bmm_56045 0.0227 ms 57.0% + triton_bmm_56047 0.0232 ms 55.9% + triton_bmm_56044 0.0283 ms 45.9% +SingleProcess AUTOTUNE takes 3.9681 seconds +TIMEOUT + loading model: 0it [00:00, ?it/s] loading model: 0it [00:01, ?it/s] +dcgan +cuda eval dcgan int4weightonly-bs1-acc +pass-sqnr-53.240 + loading model: 0it [00:00, ?it/s]WARNING:common:Model demucs does not support bfloat16, running with amp instead + loading model: 0it [00:08, ?it/s] +WARNING:common:Model demucs does not support bfloat16, running with amp instead +demucs +cuda eval demucs int4weightonly-bs1-acc +WARNING:common:Model demucs does not support bfloat16, running with amp instead +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +densenet121 +cuda eval densenet121 int4weightonly-bs1-acc +pass-sqnr-20.547 + loading model: 0it [00:00, ?it/s]detectron2_fasterrcnn_r_101_c4 +WARNING:common:Model detectron2_fasterrcnn_r_101_c4 does not support bfloat16, running with amp instead + loading model: 0it [00:08, ?it/s] +WARNING:root:detectron2_fasterrcnn_r_101_c4 failed to load +Original Error: CUDA out of memory. Tried to allocate 5607.61 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.40 GiB is free. Including non-PyTorch memory, this process has 1.75 GiB memory in use. Of the allocated memory 1.14 GiB is allocated by PyTorch, and 73.83 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 477, in forward + box_features = self._shared_roi_transform( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 456, in _shared_roi_transform + x = self.pooler(features, boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 246, in forward + return self.level_poolers[0](x[0], pooler_fmt_boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 236, in roi_align + return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 168, in _roi_align + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) # [K, C, PH, PW, IY, IX] + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 62, in _bilinear_interpolate + v1 = masked_index(y_low, x_low) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 55, in masked_index + return input[ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 5607.61 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.40 GiB is free. Including non-PyTorch memory, this process has 1.75 GiB memory in use. Of the allocated memory 1.14 GiB is allocated by PyTorch, and 73.83 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s]detectron2_fasterrcnn_r_101_dc5 + loading model: 0it [00:08, ?it/s] +WARNING:root:detectron2_fasterrcnn_r_101_dc5 failed to load +Original Error: CUDA out of memory. Tried to allocate 2803.80 GiB. GPU 0 has a total capacity of 79.15 GiB of which 76.85 GiB is free. Including non-PyTorch memory, this process has 2.30 GiB memory in use. Of the allocated memory 1.68 GiB is allocated by PyTorch, and 44.00 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 747, in forward + pred_instances = self._forward_box(features, proposals) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 798, in _forward_box + box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 246, in forward + return self.level_poolers[0](x[0], pooler_fmt_boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 236, in roi_align + return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 168, in _roi_align + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) # [K, C, PH, PW, IY, IX] + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 62, in _bilinear_interpolate + v1 = masked_index(y_low, x_low) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 55, in masked_index + return input[ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2803.80 GiB. GPU 0 has a total capacity of 79.15 GiB of which 76.85 GiB is free. Including non-PyTorch memory, this process has 2.30 GiB memory in use. Of the allocated memory 1.68 GiB is allocated by PyTorch, and 44.00 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s]detectron2_fasterrcnn_r_101_fpn +WARNING:common:Model detectron2_fasterrcnn_r_101_fpn does not support bfloat16, running with amp instead + loading model: 0it [00:08, ?it/s] +WARNING:root:detectron2_fasterrcnn_r_101_fpn failed to load +Original Error: CUDA out of memory. Tried to allocate 3653.77 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.22 GiB is free. Including non-PyTorch memory, this process has 1.93 GiB memory in use. Of the allocated memory 1.35 GiB is allocated by PyTorch, and 37.84 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 747, in forward + pred_instances = self._forward_box(features, proposals) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 798, in _forward_box + box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 261, in forward + output.index_put_((inds,), pooler(x[level], pooler_fmt_boxes_level)) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 236, in roi_align + return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 168, in _roi_align + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) # [K, C, PH, PW, IY, IX] + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 62, in _bilinear_interpolate + v1 = masked_index(y_low, x_low) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 55, in masked_index + return input[ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3653.77 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.22 GiB is free. Including non-PyTorch memory, this process has 1.93 GiB memory in use. Of the allocated memory 1.35 GiB is allocated by PyTorch, and 37.84 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s]detectron2_fasterrcnn_r_50_c4 + loading model: 0it [00:09, ?it/s] +WARNING:root:detectron2_fasterrcnn_r_50_c4 failed to load +Original Error: CUDA out of memory. Tried to allocate 5607.61 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.73 GiB is free. Including non-PyTorch memory, this process has 1.42 GiB memory in use. Of the allocated memory 884.81 MiB is allocated by PyTorch, and 15.19 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 477, in forward + box_features = self._shared_roi_transform( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 456, in _shared_roi_transform + x = self.pooler(features, boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 246, in forward + return self.level_poolers[0](x[0], pooler_fmt_boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 236, in roi_align + return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 168, in _roi_align + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) # [K, C, PH, PW, IY, IX] + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 62, in _bilinear_interpolate + v1 = masked_index(y_low, x_low) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 55, in masked_index + return input[ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 5607.61 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.73 GiB is free. Including non-PyTorch memory, this process has 1.42 GiB memory in use. Of the allocated memory 884.81 MiB is allocated by PyTorch, and 15.19 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s]detectron2_fasterrcnn_r_50_dc5 +WARNING:common:Model detectron2_fasterrcnn_r_50_dc5 does not support bfloat16, running with amp instead + loading model: 0it [00:08, ?it/s] +WARNING:root:detectron2_fasterrcnn_r_50_dc5 failed to load +Original Error: CUDA out of memory. Tried to allocate 2803.80 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.04 GiB is free. Including non-PyTorch memory, this process has 2.11 GiB memory in use. Of the allocated memory 1.46 GiB is allocated by PyTorch, and 89.64 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 747, in forward + pred_instances = self._forward_box(features, proposals) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 798, in _forward_box + box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 246, in forward + return self.level_poolers[0](x[0], pooler_fmt_boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 236, in roi_align + return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 168, in _roi_align + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) # [K, C, PH, PW, IY, IX] + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 62, in _bilinear_interpolate + v1 = masked_index(y_low, x_low) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 55, in masked_index + return input[ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2803.80 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.04 GiB is free. Including non-PyTorch memory, this process has 2.11 GiB memory in use. Of the allocated memory 1.46 GiB is allocated by PyTorch, and 89.64 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s]detectron2_fasterrcnn_r_50_fpn +WARNING:common:Model detectron2_fasterrcnn_r_50_fpn does not support bfloat16, running with amp instead + loading model: 0it [00:06, ?it/s] +WARNING:root:detectron2_fasterrcnn_r_50_fpn failed to load +Original Error: CUDA out of memory. Tried to allocate 4125.40 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.46 GiB is free. Including non-PyTorch memory, this process has 1.69 GiB memory in use. Of the allocated memory 1.10 GiB is allocated by PyTorch, and 47.89 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 747, in forward + pred_instances = self._forward_box(features, proposals) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 798, in _forward_box + box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 261, in forward + output.index_put_((inds,), pooler(x[level], pooler_fmt_boxes_level)) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 236, in roi_align + return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 168, in _roi_align + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) # [K, C, PH, PW, IY, IX] + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 62, in _bilinear_interpolate + v1 = masked_index(y_low, x_low) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 55, in masked_index + return input[ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4125.40 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.46 GiB is free. Including non-PyTorch memory, this process has 1.69 GiB memory in use. Of the allocated memory 1.10 GiB is allocated by PyTorch, and 47.89 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s]detectron2_fcos_r_50_fpn +WARNING:common:Model detectron2_fcos_r_50_fpn does not support bfloat16, running with amp instead + loading model: 0it [00:05, ?it/s] +WARNING:common:Model detectron2_fcos_r_50_fpn does not support bfloat16, running with amp instead +cuda eval detectron2_fcos_r_50_fpn int4weightonly-bs1-acc +WARNING:common:Model detectron2_fcos_r_50_fpn does not support bfloat16, running with amp instead +skipping cudagraphs due to ['mutated inputs'] +pass-sqnr-error + loading model: 0it [00:00, ?it/s]detectron2_maskrcnn_r_101_c4 +WARNING:common:Model detectron2_maskrcnn_r_101_c4 does not support bfloat16, running with amp instead + loading model: 0it [00:06, ?it/s] +WARNING:root:detectron2_maskrcnn_r_101_c4 failed to load +Original Error: CUDA out of memory. Tried to allocate 5607.61 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.38 GiB is free. Including non-PyTorch memory, this process has 1.77 GiB memory in use. Of the allocated memory 1.15 GiB is allocated by PyTorch, and 84.27 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 477, in forward + box_features = self._shared_roi_transform( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 456, in _shared_roi_transform + x = self.pooler(features, boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 246, in forward + return self.level_poolers[0](x[0], pooler_fmt_boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 236, in roi_align + return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 168, in _roi_align + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) # [K, C, PH, PW, IY, IX] + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 62, in _bilinear_interpolate + v1 = masked_index(y_low, x_low) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 55, in masked_index + return input[ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 5607.61 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.38 GiB is free. Including non-PyTorch memory, this process has 1.77 GiB memory in use. Of the allocated memory 1.15 GiB is allocated by PyTorch, and 84.27 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s]detectron2_maskrcnn_r_101_fpn +WARNING:common:Model detectron2_maskrcnn_r_101_fpn does not support bfloat16, running with amp instead + loading model: 0it [00:06, ?it/s] +WARNING:root:detectron2_maskrcnn_r_101_fpn failed to load +Original Error: CUDA out of memory. Tried to allocate 3682.18 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.23 GiB is free. Including non-PyTorch memory, this process has 1.92 GiB memory in use. Of the allocated memory 1.36 GiB is allocated by PyTorch, and 20.94 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 747, in forward + pred_instances = self._forward_box(features, proposals) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 798, in _forward_box + box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 261, in forward + output.index_put_((inds,), pooler(x[level], pooler_fmt_boxes_level)) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 236, in roi_align + return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 168, in _roi_align + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) # [K, C, PH, PW, IY, IX] + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 62, in _bilinear_interpolate + v1 = masked_index(y_low, x_low) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 55, in masked_index + return input[ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3682.18 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.23 GiB is free. Including non-PyTorch memory, this process has 1.92 GiB memory in use. Of the allocated memory 1.36 GiB is allocated by PyTorch, and 20.94 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s]detectron2_maskrcnn_r_50_c4 +WARNING:common:Model detectron2_maskrcnn_r_50_c4 does not support bfloat16, running with amp instead + loading model: 0it [00:07, ?it/s] +WARNING:root:detectron2_maskrcnn_r_50_c4 failed to load +Original Error: CUDA out of memory. Tried to allocate 5433.77 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.69 GiB is free. Including non-PyTorch memory, this process has 1.46 GiB memory in use. Of the allocated memory 910.65 MiB is allocated by PyTorch, and 33.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 477, in forward + box_features = self._shared_roi_transform( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 456, in _shared_roi_transform + x = self.pooler(features, boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 246, in forward + return self.level_poolers[0](x[0], pooler_fmt_boxes) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 236, in roi_align + return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 168, in _roi_align + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) # [K, C, PH, PW, IY, IX] + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 62, in _bilinear_interpolate + v1 = masked_index(y_low, x_low) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 55, in masked_index + return input[ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 5433.77 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.69 GiB is free. Including non-PyTorch memory, this process has 1.46 GiB memory in use. Of the allocated memory 910.65 MiB is allocated by PyTorch, and 33.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s]detectron2_maskrcnn_r_50_fpn +WARNING:common:Model detectron2_maskrcnn_r_50_fpn does not support bfloat16, running with amp instead + loading model: 0it [00:05, ?it/s] +WARNING:root:detectron2_maskrcnn_r_50_fpn failed to load +Original Error: CUDA out of memory. Tried to allocate 3966.30 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.40 GiB is free. Including non-PyTorch memory, this process has 1.75 GiB memory in use. Of the allocated memory 1.11 GiB is allocated by PyTorch, and 99.07 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 150, in forward + return self.inference(batched_inputs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 213, in inference + results, _ = self.roi_heads(images, features, proposals, None) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 747, in forward + pred_instances = self._forward_box(features, proposals) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 798, in _forward_box + box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/modeling/poolers.py", line 261, in forward + output.index_put_((inds,), pooler(x[level], pooler_fmt_boxes_level)) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/detectron2/layers/roi_align.py", line 58, in forward + return roi_align( + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 236, in roi_align + return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 168, in _roi_align + val = _bilinear_interpolate(input, roi_batch_ind, y, x, ymask, xmask) # [K, C, PH, PW, IY, IX] + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 62, in _bilinear_interpolate + v1 = masked_index(y_low, x_low) + File "/home/cdhernandez/local/vision/torchvision/ops/roi_align.py", line 55, in masked_index + return input[ +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3966.30 GiB. GPU 0 has a total capacity of 79.15 GiB of which 77.40 GiB is free. Including non-PyTorch memory, this process has 1.75 GiB memory in use. Of the allocated memory 1.11 GiB is allocated by PyTorch, and 99.07 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s] loading model: 0it [00:12, ?it/s] +dlrm +cuda eval dlrm int4weightonly-bs1-acc +pass-sqnr-nan + loading model: 0it [00:00, ?it/s]WARNING:common:Model doctr_det_predictor does not support bfloat16, running with amp instead + loading model: 0it [00:05, ?it/s] +WARNING:common:Model doctr_det_predictor does not support bfloat16, running with amp instead +doctr_det_predictor +cuda eval doctr_det_predictor int4weightonly-bs1-acc +WARNING:common:Model doctr_det_predictor does not support bfloat16, running with amp instead +[2023-12-13 03:32:33,607] [2/0_1] torch._inductor.utils: [WARNING] DeviceCopy in input program +skipping cudagraphs due to ['non-cuda device in graph'] +malloc(): unaligned tcache chunk detected +Run failed with return code: -6 +Output: None +Error: None + loading model: 0it [00:00, ?it/s]WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead + loading model: 0it [00:05, ?it/s] +WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead +doctr_reco_predictor +cuda eval doctr_reco_predictor int4weightonly-bs1-acc +WARNING:common:Model doctr_reco_predictor does not support bfloat16, running with amp instead +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +drq +cuda eval drq int4weightonly-bs1-acc +WARNING:common:fp64 golden ref were not generated for drq. Setting accuracy check to cosine +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 36, in + run() + File "/home/cdhernandez/local/benchmark/run_benchmark.py", line 30, in run + benchmark.run(bm_args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/run.py", line 24, in run + main(TorchBenchmarkRunner(), original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3133, in main + process_entry(0, runner, original_dir, args) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3090, in process_entry + return maybe_fresh_cache( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1714, in inner + return fn(*args, **kwargs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3642, in run + runner.run_one_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 2518, in run_one_model + status = self.check_accuracy( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 2147, in check_accuracy + model, example_inputs = self.maybe_cast(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1937, in maybe_cast + model = self.deepcopy_model(model) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1887, in deepcopy_model + return copy.deepcopy(model) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 172, in deepcopy + y = _reconstruct(x, memo, *rv) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 271, in _reconstruct + state = deepcopy(state, memo) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 146, in deepcopy + y = copier(x, memo) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 231, in _deepcopy_dict + y[deepcopy(key, memo)] = deepcopy(value, memo) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 172, in deepcopy + y = _reconstruct(x, memo, *rv) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 297, in _reconstruct + value = deepcopy(value, memo) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 172, in deepcopy + y = _reconstruct(x, memo, *rv) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 271, in _reconstruct + state = deepcopy(state, memo) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 146, in deepcopy + y = copier(x, memo) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 231, in _deepcopy_dict + y[deepcopy(key, memo)] = deepcopy(value, memo) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 146, in deepcopy + y = copier(x, memo) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 231, in _deepcopy_dict + y[deepcopy(key, memo)] = deepcopy(value, memo) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/copy.py", line 153, in deepcopy + y = copier(memo) + File "/home/cdhernandez/local/pytorch/torch/_tensor.py", line 86, in __deepcopy__ + raise RuntimeError( +RuntimeError: Only Tensors created explicitly by the user (graph leaves) support the deepcopy protocol at the moment. If you were attempting to deepcopy a module, this may be because of a torch.nn.utils.weight_norm usage, see https://github.com/pytorch/pytorch/pull/103001 +Run failed with return code: 1 +Output: None +Error: None + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +fastNLP_Bert +cuda eval fastNLP_Bert int4weightonly-bs1-acc +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:01, ?it/s] +functorch_dp_cifar10 +cuda eval functorch_dp_cifar10 int4weightonly-bs1-acc +pass-sqnr-22.880 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:00, ?it/s] +functorch_maml_omniglot +cuda eval functorch_maml_omniglot int4weightonly-bs1-acc +pass-sqnr-25.190 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:05, ?it/s] +hf_Albert +cuda eval hf_Albert int4weightonly-bs1-acc +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:07, ?it/s] +hf_Bart +cuda eval hf_Bart int4weightonly-bs1-acc +WARNING:common:fp64 golden ref were not generated for hf_Bart. Setting accuracy check to cosine +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:05, ?it/s] +hf_BigBird +cuda eval hf_BigBird int4weightonly-bs1-acc +[2023-12-13 03:37:14,127] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 03:37:16,858] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 03:37:18,324] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 03:37:19,770] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 03:37:21,218] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 03:37:22,662] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 03:37:24,112] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 03:37:25,561] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 03:37:27,371] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 03:37:28,833] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 03:37:30,291] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +[2023-12-13 03:37:31,731] [1/0] torch._inductor.utils: [WARNING] DeviceCopy in input program +skipping cudagraphs due to ['non-cuda device in graph'] +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +hf_DistilBert +cuda eval hf_DistilBert int4weightonly-bs1-acc +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:06, ?it/s] +hf_GPT2 +cuda eval hf_GPT2 int4weightonly-bs1-acc +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:18, ?it/s] +hf_GPT2_large +cuda eval hf_GPT2_large int4weightonly-bs1-acc +pass_due_to_skip + loading model: 0it [00:00, ?it/s] loading model: 0it [00:09, ?it/s] +hf_Longformer +cuda eval hf_Longformer int4weightonly-bs1-acc +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +hf_Reformer +cuda eval hf_Reformer int4weightonly-bs1-acc +skipping cudagraphs due to ['incompatible ops'] +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:05, ?it/s] +hf_T5 +cuda eval hf_T5 int4weightonly-bs1-acc +[2023-12-13 03:42:32,392] torch._dynamo.utils: [ERROR] RMSE (res-fp64): 0.00002, (ref-fp64): 0.00000 and shape=torch.Size([1, 8, 2048, 64]) +[2023-12-13 03:42:32,393] torch._dynamo.utils: [ERROR] Accuracy failed for key name past_key_values +fail_accuracy-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:08, ?it/s] +hf_T5_base +cuda eval hf_T5_base int4weightonly-bs1-acc +[2023-12-13 03:43:42,063] torch._dynamo.utils: [ERROR] RMSE (res-fp64): 0.00002, (ref-fp64): 0.00000 and shape=torch.Size([1, 12, 2048, 64]) +[2023-12-13 03:43:42,064] torch._dynamo.utils: [ERROR] Accuracy failed for key name past_key_values +fail_accuracy-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:15, ?it/s] +hf_T5_generate +cuda eval hf_T5_generate int4weightonly-bs1-acc +AUTOTUNE bmm(8x1x64, 8x64x1) + triton_bmm_145 0.0060 ms 100.0% + triton_bmm_146 0.0060 ms 100.0% + triton_bmm_147 0.0060 ms 100.0% + triton_bmm_148 0.0060 ms 100.0% + triton_bmm_149 0.0067 ms 90.4% + triton_bmm_144 0.0070 ms 85.5% + triton_bmm_150 0.0075 ms 80.2% + triton_bmm_151 0.0075 ms 80.0% + bmm 0.0078 ms 77.4% +SingleProcess AUTOTUNE takes 2.8662 seconds +AUTOTUNE bmm(8x1x64, 8x64x2048) + triton_bmm_168 0.0090 ms 100.0% + triton_bmm_167 0.0093 ms 97.2% + triton_bmm_160 0.0093 ms 96.6% + triton_bmm_163 0.0093 ms 96.6% + triton_bmm_165 0.0093 ms 96.6% + triton_bmm_161 0.0099 ms 91.3% + triton_bmm_169 0.0100 ms 90.1% + triton_bmm_159 0.0102 ms 88.4% + triton_bmm_162 0.0102 ms 88.3% + triton_bmm_166 0.0102 ms 88.1% +SingleProcess AUTOTUNE takes 3.6115 seconds +AUTOTUNE bmm(8x1x64, 8x64x2) + triton_bmm_365 0.0062 ms 100.0% + bmm 0.0068 ms 92.4% + triton_bmm_361 0.0068 ms 92.0% + triton_bmm_362 0.0068 ms 92.0% + triton_bmm_363 0.0068 ms 92.0% + triton_bmm_364 0.0068 ms 92.0% + triton_bmm_366 0.0070 ms 89.0% + triton_bmm_360 0.0072 ms 86.3% + triton_bmm_367 0.0076 ms 82.6% +SingleProcess AUTOTUNE takes 3.0172 seconds +AUTOTUNE bmm(8x1x64, 8x64x3) + triton_bmm_577 0.0062 ms 100.0% + triton_bmm_581 0.0066 ms 94.2% + triton_bmm_578 0.0068 ms 92.0% + triton_bmm_579 0.0068 ms 92.0% + triton_bmm_580 0.0068 ms 92.0% + triton_bmm_582 0.0070 ms 88.6% + triton_bmm_576 0.0073 ms 85.9% + bmm 0.0073 ms 85.5% + triton_bmm_583 0.0078 ms 80.2% +SingleProcess AUTOTUNE takes 2.6851 seconds +AUTOTUNE bmm(8x1x64, 8x64x4) + triton_bmm_793 0.0062 ms 100.0% + triton_bmm_794 0.0062 ms 100.0% + triton_bmm_795 0.0062 ms 100.0% + triton_bmm_796 0.0062 ms 100.0% + triton_bmm_797 0.0062 ms 100.0% + bmm 0.0068 ms 92.4% + triton_bmm_792 0.0068 ms 92.4% + triton_bmm_799 0.0070 ms 89.0% + triton_bmm_798 0.0076 ms 82.6% +SingleProcess AUTOTUNE takes 2.9297 seconds +AUTOTUNE bmm(8x1x64, 8x64x5) + triton_bmm_1010 0.0062 ms 100.0% + triton_bmm_1011 0.0062 ms 100.0% + triton_bmm_1012 0.0062 ms 100.0% + triton_bmm_1013 0.0062 ms 100.0% + triton_bmm_1008 0.0068 ms 92.4% + triton_bmm_1009 0.0068 ms 92.0% + triton_bmm_1014 0.0072 ms 86.3% + triton_bmm_1015 0.0072 ms 86.3% + bmm 0.0075 ms 83.3% +SingleProcess AUTOTUNE takes 2.4679 seconds +AUTOTUNE bmm(8x1x64, 8x64x6) + triton_bmm_1227 0.0062 ms 100.0% + triton_bmm_1229 0.0062 ms 100.0% + triton_bmm_1225 0.0068 ms 92.0% + triton_bmm_1226 0.0068 ms 92.0% + triton_bmm_1228 0.0068 ms 92.0% + triton_bmm_1230 0.0070 ms 89.0% + triton_bmm_1224 0.0073 ms 85.9% + bmm 0.0073 ms 85.5% + triton_bmm_1231 0.0076 ms 81.9% +SingleProcess AUTOTUNE takes 2.6410 seconds +AUTOTUNE bmm(8x1x64, 8x64x7) + triton_bmm_1441 0.0067 ms 100.0% + triton_bmm_1442 0.0068 ms 99.3% + triton_bmm_1443 0.0068 ms 99.3% + triton_bmm_1444 0.0068 ms 98.8% + triton_bmm_1445 0.0069 ms 97.9% + triton_bmm_1446 0.0072 ms 93.1% + triton_bmm_1447 0.0072 ms 93.1% + triton_bmm_1440 0.0073 ms 92.3% + bmm 0.0080 ms 83.9% +SingleProcess AUTOTUNE takes 2.4942 seconds +AUTOTUNE bmm(8x1x64, 8x64x8) + triton_bmm_1658 0.0062 ms 100.0% + triton_bmm_1659 0.0062 ms 100.0% + triton_bmm_1660 0.0062 ms 100.0% + triton_bmm_1656 0.0068 ms 92.4% + triton_bmm_1657 0.0068 ms 92.0% + triton_bmm_1661 0.0068 ms 92.0% + bmm 0.0068 ms 91.8% + triton_bmm_1662 0.0070 ms 89.0% + triton_bmm_1663 0.0077 ms 80.9% +SingleProcess AUTOTUNE takes 2.5462 seconds +AUTOTUNE bmm(8x1x64, 8x64x9) + triton_bmm_1877 0.0067 ms 100.0% + triton_bmm_1872 0.0068 ms 99.3% + triton_bmm_1874 0.0068 ms 98.8% + triton_bmm_1876 0.0068 ms 98.8% + triton_bmm_1873 0.0068 ms 98.6% + triton_bmm_1875 0.0068 ms 98.6% + triton_bmm_1878 0.0072 ms 92.7% + triton_bmm_1879 0.0072 ms 92.7% + bmm 0.0080 ms 83.5% +SingleProcess AUTOTUNE takes 2.5419 seconds +AUTOTUNE bmm(8x1x64, 8x64x10) + triton_bmm_2089 0.0062 ms 100.0% + triton_bmm_2090 0.0062 ms 100.0% + triton_bmm_2091 0.0062 ms 100.0% + triton_bmm_2092 0.0063 ms 99.5% + triton_bmm_2093 0.0063 ms 99.5% + triton_bmm_2095 0.0072 ms 86.3% + triton_bmm_2088 0.0073 ms 85.5% + bmm 0.0075 ms 83.0% + triton_bmm_2094 0.0078 ms 80.2% +SingleProcess AUTOTUNE takes 2.4332 seconds +AUTOTUNE bmm(8x1x64, 8x64x11) + triton_bmm_2306 0.0062 ms 100.0% + triton_bmm_2307 0.0062 ms 100.0% + triton_bmm_2308 0.0063 ms 99.5% + triton_bmm_2309 0.0063 ms 99.5% + triton_bmm_2305 0.0067 ms 92.6% + triton_bmm_2304 0.0073 ms 85.7% + triton_bmm_2310 0.0078 ms 80.2% + triton_bmm_2311 0.0078 ms 80.2% + bmm 0.0080 ms 77.7% +SingleProcess AUTOTUNE takes 2.6376 seconds +AUTOTUNE bmm(8x1x64, 8x64x12) + triton_bmm_2522 0.0062 ms 100.0% + triton_bmm_2523 0.0062 ms 100.0% + triton_bmm_2525 0.0063 ms 99.5% + triton_bmm_2521 0.0066 ms 94.7% + triton_bmm_2524 0.0068 ms 91.5% + triton_bmm_2526 0.0072 ms 86.3% + triton_bmm_2527 0.0072 ms 86.3% + triton_bmm_2520 0.0073 ms 85.5% + bmm 0.0073 ms 85.2% +SingleProcess AUTOTUNE takes 2.5469 seconds +AUTOTUNE bmm(8x1x64, 8x64x13) + triton_bmm_2737 0.0063 ms 100.0% + triton_bmm_2738 0.0063 ms 100.0% + triton_bmm_2739 0.0063 ms 100.0% + triton_bmm_2736 0.0068 ms 92.9% + triton_bmm_2740 0.0068 ms 92.0% + triton_bmm_2741 0.0068 ms 92.0% + triton_bmm_2742 0.0073 ms 86.3% + bmm 0.0075 ms 83.4% + triton_bmm_2743 0.0078 ms 80.7% +SingleProcess AUTOTUNE takes 2.5769 seconds +AUTOTUNE bmm(8x1x64, 8x64x14) + triton_bmm_2953 0.0062 ms 100.0% + triton_bmm_2954 0.0062 ms 100.0% + triton_bmm_2955 0.0063 ms 99.5% + triton_bmm_2956 0.0063 ms 99.5% + triton_bmm_2957 0.0063 ms 99.5% + triton_bmm_2952 0.0068 ms 92.4% + triton_bmm_2958 0.0073 ms 85.9% + triton_bmm_2959 0.0073 ms 85.9% + bmm 0.0073 ms 85.2% +SingleProcess AUTOTUNE takes 2.5873 seconds +AUTOTUNE bmm(8x1x64, 8x64x15) + triton_bmm_3170 0.0063 ms 100.0% + triton_bmm_3171 0.0063 ms 100.0% + triton_bmm_3172 0.0063 ms 100.0% + triton_bmm_3173 0.0063 ms 100.0% + triton_bmm_3168 0.0068 ms 92.9% + triton_bmm_3169 0.0069 ms 90.7% + triton_bmm_3174 0.0073 ms 86.3% + triton_bmm_3175 0.0078 ms 80.7% + bmm 0.0221 ms 28.4% +SingleProcess AUTOTUNE takes 2.8679 seconds +AUTOTUNE bmm(8x1x64, 8x64x16) + triton_bmm_3386 0.0062 ms 100.0% + triton_bmm_3387 0.0063 ms 99.5% + triton_bmm_3385 0.0067 ms 92.9% + triton_bmm_3388 0.0070 ms 89.7% + triton_bmm_3389 0.0070 ms 89.7% + triton_bmm_3390 0.0072 ms 86.3% + triton_bmm_3384 0.0073 ms 85.9% + bmm 0.0075 ms 83.0% + triton_bmm_3391 0.0078 ms 80.2% +SingleProcess AUTOTUNE takes 2.6812 seconds +AUTOTUNE bmm(8x1x64, 8x64x17) + triton_bmm_3601 0.0065 ms 100.0% + triton_bmm_3602 0.0065 ms 100.0% + triton_bmm_3604 0.0068 ms 96.2% + triton_bmm_3600 0.0070 ms 92.7% + triton_bmm_3603 0.0070 ms 92.3% + triton_bmm_3605 0.0073 ms 89.0% + triton_bmm_3606 0.0075 ms 86.8% + bmm 0.0075 ms 86.4% + triton_bmm_3607 0.0080 ms 81.5% +SingleProcess AUTOTUNE takes 2.5273 seconds +AUTOTUNE bmm(8x1x64, 8x64x18) + triton_bmm_3822 0.0070 ms 100.0% + triton_bmm_3824 0.0070 ms 100.0% + triton_bmm_3826 0.0070 ms 100.0% + triton_bmm_3827 0.0070 ms 100.0% + triton_bmm_3823 0.0070 ms 99.1% + triton_bmm_3828 0.0073 ms 96.0% + bmm 0.0073 ms 95.8% + triton_bmm_3829 0.0078 ms 89.7% + triton_bmm_3825 0.0186 ms 37.5% +SingleProcess AUTOTUNE takes 2.5977 seconds +AUTOTUNE bmm(8x1x64, 8x64x19) + triton_bmm_4048 0.0068 ms 100.0% + triton_bmm_4045 0.0069 ms 98.1% + triton_bmm_4046 0.0070 ms 95.9% + triton_bmm_4047 0.0070 ms 95.9% + triton_bmm_4049 0.0073 ms 92.5% + triton_bmm_4050 0.0075 ms 90.2% + triton_bmm_4044 0.0075 ms 89.8% + triton_bmm_4051 0.0080 ms 84.4% + bmm 0.0081 ms 83.7% +SingleProcess AUTOTUNE takes 2.5287 seconds +AUTOTUNE bmm(8x1x64, 8x64x20) + triton_bmm_4267 0.0065 ms 100.0% + triton_bmm_4271 0.0068 ms 95.7% + triton_bmm_4269 0.0069 ms 93.5% + triton_bmm_4268 0.0070 ms 92.4% + triton_bmm_4270 0.0072 ms 89.2% + triton_bmm_4272 0.0073 ms 89.0% + triton_bmm_4266 0.0075 ms 86.0% + bmm 0.0078 ms 83.3% + triton_bmm_4273 0.0078 ms 83.1% +SingleProcess AUTOTUNE takes 2.5199 seconds +AUTOTUNE bmm(8x1x64, 8x64x21) + triton_bmm_4490 0.0065 ms 100.0% + triton_bmm_4491 0.0065 ms 100.0% + triton_bmm_4489 0.0070 ms 92.7% + bmm 0.0075 ms 86.4% + triton_bmm_4492 0.0075 ms 86.4% + triton_bmm_4493 0.0075 ms 86.4% + triton_bmm_4488 0.0075 ms 86.2% + triton_bmm_4494 0.0080 ms 80.9% + triton_bmm_4495 0.0080 ms 80.9% +SingleProcess AUTOTUNE takes 2.4414 seconds +AUTOTUNE bmm(8x1x64, 8x64x22) + triton_bmm_4711 0.0065 ms 100.0% + triton_bmm_4712 0.0065 ms 100.0% + triton_bmm_4713 0.0065 ms 100.0% + triton_bmm_4714 0.0074 ms 87.5% + triton_bmm_4716 0.0074 ms 87.5% + triton_bmm_4715 0.0075 ms 86.6% + bmm 0.0075 ms 86.4% + triton_bmm_4710 0.0075 ms 86.4% + triton_bmm_4717 0.0078 ms 83.2% +SingleProcess AUTOTUNE takes 2.4956 seconds +AUTOTUNE bmm(8x1x64, 8x64x23) + triton_bmm_4933 0.0065 ms 100.0% + triton_bmm_4934 0.0065 ms 100.0% + triton_bmm_4935 0.0070 ms 92.3% + triton_bmm_4938 0.0075 ms 86.8% + triton_bmm_4939 0.0075 ms 86.8% + triton_bmm_4936 0.0075 ms 86.6% + triton_bmm_4937 0.0075 ms 86.6% + triton_bmm_4932 0.0075 ms 86.2% + bmm 0.0078 ms 83.2% +SingleProcess AUTOTUNE takes 3.0430 seconds +AUTOTUNE bmm(8x1x64, 8x64x24) + triton_bmm_5155 0.0064 ms 100.0% + triton_bmm_5156 0.0064 ms 100.0% + triton_bmm_5157 0.0064 ms 100.0% + triton_bmm_5158 0.0070 ms 92.2% + bmm 0.0071 ms 90.5% + triton_bmm_5161 0.0073 ms 88.5% + triton_bmm_5160 0.0073 ms 87.8% + triton_bmm_5159 0.0074 ms 86.6% + triton_bmm_5154 0.0075 ms 85.5% +SingleProcess AUTOTUNE takes 2.4517 seconds +AUTOTUNE bmm(8x1x64, 8x64x25) + triton_bmm_5377 0.0070 ms 100.0% + triton_bmm_5376 0.0070 ms 99.3% + triton_bmm_5379 0.0070 ms 99.1% + triton_bmm_5378 0.0070 ms 98.9% + triton_bmm_5382 0.0075 ms 92.9% + bmm 0.0075 ms 92.6% + triton_bmm_5380 0.0075 ms 92.6% + triton_bmm_5381 0.0075 ms 92.6% + triton_bmm_5383 0.0080 ms 86.7% +SingleProcess AUTOTUNE takes 2.6952 seconds +AUTOTUNE bmm(8x1x64, 8x64x26) + triton_bmm_5599 0.0065 ms 100.0% + triton_bmm_5600 0.0065 ms 100.0% + triton_bmm_5601 0.0065 ms 100.0% + triton_bmm_5603 0.0070 ms 93.1% + triton_bmm_5598 0.0070 ms 92.7% + triton_bmm_5602 0.0070 ms 92.7% + triton_bmm_5605 0.0078 ms 83.2% + triton_bmm_5604 0.0080 ms 81.2% + bmm 0.0089 ms 72.8% +SingleProcess AUTOTUNE takes 2.4690 seconds +AUTOTUNE bmm(8x1x64, 8x64x27) + triton_bmm_5824 0.0070 ms 100.0% + triton_bmm_5825 0.0070 ms 100.0% + triton_bmm_5823 0.0070 ms 99.5% + triton_bmm_5821 0.0070 ms 99.1% + triton_bmm_5822 0.0070 ms 99.1% + triton_bmm_5826 0.0075 ms 93.2% + bmm 0.0075 ms 92.8% + triton_bmm_5820 0.0075 ms 92.8% + triton_bmm_5827 0.0080 ms 86.9% +SingleProcess AUTOTUNE takes 2.4456 seconds +AUTOTUNE bmm(8x1x64, 8x64x28) + triton_bmm_6044 0.0064 ms 100.0% + triton_bmm_6045 0.0064 ms 100.0% + triton_bmm_6043 0.0065 ms 99.0% + triton_bmm_6047 0.0068 ms 95.3% + triton_bmm_6042 0.0070 ms 92.0% + triton_bmm_6049 0.0073 ms 88.5% + triton_bmm_6046 0.0073 ms 87.8% + triton_bmm_6048 0.0075 ms 85.9% + bmm 0.0078 ms 82.0% +SingleProcess AUTOTUNE takes 2.4662 seconds +AUTOTUNE bmm(8x1x64, 8x64x29) + triton_bmm_6264 0.0070 ms 100.0% + triton_bmm_6265 0.0070 ms 99.5% + triton_bmm_6266 0.0070 ms 99.5% + triton_bmm_6267 0.0070 ms 99.5% + triton_bmm_6270 0.0075 ms 93.6% + triton_bmm_6268 0.0075 ms 93.2% + triton_bmm_6269 0.0075 ms 93.2% + bmm 0.0077 ms 90.9% + triton_bmm_6271 0.0080 ms 87.3% +SingleProcess AUTOTUNE takes 2.6779 seconds +AUTOTUNE bmm(8x1x64, 8x64x30) + triton_bmm_6488 0.0065 ms 100.0% + triton_bmm_6489 0.0065 ms 100.0% + triton_bmm_6487 0.0070 ms 92.7% + triton_bmm_6491 0.0075 ms 87.1% + triton_bmm_6492 0.0075 ms 86.8% + triton_bmm_6486 0.0075 ms 86.4% + triton_bmm_6490 0.0075 ms 86.4% + triton_bmm_6493 0.0079 ms 81.9% + bmm 0.0083 ms 78.7% +SingleProcess AUTOTUNE takes 2.4564 seconds +AUTOTUNE bmm(8x1x64, 8x64x31) + triton_bmm_6711 0.0065 ms 100.0% + triton_bmm_6709 0.0069 ms 93.5% + triton_bmm_6713 0.0070 ms 92.7% + triton_bmm_6710 0.0070 ms 92.3% + bmm 0.0075 ms 86.4% + triton_bmm_6708 0.0075 ms 86.4% + triton_bmm_6712 0.0075 ms 86.4% + triton_bmm_6714 0.0080 ms 80.9% + triton_bmm_6715 0.0080 ms 80.9% +SingleProcess AUTOTUNE takes 2.7107 seconds +AUTOTUNE bmm(8x1x64, 8x64x32) + triton_bmm_6931 0.0065 ms 100.0% + bmm 0.0069 ms 94.2% + triton_bmm_6930 0.0070 ms 93.1% + triton_bmm_6935 0.0070 ms 93.1% + triton_bmm_6932 0.0070 ms 92.7% + triton_bmm_6933 0.0070 ms 92.7% + triton_bmm_6936 0.0075 ms 86.8% + triton_bmm_6934 0.0075 ms 86.4% + triton_bmm_6937 0.0080 ms 81.7% +SingleProcess AUTOTUNE takes 2.5130 seconds +AUTOTUNE bmm(8x1x64, 8x64x33) + triton_bmm_7154 0.0065 ms 100.0% + triton_bmm_7157 0.0070 ms 93.1% + triton_bmm_7153 0.0070 ms 92.5% + triton_bmm_7156 0.0071 ms 91.0% + triton_bmm_7155 0.0072 ms 89.6% + triton_bmm_7160 0.0075 ms 86.8% + triton_bmm_7158 0.0075 ms 86.4% + triton_bmm_7152 0.0076 ms 85.7% + bmm 0.0080 ms 81.5% + triton_bmm_7159 0.0081 ms 80.4% +SingleProcess AUTOTUNE takes 2.7567 seconds +AUTOTUNE bmm(8x1x64, 8x64x34) + triton_bmm_7388 0.0065 ms 100.0% + triton_bmm_7389 0.0065 ms 100.0% + triton_bmm_7390 0.0069 ms 93.8% + triton_bmm_7391 0.0070 ms 93.1% + triton_bmm_7386 0.0070 ms 92.7% + triton_bmm_7387 0.0070 ms 92.3% + triton_bmm_7392 0.0075 ms 86.4% + triton_bmm_7393 0.0075 ms 86.4% + bmm 0.0078 ms 82.9% + triton_bmm_7394 0.0080 ms 81.2% +SingleProcess AUTOTUNE takes 3.3023 seconds +AUTOTUNE bmm(8x1x64, 8x64x35) + triton_bmm_7622 0.0065 ms 100.0% + triton_bmm_7625 0.0068 ms 96.2% + triton_bmm_7621 0.0070 ms 92.3% + triton_bmm_7623 0.0070 ms 92.3% + triton_bmm_7624 0.0070 ms 92.3% + triton_bmm_7626 0.0075 ms 86.4% + triton_bmm_7620 0.0076 ms 86.0% + triton_bmm_7628 0.0080 ms 81.0% + triton_bmm_7627 0.0080 ms 80.9% + bmm 0.0081 ms 80.6% +SingleProcess AUTOTUNE takes 2.8817 seconds +AUTOTUNE bmm(8x1x64, 8x64x36) + triton_bmm_7855 0.0065 ms 100.0% + triton_bmm_7858 0.0065 ms 100.0% + triton_bmm_7856 0.0069 ms 94.2% + triton_bmm_7854 0.0070 ms 92.7% + triton_bmm_7857 0.0071 ms 91.9% + triton_bmm_7859 0.0074 ms 87.7% + triton_bmm_7862 0.0075 ms 86.8% + triton_bmm_7860 0.0075 ms 86.4% + bmm 0.0079 ms 82.2% + triton_bmm_7861 0.0080 ms 80.9% +SingleProcess AUTOTUNE takes 4.3445 seconds +AUTOTUNE bmm(8x1x64, 8x64x37) + triton_bmm_8093 0.0068 ms 100.0% + triton_bmm_8088 0.0070 ms 96.3% + triton_bmm_8092 0.0070 ms 95.9% + triton_bmm_8091 0.0071 ms 95.5% + triton_bmm_8090 0.0072 ms 94.2% + triton_bmm_8089 0.0073 ms 93.0% + bmm 0.0075 ms 89.8% + triton_bmm_8094 0.0075 ms 89.8% + triton_bmm_8096 0.0080 ms 84.1% + triton_bmm_8095 0.0080 ms 83.9% +SingleProcess AUTOTUNE takes 2.9899 seconds +AUTOTUNE bmm(8x1x64, 8x64x38) + triton_bmm_8323 0.0065 ms 100.0% + triton_bmm_8324 0.0065 ms 100.0% + triton_bmm_8325 0.0065 ms 100.0% + triton_bmm_8327 0.0068 ms 95.8% + triton_bmm_8326 0.0070 ms 92.3% + triton_bmm_8328 0.0075 ms 86.2% + triton_bmm_8322 0.0076 ms 86.0% + triton_bmm_8330 0.0080 ms 81.4% + triton_bmm_8329 0.0080 ms 80.9% + bmm 0.0083 ms 78.4% +SingleProcess AUTOTUNE takes 3.1099 seconds +AUTOTUNE bmm(8x1x64, 8x64x39) + triton_bmm_8557 0.0065 ms 100.0% + triton_bmm_8559 0.0065 ms 100.0% + triton_bmm_8560 0.0065 ms 100.0% + triton_bmm_8561 0.0068 ms 96.2% + triton_bmm_8562 0.0070 ms 92.7% + triton_bmm_8558 0.0071 ms 91.9% + triton_bmm_8556 0.0071 ms 91.4% + triton_bmm_8564 0.0075 ms 86.8% + bmm 0.0075 ms 86.4% + triton_bmm_8563 0.0080 ms 80.9% +SingleProcess AUTOTUNE takes 2.9129 seconds +AUTOTUNE bmm(8x1x64, 8x64x40) + triton_bmm_8791 0.0065 ms 100.0% + triton_bmm_8793 0.0069 ms 93.5% + triton_bmm_8790 0.0070 ms 92.7% + triton_bmm_8796 0.0070 ms 92.7% + triton_bmm_8792 0.0070 ms 92.5% + triton_bmm_8794 0.0070 ms 92.3% + triton_bmm_8795 0.0073 ms 89.0% + triton_bmm_8798 0.0075 ms 86.8% + bmm 0.0075 ms 86.4% + triton_bmm_8797 0.0081 ms 80.6% +SingleProcess AUTOTUNE takes 2.7723 seconds +AUTOTUNE bmm(8x1x64, 8x64x41) + triton_bmm_9027 0.0065 ms 100.0% + triton_bmm_9026 0.0065 ms 99.5% + triton_bmm_9028 0.0065 ms 99.5% + triton_bmm_9029 0.0070 ms 93.1% + triton_bmm_9025 0.0071 ms 91.0% + triton_bmm_9030 0.0075 ms 86.4% + triton_bmm_9032 0.0076 ms 85.8% + triton_bmm_9031 0.0077 ms 83.9% + triton_bmm_9024 0.0078 ms 83.5% + bmm 0.0082 ms 79.0% +SingleProcess AUTOTUNE takes 2.8265 seconds +AUTOTUNE bmm(8x1x64, 8x64x42) + triton_bmm_9259 0.0065 ms 100.0% + triton_bmm_9262 0.0065 ms 100.0% + triton_bmm_9260 0.0070 ms 92.3% + triton_bmm_9261 0.0070 ms 92.3% + triton_bmm_9258 0.0072 ms 90.6% + triton_bmm_9263 0.0075 ms 86.8% + triton_bmm_9264 0.0075 ms 86.4% + triton_bmm_9265 0.0075 ms 86.4% + triton_bmm_9266 0.0080 ms 80.9% + bmm 0.0083 ms 78.1% +SingleProcess AUTOTUNE takes 2.9419 seconds +AUTOTUNE bmm(8x1x64, 8x64x43) + triton_bmm_9495 0.0065 ms 100.0% + triton_bmm_9494 0.0065 ms 99.5% + triton_bmm_9497 0.0070 ms 93.1% + triton_bmm_9496 0.0072 ms 90.2% + triton_bmm_9493 0.0073 ms 89.4% + triton_bmm_9498 0.0075 ms 86.4% + triton_bmm_9499 0.0077 ms 83.9% + triton_bmm_9492 0.0078 ms 83.5% + triton_bmm_9500 0.0080 ms 80.9% + bmm 0.0083 ms 78.7% +SingleProcess AUTOTUNE takes 2.8685 seconds +AUTOTUNE bmm(8x1x64, 8x64x44) + triton_bmm_9727 0.0065 ms 100.0% + triton_bmm_9730 0.0065 ms 100.0% + triton_bmm_9729 0.0065 ms 99.5% + triton_bmm_9726 0.0070 ms 92.7% + triton_bmm_9728 0.0070 ms 92.3% + triton_bmm_9732 0.0075 ms 86.9% + triton_bmm_9734 0.0075 ms 86.8% + triton_bmm_9731 0.0075 ms 86.4% + triton_bmm_9733 0.0075 ms 86.2% + bmm 0.0083 ms 78.4% +SingleProcess AUTOTUNE takes 2.7175 seconds +AUTOTUNE bmm(8x1x64, 8x64x45) + triton_bmm_9962 0.0065 ms 100.0% + triton_bmm_9963 0.0065 ms 100.0% + triton_bmm_9964 0.0065 ms 100.0% + triton_bmm_9966 0.0070 ms 93.2% + triton_bmm_9961 0.0072 ms 91.1% + triton_bmm_9965 0.0075 ms 86.8% + triton_bmm_9968 0.0075 ms 86.8% + bmm 0.0076 ms 86.1% + triton_bmm_9967 0.0077 ms 84.3% + triton_bmm_9960 0.0078 ms 84.0% +SingleProcess AUTOTUNE takes 3.0126 seconds +AUTOTUNE bmm(8x1x64, 8x64x46) + triton_bmm_10200 0.0068 ms 100.0% + triton_bmm_10195 0.0070 ms 95.9% + triton_bmm_10196 0.0070 ms 95.9% + triton_bmm_10198 0.0070 ms 95.9% + triton_bmm_10197 0.0073 ms 93.0% + triton_bmm_10199 0.0075 ms 90.2% + triton_bmm_10194 0.0077 ms 87.9% + bmm 0.0078 ms 86.8% + triton_bmm_10202 0.0080 ms 84.1% + triton_bmm_10201 0.0081 ms 83.1% +SingleProcess AUTOTUNE takes 3.0387 seconds +AUTOTUNE bmm(8x1x64, 8x64x47) + triton_bmm_10432 0.0065 ms 100.0% + triton_bmm_10429 0.0067 ms 98.1% + triton_bmm_10433 0.0068 ms 96.2% + triton_bmm_10434 0.0070 ms 93.2% + triton_bmm_10431 0.0071 ms 91.5% + triton_bmm_10430 0.0072 ms 90.7% + triton_bmm_10428 0.0072 ms 90.3% + triton_bmm_10436 0.0077 ms 84.6% + bmm 0.0079 ms 82.6% + triton_bmm_10435 0.0083 ms 78.8% +SingleProcess AUTOTUNE takes 2.9979 seconds +AUTOTUNE bmm(8x1x64, 8x64x48) + triton_bmm_10666 0.0065 ms 100.0% + triton_bmm_10665 0.0065 ms 99.5% + triton_bmm_10667 0.0069 ms 94.4% + triton_bmm_10663 0.0070 ms 92.3% + triton_bmm_10664 0.0070 ms 92.3% + triton_bmm_10670 0.0075 ms 86.8% + bmm 0.0075 ms 86.4% + triton_bmm_10668 0.0075 ms 86.4% + triton_bmm_10662 0.0076 ms 86.0% + triton_bmm_10669 0.0082 ms 79.0% +SingleProcess AUTOTUNE takes 2.8037 seconds +AUTOTUNE bmm(8x1x64, 8x64x49) + triton_bmm_10899 0.0065 ms 100.0% + triton_bmm_10897 0.0067 ms 97.4% + triton_bmm_10900 0.0067 ms 96.7% + triton_bmm_10902 0.0070 ms 92.7% + triton_bmm_10896 0.0072 ms 89.8% + triton_bmm_10898 0.0073 ms 89.4% + triton_bmm_10901 0.0075 ms 86.6% + bmm 0.0077 ms 83.9% + triton_bmm_10904 0.0080 ms 80.9% + triton_bmm_10903 0.0082 ms 79.5% +SingleProcess AUTOTUNE takes 2.7900 seconds +AUTOTUNE bmm(8x1x64, 8x64x50) + triton_bmm_11135 0.0070 ms 100.0% + triton_bmm_11136 0.0070 ms 100.0% + triton_bmm_11131 0.0071 ms 99.1% + triton_bmm_11132 0.0071 ms 99.1% + triton_bmm_11134 0.0071 ms 99.1% + triton_bmm_11133 0.0071 ms 98.2% + triton_bmm_11138 0.0075 ms 93.2% + triton_bmm_11137 0.0077 ms 90.9% + triton_bmm_11130 0.0078 ms 90.1% + bmm 0.0082 ms 85.9% +SingleProcess AUTOTUNE takes 2.7869 seconds +AUTOTUNE bmm(8x1x64, 8x64x51) + triton_bmm_11367 0.0065 ms 100.0% + triton_bmm_11365 0.0067 ms 97.1% + triton_bmm_11370 0.0070 ms 93.2% + triton_bmm_11366 0.0072 ms 90.7% + triton_bmm_11364 0.0072 ms 90.3% + triton_bmm_11368 0.0073 ms 89.9% + triton_bmm_11372 0.0075 ms 86.8% + triton_bmm_11371 0.0077 ms 84.3% + triton_bmm_11369 0.0078 ms 84.0% + bmm 0.0083 ms 78.8% +SingleProcess AUTOTUNE takes 3.0432 seconds +AUTOTUNE bmm(8x1x64, 8x64x52) + triton_bmm_11600 0.0065 ms 100.0% + triton_bmm_11601 0.0065 ms 99.5% + triton_bmm_11604 0.0070 ms 93.1% + triton_bmm_11599 0.0071 ms 91.9% + triton_bmm_11602 0.0071 ms 91.9% + triton_bmm_11603 0.0075 ms 86.4% + triton_bmm_11598 0.0077 ms 84.6% + triton_bmm_11606 0.0080 ms 80.9% + bmm 0.0081 ms 80.2% + triton_bmm_11605 0.0082 ms 79.0% +SingleProcess AUTOTUNE takes 2.6902 seconds +AUTOTUNE bmm(8x1x64, 8x64x53) + triton_bmm_11835 0.0065 ms 100.0% + triton_bmm_11836 0.0067 ms 97.1% + triton_bmm_11832 0.0072 ms 90.3% + triton_bmm_11833 0.0073 ms 89.9% + triton_bmm_11834 0.0073 ms 89.9% + triton_bmm_11838 0.0075 ms 87.2% + triton_bmm_11837 0.0076 ms 86.4% + triton_bmm_11840 0.0082 ms 79.5% + triton_bmm_11839 0.0082 ms 79.4% + bmm 0.0083 ms 79.1% +SingleProcess AUTOTUNE takes 2.9321 seconds +AUTOTUNE bmm(8x1x64, 8x64x54) + triton_bmm_12070 0.0065 ms 100.0% + triton_bmm_12071 0.0070 ms 92.7% + triton_bmm_12072 0.0070 ms 92.7% + triton_bmm_12069 0.0071 ms 91.9% + triton_bmm_12068 0.0072 ms 90.4% + triton_bmm_12067 0.0072 ms 90.0% + triton_bmm_12066 0.0072 ms 89.8% + triton_bmm_12073 0.0077 ms 83.9% + bmm 0.0080 ms 80.9% + triton_bmm_12074 0.0080 ms 80.9% +SingleProcess AUTOTUNE takes 2.8412 seconds +AUTOTUNE bmm(8x1x64, 8x64x55) + triton_bmm_12302 0.0067 ms 100.0% + triton_bmm_12303 0.0071 ms 94.6% + triton_bmm_12304 0.0073 ms 92.1% + triton_bmm_12301 0.0073 ms 91.9% + triton_bmm_12305 0.0076 ms 88.6% + triton_bmm_12306 0.0076 ms 88.6% + triton_bmm_12300 0.0078 ms 86.0% + triton_bmm_12308 0.0081 ms 82.9% + bmm 0.0082 ms 82.0% + triton_bmm_12307 0.0083 ms 80.7% +SingleProcess AUTOTUNE takes 2.7878 seconds +AUTOTUNE bmm(8x1x64, 8x64x56) + triton_bmm_12535 0.0065 ms 100.0% + triton_bmm_12538 0.0070 ms 92.3% + triton_bmm_12537 0.0071 ms 91.9% + triton_bmm_12536 0.0071 ms 91.0% + triton_bmm_12534 0.0072 ms 89.8% + triton_bmm_12542 0.0075 ms 86.8% + triton_bmm_12540 0.0075 ms 86.4% + bmm 0.0076 ms 86.0% + triton_bmm_12539 0.0077 ms 84.2% + triton_bmm_12541 0.0077 ms 83.9% +SingleProcess AUTOTUNE takes 3.0356 seconds +AUTOTUNE bmm(8x1x64, 8x64x57) + triton_bmm_12769 0.0067 ms 100.0% + triton_bmm_12770 0.0068 ms 99.5% + triton_bmm_12772 0.0068 ms 99.5% + triton_bmm_12771 0.0073 ms 92.5% + triton_bmm_12776 0.0075 ms 89.4% + triton_bmm_12774 0.0076 ms 89.0% + bmm 0.0077 ms 86.8% + triton_bmm_12775 0.0077 ms 86.8% + triton_bmm_12768 0.0078 ms 86.4% + triton_bmm_12773 0.0078 ms 86.4% +SingleProcess AUTOTUNE takes 3.0242 seconds +AUTOTUNE bmm(8x1x64, 8x64x58) + triton_bmm_13003 0.0065 ms 100.0% + triton_bmm_13006 0.0065 ms 100.0% + triton_bmm_13008 0.0070 ms 92.7% + triton_bmm_13005 0.0072 ms 90.6% + triton_bmm_13004 0.0072 ms 89.6% + triton_bmm_13007 0.0073 ms 89.4% + triton_bmm_13009 0.0077 ms 83.9% + bmm 0.0078 ms 83.5% + triton_bmm_13002 0.0078 ms 83.5% + triton_bmm_13010 0.0080 ms 80.9% +SingleProcess AUTOTUNE takes 3.1632 seconds +AUTOTUNE bmm(8x1x64, 8x64x59) + triton_bmm_13240 0.0067 ms 100.0% + triton_bmm_13237 0.0068 ms 99.8% + triton_bmm_13238 0.0068 ms 99.8% + triton_bmm_13241 0.0070 ms 96.1% + triton_bmm_13242 0.0070 ms 96.1% + triton_bmm_13239 0.0072 ms 93.3% + triton_bmm_13243 0.0077 ms 87.0% + triton_bmm_13236 0.0078 ms 86.6% + triton_bmm_13244 0.0081 ms 83.5% + bmm 0.0083 ms 81.3% +SingleProcess AUTOTUNE takes 2.8681 seconds +AUTOTUNE bmm(8x1x64, 8x64x60) + triton_bmm_13471 0.0065 ms 100.0% + triton_bmm_13472 0.0065 ms 100.0% + triton_bmm_13473 0.0065 ms 99.5% + triton_bmm_13474 0.0071 ms 91.4% + triton_bmm_13470 0.0072 ms 89.8% + triton_bmm_13475 0.0073 ms 89.4% + triton_bmm_13478 0.0075 ms 86.8% + triton_bmm_13476 0.0075 ms 86.4% + triton_bmm_13477 0.0077 ms 83.9% + bmm 0.0091 ms 71.6% +SingleProcess AUTOTUNE takes 3.4641 seconds +AUTOTUNE bmm(8x1x64, 8x64x61) + triton_bmm_13706 0.0067 ms 100.0% + triton_bmm_13707 0.0071 ms 94.2% + triton_bmm_13708 0.0073 ms 92.5% + triton_bmm_13705 0.0073 ms 92.1% + triton_bmm_13710 0.0075 ms 89.4% + triton_bmm_13709 0.0076 ms 89.0% + triton_bmm_13712 0.0076 ms 89.0% + bmm 0.0077 ms 86.8% + triton_bmm_13711 0.0077 ms 86.8% + triton_bmm_13704 0.0078 ms 86.4% +SingleProcess AUTOTUNE takes 2.8086 seconds +AUTOTUNE bmm(8x1x64, 8x64x62) + triton_bmm_13940 0.0065 ms 100.0% + triton_bmm_13942 0.0065 ms 100.0% + triton_bmm_13941 0.0067 ms 97.6% + triton_bmm_13944 0.0070 ms 92.7% + triton_bmm_13939 0.0070 ms 92.3% + triton_bmm_13938 0.0072 ms 89.8% + triton_bmm_13946 0.0075 ms 86.8% + triton_bmm_13943 0.0078 ms 83.5% + triton_bmm_13945 0.0083 ms 78.7% + bmm 0.0092 ms 70.2% +SingleProcess AUTOTUNE takes 2.7584 seconds +AUTOTUNE bmm(8x1x64, 8x64x63) + triton_bmm_14173 0.0067 ms 100.0% + triton_bmm_14174 0.0067 ms 100.0% + triton_bmm_14175 0.0072 ms 93.3% + triton_bmm_14172 0.0072 ms 92.5% + triton_bmm_14177 0.0072 ms 92.5% + triton_bmm_14176 0.0073 ms 92.1% + triton_bmm_14178 0.0076 ms 88.6% + triton_bmm_14179 0.0077 ms 86.4% + bmm 0.0080 ms 83.3% + triton_bmm_14180 0.0080 ms 83.3% +SingleProcess AUTOTUNE takes 2.8972 seconds +AUTOTUNE bmm(8x1x64, 8x64x64) + triton_bmm_14408 0.0065 ms 100.0% + triton_bmm_14410 0.0065 ms 100.0% + triton_bmm_14409 0.0065 ms 99.5% + triton_bmm_14411 0.0070 ms 93.1% + triton_bmm_14407 0.0070 ms 92.5% + triton_bmm_14414 0.0075 ms 86.8% + bmm 0.0075 ms 86.4% + triton_bmm_14412 0.0075 ms 86.4% + triton_bmm_14406 0.0077 ms 84.4% + triton_bmm_14413 0.0082 ms 79.0% +SingleProcess AUTOTUNE takes 2.8081 seconds +[2023-12-13 04:02:57,755] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (1000) +[2023-12-13 04:02:57,755] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py:1645) +[2023-12-13 04:02:57,755] torch._dynamo.convert_frame: [WARNING] last reason: ___check_obj_id(L['past_key_values'], 7628576) # mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length # miniconda3/envs/pytorch/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py:1026 in forward +[2023-12-13 04:02:57,755] torch._dynamo.convert_frame: [WARNING] To log all recompilation reasons, use TORCH_LOGS="recompiles". +[2023-12-13 04:02:57,755] torch._dynamo.convert_frame: [WARNING] To diagnose recompilation issues, see https://pytorch.org/docs/master/compile/troubleshooting.html. +AUTOTUNE bmm(8x1x64, 8x64x65) + triton_bmm_14642 0.0068 ms 100.0% + triton_bmm_14641 0.0069 ms 97.7% + triton_bmm_14643 0.0070 ms 96.3% + triton_bmm_14649 0.0070 ms 96.3% + triton_bmm_14644 0.0071 ms 95.5% + triton_bmm_14640 0.0072 ms 93.4% + triton_bmm_14646 0.0073 ms 93.0% + triton_bmm_14645 0.0075 ms 89.8% + triton_bmm_14648 0.0075 ms 89.8% + triton_bmm_14651 0.0077 ms 87.9% +SingleProcess AUTOTUNE takes 3.8513 seconds +AUTOTUNE bmm(8x1x64, 8x64x66) + triton_bmm_14894 0.0068 ms 100.0% + triton_bmm_14893 0.0070 ms 96.8% + triton_bmm_14897 0.0070 ms 96.3% + triton_bmm_14901 0.0070 ms 96.3% + triton_bmm_14898 0.0071 ms 95.0% + triton_bmm_14892 0.0072 ms 93.4% + triton_bmm_14896 0.0073 ms 93.0% + triton_bmm_14895 0.0075 ms 89.8% + triton_bmm_14903 0.0075 ms 89.8% + triton_bmm_14900 0.0076 ms 89.4% +SingleProcess AUTOTUNE takes 4.0328 seconds +AUTOTUNE bmm(8x1x64, 8x64x67) + triton_bmm_15148 0.0068 ms 100.0% + triton_bmm_15150 0.0068 ms 100.0% + triton_bmm_15147 0.0070 ms 96.3% + triton_bmm_15152 0.0070 ms 96.3% + triton_bmm_15146 0.0073 ms 93.0% + triton_bmm_15149 0.0073 ms 92.5% + triton_bmm_15145 0.0074 ms 91.5% + triton_bmm_15153 0.0075 ms 89.8% + triton_bmm_15144 0.0076 ms 88.3% + triton_bmm_15155 0.0077 ms 87.6% +SingleProcess AUTOTUNE takes 3.9828 seconds +AUTOTUNE bmm(8x1x64, 8x64x68) + triton_bmm_15400 0.0065 ms 100.0% + triton_bmm_15401 0.0068 ms 96.7% + triton_bmm_15398 0.0072 ms 91.1% + triton_bmm_15402 0.0072 ms 90.9% + triton_bmm_15396 0.0072 ms 90.3% + triton_bmm_15397 0.0073 ms 89.5% + triton_bmm_15399 0.0075 ms 86.8% + triton_bmm_15404 0.0076 ms 86.4% + triton_bmm_15406 0.0077 ms 84.3% + bmm 0.0078 ms 84.0% +SingleProcess AUTOTUNE takes 3.6582 seconds +AUTOTUNE bmm(8x1x64, 8x64x69) + triton_bmm_15650 0.0068 ms 100.0% + triton_bmm_15654 0.0068 ms 100.0% + triton_bmm_15656 0.0070 ms 96.3% + triton_bmm_15652 0.0072 ms 93.4% + triton_bmm_15653 0.0073 ms 93.0% + triton_bmm_15649 0.0074 ms 90.9% + triton_bmm_15655 0.0075 ms 89.8% + triton_bmm_15651 0.0076 ms 89.4% + triton_bmm_15657 0.0076 ms 89.4% + bmm 0.0077 ms 87.2% +SingleProcess AUTOTUNE takes 3.7310 seconds +AUTOTUNE bmm(8x1x64, 8x64x70) + triton_bmm_15905 0.0068 ms 100.0% + triton_bmm_15906 0.0068 ms 100.0% + triton_bmm_15903 0.0070 ms 96.3% + triton_bmm_15900 0.0072 ms 93.4% + triton_bmm_15902 0.0073 ms 93.0% + triton_bmm_15904 0.0073 ms 93.0% + triton_bmm_15908 0.0073 ms 93.0% + triton_bmm_15901 0.0075 ms 89.8% + triton_bmm_15909 0.0075 ms 89.8% + triton_bmm_15910 0.0077 ms 87.2% +SingleProcess AUTOTUNE takes 3.7299 seconds +AUTOTUNE bmm(8x1x64, 8x64x71) + triton_bmm_16157 0.0067 ms 100.0% + triton_bmm_16155 0.0070 ms 95.9% + triton_bmm_16158 0.0072 ms 93.7% + triton_bmm_16152 0.0073 ms 92.5% + triton_bmm_16156 0.0073 ms 92.5% + triton_bmm_16154 0.0073 ms 92.1% + triton_bmm_16153 0.0075 ms 90.1% + triton_bmm_16161 0.0076 ms 89.0% + bmm 0.0077 ms 86.8% + triton_bmm_16163 0.0077 ms 86.8% +SingleProcess AUTOTUNE takes 4.0343 seconds +AUTOTUNE bmm(8x1x64, 8x64x72) + triton_bmm_16405 0.0068 ms 100.0% + triton_bmm_16409 0.0068 ms 100.0% + triton_bmm_16413 0.0070 ms 96.3% + triton_bmm_16408 0.0072 ms 94.2% + triton_bmm_16404 0.0072 ms 93.4% + triton_bmm_16410 0.0072 ms 93.4% + triton_bmm_16406 0.0073 ms 93.0% + triton_bmm_16411 0.0075 ms 90.2% + triton_bmm_16415 0.0075 ms 90.2% + triton_bmm_16407 0.0075 ms 89.8% +SingleProcess AUTOTUNE takes 3.6317 seconds +AUTOTUNE bmm(8x1x64, 8x64x73) + triton_bmm_16658 0.0068 ms 100.0% + triton_bmm_16661 0.0068 ms 100.0% + triton_bmm_16660 0.0072 ms 93.8% + triton_bmm_16662 0.0073 ms 93.0% + triton_bmm_16657 0.0074 ms 90.9% + triton_bmm_16663 0.0075 ms 89.8% + triton_bmm_16659 0.0076 ms 89.0% + triton_bmm_16665 0.0077 ms 87.4% + triton_bmm_16666 0.0077 ms 87.2% + triton_bmm_16656 0.0078 ms 86.8% +SingleProcess AUTOTUNE takes 3.9306 seconds +AUTOTUNE bmm(8x1x64, 8x64x74) + triton_bmm_16914 0.0067 ms 100.0% + triton_bmm_16909 0.0070 ms 95.9% + triton_bmm_16911 0.0070 ms 95.9% + triton_bmm_16908 0.0072 ms 92.9% + triton_bmm_16910 0.0073 ms 92.5% + triton_bmm_16912 0.0073 ms 92.3% + triton_bmm_16913 0.0073 ms 91.7% + triton_bmm_16915 0.0075 ms 89.7% + triton_bmm_16917 0.0075 ms 89.4% + triton_bmm_16919 0.0075 ms 89.4% +SingleProcess AUTOTUNE takes 3.6884 seconds +AUTOTUNE bmm(8x1x64, 8x64x75) + triton_bmm_17161 0.0070 ms 100.0% + triton_bmm_17168 0.0070 ms 99.5% + triton_bmm_17162 0.0073 ms 96.0% + triton_bmm_17165 0.0073 ms 96.0% + triton_bmm_17166 0.0073 ms 96.0% + triton_bmm_17164 0.0073 ms 95.6% + triton_bmm_17169 0.0076 ms 92.4% + triton_bmm_17163 0.0076 ms 92.0% + triton_bmm_17167 0.0076 ms 91.2% + triton_bmm_17160 0.0078 ms 89.3% +SingleProcess AUTOTUNE takes 3.7450 seconds +AUTOTUNE bmm(8x1x64, 8x64x76) + triton_bmm_17416 0.0067 ms 100.0% + triton_bmm_17413 0.0068 ms 98.6% + triton_bmm_17417 0.0068 ms 98.6% + triton_bmm_17415 0.0070 ms 95.4% + triton_bmm_17420 0.0070 ms 95.0% + triton_bmm_17412 0.0072 ms 92.0% + triton_bmm_17418 0.0073 ms 91.6% + triton_bmm_17414 0.0073 ms 91.2% + triton_bmm_17419 0.0075 ms 88.9% + triton_bmm_17421 0.0075 ms 88.3% +SingleProcess AUTOTUNE takes 4.1662 seconds +AUTOTUNE bmm(8x1x64, 8x64x77) + triton_bmm_17670 0.0068 ms 100.0% + triton_bmm_17665 0.0070 ms 96.8% + triton_bmm_17673 0.0070 ms 96.3% + triton_bmm_17666 0.0072 ms 94.2% + triton_bmm_17669 0.0072 ms 93.6% + triton_bmm_17668 0.0072 ms 93.4% + triton_bmm_17664 0.0073 ms 93.0% + triton_bmm_17667 0.0076 ms 89.4% + triton_bmm_17672 0.0076 ms 89.4% + triton_bmm_17674 0.0077 ms 87.2% +SingleProcess AUTOTUNE takes 3.6443 seconds +AUTOTUNE bmm(8x1x64, 8x64x78) + triton_bmm_17920 0.0067 ms 100.0% + triton_bmm_17921 0.0068 ms 99.5% + triton_bmm_17919 0.0070 ms 95.9% + triton_bmm_17925 0.0070 ms 95.9% + triton_bmm_17916 0.0072 ms 92.9% + triton_bmm_17922 0.0072 ms 92.7% + triton_bmm_17918 0.0073 ms 92.5% + triton_bmm_17924 0.0073 ms 92.5% + triton_bmm_17917 0.0075 ms 89.4% + triton_bmm_17927 0.0077 ms 87.1% +SingleProcess AUTOTUNE takes 3.9520 seconds +AUTOTUNE bmm(8x1x64, 8x64x79) + triton_bmm_18170 0.0068 ms 100.0% + triton_bmm_18172 0.0068 ms 100.0% + triton_bmm_18168 0.0073 ms 93.0% + triton_bmm_18173 0.0073 ms 92.5% + triton_bmm_18174 0.0073 ms 92.5% + triton_bmm_18169 0.0075 ms 90.2% + triton_bmm_18176 0.0076 ms 89.4% + triton_bmm_18177 0.0076 ms 89.4% + triton_bmm_18175 0.0077 ms 87.6% + triton_bmm_18179 0.0077 ms 87.6% +SingleProcess AUTOTUNE takes 3.6560 seconds +AUTOTUNE bmm(8x1x64, 8x64x80) + triton_bmm_18424 0.0067 ms 100.0% + triton_bmm_18425 0.0068 ms 99.5% + triton_bmm_18423 0.0070 ms 96.3% + triton_bmm_18429 0.0070 ms 95.9% + triton_bmm_18422 0.0073 ms 92.5% + triton_bmm_18426 0.0073 ms 92.5% + triton_bmm_18421 0.0073 ms 92.1% + bmm 0.0075 ms 89.7% + triton_bmm_18427 0.0075 ms 89.7% + triton_bmm_18428 0.0076 ms 89.0% +SingleProcess AUTOTUNE takes 4.1266 seconds +AUTOTUNE bmm(8x1x64, 8x64x81) + triton_bmm_18677 0.0068 ms 100.0% + triton_bmm_18675 0.0070 ms 96.3% + triton_bmm_18680 0.0070 ms 96.3% + triton_bmm_18681 0.0070 ms 96.3% + triton_bmm_18676 0.0071 ms 94.6% + triton_bmm_18674 0.0073 ms 93.0% + triton_bmm_18678 0.0073 ms 93.0% + triton_bmm_18673 0.0074 ms 91.7% + triton_bmm_18679 0.0076 ms 88.3% + triton_bmm_18683 0.0077 ms 87.6% +SingleProcess AUTOTUNE takes 4.2893 seconds +AUTOTUNE bmm(8x1x64, 8x64x82) + triton_bmm_18930 0.0067 ms 100.0% + triton_bmm_18926 0.0068 ms 99.5% + triton_bmm_18932 0.0070 ms 95.9% + triton_bmm_18933 0.0070 ms 95.9% + triton_bmm_18924 0.0072 ms 92.9% + triton_bmm_18928 0.0073 ms 92.5% + triton_bmm_18929 0.0073 ms 91.9% + triton_bmm_18925 0.0075 ms 90.1% + triton_bmm_18927 0.0075 ms 89.7% + triton_bmm_18934 0.0077 ms 86.8% +SingleProcess AUTOTUNE takes 3.7136 seconds +AUTOTUNE bmm(8x1x64, 8x64x83) + triton_bmm_19182 0.0068 ms 100.0% + triton_bmm_19185 0.0070 ms 96.3% + triton_bmm_19176 0.0073 ms 93.0% + triton_bmm_19178 0.0073 ms 92.5% + triton_bmm_19180 0.0073 ms 92.5% + triton_bmm_19181 0.0074 ms 91.7% + triton_bmm_19177 0.0075 ms 89.8% + triton_bmm_19179 0.0076 ms 89.4% + triton_bmm_19184 0.0076 ms 89.4% + triton_bmm_19186 0.0077 ms 87.2% +SingleProcess AUTOTUNE takes 3.8975 seconds +AUTOTUNE bmm(8x1x64, 8x64x84) + triton_bmm_19432 0.0066 ms 100.0% + triton_bmm_19433 0.0068 ms 98.1% + triton_bmm_19437 0.0070 ms 94.5% + triton_bmm_19434 0.0073 ms 91.2% + triton_bmm_19429 0.0073 ms 90.8% + triton_bmm_19430 0.0073 ms 90.8% + triton_bmm_19431 0.0075 ms 88.1% + triton_bmm_19436 0.0076 ms 87.7% + triton_bmm_19439 0.0076 ms 87.7% + triton_bmm_19438 0.0077 ms 85.5% +SingleProcess AUTOTUNE takes 3.7738 seconds +AUTOTUNE bmm(8x1x64, 8x64x85) + triton_bmm_19685 0.0068 ms 100.0% + triton_bmm_19686 0.0068 ms 100.0% + triton_bmm_19683 0.0070 ms 96.3% + triton_bmm_19689 0.0070 ms 96.3% + triton_bmm_19682 0.0072 ms 93.8% + triton_bmm_19684 0.0073 ms 93.0% + triton_bmm_19681 0.0074 ms 90.8% + triton_bmm_19688 0.0076 ms 89.4% + triton_bmm_19687 0.0076 ms 89.0% + bmm 0.0077 ms 87.2% +SingleProcess AUTOTUNE takes 3.9245 seconds +AUTOTUNE bmm(8x1x64, 8x64x86) + triton_bmm_19938 0.0067 ms 100.0% + triton_bmm_19934 0.0068 ms 99.5% + triton_bmm_19935 0.0070 ms 95.9% + triton_bmm_19941 0.0070 ms 95.9% + triton_bmm_19932 0.0072 ms 92.9% + triton_bmm_19936 0.0073 ms 92.5% + triton_bmm_19937 0.0074 ms 90.9% + triton_bmm_19933 0.0075 ms 89.4% + triton_bmm_19940 0.0076 ms 89.0% + triton_bmm_19943 0.0077 ms 87.1% +SingleProcess AUTOTUNE takes 3.9348 seconds +AUTOTUNE bmm(8x1x64, 8x64x87) + triton_bmm_20186 0.0068 ms 100.0% + triton_bmm_20188 0.0068 ms 100.0% + triton_bmm_20189 0.0068 ms 100.0% + triton_bmm_20190 0.0068 ms 100.0% + triton_bmm_20192 0.0070 ms 96.3% + triton_bmm_20184 0.0073 ms 93.0% + triton_bmm_20185 0.0075 ms 89.8% + triton_bmm_20187 0.0075 ms 89.6% + triton_bmm_20193 0.0076 ms 89.2% + triton_bmm_20195 0.0077 ms 87.6% +SingleProcess AUTOTUNE takes 4.2210 seconds +AUTOTUNE bmm(8x1x64, 8x64x88) + triton_bmm_20437 0.0068 ms 100.0% + triton_bmm_20438 0.0068 ms 100.0% + triton_bmm_20441 0.0068 ms 100.0% + triton_bmm_20442 0.0068 ms 100.0% + bmm 0.0070 ms 96.3% + triton_bmm_20444 0.0070 ms 96.3% + triton_bmm_20445 0.0070 ms 96.3% + triton_bmm_20436 0.0072 ms 93.4% + triton_bmm_20440 0.0073 ms 92.5% + triton_bmm_20443 0.0075 ms 90.2% +SingleProcess AUTOTUNE takes 4.2627 seconds +AUTOTUNE bmm(8x1x64, 8x64x89) + triton_bmm_20692 0.0068 ms 100.0% + triton_bmm_20689 0.0070 ms 96.8% + triton_bmm_20697 0.0070 ms 96.3% + triton_bmm_20696 0.0072 ms 93.4% + triton_bmm_20688 0.0073 ms 93.0% + triton_bmm_20690 0.0073 ms 93.0% + triton_bmm_20694 0.0073 ms 93.0% + triton_bmm_20693 0.0073 ms 92.5% + triton_bmm_20695 0.0075 ms 89.8% + triton_bmm_20691 0.0076 ms 89.4% +SingleProcess AUTOTUNE takes 3.9110 seconds +AUTOTUNE bmm(8x1x64, 8x64x90) + triton_bmm_20941 0.0070 ms 100.0% + triton_bmm_20949 0.0070 ms 100.0% + triton_bmm_20942 0.0071 ms 98.2% + triton_bmm_20946 0.0072 ms 97.8% + triton_bmm_20944 0.0073 ms 96.1% + triton_bmm_20945 0.0075 ms 93.6% + triton_bmm_20947 0.0075 ms 93.6% + triton_bmm_20943 0.0076 ms 92.8% + triton_bmm_20950 0.0077 ms 90.5% + triton_bmm_20940 0.0078 ms 90.1% +SingleProcess AUTOTUNE takes 3.9448 seconds +AUTOTUNE bmm(8x1x64, 8x64x91) + triton_bmm_21194 0.0068 ms 100.0% + triton_bmm_21196 0.0068 ms 100.0% + triton_bmm_21197 0.0068 ms 100.0% + triton_bmm_21198 0.0068 ms 100.0% + triton_bmm_21193 0.0070 ms 96.8% + triton_bmm_21195 0.0070 ms 96.3% + triton_bmm_21200 0.0072 ms 93.4% + triton_bmm_21201 0.0076 ms 89.4% + bmm 0.0077 ms 87.2% + triton_bmm_21192 0.0078 ms 86.8% +SingleProcess AUTOTUNE takes 3.7594 seconds +AUTOTUNE bmm(8x1x64, 8x64x92) + triton_bmm_21450 0.0067 ms 100.0% + triton_bmm_21446 0.0067 ms 99.5% + triton_bmm_21448 0.0067 ms 99.5% + triton_bmm_21445 0.0068 ms 99.1% + triton_bmm_21449 0.0068 ms 98.6% + triton_bmm_21444 0.0072 ms 92.5% + triton_bmm_21447 0.0075 ms 88.9% + triton_bmm_21453 0.0075 ms 88.9% + triton_bmm_21454 0.0077 ms 86.4% + triton_bmm_21452 0.0078 ms 86.0% +SingleProcess AUTOTUNE takes 3.7424 seconds +AUTOTUNE bmm(8x1x64, 8x64x93) + triton_bmm_21700 0.0068 ms 100.0% + triton_bmm_21697 0.0070 ms 96.8% + triton_bmm_21704 0.0070 ms 96.3% + triton_bmm_21705 0.0070 ms 96.3% + triton_bmm_21698 0.0073 ms 93.0% + triton_bmm_21702 0.0073 ms 93.0% + triton_bmm_21701 0.0073 ms 92.5% + triton_bmm_21699 0.0076 ms 89.4% + triton_bmm_21707 0.0077 ms 87.6% + triton_bmm_21696 0.0078 ms 86.5% +SingleProcess AUTOTUNE takes 3.6845 seconds +AUTOTUNE bmm(8x1x64, 8x64x94) + triton_bmm_21954 0.0068 ms 100.0% + triton_bmm_21951 0.0070 ms 96.3% + triton_bmm_21956 0.0072 ms 93.4% + triton_bmm_21950 0.0073 ms 93.0% + triton_bmm_21952 0.0073 ms 93.0% + triton_bmm_21953 0.0074 ms 90.9% + triton_bmm_21949 0.0075 ms 90.6% + triton_bmm_21957 0.0075 ms 89.8% + triton_bmm_21959 0.0076 ms 89.4% + triton_bmm_21948 0.0078 ms 86.8% +SingleProcess AUTOTUNE takes 3.5713 seconds +AUTOTUNE bmm(8x1x64, 8x64x95) + triton_bmm_22202 0.0068 ms 100.0% + triton_bmm_22204 0.0068 ms 100.0% + triton_bmm_22201 0.0070 ms 96.8% + triton_bmm_22203 0.0070 ms 96.3% + triton_bmm_22209 0.0070 ms 96.3% + triton_bmm_22208 0.0072 ms 93.4% + triton_bmm_22205 0.0073 ms 92.5% + triton_bmm_22206 0.0073 ms 92.5% + triton_bmm_22211 0.0077 ms 87.6% + triton_bmm_22200 0.0078 ms 86.8% +SingleProcess AUTOTUNE takes 3.6200 seconds +AUTOTUNE bmm(8x1x64, 8x64x96) + triton_bmm_22460 0.0070 ms 100.0% + triton_bmm_22454 0.0072 ms 97.1% + triton_bmm_22452 0.0072 ms 96.9% + triton_bmm_22453 0.0073 ms 96.5% + triton_bmm_22456 0.0073 ms 96.3% + triton_bmm_22458 0.0073 ms 96.1% + triton_bmm_22457 0.0074 ms 94.4% + triton_bmm_22459 0.0075 ms 93.6% + triton_bmm_22463 0.0075 ms 93.6% + triton_bmm_22455 0.0075 ms 93.2% +SingleProcess AUTOTUNE takes 3.9048 seconds +AUTOTUNE bmm(8x1x64, 8x64x97) + triton_bmm_22708 0.0068 ms 100.0% + triton_bmm_22709 0.0068 ms 100.0% + triton_bmm_22710 0.0068 ms 100.0% + triton_bmm_22706 0.0072 ms 93.4% + triton_bmm_22704 0.0073 ms 93.0% + triton_bmm_22705 0.0074 ms 90.9% + triton_bmm_22712 0.0076 ms 89.4% + triton_bmm_22713 0.0077 ms 87.6% + bmm 0.0077 ms 87.2% + triton_bmm_22707 0.0077 ms 87.2% +SingleProcess AUTOTUNE takes 4.1204 seconds +AUTOTUNE bmm(8x1x64, 8x64x98) + triton_bmm_22960 0.0068 ms 100.0% + triton_bmm_22961 0.0068 ms 100.0% + triton_bmm_22962 0.0068 ms 100.0% + triton_bmm_22958 0.0073 ms 92.7% + triton_bmm_22957 0.0075 ms 90.4% + triton_bmm_22963 0.0075 ms 90.2% + triton_bmm_22959 0.0075 ms 89.6% + triton_bmm_22964 0.0077 ms 87.9% + triton_bmm_22965 0.0078 ms 86.8% + triton_bmm_22956 0.0078 ms 86.5% +SingleProcess AUTOTUNE takes 3.9070 seconds +AUTOTUNE bmm(8x1x64, 8x64x99) + triton_bmm_23212 0.0068 ms 100.0% + triton_bmm_23213 0.0068 ms 100.0% + triton_bmm_23211 0.0070 ms 96.3% + triton_bmm_23214 0.0072 ms 93.4% + triton_bmm_23210 0.0073 ms 93.0% + triton_bmm_23209 0.0075 ms 90.0% + triton_bmm_23215 0.0075 ms 89.8% + triton_bmm_23217 0.0076 ms 88.8% + bmm 0.0077 ms 87.2% + triton_bmm_23218 0.0077 ms 87.2% +SingleProcess AUTOTUNE takes 3.9095 seconds +AUTOTUNE bmm(8x1x64, 8x64x100) + triton_bmm_23462 0.0067 ms 100.0% + triton_bmm_23464 0.0067 ms 100.0% + triton_bmm_23468 0.0070 ms 95.9% + triton_bmm_23469 0.0070 ms 95.9% + triton_bmm_23466 0.0072 ms 92.9% + triton_bmm_23465 0.0073 ms 92.5% + triton_bmm_23461 0.0073 ms 92.1% + triton_bmm_23463 0.0075 ms 89.7% + triton_bmm_23470 0.0077 ms 86.8% + bmm 0.0078 ms 86.4% +SingleProcess AUTOTUNE takes 3.6961 seconds +AUTOTUNE bmm(8x1x100, 8x100x64) + triton_bmm_23475 0.0068 ms 100.0% + triton_bmm_23477 0.0068 ms 100.0% + triton_bmm_23474 0.0070 ms 96.3% + triton_bmm_23473 0.0072 ms 93.4% + triton_bmm_23476 0.0075 ms 89.8% + triton_bmm_23472 0.0082 ms 82.1% + triton_bmm_23478 0.0084 ms 80.5% + triton_bmm_23479 0.0095 ms 71.3% + triton_bmm_23480 0.0097 ms 69.4% + bmm 0.0545 ms 12.4% +SingleProcess AUTOTUNE takes 2.9612 seconds +AUTOTUNE bmm(8x1x64, 8x64x101) + triton_bmm_23714 0.0068 ms 100.0% + triton_bmm_23715 0.0070 ms 96.3% + triton_bmm_23720 0.0070 ms 96.3% + triton_bmm_23721 0.0070 ms 96.3% + triton_bmm_23712 0.0073 ms 93.0% + triton_bmm_23716 0.0073 ms 93.0% + triton_bmm_23717 0.0073 ms 93.0% + triton_bmm_23718 0.0073 ms 93.0% + triton_bmm_23713 0.0075 ms 89.8% + triton_bmm_23723 0.0077 ms 87.2% +SingleProcess AUTOTUNE takes 3.8360 seconds +AUTOTUNE bmm(8x1x101, 8x101x64) + triton_bmm_23727 0.0080 ms 100.0% + triton_bmm_23728 0.0083 ms 96.9% + triton_bmm_23729 0.0083 ms 96.5% + triton_bmm_23725 0.0085 ms 94.3% + triton_bmm_23730 0.0085 ms 94.3% + triton_bmm_23726 0.0092 ms 87.0% + triton_bmm_23724 0.0095 ms 84.2% + triton_bmm_23732 0.0097 ms 82.2% + triton_bmm_23731 0.0115 ms 69.8% + bmm 0.0533 ms 15.0% +SingleProcess AUTOTUNE takes 3.2004 seconds +AUTOTUNE bmm(8x1x64, 8x64x102) + triton_bmm_23968 0.0068 ms 100.0% + triton_bmm_23965 0.0070 ms 96.3% + triton_bmm_23967 0.0070 ms 96.3% + triton_bmm_23970 0.0072 ms 93.4% + triton_bmm_23964 0.0073 ms 93.0% + triton_bmm_23966 0.0073 ms 92.5% + triton_bmm_23969 0.0073 ms 92.5% + triton_bmm_23971 0.0075 ms 90.2% + triton_bmm_23973 0.0075 ms 89.8% + triton_bmm_23975 0.0077 ms 87.6% +SingleProcess AUTOTUNE takes 3.7057 seconds +AUTOTUNE bmm(8x1x102, 8x102x64) + triton_bmm_23980 0.0070 ms 100.0% + triton_bmm_23978 0.0070 ms 99.5% + triton_bmm_23979 0.0075 ms 93.0% + triton_bmm_23981 0.0075 ms 92.8% + triton_bmm_23977 0.0078 ms 89.7% + triton_bmm_23976 0.0083 ms 84.5% + triton_bmm_23982 0.0085 ms 82.3% + triton_bmm_23983 0.0095 ms 73.6% + triton_bmm_23984 0.0103 ms 67.9% + bmm 0.0554 ms 12.6% +SingleProcess AUTOTUNE takes 3.5287 seconds +AUTOTUNE bmm(8x1x64, 8x64x103) + triton_bmm_24220 0.0068 ms 100.0% + triton_bmm_24221 0.0068 ms 100.0% + triton_bmm_24219 0.0070 ms 96.3% + triton_bmm_24224 0.0070 ms 95.9% + triton_bmm_24216 0.0073 ms 93.0% + triton_bmm_24218 0.0073 ms 92.5% + triton_bmm_24222 0.0073 ms 92.5% + triton_bmm_24217 0.0075 ms 90.4% + triton_bmm_24225 0.0076 ms 89.0% + triton_bmm_24223 0.0076 ms 88.7% +SingleProcess AUTOTUNE takes 4.0321 seconds +AUTOTUNE bmm(8x1x103, 8x103x64) + triton_bmm_24232 0.0084 ms 100.0% + triton_bmm_24233 0.0084 ms 100.0% + triton_bmm_24234 0.0085 ms 98.9% + triton_bmm_24231 0.0087 ms 96.7% + triton_bmm_24229 0.0087 ms 96.3% + triton_bmm_24230 0.0092 ms 90.8% + triton_bmm_24228 0.0097 ms 86.2% + triton_bmm_24236 0.0097 ms 86.2% + triton_bmm_24235 0.0120 ms 70.1% + bmm 0.0842 ms 10.0% +SingleProcess AUTOTUNE takes 2.9023 seconds +AUTOTUNE bmm(8x1x64, 8x64x104) + triton_bmm_24477 0.0070 ms 100.0% + triton_bmm_24470 0.0072 ms 96.9% + triton_bmm_24468 0.0073 ms 96.5% + triton_bmm_24472 0.0073 ms 96.3% + triton_bmm_24473 0.0073 ms 96.1% + triton_bmm_24474 0.0073 ms 96.1% + triton_bmm_24469 0.0073 ms 95.6% + triton_bmm_24471 0.0074 ms 94.2% + triton_bmm_24475 0.0075 ms 93.6% + triton_bmm_24479 0.0075 ms 93.2% +SingleProcess AUTOTUNE takes 3.8513 seconds +AUTOTUNE bmm(8x1x104, 8x104x64) + triton_bmm_24482 0.0070 ms 100.0% + triton_bmm_24483 0.0070 ms 100.0% + triton_bmm_24485 0.0073 ms 96.3% + triton_bmm_24484 0.0075 ms 93.6% + triton_bmm_24486 0.0076 ms 92.0% + triton_bmm_24481 0.0078 ms 90.1% + triton_bmm_24480 0.0088 ms 79.9% + triton_bmm_24488 0.0097 ms 72.0% + triton_bmm_24487 0.0100 ms 70.2% + bmm 0.0545 ms 12.9% +SingleProcess AUTOTUNE takes 2.9516 seconds +AUTOTUNE bmm(8x1x64, 8x64x105) + triton_bmm_24724 0.0068 ms 100.0% + triton_bmm_24726 0.0068 ms 100.0% + triton_bmm_24722 0.0073 ms 92.5% + triton_bmm_24725 0.0073 ms 92.5% + triton_bmm_24721 0.0075 ms 89.8% + triton_bmm_24729 0.0076 ms 89.4% + triton_bmm_24723 0.0076 ms 89.2% + triton_bmm_24727 0.0077 ms 87.6% + triton_bmm_24730 0.0077 ms 87.2% + triton_bmm_24731 0.0077 ms 87.2% +SingleProcess AUTOTUNE takes 3.6594 seconds +AUTOTUNE bmm(8x1x105, 8x105x64) + triton_bmm_24735 0.0083 ms 100.0% + triton_bmm_24736 0.0083 ms 100.0% + triton_bmm_24737 0.0085 ms 97.4% + triton_bmm_24738 0.0085 ms 97.0% + triton_bmm_24733 0.0092 ms 89.6% + triton_bmm_24734 0.0092 ms 89.3% + triton_bmm_24732 0.0102 ms 80.6% + triton_bmm_24740 0.0102 ms 80.6% + triton_bmm_24739 0.0122 ms 67.6% + bmm 0.0557 ms 14.8% +SingleProcess AUTOTUNE takes 3.1760 seconds +AUTOTUNE bmm(8x1x64, 8x64x106) + triton_bmm_24974 0.0068 ms 100.0% + triton_bmm_24975 0.0070 ms 96.3% + triton_bmm_24980 0.0070 ms 96.3% + triton_bmm_24972 0.0073 ms 93.0% + triton_bmm_24976 0.0073 ms 92.5% + triton_bmm_24977 0.0073 ms 92.5% + triton_bmm_24978 0.0073 ms 92.5% + triton_bmm_24973 0.0075 ms 90.0% + triton_bmm_24981 0.0077 ms 87.9% + triton_bmm_24979 0.0080 ms 84.1% +SingleProcess AUTOTUNE takes 3.7596 seconds +AUTOTUNE bmm(8x1x106, 8x106x64) + triton_bmm_24989 0.0068 ms 100.0% + triton_bmm_24988 0.0068 ms 99.5% + triton_bmm_24987 0.0070 ms 96.8% + triton_bmm_24986 0.0075 ms 90.2% + triton_bmm_24985 0.0077 ms 87.6% + triton_bmm_24990 0.0085 ms 80.0% + triton_bmm_24984 0.0088 ms 77.1% + triton_bmm_24991 0.0095 ms 71.4% + triton_bmm_24992 0.0100 ms 68.2% + bmm 0.0554 ms 12.2% +SingleProcess AUTOTUNE takes 2.9435 seconds +AUTOTUNE bmm(8x1x64, 8x64x107) + triton_bmm_25228 0.0068 ms 100.0% + triton_bmm_25227 0.0070 ms 96.3% + triton_bmm_25226 0.0071 ms 95.0% + triton_bmm_25229 0.0072 ms 93.2% + triton_bmm_25230 0.0073 ms 92.5% + triton_bmm_25225 0.0075 ms 89.8% + triton_bmm_25233 0.0075 ms 89.8% + triton_bmm_25232 0.0076 ms 88.7% + triton_bmm_25234 0.0078 ms 86.8% + triton_bmm_25224 0.0080 ms 84.4% +SingleProcess AUTOTUNE takes 4.1386 seconds +AUTOTUNE bmm(8x1x107, 8x107x64) + triton_bmm_25241 0.0083 ms 100.0% + triton_bmm_25239 0.0088 ms 94.7% + triton_bmm_25240 0.0090 ms 92.5% + triton_bmm_25242 0.0091 ms 91.7% + triton_bmm_25237 0.0092 ms 89.8% + triton_bmm_25238 0.0092 ms 89.8% + triton_bmm_25244 0.0097 ms 85.4% + triton_bmm_25236 0.0103 ms 80.8% + triton_bmm_25243 0.0122 ms 67.9% + bmm 0.0562 ms 14.8% +SingleProcess AUTOTUNE takes 2.8882 seconds +AUTOTUNE bmm(8x1x64, 8x64x108) + triton_bmm_25477 0.0068 ms 100.0% + triton_bmm_25478 0.0068 ms 100.0% + triton_bmm_25481 0.0068 ms 100.0% + triton_bmm_25479 0.0070 ms 96.8% + triton_bmm_25482 0.0072 ms 93.4% + triton_bmm_25480 0.0073 ms 92.5% + triton_bmm_25483 0.0075 ms 90.2% + triton_bmm_25485 0.0075 ms 89.8% + triton_bmm_25484 0.0076 ms 89.4% + triton_bmm_25486 0.0077 ms 87.2% +SingleProcess AUTOTUNE takes 3.9128 seconds +AUTOTUNE bmm(8x1x108, 8x108x64) + triton_bmm_25493 0.0073 ms 100.0% + triton_bmm_25492 0.0073 ms 99.6% + triton_bmm_25491 0.0074 ms 99.1% + triton_bmm_25490 0.0076 ms 96.6% + triton_bmm_25489 0.0078 ms 93.8% + triton_bmm_25488 0.0088 ms 82.6% + triton_bmm_25494 0.0090 ms 80.9% + triton_bmm_25495 0.0097 ms 75.2% + triton_bmm_25496 0.0105 ms 69.5% + bmm 0.0542 ms 13.5% +SingleProcess AUTOTUNE takes 3.0091 seconds +AUTOTUNE bmm(8x1x64, 8x64x109) + triton_bmm_25730 0.0068 ms 100.0% + triton_bmm_25732 0.0068 ms 100.0% + triton_bmm_25733 0.0068 ms 100.0% + triton_bmm_25729 0.0070 ms 96.8% + triton_bmm_25737 0.0070 ms 95.9% + triton_bmm_25728 0.0073 ms 93.0% + triton_bmm_25734 0.0073 ms 92.5% + triton_bmm_25731 0.0077 ms 87.9% + triton_bmm_25735 0.0077 ms 87.9% + bmm 0.0077 ms 87.2% +SingleProcess AUTOTUNE takes 3.7687 seconds +AUTOTUNE bmm(8x1x109, 8x109x64) + triton_bmm_25745 0.0079 ms 100.0% + triton_bmm_25746 0.0087 ms 90.8% + triton_bmm_25741 0.0087 ms 90.5% + triton_bmm_25743 0.0090 ms 87.9% + triton_bmm_25744 0.0090 ms 87.7% + triton_bmm_25742 0.0092 ms 85.5% + triton_bmm_25748 0.0097 ms 81.2% + triton_bmm_25740 0.0103 ms 76.9% + triton_bmm_25747 0.0120 ms 65.8% + bmm 0.0552 ms 14.3% +SingleProcess AUTOTUNE takes 2.9547 seconds +AUTOTUNE bmm(8x1x64, 8x64x110) + triton_bmm_25982 0.0068 ms 100.0% + triton_bmm_25984 0.0068 ms 100.0% + triton_bmm_25985 0.0068 ms 100.0% + triton_bmm_25981 0.0070 ms 96.3% + triton_bmm_25983 0.0070 ms 96.3% + triton_bmm_25988 0.0070 ms 96.3% + triton_bmm_25989 0.0070 ms 96.3% + triton_bmm_25986 0.0072 ms 94.0% + triton_bmm_25980 0.0073 ms 93.0% + triton_bmm_25987 0.0075 ms 90.2% +SingleProcess AUTOTUNE takes 3.7493 seconds +AUTOTUNE bmm(8x1x110, 8x110x64) + triton_bmm_25997 0.0068 ms 100.0% + triton_bmm_25996 0.0069 ms 98.1% + triton_bmm_25995 0.0070 ms 97.2% + triton_bmm_25993 0.0073 ms 93.4% + triton_bmm_25994 0.0076 ms 89.8% + triton_bmm_25998 0.0085 ms 80.0% + triton_bmm_25992 0.0090 ms 75.4% + triton_bmm_25999 0.0097 ms 69.7% + triton_bmm_26000 0.0100 ms 68.2% + bmm 0.0511 ms 13.3% +SingleProcess AUTOTUNE takes 2.9350 seconds +AUTOTUNE bmm(8x1x64, 8x64x111) + triton_bmm_26234 0.0068 ms 100.0% + triton_bmm_26236 0.0068 ms 100.0% + triton_bmm_26237 0.0068 ms 100.0% + triton_bmm_26238 0.0073 ms 92.1% + triton_bmm_26233 0.0075 ms 89.8% + triton_bmm_26235 0.0076 ms 89.4% + triton_bmm_26239 0.0077 ms 87.6% + triton_bmm_26241 0.0078 ms 86.8% + triton_bmm_26242 0.0078 ms 86.8% + triton_bmm_26232 0.0078 ms 86.5% +SingleProcess AUTOTUNE takes 3.8685 seconds +AUTOTUNE bmm(8x1x111, 8x111x64) + triton_bmm_26248 0.0083 ms 100.0% + triton_bmm_26249 0.0083 ms 99.6% + triton_bmm_26246 0.0087 ms 94.9% + triton_bmm_26250 0.0087 ms 94.9% + triton_bmm_26245 0.0087 ms 94.5% + triton_bmm_26247 0.0088 ms 94.2% + triton_bmm_26244 0.0103 ms 80.4% + triton_bmm_26251 0.0122 ms 67.7% + triton_bmm_26252 0.0125 ms 66.0% + bmm 0.0516 ms 16.0% +SingleProcess AUTOTUNE takes 3.8353 seconds +TIMEOUT + loading model: 0it [00:00, ?it/s] loading model: 0it [00:16, ?it/s] +hf_T5_large +cuda eval hf_T5_large int4weightonly-bs1-acc +pass_due_to_skip + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +hf_Whisper +cuda eval hf_Whisper int4weightonly-bs1-acc +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +WARNING:root:hf_clip failed to load +hf_clip +Original Error: 'str' object has no attribute 'shape' +Eager model failed to run +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1931, in validate_model + self.model_iter_fn(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/transformers/models/clip/modeling_clip.py", line 1142, in forward + vision_outputs = self.vision_model( + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/transformers/models/clip/modeling_clip.py", line 883, in forward + hidden_states = self.embeddings(pixel_values) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/transformers/models/clip/modeling_clip.py", line 194, in forward + batch_size = pixel_values.shape[0] +AttributeError: 'str' object has no attribute 'shape' + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 476, in load_model + self.validate_model(model, example_inputs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1934, in validate_model + raise NotImplementedError("Eager model failed to run") from e +NotImplementedError: Eager model failed to run + + loading model: 0it [00:00, ?it/s] loading model: 0it [00:01, ?it/s] +lennard_jones +cuda eval lennard_jones int4weightonly-bs1-acc +pass-sqnr-26.612 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:01, ?it/s] +llama +cuda eval llama int4weightonly-bs1-acc +pass-sqnr-19.468 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:55, ?it/s] +llama_v2_7b_16h +cuda eval llama_v2_7b_16h int4weightonly-bs1-acc +pass_due_to_skip + loading model: 0it [00:00, ?it/s] loading model: 0it [00:01, ?it/s] +maml_omniglot +cuda eval maml_omniglot int4weightonly-bs1-acc +pass-sqnr-35.554 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +mnasnet1_0 +cuda eval mnasnet1_0 int4weightonly-bs1-acc +pass-sqnr-23.173 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +mobilenet_v2 +cuda eval mobilenet_v2 int4weightonly-bs1-acc +pass-sqnr-25.390 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:00, ?it/s] +WARNING:root:mobilenet_v2_quantized_qat failed to load +mobilenet_v2_quantized_qat +The eval test only supports CPU. +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 443, in load_model + benchmark = benchmark_cls( + File "/home/cdhernandez/local/benchmark/torchbenchmark/util/model.py", line 24, in __call__ + obj = type.__call__(cls, *args, **kwargs) + File "/home/cdhernandez/local/benchmark/torchbenchmark/models/mobilenet_v2_quantized_qat/__init__.py", line 21, in __init__ + raise NotImplementedError("The eval test only supports CPU.") +NotImplementedError: The eval test only supports CPU. + + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +mobilenet_v3_large +cuda eval mobilenet_v3_large int4weightonly-bs1-acc +pass-sqnr-19.524 + loading model: 0it [00:00, ?it/s]NCCL version 2.19.3+cuda12.0 + loading model: 0it [00:03, ?it/s] +moco +cuda eval moco int4weightonly-bs1-acc +WARNING:common:fp64 golden ref were not generated for moco. Setting accuracy check to cosine +ERROR:common:add_(): argument 'other' (position 1) must be Tensor, not NoneType +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 2156, in check_accuracy + correct_result = self.run_n_iterations( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1972, in run_n_iterations + self.model_iter_fn(mod, inputs, collect_outputs=False) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 540, in forward_pass + return mod(*inputs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/parallel/distributed.py", line 1523, in forward + else self._run_ddp_forward(*inputs, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/parallel/distributed.py", line 1359, in _run_ddp_forward + return self.module(*inputs, **kwargs) # type: ignore[index] + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/nn/modules/module.py", line 1520, in _call_impl + return forward_call(*args, **kwargs) + File "/home/cdhernandez/local/benchmark/torchbenchmark/models/moco/moco/builder.py", line 130, in forward + self._momentum_update_key_encoder() # update the key encoder + File "/home/cdhernandez/local/pytorch/torch/utils/_contextlib.py", line 115, in decorate_context + return func(*args, **kwargs) + File "/home/cdhernandez/local/benchmark/torchbenchmark/models/moco/moco/builder.py", line 50, in _momentum_update_key_encoder + param_k.mul_(self.m).add_(param_q.mul(1. - self.m)) +TypeError: add_(): argument 'other' (position 1) must be Tensor, not NoneType +eager_1st_run_fail + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +nanogpt +number of parameters: 123.69M +num decayed parameter tensors: 50, with 124,354,560 parameters +num non-decayed parameter tensors: 98, with 121,344 parameters +using fused AdamW: True +cuda eval nanogpt int4weightonly-bs1-acc +pass-sqnr-10.791 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +nvidia_deeprecommender +cuda eval nvidia_deeprecommender int4weightonly-bs1-acc +pass-sqnr-41.873 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:01, ?it/s] +opacus_cifar10 +cuda eval opacus_cifar10 int4weightonly-bs1-acc +pass-sqnr-24.190 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:26, ?it/s] +phi_1_5 +cuda eval phi_1_5 int4weightonly-bs1-acc +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:01, ?it/s] +phlippe_densenet +cuda eval phlippe_densenet int4weightonly-bs1-acc +pass-sqnr-20.121 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +phlippe_resnet +cuda eval phlippe_resnet int4weightonly-bs1-acc +pass-sqnr-26.786 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:00, ?it/s] +pyhpc_equation_of_state +cuda eval pyhpc_equation_of_state int4weightonly-bs1-acc +pass-sqnr-40.034 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:00, ?it/s] +pyhpc_isoneutral_mixing +cuda eval pyhpc_isoneutral_mixing int4weightonly-bs1-acc +skipping cudagraphs due to ['mutated inputs'] +pass-sqnr-error + loading model: 0it [00:00, ?it/s]WARNING:common:Model pyhpc_turbulent_kinetic_energy does not support bfloat16, running with amp instead + loading model: 0it [00:01, ?it/s] +WARNING:common:Model pyhpc_turbulent_kinetic_energy does not support bfloat16, running with amp instead +pyhpc_turbulent_kinetic_energy +cuda eval pyhpc_turbulent_kinetic_energy int4weightonly-bs1-acc +WARNING:common:Model pyhpc_turbulent_kinetic_energy does not support bfloat16, running with amp instead +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +pytorch_CycleGAN_and_pix2pix +cuda eval pytorch_CycleGAN_and_pix2pix int4weightonly-bs1-acc +pass-sqnr-33.538 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +pytorch_stargan +cuda eval pytorch_stargan int4weightonly-bs1-acc +pass-sqnr-41.851 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:01, ?it/s] +pytorch_unet +cuda eval pytorch_unet int4weightonly-bs1-acc +pass-sqnr-49.327 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +resnet152 +cuda eval resnet152 int4weightonly-bs1-acc +pass-sqnr-24.731 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +resnet18 +cuda eval resnet18 int4weightonly-bs1-acc +pass-sqnr-21.516 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +resnet50 +cuda eval resnet50 int4weightonly-bs1-acc +pass-sqnr-25.023 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:00, ?it/s] +WARNING:root:resnet50_quantized_qat failed to load +resnet50_quantized_qat +The eval test only supports CPU. +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 3559, in run + ) = runner.load_model( + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/torchbench.py", line 443, in load_model + benchmark = benchmark_cls( + File "/home/cdhernandez/local/benchmark/torchbenchmark/util/model.py", line 24, in __call__ + obj = type.__call__(cls, *args, **kwargs) + File "/home/cdhernandez/local/benchmark/torchbenchmark/models/resnet50_quantized_qat/__init__.py", line 21, in __init__ + raise NotImplementedError("The eval test only supports CPU.") +NotImplementedError: The eval test only supports CPU. + + loading model: 0it [00:00, ?it/s] loading model: 0it [00:04, ?it/s] +resnext50_32x4d +cuda eval resnext50_32x4d int4weightonly-bs1-acc +pass-sqnr-24.622 + loading model: 0it [00:00, ?it/s] loading model: 0it [00:10, ?it/s] +sam +cuda eval sam int4weightonly-bs1-acc +AUTOTUNE bmm(16x4096x80, 16x80x4096) + triton_bmm_344 0.4847 ms 100.0% + triton_bmm_350 0.5000 ms 97.0% + triton_bmm_345 0.5028 ms 96.4% + triton_bmm_343 0.5388 ms 90.0% + triton_bmm_346 0.5501 ms 88.1% + triton_bmm_347 0.5729 ms 84.6% + triton_bmm_353 0.5743 ms 84.4% + bmm 0.6196 ms 78.2% + triton_bmm_351 0.8544 ms 56.7% + triton_bmm_354 1.0133 ms 47.8% +SingleProcess AUTOTUNE takes 4.5704 seconds +AUTOTUNE bmm(16x4096x4096, 16x4096x80) + bmm 0.3902 ms 100.0% + triton_bmm_381 0.4673 ms 83.5% + triton_bmm_382 0.4826 ms 80.9% + triton_bmm_383 0.4874 ms 80.1% + triton_bmm_380 0.4958 ms 78.7% + triton_bmm_387 0.5712 ms 68.3% + triton_bmm_379 0.6339 ms 61.5% + triton_bmm_384 0.7409 ms 52.7% + triton_bmm_386 0.7414 ms 52.6% + triton_bmm_388 0.7688 ms 50.8% +SingleProcess AUTOTUNE takes 4.9125 seconds +AUTOTUNE bmm(1x4x32, 1x32x65536) + triton_bmm_1577 0.0109 ms 100.0% + triton_bmm_1578 0.0110 ms 99.7% + triton_bmm_1568 0.0111 ms 98.3% + triton_bmm_1575 0.0111 ms 98.3% + triton_bmm_1571 0.0112 ms 98.0% + triton_bmm_1572 0.0112 ms 98.0% + triton_bmm_1569 0.0113 ms 96.6% + triton_bmm_1570 0.0116 ms 94.5% + triton_bmm_1574 0.0116 ms 94.2% + triton_bmm_1576 0.0122 ms 90.0% +SingleProcess AUTOTUNE takes 3.3291 seconds +[2023-12-13 04:33:32,792] torch._dynamo.utils: [ERROR] Accuracy failed: uint8 tensor did not match +[2023-12-13 04:33:32,792] torch._dynamo.utils: [ERROR] Accuracy failed for key name masks +fail_accuracy-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:03, ?it/s] +shufflenet_v2_x1_0 +cuda eval shufflenet_v2_x1_0 int4weightonly-bs1-acc +ERROR:common:backend='inductor' raised: +LoweringException: AttributeError: 'SliceView' object has no attribute 'freeze_layout' + target: aten.convolution.default + args[0]: TensorBox( + SliceView( + View( + StorageBox( + ComputedBuffer(name='buf14', layout=FlexibleLayout('cuda', torch.bfloat16, size=[1, 58, 2, 28, 28], stride=[90944, 1568, 784, 28, 1]), data=Pointwise( + 'cuda', + torch.bfloat16, + def inner_fn(index): + _, i1, i2, i3, i4 = index + tmp0 = ops.load(buf13, i4 + 28 * i3 + 784 * i1 + 45472 * i2) + return tmp0 + , + ranges=[1, 58, 2, 28, 28], + origin_node=clone, + origins={clone} + )) + ), + size=[1, 116, 28, 28], + reindex=lambda i0, i1, i2, i3: [0, ModularIndexing(i1, 2, 58), ModularIndexing(i1, 1, 2), i2, i3], + origins={clone, view_1} + ), + size=[1, 58, 28, 28], + reindex=lambda i0, i1, i2, i3: [i0, i1 + 58, i2, i3], + origins={split} + ) + ) + args[1]: TensorBox(StorageBox( + InputBuffer(name='arg18_1', layout=FixedLayout('cuda', torch.bfloat16, size=[58, 58, 1, 1], stride=[58, 1, 1, 1])) + )) + args[2]: None + args[3]: [1, 1] + args[4]: [0, 0] + args[5]: [1, 1] + args[6]: False + args[7]: [0, 0] + args[8]: 1 + +Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information + + +You can suppress this exception and fall back to eager by setting: + import torch._dynamo + torch._dynamo.config.suppress_errors = True +Traceback (most recent call last): + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 2232, in check_accuracy + new_result = optimized_model_iter_fn(model_copy, example_inputs) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/eval_frame.py", line 488, in _fn + return fn(*args, **kwargs) + File "/home/cdhernandez/local/benchmark/userbenchmark/dynamo/dynamobench/common.py", line 1972, in run_n_iterations + self.model_iter_fn(mod, inputs, collect_outputs=False) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/eval_frame.py", line 654, in catch_errors + return callback(frame, cache_entry, hooks, frame_state) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/convert_frame.py", line 727, in _convert_frame + result = inner_convert(frame, cache_entry, hooks, frame_state) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/convert_frame.py", line 383, in _convert_frame_assert + compiled_product = _compile( + File "/home/cdhernandez/local/pytorch/torch/_dynamo/convert_frame.py", line 646, in _compile + guarded_code = compile_inner(code, one_graph, hooks, transform) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/utils.py", line 244, in time_wrapper + r = func(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/convert_frame.py", line 562, in compile_inner + out_code = transform_code_object(code, transform) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/bytecode_transformation.py", line 1033, in transform_code_object + transformations(instructions, code_options) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/convert_frame.py", line 151, in _fn + return fn(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/convert_frame.py", line 527, in transform + tracer.run() + File "/home/cdhernandez/local/pytorch/torch/_dynamo/symbolic_convert.py", line 2144, in run + super().run() + File "/home/cdhernandez/local/pytorch/torch/_dynamo/symbolic_convert.py", line 818, in run + and self.step() + File "/home/cdhernandez/local/pytorch/torch/_dynamo/symbolic_convert.py", line 781, in step + getattr(self, inst.opname)(inst) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/symbolic_convert.py", line 2259, in RETURN_VALUE + self.output.compile_subgraph( + File "/home/cdhernandez/local/pytorch/torch/_dynamo/output_graph.py", line 919, in compile_subgraph + self.compile_and_call_fx_graph(tx, list(reversed(stack_values)), root) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/contextlib.py", line 79, in inner + return func(*args, **kwds) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/output_graph.py", line 1087, in compile_and_call_fx_graph + compiled_fn = self.call_user_compiler(gm) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/utils.py", line 244, in time_wrapper + r = func(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/output_graph.py", line 1159, in call_user_compiler + raise BackendCompilerFailed(self.compiler_fn, e).with_traceback( + File "/home/cdhernandez/local/pytorch/torch/_dynamo/output_graph.py", line 1140, in call_user_compiler + compiled_fn = compiler_fn(gm, self.example_inputs()) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/repro/after_dynamo.py", line 117, in debug_wrapper + compiled_gm = compiler_fn(gm, example_inputs) + File "/home/cdhernandez/local/pytorch/torch/__init__.py", line 1672, in __call__ + return compile_fx(model_, inputs_, config_patches=self.config) + File "/home/cdhernandez/local/pytorch/torch/_inductor/compile_fx.py", line 952, in compile_fx + return compile_fx( + File "/home/cdhernandez/local/pytorch/torch/_inductor/compile_fx.py", line 1168, in compile_fx + return aot_autograd( + File "/home/cdhernandez/local/pytorch/torch/_dynamo/backends/common.py", line 55, in compiler_fn + cg = aot_module_simplified(gm, example_inputs, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_functorch/aot_autograd.py", line 887, in aot_module_simplified + compiled_fn = create_aot_dispatcher_function( + File "/home/cdhernandez/local/pytorch/torch/_dynamo/utils.py", line 244, in time_wrapper + r = func(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_functorch/aot_autograd.py", line 600, in create_aot_dispatcher_function + compiled_fn = compiler_fn(flat_fn, fake_flat_args, aot_config, fw_metadata=fw_metadata) + File "/home/cdhernandez/local/pytorch/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 425, in aot_wrapper_dedupe + return compiler_fn(flat_fn, leaf_flat_args, aot_config, fw_metadata=fw_metadata) + File "/home/cdhernandez/local/pytorch/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 630, in aot_wrapper_synthetic_base + return compiler_fn(flat_fn, flat_args, aot_config, fw_metadata=fw_metadata) + File "/home/cdhernandez/local/pytorch/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py", line 97, in aot_dispatch_base + compiled_fw = compiler(fw_module, updated_flat_args) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/utils.py", line 244, in time_wrapper + r = func(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_inductor/compile_fx.py", line 1100, in fw_compiler_base + return inner_compile( + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/contextlib.py", line 79, in inner + return func(*args, **kwds) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/repro/after_aot.py", line 83, in debug_wrapper + inner_compiled_fn = compiler_fn(gm, example_inputs) + File "/home/cdhernandez/local/pytorch/torch/_inductor/debug.py", line 305, in inner + return fn(*args, **kwargs) + File "/home/cdhernandez/local/miniconda3/envs/pytorch/lib/python3.10/contextlib.py", line 79, in inner + return func(*args, **kwds) + File "/home/cdhernandez/local/pytorch/torch/_inductor/compile_fx.py", line 320, in compile_fx_inner + compiled_graph = fx_codegen_and_compile( + File "/home/cdhernandez/local/pytorch/torch/_inductor/compile_fx.py", line 535, in fx_codegen_and_compile + graph.run(*example_inputs) + File "/home/cdhernandez/local/pytorch/torch/_dynamo/utils.py", line 244, in time_wrapper + r = func(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_inductor/graph.py", line 518, in run + return super().run(*args) + File "/home/cdhernandez/local/pytorch/torch/fx/interpreter.py", line 138, in run + self.env[node] = self.run_node(node) + File "/home/cdhernandez/local/pytorch/torch/_inductor/graph.py", line 815, in run_node + result = self.call_function(n.target, args, kwargs) + File "/home/cdhernandez/local/pytorch/torch/_inductor/graph.py", line 695, in call_function + raise LoweringException(e, target, args, kwargs).with_traceback( + File "/home/cdhernandez/local/pytorch/torch/_inductor/graph.py", line 692, in call_function + out = lowerings[target](*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_inductor/lowering.py", line 291, in wrapped + out = decomp_fn(*args, **kwargs) + File "/home/cdhernandez/local/pytorch/torch/_inductor/kernel/conv.py", line 363, in convolution + return convert_1x1_conv_to_mm(x, weight, bias) + File "/home/cdhernandez/local/pytorch/torch/_inductor/kernel/conv.py", line 280, in convert_1x1_conv_to_mm + x.freeze_layout() + File "/home/cdhernandez/local/pytorch/torch/_inductor/ir.py", line 6264, in __getattr__ + fn = getattr(self.data, name) +torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised: +LoweringException: AttributeError: 'SliceView' object has no attribute 'freeze_layout' + target: aten.convolution.default + args[0]: TensorBox( + SliceView( + View( + StorageBox( + ComputedBuffer(name='buf14', layout=FlexibleLayout('cuda', torch.bfloat16, size=[1, 58, 2, 28, 28], stride=[90944, 1568, 784, 28, 1]), data=Pointwise( + 'cuda', + torch.bfloat16, + def inner_fn(index): + _, i1, i2, i3, i4 = index + tmp0 = ops.load(buf13, i4 + 28 * i3 + 784 * i1 + 45472 * i2) + return tmp0 + , + ranges=[1, 58, 2, 28, 28], + origin_node=clone, + origins={clone} + )) + ), + size=[1, 116, 28, 28], + reindex=lambda i0, i1, i2, i3: [0, ModularIndexing(i1, 2, 58), ModularIndexing(i1, 1, 2), i2, i3], + origins={clone, view_1} + ), + size=[1, 58, 28, 28], + reindex=lambda i0, i1, i2, i3: [i0, i1 + 58, i2, i3], + origins={split} + ) + ) + args[1]: TensorBox(StorageBox( + InputBuffer(name='arg18_1', layout=FixedLayout('cuda', torch.bfloat16, size=[58, 58, 1, 1], stride=[58, 1, 1, 1])) + )) + args[2]: None + args[3]: [1, 1] + args[4]: [0, 0] + args[5]: [1, 1] + args[6]: False + args[7]: [0, 0] + args[8]: 1 + +Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information + + +You can suppress this exception and fall back to eager by setting: + import torch._dynamo + torch._dynamo.config.suppress_errors = True + +TorchDynamo optimized model failed to run because of following error +fail_to_run + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +soft_actor_critic +cuda eval soft_actor_critic int4weightonly-bs1-acc +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +speech_transformer +cuda eval speech_transformer int4weightonly-bs1-acc +skipping cudagraphs due to ['non-cuda device in graph'] +skipping cudagraphs due to ['non-cuda device in graph'] +pass-sqnr-error + loading model: 0it [00:00, ?it/s] loading model: 0it [00:02, ?it/s] +squeezenet1_1 +cuda eval squeezenet1_1 int4weightonly-bs1-acc +pass-sqnr-43.374 + loading model: 0it [00:00, ?it/s]stable_diffusion_text_encoder + + Loading pipeline components...: 0%| | 0/6 [00:00 16 + and (mod.in_features, mod.out_features) + not in [ + (768, 768), + ] + ) + +def quantize(device, model, args): + torch._dynamo.config.automatic_dynamic_shapes = False + torch._dynamo.config.force_parameter_static_shapes = False + torch._dynamo.config.cache_size_limit = 10000 + torch._inductor.config.epilogue_fusion = False + assert "cuda" in device + torch._inductor.config.force_fuse_int_mm_with_mul = True + torch._inductor.config.use_mixed_mm = True + swap_conv2d_1x1_to_linear(model) + if args.quantization=="int8dynamic": + change_linear_weights_to_int8_dqtensors(model, dynamic_quant_filter_fn) + elif args.quantization=="int8weightonly": + change_linear_weights_to_int8_woqtensors(model) + elif args.quantization=="int4weightonly": + change_linear_weights_to_int4_woqtensors(model) def main(runner, original_dir=None, args=None): if original_dir: @@ -3508,6 +3598,7 @@ def run(runner, args, original_dir=None): ) else: try: + print(model_name) with tqdm(desc="loading model"): extra_args = [] if hasattr(args, "rank") and hasattr(args, "world_size"): @@ -3548,37 +3639,69 @@ def run(runner, args, original_dir=None): batch_size=batch_size, extra_args=extra_args, ) - else: - print(model_name) - ( - device, - name, - model, - example_inputs, - batch_size, - ) = runner.load_model( - device, - model_name, - batch_size=batch_size, - extra_args=extra_args, - ) + # calculate SQNR + sqnr = None + if False and args.quantization: + ( + _, + name, + model, + example_inputs, + batch_size, + ) = runner.load_model( + device, + model_name, + batch_size=batch_size, + extra_args=extra_args, + ) + ref = runner.validate_model(model, example_inputs) + quantize(device, model, args) + act = runner.validate_model(model, example_inputs) + sqnr = get_sqnr(ref, act) + print("SQNR", sqnr) + + # find batchsize + if args.custom_find_batchsize: + batch_size = 128 res = None + while res is None and batch_size > 0: + print(f"trying batch_size {batch_size}") + try: + ( + device, + name, + model, + example_inputs, + _, + ) = runner.load_model( + device, + model_name, + batch_size=batch_size, + extra_args=extra_args, + ) + if args.quantization: + quantize(device, model, args) + res = runner.validate_model(model, example_inputs) + except: + res = None + batch_size = batch_size // 2 + else: + ( + device, + name, + model, + example_inputs, + batch_size, + ) = runner.load_model( + device, + model_name, + batch_size=batch_size, + extra_args=extra_args, + ) if args.quantization: - if args.accuracy: - res=model(*example_inputs) # to later calculate SQNR - - torch._dynamo.config.automatic_dynamic_shapes = False - torch._dynamo.config.force_parameter_static_shapes = False - torch._dynamo.config.cache_size_limit = 1000 - assert "cuda" in device - if args.quantization=="int8dynamic": - torch._inductor.config.force_fuse_int_mm_with_mul = True - change_linear_weights_to_int8_dqtensors(model) - elif args.quantization=="int8weightonly": - torch._inductor.config.use_mixed_mm = True - change_linear_weights_to_int8_woqtensors(model) - elif args.quantization=="int4weightonly": - change_linear_weights_to_int4_woqtensors(model) + quantize(device, model, args) + print(model_name, "batchsize", batch_size) + except NotImplementedError as e: print(e) @@ -3647,7 +3770,7 @@ def detect_and_mark_batch(t): experiment, explain=args.explain, tag=args.tag, - res=res, + sqnr=sqnr, ) if args.generate_aot_autograd_stats: stats_file = output_filename.split(".csv")[0] + "_stats.csv" diff --git a/userbenchmark/dynamo/dynamobench/torchbench.py b/userbenchmark/dynamo/dynamobench/torchbench.py index 3222762243..3c8e111349 100755 --- a/userbenchmark/dynamo/dynamobench/torchbench.py +++ b/userbenchmark/dynamo/dynamobench/torchbench.py @@ -265,8 +265,6 @@ def setup_torchbench_cwd(): "tts_angular", "pyhpc_turbulent_kinetic_energy", "detectron2_fcos_r_50_fpn", - "detectron2_fasterrcnn_r_101_dc5" - "detectron2_fasterrcnn_r_50_c4", "detectron2_fasterrcnn_r_101_c4", "detectron2_fasterrcnn_r_101_fpn", "detectron2_fasterrcnn_r_50_dc5", @@ -278,7 +276,11 @@ def setup_torchbench_cwd(): "demucs", } -FORCE_FP16_FOR_BF16_MODELS = {"vision_maskrcnn"} +FORCE_FP16_FOR_BF16_MODELS = { + "vision_maskrcnn", + "detectron2_fasterrcnn_r_101_dc5", + "detectron2_fasterrcnn_r_50_c4", +} # models in canary_models that we should run anyway CANARY_MODELS = { @@ -366,6 +368,7 @@ def load_model( part=None, extra_args=None, ): + breakpoint() if self.args.enable_activation_checkpointing: raise NotImplementedError( "Activation checkpointing not implemented for Torchbench models"