ROCm · aferoz21 · Sep 30, 2024 · Oct 1, 2024
@@ -3550,6 +3550,284 @@
  _WorkspaceSizePerElemBias: 0
  _WorkspaceSizePerElemC: 4
  _staggerStrideShift: 0
+ - 1LDSBuffer: 1
+ ActivationAlt: false
+ ActivationFuncCall: false
+ ActivationFused: true
+ AssertAIGreaterThanEqual: -1
+ AssertAILessThanEqual: -1
+ AssertFree0ElementMultiple: 1
+ AssertFree1ElementMultiple: 1
+ AssertSummationElementMultiple: 1
+ AssignedDerivedParameters: true
+ AssignedProblemIndependentDerivedParameters: true
+ BufferLoad: true
+ BufferStore: true
+ CUCount: null
+ ClusterLocalRead: 1
+ CodeObjectVersion: default
+ ConvertAfterDS: false
+ CustomKernelName: ''
+ DebugStreamK: 0
+ DepthU: 64
+ DirectToLds: false
+ DirectToLdsA: false
+ DirectToLdsB: false
+ DirectToVgprSparseMetadata: false
+ EdgeType: ShiftPtr
+ EnableF32XdlMathOp: false
+ EnableMatrixInstruction: true
+ ExpandPointerSwap: 0
+ ForceDisableShadowInit: false
+ GlobalReadPerMfma: 1
+ GlobalReadVectorWidthA: 8
+ GlobalReadVectorWidthB: 8
+ GlobalSplitU: 1
+ GlobalSplitUAlgorithm: MultipleBuffer
+ GlobalSplitUCoalesced: false
+ GlobalSplitUWorkGroupMappingRoundRobin: false
+ GlobalWriteVectorWidth: 2
+ GroupLoadStore: false
+ GuaranteeNoPartialA: false
+ GuaranteeNoPartialB: false
+ GuaranteeNoPartialMetadata: true
+ ISA: [9, 4, 2]
+ InnerUnroll: 1
+ InterleaveAlpha: 0
+ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true,
+ SupportUserGSU: true, UseUniversalArgs: true}
+ Kernel: true
+ KernelLanguage: Assembly
+ KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_UserArgs_MT32x64x64_MI16x16x1_SN_K1_LBSPPA256_LBSPPB512_LPA32_LPB16_MIWT2_1_NLCA1_NLCB1_SVW2_VWA2_VWB1_WG16_16_1
+ LSCA: 32
+ LSCB: 64
+ LSPA: 64
+ LSPB: 32
+ LVCA: 4
+ LVCB: 8
+ LVPA: 8
+ LVPB: 4
+ LdsBlockSizePerPadA: 256
+ LdsBlockSizePerPadB: 512
+ LdsBlockSizePerPadMetadata: 0
+ LdsBytesNoAmax: 13824
+ LdsInitCVgprs: false
+ LdsNumBytes: 13824
+ LdsNumElementsAlignedA: 5120
+ LdsNumElementsAlignedB: 8704
+ LdsNumElementsAlignedMetadata: 0
+ LdsOffsetA: 0
+ LdsOffsetA_Blk: 16384
+ LdsOffsetB: 5120
+ LdsOffsetB_Blk: 21504
+ LdsOffsetBias: 0
+ LdsOffsetBiasGSU: 0
+ LdsOffsetBiasNonGSU: 0
+ LdsOffsetMetadata: 13824
+ LdsOffsetMetadata_Blk: 21504
+ LdsPadA: 32
+ LdsPadB: 16
+ LdsPadMetadata: 0
+ LocalReadVectorWidth: 4
+ LocalSplitU: 1
+ LocalWritePerMfma: -1
+ LocalWriteUseSgprA: false
+ LocalWriteUseSgprB: false
+ LoopIters: 4
+ LoopUnroll: 64
+ MFMA_BF16_1K: false
+ MIArchVgpr: 1
+ MIBlock: [16, 16, 16, 1, 1, 1]
+ MIInputPerThread: 4
+ MIInputPerThreadA: 4
+ MIInputPerThreadB: 4
+ MIInputPerThreadMetadata: 4
+ MIOutputVectorWidth: 4
+ MIRegPerOut: 1
+ MIWaveGroup: [1, 4]
+ MIWaveTile: [2, 1]
+ MIWaveTileA: 2
+ MIWaveTileB: 1
+ MIWaveTileMetadata: 0
+ MacroTile0: 32
+ MacroTile1: 64
+ MacroTileA: 32
+ MacroTileB: 64
+ MagicDivAlg: 2
+ MatrixInstB: 1
+ MatrixInstBM: 1
+ MatrixInstBN: 1
+ MatrixInstK: 16
+ MatrixInstM: 16
+ MatrixInstN: 16
+ MatrixInstruction: [16, 16, 16, 1]
+ MaxOccupancy: 40
+ MaxVgprNumber: 256
+ MinVgprNumber: 0
+ NoLdsWriteCode: false
+ NoReject: false
+ NoTailLoop: false
+ NonTemporal: -1
+ NonTemporalA: 0
+ NonTemporalB: 0
+ NonTemporalC: 0
+ NonTemporalD: 0
+ NonTemporalE: 0
+ NonTemporalMetadata: 0
+ NonTemporalWS: 0
+ NumElementsPerBatchStore: 16
+ NumElementsPerThread: 8
+ NumGlobalWriteVectorsPerThread: 4
+ NumLoadsA: 1
+ NumLoadsB: 2
+ NumLoadsCoalescedA: 1
+ NumLoadsCoalescedB: 1
+ NumLoadsPerpendicularA: 1
+ NumLoadsPerpendicularB: 2
+ NumThreads: 256
+ OptNoLoadLoop: 1
+ PackedC0IdxChars: [I]
+ PackedC0IndicesX: [0]
+ PackedC1IdxChars: [J]
+ PackedC1IndicesX: [1]
+ PrefetchGlobalRead: 2
+ PrefetchLocalRead: 1
+ PreloadKernArgs: true
+ ProblemType:
+ Activation: false
+ ActivationComputeDataType: 0
+ ActivationNoGuard: false
+ ActivationType: none
+ AllowNoFreeDims: false
+ AssignedDerivedParameters: true
+ Batched: true
+ BetaOnlyUseBias: false
+ BiasDataTypeList: []
+ BiasSrc: D
+ ComplexConjugateA: false
+ ComplexConjugateB: false
+ ComputeDataType: 0
+ DataType: 4
+ DataTypeA: 4
+ DataTypeAmaxD: 0
+ DataTypeB: 4
+ DataTypeE: 4
+ DestDataType: 4
+ F32XdlMathOp: 0
+ Gradient: false
+ GroupedGemm: false
+ HighPrecisionAccumulate: true
+ Index0: 0
+ Index01A: 0
+ Index01B: 1
+ Index1: 1
+ IndexAssignmentsA: [0, 3, 2]
+ IndexAssignmentsB: [1, 3, 2]
+ IndexAssignmentsLD: [4, 5, 6, 7]
+ IndexAssignmentsMetadata: [3, 0, 2]
+ IndexUnroll: 3
+ IndexUnrollA: 1
+ IndexUnrollB: 1
+ IndexUnrollM: 0
+ IndicesBatch: [2]
+ IndicesFree: [0, 1]
+ IndicesSummation: [3]
+ MirrorDimsA: []
+ MirrorDimsB: []
+ MirrorDimsMetadata: []
+ NumIndicesBatch: 1
+ NumIndicesC: 3
+ NumIndicesFree: 2
+ NumIndicesLD: 4
+ NumIndicesSummation: 1
+ OperationType: GEMM
+ OutputAmaxD: false
+ SetConstStrideA: []
+ SetConstStrideB: []
+ SetConstStrideBias: []
+ SilentHighPrecisionAccumulate: false
+ Sparse: 0
+ StochasticRounding: false
+ StridedBatched: true
+ SupportUserArgs: true
+ TLUA: true
+ TLUB: true
+ Tensor0: 0
+ Tensor1: 1
+ TileA: 0
+ TileAwareSelection: false
+ TileB: 1
+ TotalIndices: 4
+ TransposeA: false
+ TransposeB: true
+ UseBeta: true
+ UseBias: 0
+ UseE: false
+ UseInitialStridesAB: false
+ UseInitialStridesCD: false
+ UseScaleAB: ''
+ UseScaleAlphaVec: 0
+ UseScaleCD: false
+ ScheduleGlobalRead: 1
+ ScheduleIterAlg: 3
+ ScheduleLocalWrite: 1
+ SolutionIndex: 13
+ SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_UserArgs_MT32x64x64_MI16x16x1_SN_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA256_LBSPPB512_LPA32_LPB16_MIWT2_1_NLCA1_NLCB1_SU0_SUM0_SUS0_SVW2_VWA2_VWB1_WG16_16_1_WGM1
+ SourceSwap: 1
+ StaggerU: 0
+ StaggerUMapping: 0
+ StaggerUStride: 0
+ StorePriorityOpt: 0
+ StoreRemapVectorWidth: 0
+ StoreSyncOpt: 0
+ StoreVectorWidth: 2
+ StreamK: 0
+ StreamKAtomic: 0
+ StreamKXCCMapping: 0
+ SubGroup0: 4
+ SubGroup1: 64
+ SubGroupA: 4
+ SubGroupB: 64
+ SuppressNoLoadLoop: false
+ ThreadTile: [1, 1]
+ ThreadTile0: 8
+ ThreadTile1: 1
+ ThreadTileA: 8
+ ThreadTileB: 1
+ TransposeLDS: 0
+ TransposeLDSMetadata: true
+ ULSGRODoubleG2L: 0
+ UnrollLoopSwapGlobalReadOrder: 0
+ UnrollMajorLDSA: 0
+ UnrollMajorLDSB: 0
+ UnrollMajorLDSMetadata: true
+ Use64bShadowLimit: 1
+ UseInstOffsetForGRO: 0
+ UseSgprForGRO: -1
+ Valid: true
+ VectorStore: -1
+ VectorWidthA: 2
+ VectorWidthB: 1
+ WaveSeparateGlobalReadA: 0
+ WaveSeparateGlobalReadB: 0
+ WaveSeparateGlobalReadMetadata: 0
+ WavefrontSize: 64
+ WorkGroup: [16, 16, 1]
+ WorkGroupMapping: 1
+ WorkGroupMappingXCC: 1
+ WorkGroupMappingXCCGroup: 0
+ WorkGroupReduction: false
+ WorkspaceCheck: [4, 0, 1]
+ _DepthU: 64
+ _DepthUA: 64
+ _DepthUB: 64
+ _DepthUMetadata: 64
+ _GlobalAccumulation: MultipleBuffer
+ _UseSgprForGRO: false
+ _VectorStore: 1
+ _WorkspaceSizePerElemBias: 0
+ _WorkspaceSizePerElemC: 4
+ _staggerStrideShift: 0
 - [2, 3, 0, 1]
 - - - [512, 512, 1, 512]
  - [0, 0.0]
@@ -3601,6 +3879,8 @@
  - [12, 0.0]
  - - [64, 178, 5604, 178]
  - [12, 0.0]
+ - - [768, 768, 1, 512]
+ - [13, 0.0]
 - null
 - null
 - DeviceEfficiency