Skip to content

Commit

Permalink
Review suggestion to add 1 more GEMM size for HHS NT
Browse files Browse the repository at this point in the history
  • Loading branch information
Feroz committed Oct 1, 2024
1 parent 05f1c0a commit d3c6a09
Showing 1 changed file with 279 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -518673,6 +518673,284 @@
_WorkspaceSizePerElemBias: 0
_WorkspaceSizePerElemC: 4
_staggerStrideShift: 0
- 1LDSBuffer: 1
ActivationAlt: false
ActivationFuncCall: true
ActivationFused: true
AssertAIGreaterThanEqual: -1
AssertAILessThanEqual: -1
AssertFree0ElementMultiple: 1
AssertFree1ElementMultiple: 1
AssertSummationElementMultiple: 1
AssignedDerivedParameters: true
AssignedProblemIndependentDerivedParameters: true
BufferLoad: true
BufferStore: true
CUCount: null
ClusterLocalRead: 1
CodeObjectVersion: default
ConvertAfterDS: false
CustomKernelName: ''
DebugStreamK: 0
DepthU: 64
DirectToLds: false
DirectToLdsA: false
DirectToLdsB: false
DirectToVgprSparseMetadata: false
EdgeType: ShiftPtr
EnableF32XdlMathOp: false
EnableMatrixInstruction: true
ExpandPointerSwap: 0
ForceDisableShadowInit: false
GlobalReadPerMfma: 1
GlobalReadVectorWidthA: 8
GlobalReadVectorWidthB: 8
GlobalSplitU: 3
GlobalSplitUAlgorithm: MultipleBuffer
GlobalSplitUCoalesced: false
GlobalSplitUWorkGroupMappingRoundRobin: false
GlobalWriteVectorWidth: 2
GroupLoadStore: false
GuaranteeNoPartialA: false
GuaranteeNoPartialB: false
GuaranteeNoPartialMetadata: true
ISA: [9, 4, 2]
InnerUnroll: 1
InterleaveAlpha: 0
InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true,
SupportUserGSU: true, UseUniversalArgs: true}
Kernel: true
KernelLanguage: Assembly
KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_AS_SAV_UserArgs_MT192x128x64_MI16x16x1_SN_K1_LBSPPA1536_LBSPPB1024_LPA32_LPB0_MIAV0_MIWT6_4_NTC3_NTD3_NLCA3_NLCB1_SVW2_VWA2_VWB4_WG32_8_1
LSCA: 64
LSCB: 128
LSPA: 32
LSPB: 16
LVCA: 8
LVCB: 16
LVPA: 4
LVPB: 2
LdsBlockSizePerPadA: 1536
LdsBlockSizePerPadB: 1024
LdsBlockSizePerPadMetadata: 0
LdsBytesNoAmax: 41984
LdsInitCVgprs: false
LdsNumBytes: 41984
LdsNumElementsAlignedA: 25600
LdsNumElementsAlignedB: 16384
LdsNumElementsAlignedMetadata: 0
LdsOffsetA: 0
LdsOffsetA_Blk: 65536
LdsOffsetB: 25600
LdsOffsetB_Blk: 91136
LdsOffsetBias: 0
LdsOffsetBiasGSU: 0
LdsOffsetBiasNonGSU: 0
LdsOffsetMetadata: 41984
LdsOffsetMetadata_Blk: 91136
LdsPadA: 32
LdsPadB: 0
LdsPadMetadata: 0
LocalReadVectorWidth: 4
LocalSplitU: 1
LocalWritePerMfma: -1
LocalWriteUseSgprA: false
LocalWriteUseSgprB: false
LoopIters: 4
LoopUnroll: 64
MFMA_BF16_1K: false
MIArchVgpr: 0
MIBlock: [16, 16, 16, 1, 1, 1]
MIInputPerThread: 4
MIInputPerThreadA: 4
MIInputPerThreadB: 4
MIInputPerThreadMetadata: 4
MIOutputVectorWidth: 4
MIRegPerOut: 1
MIWaveGroup: [2, 2]
MIWaveTile: [6, 4]
MIWaveTileA: 6
MIWaveTileB: 4
MIWaveTileMetadata: 0
MacroTile0: 192
MacroTile1: 128
MacroTileA: 192
MacroTileB: 128
MagicDivAlg: 2
MatrixInstB: 1
MatrixInstBM: 1
MatrixInstBN: 1
MatrixInstK: 16
MatrixInstM: 16
MatrixInstN: 16
MatrixInstruction: [16, 16, 16, 1]
MaxOccupancy: 40
MaxVgprNumber: 256
MinVgprNumber: 0
NoLdsWriteCode: false
NoReject: false
NoTailLoop: false
NonTemporal: -1
NonTemporalA: 0
NonTemporalB: 0
NonTemporalC: 3
NonTemporalD: 3
NonTemporalE: 0
NonTemporalMetadata: 0
NonTemporalWS: 0
NumElementsPerBatchStore: 16
NumElementsPerThread: 96
NumGlobalWriteVectorsPerThread: 48
NumLoadsA: 6
NumLoadsB: 4
NumLoadsCoalescedA: 3
NumLoadsCoalescedB: 1
NumLoadsPerpendicularA: 2
NumLoadsPerpendicularB: 4
NumThreads: 256
OptNoLoadLoop: 1
PackedC0IdxChars: [I]
PackedC0IndicesX: [0]
PackedC1IdxChars: [J]
PackedC1IndicesX: [1]
PrefetchGlobalRead: 2
PrefetchLocalRead: 1
PreloadKernArgs: true
ProblemType:
Activation: true
ActivationComputeDataType: 0
ActivationNoGuard: false
ActivationType: hipblaslt_all
AllowNoFreeDims: false
AssignedDerivedParameters: true
Batched: true
BetaOnlyUseBias: false
BiasDataTypeList: [0, 4]
BiasSrc: D
ComplexConjugateA: false
ComplexConjugateB: false
ComputeDataType: 0
DataType: 4
DataTypeA: 4
DataTypeAmaxD: 0
DataTypeB: 4
DataTypeE: 4
DestDataType: 4
F32XdlMathOp: 0
Gradient: false
GroupedGemm: false
HighPrecisionAccumulate: true
Index0: 0
Index01A: 0
Index01B: 1
Index1: 1
IndexAssignmentsA: [0, 3, 2]
IndexAssignmentsB: [1, 3, 2]
IndexAssignmentsLD: [4, 5, 6, 7]
IndexAssignmentsMetadata: [3, 0, 2]
IndexUnroll: 3
IndexUnrollA: 1
IndexUnrollB: 1
IndexUnrollM: 0
IndicesBatch: [2]
IndicesFree: [0, 1]
IndicesSummation: [3]
MirrorDimsA: []
MirrorDimsB: []
MirrorDimsMetadata: []
NumIndicesBatch: 1
NumIndicesC: 3
NumIndicesFree: 2
NumIndicesLD: 4
NumIndicesSummation: 1
OperationType: GEMM
OutputAmaxD: false
SetConstStrideA: []
SetConstStrideB: []
SetConstStrideBias: []
SilentHighPrecisionAccumulate: false
Sparse: 0
StochasticRounding: false
StridedBatched: true
SupportUserArgs: true
TLUA: true
TLUB: true
Tensor0: 0
Tensor1: 1
TileA: 0
TileAwareSelection: false
TileB: 1
TotalIndices: 4
TransposeA: false
TransposeB: true
UseBeta: true
UseBias: 1
UseE: false
UseInitialStridesAB: false
UseInitialStridesCD: false
UseScaleAB: ''
UseScaleAlphaVec: 1
UseScaleCD: false
ScheduleGlobalRead: 1
ScheduleIterAlg: 3
ScheduleLocalWrite: 1
SolutionIndex: 2010
SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_AS_SAV_UserArgs_MT192x128x64_MI16x16x1_SN_GSU3_GSUC0_GSUWGMRR0_K1_LBSPPA1536_LBSPPB1024_LPA32_LPB0_MIAV0_MIWT6_4_NTC3_NTD3_NLCA3_NLCB1_SU0_SUM0_SUS0_SVW2_VWA2_VWB4_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1
SourceSwap: 1
StaggerU: 0
StaggerUMapping: 0
StaggerUStride: 0
StorePriorityOpt: 0
StoreRemapVectorWidth: 0
StoreSyncOpt: 0
StoreVectorWidth: 2
StreamK: 0
StreamKAtomic: 0
StreamKXCCMapping: 0
SubGroup0: 8
SubGroup1: 32
SubGroupA: 8
SubGroupB: 32
SuppressNoLoadLoop: false
ThreadTile: [1, 1]
ThreadTile0: 24
ThreadTile1: 4
ThreadTileA: 24
ThreadTileB: 4
TransposeLDS: 0
TransposeLDSMetadata: true
ULSGRODoubleG2L: 0
UnrollLoopSwapGlobalReadOrder: 0
UnrollMajorLDSA: 0
UnrollMajorLDSB: 0
UnrollMajorLDSMetadata: true
Use64bShadowLimit: 1
UseInstOffsetForGRO: 0
UseSgprForGRO: -1
Valid: true
VectorStore: -1
VectorWidthA: 2
VectorWidthB: 4
WaveSeparateGlobalReadA: 0
WaveSeparateGlobalReadB: 0
WaveSeparateGlobalReadMetadata: 0
WavefrontSize: 64
WorkGroup: [32, 8, 1]
WorkGroupMapping: 8
WorkGroupMappingXCC: 1
WorkGroupMappingXCCGroup: -1
WorkGroupReduction: false
WorkspaceCheck: [4, 0, 3]
_DepthU: 64
_DepthUA: 64
_DepthUB: 64
_DepthUMetadata: 64
_GlobalAccumulation: MultipleBuffer
_UseSgprForGRO: false
_VectorStore: 1
_WorkspaceSizePerElemBias: 0
_WorkspaceSizePerElemC: 4
_staggerStrideShift: 0
- [2, 3, 0, 1]
- - - [16, 64, 1, 32]
- [71, 0.0]
Expand Down Expand Up @@ -524313,7 +524591,7 @@
- - [768, 3072, 1, 32]
- [445, 0.0]
- - [768, 3072, 1, 8192]
- [862, 0.0]
- [2010, 0.0]
- - [768, 3328, 1, 4096]
- [1192, 0.0]
- - [768, 3344, 1, 2048]
Expand Down

0 comments on commit d3c6a09

Please sign in to comment.