Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tune Aquavanjaram942 HHS NN TN NT GEMM sizes equality and grid based #1184

Open
wants to merge 2 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -3550,6 +3550,284 @@
_WorkspaceSizePerElemBias: 0
_WorkspaceSizePerElemC: 4
_staggerStrideShift: 0
- 1LDSBuffer: 1
ActivationAlt: false
ActivationFuncCall: false
ActivationFused: true
AssertAIGreaterThanEqual: -1
AssertAILessThanEqual: -1
AssertFree0ElementMultiple: 1
AssertFree1ElementMultiple: 1
AssertSummationElementMultiple: 1
AssignedDerivedParameters: true
AssignedProblemIndependentDerivedParameters: true
BufferLoad: true
BufferStore: true
CUCount: null
ClusterLocalRead: 1
CodeObjectVersion: default
ConvertAfterDS: false
CustomKernelName: ''
DebugStreamK: 0
DepthU: 64
DirectToLds: false
DirectToLdsA: false
DirectToLdsB: false
DirectToVgprSparseMetadata: false
EdgeType: ShiftPtr
EnableF32XdlMathOp: false
EnableMatrixInstruction: true
ExpandPointerSwap: 0
ForceDisableShadowInit: false
GlobalReadPerMfma: 1
GlobalReadVectorWidthA: 8
GlobalReadVectorWidthB: 8
GlobalSplitU: 1
GlobalSplitUAlgorithm: MultipleBuffer
GlobalSplitUCoalesced: false
GlobalSplitUWorkGroupMappingRoundRobin: false
GlobalWriteVectorWidth: 2
GroupLoadStore: false
GuaranteeNoPartialA: false
GuaranteeNoPartialB: false
GuaranteeNoPartialMetadata: true
ISA: [9, 4, 2]
InnerUnroll: 1
InterleaveAlpha: 0
InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true,
SupportUserGSU: true, UseUniversalArgs: true}
Kernel: true
KernelLanguage: Assembly
KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_UserArgs_MT32x64x64_MI16x16x1_SN_K1_LBSPPA256_LBSPPB512_LPA32_LPB16_MIWT2_1_NLCA1_NLCB1_SVW2_VWA2_VWB1_WG16_16_1
LSCA: 32
LSCB: 64
LSPA: 64
LSPB: 32
LVCA: 4
LVCB: 8
LVPA: 8
LVPB: 4
LdsBlockSizePerPadA: 256
LdsBlockSizePerPadB: 512
LdsBlockSizePerPadMetadata: 0
LdsBytesNoAmax: 13824
LdsInitCVgprs: false
LdsNumBytes: 13824
LdsNumElementsAlignedA: 5120
LdsNumElementsAlignedB: 8704
LdsNumElementsAlignedMetadata: 0
LdsOffsetA: 0
LdsOffsetA_Blk: 16384
LdsOffsetB: 5120
LdsOffsetB_Blk: 21504
LdsOffsetBias: 0
LdsOffsetBiasGSU: 0
LdsOffsetBiasNonGSU: 0
LdsOffsetMetadata: 13824
LdsOffsetMetadata_Blk: 21504
LdsPadA: 32
LdsPadB: 16
LdsPadMetadata: 0
LocalReadVectorWidth: 4
LocalSplitU: 1
LocalWritePerMfma: -1
LocalWriteUseSgprA: false
LocalWriteUseSgprB: false
LoopIters: 4
LoopUnroll: 64
MFMA_BF16_1K: false
MIArchVgpr: 1
MIBlock: [16, 16, 16, 1, 1, 1]
MIInputPerThread: 4
MIInputPerThreadA: 4
MIInputPerThreadB: 4
MIInputPerThreadMetadata: 4
MIOutputVectorWidth: 4
MIRegPerOut: 1
MIWaveGroup: [1, 4]
MIWaveTile: [2, 1]
MIWaveTileA: 2
MIWaveTileB: 1
MIWaveTileMetadata: 0
MacroTile0: 32
MacroTile1: 64
MacroTileA: 32
MacroTileB: 64
MagicDivAlg: 2
MatrixInstB: 1
MatrixInstBM: 1
MatrixInstBN: 1
MatrixInstK: 16
MatrixInstM: 16
MatrixInstN: 16
MatrixInstruction: [16, 16, 16, 1]
MaxOccupancy: 40
MaxVgprNumber: 256
MinVgprNumber: 0
NoLdsWriteCode: false
NoReject: false
NoTailLoop: false
NonTemporal: -1
NonTemporalA: 0
NonTemporalB: 0
NonTemporalC: 0
NonTemporalD: 0
NonTemporalE: 0
NonTemporalMetadata: 0
NonTemporalWS: 0
NumElementsPerBatchStore: 16
NumElementsPerThread: 8
NumGlobalWriteVectorsPerThread: 4
NumLoadsA: 1
NumLoadsB: 2
NumLoadsCoalescedA: 1
NumLoadsCoalescedB: 1
NumLoadsPerpendicularA: 1
NumLoadsPerpendicularB: 2
NumThreads: 256
OptNoLoadLoop: 1
PackedC0IdxChars: [I]
PackedC0IndicesX: [0]
PackedC1IdxChars: [J]
PackedC1IndicesX: [1]
PrefetchGlobalRead: 2
PrefetchLocalRead: 1
PreloadKernArgs: true
ProblemType:
Activation: false
ActivationComputeDataType: 0
ActivationNoGuard: false
ActivationType: none
AllowNoFreeDims: false
AssignedDerivedParameters: true
Batched: true
BetaOnlyUseBias: false
BiasDataTypeList: []
BiasSrc: D
ComplexConjugateA: false
ComplexConjugateB: false
ComputeDataType: 0
DataType: 4
DataTypeA: 4
DataTypeAmaxD: 0
DataTypeB: 4
DataTypeE: 4
DestDataType: 4
F32XdlMathOp: 0
Gradient: false
GroupedGemm: false
HighPrecisionAccumulate: true
Index0: 0
Index01A: 0
Index01B: 1
Index1: 1
IndexAssignmentsA: [0, 3, 2]
IndexAssignmentsB: [1, 3, 2]
IndexAssignmentsLD: [4, 5, 6, 7]
IndexAssignmentsMetadata: [3, 0, 2]
IndexUnroll: 3
IndexUnrollA: 1
IndexUnrollB: 1
IndexUnrollM: 0
IndicesBatch: [2]
IndicesFree: [0, 1]
IndicesSummation: [3]
MirrorDimsA: []
MirrorDimsB: []
MirrorDimsMetadata: []
NumIndicesBatch: 1
NumIndicesC: 3
NumIndicesFree: 2
NumIndicesLD: 4
NumIndicesSummation: 1
OperationType: GEMM
OutputAmaxD: false
SetConstStrideA: []
SetConstStrideB: []
SetConstStrideBias: []
SilentHighPrecisionAccumulate: false
Sparse: 0
StochasticRounding: false
StridedBatched: true
SupportUserArgs: true
TLUA: true
TLUB: true
Tensor0: 0
Tensor1: 1
TileA: 0
TileAwareSelection: false
TileB: 1
TotalIndices: 4
TransposeA: false
TransposeB: true
UseBeta: true
UseBias: 0
UseE: false
UseInitialStridesAB: false
UseInitialStridesCD: false
UseScaleAB: ''
UseScaleAlphaVec: 0
UseScaleCD: false
ScheduleGlobalRead: 1
ScheduleIterAlg: 3
ScheduleLocalWrite: 1
SolutionIndex: 13
SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_UserArgs_MT32x64x64_MI16x16x1_SN_GSU1_GSUC0_GSUWGMRR0_K1_LBSPPA256_LBSPPB512_LPA32_LPB16_MIWT2_1_NLCA1_NLCB1_SU0_SUM0_SUS0_SVW2_VWA2_VWB1_WG16_16_1_WGM1
SourceSwap: 1
StaggerU: 0
StaggerUMapping: 0
StaggerUStride: 0
StorePriorityOpt: 0
StoreRemapVectorWidth: 0
StoreSyncOpt: 0
StoreVectorWidth: 2
StreamK: 0
StreamKAtomic: 0
StreamKXCCMapping: 0
SubGroup0: 4
SubGroup1: 64
SubGroupA: 4
SubGroupB: 64
SuppressNoLoadLoop: false
ThreadTile: [1, 1]
ThreadTile0: 8
ThreadTile1: 1
ThreadTileA: 8
ThreadTileB: 1
TransposeLDS: 0
TransposeLDSMetadata: true
ULSGRODoubleG2L: 0
UnrollLoopSwapGlobalReadOrder: 0
UnrollMajorLDSA: 0
UnrollMajorLDSB: 0
UnrollMajorLDSMetadata: true
Use64bShadowLimit: 1
UseInstOffsetForGRO: 0
UseSgprForGRO: -1
Valid: true
VectorStore: -1
VectorWidthA: 2
VectorWidthB: 1
WaveSeparateGlobalReadA: 0
WaveSeparateGlobalReadB: 0
WaveSeparateGlobalReadMetadata: 0
WavefrontSize: 64
WorkGroup: [16, 16, 1]
WorkGroupMapping: 1
WorkGroupMappingXCC: 1
WorkGroupMappingXCCGroup: 0
WorkGroupReduction: false
WorkspaceCheck: [4, 0, 1]
_DepthU: 64
_DepthUA: 64
_DepthUB: 64
_DepthUMetadata: 64
_GlobalAccumulation: MultipleBuffer
_UseSgprForGRO: false
_VectorStore: 1
_WorkspaceSizePerElemBias: 0
_WorkspaceSizePerElemC: 4
_staggerStrideShift: 0
- [2, 3, 0, 1]
- - - [512, 512, 1, 512]
- [0, 0.0]
Expand Down Expand Up @@ -3601,6 +3879,8 @@
- [12, 0.0]
- - [64, 178, 5604, 178]
- [12, 0.0]
- - [768, 768, 1, 512]
- [13, 0.0]
- null
- null
- DeviceEfficiency
Expand Down
Loading