26template <
typename GridwiseGemm,
27 bool HasMainKBlockLoop,
32#if CK_USE_LAUNCH_BOUNDS
38#if defined(__gfx9__) || defined(__gfx12__)
39 if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
41 __shared__
char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
46 auto splitk_batch_offset =
typename GridwiseGemm::SplitKBatchOffset(karg);
48 const index_t num_k_per_block = GridwiseGemm::CalculateBK0Shuffled(karg.K);
49 const index_t k_id = blockIdx.z * num_k_per_block;
51 GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
52 karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
54 karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
65template <
typename GridwiseGemm,
66 bool HasMainKBlockLoop,
71#if CK_USE_LAUNCH_BOUNDS
77#if defined(__gfx9__) || defined(__gfx12__)
78 if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
82 __shared__
char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
83 __shared__
char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
88 auto splitk_batch_offset =
typename GridwiseGemm::SplitKBatchOffset(karg);
90 const index_t num_k_per_block = GridwiseGemm::CalculateBK0Shuffled(karg.K);
91 const index_t k_id = blockIdx.z * num_k_per_block;
93 GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
94 karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
96 karg.p_c_grid + splitk_batch_offset.c_reduce_offset,
108template <
typename ALayout,
113 typename AccDataType,
114 typename CShuffleDataType,
116 typename AElementwiseOperation,
117 typename BElementwiseOperation,
118 typename CElementwiseOperation,
130 typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
131 typename ABlockTransferThreadClusterArrangeOrder,
132 typename ABlockTransferSrcAccessOrder,
133 index_t ABlockTransferSrcVectorDim,
134 index_t ABlockTransferSrcScalarPerVector,
135 index_t ABlockTransferDstScalarPerVector_AK1,
136 bool AThreadTransferSrcResetCoordinateAfterRun,
138 typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
139 typename BBlockTransferThreadClusterArrangeOrder,
140 typename BBlockTransferSrcAccessOrder,
141 index_t BBlockTransferSrcVectorDim,
142 index_t BBlockTransferSrcScalarPerVector,
143 index_t BBlockTransferDstScalarPerVector_BK1,
144 bool BThreadTransferSrcResetCoordinateAfterRun,
146 index_t CShuffleMXdlPerWavePerShuffle,
147 index_t CShuffleNXdlPerWavePerShuffle,
148 typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
149 index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
152 typename ComputeTypeA = CDataType,
153 typename ComputeTypeB = ComputeTypeA,
154 bool PermuteA =
false,
155 bool PermuteB =
false>
240 auto K_t = K_Batch * KPerBlock;
241 return (K + K_t - 1) / K_t * (KPerBlock / AK1Value);
246 auto K_t = K_Batch * KPerBlock;
247 return (K + K_t - 1) / K_t * (KPerBlock / BK1Value);
252 auto K_t = K_Batch * KPerBlock;
253 return (K + K_t - 1) / K_t * KPerBlock;
259 auto K_t = K_Batch * KReadVec;
260 return (K + K_t - 1) / K_t * KReadVec;
273 template <index_t MNXdlPerWave, index_t MNWaves, index_t MNPerXdl,
typename TileDesc_K0_MN_K1>
291 const auto a_grid_desc_mraw_kraw = [&]() {
304 if constexpr(GemmSpec == GemmSpecialization::MKPadding ||
305 GemmSpec == GemmSpecialization::MNKPadding)
308 const auto a_grid_desc_m_k =
322 return a_grid_desc_ak0_m_ak1;
324 else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
325 GemmSpec == GemmSpecialization::MNPadding)
329 a_grid_desc_mraw_kraw,
335 return a_grid_desc_ak0_m_ak1;
337 else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
338 GemmSpec == GemmSpecialization::NKPadding)
342 a_grid_desc_mraw_kraw,
354 return a_grid_desc_ak0_m_ak1;
360 a_grid_desc_mraw_kraw,
366 return a_grid_desc_ak0_m_ak1;
372 constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
373 constexpr index_t WaveSize = BlockSize / (MWave *
NWave);
377 make_tuple(
NWave * K0 * NkSwizzleNumber, K0 * NkSwizzleNumber, NkSwizzleNumber,
I1));
383 const auto b_grid_desc_nraw_kraw = [&]() {
397 GemmSpec != GemmSpecialization::Default),
398 "pk_i4_t does not support padding");
400 if constexpr(GemmSpec == GemmSpecialization::NKPadding ||
401 GemmSpec == GemmSpecialization::MNKPadding)
404 const auto b_grid_desc_n_k =
418 return b_grid_desc_bk0_n_bk1;
420 else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
421 GemmSpec == GemmSpecialization::MNPadding)
425 b_grid_desc_nraw_kraw,
431 return b_grid_desc_bk0_n_bk1;
433 else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
434 GemmSpec == GemmSpecialization::MKPadding)
438 b_grid_desc_nraw_kraw,
450 return b_grid_desc_bk0_n_bk1;
454 if constexpr(!PermuteB)
458 b_grid_desc_nraw_kraw,
464 return b_grid_desc_bk0_n_bk1;
470 constexpr index_t BK01 = KPerBlock / BK1Value;
471 const index_t BK0_ = StrideB / BK1Value;
472 const index_t BK00 = BK0_ / BK01;
474 const auto b_grid_desc_bk00_n_bk01_bk1_permute =
478 b_grid_desc_bk00_n_bk01_bk1_permute,
485 return b_grid_desc_bk0_n_bk1_permute;
490 template <
typename ABlockDesc_AK0_M_AK1>
491 __host__ __device__
static constexpr auto
494 constexpr index_t MWaves = MPerBlock / (MXdlPerWave * MPerXdl);
499 template <
typename BBlockDesc_BK0_N_BK1>
500 __host__ __device__
static constexpr auto
510 __host__ __device__
static auto
513 const auto c_grid_desc_mraw_nraw = [&]() {
533 if constexpr(GemmSpec == GemmSpecialization::MNPadding ||
534 GemmSpec == GemmSpecialization::MNKPadding)
543 else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
544 GemmSpec == GemmSpecialization::MKPadding)
548 c_grid_desc_mraw_nraw,
553 else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
554 GemmSpec == GemmSpecialization::NKPadding)
558 c_grid_desc_mraw_nraw,
566 return c_grid_desc_mraw_nraw;
600 std::cout <<
"problem {" <<
"M:" <<
M <<
", " <<
"N:" <<
N <<
", " <<
"K:" <<
K <<
", "
603 <<
"KRead:" <<
KRead <<
", " <<
"KP:" <<
KPadded <<
", " <<
"AK0:" <<
AK0
604 <<
", " <<
"BK0:" <<
BK0 <<
", " <<
"MBlock: " <<
MBlock <<
", "
605 <<
"NBlock: " <<
NBlock <<
"}" << std::endl;
629 const BDataType* p_b_grid_,
630 CDataType* p_c_grid_,
638 bool is_reduce_ =
false)
639 :
Problem{M_, N_, K_, StrideA_, StrideB_, StrideC_, k_batch_},
702 constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
703 constexpr index_t WaveSize = BlockSize / (MWave *
NWave);
705 if constexpr(ABlockLdsExtraM)
715 constexpr auto a_lds_block_desc =
727 return a_lds_block_desc_permuted;
734 constexpr auto M0 = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(
I1);
735 constexpr auto M1 = MPerBlock / M0;
737 constexpr auto KThreadWrite = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(
I0);
738 constexpr auto K0PerThreadWrite =
AK0Number / KThreadWrite;
739 constexpr auto KThreadRead = WaveSize / MPerXdl;
740 constexpr auto K0PerThreadRead =
AK0Number / KThreadRead;
742 constexpr auto kfold = (
AK1Number * M0 *
sizeof(ADataType) > 128)
744 : 128 / (
AK1Number * M0 *
sizeof(ADataType));
745 constexpr auto KThreadReadPerm =
746 (kfold * K0PerThreadWrite / K0PerThreadRead) > 1
747 ? KThreadRead / (kfold * K0PerThreadWrite / K0PerThreadRead)
751 constexpr auto mpair = (
AK1Number * MPerXdl *
sizeof(ADataType) > 128)
753 : ((128 / (
AK1Number * MPerXdl *
sizeof(ADataType))) > M0
755 : 128 / (
AK1Number * MPerXdl *
sizeof(ADataType)));
761 Number<kfold * M0 / mpair>{},
780 a_lds_block_desc_permuted,
802 a_lds_block_desc_unmerged,
805 Number<KThreadWrite / kfold / KThreadReadPerm>{},
814 return a_lds_block_desc_ak0_m_ak1;
830 constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
832 constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
839 return c_shuffle_block_desc_mblock_mperblock_nblock_nperblock;
857 ABlockTransferSrcScalarPerVector,
858 BBlockTransferSrcScalarPerVector,
877 a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
880 constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
883 constexpr auto c_block_size =
884 c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize();
887 c_block_size *
sizeof(CShuffleDataType));
895 static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) &&
896 (NPerBlock % (NXdlPerWave * NPerXdl)) == 0,
897 "Invalid tuning param!");
899 if constexpr(NXdlPerWave % CShuffleNXdlPerWavePerShuffle != 0)
910 if(!(karg.M % MPerBlock == 0))
914 std::cout <<
"Arg M value is not a multiple of MPerBlock! M: " << karg.M <<
" "
915 << __FILE__ <<
":" << __LINE__ <<
", in function: " << __func__
928 if(!(karg.N % NPerBlock == 0))
932 std::cout <<
"Arg N value is not a multiple of NPerBlock! N: " << karg.N <<
" "
933 << __FILE__ <<
":" << __LINE__ <<
", in function: " << __func__
946 auto K_t = karg.KBatch * KPerBlock;
947 if(!(karg.K % K_t == 0))
951 std::cout <<
"Arg K value is not a multiple of K_Batch * K0PerBlock * K1! K: "
952 << karg.K <<
" " << __FILE__ <<
":" << __LINE__
953 <<
", in function: " << __func__ << std::endl;
961 auto K_t = karg.KBatch * KReadVec;
963 if((KReadPadSplited * (karg.KBatch - 1)) >= karg.K)
971 if(karg.K % ABlockTransferSrcScalarPerVector != 0)
975 std::cout <<
"Arg K (" << karg.K
976 <<
") value is not a multiple of ABlockTransferSrcScalarPerVector ("
977 << ABlockTransferSrcScalarPerVector <<
" )! " << __FILE__ <<
":"
978 << __LINE__ <<
", in function: " << __func__ << std::endl;
985 if(karg.M % ABlockTransferSrcScalarPerVector != 0)
989 std::cout <<
"Arg M (" << karg.M
990 <<
") value is not a multiple of ABlockTransferSrcScalarPerVector ("
991 << ABlockTransferSrcScalarPerVector <<
" )! " << __FILE__ <<
":"
992 << __LINE__ <<
", in function: " << __func__ << std::endl;
1000 if(karg.N % BBlockTransferSrcScalarPerVector != 0)
1004 std::cout <<
"Arg N (" << karg.N
1005 <<
") value is not a multiple of BBlockTransferSrcScalarPerVector ("
1006 << BBlockTransferSrcScalarPerVector <<
" )! " << __FILE__ <<
":"
1007 << __LINE__ <<
", in function: " << __func__ << std::endl;
1014 if(karg.K % BBlockTransferSrcScalarPerVector != 0)
1018 std::cout <<
"Arg K (" << karg.K
1019 <<
") value is not a multiple of BBlockTransferSrcScalarPerVector ("
1020 << BBlockTransferSrcScalarPerVector <<
" )! " << __FILE__ <<
":"
1021 << __LINE__ <<
", in function: " << __func__ << std::endl;
1029 if(karg.N % CShuffleBlockTransferScalarPerVector_NPerBlock != 0)
1033 std::cout <<
"Arg N (" << karg.N
1034 <<
") value is not a multiple of "
1035 "CShuffleBlockTransferScalarPerVector_NPerBlock ("
1036 << CShuffleBlockTransferScalarPerVector_NPerBlock <<
" )! "
1037 << __FILE__ <<
":" << __LINE__ <<
", in function: " << __func__
1045 if(karg.M % CShuffleBlockTransferScalarPerVector_NPerBlock != 0)
1049 std::cout <<
"Arg M (" << karg.M
1050 <<
") value is not a multiple of "
1051 "CShuffleBlockTransferScalarPerVector_NPerBlock ("
1052 << CShuffleBlockTransferScalarPerVector_NPerBlock <<
" )! "
1053 << __FILE__ <<
":" << __LINE__ <<
", in function: " << __func__
1065 if(!karg.IsReduceAdd())
1069 std::cout <<
" KBatch: " << karg.KBatch <<
" > 1 is not support yet" << __FILE__
1070 <<
":" << __LINE__ <<
", in function: " << __func__ << std::endl;
1080 const auto num_k_loop = karg.AK0 / (KPerBlock / AK1Value);
1082 if(num_k_loop <= BlockwiseGemmPipe::PrefetchStages)
1093 const index_t num_loop = K / KPerBlock;
1095 return BlockwiseGemmPipe::BlockHasHotloop(num_loop);
1100 const index_t num_loop = K / KPerBlock;
1102 return BlockwiseGemmPipe::BlockLoopTailNum(num_loop);
1105 template <
typename CGr
idDesc>
1107 const CGridDesc& c_grid_desc_m_n,
index_t MBlock,
index_t NBlock)
1116 return c_grid_desc_mblock_mperblock_nblock_nperblock;
1124 template <
typename AGridDesc_AK0_M_K1,
1125 typename BGridDesc_BPreshuffled,
1126 typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
1127 bool HasMainKBlockLoop,
1130 __device__
static void Run(
const ADataType* p_a_grid,
1131 const BDataType* p_b_grid,
1132 CDataType* p_c_grid,
1134 const Problem& problem,
1135 const AGridDesc_AK0_M_K1& a_grid_desc_ak0_m_ak1,
1136 const BGridDesc_BPreshuffled& b_grid_desc_bpreshuffled,
1137 const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock&
1138 c_grid_desc_mblock_mperblock_nblock_nperblock,
1142 p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
1144 p_b_grid, b_grid_desc_bpreshuffled.GetElementSpaceSize());
1146 p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
1148 const AElementwiseOperation a_element_op{};
1150 const CElementwiseOperation c_element_op{};
1153 const auto block_2_ctile_map =
Block2CTileMap{problem.M, problem.N, 4};
1155 const auto block_work_idx =
1158 if(!block_2_ctile_map.ValidCTileIndex(
1160 make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(
I0),
1161 c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(
I2))))
1166 const index_t block_m_id = __builtin_amdgcn_readfirstlane(block_work_idx[
I0]);
1167 const index_t block_n_id = __builtin_amdgcn_readfirstlane(block_work_idx[
I1]);
1170 const index_t m_block_data_idx_on_grid =
1171 __builtin_amdgcn_readfirstlane(block_m_id * MPerBlock);
1174 const index_t n_block_data_idx_on_grid =
1175 __builtin_amdgcn_readfirstlane(block_n_id * NXdlPerWave);
1184 auto a_blockwise_copy =
1186 AElementwiseOperation,
1190 ABlockTransferThreadClusterLengths_AK0_M_AK1,
1191 ABlockTransferThreadClusterArrangeOrder,
1194 decltype(a_grid_desc_ak0_m_ak1),
1195 decltype(a_block_desc_ak0_m_ak1),
1196 ABlockTransferSrcAccessOrder,
1198 ABlockTransferSrcVectorDim,
1200 ABlockTransferSrcScalarPerVector,
1201 ABlockTransferDstScalarPerVector_AK1,
1204 AThreadTransferSrcResetCoordinateAfterRun,
1206 BlockwiseGemmPipe::GlobalBufferNum>(
1207 a_grid_desc_ak0_m_ak1,
1210 a_block_desc_ak0_m_ak1,
1216 b_block_desc_bk0_n_bk1.GetElementSpaceSize());
1221 decltype(b_grid_desc_bpreshuffled),
1222 decltype(b_block_desc_bk0_n_bk1),
1226 BBlockTransferSrcScalarPerVector,
1227 BThreadTransferSrcResetCoordinateAfterRun,
1228 true>(b_grid_desc_bpreshuffled,
1238 static_cast<ADataType*
>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
1244 static_assert(std::is_default_constructible_v<BlockwiseGemmPipe>);
1246 auto c_thread_buf = blockwise_gemm_pipeline.GetCThreadBuffer();
1248 const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
1249 (a_grid_desc_ak0_m_ak1.GetLength(
I0) * a_grid_desc_ak0_m_ak1.GetLength(
I2)) /
1253 a_block_desc_ak0_m_ak1,
1257 a_block_slice_copy_step,
1258 b_grid_desc_bpreshuffled,
1262 b_block_slice_copy_step,
1264 num_k_block_main_loop);
1268 static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 &&
1269 NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0,
1272 constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
1275 constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 =
1276 blockwise_gemm_pipeline.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
1280 constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp =
1281 blockwise_gemm_pipeline.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
1283 constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(
I0);
1284 constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(
I1);
1285 constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(
I2);
1286 constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(
I3);
1287 constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(
I4);
1288 constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(
I5);
1289 constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(
I6);
1290 constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(
I7);
1292 constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
1296 static_cast<CShuffleDataType*
>(p_shared),
1297 c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
1300 c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
1320 const auto c_thread_mtx_on_block =
1321 blockwise_gemm_pipeline.CalculateCThreadOriginDataIndex(
I0,
I0,
I0,
I0);
1323 const index_t m_thread_data_on_block = c_thread_mtx_on_block[
I0];
1324 const index_t n_thread_data_on_block = c_thread_mtx_on_block[
I1];
1326 const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor =
1332 const auto m_thread_data_on_block_idx =
1333 m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex(
1336 const auto n_thread_data_on_block_to_n0_n1_n2_adaptor =
1342 const auto n_thread_data_on_block_idx =
1343 n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex(
1347 auto c_thread_copy_vgpr_to_lds =
1350 decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
1351 decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2),
1353 Sequence<CShuffleMXdlPerWavePerShuffle,
1354 CShuffleNXdlPerWavePerShuffle,
1367 c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
1370 m_thread_data_on_block_idx[
I1],
1371 n_thread_data_on_block_idx[
I1],
1372 m_thread_data_on_block_idx[
I2],
1373 m_thread_data_on_block_idx[
I3],
1374 m_thread_data_on_block_idx[
I4],
1375 n_thread_data_on_block_idx[
I2]),
1381 CElementwiseOperation,
1382 CGlobalMemoryDataOperation,
1384 CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl,
1386 CShuffleNXdlPerWavePerShuffle *
NWave * NPerXdl>,
1387 CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
1391 decltype(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock),
1392 decltype(c_grid_desc_mblock_mperblock_nblock_nperblock),
1395 CShuffleBlockTransferScalarPerVector_NPerBlock,
1398 {c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
1400 c_grid_desc_mblock_mperblock_nblock_nperblock,
1405 constexpr auto sfc_c_vgpr =
1408 Sequence<CShuffleMXdlPerWavePerShuffle,
1409 CShuffleNXdlPerWavePerShuffle,
1418 constexpr auto sfc_c_global =
1422 CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl,
1424 CShuffleNXdlPerWavePerShuffle *
NWave * NPerXdl>>{};
1426 constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess();
1428 static_assert(num_access == sfc_c_global.GetNumOfAccess(),
"wrong!");
1435 c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2,
1436 sfc_c_vgpr.GetIndexTupleOfNumber(access_id),
1438 c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
1439 c_shuffle_block_buf);
1445 c_shuffle_block_copy_lds_to_global.Run(
1446 c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
1447 c_shuffle_block_buf,
1448 c_grid_desc_mblock_mperblock_nblock_nperblock,
1451 if constexpr(access_id < num_access - 1)
1453 constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id);
1456 c_shuffle_block_copy_lds_to_global.MoveDstSliceWindow(
1457 c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step);
1463 template <
bool HasMainKBlockLoop,
1466 __device__
static void Run(
const ADataType* p_a_grid,
1467 const BDataType* p_b_grid,
1468 CDataType* p_c_grid,
1470 const Problem& problem,
1477 problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0);
1478 const auto b_grid_desc_bpreshuffled =
1481 problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC);
1482 const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
1484 c_grid_desc_m_n, problem.MBlock, problem.NBlock);
1486 Run<
decltype(a_grid_desc_ak0_m_ak1),
1487 decltype(b_grid_desc_bpreshuffled),
1488 decltype(c_grid_desc_mblock_mperblock_nblock_nperblock),
1490 CGlobalMemoryDataOperation,
1496 a_grid_desc_ak0_m_ak1,
1497 b_grid_desc_bpreshuffled,
1498 c_grid_desc_mblock_mperblock_nblock_nperblock,
1502 template <
typename AGridDesc_AK0_M_K1,
1503 typename BGridDesc_BPreshuffled,
1504 typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
1505 bool HasMainKBlockLoop,
1508 __device__
static void Run_2Lds(
const ADataType* p_a_grid,
1509 const BDataType* p_b_grid,
1510 CDataType* p_c_grid,
1513 const Problem& problem,
1514 const AGridDesc_AK0_M_K1& a_grid_desc_ak0_m_ak1,
1515 const BGridDesc_BPreshuffled& b_grid_desc_bpreshuffled,
1516 const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock&
1517 c_grid_desc_mblock_mperblock_nblock_nperblock,
1521 p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
1523 p_b_grid, b_grid_desc_bpreshuffled.GetElementSpaceSize());
1525 p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
1527 const AElementwiseOperation a_element_op{};
1529 const CElementwiseOperation c_element_op{};
1532 const auto block_2_ctile_map =
Block2CTileMap{problem.M, problem.N, 4};
1534 const auto block_work_idx =
1537 if(!block_2_ctile_map.ValidCTileIndex(
1539 make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(
I0),
1540 c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(
I2))))
1545 const index_t block_m_id = __builtin_amdgcn_readfirstlane(block_work_idx[
I0]);
1546 const index_t block_n_id = __builtin_amdgcn_readfirstlane(block_work_idx[
I1]);
1549 const index_t m_block_data_idx_on_grid =
1550 __builtin_amdgcn_readfirstlane(block_m_id * MPerBlock);
1553 const index_t n_block_data_idx_on_grid =
1554 __builtin_amdgcn_readfirstlane(block_n_id * NXdlPerWave);
1563 auto a_blockwise_copy =
1565 AElementwiseOperation,
1569 ABlockTransferThreadClusterLengths_AK0_M_AK1,
1570 ABlockTransferThreadClusterArrangeOrder,
1573 decltype(a_grid_desc_ak0_m_ak1),
1574 decltype(a_block_desc_ak0_m_ak1),
1575 ABlockTransferSrcAccessOrder,
1577 ABlockTransferSrcVectorDim,
1579 ABlockTransferSrcScalarPerVector,
1580 ABlockTransferDstScalarPerVector_AK1,
1583 AThreadTransferSrcResetCoordinateAfterRun,
1586 a_grid_desc_ak0_m_ak1,
1589 a_block_desc_ak0_m_ak1,
1597 b_block_desc_bk0_n_bk1.GetElementSpaceSize());
1599 b_block_desc_bk0_n_bk1.GetElementSpaceSize());
1600 auto b_block_bufs =
make_tuple(b_block_buf_ping, b_block_buf_pong);
1605 decltype(b_grid_desc_bpreshuffled),
1606 decltype(b_block_desc_bk0_n_bk1),
1610 BBlockTransferSrcScalarPerVector,
1611 BThreadTransferSrcResetCoordinateAfterRun,
1612 true>(b_grid_desc_bpreshuffled,
1620 static_cast<ADataType*
>(p_shared_0), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
1623 static_cast<ADataType*
>(p_shared_1), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
1625 auto a_block_bufs =
make_tuple(a_block_buf_ping, a_block_buf_pong);
1631 static_assert(std::is_default_constructible_v<BlockwiseGemmPipe>);
1633 auto c_thread_buf = blockwise_gemm_pipeline.GetCThreadBuffer();
1635 const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
1636 (a_grid_desc_ak0_m_ak1.GetLength(
I0) * a_grid_desc_ak0_m_ak1.GetLength(
I2)) /
1640 a_block_desc_ak0_m_ak1,
1644 a_block_slice_copy_step,
1645 b_grid_desc_bpreshuffled,
1649 b_block_slice_copy_step,
1651 num_k_block_main_loop);
1655 static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 &&
1656 NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0,
1659 constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
1662 constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 =
1663 blockwise_gemm_pipeline.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
1667 constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp =
1668 blockwise_gemm_pipeline.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
1670 constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(
I0);
1671 constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(
I1);
1672 constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(
I2);
1673 constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(
I3);
1674 constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(
I4);
1675 constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(
I5);
1676 constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(
I6);
1677 constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(
I7);
1679 constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
1683 static_cast<CShuffleDataType*
>(p_shared_0),
1684 c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
1687 c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
1707 const auto c_thread_mtx_on_block =
1708 blockwise_gemm_pipeline.CalculateCThreadOriginDataIndex(
I0,
I0,
I0,
I0);
1710 const index_t m_thread_data_on_block = c_thread_mtx_on_block[
I0];
1711 const index_t n_thread_data_on_block = c_thread_mtx_on_block[
I1];
1713 const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor =
1719 const auto m_thread_data_on_block_idx =
1720 m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex(
1723 const auto n_thread_data_on_block_to_n0_n1_n2_adaptor =
1729 const auto n_thread_data_on_block_idx =
1730 n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex(
1734 auto c_thread_copy_vgpr_to_lds =
1737 decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
1738 decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2),
1740 Sequence<CShuffleMXdlPerWavePerShuffle,
1741 CShuffleNXdlPerWavePerShuffle,
1754 c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
1757 m_thread_data_on_block_idx[
I1],
1758 n_thread_data_on_block_idx[
I1],
1759 m_thread_data_on_block_idx[
I2],
1760 m_thread_data_on_block_idx[
I3],
1761 m_thread_data_on_block_idx[
I4],
1762 n_thread_data_on_block_idx[
I2]),
1768 CElementwiseOperation,
1769 CGlobalMemoryDataOperation,
1771 CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl,
1773 CShuffleNXdlPerWavePerShuffle *
NWave * NPerXdl>,
1774 CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
1778 decltype(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock),
1779 decltype(c_grid_desc_mblock_mperblock_nblock_nperblock),
1782 CShuffleBlockTransferScalarPerVector_NPerBlock,
1785 {c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
1787 c_grid_desc_mblock_mperblock_nblock_nperblock,
1792 constexpr auto sfc_c_vgpr =
1795 Sequence<CShuffleMXdlPerWavePerShuffle,
1796 CShuffleNXdlPerWavePerShuffle,
1805 constexpr auto sfc_c_global =
1809 CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl,
1811 CShuffleNXdlPerWavePerShuffle *
NWave * NPerXdl>>{};
1813 constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess();
1815 static_assert(num_access == sfc_c_global.GetNumOfAccess(),
"wrong!");
1822 c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2,
1823 sfc_c_vgpr.GetIndexTupleOfNumber(access_id),
1825 c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
1826 c_shuffle_block_buf);
1832 c_shuffle_block_copy_lds_to_global.Run(
1833 c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
1834 c_shuffle_block_buf,
1835 c_grid_desc_mblock_mperblock_nblock_nperblock,
1838 if constexpr(access_id < num_access - 1)
1840 constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id);
1843 c_shuffle_block_copy_lds_to_global.MoveDstSliceWindow(
1844 c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step);
1850 template <
bool HasMainKBlockLoop,
1853 __device__
static void Run_2Lds(
const ADataType* p_a_grid,
1854 const BDataType* p_b_grid,
1855 CDataType* p_c_grid,
1858 const Problem& problem,
1865 problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0);
1866 const auto b_grid_desc_bpreshuffled =
1869 problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideC);
1871 const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
1873 c_grid_desc_m_n, problem.MBlock, problem.NBlock);
1875 Run_2Lds<
decltype(a_grid_desc_ak0_m_ak1),
1876 decltype(b_grid_desc_bpreshuffled),
1877 decltype(c_grid_desc_mblock_mperblock_nblock_nperblock),
1879 CGlobalMemoryDataOperation,
1886 a_grid_desc_ak0_m_ak1,
1887 b_grid_desc_bpreshuffled,
1888 c_grid_desc_mblock_mperblock_nblock_nperblock,
#define CK_MAX_THREAD_PER_BLOCK
Definition ck.hpp:30
#define IS_VALID_COMPILATION_PARAMETER_IMPL(CDataType_)
Definition device_base.hpp:178
__host__ __device__ constexpr auto integer_least_multiple(X x, Y y)
Definition utility/math.hpp:78
__host__ __device__ constexpr T max(T x)
Definition utility/math.hpp:84
__host__ __device__ constexpr auto integer_divide_ceil(X x, Y y)
Definition utility/math.hpp:72
__host__ __device__ constexpr auto lcm(X x, Y y)
Definition utility/math.hpp:198
GemmSpecialization
Definition gemm_specialization.hpp:11
@ MKPadding
Definition gemm_specialization.hpp:18
@ KPadding
Definition gemm_specialization.hpp:16
@ NPadding
Definition gemm_specialization.hpp:15
@ MPadding
Definition gemm_specialization.hpp:14
@ MNKPadding
Definition gemm_specialization.hpp:20
@ MNPadding
Definition gemm_specialization.hpp:17
@ NKPadding
Definition gemm_specialization.hpp:19
ushort bhalf_t
Definition data_type.hpp:30
__host__ __device__ constexpr auto make_multi_index(Xs &&... xs)
Definition array_multi_index.hpp:15
__device__ index_t get_warp_local_1d_id()
Definition get_id.hpp:45
__host__ __device__ constexpr auto make_static_buffer(Number< N >)
Definition static_buffer.hpp:186
__host__ __device__ constexpr auto make_pass_through_transform(const LowLength &low_length)
Definition multi_index_transform_helper.hpp:12
int32_t index_t
Definition ck.hpp:299
__host__ __device__ constexpr auto make_naive_tensor_descriptor(const Tuple< Lengths... > &lengths, const Tuple< Strides... > &strides)
Definition tensor_descriptor_helper.hpp:49
InMemoryDataOperationEnum
Definition ck.hpp:277
@ Set
Definition ck.hpp:278
__host__ __device__ constexpr auto make_single_stage_tensor_adaptor(const Transforms &transforms, LowerDimensionOldTopIdss, UpperDimensionNewTopIdss)
Definition tensor_description/tensor_adaptor.hpp:425
remove_cv_t< remove_reference_t< T > > remove_cvref_t
Definition type.hpp:297
BlockGemmPipelineVersion
Definition blkgemmpipe_scheduler.hpp:12
@ v1
Definition blkgemmpipe_scheduler.hpp:14
__host__ __device__ constexpr auto make_freeze_transform(const LowerIndex &low_idx)
Definition multi_index_transform_helper.hpp:151
__host__ __device__ constexpr auto make_right_pad_transform(const LowLength &low_length, const RightPadLength &right_pad, integral_constant< bool, SkipIsValidCheck >=integral_constant< bool, false >{})
Definition multi_index_transform_helper.hpp:37
_Float16 half_t
Definition data_type.hpp:31
__host__ __device__ constexpr auto make_xor_with_modulo_transform(const LowLengths &low_lengths)
Definition multi_index_transform_helper.hpp:185
integral_constant< index_t, N > Number
Definition number.hpp:12
TailNumber
Definition blkgemmpipe_scheduler.hpp:31
@ Even
Definition blkgemmpipe_scheduler.hpp:34
@ Odd
Definition blkgemmpipe_scheduler.hpp:33
__global__ void kernel_gemm_xdl_cshuffle_v3_b_preshuffle_2lds(typename GridwiseGemm::Argument karg)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:75
__host__ __device__ constexpr auto make_merge_transform(const LowLengths &low_lengths)
Definition multi_index_transform_helper.hpp:55
constexpr detail::ignore_t ignore
Definition utility/ignore.hpp:20
__device__ index_t get_block_1d_id()
Definition get_id.hpp:47
constexpr auto BlockGemmBPreshufflePipeline_Selector()
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp:41
bool EnvIsEnabled(EnvVar)
Definition utility/env.hpp:140
constexpr bool is_same_v
Definition type.hpp:283
__host__ __device__ constexpr auto make_merge_transform_v3_division_mod(const LowLengths &low_lengths)
Definition multi_index_transform_helper.hpp:84
BlockGemmPipelineScheduler
Definition blkgemmpipe_scheduler.hpp:25
@ Intrawave
Definition blkgemmpipe_scheduler.hpp:26
__host__ __device__ constexpr auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition tensor_descriptor_helper.hpp:101
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
__device__ index_t get_thread_local_1d_id()
Definition get_id.hpp:41
__global__ void kernel_gemm_xdl_cshuffle_v3_b_preshuffle(typename GridwiseGemm::Argument karg)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:36
__host__ __device__ constexpr auto transform_tensor_descriptor(const OldTensorDescriptor &old_tensor_desc, const NewTransforms &new_transforms, NewLowerDimensionOldVisibleIdss, NewUpperDimensionNewVisibleIdss)
Definition tensor_description/tensor_descriptor.hpp:319
__device__ void block_sync_lds()
Definition synchronization.hpp:16
__host__ __device__ constexpr auto make_unmerge_transform(const UpLengths &up_lengths, integral_constant< bool, Use24BitIntegerCalculation >=integral_constant< bool, false >{})
Definition multi_index_transform_helper.hpp:90
__host__ __device__ constexpr auto make_dynamic_buffer(T *p, ElementSpaceSize element_space_size)
Definition dynamic_buffer.hpp:472
const GenericPointer< typename T::ValueType > T2 value
Definition pointer.h:1697
unsigned int uint32_t
Definition stdint.h:126
signed int int32_t
Definition stdint.h:123
Definition block_to_ctile_map.hpp:271
__host__ static __device__ constexpr index_t CalculateGridSize(index_t M, index_t N)
Definition block_to_ctile_map.hpp:283
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:627
__host__ __device__ bool IsReduceAdd() const
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:647
bool is_reduce
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:660
__host__ __device__ bool IsAtomicAdd() const
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:652
const ADataType * p_a_grid
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:657
CDataType * p_c_grid
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:659
const BDataType * p_b_grid
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:658
__host__ Argument(const ADataType *p_a_grid_, const BDataType *p_b_grid_, CDataType *p_c_grid_, index_t M_, index_t N_, index_t K_, index_t StrideA_, index_t StrideB_, index_t StrideC_, index_t k_batch_, bool is_reduce_=false)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:628
index_t N
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:609
index_t KBatch
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:614
index_t KPadded
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:618
index_t NPadded
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:616
__host__ void Print() const
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:598
index_t M
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:608
index_t MPadded
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:615
index_t KRead
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:617
index_t AK0
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:619
__host__ Problem(index_t M_, index_t N_, index_t K_, index_t StrideA_, index_t StrideB_, index_t StrideC_, index_t KBatch_)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:573
index_t K
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:610
index_t MBlock
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:621
index_t BK0
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:620
index_t StrideC
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:613
index_t StrideA
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:611
index_t NBlock
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:622
index_t StrideB
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:612
index_t a_k_split_offset
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:696
index_t c_reduce_offset
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:697
__device__ SplitKBatchOffset(Argument &karg)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:666
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:157
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::CalculateNPadded static __host__ auto CalculateNPadded(index_t N)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:218
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::CalculateBK0Shuffled __host__ static __device__ auto CalculateBK0Shuffled(index_t K)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:228
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >< math::max(NXdlPerWave64, 1)>::KRepeat static constexpr index_t KRepeat
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:188
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >< math::max(NXdlPerWave64, 1)>::I6 static constexpr auto I6
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:164
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >< math::max(NXdlPerWave64, 1)>::ThisThreadBlock ThisThreadBlock< BlockSize > ThisThreadBlock
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:192
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >< math::max(NXdlPerWave64, 1)>::BK0Number static constexpr auto BK0Number
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:169
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1 static __device__ constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:700
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >< math::max(NXdlPerWave64, 1)>::CheckValidity static __host__ constexpr bool CheckValidity(const Argument &karg)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:893
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1 static __device__ constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:818
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::MakeGemmMmaTileDescriptor __host__ static __device__ constexpr auto MakeGemmMmaTileDescriptor(const TileDesc_K0_MN_K1 &)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:274
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::GetSharedMemoryNumberOfByte static __device__ constexpr index_t GetSharedMemoryNumberOfByte()
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:868
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >< math::max(NXdlPerWave64, 1)>::NWave static constexpr index_t NWave
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:190
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >< math::max(NXdlPerWave64, 1)>::I2 static constexpr auto I2
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:160
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >< math::max(NXdlPerWave64, 1)>::mfma static constexpr auto mfma
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:179
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::CalculateKRead static __host__ auto CalculateKRead(index_t K, index_t K_Batch=1)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:256
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::CalculateGridSize static __host__ auto CalculateGridSize(index_t M, index_t N, index_t KBatch)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:208
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::MakeCGridDescriptor_M_N __host__ static __device__ auto MakeCGridDescriptor_M_N(index_t M, index_t MPad, index_t N, index_t NPad, index_t StrideC)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:511
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::MakeAGridDescriptor_AK0_M_AK1 __host__ static __device__ auto MakeAGridDescriptor_AK0_M_AK1(index_t M, index_t MPad, index_t K, index_t KPad, index_t StrideA, index_t AK0)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:288
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock __host__ static __device__ constexpr auto MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const CGridDesc &c_grid_desc_m_n, index_t MBlock, index_t NBlock)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:1106
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >< math::max(NXdlPerWave64, 1)>::AK0Number static constexpr auto AK0Number
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:168
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::Run static __device__ void Run(const ADataType *p_a_grid, const BDataType *p_b_grid, CDataType *p_c_grid, void *p_shared, const Problem &problem, const index_t k_id, const index_t Kt)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:1466
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >< math::max(NXdlPerWave64, 1)>::KPack static constexpr index_t KPack
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:185
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >< math::max(NXdlPerWave64, 1)>::is_single_rate_mfma static constexpr bool is_single_rate_mfma
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:177
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::MakeBGridDescriptor_Preshuffled __host__ static __device__ auto MakeBGridDescriptor_Preshuffled(index_t N0, index_t K0)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:370
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >< math::max(NXdlPerWave64, 1)>::BK1Number static constexpr auto BK1Number
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:171
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::Run static __device__ void Run(const ADataType *p_a_grid, const BDataType *p_b_grid, CDataType *p_c_grid, void *p_shared, const Problem &problem, const AGridDesc_AK0_M_K1 &a_grid_desc_ak0_m_ak1, const BGridDesc_BPreshuffled &b_grid_desc_bpreshuffled, const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock &c_grid_desc_mblock_mperblock_nblock_nperblock, const index_t k_id)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:1130
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >< math::max(NXdlPerWave64, 1)>::I5 static constexpr auto I5
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:163
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >< math::max(NXdlPerWave64, 1)>::I0 static constexpr auto I0
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:158
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >< math::max(NXdlPerWave64, 1)>::APackedSize static constexpr index_t APackedSize
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:194
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::Run_2Lds static __device__ void Run_2Lds(const ADataType *p_a_grid, const BDataType *p_b_grid, CDataType *p_c_grid, void *p_shared_0, void *p_shared_1, const Problem &problem, const AGridDesc_AK0_M_K1 &a_grid_desc_ak0_m_ak1, const BGridDesc_BPreshuffled &b_grid_desc_bpreshuffled, const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock &c_grid_desc_mblock_mperblock_nblock_nperblock, const index_t k_id)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:1508
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >< math::max(NXdlPerWave64, 1)>::BPackedSize static constexpr index_t BPackedSize
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:201
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >< math::max(NXdlPerWave64, 1)>::lcm_AK1_BK1 static constexpr auto lcm_AK1_BK1
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:176
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::Run_2Lds static __device__ void Run_2Lds(const ADataType *p_a_grid, const BDataType *p_b_grid, CDataType *p_c_grid, void *p_shared_0, void *p_shared_1, const Problem &problem, const index_t k_id, const index_t Kt)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:1853
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >< math::max(NXdlPerWave64, 1)>::is_scale_mfma static constexpr auto is_scale_mfma
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:178
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >< math::max(NXdlPerWave64, 1)>::I4 static constexpr auto I4
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:162
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::CalculateBK0Padded static __host__ auto CalculateBK0Padded(index_t K, index_t K_Batch=1)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:244
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >< math::max(NXdlPerWave64, 1)>::I1 static constexpr auto I1
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:159
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >< math::max(NXdlPerWave64, 1)>::Block2CTileMap BlockToCTileMap_Grouped_M00_N0_M01Adapt< 8, MPerBlock, NPerBlock > Block2CTileMap
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:1121
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >< math::max(NXdlPerWave64, 1)>::KLane static constexpr index_t KLane
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:186
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >< math::max(NXdlPerWave64, 1)>::NLane static constexpr index_t NLane
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:189
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::CalculateBN0Shuffled __host__ static __device__ auto CalculateBN0Shuffled(index_t N)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:223
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::CalculateAK0Padded static __host__ auto CalculateAK0Padded(index_t K, index_t K_Batch=1)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:238
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >< math::max(NXdlPerWave64, 1)>::AK1Number static constexpr auto AK1Number
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:170
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >< math::max(NXdlPerWave64, 1)>::I7 static constexpr auto I7
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:165
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::CalculateMBlock static __host__ auto CalculateMBlock(index_t M)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:263
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::MakeBMmaTileDescriptor_N0_N1_N2_K __host__ static __device__ constexpr auto MakeBMmaTileDescriptor_N0_N1_N2_K(const BBlockDesc_BK0_N_BK1 &)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:501
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::CalculateKPadded static __host__ auto CalculateKPadded(index_t K)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:233
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::CalculateMPadded static __host__ auto CalculateMPadded(index_t M)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:213
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >< math::max(NXdlPerWave64, 1)>::I3 static constexpr auto I3
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:161
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::CalculateHasMainKBlockLoop static __host__ constexpr bool CalculateHasMainKBlockLoop(index_t K)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:1091
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >< math::max(NXdlPerWave64, 1)>::BlockwiseGemmPipe remove_cvref_t< decltype(BlockGemmBPreshufflePipeline_Selector< BlkGemmPipelineVer, BlkGemmPipeSched, BlockSize, ADataType, BDataType, ComputeTypeA, GemmAccDataType, decltype(GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()), decltype(GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()), decltype(MakeAMmaTileDescriptor_M0_M1_M2_K(GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1())), decltype(MakeBMmaTileDescriptor_N0_N1_N2_K(GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1())), ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, KPack >())> BlockwiseGemmPipe
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:842
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::CalculateKBlockLoopTailNum static __host__ constexpr TailNumber CalculateKBlockLoopTailNum(index_t K)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:1098
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::MakeBGridDescriptor_BK0_N_BK1 __host__ static __device__ auto MakeBGridDescriptor_BK0_N_BK1(index_t K, index_t KPad, index_t N, index_t NPad, index_t StrideB, index_t BK0)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:380
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::CalculateKPadded static __host__ auto CalculateKPadded(index_t K, index_t K_Batch=1)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:250
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::CalculateNBlock static __host__ auto CalculateNBlock(index_t N)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:268
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock static __device__ constexpr auto GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock()
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:828
ck::GridwiseGemm_xdl_cshuffle_v3_b_preshuffle< ALayout, BLayout, CLayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, CDataType, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave_, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, ComputeTypeA, ComputeTypeB, PermuteA, PermuteB >::MakeAMmaTileDescriptor_M0_M1_M2_K __host__ static __device__ constexpr auto MakeAMmaTileDescriptor_M0_M1_M2_K(const ABlockDesc_AK0_M_AK1 &)
Definition gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp:492
Selects the appropriate MFMA instruction type and configuration for given data types and tile sizes o...
Definition xdlops_gemm.hpp:1208
Definition utility/sequence.hpp:43
Definition tensor_space_filling_curve.hpp:20
Blockwise data transfer.
Definition thread_group_tensor_slice_transfer_v4r1.hpp:46
Definition thread_group_tensor_slice_transfer_v6r1.hpp:34
Definition threadwise_tensor_slice_transfer.hpp:39
Helper structure that facilitates transfer of source (grid) data to destination threads.
Definition threadwise_tensor_slice_transfer.hpp:234
static constexpr value_type value
Definition utility/integral_constant.hpp:13
Definition data_type.hpp:187
Definition functional2.hpp:33
Definition device_base.hpp:197
Definition tensor_operation/gpu/element/unary_element_wise_operation.hpp:340
#define CK_ENV(name)
Definition utility/env.hpp:129