DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched > Struct Template Reference
Classes |
Public Types |
Public Member Functions |
Static Public Member Functions |
Static Public Attributes |
List of all members
ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched > Struct Template Reference
#include <device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp>
Inheritance diagram for ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >:
Classes | |
| struct | ComputeBasePtrOfStridedBatch |
| struct | Argument |
| struct | Invoker |
Public Types | |
| using | DeviceOp = DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle |
| using | A0GridDesc_M_K = decltype(MakeA0GridDescriptor_M_K(1, 1, 1)) |
| using | B0GridDesc_N_K = decltype(MakeB0GridDescriptor_N_K(1, 1, 1)) |
| using | D0sGridDesc_M_N = remove_cvref_t<decltype(MakeD0sGridDescriptor_M_N({}, {}, {}))> |
| using | B1GridDesc_N_K = decltype(MakeB1GridDescriptor_N_K(1, 1, 1)) |
| using | D1sGridDesc_M_N = remove_cvref_t<decltype(MakeD1sGridDescriptor_M_N({}, {}, {}))> |
| using | E1GridDesc_M_N = decltype(MakeE1GridDescriptor_M_N<E1Layout>(1, 1, 1)) |
| template<index_t Gemm0MXdlPerWave_> | |
| using | GridwiseGemmBase |
| using | GridwiseGemm64 = GridwiseGemmBase<math::max(Gemm0MXdlPerWave64, 1)> |
| using | GridwiseGemm32 = GridwiseGemmBase<Gemm0MXdlPerWave32> |
| using | A0GridDesc_AK0_M_AK1 |
| using | B0GridDesc_BK0_N_BK1 |
| using | B1GridDesc_BK0_N_BK1 |
Public Member Functions | |
| bool | IsSupportedArgument (const BaseArgument *p_arg) override |
| std::unique_ptr< BaseArgument > | MakeArgumentPointer (const void *p_a0, const void *p_b0, std::array< const void *, NumD0Tensor > p_d0s, const void *p_b1, std::array< const void *, NumD1Tensor > p_d1s, void *p_e1, index_t MRaw, index_t NRaw, index_t KRaw, index_t Gemm1NRaw, index_t Batch, index_t StrideA0, index_t StrideB0, std::array< ck::index_t, NumD0Tensor > StrideD0s, index_t StrideB1, std::array< ck::index_t, NumD1Tensor > StrideD1s, index_t StrideE1, index_t BatchStrideA0, index_t BatchStrideB0, std::array< ck::index_t, NumD0Tensor > BatchStrideD0s, index_t BatchStrideB1, std::array< ck::index_t, NumD1Tensor > BatchStrideD1s, index_t BatchStrideE1, A0ElementwiseOperation a0_element_op, B0ElementwiseOperation b0_element_op, CDE0ElementwiseOperation cde0_element_op, B1ElementwiseOperation b1_element_op, CDE1ElementwiseOperation cde1_element_op) override |
| std::unique_ptr< BaseInvoker > | MakeInvokerPointer () override |
| std::string | GetTypeString () const override |
| Public Member Functions inherited from ck::tensor_operation::device::BaseOperator | |
| BaseOperator ()=default | |
| BaseOperator (const BaseOperator &)=default | |
| BaseOperator & | operator= (const BaseOperator &)=default |
| virtual std::string | GetInstanceString () const |
| virtual std::string | GetTypeIdName () const |
| virtual std::optional< std::string > | GetObjectName () const |
| virtual std::optional< std::string > | GetTemplateInfo () const |
| virtual std::string | GetTypeIdHashCode () const |
| virtual size_t | GetWorkSpaceSize (const BaseArgument *) const |
| virtual void | SetWorkSpacePointer (BaseArgument *p_arg, void *p_workspace, const StreamConfig &=StreamConfig{}) const |
| virtual | ~BaseOperator () |
Static Public Member Functions | |
| static auto | MakeA0GridDescriptor_M_K (index_t MRaw, index_t KRaw, index_t StrideA0) |
| static auto | MakeB0GridDescriptor_N_K (index_t KRaw, index_t NRaw, index_t StrideB) |
| template<typename DLay> | |
| static auto | MakeD0GridDescriptor_M_N (index_t MRaw, index_t NRaw, index_t StrideD0) |
| static auto | MakeB1GridDescriptor_N_K (index_t KRaw, index_t NRaw, index_t StrideB) |
| template<typename ELay> | |
| static auto | MakeE1GridDescriptor_M_N (index_t MRaw, index_t NRaw, index_t StrideE1) |
| static auto | MakeD0sGridDescriptor_M_N (const std::array< index_t, NumD1Tensor > &MRaws, const std::array< index_t, NumD1Tensor > &NRaws, const std::array< index_t, NumD1Tensor > &DsStride) |
| static auto | MakeD1sGridDescriptor_M_N (const std::array< index_t, NumD1Tensor > &MRaws, const std::array< index_t, NumD1Tensor > &NRaws, const std::array< index_t, NumD1Tensor > &DsStride) |
| static constexpr bool | IsValidCompilationParameter () |
| template<typename RefLayout, typename DsLayout, const index_t NumDTensor> | |
| static bool | CheckDLayout () |
| static bool | IsSupportedArgument (const Argument &arg) |
| static auto | MakeArgument (const A0DataType *p_a0, const B0DataType *p_b0, std::array< const void *, NumD0Tensor > p_d0s, const B1DataType *p_b1, std::array< const void *, NumD1Tensor > p_d1s, E1DataType *p_e1, index_t MRaw, index_t NRaw, index_t KRaw, index_t Gemm1NRaw, index_t Batch, index_t StrideA0, index_t StrideB0, std::array< index_t, NumD0Tensor > StrideD0s, index_t StrideB1, std::array< index_t, NumD1Tensor > StrideD1s, index_t StrideE1, index_t BatchStrideA0, index_t BatchStrideB0, std::array< index_t, NumD0Tensor > BatchStrideD0s, index_t BatchStrideB1, std::array< index_t, NumD1Tensor > BatchStrideD1s, index_t BatchStrideE1, A0ElementwiseOperation a0_element_op, B0ElementwiseOperation b0_element_op, CDE0ElementwiseOperation cde0_element_op, B1ElementwiseOperation b1_element_op, CDE1ElementwiseOperation cde1_element_op) |
| static auto | MakeInvoker () |
Static Public Attributes | |
| static constexpr auto | Gemm0MXdlPerWave64 |
| static constexpr auto | Gemm0MXdlPerWave32 |
| static constexpr index_t | NumD0Tensor = D0sDataType::Size() |
| static constexpr index_t | NumD1Tensor = D1sDataType::Size() |
| static constexpr auto | I0 = Number<0>{} |
| static constexpr auto | I1 = Number<1>{} |
| static constexpr auto | I2 = Number<2>{} |
| static constexpr auto | I3 = Number<3>{} |
| static constexpr auto | I4 = Number<4>{} |
| static constexpr auto | I5 = Number<5>{} |
| static constexpr auto | I6 = Number<6>{} |
| static constexpr auto | I7 = Number<7>{} |
| static constexpr auto | I8 = Number<8>{} |
| static constexpr auto | I9 = Number<9>{} |
| static constexpr auto | gemm0_padder |
| static constexpr auto | gemm1_padder |
| Static Public Attributes inherited from ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, D0sDataType, B1DataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation > | |
| static constexpr index_t | NumD0Tensor = D0sDataType::Size() |
| static constexpr index_t | NumD1Tensor = D1sDataType::Size() |
Member Typedef Documentation
◆ A0GridDesc_AK0_M_AK1
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
| using ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::A0GridDesc_AK0_M_AK1 |
Initial value:
= remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultA0GridDescriptor_AK0_M_AK1(
A0GridDesc_M_K{}))>
__host__ static __device__ constexpr auto MakeDefaultA0GridDescriptor_AK0_M_AK1(const A0GridDesc_M_K &a0_grid_desc_m_k)
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:369
decltype(MakeA0GridDescriptor_M_K(1, 1, 1)) A0GridDesc_M_K
Definition device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp:457
◆ A0GridDesc_M_K
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
| using ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::A0GridDesc_M_K = decltype(MakeA0GridDescriptor_M_K(1, 1, 1)) |
◆ B0GridDesc_BK0_N_BK1
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
| using ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::B0GridDesc_BK0_N_BK1 |
Initial value:
= remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultB0GridDescriptor_BK0_N_BK1(
B0GridDesc_N_K{}))>
__host__ static __device__ constexpr auto MakeDefaultB0GridDescriptor_BK0_N_BK1(const B0GridDesc_N_K &b0_grid_desc_n_k)
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:386
decltype(MakeB0GridDescriptor_N_K(1, 1, 1)) B0GridDesc_N_K
Definition device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp:458
◆ B0GridDesc_N_K
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
| using ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::B0GridDesc_N_K = decltype(MakeB0GridDescriptor_N_K(1, 1, 1)) |
◆ B1GridDesc_BK0_N_BK1
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
| using ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::B1GridDesc_BK0_N_BK1 |
Initial value:
= remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultB1GridDescriptor_BK0_N_BK1(
B1GridDesc_N_K{}))>
__host__ static __device__ constexpr auto MakeDefaultB1GridDescriptor_BK0_N_BK1(const B1GridDesc_N_K &b1_grid_desc_n_k)
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:443
decltype(MakeB1GridDescriptor_N_K(1, 1, 1)) B1GridDesc_N_K
Definition device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp:460
◆ B1GridDesc_N_K
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
| using ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::B1GridDesc_N_K = decltype(MakeB1GridDescriptor_N_K(1, 1, 1)) |
◆ D0sGridDesc_M_N
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
| using ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::D0sGridDesc_M_N = remove_cvref_t<decltype(MakeD0sGridDescriptor_M_N({}, {}, {}))> |
◆ D1sGridDesc_M_N
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
| using ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::D1sGridDesc_M_N = remove_cvref_t<decltype(MakeD1sGridDescriptor_M_N({}, {}, {}))> |
◆ DeviceOp
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
| using ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::DeviceOp = DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle |
◆ E1GridDesc_M_N
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
| using ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::E1GridDesc_M_N = decltype(MakeE1GridDescriptor_M_N<E1Layout>(1, 1, 1)) |
◆ GridwiseGemm32
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
| using ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::GridwiseGemm32 = GridwiseGemmBase<Gemm0MXdlPerWave32> |
◆ GridwiseGemm64
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
| using ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::GridwiseGemm64 = GridwiseGemmBase<math::max(Gemm0MXdlPerWave64, 1)> |
◆ GridwiseGemmBase
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
template<index_t Gemm0MXdlPerWave_>
| using ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0Layout, B0Layout, D0sLayout, B1Layout, D1sLayout, E1Layout, A0DataType, B0DataType, Acc0DataType, D0sDataType, B1DataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, PadGemm0M, PadGemm0N, PadGemm0K, PadGemm1N, PadGemm1K, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::GridwiseGemmBase |
Member Function Documentation
◆ CheckDLayout()
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
template<typename RefLayout, typename DsLayout, const index_t NumDTensor>
|
inline static |
◆ GetTypeString()
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
|
inline override virtual |
Reimplemented from ck::tensor_operation::device::BaseOperator.
◆ IsSupportedArgument() [1/2]
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
|
inline static |
◆ IsSupportedArgument() [2/2]
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
|
inline override virtual |
Reimplemented from ck::tensor_operation::device::BaseOperator.
◆ IsValidCompilationParameter()
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
|
inline static constexpr |
◆ MakeA0GridDescriptor_M_K()
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
|
inline static |
◆ MakeArgument()
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
|
inline static |
◆ MakeArgumentPointer()
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
|
inline override virtual |
◆ MakeB0GridDescriptor_N_K()
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
|
inline static |
◆ MakeB1GridDescriptor_N_K()
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
|
inline static |
◆ MakeD0GridDescriptor_M_N()
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
template<typename DLay>
|
inline static |
◆ MakeD0sGridDescriptor_M_N()
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
|
inline static |
◆ MakeD1sGridDescriptor_M_N()
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
|
inline static |
◆ MakeE1GridDescriptor_M_N()
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
template<typename ELay>
|
inline static |
◆ MakeInvoker()
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
|
inline static |
◆ MakeInvokerPointer()
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
|
inline override virtual |
Member Data Documentation
◆ gemm0_padder
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
|
static constexpr |
Initial value:
=
Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock}
Definition matrix_padder.hpp:204
◆ Gemm0MXdlPerWave32
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
|
static constexpr |
Initial value:
= GetNXdlPerWave2<BlockSize,
Gemm0NPerBlock,
Gemm0MPerBlock,
Gemm0NPerXdl,
Gemm0MPerXdl,
Gemm0NXdlPerWave,
false>()
◆ Gemm0MXdlPerWave64
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
|
static constexpr |
Initial value:
= GetNXdlPerWave2<BlockSize,
Gemm0NPerBlock,
Gemm0MPerBlock,
Gemm0NPerXdl,
Gemm0MPerXdl,
Gemm0NXdlPerWave,
true>()
◆ gemm1_padder
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
|
static constexpr |
Initial value:
=
Gemm0MPerBlock, Gemm1NPerBlock, Gemm1KPerBlock}
◆ I0
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
|
static constexpr |
◆ I1
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
|
static constexpr |
◆ I2
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
|
static constexpr |
◆ I3
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
|
static constexpr |
◆ I4
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
|
static constexpr |
◆ I5
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
|
static constexpr |
◆ I6
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
|
static constexpr |
◆ I7
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
|
static constexpr |
◆ I8
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
|
static constexpr |
◆ I9
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
|
static constexpr |
◆ NumD0Tensor
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
|
static constexpr |
◆ NumD1Tensor
template<typename A0Layout, typename B0Layout, typename D0sLayout, typename B1Layout, typename D1sLayout, typename E1Layout, typename A0DataType, typename B0DataType, typename Acc0DataType, typename D0sDataType, typename B1DataType, typename Acc1DataType, typename C1ShuffleDataType, typename D1sDataType, typename E1DataType, typename A0ElementwiseOperation, typename B0ElementwiseOperation, typename CDE0ElementwiseOperation, typename B1ElementwiseOperation, typename CDE1ElementwiseOperation, bool PadGemm0M, bool PadGemm0N, bool PadGemm0K, bool PadGemm1N, bool PadGemm1K, index_t NumGemm0KPrefetchStage, index_t BlockSize, index_t Gemm0MPerBlock, index_t Gemm0NPerBlock, index_t Gemm0KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t A0K1, index_t B0K1, index_t B1K1, index_t Gemm0MPerXdl, index_t Gemm0NPerXdl, index_t Gemm0MXdlPerWave, index_t Gemm0NXdlPerWave, index_t Gemm1NXdlPerWave, typename A0BlockTransferThreadClusterLengths_AK0_M_AK1, typename A0BlockTransferThreadClusterArrangeOrder, typename A0BlockTransferSrcAccessOrder, index_t A0BlockTransferSrcVectorDim, index_t A0BlockTransferSrcScalarPerVector, index_t A0BlockTransferDstScalarPerVector_AK1, bool A0BlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_BK0_N_BK1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_BK1, bool B0BlockLdsExtraN, index_t CDE0BlockTransferSrcVectorDim, index_t CDE0BlockTransferSrcScalaerPerVector, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t C1ShuffleMXdlPerWavePerShuffle, index_t C1ShuffleGemm0NXdlPerWavePerShuffle, typename 
CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopScheduler LoopSched = LoopScheduler::Default>
|
static constexpr |
The documentation for this struct was generated from the following file: