GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer > Struct Template Reference

GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer > Struct Template Reference

Composable Kernel: ck::GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer > Struct Template Reference
ck::GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer > Struct Template Reference

#include <gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp>

Public Types

using ThisThreadBlock = ThisThreadBlock<BlockSize>
using GridwiseGemmPipe
using CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
using C0GridDescriptor_NBlock_NPerBlock
using DefaultBlock2CTileMap

Static Public Member Functions

__host__ static __device__ constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1 ()
__host__ static __device__ constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1 ()
__host__ static __device__ constexpr auto GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ()
__host__ static __device__ constexpr index_t GetSharedMemoryNumberOfByte ()
template<InMemoryDataOperationEnum CGlobalMemoryDataOperation_ = InMemoryDataOperationEnum::Set>
static __device__ bool constexpr IsValidCompilationParameter ()
template<typename Block2CTileMap>
__host__ static __device__ constexpr bool CheckValidity (const AGridDesc_AK0_M_AK1 &a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 &b_grid_desc_bk0_n_bk1, const CGridDesc_M_N &c_grid_desc_m_n, const Block2CTileMap &block_2_ctile_map)
__host__ static __device__ constexpr bool CalculateHasMainKBlockLoop (index_t K)
__host__ static __device__ constexpr auto MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock (const CGridDesc_M_N &c_grid_desc_m_n)
__host__ static __device__ constexpr auto MakeC0GridDescriptor_NBlock_NPerBlock (const C0GridDesc_N &c0_grid_desc_n)
__host__ static __device__ constexpr auto MakeDefaultBlock2CTileMap (const CGridDesc_M_N &c_grid_desc_m_n)
template<bool HasMainKBlockLoop, typename Block2CTileMap>
static __device__ void Run (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const FloatC0 *__restrict__ p_c0_bias_grid, const FloatC0 *__restrict__ p_c0_add_grid, const FloatC0 *__restrict__ p_c0_gamma_grid, const FloatC0 *__restrict__ p_c0_beta_grid, void *__restrict__ p_shared, const AElementwiseOperation &a_element_op, const BElementwiseOperation &b_element_op, const AccElementwiseOperation &acc_element_op, const CElementwiseOperation &c_element_op, const AGridDesc_AK0_M_AK1 &a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 &b_grid_desc_bk0_n_bk1, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock &c_grid_desc_mblock_mperblock_nblock_nperblock, const C0GridDescriptor_NBlock_NPerBlock &c0_grid_desc_nblock_nperblock, const Block2CTileMap &block_2_ctile_map)

Static Public Attributes

static constexpr auto I0 = Number<0>{}
static constexpr auto I1 = Number<1>{}
static constexpr auto I2 = Number<2>{}
static constexpr auto I3 = Number<3>{}
static constexpr auto I4 = Number<4>{}
static constexpr auto I5 = Number<5>{}
static constexpr auto I6 = Number<6>{}
static constexpr auto I7 = Number<7>{}
static constexpr auto AK0 = Number<KPerBlock / AK1Value>{}
static constexpr auto BK0 = Number<KPerBlock / BK1Value>{}
static constexpr auto AK1 = Number<AK1Value>{}
static constexpr auto BK1 = Number<BK1Value>{}

Member Typedef Documentation

◆ C0GridDescriptor_NBlock_NPerBlock

template<typename FloatAB, typename FloatGemmAcc, typename FloatCShuffle, typename FloatC, typename FloatC0, typename FloatReduceAcc, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDesc_M_N, typename C0GridDesc_N, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched, PipelineVersion PipelineVer = PipelineVersion::v1>
using ck::GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer >::C0GridDescriptor_NBlock_NPerBlock
Initial value:
remove_cvref_t<decltype(MakeC0GridDescriptor_NBlock_NPerBlock(C0GridDesc_N{}))>
remove_cv_t< remove_reference_t< T > > remove_cvref_t
Definition type.hpp:297
__host__ static __device__ constexpr auto MakeC0GridDescriptor_NBlock_NPerBlock(const C0GridDesc_N &c0_grid_desc_n)
Definition gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp:344

◆ CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock

template<typename FloatAB, typename FloatGemmAcc, typename FloatCShuffle, typename FloatC, typename FloatC0, typename FloatReduceAcc, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDesc_M_N, typename C0GridDesc_N, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched, PipelineVersion PipelineVer = PipelineVersion::v1>
using ck::GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer >::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
Initial value:
remove_cvref_t<decltype(MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(CGridDesc_M_N{}))>
__host__ static __device__ constexpr auto MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const CGridDesc_M_N &c_grid_desc_m_n)
Definition gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp:324

◆ DefaultBlock2CTileMap

template<typename FloatAB, typename FloatGemmAcc, typename FloatCShuffle, typename FloatC, typename FloatC0, typename FloatReduceAcc, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDesc_M_N, typename C0GridDesc_N, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched, PipelineVersion PipelineVer = PipelineVersion::v1>
using ck::GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer >::DefaultBlock2CTileMap
Initial value:
remove_cvref_t<decltype(MakeDefaultBlock2CTileMap(CGridDesc_M_N{}))>
__host__ static __device__ constexpr auto MakeDefaultBlock2CTileMap(const CGridDesc_M_N &c_grid_desc_m_n)
Definition gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp:360

◆ GridwiseGemmPipe

template<typename FloatAB, typename FloatGemmAcc, typename FloatCShuffle, typename FloatC, typename FloatC0, typename FloatReduceAcc, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDesc_M_N, typename C0GridDesc_N, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched, PipelineVersion PipelineVer = PipelineVersion::v1>
using ck::GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer >::GridwiseGemmPipe
Initial value:
constexpr auto GridwiseGemmPipeline_Selector()
Definition gridwise_gemm_pipeline_selector.hpp:31

◆ ThisThreadBlock

template<typename FloatAB, typename FloatGemmAcc, typename FloatCShuffle, typename FloatC, typename FloatC0, typename FloatReduceAcc, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDesc_M_N, typename C0GridDesc_N, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched, PipelineVersion PipelineVer = PipelineVersion::v1>
using ck::GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer >::ThisThreadBlock = ThisThreadBlock<BlockSize>

Member Function Documentation

◆ CalculateHasMainKBlockLoop()

template<typename FloatAB, typename FloatGemmAcc, typename FloatCShuffle, typename FloatC, typename FloatC0, typename FloatReduceAcc, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDesc_M_N, typename C0GridDesc_N, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched, PipelineVersion PipelineVer = PipelineVersion::v1>
__host__ static __device__ constexpr bool ck::GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer >::CalculateHasMainKBlockLoop ( index_t K)
inline static constexpr

◆ CheckValidity()

template<typename FloatAB, typename FloatGemmAcc, typename FloatCShuffle, typename FloatC, typename FloatC0, typename FloatReduceAcc, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDesc_M_N, typename C0GridDesc_N, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched, PipelineVersion PipelineVer = PipelineVersion::v1>
template<typename Block2CTileMap>
__host__ static __device__ constexpr bool ck::GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer >::CheckValidity ( const AGridDesc_AK0_M_AK1 & a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1 & b_grid_desc_bk0_n_bk1,
const CGridDesc_M_N & c_grid_desc_m_n,
const Block2CTileMap & block_2_ctile_map )
inline static constexpr

◆ GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()

template<typename FloatAB, typename FloatGemmAcc, typename FloatCShuffle, typename FloatC, typename FloatC0, typename FloatReduceAcc, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDesc_M_N, typename C0GridDesc_N, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched, PipelineVersion PipelineVer = PipelineVersion::v1>
__host__ static __device__ constexpr auto ck::GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer >::GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1 ( )
inline static constexpr

◆ GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()

template<typename FloatAB, typename FloatGemmAcc, typename FloatCShuffle, typename FloatC, typename FloatC0, typename FloatReduceAcc, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDesc_M_N, typename C0GridDesc_N, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched, PipelineVersion PipelineVer = PipelineVersion::v1>
__host__ static __device__ constexpr auto ck::GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer >::GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1 ( )
inline static constexpr

◆ GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock()

template<typename FloatAB, typename FloatGemmAcc, typename FloatCShuffle, typename FloatC, typename FloatC0, typename FloatReduceAcc, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDesc_M_N, typename C0GridDesc_N, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched, PipelineVersion PipelineVer = PipelineVersion::v1>
__host__ static __device__ constexpr auto ck::GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer >::GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ( )
inline static constexpr

◆ GetSharedMemoryNumberOfByte()

template<typename FloatAB, typename FloatGemmAcc, typename FloatCShuffle, typename FloatC, typename FloatC0, typename FloatReduceAcc, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDesc_M_N, typename C0GridDesc_N, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched, PipelineVersion PipelineVer = PipelineVersion::v1>
__host__ static __device__ constexpr index_t ck::GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer >::GetSharedMemoryNumberOfByte ( )
inline static constexpr

◆ IsValidCompilationParameter()

template<typename FloatAB, typename FloatGemmAcc, typename FloatCShuffle, typename FloatC, typename FloatC0, typename FloatReduceAcc, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDesc_M_N, typename C0GridDesc_N, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched, PipelineVersion PipelineVer = PipelineVersion::v1>
template<InMemoryDataOperationEnum CGlobalMemoryDataOperation_ = InMemoryDataOperationEnum::Set>
__device__ bool constexpr ck::GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer >::IsValidCompilationParameter ( )
inline static constexpr

◆ MakeC0GridDescriptor_NBlock_NPerBlock()

template<typename FloatAB, typename FloatGemmAcc, typename FloatCShuffle, typename FloatC, typename FloatC0, typename FloatReduceAcc, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDesc_M_N, typename C0GridDesc_N, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched, PipelineVersion PipelineVer = PipelineVersion::v1>
__host__ static __device__ constexpr auto ck::GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer >::MakeC0GridDescriptor_NBlock_NPerBlock ( const C0GridDesc_N & c0_grid_desc_n)
inline static constexpr

◆ MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock()

template<typename FloatAB, typename FloatGemmAcc, typename FloatCShuffle, typename FloatC, typename FloatC0, typename FloatReduceAcc, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDesc_M_N, typename C0GridDesc_N, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched, PipelineVersion PipelineVer = PipelineVersion::v1>
__host__ static __device__ constexpr auto ck::GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer >::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ( const CGridDesc_M_N & c_grid_desc_m_n)
inline static constexpr

◆ MakeDefaultBlock2CTileMap()

template<typename FloatAB, typename FloatGemmAcc, typename FloatCShuffle, typename FloatC, typename FloatC0, typename FloatReduceAcc, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDesc_M_N, typename C0GridDesc_N, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched, PipelineVersion PipelineVer = PipelineVersion::v1>
__host__ static __device__ constexpr auto ck::GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer >::MakeDefaultBlock2CTileMap ( const CGridDesc_M_N & c_grid_desc_m_n)
inline static constexpr

◆ Run()

template<typename FloatAB, typename FloatGemmAcc, typename FloatCShuffle, typename FloatC, typename FloatC0, typename FloatReduceAcc, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDesc_M_N, typename C0GridDesc_N, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched, PipelineVersion PipelineVer = PipelineVersion::v1>
template<bool HasMainKBlockLoop, typename Block2CTileMap>
__device__ void ck::GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer >::Run ( const FloatAB *__restrict__ p_a_grid,
const FloatAB *__restrict__ p_b_grid,
FloatC *__restrict__ p_c_grid,
const FloatC0 *__restrict__ p_c0_bias_grid,
const FloatC0 *__restrict__ p_c0_add_grid,
const FloatC0 *__restrict__ p_c0_gamma_grid,
const FloatC0 *__restrict__ p_c0_beta_grid,
void *__restrict__ p_shared,
const AElementwiseOperation & a_element_op,
const BElementwiseOperation & b_element_op,
const AccElementwiseOperation & acc_element_op,
const CElementwiseOperation & c_element_op,
const AGridDesc_AK0_M_AK1 & a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1 & b_grid_desc_bk0_n_bk1,
const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock & c_grid_desc_mblock_mperblock_nblock_nperblock,
const C0GridDescriptor_NBlock_NPerBlock & c0_grid_desc_nblock_nperblock,
const Block2CTileMap & block_2_ctile_map )
inline static

Member Data Documentation

◆ AK0

template<typename FloatAB, typename FloatGemmAcc, typename FloatCShuffle, typename FloatC, typename FloatC0, typename FloatReduceAcc, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDesc_M_N, typename C0GridDesc_N, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched, PipelineVersion PipelineVer = PipelineVersion::v1>
auto ck::GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer >::AK0 = Number<KPerBlock / AK1Value>{}
static constexpr

◆ AK1

template<typename FloatAB, typename FloatGemmAcc, typename FloatCShuffle, typename FloatC, typename FloatC0, typename FloatReduceAcc, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDesc_M_N, typename C0GridDesc_N, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched, PipelineVersion PipelineVer = PipelineVersion::v1>
auto ck::GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer >::AK1 = Number<AK1Value>{}
static constexpr

◆ BK0

template<typename FloatAB, typename FloatGemmAcc, typename FloatCShuffle, typename FloatC, typename FloatC0, typename FloatReduceAcc, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDesc_M_N, typename C0GridDesc_N, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched, PipelineVersion PipelineVer = PipelineVersion::v1>
auto ck::GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer >::BK0 = Number<KPerBlock / BK1Value>{}
static constexpr

◆ BK1

template<typename FloatAB, typename FloatGemmAcc, typename FloatCShuffle, typename FloatC, typename FloatC0, typename FloatReduceAcc, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDesc_M_N, typename C0GridDesc_N, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched, PipelineVersion PipelineVer = PipelineVersion::v1>
auto ck::GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer >::BK1 = Number<BK1Value>{}
static constexpr

◆ I0

template<typename FloatAB, typename FloatGemmAcc, typename FloatCShuffle, typename FloatC, typename FloatC0, typename FloatReduceAcc, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDesc_M_N, typename C0GridDesc_N, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched, PipelineVersion PipelineVer = PipelineVersion::v1>
auto ck::GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer >::I0 = Number<0>{}
static constexpr

◆ I1

template<typename FloatAB, typename FloatGemmAcc, typename FloatCShuffle, typename FloatC, typename FloatC0, typename FloatReduceAcc, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDesc_M_N, typename C0GridDesc_N, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched, PipelineVersion PipelineVer = PipelineVersion::v1>
auto ck::GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer >::I1 = Number<1>{}
static constexpr

◆ I2

template<typename FloatAB, typename FloatGemmAcc, typename FloatCShuffle, typename FloatC, typename FloatC0, typename FloatReduceAcc, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDesc_M_N, typename C0GridDesc_N, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched, PipelineVersion PipelineVer = PipelineVersion::v1>
auto ck::GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer >::I2 = Number<2>{}
static constexpr

◆ I3

template<typename FloatAB, typename FloatGemmAcc, typename FloatCShuffle, typename FloatC, typename FloatC0, typename FloatReduceAcc, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDesc_M_N, typename C0GridDesc_N, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched, PipelineVersion PipelineVer = PipelineVersion::v1>
auto ck::GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer >::I3 = Number<3>{}
static constexpr

◆ I4

template<typename FloatAB, typename FloatGemmAcc, typename FloatCShuffle, typename FloatC, typename FloatC0, typename FloatReduceAcc, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDesc_M_N, typename C0GridDesc_N, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched, PipelineVersion PipelineVer = PipelineVersion::v1>
auto ck::GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer >::I4 = Number<4>{}
static constexpr

◆ I5

template<typename FloatAB, typename FloatGemmAcc, typename FloatCShuffle, typename FloatC, typename FloatC0, typename FloatReduceAcc, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDesc_M_N, typename C0GridDesc_N, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched, PipelineVersion PipelineVer = PipelineVersion::v1>
auto ck::GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer >::I5 = Number<5>{}
static constexpr

◆ I6

template<typename FloatAB, typename FloatGemmAcc, typename FloatCShuffle, typename FloatC, typename FloatC0, typename FloatReduceAcc, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDesc_M_N, typename C0GridDesc_N, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched, PipelineVersion PipelineVer = PipelineVersion::v1>
auto ck::GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer >::I6 = Number<6>{}
static constexpr

◆ I7

template<typename FloatAB, typename FloatGemmAcc, typename FloatCShuffle, typename FloatC, typename FloatC0, typename FloatReduceAcc, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDesc_M_N, typename C0GridDesc_N, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t MPerXdl, index_t NPerXdl, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool AThreadTransferSrcResetCoordinateAfterRun, index_t ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BThreadTransferSrcResetCoordinateAfterRun, index_t BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, typename CReduceThreadClusterLengths_MPerBlock_NPerBlock, index_t CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopScheduler LoopSched, PipelineVersion PipelineVer = PipelineVersion::v1>
auto ck::GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< FloatAB, FloatGemmAcc, FloatCShuffle, FloatC, FloatC0, FloatReduceAcc, AElementwiseOperation, BElementwiseOperation, AccElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc_AK0_M_AK1, BGridDesc_BK0_N_BK1, CGridDesc_M_N, C0GridDesc_N, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1Value, BK1Value, MPerXdl, NPerXdl, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, AThreadTransferSrcResetCoordinateAfterRun, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BThreadTransferSrcResetCoordinateAfterRun, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, CReduceThreadClusterLengths_MPerBlock_NPerBlock, CReduceThreadCopySrcDstScalarPerVector_NPerBlock, LoopSched, PipelineVer >::I7 = Number<7>{}
static constexpr

The documentation for this struct was generated from the following file: