18 typename ComputeTypeA,
19 typename ComputeTypeB,
21 typename AWmmaTileDesc,
22 typename BWmmaTileDesc,
23 index_t ABlockTransferSrcScalarPerVector,
24 index_t BBlockTransferSrcScalarPerVector,
33 bool TransposeC =
false>
60 static_assert(KPack % (
A_K1 *
A_KRow) == 0,
"wrong!");
61 static_assert(KPack % (
B_K1 *
B_KRow) == 0,
"wrong!");
75 ABlockTransferSrcScalarPerVector,
76 BBlockTransferSrcScalarPerVector,
97 template <index_t NBuffer>
105 template <
index_t ScaleSliceSizeN,
113 typename ThreadStaticBuffer,
114 typename BScaleThreadDesc>
117 __device__
BScale(GridDesc b_scale_grid_desc_,
118 ThreadCopy b_scale_thread_copy_,
119 GridBuffer b_scale_grid_buf_)
134 template <index_t NBuffer>
177 return threadid_to_wave_idx_adaptor.CalculateBottomIndex(
make_multi_index(thread_id));
184 const auto waveId_m = wave_idx[
I0];
186 const auto wmma_a_idx =
wmma_gemm.CalculateAThreadOriginDataIndex();
188#if defined(__gfx12__)
189 const auto wmma_krow =
wmma_gemm.GetSubGroupId();
191 const auto wmma_krow = 0;
195 return make_tuple(0, 0, waveId_m, wmma_krow, wmma_a_idx, 0);
202 const auto waveId_n = wave_idx[
I1];
204 const auto wmma_b_idx =
wmma_gemm.CalculateBThreadOriginDataIndex();
206#if defined(__gfx12__)
207 const auto wmma_krow =
wmma_gemm.GetSubGroupId();
209 const auto wmma_krow = 0;
213 return make_tuple(0, 0, waveId_n, wmma_krow, wmma_b_idx, 0);
216 template <index_t m0, index_t n0>
221 const auto waveId_m = wave_idx[
I0];
222 const auto waveId_n = wave_idx[
I1];
224 const auto blk_idx =
wmma_gemm.GetBeginOfThreadBlk();
236 const index_t c_thread_m = mrepeat_mwave_mperwmma_to_m_adaptor.CalculateBottomIndex(
238 const index_t c_thread_n = nrepeat_nwave_nperwmma_to_n_adaptor.CalculateBottomIndex(
268 static_assert(AWmmaTileDesc::IsKnownAtCompileTime() &&
269 BWmmaTileDesc::IsKnownAtCompileTime(),
270 "wrong! Desc should be known at compile-time");
273 "ThisThreadBlock::GetNumOfThread() != MWaves * NWaves * WaveSize");
275 static_assert(MPerBlock % (MPerWmma * MRepeat) == 0 &&
276 NPerBlock % (NPerWmma * NRepeat) == 0,
281 __host__ __device__
static constexpr auto
284 constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens =
285 wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths();
287 constexpr auto NAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[
I2];
296 wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths()[
I2];
298 __host__ __device__
static constexpr auto
301 constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens =
302 wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths();
304 constexpr auto AccStride = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[
I3];
318 __host__ __device__
static constexpr auto
321 constexpr auto c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma =
330 .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs(
331 c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma);
__host__ __device__ constexpr auto make_multi_index(Xs &&... xs)
Definition array_multi_index.hpp:15
typename detail::StaticallyIndexedArrayImpl< T, N >::type StaticallyIndexedArray
Definition utility/statically_indexed_array.hpp:45
int32_t index_t
Definition ck.hpp:299
__host__ __device__ constexpr auto make_naive_tensor_descriptor(const Tuple< Lengths... > &lengths, const Tuple< Strides... > &strides)
Definition tensor_descriptor_helper.hpp:49
__host__ __device__ constexpr auto make_single_stage_tensor_adaptor(const Transforms &transforms, LowerDimensionOldTopIdss, UpperDimensionNewTopIdss)
Definition tensor_description/tensor_adaptor.hpp:425
integral_constant< index_t, N > Number
Definition number.hpp:12
@ Vgpr
Definition amd_address_space.hpp:20
__host__ __device__ constexpr auto make_merge_transform(const LowLengths &low_lengths)
Definition multi_index_transform_helper.hpp:55
constexpr detail::ignore_t ignore
Definition utility/ignore.hpp:20
__host__ __device__ constexpr auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition tensor_descriptor_helper.hpp:101
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
__host__ __device__ constexpr auto make_unmerge_transform(const UpLengths &up_lengths, integral_constant< bool, Use24BitIntegerCalculation >=integral_constant< bool, false >{})
Definition multi_index_transform_helper.hpp:90
static constexpr index_t num_scale_krepeat
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:125
GridBuffer b_scale_grid_buf
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:162
__device__ BScale(GridDesc b_scale_grid_desc_, ThreadCopy b_scale_thread_copy_, GridBuffer b_scale_grid_buf_)
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:117
StaticallyIndexedArray< ThreadStaticBuffer, Number< NumberOfBuffers >{}> b_scale_thread_bufs
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:163
static constexpr auto b_scale_thread_copy_step
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:129
static constexpr index_t num_scale_k_block
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:124
static constexpr auto b_scale_thread_desc
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:127
__device__ void GlobalLoad(bool cond)
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:135
ThreadCopy b_scale_thread_copy
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:160
GridDesc b_scale_grid_desc
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:161
__device__ Empty()
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:96
__device__ void GlobalLoad(bool cond)
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:98
ThreadwiseTensorSliceTransfer_v4< ADataType, ComputeTypeA, decltype(a_block_desc_k0_m0_m1_m2_k1), decltype(a_thread_desc_), Sequence< KPack/A_K1/A_KRow, 1, 1, 1, 1, A_K1 >, Sequence< 0, 1, 2, 3, 4, 5 >, 5, A_K1, A_K1 > AThreadCopy
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:372
ck::BlockwiseGemmWmmaops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerWmma, NPerWmma, wmma_gemm.wmma_instr.k_per_wmma > HotLoopInstList
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:70
__host__ __device__ constexpr auto & GetCThreadBuffer()
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:166
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::a_thread_desc_ static constexpr auto a_thread_desc_
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:340
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::a_thread_copy_ AThreadCopy a_thread_copy_
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:394
__host__ static __device__ constexpr auto GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs()
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:319
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::b_block_desc_k0_n0_n1_n2_k1 static constexpr BWmmaTileDesc b_block_desc_k0_n0_n1_n2_k1
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:337
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::I1 static constexpr auto I1
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:37
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::A_K1 static constexpr index_t A_K1
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:57
static __device__ auto CalculateCThreadOriginDataIndex(Number< m0 >, Number< n0 >)
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:217
static __device__ auto CalculateBThreadOriginDataIndex()
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:198
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::c_thread_buf_ StaticBufferTupleOfVector< AddressSpaceEnum::Vgpr, AccDataType, MRepeat *NRepeat, wmma_gemm.GetRegSizePerWmma(), true > c_thread_buf_
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:92
__host__ static __device__ constexpr auto GetCThreadDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs()
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:282
__host__ __device__ BlockwiseGemmWmmaops_pipeline_base(Tuple6 a_origin=CalculateAThreadOriginDataIndex(), Tuple6 b_origin=CalculateBThreadOriginDataIndex())
Constructor for BlockwiseGemmWmmaops_pipeline_base.
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:264
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::b_thread_desc_ static constexpr auto b_thread_desc_
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:354
ThreadwiseTensorSliceTransfer_v4< BDataType, ComputeTypeB, decltype(b_block_desc_k0_n0_n1_n2_k1), decltype(b_thread_desc_), Sequence< KPack/B_K1/B_KRow, 1, 1, 1, 1, B_K1 >, Sequence< 0, 1, 2, 3, 4, 5 >, 5, B_K1, B_K1 > BThreadCopy
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:383
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::a_block_desc_k0_m0_m1_m2_k1 static constexpr AWmmaTileDesc a_block_desc_k0_m0_m1_m2_k1
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:336
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::MWaves static constexpr index_t MWaves
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:46
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::wmma_gemm static constexpr auto wmma_gemm
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:63
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::B_KRow static constexpr index_t B_KRow
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:54
static __device__ auto GetWaveIdx()
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:168
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::I3 static constexpr auto I3
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:39
__host__ static __device__ constexpr auto GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs()
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:299
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::I0 static constexpr auto I0
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:36
ThisThreadBlock< BlockSize > ThisThreadBlock
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:42
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::B_K1 static constexpr index_t B_K1
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:58
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::MAccVgprs static constexpr auto MAccVgprs
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:295
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::WaveSize static constexpr index_t WaveSize
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:44
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::WmmaK static constexpr auto WmmaK
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:68
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::I5 static constexpr auto I5
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:40
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::b_thread_copy_ BThreadCopy b_thread_copy_
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:395
decltype(CalculateAThreadOriginDataIndex()) Tuple6
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:244
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::KRepeat static constexpr index_t KRepeat
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:66
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::NWaves static constexpr index_t NWaves
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:47
static __device__ auto CalculateAThreadOriginDataIndex()
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:180
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::A_KRow static constexpr index_t A_KRow
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:53
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::c_thread_desc_ static constexpr auto c_thread_desc_
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:369
ck::BlockwiseGemmWmmaops_pipeline_base< BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >::I2 static constexpr auto I2
Definition blockwise_gemm_pipeline_wmmaops_base.hpp:38
Definition blockwise_gemm_pipeline_wmmaops.hpp:26
Definition utility/sequence.hpp:43
Definition static_buffer.hpp:75
static __device__ constexpr index_t GetNumOfThread()
Definition thread_group.hpp:15
static __device__ index_t GetThreadId()
Definition thread_group.hpp:19
Definition threadwise_tensor_slice_transfer.hpp:1260
Definition wmma_gemm.hpp:663
Definition functional2.hpp:33