|
| constexpr unsigned int | fnv1a_hash (std::string_view str, unsigned int h=2166136261u) |
| std::string | get_device_name () |
| bool | is_gfx12_supported () |
| bool | is_gfx11_supported () |
| bool | is_xdl_supported () |
| template<typename ADataType, typename BDataType, index_t MPerXDL, index_t NPerXDL> |
| bool | is_xdl_wmma_supported () |
| bool | is_lds_direct_load_supported () |
| bool | is_bf16_atomic_supported () |
| bool | is_gfx101_supported () |
| bool | is_gfx103_supported () |
| bool | is_wmma_supported () |
| bool | is_tf32_supported () |
| template<typename T, typename ForwardIterator, typename Size, typename BinaryOperation> |
| auto | accumulate_n (ForwardIterator first, Size count, T init, BinaryOperation op) -> decltype(std::accumulate(first, std::next(first, count), init, op)) |
| unsigned int | get_available_cpu_cores () |
| template<typename... In, typename... Wei, typename... Out, typename ConvStrides, typename ConvDilations, typename InLeftPads, typename InRightPads, index_t GemmK1Value> |
| __host__ __device__ constexpr auto | transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk_pad (const TensorDescriptor< In... > &in_grid_desc_n_di_hi_wi_c, const TensorDescriptor< Wei... > &wei_k_z_y_x_c_grid_desc, const TensorDescriptor< Out... > &out_n_do_ho_wo_k_grid_desc, const ConvStrides &conv_strides, const ConvDilations &conv_dilations, const InLeftPads &in_left_pads, const InRightPads &in_right_pads, Number< GemmK1Value >) |
| template<AddressSpaceEnum AddressSpace, typename T, typename TensorDesc, typename enable_if< TensorDesc::IsKnownAtCompileTime(), bool >::type = false> |
| __host__ __device__ constexpr auto | make_static_tensor (TensorDesc) |
| template<AddressSpaceEnum AddressSpace, typename T, typename TensorDesc, typename X, typename enable_if< TensorDesc::IsKnownAtCompileTime(), bool >::type = false, typename enable_if< is_same< remove_cvref_t< T >, remove_cvref_t< X > >::value, bool >::type = false> |
| __host__ __device__ constexpr auto | make_static_tensor (TensorDesc, X invalid_element_value) |
| template<typename Lengths, typename ArrangeOrder = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type> |
| __host__ __device__ constexpr auto | make_cluster_descriptor (const Lengths &lengths, ArrangeOrder order=typename arithmetic_sequence_gen< 0, Lengths::Size(), 1 >::type{}) |
| template<typename LowLength> |
| __host__ __device__ constexpr auto | make_pass_through_transform (const LowLength &low_length) |
| template<typename LowLength, typename LeftPad, typename RightPad, bool SkipIsValidCheck = false> |
| __host__ __device__ constexpr auto | make_pad_transform (const LowLength &low_length, const LeftPad &left_pad, const RightPad &right_pad, integral_constant< bool, SkipIsValidCheck >=integral_constant< bool, false >{}) |
| template<typename LowLength, typename LeftPadLength, bool SkipIsValidCheck = false> |
| __host__ __device__ constexpr auto | make_left_pad_transform (const LowLength &low_length, const LeftPadLength &left_pad, integral_constant< bool, SkipIsValidCheck >=integral_constant< bool, false >{}) |
| template<typename LowLength, typename RightPadLength, bool SkipIsValidCheck = false> |
| __host__ __device__ constexpr auto | make_right_pad_transform (const LowLength &low_length, const RightPadLength &right_pad, integral_constant< bool, SkipIsValidCheck >=integral_constant< bool, false >{}) |
| template<typename UpLengths, typename Coefficients, typename enable_if< UpLengths::Size()==Coefficients::Size(), bool >::type = false> |
| __host__ __device__ constexpr auto | make_embed_transform (const UpLengths &up_lengths, const Coefficients &coefficients) |
| template<typename LowLengths> |
| __host__ __device__ constexpr auto | make_merge_transform (const LowLengths &low_lengths) |
| template<typename LowLengths> |
| __host__ __device__ constexpr auto | make_merge_transform_v1_carry_check (const LowLengths &low_lengths) |
| template<typename LowLengths> |
| __host__ __device__ constexpr auto | make_merge_transform_v2_magic_division (const LowLengths &low_lengths) |
| template<typename LowLengths> |
| __host__ __device__ constexpr auto | make_merge_transform_v3_division_mod (const LowLengths &low_lengths) |
| template<typename UpLengths, bool Use24BitIntegerCalculation = false> |
| __host__ __device__ constexpr auto | make_unmerge_transform (const UpLengths &up_lengths, integral_constant< bool, Use24BitIntegerCalculation >=integral_constant< bool, false >{}) |
| __host__ __device__ constexpr auto | make_conv_bwd_data_out_transform (index_t N, index_t Ho, index_t Wo, index_t K, index_t YDot, index_t XDot, index_t HTilde, index_t WTilde, index_t ConvDilationH, index_t ConvDilationW, index_t HTildeSlice, index_t WTildeSlice, index_t YDotSlice, index_t XDotSlice, index_t IHTildeSliceBegin, index_t IWTildeSliceBegin, index_t GcdStrideDilationH, index_t GcdStrideDilationW, index_t K0, index_t K1, index_t MPerBlock, index_t GemmKPerBlock) |
| template<typename LowerIndex> |
| __host__ __device__ constexpr auto | make_freeze_transform (const LowerIndex &low_idx) |
| template<typename UpperIndex> |
| __host__ __device__ constexpr auto | make_insert_transform (const UpperIndex &up_idx) |
| template<typename LowLength, typename SliceBegin, typename SliceEnd> |
| __host__ __device__ constexpr auto | make_slice_transform (const LowLength &low_length, const SliceBegin &slice_begin, const SliceEnd &slice_end) |
| template<typename VectorSize, typename UpLength> |
| __host__ __device__ constexpr auto | make_vectorize_transform (const VectorSize &vector_size, const UpLength &up_length) |
| template<typename Modulus, typename UpLength> |
| __host__ __device__ constexpr auto | make_modulo_transform (const Modulus &modulus, const UpLength &up_length) |
| template<typename LowLengths> |
| __host__ __device__ constexpr auto | make_xor_with_modulo_transform (const LowLengths &low_lengths) |
| template<typename LowLengths> |
| __host__ __device__ constexpr auto | make_xor_transform (const LowLengths &low_lengths) |
| template<typename TensorAdaptor0, typename TensorAdaptor1> |
| __host__ __device__ constexpr auto | chain_tensor_adaptors (const TensorAdaptor0 &adaptor0, const TensorAdaptor1 &adaptor1) |
| template<typename Transforms, typename LowerDimensionOldTopIdss, typename UpperDimensionNewTopIdss> |
| __host__ __device__ constexpr auto | make_single_stage_tensor_adaptor (const Transforms &transforms, LowerDimensionOldTopIdss, UpperDimensionNewTopIdss) |
| template<typename X, typename... Xs, typename enable_if< sizeof...(Xs) > = 2, bool, ::type = false> |
| __host__ __device__ constexpr auto | chain_tensor_adaptors (const X &x, const Xs &... xs) |
| template<typename OldTensorDescriptor, typename NewTransforms, typename NewLowerDimensionOldVisibleIdss, typename NewUpperDimensionNewVisibleIdss> |
| __host__ __device__ constexpr auto | transform_tensor_descriptor (const OldTensorDescriptor &old_tensor_desc, const NewTransforms &new_transforms, NewLowerDimensionOldVisibleIdss, NewUpperDimensionNewVisibleIdss) |
| template<typename TensorDesc, typename VisibleIndex> |
| __host__ __device__ constexpr auto | make_tensor_coordinate (const TensorDesc &tensor_desc, const VisibleIndex &idx_visible) |
| template<typename TensorDesc, typename VisibleIndex, typename UpdateLowerIndexHack> |
| __host__ __device__ constexpr auto | make_tensor_coordinate_step (const TensorDesc &, const VisibleIndex &idx_diff_visible, UpdateLowerIndexHack) |
| template<typename TensorDesc, typename VisibleIndex> |
| __host__ __device__ constexpr auto | make_tensor_coordinate_step (const TensorDesc &, const VisibleIndex &idx_diff_visible) |
| template<typename TensorDesc, typename TensorCoord, typename TensorCoordStep> |
| __host__ __device__ constexpr void | move_tensor_coordinate (const TensorDesc &tensor_desc, TensorCoord &coord, const TensorCoordStep &coord_step) |
| template<typename TensorDesc, typename TensorCoord> |
| __host__ __device__ constexpr bool | coordinate_has_valid_offset_assuming_visible_index_is_valid (const TensorDesc &tensor_desc, const TensorCoord &coord) |
| template<typename TensorDesc, typename TensorCoord> |
| __host__ __device__ constexpr bool | coordinate_has_valid_offset (const TensorDesc &tensor_desc, const TensorCoord &coord) |
| template<typename... Lengths, typename... Strides, typename enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type = false> |
| __host__ __device__ constexpr auto | make_naive_tensor_descriptor (const Tuple< Lengths... > &lengths, const Tuple< Strides... > &strides) |
| template<typename... Lengths> |
| __host__ __device__ constexpr auto | make_naive_tensor_descriptor_packed (const Tuple< Lengths... > &lengths) |
| template<typename... Lengths, typename Align> |
| __host__ __device__ constexpr auto | make_naive_tensor_descriptor_aligned (const Tuple< Lengths... > &lengths, Align align) |
| template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType, typename BDataType, typename ComputeTypeA, typename ComputeTypeB, typename AccDataType, typename AWmmaTileDesc, typename BWmmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false> |
| constexpr auto | BlockGemmPipeline_Selector () |
| template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType, typename BDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack> |
| constexpr auto | BlockGemmABScalePipeline_Selector () |
| template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType, typename AScaleDataType, typename BDataType, typename BScaleDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false> |
| constexpr auto | BlockGemmMXBPreshufflePipeline_Selector () |
| template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType, typename BDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false> |
| constexpr auto | BlockGemmBPreshufflePipeline_Selector () |
| template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType, typename BDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack> |
| constexpr auto | BlockGemmPipeline_Selector () |
| template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType, typename BDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MScaleBlock, index_t NScaleBlock, index_t KScaleBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack> |
| constexpr auto | BlockGemmBlockScaleBPreshufflePipeline_Selector () |
| template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType, typename BDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MScaleBlock, index_t NScaleBlock, index_t KScaleBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false> |
| constexpr auto | BlockGemmBlockMoeScaleBPreshufflePipeline_Selector () |
| template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType, typename AScaleDataType, typename BDataType, typename BScaleDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack> |
| constexpr auto | BlockGemmMXBPreshufflePipeline_Selector () |
| template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType, typename AScaleDataType, typename BDataType, typename BScaleDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false> |
| constexpr auto | BlockGemmMXNBSPipeline_Selector () |
| template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType, typename AScaleDataType, typename BDataType, typename BScaleDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false> |
| constexpr auto | BlockGemmMXPipeline_Selector () |
| template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType, typename AScaleDataType, typename BDataType, typename BScaleDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack> |
| constexpr auto | BlockGemmMXPipeline_Selector () |
| template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType, typename BDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool DirectLoad = false> |
| constexpr auto | BlockGemmPipeline_Selector () |
| template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename AK0MK1BlockDesc, typename BK0NK1BlockDesc, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, LoopScheduler LoopSched, typename ComputeTypeA = FloatA, typename ComputeTypeB = FloatB> |
| constexpr auto | BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector () |
| template<typename GridwiseGemm, typename FloatAB, typename FloatDsPointer, typename FloatE, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename ComputePtrOffsetOfBatch, typename Block2ETileMap, bool HasMainKBlockLoop> |
| __global__ void | kernel_contraction_multiple_d_xdl_cshuffle (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatDsPointer p_ds_grid, FloatE *__restrict__ p_e_grid, const index_t batch_count, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, const Block2ETileMap block_2_etile_map) |
| template<typename GridwiseGemm, typename BatchedGemmArg, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full> |
| __global__ void | kernel_batched_gemm_xdl_cshuffle_v3_multi_d (BatchedGemmArg karg) |
| template<typename GridwiseGemm, typename BatchedGemmArg, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full> |
| __global__ void | kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds (BatchedGemmArg karg) |
| template<typename GridwiseGemm, typename BatchedGemmArg, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full> |
| __global__ void | kernel_batched_gemm_b_scale_xdl_cshuffle_v3 (BatchedGemmArg karg) |
| template<typename GridwiseGemm, typename BatchedGemmArg, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full> |
| __global__ void | kernel_batched_gemm_b_scale_xdl_cshuffle_v3_2lds (BatchedGemmArg karg) |
| template<typename GridwiseGemm, typename AsPointer, typename BsPointer, typename DsPointer, typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename AsGridDesc_AK0_M_AK1, typename BsGridDesc_BK0_N_BK1, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename Block2ETileMap, bool HasMainKBlockLoop> |
| __global__ void | kernel_contraction_multiple_abd_xdl_cshuffle (AsPointer p_as_grid, BsPointer p_bs_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AsGridDesc_AK0_M_AK1 as_grid_desc_ak0_m_ak1, const BsGridDesc_BK0_N_BK1 bs_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const Block2ETileMap block_2_etile_map) |
| template<typename GridwiseGemm, typename FloatAB, typename FloatDsPointer, typename FloatE, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename Block2ETileMap, bool HasMainKBlockLoop> |
| __global__ void | kernel_contraction_multiple_d_xdl_cshuffle (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatDsPointer p_ds_grid, FloatE *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const Block2ETileMap block_2_etile_map) |
| template<typename GridwiseElementwiseReduction, typename InDataTypePointerTuple, typename XDataType, typename GammaDataType, typename BetaDataType, typename YDataType, typename AccDataType, typename XElementwiseOperation, typename YElementwiseOperation, typename InGrid2dDescTuple, typename GridDesc_M_K> |
| __global__ void | kernel_elementwise_layernorm (const InGrid2dDescTuple in_grid_2d_desc_tuple, const GridDesc_M_K x_grid_desc_m_k, const GridDesc_M_K gamma_grid_desc_m_k, const GridDesc_M_K beta_grid_desc_m_k, const GridDesc_M_K y_grid_desc_m_k, index_t num_k_block_tile_iteration, AccDataType epsilon, const InDataTypePointerTuple p_in_global_tuple, const GammaDataType *const __restrict__ p_gamma_global, const BetaDataType *const __restrict__ p_beta_global, YDataType *const __restrict__ p_y_global, const XElementwiseOperation x_elementwise_op, const YElementwiseOperation y_elementwise_op) |
| template<typename GridwiseGemm, typename ABDataType, typename DsPointer, typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename AGridDesc_K0_M0_M1_K1, typename BGridDesc_K0_N0_N1_K1, typename DsGridDesc_M0_M10_M11_N0_N10_N11, typename CGridDesc_M0_M10_M11_N0_N10_N11, typename Block2CTileMap, bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop> |
| __global__ void | kernel_gemm_dl_multiple_d (const ABDataType *__restrict__ p_a_grid, const ABDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1, const BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1, const DsGridDesc_M0_M10_M11_N0_N10_N11 ds_grid_desc_m0_m10_m11_n0_n10_n11, const CGridDesc_M0_M10_M11_N0_N10_N11 e_grid_desc_m0_m10_m11_n0_n10_n11, const Block2CTileMap block_2_ctile_map) |
| template<typename GridwiseGemm, typename EMeanVarDataType, bool HasMainKBlockLoop, InMemoryDataOperationEnum EGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full> |
| __global__ void | kernel_gemm_multiple_d_welford_first_half_wmma_cshuffle_v3 (typename GridwiseGemm::Argument karg, EMeanVarDataType *__restrict__ p_welford_mean_grid, EMeanVarDataType *__restrict__ p_welford_var_grid, int32_t *__restrict__ p_welford_count_grid) |
| template<typename GridwiseWelfordLayernorm, typename EMeanVarDataType, typename HDataType, typename GammaDataType, typename BetaDataType, typename ComputeDataType, typename EHGridDesc_M_N, typename LayernormMeanVarGridDesc_M_NBlock, typename LayernormCountGridDesc_M_NBlock, typename GammaBetaGridDesc_N, typename HElementwiseOperation> |
| __global__ void | kernel_welford_layernorm2d_second_half (const EMeanVarDataType *__restrict__ p_e_grid, const EMeanVarDataType *__restrict__ p_in_welford_mean_grid, const EMeanVarDataType *__restrict__ p_in_welford_var_grid, const int32_t *__restrict__ p_in_welford_count_grid, const GammaDataType *__restrict__ p_gamma_grid, const BetaDataType *__restrict__ p_beta_grid, HDataType *__restrict__ p_h_grid, const EHGridDesc_M_N e_grid_desc_m_n, const EHGridDesc_M_N h_grid_desc_m_n, const LayernormMeanVarGridDesc_M_NBlock mean_var_grid_desc_m_nblock, const LayernormCountGridDesc_M_NBlock count_grid_desc_m_nblock, const GammaBetaGridDesc_N gamma_grid_desc_n, const GammaBetaGridDesc_N beta_grid_desc_n, index_t numMeanVarCountBlockTileIteration_N, index_t NBlockClusterLength, ComputeDataType epsilon, HElementwiseOperation h_element_op) |
| template<typename GridwiseGemmWelford, typename ABDataType, typename DsPointer, typename EMeanVarDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename MeanVarGridDescriptor_MBlock_MPerBlock_NBlock, typename CountGridDescriptor_MBlock_MPerBlock_NBlock, typename Block2ETileMap, bool HasMainKBlockLoop> |
| __global__ void | kernel_gemm_multiple_d_welford_first_half_xdl_cshuffle (const ABDataType *__restrict__ p_a_grid, const ABDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EMeanVarDataType *__restrict__ p_e_grid, EMeanVarDataType *__restrict__ p_welford_mean_grid, EMeanVarDataType *__restrict__ p_welford_var_grid, int32_t *__restrict__ p_welford_count_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const MeanVarGridDescriptor_MBlock_MPerBlock_NBlock mean_var_grid_desc_mblock_mperblock_nblock, const CountGridDescriptor_MBlock_MPerBlock_NBlock count_grid_desc_mblock_mperblock_nblock, const Block2ETileMap block_2_etile_map, index_t NRaw) |
| template<typename GridwiseGemm, typename FloatAB, typename FloatDsPointer, typename FloatE, typename FloatRsPointer, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename RsGridDescriptor_MBlock_MPerBlock, typename Block2ETileMap, bool HasMainKBlockLoop> |
| __global__ void | kernel_gemm_multiple_d_multiple_r_xdl_cshuffle (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatDsPointer p_ds_grid, FloatE *__restrict__ p_e_grid, FloatRsPointer p_rs_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const QsElementwiseOperation qs_element_op, const RsElementwiseOperation rs_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const RsGridDescriptor_MBlock_MPerBlock rs_grid_desc_mblock_mperblock, const Block2ETileMap block_2_etile_map) |
| template<typename GridwiseGemm, typename ADataType, typename BDataType, typename DsPointer, typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename Block2ETileMap, bool HasMainKBlockLoop> |
| __global__ void | kernel_gemm_multiple_d_xdl_cshuffle (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const Block2ETileMap block_2_etile_map) |
| template<typename GridwiseGemm, typename ABDataType, typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename EElementwiseOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename Block2ETileMap, bool HasMainKBlockLoop> |
| __global__ void | kernel_gemm_xdl_waveletmodel_cshuffle (const ABDataType *__restrict__ p_a_grid, const ABDataType *__restrict__ p_b_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const EElementwiseOperation e_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const Block2ETileMap block_2_etile_map) |
| template<typename GridwiseGemm, typename ContractionMultiDKernelArg, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, bool HasMainKBlockLoop> |
| __global__ void | kernel_grouped_contraction_multiple_d_xdl_cshuffle (const void CK_CONSTANT_ADDRESS_SPACE *contraction_args, const index_t group_count, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op) |
| template<typename GridwiseWelford, typename XDataType, typename WorkspaceMeanVarDataType, typename ComputeDataType, typename XGridDesc_M_K, typename MeanVarGridDesc_M_KBlock> |
| __global__ void | kernel_normalizationSplitK1st (const XGridDesc_M_K x_grid_desc_m_k, const MeanVarGridDesc_M_KBlock mean_var_grid_desc_m_kblock, index_t num_k_block_tile_iteration, const XDataType *const __restrict__ p_x_global, WorkspaceMeanVarDataType *const __restrict__ p_welford_mean, WorkspaceMeanVarDataType *const __restrict__ p_welford_variance, int32_t *const __restrict__ p_welford_count) |
| template<typename GridwiseWelfordNormalization, typename WorkspaceMeanVarDataType, typename XDataType, typename GammaDataType, typename BetaDataType, typename YDataType, typename SaveMeanInvStdDataType, typename ComputeDataType, typename YElementwiseOperation, typename MeanVarGridDesc_M_KBlock, typename CountGridDesc_M_KBlock, typename XYGammaBetaGridDesc_M_K, typename SaveMeanInvStdGridDesc_M> |
| __global__ void | kernel_normalizationSplitK2nd (const MeanVarGridDesc_M_KBlock mean_var_grid_desc_m_kblock, const CountGridDesc_M_KBlock count_grid_desc_m_kblock, const XYGammaBetaGridDesc_M_K x_grid_desc_m_k, const XYGammaBetaGridDesc_M_K gamma_grid_desc_m_k, const XYGammaBetaGridDesc_M_K beta_grid_desc_m_k, const XYGammaBetaGridDesc_M_K y_grid_desc_m_k, const SaveMeanInvStdGridDesc_M save_mean_grid_desc_m, const SaveMeanInvStdGridDesc_M save_inv_std_grid_desc_m, index_t num_k_mean_var_count_iteration, index_t num_k_block_tile_iteration, index_t k_grid_size, ComputeDataType epsilon, const WorkspaceMeanVarDataType *const p_mean_global, const WorkspaceMeanVarDataType *const p_variance_global, const int32_t *const p_welford_count_global, const XDataType *const __restrict__ p_x_global, const GammaDataType *const __restrict__ p_gamma_global, const BetaDataType *const __restrict__ p_beta_global, YDataType *const __restrict__ p_y_global, SaveMeanInvStdDataType *const __restrict__ p_save_mean_global, SaveMeanInvStdDataType *const __restrict__ p_save_inv_std_global, const YElementwiseOperation y_elementwise_op) |
| template<typename GridwiseGemm, typename FloatAB, typename FloatDsPointer, typename FloatE, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename AGridDesc_AKB_AK0_M_AK1, typename BGridDesc_BKB_BK0_N_BK1, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename ComputePtrOffsetOfBatch, typename Block2ETileMap, bool HasMainKBlockLoop> |
| __global__ void | kernel_contraction_multiple_d_xdl_cshuffle (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatDsPointer p_ds_grid, FloatE *__restrict__ p_e_grid, const index_t batch_count, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AKB_AK0_M_AK1 a_grid_desc_akb_ak0_m_ak1, const BGridDesc_BKB_BK0_N_BK1 b_grid_desc_bkb_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, const Block2ETileMap block_2_etile_map) |
| __device__ half4_t | i4_to_half4 (int q) |
| __device__ half4_t | i4_to_half4_scale (int q, const ck::half2_t &scale) |
| __device__ f8x4_t | i4_to_f8x4 (int q) |
| __device__ f8x8_t | i4_to_fp8x8 (int q) |
| __device__ bhalf4_t | i4_to_bhalf4 (int q) |
| template<typename GridwiseMultiblockBatchNormForward_, typename XDataType, typename YDataType, typename AccDataType, typename ScaleDataType, typename BiasDataType, typename MeanVarDataType, typename YElementwiseOp, typename XYGridDesc_M_K, typename MeanVarCountGridDesc_M_G, typename MeanVarCountGridDesc_M_K, typename ScaleBiasGridDesc_M, typename MeanVarGridDesc_M, typename GetReduceCountPerThreadFunctor> |
| __global__ void | kernel_multiblock_batchnorm_forward (const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K y_grid_desc_m_k, const MeanVarCountGridDesc_M_G mean_var_count_grid_desc_m_g, const MeanVarCountGridDesc_M_K mean_var_count_grid_desc_m_k, const ScaleBiasGridDesc_M scale_grid_desc_m, const ScaleBiasGridDesc_M bias_grid_desc_m, const MeanVarGridDesc_M mean_var_grid_desc_m, const GetReduceCountPerThreadFunctor get_reduce_count_per_thread, index_t num_k_block_tile_iteration, AccDataType epsilon, const XDataType *const __restrict__ p_x, MeanVarDataType *const __restrict__ p_welford_mean, MeanVarDataType *const __restrict__ p_welford_variance, int32_t *const __restrict__ p_welford_count, int32_t *const __restrict__ p_control, const ScaleDataType *const __restrict__ p_scale, const BiasDataType *const __restrict__ p_bias, const YElementwiseOp y_elementwise_op, YDataType *const __restrict__ p_y, bool updateMovingAverage, AccDataType averageFactor, MeanVarDataType *const __restrict__ resultRunningMean, MeanVarDataType *const __restrict__ resultRunningVariance, bool saveMeanInvVariance, MeanVarDataType *const __restrict__ resultSaveMean, MeanVarDataType *const __restrict__ resultSaveInvVariance) |
| template<typename GridwiseReduceSecondHalfBatchNormBackwardFinal_, typename XDataType, typename DyDataType, typename DxDataType, typename ScaleDataType, typename DscaleDbiasDataType, typename MeanVarDataType, typename DyElementwiseOp, typename XYGridDesc_M_K, typename DscaleDbiasGridDesc_M_K, typename MeanVarGridDesc_M, typename ScaleBiasGridDesc_M> |
| __global__ void | kernel_reduce_second_half_batchnorm_backward_final (const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K dy_grid_desc_m_k, const XYGridDesc_M_K dx_grid_desc_m_k, const DscaleDbiasGridDesc_M_K dscale_dbias_grid_desc_m_k, const MeanVarGridDesc_M mean_var_grid_desc_m, const ScaleBiasGridDesc_M scale_grid_desc_m, const ScaleBiasGridDesc_M bias_grid_desc_m, index_t blkgroup_size, long_index_t reduce_size, index_t num_xy_k_block_tile_iteration, index_t num_dscale_dbias_k_block_tile_iteration, const DscaleDbiasDataType *const __restrict__ p_reduce_dscale, const DscaleDbiasDataType *const __restrict__ p_reduce_dbias, const MeanVarDataType *const __restrict__ p_mean, const MeanVarDataType *const __restrict__ p_inv_var, const XDataType *const __restrict__ p_x, const DyDataType *const __restrict__ p_dy, const ScaleDataType *const __restrict__ p_scale, const DyElementwiseOp dy_elementwise_op, DxDataType *const __restrict__ p_dx, DscaleDbiasDataType *const __restrict__ p_dscale, DscaleDbiasDataType *const __restrict__ p_dbias) |
| template<typename GridwiseMultiblockWelfordFirstHalf_, typename XDataType, typename MeanVarDataType, typename XGridDesc_M_K, typename MeanVarCountGridDesc_M_G, typename GetReduceCountPerThreadFunctor> |
| __global__ void | kernel_multiblock_welford_first_half (const XGridDesc_M_K x_grid_desc_m_k, const MeanVarCountGridDesc_M_G mean_var_count_grid_desc_m_g, const GetReduceCountPerThreadFunctor get_reduce_count_per_thread, index_t num_k_block_tile_iteration, const XDataType *const __restrict__ p_x, MeanVarDataType *const p_welford_mean, MeanVarDataType *const p_welford_variance, int32_t *const p_welford_count) |
| template<typename GridwiseWelfordSecondHalfBatchNormForwardFinal_, typename XDataType, typename YDataType, typename AccDataType, typename ScaleDataType, typename BiasDataType, typename MeanVarDataType, typename YElementwiseOp, typename XYGridDesc_M_K, typename MeanVarCountGridDesc_M_K, typename ScaleBiasGridDesc_M, typename MeanVarGridDesc_M> |
| __global__ void | kernel_welford_second_half_batchnorm_forward_final (const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K y_grid_desc_m_k, const MeanVarCountGridDesc_M_K mean_var_count_grid_desc_m_k, const ScaleBiasGridDesc_M scale_grid_desc_m, const ScaleBiasGridDesc_M bias_grid_desc_m, const MeanVarGridDesc_M mean_var_grid_desc_m, index_t blkgroup_size, index_t num_xy_k_block_tile_iteration, AccDataType epsilon, const MeanVarDataType *const __restrict__ p_in_welford_mean, const MeanVarDataType *const __restrict__ p_in_welford_variance, const int32_t *const __restrict__ p_in_welford_count, const XDataType *const __restrict__ p_x, const ScaleDataType *const __restrict__ p_scale, const BiasDataType *const __restrict__ p_bias, const YElementwiseOp y_elementwise_op, YDataType *const __restrict__ p_y, bool updateMovingAverage, AccDataType averageFactor, MeanVarDataType *const __restrict__ resultRunningMean, MeanVarDataType *const __restrict__ resultRunningVariance, bool saveMeanInvVariance, MeanVarDataType *const __restrict__ resultSaveMean, MeanVarDataType *const __restrict__ resultSaveInvVariance) |
| template<typename GridwiseWelfordSecondHalfReduceFirstHalf_, typename XDataType, typename DyDataType, typename AccDataType, typename ScaleDataType, typename DscaleDbiasDataType, typename MeanVarDataType, typename DyElementwiseOp, typename XYGridDesc_M_K, typename MeanVarGridDesc_M, typename MeanVarCountGridDesc_M_K, typename DscaleDbiasGridDesc_M_G> |
| __global__ void | kernel_welford_second_half_reduce_first_half (const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K dy_grid_desc_m_k, const MeanVarGridDesc_M mean_var_grid_desc_m, const MeanVarCountGridDesc_M_K mean_var_count_grid_desc_m_k, const DscaleDbiasGridDesc_M_G dscale_dbias_grid_desc_m_g, index_t blkgroup_size, index_t num_xy_k_block_tile_iteration, index_t num_mean_var_count_k_block_tile_iteration, AccDataType epsilon, bool haveSavedMeanInvVar, const MeanVarDataType *const __restrict__ p_savedMean, const MeanVarDataType *const __restrict__ p_savedInvVar, const MeanVarDataType *const __restrict__ p_in_welford_mean, const MeanVarDataType *const __restrict__ p_in_welford_variance, const int32_t *const __restrict__ p_in_welford_count, const DyElementwiseOp dy_elementwise_op, MeanVarDataType *const __restrict__ p_out_welford_mean, MeanVarDataType *const __restrict__ p_out_welford_inv_variance, const XDataType *const __restrict__ p_x, const DyDataType *const __restrict__ p_dy, DscaleDbiasDataType *const __restrict__ p_reduce_dscale, DscaleDbiasDataType *const __restrict__ p_reduce_dbias) |
| template<typename CTileIdx, typename CTileDim> |
| __host__ __device__ bool | DefaultValidCTileIndex (const CTileIdx &c_tile_idx, const CTileDim &c_tile_dim) |
| template<typename GridwiseMultipleReduction, index_t NumReduction, typename InDataType, typename OutDataTypePointerTuple, typename AccDataType, typename InGridDesc_M_K, typename OutGridDesc_M_Tuple, typename InElementwiseOperationTuple, typename AccElementwiseOperationTuple> |
| __global__ void | kernel_multiple_reduce_multiblock (const InGridDesc_M_K in_grid_desc_m_k, const OutGridDesc_M_Tuple out_grid_desc_m_tuple, const InElementwiseOperationTuple in_elementwise_op_tuple, const AccElementwiseOperationTuple acc_elementwise_op_tuple, index_t block_group_size, index_t num_k_block_tile_iteration, Array< AccDataType, NumReduction > alpha_values, const InDataType *const __restrict__ p_in_value_global, Array< AccDataType, NumReduction > beta_values, OutDataTypePointerTuple p_out_value_global_tuple) |
| template<typename GridwiseMultipleReduction, index_t NumReduction, typename InDataType, typename OutDataTypePointerTuple, typename AccDataType, typename InGridDesc_M_K, typename OutGridDesc_M_Tuple, typename InElementwiseOperationTuple, typename AccElementwiseOperationTuple> |
| __global__ void | kernel_multiple_reduce_threadwise (const InGridDesc_M_K in_grid_desc_m_k, const OutGridDesc_M_Tuple out_grid_desc_m_tuple, const InElementwiseOperationTuple in_elementwise_op_tuple, const AccElementwiseOperationTuple acc_elementwise_op_tuple, Array< AccDataType, NumReduction > alpha_values, const InDataType *const __restrict__ p_in_value_global, Array< AccDataType, NumReduction > beta_values, OutDataTypePointerTuple p_out_value_global_tuple) |
| template<typename GridwiseReduction, bool OutputIndex, bool HaveIndexInput, typename InDataType, typename OutDataType, typename AccDataType, typename IndexDataType, typename InGridDesc_M_K, typename OutGridDesc_M, typename InElementwiseOperation, typename AccElementwiseOperation> |
| __global__ void | kernel_reduce_multiblock (const InGridDesc_M_K in_grid_desc_m_k, const OutGridDesc_M out_grid_desc_m, const InElementwiseOperation in_elementwise_op, const AccElementwiseOperation acc_elementwise_op, index_t block_group_size, index_t num_k_block_tile_iteration, AccDataType alpha, const InDataType *const __restrict__ p_in_value_global, const IndexDataType *const __restrict__ p_in_index_global, AccDataType beta, OutDataType *const __restrict__ p_out_value_global, IndexDataType *const __restrict__ p_out_index_global) |
| template<typename GridwiseReduction, bool OutputIndex, bool TransformIndexKtoGlobal, bool HaveIndexInput, typename InDataType, typename OutDataType, typename AccDataType, typename IndexDataType, typename InGridDesc_M_K, typename OutGridDesc_M, typename InElementwiseOperation, typename AccElementwiseOperation> |
| __global__ void | kernel_reduce_threadwise (const InGridDesc_M_K in_grid_desc_m_k, const OutGridDesc_M out_grid_desc_m, const InElementwiseOperation in_elementwise_op, const AccElementwiseOperation acc_elementwise_op, AccDataType alpha, const InDataType *const __restrict__ p_in_value_global, const IndexDataType *const __restrict__ p_in_index_global, AccDataType beta, OutDataType *const __restrict__ p_out_value_global, IndexDataType *const __restrict__ p_out_index_global) |
| template<typename GridwiseReduction, typename InDataType, typename OutDataType, typename AccDataType, typename InGridDesc_M_K, typename DsGridDesc_M, typename OutGridDesc_M, typename InElementwiseOperation, typename OutElementwiseOperation, typename DsGridPointer> |
| __global__ void | kernel_reduce_threadwise_multi_d (const InGridDesc_M_K in_grid_desc_m_k, const DsGridDesc_M ds_grid_desc_m, const OutGridDesc_M out_grid_desc_m, const InElementwiseOperation in_elementwise_op, const OutElementwiseOperation out_elementwise_op, const InDataType *const __restrict__ p_in_value_global, const DsGridPointer p_ds_value_global, OutDataType *const __restrict__ p_out_value_global) |
| template<typename GridwiseBatchrNormBackwardWithBlockwiseWelford_, typename XDataType, typename DyDataType, typename DxDataType, typename AccDataType, typename ScaleDataType, typename DscaleDbiasDataType, typename MeanVarDataType, typename DyElementwiseOp, typename XYGridDesc_M_K, typename ScaleBiasGridDesc_M, typename MeanVarGridDesc_M, typename GetReduceCountPerThreadFunctor> |
| __global__ void | kernel_batchnorm_backward_with_blockwise_welford (const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K dy_grid_desc_m_k, const XYGridDesc_M_K dx_grid_desc_m_k, const ScaleBiasGridDesc_M scale_grid_desc_m, const ScaleBiasGridDesc_M dscale_dbias_grid_desc_m, const MeanVarGridDesc_M mean_var_grid_desc_m, const GetReduceCountPerThreadFunctor get_reduce_count_per_thread, long_index_t reduce_size, index_t num_k_block_tile_iteration, AccDataType epsilon, const XDataType *const __restrict__ p_x, const DyDataType *const __restrict__ p_dy, const ScaleDataType *const __restrict__ p_scale, bool haveSavedMeanInvVar, const MeanVarDataType *const __restrict__ p_savedMean, const MeanVarDataType *const __restrict__ p_savedInvVar, const DyElementwiseOp dy_elementwise_op, DxDataType *const __restrict__ p_dx, DscaleDbiasDataType *const __restrict__ p_dscale, DscaleDbiasDataType *const __restrict__ p_dbias) |
| template<typename GridwiseBatchrNormForwardWithBlockwiseWelford_, typename XDataType, typename YDataType, typename AccDataType, typename ScaleDataType, typename BiasDataType, typename MeanVarDataType, typename YElementwiseOp, typename XYGridDesc_M_K, typename ScaleBiasGridDesc_M, typename MeanVarGridDesc_M, typename GetReduceCountPerThreadFunctor> |
| __global__ void | kernel_batchnorm_forward_with_blockwise_welford (const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K y_grid_desc_m_k, const ScaleBiasGridDesc_M scale_grid_desc_m, const ScaleBiasGridDesc_M bias_grid_desc_m, const MeanVarGridDesc_M mean_var_grid_desc_m, const GetReduceCountPerThreadFunctor get_reduce_count_per_thread, index_t num_k_block_tile_iteration, AccDataType epsilon, const XDataType *const __restrict__ p_x, const ScaleDataType *const __restrict__ p_scale, const BiasDataType *const __restrict__ p_bias, const YElementwiseOp y_elementwise_op, YDataType *const __restrict__ p_y, bool updateMovingAverage, AccDataType averageFactor, MeanVarDataType *const __restrict__ resultRunningMean, MeanVarDataType *const __restrict__ resultRunningVariance, bool saveMeanInvVariance, MeanVarDataType *const __restrict__ resultSaveMean, MeanVarDataType *const __restrict__ resultSaveInvVariance) |
| template<typename GridwiseElementwise1dFunctor, typename InGrid1dDescTuple, typename OutGrid1dDescTuple, typename InDataTypePointerTuple, typename OutDataTypePointerTuple, typename ElementwiseOperation, typename UnaryOperation, typename Scale> |
| __global__ void | kernel_elementwise_1d (const InGrid1dDescTuple in_grid_1d_desc_tuple, const OutGrid1dDescTuple out_grid_1d_desc_tuple, const InDataTypePointerTuple p_in_global_tuple, const OutDataTypePointerTuple p_out_global_tuple, const ElementwiseOperation elementwise_op, const UnaryOperation unary_op, const Scale scale_op) |
| template<typename GridwiseElementwiseFunctor, typename InGridDescTuple, typename OutGridDescTuple, typename InDataTypePointerTuple, typename OutDataTypePointerTuple, typename Block2TileMap, typename ElementwiseOperation> |
| __global__ void | kernel_elementwise (const InGridDescTuple in_grid_desc_tuple, const OutGridDescTuple out_grid_desc_tuple, const InDataTypePointerTuple p_in_global_tuple, const OutDataTypePointerTuple p_out_global_tuple, const Block2TileMap block_2_tile_map, const ElementwiseOperation elementwise_op) |
| template<typename GridwiseElementwiseFunctorA, typename GridwiseElementwiseFunctorB, typename InAGridDescTuple, typename InBGridDescTuple, typename OutAGridDescTuple, typename OutBGridDescTuple, typename InADataTypePointerTuple, typename InBDataTypePointerTuple, typename OutADataTypePointerTuple, typename OutBDataTypePointerTuple, typename Block2TileMapA, typename Block2TileMapB, typename ElementwiseOperation> |
| __global__ void | kernel_elementwise_dual (const InAGridDescTuple in_grid_desc_tuple_a, const InBGridDescTuple in_grid_desc_tuple_b, const OutAGridDescTuple out_grid_desc_tuple_a, const OutBGridDescTuple out_grid_desc_tuple_b, const InADataTypePointerTuple p_in_global_tuple_a, const InBDataTypePointerTuple p_in_global_tuple_b, const OutADataTypePointerTuple p_out_global_tuple_a, const OutBDataTypePointerTuple p_out_global_tuple_b, const Block2TileMapA block_2_tile_map_a, const Block2TileMapB block_2_tile_map_b, const ElementwiseOperation elementwise_op, const index_t a_grid_size) |
| template<typename GridwiseElementwiseFunctorA, typename GridwiseElementwiseFunctorB, typename InAGridDescTuple, typename InBGridDescTuple, typename OutAGridDescTuple, typename OutBGridDescTuple, typename InADataTypePointerTuple, typename InBDataTypePointerTuple, typename OutADataTypePointerTuple, typename OutBDataTypePointerTuple, typename Block2TileMapA, typename Block2TileMapB, typename ElementwiseOperation, index_t NumInputsA, index_t NumInputsB, index_t NumOutputsA, index_t NumOutputsB> |
| __global__ void | kernel_elementwise_batched_dual (const InAGridDescTuple in_grid_desc_tuple_a, const InBGridDescTuple in_grid_desc_tuple_b, const OutAGridDescTuple out_grid_desc_tuple_a, const OutBGridDescTuple out_grid_desc_tuple_b, const InADataTypePointerTuple p_in_global_tuple_a, const InBDataTypePointerTuple p_in_global_tuple_b, const OutADataTypePointerTuple p_out_global_tuple_a, const OutBDataTypePointerTuple p_out_global_tuple_b, const Block2TileMapA block_2_tile_map_a, const Block2TileMapB block_2_tile_map_b, const ElementwiseOperation elementwise_op, const index_t a_grid_size, const index_t batch_count_a, const index_t batch_count_b, const std::array< index_t, NumInputsA > input_batch_strides_a, const std::array< index_t, NumInputsB > input_batch_strides_b, const std::array< index_t, NumOutputsA > output_batch_strides_a, const std::array< index_t, NumOutputsB > output_batch_strides_b) |
| template<typename GridwiseElementwiseFunctor, typename InGridDescTuple, typename OutGridDescTuple, typename InDataTypePointerTuple, typename OutDataTypePointerTuple, typename Block2TileMap, typename ElementwiseOperation, index_t NumInputs, index_t NumOutputs> |
| __global__ void | kernel_batched_elementwise (const InGridDescTuple in_grid_desc_tuple, const OutGridDescTuple out_grid_desc_tuple, const InDataTypePointerTuple p_in_global_tuple, const OutDataTypePointerTuple p_out_global_tuple, const Block2TileMap block_2_tile_map, const ElementwiseOperation elementwise_op, const index_t batch_count, const std::array< index_t, NumInputs > input_batch_strides, const std::array< index_t, NumOutputs > output_batch_strides) |
| template<typename GridwiseGemm, typename ADataType, typename BDataType, typename ScaleDataType, typename CDataType, typename AGridDesc, typename BGridDesc, typename ScaleGridDesc, typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, typename Block2CTileMap, bool HasMainKBlockLoop> |
| __global__ void | kernel_fpAintB_gemm_wmma (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, const ScaleDataType *__restrict__ p_scale_grid, CDataType *__restrict__ p_c_grid, const AGridDesc a_grid_desc, const BGridDesc b_grid_desc, const ScaleGridDesc scale_grid_desc, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map) |
| template<typename GridwiseGemm, typename FloatAB, typename FloatC, typename FloatC0, typename FloatC1, typename ReducePtrsGlobal, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, typename C1ElementwiseOperation, typename ReduceInElementwiseOperations, typename ReduceAccElementwiseOperations, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename ReduceGridDescriptor_MBlock_MPerBlock, typename Block2CTileMap, bool HasMainKBlockLoop> |
| __global__ void | kernel_gemm_bias_add_reduce_xdl_cshuffle_v1 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const FloatC0 *__restrict__ p_bias_grid, const FloatC1 *__restrict__ p_d0_grid, ReducePtrsGlobal p_reduces_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const C1ElementwiseOperation c1_element_op, const ReduceInElementwiseOperations reduce_in_element_ops, const ReduceAccElementwiseOperations reduce_out_element_ops, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c0_grid_desc_mblock_mperblock_nblock_nperblock, const C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c1_grid_desc_mblock_mperblock_nblock_nperblock, const ReduceGridDescriptor_MBlock_MPerBlock reduce_grid_desc_mblock_mperblock, const Block2CTileMap block_2_ctile_map) |
| template<typename GridwiseGemm, typename FloatAB, typename FloatC, typename AGridDesc_K0_M0_M1_K1, typename BGridDesc_K0_N0_N1_K1, typename CGridDesc_M0_M10_M11_N0_N10_N11, typename Block2CTileMap, bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop> |
| __global__ void | kernel_gemm_dl_v1r3 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1, const BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1, const CGridDesc_M0_M10_M11_N0_N10_N11 c_grid_desc_m0_m10_m11_n0_n10_n11, const Block2CTileMap block_2_ctile_map) |
| template<typename GridwiseGemm, bool HasMainKBlockLoop> |
| __global__ void | kernel_gemm_dpp (const typename GridwiseGemm::Argument karg) |
| template<typename GridwiseOp, typename ADataType, typename BDataType, typename DsPointer, typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, typename Block2CTileMap, typename ComputePtrOffsetOfBatch, bool HasMainKBlockLoop> |
| __global__ void | kernel_grouped_conv_multiple_d_wmma_cshuffle (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const index_t batch_count, const AGridDesc_AK0_M_AK1 a_grid_desc, const BGridDesc_BK0_N_BK1 b_grid_desc, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock_, const Block2CTileMap block_2_ctile_map, const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch) |
| template<typename GridwiseOp, typename ADataType, typename BDataType, typename DsPointer, typename EDataType, typename AGridDesc, typename BGridDesc, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename ComputePtrOffsetOfBatch, typename Block2CTileMap, bool HasMainKBlockLoop> |
| __global__ void | kernel_contraction_multiple_d_wmma_cshuffle (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const index_t batch_count, const AGridDesc a_grid_desc, const BGridDesc b_grid_desc, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, const Block2CTileMap block_2_etile_map) |
| template<typename GridwiseOp, typename ADataType, typename BDataType, typename DsPointer, typename EDataType, typename AGridDesc, typename BGridDesc, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename Block2CTileMap, bool HasMainKBlockLoop> |
| __global__ void | kernel_gemm_mupltipe_d_wmma_cshuffle (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AGridDesc a_grid_desc, const BGridDesc b_grid_desc, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const Block2CTileMap block_2_ctile_map) |
| template<typename GridwiseGemm, typename ADataType, typename BDataType, typename DsPointer, typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename Block2ETileMap, bool HasMainKBlockLoop> |
| __global__ void | kernel_gemm_multiple_d_xdl_cshuffle_lds_direct_load (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const Block2ETileMap block_2_etile_map) |
| template<PipelineVersion PipelineVer, index_t NumPrefetch = 1, LoopScheduler LoopSched = LoopScheduler::Default, bool AEnableLds = true, bool BEnableLds = true> |
| constexpr auto | GridwiseGemmPipeline_Selector () |
| template<index_t NumPrefetch, LoopScheduler LoopSched> |
| constexpr auto | GridwiseGemmPipeline_v1_Selector () |
| template<typename GridwiseGemm, typename FloatAB, typename FloatC, typename ReducePtrsGlobal, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, typename ReduceInElementwiseOperations, typename ReduceAccElementwiseOperations, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename ReduceGridDescriptor_MBlock_MPerBlock, typename Block2CTileMap, bool HasMainKBlockLoop> |
| __global__ void | kernel_gemm_reduce_xdl_cshuffle_v1 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, ReducePtrsGlobal p_reduces_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const ReduceInElementwiseOperations reduce_in_element_ops, const ReduceAccElementwiseOperations reduce_out_element_ops, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const ReduceGridDescriptor_MBlock_MPerBlock reduce_grid_desc_mblock_mperblock, const Block2CTileMap block_2_ctile_map) |
| template<typename GridwiseGemm, typename ADataType, typename BDataType, typename CDataType, typename AGridDesc, typename BGridDesc, typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, typename Block2CTileMap, bool HasMainKBlockLoop> |
| __global__ void | kernel_gemm_wmma (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, CDataType *__restrict__ p_c_grid, const AGridDesc a_grid_desc, const BGridDesc b_grid_desc, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map) |
| template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum EGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full> |
| __global__ void | kernel_gemm_wmma_cshuffle_v3 (typename GridwiseGemm::Argument karg) |
| template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full> |
| __global__ void | kernel_gemm_xdl_cshuffle_v3 (typename GridwiseGemm::Argument karg) |
| template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full> |
| __global__ void | kernel_gemm_xdl_cshuffle_v3_2lds (typename GridwiseGemm::Argument karg) |
| template<typename GridwiseGemm, bool HasMainKBlockLoop> |
| __global__ void | kernel_gemm_xdl_cshuffle_v1 (typename GridwiseGemm::Argument karg) |
| template<typename GridwiseGemm, typename FloatA, typename FloatB, typename FloatC, bool HasMainKBlockLoop> |
| __global__ void | kernel_gemm_xdl_cshuffle_v1 (const FloatA *__restrict__ p_a_grid, const FloatB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, typename GridwiseGemm::Problem problem) |
| template<typename GridwiseGemm, bool HasMainKBlockLoop, index_t TailNum = 3> |
| __global__ void | kernel_gemm_xdl_cshuffle_v2 (typename GridwiseGemm::Argument karg) |
| template<typename GridwiseGemm, typename FloatA, typename FloatB, typename FloatC, bool HasMainKBlockLoop> |
| __global__ void | kernel_gemm_xdl_cshuffle_v2 (const FloatA *p_a_grid, const FloatB *p_b_grid, FloatC *p_c_grid, typename GridwiseGemm::Problem problem) |
| template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even> |
| __global__ void | kernel_gemm_xdl_cshuffle_v3_b_preshuffle (typename GridwiseGemm::Argument karg) |
| template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even> |
| __global__ void | kernel_gemm_xdl_cshuffle_v3_b_preshuffle_2lds (typename GridwiseGemm::Argument karg) |
| template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full> |
| __global__ void | kernel_gemm_xdl_cshuffle_v3_multi_d (typename GridwiseGemm::Argument karg) |
| template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full> |
| __global__ void | kernel_gemm_xdl_cshuffle_v3_multi_d_2lds (typename GridwiseGemm::Argument karg) |
| template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even> |
| __global__ void | kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle (typename GridwiseGemm::Argument karg) |
| template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even> |
| __global__ void | kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle_2lds (typename GridwiseGemm::Argument karg) |
| template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even> |
| __global__ void | kernel_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle (typename GridwiseGemm::Argument karg) |
| template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even> |
| __global__ void | kernel_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle_2lds (typename GridwiseGemm::Argument karg) |
| template<bool Use2LDS, typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full> |
| __global__ enable_if_t<!Use2LDS, void > | kernel_gemm_xdl_cshuffle_v3_mx (typename GridwiseGemm::Argument karg) |
| template<bool Use2LDS, typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full> |
| __global__ enable_if_t< Use2LDS, void > | kernel_gemm_xdl_cshuffle_v3_mx (typename GridwiseGemm::Argument karg) |
| template<typename GridwiseGemm, typename FloatAB, typename FloatC, typename FloatC0, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename C0GridDescriptor_NBlock_NPerBlock, typename Block2CTileMap, bool HasMainKBlockLoop> |
| __global__ void | kernel_gemm_layernorm_xdl_cshuffle_v1 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const FloatC0 *__restrict__ p_c0_bias_grid, const FloatC0 *__restrict__ p_c0_add_grid, const FloatC0 *__restrict__ p_c0_gamma_grid, const FloatC0 *__restrict__ p_c0_beta_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const AccElementwiseOperation acc_element_op, const CElementwiseOperation c_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const C0GridDescriptor_NBlock_NPerBlock c0_grid_desc_nblock_nperblock, const Block2CTileMap block_2_ctile_map) |
| template<typename LowLengths> |
| __host__ __device__ constexpr auto | make_merge_transform_v4_no_carry (const LowLengths &low_lengths) |
| template<typename GridwiseGemm, typename FloatA, typename FloatB, typename FloatC, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, typename CBlockClusterAdaptor, bool HasMainKBlockLoop> |
| __global__ void | kernel_gemm_xdlops_bwd_weight (const FloatA *__restrict__ p_a_grid, const FloatB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const AGridDesc_B_K0_M_K1 a_b_k0_m_k1_grid_desc, const BGridDesc_B_K0_N_K1 b_b_k0_n_k1_grid_desc, const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const CBlockClusterAdaptor c_block_cluster_adaptor) |
| template<typename GridwiseGemm, typename FloatAB, typename FloatC, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, typename Block2CTileMap, bool HasMainK0BlockLoop> |
| __global__ void | kernel_gemm_xdlops_skip_b_lds_v1 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1, const CGridDesc_M_N c_grid_desc_m_n, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map) |
| template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename Block2CTileMap, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation> |
| __global__ void | kernel_gemm_xdlops_splitk_lds_direct_load (typename GridwiseGemm::Argument karg, const Block2CTileMap &b2c_map, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op) |
| template<typename GridwiseGemm> |
| __global__ void | kernel_gemm_xdlops_streamk (const typename GridwiseGemm::FloatAB *p_a_grid, const typename GridwiseGemm::FloatAB *p_b_grid, typename GridwiseGemm::FloatC *p_c_grid, void *p_workspace, index_t M, index_t N, index_t K, index_t StrideA, index_t StrideB, index_t StrideC, typename GridwiseGemm::Block2CTileMap block_mapping) |
| template<typename GridwiseGemm, typename FloatAB, typename FloatC, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N, bool HasMainKBlockLoop> |
| __global__ void | kernel_gemm_xdlops_v2r3 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1, const CGridDesc_M_N c_grid_desc_m_n) |
| template<typename GridwiseGemm, bool HasMainKBlockLoop> |
| __global__ void | kernel_gemm_xdlops_v2r3 (const typename GridwiseGemm::Argument karg) |
| template<typename GridwiseGemm, typename FloatAB, typename FloatC, typename ABK0MK1GridDesc, typename BBK0NK1GridDesc, typename CM0N0M1N1M2M3M4N2GridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, typename CBlockClusterAdaptor, bool HasMainKBlockLoop> |
| __global__ void | kernel_gemm_xdlops_v2r4 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const ABK0MK1GridDesc a_b_k0_m_k1_grid_desc, const BBK0NK1GridDesc b_b_k0_n_k1_grid_desc, const CM0N0M1N1M2M3M4N2GridDesc c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const CBlockClusterAdaptor c_block_cluster_adaptor) |
| template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename Block2CTileMap, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation> |
| __global__ void | kernel_gemm_xdlops_v2r4r2_simplified (typename GridwiseGemm::Argument karg, const Block2CTileMap &b2c_map, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op) |
| template<typename GridwiseGemm, typename FloatAB, typename FloatC, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, typename Block2CTileMap, bool HasMainK0BlockLoop> |
| __global__ void | kernel_gemm_xdlops_v3r1 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map) |
| template<typename GridwiseGemm, typename FloatAB, typename FloatC, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl, typename C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, typename Block2CTileMap, bool HasMainKBlockLoop> |
| __global__ void | kernel_gemm_xdlops_v3r2 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const FloatC *__restrict__ p_c0_grid, const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1, const CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, const C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map) |
| template<typename GridwiseGemm, typename FloatAB, typename FloatC, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl, typename C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl, typename C1GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, typename Block2CTileMap, bool HasMainKBlockLoop> |
| __global__ void | kernel_gemm_xdlops_v3r3 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const FloatC *__restrict__ p_c0_grid, const FloatC *__restrict__ p_c1_grid, const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1, const CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, const C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, const C1GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map) |
| template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even> |
| __global__ void | kernel_moe_gemm (typename GridwiseGemm::Argument karg) |
| template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even> |
| __global__ void | kernel_moe_gemm_2lds (typename GridwiseGemm::Argument karg) |
| template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even> |
| __global__ void | kernel_moe_mxgemm_2lds (typename GridwiseGemm::Argument karg) |
| template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even> |
| __global__ void | kernel_moe_mxgemm (typename GridwiseGemm::Argument karg) |
| template<typename GridwisePermute, typename InGridDesc, typename OutGridDesc, typename InDataType, typename OutDataType, typename ElementwiseOperation, typename Block2TileMap> |
| __global__ void | kernel_nd_permute (const InGridDesc in_grid_desc, const OutGridDesc out_grid_desc, const InDataType *p_in_global, OutDataType *p_out_global, const ElementwiseOperation elementwise_op, const Block2TileMap block_2_tile_map) |
| template<typename GridwisePutElementwise1dFunctor, typename InGrid1dDesc, typename InDataType, typename IndexDataType, typename OutDataType, typename ElementwiseOperation> |
| __global__ void | kernel_put_element_1d (const InGrid1dDesc in_grid_1d_desc, const InDataType *__restrict__ p_in_global, const IndexDataType *__restrict__ p_indices_global, OutDataType *__restrict__ p_out_global, const ElementwiseOperation elementwise_op) |
| template<index_t BlockSize, typename DataType, typename Grid1dBufferDescType> |
| __global__ void | kernel_buffer_set_value (const Grid1dBufferDescType grid_1d_buffer_desc, DataType *const __restrict__ p_global, DataType value) |
| template<typename Grid1dBufferDescTuple, index_t NumBuffer, index_t BlockSize, typename DataTypePointerTuple, typename DataTypeTuple> |
| __global__ void | kernel_multiple_buffer_set_value (const Grid1dBufferDescTuple grid_1d_buffer_desc_tuple, DataTypePointerTuple p_global_tuple, DataTypeTuple value_tuple) |
| template<typename GridwiseReduction, typename InDataType, typename OutDataType, typename AccDataType, typename GridDesc_M_K> |
| __global__ void | kernel_softmax (const GridDesc_M_K in_grid_desc_m_k, const GridDesc_M_K out_grid_desc_m_k, index_t block_group_size, index_t num_k_block_tile_iteration, AccDataType alpha, const InDataType *const __restrict__ p_in_value_global, AccDataType beta, OutDataType *const __restrict__ p_out_value_global) |
| template<typename GridwiseSparseEmbedding, typename EmbType, typename IndexType, typename GammaDataType, typename BetaDataType, typename AccDataType, typename OutType, typename OutGridDesc, typename EmbElementwiseOperation, ck::index_t NumEmbeddings> |
| __global__ void | kernel_sparse_embeddings_forward_layernorm (OutType *p_out, const ck::Array< EmbType *, NumEmbeddings > p_embs, const ck::Array< IndexType *, NumEmbeddings > p_indexes, const GammaDataType *p_gamma, const BetaDataType *p_beta, const OutGridDesc out_grid_desc, const AccDataType epsilon, const EmbElementwiseOperation emb_elementwise_op) |
| template<typename InputGridDesc, typename InputDataType, typename OutputGridDesc, typename OutputDataType, typename Block2ETileMap, typename ComputePtrOffsetOfStridedBatch, typename GridwiseTensorRearrangeKernel> |
| __global__ void | kernel_tensor_rearrange (const InputGridDesc in_grid_desc, const InputDataType *__restrict__ p_in_global, const OutputGridDesc out_grid_desc, OutputDataType *__restrict__ p_out_global, const index_t batch_count, const Block2ETileMap block_2_tile_map, const ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch) |
| template<typename GridwiseReduction, typename XDataType, typename GammaDataType, typename BetaDataType, typename YDataType, typename SaveMeanInvStdDataType, typename ComputeDataType, typename YElementwiseOperation, typename GridDesc_M_K, typename GridDesc_M> |
| __global__ void | kernel_normalization (const GridDesc_M_K x_grid_desc_m_k, const GridDesc_M_K gamma_grid_desc_m_k, const GridDesc_M_K beta_grid_desc_m_k, const GridDesc_M_K y_grid_desc_m_k, const GridDesc_M save_mean_grid_desc_m, const GridDesc_M save_inv_std_grid_desc_m, index_t num_k_block_tile_iteration, ComputeDataType epsilon, const XDataType *const __restrict__ p_x_global, const GammaDataType *const __restrict__ p_gamma_global, const BetaDataType *const __restrict__ p_beta_global, YDataType *const __restrict__ p_y_global, SaveMeanInvStdDataType *const __restrict__ p_save_mean_global, SaveMeanInvStdDataType *const __restrict__ p_save_inv_std_global, const YElementwiseOperation y_elementwise_op) |
| template<typename XDataType, typename GammaDataType, typename BetaDataType, typename YDataType, typename SaveMeanInvStdDataType, typename ComputeDataType, typename YElementwiseOperation, typename GridDesc_M_K, typename GridDesc_M, index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t XSrcVectorDim, index_t XSrcVectorSize, index_t GammaSrcVectorDim, index_t GammaSrcVectorSize, index_t BetaSrcVectorDim, index_t BetaSrcVectorSize, index_t YDstVectorDim, index_t YDstVectorSize, index_t SaveMeanInvStdDstVectorSize, bool UseWelford> |
| auto | NormalizationKernelSelector (bool isSweepOnce) |
| template<typename T> |
| __device__ T * | cast_pointer_to_generic_address_space (T CK_CONSTANT_ADDRESS_SPACE *p) |
| template<typename T> |
| __host__ __device__ T CK_CONSTANT_ADDRESS_SPACE * | cast_pointer_to_constant_address_space (T *p) |
| template<typename T> |
| __device__ int32x4_t | make_wave_buffer_resource (T *p_wave, index_t element_space_size) |
| template<typename T> |
| __device__ int32x4_t | make_wave_buffer_resource_with_default_range (T *p_wave) |
| __device__ int8_t | llvm_amdgcn_raw_buffer_load_i8 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8") |
| __device__ int8x2_t | llvm_amdgcn_raw_buffer_load_i8x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8") |
| __device__ int8x4_t | llvm_amdgcn_raw_buffer_load_i8x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8") |
| __device__ bhalf_t | llvm_amdgcn_raw_buffer_load_i16 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16") |
| __device__ bhalf2_t | llvm_amdgcn_raw_buffer_load_i16x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16") |
| __device__ bhalf4_t | llvm_amdgcn_raw_buffer_load_i16x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16") |
| __device__ int32_t | llvm_amdgcn_raw_buffer_load_i32 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32") |
| __device__ int32x2_t | llvm_amdgcn_raw_buffer_load_i32x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32") |
| __device__ int32x4_t | llvm_amdgcn_raw_buffer_load_i32x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32") |
| __device__ half_t | llvm_amdgcn_raw_buffer_load_fp16 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16") |
| __device__ half2_t | llvm_amdgcn_raw_buffer_load_fp16x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16") |
| __device__ half4_t | llvm_amdgcn_raw_buffer_load_fp16x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16") |
| __device__ float | llvm_amdgcn_raw_buffer_load_fp32 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32") |
| __device__ float2_t | llvm_amdgcn_raw_buffer_load_fp32x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32") |
| __device__ float4_t | llvm_amdgcn_raw_buffer_load_fp32x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32") |
| __device__ void | llvm_amdgcn_raw_buffer_store_i8 (int8_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8") |
| __device__ void | llvm_amdgcn_raw_buffer_store_i8x2 (int8x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8") |
| __device__ void | llvm_amdgcn_raw_buffer_store_i8x4 (int8x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8") |
| __device__ void | llvm_amdgcn_raw_buffer_store_i16 (bhalf_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16") |
| __device__ void | llvm_amdgcn_raw_buffer_store_i16x2 (bhalf2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16") |
| __device__ void | llvm_amdgcn_raw_buffer_store_i16x4 (bhalf4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16") |
| __device__ void | llvm_amdgcn_raw_buffer_store_i32 (int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32") |
| __device__ void | llvm_amdgcn_raw_buffer_store_i32x2 (int32x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32") |
| __device__ void | llvm_amdgcn_raw_buffer_store_i32x4 (int32x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32") |
| __device__ void | llvm_amdgcn_raw_buffer_store_fp16 (half_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16") |
| __device__ void | llvm_amdgcn_raw_buffer_store_fp16x2 (half2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16") |
| __device__ void | llvm_amdgcn_raw_buffer_store_fp16x4 (half4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16") |
| __device__ void | llvm_amdgcn_raw_buffer_store_fp32 (float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32") |
| __device__ void | llvm_amdgcn_raw_buffer_store_fp32x2 (float2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32") |
| __device__ void | llvm_amdgcn_raw_buffer_store_fp32x4 (float4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32") |
| __device__ half2_t | llvm_amdgcn_raw_buffer_atomic_add_fp16x2 (half2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16") |
| __device__ int32_t | llvm_amdgcn_raw_buffer_atomic_add_i32 (int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32") |
| __device__ float | llvm_amdgcn_raw_buffer_atomic_add_fp32 (float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32") |
| __device__ double | llvm_amdgcn_raw_buffer_atomic_max_fp64 (double vdata, int32x4_t rsrc, int voffset, int soffset, int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64") |
| template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence> |
| __device__ vector_type< int8_t, N >::type | amd_buffer_load_impl_raw (int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset) |
| template<typename T, index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence> |
| __device__ vector_type< T, N >::type | amd_buffer_load_impl (int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset) |
| template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence> |
| __device__ void | amd_buffer_store_impl_raw (const typename vector_type< int8_t, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset) |
| template<typename T, index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence> |
| __device__ void | amd_buffer_store_impl (const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset) |
| template<typename T, index_t N> |
| __device__ void | amd_global_atomic_add_impl (const typename vector_type< T, N >::type src_thread_data, T *addr) |
| template<typename T, index_t N> |
| __device__ void | amd_buffer_atomic_add_impl (const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset) |
| template<typename T, index_t N> |
| __device__ void | amd_buffer_atomic_max_impl (const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset) |
| template<typename T, index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence> |
| __device__ vector_type_maker< T, N >::type::type | amd_buffer_load_invalid_element_return_zero (const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size) |
| template<typename T, index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence> |
| __device__ vector_type_maker< T, N >::type::type | amd_buffer_load_invalid_element_return_customized_value (const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size, T customized_value) |
| template<typename T, index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence> |
| __device__ void | amd_buffer_store (const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size) |
| template<typename T, index_t N> |
| __device__ void | amd_buffer_atomic_add (const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size) |
| template<typename T, index_t N> |
| __device__ void | amd_buffer_atomic_max (const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size) |
| __device__ void | llvm_amdgcn_raw_buffer_load_lds (int32x4_t rsrc, uint32_t *lds_ptr, index_t size, index_t voffset, index_t soffset, index_t offset, index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds") |
| template<typename T, index_t NumElemsPerThread> |
| __device__ void | amd_direct_load_global_to_lds (const T *global_base_ptr, const index_t global_offset, T *lds_base_ptr, const index_t lds_offset, const bool is_valid, const index_t src_element_space_size) |
| template<typename T> |
| __device__ __amdgpu_buffer_rsrc_t | make_wave_buffer_resource_new (T *p_wave, index_t element_space_size) |
| template<typename T> |
| __device__ __amdgpu_buffer_rsrc_t | make_wave_buffer_resource_with_default_range_new (T *p_wave) |
| template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence> |
| __device__ vector_type< int8_t, N >::type | amd_buffer_load_impl_raw (__amdgpu_buffer_rsrc_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset) |
| template<typename T, index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence> |
| __device__ vector_type< T, N >::type | amd_buffer_load_impl (__amdgpu_buffer_rsrc_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset) |
| template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence> |
| __device__ void | amd_buffer_store_impl_raw (const typename vector_type< int8_t, N >::type src_thread_data, __amdgpu_buffer_rsrc_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset) |
| template<typename T, index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence> |
| __device__ void | amd_buffer_store_impl (const typename vector_type< T, N >::type src_thread_data, __amdgpu_buffer_rsrc_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset) |
| template<> |
| __host__ __device__ constexpr bool | fp8_is_nan (f8_ocp_t a) |
| template<> |
| __host__ __device__ constexpr bool | fp8_is_nan (bf8_ocp_t a) |
| template<> |
| __host__ __device__ constexpr bool | fp8_is_nan (f8_fnuz_t a) |
| template<> |
| __host__ __device__ constexpr bool | fp8_is_nan (bf8_fnuz_t a) |
| template<> |
| __host__ __device__ constexpr bool | fp8_is_inf (bf8_ocp_t a) |
| __device__ int | amd_assembly_and_b32 (int a, int b) |
| __device__ int | amd_assembly_and_or_b32 (int a, int b, int d) |
| __device__ half2_t | amd_assembly_pk_fma_f16 (half2_t a, half2_t b, half2_t c) |
| __device__ half2_t | amd_assembly_pk_add_f16 (half2_t a, half2_t b) |
| __device__ float | amd_assemble_cvt_f32_i4 (int b) |
| __device__ f8x4_t | amd_assembly_cvt_f8_to_f32 (float b0, float b1, float b2, float b3) |
| __device__ f8x8_t | amd_assembly_i4_to_fp8x8 (int a) |
| __device__ void | amd_assembly_outer_product_1x2 (float a, float b0, float b1, float &c0, float &c1) |
| __device__ void | amd_assembly_outer_product_1x4 (float a, float b0, float b1, float b2, float b3, float &c0, float &c1, float &c2, float &c3) |
| __device__ void | amd_assembly_outer_product_1x2 (half2_t a, half2_t b0, half2_t b1, float &c0, float &c1) |
| __device__ void | amd_assembly_outer_product_1x4 (half2_t a, half2_t b0, half2_t b1, half2_t b2, half2_t b3, float &c0, float &c1, float &c2, float &c3) |
| __device__ void | amd_assembly_outer_product_1x2 (int8x4_t a, int8x4_t b0, int8x4_t b1, int32_t &c0, int32_t &c1) |
| __device__ void | amd_assembly_outer_product_1x4 (int8x4_t a, int8x4_t b0, int8x4_t b1, int8x4_t b2, int8x4_t b3, int32_t &c0, int32_t &c1, int32_t &c2, int32_t &c3) |
| __device__ uint32_t | amd_wave_read_first_lane (uint32_t value) |
| __device__ int32_t | amd_wave_read_first_lane (int32_t value) |
| __device__ int64_t | amd_wave_read_first_lane (int64_t value) |
| template<typename Object, typename = ck::enable_if_t<ck::is_class_v<Object> && ck::is_trivially_copyable_v<Object>>> |
| __device__ auto | amd_wave_read_first_lane (const Object &obj) |
| template<typename X, typename... Xs> |
| __host__ __device__ constexpr auto | make_array (X &&x, Xs &&... xs) |
| template<typename X> |
| __host__ __device__ constexpr auto | make_array () |
| template<typename... Xs> |
| __host__ __device__ constexpr auto | make_multi_index (Xs &&... xs) |
| template<index_t NSize> |
| __host__ __device__ constexpr auto | make_zero_multi_index () |
| template<typename T> |
| __host__ __device__ constexpr auto | to_multi_index (const T &x) |
| template<index_t NSize, typename X> |
| __host__ __device__ constexpr auto | operator+= (MultiIndex< NSize > &y, const X &x) |
| template<index_t NSize, typename X> |
| __host__ __device__ constexpr auto | operator-= (MultiIndex< NSize > &y, const X &x) |
| template<index_t NSize, typename T> |
| __host__ __device__ constexpr auto | operator+ (const MultiIndex< NSize > &a, const T &b) |
| template<index_t NSize, typename T> |
| __host__ __device__ constexpr auto | operator- (const MultiIndex< NSize > &a, const T &b) |
| template<index_t NSize, typename T> |
| __host__ __device__ constexpr auto | operator* (const MultiIndex< NSize > &a, const T &b) |
| template<typename PY, typename PX, typename enable_if< is_pointer_v< PY > &&is_pointer_v< PX >, bool >::type = false> |
| __host__ __device__ PY | c_style_pointer_cast (PX p_x) |
| template<typename Arr, typename Picks, typename X> |
| __host__ __device__ constexpr auto | operator+= (ContainerElementPicker< Arr, Picks > &y, const X &x) |
| template<typename Arr, typename Picks, typename X> |
| __host__ __device__ constexpr auto | operator-= (ContainerElementPicker< Arr, Picks > &y, const X &x) |
| template<typename Arr, typename Picks> |
| __host__ __device__ constexpr auto | pick_container_element (Arr &a, Picks) |
| template<typename Arr, typename Picks> |
| __host__ __device__ constexpr auto | pick_container_element (const Arr &a, Picks) |
| template<typename TData, index_t NSize> |
| __host__ __device__ constexpr auto | container_push_back (const Array< TData, NSize > &a, const TData &x) |
| template<typename... Ts, typename T> |
| __host__ __device__ constexpr auto | container_push_front (const Tuple< Ts... > &a, const T &x) |
| template<typename... Ts, typename T> |
| __host__ __device__ constexpr auto | container_push_back (const Tuple< Ts... > &a, const T &x) |
| template<typename TData, index_t NSize, index_t... IRs> |
| __host__ __device__ constexpr auto | container_reorder_given_new2old (const Array< TData, NSize > &old_array, Sequence< IRs... >) |
| template<typename TData, index_t NSize, index_t... IRs> |
| __host__ __device__ constexpr auto | container_reorder_given_old2new (const Array< TData, NSize > &old_array, Sequence< IRs... > old2new) |
| template<typename... Ts, index_t... IRs> |
| __host__ __device__ constexpr auto | container_reorder_given_new2old (const Tuple< Ts... > &old_tuple, Sequence< IRs... >) |
| template<typename... Ts, index_t... IRs> |
| __host__ __device__ constexpr auto | container_reorder_given_old2new (const Tuple< Ts... > &old_tuple, Sequence< IRs... > old2new) |
| template<index_t... Is, index_t... IRs> |
| __host__ __device__ constexpr auto | container_reorder_given_new2old (Sequence< Is... >, Sequence< IRs... >) |
| template<index_t... Is, index_t... IRs> |
| __host__ __device__ constexpr auto | container_reorder_given_old2new (Sequence< Is... > old_seq, Sequence< IRs... >) |
| template<typename Container, typename Reduce, typename Init, index_t IBegin = 0, index_t IEnd = Container::Size(), index_t IStep = 1> |
| __host__ __device__ constexpr auto | container_reduce (const Container &x, Reduce reduce, Init init, Number< IBegin >=Number< 0 >{}, Number< IEnd >=Number< Container::Size()>{}, Number< IStep >=Number< 1 >{}) |
| template<typename TData, index_t NSize, typename Reduce> |
| __host__ __device__ constexpr auto | container_reverse_inclusive_scan (const Array< TData, NSize > &x, Reduce f, TData init) |
| template<typename TData, index_t NSize, typename Reduce> |
| __host__ __device__ constexpr auto | container_reverse_exclusive_scan (const Array< TData, NSize > &x, Reduce f, TData init) |
| template<index_t... Is, typename Reduce, index_t Init> |
| __host__ __device__ constexpr auto | container_reverse_exclusive_scan (const Sequence< Is... > &seq, Reduce f, Number< Init >) |
| template<typename... Xs, typename Reduce, typename Init> |
| __host__ __device__ constexpr auto | container_reverse_exclusive_scan (const Tuple< Xs... > &x, Reduce reduce, Init init) |
| template<typename... Xs, typename Reduce, typename TData> |
| __host__ __device__ constexpr auto | container_reverse_inclusive_scan (const Tuple< Xs... > &x, Reduce f, TData init) |
| template<typename X, typename... Ys> |
| __host__ __device__ constexpr auto | container_concat (const X &x, const Ys &... ys) |
| template<typename T, index_t NX, index_t NY> |
| __host__ __device__ constexpr auto | container_concat (const Array< T, NX > &ax, const Array< T, NY > &ay) |
| template<typename... X, typename... Y> |
| __host__ __device__ constexpr auto | container_concat (const Tuple< X... > &tx, const Tuple< Y... > &ty) |
| template<typename Container> |
| __host__ __device__ constexpr auto | container_concat (const Container &x) |
| template<typename T, index_t N, index_t... Is> |
| __host__ __device__ constexpr auto | get_container_subset (const Array< T, N > &arr, Sequence< Is... >) |
| template<typename... Ts, index_t... Is> |
| __host__ __device__ constexpr auto | get_container_subset (const Tuple< Ts... > &tup, Sequence< Is... >) |
| template<typename T, index_t N, index_t... Is> |
| __host__ __device__ constexpr void | set_container_subset (Array< T, N > &y, Sequence< Is... > picks, const Array< T, sizeof...(Is)> &x) |
| template<typename... Ys, index_t... Is, typename... Xs> |
| __host__ __device__ constexpr void | set_container_subset (Tuple< Ys... > &y, Sequence< Is... > picks, const Tuple< Xs... > &x) |
| template<index_t... Is> |
| __host__ __device__ constexpr auto | sequence_to_tuple_of_number (Sequence< Is... >) |
| constexpr auto | next_pow2 (uint32_t x) |
| template<typename T> |
| constexpr bool | is_native_type () |
| template<typename T> |
| const char * | get_type_name () |
| template<typename T, index_t N> |
| __host__ __device__ constexpr auto | make_vector_type (Number< N >) |
| template<AddressSpaceEnum BufferAddressSpace, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence, typename T, typename ElementSpaceSize> |
| __host__ __device__ constexpr auto | make_dynamic_buffer (T *p, ElementSpaceSize element_space_size) |
| template<AddressSpaceEnum BufferAddressSpace, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence, typename T, typename ElementSpaceSize> |
| __host__ __device__ constexpr auto | make_long_dynamic_buffer (T *p, ElementSpaceSize element_space_size) |
| template<AddressSpaceEnum BufferAddressSpace, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence, typename T, typename ElementSpaceSize, typename X, typename enable_if< is_same< remove_cvref_t< T >, remove_cvref_t< X > >::value, bool >::type = false> |
| __host__ __device__ constexpr auto | make_dynamic_buffer (T *p, ElementSpaceSize element_space_size, X invalid_element_value) |
| template<class EnvVar> |
| const std::string & | EnvGetString (EnvVar) |
| template<class EnvVar> |
| bool | EnvIsEnabled (EnvVar) |
| template<class EnvVar> |
| bool | EnvIsDisabled (EnvVar) |
| template<class EnvVar> |
| uint64_t | EnvValue (EnvVar) |
| template<class EnvVar> |
| bool | EnvIsUnset (EnvVar) |
| template<class EnvVar> |
| void | EnvUnset (EnvVar) |
| template<typename EnvVar, typename ValueType> |
| void | UpdateEnvVar (EnvVar, const ValueType &val) |
| | updates the cached value of an environment variable
|
| template<typename EnvVar> |
| void | UpdateEnvVar (EnvVar, const std::string_view &val) |
| __host__ int | clz (uint32_t x) |
| template<bool predicate, typename X, typename Y> |
| constexpr auto | conditional_expr (X &&x, Y &&y) |
| template<typename F, typename X> |
| __host__ __device__ constexpr auto | unpack (F &&f, X &&x) |
| template<typename F, typename X, typename Y> |
| __host__ __device__ constexpr auto | unpack2 (F &&f, X &&x, Y &&y) |
| template<typename X> |
| __device__ X | atomic_add (X *p_dst, const X &x) |
| template<> |
| __device__ int32_t | atomic_add< int32_t > (int32_t *p_dst, const int32_t &x) |
| template<> |
| __device__ uint32_t | atomic_add< uint32_t > (uint32_t *p_dst, const uint32_t &x) |
| template<> |
| __device__ float | atomic_add< float > (float *p_dst, const float &x) |
| template<> |
| __device__ unsigned short | atomic_add< unsigned short > (unsigned short *p_dst, const unsigned short &x) |
| template<> |
| __device__ _Float16 | atomic_add< _Float16 > (_Float16 *p_dst, const _Float16 &x) |
| template<> |
| __device__ double | atomic_add< double > (double *p_dst, const double &x) |
| template<> |
| __device__ float2_t | atomic_add< float2_t > (float2_t *p_dst, const float2_t &x) |
| template<> |
| __device__ double2_t | atomic_add< double2_t > (double2_t *p_dst, const double2_t &x) |
| template<typename X> |
| __device__ X | atomic_max (X *p_dst, const X &x) |
| template<> |
| __device__ int32_t | atomic_max< int32_t > (int32_t *p_dst, const int32_t &x) |
| template<> |
| __device__ uint32_t | atomic_max< uint32_t > (uint32_t *p_dst, const uint32_t &x) |
| template<> |
| __device__ float | atomic_max< float > (float *p_dst, const float &x) |
| template<> |
| __device__ double | atomic_max< double > (double *p_dst, const double &x) |
| template<> |
| __device__ float2_t | atomic_max< float2_t > (float2_t *p_dst, const float2_t &x) |
| __device__ constexpr index_t | get_warp_size () |
| __device__ index_t | get_thread_local_1d_id () |
| __device__ index_t | get_thread_global_1d_id () |
| __device__ index_t | get_warp_local_1d_id () |
| __device__ index_t | get_block_1d_id () |
| __device__ index_t | get_grid_size () |
| __device__ index_t | get_block_size () |
| template<> |
| constexpr __device__ index_t | get_shift< 1 > () |
| template<typename TA, typename TB, typename TC> |
| __device__ void | inner_product (const TA &a, const TB &b, TC &c) |
| template<> |
| __device__ void | inner_product< float, float, float > (const float &a, const float &b, float &c) |
| template<> |
| __device__ void | inner_product< float2_t, float2_t, float > (const float2_t &a, const float2_t &b, float &c) |
| template<> |
| __device__ void | inner_product< float4_t, float4_t, float > (const float4_t &a, const float4_t &b, float &c) |
| template<> |
| __device__ void | inner_product< bhalf_t, bhalf_t, float > (const bhalf_t &a, const bhalf_t &b, float &c) |
| template<> |
| __device__ void | inner_product< half_t, half_t, float > (const half_t &a, const half_t &b, float &c) |
| template<> |
| __device__ void | inner_product< half2_t, half2_t, float > (const half2_t &a, const half2_t &b, float &c) |
| template<> |
| __device__ void | inner_product< half4_t, half4_t, float > (const half4_t &a, const half4_t &b, float &c) |
| template<> |
| __device__ void | inner_product< half8_t, half8_t, float > (const half8_t &a, const half8_t &b, float &c) |
| template<> |
| __device__ void | inner_product< int8_t, int8_t, int32_t > (const int8_t &a, const int8_t &b, int32_t &c) |
| template<> |
| __device__ void | inner_product< int8x2_t, int8x2_t, int32_t > (const int8x2_t &a, const int8x2_t &b, int32_t &c) |
| template<> |
| __device__ void | inner_product< int8x4_t, int8x4_t, int32_t > (const int8x4_t &a, const int8x4_t &b, int32_t &c) |
| template<> |
| __device__ void | inner_product< int8x8_t, int8x8_t, int32_t > (const int8x8_t &a, const int8x8_t &b, int32_t &c) |
| template<> |
| __device__ void | inner_product< int8x16_t, int8x16_t, int32_t > (const int8x16_t &a, const int8x16_t &b, int32_t &c) |
| template<typename TX, TX X, typename TY, TY Y> |
| __host__ __device__ constexpr auto | operator+ (integral_constant< TX, X >, integral_constant< TY, Y >) |
| template<typename TX, TX X, typename TY, TY Y> |
| __host__ __device__ constexpr auto | operator- (integral_constant< TX, X >, integral_constant< TY, Y >) |
| template<typename TX, TX X, typename TY, TY Y> |
| __host__ __device__ constexpr auto | operator* (integral_constant< TX, X >, integral_constant< TY, Y >) |
| template<typename TX, TX X, typename TY, TY Y> |
| __host__ __device__ constexpr auto | operator/ (integral_constant< TX, X >, integral_constant< TY, Y >) |
| template<typename TX, TX X, typename TY, TY Y> |
| __host__ __device__ constexpr auto | operator% (integral_constant< TX, X >, integral_constant< TY, Y >) |
| constexpr LoopScheduler | make_default_loop_scheduler () |
| template<typename Y, typename X> |
| __host__ __device__ constexpr Y | mxf8_convert_sr (X x, float scale) |
| template<typename Y, typename X> |
| __host__ __device__ constexpr Y | mxf8_convert_rne (X x, float scale) |
| template<> |
| __host__ __device__ f8_ocp_t | mxf8_convert_rne< f8_ocp_t, float > (float x, float scale) |
| template<> |
| __host__ __device__ bf8_ocp_t | mxf8_convert_rne< bf8_ocp_t, float > (float x, float scale) |
| template<> |
| __host__ __device__ f8x2_ocp_t | mxf8_convert_rne< f8x2_ocp_t, float2_t > (float2_t x, float scale) |
| template<> |
| __host__ __device__ bf8x2_ocp_t | mxf8_convert_rne< bf8x2_ocp_t, float2_t > (float2_t x, float scale) |
| template<> |
| __host__ __device__ f8x16_ocp_t | mxf8_convert_rne< f8x16_ocp_t, float16_t > (float16_t x, float scale) |
| template<> |
| __host__ __device__ bf8x16_ocp_t | mxf8_convert_rne< bf8x16_ocp_t, float16_t > (float16_t x, float scale) |
| template<> |
| __host__ __device__ f8x32_ocp_t | mxf8_convert_rne< f8x32_ocp_t, float32_t > (float32_t x, float scale) |
| template<> |
| __host__ __device__ bf8x32_ocp_t | mxf8_convert_rne< bf8x32_ocp_t, float32_t > (float32_t x, float scale) |
| template<> |
| __host__ __device__ f8_ocp_t | mxf8_convert_sr< f8_ocp_t, float > (float x, float scale) |
| template<> |
| __host__ __device__ bf8_ocp_t | mxf8_convert_sr< bf8_ocp_t, float > (float x, float scale) |
| template<> |
| __host__ __device__ f8x2_ocp_t | mxf8_convert_sr< f8x2_ocp_t, float2_t > (float2_t x, float scale) |
| template<> |
| __host__ __device__ bf8x2_ocp_t | mxf8_convert_sr< bf8x2_ocp_t, float2_t > (float2_t x, float scale) |
| template<> |
| __host__ __device__ f8x16_ocp_t | mxf8_convert_sr< f8x16_ocp_t, float16_t > (float16_t x, float scale) |
| template<> |
| __host__ __device__ bf8x16_ocp_t | mxf8_convert_sr< bf8x16_ocp_t, float16_t > (float16_t x, float scale) |
| template<> |
| __host__ __device__ f8x32_ocp_t | mxf8_convert_sr< f8x32_ocp_t, float32_t > (float32_t x, float scale) |
| template<> |
| __host__ __device__ bf8x32_ocp_t | mxf8_convert_sr< bf8x32_ocp_t, float32_t > (float32_t x, float scale) |
| template<typename T, uint32_t seed_t, ck::enable_if_t< is_same< float, T >{}, bool > = false> |
| __host__ __device__ uint32_t | prand_generator (index_t id, T val, uint32_t seed=seed_t) |
| template<typename T, uint32_t seed_t, ck::enable_if_t< is_same< _Float16, T >{}, bool > = false> |
| __host__ __device__ uint32_t | prand_generator (index_t id, T val, uint32_t seed=seed_t) |
| template<typename T, uint32_t seed_t, ck::enable_if_t<!(is_same< float, T >{}||is_same< _Float16, T >{}), bool > = false> |
| __host__ __device__ uint32_t | prand_generator (int id, T val, uint32_t seed=seed_t) |
| template<typename Y, typename X> |
| __host__ constexpr Y | scaled_type_convert (e8m0_bexp_t scale, X x) |
| template<> |
| __host__ float | scaled_type_convert< float, f8_ocp_t > (e8m0_bexp_t scale, f8_ocp_t x) |
| template<> |
| __host__ float | scaled_type_convert< float, bf8_ocp_t > (e8m0_bexp_t scale, bf8_ocp_t x) |
| template<> |
| __host__ float2_t | scaled_type_convert< float2_t, f8x2_ocp_t > (e8m0_bexp_t scale, f8x2_ocp_t x) |
| template<> |
| __host__ float2_t | scaled_type_convert< float2_t, bf8x2_ocp_t > (e8m0_bexp_t scale, bf8x2_ocp_t x) |
| template<> |
| __host__ float16_t | scaled_type_convert< float16_t, f8x16_ocp_t > (e8m0_bexp_t scale, f8x16_ocp_t x) |
| template<> |
| __host__ float16_t | scaled_type_convert< float16_t, bf8x16_ocp_t > (e8m0_bexp_t scale, bf8x16_ocp_t x) |
| template<> |
| __host__ float32_t | scaled_type_convert< float32_t, f8x32_ocp_t > (e8m0_bexp_t scale, f8x32_ocp_t x) |
| template<> |
| __host__ float32_t | scaled_type_convert< float32_t, bf8x32_ocp_t > (e8m0_bexp_t scale, bf8x32_ocp_t x) |
| template<> |
| __host__ f8_ocp_t | scaled_type_convert< f8_ocp_t, float > (e8m0_bexp_t scale, float x) |
| template<> |
| __host__ bf8_ocp_t | scaled_type_convert< bf8_ocp_t, float > (e8m0_bexp_t scale, float x) |
| template<> |
| __host__ f8x2_ocp_t | scaled_type_convert< f8x2_ocp_t, float2_t > (e8m0_bexp_t scale, float2_t x) |
| template<> |
| __host__ bf8x2_ocp_t | scaled_type_convert< bf8x2_ocp_t, float2_t > (e8m0_bexp_t scale, float2_t x) |
| template<> |
| __host__ f8x16_ocp_t | scaled_type_convert< f8x16_ocp_t, float16_t > (e8m0_bexp_t scale, float16_t x) |
| template<> |
| __host__ bf8x16_ocp_t | scaled_type_convert< bf8x16_ocp_t, float16_t > (e8m0_bexp_t scale, float16_t x) |
| template<> |
| __host__ f8x32_ocp_t | scaled_type_convert< f8x32_ocp_t, float32_t > (e8m0_bexp_t scale, float32_t x) |
| template<> |
| __host__ bf8x32_ocp_t | scaled_type_convert< bf8x32_ocp_t, float32_t > (e8m0_bexp_t scale, float32_t x) |
| template<index_t I, index_t... Is> |
| __host__ __device__ constexpr auto | sequence_pop_front (Sequence< I, Is... >) |
| template<typename Seq> |
| __host__ __device__ constexpr auto | sequence_pop_back (Seq) |
| template<index_t... Xs, index_t... Ys> |
| __host__ __device__ constexpr bool | operator== (Sequence< Xs... >, Sequence< Ys... >) |
| template<index_t... Xs, index_t... Ys> |
| __host__ __device__ constexpr auto | operator+ (Sequence< Xs... >, Sequence< Ys... >) |
| template<index_t... Xs, index_t... Ys> |
| __host__ __device__ constexpr auto | operator- (Sequence< Xs... >, Sequence< Ys... >) |
| template<index_t... Xs, index_t... Ys> |
| __host__ __device__ constexpr auto | operator* (Sequence< Xs... >, Sequence< Ys... >) |
| template<index_t... Xs, index_t... Ys> |
| __host__ __device__ constexpr auto | operator/ (Sequence< Xs... >, Sequence< Ys... >) |
| template<index_t... Xs, index_t... Ys> |
| __host__ __device__ constexpr auto | operator% (Sequence< Xs... >, Sequence< Ys... >) |
| template<index_t... Xs, index_t Y> |
| __host__ __device__ constexpr auto | operator+ (Sequence< Xs... >, Number< Y >) |
| template<index_t... Xs, index_t Y> |
| __host__ __device__ constexpr auto | operator- (Sequence< Xs... >, Number< Y >) |
| template<index_t... Xs, index_t Y> |
| __host__ __device__ constexpr auto | operator* (Sequence< Xs... >, Number< Y >) |
| template<index_t... Xs, index_t Y> |
| __host__ __device__ constexpr auto | operator/ (Sequence< Xs... >, Number< Y >) |
| template<index_t... Xs, index_t Y> |
| __host__ __device__ constexpr auto | operator% (Sequence< Xs... >, Number< Y >) |
| template<index_t Y, index_t... Xs> |
| __host__ __device__ constexpr auto | operator+ (Number< Y >, Sequence< Xs... >) |
| template<index_t Y, index_t... Xs> |
| __host__ __device__ constexpr auto | operator- (Number< Y >, Sequence< Xs... >) |
| template<index_t Y, index_t... Xs> |
| __host__ __device__ constexpr auto | operator* (Number< Y >, Sequence< Xs... >) |
| template<index_t Y, index_t... Xs> |
| __host__ __device__ constexpr auto | operator/ (Number< Y >, Sequence< Xs... >) |
| template<index_t Y, index_t... Xs> |
| __host__ __device__ constexpr auto | operator% (Number< Y >, Sequence< Xs... >) |
| template<typename... Seqs> |
| __host__ __device__ constexpr auto | merge_sequences (Seqs...) |
| template<typename F, index_t... Xs> |
| __host__ __device__ constexpr auto | transform_sequences (F f, Sequence< Xs... >) |
| template<typename F, index_t... Xs, index_t... Ys> |
| __host__ __device__ constexpr auto | transform_sequences (F f, Sequence< Xs... >, Sequence< Ys... >) |
| template<typename F, index_t... Xs, index_t... Ys, index_t... Zs> |
| __host__ __device__ constexpr auto | transform_sequences (F f, Sequence< Xs... >, Sequence< Ys... >, Sequence< Zs... >) |
| template<typename Seq, typename Reduce, index_t Init> |
| __host__ __device__ constexpr auto | reverse_inclusive_scan_sequence (Seq, Reduce, Number< Init >) |
| template<typename Seq, typename Reduce, index_t Init> |
| __host__ __device__ constexpr auto | reverse_exclusive_scan_sequence (Seq, Reduce, Number< Init >) |
| template<typename Seq, typename Reduce, index_t Init> |
| __host__ __device__ constexpr auto | inclusive_scan_sequence (Seq, Reduce, Number< Init >) |
| template<typename Seq, index_t... Is> |
| __host__ __device__ constexpr auto | pick_sequence_elements_by_ids (Seq, Sequence< Is... >) |
| template<typename Seq, typename Mask> |
| __host__ __device__ constexpr auto | pick_sequence_elements_by_mask (Seq, Mask) |
| template<typename Seq, typename Values, typename Ids> |
| __host__ __device__ constexpr auto | modify_sequence_elements_by_ids (Seq, Values, Ids) |
| template<typename Seq, typename Reduce, index_t Init> |
| __host__ __device__ constexpr index_t | reduce_on_sequence (Seq, Reduce f, Number< Init >) |
| template<typename Seq, typename F> |
| __host__ __device__ constexpr bool | sequence_any_of (Seq, F f) |
| template<typename Seq, typename F> |
| __host__ __device__ constexpr bool | sequence_all_of (Seq, F f) |
| template<index_t... Is> |
| __host__ __device__ constexpr auto | make_sequence (Number< Is >...) |
| template<typename F, index_t N> |
| __host__ __device__ constexpr auto | generate_sequence (F, Number< N >) |
| template<typename F, index_t N> |
| __host__ __device__ constexpr auto | generate_sequence_v2 (F &&f, Number< N >) |
| template<index_t... Is> |
| __host__ __device__ constexpr auto | to_sequence (Tuple< Number< Is >... >) |
| template<AddressSpaceEnum AddressSpace, typename T, index_t N> |
| __host__ __device__ constexpr auto | make_static_buffer (Number< N >) |
| template<AddressSpaceEnum AddressSpace, typename T, long_index_t N> |
| __host__ __device__ constexpr auto | make_static_buffer (LongNumber< N >) |
| template<typename X, typename... Xs> |
| __host__ __device__ constexpr auto | make_statically_indexed_array (const X &x, const Xs &... xs) |
| template<typename X> |
| __host__ __device__ constexpr auto | make_statically_indexed_array () |
| template<typename... Ys, typename X, enable_if_t<!ck::is_integral< X >::value &&!ck::is_floating_point< X >::value, bool > = false> |
| __host__ __device__ constexpr auto | operator+= (Tuple< Ys... > &y, const X &x) |
| template<typename... Ys, typename X, enable_if_t<!ck::is_integral< X >::value &&!ck::is_floating_point< X >::value, bool > = false> |
| __host__ __device__ constexpr auto | operator-= (Tuple< Ys... > &y, const X &x) |
| template<typename... Xs, typename Y, enable_if_t<!ck::is_integral< Y >::value &&!ck::is_floating_point< Y >::value, bool > = false> |
| __host__ __device__ constexpr auto | operator+ (const Tuple< Xs... > &x, const Y &y) |
| template<typename... Xs, typename Y, enable_if_t<!ck::is_integral< Y >::value &&!ck::is_floating_point< Y >::value, bool > = false> |
| __host__ __device__ constexpr auto | operator- (const Tuple< Xs... > &x, const Y &y) |
| template<typename... Xs, typename Y, enable_if_t<!ck::is_integral< Y >::value &&!ck::is_floating_point< Y >::value, bool > = false> |
| __host__ __device__ constexpr auto | operator* (const Tuple< Xs... > &x, const Y &y) |
| template<typename... Xs, typename Y, enable_if_t< ck::is_integral< Y >::value||ck::is_floating_point< Y >::value, bool > = false> |
| __host__ __device__ constexpr auto | operator* (Y a, const Tuple< Xs... > &x) |
| template<typename... Xs, typename Y, enable_if_t< ck::is_integral< Y >::value||ck::is_floating_point< Y >::value, bool > = false> |
| __host__ __device__ constexpr auto | operator* (const Tuple< Xs... > &x, Y a) |
| template<typename... Xs> |
| __host__ __device__ void | print_multi_index (const Tuple< Xs... > &x) |
| __device__ void | block_sync_lds () |
| __device__ void | block_sync_lds_direct_load () |
| __device__ void | s_nop () |
| __device__ void | transpose_fp16_2x2 (const half2_t &x0, const half2_t &x1, half2_t &y0, half2_t &y1) |
| __device__ void | transpose_int8_4x4 (const int8x4_t &x0, const int8x4_t &x1, const int8x4_t &x2, const int8x4_t &x3, int8x4_t &y0, int8x4_t &y1, int8x4_t &y2, int8x4_t &y3) |
| __device__ void | transpose_f8_4x4 (const f8x4_t &x0, const f8x4_t &x1, const f8x4_t &x2, const f8x4_t &x3, f8x4_t &y0, f8x4_t &y1, f8x4_t &y2, f8x4_t &y3) |
| template<typename... Xs> |
| __host__ __device__ constexpr auto | make_tuple (Xs &&... xs) |
| template<typename... Args> |
| constexpr Tuple< Args &... > | tie (Args &... args) noexcept |
| template<typename F, index_t... ids> |
| __host__ __device__ constexpr auto | generate_tuple_for (F &&f, Sequence< ids... >) |
| template<typename F, index_t N> |
| __host__ __device__ constexpr auto | generate_tuple (F &&f, Number< N >) |
| template<typename F, index_t N> |
| __host__ __device__ constexpr auto | generate_tuple (F &&f, LongNumber< N >) |
| template<typename F, index_t N> |
| __host__ __device__ constexpr auto | generate_tie (F &&f, Number< N >) |
| template<typename... X, typename... Y> |
| __host__ __device__ constexpr auto | concat_tuple_of_reference (const Tuple< X &... > &tx, const Tuple< Y &... > &ty) |
| template<typename... X, typename... Y> |
| __host__ __device__ constexpr auto | concat_tuple (const Tuple< X... > &tx, const Tuple< Y... > &ty) |
| template<typename... X> |
| __host__ __device__ constexpr auto | concat_tuple (const Tuple< X... > &tx) |
| template<typename... X, typename... Tuples> |
| __host__ __device__ constexpr auto | concat_tuple (const Tuple< X... > &tx, const Tuples &... tuples) |
| template<typename F, typename X> |
| __host__ __device__ constexpr auto | transform_tuples (F f, const X &x) |
| template<typename F, typename X, typename Y> |
| __host__ __device__ constexpr auto | transform_tuples (F f, const X &x, const Y &y) |
| template<typename F, typename X, typename Y, typename Z> |
| __host__ __device__ constexpr auto | transform_tuples (F f, const X &x, const Y &y, const Z &z) |
| template<index_t Depth = 0, index_t MaxDepth = -1> |
| __host__ __device__ constexpr auto | UnrollNestedTuple (const Tuple<> &element) |
| template<index_t Depth = 0, index_t MaxDepth = -1, typename T> |
| __host__ __device__ constexpr auto | UnrollNestedTuple (const T &element) |
| template<index_t Depth = 0, index_t MaxDepth = -1, typename... Ts> |
| __host__ __device__ constexpr auto | UnrollNestedTuple (const Tuple< Ts... > &tuple) |
| template<typename... Ts> |
| __host__ __device__ constexpr auto | TupleReverse (const Tuple< Ts... > &tuple) |
| template<index_t Idx, index_t End, typename F, typename... Ts> |
| __host__ __device__ constexpr auto | TupleReduce (F &&f, const Tuple< Ts... > &tuple) |
| template<typename... Ts> |
| __host__ __device__ constexpr auto | IsNestedTuple (const Tuple< Ts... > &) |
| template<index_t depth = 0, typename T> |
| __host__ __device__ constexpr auto | TupleDepth (const T &) |
| template<index_t depth = 0, typename... Ts> |
| __host__ __device__ constexpr auto | TupleDepth (const Tuple< Ts... > &) |
| template<index_t from, index_t to, typename... Ts> |
| __host__ __device__ constexpr auto | TupleSlice (const Tuple< Ts... > &tuple) |
| template<typename Y, typename X, typename enable_if< sizeof(X)==sizeof(Y), bool >::type = false> |
| __host__ __device__ constexpr Y | bit_cast (const X &x) |
| template<typename Y, typename X> |
| __host__ __device__ constexpr Y | bf16_convert_rtn (X x) |
| template<> |
| __host__ __device__ constexpr bhalf_t | bf16_convert_rtn< bhalf_t, float > (float x) |
| template<> |
| __host__ __device__ constexpr bhalf_t | bf16_convert_rtn< bhalf_t, half_t > (half_t x) |
| template<typename Y, typename X, ck::enable_if_t<!(ck::is_const_v< Y >||ck::is_const_v< X >), bool > = false> |
| __host__ __device__ constexpr Y | type_convert (X x) |
| template<typename Y, typename X, ck::enable_if_t< ck::is_const_v< Y >||ck::is_const_v< X >, bool > = false> |
| __host__ __device__ constexpr Y | type_convert (X x) |
| template<> |
| __host__ __device__ constexpr float | type_convert< float, bhalf_t > (bhalf_t x) |
| template<> |
| __host__ __device__ constexpr bhalf_t | type_convert< bhalf_t, float > (float x) |
| template<> |
| __host__ __device__ constexpr half_t | type_convert< half_t, bhalf_t > (bhalf_t x) |
| template<> |
| __host__ __device__ constexpr bhalf_t | type_convert< bhalf_t, half_t > (half_t x) |
| template<> |
| __host__ __device__ constexpr int8_t | type_convert< int8_t, bhalf_t > (bhalf_t x) |
| template<> |
| __host__ __device__ constexpr bhalf_t | type_convert< bhalf_t, int8_t > (int8_t x) |
| template<> |
| __host__ __device__ constexpr f8_ocp_t | type_convert< f8_ocp_t, int > (int x) |
| template<> |
| __host__ __device__ constexpr bf8_ocp_t | type_convert< bf8_ocp_t, int > (int x) |
| template<typename Y, enable_if_t< is_same_v< Y, ck::tf32_t >, bool > = false> |
| __host__ __device__ constexpr float | type_convert (float x) |
| template<typename Y, typename X> |
| __host__ __device__ constexpr Y | type_convert_sp (X x) |
| template<> |
| __host__ __device__ constexpr int | type_convert_sp< int, float > (float x) |
| template<> |
| __host__ __device__ constexpr float | type_convert_sp< float, int > (int x) |
| template<> |
| __host__ __device__ constexpr int | type_convert_sp< int, half_t > (half_t x) |
| template<> |
| __host__ __device__ constexpr half_t | type_convert_sp< half_t, int > (int x) |
| template<> |
| __host__ __device__ constexpr int | type_convert_sp< int, f8_t > (f8_t x) |
| template<> |
| __host__ __device__ constexpr f8_t | type_convert_sp< f8_t, int > (int x) |
| template<> |
| __host__ __device__ constexpr int | type_convert_sp< int, bhalf_t > (bhalf_t x) |
| template<> |
| __host__ __device__ constexpr bhalf_t | type_convert_sp< bhalf_t, int > (int x) |
| template<> |
| __host__ __device__ constexpr bhalf_t | type_convert_sp< bhalf_t, float > (float x) |
| template<> |
| __host__ __device__ constexpr half_t | type_convert_sp< half_t, float > (float x) |
| template<typename Y, typename X> |
| __host__ __device__ constexpr Y | f8_convert_sr (X x) |
| template<> |
| __host__ __device__ f8_fnuz_t | f8_convert_sr< f8_fnuz_t, float > (float x) |
| template<> |
| __host__ __device__ f8_fnuz_t | f8_convert_sr< f8_fnuz_t, half_t > (half_t x) |
| template<> |
| __host__ __device__ bf8_fnuz_t | f8_convert_sr< bf8_fnuz_t, float > (float x) |
| template<> |
| __host__ __device__ bf8_fnuz_t | f8_convert_sr< bf8_fnuz_t, half_t > (half_t x) |
| template<> |
| __host__ __device__ f8_ocp_t | f8_convert_sr< f8_ocp_t, float > (float x) |
| | Converts a float to a 8-bit float type (f8_ocp_t) using stochastic rounding.
|
| template<> |
| __host__ __device__ f8x2_ocp_t | f8_convert_sr< f8x2_ocp_t, float2_t > (float2_t x) |
| | Converts a vector of 2 floats to a vector of 2 8-bit float types (f8_ocp_t) using stochastic rounding.
|
| template<> |
| __host__ __device__ bf8_ocp_t | f8_convert_sr< bf8_ocp_t, float > (float x) |
| | Converts a float to a 8-bit float type (bf8_ocp_t) using stochastic rounding.
|
| template<> |
| __host__ __device__ bf8x2_ocp_t | f8_convert_sr< bf8x2_ocp_t, float2_t > (float2_t x) |
| | Converts a vector of 2 floats to a vector of 2 8-bit float types (bf8_ocp_t) using stochastic rounding.
|
| template<> |
| __host__ __device__ f8_ocp_t | f8_convert_sr< f8_ocp_t, half_t > (half_t x) |
| | Converts a half_t to a 8-bit float type (f8_ocp_t) using stochastic rounding.
|
| template<> |
| __host__ __device__ f8x2_ocp_t | f8_convert_sr< f8x2_ocp_t, half2_t > (half2_t x) |
| | Converts a vector of 2 half_t to a vector of 2 8-bit float types (f8_ocp_t) using stochastic rounding.
|
| template<> |
| __host__ __device__ bf8_ocp_t | f8_convert_sr< bf8_ocp_t, half_t > (half_t x) |
| | Converts a half_t to a 8-bit half_t type (bf8_ocp_t) using stochastic rounding.
|
| template<> |
| __host__ __device__ bf8x2_ocp_t | f8_convert_sr< bf8x2_ocp_t, half2_t > (half2_t x) |
| | Converts a vector of 2 half_t to a vector of 2 8-bit float types (bf8_ocp_t) using stochastic rounding.
|
| template<> |
| __host__ __device__ f8_ocp_t | f8_convert_sr< f8_ocp_t, bhalf_t > (bhalf_t x) |
| | Converts a bhalf_t to a 8-bit float type (f8_ocp_t) using stochastic rounding.
|
| template<> |
| __host__ __device__ f8x2_ocp_t | f8_convert_sr< f8x2_ocp_t, bhalf2_t > (bhalf2_t x) |
| | Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (f8_ocp_t) using stochastic rounding.
|
| template<> |
| __host__ __device__ bf8_ocp_t | f8_convert_sr< bf8_ocp_t, bhalf_t > (bhalf_t x) |
| | Converts a bhalf_t to a 8-bit half_t type (bf8_ocp_t) using stochastic rounding.
|
| template<> |
| __host__ __device__ bf8x2_ocp_t | f8_convert_sr< bf8x2_ocp_t, bhalf2_t > (bhalf2_t x) |
| | Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (bf8_ocp_t) using stochastic rounding.
|
| template<typename Y, typename X> |
| __host__ __device__ constexpr Y | f8_convert_rne (X x) |
| template<> |
| __host__ __device__ f8_fnuz_t | f8_convert_rne< f8_fnuz_t, float > (float x) |
| template<> |
| __host__ __device__ f8_fnuz_t | f8_convert_rne< f8_fnuz_t, half_t > (half_t x) |
| template<> |
| __host__ __device__ bf8_fnuz_t | f8_convert_rne< bf8_fnuz_t, float > (float x) |
| template<> |
| __host__ __device__ bf8_fnuz_t | f8_convert_rne< bf8_fnuz_t, half_t > (half_t x) |
| template<> |
| __host__ __device__ f8_ocp_t | f8_convert_rne< f8_ocp_t, float > (float x) |
| | Converts a float to a 8-bit float type (f8_ocp_t) using rounding to nearest/even.
|
| template<> |
| __host__ __device__ f8x2_ocp_t | f8_convert_rne< f8x2_ocp_t, float2_t > (float2_t x) |
| | Converts a vector of 2 floats to a vector of 2 8-bit float types (f8_ocp_t) using rounding to nearest/even.
|
| template<> |
| __host__ __device__ bf8_ocp_t | f8_convert_rne< bf8_ocp_t, float > (float x) |
| | Converts a float to a 8-bit float type (bf8_ocp_t) using rounding to nearest/even.
|
| template<> |
| __host__ __device__ bf8x2_ocp_t | f8_convert_rne< bf8x2_ocp_t, float2_t > (float2_t x) |
| | Converts a vector of 2 floats to a vector of 2 8-bit float types (bf8_ocp_t) using rounding to nearest/even.
|
| template<> |
| __host__ __device__ f8_ocp_t | f8_convert_rne< f8_ocp_t, half_t > (half_t x) |
| | Converts a half_t to a 8-bit float type (f8_ocp_t) using rounding to nearest/even.
|
| template<> |
| __host__ __device__ f8x2_ocp_t | f8_convert_rne< f8x2_ocp_t, half2_t > (half2_t x) |
| | Converts a vector of 2 half_t to a vector of 2 8-bit float types (f8_ocp_t) using rounding to nearest/even.
|
| template<> |
| __host__ __device__ bf8_ocp_t | f8_convert_rne< bf8_ocp_t, half_t > (half_t x) |
| | Converts a half_t to a 8-bit half_t type (bf8_ocp_t) using rounding to nearest/even.
|
| template<> |
| __host__ __device__ bf8x2_ocp_t | f8_convert_rne< bf8x2_ocp_t, half2_t > (half2_t x) |
| | Converts a vector of 2 half_t to a vector of 2 8-bit float types (bf8_ocp_t) using rounding to nearest/even.
|
| template<> |
| __host__ __device__ f8_ocp_t | f8_convert_rne< f8_ocp_t, bhalf_t > (bhalf_t x) |
| | Converts a bhalf_t to a 8-bit float type (f8_ocp_t) using rounding to nearest/even.
|
| template<> |
| __host__ __device__ f8x2_ocp_t | f8_convert_rne< f8x2_ocp_t, bhalf2_t > (bhalf2_t x) |
| | Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (f8_ocp_t) using rounding to nearest/even.
|
| template<> |
| __host__ __device__ bf8_ocp_t | f8_convert_rne< bf8_ocp_t, bhalf_t > (bhalf_t x) |
| | Converts a bhalf_t to a 8-bit half_t type (bf8_ocp_t) using rounding to nearest/even.
|
| template<> |
| __host__ __device__ bf8x2_ocp_t | f8_convert_rne< bf8x2_ocp_t, bhalf2_t > (bhalf2_t x) |
| | Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (bf8_ocp_t) using rounding to nearest/even.
|
| template<> |
| __host__ __device__ f8_fnuz_t | type_convert< f8_fnuz_t, float > (float x) |
| template<> |
| __host__ __device__ float | type_convert< float, f8_fnuz_t > (f8_fnuz_t x) |
| template<> |
| __host__ __device__ float2_t | type_convert< float2_t, f8x2_fnuz_t > (f8x2_fnuz_t x) |
| template<> |
| __host__ __device__ float | type_convert< float, f8_ocp_t > (f8_ocp_t x) |
| | Converts a f8_ocp_t value to a float value.
|
| template<> |
| __host__ __device__ float2_t | type_convert< float2_t, f8x2_ocp_t > (f8x2_ocp_t x) |
| | Converts a vector of 2 f8_ocp_t values to a vector of 2 float values.
|
| template<> |
| __host__ __device__ half_t | type_convert< half_t, f8_ocp_t > (f8_ocp_t x) |
| | Converts a f8_ocp_t value to a half_t value.
|
| template<> |
| __host__ __device__ half2_t | type_convert< half2_t, f8x2_ocp_t > (f8x2_ocp_t x) |
| | Converts a vector of 2 f8_ocp_t values to a vector of 2 half_t values.
|
| template<> |
| __host__ __device__ bhalf_t | type_convert< bhalf_t, f8_ocp_t > (f8_ocp_t x) |
| | Converts a f8_ocp_t value to a bhalf_t value.
|
| template<> |
| __host__ __device__ bhalf2_t | type_convert< bhalf2_t, f8x2_ocp_t > (f8x2_ocp_t x) |
| | Converts a vector of 2 f8_ocp_t values to a vector of 2 bhalf_t values.
|
| template<> |
| __host__ __device__ float | type_convert< float, bf8_ocp_t > (bf8_ocp_t x) |
| | Converts a bf8_ocp_t value to a float value.
|
| template<> |
| __host__ __device__ float2_t | type_convert< float2_t, bf8x2_ocp_t > (bf8x2_ocp_t x) |
| | Converts a vector of 2 bf8_ocp_t values to a vector of 2 float values.
|
| template<> |
| __host__ __device__ half_t | type_convert< half_t, bf8_ocp_t > (bf8_ocp_t x) |
| | Converts a bf8_ocp_t value to a half_t value.
|
| template<> |
| __host__ __device__ half2_t | type_convert< half2_t, bf8x2_ocp_t > (bf8x2_ocp_t x) |
| | Converts a vector of 2 bf8_ocp_t values to a vector of 2 half_t values.
|
| template<> |
| __host__ __device__ bhalf_t | type_convert< bhalf_t, bf8_ocp_t > (bf8_ocp_t x) |
| | Converts a bf8_ocp_t value to a bhalf_t value.
|
| template<> |
| __host__ __device__ bhalf2_t | type_convert< bhalf2_t, bf8x2_ocp_t > (bf8x2_ocp_t x) |
| | Converts a vector of 2 bf8_ocp_t values to a vector of 2 bhalf_t values.
|
| template<> |
| __host__ __device__ float2_t | type_convert< float2_t, pk_i4_t > (pk_i4_t x) |
| template<> |
| __host__ __device__ half2_t | type_convert< half2_t, pk_i4_t > (pk_i4_t x) |
| template<> |
| __host__ __device__ bhalf2_t | type_convert< bhalf2_t, pk_i4_t > (pk_i4_t x) |
| template<> |
| __host__ __device__ half2_t | type_convert< half2_t, float2_t > (float2_t x) |
| template<> |
| __host__ __device__ f8_fnuz_t | type_convert< f8_fnuz_t, half_t > (half_t x) |
| template<> |
| __host__ __device__ f8_ocp_t | type_convert< f8_ocp_t, half_t > (half_t x) |
| | Converts a half_t value to a f8_ocp_t value with rounding determined by a flag.
|
| template<> |
| __host__ __device__ bf8_ocp_t | type_convert< bf8_ocp_t, half_t > (half_t x) |
| | Converts a half_t value to a bf8_ocp_t value with rounding determined by a flag.
|
| template<> |
| __host__ __device__ half_t | type_convert< half_t, f8_fnuz_t > (f8_fnuz_t x) |
| template<> |
| __host__ __device__ bf8_fnuz_t | type_convert< bf8_fnuz_t, float > (float x) |
| template<> |
| __host__ __device__ f8_ocp_t | type_convert< f8_ocp_t, float > (float x) |
| | Converts a float value to a f8_ocp_t value with rounding determined by a flag.
|
| template<> |
| __host__ __device__ bf8_ocp_t | type_convert< bf8_ocp_t, float > (float x) |
| | Converts a float value to a bf8_ocp_t value with rounding determined by a flag.
|
| template<> |
| __host__ __device__ f8_ocp_t | type_convert< f8_ocp_t, bhalf_t > (bhalf_t x) |
| | Converts a bhalf_t value to a f8_ocp_t value with rounding determined by a flag.
|
| template<> |
| __host__ __device__ bf8_ocp_t | type_convert< bf8_ocp_t, bhalf_t > (bhalf_t x) |
| | Converts a bhalf_t value to a bf8_ocp_t value with rounding determined by a flag.
|
| template<> |
| __host__ __device__ float | type_convert< float, bf8_fnuz_t > (bf8_fnuz_t x) |
| template<> |
| __host__ __device__ bf8_fnuz_t | type_convert< bf8_fnuz_t, half_t > (half_t x) |
| template<> |
| __host__ __device__ half_t | type_convert< half_t, bf8_fnuz_t > (bf8_fnuz_t x) |
| __host__ __device__ f4_t | f4_convert_rne (float x, float scale=1.0f) |
| __host__ __device__ f4x2_t | f4_convert_rne (float2_t x, float scale=1.0f) |
| __host__ __device__ f4_t | f4_convert_sr (float x, float scale=1.0f) |
| __host__ __device__ f4x2_t | f4_convert_sr (float2_t x, float scale=1.0f) |
| template<> |
| __host__ __device__ f4_t | type_convert< f4_t, float > (float x) |
| template<> |
| __host__ __device__ f4x2_t | type_convert< f4x2_t, float2_t > (float2_t x) |
| template<> |
| __host__ __device__ f4x2_pk_t | type_convert< f4x2_pk_t, float2_t > (float2_t x) |
| template<> |
| __host__ __device__ f4x32_t | type_convert< f4x32_t, float32_t > (float32_t x) |
| template<> |
| __host__ __device__ float | type_convert< float, f4_t > (f4_t x) |
| template<> |
| __host__ __device__ float2_t | type_convert< float2_t, f4x2_t > (f4x2_t x) |
| template<> |
| __host__ __device__ float32_t | type_convert< float32_t, f4x32_t > (f4x32_t x) |
| __host__ __device__ f6_t | f6_convert_rne (float x, float scale=1.0f) |
| | Converts a float to a 6-bit float type (f6_t) using round-to-nearest-even.
|
| __host__ __device__ f6x32_t | f6_convert_rne (float32_t x, float scale=1.0f) |
| | Converts a 32-element single-precision float array into a packed 6-bit representation.
|
| __host__ __device__ f6_t | f6_convert_sr (float x, float scale=1.0f) |
| | Converts a float to the 6-bit floating-point type (f6_t) using stochastic rounding.
|
| __host__ __device__ f6x32_t | f6_convert_sr (float32_t x, float scale=1.0f) |
| | Converts a 32-element single-precision float array into a packed 6-bit representation.
|
| template<> |
| __host__ __device__ f6_t | type_convert< f6_t, float > (float x) |
| | Specializes the type conversion template for converting a float into the 6-bit float type (f6_t).
|
| template<> |
| __host__ __device__ f6x32_t | type_convert< f6x32_t, float32_t > (float32_t x) |
| | Specializes the type conversion template for converting a vector of 32 floats into the vector of 32 6-bit float types (f6x32_t).
|
| template<> |
| __host__ __device__ f6x32_pk_t | type_convert< f6x32_pk_t, float32_t > (float32_t x) |
| template<> |
| __host__ __device__ f6x16_t | type_convert< f6x16_t, float16_t > (float16_t x) |
| template<> |
| __host__ __device__ f6x16_pk_t | type_convert< f6x16_pk_t, float16_t > (float16_t x) |
| template<> |
| __host__ __device__ float | type_convert< float, f6_t > (f6_t x) |
| | Specializes the type conversion template for converting the 6-bit float type (f6_t) to float.
|
| template<> |
| __host__ __device__ float32_t | type_convert< float32_t, f6x32_t > (f6x32_t x) |
| | Specializes the type conversion template for converting the vector of 32 6-bit float types (f6x32_t) to vector of 32 floats.
|
| template<> |
| __host__ __device__ float16_t | type_convert< float16_t, f6x16_t > (f6x16_t x) |
| template<> |
| __host__ __device__ float16_t | type_convert< float16_t, f6x16_pk_t > (f6x16_pk_t x) |
| __host__ __device__ bf6_t | bf6_convert_rne (float x, float scale=1.0f) |
| | Converts a float to the 6-bit BF6 type using round-to-nearest-even.
|
| __host__ __device__ bf6x32_t | bf6_convert_rne (float32_t x, float scale=1.0f) |
| | Converts a vector of 32 floats to the vector of 32 6-bit BF6 types using round-to-nearest-even.
|
| __host__ __device__ bf6_t | bf6_convert_sr (float x, float scale=1.0f) |
| | Converts a float to the 6-bit BF6 type using stochastic rounding.
|
| __host__ __device__ bf6x32_t | bf6_convert_sr (float32_t x, float scale=1.0f) |
| | Converts a vector of 32 floats to the vector of 32 6-bit BF6 types using stochastic rounding.
|
| template<> |
| __host__ __device__ bf6_t | type_convert< bf6_t, float > (float x) |
| | Specializes float-to-bf6_t conversion.
|
| template<> |
| __host__ __device__ bf6x32_t | type_convert< bf6x32_t, float32_t > (float32_t x) |
| | Specializes vector of 32 float-to-bf6_t conversion.
|
| template<> |
| __host__ __device__ bf6x32_pk_t | type_convert< bf6x32_pk_t, float32_t > (float32_t x) |
| template<> |
| __host__ __device__ bf6x16_t | type_convert< bf6x16_t, float16_t > (float16_t x) |
| template<> |
| __host__ __device__ bf6x16_pk_t | type_convert< bf6x16_pk_t, float16_t > (float16_t x) |
| template<> |
| __host__ __device__ float | type_convert< float, bf6_t > (bf6_t x) |
| | Specializes the type conversion template for converting a bf6_t value to float.
|
| template<> |
| __host__ __device__ float32_t | type_convert< float32_t, bf6x32_t > (bf6x32_t x) |
| | Specializes the type conversion template for converting a vector of 32 bf6_t values to vector of 32 floats.
|
| template<> |
| __host__ __device__ float16_t | type_convert< float16_t, bf6x16_t > (bf6x16_t x) |
| template<> |
| __host__ __device__ float16_t | type_convert< float16_t, bf6x16_pk_t > (bf6x16_pk_t x) |
| template<typename Y, typename X, size_t NumElems> |
| __host__ __device__ void | array_convert (std::array< Y, NumElems > &y, const std::array< X, NumElems > &x) |
| template<typename Y, typename X, index_t NumElems> |
| __host__ __device__ void | array_convert (Array< Y, NumElems > &y, const Array< X, NumElems > &x) |