22template <
typename SliceLengths,
23 typename SrcElementwiseOperation,
24 typename DstElementwiseOperation,
30 typename SrcDimAccessOrder,
31 typename DstDimAccessOrder,
36 index_t SrcScalarStrideInVector,
37 index_t DstScalarStrideInVector,
38 bool SrcResetCoordinateAfterRun,
41 bool DstResetCoordinateAfterRun,
82 const SrcDesc& src_desc,
83 const Index& src_slice_origin,
84 const SrcElementwiseOperation& src_element_op,
85 const DstDesc& dst_desc,
86 const Index& dst_slice_origin,
87 const DstElementwiseOperation& dst_element_op)
90 src_element_op_(src_element_op),
91 dst_element_op_(dst_element_op)
96 "SrcData != DstData");
100 "SrcScalarPerVector_ and DstScalarPerVector_ cannot be 1 for packed data type");
102 static_assert(SrcVectorDim == DstVectorDim,
103 "Packed data type does not support transpose");
117 template <
typename SrcBuffer, index_t ThreadScratchId = 0>
118 __device__
void RunRead(
const SrcDesc& src_desc,
119 const SrcBuffer& src_buf,
128 "wrong! SrcBuffer and SrcData data type are inconsistent");
135 constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
137 static_assert(SliceLengths::At(SrcVectorDim) % (SrcScalarPerVector_) == 0,
138 "SliceLengths[SrcVectorDim] must be divisible by SrcScalarPerVector");
140 constexpr auto src_dim_access_order = SrcDimAccessOrder{};
142 constexpr auto ordered_src_access_lengths =
148 Index forward_step_idx;
150 static_for<0, nDim, 1>{}([&](
auto j) {
151 forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0;
161 Index backward_step_idx;
163 static_for<0, nDim, 1>{}([&](
auto j) {
164 backward_step_idx(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0;
172 static_ford<
decltype(ordered_src_access_lengths)>{}([&](
auto ordered_src_access_idx) {
174 constexpr auto forward_sweep = [&]() {
177 forward_sweep_(
I0) =
true;
179 static_for<1, nDim, 1>{}([&](
auto i) {
180 index_t tmp = ordered_src_access_idx[
I0];
182 static_for<1, i, 1>{}([&](
auto j) {
183 tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j];
186 forward_sweep_(i) = tmp % 2 == 0;
189 return forward_sweep_;
193 constexpr auto src_data_idx = [&]() {
196 static_for<0, nDim, 1>{}([&](
auto i) {
197 ordered_idx(i) = forward_sweep[i] ? ordered_src_access_idx[i]
198 : ordered_src_access_lengths[i] - 1 -
199 ordered_src_access_idx[i];
203 src_scalar_per_access;
210 const bool is_src_valid =
212 src_oob_thread_scratch_tuple_(thread_scratch_id)
213 .template SetAsType<bool>(src_data_idx_seq, is_src_valid);
216 using dst_vector_t =
typename dst_vector_type::type;
217 dst_vector_type op_r_v;
219 constexpr auto get_elem_op_vec_len = []() {
222 if constexpr(
decltype(src_element_op_)::is_pack8_invocable)
226 decltype(src_element_op_)>
::value)
228 if constexpr(
decltype(src_element_op_)::is_pack4_invocable)
232 decltype(src_element_op_)>
::value)
234 if constexpr(
decltype(src_element_op_)::is_pack2_invocable)
243 constexpr index_t elem_op_vec_len = get_elem_op_vec_len();
245 using src_elem_op_vec_t =
typename vector_type<SrcData, elem_op_vec_len>::type;
246 using dst_elem_op_vec_t =
typename vector_type<DstData, elem_op_vec_len>::type;
248 using VectorSizeLookupTable = Tuple<Sequence<>,
255 Sequence<I4, I2, I1>,
259 Sequence<I8, I2, I1>,
261 Sequence<I8, I4, I1>,
262 Sequence<I8, I4, I2>,
263 Sequence<I8, I4, I2, I1>,
265 using VectorOffsetsLookupTable = Tuple<Sequence<>,
272 Sequence<I0, I4, I6>,
276 Sequence<I0, I8, I10>,
278 Sequence<I0, I8, I12>,
279 Sequence<I0, I8, I12>,
280 Sequence<I0, I8, I12, I14>,
283 static_for<0, tuple_element_t<SrcScalarPerVector, VectorSizeLookupTable>::Size(), 1>{}(
285 constexpr auto VectorLoadSize =
286 tuple_element_t<SrcScalarPerVector, VectorSizeLookupTable>::At(v_idx);
287 constexpr auto LoadOffset =
288 tuple_element_t<SrcScalarPerVector, VectorOffsetsLookupTable>::At(v_idx);
291 using src_vector_container_t =
typename src_vector_container::type;
293 src_vector_container src_vector =
294 src_vector_container{src_buf.template Get<src_vector_container_t>(
295 src_coord_.GetOffset() /
PackedSize + LoadOffset,
true)};
297 static_for<0, VectorLoadSize / elem_op_vec_len, 1>{}([&](
auto idx) {
301 op_r_v.template AsType<dst_elem_op_vec_t>()(idx + LoadOffset),
302 src_vector.template AsType<src_elem_op_vec_t>()[idx]);
307 src_thread_scratch_tuple_(thread_scratch_id)
308 .template SetAsType<dst_vector_t>(src_data_idx_seq,
309 op_r_v.template AsType<dst_vector_t>()[
I0]);
311 constexpr auto move_on_dim = [&]()
constexpr {
314 static_for<0, nDim, 1>{}([&](
auto i) {
315 move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1;
317 static_for<i + 1, nDim, 1>{}([&](
auto j) {
319 ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1;
327 static_for<0, nDim, 1>{}([&](
auto i) {
328 if constexpr(move_on_dim[i])
330 if constexpr(forward_sweep[i])
333 src_desc, src_coord_, src_forward_steps[src_dim_access_order[i]]);
338 src_desc, src_coord_, src_backward_steps[src_dim_access_order[i]]);
345 if constexpr(SrcResetCoordinateAfterRun)
347 const auto src_reset_step =
354 template <
typename SeqIdx, index_t ThreadScratchId = 0>
355 __device__
constexpr auto
359 return src_thread_scratch_tuple_(thread_scratch_id).template GetAsType<vector_t>(SeqIdx{});
362 template <index_t ThreadScratchId>
366#if !CK_EXPERIMENTAL_USE_IN_REGISTER_SUB_DWORD_TRANSPOSE
368 dst_thread_scratch_(idx) = src_thread_scratch_tuple_[thread_scratch_id][idx];
375 constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
377 constexpr auto src_dim_access_order = SrcDimAccessOrder{};
379 constexpr auto ordered_src_access_lengths =
383 static_ford<
decltype(ordered_src_access_lengths)>{}([&](
auto ordered_src_access_idx) {
385 constexpr auto forward_sweep = [&]() {
388 forward_sweep_(
I0) =
true;
391 index_t tmp = ordered_src_access_idx[
I0];
394 tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j];
397 forward_sweep_(i) = tmp % 2 == 0;
400 return forward_sweep_;
404 constexpr auto src_data_idx = [&]() {
408 ordered_idx(i) = forward_sweep[i] ? ordered_src_access_idx[i]
409 : ordered_src_access_lengths[i] - 1 -
410 ordered_src_access_idx[i];
414 src_scalar_per_access;
422 auto op_r = src_thread_scratch_tuple_(thread_scratch_id)
423 .template GetAsType<vector_t>(src_data_idx_seq);
425 const bool is_src_valid = src_oob_thread_scratch_tuple_(thread_scratch_id)
426 .template GetAsType<bool>(src_data_idx_seq);
428 auto op_r_v = is_src_valid ? op_r : vector_t(0);
430 src_thread_scratch_tuple_(thread_scratch_id)
431 .template SetAsType<vector_t>(src_data_idx_seq, op_r_v);
436 if constexpr(SrcVectorDim != DstVectorDim &&
445 "in-register transpose is not supported for pk_i4_t");
447 "in-register transpose is not supported for f4x2_pk_t");
456 static_assert(SrcVectorDim != DstVectorDim,
"wrong");
471 constexpr auto access_lengths = SliceLengths{} / scalar_per_access;
473 static_ford<
decltype(access_lengths)>{}([&](
auto access_idx) {
474 constexpr auto data_idx = access_idx * scalar_per_access;
485 [&](
auto i) ->
const src_vector_t& {
487 return src_thread_scratch_tuple_[thread_scratch_id].GetVectorTypeReference(
488 data_idx_seq + i * dst_scalar_step_in_vector);
494 [&](
auto i) -> dst_vector_t& {
496 return dst_thread_scratch_.GetVectorTypeReference(
497 data_idx_seq + i * src_scalar_step_in_vector);
503 src_vector_refs, dst_vector_refs);
511 constexpr auto packed_access_lengths = SliceLengths{} / packed_per_access;
513 static_ford<
decltype(packed_access_lengths)>{}([&](
auto idx) {
514 dst_thread_scratch_(idx) = src_thread_scratch_tuple_[thread_scratch_id][idx];
520 template <
typename DstBuffer, index_t ThreadScratchId = 0>
536 "wrong! SrcBuffer or DstBuffer data type is wrong");
543 constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
545 constexpr auto dst_dim_access_order = DstDimAccessOrder{};
547 constexpr auto ordered_dst_access_lengths =
553 Index forward_step_idx;
555 static_for<0, nDim, 1>{}([&](
auto j) {
556 forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
566 Index backward_step_idx;
568 static_for<0, nDim, 1>{}([&](
auto j) {
569 backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
577 static_ford<
decltype(ordered_dst_access_lengths)>{}([&](
auto ordered_dst_access_idx) {
579 constexpr auto forward_sweep = [&]() {
582 forward_sweep_(
I0) =
true;
584 static_for<1, nDim, 1>{}([&](
auto i) {
585 index_t tmp = ordered_dst_access_idx[
I0];
587 static_for<1, i, 1>{}([&](
auto j) {
588 tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j];
591 forward_sweep_(i) = tmp % 2 == 0;
594 return forward_sweep_;
598 constexpr auto dst_data_idx = [&]() {
601 static_for<0, nDim, 1>{}([&](
auto i) {
602 ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_idx[i]
603 : ordered_dst_access_lengths[i] - 1 -
604 ordered_dst_access_idx[i];
608 dst_scalar_per_access;
614 const bool is_dst_valid =
618 using dst_vector_t =
typename dst_vector_type::type;
621 auto dst_vector_container = dst_vector_type{
622 dst_thread_scratch_.template GetAsType<dst_vector_t>(dst_data_idx_seq)};
624 static_for<0, DstScalarPerVector, 1>{}([&](
auto i) {
628 dst_element_op_(dst_v, dst_vector_container.template AsType<DstData>()[i]);
635 dst_vector_container.template AsType<dst_vector_t>()[
I0]);
637 constexpr auto move_on_dim = [&]()
constexpr {
640 static_for<0, nDim, 1>{}([&](
auto i) {
641 move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1;
643 static_for<i + 1, nDim, 1>{}([&](
auto j) {
645 ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1;
653 static_for<0, nDim, 1>{}([&](
auto i) {
654 if constexpr(move_on_dim[i])
656 if constexpr(forward_sweep[i])
659 dst_desc, dst_coord_, dst_forward_steps[dst_dim_access_order[i]]);
664 dst_desc, dst_coord_, dst_backward_steps[dst_dim_access_order[i]]);
671 if constexpr(DstResetCoordinateAfterRun)
673 const auto dst_reset_step =
687 constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
689 constexpr auto src_dim_access_order = SrcDimAccessOrder{};
691 constexpr auto ordered_src_access_lengths =
695 constexpr auto forward_sweep = [&]() {
698 forward_sweep_(
I0) =
true;
701 index_t tmp = ordered_src_access_lengths[
I0] - 1;
704 tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1;
707 forward_sweep_(i) = tmp % 2 == 0;
710 return forward_sweep_;
715 constexpr auto src_data_idx = [&]() {
719 ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0;
723 src_scalar_per_access;
727 constexpr auto reset_src_data_step = [&]() {
728 Index reset_src_data_step_;
732 return reset_src_data_step_;
735 return reset_src_data_step;
745 constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
747 constexpr auto dst_dim_access_order = DstDimAccessOrder{};
749 constexpr auto ordered_dst_access_lengths =
753 constexpr auto forward_sweep = [&]() {
756 forward_sweep_(
I0) =
true;
759 index_t tmp = ordered_dst_access_lengths[
I0] - 1;
762 tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1;
765 forward_sweep_(i) = tmp % 2 == 0;
768 return forward_sweep_;
773 constexpr auto dst_data_idx = [&]() {
777 ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0;
781 dst_scalar_per_access;
785 constexpr auto reset_dst_data_step = [&]() {
786 Index reset_dst_data_step_;
790 return reset_dst_data_step_;
793 return reset_dst_data_step;
798 const Index& src_slice_origin_step_idx)
801 const auto adjusted_step_idx =
802 SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
813 const Index& dst_slice_origin_step_idx)
816 const auto adjusted_step_idx =
817 DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
831 constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
837 constexpr auto desc0 =
843 if constexpr(i == SrcVectorDim)
846 make_tuple(src_access_lengths_and_vector_length[i],
858 if constexpr(i == SrcVectorDim)
869 constexpr auto up_dim_idss =
880 constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
891 constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
896 constexpr auto desc0 =
902 if constexpr(i == DstVectorDim)
905 make_tuple(dst_access_lengths_and_vector_length[i],
917 if constexpr(i == DstVectorDim)
928 constexpr auto up_dim_idss =
936 static constexpr auto src_oob_thread_scratch_desc_ =
940 using SrcThreadScratch =
944 decltype(src_thread_scratch_desc_),
947 using SrcOOBThreadScratch =
951 decltype(src_oob_thread_scratch_desc_),
957 decltype(dst_thread_scratch_desc_),
963 DstThreadScratch dst_thread_scratch_;
967 const SrcElementwiseOperation src_element_op_;
968 const DstElementwiseOperation dst_element_op_;
__host__ __device__ constexpr T min(T x)
Definition utility/math.hpp:116
__host__ __device__ constexpr auto make_pass_through_transform(const LowLength &low_length)
Definition multi_index_transform_helper.hpp:12
decltype(ck::declval< T & >().is_pack8_invocable) is_pack8_invocable_t
Definition is_detected.hpp:43
typename detail::StaticallyIndexedArrayImpl< T, N >::type StaticallyIndexedArray
Definition utility/statically_indexed_array.hpp:45
int32_t index_t
Definition ck.hpp:299
__host__ __device__ constexpr auto make_tensor_coordinate_step(const TensorDesc &, const VisibleIndex &idx_diff_visible, UpdateLowerIndexHack)
Definition tensor_description/tensor_descriptor.hpp:444
__host__ __device__ constexpr void move_tensor_coordinate(const TensorDesc &tensor_desc, TensorCoord &coord, const TensorCoordStep &coord_step)
Definition tensor_description/tensor_descriptor.hpp:508
__host__ __device__ constexpr auto container_push_back(const Array< TData, NSize > &a, const TData &x)
Definition utility/container_helper.hpp:18
InMemoryDataOperationEnum
Definition ck.hpp:277
@ Set
Definition ck.hpp:278
remove_cv_t< remove_reference_t< T > > remove_cvref_t
Definition type.hpp:297
decltype(ck::declval< T & >().is_pack4_invocable) is_pack4_invocable_t
Definition is_detected.hpp:40
__host__ __device__ constexpr bool coordinate_has_valid_offset_assuming_visible_index_is_valid(const TensorDesc &tensor_desc, const TensorCoord &coord)
Definition tensor_description/tensor_descriptor.hpp:560
typename detail::detector< nonesuch, void, Op, Args... >::value_t is_detected
Definition is_detected.hpp:34
integral_constant< index_t, N > Number
Definition number.hpp:12
@ Lds
Definition amd_address_space.hpp:18
@ Global
Definition amd_address_space.hpp:17
@ Vgpr
Definition amd_address_space.hpp:20
__host__ __device__ constexpr auto sequence_to_tuple_of_number(Sequence< Is... >)
Definition utility/container_helper.hpp:380
__host__ __device__ constexpr auto generate_sequence_v2(F &&f, Number< N >)
Definition sequence_helper.hpp:25
__host__ __device__ constexpr auto container_reorder_given_old2new(const Array< TData, NSize > &old_array, Sequence< IRs... > old2new)
Definition utility/container_helper.hpp:54
constexpr bool is_same_v
Definition type.hpp:283
__host__ __device__ constexpr auto make_merge_transform_v3_division_mod(const LowLengths &low_lengths)
Definition multi_index_transform_helper.hpp:84
__host__ __device__ constexpr auto generate_tuple(F &&f, Number< N >)
Definition tuple_helper.hpp:21
__host__ __device__ constexpr auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition tensor_descriptor_helper.hpp:101
decltype(ck::declval< T & >().is_pack2_invocable) is_pack2_invocable_t
Definition is_detected.hpp:37
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
__host__ __device__ constexpr auto generate_sequence(F, Number< N >)
Definition sequence_helper.hpp:18
__host__ __device__ constexpr auto transform_tensor_descriptor(const OldTensorDescriptor &old_tensor_desc, const NewTransforms &new_transforms, NewLowerDimensionOldVisibleIdss, NewUpperDimensionNewVisibleIdss)
Definition tensor_description/tensor_descriptor.hpp:319
constexpr index_t packed_size_v
Definition data_type.hpp:411
__host__ __device__ constexpr auto make_tensor_coordinate(const TensorDesc &tensor_desc, const VisibleIndex &idx_visible)
Definition tensor_description/tensor_descriptor.hpp:407
__host__ __device__ constexpr auto container_reorder_given_new2old(const Array< TData, NSize > &old_array, Sequence< IRs... >)
Definition utility/container_helper.hpp:43
__host__ __device__ constexpr auto generate_tie(F &&f, Number< N >)
Definition tuple_helper.hpp:34
Array< index_t, N > MultiIndex
Definition array_multi_index.hpp:12
typename vector_type_maker< T, N >::type vector_type_maker_t
Definition dtype_vector.hpp:54
const GenericPointer< typename T::ValueType > T2 value
Definition pointer.h:1697
Definition utility/sequence.hpp:43
static __device__ constexpr auto GetSrcOOBThreadScratchDescriptor()
Definition threadwise_tensor_slice_transfer_v3r1.hpp:875
__device__ void MoveDstSliceWindow(const DstDesc &dst_desc, const Index &dst_slice_origin_step_idx)
Definition threadwise_tensor_slice_transfer_v3r1.hpp:812
ck::ThreadwiseTensorSliceTransfer_v3r1< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, NumThreadScratch >::Index MultiIndex< nDim > Index
Definition threadwise_tensor_slice_transfer_v3r1.hpp:48
ck::ThreadwiseTensorSliceTransfer_v3r1< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, NumThreadScratch >::DstScalarPerVector static constexpr auto DstScalarPerVector
Definition threadwise_tensor_slice_transfer_v3r1.hpp:79
ck::ThreadwiseTensorSliceTransfer_v3r1< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, NumThreadScratch >::I7 static constexpr auto I7
Definition threadwise_tensor_slice_transfer_v3r1.hpp:63
__device__ constexpr auto GetSrcThreadScratchIdx(Number< ThreadScratchId > thread_scratch_id=Number< ThreadScratchId >{})
Definition threadwise_tensor_slice_transfer_v3r1.hpp:356
ck::ThreadwiseTensorSliceTransfer_v3r1< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, NumThreadScratch >::I3 static constexpr auto I3
Definition threadwise_tensor_slice_transfer_v3r1.hpp:59
__device__ void MoveSrcSliceWindow(const SrcDesc &src_desc, const Index &src_slice_origin_step_idx)
Definition threadwise_tensor_slice_transfer_v3r1.hpp:797
ck::ThreadwiseTensorSliceTransfer_v3r1< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, NumThreadScratch >::PackedSize static constexpr index_t PackedSize
Definition threadwise_tensor_slice_transfer_v3r1.hpp:71
ck::ThreadwiseTensorSliceTransfer_v3r1< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, NumThreadScratch >::DstCoord decltype(make_tensor_coordinate(DstDesc{}, Index{})) DstCoord
Definition threadwise_tensor_slice_transfer_v3r1.hpp:51
__device__ constexpr ThreadwiseTensorSliceTransfer_v3r1(const SrcDesc &src_desc, const Index &src_slice_origin, const SrcElementwiseOperation &src_element_op, const DstDesc &dst_desc, const Index &dst_slice_origin, const DstElementwiseOperation &dst_element_op)
Definition threadwise_tensor_slice_transfer_v3r1.hpp:81
ck::ThreadwiseTensorSliceTransfer_v3r1< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, NumThreadScratch >::I10 static constexpr auto I10
Definition threadwise_tensor_slice_transfer_v3r1.hpp:65
static __device__ constexpr auto GetSrcCoordinateResetStep()
Definition threadwise_tensor_slice_transfer_v3r1.hpp:680
static __device__ constexpr auto GetSrcThreadScratchDescriptor()
Definition threadwise_tensor_slice_transfer_v3r1.hpp:826
ck::ThreadwiseTensorSliceTransfer_v3r1< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, NumThreadScratch >::I16 static constexpr auto I16
Definition threadwise_tensor_slice_transfer_v3r1.hpp:69
__device__ void TransferDataFromSrcThreadScratchToDstThreadScratch(Number< ThreadScratchId > thread_scratch_id)
Definition threadwise_tensor_slice_transfer_v3r1.hpp:364
ck::ThreadwiseTensorSliceTransfer_v3r1< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, NumThreadScratch >::I4 static constexpr auto I4
Definition threadwise_tensor_slice_transfer_v3r1.hpp:60
ck::ThreadwiseTensorSliceTransfer_v3r1< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, NumThreadScratch >::I14 static constexpr auto I14
Definition threadwise_tensor_slice_transfer_v3r1.hpp:68
ck::ThreadwiseTensorSliceTransfer_v3r1< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, NumThreadScratch >::SrcScalarPerVector static constexpr auto SrcScalarPerVector
Definition threadwise_tensor_slice_transfer_v3r1.hpp:78
ck::ThreadwiseTensorSliceTransfer_v3r1< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, NumThreadScratch >::I2 static constexpr auto I2
Definition threadwise_tensor_slice_transfer_v3r1.hpp:58
ck::ThreadwiseTensorSliceTransfer_v3r1< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, NumThreadScratch >::SrcCoordStep decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})) SrcCoordStep
Definition threadwise_tensor_slice_transfer_v3r1.hpp:53
ck::ThreadwiseTensorSliceTransfer_v3r1< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, NumThreadScratch >::SrcCoord decltype(make_tensor_coordinate(SrcDesc{}, Index{})) SrcCoord
Definition threadwise_tensor_slice_transfer_v3r1.hpp:50
ck::ThreadwiseTensorSliceTransfer_v3r1< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, NumThreadScratch >::I1 static constexpr auto I1
Definition threadwise_tensor_slice_transfer_v3r1.hpp:57
__device__ void RunRead(const SrcDesc &src_desc, const SrcBuffer &src_buf, Number< ThreadScratchId > thread_scratch_id=Number< ThreadScratchId >{})
Definition threadwise_tensor_slice_transfer_v3r1.hpp:118
ck::ThreadwiseTensorSliceTransfer_v3r1< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, NumThreadScratch >::nDim static constexpr index_t nDim
Definition threadwise_tensor_slice_transfer_v3r1.hpp:47
ck::ThreadwiseTensorSliceTransfer_v3r1< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, NumThreadScratch >::I12 static constexpr auto I12
Definition threadwise_tensor_slice_transfer_v3r1.hpp:66
static __device__ constexpr auto GetDstCoordinateResetStep()
Definition threadwise_tensor_slice_transfer_v3r1.hpp:738
static __device__ constexpr auto GetDstThreadScratchDescriptor()
Definition threadwise_tensor_slice_transfer_v3r1.hpp:885
__device__ void RunWrite(const DstDesc &dst_desc, DstBuffer &dst_buf, Number< ThreadScratchId > thread_scratch_id=Number< ThreadScratchId >{})
Definition threadwise_tensor_slice_transfer_v3r1.hpp:521
__device__ void SetSrcSliceOrigin(const SrcDesc &src_desc, const Index &src_slice_origin_idx)
Definition threadwise_tensor_slice_transfer_v3r1.hpp:107
ck::ThreadwiseTensorSliceTransfer_v3r1< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, NumThreadScratch >::I13 static constexpr auto I13
Definition threadwise_tensor_slice_transfer_v3r1.hpp:67
ck::ThreadwiseTensorSliceTransfer_v3r1< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, NumThreadScratch >::DstCoordStep decltype(make_tensor_coordinate_step(DstDesc{}, Index{})) DstCoordStep
Definition threadwise_tensor_slice_transfer_v3r1.hpp:54
ck::ThreadwiseTensorSliceTransfer_v3r1< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, NumThreadScratch >::I0 static constexpr auto I0
Definition threadwise_tensor_slice_transfer_v3r1.hpp:56
ck::ThreadwiseTensorSliceTransfer_v3r1< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, NumThreadScratch >::I6 static constexpr auto I6
Definition threadwise_tensor_slice_transfer_v3r1.hpp:62
ck::ThreadwiseTensorSliceTransfer_v3r1< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, NumThreadScratch >::I5 static constexpr auto I5
Definition threadwise_tensor_slice_transfer_v3r1.hpp:61
__device__ void SetDstSliceOrigin(const DstDesc &dst_desc, const Index &dst_slice_origin_idx)
Definition threadwise_tensor_slice_transfer_v3r1.hpp:112
ck::ThreadwiseTensorSliceTransfer_v3r1< decltype(thread_slice_lengths), SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, NumThreadScratch >::I8 static constexpr auto I8
Definition threadwise_tensor_slice_transfer_v3r1.hpp:64
Definition threadwise_tensor_slice_transfer_util.hpp:43
Definition threadwise_tensor_slice_transfer_util.hpp:20
Definition threadwise_tensor_slice_transfer_util.hpp:29
Definition data_type.hpp:42
Definition data_type.hpp:187
Definition functional2.hpp:33
Definition functional3.hpp:97
Definition utility/transpose_vectors.hpp:16
Definition dtype_vector.hpp:30