namespace threadblock {

/// Tile access iterator specialized for congruous arrangements for TensorOps.
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment>
class RegularTileAccessIterator<
    Shape_, Element_,
    layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                          int(128 / sizeof(Element_))>,
    AdvanceRank, ThreadMap_, Alignment> {
 public:
  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for pitch-linear iterator may advance along the "
      "contiguous(rank=0) or strided(rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout =
      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                            int(128 / sizeof(Element_))>;
  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;
  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;
  using ThreadMap = ThreadMap_;

  /// Internal details made public to facilitate introspection
  struct Detail {
    /// This iterator is specialized for an access size that is 128 bits long.
    static int const kAccessSizeInBits = 128;

    static_assert(sizeof_bits<Element_>::value *
                          ThreadMap::kElementsPerAccess ==
                      kAccessSizeInBits,
                  "This iterator requires a policy whose access size is 128 bits.");

    /// Two pointers are kept when the strided dimension is iterated more than once.
    static int const kPointerCount =
        (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);
  };

  /// Element type per access
  using AccessType = Array<Element, Layout::kElementsPerAccess>;

 private:
  Index stride_;                                ///< stride in units of accesses
  AccessType *pointer_[Detail::kPointerCount];  ///< pointer(s) to first access of tile
  Index byte_offset_;                           ///< internal byte offset
  int iteration_contiguous_;                    ///< iteration in the contiguous dimension
  int iteration_strided_;                       ///< iteration in the strided dimension

 public:
  /// Construct a TileIterator with zero threadblock offset
  CUTLASS_HOST_DEVICE
  RegularTileAccessIterator(TensorRef ref, int thread_id)
      : stride_(ref.stride(0) / Layout::kElementsPerAccess), byte_offset_(0) {
    layout::PitchLinearCoord thread_offset_base =
        ThreadMap::initial_offset(thread_id);

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < Detail::kPointerCount; ++i) {
      // Offset of this thread within the threadblock tile for pointer i
      layout::PitchLinearCoord thread_offset_in_threadblock_tile =
          thread_offset_base +
          layout::PitchLinearCoord{
              0, ThreadMap::Detail::WarpThreadArrangement::kStrided * i};

      pointer_[i] = reinterpret_cast<AccessType *>(
          ref.data() + ref.offset(thread_offset_in_threadblock_tile));
    }

    set_iteration_index(0);
  }

  /// Overrides the internal iteration index
  CUTLASS_HOST_DEVICE
  void set_iteration_index(int index) {
    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
  }

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    byte_offset_ += pointer_offset * sizeof(Element);
  }

  /// Returns a pointer to the current access
  CUTLASS_HOST_DEVICE
  AccessType *get() const {
    AccessType *access_ptr = pointer_[iteration_strided_ & 1];
    int stride_idx = (iteration_strided_ & ~1);

    int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ +
                        iteration_contiguous_ * ThreadMap::Delta::kContiguous /
                            ThreadMap::kElementsPerAccess;

    char *access_byte_ptr =
        reinterpret_cast<char *>(access_ptr + access_offset);
    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
  }

  /// Advances to the next access within the tile
  CUTLASS_HOST_DEVICE
  RegularTileAccessIterator &operator++() {
    ++iteration_contiguous_;
    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
      return *this;

    // Contiguous dimension exhausted; step once in the strided dimension.
    iteration_contiguous_ = 0;
    ++iteration_strided_;
    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
      return *this;
    }

    // Both dimensions exhausted: wrap to the start of the next tile.
    iteration_strided_ = 0;
    return *this;
  }

  /// Adds a tile offset in units of whole tiles
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    add_pointer_offset(coord.contiguous() * Shape::kContiguous +
                       coord.strided() * Shape::kStrided * stride_ *
                           Layout::kElementsPerAccess);
  }
};
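
// ---------------------------------------------------------------------------
// Illustration (editor's sketch, not part of the original header): a minimal,
// standalone host program showing the iteration bookkeeping implemented by
// set_iteration_index() and operator++() above. The constants stand in for a
// hypothetical ThreadMap with 4 contiguous x 4 strided iterations. Note how
// (iteration_strided_ & 1) alternates between the two internal pointers on
// successive strided steps, while (iteration_strided_ & ~1) supplies the even
// base index that get() scales by ThreadMap::Delta::kStrided.
// ---------------------------------------------------------------------------
#include <cstdio>

int main() {
  int const kContiguous = 4;  // stand-in for ThreadMap::Iterations::kContiguous
  int const kStrided = 4;     // stand-in for ThreadMap::Iterations::kStrided

  for (int index = 0; index < kContiguous * kStrided; ++index) {
    int iteration_contiguous = index % kContiguous;  // as in set_iteration_index()
    int iteration_strided = index / kContiguous;
    int pointer_idx = iteration_strided & 1;   // which internal pointer get() uses
    int stride_idx = iteration_strided & ~1;   // even base for the offset math
    std::printf("index=%2d -> contiguous=%d strided=%d pointer=%d stride_idx=%d\n",
                index, iteration_contiguous, iteration_strided, pointer_idx,
                stride_idx);
  }
  return 0;
}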

/// Tile access iterator specialized for the column-major congruous TensorOp
/// layout; a thin adapter that maps the problem onto the pitch-linear
/// specialization above.
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment>
class RegularTileAccessIterator<
    Shape_, Element_,
    layout::ColumnMajorTensorOpMultiplicandCongruous<
        sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>,
    AdvanceRank, ThreadMap_, Alignment> {
 public:
  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for column-major iterator may advance along the "
      "columns(rank=0) or rows(rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::ColumnMajorTensorOpMultiplicandCongruous<
      sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>;
  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  using LongIndex = typename Layout::LongIndex;
  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;
  using ThreadMap = ThreadMap_;

  /// Underlying pitch-linear iterator: columns become the contiguous dimension.
  using UnderlyingIterator = RegularTileAccessIterator<
      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                            int(128 / sizeof(Element_))>,
      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;

  using AccessType = typename UnderlyingIterator::AccessType;

 private:
  UnderlyingIterator iterator_;  ///< wrapped pitch-linear iterator

 public:
  /// Construct a TileIterator with zero threadblock offset
  CUTLASS_HOST_DEVICE
  RegularTileAccessIterator(TensorRef ref, int thread_id)
      : iterator_({ref.data(), ref.stride()}, thread_id) {}

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  /// Returns a pointer to the current access
  CUTLASS_HOST_DEVICE
  AccessType *get() const {
    return reinterpret_cast<AccessType *>(iterator_.get());
  }

  /// Adds a tile offset, forwarding the coordinate in pitch-linear order
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.row(), coord.column()});
  }
};

/// Tile access iterator specialized for the row-major congruous TensorOp
/// layout; the mirror image of the column-major adapter above.
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment>
class RegularTileAccessIterator<
    Shape_, Element_,
    layout::RowMajorTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                                  int(128 / sizeof(Element_))>,
    AdvanceRank, ThreadMap_, Alignment> {
 public:
  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for row-major iterator may advance along the "
      "columns(rank=0) or rows(rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::RowMajorTensorOpMultiplicandCongruous<
      sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>;
  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  using LongIndex = typename Layout::LongIndex;
  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;
  using ThreadMap = ThreadMap_;

  /// Underlying pitch-linear iterator: rows become the strided dimension.
  using UnderlyingIterator = RegularTileAccessIterator<
      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                            int(128 / sizeof(Element_))>,
      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;

  using AccessType = typename UnderlyingIterator::AccessType;

 private:
  UnderlyingIterator iterator_;  ///< wrapped pitch-linear iterator

 public:
  /// Construct a TileIterator with zero threadblock offset
  CUTLASS_HOST_DEVICE
  RegularTileAccessIterator(TensorRef ref, int thread_id)
      : iterator_({ref.data(), ref.stride()}, thread_id) {}

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  /// Returns a pointer to the current access
  CUTLASS_HOST_DEVICE
  AccessType *get() const {
    return reinterpret_cast<AccessType *>(iterator_.get());
  }

  /// Adds a tile offset, swapping the coordinate into pitch-linear order
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.column(), coord.row()});
  }
};
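
// ---------------------------------------------------------------------------
// Illustration (editor's sketch, not part of the original header): the only
// real work the two adapters above perform is reordering coordinates into
// pitch-linear (contiguous, strided) form. The Coord2 struct is a stand-in
// introduced for this sketch, not the CUTLASS MatrixCoord type.
// ---------------------------------------------------------------------------
#include <cstdio>

struct Coord2 { int contiguous, strided; };

// Column-major: rows are contiguous, so (row, column) passes through unchanged,
// matching iterator_.add_tile_offset({coord.row(), coord.column()}).
Coord2 column_major_to_pitch_linear(int row, int column) { return {row, column}; }

// Row-major: columns are contiguous, so the pair is swapped, matching
// iterator_.add_tile_offset({coord.column(), coord.row()}).
Coord2 row_major_to_pitch_linear(int row, int column) { return {column, row}; }

int main() {
  Coord2 cm = column_major_to_pitch_linear(2, 5);
  Coord2 rm = row_major_to_pitch_linear(2, 5);
  std::printf("column-major (2,5) -> (contiguous=%d, strided=%d)\n",
              cm.contiguous, cm.strided);
  std::printf("row-major    (2,5) -> (contiguous=%d, strided=%d)\n",
              rm.contiguous, rm.strided);
  return 0;
}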

/// Tile access iterator specialized for crosswise arrangements for TensorOps.
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment, int Crosswise>
class RegularTileAccessIterator<Shape_, Element_,
                                layout::TensorOpMultiplicandCrosswise<
                                    sizeof_bits<Element_>::value, Crosswise>,
                                AdvanceRank, ThreadMap_, Alignment> {
 public:
  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for pitch-linear iterator may advance along the "
      "contiguous(rank=0) or strided(rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::TensorOpMultiplicandCrosswise<
      sizeof_bits<Element_>::value, Crosswise>;
  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;
  static int const kCrosswise = Crosswise;

  using Index = typename Layout::Index;
  using LongIndex = typename Layout::LongIndex;
  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;
  using ThreadMap = ThreadMap_;

  /// Internal details made public to facilitate introspection
  struct Detail {
    /// This iterator is specialized for an access size that is 128 bits long.
    static int const kAccessSizeInBits = 128;

    static_assert(sizeof_bits<Element_>::value *
                          ThreadMap::kElementsPerAccess ==
                      kAccessSizeInBits,
                  "This iterator requires a policy whose access size is 128 bits.");

    /// Two pointers are kept when the strided dimension is iterated more than once.
    static int const kPointerCount =
        (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);
  };

  /// Element type per access
  using AccessType = Array<Element, Layout::kElementsPerAccess>;

 private:
  int sections_;            ///< number of sections spanned by the tensor's stride
  int sections_per_stage_;  ///< sections held by one stage (one tile)
  Index stride_;            ///< stride in units of accesses
  AccessType *pointer_[Detail::kPointerCount];  ///< pointer(s) to first access of tile
  Index byte_offset_;         ///< internal byte offset
  int iteration_contiguous_;  ///< iteration in the contiguous dimension
  int iteration_strided_;     ///< iteration in the strided dimension

 public:
  /// Construct a TileIterator with zero threadblock offset
  CUTLASS_HOST_DEVICE
  RegularTileAccessIterator(TensorRef ref, int thread_id)
      : sections_(ref.stride(0) / kCrosswise),
        sections_per_stage_(Shape::kContiguous / kCrosswise),
        stride_(ref.stride(0) * Layout::kFactor / Layout::kElementsPerAccess),
        byte_offset_(0) {
    layout::PitchLinearCoord thread_offset_base =
        ThreadMap::initial_offset(thread_id);

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < Detail::kPointerCount; ++i) {
      // Offset of this thread within the threadblock tile for pointer i
      layout::PitchLinearCoord thread_offset_in_threadblock_tile =
          thread_offset_base +
          layout::PitchLinearCoord{
              0, ThreadMap::Detail::WarpThreadArrangement::kStrided * i};

      pointer_[i] = reinterpret_cast<AccessType *>(ref.data()) +
                    ref.offset(thread_offset_in_threadblock_tile) /
                        Layout::kElementsPerAccess;
    }

    set_iteration_index(0);
  }

  /// Overrides the internal iteration index
  CUTLASS_HOST_DEVICE
  void set_iteration_index(int index) {
    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
  }

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    byte_offset_ += pointer_offset * sizeof(Element);
  }

  /// Returns a pointer to the current access
  CUTLASS_HOST_DEVICE
  AccessType *get() const {
    AccessType *access_ptr = pointer_[iteration_strided_ & 1];
    int stride_idx = (iteration_strided_ & ~1);

    int access_offset =
        stride_idx * ThreadMap::Delta::kStrided * stride_ / Layout::kFactor +
        iteration_contiguous_ * ThreadMap::Delta::kContiguous /
            ThreadMap::kElementsPerAccess;

    char *access_byte_ptr =
        reinterpret_cast<char *>(access_ptr + access_offset);
    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
  }

  /// Advances to the next access within the tile
  CUTLASS_HOST_DEVICE
  RegularTileAccessIterator &operator++() {
    ++iteration_contiguous_;
    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
      return *this;

    // Contiguous dimension exhausted; step once in the strided dimension.
    iteration_contiguous_ = 0;
    ++iteration_strided_;
    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
      return *this;
    }

    // Both dimensions exhausted: wrap to the start of the next tile.
    iteration_strided_ = 0;
    return *this;
  }

  /// Adds a tile offset in units of whole tiles
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    add_pointer_offset(coord.contiguous() * sections_per_stage_ * stride_ *
                           ThreadMap::kElementsPerAccess / sections_ +
                       coord.strided() * Shape::kStrided * stride_ *
                           Layout::kElementsPerAccess);
  }
};
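
// ---------------------------------------------------------------------------
// Illustration (editor's sketch, not part of the original header): the
// crosswise constructor arithmetic above, evaluated for one assumed
// configuration. All numbers below are assumptions chosen for the example --
// a 64-element contiguous tile extent of 16-bit elements with Crosswise = 32,
// leading dimension 64, Layout::kFactor taken to be 2, and 8 elements per
// 128-bit access -- not values asserted by CUTLASS itself.
// ---------------------------------------------------------------------------
#include <cstdio>

int main() {
  int const kCrosswise = 32;         // Crosswise template parameter (assumed)
  int const kShapeContiguous = 64;   // stand-in for Shape::kContiguous
  int const stride0 = 64;            // stand-in for ref.stride(0)
  int const kFactor = 2;             // stand-in for Layout::kFactor (assumed)
  int const kElementsPerAccess = 8;  // 128 bits / 16-bit element

  int sections = stride0 / kCrosswise;                     // -> 2
  int sections_per_stage = kShapeContiguous / kCrosswise;  // -> 2
  int stride = stride0 * kFactor / kElementsPerAccess;     // -> 16 accesses

  std::printf("sections=%d sections_per_stage=%d stride=%d\n",
              sections, sections_per_stage, stride);
  return 0;
}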

/// Tile access iterator specialized for the column-major crosswise TensorOp
/// layout; again a thin adapter over the pitch-linear specialization above.
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment, int Crosswise>
class RegularTileAccessIterator<
    Shape_, Element_,
    layout::ColumnMajorTensorOpMultiplicandCrosswise<
        sizeof_bits<Element_>::value, Crosswise>,
    AdvanceRank, ThreadMap_, Alignment> {
 public:
  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for column-major iterator may advance along the "
      "columns(rank=0) or rows(rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::ColumnMajorTensorOpMultiplicandCrosswise<
      sizeof_bits<Element_>::value, Crosswise>;
  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  using LongIndex = typename Layout::LongIndex;
  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;
  using ThreadMap = ThreadMap_;

  /// Underlying pitch-linear iterator
  using UnderlyingIterator = RegularTileAccessIterator<
      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                            Crosswise>,
      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;

  using AccessType = typename UnderlyingIterator::AccessType;

 private:
  UnderlyingIterator iterator_;  ///< wrapped pitch-linear iterator

 public:
  /// Construct a TileIterator with zero threadblock offset
  CUTLASS_HOST_DEVICE
  RegularTileAccessIterator(TensorRef ref, int thread_id)
      : iterator_({ref.data(), ref.stride()}, thread_id) {}

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  /// Returns a pointer to the current access
  CUTLASS_HOST_DEVICE
  AccessType *get() const {
    return reinterpret_cast<AccessType *>(iterator_.get());
  }

  /// Adds a tile offset, forwarding the coordinate in pitch-linear order
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.row(), coord.column()});
  }
};

/// Tile access iterator specialized for the row-major crosswise TensorOp
/// layout; the mirror image of the column-major adapter above.
template <typename Shape_, typename Element_, int AdvanceRank,
          typename ThreadMap_, int Alignment, int Crosswise>
class RegularTileAccessIterator<Shape_, Element_,
                                layout::RowMajorTensorOpMultiplicandCrosswise<
                                    sizeof_bits<Element_>::value, Crosswise>,
                                AdvanceRank, ThreadMap_, Alignment> {
 public:
  static_assert(
      AdvanceRank == 0 || AdvanceRank == 1,
      "Specialization for row-major iterator may advance along the "
      "columns(rank=0) or rows(rank=1) dimension.");

  using Shape = Shape_;
  using Element = Element_;
  using Layout = layout::RowMajorTensorOpMultiplicandCrosswise<
      sizeof_bits<Element_>::value, Crosswise>;
  static int const kAdvanceRank = AdvanceRank;
  static int const kAlignment = Alignment;

  using LongIndex = typename Layout::LongIndex;
  using TensorRef = TensorRef<Element, Layout>;
  using TensorCoord = typename Layout::TensorCoord;
  using ThreadMap = ThreadMap_;

  /// Underlying pitch-linear iterator
  using UnderlyingIterator = RegularTileAccessIterator<
      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
                                            Crosswise>,
      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;

  using AccessType = typename UnderlyingIterator::AccessType;

 private:
  UnderlyingIterator iterator_;  ///< wrapped pitch-linear iterator

 public:
  /// Construct a TileIterator with zero threadblock offset
  CUTLASS_HOST_DEVICE
  RegularTileAccessIterator(TensorRef ref, int thread_id)
      : iterator_({ref.data(), ref.stride()}, thread_id) {}

  /// Adds a pointer offset in units of Element
  CUTLASS_HOST_DEVICE
  void add_pointer_offset(LongIndex pointer_offset) {
    iterator_.add_pointer_offset(pointer_offset);
  }

  /// Returns a pointer to the current access
  CUTLASS_HOST_DEVICE
  AccessType *get() const {
    return reinterpret_cast<AccessType *>(iterator_.get());
  }

  /// Adds a tile offset, swapping the coordinate into pitch-linear order
  CUTLASS_DEVICE
  void add_tile_offset(TensorCoord const &coord) {
    iterator_.add_tile_offset({coord.column(), coord.row()});
  }
};
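
// ---------------------------------------------------------------------------
// Illustration (editor's sketch, not part of the original header): a typical
// driver loop for any of the specializations above, paraphrased from the way
// CUTLASS pipelines generally consume these iterators. store_tile and the
// Fragment parameter are hypothetical names introduced for this sketch only.
// ---------------------------------------------------------------------------
template <typename Iterator, typename Fragment>
CUTLASS_DEVICE void store_tile(Iterator &iter, Fragment const &frag) {
  using AccessType = typename Iterator::AccessType;
  AccessType const *src = reinterpret_cast<AccessType const *>(&frag);

  CUTLASS_PRAGMA_UNROLL
  for (int s = 0; s < Iterator::ThreadMap::Iterations::kStrided; ++s) {
    CUTLASS_PRAGMA_UNROLL
    for (int c = 0; c < Iterator::ThreadMap::Iterations::kContiguous; ++c) {
      int access_idx = c + s * Iterator::ThreadMap::Iterations::kContiguous;
      *iter.get() = src[access_idx];  // one 128-bit access per step
      ++iter;  // contiguous-major traversal, as in operator++ above
    }
  }
}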