/// Tile iterator for loading multiplicands of a warp-level tensor op, specialized for
/// congruous shared-memory arrangements loaded with LDSM (ldmatrix).
template <typename Shape_, Operand Operand_, typename Element_,
          typename InstructionShape_, int OpDelta_, int PartitionsK_>
class MmaTensorOpMultiplicandTileIterator<
    Shape_, Operand_, Element_,
    cutlass::layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                                   /* ... */>,
    InstructionShape_, OpDelta_, 32, PartitionsK_> {
 public:

  static Operand const kOperand = Operand_;

  static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
    "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma.");

  /// Delta between *MMA operations (in units of *MMA operations)
  static int const kOpDelta = OpDelta_;

  /// Number of participating threads
  static int const kThreads = 32;

  /// Number of partitions along the K dimension
  static int const kPartitionsK = PartitionsK_;

  /// Internal policy describing the LDSM arrangement
  struct Policy {
    static_assert(
        !(Shape::kContiguous % InstructionShape::kContiguous),
        "Shape of warp-level Mma must be divisible by operator shape.");

    // Extent of one fundamental LDSM tile along each dimension, in elements
    static int const kLdsmOpOuter = Layout::kElementsPerAccess;
    static int const kLdsmOpInner = 8;

    static_assert(/* ... */,
      "Shape of warp-level mma must be divisible by LDSM's fundamental tile size.");

    static_assert(/* ... */,
      "Shape of warp-level mma must be divisible by LDSM's fundamental tile size.");

    /// Shape of one individual LDSM instruction, in units of vector accesses
    static int const LdsmShapeStrided =
        InstructionShape::kStrided / kLdsmOpInner;
    static int const LdsmShapeContiguous = 4 / LdsmShapeStrided;
    using LdsmShape =
        layout::PitchLinearShape<LdsmShapeContiguous, LdsmShapeStrided>;

    /// Number and arrangement of LDSM instructions per load
    using LdsmIterations = layout::PitchLinearShape<
        Shape::kContiguous / Layout::kElementsPerAccess / LdsmShapeContiguous,
        /* ... */>;

    /// Number of k-groups covering one tile along the strided dimension
    static int const kGroupsPerTile =
        Shape::kStrided / InstructionShape::kStrided;
  };

  static_assert(/* ... */,
    "Alternative arrangements not supported at present.");

  /// Number of internal pointers tracking the swizzled partitions
  static int const kPointerCount =
      Layout::TileShape::kContiguous / Policy::LdsmShape::kContiguous;

  /// Vectorized access type
  using AccessType = Array<Element, Layout::kElementsPerAccess>;
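For orientation, the Policy arithmetic above can be evaluated for one assumed configuration: a half-precision element with 128-bit accesses (so Layout::kElementsPerAccess == 8), a 16x8 pitch-linear instruction shape, and a 64x16 warp tile. All of these values are illustrative assumptions, not constants taken from this header.

  // Hypothetical configuration: half_t, Layout::kElementsPerAccess == 8,
  // InstructionShape = PitchLinearShape<16, 8>, Shape = PitchLinearShape<64, 16>.
  static_assert(8 / 8 == 1,      "LdsmShapeStrided    = InstructionShape::kStrided / kLdsmOpInner");
  static_assert(4 / 1 == 4,      "LdsmShapeContiguous = 4 / LdsmShapeStrided (one ldmatrix.x4 per group)");
  static_assert(64 / 8 / 4 == 2, "LdsmIterations::kContiguous = Shape::kContiguous / kElementsPerAccess / LdsmShapeContiguous");
  static_assert(16 / 8 == 2,     "kGroupsPerTile      = Shape::kStrided / InstructionShape::kStrided");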
  /// Shared memory base pointers, one per swizzled partition
  AccessType const *pointer_[kPointerCount];

  /// Constructor from TensorRef
  CUTLASS_DEVICE
  MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id):
    stride_(ref.stride(0) / Layout::kElementsPerAccess), byte_offset_(0),
    /* ... */ {

    // Decompose the lane index into quad-pair / quad coordinates
    int quad_pair = (lane_id >> 3);
    int lane_in_quad = (lane_id & 3);
    int lane_in_quad_pair = (lane_id & 7);

    for (int i = 0; i < kPointerCount; ++i) {
      int partition_contiguous_idx = -1;
      int access_contiguous_idx = -1;
      int access_strided_idx = -1;

      if (Policy::LdsmShape::kContiguous == 4) {
        partition_contiguous_idx = ((lane_in_quad_pair >> 2) ^ i);
        access_contiguous_idx = (quad_pair ^ lane_in_quad);
        access_strided_idx = lane_in_quad_pair;
      }

      int access_contiguous =
          partition_contiguous_idx * Layout::PartitionShape::kContiguous +
          access_contiguous_idx;

      int access_strided = access_strided_idx;

      pointer_[i] = reinterpret_cast<AccessType const *>(ref.data()) +
                    access_contiguous + access_strided * stride_;
    }
  }
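A hypothetical trace of the lane decomposition above, assuming Policy::LdsmShape::kContiguous == 4 and kPointerCount == 2 (the lane index is arbitrary):

  // lane_id = 13:
  //   quad_pair         = 13 >> 3 = 1
  //   lane_in_quad      = 13 & 3  = 1
  //   lane_in_quad_pair = 13 & 7  = 5
  // pointer_[0]: partition_contiguous_idx = (5 >> 2) ^ 0 = 1,
  //              access_contiguous_idx    = 1 ^ 1        = 0,
  //              access_strided_idx       = 5
  // pointer_[1]: only the partition index flips: (5 >> 2) ^ 1 = 0.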
  /// Adds a pointer offset to internal pointer(s) to advance through memory
  CUTLASS_DEVICE
  MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) {
    byte_offset_ += offset * sizeof(Element);
    return *this;
  }

  /// Advances the iterator along logical dimensions of the matrix in units of whole tiles
  CUTLASS_HOST_DEVICE
  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {

    int contiguous_offset = tile_offset.contiguous();
    if (Shape::kContiguous ==
        Layout::PartitionShape::kContiguous * Layout::kElementsPerAccess) {
      if (tile_offset.contiguous() % 2) {
        // An odd contiguous tile offset swaps the two halves of the pointer array.
        for (int i = 0; i < kPointerCount / 2; ++i) {
          AccessType const *tmp_pointer = pointer_[i];
          pointer_[i] = pointer_[i + kPointerCount / 2];
          pointer_[i + kPointerCount / 2] = tmp_pointer;
        }
      }
      contiguous_offset = (tile_offset.contiguous() >> 1) << 1;
    }

    int offset = (tile_offset.strided() * InstructionShape::kStrided) *
                     stride_ * Layout::kElementsPerAccess +
                 contiguous_offset * Shape::kContiguous;

    add_pointer_offset(offset);
    return *this;
  }

  /// Advances the iterator along the advance (strided) dimension
  CUTLASS_DEVICE
  MmaTensorOpMultiplicandTileIterator &operator++() {

    add_tile_offset({0, 1});

    if (kPartitionsK > 1) {
      /* ... */
      if (k_group_idx_ == Policy::kGroupsPerTile) {
        /* ... */
        add_tile_offset(
            {0, ((kPartitionsK - 1) * Policy::kGroupsPerTile)});
      }
    }

    return *this;
  }

  /// Advances the iterator along the opposite of the advance dimension
  CUTLASS_HOST_DEVICE
  MmaTensorOpMultiplicandTileIterator &operator--() {
    byte_offset_ -= stride_ * InstructionShape::kStrided * sizeof(Element) *
                    Layout::kElementsPerAccess;
    return *this;
  }

  CUTLASS_DEVICE
  MmaTensorOpMultiplicandTileIterator &operator+=(TensorCoord const &tile_offset) {
    add_tile_offset(tile_offset);
    return *this;
  }

  CUTLASS_DEVICE
  MmaTensorOpMultiplicandTileIterator &operator-=(TensorCoord const &tile_offset) {
    add_tile_offset(-tile_offset);
    return *this;
  }

  /// Loads a fragment from memory at the location pointed to by the iterator
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) const {
    load_with_byte_offset(frag, 0);
  }

  /// Loads a fragment from memory with an additional byte offset
  CUTLASS_DEVICE
  void load_with_byte_offset(Fragment &frag, Index byte_offset) const {

    Array<unsigned, Policy::LdsmShape::kCount> *fetch_ptr =
        reinterpret_cast<Array<unsigned, Policy::LdsmShape::kCount> *>(&frag);

    for (int s = 0; s < Policy::LdsmIterations::kStrided; ++s) {
      for (int c = 0; c < Policy::LdsmIterations::kContiguous; ++c) {

        int access_idx = c + s * Policy::LdsmIterations::kContiguous;

        AccessType const *source_ptr =
            pointer_[c % kPointerCount] +
            Layout::TileShape::kContiguous * (c / kPointerCount) +
            Policy::LdsmShape::kStrided * s * stride_;

        char const *source_byte_ptr =
            reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;

        cutlass::arch::ldsm<layout::ColumnMajor, Policy::LdsmShape::kCount>(
            fetch_ptr[access_idx], source_byte_ptr);
      }
    }
  }

  /// Loads a fragment from memory with an additional pointer offset (in elements)
  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
  }

  /// Loads a fragment with a logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load(Fragment &frag, TensorCoord const &tile_offset) const {
    load_with_byte_offset(frag, tile_offset, 0);
  }

  CUTLASS_DEVICE
  void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const {
    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
  }

  /// Loads a fragment with a logical tile offset and an additional byte offset
  CUTLASS_DEVICE
  void load_with_byte_offset(
      Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const {

    Index pointer_offset =
        tile_offset.contiguous() * Shape::kContiguous / Layout::kElementsPerAccess +
        tile_offset.strided() * InstructionShape::kStrided * stride_;

    byte_offset += sizeof(AccessType) * pointer_offset;

    load_with_byte_offset(frag, byte_offset);
  }

  /* ... */
};
/// Wraps the congruous pitch-linear iterator for column-major A operands.
template <typename Shape_, Operand Operand_, typename Element_,
          typename InstructionShape_, int OpDelta_, int PartitionsK_>
class MmaTensorOpMultiplicandTileIterator<
    Shape_, Operand_, Element_,
    cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous<
        sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>,
    InstructionShape_, OpDelta_, 32, PartitionsK_> {
 public:

  static Operand const kOperand = Operand_;

  static_assert(kOperand == Operand::kA,
                "MmaTensorOpMultiplicandIterator for ColumnMajor Congruous may "
                "only be instantiated for A operand to warp-level Mma.");

  static int const kOpDelta = OpDelta_;
  static int const kThreads = 32;

  /// Underlying pitch-linear tile iterator
  using Base = MmaTensorOpMultiplicandTileIterator<
      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, kOperand, Element,
      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                            int(128 / sizeof(Element_))>,
      layout::PitchLinearShape<InstructionShape::kRow,
                               InstructionShape::kColumn>,
      kOpDelta, kThreads, PartitionsK_>;

  /// Fragment object holding a thread's part of a tile
  using Fragment = Array<Element, Shape::kCount / kThreads>;

  /// Underlying pitch-linear iterator instance
  Base iterator_;

  /// Constructor from TensorRef
  CUTLASS_HOST_DEVICE
  MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
      : iterator_({ref.data(), ref.stride()}, lane_id) {}

  // The remaining members forward to iterator_, translating MatrixCoord tile offsets
  // into the underlying pitch-linear order (row, column):
  //
  //   add_pointer_offset(offset)     -> iterator_.add_pointer_offset(offset);
  //   add_tile_offset(t)             -> iterator_.add_tile_offset({t.row(), t.column()});
  //   operator+= / operator-=        -> add_tile_offset(+/- PitchLinearCoord(t.row(), t.column()));
  //   load(frag)                     -> iterator_.load(frag);
  //   load_with_pointer_offset(f, p) -> iterator_.load_with_pointer_offset(f, p);
  //   load_with_byte_offset(f, b)    -> iterator_.load_with_byte_offset(f, b);
  //   load_with_byte_offset(f, t, b) -> iterator_.load_with_byte_offset(
  //                                         f, {t.contiguous(), t.strided()}, b);
  //   set_kgroup_index(k_group)      -> iterator_.set_kgroup_index(k_group);
};
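To make the template parameters concrete, the following is a hypothetical instantiation of the ColumnMajor Congruous specialization. The tile shapes, element type, and layout parameters are illustrative assumptions rather than values mandated by this header:

  #include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
  #include "cutlass/layout/tensor_op_multiplicand_sm75.h"
  #include "cutlass/matrix_shape.h"
  #include "cutlass/numeric_types.h"

  // Hypothetical A-operand iterator: 64x16 (M x K) warp tile of half_t, 16x8 instruction tile.
  using WarpTileA        = cutlass::MatrixShape<64, 16>;
  using InstructionTileA = cutlass::MatrixShape<16, 8>;
  using SmemLayoutA      = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous<
      16 /* bits per element */, 64 /* == 128 / sizeof(half_t) */>;

  using IteratorA = cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator<
      WarpTileA, cutlass::gemm::Operand::kA, cutlass::half_t, SmemLayoutA,
      InstructionTileA, 1 /* OpDelta */, 32 /* threads */, 1 /* PartitionsK */>;

With these assumed shapes, each thread's IteratorA::Fragment holds 64 * 16 / 32 = 32 half_t values per load.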
/// Wraps the congruous pitch-linear iterator for row-major B operands.
template <typename Shape_, Operand Operand_, typename Element_,
          typename InstructionShape_, int OpDelta_, int PartitionsK_>
class MmaTensorOpMultiplicandTileIterator<
    Shape_, Operand_, Element_,
    cutlass::layout::RowMajorTensorOpMultiplicandCongruous<
        sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>,
    InstructionShape_, OpDelta_, 32, PartitionsK_> {
 public:

  static Operand const kOperand = Operand_;

  static_assert(kOperand == Operand::kB,
                "MmaTensorOpMultiplicandIterator for RowMajor Congruous may "
                "only be instantiated for B operand to warp-level Mma.");

  using Element = Element_;

  static int const kOpDelta = OpDelta_;
  static int const kThreads = 32;

  /// Underlying pitch-linear tile iterator
  using Base = MmaTensorOpMultiplicandTileIterator<
      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, kOperand, Element,
      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
                                            int(128 / sizeof(Element_))>,
      layout::PitchLinearShape<InstructionShape::kColumn,
                               InstructionShape::kRow>,
      kOpDelta, kThreads, PartitionsK_>;

  /// Fragment object holding a thread's part of a tile
  using Fragment = Array<Element, Shape::kCount / kThreads>;

  /// Underlying pitch-linear iterator instance
  Base iterator_;

  /// Constructor from TensorRef
  CUTLASS_HOST_DEVICE
  MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
      : iterator_({ref.data(), ref.stride()}, lane_id) {}

  // Forwarding members mirror the ColumnMajor case, but with the row-major
  // coordinate order (column, row):
  //
  //   add_pointer_offset(offset)     -> iterator_.add_pointer_offset(offset);
  //   add_tile_offset(t)             -> iterator_.add_tile_offset({t.column(), t.row()});
  //   operator+= / operator-=        -> add_tile_offset(+/- PitchLinearCoord(t.column(), t.row()));
  //   load(frag)                     -> iterator_.load(frag);
  //   load_with_pointer_offset(f, p) -> iterator_.load_with_pointer_offset(f, p);
  //   load_with_byte_offset(f, b)    -> iterator_.load_with_byte_offset(f, b);
  //   load_with_byte_offset(f, t, b) -> iterator_.load_with_byte_offset(
  //                                         f, {t.strided(), t.contiguous()}, b);
  //   set_kgroup_index(k_group)      -> iterator_.set_kgroup_index(k_group);
};
/// Pitch-linear iterator specialized for crosswise (k-major) shared-memory arrangements,
/// loading with LDSM.
template <typename Shape_, Operand Operand_, typename Element_, int Crosswise,
          typename InstructionShape_, int OpDelta_, int PartitionsK_>
class MmaTensorOpMultiplicandTileIterator<
    Shape_, Operand_, Element_,
    cutlass::layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value, Crosswise>,
    InstructionShape_, OpDelta_, 32, PartitionsK_> {
 public:

  static Operand const kOperand = Operand_;

  static_assert(kOperand == Operand::kA || kOperand == Operand::kB,
                "MmaTensorOpMultiplicandIterator may only be instantiated for "
                "A or B operands to warp-level Mma.");

  using Element = Element_;

  /// Crosswise extent of the shared memory arrangement, in elements
  static int const kCrosswise = Crosswise;

  static int const kOpDelta = OpDelta_;
  static int const kThreads = 32;
  static int const kPartitionsK = PartitionsK_;

  /// Internal policy describing the LDSM arrangement
  struct Policy {
    static_assert(
        !(Shape::kContiguous % InstructionShape::kContiguous),
        "Shape of warp-level Mma must be divisible by operator shape.");

    static int const kLdsmOpOuter = Layout::kElementsPerAccess;
    static int const kLdsmOpInner = 8;

    static_assert(/* ... */,
                  "Shape of warp-level mma must be divisible by LDSM's "
                  "fundamental tile size.");

    static_assert(/* ... */,
                  "Shape of warp-level mma must be divisible by LDSM's "
                  "fundamental tile size.");

    /// Shape of one individual LDSM instruction, in units of vector accesses
    static int const LdsmShapeContiguous =
        InstructionShape::kContiguous / kLdsmOpOuter;
    static int const LdsmShapeStrided =
        ((4 / LdsmShapeContiguous * kLdsmOpInner) > Shape::kStrided)
            ? (Shape::kStrided / kLdsmOpInner)
            : (4 / LdsmShapeContiguous);
    using LdsmShape =
        layout::PitchLinearShape<LdsmShapeContiguous, LdsmShapeStrided>;

    /// Number and arrangement of LDSM instructions per load
    using LdsmIterations = layout::PitchLinearShape<
        /* ... */ LdsmShape::kStrided>;

    /// Number of k-groups covering one tile
    static int const kGroupsPerTile = Layout::TileShape::kContiguous /
                                      Layout::kFactor / LdsmShape::kContiguous;
  };

  static_assert(/* ... */,
                "Alternative arrangements not supported at present.");

  /// Vectorized access type
  using AccessType = Array<Element, Layout::kElementsPerAccess>;

  /// Fragment object holding a thread's part of a tile
  using Fragment = Array<Element, Shape::kCount / kThreads>;

  /// Shared memory base pointer
  AccessType const *pointer_;
  /// Constructor from TensorRef
  CUTLASS_DEVICE
  MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
      : pointer_(reinterpret_cast<AccessType const *>(ref.data())),
        sections_(ref.stride(0) / kCrosswise),
        stride_(ref.stride(0) * Layout::kFactor / Layout::kElementsPerAccess),
        /* ... */ {

#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 750))
    // sm_75: wrap the lane index so every lane computes a valid LDSM address.
    lane_id = lane_id % (Policy::LdsmShape::kCount * Policy::kLdsmOpInner);
#endif

    // Decompose the lane index
    int lane_in_pair = (lane_id & 1);
    int lane_in_quad = (lane_id & 3);
    int lane_in_quad_pair = (lane_id & 7);
    int lane_in_quad_quad = (lane_id & 15);

    int partition_contiguous_idx = -1;
    int access_contiguous_idx = -1;
    int access_strided_idx = -1;

    if (Layout::kFactor == 4) {
      int factor_in_partition =
          (Layout::PartitionShape::kContiguous * Layout::kFactor /
           Layout::TileShape::kContiguous);

      if (Policy::LdsmShape::kStrided == Policy::LdsmShape::kCount) {
        partition_contiguous_idx = lane_in_quad / factor_in_partition;
        access_contiguous_idx = ((lane_in_pair * factor_in_partition) ^
                                 (lane_in_quad_quad / Layout::kFactor));
        access_strided_idx = lane_id / Layout::kFactor;
      }
      /* ... */
    } else if (Layout::kFactor == 2) {
      if (Policy::LdsmShape::kStrided == Policy::LdsmShape::kCount) {
        partition_contiguous_idx = (lane_id % Layout::kFactor);
        access_contiguous_idx = (lane_in_quad_pair / Layout::kFactor);
        access_strided_idx = lane_id / Layout::kFactor;
      }
      /* ... */
    } else if (Layout::kFactor == 1) {
      if (Policy::LdsmShape::kStrided == Policy::LdsmShape::kCount) {
        partition_contiguous_idx = (lane_in_quad_pair >> 2);
        access_contiguous_idx = lane_in_quad;
        access_strided_idx = lane_id;
      }
      /* ... */
    }

    int access_contiguous =
        partition_contiguous_idx * Layout::PartitionShape::kContiguous +
        access_contiguous_idx;

    int access_strided = access_strided_idx;

    byte_offset_ = (access_contiguous + access_strided * stride_) *
                   sizeof_bits<Element>::value * Layout::kElementsPerAccess / 8;
  }
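A hypothetical trace of the crosswise lane mapping, following the Layout::kFactor == 1 branch above (lane index chosen arbitrarily):

  // lane_id = 13:
  //   lane_in_quad      = 13 & 3 = 1
  //   lane_in_quad_pair = 13 & 7 = 5
  //   partition_contiguous_idx = 5 >> 2 = 1
  //   access_contiguous_idx    = 1
  //   access_strided_idx       = 13
  // byte_offset_ then encodes (1 * Layout::PartitionShape::kContiguous + 1) + 13 * stride_,
  // scaled to bytes per vector access.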
  /// Advances the iterator along logical dimensions of the matrix in units of whole tiles
  CUTLASS_DEVICE
  MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) {

    int whole_tiles = tile_offset.contiguous() / Policy::kGroupsPerTile;
    int k_groups_delta = tile_offset.contiguous() % Policy::kGroupsPerTile;

    byte_offset_ ^= k_groups_delta * sizeof_bits<Element>::value *
                    Layout::kElementsPerAccess / 8;
    pointer_ +=
        tile_offset.strided() * stride_ * Shape::kStrided / Layout::kFactor +
        whole_tiles * stride_ / sections_;
    return *this;
  }

  /// Advances the iterator one k-group at a time along the advance dimension
  CUTLASS_DEVICE
  MmaTensorOpMultiplicandTileIterator &operator++() {

    // XOR patterns walk byte_offset_ through the swizzled k-groups of the tile.
    if ((Policy::kGroupsPerTile / kPartitionsK) > 1) {
      int mask = ((Policy::kGroupsPerTile / kPartitionsK) == 8)
                     ? /* ... */
                     : (((Policy::kGroupsPerTile / kPartitionsK) == 4) ? 1 : 0);

      if (((k_group_idx_ & mask) % 2) == 0)
        byte_offset_ ^= 1 * Policy::LdsmShape::kContiguous *
                        sizeof_bits<Element>::value *
                        Layout::kElementsPerAccess / 8;
      else if ((k_group_idx_ & mask) == 1)
        byte_offset_ ^= 3 * Policy::LdsmShape::kContiguous *
                        sizeof_bits<Element>::value *
                        Layout::kElementsPerAccess / 8;
      else if ((k_group_idx_ & mask) == 3)
        byte_offset_ ^= 7 * Policy::LdsmShape::kContiguous *
                        sizeof_bits<Element>::value *
                        Layout::kElementsPerAccess / 8;
    }

    k_group_idx_++;

    if (k_group_idx_ == (Policy::kGroupsPerTile / kPartitionsK)) {
      k_group_idx_ = 0;
      add_tile_offset({Policy::kGroupsPerTile, 0});
    }

    return *this;
  }

  // operator+= / operator-= forward to add_tile_offset(tile_offset) and
  // add_tile_offset(-tile_offset), as in the congruous specialization.

  /// Loads a fragment from memory with an additional byte offset
  CUTLASS_DEVICE
  void load_with_byte_offset(Fragment &frag, Index byte_offset) const {

    Array<unsigned, Policy::LdsmShape::kCount> *fetch_ptr =
        reinterpret_cast<Array<unsigned, Policy::LdsmShape::kCount> *>(&frag);

    for (int s = 0; s < Policy::LdsmIterations::kStrided; ++s) {
      for (int c = 0; c < Policy::LdsmIterations::kContiguous; ++c) {

        int access_idx = c + s * Policy::LdsmIterations::kContiguous;

        AccessType const *source_ptr =
            pointer_ + Policy::LdsmShape::kContiguous * c +
            Policy::kLdsmOpInner / Layout::kFactor *
                Policy::LdsmShape::kStrided * s * stride_;

        char const *source_byte_ptr =
            reinterpret_cast<char const *>(source_ptr) + byte_offset +
            byte_offset_;

        cutlass::arch::ldsm<layout::RowMajor, Policy::LdsmShape::kCount>(
            fetch_ptr[access_idx], source_byte_ptr);
      }
    }
  }

  /// Loads a fragment from memory with an additional pointer offset (in elements)
  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
  }

  /// Loads a fragment with a logical offset in units of whole tiles
  CUTLASS_DEVICE
  void load(Fragment &frag, TensorCoord const &tile_offset) const {
    load_with_byte_offset(frag, tile_offset, 0);
  }

  CUTLASS_DEVICE
  void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const {
    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
  }

  /// Loads a fragment with a logical tile offset and an additional byte offset
  CUTLASS_DEVICE
  void load_with_byte_offset(
      Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const {

    Index pointer_offset = tile_offset.contiguous() *
                               InstructionShape::kContiguous /
                               Layout::kElementsPerAccess +
                           tile_offset.strided() * Shape::kStrided * stride_;

    byte_offset += sizeof(AccessType) * pointer_offset;

    load_with_byte_offset(frag, byte_offset);
  }

  /// Restricts the iterator to the given k-group index
  CUTLASS_DEVICE
  void set_kgroup_index(int k_group) {
    k_group_idx_ = k_group % (Policy::kGroupsPerTile / kPartitionsK);
  }

  /* ... */
};
/// Wraps the crosswise pitch-linear iterator for column-major (k-major) B operands.
template <typename Shape_, Operand Operand_, typename Element_, int Crosswise,
          typename InstructionShape_, int OpDelta_, int PartitionsK_>
class MmaTensorOpMultiplicandTileIterator<
    Shape_, Operand_, Element_,
    cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise<
        sizeof_bits<Element_>::value, Crosswise>,
    InstructionShape_, OpDelta_, 32, PartitionsK_> {
 public:

  static Operand const kOperand = Operand_;

  static_assert(kOperand == Operand::kB,
                "MmaTensorOpMultiplicandIterator for ColumnMajor Crosswise may "
                "only be instantiated for B operand to warp-level Mma.");

  using Element = Element_;
  static int const kCrosswise = Crosswise;
  static int const kOpDelta = OpDelta_;
  static int const kThreads = 32;

  /// Underlying pitch-linear tile iterator
  using Base = MmaTensorOpMultiplicandTileIterator<
      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, kOperand, Element,
      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value, kCrosswise>,
      layout::PitchLinearShape<InstructionShape::kRow,
                               InstructionShape::kColumn>,
      kOpDelta, kThreads, PartitionsK_>;

  /// Fragment object holding a thread's part of a tile
  using Fragment = Array<Element, Shape::kCount / kThreads>;

  /// Underlying pitch-linear iterator instance
  Base iterator_;

  /// Constructor from TensorRef
  CUTLASS_HOST_DEVICE
  MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
      : iterator_({ref.data(), ref.stride()}, lane_id) {}

  // Forwarding members, in column-major coordinate order (row, column):
  //
  //   add_pointer_offset(offset)     -> iterator_.add_pointer_offset(offset);
  //   add_tile_offset(t)             -> iterator_.add_tile_offset({t.row(), t.column()});
  //   operator+= / operator-=        -> add_tile_offset(+/- PitchLinearCoord(t.row(), t.column()));
  //   load_with_pointer_offset(f, p) -> iterator_.load_with_pointer_offset(f, p);
  //   load_with_byte_offset(f, b)    -> iterator_.load_with_byte_offset(f, b);
  //   load_with_byte_offset(f, t, b) -> iterator_.load_with_byte_offset(
  //                                         f, {t.contiguous(), t.strided()}, b);
  //   set_kgroup_index(k_group)      -> iterator_.set_kgroup_index(k_group);
};
/// Wraps the crosswise pitch-linear iterator for row-major (k-major) A operands.
template <typename Shape_, Operand Operand_, typename Element_, int Crosswise,
          typename InstructionShape_, int OpDelta_, int PartitionsK_>
class MmaTensorOpMultiplicandTileIterator<
    Shape_, Operand_, Element_,
    cutlass::layout::RowMajorTensorOpMultiplicandCrosswise<
        sizeof_bits<Element_>::value, Crosswise>,
    InstructionShape_, OpDelta_, 32, PartitionsK_> {
 public:

  static Operand const kOperand = Operand_;

  static_assert(kOperand == Operand::kA,
                "MmaTensorOpMultiplicandIterator for RowMajor Crosswise may "
                "only be instantiated for A operand to warp-level Mma.");

  using Element = Element_;
  static int const kCrosswise = Crosswise;
  static int const kOpDelta = OpDelta_;
  static int const kThreads = 32;

  /// Underlying pitch-linear tile iterator
  using Base = MmaTensorOpMultiplicandTileIterator<
      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, kOperand, Element,
      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value, kCrosswise>,
      layout::PitchLinearShape<InstructionShape::kColumn,
                               InstructionShape::kRow>,
      kOpDelta, kThreads, PartitionsK_>;

  /// Fragment object holding a thread's part of a tile
  using Fragment = Array<Element, Shape::kCount / kThreads>;

  /// Underlying pitch-linear iterator instance
  Base iterator_;

  /// Constructor from TensorRef
  CUTLASS_HOST_DEVICE
  MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
      : iterator_({ref.data(), ref.stride()}, lane_id) {}

  // Forwarding members, in row-major coordinate order (column, row):
  //
  //   add_pointer_offset(offset)     -> iterator_.add_pointer_offset(offset);
  //   add_tile_offset(t)             -> iterator_.add_tile_offset({t.column(), t.row()});
  //   operator+= / operator-=        -> add_tile_offset(+/- PitchLinearCoord(t.column(), t.row()));
  //   load_with_pointer_offset(f, p) -> iterator_.load_with_pointer_offset(f, p);
  //   load_with_byte_offset(f, b)    -> iterator_.load_with_byte_offset(f, b);
  //   load_with_byte_offset(f, t, b) -> iterator_.load_with_byte_offset(
  //                                         f, {t.strided(), t.contiguous()}, b);
  //   set_kgroup_index(k_group)      -> iterator_.set_kgroup_index(k_group);
};
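Similarly, a hypothetical instantiation of the RowMajor Crosswise specialization above (A operand with the K dimension contiguous). The warp tile, instruction tile, and 32-element crosswise extent are assumptions chosen for illustration, reusing the includes from the earlier sketch:

  // Hypothetical A-operand iterator over k-major (row-major) shared memory.
  using WarpTileA        = cutlass::MatrixShape<64, 32>;
  using InstructionTileA = cutlass::MatrixShape<16, 8>;
  using SmemLayoutA      = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise<
      16 /* bits per element */, 32 /* crosswise elements, typically the threadblock K */>;

  using IteratorA = cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator<
      WarpTileA, cutlass::gemm::Operand::kA, cutlass::half_t, SmemLayoutA,
      InstructionTileA, 1 /* OpDelta */, 32 /* threads */, 1 /* PartitionsK */>;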
/// Iterator over the accumulator tile held in registers by a warp-level tensor op,
/// specialized here for row-major accumulators.
template <typename Shape_, typename Element_, typename InstructionShape_,
          typename OpDelta_>
class MmaTensorOpAccumulatorTileIterator<
    Shape_, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_> {
 public:

  using Element = Element_;

  /// Number of participating threads
  static int const kThreads = 32;

  /// Internal policy
  struct Policy {
    static_assert(
        !(Shape::kRow % InstructionShape::kM) &&
            !(Shape::kColumn % InstructionShape::kN),
        "Shape of warp-level Mma must be divisible by operator shape.");

    static_assert(/* ... */,
      "Layouts must be defined for logical MatrixCoord coordinate space.");

    /// Number of MMA instructions covering the accumulator tile
    using MmaIterations = MatrixShape<Shape::kRow / InstructionShape::kM,
                                      Shape::kColumn / InstructionShape::kN>;
  };

  /// Each access covers a quarter of the instruction's N extent; accumulator rows are
  /// grouped eight at a time per instruction.
  static int const kElementsPerAccess = InstructionShape::kN / 4;
  static int const kRowsPerTile = 8;
  static int const kAccumulatorRows = InstructionShape::kM / kRowsPerTile;

  /// Fragment object holding a thread's part of a tile
  using Fragment = Array<Element, Shape::kCount / kThreads>;

  /// Constructor from TensorRef
  CUTLASS_HOST_DEVICE
  MmaTensorOpAccumulatorTileIterator(TensorRef const &ref, int lane_id)
      : ref_(ref) {

    // Each group of four consecutive lanes (a quad) maps to one accumulator row;
    // lanes within the quad step across columns by kElementsPerAccess.
    int quad = (lane_id >> 2);
    int lane_in_quad = (lane_id & 3);

    MatrixCoord lane_offset(quad, lane_in_quad * kElementsPerAccess);

    ref_.add_coord_offset(lane_offset);
  }
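A worked example of the accumulator lane mapping above, assuming InstructionShape::kN == 8 so that kElementsPerAccess == 2 (the lane index is arbitrary):

  // lane_id = 13:
  //   quad         = 13 >> 2 = 3
  //   lane_in_quad = 13 & 3  = 1
  //   lane_offset  = MatrixCoord(3, 1 * 2) = (row 3, column 2)
  // With kAccumulatorRows == 2 and kRowsPerTile == 8, thread 13 therefore owns the
  // accumulator elements at rows {3, 11} and columns {2, 3} of each 16x8 instruction tile.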
  // operator+= / operator-= advance in units of whole tiles:
  //   add_tile_offset(tile_offset) / add_tile_offset(-tile_offset)

  /// Loads a fragment from memory at the location pointed to by the iterator
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) const {
    load_with_pointer_offset(frag, 0);
  }

  /// Loads a fragment from memory with an additional pointer offset
  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
    TensorRef offset_ref(ref_);
    offset_ref.add_pointer_offset(pointer_offset);

    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {

        // First fragment index produced by this (mma_m, mma_n) instruction tile
        int mma_accum_start = kAccumulatorRows * kElementsPerAccess *
                              (mma_n * Policy::MmaIterations::kRow + mma_m);

        for (int row = 0; row < kAccumulatorRows; ++row) {
          for (int col = 0; col < kElementsPerAccess; ++col) {
            int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
                          row * kRowsPerTile;
            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
            frag[mma_accum_start + row * kElementsPerAccess + col] =
                offset_ref.at({accum_m, accum_n});
          }
        }
      }
    }
  }

  /// Stores a fragment to memory with an additional pointer offset
  CUTLASS_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
    TensorRef offset_ref(ref_);
    offset_ref.add_pointer_offset(pointer_offset);

    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {

        int mma_accum_start = kAccumulatorRows * kElementsPerAccess *
                              (mma_n * Policy::MmaIterations::kRow + mma_m);

        for (int row = 0; row < kAccumulatorRows; ++row) {
          for (int col = 0; col < kElementsPerAccess; ++col) {
            int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
                          row * kRowsPerTile;
            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
            int idx = mma_accum_start + row * kElementsPerAccess + col;
            offset_ref.at({accum_m, accum_n}) = frag[idx];
          }
        }
      }
    }
  }

  // The remaining load/store overloads forward with converted offsets:
  //   load_with_byte_offset(frag, b)   -> load_with_pointer_offset(frag, b / sizeof(Element));
  //   load(frag, tile_offset)          -> load(frag, tile_offset, 0);
  //   load(frag, tile_offset, p)       -> load_with_pointer_offset(frag, ref_.offset(tile_offset) + p);
  //   store(frag)                      -> store_with_pointer_offset(frag, 0);
  //   store_with_byte_offset(frag, b)  -> store_with_pointer_offset(frag, b / sizeof(Element));
  //   store(frag, tile_offset)         -> store(frag, tile_offset, 0);
  //   store(frag, tile_offset, p)      -> store_with_pointer_offset(frag, ref_.offset(tile_offset) + p);
};
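A hypothetical instantiation of this accumulator tile iterator. The 64x64 warp tile, float accumulators, and m16n8k8 instruction shape are assumptions for illustration (GemmShape comes from cutlass/gemm/gemm.h; any type exposing kM and kN works here):

  using AccumTile     = cutlass::MatrixShape<64, 64>;   // warp-level C tile (M, N)
  using AccumIterator = cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator<
      AccumTile, float, cutlass::layout::RowMajor,
      cutlass::gemm::GemmShape<16, 8, 8>,                // instruction shape (kM, kN used)
      cutlass::MatrixShape<1, 1>>;                       // OpDelta between MMA operations

  // Per-thread fragment: Shape::kCount / kThreads = 64 * 64 / 32 = 128 accumulators.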
/// Accumulator tile iterator specialized for column-major accumulators.
template <typename Shape_, typename Element_, typename InstructionShape_,
          typename OpDelta_>
class MmaTensorOpAccumulatorTileIterator<
    Shape_, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_> {
 public:

  using Element = Element_;

  /// Number of participating threads
  static int const kThreads = 32;

  /// Internal policy
  struct Policy {
    static_assert(
        !(Shape::kRow % InstructionShape::kM) &&
            !(Shape::kColumn % InstructionShape::kN),
        "Shape of warp-level Mma must be divisible by operator shape.");

    static_assert(/* ... */,
      "Layouts must be defined for logical MatrixCoord coordinate space.");

    /// Number of MMA instructions covering the accumulator tile
    using MmaIterations = MatrixShape<Shape::kRow / InstructionShape::kM,
                                      Shape::kColumn / InstructionShape::kN>;
  };

  static int const kElementsPerAccess = InstructionShape::kN / 4;
  static int const kRowsPerTile = 8;
  static int const kAccumulatorRows = InstructionShape::kM / kRowsPerTile;

  /// Fragment object holding a thread's part of a tile
  using Fragment = Array<Element, Shape::kCount / kThreads>;

  /// Constructor from TensorRef
  CUTLASS_HOST_DEVICE
  MmaTensorOpAccumulatorTileIterator(TensorRef const &ref, int lane_id)
      : ref_(ref) {
    int quad = (lane_id >> 2);
    int lane_in_quad = (lane_id & 3);

    MatrixCoord lane_offset(quad, lane_in_quad * kElementsPerAccess);

    ref_.add_coord_offset(lane_offset);
  }

  // operator+= / operator-= advance in units of whole tiles:
  //   add_tile_offset(tile_offset) / add_tile_offset(-tile_offset)

  /// Loads a fragment from memory at the location pointed to by the iterator
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) const {
    load_with_pointer_offset(frag, 0);
  }

  /// Loads a fragment from memory with an additional pointer offset;
  /// the traversal mirrors the row-major specialization above.
  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
    TensorRef offset_ref(ref_);
    offset_ref.add_pointer_offset(pointer_offset);

    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {

        int mma_accum_start = kAccumulatorRows * kElementsPerAccess *
                              (mma_n * Policy::MmaIterations::kRow + mma_m);

        for (int row = 0; row < kAccumulatorRows; ++row) {
          for (int col = 0; col < kElementsPerAccess; ++col) {
            int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
                          row * kRowsPerTile;
            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
            int idx = mma_accum_start + row * kElementsPerAccess + col;
            frag[idx] = offset_ref.at({accum_m, accum_n});
          }
        }
      }
    }
  }

  /// Stores a fragment to memory with an additional pointer offset
  CUTLASS_DEVICE
  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const {
    TensorRef offset_ref(ref_);
    offset_ref.add_pointer_offset(pointer_offset);

    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {

        int mma_accum_start = kAccumulatorRows * kElementsPerAccess *
                              (mma_n * Policy::MmaIterations::kRow + mma_m);

        for (int row = 0; row < kAccumulatorRows; ++row) {
          for (int col = 0; col < kElementsPerAccess; ++col) {
            int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
                          row * kRowsPerTile;
            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
            int idx = mma_accum_start + row * kElementsPerAccess + col;
            offset_ref.at({accum_m, accum_n}) = frag[idx];
          }
        }
      }
    }
  }

  // Remaining overloads forward with converted offsets:
  //   load_with_byte_offset(frag, b)   -> load_with_pointer_offset(frag, b / sizeof(Element));
  //   load(frag, tile_offset)          -> load(frag, tile_offset, 0);
  //   load(frag, tile_offset, p)       -> load_with_pointer_offset(frag, ref_.offset(tile_offset) + p);
  //   store(frag)                      -> store_with_pointer_offset(frag, 0);
  //   store_with_byte_offset(frag, b)  -> store_with_pointer_offset(frag, b / sizeof(Element));
  //   store(frag, tile_offset)         -> store(frag, tile_offset, 0);
  //   store(frag, tile_offset, p)      -> store_with_pointer_offset(frag, ref_.offset(tile_offset) + p);
};
/// Accumulator tile iterator specialized for an interleaved accumulator layout;
/// elements are accessed in small vectors rather than individually.
template <typename Shape_, typename Element_, typename InstructionShape_,
          typename OpDelta_ /* , ... */>
class MmaTensorOpAccumulatorTileIterator<
    Shape_, Element_, /* interleaved layout argument elided in this excerpt */
    InstructionShape_, OpDelta_> {
 public:

  using Element = Element_;

  /// Number of participating threads
  static int const kThreads = 32;

  /// Internal policy
  struct Policy {
    static_assert(
        !(Shape::kRow % InstructionShape::kM) &&
            !(Shape::kColumn % InstructionShape::kN),
        "Shape of warp-level Mma must be divisible by operator shape.");

    static_assert(/* ... */,
      "Layouts must be defined for logical MatrixCoord coordinate space.");

    /// Number of MMA instructions covering the accumulator tile
    using MmaIterations = MatrixShape<Shape::kRow / InstructionShape::kM,
                                      Shape::kColumn / InstructionShape::kN>;
  };

  /// Each access covers two elements
  static int const kElementsPerAccess = 2;

  /// Fragment object holding a thread's part of a tile
  using Fragment = Array<Element, Shape::kCount / kThreads>;

  /// Vectorized access type
  using AccessType = Array<Element, kElementsPerAccess>;

  /// Constructor from TensorRef
  CUTLASS_HOST_DEVICE
  MmaTensorOpAccumulatorTileIterator(TensorRef const &ref, int lane_id)
      : ref_(ref) {
    int quad = (lane_id >> 2);
    int lane_in_quad = (lane_id & 3);

    MatrixCoord lane_offset(quad, lane_in_quad * kElementsPerAccess);

    ref_.add_coord_offset(lane_offset);
  }

  // operator+= / operator-= advance in units of whole tiles:
  //   add_tile_offset(tile_offset) / add_tile_offset(-tile_offset)

  /// Loads a fragment from memory at the location pointed to by the iterator
  CUTLASS_HOST_DEVICE
  void load(Fragment &frag) const {
    load_with_pointer_offset(frag, 0);
  }

  /// Loads a fragment from memory with an additional pointer offset;
  /// one vector access is issued per MMA instruction tile.
  CUTLASS_DEVICE
  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const {
    TensorRef offset_ref(ref_);
    offset_ref.add_pointer_offset(pointer_offset);

    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);

    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
        int accum_m = mma_m * InstructionShape::kM;
        int accum_n = mma_n * InstructionShape::kN;

        int idx = mma_m + mma_n * Policy::MmaIterations::kRow;

        AccessType const *access_ptr = /* ... address of (accum_m, accum_n) in offset_ref ... */;

        frag_ptr[idx] = access_ptr[0];
      }
    }
  }

  /// Stores a fragment to memory with an additional pointer offset; the loop mirrors the
  /// load above and ends with:
  ///   access_ptr[0] = frag_ptr[idx];

  // Remaining overloads forward with converted offsets:
  //   load_with_byte_offset(frag, b)   -> load_with_pointer_offset(frag, b / sizeof(Element));
  //   load(frag, tile_offset)          -> load(frag, tile_offset, 0);
  //   load(frag, tile_offset, p)       -> load_with_pointer_offset(frag, ref_.offset(tile_offset) + p);
  //   store(frag)                      -> store_with_pointer_offset(frag, 0);
  //   store_with_byte_offset(frag, b)  -> store_with_pointer_offset(frag, b / sizeof(Element));
  //   store(frag, tile_offset)         -> store(frag, tile_offset, 0);
  //   store(frag, tile_offset, p)      -> store_with_pointer_offset(frag, ref_.offset(tile_offset) + p);
};
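Finally, a minimal usage sketch showing how these iterators are typically driven from device code. The IteratorA and AccumIterator aliases are the hypothetical ones defined in the earlier examples; nothing below is taken verbatim from this header:

  // Minimal sketch, assuming IteratorA / AccumIterator from the earlier examples and
  // shared-memory / accumulator TensorRefs prepared by the surrounding kernel.
  __device__ void warp_loop_sketch(IteratorA::TensorRef smem_ref_A,
                                   AccumIterator::TensorRef accum_ref,
                                   int lane_id, int k_groups) {
    IteratorA iter_A(smem_ref_A, lane_id);
    IteratorA::Fragment frag_A;

    for (int k = 0; k < k_groups; ++k) {
      iter_A.load(frag_A);   // one LDSM-backed load per k-group
      ++iter_A;              // advance along the K dimension
      // ... issue mma instructions consuming frag_A ...
    }

    AccumIterator iter_C(accum_ref, lane_id);
    AccumIterator::Fragment accum;
    iter_C.load(accum);      // or iter_C.store(accum) after the mainloop
  }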
 OpDelta_ OpDelta
Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) ...
Definition: mma_tensor_op_tile_iterator.h:1836
Shape_ Shape
Shape of tile to load (concept: MatrixShape) 
Definition: mma_tensor_op_tile_iterator.h:2119
typename TensorRef::LongIndex LongIndex
Long Index type. 
Definition: mma_tensor_op_tile_iterator.h:1389
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory. 
Definition: mma_tensor_op_tile_iterator.h:1139
Describes the size of a matrix tile. 
Definition: matrix_shape.h:42
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
Constructor from TensorRef. 
Definition: mma_tensor_op_tile_iterator.h:1652
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory. 
Definition: mma_tensor_op_tile_iterator.h:1915
Element_ Element
Element type. 
Definition: mma_tensor_op_tile_iterator.h:476
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator()
Default ctor constructs null iterator. 
Definition: mma_tensor_op_tile_iterator.h:1052
InstructionShape_ InstructionShape
Shape of one matrix product operation (concept: MatrixShape) 
Definition: mma_tensor_op_tile_iterator.h:2433
Definition: aligned_buffer.h:35
InstructionShape_ InstructionShape
Shape of one matrix product operation (concept: MatrixShape) 
Definition: mma_tensor_op_tile_iterator.h:483
Defines a structure containing strides, bounds, and a pointer to tensor data. 
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const 
Loads a fragment from memory with logical offset in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:1514
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const 
Loads a fragment from memory with additional logical offset. 
Definition: mma_tensor_op_tile_iterator.h:1259
typename TensorRef::Index Index
Index type. 
Definition: mma_tensor_op_tile_iterator.h:966
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
Constructor from TensorRef. 
Definition: mma_tensor_op_tile_iterator.h:223
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & operator--()
Advances the iterator along the advance dimension. 
Definition: mma_tensor_op_tile_iterator.h:569
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:1207
Definition: tensor_op_multiplicand_sm75.h:734
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory. 
Definition: mma_tensor_op_tile_iterator.h:257
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator(TensorRef const &ref, int lane_id)
Constructor from TensorRef. 
Definition: mma_tensor_op_tile_iterator.h:2496
CUTLASS_DEVICE void store(Fragment &frag, TensorCoord const &tile_offset) const 
Stores a fragment to memory with logical offset in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:2073
CUTLASS_DEVICE void store(Fragment &frag, TensorCoord const &tile_offset) const 
Stores a fragment to memory with logical offset in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:2661
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & operator--()
Advances the iterator along the advance dimension. 
Definition: mma_tensor_op_tile_iterator.h:2236
CUTLASS_HOST_DEVICE Element * data() const 
Returns the pointer to referenced data. 
Definition: tensor_ref.h:254
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
Constructor from TensorRef. 
Definition: mma_tensor_op_tile_iterator.h:765
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator++()
Advances the iterator along the advance dimension. 
Definition: mma_tensor_op_tile_iterator.h:1163
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory. 
Definition: mma_tensor_op_tile_iterator.h:773
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const 
Loads a fragment from memory with additional logical offset. 
Definition: mma_tensor_op_tile_iterator.h:2296
typename TensorRef::Index Index
Index type. 
Definition: mma_tensor_op_tile_iterator.h:1845
CUTLASS_DEVICE void store(Fragment const &frag, TensorCoord const &tile_offset, Index pointer_offset) const 
Stores a fragment from memory with logical offset in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:2381
Array< Element, Shape::kCount/kThreads > Fragment
Fragment object holding a thread's part of a tile. 
Definition: mma_tensor_op_tile_iterator.h:1026
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator()
Default ctor constructs null iterator. 
Definition: mma_tensor_op_tile_iterator.h:219
typename TensorRef::TensorCoord TensorCoord
Coordinate for an element in the tensor. 
Definition: mma_tensor_op_tile_iterator.h:732
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:328
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset)
advances in units of whole tiles along the logical coordinate space of the tensor ...
Definition: mma_tensor_op_tile_iterator.h:321
CUTLASS_DEVICE MmaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:1952
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const 
Loads a fragment from memory with logical offset in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:408
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator()
Default ctor constructs null iterator. 
Definition: mma_tensor_op_tile_iterator.h:1418
CUTLASS_HOST_DEVICE Coord< 1 > make_Coord(int _0)
Helper to make a 2-element coordinate. 
Definition: coord.h:387
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:1692
Operand
GEMM operand enumeration: D = A * B + C. 
Definition: include/cutlass/gemm/gemm.h:39
CUTLASS_DEVICE void store(Fragment const &frag, TensorCoord const &tile_offset, Index pointer_offset) const 
Stores a fragment from memory with logical offset in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:2082
Definition: tensor_op_multiplicand_sm75.h:422
Architecture-specific operators on memory added for SM75. 
Definition: tensor_op_multiplicand_sm75.h:835
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const 
Loads a fragment from memory with logical offset in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:1279
typename TensorRef::Index Index
Index type. 
Definition: mma_tensor_op_tile_iterator.h:1616
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const 
Loads a fragment from memory with logical offset in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:872
CUTLASS_HOST_DEVICE void load(Fragment &frag) const 
Loads a fragment from memory at the location pointed to by the iterator. 
Definition: mma_tensor_op_tile_iterator.h:1224
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator++()
Advances the iterator along the advance dimension. 
Definition: mma_tensor_op_tile_iterator.h:293
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const 
Loads a fragment from memory with additional logical offset. 
Definition: mma_tensor_op_tile_iterator.h:840
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const 
Loads a fragment from memory with additional logical offset. 
Definition: mma_tensor_op_tile_iterator.h:1965
typename TensorRef::Index Index
Index type. 
Definition: mma_tensor_op_tile_iterator.h:1386
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Advances an iterator along logical dimensions of matrix in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:1922
CUTLASS_DEVICE MmaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset)
advances in units of whole tiles along the logical coordinate space of the tensor ...
Definition: mma_tensor_op_tile_iterator.h:1945
Defines common types used for all GEMM-like operators. 
Definition: tensor_op_multiplicand_sm75.h:213
CUTLASS_DEVICE void set_kgroup_index(int k_group)
Definition: mma_tensor_op_tile_iterator.h:432
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator()
Default ctor constructs null iterator. 
Definition: mma_tensor_op_tile_iterator.h:1648
Shape_ Shape
Shape of tile to load (concept: PitchLinearShape) 
Definition: mma_tensor_op_tile_iterator.h:1583
typename TensorRef::Index Index
Index type. 
Definition: mma_tensor_op_tile_iterator.h:2143
InstructionShape_ InstructionShape
Shape of one matrix product operation (concept: MatrixShape) 
Definition: mma_tensor_op_tile_iterator.h:1833
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & operator--()
Advances the iterator along the advance dimension. 
Definition: mma_tensor_op_tile_iterator.h:1683
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const 
Loads a fragment from memory with additional logical offset. 
Definition: mma_tensor_op_tile_iterator.h:2562
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:816
CUTLASS_DEVICE void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const 
Stores a fragment to memory with additional pointer offset. 
Definition: mma_tensor_op_tile_iterator.h:2031
typename TensorRef::LongIndex LongIndex
Long Index type. 
Definition: mma_tensor_op_tile_iterator.h:2146
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const 
Loads a fragment from memory with additional logical offset. 
Definition: mma_tensor_op_tile_iterator.h:2263
typename TensorRef::Index Index
Index type. 
Definition: mma_tensor_op_tile_iterator.h:726
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset)
advances in units of whole tiles along the logical coordinate space of the tensor ...
Definition: mma_tensor_op_tile_iterator.h:809
CUTLASS_DEVICE MmaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset)
advances in units of whole tiles along the logical coordinate space of the tensor ...
Definition: mma_tensor_op_tile_iterator.h:2243
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator()
Default ctor constructs null iterator. 
Definition: mma_tensor_op_tile_iterator.h:1895
Array< Element, Shape::kCount/kThreads > Fragment
Fragment object holding a thread's part of a tile. 
Definition: mma_tensor_op_tile_iterator.h:1884
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const 
Loads a fragment from memory with logical offset in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:860
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const 
Loads a fragment from memory with additional logical offset. 
Definition: mma_tensor_op_tile_iterator.h:599
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:1216
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const 
Loads a fragment from memory with additional logical offset. 
Definition: mma_tensor_op_tile_iterator.h:609
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const 
Loads a fragment from memory with logical offset in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:1757
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & operator++()
Advances the iterator along the advance dimension. 
Definition: mma_tensor_op_tile_iterator.h:2229
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & operator--()
Advances the iterator along the opposite of the advance dimension. 
Definition: mma_tensor_op_tile_iterator.h:312
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Advances an iterator along logical dimensions of matrix in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:2519
CUTLASS_DEVICE void set_kgroup_index(int k_group)
Definition: mma_tensor_op_tile_iterator.h:1546
InstructionShape_ InstructionShape
Shape of one matrix product operation (concept: MatrixShape) 
Definition: mma_tensor_op_tile_iterator.h:714
CUTLASS_HOST_DEVICE TensorRef & add_coord_offset(TensorCoord const &coord)
Adds an offset to each pointer. 
Definition: tensor_ref.h:326
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const 
Loads a fragment from memory with logical offset in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:641
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Advances an iterator along logical dimensions of matrix in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:266
CUTLASS_HOST_DEVICE void load(Fragment &frag) const 
Loads a fragment from memory at the location pointed to by the iterator. 
Definition: mma_tensor_op_tile_iterator.h:2556
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const 
Loads a fragment from memory with additional logical offset. 
Definition: mma_tensor_op_tile_iterator.h:1723
Mapping function for column-major matrices. 
Definition: layout/matrix.h:142
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & operator--()
Advances the iterator along the advance dimension. 
Definition: mma_tensor_op_tile_iterator.h:1938
typename TensorRef::Index Index
Index type. 
Definition: mma_tensor_op_tile_iterator.h:2445
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator(TensorRef const &ref, int lane_id)
Constructor from TensorRef. 
Definition: mma_tensor_op_tile_iterator.h:2197
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const 
Loads a fragment from memory with additional logical offset. 
Definition: mma_tensor_op_tile_iterator.h:830
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & operator++()
Advances the iterator along the advance dimension. 
Definition: mma_tensor_op_tile_iterator.h:2528
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Advances an iterator along logical dimensions of matrix in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:782
Template defining a shape used by pitch-linear operators. 
Definition: pitch_linear.h:43
Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe ...
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:1666
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110
Array< Element, Shape::kCount/kThreads > Fragment
Fragment object holding a thread's part of a tile. 
Definition: mma_tensor_op_tile_iterator.h:202
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const 
Loads a fragment from memory with logical offset in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:1503
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const 
Loads a fragment from memory with additional logical offset. 
Definition: mma_tensor_op_tile_iterator.h:1483
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Advances an iterator along logical dimensions of matrix in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:2220
Defines layout functions used by TensorRef and derived classes for common 4-D and 5-D tensor formats...
CUTLASS_DEVICE void set_kgroup_index(int k_group)
Definition: mma_tensor_op_tile_iterator.h:1776
Shape_ Shape
Shape of tile to load (concept: PitchLinearShape) 
Definition: mma_tensor_op_tile_iterator.h:466
OpDelta_ OpDelta
Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) ...
Definition: mma_tensor_op_tile_iterator.h:2436
typename TensorRef::TensorCoord TensorCoord
Coordinate for an element in the tensor. 
Definition: mma_tensor_op_tile_iterator.h:1851
CUTLASS_DEVICE void store_with_byte_offset(Fragment const &frag, Index byte_offset) const 
Stores a fragment to memory with additional pointer offset. 
Definition: mma_tensor_op_tile_iterator.h:2652
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator()
Default ctor constructs null iterator. 
Definition: mma_tensor_op_tile_iterator.h:2193
CUTLASS_HOST_DEVICE Stride stride() const 
Returns the layout object's stride vector. 
Definition: tensor_ref.h:277
CUTLASS_DEVICE void store(Fragment &frag, TensorCoord const &tile_offset) const 
Stores a fragment to memory with logical offset in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:2372
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory. 
Definition: mma_tensor_op_tile_iterator.h:1427
typename Layout::TensorCoord TensorCoord
Coordinate in logical tensor space. 
Definition: tensor_ref.h:171
CUTLASS_HOST_DEVICE void store(Fragment const &frag) const 
Stores a fragment to memory. 
Definition: mma_tensor_op_tile_iterator.h:2025
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
Constructor from TensorRef. 
Definition: mma_tensor_op_tile_iterator.h:534
Defines a Shape template for matrix tiles. 
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const 
Loads a fragment from memory with logical offset in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:386
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const 
Loads a fragment from memory with logical offset in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:1527
Defines the size of an element in bits. 
Definition: numeric_types.h:42
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const 
Loads a fragment from memory with additional logical offset. 
Definition: mma_tensor_op_tile_iterator.h:342
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const 
Loads a fragment from memory with logical offset in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:1291
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const 
Loads a fragment from memory with additional logical offset. 
Definition: mma_tensor_op_tile_iterator.h:1997
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:1471
Definition: mma_tensor_op_tile_iterator.h:1794
Array< Element, Shape::kCount/kThreads > Fragment
Fragment object holding a thread's part of a tile. 
Definition: mma_tensor_op_tile_iterator.h:1409
Array< Element, Shape::kCount/kThreads > Fragment
Fragment object holding a thread's part of a tile. 
Definition: mma_tensor_op_tile_iterator.h:750
typename TensorRef::TensorCoord TensorCoord
Coordinate for an element in the tensor. 
Definition: mma_tensor_op_tile_iterator.h:1622
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory. 
Definition: mma_tensor_op_tile_iterator.h:1657
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator()
Default ctor constructs null iterator. 
Definition: mma_tensor_op_tile_iterator.h:761
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const 
Loads a fragment from memory with additional logical offset. 
Definition: mma_tensor_op_tile_iterator.h:1228
CUTLASS_DEVICE MmaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:2549
CUTLASS_HOST_DEVICE void store(Fragment const &frag) const 
Stores a fragment to memory. 
Definition: mma_tensor_op_tile_iterator.h:2618
CUTLASS_HOST_DEVICE void load(Fragment &frag) const 
Loads a fragment from memory at the location pointed to by the iterator. 
Definition: mma_tensor_op_tile_iterator.h:823
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & operator++()
Advances the iterator along the advance dimension. 
Definition: mma_tensor_op_tile_iterator.h:1675
CUTLASS_DEVICE void store_with_byte_offset(Fragment const &frag, Index byte_offset) const 
Stores a fragment to memory with additional pointer offset. 
Definition: mma_tensor_op_tile_iterator.h:2064
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:1436
Shape_ Shape
Shape of tile to load (concept: PitchLinearShape) 
Definition: mma_tensor_op_tile_iterator.h:107
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & operator++()
Advances the iterator along the advance dimension. 
Definition: mma_tensor_op_tile_iterator.h:791
typename TensorRef::Index Index
Index type. 
Definition: mma_tensor_op_tile_iterator.h:138
InstructionShape_ InstructionShape
Shape of one matrix product operation (concept: MatrixShape) 
Definition: mma_tensor_op_tile_iterator.h:1373
CUTLASS_DEVICE void set_kgroup_index(int k_group)
Definition: mma_tensor_op_tile_iterator.h:662
Shape_ Shape
Shape of tile to load (concept: MatrixShape) 
Definition: mma_tensor_op_tile_iterator.h:2421
typename TensorRef::LongIndex LongIndex
Long Index type. 
Definition: mma_tensor_op_tile_iterator.h:969
OpDelta_ OpDelta
Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) ...
Definition: mma_tensor_op_tile_iterator.h:2134
CUTLASS_DEVICE void set_kgroup_index(int k_group)
Definition: mma_tensor_op_tile_iterator.h:1316
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & operator++()
Advances the iterator along the advance dimension. 
Definition: mma_tensor_op_tile_iterator.h:560
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89
CUTLASS_DEVICE void set_kgroup_index(int k_group)
Definition: mma_tensor_op_tile_iterator.h:893
Top-level include for all CUTLASS numeric types. 
CUTLASS_HOST_DEVICE LongIndex offset(TensorCoord const &coord) const 
Computes the offset of an index from the origin of the tensor. 
Definition: tensor_ref.h:301
Definition: mma_tensor_op_tile_iterator.h:75
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory. 
Definition: mma_tensor_op_tile_iterator.h:2512
Shape_ Shape
Shape of tile to load (concept: PitchLinearShape) 
Definition: mma_tensor_op_tile_iterator.h:930
typename TensorRef::LongIndex LongIndex
Long Index type. 
Definition: mma_tensor_op_tile_iterator.h:141
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const 
Loads a fragment from memory with logical offset in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:2599
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const 
Loads a fragment from memory with additional logical offset. 
Definition: mma_tensor_op_tile_iterator.h:1493
CUTLASS_HOST_DEVICE void load(Fragment &frag) const 
Loads a fragment from memory at the location pointed to by the iterator. 
Definition: mma_tensor_op_tile_iterator.h:1959
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & operator++()
Advances the iterator along the advance dimension. 
Definition: mma_tensor_op_tile_iterator.h:1931
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const 
Loads a fragment from memory with logical offset in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:2608
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & operator++()
Advances the iterator along the advance dimension. 
Definition: mma_tensor_op_tile_iterator.h:1445
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & operator--()
Advances the iterator along the advance dimension. 
Definition: mma_tensor_op_tile_iterator.h:1453
CUTLASS_DEVICE void store_with_byte_offset(Fragment const &frag, Index byte_offset) const 
Stores a fragment to memory with additional pointer offset. 
Definition: mma_tensor_op_tile_iterator.h:2363
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const 
Loads a fragment from memory with logical offset in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:2305
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:585
InstructionShape_ InstructionShape
Shape of one matrix product operation (concept: GemmShape) 
Definition: mma_tensor_op_tile_iterator.h:123
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory. 
Definition: mma_tensor_op_tile_iterator.h:542
typename TensorRef::LongIndex LongIndex
Long Index type. 
Definition: mma_tensor_op_tile_iterator.h:729
typename TensorRef::LongIndex LongIndex
Long Index type. 
Definition: mma_tensor_op_tile_iterator.h:1848
CUTLASS_HOST_DEVICE void load(Fragment &frag) const 
Loads a fragment from memory at the location pointed to by the iterator. 
Definition: mma_tensor_op_tile_iterator.h:1709
Array< Element, Shape::kCount/kThreads > Fragment
Fragment object holding a thread's part of a tile. 
Definition: mma_tensor_op_tile_iterator.h:519
typename TensorRef::LongIndex LongIndex
Long Index type. 
Definition: mma_tensor_op_tile_iterator.h:1619
typename TensorRef::Index Index
Index type. 
Definition: mma_tensor_op_tile_iterator.h:495
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
Constructor from TensorRef. 
Definition: mma_tensor_op_tile_iterator.h:1422
typename Layout::Index Index
Index type. 
Definition: tensor_ref.h:165
Mapping function for row-major matrices. 
Definition: layout/matrix.h:50
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const 
Loads a fragment from memory with logical offset in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:1744
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const 
Loads a fragment from memory with logical offset in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:2015
InstructionShape_ InstructionShape
Shape of one matrix product operation (concept: MatrixShape) 
Definition: mma_tensor_op_tile_iterator.h:2131
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const 
Loads a fragment from memory with logical offset in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:1269
Shape_ Shape
Shape of tile to load (concept: MatrixShape) 
Definition: mma_tensor_op_tile_iterator.h:1821
InstructionShape_ InstructionShape
Shape of one matrix product operation (concept: GemmShape) 
Definition: mma_tensor_op_tile_iterator.h:950
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator()
Default ctor constructs null iterator. 
Definition: mma_tensor_op_tile_iterator.h:530
CUTLASS_HOST_DEVICE Reference at(TensorCoord const &coord) const 
Returns a reference to the element at a given Coord. 
Definition: tensor_ref.h:307
CUTLASS_HOST_DEVICE void store(Fragment const &frag) const 
Stores a fragment to memory. 
Definition: mma_tensor_op_tile_iterator.h:2324
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const 
Loads a fragment from memory with additional logical offset. 
Definition: mma_tensor_op_tile_iterator.h:376
CUTLASS_HOST_DEVICE void load(Fragment &frag) const 
Loads a fragment from memory at the location pointed to by the iterator. 
Definition: mma_tensor_op_tile_iterator.h:2257
Defines layout functions used by TensorRef and derived classes. 
CUTLASS_DEVICE void store(Fragment const &frag, TensorCoord const &tile_offset, Index pointer_offset) const 
Stores a fragment to memory with logical offset in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:2670
Shape_ Shape
Shape of tile to load (concept: PitchLinearShape) 
Definition: mma_tensor_op_tile_iterator.h:1353
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const 
Loads a fragment from memory with logical offset in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:396
Shape_ Shape
Shape of tile to load (concept: PitchLinearShape) 
Definition: mma_tensor_op_tile_iterator.h:697
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:1701
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:1148
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset)
Advances in units of whole tiles along the logical coordinate space of the tensor ...
Definition: mma_tensor_op_tile_iterator.h:578
Array< Element, kElementsPerAccess > AccessType
Definition: mma_tensor_op_tile_iterator.h:2478
typename TensorRef::LongIndex LongIndex
Long Index type. 
Definition: mma_tensor_op_tile_iterator.h:498
Defines layout functions used by TensorRef and derived classes for pitch-linear memory. 
Definition: layout/matrix.h:343
CUTLASS_HOST_DEVICE TensorRef & add_pointer_offset(LongIndex offset_)
Adds an offset to each pointer. 
Definition: tensor_ref.h:319
Array< Element, Shape::kCount/kThreads > Fragment
Fragment object holding a thread's part of a tile. 
Definition: mma_tensor_op_tile_iterator.h:1639
typename TensorRef::TensorCoord TensorCoord
Coordinate for an element in the tensor. 
Definition: mma_tensor_op_tile_iterator.h:972
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const 
Loads a fragment from memory with logical offset in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:1733
CUTLASS_HOST_DEVICE void load(Fragment &frag) const 
Loads a fragment from memory at the location pointed to by the iterator. 
Definition: mma_tensor_op_tile_iterator.h:335
CUTLASS_HOST_DEVICE void load(Fragment &frag) const 
Loads a fragment from memory at the location pointed to by the iterator. 
Definition: mma_tensor_op_tile_iterator.h:1479
CUTLASS_DEVICE void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const 
Stores a fragment to memory with additional pointer offset. 
Definition: mma_tensor_op_tile_iterator.h:2330
CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const 
Loads a fragment from memory with additional logical offset. 
Definition: mma_tensor_op_tile_iterator.h:1713
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id)
Constructor from TensorRef. 
Definition: mma_tensor_op_tile_iterator.h:1061
CUTLASS_DEVICE void load_with_byte_offset(Fragment &frag, Index byte_offset) const 
Loads a fragment from memory with additional logical offset. 
Definition: mma_tensor_op_tile_iterator.h:2590
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator(TensorRef const &ref, int lane_id)
Constructor from TensorRef. 
Definition: mma_tensor_op_tile_iterator.h:1899
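MmaTensorOpAccumulatorTileIterator follows the same pattern for the accumulator (C/D) tile: construct from a TensorRef and lane id, load the source accumulators, and store the updated values. A hedged sketch, with WarpIteratorC standing in for a concrete instantiation chosen elsewhere:

// A sketch under assumptions, not library code: WarpIteratorC is a concrete
// MmaTensorOpAccumulatorTileIterator instantiation; ref_C covers the warp's tile.
#include "cutlass/cutlass.h"

template <typename WarpIteratorC>
CUTLASS_DEVICE
void read_modify_write_accumulators(
    typename WarpIteratorC::TensorRef ref_C,
    typename WarpIteratorC::Fragment &accum,
    int lane_id) {

  WarpIteratorC iter_C(ref_C, lane_id);

  iter_C.load(accum);     // read this lane's share of the source accumulators
  // ... accumulate into `accum` with warp-level mma operations ...
  iter_C.store(accum);    // write the updated accumulators back to the same tile
}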
Definition: tensor_op_multiplicand_sm75.h:632
Element_ Element
Element type. 
Definition: mma_tensor_op_tile_iterator.h:116
typename TensorRef::LongIndex LongIndex
Long Index type. 
Definition: mma_tensor_op_tile_iterator.h:2448
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const 
Loads a fragment from memory with logical offset in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:629
typename TensorRef::TensorCoord TensorCoord
Coordinate for an element in the tensor. 
Definition: mma_tensor_op_tile_iterator.h:1392
CUTLASS_DEVICE MmaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:2250
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const 
Loads a fragment from memory with logical offset in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:850
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & operator--()
Advances the iterator along the advance dimension. 
Definition: mma_tensor_op_tile_iterator.h:1202
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const 
Loads a fragment from memory with logical offset in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:619
typename TensorRef::TensorCoord TensorCoord
Coordinate for an element in the tensor. 
Definition: mma_tensor_op_tile_iterator.h:2149
Array< Element, Shape::kCount/kThreads > Fragment
Fragment object holding a thread's part of a tile. 
Definition: mma_tensor_op_tile_iterator.h:2182
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const 
Loads a fragment from memory with logical offset in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:2314
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & operator--()
Advances the iterator along the advance dimension. 
Definition: mma_tensor_op_tile_iterator.h:2535
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & operator--()
Advances the iterator along the advance dimension. 
Definition: mma_tensor_op_tile_iterator.h:800
Basic include for CUTLASS. 
Definition: matrix_coord.h:39
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator()
Default ctor constructs null iterator. 
Definition: mma_tensor_op_tile_iterator.h:2492
typename TensorRef::TensorCoord TensorCoord
Coordinate for an element in the tensor. 
Definition: mma_tensor_op_tile_iterator.h:144
CUTLASS_HOST_DEVICE void load(Fragment &frag) const 
Loads a fragment from memory at the location pointed to by the iterator. 
Definition: mma_tensor_op_tile_iterator.h:592
CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset)
Definition: mma_tensor_op_tile_iterator.h:1462
CUTLASS_DEVICE MmaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset)
Advances in units of whole tiles along the logical coordinate space of the tensor ...
Definition: mma_tensor_op_tile_iterator.h:2542
typename TensorRef::TensorCoord TensorCoord
Coordinate for an element in the tensor. 
Definition: mma_tensor_op_tile_iterator.h:501
CUTLASS_DEVICE void load(Fragment &frag, TensorCoord const &tile_offset) const 
Loads a fragment from memory with logical offset in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:2006
CUTLASS_DEVICE void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const 
Stores a fragment to memory with additional pointer offset. 
Definition: mma_tensor_op_tile_iterator.h:2624
typename TensorRef::TensorCoord TensorCoord
Coordinate for an element in the tensor. 
Definition: mma_tensor_op_tile_iterator.h:2451
InstructionShape_ InstructionShape
Shape of one matrix product operation (concept: MatrixShape) 
Definition: mma_tensor_op_tile_iterator.h:1603
Array< Element, Shape::kCount/kThreads > Fragment
Fragment object holding a thread's part of a tile. 
Definition: mma_tensor_op_tile_iterator.h:2481
typename Layout::LongIndex LongIndex
Long index used for pointer offsets. 
Definition: tensor_ref.h:168
CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator & add_tile_offset(TensorCoord const &tile_offset)
Advances an iterator along the logical dimensions of the matrix in units of whole tiles. 
Definition: mma_tensor_op_tile_iterator.h:551
Definition: tensor_op_multiplicand_sm75.h:527
CUTLASS_HOST_DEVICE MmaTensorOpAccumulatorTileIterator & add_pointer_offset(LongIndex offset)
Adds a pointer offset to internal pointer(s) to advance through memory. 
Definition: mma_tensor_op_tile_iterator.h:2213
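The add_tile_offset and operator+=/-= entries advance an iterator by whole tiles in the logical (contiguous, strided) coordinate space, whereas add_pointer_offset moves the underlying pointer(s) by a raw element count. A sketch of both forms of advancement, under the assumption that for the multiplicand iterators the strided coordinate corresponds to the K dimension; stage_elements is a hypothetical per-stage element count of a multi-buffered shared-memory tile:

// Sketch only: `iter` may be any of the warp tile iterators listed above.
#include "cutlass/cutlass.h"

template <typename WarpIterator>
CUTLASS_DEVICE
void advance_iterator(WarpIterator &iter, int k_tile_offset, long long stage_elements) {

  // Whole-tile step in the logical (contiguous, strided) coordinate space;
  // the contiguous coordinate is left unchanged here.
  iter.add_tile_offset({0, k_tile_offset});

  // Raw pointer step, in units of elements rather than tiles, e.g. to move to
  // the next shared-memory stage of a multi-buffered mainloop.
  iter.add_pointer_offset(stage_elements);
}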