/// Host-side launcher fragment for kernel::TensorForEach<Func, Rank, Params>.
///
/// NOTE(review): this listing is not contiguous (the embedded listing numbers jump,
/// e.g. 38 -> 44 -> 47), so the enclosing declaration, the constructor signature, the
/// remaining arguments of the occupancy query and the closing braces are not shown here.
/// Confirm every hedged remark below against the complete header.
38 template <
typename Func, 
int Rank, 
typename Params>
           // Entered when either grid_size or block_size is zero.
    44     if (!grid_size || !block_size) {
             // Ask the CUDA occupancy API for a launch configuration for this kernel.
             // NOTE(review): the output arguments of the call are on lines missing from this
             // listing. If the query writes both grid_size and block_size, a nonzero value the
             // caller supplied for one of them is replaced whenever the other is zero — confirm
             // whether that is intended.
    47       cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
    50         reinterpret_cast<void const *>(kernel::TensorForEach<Func, Rank, Params>));
             // Any status other than cudaSuccess is reported as a std::runtime_error.
    52       if (result != cudaSuccess) {
    53         throw std::runtime_error(
"Failed to query occupancy.");
             // Clamp the block size to at most 128 threads (min(block_size, 128)).
    58       block_size = (block_size < 128 ? block_size : 128);
           // One-dimensional launch shape: only the x extent of grid and block is used.
    61     dim3 grid(grid_size, 1, 1);
    62     dim3 block(block_size, 1, 1);
           // Launch with no dynamic shared memory or stream argument in the execution configuration.
           // NOTE(review): no launch-error check (e.g. cudaGetLastError()) is visible after this
           // line; the following listing lines are missing, so confirm whether one exists.
    64     kernel::TensorForEach<Func, Rank, Params><<< grid, block >>>(size, params);
    /// Host-side launcher fragment for kernel::TensorDiagonalForEach<Func, Rank, Params>.
    ///
    /// NOTE(review): this listing is not contiguous (71 -> 81), so the enclosing declaration,
    /// the constructor signature and any preprocessing of start/end are not shown here.
    71 template <
typename Func, 
int Rank, 
typename Params>
           // One-dimensional thread block of block_size threads.
    81     dim3 block(block_size, 1, 1);
           // Number of blocks is (end - start) divided by block_size, rounded up.
           // NOTE(review): this expression needs end >= start and block_size > 0 (a zero
           // block_size divides by zero). The signature documented elsewhere in this file
           // defaults end to -1, so end is presumably replaced by a real bound on lines
           // missing from this listing — confirm.
    82     dim3 grid((end - start + block_size - 1) / block_size, 1, 1);
           // Launch with no dynamic shared memory or stream argument; start and end are
           // forwarded to the kernel unchanged from this point.
           // NOTE(review): no launch-error check is visible after this line — confirm.
    84     kernel::TensorDiagonalForEach<Func, Rank, Params><<< grid, block >>>(size, params, start, end);
    /// Host-side launcher fragment for kernel::BlockForEach<Element, Func>.
    ///
    /// NOTE(review): this listing is not contiguous (91 -> 98 -> 100 -> 102), so the enclosing
    /// declaration, the leading constructor parameters, the remaining arguments of the
    /// occupancy query and the closing braces are not shown here.
    91 template <
typename Element, 
typename Func>
           // params defaults to a default-constructed Func::Params; block_size defaults to 0.
    98     typename Func::Params params = 
typename Func::Params(), 
   100     int block_size = 0) {
           // Entered when either grid_size or block_size is zero.
   102     if (!grid_size || !block_size) {
             // Ask the CUDA occupancy API for a launch configuration for this kernel.
             // NOTE(review): the output arguments of the call are on lines missing from this
             // listing. If the query writes both grid_size and block_size, a nonzero value the
             // caller supplied for one of them is replaced whenever the other is zero — confirm
             // whether that is intended.
   105       cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
   108         reinterpret_cast<void const *>(kernel::BlockForEach<Element, Func>));
             // Any status other than cudaSuccess is reported as a std::runtime_error.
   110       if (result != cudaSuccess) {
   111         throw std::runtime_error(
"Failed to query occupancy.");
             // Clamp the block size to at most 128 threads (min(block_size, 128)).
   116       block_size = (block_size < 128 ? block_size : 128);
           // One-dimensional launch shape: only the x extent of grid and block is used.
   119     dim3 grid(grid_size, 1, 1);
   120     dim3 block(block_size, 1, 1);
           // Launch with no dynamic shared memory or stream argument in the execution configuration.
           // NOTE(review): no launch-error check (e.g. cudaGetLastError()) is visible after this
           // line; the following listing lines are missing, so confirm whether one exists.
   122     kernel::BlockForEach<Element, Func><<< grid, block >>>(ptr, capacity, params);
 Definition: aligned_buffer.h:35
TensorDiagonalForEach(Coord< Rank > size, Params params=Params(), int start=0, int end=-1, int block_size=128)
Constructor performs the operation: it launches the kernel::TensorDiagonalForEach kernel with the given size, params, start, and end.
Definition: device/tensor_foreach.h:75
TensorForEach(Coord< Rank > size, Params params=Params(), int grid_size=0, int block_size=0)
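Constructor performs the operation: it launches the kernel::TensorForEach kernel with the given size and params, querying the CUDA occupancy API for a launch configuration when grid_size or block_size is zero.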
Definition: device/tensor_foreach.h:42
Launches a kernel calling a functor for each element along a tensor's diagonal. 
Definition: device/tensor_foreach.h:72
BlockForEach(Element *ptr, size_t capacity, typename Func::Params params=typename Func::Params(), int grid_size=0, int block_size=0)
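Constructor performs the operation: it launches the kernel::BlockForEach kernel with the given ptr, capacity, and params, querying the CUDA occupancy API for a launch configuration when grid_size or block_size is zero.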
Definition: device/tensor_foreach.h:95
Launches a kernel calling a functor for each element in a tensor's index space. 
Definition: device/tensor_foreach.h:39
Statically-sized array specifying Coords within a tensor. 
Definition: coord.h:43
Definition: device/tensor_foreach.h:92
Basic include for CUTLASS.