Mirror of https://github.com/NVIDIA/cuda-samples.git (synced 2025-04-11 02:32:12 +01:00)

Compare commits: 17 commits, 398ba6521a ... d2dae9c31b
Commits:

d2dae9c31b
8d564d5e3a
37c5bcbef4
940a4c7a91
61bd39800d
8a96d2eee7
e762d58260
8fd1701744
94765c1597
c87881f02c
25400b6b3c
22424227e7
42ff742bf5
8ccb13c6f0
fc06e3de18
a3b5b817e3
5925483b33
@@ -241,7 +241,7 @@ inline int gpuGetMaxGflopsDeviceIdDRV() {
     }

     unsigned long long compute_perf =
-        (unsigned long long)(multiProcessorCount * sm_per_multiproc *
+        ((unsigned long long)multiProcessorCount * sm_per_multiproc *
                              clockRate);

     if (compute_perf > max_compute_perf) {
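The point of this change is where the widening to 64 bits happens: in the old line the three `int` factors are multiplied in 32-bit arithmetic and only the (possibly already overflowed) product is cast, while the new line widens `multiProcessorCount` first so the whole product is computed as `unsigned long long`. A standalone sketch of the difference, with illustrative values not taken from the sample:

```cpp
#include <cstdio>

int main() {
    // Hypothetical values, chosen so the 32-bit product exceeds INT_MAX.
    int multiProcessorCount = 132;
    int sm_per_multiproc    = 128;
    int clockRate           = 1980000;  // kHz

    // Old form: int * int * int overflows (undefined behavior for signed int)
    // before the result is widened to 64 bits.
    unsigned long long old_form =
        (unsigned long long)(multiProcessorCount * sm_per_multiproc * clockRate);

    // New form: widen the first factor, so the multiplication is done in 64 bits.
    unsigned long long new_form =
        ((unsigned long long)multiProcessorCount * sm_per_multiproc * clockRate);

    printf("old form: %llu\nnew form: %llu\n", old_form, new_form);
    return 0;
}
```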
@@ -203,7 +203,7 @@ Vulkan is a low-overhead, cross-platform 3D graphics and compute API. Vulkan tar
 #### GLFW
 GLFW is a lightweight, open-source library designed for managing OpenGL, OpenGL ES, and Vulkan contexts. It simplifies the process of creating and managing windows, handling user input (keyboard, mouse, and joystick), and working with multiple monitors in a cross-platform manner.

-To set up GLFW on a Windows system, Download the pre-built binaries from [GLFW website](https://www.glfw.org/download.html) and extract the zip file into the folder, pass the GLFW include header as `-DGLFW_INCLUDE_DIR` for cmake configuring and follow the Build_instructions.txt in the sample folder to set up the t.
+To set up GLFW on a Windows system, download the pre-built binaries from the [GLFW website](https://www.glfw.org/download.html), extract the zip file into a folder, then pass the GLFW include folder as `-DGLFW_INCLUDE_DIR` and the lib folder as `-DGLFW_LIB_DIR` when configuring with CMake.

 #### OpenMP
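For reference, the configure step described by the new text might look like the following; the extraction paths are placeholders, not paths mandated by the samples:

```
cmake .. -DGLFW_INCLUDE_DIR="C:/path/to/glfw/include" -DGLFW_LIB_DIR="C:/path/to/glfw/lib-vc2022"
```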
@@ -2,7 +2,7 @@

 ## Description

-This sample implements matrix multiplication and is exactly the same as Chapter 6 of the programming guide. It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for matrix multiplication. To illustrate GPU performance for matrix multiply, this sample also shows how to use the new CUDA 4.0 interface for CUBLAS to demonstrate high-performance performance for matrix multiplication.
+This sample implements matrix multiplication and is exactly the same as the second example of the [Shared Memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory) section of the programming guide. It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for matrix multiplication. To illustrate GPU performance for matrix multiply, this sample also shows how to use the CUDA 4.0+ interface for cuBLAS to demonstrate high-performance matrix multiplication.

 ## Key Concepts
@@ -2,7 +2,7 @@

 ## Description

-This sample implements matrix multiplication and is exactly the same as Chapter 6 of the programming guide. It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for matrix multiplication. To illustrate GPU performance for matrix multiply, this sample also shows how to use the new CUDA 4.0 interface for CUBLAS to demonstrate high-performance performance for matrix multiplication.
+This sample implements matrix multiplication and is exactly the same as the second example of the [Shared Memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory) section of the programming guide. It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for matrix multiplication. To illustrate GPU performance for matrix multiply, this sample also shows how to use the CUDA 4.0+ interface for cuBLAS to demonstrate high-performance matrix multiplication.

 ## Key Concepts
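The Shared Memory section referenced in the updated description covers the tiled formulation in which each thread block stages sub-matrices of A and B in shared memory. A condensed sketch of that pattern follows; it is an illustration rather than the sample's own kernel, the name `matMulTiled` is hypothetical, and it assumes square row-major N x N matrices with N a multiple of the 16-element tile width:

```cuda
#define TILE 16

// C = A * B for row-major N x N matrices; N is assumed to be a multiple of TILE.
__global__ void matMulTiled(const float *A, const float *B, float *C, int N) {
    __shared__ float As[TILE][TILE];
    __shared__ float Bs[TILE][TILE];

    int row = blockIdx.y * TILE + threadIdx.y;
    int col = blockIdx.x * TILE + threadIdx.x;
    float acc = 0.0f;

    // Walk across the tiles of A's row and B's column.
    for (int t = 0; t < N / TILE; ++t) {
        As[threadIdx.y][threadIdx.x] = A[row * N + t * TILE + threadIdx.x];
        Bs[threadIdx.y][threadIdx.x] = B[(t * TILE + threadIdx.y) * N + col];
        __syncthreads();  // wait until the tile is fully staged

        for (int k = 0; k < TILE; ++k)
            acc += As[threadIdx.y][k] * Bs[k][threadIdx.x];
        __syncthreads();  // finish reads before the tile is overwritten
    }

    C[row * N + col] = acc;
}

// Launch sketch: matMulTiled<<<dim3(N / TILE, N / TILE), dim3(TILE, TILE)>>>(dA, dB, dC, N);
```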
@@ -57,7 +57,7 @@ int main(int argc, char **argv) {
   // Get GPU information
   checkCudaErrors(cudaGetDevice(&devID));
   checkCudaErrors(cudaGetDeviceProperties(&props, devID));
-  printf("Device %d: \"%s\" with Compute %d.%d capability\n", devID, props.name,
+  printf("Device %d: \"%s\" with Compute capability %d.%d\n", devID, props.name,
          props.major, props.minor);

   printf("printf() is called. Output:\n\n");
@@ -11,5 +11,4 @@ This sample enumerates the properties of the CUDA devices present in the system.
 This sample enumerates the properties of the CUDA devices present using CUDA Driver API calls

 ### [topologyQuery](./topologyQuery)
-A simple exemple on how to query the topology of a system with multiple GPU
-
+A simple example on how to query the topology of a system with multiple GPU
@@ -34,13 +34,12 @@
 #define _KERNELS_H_

 #include <stdio.h>
-#include <thrust/functional.h>

 #include "common.cuh"

 // Functors used with thrust library.
 template <typename Input>
-struct IsGreaterEqualThan : public thrust::unary_function<Input, bool>
+struct IsGreaterEqualThan
 {
   __host__ __device__ IsGreaterEqualThan(uint upperBound) :
     upperBound_(upperBound) {}
Samples/3_CUDA_Features/globalToShmemTMACopy/README.md (new file, 49 lines)
@@ -0,0 +1,49 @@
# globalToShmemTMACopy - Global Memory to Shared Memory TMA Copy

## Description

This sample shows how to use the CUDA driver API and inline PTX assembly to copy
a 2D tile of a tensor into shared memory. It also demonstrates the arrive-wait
barrier for synchronization.

## Key Concepts

CUDA Runtime API, CUDA Driver API, PTX ISA, CPP11 CUDA

## Supported SM Architectures

This sample requires compute capability 9.0 or higher.

[SM 9.0](https://developer.nvidia.com/cuda-gpus)

## Supported OSes

Linux, Windows, QNX

## Supported CPU Architecture

x86_64, ppc64le, armv7l, aarch64

## CUDA APIs involved

### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
cudaMalloc, cudaMemcpy, cudaFree, cudaDeviceSynchronize

### [CUDA Driver API](http://docs.nvidia.com/cuda/cuda-driver-api/index.html)
cuTensorMapEncodeTiled

### [CUDA PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html)

## Dependencies needed to build/run
[CPP11](../../../README.md#cpp11)

## Prerequisites

Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in the [Dependencies]() section above are installed.

## Build and Run

## References (for more details)
Samples/3_CUDA_Features/globalToShmemTMACopy/globalToShmemTMACopy.cu (new file, 354 lines)
@@ -0,0 +1,354 @@
/* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/**
 * This sample demonstrates how to:
 *
 * - Create a TensorMap (TMA descriptor)
 * - Load a 2D tile of data into shared memory
 *
 * Compile and run with:
 *
 *   nvcc -arch sm_90 -run globalToShmemTMACopy.cu
 *
 * The compiler may issue the following note; it can be safely ignored:
 *
 *   note: the ABI for passing parameters with 64-byte alignment has changed in
 *   GCC 4.6
 */
#include <cstdio>         // printf
#include <vector>         // std::vector

#include <cudaTypedefs.h> // PFN_cuTensorMapEncodeTiled
#include <cuda.h>         // CUtensormap

#include <cuda_awbarrier_primitives.h> // __mbarrier_*

#include "util.h"         // CUDA_CHECK macro

/*
 * Constants.
 */
constexpr size_t W_global = 1024; // Width of tensor (in # elements)
constexpr size_t H_global = 1024; // Height of tensor (in # elements)

constexpr int SMEM_W = 32;        // Width of shared memory buffer (in # elements)
constexpr int SMEM_H = 8;         // Height of shared memory buffer (in # elements)

/*
 * CUDA Driver API
 */

PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled() {
  void* driver_ptr = nullptr;
  cudaDriverEntryPointQueryResult driver_status;
  CUDA_CHECK(cudaGetDriverEntryPoint("cuTensorMapEncodeTiled", &driver_ptr, cudaEnableDefault, &driver_status));
  return reinterpret_cast<PFN_cuTensorMapEncodeTiled>(driver_ptr);
}

/*
 * PTX wrappers
 */

inline __device__ __mbarrier_token_t barrier_arrive1_tx(
    __mbarrier_t *barrier, uint32_t expected_tx_count
)
{
  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive
  __mbarrier_token_t token;

  asm volatile("mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;"
               : "=l"(token)
               : "r"(static_cast<unsigned int>(__cvta_generic_to_shared(barrier))), "r"(expected_tx_count)
               : "memory");
  return token;
}

inline __device__ bool barrier_try_wait_token(__mbarrier_t *barrier, __mbarrier_token_t token)
{
  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait
  //
  // This function returns a bool, so that software can retry.
  //
  // The HW only provides best-effort waiting support. The wait time is limited
  // by the HW capability, after which a fail occurs, in which case the SW is
  // responsible for retrying.
  int __ready;
  asm volatile("{\n\t"
               ".reg .pred p;\n\t"
               "mbarrier.try_wait.acquire.cta.shared::cta.b64 p, [%1], %2;\n\t"
               "selp.b32 %0, 1, 0, p;\n\t"
               "}"
               : "=r"(__ready)
               : "r"(static_cast<unsigned int>(__cvta_generic_to_shared(barrier))),
                 "l"(token)
               : "memory");
  return __ready;
}

inline __device__ void cp_async_bulk_tensor_2d(
    __mbarrier_t *barrier, void *dst, int access_coord_x, int access_coord_y, const CUtensorMap *tensor_desc)
{
  unsigned smem_int_ptr         = static_cast<unsigned int>(__cvta_generic_to_shared(dst));
  unsigned smem_barrier_int_ptr = static_cast<unsigned int>(__cvta_generic_to_shared(barrier));
  uint64_t tensor_desc_ptr      = reinterpret_cast<uint64_t>(tensor_desc);

  asm volatile(
      "cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes "
      "[%0], [%1, {%2, %3}], [%4];\n"
      :
      : "r"(smem_int_ptr),
        "l"(tensor_desc_ptr),
        "r"(access_coord_x),
        "r"(access_coord_y),
        "r"(smem_barrier_int_ptr)
      : "memory");
}

// Layout of shared memory. It contains:
//
// - a buffer to hold a subset of a tensor,
// - a shared memory barrier.
template <int H, int W>
struct smem_t {
  // The destination shared memory buffer of a bulk tensor operation should be
  // 128 byte aligned.
  struct alignas(128) tensor_buffer {
    int data[H][W];

    __device__ constexpr int width() { return W; }
    __device__ constexpr int height() { return H; }
  };

  tensor_buffer buffer;

  // Put the barrier behind the tensor buffer to prevent 100+ bytes of padding.
  __mbarrier_t bar;

  __device__ constexpr int buffer_size_in_bytes() {
    return sizeof(tensor_buffer::data);
  }
};

/*
 * Main kernel: takes a TMA descriptor and two coordinates.
 *
 * Loads a tile into shared memory using TMA and prints the tile.
 */
__global__ void kernel(const __grid_constant__ CUtensorMap tma_desc, int x_0, int y_0) {
  /*
   * ***NOTE***:
   *
   * A CUtensorMap can only be passed as a `const __grid_constant__`
   * parameter. Passing a CUtensorMap in any other way from the host to
   * device can result in difficult if not impossible to debug failures.
   */

  // Declare shared memory to hold tensor buffer and shared memory barrier.
  __shared__ smem_t<SMEM_H, SMEM_W> smem;

  // Utility variable to elect a leader thread.
  bool leader = threadIdx.x == 0;

  if (leader) {
    // Initialize barrier. We will participate in the barrier with `blockDim.x`
    // threads.
    __mbarrier_init(&smem.bar, blockDim.x);
  }
  // Syncthreads so initialized barrier is visible to all threads.
  __syncthreads();

  // This token is created when arriving on the shared memory barrier. It is
  // used again when waiting on the barrier.
  __mbarrier_token_t token;

  // Load first batch
  if (leader) {
    // Initiate bulk tensor copy.
    cp_async_bulk_tensor_2d(&smem.bar, &smem.buffer.data, x_0, y_0, &tma_desc);
    // Arrive with arrival count of 1 and expected transaction count equal to
    // the number of bytes that are copied by cp_async_bulk_tensor_2d.
    token = barrier_arrive1_tx(&smem.bar, smem.buffer_size_in_bytes());
  } else {
    // Other threads arrive with arrival count of 1 and expected tx count of 0.
    token = barrier_arrive1_tx(&smem.bar, 0);
  }

  // The barrier will flip when the following two conditions have been met:
  //
  // - Its arrival count reaches blockDim.x (see __mbarrier_init above).
  //   Typically, each thread will arrive with an arrival count of one, so this
  //   indicates that all threads have arrived.
  //
  // - Its expected transaction count reaches smem.buffer_size_in_bytes(). The
  //   bulk tensor operation will increment the transaction count as it copies
  //   bytes.

  // Wait for barrier to flip. Try_wait puts the thread to sleep while waiting.
  // It is woken up when the barrier flips or when a hardware-defined number of
  // clock cycles have passed. In the second case, we retry waiting.
  while (!barrier_try_wait_token(&smem.bar, token)) { }

  // From this point onwards, the data in smem.buffer is readable by all threads
  // participating in the barrier.

  // Print the data:
  if (leader) {
    printf("\n\nPrinting tile at coordinates x0 = %d, y0 = %d\n", x_0, y_0);

    // Print global x coordinates
    printf("global->\t");
    for (int x = 0; x < smem.buffer.width(); ++x) {
      printf("[%4d] ", x_0 + x);
    }
    printf("\n");

    // Print local x coordinates
    printf("local ->\t");
    for (int x = 0; x < smem.buffer.width(); ++x) {
      printf("[%4d] ", x);
    }
    printf("\n");

    for (int y = 0; y < smem.buffer.height(); ++y) {
      // Print global and local y coordinates
      printf("[%4d] [%2d]\t", y_0 + y, y);
      for (int x = 0; x < smem.buffer.width(); ++x) {
        printf(" %4d ", smem.buffer.data[y][x]);
      }
      printf("\n");
    }

    // Invalidate barrier. If further computations were to take place in the
    // kernel, this allows the memory location of the shared memory barrier to
    // be repurposed.
    __mbarrier_inval(&smem.bar);
  }
}

int main(int argc, char **argv) {
  // Create a 2D tensor in GPU global memory containing linear indices 0, 1, 2, ... .
  // The data layout is row-major.

  // First fill in a vector on the host.
  std::vector<int> tensor_host(H_global * W_global);
  for (int i = 0; i < H_global * W_global; ++i) {
    tensor_host[i] = i;
  }

  // Move it to device
  int *tensor = nullptr;
  CUDA_CHECK(cudaMalloc(&tensor, H_global * W_global * sizeof(int)));
  CUDA_CHECK(cudaMemcpy(tensor, tensor_host.data(), H_global * W_global * sizeof(int), cudaMemcpyHostToDevice));

  // Set up parameters to create TMA descriptor.
  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html

  CUtensorMap tma_desc{};
  CUtensorMapDataType dtype = CUtensorMapDataType::CU_TENSOR_MAP_DATA_TYPE_INT32;
  auto rank = 2;
  uint64_t size[rank] = {W_global, H_global};
  // The stride is the number of bytes to traverse from the first element of one row to the next.
  // It must be a multiple of 16.
  uint64_t stride[rank - 1] = {W_global * sizeof(int)};
  // The box_size is the size of the shared memory buffer that is used as the destination of a TMA transfer.
  uint32_t box_size[rank] = {SMEM_W, SMEM_H};
  // The distance between elements in units of sizeof(element). A stride of 2
  // can be used to load only the real component of a complex-valued tensor, for instance.
  uint32_t elem_stride[rank] = {1, 1};
  // Interleave patterns are sometimes used to accelerate loading of values that
  // are less than 4 bytes long.
  CUtensorMapInterleave interleave = CUtensorMapInterleave::CU_TENSOR_MAP_INTERLEAVE_NONE;
  // Swizzling can be used to avoid shared memory bank conflicts.
  CUtensorMapSwizzle swizzle = CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_NONE;
  CUtensorMapL2promotion l2_promotion = CUtensorMapL2promotion::CU_TENSOR_MAP_L2_PROMOTION_NONE;
  // Any element that is outside of bounds will be set to zero by the TMA transfer.
  CUtensorMapFloatOOBfill oob_fill = CUtensorMapFloatOOBfill::CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE;

  // Get a function pointer to the cuTensorMapEncodeTiled driver API.
  auto cuTensorMapEncodeTiled = get_cuTensorMapEncodeTiled();

  // Create the tensor descriptor.
  CUresult res = cuTensorMapEncodeTiled(
      &tma_desc,    // CUtensorMap *tensorMap,
      dtype,        // CUtensorMapDataType tensorDataType,
      rank,         // cuuint32_t tensorRank,
      tensor,       // void *globalAddress,
      size,         // const cuuint64_t *globalDim,
      stride,       // const cuuint64_t *globalStrides,
      box_size,     // const cuuint32_t *boxDim,
      elem_stride,  // const cuuint32_t *elementStrides,
      interleave,   // CUtensorMapInterleave interleave,
      swizzle,      // CUtensorMapSwizzle swizzle,
      l2_promotion, // CUtensorMapL2promotion l2Promotion,
      oob_fill      // CUtensorMapFloatOOBfill oobFill);
  );
  // Print the result. Should be zero.
  printf("cuTensorMapEncodeTiled returned CUresult: %d\n\n", res);

  CUDA_CHECK(cudaDeviceSynchronize());

  dim3 grid(1);
  dim3 block(128);

  printf("Print the top right corner tile of the tensor:\n");
  kernel<<<grid, block>>>(tma_desc, 0, 0);
  CUDA_CHECK(cudaDeviceSynchronize());

  printf("Negative indices work:\n");
  kernel<<<grid, block>>>(tma_desc, -4, 0);
  CUDA_CHECK(cudaDeviceSynchronize());

  printf("When the indices are out of bounds, the shared memory buffer is filled with zeros:\n");
  kernel<<<grid, block>>>(tma_desc, W_global, H_global);
  CUDA_CHECK(cudaDeviceSynchronize());

  printf(
      "\n**NOTE**: The following code will fail.\n "
      "\nCare must be taken to ensure that the coordinates result in a memory offset\n"
      "that is aligned to 16 bytes. With 32 bit integer elements, x coordinates\n"
      "that are not a multiple of 4 result in a non-recoverable error:\n\n"
  );
  kernel<<<grid, block>>>(tma_desc, 1, 0);
  CUDA_REPORT(cudaDeviceSynchronize());
  kernel<<<grid, block>>>(tma_desc, 2, 0);
  CUDA_REPORT(cudaDeviceSynchronize());
  kernel<<<grid, block>>>(tma_desc, 3, 0);
  CUDA_REPORT(cudaDeviceSynchronize());

  CUDA_REPORT(cudaFree(tensor));
  return 0;
}
Samples/3_CUDA_Features/globalToShmemTMACopy/util.h (new file, 64 lines)
@@ -0,0 +1,64 @@
/* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#pragma once

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Macro for checking cuda errors following a cuda launch or api call
#define CUDA_CHECK(ans)                   \
  {                                       \
    gpuAssert((ans), __FILE__, __LINE__); \
  }

inline void gpuAssert(cudaError_t code, const char *file, int line)
{
  if (code != cudaSuccess)
  {
    fprintf(stderr, "CUDA error: %s %s %d\n", cudaGetErrorString(code), file,
            line);
    exit(code);
  }
}

// Macro to report cuda errors following a cuda launch or api call
#define CUDA_REPORT(ans)                  \
  {                                       \
    gpuReport((ans), __FILE__, __LINE__); \
  }

inline void gpuReport(cudaError_t code, const char *file, int line)
{
  if (code != cudaSuccess)
  {
    fprintf(stderr, "CUDA error (as expected): %s %s %d\n", cudaGetErrorString(code), file,
            line);
  }
}
@@ -31,7 +31,6 @@
 */

 #include <stdio.h>
-#include <string.h>
 #include <cstring>
 #include <iostream>
 #include "cuda.h"

@@ -293,6 +292,11 @@ static void memMapGetDeviceFunction(char **argv) {
                                      jitNumOptions, jitOptions,
                                      (void **)jitOptVals));
     printf("> PTX JIT log:\n%s\n", jitLogBuffer);
+
+    // Clean up dynamically allocated memory
+    delete[] jitOptions;
+    delete[] jitOptVals;
+    delete[] jitLogBuffer;
   } else {
     checkCudaErrors(cuModuleLoad(&cuModule, module_path.c_str()));
   }

@@ -379,7 +383,7 @@ static void childProcess(int devId, int id, char **argv) {
     // deterministic.
     barrierWait(&shm->barrier, &shm->sense, (unsigned int)procCount);
     if (id == 0) {
-      printf("Step %lld done\n", (unsigned long long)i);
+      printf("Step %llu done\n", (unsigned long long)i);
     }
   }

@@ -550,7 +554,7 @@ static void parentProcess(char *app) {
   // Launch the child processes!
   for (i = 0; i < nprocesses; i++) {
     char devIdx[10];
-    char procIdx[10];
+    char procIdx[12];
     char *const args[] = {app, devIdx, procIdx, NULL};
     Process process;

@@ -416,8 +416,8 @@ void initMC(int argc, char **argv) {
     gridSizeLog2.x = n;
   }

-  if (checkCmdLineFlag(argc, (const char **)argv, "gridx")) {
-    n = getCmdLineArgumentInt(argc, (const char **)argv, "gridx");
+  if (checkCmdLineFlag(argc, (const char **)argv, "gridy")) {
+    n = getCmdLineArgumentInt(argc, (const char **)argv, "gridy");
     gridSizeLog2.y = n;
   }

@@ -20,16 +20,19 @@ include_directories(../../../Common)
 find_package(Vulkan)
 find_package(OpenGL)

 # Include the check_include_file macro
 include(CheckIncludeFile)

 # Check for the GLFW/glfw3.h header
 check_include_file("GLFW/glfw3.h" HAVE_GLFW3_H)

-# Find GLFW/glfw3.h header for Windows
+# Find GLFW header and lib for Windows
 if(WIN32)
-    find_file(GLFW3_H "glfw3.h" PATH "$ENV{GLFW_INCLUDES_DIR}/GLFW")
-    if(GLFW3_H)
+    find_file(GLFW3_H "GLFW/glfw3.h" PATH "${GLFW_INCLUDE_DIR}")
+    find_library(GLFW3_LIB "glfw3" PATH "${GLFW_LIB_DIR}")
+    if(GLFW3_H AND GLFW3_LIB)
+        message(STATUS "Found GLFW/glfw3.h and GLFW library.")
         set(HAVE_GLFW3_H 1)
     endif()
 endif()
@@ -51,21 +54,22 @@ if(${Vulkan_FOUND})
        ${Vulkan_INCLUDE_DIRS}
        ${CUDAToolkit_INCLUDE_DIRS}
    )

    target_link_libraries(simpleVulkan
        ${Vulkan_LIBRARIES}
        OpenGL::GL
    )
    if(WIN32)
        target_include_directories(simpleVulkan PUBLIC
            ${GLFW_INCLUDE_DIR}
        )
        target_link_libraries(simpleVulkan
            ${Vulkan_LIBRARIES}
            OpenGL::GL
-           glfw3.dll
+           ${GLFW3_LIB}
        )
    else()
        target_link_libraries(simpleVulkan
            ${Vulkan_LIBRARIES}
            OpenGL::GL
            glfw
        )
    endif()

    add_custom_command(TARGET simpleVulkan POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
            ${CMAKE_CURRENT_SOURCE_DIR}/sinewave.frag
@@ -26,10 +26,12 @@ include(CheckIncludeFile)
 # Check for the GLFW/glfw3.h header
 check_include_file("GLFW/glfw3.h" HAVE_GLFW3_H)

-# Find GLFW/glfw3.h header for Windows
+# Find GLFW header and lib for Windows
 if(WIN32)
-    find_file(GLFW3_H "glfw3.h" PATH "$ENV{GLFW_INCLUDES_DIR}/GLFW")
-    if(GLFW3_H)
+    find_file(GLFW3_H "GLFW/glfw3.h" PATH "${GLFW_INCLUDE_DIR}")
+    find_library(GLFW3_LIB "glfw3" PATH "${GLFW_LIB_DIR}")
+    if(GLFW3_H AND GLFW3_LIB)
+        message(STATUS "Found GLFW/glfw3.h and GLFW library.")
         set(HAVE_GLFW3_H 1)
     endif()
 endif()
@@ -51,23 +53,23 @@ if(${Vulkan_FOUND})
        ${Vulkan_INCLUDE_DIRS}
        ${CUDAToolkit_INCLUDE_DIRS}
    )

    target_link_libraries(simpleVulkanMMAP
        ${Vulkan_LIBRARIES}
        OpenGL::GL
        CUDA::cuda_driver
    )
    if(WIN32)
        target_include_directories(simpleVulkanMMAP PUBLIC
            ${GLFW_INCLUDE_DIR}
        )
        target_link_libraries(simpleVulkanMMAP
            ${Vulkan_LIBRARIES}
            OpenGL::GL
            CUDA::cuda_driver
-           glfw3.dll
+           ${GLFW3_LIB}
        )
    else()
        target_link_libraries(simpleVulkanMMAP
            ${Vulkan_LIBRARIES}
            OpenGL::GL
            CUDA::cuda_driver
            glfw
        )
    endif()

    add_custom_command(TARGET simpleVulkanMMAP POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
            ${CMAKE_CURRENT_SOURCE_DIR}/montecarlo.frag
@@ -71,7 +71,7 @@ if(${OpenGL_FOUND})
        POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy
            ${CMAKE_CURRENT_SOURCE_DIR}/../../../bin/win64/$<CONFIGURATION>/glew64.dll
-           ${CMAKE_CURRENT_BINARY_DIR}
+           ${CMAKE_CURRENT_BINARY_DIR}/$<CONFIGURATION>
    )
endif()

@@ -26,10 +26,12 @@ include(CheckIncludeFile)
 # Check for the GLFW/glfw3.h header
 check_include_file("GLFW/glfw3.h" HAVE_GLFW3_H)

-# Find GLFW/glfw3.h header for Windows
+# Find GLFW header and lib for Windows
 if(WIN32)
-    find_file(GLFW3_H "glfw3.h" PATH "$ENV{GLFW_INCLUDES_DIR}/GLFW")
-    if(GLFW3_H)
+    find_file(GLFW3_H "GLFW/glfw3.h" PATH "${GLFW_INCLUDE_DIR}")
+    find_file(GLFW3_LIB "glfw3" PATH "${GLFW_LIB_DIR}")
+    if(GLFW3_H AND GLFW3_LIB)
+        message(STATUS "Found GLFW/glfw3.h and GLFW library.")
         set(HAVE_GLFW3_H 1)
     endif()
 endif()
@@ -51,21 +53,22 @@ if(${Vulkan_FOUND})
        ${Vulkan_INCLUDE_DIRS}
        ${CUDAToolkit_INCLUDE_DIRS}
    )

    target_link_libraries(vulkanImageCUDA
        ${Vulkan_LIBRARIES}
        OpenGL::GL
    )
    if(WIN32)
        target_include_directories(vulkanImageCUDA PUBLIC
            ${GLFW_INCLUDE_DIR}
        )
        target_link_libraries(vulkanImageCUDA
            ${Vulkan_LIBRARIES}
            OpenGL::GL
-           glfw3.dll
+           ${GLFW3_LIB}
        )
    else()
        target_link_libraries(vulkanImageCUDA
            ${Vulkan_LIBRARIES}
            OpenGL::GL
            glfw
        )
    endif()

    add_custom_command(TARGET vulkanImageCUDA POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
            ${CMAKE_CURRENT_SOURCE_DIR}/shader.frag