mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2025-04-21 23:31:23 +01:00
Compare commits
26 Commits
Author | SHA1 | Date | |
---|---|---|---|
![]() |
7b60178984 | ||
![]() |
929ac4c8b5 | ||
![]() |
291435e0b4 | ||
![]() |
990ebc01c2 | ||
![]() |
e7b23470d5 | ||
![]() |
3e8f91d1a1 | ||
![]() |
f3b7c41ad6 | ||
![]() |
29fb758e62 | ||
![]() |
3bc08136ff | ||
![]() |
85eefa06c4 | ||
![]() |
c357dd1e6b | ||
![]() |
efb46383e0 | ||
![]() |
8d564d5e3a | ||
![]() |
37c5bcbef4 | ||
![]() |
940a4c7a91 | ||
![]() |
61bd39800d | ||
![]() |
8a96d2eee7 | ||
![]() |
e762d58260 | ||
![]() |
8fd1701744 | ||
![]() |
94765c1597 | ||
![]() |
c87881f02c | ||
![]() |
25400b6b3c | ||
![]() |
e24f62e28c | ||
![]() |
22424227e7 | ||
![]() |
42ff742bf5 | ||
![]() |
8ccb13c6f0 |
@ -36,6 +36,7 @@
|
|||||||
* `cuDLALayerwiseStatsHybrid`
|
* `cuDLALayerwiseStatsHybrid`
|
||||||
* `cuDLALayerwiseStatsStandalone`
|
* `cuDLALayerwiseStatsStandalone`
|
||||||
* `cuDLAStandaloneMode`
|
* `cuDLAStandaloneMode`
|
||||||
|
* `cudaNvSciBufMultiplanar`
|
||||||
* `cudaNvSciNvMedia`
|
* `cudaNvSciNvMedia`
|
||||||
* `fluidsGLES`
|
* `fluidsGLES`
|
||||||
* `nbody_opengles`
|
* `nbody_opengles`
|
||||||
|
@ -241,7 +241,7 @@ inline int gpuGetMaxGflopsDeviceIdDRV() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
unsigned long long compute_perf =
|
unsigned long long compute_perf =
|
||||||
(unsigned long long)(multiProcessorCount * sm_per_multiproc *
|
((unsigned long long)multiProcessorCount * sm_per_multiproc *
|
||||||
clockRate);
|
clockRate);
|
||||||
|
|
||||||
if (compute_perf > max_compute_perf) {
|
if (compute_perf > max_compute_perf) {
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
# CUDA Samples
|
# CUDA Samples
|
||||||
|
|
||||||
Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 12.6](https://developer.nvidia.com/cuda-downloads).
|
Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 12.8](https://developer.nvidia.com/cuda-downloads).
|
||||||
|
|
||||||
## Release Notes
|
## Release Notes
|
||||||
|
|
||||||
@ -203,7 +203,7 @@ Vulkan is a low-overhead, cross-platform 3D graphics and compute API. Vulkan tar
|
|||||||
#### GLFW
|
#### GLFW
|
||||||
GLFW is a lightweight, open-source library designed for managing OpenGL, OpenGL ES, and Vulkan contexts. It simplifies the process of creating and managing windows, handling user input (keyboard, mouse, and joystick), and working with multiple monitors in a cross-platform manner.
|
GLFW is a lightweight, open-source library designed for managing OpenGL, OpenGL ES, and Vulkan contexts. It simplifies the process of creating and managing windows, handling user input (keyboard, mouse, and joystick), and working with multiple monitors in a cross-platform manner.
|
||||||
|
|
||||||
To set up GLFW on a Windows system, Download the pre-built binaries from [GLFW website](https://www.glfw.org/download.html) and extract the zip file into the folder, pass the GLFW include header as `-DGLFW_INCLUDE_DIR` for cmake configuring and follow the Build_instructions.txt in the sample folder to set up the t.
|
To set up GLFW on a Windows system, Download the pre-built binaries from [GLFW website](https://www.glfw.org/download.html) and extract the zip file into the folder, pass the GLFW include header folder as `-DGLFW_INCLUDE_DIR` and lib folder as `-DGLFW_LIB_DIR` for cmake configuring.
|
||||||
|
|
||||||
#### OpenMP
|
#### OpenMP
|
||||||
|
|
||||||
|
@ -1,20 +1,3 @@
|
|||||||
cmake_minimum_required(VERSION 3.20)
|
|
||||||
|
|
||||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
|
|
||||||
|
|
||||||
project(simpleCallback LANGUAGES C CXX CUDA)
|
|
||||||
|
|
||||||
find_package(CUDAToolkit REQUIRED)
|
|
||||||
|
|
||||||
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
|
|
||||||
|
|
||||||
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
|
|
||||||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
|
|
||||||
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
|
|
||||||
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (expensive)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
|
|
||||||
add_subdirectory(UnifiedMemoryStreams)
|
add_subdirectory(UnifiedMemoryStreams)
|
||||||
add_subdirectory(asyncAPI)
|
add_subdirectory(asyncAPI)
|
||||||
add_subdirectory(clock)
|
add_subdirectory(clock)
|
||||||
@ -55,6 +38,7 @@ add_subdirectory(simpleTexture3D)
|
|||||||
add_subdirectory(simpleTextureDrv)
|
add_subdirectory(simpleTextureDrv)
|
||||||
add_subdirectory(simpleVoteIntrinsics)
|
add_subdirectory(simpleVoteIntrinsics)
|
||||||
add_subdirectory(simpleZeroCopy)
|
add_subdirectory(simpleZeroCopy)
|
||||||
|
add_subdirectory(template)
|
||||||
add_subdirectory(systemWideAtomics)
|
add_subdirectory(systemWideAtomics)
|
||||||
add_subdirectory(vectorAdd)
|
add_subdirectory(vectorAdd)
|
||||||
add_subdirectory(vectorAddDrv)
|
add_subdirectory(vectorAddDrv)
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
## Description
|
## Description
|
||||||
|
|
||||||
This sample implements matrix multiplication and is exactly the same as Chapter 6 of the programming guide. It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for matrix multiplication. To illustrate GPU performance for matrix multiply, this sample also shows how to use the new CUDA 4.0 interface for CUBLAS to demonstrate high-performance performance for matrix multiplication.
|
This sample implements matrix multiplication and is exactly the same as the second example of the [Shared Memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory) section of the programming guide. It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for matrix multiplication. To illustrate GPU performance for matrix multiply, this sample also shows how to use the CUDA 4.0+ interface for cuBLAS to demonstrate high-performance performance for matrix multiplication.
|
||||||
|
|
||||||
## Key Concepts
|
## Key Concepts
|
||||||
|
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
## Description
|
## Description
|
||||||
|
|
||||||
This sample implements matrix multiplication and is exactly the same as Chapter 6 of the programming guide. It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for matrix multiplication. To illustrate GPU performance for matrix multiply, this sample also shows how to use the new CUDA 4.0 interface for CUBLAS to demonstrate high-performance performance for matrix multiplication.
|
This sample implements matrix multiplication and is exactly the same as the second example of the [Shared Memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory) section of the programming guide. It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for matrix multiplication. To illustrate GPU performance for matrix multiply, this sample also shows how to use the CUDA 4.0+ interface for cuBLAS to demonstrate high-performance performance for matrix multiplication.
|
||||||
|
|
||||||
## Key Concepts
|
## Key Concepts
|
||||||
|
|
||||||
|
@ -57,7 +57,7 @@ int main(int argc, char **argv) {
|
|||||||
// Get GPU information
|
// Get GPU information
|
||||||
checkCudaErrors(cudaGetDevice(&devID));
|
checkCudaErrors(cudaGetDevice(&devID));
|
||||||
checkCudaErrors(cudaGetDeviceProperties(&props, devID));
|
checkCudaErrors(cudaGetDeviceProperties(&props, devID));
|
||||||
printf("Device %d: \"%s\" with Compute %d.%d capability\n", devID, props.name,
|
printf("Device %d: \"%s\" with Compute capability %d.%d\n", devID, props.name,
|
||||||
props.major, props.minor);
|
props.major, props.minor);
|
||||||
|
|
||||||
printf("printf() is called. Output:\n\n");
|
printf("printf() is called. Output:\n\n");
|
||||||
|
@ -20,7 +20,7 @@ include_directories(../../../Common)
|
|||||||
|
|
||||||
# Source file
|
# Source file
|
||||||
# Add target for template
|
# Add target for template
|
||||||
add_executable(template template.cu)
|
add_executable(template template.cu template_cpu.cpp)
|
||||||
|
|
||||||
target_compile_options(template PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
|
target_compile_options(template PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
|
||||||
|
|
||||||
|
@ -11,5 +11,4 @@ This sample enumerates the properties of the CUDA devices present in the system.
|
|||||||
This sample enumerates the properties of the CUDA devices present using CUDA Driver API calls
|
This sample enumerates the properties of the CUDA devices present using CUDA Driver API calls
|
||||||
|
|
||||||
### [topologyQuery](./topologyQuery)
|
### [topologyQuery](./topologyQuery)
|
||||||
A simple exemple on how to query the topology of a system with multiple GPU
|
A simple example on how to query the topology of a system with multiple GPU
|
||||||
|
|
||||||
|
@ -77,7 +77,6 @@ int filter_radius = 14;
|
|||||||
int nthreads = 64;
|
int nthreads = 64;
|
||||||
unsigned int width, height;
|
unsigned int width, height;
|
||||||
unsigned int *h_img = NULL;
|
unsigned int *h_img = NULL;
|
||||||
unsigned int *d_img = NULL;
|
|
||||||
unsigned int *d_temp = NULL;
|
unsigned int *d_temp = NULL;
|
||||||
|
|
||||||
GLuint pbo; // OpenGL pixel buffer object
|
GLuint pbo; // OpenGL pixel buffer object
|
||||||
@ -108,11 +107,11 @@ extern "C" void computeGold(float *id, float *od, int w, int h, int n);
|
|||||||
// These are CUDA functions to handle allocation and launching the kernels
|
// These are CUDA functions to handle allocation and launching the kernels
|
||||||
extern "C" void initTexture(int width, int height, void *pImage, bool useRGBA);
|
extern "C" void initTexture(int width, int height, void *pImage, bool useRGBA);
|
||||||
extern "C" void freeTextures();
|
extern "C" void freeTextures();
|
||||||
extern "C" double boxFilter(float *d_src, float *d_temp, float *d_dest,
|
extern "C" double boxFilter(float *d_temp, float *d_dest,
|
||||||
int width, int height, int radius, int iterations,
|
int width, int height, int radius, int iterations,
|
||||||
int nthreads, StopWatchInterface *timer);
|
int nthreads, StopWatchInterface *timer);
|
||||||
|
|
||||||
extern "C" double boxFilterRGBA(unsigned int *d_src, unsigned int *d_temp,
|
extern "C" double boxFilterRGBA(unsigned int *d_temp,
|
||||||
unsigned int *d_dest, int width, int height,
|
unsigned int *d_dest, int width, int height,
|
||||||
int radius, int iterations, int nthreads,
|
int radius, int iterations, int nthreads,
|
||||||
StopWatchInterface *timer);
|
StopWatchInterface *timer);
|
||||||
@ -165,7 +164,7 @@ void display() {
|
|||||||
size_t num_bytes;
|
size_t num_bytes;
|
||||||
checkCudaErrors(cudaGraphicsResourceGetMappedPointer(
|
checkCudaErrors(cudaGraphicsResourceGetMappedPointer(
|
||||||
(void **)&d_result, &num_bytes, cuda_pbo_resource));
|
(void **)&d_result, &num_bytes, cuda_pbo_resource));
|
||||||
boxFilterRGBA(d_img, d_temp, d_result, width, height, filter_radius,
|
boxFilterRGBA(d_temp, d_result, width, height, filter_radius,
|
||||||
iterations, nthreads, kernel_timer);
|
iterations, nthreads, kernel_timer);
|
||||||
|
|
||||||
checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));
|
checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));
|
||||||
@ -282,11 +281,7 @@ void reshape(int x, int y) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void initCuda(bool useRGBA) {
|
void initCuda(bool useRGBA) {
|
||||||
// allocate device memory
|
checkCudaErrors(cudaMalloc((void **)&d_temp, (width * height * sizeof(unsigned int))));
|
||||||
checkCudaErrors(
|
|
||||||
cudaMalloc((void **)&d_img, (width * height * sizeof(unsigned int))));
|
|
||||||
checkCudaErrors(
|
|
||||||
cudaMalloc((void **)&d_temp, (width * height * sizeof(unsigned int))));
|
|
||||||
|
|
||||||
// Refer to boxFilter_kernel.cu for implementation
|
// Refer to boxFilter_kernel.cu for implementation
|
||||||
initTexture(width, height, h_img, useRGBA);
|
initTexture(width, height, h_img, useRGBA);
|
||||||
@ -304,11 +299,6 @@ void cleanup() {
|
|||||||
h_img = NULL;
|
h_img = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (d_img) {
|
|
||||||
cudaFree(d_img);
|
|
||||||
d_img = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (d_temp) {
|
if (d_temp) {
|
||||||
cudaFree(d_temp);
|
cudaFree(d_temp);
|
||||||
d_temp = NULL;
|
d_temp = NULL;
|
||||||
@ -413,7 +403,7 @@ int runBenchmark() {
|
|||||||
cudaMalloc((void **)&d_result, width * height * sizeof(unsigned int)));
|
cudaMalloc((void **)&d_result, width * height * sizeof(unsigned int)));
|
||||||
|
|
||||||
// warm-up
|
// warm-up
|
||||||
boxFilterRGBA(d_img, d_temp, d_temp, width, height, filter_radius, iterations,
|
boxFilterRGBA(d_temp, d_temp, width, height, filter_radius, iterations,
|
||||||
nthreads, kernel_timer);
|
nthreads, kernel_timer);
|
||||||
checkCudaErrors(cudaDeviceSynchronize());
|
checkCudaErrors(cudaDeviceSynchronize());
|
||||||
|
|
||||||
@ -426,7 +416,7 @@ int runBenchmark() {
|
|||||||
|
|
||||||
for (int i = 0; i < iCycles; i++) {
|
for (int i = 0; i < iCycles; i++) {
|
||||||
dProcessingTime +=
|
dProcessingTime +=
|
||||||
boxFilterRGBA(d_img, d_temp, d_img, width, height, filter_radius,
|
boxFilterRGBA(d_temp, d_temp, width, height, filter_radius,
|
||||||
iterations, nthreads, kernel_timer);
|
iterations, nthreads, kernel_timer);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -469,7 +459,7 @@ int runSingleTest(char *ref_file, char *exec_path) {
|
|||||||
{
|
{
|
||||||
printf("%s (radius=%d) (passes=%d) ", sSDKsample, filter_radius,
|
printf("%s (radius=%d) (passes=%d) ", sSDKsample, filter_radius,
|
||||||
iterations);
|
iterations);
|
||||||
boxFilterRGBA(d_img, d_temp, d_result, width, height, filter_radius,
|
boxFilterRGBA(d_temp, d_result, width, height, filter_radius,
|
||||||
iterations, nthreads, kernel_timer);
|
iterations, nthreads, kernel_timer);
|
||||||
|
|
||||||
// check if kernel execution generated an error
|
// check if kernel execution generated an error
|
||||||
|
@ -399,7 +399,6 @@ extern "C" void freeTextures() {
|
|||||||
Perform 2D box filter on image using CUDA
|
Perform 2D box filter on image using CUDA
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
d_src - pointer to input image in device memory
|
|
||||||
d_temp - pointer to temporary storage in device memory
|
d_temp - pointer to temporary storage in device memory
|
||||||
d_dest - pointer to destination image in device memory
|
d_dest - pointer to destination image in device memory
|
||||||
width - image width
|
width - image width
|
||||||
@ -408,7 +407,7 @@ extern "C" void freeTextures() {
|
|||||||
iterations - number of iterations
|
iterations - number of iterations
|
||||||
|
|
||||||
*/
|
*/
|
||||||
extern "C" double boxFilter(float *d_src, float *d_temp, float *d_dest,
|
extern "C" double boxFilter(float *d_temp, float *d_dest,
|
||||||
int width, int height, int radius, int iterations,
|
int width, int height, int radius, int iterations,
|
||||||
int nthreads, StopWatchInterface *timer) {
|
int nthreads, StopWatchInterface *timer) {
|
||||||
// var for kernel timing
|
// var for kernel timing
|
||||||
@ -447,7 +446,7 @@ extern "C" double boxFilter(float *d_src, float *d_temp, float *d_dest,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// RGBA version
|
// RGBA version
|
||||||
extern "C" double boxFilterRGBA(unsigned int *d_src, unsigned int *d_temp,
|
extern "C" double boxFilterRGBA(unsigned int *d_temp,
|
||||||
unsigned int *d_dest, int width, int height,
|
unsigned int *d_dest, int width, int height,
|
||||||
int radius, int iterations, int nthreads,
|
int radius, int iterations, int nthreads,
|
||||||
StopWatchInterface *timer) {
|
StopWatchInterface *timer) {
|
||||||
|
@ -34,13 +34,12 @@
|
|||||||
#define _KERNELS_H_
|
#define _KERNELS_H_
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <thrust/functional.h>
|
|
||||||
|
|
||||||
#include "common.cuh"
|
#include "common.cuh"
|
||||||
|
|
||||||
// Functors used with thrust library.
|
// Functors used with thrust library.
|
||||||
template <typename Input>
|
template <typename Input>
|
||||||
struct IsGreaterEqualThan : public thrust::unary_function<Input, bool>
|
struct IsGreaterEqualThan
|
||||||
{
|
{
|
||||||
__host__ __device__ IsGreaterEqualThan(uint upperBound) :
|
__host__ __device__ IsGreaterEqualThan(uint upperBound) :
|
||||||
upperBound_(upperBound) {}
|
upperBound_(upperBound) {}
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
## Description
|
## Description
|
||||||
|
|
||||||
This sample implements bitonic sort and odd-even merge sort (also known as Batcher's sort), algorithms belonging to the class of sorting networks. While generally subefficient, for large sequences compared to algorithms with better asymptotic algorithmic complexity (i.e. merge sort or radix sort), this may be the preferred algorithms of choice for sorting batches of short-sized to mid-sized (key, value) array pairs. Refer to an excellent tutorial by H. W. Lang http://www.iti.fh-flensburg.de/lang/algorithmen/sortieren/networks/indexen.htm
|
This sample implements bitonic sort and odd-even merge sort (also known as Batcher's sort), algorithms belonging to the class of sorting networks. While generally subefficient, for large sequences compared to algorithms with better asymptotic algorithmic complexity (i.e. merge sort or radix sort), this may be the preferred algorithms of choice for sorting batches of short-sized to mid-sized (key, value) array pairs. Refer to an excellent tutorial by H. W. Lang https://hwlang.de/algorithmen/sortieren/bitonic/bitonicen.htm
|
||||||
|
|
||||||
## Key Concepts
|
## Key Concepts
|
||||||
|
|
||||||
|
@ -59,6 +59,7 @@ typedef struct shmStruct_st {
|
|||||||
size_t nprocesses;
|
size_t nprocesses;
|
||||||
int barrier;
|
int barrier;
|
||||||
int sense;
|
int sense;
|
||||||
|
cudaMemAllocationHandleType handleType;
|
||||||
int devices[MAX_DEVICES];
|
int devices[MAX_DEVICES];
|
||||||
cudaMemPoolPtrExportData exportPtrData[MAX_DEVICES];
|
cudaMemPoolPtrExportData exportPtrData[MAX_DEVICES];
|
||||||
} shmStruct;
|
} shmStruct;
|
||||||
@ -126,7 +127,7 @@ static void childProcess(int id) {
|
|||||||
|
|
||||||
std::vector<cudaMemPool_t> pools(shm->nprocesses);
|
std::vector<cudaMemPool_t> pools(shm->nprocesses);
|
||||||
|
|
||||||
cudaMemAllocationHandleType handleType = cudaMemHandleTypePosixFileDescriptor;
|
cudaMemAllocationHandleType handleType = shm->handleType;
|
||||||
|
|
||||||
// Import mem pools from all the devices created in the master
|
// Import mem pools from all the devices created in the master
|
||||||
// process using shareable handles received via socket
|
// process using shareable handles received via socket
|
||||||
@ -239,6 +240,7 @@ static void parentProcess(char *app) {
|
|||||||
volatile shmStruct *shm = NULL;
|
volatile shmStruct *shm = NULL;
|
||||||
std::vector<void *> ptrs;
|
std::vector<void *> ptrs;
|
||||||
std::vector<Process> processes;
|
std::vector<Process> processes;
|
||||||
|
cudaMemAllocationHandleType handleType = cudaMemHandleTypeNone;
|
||||||
|
|
||||||
checkCudaErrors(cudaGetDeviceCount(&devCount));
|
checkCudaErrors(cudaGetDeviceCount(&devCount));
|
||||||
std::vector<CUdevice> devices(devCount);
|
std::vector<CUdevice> devices(devCount);
|
||||||
@ -270,22 +272,32 @@ static void parentProcess(char *app) {
|
|||||||
printf("Device %d does not support cuda memory pools, skipping...\n", i);
|
printf("Device %d does not support cuda memory pools, skipping...\n", i);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
int deviceSupportsIpcHandle = 0;
|
int supportedHandleTypes = 0;
|
||||||
#if defined(__linux__)
|
checkCudaErrors(cudaDeviceGetAttribute(&supportedHandleTypes,
|
||||||
checkCudaErrors(cuDeviceGetAttribute(
|
cudaDevAttrMemoryPoolSupportedHandleTypes, i));
|
||||||
&deviceSupportsIpcHandle,
|
if (supportedHandleTypes == 0) {
|
||||||
CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED,
|
printf("Device %d does not support Memory pool based IPC, skipping...\n", i);
|
||||||
devices[i]));
|
|
||||||
#else
|
|
||||||
cuDeviceGetAttribute(&deviceSupportsIpcHandle,
|
|
||||||
CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED,
|
|
||||||
devices[i]);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (!deviceSupportsIpcHandle) {
|
|
||||||
printf("Device %d does not support CUDA IPC Handle, skipping...\n", i);
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (handleType == cudaMemHandleTypeNone) {
|
||||||
|
if (supportedHandleTypes & cudaMemHandleTypePosixFileDescriptor) {
|
||||||
|
handleType = cudaMemHandleTypePosixFileDescriptor;
|
||||||
|
}
|
||||||
|
else if (supportedHandleTypes & cudaMemHandleTypeWin32) {
|
||||||
|
handleType = cudaMemHandleTypeWin32;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
printf("Device %d does not support any supported handle types, skipping...\n", i);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
if ((supportedHandleTypes & handleType) != handleType) {
|
||||||
|
printf("Mixed handle types are not supported, waiving test\n");
|
||||||
|
exit(EXIT_WAIVED);
|
||||||
|
}
|
||||||
|
}
|
||||||
// This sample requires two processes accessing each device, so we need
|
// This sample requires two processes accessing each device, so we need
|
||||||
// to ensure exclusive or prohibited mode is not set
|
// to ensure exclusive or prohibited mode is not set
|
||||||
if (prop.computeMode != cudaComputeModeDefault) {
|
if (prop.computeMode != cudaComputeModeDefault) {
|
||||||
@ -337,6 +349,11 @@ static void parentProcess(char *app) {
|
|||||||
exit(EXIT_WAIVED);
|
exit(EXIT_WAIVED);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (handleType == cudaMemHandleTypeNone) {
|
||||||
|
printf("No supported handle types found, waiving test\n");
|
||||||
|
exit(EXIT_WAIVED);
|
||||||
|
}
|
||||||
|
|
||||||
std::vector<ShareableHandle> shareableHandles(shm->nprocesses);
|
std::vector<ShareableHandle> shareableHandles(shm->nprocesses);
|
||||||
std::vector<cudaStream_t> streams(shm->nprocesses);
|
std::vector<cudaStream_t> streams(shm->nprocesses);
|
||||||
std::vector<cudaMemPool_t> pools(shm->nprocesses);
|
std::vector<cudaMemPool_t> pools(shm->nprocesses);
|
||||||
@ -352,7 +369,7 @@ static void parentProcess(char *app) {
|
|||||||
cudaMemPoolProps poolProps;
|
cudaMemPoolProps poolProps;
|
||||||
memset(&poolProps, 0, sizeof(cudaMemPoolProps));
|
memset(&poolProps, 0, sizeof(cudaMemPoolProps));
|
||||||
poolProps.allocType = cudaMemAllocationTypePinned;
|
poolProps.allocType = cudaMemAllocationTypePinned;
|
||||||
poolProps.handleTypes = cudaMemHandleTypePosixFileDescriptor;
|
poolProps.handleTypes = handleType;
|
||||||
|
|
||||||
poolProps.location.type = cudaMemLocationTypeDevice;
|
poolProps.location.type = cudaMemLocationTypeDevice;
|
||||||
poolProps.location.id = shm->devices[i];
|
poolProps.location.id = shm->devices[i];
|
||||||
@ -360,8 +377,6 @@ static void parentProcess(char *app) {
|
|||||||
checkCudaErrors(cudaMemPoolCreate(&pools[i], &poolProps));
|
checkCudaErrors(cudaMemPoolCreate(&pools[i], &poolProps));
|
||||||
|
|
||||||
// Query the shareable handle for the pool
|
// Query the shareable handle for the pool
|
||||||
cudaMemAllocationHandleType handleType =
|
|
||||||
cudaMemHandleTypePosixFileDescriptor;
|
|
||||||
// Allocate memory in a stream from the pool just created
|
// Allocate memory in a stream from the pool just created
|
||||||
checkCudaErrors(cudaMallocAsync(&ptr, DATA_SIZE, pools[i], streams[i]));
|
checkCudaErrors(cudaMallocAsync(&ptr, DATA_SIZE, pools[i], streams[i]));
|
||||||
|
|
||||||
@ -378,6 +393,8 @@ static void parentProcess(char *app) {
|
|||||||
ptrs.push_back(ptr);
|
ptrs.push_back(ptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
shm->handleType = handleType;
|
||||||
|
|
||||||
// Launch the child processes!
|
// Launch the child processes!
|
||||||
for (i = 0; i < shm->nprocesses; i++) {
|
for (i = 0; i < shm->nprocesses; i++) {
|
||||||
char devIdx[10];
|
char devIdx[10];
|
||||||
@ -430,7 +447,7 @@ static void parentProcess(char *app) {
|
|||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
#if defined(__arm__) || defined(__aarch64__) || defined(WIN32) || \
|
#if defined(__arm__) || defined(__aarch64__) || defined(WIN32) || \
|
||||||
defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
printf("Not supported on ARM\n");
|
printf("Not supported on ARM or Windows\n");
|
||||||
return EXIT_WAIVED;
|
return EXIT_WAIVED;
|
||||||
#else
|
#else
|
||||||
if (argc == 1) {
|
if (argc == 1) {
|
||||||
|
@ -59,7 +59,7 @@
|
|||||||
|
|
||||||
__global__ void ifGraphKernelA(char *dPtr, cudaGraphConditionalHandle handle)
|
__global__ void ifGraphKernelA(char *dPtr, cudaGraphConditionalHandle handle)
|
||||||
{
|
{
|
||||||
// In this example, condition is set if *dPtr is odd
|
// In this example, condition is set if *dPtr is odd
|
||||||
unsigned int value = *dPtr & 0x01;
|
unsigned int value = *dPtr & 0x01;
|
||||||
cudaGraphSetConditional(handle, value);
|
cudaGraphSetConditional(handle, value);
|
||||||
printf("GPU: Handle set to %d\n", value);
|
printf("GPU: Handle set to %d\n", value);
|
||||||
@ -68,21 +68,22 @@ __global__ void ifGraphKernelA(char *dPtr, cudaGraphConditionalHandle handle)
|
|||||||
// This kernel will only be executed if the condition is true
|
// This kernel will only be executed if the condition is true
|
||||||
__global__ void ifGraphKernelC(void)
|
__global__ void ifGraphKernelC(void)
|
||||||
{
|
{
|
||||||
printf("GPU: Hello from the GPU!\n");
|
printf("GPU: Hello from the GPU! The condition was true.\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Setup and launch the graph
|
// Setup and launch the graph
|
||||||
void simpleIfGraph(void)
|
void simpleIfGraph(void)
|
||||||
{
|
{
|
||||||
cudaGraph_t graph;
|
cudaGraph_t graph;
|
||||||
cudaGraphExec_t graphExec;
|
cudaGraphExec_t graphExec;
|
||||||
cudaGraphNode_t node;
|
cudaGraphNode_t kernelNode;
|
||||||
|
cudaGraphNode_t conditionalNode;
|
||||||
|
|
||||||
void *kernelArgs[2];
|
void *kernelArgs[2];
|
||||||
|
|
||||||
// Allocate a byte of device memory to use as input
|
// Allocate a byte of device memory to use as input
|
||||||
char *dPtr;
|
char *dPtr;
|
||||||
checkCudaErrors(cudaMalloc((void**)&dPtr, 1));
|
checkCudaErrors(cudaMalloc((void **)&dPtr, 1));
|
||||||
|
|
||||||
printf("simpleIfGraph: Building graph...\n");
|
printf("simpleIfGraph: Building graph...\n");
|
||||||
cudaGraphCreate(&graph, 0);
|
cudaGraphCreate(&graph, 0);
|
||||||
@ -92,26 +93,26 @@ void simpleIfGraph(void)
|
|||||||
cudaGraphConditionalHandleCreate(&handle, graph);
|
cudaGraphConditionalHandleCreate(&handle, graph);
|
||||||
|
|
||||||
// Use a kernel upstream of the conditional to set the handle value
|
// Use a kernel upstream of the conditional to set the handle value
|
||||||
cudaGraphNodeParams params = { cudaGraphNodeTypeKernel };
|
cudaGraphNodeParams params = {cudaGraphNodeTypeKernel};
|
||||||
params.kernel.func = (void *)ifGraphKernelA;
|
params.kernel.func = (void *)ifGraphKernelA;
|
||||||
params.kernel.gridDim.x = params.kernel.gridDim.y = params.kernel.gridDim.z = 1;
|
params.kernel.blockDim.x = params.kernel.blockDim.y = params.kernel.blockDim.z = 1;
|
||||||
params.kernel.blockDim.x = params.kernel.blockDim.y = params.kernel.blockDim.z = 1;
|
params.kernel.gridDim.x = params.kernel.gridDim.y = params.kernel.gridDim.z = 1;
|
||||||
params.kernel.kernelParams = kernelArgs;
|
params.kernel.kernelParams = kernelArgs;
|
||||||
kernelArgs[0] = &dPtr;
|
kernelArgs[0] = &dPtr;
|
||||||
kernelArgs[1] = &handle;
|
kernelArgs[1] = &handle;
|
||||||
checkCudaErrors(cudaGraphAddNode(&node, graph, NULL, 0, ¶ms));
|
checkCudaErrors(cudaGraphAddNode(&kernelNode, graph, NULL, 0, ¶ms));
|
||||||
|
|
||||||
cudaGraphNodeParams cParams = { cudaGraphNodeTypeConditional };
|
cudaGraphNodeParams cParams = {cudaGraphNodeTypeConditional};
|
||||||
cParams.conditional.handle = handle;
|
cParams.conditional.handle = handle;
|
||||||
cParams.conditional.type = cudaGraphCondTypeIf;
|
cParams.conditional.type = cudaGraphCondTypeIf;
|
||||||
cParams.conditional.size = 1;
|
cParams.conditional.size = 1;
|
||||||
checkCudaErrors(cudaGraphAddNode(&node, graph, &node, 1, &cParams));
|
checkCudaErrors(cudaGraphAddNode(&conditionalNode, graph, &kernelNode, 1, &cParams));
|
||||||
|
|
||||||
cudaGraph_t bodyGraph = cParams.conditional.phGraph_out[0];
|
cudaGraph_t bodyGraph = cParams.conditional.phGraph_out[0];
|
||||||
|
|
||||||
// Populate the body of the conditional node
|
// Populate the body of the conditional node
|
||||||
cudaGraphNode_t bodyNode;
|
cudaGraphNode_t bodyNode;
|
||||||
params.kernel.func = (void *)ifGraphKernelC;
|
params.kernel.func = (void *)ifGraphKernelC;
|
||||||
params.kernel.kernelParams = nullptr;
|
params.kernel.kernelParams = nullptr;
|
||||||
checkCudaErrors(cudaGraphAddNode(&bodyNode, bodyGraph, NULL, 0, ¶ms));
|
checkCudaErrors(cudaGraphAddNode(&bodyNode, bodyGraph, NULL, 0, ¶ms));
|
||||||
|
|
||||||
@ -119,13 +120,13 @@ void simpleIfGraph(void)
|
|||||||
|
|
||||||
// Initialize device memory and launch the graph
|
// Initialize device memory and launch the graph
|
||||||
checkCudaErrors(cudaMemset(dPtr, 0, 1)); // Set dPtr to 0
|
checkCudaErrors(cudaMemset(dPtr, 0, 1)); // Set dPtr to 0
|
||||||
printf("Host: Launching graph with conditional value set to false\n");
|
printf("Host: Launching graph with device memory set to 0\n");
|
||||||
checkCudaErrors(cudaGraphLaunch(graphExec, 0));
|
checkCudaErrors(cudaGraphLaunch(graphExec, 0));
|
||||||
checkCudaErrors(cudaDeviceSynchronize());
|
checkCudaErrors(cudaDeviceSynchronize());
|
||||||
|
|
||||||
// Initialize device memory and launch the graph
|
// Initialize device memory and launch the graph
|
||||||
checkCudaErrors(cudaMemset(dPtr, 1, 1)); // Set dPtr to 1
|
checkCudaErrors(cudaMemset(dPtr, 1, 1)); // Set dPtr to 1
|
||||||
printf("Host: Launching graph with conditional value set to true\n");
|
printf("Host: Launching graph with device memory set to 1\n");
|
||||||
checkCudaErrors(cudaGraphLaunch(graphExec, 0));
|
checkCudaErrors(cudaGraphLaunch(graphExec, 0));
|
||||||
checkCudaErrors(cudaDeviceSynchronize());
|
checkCudaErrors(cudaDeviceSynchronize());
|
||||||
|
|
||||||
@ -158,7 +159,8 @@ __global__ void doWhileEmptyKernel(void)
|
|||||||
|
|
||||||
__global__ void doWhileLoopKernel(char *dPtr, cudaGraphConditionalHandle handle)
|
__global__ void doWhileLoopKernel(char *dPtr, cudaGraphConditionalHandle handle)
|
||||||
{
|
{
|
||||||
if (--(*dPtr) == 0) {
|
if (--(*dPtr) == 0)
|
||||||
|
{
|
||||||
cudaGraphSetConditional(handle, 0);
|
cudaGraphSetConditional(handle, 0);
|
||||||
}
|
}
|
||||||
printf("GPU: counter = %d\n", *dPtr);
|
printf("GPU: counter = %d\n", *dPtr);
|
||||||
@ -166,13 +168,13 @@ __global__ void doWhileLoopKernel(char *dPtr, cudaGraphConditionalHandle handle)
|
|||||||
|
|
||||||
void simpleDoWhileGraph(void)
|
void simpleDoWhileGraph(void)
|
||||||
{
|
{
|
||||||
cudaGraph_t graph;
|
cudaGraph_t graph;
|
||||||
cudaGraphExec_t graphExec;
|
cudaGraphExec_t graphExec;
|
||||||
cudaGraphNode_t node;
|
cudaGraphNode_t conditionalNode;
|
||||||
|
|
||||||
// Allocate a byte of device memory to use as input
|
// Allocate a byte of device memory to use as input
|
||||||
char *dPtr;
|
char *dPtr;
|
||||||
checkCudaErrors(cudaMalloc((void**)&dPtr, 1));
|
checkCudaErrors(cudaMalloc((void **)&dPtr, 1));
|
||||||
|
|
||||||
printf("simpleDoWhileGraph: Building graph...\n");
|
printf("simpleDoWhileGraph: Building graph...\n");
|
||||||
checkCudaErrors(cudaGraphCreate(&graph, 0));
|
checkCudaErrors(cudaGraphCreate(&graph, 0));
|
||||||
@ -180,18 +182,18 @@ void simpleDoWhileGraph(void)
|
|||||||
cudaGraphConditionalHandle handle;
|
cudaGraphConditionalHandle handle;
|
||||||
checkCudaErrors(cudaGraphConditionalHandleCreate(&handle, graph, 1, cudaGraphCondAssignDefault));
|
checkCudaErrors(cudaGraphConditionalHandleCreate(&handle, graph, 1, cudaGraphCondAssignDefault));
|
||||||
|
|
||||||
cudaGraphNodeParams cParams = { cudaGraphNodeTypeConditional };
|
cudaGraphNodeParams cParams = {cudaGraphNodeTypeConditional};
|
||||||
cParams.conditional.handle = handle;
|
cParams.conditional.handle = handle;
|
||||||
cParams.conditional.type = cudaGraphCondTypeWhile;
|
cParams.conditional.type = cudaGraphCondTypeWhile;
|
||||||
cParams.conditional.size = 1;
|
cParams.conditional.size = 1;
|
||||||
checkCudaErrors(cudaGraphAddNode(&node, graph, NULL, 0, &cParams));
|
checkCudaErrors(cudaGraphAddNode(&conditionalNode, graph, NULL, 0, &cParams));
|
||||||
|
|
||||||
cudaGraph_t bodyGraph = cParams.conditional.phGraph_out[0];
|
cudaGraph_t bodyGraph = cParams.conditional.phGraph_out[0];
|
||||||
|
|
||||||
cudaStream_t captureStream;
|
cudaStream_t captureStream;
|
||||||
checkCudaErrors(cudaStreamCreate(&captureStream));
|
checkCudaErrors(cudaStreamCreate(&captureStream));
|
||||||
|
|
||||||
checkCudaErrors(cudaStreamBeginCaptureToGraph(captureStream, bodyGraph, nullptr, nullptr, 0, cudaStreamCaptureModeRelaxed));
|
checkCudaErrors(cudaStreamBeginCaptureToGraph(captureStream, bodyGraph, nullptr, nullptr, 0, cudaStreamCaptureModeGlobal));
|
||||||
doWhileEmptyKernel<<<1, 1, 0, captureStream>>>();
|
doWhileEmptyKernel<<<1, 1, 0, captureStream>>>();
|
||||||
doWhileEmptyKernel<<<1, 1, 0, captureStream>>>();
|
doWhileEmptyKernel<<<1, 1, 0, captureStream>>>();
|
||||||
doWhileLoopKernel<<<1, 1, 0, captureStream>>>(dPtr, handle);
|
doWhileLoopKernel<<<1, 1, 0, captureStream>>>(dPtr, handle);
|
||||||
@ -214,29 +216,30 @@ void simpleDoWhileGraph(void)
|
|||||||
printf("simpleDoWhileGraph: Complete\n\n");
|
printf("simpleDoWhileGraph: Complete\n\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Create a graph containing a conditional while loop using stream capture.
|
* Create a graph containing a conditional while loop using stream capture.
|
||||||
* This demonstrates how to insert a conditional node into a stream which is
|
* This demonstrates how to insert a conditional node into a stream which is
|
||||||
* being captured. The graph consists of a kernel node followed by a conditional
|
* being captured. The graph consists of a kernel node, A, followed by a
|
||||||
* while node which contains a single kernel node:
|
* conditional while node, B, followed by a kernel node, D. The conditional
|
||||||
|
* body is populated by a single kernel node, C:
|
||||||
*
|
*
|
||||||
* A -> B [ C ]
|
* A -> B [ C ] -> D
|
||||||
*
|
*
|
||||||
* The same kernel will be used for both nodes A and C. This kernel will test
|
* The same kernel will be used for both nodes A and C. This kernel will test
|
||||||
* a device memory location and set the condition when the location is non-zero.
|
* a device memory location and set the condition when the location is non-zero.
|
||||||
* We must run the kernel before the loop as well as inside the loop in order
|
* We must run the kernel before the loop as well as inside the loop in order
|
||||||
* to behave like a while loop. We need to evaluate the device memory location
|
* to behave like a while loop as opposed to a do-while loop. We need to evaluate
|
||||||
* before the conditional node is evaluated in order to set the condition variable
|
* the device memory location before the conditional node is evaluated in order
|
||||||
* properly. Because we're using a kernel upstream of the conditional node,
|
* to set the condition variable properly. Because we're using a kernel upstream
|
||||||
* there is no need to use the handle default value to initialize the conditional
|
* of the conditional node, there is no need to use the handle default value to
|
||||||
* value.
|
* initialize the conditional value.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
__global__ void capturedWhileKernel(char *dPtr, cudaGraphConditionalHandle handle)
|
__global__ void capturedWhileKernel(char *dPtr, cudaGraphConditionalHandle handle)
|
||||||
{
|
{
|
||||||
printf("GPU: counter = %d\n", *dPtr);
|
printf("GPU: counter = %d\n", *dPtr);
|
||||||
if (*dPtr) {
|
if (*dPtr)
|
||||||
|
{
|
||||||
(*dPtr)--;
|
(*dPtr)--;
|
||||||
}
|
}
|
||||||
cudaGraphSetConditional(handle, *dPtr);
|
cudaGraphSetConditional(handle, *dPtr);
|
||||||
@ -259,13 +262,13 @@ void capturedWhileGraph(void)
|
|||||||
|
|
||||||
// Allocate a byte of device memory to use as input
|
// Allocate a byte of device memory to use as input
|
||||||
char *dPtr;
|
char *dPtr;
|
||||||
checkCudaErrors(cudaMalloc((void**)&dPtr, 1));
|
checkCudaErrors(cudaMalloc((void **)&dPtr, 1));
|
||||||
|
|
||||||
printf("capturedWhileGraph: Building graph...\n");
|
printf("capturedWhileGraph: Building graph...\n");
|
||||||
cudaStream_t captureStream;
|
cudaStream_t captureStream;
|
||||||
checkCudaErrors(cudaStreamCreate(&captureStream));
|
checkCudaErrors(cudaStreamCreate(&captureStream));
|
||||||
|
|
||||||
checkCudaErrors(cudaStreamBeginCapture(captureStream, cudaStreamCaptureModeRelaxed));
|
checkCudaErrors(cudaStreamBeginCapture(captureStream, cudaStreamCaptureModeGlobal));
|
||||||
|
|
||||||
// Obtain the handle of the graph
|
// Obtain the handle of the graph
|
||||||
checkCudaErrors(cudaStreamGetCaptureInfo(captureStream, &status, NULL, &graph, &dependencies, &numDependencies));
|
checkCudaErrors(cudaStreamGetCaptureInfo(captureStream, &status, NULL, &graph, &dependencies, &numDependencies));
|
||||||
@ -281,17 +284,17 @@ void capturedWhileGraph(void)
|
|||||||
checkCudaErrors(cudaStreamGetCaptureInfo(captureStream, &status, NULL, &graph, &dependencies, &numDependencies));
|
checkCudaErrors(cudaStreamGetCaptureInfo(captureStream, &status, NULL, &graph, &dependencies, &numDependencies));
|
||||||
|
|
||||||
// Insert conditional node B
|
// Insert conditional node B
|
||||||
cudaGraphNode_t node;
|
cudaGraphNode_t conditionalNode;
|
||||||
cudaGraphNodeParams cParams = { cudaGraphNodeTypeConditional };
|
cudaGraphNodeParams cParams = {cudaGraphNodeTypeConditional};
|
||||||
cParams.conditional.handle = handle;
|
cParams.conditional.handle = handle;
|
||||||
cParams.conditional.type = cudaGraphCondTypeWhile;
|
cParams.conditional.type = cudaGraphCondTypeWhile;
|
||||||
cParams.conditional.size = 1;
|
cParams.conditional.size = 1;
|
||||||
checkCudaErrors(cudaGraphAddNode(&node, graph, dependencies, numDependencies, &cParams));
|
checkCudaErrors(cudaGraphAddNode(&conditionalNode, graph, dependencies, numDependencies, &cParams));
|
||||||
|
|
||||||
cudaGraph_t bodyGraph = cParams.conditional.phGraph_out[0];
|
cudaGraph_t bodyGraph = cParams.conditional.phGraph_out[0];
|
||||||
|
|
||||||
// Update stream capture dependencies to account for the node we manually added
|
// Update stream capture dependencies to account for the node we manually added
|
||||||
checkCudaErrors(cudaStreamUpdateCaptureDependencies(captureStream, &node, 1, cudaStreamSetCaptureDependencies));
|
checkCudaErrors(cudaStreamUpdateCaptureDependencies(captureStream, &conditionalNode, 1, cudaStreamSetCaptureDependencies));
|
||||||
|
|
||||||
// Insert kernel node D
|
// Insert kernel node D
|
||||||
capturedWhileEmptyKernel<<<1, 1, 0, captureStream>>>();
|
capturedWhileEmptyKernel<<<1, 1, 0, captureStream>>>();
|
||||||
@ -303,7 +306,7 @@ void capturedWhileGraph(void)
|
|||||||
cudaStream_t bodyStream;
|
cudaStream_t bodyStream;
|
||||||
checkCudaErrors(cudaStreamCreate(&bodyStream));
|
checkCudaErrors(cudaStreamCreate(&bodyStream));
|
||||||
|
|
||||||
checkCudaErrors(cudaStreamBeginCaptureToGraph(bodyStream, bodyGraph, nullptr, nullptr, 0, cudaStreamCaptureModeRelaxed));
|
checkCudaErrors(cudaStreamBeginCaptureToGraph(bodyStream, bodyGraph, nullptr, nullptr, 0, cudaStreamCaptureModeGlobal));
|
||||||
|
|
||||||
// Insert kernel node C
|
// Insert kernel node C
|
||||||
capturedWhileKernel<<<1, 1, 0, bodyStream>>>(dPtr, handle);
|
capturedWhileKernel<<<1, 1, 0, bodyStream>>>(dPtr, handle);
|
||||||
@ -333,24 +336,238 @@ void capturedWhileGraph(void)
|
|||||||
printf("capturedWhileGraph: Complete\n\n");
|
printf("capturedWhileGraph: Complete\n\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Create a graph containing two nodes.
|
||||||
|
* The first node, A, is a kernel and the second node, B, is a conditional IF node containing two graphs.
|
||||||
|
* The first graph within the conditional will be executed when the condition is true, while the second
|
||||||
|
* graph will be executed when the conditional is false.
|
||||||
|
* The kernel sets the condition variable to true if a device memory location
|
||||||
|
* contains an odd number. Otherwise the condition variable is set to false.
|
||||||
|
* There is a single kernel(C & D) within each conditional body which prints a message.
|
||||||
|
*
|
||||||
|
* A -> B [ C | D ]
|
||||||
|
*
|
||||||
|
* This example requires CUDA >= 12.8.
|
||||||
|
*/
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
// This kernel will only be executed if the condition is false
|
||||||
|
__global__ void ifGraphKernelD(void)
|
||||||
|
{
|
||||||
|
printf("GPU: Hello from the GPU! The condition was false.\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Setup and launch the graph
|
||||||
|
void simpleIfElseGraph(void)
|
||||||
|
{
|
||||||
|
cudaGraph_t graph;
|
||||||
|
cudaGraphExec_t graphExec;
|
||||||
|
cudaGraphNode_t kernelNode;
|
||||||
|
cudaGraphNode_t conditionalNode;
|
||||||
|
|
||||||
|
void *kernelArgs[2];
|
||||||
|
|
||||||
|
// Allocate a byte of device memory to use as input
|
||||||
|
char *dPtr;
|
||||||
|
checkCudaErrors(cudaMalloc((void **)&dPtr, 1));
|
||||||
|
|
||||||
|
printf("simpleIfElseGraph: Building graph...\n");
|
||||||
|
cudaGraphCreate(&graph, 0);
|
||||||
|
|
||||||
|
// Create conditional handle.
|
||||||
|
cudaGraphConditionalHandle handle;
|
||||||
|
cudaGraphConditionalHandleCreate(&handle, graph);
|
||||||
|
|
||||||
|
// Use a kernel upstream of the conditional to set the handle value
|
||||||
|
cudaGraphNodeParams params = {cudaGraphNodeTypeKernel};
|
||||||
|
params.kernel.func = (void *)ifGraphKernelA;
|
||||||
|
params.kernel.blockDim.x = params.kernel.blockDim.y = params.kernel.blockDim.z = 1;
|
||||||
|
params.kernel.gridDim.x = params.kernel.gridDim.y = params.kernel.gridDim.z = 1;
|
||||||
|
params.kernel.kernelParams = kernelArgs;
|
||||||
|
kernelArgs[0] = &dPtr;
|
||||||
|
kernelArgs[1] = &handle;
|
||||||
|
checkCudaErrors(cudaGraphAddNode(&kernelNode, graph, NULL, 0, ¶ms));
|
||||||
|
|
||||||
|
cudaGraphNodeParams cParams = {cudaGraphNodeTypeConditional};
|
||||||
|
cParams.conditional.handle = handle;
|
||||||
|
cParams.conditional.type = cudaGraphCondTypeIf;
|
||||||
|
cParams.conditional.size = 2; // Set size to 2 to indicate an ELSE graph will be used
|
||||||
|
checkCudaErrors(cudaGraphAddNode(&conditionalNode, graph, &kernelNode, 1, &cParams));
|
||||||
|
|
||||||
|
cudaGraph_t bodyGraph = cParams.conditional.phGraph_out[0];
|
||||||
|
|
||||||
|
// Populate the body of the first graph in the conditional node, executed if the condition is true
|
||||||
|
cudaGraphNode_t trueBodyNode;
|
||||||
|
params.kernel.func = (void *)ifGraphKernelC;
|
||||||
|
params.kernel.kernelParams = nullptr;
|
||||||
|
checkCudaErrors(cudaGraphAddNode(&trueBodyNode, bodyGraph, NULL, 0, ¶ms));
|
||||||
|
|
||||||
|
// Populate the body of the second graph in the conditional node, executed if the condition is false
|
||||||
|
bodyGraph = cParams.conditional.phGraph_out[1];
|
||||||
|
|
||||||
|
cudaGraphNode_t falseBodyNode;
|
||||||
|
params.kernel.func = (void *)ifGraphKernelD;
|
||||||
|
params.kernel.kernelParams = nullptr;
|
||||||
|
checkCudaErrors(cudaGraphAddNode(&falseBodyNode, bodyGraph, NULL, 0, ¶ms));
|
||||||
|
|
||||||
|
checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
|
||||||
|
|
||||||
|
// Initialize device memory and launch the graph
|
||||||
|
checkCudaErrors(cudaMemset(dPtr, 0, 1)); // Set dPtr to 0
|
||||||
|
printf("Host: Launching graph with device memory set to 0\n");
|
||||||
|
checkCudaErrors(cudaGraphLaunch(graphExec, 0));
|
||||||
|
checkCudaErrors(cudaDeviceSynchronize());
|
||||||
|
|
||||||
|
// Initialize device memory and launch the graph
|
||||||
|
checkCudaErrors(cudaMemset(dPtr, 1, 1)); // Set dPtr to 1
|
||||||
|
printf("Host: Launching graph with device memory set to 1\n");
|
||||||
|
checkCudaErrors(cudaGraphLaunch(graphExec, 0));
|
||||||
|
checkCudaErrors(cudaDeviceSynchronize());
|
||||||
|
|
||||||
|
// Cleanup
|
||||||
|
checkCudaErrors(cudaGraphExecDestroy(graphExec));
|
||||||
|
checkCudaErrors(cudaGraphDestroy(graph));
|
||||||
|
checkCudaErrors(cudaFree(dPtr));
|
||||||
|
|
||||||
|
printf("simpleIfElseGraph: Complete\n\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Create a graph containing two nodes.
|
||||||
|
* The first node, A, is a kernel and the second node, B, is a conditional SWITCH node containing four graphs.
|
||||||
|
* The nth graph within the conditional will be executed when the condition is n. If conditional >= n,
|
||||||
|
* no graph will be executed.
|
||||||
|
* Kernel A sets the condition variable to the value stored in a device memory location.
|
||||||
|
* This device location is updated from the host with each launch to demonstrate the behavior.
|
||||||
|
* There is a single kernel(nodes C, D, E and F) within each conditional body which prints a message.
|
||||||
|
*
|
||||||
|
* A -> B [ C | D | E | F ]
|
||||||
|
*
|
||||||
|
* This example requires CUDA >= 12.8.
|
||||||
|
*/
|
||||||
|
|
||||||
|
__global__ void switchGraphKernelA(char *dPtr, cudaGraphConditionalHandle handle)
|
||||||
|
{
|
||||||
|
unsigned int value = *dPtr;
|
||||||
|
cudaGraphSetConditional(handle, value);
|
||||||
|
printf("GPU: Handle set to %d\n", value);
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void switchGraphKernelC(void)
|
||||||
|
{
|
||||||
|
printf("GPU: Hello from switchGraphKernelC(), running on the GPU!\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void switchGraphKernelD(void)
|
||||||
|
{
|
||||||
|
printf("GPU: Hello from switchGraphKernelD(), running on the GPU!\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void switchGraphKernelE(void)
|
||||||
|
{
|
||||||
|
printf("GPU: Hello from switchGraphKernelE(), running on the GPU!\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void switchGraphKernelF(void)
|
||||||
|
{
|
||||||
|
printf("GPU: Hello from switchGraphKernelF(), running on the GPU!\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Setup and launch the graph
|
||||||
|
void simpleSwitchGraph(void)
|
||||||
|
{
|
||||||
|
cudaGraph_t graph;
|
||||||
|
cudaGraphExec_t graphExec;
|
||||||
|
cudaGraphNode_t kernelNode;
|
||||||
|
cudaGraphNode_t conditionalNode;
|
||||||
|
|
||||||
|
void *kernelArgs[2];
|
||||||
|
|
||||||
|
// Allocate a byte of device memory to use as input
|
||||||
|
char *dPtr;
|
||||||
|
checkCudaErrors(cudaMalloc((void **)&dPtr, 1));
|
||||||
|
|
||||||
|
printf("simpleSwitchGraph: Building graph...\n");
|
||||||
|
cudaGraphCreate(&graph, 0);
|
||||||
|
|
||||||
|
// Create conditional handle.
|
||||||
|
cudaGraphConditionalHandle handle;
|
||||||
|
cudaGraphConditionalHandleCreate(&handle, graph);
|
||||||
|
|
||||||
|
// Use a kernel upstream of the conditional to set the handle value
|
||||||
|
cudaGraphNodeParams params = {cudaGraphNodeTypeKernel};
|
||||||
|
params.kernel.func = (void *)switchGraphKernelA;
|
||||||
|
params.kernel.blockDim.x = params.kernel.blockDim.y = params.kernel.blockDim.z = 1;
|
||||||
|
params.kernel.gridDim.x = params.kernel.gridDim.y = params.kernel.gridDim.z = 1;
|
||||||
|
params.kernel.kernelParams = kernelArgs;
|
||||||
|
kernelArgs[0] = &dPtr;
|
||||||
|
kernelArgs[1] = &handle;
|
||||||
|
checkCudaErrors(cudaGraphAddNode(&kernelNode, graph, NULL, 0, ¶ms));
|
||||||
|
|
||||||
|
cudaGraphNodeParams cParams = {cudaGraphNodeTypeConditional};
|
||||||
|
cParams.conditional.handle = handle;
|
||||||
|
cParams.conditional.type = cudaGraphCondTypeSwitch;
|
||||||
|
cParams.conditional.size = 4;
|
||||||
|
checkCudaErrors(cudaGraphAddNode(&conditionalNode, graph, &kernelNode, 1, &cParams));
|
||||||
|
|
||||||
|
// Populate the four graph bodies within the SWITCH conditional graph
|
||||||
|
cudaGraphNode_t bodyNode;
|
||||||
|
params.kernel.kernelParams = nullptr;
|
||||||
|
params.kernel.func = (void *)switchGraphKernelC;
|
||||||
|
checkCudaErrors(cudaGraphAddNode(&bodyNode, cParams.conditional.phGraph_out[0], NULL, 0, ¶ms));
|
||||||
|
params.kernel.func = (void *)switchGraphKernelD;
|
||||||
|
checkCudaErrors(cudaGraphAddNode(&bodyNode, cParams.conditional.phGraph_out[1], NULL, 0, ¶ms));
|
||||||
|
params.kernel.func = (void *)switchGraphKernelE;
|
||||||
|
checkCudaErrors(cudaGraphAddNode(&bodyNode, cParams.conditional.phGraph_out[2], NULL, 0, ¶ms));
|
||||||
|
params.kernel.func = (void *)switchGraphKernelF;
|
||||||
|
checkCudaErrors(cudaGraphAddNode(&bodyNode, cParams.conditional.phGraph_out[3], NULL, 0, ¶ms));
|
||||||
|
|
||||||
|
checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
|
||||||
|
|
||||||
|
for (char i = 0; i < 5; i++)
|
||||||
|
{
|
||||||
|
// Initialize device memory and launch the graph
|
||||||
|
checkCudaErrors(cudaMemset(dPtr, i, 1));
|
||||||
|
printf("Host: Launching graph with device memory set to %d\n", i);
|
||||||
|
checkCudaErrors(cudaGraphLaunch(graphExec, 0));
|
||||||
|
checkCudaErrors(cudaDeviceSynchronize());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cleanup
|
||||||
|
checkCudaErrors(cudaGraphExecDestroy(graphExec));
|
||||||
|
checkCudaErrors(cudaGraphDestroy(graph));
|
||||||
|
checkCudaErrors(cudaFree(dPtr));
|
||||||
|
|
||||||
|
printf("simpleSwitchGraph: Complete\n\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
int device = findCudaDevice(argc, (const char **)argv);
|
int device = findCudaDevice(argc, (const char **)argv);
|
||||||
|
|
||||||
int driverVersion = 0;
|
int driverVersion = 0;
|
||||||
|
|
||||||
cudaDriverGetVersion(&driverVersion);
|
cudaDriverGetVersion(&driverVersion);
|
||||||
printf("Driver version is: %d.%d\n", driverVersion / 1000,
|
printf("Driver version is: %d.%d\n", driverVersion / 1000,
|
||||||
(driverVersion % 100) / 10);
|
(driverVersion % 100) / 10);
|
||||||
|
|
||||||
if (driverVersion < 12030) {
|
if (driverVersion < 12030)
|
||||||
printf("Waiving execution as driver does not support Graph Conditional Nodes\n");
|
{
|
||||||
exit(EXIT_WAIVED);
|
printf("Skipping execution as driver does not support Graph Conditional Nodes\n");
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
simpleIfGraph();
|
simpleIfGraph();
|
||||||
simpleDoWhileGraph();
|
simpleDoWhileGraph();
|
||||||
capturedWhileGraph();
|
capturedWhileGraph();
|
||||||
|
|
||||||
|
if (driverVersion < 12080)
|
||||||
|
{
|
||||||
|
printf("Skipping execution as driver does not support if/else and switch type Graph Conditional Nodes\n");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
simpleIfElseGraph();
|
||||||
|
simpleSwitchGraph();
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -31,7 +31,6 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <string.h>
|
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include "cuda.h"
|
#include "cuda.h"
|
||||||
@ -293,6 +292,11 @@ static void memMapGetDeviceFunction(char **argv) {
|
|||||||
jitNumOptions, jitOptions,
|
jitNumOptions, jitOptions,
|
||||||
(void **)jitOptVals));
|
(void **)jitOptVals));
|
||||||
printf("> PTX JIT log:\n%s\n", jitLogBuffer);
|
printf("> PTX JIT log:\n%s\n", jitLogBuffer);
|
||||||
|
|
||||||
|
// Clean up dynamically allocated memory
|
||||||
|
delete[] jitOptions;
|
||||||
|
delete[] jitOptVals;
|
||||||
|
delete[] jitLogBuffer;
|
||||||
} else {
|
} else {
|
||||||
checkCudaErrors(cuModuleLoad(&cuModule, module_path.c_str()));
|
checkCudaErrors(cuModuleLoad(&cuModule, module_path.c_str()));
|
||||||
}
|
}
|
||||||
@ -379,7 +383,7 @@ static void childProcess(int devId, int id, char **argv) {
|
|||||||
// deterministic.
|
// deterministic.
|
||||||
barrierWait(&shm->barrier, &shm->sense, (unsigned int)procCount);
|
barrierWait(&shm->barrier, &shm->sense, (unsigned int)procCount);
|
||||||
if (id == 0) {
|
if (id == 0) {
|
||||||
printf("Step %lld done\n", (unsigned long long)i);
|
printf("Step %llu done\n", (unsigned long long)i);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -489,12 +493,14 @@ static void parentProcess(char *app) {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int j = 0; j < nprocesses; j++) {
|
for (int j = 0; j < selectedDevices.size(); j++) {
|
||||||
int canAccessPeerIJ, canAccessPeerJI;
|
int canAccessPeerIJ, canAccessPeerJI;
|
||||||
checkCudaErrors(
|
checkCudaErrors(cuDeviceCanAccessPeer(&canAccessPeerJI,
|
||||||
cuDeviceCanAccessPeer(&canAccessPeerJI, devices[j], devices[i]));
|
devices[selectedDevices[j]],
|
||||||
checkCudaErrors(
|
devices[i]));
|
||||||
cuDeviceCanAccessPeer(&canAccessPeerIJ, devices[i], devices[j]));
|
checkCudaErrors(cuDeviceCanAccessPeer(&canAccessPeerIJ,
|
||||||
|
devices[i],
|
||||||
|
devices[selectedDevices[j]]));
|
||||||
if (!canAccessPeerIJ || !canAccessPeerJI) {
|
if (!canAccessPeerIJ || !canAccessPeerJI) {
|
||||||
allPeers = false;
|
allPeers = false;
|
||||||
break;
|
break;
|
||||||
@ -509,10 +515,10 @@ static void parentProcess(char *app) {
|
|||||||
// setup the peers for the device. For systems that only allow 8
|
// setup the peers for the device. For systems that only allow 8
|
||||||
// peers per GPU at a time, this acts to remove devices from CanAccessPeer
|
// peers per GPU at a time, this acts to remove devices from CanAccessPeer
|
||||||
for (int j = 0; j < nprocesses; j++) {
|
for (int j = 0; j < nprocesses; j++) {
|
||||||
checkCudaErrors(cuCtxSetCurrent(ctxs[i]));
|
checkCudaErrors(cuCtxSetCurrent(ctxs.back()));
|
||||||
checkCudaErrors(cuCtxEnablePeerAccess(ctxs[j], 0));
|
checkCudaErrors(cuCtxEnablePeerAccess(ctxs[j], 0));
|
||||||
checkCudaErrors(cuCtxSetCurrent(ctxs[j]));
|
checkCudaErrors(cuCtxSetCurrent(ctxs[j]));
|
||||||
checkCudaErrors(cuCtxEnablePeerAccess(ctxs[i], 0));
|
checkCudaErrors(cuCtxEnablePeerAccess(ctxs.back(), 0));
|
||||||
}
|
}
|
||||||
selectedDevices.push_back(i);
|
selectedDevices.push_back(i);
|
||||||
nprocesses++;
|
nprocesses++;
|
||||||
@ -550,7 +556,7 @@ static void parentProcess(char *app) {
|
|||||||
// Launch the child processes!
|
// Launch the child processes!
|
||||||
for (i = 0; i < nprocesses; i++) {
|
for (i = 0; i < nprocesses; i++) {
|
||||||
char devIdx[10];
|
char devIdx[10];
|
||||||
char procIdx[10];
|
char procIdx[12];
|
||||||
char *const args[] = {app, devIdx, procIdx, NULL};
|
char *const args[] = {app, devIdx, procIdx, NULL};
|
||||||
Process process;
|
Process process;
|
||||||
|
|
||||||
|
@ -231,6 +231,10 @@ int main(int argc, char **argv) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (buffer) {
|
||||||
|
checkCudaErrors(cudaFree(buffer));
|
||||||
|
}
|
||||||
|
|
||||||
cusparseDestroy(cusparseHandle);
|
cusparseDestroy(cusparseHandle);
|
||||||
cublasDestroy(cublasHandle);
|
cublasDestroy(cublasHandle);
|
||||||
if (matA) {
|
if (matA) {
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
## Description
|
## Description
|
||||||
|
|
||||||
This sample demonstrates CUDA-NvSciBuf/NvSciSync Interop. Two CPU threads import the NvSciBuf and NvSciSync into CUDA to perform two image processing algorithms on a ppm image - image rotation in 1st thread &amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp; rgba to grayscale conversion of rotated image in 2nd thread. Currently only supported on Ubuntu 18.04
|
This sample demonstrates CUDA-NvSciBuf/NvSciSync Interop. Two CPU threads import the NvSciBuf and NvSciSync into CUDA to perform two image processing algorithms on a ppm image - image rotation in 1st thread & rgba to grayscale conversion of rotated image in 2nd thread. Currently only supported on Ubuntu 18.04
|
||||||
|
|
||||||
## Key Concepts
|
## Key Concepts
|
||||||
|
|
||||||
|
@ -65,14 +65,14 @@ target_compile_features(Mandelbrot PRIVATE cxx_std_17 cuda_std_17)
|
|||||||
POST_BUILD
|
POST_BUILD
|
||||||
COMMAND ${CMAKE_COMMAND} -E copy
|
COMMAND ${CMAKE_COMMAND} -E copy
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/../../../bin/win64/$<CONFIGURATION>/freeglut.dll
|
${CMAKE_CURRENT_SOURCE_DIR}/../../../bin/win64/$<CONFIGURATION>/freeglut.dll
|
||||||
${CMAKE_CURRENT_BINARY_DIR}
|
${CMAKE_CURRENT_BINARY_DIR}/$<CONFIGURATION>
|
||||||
)
|
)
|
||||||
|
|
||||||
add_custom_command(TARGET Mandelbrot
|
add_custom_command(TARGET Mandelbrot
|
||||||
POST_BUILD
|
POST_BUILD
|
||||||
COMMAND ${CMAKE_COMMAND} -E copy
|
COMMAND ${CMAKE_COMMAND} -E copy
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/../../../bin/win64/$<CONFIGURATION>/glew64.dll
|
${CMAKE_CURRENT_SOURCE_DIR}/../../../bin/win64/$<CONFIGURATION>/glew64.dll
|
||||||
${CMAKE_CURRENT_BINARY_DIR}
|
${CMAKE_CURRENT_BINARY_DIR}/$<CONFIGURATION>
|
||||||
)
|
)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
@ -416,8 +416,8 @@ void initMC(int argc, char **argv) {
|
|||||||
gridSizeLog2.x = n;
|
gridSizeLog2.x = n;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (checkCmdLineFlag(argc, (const char **)argv, "gridx")) {
|
if (checkCmdLineFlag(argc, (const char **)argv, "gridy")) {
|
||||||
n = getCmdLineArgumentInt(argc, (const char **)argv, "gridx");
|
n = getCmdLineArgumentInt(argc, (const char **)argv, "gridy");
|
||||||
gridSizeLog2.y = n;
|
gridSizeLog2.y = n;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -20,16 +20,19 @@ include_directories(../../../Common)
|
|||||||
find_package(Vulkan)
|
find_package(Vulkan)
|
||||||
find_package(OpenGL)
|
find_package(OpenGL)
|
||||||
|
|
||||||
|
|
||||||
# Include the check_include_file macro
|
# Include the check_include_file macro
|
||||||
include(CheckIncludeFile)
|
include(CheckIncludeFile)
|
||||||
|
|
||||||
# Check for the GLFW/glfw3.h header
|
# Check for the GLFW/glfw3.h header
|
||||||
check_include_file("GLFW/glfw3.h" HAVE_GLFW3_H)
|
check_include_file("GLFW/glfw3.h" HAVE_GLFW3_H)
|
||||||
|
|
||||||
# Find GLFW/glfw3.h header for Windows
|
# Find GLFW header and lib for Windows
|
||||||
if(WIN32)
|
if(WIN32)
|
||||||
find_file(GLFW3_H "glfw3.h" PATH "$ENV{GLFW_INCLUDES_DIR}/GLFW")
|
find_file(GLFW3_H "GLFW/glfw3.h" PATH "${GLFW_INCLUDE_DIR}")
|
||||||
if(GLFW3_H)
|
find_library(GLFW3_LIB "glfw3" PATH "${GLFW_LIB_DIR}")
|
||||||
|
if(GLFW3_H AND GLFW3_LIB)
|
||||||
|
message(STATUS "Found GLFW/glfw3.h and GLFW library.")
|
||||||
set(HAVE_GLFW3_H 1)
|
set(HAVE_GLFW3_H 1)
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
@ -51,21 +54,22 @@ if(${Vulkan_FOUND})
|
|||||||
${Vulkan_INCLUDE_DIRS}
|
${Vulkan_INCLUDE_DIRS}
|
||||||
${CUDAToolkit_INCLUDE_DIRS}
|
${CUDAToolkit_INCLUDE_DIRS}
|
||||||
)
|
)
|
||||||
|
target_link_libraries(simpleVulkan
|
||||||
|
${Vulkan_LIBRARIES}
|
||||||
|
OpenGL::GL
|
||||||
|
)
|
||||||
if(WIN32)
|
if(WIN32)
|
||||||
|
target_include_directories(simpleVulkan PUBLIC
|
||||||
|
${GLFW_INCLUDE_DIR}
|
||||||
|
)
|
||||||
target_link_libraries(simpleVulkan
|
target_link_libraries(simpleVulkan
|
||||||
${Vulkan_LIBRARIES}
|
${GLFW3_LIB}
|
||||||
OpenGL::GL
|
|
||||||
glfw3.dll
|
|
||||||
)
|
)
|
||||||
else()
|
else()
|
||||||
target_link_libraries(simpleVulkan
|
target_link_libraries(simpleVulkan
|
||||||
${Vulkan_LIBRARIES}
|
|
||||||
OpenGL::GL
|
|
||||||
glfw
|
glfw
|
||||||
)
|
)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
add_custom_command(TARGET simpleVulkan POST_BUILD
|
add_custom_command(TARGET simpleVulkan POST_BUILD
|
||||||
COMMAND ${CMAKE_COMMAND} -E copy_if_different
|
COMMAND ${CMAKE_COMMAND} -E copy_if_different
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/sinewave.frag
|
${CMAKE_CURRENT_SOURCE_DIR}/sinewave.frag
|
||||||
|
@ -26,10 +26,12 @@ include(CheckIncludeFile)
|
|||||||
# Check for the GLFW/glfw3.h header
|
# Check for the GLFW/glfw3.h header
|
||||||
check_include_file("GLFW/glfw3.h" HAVE_GLFW3_H)
|
check_include_file("GLFW/glfw3.h" HAVE_GLFW3_H)
|
||||||
|
|
||||||
# Find GLFW/glfw3.h header for Windows
|
# Find GLFW header and lib for Windows
|
||||||
if(WIN32)
|
if(WIN32)
|
||||||
find_file(GLFW3_H "glfw3.h" PATH "$ENV{GLFW_INCLUDES_DIR}/GLFW")
|
find_file(GLFW3_H "GLFW/glfw3.h" PATH "${GLFW_INCLUDE_DIR}")
|
||||||
if(GLFW3_H)
|
find_library(GLFW3_LIB "glfw3" PATH "${GLFW_LIB_DIR}")
|
||||||
|
if(GLFW3_H AND GLFW3_LIB)
|
||||||
|
message(STATUS "Found GLFW/glfw3.h and GLFW library.")
|
||||||
set(HAVE_GLFW3_H 1)
|
set(HAVE_GLFW3_H 1)
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
@ -51,23 +53,23 @@ if(${Vulkan_FOUND})
|
|||||||
${Vulkan_INCLUDE_DIRS}
|
${Vulkan_INCLUDE_DIRS}
|
||||||
${CUDAToolkit_INCLUDE_DIRS}
|
${CUDAToolkit_INCLUDE_DIRS}
|
||||||
)
|
)
|
||||||
|
target_link_libraries(simpleVulkanMMAP
|
||||||
|
${Vulkan_LIBRARIES}
|
||||||
|
OpenGL::GL
|
||||||
|
CUDA::cuda_driver
|
||||||
|
)
|
||||||
if(WIN32)
|
if(WIN32)
|
||||||
|
target_include_directories(simpleVulkanMMAP PUBLIC
|
||||||
|
${GLFW_INCLUDE_DIR}
|
||||||
|
)
|
||||||
target_link_libraries(simpleVulkanMMAP
|
target_link_libraries(simpleVulkanMMAP
|
||||||
${Vulkan_LIBRARIES}
|
${GLFW3_LIB}
|
||||||
OpenGL::GL
|
|
||||||
CUDA::cuda_driver
|
|
||||||
glfw3.dll
|
|
||||||
)
|
)
|
||||||
else()
|
else()
|
||||||
target_link_libraries(simpleVulkanMMAP
|
target_link_libraries(simpleVulkanMMAP
|
||||||
${Vulkan_LIBRARIES}
|
|
||||||
OpenGL::GL
|
|
||||||
CUDA::cuda_driver
|
|
||||||
glfw
|
glfw
|
||||||
)
|
)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
add_custom_command(TARGET simpleVulkanMMAP POST_BUILD
|
add_custom_command(TARGET simpleVulkanMMAP POST_BUILD
|
||||||
COMMAND ${CMAKE_COMMAND} -E copy_if_different
|
COMMAND ${CMAKE_COMMAND} -E copy_if_different
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/montecarlo.frag
|
${CMAKE_CURRENT_SOURCE_DIR}/montecarlo.frag
|
||||||
|
@ -71,7 +71,7 @@ if(${OpenGL_FOUND})
|
|||||||
POST_BUILD
|
POST_BUILD
|
||||||
COMMAND ${CMAKE_COMMAND} -E copy
|
COMMAND ${CMAKE_COMMAND} -E copy
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/../../../bin/win64/$<CONFIGURATION>/glew64.dll
|
${CMAKE_CURRENT_SOURCE_DIR}/../../../bin/win64/$<CONFIGURATION>/glew64.dll
|
||||||
${CMAKE_CURRENT_BINARY_DIR}
|
${CMAKE_CURRENT_BINARY_DIR}/$<CONFIGURATION>
|
||||||
)
|
)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
@ -26,10 +26,12 @@ include(CheckIncludeFile)
|
|||||||
# Check for the GLFW/glfw3.h header
|
# Check for the GLFW/glfw3.h header
|
||||||
check_include_file("GLFW/glfw3.h" HAVE_GLFW3_H)
|
check_include_file("GLFW/glfw3.h" HAVE_GLFW3_H)
|
||||||
|
|
||||||
# Find GLFW/glfw3.h header for Windows
|
# Find GLFW header and lib for Windows
|
||||||
if(WIN32)
|
if(WIN32)
|
||||||
find_file(GLFW3_H "glfw3.h" PATH "$ENV{GLFW_INCLUDES_DIR}/GLFW")
|
find_file(GLFW3_H "GLFW/glfw3.h" PATH "${GLFW_INCLUDE_DIR}")
|
||||||
if(GLFW3_H)
|
find_file(GLFW3_LIB "glfw3" PATH "${GLFW_LIB_DIR}")
|
||||||
|
if(GLFW3_H AND GLFW3_LIB)
|
||||||
|
message(STATUS "Found GLFW/glfw3.h and GLFW library.")
|
||||||
set(HAVE_GLFW3_H 1)
|
set(HAVE_GLFW3_H 1)
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
@ -51,21 +53,22 @@ if(${Vulkan_FOUND})
|
|||||||
${Vulkan_INCLUDE_DIRS}
|
${Vulkan_INCLUDE_DIRS}
|
||||||
${CUDAToolkit_INCLUDE_DIRS}
|
${CUDAToolkit_INCLUDE_DIRS}
|
||||||
)
|
)
|
||||||
|
target_link_libraries(vulkanImageCUDA
|
||||||
|
${Vulkan_LIBRARIES}
|
||||||
|
OpenGL::GL
|
||||||
|
)
|
||||||
if(WIN32)
|
if(WIN32)
|
||||||
|
target_include_directories(vulkanImageCUDA PUBLIC
|
||||||
|
${GLFW_INCLUDE_DIR}
|
||||||
|
)
|
||||||
target_link_libraries(vulkanImageCUDA
|
target_link_libraries(vulkanImageCUDA
|
||||||
${Vulkan_LIBRARIES}
|
${GLFW3_LIB}
|
||||||
OpenGL::GL
|
|
||||||
glfw3.dll
|
|
||||||
)
|
)
|
||||||
else()
|
else()
|
||||||
target_link_libraries(vulkanImageCUDA
|
target_link_libraries(vulkanImageCUDA
|
||||||
${Vulkan_LIBRARIES}
|
|
||||||
OpenGL::GL
|
|
||||||
glfw
|
glfw
|
||||||
)
|
)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
add_custom_command(TARGET vulkanImageCUDA POST_BUILD
|
add_custom_command(TARGET vulkanImageCUDA POST_BUILD
|
||||||
COMMAND ${CMAKE_COMMAND} -E copy_if_different
|
COMMAND ${CMAKE_COMMAND} -E copy_if_different
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/shader.frag
|
${CMAKE_CURRENT_SOURCE_DIR}/shader.frag
|
||||||
|
@ -53,7 +53,7 @@ const char *sSDKsample = "Transpose";
|
|||||||
// TILE_DIM/BLOCK_ROWS elements. TILE_DIM must be an integral multiple of
|
// TILE_DIM/BLOCK_ROWS elements. TILE_DIM must be an integral multiple of
|
||||||
// BLOCK_ROWS
|
// BLOCK_ROWS
|
||||||
|
|
||||||
#define TILE_DIM 16
|
#define TILE_DIM 32
|
||||||
#define BLOCK_ROWS 16
|
#define BLOCK_ROWS 16
|
||||||
|
|
||||||
// This sample assumes that MATRIX_SIZE_X = MATRIX_SIZE_Y
|
// This sample assumes that MATRIX_SIZE_X = MATRIX_SIZE_Y
|
||||||
|
Loading…
x
Reference in New Issue
Block a user