Compare commits

...

13 Commits

Author SHA1 Message Date
li wei
601384af49
Merge 6aac4717b878ce3a78c1159ff2794e681a914e5b into 8d564d5e3afdab5dda868f42a13d85f3d0e75bc9 2025-02-20 17:35:32 +01:00
XSShawnZeng
8d564d5e3a
Enhancement for GLFW include and lib search (#331)
Fixes NVIDIA bug 5115098
2025-02-20 08:06:40 -08:00
Jake Hemstad
37c5bcbef4 Update kernels.cuh 2025-02-19 17:33:10 -08:00
Rob Armstrong
940a4c7a91
memMapIpc: Resolve build-time warnings and minor potential issues (#329)
* Fix compute performance calculation type casting in gpuGetMaxGflopsDeviceIdDRV() for #109

* 3_CUDA_Features/memMapIPCDrv: Increase procIdx buffer size to prevent potential buffer overflow

* memMapIPCDrv: Fix memory leaks and improve header inclusion

- Remove redundant string.h header
- Add memory cleanup for dynamically allocated JIT options and log buffer
- Fix printf format specifier for unsigned long long
2025-02-19 15:52:20 -08:00
ohmaya
61bd39800d
simplePrintf.cu: "Compute capability" text (#299)
Compute %d.%d capability => Compute capability %d.%d
2025-02-19 15:22:34 -08:00
Rob Armstrong
8a96d2eee7
Fix compute performance calculation type casting in gpuGetMaxGflopsDeviceIdDRV() for #109 2025-02-19 10:43:18 -08:00
Rob Armstrong
e762d58260
Merge pull request #247 from sangeetsatheesh/master
Fix typo from Open issue #161
2025-02-18 17:22:48 -08:00
Rob Armstrong
8fd1701744
Merge branch 'master' into master 2025-02-18 17:22:04 -08:00
Rob Armstrong
94765c1597
Fix minor typo in README.md (#326) 2025-02-18 17:14:14 -08:00
Rob Armstrong
c87881f02c
Update matrix multiplication sample README references (#325)
- Clarify reference to Shared Memory section in CUDA programming guide
- Update cuBLAS interface version description
- Add hyperlink to Shared Memory documentation
2025-02-18 14:02:59 -08:00
aioprli
6aac4717b8
Update globalToShmemAsyncCopy.cu
Fix three obvious errors: the first is that five tasks were submitted to the pipeline at the same time, so task 4 conflicts with task 0; the remaining two are copy-paste errors in the memcpy_async source operands
2024-05-06 22:03:16 +08:00
Sangeet S
42ff742bf5
Merge pull request #1 from sangeetsatheesh/sangeetsatheesh-fix-typo
Fix typo #161
2024-01-17 13:16:53 -05:00
Sangeet S
8ccb13c6f0
Fix typo #161
Fix typo in line 14 from "simple exemple" to "simple example"
2024-01-17 13:16:01 -05:00
13 changed files with 59 additions and 48 deletions

View File

@@ -241,7 +241,7 @@ inline int gpuGetMaxGflopsDeviceIdDRV() {
}
unsigned long long compute_perf =
(unsigned long long)(multiProcessorCount * sm_per_multiproc *
((unsigned long long)multiProcessorCount * sm_per_multiproc *
clockRate);
if (compute_perf > max_compute_perf) {
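The fix above works because the widening cast is applied to the first operand rather than to the finished product, so the multiplication is carried out in unsigned long long instead of overflowing in int. A minimal standalone sketch, using made-up device values rather than anything queried from a GPU:

```cpp
// Hypothetical values; the device properties are ints and the clock
// rate is in kHz, so the product easily exceeds INT_MAX.
#include <cstdio>

int main() {
    int multiProcessorCount = 80, sm_per_multiproc = 128, clockRate = 1500000;

    // Old form: the product is computed in int (signed overflow,
    // formally undefined; in practice a garbage value), then widened.
    unsigned long long overflowed =
        (unsigned long long)(multiProcessorCount * sm_per_multiproc * clockRate);

    // Fixed form: widening the first operand promotes the whole
    // expression to unsigned long long before multiplying.
    unsigned long long correct =
        (unsigned long long)multiProcessorCount * sm_per_multiproc * clockRate;

    printf("old: %llu\nnew: %llu\n", overflowed, correct);  // values differ
    return 0;
}
```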

View File

@@ -203,7 +203,7 @@ Vulkan is a low-overhead, cross-platform 3D graphics and compute API. Vulkan tar
#### GLFW
GLFW is a lightweight, open-source library designed for managing OpenGL, OpenGL ES, and Vulkan contexts. It simplifies the process of creating and managing windows, handling user input (keyboard, mouse, and joystick), and working with multiple monitors in a cross-platform manner.
To set up GLFW on a Windows system, Download the pre-built binaries from [GLFW website](https://www.glfw.org/download.html) and extract the zip file into the folder, pass the GLFW include header as `-DGLFW_INCLUDE_DIR` for cmake configuring and follow the Build_instructions.txt in the sample folder to set up the t.
To set up GLFW on a Windows system, Download the pre-built binaries from [GLFW website](https://www.glfw.org/download.html) and extract the zip file into the folder, pass the GLFW include header folder as `-DGLFW_INCLUDE_DIR` and lib folder as `-DGLFW_LIB_DIR` for cmake configuring.
#### OpenMP

View File

@@ -2,7 +2,7 @@
## Description
This sample implements matrix multiplication and is exactly the same as Chapter 6 of the programming guide. It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for matrix multiplication. To illustrate GPU performance for matrix multiply, this sample also shows how to use the new CUDA 4.0 interface for CUBLAS to demonstrate high-performance performance for matrix multiplication.
This sample implements matrix multiplication and is exactly the same as the second example of the [Shared Memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory) section of the programming guide. It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for matrix multiplication. To illustrate GPU performance for matrix multiply, this sample also shows how to use the CUDA 4.0+ interface for cuBLAS to demonstrate high-performance performance for matrix multiplication.
## Key Concepts

View File

@@ -2,7 +2,7 @@
## Description
This sample implements matrix multiplication and is exactly the same as Chapter 6 of the programming guide. It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for matrix multiplication. To illustrate GPU performance for matrix multiply, this sample also shows how to use the new CUDA 4.0 interface for CUBLAS to demonstrate high-performance performance for matrix multiplication.
This sample implements matrix multiplication and is exactly the same as the second example of the [Shared Memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory) section of the programming guide. It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for matrix multiplication. To illustrate GPU performance for matrix multiply, this sample also shows how to use the CUDA 4.0+ interface for cuBLAS to demonstrate high-performance performance for matrix multiplication.
## Key Concepts

View File

@@ -57,7 +57,7 @@ int main(int argc, char **argv) {
// Get GPU information
checkCudaErrors(cudaGetDevice(&devID));
checkCudaErrors(cudaGetDeviceProperties(&props, devID));
printf("Device %d: \"%s\" with Compute %d.%d capability\n", devID, props.name,
printf("Device %d: \"%s\" with Compute capability %d.%d\n", devID, props.name,
props.major, props.minor);
printf("printf() is called. Output:\n\n");

View File

@@ -11,5 +11,4 @@ This sample enumerates the properties of the CUDA devices present in the system.
This sample enumerates the properties of the CUDA devices present using CUDA Driver API calls
### [topologyQuery](./topologyQuery)
A simple exemple on how to query the topology of a system with multiple GPU
A simple example on how to query the topology of a system with multiple GPU

View File

@@ -34,13 +34,12 @@
#define _KERNELS_H_
#include <stdio.h>
#include <thrust/functional.h>
#include "common.cuh"
// Functors used with thrust library.
template <typename Input>
struct IsGreaterEqualThan : public thrust::unary_function<Input, bool>
struct IsGreaterEqualThan
{
__host__ __device__ IsGreaterEqualThan(uint upperBound) :
upperBound_(upperBound) {}
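For context, thrust::unary_function has been deprecated in recent Thrust/CCCL releases, and Thrust algorithms only ever needed a callable operator(). A small sketch, simplified to unsigned int and using hypothetical data, showing that the plain functor still works with thrust::count_if:

```cpp
// Sketch only: the sample's functor is templated; this simplified
// version keeps just what Thrust actually requires, operator().
#include <thrust/count.h>
#include <thrust/device_vector.h>
#include <vector>

struct IsGreaterEqualThan {
    unsigned int upperBound_;
    __host__ __device__ IsGreaterEqualThan(unsigned int upperBound)
        : upperBound_(upperBound) {}
    __host__ __device__ bool operator()(unsigned int x) const {
        return x >= upperBound_;
    }
};

int main() {
    std::vector<unsigned int> h = {1, 5, 9, 13};
    thrust::device_vector<unsigned int> d(h.begin(), h.end());
    // Count elements >= 8; no thrust::unary_function base needed.
    auto n = thrust::count_if(d.begin(), d.end(), IsGreaterEqualThan(8));
    return n == 2 ? 0 : 1;
}
```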

View File

@@ -128,7 +128,7 @@ __global__ void MatrixMulAsyncCopyMultiStageLargeChunk(
a <= aEnd; a += aStep, b += bStep, ++i) {
// Load the matrices from device memory to shared memory; each thread loads
// one element of each matrix
for (; aStage <= a + aStep * maxPipelineStages;
for (; aStage < a + aStep * maxPipelineStages;
aStage += aStep, bStage += bStep, ++iStage) {
pipe.producer_acquire();
if (aStage <= aEnd && t4x < BLOCK_SIZE) {
@@ -137,7 +137,7 @@ __global__ void MatrixMulAsyncCopyMultiStageLargeChunk(
cuda::memcpy_async(&As[j][threadIdx.y][t4x],
&A[aStage + wA * threadIdx.y + t4x], shape4, pipe);
cuda::memcpy_async(&Bs[j][threadIdx.y][t4x],
&B[aStage + wA * threadIdx.y + t4x], shape4, pipe);
&B[bStage + wB * threadIdx.y + t4x], shape4, pipe);
}
pipe.producer_commit();
}
@@ -222,7 +222,7 @@ __global__ void MatrixMulAsyncCopyLargeChunk(float *__restrict__ C,
cuda::memcpy_async(&As[threadIdx.y][t4x], &A[a + wA * threadIdx.y + t4x],
shape4, pipe);
cuda::memcpy_async(&Bs[threadIdx.y][t4x], &B[a + wA * threadIdx.y + t4x],
cuda::memcpy_async(&Bs[threadIdx.y][t4x], &B[b + wB * threadIdx.y + t4x],
shape4, pipe);
pipe.producer_commit();
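The first hunk matters because the kernel cycles through maxPipelineStages shared-memory buffers; with the old <= bound, five copies were in flight at once and the fifth landed in the slot the first was still using. A host-side sketch of just the loop-bound arithmetic, assuming the buffer slot is chosen as iStage % maxPipelineStages (which is how the kernel's As[j]/Bs[j] indexing appears to work) and using made-up step values:

```cpp
// Standalone illustration only; constants are hypothetical, not the
// kernel's actual tile sizes.
#include <cstdio>

int main() {
    const int maxPipelineStages = 4;
    const int aStep = 16;
    const int a = 0;

    // Old bound: '<=' admits iStage = 0..4, i.e. five stages, and
    // stage 4 maps to the same buffer slot as stage 0.
    for (int aStage = a, iStage = 0;
         aStage <= a + aStep * maxPipelineStages;   // fixed version uses '<'
         aStage += aStep, ++iStage) {
        int slot = iStage % maxPipelineStages;      // shared-memory buffer index
        printf("stage %d -> buffer slot %d\n", iStage, slot);
    }
    return 0;
}
```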

View File

@@ -31,7 +31,6 @@
*/
#include <stdio.h>
#include <string.h>
#include <cstring>
#include <iostream>
#include "cuda.h"
@@ -293,6 +292,11 @@ static void memMapGetDeviceFunction(char **argv) {
jitNumOptions, jitOptions,
(void **)jitOptVals));
printf("> PTX JIT log:\n%s\n", jitLogBuffer);
// Clean up dynamically allocated memory
delete[] jitOptions;
delete[] jitOptVals;
delete[] jitLogBuffer;
} else {
checkCudaErrors(cuModuleLoad(&cuModule, module_path.c_str()));
}
@@ -379,7 +383,7 @@ static void childProcess(int devId, int id, char **argv) {
// deterministic.
barrierWait(&shm->barrier, &shm->sense, (unsigned int)procCount);
if (id == 0) {
printf("Step %lld done\n", (unsigned long long)i);
printf("Step %llu done\n", (unsigned long long)i);
}
}
@@ -550,7 +554,7 @@ static void parentProcess(char *app) {
// Launch the child processes!
for (i = 0; i < nprocesses; i++) {
char devIdx[10];
char procIdx[10];
char procIdx[12];
char *const args[] = {app, devIdx, procIdx, NULL};
Process process;
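The procIdx resize is about worst-case formatting width: a 32-bit int can need 11 characters ("-2147483648") plus the terminating NUL, so 12 bytes is the smallest always-safe buffer, while 10 is not. A tiny sketch, assuming the buffer receives a decimal-formatted int as the surrounding code suggests:

```cpp
// Illustration only: shows the worst-case length of a formatted int.
#include <climits>
#include <cstdio>
#include <cstring>

int main() {
    char procIdx[12];
    // snprintf is used here for safety; it never writes past the buffer.
    snprintf(procIdx, sizeof(procIdx), "%d", INT_MIN);
    printf("\"%s\" needs %zu bytes including the NUL\n",
           procIdx, strlen(procIdx) + 1);  // prints 12
    return 0;
}
```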

View File

@@ -20,16 +20,19 @@ include_directories(../../../Common)
find_package(Vulkan)
find_package(OpenGL)
# Include the check_include_file macro
include(CheckIncludeFile)
# Check for the GLFW/glfw3.h header
check_include_file("GLFW/glfw3.h" HAVE_GLFW3_H)
# Find GLFW/glfw3.h header for Windows
# Find GLFW header and lib for Windows
if(WIN32)
find_file(GLFW3_H "glfw3.h" PATH "$ENV{GLFW_INCLUDES_DIR}/GLFW")
if(GLFW3_H)
find_file(GLFW3_H "GLFW/glfw3.h" PATH "${GLFW_INCLUDE_DIR}")
find_library(GLFW3_LIB "glfw3" PATH "${GLFW_LIB_DIR}")
if(GLFW3_H AND GLFW3_LIB)
message(STATUS "Found GLFW/glfw3.h and GLFW library.")
set(HAVE_GLFW3_H 1)
endif()
endif()
@@ -51,21 +54,22 @@ if(${Vulkan_FOUND})
${Vulkan_INCLUDE_DIRS}
${CUDAToolkit_INCLUDE_DIRS}
)
target_link_libraries(simpleVulkan
${Vulkan_LIBRARIES}
OpenGL::GL
)
if(WIN32)
target_include_directories(simpleVulkan PUBLIC
${GLFW_INCLUDE_DIR}
)
target_link_libraries(simpleVulkan
${Vulkan_LIBRARIES}
OpenGL::GL
glfw3.dll
${GLFW3_LIB}
)
else()
target_link_libraries(simpleVulkan
${Vulkan_LIBRARIES}
OpenGL::GL
glfw
)
endif()
add_custom_command(TARGET simpleVulkan POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different
${CMAKE_CURRENT_SOURCE_DIR}/sinewave.frag

View File

@@ -26,10 +26,12 @@ include(CheckIncludeFile)
# Check for the GLFW/glfw3.h header
check_include_file("GLFW/glfw3.h" HAVE_GLFW3_H)
# Find GLFW/glfw3.h header for Windows
# Find GLFW header and lib for Windows
if(WIN32)
find_file(GLFW3_H "glfw3.h" PATH "$ENV{GLFW_INCLUDES_DIR}/GLFW")
if(GLFW3_H)
find_file(GLFW3_H "GLFW/glfw3.h" PATH "${GLFW_INCLUDE_DIR}")
find_library(GLFW3_LIB "glfw3" PATH "${GLFW_LIB_DIR}")
if(GLFW3_H AND GLFW3_LIB)
message(STATUS "Found GLFW/glfw3.h and GLFW library.")
set(HAVE_GLFW3_H 1)
endif()
endif()
@@ -51,23 +53,23 @@ if(${Vulkan_FOUND})
${Vulkan_INCLUDE_DIRS}
${CUDAToolkit_INCLUDE_DIRS}
)
target_link_libraries(simpleVulkanMMAP
${Vulkan_LIBRARIES}
OpenGL::GL
CUDA::cuda_driver
)
if(WIN32)
target_include_directories(simpleVulkanMMAP PUBLIC
${GLFW_INCLUDE_DIR}
)
target_link_libraries(simpleVulkanMMAP
${Vulkan_LIBRARIES}
OpenGL::GL
CUDA::cuda_driver
glfw3.dll
${GLFW3_LIB}
)
else()
target_link_libraries(simpleVulkanMMAP
${Vulkan_LIBRARIES}
OpenGL::GL
CUDA::cuda_driver
glfw
)
endif()
add_custom_command(TARGET simpleVulkanMMAP POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different
${CMAKE_CURRENT_SOURCE_DIR}/montecarlo.frag

View File

@@ -71,7 +71,7 @@ if(${OpenGL_FOUND})
POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_CURRENT_SOURCE_DIR}/../../../bin/win64/$<CONFIGURATION>/glew64.dll
${CMAKE_CURRENT_BINARY_DIR}
${CMAKE_CURRENT_BINARY_DIR}/$<CONFIGURATION>
)
endif()

View File

@@ -26,10 +26,12 @@ include(CheckIncludeFile)
# Check for the GLFW/glfw3.h header
check_include_file("GLFW/glfw3.h" HAVE_GLFW3_H)
# Find GLFW/glfw3.h header for Windows
# Find GLFW header and lib for Windows
if(WIN32)
find_file(GLFW3_H "glfw3.h" PATH "$ENV{GLFW_INCLUDES_DIR}/GLFW")
if(GLFW3_H)
find_file(GLFW3_H "GLFW/glfw3.h" PATH "${GLFW_INCLUDE_DIR}")
find_file(GLFW3_LIB "glfw3" PATH "${GLFW_LIB_DIR}")
if(GLFW3_H AND GLFW3_LIB)
message(STATUS "Found GLFW/glfw3.h and GLFW library.")
set(HAVE_GLFW3_H 1)
endif()
endif()
@@ -51,21 +53,22 @@ if(${Vulkan_FOUND})
${Vulkan_INCLUDE_DIRS}
${CUDAToolkit_INCLUDE_DIRS}
)
target_link_libraries(vulkanImageCUDA
${Vulkan_LIBRARIES}
OpenGL::GL
)
if(WIN32)
target_include_directories(vulkanImageCUDA PUBLIC
${GLFW_INCLUDE_DIR}
)
target_link_libraries(vulkanImageCUDA
${Vulkan_LIBRARIES}
OpenGL::GL
glfw3.dll
${GLFW3_LIB}
)
else()
target_link_libraries(vulkanImageCUDA
${Vulkan_LIBRARIES}
OpenGL::GL
glfw
)
endif()
add_custom_command(TARGET vulkanImageCUDA POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different
${CMAKE_CURRENT_SOURCE_DIR}/shader.frag