Compare commits

...

9 Commits

Author SHA1 Message Date
li wei
627a06add9
Merge 6aac4717b878ce3a78c1159ff2794e681a914e5b into 3e8f91d1a116060d3fedfe856f3721db970de030 2025-03-04 08:42:41 +08:00
XSShawnZeng
3e8f91d1a1
Several small bug fixes for Windows platforms
* Enhancement for GLFW include and lib search

* Fixing issue #321: A potential bug in memMapIPCDrv/memMapIpc.cpp

* Update CMakelist.txt for the sample 0_Introduction/template

* Copy .dll to correct dir for 5_Domain_Specific/Mandelbrot

* Fix typo

* Update changelog for cudaNvSciBufMultiplanar
2025-02-26 08:23:39 -08:00
Jonathan Bentz
f3b7c41ad6
cudaNvSci: Update README.md fixing typo (#337)
Fixes #193
2025-02-21 09:21:43 -08:00
Jonathan Bentz
29fb758e62
conjugateGradient: Ensure allocated memory is freed (#336)
Fixes #202
2025-02-21 09:20:53 -08:00
Jonathan Bentz
3bc08136ff
Update README.md link for sortingNetworks (#335)
Fixes #302
2025-02-21 09:19:21 -08:00
Jonathan Bentz
85eefa06c4
boxFilter: Remove unused parameter (#338)
Fixes: #122
2025-02-21 09:17:45 -08:00
XSShawnZeng
c357dd1e6b
Fixing issue #321: A potential bug in memMapIPCDrv/memMapIpc.cpp (#334) 2025-02-21 09:14:25 -08:00
Jonathan Bentz
efb46383e0
Transpose: Change TILE_DIM to 32 to fix bank conflicts
Fixes #175
2025-02-20 15:46:44 -08:00
aioprli
6aac4717b8
Update globalToShmemAsyncCopy.cu
Fix three obvious errors: the first is that five tasks were submitted to the pipeline at the same time, so task 4 conflicts with task 0; the remaining two are copy errors
2024-05-06 22:03:16 +08:00
12 changed files with 33 additions and 36 deletions

View File

@@ -36,6 +36,7 @@
* `cuDLALayerwiseStatsHybrid`
* `cuDLALayerwiseStatsStandalone`
* `cuDLAStandaloneMode`
+* `cudaNvSciBufMultiplanar`
* `cudaNvSciNvMedia`
* `fluidsGLES`
* `nbody_opengles`

View File

@@ -55,6 +55,7 @@ add_subdirectory(simpleTexture3D)
add_subdirectory(simpleTextureDrv)
add_subdirectory(simpleVoteIntrinsics)
add_subdirectory(simpleZeroCopy)
+add_subdirectory(template)
add_subdirectory(systemWideAtomics)
add_subdirectory(vectorAdd)
add_subdirectory(vectorAddDrv)

View File

@@ -20,7 +20,7 @@ include_directories(../../../Common)
# Source file
# Add target for template
-add_executable(template template.cu)
+add_executable(template template.cu template_cpu.cpp)
target_compile_options(template PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)

View File

@@ -77,7 +77,6 @@ int filter_radius = 14;
int nthreads = 64;
unsigned int width, height;
unsigned int *h_img = NULL;
-unsigned int *d_img = NULL;
unsigned int *d_temp = NULL;
GLuint pbo; // OpenGL pixel buffer object
@@ -108,11 +107,11 @@ extern "C" void computeGold(float *id, float *od, int w, int h, int n);
// These are CUDA functions to handle allocation and launching the kernels
extern "C" void initTexture(int width, int height, void *pImage, bool useRGBA);
extern "C" void freeTextures();
-extern "C" double boxFilter(float *d_src, float *d_temp, float *d_dest,
+extern "C" double boxFilter(float *d_temp, float *d_dest,
int width, int height, int radius, int iterations,
int nthreads, StopWatchInterface *timer);
-extern "C" double boxFilterRGBA(unsigned int *d_src, unsigned int *d_temp,
+extern "C" double boxFilterRGBA(unsigned int *d_temp,
unsigned int *d_dest, int width, int height,
int radius, int iterations, int nthreads,
StopWatchInterface *timer);
@@ -165,7 +164,7 @@ void display() {
size_t num_bytes;
checkCudaErrors(cudaGraphicsResourceGetMappedPointer(
(void **)&d_result, &num_bytes, cuda_pbo_resource));
-boxFilterRGBA(d_img, d_temp, d_result, width, height, filter_radius,
+boxFilterRGBA(d_temp, d_result, width, height, filter_radius,
iterations, nthreads, kernel_timer);
checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));
@@ -282,11 +281,7 @@ void reshape(int x, int y) {
}
void initCuda(bool useRGBA) {
// allocate device memory
-checkCudaErrors(
-cudaMalloc((void **)&d_img, (width * height * sizeof(unsigned int))));
-checkCudaErrors(
-cudaMalloc((void **)&d_temp, (width * height * sizeof(unsigned int))));
+checkCudaErrors(cudaMalloc((void **)&d_temp, (width * height * sizeof(unsigned int))));
// Refer to boxFilter_kernel.cu for implementation
initTexture(width, height, h_img, useRGBA);
@@ -304,11 +299,6 @@ void cleanup() {
h_img = NULL;
}
-if (d_img) {
-cudaFree(d_img);
-d_img = NULL;
-}
if (d_temp) {
cudaFree(d_temp);
d_temp = NULL;
@@ -413,7 +403,7 @@ int runBenchmark() {
cudaMalloc((void **)&d_result, width * height * sizeof(unsigned int)));
// warm-up
-boxFilterRGBA(d_img, d_temp, d_temp, width, height, filter_radius, iterations,
+boxFilterRGBA(d_temp, d_temp, width, height, filter_radius, iterations,
nthreads, kernel_timer);
checkCudaErrors(cudaDeviceSynchronize());
@@ -426,7 +416,7 @@
for (int i = 0; i < iCycles; i++) {
dProcessingTime +=
-boxFilterRGBA(d_img, d_temp, d_img, width, height, filter_radius,
+boxFilterRGBA(d_temp, d_temp, width, height, filter_radius,
iterations, nthreads, kernel_timer);
}
@@ -469,7 +459,7 @@ int runSingleTest(char *ref_file, char *exec_path) {
{
printf("%s (radius=%d) (passes=%d) ", sSDKsample, filter_radius,
iterations);
-boxFilterRGBA(d_img, d_temp, d_result, width, height, filter_radius,
+boxFilterRGBA(d_temp, d_result, width, height, filter_radius,
iterations, nthreads, kernel_timer);
// check if kernel execution generated an error
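Aside from dropping the unused `d_src`/`d_img` plumbing, the `cleanup()` hunk above keeps the guard-then-null pattern for the remaining buffer. A tiny hypothetical helper that captures that pattern (`safeCudaFree` is invented here for illustration and is not part of the sample):

```cuda
#include <cuda_runtime.h>
#include <helper_cuda.h>  // checkCudaErrors, from the samples' Common/ directory

// Free a device pointer only if it was allocated, then null it so a
// second cleanup pass is harmless, e.g. safeCudaFree(d_temp);
template <typename T>
static void safeCudaFree(T *&ptr) {
    if (ptr) {
        checkCudaErrors(cudaFree(ptr));
        ptr = NULL;
    }
}
```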

View File

@@ -399,7 +399,6 @@ extern "C" void freeTextures() {
Perform 2D box filter on image using CUDA
Parameters:
-d_src - pointer to input image in device memory
d_temp - pointer to temporary storage in device memory
d_dest - pointer to destination image in device memory
width - image width
@@ -408,7 +407,7 @@ extern "C" void freeTextures() {
iterations - number of iterations
*/
-extern "C" double boxFilter(float *d_src, float *d_temp, float *d_dest,
+extern "C" double boxFilter(float *d_temp, float *d_dest,
int width, int height, int radius, int iterations,
int nthreads, StopWatchInterface *timer) {
// var for kernel timing
@@ -447,7 +446,7 @@ extern "C" double boxFilter(float *d_src, float *d_temp, float *d_dest,
}
// RGBA version
-extern "C" double boxFilterRGBA(unsigned int *d_src, unsigned int *d_temp,
+extern "C" double boxFilterRGBA(unsigned int *d_temp,
unsigned int *d_dest, int width, int height,
int radius, int iterations, int nthreads,
StopWatchInterface *timer) {

View File

@@ -2,7 +2,7 @@
## Description
-This sample implements bitonic sort and odd-even merge sort (also known as Batcher's sort), algorithms belonging to the class of sorting networks. While generally subefficient, for large sequences compared to algorithms with better asymptotic algorithmic complexity (i.e. merge sort or radix sort), this may be the preferred algorithms of choice for sorting batches of short-sized to mid-sized (key, value) array pairs. Refer to an excellent tutorial by H. W. Lang http://www.iti.fh-flensburg.de/lang/algorithmen/sortieren/networks/indexen.htm
+This sample implements bitonic sort and odd-even merge sort (also known as Batcher's sort), algorithms belonging to the class of sorting networks. While generally subefficient, for large sequences compared to algorithms with better asymptotic algorithmic complexity (i.e. merge sort or radix sort), this may be the preferred algorithms of choice for sorting batches of short-sized to mid-sized (key, value) array pairs. Refer to an excellent tutorial by H. W. Lang https://hwlang.de/algorithmen/sortieren/bitonic/bitonicen.htm
## Key Concepts
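As a rough illustration of the sorting-network idea in the excerpt above: a network is a fixed, data-independent schedule of compare-exchange operations, which is what makes it map well onto batched GPU sorting. The sketch below is a minimal host-side bitonic sort for a power-of-two array, not the sample's optimized kernels:

```cuda
#include <algorithm>
#include <vector>

// Bitonic sorting network: sort `a` ascending; a.size() must be a power of two.
void bitonicSortHost(std::vector<unsigned int> &a) {
    const size_t n = a.size();
    for (size_t k = 2; k <= n; k <<= 1) {          // length of bitonic sequences being merged
        for (size_t j = k >> 1; j > 0; j >>= 1) {  // compare-exchange distance for this pass
            for (size_t i = 0; i < n; ++i) {
                size_t partner = i ^ j;
                if (partner > i) {
                    bool ascending = ((i & k) == 0);  // direction of this subsequence
                    if (ascending ? (a[i] > a[partner]) : (a[i] < a[partner]))
                        std::swap(a[i], a[partner]);
                }
            }
        }
    }
}
```

Every pass applies the same comparisons regardless of the data, so the whole schedule can be unrolled into a sequence of kernel launches, or a single kernel for tiles that fit in shared memory.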

View File

@@ -128,7 +128,7 @@ __global__ void MatrixMulAsyncCopyMultiStageLargeChunk(
a <= aEnd; a += aStep, b += bStep, ++i) {
// Load the matrices from device memory to shared memory; each thread loads
// one element of each matrix
-for (; aStage <= a + aStep * maxPipelineStages;
+for (; aStage < a + aStep * maxPipelineStages;
aStage += aStep, bStage += bStep, ++iStage) {
pipe.producer_acquire();
if (aStage <= aEnd && t4x < BLOCK_SIZE) {
@@ -137,7 +137,7 @@ __global__ void MatrixMulAsyncCopyMultiStageLargeChunk(
cuda::memcpy_async(&As[j][threadIdx.y][t4x],
&A[aStage + wA * threadIdx.y + t4x], shape4, pipe);
cuda::memcpy_async(&Bs[j][threadIdx.y][t4x],
-&B[aStage + wA * threadIdx.y + t4x], shape4, pipe);
+&B[bStage + wB * threadIdx.y + t4x], shape4, pipe);
}
pipe.producer_commit();
}
@@ -222,7 +222,7 @@ __global__ void MatrixMulAsyncCopyLargeChunk(float *__restrict__ C,
cuda::memcpy_async(&As[threadIdx.y][t4x], &A[a + wA * threadIdx.y + t4x],
shape4, pipe);
-cuda::memcpy_async(&Bs[threadIdx.y][t4x], &B[a + wA * threadIdx.y + t4x],
+cuda::memcpy_async(&Bs[threadIdx.y][t4x], &B[b + wB * threadIdx.y + t4x],
shape4, pipe);
pipe.producer_commit();
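The hunks above correct the multi-stage `cuda::pipeline` prefetch (the loop must stop strictly before `maxPipelineStages` chunks ahead) and the copy sources for the B tile (`bStage`/`wB` instead of the A-side offsets). As a rough, hypothetical sketch of the first point, not the sample's kernel (names like `stagedCopy`, `NUM_STAGES`, and `CHUNK` are invented; launch with a single block of `CHUNK` threads): with `NUM_STAGES` shared-memory buffers, chunk `i + NUM_STAGES` would reuse the slot `i % NUM_STAGES` that chunk `i` is still reading, so at most `NUM_STAGES` copies may be in flight.

```cuda
#include <cuda/pipeline>

constexpr int NUM_STAGES = 4;   // in-flight copy stages (like maxPipelineStages)
constexpr int CHUNK      = 128; // elements per stage; launch with blockDim.x == CHUNK

__global__ void stagedCopy(const float *__restrict__ in, float *__restrict__ out,
                           int nChunks) {
    __shared__ float buf[NUM_STAGES][CHUNK];  // one shared-memory buffer per stage
    auto pipe = cuda::make_pipeline();        // per-thread pipeline

    int fetched = 0;
    for (int i = 0; i < nChunks; ++i) {
        // Prefetch ahead, but never past i + NUM_STAGES: that chunk would
        // overwrite buf[i % NUM_STAGES], which chunk i still needs.
        for (; fetched < i + NUM_STAGES && fetched < nChunks; ++fetched) {
            pipe.producer_acquire();
            cuda::memcpy_async(&buf[fetched % NUM_STAGES][threadIdx.x],
                               &in[fetched * CHUNK + threadIdx.x],
                               sizeof(float), pipe);
            pipe.producer_commit();
        }
        pipe.consumer_wait();                  // chunk i has arrived
        out[i * CHUNK + threadIdx.x] = 2.0f * buf[i % NUM_STAGES][threadIdx.x];
        pipe.consumer_release();               // slot i % NUM_STAGES is free again
    }
}
```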

View File

@@ -493,12 +493,14 @@ static void parentProcess(char *app) {
continue;
}
-for (int j = 0; j < nprocesses; j++) {
+for (int j = 0; j < selectedDevices.size(); j++) {
int canAccessPeerIJ, canAccessPeerJI;
-checkCudaErrors(
-cuDeviceCanAccessPeer(&canAccessPeerJI, devices[j], devices[i]));
-checkCudaErrors(
-cuDeviceCanAccessPeer(&canAccessPeerIJ, devices[i], devices[j]));
+checkCudaErrors(cuDeviceCanAccessPeer(&canAccessPeerJI,
+devices[selectedDevices[j]],
+devices[i]));
+checkCudaErrors(cuDeviceCanAccessPeer(&canAccessPeerIJ,
+devices[i],
+devices[selectedDevices[j]]));
if (!canAccessPeerIJ || !canAccessPeerJI) {
allPeers = false;
break;
@@ -513,10 +515,10 @@ static void parentProcess(char *app) {
// setup the peers for the device. For systems that only allow 8
// peers per GPU at a time, this acts to remove devices from CanAccessPeer
for (int j = 0; j < nprocesses; j++) {
-checkCudaErrors(cuCtxSetCurrent(ctxs[i]));
+checkCudaErrors(cuCtxSetCurrent(ctxs.back()));
checkCudaErrors(cuCtxEnablePeerAccess(ctxs[j], 0));
checkCudaErrors(cuCtxSetCurrent(ctxs[j]));
-checkCudaErrors(cuCtxEnablePeerAccess(ctxs[i], 0));
+checkCudaErrors(cuCtxEnablePeerAccess(ctxs.back(), 0));
}
selectedDevices.push_back(i);
nprocesses++;
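For reference, a minimal standalone sketch of the symmetric peer-access query used in `parentProcess()` above, written against the Driver API and assuming at least two visible devices (illustrative only, error checking omitted): access must hold in both directions before either context enables it.

```cuda
#include <cstdio>
#include <cuda.h>

int main() {
    cuInit(0);
    CUdevice dev0, dev1;
    cuDeviceGet(&dev0, 0);
    cuDeviceGet(&dev1, 1);

    int canAccess01 = 0, canAccess10 = 0;
    cuDeviceCanAccessPeer(&canAccess01, dev0, dev1);  // can dev0 map dev1's memory?
    cuDeviceCanAccessPeer(&canAccess10, dev1, dev0);  // can dev1 map dev0's memory?

    // Mirror the allPeers check above: the pair qualifies only if both directions work.
    printf("peer access 0<->1: %s\n", (canAccess01 && canAccess10) ? "yes" : "no");
    return 0;
}
```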

View File

@@ -231,6 +231,10 @@ int main(int argc, char **argv) {
}
}
+if (buffer) {
+checkCudaErrors(cudaFree(buffer));
+}
cusparseDestroy(cusparseHandle);
cublasDestroy(cublasHandle);
if (matA) {
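The hunk above closes the leak from #202 by freeing the device workspace `buffer` during teardown. A small, illustrative helper (not part of the sample) for double-checking fixes like this is to snapshot free device memory before the solver allocates and again after cleanup:

```cuda
#include <cstdio>
#include <cuda_runtime.h>
#include <helper_cuda.h>  // checkCudaErrors, from the samples' Common/ directory

// Print and return the currently free device memory; call once before the
// solver's allocations and once after cleanup to spot unreleased buffers.
static size_t reportFreeDeviceMemory(const char *label) {
    size_t freeBytes = 0, totalBytes = 0;
    checkCudaErrors(cudaMemGetInfo(&freeBytes, &totalBytes));
    printf("%s: %zu of %zu bytes free\n", label, freeBytes, totalBytes);
    return freeBytes;
}
```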

View File

@@ -2,7 +2,7 @@
## Description
This sample demonstrates CUDA-NvSciBuf/NvSciSync Interop. Two CPU threads import the NvSciBuf and NvSciSync into CUDA to perform two image processing algorithms on a ppm image - image rotation in 1st thread &amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp; rgba to grayscale conversion of rotated image in 2nd thread. Currently only supported on Ubuntu 18.04
This sample demonstrates CUDA-NvSciBuf/NvSciSync Interop. Two CPU threads import the NvSciBuf and NvSciSync into CUDA to perform two image processing algorithms on a ppm image - image rotation in 1st thread &amp; rgba to grayscale conversion of rotated image in 2nd thread. Currently only supported on Ubuntu 18.04
## Key Concepts

View File

@@ -65,14 +65,14 @@ target_compile_features(Mandelbrot PRIVATE cxx_std_17 cuda_std_17)
POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_CURRENT_SOURCE_DIR}/../../../bin/win64/$<CONFIGURATION>/freeglut.dll
-${CMAKE_CURRENT_BINARY_DIR}
+${CMAKE_CURRENT_BINARY_DIR}/$<CONFIGURATION>
)
add_custom_command(TARGET Mandelbrot
POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_CURRENT_SOURCE_DIR}/../../../bin/win64/$<CONFIGURATION>/glew64.dll
-${CMAKE_CURRENT_BINARY_DIR}
+${CMAKE_CURRENT_BINARY_DIR}/$<CONFIGURATION>
)
endif()

View File

@@ -53,7 +53,7 @@ const char *sSDKsample = "Transpose";
// TILE_DIM/BLOCK_ROWS elements. TILE_DIM must be an integral multiple of
// BLOCK_ROWS
-#define TILE_DIM 16
+#define TILE_DIM 32
#define BLOCK_ROWS 16
// This sample assumes that MATRIX_SIZE_X = MATRIX_SIZE_Y
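For context on why TILE_DIM matters here, below is a generic shared-memory tiled-transpose sketch, not the sample's exact kernels, assuming width and height are multiples of TILE_DIM and a (TILE_DIM, BLOCK_ROWS) thread block. With TILE_DIM equal to 32, one tile row spans a full warp and all 32 shared-memory banks, and the extra padding column keeps the column-wise reads in the write-out phase from landing on a single bank.

```cuda
#define TILE_DIM   32   // warp-width tile, matching the sample's new value
#define BLOCK_ROWS 16   // each thread handles TILE_DIM / BLOCK_ROWS rows

// Generic coalesced transpose of a width x height matrix; block = (TILE_DIM, BLOCK_ROWS).
__global__ void transposeTiled(float *odata, const float *idata, int width, int height) {
    // Padding the tile by one column makes column reads hit distinct banks.
    __shared__ float tile[TILE_DIM][TILE_DIM + 1];

    int x = blockIdx.x * TILE_DIM + threadIdx.x;
    int y = blockIdx.y * TILE_DIM + threadIdx.y;
    for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
        tile[threadIdx.y + i][threadIdx.x] = idata[(y + i) * width + x];

    __syncthreads();

    // Swap block coordinates so both the read above and the write below stay coalesced.
    x = blockIdx.y * TILE_DIM + threadIdx.x;
    y = blockIdx.x * TILE_DIM + threadIdx.y;
    for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
        odata[(y + i) * height + x] = tile[threadIdx.x][threadIdx.y + i];
}
```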