Merge daaab7dd748980cd7454a1d613876473c15f9488 into 3e8f91d1a116060d3fedfe856f3721db970de030

Several small bug fixes for Windows platforms
* Enhancement for GLFW include and lib search * Fixing issue #321: A potential bug in memMapIPCDrv/memMapIpc.cpp * Update CMakelist.txt for the sample 0_Introduction/template * Copy .dll to correct dir for 5_Domain_Specific/Mandelbrot * Fix typo * Update changelog for cudaNvSciBufMultiplanar
2025-04-10 18:22:11 +01:00 · 2025-03-04 08:42:41 +08:00 · 2025-02-26 08:23:39 -08:00 · 2025-02-21 09:21:43 -08:00 · 2025-02-21 09:20:53 -08:00 · 2025-02-21 09:19:21 -08:00
12 changed files with 36 additions and 39 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -36,6 +36,7 @@
    * `cuDLALayerwiseStatsHybrid`
    * `cuDLALayerwiseStatsStandalone`
    * `cuDLAStandaloneMode`
+    * `cudaNvSciBufMultiplanar`
    * `cudaNvSciNvMedia`
    * `fluidsGLES`
    * `nbody_opengles`
--- a/Samples/0_Introduction/CMakeLists.txt
+++ b/Samples/0_Introduction/CMakeLists.txt
@@ -55,6 +55,7 @@ add_subdirectory(simpleTexture3D)
 add_subdirectory(simpleTextureDrv)
 add_subdirectory(simpleVoteIntrinsics)
 add_subdirectory(simpleZeroCopy)
+add_subdirectory(template)
 add_subdirectory(systemWideAtomics)
 add_subdirectory(vectorAdd)
 add_subdirectory(vectorAddDrv)
--- a/Samples/0_Introduction/template/CMakeLists.txt
+++ b/Samples/0_Introduction/template/CMakeLists.txt
@@ -20,7 +20,7 @@ include_directories(../../../Common)

 # Source file
 # Add target for template
-add_executable(template template.cu)
+add_executable(template template.cu template_cpu.cpp)

 target_compile_options(template PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)

--- a/Samples/2_Concepts_and_Techniques/boxFilter/boxFilter.cpp
+++ b/Samples/2_Concepts_and_Techniques/boxFilter/boxFilter.cpp
@@ -77,7 +77,6 @@ int filter_radius = 14;
 int nthreads = 64;
 unsigned int width, height;
 unsigned int *h_img = NULL;
-unsigned int *d_img = NULL;
 unsigned int *d_temp = NULL;

 GLuint pbo;                                      // OpenGL pixel buffer object
@@ -108,11 +107,11 @@ extern "C" void computeGold(float *id, float *od, int w, int h, int n);
 // These are CUDA functions to handle allocation and launching the kernels
 extern "C" void initTexture(int width, int height, void *pImage, bool useRGBA);
 extern "C" void freeTextures();
-extern "C" double boxFilter(float *d_src, float *d_temp, float *d_dest,
+extern "C" double boxFilter(float *d_temp, float *d_dest,
                            int width, int height, int radius, int iterations,
                            int nthreads, StopWatchInterface *timer);

-extern "C" double boxFilterRGBA(unsigned int *d_src, unsigned int *d_temp,
+extern "C" double boxFilterRGBA(unsigned int *d_temp,
                                unsigned int *d_dest, int width, int height,
                                int radius, int iterations, int nthreads,
                                StopWatchInterface *timer);
@@ -165,7 +164,7 @@ void display() {
  size_t num_bytes;
  checkCudaErrors(cudaGraphicsResourceGetMappedPointer(
      (void **)&d_result, &num_bytes, cuda_pbo_resource));
-  boxFilterRGBA(d_img, d_temp, d_result, width, height, filter_radius,
+  boxFilterRGBA(d_temp, d_result, width, height, filter_radius,
                iterations, nthreads, kernel_timer);

  checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));
@@ -282,11 +281,7 @@ void reshape(int x, int y) {
 }

 void initCuda(bool useRGBA) {
-  // allocate device memory
-  checkCudaErrors(
-      cudaMalloc((void **)&d_img, (width * height * sizeof(unsigned int))));
-  checkCudaErrors(
-      cudaMalloc((void **)&d_temp, (width * height * sizeof(unsigned int))));
+  checkCudaErrors(cudaMalloc((void **)&d_temp, (width * height * sizeof(unsigned int))));

  // Refer to boxFilter_kernel.cu for implementation
  initTexture(width, height, h_img, useRGBA);
@@ -304,11 +299,6 @@ void cleanup() {
    h_img = NULL;
  }

-  if (d_img) {
-    cudaFree(d_img);
-    d_img = NULL;
-  }
-
  if (d_temp) {
    cudaFree(d_temp);
    d_temp = NULL;
@@ -413,7 +403,7 @@ int runBenchmark() {
      cudaMalloc((void **)&d_result, width * height * sizeof(unsigned int)));

  // warm-up
-  boxFilterRGBA(d_img, d_temp, d_temp, width, height, filter_radius, iterations,
+  boxFilterRGBA(d_temp, d_temp, width, height, filter_radius, iterations,
                nthreads, kernel_timer);
  checkCudaErrors(cudaDeviceSynchronize());

@@ -426,7 +416,7 @@ int runBenchmark() {

  for (int i = 0; i < iCycles; i++) {
    dProcessingTime +=
-        boxFilterRGBA(d_img, d_temp, d_img, width, height, filter_radius,
+        boxFilterRGBA(d_temp, d_temp, width, height, filter_radius,
                      iterations, nthreads, kernel_timer);
  }

@@ -469,7 +459,7 @@ int runSingleTest(char *ref_file, char *exec_path) {
  {
    printf("%s (radius=%d) (passes=%d) ", sSDKsample, filter_radius,
           iterations);
-    boxFilterRGBA(d_img, d_temp, d_result, width, height, filter_radius,
+    boxFilterRGBA(d_temp, d_result, width, height, filter_radius,
                  iterations, nthreads, kernel_timer);

    // check if kernel execution generated an error
--- a/Samples/2_Concepts_and_Techniques/boxFilter/boxFilter_cpu.cpp
+++ b/Samples/2_Concepts_and_Techniques/boxFilter/boxFilter_cpu.cpp
@@ -74,7 +74,7 @@ void hboxfilter_y(float *id, float *od, int w, int h, int r) {

  for (int x = 0; x < w; x++) {
    float t;
-    // do left edge
+    // do top edge
    t = id[x] * r;

    for (int y = 0; y < r + 1; y++) {
@@ -98,7 +98,7 @@ void hboxfilter_y(float *id, float *od, int w, int h, int r) {
      od[c] = t * scale;
    }

-    // do right edge
+    // do bottom edge
    for (int y = h - r; y < h; y++) {
      int c = y * w + x;
      t += id[(h - 1) * w + x];
--- a/Samples/2_Concepts_and_Techniques/boxFilter/boxFilter_kernel.cu
+++ b/Samples/2_Concepts_and_Techniques/boxFilter/boxFilter_kernel.cu
@@ -129,7 +129,7 @@ __device__ void d_boxfilter_y(float *id, float *od, int w, int h, int r) {
  float scale = 1.0f / (float)((r << 1) + 1);

  float t;
-  // do left edge
+  // do top edge
  t = id[0] * r;

  for (int y = 0; y < (r + 1); y++) {
@@ -151,7 +151,7 @@ __device__ void d_boxfilter_y(float *id, float *od, int w, int h, int r) {
    od[y * w] = t * scale;
  }

-  // do right edge
+  // do bottom edge
  for (int y = h - r; y < h; y++) {
    t += id[(h - 1) * w];
    t -= id[((y - r) * w) - w];
@@ -271,7 +271,7 @@ __global__ void d_boxfilter_rgba_y(unsigned int *id, unsigned int *od, int w,
  float scale = 1.0f / (float)((r << 1) + 1);

  float4 t;
-  // do left edge
+  // do top edge
  t = rgbaIntToFloat(id[0]) * r;

  for (int y = 0; y < (r + 1); y++) {
@@ -293,7 +293,7 @@ __global__ void d_boxfilter_rgba_y(unsigned int *id, unsigned int *od, int w,
    od[y * w] = rgbaFloatToInt(t * scale);
  }

-  // do right edge
+  // do bottom edge
  for (int y = h - r; y < h; y++) {
    t += rgbaIntToFloat(id[(h - 1) * w]);
    t -= rgbaIntToFloat(id[((y - r) * w) - w]);
@@ -399,7 +399,6 @@ extern "C" void freeTextures() {
    Perform 2D box filter on image using CUDA

    Parameters:
-    d_src  - pointer to input image in device memory
    d_temp - pointer to temporary storage in device memory
    d_dest - pointer to destination image in device memory
    width  - image width
@@ -408,7 +407,7 @@ extern "C" void freeTextures() {
    iterations - number of iterations

 */
-extern "C" double boxFilter(float *d_src, float *d_temp, float *d_dest,
+extern "C" double boxFilter(float *d_temp, float *d_dest,
                            int width, int height, int radius, int iterations,
                            int nthreads, StopWatchInterface *timer) {
  // var for kernel timing
@@ -447,7 +446,7 @@ extern "C" double boxFilter(float *d_src, float *d_temp, float *d_dest,
 }

 // RGBA version
-extern "C" double boxFilterRGBA(unsigned int *d_src, unsigned int *d_temp,
+extern "C" double boxFilterRGBA(unsigned int *d_temp,
                                unsigned int *d_dest, int width, int height,
                                int radius, int iterations, int nthreads,
                                StopWatchInterface *timer) {
--- a/Samples/2_Concepts_and_Techniques/sortingNetworks/README.md
+++ b/Samples/2_Concepts_and_Techniques/sortingNetworks/README.md
@@ -2,7 +2,7 @@

 ## Description

-This sample implements bitonic sort and odd-even merge sort (also known as Batcher's sort), algorithms belonging to the class of sorting networks. While generally subefficient, for large sequences compared to algorithms with better asymptotic algorithmic complexity (i.e. merge sort or radix sort), this may be the preferred algorithms of choice for sorting batches of short-sized to mid-sized (key, value) array pairs. Refer to an excellent tutorial by H. W. Lang http://www.iti.fh-flensburg.de/lang/algorithmen/sortieren/networks/indexen.htm
+This sample implements bitonic sort and odd-even merge sort (also known as Batcher's sort), algorithms belonging to the class of sorting networks. While generally subefficient, for large sequences compared to algorithms with better asymptotic algorithmic complexity (i.e. merge sort or radix sort), this may be the preferred algorithms of choice for sorting batches of short-sized to mid-sized (key, value) array pairs. Refer to an excellent tutorial by H. W. Lang https://hwlang.de/algorithmen/sortieren/bitonic/bitonicen.htm

 ## Key Concepts

--- a/Samples/3_CUDA_Features/memMapIPCDrv/memMapIpc.cpp
+++ b/Samples/3_CUDA_Features/memMapIPCDrv/memMapIpc.cpp
@@ -493,12 +493,14 @@ static void parentProcess(char *app) {
      continue;
    }

-    for (int j = 0; j < nprocesses; j++) {
+    for (int j = 0; j < selectedDevices.size(); j++) {
      int canAccessPeerIJ, canAccessPeerJI;
-      checkCudaErrors(
-          cuDeviceCanAccessPeer(&canAccessPeerJI, devices[j], devices[i]));
-      checkCudaErrors(
-          cuDeviceCanAccessPeer(&canAccessPeerIJ, devices[i], devices[j]));
+      checkCudaErrors(cuDeviceCanAccessPeer(&canAccessPeerJI, 
+                                            devices[selectedDevices[j]], 
+                                            devices[i]));
+      checkCudaErrors(cuDeviceCanAccessPeer(&canAccessPeerIJ, 
+                                            devices[i], 
+                                            devices[selectedDevices[j]]));
      if (!canAccessPeerIJ || !canAccessPeerJI) {
        allPeers = false;
        break;
@@ -513,10 +515,10 @@ static void parentProcess(char *app) {
      // setup the peers for the device.  For systems that only allow 8
      // peers per GPU at a time, this acts to remove devices from CanAccessPeer
      for (int j = 0; j < nprocesses; j++) {
-        checkCudaErrors(cuCtxSetCurrent(ctxs[i]));
+        checkCudaErrors(cuCtxSetCurrent(ctxs.back()));
        checkCudaErrors(cuCtxEnablePeerAccess(ctxs[j], 0));
        checkCudaErrors(cuCtxSetCurrent(ctxs[j]));
-        checkCudaErrors(cuCtxEnablePeerAccess(ctxs[i], 0));
+        checkCudaErrors(cuCtxEnablePeerAccess(ctxs.back(), 0));
      }
      selectedDevices.push_back(i);
      nprocesses++;
--- a/Samples/4_CUDA_Libraries/conjugateGradient/main.cpp
+++ b/Samples/4_CUDA_Libraries/conjugateGradient/main.cpp
@@ -231,6 +231,10 @@ int main(int argc, char **argv) {
    }
  }

+  if (buffer) {
+    checkCudaErrors(cudaFree(buffer));
+  }
+  
  cusparseDestroy(cusparseHandle);
  cublasDestroy(cublasHandle);
  if (matA) {
--- a/Samples/4_CUDA_Libraries/cudaNvSci/README.md
+++ b/Samples/4_CUDA_Libraries/cudaNvSci/README.md
@@ -2,7 +2,7 @@

 ## Description

-This sample demonstrates CUDA-NvSciBuf/NvSciSync Interop. Two CPU threads import the NvSciBuf and NvSciSync into CUDA to perform two image processing algorithms on a ppm image - image rotation in 1st thread &amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp; rgba to grayscale conversion of rotated image in 2nd thread. Currently only supported on Ubuntu 18.04
+This sample demonstrates CUDA-NvSciBuf/NvSciSync Interop. Two CPU threads import the NvSciBuf and NvSciSync into CUDA to perform two image processing algorithms on a ppm image - image rotation in 1st thread &amp; rgba to grayscale conversion of rotated image in 2nd thread. Currently only supported on Ubuntu 18.04

 ## Key Concepts

--- a/Samples/5_Domain_Specific/Mandelbrot/CMakeLists.txt
+++ b/Samples/5_Domain_Specific/Mandelbrot/CMakeLists.txt
@@ -65,14 +65,14 @@ target_compile_features(Mandelbrot PRIVATE cxx_std_17 cuda_std_17)
                POST_BUILD
                COMMAND ${CMAKE_COMMAND} -E copy
                ${CMAKE_CURRENT_SOURCE_DIR}/../../../bin/win64/$<CONFIGURATION>/freeglut.dll
-                ${CMAKE_CURRENT_BINARY_DIR}
+                ${CMAKE_CURRENT_BINARY_DIR}/$<CONFIGURATION>
            )

            add_custom_command(TARGET Mandelbrot
                POST_BUILD
                COMMAND ${CMAKE_COMMAND} -E copy
                ${CMAKE_CURRENT_SOURCE_DIR}/../../../bin/win64/$<CONFIGURATION>/glew64.dll
-                ${CMAKE_CURRENT_BINARY_DIR}
+                ${CMAKE_CURRENT_BINARY_DIR}/$<CONFIGURATION>
            )
        endif()

--- a/Samples/6_Performance/transpose/transpose.cu
+++ b/Samples/6_Performance/transpose/transpose.cu
@@ -53,7 +53,7 @@ const char *sSDKsample = "Transpose";
 // TILE_DIM/BLOCK_ROWS elements.  TILE_DIM must be an integral multiple of
 // BLOCK_ROWS

-#define TILE_DIM 16
+#define TILE_DIM 32
 #define BLOCK_ROWS 16

 // This sample assumes that MATRIX_SIZE_X = MATRIX_SIZE_Y
Author	SHA1	Message	Date
Sinseok	6668105f27	Merge daaab7dd748980cd7454a1d613876473c15f9488 into 3e8f91d1a116060d3fedfe856f3721db970de030	2025-03-04 08:42:41 +08:00
XSShawnZeng	3e8f91d1a1	Several small bug fixes for Windows platforms * Enhancement for GLFW include and lib search * Fixing issue #321: A potential bug in memMapIPCDrv/memMapIpc.cpp * Update CMakelist.txt for the sample 0_Introduction/template * Copy .dll to correct dir for 5_Domain_Specific/Mandelbrot * Fix typo * Update changelog for cudaNvSciBufMultiplanar	2025-02-26 08:23:39 -08:00
Jonathan Bentz	f3b7c41ad6	cudaNvSci: Update README.md fixing typo (#337 ) Fixes #193	2025-02-21 09:21:43 -08:00
Jonathan Bentz	29fb758e62	conjugateGradient: Ensure allocated memory is freed (#336 ) Fixes #202	2025-02-21 09:20:53 -08:00
Jonathan Bentz	3bc08136ff	Update README.md link for sortingNetworks (#335 ) Fixes #302	2025-02-21 09:19:21 -08:00
Jonathan Bentz	85eefa06c4	boxFilter: Remove unused parameter (#338 ) Fixes: #122	2025-02-21 09:17:45 -08:00
XSShawnZeng	c357dd1e6b	Fixing issue #321 : A potential bug in memMapIPCDrv/memMapIpc.cpp (#334 )	2025-02-21 09:14:25 -08:00
Jonathan Bentz	efb46383e0	Transpose: Change TILE_DIM to 32 to fix bank conflicts Fixes #175	2025-02-20 15:46:44 -08:00
박신석	daaab7dd74	fix cuda box filter typo	2023-02-18 22:52:26 +09:00
박신석	24bd894ea4	fix cpu box filter typo	2023-02-18 21:19:41 +09:00