/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "FDTD3dGPU.h"
|
|
|
|
#include <iostream>
|
|
#include <algorithm>
|
|
#include <helper_functions.h>
|
|
#include <helper_cuda.h>
|
|
|
|
#include "FDTD3dGPUKernel.cuh"
|
|
|
|
bool getTargetDeviceGlobalMemSize(memsize_t *result, const int argc,
                                  const char **argv) {
  int deviceCount = 0;
  int targetDevice = 0;
  size_t memsize = 0;

  // Get the number of CUDA enabled GPU devices
  printf(" cudaGetDeviceCount\n");
  checkCudaErrors(cudaGetDeviceCount(&deviceCount));

  // Select target device (device 0 by default)
  targetDevice = findCudaDevice(argc, (const char **)argv);

  // Query target device for maximum memory allocation
  printf(" cudaGetDeviceProperties\n");
  struct cudaDeviceProp deviceProp;
  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, targetDevice));

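  // Note: cudaDeviceProp::totalGlobalMem reports the device's total global
  // memory in bytes, so it can be stored directly into the memsize_t result.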
  memsize = deviceProp.totalGlobalMem;

  // Save the result
  *result = (memsize_t)memsize;
  return true;
}

bool fdtdGPU(float *output, const float *input, const float *coeff,
             const int dimx, const int dimy, const int dimz, const int radius,
             const int timesteps, const int argc, const char **argv) {
  const int outerDimx = dimx + 2 * radius;
  const int outerDimy = dimy + 2 * radius;
  const int outerDimz = dimz + 2 * radius;
  const size_t volumeSize = outerDimx * outerDimy * outerDimz;
  int deviceCount = 0;
  int targetDevice = 0;
  float *bufferOut = 0;
  float *bufferIn = 0;
  dim3 dimBlock;
  dim3 dimGrid;

  // Ensure that the inner data starts on a 128B boundary
  const int padding = (128 / sizeof(float)) - radius;
  const size_t paddedVolumeSize = volumeSize + padding;
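  // Worked example of the padding arithmetic (values for illustration only):
  // with sizeof(float) == 4, 128 / sizeof(float) == 32, so a radius of 4
  // gives a padding of 28 floats. The first interior (non-halo) element then
  // sits at offset padding + radius == 32 floats == 128 bytes, i.e. on a
  // 128-byte boundary.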

#ifdef GPU_PROFILING
  cudaEvent_t profileStart = 0;
  cudaEvent_t profileEnd = 0;
  const int profileTimesteps = timesteps - 1;

  if (profileTimesteps < 1) {
    printf(
        " cannot profile with fewer than two timesteps (timesteps=%d), "
        "profiling is disabled.\n",
        timesteps);
  }

#endif

  // Check the radius is valid
  if (radius != RADIUS) {
    printf("radius is invalid, must be %d - see kernel for details.\n", RADIUS);
    exit(EXIT_FAILURE);
  }

  // Get the number of CUDA enabled GPU devices
  checkCudaErrors(cudaGetDeviceCount(&deviceCount));

  // Select target device (device 0 by default)
  targetDevice = findCudaDevice(argc, (const char **)argv);

  checkCudaErrors(cudaSetDevice(targetDevice));

  // Allocate memory buffers
  checkCudaErrors(
      cudaMalloc((void **)&bufferOut, paddedVolumeSize * sizeof(float)));
  checkCudaErrors(
      cudaMalloc((void **)&bufferIn, paddedVolumeSize * sizeof(float)));

  // Check for a command-line specified block size
  int userBlockSize;

  if (checkCmdLineFlag(argc, (const char **)argv, "block-size")) {
    userBlockSize = getCmdLineArgumentInt(argc, argv, "block-size");
    // Constrain to a multiple of k_blockDimX
    userBlockSize = (userBlockSize / k_blockDimX * k_blockDimX);

    // Constrain within allowed bounds
    userBlockSize = MIN(MAX(userBlockSize, k_blockSizeMin), k_blockSizeMax);
  } else {
    userBlockSize = k_blockSizeMax;
  }
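  // Net effect of the clamping above (illustrative): the requested size is
  // rounded down to a multiple of k_blockDimX, then clamped to
  // [k_blockSizeMin, k_blockSizeMax]. For example, if k_blockDimX were 32, a
  // request of --block-size=100 would round down to 96 before clamping
  // (hypothetical values, chosen only to show the arithmetic).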

  // Check the device limit on the number of threads
  struct cudaFuncAttributes funcAttrib;
  checkCudaErrors(cudaFuncGetAttributes(&funcAttrib, FiniteDifferencesKernel));

  userBlockSize = MIN(userBlockSize, funcAttrib.maxThreadsPerBlock);

  // Set the block size
  dimBlock.x = k_blockDimX;
  // Visual Studio 2005 does not like std::min
  //    dimBlock.y = std::min<size_t>(userBlockSize / k_blockDimX,
  //    (size_t)k_blockDimMaxY);
  dimBlock.y = ((userBlockSize / k_blockDimX) < (size_t)k_blockDimMaxY)
                   ? (userBlockSize / k_blockDimX)
                   : (size_t)k_blockDimMaxY;
  dimGrid.x = (unsigned int)ceil((float)dimx / dimBlock.x);
  dimGrid.y = (unsigned int)ceil((float)dimy / dimBlock.y);
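  // The launch grid is 2D and covers only the dimx x dimy interior points;
  // FiniteDifferencesKernel is assumed to march through the z dimension
  // internally, one slice at a time, so no z component is set here.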
  printf(" set block size to %dx%d\n", dimBlock.x, dimBlock.y);
  printf(" set grid size to %dx%d\n", dimGrid.x, dimGrid.y);

  // Check the block size is valid
  if (dimBlock.x < RADIUS || dimBlock.y < RADIUS) {
    printf("invalid block size, x (%d) and y (%d) must be >= radius (%d).\n",
           dimBlock.x, dimBlock.y, RADIUS);
    exit(EXIT_FAILURE);
  }

  // Copy the input to the device input buffer
  checkCudaErrors(cudaMemcpy(bufferIn + padding, input,
                             volumeSize * sizeof(float),
                             cudaMemcpyHostToDevice));

  // Copy the input to the device output buffer (actually only need the halo)
  checkCudaErrors(cudaMemcpy(bufferOut + padding, input,
                             volumeSize * sizeof(float),
                             cudaMemcpyHostToDevice));

  // Copy the coefficients to the device coefficient buffer
  checkCudaErrors(
      cudaMemcpyToSymbol(stencil, (void *)coeff, (radius + 1) * sizeof(float)));
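  // Only radius + 1 coefficients are uploaded to constant memory: the stencil
  // is assumed to be symmetric, with coeff[0] applied to the centre point and
  // coeff[i] applied to both the +i and -i neighbours along each axis.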

#ifdef GPU_PROFILING

  // Create the events
  checkCudaErrors(cudaEventCreate(&profileStart));
  checkCudaErrors(cudaEventCreate(&profileEnd));

#endif

  // Execute the FDTD
  float *bufferSrc = bufferIn + padding;
  float *bufferDst = bufferOut + padding;
  printf(" GPU FDTD loop\n");

#ifdef GPU_PROFILING
  // Enqueue start event
  checkCudaErrors(cudaEventRecord(profileStart, 0));
#endif

  for (int it = 0; it < timesteps; it++) {
    printf("\tt = %d ", it);

    // Launch the kernel
    printf("launch kernel\n");
    FiniteDifferencesKernel<<<dimGrid, dimBlock>>>(bufferDst, bufferSrc, dimx,
                                                   dimy, dimz);

    // Toggle the buffers
    // Visual Studio 2005 does not like std::swap
    //    std::swap<float *>(bufferSrc, bufferDst);
    float *tmp = bufferDst;
    bufferDst = bufferSrc;
    bufferSrc = tmp;
  }
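  // After each timestep the two device buffers swap roles, so the volume just
  // written becomes the source for the next step. When the loop exits,
  // bufferSrc points at the most recently written data, which is why the
  // device-to-host copy below reads from bufferSrc rather than bufferDst.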

  printf("\n");

#ifdef GPU_PROFILING
  // Enqueue end event
  checkCudaErrors(cudaEventRecord(profileEnd, 0));
#endif

  // Wait for the kernel to complete
  checkCudaErrors(cudaDeviceSynchronize());

  // Read the result back, result is in bufferSrc (after final toggle)
  checkCudaErrors(cudaMemcpy(output, bufferSrc, volumeSize * sizeof(float),
                             cudaMemcpyDeviceToHost));

  // Report time
#ifdef GPU_PROFILING
  float elapsedTimeMS = 0;

  if (profileTimesteps > 0) {
    checkCudaErrors(
        cudaEventElapsedTime(&elapsedTimeMS, profileStart, profileEnd));
  }

  if (profileTimesteps > 0) {
    // Convert milliseconds to seconds
    double elapsedTime = elapsedTimeMS * 1.0e-3;
    double avgElapsedTime = elapsedTime / (double)profileTimesteps;
    // Determine number of computations per timestep
    size_t pointsComputed = dimx * dimy * dimz;
    // Determine throughput
    double throughputM = 1.0e-6 * (double)pointsComputed / avgElapsedTime;
    printf(
        "FDTD3d, Throughput = %.4f MPoints/s, Time = %.5f s, Size = %u Points, "
        "NumDevsUsed = %u, Blocksize = %u\n",
        throughputM, avgElapsedTime, pointsComputed, 1,
        dimBlock.x * dimBlock.y);
  }
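  // Illustrative sanity check of the throughput formula (hypothetical
  // figures): a 256x256x256 interior volume is 16,777,216 points; at 0.01 s
  // per timestep, throughputM = 1.0e-6 * 16777216 / 0.01 ~= 1678 MPoints/s.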

#endif

  // Cleanup
  if (bufferIn) {
    checkCudaErrors(cudaFree(bufferIn));
  }

  if (bufferOut) {
    checkCudaErrors(cudaFree(bufferOut));
  }

#ifdef GPU_PROFILING

  if (profileStart) {
    checkCudaErrors(cudaEventDestroy(profileStart));
  }

  if (profileEnd) {
    checkCudaErrors(cudaEventDestroy(profileEnd));
  }

#endif
  return true;
}