mirror of https://github.com/NVIDIA/cuda-samples.git
synced 2025-04-11 02:32:12 +01:00

Compare commits 9481f4fea7 ... 62fd85ab04 (21 commits):

62fd85ab04
f3b7c41ad6
29fb758e62
3bc08136ff
85eefa06c4
c357dd1e6b
efb46383e0
8d564d5e3a
37c5bcbef4
940a4c7a91
61bd39800d
8a96d2eee7
e762d58260
8fd1701744
94765c1597
c87881f02c
25400b6b3c
5748bf69da
22424227e7
42ff742bf5
8ccb13c6f0
```diff
@@ -241,7 +241,7 @@ inline int gpuGetMaxGflopsDeviceIdDRV() {
   }

   unsigned long long compute_perf =
-      (unsigned long long)(multiProcessorCount * sm_per_multiproc *
+      ((unsigned long long)multiProcessorCount * sm_per_multiproc *
       clockRate);

   if (compute_perf > max_compute_perf) {
```
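Why this one-cast change matters: in the removed line the product `multiProcessorCount * sm_per_multiproc * clockRate` is evaluated entirely in 32-bit `int` arithmetic and only the (possibly overflowed) result is widened; casting the first factor makes the whole multiplication 64-bit. A minimal standalone sketch, using hypothetical device values chosen to trigger the overflow:

```c
#include <stdio.h>

int main(void) {
    /* Hypothetical GPU: 80 SMs, 128 cores per SM, 1.5 GHz clock in kHz. */
    int multiProcessorCount = 80, sm_per_multiproc = 128, clockRate = 1500000;

    /* Old form: the int product overflows (signed overflow is undefined
       behavior) before the cast is applied; the true value, 15,360,000,000,
       is well above INT_MAX. */
    unsigned long long overflowed =
        (unsigned long long)(multiProcessorCount * sm_per_multiproc * clockRate);

    /* New form: widening the first factor promotes the whole expression
       to 64-bit arithmetic, so the product is computed correctly. */
    unsigned long long correct =
        (unsigned long long)multiProcessorCount * sm_per_multiproc * clockRate;

    printf("old: %llu  new: %llu\n", overflowed, correct);
    return 0;
}
```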
```diff
@@ -203,7 +203,7 @@ Vulkan is a low-overhead, cross-platform 3D graphics and compute API. Vulkan tar
 #### GLFW
 GLFW is a lightweight, open-source library designed for managing OpenGL, OpenGL ES, and Vulkan contexts. It simplifies the process of creating and managing windows, handling user input (keyboard, mouse, and joystick), and working with multiple monitors in a cross-platform manner.

-To set up GLFW on a Windows system, Download the pre-built binaries from [GLFW website](https://www.glfw.org/download.html) and extract the zip file into the folder, pass the GLFW include header as `-DGLFW_INCLUDE_DIR` for cmake configuring and follow the Build_instructions.txt in the sample folder to set up the t.
+To set up GLFW on a Windows system, download the pre-built binaries from the [GLFW website](https://www.glfw.org/download.html), extract the zip file into a folder, then pass the GLFW include folder as `-DGLFW_INCLUDE_DIR` and the lib folder as `-DGLFW_LIB_DIR` when configuring with CMake.

 #### OpenMP
```
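For illustration, a configure invocation using the two variables might look like this (the extraction path and lib-folder name are placeholders for whatever the downloaded GLFW package contains):

```
cmake .. -DGLFW_INCLUDE_DIR="C:/glfw-3.4.bin.WIN64/include" -DGLFW_LIB_DIR="C:/glfw-3.4.bin.WIN64/lib-vc2022"
```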
```diff
@@ -2,7 +2,7 @@

 ## Description

-This sample implements matrix multiplication and is exactly the same as Chapter 6 of the programming guide. It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for matrix multiplication. To illustrate GPU performance for matrix multiply, this sample also shows how to use the new CUDA 4.0 interface for CUBLAS to demonstrate high-performance performance for matrix multiplication.
+This sample implements matrix multiplication and is exactly the same as the second example of the [Shared Memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory) section of the programming guide. It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for matrix multiplication. To illustrate GPU performance for matrix multiply, this sample also shows how to use the CUDA 4.0+ interface for cuBLAS to demonstrate high performance for matrix multiplication.

 ## Key Concepts
```
```diff
@@ -2,7 +2,7 @@

 ## Description

-This sample implements matrix multiplication and is exactly the same as Chapter 6 of the programming guide. It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for matrix multiplication. To illustrate GPU performance for matrix multiply, this sample also shows how to use the new CUDA 4.0 interface for CUBLAS to demonstrate high-performance performance for matrix multiplication.
+This sample implements matrix multiplication and is exactly the same as the second example of the [Shared Memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory) section of the programming guide. It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for matrix multiplication. To illustrate GPU performance for matrix multiply, this sample also shows how to use the CUDA 4.0+ interface for cuBLAS to demonstrate high performance for matrix multiplication.

 ## Key Concepts
```
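For readers following the updated link, a minimal sketch of the shared-memory tiling idea both READMEs refer to (illustrative names; dimensions assumed to be multiples of the tile size, as in the samples):

```cuda
#define BLOCK_SIZE 16

// C = A * B for row-major matrices; wA and wB are the widths of A and B.
__global__ void matrixMulTiled(float *C, const float *A, const float *B,
                               int wA, int wB) {
  __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

  int row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
  int col = blockIdx.x * BLOCK_SIZE + threadIdx.x;
  float sum = 0.0f;

  // Slide a tile of A across its row band and a tile of B down its column
  // band; each tile is loaded into shared memory once and reused BLOCK_SIZE
  // times, which is the performance point the README is making.
  for (int t = 0; t < wA / BLOCK_SIZE; ++t) {
    As[threadIdx.y][threadIdx.x] = A[row * wA + t * BLOCK_SIZE + threadIdx.x];
    Bs[threadIdx.y][threadIdx.x] = B[(t * BLOCK_SIZE + threadIdx.y) * wB + col];
    __syncthreads();

    for (int k = 0; k < BLOCK_SIZE; ++k)
      sum += As[threadIdx.y][k] * Bs[k][threadIdx.x];
    __syncthreads();
  }
  C[row * wB + col] = sum;
}
```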
```diff
@@ -57,7 +57,7 @@ int main(int argc, char **argv) {
   // Get GPU information
   checkCudaErrors(cudaGetDevice(&devID));
   checkCudaErrors(cudaGetDeviceProperties(&props, devID));
-  printf("Device %d: \"%s\" with Compute %d.%d capability\n", devID, props.name,
+  printf("Device %d: \"%s\" with Compute capability %d.%d\n", devID, props.name,
          props.major, props.minor);

   printf("printf() is called. Output:\n\n");
```
```diff
@@ -11,5 +11,4 @@ This sample enumerates the properties of the CUDA devices present in the system.
 This sample enumerates the properties of the CUDA devices present using CUDA Driver API calls

 ### [topologyQuery](./topologyQuery)
-A simple exemple on how to query the topology of a system with multiple GPU
-
+A simple example of how to query the topology of a system with multiple GPUs
```
```diff
@@ -77,7 +77,6 @@ int filter_radius = 14;
 int nthreads = 64;
 unsigned int width, height;
 unsigned int *h_img = NULL;
-unsigned int *d_img = NULL;
 unsigned int *d_temp = NULL;

 GLuint pbo; // OpenGL pixel buffer object
```
```diff
@@ -108,11 +107,11 @@ extern "C" void computeGold(float *id, float *od, int w, int h, int n);
 // These are CUDA functions to handle allocation and launching the kernels
 extern "C" void initTexture(int width, int height, void *pImage, bool useRGBA);
 extern "C" void freeTextures();
-extern "C" double boxFilter(float *d_src, float *d_temp, float *d_dest,
+extern "C" double boxFilter(float *d_temp, float *d_dest,
                             int width, int height, int radius, int iterations,
                             int nthreads, StopWatchInterface *timer);

-extern "C" double boxFilterRGBA(unsigned int *d_src, unsigned int *d_temp,
+extern "C" double boxFilterRGBA(unsigned int *d_temp,
                                 unsigned int *d_dest, int width, int height,
                                 int radius, int iterations, int nthreads,
                                 StopWatchInterface *timer);
```
```diff
@@ -165,7 +164,7 @@ void display() {
     size_t num_bytes;
     checkCudaErrors(cudaGraphicsResourceGetMappedPointer(
         (void **)&d_result, &num_bytes, cuda_pbo_resource));
-    boxFilterRGBA(d_img, d_temp, d_result, width, height, filter_radius,
+    boxFilterRGBA(d_temp, d_result, width, height, filter_radius,
                   iterations, nthreads, kernel_timer);

     checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));
```
```diff
@@ -282,11 +281,7 @@ void reshape(int x, int y) {
 }

 void initCuda(bool useRGBA) {
   // allocate device memory
-  checkCudaErrors(
-      cudaMalloc((void **)&d_img, (width * height * sizeof(unsigned int))));
-  checkCudaErrors(
-      cudaMalloc((void **)&d_temp, (width * height * sizeof(unsigned int))));
+  checkCudaErrors(cudaMalloc((void **)&d_temp, (width * height * sizeof(unsigned int))));

   // Refer to boxFilter_kernel.cu for implementation
   initTexture(width, height, h_img, useRGBA);
```
```diff
@@ -304,11 +299,6 @@ void cleanup() {
     h_img = NULL;
   }

-  if (d_img) {
-    cudaFree(d_img);
-    d_img = NULL;
-  }
-
   if (d_temp) {
     cudaFree(d_temp);
     d_temp = NULL;
```
```diff
@@ -413,7 +403,7 @@ int runBenchmark() {
       cudaMalloc((void **)&d_result, width * height * sizeof(unsigned int)));

   // warm-up
-  boxFilterRGBA(d_img, d_temp, d_temp, width, height, filter_radius, iterations,
+  boxFilterRGBA(d_temp, d_temp, width, height, filter_radius, iterations,
                 nthreads, kernel_timer);
   checkCudaErrors(cudaDeviceSynchronize());

```
```diff
@@ -426,7 +416,7 @@ int runBenchmark() {

   for (int i = 0; i < iCycles; i++) {
     dProcessingTime +=
-        boxFilterRGBA(d_img, d_temp, d_img, width, height, filter_radius,
+        boxFilterRGBA(d_temp, d_temp, width, height, filter_radius,
                       iterations, nthreads, kernel_timer);
   }

```
```diff
@@ -469,7 +459,7 @@ int runSingleTest(char *ref_file, char *exec_path) {
   {
     printf("%s (radius=%d) (passes=%d) ", sSDKsample, filter_radius,
            iterations);
-    boxFilterRGBA(d_img, d_temp, d_result, width, height, filter_radius,
+    boxFilterRGBA(d_temp, d_result, width, height, filter_radius,
                   iterations, nthreads, kernel_timer);

     // check if kernel execution generated an error
```
```diff
@@ -399,7 +399,6 @@ extern "C" void freeTextures() {
     Perform 2D box filter on image using CUDA

     Parameters:
-    d_src  - pointer to input image in device memory
     d_temp - pointer to temporary storage in device memory
     d_dest - pointer to destination image in device memory
     width  - image width
```
```diff
@@ -408,7 +407,7 @@ extern "C" void freeTextures() {
     iterations - number of iterations

 */
-extern "C" double boxFilter(float *d_src, float *d_temp, float *d_dest,
+extern "C" double boxFilter(float *d_temp, float *d_dest,
                             int width, int height, int radius, int iterations,
                             int nthreads, StopWatchInterface *timer) {
   // var for kernel timing
```
```diff
@@ -447,7 +446,7 @@ extern "C" double boxFilter(float *d_src, float *d_temp, float *d_dest,
 }

 // RGBA version
-extern "C" double boxFilterRGBA(unsigned int *d_src, unsigned int *d_temp,
+extern "C" double boxFilterRGBA(unsigned int *d_temp,
                                 unsigned int *d_dest, int width, int height,
                                 int radius, int iterations, int nthreads,
                                 StopWatchInterface *timer) {
```
```diff
@@ -34,13 +34,12 @@
 #define _KERNELS_H_

 #include <stdio.h>
 #include <thrust/functional.h>

 #include "common.cuh"

 // Functors used with thrust library.
 template <typename Input>
-struct IsGreaterEqualThan : public thrust::unary_function<Input, bool>
+struct IsGreaterEqualThan
 {
   __host__ __device__ IsGreaterEqualThan(uint upperBound) :
     upperBound_(upperBound) {}
```
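Background on this change: `thrust::unary_function` (like `std::unary_function`, which C++17 removed) carried no behavior, only typedefs, and has been deprecated and dropped from recent Thrust releases; Thrust algorithms only need a callable `operator()`. A self-contained sketch of the resulting pattern, with a hypothetical driver:

```cpp
#include <thrust/count.h>
#include <thrust/device_vector.h>

// No base class required: Thrust only needs operator() to be callable
// on both host and device.
struct IsGreaterEqualThan {
  unsigned int upperBound_;
  __host__ __device__ explicit IsGreaterEqualThan(unsigned int upperBound)
      : upperBound_(upperBound) {}
  __host__ __device__ bool operator()(unsigned int x) const {
    return x >= upperBound_;
  }
};

int main() {
  unsigned int h[] = {1, 5, 10, 20};
  thrust::device_vector<unsigned int> v(h, h + 4);
  // Counts elements >= 10; expected result: 2.
  int n = thrust::count_if(v.begin(), v.end(), IsGreaterEqualThan(10));
  return n == 2 ? 0 : 1;
}
```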
```diff
@@ -2,7 +2,7 @@

 ## Description

-This sample implements bitonic sort and odd-even merge sort (also known as Batcher's sort), algorithms belonging to the class of sorting networks. While generally subefficient, for large sequences compared to algorithms with better asymptotic algorithmic complexity (i.e. merge sort or radix sort), this may be the preferred algorithms of choice for sorting batches of short-sized to mid-sized (key, value) array pairs. Refer to an excellent tutorial by H. W. Lang http://www.iti.fh-flensburg.de/lang/algorithmen/sortieren/networks/indexen.htm
+This sample implements bitonic sort and odd-even merge sort (also known as Batcher's sort), algorithms belonging to the class of sorting networks. While generally subefficient for large sequences compared to algorithms with better asymptotic complexity (e.g. merge sort or radix sort), sorting networks may be the algorithms of choice for sorting batches of short- to mid-sized (key, value) array pairs. Refer to an excellent tutorial by H. W. Lang: https://hwlang.de/algorithmen/sortieren/bitonic/bitonicen.htm

 ## Key Concepts
```
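For readers who skip the tutorial: a sorting network is a fixed, data-independent schedule of compare-exchange operations, which is exactly what makes it map well to GPUs. A tiny CPU illustration (the optimal 4-input network, not the sample's kernels):

```c
#include <stdio.h>

/* One comparator of a sorting network: enforce a[i] <= a[j]. */
static void compare_exchange(int a[], int i, int j) {
  if (a[i] > a[j]) { int t = a[i]; a[i] = a[j]; a[j] = t; }
}

int main(void) {
  int a[4] = {3, 1, 4, 2};
  /* The same five comparators sort ANY 4-element input: the comparison
     pattern never depends on the data, so it parallelizes trivially. */
  compare_exchange(a, 0, 1);
  compare_exchange(a, 2, 3);
  compare_exchange(a, 0, 2);
  compare_exchange(a, 1, 3);
  compare_exchange(a, 1, 2);
  printf("%d %d %d %d\n", a[0], a[1], a[2], a[3]); /* prints: 1 2 3 4 */
  return 0;
}
```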
```diff
@@ -31,7 +31,6 @@
 */

 #include <stdio.h>
-#include <string.h>
 #include <cstring>
 #include <iostream>
 #include "cuda.h"
```
```diff
@@ -293,6 +292,11 @@ static void memMapGetDeviceFunction(char **argv) {
                                    jitNumOptions, jitOptions,
                                    (void **)jitOptVals));
     printf("> PTX JIT log:\n%s\n", jitLogBuffer);
+
+    // Clean up dynamically allocated memory
+    delete[] jitOptions;
+    delete[] jitOptVals;
+    delete[] jitLogBuffer;
   } else {
     checkCudaErrors(cuModuleLoad(&cuModule, module_path.c_str()));
   }
```
```diff
@@ -379,7 +383,7 @@ static void childProcess(int devId, int id, char **argv) {
       // deterministic.
       barrierWait(&shm->barrier, &shm->sense, (unsigned int)procCount);
       if (id == 0) {
-        printf("Step %lld done\n", (unsigned long long)i);
+        printf("Step %llu done\n", (unsigned long long)i);
       }
     }

```
```diff
@@ -489,12 +493,14 @@ static void parentProcess(char *app) {
       continue;
     }

-    for (int j = 0; j < nprocesses; j++) {
+    for (int j = 0; j < selectedDevices.size(); j++) {
       int canAccessPeerIJ, canAccessPeerJI;
-      checkCudaErrors(
-          cuDeviceCanAccessPeer(&canAccessPeerJI, devices[j], devices[i]));
-      checkCudaErrors(
-          cuDeviceCanAccessPeer(&canAccessPeerIJ, devices[i], devices[j]));
+      checkCudaErrors(cuDeviceCanAccessPeer(&canAccessPeerJI,
+                                            devices[selectedDevices[j]],
+                                            devices[i]));
+      checkCudaErrors(cuDeviceCanAccessPeer(&canAccessPeerIJ,
+                                            devices[i],
+                                            devices[selectedDevices[j]]));
       if (!canAccessPeerIJ || !canAccessPeerJI) {
         allPeers = false;
         break;
```
```diff
@@ -509,10 +515,10 @@ static void parentProcess(char *app) {
       // setup the peers for the device. For systems that only allow 8
       // peers per GPU at a time, this acts to remove devices from CanAccessPeer
       for (int j = 0; j < nprocesses; j++) {
-        checkCudaErrors(cuCtxSetCurrent(ctxs[i]));
+        checkCudaErrors(cuCtxSetCurrent(ctxs.back()));
         checkCudaErrors(cuCtxEnablePeerAccess(ctxs[j], 0));
         checkCudaErrors(cuCtxSetCurrent(ctxs[j]));
-        checkCudaErrors(cuCtxEnablePeerAccess(ctxs[i], 0));
+        checkCudaErrors(cuCtxEnablePeerAccess(ctxs.back(), 0));
       }
       selectedDevices.push_back(i);
       nprocesses++;
```
```diff
@@ -550,7 +556,7 @@ static void parentProcess(char *app) {
   // Launch the child processes!
   for (i = 0; i < nprocesses; i++) {
     char devIdx[10];
-    char procIdx[10];
+    char procIdx[12];
     char *const args[] = {app, devIdx, procIdx, NULL};
     Process process;

```
```diff
@@ -231,6 +231,10 @@ int main(int argc, char **argv) {
     }
   }

+  if (buffer) {
+    checkCudaErrors(cudaFree(buffer));
+  }
+
   cusparseDestroy(cusparseHandle);
   cublasDestroy(cublasHandle);
   if (matA) {
```
```diff
@@ -2,7 +2,7 @@

 ## Description

-This sample demonstrates CUDA-NvSciBuf/NvSciSync Interop. Two CPU threads import the NvSciBuf and NvSciSync into CUDA to perform two image processing algorithms on a ppm image - image rotation in 1st thread &amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp;amp; rgba to grayscale conversion of rotated image in 2nd thread. Currently only supported on Ubuntu 18.04
+This sample demonstrates CUDA-NvSciBuf/NvSciSync Interop. Two CPU threads import the NvSciBuf and NvSciSync into CUDA to perform two image processing algorithms on a ppm image - image rotation in the 1st thread & RGBA-to-grayscale conversion of the rotated image in the 2nd thread. Currently only supported on Ubuntu 18.04

 ## Key Concepts
```
```diff
@@ -52,13 +52,14 @@ extern "C" void BlackScholesCall(real &callResult, TOptionData optionData);
 // Process single option on CPU
 // Note that CPU code is for correctness testing only and not for benchmarking.
 ////////////////////////////////////////////////////////////////////////////////
-extern "C" void binomialOptionsCPU(real &callResult, TOptionData optionData);
+extern "C" void binomialOptionsCPU(real &callResult, TOptionData optionData,
+                                   option_t option_type);

 ////////////////////////////////////////////////////////////////////////////////
 // Process an array of OptN options on GPU
 ////////////////////////////////////////////////////////////////////////////////
 extern "C" void binomialOptionsGPU(real *callValue, TOptionData *optionData,
-                                   int optN);
+                                   int optN, option_t option_type);

 ////////////////////////////////////////////////////////////////////////////////
 // Helper function, returning uniformly distributed
```
```diff
@@ -103,12 +104,14 @@ int main(int argc, char **argv) {
     BlackScholesCall(callValueBS[i], optionData[i]);
   }

-  printf("Running GPU binomial tree...\n");
+  option_t option_type = EU;
+
+  printf("Running GPU binomial tree (EU)...\n");
   checkCudaErrors(cudaDeviceSynchronize());
   sdkResetTimer(&hTimer);
   sdkStartTimer(&hTimer);

-  binomialOptionsGPU(callValueGPU, optionData, OPT_N);
+  binomialOptionsGPU(callValueGPU, optionData, OPT_N, option_type);

   checkCudaErrors(cudaDeviceSynchronize());
   sdkStopTimer(&hTimer);
```
```diff
@@ -118,13 +121,13 @@ int main(int argc, char **argv) {
   printf("binomialOptionsGPU() time: %f msec\n", gpuTime);
   printf("Options per second : %f \n", OPT_N / (gpuTime * 0.001));

-  printf("Running CPU binomial tree...\n");
+  printf("Running CPU binomial tree (EU)...\n");

   for (i = 0; i < OPT_N; i++) {
-    binomialOptionsCPU(callValueCPU[i], optionData[i]);
+    binomialOptionsCPU(callValueCPU[i], optionData[i], option_type);
   }

-  printf("Comparing the results...\n");
+  printf("Comparing the results (EU)...\n");
   sumDelta = 0;
   sumRef = 0;
   printf("GPU binomial vs. Black-Scholes\n");
```
```diff
@@ -170,6 +173,49 @@ int main(int argc, char **argv) {
     printf("Avg. diff: %E\n", (double)(sumDelta / (real)OPT_N));
   }

+  if (errorVal > 5e-4) {
+    printf("Test failed!\n");
+    exit(EXIT_FAILURE);
+  }
+
+  option_type = NA;
+
+  printf("\nRunning GPU binomial tree (NA)...\n");
+  checkCudaErrors(cudaDeviceSynchronize());
+  sdkResetTimer(&hTimer);
+  sdkStartTimer(&hTimer);
+
+  binomialOptionsGPU(callValueGPU, optionData, OPT_N, option_type);
+
+  checkCudaErrors(cudaDeviceSynchronize());
+  sdkStopTimer(&hTimer);
+  gpuTime = sdkGetTimerValue(&hTimer);
+  printf("Options count : %i \n", OPT_N);
+  printf("Time steps : %i \n", NUM_STEPS);
+  printf("binomialOptionsGPU() time: %f msec\n", gpuTime);
+  printf("Options per second : %f \n", OPT_N / (gpuTime * 0.001));
+
+  printf("Running CPU binomial tree (NA)...\n");
+
+  for (i = 0; i < OPT_N; i++) {
+    binomialOptionsCPU(callValueCPU[i], optionData[i], option_type);
+  }
+
+  printf("CPU binomial vs. GPU binomial\n");
+  sumDelta = 0;
+  sumRef = 0;
+
+  for (i = 0; i < OPT_N; i++) {
+    sumDelta += fabs(callValueGPU[i] - callValueCPU[i]);
+    sumRef += callValueCPU[i];
+  }
+
+  if (sumRef > 1E-5) {
+    printf("L1 norm: %E\n", errorVal = sumDelta / sumRef);
+  } else {
+    printf("Avg. diff: %E\n", (double)(sumDelta / (real)OPT_N));
+  }
+
   printf("Shutting down...\n");

   sdkDeleteTimer(&hTimer);
```
```diff
@@ -41,6 +41,15 @@ typedef struct {
   real V;
 } TOptionData;

+////////////////////////////////////////////////////////////////////////////////
+// Option types
+////////////////////////////////////////////////////////////////////////////////
+enum option_t
+{
+    NA = 0,
+    EU,
+};
+
 ////////////////////////////////////////////////////////////////////////////////
 // Global parameters
 ////////////////////////////////////////////////////////////////////////////////
```
```diff
@@ -78,7 +78,8 @@ static real expiryCallValue(real S, real X, real vDt, int i) {
   return (d > (real)0) ? d : (real)0;
 }

-extern "C" void binomialOptionsCPU(real &callResult, TOptionData optionData) {
+extern "C" void binomialOptionsCPU(real &callResult, TOptionData optionData,
+                                   option_t option_type) {
   static real Call[NUM_STEPS + 1];

   const real S = optionData.S;
```
```diff
@@ -112,9 +113,18 @@ extern "C" void binomialOptionsCPU(real &callResult, TOptionData optionData) {
   ////////////////////////////////////////////////////////////////////////
   // Walk backwards up binomial tree
   ////////////////////////////////////////////////////////////////////////
-  for (int i = NUM_STEPS; i > 0; i--)
-    for (int j = 0; j <= i - 1; j++)
-      Call[j] = puByDf * Call[j + 1] + pdByDf * Call[j];
+  for (int i = NUM_STEPS; i > 0; i--) {
+    for (int j = 0; j <= i - 1; j++) {
+      real continuation_value = puByDf * Call[j + 1] + pdByDf * Call[j];
+      if (option_type == NA) {
+        real fwd = S * exp((2 * j - i) * vDt);
+        real exercise_value = (fwd - X) > (real)0 ? (fwd - X) : (real)0;
+        Call[j] = exercise_value > continuation_value ? exercise_value : continuation_value;
+      } else if (option_type == EU) {
+        Call[j] = continuation_value;
+      }
+    }
+  }

   callResult = (real)Call[0];
 }
```
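Reading the added branch: at step i, node j, the code reconstructs the underlying price from its net up/down move count and takes the better of exercising now and continuing, which is standard binomial-lattice early exercise (the `NA` path behaves like an American-style option, `EU` keeps the original European backward induction). In the sample's own identifiers, with vDt = σ√Δt:

```latex
% Backward induction at step i, node j; (x)^+ denotes max(x, 0).
S_{i,j} = S\,e^{(2j-i)\,\mathrm{vDt}}, \qquad
\mathrm{Call}[j] \leftarrow
\begin{cases}
\max\!\bigl((S_{i,j}-X)^{+},\ \mathrm{puByDf}\cdot\mathrm{Call}[j{+}1]+\mathrm{pdByDf}\cdot\mathrm{Call}[j]\bigr) & \text{NA}\\
\mathrm{puByDf}\cdot\mathrm{Call}[j{+}1]+\mathrm{pdByDf}\cdot\mathrm{Call}[j] & \text{EU}
\end{cases}
```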
```diff
@@ -74,7 +74,7 @@ __device__ inline double expiryCallValue(double S, double X, double vDt,
 #error Bad constants
 #endif

-__global__ void binomialOptionsKernel() {
+__global__ void binomialOptionsKernel(option_t option_type) {
   // Handle to thread block group
   cg::thread_block cta = cg::this_thread_block();
   __shared__ real call_exchange[THREADBLOCK_SIZE + 1];
```
```diff
@@ -105,8 +105,20 @@ __global__ void binomialOptionsKernel() {

     if (i > final_it) {
 #pragma unroll
-      for (int j = 0; j < ELEMS_PER_THREAD; ++j)
-        call[j] = puByDf * call[j + 1] + pdByDf * call[j];
+      for (int j = 0; j < ELEMS_PER_THREAD; ++j) {
+        real continuation_value = puByDf * call[j + 1] + pdByDf * call[j];
+        if (option_type == NA) {
+#ifndef DOUBLE_PRECISION
+          real fwd = S * __expf(vDt * (2 * (tid * ELEMS_PER_THREAD + j) - i));
+#else
+          real fwd = S * exp(vDt * (2 * (tid * ELEMS_PER_THREAD + j) - i));
+#endif
+          real exercise_value = ((fwd - X) > (real)0) ? (fwd - X) : (real)0;
+          call[j] = exercise_value > continuation_value ? exercise_value : continuation_value;
+        } else if (option_type == EU) {
+          call[j] = continuation_value;
+        }
+      }
     }
   }

```
```diff
@@ -119,7 +131,7 @@ __global__ void binomialOptionsKernel() {
 // Host-side interface to GPU binomialOptions
 ////////////////////////////////////////////////////////////////////////////////
 extern "C" void binomialOptionsGPU(real *callValue, TOptionData *optionData,
-                                   int optN) {
+                                   int optN, option_t option_type) {
   __TOptionData h_OptionData[MAX_OPTIONS];

   for (int i = 0; i < optN; i++) {
```
```diff
@@ -150,7 +162,7 @@ extern "C" void binomialOptionsGPU(real *callValue, TOptionData *optionData,

   checkCudaErrors(cudaMemcpyToSymbol(d_OptionData, h_OptionData,
                                      optN * sizeof(__TOptionData)));
-  binomialOptionsKernel<<<optN, THREADBLOCK_SIZE>>>();
+  binomialOptionsKernel<<<optN, THREADBLOCK_SIZE>>>(option_type);
   getLastCudaError("binomialOptionsKernel() execution failed.\n");
   checkCudaErrors(
       cudaMemcpyFromSymbol(callValue, d_CallValue, optN * sizeof(real)));
```
```diff
@@ -416,8 +416,8 @@ void initMC(int argc, char **argv) {
     gridSizeLog2.x = n;
   }

-  if (checkCmdLineFlag(argc, (const char **)argv, "gridx")) {
-    n = getCmdLineArgumentInt(argc, (const char **)argv, "gridx");
+  if (checkCmdLineFlag(argc, (const char **)argv, "gridy")) {
+    n = getCmdLineArgumentInt(argc, (const char **)argv, "gridy");
     gridSizeLog2.y = n;
   }

```
```diff
@@ -20,16 +20,19 @@ include_directories(../../../Common)
 find_package(Vulkan)
 find_package(OpenGL)

 # Include the check_include_file macro
 include(CheckIncludeFile)

 # Check for the GLFW/glfw3.h header
 check_include_file("GLFW/glfw3.h" HAVE_GLFW3_H)

-# Find GLFW/glfw3.h header for Windows
+# Find GLFW header and lib for Windows
 if(WIN32)
-    find_file(GLFW3_H "glfw3.h" PATH "$ENV{GLFW_INCLUDES_DIR}/GLFW")
-    if(GLFW3_H)
+    find_file(GLFW3_H "GLFW/glfw3.h" PATH "${GLFW_INCLUDE_DIR}")
+    find_library(GLFW3_LIB "glfw3" PATH "${GLFW_LIB_DIR}")
+    if(GLFW3_H AND GLFW3_LIB)
+        message(STATUS "Found GLFW/glfw3.h and GLFW library.")
         set(HAVE_GLFW3_H 1)
     endif()
 endif()
```
|
||||
${Vulkan_INCLUDE_DIRS}
|
||||
${CUDAToolkit_INCLUDE_DIRS}
|
||||
)
|
||||
|
||||
target_link_libraries(simpleVulkan
|
||||
${Vulkan_LIBRARIES}
|
||||
OpenGL::GL
|
||||
)
|
||||
if(WIN32)
|
||||
target_include_directories(simpleVulkan PUBLIC
|
||||
${GLFW_INCLUDE_DIR}
|
||||
)
|
||||
target_link_libraries(simpleVulkan
|
||||
${Vulkan_LIBRARIES}
|
||||
OpenGL::GL
|
||||
glfw3.dll
|
||||
${GLFW3_LIB}
|
||||
)
|
||||
else()
|
||||
target_link_libraries(simpleVulkan
|
||||
${Vulkan_LIBRARIES}
|
||||
OpenGL::GL
|
||||
glfw
|
||||
)
|
||||
endif()
|
||||
|
||||
add_custom_command(TARGET simpleVulkan POST_BUILD
|
||||
COMMAND ${CMAKE_COMMAND} -E copy_if_different
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/sinewave.frag
|
||||
|
```diff
@@ -26,10 +26,12 @@ include(CheckIncludeFile)
 # Check for the GLFW/glfw3.h header
 check_include_file("GLFW/glfw3.h" HAVE_GLFW3_H)

-# Find GLFW/glfw3.h header for Windows
+# Find GLFW header and lib for Windows
 if(WIN32)
-    find_file(GLFW3_H "glfw3.h" PATH "$ENV{GLFW_INCLUDES_DIR}/GLFW")
-    if(GLFW3_H)
+    find_file(GLFW3_H "GLFW/glfw3.h" PATH "${GLFW_INCLUDE_DIR}")
+    find_library(GLFW3_LIB "glfw3" PATH "${GLFW_LIB_DIR}")
+    if(GLFW3_H AND GLFW3_LIB)
+        message(STATUS "Found GLFW/glfw3.h and GLFW library.")
         set(HAVE_GLFW3_H 1)
     endif()
 endif()
```
```diff
@@ -51,23 +53,23 @@ if(${Vulkan_FOUND})
         ${Vulkan_INCLUDE_DIRS}
         ${CUDAToolkit_INCLUDE_DIRS}
     )

-    target_link_libraries(simpleVulkanMMAP
-        ${Vulkan_LIBRARIES}
-        OpenGL::GL
-        CUDA::cuda_driver
-    )
     if(WIN32)
+        target_include_directories(simpleVulkanMMAP PUBLIC
+            ${GLFW_INCLUDE_DIR}
+        )
         target_link_libraries(simpleVulkanMMAP
             ${Vulkan_LIBRARIES}
             OpenGL::GL
             CUDA::cuda_driver
-            glfw3.dll
+            ${GLFW3_LIB}
         )
     else()
         target_link_libraries(simpleVulkanMMAP
             ${Vulkan_LIBRARIES}
             OpenGL::GL
             CUDA::cuda_driver
             glfw
         )
     endif()

     add_custom_command(TARGET simpleVulkanMMAP POST_BUILD
         COMMAND ${CMAKE_COMMAND} -E copy_if_different
             ${CMAKE_CURRENT_SOURCE_DIR}/montecarlo.frag
```
```diff
@@ -71,7 +71,7 @@ if(${OpenGL_FOUND})
         POST_BUILD
         COMMAND ${CMAKE_COMMAND} -E copy
             ${CMAKE_CURRENT_SOURCE_DIR}/../../../bin/win64/$<CONFIGURATION>/glew64.dll
-            ${CMAKE_CURRENT_BINARY_DIR}
+            ${CMAKE_CURRENT_BINARY_DIR}/$<CONFIGURATION>
     )
 endif()

```
```diff
@@ -26,10 +26,12 @@ include(CheckIncludeFile)
 # Check for the GLFW/glfw3.h header
 check_include_file("GLFW/glfw3.h" HAVE_GLFW3_H)

-# Find GLFW/glfw3.h header for Windows
+# Find GLFW header and lib for Windows
 if(WIN32)
-    find_file(GLFW3_H "glfw3.h" PATH "$ENV{GLFW_INCLUDES_DIR}/GLFW")
-    if(GLFW3_H)
+    find_file(GLFW3_H "GLFW/glfw3.h" PATH "${GLFW_INCLUDE_DIR}")
+    find_library(GLFW3_LIB "glfw3" PATH "${GLFW_LIB_DIR}")
+    if(GLFW3_H AND GLFW3_LIB)
+        message(STATUS "Found GLFW/glfw3.h and GLFW library.")
         set(HAVE_GLFW3_H 1)
     endif()
 endif()
```
```diff
@@ -51,21 +53,22 @@ if(${Vulkan_FOUND})
         ${Vulkan_INCLUDE_DIRS}
         ${CUDAToolkit_INCLUDE_DIRS}
     )

-    target_link_libraries(vulkanImageCUDA
-        ${Vulkan_LIBRARIES}
-        OpenGL::GL
-    )
     if(WIN32)
+        target_include_directories(vulkanImageCUDA PUBLIC
+            ${GLFW_INCLUDE_DIR}
+        )
         target_link_libraries(vulkanImageCUDA
             ${Vulkan_LIBRARIES}
             OpenGL::GL
-            glfw3.dll
+            ${GLFW3_LIB}
         )
     else()
         target_link_libraries(vulkanImageCUDA
             ${Vulkan_LIBRARIES}
             OpenGL::GL
             glfw
         )
     endif()

     add_custom_command(TARGET vulkanImageCUDA POST_BUILD
         COMMAND ${CMAKE_COMMAND} -E copy_if_different
             ${CMAKE_CURRENT_SOURCE_DIR}/shader.frag
```
```diff
@@ -53,7 +53,7 @@ const char *sSDKsample = "Transpose";
 // TILE_DIM/BLOCK_ROWS elements. TILE_DIM must be an integral multiple of
 // BLOCK_ROWS

-#define TILE_DIM 16
+#define TILE_DIM 32
 #define BLOCK_ROWS 16

 // This sample assumes that MATRIX_SIZE_X = MATRIX_SIZE_Y
```
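For context on how those two constants interact, a generic sketch of the coalesced shared-memory transpose pattern they feed (illustrative, assuming width and height are multiples of TILE_DIM; each thread handles TILE_DIM/BLOCK_ROWS elements, hence the divisibility requirement in the comment):

```cuda
__global__ void transposeCoalesced(float *odata, const float *idata,
                                   int width, int height) {
  __shared__ float tile[TILE_DIM][TILE_DIM + 1];  // +1 pad avoids bank conflicts

  int x = blockIdx.x * TILE_DIM + threadIdx.x;
  int y = blockIdx.y * TILE_DIM + threadIdx.y;
  // Each thread loads TILE_DIM / BLOCK_ROWS elements, BLOCK_ROWS rows apart.
  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
    tile[threadIdx.y + j][threadIdx.x] = idata[(y + j) * width + x];
  __syncthreads();

  // Write the transposed tile; both the read above and the write below are
  // coalesced because consecutive threads touch consecutive addresses.
  x = blockIdx.y * TILE_DIM + threadIdx.x;
  y = blockIdx.x * TILE_DIM + threadIdx.y;
  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
    odata[(y + j) * height + x] = tile[threadIdx.x][threadIdx.y + j];
}
```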