Mirror of https://github.com/NVIDIA/cuda-samples.git (synced 2025-04-10 18:22:11 +01:00)

Remove concurrentKernels

commit ff264a798f
parent 274836a1a2
@@ -4,6 +4,7 @@
* Removed the following outdated samples:
    * `0_Introduction`
        * `c++11_cuda` demonstrating CUDA and C++ 11 interoperability (reason: obsolete)
        * `concurrentKernels.cu` demonstrating the ability to run multiple kernels simultaneously (reason: obsolete)

### CUDA 12.5
@@ -1,18 +0,0 @@
{
    "configurations": [
        {
            "name": "Linux",
            "includePath": [
                "${workspaceFolder}/**",
                "${workspaceFolder}/../../../Common"
            ],
            "defines": [],
            "compilerPath": "/usr/local/cuda/bin/nvcc",
            "cStandard": "gnu17",
            "cppStandard": "gnu++14",
            "intelliSenseMode": "linux-gcc-x64",
            "configurationProvider": "ms-vscode.makefile-tools"
        }
    ],
    "version": 4
}
@@ -1,7 +0,0 @@
{
    "recommendations": [
        "nvidia.nsight-vscode-edition",
        "ms-vscode.cpptools",
        "ms-vscode.makefile-tools"
    ]
}
@@ -1,10 +0,0 @@
{
    "configurations": [
        {
            "name": "CUDA C++: Launch",
            "type": "cuda-gdb",
            "request": "launch",
            "program": "${workspaceFolder}/concurrentKernels"
        }
    ]
}
@@ -1,15 +0,0 @@
{
    "version": "2.0.0",
    "tasks": [
        {
            "label": "sample",
            "type": "shell",
            "command": "make dbg=1",
            "problemMatcher": ["$nvcc"],
            "group": {
                "kind": "build",
                "isDefault": true
            }
        }
    ]
}
@@ -1,357 +0,0 @@
################################################################################
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
#
# Makefile project only supported on Mac OS X and Linux Platforms)
#
################################################################################

# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda

##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
    $(info WARNING - x86_64 variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=x86_64 instead)
    TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
    $(info WARNING - ARMv7 variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=armv7l instead)
    TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
    $(info WARNING - aarch64 variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=aarch64 instead)
    TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
    $(info WARNING - ppc64le variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=ppc64le instead)
    TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
    $(info WARNING - GCC variable has been deprecated)
    $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
    HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
    $(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################

# architecture
HOST_ARCH   := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
    ifneq ($(TARGET_ARCH),$(HOST_ARCH))
        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
            TARGET_SIZE := 64
        else ifneq (,$(filter $(TARGET_ARCH),armv7l))
            TARGET_SIZE := 32
        endif
    else
        TARGET_SIZE := $(shell getconf LONG_BIT)
    endif
else
    $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif

# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
ifeq ($(HOST_ARCH),aarch64)
    ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
        HOST_ARCH := sbsa
        TARGET_ARCH := sbsa
    endif
endif

ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
        $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
    endif
endif

# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
    TARGET_ARCH = armv7l
endif

# operating system
HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
    $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif

# host compiler
ifdef HOST_COMPILER
    CUSTOM_HOST_COMPILER = 1
endif

ifeq ($(TARGET_OS),darwin)
    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
        HOST_COMPILER ?= clang++
    endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
        ifeq ($(TARGET_OS),linux)
            HOST_COMPILER ?= arm-linux-gnueabihf-g++
        else ifeq ($(TARGET_OS),qnx)
            ifeq ($(QNX_HOST),)
                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
            endif
            ifeq ($(QNX_TARGET),)
                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
            endif
            export QNX_HOST
            export QNX_TARGET
            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
        else ifeq ($(TARGET_OS),android)
            HOST_COMPILER ?= arm-linux-androideabi-g++
        endif
    else ifeq ($(TARGET_ARCH),aarch64)
        ifeq ($(TARGET_OS), linux)
            HOST_COMPILER ?= aarch64-linux-gnu-g++
        else ifeq ($(TARGET_OS),qnx)
            ifeq ($(QNX_HOST),)
                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
            endif
            ifeq ($(QNX_TARGET),)
                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
            endif
            export QNX_HOST
            export QNX_TARGET
            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
        else ifeq ($(TARGET_OS), android)
            HOST_COMPILER ?= aarch64-linux-android-clang++
        endif
    else ifeq ($(TARGET_ARCH),sbsa)
        HOST_COMPILER ?= aarch64-linux-gnu-g++
    else ifeq ($(TARGET_ARCH),ppc64le)
        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
    endif
endif
HOST_COMPILER ?= g++
NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)

# internal flags
NVCCFLAGS   := -m${TARGET_SIZE}
CCFLAGS     :=
LDFLAGS     :=

# build flags

# Link flag for customized HOST_COMPILER with gcc realpath
GCC_PATH := $(shell which gcc)
ifeq ($(CUSTOM_HOST_COMPILER),1)
    ifneq ($(filter /%,$(HOST_COMPILER)),)
        ifneq ($(findstring gcc,$(HOST_COMPILER)),)
            ifneq ($(GCC_PATH),$(HOST_COMPILER))
                LDFLAGS += -lstdc++
            endif
        endif
    endif
endif

ifeq ($(TARGET_OS),darwin)
    LDFLAGS += -rpath $(CUDA_PATH)/lib
    CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
    CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
    LDFLAGS += -pie
    CCFLAGS += -fpie -fpic -fexceptions
endif

ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
        ifneq ($(TARGET_FS),)
            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
            ifeq ($(GCCVERSIONLTEQ46),1)
                CCFLAGS += --sysroot=$(TARGET_FS)
            endif
            LDFLAGS += --sysroot=$(TARGET_FS)
            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
        endif
    endif
    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
        ifneq ($(TARGET_FS),)
            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
            ifeq ($(GCCVERSIONLTEQ46),1)
                CCFLAGS += --sysroot=$(TARGET_FS)
            endif
            LDFLAGS += --sysroot=$(TARGET_FS)
            LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
            LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
            CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
            CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
        endif
    endif
    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
        NVCCFLAGS += -D_QNX_SOURCE
        NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
        CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
        LDFLAGS += -lsocket
        LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
        CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
        ifdef TARGET_OVERRIDE
            LDFLAGS += -lslog2
        endif

        ifneq ($(TARGET_FS),)
            LDFLAGS += -L$(TARGET_FS)/usr/lib
            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
            LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
            CCFLAGS += -I$(TARGET_FS)/../include
        endif
    endif
endif

ifdef TARGET_OVERRIDE # cuda toolkit targets override
    NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif

# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
    CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
    CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
    CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
    CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif

# Debug build flags
ifeq ($(dbg),1)
    NVCCFLAGS += -g -G
    BUILD_TYPE := debug
else
    BUILD_TYPE := release
endif

ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))

SAMPLE_ENABLED := 1

ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))

# Common includes and paths for CUDA
INCLUDES  := -I../../../Common
LIBRARIES :=

################################################################################

# Gencode arguments
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
SMS ?= 53 61 70 72 75 80 86 87 90
else
SMS ?= 50 52 60 61 70 75 80 86 89 90
endif

ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
SAMPLE_ENABLED := 0
endif

ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))

# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif

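# Illustrative note (not in the original Makefile): if GENCODE_FLAGS is unset
# and SMS is trimmed to "50 60", the loop above expands GENCODE_FLAGS to roughly
#   -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60
#   -gencode arch=compute_60,code=compute_60
# i.e. SASS for every listed SM plus PTX for the highest one.
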
ALL_CCFLAGS += --threads 0 --std=c++11

ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif

################################################################################

# Target rules
all: build

build: concurrentKernels

check.deps:
ifeq ($(SAMPLE_ENABLED),0)
	@echo "Sample will be waived due to the above missing dependencies"
else
	@echo "Sample is ready - all dependencies have been met"
endif

concurrentKernels.o:concurrentKernels.cu
	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

concurrentKernels: concurrentKernels.o
	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
	$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
	$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)

run: build
	$(EXEC) ./concurrentKernels

testrun: build

clean:
	rm -f concurrentKernels concurrentKernels.o
	rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/concurrentKernels

clobber: clean
@@ -1,87 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
<entry>
  <name>concurrentKernels</name>
  <cuda_api_list>
    <toolkit>cudaStreamDestroy</toolkit>
    <toolkit>cudaMalloc</toolkit>
    <toolkit>cudaMemcpyAsync</toolkit>
    <toolkit>cudaFree</toolkit>
    <toolkit>cudaMallocHost</toolkit>
    <toolkit>cudaEventCreateWithFlags</toolkit>
    <toolkit>cudaEventSynchronize</toolkit>
    <toolkit>cudaEventRecord</toolkit>
    <toolkit>cudaFreeHost</toolkit>
    <toolkit>cudaGetDevice</toolkit>
    <toolkit>cudaStreamWaitEvent</toolkit>
    <toolkit>cudaEventDestroy</toolkit>
    <toolkit>cudaEventElapsedTime</toolkit>
    <toolkit>cudaStreamCreate</toolkit>
    <toolkit>cudaGetDeviceProperties</toolkit>
    <toolkit>cudaEventCreate</toolkit>
  </cuda_api_list>
  <description><![CDATA[This sample demonstrates the use of CUDA streams for concurrent execution of several kernels on GPU device. It also illustrates how to introduce dependencies between CUDA streams with the new cudaStreamWaitEvent function.]]></description>
  <devicecompilation>whole</devicecompilation>
  <includepaths>
    <path>./</path>
    <path>../</path>
    <path>../../../Common</path>
  </includepaths>
  <keyconcepts>
    <concept level="advanced">Performance Strategies</concept>
  </keyconcepts>
  <keywords>
    <keyword>CUDA</keyword>
    <keyword>Concurrent Kernels</keyword>
  </keywords>
  <libraries>
  </libraries>
  <librarypaths>
  </librarypaths>
  <nsight_eclipse>true</nsight_eclipse>
  <primary_file>concurrentKernels.cu</primary_file>
  <scopes>
    <scope>1:CUDA Advanced Topics</scope>
    <scope>1:Performance Strategies</scope>
  </scopes>
  <sm-arch>sm50</sm-arch>
  <sm-arch>sm52</sm-arch>
  <sm-arch>sm53</sm-arch>
  <sm-arch>sm60</sm-arch>
  <sm-arch>sm61</sm-arch>
  <sm-arch>sm70</sm-arch>
  <sm-arch>sm72</sm-arch>
  <sm-arch>sm75</sm-arch>
  <sm-arch>sm80</sm-arch>
  <sm-arch>sm86</sm-arch>
  <sm-arch>sm87</sm-arch>
  <sm-arch>sm89</sm-arch>
  <sm-arch>sm90</sm-arch>
  <supported_envs>
    <env>
      <arch>x86_64</arch>
      <platform>linux</platform>
    </env>
    <env>
      <platform>windows7</platform>
    </env>
    <env>
      <arch>x86_64</arch>
      <platform>macosx</platform>
    </env>
    <env>
      <arch>arm</arch>
    </env>
    <env>
      <arch>sbsa</arch>
    </env>
    <env>
      <arch>ppc64le</arch>
      <platform>linux</platform>
    </env>
  </supported_envs>
  <supported_sm_architectures>
    <include>all</include>
  </supported_sm_architectures>
  <title>Concurrent Kernels</title>
</entry>
@@ -1,70 +0,0 @@
# concurrentKernels - Concurrent Kernels

## Description

This sample demonstrates the use of CUDA streams for concurrent execution of several kernels on a GPU device. It also illustrates how to introduce dependencies between CUDA streams with the cudaStreamWaitEvent function.

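As a rough sketch of that pattern (illustrative only, not code from the sample; `kernelA` and `kernelB` are placeholder kernels), one stream can be made to wait on work queued in another:

```
#include <cuda_runtime.h>

// Two empty kernels stand in for real work.
__global__ void kernelA() {}
__global__ void kernelB() {}

int main() {
  cudaStream_t streamA, streamB;
  cudaEvent_t done;
  cudaStreamCreate(&streamA);
  cudaStreamCreate(&streamB);
  // Timing is disabled because the event is used purely for ordering.
  cudaEventCreateWithFlags(&done, cudaEventDisableTiming);

  kernelA<<<1, 1, 0, streamA>>>();        // queued in streamA
  cudaEventRecord(done, streamA);         // fires when streamA's work completes
  cudaStreamWaitEvent(streamB, done, 0);  // streamB stalls until the event fires
  kernelB<<<1, 1, 0, streamB>>>();        // therefore starts after kernelA

  cudaDeviceSynchronize();
  cudaEventDestroy(done);
  cudaStreamDestroy(streamA);
  cudaStreamDestroy(streamB);
  return 0;
}
```

The sample uses the same mechanism with one `cudaEventDisableTiming` event per kernel, so that its final summing stream waits for all of the clock-burning kernels launched in the other streams.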
## Key Concepts

Performance Strategies

## Supported SM Architectures

[SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 5.3 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus) [SM 8.9 ](https://developer.nvidia.com/cuda-gpus) [SM 9.0 ](https://developer.nvidia.com/cuda-gpus)

## Supported OSes

Linux, Windows

## Supported CPU Architecture

x86_64, ppc64le, armv7l

## CUDA APIs involved

### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
cudaStreamDestroy, cudaMalloc, cudaMemcpyAsync, cudaFree, cudaMallocHost, cudaEventCreateWithFlags, cudaEventSynchronize, cudaEventRecord, cudaFreeHost, cudaGetDevice, cudaStreamWaitEvent, cudaEventDestroy, cudaEventElapsedTime, cudaStreamCreate, cudaGetDeviceProperties, cudaEventCreate

## Prerequisites

Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

## Build and Run

### Windows
The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
```
*_vs<version>.sln - for Visual Studio <version>
```
Each individual sample has its own set of solution files in its directory:

To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.

### Linux
The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
```
$ cd <sample_dir>
$ make
```
The samples makefiles can take advantage of certain options:
*  **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l.
    By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
    `$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/>
    See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
*   **dbg=1** - build with debug symbols
    ```
    $ make dbg=1
    ```
*   **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
    ```
    $ make SMS="50 60"
    ```

*  **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
    ```
    $ make HOST_COMPILER=g++
    ```
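These options can also be combined in a single `make` invocation, for example (illustrative only):
```
$ make dbg=1 SMS="50 60"
```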

## References (for more details)
@@ -1,228 +0,0 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

//
// This sample demonstrates the use of streams for concurrent execution. It also
// illustrates how to introduce dependencies between CUDA streams with the
// cudaStreamWaitEvent function.
//

// Devices of compute capability 2.0 or higher can overlap the kernels
//
#include <cooperative_groups.h>
#include <stdio.h>

namespace cg = cooperative_groups;
#include <helper_cuda.h>
#include <helper_functions.h>

// This is a kernel that does no real work but runs at least for a specified
// number of clocks
__global__ void clock_block(clock_t *d_o, clock_t clock_count) {
  unsigned int start_clock = (unsigned int)clock();

  clock_t clock_offset = 0;

  while (clock_offset < clock_count) {
    unsigned int end_clock = (unsigned int)clock();

    // The code below should work like
    // this (thanks to modular arithmetics):
    //
    // clock_offset = (clock_t) (end_clock > start_clock ?
    //                           end_clock - start_clock :
    //                           end_clock + (0xffffffffu - start_clock));
    //
    // Indeed, let m = 2^32 then
    // end - start = end + m - start (mod m).

    clock_offset = (clock_t)(end_clock - start_clock);
  }

  d_o[0] = clock_offset;
}

// Single warp reduction kernel
__global__ void sum(clock_t *d_clocks, int N) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  __shared__ clock_t s_clocks[32];

  clock_t my_sum = 0;

  for (int i = threadIdx.x; i < N; i += blockDim.x) {
    my_sum += d_clocks[i];
  }

  s_clocks[threadIdx.x] = my_sum;
  cg::sync(cta);

  for (int i = 16; i > 0; i /= 2) {
    if (threadIdx.x < i) {
      s_clocks[threadIdx.x] += s_clocks[threadIdx.x + i];
    }

    cg::sync(cta);
  }

  d_clocks[0] = s_clocks[0];
}

int main(int argc, char **argv) {
  int nkernels = 8;             // number of concurrent kernels
  int nstreams = nkernels + 1;  // use one more stream than concurrent kernel
  int nbytes = nkernels * sizeof(clock_t);  // number of data bytes
  float kernel_time = 10;                   // time the kernel should run in ms
  float elapsed_time;                       // timing variables
  int cuda_device = 0;

  printf("[%s] - Starting...\n", argv[0]);

  // get number of kernels if overridden on the command line
  if (checkCmdLineFlag(argc, (const char **)argv, "nkernels")) {
    nkernels = getCmdLineArgumentInt(argc, (const char **)argv, "nkernels");
    nstreams = nkernels + 1;
  }

  // use command-line specified CUDA device, otherwise use device with highest
  // Gflops/s
  cuda_device = findCudaDevice(argc, (const char **)argv);

  cudaDeviceProp deviceProp;
  checkCudaErrors(cudaGetDevice(&cuda_device));

  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));

  if ((deviceProp.concurrentKernels == 0)) {
    printf("> GPU does not support concurrent kernel execution\n");
    printf(" CUDA kernel runs will be serialized\n");
  }

  printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
         deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);

  // allocate host memory
  clock_t *a = 0;  // pointer to the array data in host memory
  checkCudaErrors(cudaMallocHost((void **)&a, nbytes));

  // allocate device memory
  clock_t *d_a = 0;  // pointers to data and init value in the device memory
  checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));

  // allocate and initialize an array of stream handles
  cudaStream_t *streams =
      (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));

  for (int i = 0; i < nstreams; i++) {
    checkCudaErrors(cudaStreamCreate(&(streams[i])));
  }

  // create CUDA event handles
  cudaEvent_t start_event, stop_event;
  checkCudaErrors(cudaEventCreate(&start_event));
  checkCudaErrors(cudaEventCreate(&stop_event));

  // the events are used for synchronization only and hence do not need to
  // record timings this also makes events not introduce global sync points when
  // recorded which is critical to get overlap
  cudaEvent_t *kernelEvent;
  kernelEvent = (cudaEvent_t *)malloc(nkernels * sizeof(cudaEvent_t));

  for (int i = 0; i < nkernels; i++) {
    checkCudaErrors(
        cudaEventCreateWithFlags(&(kernelEvent[i]), cudaEventDisableTiming));
  }

  //////////////////////////////////////////////////////////////////////
  // time execution with nkernels streams
  clock_t total_clocks = 0;
#if defined(__arm__) || defined(__aarch64__)
  // the kernel takes more time than the channel reset time on arm archs, so to
  // prevent hangs reduce time_clocks.
  clock_t time_clocks = (clock_t)(kernel_time * (deviceProp.clockRate / 100));
#else
  clock_t time_clocks = (clock_t)(kernel_time * deviceProp.clockRate);
#endif

  cudaEventRecord(start_event, 0);

  // queue nkernels in separate streams and record when they are done
  for (int i = 0; i < nkernels; ++i) {
    clock_block<<<1, 1, 0, streams[i]>>>(&d_a[i], time_clocks);
    total_clocks += time_clocks;
    checkCudaErrors(cudaEventRecord(kernelEvent[i], streams[i]));

    // make the last stream wait for the kernel event to be recorded
    checkCudaErrors(
        cudaStreamWaitEvent(streams[nstreams - 1], kernelEvent[i], 0));
  }

  // queue a sum kernel and a copy back to host in the last stream.
  // the commands in this stream get dispatched as soon as all the kernel events
  // have been recorded
  sum<<<1, 32, 0, streams[nstreams - 1]>>>(d_a, nkernels);
  checkCudaErrors(cudaMemcpyAsync(
      a, d_a, sizeof(clock_t), cudaMemcpyDeviceToHost, streams[nstreams - 1]));

  // at this point the CPU has dispatched all work for the GPU and can continue
  // processing other tasks in parallel

  // in this sample we just wait until the GPU is done
  checkCudaErrors(cudaEventRecord(stop_event, 0));
  checkCudaErrors(cudaEventSynchronize(stop_event));
  checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));

  printf("Expected time for serial execution of %d kernels = %.3fs\n", nkernels,
         nkernels * kernel_time / 1000.0f);
  printf("Expected time for concurrent execution of %d kernels = %.3fs\n",
         nkernels, kernel_time / 1000.0f);
  printf("Measured time for sample = %.3fs\n", elapsed_time / 1000.0f);

  bool bTestResult = (a[0] > total_clocks);

  // release resources
  for (int i = 0; i < nkernels; i++) {
    cudaStreamDestroy(streams[i]);
    cudaEventDestroy(kernelEvent[i]);
  }

  free(streams);
  free(kernelEvent);

  cudaEventDestroy(start_event);
  cudaEventDestroy(stop_event);
  cudaFreeHost(a);
  cudaFree(d_a);

  if (!bTestResult) {
    printf("Test failed!\n");
    exit(EXIT_FAILURE);
  }

  printf("Test passed\n");
  exit(EXIT_SUCCESS);
}