mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2025-04-10 18:22:11 +01:00
Update StreamPriorities, bf16TensorCoreGemm, binaryPartitionCG
This commit is contained in:
parent
df5ff58ca4
commit
dd73281bc6
@ -1,6 +1,6 @@
|
||||
add_subdirectory(EGLStream_CUDA_CrossGPU)
|
||||
add_subdirectory(EGLStream_CUDA_Interop)
|
||||
add_subdirectory(EGLSync_CUDAEvent_Interop)
|
||||
#add_subdirectory(EGLSync_CUDAEvent_Interop)
|
||||
add_subdirectory(FunctionPointers)
|
||||
add_subdirectory(MC_EstimatePiInlineP)
|
||||
add_subdirectory(MC_EstimatePiInlineQ)
|
||||
|
24
Samples/3_CUDA_Features/CMakeLists.txt
Normal file
24
Samples/3_CUDA_Features/CMakeLists.txt
Normal file
@ -0,0 +1,24 @@
|
||||
# Sample subdirectories of this directory that take part in the CMake build.
# Each active add_subdirectory() call makes CMake process the CMakeLists.txt
# found in the named subdirectory. Only the first three samples are active;
# every other entry below is commented out, so CMake does not descend into it.
# NOTE(review): the disabled entries presumably await their own CMakeLists.txt -
# confirm before enabling any of them.
add_subdirectory(StreamPriorities)
|
||||
add_subdirectory(bf16TensorCoreGemm)
|
||||
add_subdirectory(binaryPartitionCG)
|
||||
#add_subdirectory(bindlessTexture)
|
||||
#add_subdirectory(cdpAdvancedQuicksort)
|
||||
#add_subdirectory(cdpBezierTessellation)
|
||||
#add_subdirectory(cdpQuadtree)
|
||||
#add_subdirectory(cdpSimplePrint)
|
||||
#add_subdirectory(cdpSimpleQuicksort)
|
||||
#add_subdirectory(cudaCompressibleMemory)
|
||||
#add_subdirectory(cudaTensorCoreGemm)
|
||||
#add_subdirectory(dmmaTensorCoreGemm)
|
||||
#add_subdirectory(globalToShmemAsyncCopy)
|
||||
#add_subdirectory(graphConditionalNodes)
|
||||
#add_subdirectory(graphMemoryFootprint)
|
||||
#add_subdirectory(graphMemoryNodes)
|
||||
#add_subdirectory(immaTensorCoreGemm)
|
||||
#add_subdirectory(jacobiCudaGraphs)
|
||||
#add_subdirectory(memMapIPCDrv)
|
||||
#add_subdirectory(newdelete)
|
||||
#add_subdirectory(ptxjit)
|
||||
#add_subdirectory(simpleCudaGraphs)
|
||||
#add_subdirectory(tf32TensorCoreGemm)
|
||||
#add_subdirectory(warpAggregatedAtomicsCG)
|
11
Samples/3_CUDA_Features/StreamPriorities/CMakeLists.txt
Normal file
11
Samples/3_CUDA_Features/StreamPriorities/CMakeLists.txt
Normal file
@ -0,0 +1,11 @@
|
||||
# Build the StreamPriorities sample executable from its single CUDA source.
add_executable(StreamPriorities
    StreamPriorities.cu
)

# Add the repository-level Common directory (three levels above this file) to
# the include path of this target only. Anchoring the path on
# CMAKE_CURRENT_SOURCE_DIR keeps it independent of where CMake is invoked, and
# PRIVATE scope keeps it from leaking to other targets in this directory.
target_include_directories(StreamPriorities
    PRIVATE
        "${CMAKE_CURRENT_SOURCE_DIR}/../../../Common"
)

# Enable separate compilation of device code for this target.
set_target_properties(StreamPriorities PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
|
@ -1,377 +0,0 @@
|
||||
################################################################################
|
||||
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions
|
||||
# are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
################################################################################
|
||||
#
|
||||
# Makefile project (only supported on Mac OS X and Linux platforms)
|
||||
#
|
||||
################################################################################
|
||||
|
||||
# Location of the CUDA Toolkit
|
||||
CUDA_PATH ?= /usr/local/cuda
|
||||
|
||||
##############################
|
||||
# start deprecated interface #
|
||||
##############################
|
||||
ifeq ($(x86_64),1)
|
||||
$(info WARNING - x86_64 variable has been deprecated)
|
||||
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
|
||||
TARGET_ARCH ?= x86_64
|
||||
endif
|
||||
ifeq ($(ARMv7),1)
|
||||
$(info WARNING - ARMv7 variable has been deprecated)
|
||||
$(info WARNING - please use TARGET_ARCH=armv7l instead)
|
||||
TARGET_ARCH ?= armv7l
|
||||
endif
|
||||
ifeq ($(aarch64),1)
|
||||
$(info WARNING - aarch64 variable has been deprecated)
|
||||
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
|
||||
TARGET_ARCH ?= aarch64
|
||||
endif
|
||||
ifeq ($(ppc64le),1)
|
||||
$(info WARNING - ppc64le variable has been deprecated)
|
||||
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
|
||||
TARGET_ARCH ?= ppc64le
|
||||
endif
|
||||
ifneq ($(GCC),)
|
||||
$(info WARNING - GCC variable has been deprecated)
|
||||
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
|
||||
HOST_COMPILER ?= $(GCC)
|
||||
endif
|
||||
ifneq ($(abi),)
|
||||
$(error ERROR - abi variable has been removed)
|
||||
endif
|
||||
############################
|
||||
# end deprecated interface #
|
||||
############################
|
||||
|
||||
# architecture
|
||||
HOST_ARCH := $(shell uname -m)
|
||||
TARGET_ARCH ?= $(HOST_ARCH)
|
||||
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
|
||||
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
|
||||
TARGET_SIZE := 64
|
||||
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
|
||||
TARGET_SIZE := 32
|
||||
endif
|
||||
else
|
||||
TARGET_SIZE := $(shell getconf LONG_BIT)
|
||||
endif
|
||||
else
|
||||
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
|
||||
endif
|
||||
|
||||
# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
|
||||
ifeq ($(HOST_ARCH),aarch64)
|
||||
ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
|
||||
HOST_ARCH := sbsa
|
||||
TARGET_ARCH := sbsa
|
||||
endif
|
||||
endif
|
||||
|
||||
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
|
||||
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
|
||||
endif
|
||||
endif
|
||||
|
||||
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
|
||||
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
|
||||
TARGET_ARCH = armv7l
|
||||
endif
|
||||
|
||||
# operating system
|
||||
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
|
||||
TARGET_OS ?= $(HOST_OS)
|
||||
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
|
||||
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
|
||||
endif
|
||||
|
||||
# host compiler
|
||||
ifdef HOST_COMPILER
|
||||
CUSTOM_HOST_COMPILER = 1
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET_OS),darwin)
|
||||
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
|
||||
HOST_COMPILER ?= clang++
|
||||
endif
|
||||
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
|
||||
ifeq ($(TARGET_OS),linux)
|
||||
HOST_COMPILER ?= arm-linux-gnueabihf-g++
|
||||
else ifeq ($(TARGET_OS),qnx)
|
||||
ifeq ($(QNX_HOST),)
|
||||
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
|
||||
endif
|
||||
ifeq ($(QNX_TARGET),)
|
||||
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
|
||||
endif
|
||||
export QNX_HOST
|
||||
export QNX_TARGET
|
||||
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
|
||||
else ifeq ($(TARGET_OS),android)
|
||||
HOST_COMPILER ?= arm-linux-androideabi-g++
|
||||
endif
|
||||
else ifeq ($(TARGET_ARCH),aarch64)
|
||||
ifeq ($(TARGET_OS), linux)
|
||||
HOST_COMPILER ?= aarch64-linux-gnu-g++
|
||||
else ifeq ($(TARGET_OS),qnx)
|
||||
ifeq ($(QNX_HOST),)
|
||||
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
|
||||
endif
|
||||
ifeq ($(QNX_TARGET),)
|
||||
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
|
||||
endif
|
||||
export QNX_HOST
|
||||
export QNX_TARGET
|
||||
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
|
||||
else ifeq ($(TARGET_OS), android)
|
||||
HOST_COMPILER ?= aarch64-linux-android-clang++
|
||||
endif
|
||||
else ifeq ($(TARGET_ARCH),sbsa)
|
||||
HOST_COMPILER ?= aarch64-linux-gnu-g++
|
||||
else ifeq ($(TARGET_ARCH),ppc64le)
|
||||
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
|
||||
endif
|
||||
endif
|
||||
HOST_COMPILER ?= g++
|
||||
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
|
||||
|
||||
# internal flags
|
||||
NVCCFLAGS := -m${TARGET_SIZE}
|
||||
CCFLAGS :=
|
||||
LDFLAGS :=
|
||||
|
||||
# build flags
|
||||
|
||||
# Link flag for customized HOST_COMPILER with gcc realpath
|
||||
GCC_PATH := $(shell which gcc)
|
||||
ifeq ($(CUSTOM_HOST_COMPILER),1)
|
||||
ifneq ($(filter /%,$(HOST_COMPILER)),)
|
||||
ifneq ($(findstring gcc,$(HOST_COMPILER)),)
|
||||
ifneq ($(GCC_PATH),$(HOST_COMPILER))
|
||||
LDFLAGS += -lstdc++
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET_OS),darwin)
|
||||
LDFLAGS += -rpath $(CUDA_PATH)/lib
|
||||
CCFLAGS += -arch $(HOST_ARCH)
|
||||
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
|
||||
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
|
||||
CCFLAGS += -mfloat-abi=hard
|
||||
else ifeq ($(TARGET_OS),android)
|
||||
LDFLAGS += -pie
|
||||
CCFLAGS += -fpie -fpic -fexceptions
|
||||
endif
|
||||
|
||||
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
|
||||
ifneq ($(TARGET_FS),)
|
||||
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
|
||||
ifeq ($(GCCVERSIONLTEQ46),1)
|
||||
CCFLAGS += --sysroot=$(TARGET_FS)
|
||||
endif
|
||||
LDFLAGS += --sysroot=$(TARGET_FS)
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
|
||||
endif
|
||||
endif
|
||||
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
|
||||
ifneq ($(TARGET_FS),)
|
||||
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
|
||||
ifeq ($(GCCVERSIONLTEQ46),1)
|
||||
CCFLAGS += --sysroot=$(TARGET_FS)
|
||||
endif
|
||||
LDFLAGS += --sysroot=$(TARGET_FS)
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
|
||||
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
|
||||
CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
|
||||
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
|
||||
endif
|
||||
endif
|
||||
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
|
||||
NVCCFLAGS += -D_QNX_SOURCE
|
||||
NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
|
||||
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
|
||||
LDFLAGS += -lsocket
|
||||
LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
|
||||
CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
|
||||
ifdef TARGET_OVERRIDE
|
||||
LDFLAGS += -lslog2
|
||||
endif
|
||||
|
||||
ifneq ($(TARGET_FS),)
|
||||
LDFLAGS += -L$(TARGET_FS)/usr/lib
|
||||
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
|
||||
LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
|
||||
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
|
||||
CCFLAGS += -I$(TARGET_FS)/../include
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef TARGET_OVERRIDE # cuda toolkit targets override
|
||||
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
|
||||
endif
|
||||
|
||||
# Install directory of different arch
|
||||
CUDA_INSTALL_TARGET_DIR :=
|
||||
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
|
||||
else ifeq ($(TARGET_ARCH),ppc64le)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
|
||||
endif
|
||||
|
||||
# Debug build flags
|
||||
ifeq ($(dbg),1)
|
||||
NVCCFLAGS += -g -G
|
||||
BUILD_TYPE := debug
|
||||
else
|
||||
BUILD_TYPE := release
|
||||
endif
|
||||
|
||||
ALL_CCFLAGS :=
|
||||
ALL_CCFLAGS += $(NVCCFLAGS)
|
||||
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
|
||||
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
|
||||
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
|
||||
|
||||
SAMPLE_ENABLED := 1
|
||||
|
||||
# This sample is not supported on ARMv7
|
||||
ifeq ($(TARGET_ARCH),armv7l)
|
||||
$(info >>> WARNING - StreamPriorities is not supported on ARMv7 - waiving sample <<<)
|
||||
SAMPLE_ENABLED := 0
|
||||
endif
|
||||
|
||||
# This sample is not supported on aarch64
|
||||
ifeq ($(TARGET_ARCH),aarch64)
|
||||
$(info >>> WARNING - StreamPriorities is not supported on aarch64 - waiving sample <<<)
|
||||
SAMPLE_ENABLED := 0
|
||||
endif
|
||||
|
||||
# This sample is not supported on sbsa
|
||||
ifeq ($(TARGET_ARCH),sbsa)
|
||||
$(info >>> WARNING - StreamPriorities is not supported on sbsa - waiving sample <<<)
|
||||
SAMPLE_ENABLED := 0
|
||||
endif
|
||||
|
||||
ALL_LDFLAGS :=
|
||||
ALL_LDFLAGS += $(ALL_CCFLAGS)
|
||||
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
|
||||
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
|
||||
|
||||
# Common includes and paths for CUDA
|
||||
INCLUDES := -I../../../Common
|
||||
LIBRARIES :=
|
||||
|
||||
################################################################################
|
||||
|
||||
# Gencode arguments
|
||||
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
|
||||
SMS ?= 53 61 70 72 75 80 86 87 90
|
||||
else
|
||||
SMS ?= 50 52 60 61 70 75 80 86 89 90
|
||||
endif
|
||||
|
||||
ifeq ($(SMS),)
|
||||
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
||||
SAMPLE_ENABLED := 0
|
||||
endif
|
||||
|
||||
# Derive GENCODE_FLAGS from $(SMS) unless the caller already supplied a value.
ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))

# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility.
# GNU Make's $(sort) orders words lexically, so a three-digit entry such as
# "100" would sort before "90" and the wrong architecture would be picked.
# Select the maximum numerically instead; an empty $(SMS) still yields an
# empty HIGHEST_SM, so the guard below keeps working.
HIGHEST_SM := $(shell printf '%s\n' $(SMS) | sort -n | tail -n 1)
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
|
||||
|
||||
ALL_CCFLAGS += --threads 0 --std=c++11
|
||||
|
||||
LIBRARIES += -lcudadevrt
|
||||
|
||||
ifeq ($(SAMPLE_ENABLED),0)
|
||||
EXEC ?= @echo "[@]"
|
||||
endif
|
||||
|
||||
################################################################################
|
||||
|
||||
# Target rules
|
||||
# None of these targets produces a file with its own name. Declaring them phony
# keeps them runnable even if a file called e.g. "clean" or "build" exists in
# the sample directory.
.PHONY: all build check.deps run testrun clean clobber

# Default goal: build the sample.
all: build

build: StreamPriorities

# Report whether the sample was waived by the checks earlier in this Makefile.
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
	@echo "Sample will be waived due to the above missing dependencies"
else
	@echo "Sample is ready - all dependencies have been met"
endif

# Compile the CUDA source into an object file.
StreamPriorities.o:StreamPriorities.cu
	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

# Link the executable, then copy it into the shared bin/<arch>/<os>/<build type> tree.
StreamPriorities: StreamPriorities.o
	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
	$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
	$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)

# Build, then execute the sample from the current directory.
run: build
	$(EXEC) ./StreamPriorities

# testrun has no recipe of its own; it only ensures the sample is built.
testrun: build

# Remove the local build outputs and the copy placed in the shared bin tree.
clean:
	rm -f StreamPriorities StreamPriorities.o
	rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/StreamPriorities

clobber: clean
|
@ -1,63 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
|
||||
<entry>
|
||||
<name>StreamPriorities</name>
|
||||
<cuda_api_list>
|
||||
<toolkit>cudaMemcpy</toolkit>
|
||||
<toolkit>cudaStreamCreateWithPriority</toolkit>
|
||||
<toolkit>cudaDeviceGetStreamPriorityRange</toolkit>
|
||||
<toolkit>cudaEventSynchronize</toolkit>
|
||||
<toolkit>cudaEventRecord</toolkit>
|
||||
<toolkit>cudaMalloc</toolkit>
|
||||
<toolkit>cudaEventElapsedTime</toolkit>
|
||||
<toolkit>cudaGetDeviceProperties</toolkit>
|
||||
<toolkit>cudaEventCreate</toolkit>
|
||||
</cuda_api_list>
|
||||
<description><![CDATA[This sample demonstrates basic use of stream priorities.]]></description>
|
||||
<devicecompilation>whole</devicecompilation>
|
||||
<includepaths>
|
||||
<path>./</path>
|
||||
<path>../</path>
|
||||
<path>../../../Common</path>
|
||||
</includepaths>
|
||||
<keyconcepts>
|
||||
<concept level="advanced">CUDA Streams and Events</concept>
|
||||
</keyconcepts>
|
||||
<keywords>
|
||||
<keyword>GPGPU</keyword>
|
||||
<keyword>Streams</keyword>
|
||||
</keywords>
|
||||
<libraries>
|
||||
<library>cudadevrt</library>
|
||||
</libraries>
|
||||
<librarypaths>
|
||||
</librarypaths>
|
||||
<nsight_eclipse>true</nsight_eclipse>
|
||||
<primary_file>StreamPriorities.cu</primary_file>
|
||||
<required_dependencies>
|
||||
<dependency>Stream-Priorities</dependency>
|
||||
</required_dependencies>
|
||||
<scopes>
|
||||
<scope>1:CUDA Advanced Topics</scope>
|
||||
<scope>1:Streams</scope>
|
||||
</scopes>
|
||||
<supported_envs>
|
||||
<env>
|
||||
<arch>x86_64</arch>
|
||||
<platform>linux</platform>
|
||||
</env>
|
||||
<env>
|
||||
<arch>ppc64le</arch>
|
||||
<platform>linux</platform>
|
||||
</env>
|
||||
<env>
|
||||
<arch>x86_64</arch>
|
||||
<platform>macosx</platform>
|
||||
</env>
|
||||
</supported_envs>
|
||||
<supported_sm_architectures>
|
||||
<from>3.5</from>
|
||||
</supported_sm_architectures>
|
||||
<title>Stream Priorities</title>
|
||||
<type>exe</type>
|
||||
</entry>
|
11
Samples/3_CUDA_Features/bf16TensorCoreGemm/CMakeLists.txt
Normal file
11
Samples/3_CUDA_Features/bf16TensorCoreGemm/CMakeLists.txt
Normal file
@ -0,0 +1,11 @@
|
||||
# Build the bf16TensorCoreGemm sample executable from its single CUDA source.
add_executable(bf16TensorCoreGemm
    bf16TensorCoreGemm.cu
)

# Add the repository-level Common directory (three levels above this file) to
# the include path of this target only. Anchoring the path on
# CMAKE_CURRENT_SOURCE_DIR keeps it independent of where CMake is invoked, and
# PRIVATE scope keeps it from leaking to other targets in this directory.
target_include_directories(bf16TensorCoreGemm
    PRIVATE
        "${CMAKE_CURRENT_SOURCE_DIR}/../../../Common"
)

# Enable separate compilation of device code for this target.
set_target_properties(bf16TensorCoreGemm PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
|
@ -1,403 +0,0 @@
|
||||
################################################################################
|
||||
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions
|
||||
# are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
################################################################################
|
||||
#
|
||||
# Makefile project (only supported on Mac OS X and Linux platforms)
|
||||
#
|
||||
################################################################################
|
||||
|
||||
# Location of the CUDA Toolkit
|
||||
CUDA_PATH ?= /usr/local/cuda
|
||||
|
||||
##############################
|
||||
# start deprecated interface #
|
||||
##############################
|
||||
ifeq ($(x86_64),1)
|
||||
$(info WARNING - x86_64 variable has been deprecated)
|
||||
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
|
||||
TARGET_ARCH ?= x86_64
|
||||
endif
|
||||
ifeq ($(ARMv7),1)
|
||||
$(info WARNING - ARMv7 variable has been deprecated)
|
||||
$(info WARNING - please use TARGET_ARCH=armv7l instead)
|
||||
TARGET_ARCH ?= armv7l
|
||||
endif
|
||||
ifeq ($(aarch64),1)
|
||||
$(info WARNING - aarch64 variable has been deprecated)
|
||||
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
|
||||
TARGET_ARCH ?= aarch64
|
||||
endif
|
||||
ifeq ($(ppc64le),1)
|
||||
$(info WARNING - ppc64le variable has been deprecated)
|
||||
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
|
||||
TARGET_ARCH ?= ppc64le
|
||||
endif
|
||||
ifneq ($(GCC),)
|
||||
$(info WARNING - GCC variable has been deprecated)
|
||||
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
|
||||
HOST_COMPILER ?= $(GCC)
|
||||
endif
|
||||
ifneq ($(abi),)
|
||||
$(error ERROR - abi variable has been removed)
|
||||
endif
|
||||
############################
|
||||
# end deprecated interface #
|
||||
############################
|
||||
|
||||
# architecture
|
||||
HOST_ARCH := $(shell uname -m)
|
||||
TARGET_ARCH ?= $(HOST_ARCH)
|
||||
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
|
||||
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
|
||||
TARGET_SIZE := 64
|
||||
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
|
||||
TARGET_SIZE := 32
|
||||
endif
|
||||
else
|
||||
TARGET_SIZE := $(shell getconf LONG_BIT)
|
||||
endif
|
||||
else
|
||||
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
|
||||
endif
|
||||
|
||||
# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
|
||||
ifeq ($(HOST_ARCH),aarch64)
|
||||
ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
|
||||
HOST_ARCH := sbsa
|
||||
TARGET_ARCH := sbsa
|
||||
endif
|
||||
endif
|
||||
|
||||
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
|
||||
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
|
||||
endif
|
||||
endif
|
||||
|
||||
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
|
||||
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
|
||||
TARGET_ARCH = armv7l
|
||||
endif
|
||||
|
||||
# operating system
|
||||
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
|
||||
TARGET_OS ?= $(HOST_OS)
|
||||
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
|
||||
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
|
||||
endif
|
||||
|
||||
# host compiler
|
||||
ifdef HOST_COMPILER
|
||||
CUSTOM_HOST_COMPILER = 1
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET_OS),darwin)
|
||||
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
|
||||
HOST_COMPILER ?= clang++
|
||||
endif
|
||||
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
|
||||
ifeq ($(TARGET_OS),linux)
|
||||
HOST_COMPILER ?= arm-linux-gnueabihf-g++
|
||||
else ifeq ($(TARGET_OS),qnx)
|
||||
ifeq ($(QNX_HOST),)
|
||||
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
|
||||
endif
|
||||
ifeq ($(QNX_TARGET),)
|
||||
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
|
||||
endif
|
||||
export QNX_HOST
|
||||
export QNX_TARGET
|
||||
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
|
||||
else ifeq ($(TARGET_OS),android)
|
||||
HOST_COMPILER ?= arm-linux-androideabi-g++
|
||||
endif
|
||||
else ifeq ($(TARGET_ARCH),aarch64)
|
||||
ifeq ($(TARGET_OS), linux)
|
||||
HOST_COMPILER ?= aarch64-linux-gnu-g++
|
||||
else ifeq ($(TARGET_OS),qnx)
|
||||
ifeq ($(QNX_HOST),)
|
||||
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
|
||||
endif
|
||||
ifeq ($(QNX_TARGET),)
|
||||
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
|
||||
endif
|
||||
export QNX_HOST
|
||||
export QNX_TARGET
|
||||
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
|
||||
else ifeq ($(TARGET_OS), android)
|
||||
HOST_COMPILER ?= aarch64-linux-android-clang++
|
||||
endif
|
||||
else ifeq ($(TARGET_ARCH),sbsa)
|
||||
HOST_COMPILER ?= aarch64-linux-gnu-g++
|
||||
else ifeq ($(TARGET_ARCH),ppc64le)
|
||||
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
|
||||
endif
|
||||
endif
|
||||
HOST_COMPILER ?= g++
|
||||
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
|
||||
|
||||
# internal flags
|
||||
NVCCFLAGS := -m${TARGET_SIZE}
|
||||
CCFLAGS :=
|
||||
LDFLAGS :=
|
||||
|
||||
# build flags
|
||||
|
||||
# Link flag for customized HOST_COMPILER with gcc realpath
|
||||
GCC_PATH := $(shell which gcc)
|
||||
ifeq ($(CUSTOM_HOST_COMPILER),1)
|
||||
ifneq ($(filter /%,$(HOST_COMPILER)),)
|
||||
ifneq ($(findstring gcc,$(HOST_COMPILER)),)
|
||||
ifneq ($(GCC_PATH),$(HOST_COMPILER))
|
||||
LDFLAGS += -lstdc++
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET_OS),darwin)
|
||||
LDFLAGS += -rpath $(CUDA_PATH)/lib
|
||||
CCFLAGS += -arch $(HOST_ARCH)
|
||||
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
|
||||
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
|
||||
CCFLAGS += -mfloat-abi=hard
|
||||
else ifeq ($(TARGET_OS),android)
|
||||
LDFLAGS += -pie
|
||||
CCFLAGS += -fpie -fpic -fexceptions
|
||||
endif
|
||||
|
||||
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
|
||||
ifneq ($(TARGET_FS),)
|
||||
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
|
||||
ifeq ($(GCCVERSIONLTEQ46),1)
|
||||
CCFLAGS += --sysroot=$(TARGET_FS)
|
||||
endif
|
||||
LDFLAGS += --sysroot=$(TARGET_FS)
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
|
||||
endif
|
||||
endif
|
||||
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
|
||||
ifneq ($(TARGET_FS),)
|
||||
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
|
||||
ifeq ($(GCCVERSIONLTEQ46),1)
|
||||
CCFLAGS += --sysroot=$(TARGET_FS)
|
||||
endif
|
||||
LDFLAGS += --sysroot=$(TARGET_FS)
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
|
||||
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
|
||||
CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
|
||||
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
|
||||
endif
|
||||
endif
|
||||
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
|
||||
NVCCFLAGS += -D_QNX_SOURCE
|
||||
NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
|
||||
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
|
||||
LDFLAGS += -lsocket
|
||||
LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
|
||||
CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
|
||||
ifdef TARGET_OVERRIDE
|
||||
LDFLAGS += -lslog2
|
||||
endif
|
||||
|
||||
ifneq ($(TARGET_FS),)
|
||||
LDFLAGS += -L$(TARGET_FS)/usr/lib
|
||||
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
|
||||
LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
|
||||
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
|
||||
CCFLAGS += -I$(TARGET_FS)/../include
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef TARGET_OVERRIDE # cuda toolkit targets override
|
||||
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
|
||||
endif
|
||||
|
||||
# Install directory of different arch
|
||||
CUDA_INSTALL_TARGET_DIR :=
|
||||
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
|
||||
else ifeq ($(TARGET_ARCH),ppc64le)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
|
||||
endif
|
||||
|
||||
# Debug build flags
|
||||
ifeq ($(dbg),1)
|
||||
NVCCFLAGS += -g -G
|
||||
BUILD_TYPE := debug
|
||||
else
|
||||
BUILD_TYPE := release
|
||||
endif
|
||||
|
||||
ALL_CCFLAGS :=
|
||||
ALL_CCFLAGS += $(NVCCFLAGS)
|
||||
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
|
||||
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
|
||||
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
|
||||
|
||||
SAMPLE_ENABLED := 1
|
||||
|
||||
# This sample is not supported on Mac OSX
|
||||
ifeq ($(TARGET_OS),darwin)
|
||||
$(info >>> WARNING - bf16TensorCoreGemm is not supported on Mac OSX - waiving sample <<<)
|
||||
SAMPLE_ENABLED := 0
|
||||
endif
|
||||
|
||||
# This sample is not supported on ARMv7
|
||||
ifeq ($(TARGET_ARCH),armv7l)
|
||||
$(info >>> WARNING - bf16TensorCoreGemm is not supported on ARMv7 - waiving sample <<<)
|
||||
SAMPLE_ENABLED := 0
|
||||
endif
|
||||
|
||||
# This sample is not supported on QNX
|
||||
ifeq ($(TARGET_OS),qnx)
|
||||
$(info >>> WARNING - bf16TensorCoreGemm is not supported on QNX - waiving sample <<<)
|
||||
SAMPLE_ENABLED := 0
|
||||
endif
|
||||
|
||||
ALL_LDFLAGS :=
|
||||
ALL_LDFLAGS += $(ALL_CCFLAGS)
|
||||
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
|
||||
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
|
||||
|
||||
# Common includes and paths for CUDA
|
||||
INCLUDES := -I../../../Common
|
||||
LIBRARIES :=
|
||||
|
||||
################################################################################
|
||||
|
||||
#Detect if installed version of GCC supports required C++11
|
||||
ifeq ($(TARGET_OS),linux)
|
||||
empty :=
|
||||
space := $(empty) $(empty)
|
||||
GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`)
|
||||
#Create version number without "."
|
||||
GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.)
|
||||
GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.)
|
||||
GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.)
|
||||
# Make sure the version number has at least 3 decimals
|
||||
GCCVERSION += 00
|
||||
# Remove spaces from the version number
|
||||
GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION))
|
||||
#$(warning $(GCCVERSION))
|
||||
|
||||
IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 51000)
|
||||
ifneq ($(CUSTOM_HOST_COMPILER), 1)
|
||||
ifeq ($(IS_MIN_VERSION), 1)
|
||||
$(info >>> GCC Version is greater or equal to 5.1.0 <<<)
|
||||
else
|
||||
$(info >>> Waiving build. Minimum GCC version required is 5.1.0<<<)
|
||||
SAMPLE_ENABLED := 0
|
||||
endif
|
||||
else
|
||||
$(warning >>> Custom HOST_COMPILER set; skipping GCC version check. This may lead to unintended behavior. Please note the minimum equivalent GCC version is 5.1.0 <<<)
|
||||
endif
|
||||
endif
|
||||
|
||||
# Gencode arguments
|
||||
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
|
||||
SMS ?= 80 86 87 90
|
||||
else
|
||||
SMS ?= 80 86 89 90
|
||||
endif
|
||||
|
||||
ifeq ($(SMS),)
|
||||
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
||||
SAMPLE_ENABLED := 0
|
||||
endif
|
||||
|
||||
ifeq ($(GENCODE_FLAGS),)
|
||||
# Generate SASS code for each SM architecture listed in $(SMS)
|
||||
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
|
||||
|
||||
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
|
||||
HIGHEST_SM := $(lastword $(sort $(SMS)))
|
||||
ifneq ($(HIGHEST_SM),)
|
||||
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
|
||||
endif
|
||||
endif
|
||||
|
||||
ALL_CCFLAGS += --std=c++11 --threads 0
|
||||
|
||||
ifeq ($(SAMPLE_ENABLED),0)
|
||||
EXEC ?= @echo "[@]"
|
||||
endif
|
||||
|
||||
################################################################################
|
||||
|
||||
# Target rules
|
||||
all: build
|
||||
|
||||
build: bf16TensorCoreGemm
|
||||
|
||||
check.deps:
|
||||
ifeq ($(SAMPLE_ENABLED),0)
|
||||
@echo "Sample will be waived due to the above missing dependencies"
|
||||
else
|
||||
@echo "Sample is ready - all dependencies have been met"
|
||||
endif
|
||||
|
||||
bf16TensorCoreGemm.o:bf16TensorCoreGemm.cu
|
||||
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
|
||||
|
||||
bf16TensorCoreGemm: bf16TensorCoreGemm.o
|
||||
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
|
||||
$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
|
||||
$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
|
||||
|
||||
run: build
|
||||
$(EXEC) ./bf16TensorCoreGemm
|
||||
|
||||
testrun: build
|
||||
|
||||
clean:
|
||||
rm -f bf16TensorCoreGemm bf16TensorCoreGemm.o
|
||||
rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/bf16TensorCoreGemm
|
||||
|
||||
clobber: clean
|
@ -1,81 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
|
||||
<entry>
|
||||
<name>bf16TensorCoreGemm</name>
|
||||
<cflags>
|
||||
<flag>--std=c++11</flag>
|
||||
</cflags>
|
||||
<cuda_api_list>
|
||||
<toolkit>cudaMemcpy</toolkit>
|
||||
<toolkit>cudaFree</toolkit>
|
||||
<toolkit>cudaGetErrorString</toolkit>
|
||||
<toolkit>cudaGetLastError</toolkit>
|
||||
<toolkit>cudaEventSynchronize</toolkit>
|
||||
<toolkit>cudaFuncSetAttribute</toolkit>
|
||||
<toolkit>cudaEventRecord</toolkit>
|
||||
<toolkit>cudaMemset</toolkit>
|
||||
<toolkit>cudaMalloc</toolkit>
|
||||
<toolkit>cudaEventElapsedTime</toolkit>
|
||||
<toolkit>cudaGetDeviceProperties</toolkit>
|
||||
<toolkit>cudaEventCreate</toolkit>
|
||||
</cuda_api_list>
|
||||
<description><![CDATA[A CUDA sample demonstrating __nv_bfloat16 (e8m7) GEMM computation using the Warp Matrix Multiply and Accumulate (WMMA) API introduced with CUDA 11 in Ampere chip family tensor cores for faster matrix operations. This sample also uses async copy provided by cuda pipeline interface for gmem to shmem async loads which improves kernel performance and reduces register pressure.]]></description>
|
||||
<devicecompilation>whole</devicecompilation>
|
||||
<includepaths>
|
||||
<path>./</path>
|
||||
<path>../</path>
|
||||
<path>../../../Common</path>
|
||||
</includepaths>
|
||||
<keyconcepts>
|
||||
<concept level="basic">Matrix Multiply</concept>
|
||||
<concept level="advanced">WMMA</concept>
|
||||
<concept level="advanced">Tensor Cores</concept>
|
||||
</keyconcepts>
|
||||
<keywords>
|
||||
<keyword>matrix multiply</keyword>
|
||||
<keyword>Async copy</keyword>
|
||||
<keyword>CPP11</keyword>
|
||||
<keyword>GCC 5.1.0</keyword>
|
||||
</keywords>
|
||||
<libraries>
|
||||
</libraries>
|
||||
<librarypaths>
|
||||
</librarypaths>
|
||||
<nsight_eclipse>true</nsight_eclipse>
|
||||
<primary_file>bf16TensorCoreGemm.cu</primary_file>
|
||||
<required_dependencies>
|
||||
<dependency>CPP11</dependency>
|
||||
</required_dependencies>
|
||||
<scopes>
|
||||
<scope>1:CUDA Basic Topics</scope>
|
||||
</scopes>
|
||||
<sm-arch>sm80</sm-arch>
|
||||
<sm-arch>sm86</sm-arch>
|
||||
<sm-arch>sm87</sm-arch>
|
||||
<sm-arch>sm89</sm-arch>
|
||||
<sm-arch>sm90</sm-arch>
|
||||
<supported_envs>
|
||||
<env>
|
||||
<arch>x86_64</arch>
|
||||
<platform>linux</platform>
|
||||
</env>
|
||||
<env>
|
||||
<arch>aarch64</arch>
|
||||
</env>
|
||||
<env>
|
||||
<arch>sbsa</arch>
|
||||
</env>
|
||||
<env>
|
||||
<platform>windows7</platform>
|
||||
</env>
|
||||
<env>
|
||||
<arch>ppc64le</arch>
|
||||
<platform>linux</platform>
|
||||
</env>
|
||||
</supported_envs>
|
||||
<supported_sm_architectures>
|
||||
<from>8.0</from>
|
||||
</supported_sm_architectures>
|
||||
<title>bfloat16 Tensor Core GEMM</title>
|
||||
<type>exe</type>
|
||||
</entry>
|
11
Samples/3_CUDA_Features/binaryPartitionCG/CMakeLists.txt
Normal file
11
Samples/3_CUDA_Features/binaryPartitionCG/CMakeLists.txt
Normal file
@ -0,0 +1,11 @@
|
||||
# Include directories and libraries
|
||||
include_directories(../../../Common)
|
||||
|
||||
# Source file
|
||||
set(SRC_FILES
|
||||
binaryPartitionCG.cu
|
||||
)
|
||||
|
||||
# Add target for binaryPartitionCG
|
||||
add_executable(binaryPartitionCG ${SRC_FILES})
|
||||
set_target_properties(binaryPartitionCG PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
|
@ -1,391 +0,0 @@
|
||||
################################################################################
|
||||
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions
|
||||
# are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
################################################################################
|
||||
#
|
||||
# Makefile project only supported on Mac OS X and Linux Platforms
|
||||
#
|
||||
################################################################################
|
||||
|
||||
# Location of the CUDA Toolkit
|
||||
CUDA_PATH ?= /usr/local/cuda
|
||||
|
||||
##############################
|
||||
# start deprecated interface #
|
||||
##############################
|
||||
ifeq ($(x86_64),1)
|
||||
$(info WARNING - x86_64 variable has been deprecated)
|
||||
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
|
||||
TARGET_ARCH ?= x86_64
|
||||
endif
|
||||
ifeq ($(ARMv7),1)
|
||||
$(info WARNING - ARMv7 variable has been deprecated)
|
||||
$(info WARNING - please use TARGET_ARCH=armv7l instead)
|
||||
TARGET_ARCH ?= armv7l
|
||||
endif
|
||||
ifeq ($(aarch64),1)
|
||||
$(info WARNING - aarch64 variable has been deprecated)
|
||||
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
|
||||
TARGET_ARCH ?= aarch64
|
||||
endif
|
||||
ifeq ($(ppc64le),1)
|
||||
$(info WARNING - ppc64le variable has been deprecated)
|
||||
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
|
||||
TARGET_ARCH ?= ppc64le
|
||||
endif
|
||||
ifneq ($(GCC),)
|
||||
$(info WARNING - GCC variable has been deprecated)
|
||||
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
|
||||
HOST_COMPILER ?= $(GCC)
|
||||
endif
|
||||
ifneq ($(abi),)
|
||||
$(error ERROR - abi variable has been removed)
|
||||
endif
|
||||
############################
|
||||
# end deprecated interface #
|
||||
############################
|
||||
|
||||
# architecture
|
||||
HOST_ARCH := $(shell uname -m)
|
||||
TARGET_ARCH ?= $(HOST_ARCH)
|
||||
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
|
||||
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
|
||||
TARGET_SIZE := 64
|
||||
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
|
||||
TARGET_SIZE := 32
|
||||
endif
|
||||
else
|
||||
TARGET_SIZE := $(shell getconf LONG_BIT)
|
||||
endif
|
||||
else
|
||||
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
|
||||
endif
|
||||
|
||||
# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
|
||||
ifeq ($(HOST_ARCH),aarch64)
|
||||
ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
|
||||
HOST_ARCH := sbsa
|
||||
TARGET_ARCH := sbsa
|
||||
endif
|
||||
endif
|
||||
|
||||
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
|
||||
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
|
||||
endif
|
||||
endif
|
||||
|
||||
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
|
||||
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
|
||||
TARGET_ARCH = armv7l
|
||||
endif
|
||||
|
||||
# operating system
|
||||
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
|
||||
TARGET_OS ?= $(HOST_OS)
|
||||
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
|
||||
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
|
||||
endif
|
||||
|
||||
# host compiler
|
||||
ifdef HOST_COMPILER
|
||||
CUSTOM_HOST_COMPILER = 1
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET_OS),darwin)
|
||||
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
|
||||
HOST_COMPILER ?= clang++
|
||||
endif
|
||||
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
|
||||
ifeq ($(TARGET_OS),linux)
|
||||
HOST_COMPILER ?= arm-linux-gnueabihf-g++
|
||||
else ifeq ($(TARGET_OS),qnx)
|
||||
ifeq ($(QNX_HOST),)
|
||||
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
|
||||
endif
|
||||
ifeq ($(QNX_TARGET),)
|
||||
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
|
||||
endif
|
||||
export QNX_HOST
|
||||
export QNX_TARGET
|
||||
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
|
||||
else ifeq ($(TARGET_OS),android)
|
||||
HOST_COMPILER ?= arm-linux-androideabi-g++
|
||||
endif
|
||||
else ifeq ($(TARGET_ARCH),aarch64)
|
||||
ifeq ($(TARGET_OS), linux)
|
||||
HOST_COMPILER ?= aarch64-linux-gnu-g++
|
||||
else ifeq ($(TARGET_OS),qnx)
|
||||
ifeq ($(QNX_HOST),)
|
||||
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
|
||||
endif
|
||||
ifeq ($(QNX_TARGET),)
|
||||
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
|
||||
endif
|
||||
export QNX_HOST
|
||||
export QNX_TARGET
|
||||
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
|
||||
else ifeq ($(TARGET_OS), android)
|
||||
HOST_COMPILER ?= aarch64-linux-android-clang++
|
||||
endif
|
||||
else ifeq ($(TARGET_ARCH),sbsa)
|
||||
HOST_COMPILER ?= aarch64-linux-gnu-g++
|
||||
else ifeq ($(TARGET_ARCH),ppc64le)
|
||||
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
|
||||
endif
|
||||
endif
|
||||
HOST_COMPILER ?= g++
|
||||
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
|
||||
|
||||
# internal flags
|
||||
NVCCFLAGS := -m${TARGET_SIZE}
|
||||
CCFLAGS :=
|
||||
LDFLAGS :=
|
||||
|
||||
# build flags
|
||||
|
||||
# Link flag for customized HOST_COMPILER with gcc realpath
|
||||
GCC_PATH := $(shell which gcc)
|
||||
ifeq ($(CUSTOM_HOST_COMPILER),1)
|
||||
ifneq ($(filter /%,$(HOST_COMPILER)),)
|
||||
ifneq ($(findstring gcc,$(HOST_COMPILER)),)
|
||||
ifneq ($(GCC_PATH),$(HOST_COMPILER))
|
||||
LDFLAGS += -lstdc++
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET_OS),darwin)
|
||||
LDFLAGS += -rpath $(CUDA_PATH)/lib
|
||||
CCFLAGS += -arch $(HOST_ARCH)
|
||||
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
|
||||
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
|
||||
CCFLAGS += -mfloat-abi=hard
|
||||
else ifeq ($(TARGET_OS),android)
|
||||
LDFLAGS += -pie
|
||||
CCFLAGS += -fpie -fpic -fexceptions
|
||||
endif
|
||||
|
||||
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
|
||||
ifneq ($(TARGET_FS),)
|
||||
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
|
||||
ifeq ($(GCCVERSIONLTEQ46),1)
|
||||
CCFLAGS += --sysroot=$(TARGET_FS)
|
||||
endif
|
||||
LDFLAGS += --sysroot=$(TARGET_FS)
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
|
||||
endif
|
||||
endif
|
||||
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
|
||||
ifneq ($(TARGET_FS),)
|
||||
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
|
||||
ifeq ($(GCCVERSIONLTEQ46),1)
|
||||
CCFLAGS += --sysroot=$(TARGET_FS)
|
||||
endif
|
||||
LDFLAGS += --sysroot=$(TARGET_FS)
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
|
||||
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
|
||||
CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
|
||||
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
|
||||
endif
|
||||
endif
|
||||
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
|
||||
NVCCFLAGS += -D_QNX_SOURCE
|
||||
NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
|
||||
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
|
||||
LDFLAGS += -lsocket
|
||||
LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
|
||||
CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
|
||||
ifdef TARGET_OVERRIDE
|
||||
LDFLAGS += -lslog2
|
||||
endif
|
||||
|
||||
ifneq ($(TARGET_FS),)
|
||||
LDFLAGS += -L$(TARGET_FS)/usr/lib
|
||||
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
|
||||
LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
|
||||
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
|
||||
CCFLAGS += -I$(TARGET_FS)/../include
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef TARGET_OVERRIDE # cuda toolkit targets override
|
||||
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
|
||||
endif
|
||||
|
||||
# Install directory of different arch
|
||||
CUDA_INSTALL_TARGET_DIR :=
|
||||
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
|
||||
else ifeq ($(TARGET_ARCH),ppc64le)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
|
||||
endif
|
||||
|
||||
# Debug build flags
|
||||
ifeq ($(dbg),1)
|
||||
NVCCFLAGS += -g -G
|
||||
BUILD_TYPE := debug
|
||||
else
|
||||
BUILD_TYPE := release
|
||||
endif
|
||||
|
||||
ALL_CCFLAGS :=
|
||||
ALL_CCFLAGS += $(NVCCFLAGS)
|
||||
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
|
||||
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
|
||||
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
|
||||
|
||||
SAMPLE_ENABLED := 1
|
||||
|
||||
# This sample is not supported on Mac OSX
|
||||
ifeq ($(TARGET_OS),darwin)
|
||||
$(info >>> WARNING - binaryPartitionCG is not supported on Mac OSX - waiving sample <<<)
|
||||
SAMPLE_ENABLED := 0
|
||||
endif
|
||||
|
||||
ALL_LDFLAGS :=
|
||||
ALL_LDFLAGS += $(ALL_CCFLAGS)
|
||||
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
|
||||
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
|
||||
|
||||
# Common includes and paths for CUDA
|
||||
INCLUDES := -I../../../Common
|
||||
LIBRARIES :=
|
||||
|
||||
################################################################################
|
||||
|
||||
#Detect if installed version of GCC supports required C++11
|
||||
ifeq ($(TARGET_OS),linux)
|
||||
empty :=
|
||||
space := $(empty) $(empty)
|
||||
GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`)
|
||||
#Create version number without "."
|
||||
GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.)
|
||||
GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.)
|
||||
GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.)
|
||||
# Make sure the version number has at least 3 decimals
|
||||
GCCVERSION += 00
|
||||
# Remove spaces from the version number
|
||||
GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION))
|
||||
#$(warning $(GCCVERSION))
|
||||
|
||||
IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 47000)
|
||||
ifneq ($(CUSTOM_HOST_COMPILER), 1)
|
||||
ifeq ($(IS_MIN_VERSION), 1)
|
||||
$(info >>> GCC Version is greater or equal to 4.7.0 <<<)
|
||||
else
|
||||
$(info >>> Waiving build. Minimum GCC version required is 4.7.0<<<)
|
||||
SAMPLE_ENABLED := 0
|
||||
endif
|
||||
else
|
||||
$(warning >>> Custom HOST_COMPILER set; skipping GCC version check. This may lead to unintended behavior. Please note the minimum equivalent GCC version is 4.7.0 <<<)
|
||||
endif
|
||||
endif
|
||||
|
||||
# Gencode arguments
|
||||
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
|
||||
SMS ?= 53 61 70 72 75 80 86 87 90
|
||||
else
|
||||
SMS ?= 50 52 60 61 70 75 80 86 89 90
|
||||
endif
|
||||
|
||||
ifeq ($(SMS),)
|
||||
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
|
||||
SAMPLE_ENABLED := 0
|
||||
endif
|
||||
|
||||
ifeq ($(GENCODE_FLAGS),)
|
||||
# Generate SASS code for each SM architecture listed in $(SMS)
|
||||
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
|
||||
|
||||
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
|
||||
HIGHEST_SM := $(lastword $(sort $(SMS)))
|
||||
ifneq ($(HIGHEST_SM),)
|
||||
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
|
||||
endif
|
||||
endif
|
||||
|
||||
ALL_CCFLAGS += --std=c++11 --threads 0
|
||||
|
||||
ifeq ($(SAMPLE_ENABLED),0)
|
||||
EXEC ?= @echo "[@]"
|
||||
endif
|
||||
|
||||
################################################################################
|
||||
|
||||
# Target rules
|
||||
all: build
|
||||
|
||||
build: binaryPartitionCG
|
||||
|
||||
check.deps:
|
||||
ifeq ($(SAMPLE_ENABLED),0)
|
||||
@echo "Sample will be waived due to the above missing dependencies"
|
||||
else
|
||||
@echo "Sample is ready - all dependencies have been met"
|
||||
endif
|
||||
|
||||
binaryPartitionCG.o:binaryPartitionCG.cu
|
||||
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
|
||||
|
||||
binaryPartitionCG: binaryPartitionCG.o
|
||||
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
|
||||
$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
|
||||
$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
|
||||
|
||||
run: build
|
||||
$(EXEC) ./binaryPartitionCG
|
||||
|
||||
testrun: build
|
||||
|
||||
clean:
|
||||
rm -f binaryPartitionCG binaryPartitionCG.o
|
||||
rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/binaryPartitionCG
|
||||
|
||||
clobber: clean
|
@ -1,81 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
|
||||
<entry>
|
||||
<name>binaryPartitionCG</name>
|
||||
<cflags>
|
||||
<flag>--std=c++11</flag>
|
||||
</cflags>
|
||||
<cuda_api_list>
|
||||
<toolkit>cudaStreamCreateWithFlags</toolkit>
|
||||
<toolkit>cudaFree</toolkit>
|
||||
<toolkit>cudaMallocHost</toolkit>
|
||||
<toolkit>cudaFreeHost</toolkit>
|
||||
<toolkit>cudaStreamSynchronize</toolkit>
|
||||
<toolkit>cudaMalloc</toolkit>
|
||||
<toolkit>cudaMemsetAsync</toolkit>
|
||||
<toolkit>cudaMemcpyAsync</toolkit>
|
||||
<toolkit>cudaOccupancyMaxPotentialBlockSize</toolkit>
|
||||
</cuda_api_list>
|
||||
<description><![CDATA[This sample is a simple code that illustrates binary partition cooperative groups and reduce within the thread block.]]></description>
|
||||
<devicecompilation>whole</devicecompilation>
|
||||
<includepaths>
|
||||
<path>./</path>
|
||||
<path>../</path>
|
||||
<path>../../../Common</path>
|
||||
</includepaths>
|
||||
<keyconcepts>
|
||||
<concept level="basic">Cooperative Groups</concept>
|
||||
</keyconcepts>
|
||||
<keywords>
|
||||
<keyword>CUDA</keyword>
|
||||
<keyword>Parallel Reduction</keyword>
|
||||
<keyword>Cooperative Groups</keyword>
|
||||
<keyword>CPP11</keyword>
|
||||
</keywords>
|
||||
<libraries>
|
||||
</libraries>
|
||||
<librarypaths>
|
||||
</librarypaths>
|
||||
<nsight_eclipse>true</nsight_eclipse>
|
||||
<primary_file>binaryPartitionCG.cu</primary_file>
|
||||
<scopes>
|
||||
<scope>1:CUDA Basic Topics</scope>
|
||||
</scopes>
|
||||
<sm-arch>sm50</sm-arch>
|
||||
<sm-arch>sm52</sm-arch>
|
||||
<sm-arch>sm53</sm-arch>
|
||||
<sm-arch>sm60</sm-arch>
|
||||
<sm-arch>sm61</sm-arch>
|
||||
<sm-arch>sm70</sm-arch>
|
||||
<sm-arch>sm72</sm-arch>
|
||||
<sm-arch>sm75</sm-arch>
|
||||
<sm-arch>sm80</sm-arch>
|
||||
<sm-arch>sm86</sm-arch>
|
||||
<sm-arch>sm87</sm-arch>
|
||||
<sm-arch>sm89</sm-arch>
|
||||
<sm-arch>sm90</sm-arch>
|
||||
<supported_envs>
|
||||
<env>
|
||||
<arch>x86_64</arch>
|
||||
<platform>linux</platform>
|
||||
</env>
|
||||
<env>
|
||||
<platform>windows7</platform>
|
||||
</env>
|
||||
<env>
|
||||
<arch>arm</arch>
|
||||
</env>
|
||||
<env>
|
||||
<arch>sbsa</arch>
|
||||
</env>
|
||||
<env>
|
||||
<arch>ppc64le</arch>
|
||||
<platform>linux</platform>
|
||||
</env>
|
||||
</supported_envs>
|
||||
<supported_sm_architectures>
|
||||
<include>all</include>
|
||||
</supported_sm_architectures>
|
||||
<title>Binary Partition Cooperative Groups</title>
|
||||
<type>exe</type>
|
||||
</entry>
|
@ -1,3 +1,4 @@
|
||||
add_subdirectory(0_Introduction)
|
||||
add_subdirectory(1_Utilities)
|
||||
add_subdirectory(2_Concepts_and_Techniques)
|
||||
add_subdirectory(3_CUDA_Features)
|
||||
|
Loading…
x
Reference in New Issue
Block a user