mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2025-04-04 07:21:33 +01:00
Update boxFilterNPP, delete batchedLabelMarkersAndLabelCompressionNPP
This commit is contained in:
parent
89f2e5c0c3
commit
a3be0d3cd8
@ -13,6 +13,9 @@
|
||||
* `simpleVoteIntrinsics_nvrtc` demonstrating NVRTC usage for `simpleVoteIntrinsics` sample (reason: redundant)
|
||||
* `2_Concepts_and_Techniques`
|
||||
* `cuHook` demonstrating dlsym hooks. (reason: incompatible with modern `glibc`)
|
||||
* `4_CUDA_Libraries`
|
||||
* `batchedLabelMarkersAndLabelCompressionNPP` demonstrating NPP features (reason: some functionality removed from library)
|
||||
|
||||
|
||||
|
||||
### CUDA 12.5
|
||||
|
40
Samples/4_CUDA_Libraries/CMakeLists.txt
Normal file
40
Samples/4_CUDA_Libraries/CMakeLists.txt
Normal file
@ -0,0 +1,40 @@
|
||||
#add_subdirectory(FilterBorderControlNPP)
|
||||
#add_subdirectory(MersenneTwisterGP11213)
|
||||
add_subdirectory(batchCUBLAS)
|
||||
add_subdirectory(boxFilterNPP)
|
||||
#add_subdirectory(cannyEdgeDetectorNPP)
|
||||
#add_subdirectory(conjugateGradient)
|
||||
#add_subdirectory(conjugateGradientCudaGraphs)
|
||||
#add_subdirectory(conjugateGradientMultiBlockCG)
|
||||
#add_subdirectory(conjugateGradientMultiDeviceCG)
|
||||
#add_subdirectory(conjugateGradientPrecond)
|
||||
#add_subdirectory(conjugateGradientUM)
|
||||
#add_subdirectory(cuDLAErrorReporting)
|
||||
#add_subdirectory(cuDLAHybridMode)
|
||||
#add_subdirectory(cuDLALayerwiseStatsHybrid)
|
||||
#add_subdirectory(cuDLALayerwiseStatsStandalone)
|
||||
#add_subdirectory(cuDLAStandaloneMode)
|
||||
#add_subdirectory(cuSolverDn_LinearSolver)
|
||||
#add_subdirectory(cuSolverRf)
|
||||
#add_subdirectory(cuSolverSp_LinearSolver)
|
||||
#add_subdirectory(cuSolverSp_LowlevelCholesky)
|
||||
#add_subdirectory(cuSolverSp_LowlevelQR)
|
||||
#add_subdirectory(cudaNvSci)
|
||||
#add_subdirectory(cudaNvSciNvMedia)
|
||||
#add_subdirectory(freeImageInteropNPP)
|
||||
#add_subdirectory(histEqualizationNPP)
|
||||
#add_subdirectory(jitLto)
|
||||
#add_subdirectory(lineOfSight)
|
||||
#add_subdirectory(matrixMulCUBLAS)
|
||||
#add_subdirectory(nvJPEG)
|
||||
#add_subdirectory(nvJPEG_encoder)
|
||||
#add_subdirectory(oceanFFT)
|
||||
#add_subdirectory(randomFog)
|
||||
#add_subdirectory(simpleCUBLAS)
|
||||
#add_subdirectory(simpleCUBLASXT)
|
||||
#add_subdirectory(simpleCUBLAS_LU)
|
||||
#add_subdirectory(simpleCUFFT)
|
||||
#add_subdirectory(simpleCUFFT_2d_MGPU)
|
||||
#add_subdirectory(simpleCUFFT_MGPU)
|
||||
#add_subdirectory(simpleCUFFT_callback)
|
||||
#add_subdirectory(watershedSegmentationNPP)
|
20
Samples/4_CUDA_Libraries/batchCUBLAS/CMakeLists.txt
Normal file
20
Samples/4_CUDA_Libraries/batchCUBLAS/CMakeLists.txt
Normal file
@ -0,0 +1,20 @@
|
||||
# Include directories and libraries
|
||||
include_directories(../../../Common)
|
||||
|
||||
# Source file
|
||||
set(SRC_FILES
|
||||
batchCUBLAS.cpp
|
||||
)
|
||||
|
||||
# Add target for batchCUBLAS
|
||||
add_executable(batchCUBLAS ${SRC_FILES})
|
||||
set_target_properties(batchCUBLAS PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
|
||||
|
||||
target_include_directories(batchCUBLAS PRIVATE
|
||||
${CUDAToolkit_INCLUDE_DIRS}
|
||||
)
|
||||
|
||||
target_link_libraries(batchCUBLAS PRIVATE
|
||||
CUDA::cublas
|
||||
CUDA::cudart
|
||||
)
|
@ -1,347 +0,0 @@
|
||||
################################################################################
|
||||
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions
|
||||
# are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
################################################################################
|
||||
#
|
||||
# Makefile project only supported on Mac OS X and Linux Platforms)
|
||||
#
|
||||
################################################################################
|
||||
|
||||
# Location of the CUDA Toolkit
|
||||
CUDA_PATH ?= /usr/local/cuda
|
||||
|
||||
##############################
|
||||
# start deprecated interface #
|
||||
##############################
|
||||
ifeq ($(x86_64),1)
|
||||
$(info WARNING - x86_64 variable has been deprecated)
|
||||
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
|
||||
TARGET_ARCH ?= x86_64
|
||||
endif
|
||||
ifeq ($(ARMv7),1)
|
||||
$(info WARNING - ARMv7 variable has been deprecated)
|
||||
$(info WARNING - please use TARGET_ARCH=armv7l instead)
|
||||
TARGET_ARCH ?= armv7l
|
||||
endif
|
||||
ifeq ($(aarch64),1)
|
||||
$(info WARNING - aarch64 variable has been deprecated)
|
||||
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
|
||||
TARGET_ARCH ?= aarch64
|
||||
endif
|
||||
ifeq ($(ppc64le),1)
|
||||
$(info WARNING - ppc64le variable has been deprecated)
|
||||
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
|
||||
TARGET_ARCH ?= ppc64le
|
||||
endif
|
||||
ifneq ($(GCC),)
|
||||
$(info WARNING - GCC variable has been deprecated)
|
||||
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
|
||||
HOST_COMPILER ?= $(GCC)
|
||||
endif
|
||||
ifneq ($(abi),)
|
||||
$(error ERROR - abi variable has been removed)
|
||||
endif
|
||||
############################
|
||||
# end deprecated interface #
|
||||
############################
|
||||
|
||||
# architecture
|
||||
HOST_ARCH := $(shell uname -m)
|
||||
TARGET_ARCH ?= $(HOST_ARCH)
|
||||
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
|
||||
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
|
||||
TARGET_SIZE := 64
|
||||
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
|
||||
TARGET_SIZE := 32
|
||||
endif
|
||||
else
|
||||
TARGET_SIZE := $(shell getconf LONG_BIT)
|
||||
endif
|
||||
else
|
||||
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
|
||||
endif
|
||||
|
||||
# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
|
||||
ifeq ($(HOST_ARCH),aarch64)
|
||||
ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
|
||||
HOST_ARCH := sbsa
|
||||
TARGET_ARCH := sbsa
|
||||
endif
|
||||
endif
|
||||
|
||||
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
|
||||
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
|
||||
endif
|
||||
endif
|
||||
|
||||
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
|
||||
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
|
||||
TARGET_ARCH = armv7l
|
||||
endif
|
||||
|
||||
# operating system
|
||||
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
|
||||
TARGET_OS ?= $(HOST_OS)
|
||||
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
|
||||
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
|
||||
endif
|
||||
|
||||
# host compiler
|
||||
ifdef HOST_COMPILER
|
||||
CUSTOM_HOST_COMPILER = 1
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET_OS),darwin)
|
||||
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
|
||||
HOST_COMPILER ?= clang++
|
||||
endif
|
||||
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
|
||||
ifeq ($(TARGET_OS),linux)
|
||||
HOST_COMPILER ?= arm-linux-gnueabihf-g++
|
||||
else ifeq ($(TARGET_OS),qnx)
|
||||
ifeq ($(QNX_HOST),)
|
||||
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
|
||||
endif
|
||||
ifeq ($(QNX_TARGET),)
|
||||
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
|
||||
endif
|
||||
export QNX_HOST
|
||||
export QNX_TARGET
|
||||
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
|
||||
else ifeq ($(TARGET_OS),android)
|
||||
HOST_COMPILER ?= arm-linux-androideabi-g++
|
||||
endif
|
||||
else ifeq ($(TARGET_ARCH),aarch64)
|
||||
ifeq ($(TARGET_OS), linux)
|
||||
HOST_COMPILER ?= aarch64-linux-gnu-g++
|
||||
else ifeq ($(TARGET_OS),qnx)
|
||||
ifeq ($(QNX_HOST),)
|
||||
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
|
||||
endif
|
||||
ifeq ($(QNX_TARGET),)
|
||||
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
|
||||
endif
|
||||
export QNX_HOST
|
||||
export QNX_TARGET
|
||||
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
|
||||
else ifeq ($(TARGET_OS), android)
|
||||
HOST_COMPILER ?= aarch64-linux-android-clang++
|
||||
endif
|
||||
else ifeq ($(TARGET_ARCH),sbsa)
|
||||
HOST_COMPILER ?= aarch64-linux-gnu-g++
|
||||
else ifeq ($(TARGET_ARCH),ppc64le)
|
||||
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
|
||||
endif
|
||||
endif
|
||||
HOST_COMPILER ?= g++
|
||||
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
|
||||
|
||||
# internal flags
|
||||
NVCCFLAGS := -m${TARGET_SIZE}
|
||||
CCFLAGS :=
|
||||
LDFLAGS :=
|
||||
|
||||
# build flags
|
||||
|
||||
# Link flag for customized HOST_COMPILER with gcc realpath
|
||||
GCC_PATH := $(shell which gcc)
|
||||
ifeq ($(CUSTOM_HOST_COMPILER),1)
|
||||
ifneq ($(filter /%,$(HOST_COMPILER)),)
|
||||
ifneq ($(findstring gcc,$(HOST_COMPILER)),)
|
||||
ifneq ($(GCC_PATH),$(HOST_COMPILER))
|
||||
LDFLAGS += -lstdc++
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET_OS),darwin)
|
||||
LDFLAGS += -rpath $(CUDA_PATH)/lib
|
||||
CCFLAGS += -arch $(HOST_ARCH)
|
||||
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
|
||||
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
|
||||
CCFLAGS += -mfloat-abi=hard
|
||||
else ifeq ($(TARGET_OS),android)
|
||||
LDFLAGS += -pie
|
||||
CCFLAGS += -fpie -fpic -fexceptions
|
||||
endif
|
||||
|
||||
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
|
||||
ifneq ($(TARGET_FS),)
|
||||
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
|
||||
ifeq ($(GCCVERSIONLTEQ46),1)
|
||||
CCFLAGS += --sysroot=$(TARGET_FS)
|
||||
endif
|
||||
LDFLAGS += --sysroot=$(TARGET_FS)
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
|
||||
endif
|
||||
endif
|
||||
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
|
||||
ifneq ($(TARGET_FS),)
|
||||
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
|
||||
ifeq ($(GCCVERSIONLTEQ46),1)
|
||||
CCFLAGS += --sysroot=$(TARGET_FS)
|
||||
endif
|
||||
LDFLAGS += --sysroot=$(TARGET_FS)
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
|
||||
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
|
||||
CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
|
||||
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
|
||||
endif
|
||||
endif
|
||||
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
|
||||
NVCCFLAGS += -D_QNX_SOURCE
|
||||
NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
|
||||
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
|
||||
LDFLAGS += -lsocket
|
||||
LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
|
||||
CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
|
||||
ifdef TARGET_OVERRIDE
|
||||
LDFLAGS += -lslog2
|
||||
endif
|
||||
|
||||
ifneq ($(TARGET_FS),)
|
||||
LDFLAGS += -L$(TARGET_FS)/usr/lib
|
||||
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
|
||||
LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
|
||||
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
|
||||
CCFLAGS += -I$(TARGET_FS)/../include
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef TARGET_OVERRIDE # cuda toolkit targets override
|
||||
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
|
||||
endif
|
||||
|
||||
# Install directory of different arch
|
||||
CUDA_INSTALL_TARGET_DIR :=
|
||||
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
|
||||
else ifeq ($(TARGET_ARCH),ppc64le)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
|
||||
endif
|
||||
|
||||
# Debug build flags
|
||||
ifeq ($(dbg),1)
|
||||
NVCCFLAGS += -g -G
|
||||
BUILD_TYPE := debug
|
||||
else
|
||||
BUILD_TYPE := release
|
||||
endif
|
||||
|
||||
ALL_CCFLAGS :=
|
||||
ALL_CCFLAGS += $(NVCCFLAGS)
|
||||
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
|
||||
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
|
||||
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
|
||||
|
||||
ALL_LDFLAGS :=
|
||||
ALL_LDFLAGS += $(ALL_CCFLAGS)
|
||||
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
|
||||
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
|
||||
|
||||
# Common includes and paths for CUDA
|
||||
INCLUDES := -I../../../Common
|
||||
LIBRARIES :=
|
||||
|
||||
################################################################################
|
||||
|
||||
# Gencode arguments
|
||||
SMS ?=
|
||||
|
||||
ifeq ($(GENCODE_FLAGS),)
|
||||
# Generate SASS code for each SM architecture listed in $(SMS)
|
||||
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
|
||||
|
||||
ifeq ($(SMS),)
|
||||
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
|
||||
# Generate PTX code from SM 53
|
||||
GENCODE_FLAGS += -gencode arch=compute_53,code=compute_53
|
||||
else
|
||||
# Generate PTX code from SM 50
|
||||
GENCODE_FLAGS += -gencode arch=compute_50,code=compute_50
|
||||
endif
|
||||
endif
|
||||
|
||||
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
|
||||
HIGHEST_SM := $(lastword $(sort $(SMS)))
|
||||
ifneq ($(HIGHEST_SM),)
|
||||
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
|
||||
endif
|
||||
endif
|
||||
|
||||
ALL_CCFLAGS += --threads 0 --std=c++11
|
||||
|
||||
LIBRARIES += -lcublas
|
||||
|
||||
################################################################################
|
||||
|
||||
# Target rules
|
||||
all: build
|
||||
|
||||
build: batchCUBLAS
|
||||
|
||||
batchCUBLAS.o:batchCUBLAS.cpp
|
||||
$(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
|
||||
|
||||
batchCUBLAS: batchCUBLAS.o
|
||||
$(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
|
||||
mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
|
||||
cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
|
||||
|
||||
run: build
|
||||
./batchCUBLAS
|
||||
|
||||
testrun: build
|
||||
|
||||
clean:
|
||||
rm -f batchCUBLAS batchCUBLAS.o
|
||||
rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/batchCUBLAS
|
||||
|
||||
clobber: clean
|
@ -1,89 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
|
||||
<entry>
|
||||
<name>batchCUBLAS</name>
|
||||
<cuda_api_list>
|
||||
<driver>cuRand</driver>
|
||||
<driver>cuEqual</driver>
|
||||
<toolkit>cudaMemcpy</toolkit>
|
||||
<toolkit>cudaGetErrorString</toolkit>
|
||||
<toolkit>cudaFree</toolkit>
|
||||
<toolkit>cudaGetLastError</toolkit>
|
||||
<toolkit>cudaDeviceSynchronize</toolkit>
|
||||
<toolkit>cudaGetDevice</toolkit>
|
||||
<toolkit>cudaMalloc</toolkit>
|
||||
<toolkit>cudaStreamCreate</toolkit>
|
||||
<toolkit>cudaGetDeviceProperties</toolkit>
|
||||
</cuda_api_list>
|
||||
<description><![CDATA[A CUDA Sample that demonstrates how using batched CUBLAS API calls to improve overall performance.]]></description>
|
||||
<devicecompilation>whole</devicecompilation>
|
||||
<fallback_min_ptx>true</fallback_min_ptx>
|
||||
<includepaths>
|
||||
<path>./</path>
|
||||
<path>../</path>
|
||||
<path>../../../Common</path>
|
||||
</includepaths>
|
||||
<keyconcepts>
|
||||
<concept level="basic">Linear Algebra</concept>
|
||||
<concept level="basic">CUBLAS Library</concept>
|
||||
</keyconcepts>
|
||||
<keywords>
|
||||
<keyword>CUBLAS</keyword>
|
||||
<keyword>Linear Algebra</keyword>
|
||||
</keywords>
|
||||
<libraries>
|
||||
<library>cublas</library>
|
||||
</libraries>
|
||||
<librarypaths>
|
||||
</librarypaths>
|
||||
<nsight_eclipse>true</nsight_eclipse>
|
||||
<primary_file>batchCUBLAS.cpp</primary_file>
|
||||
<required_dependencies>
|
||||
<dependency>CUBLAS</dependency>
|
||||
</required_dependencies>
|
||||
<scopes>
|
||||
<scope>1:CUDA Basic Topics</scope>
|
||||
<scope>3:Linear Algebra</scope>
|
||||
</scopes>
|
||||
<sm-arch>sm50</sm-arch>
|
||||
<sm-arch>sm52</sm-arch>
|
||||
<sm-arch>sm53</sm-arch>
|
||||
<sm-arch>sm60</sm-arch>
|
||||
<sm-arch>sm61</sm-arch>
|
||||
<sm-arch>sm70</sm-arch>
|
||||
<sm-arch>sm72</sm-arch>
|
||||
<sm-arch>sm75</sm-arch>
|
||||
<sm-arch>sm80</sm-arch>
|
||||
<sm-arch>sm86</sm-arch>
|
||||
<sm-arch>sm87</sm-arch>
|
||||
<sm-arch>sm89</sm-arch>
|
||||
<sm-arch>sm90</sm-arch>
|
||||
<supported_envs>
|
||||
<env>
|
||||
<arch>x86_64</arch>
|
||||
<platform>linux</platform>
|
||||
</env>
|
||||
<env>
|
||||
<platform>windows7</platform>
|
||||
</env>
|
||||
<env>
|
||||
<arch>x86_64</arch>
|
||||
<platform>macosx</platform>
|
||||
</env>
|
||||
<env>
|
||||
<arch>arm</arch>
|
||||
</env>
|
||||
<env>
|
||||
<arch>sbsa</arch>
|
||||
</env>
|
||||
<env>
|
||||
<arch>ppc64le</arch>
|
||||
<platform>linux</platform>
|
||||
</env>
|
||||
</supported_envs>
|
||||
<supported_sm_architectures>
|
||||
<include>all</include>
|
||||
</supported_sm_architectures>
|
||||
<title>batchCUBLAS</title>
|
||||
<type>exe</type>
|
||||
</entry>
|
@ -1,18 +0,0 @@
|
||||
{
|
||||
"configurations": [
|
||||
{
|
||||
"name": "Linux",
|
||||
"includePath": [
|
||||
"${workspaceFolder}/**",
|
||||
"${workspaceFolder}/../../../Common"
|
||||
],
|
||||
"defines": [],
|
||||
"compilerPath": "/usr/local/cuda/bin/nvcc",
|
||||
"cStandard": "gnu17",
|
||||
"cppStandard": "gnu++14",
|
||||
"intelliSenseMode": "linux-gcc-x64",
|
||||
"configurationProvider": "ms-vscode.makefile-tools"
|
||||
}
|
||||
],
|
||||
"version": 4
|
||||
}
|
@ -1,7 +0,0 @@
|
||||
{
|
||||
"recommendations": [
|
||||
"nvidia.nsight-vscode-edition",
|
||||
"ms-vscode.cpptools",
|
||||
"ms-vscode.makefile-tools"
|
||||
]
|
||||
}
|
@ -1,10 +0,0 @@
|
||||
{
|
||||
"configurations": [
|
||||
{
|
||||
"name": "CUDA C++: Launch",
|
||||
"type": "cuda-gdb",
|
||||
"request": "launch",
|
||||
"program": "${workspaceFolder}/batchedLabelMarkersAndLabelCompressionNPP"
|
||||
}
|
||||
]
|
||||
}
|
@ -1,15 +0,0 @@
|
||||
{
|
||||
"version": "2.0.0",
|
||||
"tasks": [
|
||||
{
|
||||
"label": "sample",
|
||||
"type": "shell",
|
||||
"command": "make dbg=1",
|
||||
"problemMatcher": ["$nvcc"],
|
||||
"group": {
|
||||
"kind": "build",
|
||||
"isDefault": true
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,372 +0,0 @@
|
||||
################################################################################
|
||||
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions
|
||||
# are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
################################################################################
|
||||
#
|
||||
# Makefile project only supported on Mac OS X and Linux Platforms)
|
||||
#
|
||||
################################################################################
|
||||
|
||||
# Location of the CUDA Toolkit
|
||||
CUDA_PATH ?= /usr/local/cuda
|
||||
|
||||
##############################
|
||||
# start deprecated interface #
|
||||
##############################
|
||||
ifeq ($(x86_64),1)
|
||||
$(info WARNING - x86_64 variable has been deprecated)
|
||||
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
|
||||
TARGET_ARCH ?= x86_64
|
||||
endif
|
||||
ifeq ($(ARMv7),1)
|
||||
$(info WARNING - ARMv7 variable has been deprecated)
|
||||
$(info WARNING - please use TARGET_ARCH=armv7l instead)
|
||||
TARGET_ARCH ?= armv7l
|
||||
endif
|
||||
ifeq ($(aarch64),1)
|
||||
$(info WARNING - aarch64 variable has been deprecated)
|
||||
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
|
||||
TARGET_ARCH ?= aarch64
|
||||
endif
|
||||
ifeq ($(ppc64le),1)
|
||||
$(info WARNING - ppc64le variable has been deprecated)
|
||||
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
|
||||
TARGET_ARCH ?= ppc64le
|
||||
endif
|
||||
ifneq ($(GCC),)
|
||||
$(info WARNING - GCC variable has been deprecated)
|
||||
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
|
||||
HOST_COMPILER ?= $(GCC)
|
||||
endif
|
||||
ifneq ($(abi),)
|
||||
$(error ERROR - abi variable has been removed)
|
||||
endif
|
||||
############################
|
||||
# end deprecated interface #
|
||||
############################
|
||||
|
||||
# architecture
|
||||
HOST_ARCH := $(shell uname -m)
|
||||
TARGET_ARCH ?= $(HOST_ARCH)
|
||||
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
|
||||
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
|
||||
TARGET_SIZE := 64
|
||||
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
|
||||
TARGET_SIZE := 32
|
||||
endif
|
||||
else
|
||||
TARGET_SIZE := $(shell getconf LONG_BIT)
|
||||
endif
|
||||
else
|
||||
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
|
||||
endif
|
||||
|
||||
# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
|
||||
ifeq ($(HOST_ARCH),aarch64)
|
||||
ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
|
||||
HOST_ARCH := sbsa
|
||||
TARGET_ARCH := sbsa
|
||||
endif
|
||||
endif
|
||||
|
||||
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
|
||||
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
|
||||
endif
|
||||
endif
|
||||
|
||||
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
|
||||
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
|
||||
TARGET_ARCH = armv7l
|
||||
endif
|
||||
|
||||
# operating system
|
||||
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
|
||||
TARGET_OS ?= $(HOST_OS)
|
||||
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
|
||||
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
|
||||
endif
|
||||
|
||||
# host compiler
|
||||
ifdef HOST_COMPILER
|
||||
CUSTOM_HOST_COMPILER = 1
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET_OS),darwin)
|
||||
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
|
||||
HOST_COMPILER ?= clang++
|
||||
endif
|
||||
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
|
||||
ifeq ($(TARGET_OS),linux)
|
||||
HOST_COMPILER ?= arm-linux-gnueabihf-g++
|
||||
else ifeq ($(TARGET_OS),qnx)
|
||||
ifeq ($(QNX_HOST),)
|
||||
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
|
||||
endif
|
||||
ifeq ($(QNX_TARGET),)
|
||||
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
|
||||
endif
|
||||
export QNX_HOST
|
||||
export QNX_TARGET
|
||||
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
|
||||
else ifeq ($(TARGET_OS),android)
|
||||
HOST_COMPILER ?= arm-linux-androideabi-g++
|
||||
endif
|
||||
else ifeq ($(TARGET_ARCH),aarch64)
|
||||
ifeq ($(TARGET_OS), linux)
|
||||
HOST_COMPILER ?= aarch64-linux-gnu-g++
|
||||
else ifeq ($(TARGET_OS),qnx)
|
||||
ifeq ($(QNX_HOST),)
|
||||
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
|
||||
endif
|
||||
ifeq ($(QNX_TARGET),)
|
||||
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
|
||||
endif
|
||||
export QNX_HOST
|
||||
export QNX_TARGET
|
||||
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
|
||||
else ifeq ($(TARGET_OS), android)
|
||||
HOST_COMPILER ?= aarch64-linux-android-clang++
|
||||
endif
|
||||
else ifeq ($(TARGET_ARCH),sbsa)
|
||||
HOST_COMPILER ?= aarch64-linux-gnu-g++
|
||||
else ifeq ($(TARGET_ARCH),ppc64le)
|
||||
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
|
||||
endif
|
||||
endif
|
||||
HOST_COMPILER ?= g++
|
||||
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
|
||||
|
||||
# internal flags
|
||||
NVCCFLAGS := -m${TARGET_SIZE}
|
||||
CCFLAGS :=
|
||||
LDFLAGS :=
|
||||
|
||||
# build flags
|
||||
|
||||
# Link flag for customized HOST_COMPILER with gcc realpath
|
||||
GCC_PATH := $(shell which gcc)
|
||||
ifeq ($(CUSTOM_HOST_COMPILER),1)
|
||||
ifneq ($(filter /%,$(HOST_COMPILER)),)
|
||||
ifneq ($(findstring gcc,$(HOST_COMPILER)),)
|
||||
ifneq ($(GCC_PATH),$(HOST_COMPILER))
|
||||
LDFLAGS += -lstdc++
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET_OS),darwin)
|
||||
LDFLAGS += -rpath $(CUDA_PATH)/lib
|
||||
CCFLAGS += -arch $(HOST_ARCH)
|
||||
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
|
||||
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
|
||||
CCFLAGS += -mfloat-abi=hard
|
||||
else ifeq ($(TARGET_OS),android)
|
||||
LDFLAGS += -pie
|
||||
CCFLAGS += -fpie -fpic -fexceptions
|
||||
endif
|
||||
|
||||
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
|
||||
ifneq ($(TARGET_FS),)
|
||||
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
|
||||
ifeq ($(GCCVERSIONLTEQ46),1)
|
||||
CCFLAGS += --sysroot=$(TARGET_FS)
|
||||
endif
|
||||
LDFLAGS += --sysroot=$(TARGET_FS)
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
|
||||
endif
|
||||
endif
|
||||
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
|
||||
ifneq ($(TARGET_FS),)
|
||||
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
|
||||
ifeq ($(GCCVERSIONLTEQ46),1)
|
||||
CCFLAGS += --sysroot=$(TARGET_FS)
|
||||
endif
|
||||
LDFLAGS += --sysroot=$(TARGET_FS)
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
|
||||
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
|
||||
CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
|
||||
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
|
||||
endif
|
||||
endif
|
||||
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
|
||||
NVCCFLAGS += -D_QNX_SOURCE
|
||||
NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
|
||||
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
|
||||
LDFLAGS += -lsocket
|
||||
LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
|
||||
CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
|
||||
ifdef TARGET_OVERRIDE
|
||||
LDFLAGS += -lslog2
|
||||
endif
|
||||
|
||||
ifneq ($(TARGET_FS),)
|
||||
LDFLAGS += -L$(TARGET_FS)/usr/lib
|
||||
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
|
||||
LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
|
||||
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
|
||||
CCFLAGS += -I$(TARGET_FS)/../include
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef TARGET_OVERRIDE # cuda toolkit targets override
|
||||
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
|
||||
endif
|
||||
|
||||
# Install directory of different arch
|
||||
CUDA_INSTALL_TARGET_DIR :=
|
||||
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
|
||||
else ifeq ($(TARGET_ARCH),ppc64le)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
|
||||
endif
|
||||
|
||||
# Debug build flags
|
||||
ifeq ($(dbg),1)
|
||||
NVCCFLAGS += -g -G
|
||||
BUILD_TYPE := debug
|
||||
else
|
||||
BUILD_TYPE := release
|
||||
endif
|
||||
|
||||
ALL_CCFLAGS :=
|
||||
ALL_CCFLAGS += $(NVCCFLAGS)
|
||||
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
|
||||
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
|
||||
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
|
||||
|
||||
SAMPLE_ENABLED := 1
|
||||
|
||||
# This sample is not supported on Mac OSX
|
||||
ifeq ($(TARGET_OS),darwin)
|
||||
$(info >>> WARNING - batchedLabelMarkersAndLabelCompressionNPP is not supported on Mac OSX - waiving sample <<<)
|
||||
SAMPLE_ENABLED := 0
|
||||
endif
|
||||
|
||||
# This sample is not supported on QNX
|
||||
ifeq ($(TARGET_OS),qnx)
|
||||
$(info >>> WARNING - batchedLabelMarkersAndLabelCompressionNPP is not supported on QNX - waiving sample <<<)
|
||||
SAMPLE_ENABLED := 0
|
||||
endif
|
||||
|
||||
ALL_LDFLAGS :=
|
||||
ALL_LDFLAGS += $(ALL_CCFLAGS)
|
||||
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
|
||||
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
|
||||
|
||||
# Common includes and paths for CUDA
|
||||
INCLUDES := -I../../../Common
|
||||
LIBRARIES :=
|
||||
|
||||
################################################################################
|
||||
|
||||
# Gencode arguments
|
||||
SMS ?=
|
||||
|
||||
ifeq ($(GENCODE_FLAGS),)
|
||||
# Generate SASS code for each SM architecture listed in $(SMS)
|
||||
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
|
||||
|
||||
ifeq ($(SMS),)
|
||||
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
|
||||
# Generate PTX code from SM 53
|
||||
GENCODE_FLAGS += -gencode arch=compute_53,code=compute_53
|
||||
else
|
||||
# Generate PTX code from SM 50
|
||||
GENCODE_FLAGS += -gencode arch=compute_50,code=compute_50
|
||||
endif
|
||||
endif
|
||||
|
||||
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
|
||||
HIGHEST_SM := $(lastword $(sort $(SMS)))
|
||||
ifneq ($(HIGHEST_SM),)
|
||||
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
|
||||
endif
|
||||
endif
|
||||
|
||||
ALL_CCFLAGS += --threads 0 --std=c++11
|
||||
|
||||
LIBRARIES += -lnppisu_static -lnppif_static -lnppc_static -lculibos
|
||||
|
||||
ifeq ($(SAMPLE_ENABLED),0)
|
||||
EXEC ?= @echo "[@]"
|
||||
endif
|
||||
|
||||
################################################################################
|
||||
|
||||
# Target rules
|
||||
all: build
|
||||
|
||||
build: batchedLabelMarkersAndLabelCompressionNPP
|
||||
|
||||
check.deps:
|
||||
ifeq ($(SAMPLE_ENABLED),0)
|
||||
@echo "Sample will be waived due to the above missing dependencies"
|
||||
else
|
||||
@echo "Sample is ready - all dependencies have been met"
|
||||
endif
|
||||
|
||||
batchedLabelMarkersAndLabelCompressionNPP.o:batchedLabelMarkersAndLabelCompressionNPP.cpp
|
||||
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
|
||||
|
||||
batchedLabelMarkersAndLabelCompressionNPP: batchedLabelMarkersAndLabelCompressionNPP.o
|
||||
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
|
||||
$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
|
||||
$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
|
||||
|
||||
run: build
|
||||
$(EXEC) ./batchedLabelMarkersAndLabelCompressionNPP
|
||||
|
||||
testrun: build
|
||||
|
||||
clean:
|
||||
rm -f batchedLabelMarkersAndLabelCompressionNPP batchedLabelMarkersAndLabelCompressionNPP.o
|
||||
rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/batchedLabelMarkersAndLabelCompressionNPP
|
||||
|
||||
clobber: clean
|
@ -1,95 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
|
||||
<entry>
|
||||
<name>batchedLabelMarkersAndLabelCompressionNPP</name>
|
||||
<cuda_api_list>
|
||||
<toolkit>cudaRuntimeGetVersion</toolkit>
|
||||
<toolkit>cudaMallocPitch</toolkit>
|
||||
<toolkit>cudaFree</toolkit>
|
||||
<toolkit>cudaDeviceGetAttribute</toolkit>
|
||||
<toolkit>cudaMallocHost</toolkit>
|
||||
<toolkit>cudaDriverGetVersion</toolkit>
|
||||
<toolkit>cudaFreeHost</toolkit>
|
||||
<toolkit>cudaGetDevice</toolkit>
|
||||
<toolkit>cudaStreamGetFlags</toolkit>
|
||||
<toolkit>cudaStreamSynchronize</toolkit>
|
||||
<toolkit>cudaMalloc</toolkit>
|
||||
<toolkit>cudaMemcpyAsync</toolkit>
|
||||
<toolkit>cudaGetDeviceProperties</toolkit>
|
||||
</cuda_api_list>
|
||||
<description><![CDATA[An NPP CUDA Sample that demonstrates how to use the NPP label markers generation and label compression functions based on a Union Find (UF) algorithm including both single image and batched image versions.]]></description>
|
||||
<devicecompilation>whole</devicecompilation>
|
||||
<fallback_min_ptx>true</fallback_min_ptx>
|
||||
<includepaths>
|
||||
<path>./</path>
|
||||
<path>../</path>
|
||||
<path>../../../Common</path>
|
||||
</includepaths>
|
||||
<keyconcepts>
|
||||
<concept level="basic">Performance Strategies</concept>
|
||||
<concept level="basic">Image Processing</concept>
|
||||
<concept level="basic">NPP Library</concept>
|
||||
<concept level="basic">Using NPP Batch Functions</concept>
|
||||
</keyconcepts>
|
||||
<keywords>
|
||||
<keyword>CUDA</keyword>
|
||||
<keyword>NPP</keyword>
|
||||
<keyword>Image Processing</keyword>
|
||||
</keywords>
|
||||
<libraries>
|
||||
<library>nppisu_static</library>
|
||||
<library>nppif_static</library>
|
||||
<library>nppc_static</library>
|
||||
<library>culibos</library>
|
||||
</libraries>
|
||||
<librarypaths>
|
||||
</librarypaths>
|
||||
<nsight_eclipse>true</nsight_eclipse>
|
||||
<primary_file>batchedLabelMarkersAndLabelCompressionNPP.cpp</primary_file>
|
||||
<required_dependencies>
|
||||
<dependency>NPP</dependency>
|
||||
</required_dependencies>
|
||||
<scopes>
|
||||
<scope>1:CUDA Basic Topics</scope>
|
||||
<scope>1:Performance Strategies</scope>
|
||||
<scope>2:Image Processing</scope>
|
||||
<scope>2:Computer Vision</scope>
|
||||
</scopes>
|
||||
<sm-arch>sm50</sm-arch>
|
||||
<sm-arch>sm52</sm-arch>
|
||||
<sm-arch>sm53</sm-arch>
|
||||
<sm-arch>sm60</sm-arch>
|
||||
<sm-arch>sm61</sm-arch>
|
||||
<sm-arch>sm70</sm-arch>
|
||||
<sm-arch>sm72</sm-arch>
|
||||
<sm-arch>sm75</sm-arch>
|
||||
<sm-arch>sm80</sm-arch>
|
||||
<sm-arch>sm86</sm-arch>
|
||||
<sm-arch>sm87</sm-arch>
|
||||
<sm-arch>sm89</sm-arch>
|
||||
<sm-arch>sm90</sm-arch>
|
||||
<supported_envs>
|
||||
<env>
|
||||
<arch>x86_64</arch>
|
||||
<platform>linux</platform>
|
||||
</env>
|
||||
<env>
|
||||
<platform>windows7</platform>
|
||||
</env>
|
||||
<env>
|
||||
<arch>arm</arch>
|
||||
</env>
|
||||
<env>
|
||||
<arch>sbsa</arch>
|
||||
</env>
|
||||
<env>
|
||||
<arch>ppc64le</arch>
|
||||
<platform>linux</platform>
|
||||
</env>
|
||||
</supported_envs>
|
||||
<supported_sm_architectures>
|
||||
<include>all</include>
|
||||
</supported_sm_architectures>
|
||||
<title>Batched Label Markers And Label Compression NPP</title>
|
||||
<type>exe</type>
|
||||
</entry>
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,74 +0,0 @@
|
||||
# batchedLabelMarkersAndLabelCompressionNPP - Batched Label Markers And Label Compression NPP
|
||||
|
||||
## Description
|
||||
|
||||
An NPP CUDA Sample that demonstrates how to use the NPP label markers generation and label compression functions based on a Union Find (UF) algorithm including both single image and batched image versions.
|
||||
|
||||
## Key Concepts
|
||||
|
||||
Performance Strategies, Image Processing, NPP Library, Using NPP Batch Functions
|
||||
|
||||
## Supported SM Architectures
|
||||
|
||||
[SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 5.3 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus) [SM 8.9 ](https://developer.nvidia.com/cuda-gpus) [SM 9.0 ](https://developer.nvidia.com/cuda-gpus)
|
||||
|
||||
## Supported OSes
|
||||
|
||||
Linux, Windows
|
||||
|
||||
## Supported CPU Architecture
|
||||
|
||||
x86_64, ppc64le, armv7l
|
||||
|
||||
## CUDA APIs involved
|
||||
|
||||
### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
|
||||
cudaRuntimeGetVersion, cudaMallocPitch, cudaFree, cudaDeviceGetAttribute, cudaMallocHost, cudaDriverGetVersion, cudaFreeHost, cudaGetDevice, cudaStreamGetFlags, cudaStreamSynchronize, cudaMalloc, cudaMemcpyAsync, cudaGetDeviceProperties
|
||||
|
||||
## Dependencies needed to build/run
|
||||
[NPP](../../../README.md#npp)
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
|
||||
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
|
||||
|
||||
## Build and Run
|
||||
|
||||
### Windows
|
||||
The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
|
||||
```
|
||||
*_vs<version>.sln - for Visual Studio <version>
|
||||
```
|
||||
Each individual sample has its own set of solution files in its directory:
|
||||
|
||||
To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
|
||||
> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details."
|
||||
|
||||
### Linux
|
||||
The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
|
||||
```
|
||||
$ cd <sample_dir>
|
||||
$ make
|
||||
```
|
||||
The samples makefiles can take advantage of certain options:
|
||||
* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l.
|
||||
By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
|
||||
`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/>
|
||||
See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
|
||||
* **dbg=1** - build with debug symbols
|
||||
```
|
||||
$ make dbg=1
|
||||
```
|
||||
* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
|
||||
```
|
||||
$ make SMS="50 60"
|
||||
```
|
||||
|
||||
* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
|
||||
```
|
||||
$ make HOST_COMPILER=g++
|
||||
```
|
||||
|
||||
## References (for more details)
|
||||
|
@ -1,805 +0,0 @@
|
||||
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
* contributors may be used to endorse or promote products derived
|
||||
* from this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||
#define WINDOWS_LEAN_AND_MEAN
|
||||
#define NOMINMAX
|
||||
#include <windows.h>
|
||||
#pragma warning(disable : 4819)
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <fstream>
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_cuda.h>
|
||||
#include <helper_string.h>
|
||||
#include <npp.h>
|
||||
|
||||
// Note: If you want to view these images we HIGHLY recommend using imagej
|
||||
// which is free on the internet and works on most platforms
|
||||
// because it is one of the few image viewing apps that can display 32
|
||||
// bit integer image data. While it normalizes the data to floating
|
||||
// point values for viewing it still provides a good representation of
|
||||
// the relative brightness of each label value. Note that label
|
||||
// compression output results in smaller differences between label values
|
||||
// making it visually more difficult to detect differences in labeled
|
||||
// regions. If you have an editor that can display hex values you can
|
||||
// see what the exact values of each label is, every 4 bytes represents 1
|
||||
// 32 bit integer label value.
|
||||
//
|
||||
// The files read and written by this sample app use RAW image format,
|
||||
// that is, only the image data itself exists in the files with no image
|
||||
// format information. When viewing RAW files with imagej just enter
|
||||
// the image size and bit depth values that are part of the file name
|
||||
// when requested by imagej.
|
||||
//
|
||||
// This sample app works in 2 stages, first it processes all of the
|
||||
// images individually then it processes them all again in 1 batch using
|
||||
// the Batch_Advanced versions of the NPP batch functions which allow
|
||||
// each image to have it's own ROI. The 2 stages are completely
|
||||
// separable but in this sample the second stage takes advantage of some
|
||||
// of the data that has already been initialized.
|
||||
//
|
||||
// Note that there is a small amount of variability in the number of
|
||||
// unique label markers generated from one run to the next by the UF
|
||||
// algorithm.
|
||||
//
|
||||
// Performance of ALL NPP image batch functions is limited by the maximum
|
||||
// ROI height in the list of images.
|
||||
|
||||
// Batched label compression support is only available on NPP versions > 11.0,
|
||||
// comment out if using NPP 11.0
|
||||
#define USE_BATCHED_LABEL_COMPRESSION 1
|
||||
|
||||
#define NUMBER_OF_IMAGES 5
|
||||
|
||||
Npp8u *pInputImageDev[NUMBER_OF_IMAGES];
|
||||
Npp8u *pInputImageHost[NUMBER_OF_IMAGES];
|
||||
Npp8u *pUFGenerateLabelsScratchBufferDev[NUMBER_OF_IMAGES];
|
||||
Npp8u *pUFCompressedLabelsScratchBufferDev[NUMBER_OF_IMAGES];
|
||||
Npp32u *pUFLabelDev[NUMBER_OF_IMAGES];
|
||||
Npp32u *pUFLabelHost[NUMBER_OF_IMAGES];
|
||||
NppiImageDescriptor *pUFBatchSrcImageListDev = 0;
|
||||
NppiImageDescriptor *pUFBatchSrcDstImageListDev = 0;
|
||||
NppiImageDescriptor *pUFBatchSrcImageListHost = 0;
|
||||
NppiImageDescriptor *pUFBatchSrcDstImageListHost = 0;
|
||||
NppiBufferDescriptor *pUFBatchSrcDstScratchBufferListDev =
|
||||
0; // from nppi_filtering_functions.h
|
||||
NppiBufferDescriptor *pUFBatchSrcDstScratchBufferListHost = 0;
|
||||
Npp32u *pUFBatchPerImageCompressedCountListDev = 0;
|
||||
Npp32u *pUFBatchPerImageCompressedCountListHost = 0;
|
||||
|
||||
void tearDown() // Clean up and tear down
|
||||
{
|
||||
if (pUFBatchPerImageCompressedCountListDev != 0)
|
||||
cudaFree(pUFBatchPerImageCompressedCountListDev);
|
||||
if (pUFBatchSrcDstScratchBufferListDev != 0)
|
||||
cudaFree(pUFBatchSrcDstScratchBufferListDev);
|
||||
if (pUFBatchSrcDstImageListDev != 0) cudaFree(pUFBatchSrcDstImageListDev);
|
||||
if (pUFBatchSrcImageListDev != 0) cudaFree(pUFBatchSrcImageListDev);
|
||||
if (pUFBatchPerImageCompressedCountListHost != 0)
|
||||
cudaFreeHost(pUFBatchPerImageCompressedCountListHost);
|
||||
if (pUFBatchSrcDstScratchBufferListHost != 0)
|
||||
cudaFreeHost(pUFBatchSrcDstScratchBufferListHost);
|
||||
if (pUFBatchSrcDstImageListHost != 0)
|
||||
cudaFreeHost(pUFBatchSrcDstImageListHost);
|
||||
if (pUFBatchSrcImageListHost != 0) cudaFreeHost(pUFBatchSrcImageListHost);
|
||||
|
||||
for (int j = 0; j < NUMBER_OF_IMAGES; j++) {
|
||||
if (pUFCompressedLabelsScratchBufferDev[j] != 0)
|
||||
cudaFree(pUFCompressedLabelsScratchBufferDev[j]);
|
||||
if (pUFGenerateLabelsScratchBufferDev[j] != 0)
|
||||
cudaFree(pUFGenerateLabelsScratchBufferDev[j]);
|
||||
if (pUFLabelDev[j] != 0) cudaFree(pUFLabelDev[j]);
|
||||
if (pInputImageDev[j] != 0) cudaFree(pInputImageDev[j]);
|
||||
if (pUFLabelHost[j] != 0) cudaFreeHost(pUFLabelHost[j]);
|
||||
if (pInputImageHost[j] != 0) cudaFreeHost(pInputImageHost[j]);
|
||||
}
|
||||
}
|
||||
|
||||
const std::string &LabelMarkersOutputFile0 =
|
||||
"teapot_LabelMarkersUF_8Way_512x512_32u.raw";
|
||||
const std::string &LabelMarkersOutputFile1 =
|
||||
"CT_skull_LabelMarkersUF_8Way_512x512_32u.raw";
|
||||
const std::string &LabelMarkersOutputFile2 =
|
||||
"PCB_METAL_LabelMarkersUF_8Way_509x335_32u.raw";
|
||||
const std::string &LabelMarkersOutputFile3 =
|
||||
"PCB2_LabelMarkersUF_8Way_1024x683_32u.raw";
|
||||
const std::string &LabelMarkersOutputFile4 =
|
||||
"PCB_LabelMarkersUF_8Way_1280x720_32u.raw";
|
||||
|
||||
const std::string &CompressedMarkerLabelsOutputFile0 =
|
||||
"teapot_CompressedMarkerLabelsUF_8Way_512x512_32u.raw";
|
||||
const std::string &CompressedMarkerLabelsOutputFile1 =
|
||||
"CT_skull_CompressedMarkerLabelsUF_8Way_512x512_32u.raw";
|
||||
const std::string &CompressedMarkerLabelsOutputFile2 =
|
||||
"PCB_METAL_CompressedMarkerLabelsUF_8Way_509x335_32u.raw";
|
||||
const std::string &CompressedMarkerLabelsOutputFile3 =
|
||||
"PCB2_CompressedMarkerLabelsUF_8Way_1024x683_32u.raw";
|
||||
const std::string &CompressedMarkerLabelsOutputFile4 =
|
||||
"PCB_CompressedMarkerLabelsUF_8Way_1280x720_32u.raw";
|
||||
|
||||
const std::string &LabelMarkersBatchOutputFile0 =
|
||||
"teapot_LabelMarkersUFBatch_8Way_512x512_32u.raw";
|
||||
const std::string &LabelMarkersBatchOutputFile1 =
|
||||
"CT_skull_LabelMarkersUFBatch_8Way_512x512_32u.raw";
|
||||
const std::string &LabelMarkersBatchOutputFile2 =
|
||||
"PCB_METAL_LabelMarkersUFBatch_8Way_509x335_32u.raw";
|
||||
const std::string &LabelMarkersBatchOutputFile3 =
|
||||
"PCB2_LabelMarkersUFBatch_8Way_1024x683_32u.raw";
|
||||
const std::string &LabelMarkersBatchOutputFile4 =
|
||||
"PCB_LabelMarkersUFBatch_8Way_1280x720_32u.raw";
|
||||
|
||||
const std::string &CompressedMarkerLabelsBatchOutputFile0 =
|
||||
"teapot_CompressedMarkerLabelsUFBatch_8Way_512x512_32u.raw";
|
||||
const std::string &CompressedMarkerLabelsBatchOutputFile1 =
|
||||
"CT_skull_CompressedMarkerLabelsUFBatch_8Way_512x512_32u.raw";
|
||||
const std::string &CompressedMarkerLabelsBatchOutputFile2 =
|
||||
"PCB_METAL_CompressedMarkerLabelsUFBatch_8Way_509x335_32u.raw";
|
||||
const std::string &CompressedMarkerLabelsBatchOutputFile3 =
|
||||
"PCB2_CompressedMarkerLabelsUFBatch_8Way_1024x683_32u.raw";
|
||||
const std::string &CompressedMarkerLabelsBatchOutputFile4 =
|
||||
"PCB_CompressedMarkerLabelsUFBatch_8Way_1280x720_32u.raw";
|
||||
|
||||
int loadRaw8BitImage(Npp8u *pImage, int nWidth, int nHeight, int nImage) {
|
||||
FILE *bmpFile;
|
||||
size_t nSize;
|
||||
|
||||
if (nImage == 0) {
|
||||
if (nWidth != 512 || nHeight != 512) return -1;
|
||||
const char *fileName = "teapot_512x512_8u.raw";
|
||||
const char *InputFile = sdkFindFilePath(fileName, ".");
|
||||
if (InputFile == NULL) {
|
||||
printf("%s file not found.. exiting\n", fileName);
|
||||
exit(EXIT_WAIVED);
|
||||
}
|
||||
|
||||
FOPEN(bmpFile, InputFile, "rb");
|
||||
} else if (nImage == 1) {
|
||||
if (nWidth != 512 || nHeight != 512) return -1;
|
||||
const char *fileName = "CT_skull_512x512_8u.raw";
|
||||
const char *InputFile = sdkFindFilePath(fileName, ".");
|
||||
if (InputFile == NULL) {
|
||||
printf("%s file not found.. exiting\n", fileName);
|
||||
exit(EXIT_WAIVED);
|
||||
}
|
||||
|
||||
FOPEN(bmpFile, InputFile, "rb");
|
||||
} else if (nImage == 2) {
|
||||
if (nWidth != 509 || nHeight != 335) return -1;
|
||||
const char *fileName = "PCB_METAL_509x335_8u.raw";
|
||||
const char *InputFile = sdkFindFilePath(fileName, ".");
|
||||
if (InputFile == NULL) {
|
||||
printf("%s file not found.. exiting\n", fileName);
|
||||
exit(EXIT_WAIVED);
|
||||
}
|
||||
|
||||
FOPEN(bmpFile, InputFile, "rb");
|
||||
} else if (nImage == 3) {
|
||||
if (nWidth != 1024 || nHeight != 683) return -1;
|
||||
const char *fileName = "PCB2_1024x683_8u.raw";
|
||||
const char *InputFile = sdkFindFilePath(fileName, ".");
|
||||
if (InputFile == NULL) {
|
||||
printf("%s file not found.. exiting\n", fileName);
|
||||
exit(EXIT_WAIVED);
|
||||
}
|
||||
|
||||
FOPEN(bmpFile, InputFile, "rb");
|
||||
} else if (nImage == 4) {
|
||||
if (nWidth != 1280 || nHeight != 720) return -1;
|
||||
const char *fileName = "PCB_1280x720_8u.raw";
|
||||
const char *InputFile = sdkFindFilePath(fileName, ".");
|
||||
if (InputFile == NULL) {
|
||||
printf("%s file not found.. exiting\n", fileName);
|
||||
exit(EXIT_WAIVED);
|
||||
}
|
||||
|
||||
FOPEN(bmpFile, InputFile, "rb");
|
||||
} else {
|
||||
printf("Input file load failed.\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (bmpFile == NULL) return -1;
|
||||
nSize = fread(pImage, 1, nWidth * nHeight, bmpFile);
|
||||
if (nSize < nWidth * nHeight) {
|
||||
fclose(bmpFile);
|
||||
return -1;
|
||||
}
|
||||
fclose(bmpFile);
|
||||
|
||||
printf("Input file load succeeded.\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
int aGenerateLabelsScratchBufferSize[NUMBER_OF_IMAGES];
|
||||
int aCompressLabelsScratchBufferSize[NUMBER_OF_IMAGES];
|
||||
|
||||
int nCompressedLabelCount = 0;
|
||||
cudaError_t cudaError;
|
||||
NppStatus nppStatus;
|
||||
NppStreamContext nppStreamCtx;
|
||||
FILE *bmpFile;
|
||||
|
||||
for (int j = 0; j < NUMBER_OF_IMAGES; j++) {
|
||||
pInputImageDev[j] = 0;
|
||||
pInputImageHost[j] = 0;
|
||||
pUFGenerateLabelsScratchBufferDev[j] = 0;
|
||||
pUFCompressedLabelsScratchBufferDev[j] = 0;
|
||||
pUFLabelDev[j] = 0;
|
||||
pUFLabelHost[j] = 0;
|
||||
}
|
||||
|
||||
nppStreamCtx.hStream = 0; // The NULL stream by default, set this to whatever
|
||||
// your stream ID is if not the NULL stream.
|
||||
|
||||
cudaError = cudaGetDevice(&nppStreamCtx.nCudaDeviceId);
|
||||
if (cudaError != cudaSuccess) {
|
||||
printf("CUDA error: no devices supporting CUDA.\n");
|
||||
return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
|
||||
}
|
||||
|
||||
const NppLibraryVersion *libVer = nppGetLibVersion();
|
||||
|
||||
printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor,
|
||||
libVer->build);
|
||||
|
||||
int driverVersion, runtimeVersion;
|
||||
cudaDriverGetVersion(&driverVersion);
|
||||
cudaRuntimeGetVersion(&runtimeVersion);
|
||||
|
||||
printf("CUDA Driver Version: %d.%d\n", driverVersion / 1000,
|
||||
(driverVersion % 100) / 10);
|
||||
printf("CUDA Runtime Version: %d.%d\n\n", runtimeVersion / 1000,
|
||||
(runtimeVersion % 100) / 10);
|
||||
|
||||
cudaError = cudaDeviceGetAttribute(
|
||||
&nppStreamCtx.nCudaDevAttrComputeCapabilityMajor,
|
||||
cudaDevAttrComputeCapabilityMajor, nppStreamCtx.nCudaDeviceId);
|
||||
if (cudaError != cudaSuccess) return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
|
||||
|
||||
cudaError = cudaDeviceGetAttribute(
|
||||
&nppStreamCtx.nCudaDevAttrComputeCapabilityMinor,
|
||||
cudaDevAttrComputeCapabilityMinor, nppStreamCtx.nCudaDeviceId);
|
||||
if (cudaError != cudaSuccess) return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
|
||||
|
||||
cudaError =
|
||||
cudaStreamGetFlags(nppStreamCtx.hStream, &nppStreamCtx.nStreamFlags);
|
||||
|
||||
cudaDeviceProp oDeviceProperties;
|
||||
|
||||
cudaError =
|
||||
cudaGetDeviceProperties(&oDeviceProperties, nppStreamCtx.nCudaDeviceId);
|
||||
|
||||
nppStreamCtx.nMultiProcessorCount = oDeviceProperties.multiProcessorCount;
|
||||
nppStreamCtx.nMaxThreadsPerMultiProcessor =
|
||||
oDeviceProperties.maxThreadsPerMultiProcessor;
|
||||
nppStreamCtx.nMaxThreadsPerBlock = oDeviceProperties.maxThreadsPerBlock;
|
||||
nppStreamCtx.nSharedMemPerBlock = oDeviceProperties.sharedMemPerBlock;
|
||||
|
||||
NppiSize oSizeROI[NUMBER_OF_IMAGES];
|
||||
|
||||
for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) {
|
||||
if (nImage == 0) {
|
||||
oSizeROI[nImage].width = 512;
|
||||
oSizeROI[nImage].height = 512;
|
||||
} else if (nImage == 1) {
|
||||
oSizeROI[nImage].width = 512;
|
||||
oSizeROI[nImage].height = 512;
|
||||
} else if (nImage == 2) {
|
||||
oSizeROI[nImage].width = 509;
|
||||
oSizeROI[nImage].height = 335;
|
||||
} else if (nImage == 3) {
|
||||
oSizeROI[nImage].width = 1024;
|
||||
oSizeROI[nImage].height = 683;
|
||||
} else if (nImage == 4) {
|
||||
oSizeROI[nImage].width = 1280;
|
||||
oSizeROI[nImage].height = 720;
|
||||
}
|
||||
|
||||
// NOTE: While using cudaMallocPitch() to allocate device memory for NPP can
|
||||
// significantly improve the performance of many NPP functions, for UF
|
||||
// function label markers generation or compression DO NOT USE
|
||||
// cudaMallocPitch(). Doing so could result in incorrect output.
|
||||
|
||||
cudaError = cudaMalloc(
|
||||
(void **)&pInputImageDev[nImage],
|
||||
oSizeROI[nImage].width * sizeof(Npp8u) * oSizeROI[nImage].height);
|
||||
if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR;
|
||||
|
||||
// For images processed with UF label markers functions ROI width and height
|
||||
// for label markers generation output AND marker compression functions MUST
|
||||
// be the same AND line pitch MUST be equal to ROI.width * sizeof(Npp32u).
|
||||
// Also the image pointer used for label markers generation output must
|
||||
// start at the same position in the image as it does in the marker
|
||||
// compression function. Also note that actual input image size and ROI do
|
||||
// not necessarily need to be related other than ROI being less than or
|
||||
// equal to image size and image starting position does not necessarily have
|
||||
// to be at pixel 0 in the input image.
|
||||
|
||||
cudaError = cudaMalloc(
|
||||
(void **)&pUFLabelDev[nImage],
|
||||
oSizeROI[nImage].width * sizeof(Npp32u) * oSizeROI[nImage].height);
|
||||
if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR;
|
||||
|
||||
checkCudaErrors(cudaMallocHost(
|
||||
&(pInputImageHost[nImage]),
|
||||
oSizeROI[nImage].width * sizeof(Npp8u) * oSizeROI[nImage].height));
|
||||
checkCudaErrors(cudaMallocHost(
|
||||
&(pUFLabelHost[nImage]),
|
||||
oSizeROI[nImage].width * sizeof(Npp32u) * oSizeROI[nImage].height));
|
||||
|
||||
// Use UF functions throughout this sample.
|
||||
|
||||
nppStatus = nppiLabelMarkersUFGetBufferSize_32u_C1R(
|
||||
oSizeROI[nImage], &aGenerateLabelsScratchBufferSize[nImage]);
|
||||
|
||||
// One at a time image processing
|
||||
|
||||
cudaError = cudaMalloc((void **)&pUFGenerateLabelsScratchBufferDev[nImage],
|
||||
aGenerateLabelsScratchBufferSize[nImage]);
|
||||
if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR;
|
||||
|
||||
if (loadRaw8BitImage(pInputImageHost[nImage],
|
||||
oSizeROI[nImage].width * sizeof(Npp8u),
|
||||
oSizeROI[nImage].height, nImage) == 0) {
|
||||
cudaError = cudaMemcpy2DAsync(
|
||||
pInputImageDev[nImage], oSizeROI[nImage].width * sizeof(Npp8u),
|
||||
pInputImageHost[nImage], oSizeROI[nImage].width * sizeof(Npp8u),
|
||||
oSizeROI[nImage].width * sizeof(Npp8u), oSizeROI[nImage].height,
|
||||
cudaMemcpyHostToDevice, nppStreamCtx.hStream);
|
||||
|
||||
nppStatus = nppiLabelMarkersUF_8u32u_C1R_Ctx(
|
||||
pInputImageDev[nImage], oSizeROI[nImage].width * sizeof(Npp8u),
|
||||
pUFLabelDev[nImage], oSizeROI[nImage].width * sizeof(Npp32u),
|
||||
oSizeROI[nImage], nppiNormInf,
|
||||
pUFGenerateLabelsScratchBufferDev[nImage], nppStreamCtx);
|
||||
|
||||
if (nppStatus != NPP_SUCCESS) {
|
||||
if (nImage == 0)
|
||||
printf("teapot_LabelMarkersUF_8Way_512x512_32u failed.\n");
|
||||
else if (nImage == 1)
|
||||
printf("CT_skull_LabelMarkersUF_8Way_512x512_32u failed.\n");
|
||||
else if (nImage == 2)
|
||||
printf("PCB_METAL_LabelMarkersUF_8Way_509x335_32u failed.\n");
|
||||
else if (nImage == 3)
|
||||
printf("PCB2_LabelMarkersUF_8Way_1024x683_32u failed.\n");
|
||||
else if (nImage == 4)
|
||||
printf("PCB_LabelMarkersUF_8Way_1280x720_32u failed.\n");
|
||||
tearDown();
|
||||
return -1;
|
||||
}
|
||||
|
||||
cudaError = cudaMemcpy2DAsync(
|
||||
pUFLabelHost[nImage], oSizeROI[nImage].width * sizeof(Npp32u),
|
||||
pUFLabelDev[nImage], oSizeROI[nImage].width * sizeof(Npp32u),
|
||||
oSizeROI[nImage].width * sizeof(Npp32u), oSizeROI[nImage].height,
|
||||
cudaMemcpyDeviceToHost, nppStreamCtx.hStream);
|
||||
|
||||
// Wait host image read backs to complete, not necessary if no need to
|
||||
// synchronize
|
||||
if ((cudaError = cudaStreamSynchronize(nppStreamCtx.hStream)) !=
|
||||
cudaSuccess) {
|
||||
printf("Post label generation cudaStreamSynchronize failed\n");
|
||||
tearDown();
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (nImage == 0)
|
||||
FOPEN(bmpFile, LabelMarkersOutputFile0.c_str(), "wb");
|
||||
else if (nImage == 1)
|
||||
FOPEN(bmpFile, LabelMarkersOutputFile1.c_str(), "wb");
|
||||
else if (nImage == 2)
|
||||
FOPEN(bmpFile, LabelMarkersOutputFile2.c_str(), "wb");
|
||||
else if (nImage == 3)
|
||||
FOPEN(bmpFile, LabelMarkersOutputFile3.c_str(), "wb");
|
||||
else if (nImage == 4)
|
||||
FOPEN(bmpFile, LabelMarkersOutputFile4.c_str(), "wb");
|
||||
|
||||
if (bmpFile == NULL) return -1;
|
||||
size_t nSize = 0;
|
||||
for (int j = 0; j < oSizeROI[nImage].height; j++) {
|
||||
nSize += fwrite(&pUFLabelHost[nImage][j * oSizeROI[nImage].width],
|
||||
sizeof(Npp32u), oSizeROI[nImage].width, bmpFile);
|
||||
}
|
||||
fclose(bmpFile);
|
||||
|
||||
nppStatus = nppiCompressMarkerLabelsGetBufferSize_32u_C1R(
|
||||
oSizeROI[nImage].width * oSizeROI[nImage].height,
|
||||
&aCompressLabelsScratchBufferSize[nImage]);
|
||||
if (nppStatus != NPP_NO_ERROR) return nppStatus;
|
||||
|
||||
cudaError =
|
||||
cudaMalloc((void **)&pUFCompressedLabelsScratchBufferDev[nImage],
|
||||
aCompressLabelsScratchBufferSize[nImage]);
|
||||
if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR;
|
||||
|
||||
nCompressedLabelCount = 0;
|
||||
|
||||
nppStatus = nppiCompressMarkerLabelsUF_32u_C1IR(
|
||||
pUFLabelDev[nImage], oSizeROI[nImage].width * sizeof(Npp32u),
|
||||
oSizeROI[nImage], oSizeROI[nImage].width * oSizeROI[nImage].height,
|
||||
&nCompressedLabelCount, pUFCompressedLabelsScratchBufferDev[nImage]);
|
||||
|
||||
if (nppStatus != NPP_SUCCESS) {
|
||||
if (nImage == 0)
|
||||
printf("teapot_CompressedLabelMarkersUF_8Way_512x512_32u failed.\n");
|
||||
else if (nImage == 1)
|
||||
printf(
|
||||
"CT_Skull_CompressedLabelMarkersUF_8Way_512x512_32u failed.\n");
|
||||
else if (nImage == 2)
|
||||
printf(
|
||||
"PCB_METAL_CompressedLabelMarkersUF_8Way_509x335_32u failed.\n");
|
||||
else if (nImage == 3)
|
||||
printf("PCB2_CompressedLabelMarkersUF_8Way_1024x683_32u failed.\n");
|
||||
else if (nImage == 4)
|
||||
printf("PCB_CompressedLabelMarkersUF_8Way_1280x720_32u failed.\n");
|
||||
tearDown();
|
||||
return -1;
|
||||
}
|
||||
|
||||
cudaError = cudaMemcpy2DAsync(
|
||||
pUFLabelHost[nImage], oSizeROI[nImage].width * sizeof(Npp32u),
|
||||
pUFLabelDev[nImage], oSizeROI[nImage].width * sizeof(Npp32u),
|
||||
oSizeROI[nImage].width * sizeof(Npp32u), oSizeROI[nImage].height,
|
||||
cudaMemcpyDeviceToHost, nppStreamCtx.hStream);
|
||||
|
||||
// Wait for host image read backs to finish, not necessary if no need to
|
||||
// synchronize
|
||||
if ((cudaError = cudaStreamSynchronize(nppStreamCtx.hStream)) !=
|
||||
cudaSuccess ||
|
||||
nCompressedLabelCount == 0) {
|
||||
printf("Post label compression cudaStreamSynchronize failed\n");
|
||||
tearDown();
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (nImage == 0)
|
||||
FOPEN(bmpFile, CompressedMarkerLabelsOutputFile0.c_str(), "wb");
|
||||
else if (nImage == 1)
|
||||
FOPEN(bmpFile, CompressedMarkerLabelsOutputFile1.c_str(), "wb");
|
||||
else if (nImage == 2)
|
||||
FOPEN(bmpFile, CompressedMarkerLabelsOutputFile2.c_str(), "wb");
|
||||
else if (nImage == 3)
|
||||
FOPEN(bmpFile, CompressedMarkerLabelsOutputFile3.c_str(), "wb");
|
||||
else if (nImage == 4)
|
||||
FOPEN(bmpFile, CompressedMarkerLabelsOutputFile4.c_str(), "wb");
|
||||
|
||||
if (bmpFile == NULL) return -1;
|
||||
nSize = 0;
|
||||
for (int j = 0; j < oSizeROI[nImage].height; j++) {
|
||||
nSize += fwrite(&pUFLabelHost[nImage][j * oSizeROI[nImage].width],
|
||||
sizeof(Npp32u), oSizeROI[nImage].width, bmpFile);
|
||||
}
|
||||
fclose(bmpFile);
|
||||
|
||||
if (nImage == 0)
|
||||
printf(
|
||||
"teapot_CompressedMarkerLabelsUF_8Way_512x512_32u succeeded, "
|
||||
"compressed label count is %d.\n",
|
||||
nCompressedLabelCount);
|
||||
else if (nImage == 1)
|
||||
printf(
|
||||
"CT_Skull_CompressedMarkerLabelsUF_8Way_512x512_32u succeeded, "
|
||||
"compressed label count is %d.\n",
|
||||
nCompressedLabelCount);
|
||||
else if (nImage == 2)
|
||||
printf(
|
||||
"PCB_METAL_CompressedMarkerLabelsUF_8Way_509x335_32u succeeded, "
|
||||
"compressed label count is %d.\n",
|
||||
nCompressedLabelCount);
|
||||
else if (nImage == 3)
|
||||
printf(
|
||||
"PCB2_CompressedMarkerLabelsUF_8Way_1024x683_32u succeeded, "
|
||||
"compressed label count is %d.\n",
|
||||
nCompressedLabelCount);
|
||||
else if (nImage == 4)
|
||||
printf(
|
||||
"PCB_CompressedMarkerLabelsUF_8Way_1280x720_32u succeeded, "
|
||||
"compressed label count is %d.\n",
|
||||
nCompressedLabelCount);
|
||||
}
|
||||
}
|
||||
|
||||
// Batch image processing
|
||||
|
||||
// We want to allocate scratch buffers more efficiently for batch processing
|
||||
// so first we free up the scratch buffers for image 0 and reallocate them.
|
||||
// This is not required but helps cudaMalloc to work more efficiently.
|
||||
|
||||
cudaFree(pUFCompressedLabelsScratchBufferDev[0]);
|
||||
|
||||
int nTotalBatchedUFCompressLabelsScratchBufferDevSize = 0;
|
||||
|
||||
for (int k = 0; k < NUMBER_OF_IMAGES; k++)
|
||||
nTotalBatchedUFCompressLabelsScratchBufferDevSize +=
|
||||
aCompressLabelsScratchBufferSize[k];
|
||||
|
||||
cudaError = cudaMalloc((void **)&pUFCompressedLabelsScratchBufferDev[0],
|
||||
nTotalBatchedUFCompressLabelsScratchBufferDevSize);
|
||||
if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR;
|
||||
|
||||
// Now allocate batch lists
|
||||
|
||||
int nBatchImageListBytes = NUMBER_OF_IMAGES * sizeof(NppiImageDescriptor);
|
||||
|
||||
cudaError =
|
||||
cudaMalloc((void **)&pUFBatchSrcImageListDev, nBatchImageListBytes);
|
||||
if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR;
|
||||
|
||||
cudaError =
|
||||
cudaMalloc((void **)&pUFBatchSrcDstImageListDev, nBatchImageListBytes);
|
||||
if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR;
|
||||
|
||||
checkCudaErrors(
|
||||
cudaMallocHost((void **)&pUFBatchSrcImageListHost, nBatchImageListBytes));
|
||||
|
||||
checkCudaErrors(cudaMallocHost((void **)&pUFBatchSrcDstImageListHost,
|
||||
nBatchImageListBytes));
|
||||
|
||||
NppiSize oMaxROISize = {0, 0};
|
||||
|
||||
for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) {
|
||||
pUFBatchSrcImageListHost[nImage].pData = pInputImageDev[nImage];
|
||||
pUFBatchSrcImageListHost[nImage].nStep =
|
||||
oSizeROI[nImage].width * sizeof(Npp8u);
|
||||
// src image oSize parameter is ignored in these NPP functions
|
||||
pUFBatchSrcDstImageListHost[nImage].pData = pUFLabelDev[nImage];
|
||||
pUFBatchSrcDstImageListHost[nImage].nStep =
|
||||
oSizeROI[nImage].width * sizeof(Npp32u);
|
||||
pUFBatchSrcDstImageListHost[nImage].oSize = oSizeROI[nImage];
|
||||
if (oSizeROI[nImage].width > oMaxROISize.width)
|
||||
oMaxROISize.width = oSizeROI[nImage].width;
|
||||
if (oSizeROI[nImage].height > oMaxROISize.height)
|
||||
oMaxROISize.height = oSizeROI[nImage].height;
|
||||
}
|
||||
|
||||
// Copy label generation batch lists from CPU to GPU
|
||||
cudaError = cudaMemcpyAsync(pUFBatchSrcImageListDev, pUFBatchSrcImageListHost,
|
||||
nBatchImageListBytes, cudaMemcpyHostToDevice,
|
||||
nppStreamCtx.hStream);
|
||||
if (cudaError != cudaSuccess) return NPP_MEMCPY_ERROR;
|
||||
|
||||
cudaError = cudaMemcpyAsync(pUFBatchSrcDstImageListDev,
|
||||
pUFBatchSrcDstImageListHost, nBatchImageListBytes,
|
||||
cudaMemcpyHostToDevice, nppStreamCtx.hStream);
|
||||
if (cudaError != cudaSuccess) return NPP_MEMCPY_ERROR;
|
||||
|
||||
// We use 8-way neighbor search throughout this example
|
||||
nppStatus = nppiLabelMarkersUFBatch_8u32u_C1R_Advanced_Ctx(
|
||||
pUFBatchSrcImageListDev, pUFBatchSrcDstImageListDev, NUMBER_OF_IMAGES,
|
||||
oMaxROISize, nppiNormInf, nppStreamCtx);
|
||||
|
||||
if (nppStatus != NPP_SUCCESS) {
|
||||
printf("LabelMarkersUFBatch_8Way_8u32u failed.\n");
|
||||
tearDown();
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Now read back generated device images to the host
|
||||
|
||||
for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) {
|
||||
cudaError = cudaMemcpy2DAsync(
|
||||
pUFLabelHost[nImage], oSizeROI[nImage].width * sizeof(Npp32u),
|
||||
pUFLabelDev[nImage], oSizeROI[nImage].width * sizeof(Npp32u),
|
||||
oSizeROI[nImage].width * sizeof(Npp32u), oSizeROI[nImage].height,
|
||||
cudaMemcpyDeviceToHost, nppStreamCtx.hStream);
|
||||
}
|
||||
|
||||
// Wait for host image read backs to complete, not necessary if no need to
|
||||
// synchronize
|
||||
if ((cudaError = cudaStreamSynchronize(nppStreamCtx.hStream)) !=
|
||||
cudaSuccess) {
|
||||
printf("Post label generation cudaStreamSynchronize failed\n");
|
||||
tearDown();
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Save output to files
|
||||
for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) {
|
||||
if (nImage == 0)
|
||||
FOPEN(bmpFile, LabelMarkersBatchOutputFile0.c_str(), "wb");
|
||||
else if (nImage == 1)
|
||||
FOPEN(bmpFile, LabelMarkersBatchOutputFile1.c_str(), "wb");
|
||||
else if (nImage == 2)
|
||||
FOPEN(bmpFile, LabelMarkersBatchOutputFile2.c_str(), "wb");
|
||||
else if (nImage == 3)
|
||||
FOPEN(bmpFile, LabelMarkersBatchOutputFile3.c_str(), "wb");
|
||||
else if (nImage == 4)
|
||||
FOPEN(bmpFile, LabelMarkersBatchOutputFile4.c_str(), "wb");
|
||||
|
||||
if (bmpFile == NULL) return -1;
|
||||
size_t nSize = 0;
|
||||
for (int j = 0; j < oSizeROI[nImage].height; j++) {
|
||||
nSize += fwrite(&pUFLabelHost[nImage][j * oSizeROI[nImage].width],
|
||||
sizeof(Npp32u), oSizeROI[nImage].width, bmpFile);
|
||||
}
|
||||
fclose(bmpFile);
|
||||
}
|
||||
|
||||
#ifdef USE_BATCHED_LABEL_COMPRESSION
|
||||
|
||||
// Now allocate scratch buffer memory for batched label compression
|
||||
cudaError = cudaMalloc((void **)&pUFBatchSrcDstScratchBufferListDev,
|
||||
NUMBER_OF_IMAGES * sizeof(NppiBufferDescriptor));
|
||||
if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR;
|
||||
|
||||
cudaError = cudaMalloc((void **)&pUFBatchPerImageCompressedCountListDev,
|
||||
NUMBER_OF_IMAGES * sizeof(Npp32u));
|
||||
if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR;
|
||||
|
||||
// Allocate host side scratch buffer point and size list and initialize with
|
||||
// device scratch buffer pointers
|
||||
checkCudaErrors(
|
||||
cudaMallocHost((void **)&pUFBatchSrcDstScratchBufferListHost,
|
||||
NUMBER_OF_IMAGES * sizeof(NppiBufferDescriptor)));
|
||||
|
||||
checkCudaErrors(
|
||||
cudaMallocHost((void **)&pUFBatchPerImageCompressedCountListHost,
|
||||
+NUMBER_OF_IMAGES * sizeof(Npp32u)));
|
||||
|
||||
// Start buffer pointer at beginning of full per image buffer list sized
|
||||
// pUFCompressedLabelsScratchBufferDev[0]
|
||||
Npp32u *pCurUFCompressedLabelsScratchBufferDev =
|
||||
reinterpret_cast<Npp32u *>(pUFCompressedLabelsScratchBufferDev[0]);
|
||||
|
||||
int nMaxUFCompressedLabelsScratchBufferSize = 0;
|
||||
|
||||
for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) {
|
||||
// This particular function works on in-place data and SrcDst image batch
|
||||
// list has already been initialized in batched label generation function
|
||||
// setup
|
||||
|
||||
// Initialize each per image buffer descriptor
|
||||
pUFBatchSrcDstScratchBufferListHost[nImage].pData =
|
||||
reinterpret_cast<void *>(pCurUFCompressedLabelsScratchBufferDev);
|
||||
pUFBatchSrcDstScratchBufferListHost[nImage].nBufferSize =
|
||||
aCompressLabelsScratchBufferSize[nImage];
|
||||
|
||||
if (aCompressLabelsScratchBufferSize[nImage] >
|
||||
nMaxUFCompressedLabelsScratchBufferSize)
|
||||
nMaxUFCompressedLabelsScratchBufferSize =
|
||||
aCompressLabelsScratchBufferSize[nImage];
|
||||
|
||||
// Offset buffer pointer to next per image buffer
|
||||
Npp8u *pTempBuffer =
|
||||
reinterpret_cast<Npp8u *>(pCurUFCompressedLabelsScratchBufferDev);
|
||||
pTempBuffer += aCompressLabelsScratchBufferSize[nImage];
|
||||
pCurUFCompressedLabelsScratchBufferDev =
|
||||
reinterpret_cast<Npp32u *>((void *)(pTempBuffer));
|
||||
}
|
||||
|
||||
// Copy compression batch scratch buffer list from CPU to GPU
|
||||
cudaError = cudaMemcpyAsync(pUFBatchSrcDstScratchBufferListDev,
|
||||
pUFBatchSrcDstScratchBufferListHost,
|
||||
NUMBER_OF_IMAGES * sizeof(NppiBufferDescriptor),
|
||||
cudaMemcpyHostToDevice, nppStreamCtx.hStream);
|
||||
if (cudaError != cudaSuccess) return NPP_MEMCPY_ERROR;
|
||||
|
||||
nppStatus = nppiCompressMarkerLabelsUFBatch_32u_C1IR_Advanced_Ctx(
|
||||
pUFBatchSrcDstImageListDev, pUFBatchSrcDstScratchBufferListDev,
|
||||
pUFBatchPerImageCompressedCountListDev, NUMBER_OF_IMAGES, oMaxROISize,
|
||||
nMaxUFCompressedLabelsScratchBufferSize, nppStreamCtx);
|
||||
if (nppStatus != NPP_SUCCESS) {
|
||||
printf("BatchCompressedLabelMarkersUF_8Way_32u failed.\n");
|
||||
tearDown();
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Copy output compressed label images back to host
|
||||
for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) {
|
||||
cudaError = cudaMemcpy2DAsync(
|
||||
pUFLabelHost[nImage], oSizeROI[nImage].width * sizeof(Npp32u),
|
||||
pUFLabelDev[nImage], oSizeROI[nImage].width * sizeof(Npp32u),
|
||||
oSizeROI[nImage].width * sizeof(Npp32u), oSizeROI[nImage].height,
|
||||
cudaMemcpyDeviceToHost, nppStreamCtx.hStream);
|
||||
}
|
||||
|
||||
// Wait for host image read backs to complete, not necessary if no need to
|
||||
// synchronize
|
||||
if ((cudaError = cudaStreamSynchronize(nppStreamCtx.hStream)) !=
|
||||
cudaSuccess) {
|
||||
printf("Post label compression cudaStreamSynchronize failed\n");
|
||||
tearDown();
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Save compressed label images into files
|
||||
for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) {
|
||||
if (nImage == 0)
|
||||
FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile0.c_str(), "wb");
|
||||
else if (nImage == 1)
|
||||
FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile1.c_str(), "wb");
|
||||
else if (nImage == 2)
|
||||
FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile2.c_str(), "wb");
|
||||
else if (nImage == 3)
|
||||
FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile3.c_str(), "wb");
|
||||
else if (nImage == 4)
|
||||
FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile4.c_str(), "wb");
|
||||
|
||||
if (bmpFile == NULL) return -1;
|
||||
size_t nSize = 0;
|
||||
for (int j = 0; j < oSizeROI[nImage].height; j++) {
|
||||
nSize += fwrite(&pUFLabelHost[nImage][j * oSizeROI[nImage].width],
|
||||
sizeof(Npp32u), oSizeROI[nImage].width, bmpFile);
|
||||
}
|
||||
fclose(bmpFile);
|
||||
}
|
||||
|
||||
// Read back per image compressed label count.
|
||||
cudaError = cudaMemcpyAsync(pUFBatchPerImageCompressedCountListHost,
|
||||
pUFBatchPerImageCompressedCountListDev,
|
||||
NUMBER_OF_IMAGES * sizeof(Npp32u),
|
||||
cudaMemcpyDeviceToHost, nppStreamCtx.hStream);
|
||||
if (cudaError != cudaSuccess) {
|
||||
tearDown();
|
||||
return NPP_MEMCPY_ERROR;
|
||||
}
|
||||
|
||||
// Wait for host read back to complete
|
||||
cudaError = cudaStreamSynchronize(nppStreamCtx.hStream);
|
||||
|
||||
printf("\n\n");
|
||||
|
||||
for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) {
|
||||
if (nImage == 0)
|
||||
printf(
|
||||
"teapot_CompressedMarkerLabelsUFBatch_8Way_512x512_32u succeeded, "
|
||||
"compressed label count is %d.\n",
|
||||
pUFBatchPerImageCompressedCountListHost[nImage]);
|
||||
else if (nImage == 1)
|
||||
printf(
|
||||
"CT_Skull_CompressedMarkerLabelsUFBatch_8Way_512x512_32u succeeded, "
|
||||
"compressed label count is %d.\n",
|
||||
pUFBatchPerImageCompressedCountListHost[nImage]);
|
||||
else if (nImage == 2)
|
||||
printf(
|
||||
"PCB_METAL_CompressedMarkerLabelsUFBatch_8Way_509x335_32u succeeded, "
|
||||
"compressed label count is %d.\n",
|
||||
pUFBatchPerImageCompressedCountListHost[nImage]);
|
||||
else if (nImage == 3)
|
||||
printf(
|
||||
"PCB2_CompressedMarkerLabelsUFBatch_8Way_1024x683_32u succeeded, "
|
||||
"compressed label count is %d.\n",
|
||||
pUFBatchPerImageCompressedCountListHost[nImage]);
|
||||
else if (nImage == 4)
|
||||
printf(
|
||||
"PCB_CompressedMarkerLabelsUFBatch_8Way_1280x720_32u succeeded, "
|
||||
"compressed label count is %d.\n",
|
||||
pUFBatchPerImageCompressedCountListHost[nImage]);
|
||||
}
|
||||
|
||||
#endif // USE_BATCHED_LABEL_COMPRESSION
|
||||
|
||||
tearDown();
|
||||
|
||||
return 0;
|
||||
}
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
40
Samples/4_CUDA_Libraries/boxFilterNPP/CMakeLists.txt
Normal file
40
Samples/4_CUDA_Libraries/boxFilterNPP/CMakeLists.txt
Normal file
@ -0,0 +1,40 @@
|
||||
# Include directories and libraries
|
||||
include_directories(
|
||||
../../../Common
|
||||
../../../Common/UtilNPP
|
||||
)
|
||||
|
||||
# Source file
|
||||
set(SRC_FILES
|
||||
boxFilterNPP.cpp
|
||||
)
|
||||
|
||||
find_package(FreeImage)
|
||||
|
||||
if(${FreeImage_FOUND})
|
||||
# Add target for boxFilterNPP
|
||||
add_executable(boxFilterNPP ${SRC_FILES})
|
||||
set_target_properties(boxFilterNPP PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
|
||||
|
||||
target_include_directories(boxFilterNPP PRIVATE
|
||||
${CUDAToolkit_INCLUDE_DIRS}
|
||||
${FreeImage_INCLUDE_DIRS}
|
||||
)
|
||||
|
||||
target_link_libraries(boxFilterNPP PRIVATE
|
||||
CUDA::nppc
|
||||
CUDA::nppisu
|
||||
CUDA::nppif
|
||||
CUDA::cudart
|
||||
${FreeImage_LIBRARIES}
|
||||
)
|
||||
|
||||
# Copy data files to output directory
|
||||
add_custom_command(TARGET boxFilterNPP POST_BUILD
|
||||
COMMAND ${CMAKE_COMMAND} -E copy_if_different
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/*.pgm
|
||||
${CMAKE_CURRENT_BINARY_DIR}
|
||||
)
|
||||
else()
|
||||
message(STATUS "FreeImage not found - will not build sample 'boxFilterNPP'")
|
||||
endif()
|
@ -1,378 +0,0 @@
|
||||
################################################################################
|
||||
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions
|
||||
# are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
################################################################################
|
||||
#
|
||||
# Makefile project only supported on Mac OS X and Linux Platforms)
|
||||
#
|
||||
################################################################################
|
||||
|
||||
# Location of the CUDA Toolkit
|
||||
CUDA_PATH ?= /usr/local/cuda
|
||||
|
||||
##############################
|
||||
# start deprecated interface #
|
||||
##############################
|
||||
ifeq ($(x86_64),1)
|
||||
$(info WARNING - x86_64 variable has been deprecated)
|
||||
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
|
||||
TARGET_ARCH ?= x86_64
|
||||
endif
|
||||
ifeq ($(ARMv7),1)
|
||||
$(info WARNING - ARMv7 variable has been deprecated)
|
||||
$(info WARNING - please use TARGET_ARCH=armv7l instead)
|
||||
TARGET_ARCH ?= armv7l
|
||||
endif
|
||||
ifeq ($(aarch64),1)
|
||||
$(info WARNING - aarch64 variable has been deprecated)
|
||||
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
|
||||
TARGET_ARCH ?= aarch64
|
||||
endif
|
||||
ifeq ($(ppc64le),1)
|
||||
$(info WARNING - ppc64le variable has been deprecated)
|
||||
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
|
||||
TARGET_ARCH ?= ppc64le
|
||||
endif
|
||||
ifneq ($(GCC),)
|
||||
$(info WARNING - GCC variable has been deprecated)
|
||||
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
|
||||
HOST_COMPILER ?= $(GCC)
|
||||
endif
|
||||
ifneq ($(abi),)
|
||||
$(error ERROR - abi variable has been removed)
|
||||
endif
|
||||
############################
|
||||
# end deprecated interface #
|
||||
############################
|
||||
|
||||
# architecture
|
||||
HOST_ARCH := $(shell uname -m)
|
||||
TARGET_ARCH ?= $(HOST_ARCH)
|
||||
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
|
||||
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
|
||||
TARGET_SIZE := 64
|
||||
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
|
||||
TARGET_SIZE := 32
|
||||
endif
|
||||
else
|
||||
TARGET_SIZE := $(shell getconf LONG_BIT)
|
||||
endif
|
||||
else
|
||||
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
|
||||
endif
|
||||
|
||||
# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
|
||||
ifeq ($(HOST_ARCH),aarch64)
|
||||
ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
|
||||
HOST_ARCH := sbsa
|
||||
TARGET_ARCH := sbsa
|
||||
endif
|
||||
endif
|
||||
|
||||
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
|
||||
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
|
||||
endif
|
||||
endif
|
||||
|
||||
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
|
||||
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
|
||||
TARGET_ARCH = armv7l
|
||||
endif
|
||||
|
||||
# operating system
|
||||
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
|
||||
TARGET_OS ?= $(HOST_OS)
|
||||
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
|
||||
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
|
||||
endif
|
||||
|
||||
# host compiler
|
||||
ifdef HOST_COMPILER
|
||||
CUSTOM_HOST_COMPILER = 1
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET_OS),darwin)
|
||||
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
|
||||
HOST_COMPILER ?= clang++
|
||||
endif
|
||||
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
|
||||
ifeq ($(TARGET_OS),linux)
|
||||
HOST_COMPILER ?= arm-linux-gnueabihf-g++
|
||||
else ifeq ($(TARGET_OS),qnx)
|
||||
ifeq ($(QNX_HOST),)
|
||||
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
|
||||
endif
|
||||
ifeq ($(QNX_TARGET),)
|
||||
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
|
||||
endif
|
||||
export QNX_HOST
|
||||
export QNX_TARGET
|
||||
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
|
||||
else ifeq ($(TARGET_OS),android)
|
||||
HOST_COMPILER ?= arm-linux-androideabi-g++
|
||||
endif
|
||||
else ifeq ($(TARGET_ARCH),aarch64)
|
||||
ifeq ($(TARGET_OS), linux)
|
||||
HOST_COMPILER ?= aarch64-linux-gnu-g++
|
||||
else ifeq ($(TARGET_OS),qnx)
|
||||
ifeq ($(QNX_HOST),)
|
||||
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
|
||||
endif
|
||||
ifeq ($(QNX_TARGET),)
|
||||
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
|
||||
endif
|
||||
export QNX_HOST
|
||||
export QNX_TARGET
|
||||
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
|
||||
else ifeq ($(TARGET_OS), android)
|
||||
HOST_COMPILER ?= aarch64-linux-android-clang++
|
||||
endif
|
||||
else ifeq ($(TARGET_ARCH),sbsa)
|
||||
HOST_COMPILER ?= aarch64-linux-gnu-g++
|
||||
else ifeq ($(TARGET_ARCH),ppc64le)
|
||||
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
|
||||
endif
|
||||
endif
|
||||
HOST_COMPILER ?= g++
|
||||
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
|
||||
|
||||
# internal flags
|
||||
NVCCFLAGS := -m${TARGET_SIZE}
|
||||
CCFLAGS :=
|
||||
LDFLAGS :=
|
||||
|
||||
# build flags
|
||||
|
||||
# Link flag for customized HOST_COMPILER with gcc realpath
|
||||
GCC_PATH := $(shell which gcc)
|
||||
ifeq ($(CUSTOM_HOST_COMPILER),1)
|
||||
ifneq ($(filter /%,$(HOST_COMPILER)),)
|
||||
ifneq ($(findstring gcc,$(HOST_COMPILER)),)
|
||||
ifneq ($(GCC_PATH),$(HOST_COMPILER))
|
||||
LDFLAGS += -lstdc++
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET_OS),darwin)
|
||||
LDFLAGS += -rpath $(CUDA_PATH)/lib
|
||||
CCFLAGS += -arch $(HOST_ARCH)
|
||||
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
|
||||
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
|
||||
CCFLAGS += -mfloat-abi=hard
|
||||
else ifeq ($(TARGET_OS),android)
|
||||
LDFLAGS += -pie
|
||||
CCFLAGS += -fpie -fpic -fexceptions
|
||||
endif
|
||||
|
||||
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
|
||||
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
|
||||
ifneq ($(TARGET_FS),)
|
||||
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
|
||||
ifeq ($(GCCVERSIONLTEQ46),1)
|
||||
CCFLAGS += --sysroot=$(TARGET_FS)
|
||||
endif
|
||||
LDFLAGS += --sysroot=$(TARGET_FS)
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
|
||||
endif
|
||||
endif
|
||||
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
|
||||
ifneq ($(TARGET_FS),)
|
||||
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
|
||||
ifeq ($(GCCVERSIONLTEQ46),1)
|
||||
CCFLAGS += --sysroot=$(TARGET_FS)
|
||||
endif
|
||||
LDFLAGS += --sysroot=$(TARGET_FS)
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
|
||||
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
|
||||
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
|
||||
CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
|
||||
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
|
||||
endif
|
||||
endif
|
||||
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
|
||||
NVCCFLAGS += -D_QNX_SOURCE
|
||||
NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
|
||||
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
|
||||
LDFLAGS += -lsocket
|
||||
LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
|
||||
CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
|
||||
ifdef TARGET_OVERRIDE
|
||||
LDFLAGS += -lslog2
|
||||
endif
|
||||
|
||||
ifneq ($(TARGET_FS),)
|
||||
LDFLAGS += -L$(TARGET_FS)/usr/lib
|
||||
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
|
||||
LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
|
||||
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
|
||||
CCFLAGS += -I$(TARGET_FS)/../include
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef TARGET_OVERRIDE # cuda toolkit targets override
|
||||
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
|
||||
endif
|
||||
|
||||
# Install directory of different arch
|
||||
CUDA_INSTALL_TARGET_DIR :=
|
||||
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
|
||||
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
|
||||
else ifeq ($(TARGET_ARCH),ppc64le)
|
||||
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
|
||||
endif
|
||||
|
||||
# Debug build flags
|
||||
ifeq ($(dbg),1)
|
||||
NVCCFLAGS += -g -G
|
||||
BUILD_TYPE := debug
|
||||
else
|
||||
BUILD_TYPE := release
|
||||
endif
|
||||
|
||||
ALL_CCFLAGS :=
|
||||
ALL_CCFLAGS += $(NVCCFLAGS)
|
||||
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
|
||||
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
|
||||
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
|
||||
|
||||
SAMPLE_ENABLED := 1
|
||||
|
||||
# This sample is not supported on QNX
|
||||
ifeq ($(TARGET_OS),qnx)
|
||||
$(info >>> WARNING - boxFilterNPP is not supported on QNX - waiving sample <<<)
|
||||
SAMPLE_ENABLED := 0
|
||||
endif
|
||||
|
||||
ALL_LDFLAGS :=
|
||||
ALL_LDFLAGS += $(ALL_CCFLAGS)
|
||||
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
|
||||
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
|
||||
|
||||
# Common includes and paths for CUDA
|
||||
INCLUDES := -I../../../Common
|
||||
LIBRARIES :=
|
||||
|
||||
################################################################################
|
||||
|
||||
# Gencode arguments
|
||||
SMS ?=
|
||||
|
||||
ifeq ($(GENCODE_FLAGS),)
|
||||
# Generate SASS code for each SM architecture listed in $(SMS)
|
||||
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
|
||||
|
||||
ifeq ($(SMS),)
|
||||
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
|
||||
# Generate PTX code from SM 53
|
||||
GENCODE_FLAGS += -gencode arch=compute_53,code=compute_53
|
||||
else
|
||||
# Generate PTX code from SM 50
|
||||
GENCODE_FLAGS += -gencode arch=compute_50,code=compute_50
|
||||
endif
|
||||
endif
|
||||
|
||||
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
|
||||
HIGHEST_SM := $(lastword $(sort $(SMS)))
|
||||
ifneq ($(HIGHEST_SM),)
|
||||
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
|
||||
endif
|
||||
endif
|
||||
|
||||
ALL_CCFLAGS += --threads 0 --std=c++11
|
||||
|
||||
INCLUDES += -I../../../Common/UtilNPP
|
||||
|
||||
LIBRARIES += -lnppisu_static -lnppif_static -lnppc_static -lculibos -lfreeimage
|
||||
|
||||
# Attempt to compile a minimal application linked against FreeImage. If a.out exists, FreeImage is properly set up.
|
||||
$(shell echo "#include \"FreeImage.h\"" > test.c; echo "int main() { return 0; }" >> test.c ; $(NVCC) $(ALL_CCFLAGS) $(INCLUDES) $(ALL_LDFLAGS) $(LIBRARIES) -l freeimage test.c)
|
||||
FREEIMAGE := $(shell find a.out 2>/dev/null)
|
||||
$(shell rm a.out test.c 2>/dev/null)
|
||||
|
||||
ifeq ("$(FREEIMAGE)","")
|
||||
$(info >>> WARNING - FreeImage is not set up correctly. Please ensure FreeImage is set up correctly. <<<)
|
||||
SAMPLE_ENABLED := 0
|
||||
endif
|
||||
|
||||
ifeq ($(SAMPLE_ENABLED),0)
|
||||
EXEC ?= @echo "[@]"
|
||||
endif
|
||||
|
||||
################################################################################
|
||||
|
||||
# Target rules
|
||||
all: build
|
||||
|
||||
build: boxFilterNPP
|
||||
|
||||
check.deps:
|
||||
ifeq ($(SAMPLE_ENABLED),0)
|
||||
@echo "Sample will be waived due to the above missing dependencies"
|
||||
else
|
||||
@echo "Sample is ready - all dependencies have been met"
|
||||
endif
|
||||
|
||||
boxFilterNPP.o:boxFilterNPP.cpp
|
||||
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
|
||||
|
||||
boxFilterNPP: boxFilterNPP.o
|
||||
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
|
||||
$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
|
||||
$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
|
||||
|
||||
run: build
|
||||
$(EXEC) ./boxFilterNPP
|
||||
|
||||
testrun: build
|
||||
|
||||
clean:
|
||||
rm -f boxFilterNPP boxFilterNPP.o
|
||||
rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/boxFilterNPP
|
||||
|
||||
clobber: clean
|
@ -1,95 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
|
||||
<entry>
|
||||
<name>boxFilterNPP</name>
|
||||
<cuda_api_list>
|
||||
<toolkit>cudaRuntimeGetVersion</toolkit>
|
||||
<toolkit>cudaDriverGetVersion</toolkit>
|
||||
</cuda_api_list>
|
||||
<description><![CDATA[A NPP CUDA Sample that demonstrates how to use NPP FilterBox function to perform a Box Filter.]]></description>
|
||||
<devicecompilation>whole</devicecompilation>
|
||||
<fallback_min_ptx>true</fallback_min_ptx>
|
||||
<files>
|
||||
<file>./teapot512.pgm</file>
|
||||
</files>
|
||||
<includepaths>
|
||||
<path>../../../Common/UtilNPP</path>
|
||||
<path os="Windows">../../../Common/FreeImage/Dist/x64</path>
|
||||
<path>./</path>
|
||||
<path>../</path>
|
||||
<path>../../../Common</path>
|
||||
</includepaths>
|
||||
<keyconcepts>
|
||||
<concept level="basic">Performance Strategies</concept>
|
||||
<concept level="basic">Image Processing</concept>
|
||||
<concept level="basic">NPP Library</concept>
|
||||
</keyconcepts>
|
||||
<keywords>
|
||||
<keyword>CUDA</keyword>
|
||||
<keyword>NPP</keyword>
|
||||
<keyword>Image Processing</keyword>
|
||||
<keyword>box filter</keyword>
|
||||
</keywords>
|
||||
<libraries>
|
||||
<library>nppisu_static</library>
|
||||
<library>nppif_static</library>
|
||||
<library>nppc_static</library>
|
||||
<library>culibos</library>
|
||||
<library>freeimage</library>
|
||||
</libraries>
|
||||
<librarypaths>
|
||||
</librarypaths>
|
||||
<nsight_eclipse>true</nsight_eclipse>
|
||||
<primary_file>boxFilterNPP.cpp</primary_file>
|
||||
<required_dependencies>
|
||||
<dependency>FreeImage</dependency>
|
||||
<dependency>NPP</dependency>
|
||||
</required_dependencies>
|
||||
<scopes>
|
||||
<scope>1:CUDA Basic Topics</scope>
|
||||
<scope>1:Performance Strategies</scope>
|
||||
<scope>2:Image Processing</scope>
|
||||
<scope>2:Computer Vision</scope>
|
||||
</scopes>
|
||||
<sm-arch>sm50</sm-arch>
|
||||
<sm-arch>sm52</sm-arch>
|
||||
<sm-arch>sm53</sm-arch>
|
||||
<sm-arch>sm60</sm-arch>
|
||||
<sm-arch>sm61</sm-arch>
|
||||
<sm-arch>sm70</sm-arch>
|
||||
<sm-arch>sm72</sm-arch>
|
||||
<sm-arch>sm75</sm-arch>
|
||||
<sm-arch>sm80</sm-arch>
|
||||
<sm-arch>sm86</sm-arch>
|
||||
<sm-arch>sm87</sm-arch>
|
||||
<sm-arch>sm89</sm-arch>
|
||||
<sm-arch>sm90</sm-arch>
|
||||
<supported_envs>
|
||||
<env>
|
||||
<arch>x86_64</arch>
|
||||
<platform>linux</platform>
|
||||
</env>
|
||||
<env>
|
||||
<platform>windows7</platform>
|
||||
</env>
|
||||
<env>
|
||||
<arch>x86_64</arch>
|
||||
<platform>macosx</platform>
|
||||
</env>
|
||||
<env>
|
||||
<arch>arm</arch>
|
||||
</env>
|
||||
<env>
|
||||
<arch>sbsa</arch>
|
||||
</env>
|
||||
<env>
|
||||
<arch>ppc64le</arch>
|
||||
<platform>linux</platform>
|
||||
</env>
|
||||
</supported_envs>
|
||||
<supported_sm_architectures>
|
||||
<include>all</include>
|
||||
</supported_sm_architectures>
|
||||
<title>Box Filter with NPP</title>
|
||||
<type>exe</type>
|
||||
</entry>
|
@ -2,5 +2,6 @@ add_subdirectory(0_Introduction)
|
||||
add_subdirectory(1_Utilities)
|
||||
add_subdirectory(2_Concepts_and_Techniques)
|
||||
add_subdirectory(3_CUDA_Features)
|
||||
add_subdirectory(4_CUDA_Libraries)
|
||||
add_subdirectory(6_Performance)
|
||||
add_subdirectory(7_libNVVM)
|
||||
|
17
cmake/Modules/FindFreeImage.cmake
Normal file
17
cmake/Modules/FindFreeImage.cmake
Normal file
@ -0,0 +1,17 @@
|
||||
find_path(FreeImage_INCLUDE_DIR
|
||||
NAMES freeimage.h FreeImage.h
|
||||
PATHS /usr/include /usr/local/include
|
||||
)
|
||||
|
||||
find_library(FreeImage_LIBRARY
|
||||
NAMES freeimage
|
||||
PATHS /usr/lib /usr/local/lib
|
||||
)
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
find_package_handle_standard_args(FreeImage DEFAULT_MSG FreeImage_LIBRARY FreeImage_INCLUDE_DIR)
|
||||
|
||||
if(FreeImage_FOUND)
|
||||
set(FreeImage_LIBRARIES ${FreeImage_LIBRARY})
|
||||
set(FreeImage_INCLUDE_DIRS ${FreeImage_INCLUDE_DIR})
|
||||
endif()
|
Loading…
x
Reference in New Issue
Block a user