Update boxFilterNPP, delete batchedLabelMarkersAndLabelCompressionNPP

This commit is contained in:
Rob Armstrong 2024-12-13 10:38:24 -08:00
parent 89f2e5c0c3
commit a3be0d3cd8
38 changed files with 121 additions and 2305 deletions

View File

@ -13,6 +13,9 @@
* `simpleVoteIntrinsics_nvrtc` demonstrating NVRTC usage for `simpleVoteIntrinsics` sample (reason: redundant)
* `2_Concepts_and_Techniques`
* `cuHook` demonstrating dlsym hooks. (reason: incompatible with modern `glibc`)
* `4_CUDA_Libraries`
* `batchedLabelMarkersAndLabelCompressionNPP` demonstrating NPP features (reason: some functionality removed from library)
### CUDA 12.5

View File

@ -0,0 +1,40 @@
# Build list for the 4_CUDA_Libraries samples.
# Commented-out entries exist in the source tree but are not built by this
# file — presumably samples not yet migrated to the CMake build system;
# TODO(review): confirm before enabling any of them.
#add_subdirectory(FilterBorderControlNPP)
#add_subdirectory(MersenneTwisterGP11213)
add_subdirectory(batchCUBLAS)
add_subdirectory(boxFilterNPP)
#add_subdirectory(cannyEdgeDetectorNPP)
#add_subdirectory(conjugateGradient)
#add_subdirectory(conjugateGradientCudaGraphs)
#add_subdirectory(conjugateGradientMultiBlockCG)
#add_subdirectory(conjugateGradientMultiDeviceCG)
#add_subdirectory(conjugateGradientPrecond)
#add_subdirectory(conjugateGradientUM)
#add_subdirectory(cuDLAErrorReporting)
#add_subdirectory(cuDLAHybridMode)
#add_subdirectory(cuDLALayerwiseStatsHybrid)
#add_subdirectory(cuDLALayerwiseStatsStandalone)
#add_subdirectory(cuDLAStandaloneMode)
#add_subdirectory(cuSolverDn_LinearSolver)
#add_subdirectory(cuSolverRf)
#add_subdirectory(cuSolverSp_LinearSolver)
#add_subdirectory(cuSolverSp_LowlevelCholesky)
#add_subdirectory(cuSolverSp_LowlevelQR)
#add_subdirectory(cudaNvSci)
#add_subdirectory(cudaNvSciNvMedia)
#add_subdirectory(freeImageInteropNPP)
#add_subdirectory(histEqualizationNPP)
#add_subdirectory(jitLto)
#add_subdirectory(lineOfSight)
#add_subdirectory(matrixMulCUBLAS)
#add_subdirectory(nvJPEG)
#add_subdirectory(nvJPEG_encoder)
#add_subdirectory(oceanFFT)
#add_subdirectory(randomFog)
#add_subdirectory(simpleCUBLAS)
#add_subdirectory(simpleCUBLASXT)
#add_subdirectory(simpleCUBLAS_LU)
#add_subdirectory(simpleCUFFT)
#add_subdirectory(simpleCUFFT_2d_MGPU)
#add_subdirectory(simpleCUFFT_MGPU)
#add_subdirectory(simpleCUFFT_callback)
#add_subdirectory(watershedSegmentationNPP)

View File

@ -0,0 +1,20 @@
# Build configuration for the batchCUBLAS sample executable.

# Sample sources — explicit list; add new files here (no globbing).
set(SRC_FILES
    batchCUBLAS.cpp
)

# Add target for batchCUBLAS
add_executable(batchCUBLAS ${SRC_FILES})

# NOTE(review): the source list is pure .cpp, so CUDA separable compilation
# should have no effect here; kept for consistency with sibling samples —
# confirm before removing.
set_target_properties(batchCUBLAS PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

# Target-scoped include paths (replaces directory-scoped include_directories,
# which leaked the path to every target configured after this file):
# the shared samples Common headers plus the CUDA toolkit headers.
target_include_directories(batchCUBLAS PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}/../../../Common
    ${CUDAToolkit_INCLUDE_DIRS}
)

# Link against the cuBLAS and CUDA runtime imported targets.
target_link_libraries(batchCUBLAS PRIVATE
    CUDA::cublas
    CUDA::cudart
)

View File

@ -1,347 +0,0 @@
################################################################################
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
#
# Makefile project only supported on Mac OS X and Linux platforms
#
################################################################################
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda
##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################
# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
ifeq ($(HOST_ARCH),aarch64)
ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
HOST_ARCH := sbsa
TARGET_ARCH := sbsa
endif
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
ifdef HOST_COMPILER
CUSTOM_HOST_COMPILER = 1
endif
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),sbsa)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
# Link flag for customized HOST_COMPILER with gcc realpath
GCC_PATH := $(shell which gcc)
ifeq ($(CUSTOM_HOST_COMPILER),1)
ifneq ($(filter /%,$(HOST_COMPILER)),)
ifneq ($(findstring gcc,$(HOST_COMPILER)),)
ifneq ($(GCC_PATH),$(HOST_COMPILER))
LDFLAGS += -lstdc++
endif
endif
endif
endif
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
NVCCFLAGS += -D_QNX_SOURCE
NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
LDFLAGS += -lsocket
LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
ifdef TARGET_OVERRIDE
LDFLAGS += -lslog2
endif
ifneq ($(TARGET_FS),)
LDFLAGS += -L$(TARGET_FS)/usr/lib
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
CCFLAGS += -I$(TARGET_FS)/../include
endif
endif
endif
ifdef TARGET_OVERRIDE # cuda toolkit targets override
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif
# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I../../../Common
LIBRARIES :=
################################################################################
# Gencode arguments
SMS ?=
ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
ifeq ($(SMS),)
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
# Generate PTX code from SM 53
GENCODE_FLAGS += -gencode arch=compute_53,code=compute_53
else
# Generate PTX code from SM 50
GENCODE_FLAGS += -gencode arch=compute_50,code=compute_50
endif
endif
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
ALL_CCFLAGS += --threads 0 --std=c++11
LIBRARIES += -lcublas
################################################################################
# Target rules
all: build
build: batchCUBLAS
batchCUBLAS.o:batchCUBLAS.cpp
$(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
batchCUBLAS: batchCUBLAS.o
$(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
run: build
./batchCUBLAS
testrun: build
clean:
rm -f batchCUBLAS batchCUBLAS.o
rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/batchCUBLAS
clobber: clean

View File

@ -1,89 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
<entry>
<name>batchCUBLAS</name>
<cuda_api_list>
<driver>cuRand</driver>
<driver>cuEqual</driver>
<toolkit>cudaMemcpy</toolkit>
<toolkit>cudaGetErrorString</toolkit>
<toolkit>cudaFree</toolkit>
<toolkit>cudaGetLastError</toolkit>
<toolkit>cudaDeviceSynchronize</toolkit>
<toolkit>cudaGetDevice</toolkit>
<toolkit>cudaMalloc</toolkit>
<toolkit>cudaStreamCreate</toolkit>
<toolkit>cudaGetDeviceProperties</toolkit>
</cuda_api_list>
<description><![CDATA[A CUDA Sample that demonstrates how to use batched cuBLAS API calls to improve overall performance.]]></description>
<devicecompilation>whole</devicecompilation>
<fallback_min_ptx>true</fallback_min_ptx>
<includepaths>
<path>./</path>
<path>../</path>
<path>../../../Common</path>
</includepaths>
<keyconcepts>
<concept level="basic">Linear Algebra</concept>
<concept level="basic">CUBLAS Library</concept>
</keyconcepts>
<keywords>
<keyword>CUBLAS</keyword>
<keyword>Linear Algebra</keyword>
</keywords>
<libraries>
<library>cublas</library>
</libraries>
<librarypaths>
</librarypaths>
<nsight_eclipse>true</nsight_eclipse>
<primary_file>batchCUBLAS.cpp</primary_file>
<required_dependencies>
<dependency>CUBLAS</dependency>
</required_dependencies>
<scopes>
<scope>1:CUDA Basic Topics</scope>
<scope>3:Linear Algebra</scope>
</scopes>
<sm-arch>sm50</sm-arch>
<sm-arch>sm52</sm-arch>
<sm-arch>sm53</sm-arch>
<sm-arch>sm60</sm-arch>
<sm-arch>sm61</sm-arch>
<sm-arch>sm70</sm-arch>
<sm-arch>sm72</sm-arch>
<sm-arch>sm75</sm-arch>
<sm-arch>sm80</sm-arch>
<sm-arch>sm86</sm-arch>
<sm-arch>sm87</sm-arch>
<sm-arch>sm89</sm-arch>
<sm-arch>sm90</sm-arch>
<supported_envs>
<env>
<arch>x86_64</arch>
<platform>linux</platform>
</env>
<env>
<platform>windows7</platform>
</env>
<env>
<arch>x86_64</arch>
<platform>macosx</platform>
</env>
<env>
<arch>arm</arch>
</env>
<env>
<arch>sbsa</arch>
</env>
<env>
<arch>ppc64le</arch>
<platform>linux</platform>
</env>
</supported_envs>
<supported_sm_architectures>
<include>all</include>
</supported_sm_architectures>
<title>batchCUBLAS</title>
<type>exe</type>
</entry>

View File

@ -1,18 +0,0 @@
{
"configurations": [
{
"name": "Linux",
"includePath": [
"${workspaceFolder}/**",
"${workspaceFolder}/../../../Common"
],
"defines": [],
"compilerPath": "/usr/local/cuda/bin/nvcc",
"cStandard": "gnu17",
"cppStandard": "gnu++14",
"intelliSenseMode": "linux-gcc-x64",
"configurationProvider": "ms-vscode.makefile-tools"
}
],
"version": 4
}

View File

@ -1,7 +0,0 @@
{
"recommendations": [
"nvidia.nsight-vscode-edition",
"ms-vscode.cpptools",
"ms-vscode.makefile-tools"
]
}

View File

@ -1,10 +0,0 @@
{
"configurations": [
{
"name": "CUDA C++: Launch",
"type": "cuda-gdb",
"request": "launch",
"program": "${workspaceFolder}/batchedLabelMarkersAndLabelCompressionNPP"
}
]
}

View File

@ -1,15 +0,0 @@
{
"version": "2.0.0",
"tasks": [
{
"label": "sample",
"type": "shell",
"command": "make dbg=1",
"problemMatcher": ["$nvcc"],
"group": {
"kind": "build",
"isDefault": true
}
}
]
}

View File

@ -1,372 +0,0 @@
################################################################################
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
#
# Makefile project only supported on Mac OS X and Linux platforms
#
################################################################################
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda
##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################
# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
ifeq ($(HOST_ARCH),aarch64)
ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
HOST_ARCH := sbsa
TARGET_ARCH := sbsa
endif
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
ifdef HOST_COMPILER
CUSTOM_HOST_COMPILER = 1
endif
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),sbsa)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
# Link flag for customized HOST_COMPILER with gcc realpath
GCC_PATH := $(shell which gcc)
ifeq ($(CUSTOM_HOST_COMPILER),1)
ifneq ($(filter /%,$(HOST_COMPILER)),)
ifneq ($(findstring gcc,$(HOST_COMPILER)),)
ifneq ($(GCC_PATH),$(HOST_COMPILER))
LDFLAGS += -lstdc++
endif
endif
endif
endif
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
NVCCFLAGS += -D_QNX_SOURCE
NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
LDFLAGS += -lsocket
LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
ifdef TARGET_OVERRIDE
LDFLAGS += -lslog2
endif
ifneq ($(TARGET_FS),)
LDFLAGS += -L$(TARGET_FS)/usr/lib
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
CCFLAGS += -I$(TARGET_FS)/../include
endif
endif
endif
ifdef TARGET_OVERRIDE # cuda toolkit targets override
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif
# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
SAMPLE_ENABLED := 1
# This sample is not supported on Mac OSX
ifeq ($(TARGET_OS),darwin)
$(info >>> WARNING - batchedLabelMarkersAndLabelCompressionNPP is not supported on Mac OSX - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
# This sample is not supported on QNX
ifeq ($(TARGET_OS),qnx)
$(info >>> WARNING - batchedLabelMarkersAndLabelCompressionNPP is not supported on QNX - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I../../../Common
LIBRARIES :=
################################################################################
# Gencode arguments
SMS ?=
ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
ifeq ($(SMS),)
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
# Generate PTX code from SM 53
GENCODE_FLAGS += -gencode arch=compute_53,code=compute_53
else
# Generate PTX code from SM 50
GENCODE_FLAGS += -gencode arch=compute_50,code=compute_50
endif
endif
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
ALL_CCFLAGS += --threads 0 --std=c++11
LIBRARIES += -lnppisu_static -lnppif_static -lnppc_static -lculibos
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
################################################################################
# Target rules
all: build
build: batchedLabelMarkersAndLabelCompressionNPP
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
@echo "Sample will be waived due to the above missing dependencies"
else
@echo "Sample is ready - all dependencies have been met"
endif
batchedLabelMarkersAndLabelCompressionNPP.o:batchedLabelMarkersAndLabelCompressionNPP.cpp
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
batchedLabelMarkersAndLabelCompressionNPP: batchedLabelMarkersAndLabelCompressionNPP.o
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
run: build
$(EXEC) ./batchedLabelMarkersAndLabelCompressionNPP
testrun: build
clean:
rm -f batchedLabelMarkersAndLabelCompressionNPP batchedLabelMarkersAndLabelCompressionNPP.o
rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/batchedLabelMarkersAndLabelCompressionNPP
clobber: clean

View File

@ -1,95 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
<entry>
<name>batchedLabelMarkersAndLabelCompressionNPP</name>
<cuda_api_list>
<toolkit>cudaRuntimeGetVersion</toolkit>
<toolkit>cudaMallocPitch</toolkit>
<toolkit>cudaFree</toolkit>
<toolkit>cudaDeviceGetAttribute</toolkit>
<toolkit>cudaMallocHost</toolkit>
<toolkit>cudaDriverGetVersion</toolkit>
<toolkit>cudaFreeHost</toolkit>
<toolkit>cudaGetDevice</toolkit>
<toolkit>cudaStreamGetFlags</toolkit>
<toolkit>cudaStreamSynchronize</toolkit>
<toolkit>cudaMalloc</toolkit>
<toolkit>cudaMemcpyAsync</toolkit>
<toolkit>cudaGetDeviceProperties</toolkit>
</cuda_api_list>
<description><![CDATA[An NPP CUDA Sample that demonstrates how to use the NPP label markers generation and label compression functions based on a Union Find (UF) algorithm including both single image and batched image versions.]]></description>
<devicecompilation>whole</devicecompilation>
<fallback_min_ptx>true</fallback_min_ptx>
<includepaths>
<path>./</path>
<path>../</path>
<path>../../../Common</path>
</includepaths>
<keyconcepts>
<concept level="basic">Performance Strategies</concept>
<concept level="basic">Image Processing</concept>
<concept level="basic">NPP Library</concept>
<concept level="basic">Using NPP Batch Functions</concept>
</keyconcepts>
<keywords>
<keyword>CUDA</keyword>
<keyword>NPP</keyword>
<keyword>Image Processing</keyword>
</keywords>
<libraries>
<library>nppisu_static</library>
<library>nppif_static</library>
<library>nppc_static</library>
<library>culibos</library>
</libraries>
<librarypaths>
</librarypaths>
<nsight_eclipse>true</nsight_eclipse>
<primary_file>batchedLabelMarkersAndLabelCompressionNPP.cpp</primary_file>
<required_dependencies>
<dependency>NPP</dependency>
</required_dependencies>
<scopes>
<scope>1:CUDA Basic Topics</scope>
<scope>1:Performance Strategies</scope>
<scope>2:Image Processing</scope>
<scope>2:Computer Vision</scope>
</scopes>
<sm-arch>sm50</sm-arch>
<sm-arch>sm52</sm-arch>
<sm-arch>sm53</sm-arch>
<sm-arch>sm60</sm-arch>
<sm-arch>sm61</sm-arch>
<sm-arch>sm70</sm-arch>
<sm-arch>sm72</sm-arch>
<sm-arch>sm75</sm-arch>
<sm-arch>sm80</sm-arch>
<sm-arch>sm86</sm-arch>
<sm-arch>sm87</sm-arch>
<sm-arch>sm89</sm-arch>
<sm-arch>sm90</sm-arch>
<supported_envs>
<env>
<arch>x86_64</arch>
<platform>linux</platform>
</env>
<env>
<platform>windows7</platform>
</env>
<env>
<arch>arm</arch>
</env>
<env>
<arch>sbsa</arch>
</env>
<env>
<arch>ppc64le</arch>
<platform>linux</platform>
</env>
</supported_envs>
<supported_sm_architectures>
<include>all</include>
</supported_sm_architectures>
<title>Batched Label Markers And Label Compression NPP</title>
<type>exe</type>
</entry>

View File

@ -1,74 +0,0 @@
# batchedLabelMarkersAndLabelCompressionNPP - Batched Label Markers And Label Compression NPP
## Description
An NPP CUDA Sample that demonstrates how to use the NPP label markers generation and label compression functions based on a Union Find (UF) algorithm including both single image and batched image versions.
## Key Concepts
Performance Strategies, Image Processing, NPP Library, Using NPP Batch Functions
## Supported SM Architectures
[SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 5.3 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus) [SM 8.9 ](https://developer.nvidia.com/cuda-gpus) [SM 9.0 ](https://developer.nvidia.com/cuda-gpus)
## Supported OSes
Linux, Windows
## Supported CPU Architecture
x86_64, ppc64le, armv7l
## CUDA APIs involved
### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
cudaRuntimeGetVersion, cudaMallocPitch, cudaFree, cudaDeviceGetAttribute, cudaMallocHost, cudaDriverGetVersion, cudaFreeHost, cudaGetDevice, cudaStreamGetFlags, cudaStreamSynchronize, cudaMalloc, cudaMemcpyAsync, cudaGetDeviceProperties
## Dependencies needed to build/run
[NPP](../../../README.md#npp)
## Prerequisites
Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in the [Dependencies](#dependencies-needed-to-buildrun) section above are installed.
## Build and Run
### Windows
The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
```
*_vs<version>.sln - for Visual Studio <version>
```
Each individual sample has its own set of solution files in its directory:
To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.
### Linux
The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
```
$ cd <sample_dir>
$ make
```
The samples makefiles can take advantage of certain options:
* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l.
By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/>
See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
* **dbg=1** - build with debug symbols
```
$ make dbg=1
```
* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
```
$ make SMS="50 60"
```
* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
```
$ make HOST_COMPILER=g++
```
## References (for more details)

View File

@ -1,805 +0,0 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
#define WINDOWS_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#pragma warning(disable : 4819)
#endif
#include <stdio.h>
#include <string.h>
#include <fstream>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <helper_string.h>
#include <npp.h>
// Note: If you want to view these images we HIGHLY recommend using imagej
// which is free on the internet and works on most platforms
// because it is one of the few image viewing apps that can display 32
// bit integer image data. While it normalizes the data to floating
// point values for viewing it still provides a good representation of
// the relative brightness of each label value. Note that label
// compression output results in smaller differences between label values
// making it visually more difficult to detect differences in labeled
// regions. If you have an editor that can display hex values you can
// see what the exact values of each label is, every 4 bytes represents 1
// 32 bit integer label value.
//
// The files read and written by this sample app use RAW image format,
// that is, only the image data itself exists in the files with no image
// format information. When viewing RAW files with imagej just enter
// the image size and bit depth values that are part of the file name
// when requested by imagej.
//
// This sample app works in 2 stages, first it processes all of the
// images individually then it processes them all again in 1 batch using
// the Batch_Advanced versions of the NPP batch functions which allow
// each image to have it's own ROI. The 2 stages are completely
// separable but in this sample the second stage takes advantage of some
// of the data that has already been initialized.
//
// Note that there is a small amount of variability in the number of
// unique label markers generated from one run to the next by the UF
// algorithm.
//
// Performance of ALL NPP image batch functions is limited by the maximum
// ROI height in the list of images.

// Batched label compression support is only available on NPP versions > 11.0,
// comment out if using NPP 11.0
#define USE_BATCHED_LABEL_COMPRESSION 1

// Number of sample input images processed, both one at a time and batched.
#define NUMBER_OF_IMAGES 5

// Per-image buffers used by the one-at-a-time stage; the batched stage
// reuses them.  All are zero-initialized in main() so tearDown() can be
// called safely at any point.
Npp8u *pInputImageDev[NUMBER_OF_IMAGES];   // 8-bit input image (device)
Npp8u *pInputImageHost[NUMBER_OF_IMAGES];  // 8-bit input image (host staging)
Npp8u *pUFGenerateLabelsScratchBufferDev[NUMBER_OF_IMAGES];   // scratch for label generation
Npp8u *pUFCompressedLabelsScratchBufferDev[NUMBER_OF_IMAGES]; // scratch for label compression
Npp32u *pUFLabelDev[NUMBER_OF_IMAGES];   // 32-bit label image (device)
Npp32u *pUFLabelHost[NUMBER_OF_IMAGES];  // 32-bit label image (host readback)

// Batch descriptor lists used by the *_Batch_Advanced stage (device copies
// plus host staging copies).
NppiImageDescriptor *pUFBatchSrcImageListDev = 0;
NppiImageDescriptor *pUFBatchSrcDstImageListDev = 0;
NppiImageDescriptor *pUFBatchSrcImageListHost = 0;
NppiImageDescriptor *pUFBatchSrcDstImageListHost = 0;
NppiBufferDescriptor *pUFBatchSrcDstScratchBufferListDev =
    0;  // NppiBufferDescriptor comes from nppi_filtering_functions.h
NppiBufferDescriptor *pUFBatchSrcDstScratchBufferListHost = 0;
Npp32u *pUFBatchPerImageCompressedCountListDev = 0;   // per-image compressed label counts (device)
Npp32u *pUFBatchPerImageCompressedCountListHost = 0;  // per-image compressed label counts (host)
void tearDown() // Clean up and tear down
{
if (pUFBatchPerImageCompressedCountListDev != 0)
cudaFree(pUFBatchPerImageCompressedCountListDev);
if (pUFBatchSrcDstScratchBufferListDev != 0)
cudaFree(pUFBatchSrcDstScratchBufferListDev);
if (pUFBatchSrcDstImageListDev != 0) cudaFree(pUFBatchSrcDstImageListDev);
if (pUFBatchSrcImageListDev != 0) cudaFree(pUFBatchSrcImageListDev);
if (pUFBatchPerImageCompressedCountListHost != 0)
cudaFreeHost(pUFBatchPerImageCompressedCountListHost);
if (pUFBatchSrcDstScratchBufferListHost != 0)
cudaFreeHost(pUFBatchSrcDstScratchBufferListHost);
if (pUFBatchSrcDstImageListHost != 0)
cudaFreeHost(pUFBatchSrcDstImageListHost);
if (pUFBatchSrcImageListHost != 0) cudaFreeHost(pUFBatchSrcImageListHost);
for (int j = 0; j < NUMBER_OF_IMAGES; j++) {
if (pUFCompressedLabelsScratchBufferDev[j] != 0)
cudaFree(pUFCompressedLabelsScratchBufferDev[j]);
if (pUFGenerateLabelsScratchBufferDev[j] != 0)
cudaFree(pUFGenerateLabelsScratchBufferDev[j]);
if (pUFLabelDev[j] != 0) cudaFree(pUFLabelDev[j]);
if (pInputImageDev[j] != 0) cudaFree(pInputImageDev[j]);
if (pUFLabelHost[j] != 0) cudaFreeHost(pUFLabelHost[j]);
if (pInputImageHost[j] != 0) cudaFreeHost(pInputImageHost[j]);
}
}
// Output file names, one set per processing stage (RAW format; the
// dimensions and bit depth encoded in each name describe the file contents,
// since RAW files carry no header).
//
// NOTE(review): these were previously declared `const std::string &` bound
// to temporaries — legal via lifetime extension, but a needless reference
// indirection for globals; plain const objects are the idiomatic form.
const std::string LabelMarkersOutputFile0 =
    "teapot_LabelMarkersUF_8Way_512x512_32u.raw";
const std::string LabelMarkersOutputFile1 =
    "CT_skull_LabelMarkersUF_8Way_512x512_32u.raw";
const std::string LabelMarkersOutputFile2 =
    "PCB_METAL_LabelMarkersUF_8Way_509x335_32u.raw";
const std::string LabelMarkersOutputFile3 =
    "PCB2_LabelMarkersUF_8Way_1024x683_32u.raw";
const std::string LabelMarkersOutputFile4 =
    "PCB_LabelMarkersUF_8Way_1280x720_32u.raw";

const std::string CompressedMarkerLabelsOutputFile0 =
    "teapot_CompressedMarkerLabelsUF_8Way_512x512_32u.raw";
const std::string CompressedMarkerLabelsOutputFile1 =
    "CT_skull_CompressedMarkerLabelsUF_8Way_512x512_32u.raw";
const std::string CompressedMarkerLabelsOutputFile2 =
    "PCB_METAL_CompressedMarkerLabelsUF_8Way_509x335_32u.raw";
const std::string CompressedMarkerLabelsOutputFile3 =
    "PCB2_CompressedMarkerLabelsUF_8Way_1024x683_32u.raw";
const std::string CompressedMarkerLabelsOutputFile4 =
    "PCB_CompressedMarkerLabelsUF_8Way_1280x720_32u.raw";

const std::string LabelMarkersBatchOutputFile0 =
    "teapot_LabelMarkersUFBatch_8Way_512x512_32u.raw";
const std::string LabelMarkersBatchOutputFile1 =
    "CT_skull_LabelMarkersUFBatch_8Way_512x512_32u.raw";
const std::string LabelMarkersBatchOutputFile2 =
    "PCB_METAL_LabelMarkersUFBatch_8Way_509x335_32u.raw";
const std::string LabelMarkersBatchOutputFile3 =
    "PCB2_LabelMarkersUFBatch_8Way_1024x683_32u.raw";
const std::string LabelMarkersBatchOutputFile4 =
    "PCB_LabelMarkersUFBatch_8Way_1280x720_32u.raw";

const std::string CompressedMarkerLabelsBatchOutputFile0 =
    "teapot_CompressedMarkerLabelsUFBatch_8Way_512x512_32u.raw";
const std::string CompressedMarkerLabelsBatchOutputFile1 =
    "CT_skull_CompressedMarkerLabelsUFBatch_8Way_512x512_32u.raw";
const std::string CompressedMarkerLabelsBatchOutputFile2 =
    "PCB_METAL_CompressedMarkerLabelsUFBatch_8Way_509x335_32u.raw";
const std::string CompressedMarkerLabelsBatchOutputFile3 =
    "PCB2_CompressedMarkerLabelsUFBatch_8Way_1024x683_32u.raw";
const std::string CompressedMarkerLabelsBatchOutputFile4 =
    "PCB_CompressedMarkerLabelsUFBatch_8Way_1280x720_32u.raw";
// Load one of the sample's RAW 8-bit grayscale input images into pImage.
//
// nImage selects which file (0..NUMBER_OF_IMAGES-1); nWidth/nHeight must
// match that file's known dimensions exactly, since RAW files carry no
// header describing the image size.
//
// Returns 0 on success, -1 on failure (unknown index, dimension mismatch,
// unopenable or short file).  Exits with EXIT_WAIVED when the file cannot
// be located at all (sample data not installed).
int loadRaw8BitImage(Npp8u *pImage, int nWidth, int nHeight, int nImage) {
  // Expected file name and dimensions per image index (replaces five
  // near-identical copy-pasted branches).
  struct RawImageInfo {
    const char *fileName;
    int width;
    int height;
  };
  static const RawImageInfo aImageInfo[] = {
      {"teapot_512x512_8u.raw", 512, 512},
      {"CT_skull_512x512_8u.raw", 512, 512},
      {"PCB_METAL_509x335_8u.raw", 509, 335},
      {"PCB2_1024x683_8u.raw", 1024, 683},
      {"PCB_1280x720_8u.raw", 1280, 720},
  };
  const int nImageCount =
      static_cast<int>(sizeof(aImageInfo) / sizeof(aImageInfo[0]));

  if (nImage < 0 || nImage >= nImageCount) {
    printf("Input file load failed.\n");
    return -1;
  }
  const RawImageInfo &info = aImageInfo[nImage];
  if (nWidth != info.width || nHeight != info.height) return -1;

  const char *InputFile = sdkFindFilePath(info.fileName, ".");
  if (InputFile == NULL) {
    printf("%s file not found.. exiting\n", info.fileName);
    exit(EXIT_WAIVED);
  }

  FILE *bmpFile;
  FOPEN(bmpFile, InputFile, "rb");
  if (bmpFile == NULL) return -1;

  // Read the full frame; a short read indicates a truncated/corrupt file.
  // Use size_t arithmetic to avoid a signed/unsigned comparison with the
  // fread() result.
  size_t nExpected = static_cast<size_t>(nWidth) * static_cast<size_t>(nHeight);
  size_t nSize = fread(pImage, 1, nExpected, bmpFile);
  fclose(bmpFile);
  if (nSize < nExpected) return -1;

  printf("Input file load succeeded.\n");
  return 0;
}
// Program entry point.
//
// Stage 1: for each of the NUMBER_OF_IMAGES inputs, generate UF label
// markers and then compress the labels one image at a time, writing each
// result to a RAW output file.
// Stage 2 (when USE_BATCHED_LABEL_COMPRESSION is defined, for the
// compression half): run the same two operations on all images at once via
// the *_Batch_Advanced NPP functions, each image keeping its own ROI, and
// reusing device buffers initialized in stage 1.
//
// Returns 0 on success; -1 or an NppStatus error code on failure.
int main(int argc, char **argv) {
  int aGenerateLabelsScratchBufferSize[NUMBER_OF_IMAGES];
  int aCompressLabelsScratchBufferSize[NUMBER_OF_IMAGES];
  int nCompressedLabelCount = 0;
  cudaError_t cudaError;
  NppStatus nppStatus;
  NppStreamContext nppStreamCtx;
  FILE *bmpFile;

  // Zero all global buffer pointers so tearDown() is safe to call even
  // after a partial setup failure.
  for (int j = 0; j < NUMBER_OF_IMAGES; j++) {
    pInputImageDev[j] = 0;
    pInputImageHost[j] = 0;
    pUFGenerateLabelsScratchBufferDev[j] = 0;
    pUFCompressedLabelsScratchBufferDev[j] = 0;
    pUFLabelDev[j] = 0;
    pUFLabelHost[j] = 0;
  }

  nppStreamCtx.hStream = 0;  // The NULL stream by default, set this to whatever
                             // your stream ID is if not the NULL stream.

  cudaError = cudaGetDevice(&nppStreamCtx.nCudaDeviceId);
  if (cudaError != cudaSuccess) {
    printf("CUDA error: no devices supporting CUDA.\n");
    return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
  }

  const NppLibraryVersion *libVer = nppGetLibVersion();
  printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor,
         libVer->build);

  int driverVersion, runtimeVersion;
  cudaDriverGetVersion(&driverVersion);
  cudaRuntimeGetVersion(&runtimeVersion);
  printf("CUDA Driver  Version: %d.%d\n", driverVersion / 1000,
         (driverVersion % 100) / 10);
  printf("CUDA Runtime Version: %d.%d\n\n", runtimeVersion / 1000,
         (runtimeVersion % 100) / 10);

  // Fill in the NPP stream context from the device's attributes; NPP uses
  // these to size its kernel launches.
  cudaError = cudaDeviceGetAttribute(
      &nppStreamCtx.nCudaDevAttrComputeCapabilityMajor,
      cudaDevAttrComputeCapabilityMajor, nppStreamCtx.nCudaDeviceId);
  if (cudaError != cudaSuccess) return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
  cudaError = cudaDeviceGetAttribute(
      &nppStreamCtx.nCudaDevAttrComputeCapabilityMinor,
      cudaDevAttrComputeCapabilityMinor, nppStreamCtx.nCudaDeviceId);
  if (cudaError != cudaSuccess) return NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY;
  cudaError =
      cudaStreamGetFlags(nppStreamCtx.hStream, &nppStreamCtx.nStreamFlags);
  cudaDeviceProp oDeviceProperties;
  cudaError =
      cudaGetDeviceProperties(&oDeviceProperties, nppStreamCtx.nCudaDeviceId);
  nppStreamCtx.nMultiProcessorCount = oDeviceProperties.multiProcessorCount;
  nppStreamCtx.nMaxThreadsPerMultiProcessor =
      oDeviceProperties.maxThreadsPerMultiProcessor;
  nppStreamCtx.nMaxThreadsPerBlock = oDeviceProperties.maxThreadsPerBlock;
  nppStreamCtx.nSharedMemPerBlock = oDeviceProperties.sharedMemPerBlock;

  // Known per-image ROI sizes; must match the input file dimensions.
  NppiSize oSizeROI[NUMBER_OF_IMAGES];
  for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) {
    if (nImage == 0) {
      oSizeROI[nImage].width = 512;
      oSizeROI[nImage].height = 512;
    } else if (nImage == 1) {
      oSizeROI[nImage].width = 512;
      oSizeROI[nImage].height = 512;
    } else if (nImage == 2) {
      oSizeROI[nImage].width = 509;
      oSizeROI[nImage].height = 335;
    } else if (nImage == 3) {
      oSizeROI[nImage].width = 1024;
      oSizeROI[nImage].height = 683;
    } else if (nImage == 4) {
      oSizeROI[nImage].width = 1280;
      oSizeROI[nImage].height = 720;
    }

    // NOTE: While using cudaMallocPitch() to allocate device memory for NPP can
    // significantly improve the performance of many NPP functions, for UF
    // function label markers generation or compression DO NOT USE
    // cudaMallocPitch(). Doing so could result in incorrect output.
    cudaError = cudaMalloc(
        (void **)&pInputImageDev[nImage],
        oSizeROI[nImage].width * sizeof(Npp8u) * oSizeROI[nImage].height);
    if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR;

    // For images processed with UF label markers functions ROI width and height
    // for label markers generation output AND marker compression functions MUST
    // be the same AND line pitch MUST be equal to ROI.width * sizeof(Npp32u).
    // Also the image pointer used for label markers generation output must
    // start at the same position in the image as it does in the marker
    // compression function. Also note that actual input image size and ROI do
    // not necessarily need to be related other than ROI being less than or
    // equal to image size and image starting position does not necessarily have
    // to be at pixel 0 in the input image.
    cudaError = cudaMalloc(
        (void **)&pUFLabelDev[nImage],
        oSizeROI[nImage].width * sizeof(Npp32u) * oSizeROI[nImage].height);
    if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR;
    checkCudaErrors(cudaMallocHost(
        &(pInputImageHost[nImage]),
        oSizeROI[nImage].width * sizeof(Npp8u) * oSizeROI[nImage].height));
    checkCudaErrors(cudaMallocHost(
        &(pUFLabelHost[nImage]),
        oSizeROI[nImage].width * sizeof(Npp32u) * oSizeROI[nImage].height));

    // Use UF functions throughout this sample.
    nppStatus = nppiLabelMarkersUFGetBufferSize_32u_C1R(
        oSizeROI[nImage], &aGenerateLabelsScratchBufferSize[nImage]);

    // One at a time image processing
    cudaError = cudaMalloc((void **)&pUFGenerateLabelsScratchBufferDev[nImage],
                           aGenerateLabelsScratchBufferSize[nImage]);
    if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR;

    if (loadRaw8BitImage(pInputImageHost[nImage],
                         oSizeROI[nImage].width * sizeof(Npp8u),
                         oSizeROI[nImage].height, nImage) == 0) {
      cudaError = cudaMemcpy2DAsync(
          pInputImageDev[nImage], oSizeROI[nImage].width * sizeof(Npp8u),
          pInputImageHost[nImage], oSizeROI[nImage].width * sizeof(Npp8u),
          oSizeROI[nImage].width * sizeof(Npp8u), oSizeROI[nImage].height,
          cudaMemcpyHostToDevice, nppStreamCtx.hStream);

      // Generate label markers; nppiNormInf selects 8-way neighbor search
      // (used throughout this sample).
      nppStatus = nppiLabelMarkersUF_8u32u_C1R_Ctx(
          pInputImageDev[nImage], oSizeROI[nImage].width * sizeof(Npp8u),
          pUFLabelDev[nImage], oSizeROI[nImage].width * sizeof(Npp32u),
          oSizeROI[nImage], nppiNormInf,
          pUFGenerateLabelsScratchBufferDev[nImage], nppStreamCtx);
      if (nppStatus != NPP_SUCCESS) {
        if (nImage == 0)
          printf("teapot_LabelMarkersUF_8Way_512x512_32u failed.\n");
        else if (nImage == 1)
          printf("CT_skull_LabelMarkersUF_8Way_512x512_32u failed.\n");
        else if (nImage == 2)
          printf("PCB_METAL_LabelMarkersUF_8Way_509x335_32u failed.\n");
        else if (nImage == 3)
          printf("PCB2_LabelMarkersUF_8Way_1024x683_32u failed.\n");
        else if (nImage == 4)
          printf("PCB_LabelMarkersUF_8Way_1280x720_32u failed.\n");
        tearDown();
        return -1;
      }

      cudaError = cudaMemcpy2DAsync(
          pUFLabelHost[nImage], oSizeROI[nImage].width * sizeof(Npp32u),
          pUFLabelDev[nImage], oSizeROI[nImage].width * sizeof(Npp32u),
          oSizeROI[nImage].width * sizeof(Npp32u), oSizeROI[nImage].height,
          cudaMemcpyDeviceToHost, nppStreamCtx.hStream);

      // Wait host image read backs to complete, not necessary if no need to
      // synchronize
      if ((cudaError = cudaStreamSynchronize(nppStreamCtx.hStream)) !=
          cudaSuccess) {
        printf("Post label generation cudaStreamSynchronize failed\n");
        tearDown();
        return -1;
      }

      // Write the (uncompressed) label image to its RAW output file.
      if (nImage == 0)
        FOPEN(bmpFile, LabelMarkersOutputFile0.c_str(), "wb");
      else if (nImage == 1)
        FOPEN(bmpFile, LabelMarkersOutputFile1.c_str(), "wb");
      else if (nImage == 2)
        FOPEN(bmpFile, LabelMarkersOutputFile2.c_str(), "wb");
      else if (nImage == 3)
        FOPEN(bmpFile, LabelMarkersOutputFile3.c_str(), "wb");
      else if (nImage == 4)
        FOPEN(bmpFile, LabelMarkersOutputFile4.c_str(), "wb");
      if (bmpFile == NULL) return -1;
      size_t nSize = 0;
      for (int j = 0; j < oSizeROI[nImage].height; j++) {
        nSize += fwrite(&pUFLabelHost[nImage][j * oSizeROI[nImage].width],
                        sizeof(Npp32u), oSizeROI[nImage].width, bmpFile);
      }
      fclose(bmpFile);

      nppStatus = nppiCompressMarkerLabelsGetBufferSize_32u_C1R(
          oSizeROI[nImage].width * oSizeROI[nImage].height,
          &aCompressLabelsScratchBufferSize[nImage]);
      // NOTE(review): this early return skips tearDown(), leaking the
      // allocations made so far — confirm whether that is acceptable here.
      if (nppStatus != NPP_NO_ERROR) return nppStatus;

      cudaError =
          cudaMalloc((void **)&pUFCompressedLabelsScratchBufferDev[nImage],
                     aCompressLabelsScratchBufferSize[nImage]);
      if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR;

      // Compress labels in place in the label image.
      nCompressedLabelCount = 0;
      nppStatus = nppiCompressMarkerLabelsUF_32u_C1IR(
          pUFLabelDev[nImage], oSizeROI[nImage].width * sizeof(Npp32u),
          oSizeROI[nImage], oSizeROI[nImage].width * oSizeROI[nImage].height,
          &nCompressedLabelCount, pUFCompressedLabelsScratchBufferDev[nImage]);
      if (nppStatus != NPP_SUCCESS) {
        if (nImage == 0)
          printf("teapot_CompressedLabelMarkersUF_8Way_512x512_32u failed.\n");
        else if (nImage == 1)
          printf(
              "CT_Skull_CompressedLabelMarkersUF_8Way_512x512_32u failed.\n");
        else if (nImage == 2)
          printf(
              "PCB_METAL_CompressedLabelMarkersUF_8Way_509x335_32u failed.\n");
        else if (nImage == 3)
          printf("PCB2_CompressedLabelMarkersUF_8Way_1024x683_32u failed.\n");
        else if (nImage == 4)
          printf("PCB_CompressedLabelMarkersUF_8Way_1280x720_32u failed.\n");
        tearDown();
        return -1;
      }

      cudaError = cudaMemcpy2DAsync(
          pUFLabelHost[nImage], oSizeROI[nImage].width * sizeof(Npp32u),
          pUFLabelDev[nImage], oSizeROI[nImage].width * sizeof(Npp32u),
          oSizeROI[nImage].width * sizeof(Npp32u), oSizeROI[nImage].height,
          cudaMemcpyDeviceToHost, nppStreamCtx.hStream);

      // Wait for host image read backs to finish, not necessary if no need to
      // synchronize
      if ((cudaError = cudaStreamSynchronize(nppStreamCtx.hStream)) !=
              cudaSuccess ||
          nCompressedLabelCount == 0) {
        printf("Post label compression cudaStreamSynchronize failed\n");
        tearDown();
        return -1;
      }

      // Write the compressed label image to its RAW output file.
      if (nImage == 0)
        FOPEN(bmpFile, CompressedMarkerLabelsOutputFile0.c_str(), "wb");
      else if (nImage == 1)
        FOPEN(bmpFile, CompressedMarkerLabelsOutputFile1.c_str(), "wb");
      else if (nImage == 2)
        FOPEN(bmpFile, CompressedMarkerLabelsOutputFile2.c_str(), "wb");
      else if (nImage == 3)
        FOPEN(bmpFile, CompressedMarkerLabelsOutputFile3.c_str(), "wb");
      else if (nImage == 4)
        FOPEN(bmpFile, CompressedMarkerLabelsOutputFile4.c_str(), "wb");
      if (bmpFile == NULL) return -1;
      nSize = 0;
      for (int j = 0; j < oSizeROI[nImage].height; j++) {
        nSize += fwrite(&pUFLabelHost[nImage][j * oSizeROI[nImage].width],
                        sizeof(Npp32u), oSizeROI[nImage].width, bmpFile);
      }
      fclose(bmpFile);

      if (nImage == 0)
        printf(
            "teapot_CompressedMarkerLabelsUF_8Way_512x512_32u succeeded, "
            "compressed label count is %d.\n",
            nCompressedLabelCount);
      else if (nImage == 1)
        printf(
            "CT_Skull_CompressedMarkerLabelsUF_8Way_512x512_32u succeeded, "
            "compressed label count is %d.\n",
            nCompressedLabelCount);
      else if (nImage == 2)
        printf(
            "PCB_METAL_CompressedMarkerLabelsUF_8Way_509x335_32u succeeded, "
            "compressed label count is %d.\n",
            nCompressedLabelCount);
      else if (nImage == 3)
        printf(
            "PCB2_CompressedMarkerLabelsUF_8Way_1024x683_32u succeeded, "
            "compressed label count is %d.\n",
            nCompressedLabelCount);
      else if (nImage == 4)
        printf(
            "PCB_CompressedMarkerLabelsUF_8Way_1280x720_32u succeeded, "
            "compressed label count is %d.\n",
            nCompressedLabelCount);
    }
  }

  // Batch image processing
  // We want to allocate scratch buffers more efficiently for batch processing
  // so first we free up the scratch buffers for image 0 and reallocate them.
  // This is not required but helps cudaMalloc to work more efficiently.
  cudaFree(pUFCompressedLabelsScratchBufferDev[0]);
  int nTotalBatchedUFCompressLabelsScratchBufferDevSize = 0;
  for (int k = 0; k < NUMBER_OF_IMAGES; k++)
    nTotalBatchedUFCompressLabelsScratchBufferDevSize +=
        aCompressLabelsScratchBufferSize[k];
  cudaError = cudaMalloc((void **)&pUFCompressedLabelsScratchBufferDev[0],
                         nTotalBatchedUFCompressLabelsScratchBufferDevSize);
  if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR;

  // Now allocate batch lists
  int nBatchImageListBytes = NUMBER_OF_IMAGES * sizeof(NppiImageDescriptor);
  cudaError =
      cudaMalloc((void **)&pUFBatchSrcImageListDev, nBatchImageListBytes);
  if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR;
  cudaError =
      cudaMalloc((void **)&pUFBatchSrcDstImageListDev, nBatchImageListBytes);
  if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR;
  checkCudaErrors(
      cudaMallocHost((void **)&pUFBatchSrcImageListHost, nBatchImageListBytes));
  checkCudaErrors(cudaMallocHost((void **)&pUFBatchSrcDstImageListHost,
                                 nBatchImageListBytes));

  // Fill the host-side descriptor lists; oMaxROISize tracks the largest
  // width and height across all images for the batch calls below.
  NppiSize oMaxROISize = {0, 0};
  for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) {
    pUFBatchSrcImageListHost[nImage].pData = pInputImageDev[nImage];
    pUFBatchSrcImageListHost[nImage].nStep =
        oSizeROI[nImage].width * sizeof(Npp8u);
    // src image oSize parameter is ignored in these NPP functions
    pUFBatchSrcDstImageListHost[nImage].pData = pUFLabelDev[nImage];
    pUFBatchSrcDstImageListHost[nImage].nStep =
        oSizeROI[nImage].width * sizeof(Npp32u);
    pUFBatchSrcDstImageListHost[nImage].oSize = oSizeROI[nImage];
    if (oSizeROI[nImage].width > oMaxROISize.width)
      oMaxROISize.width = oSizeROI[nImage].width;
    if (oSizeROI[nImage].height > oMaxROISize.height)
      oMaxROISize.height = oSizeROI[nImage].height;
  }

  // Copy label generation batch lists from CPU to GPU
  cudaError = cudaMemcpyAsync(pUFBatchSrcImageListDev, pUFBatchSrcImageListHost,
                              nBatchImageListBytes, cudaMemcpyHostToDevice,
                              nppStreamCtx.hStream);
  if (cudaError != cudaSuccess) return NPP_MEMCPY_ERROR;
  cudaError = cudaMemcpyAsync(pUFBatchSrcDstImageListDev,
                              pUFBatchSrcDstImageListHost, nBatchImageListBytes,
                              cudaMemcpyHostToDevice, nppStreamCtx.hStream);
  if (cudaError != cudaSuccess) return NPP_MEMCPY_ERROR;

  // We use 8-way neighbor search throughout this example
  nppStatus = nppiLabelMarkersUFBatch_8u32u_C1R_Advanced_Ctx(
      pUFBatchSrcImageListDev, pUFBatchSrcDstImageListDev, NUMBER_OF_IMAGES,
      oMaxROISize, nppiNormInf, nppStreamCtx);
  if (nppStatus != NPP_SUCCESS) {
    printf("LabelMarkersUFBatch_8Way_8u32u failed.\n");
    tearDown();
    return -1;
  }

  // Now read back generated device images to the host
  for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) {
    cudaError = cudaMemcpy2DAsync(
        pUFLabelHost[nImage], oSizeROI[nImage].width * sizeof(Npp32u),
        pUFLabelDev[nImage], oSizeROI[nImage].width * sizeof(Npp32u),
        oSizeROI[nImage].width * sizeof(Npp32u), oSizeROI[nImage].height,
        cudaMemcpyDeviceToHost, nppStreamCtx.hStream);
  }

  // Wait for host image read backs to complete, not necessary if no need to
  // synchronize
  if ((cudaError = cudaStreamSynchronize(nppStreamCtx.hStream)) !=
      cudaSuccess) {
    printf("Post label generation cudaStreamSynchronize failed\n");
    tearDown();
    return -1;
  }

  // Save output to files
  for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) {
    if (nImage == 0)
      FOPEN(bmpFile, LabelMarkersBatchOutputFile0.c_str(), "wb");
    else if (nImage == 1)
      FOPEN(bmpFile, LabelMarkersBatchOutputFile1.c_str(), "wb");
    else if (nImage == 2)
      FOPEN(bmpFile, LabelMarkersBatchOutputFile2.c_str(), "wb");
    else if (nImage == 3)
      FOPEN(bmpFile, LabelMarkersBatchOutputFile3.c_str(), "wb");
    else if (nImage == 4)
      FOPEN(bmpFile, LabelMarkersBatchOutputFile4.c_str(), "wb");
    if (bmpFile == NULL) return -1;
    size_t nSize = 0;
    for (int j = 0; j < oSizeROI[nImage].height; j++) {
      nSize += fwrite(&pUFLabelHost[nImage][j * oSizeROI[nImage].width],
                      sizeof(Npp32u), oSizeROI[nImage].width, bmpFile);
    }
    fclose(bmpFile);
  }

#ifdef USE_BATCHED_LABEL_COMPRESSION
  // Now allocate scratch buffer memory for batched label compression
  cudaError = cudaMalloc((void **)&pUFBatchSrcDstScratchBufferListDev,
                         NUMBER_OF_IMAGES * sizeof(NppiBufferDescriptor));
  if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR;
  cudaError = cudaMalloc((void **)&pUFBatchPerImageCompressedCountListDev,
                         NUMBER_OF_IMAGES * sizeof(Npp32u));
  if (cudaError != cudaSuccess) return NPP_MEMORY_ALLOCATION_ERR;

  // Allocate host side scratch buffer point and size list and initialize with
  // device scratch buffer pointers
  checkCudaErrors(
      cudaMallocHost((void **)&pUFBatchSrcDstScratchBufferListHost,
                     NUMBER_OF_IMAGES * sizeof(NppiBufferDescriptor)));
  // NOTE(review): the stray unary '+' below is harmless (+N == N) but looks
  // like a typo left over from an edit.
  checkCudaErrors(
      cudaMallocHost((void **)&pUFBatchPerImageCompressedCountListHost,
                     +NUMBER_OF_IMAGES * sizeof(Npp32u)));

  // Start buffer pointer at beginning of full per image buffer list sized
  // pUFCompressedLabelsScratchBufferDev[0]
  Npp32u *pCurUFCompressedLabelsScratchBufferDev =
      reinterpret_cast<Npp32u *>(pUFCompressedLabelsScratchBufferDev[0]);
  int nMaxUFCompressedLabelsScratchBufferSize = 0;
  for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) {
    // This particular function works on in-place data and SrcDst image batch
    // list has already been initialized in batched label generation function
    // setup

    // Initialize each per image buffer descriptor
    pUFBatchSrcDstScratchBufferListHost[nImage].pData =
        reinterpret_cast<void *>(pCurUFCompressedLabelsScratchBufferDev);
    pUFBatchSrcDstScratchBufferListHost[nImage].nBufferSize =
        aCompressLabelsScratchBufferSize[nImage];
    if (aCompressLabelsScratchBufferSize[nImage] >
        nMaxUFCompressedLabelsScratchBufferSize)
      nMaxUFCompressedLabelsScratchBufferSize =
          aCompressLabelsScratchBufferSize[nImage];
    // Offset buffer pointer to next per image buffer
    Npp8u *pTempBuffer =
        reinterpret_cast<Npp8u *>(pCurUFCompressedLabelsScratchBufferDev);
    pTempBuffer += aCompressLabelsScratchBufferSize[nImage];
    pCurUFCompressedLabelsScratchBufferDev =
        reinterpret_cast<Npp32u *>((void *)(pTempBuffer));
  }

  // Copy compression batch scratch buffer list from CPU to GPU
  cudaError = cudaMemcpyAsync(pUFBatchSrcDstScratchBufferListDev,
                              pUFBatchSrcDstScratchBufferListHost,
                              NUMBER_OF_IMAGES * sizeof(NppiBufferDescriptor),
                              cudaMemcpyHostToDevice, nppStreamCtx.hStream);
  if (cudaError != cudaSuccess) return NPP_MEMCPY_ERROR;

  nppStatus = nppiCompressMarkerLabelsUFBatch_32u_C1IR_Advanced_Ctx(
      pUFBatchSrcDstImageListDev, pUFBatchSrcDstScratchBufferListDev,
      pUFBatchPerImageCompressedCountListDev, NUMBER_OF_IMAGES, oMaxROISize,
      nMaxUFCompressedLabelsScratchBufferSize, nppStreamCtx);
  if (nppStatus != NPP_SUCCESS) {
    printf("BatchCompressedLabelMarkersUF_8Way_32u failed.\n");
    tearDown();
    return -1;
  }

  // Copy output compressed label images back to host
  for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) {
    cudaError = cudaMemcpy2DAsync(
        pUFLabelHost[nImage], oSizeROI[nImage].width * sizeof(Npp32u),
        pUFLabelDev[nImage], oSizeROI[nImage].width * sizeof(Npp32u),
        oSizeROI[nImage].width * sizeof(Npp32u), oSizeROI[nImage].height,
        cudaMemcpyDeviceToHost, nppStreamCtx.hStream);
  }

  // Wait for host image read backs to complete, not necessary if no need to
  // synchronize
  if ((cudaError = cudaStreamSynchronize(nppStreamCtx.hStream)) !=
      cudaSuccess) {
    printf("Post label compression cudaStreamSynchronize failed\n");
    tearDown();
    return -1;
  }

  // Save compressed label images into files
  for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) {
    if (nImage == 0)
      FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile0.c_str(), "wb");
    else if (nImage == 1)
      FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile1.c_str(), "wb");
    else if (nImage == 2)
      FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile2.c_str(), "wb");
    else if (nImage == 3)
      FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile3.c_str(), "wb");
    else if (nImage == 4)
      FOPEN(bmpFile, CompressedMarkerLabelsBatchOutputFile4.c_str(), "wb");
    if (bmpFile == NULL) return -1;
    size_t nSize = 0;
    for (int j = 0; j < oSizeROI[nImage].height; j++) {
      nSize += fwrite(&pUFLabelHost[nImage][j * oSizeROI[nImage].width],
                      sizeof(Npp32u), oSizeROI[nImage].width, bmpFile);
    }
    fclose(bmpFile);
  }

  // Read back per image compressed label count.
  cudaError = cudaMemcpyAsync(pUFBatchPerImageCompressedCountListHost,
                              pUFBatchPerImageCompressedCountListDev,
                              NUMBER_OF_IMAGES * sizeof(Npp32u),
                              cudaMemcpyDeviceToHost, nppStreamCtx.hStream);
  if (cudaError != cudaSuccess) {
    tearDown();
    return NPP_MEMCPY_ERROR;
  }

  // Wait for host read back to complete
  cudaError = cudaStreamSynchronize(nppStreamCtx.hStream);

  printf("\n\n");
  for (int nImage = 0; nImage < NUMBER_OF_IMAGES; nImage++) {
    if (nImage == 0)
      printf(
          "teapot_CompressedMarkerLabelsUFBatch_8Way_512x512_32u succeeded, "
          "compressed label count is %d.\n",
          pUFBatchPerImageCompressedCountListHost[nImage]);
    else if (nImage == 1)
      printf(
          "CT_Skull_CompressedMarkerLabelsUFBatch_8Way_512x512_32u succeeded, "
          "compressed label count is %d.\n",
          pUFBatchPerImageCompressedCountListHost[nImage]);
    else if (nImage == 2)
      printf(
          "PCB_METAL_CompressedMarkerLabelsUFBatch_8Way_509x335_32u succeeded, "
          "compressed label count is %d.\n",
          pUFBatchPerImageCompressedCountListHost[nImage]);
    else if (nImage == 3)
      printf(
          "PCB2_CompressedMarkerLabelsUFBatch_8Way_1024x683_32u succeeded, "
          "compressed label count is %d.\n",
          pUFBatchPerImageCompressedCountListHost[nImage]);
    else if (nImage == 4)
      printf(
          "PCB_CompressedMarkerLabelsUFBatch_8Way_1280x720_32u succeeded, "
          "compressed label count is %d.\n",
          pUFBatchPerImageCompressedCountListHost[nImage]);
  }
#endif  // USE_BATCHED_LABEL_COMPRESSION

  tearDown();
  return 0;
}

View File

@ -0,0 +1,40 @@
# Build configuration for the boxFilterNPP sample (NPP box-filter demo).
#
# FreeImage is required for image I/O; when it is not found the target is
# skipped with a status message instead of failing the configure step.
find_package(FreeImage)

# BUGFIX: was `if(${FreeImage_FOUND})` — the extra dereference re-expands the
# variable's value and misbehaves when it is empty; test the name directly.
if(FreeImage_FOUND)
    # Add target for boxFilterNPP
    add_executable(boxFilterNPP boxFilterNPP.cpp)

    set_target_properties(boxFilterNPP PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

    # Sample-local helper headers plus CUDA Toolkit / FreeImage headers.
    target_include_directories(boxFilterNPP PRIVATE
        ../../../Common
        ../../../Common/UtilNPP
        ${CUDAToolkit_INCLUDE_DIRS}
        ${FreeImage_INCLUDE_DIRS}
    )

    target_link_libraries(boxFilterNPP PRIVATE
        CUDA::nppc
        CUDA::nppisu
        CUDA::nppif
        CUDA::cudart
        ${FreeImage_LIBRARIES}
    )

    # Copy the sample's .pgm input images next to the built binary.
    # BUGFIX: `cmake -E copy_if_different` does not expand wildcards, so the
    # original literal `*.pgm` argument could never match a file; glob at
    # configure time and pass explicit paths instead.
    file(GLOB boxFilterNPP_data_files "${CMAKE_CURRENT_SOURCE_DIR}/*.pgm")
    if(boxFilterNPP_data_files)
        add_custom_command(TARGET boxFilterNPP POST_BUILD
            COMMAND ${CMAKE_COMMAND} -E copy_if_different
                ${boxFilterNPP_data_files}
                "${CMAKE_CURRENT_BINARY_DIR}"
            COMMENT "Copying boxFilterNPP data files to build directory"
            VERBATIM
        )
    endif()
else()
    message(STATUS "FreeImage not found - will not build sample 'boxFilterNPP'")
endif()

View File

@ -1,378 +0,0 @@
################################################################################
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
#
# Makefile project only supported on Mac OS X and Linux Platforms)
#
################################################################################
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda
##############################
# start deprecated interface #
##############################
# Legacy per-arch boolean variables are mapped onto TARGET_ARCH with a warning.
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################
# architecture
# TARGET_SIZE is the pointer width (32/64) of the build target.
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
# Presence of the sbsa-linux target directory in the toolkit is used as the discriminator.
ifeq ($(HOST_ARCH),aarch64)
ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
HOST_ARCH := sbsa
TARGET_ARCH := sbsa
endif
endif
# Only a fixed whitelist of host->target cross-compilation pairs is supported.
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
# CUSTOM_HOST_COMPILER records whether the user supplied HOST_COMPILER
# explicitly (checked later for the libstdc++ link workaround).
ifdef HOST_COMPILER
CUSTOM_HOST_COMPILER = 1
endif
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),sbsa)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
# Link flag for customized HOST_COMPILER with gcc realpath
# NOTE(review): when a user-supplied absolute-path gcc differs from the gcc on
# PATH, -lstdc++ is added explicitly — presumably because nvcc would not add it
# itself in that configuration; confirm against nvcc docs if this is revived.
GCC_PATH := $(shell which gcc)
ifeq ($(CUSTOM_HOST_COMPILER),1)
ifneq ($(filter /%,$(HOST_COMPILER)),)
ifneq ($(findstring gcc,$(HOST_COMPILER)),)
ifneq ($(GCC_PATH),$(HOST_COMPILER))
LDFLAGS += -lstdc++
endif
endif
endif
endif
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
# Cross-compilation sysroot/rpath-link setup; TARGET_FS points at the target
# root filesystem when cross-building.
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
NVCCFLAGS += -D_QNX_SOURCE
NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
LDFLAGS += -lsocket
LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
ifdef TARGET_OVERRIDE
LDFLAGS += -lslog2
endif
ifneq ($(TARGET_FS),)
LDFLAGS += -L$(TARGET_FS)/usr/lib
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
CCFLAGS += -I$(TARGET_FS)/../include
endif
endif
endif
ifdef TARGET_OVERRIDE # cuda toolkit targets override
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif
# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
SAMPLE_ENABLED := 1
# This sample is not supported on QNX
ifeq ($(TARGET_OS),qnx)
$(info >>> WARNING - boxFilterNPP is not supported on QNX - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I../../../Common
LIBRARIES :=
################################################################################
# Gencode arguments
SMS ?=
ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
ifeq ($(SMS),)
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
# Generate PTX code from SM 53
GENCODE_FLAGS += -gencode arch=compute_53,code=compute_53
else
# Generate PTX code from SM 50
GENCODE_FLAGS += -gencode arch=compute_50,code=compute_50
endif
endif
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
ALL_CCFLAGS += --threads 0 --std=c++11
INCLUDES += -I../../../Common/UtilNPP
LIBRARIES += -lnppisu_static -lnppif_static -lnppc_static -lculibos -lfreeimage
# Attempt to compile a minimal application linked against FreeImage. If a.out exists, FreeImage is properly set up.
# NOTE(review): this probe runs at Makefile parse time and writes test.c/a.out
# into the current working directory, then deletes them below.
$(shell echo "#include \"FreeImage.h\"" > test.c; echo "int main() { return 0; }" >> test.c ; $(NVCC) $(ALL_CCFLAGS) $(INCLUDES) $(ALL_LDFLAGS) $(LIBRARIES) -l freeimage test.c)
FREEIMAGE := $(shell find a.out 2>/dev/null)
$(shell rm a.out test.c 2>/dev/null)
ifeq ("$(FREEIMAGE)","")
$(info >>> WARNING - FreeImage is not set up correctly. Please ensure FreeImage is set up correctly. <<<)
SAMPLE_ENABLED := 0
endif
# When the sample is waived, EXEC prefixes every recipe so commands are echoed
# instead of executed.
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
################################################################################
# Target rules
all: build
build: boxFilterNPP
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
	@echo "Sample will be waived due to the above missing dependencies"
else
	@echo "Sample is ready - all dependencies have been met"
endif
boxFilterNPP.o:boxFilterNPP.cpp
	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
boxFilterNPP: boxFilterNPP.o
	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
	$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
	$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
run: build
	$(EXEC) ./boxFilterNPP
testrun: build
clean:
	rm -f boxFilterNPP boxFilterNPP.o
	rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/boxFilterNPP
clobber: clean

View File

@ -1,95 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
<!--
  Sample-description metadata for the boxFilterNPP CUDA sample.
  NOTE(review): presumably consumed by NVIDIA's samples-info tooling
  (SamplesInfo.dtd) to generate docs/build files - confirm before editing
  element names or the scope/keyword taxonomy values.
-->
<entry>
  <name>boxFilterNPP</name>
  <cuda_api_list>
    <toolkit>cudaRuntimeGetVersion</toolkit>
    <toolkit>cudaDriverGetVersion</toolkit>
  </cuda_api_list>
  <description><![CDATA[A NPP CUDA Sample that demonstrates how to use NPP FilterBox function to perform a Box Filter.]]></description>
  <devicecompilation>whole</devicecompilation>
  <fallback_min_ptx>true</fallback_min_ptx>
  <!-- Data files shipped with the sample. -->
  <files>
    <file>./teapot512.pgm</file>
  </files>
  <includepaths>
    <path>../../../Common/UtilNPP</path>
    <path os="Windows">../../../Common/FreeImage/Dist/x64</path>
    <path>./</path>
    <path>../</path>
    <path>../../../Common</path>
  </includepaths>
  <keyconcepts>
    <concept level="basic">Performance Strategies</concept>
    <concept level="basic">Image Processing</concept>
    <concept level="basic">NPP Library</concept>
  </keyconcepts>
  <keywords>
    <keyword>CUDA</keyword>
    <keyword>NPP</keyword>
    <keyword>Image Processing</keyword>
    <keyword>box filter</keyword>
  </keywords>
  <libraries>
    <library>nppisu_static</library>
    <library>nppif_static</library>
    <library>nppc_static</library>
    <library>culibos</library>
    <library>freeimage</library>
  </libraries>
  <librarypaths>
  </librarypaths>
  <nsight_eclipse>true</nsight_eclipse>
  <primary_file>boxFilterNPP.cpp</primary_file>
  <required_dependencies>
    <dependency>FreeImage</dependency>
    <dependency>NPP</dependency>
  </required_dependencies>
  <scopes>
    <scope>1:CUDA Basic Topics</scope>
    <scope>1:Performance Strategies</scope>
    <scope>2:Image Processing</scope>
    <scope>2:Computer Vision</scope>
  </scopes>
  <!-- SM architectures the sample builds SASS for. -->
  <sm-arch>sm50</sm-arch>
  <sm-arch>sm52</sm-arch>
  <sm-arch>sm53</sm-arch>
  <sm-arch>sm60</sm-arch>
  <sm-arch>sm61</sm-arch>
  <sm-arch>sm70</sm-arch>
  <sm-arch>sm72</sm-arch>
  <sm-arch>sm75</sm-arch>
  <sm-arch>sm80</sm-arch>
  <sm-arch>sm86</sm-arch>
  <sm-arch>sm87</sm-arch>
  <sm-arch>sm89</sm-arch>
  <sm-arch>sm90</sm-arch>
  <supported_envs>
    <env>
      <arch>x86_64</arch>
      <platform>linux</platform>
    </env>
    <env>
      <platform>windows7</platform>
    </env>
    <env>
      <arch>x86_64</arch>
      <platform>macosx</platform>
    </env>
    <env>
      <arch>arm</arch>
    </env>
    <env>
      <arch>sbsa</arch>
    </env>
    <env>
      <arch>ppc64le</arch>
      <platform>linux</platform>
    </env>
  </supported_envs>
  <supported_sm_architectures>
    <include>all</include>
  </supported_sm_architectures>
  <title>Box Filter with NPP</title>
  <type>exe</type>
</entry>

View File

@ -2,5 +2,6 @@ add_subdirectory(0_Introduction)
# Top-level sample category directories included in the build.
add_subdirectory(1_Utilities)
add_subdirectory(2_Concepts_and_Techniques)
add_subdirectory(3_CUDA_Features)
add_subdirectory(4_CUDA_Libraries)
# NOTE(review): 5_Domain_Specific is not listed here - confirm its omission is intentional.
add_subdirectory(6_Performance)
add_subdirectory(7_libNVVM)

View File

@ -0,0 +1,17 @@
#[[
FindFreeImage
-------------
Locates the FreeImage library and its header.

Result variables (legacy interface, kept for backward compatibility):
  FreeImage_FOUND        - true when both header and library were found
  FreeImage_INCLUDE_DIRS - directory containing FreeImage.h
  FreeImage_LIBRARIES    - full path to the FreeImage library

Imported target (preferred for new code, defined only when found):
  FreeImage::FreeImage
]]

find_path(FreeImage_INCLUDE_DIR
    NAMES freeimage.h FreeImage.h
    PATHS /usr/include /usr/local/include
)

find_library(FreeImage_LIBRARY
    NAMES freeimage
    PATHS /usr/lib /usr/local/lib
)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(FreeImage DEFAULT_MSG FreeImage_LIBRARY FreeImage_INCLUDE_DIR)

if(FreeImage_FOUND)
    set(FreeImage_LIBRARIES ${FreeImage_LIBRARY})
    set(FreeImage_INCLUDE_DIRS ${FreeImage_INCLUDE_DIR})

    # Provide a modern imported target alongside the legacy variables so
    # consumers can simply: target_link_libraries(tgt PRIVATE FreeImage::FreeImage)
    if(NOT TARGET FreeImage::FreeImage)
        add_library(FreeImage::FreeImage UNKNOWN IMPORTED)
        set_target_properties(FreeImage::FreeImage PROPERTIES
            IMPORTED_LOCATION "${FreeImage_LIBRARY}"
            INTERFACE_INCLUDE_DIRECTORIES "${FreeImage_INCLUDE_DIR}"
        )
    endif()
endif()

# Hide the raw cache entries from the default cmake-gui/ccmake view.
mark_as_advanced(FreeImage_INCLUDE_DIR FreeImage_LIBRARY)