diff --git a/h/HyperQueue/HyperQueue-0.10.0.eb b/h/HyperQueue/HyperQueue-0.10.0.eb new file mode 100644 index 00000000..cabd8d04 --- /dev/null +++ b/h/HyperQueue/HyperQueue-0.10.0.eb @@ -0,0 +1,24 @@ +# IT4Innovations +# JK 2022 + +easyblock = 'PackedBinary' + +name = 'HyperQueue' +version = '0.10.0' + +homepage = 'https://it4innovations.github.io/hyperqueue/' +description = """HyperQueue lets you build a computation plan consisting of a large number of tasks and then execute it transparently over a system like SLURM/PBS. It dynamically groups tasks into SLURM/PBS jobs and distributes them to fully utilize allocated nodes. You thus do not have to manually aggregate your tasks into SLURM/PBS jobs.""" + +toolchain = SYSTEM + +source_urls = ['https://github.com/It4innovations/hyperqueue/releases/download/v%(version)s/'] +sources = ['hq-v%(version)s-linux-x64.tar.gz'] +checksums = ['2513d5ce7e8b31ace17f5054058c3fed7900ef61e3aa0f27d66f794533cd152c'] + +sanity_check_paths = { +    'files': ['hq'], +    'dirs': [], +} + + +moduleclass = 'devel' diff --git a/p/PyTorch/PyTorch-1.11.0-foss-2021a-CUDA-11.3.1.eb b/p/PyTorch/PyTorch-1.11.0-foss-2021a-CUDA-11.3.1.eb new file mode 100644 index 00000000..7754197d --- /dev/null +++ b/p/PyTorch/PyTorch-1.11.0-foss-2021a-CUDA-11.3.1.eb @@ -0,0 +1,119 @@ +name = 'PyTorch' +version = '1.11.0' +versionsuffix = '-CUDA-%(cudaver)s' + +homepage = 'https://pytorch.org/' +description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration. +PyTorch is a deep learning framework that puts Python first.""" + +toolchain = {'name': 'foss', 'version': '2021a'} + +sources = [{ +    'filename': '%(name)s-%(version)s.tar.gz', +    'git_config': { +        'url': 'https://github.com/pytorch', +        'repo_name': 'pytorch', +        'tag': 'v%(version)s', +        'recursive': True, +    }, +}] +patches = [ +    'PyTorch-1.7.0_avoid-nan-in-test-torch.patch', +    'PyTorch-1.7.0_disable-dev-shm-test.patch', +    'PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch', +    'PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch', +    'PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch', +    'PyTorch-1.10.0_skip_cmake_rpath.patch', +    'PyTorch-1.11.0_increase-distributed-test-timeout.patch', +    'PyTorch-1.11.0_skip_failing_ops_tests.patch', +    'PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch', +    'PyTorch-1.11.0_fix_sharded_imports.patch', +    'PyTorch-1.11.0_increase_test_tolerances_TF32.patch', +    'PyTorch-1.11.0_increase_c10d_gloo_timeout.patch', +    'PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch', +] +checksums = [ +    None,  # can't add proper SHA256 checksum, because source tarball is created locally after recursive 'git clone' +    'b899aa94d9e60f11ee75a706563312ccefa9cf432756c470caa8e623991c8f18',  # PyTorch-1.7.0_avoid-nan-in-test-torch.patch +    '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a',  # PyTorch-1.7.0_disable-dev-shm-test.patch +    '89ac7a8e9e7df2e64cf8404fe3a279f5e9b759fee41c9de3aaff9c22f385c2c6',  # PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch +    # PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch +    'ff573660913ce055e24cfd194ce747ba5685091c631cfd443eae2a99d56b57ea', +    # PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch +    '313dca681f45ce3bc7c4557fdcdcbe0b77216d2c708fa30a2ec0e22c44876707', +    'ac05943bb205623f91ef140aa00869efc5fe844184bd666bebf5405808610448',  # PyTorch-1.10.0_skip_cmake_rpath.patch +    # PyTorch-1.11.0_increase-distributed-test-timeout.patch +    '087ad20163a1291773ae3457569b80523080eb3731e210946459b2333a919f3f', +
'8eaca92d64fcadb0552d28e9c7ea5c4bc669d2fe33004e45a3519ce8d0d136a2', # PyTorch-1.11.0_skip_failing_ops_tests.patch + '21fc678febcdfbb9dabd72235be23cd392044e9a954f6580d15b530e1f69dcc1', # PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch + '9a04f4285b800dad8a00c3014af0a9713d40d5dd35d10931c7c0da4e89c558e9', # PyTorch-1.11.0_fix_sharded_imports.patch + # PyTorch-1.11.0_increase_test_tolerances_TF32.patch + '26e179a4f6f57e49209092612ae5f5cd8c03fd2ca84566ba0244eabefc3736ba', + # PyTorch-1.11.0_increase_c10d_gloo_timeout.patch + '20cd4a8663f74ab326fdb032b926bf5c7e94d9750c515ab9050927ba00cf1953', + # PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch + 'e7bfe120a8b3fe2b40dac6839852a5fbab3cb3429fbe44a0fc3a1800adaaee51', +] + +osdependencies = [OS_PKG_IBVERBS_DEV] + +builddependencies = [ + ('CMake', '3.20.1'), + ('hypothesis', '6.13.1'), +] + +dependencies = [ + ('CUDA', '11.3.1', '', True), + ('Ninja', '1.10.2'), # Required for JIT compilation of C++ extensions + ('Python', '3.9.5'), + ('protobuf', '3.17.3'), + ('protobuf-python', '3.17.3'), + ('pybind11', '2.6.2'), + ('SciPy-bundle', '2021.05'), + ('typing-extensions', '3.10.0.0'), + ('PyYAML', '5.4.1'), + ('MPFR', '4.1.0'), + ('GMP', '6.2.1'), + ('numactl', '2.0.14'), + ('FFmpeg', '4.3.2'), + ('Pillow', '8.2.0'), + ('cuDNN', '8.2.1.32', '-CUDA-%(cudaver)s', True), + ('magma', '2.6.1', '-CUDA-%(cudaver)s'), + ('NCCL', '2.10.3', '-CUDA-%(cudaver)s'), + ('expecttest', '0.1.3'), +] + +# default CUDA compute capabilities to use (override via --cuda-compute-capabilities) +cuda_compute_capabilities = ['3.5', '3.7', '5.2', '6.0', '6.1', '7.0', '7.2', '7.5', '8.0', '8.6'] + +custom_opts = ["USE_CUPTI_SO=1"] + +excluded_tests = { + '': [ + # Bad tests: https://github.com/pytorch/pytorch/issues/60260 + 'distributed/elastic/utils/distributed_test', + 'distributed/elastic/multiprocessing/api_test', + # These tests fail on A10s at the very least, they time out forever no matter how long the timeout is. + # Possibly related to NCCL 2.8.3: https://docs.nvidia.com/deeplearning/nccl/release-notes/rel_2-8-3.html + # 'distributed/test_distributed_fork', + 'distributed/test_distributed_spawn', + # Fails on A10s: https://github.com/pytorch/pytorch/issues/63079 + 'test_optim', + # Test from this suite timeout often. 
The process group backend is deprecated anyway + # 'distributed/rpc/test_process_group_agent', + # This test fails constently when run as part of the test suite, but succeeds when run interactively + 'test_model_dump', + ] +} + +runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s' + +# The readelf sanity check command can be taken out once the TestRPATH test from +# https://github.com/pytorch/pytorch/pull/68912 is accepted, since it is then checked as part of the PyTorch test suite +local_libcaffe2 = "$EBROOTPYTORCH/lib/python%%(pyshortver)s/site-packages/torch/lib/libcaffe2_nvrtc.%s" % SHLIB_EXT +sanity_check_commands = [ + "readelf -d %s | egrep 'RPATH|RUNPATH' | grep -v stubs" % local_libcaffe2, +] +tests = ['PyTorch-check-cpp-extension.py'] + +moduleclass = 'devel' diff --git a/p/PyTorch/PyTorch-1.11.0-fosscuda-2020b.eb b/p/PyTorch/PyTorch-1.11.0-fosscuda-2020b.eb index 255a8ef2..b195ac4d 100644 --- a/p/PyTorch/PyTorch-1.11.0-fosscuda-2020b.eb +++ b/p/PyTorch/PyTorch-1.11.0-fosscuda-2020b.eb @@ -1,4 +1,4 @@ -# IT4Innovations +# it4Innovations # LK 2022 name = 'PyTorch' @@ -22,55 +22,45 @@ sources = [{ patches = [ 'PyTorch-1.7.0_avoid-nan-in-test-torch.patch', 'PyTorch-1.7.0_disable-dev-shm-test.patch', -# 'PyTorch-1.7.1_correctly-pass-jit_opt_level.patch', 'PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch', - 'PyTorch-1.8.1_increase-distributed-test-timeout.patch', 'PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch', 'PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch', - 'PyTorch-1.10.0_fix-alias-violation-in-bitwise-ops.patch', - 'PyTorch-1.10.0_fix-faulty-asserts-and-skip-test.patch', - 'PyTorch-1.10.0_fix-test-cond-cpu.patch', - 'PyTorch-1.10.0_fix-vnni-detection.patch', - 'PyTorch-1.10.0_increase_zero_optimizer_test_tolerance.patch', - 'PyTorch-1.10.0_skip_failing_ops_tests.patch', - 'PyTorch-1.10.0_skip_nan_tests_openblas.patch', 'PyTorch-1.10.0_skip_cmake_rpath.patch', + 'PyTorch-1.11.0_increase-distributed-test-timeout.patch', + 'PyTorch-1.11.0_skip_failing_ops_tests.patch', + 'PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch', + 'PyTorch-1.11.0_fix_sharded_imports.patch', + 'PyTorch-1.11.0_increase_test_tolerances_TF32.patch', + 'PyTorch-1.11.0_increase_c10d_gloo_timeout.patch', + 'PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch', ] checksums = [ None, # can't add proper SHA256 checksum, because source tarball is created locally after recursive 'git clone' 'b899aa94d9e60f11ee75a706563312ccefa9cf432756c470caa8e623991c8f18', # PyTorch-1.7.0_avoid-nan-in-test-torch.patch '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a', # PyTorch-1.7.0_disable-dev-shm-test.patch - # PyTorch-1.7.1_correctly-pass-jit_opt_level.patch - 'd4d967d47f8a6172fcbf57f0a61835482968850967c4fdb01108b720696a988d', '89ac7a8e9e7df2e64cf8404fe3a279f5e9b759fee41c9de3aaff9c22f385c2c6', # PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch - # PyTorch-1.8.1_increase-distributed-test-timeout.patch - '7a6e512274f0b8673f4f207a5bc53387d88be7e79833f42d20365668b2118071', # PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch 'ff573660913ce055e24cfd194ce747ba5685091c631cfd443eae2a99d56b57ea', # PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch '313dca681f45ce3bc7c4557fdcdcbe0b77216d2c708fa30a2ec0e22c44876707', - # PyTorch-1.10.0_fix-alias-violation-in-bitwise-ops.patch - '426c9ead1a74b656748d4c8bf8afd4303d8b9f2394ad22b21a845d07c8ca1d12', - # PyTorch-1.10.0_fix-faulty-asserts-and-skip-test.patch - 
'67152215e4530a9b1d7349fb20864445fd815288f04ab9e96e45c73b2d87827a', - # PyTorch-1.10.0_fix-test-cond-cpu.patch - '51f83f5d5ef69656ef35b73f17e0671e70113798421be11ea4c7b56ffcc4da03', - # PyTorch-1.10.0_fix-vnni-detection.patch - '1f3664c0febfa2a3fc4c0cd3bae185f289716ac0b6c3d7e8fa1cee19ba62b7cc', - # PyTorch-1.10.0_increase_zero_optimizer_test_tolerance.patch - 'e65afb01786f7f030ccb5faada1eb474bb0c418bcadcf1baaa71a4fa2f3f4240', - # PyTorch-1.10.0_skip_failing_ops_tests.patch - '399af94ffcef4a6db5226552c46f11e9b0f0f371b2d7924b9e5764d2281581ab', - # PyTorch-1.10.0_skip_nan_tests_openblas.patch - '7d3f83e3056d9e47a460790313238f28708beb596cafaa7ae55e374d368bbedf', - # PyTorch-1.10.0_skip_cmake_rpath.patch - 'ac05943bb205623f91ef140aa00869efc5fe844184bd666bebf5405808610448', + 'ac05943bb205623f91ef140aa00869efc5fe844184bd666bebf5405808610448', # PyTorch-1.10.0_skip_cmake_rpath.patch + # PyTorch-1.11.0_increase-distributed-test-timeout.patch + '087ad20163a1291773ae3457569b80523080eb3731e210946459b2333a919f3f', + '8eaca92d64fcadb0552d28e9c7ea5c4bc669d2fe33004e45a3519ce8d0d136a2', # PyTorch-1.11.0_skip_failing_ops_tests.patch + '21fc678febcdfbb9dabd72235be23cd392044e9a954f6580d15b530e1f69dcc1', # PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch + '9a04f4285b800dad8a00c3014af0a9713d40d5dd35d10931c7c0da4e89c558e9', # PyTorch-1.11.0_fix_sharded_imports.patch + # PyTorch-1.11.0_increase_test_tolerances_TF32.patch + '26e179a4f6f57e49209092612ae5f5cd8c03fd2ca84566ba0244eabefc3736ba', + # PyTorch-1.11.0_increase_c10d_gloo_timeout.patch + '20cd4a8663f74ab326fdb032b926bf5c7e94d9750c515ab9050927ba00cf1953', + # PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch + 'e7bfe120a8b3fe2b40dac6839852a5fbab3cb3429fbe44a0fc3a1800adaaee51', ] osdependencies = [OS_PKG_IBVERBS_DEV] builddependencies = [ - ('CMake', '3.20.1'), # Needs 3.20 or newer. + ('CMake', '3.20.1'), ('hypothesis', '5.41.5'), ] @@ -88,7 +78,6 @@ dependencies = [ ('numactl', '2.0.13'), ('FFmpeg', '4.3.1'), ('Pillow', '8.0.1'), - ('expecttest', '0.1.3'), ('cuDNN', '8.0.4.30', '-CUDA-%(cudaver)s', True), ('magma', '2.5.4'), ('NCCL', '2.8.3', '-CUDA-%(cudaver)s'), @@ -112,16 +101,17 @@ excluded_tests = { 'test_optim', # Test from this suite timeout often. 
The process group backend is deprecated anyway # 'distributed/rpc/test_process_group_agent', + # This test fails constently when run as part of the test suite, but succeeds when run interactively + 'test_model_dump', ] } runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s' -# The readelf sanity check can be taken out once the TestRPATH test from https://github.com/pytorch/pytorch/pull/68912 -# is accepted, since it is then checked as part of the PyTorch test suite +# The readelf sanity check command can be taken out once the TestRPATH test from +# https://github.com/pytorch/pytorch/pull/68912 is accepted, since it is then checked as part of the PyTorch test suite local_libcaffe2 = "$EBROOTPYTORCH/lib/python%%(pyshortver)s/site-packages/torch/lib/libcaffe2_nvrtc.%s" % SHLIB_EXT sanity_check_commands = [ - "python -c 'import caffe2.python'", "readelf -d %s | egrep 'RPATH|RUNPATH' | grep -v stubs" % local_libcaffe2, ] tests = ['PyTorch-check-cpp-extension.py'] diff --git a/p/PyTorch/PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch b/p/PyTorch/PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch new file mode 100644 index 00000000..c589c52b --- /dev/null +++ b/p/PyTorch/PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch @@ -0,0 +1,50 @@ +# Author: Caspar van Leeuwen +# Company: SURF +# We've seen that these tests fail for version 1.11.0, see https://github.com/pytorch/pytorch/issues/76107 +# These failures probably point to underlying issues, but the PR that fixes them touches a ton of files +# It's near-impossible to cherry pick that, without causing other issues. Moreover, +# PyTorch devs have pointed out that nvfuser is not enabled by default in 1.11.0, so chances of anyone +# hitting these issues are very small +# We simply disable the tests and accept that in v 1.11.0 in PyTorch, this functionality is broken. +diff -Nru pytorch_orig/test/test_jit_cuda_fuser.py pytorch/test/test_jit_cuda_fuser.py +--- pytorch_orig/test/test_jit_cuda_fuser.py 2022-04-29 14:54:30.771378000 +0200 ++++ pytorch/test/test_jit_cuda_fuser.py 2022-04-29 14:05:54.067297000 +0200 +@@ -1313,6 +1313,12 @@ + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + @unittest.skipIf(not TEST_BF16, "device does not support BFloat16") ++ # Disable test, since it fails and nnfuser wasn't enabled by default in 1.11 ++ # Thus, even if this points to an underlying issue, it should be extremely rare that ++ # anyone hits it. ++ # See https://github.com/pytorch/pytorch/issues/76107 ++ # and https://github.com/easybuilders/easybuild-easyconfigs/pull/15137 ++ @unittest.skip("Skipping test that is known to fail, see PT #76107") + def test_native_layer_norm_bfloat(self): + dims = 4 + rnds = 3 +@@ -2828,6 +2834,12 @@ + @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") ++ # Disable test, since it fails and nnfuser wasn't enabled by default in 1.11 ++ # Thus, even if this points to an underlying issue, it should be extremely rare that ++ # anyone hits it. 
++ # See https://github.com/pytorch/pytorch/issues/76107 ++ # and https://github.com/easybuilders/easybuild-easyconfigs/pull/15137 ++ @unittest.skip("Skipping test that is known to fail, see PT #76107") + def test_batch_norm_half(self): + with torch.backends.cudnn.flags(enabled=True): + setups = [ +@@ -2843,6 +2855,12 @@ + @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") ++ # Disable test, since it fails and nnfuser wasn't enabled by default in 1.11 ++ # Thus, even if this points to an underlying issue, it should be extremely rare that ++ # anyone hits it. ++ # See https://github.com/pytorch/pytorch/issues/76107 ++ # and https://github.com/easybuilders/easybuild-easyconfigs/pull/15137 ++ @unittest.skip("Skipping test that is known to fail, see PT #76107") + def test_batch_norm_impl_index_correctness(self): + with torch.backends.cudnn.flags(enabled=True): + batch = [2, 7, 16] diff --git a/p/PyTorch/PyTorch-1.11.0_fix_sharded_imports.patch b/p/PyTorch/PyTorch-1.11.0_fix_sharded_imports.patch new file mode 100644 index 00000000..7d46fdad --- /dev/null +++ b/p/PyTorch/PyTorch-1.11.0_fix_sharded_imports.patch @@ -0,0 +1,44 @@ +# Fixes a "NameError: name 'sharded_tensor' is not defined" error +# for the test_named_params_with_sharded_tensor test +# See https://github.com/pytorch/pytorch/pull/73309 +From 012d490ed76d8af8538d310a508b0e09a91b7632 Mon Sep 17 00:00:00 2001 +From: wanchaol +Date: Wed, 23 Feb 2022 12:10:39 -0800 +Subject: [PATCH] [shard] fix some imports in tests + +This fix some imports in sharded optimizer tests + +Differential Revision: [D34427252](https://our.internmc.facebook.com/intern/diff/D34427252/) + +[ghstack-poisoned] +--- + .../_shard/sharded_optim/test_sharded_optim.py | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +diff --git a/test/distributed/_shard/sharded_optim/test_sharded_optim.py b/test/distributed/_shard/sharded_optim/test_sharded_optim.py +index 085c928985eb..d3f1468aea3c 100644 +--- a/test/distributed/_shard/sharded_optim/test_sharded_optim.py ++++ b/test/distributed/_shard/sharded_optim/test_sharded_optim.py +@@ -2,7 +2,10 @@ + + import torch + import torch.optim as optim +-import torch.distributed._shard.sharded_tensor ++from torch.distributed._shard import ( ++ sharded_tensor, ++ shard_parameter ++) + + from copy import deepcopy + from torch.distributed._shard.sharding_spec import ( +@@ -77,8 +80,8 @@ def shard_parameter(self): + ], + ) + +- sharded_tensor.shard_parameter(self.linear1, "weight", rowwise_sharding_spec) +- sharded_tensor.shard_parameter(self.linear2, "weight", colwise_sharding_spec) ++ shard_parameter(self.linear1, "weight", rowwise_sharding_spec) ++ shard_parameter(self.linear2, "weight", colwise_sharding_spec) + + def forward(self, inp): + return self.linear2(self.gelu(self.linear1(inp))) diff --git a/p/PyTorch/PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch b/p/PyTorch/PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch new file mode 100644 index 00000000..4368fb8f --- /dev/null +++ b/p/PyTorch/PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch @@ -0,0 +1,439 @@ +# Author: Caspar van Leeuwen +# Company: SURF +# The CudaFuser test suite checks CUDA capabilities, even if 'RUN_CUDA' is false. +# That makes the test fail on non-GPU nodes. 
+# In this patch, I wrapped the logic in 'if RUN_CUDA' blocks in order to make sure +# no CUDA calls are made when RUN_CUDA=false +# Furthermore, I swapped all occurences of @unittest.skipIf(not RUN_CUDA, ...) and @unittest.skipIf(is_pre_volta()) +# The latter check is a more specific 'skip' condition: you should only check if a GPU is pre-volta, +# if there are CUDA devices present to begin with. Again, doing this in the wrong order would incur CUDA calls +# on non-CUDA nodes. +# Note that this has been fixed in master, so we probably don't need this patch beyond PT 1.11 +diff -Nru pytorch-1.11.0-rc3.orig/test/test_jit_cuda_fuser.py pytorch-1.11.0-rc3/test/test_jit_cuda_fuser.py +--- pytorch-1.11.0-rc3.orig/test/test_jit_cuda_fuser.py 2022-02-24 18:06:55.180421593 +0100 ++++ pytorch-1.11.0-rc3/test/test_jit_cuda_fuser.py 2022-02-25 13:30:47.112845480 +0100 +@@ -57,18 +57,25 @@ + torch._C._jit_set_nvfuser_horizontal_mode(old_value) + + def is_pre_volta(): +- prop = torch.cuda.get_device_properties(torch.cuda.current_device()) +- return prop.major < 7 +- +-TEST_BF16 = torch.cuda.is_bf16_supported() ++ if RUN_CUDA: ++ prop = torch.cuda.get_device_properties(torch.cuda.current_device()) ++ return prop.major < 7 ++ else: ++ return True ++ ++if RUN_CUDA: ++ TEST_BF16 = torch.cuda.is_bf16_supported() ++else: ++ TEST_BF16=False + + class TestCudaFuser(JitTestCase): + +- special_values = torch.tensor( +- [float("-inf"), -10, -math.pi, +- -1, -0.5, 0, 1, 0.5, +- math.pi, 10, float("inf"), +- float("nan")], dtype=torch.float, device='cuda') ++ if RUN_CUDA: ++ special_values = torch.tensor( ++ [float("-inf"), -10, -math.pi, ++ -1, -0.5, 0, 1, 0.5, ++ math.pi, 10, float("inf"), ++ float("nan")], dtype=torch.float, device='cuda') + + int_types = [ + torch.int8, +@@ -253,8 +260,8 @@ + self.assertEqual(o, jit_o) + self.assertGraphContains(t_jit.graph_for(x, y, z, q), FUSION_GUARD) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_reduction_dtypes_axis(self): +@@ -1120,8 +1127,8 @@ + self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4)) + self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_reduction(self): +@@ -1170,8 +1177,8 @@ + FileCheck().check(FUSION_GUARD).run(g) + FileCheck().check(FUSION_GUARD).run(v2.graph) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_layer_norm_autodiff(self): +@@ -1212,8 +1219,8 @@ + args.append(torch.randn(shapes, dtype=torch.float32, device="cuda").requires_grad_()) + self._layer_norm_autodiff_helper(m, grad, shapes, args) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + 
@unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_layer_norm_parser(self): +@@ -1273,8 +1280,8 @@ + self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD) + + @unittest.skipIf(True, "codegen failure awaiting fix") +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_native_layer_norm(self): +@@ -1288,8 +1295,8 @@ + self._native_layer_norm_helper(input_shape, norm_shape, torch.float32, "cuda", 1e-4, affine) + + @unittest.skipIf(True, "codegen failure awaiting fix") +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_native_layer_norm_half(self): +@@ -1301,8 +1308,8 @@ + norm_shape = [input_shape[idx] for idx in range(dims - offset, dims)] + self._native_layer_norm_helper(input_shape, norm_shape, torch.float16, "cuda", 5e-3) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + @unittest.skipIf(not TEST_BF16, "device does not support BFloat16") +@@ -1362,8 +1369,8 @@ + self.assertTrue(self._compare("comparing running_var failed", eager_running_var, jit_running_var, error)) + self.assertGraphContains(t_jit.graph_for(x, running_mean, running_var), FUSION_GUARD) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_norm_channels_last(self): +@@ -1374,8 +1381,8 @@ + for mf in [torch.channels_last, torch.contiguous_format]: + self._norm_helper(size, torch.float32, "cuda", 1e-4, is_batch_norm_else_instance_norm, memory_format=mf) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_norm(self): +@@ -1391,8 +1398,8 @@ + x[1] = C + self._norm_helper(x, torch.float32, "cuda", 1e-4, is_batch_norm_else_instance_norm) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def 
test_norm_large(self): +@@ -1407,8 +1414,8 @@ + x[1] = C + self._norm_helper(x, torch.float32, "cuda", 1e-4, is_batch_norm_else_instance_norm) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_norm_half(self): +@@ -1424,8 +1431,8 @@ + x[1] = C + self._norm_helper(x, torch.float16, "cuda", 5e-3, is_batch_norm_else_instance_norm) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + @unittest.skipIf(not TEST_BF16, "device does not support BFloat16") +@@ -1469,8 +1476,8 @@ + self.assertTrue(self._compare("comparing output failed", o, jit_o, error)) + self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_softmax_dtype(self): +@@ -1511,8 +1518,8 @@ + )[0].graph + FileCheck().check(FUSION_GUARD).run(bwd_graph) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test__softmax_function(self): +@@ -1535,8 +1542,8 @@ + self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-3)) + self.assertGraphContainsExactly(t_jit.graph_for(x, y), FUSION_GUARD, 1, consider_subgraphs=True) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test__softmax_function_half_to_float(self): +@@ -1559,8 +1566,8 @@ + self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-3)) + self.assertGraphContainsExactly(t_jit.graph_for(x, y), FUSION_GUARD, 1, consider_subgraphs=True) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_softmax(self): +@@ -1575,8 +1582,8 @@ + x[reduction_dim] = reduction_size + self._softmax_helper(x, reduction_dim, torch.float32, "cuda", 1e-4) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != 
ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_softmax_half(self): +@@ -1591,8 +1598,8 @@ + x[reduction_dim] = reduction_size + self._softmax_helper(x, reduction_dim, torch.float16, "cuda", 5e-3) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + @unittest.skipIf(not TEST_BF16, "device does not support BFloat16") +@@ -1608,8 +1615,8 @@ + x[reduction_dim] = reduction_size + self._softmax_helper(x, reduction_dim, torch.bfloat16, "cuda", 1e-1) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_reduction_permutation(self): +@@ -1622,8 +1629,8 @@ + for perm1 in itertools.permutations(range(len(x))): + self._reduction_helper(x, axes, torch.float32, "cuda", perm0, perm1) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_reduction_multiple_output(self): +@@ -1767,8 +1774,8 @@ + self.assertEqual(o, jit_o) + ''' + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_pw_single_reduction_partition(self): +@@ -1792,8 +1799,8 @@ + self.assertEqual(o, jit_o) + self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_permutation_preservation(self): +@@ -1830,8 +1837,8 @@ + self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD) + self.assertTrue(jit_o.is_contiguous(memory_format=torch.channels_last)) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_normalization_partition(self): +@@ -1858,8 +1865,8 @@ + self.assertEqual(o, jit_o) + self.assertGraphContains(t_jit.graph_for(x, y, z, r_m, r_v), FUSION_GUARD) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != 
ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_sum_to_one(self): +@@ -1879,8 +1886,8 @@ + self.assertEqual(o, jit_o) + self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_single_reduction_broadcast(self): +@@ -1903,8 +1910,8 @@ + self.assertEqual(o, jit_o) + self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_trivial_reduction(self): +@@ -1940,8 +1947,8 @@ + repro_jit = torch.jit.script(repro) + self._run_helper(repro_jit, repro, x, 0.6) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_reduction_sizes_op(self): +@@ -1964,8 +1971,8 @@ + # have been optimized away + self.assertGraphContainsExactly(t_jit.graph_for(x, y), FUSION_GUARD, 0) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_profile_ivalue(self): +@@ -1987,8 +1994,8 @@ + self.assertEqual(o, jit_o) + self.assertGraphContains(t_jit.graph_for(x, y, (0, 1), False), FUSION_GUARD) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_sum_to_size(self): +@@ -2021,8 +2028,8 @@ + self.assertEqual(o.dtype, jit_o.dtype) + self.assertEqual(o, jit_o) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_grad_sum_to_size(self): +@@ -2145,8 +2152,8 @@ + self.assertTrue((percent_zeros >= (prob - 0.01)) and (percent_zeros <= (prob + 0.01))) + self.assertGraphContainsExactly(t_jit.graph_for(x, prob, True), FUSION_GUARD, 1, consider_subgraphs=True) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization 
pass to be effective") + def test_dropout_training_fusion(self): +@@ -2294,8 +2301,8 @@ + self.assertEqual(x.grad.dtype, x.dtype) + self.assertEqual(y.grad.dtype, y.dtype) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_autocast_1(self): +@@ -2331,8 +2338,8 @@ + self.assertEqual(x.grad.dtype, x.dtype) + self.assertEqual(y.grad.dtype, y.dtype) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_autocast_2(self): +@@ -2367,8 +2374,8 @@ + self.assertEqual(jit_o.dtype, torch.float) + self.assertEqual(x.grad.dtype, x.dtype) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + @unittest.skipIf(not TEST_BF16, "device does not support BFloat16") +@@ -2405,8 +2412,8 @@ + self.assertEqual(x.grad.dtype, x.dtype) + self.assertEqual(y.grad.dtype, y.dtype) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + @unittest.skipIf(not TEST_BF16, "device does not support BFloat16") +@@ -2817,8 +2824,8 @@ + ref_module.bn.running_var, + e0)) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_batch_norm_half(self): +@@ -2832,8 +2839,8 @@ + training, track_running_stats = training_and_track + self._test_batch_norm_impl_index_helper(4, 8, 5, affine, track_running_stats, training, torch.half) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_batch_norm_impl_index_correctness(self): +@@ -2947,8 +2954,8 @@ + self.assertGraphContainsExactly(graph, FUSION_GROUP, 0) + self.assertGraphContains(graph, 'prim::add_optional', True) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def 
test_remove_output_used_only_in_dtype(self): +@@ -2980,8 +2987,8 @@ + graph = jitted.graph_for(x, y) + self.assertGraphContains(graph, FUSION_GROUP, True) + +- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_CUDA, "requires CUDA") ++ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_fix_shape_expression_bn(self): diff --git a/p/PyTorch/PyTorch-1.11.0_increase-distributed-test-timeout.patch b/p/PyTorch/PyTorch-1.11.0_increase-distributed-test-timeout.patch new file mode 100644 index 00000000..05ff461f --- /dev/null +++ b/p/PyTorch/PyTorch-1.11.0_increase-distributed-test-timeout.patch @@ -0,0 +1,17 @@ +It seems the timeout for the distributed tests is set to low and spurious failures can be seen +Increase it by a factor of 6 similar to torch/testing/_internal/distributed/distributed_test.py + +Original patch by Alexander Grund (TU Dresden), updated by Caspar van Leeuwen (SURF) + +diff -Nru pytorch-1.11.0-rc3.orig/torch/testing/_internal/common_distributed.py pytorch-1.11.0-rc3/torch/testing/_internal/common_distributed.py +--- pytorch-1.11.0-rc3.orig/torch/testing/_internal/common_distributed.py 2022-02-24 18:07:16.414274654 +0100 ++++ pytorch-1.11.0-rc3/torch/testing/_internal/common_distributed.py 2022-02-24 18:08:31.772851148 +0100 +@@ -321,7 +321,7 @@ + # TSAN runs much slower. + TIMEOUT_DEFAULT = 500 + else: +- TIMEOUT_DEFAULT = 100 ++ TIMEOUT_DEFAULT = 600 + TIMEOUT_OVERRIDE = {"test_ddp_uneven_inputs": 400} + + diff --git a/p/PyTorch/PyTorch-1.11.0_increase_c10d_gloo_timeout.patch b/p/PyTorch/PyTorch-1.11.0_increase_c10d_gloo_timeout.patch new file mode 100644 index 00000000..15d6be2f --- /dev/null +++ b/p/PyTorch/PyTorch-1.11.0_increase_c10d_gloo_timeout.patch @@ -0,0 +1,16 @@ +# Author: Caspar van Leeuwen +# Institute: SURF +# Increase timeout for c10d gloo process group operations since test_allreduce_coalesced_basics +# was failing with a timeout (see https://github.com/easybuilders/easybuild-easyconfigs/pull/15137) +diff -Nru pytorch/test/distributed/test_c10d_gloo.py pytorch_orig/test/distributed/test_c10d_gloo.py +--- pytorch/test/distributed/test_c10d_gloo.py 2022-04-19 15:27:48.540163735 +0200 ++++ pytorch_orig/test/distributed/test_c10d_gloo.py 2022-04-07 18:31:13.110755000 +0200 +@@ -216,7 +216,7 @@ + + def opts(self, threads=2): + opts = c10d.ProcessGroupGloo._Options() +- opts._timeout = 5.0 ++ opts._timeout = 50.0 + opts._devices = [create_device(interface=LOOPBACK)] + opts._threads = threads + return opts diff --git a/p/PyTorch/PyTorch-1.11.0_increase_test_tolerances_TF32.patch b/p/PyTorch/PyTorch-1.11.0_increase_test_tolerances_TF32.patch new file mode 100644 index 00000000..38a6d0ee --- /dev/null +++ b/p/PyTorch/PyTorch-1.11.0_increase_test_tolerances_TF32.patch @@ -0,0 +1,143 @@ +# Author: Caspar van Leeuwen, SURF +# Fixes failing tests due to use of TensorFloat32 +# Setting NVIDIA_TF32_OVERRIDE=0 makes these tests pass, proving that TensorFloat32 is the issue +# We increase tolerances for the asserts to make these tests pass +diff -Nru pytorch_orig/test/distributed/_shard/sharded_tensor/ops/test_linear.py pytorch/test/distributed/_shard/sharded_tensor/ops/test_linear.py +--- pytorch_orig/test/distributed/_shard/sharded_tensor/ops/test_linear.py 2022-04-07 18:31:13.069599000 +0200 ++++ pytorch/test/distributed/_shard/sharded_tensor/ops/test_linear.py 
2022-04-07 18:32:32.877406000 +0200 +@@ -77,7 +77,7 @@ + local_output = local_linear(inp) + + # Verify +- self.assertEqual(local_output, sharded_output) ++ self.assertEqual(local_output, sharded_output, rtol=0.02, atol=1e-03) + + # Validate for torch.nn.functional.linear version. + local_output = torch.nn.functional.linear( +@@ -91,7 +91,7 @@ + # for reshard. We need to squeeze the # of dimensions manually. + if inp.dim() == 1: + sharded_output = sharded_output.squeeze(reshard_spec.dim) +- self.assertEqual(local_output, sharded_output) ++ self.assertEqual(local_output, sharded_output, rtol=0.02, atol=1e-03) + + # Compute loss and run backward pass. + local_output.sum().backward() +@@ -114,7 +114,7 @@ + + # Test backward gradient calculation. + self.assertEqual(sharded_linear.bias.grad, local_bias_grad) +- self.assertEqual(sharded_weight.grad, local_grad_narrowed) ++ self.assertEqual(sharded_weight.grad, local_grad_narrowed, rtol=0.01, atol=1e-03) + + # Test optimizer. + previous = local_linear.weight.clone().detach() +@@ -135,7 +135,7 @@ + ) + self.assertEqual(sharded_weight.size(), local_weight_narrowed.size()) + self.assertNotEqual(previous_sharded_weight, sharded_weight) +- self.assertEqual(sharded_weight, local_weight_narrowed) ++ self.assertEqual(sharded_weight, local_weight_narrowed, rtol=0.01, atol=1e-04) + self.assertNotEqual(previous_sharded_bias, sharded_linear.bias) + self.assertEqual(sharded_linear.bias, local_linear.bias) + +diff -Nru pytorch_orig/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py pytorch/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py +--- pytorch_orig/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py 2022-04-07 18:31:13.091710000 +0200 ++++ pytorch/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py 2022-04-07 18:41:03.744644000 +0200 +@@ -113,7 +113,7 @@ + local_output = local_megatron_lm(inp) + + # Verify +- self.assertEqual(local_output, sharded_output) ++ self.assertEqual(local_output, sharded_output, rtol=0.01, atol=1e-03) + + # Compute loss and run backward pass. + local_output.sum().backward() +@@ -161,9 +161,9 @@ + ) + + # Test backward gradient calculation. +- self.assertEqual(sharded_weight_fc1.grad, local_grad_narrowed_fc1) +- self.assertEqual(sharded_weight_fc2.grad, local_grad_narrowed_fc2) +- self.assertEqual(bias_grad_fc1, local_bias_grad_fc1) ++ self.assertEqual(sharded_weight_fc1.grad, local_grad_narrowed_fc1, rtol=0.01, atol=2e-03) ++ self.assertEqual(sharded_weight_fc2.grad, local_grad_narrowed_fc2, rtol=0.01, atol=1e-03) ++ self.assertEqual(bias_grad_fc1, local_bias_grad_fc1, rtol=0.01, atol=2e-02) + self.assertEqual(bias_grad_fc2, local_bias_grad_fc2) + + # Test optimizer. 
+@@ -171,7 +171,7 @@ + local_bias_fc1, local_bias_fc2 = _get_bias(local_megatron_lm) + self.assertEqual(bias_fc1, local_bias_fc1) + self.assertEqual(bias_fc2, local_bias_fc2) +- self.assertEqual(bias_fc1.grad, local_bias_fc1.grad) ++ self.assertEqual(bias_fc1.grad, local_bias_fc1.grad, rtol=0.01, atol=2e-02) + self.assertEqual(bias_fc2.grad, local_bias_fc2.grad) + previous_sharded_weight_fc1 = sharded_weight_fc1.clone() + previous_sharded_weight_fc2 = sharded_weight_fc2.clone() +@@ -197,13 +197,13 @@ + self.assertEqual(sharded_weight_fc2.size(), local_weight_fc2_narrowed.size()) + self.assertNotEqual(previous_sharded_weight_fc1, sharded_weight_fc1) + self.assertNotEqual(previous_sharded_weight_fc2, sharded_weight_fc2) +- self.assertEqual(sharded_weight_fc1, local_weight_fc1_narrowed) +- self.assertEqual(sharded_weight_fc2, local_weight_fc2_narrowed) ++ self.assertEqual(sharded_weight_fc1, local_weight_fc1_narrowed, rtol=0.01, atol=1e-03) ++ self.assertEqual(sharded_weight_fc2, local_weight_fc2_narrowed, rtol=0.01, atol=1e-03) + + # Test bias value after optimizer. + local_bias_fc1, local_bias_fc2 = _get_bias(local_megatron_lm) + self.assertNotEqual(previous_bias_fc1, bias_fc1) +- self.assertEqual(bias_fc1, local_bias_fc1) ++ self.assertEqual(bias_fc1, local_bias_fc1, rtol=0.01, atol=1e-03) + self.assertNotEqual(previous_bias_fc2, bias_fc2) + self.assertEqual(bias_fc2, local_bias_fc2) + +diff -Nru pytorch_orig/test/test_stateless.py pytorch/test/test_stateless.py +--- pytorch_orig/test/test_stateless.py 2022-04-07 18:31:13.029968000 +0200 ++++ pytorch/test/test_stateless.py 2022-04-07 18:43:46.723968000 +0200 +@@ -42,7 +42,7 @@ + # existing params in module. So here we expect the result to be the + # same as the input if the weight swapping went well. 
+         res = _stateless.functional_call(module, parameters, x) +-        self.assertEqual(x, res) ++        self.assertEqual(x, res, rtol=1e-04, atol=1e-04) +         # check that the weight remain unmodified +         cur_weight = to_check.l1.weight +         cur_buffer = to_check.buffer +diff -Nru pytorch_orig/test/test_jit_fuser_te.py pytorch/test/test_jit_fuser_te.py +--- pytorch_orig/test/test_jit_fuser_te.py 2022-04-07 18:31:13.046680000 +0200 ++++ pytorch/test/test_jit_fuser_te.py 2022-04-12 18:21:00.355114000 +0200 +@@ -956,7 +956,7 @@ +     def test_lstm_traced(self): +         for device in self.devices: +             inputs = get_lstm_inputs(device) +-            ge = self.checkTrace(LSTMCellF, inputs) ++            ge = self.checkTrace(LSTMCellF, inputs, atol=1e-4, rtol=1e-5) +             graph = ge.graph_for(*inputs) +             fusion_groups = self.findFusionGroups(graph) +             # TODO: chunk +diff -Nru pytorch_orig/torch/testing/_internal/jit_utils.py pytorch/torch/testing/_internal/jit_utils.py +--- pytorch_orig/torch/testing/_internal/jit_utils.py 2022-04-07 18:28:54.339477000 +0200 ++++ pytorch/torch/testing/_internal/jit_utils.py 2022-04-12 18:19:59.614272000 +0200 +@@ -525,7 +525,7 @@ +     def checkTrace(self, func, reference_tensors, input_tensors=None, +                    drop=None, allow_unused=False, verbose=False, +                    inputs_require_grads=True, check_tolerance=1e-5, export_import=True, +-                   _force_outplace=False): ++                   _force_outplace=False, rtol=None, atol=None): + +         # TODO: check gradients for parameters, not just inputs +         def allSum(vs): +@@ -618,7 +618,10 @@ + +         self.assertEqual(outputs, outputs_ge) +         if inputs_require_grads: +-            self.assertEqual(grads, grads_ge) ++            if atol is not None and rtol is not None: ++                self.assertEqual(grads, grads_ge, atol=atol, rtol=rtol) ++            else: ++                self.assertEqual(grads, grads_ge) +         for g2, g2_ge in zip(grads2, grads2_ge): +             if g2 is None and g2_ge is None: +                 continue diff --git a/p/PyTorch/PyTorch-1.11.0_skip_failing_ops_tests.patch b/p/PyTorch/PyTorch-1.11.0_skip_failing_ops_tests.patch new file mode 100644 index 00000000..25bac0b7 --- /dev/null +++ b/p/PyTorch/PyTorch-1.11.0_skip_failing_ops_tests.patch @@ -0,0 +1,35 @@ +# Author: Caspar van Leeuwen +# Company: SURF +# Test 'test_fn_grad_linalg_det_singular_cpu_complex128' and test_variant_consistency_jit_contiguous_cpu_float32 fail +# See https://github.com/pytorch/pytorch/issues/67767 and https://github.com/pytorch/pytorch/issues/67838 +# For the first one, devs recommended to switch it off while they revisit the code. +# For the second: the test works interactively when run with +# python -m unittest test_ops.TestJitCPU.test_variant_consistency_jit_contiguous_cpu_float32 -v +# This shows there is no fundamental problem with the installation, +# but something in the environment when run as 'python run_test.py' makes it fail.
+diff -Nru pytorch-1.11.0-rc3.orig/torch/testing/_internal/common_methods_invocations.py pytorch-1.11.0-rc3/torch/testing/_internal/common_methods_invocations.py +--- pytorch-1.11.0-rc3.orig/torch/testing/_internal/common_methods_invocations.py 2022-02-24 18:07:16.430276050 +0100 ++++ pytorch-1.11.0-rc3/torch/testing/_internal/common_methods_invocations.py 2022-02-24 19:38:11.610293957 +0100 +@@ -8791,7 +8791,10 @@ + supports_fwgrad_bwgrad=True, + autodiff_fusible_nodes=['aten::contiguous'], + assert_jit_shape_analysis=True, +- supports_out=False), ++ supports_out=False, ++ skips=( ++ DecorateInfo(unittest.skip("Skipped!"), 'TestJit', 'test_variant_consistency_jit', device_type='cpu'), ++ )), + OpInfo('sum_to_size', + op=lambda x, *args, **kwargs: x.sum_to_size(*args, **kwargs), + dtypes=floating_and_complex_types_and(torch.float16, torch.bfloat16), +@@ -9746,6 +9749,10 @@ + DecorateInfo(unittest.skip("Skipped!"), 'TestMathBits', 'test_neg_view', device_type='cuda'), + DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_dtypes'), + DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_gradgrad'), ++ # It also breaks on CPU. We'll revisit this once `linalg.lu_solve` is a thing ++ # See https://github.com/pytorch/pytorch/pull/64387 and https://github.com/pytorch/pytorch/issues/67767 ++ DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_grad', ++ dtypes=(torch.complex128,)), + )), + OpInfo('linalg.cholesky', + aten_name='linalg_cholesky',