	new file:   h/HyperQueue/HyperQueue-0.10.0.eb

new file:   p/PyTorch/PyTorch-1.11.0-foss-2021a-CUDA-11.3.1.eb
	modified:   p/PyTorch/PyTorch-1.11.0-fosscuda-2020b.eb
	new file:   p/PyTorch/PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch
	new file:   p/PyTorch/PyTorch-1.11.0_fix_sharded_imports.patch
	new file:   p/PyTorch/PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch
	new file:   p/PyTorch/PyTorch-1.11.0_increase-distributed-test-timeout.patch
	new file:   p/PyTorch/PyTorch-1.11.0_increase_c10d_gloo_timeout.patch
	new file:   p/PyTorch/PyTorch-1.11.0_increase_test_tolerances_TF32.patch
	new file:   p/PyTorch/PyTorch-1.11.0_skip_failing_ops_tests.patch
Jakub Kropacek 2022-05-23 11:13:06 +02:00 committed by easybuild
parent ce632b3121
commit 094a092a73
10 changed files with 912 additions and 35 deletions

h/HyperQueue/HyperQueue-0.10.0.eb View File

@ -0,0 +1,24 @@
# IT4Innovations
# JK 2022
easyblock = 'PackedBinary'
name = 'HyperQueue'
version = '0.10.0'
homepage = 'https://it4innovations.github.io/hyperqueue/'
description = """HyperQueue lets you build a computation plan consisting of a large amount of tasks and then execute it transparently over a system like SLURM/PBS. It dynamically groups jobs into SLURM/PBS jobs and distributes them to fully utilize allocated notes. You thus do not have to manually aggregate your tasks into SLURM/PBS jobs."""
toolchain = SYSTEM
source_urls = ['https://github.com/It4innovations/hyperqueue/releases/download/v%(version)s/']
sources = ['hq-v%(version)s-linux-x64.tar.gz']
checksums = ['2513d5ce7e8b31ace17f5054058c3fed7900ef61e3aa0f27d66f794533cd152c']
sanity_check_paths = {
'files': ['hq'],
'dirs': [],
}
moduleclass = 'devel'

p/PyTorch/PyTorch-1.11.0-foss-2021a-CUDA-11.3.1.eb View File

@ -0,0 +1,119 @@
name = 'PyTorch'
version = '1.11.0'
versionsuffix = '-CUDA-%(cudaver)s'
homepage = 'https://pytorch.org/'
description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.
PyTorch is a deep learning framework that puts Python first."""
toolchain = {'name': 'foss', 'version': '2021a'}
sources = [{
'filename': '%(name)s-%(version)s.tar.gz',
'git_config': {
'url': 'https://github.com/pytorch',
'repo_name': 'pytorch',
'tag': 'v%(version)s',
'recursive': True,
},
}]
patches = [
'PyTorch-1.7.0_avoid-nan-in-test-torch.patch',
'PyTorch-1.7.0_disable-dev-shm-test.patch',
'PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch',
'PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch',
'PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch',
'PyTorch-1.10.0_skip_cmake_rpath.patch',
'PyTorch-1.11.0_increase-distributed-test-timeout.patch',
'PyTorch-1.11.0_skip_failing_ops_tests.patch',
'PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch',
'PyTorch-1.11.0_fix_sharded_imports.patch',
'PyTorch-1.11.0_increase_test_tolerances_TF32.patch',
'PyTorch-1.11.0_increase_c10d_gloo_timeout.patch',
'PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch',
]
checksums = [
None, # can't add proper SHA256 checksum, because source tarball is created locally after recursive 'git clone'
'b899aa94d9e60f11ee75a706563312ccefa9cf432756c470caa8e623991c8f18', # PyTorch-1.7.0_avoid-nan-in-test-torch.patch
'622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a', # PyTorch-1.7.0_disable-dev-shm-test.patch
'89ac7a8e9e7df2e64cf8404fe3a279f5e9b759fee41c9de3aaff9c22f385c2c6', # PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch
# PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch
'ff573660913ce055e24cfd194ce747ba5685091c631cfd443eae2a99d56b57ea',
# PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch
'313dca681f45ce3bc7c4557fdcdcbe0b77216d2c708fa30a2ec0e22c44876707',
'ac05943bb205623f91ef140aa00869efc5fe844184bd666bebf5405808610448', # PyTorch-1.10.0_skip_cmake_rpath.patch
# PyTorch-1.11.0_increase-distributed-test-timeout.patch
'087ad20163a1291773ae3457569b80523080eb3731e210946459b2333a919f3f',
'8eaca92d64fcadb0552d28e9c7ea5c4bc669d2fe33004e45a3519ce8d0d136a2', # PyTorch-1.11.0_skip_failing_ops_tests.patch
'21fc678febcdfbb9dabd72235be23cd392044e9a954f6580d15b530e1f69dcc1', # PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch
'9a04f4285b800dad8a00c3014af0a9713d40d5dd35d10931c7c0da4e89c558e9', # PyTorch-1.11.0_fix_sharded_imports.patch
# PyTorch-1.11.0_increase_test_tolerances_TF32.patch
'26e179a4f6f57e49209092612ae5f5cd8c03fd2ca84566ba0244eabefc3736ba',
# PyTorch-1.11.0_increase_c10d_gloo_timeout.patch
'20cd4a8663f74ab326fdb032b926bf5c7e94d9750c515ab9050927ba00cf1953',
# PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch
'e7bfe120a8b3fe2b40dac6839852a5fbab3cb3429fbe44a0fc3a1800adaaee51',
]
osdependencies = [OS_PKG_IBVERBS_DEV]
builddependencies = [
('CMake', '3.20.1'),
('hypothesis', '6.13.1'),
]
dependencies = [
('CUDA', '11.3.1', '', True),
('Ninja', '1.10.2'), # Required for JIT compilation of C++ extensions
('Python', '3.9.5'),
('protobuf', '3.17.3'),
('protobuf-python', '3.17.3'),
('pybind11', '2.6.2'),
('SciPy-bundle', '2021.05'),
('typing-extensions', '3.10.0.0'),
('PyYAML', '5.4.1'),
('MPFR', '4.1.0'),
('GMP', '6.2.1'),
('numactl', '2.0.14'),
('FFmpeg', '4.3.2'),
('Pillow', '8.2.0'),
('cuDNN', '8.2.1.32', '-CUDA-%(cudaver)s', True),
('magma', '2.6.1', '-CUDA-%(cudaver)s'),
('NCCL', '2.10.3', '-CUDA-%(cudaver)s'),
('expecttest', '0.1.3'),
]
# default CUDA compute capabilities to use (override via --cuda-compute-capabilities)
cuda_compute_capabilities = ['3.5', '3.7', '5.2', '6.0', '6.1', '7.0', '7.2', '7.5', '8.0', '8.6']
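# for example (hypothetical invocation): eb PyTorch-1.11.0-foss-2021a-CUDA-11.3.1.eb --cuda-compute-capabilities=8.0,8.6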
custom_opts = ["USE_CUPTI_SO=1"]
excluded_tests = {
'': [
# Bad tests: https://github.com/pytorch/pytorch/issues/60260
'distributed/elastic/utils/distributed_test',
'distributed/elastic/multiprocessing/api_test',
# These tests fail on A10s at the very least, they time out forever no matter how long the timeout is.
# Possibly related to NCCL 2.8.3: https://docs.nvidia.com/deeplearning/nccl/release-notes/rel_2-8-3.html
# 'distributed/test_distributed_fork',
'distributed/test_distributed_spawn',
# Fails on A10s: https://github.com/pytorch/pytorch/issues/63079
'test_optim',
        # Tests from this suite time out often. The process group backend is deprecated anyway
# 'distributed/rpc/test_process_group_agent',
        # This test fails consistently when run as part of the test suite, but succeeds when run interactively
'test_model_dump',
]
}
runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s'
# The readelf sanity check command can be taken out once the TestRPATH test from
# https://github.com/pytorch/pytorch/pull/68912 is accepted, since it is then checked as part of the PyTorch test suite
local_libcaffe2 = "$EBROOTPYTORCH/lib/python%%(pyshortver)s/site-packages/torch/lib/libcaffe2_nvrtc.%s" % SHLIB_EXT
sanity_check_commands = [
"readelf -d %s | egrep 'RPATH|RUNPATH' | grep -v stubs" % local_libcaffe2,
]
tests = ['PyTorch-check-cpp-extension.py']
moduleclass = 'devel'

p/PyTorch/PyTorch-1.11.0-fosscuda-2020b.eb View File

@ -1,4 +1,4 @@
# IT4Innovations
# it4Innovations
# LK 2022
name = 'PyTorch'
@ -22,55 +22,45 @@ sources = [{
patches = [
'PyTorch-1.7.0_avoid-nan-in-test-torch.patch',
'PyTorch-1.7.0_disable-dev-shm-test.patch',
# 'PyTorch-1.7.1_correctly-pass-jit_opt_level.patch',
'PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch',
'PyTorch-1.8.1_increase-distributed-test-timeout.patch',
'PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch',
'PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch',
'PyTorch-1.10.0_fix-alias-violation-in-bitwise-ops.patch',
'PyTorch-1.10.0_fix-faulty-asserts-and-skip-test.patch',
'PyTorch-1.10.0_fix-test-cond-cpu.patch',
'PyTorch-1.10.0_fix-vnni-detection.patch',
'PyTorch-1.10.0_increase_zero_optimizer_test_tolerance.patch',
'PyTorch-1.10.0_skip_failing_ops_tests.patch',
'PyTorch-1.10.0_skip_nan_tests_openblas.patch',
'PyTorch-1.10.0_skip_cmake_rpath.patch',
'PyTorch-1.11.0_increase-distributed-test-timeout.patch',
'PyTorch-1.11.0_skip_failing_ops_tests.patch',
'PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch',
'PyTorch-1.11.0_fix_sharded_imports.patch',
'PyTorch-1.11.0_increase_test_tolerances_TF32.patch',
'PyTorch-1.11.0_increase_c10d_gloo_timeout.patch',
'PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch',
]
checksums = [
None, # can't add proper SHA256 checksum, because source tarball is created locally after recursive 'git clone'
'b899aa94d9e60f11ee75a706563312ccefa9cf432756c470caa8e623991c8f18', # PyTorch-1.7.0_avoid-nan-in-test-torch.patch
'622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a', # PyTorch-1.7.0_disable-dev-shm-test.patch
# PyTorch-1.7.1_correctly-pass-jit_opt_level.patch
'd4d967d47f8a6172fcbf57f0a61835482968850967c4fdb01108b720696a988d',
'89ac7a8e9e7df2e64cf8404fe3a279f5e9b759fee41c9de3aaff9c22f385c2c6', # PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch
# PyTorch-1.8.1_increase-distributed-test-timeout.patch
'7a6e512274f0b8673f4f207a5bc53387d88be7e79833f42d20365668b2118071',
# PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch
'ff573660913ce055e24cfd194ce747ba5685091c631cfd443eae2a99d56b57ea',
# PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch
'313dca681f45ce3bc7c4557fdcdcbe0b77216d2c708fa30a2ec0e22c44876707',
# PyTorch-1.10.0_fix-alias-violation-in-bitwise-ops.patch
'426c9ead1a74b656748d4c8bf8afd4303d8b9f2394ad22b21a845d07c8ca1d12',
# PyTorch-1.10.0_fix-faulty-asserts-and-skip-test.patch
'67152215e4530a9b1d7349fb20864445fd815288f04ab9e96e45c73b2d87827a',
# PyTorch-1.10.0_fix-test-cond-cpu.patch
'51f83f5d5ef69656ef35b73f17e0671e70113798421be11ea4c7b56ffcc4da03',
# PyTorch-1.10.0_fix-vnni-detection.patch
'1f3664c0febfa2a3fc4c0cd3bae185f289716ac0b6c3d7e8fa1cee19ba62b7cc',
# PyTorch-1.10.0_increase_zero_optimizer_test_tolerance.patch
'e65afb01786f7f030ccb5faada1eb474bb0c418bcadcf1baaa71a4fa2f3f4240',
# PyTorch-1.10.0_skip_failing_ops_tests.patch
'399af94ffcef4a6db5226552c46f11e9b0f0f371b2d7924b9e5764d2281581ab',
# PyTorch-1.10.0_skip_nan_tests_openblas.patch
'7d3f83e3056d9e47a460790313238f28708beb596cafaa7ae55e374d368bbedf',
# PyTorch-1.10.0_skip_cmake_rpath.patch
'ac05943bb205623f91ef140aa00869efc5fe844184bd666bebf5405808610448',
'ac05943bb205623f91ef140aa00869efc5fe844184bd666bebf5405808610448', # PyTorch-1.10.0_skip_cmake_rpath.patch
# PyTorch-1.11.0_increase-distributed-test-timeout.patch
'087ad20163a1291773ae3457569b80523080eb3731e210946459b2333a919f3f',
'8eaca92d64fcadb0552d28e9c7ea5c4bc669d2fe33004e45a3519ce8d0d136a2', # PyTorch-1.11.0_skip_failing_ops_tests.patch
'21fc678febcdfbb9dabd72235be23cd392044e9a954f6580d15b530e1f69dcc1', # PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch
'9a04f4285b800dad8a00c3014af0a9713d40d5dd35d10931c7c0da4e89c558e9', # PyTorch-1.11.0_fix_sharded_imports.patch
# PyTorch-1.11.0_increase_test_tolerances_TF32.patch
'26e179a4f6f57e49209092612ae5f5cd8c03fd2ca84566ba0244eabefc3736ba',
# PyTorch-1.11.0_increase_c10d_gloo_timeout.patch
'20cd4a8663f74ab326fdb032b926bf5c7e94d9750c515ab9050927ba00cf1953',
# PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch
'e7bfe120a8b3fe2b40dac6839852a5fbab3cb3429fbe44a0fc3a1800adaaee51',
]
osdependencies = [OS_PKG_IBVERBS_DEV]
builddependencies = [
('CMake', '3.20.1'), # Needs 3.20 or newer.
('CMake', '3.20.1'),
('hypothesis', '5.41.5'),
]
@ -88,7 +78,6 @@ dependencies = [
('numactl', '2.0.13'),
('FFmpeg', '4.3.1'),
('Pillow', '8.0.1'),
('expecttest', '0.1.3'),
('cuDNN', '8.0.4.30', '-CUDA-%(cudaver)s', True),
('magma', '2.5.4'),
('NCCL', '2.8.3', '-CUDA-%(cudaver)s'),
@ -112,16 +101,17 @@ excluded_tests = {
'test_optim',
        # Tests from this suite time out often. The process group backend is deprecated anyway
# 'distributed/rpc/test_process_group_agent',
        # This test fails consistently when run as part of the test suite, but succeeds when run interactively
'test_model_dump',
]
}
runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s'
# The readelf sanity check can be taken out once the TestRPATH test from https://github.com/pytorch/pytorch/pull/68912
# is accepted, since it is then checked as part of the PyTorch test suite
# The readelf sanity check command can be taken out once the TestRPATH test from
# https://github.com/pytorch/pytorch/pull/68912 is accepted, since it is then checked as part of the PyTorch test suite
local_libcaffe2 = "$EBROOTPYTORCH/lib/python%%(pyshortver)s/site-packages/torch/lib/libcaffe2_nvrtc.%s" % SHLIB_EXT
sanity_check_commands = [
"python -c 'import caffe2.python'",
"readelf -d %s | egrep 'RPATH|RUNPATH' | grep -v stubs" % local_libcaffe2,
]
tests = ['PyTorch-check-cpp-extension.py']

p/PyTorch/PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch View File

@ -0,0 +1,50 @@
# Author: Caspar van Leeuwen
# Company: SURF
# We've seen that these tests fail for version 1.11.0, see https://github.com/pytorch/pytorch/issues/76107
# These failures probably point to underlying issues, but the PR that fixes them touches a ton of files
# It's near-impossible to cherry-pick that without causing other issues. Moreover,
# PyTorch devs have pointed out that nvfuser is not enabled by default in 1.11.0, so chances of anyone
# hitting these issues are very small
# We simply disable the tests and accept that this functionality is broken in PyTorch v1.11.0.
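# A minimal, runnable sketch (hypothetical test class, plain unittest) of the
# pattern the hunks below apply: @unittest.skip replaces the test method
# outright, so it takes effect regardless of the skipIf decorators already
# stacked on these tests.

import unittest

class Demo(unittest.TestCase):
    @unittest.skipIf(False, "would not skip on its own")
    @unittest.skip("Skipping test that is known to fail, see PT #76107")
    def test_known_failure(self):
        self.fail("never reached; unittest reports the test as skipped")

if __name__ == '__main__':
    unittest.main()  # reports: OK (skipped=1)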
diff -Nru pytorch_orig/test/test_jit_cuda_fuser.py pytorch/test/test_jit_cuda_fuser.py
--- pytorch_orig/test/test_jit_cuda_fuser.py 2022-04-29 14:54:30.771378000 +0200
+++ pytorch/test/test_jit_cuda_fuser.py 2022-04-29 14:05:54.067297000 +0200
@@ -1313,6 +1313,12 @@
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
@unittest.skipIf(not TEST_BF16, "device does not support BFloat16")
+ # Disable test, since it fails and nvfuser wasn't enabled by default in 1.11
+ # Thus, even if this points to an underlying issue, it should be extremely rare that
+ # anyone hits it.
+ # See https://github.com/pytorch/pytorch/issues/76107
+ # and https://github.com/easybuilders/easybuild-easyconfigs/pull/15137
+ @unittest.skip("Skipping test that is known to fail, see PT #76107")
def test_native_layer_norm_bfloat(self):
dims = 4
rnds = 3
@@ -2828,6 +2834,12 @@
@unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
+ # Disable test, since it fails and nvfuser wasn't enabled by default in 1.11
+ # Thus, even if this points to an underlying issue, it should be extremely rare that
+ # anyone hits it.
+ # See https://github.com/pytorch/pytorch/issues/76107
+ # and https://github.com/easybuilders/easybuild-easyconfigs/pull/15137
+ @unittest.skip("Skipping test that is known to fail, see PT #76107")
def test_batch_norm_half(self):
with torch.backends.cudnn.flags(enabled=True):
setups = [
@@ -2843,6 +2855,12 @@
@unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
+ # Disable test, since it fails and nvfuser wasn't enabled by default in 1.11
+ # Thus, even if this points to an underlying issue, it should be extremely rare that
+ # anyone hits it.
+ # See https://github.com/pytorch/pytorch/issues/76107
+ # and https://github.com/easybuilders/easybuild-easyconfigs/pull/15137
+ @unittest.skip("Skipping test that is known to fail, see PT #76107")
def test_batch_norm_impl_index_correctness(self):
with torch.backends.cudnn.flags(enabled=True):
batch = [2, 7, 16]

p/PyTorch/PyTorch-1.11.0_fix_sharded_imports.patch View File

@ -0,0 +1,44 @@
# Fixes a "NameError: name 'sharded_tensor' is not defined" error
# for the test_named_params_with_sharded_tensor test
# See https://github.com/pytorch/pytorch/pull/73309
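# The underlying Python subtlety: `import a.b.c` binds only the top-level name
# `a`, so the bare name `sharded_tensor` was never defined in the test module.
# A minimal sketch using only stdlib modules to mirror the fix below:

import os.path            # binds only 'os'; 'path' is reachable as os.path, not as a bare name
try:
    path.join("a", "b")   # NameError: name 'path' is not defined
except NameError as err:
    print(err)
from os import path       # binds 'path' directly, like `from torch.distributed._shard import sharded_tensor`
print(path.join("a", "b"))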
From 012d490ed76d8af8538d310a508b0e09a91b7632 Mon Sep 17 00:00:00 2001
From: wanchaol <wanchaol@devvm3348.frc0.facebook.com>
Date: Wed, 23 Feb 2022 12:10:39 -0800
Subject: [PATCH] [shard] fix some imports in tests
This fixes some imports in sharded optimizer tests
Differential Revision: [D34427252](https://our.internmc.facebook.com/intern/diff/D34427252/)
[ghstack-poisoned]
---
.../_shard/sharded_optim/test_sharded_optim.py | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/test/distributed/_shard/sharded_optim/test_sharded_optim.py b/test/distributed/_shard/sharded_optim/test_sharded_optim.py
index 085c928985eb..d3f1468aea3c 100644
--- a/test/distributed/_shard/sharded_optim/test_sharded_optim.py
+++ b/test/distributed/_shard/sharded_optim/test_sharded_optim.py
@@ -2,7 +2,10 @@
import torch
import torch.optim as optim
-import torch.distributed._shard.sharded_tensor
+from torch.distributed._shard import (
+ sharded_tensor,
+ shard_parameter
+)
from copy import deepcopy
from torch.distributed._shard.sharding_spec import (
@@ -77,8 +80,8 @@ def shard_parameter(self):
],
)
- sharded_tensor.shard_parameter(self.linear1, "weight", rowwise_sharding_spec)
- sharded_tensor.shard_parameter(self.linear2, "weight", colwise_sharding_spec)
+ shard_parameter(self.linear1, "weight", rowwise_sharding_spec)
+ shard_parameter(self.linear2, "weight", colwise_sharding_spec)
def forward(self, inp):
return self.linear2(self.gelu(self.linear1(inp)))

p/PyTorch/PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch View File

@ -0,0 +1,439 @@
# Author: Caspar van Leeuwen
# Company: SURF
# The CudaFuser test suite checks CUDA capabilities, even if 'RUN_CUDA' is false.
# That makes the test fail on non-GPU nodes.
# In this patch, I wrapped the logic in 'if RUN_CUDA' blocks in order to make sure
# no CUDA calls are made when RUN_CUDA=false
# Furthermore, I swapped all occurrences of @unittest.skipIf(not RUN_CUDA, ...) and @unittest.skipIf(is_pre_volta()).
# The latter is the more specific 'skip' condition: you should only check whether a GPU is pre-Volta
# if there are CUDA devices present to begin with. Again, doing this in the wrong order would incur CUDA calls
# on non-CUDA nodes.
# Note that this has been fixed in master, so we probably don't need this patch beyond PT 1.11
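# A minimal sketch of the guard pattern introduced below (RUN_CUDA stands in
# for PyTorch's internal flag): module-level CUDA queries execute at import
# time, so on a CPU-only node they must be fenced off before any device call.

import torch

RUN_CUDA = torch.cuda.is_available()  # safe to call on any node

if RUN_CUDA:
    TEST_BF16 = torch.cuda.is_bf16_supported()
else:
    TEST_BF16 = False  # unguarded, the query above would fail on CPU-only nodes

def is_pre_volta():
    # Only query device properties when a CUDA device actually exists;
    # report "pre-Volta" (and thus skip the test) otherwise.
    if RUN_CUDA:
        prop = torch.cuda.get_device_properties(torch.cuda.current_device())
        return prop.major < 7
    return True

print(is_pre_volta())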
diff -Nru pytorch-1.11.0-rc3.orig/test/test_jit_cuda_fuser.py pytorch-1.11.0-rc3/test/test_jit_cuda_fuser.py
--- pytorch-1.11.0-rc3.orig/test/test_jit_cuda_fuser.py 2022-02-24 18:06:55.180421593 +0100
+++ pytorch-1.11.0-rc3/test/test_jit_cuda_fuser.py 2022-02-25 13:30:47.112845480 +0100
@@ -57,18 +57,25 @@
torch._C._jit_set_nvfuser_horizontal_mode(old_value)
def is_pre_volta():
- prop = torch.cuda.get_device_properties(torch.cuda.current_device())
- return prop.major < 7
-
-TEST_BF16 = torch.cuda.is_bf16_supported()
+ if RUN_CUDA:
+ prop = torch.cuda.get_device_properties(torch.cuda.current_device())
+ return prop.major < 7
+ else:
+ return True
+
+if RUN_CUDA:
+ TEST_BF16 = torch.cuda.is_bf16_supported()
+else:
+ TEST_BF16=False
class TestCudaFuser(JitTestCase):
- special_values = torch.tensor(
- [float("-inf"), -10, -math.pi,
- -1, -0.5, 0, 1, 0.5,
- math.pi, 10, float("inf"),
- float("nan")], dtype=torch.float, device='cuda')
+ if RUN_CUDA:
+ special_values = torch.tensor(
+ [float("-inf"), -10, -math.pi,
+ -1, -0.5, 0, 1, 0.5,
+ math.pi, 10, float("inf"),
+ float("nan")], dtype=torch.float, device='cuda')
int_types = [
torch.int8,
@@ -253,8 +260,8 @@
self.assertEqual(o, jit_o)
self.assertGraphContains(t_jit.graph_for(x, y, z, q), FUSION_GUARD)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_reduction_dtypes_axis(self):
@@ -1120,8 +1127,8 @@
self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4))
self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_reduction(self):
@@ -1170,8 +1177,8 @@
FileCheck().check(FUSION_GUARD).run(g)
FileCheck().check(FUSION_GUARD).run(v2.graph)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_layer_norm_autodiff(self):
@@ -1212,8 +1219,8 @@
args.append(torch.randn(shapes, dtype=torch.float32, device="cuda").requires_grad_())
self._layer_norm_autodiff_helper(m, grad, shapes, args)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_layer_norm_parser(self):
@@ -1273,8 +1280,8 @@
self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD)
@unittest.skipIf(True, "codegen failure awaiting fix")
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_native_layer_norm(self):
@@ -1288,8 +1295,8 @@
self._native_layer_norm_helper(input_shape, norm_shape, torch.float32, "cuda", 1e-4, affine)
@unittest.skipIf(True, "codegen failure awaiting fix")
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_native_layer_norm_half(self):
@@ -1301,8 +1308,8 @@
norm_shape = [input_shape[idx] for idx in range(dims - offset, dims)]
self._native_layer_norm_helper(input_shape, norm_shape, torch.float16, "cuda", 5e-3)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
@unittest.skipIf(not TEST_BF16, "device does not support BFloat16")
@@ -1362,8 +1369,8 @@
self.assertTrue(self._compare("comparing running_var failed", eager_running_var, jit_running_var, error))
self.assertGraphContains(t_jit.graph_for(x, running_mean, running_var), FUSION_GUARD)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_norm_channels_last(self):
@@ -1374,8 +1381,8 @@
for mf in [torch.channels_last, torch.contiguous_format]:
self._norm_helper(size, torch.float32, "cuda", 1e-4, is_batch_norm_else_instance_norm, memory_format=mf)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_norm(self):
@@ -1391,8 +1398,8 @@
x[1] = C
self._norm_helper(x, torch.float32, "cuda", 1e-4, is_batch_norm_else_instance_norm)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_norm_large(self):
@@ -1407,8 +1414,8 @@
x[1] = C
self._norm_helper(x, torch.float32, "cuda", 1e-4, is_batch_norm_else_instance_norm)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_norm_half(self):
@@ -1424,8 +1431,8 @@
x[1] = C
self._norm_helper(x, torch.float16, "cuda", 5e-3, is_batch_norm_else_instance_norm)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
@unittest.skipIf(not TEST_BF16, "device does not support BFloat16")
@@ -1469,8 +1476,8 @@
self.assertTrue(self._compare("comparing output failed", o, jit_o, error))
self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_softmax_dtype(self):
@@ -1511,8 +1518,8 @@
)[0].graph
FileCheck().check(FUSION_GUARD).run(bwd_graph)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test__softmax_function(self):
@@ -1535,8 +1542,8 @@
self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-3))
self.assertGraphContainsExactly(t_jit.graph_for(x, y), FUSION_GUARD, 1, consider_subgraphs=True)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test__softmax_function_half_to_float(self):
@@ -1559,8 +1566,8 @@
self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-3))
self.assertGraphContainsExactly(t_jit.graph_for(x, y), FUSION_GUARD, 1, consider_subgraphs=True)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_softmax(self):
@@ -1575,8 +1582,8 @@
x[reduction_dim] = reduction_size
self._softmax_helper(x, reduction_dim, torch.float32, "cuda", 1e-4)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_softmax_half(self):
@@ -1591,8 +1598,8 @@
x[reduction_dim] = reduction_size
self._softmax_helper(x, reduction_dim, torch.float16, "cuda", 5e-3)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
@unittest.skipIf(not TEST_BF16, "device does not support BFloat16")
@@ -1608,8 +1615,8 @@
x[reduction_dim] = reduction_size
self._softmax_helper(x, reduction_dim, torch.bfloat16, "cuda", 1e-1)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_reduction_permutation(self):
@@ -1622,8 +1629,8 @@
for perm1 in itertools.permutations(range(len(x))):
self._reduction_helper(x, axes, torch.float32, "cuda", perm0, perm1)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_reduction_multiple_output(self):
@@ -1767,8 +1774,8 @@
self.assertEqual(o, jit_o)
'''
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_pw_single_reduction_partition(self):
@@ -1792,8 +1799,8 @@
self.assertEqual(o, jit_o)
self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_permutation_preservation(self):
@@ -1830,8 +1837,8 @@
self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD)
self.assertTrue(jit_o.is_contiguous(memory_format=torch.channels_last))
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_normalization_partition(self):
@@ -1858,8 +1865,8 @@
self.assertEqual(o, jit_o)
self.assertGraphContains(t_jit.graph_for(x, y, z, r_m, r_v), FUSION_GUARD)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_sum_to_one(self):
@@ -1879,8 +1886,8 @@
self.assertEqual(o, jit_o)
self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_single_reduction_broadcast(self):
@@ -1903,8 +1910,8 @@
self.assertEqual(o, jit_o)
self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_trivial_reduction(self):
@@ -1940,8 +1947,8 @@
repro_jit = torch.jit.script(repro)
self._run_helper(repro_jit, repro, x, 0.6)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_reduction_sizes_op(self):
@@ -1964,8 +1971,8 @@
# have been optimized away
self.assertGraphContainsExactly(t_jit.graph_for(x, y), FUSION_GUARD, 0)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_profile_ivalue(self):
@@ -1987,8 +1994,8 @@
self.assertEqual(o, jit_o)
self.assertGraphContains(t_jit.graph_for(x, y, (0, 1), False), FUSION_GUARD)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_sum_to_size(self):
@@ -2021,8 +2028,8 @@
self.assertEqual(o.dtype, jit_o.dtype)
self.assertEqual(o, jit_o)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_grad_sum_to_size(self):
@@ -2145,8 +2152,8 @@
self.assertTrue((percent_zeros >= (prob - 0.01)) and (percent_zeros <= (prob + 0.01)))
self.assertGraphContainsExactly(t_jit.graph_for(x, prob, True), FUSION_GUARD, 1, consider_subgraphs=True)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_dropout_training_fusion(self):
@@ -2294,8 +2301,8 @@
self.assertEqual(x.grad.dtype, x.dtype)
self.assertEqual(y.grad.dtype, y.dtype)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_autocast_1(self):
@@ -2331,8 +2338,8 @@
self.assertEqual(x.grad.dtype, x.dtype)
self.assertEqual(y.grad.dtype, y.dtype)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_autocast_2(self):
@@ -2367,8 +2374,8 @@
self.assertEqual(jit_o.dtype, torch.float)
self.assertEqual(x.grad.dtype, x.dtype)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
@unittest.skipIf(not TEST_BF16, "device does not support BFloat16")
@@ -2405,8 +2412,8 @@
self.assertEqual(x.grad.dtype, x.dtype)
self.assertEqual(y.grad.dtype, y.dtype)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
@unittest.skipIf(not TEST_BF16, "device does not support BFloat16")
@@ -2817,8 +2824,8 @@
ref_module.bn.running_var,
e0))
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_batch_norm_half(self):
@@ -2832,8 +2839,8 @@
training, track_running_stats = training_and_track
self._test_batch_norm_impl_index_helper(4, 8, 5, affine, track_running_stats, training, torch.half)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_batch_norm_impl_index_correctness(self):
@@ -2947,8 +2954,8 @@
self.assertGraphContainsExactly(graph, FUSION_GROUP, 0)
self.assertGraphContains(graph, 'prim::add_optional', True)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_remove_output_used_only_in_dtype(self):
@@ -2980,8 +2987,8 @@
graph = jitted.graph_for(x, y)
self.assertGraphContains(graph, FUSION_GROUP, True)
- @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(not RUN_CUDA, "requires CUDA")
+ @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
"Requires fusion optimization pass to be effective")
def test_fix_shape_expression_bn(self):

p/PyTorch/PyTorch-1.11.0_increase-distributed-test-timeout.patch View File

@ -0,0 +1,17 @@
It seems the timeout for the distributed tests is set too low, and spurious failures can be seen
Increase it by a factor of 6, similar to torch/testing/_internal/distributed/distributed_test.py
Original patch by Alexander Grund (TU Dresden), updated by Caspar van Leeuwen (SURF)
diff -Nru pytorch-1.11.0-rc3.orig/torch/testing/_internal/common_distributed.py pytorch-1.11.0-rc3/torch/testing/_internal/common_distributed.py
--- pytorch-1.11.0-rc3.orig/torch/testing/_internal/common_distributed.py 2022-02-24 18:07:16.414274654 +0100
+++ pytorch-1.11.0-rc3/torch/testing/_internal/common_distributed.py 2022-02-24 18:08:31.772851148 +0100
@@ -321,7 +321,7 @@
# TSAN runs much slower.
TIMEOUT_DEFAULT = 500
else:
- TIMEOUT_DEFAULT = 100
+ TIMEOUT_DEFAULT = 600
TIMEOUT_OVERRIDE = {"test_ddp_uneven_inputs": 400}

p/PyTorch/PyTorch-1.11.0_increase_c10d_gloo_timeout.patch View File

@ -0,0 +1,16 @@
# Author: Caspar van Leeuwen
# Institute: SURF
# Increase timeout for c10d gloo process group operations since test_allreduce_coalesced_basics
# was failing with a timeout (see https://github.com/easybuilders/easybuild-easyconfigs/pull/15137)
diff -Nru pytorch/test/distributed/test_c10d_gloo.py pytorch_orig/test/distributed/test_c10d_gloo.py
--- pytorch/test/distributed/test_c10d_gloo.py 2022-04-19 15:27:48.540163735 +0200
+++ pytorch_orig/test/distributed/test_c10d_gloo.py 2022-04-07 18:31:13.110755000 +0200
@@ -216,7 +216,7 @@
def opts(self, threads=2):
opts = c10d.ProcessGroupGloo._Options()
- opts._timeout = 5.0
+ opts._timeout = 50.0
opts._devices = [create_device(interface=LOOPBACK)]
opts._threads = threads
return opts

p/PyTorch/PyTorch-1.11.0_increase_test_tolerances_TF32.patch View File

@ -0,0 +1,143 @@
# Author: Caspar van Leeuwen, SURF
# Fixes failing tests due to use of TensorFloat32
# Setting NVIDIA_TF32_OVERRIDE=0 makes these tests pass, proving that TensorFloat32 is the issue
# We increase tolerances for the asserts to make these tests pass
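# A minimal sketch (assumes an Ampere-class GPU, where TF32 is on by default in
# PyTorch 1.11) of the effect being compensated for: TF32 matmuls truncate the
# float32 mantissa, so bitwise-exact comparisons fail while tolerance-based
# comparisons with rtol/atol like those below pass.

import torch

a = torch.randn(1024, 1024, device='cuda')
b = torch.randn(1024, 1024, device='cuda')

torch.backends.cuda.matmul.allow_tf32 = True    # TF32 path
fast = a @ b
torch.backends.cuda.matmul.allow_tf32 = False   # full-precision FP32, like NVIDIA_TF32_OVERRIDE=0
exact = a @ b

print(torch.equal(fast, exact))                            # typically False under TF32
print(torch.allclose(fast, exact, rtol=0.02, atol=1e-03))  # True with relaxed tolerances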
diff -Nru pytorch_orig/test/distributed/_shard/sharded_tensor/ops/test_linear.py pytorch/test/distributed/_shard/sharded_tensor/ops/test_linear.py
--- pytorch_orig/test/distributed/_shard/sharded_tensor/ops/test_linear.py 2022-04-07 18:31:13.069599000 +0200
+++ pytorch/test/distributed/_shard/sharded_tensor/ops/test_linear.py 2022-04-07 18:32:32.877406000 +0200
@@ -77,7 +77,7 @@
local_output = local_linear(inp)
# Verify
- self.assertEqual(local_output, sharded_output)
+ self.assertEqual(local_output, sharded_output, rtol=0.02, atol=1e-03)
# Validate for torch.nn.functional.linear version.
local_output = torch.nn.functional.linear(
@@ -91,7 +91,7 @@
# for reshard. We need to squeeze the # of dimensions manually.
if inp.dim() == 1:
sharded_output = sharded_output.squeeze(reshard_spec.dim)
- self.assertEqual(local_output, sharded_output)
+ self.assertEqual(local_output, sharded_output, rtol=0.02, atol=1e-03)
# Compute loss and run backward pass.
local_output.sum().backward()
@@ -114,7 +114,7 @@
# Test backward gradient calculation.
self.assertEqual(sharded_linear.bias.grad, local_bias_grad)
- self.assertEqual(sharded_weight.grad, local_grad_narrowed)
+ self.assertEqual(sharded_weight.grad, local_grad_narrowed, rtol=0.01, atol=1e-03)
# Test optimizer.
previous = local_linear.weight.clone().detach()
@@ -135,7 +135,7 @@
)
self.assertEqual(sharded_weight.size(), local_weight_narrowed.size())
self.assertNotEqual(previous_sharded_weight, sharded_weight)
- self.assertEqual(sharded_weight, local_weight_narrowed)
+ self.assertEqual(sharded_weight, local_weight_narrowed, rtol=0.01, atol=1e-04)
self.assertNotEqual(previous_sharded_bias, sharded_linear.bias)
self.assertEqual(sharded_linear.bias, local_linear.bias)
diff -Nru pytorch_orig/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py pytorch/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py
--- pytorch_orig/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py 2022-04-07 18:31:13.091710000 +0200
+++ pytorch/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py 2022-04-07 18:41:03.744644000 +0200
@@ -113,7 +113,7 @@
local_output = local_megatron_lm(inp)
# Verify
- self.assertEqual(local_output, sharded_output)
+ self.assertEqual(local_output, sharded_output, rtol=0.01, atol=1e-03)
# Compute loss and run backward pass.
local_output.sum().backward()
@@ -161,9 +161,9 @@
)
# Test backward gradient calculation.
- self.assertEqual(sharded_weight_fc1.grad, local_grad_narrowed_fc1)
- self.assertEqual(sharded_weight_fc2.grad, local_grad_narrowed_fc2)
- self.assertEqual(bias_grad_fc1, local_bias_grad_fc1)
+ self.assertEqual(sharded_weight_fc1.grad, local_grad_narrowed_fc1, rtol=0.01, atol=2e-03)
+ self.assertEqual(sharded_weight_fc2.grad, local_grad_narrowed_fc2, rtol=0.01, atol=1e-03)
+ self.assertEqual(bias_grad_fc1, local_bias_grad_fc1, rtol=0.01, atol=2e-02)
self.assertEqual(bias_grad_fc2, local_bias_grad_fc2)
# Test optimizer.
@@ -171,7 +171,7 @@
local_bias_fc1, local_bias_fc2 = _get_bias(local_megatron_lm)
self.assertEqual(bias_fc1, local_bias_fc1)
self.assertEqual(bias_fc2, local_bias_fc2)
- self.assertEqual(bias_fc1.grad, local_bias_fc1.grad)
+ self.assertEqual(bias_fc1.grad, local_bias_fc1.grad, rtol=0.01, atol=2e-02)
self.assertEqual(bias_fc2.grad, local_bias_fc2.grad)
previous_sharded_weight_fc1 = sharded_weight_fc1.clone()
previous_sharded_weight_fc2 = sharded_weight_fc2.clone()
@@ -197,13 +197,13 @@
self.assertEqual(sharded_weight_fc2.size(), local_weight_fc2_narrowed.size())
self.assertNotEqual(previous_sharded_weight_fc1, sharded_weight_fc1)
self.assertNotEqual(previous_sharded_weight_fc2, sharded_weight_fc2)
- self.assertEqual(sharded_weight_fc1, local_weight_fc1_narrowed)
- self.assertEqual(sharded_weight_fc2, local_weight_fc2_narrowed)
+ self.assertEqual(sharded_weight_fc1, local_weight_fc1_narrowed, rtol=0.01, atol=1e-03)
+ self.assertEqual(sharded_weight_fc2, local_weight_fc2_narrowed, rtol=0.01, atol=1e-03)
# Test bias value after optimizer.
local_bias_fc1, local_bias_fc2 = _get_bias(local_megatron_lm)
self.assertNotEqual(previous_bias_fc1, bias_fc1)
- self.assertEqual(bias_fc1, local_bias_fc1)
+ self.assertEqual(bias_fc1, local_bias_fc1, rtol=0.01, atol=1e-03)
self.assertNotEqual(previous_bias_fc2, bias_fc2)
self.assertEqual(bias_fc2, local_bias_fc2)
diff -Nru pytorch_orig/test/test_stateless.py pytorch/test/test_stateless.py
--- pytorch_orig/test/test_stateless.py 2022-04-07 18:31:13.029968000 +0200
+++ pytorch/test/test_stateless.py 2022-04-07 18:43:46.723968000 +0200
@@ -42,7 +42,7 @@
# existing params in module. So here we expect the result to be the
# same as the input if the weight swapping went well.
res = _stateless.functional_call(module, parameters, x)
- self.assertEqual(x, res)
+ self.assertEqual(x, res, rtol=1e-04, atol=1e-04)
# check that the weight remain unmodified
cur_weight = to_check.l1.weight
cur_buffer = to_check.buffer
diff -Nru pytorch_orig/test/test_jit_fuser_te.py pytorch/test/test_jit_fuser_te.py
--- pytorch_orig/test/test_jit_fuser_te.py 2022-04-07 18:31:13.046680000 +0200
+++ pytorch/test/test_jit_fuser_te.py 2022-04-12 18:21:00.355114000 +0200
@@ -956,7 +956,7 @@
def test_lstm_traced(self):
for device in self.devices:
inputs = get_lstm_inputs(device)
- ge = self.checkTrace(LSTMCellF, inputs)
+ ge = self.checkTrace(LSTMCellF, inputs, atol=1e-4, rtol=1e-5)
graph = ge.graph_for(*inputs)
fusion_groups = self.findFusionGroups(graph)
# TODO: chunk
diff -Nru pytorch_orig/torch/testing/_internal/jit_utils.py pytorch/torch/testing/_internal/jit_utils.py
--- pytorch_orig/torch/testing/_internal/jit_utils.py 2022-04-07 18:28:54.339477000 +0200
+++ pytorch/torch/testing/_internal/jit_utils.py 2022-04-12 18:19:59.614272000 +0200
@@ -525,7 +525,7 @@
def checkTrace(self, func, reference_tensors, input_tensors=None,
drop=None, allow_unused=False, verbose=False,
inputs_require_grads=True, check_tolerance=1e-5, export_import=True,
- _force_outplace=False):
+ _force_outplace=False, rtol=None, atol=None):
# TODO: check gradients for parameters, not just inputs
def allSum(vs):
@@ -618,7 +618,10 @@
self.assertEqual(outputs, outputs_ge)
if inputs_require_grads:
- self.assertEqual(grads, grads_ge)
+ if atol is not None and rtol is not None:
+ self.assertEqual(grads, grads_ge, atol=atol, rtol=rtol)
+ else:
+ self.assertEqual(grads, grads_ge)
for g2, g2_ge in zip(grads2, grads2_ge):
if g2 is None and g2_ge is None:
continue

p/PyTorch/PyTorch-1.11.0_skip_failing_ops_tests.patch View File

@ -0,0 +1,35 @@
# Author: Caspar van Leeuwen
# Company: SURF
# Tests 'test_fn_grad_linalg_det_singular_cpu_complex128' and 'test_variant_consistency_jit_contiguous_cpu_float32' fail
# See https://github.com/pytorch/pytorch/issues/67767 and https://github.com/pytorch/pytorch/issues/67838
# For the first one, devs recommended switching it off while they revisit the code.
# For the second: the test works interactively when run with
# python -m unittest test_ops.TestJitCPU.test_variant_consistency_jit_contiguous_cpu_float32 -v
# This shows there is no fundamental problem with the installation,
# but something in the environment when run as 'python run_test.py' makes it fail.
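# The hunks below use PyTorch's internal DecorateInfo helper, which attaches a
# decorator to one named test of a generated test class. A rough stdlib-only
# analogue (apply_skips is a hypothetical helper, not PyTorch API):

import unittest

def apply_skips(cls, skips):
    # Wrap each named test method with the given decorator, e.g. unittest.skip.
    for decorator, test_name in skips:
        setattr(cls, test_name, decorator(getattr(cls, test_name)))
    return cls

class TestJitCPU(unittest.TestCase):
    def test_variant_consistency_jit_contiguous_cpu_float32(self):
        self.fail("fails only under run_test.py, see pytorch/pytorch#67838")

apply_skips(TestJitCPU, [
    (unittest.skip("Skipped!"), 'test_variant_consistency_jit_contiguous_cpu_float32'),
])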
diff -Nru pytorch-1.11.0-rc3.orig/torch/testing/_internal/common_methods_invocations.py pytorch-1.11.0-rc3/torch/testing/_internal/common_methods_invocations.py
--- pytorch-1.11.0-rc3.orig/torch/testing/_internal/common_methods_invocations.py 2022-02-24 18:07:16.430276050 +0100
+++ pytorch-1.11.0-rc3/torch/testing/_internal/common_methods_invocations.py 2022-02-24 19:38:11.610293957 +0100
@@ -8791,7 +8791,10 @@
supports_fwgrad_bwgrad=True,
autodiff_fusible_nodes=['aten::contiguous'],
assert_jit_shape_analysis=True,
- supports_out=False),
+ supports_out=False,
+ skips=(
+ DecorateInfo(unittest.skip("Skipped!"), 'TestJit', 'test_variant_consistency_jit', device_type='cpu'),
+ )),
OpInfo('sum_to_size',
op=lambda x, *args, **kwargs: x.sum_to_size(*args, **kwargs),
dtypes=floating_and_complex_types_and(torch.float16, torch.bfloat16),
@@ -9746,6 +9749,10 @@
DecorateInfo(unittest.skip("Skipped!"), 'TestMathBits', 'test_neg_view', device_type='cuda'),
DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_dtypes'),
DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_gradgrad'),
+ # It also breaks on CPU. We'll revisit this once `linalg.lu_solve` is a thing
+ # See https://github.com/pytorch/pytorch/pull/64387 and https://github.com/pytorch/pytorch/issues/67767
+ DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_grad',
+ dtypes=(torch.complex128,)),
)),
OpInfo('linalg.cholesky',
aten_name='linalg_cholesky',