easyconfigs-it4i/p/PyTorch/PyTorch-1.11.0_increase_test_tolerances_TF32.patch
Jakub Kropacek 094a092a73 new file: h/HyperQueue/HyperQueue-0.10.0.eb
new file:   p/PyTorch/PyTorch-1.11.0-foss-2021a-CUDA-11.3.1.eb
	modified:   p/PyTorch/PyTorch-1.11.0-fosscuda-2020b.eb
	new file:   p/PyTorch/PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch
	new file:   p/PyTorch/PyTorch-1.11.0_fix_sharded_imports.patch
	new file:   p/PyTorch/PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch
	new file:   p/PyTorch/PyTorch-1.11.0_increase-distributed-test-timeout.patch
	new file:   p/PyTorch/PyTorch-1.11.0_increase_c10d_gloo_timeout.patch
	new file:   p/PyTorch/PyTorch-1.11.0_increase_test_tolerances_TF32.patch
	new file:   p/PyTorch/PyTorch-1.11.0_skip_failing_ops_tests.patch
2022-05-23 11:13:06 +02:00

144 lines
7.5 KiB
Diff

# Author: Caspar van Leeuwen, SURF
# Fixes failing tests due to use of TensorFloat32
# Setting NVIDIA_TF32_OVERRIDE=0 makes these tests pass, proving that TensorFloat32 is the issue
# We increase tolerances for the asserts to make these tests pass
diff -Nru pytorch_orig/test/distributed/_shard/sharded_tensor/ops/test_linear.py pytorch/test/distributed/_shard/sharded_tensor/ops/test_linear.py
--- pytorch_orig/test/distributed/_shard/sharded_tensor/ops/test_linear.py 2022-04-07 18:31:13.069599000 +0200
+++ pytorch/test/distributed/_shard/sharded_tensor/ops/test_linear.py 2022-04-07 18:32:32.877406000 +0200
@@ -77,7 +77,7 @@
local_output = local_linear(inp)
# Verify
- self.assertEqual(local_output, sharded_output)
+ self.assertEqual(local_output, sharded_output, rtol=0.02, atol=1e-03)
# Validate for torch.nn.functional.linear version.
local_output = torch.nn.functional.linear(
@@ -91,7 +91,7 @@
# for reshard. We need to squeeze the # of dimensions manually.
if inp.dim() == 1:
sharded_output = sharded_output.squeeze(reshard_spec.dim)
- self.assertEqual(local_output, sharded_output)
+ self.assertEqual(local_output, sharded_output, rtol=0.02, atol=1e-03)
# Compute loss and run backward pass.
local_output.sum().backward()
@@ -114,7 +114,7 @@
# Test backward gradient calculation.
self.assertEqual(sharded_linear.bias.grad, local_bias_grad)
- self.assertEqual(sharded_weight.grad, local_grad_narrowed)
+ self.assertEqual(sharded_weight.grad, local_grad_narrowed, rtol=0.01, atol=1e-03)
# Test optimizer.
previous = local_linear.weight.clone().detach()
@@ -135,7 +135,7 @@
)
self.assertEqual(sharded_weight.size(), local_weight_narrowed.size())
self.assertNotEqual(previous_sharded_weight, sharded_weight)
- self.assertEqual(sharded_weight, local_weight_narrowed)
+ self.assertEqual(sharded_weight, local_weight_narrowed, rtol=0.01, atol=1e-04)
self.assertNotEqual(previous_sharded_bias, sharded_linear.bias)
self.assertEqual(sharded_linear.bias, local_linear.bias)
diff -Nru pytorch_orig/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py pytorch/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py
--- pytorch_orig/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py 2022-04-07 18:31:13.091710000 +0200
+++ pytorch/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py 2022-04-07 18:41:03.744644000 +0200
@@ -113,7 +113,7 @@
local_output = local_megatron_lm(inp)
# Verify
- self.assertEqual(local_output, sharded_output)
+ self.assertEqual(local_output, sharded_output, rtol=0.01, atol=1e-03)
# Compute loss and run backward pass.
local_output.sum().backward()
@@ -161,9 +161,9 @@
)
# Test backward gradient calculation.
- self.assertEqual(sharded_weight_fc1.grad, local_grad_narrowed_fc1)
- self.assertEqual(sharded_weight_fc2.grad, local_grad_narrowed_fc2)
- self.assertEqual(bias_grad_fc1, local_bias_grad_fc1)
+ self.assertEqual(sharded_weight_fc1.grad, local_grad_narrowed_fc1, rtol=0.01, atol=2e-03)
+ self.assertEqual(sharded_weight_fc2.grad, local_grad_narrowed_fc2, rtol=0.01, atol=1e-03)
+ self.assertEqual(bias_grad_fc1, local_bias_grad_fc1, rtol=0.01, atol=2e-02)
self.assertEqual(bias_grad_fc2, local_bias_grad_fc2)
# Test optimizer.
@@ -171,7 +171,7 @@
local_bias_fc1, local_bias_fc2 = _get_bias(local_megatron_lm)
self.assertEqual(bias_fc1, local_bias_fc1)
self.assertEqual(bias_fc2, local_bias_fc2)
- self.assertEqual(bias_fc1.grad, local_bias_fc1.grad)
+ self.assertEqual(bias_fc1.grad, local_bias_fc1.grad, rtol=0.01, atol=2e-02)
self.assertEqual(bias_fc2.grad, local_bias_fc2.grad)
previous_sharded_weight_fc1 = sharded_weight_fc1.clone()
previous_sharded_weight_fc2 = sharded_weight_fc2.clone()
@@ -197,13 +197,13 @@
self.assertEqual(sharded_weight_fc2.size(), local_weight_fc2_narrowed.size())
self.assertNotEqual(previous_sharded_weight_fc1, sharded_weight_fc1)
self.assertNotEqual(previous_sharded_weight_fc2, sharded_weight_fc2)
- self.assertEqual(sharded_weight_fc1, local_weight_fc1_narrowed)
- self.assertEqual(sharded_weight_fc2, local_weight_fc2_narrowed)
+ self.assertEqual(sharded_weight_fc1, local_weight_fc1_narrowed, rtol=0.01, atol=1e-03)
+ self.assertEqual(sharded_weight_fc2, local_weight_fc2_narrowed, rtol=0.01, atol=1e-03)
# Test bias value after optimizer.
local_bias_fc1, local_bias_fc2 = _get_bias(local_megatron_lm)
self.assertNotEqual(previous_bias_fc1, bias_fc1)
- self.assertEqual(bias_fc1, local_bias_fc1)
+ self.assertEqual(bias_fc1, local_bias_fc1, rtol=0.01, atol=1e-03)
self.assertNotEqual(previous_bias_fc2, bias_fc2)
self.assertEqual(bias_fc2, local_bias_fc2)
diff -Nru pytorch_orig/test/test_stateless.py pytorch/test/test_stateless.py
--- pytorch_orig/test/test_stateless.py 2022-04-07 18:31:13.029968000 +0200
+++ pytorch/test/test_stateless.py 2022-04-07 18:43:46.723968000 +0200
@@ -42,7 +42,7 @@
# existing params in module. So here we expect the result to be the
# same as the input if the weight swapping went well.
res = _stateless.functional_call(module, parameters, x)
- self.assertEqual(x, res)
+ self.assertEqual(x, res, rtol=1e-04, atol=1e-04)
# check that the weight remain unmodified
cur_weight = to_check.l1.weight
cur_buffer = to_check.buffer
diff -Nru pytorch_orig/test/test_jit_fuser_te.py pytorch/test/test_jit_fuser_te.py
--- pytorch_orig/test/test_jit_fuser_te.py 2022-04-07 18:31:13.046680000 +0200
+++ pytorch/test/test_jit_fuser_te.py 2022-04-12 18:21:00.355114000 +0200
@@ -956,7 +956,7 @@
def test_lstm_traced(self):
for device in self.devices:
inputs = get_lstm_inputs(device)
- ge = self.checkTrace(LSTMCellF, inputs)
+ ge = self.checkTrace(LSTMCellF, inputs, atol=1e-4, rtol=1e-5)
graph = ge.graph_for(*inputs)
fusion_groups = self.findFusionGroups(graph)
# TODO: chunk
diff -Nru pytorch_orig/torch/testing/_internal/jit_utils.py pytorch/torch/testing/_internal/jit_utils.py
--- pytorch_orig/torch/testing/_internal/jit_utils.py 2022-04-07 18:28:54.339477000 +0200
+++ pytorch/torch/testing/_internal/jit_utils.py 2022-04-12 18:19:59.614272000 +0200
@@ -525,7 +525,7 @@
def checkTrace(self, func, reference_tensors, input_tensors=None,
drop=None, allow_unused=False, verbose=False,
inputs_require_grads=True, check_tolerance=1e-5, export_import=True,
- _force_outplace=False):
+ _force_outplace=False, rtol=None, atol=None):
# TODO: check gradients for parameters, not just inputs
def allSum(vs):
@@ -618,7 +618,10 @@
self.assertEqual(outputs, outputs_ge)
if inputs_require_grads:
- self.assertEqual(grads, grads_ge)
+ if atol is not None and rtol is not None:
+ self.assertEqual(grads, grads_ge, atol=atol, rtol=rtol)
+ else:
+ self.assertEqual(grads, grads_ge)
for g2, g2_ge in zip(grads2, grads2_ge):
if g2 is None and g2_ge is None:
continue