Mirror of https://code.it4i.cz/sccs/easyconfigs-it4i.git (synced 2025-04-11 01:12:11 +01:00)

new file:   p/PyTorch/PyTorch-1.11.0-foss-2021a-CUDA-11.3.1.eb
modified:   p/PyTorch/PyTorch-1.11.0-fosscuda-2020b.eb
new file:   p/PyTorch/PyTorch-1.11.0_disable_failing_jit_cuda_fuser_tests.patch
new file:   p/PyTorch/PyTorch-1.11.0_fix_sharded_imports.patch
new file:   p/PyTorch/PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch
new file:   p/PyTorch/PyTorch-1.11.0_increase-distributed-test-timeout.patch
new file:   p/PyTorch/PyTorch-1.11.0_increase_c10d_gloo_timeout.patch
new file:   p/PyTorch/PyTorch-1.11.0_increase_test_tolerances_TF32.patch
new file:   p/PyTorch/PyTorch-1.11.0_skip_failing_ops_tests.patch
144 lines
7.5 KiB
# Author: Caspar van Leeuwen, SURF
# Fixes failing tests due to use of TensorFloat32
# Setting NVIDIA_TF32_OVERRIDE=0 makes these tests pass, proving that TensorFloat32 is the issue
# We increase tolerances for the asserts to make these tests pass
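#
# Background check (illustrative only, not applied by this patch): on Ampere GPUs the
# TF32 issue can be confirmed by disabling TF32 before running a failing test, either
# via the NVIDIA override mentioned above, e.g.
#
#   NVIDIA_TF32_OVERRIDE=0 python test/test_stateless.py
#
# or from Python, using PyTorch's own TF32 switches:
#
#   import torch
#   torch.backends.cuda.matmul.allow_tf32 = False  # matmuls run in full FP32 again
#   torch.backends.cudnn.allow_tf32 = False        # cuDNN convolutions likewise
#
# The rtol/atol values chosen below are roughly consistent with TF32's ~10-bit
# mantissa (relative error on the order of 1e-3) rather than full FP32 precision.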
diff -Nru pytorch_orig/test/distributed/_shard/sharded_tensor/ops/test_linear.py pytorch/test/distributed/_shard/sharded_tensor/ops/test_linear.py
--- pytorch_orig/test/distributed/_shard/sharded_tensor/ops/test_linear.py 2022-04-07 18:31:13.069599000 +0200
+++ pytorch/test/distributed/_shard/sharded_tensor/ops/test_linear.py 2022-04-07 18:32:32.877406000 +0200
@@ -77,7 +77,7 @@
local_output = local_linear(inp)

# Verify
- self.assertEqual(local_output, sharded_output)
+ self.assertEqual(local_output, sharded_output, rtol=0.02, atol=1e-03)

# Validate for torch.nn.functional.linear version.
local_output = torch.nn.functional.linear(
@@ -91,7 +91,7 @@
# for reshard. We need to squeeze the # of dimensions manually.
if inp.dim() == 1:
sharded_output = sharded_output.squeeze(reshard_spec.dim)
- self.assertEqual(local_output, sharded_output)
+ self.assertEqual(local_output, sharded_output, rtol=0.02, atol=1e-03)

# Compute loss and run backward pass.
local_output.sum().backward()
@@ -114,7 +114,7 @@

# Test backward gradient calculation.
self.assertEqual(sharded_linear.bias.grad, local_bias_grad)
- self.assertEqual(sharded_weight.grad, local_grad_narrowed)
+ self.assertEqual(sharded_weight.grad, local_grad_narrowed, rtol=0.01, atol=1e-03)

# Test optimizer.
previous = local_linear.weight.clone().detach()
@@ -135,7 +135,7 @@
)
self.assertEqual(sharded_weight.size(), local_weight_narrowed.size())
self.assertNotEqual(previous_sharded_weight, sharded_weight)
- self.assertEqual(sharded_weight, local_weight_narrowed)
+ self.assertEqual(sharded_weight, local_weight_narrowed, rtol=0.01, atol=1e-04)
self.assertNotEqual(previous_sharded_bias, sharded_linear.bias)
self.assertEqual(sharded_linear.bias, local_linear.bias)

diff -Nru pytorch_orig/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py pytorch/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py
--- pytorch_orig/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py 2022-04-07 18:31:13.091710000 +0200
+++ pytorch/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py 2022-04-07 18:41:03.744644000 +0200
@@ -113,7 +113,7 @@
local_output = local_megatron_lm(inp)

# Verify
- self.assertEqual(local_output, sharded_output)
+ self.assertEqual(local_output, sharded_output, rtol=0.01, atol=1e-03)

# Compute loss and run backward pass.
local_output.sum().backward()
@@ -161,9 +161,9 @@
)

# Test backward gradient calculation.
- self.assertEqual(sharded_weight_fc1.grad, local_grad_narrowed_fc1)
- self.assertEqual(sharded_weight_fc2.grad, local_grad_narrowed_fc2)
- self.assertEqual(bias_grad_fc1, local_bias_grad_fc1)
+ self.assertEqual(sharded_weight_fc1.grad, local_grad_narrowed_fc1, rtol=0.01, atol=2e-03)
+ self.assertEqual(sharded_weight_fc2.grad, local_grad_narrowed_fc2, rtol=0.01, atol=1e-03)
+ self.assertEqual(bias_grad_fc1, local_bias_grad_fc1, rtol=0.01, atol=2e-02)
self.assertEqual(bias_grad_fc2, local_bias_grad_fc2)

# Test optimizer.
@@ -171,7 +171,7 @@
local_bias_fc1, local_bias_fc2 = _get_bias(local_megatron_lm)
self.assertEqual(bias_fc1, local_bias_fc1)
self.assertEqual(bias_fc2, local_bias_fc2)
- self.assertEqual(bias_fc1.grad, local_bias_fc1.grad)
+ self.assertEqual(bias_fc1.grad, local_bias_fc1.grad, rtol=0.01, atol=2e-02)
self.assertEqual(bias_fc2.grad, local_bias_fc2.grad)
previous_sharded_weight_fc1 = sharded_weight_fc1.clone()
previous_sharded_weight_fc2 = sharded_weight_fc2.clone()
@@ -197,13 +197,13 @@
self.assertEqual(sharded_weight_fc2.size(), local_weight_fc2_narrowed.size())
self.assertNotEqual(previous_sharded_weight_fc1, sharded_weight_fc1)
self.assertNotEqual(previous_sharded_weight_fc2, sharded_weight_fc2)
- self.assertEqual(sharded_weight_fc1, local_weight_fc1_narrowed)
- self.assertEqual(sharded_weight_fc2, local_weight_fc2_narrowed)
+ self.assertEqual(sharded_weight_fc1, local_weight_fc1_narrowed, rtol=0.01, atol=1e-03)
+ self.assertEqual(sharded_weight_fc2, local_weight_fc2_narrowed, rtol=0.01, atol=1e-03)

# Test bias value after optimizer.
local_bias_fc1, local_bias_fc2 = _get_bias(local_megatron_lm)
self.assertNotEqual(previous_bias_fc1, bias_fc1)
- self.assertEqual(bias_fc1, local_bias_fc1)
+ self.assertEqual(bias_fc1, local_bias_fc1, rtol=0.01, atol=1e-03)
self.assertNotEqual(previous_bias_fc2, bias_fc2)
self.assertEqual(bias_fc2, local_bias_fc2)

diff -Nru pytorch_orig/test/test_stateless.py pytorch/test/test_stateless.py
--- pytorch_orig/test/test_stateless.py 2022-04-07 18:31:13.029968000 +0200
+++ pytorch/test/test_stateless.py 2022-04-07 18:43:46.723968000 +0200
@@ -42,7 +42,7 @@
# existing params in module. So here we expect the result to be the
# same as the input if the weight swapping went well.
res = _stateless.functional_call(module, parameters, x)
- self.assertEqual(x, res)
+ self.assertEqual(x, res, rtol=1e-04, atol=1e-04)
# check that the weight remain unmodified
cur_weight = to_check.l1.weight
cur_buffer = to_check.buffer

diff -Nru pytorch_orig/test/test_jit_fuser_te.py pytorch/test/test_jit_fuser_te.py
--- pytorch_orig/test/test_jit_fuser_te.py 2022-04-07 18:31:13.046680000 +0200
+++ pytorch/test/test_jit_fuser_te.py 2022-04-12 18:21:00.355114000 +0200
@@ -956,7 +956,7 @@
def test_lstm_traced(self):
for device in self.devices:
inputs = get_lstm_inputs(device)
- ge = self.checkTrace(LSTMCellF, inputs)
+ ge = self.checkTrace(LSTMCellF, inputs, atol=1e-4, rtol=1e-5)
graph = ge.graph_for(*inputs)
fusion_groups = self.findFusionGroups(graph)
# TODO: chunk
diff -Nru pytorch_orig/torch/testing/_internal/jit_utils.py pytorch/torch/testing/_internal/jit_utils.py
--- pytorch_orig/torch/testing/_internal/jit_utils.py 2022-04-07 18:28:54.339477000 +0200
+++ pytorch/torch/testing/_internal/jit_utils.py 2022-04-12 18:19:59.614272000 +0200
@@ -525,7 +525,7 @@
def checkTrace(self, func, reference_tensors, input_tensors=None,
drop=None, allow_unused=False, verbose=False,
inputs_require_grads=True, check_tolerance=1e-5, export_import=True,
- _force_outplace=False):
+ _force_outplace=False, rtol=None, atol=None):

# TODO: check gradients for parameters, not just inputs
def allSum(vs):
@@ -618,7 +618,10 @@

self.assertEqual(outputs, outputs_ge)
if inputs_require_grads:
- self.assertEqual(grads, grads_ge)
+ if atol is not None and rtol is not None:
+ self.assertEqual(grads, grads_ge, atol=atol, rtol=rtol)
+ else:
+ self.assertEqual(grads, grads_ge)
for g2, g2_ge in zip(grads2, grads2_ge):
if g2 is None and g2_ge is None:
continue