# Author: Caspar van Leeuwen, SURF
# Fixes failing tests due to use of TensorFloat32
# Setting NVIDIA_TF32_OVERRIDE=0 makes these tests pass, proving that TensorFloat32 is the issue
# We increase tolerances for the asserts to make these tests pass
diff -Nru pytorch_orig/test/distributed/_shard/sharded_tensor/ops/test_linear.py pytorch/test/distributed/_shard/sharded_tensor/ops/test_linear.py
--- pytorch_orig/test/distributed/_shard/sharded_tensor/ops/test_linear.py	2022-04-07 18:31:13.069599000 +0200
+++ pytorch/test/distributed/_shard/sharded_tensor/ops/test_linear.py	2022-04-07 18:32:32.877406000 +0200
@@ -77,7 +77,7 @@
         local_output = local_linear(inp)
 
         # Verify
-        self.assertEqual(local_output, sharded_output)
+        self.assertEqual(local_output, sharded_output, rtol=0.02, atol=1e-03)
 
         # Validate for torch.nn.functional.linear version.
         local_output = torch.nn.functional.linear(
@@ -91,7 +91,7 @@
             # for reshard. We need to squeeze the # of dimensions manually.
             if inp.dim() == 1:
                 sharded_output = sharded_output.squeeze(reshard_spec.dim)
-        self.assertEqual(local_output, sharded_output)
+        self.assertEqual(local_output, sharded_output, rtol=0.02, atol=1e-03)
 
         # Compute loss and run backward pass.
         local_output.sum().backward()
@@ -114,7 +114,7 @@
 
         # Test backward gradient calculation.
         self.assertEqual(sharded_linear.bias.grad, local_bias_grad)
-        self.assertEqual(sharded_weight.grad, local_grad_narrowed)
+        self.assertEqual(sharded_weight.grad, local_grad_narrowed, rtol=0.01, atol=1e-03)
 
         # Test optimizer.
         previous = local_linear.weight.clone().detach()
@@ -135,7 +135,7 @@
         )
         self.assertEqual(sharded_weight.size(), local_weight_narrowed.size())
         self.assertNotEqual(previous_sharded_weight, sharded_weight)
-        self.assertEqual(sharded_weight, local_weight_narrowed)
+        self.assertEqual(sharded_weight, local_weight_narrowed, rtol=0.01, atol=1e-04)
         self.assertNotEqual(previous_sharded_bias, sharded_linear.bias)
         self.assertEqual(sharded_linear.bias, local_linear.bias)
 
diff -Nru pytorch_orig/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py pytorch/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py
--- pytorch_orig/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py	2022-04-07 18:31:13.091710000 +0200
+++ pytorch/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py	2022-04-07 18:41:03.744644000 +0200
@@ -113,7 +113,7 @@
         local_output = local_megatron_lm(inp)
 
         # Verify
-        self.assertEqual(local_output, sharded_output)
+        self.assertEqual(local_output, sharded_output, rtol=0.01, atol=1e-03)
 
         # Compute loss and run backward pass.
         local_output.sum().backward()
@@ -161,9 +161,9 @@
         )
 
         # Test backward gradient calculation.
-        self.assertEqual(sharded_weight_fc1.grad, local_grad_narrowed_fc1)
-        self.assertEqual(sharded_weight_fc2.grad, local_grad_narrowed_fc2)
-        self.assertEqual(bias_grad_fc1, local_bias_grad_fc1)
+        self.assertEqual(sharded_weight_fc1.grad, local_grad_narrowed_fc1, rtol=0.01, atol=2e-03)
+        self.assertEqual(sharded_weight_fc2.grad, local_grad_narrowed_fc2, rtol=0.01, atol=1e-03)
+        self.assertEqual(bias_grad_fc1, local_bias_grad_fc1, rtol=0.01, atol=2e-02)
         self.assertEqual(bias_grad_fc2, local_bias_grad_fc2)
 
         # Test optimizer.
@@ -171,7 +171,7 @@
         local_bias_fc1, local_bias_fc2 = _get_bias(local_megatron_lm)
         self.assertEqual(bias_fc1, local_bias_fc1)
         self.assertEqual(bias_fc2, local_bias_fc2)
-        self.assertEqual(bias_fc1.grad, local_bias_fc1.grad)
+        self.assertEqual(bias_fc1.grad, local_bias_fc1.grad, rtol=0.01, atol=2e-02)
         self.assertEqual(bias_fc2.grad, local_bias_fc2.grad)
         previous_sharded_weight_fc1 = sharded_weight_fc1.clone()
         previous_sharded_weight_fc2 = sharded_weight_fc2.clone()
@@ -197,13 +197,13 @@
         self.assertEqual(sharded_weight_fc2.size(), local_weight_fc2_narrowed.size())
         self.assertNotEqual(previous_sharded_weight_fc1, sharded_weight_fc1)
         self.assertNotEqual(previous_sharded_weight_fc2, sharded_weight_fc2)
-        self.assertEqual(sharded_weight_fc1, local_weight_fc1_narrowed)
-        self.assertEqual(sharded_weight_fc2, local_weight_fc2_narrowed)
+        self.assertEqual(sharded_weight_fc1, local_weight_fc1_narrowed, rtol=0.01, atol=1e-03)
+        self.assertEqual(sharded_weight_fc2, local_weight_fc2_narrowed, rtol=0.01, atol=1e-03)
 
         # Test bias value after optimizer.
         local_bias_fc1, local_bias_fc2 = _get_bias(local_megatron_lm)
         self.assertNotEqual(previous_bias_fc1, bias_fc1)
-        self.assertEqual(bias_fc1, local_bias_fc1)
+        self.assertEqual(bias_fc1, local_bias_fc1, rtol=0.01, atol=1e-03)
         self.assertNotEqual(previous_bias_fc2, bias_fc2)
         self.assertEqual(bias_fc2, local_bias_fc2)
 
diff -Nru pytorch_orig/test/test_stateless.py pytorch/test/test_stateless.py
--- pytorch_orig/test/test_stateless.py	2022-04-07 18:31:13.029968000 +0200
+++ pytorch/test/test_stateless.py	2022-04-07 18:43:46.723968000 +0200
@@ -42,7 +42,7 @@
         # existing params in module. So here we expect the result to be the
         # same as the input if the weight swapping went well.
         res = _stateless.functional_call(module, parameters, x)
-        self.assertEqual(x, res)
+        self.assertEqual(x, res, rtol=1e-04, atol=1e-04)
         # check that the weight remain unmodified
         cur_weight = to_check.l1.weight
         cur_buffer = to_check.buffer
diff -Nru pytorch_orig/test/test_jit_fuser_te.py pytorch/test/test_jit_fuser_te.py
--- pytorch_orig/test/test_jit_fuser_te.py	2022-04-07 18:31:13.046680000 +0200
+++ pytorch/test/test_jit_fuser_te.py	2022-04-12 18:21:00.355114000 +0200
@@ -956,7 +956,7 @@
     def test_lstm_traced(self):
         for device in self.devices:
             inputs = get_lstm_inputs(device)
-            ge = self.checkTrace(LSTMCellF, inputs)
+            ge = self.checkTrace(LSTMCellF, inputs, atol=1e-4, rtol=1e-5)
             graph = ge.graph_for(*inputs)
             fusion_groups = self.findFusionGroups(graph)
             # TODO: chunk
diff -Nru pytorch_orig/torch/testing/_internal/jit_utils.py pytorch/torch/testing/_internal/jit_utils.py
--- pytorch_orig/torch/testing/_internal/jit_utils.py	2022-04-07 18:28:54.339477000 +0200
+++ pytorch/torch/testing/_internal/jit_utils.py	2022-04-12 18:19:59.614272000 +0200
@@ -525,7 +525,7 @@
     def checkTrace(self, func, reference_tensors, input_tensors=None,
                    drop=None, allow_unused=False, verbose=False,
                    inputs_require_grads=True, check_tolerance=1e-5, export_import=True,
-                   _force_outplace=False):
+                   _force_outplace=False, rtol=None, atol=None):
 
         # TODO: check gradients for parameters, not just inputs
         def allSum(vs):
@@ -618,7 +618,10 @@
 
         self.assertEqual(outputs, outputs_ge)
         if inputs_require_grads:
-            self.assertEqual(grads, grads_ge)
+            if atol is not None and rtol is not None:
+                self.assertEqual(grads, grads_ge, atol=atol, rtol=rtol)
+            else:
+                self.assertEqual(grads, grads_ge)
             for g2, g2_ge in zip(grads2, grads2_ge):
                 if g2 is None and g2_ge is None:
                     continue
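
# Note (reproduction sketch, not part of the diff above; ignored by `patch` as trailing text):
# the TF32 hypothesis stated in the header can be re-checked on a given system either via
# the NVIDIA override, e.g. (the chosen test module is illustrative)
#
#   NVIDIA_TF32_OVERRIDE=0 python test/test_stateless.py
#
# or from within Python using PyTorch's standard TF32 switches:
#
#   import torch
#   torch.backends.cuda.matmul.allow_tf32 = False  # matmuls run in full FP32 instead of TF32
#   torch.backends.cudnn.allow_tf32 = False        # likewise for cuDNN convolutions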