From e95a8861e979f64fb5f4467461dc5df99a04923b Mon Sep 17 00:00:00 2001 From: Victor Hall Date: Wed, 15 Nov 2023 14:45:43 -0500 Subject: [PATCH] workaround for cuda errors on some rented machines --- utils/isolate_rng.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/utils/isolate_rng.py b/utils/isolate_rng.py index 711d629..879f4fe 100644 --- a/utils/isolate_rng.py +++ b/utils/isolate_rng.py @@ -34,7 +34,11 @@ def _collect_rng_states(include_cuda: bool = True) -> Dict[str, Any]: "python": python_get_rng_state(), } if include_cuda: - states["torch.cuda"] = torch.cuda.get_rng_state_all() + try: + states["torch.cuda"] = torch.cuda.get_rng_state_all() + except RuntimeError: + # CUDA initialization failure. + pass return states