fix: use TORCH_NCCL_AVOID_RECORD_STREAMS=1

This commit is contained in:
OlivierDehaene 2024-01-09 17:59:16 +01:00
parent 91d7267534
commit 65db02f192
1 changed file with 1 addition and 0 deletions

View File

@@ -477,6 +477,7 @@ fn shard_manager(
envs.push(("MASTER_ADDR".into(), master_addr.into()));
envs.push(("MASTER_PORT".into(), master_port.to_string().into()));
envs.push(("NCCL_ASYNC_ERROR_HANDLING".into(), "1".into()));
// NOTE(review): presumably tells PyTorch's NCCL backend to avoid `recordStream`
// bookkeeping on collective tensors — confirm against the PyTorch distributed docs.
// The trailing `;` is required: without it the following `envs.push(` does not parse.
envs.push(("TORCH_NCCL_AVOID_RECORD_STREAMS".into(), "1".into()));
// CUDA memory fraction
envs.push((