diff --git a/open_diloco/simulate_multi_node.sh b/open_diloco/simulate_multi_node.sh
old mode 100644
new mode 100755
index c5def4a..fde4efa
--- a/open_diloco/simulate_multi_node.sh
+++ b/open_diloco/simulate_multi_node.sh
@@ -57,7 +57,7 @@ mkdir -p logs
 for i in $(seq 0 $(($N - 1 )))
 do
     > logs/log$i
-    CUDA_VISIBLE_DEVICES=$(get_cuda_devices $NUM_GPU $i) uv run torchrun --nproc_per_node=$NUM_GPU --node-rank $i --rdzv-endpoint localhost:9999 --nnodes=$N $@ > logs/log$i 2>&1 &
+    CUDA_VISIBLE_DEVICES=$(get_cuda_devices $NUM_GPU $i) torchrun --nproc_per_node=$NUM_GPU --node-rank $i --rdzv-endpoint localhost:9999 --nnodes=$N $@ > logs/log$i 2>&1 &
     child_pids+=($!)
 done
diff --git a/open_diloco/train_pure_fsdp.py b/open_diloco/train_pure_fsdp.py
index a9a6afc..cbbfa45 100644
--- a/open_diloco/train_pure_fsdp.py
+++ b/open_diloco/train_pure_fsdp.py
@@ -240,14 +240,18 @@ def train(config: Config):
         for param_offloaded, param in zip(cpu_model, model.parameters()):
             # todo check how to handle the SHARD_GRAD_OP strategy where the weight are replicated across the local devices
             param_offloaded.grad = param_offloaded.data - param.data.to(param_offloaded.device)
-
-            if param_offloaded.grad.device == torch.device("cpu"):
-                # gloo does not support AVG
-                param_offloaded.grad = param_offloaded.grad / global_pg.size()
-                dist.all_reduce(param_offloaded.grad, op=dist.ReduceOp.SUM, group=global_pg)
-            else:
-                dist.all_reduce(param_offloaded.grad, op=dist.ReduceOp.AVG, group=global_pg)
-
+
+            mask = torch.rand_like(param_offloaded.grad) > 0.95
+
+            data_to_send = param_offloaded.grad * mask
+            data_to_send_pre_reduce = data_to_send.clone()
+
+            # gloo does not support AVG
+            data_to_send = data_to_send / global_pg.size()
+            dist.all_reduce(data_to_send, op=dist.ReduceOp.SUM, group=global_pg)
+
+            param_offloaded.grad += data_to_send - data_to_send_pre_reduce  # removing the data we sent and adding back the averaged version
+
         outer_optimizer.step()
         outer_optimizer.zero_grad()
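To see what the new masked all-reduce actually computes, here is a minimal, self-contained sketch of the same idea, separate from the patch: two CPU ranks over gloo, a made-up `fake_pseudo_grad` tensor standing in for `param_offloaded.grad`, a 50% mask instead of 5% so the tiny tensor keeps a few entries, and a shared seed so both ranks draw the same mask (the patch itself leaves the mask unseeded, so each worker sends a different random subset). Everything except the masked averaging itself is an assumption for the demo.

```python
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def run(rank: int, world_size: int):
    # Hypothetical two-rank gloo setup just for this demo.
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29501"
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

    torch.manual_seed(0)  # same seed on every rank so the random mask matches (the patch does not do this)
    fake_pseudo_grad = torch.full((8,), float(rank + 1))  # stand-in for param_offloaded.grad

    # Keep a random subset of entries (50% here so the toy tensor selects something).
    mask = torch.rand_like(fake_pseudo_grad) > 0.5

    data_to_send = fake_pseudo_grad * mask
    data_to_send_pre_reduce = data_to_send.clone()

    # gloo has no ReduceOp.AVG, so divide locally and SUM-reduce instead.
    data_to_send = data_to_send / world_size
    dist.all_reduce(data_to_send, op=dist.ReduceOp.SUM)

    # Masked entries end up averaged across ranks; unmasked entries keep the local value.
    fake_pseudo_grad += data_to_send - data_to_send_pre_reduce

    if rank == 0:
        print("mask:", mask.tolist())
        print("result on rank 0:", fake_pseudo_grad.tolist())

    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(run, args=(2,), nprocs=2)
```

Entries selected by the mask come out averaged across ranks while the rest keep their local values, which is the effect the `data_to_send - data_to_send_pre_reduce` update in the diff relies on.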