# // Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# //
# // Licensed under the Apache License, Version 2.0 (the "License");
# // you may not use this file except in compliance with the License.
# // You may obtain a copy of the License at
# //
# //     http://www.apache.org/licenses/LICENSE-2.0
# //
# // Unless required by applicable law or agreed to in writing, software
# // distributed under the License is distributed on an "AS IS" BASIS,
# // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# // See the License for the specific language governing permissions and
# // limitations under the License.
""" | |
Distributed basic functions. | |
""" | |
import os | |
from datetime import timedelta | |
import torch | |
import torch.distributed as dist | |
from torch.nn.parallel import DistributedDataParallel | |


def get_global_rank() -> int:
    """
    Get the global rank, the index of the GPU across all nodes.
    """
    return int(os.environ.get("RANK", "0"))


def get_local_rank() -> int:
    """
    Get the local rank, the index of the GPU on the current node.
    """
    return int(os.environ.get("LOCAL_RANK", "0"))


def get_world_size() -> int:
    """
    Get the world size, the total number of GPUs across all nodes.
    """
    return int(os.environ.get("WORLD_SIZE", "1"))


def get_device() -> torch.device:
    """
    Get the device of the current rank.
    """
    return torch.device("cuda", get_local_rank())
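

# Note: RANK, LOCAL_RANK, and WORLD_SIZE are normally injected by the launcher
# (e.g. torchrun); the string defaults above let these helpers also work in a
# plain single-process, single-GPU run.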


def barrier_if_distributed(*args, **kwargs):
    """
    Synchronize all processes when running in a distributed context.
    """
    if dist.is_initialized():
        return dist.barrier(*args, **kwargs)


def init_torch(cudnn_benchmark=True, timeout=timedelta(seconds=600)):
    """
    Common PyTorch initialization configuration.
    """
    # Allow TF32 for matmul and cuDNN ops (faster float32 math on Ampere+ GPUs).
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    torch.backends.cudnn.benchmark = cudnn_benchmark
    # Bind this process to its GPU before creating the NCCL process group.
    torch.cuda.set_device(get_local_rank())
    dist.init_process_group(
        backend="nccl",
        rank=get_global_rank(),
        world_size=get_world_size(),
        timeout=timeout,
    )
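

# Note: with no explicit init_method, init_process_group() defaults to the
# env:// rendezvous, so MASTER_ADDR and MASTER_PORT must be set in the
# environment; launchers such as torchrun provide them automatically.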


def convert_to_ddp(module: torch.nn.Module, **kwargs) -> DistributedDataParallel:
    """
    Wrap a module in DistributedDataParallel on the current rank's GPU.
    """
    return DistributedDataParallel(
        module=module,
        device_ids=[get_local_rank()],
        output_device=get_local_rank(),
        **kwargs,
    )
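

# Minimal usage sketch (illustration only, not part of the original module):
# how these helpers might be combined in a training entrypoint launched with
# a command such as `torchrun --nproc_per_node=8 train.py`. The Linear model,
# SGD optimizer, and random batch below are placeholder assumptions.
if __name__ == "__main__":
    init_torch()
    device = get_device()
    model = convert_to_ddp(torch.nn.Linear(16, 16).to(device))
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

    # One dummy training step per rank; DDP averages gradients across ranks.
    x = torch.randn(4, 16, device=device)
    loss = model(x).square().mean()
    loss.backward()
    optimizer.step()

    barrier_if_distributed()
    dist.destroy_process_group()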