Accelerating PyTorch training (single machine, multiple GPUs)
Author: Internet
Method 1: nn.DataParallel
# main.py
import torch
import torch.nn as nn
import torch.optim as optim

gpus = [0, 1, 2, 3]  # which GPUs to use
torch.cuda.set_device('cuda:{}'.format(gpus[0]))

train_dataset = ...
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=...)

model = ...
# output_device is the card on which outputs and gradients are gathered; it usually needs more memory than the others
model = nn.DataParallel(model.cuda(), device_ids=gpus, output_device=gpus[0])
optimizer = optim.SGD(model.parameters(), lr=...)

for epoch in range(100):
    for batch_idx, (images, target) in enumerate(train_loader):
        images = images.cuda(non_blocking=True)  # move the inputs to the GPU
        target = target.cuda(non_blocking=True)  # move the labels to the GPU
        ...
        output = model(images)
        loss = criterion(output, target)
        ...
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
# To train, simply run: python main.py
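One detail that is easy to miss with nn.DataParallel (and with DistributedDataParallel below): the wrapper keeps the original network under model.module, so checkpoints are normally saved from that attribute rather than from the wrapper itself. A minimal sketch, where 'checkpoint.pth' is just a placeholder path:

# Save the underlying model so the checkpoint can later be loaded without the wrapper
torch.save(model.module.state_dict(), 'checkpoint.pth')  # placeholder path

# Loading it back into a plain, unwrapped model:
model = ...
model.load_state_dict(torch.load('checkpoint.pth'))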
Method 2: accelerating with torch.distributed
# main.py
import argparse

import torch
import torch.distributed as dist
import torch.optim as optim

# Get the index (local rank) of the current GPU process; the launcher fills in --local_rank
parser = argparse.ArgumentParser()
parser.add_argument('--local_rank', default=-1, type=int,
                    help='node rank for distributed training')
args = parser.parse_args()

# Set the backend (and port) used for communication between GPUs
dist.init_process_group(backend='nccl')
torch.cuda.set_device(args.local_rank)

# Use DistributedSampler to partition the dataset across processes
train_dataset = ...
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=..., sampler=train_sampler)
# Wrap the model with DistributedDataParallel, which all-reduces the gradients computed on the different GPUs and keeps the replicas in sync
model = ...
# The model must already be on the local GPU before it is wrapped
model = torch.nn.parallel.DistributedDataParallel(model.cuda(), device_ids=[args.local_rank])
optimizer = optim.SGD(model.parameters(), lr=...)
for epoch in range(100):
    for batch_idx, (images, target) in enumerate(train_loader):
        images = images.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)
        ...
        output = model(images)
        loss = criterion(output, target)
        ...
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
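One point the loop above leaves out: when the DistributedSampler shuffles, set_epoch should be called at the start of every epoch so that each process reshuffles with a fresh, shared seed; otherwise the sample order repeats every epoch. A minimal sketch of the adjusted outer loop:

for epoch in range(100):
    # re-seed the sampler so shuffling differs between epochs and stays consistent across processes
    train_sampler.set_epoch(epoch)
    for batch_idx, (images, target) in enumerate(train_loader):
        ...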
# Launch method
# CUDA_VISIBLE_DEVICES specifies which GPUs to use;
# --nproc_per_node is the number of processes to start, one per GPU
CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node=4 main.py
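On recent PyTorch releases torch.distributed.launch is deprecated in favour of torchrun, which hands each process its rank through the LOCAL_RANK environment variable instead of a --local_rank argument. A rough equivalent, assuming the script is adapted to read the rank from the environment:

# in main.py, instead of parsing --local_rank:
import os
local_rank = int(os.environ['LOCAL_RANK'])

# launch command (one process per GPU)
CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=4 main.py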
# Use torch.multiprocessing instead of the launcher
# main.py
import argparse

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.optim as optim


def main_worker(local_rank, nprocs, args):
    # Each spawned process sets up its own process group; mp.spawn passes local_rank as the first argument
    dist.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:23456',
                            world_size=nprocs, rank=local_rank)
    torch.cuda.set_device(local_rank)
    train_dataset = ...
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=..., sampler=train_sampler)

    model = ...
    model = torch.nn.parallel.DistributedDataParallel(model.cuda(), device_ids=[local_rank])
    optimizer = optim.SGD(model.parameters(), lr=...)

    for epoch in range(100):
        for batch_idx, (images, target) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)
            ...
            output = model(images)
            loss = criterion(output, target)
            ...
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    args = parser.parse_args()
    args.nprocs = torch.cuda.device_count()  # one process per visible GPU
    # mp.spawn starts args.nprocs processes, each running main_worker(local_rank, nprocs, args)
    mp.spawn(main_worker, nprocs=args.nprocs, args=(args.nprocs, args))
# Launch method: python main.py
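Because each process only sees its own shard of the data, a per-process loss is not a global metric. A common pattern, sketched here as lines that would sit inside the inner loop of main_worker (the print format is just illustrative), is to all-reduce the loss and log it only on rank 0:

            # average the loss over all processes before logging
            reduced_loss = loss.detach().clone()
            dist.all_reduce(reduced_loss, op=dist.ReduceOp.SUM)
            reduced_loss /= nprocs
            if local_rank == 0:
                print('epoch {} step {} loss {:.4f}'.format(epoch, batch_idx, reduced_loss.item()))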
Method 3: accelerating with Apex
# main.py
import argparse

import torch
import torch.distributed as dist
import torch.optim as optim
from apex import amp
from apex.parallel import DistributedDataParallel

parser = argparse.ArgumentParser()
parser.add_argument('--local_rank', default=-1, type=int,
                    help='node rank for distributed training')
args = parser.parse_args()

dist.init_process_group(backend='nccl')
torch.cuda.set_device(args.local_rank)

train_dataset = ...
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=..., sampler=train_sampler)

model = ...
model.cuda()
optimizer = optim.SGD(model.parameters(), lr=...)

# amp.initialize must run after the model is on the GPU and the optimizer exists,
# and before the model is wrapped for distributed training
model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
# Apex's DistributedDataParallel takes no device_ids; it uses the device set by torch.cuda.set_device
model = DistributedDataParallel(model)

for epoch in range(100):
    for batch_idx, (images, target) in enumerate(train_loader):
        images = images.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)
        ...
        output = model(images)
        loss = criterion(output, target)

        optimizer.zero_grad()
        # scale the loss so fp16 gradients do not underflow
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        optimizer.step()
# Launch method
CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node=4 main.py
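Apex's amp module is in maintenance mode; the same mixed-precision behaviour is now built into PyTorch as torch.cuda.amp. A rough sketch of an equivalent training step using autocast and GradScaler (not the original Apex code, and still assuming the DDP setup from Method 2):

scaler = torch.cuda.amp.GradScaler()  # scales the loss to keep fp16 gradients from underflowing

for epoch in range(100):
    for batch_idx, (images, target) in enumerate(train_loader):
        images = images.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        optimizer.zero_grad()
        with torch.cuda.amp.autocast():  # run the forward pass in mixed precision
            output = model(images)
            loss = criterion(output, target)

        scaler.scale(loss).backward()  # backward on the scaled loss
        scaler.step(optimizer)         # unscales the gradients, then calls optimizer.step()
        scaler.update()                # adjust the scale factor for the next iteration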
Source: https://www.cnblogs.com/zhaojianhui/p/16684209.html