
Accelerating PyTorch training (single machine, multiple GPUs)


Method 1: nn.DataParallel

# main.py
import torch
import torch.nn as nn
import torch.optim as optim

gpus = [0, 1, 2, 3]  # which GPUs to use
torch.cuda.set_device('cuda:{}'.format(gpus[0]))

train_dataset = ...
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=...)

model = ...
# output_device is the card that gathers the outputs and accumulates the gradients;
# it usually needs somewhat more memory than the others.
model = nn.DataParallel(model.cuda(), device_ids=gpus, output_device=gpus[0])
optimizer = optim.SGD(model.parameters(), lr=...)

for epoch in range(100):
  for batch_idx, (images, target) in enumerate(train_loader):
    images = images.cuda(non_blocking=True)  # move the batch to the GPU
    target = target.cuda(non_blocking=True)  # move the labels to the GPU
    ...
    output = model(images)
    loss = criterion(output, target)
    ...
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# To train, simply run: python main.py
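One detail not covered in the snippet above: nn.DataParallel wraps the original network, so the real parameters live under model.module, and checkpoints are usually saved from there. A minimal sketch under that assumption (Net and checkpoint.pth are placeholder names, not part of the original post):

# Save the underlying model rather than the DataParallel wrapper, so the
# checkpoint can later be loaded with or without multi-GPU training.
torch.save(model.module.state_dict(), 'checkpoint.pth')

# Reloading into a plain, unwrapped model (Net is a placeholder class name):
model = Net()
model.load_state_dict(torch.load('checkpoint.pth', map_location='cuda:0'))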

 

Method 2: accelerating with torch.distributed
# main.py
import argparse

import torch
import torch.distributed as dist
import torch.optim as optim

# Get the index of the current GPU process (passed in by the launcher).
parser = argparse.ArgumentParser()
parser.add_argument('--local_rank', default=-1, type=int,
                    help='node rank for distributed training')
args = parser.parse_args()

# Set up the backend used for inter-GPU communication and bind this process to its GPU.
dist.init_process_group(backend='nccl')
torch.cuda.set_device(args.local_rank)

# Use DistributedSampler to partition the dataset across processes.
train_dataset = ...
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=..., sampler=train_sampler)

# Wrap the model with DistributedDataParallel, which averages the gradients
# computed on the different GPUs and keeps the replicas in sync.
model = ...
model = model.cuda(args.local_rank)
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank])
optimizer = optim.SGD(model.parameters(), lr=...)

for epoch in range(100):
  for batch_idx, (images, target) in enumerate(train_loader):
    images = images.cuda(non_blocking=True)
    target = target.cuda(non_blocking=True)
    ...
    output = model(images)
    loss = criterion(output, target)
    ...
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
# Launch command. CUDA_VISIBLE_DEVICES selects which GPUs to use;
# --nproc_per_node is the number of processes, one per GPU.
CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node=4 main.py
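Two details worth adding to the DistributedSampler setup above (my addition, not part of the original post): call set_epoch at the start of every epoch so the shuffling differs between epochs, and write checkpoints only from rank 0 so the processes do not all save the same file. A minimal sketch:

for epoch in range(100):
    # Reseed the sampler; without this every epoch uses the same shuffle order.
    train_sampler.set_epoch(epoch)
    for batch_idx, (images, target) in enumerate(train_loader):
        ...
    # Only the rank-0 process saves; 'checkpoint.pth' is a placeholder path.
    if args.local_rank == 0:
        torch.save(model.module.state_dict(), 'checkpoint.pth')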
# Using torch.multiprocessing in place of the launcher

# main.py
import argparse

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.optim as optim

parser = argparse.ArgumentParser()
args = parser.parse_args()
args.nprocs = torch.cuda.device_count()


def main_worker(local_rank, nprocs, args):
    # Each spawned process initialises its own process group and binds to one GPU.
    dist.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:23456',
                            world_size=args.nprocs, rank=local_rank)
    torch.cuda.set_device(local_rank)

    train_dataset = ...
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=..., sampler=train_sampler)

    model = ...
    model = model.cuda(local_rank)
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank])
    optimizer = optim.SGD(model.parameters(), lr=...)

    for epoch in range(100):
        for batch_idx, (images, target) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)
            ...
            output = model(images)
            loss = criterion(output, target)
            ...
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()


if __name__ == '__main__':
    # Spawn one worker process per GPU; each receives its local_rank as the first argument.
    mp.spawn(main_worker, nprocs=args.nprocs, args=(args.nprocs, args))

# Launch command: python main.py (no torch.distributed.launch needed)
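Since each process only sees its own shard of the data, per-process losses and metrics will differ. To log a value averaged over all GPUs, you can all-reduce it; a minimal sketch using a helper I am calling reduce_mean (not part of the original code):

def reduce_mean(tensor, nprocs):
    # Sum the tensor over all processes, then divide by the process count
    # to obtain the mean across GPUs.
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= nprocs
    return rt

# Inside the training loop, for logging only:
# reduced_loss = reduce_mean(loss.detach(), args.nprocs)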

 

Method 3: accelerating with Apex
# main.py
import argparse

import torch
import torch.distributed as dist
import torch.optim as optim

from apex import amp
from apex.parallel import DistributedDataParallel

parser = argparse.ArgumentParser()
parser.add_argument('--local_rank', default=-1, type=int,
                    help='node rank for distributed training')
args = parser.parse_args()

dist.init_process_group(backend='nccl')
torch.cuda.set_device(args.local_rank)

train_dataset = ...
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=..., sampler=train_sampler)

model = ...
model = model.cuda(args.local_rank)
optimizer = optim.SGD(model.parameters(), lr=...)

# amp.initialize wraps the model and an already-constructed optimizer for mixed precision;
# apex's DistributedDataParallel infers the device from the current CUDA device.
model, optimizer = amp.initialize(model, optimizer)
model = DistributedDataParallel(model)

for epoch in range(100):
  for batch_idx, (images, target) in enumerate(train_loader):
    images = images.cuda(non_blocking=True)
    target = target.cuda(non_blocking=True)
    ...
    output = model(images)
    loss = criterion(output, target)
    optimizer.zero_grad()
    # Scale the loss so FP16 gradients do not underflow, then backpropagate.
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()
# Launch command
CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node=4 main.py
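Note that apex.amp has since been deprecated in favour of the native torch.cuda.amp API shipped with PyTorch 1.6 and later. A minimal sketch of the equivalent mixed-precision loop using the built-in GradScaler together with torch.nn.parallel.DistributedDataParallel (an alternative to the Apex recipe above, assuming the same train_loader, model, optimizer and criterion setup):

scaler = torch.cuda.amp.GradScaler()

for epoch in range(100):
    for batch_idx, (images, target) in enumerate(train_loader):
        images = images.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)
        optimizer.zero_grad()
        # Run the forward pass and loss computation in mixed precision.
        with torch.cuda.amp.autocast():
            output = model(images)
            loss = criterion(output, target)
        # Scale the loss to avoid FP16 gradient underflow, then unscale and step.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()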

  

Source: https://www.cnblogs.com/zhaojianhui/p/16684209.html