我是pytorch的新手。
因为我想增大批量(batch size),而且模型太大,所以有人建议我使用 PyTorch 的 DP(DataParallel)和 DDP(DistributedDataParallel),效果会很好。例如,如果我有 4 个 GPU,DP 会把模型复制 4 份,在每个 GPU 上放一份。但我在文档中看到,DDP 似乎不是复制 4 个模型,而是把模型本身切分:例如模型的前半部分放在 GPU 0 上,后半部分放在 GPU 1 上。DDP 总是会这样切分模型吗?还是需要以特定的方式实现 DDP 才能切分模型?
def init_process_group(rank, world_size, backend="gloo",
                       master_addr="localhost", master_port="12355"):
    """Join this process to the default torch.distributed process group.

    Args:
        rank: Index of this process within the group (0-based).
        world_size: Total number of participating processes.
        backend: Communication backend; defaults to "gloo" (CPU-friendly).
        master_addr: Rendezvous host; defaults to "localhost" as before.
        master_port: Rendezvous port; defaults to "12355" as before.

    The rendezvous address is published via environment variables, which is
    what the default "env://" init method of init_process_group reads.
    """
    os.environ['MASTER_ADDR'] = master_addr
    os.environ['MASTER_PORT'] = master_port
    dist.init_process_group(backend, rank=rank, world_size=world_size)
def build_data_loader(vocab, infile, args, shuffle=True):
    """Build a DataLoader (and its sampler) over a MovieDataset.

    In the multi-GPU shuffled case each rank gets a DistributedSampler so
    the dataset is partitioned across processes; otherwise a plain loader
    is returned and the sampler is None.

    Returns:
        (loader, sampler) tuple; sampler is None in the single-process case.
    """
    dataset = MovieDataset(vocab, infile)
    wants_distributed = args.n_gpu > 1 and shuffle
    if wants_distributed:
        dist_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=config.tconfig.batch_size,
            sampler=dist_sampler,
            collate_fn=movie_collate_fn,
        )
        return loader, dist_sampler
    # Single-process path: no sampler, let DataLoader handle shuffling.
    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=config.tconfig.batch_size,
        sampler=None,
        shuffle=shuffle,
        collate_fn=movie_collate_fn,
    )
    return loader, None
def train_model(rank, world_size, args):
    """Per-process training entry point (one process per GPU when spawned).

    Args:
        rank: Process/GPU index assigned by mp.spawn (0 in single-process runs).
        world_size: Total number of processes in the group.
        args: Parsed CLI namespace; reads args.n_gpu and args.save.
    """
    # Only set up torch.distributed when more than one GPU is in use.
    if 1 < args.n_gpu:
        init_process_group(rank, world_size)
    # NOTE(review): world_size == 0 marks the CPU-only run as "master";
    # otherwise only rank 0 is master. `master` is not used in this snippet —
    # presumably logging/checkpointing below the visible cut uses it.
    master = (world_size == 0 or rank % world_size ==0)
    # Each process pins itself to its own CUDA device.
    config.tconfig.device = torch.device(f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
    print(config.tconfig)
    best_epoch, best_loss, best_score = 0, 0, 0
    model = MovieClassification(config.tconfig)
    if os.path.isfile(args.save):
        print(f"rank:{rank} load state dict from : {args.save}")
        # NOTE(review): only a message is printed here — no torch.load /
        # load_state_dict call is visible. The snippet appears truncated;
        # confirm the checkpoint is actually restored.
    if 1 < args.n_gpu:
        model.to(config.tconfig.device)
        # DDP replicates the FULL model on this rank's GPU and synchronizes
        # gradients across ranks — it does not split the model across GPUs.
        model = DistributedDataParallel(model, device_ids = [rank], find_unused_parameters = True)
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--gpu", default=None, type=int, required=False)
    # NOTE(review): train_model reads args.save; the original snippet never
    # defined a --save option (presumably omitted from the paste). Added
    # here with a default so the visible code is self-consistent — confirm
    # the real default path.
    parser.add_argument("--save", default="save.pth", type=str, required=False)
    args = parser.parse_args()

    config.tconfig.device = config.device
    # Count usable GPUs: all visible devices by default, or exactly one
    # when a specific GPU index was requested via --gpu.
    if torch.cuda.is_available():
        args.n_gpu = torch.cuda.device_count() if args.gpu is None else 1
    else:
        args.n_gpu = 0
    print("available GPU : ",args.n_gpu)

    if 1 < args.n_gpu:
        # mp.spawn prepends the process rank, so each child executes
        # train_model(rank, world_size=args.n_gpu, args).
        mp.spawn(train_model, args=(args.n_gpu, args), nprocs=args.n_gpu, join=True)
    else:
        # Bug fix: the original script did nothing at all on a single GPU
        # or CPU-only machine. Fall back to one ordinary training process.
        train_model(0, args.n_gpu, args)
上面是我的代码;如果贴出全部内容(包括所有 import)会太长,所以我只贴了其中一部分。请问这样写是把模型复制到每个 GPU 上,还是把一个模型切分到多个 GPU 上?
目前没有回答
相关问题 更多 >
编程相关推荐