#!/usr/bin/python3.5

import os
import sys
import math
import copy
import random
import time
import argparse

import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.autograd import Variable
from torch.optim.lr_scheduler import MultiStepLR
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.sampler import RandomSampler

import torchvision
import torchvision.transforms as transforms

from mpi4py import MPI

comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()

# Pin each MPI rank to one GPU on its machine.
NGPU_PER_MACHINE = 8
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "%d" % (rank % NGPU_PER_MACHINE)

print("Rank %d, Using GPU %s" % (rank, os.environ["CUDA_VISIBLE_DEVICES"]))

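# Hyperparameters come from the command line. The launch command below is an
# illustrative assumption (the script name and rank count are not part of the
# original paste); it assumes one MPI rank per GPU:
#   mpirun -np 16 python3.5 train_cifar.py <LEARNING_RATE> <MOMENTUM> \
#          <CENTRALIZED|DECENTRALIZED|PACKAGEDROP> <PACKAGE_DROP_RATE>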
LEARNING_RATE = float(sys.argv[1])
MOMENTUM = float(sys.argv[2])
COMMUNICATION = sys.argv[3]            # CENTRALIZED | DECENTRALIZED | PACKAGEDROP
PACKAGE_DROP_RATE = float(sys.argv[4])

PARTITIONED_LOAD = False

if rank == 0:
    print("# LEARNING_RATE: %f" % LEARNING_RATE)
    print("# MOMENTUM: %f" % MOMENTUM)
    print("# COMMUNICATION: %s" % COMMUNICATION)
    print("# PACKAGE_DROP_RATE: %f" % PACKAGE_DROP_RATE)

transform_train = transforms.Compose([
    # transforms.RandomCrop(32, padding=4),
    # transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

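# The Normalize() means and stds above are the commonly used per-channel
# statistics of the CIFAR-10 training set.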
"""
class Bottleneck(nn.Module):
    def __init__(self, in_planes, growth_rate):
        super(Bottleneck, self).__init__()
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, 4*growth_rate, kernel_size=1, bias=False)
        self.bn2 = nn.BatchNorm2d(4*growth_rate)
        self.conv2 = nn.Conv2d(4*growth_rate, growth_rate, kernel_size=3, padding=1, bias=False)

    def forward(self, x):
        out = self.conv1(F.relu(self.bn1(x)))
        out = self.conv2(F.relu(self.bn2(out)))
        out = torch.cat([out, x], 1)
        return out

class Transition(nn.Module):
    def __init__(self, in_planes, out_planes):
        super(Transition, self).__init__()
        self.bn = nn.BatchNorm2d(in_planes)
        self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, bias=False)

    def forward(self, x):
        out = self.conv(F.relu(self.bn(x)))
        out = F.avg_pool2d(out, 2)
        return out

class DenseNet(nn.Module):
    def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_classes=10):
        super(DenseNet, self).__init__()
        self.growth_rate = growth_rate

        num_planes = 2*growth_rate
        self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, padding=1, bias=False)

        self.dense1 = self._make_dense_layers(block, num_planes, nblocks[0])
        num_planes += nblocks[0]*growth_rate
        out_planes = int(math.floor(num_planes*reduction))
        self.trans1 = Transition(num_planes, out_planes)
        num_planes = out_planes

        self.dense2 = self._make_dense_layers(block, num_planes, nblocks[1])
        num_planes += nblocks[1]*growth_rate
        out_planes = int(math.floor(num_planes*reduction))
        self.trans2 = Transition(num_planes, out_planes)
        num_planes = out_planes

        self.dense3 = self._make_dense_layers(block, num_planes, nblocks[2])
        num_planes += nblocks[2]*growth_rate
        out_planes = int(math.floor(num_planes*reduction))
        self.trans3 = Transition(num_planes, out_planes)
        num_planes = out_planes

        self.dense4 = self._make_dense_layers(block, num_planes, nblocks[3])
        num_planes += nblocks[3]*growth_rate

        self.bn = nn.BatchNorm2d(num_planes)
        self.linear = nn.Linear(num_planes, num_classes)

    def _make_dense_layers(self, block, in_planes, nblock):
        layers = []
        for i in range(nblock):
            layers.append(block(in_planes, self.growth_rate))
            in_planes += self.growth_rate
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.conv1(x)
        out = self.trans1(self.dense1(out))
        out = self.trans2(self.dense2(out))
        out = self.trans3(self.dense3(out))
        out = self.dense4(out)
        out = F.avg_pool2d(F.relu(self.bn(out)), 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out

net = DenseNet(Bottleneck, [6,12,24,16], growth_rate=12)
net2 = DenseNet(Bottleneck, [6,12,24,16], growth_rate=12)
"""

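# The DenseNet definition above is kept only as a disabled (string-commented)
# alternative; the model actually trained below is a ResNet-18,
# ResNet(BasicBlock, [2, 2, 2, 2]).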
class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion*planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(self.expansion*planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion*planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out


class ResNet(nn.Module):
    def __init__(self, block, num_blocks, num_classes=10):
        super(ResNet, self).__init__()
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512*block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out

net = ResNet(BasicBlock, [2, 2, 2, 2])
net2 = ResNet(BasicBlock, [2, 2, 2, 2])

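# net is the copy each rank trains locally; net2 periodically receives the
# state of net, is averaged across ranks via Allreduce, and is used only to
# compute the reported training loss.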
net.cuda()
net2.cuda()

trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)

if PARTITIONED_LOAD == False:
    dsampler = RandomSampler(trainset)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=8192 // size, shuffle=False, sampler=dsampler)
else:
    dsampler = DistributedSampler(trainset, size, rank)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=8192 // size, shuffle=False, sampler=dsampler)

dsampler2 = DistributedSampler(trainset, size, rank)
evalloader = torch.utils.data.DataLoader(trainset, batch_size=8192 // size, shuffle=False, sampler=dsampler2)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM, weight_decay=0.0)
scheduler = MultiStepLR(optimizer, milestones=[50,], gamma=0.1)
#scheduler = LambdaLR(optimizer, lr_lambda=lambda epoch: LEARNING_RATE / math.sqrt(epoch+1))

model = net.cuda()
model2 = net2.cuda()

random.seed(rank)

## Model averaging -- every rank starts from the same model at the beginning.
for param in model.parameters():
    view = param.cpu().data.numpy()
    view2 = 0.0 * view
    comm.Allreduce(view, view2, op=MPI.SUM)
    param.data = torch.from_numpy(view2).cuda()
    param.data /= size

i = 0
for iepoch in range(0, 100):

    if PARTITIONED_LOAD == True:
        scheduler.step()

    net.train()
    train_loss = 0
    correct = 0
    total = 0

    start = time.time()

    # Note: this manual decay only changes the LEARNING_RATE variable that is
    # printed below; the optimizer's LR is driven by the MultiStepLR scheduler.
    if (iepoch + 1) % 25 == 0:
        LEARNING_RATE = LEARNING_RATE / 10

    for batch_idx, (inputs, targets) in enumerate(trainloader):

        i = i + 1

        if PARTITIONED_LOAD == False:
            # Step the scheduler roughly once per pass over the 50,000 training
            # images (int(50000/8192) global steps of 8192 samples each).
            if (i % int(50000 / 8192)) == 0:
                scheduler.step()

        optimizer.zero_grad()
        inputs, targets = Variable(inputs.cuda()), Variable(targets.cuda())
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()

        # Centralized communication: all-reduce the gradients across ranks,
        # average them, then take a local SGD step on the averaged gradient.
        if COMMUNICATION == "CENTRALIZED":
            for param in model.parameters():
                view = param.grad.cpu().data.numpy()
                view2 = 0.0 * view
                comm.Allreduce(view, view2, op=MPI.SUM)
                view2 = view2 / size

                #newmodel = param.cpu().data.numpy() - LEARNING_RATE * view2
                #param.data = torch.from_numpy(newmodel).cuda()

                param.grad.data = torch.from_numpy(view2).cuda()

            # TODO: try implementing SGD by itself
            optimizer.step()

        # Decentralized communication: ring topology. After a local SGD step,
        # each rank exchanges its parameters with its two ring neighbors and
        # replaces them with the three-way average.
        elif COMMUNICATION == "DECENTRALIZED":
            optimizer.step()

            for param in model.parameters():
                view = param.cpu().data.numpy()

                model_neighbor1 = 0.0 * view
                model_neighbor2 = 0.0 * view

                neighbor1 = (rank - 1) % size
                neighbor2 = (rank + 1) % size

                send_req1 = comm.isend(view, dest=neighbor1)
                send_req2 = comm.isend(view, dest=neighbor2)

                model_neighbor1 = comm.recv(source=neighbor1)
                model_neighbor2 = comm.recv(source=neighbor2)

                send_req1.wait()
                send_req2.wait()

                view2 = (model_neighbor1 + model_neighbor2 + view) / 3
                param.data = torch.from_numpy(view2).cuda()

        # "Package drop" communication: like the centralized scheme but on the
        # model parameters; each rank randomly drops its contribution to the
        # all-reduce (send side) and/or ignores the averaged result (receive
        # side) with probability PACKAGE_DROP_RATE.
        elif COMMUNICATION == "PACKAGEDROP":
            optimizer.step()
            for param in model.parameters():

                view = param.cpu().data.numpy()
                drop1_local = np.array([0.0,])
                if random.random() < PACKAGE_DROP_RATE:
                    # Drop this rank's outgoing contribution.
                    view = 0.0 * view
                    drop1_local = drop1_local + 1
                    #print (" drop1 package rank", rank)

                drop1_global = np.array([0.0,])
                comm.Allreduce(drop1_local, drop1_global, op=MPI.SUM)
                drop1_global = drop1_global[0]
                #print ("drop1_global = ", drop1_global, "/", size)

                # Average only over the ranks that actually contributed.
                view2 = 0.0 * view
                comm.Allreduce(view, view2, op=MPI.SUM)
                view2 = view2 / (size - drop1_global)

                if random.random() < PACKAGE_DROP_RATE:
                    # Drop the incoming averaged parameters; keep the local ones.
                    pass
                    #print (" drop2 package rank", rank)
                else:
                    param.data = torch.from_numpy(view2).cuda()

        # Every int(50000/8192) steps, evaluate the training loss of the
        # globally averaged model (net2) and report it from rank 0.
        if (i % int(50000 / 8192)) == 0:

            if rank == 0:
                elapsed = time.time() - start

            # Evaluation
            net2.train()
            net2.load_state_dict(net.state_dict())
            for param in model2.parameters():
                view = param.cpu().data.numpy()
                view2 = 1.0 * view
                comm.Allreduce(view, view2, op=MPI.SUM)
                param.data = torch.from_numpy(view2).cuda()
                param.data /= size

            _train_loss = 0
            for batch_idx, (inputs, targets) in enumerate(evalloader):
                inputs, targets = Variable(inputs.cuda()), Variable(targets.cuda())
                outputs = net2(inputs)
                loss = criterion(outputs, targets)
                _train_loss += loss.data[0]

            _train_loss = np.array([_train_loss,])
            train_loss = 0.0 * _train_loss
            comm.Allreduce(_train_loss, train_loss, op=MPI.SUM)
            train_loss = train_loss[0]

            if PARTITIONED_LOAD == False:
                if rank == 0:
                    print('%d Loss: %.3f | %f seconds | LR: %f ~' % (i / int(50000/8192), train_loss/size/(batch_idx+1), elapsed, LEARNING_RATE))
            else:
                if rank == 0:
                    print('%d Loss: %.3f | %f seconds | LR: %f' % (iepoch, train_loss/size/(batch_idx+1), elapsed, LEARNING_RATE))