Advertisement
Guest User

Untitled

a guest
Apr 25th, 2019
152
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 30.25 KB | None | 0 0
  1. # language translation
  2.  
  3. ## training
  4. ```
  5. python train.py -src_data path/lang1.txt -trg_data path/lang2.txt -src_lang lang1 -trg_lang lang2
  6. ```
  7.  
  8. ### __Arguments for training models are__
  9. 1. -src_data
  10. 2. -trg_data
  11. 3. if we want to use model without cuda then -no_cuda
  12. 4. -epochs, the number of passes over the training data
  13. 5. '-SGDR', '-epochs', '-d_model', '-n_layers', '-heads', '-batchsize', '-printevery', '-lr', '-load_weights', '-create_valset', '-max_strlen','-floyd','-checkpoint' these are some other Arguments.
  14.  
  15.  
  16.  
  17. ```python
  18. #train.py
  # Parse the command-line options and choose the device index:
  # 0 selects CUDA, -1 selects the CPU (when -no_cuda was given).
  opt = parser.parse_args()

  if opt.no_cuda is False:
      opt.device = 0
  else:
      opt.device = -1
  if opt.device == 0:
      assert torch.cuda.is_available()

  # Load the parallel corpora, build the fields, and attach the
  # training iterator to the options object.
  read_data(opt)
  fields = create_fields(opt)
  SRC, TRG = fields
  opt.train = create_dataset(opt, SRC, TRG)
  28. ```
  29.  
  30. __read_data, create_fields and create_dataset belong to Process.py__
  31.  
  32. The src and target files are .txt files that contain parallel translations for the two languages.
  33. read_data opens each file, strips it, and splits it into lines,
  34. then create_fields creates __two fields__, __src__ and __trg__
  35. ```python
  36. #process.py
  def read_data(opt):
      """Replace the corpus paths stored on ``opt`` with the file contents.

      For each of ``opt.src_data`` and ``opt.trg_data`` that is not None,
      the file at that path is read, stripped, and split on newlines, and
      the resulting list of lines is stored back on the same attribute.

      If a file cannot be opened, an error message is printed and the
      program exits via ``quit()``.
      """
      for attr in ("src_data", "trg_data"):
          path = getattr(opt, attr)
          if path is None:
              continue
          try:
              # The context manager closes the handle even if reading fails.
              with open(path) as handle:
                  lines = handle.read().strip().split('\n')
          except OSError:
              # Only I/O failures are reported here; other errors
              # (e.g. a non-string path) are no longer masked.
              print("error: '" + path + "' file not found")
              quit()
          setattr(opt, attr, lines)
  52.  
  def create_fields(opt):
      """Validate the language codes and load one tokenizer per language.

      This is the first half of the function; in this document the rest
      (building the ``SRC``/``TRG`` fields and returning them) is shown in
      the later "process.py continue" snippet.

      A warning is printed for each of ``opt.src_lang`` / ``opt.trg_lang``
      that is not in the supported list; execution continues afterwards.
      """
      spacy_langs = ['en', 'fr', 'de', 'es', 'pt', 'it', 'nl', 'hi']
      # Joining the list avoids the TypeError that `str + list` raised
      # whenever an unsupported language was given.
      supported = ', '.join(spacy_langs)
      if opt.src_lang not in spacy_langs:
          print('invalid src language: ' + opt.src_lang + ' supported languages : ' + supported)
      if opt.trg_lang not in spacy_langs:
          print('invalid trg language: ' + opt.trg_lang + ' supported languages : ' + supported)

      print("loading spacy tokenizers...")

      t_src = tokenize(opt.src_lang)
      t_trg = tokenize(opt.trg_lang)
  65. ```
  66.  
  67. __tokenize belongs to Tokenize.py script file__
  68.  
  69. It is used for converting sentences into tokens.
  70. spacy is used for the tokenizer for most languages.
  71. for hindi cltk tokenizer is used.
  72.  
  73. ```python
  74. #Tokenize.py
  class tokenize(object):
      """Sentence tokenizer: spaCy for every language except Hindi ('hi').

      For 'hi' no spaCy model is loaded and ``TokenizeSentence('hindi')``
      is used instead.
      """

      def __init__(self, lang):
          # A spaCy pipeline is only needed for non-Hindi languages.
          if lang != 'hi':
              self.nlp = spacy.load(lang)
          self.lang = lang

      def tokenizer(self, sentence):
          """Return the list of token strings for ``sentence``.

          Non-Hindi input is cleaned first: listed punctuation becomes a
          space, runs of spaces / commas / question marks are collapsed,
          and the text is lower-cased. Tokens equal to a single space are
          dropped. The tokens are computed once; the earlier debug print
          that tokenised every sentence a second time has been removed.
          """
          if self.lang == 'hi':
              return TokenizeSentence('hindi').tokenize(sentence)

          sentence = re.sub(
              r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " ", str(sentence))
          sentence = re.sub(r"[ ]+", " ", sentence)
          # NOTE(review): '!' is already replaced by the first pattern, so
          # this substitution never matches; kept to leave cleaning unchanged.
          sentence = re.sub(r"\!+", "!", sentence)
          sentence = re.sub(r"\,+", ",", sentence)
          sentence = re.sub(r"\?+", "?", sentence)
          sentence = sentence.lower()
          return [tok.text for tok in self.nlp.tokenizer(sentence) if tok.text != " "]
  97. ```
  98. A temporary CSV dataset is created with those tokens and fields, and the vocab is built on that dataset
  99. ```python
  100. #process.py continue
  101. TRG = data.Field(lower=True, tokenize=t_trg.tokenizer, init_token='<sos>', eos_token='<eos>')
  102. SRC = data.Field(lower=True, tokenize=t_src.tokenizer)
  103.  
  104. if opt.load_weights is not None:
  105. try:
  106. print("loading presaved fields...")
  107. SRC = pickle.load(open(f'{opt.load_weights}/SRC.pkl', 'rb'))
  108. TRG = pickle.load(open(f'{opt.load_weights}/TRG.pkl', 'rb'))
  109. except:
  110. print("error opening SRC.pkl and TXT.pkl field files, please ensure they are in " + opt.load_weights + "/")
  111. quit()
  112.  
  113. return(SRC, TRG)
  114.  
  def create_dataset(opt, SRC, TRG):
      """Build the training iterator from the sentence lists on ``opt``.

      The sentence pairs are filtered so that both sides contain fewer
      than ``opt.max_strlen`` spaces, written to a temporary CSV, and
      loaded as a ``TabularDataset`` wrapped in ``MyIterator``.

      When ``opt.load_weights`` is None the vocabularies are built from
      the dataset and, if ``opt.checkpoint > 0``, the fields are pickled
      into a new ``weights`` directory (the program quits if it already
      exists).

      Sets ``opt.src_pad``, ``opt.trg_pad`` and ``opt.train_len`` and
      returns the iterator.
      """
      print("creating dataset and iterator... ")

      raw_data = {'src': list(opt.src_data), 'trg': list(opt.trg_data)}
      df = pd.DataFrame(raw_data, columns=["src", "trg"])

      mask = (df['src'].str.count(' ') < opt.max_strlen) & (df['trg'].str.count(' ') < opt.max_strlen)
      df = df.loc[mask]

      df.to_csv("translate_transformer_temp.csv", index=False)

      try:
          data_fields = [('src', SRC), ('trg', TRG)]
          train = data.TabularDataset('./translate_transformer_temp.csv', format='csv', fields=data_fields)

          train_iter = MyIterator(train, batch_size=opt.batchsize, device=opt.device,
                                  repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                                  batch_size_fn=batch_size_fn, train=True, shuffle=True)
      finally:
          # Remove the temporary file even when loading fails.
          os.remove('translate_transformer_temp.csv')

      if opt.load_weights is None:
          SRC.build_vocab(train)
          TRG.build_vocab(train)
          if opt.checkpoint > 0:
              try:
                  os.mkdir("weights")
              except FileExistsError:
                  # Only "already exists" is handled here; permission and
                  # other OS errors now surface instead of being mislabeled.
                  print("weights folder already exists, run program with -load_weights weights to load them")
                  quit()
              with open('weights/SRC.pkl', 'wb') as handle:
                  pickle.dump(SRC, handle)
              with open('weights/TRG.pkl', 'wb') as handle:
                  pickle.dump(TRG, handle)

      opt.src_pad = SRC.vocab.stoi['<pad>']
      opt.trg_pad = TRG.vocab.stoi['<pad>']

      opt.train_len = get_len(train_iter)

      return train_iter
  def get_len(train):
      """Return the index of the last item yielded by ``train``.

      The iterable is consumed once. The result is the item count minus
      one, so an empty iterable yields -1 instead of raising
      UnboundLocalError as before.
      """
      # NOTE(review): callers use this value as a batch count although it
      # is count - 1; left unchanged to keep existing behaviour.
      last_index = -1
      for last_index, _ in enumerate(train):
          pass

      return last_index
  160. ```
  161. __MyIterator function belongs to process.py__
  162.  
  163. Iteration function for dataset
  164. ```python
  class MyIterator(data.Iterator):
      """Iterator that groups examples of similar length into batches."""

      def _pooled_batches(self, examples, shuffle):
          # Take chunks of 100 batches' worth of examples, sort each chunk
          # by the sort key, cut it into batches, and yield them shuffled.
          for chunk in data.batch(examples, self.batch_size * 100):
              ordered = sorted(chunk, key=self.sort_key)
              chunk_batches = data.batch(ordered, self.batch_size, self.batch_size_fn)
              yield from shuffle(list(chunk_batches))

      def create_batches(self):
          """Populate ``self.batches`` for training or evaluation."""
          if not self.train:
              # Evaluation: plain batching, each batch sorted by the key.
              self.batches = [
                  sorted(group, key=self.sort_key)
                  for group in data.batch(self.data(), self.batch_size,
                                          self.batch_size_fn)
              ]
              return

          self.batches = self._pooled_batches(self.data(), self.random_shuffler)
  182.  
  # Running maxima shared between successive batch_size_fn calls.
  global max_src_in_batch, max_tgt_in_batch


  def batch_size_fn(new, count, sofar):
      "Grow the current batch and return its token count including padding."
      global max_src_in_batch, max_tgt_in_batch
      if count == 1:
          # First example of a new batch: restart both running maxima.
          max_src_in_batch = max_tgt_in_batch = 0
      src_len = len(new.src)
      tgt_len = len(new.trg) + 2
      if src_len > max_src_in_batch:
          max_src_in_batch = src_len
      if tgt_len > max_tgt_in_batch:
          max_tgt_in_batch = tgt_len
      # Every example is padded to the longest one seen so far.
      return max(count * max_src_in_batch, count * max_tgt_in_batch)
  196.  
  197. ```
  198. __train.py execution continue__
  199.  
  200. ```python
  201. #train.py
  # Build the network from the two vocabulary sizes.
  src_vocab_size = len(SRC.vocab)
  trg_vocab_size = len(TRG.vocab)
  model = get_model(opt, src_vocab_size, trg_vocab_size)

  # Adam optimizer, with an optional cosine-with-restarts schedule whose
  # first cycle length is opt.train_len.
  opt.optimizer = torch.optim.Adam(
      model.parameters(), lr=opt.lr, betas=(0.9, 0.98), eps=1e-9)
  if opt.SGDR == True:
      opt.sched = CosineWithRestarts(opt.optimizer, T_max=opt.train_len)

  if opt.checkpoint > 0:
      print("model weights will be saved every %d minutes and at end of epoch to directory weights/"%(opt.checkpoint))

  # NOTE(review): the definition of -floyd is not shown for train.py; if it
  # is a store_true flag this test is always true -- confirm.
  resaving_fields = opt.load_weights is not None and opt.floyd is not None
  if resaving_fields:
      os.mkdir('weights')
      pickle.dump(SRC, open('weights/SRC.pkl', 'wb'))
      pickle.dump(TRG, open('weights/TRG.pkl', 'wb'))
  215.  
  216. ```
  217.  
  218.  
  219.  
  220. __get_model function belongs to models.py script__
  221.  
  222. The model is obtained with the get_model function, which constructs the Transformer from the source and target vocabulary sizes
  223. ```python
  224. #models.py
  def get_model(opt, src_vocab, trg_vocab):
      """Construct the Transformer and load or initialise its weights.

      ``src_vocab`` and ``trg_vocab`` are passed straight to
      ``Transformer`` together with ``opt.d_model``, ``opt.n_layers``,
      ``opt.heads`` and ``opt.dropout``.

      If ``opt.load_weights`` is set, the state dict is read from
      ``<load_weights>/model_weights``; otherwise every parameter with
      more than one dimension is Xavier-uniform initialised. The model is
      moved to CUDA when ``opt.device == 0``.
      """
      assert opt.d_model % opt.heads == 0
      assert opt.dropout < 1

      model = Transformer(src_vocab, trg_vocab, opt.d_model, opt.n_layers, opt.heads, opt.dropout)

      if opt.load_weights is not None:
          print("loading pretrained weights...")
          # Map the tensors to the CPU first so that weights saved from a
          # GPU run also load on a CPU-only machine (-no_cuda); the
          # model is moved to CUDA below when requested.
          state = torch.load(f'{opt.load_weights}/model_weights', map_location='cpu')
          model.load_state_dict(state)
      else:
          for p in model.parameters():
              if p.dim() > 1:
                  nn.init.xavier_uniform_(p)

      if opt.device == 0:
          model = model.cuda()

      return model
  244. ```
  245.  
  246. __model calls transformer__
  247. The transformer calls the encoder for the source and the decoder for the target: the encoder converts the source sentence into an intermediate representation, which the decoder uses to produce the target language
  248.  
  249. ```python
  250. #models.py
  class Transformer(nn.Module):
      """Encoder-decoder model with a final projection onto the target vocabulary."""

      def __init__(self, src_vocab, trg_vocab, d_model, N, heads, dropout):
          super().__init__()
          self.encoder = Encoder(src_vocab, d_model, N, heads, dropout)
          self.decoder = Decoder(trg_vocab, d_model, N, heads, dropout)
          # Maps each decoder output vector to one score per target word.
          self.out = nn.Linear(d_model, trg_vocab)

      def forward(self, src, trg, src_mask, trg_mask):
          """Encode ``src``, decode ``trg`` against it, and project the result."""
          memory = self.encoder(src, src_mask)
          decoded = self.decoder(trg, memory, src_mask, trg_mask)
          return self.out(decoded)
  263. ```
  264. __which then call encode and decode__
  265. embedding and positional encoding part is done in this section as encoder and decoder both use this
  266.  
  267. __Embeding__
  268. Embedding: What are embeddings and how do we use them?
  269. A key principle in NLP tasks is embedding. Originally, when performing NLP, words would be one-hot encoded, so essentially each word was represented by a single value. Embeddings improve on this by providing every word a whole array of values that the model can tune. In our model the vector will be of size 512, meaning each word has 512 values that the neural network can tweak to fully interpret its meaning.
  270. And what about preloaded word-embeddings such as GloVe and word2vec? Forget about them. Effective deep learning should be end-to-end. Let’s initialize our word vectors randomly, and get that model to learn all parameters and embeddings itself.
  271.  
  272. __Positional Encoding__
  273.  
  274. In order for the model to make sense of a sentence, it needs to know two things about each word: what does the word mean? And what is its position in the sentence?
  275. The embedding vector for each word will express the meaning, so now we need to input something that tells the network about the word’s position.
  276. The positional encoding matrix is a constant whose values are defined by a function(pos, i), where pos is the position of the word in the sentences, and i follows the embedded values.
  277. When these position specific values are added to our embedding values, each word embedding is altered in a way specific to its position in the sentence.
  278. The network is hence given information about structure, and it can use this to build understanding of the languages.
  279.  
  280.  
  281. ```python
  282. #models.py
  class Encoder(nn.Module):
      """Embedding, positional encoding, N encoder layers, then a final norm."""

      def __init__(self, vocab_size, d_model, N, heads, dropout):
          super().__init__()
          self.N = N
          self.embed = Embedder(vocab_size, d_model)
          self.pe = PositionalEncoder(d_model, dropout=dropout)
          self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
          self.norm = Norm(d_model)

      def forward(self, src, mask):
          """Run ``src`` through the stack, applying ``mask`` in every layer."""
          hidden = self.pe(self.embed(src))
          for layer_idx in range(self.N):
              layer = self.layers[layer_idx]
              hidden = layer(hidden, mask)
          return self.norm(hidden)
  297.  
  class Decoder(nn.Module):
      """Embedding, positional encoding, N decoder layers, then a final norm."""

      def __init__(self, vocab_size, d_model, N, heads, dropout):
          super().__init__()
          self.N = N
          self.embed = Embedder(vocab_size, d_model)
          self.pe = PositionalEncoder(d_model, dropout=dropout)
          self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N)
          self.norm = Norm(d_model)

      def forward(self, trg, e_outputs, src_mask, trg_mask):
          """Run ``trg`` through the stack, passing ``e_outputs`` to every layer."""
          hidden = self.pe(self.embed(trg))
          for layer_idx in range(self.N):
              layer = self.layers[layer_idx]
              hidden = layer(hidden, e_outputs, src_mask, trg_mask)
          return self.norm(hidden)
  312.  
  313. ```
  314. __norm function belongs to Sublayers.py__
  315. Mathematical normalisation of the layer activations
  316. ```python
  317. #Sublayers.py
  class Norm(nn.Module):
      """Normalise the last dimension, then apply a learnable scale and shift."""

      def __init__(self, d_model, eps = 1e-6):
          super().__init__()

          self.size = d_model
          self.eps = eps

          # Learnable parameters that calibrate the normalisation:
          # alpha scales (starts at ones), bias shifts (starts at zeros).
          self.alpha = nn.Parameter(torch.ones(self.size))
          self.bias = nn.Parameter(torch.zeros(self.size))

      def forward(self, x):
          """Return ``alpha * (x - mean) / (std + eps) + bias`` over the last dim."""
          centred = x - x.mean(dim=-1, keepdim=True)
          spread = x.std(dim=-1, keepdim=True) + self.eps
          return self.alpha * centred / spread + self.bias
  334.  
  335. ```
  336.  
  337. __encode and decode call Embedder and PositionalEncoder which belongs to script Embed.py__
  338.  
  339.  
  340. ```python
  341. #Embed.py
  class Embedder(nn.Module):
      """Thin wrapper around ``nn.Embedding`` that also records ``d_model``."""

      def __init__(self, vocab_size, d_model):
          super().__init__()
          self.d_model = d_model
          self.embed = nn.Embedding(vocab_size, d_model)

      def forward(self, x):
          """Look up the embedding vector of every index in ``x``."""
          vectors = self.embed(x)
          return vectors
  349.  
  class PositionalEncoder(nn.Module):
      """Add a constant sinusoidal position signal to embeddings, then dropout.

      The table ``pe`` has shape ``(1, max_seq_len, d_model)`` and is
      registered as a buffer.
      """

      def __init__(self, d_model, max_seq_len = 200, dropout = 0.1):
          super().__init__()
          self.d_model = d_model
          self.dropout = nn.Dropout(dropout)
          # Constant 'pe' matrix whose values depend on the position `pos`
          # and the embedding index `i`. The exponents are kept exactly as
          # in the original so previously computed tables are unchanged.
          pe = torch.zeros(max_seq_len, d_model)
          for pos in range(max_seq_len):
              for i in range(0, d_model, 2):
                  pe[pos, i] = \
                      math.sin(pos / (10000 ** ((2 * i)/d_model)))
                  # With an odd d_model the last even index has no cosine
                  # partner; skipping it avoids an out-of-range write.
                  if i + 1 < d_model:
                      pe[pos, i + 1] = \
                          math.cos(pos / (10000 ** ((2 * (i + 1))/d_model)))
          pe = pe.unsqueeze(0)
          self.register_buffer('pe', pe)

      def forward(self, x):
          """Scale ``x`` by sqrt(d_model), add the position table, apply dropout."""
          # Make the embeddings relatively larger than the position signal.
          x = x * math.sqrt(self.d_model)
          seq_len = x.size(1)
          # The old `pe.cuda()` discarded its result; move the slice to the
          # input's device explicitly instead.
          pe = self.pe[:, :seq_len].to(x.device)
          x = x + pe
          return self.dropout(x)
  378.  
  379. ```
  380. __The EncoderLayer and DecoderLayer classes belong to the Layers.py script__
  381. __layers.py__
  382. ```python
  383. #layers.py
  class EncoderLayer(nn.Module):
      """One encoder layer: self-attention then feed-forward, each with a
      pre-norm, dropout and a residual connection."""

      def __init__(self, d_model, heads, dropout=0.1):
          super().__init__()
          self.norm_1 = Norm(d_model)
          self.norm_2 = Norm(d_model)
          self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
          self.ff = FeedForward(d_model, dropout=dropout)
          self.dropout_1 = nn.Dropout(dropout)
          self.dropout_2 = nn.Dropout(dropout)

      def forward(self, x, mask):
          # Self-attention sub-layer on the normalised input.
          normed = self.norm_1(x)
          attended = self.attn(normed, normed, normed, mask)
          x = x + self.dropout_1(attended)
          # Feed-forward sub-layer on the normalised result.
          normed = self.norm_2(x)
          transformed = self.ff(normed)
          return x + self.dropout_2(transformed)
  400.  
  401. # build a decoder layer with two multi-head attention layers and
  402. # one feed-forward layer
  class DecoderLayer(nn.Module):
      """One decoder layer: self-attention, attention over the encoder
      outputs, then feed-forward; each sub-layer has a pre-norm, dropout
      and a residual connection."""

      def __init__(self, d_model, heads, dropout=0.1):
          super().__init__()
          self.norm_1 = Norm(d_model)
          self.norm_2 = Norm(d_model)
          self.norm_3 = Norm(d_model)

          self.dropout_1 = nn.Dropout(dropout)
          self.dropout_2 = nn.Dropout(dropout)
          self.dropout_3 = nn.Dropout(dropout)

          self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
          self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
          self.ff = FeedForward(d_model, dropout=dropout)

      def forward(self, x, e_outputs, src_mask, trg_mask):
          # Self-attention over the target, restricted by trg_mask.
          normed = self.norm_1(x)
          self_attended = self.attn_1(normed, normed, normed, trg_mask)
          x = x + self.dropout_1(self_attended)
          # Queries come from the decoder; keys and values are e_outputs.
          normed = self.norm_2(x)
          cross_attended = self.attn_2(normed, e_outputs, e_outputs, src_mask)
          x = x + self.dropout_2(cross_attended)
          # Position-wise feed-forward.
          normed = self.norm_3(x)
          return x + self.dropout_3(self.ff(normed))
  427. ```
  428. __MultiHeadAttention and FeedForward belongs to Sublayers.py__
  429.  
  430. __Attention__
  431. The attention and feed-forward parts of the model are described in the sections below.
  432. Once we have our embedded values (with positional encodings), we can put them through our attention function.
  433. In the decoder's second attention layer, the query comes from the decoder and the keys and values are the encoder outputs. A series of matrix multiplications combines these values, and tells the model which words from the input are important for making our next prediction.
  434. The first word we give the decoder to start translating is the 's' token (s for start). When it receives this we can see it is paying attention to let, ‘s, and look outputs from the encoder, realizing it can translate all those words to voyons.
  435. It then outputs voyons. To predict the next word we can now see it pays attention to the word inside. Attending to inside, it then predicts a and then l’ and finally intérieur. It now pays attention to the next encoder output, translates this, and so on.
  436.  
  437. __Feedforward__
  438. The feed-forward network just consists of two linear operations. That’s it. Here the network can feed on all the information generated by the attention functions and begin deciphering useful patterns and correlations.
  439.  
  440.  
  441.  
  442. ```python
  443. #Sublayers.py
  def attention(q, k, v, d_k, mask=None, dropout=None):
      """Scaled dot-product attention.

      Computes ``softmax(q @ k^T / sqrt(d_k)) @ v``. When ``mask`` is
      given it gets an extra dimension at position 1 and every position
      where it equals 0 is filled with -1e9 before the softmax. When
      ``dropout`` is given it is applied to the attention weights.
      """
      weights = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)

      if mask is not None:
          blocked = mask.unsqueeze(1) == 0
          weights = weights.masked_fill(blocked, -1e9)

      weights = F.softmax(weights, dim=-1)
      if dropout is not None:
          weights = dropout(weights)

      return torch.matmul(weights, v)
  459.  
  class MultiHeadAttention(nn.Module):
      """Multi-head attention with ``heads`` heads of width ``d_model // heads``."""

      def __init__(self, heads, d_model, dropout = 0.1):
          super().__init__()

          self.d_model = d_model
          self.d_k = d_model // heads
          self.h = heads

          # Input projections for queries, values and keys.
          self.q_linear = nn.Linear(d_model, d_model)
          self.v_linear = nn.Linear(d_model, d_model)
          self.k_linear = nn.Linear(d_model, d_model)

          self.dropout = nn.Dropout(dropout)
          self.out = nn.Linear(d_model, d_model)

      def _split_heads(self, projected, bs):
          # (bs, sl, d_model) -> (bs, h, sl, d_k)
          return projected.view(bs, -1, self.h, self.d_k).transpose(1, 2)

      def forward(self, q, k, v, mask=None):
          """Project the inputs, attend per head, and merge the heads again."""
          bs = q.size(0)

          # Linear projection followed by the split into h heads.
          k = self._split_heads(self.k_linear(k), bs)
          q = self._split_heads(self.q_linear(q), bs)
          v = self._split_heads(self.v_linear(v), bs)

          per_head = attention(q, k, v, self.d_k, mask, self.dropout)

          # Concatenate the heads and apply the final linear layer.
          merged = per_head.transpose(1, 2).contiguous().view(bs, -1, self.d_model)
          return self.out(merged)
  498.  
  class FeedForward(nn.Module):
      """Two linear layers with ReLU and dropout in between."""

      def __init__(self, d_model, d_ff=2048, dropout = 0.1):
          super().__init__()

          # d_ff is the hidden width and defaults to 2048.
          self.linear_1 = nn.Linear(d_model, d_ff)
          self.dropout = nn.Dropout(dropout)
          self.linear_2 = nn.Linear(d_ff, d_model)

      def forward(self, x):
          """Return ``linear_2(dropout(relu(linear_1(x))))``."""
          hidden = F.relu(self.linear_1(x))
          hidden = self.dropout(hidden)
          return self.linear_2(hidden)
  512.  
  513. ```
  514. __MultiHeadAttention then calls the attention function defined in the same script__
  515.  
  516. __CosineWithRestarts belongs to Optim.py__
  517.  
  518. ```python
  519. #Optim.py
  class CosineWithRestarts(torch.optim.lr_scheduler._LRScheduler):
      """
      Cosine annealing of the learning rate with restarts.

      Parameters
      ----------
      optimizer : torch.optim.Optimizer

      T_max : int
          The maximum number of iterations within the first cycle.

      eta_min : float, optional (default: 0)
          The minimum learning rate.

      last_epoch : int, optional (default: -1)
          The index of the last epoch.

      factor : float, optional (default: 1.)
          Multiplier applied to the cycle length at every restart.

      """

      def __init__(self,
                   optimizer: torch.optim.Optimizer,
                   T_max: int,
                   eta_min: float = 0.,
                   last_epoch: int = -1,
                   factor: float = 1.) -> None:
          # pylint: disable=invalid-name
          self.T_max = T_max
          self.eta_min = eta_min
          self.factor = factor
          self._last_restart: int = 0
          self._cycle_counter: int = 0
          self._cycle_factor: float = 1.
          self._updated_cycle_len: int = T_max
          self._initialized: bool = False
          # The base class constructor is called last, after the state
          # read by get_lr() has been set up.
          super(CosineWithRestarts, self).__init__(optimizer, last_epoch)

      def get_lr(self):
          """Get updated learning rate."""
          # HACK: the very first call returns the base rates untouched so
          # that the schedule effectively starts from step 0.
          if not self._initialized:
              self._initialized = True
              return self.base_lrs

          step = self.last_epoch + 1
          self._cycle_counter = step - self._last_restart

          cycle_len = self._updated_cycle_len
          position = self._cycle_counter % cycle_len
          cosine_term = np.cos(np.pi * position / cycle_len) + 1

          lrs = []
          for base_lr in self.base_lrs:
              half_range = (base_lr - self.eta_min) / 2
              lrs.append(self.eta_min + half_range * cosine_term)

          if position == 0:
              # A cycle just ended: rescale the cycle length and restart.
              self._cycle_factor *= self.factor
              self._cycle_counter = 0
              self._updated_cycle_len = int(self._cycle_factor * self.T_max)
              self._last_restart = step

          return lrs
  589.  
  590. ```
  591.  
  592.  
  593.  
  594. ### train model
  595.  
  596. __model and arguments are passed to the function__
  597. ```python
  598. #train.py continue
  599. train_model(model, opt)
  600. ```
  601. __Definition of train_model, which trains the model over multiple epochs__
  602. ```python
  603. #train.py
  def train_model(model, opt):
      """Train ``model`` for ``opt.epochs`` epochs over ``opt.train``.

      For every batch the target is shifted (input drops the last token,
      labels drop the first), masks are built with ``create_masks``, and
      a cross-entropy loss ignoring ``opt.trg_pad`` is optimised with
      ``opt.optimizer`` (and ``opt.sched`` when ``opt.SGDR`` is set).

      Progress is printed every ``opt.printevery`` batches. When
      ``opt.checkpoint > 0`` the weights are saved to
      ``weights/model_weights`` at the start of every epoch and whenever
      ``opt.checkpoint`` minutes have passed.
      """
      print("training model...")
      model.train()
      start = time.time()
      if opt.checkpoint > 0:
          cptime = time.time()

      # Defined up front so the end-of-epoch summary cannot hit an unbound
      # name when an epoch has fewer than opt.printevery batches.
      avg_loss = float('nan')

      for epoch in range(opt.epochs):

          total_loss = 0

          if opt.floyd is False:
              print(" %dm: epoch %d [%s] %d%% loss = %s" %\
              ((time.time() - start)//60, epoch + 1, "".join(' '*20), 0, '...'), end='\r')

          if opt.checkpoint > 0:
              torch.save(model.state_dict(), 'weights/model_weights')

          for i, batch in enumerate(opt.train):

              src = batch.src.transpose(0,1)
              trg = batch.trg.transpose(0,1)
              # Only move the batch to the GPU when CUDA was selected, so
              # that -no_cuda runs work on CPU-only machines.
              if opt.device == 0:
                  src = src.cuda()
                  trg = trg.cuda()
              trg_input = trg[:, :-1]

              src_mask, trg_mask = create_masks(src, trg_input, opt)
              preds = model(src, trg_input, src_mask, trg_mask)
              ys = trg[:, 1:].contiguous().view(-1)
              opt.optimizer.zero_grad()
              loss = F.cross_entropy(preds.view(-1, preds.size(-1)), ys, ignore_index=opt.trg_pad)
              loss.backward()
              opt.optimizer.step()
              if opt.SGDR == True:
                  opt.sched.step()

              total_loss += loss.item()

              if (i + 1) % opt.printevery == 0:
                  # max(..., 1) guards against a zero train_len.
                  p = int(100 * (i + 1) / max(opt.train_len, 1))
                  avg_loss = total_loss/opt.printevery
                  if opt.floyd is False:
                      print(" %dm: epoch %d [%s%s] %d%% loss = %.3f" %\
                      ((time.time() - start)//60, epoch + 1, "".join('#'*(p//5)), "".join(' '*(20-(p//5))), p, avg_loss), end='\r')
                  else:
                      print(" %dm: epoch %d [%s%s] %d%% loss = %.3f" %\
                      ((time.time() - start)//60, epoch + 1, "".join('#'*(p//5)), "".join(' '*(20-(p//5))), p, avg_loss))
                  total_loss = 0

              if opt.checkpoint > 0 and ((time.time()-cptime)//60) // opt.checkpoint >= 1:
                  torch.save(model.state_dict(), 'weights/model_weights')
                  cptime = time.time()


          print("%dm: epoch %d [%s%s] %d%% loss = %.3f\nepoch %d complete, loss = %.03f" %\
          ((time.time() - start)//60, epoch + 1, "".join('#'*(100//5)), "".join(' '*(20-(100//5))), 100, avg_loss, epoch + 1, avg_loss))
  661. ```
  662. __create_masks function belongs to Batch.py script__
  663.  
  664. ```python
  665. #Batch.py
  666.  
  def create_masks(src, trg, opt):
      """Build the padding mask for ``src`` and the combined mask for ``trg``.

      ``src_mask`` marks every position of ``src`` that differs from
      ``opt.src_pad``. When ``trg`` is given, ``trg_mask`` is its padding
      mask AND-ed with the mask returned by ``nopeak_mask`` for the target
      length; otherwise ``trg_mask`` is None.

      Returns ``(src_mask, trg_mask)``.
      """
      src_mask = (src != opt.src_pad).unsqueeze(-2)

      if trg is None:
          return src_mask, None

      trg_mask = (trg != opt.trg_pad).unsqueeze(-2)
      size = trg.size(1)  # target sequence length for the square mask
      np_mask = nopeak_mask(size, opt)
      # The old `np_mask.cuda()` discarded its result; put the mask on the
      # same device as the padding mask before combining them.
      np_mask = np_mask.to(trg_mask.device)
      trg_mask = trg_mask & np_mask
      return src_mask, trg_mask
  682. ```
  683. __which calls nopeak_mask__
  684.  
  685. ```python
  686. #Batch.py
  def nopeak_mask(size, opt):
      """Return a ``(1, size, size)`` mask that is true on and below the diagonal.

      The strictly upper triangle is false. The mask is moved to CUDA
      when ``opt.device == 0``.
      """
      upper = np.triu(np.ones((1, size, size)), k=1).astype('uint8')
      allowed = Variable(torch.from_numpy(upper) == 0)
      return allowed.cuda() if opt.device == 0 else allowed
  694.  
  695. ```
  696. ## translating with trained models
  697. ```
  698. python translate.py -load_weights weights -src_lang lang1 -trg_lang lang2
  699. ```
  700. ### __Arguments for translation are__
  701. 1. -src_lang source language
  702. 2. -trg_lang target language
  703. 3. -no_cuda for not using cuda for translation
  704. 4. -load_weights for loading trained weights of model
  705. 5. '-k', '-max_len', '-d_model', '-n_layers', '-heads', '-dropout' and '-floyd' are the other arguments.
  706.  
  707. ```python
  708. #translate.py arguments
  def main():
      """Interactive translation loop.

      Parses the command-line options, loads the fields and the model,
      then repeatedly prompts for a sentence. Entering 'q' quits and 'f'
      asks for a file path whose lines are joined with spaces and
      translated as a single text.
      """
      parser = argparse.ArgumentParser()
      parser.add_argument('-load_weights', required=True)
      parser.add_argument('-k', type=int, default=3)
      parser.add_argument('-max_len', type=int, default=80)
      parser.add_argument('-d_model', type=int, default=512)
      parser.add_argument('-n_layers', type=int, default=6)
      parser.add_argument('-src_lang', required=True)
      parser.add_argument('-trg_lang', required=True)
      parser.add_argument('-heads', type=int, default=8)
      # Dropout is a fraction; type=int rejected values such as "0.1".
      parser.add_argument('-dropout', type=float, default=0.1)
      parser.add_argument('-no_cuda', action='store_true')
      parser.add_argument('-floyd', action='store_true')

      opt = parser.parse_args()

      opt.device = 0 if opt.no_cuda is False else -1

      assert opt.k > 0
      assert opt.max_len > 10

      SRC, TRG = create_fields(opt)
      model = get_model(opt, len(SRC.vocab), len(TRG.vocab))

      while True:
          opt.text = input("Enter a sentence to translate (type 'f' to load from file, or 'q' to quit):\n")
          if opt.text == "q":
              break
          if opt.text == 'f':
              fpath = input("Enter the path of the file to translate:\n")
              try:
                  # Open the path that was just entered; previously the
                  # literal 'f' held in opt.text was opened instead.
                  with open(fpath, encoding='utf-8') as handle:
                      opt.text = ' '.join(handle.read().split('\n'))
              except (OSError, UnicodeDecodeError):
                  print("error opening or reading text file")
                  continue
          phrase = translate(opt, model, SRC, TRG)
          print('> ' + phrase + '\n')


  if __name__ == '__main__':
      main()
  750.  
  751.  
  752. ```
  753.  
  754. __translate uses WordNet to find synonyms for the languages available in WordNet__
  755.  
  756.  
  757. __For Hindi ('hi'), fastText from Facebook, which is similar to word2vec, is used to find synonyms through the gensim library__
  758. ```python
  759. #translate.py
  # Pre-trained Hindi fastText vectors used for the 'hi' synonym lookup.
  path = '/home/kapil/Downloads/Transformer-master/hi.bin'
  fastmodel = FastText.load_fasttext_format(path, full_model=True)


  def get_synonym(word, SRC, lang=None):
      """Return the vocabulary index of a known synonym of ``word``, or 0.

      Parameters:
          word: the out-of-vocabulary token.
          SRC: the source field; ``SRC.vocab.stoi`` maps tokens to indices
              and index 0 is treated as "unknown".
          lang: optional language code. Pass ``'hi'`` to search the
              fastText neighbours; any other value (the default) uses
              WordNet.

      The original test ``SRC != 'hi'`` compared the field object with a
      string and was therefore always true, so the language is now taken
      from ``lang``. Existing two-argument calls keep using WordNet.
      """
      if lang != 'hi':
          for synset in wordnet.synsets(word):
              for lemma in synset.lemmas():
                  index = SRC.vocab.stoi[lemma.name()]
                  if index != 0:
                      return index
      else:
          # most_similar yields (word, similarity) pairs; look up the
          # word, not the score as the original did.
          for candidate, similarity in fastmodel.wv.most_similar(positive=word):
              if similarity > 0.5:
                  index = SRC.vocab.stoi[candidate]
                  if index != 0:
                      return index
      return 0
  778. ```
  779. ```python
  780. #translate.py
  def multiple_replace(dict, text):
      """Replace every occurrence of a key of ``dict`` in ``text`` by its value."""
      # One alternation of all (escaped) keys, captured as a group.
      alternatives = "|".join(re.escape(key) for key in dict.keys())
      pattern = re.compile("(%s)" % alternatives)

      def _lookup(match):
          # The matched text is itself the dictionary key.
          return dict[match.string[match.start():match.end()]]

      return pattern.sub(_lookup, text)
  787.  
  def translate_sentence(sentence, model, opt, SRC, TRG):
      """Translate one sentence and tidy the spacing around punctuation.

      The sentence is preprocessed with ``SRC.preprocess`` and mapped to
      vocabulary indices. A token whose index is 0 is replaced through
      ``get_synonym`` unless ``opt.floyd`` is set. The index sequence is
      decoded with ``beam_search`` and the spaces before ``? ! . ,`` and
      after an apostrophe are removed from the result.
      """
      model.eval()
      indexed = []
      # The pasted heading text that had corrupted this call is removed.
      sentence = SRC.preprocess(sentence)
      for tok in sentence:
          if SRC.vocab.stoi[tok] != 0 or opt.floyd == True:
              indexed.append(SRC.vocab.stoi[tok])
          else:
              indexed.append(get_synonym(tok, SRC))
      sentence = Variable(torch.LongTensor([indexed]))
      if opt.device == 0:
          sentence = sentence.cuda()

      sentence = beam_search(sentence, model, SRC, TRG, opt)

      return multiple_replace({' ?' : '?',' !':'!',' .':'.','\' ':'\'',' ,':','}, sentence)
  806.  
  def translate(opt, model, SRC, TRG):
      """Translate ``opt.text`` sentence by sentence and join the results.

      The text is lower-cased and split on '.'; every piece gets a '.'
      appended, is translated, and is capitalised before being joined
      with single spaces.
      """
      pieces = opt.text.lower().split('.')
      translated = [
          translate_sentence(piece + '.', model, opt, SRC, TRG).capitalize()
          for piece in pieces
      ]
      return ' '.join(translated)
  815.  
  816. ```
  817. __beam_search belongs to Beam.py__
  818. ```python
  819. #Beam.py
  def init_vars(src, model, SRC, TRG, opt):
      """Run the first decoding step and set up the beam-search state.

      Returns ``(outputs, e_outputs, log_scores)``: a ``(k, max_len)``
      token matrix whose first two columns hold '<sos>' and the k best
      first tokens, the encoder output copied for each of the k beams,
      and the ``(1, k)`` log-probabilities of those first tokens.
      """
      use_cuda = opt.device == 0

      init_tok = TRG.vocab.stoi['<sos>']
      src_mask = (src != SRC.vocab.stoi['<pad>']).unsqueeze(-2)
      e_output = model.encoder(src, src_mask)

      # Decode one step from the start token alone.
      start = torch.LongTensor([[init_tok]])
      if use_cuda:
          start = start.cuda()

      trg_mask = nopeak_mask(1, opt)

      decoded = model.decoder(start, e_output, src_mask, trg_mask)
      out = F.softmax(model.out(decoded), dim=-1)

      probs, ix = out[:, -1].data.topk(opt.k)
      log_scores = torch.Tensor([math.log(prob) for prob in probs.data[0]]).unsqueeze(0)

      # One row per beam: start token followed by that beam's first word.
      outputs = torch.zeros(opt.k, opt.max_len).long()
      if use_cuda:
          outputs = outputs.cuda()
      outputs[:, 0] = init_tok
      outputs[:, 1] = ix[0]

      # Every beam attends to the same encoder output.
      e_outputs = torch.zeros(opt.k, e_output.size(-2), e_output.size(-1))
      if use_cuda:
          e_outputs = e_outputs.cuda()
      e_outputs[:, :] = e_output[0]

      return outputs, e_outputs, log_scores
  851.  
  def k_best_outputs(outputs, out, log_scores, i, k):
      """Extend the k beams by one token and keep the k best continuations.

      ``out[:, -1]`` supplies the probabilities of the next token for
      each beam. The k best candidates per beam are scored by adding
      their log-probability to the beam's running ``log_scores``; the
      overall k best are written into ``outputs`` (prefix columns ``:i``
      and the new column ``i``).

      Returns the updated ``outputs`` and the new ``(1, k)`` log scores.
      """
      probs, ix = out[:, -1].data.topk(k)
      step_log_probs = torch.Tensor([math.log(p) for p in probs.data.view(-1)]).view(k, -1)
      candidate_scores = step_log_probs + log_scores.transpose(0,1)
      k_probs, k_ix = candidate_scores.view(-1).topk(k)

      # Flat index -> (source beam, rank within that beam's top-k).
      row = k_ix // k
      col = k_ix % k

      outputs[:, :i] = outputs[row, :i]
      outputs[:, i] = ix[row, col]

      return outputs, k_probs.unsqueeze(0)
  867.  
  def _beam_to_text(tokens, eos_tok, TRG):
      # Join the words between the start token and the first <eos>; when the
      # beam has no <eos>, use every token after the start token.
      eos_positions = (tokens == eos_tok).nonzero()
      if eos_positions.size(0) == 0:
          length = tokens.size(0)
      else:
          length = eos_positions[0]
      return ' '.join([TRG.vocab.itos[tok] for tok in tokens[1:length]])


  def beam_search(src, model, SRC, TRG, opt):
      """Decode ``src`` with beam search and return the translation text.

      Starting from the state built by ``init_vars``, the beams are
      extended one token at a time up to ``opt.max_len``. As soon as the
      number of '<eos>' tokens in the beam matrix equals ``opt.k``, the
      beam with the best length-normalised score (exponent 0.7) is
      selected; otherwise the first beam is used.

      Previously a selected beam without '<eos>' raised IndexError; such
      a beam is now returned in full.
      """
      outputs, e_outputs, log_scores = init_vars(src, model, SRC, TRG, opt)
      eos_tok = TRG.vocab.stoi['<eos>']
      src_mask = (src != SRC.vocab.stoi['<pad>']).unsqueeze(-2)
      ind = None
      for i in range(2, opt.max_len):

          trg_mask = nopeak_mask(i, opt)

          out = model.out(model.decoder(outputs[:,:i],
          e_outputs, src_mask, trg_mask))

          out = F.softmax(out, dim=-1)

          outputs, log_scores = k_best_outputs(outputs, out, log_scores, i, opt.k)

          eos_hits = (outputs==eos_tok).nonzero()
          if eos_hits.size(0) == opt.k:
              alpha = 0.7
              # Divide each score by (position of <eos>) ** alpha so that
              # longer beams are not penalised purely for their length.
              div = 1/(eos_hits[:,1].type_as(log_scores)**alpha)
              _, ind = torch.max(log_scores * div, 1)
              ind = ind.data[0]
              break

      best = 0 if ind is None else ind
      return _beam_to_text(outputs[best], eos_tok, TRG)
  900.  
  901.  
  902. ```
  903. languageTranslation (1).md
  904. Displaying languageTranslation (1).md.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement