Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # language translation
- ## training
- ```
- python train.py -src_data path/lang1.txt -trg_data path/lang2.txt -src_lang lang1 -trg_lang lang2
- ```
- ### __Arguments for training models are__
- 1. -src_data
- 2. -trg_data
- 3. if we want to use model without cuda then -no_cuda
- 4. -epochs which are the training instances
- 5. '-SGDR', '-epochs', '-d_model', '-n_layers', '-heads', '-batchsize', '-printevery', '-lr', '-load_weights', '-create_valset', '-max_strlen','-floyd','-checkpoint' these are some other Arguments.
- ```python
#train.py
# Parse CLI arguments, pick the device, and build the data pipeline.
opt = parser.parse_args()

# device 0 = first CUDA GPU, -1 = CPU
opt.device = 0 if opt.no_cuda is False else -1
if opt.device == 0:
    assert torch.cuda.is_available()

read_data(opt)
SRC, TRG = create_fields(opt)
opt.train = create_dataset(opt, SRC, TRG)
- ```
- __read_data create_fields create_dataset belongs to Process.py__
- src and target files are .txt files that contain parallel translations for the two languages.
- So read_data opens those files, strips whitespace, and splits them into lines;
- then create_fields creates __two fields__, __src__ and __trg__
- ```python
#process.py
def read_data(opt):
    """Load the parallel corpora named by opt.src_data / opt.trg_data,
    replacing each path string with a list of lines.

    Exits the program with a message if a file cannot be read.
    """
    if opt.src_data is not None:
        try:
            # `with` closes the handle; the original leaked it
            with open(opt.src_data) as f:
                opt.src_data = f.read().strip().split('\n')
        except OSError:
            print("error: '" + opt.src_data + "' file not found")
            quit()

    if opt.trg_data is not None:
        try:
            with open(opt.trg_data) as f:
                opt.trg_data = f.read().strip().split('\n')
        except OSError:
            print("error: '" + opt.trg_data + "' file not found")
            quit()
def create_fields(opt):
    """Build torchtext SRC/TRG fields with language-specific tokenizers.
    (The definition continues in the next snippet of the document.)"""
    spacy_langs = ['en', 'fr', 'de', 'es', 'pt', 'it', 'nl', 'hi']
    if opt.src_lang not in spacy_langs:
        # str() is required: concatenating a list onto a string raises TypeError
        print('invalid src language: ' + opt.src_lang + ' supported languages : ' + str(spacy_langs))
    if opt.trg_lang not in spacy_langs:
        print('invalid trg language: ' + opt.trg_lang + ' supported languages : ' + str(spacy_langs))

    print("loading spacy tokenizers...")
    t_src = tokenize(opt.src_lang)
    t_trg = tokenize(opt.trg_lang)
- ```
- __tokenize belongs to Tokenize.py script file__
- It is used for converting sentences into tokens.
- spacy is used for the tokenizer for most languages.
- for hindi cltk tokenizer is used.
- ```python
#Tokenize.py
class tokenize(object):
    """Sentence tokenizer: spacy for most languages, cltk's TokenizeSentence
    for Hindi ('hi')."""

    def __init__(self, lang):
        # spacy covers every supported language except Hindi
        if lang != 'hi':
            self.nlp = spacy.load(lang)
        self.lang = lang

    def tokenizer(self, sentence):
        """Return the list of tokens of `sentence` (also printed)."""
        if self.lang != 'hi':
            # strip punctuation/markup noise, collapse repeats, lowercase
            sentence = re.sub(
                r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " ", str(sentence))
            sentence = re.sub(r"[ ]+", " ", sentence)
            sentence = re.sub(r"\!+", "!", sentence)
            sentence = re.sub(r"\,+", ",", sentence)
            sentence = re.sub(r"\?+", "?", sentence)
            sentence = sentence.lower()
            # tokenize once; the original ran the tokenizer twice
            # (once for the print, once for the return)
            tokens = [tok.text for tok in self.nlp.tokenizer(sentence) if tok.text != " "]
        else:
            tokens = TokenizeSentence('hindi').tokenize(sentence)
        print(tokens)
        return tokens
- ```
- a temporary csv dataset is created from those tokens and fields, and the vocab is built on that dataset
- ```python
- #process.py continue
- TRG = data.Field(lower=True, tokenize=t_trg.tokenizer, init_token='<sos>', eos_token='<eos>')
- SRC = data.Field(lower=True, tokenize=t_src.tokenizer)
- if opt.load_weights is not None:
- try:
- print("loading presaved fields...")
- SRC = pickle.load(open(f'{opt.load_weights}/SRC.pkl', 'rb'))
- TRG = pickle.load(open(f'{opt.load_weights}/TRG.pkl', 'rb'))
- except:
- print("error opening SRC.pkl and TXT.pkl field files, please ensure they are in " + opt.load_weights + "/")
- quit()
- return(SRC, TRG)
def create_dataset(opt, SRC, TRG):
    """Build the training iterator from the line lists in opt, optionally
    building and pickling the vocabularies.

    Side effects: writes/removes a temp csv, sets opt.src_pad, opt.trg_pad
    and opt.train_len, and may create weights/SRC.pkl and weights/TRG.pkl.
    """
    print("creating dataset and iterator... ")

    raw_data = {'src': [line for line in opt.src_data], 'trg': [line for line in opt.trg_data]}
    df = pd.DataFrame(raw_data, columns=["src", "trg"])

    # drop sentence pairs longer than max_strlen (length approximated by space count)
    mask = (df['src'].str.count(' ') < opt.max_strlen) & (df['trg'].str.count(' ') < opt.max_strlen)
    df = df.loc[mask]

    # TabularDataset needs a file on disk, so round-trip through a throwaway csv
    df.to_csv("translate_transformer_temp.csv", index=False)

    data_fields = [('src', SRC), ('trg', TRG)]
    train = data.TabularDataset('./translate_transformer_temp.csv', format='csv', fields=data_fields)

    train_iter = MyIterator(train, batch_size=opt.batchsize, device=opt.device,
                            repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn, train=True, shuffle=True)

    os.remove('translate_transformer_temp.csv')

    if opt.load_weights is None:
        SRC.build_vocab(train)
        TRG.build_vocab(train)
        if opt.checkpoint > 0:
            try:
                os.mkdir("weights")
            except FileExistsError:  # original bare except hid unrelated OS errors
                print("weights folder already exists, run program with -load_weights weights to load them")
                quit()
            pickle.dump(SRC, open('weights/SRC.pkl', 'wb'))
            pickle.dump(TRG, open('weights/TRG.pkl', 'wb'))

    opt.src_pad = SRC.vocab.stoi['<pad>']
    opt.trg_pad = TRG.vocab.stoi['<pad>']

    opt.train_len = get_len(train_iter)

    return train_iter
def get_len(train):
    """Return the number of batches the iterator yields.

    The original returned the last `enumerate` index (i.e. count - 1) and
    raised NameError on an empty iterator; counting fixes both.
    """
    return sum(1 for _ in train)
- ```
- __MyIterator function belongs to process.py__
- Iteration function for dataset
- ```python
class MyIterator(data.Iterator):
    """torchtext Iterator that pools examples of similar length to minimise
    padding within each batch."""

    def create_batches(self):
        if self.train:
            def pool(d, random_shuffler):
                # take a large chunk, sort it by length, cut into batches,
                # then shuffle the batches so training order still varies
                for p in data.batch(d, self.batch_size * 100):
                    p_batch = data.batch(
                        sorted(p, key=self.sort_key),
                        self.batch_size, self.batch_size_fn)
                    for b in random_shuffler(list(p_batch)):
                        yield b
            self.batches = pool(self.data(), self.random_shuffler)
        else:
            self.batches = []
            for b in data.batch(self.data(), self.batch_size,
                                self.batch_size_fn):
                self.batches.append(sorted(b, key=self.sort_key))
# module-level `global` is a no-op; kept to document the state shared
# with batch_size_fn below
global max_src_in_batch, max_tgt_in_batch

def batch_size_fn(new, count, sofar):
    """Keep augmenting batch and calculate total number of tokens + padding."""
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        # first example of a new batch: reset the running maxima
        max_src_in_batch = 0
        max_tgt_in_batch = 0
    max_src_in_batch = max(max_src_in_batch, len(new.src))
    max_tgt_in_batch = max(max_tgt_in_batch, len(new.trg) + 2)  # +2 for <sos>/<eos>
    src_elements = count * max_src_in_batch
    tgt_elements = count * max_tgt_in_batch
    # padded size of the batch = count * longest sequence, on the larger side
    return max(src_elements, tgt_elements)
- ```
- __train.py execution continue__
- ```python
#train.py
# Build the model and optimizer; optionally attach the SGDR schedule.
model = get_model(opt, len(SRC.vocab), len(TRG.vocab))

opt.optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr, betas=(0.9, 0.98), eps=1e-9)
if opt.SGDR == True:
    opt.sched = CosineWithRestarts(opt.optimizer, T_max=opt.train_len)

if opt.checkpoint > 0:
    print("model weights will be saved every %d minutes and at end of epoch to directory weights/"%(opt.checkpoint))

# NOTE(review): -floyd is an argparse store_true flag, so it is False (never
# None) when absent; `opt.floyd is not None` is therefore always True and
# this branch runs whenever load_weights is set — confirm whether
# `opt.floyd is True` was intended.
if opt.load_weights is not None and opt.floyd is not None:
    os.mkdir('weights')
    pickle.dump(SRC, open('weights/SRC.pkl', 'wb'))
    pickle.dump(TRG, open('weights/TRG.pkl', 'wb'))
- ```
- __get_model function belongs to models.py script__
- Getting the model with get_model function which call transform with src and target vocablary
- ```python
#models.py
def get_model(opt, src_vocab, trg_vocab):
    """Construct a Transformer, either loading pretrained weights or
    initializing matrix parameters with Xavier uniform."""
    assert opt.d_model % opt.heads == 0  # heads must evenly divide d_model
    assert opt.dropout < 1

    model = Transformer(src_vocab, trg_vocab, opt.d_model, opt.n_layers, opt.heads, opt.dropout)

    if opt.load_weights is not None:
        print("loading pretrained weights...")
        model.load_state_dict(torch.load(f'{opt.load_weights}/model_weights'))
    else:
        # only init weight matrices; 1-D params (biases etc.) keep defaults
        for p in model.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    if opt.device == 0:
        model = model.cuda()

    return model
- ```
- __model calls transformer__
- the transformer calls an encoder and a decoder for the src and target respectively, converting both languages into an intermediate representation
- ```python
#models.py
class Transformer(nn.Module):
    """Full encoder-decoder transformer; `out` projects decoder states to
    target-vocabulary logits (softmax is applied by the loss / beam search)."""

    def __init__(self, src_vocab, trg_vocab, d_model, N, heads, dropout):
        super().__init__()
        self.encoder = Encoder(src_vocab, d_model, N, heads, dropout)
        self.decoder = Decoder(trg_vocab, d_model, N, heads, dropout)
        self.out = nn.Linear(d_model, trg_vocab)

    def forward(self, src, trg, src_mask, trg_mask):
        e_outputs = self.encoder(src, src_mask)
        d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
        return self.out(d_output)
- ```
- __which then call encode and decode__
- embedding and positional encoding part is done in this section as encoder and decoder both use this
- __Embeding__
- Embedding: What are embeddings and how do we use them?
- A key principle in NLP tasks is embedding. Originally, when performing NLP, words would be one hot encoded, and so essentially each word was represented by a single value, by providing every word a whole array of values that the model can tune. In our model the vector will be of size 512, meaning each word has 512 values that the neural network can tweak to fully interpret its meaning.
- And what about preloaded word-embeddings such as GloVe and word2vec? Forget about them. Effective deep learning should be end-to-end. Let’s initialize our word vectors randomly, and get that model to learn all parameters and embeddings itself.
- __Positional Encoding__
- In order for the model to make sense of a sentence, it needs to know two things about each word: what does the word mean? And what is its position in the sentence?
- The embedding vector for each word will express the meaning, so now we need to input something that tells the network about the word’s position.
- The positional encoding matrix is a constant whose values are defined by a function(pos, i), where pos is the position of the word in the sentences, and i follows the embedded values.
- When these position specific values are added to our embedding values, each word embedding is altered in a way specific to its position in the sentence.
- The network is hence given information about structure, and it can use this to build understanding of the languages.
- ```python
#models.py
class Encoder(nn.Module):
    """Stack of N EncoderLayers over embedded + positionally-encoded input,
    with a final layer norm."""

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, src, mask):
        x = self.pe(self.embed(src))
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)
class Decoder(nn.Module):
    """Stack of N DecoderLayers over the embedded + positionally-encoded
    target, attending to the encoder outputs, with a final layer norm."""

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, trg, e_outputs, src_mask, trg_mask):
        x = self.pe(self.embed(trg))
        for layer in self.layers:
            x = layer(x, e_outputs, src_mask, trg_mask)
        return self.norm(x)
- ```
- __norm function belongs to Sublayers.py__
- Mathematical normalisation (layer normalisation) of the model's activations
- ```python
#Sublayers.py
class Norm(nn.Module):
    """Layer normalisation over the last dimension, with learnable gain
    (alpha) and bias."""

    def __init__(self, d_model, eps=1e-6):
        super().__init__()
        self.size = d_model
        # two learnable parameters to calibrate normalisation
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        self.eps = eps  # guards against division by zero

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)
        return self.alpha * (x - mean) / (std + self.eps) + self.bias
- ```
- __encode and decode call Embedder and PositionalEncoder which belongs to script Embed.py__
- ```python
#Embed.py
class Embedder(nn.Module):
    """Thin wrapper over nn.Embedding: token ids -> d_model vectors."""

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embed = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        return self.embed(x)
class PositionalEncoder(nn.Module):
    """Adds the constant sin/cos positional-encoding matrix to embeddings.

    pe[pos, 2i]   = sin(pos / 10000**(2i / d_model))
    pe[pos, 2i+1] = cos(pos / 10000**(2(i+1) / d_model))
    """

    def __init__(self, d_model, max_seq_len=200, dropout=0.1):
        super().__init__()
        self.d_model = d_model  # assumes d_model is even (sin/cos pairs)
        self.dropout = nn.Dropout(dropout)
        # constant 'pe' matrix whose values depend only on pos and i
        pe = torch.zeros(max_seq_len, d_model)
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = \
                    math.sin(pos / (10000 ** ((2 * i) / d_model)))
                pe[pos, i + 1] = \
                    math.cos(pos / (10000 ** ((2 * (i + 1)) / d_model)))
        pe = pe.unsqueeze(0)  # (1, max_seq_len, d_model) for broadcasting
        self.register_buffer('pe', pe)

    def forward(self, x):
        # make embeddings relatively larger so positions don't dominate
        x = x * math.sqrt(self.d_model)
        # add the constant encoding, sliced to the current sequence length
        seq_len = x.size(1)
        pe = self.pe[:, :seq_len]  # deprecated Variable wrapper removed
        if x.is_cuda:
            # original called pe.cuda() and discarded the result (a no-op);
            # keep the returned tensor
            pe = pe.cuda()
        x = x + pe
        return self.dropout(x)
- ```
- __EncoderLayer and DecoderLayer are classes that belong to the Layers.py script__
- __layers.py__
- ```python
#layers.py
class EncoderLayer(nn.Module):
    """Pre-norm encoder block: self-attention then feed-forward, each with
    dropout and a residual connection."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x2, x2, x2, mask))  # residual
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))  # residual
        return x
# build a decoder layer with two multi-head attention layers and
# one feed-forward layer
class DecoderLayer(nn.Module):
    """Pre-norm decoder block: masked self-attention, cross-attention over
    the encoder outputs, then feed-forward — each with dropout + residual."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)

        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        # masked self-attention over the target so far
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))
        # cross-attention: queries from decoder, keys/values from encoder
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, src_mask))
        x2 = self.norm_3(x)
        x = x + self.dropout_3(self.ff(x2))
        return x
- ```
- __MultiHeadAttention and FeedForward belongs to Sublayers.py__
- __Attention__
- the attention and feedforward sections of the model are described below.
- Once we have our embedded values (with positional encodings), we can put them through our attention function.
- In the decoder, the query will be the encoder outputs and the key and value will be the decoder outputs. A series of matrix multiplications combines these values, and tells the model which words from the input are important for making our next prediction.
- The first word we give the decoder to start translating is the 's' token (s for start). When it receives this we can see it is paying attention to let, ‘s, and look outputs from the encoder, realizing it can translate all those words to voyons.
- It then outputs voyons. To predict the next word we can now see it pays attention to the word inside. Attending to inside, it then predicts a and then l’ and finally intérieur. It now pays attention to the next encoder output, translates this, and so on.
- __Feedforward__
- The feed-forward network just consists of two linear operations. That’s it. Here the network can feed on all the information generated by the attention functions and begin deciphering useful patterns and correlations.
- ```python
#Sublayers.py
def attention(q, k, v, d_k, mask=None, dropout=None):
    """Scaled dot-product attention; positions where mask == 0 are blocked."""
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)

    if mask is not None:
        mask = mask.unsqueeze(1)  # broadcast over the heads dimension
        scores = scores.masked_fill(mask == 0, -1e9)  # ~ -inf before softmax

    scores = F.softmax(scores, dim=-1)

    if dropout is not None:
        scores = dropout(scores)

    return torch.matmul(scores, v)
class MultiHeadAttention(nn.Module):
    """Multi-head attention: project q/k/v, split into h heads of size d_k,
    attend, then concatenate heads and project back to d_model."""

    def __init__(self, heads, d_model, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        bs = q.size(0)

        # linear projections, then split the last dim into h heads of d_k
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k)

        # transpose to (bs, h, seq_len, d_k)  (original comment said d_model)
        k = k.transpose(1, 2)
        q = q.transpose(1, 2)
        v = v.transpose(1, 2)

        # scaled dot-product attention per head
        scores = attention(q, k, v, self.d_k, mask, self.dropout)

        # concatenate heads and put through the final linear layer
        concat = scores.transpose(1, 2).contiguous().view(bs, -1, self.d_model)
        return self.out(concat)
class FeedForward(nn.Module):
    """Position-wise feed-forward: linear -> ReLU -> dropout -> linear."""

    def __init__(self, d_model, d_ff=2048, dropout=0.1):
        super().__init__()
        # d_ff defaults to 2048
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        x = self.dropout(F.relu(self.linear_1(x)))
        return self.linear_2(x)
- ```
- __which then call Attention function of the same class__
- __CosineWithRestarts belongs to Optim.py__
- ```python
#Optim.py
class CosineWithRestarts(torch.optim.lr_scheduler._LRScheduler):
    """
    Cosine annealing with restarts.

    Parameters
    ----------
    optimizer : torch.optim.Optimizer
    T_max : int
        The maximum number of iterations within the first cycle.
    eta_min : float, optional (default: 0)
        The minimum learning rate.
    last_epoch : int, optional (default: -1)
        The index of the last epoch.
    factor : float, optional (default: 1)
        Multiplier applied to the cycle length at each restart.
    """

    def __init__(self,
                 optimizer: torch.optim.Optimizer,
                 T_max: int,
                 eta_min: float = 0.,
                 last_epoch: int = -1,
                 factor: float = 1.) -> None:
        # pylint: disable=invalid-name
        self.T_max = T_max
        self.eta_min = eta_min
        self.factor = factor
        self._last_restart: int = 0
        self._cycle_counter: int = 0
        self._cycle_factor: float = 1.
        self._updated_cycle_len: int = T_max
        self._initialized: bool = False
        super(CosineWithRestarts, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        """Get updated learning rate."""
        # HACK: We need to check if this is the first time get_lr() was called,
        # since we want to start with step = 0, but _LRScheduler calls get_lr
        # with last_epoch + 1 when initialized.
        if not self._initialized:
            self._initialized = True
            return self.base_lrs

        step = self.last_epoch + 1
        self._cycle_counter = step - self._last_restart

        # cosine interpolation between base lr and eta_min within the cycle
        lrs = [
            self.eta_min + ((lr - self.eta_min) / 2) *
            (
                np.cos(
                    np.pi *
                    (self._cycle_counter % self._updated_cycle_len) /
                    self._updated_cycle_len
                ) + 1
            )
            for lr in self.base_lrs
        ]

        if self._cycle_counter % self._updated_cycle_len == 0:
            # end of a cycle: restart and rescale the next cycle's length
            self._cycle_factor *= self.factor
            self._cycle_counter = 0
            self._updated_cycle_len = int(self._cycle_factor * self.T_max)
            self._last_restart = step

        return lrs
- ```
- ### train model
- __model and arguments are passed to the function__
- ```python
#train.py continue
train_model(model, opt)
- ```
- __definition of model training using multiple epochs__
- ```python
#train.py
def train_model(model, opt):
    """Run opt.epochs passes over opt.train, printing progress and
    periodically checkpointing weights to weights/model_weights."""
    print("training model...")
    model.train()
    start = time.time()
    if opt.checkpoint > 0:
        cptime = time.time()

    for epoch in range(opt.epochs):
        total_loss = 0
        # defined up-front so the end-of-epoch print cannot NameError when
        # the epoch has fewer than opt.printevery batches
        avg_loss = 0.0
        if opt.floyd is False:
            print(" %dm: epoch %d [%s] %d%% loss = %s" %\
                  ((time.time() - start)//60, epoch + 1, "".join(' '*20), 0, '...'), end='\r')

        if opt.checkpoint > 0:
            torch.save(model.state_dict(), 'weights/model_weights')

        for i, batch in enumerate(opt.train):
            src = batch.src.transpose(0, 1)
            trg = batch.trg.transpose(0, 1)
            # original called .cuda() unconditionally, crashing CPU runs
            if opt.device == 0:
                src = src.cuda()
                trg = trg.cuda()
            trg_input = trg[:, :-1]  # decoder input: everything but the last token
            src_mask, trg_mask = create_masks(src, trg_input, opt)
            preds = model(src, trg_input, src_mask, trg_mask)
            ys = trg[:, 1:].contiguous().view(-1)  # targets: everything but <sos>
            opt.optimizer.zero_grad()
            # padding positions don't contribute to the loss
            loss = F.cross_entropy(preds.view(-1, preds.size(-1)), ys, ignore_index=opt.trg_pad)
            loss.backward()
            opt.optimizer.step()
            if opt.SGDR:
                opt.sched.step()

            total_loss += loss.item()

            if (i + 1) % opt.printevery == 0:
                p = int(100 * (i + 1) / opt.train_len)
                avg_loss = total_loss/opt.printevery
                if opt.floyd is False:
                    print(" %dm: epoch %d [%s%s] %d%% loss = %.3f" %\
                          ((time.time() - start)//60, epoch + 1, "".join('#'*(p//5)), "".join(' '*(20-(p//5))), p, avg_loss), end='\r')
                else:
                    print(" %dm: epoch %d [%s%s] %d%% loss = %.3f" %\
                          ((time.time() - start)//60, epoch + 1, "".join('#'*(p//5)), "".join(' '*(20-(p//5))), p, avg_loss))
                total_loss = 0

            # time-based checkpoint every opt.checkpoint minutes
            if opt.checkpoint > 0 and ((time.time()-cptime)//60) // opt.checkpoint >= 1:
                torch.save(model.state_dict(), 'weights/model_weights')
                cptime = time.time()

        print("%dm: epoch %d [%s%s] %d%% loss = %.3f\nepoch %d complete, loss = %.03f" %\
              ((time.time() - start)//60, epoch + 1, "".join('#'*(100//5)), "".join(' '*(20-(100//5))), 100, avg_loss, epoch + 1, avg_loss))
- ```
- __create_masks function belongs to Batch.py script__
- ```python
#Batch.py
def create_masks(src, trg, opt):
    """Build the src padding mask and the combined padding + no-peek mask
    for trg (trg_mask is None when trg is None, e.g. at inference init)."""
    src_mask = (src != opt.src_pad).unsqueeze(-2)

    if trg is not None:
        trg_mask = (trg != opt.trg_pad).unsqueeze(-2)
        size = trg.size(1)  # seq_len for the square subsequent-position mask
        np_mask = nopeak_mask(size, opt)
        if trg.is_cuda:
            # original called np_mask.cuda() without keeping the result
            np_mask = np_mask.cuda()
            trg_mask = trg_mask.cuda()
        trg_mask = trg_mask & np_mask
    else:
        trg_mask = None
    return src_mask, trg_mask
- ```
- __which calls nopeak_mask__
- ```python
#Batch.py
def nopeak_mask(size, opt):
    """Return a (1, size, size) boolean mask, True at and below the diagonal,
    so the decoder cannot attend to future positions."""
    np_mask = np.triu(np.ones((1, size, size)), k=1).astype('uint8')
    # deprecated Variable wrapper removed; torch.from_numpy suffices
    np_mask = torch.from_numpy(np_mask) == 0
    if opt.device == 0:
        np_mask = np_mask.cuda()
    return np_mask
- ```
- ## translating with trained models
- ```
- python translate.py -load_weights weights -src_lang -trg_lang
- ```
- ### __Arguments for translation are__
- 1. -src_lang source language
- 2. -trg_lang target language
- 3. -no_cuda for not using cuda for translation
- 4. -load_weights for loading trained weights of model
- 5. '-max_len', '-d_model', type=int, '-n_layers', '-heads', '-dropout', '-floyd'.
- ```python
#translate.py arguments
def main():
    """Parse CLI args, load fields and model, then run an interactive
    translation loop until the user enters 'q'."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-load_weights', required=True)
    parser.add_argument('-k', type=int, default=3)  # beam width
    parser.add_argument('-max_len', type=int, default=80)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-src_lang', required=True)
    parser.add_argument('-trg_lang', required=True)
    parser.add_argument('-heads', type=int, default=8)
    # was type=int: passing "-dropout 0.1" on the CLI raised ValueError
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-floyd', action='store_true')

    opt = parser.parse_args()
    opt.device = 0 if opt.no_cuda is False else -1

    assert opt.k > 0
    assert opt.max_len > 10

    SRC, TRG = create_fields(opt)
    model = get_model(opt, len(SRC.vocab), len(TRG.vocab))

    while True:
        opt.text = input("Enter a sentence to translate (type 'f' to load from file, or 'q' to quit):\n")
        if opt.text == "q":
            break
        if opt.text == 'f':
            fpath = input("Enter the path of the file to translate:\n")
            try:
                # original opened opt.text (the literal string 'f') instead
                # of the path just entered by the user
                opt.text = ' '.join(open(fpath, encoding='utf-8').read().split('\n'))
            except OSError:
                print("error opening or reading text file")
                continue
        phrase = translate(opt, model, SRC, TRG)
        print('> ' + phrase + '\n')

if __name__ == '__main__':
    main()
- ```
- __translate uses wordnet for languages available in wordnet__
- __for hindi as 'hi' fasttext from facebook which is a similar to word2vec is used to find synonym and uses library gensim__
- ```python
#translate.py
path = '/home/kapil/Downloads/Transformer-master/hi.bin'
fastmodel = FastText.load_fasttext_format(path, full_model=True)

def get_synonym(word, SRC, lang=None):
    """Return the vocab index of an in-vocabulary replacement for an OOV
    word, or 0 when none is found.

    Uses wordnet synonyms, or fasttext nearest neighbours when lang == 'hi'.
    `lang` defaults to None to preserve the original two-argument call:
    the original compared the SRC *Field object* to 'hi', which was always
    unequal, so the hindi branch could never run.
    """
    if lang != 'hi':
        syns = wordnet.synsets(word)
        for s in syns:
            for l in s.lemmas():
                if SRC.vocab.stoi[l.name()] != 0:
                    return SRC.vocab.stoi[l.name()]
    else:
        # most_similar yields (word, similarity) pairs; the original indexed
        # stoi with the similarity score (i[1]) instead of the word (i[0])
        result = fastmodel.wv.most_similar(positive=word)
        for candidate, score in result:
            if score > 0.5:
                if SRC.vocab.stoi[candidate] != 0:
                    return SRC.vocab.stoi[candidate]
    return 0
- ```
- ```python
#translate.py
def multiple_replace(dict, text):
    """Apply every key -> value substitution in `dict` to `text` in a single
    regex pass. (`dict` shadows the builtin; name kept for compatibility.)"""
    # one alternation of all the escaped keys
    pattern = re.compile("(%s)" % "|".join(map(re.escape, dict.keys())))
    # each match's exact text is looked up in the mapping
    return pattern.sub(lambda match: dict[match.group(0)], text)
def translate_sentence(sentence, model, opt, SRC, TRG):
    """Tokenize and index one sentence (substituting synonyms for OOV
    tokens), beam-search decode it, and tidy spacing around punctuation."""
    model.eval()
    indexed = []
    # (the pasted markdown had stray heading text inside this call)
    sentence = SRC.preprocess(sentence)
    for tok in sentence:
        if SRC.vocab.stoi[tok] != 0 or opt.floyd == True:
            indexed.append(SRC.vocab.stoi[tok])
        else:
            # OOV token: fall back to the index of a known synonym
            indexed.append(get_synonym(tok, SRC))
    sentence = Variable(torch.LongTensor([indexed]))
    if opt.device == 0:
        sentence = sentence.cuda()
    sentence = beam_search(sentence, model, SRC, TRG, opt)
    return multiple_replace({' ?' : '?',' !':'!',' .':'.','\' ':'\'',' ,':','}, sentence)
def translate(opt, model, SRC, TRG):
    """Split opt.text on '.', translate each sentence, capitalize, re-join."""
    sentences = opt.text.lower().split('.')
    translated = []
    for sentence in sentences:
        translated.append(translate_sentence(sentence + '.', model, opt, SRC, TRG).capitalize())
    return (' '.join(translated))
- ```
- __beam_search belongs to Beam.py__
- ```python
#Beam.py
def init_vars(src, model, SRC, TRG, opt):
    """Run the first decoding step and set up the k beams.

    Returns (outputs, e_outputs, log_scores): the (k, max_len) token matrix
    seeded with <sos> plus the k best first tokens, the encoder output
    replicated over the k beams, and the running log-probabilities.
    """
    init_tok = TRG.vocab.stoi['<sos>']
    src_mask = (src != SRC.vocab.stoi['<pad>']).unsqueeze(-2)
    e_output = model.encoder(src, src_mask)

    outputs = torch.LongTensor([[init_tok]])
    if opt.device == 0:
        outputs = outputs.cuda()

    trg_mask = nopeak_mask(1, opt)
    out = model.out(model.decoder(outputs, e_output, src_mask, trg_mask))
    out = F.softmax(out, dim=-1)

    probs, ix = out[:, -1].data.topk(opt.k)
    log_scores = torch.Tensor([math.log(prob) for prob in probs.data[0]]).unsqueeze(0)

    outputs = torch.zeros(opt.k, opt.max_len).long()
    if opt.device == 0:
        outputs = outputs.cuda()
    outputs[:, 0] = init_tok
    outputs[:, 1] = ix[0]

    # replicate the encoder output across the k beams
    e_outputs = torch.zeros(opt.k, e_output.size(-2), e_output.size(-1))
    if opt.device == 0:
        e_outputs = e_outputs.cuda()
    e_outputs[:, :] = e_output[0]

    return outputs, e_outputs, log_scores
def k_best_outputs(outputs, out, log_scores, i, k):
    """Expand each of the k beams by its top-k next tokens and keep the k
    globally best sequences; writes position i of `outputs` in place."""
    probs, ix = out[:, -1].data.topk(k)
    # total log-probability of every (beam, candidate) pair
    log_probs = torch.Tensor([math.log(p) for p in probs.data.view(-1)]).view(k, -1) + log_scores.transpose(0, 1)
    k_probs, k_ix = log_probs.view(-1).topk(k)

    row = k_ix // k  # which source beam each winner came from
    col = k_ix % k   # which of that beam's candidates won
    outputs[:, :i] = outputs[row, :i]  # copy the winning prefixes
    outputs[:, i] = ix[row, col]

    log_scores = k_probs.unsqueeze(0)
    return outputs, log_scores
def beam_search(src, model, SRC, TRG, opt):
    """Beam-search decode `src` and return the detokenized best translation."""
    outputs, e_outputs, log_scores = init_vars(src, model, SRC, TRG, opt)
    eos_tok = TRG.vocab.stoi['<eos>']
    src_mask = (src != SRC.vocab.stoi['<pad>']).unsqueeze(-2)
    ind = None
    for i in range(2, opt.max_len):
        trg_mask = nopeak_mask(i, opt)
        out = model.out(model.decoder(outputs[:, :i], e_outputs, src_mask, trg_mask))
        out = F.softmax(out, dim=-1)
        outputs, log_scores = k_best_outputs(outputs, out, log_scores, i, opt.k)

        # stop once every beam has produced <eos>
        if (outputs == eos_tok).nonzero().size(0) == opt.k:
            alpha = 0.7  # length-normalisation exponent
            div = 1 / ((outputs == eos_tok).nonzero()[:, 1].type_as(log_scores) ** alpha)
            _, ind = torch.max(log_scores * div, 1)
            ind = ind.data[0]
            break

    if ind is None:
        # max_len reached before all beams finished: fall back to beam 0
        # NOTE(review): this indexing raises IndexError if beam 0 never
        # produced <eos>; confirm callers tolerate that
        length = (outputs[0] == eos_tok).nonzero()[0]
        return ' '.join([TRG.vocab.itos[tok] for tok in outputs[0][1:length]])
    else:
        length = (outputs[ind] == eos_tok).nonzero()[0]
        return ' '.join([TRG.vocab.itos[tok] for tok in outputs[ind][1:length]])
- ```
- languageTranslation (1).md
- Displaying languageTranslation (1).md.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement