import math

import torch
import torch.nn as nn
import torchvision.models as models
from torch.nn import TransformerDecoderLayer, TransformerDecoder
class EncoderCNN(nn.Module):
    """ResNet-152 feature extractor that projects the final feature map
    down to embed_size channels for use as transformer decoder memory."""

    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        resnet = models.resnet152(pretrained=True)
        # Drop the average-pool and fc head, keeping the spatial feature map.
        self.resnet = nn.Sequential(*list(resnet.children())[:-2])
        # 1x1 conv to project the 2048 ResNet channels down to embed_size.
        self.conv1 = nn.Conv2d(2048, embed_size, 1)
        self.embed_size = embed_size
        self.fine_tune()

    def forward(self, images):
        features = self.resnet(images)                  # (batch, 2048, H, W)
        batch_size, _, _, _ = features.shape
        features = self.conv1(features)                 # (batch, embed_size, H, W)
        features = features.view(batch_size, self.embed_size, -1)
        features = features.permute(2, 0, 1)            # (H*W, batch, embed_size)
        return features

    def fine_tune(self, fine_tune=True):
        # Freeze the whole trunk, then unfreeze the later ResNet blocks.
        for p in self.resnet.parameters():
            p.requires_grad = False
        for c in list(self.resnet.children())[5:]:
            for p in c.parameters():
                p.requires_grad = fine_tune
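
# Shape sketch (assumes 224x224 inputs, which this paste does not pin down):
# the ResNet trunk maps 224x224 images to a 7x7 grid, so the encoder yields
# (49, batch, embed_size) -- the (S, N, E) memory layout that
# nn.TransformerDecoder expects.
#
#   enc = EncoderCNN(embed_size=512)
#   feats = enc(torch.randn(4, 3, 224, 224))   # torch.Size([49, 4, 512])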

class PositionalEncoder(nn.Module):
    """Standard sinusoidal positional encoding."""

    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoder, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)    # even dimensions
        pe[:, 1::2] = torch.cos(position * div_term)    # odd dimensions
        pe = pe.unsqueeze(0).transpose(0, 1)            # (max_len, 1, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: (seq_len, batch, d_model); positions broadcast over the batch.
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
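
# The buffer realizes the sinusoidal encoding from "Attention Is All You
# Need": PE(pos, 2i) = sin(pos / 10000^(2i/d_model)) and
# PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model)); div_term computes the
# 1 / 10000^(2i/d_model) factor in log space for numerical stability.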

class Embedder(nn.Module):
    """Token-id to d_model embedding lookup."""

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        return self.embed(x)

class Transformer(nn.Module):
    """Transformer decoder that attends over the CNN features (mem) to
    generate caption tokens."""

    def __init__(self, vocab_size, d_model, h, num_hidden, N, device,
                 dropout_dec=0.1, dropout_pos=0.1):
        super(Transformer, self).__init__()
        decoder_layers = TransformerDecoderLayer(d_model, h, num_hidden, dropout_dec)
        self.source_mask = None
        self.device = device
        self.d_model = d_model
        self.pos_decoder = PositionalEncoder(d_model, dropout_pos)
        self.decoder = TransformerDecoder(decoder_layers, N)
        self.embed = Embedder(vocab_size, d_model)
        self.linear = nn.Linear(d_model, vocab_size)
        self.init_weights()

    def forward(self, source, mem):
        # source: (batch, seq_len) token ids -> (seq_len, batch) for the decoder.
        source = source.permute(1, 0)
        if self.source_mask is None or self.source_mask.size(0) != len(source):
            # Causal mask so each position only attends to earlier positions
            # (generate_square_subsequent_mask is static in recent PyTorch).
            self.source_mask = nn.Transformer.generate_square_subsequent_mask(len(source)).to(self.device)
        source = self.embed(source) * math.sqrt(self.d_model)
        source = self.pos_decoder(source)
        output = self.decoder(source, mem, tgt_mask=self.source_mask)
        output = self.linear(output)    # (seq_len, batch, vocab_size)
        return output

    def init_weights(self):
        initrange = 0.1
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)
    def pred(self, memory, pred_len):
        """Greedy decoding: re-run the decoder on the sequence so far and
        take the argmax token at each step."""
        batch_size = memory.size(1)
        # Token id 2 is used as the start-of-sequence token here.
        src = torch.ones((pred_len, batch_size), dtype=torch.long) * 2
        if self.source_mask is None or self.source_mask.size(0) != len(src):
            self.source_mask = nn.Transformer.generate_square_subsequent_mask(len(src)).to(self.device)
        output = torch.ones((pred_len, batch_size), dtype=torch.long)
        src, output = src.to(self.device), output.to(self.device)
        for i in range(pred_len):
            src_emb = self.embed(src) * math.sqrt(self.d_model)  # (pred_len, batch, d_model)
            src_emb = self.pos_decoder(src_emb)
            out = self.decoder(src_emb, memory, tgt_mask=self.source_mask)
            out = self.linear(out[i])           # (batch, vocab_size) at step i
            out = out.argmax(dim=1)
            if i < pred_len - 1:
                src[i + 1] = out                # feed the prediction back in
            output[i] = out
        return output
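
# Usage sketch: wiring the encoder and decoder together. The vocab size,
# hyperparameters, and input resolution below are illustrative assumptions,
# not values taken from this paste.
if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder = EncoderCNN(embed_size=512).to(device)
    decoder = Transformer(vocab_size=10000, d_model=512, h=8,
                          num_hidden=2048, N=6, device=device).to(device)

    images = torch.randn(4, 3, 224, 224, device=device)
    captions = torch.randint(0, 10000, (4, 20), device=device)  # (batch, seq_len)

    memory = encoder(images)            # (49, 4, 512) CNN features as memory
    logits = decoder(captions, memory)  # (20, 4, 10000) next-token scores

    with torch.no_grad():
        tokens = decoder.pred(memory, pred_len=20)  # (20, 4) greedy token ids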