Guest User

Untitled

a guest
Aug 25th, 2019
73
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. MAX_LEN = 100 # max is 512 for BERT
  2.  
  3. class text_dataset(Dataset):
  4. def __init__(self, X, y):
  5.  
  6. self.X = X
  7. self.y = y
  8.  
  9. def __getitem__(self,index):
  10.  
  11. tokenized = tokenizer.tokenize(self.X[index])
  12.  
  13. if len(tokenized) > MAX_LEN : tokenized = tokenized[:MAX_LEN]
  14.  
  15. ids = tokenizer.convert_tokens_to_ids(tokenized)
  16.  
  17. ids = torch.tensor(ids + [0] * (MAX_LEN - len(ids)))
  18.  
  19. labels = [torch.from_numpy(np.array(self.y[index]))]
  20.  
  21. return ids, labels[0]
  22.  
  23. def __len__(self):
  24. return len(self.X)
RAW Paste Data