Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Default minimum number of corpus occurrences a word needs to enter the vocabulary.
THRESHOLD = 10
# Default fixed length (in tokens) of every encoded sentence tensor.
MAX_LEN = 60


class TextDataset(data.Dataset):
    """Map tokenized, labelled sentences to fixed-length tensors of word ids.

    Each element of ``examples`` must expose ``.text`` (an iterable of string
    tokens) and ``.label`` (compared against the string ``'positive'``).

    For the ``"train"`` split the vocabulary is built from ``examples``; for
    any other split the vocabulary mappings must be passed in (normally the
    ones built by the training dataset) so that ids agree across splits.

    Args:
        examples: Sequence of example objects (see above).
        split: Name of the split; ``"train"`` triggers vocabulary building.
        ixtoword: id -> word mapping, required when ``split != "train"``.
        wordtoix: word -> id mapping, required when ``split != "train"``.
        THRESHOLD: Minimum corpus frequency for a word to get its own id;
            rarer words are encoded as ``<unk>``.
        max_len: Length every encoded sentence is padded/truncated to.

    Raises:
        ValueError: If ``split != "train"`` and a vocabulary mapping is missing.
    """

    END_IDX = 0  # id of "<end>", also used as the padding id
    UNK_IDX = 1  # id of "<unk>", used for out-of-vocabulary words

    def __init__(self, examples, split, ixtoword=None, wordtoix=None,
                 THRESHOLD=THRESHOLD, max_len=MAX_LEN):
        super().__init__()
        self.examples = examples
        self.split = split
        self.THRESHOLD = THRESHOLD
        self.max_len = max_len
        self.vocab_size = 0
        self.textual_ids = []

        if self.split == "train":
            self.ixtoword = {}
            self.wordtoix = {}
            self.build_dictionary()
        else:
            if ixtoword is None or wordtoix is None:
                raise ValueError(
                    "ixtoword and wordtoix are required when split != 'train'"
                )
            self.ixtoword = ixtoword
            self.wordtoix = wordtoix
            self.vocab_size = len(self.ixtoword)
            # Non-train splits must be encoded too, otherwise get_text() has
            # nothing to index into.
            self._encode_examples()

    def build_dictionary(self):
        """Build the vocabulary from ``self.examples`` and encode them.

        ``<end>`` is always id 0 and ``<unk>`` id 1. Every lower-cased word
        seen at least ``self.THRESHOLD`` times gets the next free id, in
        order of first appearance.

        Returns:
            Tuple ``(textual_ids, ixtoword, wordtoix)``.
        """
        from collections import Counter

        self.ixtoword = {self.END_IDX: "<end>", self.UNK_IDX: "<unk>"}
        self.wordtoix = {"<end>": self.END_IDX, "<unk>": self.UNK_IDX}

        word_counts = Counter(
            word.lower() for sentence in self.examples for word in sentence.text
        )

        cur_id = len(self.ixtoword)
        for word, count in word_counts.items():
            # Use the per-instance threshold, not the module default, and never
            # let a corpus token overwrite the reserved special-token ids.
            if count < self.THRESHOLD or word in self.wordtoix:
                continue
            self.ixtoword[cur_id] = word
            self.wordtoix[word] = cur_id
            cur_id += 1

        self.vocab_size = cur_id
        self._encode_examples()

        print(len(self.ixtoword))
        print(len(self.wordtoix))
        return self.textual_ids, self.ixtoword, self.wordtoix

    def _encode_examples(self):
        """Fill ``self.textual_ids`` with one (unpadded) id list per example."""
        lookup = self.wordtoix
        self.textual_ids = [
            [lookup.get(word.lower(), self.UNK_IDX) for word in sentence.text]
            for sentence in self.examples
        ]

    def get_label(self, index):
        """Return 0 if the example's label is ``'positive'``, otherwise 1."""
        return 0 if self.examples[index].label == 'positive' else 1

    def get_text(self, index):
        """Return the example's word ids as a ``LongTensor`` of length ``max_len``.

        Longer sentences are truncated; shorter ones are right-padded with
        id 0. The stored id list is left untouched.
        """
        ids = self.textual_ids[index][:self.max_len]
        padded = ids + [self.END_IDX] * (self.max_len - len(ids))
        return torch.tensor(padded, dtype=torch.long)

    def __len__(self):
        """Return the number of examples."""
        return len(self.examples)

    def __getitem__(self, index):
        """Return ``(text, text_len, lbl)`` for the example at ``index``.

        ``text_len`` is the number of real (non-padding) tokens in ``text``,
        so it is capped at ``max_len`` for truncated sentences.
        """
        text = self.get_text(index)
        lbl = self.get_label(index)
        text_len = min(len(self.examples[index].text), self.max_len)
        return text, text_len, lbl
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement