Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import json
from collections import Counter
from pprint import pprint

import nltk
from nltk import word_tokenize

try:
    from itertools import izip  # Python 2
except ImportError:
    # izip was removed in Python 3; the builtin zip is already lazy there.
    izip = zip
# The three SNLI gold-label classes; used as keys for the per-label counters.
labels = ['entailment', 'contradiction', 'neutral']
def get_dataset(filename):
    """Yield (hypothesis-bigrams, gold_label) pairs from an SNLI .jsonl file.

    Each line of the file is one JSON object. Examples whose gold_label is
    '-' (no annotator consensus in SNLI) are skipped. The hypothesis
    (sentence2) is lowercased, tokenized, and converted to bigrams.

    :param filename: path to an SNLI-format .jsonl file
    :returns: generator of (bigram iterator, label string) tuples
    """
    # 'with' guarantees the handle is closed even if the consumer abandons
    # the generator mid-file (the original relied on GC to close it).
    with open(filename, 'r') as fp:
        for line in fp:
            data = json.loads(line.strip())
            if data['gold_label'] != '-':
                yield (
                    nltk.bigrams(word_tokenize(data['sentence2'].lower())),
                    data['gold_label'],
                )
- if __name__ == "__main__":
- train_set = get_dataset('snli_1.0/snli_1.0_train.jsonl')
- counters = {l:Counter() for l in labels}
- print "Processing dataset...",
- for data, label in train_set:
- counters[label].update(data)
- print "Done."
- print "Smoothing...",
- vocab = set()
- for l in labels:
- vocab.update(counters[l].keys())
- for l in labels:
- counters[l].update(vocab)
- print "Done."
- result = []
- for word in vocab:
- counts = [counters[l][word] for l in labels]
- total = sum(counts)
- odds = [c / float(total - c) for c in counts]
- result.append(max(((word, l, o) for l, o in izip(labels, odds)),
- key=lambda x: x[2]))
- result.sort(key=lambda x: x[2], reverse=True)
- pprint(result[:50])
Add Comment
Please, Sign In to add comment