Guest User

Untitled

a guest
Feb 25th, 2018
62
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.25 KB | None | 0 0
  1. import nltk
  2. import json
  3. from nltk import word_tokenize
  4. from collections import Counter
  5. from itertools import izip
  6. from pprint import pprint
  7. labels = ['entailment', 'contradiction', 'neutral']
  8.  
  9.  
  10. def get_dataset(filename):
  11. for line in open(filename, 'r'):
  12. data = json.loads(line.strip())
  13. if data['gold_label'] != '-':
  14. yield (
  15. nltk.bigrams(word_tokenize(data['sentence2'].lower())),
  16. data['gold_label']
  17. )
  18.  
  19. if __name__ == "__main__":
  20. train_set = get_dataset('snli_1.0/snli_1.0_train.jsonl')
  21. counters = {l:Counter() for l in labels}
  22. print "Processing dataset...",
  23. for data, label in train_set:
  24. counters[label].update(data)
  25. print "Done."
  26.  
  27. print "Smoothing...",
  28. vocab = set()
  29. for l in labels:
  30. vocab.update(counters[l].keys())
  31.  
  32. for l in labels:
  33. counters[l].update(vocab)
  34. print "Done."
  35.  
  36. result = []
  37. for word in vocab:
  38. counts = [counters[l][word] for l in labels]
  39. total = sum(counts)
  40. odds = [c / float(total - c) for c in counts]
  41. result.append(max(((word, l, o) for l, o in izip(labels, odds)),
  42. key=lambda x: x[2]))
  43.  
  44. result.sort(key=lambda x: x[2], reverse=True)
  45. pprint(result[:50])
Add Comment
Please, Sign In to add comment