jmunsch

Python: NLTK bigram and trigram filter example

Aug 29th, 2014
589
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.15 KB | None | 0 0
  1. with open('_sorted.csv','r') as f:
  2.     lines = f.read().replace('\t',',')
  3.    
  4. lines = lines.split('\n')
  5. lines = [l.split(',')[12:] for l in lines]
  6. lines = [item for l in lines for item in l if ("domain.com" not in item)]
  7. lines = " ".join(lines)
  8. lines = lines.split(' ')
  9.  
  10. #---------------------------------
  11. # requires nltk v3.0
  12. #-------------------------------
  13. import nltk
  14. from nltk.collocations import BigramAssocMeasures, TrigramAssocMeasures, BigramCollocationFinder, TrigramCollocationFinder, QuadgramCollocationFinder
  15. from nltk.util import ngrams
  16. #from nltk.metrics.association import QuadgramAssocMeasures
  17. #n = 4
  18. #quadgram = ngrams(lines, n) huge set that can be counted and sorted works for any N size
  19. # abstract_measure = nltk.collocations.AbstractCollocation???
  20.  
  21.  
  22. bigram_measures = BigramAssocMeasures()
  23. trigram_measures = TrigramAssocMeasures()
  24. #quadgram_measures = QuadgramAssocMeasures()
  25.    
  26.  
  27. finder2 = BigramCollocationFinder.from_words(lines)
  28. #finder21 = BigramCollocationFinder.from_words(nltk.corpus.brown.tagged_words('ca01', tagset='universal')) # finds tagged words
  29. # r21 = finder21.nbest(bigram_measures.pmi, 5)
  30. finder3 = TrigramCollocationFinder.from_words(lines)
  31. finder4 = QuadgramCollocationFinder.from_words(lines)
  32.  
  33. finder2.apply_freq_filter(5)
  34. finder3.apply_freq_filter(5)
  35. finder4.apply_freq_filter(5)
  36. # r2 = sorted(finder2.ngram_fd.items(), key=lambda t: (-t[1], t[0]))[:-1] # (("w1","w2"),int(freq))
  37.  
  38. r2 = finder2.nbest(bigram_measures.pmi, 100)
  39. r3 = finder3.nbest(trigram_measures.pmi, 100)
  40. r4 = sorted(finder4.ngram_fd.items(), key=lambda t: (-t[1], t[0]))[:-1]
  41. #$r4 = finder4.nbest(quadgram_measures.pmi, 100) ???
  42. # r2 = [item for item in r2 if any(item not in _ for _ in r3)]
  43. # r3 = [item for item in r3 if any(item not in _ for _ in r4)]
  44.  
  45. lines = []
  46. for item in r2:
  47.     lines.append(' '.join([str(x) for x in item]+['\n']))
  48. for item in r3:
  49.     lines.append(' '.join([str(x) for x in item]+['\n']))
  50. for item in r4:
  51.     lines.append(' '.join([str(x) for x in item]+['\n']))    
  52. with open('token_results.txt','w+') as f:
  53.     for l in lines:
  54.         f.write(l.replace("', '","\t").lstrip("('").replace("')", "\t"))
Advertisement
Add Comment
Please, Sign In to add comment