Not a member of Pastebin yet? Sign up — it unlocks many cool features!
"""Extract bigram, trigram, and quadgram collocations from a CSV token dump.

Reads '_sorted.csv' (tab- or comma-delimited), keeps the tokens after
column 12 of each row while dropping any token containing "domain.com",
ranks frequent collocations with NLTK (requires nltk >= 3.0), and writes
the results tab-separated to 'token_results.txt'.
"""
import nltk  # kept for parity with the original script
from nltk.collocations import (
    BigramAssocMeasures,
    TrigramAssocMeasures,
    BigramCollocationFinder,
    TrigramCollocationFinder,
    QuadgramCollocationFinder,
)
from nltk.util import ngrams  # NOTE(review): unused here; kept in case other chunks rely on it

TOKEN_START_COL = 12  # columns before this are metadata, not tokens
MIN_FREQ = 5          # discard n-grams seen fewer than this many times
TOP_N = 100           # how many best-scoring n-grams to keep per order

# --- load and tokenize ------------------------------------------------------
with open('_sorted.csv', 'r') as f:
    text = f.read().replace('\t', ',')  # normalize tabs to the comma delimiter

rows = [line.split(',')[TOKEN_START_COL:] for line in text.split('\n')]
# Flatten to a single token stream, dropping tokens mentioning the site's domain.
tokens = [tok for row in rows for tok in row if 'domain.com' not in tok]
# NOTE(review): this join/re-split also splits any token that itself contains
# a space — preserved from the original so output is identical; confirm intent.
tokens = ' '.join(tokens).split(' ')

# --- find collocations ------------------------------------------------------
bigram_measures = BigramAssocMeasures()
trigram_measures = TrigramAssocMeasures()

finder2 = BigramCollocationFinder.from_words(tokens)
finder3 = TrigramCollocationFinder.from_words(tokens)
finder4 = QuadgramCollocationFinder.from_words(tokens)
for finder in (finder2, finder3, finder4):
    finder.apply_freq_filter(MIN_FREQ)

r2 = finder2.nbest(bigram_measures.pmi, TOP_N)
r3 = finder3.nbest(trigram_measures.pmi, TOP_N)
# nltk 3.0 has no QuadgramAssocMeasures, so quadgrams are ranked by raw
# frequency (descending), ties broken alphabetically, yielding
# ((w1, w2, w3, w4), count) pairs. NOTE(review): the trailing [:-1] drops the
# *lowest-frequency* entry, not an empty sentinel — preserved from the
# original, but it may be a bug; confirm what it was meant to strip.
r4 = sorted(finder4.ngram_fd.items(), key=lambda t: (-t[1], t[0]))[:-1]

# --- write results ----------------------------------------------------------
out_lines = []
for result in (r2, r3, r4):
    for item in result:
        out_lines.append(' '.join([str(x) for x in item] + ['\n']))

with open('token_results.txt', 'w+') as f:
    for line in out_lines:
        # Crude cleanup of the stringified tuples: turns "('w1', 'w2')"
        # into tab-separated words. Preserved byte-for-byte from original.
        f.write(line.replace("', '", "\t").lstrip("('").replace("')", "\t"))
Advertisement
Add Comment
Please sign in to add a comment.