Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
from collections import Counter

from stemming.porter2 import stem
# Word-frequency exploration for a two-class text data set.
#
# Reads "label<TAB>text" rows from train_set.txt, strips punctuation and
# digits, drops stop words, stems what is left and prints, for each class,
# the stems ordered from most to least frequent.

# Characters deleted from the text before it is split into tokens.
STOP_SYMBOLS = frozenset('.,-()/"<>:;?1234567890')

# Only the first MAX_DOCS training rows are analysed (the original script
# hard-coded this number).
MAX_DOCS = 500


def load_training_set(path):
    """Read tab-separated ``label<TAB>text`` rows from *path*.

    Blank lines are skipped. Only the first tab separates the label from
    the text, so the text itself may contain further tabs.

    Returns a ``(texts, labels)`` pair of parallel lists.
    Raises ``ValueError`` for a non-blank line that contains no tab.
    """
    texts = []
    labels = []
    with open(path) as source:
        for line_no, raw in enumerate(source, 1):
            if not raw.strip():
                continue
            label, tab, text = raw.partition('\t')
            if not tab:
                raise ValueError(
                    '%s:%d: expected "label<TAB>text"' % (path, line_no))
            texts.append(text)
            labels.append(label)
    return texts, labels


def load_stop_words(path):
    """Return the set of stop words listed one per line in *path*."""
    stop_words = set()
    with open(path) as source:
        for raw in source:
            # Strip "\r" as well so files with Windows line endings match.
            word = raw.rstrip('\r\n')
            if word:
                stop_words.add(word)
    return stop_words


def tokenize(text):
    """Delete every STOP_SYMBOLS character from *text* and split on whitespace."""
    cleaned = ''.join(ch for ch in text if ch not in STOP_SYMBOLS)
    return cleaned.split()


def count_words(texts, labels, stop_words, stemmer, limit=MAX_DOCS):
    """Count stemmed, non-stop-word tokens separately for each class.

    texts, labels -- parallel sequences as returned by load_training_set.
    stop_words    -- container of tokens to discard (matched before stemming,
                     case-sensitively).
    stemmer       -- callable mapping a token to its stem.
    limit         -- use at most this many leading rows; ``None`` uses all.

    Returns ``(label_1_counts, other_counts)`` as two ``Counter`` objects:
    rows labelled ``'1'`` feed the first, every other row feeds the second.
    """
    label_1_counts = Counter()
    other_counts = Counter()
    # Slicing never raises, so a data set shorter than ``limit`` is fine.
    for text, label in zip(texts[:limit], labels[:limit]):
        target = label_1_counts if label == '1' else other_counts
        # Stop words are dropped outright; blanking them instead would put
        # an empty-string "word" at the top of the frequency table.
        target.update(
            stemmer(token) for token in tokenize(text)
            if token not in stop_words)
    return label_1_counts, other_counts


def main():
    """Print the stem frequencies of both classes, most frequent first."""
    texts, labels = load_training_set('train_set.txt')
    stop_words = load_stop_words("stop_words.txt")
    words1, words0 = count_words(texts, labels, stop_words, stem)

    print(words1.most_common())
    # Six lines of "1" visually separate the two listings.
    for _ in range(6):
        print(1)
    print(words0.most_common())


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement