Advertisement
Guest User

Untitled

a guest
Nov 26th, 2014
155
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.65 KB | None | 0 0
  1. from stemming.porter2 import stem
  2. l = []
  3. labels = []
  4. x=0
  5. y=0
  6. t=0
  7. word = []
  8.  
  9. # открываем трейн сет, делаем из него массив массивов, где слова уже заспличены, удаляем символы
  10. s = open('train_set.txt')
  11. nolabels = []
  12. sent = []
  13. lines = []
  14. stopsymb = ['.',',','-','(',')','/','"','<','>',':',';','?','1','2','3','4','5','6','7','8','9','0']
  15. X = []
  16. y = []
  17. for i in s:
  18. label,text = i.split('\t')
  19. X.append(text)
  20. y.append(label)
  21.  
  22. for line in X:
  23. nolabels.append(str(line[0:]))
  24. for line in nolabels:
  25. for i in stopsymb:
  26. line=line.replace(i,'')
  27. sent.append(line)
  28.  
  29. for line in sent:
  30. lines.append(line.split())
  31.  
  32. # удаляем стопслова
  33. stoplist = []
  34. s = open("stop_words.txt")
  35. for line in s:
  36. sl = str(line)
  37. sl=sl.replace("\n",'')
  38. stoplist.append(sl)
  39.  
  40. # stemming
  41. for i in range(500):
  42. for j in range(len(lines[i])):
  43. for k in stoplist:
  44. if lines[i][j]==k:
  45. lines[i][j]=''
  46. lines[i][j]=stem(str(lines[i][j]))
  47.  
  48. words1={}
  49. words0={}
  50. for i in range(500):
  51. for j in range(len(lines[i])):
  52. if y[i]=='1':
  53. words1[lines[i][j]] = words1.get(lines[i][j], 0) + 1
  54. else:
  55. words0[lines[i][j]] = words0.get(lines[i][j], 0) + 1
  56.  
  57.  
  58.  
  59.  
  60. print sorted(words1.items(), key=lambda (k, v): v, reverse=True)
  61. print 1
  62. print 1
  63. print 1
  64. print 1
  65. print 1
  66. print 1
  67. print sorted(words0.items(), key=lambda (k, v): v, reverse=True)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement