Advertisement
Guest User

Untitled

a guest
Apr 21st, 2019
100
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.79 KB | None | 0 0
  1. import io
  2. import json
  3. import operator
  4. from collections import Counter
  5. from sklearn.linear_model import LogisticRegression
  6.  
  7. import nltk
  8. import string
  9. import pymorphy2
  10. from nltk.corpus import stopwords
  11. morph = pymorphy2.MorphAnalyzer()
  12.  
  13.  
  14.  
  15. def extracting(string):
  16.     data = json.loads(string.strip())
  17.     result_string = ""
  18.     result_string += data['description']
  19.     return result_string
  20.  
  21.  
  22. def get_rating(string):
  23.     data = eval(string)
  24.     rating = data['rating_1']
  25.     rating = int(rating[rating.find(" ") + 1:-2]) / 20
  26.     return rating
  27.  
  28.  
  29. def tokenize_me(file_text):
  30.     # firstly let's apply nltk tokenization
  31.     tokens = nltk.word_tokenize(file_text)
  32.  
  33.     # let's delete punctuation symbols
  34.     tokens = [i for i in tokens if (i not in string.punctuation)]
  35.  
  36.     # deleting stop_words
  37.     stop_words = stopwords.words('russian')
  38.     stop_words.extend(['что', 'это', 'так', 'вот', 'быть', 'как', 'в', '—', 'к', 'на', '"', '...', "'", '``', "но",
  39.                        "так", "в", "и", 'к', 'для',])
  40.     tokens = [i.lower() for i in tokens if (i not in stop_words)]
  41.  
  42.     # cleaning words
  43.     tokens = [i.replace("«", "").replace("»", "") for i in tokens]
  44.  
  45.     return tokens
  46.  
  47.  
  48. def compute_tf(word_dict, bow):
  49.     tfDict = {}
  50.     bowCount = len(bow)
  51.     for word, count in word_dict.items():
  52.         tfDict[word] = count/float(bowCount)
  53.     return tfDict
  54.  
  55.  
  56. def compute_idf(doc_list):
  57.     import math
  58.     idf_dict = {}
  59.     n = len(doc_list)
  60.     for i in range(len(doc_list)):
  61.         idf_dict.update(dict.fromkeys(doc_list[i].keys(), 0))
  62.     for doc in doc_list:
  63.         for word, val in doc.items():
  64.             if val > 0:
  65.                 idf_dict[word] += 1
  66.  
  67.     for word, val in idf_dict.items():
  68.         idf_dict[word] = math.log10(n / float(val))
  69.  
  70.     return idf_dict
  71.  
  72.  
  73. def computeTF_IDF(tfBow, idfs):
  74.     tf_idf ={}
  75.     for word, val in tfBow.items():
  76.         tf_idf[word] = val*idfs[word]
  77.     return tf_idf
  78.  
  79.  
  80. def list_of_all_words(dict_list):
  81.     res = []
  82.     for dict in dict_list:
  83.         for key in dict:
  84.             res.append(key)
  85.     res = list(set(res))
  86.     return res
  87.  
  88.  
# --- load the corpus and label the first 200 reviews ---------------------
# Each line of the input file is one JSON review record.
with io.open('hosp_reviews_texts.txt', encoding='utf-8') as f:
    corpus = f.readlines()

dict_list = []      # per-document word -> count dicts
c5 = 0              # positive (rating > 3) reviews seen so far
c3 = 0              # negative (rating <= 3) reviews seen so far
class_train = []    # labels: 2 = positive, 1 = negative, 0 = held out
tfs = []            # per-document term-frequency dicts
test_class = []     # labels for the held-out portion (filled later)
corpus_list = []    # per-document token lists
for i in range(200):
    corpus_list.append(extracting(corpus[i]))
    rating = get_rating(corpus[i])
    # The first 50 documents of each class become training data (labels
    # 2/1); any further documents of that class are marked 0 (held out).
    if rating > 3:
        c5 += 1
        if c5 < 51:
            class_train.append(2)
        else:
            class_train.append(0)
    if rating <= 3:
        c3 += 1
        if c3 < 51:
            class_train.append(1)
        else:
            class_train.append(0)
    corpus_list[i] = tokenize_me(corpus_list[i])
    # lemmatize every token to its pymorphy2 normal form
    for j in range(len(corpus_list[i])):
        corpus_list[i][j] = morph.parse(corpus_list[i][j])[0].normal_form
    dict_list.append(dict(Counter(corpus_list[i])))
    tfs.append(compute_tf(dict_list[i], corpus_list[i]))
  120.  
# --- rotate all held-out (label 0) documents to the end ------------------
# `index` ends up as the position of the LAST non-zero training label.
for i in range(len(class_train)):
    if class_train[i] != 0:
        index = i

i = 0
c = 0
# Move every 0-labelled entry to the tail of dict_list/tfs/corpus so the
# first `index - c` entries are the training documents.
# NOTE(review): class_train entries are popped but never re-appended, so
# class_train shrinks while the other lists keep their length —
# presumably intended, since only class_train[:100] is used below; verify.
while (i < index - c):
    if class_train[i] == 0:
        dict_list.append(dict_list.pop(i))
        tfs.append(tfs.pop(i))
        class_train.pop(i)
        corpus.append(corpus.pop(i))
        c += 1
        i -= 1
    i += 1

# Labels for the test portion, recomputed from the reordered corpus.
for i in range(index - c, 200):
    if (get_rating(corpus[i]) > 3):
        test_class.append(2)
    else:
        test_class.append(1)
  142.  
  143.  
  144.  
# Leftover experiment, kept as-is (a bare string literal — not executed).
'''
tf = compute_tf(dict_list[0], corpus[0])
idf = compute_idf(dict_list)
tf_idf = computeTF_IDF(tf,idf)
print(tf)
'''
# --- write tf / idf / tf-idf matrices, then train a classifier -----------
f = open("tf_matrix.txt", "w")
f1 = open("tf_sorted.txt", "w")
f2 = open("idf.txt","w")
f3 = open("tf_idf_matrix.txt", "w")
f4 = open("tf_idf_sorted.txt", "w")
idf = compute_idf(dict_list)
# Names of the four extra part-of-speech feature columns
# (verb, noun, adverb, adjective — in Russian).
pos = ['глагол','существительное','наречие', 'прилагательное']
list_of_pos_in_doc = []
# Row 0 of each matrix is a header: the vocabulary of the first 100
# (training) documents; the tf-idf matrix also carries the POS columns.
tf_matrix = [list_of_all_words(dict_list[:100])]
tf_idf_matrix  = [list_of_all_words(dict_list[:100]) + pos]
tf_list = []
tf_idf_list = []
# Per-document POS counts: verbs / nouns / adverbs / adjectives.
for i in range(len(dict_list)):
    vector_pos = [0, 0, 0, 0]
    for word in dict_list[i]:
        p = morph.parse(word)[0]
        if 'VERB' in p.tag:
            vector_pos[0] += 1
        if 'NOUN' in p.tag:
            vector_pos[1] += 1
        if 'ADVB' in p.tag:
            vector_pos[2] += 1
        if 'ADJF' in p.tag:
            vector_pos[3] += 1
    list_of_pos_in_doc.append(vector_pos)

# Build one matrix row per document (row i+1 = document i, since row 0
# is the header) while dumping already-built rows to disk.
for i in range(len(dict_list)):
    f.write("\t".join(str(e) for e in tf_matrix[i]) + "\n")
    f3.write("\t".join(str(e) for e in tf_idf_matrix[i]) + "\n")
    row = []
    tf_idf = computeTF_IDF(tfs[i],idf)
    for key in tfs[i]:
        tf_list.append([key, tfs[i][key]])
    # tf vector over the header vocabulary; 0 for absent words
    for word in tf_matrix[0]:
        row.append(tfs[i].get(word, 0))
    tf_matrix.append(row)
    row1 = []
    for key in tf_idf:
        tf_idf_list.append([key, tf_idf[key]])
    # tf-idf vector over the header vocabulary (the last 4 header
    # entries are the POS column names, hence the "- 4") ...
    for j in range(len(tf_idf_matrix[0]) - 4):
        row1.append(tf_idf.get(tf_idf_matrix[0][j], 0))
    # ... followed by the four POS-count features
    for j in range(4):
        row1.append(list_of_pos_in_doc[i][j])
    tf_idf_matrix.append(row1)

f.close()
f3.close()


# Word/tf pairs sorted by ascending tf value
for index in sorted(tf_list, key=lambda tup: tup[1]):
    f1.write(" ".join(str(e) for e in index) + "\n")
f1.close()

for key in idf:
    f2.write(key + " " + str(idf[key]) + "\n")
f2.close()

# NOTE(review): f4 is never closed — relies on interpreter exit to flush.
for index in sorted(tf_idf_list, key=lambda tup: tup[1]):
    f4.write(" ".join(str(e) for e in index) + "\n")

# Train on the first 100 documents (rows 1..100 = documents 0..99).
x_train = tf_idf_matrix[1:101]
cls = LogisticRegression()
print(class_train[:100])
cls.fit(x_train, class_train[:100])
k = 0
# Evaluate on the remaining documents and report accuracy.
# NOTE(review): tf_idf_matrix[i] is the row of document i-1 (row 0 is
# the header), so this looks off by one against test_class[i - 100];
# tf_idf_matrix[i + 1] may have been intended — verify.
for i in range(100, 200):
    if test_class[i-100] == cls.predict([tf_idf_matrix[i]]):
        k += 1
    print(i,cls.predict([tf_idf_matrix[i]]),test_class[i - 100], sep=" ")
print(k / 100)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement