Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import io
- import json
- import operator
- from collections import Counter
- from sklearn.linear_model import LogisticRegression
- import nltk
- import string
- import pymorphy2
- from nltk.corpus import stopwords
# Shared pymorphy2 analyzer, used below for lemmatization and POS tagging.
morph = pymorphy2.MorphAnalyzer()
def extracting(string):
    """Return the 'description' field of one JSON-encoded review line."""
    record = json.loads(string.strip())
    return record['description']
def get_rating(string):
    """Parse one JSON review line and return its rating on a 0-5 scale.

    The 'rating_1' field looks like "<label> <percent>%"; the token after
    the first space (minus the trailing two characters) is a 0-100 score,
    which is divided by 20 to land on the usual 5-point scale.
    """
    # Was `eval(string)`: evaluating file data as Python code is a
    # code-execution risk.  The same lines are parsed with json.loads in
    # extracting(), so json.loads is the safe, equivalent parser.
    data = json.loads(string)
    raw = data['rating_1']
    return int(raw[raw.find(" ") + 1:-2]) / 20
def tokenize_me(file_text):
    """Tokenize Russian text: NLTK word tokens, lowercased, with
    punctuation, Russian stopwords, and «» quote marks removed."""
    # firstly let's apply nltk tokenization
    tokens = nltk.word_tokenize(file_text)
    # drop punctuation symbols
    tokens = [t for t in tokens if t not in string.punctuation]
    # stopword set: NLTK's Russian list plus a few extras seen in the corpus
    # (a set gives O(1) membership tests instead of scanning a list)
    stop_words = set(stopwords.words('russian'))
    stop_words.update(['что', 'это', 'так', 'вот', 'быть', 'как', 'в', '—',
                       'к', 'на', '"', '...', "'", '``', "но", "для"])
    # BUG FIX: lowercase BEFORE the stopword test.  The original compared
    # the raw (possibly capitalized) token against the lowercase stopword
    # list, so sentence-initial stopwords slipped through.
    tokens = [t.lower() for t in tokens if t.lower() not in stop_words]
    # strip Russian guillemet quotes left attached by the tokenizer
    return [t.replace("«", "").replace("»", "") for t in tokens]
def compute_tf(word_dict, bow):
    """Term frequency per word: count / total tokens in the document.

    word_dict maps word -> count; bow is the token list of the document.
    Returns a word -> tf float dict.  An empty document yields 0.0 for
    every word instead of raising ZeroDivisionError.
    """
    total = len(bow)
    if total == 0:
        # guard: the original divided by len(bow) unconditionally
        return dict.fromkeys(word_dict, 0.0)
    return {word: count / float(total) for word, count in word_dict.items()}
def compute_idf(doc_list):
    """Inverse document frequency: log10(N / df) for every word.

    doc_list is a list of word -> count dicts (one per document); df is
    the number of documents in which the word's count is positive.
    """
    import math
    n = len(doc_list)
    # document frequency per word, seeded with 0 for every word seen
    df = {}
    for doc in doc_list:
        for word in doc:
            df.setdefault(word, 0)
    for doc in doc_list:
        for word, count in doc.items():
            if count > 0:
                df[word] += 1
    return {word: math.log10(n / float(freq)) for word, freq in df.items()}
def computeTF_IDF(tfBow, idfs):
    """Combine a tf dict with an idf dict into tf-idf scores per word."""
    return {word: tf * idfs[word] for word, tf in tfBow.items()}
def list_of_all_words(dict_list):
    """Return the vocabulary (unique words) across all count dicts.

    Like the original, the result order is set iteration order.
    """
    # The original used `dict` as a loop variable, shadowing the builtin,
    # and built a full list before deduplicating; a set union is direct.
    vocab = set()
    for counts in dict_list:
        vocab.update(counts)
    return list(vocab)
# Load the raw corpus: one JSON review record per line.
with io.open('hosp_reviews_texts.txt', encoding='utf-8') as f:
    corpus = f.readlines()
dict_list = []    # per-document word -> count dicts
c5 = 0            # running count of positive (rating > 3) reviews seen
c3 = 0            # running count of negative (rating <= 3) reviews seen
class_train = []  # 2 = positive train, 1 = negative train, 0 = held out for test
tfs = []          # per-document term-frequency dicts
test_class = []
corpus_list = []
# First pass over the first 200 reviews: extract the text, assign a class
# label (the first 50 reviews of each polarity become training samples,
# the rest get 0 and are held out), tokenize, lemmatize, and compute tf.
for i in range(200):
    corpus_list.append(extracting(corpus[i]))
    rating = get_rating(corpus[i])
    if rating > 3:
        c5 += 1
        if c5 < 51:
            class_train.append(2)
        else:
            class_train.append(0)
    if rating <= 3:
        c3 += 1
        if c3 < 51:
            class_train.append(1)
        else:
            class_train.append(0)
    corpus_list[i] = tokenize_me(corpus_list[i])
    # normalize every token to its lemma (dictionary form) with pymorphy2
    for j in range(len(corpus_list[i])):
        corpus_list[i][j] = morph.parse(corpus_list[i][j])[0].normal_form
    dict_list.append(dict(Counter(corpus_list[i])))
    tfs.append(compute_tf(dict_list[i], corpus_list[i]))
# Find the index of the LAST training sample (class != 0).
for i in range(len(class_train)):
    if class_train[i] != 0:
        index = i
# Move every held-out (class 0) document to the end of the parallel lists
# so the first `index - c` entries form the training set.
# NOTE(review): class_train entries for held-out docs are popped and NOT
# re-appended, so class_train ends up holding training labels only —
# presumably intentional, since only class_train[:100] is used later.
i = 0
c = 0
while (i < index - c):
    if class_train[i] == 0:
        dict_list.append(dict_list.pop(i))
        tfs.append(tfs.pop(i))
        class_train.pop(i)
        corpus.append(corpus.pop(i))
        c += 1
        i -= 1  # re-examine this position: items shifted left after the pops
    i += 1
# Label the (now trailing) test documents directly from their ratings.
for i in range(index - c, 200):
    if (get_rating(corpus[i]) > 3):
        test_class.append(2)
    else:
        test_class.append(1)
# Dead debugging snippet left from development (bare string literal, no effect):
'''
tf = compute_tf(dict_list[0], corpus[0])
idf = compute_idf(dict_list)
tf_idf = computeTF_IDF(tf,idf)
print(tf)
'''
# Output files for the intermediate statistics.
f = open("tf_matrix.txt", "w")        # tab-separated tf matrix
f1 = open("tf_sorted.txt", "w")       # (word, tf) pairs sorted by tf
f2 = open("idf.txt","w")              # (word, idf) pairs
f3 = open("tf_idf_matrix.txt", "w")   # tf-idf matrix plus POS-count columns
f4 = open("tf_idf_sorted.txt", "w")   # (word, tf-idf) pairs sorted by tf-idf
idf = compute_idf(dict_list)
# POS feature column names appended to the tf-idf header
# (Russian for: verb, noun, adverb, adjective).
pos = ['глагол','существительное','наречие', 'прилагательное']
list_of_pos_in_doc = []
# Row 0 of each matrix is its header: the vocabulary of the first 100
# (training) documents, plus the four POS columns for the tf-idf matrix.
tf_matrix = [list_of_all_words(dict_list[:100])]
tf_idf_matrix = [list_of_all_words(dict_list[:100]) + pos]
tf_list = []
tf_idf_list = []
# Count coarse part-of-speech occurrences per document, using pymorphy2's
# most probable parse: [verbs, nouns, adverbs, adjectives].
for i in range(len(dict_list)):
    vector_pos = [0, 0, 0, 0]
    for word in dict_list[i]:
        p = morph.parse(word)[0]
        if 'VERB' in p.tag:
            vector_pos[0] += 1
        if 'NOUN' in p.tag:
            vector_pos[1] += 1
        if 'ADVB' in p.tag:
            vector_pos[2] += 1
        if 'ADJF' in p.tag:
            vector_pos[3] += 1
    list_of_pos_in_doc.append(vector_pos)
# Build the tf and tf-idf matrices row by row, streaming rows to disk.
# NOTE(review): iteration i writes row i (built on the PREVIOUS iteration;
# the header when i == 0) and then appends row i+1 — so the final appended
# row of each matrix is never written to its file.  Confirm intended.
for i in range(len(dict_list)):
    f.write("\t".join(str(e) for e in tf_matrix[i]) + "\n")
    f3.write("\t".join(str(e) for e in tf_idf_matrix[i]) + "\n")
    row = []
    tf_idf = computeTF_IDF(tfs[i],idf)
    for key in tfs[i]:
        tf_list.append([key, tfs[i][key]])
    # tf row follows the header's word order; 0 for words absent from doc i
    for word in tf_matrix[0]:
        row.append(tfs[i].get(word, 0))
    tf_matrix.append(row)
    row1 = []
    for key in tf_idf:
        tf_idf_list.append([key, tf_idf[key]])
    # tf-idf columns mirror the header minus the 4 trailing POS columns
    for j in range(len(tf_idf_matrix[0]) - 4):
        row1.append(tf_idf.get(tf_idf_matrix[0][j], 0))
    # last four columns: the POS counts computed above for this document
    for j in range(4):
        row1.append(list_of_pos_in_doc[i][j])
    tf_idf_matrix.append(row1)
f.close()
f3.close()
# Dump the sorted (word, tf) pairs collected above.
for pair in sorted(tf_list, key=lambda tup: tup[1]):
    f1.write(" ".join(str(e) for e in pair) + "\n")
f1.close()
for key in idf:
    f2.write(key + " " + str(idf[key]) + "\n")
f2.close()
# Dump the sorted (word, tf-idf) pairs.
for pair in sorted(tf_idf_list, key=lambda tup: tup[1]):
    f4.write(" ".join(str(e) for e in pair) + "\n")
# BUG FIX: f4 was the only output file never closed, risking buffered
# data being lost when the interpreter exits abnormally.
f4.close()
# Train logistic regression on the first 100 rows (training documents);
# row 0 of tf_idf_matrix is the header, so feature rows start at 1.
x_train = tf_idf_matrix[1:101]
cls = LogisticRegression()
print(class_train[:100])
cls.fit(x_train, class_train[:100])
# Evaluate on the remaining 100 (held-out) documents and report accuracy.
k = 0
for i in range(100, 200):
    pred = cls.predict([tf_idf_matrix[i]])  # predict once, reuse below
    if test_class[i - 100] == pred:
        k += 1
    print(i, pred, test_class[i - 100], sep=" ")
print(k / 100)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement