Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import io
- import json
- import operator
- from collections import Counter
- from sklearn.linear_model import LogisticRegression
- import nltk
- import string
- import pymorphy2
- from nltk.corpus import stopwords
# Shared pymorphy2 analyzer, used below for lemmatization and POS tagging.
morph = pymorphy2.MorphAnalyzer()
def extracting(string):
    """Return the 'description' field of one JSON-encoded review line."""
    record = json.loads(string.strip())
    return record['description']
def get_rating(string):
    """Parse one JSON review line and return its rating on a 0-5 scale.

    The 'rating_1' field looks like "<label> <percent>%"; the token after
    the first space (minus the trailing two characters) is a 0-100 score,
    which is divided by 20 to land on the usual 5-point scale.
    """
    # Was `eval(string)`: evaluating file data as Python code is a
    # code-execution risk.  The same lines are parsed with json.loads in
    # extracting(), so json.loads is the safe, equivalent parser.
    data = json.loads(string)
    raw = data['rating_1']
    return int(raw[raw.find(" ") + 1:-2]) / 20
def tokenize_me(file_text):
    """Tokenize Russian text: NLTK word tokens, lowercased, with
    punctuation, Russian stopwords, and «» quote marks removed."""
    # firstly let's apply nltk tokenization
    tokens = nltk.word_tokenize(file_text)
    # drop punctuation symbols
    tokens = [t for t in tokens if t not in string.punctuation]
    # stopword set: NLTK's Russian list plus a few extras seen in the corpus
    # (a set gives O(1) membership tests instead of scanning a list)
    stop_words = set(stopwords.words('russian'))
    stop_words.update(['что', 'это', 'так', 'вот', 'быть', 'как', 'в', '—',
                       'к', 'на', '"', '...', "'", '``', "но", "для"])
    # BUG FIX: lowercase BEFORE the stopword test.  The original compared
    # the raw (possibly capitalized) token against the lowercase stopword
    # list, so sentence-initial stopwords slipped through.
    tokens = [t.lower() for t in tokens if t.lower() not in stop_words]
    # strip Russian guillemet quotes left attached by the tokenizer
    return [t.replace("«", "").replace("»", "") for t in tokens]
def compute_tf(word_dict, bow):
    """Term frequency per word: count / total tokens in the document.

    word_dict maps word -> count; bow is the token list of the document.
    Returns a word -> tf float dict.  An empty document yields 0.0 for
    every word instead of raising ZeroDivisionError.
    """
    total = len(bow)
    if total == 0:
        # guard: the original divided by len(bow) unconditionally
        return dict.fromkeys(word_dict, 0.0)
    return {word: count / float(total) for word, count in word_dict.items()}
def compute_idf(doc_list):
    """Inverse document frequency: log10(N / df) for every word.

    doc_list is a list of word -> count dicts (one per document); df is
    the number of documents in which the word's count is positive.
    """
    import math
    n = len(doc_list)
    # document frequency per word, seeded with 0 for every word seen
    df = {}
    for doc in doc_list:
        for word in doc:
            df.setdefault(word, 0)
    for doc in doc_list:
        for word, count in doc.items():
            if count > 0:
                df[word] += 1
    return {word: math.log10(n / float(freq)) for word, freq in df.items()}
def computeTF_IDF(tfBow, idfs):
    """Combine a tf dict with an idf dict into tf-idf scores per word."""
    return {word: tf * idfs[word] for word, tf in tfBow.items()}
def list_of_all_words(dict_list):
    """Return the vocabulary (unique words) across all count dicts.

    Like the original, the result order is set iteration order.
    """
    # The original used `dict` as a loop variable, shadowing the builtin,
    # and built a full list before deduplicating; a set union is direct.
    vocab = set()
    for counts in dict_list:
        vocab.update(counts)
    return list(vocab)
# Load the raw corpus: one JSON review record per line.
with io.open('hosp_reviews_texts.txt', encoding='utf-8') as f:
    corpus = f.readlines()
dict_list = []    # per-document word -> count dicts
c5 = 0            # running count of positive (rating > 3) reviews seen
c3 = 0            # running count of negative (rating <= 3) reviews seen
class_train = []  # 2 = positive train, 1 = negative train, 0 = held out for test
tfs = []          # per-document term-frequency dicts
test_class = []
corpus_list = []
# First pass over the first 200 reviews: extract the text, assign a class
# label (the first 50 reviews of each polarity become training samples,
# the rest get 0 and are held out), tokenize, lemmatize, and compute tf.
for i in range(200):
    corpus_list.append(extracting(corpus[i]))
    rating = get_rating(corpus[i])
    if rating > 3:
        c5 += 1
        if c5 < 51:
            class_train.append(2)
        else:
            class_train.append(0)
    if rating <= 3:
        c3 += 1
        if c3 < 51:
            class_train.append(1)
        else:
            class_train.append(0)
    corpus_list[i] = tokenize_me(corpus_list[i])
    # normalize every token to its lemma (dictionary form) with pymorphy2
    for j in range(len(corpus_list[i])):
        corpus_list[i][j] = morph.parse(corpus_list[i][j])[0].normal_form
    dict_list.append(dict(Counter(corpus_list[i])))
    tfs.append(compute_tf(dict_list[i], corpus_list[i]))
# Find the index of the LAST training sample (class != 0).
for i in range(len(class_train)):
    if class_train[i] != 0:
        index = i
# Move every held-out (class 0) document to the end of the parallel lists
# so the first `index - c` entries form the training set.
# NOTE(review): class_train entries for held-out docs are popped and NOT
# re-appended, so class_train ends up holding training labels only —
# presumably intentional, since only class_train[:100] is used later.
i = 0
c = 0
while (i < index - c):
    if class_train[i] == 0:
        dict_list.append(dict_list.pop(i))
        tfs.append(tfs.pop(i))
        class_train.pop(i)
        corpus.append(corpus.pop(i))
        c += 1
        i -= 1  # re-examine this position: items shifted left after the pops
    i += 1
# Label the (now trailing) test documents directly from their ratings.
for i in range(index - c, 200):
    if (get_rating(corpus[i]) > 3):
        test_class.append(2)
    else:
        test_class.append(1)
# Dead debugging snippet left from development (bare string literal, no effect):
'''
tf = compute_tf(dict_list[0], corpus[0])
idf = compute_idf(dict_list)
tf_idf = computeTF_IDF(tf,idf)
print(tf)
'''
# Output files for the intermediate statistics.
f = open("tf_matrix.txt", "w")        # tab-separated tf matrix
f1 = open("tf_sorted.txt", "w")       # (word, tf) pairs sorted by tf
f2 = open("idf.txt","w")              # (word, idf) pairs
f3 = open("tf_idf_matrix.txt", "w")   # tf-idf matrix plus POS-count columns
f4 = open("tf_idf_sorted.txt", "w")   # (word, tf-idf) pairs sorted by tf-idf
idf = compute_idf(dict_list)
# POS feature column names appended to the tf-idf header
# (Russian for: verb, noun, adverb, adjective).
pos = ['глагол','существительное','наречие', 'прилагательное']
list_of_pos_in_doc = []
# Row 0 of each matrix is its header: the vocabulary of the first 100
# (training) documents, plus the four POS columns for the tf-idf matrix.
tf_matrix = [list_of_all_words(dict_list[:100])]
tf_idf_matrix = [list_of_all_words(dict_list[:100]) + pos]
tf_list = []
tf_idf_list = []
# Count coarse part-of-speech occurrences per document, using pymorphy2's
# most probable parse: [verbs, nouns, adverbs, adjectives].
for i in range(len(dict_list)):
    vector_pos = [0, 0, 0, 0]
    for word in dict_list[i]:
        p = morph.parse(word)[0]
        if 'VERB' in p.tag:
            vector_pos[0] += 1
        if 'NOUN' in p.tag:
            vector_pos[1] += 1
        if 'ADVB' in p.tag:
            vector_pos[2] += 1
        if 'ADJF' in p.tag:
            vector_pos[3] += 1
    list_of_pos_in_doc.append(vector_pos)
# Build the tf and tf-idf matrices row by row, streaming rows to disk.
# NOTE(review): iteration i writes row i (built on the PREVIOUS iteration;
# the header when i == 0) and then appends row i+1 — so the final appended
# row of each matrix is never written to its file.  Confirm intended.
for i in range(len(dict_list)):
    f.write("\t".join(str(e) for e in tf_matrix[i]) + "\n")
    f3.write("\t".join(str(e) for e in tf_idf_matrix[i]) + "\n")
    row = []
    tf_idf = computeTF_IDF(tfs[i],idf)
    for key in tfs[i]:
        tf_list.append([key, tfs[i][key]])
    # tf row follows the header's word order; 0 for words absent from doc i
    for word in tf_matrix[0]:
        row.append(tfs[i].get(word, 0))
    tf_matrix.append(row)
    row1 = []
    for key in tf_idf:
        tf_idf_list.append([key, tf_idf[key]])
    # tf-idf columns mirror the header minus the 4 trailing POS columns
    for j in range(len(tf_idf_matrix[0]) - 4):
        row1.append(tf_idf.get(tf_idf_matrix[0][j], 0))
    # last four columns: the POS counts computed above for this document
    for j in range(4):
        row1.append(list_of_pos_in_doc[i][j])
    tf_idf_matrix.append(row1)
f.close()
f3.close()
# Dump the sorted (word, tf) pairs collected above.
for pair in sorted(tf_list, key=lambda tup: tup[1]):
    f1.write(" ".join(str(e) for e in pair) + "\n")
f1.close()
for key in idf:
    f2.write(key + " " + str(idf[key]) + "\n")
f2.close()
# Dump the sorted (word, tf-idf) pairs.
for pair in sorted(tf_idf_list, key=lambda tup: tup[1]):
    f4.write(" ".join(str(e) for e in pair) + "\n")
# BUG FIX: f4 was the only output file never closed, risking buffered
# data being lost when the interpreter exits abnormally.
f4.close()
# Train logistic regression on the first 100 rows (training documents);
# row 0 of tf_idf_matrix is the header, so feature rows start at 1.
x_train = tf_idf_matrix[1:101]
cls = LogisticRegression()
print(class_train[:100])
cls.fit(x_train, class_train[:100])
# Evaluate on the remaining 100 (held-out) documents and report accuracy.
k = 0
for i in range(100, 200):
    pred = cls.predict([tf_idf_matrix[i]])  # predict once, reuse below
    if test_class[i - 100] == pred:
        k += 1
    print(i, pred, test_class[i - 100], sep=" ")
print(k / 100)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement