# Untitled

import csv
import numpy as np
import deepcut
from keras.models import Model
from keras.layers import Input, Dense
from keras.utils import to_categorical
import matplotlib.pyplot as plt
from random import shuffle
from IPython.core.debugger import set_trace
from sklearn.metrics import confusion_matrix
import pandas as pd

#------------------------- Read data ------------------------------
# Load every CSV row into memory, then randomise the row order in place.
# newline='' is what the csv module expects so that line breaks embedded in
# quoted fields are parsed correctly; the with-block guarantees the handle is
# closed once the rows have been read.
with open('new_sample_data_observe.csv', 'r', encoding='utf-8', newline='') as file:
    data = list(csv.reader(file))
shuffle(data)

# for d in data:
#     print(d)

# The text of each row is taken from its fifth CSV column (index 4).
sentences = [d[4] for d in data]
set_of_words = set()
# Number of leading sentences that contribute to the vocabulary and counts.
num_sentense = 80
# Append to each sentence the one located num_sentense positions after it.
# The source index (i + num_sentense) is always ahead of the write index i,
# so the text being appended is still the unmodified original.
for i in range(len(sentences) - num_sentense):
    sentences[i] = sentences[i] + sentences[i + num_sentense]
count = []
# Tokenise every sentence with deepcut, discarding single-space tokens.
words = [[w for w in deepcut.tokenize(s) if w != ' '] for s in sentences]
print(words)
# Collect the vocabulary and the per-sentence token totals from the first
# num_sentense sentences only.
for i in range(num_sentense):
    # One in-place update per sentence is enough; repeating the union once
    # per token added nothing and rebuilt the whole set every time.
    set_of_words.update(words[i])
    count.append(len(words[i]))

# Count matrix: one row per counted sentence, one column per vocabulary word.
count_word_in_set = np.zeros((num_sentense, len(set_of_words)))
# Sort the vocabulary so the column order does not depend on set iteration
# order, which varies between runs under string hash randomisation.
set_of_words = sorted(set_of_words)
# Map each word to its column once, replacing a linear scan per token.
word_to_column = {word: k for k, word in enumerate(set_of_words)}

for i in range(num_sentense):
    for j in range(count[i]):
        k = word_to_column.get(words[i][j])
        # A token that is not in the vocabulary is skipped silently.
        if k is not None:
            count_word_in_set[i][k] += 1

# Show the raw count matrix on stdout before exporting it.
print(count_word_in_set)
# Disabled: scale every row by its Euclidean norm.
# for i in range(num_sentense):
#     dott = np.sqrt(np.dot(count_word_in_set[i],count_word_in_set[i]))
#     count_word_in_set[i] =   count_word_in_set[i]/dott
# Wrap the matrix in a DataFrame and write it to count_data.csv.
pd_count_word = pd.DataFrame(data=count_word_in_set)
pd_count_word.to_csv(path_or_buf=r'count_data.csv')
# Disabled: export the transposed matrix with the vocabulary as row index.
# count_word_in_set = np.asarray(count_word_in_set).T
# pd_word = pd.DataFrame(count_word_in_set,index=set_of_words)
# # pd_word
# pd_word.to_csv(r'data_med.csv')