Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import csv
- import numpy as np
- import deepcut
- from keras.models import Model
- from keras.layers import Input, Dense
- from keras.utils import to_categorical
- import matplotlib.pyplot as plt
- from random import shuffle
- from IPython.core.debugger import set_trace
- from sklearn.metrics import confusion_matrix
- import pandas as pd
#------------------------- Read data ------------------------------
# Build a bag-of-words count matrix (one row per sentence, one column per
# distinct token) from the fifth column of a CSV file and export it.

# Load every CSV row into memory.  newline='' is what the csv module asks for
# so that quoted fields containing line breaks are parsed correctly, and the
# `with` block guarantees the handle is closed once the rows are read.
with open('new_sample_data_observe.csv', 'r', encoding='utf-8',
          newline='') as file:
    data = list(csv.reader(file))

# Randomise the row order in place; everything below works on this order.
shuffle(data)

# The text to analyse is the fifth column (index 4) of each row.
sentences = [d[4] for d in data]

# Number of leading (post-shuffle) sentences that become matrix rows.
num_sentense = 80

# Fail early, before the expensive tokenization, instead of hitting a bare
# IndexError further down when the file is too short.
if len(sentences) < num_sentense:
    raise ValueError(
        f'need at least {num_sentense} rows, got {len(sentences)}'
    )

# Append to each sentence the text of the sentence `num_sentense` positions
# later.  The partner at i + num_sentense is always read before its own turn
# to be extended, so every sentence receives its partner's original text.
# NOTE(review): the reason for pairing rows this way is not visible here.
for i in range(len(sentences) - num_sentense):
    sentences[i] = sentences[i] + sentences[i + num_sentense]

# Tokenize every sentence with deepcut and drop single-space tokens.
words = [[w for w in deepcut.tokenize(s) if w != ' '] for s in sentences]
print(words)

# Vocabulary and per-sentence token counts over the first num_sentense rows.
set_of_words = set()
count = []
for tokens in words[:num_sentense]:
    set_of_words.update(tokens)
    count.append(len(tokens))

# Sort the vocabulary so the column order of the matrix (and of the exported
# CSV) is reproducible instead of depending on set iteration order.
set_of_words = sorted(set_of_words)
column_of = {word: k for k, word in enumerate(set_of_words)}

# count_word_in_set[i][k] = occurrences of set_of_words[k] in sentence i.
count_word_in_set = np.zeros((num_sentense, len(set_of_words)))
for i, tokens in enumerate(words[:num_sentense]):
    for w in tokens:
        count_word_in_set[i][column_of[w]] += 1
print(count_word_in_set)

# Disabled: scale each row to unit Euclidean length.
# for i in range(num_sentense):
#     dott = np.sqrt(np.dot(count_word_in_set[i], count_word_in_set[i]))
#     count_word_in_set[i] = count_word_in_set[i] / dott

# Export the raw counts; pandas writes default integer row/column labels.
pd_count_word = pd.DataFrame(count_word_in_set)
pd_count_word.to_csv(r'count_data.csv')

# Disabled: export the transposed matrix with one row per vocabulary word.
# count_word_in_set = np.asarray(count_word_in_set).T
# pd_word = pd.DataFrame(count_word_in_set, index=set_of_words)
# pd_word.to_csv(r'data_med.csv')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement