Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import string
from collections import Counter

import numpy as np
import pandas as pd
def topWords(reviewArray, limit=10000):
    """Return the most frequent words across a Series of review texts.

    Each review is lower-cased, stripped of ASCII punctuation
    (``string.punctuation``) and split on whitespace before counting.

    Args:
        reviewArray: pandas Series of review strings. Missing entries are
            skipped.
        limit: Maximum number of words to return (default 10000). ``None``
            returns every distinct word.

    Returns:
        A list of words ordered from most to least frequent. Words with equal
        counts keep the order in which they were first encountered.
    """
    # Series.str.translate expects one translation table; the three-argument
    # form of str.maketrans builds a table that deletes every punctuation char.
    strip_punct = str.maketrans("", "", string.punctuation)
    cleaned = reviewArray.dropna().str.lower().str.translate(strip_punct)

    counts = Counter()
    for tokens in cleaned.str.split():
        counts.update(tokens)

    return [word for word, _ in counts.most_common(limit)]
def binBagWords(topWords, reviews):
    """Build a binary bag-of-words matrix for a Series of reviews.

    Reviews are split on whitespace only and tokens are compared to the
    vocabulary exactly as written; no case folding or punctuation stripping
    happens here, so callers should pass text normalised the same way as the
    vocabulary.

    Args:
        topWords: Sequence of vocabulary words; the position of a word is its
            column in the result. If a word is repeated, its last position is
            the one used.
        reviews: pandas Series of review strings. Missing entries are treated
            as empty reviews and yield an all-zero row.

    Returns:
        A float NumPy array of shape ``(len(reviews), len(topWords))`` where
        entry ``[i, j]`` is 1 if word ``j`` occurs in the ``i``-th review (by
        position) and 0 otherwise.
    """
    column_of = {word: col for col, word in enumerate(topWords)}
    presence = np.zeros((len(reviews), len(topWords)))

    # Iterate positionally: indexing the Series with 0..n-1 is label-based and
    # fails (or picks the wrong row) when the index is not the default range.
    for row, tokens in enumerate(reviews.fillna("").str.split()):
        for token in tokens:
            col = column_of.get(token)
            if col is not None:
                presence[row, col] = 1

    return presence
def frequencyBagWords(topWords, reviews):
    """Build a frequency bag-of-words matrix for a Series of reviews.

    Reviews are split on whitespace only and tokens are compared to the
    vocabulary exactly as written; no case folding or punctuation stripping
    happens here, so callers should pass text normalised the same way as the
    vocabulary.

    Args:
        topWords: Sequence of vocabulary words; the position of a word is its
            column in the result. If a word is repeated, its last position is
            the one used.
        reviews: pandas Series of review strings. Missing entries are treated
            as empty reviews and yield an all-zero row.

    Returns:
        A float NumPy array of shape ``(len(reviews), len(topWords))``. Entry
        ``[i, j]`` is the number of occurrences of word ``j`` in the ``i``-th
        review (by position) divided by the number of that review's tokens
        that are in the vocabulary. Rows with no vocabulary tokens stay zero.
    """
    column_of = {word: col for col, word in enumerate(topWords)}
    freqs = np.zeros((len(reviews), len(topWords)))

    # Iterate positionally: indexing the Series with 0..n-1 is label-based and
    # fails (or picks the wrong row) when the index is not the default range.
    for row, tokens in enumerate(reviews.fillna("").str.split()):
        for token in tokens:
            col = column_of.get(token)
            if col is not None:
                freqs[row, col] += 1

        # The row sum equals the number of in-vocabulary tokens just counted.
        total = freqs[row].sum()
        if total:
            freqs[row] /= total

    return freqs
if __name__ == "__main__":
    # Each file is read as tab-separated with no header row; column 0 is what
    # gets passed on as the review text.
    yelp_train = pd.read_csv("Data/yelp-train.txt", sep='\t', header=None)
    yelp_test = pd.read_csv("Data/yelp-test.txt", sep='\t', header=None)
    yelp_valid = pd.read_csv("Data/yelp-valid.txt", sep='\t', header=None)

    yelp_words = topWords(yelp_train[0])

    # topWords lower-cases its input (and is meant to strip punctuation) when
    # building the vocabulary, whereas the bag-of-words builders match raw
    # whitespace-separated tokens. Normalise the text the same way here so
    # that tokens such as "Good" or "food." still match vocabulary entries.
    strip_punct = str.maketrans("", "", string.punctuation)
    train_text = yelp_train[0].str.lower().str.translate(strip_punct)

    binMatrix = binBagWords(yelp_words[:10000], train_text)
    freqMatrix = frequencyBagWords(yelp_words[:10000], train_text)
    print(freqMatrix)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement