Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #! /usr/bin/python3
- import pandas as ps
- import hunspell # stemming
- import sys
- import re # regex
- import numpy as np
# Optional CLI flag: a truthy first argument enables down-sampling of
# 5-star reviews later in the script (see the `if drop:` block).
try:
    drop = int(sys.argv[1])
except (IndexError, ValueError):
    # No argument given, or it was not an integer -> no down-sampling.
    drop = 0

# Load the stop-word list (one word per line), stripping trailing
# whitespace such as the `\n` at the end of each line.
with open("../stop_words.txt") as f:
    stop_words = [line.strip() for line in f]

# Hunspell morphological analyzer, used below to stem review words.
hobj = hunspell.HunSpell("/usr/share/hunspell/en_US.dic", "/usr/share/hunspell/en_US.aff")
# Load the per-product review datasets (tab-separated) and pool them
# into a single DataFrame.
df = ps.read_csv("datasets/reviews_pantene.csv", sep="\t")
df2 = ps.read_csv("datasets/reviews_gillette.csv", sep="\t")
df3 = ps.read_csv("datasets/reviews_always.csv", sep="\t")
df4 = ps.read_csv("datasets/reviews_tampax.csv", sep="\t")
df5 = ps.read_csv("datasets/reviews_oral-b.csv", sep="\t")
# concatenate everything
final = ps.concat([df, df2, df3, df4, df5])

if drop:
    # Down-sample the dominant 5-star class. The rating column is dirty:
    # "5" may be stored as an int or as a string, so match both forms.
    tmp1 = final[final["user_rating"] == 5]
    tmp2 = final[final["user_rating"] == "5"]
    tmpf = ps.concat([tmp1, tmp2])
    # NOTE(review): no RNG seed is set, so the dropped rows differ
    # between runs; raises if fewer than 2800 five-star rows exist.
    drop_indices = np.random.choice(tmpf.index, 2800, replace=False)
    final = final.drop(drop_indices)

# Raw review texts and ratings as numpy arrays for the passes below.
reviews = np.array(final['review'])
ratings = np.array(final['user_rating'])
# Coerce ratings to int, skipping entries that cannot be parsed.
# Necessary because we have some weird stuff going on in the CSV files:
# some ratings are NaN and some are just a weird string.
user_ratings = []
for r in ratings:
    try:
        user_ratings.append(int(r))
    except (ValueError, TypeError):
        # int(NaN) / int("junk") raise ValueError; int(None) raises
        # TypeError. Anything unparsable is silently dropped.
        pass
# Get all the words (to determine the size of the matrix) and, at the
# same time, construct an array with all the stemmed reviews.
all_words = []
seen = set()  # fast membership mirror of all_words (list scan is O(n))
stemmed_reviews = []
reviews_in_str = []
for review in reviews:
    # Non-string entries (NaN etc.) carry no text -- skip them.
    if not isinstance(review, str):
        continue
    reviews_in_str.append(review)
    curr = []
    for word in review.split(' '):
        stemmed = hobj.stem(word)
        if not stemmed:
            continue
        # Lower-case before the stop-word test; the original compared
        # the raw-case stem, so capitalized stop words slipped through.
        s = stemmed[0].decode("UTF-8").lower()
        if s in stop_words:
            continue
        # BUGFIX: the original appended to `curr` only on a word's first
        # occurrence in the entire corpus, so every later review lost
        # all previously-seen words from its stemmed form.
        curr.append(s)
        if s not in seen:
            seen.add(s)
            all_words.append(s)
    stemmed_reviews.append(curr)
# label_nums counts how many reviews of each class end up in the output
# lists below; ln counts the raw class distribution of user_ratings.
label_nums = {
    1: 0,
    2: 0,
    3: 0,
    4: 0,
    5: 0
}
ln = {
    1: 0,
    2: 0,
    3: 0,
    4: 0,
    5: 0
}
# Print the raw class distribution (rating -> count).
for r in user_ratings:
    ln[r] += 1
for i in range(1, 6):
    print(i, ln[i])
# Size of the smallest rating class; only used by the (currently
# commented-out) balancing condition in the loop below.
min_n = len(final[final["user_rating"] == 1])
for i in range(2, 6):
    if len(final[final["user_rating"] == i]) < min_n:
        min_n = len(final[final["user_rating"] == i])
n_reviews = []
n_ratings = []
n_stemmed = []
# NOTE(review): user_ratings drops rows with unparsable ratings while
# reviews_in_str drops rows with non-string reviews -- if those filters
# ever disagree on a row, the two lists misalign at index i. Confirm
# against the CSVs before trusting the (review, rating) pairing here.
for i in range(len(reviews_in_str)):
    y_res = user_ratings[i]
    rev = reviews_in_str[i]
    rev_stemmed = stemmed_reviews[i]
    # if label_nums[y_res] < min_n:
    n_ratings.append(y_res)
    n_reviews.append(rev)
    n_stemmed.append(" ".join(rev_stemmed))
    label_nums[y_res] += 1
# Assemble the four output tables and, when run as a script, dump them
# to CSV and stop (everything after this block runs only on import).
s_r = [" ".join(row) for row in stemmed_reviews]  # s_r stands for "stemmed_reviews"

complete = {
    "reviews": reviews_in_str,
    "ratings": user_ratings
}
nd = {
    "reviews": n_reviews,
    "ratings": n_ratings
}
nd_stemmed = {
    "reviews": n_stemmed,
    "ratings": n_ratings
}
complete_stemmed = {
    "reviews": s_r,
    "ratings": user_ratings
}

ndf = ps.DataFrame.from_dict(nd)
ndf_stemmed = ps.DataFrame.from_dict(nd_stemmed)
complete_df = ps.DataFrame.from_dict(complete)
ndf_stemmed_complete = ps.DataFrame.from_dict(complete_stemmed)

if __name__ == "__main__":
    complete_df.to_csv("full_data.csv", index=False)
    ndf.to_csv("better_data.csv", index=False)
    ndf_stemmed.to_csv("stemmed.csv", index=False)
    ndf_stemmed_complete.to_csv("stemmed_complete.csv", index=False)
    sys.exit(0)
# Load the pretrained word2vec vectors and find vocabulary words that
# have no vector ("weird words").
mdict = {}
with open("../word2vec.txt", "r") as f:
    k = f.readlines()
# The first two lines are a header -- skip them.
ignore = [i.strip() for i in k[2:]]

# remove stuff like 20th, 20s, 1, etc. (basically numbers)
regex = re.compile(r"(\d+[a-z]*)$")
filtered = sorted(i for i in all_words if not regex.search(i))
# filtered = all_words

# Each line is "<word> <v1> <v2> ..."; map word -> vector components.
for entry in ignore:
    curr = entry.split(" ")
    mdict[curr[0]] = curr[1:]

# Vocabulary words with no pretrained vector. A plain membership test
# replaces the original bare try/except used purely for control flow.
weird_words = [w for w in filtered if w not in mdict]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement