Advertisement
Guest User

dang

a guest
Sep 19th, 2018
145
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.23 KB | None | 0 0
  1. #! /usr/bin/python3
  2. import pandas as ps
  3. import hunspell # stemming
  4. import sys
  5. import re # regex
  6. import numpy as np
  7.  
  8. try:
  9.   drop = int(sys.argv[1])
  10. except:
  11.   drop = 0
  12.  
  13. with open("../stop_words.txt") as f:
  14.   stop_words = f.readlines()
  15.  
  16. # you may also want to remove whitespace characters like `\n` at the end of each line
  17. stop_words = [x.strip() for x in stop_words]
  18.  
  19. hobj = hunspell.HunSpell("/usr/share/hunspell/en_US.dic", "/usr/share/hunspell/en_US.aff")
  20.  
  21. # load datasets
  22. df = ps.read_csv("datasets/reviews_pantene.csv", sep="\t")
  23. df2 = ps.read_csv("datasets/reviews_gillette.csv", sep="\t")
  24. df3 = ps.read_csv("datasets/reviews_always.csv", sep="\t")
  25. df4 = ps.read_csv("datasets/reviews_tampax.csv", sep="\t")
  26. df5 = ps.read_csv("datasets/reviews_oral-b.csv", sep="\t")
  27.  
  28. # concatenate everything
  29. final = ps.concat([df, df2, df3, df4, df5])
  30.  
  31. if drop:
  32.   tmp1 = final[final["user_rating"] == 5]
  33.   tmp2 = final[final["user_rating"] == "5"]
  34.  
  35.   tmpf = ps.concat([tmp1, tmp2])
  36.  
  37.   drop_indices = np.random.choice(tmpf.index, 2800, replace=False)
  38.   final = final.drop(drop_indices)
  39.  
  40.  
  41. reviews = final['review']
  42. orig = reviews
  43. reviews = np.array(reviews)
  44. ratings = np.array(final['user_rating'])
  45.  
  46. # necessary because we have some weird stuff going on in the CSV files
  47. # some ratings are NaN and some are just a weird string
  48. user_ratings = []
  49. for r in ratings:
  50.     try:
  51.         r_curr = int(r)
  52.         user_ratings.append(r_curr)
  53.     except:
  54.         pass
  55.  
  56. # get all the words (to determine size of matrix)
  57. # and at the same time construct an array with all the stemmed words
  58. all_words = []
  59. stemmed_reviews = []
  60. reviews_in_str = []
  61. for review in reviews:
  62.     if not (type(review) is str):
  63.         continue
  64.     reviews_in_str.append(review)
  65.     sp = review.split(' ')
  66.     curr = []
  67.     for word in sp:
  68.         stemmed = hobj.stem(word)
  69.         if len(stemmed) > 0:
  70.             s = stemmed[0].decode("UTF-8")
  71.             if s not in stop_words and s not in all_words:
  72.                 all_words.append(s.lower())
  73.                 curr.append(s.lower())
  74.     stemmed_reviews.append(curr)
  75.  
  76.  
  77. label_nums = {
  78.   1: 0,
  79.   2: 0,
  80.   3: 0,
  81.   4: 0,
  82.   5: 0
  83. }
  84.  
  85. ln = {
  86.   1: 0,
  87.   2: 0,
  88.   3: 0,
  89.   4: 0,
  90.   5: 0
  91. }
  92.  
  93. for r in user_ratings:
  94.   ln[r] += 1
  95.  
  96. for i in range(1, 6):
  97.   print(i, ln[i])
  98.  
  99. min_n = len(final[final["user_rating"] == 1])
  100.  
  101. for i in range(2, 6):
  102.   if len(final[final["user_rating"] == i]) < min_n:
  103.       min_n = len(final[final["user_rating"] == i])
  104.  
  105. n_reviews = []
  106. n_ratings = []
  107. n_stemmed = []
  108. for i in range(len(reviews_in_str)):
  109.   y_res = user_ratings[i]
  110.   rev = reviews_in_str[i]
  111.   rev_stemmed = stemmed_reviews[i]
  112.  
  113.   # if label_nums[y_res] < min_n:
  114.   n_ratings.append(y_res)
  115.   n_reviews.append(rev)
  116.   n_stemmed.append(" ".join(rev_stemmed))
  117.   label_nums[y_res] += 1
  118.  
  119. complete = {
  120.   "reviews": reviews_in_str,
  121.   "ratings": user_ratings
  122. }
  123.  
  124. nd = {
  125.   "reviews": n_reviews,
  126.   "ratings": n_ratings
  127. }
  128.  
  129. nd_stemmed = {
  130.   "reviews": n_stemmed,
  131.   "ratings": n_ratings
  132. }
  133.  
  134. s_r = [] # s_r stands for "stemmed_reviews"
  135. for row in stemmed_reviews:
  136.   s_r.append(" ".join(row))
  137.  
  138. complete_stemmed = {
  139.   "reviews": s_r,
  140.   "ratings": user_ratings
  141. }
  142.  
  143.  
  144.  
  145. ndf = ps.DataFrame.from_dict(nd)
  146. ndf_stemmed = ps.DataFrame.from_dict(nd_stemmed)
  147. complete_df = ps.DataFrame.from_dict(complete)
  148. ndf_stemmed_complete = ps.DataFrame.from_dict(complete_stemmed)
  149.  
  150. if __name__ == "__main__":
  151.  
  152.   complete_df.to_csv("full_data.csv", index=False)
  153.  
  154.   ndf.to_csv("better_data.csv", index=False)
  155.  
  156.   ndf_stemmed.to_csv("stemmed.csv", index=False)
  157.  
  158.   ndf_stemmed_complete.to_csv("stemmed_complete.csv", index=False)
  159.  
  160.   sys.exit(0)
  161.  
  162. mdict = {}
  163. with open("../word2vec.txt", "r") as f:
  164.   k = f.readlines()
  165.  
  166. ignore = k[2 : len(k)]
  167. ignore = [i.strip() for i in ignore]
  168.  
  169. # remove stuff like 20th, 20s, 1, etc. (basically numbers)
  170. regex = re.compile(r"(\d+[a-z]*)$")
  171. filtered = [i for i in all_words if not regex.search(i)]
  172. # filtered = all_words
  173. filtered.sort()
  174.  
  175. for i in ignore:
  176.   curr = i.split(" ")
  177.   w = curr[0]
  178.   vec = curr[1 : len(curr)]
  179.   mdict[w] = vec
  180.  
  181. weird_words = []
  182. for i in filtered:
  183.   try:
  184.     # just to trigger the exception
  185.     _ = mdict[i]
  186.   except:
  187.     weird_words.append(i)
  188.     continue
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement