Advertisement
Guest User

Untitled

a guest
Feb 21st, 2018
59
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.02 KB | None | 0 0
  1. import numpy as np
  2. import pandas as pd
  3. import string
  4.  
  5. def topWords(reviewArray):
  6. reviewArray = reviewArray.str.lower()
  7. reviewArray = reviewArray.str.translate(string.punctuation, None)
  8. words2D = reviewArray.str.split()
  9. dic = {}
  10.  
  11. for row in words2D:
  12. for s in row:
  13. if s in dic:
  14. dic[s] = dic[s] + 1
  15. else:
  16. dic[s] = 1
  17.  
  18. sorted_list = sorted(dic, key=dic.get, reverse=True)
  19. return sorted_list[:10000]
  20.  
  21. def binBagWords(topWords, reviews):
  22. dic_ref = {}
  23. wordSet = set()
  24.  
  25. i = 0
  26. for word in topWords:
  27. dic_ref[word] = i
  28. wordSet.add(word)
  29. i = i+1
  30.  
  31. toRet = np.zeros((len(reviews), len(topWords)))
  32. words2D = reviews.str.split()
  33. for i in range(len(reviews)):
  34. for j in words2D[i]:
  35. if j in wordSet:
  36. toRet[i][dic_ref[j]] = 1
  37. return toRet
  38.  
  39. def frequencyBagWords(topWords, reviews):
  40. dic_ref = {}
  41. wordSet = set()
  42.  
  43. i = 0
  44. for word in topWords:
  45. dic_ref[word] = i
  46. wordSet.add(word)
  47. i = i+1
  48.  
  49. toRet = np.zeros((len(reviews), len(topWords)))
  50. words2D = reviews.str.split()
  51. for i in range(len(reviews)):
  52. count = 0
  53. for j in words2D[i]:
  54. if j in wordSet:
  55. toRet[i][dic_ref[j]] = toRet[i][dic_ref[j]] + 1
  56. count = count + 1
  57. if (count != 0):
  58. for j in range(len(toRet[i])):
  59. toRet[i][j] = toRet[i][j] / count
  60. return toRet
  61.  
  62. if __name__ == "__main__":
  63. yelp_train = pd.read_csv("Data/yelp-train.txt", sep='\t', header=None)
  64. yelp_test = pd.read_csv("Data/yelp-test.txt", sep='\t', header=None)
  65. yelp_valid = pd.read_csv("Data/yelp-valid.txt", sep='\t', header=None)
  66.  
  67. yelp_words = topWords(yelp_train[0])
  68. binMatrix = binBagWords(yelp_words[:10000], yelp_train[0])
  69. freqMatrix = frequencyBagWords(yelp_words[:10000], yelp_train[0])
  70. print(freqMatrix)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement