Advertisement
Guest User

Untitled

a guest
Dec 11th, 2018
153
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.72 KB | None | 0 0
  1.  
  2. import pandas as pd
  3. import numpy as np
  4. import matplotlib.pyplot as plt
  5. reviewsDf = pd.read_csv("documents/beer_reviews.csv")
  6.  
  7. #The file is a list of reviews, the beer ABV will remain constant
  8. # for all the same beers, therefore we want to get rid of the duplicate beers
  9. beerList = reviewsDf.drop_duplicates(['beer_beerid'])
  10.  
  11. brewers = pd.pivot_table(reviewsDf, index="brewery_name", aggfunc=np.mean)
  12. #For this use
  13. print("Brewer with highest ABV on average:")
  14. print(beer_abv['beer_abv'].idxmax())
  15. print("Its (Average) ABV:")
  16. print(beer_abv['beer_abv'].max())
  17. #I had the concern that a certain brewer would only have one or very few beers and considered filterning them out
  18. # but concluded that
  19. # they should still be kept on the list
  20.  
  21.  
  22. # 2
  23.  
  24. #filteredDf = reviewsDf[reviewsDf['review_overall'] > reviewsDf['review_overall'].mean()]
  25. #filteredDf = filteredDf[reviewsDf['beer_abv'] < 7]
  26. #filteredDf = filteredDf[reviewsDf['beer_abv'] > 3.5]
  27.  
  28.  
  29. beertype_avg = pd.pivot_table(reviewsDf, index="beer_style", aggfunc=np.mean)
  30. beertype_avg = beertype_avg[beertype_avg['review_overall'] > beertype_avg['review_overall'].mean()]
  31. beertype_avg = beertype_avg[beertype_avg['beer_abv'] > 3.5]
  32. beertype_avg = beertype_avg[beertype_avg['beer_abv'] < 7]
  33.  
  34.  
  35. beertype_avg_series = beertype_avg['review_overall'].rename("avg")
  36.  
  37. beertype_std = pd.pivot_table(reviewsDf, index="beer_style", aggfunc=np.std)
  38. x = beertype_std['review_overall'].rename("std")
  39. beertype_std_series = (x-min(x))/(max(x)-min(x))
  40. #Normalize the STD to be used in later calcs
  41.  
  42.  
  43.  
  44. combinedDf = pd.concat([beertype_avg_series,beertype_std_series], axis=1, join="inner")
  45. #inner join to get rid of the records we earlier filtered out
  46.  
  47.  
  48. combinedDf['index'] = combinedDf['avg'] * (1+ (((combinedDf['std'] * -1)) * 0.01))
  49.  
  50. combinedDf.corr()
  51.  
  52.  
  53. print("My recommendation for a beer would be the:")
  54. print(combinedDf['index'].idxmax())
  55. #Explanation:
  56. #I want to get rid of all the beers that are below average on the overall review
  57. # I also want to get rid of the beers that are either too strong or light
  58. # I simply used 3.5 and 7 here, but could've also applied a more mathematical approach
  59. # and removed within e.g. a 50% percentile
  60. #
  61. # Lastly, i develop an index out of the average rating, as well as the STD, as I figured
  62. # I would want a beer that is generally liked, and isn't too volatile
  63. #I adjusted to index according to the correlation
  64.  
  65. # 3
  66.  
  67. pearsonCorrelations = reviewsDf.corr()['review_overall']
  68. print(pearsonCorrelations)
  69. #review_aroma         0.616013
  70. #review_appearance    0.501732
  71. #review_palate        0.701914
  72. #review_taste         0.789816
  73.  
  74. from sklearn.tree import DecisionTreeRegressor
  75.  
  76. regressor = DecisionTreeRegressor(random_state=0)
  77. tree = regressor.fit(reviewsDf[['review_aroma','review_appearance','review_palate', 'review_taste']], reviewsDf[['review_overall']])
  78. print(tree.feature_importances_)
  79. #[0.00479667 0.00543342 0.0526886  0.9370813 ]
  80. #aroma       appearance palate    taste
  81.  
  82.  
  83.  
  84. #Explanation
  85. # Pearsons correlation signals that they all have an effect on the review overall, with taste being highest, and appearance lowest
  86. # I did some extra digging and built a (simple) decision tree regressor which indicates that the taste is by far
  87. # the best predictor of a good rating
  88. ##
  89.  
  90.  
  91.  
  92.  
  93. # 4
  94. # The reviews are all on the same scale (1-5) so we simply ad them up to come up with a value combining
  95. # appearance and aroma
  96. # we can utilzie the data we created earlier
  97.  
  98. beertype_avg['aroma&appearance'] = beertype_avg['review_aroma'] + beertype_avg['review_appearance']
  99. #here is a list of 5 beers types that would suit someone that enjoys appearance and aroma. with the filter on abv and rating
  100. print(beertype_avg['aroma&appearance'].nlargest(5))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement