Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import numpy as np
- import matplotlib.pyplot as plt
# Load the raw review data. The file has one row per review, so a beer shows
# up once for every review it received.
reviewsDf = pd.read_csv("documents/beer_reviews.csv")

# 1
# A beer's ABV is the same in all of its reviews, so keep a single row per
# beer. Averaging over the raw reviews instead would weight each beer by the
# number of reviews it happened to receive.
beerList = reviewsDf.drop_duplicates(['beer_beerid'])

# Mean ABV per brewery, computed over distinct beers. `values` is limited to
# the ABV column because the other columns of a de-duplicated row describe one
# arbitrary review, and because averaging the text columns raises on recent
# pandas versions.
brewers = pd.pivot_table(
    beerList,
    index="brewery_name",
    values=["beer_abv"],
    aggfunc="mean",
)

# idxmax()/max() skip NaN, so breweries without any ABV data cannot win.
average_abv = brewers['beer_abv']
print("Brewer with highest ABV on average:")
print(average_abv.idxmax())
print("Its (Average) ABV:")
print(average_abv.max())

# A brewery with only one or very few beers can dominate this ranking.
# Filtering such breweries out was considered, but they are deliberately kept
# on the list.
# 2
# Recommend a beer style.
#
# Approach:
#   * drop every style whose average overall rating is below the average of
#     all styles,
#   * drop styles that are too light or too strong (average ABV outside the
#     open interval 3.5-7; a percentile-based cut would be a more principled
#     alternative),
#   * rank the remaining styles by an index built from the average rating and
#     the spread of the ratings: a good pick is generally liked and not too
#     volatile.

# Aggregate numeric columns only; the text columns (names, styles, ...) cannot
# be averaged and make recent pandas versions raise.
numeric_columns = reviewsDf.select_dtypes(include="number").columns.tolist()

beertype_avg = pd.pivot_table(
    reviewsDf, index="beer_style", values=numeric_columns, aggfunc="mean"
)
# The rating threshold is the mean over all styles, taken before any filtering.
beertype_avg = beertype_avg[
    beertype_avg['review_overall'] > beertype_avg['review_overall'].mean()
]
beertype_avg = beertype_avg[
    (beertype_avg['beer_abv'] > 3.5) & (beertype_avg['beer_abv'] < 7)
]
beertype_avg_series = beertype_avg['review_overall'].rename("avg")

beertype_std = pd.pivot_table(
    reviewsDf, index="beer_style", values=numeric_columns, aggfunc="std"
)
x = beertype_std['review_overall'].rename("std")
# Min-max normalise the spread to [0, 1] so it can act as a relative penalty.
# Series.min()/max() skip NaN (a style with a single review has no std),
# whereas the built-in min()/max() give order-dependent results with NaN.
beertype_std_series = (x - x.min()) / (x.max() - x.min())

# The inner join keeps only the styles that survived the filters above.
combinedDf = pd.concat(
    [beertype_avg_series, beertype_std_series], axis=1, join="inner"
)

# Index = average rating reduced by up to 1% for the most volatile style.
# The author's note says this weighting was adjusted after inspecting
# combinedDf.corr(); that call had no effect inside a script and was removed.
combinedDf['index'] = combinedDf['avg'] * (1 - 0.01 * combinedDf['std'])

print("My recommendation for a beer would be the:")
print(combinedDf['index'].idxmax())
# 3
# Which review aspect matters most for the overall rating?
from sklearn.tree import DecisionTreeRegressor

# Pearson correlation of every numeric column with the overall rating.
# Restricting to numeric columns first is required on recent pandas versions,
# where DataFrame.corr() no longer silently drops text columns.
pearsonCorrelations = reviewsDf.select_dtypes(include="number").corr()['review_overall']
print(pearsonCorrelations)
# Values recorded from the original author's run:
#   review_aroma       0.616013
#   review_appearance  0.501732
#   review_palate      0.701914
#   review_taste       0.789816

# Cross-check with a simple decision tree: its feature importances indicate
# how much each aspect contributes to predicting the overall rating.
aspect_columns = ['review_aroma', 'review_appearance', 'review_palate', 'review_taste']
regressor = DecisionTreeRegressor(random_state=0)
tree = regressor.fit(reviewsDf[aspect_columns], reviewsDf[['review_overall']])
print(tree.feature_importances_)
# Values recorded from the original author's run:
#   [0.00479667 0.00543342 0.0526886  0.9370813 ]
#    aroma      appearance palate     taste

# Explanation
# Pearson's correlation indicates that all four aspects are related to the
# overall review, with taste the strongest and appearance the weakest.
# The decision tree regressor agrees: taste is by far the best predictor of a
# good rating.
# 4
# Recommend styles for someone who cares about aroma and appearance.
# Per the author, both review scores use the same 1-5 scale, so adding them
# gives a single combined score. The per-style averages computed earlier are
# reused, so the ABV and rating filters still apply.
aroma_scores = beertype_avg['review_aroma']
appearance_scores = beertype_avg['review_appearance']
beertype_avg['aroma&appearance'] = aroma_scores + appearance_scores

# The five beer styles with the highest combined aroma + appearance score.
top_styles = beertype_avg['aroma&appearance'].nlargest(5)
print(top_styles)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement