# Untitled


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
reviewsDf = pd.read_csv("documents/beer_reviews.csv")

# 1
# Which brewer produces the strongest beers on average?
#
# The file is a list of reviews, so a beer appears once per review. A beer's
# ABV is the same in every one of its reviews, so keep a single row per beer;
# otherwise heavily reviewed beers would be over-weighted in a brewer's mean.
beerList = reviewsDf.drop_duplicates(['beer_beerid'])

# Restrict aggregation to numeric columns: averaging the text columns
# (names, styles, ...) is meaningless and raises on recent pandas versions.
numericColumns = reviewsDf.select_dtypes(include="number").columns

# Per-brewery mean of every numeric column over all reviews.
brewers = reviewsDf.groupby("brewery_name")[numericColumns].mean()

# Per-brewery mean ABV over distinct beers (one row per beer).
beer_abv = beerList.groupby("brewery_name")[["beer_abv"]].mean()

print("Brewer with highest ABV on average:")
print(beer_abv['beer_abv'].idxmax())
print("Its (Average) ABV:")
print(beer_abv['beer_abv'].max())
# Brewers with only one or very few beers could dominate this ranking.
# Filtering them out was considered, but they are deliberately kept on the list.


# 2
# Recommend a beer style.
#
# Approach:
#   * drop the styles whose mean overall review is below the average style;
#   * drop the styles that are too light or too strong (mean ABV outside the
#     open interval 3.5-7; a percentile-based cut-off would also work);
#   * rank what is left by an index built from the mean overall rating,
#     slightly penalised by how volatile (high standard deviation) the
#     ratings of that style are.

# Aggregate numeric columns only: mean/std of the text columns is meaningless
# and raises on recent pandas versions.
styleNumericColumns = reviewsDf.select_dtypes(include="number").columns
reviewsByStyle = reviewsDf.groupby("beer_style")[styleNumericColumns]

beertype_avg = reviewsByStyle.mean()
# The rating threshold is the mean over ALL styles, computed before filtering.
wellRated = beertype_avg['review_overall'] > beertype_avg['review_overall'].mean()
moderateAbv = (beertype_avg['beer_abv'] > 3.5) & (beertype_avg['beer_abv'] < 7)
# .copy() so later column assignments act on an independent frame.
beertype_avg = beertype_avg[wellRated & moderateAbv].copy()

beertype_avg_series = beertype_avg['review_overall'].rename("avg")

beertype_std = reviewsByStyle.std()
x = beertype_std['review_overall'].rename("std")
# Min-max normalise the standard deviation to [0, 1] for the index below.
# Series.min()/max() skip NaN (a style with a single review has no sample
# std); the builtin min()/max() give order-dependent results with NaN.
stdLow = x.min()
stdHigh = x.max()
beertype_std_series = (x - stdLow) / (stdHigh - stdLow)

# Inner join so that only the styles that survived the filters remain.
combinedDf = pd.concat(
    [beertype_avg_series, beertype_std_series], axis=1, join="inner"
)

# Mean rating reduced by up to 1% for the most volatile style.
combinedDf['index'] = combinedDf['avg'] * (1 + (-combinedDf['std'] * 0.01))

# Show how avg, std and the index relate; a bare expression prints nothing
# when this runs as a script.
print(combinedDf.corr())

print("My recommendation for a beer would be the:")
print(combinedDf['index'].idxmax())
# Explanation:
# The goal is a style that is generally liked and whose ratings are not too
# volatile, hence an index combining the average rating with the (normalised)
# standard deviation. The index weighting was adjusted according to the
# correlation printed above.

# 3
# Which review aspect matters most for the overall rating?

# Pearson correlation of every numeric column with the overall review.
# Text columns are excluded first; DataFrame.corr() on a frame that still
# contains them raises on recent pandas versions.
pearsonCorrelations = reviewsDf.select_dtypes(include="number").corr()['review_overall']
print(pearsonCorrelations)
# Values observed on the original run:
#review_aroma         0.616013
#review_appearance    0.501732
#review_palate        0.701914
#review_taste         0.789816

# Kept next to its only use so this section stays self-contained.
from sklearn.tree import DecisionTreeRegressor

# Order matters: feature_importances_ is reported in this column order.
aspectColumns = ['review_aroma', 'review_appearance', 'review_palate', 'review_taste']

# random_state is fixed so the fitted tree is reproducible.
regressor = DecisionTreeRegressor(random_state=0)
tree = regressor.fit(reviewsDf[aspectColumns], reviewsDf[['review_overall']])
print(tree.feature_importances_)
# Values observed on the original run:
#[0.00479667 0.00543342 0.0526886  0.9370813 ]
#aroma       appearance palate    taste


# Explanation
# Pearson's correlation signals that all four aspects are related to the
# overall review, with taste being the highest and appearance the lowest.
# As an extra check, a (simple) decision tree regressor was fitted; its
# feature importances indicate that taste is by far the best predictor of
# a good rating.


# 4
# Recommend beer styles for someone who enjoys aroma and appearance.
# Both reviews are on the same scale (1-5), so they are simply added up to
# get a single value combining appearance and aroma.
# This reuses the per-style averages created earlier, which already carry
# the filters on ABV and overall rating.

# assign() returns a new frame instead of writing into a frame that was
# produced by boolean filtering (which triggers SettingWithCopyWarning).
beertype_avg = beertype_avg.assign(
    **{'aroma&appearance': beertype_avg['review_aroma'] + beertype_avg['review_appearance']}
)
# The 5 beer styles that best suit someone who enjoys appearance and aroma.
print(beertype_avg['aroma&appearance'].nlargest(5))