Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import numpy as np
- import os
# Work from the directory that holds the CSV export so the relative file name
# below resolves.
os.chdir('/Users/akhilakotapati/Downloads/')

# Display options: show up to 999 columns and never truncate cell text.
# `None` is the supported "no limit" value for max_colwidth; the old `-1`
# spelling was deprecated in pandas 1.0 and is rejected by later releases.
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_colwidth', None)

# Load only the columns this analysis uses.
df = pd.read_csv(
    'amazondata.csv',
    usecols=[
        'uniq_id',
        'product_name',
        'manufacturer',
        'price',
        'number_available_in_stock',
        'number_of_reviews',
        'number_of_answered_questions',
        'average_review_rating',
        'amazon_category_and_sub_category',
        'customers_who_bought_this_item_also_bought',
        'description',
        'product_information',
        'product_description',
        'items_customers_buy_after_viewing_this_item',
        'customer_questions_and_answers',
        'customer_reviews',
    ],
)

# Notebook-style inspection: these bare expressions only render output when
# run as the last statement of an interactive cell.
df.head(10)
df.dtypes
df.isnull().sum()
- # Here, we observe that there are missing values in almost all the columns.
- # However, what concerns us the most is the missing values in price and number_available_in_stock.
- # The 'average_review_rating' column contains redundant text: we know that every rating is out of 5 stars.
- # So we can convert this column into a float data type.
- # The 'number_available_in_stock' column has special characters. We can remove them and split it into two different columns.
def mapnumstock(v):
    """Split one 'number_available_in_stock' cell into (count, label).

    The cell is expected to look like ``'<int>\\xa0<label>'``: an integer and
    a text label separated by a non-breaking space.

    Args:
        v: Raw cell value; may be a string or a missing value.

    Returns:
        A 2-tuple ``(int_count, label)`` on success. ``(nan, nan)`` when the
        value is missing, is not a string, has no ``'\\xa0'`` separator, or
        its first part is not an integer.
    """
    # np.nan is the spelling that works on every NumPy release; the np.NaN
    # alias was removed in NumPy 2.0.
    unparsed = (np.nan, np.nan)

    if pd.isnull(v) or not isinstance(v, str):
        return unparsed

    parts = v.split('\xa0')
    # Without a separator there is no label part; indexing parts[1] would
    # raise IndexError, which the ValueError handler below does not cover.
    if len(parts) < 2:
        return unparsed

    try:
        return int(parts[0]), parts[1]
    except ValueError:
        return unparsed
def mapnumber_of_reviews(v):
    """Convert one 'number_of_reviews' cell to an integer.

    Args:
        v: Raw cell value: a string that may contain thousands separators
            (e.g. ``'1,234'``), an already-numeric value, or a missing value.

    Returns:
        The review count as ``int``, or ``nan`` when the value is missing or
        cannot be interpreted as an integer.
    """
    # np.nan works on every NumPy release; the np.NaN alias was removed in
    # NumPy 2.0.
    if pd.isnull(v):
        return np.nan

    if isinstance(v, str):
        # Drop thousands separators before parsing.
        candidate = v.replace(",", "")
    else:
        # Already-numeric cells have no .replace(); convert them directly.
        candidate = v

    try:
        return int(candidate)
    except (TypeError, ValueError, OverflowError):
        return np.nan
def mapaverage_review_rating(v):
    """Extract the numeric rating from one 'average_review_rating' cell.

    The text before the first occurrence of ``'out'`` is parsed as a float,
    so ``'4.9 out of 5 stars'`` becomes ``4.9``.

    Args:
        v: Raw cell value: a rating string, an already-numeric rating, or a
            missing value.

    Returns:
        The rating as ``float``. Missing or unparseable values return ``0``.
        NOTE(review): 0 is therefore indistinguishable from a genuine zero
        rating; confirm that is intended before averaging this column.
    """
    if pd.isnull(v):
        return 0

    # str() lets an already-numeric rating pass through instead of failing on
    # the missing .split() attribute.
    leading_text = str(v).split('out')[0].strip()
    try:
        return float(leading_text)
    except ValueError:
        return 0
def numstock(x):
    """Wrap the (count, label) pair from mapnumstock in a Series.

    Returning a Series makes ``Series.apply`` expand each cell into two
    columns of a DataFrame.
    """
    return pd.Series(list(mapnumstock(x)))


# Split the raw stock text into a numeric count and a text label, then attach
# both as new columns on df.
dfin_stock = df['number_available_in_stock']
rev = dfin_stock.apply(numstock)
rev.columns = ['inumber_available_in_stock', 'class_available_in_stock']
df['inumber_available_in_stock'] = rev['inumber_available_in_stock']
df['class_available_in_stock'] = rev['class_available_in_stock']
# Reformat the number_of_reviews and average_review_rating columns.
# exception data: df[df['number_of_reviews'].str.contains(',') == True]
#
# Only text columns need parsing. A column pandas already read as numeric is
# left alone: pushing a float column through astype(str) produces strings
# such as '12.0', which int() rejects, turning every value into NaN.
if not pd.api.types.is_numeric_dtype(df['number_of_reviews']):
    df['number_of_reviews'] = df['number_of_reviews'].astype(str).map(mapnumber_of_reviews)
if not pd.api.types.is_numeric_dtype(df['average_review_rating']):
    df['average_review_rating'] = df['average_review_rating'].map(mapaverage_review_rating)
# Count how often each uniq_id occurs, as a frame with two columns:
# 'index' (the id value) and 'counts' (its number of occurrences).
# Both names are set explicitly because pandas 2.0 changed the default names
# produced by value_counts(); relying on the defaults would yield a 'uniq_id'
# column instead of 'index', breaking the x['index'] lookups.
x = (
    df["uniq_id"]
    .value_counts()
    .rename_axis('index')
    .reset_index(name='counts')
)
def result():
    """Print a data-quality summary for the key columns of ``df``.

    Reports the null count of the uniq_id, product_name, price and
    number_available_in_stock columns (only for columns that have nulls),
    then every uniq_id that occurs more than once according to ``x``, and
    finally a header line for the remaining columns.

    Reads the module-level ``df`` and ``x``; returns nothing.
    """
    null_reports = (
        ('uniq_id', "There are {} number of null values in the uniq id column"),
        ('product_name', "There are {} number of null values in the product name column"),
        ('price', "There are {} number of null values in the price column."),
        ('number_available_in_stock', "There are {} number of null values in the available stock column."),
    )
    for column, template in null_reports:
        null_count = df[column].isnull().sum()
        # Report any column with at least one null. The previous `> 1` test
        # skipped a column with exactly one null and printed a blank line for
        # clean columns.
        if null_count > 0:
            print(template.format(null_count))

    # x holds one row per distinct id: 'index' is the id, 'counts' its
    # number of occurrences.
    for uniq_id, count in zip(x['index'], x['counts']):
        if count > 1:
            print("The uniq id '{}' has '{}' duplicate values.".format(uniq_id, count))

    # NOTE(review): the source lost its indentation; this header is assumed
    # to be the last statement of the function — confirm.
    print("Other columns that have null values:")


result()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement