# AkhilaKotapati_Protonai

import pandas as pd
import numpy as np
import os

# Work from the directory that holds the dataset so the relative CSV path
# below resolves.
os.chdir('/Users/akhilakotapati/Downloads/')

pd.set_option('display.max_columns', 999)
# None means "do not truncate cell contents". The old spelling, -1, has been
# deprecated since pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)

# Load only the columns used by the analysis.
df = pd.read_csv(
    'amazondata.csv',
    usecols=[
        'uniq_id',
        'product_name',
        'manufacturer',
        'price',
        'number_available_in_stock',
        'number_of_reviews',
        'number_of_answered_questions',
        'average_review_rating',
        'amazon_category_and_sub_category',
        'customers_who_bought_this_item_also_bought',
        'description',
        'product_information',
        'product_description',
        'items_customers_buy_after_viewing_this_item',
        'customer_questions_and_answers',
        'customer_reviews',
    ],
)
# The bare expressions below only display something in an interactive
# session (notebook / REPL); in a plain script they are no-ops.
df.head(10)

df.dtypes

df.isnull().sum()

# Here, we observe that there are missing values in almost all the columns.
# However, what concerns us most are the missing values in price and number_available_in_stock.
# The 'average_review_rating' column carries redundant text: every rating is out of 5 stars,
# so we can keep only the leading number and convert the column to a float data type.
# The 'number_available_in_stock' column contains special characters. We can remove them and split it into two columns.


def mapnumstock(v):
    """Split a stock cell such as ``'5\\xa0new'`` into ``(count, label)``.

    Parameters
    ----------
    v : object
        Raw cell value. The expected form is an integer count and a label
        separated by a non-breaking space (``'\\xa0'``).

    Returns
    -------
    tuple
        ``(int count, label)``. Both elements are NaN when ``v`` is missing
        or the count is not an integer. The label alone is NaN when the
        value has no ``'\\xa0'`` separator (previously an uncaught
        ``IndexError``).
    """
    if pd.isnull(v):
        return np.nan, np.nan

    # str() lets already-numeric cells through instead of failing on .split.
    parts = str(v).split('\xa0')
    try:
        count = int(parts[0])
    except ValueError:
        return np.nan, np.nan

    # A value without a separator has a count but no label.
    label = parts[1] if len(parts) > 1 else np.nan
    return count, label


def mapnumber_of_reviews(v):
    """Parse a review-count cell into an ``int``.

    Parameters
    ----------
    v : object
        Raw cell value, e.g. ``'15'``, ``'1,040'``, ``'15.0'`` or a number.

    Returns
    -------
    int or float
        The count as an ``int``. NaN when ``v`` is missing or does not
        represent a whole number.
    """
    if pd.isnull(v):
        return np.nan

    # Thousands separators are the known exception in this column.
    text = str(v).replace(",", "")
    try:
        return int(text)
    except ValueError:
        pass

    # A float column stringifies as '15.0'; accept it when it is a whole
    # number. 'nan', 'inf' and fractional values fall through to NaN.
    try:
        as_float = float(text)
    except ValueError:
        return np.nan
    return int(as_float) if as_float.is_integer() else np.nan


def mapaverage_review_rating(v):
    """Extract the numeric rating from text like ``'4.9 out of 5 stars'``.

    Returns the text before the first ``'out'`` as a ``float``. Returns
    ``0`` when ``v`` is missing or that text is not a number.
    """
    if pd.isnull(v):
        return 0

    # Everything before the first "out" is the rating itself.
    leading_text = v.partition('out')[0].strip()
    try:
        rating = float(leading_text)
    except ValueError:
        rating = 0
    return rating


def numstock(x):
    """Return the (count, label) pair for one stock cell as a Series."""
    return pd.Series(list(mapnumstock(x)))


# Split 'number_available_in_stock' into a numeric count and a label column.
dfin_stock = df['number_available_in_stock']
# Build the two-column frame in one go from the parsed pairs. This avoids
# creating a Series per row, and it still yields a correctly shaped (empty)
# frame when df has no rows.
rev = pd.DataFrame(
    [mapnumstock(value) for value in dfin_stock],
    index=dfin_stock.index,
    columns=['inumber_available_in_stock', 'class_available_in_stock'],
)
df['inumber_available_in_stock'] = rev['inumber_available_in_stock']
df['class_available_in_stock'] = rev['class_available_in_stock']

# Reformatting the number_of_reviews and average_review_rating columns.
# exception data: df[df['number_of_reviews'].str.contains(',') == True]
# Only text columns need parsing. A column pandas already read as numeric
# (for example float64 because of missing values) is left untouched:
# round-tripping it through text is unnecessary and can lose its values.
if not pd.api.types.is_numeric_dtype(df['number_of_reviews']):
    df['number_of_reviews'] = df['number_of_reviews'].astype(str).map(mapnumber_of_reviews)


if not pd.api.types.is_numeric_dtype(df['average_review_rating']):
    df['average_review_rating'] = df['average_review_rating'].map(mapaverage_review_rating)


# Occurrences of each uniq_id, most frequent first, as a frame with the
# columns 'index' (the id) and 'counts'. The column names are set explicitly
# because the names value_counts()/reset_index() produce differ between
# pandas 1.x and 2.x.
x = df["uniq_id"].value_counts().rename_axis('index').reset_index(name='counts')

def result():
    """Print a data-quality summary for the module-level ``df``.

    Prints, in order:

    * one line per key column (uniq id, product name, price, available
      stock) that contains at least one null value;
    * one line per ``uniq_id`` that occurs more than once, most frequent
      first;
    * the header for the remaining columns with nulls.

    Columns without nulls are skipped instead of printing a blank line, and
    a column with exactly one null is now reported (the previous ``> 1``
    test ignored it).
    """
    null_reports = (
        ('uniq_id', "There are {} number of null values in the uniq id column"),
        ('product_name', "There are {} number of null values in the product name column"),
        ('price', "There are {} number of null values in the price column."),
        ('number_available_in_stock', "There are {} number of null values in the available stock column."),
    )
    for column, template in null_reports:
        null_count = df[column].isnull().sum()
        if null_count > 0:
            print(template.format(null_count))

    # Count ids straight from df so the report does not depend on the column
    # names of any precomputed frame.
    for uniq_id, occurrences in df['uniq_id'].value_counts().items():
        if occurrences > 1:
            print("The uniq id '{}' has '{}' duplicate values.".format(uniq_id, occurrences))

    print("Other columns that have null values:")


# Print the null-value and duplicate-id report for the loaded data.
result()