Advertisement
Guest User

AkhilaKotapati_Protonai

a guest
Sep 15th, 2019
143
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.66 KB | None | 0 0
  1. import pandas as pd
  2. import numpy as np
  3. import os
  4.  
  5. os.chdir('/Users/akhilakotapati/Downloads/')
  6.  
  7. pd.set_option('display.max_columns', 999)
  8. pd.set_option('display.max_colwidth', -1)
  9.  
  10. df = pd.read_csv('amazondata.csv',usecols=['uniq_id', 'product_name', 'manufacturer', 'price',
  11.        'number_available_in_stock', 'number_of_reviews',
  12.        'number_of_answered_questions', 'average_review_rating',
  13.        'amazon_category_and_sub_category',
  14.        'customers_who_bought_this_item_also_bought','description','product_information','product_description',
  15.                                             'items_customers_buy_after_viewing_this_item',
  16.                                             'customer_questions_and_answers',
  17.                                             'customer_reviews'])
  18. df.head(10)
  19.  
  20. df.dtypes
  21.  
  22. df.isnull().sum()
  23.  
  24. # Here, we observe that there are missing values in almost all the columns.
  25. # However, what concerns us the most is the missing values in price and number_available_in_stock.
  26. # The 'average_review_rating' column carries redundant text: every rating is out of 5 stars, so only the leading number matters.
  27. # So we can convert this column into a float data type.
  28. # The 'number_available_in_stock' column has special characters. We can remove them and split the value into two different columns.
  29.  
  30.  
  31. def mapnumstock(v):
  32.     if pd.isnull(v):
  33.         return np.NaN  ,np.NaN  
  34.     try:
  35.         vv = v.split('\xa0')
  36.         return int(vv[0]),vv[1]
  37.     except ValueError:        
  38.         return np.NaN, np.NaN  
  39.  
  40.    
  41. def mapnumber_of_reviews(v):
  42.     if pd.isnull(v):
  43.         return np.NaN
  44.     try:
  45.         vv = v.replace(",","")
  46.         return int(vv)
  47.     except ValueError:
  48.         return np.NaN
  49.    
  50.  
  51. def mapaverage_review_rating(v):
  52.     if pd.isnull(v):
  53.         return 0
  54.     try:
  55.         vv = v.split('out')[0].strip()
  56.         return float(vv)
  57.     except ValueError:        
  58.         return 0
  59.    
  60.  
  61. numstock = lambda x: pd.Series([i for i in mapnumstock(x)])
  62. dfin_stock= df.loc[:]['number_available_in_stock']
  63. rev = dfin_stock.apply(numstock)
  64. rev.columns = ['inumber_available_in_stock','class_available_in_stock']
  65. df['inumber_available_in_stock'],df['class_available_in_stock']  = rev['inumber_available_in_stock'],rev['class_available_in_stock']
  66.  
  67. # Reformatting the number_of_reviews and price columns.
  68. # exception data: df[df['number_of_reviews'].str.contains(',') == True]
  69. if df['number_of_reviews'].dtype != 'int64':  
  70.     df['number_of_reviews'] = df['number_of_reviews'].astype(str).map(mapnumber_of_reviews)
  71.  
  72.    
  73. if df['average_review_rating'].dtype != 'float64':
  74.     df['average_review_rating'] = df['average_review_rating'].map(mapaverage_review_rating)
  75.    
  76.  
  77.    
  78. x = df["uniq_id"].value_counts().to_frame()
  79. x.columns = ['counts']
  80. x.reset_index(level=0, inplace=True)
  81.  
  82. def result():
  83.     print("There are {} number of null values in the uniq id column".format(df['uniq_id'].isnull().sum()) if df['uniq_id'].isnull().sum()>1 else "")
  84.     print("There are {} number of null values in the product name column".format(df['product_name'].isnull().sum()) if df['product_name'].isnull().sum()>1 else "")
  85.     print("There are {} number of null values in the price column.".format(df['price'].isnull().sum()) if df['price'].isnull().sum()>1 else "")
  86.     print("There are {} number of null values in the available stock column.".format(df['number_available_in_stock'].isnull().sum()) if df['number_available_in_stock'].isnull().sum()>1 else "")
  87.     for i in range(len(x)):
  88.         if x['counts'][i]>1:
  89.             print("The uniq id '{}' has '{}' duplicate values.".format(x['index'][i],x['counts'][i]))
  90.            
  91.     print("Other columns that have null values:")
  92.    
  93.    
  94.  
  95. result()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement