SHARE
TWEET

proton.ai -task_3

Laay_Trivedi Sep 8th, 2019 (edited) 66 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. ---
  2. title: "Report-Amazon Sales data"
  3. output:
  4.   html_document: default
  5.   pdf_document: default
  6. ---
  7.  
  8. ```{r setup, include=FALSE}
  9. knitr::opts_chunk$set(echo = TRUE)
  10. ```
  11. ##############################################################################################################
  12.  
  13. ###### prton.ai - task
  14. #### Preparations
  15.  
  16. ##############################################################################################################
  17.  
  18. ```{r loading_packages, message = FALSE}
  19. options( warn = -1 )
  20. #install.packages("rmarkdown")
  21. #install.packages("dplyr")
  22. #install.packages("ggplot2")
  23. #install.packages("tidyr")
  24. #install.packages("purrr")
  25. #install.packages("kableExtra")
  26. #install.packages("VIM")
  27. #install.packages("knitr")
  28. #install.packages("pheatmap")
  29. #install.packages("readr")
  30. #install.packages("arules")
  31. #install.packages("stringr")
  32. library(rmakrdown)
  33. library(dplyr)
  34. library(ggplot2)
  35. library(tidyr)
  36. library(purrr)
  37. library(kableExtra)
  38. library(VIM)
  39. library(knitr)
  40. library(pheatmap)
  41. library(readr)
  42. library(arules)
  43. library(stringr)
  44.  
  45.  
  46. ########## ___________________________________________________________________________________________________________
  47. data <- read_csv("C:/Users/laayt/Downloads/amazon_co-ecommerce_sample.csv")
  48.  
  49. ```
  50.  
  51. #### Data Cleaning-Make variables more readables by removing unnecessary characters from it.
  52.  
  53.  
  54. ```{r}
  55. data2 <- data[,c("price", "average_review_rating", "number_available_in_stock")]
  56.  
  57. data$price <- as.numeric(gsub("[^0-9.-]", "", data$price))
  58. data$average_review_rating <- as.numeric(gsub('[out of 5 stars]', '', data$average_review_rating))
  59. data$number_available_in_stock <- as.numeric(gsub("[^0-9.-]", "", data$number_available_in_stock))
  60.  
  61. data1 <- data[,c("price", "average_review_rating", "number_available_in_stock")]
  62. ```
  63. ##### Transforming Unstructured Numerical Columns To
  64. ##############################################################################################################
  65. ```{r}
  66.    head(data2,7)
  67. ```
  68.  
  69. ## # A tibble: 7 x 3
  70. ##   price  average_review_rating number_available_in_stock
  71. ##   <chr>  <chr>                 <chr>                    
  72. ## 1 £3.42  4.9 out of 5 stars    5 new                    
  73. ## 2 £16.99 4.5 out of 5 stars    <NA>                    
  74. ## 3 £9.99  3.9 out of 5 stars    2 new                    
  75. ## 4 £39.99 5.0 out of 5 stars    <NA>                    
  76. ## 5 £32.19 4.7 out of 5 stars    <NA>                    
  77. ## 6 £6.99  5.0 out of 5 stars    <NA>                    
  78. ## 7 £24.99 4.5 out of 5 stars    <NA>
  79.  
  80. ##### Structured Numerical Columns
  81. ##############################################################################################################
  82.  
  83. ```{r}
  84. head(data1,7)
  85. ```
  86. ## # A tibble: 7 x 3
  87. ##   price average_review_rating number_available_in_stock
  88. ##   <dbl>                 <dbl>                     <dbl>
  89. ## 1  3.42                   4.9                         5
  90. ## 2 17.0                    4                          NA
  91. ## 3  9.99                   3.9                         2
  92. ## 4 40.0                    0                          NA
  93. ## 5 32.2                    4.7                        NA
  94. ## 6  6.99                   0                          NA
  95. ## 7 25.0                    4                          NA
  96.  
  97.  
  98.  
  99. #### Percentage missing data-Column wise
  100. ```{r}
  101.  
  102. getMissingData <- function(df, row_display = 6){
  103.   df1 <- data.frame(df_cols = names(df))
  104.   df1$miss_num <- map_dbl(df, ~sum(is.na(.x)))
  105.   df1$miss_percentage <- map_dbl(df, ~sum((is.na(.x)/nrow(df))*100))
  106.   df1 <<- df1
  107.   return(head(df1, n = row_display))
  108. }
  109. getMissingData(data)
  110. #kable(head(df1,6)) %>%
  111.  # kable_styling(bootstrap_options = c("striped", "condensed"), font_size = 12)
  112. ```
  113. ##                     df_cols miss_num miss_percentage
  114. ## 1                   uniq_id        0        0.000000
  115. ## 2              product_name        0        0.000000
  116. ## 3              manufacturer        6        0.059988
  117. ## 4                     price     1457       14.567087
  118. ## 5 number_available_in_stock     2502       25.014997
  119. ## 6         number_of_reviews       22        0.219956
  120.  
  121. #### Percentage missing data-Raw wise
  122. ##############################################################################################################
  123.  
  124.  
  125. ```{r}
  126. getMissingRows <- function(df, row_display = 6){
  127.   df1 <- data.frame(df_rows = 1:nrow(df))
  128.   df1$miss_num <- rowSums(is.na(df))
  129.   df1$miss_percentage <- rowSums((is.na(df)/ncol(df))*100)
  130.   df1 <- df1 %>% arrange(desc(miss_percentage))
  131.   df3 <<- df1
  132.   return(head(df1, n = row_display))
  133. }
  134. getMissingRows(data)
  135. #kable(head(df3, 5)) %>%
  136.   #kable_styling(bootstrap_options = c("striped", "condensed"), font_size = 12)
  137.  
  138. ```
  139. ##   df_rows miss_num miss_percentage
  140. ## 1    2276      892        99.33185
  141. ## 2     644      889        98.99777
  142. ## 3     795      889        98.99777
  143. ## 4    1013      889        98.99777
  144. ## 5    1419      889        98.99777
  145. ## 6    1809      889        98.99777
  146.  
  147. #### Removing completely empty rows and columns
  148. ##############################################################################################################
  149.  
  150. ```{r}
  151. dim(data) #inspecting current dimension of the dataset
  152. remove_empty_rows <- function(df){
  153.   no_empty_rows <<- df[rowSums(is.na(df)) != ncol(df),]
  154.  
  155. }
  156. remove_empty_rows(data)
  157.  
  158. complete_data <- function(df){
  159.   removeCols <- colSums(is.na(df)) == nrow(df)
  160.   complete_data <<- df[!removeCols]
  161. }
  162. complete_data(no_empty_rows)
  163.  
  164. dim(complete_data) #removing empty columns
  165.  
  166. ```
  167.  
  168. #### Data-Type-Definition
  169. ##############################################################################################################
  170.  
  171.  
  172. ```{r}
  173. Categorical_data<-function(df, numlevels = 5){
  174.   temp_use <- data.frame(levels = apply(df, 2, function(x) length(table(x))))
  175.   temp_use$var_names <- rownames(temp_use)
  176.   type_class <- sapply(colnames(df), function(x) class(df[[x]]))
  177.   temp_use$type_class <- as.data.frame(type_class)
  178.   Cat_data <<- temp_use %>%
  179.     filter(levels <= numlevels |  type_class != "numeric" )
  180. }
  181.  
  182. Continious_data <-function(df, numlevels = 5){
  183.   temp_use <- data.frame(levels = apply(df, 2, function(x) length(table(x))))
  184.   temp_use$var_names <- rownames(temp_use)
  185.   type_class <- sapply(colnames(df), function(x) class(df[[x]]))
  186.   temp_use$type_class <- as.data.frame(type_class)
  187.   Cont_data <<- temp_use %>%
  188.     filter(levels > numlevels & type_class == "numeric" )
  189. }
  190.  
  191. zero_variance_vars <- function(df){
  192.   temp_use <- data.frame(levels = apply(df, 2, function(x) length(table(x))))
  193.   temp_use$var_names <- rownames(temp_use)
  194.   type_class <- sapply(colnames(df), function(x) class(df[[x]]))
  195.   temp_use$type_class <- as.data.frame(type_class)
  196.   static_data <<- temp_use %>%
  197.     filter(temp_use$levels == 1 | temp_use$levels == nrow(df))
  198.   if(nrow(static_data) == 0){
  199.     print("There are no variables in this dataframe having zero variance")
  200.   }
  201. }
  202. ```
  203. #### List of Continuous variables
  204. ##############################################################################################################
  205. ```{r}
  206. Continious_data(complete_data)
  207. Cont_data
  208. #kable(Cont_data) %>%
  209.  # kable_styling(bootstrap_options = c("striped", "condensed"), font_size = 12)
  210. ```
  211. ##   levels                    var_names type_class
  212. ## 1   2610                        price    numeric
  213. ## 2     74    number_available_in_stock    numeric
  214. ## 3    192            number_of_reviews    numeric
  215. ## 4     19 number_of_answered_questions    numeric
  216. ## 5     18        average_review_rating    numeric
  217.  
  218.  
  219. #### List of Categorical variables
  220. ##############################################################################################################
  221.  
  222. ```{r}
  223. Categorical_data(complete_data, 5)
  224. Cat_data
  225. #kable(Cat_data) %>%
  226.   #kable_styling(bootstrap_options = c("striped", "condensed"), font_size = 12)
  227. ```
  228.  
  229. ##    levels                                   var_names type_class
  230. ## 1    9992                                     uniq_id  character
  231. ## 2    9965                                product_name  character
  232. ## 3    2654                                manufacturer  character
  233. ## 4     257            amazon_category_and_sub_category  character
  234. ## 5    8756  customers_who_bought_this_item_also_bought  character
  235. ## 6    8515                                 description  character
  236. ## 7    9939                         product_information  character
  237. ## 8    8514                         product_description  character
  238. ## 9    6749 items_customers_buy_after_viewing_this_item  character
  239. ## 10    910              customer_questions_and_answers  character
  240. ## 11   9901                            customer_reviews  character
  241.  
  242.  
  243. #### List of Zero-variant variables
  244. ##############################################################################################################
  245.  
  246. ```{r}
  247. zero_variance_vars(complete_data)
  248. ```
  249. ## [1] "There are no variables in this data frame having zero variance"
  250.  
  251.  
  252. #### Missing Value-Imputation : List of imputed missing value
  253. ##############################################################################################################
  254.  
  255. ```{r}
  256. data_cont <- complete_data[, which(colnames(complete_data) %in% Cont_data$var_names)]
  257. imputed_df = as.data.frame(VIM::kNN(data_cont,c(Cont_data$var_names), k = 3))
  258. imputed_df <- imputed_df[,c(1:5)]
  259. head(imputed_df)
  260. #kable(head(imputed_df)) %>%
  261.  # kable_styling(bootstrap_options = c("striped", "condensed"), font_size = 12)
  262. ```
  263. ##   price number_available_in_stock number_of_reviews
  264. ## 1  3.42                         5                15
  265. ## 2 16.99                        17                 2
  266. ## 3  9.99                         2                17
  267. ## 4 39.99                         3                 1
  268. ## 5 32.19                         7                 3
  269. ## 6  6.99                         5                 2
  270. ##   number_of_answered_questions average_review_rating
  271. ## 1                            1                   4.9
  272. ## 2                            1                   4.0
  273. ## 3                            2                   3.9
  274. ## 4                            2                   0.0
  275. ## 5                            2                   4.7
  276. ## 6                            1                   0.0
  277.  
  278.  
  279.  
  280. #### CORRELATION -  
  281. ##############################################################################################################
  282.  
  283. ```{r}
  284. data_cont <- imputed_df[, which(colnames(imputed_df) %in% Cont_data$var_names)]
  285. data_cat  <- na.omit(complete_data[, which(colnames(complete_data) %in% Cat_data$var_names)])
  286.  
  287. cortable <- function(df, cutoff=0){
  288.   library(dplyr)
  289.   library(pheatmap)
  290.   a <- abs(cor(df))
  291.   corr <- data.table(variable1=rownames(a)[row(a)], variable2=colnames(a)[col(a)], corr=c(a))
  292.   corr <- corr %>%
  293.     filter(!(variable1 == variable2)) %>%
  294.     filter(corr > cutoff) %>%
  295.     arrange(desc(corr))
  296.   corr <- corr[!duplicated(t(apply(corr,1,sort))),]
  297.   return(corr)
  298. }
  299. ```
  300.  
  301. #### Highly correlated variables in descending order
  302. ##############################################################################################################
  303.  
  304. ```{r}
  305. cortable(data_cont)
  306. ```
  307. ##                       variable1                    variable2        corr
  308. ## 1         average_review_rating            number_of_reviews 0.269580511
  309. ## 3             number_of_reviews    number_available_in_stock 0.221341663
  310. ## 5         average_review_rating    number_available_in_stock 0.149496878
  311. ## 7  number_of_answered_questions            number_of_reviews 0.111823030
  312. ## 9         average_review_rating                        price 0.065134089
  313. ## 11        average_review_rating number_of_answered_questions 0.028783844
  314. ## 13            number_of_reviews                        price 0.027596509
  315. ## 15 number_of_answered_questions    number_available_in_stock 0.019411542
  316. ## 17    number_available_in_stock                        price 0.010957996
  317. ## 19 number_of_answered_questions                        price 0.005472014
  318.  
  319.  
  320.  
  321.  
  322. #### Correlation - Heat Map
  323. ##############################################################################################################
  324.  
  325. ```{r}
  326. pheatmap::pheatmap(cor(data_cont))
  327. ```
  328.  
  329.  
  330. ##############################################################################################################
  331.  
  332. Cleaning -URLs Data for Further Analysis'. Removing "http://www.amazon.co.uk/" and "/dp/", "-" and "|" from the URLs and Separating every transactional product to new column.
  333. ##############################################################################################################
  334.  
  335. ```{r echo = FALSE}
  336. df24 <- as.data.frame(stringr::str_split_fixed(data$customers_who_bought_this_item_also_bought, " | ", 7))
  337. df24 <- df24[, c(1, 3, 5, 7)]
  338. df231<- as.data.frame(map_chr(df24$V1,~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
  339. df232<- as.data.frame(map_chr(df24$V3, ~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
  340. df233 <- as.data.frame(map_chr(df24$V5, ~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
  341. df234 <- as.data.frame(map_chr(df24$V7, ~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
  342. df23<- cbind(df231, df232, df233, df234)
  343. names(df23) <- c("v1", "v2", "v3", "v4")
  344. df221 <- as.data.frame(map_chr(df23$v1, ~stringr::str_replace_all(.x, "/dp/.*", "")))
  345. df222 <- as.data.frame(map_chr(df23$v2, ~stringr::str_replace_all(.x, "/dp/.*", "")))
  346. df223 <- as.data.frame(map_chr(df23$v3, ~stringr::str_replace_all(.x, "/dp/.*", "")))
  347. df224 <- as.data.frame(map_chr(df23$v4, ~stringr::str_replace_all(.x, "/dp/.*", "")))
  348. df22 <- cbind(df221, df222, df223, df224)
  349. names(df22) <- c("product1", "product2", "product3", "product4")
  350.  
  351. df14 <- as.data.frame(stringr::str_split_fixed(data$customers_who_bought_this_item_also_bought, " | ", 7))
  352. df14 <- df14[, c(1, 3, 5, 7)]
  353. df131<- as.data.frame(map_chr(df14$V1,~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
  354. df132<- as.data.frame(map_chr(df14$V3, ~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
  355. df133 <- as.data.frame(map_chr(df14$V5, ~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
  356. df134 <- as.data.frame(map_chr(df14$V7, ~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
  357. df13<- cbind(df131, df132, df133, df134)
  358. names(df13) <- c("v1", "v2", "v3", "v4")
  359. df121 <- as.data.frame(map_chr(df23$v1, ~stringr::str_replace_all(.x, "/dp/.*", "")))
  360. df122 <- as.data.frame(map_chr(df23$v2, ~stringr::str_replace_all(.x, "/dp/.*", "")))
  361. df123 <- as.data.frame(map_chr(df23$v3, ~stringr::str_replace_all(.x, "/dp/.*", "")))
  362. df124 <- as.data.frame(map_chr(df23$v4, ~stringr::str_replace_all(.x, "/dp/.*", "")))
  363. df12 <- cbind(df221, df222, df223, df224)
  364. names(df12) <- c("Aproduct1", "Aproduct2", "Aproduct3", "Aproduct4")
  365. df12[] <- lapply(df12, gsub, pattern='-', replacement=' ')
  366. df22[] <- lapply(df22, gsub, pattern="-", replacement=' ')
  367.  
  368. ```
  369.  
  370. ##############################################################################################################
  371. Association Rule - Market Basket Analysis
  372. ##############################################################################################################
  373.  
  374.  
  375. ```{r echo=FALSE, results=FALSE}
  376. transaction12 <- complete_data[,c("uniq_id", "product_name")]
  377. transact_view <- cbind(transaction12, df12)
  378. transact_buy <- cbind(transaction12, df22)
  379.  
  380. transact_view <- gather(transact_view, product_class, product_name, product_name:Aproduct4)
  381. transact_buy <- gather(transact_buy, product_class, product_name, product_name:product4)
  382.  
  383.  
  384. write.csv(transact_view, file = "transact_view.csv")
  385. write.csv(transact_buy, file = "transact_buy.csv")
  386. data_buy = read.transactions("transact_buy.csv", format = "single", sep = "," , cols = c("uniq_id", "product_name"), header = TRUE)
  387.  
  388. rules_buy <- apriori(data_buy, parameter = list(supp = 0.001, conf = 0.51, minlen = 2))
  389. rules_buy <- sort(rules_buy, by="lift", decreasing=TRUE)
  390. options(digit=3)
  391.  
  392. inspect(rules_buy[1:5])
  393. rules_buy = data.frame(
  394.          product1 = labels(lhs(rules_buy)),
  395.          product2 = labels(rhs(rules_buy)),
  396.          rules_buy@quality)
  397.  
  398. data_view = read.transactions("transact_view.csv", format = "single", sep = "," , cols = c("uniq_id", "product_name"), header = TRUE)
  399.  
  400. rules_view <- apriori(data_view, parameter = list(supp = 0.001, conf = 0.51, minlen = 2))
  401. rules_view <- sort(rules_view, by="lift", decreasing=TRUE)
  402. options(digit=3)
  403.  
  404. inspect(rules_view[1:5])
  405. rules_view = data.frame(
  406.          bought_item = labels(lhs(rules_view)),
  407.          viewed_item_before_buying = labels(rhs(rules_view)),
  408.          rules_view@quality)
  409.  
  410. ```
  411. #
  412.  
  413.  
  414.  
  415. #### Inspecting the frequently bought together products - Association Rules
  416. ##############################################################################################################
  417.  
  418. ```{r message=FALSE}
  419.  
  420. head(rules_buy)
  421.  
  422. #kable(head(rules_buy)) %>%
  423. # kable_styling(bootstrap_options = c("striped", "condensed"), font_size = 12)
  424.  
  425. ```
  426. ##                                                                                 product1
  427. ## 5                                                   {Hornby R8222 Gauge Track Extension}
  428. ## 4                                                   {Hornby R8223 Gauge Track Extension}
  429. ## 2                                                           {Thomas Friends Trackmaster}
  430. ## 1 {Thomas Friends Trackmaster | Thomas Friends Trackmaster | Thomas Friends Trackmaster}
  431. ## 3                                                   {Pack Girls Glitter Tattoo Stencils}
  432. ## 7                                           {Monster Manual Dungeons Rulebook Rulebooks}
  433. ##                                                                                 product2
  434. ## 5                                                   {Hornby R8223 Gauge Track Extension}
  435. ## 4                                                   {Hornby R8222 Gauge Track Extension}
  436. ## 2 {Thomas Friends Trackmaster | Thomas Friends Trackmaster | Thomas Friends Trackmaster}
  437. ## 1                                                           {Thomas Friends Trackmaster}
  438. ## 3                                                       {Girls 10ml Glitter Tattoo Glue}
  439. ## 7                                           {Dungeon Masters Dungeons Dragons Rulebooks}
  440. ##       support confidence     lift count
  441. ## 5 0.001100881  0.6875000 572.4583    11
  442. ## 4 0.001100881  0.9166667 572.4583    11
  443. ## 2 0.001501201  0.6521739 434.4348    15
  444. ## 1 0.001501201  1.0000000 434.4348    15
  445. ## 3 0.001000801  0.8333333 308.3951    10
  446. ## 7 0.001401121  0.7368421 272.6862    14
  447. #
  448.  
  449.  
  450. #### Inspecting the pair of frequently bought product X-  after viewing product Y.
  451. ##############################################################################################################
  452.  
  453. ```{r message=FALSE}
  454. head(rules_view)
  455. #kable(head(rules_view)) %>%
  456. # kable_styling(bootstrap_options = c("striped", "condensed"), font_size = 12)
  457. ```
  458.  
  459. ##                                                                              bought_item
  460. ## 5                                                   {Hornby R8222 Gauge Track Extension}
  461. ## 4                                                   {Hornby R8223 Gauge Track Extension}
  462. ## 2                                                           {Thomas Friends Trackmaster}
  463. ## 1 {Thomas Friends Trackmaster | Thomas Friends Trackmaster | Thomas Friends Trackmaster}
  464. ## 3                                                   {Pack Girls Glitter Tattoo Stencils}
  465. ## 7                                           {Monster Manual Dungeons Rulebook Rulebooks}
  466. ##                                                                viewed_item_before_buying
  467. ## 5                                                   {Hornby R8223 Gauge Track Extension}
  468. ## 4                                                   {Hornby R8222 Gauge Track Extension}
  469. ## 2 {Thomas Friends Trackmaster | Thomas Friends Trackmaster | Thomas Friends Trackmaster}
  470. ## 1                                                           {Thomas Friends Trackmaster}
  471. ## 3                                                       {Girls 10ml Glitter Tattoo Glue}
  472. ## 7                                           {Dungeon Masters Dungeons Dragons Rulebooks}
  473. ##       support confidence     lift count
  474. ## 5 0.001100881  0.6875000 572.4583    11
  475. ## 4 0.001100881  0.9166667 572.4583    11
  476. ## 2 0.001501201  0.6521739 434.4348    15
  477. ## 1 0.001501201  1.0000000 434.4348    15
  478. ## 3 0.001000801  0.8333333 308.3951    10
  479. ## 7 0.001401121  0.7368421 272.6862    14
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top