Advertisement
Laay_Trivedi

proton.ai -task_3

Sep 8th, 2019
190
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 20.26 KB | None | 0 0
  1. ---
  2. title: "Report-Amazon Sales data"
  3. output:
  4.   html_document: default
  5.   pdf_document: default
  6. ---
  7.  
  8. ```{r setup, include=FALSE}
  9. knitr::opts_chunk$set(echo = TRUE)
  10. ```
  11. ##############################################################################################################
  12.  
  13. ###### prton.ai - task
  14. #### Preparations
  15.  
  16. ##############################################################################################################
  17.  
  18. ```{r loading_packages, message = FALSE}
  19. options( warn = -1 )
  20. #install.packages("rmarkdown")
  21. #install.packages("dplyr")
  22. #install.packages("ggplot2")
  23. #install.packages("tidyr")
  24. #install.packages("purrr")
  25. #install.packages("kableExtra")
  26. #install.packages("VIM")
  27. #install.packages("knitr")
  28. #install.packages("pheatmap")
  29. #install.packages("readr")
  30. #install.packages("arules")
  31. #install.packages("stringr")
  32. library(rmakrdown)
  33. library(dplyr)
  34. library(ggplot2)
  35. library(tidyr)
  36. library(purrr)
  37. library(kableExtra)
  38. library(VIM)
  39. library(knitr)
  40. library(pheatmap)
  41. library(readr)
  42. library(arules)
  43. library(stringr)
  44.  
  45.  
  46. ########## ___________________________________________________________________________________________________________
  47. data <- read_csv("C:/Users/laayt/Downloads/amazon_co-ecommerce_sample.csv")
  48.  
  49. ```
  50.  
  51. #### Data Cleaning-Make variables more readables by removing unnecessary characters from it.
  52.  
  53.  
  54. ```{r}
  55. data2 <- data[,c("price", "average_review_rating", "number_available_in_stock")]
  56.  
  57. data$price <- as.numeric(gsub("[^0-9.-]", "", data$price))
  58. data$average_review_rating <- as.numeric(gsub('[out of 5 stars]', '', data$average_review_rating))
  59. data$number_available_in_stock <- as.numeric(gsub("[^0-9.-]", "", data$number_available_in_stock))
  60.  
  61. data1 <- data[,c("price", "average_review_rating", "number_available_in_stock")]
  62. ```
  63. ##### Transforming Unstructured Numerical Columns To
  64. ##############################################################################################################
  65. ```{r}
  66.    head(data2,7)
  67. ```
  68.  
  69. ## # A tibble: 7 x 3
  70. ##   price  average_review_rating number_available_in_stock
  71. ##   <chr>  <chr>                 <chr>                    
  72. ## 1 £3.42  4.9 out of 5 stars    5 new                    
  73. ## 2 £16.99 4.5 out of 5 stars    <NA>                    
  74. ## 3 £9.99  3.9 out of 5 stars    2 new                    
  75. ## 4 £39.99 5.0 out of 5 stars    <NA>                    
  76. ## 5 £32.19 4.7 out of 5 stars    <NA>                    
  77. ## 6 £6.99  5.0 out of 5 stars    <NA>                    
  78. ## 7 £24.99 4.5 out of 5 stars    <NA>
  79.  
  80. ##### Structured Numerical Columns
  81. ##############################################################################################################
  82.  
  83. ```{r}
  84. head(data1,7)
  85. ```
  86. ## # A tibble: 7 x 3
  87. ##   price average_review_rating number_available_in_stock
  88. ##   <dbl>                 <dbl>                     <dbl>
  89. ## 1  3.42                   4.9                         5
  90. ## 2 17.0                    4                          NA
  91. ## 3  9.99                   3.9                         2
  92. ## 4 40.0                    0                          NA
  93. ## 5 32.2                    4.7                        NA
  94. ## 6  6.99                   0                          NA
  95. ## 7 25.0                    4                          NA
  96.  
  97.  
  98.  
  99. #### Percentage missing data-Column wise
  100. ```{r}
  101.  
  102. getMissingData <- function(df, row_display = 6){
  103.   df1 <- data.frame(df_cols = names(df))
  104.   df1$miss_num <- map_dbl(df, ~sum(is.na(.x)))
  105.   df1$miss_percentage <- map_dbl(df, ~sum((is.na(.x)/nrow(df))*100))
  106.   df1 <<- df1
  107.   return(head(df1, n = row_display))
  108. }
  109. getMissingData(data)
  110. #kable(head(df1,6)) %>%
  111.  # kable_styling(bootstrap_options = c("striped", "condensed"), font_size = 12)
  112. ```
  113. ##                     df_cols miss_num miss_percentage
  114. ## 1                   uniq_id        0        0.000000
  115. ## 2              product_name        0        0.000000
  116. ## 3              manufacturer        6        0.059988
  117. ## 4                     price     1457       14.567087
  118. ## 5 number_available_in_stock     2502       25.014997
  119. ## 6         number_of_reviews       22        0.219956
  120.  
  121. #### Percentage missing data-Raw wise
  122. ##############################################################################################################
  123.  
  124.  
  125. ```{r}
  126. getMissingRows <- function(df, row_display = 6){
  127.   df1 <- data.frame(df_rows = 1:nrow(df))
  128.   df1$miss_num <- rowSums(is.na(df))
  129.   df1$miss_percentage <- rowSums((is.na(df)/ncol(df))*100)
  130.   df1 <- df1 %>% arrange(desc(miss_percentage))
  131.   df3 <<- df1
  132.   return(head(df1, n = row_display))
  133. }
  134. getMissingRows(data)
  135. #kable(head(df3, 5)) %>%
  136.   #kable_styling(bootstrap_options = c("striped", "condensed"), font_size = 12)
  137.  
  138. ```
  139. ##   df_rows miss_num miss_percentage
  140. ## 1    2276      892        99.33185
  141. ## 2     644      889        98.99777
  142. ## 3     795      889        98.99777
  143. ## 4    1013      889        98.99777
  144. ## 5    1419      889        98.99777
  145. ## 6    1809      889        98.99777
  146.  
  147. #### Removing completely empty rows and columns
  148. ##############################################################################################################
  149.  
  150. ```{r}
  151. dim(data) #inspecting current dimension of the dataset
  152. remove_empty_rows <- function(df){
  153.   no_empty_rows <<- df[rowSums(is.na(df)) != ncol(df),]
  154.  
  155. }
  156. remove_empty_rows(data)
  157.  
  158. complete_data <- function(df){
  159.   removeCols <- colSums(is.na(df)) == nrow(df)
  160.   complete_data <<- df[!removeCols]
  161. }
  162. complete_data(no_empty_rows)
  163.  
  164. dim(complete_data) #removing empty columns
  165.  
  166. ```
  167.  
  168. #### Data-Type-Definition
  169. ##############################################################################################################
  170.  
  171.  
  172. ```{r}
  173. Categorical_data<-function(df, numlevels = 5){
  174.   temp_use <- data.frame(levels = apply(df, 2, function(x) length(table(x))))
  175.   temp_use$var_names <- rownames(temp_use)
  176.   type_class <- sapply(colnames(df), function(x) class(df[[x]]))
  177.   temp_use$type_class <- as.data.frame(type_class)
  178.   Cat_data <<- temp_use %>%
  179.     filter(levels <= numlevels |  type_class != "numeric" )
  180. }
  181.  
  182. Continious_data <-function(df, numlevels = 5){
  183.   temp_use <- data.frame(levels = apply(df, 2, function(x) length(table(x))))
  184.   temp_use$var_names <- rownames(temp_use)
  185.   type_class <- sapply(colnames(df), function(x) class(df[[x]]))
  186.   temp_use$type_class <- as.data.frame(type_class)
  187.   Cont_data <<- temp_use %>%
  188.     filter(levels > numlevels & type_class == "numeric" )
  189. }
  190.  
  191. zero_variance_vars <- function(df){
  192.   temp_use <- data.frame(levels = apply(df, 2, function(x) length(table(x))))
  193.   temp_use$var_names <- rownames(temp_use)
  194.   type_class <- sapply(colnames(df), function(x) class(df[[x]]))
  195.   temp_use$type_class <- as.data.frame(type_class)
  196.   static_data <<- temp_use %>%
  197.     filter(temp_use$levels == 1 | temp_use$levels == nrow(df))
  198.   if(nrow(static_data) == 0){
  199.     print("There are no variables in this dataframe having zero variance")
  200.   }
  201. }
  202. ```
  203. #### List of Continuous variables
  204. ##############################################################################################################
  205. ```{r}
  206. Continious_data(complete_data)
  207. Cont_data
  208. #kable(Cont_data) %>%
  209.  # kable_styling(bootstrap_options = c("striped", "condensed"), font_size = 12)
  210. ```
  211. ##   levels                    var_names type_class
  212. ## 1   2610                        price    numeric
  213. ## 2     74    number_available_in_stock    numeric
  214. ## 3    192            number_of_reviews    numeric
  215. ## 4     19 number_of_answered_questions    numeric
  216. ## 5     18        average_review_rating    numeric
  217.  
  218.  
  219. #### List of Categorical variables
  220. ##############################################################################################################
  221.  
  222. ```{r}
  223. Categorical_data(complete_data, 5)
  224. Cat_data
  225. #kable(Cat_data) %>%
  226.   #kable_styling(bootstrap_options = c("striped", "condensed"), font_size = 12)
  227. ```
  228.  
  229. ##    levels                                   var_names type_class
  230. ## 1    9992                                     uniq_id  character
  231. ## 2    9965                                product_name  character
  232. ## 3    2654                                manufacturer  character
  233. ## 4     257            amazon_category_and_sub_category  character
  234. ## 5    8756  customers_who_bought_this_item_also_bought  character
  235. ## 6    8515                                 description  character
  236. ## 7    9939                         product_information  character
  237. ## 8    8514                         product_description  character
  238. ## 9    6749 items_customers_buy_after_viewing_this_item  character
  239. ## 10    910              customer_questions_and_answers  character
  240. ## 11   9901                            customer_reviews  character
  241.  
  242.  
  243. #### List of Zero-variant variables
  244. ##############################################################################################################
  245.  
  246. ```{r}
  247. zero_variance_vars(complete_data)
  248. ```
  249. ## [1] "There are no variables in this data frame having zero variance"
  250.  
  251.  
  252. #### Missing Value-Imputation : List of imputed missing value
  253. ##############################################################################################################
  254.  
  255. ```{r}
  256. data_cont <- complete_data[, which(colnames(complete_data) %in% Cont_data$var_names)]
  257. imputed_df = as.data.frame(VIM::kNN(data_cont,c(Cont_data$var_names), k = 3))
  258. imputed_df <- imputed_df[,c(1:5)]
  259. head(imputed_df)
  260. #kable(head(imputed_df)) %>%
  261.  # kable_styling(bootstrap_options = c("striped", "condensed"), font_size = 12)
  262. ```
  263. ##   price number_available_in_stock number_of_reviews
  264. ## 1  3.42                         5                15
  265. ## 2 16.99                        17                 2
  266. ## 3  9.99                         2                17
  267. ## 4 39.99                         3                 1
  268. ## 5 32.19                         7                 3
  269. ## 6  6.99                         5                 2
  270. ##   number_of_answered_questions average_review_rating
  271. ## 1                            1                   4.9
  272. ## 2                            1                   4.0
  273. ## 3                            2                   3.9
  274. ## 4                            2                   0.0
  275. ## 5                            2                   4.7
  276. ## 6                            1                   0.0
  277.  
  278.  
  279.  
  280. #### CORRELATION -  
  281. ##############################################################################################################
  282.  
  283. ```{r}
  284. data_cont <- imputed_df[, which(colnames(imputed_df) %in% Cont_data$var_names)]
  285. data_cat  <- na.omit(complete_data[, which(colnames(complete_data) %in% Cat_data$var_names)])
  286.  
  287. cortable <- function(df, cutoff=0){
  288.   library(dplyr)
  289.   library(pheatmap)
  290.   a <- abs(cor(df))
  291.   corr <- data.table(variable1=rownames(a)[row(a)], variable2=colnames(a)[col(a)], corr=c(a))
  292.   corr <- corr %>%
  293.     filter(!(variable1 == variable2)) %>%
  294.     filter(corr > cutoff) %>%
  295.     arrange(desc(corr))
  296.   corr <- corr[!duplicated(t(apply(corr,1,sort))),]
  297.   return(corr)
  298. }
  299. ```
  300.  
  301. #### Highly correlated variables in descending order
  302. ##############################################################################################################
  303.  
  304. ```{r}
  305. cortable(data_cont)
  306. ```
  307. ##                       variable1                    variable2        corr
  308. ## 1         average_review_rating            number_of_reviews 0.269580511
  309. ## 3             number_of_reviews    number_available_in_stock 0.221341663
  310. ## 5         average_review_rating    number_available_in_stock 0.149496878
  311. ## 7  number_of_answered_questions            number_of_reviews 0.111823030
  312. ## 9         average_review_rating                        price 0.065134089
  313. ## 11        average_review_rating number_of_answered_questions 0.028783844
  314. ## 13            number_of_reviews                        price 0.027596509
  315. ## 15 number_of_answered_questions    number_available_in_stock 0.019411542
  316. ## 17    number_available_in_stock                        price 0.010957996
  317. ## 19 number_of_answered_questions                        price 0.005472014
  318.  
  319.  
  320.  
  321.  
  322. #### Correlation - Heat Map
  323. ##############################################################################################################
  324.  
  325. ```{r}
  326. pheatmap::pheatmap(cor(data_cont))
  327. ```
  328.  
  329.  
  330. ##############################################################################################################
  331.  
  332. Cleaning -URLs Data for Further Analysis'. Removing "http://www.amazon.co.uk/" and "/dp/", "-" and "|" from the URLs and Separating every transactional product to new column.
  333. ##############################################################################################################
  334.  
  335. ```{r echo = FALSE}
  336. df24 <- as.data.frame(stringr::str_split_fixed(data$customers_who_bought_this_item_also_bought, " | ", 7))
  337. df24 <- df24[, c(1, 3, 5, 7)]
  338. df231<- as.data.frame(map_chr(df24$V1,~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
  339. df232<- as.data.frame(map_chr(df24$V3, ~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
  340. df233 <- as.data.frame(map_chr(df24$V5, ~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
  341. df234 <- as.data.frame(map_chr(df24$V7, ~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
  342. df23<- cbind(df231, df232, df233, df234)
  343. names(df23) <- c("v1", "v2", "v3", "v4")
  344. df221 <- as.data.frame(map_chr(df23$v1, ~stringr::str_replace_all(.x, "/dp/.*", "")))
  345. df222 <- as.data.frame(map_chr(df23$v2, ~stringr::str_replace_all(.x, "/dp/.*", "")))
  346. df223 <- as.data.frame(map_chr(df23$v3, ~stringr::str_replace_all(.x, "/dp/.*", "")))
  347. df224 <- as.data.frame(map_chr(df23$v4, ~stringr::str_replace_all(.x, "/dp/.*", "")))
  348. df22 <- cbind(df221, df222, df223, df224)
  349. names(df22) <- c("product1", "product2", "product3", "product4")
  350.  
  351. df14 <- as.data.frame(stringr::str_split_fixed(data$customers_who_bought_this_item_also_bought, " | ", 7))
  352. df14 <- df14[, c(1, 3, 5, 7)]
  353. df131<- as.data.frame(map_chr(df14$V1,~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
  354. df132<- as.data.frame(map_chr(df14$V3, ~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
  355. df133 <- as.data.frame(map_chr(df14$V5, ~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
  356. df134 <- as.data.frame(map_chr(df14$V7, ~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
  357. df13<- cbind(df131, df132, df133, df134)
  358. names(df13) <- c("v1", "v2", "v3", "v4")
  359. df121 <- as.data.frame(map_chr(df23$v1, ~stringr::str_replace_all(.x, "/dp/.*", "")))
  360. df122 <- as.data.frame(map_chr(df23$v2, ~stringr::str_replace_all(.x, "/dp/.*", "")))
  361. df123 <- as.data.frame(map_chr(df23$v3, ~stringr::str_replace_all(.x, "/dp/.*", "")))
  362. df124 <- as.data.frame(map_chr(df23$v4, ~stringr::str_replace_all(.x, "/dp/.*", "")))
  363. df12 <- cbind(df221, df222, df223, df224)
  364. names(df12) <- c("Aproduct1", "Aproduct2", "Aproduct3", "Aproduct4")
  365. df12[] <- lapply(df12, gsub, pattern='-', replacement=' ')
  366. df22[] <- lapply(df22, gsub, pattern="-", replacement=' ')
  367.  
  368. ```
  369.  
  370. ##############################################################################################################
  371. Association Rule - Market Basket Analysis
  372. ##############################################################################################################
  373.  
  374.  
  375. ```{r echo=FALSE, results=FALSE}
  376. transaction12 <- complete_data[,c("uniq_id", "product_name")]
  377. transact_view <- cbind(transaction12, df12)
  378. transact_buy <- cbind(transaction12, df22)
  379.  
  380. transact_view <- gather(transact_view, product_class, product_name, product_name:Aproduct4)
  381. transact_buy <- gather(transact_buy, product_class, product_name, product_name:product4)
  382.  
  383.  
  384. write.csv(transact_view, file = "transact_view.csv")
  385. write.csv(transact_buy, file = "transact_buy.csv")
  386. data_buy = read.transactions("transact_buy.csv", format = "single", sep = "," , cols = c("uniq_id", "product_name"), header = TRUE)
  387.  
  388. rules_buy <- apriori(data_buy, parameter = list(supp = 0.001, conf = 0.51, minlen = 2))
  389. rules_buy <- sort(rules_buy, by="lift", decreasing=TRUE)
  390. options(digit=3)
  391.  
  392. inspect(rules_buy[1:5])
  393. rules_buy = data.frame(
  394.          product1 = labels(lhs(rules_buy)),
  395.          product2 = labels(rhs(rules_buy)),
  396.          rules_buy@quality)
  397.  
  398. data_view = read.transactions("transact_view.csv", format = "single", sep = "," , cols = c("uniq_id", "product_name"), header = TRUE)
  399.  
  400. rules_view <- apriori(data_view, parameter = list(supp = 0.001, conf = 0.51, minlen = 2))
  401. rules_view <- sort(rules_view, by="lift", decreasing=TRUE)
  402. options(digit=3)
  403.  
  404. inspect(rules_view[1:5])
  405. rules_view = data.frame(
  406.          bought_item = labels(lhs(rules_view)),
  407.          viewed_item_before_buying = labels(rhs(rules_view)),
  408.          rules_view@quality)
  409.  
  410. ```
  411. #
  412.  
  413.  
  414.  
  415. #### Inspecting the frequently bought together products - Association Rules
  416. ##############################################################################################################
  417.  
  418. ```{r message=FALSE}
  419.  
  420. head(rules_buy)
  421.  
  422. #kable(head(rules_buy)) %>%
  423. # kable_styling(bootstrap_options = c("striped", "condensed"), font_size = 12)
  424.  
  425. ```
  426. ##                                                                                 product1
  427. ## 5                                                   {Hornby R8222 Gauge Track Extension}
  428. ## 4                                                   {Hornby R8223 Gauge Track Extension}
  429. ## 2                                                           {Thomas Friends Trackmaster}
  430. ## 1 {Thomas Friends Trackmaster | Thomas Friends Trackmaster | Thomas Friends Trackmaster}
  431. ## 3                                                   {Pack Girls Glitter Tattoo Stencils}
  432. ## 7                                           {Monster Manual Dungeons Rulebook Rulebooks}
  433. ##                                                                                 product2
  434. ## 5                                                   {Hornby R8223 Gauge Track Extension}
  435. ## 4                                                   {Hornby R8222 Gauge Track Extension}
  436. ## 2 {Thomas Friends Trackmaster | Thomas Friends Trackmaster | Thomas Friends Trackmaster}
  437. ## 1                                                           {Thomas Friends Trackmaster}
  438. ## 3                                                       {Girls 10ml Glitter Tattoo Glue}
  439. ## 7                                           {Dungeon Masters Dungeons Dragons Rulebooks}
  440. ##       support confidence     lift count
  441. ## 5 0.001100881  0.6875000 572.4583    11
  442. ## 4 0.001100881  0.9166667 572.4583    11
  443. ## 2 0.001501201  0.6521739 434.4348    15
  444. ## 1 0.001501201  1.0000000 434.4348    15
  445. ## 3 0.001000801  0.8333333 308.3951    10
  446. ## 7 0.001401121  0.7368421 272.6862    14
  447. #
  448.  
  449.  
  450. #### Inspecting the pair of frequently bought product X-  after viewing product Y.
  451. ##############################################################################################################
  452.  
  453. ```{r message=FALSE}
  454. head(rules_view)
  455. #kable(head(rules_view)) %>%
  456. # kable_styling(bootstrap_options = c("striped", "condensed"), font_size = 12)
  457. ```
  458.  
  459. ##                                                                              bought_item
  460. ## 5                                                   {Hornby R8222 Gauge Track Extension}
  461. ## 4                                                   {Hornby R8223 Gauge Track Extension}
  462. ## 2                                                           {Thomas Friends Trackmaster}
  463. ## 1 {Thomas Friends Trackmaster | Thomas Friends Trackmaster | Thomas Friends Trackmaster}
  464. ## 3                                                   {Pack Girls Glitter Tattoo Stencils}
  465. ## 7                                           {Monster Manual Dungeons Rulebook Rulebooks}
  466. ##                                                                viewed_item_before_buying
  467. ## 5                                                   {Hornby R8223 Gauge Track Extension}
  468. ## 4                                                   {Hornby R8222 Gauge Track Extension}
  469. ## 2 {Thomas Friends Trackmaster | Thomas Friends Trackmaster | Thomas Friends Trackmaster}
  470. ## 1                                                           {Thomas Friends Trackmaster}
  471. ## 3                                                       {Girls 10ml Glitter Tattoo Glue}
  472. ## 7                                           {Dungeon Masters Dungeons Dragons Rulebooks}
  473. ##       support confidence     lift count
  474. ## 5 0.001100881  0.6875000 572.4583    11
  475. ## 4 0.001100881  0.9166667 572.4583    11
  476. ## 2 0.001501201  0.6521739 434.4348    15
  477. ## 1 0.001501201  1.0000000 434.4348    15
  478. ## 3 0.001000801  0.8333333 308.3951    10
  479. ## 7 0.001401121  0.7368421 272.6862    14
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement