Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ---
- title: "Report-Amazon Sales data"
- output:
- html_document: default
- pdf_document: default
- ---
- ```{r setup, include=FALSE}
- knitr::opts_chunk$set(echo = TRUE)
- ```
- ##############################################################################################################
- ###### prton.ai - task
- #### Preparations
- ##############################################################################################################
- ```{r loading_packages, message = FALSE}
- options( warn = -1 )
- #install.packages("rmarkdown")
- #install.packages("dplyr")
- #install.packages("ggplot2")
- #install.packages("tidyr")
- #install.packages("purrr")
- #install.packages("kableExtra")
- #install.packages("VIM")
- #install.packages("knitr")
- #install.packages("pheatmap")
- #install.packages("readr")
- #install.packages("arules")
- #install.packages("stringr")
- library(rmakrdown)
- library(dplyr)
- library(ggplot2)
- library(tidyr)
- library(purrr)
- library(kableExtra)
- library(VIM)
- library(knitr)
- library(pheatmap)
- library(readr)
- library(arules)
- library(stringr)
- ########## ___________________________________________________________________________________________________________
- data <- read_csv("C:/Users/laayt/Downloads/amazon_co-ecommerce_sample.csv")
- ```
- #### Data Cleaning-Make variables more readables by removing unnecessary characters from it.
- ```{r}
- data2 <- data[,c("price", "average_review_rating", "number_available_in_stock")]
- data$price <- as.numeric(gsub("[^0-9.-]", "", data$price))
- data$average_review_rating <- as.numeric(gsub('[out of 5 stars]', '', data$average_review_rating))
- data$number_available_in_stock <- as.numeric(gsub("[^0-9.-]", "", data$number_available_in_stock))
- data1 <- data[,c("price", "average_review_rating", "number_available_in_stock")]
- ```
- ##### Transforming Unstructured Numerical Columns To
- ##############################################################################################################
- ```{r}
- head(data2,7)
- ```
- ## # A tibble: 7 x 3
- ## price average_review_rating number_available_in_stock
- ## <chr> <chr> <chr>
- ## 1 £3.42 4.9 out of 5 stars 5 new
- ## 2 £16.99 4.5 out of 5 stars <NA>
- ## 3 £9.99 3.9 out of 5 stars 2 new
- ## 4 £39.99 5.0 out of 5 stars <NA>
- ## 5 £32.19 4.7 out of 5 stars <NA>
- ## 6 £6.99 5.0 out of 5 stars <NA>
- ## 7 £24.99 4.5 out of 5 stars <NA>
- ##### Structured Numerical Columns
- ##############################################################################################################
- ```{r}
- head(data1,7)
- ```
- ## # A tibble: 7 x 3
- ## price average_review_rating number_available_in_stock
- ## <dbl> <dbl> <dbl>
- ## 1 3.42 4.9 5
- ## 2 17.0 4 NA
- ## 3 9.99 3.9 2
- ## 4 40.0 0 NA
- ## 5 32.2 4.7 NA
- ## 6 6.99 0 NA
- ## 7 25.0 4 NA
- #### Percentage missing data-Column wise
- ```{r}
- getMissingData <- function(df, row_display = 6){
- df1 <- data.frame(df_cols = names(df))
- df1$miss_num <- map_dbl(df, ~sum(is.na(.x)))
- df1$miss_percentage <- map_dbl(df, ~sum((is.na(.x)/nrow(df))*100))
- df1 <<- df1
- return(head(df1, n = row_display))
- }
- getMissingData(data)
- #kable(head(df1,6)) %>%
- # kable_styling(bootstrap_options = c("striped", "condensed"), font_size = 12)
- ```
- ## df_cols miss_num miss_percentage
- ## 1 uniq_id 0 0.000000
- ## 2 product_name 0 0.000000
- ## 3 manufacturer 6 0.059988
- ## 4 price 1457 14.567087
- ## 5 number_available_in_stock 2502 25.014997
- ## 6 number_of_reviews 22 0.219956
- #### Percentage missing data-Raw wise
- ##############################################################################################################
- ```{r}
- getMissingRows <- function(df, row_display = 6){
- df1 <- data.frame(df_rows = 1:nrow(df))
- df1$miss_num <- rowSums(is.na(df))
- df1$miss_percentage <- rowSums((is.na(df)/ncol(df))*100)
- df1 <- df1 %>% arrange(desc(miss_percentage))
- df3 <<- df1
- return(head(df1, n = row_display))
- }
- getMissingRows(data)
- #kable(head(df3, 5)) %>%
- #kable_styling(bootstrap_options = c("striped", "condensed"), font_size = 12)
- ```
- ## df_rows miss_num miss_percentage
- ## 1 2276 892 99.33185
- ## 2 644 889 98.99777
- ## 3 795 889 98.99777
- ## 4 1013 889 98.99777
- ## 5 1419 889 98.99777
- ## 6 1809 889 98.99777
- #### Removing completely empty rows and columns
- ##############################################################################################################
- ```{r}
- dim(data) #inspecting current dimension of the dataset
- remove_empty_rows <- function(df){
- no_empty_rows <<- df[rowSums(is.na(df)) != ncol(df),]
- }
- remove_empty_rows(data)
- complete_data <- function(df){
- removeCols <- colSums(is.na(df)) == nrow(df)
- complete_data <<- df[!removeCols]
- }
- complete_data(no_empty_rows)
- dim(complete_data) #removing empty columns
- ```
- #### Data-Type-Definition
- ##############################################################################################################
- ```{r}
- Categorical_data<-function(df, numlevels = 5){
- temp_use <- data.frame(levels = apply(df, 2, function(x) length(table(x))))
- temp_use$var_names <- rownames(temp_use)
- type_class <- sapply(colnames(df), function(x) class(df[[x]]))
- temp_use$type_class <- as.data.frame(type_class)
- Cat_data <<- temp_use %>%
- filter(levels <= numlevels | type_class != "numeric" )
- }
- Continious_data <-function(df, numlevels = 5){
- temp_use <- data.frame(levels = apply(df, 2, function(x) length(table(x))))
- temp_use$var_names <- rownames(temp_use)
- type_class <- sapply(colnames(df), function(x) class(df[[x]]))
- temp_use$type_class <- as.data.frame(type_class)
- Cont_data <<- temp_use %>%
- filter(levels > numlevels & type_class == "numeric" )
- }
- zero_variance_vars <- function(df){
- temp_use <- data.frame(levels = apply(df, 2, function(x) length(table(x))))
- temp_use$var_names <- rownames(temp_use)
- type_class <- sapply(colnames(df), function(x) class(df[[x]]))
- temp_use$type_class <- as.data.frame(type_class)
- static_data <<- temp_use %>%
- filter(temp_use$levels == 1 | temp_use$levels == nrow(df))
- if(nrow(static_data) == 0){
- print("There are no variables in this dataframe having zero variance")
- }
- }
- ```
- #### List of Continuous variables
- ##############################################################################################################
- ```{r}
- Continious_data(complete_data)
- Cont_data
- #kable(Cont_data) %>%
- # kable_styling(bootstrap_options = c("striped", "condensed"), font_size = 12)
- ```
- ## levels var_names type_class
- ## 1 2610 price numeric
- ## 2 74 number_available_in_stock numeric
- ## 3 192 number_of_reviews numeric
- ## 4 19 number_of_answered_questions numeric
- ## 5 18 average_review_rating numeric
- #### List of Categorical variables
- ##############################################################################################################
- ```{r}
- Categorical_data(complete_data, 5)
- Cat_data
- #kable(Cat_data) %>%
- #kable_styling(bootstrap_options = c("striped", "condensed"), font_size = 12)
- ```
- ## levels var_names type_class
- ## 1 9992 uniq_id character
- ## 2 9965 product_name character
- ## 3 2654 manufacturer character
- ## 4 257 amazon_category_and_sub_category character
- ## 5 8756 customers_who_bought_this_item_also_bought character
- ## 6 8515 description character
- ## 7 9939 product_information character
- ## 8 8514 product_description character
- ## 9 6749 items_customers_buy_after_viewing_this_item character
- ## 10 910 customer_questions_and_answers character
- ## 11 9901 customer_reviews character
- #### List of Zero-variant variables
- ##############################################################################################################
- ```{r}
- zero_variance_vars(complete_data)
- ```
- ## [1] "There are no variables in this data frame having zero variance"
- #### Missing Value-Imputation : List of imputed missing value
- ##############################################################################################################
- ```{r}
- data_cont <- complete_data[, which(colnames(complete_data) %in% Cont_data$var_names)]
- imputed_df = as.data.frame(VIM::kNN(data_cont,c(Cont_data$var_names), k = 3))
- imputed_df <- imputed_df[,c(1:5)]
- head(imputed_df)
- #kable(head(imputed_df)) %>%
- # kable_styling(bootstrap_options = c("striped", "condensed"), font_size = 12)
- ```
- ## price number_available_in_stock number_of_reviews
- ## 1 3.42 5 15
- ## 2 16.99 17 2
- ## 3 9.99 2 17
- ## 4 39.99 3 1
- ## 5 32.19 7 3
- ## 6 6.99 5 2
- ## number_of_answered_questions average_review_rating
- ## 1 1 4.9
- ## 2 1 4.0
- ## 3 2 3.9
- ## 4 2 0.0
- ## 5 2 4.7
- ## 6 1 0.0
- #### CORRELATION -
- ##############################################################################################################
- ```{r}
- data_cont <- imputed_df[, which(colnames(imputed_df) %in% Cont_data$var_names)]
- data_cat <- na.omit(complete_data[, which(colnames(complete_data) %in% Cat_data$var_names)])
- cortable <- function(df, cutoff=0){
- library(dplyr)
- library(pheatmap)
- a <- abs(cor(df))
- corr <- data.table(variable1=rownames(a)[row(a)], variable2=colnames(a)[col(a)], corr=c(a))
- corr <- corr %>%
- filter(!(variable1 == variable2)) %>%
- filter(corr > cutoff) %>%
- arrange(desc(corr))
- corr <- corr[!duplicated(t(apply(corr,1,sort))),]
- return(corr)
- }
- ```
- #### Highly correlated variables in descending order
- ##############################################################################################################
- ```{r}
- cortable(data_cont)
- ```
- ## variable1 variable2 corr
- ## 1 average_review_rating number_of_reviews 0.269580511
- ## 3 number_of_reviews number_available_in_stock 0.221341663
- ## 5 average_review_rating number_available_in_stock 0.149496878
- ## 7 number_of_answered_questions number_of_reviews 0.111823030
- ## 9 average_review_rating price 0.065134089
- ## 11 average_review_rating number_of_answered_questions 0.028783844
- ## 13 number_of_reviews price 0.027596509
- ## 15 number_of_answered_questions number_available_in_stock 0.019411542
- ## 17 number_available_in_stock price 0.010957996
- ## 19 number_of_answered_questions price 0.005472014
- #### Correlation - Heat Map
- ##############################################################################################################
- ```{r}
- pheatmap::pheatmap(cor(data_cont))
- ```
- ##############################################################################################################
- Cleaning -URLs Data for Further Analysis'. Removing "http://www.amazon.co.uk/" and "/dp/", "-" and "|" from the URLs and Separating every transactional product to new column.
- ##############################################################################################################
- ```{r echo = FALSE}
- df24 <- as.data.frame(stringr::str_split_fixed(data$customers_who_bought_this_item_also_bought, " | ", 7))
- df24 <- df24[, c(1, 3, 5, 7)]
- df231<- as.data.frame(map_chr(df24$V1,~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
- df232<- as.data.frame(map_chr(df24$V3, ~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
- df233 <- as.data.frame(map_chr(df24$V5, ~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
- df234 <- as.data.frame(map_chr(df24$V7, ~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
- df23<- cbind(df231, df232, df233, df234)
- names(df23) <- c("v1", "v2", "v3", "v4")
- df221 <- as.data.frame(map_chr(df23$v1, ~stringr::str_replace_all(.x, "/dp/.*", "")))
- df222 <- as.data.frame(map_chr(df23$v2, ~stringr::str_replace_all(.x, "/dp/.*", "")))
- df223 <- as.data.frame(map_chr(df23$v3, ~stringr::str_replace_all(.x, "/dp/.*", "")))
- df224 <- as.data.frame(map_chr(df23$v4, ~stringr::str_replace_all(.x, "/dp/.*", "")))
- df22 <- cbind(df221, df222, df223, df224)
- names(df22) <- c("product1", "product2", "product3", "product4")
- df14 <- as.data.frame(stringr::str_split_fixed(data$customers_who_bought_this_item_also_bought, " | ", 7))
- df14 <- df14[, c(1, 3, 5, 7)]
- df131<- as.data.frame(map_chr(df14$V1,~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
- df132<- as.data.frame(map_chr(df14$V3, ~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
- df133 <- as.data.frame(map_chr(df14$V5, ~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
- df134 <- as.data.frame(map_chr(df14$V7, ~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
- df13<- cbind(df131, df132, df133, df134)
- names(df13) <- c("v1", "v2", "v3", "v4")
- df121 <- as.data.frame(map_chr(df23$v1, ~stringr::str_replace_all(.x, "/dp/.*", "")))
- df122 <- as.data.frame(map_chr(df23$v2, ~stringr::str_replace_all(.x, "/dp/.*", "")))
- df123 <- as.data.frame(map_chr(df23$v3, ~stringr::str_replace_all(.x, "/dp/.*", "")))
- df124 <- as.data.frame(map_chr(df23$v4, ~stringr::str_replace_all(.x, "/dp/.*", "")))
- df12 <- cbind(df221, df222, df223, df224)
- names(df12) <- c("Aproduct1", "Aproduct2", "Aproduct3", "Aproduct4")
- df12[] <- lapply(df12, gsub, pattern='-', replacement=' ')
- df22[] <- lapply(df22, gsub, pattern="-", replacement=' ')
- ```
- ##############################################################################################################
- Association Rule - Market Basket Analysis
- ##############################################################################################################
- ```{r echo=FALSE, results=FALSE}
- transaction12 <- complete_data[,c("uniq_id", "product_name")]
- transact_view <- cbind(transaction12, df12)
- transact_buy <- cbind(transaction12, df22)
- transact_view <- gather(transact_view, product_class, product_name, product_name:Aproduct4)
- transact_buy <- gather(transact_buy, product_class, product_name, product_name:product4)
- write.csv(transact_view, file = "transact_view.csv")
- write.csv(transact_buy, file = "transact_buy.csv")
- data_buy = read.transactions("transact_buy.csv", format = "single", sep = "," , cols = c("uniq_id", "product_name"), header = TRUE)
- rules_buy <- apriori(data_buy, parameter = list(supp = 0.001, conf = 0.51, minlen = 2))
- rules_buy <- sort(rules_buy, by="lift", decreasing=TRUE)
- options(digit=3)
- inspect(rules_buy[1:5])
- rules_buy = data.frame(
- product1 = labels(lhs(rules_buy)),
- product2 = labels(rhs(rules_buy)),
- rules_buy@quality)
- data_view = read.transactions("transact_view.csv", format = "single", sep = "," , cols = c("uniq_id", "product_name"), header = TRUE)
- rules_view <- apriori(data_view, parameter = list(supp = 0.001, conf = 0.51, minlen = 2))
- rules_view <- sort(rules_view, by="lift", decreasing=TRUE)
- options(digit=3)
- inspect(rules_view[1:5])
- rules_view = data.frame(
- bought_item = labels(lhs(rules_view)),
- viewed_item_before_buying = labels(rhs(rules_view)),
- rules_view@quality)
- ```
- #
- #### Inspecting the frequently bought together products - Association Rules
- ##############################################################################################################
- ```{r message=FALSE}
- head(rules_buy)
- #kable(head(rules_buy)) %>%
- # kable_styling(bootstrap_options = c("striped", "condensed"), font_size = 12)
- ```
- ## product1
- ## 5 {Hornby R8222 Gauge Track Extension}
- ## 4 {Hornby R8223 Gauge Track Extension}
- ## 2 {Thomas Friends Trackmaster}
- ## 1 {Thomas Friends Trackmaster | Thomas Friends Trackmaster | Thomas Friends Trackmaster}
- ## 3 {Pack Girls Glitter Tattoo Stencils}
- ## 7 {Monster Manual Dungeons Rulebook Rulebooks}
- ## product2
- ## 5 {Hornby R8223 Gauge Track Extension}
- ## 4 {Hornby R8222 Gauge Track Extension}
- ## 2 {Thomas Friends Trackmaster | Thomas Friends Trackmaster | Thomas Friends Trackmaster}
- ## 1 {Thomas Friends Trackmaster}
- ## 3 {Girls 10ml Glitter Tattoo Glue}
- ## 7 {Dungeon Masters Dungeons Dragons Rulebooks}
- ## support confidence lift count
- ## 5 0.001100881 0.6875000 572.4583 11
- ## 4 0.001100881 0.9166667 572.4583 11
- ## 2 0.001501201 0.6521739 434.4348 15
- ## 1 0.001501201 1.0000000 434.4348 15
- ## 3 0.001000801 0.8333333 308.3951 10
- ## 7 0.001401121 0.7368421 272.6862 14
- #
- #### Inspecting the pair of frequently bought product X- after viewing product Y.
- ##############################################################################################################
- ```{r message=FALSE}
- head(rules_view)
- #kable(head(rules_view)) %>%
- # kable_styling(bootstrap_options = c("striped", "condensed"), font_size = 12)
- ```
- ## bought_item
- ## 5 {Hornby R8222 Gauge Track Extension}
- ## 4 {Hornby R8223 Gauge Track Extension}
- ## 2 {Thomas Friends Trackmaster}
- ## 1 {Thomas Friends Trackmaster | Thomas Friends Trackmaster | Thomas Friends Trackmaster}
- ## 3 {Pack Girls Glitter Tattoo Stencils}
- ## 7 {Monster Manual Dungeons Rulebook Rulebooks}
- ## viewed_item_before_buying
- ## 5 {Hornby R8223 Gauge Track Extension}
- ## 4 {Hornby R8222 Gauge Track Extension}
- ## 2 {Thomas Friends Trackmaster | Thomas Friends Trackmaster | Thomas Friends Trackmaster}
- ## 1 {Thomas Friends Trackmaster}
- ## 3 {Girls 10ml Glitter Tattoo Glue}
- ## 7 {Dungeon Masters Dungeons Dragons Rulebooks}
- ## support confidence lift count
- ## 5 0.001100881 0.6875000 572.4583 11
- ## 4 0.001100881 0.9166667 572.4583 11
- ## 2 0.001501201 0.6521739 434.4348 15
- ## 1 0.001501201 1.0000000 434.4348 15
- ## 3 0.001000801 0.8333333 308.3951 10
- ## 7 0.001401121 0.7368421 272.6862 14
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement