proton.ai -task_3

---
title: "Report-Amazon Sales data"
output:
  html_document: default
  pdf_document: default
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
##############################################################################################################

###### prton.ai - task
#### Preparations

##############################################################################################################

```{r loading_packages, message = FALSE}
options( warn = -1 )
#install.packages("rmarkdown")
#install.packages("dplyr")
#install.packages("ggplot2")
#install.packages("tidyr")
#install.packages("purrr")
#install.packages("kableExtra")
#install.packages("VIM")
#install.packages("knitr")
#install.packages("pheatmap")
#install.packages("readr")
#install.packages("arules")
#install.packages("stringr")
library(rmakrdown)
library(dplyr)
library(ggplot2)
library(tidyr)
library(purrr)
library(kableExtra)
library(VIM)
library(knitr)
library(pheatmap)
library(readr)
library(arules)
library(stringr)


########## ___________________________________________________________________________________________________________
data <- read_csv("C:/Users/laayt/Downloads/amazon_co-ecommerce_sample.csv")

```

#### Data Cleaning-Make variables more readables by removing unnecessary characters from it.


```{r}
data2 <- data[,c("price", "average_review_rating", "number_available_in_stock")]

data$price <- as.numeric(gsub("[^0-9.-]", "", data$price))
data$average_review_rating <- as.numeric(gsub('[out of 5 stars]', '', data$average_review_rating))
data$number_available_in_stock <- as.numeric(gsub("[^0-9.-]", "", data$number_available_in_stock))

data1 <- data[,c("price", "average_review_rating", "number_available_in_stock")]
```
##### Transforming Unstructured Numerical Columns To
##############################################################################################################
```{r}
   head(data2,7)
```

## # A tibble: 7 x 3
##   price  average_review_rating number_available_in_stock
##   <chr>  <chr>                 <chr>
## 1 £3.42  4.9 out of 5 stars    5 new
## 2 £16.99 4.5 out of 5 stars    <NA>
## 3 £9.99  3.9 out of 5 stars    2 new
## 4 £39.99 5.0 out of 5 stars    <NA>
## 5 £32.19 4.7 out of 5 stars    <NA>
## 6 £6.99  5.0 out of 5 stars    <NA>
## 7 £24.99 4.5 out of 5 stars    <NA>

##### Structured Numerical Columns
##############################################################################################################

```{r}
head(data1,7)
```
## # A tibble: 7 x 3
##   price average_review_rating number_available_in_stock
##   <dbl>                 <dbl>                     <dbl>
## 1  3.42                   4.9                         5
## 2 17.0                    4                          NA
## 3  9.99                   3.9                         2
## 4 40.0                    0                          NA
## 5 32.2                    4.7                        NA
## 6  6.99                   0                          NA
## 7 25.0                    4                          NA


#### Percentage missing data-Column wise
```{r}

getMissingData <- function(df, row_display = 6){
  df1 <- data.frame(df_cols = names(df))
  df1$miss_num <- map_dbl(df, ~sum(is.na(.x)))
  df1$miss_percentage <- map_dbl(df, ~sum((is.na(.x)/nrow(df))*100))
  df1 <<- df1
  return(head(df1, n = row_display))
}
getMissingData(data)
#kable(head(df1,6)) %>%
 # kable_styling(bootstrap_options = c("striped", "condensed"), font_size = 12)
```
##                     df_cols miss_num miss_percentage
## 1                   uniq_id        0        0.000000
## 2              product_name        0        0.000000
## 3              manufacturer        6        0.059988
## 4                     price     1457       14.567087
## 5 number_available_in_stock     2502       25.014997
## 6         number_of_reviews       22        0.219956

#### Percentage missing data-Raw wise
##############################################################################################################


```{r}
getMissingRows <- function(df, row_display = 6){
  df1 <- data.frame(df_rows = 1:nrow(df))
  df1$miss_num <- rowSums(is.na(df))
  df1$miss_percentage <- rowSums((is.na(df)/ncol(df))*100)
  df1 <- df1 %>% arrange(desc(miss_percentage))
  df3 <<- df1
  return(head(df1, n = row_display))
}
getMissingRows(data)
#kable(head(df3, 5)) %>%
  #kable_styling(bootstrap_options = c("striped", "condensed"), font_size = 12)

```
##   df_rows miss_num miss_percentage
## 1    2276      892        99.33185
## 2     644      889        98.99777
## 3     795      889        98.99777
## 4    1013      889        98.99777
## 5    1419      889        98.99777
## 6    1809      889        98.99777

#### Removing completely empty rows and columns
##############################################################################################################

```{r}
dim(data) #inspecting current dimension of the dataset
remove_empty_rows <- function(df){
  no_empty_rows <<- df[rowSums(is.na(df)) != ncol(df),]

}
remove_empty_rows(data)

complete_data <- function(df){
  removeCols <- colSums(is.na(df)) == nrow(df)
  complete_data <<- df[!removeCols]
}
complete_data(no_empty_rows)

dim(complete_data) #removing empty columns

```

#### Data-Type-Definition
##############################################################################################################


```{r}
Categorical_data<-function(df, numlevels = 5){
  temp_use <- data.frame(levels = apply(df, 2, function(x) length(table(x))))
  temp_use$var_names <- rownames(temp_use)
  type_class <- sapply(colnames(df), function(x) class(df[[x]]))
  temp_use$type_class <- as.data.frame(type_class)
  Cat_data <<- temp_use %>%
    filter(levels <= numlevels |  type_class != "numeric" )
}

Continious_data <-function(df, numlevels = 5){
  temp_use <- data.frame(levels = apply(df, 2, function(x) length(table(x))))
  temp_use$var_names <- rownames(temp_use)
  type_class <- sapply(colnames(df), function(x) class(df[[x]]))
  temp_use$type_class <- as.data.frame(type_class)
  Cont_data <<- temp_use %>%
    filter(levels > numlevels & type_class == "numeric" )
}

zero_variance_vars <- function(df){
  temp_use <- data.frame(levels = apply(df, 2, function(x) length(table(x))))
  temp_use$var_names <- rownames(temp_use)
  type_class <- sapply(colnames(df), function(x) class(df[[x]]))
  temp_use$type_class <- as.data.frame(type_class)
  static_data <<- temp_use %>%
    filter(temp_use$levels == 1 | temp_use$levels == nrow(df))
  if(nrow(static_data) == 0){
    print("There are no variables in this dataframe having zero variance")
  }
}
```
#### List of Continuous variables
##############################################################################################################
```{r}
Continious_data(complete_data)
Cont_data
#kable(Cont_data) %>%
 # kable_styling(bootstrap_options = c("striped", "condensed"), font_size = 12)
```
##   levels                    var_names type_class
## 1   2610                        price    numeric
## 2     74    number_available_in_stock    numeric
## 3    192            number_of_reviews    numeric
## 4     19 number_of_answered_questions    numeric
## 5     18        average_review_rating    numeric


#### List of Categorical variables
##############################################################################################################

```{r}
Categorical_data(complete_data, 5)
Cat_data
#kable(Cat_data) %>%
  #kable_styling(bootstrap_options = c("striped", "condensed"), font_size = 12)
```

##    levels                                   var_names type_class
## 1    9992                                     uniq_id  character
## 2    9965                                product_name  character
## 3    2654                                manufacturer  character
## 4     257            amazon_category_and_sub_category  character
## 5    8756  customers_who_bought_this_item_also_bought  character
## 6    8515                                 description  character
## 7    9939                         product_information  character
## 8    8514                         product_description  character
## 9    6749 items_customers_buy_after_viewing_this_item  character
## 10    910              customer_questions_and_answers  character
## 11   9901                            customer_reviews  character


#### List of Zero-variant variables
##############################################################################################################

```{r}
zero_variance_vars(complete_data)
```
## [1] "There are no variables in this data frame having zero variance"


#### Missing Value-Imputation : List of imputed missing value
##############################################################################################################

```{r}
data_cont <- complete_data[, which(colnames(complete_data) %in% Cont_data$var_names)]
imputed_df = as.data.frame(VIM::kNN(data_cont,c(Cont_data$var_names), k = 3))
imputed_df <- imputed_df[,c(1:5)]
head(imputed_df)
#kable(head(imputed_df)) %>%
 # kable_styling(bootstrap_options = c("striped", "condensed"), font_size = 12)
```
##   price number_available_in_stock number_of_reviews
## 1  3.42                         5                15
## 2 16.99                        17                 2
## 3  9.99                         2                17
## 4 39.99                         3                 1
## 5 32.19                         7                 3
## 6  6.99                         5                 2
##   number_of_answered_questions average_review_rating
## 1                            1                   4.9
## 2                            1                   4.0
## 3                            2                   3.9
## 4                            2                   0.0
## 5                            2                   4.7
## 6                            1                   0.0


#### CORRELATION -
##############################################################################################################

```{r}
data_cont <- imputed_df[, which(colnames(imputed_df) %in% Cont_data$var_names)]
data_cat  <- na.omit(complete_data[, which(colnames(complete_data) %in% Cat_data$var_names)])

cortable <- function(df, cutoff=0){
  library(dplyr)
  library(pheatmap)
  a <- abs(cor(df))
  corr <- data.table(variable1=rownames(a)[row(a)], variable2=colnames(a)[col(a)], corr=c(a))
  corr <- corr %>%
    filter(!(variable1 == variable2)) %>%
    filter(corr > cutoff) %>%
    arrange(desc(corr))
  corr <- corr[!duplicated(t(apply(corr,1,sort))),]
  return(corr)
}
```

#### Highly correlated variables in descending order
##############################################################################################################

```{r}
cortable(data_cont)
```
##                       variable1                    variable2        corr
## 1         average_review_rating            number_of_reviews 0.269580511
## 3             number_of_reviews    number_available_in_stock 0.221341663
## 5         average_review_rating    number_available_in_stock 0.149496878
## 7  number_of_answered_questions            number_of_reviews 0.111823030
## 9         average_review_rating                        price 0.065134089
## 11        average_review_rating number_of_answered_questions 0.028783844
## 13            number_of_reviews                        price 0.027596509
## 15 number_of_answered_questions    number_available_in_stock 0.019411542
## 17    number_available_in_stock                        price 0.010957996
## 19 number_of_answered_questions                        price 0.005472014


#### Correlation - Heat Map
##############################################################################################################

```{r}
pheatmap::pheatmap(cor(data_cont))
```


##############################################################################################################

Cleaning -URLs Data for Further Analysis'. Removing "http://www.amazon.co.uk/" and "/dp/", "-" and "|" from the URLs and Separating every transactional product to new column.
##############################################################################################################

```{r echo = FALSE}
df24 <- as.data.frame(stringr::str_split_fixed(data$customers_who_bought_this_item_also_bought, " | ", 7))
df24 <- df24[, c(1, 3, 5, 7)]
df231<- as.data.frame(map_chr(df24$V1,~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
df232<- as.data.frame(map_chr(df24$V3, ~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
df233 <- as.data.frame(map_chr(df24$V5, ~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
df234 <- as.data.frame(map_chr(df24$V7, ~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
df23<- cbind(df231, df232, df233, df234)
names(df23) <- c("v1", "v2", "v3", "v4")
df221 <- as.data.frame(map_chr(df23$v1, ~stringr::str_replace_all(.x, "/dp/.*", "")))
df222 <- as.data.frame(map_chr(df23$v2, ~stringr::str_replace_all(.x, "/dp/.*", "")))
df223 <- as.data.frame(map_chr(df23$v3, ~stringr::str_replace_all(.x, "/dp/.*", "")))
df224 <- as.data.frame(map_chr(df23$v4, ~stringr::str_replace_all(.x, "/dp/.*", "")))
df22 <- cbind(df221, df222, df223, df224)
names(df22) <- c("product1", "product2", "product3", "product4")

df14 <- as.data.frame(stringr::str_split_fixed(data$customers_who_bought_this_item_also_bought, " | ", 7))
df14 <- df14[, c(1, 3, 5, 7)]
df131<- as.data.frame(map_chr(df14$V1,~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
df132<- as.data.frame(map_chr(df14$V3, ~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
df133 <- as.data.frame(map_chr(df14$V5, ~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
df134 <- as.data.frame(map_chr(df14$V7, ~stringr::str_remove_all(.x, "http://www.amazon.co.uk/")))
df13<- cbind(df131, df132, df133, df134)
names(df13) <- c("v1", "v2", "v3", "v4")
df121 <- as.data.frame(map_chr(df23$v1, ~stringr::str_replace_all(.x, "/dp/.*", "")))
df122 <- as.data.frame(map_chr(df23$v2, ~stringr::str_replace_all(.x, "/dp/.*", "")))
df123 <- as.data.frame(map_chr(df23$v3, ~stringr::str_replace_all(.x, "/dp/.*", "")))
df124 <- as.data.frame(map_chr(df23$v4, ~stringr::str_replace_all(.x, "/dp/.*", "")))
df12 <- cbind(df221, df222, df223, df224)
names(df12) <- c("Aproduct1", "Aproduct2", "Aproduct3", "Aproduct4")
df12[] <- lapply(df12, gsub, pattern='-', replacement=' ')
df22[] <- lapply(df22, gsub, pattern="-", replacement=' ')

```

##############################################################################################################
Association Rule - Market Basket Analysis
##############################################################################################################


```{r echo=FALSE, results=FALSE}
transaction12 <- complete_data[,c("uniq_id", "product_name")]
transact_view <- cbind(transaction12, df12)
transact_buy <- cbind(transaction12, df22)

transact_view <- gather(transact_view, product_class, product_name, product_name:Aproduct4)
transact_buy <- gather(transact_buy, product_class, product_name, product_name:product4)


write.csv(transact_view, file = "transact_view.csv")
write.csv(transact_buy, file = "transact_buy.csv")
data_buy = read.transactions("transact_buy.csv", format = "single", sep = "," , cols = c("uniq_id", "product_name"), header = TRUE)

rules_buy <- apriori(data_buy, parameter = list(supp = 0.001, conf = 0.51, minlen = 2))
rules_buy <- sort(rules_buy, by="lift", decreasing=TRUE)
options(digit=3)

inspect(rules_buy[1:5])
rules_buy = data.frame(
          product1 = labels(lhs(rules_buy)),
          product2 = labels(rhs(rules_buy)),
          rules_buy@quality)

data_view = read.transactions("transact_view.csv", format = "single", sep = "," , cols = c("uniq_id", "product_name"), header = TRUE)

rules_view <- apriori(data_view, parameter = list(supp = 0.001, conf = 0.51, minlen = 2))
rules_view <- sort(rules_view, by="lift", decreasing=TRUE)
options(digit=3)

inspect(rules_view[1:5])
rules_view = data.frame(
          bought_item = labels(lhs(rules_view)),
          viewed_item_before_buying = labels(rhs(rules_view)),
          rules_view@quality)

```
#


#### Inspecting the frequently bought together products - Association Rules
##############################################################################################################

```{r message=FALSE}

head(rules_buy)

#kable(head(rules_buy)) %>%
 # kable_styling(bootstrap_options = c("striped", "condensed"), font_size = 12)

```
##                                                                                 product1
## 5                                                   {Hornby R8222 Gauge Track Extension}
## 4                                                   {Hornby R8223 Gauge Track Extension}
## 2                                                           {Thomas Friends Trackmaster}
## 1 {Thomas Friends Trackmaster | Thomas Friends Trackmaster | Thomas Friends Trackmaster}
## 3                                                   {Pack Girls Glitter Tattoo Stencils}
## 7                                           {Monster Manual Dungeons Rulebook Rulebooks}
##                                                                                 product2
## 5                                                   {Hornby R8223 Gauge Track Extension}
## 4                                                   {Hornby R8222 Gauge Track Extension}
## 2 {Thomas Friends Trackmaster | Thomas Friends Trackmaster | Thomas Friends Trackmaster}
## 1                                                           {Thomas Friends Trackmaster}
## 3                                                       {Girls 10ml Glitter Tattoo Glue}
## 7                                           {Dungeon Masters Dungeons Dragons Rulebooks}
##       support confidence     lift count
## 5 0.001100881  0.6875000 572.4583    11
## 4 0.001100881  0.9166667 572.4583    11
## 2 0.001501201  0.6521739 434.4348    15
## 1 0.001501201  1.0000000 434.4348    15
## 3 0.001000801  0.8333333 308.3951    10
## 7 0.001401121  0.7368421 272.6862    14
#


#### Inspecting the pair of frequently bought product X-  after viewing product Y.
##############################################################################################################

```{r message=FALSE}
head(rules_view)
#kable(head(rules_view)) %>%
 # kable_styling(bootstrap_options = c("striped", "condensed"), font_size = 12)
```

##                                                                              bought_item
## 5                                                   {Hornby R8222 Gauge Track Extension}
## 4                                                   {Hornby R8223 Gauge Track Extension}
## 2                                                           {Thomas Friends Trackmaster}
## 1 {Thomas Friends Trackmaster | Thomas Friends Trackmaster | Thomas Friends Trackmaster}
## 3                                                   {Pack Girls Glitter Tattoo Stencils}
## 7                                           {Monster Manual Dungeons Rulebook Rulebooks}
##                                                                viewed_item_before_buying
## 5                                                   {Hornby R8223 Gauge Track Extension}
## 4                                                   {Hornby R8222 Gauge Track Extension}
## 2 {Thomas Friends Trackmaster | Thomas Friends Trackmaster | Thomas Friends Trackmaster}
## 1                                                           {Thomas Friends Trackmaster}
## 3                                                       {Girls 10ml Glitter Tattoo Glue}
## 7                                           {Dungeon Masters Dungeons Dragons Rulebooks}
##       support confidence     lift count
## 5 0.001100881  0.6875000 572.4583    11
## 4 0.001100881  0.9166667 572.4583    11
## 2 0.001501201  0.6521739 434.4348    15
## 1 0.001501201  1.0000000 434.4348    15
## 3 0.001000801  0.8333333 308.3951    10
## 7 0.001401121  0.7368421 272.6862    14