Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Clean the Amazon e-commerce sample and write data-quality reports.
#
# Steps:
#   1. Read the CSV and turn empty strings into NA.
#   2. Keep the first 16 columns and coerce them to usable types.
#   3. Split the category path (" > " separated) into five columns.
#   4. Split the "also bought" list (" | " separated) into columns.
#   5. Write numeric / categorical quality summaries to CSV.

library(dplyr)
library(tidyr)
# stringr supplies str_remove() and str_split_fixed(); without it the
# script stops with "could not find function".
library(stringr)
library(splitstackshape)
library(dataQualityR)

file <- read.csv("amazon_co-ecommerce_sample.csv", header = TRUE)

# Treat blank cells as missing so they are counted and handled as NA.
file[file == ""] <- NA

# Per-column NA counts, taken over all columns before subsetting.
count_na <- data.frame(sapply(file, function(x) sum(is.na(x))))

# Keep only the first 16 columns.
file <- file[seq_len(16)]
lapply(file, class)

# Every conversion goes through as.character() first: calling
# as.numeric() directly on a factor returns its integer level codes,
# not the values the labels spell out.
new_file <- file %>%
  mutate(
    product_name = as.character(product_name),
    price = as.numeric(str_remove(as.character(price), "£")),
    # Keep the leading run of digits and drop whatever text follows it.
    number_available_in_stock = as.numeric(
      gsub("([0-9]+).*$", "\\1", as.character(number_available_in_stock))
    ),
    # Any thousands separators are removed so such values still parse.
    number_of_reviews = as.numeric(
      gsub(",", "", as.character(number_of_reviews), fixed = TRUE)
    ),
    number_of_answered_questions = as.numeric(
      as.character(number_of_answered_questions)
    ),
    average_review_rating = as.numeric(
      gsub(" out of 5 stars", "", as.character(average_review_rating))
    ),
    description = as.character(description),
    customers_who_bought_this_item_also_bought = as.character(
      customers_who_bought_this_item_also_bought
    )
  )

# Row index used as the join key for the split category columns.
new_file$ID <- seq_len(nrow(new_file))

### split category column
# At most five levels; anything deeper stays merged in the fifth piece.
a <- data.frame(
  str_split_fixed(
    as.character(new_file$amazon_category_and_sub_category), " > ", 5
  )
)
a[a == ""] <- NA
colnames(a) <- c(
  "sub_category1", "sub_category2", "sub_category3",
  "sub_category4", "sub_category5"
)
a$ID <- seq_len(nrow(a))

new_file <- new_file %>%
  left_join(a, by = "ID")

##### split customers_who_bought_this_item_also_bought column
b <- cSplit(new_file, "customers_who_bought_this_item_also_bought", " | ")

# Writes the numeric and categorical quality summaries to disk.
checkDataQuality(
  data = b,
  out.file.num = "dq_num.csv",
  out.file.cat = "dq_cat.csv"
)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement