Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Loan overdue analysis.
#
# Loads the customer and loan extracts, removes extreme values and unusable
# rows, integer-encodes the categorical customer attributes, joins both
# tables on customer_id, and fits exploratory models of the `overdued` flag
# (1 when `overdue` exceeds `overdue_threshold`, 0 otherwise).

library(plyr) # attached before dplyr so that dplyr's verbs take precedence
library(dplyr)
library(tidyr)

# Cut-offs used below; chosen after inspecting the printed upper quantiles.
max_dependant_count <- 3
max_principal_loan_amount <- 700
overdue_threshold <- 45

# Customer columns that hold nominal categories.
categorical_cols <- c("gender", "home_ownership_type", "occupation_type")

# load
# stringsAsFactors is spelled out so the script behaves the same before and
# after R 4.0.0, where the default changed from TRUE to FALSE.
df_customer <- read.table(
  file = "Customer_data.txt", header = TRUE, sep = ",",
  fileEncoding = "UTF-16", stringsAsFactors = FALSE
)
df_loan <- read.table(
  file = "Loan_data.txt", header = TRUE, sep = ",",
  fileEncoding = "UTF-16", stringsAsFactors = FALSE
)

# manipulate columns
# Give both tables the same key name so they can be joined on it.
df_customer <- plyr::rename(df_customer, c("X.customer_id" = "customer_id"))
df_loan <- plyr::rename(df_loan, c("customer_ID" = "customer_id"))

# remove extremes
# print() keeps the quantiles visible when the script is run via source().
# which() drops rows whose value is NA instead of turning them into all-NA
# rows, which is what logical indexing with an NA comparison would do.
tail_probs <- c(0.9, 0.95, 0.98, 0.99)

print(quantile(df_customer$dependant_count, probs = tail_probs, na.rm = TRUE))
keep_customer <- which(df_customer$dependant_count <= max_dependant_count)
df_customer <- df_customer[keep_customer, ]

print(quantile(df_loan$Principal_loan_amount, probs = tail_probs, na.rm = TRUE))
keep_loan <- which(df_loan$Principal_loan_amount <= max_principal_loan_amount)
df_loan <- df_loan[keep_loan, ]

# filter data
# Drop customers without an id, then keep one row per customer_id (the first
# occurrence) so the join below cannot multiply loans across duplicates.
df_customer <- df_customer[!is.na(df_customer$customer_id), ]
df_customer <- df_customer[!duplicated(df_customer$customer_id), ]

# replace factors
# Each category becomes its 1-based position among the sorted distinct
# values still present in the column; NA stays NA.
df_customer[categorical_cols] <- lapply(
  df_customer[categorical_cols],
  function(column) as.integer(factor(column))
)

# add columns
# X.loan_id has the form "<prefix>-<loan id>"; only the loan id is kept.
df_loan <- separate(
  data = df_loan, col = X.loan_id,
  into = c("customer_id_tmp", "loan_id"), sep = "-"
)
df_loan$customer_id_tmp <- NULL

# Non-numeric overdue values become NA (with a coercion warning) and those
# rows are dropped before the binary target is derived.
df_loan$overdue <- as.numeric(as.character(df_loan$overdue))
df_loan <- df_loan[!is.na(df_loan$overdue), ]
df_loan$overdued <- as.numeric(df_loan$overdue > overdue_threshold)

# NOTE(review): assumes customer_id is the only column the two tables are
# meant to be matched on -- confirm against the input files.
df <- plyr::join(df_customer, df_loan, by = "customer_id", type = "inner")

# other
boxplot(df$Principal_loan_amount)
boxplot(df$dependant_count)
hist(df_customer$dependant_count)
hist(df$occupation_type)

df_low_45 <- subset(df, overdue <= overdue_threshold)
df_gre_45 <- subset(df, overdue > overdue_threshold)

# Modelling frame: identifiers carry no signal, and `overdue` defines the
# target, so all three are excluded from the predictors. The integer category
# codes are converted to factors so they are not treated as ordered numeric
# quantities.
df_model <- df[
  , setdiff(names(df), c("customer_id", "loan_id", "overdue")),
  drop = FALSE
]
df_model[categorical_cols] <- lapply(df_model[categorical_cols], factor)

fit <- lm(overdued ~ occupation_type, data = df_model)
print(summary(fit)) # show results

model_full <- glm(
  overdued ~ .,
  family = binomial(link = "logit"), data = df_model
)
print(summary(model_full))

# `model` ends up holding the gender-only fit, as in the original script.
model <- glm(
  overdued ~ gender,
  family = binomial(link = "logit"), data = df_model
)
print(summary(model))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement