Advertisement
Guest User

Untitled

a guest
Mar 24th, 2017
67
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 2.20 KB | None | 0 0
  1. library(plyr)
  2. library(dplyr)
  3. library(tidyr)
  4.  
  # load
  # Both inputs are read as comma-separated UTF-16 text with a header row.
  # stringsAsFactors is pinned to TRUE: since R 4.0 read.table() keeps strings
  # as character by default, while the "replace factors" step of this script
  # calls levels() on the categorical columns.
  df_customer <- read.table(
    file = "Customer_data.txt",
    header = TRUE,
    sep = ",",
    fileEncoding = "UTF-16",
    stringsAsFactors = TRUE
  )
  df_loan <- read.table(
    file = "Loan_data.txt",
    header = TRUE,
    sep = ",",
    fileEncoding = "UTF-16",
    stringsAsFactors = TRUE
  )
  8.  
  # manipulate columns
  # Give both tables the same key column name, "customer_id".
  df_customer <- plyr::rename(
    x = df_customer,
    replace = c(X.customer_id = "customer_id")
  )
  df_loan <- plyr::rename(
    x = df_loan,
    replace = c(customer_ID = "customer_id")
  )
  12.  
  # remove extremes
  # Print the upper-tail quantiles of each variable, then drop the rows that
  # exceed a fixed cap.
  upper_tail_probs <- c(0.9, 0.95, 0.98, 0.99)
  max_dependant_count <- 3
  max_principal_loan_amount <- 700

  # na.rm = TRUE: quantile() stops with an error on missing values otherwise.
  quantile(df_customer$dependant_count, probs = upper_tail_probs, na.rm = TRUE)
  # which() ignores NA comparisons; indexing with a logical vector that
  # contains NA would insert all-NA rows instead.
  keep_customer <- which(df_customer$dependant_count <= max_dependant_count)
  df_customer <- df_customer[keep_customer, ]

  quantile(df_loan$Principal_loan_amount, probs = upper_tail_probs, na.rm = TRUE)
  keep_loan <- which(df_loan$Principal_loan_amount <= max_principal_loan_amount)
  df_loan <- df_loan[keep_loan, ]
  18.  
  # filter data
  # Drop customers without an id.
  df_customer <- df_customer[!is.na(df_customer$customer_id), ]

  # Keep one row per customer_id. Passing a bare c(1) to unique() does not
  # select a key column: with incomparables named, the positional value is
  # matched to unique()'s fromLast argument, so only fully identical rows are
  # removed and the last copy is the one kept. Keying on customer_id makes the
  # de-duplication explicit; fromLast = TRUE retains "last occurrence wins".
  is_repeated_id <- duplicated(df_customer$customer_id, fromLast = TRUE)
  df_customer <- df_customer[!is_repeated_id, ]
  22.  
  # replace factors
  # Recode each categorical column as its integer level index. The *_levels
  # vectors keep the labels, so a code maps back via <name>_levels[code].
  # as.factor() also covers columns that were read as character, and
  # as.integer() always yields a plain integer vector (NA stays NA), whereas
  # sapply(x, function(v) which(levels == v)) returns a list as soon as a
  # single value has no matching level.
  df_customer$gender <- as.factor(df_customer$gender)
  gender_levels <- levels(df_customer$gender)
  df_customer$gender <- as.integer(df_customer$gender)

  df_customer$home_ownership_type <- as.factor(df_customer$home_ownership_type)
  home_ownership_type_levels <- levels(df_customer$home_ownership_type)
  df_customer$home_ownership_type <- as.integer(df_customer$home_ownership_type)

  df_customer$occupation_type <- as.factor(df_customer$occupation_type)
  occupation_type_levels <- levels(df_customer$occupation_type)
  df_customer$occupation_type <- as.integer(df_customer$occupation_type)
  30.  
  # add columns
  # Split X.loan_id at "-" into two pieces and keep only the second (loan_id).
  df_loan <- df_loan %>%
    separate(col = X.loan_id, into = c("customer_id_tmp", "loan_id"), sep = "-")
  df_loan[["customer_id_tmp"]] <- NULL

  # Coerce overdue to numeric by way of character, then drop the rows where
  # the result is NA.
  df_loan$overdue <- df_loan$overdue %>%
    as.character() %>%
    as.numeric()
  df_loan <- subset(df_loan, !is.na(overdue))

  # overdued is 1 when overdue is greater than 45, otherwise 0.
  df_loan$overdued <- ifelse(df_loan$overdue > 45, 1, 0)

  # Inner join of customers and loans. No `by` is given, so plyr::join()
  # matches on every column name the two tables have in common.
  df <- plyr::join(df_customer, df_loan, type = "inner")
  39.  
  40.  
  41.  
  # other
  # Quick visual checks: box plots on the joined table, histograms of the
  # customer dependant count and of the occupation codes.
  boxplot(df$Principal_loan_amount)
  boxplot(df$dependant_count)
  hist(df_customer$dependant_count)
  hist(df$occupation_type)

  # Partition the joined table at the overdue threshold of 45.
  df_low_45 <- subset(df, overdue <= 45)
  df_gre_45 <- subset(df, overdue > 45)
  50.  
  51.  
  # Linear model of the overdued flag on occupation_type.
  # NOTE(review): occupation_type is an integer level index by this point, so
  # it enters the model as one numeric slope - confirm that is intended.
  fit <- lm(overdued ~ occupation_type, data = df)
  summary(fit) # show results

  # Logistic regression on the remaining columns. overdue is left out because
  # overdued is computed from it (overdue > 45) and would separate the classes
  # perfectly; customer_id and loan_id are identifiers rather than predictors.
  model_all <- glm(
    overdued ~ . - overdue - customer_id - loan_id,
    family = binomial(link = "logit"),
    data = df
  )
  summary(model_all)

  # Logistic regression on gender alone; `model` holds this fit afterwards.
  model <- glm(overdued ~ gender, family = binomial(link = "logit"), data = df)
  summary(model)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement