Advertisement
Guest User

Untitled

a guest
Sep 29th, 2016
97
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 4.28 KB | None | 0 0
  1. ---
  2. title: "Data Validation"
  3. author: "Lidia Montero"
  4. date: "September 2016"
  5. output:
  6.   html_document:
  7.     toc: true
  8.     toc_depth: 3
  9.     number_sections: true
  10. ---
  11.  
  12. # Introduction
  13.  
  14. ## Load data
  15.  
  16.  
  17. ```{r}
  18. ### Either Read database which is binary R format###
  # Load the raw data file and create the working copy `cresco`.
  # The file is tab-separated with a header row; the code 99999999 is
  # converted to NA on import.
  base <- read.table(
    "credsco.txt",
    header = TRUE,
    sep = "\t",
    na.strings = "99999999"
  )

  # Quick structural checks on the imported table
  dim(base)
  names(base)
  head(base, 12)
  tail(base, 8)

  # Working copy modified by the later sections; `base` keeps the raw import
  cresco <- base

  # Snapshot of the whole workspace.
  # NOTE(review): absolute, machine-specific path -- confirm it exists before
  # running on another computer.
  save.image("F:/FIB-ADEI/CURS1617Q1/PRACTICA/CREDITSCORE/LABORATORI/credsco_raw.RData")
  30.  
  31. ```
  32.  
  33. # Data Coding and Clearance
  34.  
  35. ## Final Decision: dictamen
  36.  
  37. ```{r}
  # Target variable `dictamen`: clean it, recode it as a factor and describe it.
  summary(cresco)
  # The numeric summary of cresco$dictamen is of little use; tabulate it instead
  table(cresco$dictamen)

  # Remove observations with missing values in the target (coded as 0)
  llista <- which(cresco$dictamen == 0)
  llista

  # Examples of data-frame indexing
  cresco[3310, 5] # cell in row 3310, column 5
  cresco[3310, ]
  cresco[c(3310, 112), ]
  cresco[1:4, c(1, 3, 5)]
  cresco[1:4, "dictamen"]
  c(3310, 112)

  # Guarded removal: with an empty `llista`, cresco[-llista, ] would select
  # zero rows and silently drop the whole data set.
  if (length(llista) > 0) {
    cresco <- cresco[-llista, ]
  }

  # Labels given to codes 1 and 2 of the target
  paste("f.dict", c("Accepted", "Rejected"), sep = "-")
  cresco$f.dictamen <- factor(
    cresco$dictamen,
    levels = 1:2,
    labels = paste("f.dict", c("Accepted", "Rejected"), sep = "-")
  )
  summary(cresco$f.dictamen)

  # Univariate Exploratory Data Analysis (EDA)
  summary(cresco$f.dictamen)
  table(cresco$f.dictamen) # Better
  round(100 * (table(cresco$f.dictamen) / nrow(cresco)), digits = 2)

  # Graphics
  # Pie: percentage of observations in each level, used as slice labels
  piepercent <- round(100 * (table(cresco$f.dictamen) / nrow(cresco)), digits = 2)
  piepercent

  pie(table(cresco$f.dictamen), col = heat.colors(2), labels = paste(piepercent, "%"))

  legend("topright", levels(cresco$f.dictamen), cex = 0.8, fill = heat.colors(2))

  # Bar Chart
  barplot(table(cresco$f.dictamen), main = "Barplot Final Decision Factor", col = c("green", "red"))
  71.  
  72. ```
  73.  
  74. ## Experience in current job (years)
  75.  
  76. Now an example on a numeric variable
  77.  
  78. ```{r}
  # Numeric description of `anys.feina`
  names(cresco)
  summary(cresco$anys.feina)

  # Quartiles, then deciles
  quantile(cresco$anys.feina)
  seq(from = 0, to = 1, by = 0.1)
  quantile(cresco$anys.feina, probs = seq(from = 0, to = 1, by = 0.1))

  # Graphics: index plot followed by histograms with different binnings
  plot(cresco$anys.feina)
  hist(cresco$anys.feina)
  hist(cresco$anys.feina, breaks = 20)
  hist(cresco$anys.feina, breaks = seq(from = 0, to = 50, by = 4))
  hist(
    cresco$anys.feina,
    breaks = seq(from = 0, to = 50, by = 4),
    freq = FALSE,
    col = heat.colors(11)
  )

  # Boxplot with two horizontal reference lines at fixed heights.
  # NOTE(review): 20 and 40 are hard-coded; presumably meant as the upper
  # mild / severe outlier thresholds -- confirm against the calcQ() output.
  boxplot(cresco$anys.feina, main = "Boxplot Years in current Job")
  abline(h = 20, col = "red", lwd = 2)
  abline(h = 40, col = "purple", lwd = 2)
  # calcQ: quartile summary of a vector plus its outlier thresholds.
  #
  # Args:
  #   x: a vector for which summary() returns the six-value summary
  #      (Min, 1st Qu., Median, Mean, 3rd Qu., Max), e.g. a numeric vector.
  #
  # Returns:
  #   A list with the elements
  #     souti, mouti: lower severe / mild thresholds (Q1 - 3*IQR, Q1 - 1.5*IQR)
  #     min, q1, q2, q3, max: minimum, quartiles and maximum
  #     mouts, souts: upper mild / severe thresholds (Q3 + 1.5*IQR, Q3 + 3*IQR)
  calcQ <- function(x) {
    s.x <- summary(x)
    # Positions 2 and 5 of summary() are the first and third quartiles
    iqr <- s.x[5] - s.x[2]
    list(
      souti = s.x[2] - 3 * iqr,
      mouti = s.x[2] - 1.5 * iqr,
      min = s.x[1],
      q1 = s.x[2],
      q2 = s.x[3],
      q3 = s.x[5],
      max = s.x[6],
      mouts = s.x[5] + 1.5 * iqr,
      souts = s.x[5] + 3 * iqr
    )
  }

  # The call comes after the definition so that it also works in a fresh
  # session, where calcQ does not exist yet.
  calcQ(cresco$anys.feina)
  105.  
  106. ```
  107.  
  108. ## Discretization of numeric variables into factors (new)
  109.  
  110. ```{r}
  # Discretise `anys.feina` into a factor and start the outlier bookkeeping.
  summary(cresco$anys.feina)
  quantile(cresco$anys.feina, seq(0, 1, 0.1))
  # Full column name spelled out instead of relying on `$` partial matching
  quantile(cresco$anys.feina)

  # Auxiliary factor to inspect the chosen cut points
  cresco$aux <- factor(cut(cresco$anys.feina, breaks = c(-1, 1.99, 5, 12, 48)))
  summary(cresco$aux)
  tapply(cresco$anys.feina, cresco$aux, median)

  # Final factor: same intervals, levels prefixed with "f.afei-"
  cresco$f.afei <- factor(cut(cresco$anys.feina, breaks = c(-1, 1.99, 5, 12, 48)))
  levels(cresco$f.afei) <- paste0("f.afei-", levels(cresco$f.afei))
  table(cresco$f.afei)

  # This is good enough for now


  # Outliers have to be considered for each numeric variable (initialization should be done at the beginning)

  #######################################################
  # iouts: outlier count per observation; jouts: outlier count per variable
  iouts <- rep(0, nrow(cresco))
  jouts <- rep(0, ncol(cresco))
  # Name the per-variable counter so that jouts["anys.feina"] below updates
  # the slot of that column instead of appending a new element.
  names(jouts) <- names(cresco)
  ######################################################
  calcQ(cresco$anys.feina)
  library(car)
  # NOTE(review): `id.n` is the argument name used here as written in the
  # original; check it against the installed version of car.
  Boxplot(cresco$anys.feina, id.n = 5)
  abline(h = 27, col = "red", lwd = 2)
  abline(h = 42, col = "purple", lwd = 2)

  # Observations above the fixed threshold 42 are counted as outliers
  llista <- which(cresco$anys.feina > 42)
  llista
  length(llista)
  if (length(llista) > 0) {
    iouts[llista] <- iouts[llista] + 1
    jouts["anys.feina"] <- length(llista)
  }
  141.  
  142. ```
  143.  
  144. ## Vivenda
  145.  
  146. ```{r}
  # Raw distribution of the `vivenda` code
  summary(cresco$vivenda)

  # Positions where the code is 0: shown, counted and then set to missing
  ll <- which(cresco$vivenda == 0)
  ll
  length(ll)
  is.na(cresco$vivenda) <- ll

  # Codes 1 to 6 become a labelled factor
  cresco$f.habi <- factor(
    cresco$vivenda,
    levels = 1:6,
    labels = c(
      "HAB.rental", "HAB.scrpu", "HAB.contpri",
      "HAB.nocontract", "HAB.family", "HAB.others"
    )
  )
  summary(cresco$f.habi)

  # Levels 3, 4 and 6 are merged into the single level "HAB.others"
  levels(cresco$f.habi)[c(3, 4, 6)] <- "HAB.others"
  153.  
  154. ```
  155.  
  156. # Data Quality Report
  157.  
  158. ```{r}
  # Do this at the end, once all the variables have been processed.
  # imis: number of missing values per observation (one entry per row)
  # jmis: per-variable counter, named after the columns and initialised to 0
  jmis <- rep(0, ncol(cresco))
  names(jmis) <- names(cresco)

  # Number of missing per observation, computed for all rows at once instead
  # of looping over 1:nrow(cresco); unname() keeps the vector unnamed, as the
  # loop version was.
  imis <- unname(rowSums(is.na(cresco)))

  # Also for variables (jmis is still all zeros at this point)
  170.  
  171.  
  172.  
  173. ```
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement