Advertisement
Guest User

Untitled

a guest
Jan 13th, 2019
70
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 13.04 KB | None | 0 0
  1. # seminar 2 ---------------------------------------------------------------
  2. library(corrplot)
  3. library(ggplot2)
  4. library(Hmisc)
  5. library(PerformanceAnalytics)
  6. library( scatterplot3d)
  7. library(FactoMineR)
  8. library(factoextra)
  9. library(cluster)
  10. library(ca)
  11. datepath<-"D:\\cursuri\\master\\data mining\\proiect"
  12. date<-read.csv("indicatori.csv")
  13. date<-read.table(file.path(datepath,"indicatori.csv"),sep=",",header=TRUE, row.names=1)
  14. View(date)
  15. summary(date)
  16. a<-sd(date$I1)
  17. b<-sd(date$I2)
  18. c<-sd(date$I3)
  19. d<-sd(date$I4)
  20. e<-sd(date$I5)
  21. f<-sd(date$I6)
  22. g<-sd(date$I7)
  23. h<-sd(date$I8)
  24. s=c(a,b,c,d,e,f,g,h)
  25. View(s)
  26. boxplot(date,col=c("red","green","blue","yellow","ivory","pink","orange","purple"))
  27. corrplot( c2$r, type="upper", p.mat=c2$P, sig.level= 0.05, insig="blank")
  28. chart.Correlation(date,histogram=TRUE, pch=19)
  29. indic<- read.csv("indicatori.csv")
  30. ind<-indic[2:9]
  31. View(ind)
  32. acp1<- princomp( ind, cor= TRUE, scores= TRUE)
  33. summary(acp1)  
  34. plot(acp1, type="l")
  35. biplot(acp1)
  36. acp1$loadings
  37. scoruri<- acp1$ scores  
  38. scoruri
  39. acp2<-prcomp(ind, center= TRUE, scale= TRUE)
  40. summary(acp2)
  41. plot(acp2, type="l")
  42. biplot(acp2)
  43. scoruri2<- acp2$ scores
  44. cor(ind, scoruri2[,1:2])
  45. acp3<- PCA(ind)
  46. summary(acp3)
  47. fviz_pca_var(acp3,axes=c(1,2),col.var="contrib")
  48. plot(acp3,choix="ind")
  49. varimax<- varimax(acp2$rotation[,1:2])
  50. scorurirotite<- scale(ind)%*% varimax$loadings
  51. cor(ind,scorurirotite)  
  52. plot(scorurirotite[,1],scorurirotite[,2])
  53. d<- dist(scorurirotite[,1:2],method = "euclidean")
  54. sol1<- hclust(d, method = "ward.D2")
  55. plot(sol1)
  56. sol1.2<-cutree(sol1, k=2)
  57. s1.2<-silhouette(sol1.2, d )
  58. plot(s1.2)
  59. sol2<-kmeans(scorurirotite[,1:2],2) # 2 de la final ne spune cate clustere alegem
  60. s2.2<-silhouette(sol2$cluster,d)
  61. plot(s2.2)
  62. fviz_cluster(list(data=scorurirotite[,1:2],cluster=sol2$cluster))# pt Kmeans
  63. fviz_cluster(list(data=scorurirotite[,1:2],cluster=cutree(sol1, k=2))) # ceva nu e in regula
  64. rasp<- read.csv("formular3.csv")
  65. View(rasp)
  66. attach(rasp)
  67. rasp2<- subset(rasp,Solutie2!="")
  68. View(rasp2)
  69. rasp2$Solutie1<- rasp2$Solutie2
  70. rasp3<- rbind(rasp,rasp2)
  71. View(rasp3)
  72. fix(rasp3)
  73. tabel<-table(rasp3$Solutie1, rasp3$durata)
  74. View(tabel)
  75. levels(rasp3$Solutie1)
  76. levels(rasp3$Solutie1)[levels (rasp3$Solutie1)%in%c("Tonomat")]<-"Cumpar de la supermarket"
  77. levels(rasp3$Solutie1)
  78. tabel<-table(rasp3$Solutie1, rasp3$durata)
  79. View(tabel)
  80. tabel<-tabel[-5,]
  81. View(tabel)
  82. corespond<- ca(tabel)
  83. summary(corespond)
  84. window()
  85. plot(corespond)
  86. library(ggplot2)
  87. library(memisc)
  88. library(ROCR)
  89. library(ROSE)
  90. # regresie logistica ------------------------------------------------------
  91. a<- data.frame(as.data.set(spss.system.file("antrep.sav")))
  92. View(a)
  93. attach(a)
  94. table(country)
  95. China<-subset(a,country=="China")
  96. table(China$bstart)
  97. variable<-which(names(China)%in%c("bstart","gemwork3","gemhhinc","gemeduc","knowent","suskill","fearfail","gender","age9c"))
  98. variable                                    
  99. China<-China[,variable]
  100. China<-na.omit(China)
  101. China
  102. table(China$bstart)
  103. model1<-glm(bstart~gemwork3+gemhhinc+knowent+gender,data=China,family = "binomial")
  104. model1
  105. summary(model1)
  106. exp(coef(model1))
  107. table(China$gemeduc)
  108. levels(China$gemeduc)[levels(China$gemeduc) %in% c("NONE","SOME SECONDARY")] <- "level1"
  109. levels(China$gemeduc)[levels(China$gemeduc) %in% c("SECONDARY DEGREE")] <- "level2"
  110. levels(China$gemeduc)[levels(China$gemeduc) %in% c("POST SECONDARY","GRAD EXP")] <- "level3"
  111. table(China$gemeduc)
  112. table(China$age9c)
  113. levels(China$age9c)[levels(China$age9c) %in% c("0-17","18-24")]<- "categorie1"
  114. levels(China$age9c)[levels(China$age9c) %in% c("25-34","35-44","45-54")]<- "categorie2"
  115. levels(China$age9c)[levels(China$age9c) %in% c("55-64","65-120")]<- "categorie3"
  116. table(China$age9c)
  117. model3<-glm(bstart~gemwork3+knowent+suskill, data=China, family="binomial")
  118. summary(model3)
  119. exp(coef(model3))
  120. library(ROCR)
  121. yhat<- predict(model3, type="response")
  122. pr<-prediction(yhat, China$bstart, label.ordering = NULL)
  123. perf<-performance(pr, "tpr", "fpr")
  124. windows()
  125. plot(perf, colorize= TRUE, lwd=5)
  126. performance(pr, "auc")
  127. library(ROSE)
  128. table(China$bstart)
  129. date.rose<-ROSE(bstart~.,data=China,p=0.4, seed = 123) #p arata cum reechilibrez
  130. table(date.rose$data$bstart)
  131. library(Matching)
  132. date.rose<-ROSE(bstart~.,data=China,p=0.4, seed = 123)$data #p arata cum reechilibrez
  133. table(date.rose$bstart)
  134. model<-glm(bstart~ gemwork3+knowent+suskill,data=date.rose,family = "binomial")
  135. summary(model)
  136. exp(coef(model))
  137. library(ROCR)
  138. yhat<- predict(model, type="response")
  139. pr<-prediction(yhat, date.rose$bstart, label.ordering = NULL)
  140. perf<-performance(pr, "tpr", "fpr")
  141. plot(perf, colorize= TRUE, lwd=5)
  142. performance(pr, "auc")
  143. country<- bstart~gemwork3+knowent+suskill
  144. country1<-country[,c(1,2,4,7,8)] # aici am pastrat clasificatorii de mai sus, sunt cei folositi in model2
  145. View(country1)
  146. View(country)
  147. perfmRose<- ROSE.eval(bstart~., data= country1,
  148.                       learner = glm, method.assess = "LKOCV", K=5,
  149.                       control.learner = list(family=binomial),
  150.                       control.rose=list(p=0.4),seed=123, trace=TRUE)
  151. summary(perfmRose)  
  152. data("lalonde")
  153. View(lalonde)
  154. dim(lalonde)
  155. table(lalonde$treat)
  156. head(lalonde)
  157. mean(lalonde$re78[lalonde$treat== 1])
  158. mean(lalonde$re78[lalonde$treat== 0])
  159. scor<- glm(treat~ age + educ+ black+ hisp+ married+ nodegr, data= lalonde, family=binomial)
  160. summary(scor)
  161. efect<-Match(Y=lalonde$re78, Tr= lalonde$treat, X= scor$fitted, estimand = "ATT", M=1, replace= TRUE)
  162. summary(efect)
  163. MatchBalance(treat~ age + educ+ black+ hisp+ married+ nodegr, match.out= efect, data=lalonde, nboots = 200)
  164. qqplot(lalonde$age[efect$index.control], lalonde$age[efect$index.treated])
  165. abline(coef=c(0,1), col=2)
  166. scor<- glm(treat~ educ+ black+ hisp+ married+ nodegr, data= lalonde, family=binomial)
  167. efect<-Match(Y=lalonde$re78, Tr= lalonde$treat, X= scor$fitted, estimand = "ATT", M=1, replace= TRUE)
  168. summary(efect)
  169. # ne uitam cat de mult difera fata de primul
  170. MatchBalance(treat~ educ+ black+ hisp+ married+ nodegr, match.out= efect, data=lalonde, nboots = 200)
  171. qqplot(lalonde$educ[efect$index.control], lalonde$educ[efect$index.treated])
  172. abline(coef=c(0,1), col=2)
  173. #adaugam age si eliminam black
  174. scor<- glm(treat~ age+educ+ hisp+ married+ nodegr, data= lalonde, family=binomial)
  175. efect<-Match(Y=lalonde$re78, Tr= lalonde$treat, X= scor$fitted, estimand = "ATT", M=1, replace= TRUE)
  176. summary(efect)
  177. MatchBalance(treat~ age+educ+ hisp+ married+ nodegr, match.out= efect, data=lalonde, nboots = 200)
  178. qqplot(lalonde$hisp[efect$index.control], lalonde$hisp[efect$index.treated])
  179. abline(coef=c(0,1), col=2)
  180. #adaugam black si eliminam bhisp
  181. scor<- glm(treat~ age+educ+ black+ married+ nodegr, data= lalonde, family=binomial)
  182. efect<-Match(Y=lalonde$re78, Tr= lalonde$treat, X= scor$fitted, estimand = "ATT", M=1, replace= TRUE)
  183. summary(efect)
  184. MatchBalance(treat~ age+educ+ black+ married+ nodegr, match.out= efect, data=lalonde, nboots = 200)
  185. qqplot(lalonde$black[efect$index.control], lalonde$black[efect$index.treated])
  186. abline(coef=c(0,1), col=2)
  187. #adaugam black si eliminam hisp & age
  188. scor<- glm(treat~ educ+ black+ married+ nodegr, data= lalonde, family=binomial)
  189. efect<-Match(Y=lalonde$re78, Tr= lalonde$treat, X= scor$fitted, estimand = "ATT", M=1, replace= TRUE)
  190. summary(efect)
  191. MatchBalance(treat~ educ+ black+ married+ nodegr, match.out= efect, data=lalonde, nboots = 200)
  192. qqplot(lalonde$black[efect$index.control], lalonde$black[efect$index.treated])
  193. abline(coef=c(0,1), col=2)
  194. #adaugam black si eliminam hisp & age & educ
  195. scor<- glm(treat~ black+ married+ nodegr, data= lalonde, family=binomial)
  196. efect<-Match(Y=lalonde$re78, Tr= lalonde$treat, X= scor$fitted, estimand = "ATT", M=1, replace= TRUE)
  197. summary(efect)
  198. MatchBalance(treat~  black+ married+ nodegr, match.out= efect, data=lalonde, nboots = 200)
  199. qqplot(lalonde$black[efect$index.control], lalonde$black[efect$index.treated])
  200. abline(coef=c(0,1), col=2)
  201. #adaugam black si hisp si eliminam & age & educ
  202. scor<- glm(treat~ hisp+black+ married+ nodegr, data= lalonde, family=binomial)
  203. efect<-Match(Y=lalonde$re78, Tr= lalonde$treat, X= scor$fitted, estimand = "ATT", M=1, replace= TRUE)
  204. summary(efect)
  205. MatchBalance(treat~ hisp+black+ married+ nodegr, match.out= efect, data=lalonde, nboots = 200)
  206. qqplot(lalonde$black[efect$index.control], lalonde$black[efect$index.treated])
  207. abline(coef=c(0,1), col=2)
  208. #sa mai testam si chestia urmatoare, alta metoda gen
  209. library(rgenoud)
  210. x<-cbind(lalonde$hisp, lalonde$black, lalonde$married, lalonde$nodegr)
  211. genetic<- GenMatch(Tr=lalonde$treat, X=x, pop.size = 1000)
  212. #daca crestem sau scadem pop.size se modifica, cu cat e mai mare cu atat solutia e mai buna
  213. mgen1<-Match(Y=lalonde$re78, Tr=lalonde$treat, X=x, Weight.matrix = genetic)
  214. summary(mgen1)
  215. MatchBalance(treat~ hisp+black+ married+ nodegr, match.out= mgen1, data=lalonde, nboots = 200)
  216. # Analiza conjoint --------------------------------------------------------
  217. library(support.CEs)
  218. library(AlgDesign)
  219. ffd<-gen.factorial(c(2,2,2,2), varNames = c("Memorie","Rezolutie","Diagonala","Pret"),factors = "all")
  220. View(ffd)
  221. #eliminam 8(1112)si 9(2221)
  222. ffd<-ffd[-c(1,4,6,8,9,11,15,16),]r
  223. set.seed(100)
  224. des<-rotation.design(candidate.array = ffd, attribute.names = list(Memorie=c("sub 32","peste 32"), Rezolutie=c("sub 12","peste 12"),
  225.                                                                    Diagonala=c("sub 5.5", "peste 5.5"), Pret=c("sub 2000","peste 2000")),
  226.                      nalternatives = 2, nblocks=1, row.renames= FALSE, randomize= FALSE, seed=100)
  227. des
  228. desmat<-make.design.matrix(choice.experiment.design = des, optout = FALSE,
  229.                            categorical.attributes = c("Memorie","Rezolutie","Diagonala","Pret"),
  230.                            unlabeled = TRUE)
  231. q<-questionnaire(choice.experiment.design = des)
  232. rasp<-read.table("Raspunsuri.txt",header=TRUE, sep= "\t")
  233. View(rasp)
  234. dataset<- make.dataset(respondent.dataset = rasp, choice.indicators = c("I1","I2","I3","I4","I5","I6","I7","I8"),
  235.                        design.matrix = desmat)
  236. library(survival)
  237. rezultat<-clogit(RES~ASC+peste.32+peste.12+peste.5.5+peste.2000 + strata(STR),data=dataset)
  238. summary(rezultat)
  239. library(rpart)
  240. library(rpart.plot)
  241. library(ROSE)
  242. library(ROCR)
  243. library(memisc)
  244. a<- data.frame(as.data.set(spss.system.file("antrep.sav")))
  245. attach(a)
  246. China<-subset(a,country=="China")
  247. variable<-which(names(China)%in%c("bstart","gemwork3","gemhhinc","gemeduc","knowent","suskill","fearfail","gender","age9c"))
  248. variable                                    
  249. China<-China[,variable]
  250. China<-na.omit(China)
  251. China
  252. date.rose<-ROSE(bstart~.,data=China,p=0.4, seed = 123)$data #p arata cum reechilibrez
  253. model<-rpart(bstart~gemwork3+knowent+suskill, method="class", data=date.rose)
  254. summary(model)
  255. datarose<-ROSE(bstart~gemwork3+knowent+suskill , data=date.rose , p=0.4, seed=123)$data
  256. prp(model, type=3, extra=106, under=TRUE, box.palette = "BuPu")
  257. plotcp(model)
  258. printcp(model)
  259. arboref<-prune(model, cp=0.04)
  260. arboref
  261. prp(arboref)
  262. esantion<- sample(1:nrow(date.rose), 1130)
  263. train<-date.rose[esantion,]
  264. test<-date.rose[-esantion,]
  265.  
  266. p<- predict(model, test, type="prob")[,2] #prob - probabilitatea pe clasa 1 si pe clasa 2, [,2]- pentru a alege a 2a clasa, YES
  267. pred<-prediction(p, test$bstart)
  268. roc<-performance(pred, "tpr", "fpr")
  269. plot(roc, colorize = TRUE, lwd=5)
  270. performance(pred, "auc")  #auc=0.6749
  271. eval<-ROSE.eval(bstart~., data=country, learner = rpart,
  272.                 method.assess = "LKOCV", K=5, extr.pred = function(obj)obj[,2],
  273.                 control.rose = list(p=0.4),
  274.                 control.learner=list(control=list(cp=0.04)), seed=123, trace=TRUE)
  275. modell<-glm(bstart~ gemwork3+knowent+suskill,data=date.rose,family = "binomial")
  276. summary(modell)
  277. exp(coef(modell))
  278. library(ROCR)
  279. yhat<- predict(modell, type="response")
  280. pr<-prediction(yhat, date.rose$bstart, label.ordering = NULL)
  281. perf<-performance(pr, "tpr", "fpr")
  282. plot(perf, colorize= TRUE, lwd=5)
  283. performance(pr, "auc")
  284.  
  285. #arbore simplu
  286. model<-rpart(bstart~gemwork3+knowent+suskill, method="class", data=date.rose)
  287. p<- predict(model, test, type="prob")[,2] #prob - probabilitatea pe clasa 1 si pe clasa 2, [,2]- pentru a alege a 2a clasa, YES
  288. pred<-prediction(p, test$bstart)
  289. roc<-performance(pred, "tpr", "fpr")
  290. plot(roc, colorize = TRUE, lwd=5)
  291. performance(pred, "auc")
  292.  
  293.  
  294. predl<-predict(modell,test, type="response")
  295. predarb<-predict(model,test, type="prob")[,2]
  296. pred<-cbind(predl, predarb)
  297. prednot<-prediction(pred, cbind(test$bstart, test$bstart))
  298. perf<-performance(prednot,"tpr","fpr")
  299. plot(perf, col=as.list(1:2))
  300. legend("bottomright", legend=c("logistica","arbore"),pch=19,col=c("black","red"))
  301.  
  302.  
  303. # random forest  ----------------------------------------------------------
  304.  
  305. library(randomForest)
  306. View(train[,7])
  307. rf<-randomForest(x=train[,-7],y=train$bstart, importance= TRUE, ntree=1000, replace= TRUE,
  308.                  xtest=test[,-7], ytest=test$bstart)
  309.  
  310. rf$importance
  311. library(ROCR)
  312. p<- predict(rf, test, type="prob")#prob - probabilitatea pe clasa 1 si pe clasa 2, [,2]- pentru a alege a 2a clasa, YES
  313. pred<-prediction(p, test$bstart)
  314. roc<-performance(pred, "tpr", "fpr")
  315. plot(roc, colorize = TRUE, lwd=5)
  316. performance(pred, "auc")
  317. summary(rf)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement