Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # seminar 2 ---------------------------------------------------------------
- library(corrplot)
- library(ggplot2)
- library(Hmisc)
- library(PerformanceAnalytics)
- library( scatterplot3d)
- library(FactoMineR)
- library(factoextra)
- library(cluster)
- library(ca)
- datepath<-"D:\\cursuri\\master\\data mining\\proiect"
- date<-read.csv("indicatori.csv")
- date<-read.table(file.path(datepath,"indicatori.csv"),sep=",",header=TRUE, row.names=1)
- View(date)
- summary(date)
- a<-sd(date$I1)
- b<-sd(date$I2)
- c<-sd(date$I3)
- d<-sd(date$I4)
- e<-sd(date$I5)
- f<-sd(date$I6)
- g<-sd(date$I7)
- h<-sd(date$I8)
- s=c(a,b,c,d,e,f,g,h)
- View(s)
- boxplot(date,col=c("red","green","blue","yellow","ivory","pink","orange","purple"))
- corrplot( c2$r, type="upper", p.mat=c2$P, sig.level= 0.05, insig="blank")
- chart.Correlation(date,histogram=TRUE, pch=19)
- indic<- read.csv("indicatori.csv")
- ind<-indic[2:9]
- View(ind)
- acp1<- princomp( ind, cor= TRUE, scores= TRUE)
- summary(acp1)
- plot(acp1, type="l")
- biplot(acp1)
- acp1$loadings
- scoruri<- acp1$ scores
- scoruri
- acp2<-prcomp(ind, center= TRUE, scale= TRUE)
- summary(acp2)
- plot(acp2, type="l")
- biplot(acp2)
- scoruri2<- acp2$ scores
- cor(ind, scoruri2[,1:2])
- acp3<- PCA(ind)
- summary(acp3)
- fviz_pca_var(acp3,axes=c(1,2),col.var="contrib")
- plot(acp3,choix="ind")
- varimax<- varimax(acp2$rotation[,1:2])
- scorurirotite<- scale(ind)%*% varimax$loadings
- cor(ind,scorurirotite)
- plot(scorurirotite[,1],scorurirotite[,2])
- d<- dist(scorurirotite[,1:2],method = "euclidean")
- sol1<- hclust(d, method = "ward.D2")
- plot(sol1)
- sol1.2<-cutree(sol1, k=2)
- s1.2<-silhouette(sol1.2, d )
- plot(s1.2)
- sol2<-kmeans(scorurirotite[,1:2],2) # 2 de la final ne spune cate clustere alegem
- s2.2<-silhouette(sol2$cluster,d)
- plot(s2.2)
- fviz_cluster(list(data=scorurirotite[,1:2],cluster=sol2$cluster))# pt Kmeans
- fviz_cluster(list(data=scorurirotite[,1:2],cluster=cutree(sol1, k=2))) # ceva nu e in regula
- rasp<- read.csv("formular3.csv")
- View(rasp)
- attach(rasp)
- rasp2<- subset(rasp,Solutie2!="")
- View(rasp2)
- rasp2$Solutie1<- rasp2$Solutie2
- rasp3<- rbind(rasp,rasp2)
- View(rasp3)
- fix(rasp3)
- tabel<-table(rasp3$Solutie1, rasp3$durata)
- View(tabel)
- levels(rasp3$Solutie1)
- levels(rasp3$Solutie1)[levels (rasp3$Solutie1)%in%c("Tonomat")]<-"Cumpar de la supermarket"
- levels(rasp3$Solutie1)
- tabel<-table(rasp3$Solutie1, rasp3$durata)
- View(tabel)
- tabel<-tabel[-5,]
- View(tabel)
- corespond<- ca(tabel)
- summary(corespond)
- window()
- plot(corespond)
- library(ggplot2)
- library(memisc)
- library(ROCR)
- library(ROSE)
- # regresie logistica ------------------------------------------------------
- a<- data.frame(as.data.set(spss.system.file("antrep.sav")))
- View(a)
- attach(a)
- table(country)
- China<-subset(a,country=="China")
- table(China$bstart)
- variable<-which(names(China)%in%c("bstart","gemwork3","gemhhinc","gemeduc","knowent","suskill","fearfail","gender","age9c"))
- variable
- China<-China[,variable]
- China<-na.omit(China)
- China
- table(China$bstart)
- model1<-glm(bstart~gemwork3+gemhhinc+knowent+gender,data=China,family = "binomial")
- model1
- summary(model1)
- exp(coef(model1))
- table(China$gemeduc)
- levels(China$gemeduc)[levels(China$gemeduc) %in% c("NONE","SOME SECONDARY")] <- "level1"
- levels(China$gemeduc)[levels(China$gemeduc) %in% c("SECONDARY DEGREE")] <- "level2"
- levels(China$gemeduc)[levels(China$gemeduc) %in% c("POST SECONDARY","GRAD EXP")] <- "level3"
- table(China$gemeduc)
- table(China$age9c)
- levels(China$age9c)[levels(China$age9c) %in% c("0-17","18-24")]<- "categorie1"
- levels(China$age9c)[levels(China$age9c) %in% c("25-34","35-44","45-54")]<- "categorie2"
- levels(China$age9c)[levels(China$age9c) %in% c("55-64","65-120")]<- "categorie3"
- table(China$age9c)
- model3<-glm(bstart~gemwork3+knowent+suskill, data=China, family="binomial")
- summary(model3)
- exp(coef(model3))
- library(ROCR)
- yhat<- predict(model3, type="response")
- pr<-prediction(yhat, China$bstart, label.ordering = NULL)
- perf<-performance(pr, "tpr", "fpr")
- windows()
- plot(perf, colorize= TRUE, lwd=5)
- performance(pr, "auc")
- library(ROSE)
- table(China$bstart)
- date.rose<-ROSE(bstart~.,data=China,p=0.4, seed = 123) #p arata cum reechilibrez
- table(date.rose$data$bstart)
- library(Matching)
- date.rose<-ROSE(bstart~.,data=China,p=0.4, seed = 123)$data #p arata cum reechilibrez
- table(date.rose$bstart)
- model<-glm(bstart~ gemwork3+knowent+suskill,data=date.rose,family = "binomial")
- summary(model)
- exp(coef(model))
- library(ROCR)
- yhat<- predict(model, type="response")
- pr<-prediction(yhat, date.rose$bstart, label.ordering = NULL)
- perf<-performance(pr, "tpr", "fpr")
- plot(perf, colorize= TRUE, lwd=5)
- performance(pr, "auc")
- country<- bstart~gemwork3+knowent+suskill
- country1<-country[,c(1,2,4,7,8)] # aici am pastrat clasificatorii de mai sus, sunt cei folositi in model2
- View(country1)
- View(country)
- perfmRose<- ROSE.eval(bstart~., data= country1,
- learner = glm, method.assess = "LKOCV", K=5,
- control.learner = list(family=binomial),
- control.rose=list(p=0.4),seed=123, trace=TRUE)
- summary(perfmRose)
- data("lalonde")
- View(lalonde)
- dim(lalonde)
- table(lalonde$treat)
- head(lalonde)
- mean(lalonde$re78[lalonde$treat== 1])
- mean(lalonde$re78[lalonde$treat== 0])
- scor<- glm(treat~ age + educ+ black+ hisp+ married+ nodegr, data= lalonde, family=binomial)
- summary(scor)
- efect<-Match(Y=lalonde$re78, Tr= lalonde$treat, X= scor$fitted, estimand = "ATT", M=1, replace= TRUE)
- summary(efect)
- MatchBalance(treat~ age + educ+ black+ hisp+ married+ nodegr, match.out= efect, data=lalonde, nboots = 200)
- qqplot(lalonde$age[efect$index.control], lalonde$age[efect$index.treated])
- abline(coef=c(0,1), col=2)
- scor<- glm(treat~ educ+ black+ hisp+ married+ nodegr, data= lalonde, family=binomial)
- efect<-Match(Y=lalonde$re78, Tr= lalonde$treat, X= scor$fitted, estimand = "ATT", M=1, replace= TRUE)
- summary(efect)
- # ne uitam cat de mult difera fata de primul
- MatchBalance(treat~ educ+ black+ hisp+ married+ nodegr, match.out= efect, data=lalonde, nboots = 200)
- qqplot(lalonde$educ[efect$index.control], lalonde$educ[efect$index.treated])
- abline(coef=c(0,1), col=2)
- #adaugam age si eliminam black
- scor<- glm(treat~ age+educ+ hisp+ married+ nodegr, data= lalonde, family=binomial)
- efect<-Match(Y=lalonde$re78, Tr= lalonde$treat, X= scor$fitted, estimand = "ATT", M=1, replace= TRUE)
- summary(efect)
- MatchBalance(treat~ age+educ+ hisp+ married+ nodegr, match.out= efect, data=lalonde, nboots = 200)
- qqplot(lalonde$hisp[efect$index.control], lalonde$hisp[efect$index.treated])
- abline(coef=c(0,1), col=2)
- #adaugam black si eliminam bhisp
- scor<- glm(treat~ age+educ+ black+ married+ nodegr, data= lalonde, family=binomial)
- efect<-Match(Y=lalonde$re78, Tr= lalonde$treat, X= scor$fitted, estimand = "ATT", M=1, replace= TRUE)
- summary(efect)
- MatchBalance(treat~ age+educ+ black+ married+ nodegr, match.out= efect, data=lalonde, nboots = 200)
- qqplot(lalonde$black[efect$index.control], lalonde$black[efect$index.treated])
- abline(coef=c(0,1), col=2)
- #adaugam black si eliminam hisp & age
- scor<- glm(treat~ educ+ black+ married+ nodegr, data= lalonde, family=binomial)
- efect<-Match(Y=lalonde$re78, Tr= lalonde$treat, X= scor$fitted, estimand = "ATT", M=1, replace= TRUE)
- summary(efect)
- MatchBalance(treat~ educ+ black+ married+ nodegr, match.out= efect, data=lalonde, nboots = 200)
- qqplot(lalonde$black[efect$index.control], lalonde$black[efect$index.treated])
- abline(coef=c(0,1), col=2)
- #adaugam black si eliminam hisp & age & educ
- scor<- glm(treat~ black+ married+ nodegr, data= lalonde, family=binomial)
- efect<-Match(Y=lalonde$re78, Tr= lalonde$treat, X= scor$fitted, estimand = "ATT", M=1, replace= TRUE)
- summary(efect)
- MatchBalance(treat~ black+ married+ nodegr, match.out= efect, data=lalonde, nboots = 200)
- qqplot(lalonde$black[efect$index.control], lalonde$black[efect$index.treated])
- abline(coef=c(0,1), col=2)
- #adaugam black si hisp si eliminam & age & educ
- scor<- glm(treat~ hisp+black+ married+ nodegr, data= lalonde, family=binomial)
- efect<-Match(Y=lalonde$re78, Tr= lalonde$treat, X= scor$fitted, estimand = "ATT", M=1, replace= TRUE)
- summary(efect)
- MatchBalance(treat~ hisp+black+ married+ nodegr, match.out= efect, data=lalonde, nboots = 200)
- qqplot(lalonde$black[efect$index.control], lalonde$black[efect$index.treated])
- abline(coef=c(0,1), col=2)
- #sa mai testam si chestia urmatoare, alta metoda gen
- library(rgenoud)
- x<-cbind(lalonde$hisp, lalonde$black, lalonde$married, lalonde$nodegr)
- genetic<- GenMatch(Tr=lalonde$treat, X=x, pop.size = 1000)
- #daca crestem sau scadem pop.size se modifica, cu cat e mai mare cu atat solutia e mai buna
- mgen1<-Match(Y=lalonde$re78, Tr=lalonde$treat, X=x, Weight.matrix = genetic)
- summary(mgen1)
- MatchBalance(treat~ hisp+black+ married+ nodegr, match.out= mgen1, data=lalonde, nboots = 200)
- # Analiza conjoint --------------------------------------------------------
- library(support.CEs)
- library(AlgDesign)
- ffd<-gen.factorial(c(2,2,2,2), varNames = c("Memorie","Rezolutie","Diagonala","Pret"),factors = "all")
- View(ffd)
- #eliminam 8(1112)si 9(2221)
- ffd<-ffd[-c(1,4,6,8,9,11,15,16),]r
- set.seed(100)
- des<-rotation.design(candidate.array = ffd, attribute.names = list(Memorie=c("sub 32","peste 32"), Rezolutie=c("sub 12","peste 12"),
- Diagonala=c("sub 5.5", "peste 5.5"), Pret=c("sub 2000","peste 2000")),
- nalternatives = 2, nblocks=1, row.renames= FALSE, randomize= FALSE, seed=100)
- des
- desmat<-make.design.matrix(choice.experiment.design = des, optout = FALSE,
- categorical.attributes = c("Memorie","Rezolutie","Diagonala","Pret"),
- unlabeled = TRUE)
- q<-questionnaire(choice.experiment.design = des)
- rasp<-read.table("Raspunsuri.txt",header=TRUE, sep= "\t")
- View(rasp)
- dataset<- make.dataset(respondent.dataset = rasp, choice.indicators = c("I1","I2","I3","I4","I5","I6","I7","I8"),
- design.matrix = desmat)
- library(survival)
- rezultat<-clogit(RES~ASC+peste.32+peste.12+peste.5.5+peste.2000 + strata(STR),data=dataset)
- summary(rezultat)
- library(rpart)
- library(rpart.plot)
- library(ROSE)
- library(ROCR)
- library(memisc)
- a<- data.frame(as.data.set(spss.system.file("antrep.sav")))
- attach(a)
- China<-subset(a,country=="China")
- variable<-which(names(China)%in%c("bstart","gemwork3","gemhhinc","gemeduc","knowent","suskill","fearfail","gender","age9c"))
- variable
- China<-China[,variable]
- China<-na.omit(China)
- China
- date.rose<-ROSE(bstart~.,data=China,p=0.4, seed = 123)$data #p arata cum reechilibrez
- model<-rpart(bstart~gemwork3+knowent+suskill, method="class", data=date.rose)
- summary(model)
- datarose<-ROSE(bstart~gemwork3+knowent+suskill , data=date.rose , p=0.4, seed=123)$data
- prp(model, type=3, extra=106, under=TRUE, box.palette = "BuPu")
- plotcp(model)
- printcp(model)
- arboref<-prune(model, cp=0.04)
- arboref
- prp(arboref)
- esantion<- sample(1:nrow(date.rose), 1130)
- train<-date.rose[esantion,]
- test<-date.rose[-esantion,]
- p<- predict(model, test, type="prob")[,2] #prob - probabilitatea pe clasa 1 si pe clasa 2, [,2]- pentru a alege a 2a clasa, YES
- pred<-prediction(p, test$bstart)
- roc<-performance(pred, "tpr", "fpr")
- plot(roc, colorize = TRUE, lwd=5)
- performance(pred, "auc") #auc=0.6749
- eval<-ROSE.eval(bstart~., data=country, learner = rpart,
- method.assess = "LKOCV", K=5, extr.pred = function(obj)obj[,2],
- control.rose = list(p=0.4),
- control.learner=list(control=list(cp=0.04)), seed=123, trace=TRUE)
- modell<-glm(bstart~ gemwork3+knowent+suskill,data=date.rose,family = "binomial")
- summary(modell)
- exp(coef(modell))
- library(ROCR)
- yhat<- predict(modell, type="response")
- pr<-prediction(yhat, date.rose$bstart, label.ordering = NULL)
- perf<-performance(pr, "tpr", "fpr")
- plot(perf, colorize= TRUE, lwd=5)
- performance(pr, "auc")
- #arbore simplu
- model<-rpart(bstart~gemwork3+knowent+suskill, method="class", data=date.rose)
- p<- predict(model, test, type="prob")[,2] #prob - probabilitatea pe clasa 1 si pe clasa 2, [,2]- pentru a alege a 2a clasa, YES
- pred<-prediction(p, test$bstart)
- roc<-performance(pred, "tpr", "fpr")
- plot(roc, colorize = TRUE, lwd=5)
- performance(pred, "auc")
- predl<-predict(modell,test, type="response")
- predarb<-predict(model,test, type="prob")[,2]
- pred<-cbind(predl, predarb)
- prednot<-prediction(pred, cbind(test$bstart, test$bstart))
- perf<-performance(prednot,"tpr","fpr")
- plot(perf, col=as.list(1:2))
- legend("bottomright", legend=c("logistica","arbore"),pch=19,col=c("black","red"))
- # random forest ----------------------------------------------------------
- library(randomForest)
- View(train[,7])
- rf<-randomForest(x=train[,-7],y=train$bstart, importance= TRUE, ntree=1000, replace= TRUE,
- xtest=test[,-7], ytest=test$bstart)
- rf$importance
- library(ROCR)
- p<- predict(rf, test, type="prob")#prob - probabilitatea pe clasa 1 si pe clasa 2, [,2]- pentru a alege a 2a clasa, YES
- pred<-prediction(p, test$bstart)
- roc<-performance(pred, "tpr", "fpr")
- plot(roc, colorize = TRUE, lwd=5)
- performance(pred, "auc")
- summary(rf)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement