Untitled

###### loading required packages
library(dplyr)
library(stargazer)
library(oaxaca)
library(knitr)
library(survey)
library(Hmisc)
library(matrixStats)
library(quantreg)

######removing previous global environment
rm(list=ls())

#############################################################################################################
#Loading Data
##############################################################################################################
pg <- src_postgres("datacube", user="bsagmeister", password="migrantwagegap", host="ineq.wu.ac.at", options="-c search_path=silc")

#h <- tbl(pg, "c13h")
d  <- tbl(pg, "c13d")
p <- tbl(pg, "c13p")
#r <- tbl(pg, "c13r")


#####################################################################################################################
#####Nur-Vollzeit, fuer den Fall, dass meine Vollzeit/Teilzeit Variable nicht passt
#silc1 <- p %>% filter(pb020=="AT") %>% select(pb010, pb020, pb030, pb040, pb140, pb150, pb190,pb210,pe040,pl060,
#                                              pl073,py010g, pl074,pl100,pl200,pl040,pl111,pl150,ph010,pb220a)
#silc2 <- r %>% filter(rb020=="AT") %>% select(rb030, rb220, rb230)
#silc1 <- collect(silc1, n=Inf)
#silc2 <- collect(silc2, n=Inf)
#silc <-left_join(silc1, silc2, by=c("pb030" = "rb030"))
#silc <- collect(silc, n=Inf)


#silc <- filter(silc, pl073 == 12)
#silc <- filter(silc, py010g > 0)
#silc["hwage"]<-silc$py010g/52/silc$pl060
#silc<-na.omit(silc)


####################################################################################################################
#####Herstellung der benoetigten Variablen
####################################################################################################################
#silc <- p %>% filter(pb020=="AT") %>% select(pb010, pb020, pb030, pb040, pb140, pb150, pb190,pb210,pe040,pl060,
#                                             pl073,py010g, pl074,pl100,pl200,pl040,pl111,pl150,ph010,pb220a)
#silc <- collect(silc, n=Inf)
silc1 <- p %>% filter(pb020=="AT") %>% select(pb010, pb020, pb030,px030, pb040, pb140, pb150, pb190,pb210,pe040,pl060,
                                              pl073,py010g, pl074,pl100,pl200,pl040,pl111,pl150,ph010,pb220a)
silc2 <- d %>% filter(db020=="AT") %>% select(db030, db100)
silc1 <- collect(silc1, n=Inf)
silc2 <- collect(silc2, n=Inf)
silc <-left_join(silc1, silc2, by=c("px030" = "db030"))
rm(silc1,silc2)


##################################################
#Stundenlohn: Vollzeit+Teilzeit
#################################################


silc <- filter(silc, py010g > 0)
silc <- filter(silc, pl040 == 3)
silc$pl060[is.na(silc$pl060)]<-0
silc$pl100[is.na(silc$pl100)]<-0
silc["tothr"]<-silc$pl060+silc$pl100
silc <- filter(silc, tothr > 0)
silc["totmnth"]<-silc$pl073+silc$pl074
silc <- filter(silc, totmnth > 0)
silc["hwage"]<-silc$py010g/((silc$pl073+silc$pl074)*4*(silc$pl060+silc$pl100))

summary(silc$hwage)
silc<-na.omit(silc)

##############################################
#Education-Dummies
#############################################

#-> alles niedriger (inkl.) lower second wird zu 2 zusammengefasst und reference category; upper second and non tertiary wird zu 3 zusammengefasst
silc$pe040[silc$pe040==0]<-2
silc$pe040[silc$pe040==1]<-2
silc$pe040[silc$pe040==4]<-3
silc$education <-factor(silc$pe040, levels=c("2","3","5"))


##########################################################
#Managerial Position Dummy
##########################################################

#### 0 wenn keine leitende Position, 1 wenn in einer leitenden Position

silc["position"]<-silc$pl150
silc$position[silc$position==2]<-0
silc$position<-as.numeric(silc$position)

###########################################################
#Health Dummy
##########################################################

silc["health"]<-silc$ph010
table(silc$health)
# absolute Mehrheit im Sample entweder "fair", "good" oder "very good", daher 0 -> Abweichung ("bad", "very bad") wird zu 1

silc$health[silc$health==1]<-0
silc$health[silc$health==2]<-0
silc$health[silc$health==3]<-0
silc$health[silc$health==4]<-1
silc$health[silc$health==5]<-1


#######################################################
#Age Variable herstellen
####################################################

silc["age"]<-2013-silc$pb140
summary(silc$age)


#####Variablen-umbenennen####
silc["exp"]<-silc$pl200
summary(silc$pl200)

##################################################
#Sex Dummy
################################################

silc$pb150[silc$pb150==1]<-0
silc$pb150[silc$pb150==2]<-1
silc["sex"]<-silc$pb150


###############################################
#Degree of Urbanisation
###############################################
silc["urb"]<-silc$db100
silc$urb <-factor(silc$db100, levels=c("1","2","3"))


####################################################
#Citizenship and Country of birth
####################################################

silc["loc"]<-silc$pb210
## wie viele ATs, EUs, OTHs?
loctable<-as.data.frame(table(silc$loc))
rm(loctable)
#AT+EU wird zusammengefasst
silc$loc[silc$loc=="LOC"]<-0
silc$loc[silc$loc=="EU"]<-0
silc$loc[silc$loc=="OTH"]<-1
silc$loc<-as.numeric(silc$loc)

silc["cit"]<-silc$pb220a
silc$cit[silc$cit=="LOC"]<-0
silc$cit[silc$cit=="EU"]<-0
silc$cit[silc$cit=="OTH"]<-1
silc$cit<-as.numeric(silc$cit)

#######################################################
#Migrants
######################################################
silc["mig"]<-silc$loc
silc$mig[silc$cit]<-0
silc$mig[silc$loc]<-0
silc$mig[silc$cit==1]<-1
silc$mig[silc$loc==1]<-1
mig<-as.data.frame(table(silc$mig))
table(mig)
rm(mig)
##############################################################################
#Neues Dataframe nur mit verwendeten Variablen
############################################################################
silc<-na.omit(silc)

sub.inl.aus<-data.frame(cbind(silc$sex,silc$age,silc$exp,silc$mig,silc$education,silc$hwage,silc$position,silc$health,silc$pb040,silc$urb))
colnames(sub.inl.aus)<-c("sex","age","exp","mig","education","hwage","position","health","weights","urb")
rm(silc)
sub.inl.aus<-na.omit(sub.inl.aus)


#####################################################################################
# OLS zur Analyse des Stundenlohns
##################################################################################

#################################
# OLS des gesamten Samples
################################

ols<-(lm(log(hwage)~sex+I(exp)+I((exp)^2)+as.factor(education)+mig+health+position+as.factor(urb), data=sub.inl.aus,weights=weights))

#ols<-(lm(log(hwage)~sex*I(exp)*I((exp)^2)*as.factor(education)*loc+health*position, data=sub.inl.aus,weights=weights))
summary(ols)
summary(sub.inl.aus$hwage)


#################################
# Herstellung zweier Samples mit entweder EU Citizens (inkl. AT) oder nur Dritt-Staat Citizens
#################################

ateu<-subset(sub.inl.aus, mig==0)
oth<-subset(sub.inl.aus,mig==1)


#################################
# OLS dieser beiden Samples
################################


olsateu<-(lm(log(hwage)~sex+I(exp)+I((exp)^2)+as.factor(education)+health+position+as.factor(urb), data=ateu,weights=weights))
summary(olsateu)

table(ateu$education)

olsoth<-(lm(log(hwage)~sex+I(exp)+I((exp)^2)+as.factor(education)+health+position+as.factor(urb), data=oth,weights=weights))
summary(olsoth)
table(oth$education)


#########################################################################################
# Oaxaca-Blinder Zerlegung des gesamten Samples
########################################################################################


in.aus<-oaxaca(hwage~sex+I(exp)+I((exp)^2)+as.factor(education)+position+health+as.factor(urb)|mig, data=sub.inl.aus)

#Ergebnisse
#Neumark Gewichtung -1

summary(in.aus)
in.aus$twofold$overall


#Plot
plot(in.aus, decomposition = "twofold", weight=-1)


##################################################################################################
#####Oaxaca-Blinder Zerlegung in den verschiedenen Quartilen
##################################################################################################


################################
###Herstellung von Quartilen
################################

wtd.quantile(ateu$hwage, weights=ateu$weights, probs=c(0, .25, .5, .75, 1),
             type=c('quantile','(i-1)/(n-1)','i/(n+1)','i/n'),
             normwt=FALSE, na.rm=TRUE)


wtd.mean(ateu$hwage, weights=ateu$weights, normwt="ignored", na.rm=TRUE)


wtd.quantile(oth$hwage, weights=oth$weights, probs=c(0, .25, .5, .75, 1),
             type=c('quantile','(i-1)/(n-1)','i/(n+1)','i/n'),
             normwt=FALSE, na.rm=TRUE)


wtd.mean(oth$hwage, weights=oth$weights, normwt="ignored", na.rm=TRUE)

#wtd.quantile(sub.inl.aus$hwage, weights=sub.inl.aus$weights, probs=c(0, .25, .5, .75, 1),
#type=c('quantile','(i-1)/(n-1)','i/(n+1)','i/n'),
#normwt=FALSE, na.rm=TRUE)


#wtd.mean(sub.inl.aus$hwage, weights=sub.inl.aus$weights, normwt="ignored", na.rm=TRUE)


quart1at<-subset(ateu, ateu$hwage <= 12.504891)
quart2at<-subset(ateu,  ateu$hwage > 12.504891 & ateu$hwage <= 17.358578)
quart3at<-subset(ateu,ateu$hwage > 17.358578 & ateu$hwage <= 23.845882)
quart4at<-subset(ateu, ateu$hwage >23.845882)


quart1oth<-subset(oth, oth$hwage <= 9.9241400)
quart2oth<-subset(oth, oth$hwage > 9.9241400 & oth$hwage <= 13.2664474)
quart3oth<-subset(oth,oth$hwage > 13.2664474 & oth$hwage <= 17.5470260)
quart4oth<-subset(oth, oth$hwage >17.5470260)

#Analyse des Stundenlohns
summary(ateu$hwage)
summary(oth$hwage)

wtd.mean(ateu$hwage, weights=ateu$weights, normwt="ignored", na.rm=TRUE)
wtd.mean(oth$hwage, weights=oth$weights, normwt="ignored", na.rm=TRUE)

#Zusammenf\FCgen der beiden Quartils-Gruppen

atoth1 <- rbind(quart1at, quart1oth)
atoth2 <- rbind(quart2at, quart2oth)
atoth3 <- rbind(quart3at, quart3oth)
atoth4 <- rbind(quart4at, quart4oth)


##################################
#Analyse 1. Quartile
##################################
in.aus1<-oaxaca(hwage~sex+I(exp)+I((exp)^2)+as.factor(education)+position+health+as.factor(urb)|mig, data=atoth1)

in.aus1$y
in.aus1$n
summary(in.aus1)
in.aus1$twofold$overall
plot(in.aus1, decomposition = "twofold", weight=-1)

table(atoth1$education)
table(atoth1$mig)

##################################
#Analyse 2. Quartile
##################################

in.aus2<-oaxaca(hwage~sex+I(exp)+I((exp)^2)+as.factor(education)+position+health+as.factor(urb)|mig, data=atoth2)
in.aus2$y
in.aus2$twofold$overall
summary(in.aus2)
plot(in.aus2, decomposition = "twofold", weight=-1)

table(atoth2$education)
table(atoth2$mig)

##################################
#Analyse 3. Quartile
##################################

in.aus3<-oaxaca(hwage~sex+I(exp)+I((exp)^2)+as.factor(education)+position+health+as.factor(urb)|mig, data=atoth3)
summary(in.aus3)
in.aus3$y
in.aus3$twofold$overall
plot(in.aus3, decomposition = "twofold", weight=-1)

table(atoth3$education)
table(atoth3$mig)

##################################
#Analyse 4. Quartile
##################################

in.aus4<-oaxaca(hwage~sex+I(exp)+I((exp)^2)+as.factor(education)+position+health+as.factor(urb)|mig, data=atoth4)
summary(in.aus4)
in.aus4$y
in.aus4$twofold$overall
plot(in.aus4, decomposition = "twofold", weight=-1)

table(atoth4$education)
table(atoth4$mig)


###########################################################################################################################
#oaxaca h\E4ndisch
#########################################################################################################################

##################
#Full Sample
##################


mean.inl<-data.frame(rbind(colWeightedMeans(ateu, w=ateu$weights)))
mean.aus<-data.frame(rbind(colWeightedMeans(oth, w=oth$weights)))
#lohngleichung definieren
lohnd<-as.formula(hwage~sex+exp+I((exp)^2)+education+position+health)
# lohngleichung schaetzen
mylm.inl<-lm(lohnd,data=ateu,weights=weights)
mylm.aus<-lm(lohnd,data=oth,weights=weights)

wage_i_i<-predict(mylm.inl,newdata=mean.inl,weights=weights)
wage_a_a<-predict(mylm.aus,newdata=mean.aus,weights=weights)

wage_i_a<-predict(mylm.aus,newdata=mean.inl,weights=weights)
wage_a_i<-predict(mylm.inl,newdata=mean.aus,weights=weights)

wagediffd<-wtd.mean(ateu$hwage, weights=ateu$weights, normwt="ignored",
                    na.rm=TRUE)-wtd.mean(oth$hwage, weights=oth$weights, normwt="ignored", na.rm=TRUE)
# Fall 1: Bewertung zu inl werten
erkl.1d<-wage_i_i-wage_a_i
unerkl.1d<-wage_a_i-wage_a_a
# Fall 2: Bewertung zu ausl werten
erkl.2d<-wage_i_a-wage_a_a
unerkl.2d<-wage_i_i-wage_i_a

dec=6 # dezimalstellen angeben
oaxacad<-as.data.frame(rbind(c("differenz",round(wagediffd,dec),round(wagediffd,dec)),c("charakteristika",round(erkl.1d,dec),round(erkl.2d,dec)),c("diskriminierung",round(unerkl.1d,dec),round(unerkl.2d,dec))))

# ergebnis anzeigen
stargazer(oaxacad,type="text",summary=F,rownames=F,colnames=F)


##############################
#1. Quartil
#############################
#ateu1<-subset(quart1, loc==0)
#oth1<-subset(quart1,loc==1)


mean.inl<-data.frame(rbind(colWeightedMeans(quart1at, w=quart1at$weights)))
mean.aus<-data.frame(rbind(colWeightedMeans(quart1oth, w=quart1oth$weights)))
#lohngleichung definieren
lohnd<-as.formula(hwage~sex+exp+I((exp)^2)+education+position+health)
# lohngleichung schaetzen
mylm.inl<-lm(lohnd,data=quart1at,weights=weights)
mylm.aus<-lm(lohnd,data=quart1oth,weights=weights)

wage_i_i<-predict(mylm.inl,newdata=mean.inl,weights=weights)
wage_a_a<-predict(mylm.aus,newdata=mean.aus,weights=weights)

wage_i_a<-predict(mylm.aus,newdata=mean.inl,weights=weights)
wage_a_i<-predict(mylm.inl,newdata=mean.aus,weights=weights)

wagediffd<-wtd.mean(quart1at$hwage, weights=quart1at$weights, normwt="ignored",
                    na.rm=TRUE)-wtd.mean(quart1oth$hwage, weights=quart1oth$weights, normwt="ignored", na.rm=TRUE)

wtd.mean(quart1at$hwage, weights=quart1at$weights, normwt="ignored",
         na.rm=TRUE)
wtd.mean(quart1oth$hwage, weights=quart1oth$weights, normwt="ignored", na.rm=TRUE)

# Fall 1: Bewertung zu inl werten
erkl.1d<-wage_i_i-wage_a_i
unerkl.1d<-wage_a_i-wage_a_a
# Fall 2: Bewertung zu ausl werten
erkl.2d<-wage_i_a-wage_a_a
unerkl.2d<-wage_i_i-wage_i_a

dec=6 # dezimalstellen angeben
oaxacad1<-as.data.frame(rbind(c("differenz",round(wagediffd,dec),round(wagediffd,dec)),c("charakteristika",round(erkl.1d,dec),round(erkl.2d,dec)),c("diskriminierung",round(unerkl.1d,dec),round(unerkl.2d,dec))))

# ergebnis anzeigen
stargazer(oaxacad1,type="text",summary=F,rownames=F,colnames=F)

###############################
#2. Quartil
###############################

mean.inl<-data.frame(rbind(colWeightedMeans(quart2at, w=quart2at$weights)))
mean.aus<-data.frame(rbind(colWeightedMeans(quart2oth, w=quart2oth$weights)))
#lohngleichung definieren
lohnd<-as.formula(hwage~sex+exp+I((exp)^2)+education+position+health)
# lohngleichung schaetzen
mylm.inl<-lm(lohnd,data=quart2at,weights=weights)
mylm.aus<-lm(lohnd,data=quart2oth,weights=weights)

wage_i_i<-predict(mylm.inl,newdata=mean.inl,weights=weights)
wage_a_a<-predict(mylm.aus,newdata=mean.aus,weights=weights)

wage_i_a<-predict(mylm.aus,newdata=mean.inl,weights=weights)
wage_a_i<-predict(mylm.inl,newdata=mean.aus,weights=weights)

wagediffd<-wtd.mean(quart2at$hwage, weights=quart2at$weights, normwt="ignored",
                    na.rm=TRUE)-wtd.mean(quart2oth$hwage, weights=quart2oth$weights, normwt="ignored", na.rm=TRUE)
# Fall 1: Bewertung zu inl werten
erkl.1d<-wage_i_i-wage_a_i
unerkl.1d<-wage_a_i-wage_a_a
# Fall 2: Bewertung zu ausl werten
erkl.2d<-wage_i_a-wage_a_a
unerkl.2d<-wage_i_i-wage_i_a

dec=6 # dezimalstellen angeben
oaxacad2<-as.data.frame(rbind(c("differenz",round(wagediffd,dec),round(wagediffd,dec)),c("charakteristika",round(erkl.1d,dec),round(erkl.2d,dec)),c("diskriminierung",round(unerkl.1d,dec),round(unerkl.2d,dec))))

# ergebnis anzeigen
stargazer(oaxacad2,type="text",summary=F,rownames=F,colnames=F)

##############################
#3. Quartil
#############################
mean.inl<-data.frame(rbind(colWeightedMeans(quart3at, w=quart3at$weights)))
mean.aus<-data.frame(rbind(colWeightedMeans(quart3oth, w=quart3oth$weights)))
#lohngleichung definieren
lohnd<-as.formula(hwage~sex+exp+I((exp)^2)+education+position+health)
# lohngleichung schaetzen
mylm.inl<-lm(lohnd,data=quart3at,weights=weights)
mylm.aus<-lm(lohnd,data=quart3oth,weights=weights)

wage_i_i<-predict(mylm.inl,newdata=mean.inl,weights=weights)
wage_a_a<-predict(mylm.aus,newdata=mean.aus,weights=weights)

wage_i_a<-predict(mylm.aus,newdata=mean.inl,weights=weights)
wage_a_i<-predict(mylm.inl,newdata=mean.aus,weights=weights)

wagediffd<-wtd.mean(quart3at$hwage, weights=quart3at$weights, normwt="ignored",
                    na.rm=TRUE)-wtd.mean(quart3oth$hwage, weights=quart3oth$weights, normwt="ignored", na.rm=TRUE)
# Fall 1: Bewertung zu inl werten
erkl.1d<-wage_i_i-wage_a_i
unerkl.1d<-wage_a_i-wage_a_a
# Fall 2: Bewertung zu ausl werten
erkl.2d<-wage_i_a-wage_a_a
unerkl.2d<-wage_i_i-wage_i_a

dec=6 # dezimalstellen angeben
oaxacad3<-as.data.frame(rbind(c("differenz",round(wagediffd,dec),round(wagediffd,dec)),c("charakteristika",round(erkl.1d,dec),round(erkl.2d,dec)),c("diskriminierung",round(unerkl.1d,dec),round(unerkl.2d,dec))))

# ergebnis anzeigen
stargazer(oaxacad3,type="text",summary=F,rownames=F,colnames=F)


##############################
#4. Quartil
##############################
mean.inl<-data.frame(rbind(colWeightedMeans(quart4at, w=quart4at$weights)))
mean.aus<-data.frame(rbind(colWeightedMeans(quart4oth, w=quart4oth$weights)))
#lohngleichung definieren
lohnd<-as.formula(hwage~sex+exp+I((exp)^2)+education+position+health)
# lohngleichung schaetzen
mylm.inl<-lm(lohnd,data=quart4at,weights=weights)
mylm.aus<-lm(lohnd,data=quart4oth,weights=weights)

wage_i_i<-predict(mylm.inl,newdata=mean.inl,weights=weights)
wage_a_a<-predict(mylm.aus,newdata=mean.aus,weights=weights)

wage_i_a<-predict(mylm.aus,newdata=mean.inl,weights=weights)
wage_a_i<-predict(mylm.inl,newdata=mean.aus,weights=weights)

wagediffd<-wtd.mean(quart4at$hwage, weights=quart4at$weights, normwt="ignored",
                    na.rm=TRUE)-wtd.mean(quart4oth$hwage, weights=quart4oth$weights, normwt="ignored", na.rm=TRUE)
# Fall 1: Bewertung zu inl werten
erkl.1d<-wage_i_i-wage_a_i
unerkl.1d<-wage_a_i-wage_a_a
# Fall 2: Bewertung zu ausl werten
erkl.2d<-wage_i_a-wage_a_a
unerkl.2d<-wage_i_i-wage_i_a

dec=6 # dezimalstellen angeben
oaxacad4<-as.data.frame(rbind(c("differenz",round(wagediffd,dec),round(wagediffd,dec)),c("charakteristika",round(erkl.1d,dec),round(erkl.2d,dec)),c("diskriminierung",round(unerkl.1d,dec),round(unerkl.2d,dec))))

# ergebnis anzeigen
stargazer(oaxacad4,type="text",summary=F,rownames=F,colnames=F)


################################################################################################
#Producing Tables
################################################################################################
summary(ols)

stargazer(ols,title= "Wage Regression Full Sample", type = "latex", style="qje",single.row=TRUE,align=TRUE,
          dep.var.labels = "Effect on hourly Wage",
          covariate.labels = c("Sex","Experience","Experience Squared","Medium Education",
                                "High Education","Migrant","Bad Health","Managerial Position","Intermediate Area","Rural Area"))
summary(olsateu)

stargazer(olsateu, olsoth,title= "Wage Regression Split", type = "latex", style="qje",single.row=TRUE,align=TRUE,
          dep.var.labels = "Effect on hourly Wage",
          covariate.labels = c("Sex","Experience","Experience Squared","Medium Education",
                               "High Education","Bad Health","Managerial Position","Intermediate Area","Rural Area"))


hwage<-sub.inl.aus$hwage
sumhwage<-(summary(hwage))

stargazer(sub.inl.aus, title="Hourly Wages (Full Sample)", type="latex", style="qje",
          summary.stat=c("n","min","median","mean","max"))