Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Pre-class work ----
# Toy dataset for demonstrating least-squares fitting with optim().
# Six points roughly on the line y = 2x - 1.
dat <- data.frame(
  x = c(1, 2, 3, 4, 5, 6),
  y = c(1, 3, 5, 6, 8, 12)
)
# Residual sum of squares for the line y = par[1] + par[2] * x.
#
# data: a data.frame with numeric columns `x` and `y`.
# par:  length-2 numeric vector c(intercept, slope).
# Returns the scalar sum of squared residuals; optim() minimises this.
min.RSS <- function(data, par) {
  fitted_y <- par[1] + par[2] * data$x
  residuals <- fitted_y - data$y
  sum(residuals^2)
}
# Fit the toy data by direct RSS minimisation (Nelder-Mead by default).
result <- optim(par = c(0, 1), min.RSS, data = dat)

# Repeat on the Lalonde data: minimising the RSS over (intercept, slope)
# should closely reproduce the OLS coefficients of re78 ~ educ.
library(Matching)
data(lalonde)
lalonde_xy <- data.frame(y = lalonde$re78, x = lalonde$educ)
optim(par = c(0, 1), min.RSS, data = lalonde_xy)
lm(re78 ~ educ, data = lalonde)
- 't
- Optimization:
- $par
- [1] 920.8695 429.5908
- $value
- [1] 19262169077
- $counts
- function gradient
- 123 NA
- $convergence
- [1] 0
- $message
- NULL
- Regression:
- lm(formula = re78 ~ educ, data = lalonde)
- Coefficients:
- (Intercept) educ
- 918.2 429.9
- 't
# "Claw" density: a broad two-component normal mixture plus five narrow
# spikes, giving a multimodal surface for exercising optimisers.
# Only the first element of `xx` is used, so the function is safe to hand
# to optim(), which passes the whole parameter vector.
claw <- function(xx) {
  pt <- xx[1]
  broad  <- 0.46 * (dnorm(pt, -1.0, 2.0 / 3.0) + dnorm(pt, 1.0, 2.0 / 3.0))
  narrow <- (1.0 / 300.0) *
    (dnorm(pt, -0.5, 0.01) + dnorm(pt, -1.0, 0.01) + dnorm(pt, -1.5, 0.01))
  spikes <- (7.0 / 300.0) *
    (dnorm(pt, 0.5, 0.07) + dnorm(pt, 1.0, 0.07) + dnorm(pt, 1.5, 0.07))
  broad + narrow + spikes
}
# Claw ----
# (Optional) sample points from the claw surface:
# clawx <- sample(-200:200, replace = TRUE)
# clawdata <- data.frame(y = sapply(clawx, claw), x = clawx)

# Negate claw so that *minimising* finds a *maximum* of the density.
invclaw <- function(x) {
  -claw(x)
}

# NOTE: the default Nelder-Mead method is documented (?optim) as unreliable
# for one-dimensional problems; the answer depends strongly on the starting
# value, which is exactly what these three runs demonstrate.
optim(par = -2, invclaw)
optim(par = 0, invclaw)
optim(par = 2, invclaw)

# optim()'s `control` argument must be a *named list*, so the unnamed
# numeric vector below was never valid; kept commented out for reference:
# optim(par = 2, claw, control = c(5, -1, 0.01, 1e-3, 100, 0, 1e-8, 0, 10, 1, 5, 1e7, 0, 10, 10))

# optimize() is the right 1-D tool: it searches the whole interval [-2, 2].
optimize(claw, c(-2, 2), maximum = TRUE)
# Contaminated data ----
set.seed(123)

# Clean sample: twelve points on the line y = -x + 10 plus N(0, 1) noise.
# The rnorm() call is identical to before, so the seed yields the same y.
x <- 1:12
y <- 10 - x + rnorm(12, sd = 1)

# OLS fit on the clean data: slope should be near -1, intercept near 10.
lm(y ~ x)
# Mean squared residual for the line y = par[1] * x + par[2].
#
# par: length-2 numeric vector c(slope, intercept).
# xs, ys: the data to fit. They default to the global `x` and `y` so the
#   existing call optim(par = c(1, 1), min.reg) keeps working, but making
#   them parameters removes the hidden dependence on globals and lets the
#   loss be reused on any dataset.
# Returns the scalar mean of squared residuals.
min.reg <- function(par, xs = x, ys = y) {
  mean((ys - (par[1] * xs + par[2]))^2)
}
# Minimise the mean squared residual starting from slope = 1, intercept = 1.
result.reg <- optim(c(1, 1), min.reg)
result.reg[["par"]]    # fitted (slope, intercept); compare with lm() above
result.reg[["value"]]  # mean squared residual at the optimum
# Contaminate the sample: append a single observation (21, 21) produced by
# a totally different data generating process (small 'data contamination').
x <- append(x, 21)
y <- append(y, 21)
- 't
- Use R to obtain the coefficients for the simple regression
- Use optim to reproduce those coefficients
- Plot the regression line
- lm1 <- lm(y ~ x)
- plot(x,y)
- abline(lm1, col = "blue", lwd = 3)
- Consider what is wrong with the regression line
- Use optim() to run a robust regression (robust to data contamination) that minimizes the median squared residual instead of the mean of the squared residuals. Try it with the default optim() method, and then experiment with different starting values. Then try it with the "SANN" method, repeating the process with different starting values.
- Using your best results, add a robust regression line to your plot using a different color than the one you used in (3) above.
- t'
Add Comment
Please, Sign In to add comment