Untitled

# Function to make ML estimate of EPP rate p plus 95% profile likelihood confidence intervals,
# taking into account that for pairs with mismatches multiple EPP events could have occured
#
# input is
#     x=vector of booleans or 0 and 1s specifying if there was a mismatch or not (1 or TRUE = mismatch), n=vector with nr of meioses per pair
# optional arguments:
#     plotit = make plot of log likelihood and Chi squared statistic as a function of p
#     pmax=max EPP prob in plots
# output is named list with est = ML estimate and LCL and UCL = lower and upper 95% C.L.s of EPP rate

estimateP = function(x, n , plotit=TRUE, pmax=0.05) {
  if (!is.logical(x[[1]])) x = (x==1)
  logLp = function(p, x, n)  sum((log(1 - (1-p)^n))[x]) + sum((n*log(1-p))[!x]) # see http://stats.stackexchange.com/questions/152111/censored-binomial-model-log-likelihood
  Chisq = function (p0, phat, x, n) 2*(logLp(phat, x, n) - logLp(p0, x, n)) # chi square LRT statistic of model with p0 vs one with best ML estimate phat
  phat = optimize(function(p) -2*logLp(p, x, n), lower=1e-16, upper=1-1e-16, tol=1e-8)$minimum # ML estimate of p
  # or using f <- function(q, x, n) sum((n / (1 - q^n))[x]) - sum(n)
  # phat = 1-optimize(function(q) f(q, x, n)^2, lower=1e-16, upper=1-1e-16, tol=1e-8)$minimum
  pUCL = optimize(function(p0) (Chisq(p0, phat, x, n)-qchisq(0.95, df = 1))^2, lower=phat, upper=1-1e-16, tol=1e-16)$minimum
  pLCL = optimize(function(p0) (Chisq(p0, phat, x, n)-qchisq(0.95, df = 1))^2, lower=1e-16, upper=phat, tol=1e-16)$minimum
  if (plotit) { pvals = seq(0, pmax, length.out=10000)
                plot(pvals, sapply(pvals,function(p) logLp(p, x, n)), type="l", ylab="Log likelihood", xlab="p")
                plot(pvals, sapply(pvals,function(p0) Chisq(p0, phat, x, n)), type="l", ylab="Chi square LRT statistic", xlab="p", ylim=c(0,qchisq(0.95, df = 1)))
                abline(h = qchisq(0.95, df = 1), col="red", lty=2) # corresponding to 95% C.L.s on p
  }

  return(list(est=phat, LCL=pLCL, UCL=pUCL))
}

n = c(7, 7, 7, 7, 7, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 13, 13, 13, 13, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 18, 18, 19, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 23, 23, 24, 24, 25, 27, 31) # number of generations/meioses that separate presumed common paternal ancestor
x = c(rep(0,6), 1, rep(0,7), 1, 1, 1, 0, 1, rep(0,20), 1, rep(0,13), 1, 1, rep(0,5)) # whether pair of individuals had non-matching Y chromosomal genotypes

est = estimateP(x,n)
c(est$est, est$LCL, est$UCL)*100
0.9096007 0.4165108 1.6877681

# ML estimation, assuming that EPP rate p shows a temporal trend
# where log(p/(1-p))=a+b*n
# here we make a ML estimate of parameters a and b
# and we use package bbmle for convenience

estimatePtemp = function(x, n) {
  if (!is.logical(x[[1]])) x = (x==1)
  pfun = function(a, b, n) exp(a+b*n)/(1+exp(a+b*n)) # we now write p as a function of a, b and n
  logL = function(a, b, x, n)  sum((log(1 - (1-pfun(a, b, n))^n))[x]) +
    sum((n*log(1-pfun(a, b, n)))[!x]) # a and b are params to be estimated, modified from http://stats.stackexchange.com/questions/152111/censored-binomial-model-log-likelihood
  neglogL = function(a, b, x, n)  -logL(a, b, x, n) # negative log-likelihood
  require(bbmle)
  fit = mle2(neglogL, start=list(a=-3, b=-0.1), data=list(x=x, n=n))
  return(fit)
}

# fitted coefficients
estfit = estimatePtemp(x, n)
cbind(coef(estfit),confint(estfit)) # parameter estimates and profile likelihood confidence intervals
#                      2.5 %      97.5 %
#   a -3.09054167 -5.3191406 -1.12078519
#   b -0.09870851 -0.2396262  0.02848305
summary(estfit)
# Coefficients:
#      Estimate Std. Error z value    Pr(z)
#   a -3.090542   1.057382 -2.9228 0.003469 **
#   b -0.098709   0.067361 -1.4654 0.142819

pfun = function(a, b, n) exp(a+b*n)/(1+exp(a+b*n))
xvals=1:max(n)
par(mfrow=c(1,1))
plot(xvals,sapply(xvals,function (n) pfun(coef(estfit)[1], coef(estfit)[2], n)),
     type="l", xlab="Generations ago (n)", ylab="EPP rate (p)")