---
title: "Solutions"
output:
  html_document:
    toc: yes
---

## Required packages

```{r, message=FALSE}
library(GENESIS)
library(SeqArray)
library(SeqVarTools)
library(dplyr)
library(ggplot2)
library(Biobase)
```

```{r}
# make qqPlot function
qqPlot <- function(pval) {
    pval <- pval[!is.na(pval)]
    n <- length(pval)
    x <- 1:n
    dat <- data.frame(obs=sort(pval),
                      exp=x/n,
                      upper=qbeta(0.025, x, rev(x)),
                      lower=qbeta(0.975, x, rev(x)))

    ggplot(dat, aes(-log10(exp), -log10(obs))) +
        geom_line(aes(-log10(exp), -log10(upper)), color="gray") +
        geom_line(aes(-log10(exp), -log10(lower)), color="gray") +
        geom_point() +
        geom_abline(intercept=0, slope=1, color="red") +
        xlab(expression(paste(-log[10], "(expected P)"))) +
        ylab(expression(paste(-log[10], "(observed P)"))) +
        theme_bw()
}
```

## GDS format

1. Set a filter selecting only multi-allelic variants. Inspect their genotypes using the different methods you learned above. Use the `alleleDosage` method to find the dosage for the second (and third, etc.) alternate allele.
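As a quick sanity check of the `qqPlot` helper defined above, it can be run on simulated null p-values; uniform p-values should track the red identity line and stay inside the gray confidence band. This uses only base R plus the helper:

```r
# p-values simulated under the null are uniform on [0, 1],
# so the observed quantiles should follow the expected quantiles
set.seed(1)
qqPlot(runif(1000))
```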
```{r exercise_gds}
# open a connection to the GDS file again
gdsfile <- "AnalysisFiles/1KG_phase3_subset_chr1.gds"
gdsfmt::showfile.gds(closeall=TRUE) # make sure file is not already open
gds <- seqOpen(gdsfile)
```

```{r}
# set your filter
n <- seqNumAllele(gds)
multi.allelic <- which(n > 2)
seqSetFilter(gds, variant.sel=multi.allelic)
```

```{r}
geno <- seqGetData(gds, "genotype")
dim(geno)
geno[,1:5,1:5]
```

```{r}
geno <- getGenotype(gds)
dim(geno)
head(geno)
```

```{r}
geno <- getGenotypeAlleles(gds)
head(geno)
```

```{r}
# count of reference alleles
dos <- refDosage(gds)
head(dos)
```

```{r}
# count of *any* alternate alleles
dos <- altDosage(gds)
head(dos)
```

```{r}
# count of the first alternate allele
dos <- alleleDosage(gds, n=1)
head(dos)
```

```{r}
# count of the third alternate allele
dos <- alleleDosage(gds, n=3)
head(dos)
```

```{r}
# count of *each* of the alternate alleles
# returns multiple columns per variant
dos <- expandedAltDosage(gds)
head(dos)
```

2. Use the `hwe` function in SeqVarTools to run a Hardy-Weinberg Equilibrium test on each variant. Identify a variant with a low p-value and inspect its genotypes. (Note that the HWE test is only valid for biallelic variants, and will return `NA` for multiallelic variants.)

```{r exercise_hwe}
# reset the filter to all variants
seqResetFilter(gds)
```

```{r}
# run HWE test
hwe.res <- hwe(gds)

# identify variants with small p-values
lowp <- !is.na(hwe.res$p) & hwe.res$p < 1e-4
head(hwe.res[lowp,])
```

```{r}
# look at the results for the most significant variant
minp <- which.min(hwe.res$p)
hwe.res[minp,]
```

```{r}
# look at the genotypes of the most significant variant
# (use its variant.id, since which.min returns a row index)
seqSetFilter(gds, variant.id=hwe.res$variant.id[minp])
table(getGenotype(gds))
table(altDosage(gds))
```

```{r}
seqClose(gds)
```

## Association tests - Part I

1. As discussed in the lecture, we recommend a fully adjusted two-stage inverse Normalization procedure for fitting the null model when phenotypes have non-Normal distributions.
Using the `two.stage` option in `fitNullModel`, fit a two-stage null model. Compare these residuals with the residuals from the original null model.

To run the fully adjusted two-stage null model, we simply set the `two.stage` option to `TRUE`. The `norm.option` parameter determines if the inverse Normalization should be done with all samples together (`"all"`) or within each `group.var` group separately (`"by.group"`).

```{r}
# load the sample annotation data
annot <- get(load("AnalysisFiles/sample_phenotype_annotation.RData"))
annot
```

```{r null_model_two_stage}
nullmod.twostage <- fitNullModel(annot,
                                 outcome="height",
                                 covars=c("sex", "age", "study"),
                                 group.var="study",
                                 two.stage=TRUE,
                                 norm.option="all",
                                 verbose=FALSE)
save(nullmod.twostage, file="OutputFiles/null_model_two_stage.RData")
```

```{r}
# description of the model we fit
nullmod.twostage$model
```

Compare the marginal residuals:

```{r}
# load the original null model
nullmod <- get(load("AnalysisFiles/null_model.RData"))
```

```{r}
# merge the data for plotting
pdat <- merge(nullmod$fit, nullmod.twostage$fit,
              by='sample.id', suffixes=c('.orig', '.twostage'))
pdat <- merge(pdat, pData(annot), by='sample.id')
```

```{r}
# distribution of residuals - original null model
ggplot(pdat, aes(x=resid.marginal.orig)) +
    geom_density(aes(color=study)) +
    geom_density(size=2)
```

```{r}
# distribution of residuals - two-stage null model
ggplot(pdat, aes(x=resid.marginal.twostage)) +
    geom_density(aes(color=study)) +
    geom_density(size=2)
```

```{r}
# compare residuals
ggplot(pdat, aes(x=resid.marginal.orig, y=resid.marginal.twostage, color=study)) +
    geom_point() +
    geom_abline(intercept=0, slope=1)
```

There is not much difference in the residuals here because the distribution of height is not far from Normal to begin with. See [Sofer et al.](https://onlinelibrary.wiley.com/doi/10.1002/gepi.22188) for more information on the fully adjusted two-stage model.

2.
GENESIS also supports testing binary (e.g. case/control) outcomes. We can fit a null model using logistic regression by specifying the argument `family=binomial` in the `fitNullModel` function. Use the `status` column in the sample annotation to fit a null model for simulated case/control status, with `sex` and `Population` as covariates. Run single-variant association tests using this model and make a QQ plot of all variants with MAC >= 5.

When testing binary outcomes, we should fit our null model using logistic regression. To do so, we simply set the argument `family=binomial` in `fitNullModel`. Note that the parameter `group.var` is no longer relevant here, as the logistic model specifies the mean-variance relationship.

```{r}
# load the sample annotation data
annot <- get(load("AnalysisFiles/sample_phenotype_annotation.RData"))

# load and prepare the genotype data
gdsfile <- "AnalysisFiles/1KG_phase3_subset_chr1.gds"
gdsfmt::showfile.gds(closeall=TRUE) # make sure file is not already open
gds <- seqOpen(gdsfile)
seqData <- SeqVarData(gds, sampleData=annot)
iterator <- SeqVarBlockIterator(seqData, verbose=FALSE)
```

```{r exercise_logistic}
# fit the null model with logistic regression
nullmod.status <- fitNullModel(annot,
                               outcome="status",
                               covars=c("sex", "Population"),
                               family=binomial,
                               verbose=FALSE)
```

```{r}
# run the single-variant association test
assoc.status <- assocTestSingle(iterator, nullmod.status, test="Score")
```

```{r}
head(assoc.status)
```

```{r}
# make a QQ plot
qqPlot(assoc.status$Score.pval[assoc.status$MAC >= 5])
```

Extra: in samples with highly imbalanced case:control ratios, the Score test can perform poorly for low-frequency variants. Saddlepoint approximation (SPA) can be used to improve p-value calculations, and is available in GENESIS by setting the argument `test="Score.SPA"` in `assocTestSingle`.
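As a minimal sketch of what that looks like, assuming the `iterator` and `nullmod.status` objects from the chunks above are still available (the iterator must be reset first, because the previous association scan exhausted it):

```r
# reset the iterator to the first block of variants
resetIterator(iterator, verbose=FALSE)

# request saddlepoint-approximated p-values instead of the normal approximation
assoc.spa <- assocTestSingle(iterator, nullmod.status, test="Score.SPA")
head(assoc.spa)
```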
See [Dey et al.](https://www.cell.com/ajhg/fulltext/S0002-9297(17)30201-X) and [Zhou et al.](https://www.nature.com/articles/s41588-018-0184-y) for details on using SPA in GWAS.

```{r}
seqClose(seqData)
```

## Association tests - Part II

1. Perform a sliding window SKAT test for the outcome status. Adjust your model for the covariates sex and study. When performing your SKAT test, use all variants with alternate allele frequency < 20%, and use the Wu weights to give larger weights to rarer variants. Use the same `windowSize` and `windowShift` as in the examples. How many windows have >1 variant? Make a QQ plot of the SKAT p-values.

The first step is to fit our null model -- since our outcome, status, is a binary variable, we must fit a logistic regression null model using the `family=binomial` argument. The second step is to create our `SeqVarWindowIterator` object. The third step is to perform the SKAT test using `assocTestAggregate` -- we can set the maximum alternate allele frequency with the `AF.max` argument, and we can set the variant weights with the `weight.beta` argument.

```{r}
# load the sample annotation data
annot <- get(load("AnalysisFiles/sample_phenotype_annotation.RData"))

# load and prepare the genotype data
gdsfile <- "AnalysisFiles/1KG_phase3_subset_chr1.gds"
gdsfmt::showfile.gds(closeall=TRUE) # make sure file is not already open
gds <- seqOpen(gdsfile)
seqData <- SeqVarData(gds, sampleData=annot)
```

```{r exercise_sliding}
nullmod.status <- fitNullModel(annot,
                               outcome="status",
                               covars=c("sex", "study"),
                               family=binomial,
                               verbose=FALSE)
```

```{r}
iterator <- SeqVarWindowIterator(seqData, windowSize=10000, windowShift=5000, verbose=FALSE)

# run the SKAT test
assoc <- assocTestAggregate(iterator, nullmod.status, test="SKAT",
                            AF.max=0.2, weight.beta=c(1,25), verbose=FALSE)
```

```{r}
# results for each window
head(assoc$results)
```

```{r}
# how many variants in each window?
table(assoc$results$n.site)
```

```{r}
# variant details for windows with > 1 variant
idx <- which(assoc$results$n.site > 1)
head(assoc$variantInfo[idx])
```

```{r}
# make a QQ plot of the SKAT test p-values
qqPlot(assoc$results$pval)
```

```{r}
seqClose(seqData)
```

## Ancestry and Relatedness Inference

1. Perform a second PC-AiR analysis, this time using the PC-Relate kinship matrix as the kinship estimates (you should still use the KING-robust matrix for the ancestry divergence estimates). How does the unrelated set compare to the first PC-AiR analysis?

We run the second PC-AiR analysis the same as the first, except using the PC-Relate kinship matrix we created above (`pcrelMat`) as the input to the parameter `kinobj` -- this means that we are using the PC-Relate estimates instead of the KING estimates to identify related pairs of samples. In the solution shown here, we also demonstrate that a `SeqVarData` object can be used as input, but we need to specify the variants to use in the analysis using the `snp.include` parameter.
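Before running the code, it may help to see where the `kin.thresh=2^(-9/2)` value used in the `pcair` call comes from: it sits halfway on the log2 scale between the expected kinship of 3rd-degree (2^-4) and 4th-degree (2^-5) relative pairs, so pairs above it are flagged as 3rd-degree relatives or closer. A quick base-R check:

```r
# expected kinship coefficients by degree of relatedness
c(first = 2^-2, second = 2^-3, third = 2^-4, fourth = 2^-5)

# the PC-AiR threshold (~0.044): pairs with estimated kinship above this
# value are treated as related
2^(-9/2)
```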
```{r}
# load and prepare genotype data
gdsfile <- "AnalysisFiles/1KG_phase3_subset.gds"
gds <- seqOpen(gdsfile)
seqData <- SeqVarData(gds)

# load list of LD pruned variants
pruned <- get(load("AnalysisFiles/ld_pruned_variants.RData"))

# load the KING robust kinship estimates
kingMat <- get(load("AnalysisFiles/king_matrix.RData"))

# load the PC-Relate kinship estimates from the first iteration
pcrelMat <- get(load("AnalysisFiles/pcrelate_matrix_round1.RData"))
```

```{r pcair2}
# run PC-AiR
pca2 <- pcair(seqData,
              kinobj=pcrelMat,
              kin.thresh=2^(-9/2),
              divobj=kingMat,
              div.thresh=-2^(-9/2),
              snp.include=pruned)
```

```{r}
names(pca2)
```

```{r}
# the unrelated set of samples
length(pca2$unrels)

# the related set of samples
length(pca2$rels)
```

```{r}
# extract the top 10 PCs and make a data.frame
pcs2 <- data.frame(pca2$vectors[,1:10])
colnames(pcs2) <- paste0('PC', 1:10)
pcs2$sample.id <- pca2$sample.id
head(pcs2)
save(pcs2, file="OutputFiles/pcs.RData")
```

We see that there are now 1,070 samples in the unrelated set, as opposed to 1,040 in the first PC-AiR analysis. This indicates that KING-robust likely overestimated relatedness for some pairs due to bias from ancestry admixture.

2. Make a parallel coordinates plot of the top 10 PC-AiR PCs. How does this compare to the plot from the first iteration? How many PCs seem to reflect ancestry?

We can reuse the code from above to make the parallel coordinates plot.
```{r pcair2_parcoord, message = FALSE}
library(Biobase)
sampfile <- "AnalysisFiles/sample_annotation.RData"
annot <- get(load(sampfile))

library(dplyr)
annot <- pData(annot) %>% dplyr::select(sample.id, Population)

# join the PCs (columns PC1-PC10 plus sample.id) with the population labels
pc.df <- left_join(pcs2, annot, by="sample.id")

library(GGally)
library(RColorBrewer)
pop.cols <- setNames(brewer.pal(12, "Paired"),
                     c("ACB", "ASW", "CEU", "GBR", "CHB", "JPT",
                       "CLM", "MXL", "LWK", "YRI", "GIH", "PUR"))
ggparcoord(pc.df, columns=1:10, groupColumn="Population", scale="uniminmax") +
    scale_color_manual(values=pop.cols) +
    xlab("PC") + ylab("")
```

The plot looks a bit cleaner than the one from the first PC-AiR analysis. Clearly, PCs 1-4 are reflective of ancestry here. In the prior analysis, PC7 also seemed to pick up some component of ancestry.

3. Perform a second PC-Relate analysis, this time using the new PC-AiR PCs to adjust for ancestry. Make a hexbin plot of estimated kinship vs IBD0.

We run the second PC-Relate analysis the same as the first, except using the new PC-AiR PCs that we just generated to adjust for ancestry, and using the new PC-AiR unrelated set as our `training.set`.

```{r pcrelate2}
# filter the GDS object to our LD-pruned variants
seqSetFilter(seqData, variant.id=pruned)
iterator <- SeqVarBlockIterator(seqData, verbose=FALSE)

# run PC-Relate
pcrel2 <- pcrelate(iterator,
                   pcs=pca2$vectors[,1:4],
                   training.set=pca2$unrels)
save(pcrel2, file="OutputFiles/pcrelate_kinship.RData")
```

```{r pcrelate2_plot}
ggplot(pcrel2$kinBtwn, aes(k0, kin)) +
    geom_hline(yintercept=2^(-seq(3,9,2)/2), linetype="dashed", color="grey") +
    geom_hex(bins=100) +
    geom_abline(intercept=0.25, slope=-0.25) +
    ylab("kinship estimate") +
    theme_bw()
```

```{r}
seqClose(seqData)
```

## Mixed models

1. Perform a single-variant association test for `status`. Adjust for sex, age, study, ancestry, and kinship in the model.
Don't forget to consider the `family` parameter. Make a QQ plot of the p-values for all variants with MAC >= 5.

The first step is to fit the null model. We include the first 4 PCs as covariates in our model to adjust for ancestry, and we include a random effect proportional to the kinship matrix to adjust for genetic relatedness. Recall that with a binary outcome, we set `family = binomial` -- because we have a random effect, this will fit an approximate logistic mixed model using the [GMMAT method](https://www.cell.com/ajhg/fulltext/S0002-9297(16)00063-X).

```{r}
# load the sample annotation data
annot <- get(load("AnalysisFiles/sample_phenotype_pcs.RData"))

# load and prepare the genotype data
gdsfile <- "AnalysisFiles/1KG_phase3_subset_chr1.gds"
gdsfmt::showfile.gds(closeall=TRUE) # make sure file is not already open
gds <- seqOpen(gdsfile)
seqData <- SeqVarData(gds, sampleData=annot)
iterator <- SeqVarBlockIterator(seqData, verbose=FALSE)

# load the kinship matrix
kinfile <- "AnalysisFiles/pcrelate_kinship.RData"
pcrel <- get(load(kinfile))
kinship <- pcrelateToMatrix(pcrel, scaleKin=2, verbose=FALSE)
```

```{r exercise_mm_nullmod}
nullmod.status <- fitNullModel(annot,
                               outcome="status",
                               covars=c("sex", "age", "study", paste0("PC", 1:4)),
                               cov.mat=kinship,
                               family=binomial,
                               verbose=FALSE)
```

```{r}
# description of the model we fit
nullmod.status$model
```

```{r}
# fixed effect regression estimates
nullmod.status$fixef
```

```{r}
# variance component estimates
nullmod.status$varComp
```

Now that we have the null model, we perform the single-variant association tests and make the QQ plot the same way as before.

```{r exercise_mm_assoc}
# run the single-variant association test
assoc.status <- assocTestSingle(iterator, nullmod.status, test="Score")
```

```{r}
head(assoc.status)
```

```{r}
qqPlot(assoc.status$Score.pval[assoc.status$MAC >= 5])
```

```{r}
seqClose(seqData)
```

## Variant annotation

1.
Using [Annotation Explorer](https://platform.sb.biodatacatalyst.nhlbi.nih.gov/u/biodatacatalyst/annotation-explorer/), generate a new set of aggregation units by setting up the same filtering criteria as in use case 1, but this time use a different CADD phred score cut-off (for example, 40 or 10) and study how that changes the plots under the `interactive plots` tab of Annotation Explorer. For example, how does changing the filtering criteria change the number of aggregation units with no variants? How does the distribution and number of aggregation units in each bin change in the histogram?

In general, a more stringent filtering approach will reduce the number of aggregation units which have at least one variant (for example, you will see fewer units with no variants at a CADD phred cut-off of 10 vs 40). The change in the histogram that shows the total number of aggregation units (Y-axis) in each of the bins with varying variant number ranges (X-axis) depends on the characteristics of the features used for the grouping criteria (size of the aggregating regions) and the distribution of values of the annotation field used for filtering. Sometimes there is not an obvious or recommended cut-off to implement for an annotation field. Experimenting with varying filtering criteria can help a user visualize their effects on the aggregation unit characteristics and may assist in choosing filtering criteria in an informed way.

## Annotation informed aggregate association tests

1. Since we are working with a subset of the data, many of the genes listed in `group_id` have a very small number of variants. Create a new set of aggregation units based on position, rather than gene name -- create 10 units that are 1MB long and span all of the chr1 variants by using the TopmedPipeline function `aggregateGRanges`. Run a SKAT test using those units and a `SeqVarRangeIterator` object.
```{r}
# load aggregation units
aggfile <- "AnalysisFiles/variants_by_gene.RData"
aggunit <- get(load(aggfile))

# subset to chromosome 1
aggunit1 <- filter(aggunit, chr == 1)
```

```{r exercise_aggregate}
# minimum variant position
minp <- min(aggunit1$pos)
# maximum variant position
maxp <- max(aggunit1$pos)

# create a data frame breaking the position range into 10 pieces
aggByPos <- data.frame(chr=1,
                       start=seq(minp, maxp-1e6, length.out=10),
                       end=seq(minp+1e6, maxp, length.out=10))
aggByPos$group_id <- 1:nrow(aggByPos)
dim(aggByPos)
head(aggByPos)
```

```{r}
aggVarList <- TopmedPipeline::aggregateGRanges(aggByPos)
aggVarList
```

```{r, message=FALSE}
# load sample annotation
annotfile <- "AnalysisFiles/sample_phenotype_pcs.RData"
annot <- get(load(annotfile))

# load the null model fit in an earlier section
nullmod <- get(load("AnalysisFiles/null_model.RData"))

# load and prepare genotype data
gdsfile <- "AnalysisFiles/1KG_phase3_subset_chr1.gds"
gdsfmt::showfile.gds(closeall=TRUE) # make sure file is not already open
gds <- seqOpen(gdsfile)
seqData <- SeqVarData(gds, sampleData=annot)
```

```{r}
# prepare iterator using defined aggregation units
iterator <- SeqVarRangeIterator(seqData, variantRanges=aggVarList, verbose=FALSE)
```

```{r}
# run SKAT test by aggregation unit
assoc <- assocTestAggregate(iterator, nullmod, test="SKAT",
                            AF.max=0.1, weight.beta=c(1,25))
```

```{r}
assoc$results
```

```{r}
seqClose(seqData)
```
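A note on the `weight.beta=c(1,25)` argument used in the SKAT calls above: the Wu weights are the Beta(1, 25) density evaluated at each variant's minor allele frequency, which up-weights rarer variants. A small base-R illustration (no workshop data required):

```r
# Beta(1, 25) density at a few minor allele frequencies
maf <- c(0.001, 0.01, 0.05, 0.2)
w <- dbeta(maf, 1, 25)
round(w, 2)

# weights shrink rapidly as MAF increases
stopifnot(all(diff(w) < 0))
```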