---
title: "Plot Reduce Diagnostic Report (`r jobslug`)"
author: "SilviaTerra, LLC"
date: "`r as.character(format(Sys.time(), format = '%B %d %Y %X'))`"
output: html_document
---
```{r setup, include = FALSE}
# Attach the modeling, plotting, and parallel-processing packages used
# throughout this report.
library(boot)
library(dplyr)
library(doParallel)
library(ggplot2)
library(glmnet)
library(lme4)
library(nnet)
library(parallel)
library(pander)
library(randomForest)
library(tidyr)
library(tsalgo)
library(tsutils)
library(stheight)
library(stvolume)
library(rstanarm)
# Leave two cores free for the OS while Stan samples chains in parallel.
options(mc.cores = parallel::detectCores() - 2)
library(bayesplot)

# Global knitr chunk options: hide code, cache results, emit PNG figures.
# figureDir and cacheDir are expected to be defined by the calling
# environment before this report is rendered.
opts_chunk$set(echo = FALSE,
               message = FALSE,
               warning = FALSE,
               error = FALSE,
               cache = TRUE,
               dev = 'png',
               fig.path = figureDir,
               cache.path = cacheDir,
               fig.width = 10,
               fig.height = 10,
               results = 'asis',
               comment = NA)
options(show.signif.stars = FALSE)
```
```{r debug, eval = FALSE}
# Manual debugging setup: run interactively to pull the job inputs.
# Never evaluated during a normal render (eval = FALSE).
jobslug <- 'hfm-2018-f1-cc'
get_prepped(jobslug, setup = TRUE)
groupSize <- 5
```
```{r dataPrep, include = FALSE}
# Read cruise plot records; keep a copy filtered to non-empty records with a
# measured diameter (the data are filtered two different ways downstream).
cruiseData <- read_cruiseDat(cruiseJson)
cruiseDataNoZero <- cruiseData %>%
  filter(num_records > 0) %>%
  filter(!is.na(diameter))

trainingData <- read_td(trainingFile)
trainingData$tph <- as.integer(trainingData$tph)

# ---- Define predictor column groups for the models ----
distroVarsIDX <- grep("diam", x = names(trainingData))
impVarsIDX <- grep("spp", x = names(trainingData))
impVarsIDX <- impVarsIDX[!grepl('meta', names(trainingData)[impVarsIDX])]
pcScoresIDXRaw <- grep('pca', x = names(trainingData))
nonPcas <- grep('nonpca', x = names(trainingData))
pcScoresIDX <- pcScoresIDXRaw[pcScoresIDXRaw %nin% nonPcas]
# seq_along() is safe even if no PCA columns are found (1:length would
# produce c(1, 0) in that case).
defaultPredictors <- paste0('pca', seq_along(pcScoresIDX), collapse = '+')

# Pull in the HFM strata table
system('s3cmd get s3://silviaterra-sequoia/hfm-2018/hfm-strata-tab.csv /tmp/')
stratTab <- read.csv('/tmp/hfm-strata-tab.csv')

# Modify **AND RUN** to define properly!
transform_the_predictors <- function(pixeldataframe) {
  # Add any necessary transformations to the X variables, for example:
  #   pixeldataframe$pca1cubed <- pixeldataframe$pca1 ^ 3
  # This is also a good place to deal with factors that will be used
  # in bespoke models.
  pixeldataframe <- left_join(pixeldataframe, stratTab,
                              by = c('meta_Strata' = 'Strata')) %>%
    mutate(meta_megaStrata = megaStrata) %>%
    select(-CruiseType, -ForType, -megaStrata)
  return(pixeldataframe)
}
trainingData <- transform_the_predictors(trainingData)

# Check metadata for factor levels not present in the training data;
# prediction would fail on strata the models never saw.
system(paste0('s3cmd get --force s3://silviaterra-sequoia/', jobslug,
              '/setup/bounds/metadata.csv /tmp/metadata.csv'))
metadata <- read.csv('/tmp/metadata.csv')
tdStrata <- unique(trainingData$meta_megaStrata)
allStrata <- metadata %>%
  left_join(stratTab) %>%
  select(megaStrata) %>%
  distinct() %>%
  .$megaStrata
if (!all(as.character(allStrata) %in% as.character(tdStrata))) {
  stop('there are strata in the metadata that are not present in training data')
}

# Pooled model formulas. The +1 on the response keeps the log link defined
# for zero plots (the prediction transforms subtract it back out).
stemsPooledFormula <- as.formula(paste('tph + 1 ~ ', defaultPredictors))
baPooledFormula <- as.formula(paste('baph + 1 ~ ', defaultPredictors))

# Sanity-check the requested group size against plots-per-stand.
sizeTest <- trainingData %>%
  dplyr::group_by(meta_stand_id) %>%
  dplyr::summarize(nPlots = n()) %>%
  dplyr::ungroup()
if (groupSize > max(sizeTest$nPlots)) {
  stop("Group size is larger than the maximum number of plots in any stand")
}
if (groupSize < 4) {
  stop("Group size is less than 4. What are you thinking?")
}

# Attach the cruiser name to each plot, and flag zero (empty) plots by their
# 'zero' cruiser tag.
fullTrainingData <- trainingData %>%
  mutate(cruiser = cruiseData$cruiser[match(plot_id, cruiseData$plot_id)])
zeroPlots <- fullTrainingData$plot_id[grep('zero', fullTrainingData$cruiser)]
```
- ----------------------
- Basal area models
- ----------------------
- ### BAPH POOLED MODEL
```{r baphPooled}
# Deprecated Stan mean model until we figure out a permanent solution:
# baphPooled <- stan_glm(baPooledFormula, data = fullTrainingData,
#                        family = Gamma(link = 'log'))

# Previous default pooled form - currently used for mean estimates.
baphPooledDefault <- glm(baPooledFormula,
                         data = fullTrainingData,
                         family = gaussian(link = 'log'),
                         control = list(maxit = 1e3))

# Currently being used to get precision estimates.
baphPrecisionDefault <- stan_glm(baPooledFormula,
                                 data = fullTrainingData,
                                 family = gaussian(link = 'identity'))

# Models using plot inclusion probabilities (kept for reference):
# totalAcres <- sum(metadata$stand_acreage)
# cruisedStands <- unique(fullTrainingData$meta_stand_id)
# cruiseWeights <- metadata %>%
#   filter(stand_id %in% cruisedStands) %>%
#   mutate(weight = stand_acreage / totalAcres) %>%
#   select(stand_id, weight)
#
# fullTrainingData <- left_join(fullTrainingData, stand_weights,
#                               by = c('meta_stand_id' = 'stand_id'))
#
# # currently being used to get precision estimates
# baphPooledIdent <- stan_glm(baPooledFormula, data = fullTrainingData,
#                             weights = fullTrainingData$weight,
#                             family = gaussian(link = 'identity'))
#
# # previous default pooled form - currently used for mean estimates
# baphGlm <- glm(baPooledFormula, data = fullTrainingData,
#                weights = fullTrainingData$weight,
#                family = gaussian(link = 'log'),
#                control = list(maxit = 1e3))
```
```{r baphBespoke}
# ---------------------------------------------------------------------------
### BEGIN BESPOKE FITTING - fit by hand below
# ---------------------------------------------------------------------------
# Drop the pca9 term from the default predictor string. Splitting on '+' and
# rebuilding avoids the regex pitfall of gsub('pca9+', ...), which left a
# dangling '+' separator behind (and would leave a trailing '+' if pca9 were
# the last term).
bespokePredictors <- paste(
  setdiff(strsplit(defaultPredictors, '+', fixed = TRUE)[[1]], 'pca9'),
  collapse = '+')
baBespokeFormula <- as.formula(paste('baph + 1 ~ ', bespokePredictors))

# Previous default pooled form - currently used for mean estimates.
baphBespoke <- glm(baBespokeFormula, data = fullTrainingData,
                   family = gaussian(link = 'log'))
# Currently being used to get precision estimates.
baphBespokePrecision <- stan_glm(baBespokeFormula, data = fullTrainingData,
                                 family = gaussian(link = 'identity'),
                                 adapt_delta = 0.99)

# Prefer the bespoke fits when they exist; otherwise fall back to defaults.
# (Comment out the bespoke fits above to use the defaults.)
if (exists('baphBespoke')) {
  baphPooled <- baphBespoke
  baphPrecision <- baphBespokePrecision
} else {
  baphPooled <- baphPooledDefault
  baphPrecision <- baphPrecisionDefault
}

transform_the_baph_prediction <- function(predicted) {
  # Back-transform a prediction from the default model
  # glm(family = gaussian(link = 'log')) fit to (baph + 1):
  # exp() undoes the log link and -1 undoes the +1 response offset.
  backtransformed <- exp(predicted) - 1
  # backtransformed <- predicted - 1  # Change this line as necessary!
  return(backtransformed)
}

# Side-by-side coefficient table for the mean (glm) and precision (Stan) fits.
baphCoeffTable <- bind_cols(data.frame(baphPooled$coefficients),
                            data.frame(baphPrecision$coefficients)) %>%
  mutate(coefficient = row.names(data.frame(baphPooled$coefficients)))

# Posterior predictive draws, shifted back to the baph scale (the response
# was baph + 1, so the offset is subtracted exactly once, here).
baphSims <- posterior_predict(baphPrecision) - 1
baphMeans <- apply(baphSims, 1, mean)
# View baphMeans
hist(baphMeans)
# View relative precision of baph
sd(baphMeans) / mean(baphMeans)
baphSD <- apply(baphSims, 2, function(x) {sd(x)})

baphPredFrame <- data.frame(plot_id = fullTrainingData$plot_id,
                            megaStrata = fullTrainingData$meta_megaStrata,
                            observed = fullTrainingData$baph)
# baphSims is already on the baph scale; subtracting 1 again here (as the
# old code did) double-counted the offset and biased the Stan predictions
# low by one unit.
baphPredFrame$stanPred <- colMeans(baphSims)
baphPredFrame$glmPred <- exp(predict(baphPooled)) - 1

baphPlotFrame <- baphPredFrame %>%
  gather(source, predicted, -plot_id, -observed, -megaStrata) %>%
  mutate(resid = predicted - observed)

baphRmseStan <- baphPlotFrame %>%
  filter(source == 'stanPred') %>%
  summarize(rmse = sqrt(mean(resid ^ 2))) %>%
  .$rmse
baphRmseGlm <- baphPlotFrame %>%
  filter(source == 'glmPred') %>%
  summarize(rmse = sqrt(mean(resid ^ 2))) %>%
  .$rmse

#####
# CHECK MODEL CONVERGENCE
#####
# Print a table of parameter means, summary stats, and convergence diagnostics.
pander(data.frame(summary(baphPrecision)))
# Ideally, all Rhats are < 1.1 and all Neff ratios are > 0.25.
plot(baphPrecision, 'rhat')
# Ratio of effective sample size over total iterations drawn.
plot(baphPrecision, 'neff_hist')

# Predicted vs. observed
ggplot(data = baphPlotFrame, aes(x = observed, y = predicted, col = megaStrata)) +
  geom_point(alpha = 0.5) + geom_abline(intercept = 0, slope = 1) +
  theme_bw() + facet_wrap(~source)
# Observed vs. residuals
ggplot(data = baphPlotFrame, aes(x = observed, y = resid, col = megaStrata)) +
  geom_point(alpha = 0.5) + geom_abline(intercept = 0, slope = 0) +
  theme_bw() + facet_wrap(~source)

# Posterior predictive checks: compare the distribution of observed baph (y)
# to multiple simulated datasets from the model (yrep).
pp_check(baphPrecision) + ggtitle('baph') +
  ggtitle('We are not currently using this model.')
# ^^^ we aren't currently using this model
# Posterior predictive tests of the mean, upper, and lower quantiles.
ppc_stat(trainingData$baph, baphSims, stat = "mean") + ggtitle('Posterior predictive test of sample mean')
ppc_stat(trainingData$baph, baphSims, stat = function(x) {quantile(x, 0.95)}) + ggtitle('Posterior predictive test of upper (95%) quantile')
ppc_stat(trainingData$baph, baphSims, stat = function(x) {quantile(x, 0.05)}) + ggtitle('Posterior predictive test of lower (5%) quantile')
# Posterior predictive intervals of each plot from baphPrecision (sims are
# already on the baph scale - no further offset subtraction).
baphStanFrame <- baphPlotFrame %>% filter(source == 'stanPred')
ppc_intervals(colMeans(baphSims), baphSims, x = baphStanFrame$observed)

# TODO figure out why gaussian(link = 'log') gives bogus results
# TODO look into different default priors (rather than out of box)
#      e.g. lasso priors
# TODO look into QR transforming the predictors
# TODO incorporate plot inclusion probability into observation weights
```
__baph default pooled GLM model RMSE is
`r round(baphRmseGlm, digits = 2)` m2 per hectare from the model.__
- ----------------------
- Stem density models
- ----------------------
- ### TPH POOLED MODEL
```{r tphPooled}
# Deprecated Stan mean model until we figure out a permanent solution:
# tphPooledDefault <- stan_glm(stemsPooledFormula, data = fullTrainingData,
#                              family = poisson(link = 'log'))

# Previous default pooled form - currently used for mean estimates.
tphPooledDefault <- glm(stemsPooledFormula,
                        data = fullTrainingData,
                        family = gaussian(link = 'log'),
                        control = list(maxit = 1e3))

# Currently being used to get precision estimates.
tphPrecisionDefault <- stan_glm(stemsPooledFormula,
                                data = fullTrainingData,
                                family = gaussian(link = 'identity'))

# Models using plot inclusion probabilities (kept for reference):
# # currently being used to get precision estimates
# tphPooledIdent <- stan_glm(stemsPooledFormula, data = fullTrainingData,
#                            weights = fullTrainingData$weight,
#                            family = gaussian(link = 'identity'))
#
# # previous default pooled form - currently used for mean estimates
# tphGlm <- glm(stemsPooledFormula, data = fullTrainingData,
#               weights = fullTrainingData$weight,
#               family = gaussian(link = 'log'),
#               control = list(maxit = 1e3))
```
```{r tphBespoke}
# ---------------------------------------------------------------------------
### BEGIN BESPOKE FITTING - fit by hand below
# ---------------------------------------------------------------------------
# Reuses bespokePredictors (the default predictors minus pca9) defined in the
# baphBespoke chunk.
stemsBespokeFormula <- as.formula(paste('tph + 1 ~ ', bespokePredictors))

# Previous default pooled form - currently used for mean estimates.
tphBespoke <- glm(stemsBespokeFormula, data = fullTrainingData,
                  family = gaussian(link = 'log'))
# Currently being used to get precision estimates.
tphBespokePrecision <- stan_glm(stemsBespokeFormula, data = fullTrainingData,
                                family = gaussian(link = 'identity'),
                                adapt_delta = 0.99)

# Prefer the bespoke fits when they exist; otherwise fall back to defaults.
# (Comment out the bespoke fits above to use the defaults.)
if (exists('tphBespoke')) {
  tphPooled <- tphBespoke
  tphPrecision <- tphBespokePrecision
} else {
  tphPooled <- tphPooledDefault
  tphPrecision <- tphPrecisionDefault
}

transform_the_tph_prediction <- function(predicted) {
  # Back-transform a prediction from the default model
  # glm(family = gaussian(link = 'log')) fit to (tph + 1):
  # exp() undoes the log link and -1 undoes the +1 response offset.
  backtransformed <- exp(predicted) - 1
  # backtransformed <- predicted - 1  # Change this line as necessary!
  return(backtransformed)
}

# Side-by-side coefficient table for the mean (glm) and precision (Stan) fits.
tphCoeffTable <- bind_cols(data.frame(tphPooled$coefficients),
                           data.frame(tphPrecision$coefficients)) %>%
  mutate(coefficient = row.names(data.frame(tphPooled$coefficients)))

# Posterior predictive draws, shifted back to the tph scale (the response was
# tph + 1, so the offset is subtracted exactly once, here).
tphSims <- posterior_predict(tphPrecision) - 1
tphMeans <- apply(tphSims, 1, mean)
# View tphMeans
hist(tphMeans)
# View relative precision of tph
sd(tphMeans) / mean(tphMeans)
tphSD <- apply(tphSims, 2, function(x) {sd(x)})

tphPredFrame <- data.frame(plot_id = fullTrainingData$plot_id,
                           megaStrata = fullTrainingData$meta_megaStrata,
                           observed = fullTrainingData$tph)
# tphSims is already on the tph scale; subtracting 1 again here (as the old
# code did) double-counted the offset and biased the Stan predictions low by
# one stem.
tphPredFrame$stanPred <- colMeans(tphSims)
tphPredFrame$glmPred <- exp(predict(tphPooled)) - 1

tphPlotFrame <- tphPredFrame %>%
  gather(source, predicted, -plot_id, -observed, -megaStrata) %>%
  mutate(resid = predicted - observed)

tphRmseStan <- tphPlotFrame %>%
  filter(source == 'stanPred') %>%
  summarize(rmse = sqrt(mean(resid ^ 2))) %>%
  .$rmse
tphRmseGlm <- tphPlotFrame %>%
  filter(source == 'glmPred') %>%
  summarize(rmse = sqrt(mean(resid ^ 2))) %>%
  .$rmse

#####
# CHECK MODEL CONVERGENCE
#####
# Print a table of parameter means, summary stats, and convergence diagnostics.
pander(data.frame(summary(tphPrecision)))
# Ideally, all Rhats are < 1.1 and all Neff ratios are > 0.25.
plot(tphPrecision, 'rhat')
# Ratio of effective sample size over total iterations drawn.
plot(tphPrecision, 'neff_hist')

# Predicted vs. observed
ggplot(data = tphPlotFrame, aes(x = observed, y = predicted, col = megaStrata)) +
  geom_point(alpha = 0.5) + geom_abline(intercept = 0, slope = 1) +
  theme_bw() + facet_wrap(~source)
# Observed vs. residuals
ggplot(data = tphPlotFrame, aes(x = observed, y = resid, color = megaStrata)) +
  geom_point(alpha = 0.5) + geom_abline(intercept = 0, slope = 0) +
  theme_bw() + facet_wrap(~source)

# Posterior predictive checks: compare the distribution of observed tph (y)
# to multiple simulated datasets from the model (yrep).
pp_check(tphPrecision) + ggtitle('tph - we are not currently using this model')
# ^^^ we aren't currently using this model
# Posterior predictive tests of the mean, upper, and lower quantiles.
ppc_stat(trainingData$tph, tphSims, stat = "mean") + ggtitle('Posterior predictive test of sample mean')
ppc_stat(trainingData$tph, tphSims, stat = function(x) {quantile(x, 0.95)}) + ggtitle('Posterior predictive test of upper (95%) quantile')
ppc_stat(trainingData$tph, tphSims, stat = function(x) {quantile(x, 0.05)}) + ggtitle('Posterior predictive test of lower (5%) quantile')
# Posterior predictive intervals of each plot from tphPrecision (sims are
# already on the tph scale - no further offset subtraction).
tphStanFrame <- tphPlotFrame %>% filter(source == 'stanPred')
ppc_intervals(colMeans(tphSims), tphSims, x = tphStanFrame$observed)

# TODO figure out why gaussian(link = 'log') gives bogus results
# TODO look into different default priors (rather than out of box)
#      e.g. lasso priors
# TODO look into QR transforming the predictors
# TODO incorporate plot inclusion probability into observation weights
```
- __tph default pooled GLM model RMSE is
- `r round(tphRmseGlm, digits = 2)` stems per hectare from the model.__
```{r defineCaps}
# Cap pooled-model predictions at the 99th percentile of the observed
# training values plus one standard deviation.
pooled_baphCap <- quantile(trainingData$baph, 0.99) + sd(trainingData$baph)
pooled_tphCap <- quantile(trainingData$tph, 0.99) + sd(trainingData$tph)
```
- The baph cap is set at `r round(pooled_baphCap, 1) ` for pooled models.
- The tph cap is set at `r round(pooled_tphCap, 1)` for pooled models.
```{r ModelAssistedRatios}
# Attach the pooled GLM predictions to the training data for ratio estimation.
trainingData$baphPred <- baphPredFrame$glmPred
trainingData$tphPred <- tphPredFrame$glmPred

## Can manually change the nPlots filter here if needed
# Model-assisted stand-level ratios
standRatVals <- trainingData %>%
  filter(plot_id %nin% zeroPlots) %>%
  group_by(meta_stand_id) %>%
  summarize(baphObsMean = mean(baph),
            tphObsMean = mean(tph),
            baphPredMean = mean(baphPred),
            tphPredMean = mean(tphPred),
            nPlots = n()) %>%
  mutate(tphRat = tphObsMean / tphPredMean,
         baphRat = baphObsMean / baphPredMean) %>%
  filter(nPlots >= 5)

# Model-assisted strata-level ratios.
# Define the grouping variable in metadata (without the meta_ prefix);
# if there is no grouping structure, set to NULL.
strataVar <- 'megaStrata'
if (is.null(strataVar)) {
  # No grouping variable: compute population-level ratios only.
  strataRatios <- trainingData %>%
    filter(plot_id %nin% zeroPlots) %>%
    mutate(strataVar = 'population') %>%
    group_by(strataVar) %>%
    summarize(baphObsMean = mean(baph),
              tphObsMean = mean(tph),
              baphPredMean = mean(baphPred),
              tphPredMean = mean(tphPred),
              nPlots = n()) %>%
    mutate(tphRat = tphObsMean / tphPredMean,
           baphRat = baphObsMean / baphPredMean)
} else {
  trainingData$strataVar <- trainingData[, paste0('meta_', strataVar)]
  # Stand -> stratum lookup, reused by the later ratio chunks.
  strataTab <- trainingData %>%
    select(meta_stand_id, strataVar) %>%
    distinct()
  strataRatios <- trainingData %>%
    filter(plot_id %nin% zeroPlots) %>%
    group_by(strataVar) %>%
    summarize(baphObsMean = mean(baph),
              tphObsMean = mean(tph),
              baphPredMean = mean(baphPred),
              tphPredMean = mean(tphPred),
              nPlots = n()) %>%
    mutate(tphRat = tphObsMean / tphPredMean,
           baphRat = baphObsMean / baphPredMean)
}

# Model-assisted population-level ratios
populationRatios <- trainingData %>%
  filter(plot_id %nin% zeroPlots) %>%
  summarize(baphObsMean = mean(baph),
            tphObsMean = mean(tph),
            baphPredMean = mean(baphPred),
            tphPredMean = mean(tphPred),
            nPlots = n()) %>%
  mutate(tphRat = tphObsMean / tphPredMean,
         baphRat = baphObsMean / baphPredMean)

# Per-stand variance of the model-assisted (MA, residual-based) and
# design-based (DB, observation-based) estimators.
varianceTable <- trainingData %>%
  filter(plot_id %nin% zeroPlots) %>%
  filter(meta_stand_id %in% standRatVals$meta_stand_id) %>%
  mutate(BAPH_resid = baphPred - baph,
         TPH_resid = tphPred - tph) %>%
  group_by(meta_stand_id) %>%
  mutate(meanBAPH_resid = mean(BAPH_resid),
         meanTPH_resid = mean(TPH_resid)) %>%
  ungroup() %>%
  mutate(BAPH_errsquared = (BAPH_resid - meanBAPH_resid) ^ 2,
         TPH_errsquared = (TPH_resid - meanTPH_resid) ^ 2) %>%
  group_by(meta_stand_id) %>%
  left_join(., standRatVals) %>%
  summarize(n = length(tph),
            varBAPH_MA = sum(BAPH_errsquared) / (length(baph) - 1),
            varTPH_MA = sum(TPH_errsquared) / (length(tph) - 1),
            varBAPH_DB = sum((baph[plot_id %nin% zeroPlots] -
                                unique(baphObsMean)) ^ 2) /
              (length(baph[plot_id %nin% zeroPlots]) - 1),
            varTPH_DB = sum((tph[plot_id %nin% zeroPlots] -
                               unique(tphObsMean)) ^ 2) /
              (length(tph[plot_id %nin% zeroPlots]) - 1)) %>%
  select(meta_stand_id, n,
         varBAPH_DB, varTPH_DB,
         varBAPH_MA, varTPH_MA, n) %>%
  ungroup()

# NOTE(review): one stand is excluded by hand here - confirm this exclusion
# is still intended for the current job.
standRatios <- left_join(standRatVals, varianceTable) %>%
  filter(meta_stand_id != 'OC0201-2725')

# Model-assisted population confidence intervals, acreage-weighted across
# stands.
metJoin <- trainingData %>%
  select(meta_stand_id, meta_stand_acre) %>%
  distinct()
maCIs <- varianceTable %>%
  mutate(baphMaSe = sqrt(varBAPH_MA / n),
         tphMaSe = sqrt(varTPH_MA / n)) %>%
  mutate(bapaMaSe = convert_baph_to_bapa(baphMaSe),
         tpaMaSe = tphMaSe / 2.47) %>%
  select(meta_stand_id, bapaMaSe, tpaMaSe, n) %>%
  left_join(metJoin) %>%
  summarize(bapaMaSePop = sqrt(
              sum((meta_stand_acre / sum(meta_stand_acre)) ^ 2 * bapaMaSe ^ 2)
            ),
            tpaMaSePop = sqrt(
              sum((meta_stand_acre / sum(meta_stand_acre)) ^ 2 * tpaMaSe ^ 2)
            ),
            nStands = n(), nPlots = sum(n)) %>%
  # 90% CIs via the t distribution.
  mutate(bapaCI = qt(1 - 0.1 / 2, df = nPlots - nStands) * bapaMaSePop,
         tpaCI = qt(1 - 0.1 / 2, df = nPlots - nStands) * tpaMaSePop)
pander(maCIs)
```
- --------------------------------
- Diameter distribution models
- --------------------------------
```{r distroFit}
# Necessary to register parallel clusters by hand.
# NOTE(review): this fork cluster is never stopped in the report - consider
# stopCluster(cl) once the parallel work is finished.
cl <- makeForkCluster(detectCores())
registerDoParallel(cl)

# Fit a regression random forest for the N-th diameter-class response.
#   N        - column index into ys
#   ys       - responses (data.frame/matrix, or a single vector)
#   predVars - predictor columns
genericModelBuild <- function(N, ys, predVars, ...) {
  if (is.null(ncol(ys))) { # in case only one spp.
    Gpsi <- ys
  } else {
    Gpsi <- ys[, N]
  }
  dModelObject <- cbind(data.frame(y = Gpsi), predVars)
  GModelRF <- randomForest(y ~ .,
                           data = dModelObject,
                           ntree = 50, keep.forest = TRUE,
                           type = 'regression')
  return(GModelRF)
}

# Drop plots with missing distro responses and zero tph before fitting.
dist_to_drop <- which(is.na(fullTrainingData[, distroVarsIDX[1]]))
if (length(dist_to_drop) > 0) {
  fullTrainingDataNoNA <- fullTrainingData[-dist_to_drop, ] %>%
    filter(tph > 0)
} else {
  fullTrainingDataNoNA <- fullTrainingData %>%
    filter(tph > 0)
}

# Default models: one forest per diameter class on the PCA scores.
distroDefault <- lapply(seq_along(distroVarsIDX), genericModelBuild,
                        ys = fullTrainingDataNoNA[, distroVarsIDX],
                        predVars = fullTrainingDataNoNA[, pcScoresIDX])

distroBespokePredictors <- NULL
# ------------------------------------------------------------------
### Bespoke diameter distribution example
# ------------------------------------------------------------------
# Bespoke models: PCA scores plus megaStrata, excluding pca9.
distroBespokePredictors <- c(names(fullTrainingDataNoNA)[pcScoresIDX],
                             'meta_megaStrata')
distroBespokePredictors <- distroBespokePredictors[distroBespokePredictors != 'pca9']
distroBespoke <- lapply(seq_along(distroVarsIDX), genericModelBuild,
                        ys = fullTrainingDataNoNA[, distroVarsIDX],
                        predVars = fullTrainingDataNoNA[, distroBespokePredictors])
# -----------------------------------------------------------------

# Prefer the bespoke fits when they exist; otherwise use the defaults.
if (exists('distroBespoke')) {
  distroSet <- distroBespoke
} else {
  distroSet <- distroDefault
}
names(distroSet) <- names(fullTrainingDataNoNA)[distroVarsIDX]
```
```{r distroDiagnostics}
# One row per plot x diameter class: aggregate RF prediction, observed value,
# and the per-tree SD of each prediction. bind_rows(lapply(...)) avoids the
# O(n^2) grow-by-rbind loop of the original.
distroDF <- bind_rows(lapply(seq_along(distroSet), function(i) {
  tmpPred <- predict(distroSet[[i]], newdata = fullTrainingDataNoNA,
                     predict.all = TRUE)
  tempDF <- data.frame(predicted = tmpPred$aggregate)
  tempDF$observed <- fullTrainingDataNoNA[, distroVarsIDX[i]]
  tempDF$meta_stand_id <- fullTrainingDataNoNA$meta_stand_id
  tempDF$plot_id <- fullTrainingDataNoNA$plot_id
  tempDF$meta_megaStrata <- fullTrainingDataNoNA$meta_megaStrata
  # Distro columns look like '<prefix>_<class>'; keep the numeric class.
  dbhClass <- names(fullTrainingDataNoNA)[distroVarsIDX][i]
  tempDF$dbhClass <- as.numeric(strsplit(dbhClass, split = '_')[[1]][2])
  # Spread of the individual-tree predictions = per-plot precision.
  tempDF$SD <- apply(tmpPred$individual, 1, function(x) {sd(x)})
  tempDF
}))

# Normalize the predicted distro for each plot (so all distro scores sum to
# one per plot); negative RF predictions are clipped to zero first.
normalized <- distroDF %>%
  select(plot_id, dbhClass, predicted) %>%
  group_by(plot_id) %>%
  mutate(predicted = ifelse(predicted < 0, 0, predicted)) %>%
  mutate(predNorm = predicted / sum(predicted)) %>%
  select(plot_id, dbhClass, predNorm)
distroDF <- distroDF %>%
  left_join(normalized)

# Predicted vs. observed, faceted by diameter class.
distFit <- ggplot(distroDF, aes(x = observed, y = predicted, color = meta_megaStrata)) +
  facet_wrap(~ dbhClass) + geom_point() + theme_bw() +
  geom_abline(intercept = 0, slope = 1)
print(distFit)

# Observed and predicted distributions side by side per diameter class.
DiaDataTidy <- distroDF %>%
  gather(key = variable, value = value, -dbhClass, -plot_id, -meta_stand_id,
         -meta_megaStrata, -SD, -predNorm)
distPlot <- ggplot(DiaDataTidy,
                   aes(x = factor(dbhClass),
                       y = value,
                       color = variable)) +
  geom_boxplot(position = position_dodge(0.5), width = 1) +
  facet_wrap(~variable) + theme_bw() + xlab("Diameter class") +
  ylim(0, 1) + ylab("Trees per hectare") +
  scale_color_manual("", values = c("darkblue", "orange"))
print(distPlot)

# Box plot of average precision (standard deviation) per diameter class.
distroSDPlot <- ggplot(distroDF, aes(x = factor(dbhClass), y = SD)) + geom_boxplot() +
  theme_bw() + xlab('diameter class') + ylab('standard deviation')
print(distroSDPlot)
```
```{r distroRatios, include = FALSE}
# Observed/predicted ratios of diameter-class means, used downstream to
# calibrate the distro predictions. NA ratios (zero predicted mean) fall
# back to 1.
if (is.null(strataVar)) {
  strataDistroRatios <- distroDF %>%
    filter(as.character(plot_id) %nin% zeroPlots) %>%
    mutate(strataVar = 'population') %>%
    group_by(strataVar, dbhClass) %>%
    summarize(meanPred = mean(predicted),
              meanObs = mean(observed)) %>%
    mutate(ratio = meanObs / meanPred) %>%
    mutate(ratio = ifelse(is.na(ratio), 1, ratio)) %>%
    select(DClass = dbhClass, meanPred, meanObs, rat = ratio)
} else {
  strataDistroRatios <- distroDF %>%
    filter(as.character(plot_id) %nin% zeroPlots) %>%
    left_join(strataTab) %>%
    group_by(strataVar, dbhClass) %>%
    summarize(meanPred = mean(predicted),
              meanObs = mean(observed)) %>%
    mutate(ratio = meanObs / meanPred) %>%
    mutate(ratio = ifelse(is.na(ratio), 1, ratio)) %>%
    select(strataVar, DClass = dbhClass, meanPred, meanObs, rat = ratio)
}

# Model-assisted population level distro ratios
populationDistroRatios <- distroDF %>%
  filter(as.character(plot_id) %nin% zeroPlots) %>%
  group_by(dbhClass) %>%
  summarize(meanPred = mean(predicted),
            meanObs = mean(observed)) %>%
  mutate(ratio = meanObs / meanPred) %>%
  mutate(ratio = ifelse(is.na(ratio), 1, ratio)) %>%
  select(DClass = dbhClass, meanPred, meanObs, rat = ratio)

# Model-assisted stand-level distro ratios (only for stands kept in
# standRatios).
standDistroRatios <- distroDF %>%
  filter(as.character(plot_id) %nin% zeroPlots) %>%
  filter(meta_stand_id %in% unique(standRatios$meta_stand_id)) %>%
  group_by(meta_stand_id, dbhClass) %>%
  summarize(meanPred = mean(predicted),
            meanObs = mean(observed)) %>%
  mutate(ratio = meanObs / meanPred) %>%
  select(stand = meta_stand_id, DClass = dbhClass, meanPred,
         meanObs, rat = ratio)

# Experimental Weibull diameter model (kept for reference):
# obsDiams <- cruiseData$diameter[!is.na(cruiseData$diameter)]
# N <- length(obsDiams)
# N_pred <- length(obsDiams)
#
# weiFit <- stan(file = '/tmp/weibull-dist.stan', data = stanData)
#
# diamSims <- extract(weiFit, 'y_pred')$y_pred
```
- --------------------------
- Importance Value Models
- --------------------------
```{r importanceFit}
# Species importance columns (spp_*), excluding metadata columns.
speciesCols <- trainingData %>%
  select(contains('spp'), -contains('meta'))
speciesLabs <- names(speciesCols)

# Bespoke predictors: PCA scores plus megaStrata, excluding pca9.
impBespokePredictors <- c(names(trainingData)[pcScoresIDX], 'meta_megaStrata')
impBespokePredictors <- impBespokePredictors[impBespokePredictors %nin% c('pca9')]

# Fit a two-stage model for one species:
#   1) a presence/absence model (stan_glm / glm / randomForest depending on
#      how balanced presence is) plus a presence threshold;
#   2) an importance model fit only to plots where the species is present.
#
#   spp               - species column name in td
#   td                - training data
#   threshold         - optional presence threshold; overrides the
#                       data-driven quantile threshold when supplied
#   pcScoresIDX       - currently unused; kept for call compatibility
#   bespokePredictors - predictors in addition to pcas
#   stan              - use stan models for presence/absence?
# Returns a list: spp, threshold, presenceMod, impMod, predictors.
fit_spp_models <- function(spp,
                           td,
                           threshold = NULL,
                           pcScoresIDX,
                           bespokePredictors = NULL,
                           stan = FALSE) {
  predictors <- bespokePredictors
  training <- td %>%
    filter(baph > 0) %>%
    select(y = spp, one_of(predictors)) %>%
    mutate(present = ifelse(y > 0, 1, 0))
  # Proportion of plots with / without the species.
  occTab <- training %>%
    group_by(present) %>%
    summarize(n = n()) %>%
    mutate(prop = n / sum(n))
  # Sometimes a species is on every plot, or on no plots (groupstrap);
  # a presence/absence model is not necessary in these cases.
  if (nrow(occTab) == 1) {
    outcome <- occTab$present
    presenceMod <- lm(outcome ~ 1)
    training$predProp <- predict(presenceMod, newdata = training)
    threshold <- 1
  } else {
    presentProp <- occTab %>%
      filter(present == 1) %>%
      .$prop
    presentFormula <- as.formula(paste('present ~ ',
                                       paste0(predictors, collapse = '+')))
    # If the species is neither present nor absent on more than 95% of
    # plots, fit a statistical model for presence/absence.
    if (all(occTab$prop <= 0.95)) {
      if (stan) {
        # Weakly informative t priors on the logistic coefficients.
        tPrior <- student_t(df = 7, location = 0, scale = 2.5)
        presenceMod <- stan_glm(presentFormula, data = training,
                                family = binomial(link = 'logit'),
                                prior = tPrior, prior_intercept = tPrior,
                                cores = detectCores() - 4)
        # Calculate predProp from the posterior draws.
        presentPred <- posterior_predict(presenceMod)
        training$predProp <- unname(round(colMeans(presentPred), 3))
      } else { # otherwise we will just use glm
        presenceMod <- glm(presentFormula, data = training,
                           family = 'binomial')
        training$predProp <- predict(presenceMod, type = 'response')
      }
    } else { # highly imbalanced presence/absence: use random forest
      presenceMod <- randomForest(presentFormula, data = training)
      training$predProp <- predict(presenceMod)
    }
    # The data-driven threshold is the presence-score quantile above which
    # the same proportion of plots get a 'present' assignment as there were
    # plots with the species in the training data. It can be overridden via
    # the threshold argument.
    if (is.null(threshold)) {
      threshold <- quantile(training$predProp, 1 - presentProp)
    }
  }
  training <- training %>%
    mutate(presAbs = ifelse(predProp >= threshold, 1, 0))
  # Fit the importance model to plots where the species is present.
  trainingPresent <- training %>%
    filter(present == 1)
  impFormula <- as.formula(paste('y ~ ', paste0(predictors, collapse = '+')))
  if (nrow(trainingPresent) > 5) {
    # Regression forest if more than 5 plots have the species.
    impMod <- randomForest(impFormula, data = trainingPresent,
                           type = 'regression')
  } else {
    if (nrow(trainingPresent) > 1) {
      # Classification forest if 5 or fewer plots have the species.
      trainingPresent$y <- factor(as.character(trainingPresent$y))
      impMod <- randomForest(impFormula,
                             data = trainingPresent,
                             type = 'classification')
    } else {
      if (nrow(trainingPresent) == 1) {
        # Only one plot: fit a model that predicts that plot's value.
        impMod <- lm(y ~ 1, data = trainingPresent)
      } else { # no plots (groupstrap): predict 0
        impMod <- lm(0 ~ 1)
      }
    }
  }
  outMods <- list(spp = spp,
                  threshold = threshold,
                  presenceMod = presenceMod,
                  impMod = impMod,
                  predictors = predictors)
  return(outMods)
}

# Fit species models with data-driven thresholds.
sppMods <- lapply(speciesLabs, fit_spp_models,
                  td = trainingData,
                  pcScoresIDX = pcScoresIDX,
                  bespokePredictors = impBespokePredictors,
                  stan = FALSE)
names(sppMods) <- speciesLabs

# Predict species importance for a pixel frame from one fitted sppMods entry.
# Returns a one-column data.frame named after the species.
predict_spp_importance <- function(sppMod, pixels) {
  pixels$present <- 0
  threshold <- sppMod[['threshold']]
  presenceMod <- sppMod[['presenceMod']]
  impMod <- sppMod[['impMod']]
  # Predict presence/absence.
  if (inherits(presenceMod, 'stanreg')) {
    postPropSims <- posterior_predict(presenceMod, newdata = pixels)
    pixels$propPred <- colMeans(postPropSims)
  } else {
    pixels$propPred <- predict(presenceMod, newdata = pixels, type = 'response')
  }
  # Use >= so the boundary case matches the presAbs rule applied when the
  # threshold was derived during fitting (the old code used >, which
  # disagreed with fitting exactly at the threshold).
  pixelPreds <- pixels %>%
    mutate(presAbsPred = ifelse(propPred >= threshold, 1, 0)) %>%
    mutate(rawImp = as.numeric(as.character(predict(impMod, newdata = .)))) %>%
    mutate(sppImp = ifelse(presAbsPred == 1, rawImp, 0)) %>%
    select(presAbsPred, sppImp)
  out <- pixelPreds %>%
    select(sppImp)
  names(out) <- sppMod[[1]]
  return(out)
}

# Training-data predictions for testing. Name the FUN argument explicitly so
# it cannot be swallowed by mclapply's '...' when mixed with named args.
pixels <- trainingData
sppPreds <- bind_cols(mclapply(sppMods,
                               FUN = predict_spp_importance,
                               pixels = pixels))
```
```{r importanceDiagnostics}
# One row per plot x species: predicted vs. observed importance.
# bind_rows(lapply(...)) avoids the grow-by-rbind loop of the original.
impDF <- bind_rows(lapply(seq_along(sppMods), function(i) {
  sppName <- sppMods[[i]][[1]]
  tmpPred <- predict_spp_importance(sppMods[[i]], pixels = fullTrainingData)
  tempDF <- data.frame(predicted = unlist(tmpPred))
  tempDF$meta_stand_id <- fullTrainingData$meta_stand_id
  tempDF$plot_id <- fullTrainingData$plot_id
  tempDF$meta_megaStrata <- fullTrainingData$meta_megaStrata
  tempDF$observed <- fullTrainingData[, sppName]
  # Species columns are named '<prefix>_<stID>'; map the stID to a common
  # name via the st_sp2 lookup table.
  stID <- unlist(strsplit(sppName, split = '_'))[2]
  tempDF$species <- st_sp2$common[match(stID, st_sp2$stID)]
  tempDF$sppID <- sppName
  # tempDF$SD <- apply(tmpPred$individual, 1, function(x) {sd(x)})
  tempDF
}))

impPlot <- ggplot(impDF, aes(x = observed, y = predicted, color = meta_megaStrata)) +
  geom_point(alpha = 0.5) +
  facet_wrap( ~ species) +
  theme_bw() +
  xlim(0, 1) + ylim(0, 1) +
  xlab("Observed species importance") +
  ylab("Predicted species importance") +
  geom_abline(slope = 1, intercept = 0, color = 'red')
print(impPlot)

# # Species importance score precision
# impSDPlot <- ggplot(impDF, aes(x = species, y = SD)) + geom_boxplot() +
#   theme_bw() + xlab('diameter class') + ylab('standard deviation')
# print(impSDPlot)
```
```{r impRatios}
# Model-assisted ratio adjustments for species importance (meanObs / meanPred)
# over non-zero plots. The strata table's two branches differed only in how
# the stratum label is attached, so build the filtered base once and branch.
impBase <- impDF %>%
  filter(as.character(plot_id) %nin% zeroPlots)
if(is.null(strataVar)) {
  # no stratification variable: treat the whole population as one stratum
  impStrataBase <- impBase %>%
    mutate(strataVar = 'population')
} else {
  # attach the stratum label from the strata lookup table
  impStrataBase <- impBase %>%
    left_join(strataTab)
}
strataImpRatios <- impStrataBase %>%
  group_by(strataVar, sppID) %>%
  summarize(meanPred = mean(predicted),
            meanObs = mean(observed)) %>%
  mutate(ratio = meanObs / meanPred) %>%
  # a 0/0 species (never predicted nor observed) gets a neutral ratio of 1
  mutate(ratio = ifelse(is.na(ratio), 1, ratio)) %>%
  select(strataVar, sppID, meanPred, meanObs, rat = ratio)
# model-assisted population level species importance ratios
populationImpRatios <- impBase %>%
  group_by(sppID) %>%
  summarize(meanPred = mean(predicted),
            meanObs = mean(observed)) %>%
  mutate(ratio = meanObs / meanPred) %>%
  mutate(ratio = ifelse(is.na(ratio), 1, ratio)) %>%
  select(sppID, meanPred, meanObs, rat = ratio)
# model-assisted stand-level species importance ratios, restricted to stands
# that already have stocking ratios
# NOTE(review): unlike the other two tables, NA ratios are NOT replaced with 1
# here — confirm that is intentional
standImpRatios <- impBase %>%
  filter(meta_stand_id %in% unique(standRatios$meta_stand_id)) %>%
  group_by(meta_stand_id, sppID) %>%
  summarize(meanPred = mean(predicted),
            meanObs = mean(observed)) %>%
  mutate(ratio = meanObs / meanPred) %>%
  select(stand = meta_stand_id, sppID, meanPred,
         meanObs, rat = ratio)
```
```{r computeEmpiricalMatrix}
# read in trees, set up predictors
# attach plot-level training predictors to every cruised tree; diameters are
# binned into 1-inch classes for the per-class species models below
wrapTrees <- cruiseDataNoZero %>%
  mutate(diameterClass = round(diameter)) %>%
  left_join(trainingData %>% mutate(plot_id = as.character(plot_id)))
# species-importance and diameter-distribution columns, excluding metadata
sppNames <- names(trainingData)[grep('spp', names(trainingData))]
sppNames <- sppNames[!grepl('meta', sppNames)]
diamNames <- names(trainingData)[grep('diam_', names(trainingData))]
diamNames <- diamNames[!grepl('meta', diamNames)]
# predictors for the per-diameter-class multinomial species model:
# stocking (baph, tph) plus all importance and distro scores
wrapPreds <- c('baph', 'tph', sppNames, diamNames)
predsSide <- paste0(wrapPreds, collapse = '+')
wrapFormula <- as.formula(paste0('common ~ ', predsSide))
# function that will build model to predict the species of a tree of a given
# diameter as a function of baph, tph, sppImp scores, and diam scores.
# Returns either a fitted multinom model or (fallback) a one-row data frame
# of empirical species proportions for that diameter class.
wrap_by_diam <- function(diam) {
  print(diam)
  # subset of trees where diameter = diam; each record is replicated by its
  # rounded expansion factor so stems are weighted by what they represent
  diamTraining <- wrapTrees %>%
    filter(diameterClass == diam) %>%
    mutate(roundedExpansion = round(expansion))
  diamTraining <- diamTraining[rep(row.names(diamTraining), diamTraining$roundedExpansion), ]
  distinctRows <- diamTraining %>%
    distinct()
  # build a model to predict species as a function of stocking and species.
  if(length(unique(diamTraining$common)) > 2 & nrow(distinctRows) > 5) {
    # fit multinomial model to predict probability of a given tree belonging
    # to each species (use predict(mod, newdata, 'probs') to get probs).
    # BUG FIX: multinom() is now wrapped in try() — previously the
    # 'try-error' class check below was dead code because an error would
    # have propagated before the check ever ran
    mod <- try(nnet::multinom(wrapFormula,
                              data = diamTraining, MaxNWts = 10000,
                              maxit = 1000))
    if('try-error' %in% class(mod)) {
      stop(paste0('wrapMod failed to build for diameter class ', diam))
    }
    return(mod)
  } else { # too few species or too few distinct records in this class:
    # fall back to empirical species proportions from the training treelist
    sppTab <- table(diamTraining$common) / nrow(diamTraining)
    sppFrame <- data.frame(t(matrix(sppTab)))
    names(sppFrame) <- dimnames(sppTab)[[1]]
    return(sppFrame)
  }
}
# list of unique diameters
uniqueDiams <- wrapTrees %>%
  arrange(diameterClass) %>%
  select(diameterClass) %>%
  distinct() %>%
  .$diameterClass
# list of unique species names
uniqueCommons <- wrapTrees %>%
  select(common) %>%
  distinct() %>%
  arrange(common) %>%
  .$common
# build models for all diameter classes in parallel; a worker that errors
# yields a 'try-error' element in the result list
wrapMods <- mclapply(uniqueDiams, wrap_by_diam, mc.cores = detectCores() - 2)
names(wrapMods) <- paste('diam', uniqueDiams, sep = '_')
# check to see if wrapMods will work: collect all class strings at once
# instead of growing a vector inside a 1:length() loop
classCheck <- unlist(lapply(wrapMods, class))
if(any(classCheck == 'try-error')) {
  rm(list = 'wrapMods')
  stop('wrapMods failed to write out properly')
}
```
```{r sppDiameterLimits}
# Allowable diameter ranges per species: each species may extend 2 inches
# beyond its cruised range, clamped to the global cruised min/max.
# global diameter limits by species
populationCruiseLimits <- cruiseDataNoZero %>%
  mutate(globalMinD = round(min(diameter)),
         globalMaxD = round(max(diameter))) %>%
  group_by(common) %>%
  summarize(minD = round(min(diameter) - 2),
            maxD = round(max(diameter) + 2),
            globalMinD = unique(globalMinD),
            globalMaxD = unique(globalMaxD)) %>%
  mutate(minAllowD = ifelse(minD < globalMinD, globalMinD, minD),
         maxAllowD = ifelse(maxD > globalMaxD, globalMaxD, maxD)) %>%
  select(common, minAllowD, maxAllowD)
# strata-level diameter limits for each species; the original if/else branches
# were identical except for adding a constant 'population' label, so build the
# shared base once and branch only on that label
strataLimitBase <- cruiseDataNoZero %>%
  left_join(trainingData %>% mutate(plot_id = as.character(plot_id))) %>%
  mutate(globalMinD = round(min(diameter)),
         globalMaxD = round(max(diameter)))
if(is.null(strataVar)) {
  # no stratification variable: one 'population' stratum
  strataLimitBase <- strataLimitBase %>%
    mutate(strataVar = 'population')
}
strataCruiseLimits <- strataLimitBase %>%
  group_by(strataVar, common) %>%
  summarize(minD = round(min(diameter) - 2),
            maxD = round(max(diameter) + 2),
            globalMinD = unique(globalMinD),
            globalMaxD = unique(globalMaxD)) %>%
  mutate(minAllowD = ifelse(minD < globalMinD, globalMinD, minD),
         maxAllowD = ifelse(maxD > globalMaxD, globalMaxD, maxD)) %>%
  select(strataVar, common, minAllowD, maxAllowD)
```
- ---------------------
- Core Groupstrap Results
- ---------------------
```{r groupStrap, results = 'hide'}
##TODO: add a function here to iterate over each set of
#values in metadata and try predicting bapa and tpa
# Stand-level observed means and SEs for BAPH/TPH alongside the full-model
# predictions; only stands with at least `groupSize` plots are kept for the
# groupstrap comparison
bootPolygons <- trainingData %>%
  dplyr::group_by(meta_stand_id) %>%
  dplyr::summarize(nPlots = length(baph),
                   observedBAPH = mean(baph),
                   seObservedBAPH = sd(baph) / sqrt(length(meta_stand_id)),
                   predBAPH = mean(baphPred),
                   observedTPH = mean(tph),
                   seObservedTPH = sd(tph) / sqrt(length(meta_stand_id)),
                   predTPH = mean(tphPred),
                   # 90% CI
                   tScore = qt(1 - (0.1 / 2), nPlots - 1)) %>%
  dplyr::ungroup() %>%
  dplyr::filter(nPlots >= groupSize) # change as needed
# Leave-one-stand-out ("groupstrap") prediction. Refits the stocking models
# with the target stand's plots held out, then predicts back onto that stand.
# Returns a list of three data frames:
#   [[1]] stocking (mean TPH/BAPH), [[2]] diameter distro, [[3]] species imp.
predict_for_one_group <- function(polygon) {
  polygonIdx <- which(bootPolygons$meta_stand_id == polygon)
  catMsg(paste('Polygon', polygonIdx, ":", polygon))
  #subset training data for reduced model fitting
  tdSub <- trainingData %>%
    filter(meta_stand_id != polygon)
  #subset training data for prediction to target polygon
  tdApply <- trainingData %>%
    filter(meta_stand_id == polygon)
  # refit the pooled default stocking models on the reduced data;
  # predictions are back-transformed from the log(y + 1) scale
  baphGS <- baphPooledDefault
  tphGS <- tphPooledDefault
  baphModelSub <- update(baphGS, data = tdSub)
  baPredSub <- exp(predict(baphModelSub,
                           newdata = tdApply)) - 1
  tphModelSub <- update(tphGS, data = tdSub)
  tphPredSub <- exp(predict(tphModelSub,
                            newdata = tdApply)) - 1
  #
  # # make distro and imp predictions using full models because
  # # RF models take too long to refit repeatedly
  # NOTE(review): distroBespokePredictors is hard-coded to NULL here, so the
  # else branches below are dead within this function — confirm whether the
  # outer-scope bespoke predictors should be plumbed in instead
  distroBespokePredictors <- NULL
  if(is.null(distroBespokePredictors)) {
    distroGroupSub <- lapply(1:length(distroVarsIDX), genericModelBuild,
                             ys = tdSub[, distroVarsIDX],
                             predVars = tdSub[, pcScoresIDX])
  } else {
    distroGroupSub <- lapply(1:length(distroVarsIDX), genericModelBuild,
                             ys = tdSub[, distroVarsIDX],
                             predVars = tdSub[, distroBespokePredictors])
  }
  names(distroGroupSub) <- names(tdSub)[distroVarsIDX]
  # predict every diameter-class model on a pixel set; one column per class
  predict_distro_pixels <- function(pixelsData) {
    runOne <- function(J) {
      if(is.null(distroBespokePredictors)) {
        newdata <- pixelsData[, pcScoresIDX]
      } else {
        newdata <- pixelsData[, distroBespokePredictors]
      }
      return(predict(distroGroupSub[[J]], newdata = newdata))
    }
    temp <- data.frame(do.call(cbind, lapply(X = 1:length(distroGroupSub),
                                             FUN = runOne)))
    names(temp) <- names(distroGroupSub)
    return(temp)
  }
  distroFrame <- predict_distro_pixels(tdApply)
  distroOut <- as.data.frame(t(colMeans(distroFrame)))
  # refit species models on the reduced data (stan = F for speed)
  # NOTE(review): 'pca9' is excluded from the importance predictors — confirm
  # this matches the full-model importance fit elsewhere in the report
  impPredictors <- c(names(trainingData)[pcScoresIDX])
  impPredictors <- impPredictors[impPredictors != 'pca9']
  sppModsSub <- lapply(speciesLabs, fit_spp_models,
                       td = tdSub,
                       pcScoresIDX = pcScoresIDX,
                       bespokePredictors = impPredictors,
                       stan = F)
  impPredsSub <- bind_cols(mclapply(sppModsSub,
                                    pixels = tdApply,
                                    predict_spp_importance))
  # normalize stand-mean importance to sum to 1, then zero negatives
  # NOTE(review): zeroing AFTER normalizing means the row may no longer sum
  # to exactly 1 — confirm intentional
  impOut <- as.data.frame(t(colMeans(impPredsSub))) / sum(colMeans(impPredsSub))
  impOut[impOut < 0] <- 0
  ##We want to make baph and tph figures:
  #stand-wise full model
  #stand-wise reduced model
  #Diameter distribution by stand
  #Species importance by stand
  stockOut <- data.frame("meta_stand_id" = polygon,
                         "groupModelPredTPH" = mean(tphPredSub),
                         "groupModelPredBAPH" = mean(baPredSub))
  #
  output <- list(stockOut, distroOut, impOut)
  return(output)
}
#
# # Perform groupstrap on all polygons:
bootResults <- lapply(bootPolygons$meta_stand_id,
                      predict_for_one_group)
#
# pull component `place` out of the i-th polygon's result list
# (1 = stocking, 2 = diameter distro, 3 = species importance);
# [[place]] replaces the roundabout original [place][[1]]
extract_from_list <- function(i, bootResults, place) {
  bootResults[[i]][[place]]
}
#
# # Extract the tph and ba predictions:
bootPolygonsResults <- bootPolygons %>%
  bind_cols(do.call(rbind, lapply(seq_along(bootResults),
                                  extract_from_list,
                                  bootResults = bootResults,
                                  place = 1)))
#
# # Extract the diameter distribution predictions:
distroPredictions <- bind_rows(lapply(seq_along(bootResults),
                                      extract_from_list,
                                      bootResults = bootResults,
                                      place = 2)) %>%
  mutate(meta_stand_id = bootPolygons$meta_stand_id,
         origin = 'model')
#
# # Extract the species importance predictions
impPredictions <- bind_rows(lapply(seq_along(bootResults),
                                   extract_from_list,
                                   bootResults = bootResults,
                                   place = 3)) %>%
  mutate(meta_stand_id = bootPolygons$meta_stand_id,
         origin = 'model')
```
- ### Basal Area per Hectare
```{r groupBAPH}
# Stand-level BAPH comparison: observed plot-based mean with a 90% CI vs the
# full-model and groupstrap predictions
baphTable <- bootPolygonsResults %>%
  dplyr::select(meta_stand_id, nPlots, tScore, contains("BAPH")) %>%
  dplyr::mutate(lowerCI = observedBAPH - tScore * seObservedBAPH,
                upperCI = observedBAPH + tScore * seObservedBAPH) %>%
  dplyr::select(meta_stand_id, nPlots, contains("CI"), contains("BAPH"))
#
pander(baphTable,
       split.table = Inf,
       justify = 'left',
       digits = 2)
#
# long format: one row per stand x prediction source
baphLong <- baphTable %>%
  tidyr::gather(key = fit, value = predicted,
                predBAPH, groupModelPredBAPH)
#
# shared axis limit with a little headroom (max over both columns)
baphMax <- baphLong %>%
  dplyr::select(upperCI, predicted) %>%
  max() + 5
#
baphColors <- c("darkgreen", "darkseagreen")
densityLabels <- c("Full model pixel prediction",
                   "Groupstrap model pixel prediction")
#
# observed vs predicted faceted by source; dashed lines mark +/-10% of the
# 1:1 line, horizontal bars show the observed 90% CI sized by plot count
baphFaceted <- ggplot(data = baphLong,
                      aes(x = observedBAPH, y = predicted, color = fit)) +
  facet_wrap(~ fit) +
  xlab('ST plot prediction') +
  ylab('ST model prediction') +
  ggtitle('Group bootstrap of basal area predictions to approximate model based performance') +
  geom_point() +
  geom_abline(intercept = 0, slope = 1) +
  geom_abline(intercept = 0, slope = 1.1, linetype = 2) +
  geom_abline(intercept = 0, slope = 0.9, linetype = 2) +
  geom_errorbarh(aes(xmin = lowerCI, xmax = upperCI, size = nPlots)) +
  xlim(0, baphMax) +
  ylim(0, baphMax) +
  theme_bw() +
  scale_color_manual(values = baphColors, labels = densityLabels)
print(baphFaceted)
```
- ### Basal Area per Acre
```{r clientBAPA}
#
# convert all per-hectare columns to per-acre; ID and count columns excluded
bapaTable <- baphTable %>%
  mutate_each(., funs(convert_baph_to_bapa), -meta_stand_id, -nPlots)
# vectorized rename (replaces the original for/match loop over names)
names(bapaTable) <- gsub("BAPH", "BAPA", names(bapaTable))
bapaMax <- bapaTable %>%
  select(upperCI, contains("BAPA")) %>%
  max() + 5
# shared ggplot layers for the client-facing observed-vs-predicted plots
clientLayers <- list(geom_point(),
                     geom_abline(intercept = 0, slope = 1),
                     geom_errorbarh(aes(xmin = lowerCI, xmax = upperCI)),
                     theme_bw(),
                     theme(text = element_text(size = 18),
                           title = element_text(size = 18)),
                     geom_abline(intercept = 0, slope = 1.1, linetype = 2),
                     geom_abline(intercept = 0, slope = 0.9, linetype = 2)
)
# reduced (groupstrap) model vs observed
ggplot(bapaTable, aes(x = observedBAPA, y = groupModelPredBAPA)) +
  xlab('Estimate of basal area (ft2/ac) from plot data') +
  ylab('ST reduced model prediction of basal area (ft2/ac)') +
  xlim(0, bapaMax) +
  ylim(0, bapaMax) +
  clientLayers
# full model vs observed
ggplot(bapaTable, aes(x = observedBAPA, y = predBAPA)) +
  xlab('Estimate of basal area (ft2/ac) from plot data') +
  ylab('ST model prediction of basal area (ft2/ac)') +
  xlim(0, bapaMax) +
  ylim(0, bapaMax) +
  clientLayers
```
- ### Trees per Hectare
```{r groupTPH}
#
# Stand-level TPH comparison table with 90% CI, mirroring the BAPH table
tphTable <- bootPolygonsResults %>%
  select(meta_stand_id, nPlots, tScore, contains("TPH")) %>%
  mutate(lowerCI = observedTPH - tScore * seObservedTPH,
         upperCI = observedTPH + tScore * seObservedTPH) %>%
  select(meta_stand_id, nPlots, contains("CI"), contains("TPH"))
pander(tphTable, split.tables = Inf,
       split.cells = Inf,
       justify = 'left',
       digits = 0)
tphLong <- tphTable %>%
  gather(key = fit, value = predicted,
         predTPH, groupModelPredTPH)
tphMax <- tphLong %>%
  select(upperCI, predicted) %>%
  max() + 5
tphColors <- c("darkblue", "deepskyblue")
# BUG FIX: theme_bw() is a *complete* theme and previously came after the
# theme(strip.background/strip.text.x) call, wiping out the blank-strip
# customization; theme_bw() must be applied first, then the overrides
tphGroupFacetPlot <- ggplot(data = tphLong,
                            aes(x = observedTPH, y = predicted, color = fit)) +
  xlab('ST plot prediction') +
  ylab('ST model prediction') +
  geom_point() +
  geom_abline(intercept = 0, slope = 1) +
  geom_abline(intercept = 0, slope = 1.1, linetype = 2) +
  geom_abline(intercept = 0, slope = 0.9, linetype = 2) +
  geom_errorbarh(aes(xmin = lowerCI, xmax = upperCI, size = nPlots)) +
  scale_color_manual("", values = tphColors, labels = densityLabels) +
  ggtitle('Group bootstrap of trees per hectare predictions to approximate model-based performance') +
  xlim(0, tphMax) +
  ylim(0, tphMax) +
  theme_bw() +
  theme(strip.background = element_blank(), strip.text.x = element_blank()) +
  facet_wrap( ~ fit)
print(tphGroupFacetPlot)
```
- ### Trees per Acre
```{r clientTPA}
#
# convert per-hectare columns to per-acre
# NOTE(review): the helper is named convert_acres_to_hectares but is applied
# to per-hectare values to yield per-acre — confirm the conversion direction
tpaTable <- tphTable %>%
  mutate_each(., funs(convert_acres_to_hectares), -meta_stand_id, -nPlots)
# vectorized rename (replaces the original for/match loop over names)
names(tpaTable) <- gsub("TPH", "TPA", names(tpaTable))
tpaMax <- tpaTable %>%
  select(upperCI, contains("TPA")) %>%
  max() + 5
# reduced (groupstrap) model vs observed
tpaTreelistClientPlot <- ggplot(tpaTable, aes(x = observedTPA,
                                              y = groupModelPredTPA)) +
  xlab('Estimate of trees per acre from plot data') +
  ylab('ST reduced model prediction of trees per acre') +
  xlim(0, tpaMax) + ylim(0, tpaMax) + clientLayers
# full model vs observed
tpaFullModelClientPlot <- ggplot(tpaTable, aes(x = observedTPA,
                                               y = predTPA)) +
  xlab('Estimate of trees per acre from plot data') +
  ylab('ST model prediction of trees per acre') +
  xlim(0, tpaMax) + ylim(0, tpaMax) + clientLayers
print(tpaTreelistClientPlot)
print(tpaFullModelClientPlot)
```
- ### Diameter Distribution
```{r groupDistro}
# stands where every plot has zero trees; excluded from the comparison plots
zeroTreeStands <- trainingData %>%
  group_by(meta_stand_id) %>%
  summarize(nPlots = n_distinct(plot_id),
            nZeroes = length(unique(plot_id[tph == 0]))) %>%
  filter(nPlots == nZeroes) %>%
  .$meta_stand_id
# stand-mean observed diameter distributions stacked with the groupstrap
# predictions, reshaped long (one row per stand x diameter class x origin)
fullDist <- trainingData %>%
  dplyr::select(meta_stand_id, contains('diam')) %>%
  group_by(meta_stand_id) %>%
  dplyr::summarise_each(funs(mean)) %>%
  ungroup() %>%
  filter(meta_stand_id %in% bootPolygons$meta_stand_id) %>%
  mutate(origin = 'trainingData') %>%
  bind_rows(distroPredictions) %>%
  gather(key = diamClass, value = value, -origin, -meta_stand_id) %>%
  # strip the 'diam_' prefix so the class is a plottable number
  mutate(diamClass = as.numeric(gsub('diam_', '', diamClass))) %>%
  filter(value > 0) %>%
  filter(meta_stand_id %nin% zeroTreeStands)
#
# observed vs modeled distro per stand, distinguished by line type
ggplot(fullDist, aes(x = diamClass, y = value, linetype = origin)) +
  geom_point(alpha = 0.1) + geom_line(alpha = 0.9) +
  facet_wrap( ~ meta_stand_id) + theme_bw() +
  ylab("Relative presence in distribution")
```
- ### Species Importance Value
```{r groupImp}
#
# observed stand-mean species importance (non-zero plots only) stacked with
# the groupstrap predictions, long by species
bootSpp <- trainingData %>%
  dplyr::select(meta_stand_id,
                one_of(names(trainingData)[impVarsIDX]),
                tph) %>%
  group_by(meta_stand_id) %>%
  filter(tph > 0 ) %>%
  dplyr::summarise_each(funs(mean)) %>%
  select(-tph) %>%
  ungroup() %>%
  filter(meta_stand_id %in% bootPolygons$meta_stand_id) %>%
  mutate(origin = 'trainingData') %>%
  bind_rows(impPredictions) %>%
  gather(key = spp, value = value, -origin, -meta_stand_id) %>%
  # 'spp_<id>' column names become numeric species IDs, then common names
  mutate(spp = as.numeric(gsub('spp_', '', spp)),
         common = get_common(spp)) %>%
  filter(meta_stand_id %nin% zeroTreeStands)
# 12-color qualitative palette, interpolated to one color per species
pal12 <- c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99",
           "#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6", "#6A3D9A",
           "#FFFF99", "#B15928")
#Get correct number of distinct, non-annoying colors:
colorCount <- length(unique(bootSpp$common))
useColors <- colorRampPalette(pal12)(colorCount)
# stacked species composition, observed vs modeled, per stand
impGroupBarPlot <- ggplot(bootSpp, aes(x = origin, y = value)) +
  geom_bar(stat = "identity", aes(fill = common)) +
  scale_fill_manual(values = useColors,
                    guide = guide_legend(nrow = 18)) +
  facet_wrap(~ meta_stand_id) + theme_bw() +
  ylab('Species Importance Value')
# per-species importance profiles, observed vs modeled, per stand
impGroupLinePlot <- ggplot(bootSpp, aes(x = common, y = value)) +
  geom_line(aes(color = origin, group = origin)) +
  ylab("Species importance") +
  facet_wrap(~ meta_stand_id) + theme_bw() +
  scale_color_manual(values = c("black", "skyblue2")) +
  theme(axis.text.x=element_text(angle=45, hjust = 1))
print(impGroupBarPlot)
print(impGroupLinePlot)
```
- ------------
- Memory Flush
- ------------
```{r saveCore}
# save objects
# core model set consumed downstream by the prediction pipeline
save(tphPooled, baphPooled, distroSet,
     sppMods, predict_spp_importance,
     baphPrecision, tphPrecision,
     impBespokePredictors, distroBespokePredictors,
     transform_the_predictors,
     transform_the_baph_prediction,
     transform_the_tph_prediction,
     wrapMods,
     strataVar,
     populationCruiseLimits,
     strataCruiseLimits, stratTab,
     baphCoeffTable, tphCoeffTable,
     baphPredFrame, tphPredFrame,
     file = file.path(modelDir, 'core_models.Rda'))
# model-assisted ratio tables at stand / strata / population level
save(standRatios,
     strataRatios,
     populationRatios,
     strataVar,
     strataDistroRatios,
     populationDistroRatios,
     standDistroRatios,
     strataImpRatios,
     populationImpRatios,
     standImpRatios,
     maCIs,
     file = file.path(modelDir, 'standRatios.Rda'))
# report tables; row.names spelled out as FALSE (F is reassignable)
write.csv(baphTable, file.path(reportDir, "baphTable.csv"), row.names = FALSE)
write.csv(tphTable, file.path(reportDir, "tphTable.csv"), row.names = FALSE)
write.csv(bapaTable, file.path(reportDir, "bapaTable.csv"), row.names = FALSE)
write.csv(tpaTable, file.path(reportDir, "tpaTable.csv"), row.names = FALSE)
```
- --------------
- Total Height Model
- --------------
```{r heightPrep, eval=eval_height}
# median basal area per acre (1 m2/ha = 4.356 ft2/ac)
medianBAPA <- median(trainingData$baph) * 4.356
# convert total/stopper heights to feet
cruiseDataNoZero$hm <- convert_meters_to_feet(cruiseDataNoZero$hm)
cruiseDataNoZero$hp <- convert_meters_to_feet(cruiseDataNoZero$hp)
cruiseDataNoZero$hs <- convert_meters_to_feet(cruiseDataNoZero$hs)
# property centroid used to glean nearby FIA tree records
lat <- mean(parse_lat_lon(cruiseData$loc, coord = 'lat'))
lon <- mean(parse_lat_lon(cruiseData$loc, coord = 'lon'))
radius <- 150
gleanedFile <- file.path(modelDir, 'gleanedList.Rda')
if (file.exists(gleanedFile)) {
  load(gleanedFile)
} else {
  # widen the glean radius (capped at 250; units per glean_tree_records —
  # confirm) until every cruised species has at least 3 records
  minSpp <- 0
  # scalar loop condition: && (short-circuit) instead of elementwise &
  while (radius < 250 && minSpp < 3) {
    gleanedList <- glean_tree_records(lat, lon, radius)
    gleanSummary <- gleanedList$trees %>%
      mutate(st_species = fia2st(SPCD)) %>%
      filter(st_species %in% unique(cruiseData$st_species)) %>%
      group_by(st_species) %>%
      summarize(count = length(st_species))
    minSpp <- min(gleanSummary$count)
    radius <- radius + 50
  }
  save(gleanedList, file = gleanedFile)
}
# we need to drop snags
# height-modeling dataset: cruised trees joined to plot-level stocking
cruiseDataHeight <- cruiseDataNoZero %>%
  select(plot_id, st_species, common, diameter, expansion,
         hm, tqi, start_height_1, hs, hp) %>%
  left_join(trainingData %>%
              mutate(plot_id = as.character(plot_id)) %>%
              select(plot_id, baph, tph))
# create a plottable height dataset (long by height type)
cruiseDataHtPlot <- cruiseDataHeight %>%
  select(common, diameter, hm, tqi, hs, hp) %>%
  gather(heightType, height, -common, -diameter, -tqi) %>%
  mutate(height_type = case_when(heightType == 'hm' ~ 'total',
                                 heightType == 'hs' ~ 'saw_stopper',
                                 heightType == 'hp' ~ 'pulp_stopper')) %>%
  filter(!is.na(height))
totalAndMerchHtPlot <- ggplot(data = cruiseDataHtPlot,
                              aes(x = diameter, y = height, color = height_type)) +
  geom_point(alpha = 0.4) +
  facet_wrap(~ common, labeller = labeller(common = label_wrap_gen(width = 15))) +
  theme_bw()
print(totalAndMerchHtPlot)
```
- ## FIA height models
```{r heightFIA, eval=eval_height}
# drop species that aren't in FIA database
cruiseDataFia <- cruiseDataHeight %>%
  mutate(SPCD = st2fia(st_species)) %>%
  filter(SPCD %in% unique(gleanedList$trees$SPCD))
# fit the FIA-augmented height model at the property's median stand density
htModelFIA <- freq_height(species = cruiseDataFia$st_species,
                          diameter = cruiseDataFia$diameter,
                          height = cruiseDataFia$hm,
                          expansion = cruiseDataFia$expansion,
                          bapa = medianBAPA,
                          latitude = lat,
                          longitude = lon,
                          speciesCode = 'ST',
                          gleanedData = gleanedList)
# predict total height for every cruised tree (including FIA-absent species)
cruiseDataHeight$predHeightFia <- predict(htModelFIA,
                                          newSpp = cruiseDataHeight$st_species,
                                          newDiam = cruiseDataHeight$diameter,
                                          speciesCode = 'ST')
# only plot observed-vs-predicted if at least one total height was measured
if(!all(is.na(cruiseDataHeight$hm))) {
  fiaHtPlot <- ggplot(aes(x = hm, y = predHeightFia),
                      data = cruiseDataHeight) +
    geom_point(alpha = 0.2) +
    theme_bw() +
    geom_abline() +
    facet_wrap(~common) +
    ggtitle('FIA height models')
  print(fiaHtPlot)
}
```
- ## Local height models
```{r heightLocal, eval=eval_height}
# drop species that have fewer than 5 observations
cruiseDataLocal <- cruiseDataHeight %>%
  group_by(common) %>%
  mutate(n = n()) %>%
  filter(n >= 5) %>%
  ungroup() %>%
  filter(!is.na(hm))
# if no total height observations, do not try to fit local height models
if(nrow(cruiseDataLocal) > 0) {
  # cruise-data-only height model (no FIA gleaning)
  htModelLocal <- freq_height(species = cruiseDataLocal$st_species,
                              diameter = cruiseDataLocal$diameter,
                              height = cruiseDataLocal$hm)
  # predict back onto ALL cruised trees, not just the fitting subset
  cruiseDataHeight$predHeightLocal <- predict(htModelLocal,
                                              newSpp = cruiseDataHeight$st_species,
                                              newDiam = cruiseDataHeight$diameter,
                                              speciesCode = 'ST')
  localHtPlot <- ggplot(aes(x = hm, y = predHeightLocal),
                        data = cruiseDataHeight) +
    geom_point(alpha = 0.2) +
    theme_bw() +
    geom_abline() +
    facet_wrap(~common, labeller =
                 labeller(common = label_wrap_gen(width = 15)))
  print(localHtPlot)
}
```
- ## Final height models
Diagnostics for the final height model are displayed below. The default choice is the local freq_height model. Change this if you want to use the FIA model or a hybrid FIA/local model.
```{r heightFinal, eval=eval_height}
# htModel <- htModelFIA
htModel <- htModelLocal
# make predictions to cruised trees using final model
cruiseDataHeight$predHeightFinal <- predict(htModel,
                                            newSpp = cruiseDataHeight$st_species,
                                            newDiam = cruiseDataHeight$diameter,
                                            speciesCode = 'ST')
# long table of observed (hm) vs predicted total heights.
# BUG FIX: the case_when previously tested source == 'total_height', which
# never matched the gathered column name 'hm', so every observed height was
# silently labeled NA and dropped from the color mapping
heightComp <- cruiseDataHeight %>%
  select(diameter, common, expansion,
         hm, predHeightFinal) %>%
  gather(source, totalHeight, hm, predHeightFinal) %>%
  mutate(source = case_when(source == 'hm' ~ 'observed',
                            source == 'predHeightFinal' ~ 'predicted'))
htDiamPlot <- ggplot(data = heightComp,
                     aes(x = diameter, y = totalHeight,
                         color = source)) +
  geom_point(alpha = 0.4) +
  theme_bw() +
  facet_wrap(~ common,
             labeller =
               labeller(common = label_wrap_gen(width = 15))) +
  ylab('Total height (ft)') +
  xlab('DBH (in)') +
  ggtitle('Total height model predictions vs observed values')
print(htDiamPlot)
# client version with brand colors (the original comments had the two
# labels swapped: #30363E is the dark charcoal, #347a35 the green)
clientHt <- htDiamPlot +
  scale_color_manual(values = c('#30363E', #stGrey (dark charcoal)
                                '#347a35')) #stDarkGreen
print(clientHt)
print(plot(htModel))
save(htModel, file = file.path(modelDir, 'htModel.Rda'))
```
- --------------
- Merchantable Height Model
- --------------
```{r auxProportionalSystem}
# generic proportional assignment by diameter, spp or sppGroup, and height class
# can be used for any tree-level variable
# Builds expansion-weighted proportion tables (pooled and per-stand) for a
# tree-level attribute, and returns them bundled with a predict_aux() closure
# that samples attribute values for a new treelist from those proportions.
# NOTE(review): the grouping column `group` only exists if at least one of
# spp / sppGroup / htClass / groupVars is set — confirm callers always do so.
build_aux_model <- function(trees, # modeling dataset
                            attribute, # name of attribute to be modeled
                            stand = F, # build stand-specific tables too?
                            spp = F, # group by species?
                            sppGroup = NULL, # group by a species group from st_sp2 instead of common name?
                            htClass = F, # group by height class?
                            groupVars = NULL # other grouping variable from metadata?
                            ) {
  # identify stands that will get stand-specific predictions
  # (stands with at least 5 non-zero plots)
  if(stand) {
    modStands <- trainingData %>%
      filter(plot_id %nin% zeroPlots) %>%
      select(meta_stand_id, plot_id) %>%
      group_by(meta_stand_id) %>%
      summarize(nPlots = n()) %>%
      filter(nPlots >= 5) %>%
      .$meta_stand_id
  } else {
    modStands <- c()
  }
  # attach plot-level training metadata to the trees
  dat <- trees %>%
    left_join(trainingData %>% mutate(plot_id = as.character(plot_id)))
  # define the group that each tree is in
  if(spp == T) {
    dat$group <- dat$common
  }
  if(!is.null(sppGroup)) {
    if(sppGroup %nin% c('class', 'group', 'genus')) {
      stop('sppGroup must be one of these: class, group, or genus')
    }
    dat$group <- st_sp2[[sppGroup]][match(dat$common, st_sp2$common)]
  }
  if(htClass == T) {
    # predict height if tree does not have total height
    dat$finalPredHeight <- predict(htModel,
                                   newSpp = dat$st_species,
                                   newDiam = dat$diameter,
                                   speciesCode = 'ST')
    # height for height class is either measured height, or predicted
    dat$finalPredHeight[!is.na(dat$total_height)] <- dat$total_height[!is.na(dat$total_height)]
    # maximum observed height rounded up to nearest 10
    maxHt <- 10 * ceiling(max(dat$finalPredHeight, na.rm = T) / 10)
    # 10 ft height classes, labeled by their midpoints
    dat$htClass <- cut(dat$finalPredHeight,
                       breaks = seq(0, maxHt, by = 10),
                       labels = seq(5, maxHt - 5, by = 10))
    # tack height class onto existing groupings
    dat$group <- paste(dat$group, dat$htClass, sep = '_')
  }
  if(!is.null(groupVars)) {
    # fold any extra metadata grouping variables into the group key
    dat <- dat %>%
      mutate_at(vars(one_of(groupVars)), funs(as.character)) %>%
      unite(group, one_of('group', groupVars), sep = '_')
  }
  # proportions in each group x diameter combination (expansion-weighted)
  tableVars <- c('group', 'diameter')
  propTab <- dat %>%
    select(one_of(tableVars), expansion,
           attr = one_of(attribute)) %>%
    mutate(diameter = round(diameter)) %>%
    group_by_at(vars(one_of(tableVars), attr)) %>%
    summarize(n = sum(expansion)) %>%
    mutate(prop = n / sum(n)) %>%
    select(-n) %>%
    ungroup()
  # proportions in each group x diameter x stand combination, kept only for
  # stands with enough plots (modStands)
  standGroupVars <- c('meta_stand_id', 'group', 'diameter')
  propTabStand <- dat %>%
    select(one_of(standGroupVars), expansion,
           attr = one_of(attribute)) %>%
    mutate(diameter = round(diameter)) %>%
    group_by_at(vars(one_of(standGroupVars), attr)) %>%
    summarize(n = sum(expansion)) %>%
    mutate(prop = n / sum(n)) %>%
    select(-n) %>%
    ungroup() %>%
    filter(meta_stand_id %in% modStands)
  # Closure returned to callers: samples attribute values for treeList rows
  # from the proportion tables stored in `model` (the list built below).
  predict_aux <- function(treeList, model) {
    propTab <- model$propTab
    propTabStand <- model$propTabStand
    attribute <- model$attribute
    spp <- model$spp
    sppGroup <- model$sppGroup
    groupVars <- model$groupVars
    htClass <- model$htClass
    # rebuild the same group key on the prediction treelist; `rec` preserves
    # the original row order for the final re-join
    temp <- treeList %>%
      mutate(diameter = round(diameter)) %>%
      mutate(rec = row_number())
    if(spp == T) {
      temp$group <- temp$common
    }
    if(!is.null(sppGroup)) {
      temp$group <- st_sp2[[sppGroup]][match(temp$common, st_sp2$common)]
    }
    if(htClass == T) {
      maxHt <- 10 * ceiling(max(temp$total_height, na.rm = T) / 10)
      temp$htClass <- cut(temp$total_height,
                          breaks = seq(0, maxHt, by = 10),
                          labels = seq(5, maxHt - 5, by = 10))
      temp$group <- paste(temp$group, temp$htClass, sep = '_')
    }
    if(!is.null(groupVars)) {
      temp <- temp %>%
        mutate_at(vars(one_of(groupVars)), funs(as.character)) %>%
        unite(group, one_of('group', groupVars), sep = '_')
    }
    # unique diameter x group combinations
    combos <- temp %>%
      select(group, diameter) %>%
      distinct() %>%
      arrange(group, diameter)
    # sample attr values for all trees in one group x diameter cell,
    # preferring stand-level proportions and falling back to pooled ones
    predict_by_group <- function(grp, diam) {
      subTemp <- temp %>%
        filter(round(diameter) == diam,
               group == grp)
      # grab stand-level proportions if stand is in table
      # NOTE(review): unique(subTemp$meta_stand_id) can have length > 1 if
      # the treelist spans stands, which would break this scalar if() —
      # confirm predict_aux is always called one stand at a time
      if(unique(subTemp$meta_stand_id) %in% unique(propTabStand$meta_stand_id)) {
        probs <- propTabStand %>%
          ungroup() %>%
          filter(group == grp, meta_stand_id == unique(treeList$meta_stand_id)) %>%
          mutate(diamDist = abs(diameter - diam)) %>%
          filter(diamDist == min(diamDist)) %>%
          select(-diamDist)
        # stand table had nothing for this group: fall back to pooled table
        if(nrow(probs) < 1) {
          probs <- propTab %>%
            ungroup() %>%
            filter(group == grp) %>%
            mutate(diamDist = abs(diameter - diam)) %>%
            filter(diamDist == min(diamDist)) %>%
            select(-diamDist)
        }
        # still nothing and groups carry a height class: decompose the key
        # and match on nearest diameter then nearest height class
        # NOTE(review): `spp` here shadows the spp flag unpacked from the
        # model above — harmless at this point but worth renaming
        if(nrow(probs) < 1 & htClass) {
          decomposed <- subTemp %>%
            mutate(htClass = as.numeric(strsplit(grp, '_')[[1]][2]),
                   common = strsplit(grp, '_')[[1]][1])
          spp <- unique(decomposed$common)
          ht <- unique(decomposed$htClass)
          probs <- propTab %>%
            ungroup() %>%
            separate(group, c('common', 'htClass'), sep = '_') %>%
            mutate(htClass = as.numeric(htClass)) %>%
            filter(common == spp) %>%
            mutate(diamDist = abs(diameter - diam),
                   htDist = abs(htClass - ht)) %>%
            filter(diamDist == min(diamDist)) %>%
            filter(htDist == min(htDist)) %>%
            select(-diamDist, -htDist)
        }
      } else { # otherwise used pooled proportions
        probs <- propTab %>%
          ungroup() %>%
          filter(group == grp) %>%
          mutate(diamDist = abs(diameter - diam)) %>%
          filter(diamDist == min(diamDist)) %>%
          select(-diamDist)
      }
      # several candidate values: sample by proportion; exactly one: assign it
      # (zero candidates leaves attr unset for these rows)
      if(nrow(probs) > 1) {
        subTemp$attr <- sample(probs$attr, size = nrow(subTemp),
                               prob = probs$prop, replace = T)
      }
      if(nrow(probs) == 1) {
        subTemp$attr <- probs$attr
      }
      return(subTemp)
    }
    # run every group x diameter combination
    predict_all <- function(i) {
      run <- combos[i, ]
      preds <- predict_by_group(diam = run$diameter,
                                grp = run$group)
    }
    # re-join sampled attr onto the original row order and return the vector
    outTrees <- bind_rows(lapply(1:nrow(combos), predict_all)) %>%
      select(rec, attr)
    outPreds <- temp %>%
      left_join(outTrees, by = 'rec') %>%
      arrange(rec) %>%
      .$attr
    return(outPreds)
  }
  # bundle tables, settings, and the predictor closure
  outList <- list(
    propTab = propTab, propTabStand = propTabStand,
    predict_aux = predict_aux,
    attribute = attribute,
    spp = spp,
    sppGroup = sppGroup,
    htClass = htClass,
    groupVars = groupVars)
  return(outList)
}
```
- ## Merch height proportional assignment system
```{r merch_heightProportional, eval = eval_merch_height, results = 'markup'}
# TODO incorporate HFM-style two-stage stopper height modeling framework
# e.g. predict occurrence of stopper height, then predict stopper height for
# trees that get a stopper height
# if NA merch heights are trees on which merch height was not observed,
# filter those out.
# otherwise, do not filter out NAs since an NA might mean no stopper height
merchDat <- cruiseDataNoZero
#Saw stopper height
hsPropSet <- build_aux_model(trees = merchDat,
                             attribute = 'hs',
                             spp = TRUE,
                             sppGroup = NULL,
                             htClass = FALSE,
                             stand = FALSE,
                             groupVars = NULL)
predict_hs <- hsPropSet$predict_aux
#Pulp stopper height
hpPropSet <- build_aux_model(trees = merchDat,
                             attribute = 'hp',
                             spp = TRUE,
                             sppGroup = NULL,
                             htClass = FALSE,
                             stand = FALSE,
                             groupVars = NULL)
predict_hp <- hpPropSet$predict_aux
# full treelist with stand assignments for per-stand prediction
fullTl <- cruiseDataNoZero %>%
  left_join(trainingData %>% mutate(plot_id = as.character(plot_id))) %>%
  filter(!is.na(meta_stand_id))
stands <- unique(fullTl$meta_stand_id)
# predict merch height then summarize by diameter and species for each stand
compTab <- bind_rows(lapply(stands,
  function(stand) {
    standTl <- fullTl %>%
      filter(meta_stand_id == stand)
    standTl$hs_pred <- predict_hs(
      treeList = standTl,
      model = hsPropSet)
    standTl$hp_pred <- predict_hp(
      treeList = standTl,
      model = hpPropSet
    )
    out <- standTl %>%
      select(meta_stand_id, diameter, common, expansion,
             hs, hp, hs_pred, hp_pred) %>%
      gather(source, mht, hs_pred, hs, hp_pred, hp) %>%
      group_by(meta_stand_id, diameter, common, source) %>%
      summarize(meanMerchHeight = mean(mht, na.rm = TRUE),
                nTrees = n()) %>%
      # BUG FIX: observed pulp stopper heights ('hp') were previously
      # labeled 'predicted' because only 'hs' mapped to 'observed'
      mutate(source = ifelse(source %in% c('hs', 'hp'),
                             'observed', 'predicted'))
    # (unused tpa/bapa intermediate columns from the original were removed;
    # the summary only uses mht)
    return(out)
  })
)
# make plot a little better
compTab$source <- factor(compTab$source, levels = c('observed', 'predicted'))
ggplot(data = compTab, aes(x = diameter, y = meanMerchHeight,
                           color = source)) +
  geom_jitter(alpha = 0.5) +
  theme_bw() +
  facet_wrap(~ common,
             labeller = labeller(common = label_wrap_gen(width = 15))) +
  ylab('mean merchantable height (ft)') +
  xlab('DBH (in)')
# if you are happy with the proportional assignment, you can skip the parametric
# modeling section.
```
- ## Parametric merch height model
- ```{r merch_heightModel, eval = eval_merch_height}
- ### This default should change to beta regression
- #
- # merchTrain <- cruiseDataNoZero %>%
- # select(st_species, common, diameter, hm, hs, hp) %>%
- # mutate(predHt = predict(htModel,
- # newSpp = st_species,
- # newDiam = diameter,
- # speciesCode = "ST")) %>%
- # mutate(hs_pct = hs / hm,
- # pred_hs_pct = hs / predHt,
- # hp_pct = hp / hm,
- # pred_hp_pct = hp / predHt)
- #
- # # Counts the percentage of records that have a merch height recorded
- # pctData <- merchTrain %>%
- # mutate(hasHs = ifelse(!is.na(hs) & !is.na(hm), T, F),
- # hasHp = ifelse(!is.na(hp) & !is.na(hm), T, F)) %>%
- # summarize(hasHs = mean(hasHs),
- # hasHp = mean(hasHp))
- #
- # numHsPct <- pctData$hasHs * 100
- #
- # numHpPct <- pctData$hasHp * 100
- #
- # #
- # catMsg(paste(round(numHsPct, 2), "percent of the plot data had saw stopper and total
- # heights recorded", sep = " "))
- #
- # catMsg(paste(round(numHpPct, 2), "percent of the plot data had pulp stopper and total
- # heights recorded", sep = " "))
- #
- #
- # # If 15% or more trees have merch ht and
- # # total height, models are based on recorded data.
- # # If less than 15%, predicted total height will be
- # # used. 15% is arbitrary and can be changed.
- # if (numHsPct >= .15) {
- # hsModel <- lmer(asin(hs_pct) ~ diameter +
- # (1 + diameter | common),
- # data = merchTrain)
- # } else {
- # hsModel <- lmer(asin(pred_hs_pct) ~ diameter +
- # (1 + diameter | common),
- # data = merchTrain)
- #
- # warning("Predicted total heights were used")
- # }
- #
- # if (numHpPct >= .15) {
- # hpModel <- lmer(asin(hp_pct) ~ diameter +
- # (1 + diameter | common),
- # data = merchTrain)
- # } else {
- # hpModel <- lmer(asin(pred_hp_pct) ~ diameter +
- # (1 + diameter | common),
- # data = merchTrain)
- #
- # warning("Predicted total heights were used")
- # }
- #
- # summary(hsModel)
- # summary(hpModel)
- #
- # # predict method for asin - MODIFY if changing the model form
- # predict_merch <- function(treeList, model) {
- # predPcts <- predict(model$model, newdata = treeList,
- # allow.new.levels = T)
- #
- # predHeights <- treeList$total_height * sin(predPcts) # for arcsine transformation
- #
- # return(predHeights)
- # }
- #
- # hsModelSet <- list(hsModel = hsModel, hpModel = hpModel, predict_aux = predict_merch)
- #
- #
- # if (pctRecordTotHt >= .15) {
- # merchTrain$merch_hat <- predict_merch(treeList = merchTrain,
- # model = merchModelSet)
- # } else {
- # merchTrain$total_height <- merchTrain$predHt
- # merchTrain$merch_hat <- predict_merch(treeList = merchTrain,
- # model = merchModelSet)
- # }
- #
- # ggplot(merchTrain, aes(x = stop_height_1, y = merch_hat)) +
- # geom_point(alpha = 0.5) +
- # facet_wrap(~ common,
- # labeller = labeller(common = label_wrap_gen(width = 15))) +
- # theme_bw() +
- # geom_abline(intercept = 0, slope = 1) +
- # xlab("Observed merch height (feet)") +
- # ylab("Predicted merch height (feet)")
- #
- ```
Select either the parametric or the proportional merch model system. The default choice is the proportional assignment system. Change this if you want to use a statistical model.
```{r pickMerchModel, eval = eval_merch_height}
# by default, the merch model is the proportional assignment system
# change to merch Model if you desire to use the statistical model
hsSet <- hsPropSet  # saw stopper height model (proportional assignment)
hpSet <- hpPropSet  # pulp stopper height model (proportional assignment)
# merchSet <- merchModelSet
# Persist the selected model sets for downstream prediction scripts
save(hsSet, hpSet, file = file.path(modelDir, 'merchModel.Rda'))
```
- --------------
- Grade Model
- --------------
```{r grade, eval = eval_grade}
# TODO: Convert to segment/product handling:
# build a product field based on how we want to predict grade
# e.g. if multiple segments, slap segment lengths/grades into one field
# if NA grades are trees on which grade was not observed,
# filter those out.
# otherwise, do not filter out NAs since an NA might mean no grade assignment!
# table(cruiseDataNoZero$common, cruiseDataNoZero$segment_product_1, useNA = 'always')
gradeDat <- cruiseDataNoZero %>%
  filter(!is.na(tqi))
# Proportional-assignment model for tree grade ('tqi')
gradeSet <- build_aux_model(trees = gradeDat,
                            attribute = 'tqi',
                            spp = TRUE,
                            sppGroup = NULL,
                            htClass = FALSE,
                            stand = FALSE,
                            groupVars = NULL)
predict_grade <- gradeSet$predict_aux
# test it out
fullTl <- cruiseDataNoZero %>%
  left_join(trainingData %>% mutate(plot_id = as.character(plot_id))) %>%
  filter(!is.na(meta_stand_id))
stands <- unique(fullTl$meta_stand_id)
# predict grade then summarize by diameter and species for each stand
compTabGrade <- bind_rows(lapply(stands, function(stand) {
  standTl <- fullTl %>%
    filter(meta_stand_id == stand)
  standTl$grade_pred <- predict_grade(treeList = standTl,
                                      model = gradeSet)
  out <- standTl %>%
    select(meta_stand_id, diameter, common, expansion,
           tqi, grade_pred) %>%
    gather(source, grade, tqi, grade_pred, -meta_stand_id,
           -diameter, -common, -expansion) %>%
    mutate(tpa = expansion) %>%
    group_by(meta_stand_id, diameter, common, source, grade) %>%
    summarize(gradeTpa = sum(tpa)) %>%
    # grade is peeled off the grouping by summarize(), so this proportion
    # is each grade's share of TPA within stand/diameter/species/source
    mutate(gradeProp = gradeTpa / sum(gradeTpa)) %>%
    ungroup() %>%
    # BUG FIX: the gather() above emits source values 'tqi' and 'grade_pred';
    # the old test against 'segment_product_1' never matched, so every row
    # (including the observed grades) was labeled 'predicted'
    mutate(source = ifelse(source == 'tqi', 'observed', 'predicted'))
  return(out)
}))
# TODO think of a way to plot grade model diagnostics
save(gradeSet, file = file.path(modelDir, 'gradeModel.Rda'))
```
- --------------
- Crown Ratio Model
- --------------
- ## Crown ratio proportional assignment system
```{r crownRatio_proportional, eval = eval_crown}
# Proportional-assignment model for crown ratio ('thi')
crownDat <- cruiseDataNoZero %>%
  filter(!is.na(thi))
crownPropSet <- build_aux_model(trees = crownDat,
                                attribute = 'thi',
                                spp = TRUE,
                                sppGroup = NULL,
                                htClass = FALSE,
                                stand = FALSE,
                                groupVars = NULL)
# Attach stand assignments to the tree list via the training data
fullTl <- cruiseDataNoZero %>%
  left_join(trainingData %>% mutate(plot_id = as.character(plot_id))) %>%
  filter(!is.na(meta_stand_id))
stands <- unique(fullTl$meta_stand_id)
predict_crown <- crownPropSet$predict_aux
# predict crown ratio then summarize by diameter and species for each stand
compTabCrown <- bind_rows(lapply(stands, function(stand) {
  standTl <- fullTl %>%
    filter(meta_stand_id == stand)
  standTl$crownRatioPred <- predict_crown(treeList = standTl,
                                          model = crownPropSet)
  out <- standTl %>%
    select(meta_stand_id, diameter, common, expansion,
           thi, crownRatioPred) %>%
    gather(source, crown_ratio, thi, crownRatioPred, -meta_stand_id,
           -diameter, -common, -expansion) %>%
    mutate(tpa = expansion,
           bapa = calculate_inches_to_ba_ft2(diameter) * expansion) %>%
    group_by(meta_stand_id, diameter, common, source) %>%
    # na.rm = TRUE matches the merch height diagnostic; the observed
    # column can contain NAs in the unfiltered tree list
    summarize(meanCrownRatio = mean(crown_ratio, na.rm = TRUE),
              nTrees = n()) %>%
    # BUG FIX: gather() emits source values 'thi' and 'crownRatioPred';
    # the old test against 'crownRatio' never matched, so all rows were
    # labeled 'predicted'
    mutate(source = ifelse(source == 'thi', 'observed', 'predicted'))
  return(out)
}))
# make plot a little better
compTabCrown$source <- factor(compTabCrown$source,
                              levels = c('observed', 'predicted'))
# ggplot(data = compTabCrown, aes(x = diameter, y = meanCrownRatio,
#                                 color = source)) +
#   geom_point(alpha = 0.5) +
#   theme_bw() +
#   facet_wrap(~ common,
#              labeller = labeller(common = label_wrap_gen(width = 15))) +
#   ylab('mean crown ratio') +
#   xlab('DBH (in)')
# if you are happy with the proportional assignment, you can skip the parametric
# modeling section.
### DAMAGE ####
# Proportional-assignment model for thinning damage ('thinDamage')
damageDat <- cruiseDataNoZero %>%
  filter(!is.na(thinDamage))
damagePropSet <- build_aux_model(trees = damageDat,
                                 attribute = 'thinDamage',
                                 spp = TRUE,
                                 sppGroup = NULL,
                                 htClass = FALSE,
                                 stand = FALSE,
                                 groupVars = NULL)
fullTl <- cruiseDataNoZero %>%
  left_join(trainingData %>% mutate(plot_id = as.character(plot_id))) %>%
  filter(!is.na(meta_stand_id))
stands <- unique(fullTl$meta_stand_id)
predict_damage <- damagePropSet$predict_aux
# predict damage then summarize by diameter and species for each stand
compTabDamage <- bind_rows(lapply(stands, function(stand) {
  standTl <- fullTl %>%
    filter(meta_stand_id == stand)
  standTl$damagePred <- predict_damage(treeList = standTl,
                                       model = damagePropSet)
  out <- standTl %>%
    select(meta_stand_id, diameter, common, expansion,
           thinDamage, damagePred) %>%
    gather(source, damage, thinDamage, damagePred, -meta_stand_id,
           -diameter, -common, -expansion) %>%
    mutate(tpa = expansion,
           bapa = calculate_inches_to_ba_ft2(diameter) * expansion) %>%
    group_by(meta_stand_id, diameter, common, source) %>%
    summarize(meanDamage = mean(damage, na.rm = TRUE),
              nTrees = n()) %>%
    # BUG FIX: gather() emits source values 'thinDamage' and 'damagePred';
    # the old test against 'damage' never matched, so all rows were
    # labeled 'predicted'
    mutate(source = ifelse(source == 'thinDamage', 'observed', 'predicted'))
  return(out)
}))
# make plot a little better
compTabDamage$source <- factor(compTabDamage$source,
                               levels = c('observed', 'predicted'))
# ggplot(data = compTabDamage, aes(x = diameter, y = meanDamage,
#                                  color = source)) +
#   geom_point(alpha = 0.5) +
#   theme_bw() +
#   facet_wrap(~ common,
#              labeller = labeller(common = label_wrap_gen(width = 15))) +
#   ylab('mean damage') +
#   xlab('DBH (in)')
```
- ## Crown ratio statistical model
- ```{r crownRatio_parametric, eval = eval_crown}
- #
- # # this is a similar model form to the merch model option from above
- # crownModel <- lmer(asin(crownRatio) ~ diameter +
- # (1 + diameter | common),
- # data = crownDat)
- #
- # summary(crownModel)
- #
- # # predict method for asin transformed CR - MODIFY if changing the model form
- # predict_crown <- function(treeList, model) {
- # predCrownRatios <- sin(predict(model$model, newdata = treeList,
- # allow.new.levels = T))
- #
- # return(predCrownRatios)
- # }
- #
- # crownModelSet <- list(model = crownModel, predict_aux = predict_crown)
- #
- # crownDat$crownPred <- predict_crown(treeList = crownDat, model = crownModelSet)
- #
- #
- # ggplot(crownDat, aes(x = crownRatio, y = crownPred)) +
- # geom_point(alpha = 0.5) +
- # facet_wrap(~ common,
- # labeller = labeller(common = label_wrap_gen(width = 15))) +
- # theme_bw() +
- # geom_abline(intercept = 0, slope = 1) +
- # xlab("observed crown ratio") +
- # ylab("predicted crown ratio")
- ```
- ## Crown ratio model selection
- The default choice is the proportional assignment system. Change this if you want to use a statistical model.
```{r pickCrownModel, eval = eval_crown}
# by default, the crown ratio model is the proportional assignment system
crownSet <- crownPropSet    # crown ratio model (proportional assignment)
damageSet <- damagePropSet  # thinning damage model (proportional assignment)
# crownSet <- crownModelSet
# Persist the selected model sets for downstream prediction scripts
save(crownSet, damageSet, file = file.path(modelDir, 'crownModel.Rda'))
```
- -----------------------
- Taper Coefficients
- -----------------------
```{r volume, eval = eval_taper}
# Guard: taper fitting consumes height predictions, so the height model must
# have been fit earlier in this report.
# Scalar `if` condition: use short-circuit `&&` rather than vectorized `&`.
if (eval_taper && !eval_height) {
  stop('Tried to obtain volume coefficients without fitting height model')
}
# Predict total height for every tree, then overwrite predictions with
# field-measured heights wherever a measurement exists.
cruiseDataNoZero$ht_hat <- predict(htModel,
                                   newSpp = cruiseDataNoZero$st_species,
                                   newDiam = cruiseDataNoZero$diameter)
measuredHts <- cruiseDataNoZero$total_height
htTrees <- which(!is.na(measuredHts))
cruiseDataNoZero$ht_hat[htTrees] <- measuredHts[htTrees]
# Fit taper coefficients from the tree list plus stand-level covariates
taperCoeffs <- taper(species = cruiseDataNoZero$st_species,
                     diameter = cruiseDataNoZero$diameter,
                     expansion = cruiseDataNoZero$expansion,
                     height = cruiseDataNoZero$ht_hat,
                     bapa = medianBAPA,
                     latitude = lat,
                     longitude = lon,
                     gleanedData = gleanedList,
                     speciesCode = 'ST',
                     parallel = TRUE)
pandoc.table(taperCoeffs,
             split.tables = Inf,
             split.cells = Inf,
             justify = 'left',
             digits = 4)
# One taper diagnostic plot per species
for (sp in unique(cruiseDataNoZero$st_species)) {
  taperPlot <- plot(taperCoeffs, species = sp, speciesCode = 'ST')
  print(taperPlot)
}
save(taperCoeffs, file = file.path(modelDir, 'taperCoeffs.Rda'))
```
Add Comment
Please, Sign In to add comment