Untitled

###################################################################################################
##################################### Plot disease enrichment #####################################
###################################################################################################

library(ggplot2)
library(plotrix)
library(viridis)
library(reshape2)

setwd('~/Google Drive/MGI/VICC/Manuscript/Figures/misc_figures/')

###################################################################################################
#### Read in files
###################################################################################################
## Get disease count lists
data <- read.csv(file="~/Google Drive/MGI/VICC/Manuscript/Figures/Data/disease_counts.csv",header=T, stringsAsFactors = F)

###################################################################################################
#### summarize top level data for plotting
###################################################################################################
top_data <- aggregate(. ~ TopNode_disease + TopNode_doid, data=data[,3:ncol(data)], FUN=sum)

## Get a list of all of the database names
group_cols <- colnames(top_data)[!(colnames(top_data) %in% c("TopNode_disease","TopNode_doid"))]

## Get total counts for each cancer across databases
top_data$total <- apply(top_data[,colnames(top_data) %in% group_cols], 1, FUN = sum)
top_data$total_perc <- top_data$total/sum(top_data$total)*100

# ## Get the top 10 cancers (includes top hit for all 6)
# top <- head(top_data[order(-top_data$total),], n = 5)
## Get diseases for plotting that make up more than 5% of the total
top <- top_data[which(top_data$total_perc > 5),]

## Create a dataframe of only the top 10 + a binned "other cancer" category for all other cancers combined
top_data_scaled <- as.data.frame(top_data[order(-top_data$total),])
top_data_scaled[!(top_data_scaled$TopNode_disease %in% c(top$TopNode_disease, "benign neoplasm", "other")),c("TopNode_disease","TopNode_doid")] <- c("other cancers","other cancers")
top_data_scaled[(top_data_scaled$TopNode_disease %in% c("cancer")),c("TopNode_disease","TopNode_doid")] <- c("other cancers","other cancers")
top_data_scaled[(top_data_scaled$TopNode_disease %in% c("other")),c("TopNode_disease","TopNode_doid")] <- c("other disease","other disease")
top_data_scaled <- aggregate(. ~ TopNode_disease + TopNode_doid, data=top_data_scaled, FUN=sum)
top_data_scaled$TopNode_disease <- gsub(" ","\n",top_data_scaled$TopNode_disease)

###################################################################################################
#### Create pie charts of data
###################################################################################################
# pie(top_data_scaled[which(top_data_scaled$cgi > 0),"cgi"], labels=top_data_scaled[which(top_data_scaled$cgi > 0),"TopNode_disease"], main="CGI", col=viridis(length(top_data_scaled[,"cgi"]), option="B"))
# pie3D(top_data_scaled[which(top_data_scaled$cgi > 0),"cgi"], labels=top_data_scaled[which(top_data_scaled$cgi > 0),"TopNode_disease"], explode=0.1, main="CGI")

## Create pie charts for each database
create_a_pie <- function(x,y){
  p <- pie(x[which(x[,y] > 0),y], labels=x[which(x[,y] > 0),"TopNode_disease"], main=y, col=viridis(length(x[,y]), option="B"))
  return(p)
}
# png(file = paste(getwd(),"disease_by_database__piechart_cgi.png", sep = "/"), height=1200, width=1150, res=150)
#   print(create_a_pie(top_data_scaled,"cgi"))
# dev.off()
create_a_pie(top_data_scaled,"civic")

###################################################################################################
#### Create bar charts of data
###################################################################################################
## Melt the df
top_data_scaled_long <- melt(top_data_scaled, id.vars = c("TopNode_disease", "TopNode_doid"))

## Refactor to desired order
top_data_scaled_long$TopNode_disease <- factor(top_data_scaled_long$TopNode_disease, levels = c((unique(top_data_scaled[order(top_data_scaled$total,decreasing=F),"TopNode_disease"]))), exclude = NULL)

## Plot the disease by database
png(file = paste(getwd(),"disease_by_database.png", sep = "/"), height=1200, width=1150, res=150)
  ggplot(top_data_scaled_long[!(top_data_scaled_long$variable %in% c("total","total_perc")),], aes(x=TopNode_disease, y=value, fill = variable)) + geom_col(position = 'stack') + xlab("Disease") + ylab("Evidence") + guides(fill=guide_legend(title="Database")) + scale_fill_viridis(discrete = T, direction = -1)
dev.off()


## Spaces are more desirable for the following plots
top_data_scaled_long$TopNode_disease <- gsub("\n"," ",top_data_scaled_long$TopNode_disease)
## Change the disease order
top_data_scaled_long$TopNode_disease <- factor(top_data_scaled_long$TopNode_disease, levels = as.vector(c("other disease", "benign neoplasm",as.character(unique(top_data_scaled_long[!(top_data_scaled_long$TopNode_disease %in% c("benign neoplasm","other disease")),"TopNode_disease"])))), exclude = NULL)
## Create a custom color palette
plma <- rev(viridis::plasma(n=7))
new_palette = c("grey87","grey66", plma)

## Plot the databases by disease proportions
png(file = paste(getwd(),"disease_by_database_proportion.png", sep = "/"), height=1200, width=1150, res=150)
  ggplot(top_data_scaled_long[!(top_data_scaled_long$variable %in% c("total","total_perc")),], aes(x=variable, y=value, fill = TopNode_disease)) + geom_col(position = 'fill') + xlab("Database") + ylab("Proportion of Evidence") + guides(fill=guide_legend(title="Disease")) + scale_fill_manual(values=new_palette) + theme_bw()
dev.off()