Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ###################################################################################################
- ##################################### Plot disease enrichment #####################################
- ###################################################################################################
- library(ggplot2)
- library(plotrix)
- library(viridis)
- library(reshape2)
- setwd('~/Google Drive/MGI/VICC/Manuscript/Figures/misc_figures/')
- ###################################################################################################
- #### Read in files
- ###################################################################################################
- ## Get disease count lists
- data <- read.csv(file="~/Google Drive/MGI/VICC/Manuscript/Figures/Data/disease_counts.csv",header=T, stringsAsFactors = F)
- ###################################################################################################
- #### summarize top level data for plotting
- ###################################################################################################
- top_data <- aggregate(. ~ TopNode_disease + TopNode_doid, data=data[,3:ncol(data)], FUN=sum)
- ## Get a list of all of the database names
- group_cols <- colnames(top_data)[!(colnames(top_data) %in% c("TopNode_disease","TopNode_doid"))]
- ## Get total counts for each cancer across databases
- top_data$total <- apply(top_data[,colnames(top_data) %in% group_cols], 1, FUN = sum)
- top_data$total_perc <- top_data$total/sum(top_data$total)*100
- # ## Get the top 10 cancers (includes top hit for all 6)
- # top <- head(top_data[order(-top_data$total),], n = 5)
- ## Get diseases for plotting that make up more than 5% of the total
- top <- top_data[which(top_data$total_perc > 5),]
- ## Create a dataframe of only the top 10 + a binned "other cancer" category for all other cancers combined
- top_data_scaled <- as.data.frame(top_data[order(-top_data$total),])
- top_data_scaled[!(top_data_scaled$TopNode_disease %in% c(top$TopNode_disease, "benign neoplasm", "other")),c("TopNode_disease","TopNode_doid")] <- c("other cancers","other cancers")
- top_data_scaled[(top_data_scaled$TopNode_disease %in% c("cancer")),c("TopNode_disease","TopNode_doid")] <- c("other cancers","other cancers")
- top_data_scaled[(top_data_scaled$TopNode_disease %in% c("other")),c("TopNode_disease","TopNode_doid")] <- c("other disease","other disease")
- top_data_scaled <- aggregate(. ~ TopNode_disease + TopNode_doid, data=top_data_scaled, FUN=sum)
- top_data_scaled$TopNode_disease <- gsub(" ","\n",top_data_scaled$TopNode_disease)
- ###################################################################################################
- #### Create pie charts of data
- ###################################################################################################
- # pie(top_data_scaled[which(top_data_scaled$cgi > 0),"cgi"], labels=top_data_scaled[which(top_data_scaled$cgi > 0),"TopNode_disease"], main="CGI", col=viridis(length(top_data_scaled[,"cgi"]), option="B"))
- # pie3D(top_data_scaled[which(top_data_scaled$cgi > 0),"cgi"], labels=top_data_scaled[which(top_data_scaled$cgi > 0),"TopNode_disease"], explode=0.1, main="CGI")
- ## Create pie charts for each database
- create_a_pie <- function(x,y){
- p <- pie(x[which(x[,y] > 0),y], labels=x[which(x[,y] > 0),"TopNode_disease"], main=y, col=viridis(length(x[,y]), option="B"))
- return(p)
- }
- # png(file = paste(getwd(),"disease_by_database__piechart_cgi.png", sep = "/"), height=1200, width=1150, res=150)
- # print(create_a_pie(top_data_scaled,"cgi"))
- # dev.off()
- create_a_pie(top_data_scaled,"civic")
- ###################################################################################################
- #### Create bar charts of data
- ###################################################################################################
- ## Melt the df
- top_data_scaled_long <- melt(top_data_scaled, id.vars = c("TopNode_disease", "TopNode_doid"))
- ## Refactor to desired order
- top_data_scaled_long$TopNode_disease <- factor(top_data_scaled_long$TopNode_disease, levels = c((unique(top_data_scaled[order(top_data_scaled$total,decreasing=F),"TopNode_disease"]))), exclude = NULL)
- ## Plot the disease by database
- png(file = paste(getwd(),"disease_by_database.png", sep = "/"), height=1200, width=1150, res=150)
- ggplot(top_data_scaled_long[!(top_data_scaled_long$variable %in% c("total","total_perc")),], aes(x=TopNode_disease, y=value, fill = variable)) + geom_col(position = 'stack') + xlab("Disease") + ylab("Evidence") + guides(fill=guide_legend(title="Database")) + scale_fill_viridis(discrete = T, direction = -1)
- dev.off()
- ## Spaces are more desirable for the following plots
- top_data_scaled_long$TopNode_disease <- gsub("\n"," ",top_data_scaled_long$TopNode_disease)
- ## Change the disease order
- top_data_scaled_long$TopNode_disease <- factor(top_data_scaled_long$TopNode_disease, levels = as.vector(c("other disease", "benign neoplasm",as.character(unique(top_data_scaled_long[!(top_data_scaled_long$TopNode_disease %in% c("benign neoplasm","other disease")),"TopNode_disease"])))), exclude = NULL)
- ## Create a custom color palette
- plma <- rev(viridis::plasma(n=7))
- new_palette = c("grey87","grey66", plma)
- ## Plot the databases by disease proportions
- png(file = paste(getwd(),"disease_by_database_proportion.png", sep = "/"), height=1200, width=1150, res=150)
- ggplot(top_data_scaled_long[!(top_data_scaled_long$variable %in% c("total","total_perc")),], aes(x=variable, y=value, fill = TopNode_disease)) + geom_col(position = 'fill') + xlab("Database") + ylab("Proportion of Evidence") + guides(fill=guide_legend(title="Disease")) + scale_fill_manual(values=new_palette) + theme_bw()
- dev.off()
Add Comment
Please, Sign In to add comment