Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#NEW VERSION (25-Feb-2019)
#This approach works around the very long runtimes caused by building one
#single massive correlation matrix: the cleaned-up dataframe is divided into
#10 subsets, and correlation matrices are built per subset to achieve
#overall faster speeds.

#Load libraries
library(dplyr)
library(tidyr)
library(tibble)
library(psych)
library(ggplot2)
library(pheatmap)
library(plotly)
library(ggcorrplot)

#Import TCGA data as RNA_Seq_v2_expression_median (txt)
#Make the import as Tab-delimited
#For now "skip" the Entrez_Gene_Id column
#Set NA's equal to DEFAULT! (not zero)
#Turn it into a data.frame (otherwise making rownames unique doesn't work)
df <- as.data.frame(data_RNA_Seq_v2_expression_median)

#Use unique versions of the gene-symbol column as row names, then drop
#that column so the frame is purely numeric
rownames(df) <- make.names(df[, 1], unique = TRUE)
df$Hugo_Symbol <- NULL

#Transpose so that columns are genes and rows are samples
tp <- as.data.frame(t(df))

#Only proceed with genes (columns) whose sum is greater than zero;
#drop = FALSE guards the degenerate single-surviving-column case
tp2 <- tp[, colSums(tp) > 0, drop = FALSE]
#Split into 10 dataframes (thanks /u/AgtSquirtle007)
#Each chunk gets floor(ncol(tp2)/10) columns; the LAST chunk additionally
#takes the remainder, so no trailing columns of tp2 are silently dropped.
#(The original hard-coded df1..df10 scheme ended at 10*columns and lost
#ncol(tp2) %% 10 columns — exactly the concern raised in the old
#"Is the last column of df10 the same as the last column of tp2?" note.)
columns <- floor(ncol(tp2) * 0.1)
for (i in 1:10) {
  first_col <- (i - 1) * columns + 1
  last_col <- if (i == 10) ncol(tp2) else i * columns
  assign(paste0("df", i), tp2[, first_col:last_col, drop = FALSE])
}
#Calculate spearman correlation + holm-adjusted p-vals for all 10 dfs
#Use corr.test from the psych package (USE SPEARMAN)
#Warning! confidence intervals are turned off. Otherwise this will never
#complete!
#NOTE: This can take a very long time to complete
#Note: a holm adjustment was determined to be an acceptable multiple
#hypothesis correction for this analysis
#The loop defines my_matrix1 .. my_matrix10, exactly as before.
for (i in 1:10) {
  assign(paste0("my_matrix", i),
         corr.test(get(paste0("df", i)),
                   method = "spearman", adjust = "holm", ci = FALSE))
}
#Extract the correlations and p-values for easy access (r1..r10, p1..p10)
#and make working copies (r_diag_adj1..10, p_diag_adj1..10) which are
#about to have their diagonals and redundant values removed
for (i in 1:10) {
  m <- get(paste0("my_matrix", i))
  assign(paste0("r", i), m[["r"]])
  assign(paste0("p", i), m[["p"]])
  assign(paste0("r_diag_adj", i), m[["r"]])
  assign(paste0("p_diag_adj", i), m[["p"]])
}
#Filtering time to get only those genes with very tight correlations or
#very good p-values.
#Remove diagonal and redundant values (thanks /u/Laerphon): keep only the
#strict lower triangle of each square matrix, NA-ing everything else.
#lower.tri() depends only on matrix dimensions, so one mask per chunk is
#valid for both its r and its p matrix.
for (i in 1:10) {
  r_adj <- get(paste0("r_diag_adj", i))
  p_adj <- get(paste0("p_diag_adj", i))
  keep <- lower.tri(r_adj)
  r_adj[!keep] <- NA
  p_adj[!keep] <- NA
  assign(paste0("r_diag_adj", i), r_adj)
  assign(paste0("p_diag_adj", i), p_adj)
}
#Return the correlations with R value > |0.9| (thanks /u/Laerphon)
#Defines RESULTS_r1_0.9 .. RESULTS_r10_0.9, one long-format table of
#gene-pair rows per chunk. (gather() is superseded by pivot_longer() in
#current tidyr but is kept here for compatibility with the rest of the
#script.)
for (i in 1:10) {
  assign(paste0("RESULTS_r", i, "_0.9"),
         data.frame(get(paste0("r_diag_adj", i))) %>%
           rownames_to_column() %>%
           gather(key = "variable", value = "correlation", -rowname) %>%
           filter(abs(correlation) > 0.9))
}
#Return the p-values that are < 0.05
#Defines RESULTS_p1_0.9 .. RESULTS_p10_0.9, mirroring the r-value tables.
for (i in 1:10) {
  assign(paste0("RESULTS_p", i, "_0.9"),
         data.frame(get(paste0("p_diag_adj", i))) %>%
           rownames_to_column() %>%
           gather(key = "variable", value = "pval", -rowname) %>%
           filter(pval < 0.05))
}
#Merge the two resulting filtered datasets for each of the 10 groups,
#where all rows from each are kept (full outer join on the shared
#rowname/variable columns); merge output columns are
#rowname, variable, correlation, pval.
#There are many fewer correlation-passing pairs than pval-passing pairs,
#so afterwards filter out any row with an NA in the 'correlation' column
#(column 3 of the merged frame).
for (i in 1:10) {
  prelim <- merge(get(paste0("RESULTS_r", i, "_0.9")),
                  get(paste0("RESULTS_p", i, "_0.9")),
                  all = TRUE)
  assign(paste0("merged_prelim", i), prelim)
  assign(paste0("RESULTS_merged", i),
         prelim[complete.cases(prelim[, 3]), ])
}
#merge() can only handle 2 dataframes at a time, so fold it over the list
#of the 10 per-chunk tables with Reduce() instead of hand-merging pairs
#(the pairwise intermediates were never used downstream).
RESULTS_merged_1_through_10 <- Reduce(
  function(x, y) merge(x, y, all = TRUE),
  mget(paste0("RESULTS_merged", 1:10))
)
#Write the results of criteria-passing genes to a .csv
#(these gene pairs have both |r| > 0.9 AND holm-adjusted pval < 0.05)
write.csv(RESULTS_merged_1_through_10, "PASSED_THRESHOLD_RESULTS.csv")
######For the next section, the aim is to take any gene which shows up in
#RESULTS_merged_1_through_10, and end up with a correlation plot using
#only these values.
#First take the gene names in RESULTS_merged_1_through_10, which are in
#two columns 'rowname' and 'variable', and stack them on top of one
#another; then drop the 'ind' column (it just records which column each
#gene name originally came from)
stacked_names <- data.frame(stack(RESULTS_merged_1_through_10[1:2]))
stacked_names$ind <- NULL
#Keep only unique gene names / aka drop duplicates (otherwise downstream
#the matrix won't appropriately drop the duplicates)
unique_names <- data.frame(unique(stacked_names))
#Take all the threshold-passing genes and get their expression values
#(starting from the point of the transposed matrix)
hitsonly <- subset(tp2, select = c(unique_names$values))
#Make a correlation matrix of the hits
hitmatrix <- corr.test(hitsonly, method = "spearman", adjust = "holm", ci = FALSE)
hitmatrixr <- hitmatrix[["r"]]
#Plot and save the matrix of hits
pheatmap(hitmatrixr)
#Also try ggcorrplot
ggcorrplot(hitmatrixr, hc.order = TRUE, tl.cex = 6, outline.col = "#e2e2e2",
           ggtheme = ggplot2::theme_gray,
           colors = c("#2893e0", "white", "#DB2B18"))
#Copy of tp2 on a log2(x + 1) scale: plots on a log scale while keeping
#zeros finite.
#BUGFIX: the original called log(), i.e. the NATURAL log, although the
#stated intent was "Log2(FPKM) + 1" — log2() matches that intent.
tp2log2plus1 <- log2(1 + tp2)
#Scatterplot of 2 genes (using the log2 + 1 scale)
ggplot(tp2log2plus1, aes(x = MGC29506, y = LOC96610)) +
  geom_point(size = 2) +
  geom_smooth(method = lm, size = 2) +
  geom_rug(col = "darkred", size = 0.9, sides = "bl", alpha = 0.3)
#######POST PROCESSING SECTION
#The threshold-passing correlations are in pairs (rowname and variable) -
#call them gene1 and gene2 for short.

#For each gene1 ("rowname" of RESULTS_merged_1_through_10), reference tp2
#to find its mean expression across all samples
tp2_summary1 <- tp2 %>%
  gather(key = rowname) %>%
  group_by(rowname) %>%
  summarize(avg_expression1 = mean(value))
#Join the gene1 average expressions onto the gene1 names originally from
#RESULTS_merged_1_through_10 (one row per threshold-passing pair, in the
#original pair order)
avg_expr_final1 <- RESULTS_merged_1_through_10 %>%
  select(rowname) %>%
  left_join(tp2_summary1)
#Same for gene2 ("variable" of RESULTS_merged_1_through_10)
tp2_summary2 <- tp2 %>%
  gather(key = variable) %>%
  group_by(variable) %>%
  summarize(avg_expression2 = mean(value))
avg_expr_final2 <- RESULTS_merged_1_through_10 %>%
  select(variable) %>%
  left_join(tp2_summary2)
#Put gene1 and gene2 average expressions side by side.
#BUGFIX: the original used merge(..., all = TRUE); because the two frames
#share NO column names, base-R merge performed a full Cartesian cross-join
#instead of pairing row i with row i, inflating the table to n^2 rows and
#fabricating gene pairs that never passed the thresholds. cbind() keeps the
#intended row-for-row pairing (both frames derive from the same RESULTS
#rows in the same order).
avg_expr_merged <- cbind(avg_expr_final1, avg_expr_final2)
#Add a column with gene1 and gene2 concatenated like Gene1_Gene2
avg_expr_merged <- cbind(avg_expr_merged,
                         Gene1_Gene2 = paste(avg_expr_merged$rowname,
                                             avg_expr_merged$variable,
                                             sep = "_"))
#Sums of Gene1+Gene2 average expression.
#NOTE: probably not that useful — inverse relationships (Gene1 goes up but
#Gene2 goes down) are missed when only averages are summed. Kept in case
#it is needed in the future; don't put too much stock in it for now.
merged_rowsums <- rowSums(select(avg_expr_merged, avg_expression1, avg_expression2))
avg_expr_merged <- cbind(avg_expr_merged, merged_rowsums)
#Ratio between Gene1 and Gene2: always divide the larger average by the
#smaller one, considering only the avg_expression1/avg_expression2 columns.
#An if_else is appropriate for this (thanks /u/cb603)
avg_expr_merged <- avg_expr_merged %>%
  mutate(ratio = if_else(avg_expression1 >= avg_expression2,
                         avg_expression1 / avg_expression2,
                         avg_expression2 / avg_expression1))
#Add a Gene1_Gene2 concatenation to RESULTS_merged_1_through_10 - used as
#a reference when building the master results sheet
RESULTS_merged_1_through_10 <- cbind(
  RESULTS_merged_1_through_10,
  concat_names = paste(RESULTS_merged_1_through_10$rowname,
                       RESULTS_merged_1_through_10$variable,
                       sep = "_"))
#Take only the pairs of genes that passed the threshold criteria and put
#their correlation and p-values beside the ratios (left join on the shared
#rowname/variable columns)
master <- merge(RESULTS_merged_1_through_10, avg_expr_merged, all.x = TRUE)
#Clean up: keep what is needed, drop what is not (concat_names), and
#re-arrange columns — selected by NAME rather than by fragile positional
#index (this reproduces the original c(1,2,6,7,9,8,4,3,10) ordering)
master <- master[, c("rowname", "variable", "avg_expression1",
                     "avg_expression2", "merged_rowsums", "Gene1_Gene2",
                     "pval", "correlation", "ratio")]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement