Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#NEW VERSION (25-Feb-2019)
#This approach works around the very long runtimes caused by building one
#single massive correlation matrix: the cleaned-up dataframe is divided into
#10 subsets, and correlation matrices are built per subset to achieve
#overall faster speeds.

#Load libraries
library(dplyr)
library(tidyr)
library(tibble)
library(psych)
library(ggplot2)
library(pheatmap)
library(plotly)
library(ggcorrplot)

#Import TCGA data as RNA_Seq_v2_expression_median (txt)
#Make the import as Tab-delimited
#For now "skip" the Entrez_Gene_Id column
#Set NA's equal to DEFAULT! (not zero)
#Turn it into a data.frame (otherwise making rownames unique doesn't work)
df <- as.data.frame(data_RNA_Seq_v2_expression_median)

#Use unique versions of the gene-symbol column as row names, then drop
#that column so the frame is purely numeric
rownames(df) <- make.names(df[, 1], unique = TRUE)
df$Hugo_Symbol <- NULL

#Transpose so that columns are genes and rows are samples
tp <- as.data.frame(t(df))

#Only proceed with genes (columns) whose sum is greater than zero;
#drop = FALSE guards the degenerate single-surviving-column case
tp2 <- tp[, colSums(tp) > 0, drop = FALSE]
#Split into 10 dataframes (thanks /u/AgtSquirtle007)
#Each chunk gets floor(ncol(tp2)/10) columns; the LAST chunk additionally
#takes the remainder, so no trailing columns of tp2 are silently dropped.
#(The original hard-coded df1..df10 scheme ended at 10*columns and lost
#ncol(tp2) %% 10 columns — exactly the concern raised in the old
#"Is the last column of df10 the same as the last column of tp2?" note.)
columns <- floor(ncol(tp2) * 0.1)
for (i in 1:10) {
  first_col <- (i - 1) * columns + 1
  last_col <- if (i == 10) ncol(tp2) else i * columns
  assign(paste0("df", i), tp2[, first_col:last_col, drop = FALSE])
}
#Calculate spearman correlation + holm-adjusted p-vals for all 10 dfs
#Use corr.test from the psych package (USE SPEARMAN)
#Warning! confidence intervals are turned off. Otherwise this will never
#complete!
#NOTE: This can take a very long time to complete
#Note: a holm adjustment was determined to be an acceptable multiple
#hypothesis correction for this analysis
#The loop defines my_matrix1 .. my_matrix10, exactly as before.
for (i in 1:10) {
  assign(paste0("my_matrix", i),
         corr.test(get(paste0("df", i)),
                   method = "spearman", adjust = "holm", ci = FALSE))
}
#Extract the correlations and p-values for easy access (r1..r10, p1..p10)
#and make working copies (r_diag_adj1..10, p_diag_adj1..10) which are
#about to have their diagonals and redundant values removed
for (i in 1:10) {
  m <- get(paste0("my_matrix", i))
  assign(paste0("r", i), m[["r"]])
  assign(paste0("p", i), m[["p"]])
  assign(paste0("r_diag_adj", i), m[["r"]])
  assign(paste0("p_diag_adj", i), m[["p"]])
}
#Filtering time to get only those genes with very tight correlations or
#very good p-values.
#Remove diagonal and redundant values (thanks /u/Laerphon): keep only the
#strict lower triangle of each square matrix, NA-ing everything else.
#lower.tri() depends only on matrix dimensions, so one mask per chunk is
#valid for both its r and its p matrix.
for (i in 1:10) {
  r_adj <- get(paste0("r_diag_adj", i))
  p_adj <- get(paste0("p_diag_adj", i))
  keep <- lower.tri(r_adj)
  r_adj[!keep] <- NA
  p_adj[!keep] <- NA
  assign(paste0("r_diag_adj", i), r_adj)
  assign(paste0("p_diag_adj", i), p_adj)
}
#Return the correlations with R value > |0.9| (thanks /u/Laerphon)
#Defines RESULTS_r1_0.9 .. RESULTS_r10_0.9, one long-format table of
#gene-pair rows per chunk. (gather() is superseded by pivot_longer() in
#current tidyr but is kept here for compatibility with the rest of the
#script.)
for (i in 1:10) {
  assign(paste0("RESULTS_r", i, "_0.9"),
         data.frame(get(paste0("r_diag_adj", i))) %>%
           rownames_to_column() %>%
           gather(key = "variable", value = "correlation", -rowname) %>%
           filter(abs(correlation) > 0.9))
}
#Return the p-values that are < 0.05
#Defines RESULTS_p1_0.9 .. RESULTS_p10_0.9, mirroring the r-value tables.
for (i in 1:10) {
  assign(paste0("RESULTS_p", i, "_0.9"),
         data.frame(get(paste0("p_diag_adj", i))) %>%
           rownames_to_column() %>%
           gather(key = "variable", value = "pval", -rowname) %>%
           filter(pval < 0.05))
}
#Merge the two resulting filtered datasets for each of the 10 groups,
#where all rows from each are kept (full outer join on the shared
#rowname/variable columns); merge output columns are
#rowname, variable, correlation, pval.
#There are many fewer correlation-passing pairs than pval-passing pairs,
#so afterwards filter out any row with an NA in the 'correlation' column
#(column 3 of the merged frame).
for (i in 1:10) {
  prelim <- merge(get(paste0("RESULTS_r", i, "_0.9")),
                  get(paste0("RESULTS_p", i, "_0.9")),
                  all = TRUE)
  assign(paste0("merged_prelim", i), prelim)
  assign(paste0("RESULTS_merged", i),
         prelim[complete.cases(prelim[, 3]), ])
}
#merge() can only handle 2 dataframes at a time, so fold it over the list
#of the 10 per-chunk tables with Reduce() instead of hand-merging pairs
#(the pairwise intermediates were never used downstream).
RESULTS_merged_1_through_10 <- Reduce(
  function(x, y) merge(x, y, all = TRUE),
  mget(paste0("RESULTS_merged", 1:10))
)
#Write the results of criteria-passing genes to a .csv
#(these gene pairs have both |r| > 0.9 AND holm-adjusted pval < 0.05)
write.csv(RESULTS_merged_1_through_10, "PASSED_THRESHOLD_RESULTS.csv")
######For the next section, the aim is to take any gene which shows up in
#RESULTS_merged_1_through_10, and end up with a correlation plot using
#only these values.
#First take the gene names in RESULTS_merged_1_through_10, which are in
#two columns 'rowname' and 'variable', and stack them on top of one
#another; then drop the 'ind' column (it just records which column each
#gene name originally came from)
stacked_names <- data.frame(stack(RESULTS_merged_1_through_10[1:2]))
stacked_names$ind <- NULL
#Keep only unique gene names / aka drop duplicates (otherwise downstream
#the matrix won't appropriately drop the duplicates)
unique_names <- data.frame(unique(stacked_names))
#Take all the threshold-passing genes and get their expression values
#(starting from the point of the transposed matrix)
hitsonly <- subset(tp2, select = c(unique_names$values))
#Make a correlation matrix of the hits
hitmatrix <- corr.test(hitsonly, method = "spearman", adjust = "holm", ci = FALSE)
hitmatrixr <- hitmatrix[["r"]]
#Plot and save the matrix of hits
pheatmap(hitmatrixr)
#Also try ggcorrplot
ggcorrplot(hitmatrixr, hc.order = TRUE, tl.cex = 6, outline.col = "#e2e2e2",
           ggtheme = ggplot2::theme_gray,
           colors = c("#2893e0", "white", "#DB2B18"))
#Copy of tp2 on a log2(x + 1) scale: plots on a log scale while keeping
#zeros finite.
#BUGFIX: the original called log(), i.e. the NATURAL log, although the
#stated intent was "Log2(FPKM) + 1" — log2() matches that intent.
tp2log2plus1 <- log2(1 + tp2)
#Scatterplot of 2 genes (using the log2 + 1 scale)
ggplot(tp2log2plus1, aes(x = MGC29506, y = LOC96610)) +
  geom_point(size = 2) +
  geom_smooth(method = lm, size = 2) +
  geom_rug(col = "darkred", size = 0.9, sides = "bl", alpha = 0.3)
#######POST PROCESSING SECTION
#The threshold-passing correlations are in pairs (rowname and variable) -
#call them gene1 and gene2 for short.

#For each gene1 ("rowname" of RESULTS_merged_1_through_10), reference tp2
#to find its mean expression across all samples
tp2_summary1 <- tp2 %>%
  gather(key = rowname) %>%
  group_by(rowname) %>%
  summarize(avg_expression1 = mean(value))
#Join the gene1 average expressions onto the gene1 names originally from
#RESULTS_merged_1_through_10 (one row per threshold-passing pair, in the
#original pair order)
avg_expr_final1 <- RESULTS_merged_1_through_10 %>%
  select(rowname) %>%
  left_join(tp2_summary1)
#Same for gene2 ("variable" of RESULTS_merged_1_through_10)
tp2_summary2 <- tp2 %>%
  gather(key = variable) %>%
  group_by(variable) %>%
  summarize(avg_expression2 = mean(value))
avg_expr_final2 <- RESULTS_merged_1_through_10 %>%
  select(variable) %>%
  left_join(tp2_summary2)
#Put gene1 and gene2 average expressions side by side.
#BUGFIX: the original used merge(..., all = TRUE); because the two frames
#share NO column names, base-R merge performed a full Cartesian cross-join
#instead of pairing row i with row i, inflating the table to n^2 rows and
#fabricating gene pairs that never passed the thresholds. cbind() keeps the
#intended row-for-row pairing (both frames derive from the same RESULTS
#rows in the same order).
avg_expr_merged <- cbind(avg_expr_final1, avg_expr_final2)
#Add a column with gene1 and gene2 concatenated like Gene1_Gene2
avg_expr_merged <- cbind(avg_expr_merged,
                         Gene1_Gene2 = paste(avg_expr_merged$rowname,
                                             avg_expr_merged$variable,
                                             sep = "_"))
#Sums of Gene1+Gene2 average expression.
#NOTE: probably not that useful — inverse relationships (Gene1 goes up but
#Gene2 goes down) are missed when only averages are summed. Kept in case
#it is needed in the future; don't put too much stock in it for now.
merged_rowsums <- rowSums(select(avg_expr_merged, avg_expression1, avg_expression2))
avg_expr_merged <- cbind(avg_expr_merged, merged_rowsums)
#Ratio between Gene1 and Gene2: always divide the larger average by the
#smaller one, considering only the avg_expression1/avg_expression2 columns.
#An if_else is appropriate for this (thanks /u/cb603)
avg_expr_merged <- avg_expr_merged %>%
  mutate(ratio = if_else(avg_expression1 >= avg_expression2,
                         avg_expression1 / avg_expression2,
                         avg_expression2 / avg_expression1))
#Add a Gene1_Gene2 concatenation to RESULTS_merged_1_through_10 - used as
#a reference when building the master results sheet
RESULTS_merged_1_through_10 <- cbind(
  RESULTS_merged_1_through_10,
  concat_names = paste(RESULTS_merged_1_through_10$rowname,
                       RESULTS_merged_1_through_10$variable,
                       sep = "_"))
#Take only the pairs of genes that passed the threshold criteria and put
#their correlation and p-values beside the ratios (left join on the shared
#rowname/variable columns)
master <- merge(RESULTS_merged_1_through_10, avg_expr_merged, all.x = TRUE)
#Clean up: keep what is needed, drop what is not (concat_names), and
#re-arrange columns — selected by NAME rather than by fragile positional
#index (this reproduces the original c(1,2,6,7,9,8,4,3,10) ordering)
master <- master[, c("rowname", "variable", "avg_expression1",
                     "avg_expression2", "merged_rowsums", "Gene1_Gene2",
                     "pval", "correlation", "ratio")]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement