#NEW VERSION (25-Feb-2019)
#This approach attempts to get around the very long runtimes caused by building a single massive correlation matrix. Instead, the cleaned-up dataframe is split into 10 subsets, and a correlation matrix is made for each of them, which is faster overall.
#load libraries
library(dplyr)
library(tidyr)
library(tibble)
library(psych)
library(ggplot2)
library(pheatmap)
library(plotly)
library(ggcorrplot)

#Import TCGA data as RNA_Seq_v2_expression_median (txt)
#Make the import as Tab-delimited
#For now "skip" the Entrez_Gene_Id column
#Set NA's equal to DEFAULT! (not zero)

#Turn it into a dataframe (otherwise making rownames unique doesn't work)
df <- as.data.frame(data_RNA_Seq_v2_expression_median)

#Make all row names as unique versions of the genes column, then delete the genes column
rownames(df) <- make.names(df[,1], unique=TRUE)
df$Hugo_Symbol <- NULL

#transpose so that columns are genes and rows are samples
tp <- as.data.frame(t(df))

#Only proceed with columns whose sum is greater than zero
tp2 <- tp[,colSums(tp) > 0]
#Split into 10 dataframes (thanks /u/AgtSquirtle007)
#NOTE!! Check: do all 10 data frames have the same number of rows and columns?
#Is the last column of df10 the same as the last column of tp2? (Because of floor(), up to 9
#trailing columns of tp2 can be dropped when ncol(tp2) is not a multiple of 10.)
#A sanity-check sketch for these questions follows the split below.
columns <- floor(ncol(tp2)*.1)
df1 <- tp2[,1:columns]
df2 <- tp2[,(columns+1):(2*columns)]
df3 <- tp2[,(2*columns+1):(3*columns)]
df4 <- tp2[,(3*columns+1):(4*columns)]
df5 <- tp2[,(4*columns+1):(5*columns)]
df6 <- tp2[,(5*columns+1):(6*columns)]
df7 <- tp2[,(6*columns+1):(7*columns)]
df8 <- tp2[,(7*columns+1):(8*columns)]
df9 <- tp2[,(8*columns+1):(9*columns)]
df10 <- tp2[,(9*columns+1):(10*columns)]
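
#OPTIONAL sanity check for the questions above (a minimal sketch; assumes df1..df10 exist as created here)
#It verifies that every subset has the same dimensions and reports whether the split covers all of tp2.
split_list_check <- list(df1, df2, df3, df4, df5, df6, df7, df8, df9, df10)
sapply(split_list_check, dim)                                   #should show identical nrow/ncol for all 10
identical(colnames(df10)[ncol(df10)], colnames(tp2)[ncol(tp2)]) #FALSE means floor() dropped trailing columns
10*columns == ncol(tp2)                                         #TRUE only when ncol(tp2) is a multiple of 10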

#Calculate spearman correlation + holm-adjusted p-vals for all 10 dfs
#Use corr.test from the psych package (USE SPEARMAN)
#Warning! confidence intervals are turned off. Otherwise this will never complete!
#NOTE: This can take a very long time to complete
#Note: a holm adjustment was determined to be an acceptable multiple hypothesis correction for this analysis
#(A list-based sketch that avoids repeating this call ten times follows this block.)
my_matrix1 <- corr.test(df1, method="spearman", adjust="holm", ci=FALSE)
my_matrix2 <- corr.test(df2, method="spearman", adjust="holm", ci=FALSE)
my_matrix3 <- corr.test(df3, method="spearman", adjust="holm", ci=FALSE)
my_matrix4 <- corr.test(df4, method="spearman", adjust="holm", ci=FALSE)
my_matrix5 <- corr.test(df5, method="spearman", adjust="holm", ci=FALSE)
my_matrix6 <- corr.test(df6, method="spearman", adjust="holm", ci=FALSE)
my_matrix7 <- corr.test(df7, method="spearman", adjust="holm", ci=FALSE)
my_matrix8 <- corr.test(df8, method="spearman", adjust="holm", ci=FALSE)
my_matrix9 <- corr.test(df9, method="spearman", adjust="holm", ci=FALSE)
my_matrix10 <- corr.test(df10, method="spearman", adjust="holm", ci=FALSE)
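
#ALTERNATIVE (a sketch, not used below): the same ten calls can be done over a list with lapply,
#which keeps the correlation objects together and avoids copy-pasting the call.
#The names df_list, mat_list, r_list, and p_list are illustrative and not used elsewhere in this script.
df_list <- list(df1, df2, df3, df4, df5, df6, df7, df8, df9, df10)
mat_list <- lapply(df_list, function(d) corr.test(d, method="spearman", adjust="holm", ci=FALSE))
r_list <- lapply(mat_list, function(m) m[["r"]])  #correlation matrices, same content as r1..r10 below
p_list <- lapply(mat_list, function(m) m[["p"]])  #p-value matrices, same content as p1..p10 below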

#Extract the correlations and p-values for easy access
r1 <- my_matrix1[["r"]]
p1 <- my_matrix1[["p"]]
r2 <- my_matrix2[["r"]]
p2 <- my_matrix2[["p"]]
r3 <- my_matrix3[["r"]]
p3 <- my_matrix3[["p"]]
r4 <- my_matrix4[["r"]]
p4 <- my_matrix4[["p"]]
r5 <- my_matrix5[["r"]]
p5 <- my_matrix5[["p"]]
r6 <- my_matrix6[["r"]]
p6 <- my_matrix6[["p"]]
r7 <- my_matrix7[["r"]]
p7 <- my_matrix7[["p"]]
r8 <- my_matrix8[["r"]]
p8 <- my_matrix8[["p"]]
r9 <- my_matrix9[["r"]]
p9 <- my_matrix9[["p"]]
r10 <- my_matrix10[["r"]]
p10 <- my_matrix10[["p"]]

#Make r and p copies (which are about to have their diagonals and redundant values removed)
r_diag_adj1 <- r1
p_diag_adj1 <- p1
r_diag_adj2 <- r2
p_diag_adj2 <- p2
r_diag_adj3 <- r3
p_diag_adj3 <- p3
r_diag_adj4 <- r4
p_diag_adj4 <- p4
r_diag_adj5 <- r5
p_diag_adj5 <- p5
r_diag_adj6 <- r6
p_diag_adj6 <- p6
r_diag_adj7 <- r7
p_diag_adj7 <- p7
r_diag_adj8 <- r8
p_diag_adj8 <- p8
r_diag_adj9 <- r9
p_diag_adj9 <- p9
r_diag_adj10 <- r10
p_diag_adj10 <- p10

#Filtering time to get only those genes with very tight correlations or very good p-values
#Remove the diagonal and redundant (upper-triangle) values (thanks /u/Laerphon)
#(A tiny demo of what this masking does on a small matrix follows this block.)
r_diag_adj1[!lower.tri(r_diag_adj1)] <- NA
p_diag_adj1[!lower.tri(p_diag_adj1)] <- NA
r_diag_adj2[!lower.tri(r_diag_adj2)] <- NA
p_diag_adj2[!lower.tri(p_diag_adj2)] <- NA
r_diag_adj3[!lower.tri(r_diag_adj3)] <- NA
p_diag_adj3[!lower.tri(p_diag_adj3)] <- NA
r_diag_adj4[!lower.tri(r_diag_adj4)] <- NA
p_diag_adj4[!lower.tri(p_diag_adj4)] <- NA
r_diag_adj5[!lower.tri(r_diag_adj5)] <- NA
p_diag_adj5[!lower.tri(p_diag_adj5)] <- NA
r_diag_adj6[!lower.tri(r_diag_adj6)] <- NA
p_diag_adj6[!lower.tri(p_diag_adj6)] <- NA
r_diag_adj7[!lower.tri(r_diag_adj7)] <- NA
p_diag_adj7[!lower.tri(p_diag_adj7)] <- NA
r_diag_adj8[!lower.tri(r_diag_adj8)] <- NA
p_diag_adj8[!lower.tri(p_diag_adj8)] <- NA
r_diag_adj9[!lower.tri(r_diag_adj9)] <- NA
p_diag_adj9[!lower.tri(p_diag_adj9)] <- NA
r_diag_adj10[!lower.tri(r_diag_adj10)] <- NA
p_diag_adj10[!lower.tri(p_diag_adj10)] <- NA
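
#DEMO (a minimal sketch, separate from the analysis): what the masking above does to a small
#symmetric matrix -- the diagonal and the upper triangle become NA, leaving each pair only once.
#demo_m is an illustrative toy matrix, not part of the dataset.
demo_m <- matrix(c(1, 0.8, 0.3,
                   0.8, 1, 0.5,
                   0.3, 0.5, 1),
                 nrow=3, dimnames=list(c("gA","gB","gC"), c("gA","gB","gC")))
demo_m[!lower.tri(demo_m)] <- NA
demo_m  #only the lower triangle (gB-gA, gC-gA, gC-gB) keeps values; everything else is NA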

#Return the correlations with |R| > 0.9 (thanks /u/Laerphon)
RESULTS_r1_0.9 <- data.frame(r_diag_adj1) %>%
              rownames_to_column() %>%
              gather(key="variable", value="correlation", -rowname) %>%
              filter(abs(correlation) > 0.9)
RESULTS_r2_0.9 <- data.frame(r_diag_adj2) %>%
              rownames_to_column() %>%
              gather(key="variable", value="correlation", -rowname) %>%
              filter(abs(correlation) > 0.9)
RESULTS_r3_0.9 <- data.frame(r_diag_adj3) %>%
              rownames_to_column() %>%
              gather(key="variable", value="correlation", -rowname) %>%
              filter(abs(correlation) > 0.9)
RESULTS_r4_0.9 <- data.frame(r_diag_adj4) %>%
              rownames_to_column() %>%
              gather(key="variable", value="correlation", -rowname) %>%
              filter(abs(correlation) > 0.9)
RESULTS_r5_0.9 <- data.frame(r_diag_adj5) %>%
              rownames_to_column() %>%
              gather(key="variable", value="correlation", -rowname) %>%
              filter(abs(correlation) > 0.9)
RESULTS_r6_0.9 <- data.frame(r_diag_adj6) %>%
              rownames_to_column() %>%
              gather(key="variable", value="correlation", -rowname) %>%
              filter(abs(correlation) > 0.9)
RESULTS_r7_0.9 <- data.frame(r_diag_adj7) %>%
              rownames_to_column() %>%
              gather(key="variable", value="correlation", -rowname) %>%
              filter(abs(correlation) > 0.9)
RESULTS_r8_0.9 <- data.frame(r_diag_adj8) %>%
              rownames_to_column() %>%
              gather(key="variable", value="correlation", -rowname) %>%
              filter(abs(correlation) > 0.9)
RESULTS_r9_0.9 <- data.frame(r_diag_adj9) %>%
              rownames_to_column() %>%
              gather(key="variable", value="correlation", -rowname) %>%
              filter(abs(correlation) > 0.9)
RESULTS_r10_0.9 <- data.frame(r_diag_adj10) %>%
              rownames_to_column() %>%
              gather(key="variable", value="correlation", -rowname) %>%
              filter(abs(correlation) > 0.9)


#Return the p-values that are < 0.05
#(The "_0.9" suffix in these object names just mirrors the correlation-object names above.)
RESULTS_p1_0.9 <- data.frame(p_diag_adj1) %>%
              rownames_to_column() %>%
              gather(key="variable", value="pval", -rowname) %>%
              filter(pval < 0.05)
RESULTS_p2_0.9 <- data.frame(p_diag_adj2) %>%
              rownames_to_column() %>%
              gather(key="variable", value="pval", -rowname) %>%
              filter(pval < 0.05)
RESULTS_p3_0.9 <- data.frame(p_diag_adj3) %>%
              rownames_to_column() %>%
              gather(key="variable", value="pval", -rowname) %>%
              filter(pval < 0.05)
RESULTS_p4_0.9 <- data.frame(p_diag_adj4) %>%
              rownames_to_column() %>%
              gather(key="variable", value="pval", -rowname) %>%
              filter(pval < 0.05)
RESULTS_p5_0.9 <- data.frame(p_diag_adj5) %>%
              rownames_to_column() %>%
              gather(key="variable", value="pval", -rowname) %>%
              filter(pval < 0.05)
RESULTS_p6_0.9 <- data.frame(p_diag_adj6) %>%
              rownames_to_column() %>%
              gather(key="variable", value="pval", -rowname) %>%
              filter(pval < 0.05)
RESULTS_p7_0.9 <- data.frame(p_diag_adj7) %>%
              rownames_to_column() %>%
              gather(key="variable", value="pval", -rowname) %>%
              filter(pval < 0.05)
RESULTS_p8_0.9 <- data.frame(p_diag_adj8) %>%
              rownames_to_column() %>%
              gather(key="variable", value="pval", -rowname) %>%
              filter(pval < 0.05)
RESULTS_p9_0.9 <- data.frame(p_diag_adj9) %>%
              rownames_to_column() %>%
              gather(key="variable", value="pval", -rowname) %>%
              filter(pval < 0.05)
RESULTS_p10_0.9 <- data.frame(p_diag_adj10) %>%
              rownames_to_column() %>%
              gather(key="variable", value="pval", -rowname) %>%
              filter(pval < 0.05)


#Merge the two resulting filtered datasets for each of the 10 groups, where all rows from each are kept
merged_prelim1 <- merge(RESULTS_r1_0.9, RESULTS_p1_0.9, all=TRUE)
merged_prelim2 <- merge(RESULTS_r2_0.9, RESULTS_p2_0.9, all=TRUE)
merged_prelim3 <- merge(RESULTS_r3_0.9, RESULTS_p3_0.9, all=TRUE)
merged_prelim4 <- merge(RESULTS_r4_0.9, RESULTS_p4_0.9, all=TRUE)
merged_prelim5 <- merge(RESULTS_r5_0.9, RESULTS_p5_0.9, all=TRUE)
merged_prelim6 <- merge(RESULTS_r6_0.9, RESULTS_p6_0.9, all=TRUE)
merged_prelim7 <- merge(RESULTS_r7_0.9, RESULTS_p7_0.9, all=TRUE)
merged_prelim8 <- merge(RESULTS_r8_0.9, RESULTS_p8_0.9, all=TRUE)
merged_prelim9 <- merge(RESULTS_r9_0.9, RESULTS_p9_0.9, all=TRUE)
merged_prelim10 <- merge(RESULTS_r10_0.9, RESULTS_p10_0.9, all=TRUE)

#This has many fewer correlation-passing gene pairs than pval-passing gene pairs
#Filter out any gene pair which has an NA in the 'correlation' column (column 3)
RESULTS_merged1 <- merged_prelim1[complete.cases(merged_prelim1[, 3]),]
RESULTS_merged2 <- merged_prelim2[complete.cases(merged_prelim2[, 3]),]
RESULTS_merged3 <- merged_prelim3[complete.cases(merged_prelim3[, 3]),]
RESULTS_merged4 <- merged_prelim4[complete.cases(merged_prelim4[, 3]),]
RESULTS_merged5 <- merged_prelim5[complete.cases(merged_prelim5[, 3]),]
RESULTS_merged6 <- merged_prelim6[complete.cases(merged_prelim6[, 3]),]
RESULTS_merged7 <- merged_prelim7[complete.cases(merged_prelim7[, 3]),]
RESULTS_merged8 <- merged_prelim8[complete.cases(merged_prelim8[, 3]),]
RESULTS_merged9 <- merged_prelim9[complete.cases(merged_prelim9[, 3]),]
RESULTS_merged10 <- merged_prelim10[complete.cases(merged_prelim10[, 3]),]

#The merge function can only handle 2 dataframes at a time
#Therefore, merge 2 at a time until they're all merged [this is tedious; a Reduce()-based sketch follows this block]
RESULTS_merged_1_and_2 <- merge(RESULTS_merged1, RESULTS_merged2, all=TRUE)
RESULTS_merged_3_and_4 <- merge(RESULTS_merged3, RESULTS_merged4, all=TRUE)
RESULTS_merged_5_and_6 <- merge(RESULTS_merged5, RESULTS_merged6, all=TRUE)
RESULTS_merged_7_and_8 <- merge(RESULTS_merged7, RESULTS_merged8, all=TRUE)
RESULTS_merged_9_and_10 <- merge(RESULTS_merged9, RESULTS_merged10, all=TRUE)

RESULTS_merged_1_through_4 <- merge(RESULTS_merged_1_and_2, RESULTS_merged_3_and_4, all=TRUE)
RESULTS_merged_5_through_8 <- merge(RESULTS_merged_5_and_6, RESULTS_merged_7_and_8, all=TRUE)
#(9 and 10 are already merged)

RESULTS_merged_1_through_8 <- merge(RESULTS_merged_1_through_4, RESULTS_merged_5_through_8, all=TRUE)

RESULTS_merged_1_through_10 <- merge(RESULTS_merged_1_through_8, RESULTS_merged_9_and_10, all=TRUE)
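
#ALTERNATIVE (a sketch, not used below): Reduce() applies merge() pairwise across a list, which
#collapses the tournament-style merging above into one call and yields the same set of rows
#(row order may differ). The name merged_all_sketch is illustrative only.
merged_all_sketch <- Reduce(function(x, y) merge(x, y, all=TRUE),
                            list(RESULTS_merged1, RESULTS_merged2, RESULTS_merged3, RESULTS_merged4,
                                 RESULTS_merged5, RESULTS_merged6, RESULTS_merged7, RESULTS_merged8,
                                 RESULTS_merged9, RESULTS_merged10))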



#Write the results of criteria-passing gene pairs to a .csv (this gives pairs with both |R| > 0.9 AND pval < 0.05)
write.csv(RESULTS_merged_1_through_10, "PASSED_THRESHOLD_RESULTS.csv")


######For the next section, the aim is to take any gene which shows up in RESULTS_merged_1_through_10 and end up with a correlation plot using only those genes.
#First take the gene names in RESULTS_merged_1_through_10, which are in two columns 'rowname' and 'variable', and stack them on top of one another.
#Then get rid of the second column (it just tells which column the gene name originally came from)
stacked_names <- data.frame(stack(RESULTS_merged_1_through_10[1:2]))
stacked_names$ind <- NULL

#Keep only the unique gene names / aka drop duplicates (otherwise downstream the matrix won't appropriately drop the duplicates)
unique_names <- data.frame(unique(stacked_names))

#Take all the threshold-passing genes and get their values (starting from the point of the transposed matrix)
hitsonly <- subset(tp2, select=c(unique_names$values))

#Make a correlation matrix of the hits
hitmatrix <- corr.test(hitsonly, method="spearman", adjust="holm", ci=FALSE)
hitmatrixr <- hitmatrix[["r"]]

#plot and save the matrix of hits
pheatmap(hitmatrixr)

#also try ggcorrplot
ggcorrplot(hitmatrixr, hc.order = TRUE, tl.cex=6, outline.col="#e2e2e2", ggtheme=ggplot2::theme_gray, colors = c("#2893e0", "white", "#DB2B18"))

#Make a copy of tp2 on a log2(FPKM + 1) scale.....this plots on a log scale while accounting for 0s
tp2log2plus1 <- log2(tp2 + 1)

#Make a scatterplot of 2 genes (using the log2(FPKM + 1) scale)
ggplot(tp2log2plus1, aes(x=MGC29506, y=LOC96610)) + geom_point(size=2) + geom_smooth(method=lm, size=2) + geom_rug(col="darkred", size=0.9, sides="bl", alpha = 0.3)


#######POST PROCESSING SECTION
#The threshold-passing correlations are in pairs (rowname and variable) - call them gene1 and gene2 for short.

#Take each gene1 ("rowname" of RESULTS_merged_1_through_10) and reference tp2 to find its average expression
tp2_summary1 <- tp2 %>%
  gather(key = rowname) %>%
  group_by(rowname) %>%
  summarize(avg_expression1 = mean(value))

#join the gene1 average expressions to the gene1 names originally from RESULTS_merged_1_through_10
avg_expr_final1 <- RESULTS_merged_1_through_10 %>%
  select(rowname) %>%
  left_join(tp2_summary1)

#Take each gene2 ("variable" of RESULTS_merged_1_through_10) and reference tp2 to find its average expression
tp2_summary2 <- tp2 %>%
  gather(key = variable) %>%
  group_by(variable) %>%
  summarize(avg_expression2 = mean(value))

#join the gene2 average expressions to the gene2 names originally from RESULTS_merged_1_through_10
avg_expr_final2 <- RESULTS_merged_1_through_10 %>%
  select(variable) %>%
  left_join(tp2_summary2)

#merge Gene1 and Gene2 with their respective average expressions
avg_expr_merged <- merge(avg_expr_final1, avg_expr_final2, all=TRUE)


#Add a column with gene1 and gene2 concatenated as Gene1_Gene2; a column with the sums of their average expressions is added next
avg_expr_merged <- cbind(avg_expr_merged, Gene1_Gene2 = paste(avg_expr_merged$rowname, avg_expr_merged$variable, sep="_"))

#Now compute the sums of the Gene1 and Gene2 average expressions
#NOTE: this is probably not that useful, because looking only at the averages misses pairs with inverse relationships.
#Keep it in case it is needed in the future, but don't put too much stock in it for now.
#(Should an absolute value be used to account for repressor/inverse correlations, where Gene1 goes up but Gene2 goes down? Unresolved.)
merged_rowsums <- rowSums(select(avg_expr_merged, avg_expression1, avg_expression2))

#add it back to avg_expr_merged
avg_expr_merged <- cbind(avg_expr_merged, merged_rowsums)


#Look at the ratios between Gene1 and Gene2
#Always divide the larger average by the smaller one, considering only the avg_expression1 and avg_expression2 columns
#Do the calculation, then add the resulting column to the avg_expr_merged dataframe
#An if_else is appropriate for this (thanks /u/cb603)
avg_expr_merged <- avg_expr_merged %>% mutate(ratio = if_else(avg_expression1 >= avg_expression2, avg_expression1/avg_expression2, avg_expression2/avg_expression1))


#Add a Gene1_Gene2 concatenation to RESULTS_merged_1_through_10 - I believe this is needed as a reference when making the master results sheet
RESULTS_merged_1_through_10 <- cbind(RESULTS_merged_1_through_10, concat_names = paste(RESULTS_merged_1_through_10$rowname, RESULTS_merged_1_through_10$variable, sep="_"))

#Take only those pairs of genes that have passed the threshold criteria, and extract their correlations and p-values to be put beside the ratios
master <- merge(RESULTS_merged_1_through_10, avg_expr_merged, all.x=TRUE)
#Clean it up - keep what is needed, drop what is not, and re-arrange the columns in a better order
master <- master[,c(1,2,6,7,9,8,4,3,10)]
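
#ALTERNATIVE (a sketch): selecting by name is less fragile than positional indices. Assuming the
#merge above produced the columns rowname, variable, correlation, pval, concat_names, avg_expression1,
#avg_expression2, Gene1_Gene2, merged_rowsums, ratio, this selection matches c(1,2,6,7,9,8,4,3,10)
#and drops concat_names. The name master_by_name is illustrative only.
master_by_name <- master %>% select(rowname, variable, avg_expression1, avg_expression2,
                                    merged_rowsums, Gene1_Gene2, pval, correlation, ratio)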