Advertisement
Guest User

Practical DE genes

a guest
Nov 30th, 2015
138
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 8.37 KB | None | 0 0
  1.  
  2. # data reading
  3. dataArrays = read.table("Microarrays_logValues_parapsilosis.txt", header = T, row.names = 1)
  4. dataRNAseq = read.table("RNAseq_counts_parapsilosis.txt", header = T, row.names = 1)
  5. # --> same genes are listed in  both files, with the same order
  6.  
  7. #####
  8. # 1. Microarrays dataset
  9. #####
  10.  
  11. # calculate average logFC values
  12. averageLog = apply(dataArrays, 1, mean)
  13.  
  14. # histogram of obtained values
  15. hist(averageLog, xlab = "average logFC value", nclass = 100,
  16.     main = "C. parapsilosis - Microarrays dataset")
  17. # -> most of the genes are not differentially expressed
  18. #    (distribution is centered around 0)
  19.  
  20. # thresold can be chosen (here the value is 2) to select up and down regulated genes  
  21. abline(v = 2, col = "red", lty = "dashed")
  22. abline(v = -2, col = "green", lty = "dashed")
  23.  
  24. upGenes = names(averageLog[averageLog > 2])
  25. # -> 76 genes are selected
  26. downGenes = names(averageLog[averageLog < -2])
  27. # -> 11 genes are selected
  28.  
  29. # order genes according to differential expression (averageLog)
  30. sortedGenes = sort(averageLog, decreasing = T)
  31.  
  32. # evaluate data reproducibility (standard deviation)
  33. sdValues = apply(dataArrays, 1, sd)
  34.  
  35. # visualize both information on a single graph
  36. plot(sdValues, averageLog,
  37.         main = "C. parapsilosis - Microarrays dataset",
  38.         xlab = "SD values (data reproducibility)", ylab = "LogFC (average value)",
  39.         pch = 20)
  40. abline(h = 2, col = "red", lty = "dashed")
  41. abline(h = -2, col = "green", lty = "dashed")
  42.  
  43. # superimpose up- and down- regulated genes
  44. points(sdValues[upGenes], averageLog[upGenes], col = "red", pch = 20)      
  45. points(sdValues[downGenes], averageLog[downGenes], col = "green", pch = 20)    
  46.  
  47. # calculate t values (classical Student parameter)
  48. tval = averageLog/(sdValues/sqrt(ncol(dataArrays)))
  49.  
  50. # histogram of obtained values
  51. H = hist(tval, xlab = "t value (classical Student parameter)", nclass = 100,
  52.     main = "C. parapsilosis - Microarrays dataset", prob = T)
  53. # -> most of the genes are not differentially expressed
  54. #    (distribution is centered around 0)
  55.  
  56. # in a classical test (Student statistics), genes are selected if tval > theo
  57. ttheo = pt(1-(0.001/2), df = 3) # alpha = 0.1% and df = n - 1 = 4 - 1
  58. upGenesL = names(tval[tval > ttheo])
  59. # -> 2252 genes are selected
  60. downGenesL = names(tval[tval < -ttheo])
  61. # -> 2384 genes are selected
  62.  
  63. # visualize selected genes
  64. #par(mfrow = c(2,2))
  65. plot(averageLog, tval,
  66.         main = "C. parapsilosis - Microarrays dataset",
  67.         ylab = "tval (Student parameter)", xlab = "LogFC (average value)",
  68.         pch = 20)
  69. abline(h = 0, col = "black")
  70. abline(v = 0, col = "black")
  71.  
  72. plot(averageLog, tval,
  73.         main = "C. parapsilosis - Microarrays dataset",
  74.         ylab = "tval (Student parameter)", xlab = "LogFC (average value)",
  75.         pch = 20)
  76.  
  77. abline(v = 2, col = "red", lty = "dashed")
  78. abline(v = -2, col = "green", lty = "dashed")
  79. abline(h = 0, col = "black")
  80. abline(v = 0, col = "black")
  81.  
  82. # superimpose up- and down- regulated genes (previously identified)
  83. points(averageLog[upGenes], tval[upGenes], col = "red", pch = 20)      
  84. points(averageLog[downGenes], tval[downGenes], col = "green", pch = 20)    
  85.  
  86. plot(sdValues, tval,
  87.         main = "C. parapsilosis - Microarrays dataset",
  88.         ylab = "tval (Student parameter)", xlab = "SD values (data reproducibility)",
  89.         pch = 20)
  90. abline(h = 0, col = "black")
  91. abline(v = 0, col = "black")
  92.  
  93. ####
  94. # limma method
  95. ####
  96.  
  97. library(limma)
  98.  
  99. # Linear model estimation
  100. fit = lmFit(dataArrays)
  101.  
  102. # Bayesian statistics
  103. limmaRes = topTable(eBayes(fit), number = nrow(dataArrays))[,-6]
  104.  
  105. # sort genes according to their initial order in dataArrays
  106. limmaRes2 = limmaRes[row.names(dataArrays),]
  107.  
  108. # compare logFC values obtained with LIMMA
  109. plot(limmaRes2[, "logFC"], averageLog, pch = 20,
  110.     xlab = "logFC calculated with LIMMA", ylab = "LogFC (average value)")
  111. # -> Values are identical
  112.  
  113. # compare tlimma with tval (Student)
  114. plot(averageLog, tval,
  115.         main = "C. parapsilosis - Microarrays dataset",
  116.         ylab = "t (Student in black and LIMMA in purple)", xlab = "LogFC (average value)",
  117.         pch = 20)
  118. points(averageLog, limmaRes2[,"t"], pch = 20, col = "purple")
  119. abline(h = 0, col = "black")
  120. abline(v = 0, col = "black")
  121.  
  122. plot(sdValues, tval,
  123.         main = "C. parapsilosis - Microarrays dataset",
  124.         ylab = "t (Student in black and LIMMA in purple)", xlab = "SD values (data reproducibility)",
  125.         pch = 20)
  126. points(sdValues, limmaRes2[,"t"], pch = 20, col = "purple")
  127. abline(h = 0, col = "black")
  128. # -> LIMMA is a better method to identify differentially expressed genes
  129.  
  130. limmaHigh <- row.names(limmaRes2[limmaRes2[,"logFC"]>2,])
  131. limmaLow <- row.names(limmaRes2[limmaRes2[,"logFC"]<2,])
  132.  
  133. #####
  134. # 2. RNAseq dataset
  135. #####
  136.  
  137. # calculate logFC values (using read counts)
  138.  
  139. # mean values for normoxic and hypoxic replicates
  140. meanNcounts = apply(dataRNAseq[,1:6], 1, mean)
  141. meanHcounts = apply(dataRNAseq[,7:10], 1, mean)
  142. # logFC (raw data)
  143. logFC = log2((meanHcounts + 1)/(meanNcounts + 1))
  144.  
  145. # distribution of log(FC) (raw data)
  146. hist(logFC, nclass = 100, main = "logFC (H/N) distribution \n(raw data)", xlab = "log(H/N) value")
  147. abline(v = 0, col = "red")
  148. # -> Normalization is required !
  149.  
  150. ####
  151. # DESeq method
  152. ####
  153.  
  154. library(DESeq2)
  155.  
  156. # Loading metadata for the experiment
  157. # N = Normoxic condition (with O2)
  158. # H = Hypoxic condition  (without O2)
  159. colData = read.table("design.txt", row.names = 1, header = TRUE)
  160.  
  161. # DESeqDataSet object creation
  162. dds = DESeqDataSetFromMatrix(countData = dataRNAseq, colData = colData, design = ~condition)
  163.  
  164. # normalization of counts
  165. # calculation of sizeFactors
  166. dds = estimateSizeFactors(dds)
  167. sizeFactors(dds)
  168.  
  169. # get normalized count values
  170. cdsNorm = counts(dds, normalized = TRUE)
  171.  
  172. # mean values for normoxic replicates
  173. meanNcountsNorm = apply(cdsNorm[,1:6], 1, mean)
  174. meanHcountsNorm = apply(cdsNorm[,7:10], 1, mean)
  175. # sd values for log(H/N) replicates
  176. sdNcountsNorm   = apply(cdsNorm[,1:6], 1, sd)
  177. sdHcountsNorm   = apply(cdsNorm[,7:10], 1, sd)
  178.  
  179. # logFC (after normalization)
  180. logFCNorm = log2((meanHcountsNorm + 1)/(meanNcountsNorm + 1))
  181.  
  182. hist(logFCNorm, nclass = 100, main = "logFC (H/N) distribution \n(normalized data)", xlab = "log(H/N) value")
  183. abline(v = 0, col = "red")
  184. # -> Most of the genes are not differentially expressed
  185.  
  186. # thresold can be chosen (here the value is 2) to select up and down regulated genes  
  187. abline(v = 2, col = "red", lty = "dashed")
  188. abline(v = -2, col = "green", lty = "dashed")
  189.  
  190. upGenes2 = names(logFCNorm[logFCNorm > 2])
  191. # -> 98 genes are selected
  192. downGenes2 = names(logFCNorm[logFCNorm < -2])
  193. # -> 62 genes are selected
  194.  
  195. # compare logFC values between microarrays and RNAseq
  196. plot(averageLog, logFCNorm, pch = 20,
  197.     xlab = "logFC with microarrays",
  198.     ylab = "logFC with RNAseq", main = "RNAseq versus microarrays (logFC values)")
  199. abline(v = 2, col = "red", lty = "dashed")
  200. abline(v = -2, col = "green", lty = "dashed")
  201. abline(h = 2, col = "red", lty = "dashed")
  202. abline(h = -2, col = "green", lty = "dashed")
  203.  
  204. # evaluate expression level of genes
  205. exprLevel = apply(cdsNorm, 1, mean)
  206.  
  207. # logFC versus the level of gene expression
  208. plot(log(exprLevel), logFCNorm, pch = 20,
  209.     xlab = "Gene expression level (log scale)", ylab = "logFC",
  210.     main = "C. parapsilosis - RNAseq data")
  211. abline(h = 2, col = "red", lty = "dashed")
  212. abline(h = -2, col = "green", lty = "dashed")
  213. points(log(exprLevel[upGenes2]), logFCNorm[upGenes2], pch = 20,
  214.     col = "red")
  215. points(log(exprLevel[downGenes2]), logFCNorm[downGenes2], pch = 20,
  216.     col = "green")
  217. # -> Down regulated genes have low expression compared to up regulated genes.
  218. # This can be visualized with IGV...
  219.  
  220. # perform the analysis with DESeq
  221.  
  222. # variance estimations
  223. dds = estimateDispersions(dds)
  224. plotDispEsts(dds)
  225.  
  226. # differential analysis
  227. dds = nbinomWaldTest(dds)
  228. res = results(dds)
  229.  
  230. write.table(res, "res.txt", row.name=T, quote=F, sep='\t')
  231.  
  232. # Get intersection
  233. highCommon <- intersect(limmaHigh,upGenes2)
  234. lowCommon <- intersect(limmaLow,downGenes2)
  235. allSelected=c(highCommon,lowCommon)
  236.  
  237. # compare logFC values between microarrays and RNAseq Top Values indicated
  238. plot(averageLog, logFCNorm, pch = 20,
  239.      xlab = "logFC with microarrays",
  240.      ylab = "logFC with RNAseq", main = "RNAseq versus microarrays (logFC values)",)
  241. abline(v = 2, col = "red", lty = "dashed")
  242. abline(v = -2, col = "green", lty = "dashed")
  243. abline(h = 2, col = "red", lty = "dashed")
  244. abline(h = -2, col = "green", lty = "dashed")
  245. points(averageLog[allSelected],logFCNorm[allSelected],pch=20,col="yellow")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement