Advertisement
Guest User

compgen-day2

a guest
May 23rd, 2019
95
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 4.24 KB | None | 0 0
  1. #colorectal cancer
  2. counts_file <- system.file("extdata/rna-seq/SRP029880.raw_counts.tsv", package = "compGenomRData")
  3. coldata_file <- system.file("extdata/rna-seq/SRP029880.colData.tsv", package = "compGenomRData")
  4.  
  5. counts <- as.matrix(read.table(counts_file, header = T, sep = '\t'))
  6.  
  7. summary(counts)
  8.  
  9. cpm <- apply(subset(counts, select = c(-width)), 2, function(x) x/sum(as.numeric(x)) * 10^6)
  10. head(cpm)
  11.  
  12. ###Computing RPKM
  13.  
  14. # create a vector of gene lengths
  15. geneLengths <- as.vector(subset(counts, select = c(width)))
  16.  
  17. # compute rpkm
  18. rpkm <- apply(X = subset(counts, select = c(-width)),
  19.               MARGIN = 2,
  20.               FUN = function(x) 10^9 * x / geneLengths / sum(as.numeric(x)))
  21. head(rpkm)
  22.  
  23. #Computing TPM
  24.  
  25. #find gene length normalized values
  26. rpk <- apply( subset(counts, select = c(-width)), 2,
  27.               function(x) x/(geneLengths/1000))
  28. #normalize by the sample size using rpk values
  29. tpm <- apply(rpk, 2, function(x) x / sum(as.numeric(x)) * 10^6)
  30. head(tpm)
  31.  
  32. #compute the variance of each gene across samples
  33. V <- apply(tpm, 1, var)
  34. #sort the results by variance in decreasing order and select the top 100 genes
  35. selectedGenes <- names(V[order(V, decreasing = T)][1:100])
  36.  
  37. #Now we can quickly produce a heatmap where samples and genes are clustered
  38.  
  39. library(pheatmap)
  40. pheatmap(tpm[selectedGenes,], scale = 'row', show_rownames = FALSE)
  41. pheatmap(cpm[selectedGenes,1:10], scale = 'row', show_rownames = FALSE)
  42.  
  43. colData <- read.table(coldata_file, header = T, sep = '\t', stringsAsFactors = TRUE)
  44. pheatmap(tpm[selectedGenes,], scale = 'row',
  45.          show_rownames = FALSE,
  46.          annotation_col = colData)
  47.  
  48. ###PCA
  49. library(stats)
  50. library(ggplot2)
  51. require(ggfortify)
  52. #transpose the matrix
  53. M <- t(tpm[selectedGenes,])
  54. # transform the counts to log2 scale
  55. M <- log2(M + 1)
  56. #compute PCA
  57. pcaResults <- prcomp(M)
  58.  
  59. #plot PCA results making use of ggplot2's autoplot function
  60. #ggfortify is needed to let ggplot2 know about PCA data structure.
  61. autoplot(pcaResults, data = colData, colour = 'group')
  62.  
  63.  
  64. ###correlation
  65.  
  66. library(corrplot)
  67. library(stats)
  68. correlationMatrix <- cor(tpm)
  69.  
  70. #Let’s have a look at how the correlation matrix looks:
  71.  
  72. library(knitr)
  73. kable(correlationMatrix,booktabs = TRUE)
  74. # The correlation plot order by the results of the hierarchical clustering
  75. corrplot(correlationMatrix, order = 'hclust')
  76.  
  77. # We can add the pairwise correlation scores on the plot
  78. corrplot(correlationMatrix, order = 'hclust', addrect = 2, addCoef.col = 'white')
  79.  
  80. # basic correlation heatmap
  81. pheatmap(correlationMatrix)
  82.  
  83.  
  84. #####DEA######
  85. countData <- as.matrix(subset(counts, select = c(-width)))
  86. #define the experimental setup
  87. colData <- read.table(coldata_file, header = T, sep = '\t', stringsAsFactors = TRUE)
  88. #define the design formula
  89. designFormula <- "~ group"
  90.  
  91. #Now, we are ready to run DESeq2.
  92.  
  93. library(DESeq2)
  94. library(stats)
  95. #create a DESeq dataset object from the count matrix and the colData
  96. dds <- DESeqDataSetFromMatrix(countData = countData,
  97.                               colData = colData,
  98.                               design = as.formula(designFormula))
  99. #print dds object to see the contents
  100. print(dds)
  101.  
  102. dds <- dds[ rowSums(counts(dds)) > 1, ]
  103.  
  104. dds <- DESeq(dds)
  105.  
  106. #Now, we can compare and contrast the samples based on different variables of interest. In this case, we currently have only one variable, which is the group variable that determines if a sample belongs to the CASE group or the CTRL group.
  107.  
  108. #compute the contrast for the 'group' variable where 'CTRL' samples are used as the control group.
  109. DEresults = results(dds, contrast = c("group", 'CASE', 'CTRL'))
  110. #sort results by increasing p-value
  111. DEresults <- DEresults[order(DEresults$pvalue),]
  112.  
  113. ###MA PLOT####
  114.  
  115. library(DESeq2)
  116. DESeq2::plotMA(object = dds, ylim = c(-5, 5))
  117.  
  118. #####
  119.  
  120. library(ggplot2)
  121. ggplot(data = as.data.frame(DEresults), aes(x = pvalue)) + geom_histogram(bins = 100)
  122.  
  123.  
  124.  
  125. library(DESeq2)
  126. # extract normalized counts from the DESeqDataSet object
  127. countsNormalized <- counts(dds, normalized = TRUE)
  128.  
  129. # select top 500 most variable genes
  130. selectedGenes <- names(sort(apply(countsNormalized, 1, var), decreasing = TRUE)[1:500])
  131.  
  132. plotPCA(countsNormalized[selectedGenes,], col = as.numeric(colData$group))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement