Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #colorectal cancer
- counts_file <- system.file("extdata/rna-seq/SRP029880.raw_counts.tsv", package = "compGenomRData")
- coldata_file <- system.file("extdata/rna-seq/SRP029880.colData.tsv", package = "compGenomRData")
- counts <- as.matrix(read.table(counts_file, header = T, sep = '\t'))
- summary(counts)
- cpm <- apply(subset(counts, select = c(-width)), 2, function(x) x/sum(as.numeric(x)) * 10^6)
- head(cpm)
- ###Computing RPKM
- # create a vector of gene lengths
- geneLengths <- as.vector(subset(counts, select = c(width)))
- # compute rpkm
- rpkm <- apply(X = subset(counts, select = c(-width)),
- MARGIN = 2,
- FUN = function(x) 10^9 * x / geneLengths / sum(as.numeric(x)))
- head(rpkm)
- #Computing TPM
- #find gene length normalized values
- rpk <- apply( subset(counts, select = c(-width)), 2,
- function(x) x/(geneLengths/1000))
- #normalize by the sample size using rpk values
- tpm <- apply(rpk, 2, function(x) x / sum(as.numeric(x)) * 10^6)
- head(tpm)
- #compute the variance of each gene across samples
- V <- apply(tpm, 1, var)
- #sort the results by variance in decreasing order and select the top 100 genes
- selectedGenes <- names(V[order(V, decreasing = T)][1:100])
- #Now we can quickly produce a heatmap where samples and genes are clustered
- library(pheatmap)
- pheatmap(tpm[selectedGenes,], scale = 'row', show_rownames = FALSE)
- pheatmap(cpm[selectedGenes,1:10], scale = 'row', show_rownames = FALSE)
- colData <- read.table(coldata_file, header = T, sep = '\t', stringsAsFactors = TRUE)
- pheatmap(tpm[selectedGenes,], scale = 'row',
- show_rownames = FALSE,
- annotation_col = colData)
- ###PCA
- library(stats)
- library(ggplot2)
- require(ggfortify)
- #transpose the matrix
- M <- t(tpm[selectedGenes,])
- # transform the counts to log2 scale
- M <- log2(M + 1)
- #compute PCA
- pcaResults <- prcomp(M)
- #plot PCA results making use of ggplot2's autoplot function
- #ggfortify is needed to let ggplot2 know about PCA data structure.
- autoplot(pcaResults, data = colData, colour = 'group')
- ###correlation
- library(corrplot)
- library(stats)
- correlationMatrix <- cor(tpm)
- #Letβs have a look at how the correlation matrix looks:
- library(knitr)
- kable(correlationMatrix,booktabs = TRUE)
- # The correlation plot order by the results of the hierarchical clustering
- corrplot(correlationMatrix, order = 'hclust')
- # We can add the pairwise correlation scores on the plot
- corrplot(correlationMatrix, order = 'hclust', addrect = 2, addCoef.col = 'white')
- # basic correlation heatmap
- pheatmap(correlationMatrix)
- #####DEA######
- countData <- as.matrix(subset(counts, select = c(-width)))
- #define the experimental setup
- colData <- read.table(coldata_file, header = T, sep = '\t', stringsAsFactors = TRUE)
- #define the design formula
- designFormula <- "~ group"
- #Now, we are ready to run DESeq2.
- library(DESeq2)
- library(stats)
- #create a DESeq dataset object from the count matrix and the colData
- dds <- DESeqDataSetFromMatrix(countData = countData,
- colData = colData,
- design = as.formula(designFormula))
- #print dds object to see the contents
- print(dds)
- dds <- dds[ rowSums(counts(dds)) > 1, ]
- dds <- DESeq(dds)
- #Now, we can compare and contrast the samples based on different variables of interest. In this case, we currently have only one variable, which is the group variable that determines if a sample belongs to the CASE group or the CTRL group.
- #compute the contrast for the 'group' variable where 'CTRL' samples are used as the control group.
- DEresults = results(dds, contrast = c("group", 'CASE', 'CTRL'))
- #sort results by increasing p-value
- DEresults <- DEresults[order(DEresults$pvalue),]
- ###MA PLOT####
- library(DESeq2)
- DESeq2::plotMA(object = dds, ylim = c(-5, 5))
- #####
- library(ggplot2)
- ggplot(data = as.data.frame(DEresults), aes(x = pvalue)) + geom_histogram(bins = 100)
- library(DESeq2)
- # extract normalized counts from the DESeqDataSet object
- countsNormalized <- counts(dds, normalized = TRUE)
- # select top 500 most variable genes
- selectedGenes <- names(sort(apply(countsNormalized, 1, var), decreasing = TRUE)[1:500])
- plotPCA(countsNormalized[selectedGenes,], col = as.numeric(colData$group))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement