compgen-day2

#colorectal cancer
counts_file <- system.file("extdata/rna-seq/SRP029880.raw_counts.tsv", package = "compGenomRData")
coldata_file <- system.file("extdata/rna-seq/SRP029880.colData.tsv", package = "compGenomRData")

counts <- as.matrix(read.table(counts_file, header = T, sep = '\t'))

summary(counts)

cpm <- apply(subset(counts, select = c(-width)), 2, function(x) x/sum(as.numeric(x)) * 10^6)
head(cpm)

###Computing RPKM

# create a vector of gene lengths
geneLengths <- as.vector(subset(counts, select = c(width)))

# compute rpkm
rpkm <- apply(X = subset(counts, select = c(-width)),
              MARGIN = 2,
              FUN = function(x) 10^9 * x / geneLengths / sum(as.numeric(x)))
head(rpkm)

#Computing TPM

#find gene length normalized values
rpk <- apply( subset(counts, select = c(-width)), 2,
              function(x) x/(geneLengths/1000))
#normalize by the sample size using rpk values
tpm <- apply(rpk, 2, function(x) x / sum(as.numeric(x)) * 10^6)
head(tpm)

#compute the variance of each gene across samples
V <- apply(tpm, 1, var)
#sort the results by variance in decreasing order and select the top 100 genes
selectedGenes <- names(V[order(V, decreasing = T)][1:100])

#Now we can quickly produce a heatmap where samples and genes are clustered

library(pheatmap)
pheatmap(tpm[selectedGenes,], scale = 'row', show_rownames = FALSE)
pheatmap(cpm[selectedGenes,1:10], scale = 'row', show_rownames = FALSE)

colData <- read.table(coldata_file, header = T, sep = '\t', stringsAsFactors = TRUE)
pheatmap(tpm[selectedGenes,], scale = 'row',
         show_rownames = FALSE,
         annotation_col = colData)

###PCA
library(stats)
library(ggplot2)
require(ggfortify)
#transpose the matrix
M <- t(tpm[selectedGenes,])
# transform the counts to log2 scale
M <- log2(M + 1)
#compute PCA
pcaResults <- prcomp(M)

#plot PCA results making use of ggplot2's autoplot function
#ggfortify is needed to let ggplot2 know about PCA data structure.
autoplot(pcaResults, data = colData, colour = 'group')


###correlation

library(corrplot)
library(stats)
correlationMatrix <- cor(tpm)

#Let’s have a look at how the correlation matrix looks:

library(knitr)
kable(correlationMatrix,booktabs = TRUE)
# The correlation plot order by the results of the hierarchical clustering
corrplot(correlationMatrix, order = 'hclust')

# We can add the pairwise correlation scores on the plot
corrplot(correlationMatrix, order = 'hclust', addrect = 2, addCoef.col = 'white')

# basic correlation heatmap
pheatmap(correlationMatrix)


#####DEA######
countData <- as.matrix(subset(counts, select = c(-width)))
#define the experimental setup
colData <- read.table(coldata_file, header = T, sep = '\t', stringsAsFactors = TRUE)
#define the design formula
designFormula <- "~ group"

#Now, we are ready to run DESeq2.

library(DESeq2)
library(stats)
#create a DESeq dataset object from the count matrix and the colData
dds <- DESeqDataSetFromMatrix(countData = countData,
                              colData = colData,
                              design = as.formula(designFormula))
#print dds object to see the contents
print(dds)

dds <- dds[ rowSums(counts(dds)) > 1, ]

dds <- DESeq(dds)

#Now, we can compare and contrast the samples based on different variables of interest. In this case, we currently have only one variable, which is the group variable that determines if a sample belongs to the CASE group or the CTRL group.

#compute the contrast for the 'group' variable where 'CTRL' samples are used as the control group.
DEresults = results(dds, contrast = c("group", 'CASE', 'CTRL'))
#sort results by increasing p-value
DEresults <- DEresults[order(DEresults$pvalue),]

###MA PLOT####

library(DESeq2)
DESeq2::plotMA(object = dds, ylim = c(-5, 5))

#####

library(ggplot2)
ggplot(data = as.data.frame(DEresults), aes(x = pvalue)) + geom_histogram(bins = 100)


library(DESeq2)
# extract normalized counts from the DESeqDataSet object
countsNormalized <- counts(dds, normalized = TRUE)

# select top 500 most variable genes
selectedGenes <- names(sort(apply(countsNormalized, 1, var), decreasing = TRUE)[1:500])

plotPCA(countsNormalized[selectedGenes,], col = as.numeric(colData$group))