Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # data reading
- dataArrays = read.table("Microarrays_logValues_parapsilosis.txt", header = T, row.names = 1)
- dataRNAseq = read.table("RNAseq_counts_parapsilosis.txt", header = T, row.names = 1)
- # --> same genes are listed in both files, with the same order
- #####
- # 1. Microarrays dataset
- #####
- # calculate average logFC values
- averageLog = apply(dataArrays, 1, mean)
- # histogram of obtained values
- hist(averageLog, xlab = "average logFC value", nclass = 100,
- main = "C. parapsilosis - Microarrays dataset")
- # -> most of the genes are not differentially expressed
- # (distribution is centered around 0)
- # thresold can be chosen (here the value is 2) to select up and down regulated genes
- abline(v = 2, col = "red", lty = "dashed")
- abline(v = -2, col = "green", lty = "dashed")
- upGenes = names(averageLog[averageLog > 2])
- # -> 76 genes are selected
- downGenes = names(averageLog[averageLog < -2])
- # -> 11 genes are selected
- # order genes according to differential expression (averageLog)
- sortedGenes = sort(averageLog, decreasing = T)
- # evaluate data reproducibility (standard deviation)
- sdValues = apply(dataArrays, 1, sd)
- # visualize both information on a single graph
- plot(sdValues, averageLog,
- main = "C. parapsilosis - Microarrays dataset",
- xlab = "SD values (data reproducibility)", ylab = "LogFC (average value)",
- pch = 20)
- abline(h = 2, col = "red", lty = "dashed")
- abline(h = -2, col = "green", lty = "dashed")
- # superimpose up- and down- regulated genes
- points(sdValues[upGenes], averageLog[upGenes], col = "red", pch = 20)
- points(sdValues[downGenes], averageLog[downGenes], col = "green", pch = 20)
- # calculate t values (classical Student parameter)
- tval = averageLog/(sdValues/sqrt(ncol(dataArrays)))
- # histogram of obtained values
- H = hist(tval, xlab = "t value (classical Student parameter)", nclass = 100,
- main = "C. parapsilosis - Microarrays dataset", prob = T)
- # -> most of the genes are not differentially expressed
- # (distribution is centered around 0)
- # in a classical test (Student statistics), genes are selected if tval > theo
- ttheo = pt(1-(0.001/2), df = 3) # alpha = 0.1% and df = n - 1 = 4 - 1
- upGenesL = names(tval[tval > ttheo])
- # -> 2252 genes are selected
- downGenesL = names(tval[tval < -ttheo])
- # -> 2384 genes are selected
- # visualize selected genes
- #par(mfrow = c(2,2))
- plot(averageLog, tval,
- main = "C. parapsilosis - Microarrays dataset",
- ylab = "tval (Student parameter)", xlab = "LogFC (average value)",
- pch = 20)
- abline(h = 0, col = "black")
- abline(v = 0, col = "black")
- plot(averageLog, tval,
- main = "C. parapsilosis - Microarrays dataset",
- ylab = "tval (Student parameter)", xlab = "LogFC (average value)",
- pch = 20)
- abline(v = 2, col = "red", lty = "dashed")
- abline(v = -2, col = "green", lty = "dashed")
- abline(h = 0, col = "black")
- abline(v = 0, col = "black")
- # superimpose up- and down- regulated genes (previously identified)
- points(averageLog[upGenes], tval[upGenes], col = "red", pch = 20)
- points(averageLog[downGenes], tval[downGenes], col = "green", pch = 20)
- plot(sdValues, tval,
- main = "C. parapsilosis - Microarrays dataset",
- ylab = "tval (Student parameter)", xlab = "SD values (data reproducibility)",
- pch = 20)
- abline(h = 0, col = "black")
- abline(v = 0, col = "black")
- ####
- # limma method
- ####
- library(limma)
- # Linear model estimation
- fit = lmFit(dataArrays)
- # Bayesian statistics
- limmaRes = topTable(eBayes(fit), number = nrow(dataArrays))[,-6]
- # sort genes according to their initial order in dataArrays
- limmaRes2 = limmaRes[row.names(dataArrays),]
- # compare logFC values obtained with LIMMA
- plot(limmaRes2[, "logFC"], averageLog, pch = 20,
- xlab = "logFC calculated with LIMMA", ylab = "LogFC (average value)")
- # -> Values are identical
- # compare tlimma with tval (Student)
- plot(averageLog, tval,
- main = "C. parapsilosis - Microarrays dataset",
- ylab = "t (Student in black and LIMMA in purple)", xlab = "LogFC (average value)",
- pch = 20)
- points(averageLog, limmaRes2[,"t"], pch = 20, col = "purple")
- abline(h = 0, col = "black")
- abline(v = 0, col = "black")
- plot(sdValues, tval,
- main = "C. parapsilosis - Microarrays dataset",
- ylab = "t (Student in black and LIMMA in purple)", xlab = "SD values (data reproducibility)",
- pch = 20)
- points(sdValues, limmaRes2[,"t"], pch = 20, col = "purple")
- abline(h = 0, col = "black")
- # -> LIMMA is a better method to identify differentially expressed genes
- limmaHigh <- row.names(limmaRes2[limmaRes2[,"logFC"]>2,])
- limmaLow <- row.names(limmaRes2[limmaRes2[,"logFC"]<2,])
- #####
- # 2. RNAseq dataset
- #####
- # calculate logFC values (using read counts)
- # mean values for normoxic and hypoxic replicates
- meanNcounts = apply(dataRNAseq[,1:6], 1, mean)
- meanHcounts = apply(dataRNAseq[,7:10], 1, mean)
- # logFC (raw data)
- logFC = log2((meanHcounts + 1)/(meanNcounts + 1))
- # distribution of log(FC) (raw data)
- hist(logFC, nclass = 100, main = "logFC (H/N) distribution \n(raw data)", xlab = "log(H/N) value")
- abline(v = 0, col = "red")
- # -> Normalization is required !
- ####
- # DESeq method
- ####
- library(DESeq2)
- # Loading metadata for the experiment
- # N = Normoxic condition (with O2)
- # H = Hypoxic condition (without O2)
- colData = read.table("design.txt", row.names = 1, header = TRUE)
- # DESeqDataSet object creation
- dds = DESeqDataSetFromMatrix(countData = dataRNAseq, colData = colData, design = ~condition)
- # normalization of counts
- # calculation of sizeFactors
- dds = estimateSizeFactors(dds)
- sizeFactors(dds)
- # get normalized count values
- cdsNorm = counts(dds, normalized = TRUE)
- # mean values for normoxic replicates
- meanNcountsNorm = apply(cdsNorm[,1:6], 1, mean)
- meanHcountsNorm = apply(cdsNorm[,7:10], 1, mean)
- # sd values for log(H/N) replicates
- sdNcountsNorm = apply(cdsNorm[,1:6], 1, sd)
- sdHcountsNorm = apply(cdsNorm[,7:10], 1, sd)
- # logFC (after normalization)
- logFCNorm = log2((meanHcountsNorm + 1)/(meanNcountsNorm + 1))
- hist(logFCNorm, nclass = 100, main = "logFC (H/N) distribution \n(normalized data)", xlab = "log(H/N) value")
- abline(v = 0, col = "red")
- # -> Most of the genes are not differentially expressed
- # thresold can be chosen (here the value is 2) to select up and down regulated genes
- abline(v = 2, col = "red", lty = "dashed")
- abline(v = -2, col = "green", lty = "dashed")
- upGenes2 = names(logFCNorm[logFCNorm > 2])
- # -> 98 genes are selected
- downGenes2 = names(logFCNorm[logFCNorm < -2])
- # -> 62 genes are selected
- # compare logFC values between microarrays and RNAseq
- plot(averageLog, logFCNorm, pch = 20,
- xlab = "logFC with microarrays",
- ylab = "logFC with RNAseq", main = "RNAseq versus microarrays (logFC values)")
- abline(v = 2, col = "red", lty = "dashed")
- abline(v = -2, col = "green", lty = "dashed")
- abline(h = 2, col = "red", lty = "dashed")
- abline(h = -2, col = "green", lty = "dashed")
- # evaluate expression level of genes
- exprLevel = apply(cdsNorm, 1, mean)
- # logFC versus the level of gene expression
- plot(log(exprLevel), logFCNorm, pch = 20,
- xlab = "Gene expression level (log scale)", ylab = "logFC",
- main = "C. parapsilosis - RNAseq data")
- abline(h = 2, col = "red", lty = "dashed")
- abline(h = -2, col = "green", lty = "dashed")
- points(log(exprLevel[upGenes2]), logFCNorm[upGenes2], pch = 20,
- col = "red")
- points(log(exprLevel[downGenes2]), logFCNorm[downGenes2], pch = 20,
- col = "green")
- # -> Down regulated genes have low expression compared to up regulated genes.
- # This can be visualized with IGV...
- # perform the analysis with DESeq
- # variance estimations
- dds = estimateDispersions(dds)
- plotDispEsts(dds)
- # differential analysis
- dds = nbinomWaldTest(dds)
- res = results(dds)
- write.table(res, "res.txt", row.name=T, quote=F, sep='\t')
- # Get intersection
- highCommon <- intersect(limmaHigh,upGenes2)
- lowCommon <- intersect(limmaLow,downGenes2)
- allSelected=c(highCommon,lowCommon)
- # compare logFC values between microarrays and RNAseq Top Values indicated
- plot(averageLog, logFCNorm, pch = 20,
- xlab = "logFC with microarrays",
- ylab = "logFC with RNAseq", main = "RNAseq versus microarrays (logFC values)",)
- abline(v = 2, col = "red", lty = "dashed")
- abline(v = -2, col = "green", lty = "dashed")
- abline(h = 2, col = "red", lty = "dashed")
- abline(h = -2, col = "green", lty = "dashed")
- points(averageLog[allSelected],logFCNorm[allSelected],pch=20,col="yellow")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement