Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # create FAKA fastq files
- touch a.fastq b.fastq c.fastq d.fastq e.fastq f.fastq
- ## edgeR COMMANDS
- #Open the R terminal (from the folder with .count and samples1 files) and do the following commands.
- #R
- #Load the edgeR package, already installed:
- library(edgeR)
- #Create a variable with samples file information:
- samples = read.table("samples_edgeR", sep="\t", header=TRUE)
- #Read count files created by htseq-count with readDGE function
- counts = readDGE(samples$countf)$counts
- #Filter weakly expressed and noninformative (e.g., non-aligned) features using a command like
- noint = rownames(counts) %in% c("__no_feature","__ambiguous","__too_low_aQual","__not_aligned","__alignment_not_unique")
- cpms = cpm(counts)
- keep = rowSums(cpms >1) >=3 & !noint
- counts = counts[keep,]
- colnames(counts) = samples$shortname
- head(counts[,order(samples$condition)], 5)
- # Create a DGEList object (edgeR’s container for RNA-seq count data):
- d = DGEList(counts=counts, group = samples$condition)
- d$samples
- # Estimate normalization factors by effective library size (TMM -trimmed mean of M-values)
- #Normalizes for RNA composition by finding a set of scalingfactors for the library sizes that minimize the log-fold changes between the samples for most genes
- d = calcNormFactors(d)
- d$samples
- # Inspect the relationships between samples using a multidimensional scaling (MDS) plot
- plotMDS(d, labels=samples$shortname, col=c("darkgreen","blue")[factor(samples$condition)])
- # Export the figure in PDF format:
- dev.copy2pdf(file="plotMDS.pdf")
- # Estimate dispersions with quantile-adjusted conditional maximum likelihood (qCML) method (experiments with single factor)
- d = estimateCommonDisp(d)
- d = estimateTagwiseDisp(d)
- #Plot the mean-variance relationship:
- plotMeanVar(d, show.tagwise.vars=TRUE, NBline=TRUE)
- dev.copy2pdf(file="plotMeanVar.pdf")
- #plot the relationship of biological coefficient of variation (sqrt(common.dispersion)) versus mean log CPM
- d$common.dispersion
- plotBCV(d)
- dev.copy2pdf(file="plotBCV.pdf")
- d$common.dispersion
- sqrt(d$common.dispersion)
- # Test for differential expression (classic' edgeR for single factor experiments) - exact test:
- de = exactTest(d, pair=c("cntr","trat"))
- # Use the topTags function to present a tabular summary of the differential expression statistics (exact Test):
- tt = topTags(de, n=nrow(d))
- # Visualize it:
- head(tt$table)
- #Inspect the depth-adjusted reads per million for some of the top differentially expressed genes:
- nc = cpm(d, normalized.lib.sizes=TRUE)
- rn = rownames(tt$table)
- head(nc[rn,order(samples$condition)],5)
- #Create a graphical summary, M (log-fold change) versus A (log-average expression) plot, showing the genes selected as differentially expressed (with # a 5% false discovery rate)
- deg = rn[tt$table$FDR < .05]
- #Plot log-fold change against log-counts per million, with DE genes highlighted
- plotMD(de)
- abline(h=c(-1, 1), col="blue")
- dev.copy2pdf(file="plotMD.pdf")
- #Total number of genes significantly up-regulated or down-regulated at 5% FDR
- summary(decideTests(de))
- # Finally, save the result table as a CSV file
- write.csv(tt$table, file="tableDEG_edgeR.csv")
- #Exit R terminal and check the final output:
- q()
- #### // \\ ####
- ## DESeq2 COMMANDS
- #Open R
- #R
- #Load the DESeq2 package, already installed:
- library("DESeq2")
- #Go to the correct folder
- getwd()
- directory <- "/home/rabico/Documentos/transcriptoma"
- #Create a variable with sampleTable file information:
- sampleTable <- read.table('samples_DESeq2',sep='\t', header=T)
- #Treatments - levels
- treatments = c("cntr","trat")
- #Use the function DESeqDataSetFromHTSeqCount if you have used htseq-count from the HTSeq
- ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,directory = directory,design = ~ condition)
- #Create factor according to condition
- colData(ddsHTSeq)$condition <- factor(colData(ddsHTSeq)$condition,levels = treatments)
- #Count number of lines
- nrow(ddsHTSeq)
- #Pre-filtering low count genes
- keep <- rowSums(counts(ddsHTSeq)) >= 10
- dds <- ddsHTSeq[keep,]
- nrow(dds)
- #Differential expression analysis
- dds <- DESeq(dds)
- #inspect sample relationships, invoke a variance-stabilizaing transformation
- library("vsn")
- vsd <- vst(dds, blind = FALSE)
- plotPCA(vsd)
- dev.copy2pdf(file="plotPCAvsd")
- #Inspect the estimated dispersions
- plotDispEsts(dds)
- dev.copy2pdf(file="plotDispEsts")
- #Log fold change shrinkage for visualization and ranking
- resultsNames(dds)
- resLFC <- lfcShrink(dds, coef="condition_trat_vs_cntr", type="apeglm")
- resLFC
- plotMA(resLFC, ylim=c(-8,8), main= "trat vs cntr", alpha=0.05)
- abline(h=c(-1,1),col='green')
- dev.copy2pdf(file='plotMA.pdf')
- #Ordering by p-value adjusted
- resOrdered <- resLFC[order(resLFC$padj),]
- write.csv(as.data.frame(resOrdered), file="tableDEG_DESeq2.csv")
- resSig <- subset(resOrdered, padj < 0.05)
- resSig
- q()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement