edgeR_DEseq2

# create FAKA fastq files
touch a.fastq b.fastq c.fastq d.fastq e.fastq f.fastq

## edgeR COMMANDS
#Open the R terminal (from the folder with .count and samples1 files) and do the following commands.
#R

#Load the edgeR package, already installed:
library(edgeR)

#Create a variable with samples file information:
samples = read.table("samples_edgeR", sep="\t", header=TRUE)

#Read count files created by htseq-count with readDGE function
counts = readDGE(samples$countf)$counts

#Filter weakly expressed and noninformative (e.g., non-aligned) features using a command like
noint = rownames(counts) %in% c("__no_feature","__ambiguous","__too_low_aQual","__not_aligned","__alignment_not_unique")
cpms = cpm(counts)
keep = rowSums(cpms >1) >=3 & !noint
counts = counts[keep,]
colnames(counts) = samples$shortname
head(counts[,order(samples$condition)], 5)


# Create a DGEList object (edgeR’s container for RNA-seq count data):
d = DGEList(counts=counts, group = samples$condition)
d$samples

# Estimate normalization factors by effective library size (TMM -trimmed mean of M-values)
#Normalizes for RNA composition by finding a set of scalingfactors for the library sizes that minimize the log-fold changes between the samples for most genes
d = calcNormFactors(d)
d$samples


# Inspect the relationships between samples using a multidimensional scaling (MDS) plot
plotMDS(d, labels=samples$shortname, col=c("darkgreen","blue")[factor(samples$condition)])

# Export the figure in PDF format:
dev.copy2pdf(file="plotMDS.pdf")

# Estimate dispersions with quantile-adjusted conditional maximum likelihood (qCML) method (experiments with single factor)
d = estimateCommonDisp(d)
d = estimateTagwiseDisp(d)

#Plot the mean-variance relationship:
plotMeanVar(d, show.tagwise.vars=TRUE, NBline=TRUE)
dev.copy2pdf(file="plotMeanVar.pdf")

#plot the relationship of biological coefficient of variation (sqrt(common.dispersion)) versus mean log CPM
d$common.dispersion
plotBCV(d)
dev.copy2pdf(file="plotBCV.pdf")

d$common.dispersion
sqrt(d$common.dispersion)


# Test for differential expression (classic' edgeR for single factor experiments) - exact test:
de = exactTest(d, pair=c("cntr","trat"))

# Use the topTags function to present a tabular summary of the differential expression statistics (exact Test):
tt = topTags(de, n=nrow(d))
# Visualize it:
head(tt$table)

#Inspect the depth-adjusted reads per million for some of the top differentially expressed genes:
nc = cpm(d, normalized.lib.sizes=TRUE)
rn = rownames(tt$table)
head(nc[rn,order(samples$condition)],5)

#Create a graphical summary, M (log-fold change) versus A (log-average expression) plot, showing the genes selected as differentially expressed (with # a 5% false discovery rate)
deg = rn[tt$table$FDR < .05]

#Plot log-fold change against log-counts per million, with DE genes highlighted
plotMD(de)
abline(h=c(-1, 1), col="blue")
dev.copy2pdf(file="plotMD.pdf")

#Total number of genes significantly up-regulated or down-regulated at 5% FDR
summary(decideTests(de))


# Finally, save the result table as a CSV file
write.csv(tt$table, file="tableDEG_edgeR.csv")


#Exit R terminal and check the final output:
q()


#### // \\ ####

## DESeq2 COMMANDS

#Open R
#R

#Load the DESeq2 package, already installed:
library("DESeq2")

#Go to the correct folder
getwd()


directory <- "/home/rabico/Documentos/transcriptoma"

#Create a variable with sampleTable file information:
sampleTable <- read.table('samples_DESeq2',sep='\t', header=T)

#Treatments - levels
treatments = c("cntr","trat")

#Use the function DESeqDataSetFromHTSeqCount if you have used htseq-count from the HTSeq
ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,directory = directory,design = ~ condition)


#Create factor according to condition
colData(ddsHTSeq)$condition <- factor(colData(ddsHTSeq)$condition,levels = treatments)

#Count number of lines
nrow(ddsHTSeq)

#Pre-filtering low count genes
keep <- rowSums(counts(ddsHTSeq)) >= 10
dds <- ddsHTSeq[keep,]
nrow(dds)


#Differential expression analysis
dds <- DESeq(dds)

#inspect sample relationships, invoke a variance-stabilizaing transformation
library("vsn")
vsd <- vst(dds, blind = FALSE)
plotPCA(vsd)
dev.copy2pdf(file="plotPCAvsd")


#Inspect the estimated dispersions
plotDispEsts(dds)
dev.copy2pdf(file="plotDispEsts")


#Log fold change shrinkage for visualization and ranking
resultsNames(dds)
resLFC <- lfcShrink(dds, coef="condition_trat_vs_cntr", type="apeglm")
resLFC
plotMA(resLFC, ylim=c(-8,8), main= "trat vs cntr", alpha=0.05)
abline(h=c(-1,1),col='green')
dev.copy2pdf(file='plotMA.pdf')


#Ordering by p-value adjusted
resOrdered <- resLFC[order(resLFC$padj),]

write.csv(as.data.frame(resOrdered), file="tableDEG_DESeq2.csv")

resSig <- subset(resOrdered, padj < 0.05)
resSig


q()