Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # load data into R
- library(tidyverse)
- library(edgeR)
- files <- dir(path = "data", pattern = "*.tab", full.names = T)
- counttablefull <- files %>%
- map(read_tsv, skip = 4, col_names = FALSE ) %>%
- reduce(cbind)
- counttablefull %>% head()
- datasets <- files %>%
- stringr::str_replace("data/", "") %>%
- stringr::str_replace("ReadsPerGene.out.tab", "")
- colnames <- c()
- for (i in datasets) {
- colnames <- c(colnames,
- paste0("gene_i", i),
- paste0(i, "unstranded"),
- paste0(i, "_forwardstrand"),
- paste0(i, "_reversestrand")
- )
- }
- names(counttablefull) <- colnames
- # make a column for gene id, and remove duplicates
- counttablefull <-
- counttablefull %>%
- mutate(ensembl_gene = gene_iinput29b_2) %>%
- select(-starts_with("gene"))
- counttablefull %>%
- select(-ensembl_gene) %>%
- summarise_each(funs(sum)) %>%
- gather(library, counts) %>%
- separate(library, into = c("treatment", "replicate", "stranding"),
- sep="_") %>%
- spread(stranding, counts) %>%
- mutate(propF = forwardstrand/unstranded,
- propR = reversestrand/unstranded)
- # visualise library depth
- counttablefull %>%
- select(ends_with("reversestrand")) %>%
- summarise_each(funs(sum)) %>%
- gather(library, counts) %>%
- separate(library, into = c("treatment", "replicate", "stranding"),
- sep="_")%>%
- ggplot(aes(y=counts, x=treatment, fill=replicate)) + geom_bar(
- stat="identity", position="dodge") + theme_minimal()
- counttablefull <- counttablefull %>% select(ensembl_gene, ends_with("reversestrand"))
- names(counttablefull) <- str_replace(names(counttablefull), "_reversestrand", "")
- # EDA of count table ----
- counttablefull %>%
- select(-ensembl_gene) %>%
- gather(library, value) %>%
- filter(value > 0) %>%
- ggplot(aes(x=value)) + geom_density() + facet_wrap(~library) +
- scale_x_log10(labels = scales::comma)
- # Filter out lowly expressed genes
- counttablematrix <- counttablefull %>%
- select(-ensembl_gene) %>%
- as.matrix()
- row.names(counttablematrix) <- counttablefull$ensembl_gene
- counttable_cpm <- cpm(counttablematrix)
- counttablefull %>%
- select(-ensembl_gene) %>%
- purrr::modify(~sum(.)) %>%
- distinct() %>%
- gather(library, counts) %>%
- mutate(millionreads = counts/(10^6)) %>%
- mutate(cpmThreshold = 15/millionreads) %>%
- #filter(str_detect(library, "input")) %>%
- mutate(mycutoff = 0.6 * millionreads)
- # Filter ---
- subsetting_matrix <- counttable_cpm > 0.6
- subsetting_vector <- rowSums(subsetting_matrix) >= 3
- counttablematrix_filt <- counttablematrix[subsetting_vector,]
- mydgelist <- DGEList(counttablematrix_filt)
- ## MDS plot ---
- plotMDS(mydgelist)
- ## DGEA
- ## normalise
- mydgelist <- calcNormFactors(mydgelist)
- mydgelist$samples
- groups <- as.factor(
- str_split(rownames(mydgelist$samples), "_", simplify=T)[,1])
- design <- model.matrix(~0 + groups)
- colnames(design) <- levels(groups)
- # limma
- mydgelist_voomed <- voom(mydgelist, design, plot = T)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement