Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #-------------------------------------------------------------------------------
- # Function: hypergeometric test
- #-------------------------------------------------------------------------------
- hypergeo <- function(white.drawn, white, black, drawn, do.log=FALSE) {
- # Info: http://digitheadslabnotebook.blogspot.com/2011/02/using-r-for-introductory-statistics_21.html
- # dhyper(q, m, n, k, log = FALSE)
- # q = number of successes; "white balls drawn" (here: number of genes that overlap)
- # m + n = N ; N = total number of genes
- # m = "white balls in urn"; total number of TF-bound genes
- # n = "black balls in urn"; total number of genes NOT bound by the TF
- # k = "number of balls drawn from urn"; sample size
- if (white < 1) {return(NA)}
- p <- phyper(white.drawn-1, white, black, drawn, lower.tail = FALSE, log.p=do.log)
- return(p)
- } # end: hypergeo
- #-------------------------------------------------------------------------------
- # Set the working directory and load the data files
- #-------------------------------------------------------------------------------
- setwd("~/Desktop/R_Project/Gene_overlap")
- getwd()
- files <- list.files(pattern="*.txt", full.names = TRUE)
- files
- data.list <- lapply(files, function(fil) {
- scan(file=fil, what=character())
- })
- names(data.list) <- basename(files) %>% stringr::str_remove("\.txt$")
- str(data.list)
- # List of 8
- # $ GSE108363_BCGdown_D:chr [1:350] "IL1B" "IL6" "IL1A" "CCL20" ...
- # $ GSE108363_BCGdown_V: chr [1:267] "IL6" "CCL20" "IL1A" "CXCL5" ...
- # $ GSE108363_BCGup_D : chr [1:250] "FABP4" "CMTM2" "FUCA1" "CD36" ...
- # $ GSE108363_BCGup_V : chr [1:429] "FCN1" "FCGR3B" "MNDA" "CPVL" ...
- # $ GSE108363_MTBdown_D: chr [1:86] "CCL20" "IL1B" "IL1A" "IL6" ...
- # $ GSE108363_MTBdown_V: chr [1:244] "IL1B" "IL1A" "CCL20" "IL6" ...
- # $ GSE108363_MTBup_D : chr [1:128] "FUCA1" "FGL2" "TGFBI" "CPVL" ...
- # $ GSE108363_MTBup_V : chr [1:286] "FABP4" "RNASE1" "MNDA" "CPVL" ...
- intersect(data.list$GSE108363_BCGdown_D, data.list$GSE108363_BCGdown_V) %>% length
- sapply(data.list, length)
- #-------------------------------------------------------------------------------
- # Using the intersect function to see the overlaps
- #-------------------------------------------------------------------------------
- data.file1 <- "GSE108363_BCGdown_V.txt"
- data.file2 <- "GSE108363_BCGdown_D.txt"
- data.file3 <- "GSE108363_BCGup_V.txt"
- data.file4 <- "GSE108363_BCGup_D.txt"
- data.file5 <- "GSE108363_MTBdown_V.txt"
- data.file6 <- "GSE108363_MTBdown_D.txt"
- data.file7 <- "GSE108363_MTBup_V.txt"
- data.file8 <- "GSE108363_MTBup_D.txt"
- genevect1 <- scan(data.file1, what=character(), sep="n")
- genevect2 <- scan(data.file2, what=character(), sep="n")
- genevect3 <- scan(data.file3, what=character(), sep="n")
- genevect4 <- scan(data.file4, what=character(), sep="n")
- genevect5 <- scan(data.file5, what=character(), sep="n")
- genevect6 <- scan(data.file6, what=character(), sep="n")
- genevect7 <- scan(data.file7, what=character(), sep="n")
- genevect8 <- scan(data.file8, what=character(), sep="n")
- filelist <- list(data.file1, data.file2, data.file3, data.file4, data.file5, data.file6, data.file7, data.file8)
- all(sapply(filelist, file.exists))
- #-------------------------------------------------------------------------------
- # read files:
- #-------------------------------------------------------------------------------
- gene.lists <- lapply(filelist, function(f) {
- scan(file=f, what=character())
- })
- #-------------------------------------------------------------------------------
- # set up empty matrix
- #-------------------------------------------------------------------------------
- x <- (length(gene.lists))^2
- x
- y <- rep(NA, x)
- mx <- matrix(y, ncol=length(gene.lists))
- mx
- row.names(mx) <- sapply(filelist, basename) %>% stringr::str_remove('.txt$')
- colnames(mx) <- sapply(filelist, basename) %>% stringr::str_remove('.txt$')
- mx
- mx.overlap.count <- mx
- #-------------------------------------------------------------------------------
- # seq_along(gene.lists) # 1 2 3 4 5 6 7 8
- #-------------------------------------------------------------------------------
- for (i in seq_along(gene.lists)) {
- g1 <- gene.lists[[i]]
- for (j in seq_along(gene.lists)) {
- g2 <- gene.lists[[j]]
- a <- intersect(g1, g2)
- b <- length(a)
- mx.overlap.count[j,i] <- b
- }
- }
- mx.overlap.count
- View(mx.overlap.count)
- #-------------------------------------------------------------------------------
- # looking at gene overlaps in terms of %
- #-------------------------------------------------------------------------------
- # modify the code written above by adding the following to the innermost loop:
- # get max of g1 and g2
- # divide overlap by the max
- # calculate the %
- # seq_along(gene.lists) # 1 2 3 4 5 6 7 8
- mx.overlap <- mx
- for (i in seq_along(gene.lists)) {
- g1 <- gene.lists[[i]]
- for (j in seq_along(gene.lists)) {
- g2 <- gene.lists[[j]]
- a <- intersect(g1, g2)
- b <- length(a)
- maxN = max(length(g1), length(g2))
- mx.overlap[j,i]= 100* b / maxN
- mx.overlap.count[j,i] <- b
- }
- }
- mx.overlap
- View(mx.overlap)
Add Comment
Please, Sign In to add comment