Guest User

Untitled

a guest
Dec 18th, 2018
82
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.13 KB | None | 0 0
  1. #-------------------------------------------------------------------------------
  2. # Function: hypergeometric test
  3. #-------------------------------------------------------------------------------
  4. hypergeo <- function(white.drawn, white, black, drawn, do.log=FALSE) {
  5. # Info: http://digitheadslabnotebook.blogspot.com/2011/02/using-r-for-introductory-statistics_21.html
  6. # dhyper(q, m, n, k, log = FALSE)
  7. # q = number of successes; "white balls drawn" (here: number of genes that overlap)
  8. # m + n = N ; N = total number of genes
  9. # m = "white balls in urn"; total number of TF-bound genes
  10. # n = "black balls in urn"; total number of genes NOT bound by the TF
  11. # k = "number of balls drawn from urn"; sample size
  12. if (white < 1) {return(NA)}
  13. p <- phyper(white.drawn-1, white, black, drawn, lower.tail = FALSE, log.p=do.log)
  14. return(p)
  15. } # end: hypergeo
  16.  
  17. #-------------------------------------------------------------------------------
  18. # Set the working directory and load the data files
  19. #-------------------------------------------------------------------------------
  20. setwd("~/Desktop/R_Project/Gene_overlap")
  21. getwd()
  22. files <- list.files(pattern="*.txt", full.names = TRUE)
  23. files
  24.  
  25. data.list <- lapply(files, function(fil) {
  26. scan(file=fil, what=character())
  27. })
  28.  
  29. names(data.list) <- basename(files) %>% stringr::str_remove("\.txt$")
  30.  
  31. str(data.list)
  32. # List of 8
  33. # $ GSE108363_BCGdown_D:chr [1:350] "IL1B" "IL6" "IL1A" "CCL20" ...
  34. # $ GSE108363_BCGdown_V: chr [1:267] "IL6" "CCL20" "IL1A" "CXCL5" ...
  35. # $ GSE108363_BCGup_D : chr [1:250] "FABP4" "CMTM2" "FUCA1" "CD36" ...
  36. # $ GSE108363_BCGup_V : chr [1:429] "FCN1" "FCGR3B" "MNDA" "CPVL" ...
  37. # $ GSE108363_MTBdown_D: chr [1:86] "CCL20" "IL1B" "IL1A" "IL6" ...
  38. # $ GSE108363_MTBdown_V: chr [1:244] "IL1B" "IL1A" "CCL20" "IL6" ...
  39. # $ GSE108363_MTBup_D : chr [1:128] "FUCA1" "FGL2" "TGFBI" "CPVL" ...
  40. # $ GSE108363_MTBup_V : chr [1:286] "FABP4" "RNASE1" "MNDA" "CPVL" ...
  41.  
  42. intersect(data.list$GSE108363_BCGdown_D, data.list$GSE108363_BCGdown_V) %>% length
  43.  
  44. sapply(data.list, length)
  45.  
  46.  
  47.  
  48. #-------------------------------------------------------------------------------
  49. # Using the intersect function to see the overlaps
  50. #-------------------------------------------------------------------------------
  51.  
  52. data.file1 <- "GSE108363_BCGdown_V.txt"
  53. data.file2 <- "GSE108363_BCGdown_D.txt"
  54. data.file3 <- "GSE108363_BCGup_V.txt"
  55. data.file4 <- "GSE108363_BCGup_D.txt"
  56. data.file5 <- "GSE108363_MTBdown_V.txt"
  57. data.file6 <- "GSE108363_MTBdown_D.txt"
  58. data.file7 <- "GSE108363_MTBup_V.txt"
  59. data.file8 <- "GSE108363_MTBup_D.txt"
  60.  
  61. genevect1 <- scan(data.file1, what=character(), sep="n")
  62. genevect2 <- scan(data.file2, what=character(), sep="n")
  63. genevect3 <- scan(data.file3, what=character(), sep="n")
  64. genevect4 <- scan(data.file4, what=character(), sep="n")
  65. genevect5 <- scan(data.file5, what=character(), sep="n")
  66. genevect6 <- scan(data.file6, what=character(), sep="n")
  67. genevect7 <- scan(data.file7, what=character(), sep="n")
  68. genevect8 <- scan(data.file8, what=character(), sep="n")
  69.  
  70.  
  71. filelist <- list(data.file1, data.file2, data.file3, data.file4, data.file5, data.file6, data.file7, data.file8)
  72. all(sapply(filelist, file.exists))
  73.  
  74.  
  75. #-------------------------------------------------------------------------------
  76. # read files:
  77. #-------------------------------------------------------------------------------
  78.  
  79. gene.lists <- lapply(filelist, function(f) {
  80. scan(file=f, what=character())
  81. })
  82.  
  83.  
  84. #-------------------------------------------------------------------------------
  85. # set up empty matrix
  86. #-------------------------------------------------------------------------------
  87. x <- (length(gene.lists))^2
  88. x
  89. y <- rep(NA, x)
  90. mx <- matrix(y, ncol=length(gene.lists))
  91. mx
  92. row.names(mx) <- sapply(filelist, basename) %>% stringr::str_remove('.txt$')
  93. colnames(mx) <- sapply(filelist, basename) %>% stringr::str_remove('.txt$')
  94. mx
  95.  
  96. mx.overlap.count <- mx
  97.  
  98. #-------------------------------------------------------------------------------
  99. # seq_along(gene.lists) # 1 2 3 4 5 6 7 8
  100. #-------------------------------------------------------------------------------
  101. for (i in seq_along(gene.lists)) {
  102. g1 <- gene.lists[[i]]
  103. for (j in seq_along(gene.lists)) {
  104. g2 <- gene.lists[[j]]
  105. a <- intersect(g1, g2)
  106. b <- length(a)
  107. mx.overlap.count[j,i] <- b
  108. }
  109. }
  110.  
  111. mx.overlap.count
  112. View(mx.overlap.count)
  113.  
  114. #-------------------------------------------------------------------------------
  115. # looking at gene overlaps in terms of %
  116. #-------------------------------------------------------------------------------
  117. # modify the code written above by adding the following to the innermost loop:
  118. # get max of g1 and g2
  119. # divide overlap by the max
  120. # calculate the %
  121.  
  122. # seq_along(gene.lists) # 1 2 3 4 5 6 7 8
  123. mx.overlap <- mx
  124. for (i in seq_along(gene.lists)) {
  125. g1 <- gene.lists[[i]]
  126. for (j in seq_along(gene.lists)) {
  127. g2 <- gene.lists[[j]]
  128. a <- intersect(g1, g2)
  129. b <- length(a)
  130. maxN = max(length(g1), length(g2))
  131. mx.overlap[j,i]= 100* b / maxN
  132. mx.overlap.count[j,i] <- b
  133. }
  134. }
  135.  
  136. mx.overlap
  137. View(mx.overlap)
Add Comment
Please, Sign In to add comment