Advertisement
citrate_reiterator

Individual vs. corpus enrichment script

Aug 13th, 2013
163
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 1.84 KB | None | 0 0
  1. ### individual vs. corpus enrichment script ###################################
  2. ### by en forme de poire, 2013-08-13
  3. ### acknowledgements due to cortex and benito.strauss
  4.  
  5. ### Stuff you may need to tweak follows #######################################
  6.  
  7. ### Set path to your 1-gram and corpus 1-gram files here (NECESSARY)
  8. indiv.file <- "~/../Desktop/88591--1-gram--allsites--1999-01-01--2014-01-01.txt"
  9. corpus.file <- "~/../Desktop/allsites--1999-01-01--2013-01-01.txt"
  10.  
  11. ### Change false discovery rate cutoff here (default = 1%) (OPTIONAL)
  12. fdr.cutoff = 0.01
  13.  
  14. ### Begin script ##############################################################
  15.  
  16. write("loading corpuses", stderr())
  17. indiv <- read.table(indiv.file, header = 1, skip = 2, sep = '\t', stringsAs = F)
  18. fullcorpus <- read.table(corpus.file, skip = 2, header = 1, sep = '\t', stringsAs = F)
  19. ind.vs.corpus <- merge(fullcorpus, indiv, by = 3)
  20.  
  21. corpus.total <- sum(as.numeric(ind.vs.corpus[,2]))
  22. ind.total <- sum(as.numeric(ind.vs.corpus[,4]))
  23.  
  24. write("calculating enrichment scores", stderr())
  25. fisher.pvals <- apply(ind.vs.corpus, 1, function(x) {
  26.   t.freq <- as.numeric(x[2])
  27.   i.freq <- as.numeric(x[4])
  28.   c.table <- matrix(nr = 2, c(t.freq, i.freq, corpus.total - t.freq, ind.total - i.freq))
  29.   fisher.test(c.table)$p.value
  30. })
  31.  
  32. ivc.fdr <- p.adjust(fisher.pvals)
  33. ivc.pass <- which(ivc.fdr <= fdr.cutoff)
  34. ivc.sig <- ind.vs.corpus[ivc.pass,]
  35.  
  36. write("writing results to table", stderr())
  37. write.csv(file="significant-words.csv", ivc.sig[order(ivc.sig[, 4] / ivc.sig[, 2]),])
  38.  
  39. write("plotting results", stderr())
  40. pdf("word-freq-plot.pdf", width = 11, height = 8.5)
  41. plot(log10(ind.vs.corpus[,c(3,5)]), xlab = "log10(corpus ppm)", ylab = "log10(individual ppm")
  42. abline(0,1)
  43. points(log10(ivc.sig[,c(3,5)]), pch = 19)
  44. text(log10(ivc.sig[,c(3,5)]), ivc.sig[,1], pos = 1, offset = 1)
  45. dev.off()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement