Individual vs. corpus enrichment script

### individual vs. corpus enrichment script ###################################
### by en forme de poire, 2013-08-13
### acknowledgements due to cortex and benito.strauss

### Stuff you may need to tweak follows #######################################

### Set path to your 1-gram and corpus 1-gram files here (NECESSARY)
indiv.file <- "~/../Desktop/88591--1-gram--allsites--1999-01-01--2014-01-01.txt"
corpus.file <- "~/../Desktop/allsites--1999-01-01--2013-01-01.txt"

### Change false discovery rate cutoff here (default = 1%) (OPTIONAL)
fdr.cutoff = 0.01

### Begin script ##############################################################

write("loading corpuses", stderr())
indiv <- read.table(indiv.file, header = 1, skip = 2, sep = '\t', stringsAs = F)
fullcorpus <- read.table(corpus.file, skip = 2, header = 1, sep = '\t', stringsAs = F)
ind.vs.corpus <- merge(fullcorpus, indiv, by = 3)

corpus.total <- sum(as.numeric(ind.vs.corpus[,2]))
ind.total <- sum(as.numeric(ind.vs.corpus[,4]))

write("calculating enrichment scores", stderr())
fisher.pvals <- apply(ind.vs.corpus, 1, function(x) {
  t.freq <- as.numeric(x[2])
  i.freq <- as.numeric(x[4])
  c.table <- matrix(nr = 2, c(t.freq, i.freq, corpus.total - t.freq, ind.total - i.freq))
  fisher.test(c.table)$p.value
})

ivc.fdr <- p.adjust(fisher.pvals)
ivc.pass <- which(ivc.fdr <= fdr.cutoff)
ivc.sig <- ind.vs.corpus[ivc.pass,]

write("writing results to table", stderr())
write.csv(file="significant-words.csv", ivc.sig[order(ivc.sig[, 4] / ivc.sig[, 2]),])

write("plotting results", stderr())
pdf("word-freq-plot.pdf", width = 11, height = 8.5)
plot(log10(ind.vs.corpus[,c(3,5)]), xlab = "log10(corpus ppm)", ylab = "log10(individual ppm")
abline(0,1)
points(log10(ivc.sig[,c(3,5)]), pch = 19)
text(log10(ivc.sig[,c(3,5)]), ivc.sig[,1], pos = 1, offset = 1)
dev.off()