Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ### individual vs. corpus enrichment script ###################################
- ### by en forme de poire, 2013-08-13
- ### acknowledgements due to cortex and benito.strauss
- ### Stuff you may need to tweak follows #######################################
- ### Set path to your 1-gram and corpus 1-gram files here (NECESSARY)
- indiv.file <- "~/../Desktop/88591--1-gram--allsites--1999-01-01--2014-01-01.txt"
- corpus.file <- "~/../Desktop/allsites--1999-01-01--2013-01-01.txt"
- ### Change false discovery rate cutoff here (default = 1%) (OPTIONAL)
- fdr.cutoff = 0.01
- ### Begin script ##############################################################
- write("loading corpuses", stderr())
- indiv <- read.table(indiv.file, header = 1, skip = 2, sep = '\t', stringsAs = F)
- fullcorpus <- read.table(corpus.file, skip = 2, header = 1, sep = '\t', stringsAs = F)
- ind.vs.corpus <- merge(fullcorpus, indiv, by = 3)
- corpus.total <- sum(as.numeric(ind.vs.corpus[,2]))
- ind.total <- sum(as.numeric(ind.vs.corpus[,4]))
- write("calculating enrichment scores", stderr())
- fisher.pvals <- apply(ind.vs.corpus, 1, function(x) {
- t.freq <- as.numeric(x[2])
- i.freq <- as.numeric(x[4])
- c.table <- matrix(nr = 2, c(t.freq, i.freq, corpus.total - t.freq, ind.total - i.freq))
- fisher.test(c.table)$p.value
- })
- ivc.fdr <- p.adjust(fisher.pvals)
- ivc.pass <- which(ivc.fdr <= fdr.cutoff)
- ivc.sig <- ind.vs.corpus[ivc.pass,]
- write("writing results to table", stderr())
- write.csv(file="significant-words.csv", ivc.sig[order(ivc.sig[, 4] / ivc.sig[, 2]),])
- write("plotting results", stderr())
- pdf("word-freq-plot.pdf", width = 11, height = 8.5)
- plot(log10(ind.vs.corpus[,c(3,5)]), xlab = "log10(corpus ppm)", ylab = "log10(individual ppm")
- abline(0,1)
- points(log10(ivc.sig[,c(3,5)]), pch = 19)
- text(log10(ivc.sig[,c(3,5)]), ivc.sig[,1], pos = 1, offset = 1)
- dev.off()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement