Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
library("reutils")
# NB: to grab the FASTA of accession number x, do this:
#
#   f <- content(efetch(x, db="nuccore", rettype="fasta", retmode="text"))
#
# The result is a string, with \n separators.
# Search nuccore for `term` and return the summaries of the hits.
#
# Args:
#   term: search term handed to esearch().
#
# Returns:
#   A data frame built from the parsed esummary() content, with every column
#   coerced to character except Slen, which is converted to integer.
grab.results <- function(term) {
  # Search for the given term on nuccore. This gives us the record IDs.
  ids <- esearch(term, db = "nuccore")
  # Grab the summaries for those record IDs. The local is deliberately not
  # called `sum`, so base::sum is not masked inside this function.
  summaries <- esummary(ids, db = "nuccore")
  parsed <- content(summaries, as = "parsed")
  # The parsed result holds lists of lists rather than a proper data frame
  # (lists of vectors), so rebuild it column by column as character vectors.
  hits <- data.frame(lapply(parsed, as.character), stringsAsFactors = FALSE)
  # Slen is needed as a number for sorting by sequence length.
  hits$Slen <- as.integer(hits$Slen)
  hits
}
# Pick the accession (OSLT column) of the longest record in `results`.
#
# Args:
#   results: data frame with an OSLT column and an Slen column, as produced
#            by grab.results().
#   type:    currently unused; no filtering by type is performed. Kept so
#            existing callers keep working.
#
# Returns:
#   The OSLT value of the row with the largest Slen (rows with NA Slen sort
#   last), or NA_character_ when `results` is not a data frame, has no rows,
#   or has no OSLT column.
first.of.type <- function(results, type) {
  if (!is.data.frame(results) || nrow(results) == 0L ||
      is.null(results[["OSLT"]])) {
    return(NA_character_)
  }
  # Sort by Slen, biggest first; [[ ]] avoids `$` partial name matching.
  longest_first <- order(results[["Slen"]], decreasing = TRUE)
  # Return the first accession number in that ordering.
  results[["OSLT"]][longest_first][1]
}
# Look up one accession per (species, gene) pair.
#
# Args:
#   species: vector of species names; one output row per element.
#   genes:   character vector of gene/marker names; one output column each.
#
# Returns:
#   A data frame with a Species column plus one column per gene. A cell holds
#   the value returned by first.of.type() for the query "<species> AND <gene>",
#   or NA when the lookup failed or returned nothing usable.
scrape.genbank <- function(species, genes) {
  # Create the dataset skeleton: every gene column starts out as character NA.
  accessions <- data.frame(Species = species, stringsAsFactors = FALSE)
  for (gene in genes) {
    accessions[[gene]] <- rep(NA_character_, length(species))
  }
  # Look up data for each species. seq_along() (unlike 1:length()) performs
  # no iterations when `species` is empty.
  for (i in seq_along(species)) {
    for (g in genes) {
      n <- species[i]
      print(sprintf("Looking up %s (%s) (%d/%d)...",
                    n, g, i, length(species)))
      query <- sprintf("%s AND %s", n, g)
      found <- tryCatch(
        first.of.type(grab.results(query), g),
        error = function(e) {
          # Best effort: a failed lookup leaves the cell as NA, but say so
          # instead of swallowing the error silently.
          warning(sprintf("Lookup failed for '%s': %s",
                          query, conditionMessage(e)),
                  call. = FALSE)
          NA_character_
        }
      )
      # Only store a single value; anything else leaves the NA in place.
      if (length(found) == 1L) {
        accessions[i, g] <- as.character(found)
      }
    }
  }
  accessions
}
# Rebuild the accession table from the host list.
#
# Args:
#   input:  path of the CSV to read; it must contain a Host column.
#   output: path of the CSV to write.
#   genes:  gene/marker names to look up for every unique host.
#
# Reads `input`, looks up accessions for the unique Host values through
# scrape.genbank(), and writes the resulting table to `output`. The defaults
# reproduce the previously hard-coded file names and marker list.
rebuild <- function(input = "highlightedSkates.csv",
                    output = "Skate_allmarkers_accessions.csv",
                    genes = c("NADH", "COI", "CO1", "RAG1")) {
  hosts <- read.csv(input, stringsAsFactors = FALSE)
  if (is.null(hosts$Host)) {
    stop(sprintf("'%s' has no 'Host' column", input), call. = FALSE)
  }
  accessions <- scrape.genbank(unique(hosts$Host), genes)
  write.csv(accessions, output)
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement