Advertisement
nenime

Get duplicate ids from fasta files

Aug 16th, 2019
108
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 0.75 KB | None | 0 0
  # Count how many FASTA files each sequence id appears in and list those files.
  #
  # Working directory: /data4/bio/runs-danio/PLANT/Compare/Brass/BN
  #
  # First, in the shell, collect the header lines of every FASTA file:
  #   grep '>' *.fasta | sed 's/:/\t/' | sed 's/>//' > ../ids_BN.fasta
  # This yields a tab-separated table: file name, then the header line with its
  # first '>' removed.
  #
  # Output: ../ids_BN_counts.txt (tab-separated) with columns id, count, files,
  # sorted by count in decreasing order.

  # Read everything as text: quote = "" keeps ids containing a double quote
  # intact, and colClasses = "character" stops all-numeric ids from being
  # converted to numbers (which would drop leading zeros).
  ids <- read.delim("../ids_BN.fasta", header = FALSE, quote = "",
                    colClasses = "character")
  colnames(ids) <- c("file", "id")

  # Keep one row per (file, id) pair so that count below equals the number of
  # distinct files an id occurs in.
  ids <- unique(ids)

  # NOTE(review): BN_all.fasta is presumably the concatenation of the other
  # files and is excluded for that reason -- confirm.
  ids <- ids[ids$file != "BN_all.fasta", ]

  # Strip only a literal trailing ".fasta". The previous pattern ".fasta" was an
  # unanchored regex in which "." matched any character, so it could also eat
  # text in the middle of a file name.
  ids$file <- sub("\\.fasta$", "", ids$file)

  # Number of files per id.
  id.counts <- as.data.frame(table(ids$id))
  colnames(id.counts) <- c("id", "count")

  # Comma-separated list of the files each id was found in.
  agg <- aggregate(ids$file, by = list(ids$id), FUN = paste, collapse = ",")
  colnames(agg) <- c("id", "files")

  id.counts <- merge(id.counts, agg, by = "id")
  id.counts <- id.counts[order(id.counts$count, decreasing = TRUE), ]

  # Number of ids that occur in more than one file.
  sum(id.counts$count > 1)

  write.table(id.counts, "../ids_BN_counts.txt", row.names = FALSE,
              sep = "\t", quote = FALSE)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement