document.write('
Data hosted with ♥ by Pastebin.com - Download Raw - See Original
  1. ### analyze GOLD
  2.  
  3. # download data file
  4. set d = `date +%F`
  5. mkdir data/GOLD
  6. wget "http://www.genomesonline.org/documents/Export/gold.xls" -O data/GOLD/gold-$d.txt
  7.  
  8. # screenscrape completion data -- credit @ppgardne ...
  9. # takes a long time -- do on byobu window
  10. cat data/GOLD/gold-$d.txt | awk \'{print "curl -s http://genomesonline.org/cgi-bin/GOLD/bin/GOLDCards.cgi?goldstamp="$1}\' | sh | grep "COMPLETION DATE" | perl -lane \'if(/left>(\\S+)<\\/td>/){print $1}\' >! results/_t1
  11. paste data/GOLD/gold-2013-06-06.txt results/_t1 > results/gold-$d-plusdate.txt
  12.  
  13. # sort by date
  14. sed \'s/-.*//\' results/_t1 | sort | uniq -c | sort -k2n > results/GOLD_by_year.txt
  15.  
  16. # kingdom analysis
  17. gawkt \'{print $7}\' data/GOLD/gold-2013-06-06.txt | sort | uniq -c | sort -k1nr
  18.  
  19. # kingdom analysis by year -- must have a COMPLETED DATE (col 24)
  20. gawkt \'$7=="Bacteria"{print $24}\' results/gold-2013-06-06-plusdate.txt | sed \'s/-.*//\' | sort | uniq -c | sort -k2n > results/gold-2013-06-06-plusdate_byyear_bacteria.txt
  21. gawkt \'$7=="Bacteria"&&$24~/^[0-9-]+$/{print $6,$24,$13}\' results/gold-2013-06-06-plusdate.txt | sort > results/_gold-2013-06-06-plusdate_bacteria
  22. join -t"$TAB" -a1 results/_gold-2013-06-06-plusdate_bacteria results/_tax_to_category | sort -k4 -k2r -t"$TAB" | gawkt \'{print $4,$2,$3}\' > results/_gold-2013-06-06-plusdate_bacteria_speciesID
  23. gawkt \'{idx[$1]=$0}END{for (i in idx) {print idx[i]}}\' results/_gold-2013-06-06-plusdate_bacteria_speciesID > results/_gold-2013-06-06-plusdate_bacteria_speciesID_early
  24. gawkt \'{print $2}\' results/_gold-2013-06-06-plusdate_bacteria_speciesID_early | sed \'s/-.*//\' | sort | uniq -c | sort -k2n > results/gold-2013-06-06-plusdate_bacteria_byyear_species.txt
  25.  
  26.  
  27. gawkt \'$7=="Eukaryota"{print $24}\' results/gold-2013-06-06-plusdate.txt | sed \'s/-.*//\' | sort | uniq -c | sort -k2n > results/gold-2013-06-06-plusdate_byyear_eukaryotes.txt
  28. gawkt \'$7=="Eukaryota"&&$24~/^[0-9-]+$/{print $6,$24,$13}\' results/gold-2013-06-06-plusdate.txt | sort > results/_gold-2013-06-06-plusdate_eukaryotes
  29. join -t"$TAB" -a1 results/_gold-2013-06-06-plusdate_eukaryotes results/_tax_to_category | sort -k4 -k2r -t"$TAB" | gawkt \'{print $4,$2,$3}\' > results/_gold-2013-06-06-plusdate_eukaryotes_speciesID
  30. gawkt \'{idx[$1]=$0}END{for (i in idx) {print idx[i]}}\' results/_gold-2013-06-06-plusdate_eukaryotes_speciesID > results/_gold-2013-06-06-plusdate_eukaryotes_speciesID_early
  31. gawkt \'{print $2}\' results/_gold-2013-06-06-plusdate_eukaryotes_speciesID_early | sed \'s/-.*//\' | sort | uniq -c | sort -k2n > results/gold-2013-06-06-plusdate_eukaryotes_byyear_species.txt
  32.  
  33.  
  34. gawkt \'$7=="Archaea"{print $24}\' results/gold-2013-06-06-plusdate.txt | sed \'s/-.*//\' | sort | uniq -c | sort -k2n > results/gold-2013-06-06-plusdate_byyear_archaea.txt
  35. gawkt \'$7=="Archaea"&&$24~/^[0-9-]+$/{print $6,$24,$13}\' results/gold-2013-06-06-plusdate.txt | sort > results/_gold-2013-06-06-plusdate_archaea
  36. join -t"$TAB" -a1 results/_gold-2013-06-06-plusdate_archaea results/_tax_to_category | sort -k4 -k2r -t"$TAB" | gawkt \'{print $4,$2,$3}\' > results/_gold-2013-06-06-plusdate_archaea_speciesID
  37. gawkt \'{idx[$1]=$0}END{for (i in idx) {print idx[i]}}\' results/_gold-2013-06-06-plusdate_archaea_speciesID > results/_gold-2013-06-06-plusdate_archaea_speciesID_early
  38. gawkt \'{print $2}\' results/_gold-2013-06-06-plusdate_archaea_speciesID_early | sed \'s/-.*//\' | sort | uniq -c | sort -k2n > results/gold-2013-06-06-plusdate_archaea_byyear_species.txt
  39.  
  40.  
  41.  
  42. ### analyze NCBI genomes
  43.  
  44. # download files
  45. wget -r -l1 -nd --no-parent -P data/NCBI_genome_reports ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/
  46.  
  47. # analyze eukaryotes
  48. # "Status" in col 19, "Release Date" in col 17
  49. cat data/NCBI_genome_reports/eukaryotes.txt | gawkt \'$19=="Chromosomes"||$19=="Scaffolds or contigs"{print $17}\' >! results/_NCBI_eukaryotes_years
  50. cat data/NCBI_genome_reports/eukaryotes.txt | gawkt \'($19=="Chromosomes"||$19=="Scaffolds or contigs")&&$17!="-"{print $17,$2}\' | sort -k2  >! results/_NCBI_eukaryotes_taxid
  51. join -12 -21 -a1 -t"$TAB" results/_NCBI_eukaryotes_taxid results/_tax_to_category | sort -k3 -k2r >! results/_NCBI_eukaryotes_speciesID
  52. gawkt \'{idx[$3]=$0}END{for (i in idx) {print idx[i]}}\' results/_NCBI_eukaryotes_speciesID >! results/_NCBI_eukaryotes_speciesID_early
  53. gawkt \'{print $2}\' results/_NCBI_eukaryotes_speciesID_early | sed \'s/\\/.*//\' | sort | uniq -c | sort -k2n >! results/NCBI_eukaryotes_by_year_species.txt
  54.  
  55. cat data/NCBI_genome_reports/eukaryotes.txt | gawkt \'$19=="Chromosomes"&&$17!="-"{print $17,$2}\' | sort -k2  >! results/_NCBI_eukaryotes_taxid_chronly
  56. join -12 -21 -a1 -t"$TAB" results/_NCBI_eukaryotes_taxid_chronly results/_tax_to_category | sort -k3 -k2r >! results/_NCBI_eukaryotes_speciesID_chronly
  57. gawkt \'{idx[$3]=$0}END{for (i in idx) {print idx[i]}}\' results/_NCBI_eukaryotes_speciesID_chronly >! results/_NCBI_eukaryotes_speciesID_early_chronly
  58. gawkt \'{print $2}\' results/_NCBI_eukaryotes_speciesID_early_chronly | sed \'s/\\/.*//\' | sort | uniq -c | sort -k2n >! results/NCBI_eukaryotes_by_year_species_chronly.txt
  59.  
  60.  
  61. # analyze prokaryotes
  62. # "Status" in col 19, "Release Date" in col 17
  63. cat data/NCBI_genome_reports/prokaryotes.txt | gawkt \'$19=="Complete"||$19=="Scaffolds or contigs"{print $17}\' > results/_NCBI_prokaryotes_years
  64. cat data/NCBI_genome_reports/prokaryotes.txt | gawkt \'($19=="Complete"||$19=="Scaffolds or contigs")&&$17!="-"{print $17,$2}\' | sort -k2  >! results/_NCBI_prokaryotes_taxid
  65. join -12 -21 -a1 -t"$TAB" results/_NCBI_prokaryotes_taxid results/_tax_to_category | sort -k3 -k2r > results/_NCBI_prokaryotes_speciesID
  66. gawkt \'{idx[$3]=$0}END{for (i in idx) {print idx[i]}}\' results/_NCBI_prokaryotes_speciesID > results/_NCBI_prokaryotes_speciesID_early
  67. gawkt \'{print $2}\' results/_NCBI_prokaryotes_speciesID_early | sed \'s/\\/.*//\' | sort | uniq -c | sort -k2n > results/NCBI_prokaryotes_by_year_species.txt
  68.  
  69.  
  70. # analyze viruses
  71. # "Status" in col **15**, "Release Date" in col **13**
  72. cat data/NCBI_genome_reports/viruses.txt | gawkt \'$15=="Complete"||$15=="Scaffolds or contigs"{print $13}\' > results/_NCBI_viruses_years
  73. cat data/NCBI_genome_reports/viruses.txt | gawkt \'($15=="Complete"||$15=="Scaffolds or contigs")&&$13!="-"{print $13,$2}\' | sort -k2  >! results/_NCBI_viruses_taxid
  74. join -12 -21 -a1 -t"$TAB" results/_NCBI_viruses_taxid results/_tax_to_category | sort -k3 -k2r > results/_NCBI_viruses_speciesID
  75. gawkt \'{idx[$3]=$0}END{for (i in idx) {print idx[i]}}\' results/_NCBI_viruses_speciesID > results/_NCBI_viruses_speciesID_early
  76. gawkt \'{print $2}\' results/_NCBI_viruses_speciesID_early | sed \'s/\\/.*//\' | sort | uniq -c | sort -k2n > results/NCBI_viruses_by_year_species.txt
  77.  
  78.  
  79. # analyze aggregate results
  80. cat results/_NCBI_eukaryotes_years | sed \'s/\\/.*//\' | sort | uniq -c | sort -k2n > results/NCBI_eukaryotes_by_year.txt
  81. cat results/_NCBI_prokaryotes_years | sed \'s/\\/.*//\' | sort | uniq -c | sort -k2n > results/NCBI_prokaryotes_by_year.txt
  82. cat results/_NCBI_viruses_years | sed \'s/\\/.*//\' | sort | uniq -c | sort -k2n > results/NCBI_viruses_by_year.txt
  83. cat results/_NCBI_{eukaryotes,prokaryotes,viruses}_years | sed \'s/\\/.*//\' | sort | uniq -c | sort -k2n > results/NCBI_by_year.txt
  84.  
  85. # kingdom analysis (method #1)
  86. gawkt \'{print $2}\' data/NCBI_genome_reports/overview.txt | sort | uniq -c | sort -k1nr
  87.  
  88. # kingdom analysis (method #2)
  89. wc -l data/NCBI_genome_reports/{eukaryotes,viruses}.txt | sed \'s/data.*\\///;s/\\.txt//\'
  90. gawkt \'$5~/archae/\' data/NCBI_genome_reports/prokaryotes.txt | wc
  91. gawkt \'$5!~/archae/\' data/NCBI_genome_reports/prokaryotes.txt | wc
');