Guest User

Untitled

a guest
Oct 18th, 2018
90
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.72 KB | None | 0 0
  1. #!/usr/bin/env perl
  2.  
  3. use strict;
  4. use warnings;
  5. use Data::Dumper;
  6. use Getopt::Long;
  7. use Bio::Perl;
  8. use File::Copy qw/mv/;
  9.  
  10. my %rank=(
  11. d => "domain",
  12. p => "phylum",
  13. c => "class",
  14. o => "order",
  15. f => "family",
  16. g => "genus",
  17. s => "species",
  18. );
  19.  
  20.  
  21. exit main();
  22.  
  23. sub main{
  24.  
  25. my $settings={};
  26. GetOptions($settings, qw(infile=s help)) or die $!;
  27. $$settings{infile} ||= die "ERROR: need --infile";
  28. die usage() if($$settings{help});
  29.  
  30. mkdir "taxonomy";
  31. mkdir "library";
  32. mkdir "library/gtdb";
  33.  
  34. open(my $nodesFh, ">", "taxonomy/nodes.dmp") or die "ERROR writing to nodes.dmp";
  35. open(my $namesFh, ">", "taxonomy/names.dmp") or die "ERROR writing to names.dmp";
  36.  
  37. my $rootTaxid=1;
  38. my %root=(
  39. taxid => $rootTaxid,
  40. scientificName => "root",
  41. rank => "root",
  42. parent => $rootTaxid,
  43. );
  44.  
  45. my $taxonCounter=1;
  46. my %taxon=(root=>\%root);
  47.  
  48. open(my $inFh, "<", $$settings{infile}) or die "ERROR: could not read $$settings{infile}: $!";
  49. while(my $line=<$inFh>){
  50. next if($line=~/^\s*#/);
  51. #print STDERR "." if($. % 10 == 0);
  52. chomp $line;
  53. my($asmid,$lineageStr)=split /\t/, $line;
  54. my $assemblyId = $asmid;
  55. $assemblyId=~s/^RS_//; # remove prefix RS_
  56. $assemblyId=~s/\.\d+$//; # remove version
  57.  
  58. print STDERR "Loading ". $assemblyId.", ".substr($lineageStr,0,20)."...".substr($lineageStr,-40,40)."\n";
  59. my @lineage=split(/;/, $lineageStr);
  60. for(my $i=0;$i<@lineage;$i++){
  61. my $name = $lineage[$i];
  62. my ($rank,$scientificName) = split(/__/, $name);
  63. if(!defined($taxon{$name})){
  64. my $taxid = ++$taxonCounter;
  65. my $rank = $rank{lc($rank)};
  66.  
  67. my $parent;
  68. if($rank eq "domain"){
  69. $parent = $rootTaxid;
  70. } else {
  71. $parent = $taxon{$lineage[$i-1]}{taxid};
  72. }
  73.  
  74. $taxon{$name} = {
  75. taxid => $taxid,
  76. scientificName => $scientificName,
  77. rank => $rank,
  78. parent => $parent,
  79. asm => $asmid,
  80. };
  81.  
  82. print $nodesFh join("\t|\t", $taxid, $parent, $rank, "", 0, 1, 11, 1, 0, 1, 1, 0)."\t|\n";
  83. print $namesFh join("\t|\t", $taxid, $scientificName, "", "scientific name")."\t|\n";
  84.  
  85. }
  86. }
  87.  
  88. # Download the genome with the last taxid
  89. my $taxid = $taxon{$lineage[-1]}{taxid};
  90. my $filename = "library/gtdb/$assemblyId.fna";
  91. print STDERR " downloading...";
  92. if(-e $filename){
  93. print STDERR "file present, not downloading again.\n";
  94. next;
  95. }
  96.  
  97. system("esearch -db assembly -query $assemblyId | elink -target nuccore | efetch -format fasta > $filename.tmp");
  98. my $filesize = (stat "$filename.tmp")[7];
  99. if($filesize < 1){
  100. #print STDERR "File size is $filesize. Trying again\n";
  101. die "ERROR downloading $assemblyId with exit code $?: $!. To skip this error, comment this line in the input file.";
  102. }
  103.  
  104. # Replace with taxids
  105. my $in=Bio::SeqIO->new(-file=>"$filename.tmp", -format=>"fasta");
  106. my $out=Bio::SeqIO->new(-file=>">$filename.kraken", -format=>"fasta");
  107. my %seenSeq=();
  108. while(my $seq=$in->next_seq){
  109. next if($seenSeq{$seq->seq}++); # avoid duplicate contigs within a genome
  110. my $id=$seq->id;
  111. $seq->desc(" "); # unset the description fields
  112. $seq->id("$id|kraken:taxid|$taxid");
  113. $out->write_seq($seq);
  114. }
  115.  
  116. # Cleanup
  117. unlink("$filename.tmp"); # cleanup
  118. mv("$filename.kraken",$filename);
  119. print STDERR "downloaded\n";
  120. }
  121.  
  122. close $inFh;
  123. print STDERR "\n";
  124.  
  125. close $nodesFh;
  126. close $namesFh;
  127.  
  128.  
  129. return 0;
  130. }
  131.  
  132. sub usage{
  133. "Usage: perl -i gtdb.txt $0
  134. Where gtdb.txt is a two column file with assembly ID and semicolon-delimited lineage
  135. Outputs two folders for taxonomy and library of fasta files.
  136. "
  137. }
Add Comment
Please, Sign In to add comment