Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env perl
- use strict;
- use warnings;
- use Data::Dumper;
- use Getopt::Long;
- use Bio::Perl;
- use File::Copy qw/mv/;
- my %rank=(
- d => "domain",
- p => "phylum",
- c => "class",
- o => "order",
- f => "family",
- g => "genus",
- s => "species",
- );
- exit main();
- sub main{
- my $settings={};
- GetOptions($settings, qw(infile=s help)) or die $!;
- $$settings{infile} ||= die "ERROR: need --infile";
- die usage() if($$settings{help});
- mkdir "taxonomy";
- mkdir "library";
- mkdir "library/gtdb";
- open(my $nodesFh, ">", "taxonomy/nodes.dmp") or die "ERROR writing to nodes.dmp";
- open(my $namesFh, ">", "taxonomy/names.dmp") or die "ERROR writing to names.dmp";
- my $rootTaxid=1;
- my %root=(
- taxid => $rootTaxid,
- scientificName => "root",
- rank => "root",
- parent => $rootTaxid,
- );
- my $taxonCounter=1;
- my %taxon=(root=>\%root);
- open(my $inFh, "<", $$settings{infile}) or die "ERROR: could not read $$settings{infile}: $!";
- while(my $line=<$inFh>){
- next if($line=~/^\s*#/);
- #print STDERR "." if($. % 10 == 0);
- chomp $line;
- my($asmid,$lineageStr)=split /\t/, $line;
- my $assemblyId = $asmid;
- $assemblyId=~s/^RS_//; # remove prefix RS_
- $assemblyId=~s/\.\d+$//; # remove version
- print STDERR "Loading ". $assemblyId.", ".substr($lineageStr,0,20)."...".substr($lineageStr,-40,40)."\n";
- my @lineage=split(/;/, $lineageStr);
- for(my $i=0;$i<@lineage;$i++){
- my $name = $lineage[$i];
- my ($rank,$scientificName) = split(/__/, $name);
- if(!defined($taxon{$name})){
- my $taxid = ++$taxonCounter;
- my $rank = $rank{lc($rank)};
- my $parent;
- if($rank eq "domain"){
- $parent = $rootTaxid;
- } else {
- $parent = $taxon{$lineage[$i-1]}{taxid};
- }
- $taxon{$name} = {
- taxid => $taxid,
- scientificName => $scientificName,
- rank => $rank,
- parent => $parent,
- asm => $asmid,
- };
- print $nodesFh join("\t|\t", $taxid, $parent, $rank, "", 0, 1, 11, 1, 0, 1, 1, 0)."\t|\n";
- print $namesFh join("\t|\t", $taxid, $scientificName, "", "scientific name")."\t|\n";
- }
- }
- # Download the genome with the last taxid
- my $taxid = $taxon{$lineage[-1]}{taxid};
- my $filename = "library/gtdb/$assemblyId.fna";
- print STDERR " downloading...";
- if(-e $filename){
- print STDERR "file present, not downloading again.\n";
- next;
- }
- system("esearch -db assembly -query $assemblyId | elink -target nuccore | efetch -format fasta > $filename.tmp");
- my $filesize = (stat "$filename.tmp")[7];
- if($filesize < 1){
- #print STDERR "File size is $filesize. Trying again\n";
- die "ERROR downloading $assemblyId with exit code $?: $!. To skip this error, comment this line in the input file.";
- }
- # Replace with taxids
- my $in=Bio::SeqIO->new(-file=>"$filename.tmp", -format=>"fasta");
- my $out=Bio::SeqIO->new(-file=>">$filename.kraken", -format=>"fasta");
- my %seenSeq=();
- while(my $seq=$in->next_seq){
- next if($seenSeq{$seq->seq}++); # avoid duplicate contigs within a genome
- my $id=$seq->id;
- $seq->desc(" "); # unset the description fields
- $seq->id("$id|kraken:taxid|$taxid");
- $out->write_seq($seq);
- }
- # Cleanup
- unlink("$filename.tmp"); # cleanup
- mv("$filename.kraken",$filename);
- print STDERR "downloaded\n";
- }
- close $inFh;
- print STDERR "\n";
- close $nodesFh;
- close $namesFh;
- return 0;
- }
- sub usage{
- "Usage: perl -i gtdb.txt $0
- Where gtdb.txt is a two column file with assembly ID and semicolon-delimited lineage
- Outputs two folders for taxonomy and library of fasta files.
- "
- }
Add Comment
Please, Sign In to add comment