carandraug

#!/usr/bin/perl
## Copyright (C) 2011 Carnë Draug <[email protected]>
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program; if not, see <http://www.gnu.org/licenses/>.

use 5.010;                      # Use Perl 5.10
use warnings;                   # Replacement for the -w flag, but lexically scoped
use strict;                     # Enforce some good programming rules
use Getopt::Long;               # Parse program arguments
use Cwd;                        # Determines current working directory
use File::Spec;                 # Perform operation on file names
use Bio::SeqIO;                 # Handler for SeqIO Formats

## List of useful variables related to project
my $version       = 0.01;                         # program version. Will be saved on log file
my $email         = '[email protected]';   # mail to be used by the database sysadmins
my $tool          = 'bioperl-sequence_extractor'; # project name to be given to database sysadmin

=head1 NAME

sequence_extractor - for a specific list of searches, extracts all related sequences

=head1 SYNOPSIS

B<sequence_extractor> [] [] SEARCH

=head1 DESCRIPTION

This script does blah blah blah

=head1 OPTIONS

See the section bugs for problems when using default values of options.

=over
=cut

=item B<--database>, B<--db>

Specifies the database to search on. By default, it will search on genbank.
=cut
my $database      = 'genbank';
sub database_option_parsing {
  given ($_[1]) {
    when (/^(genbank|gb)$/i) { $database = 'genbank'; }
    when (/^(ensembl|e)$/i)  { $database = 'ensembl'; }
    when ('')                 { } ## Do nothing. If not set, use default
    default                  { die "Specified database '$database' to search is not valid."; }
  }
}

=item B<--debug>

If set, some warnings are printed (but not logged) that can may help on debugging. Might be a good idea
to also set the option B<--very-verbose> when debugging (or read the log file).
=cut
my $debug         = 0;

=item B<--diagram>

If set, a diagram showing the relationship between the extracted sequences is saved
together with the log files.
=cut
my $diagram       = 0;

=item B<--downstream>, B<--down>

Specifies the number of extra base pairs to be extracted downstream of the gene.
This extra base pairs only affect the sequence, not the transcript or proteins.
=cut
my $downstream    = 0;

=item B<--format>

Specifies the format that the sequences are to be saved. All bioperl supported sequence
format are valid. Currently, there's no way to obtain sequences for each thing in a
different formats. Some databases may have the requested only in certain formats.

Due to limitations of the BIO::Seq object, it's safer to just save the raw sequence
returned by the database queried rather than use Bio::SeqIO to avoid losing information,
specially sequence Features and Annotatoins.
=cut
my $format        = 'genbank';

=item B<--limit>

When making a query to a database, limit the result to the first specific results. This helps
prevent the use of specially unspecific queries. The default value is 200. Note that this limit
is for each search.
=cut
my $limit         = 200;

=item B<--save>

Specifies the path for the directory where the sequence and log files will be saved. If the
directory does not exist it will be created but the directory below must exist. Files on the
directory may be rewritten if necessary. If unspecified, a directory named F<extracted sequences>
on the current directory will be used.
=cut
my $save          = File::Spec->catfile (getcwd, 'extracted sequences');

=item B<--sequences>, B<--seq>

Specifies the name for gene file. By default, they are not saved. If no value is given
defaults to its UID. Possible values are 'uid', 'name', 'symbol' (the officyal symbol or
nomenclature).
=cut
my $sequences     = '';
sub sequences_option_parsing {
  given ($_[1]) {
    when (/^(u)?id$/i)    { $sequences = 'uid'; }
    when (/^sym(bol)?$/i) { $sequences = 'symbol'; }
    when (/^name$/i)      { $sequences = 'name'; }
    ## default is set here, when value is empty
    when ('')             { $sequences = 'uid' }
    default               { die "Invalid identifier '$_[1]' for gene files."; }
  }
}


=item B<--proteins>

Specifies the name for proteins file. By default, they are not not saved. If no value is given
defaults to its accession. Possible values are 'accession', 'description', 'gene' (the corresponding
gene ID. Note that one gene may have more than one protein so it may overwritten) and 'transcript'
(the corresponding transcript accesion).
=cut
my $proteins      = '';
sub proteins_option_parsing {
  given ($_[1]) {
    when (/^acc(ession)?$/i)      { $proteins = 'accession'; }
    when (/^desc(ription)?$/i)    { $proteins = 'description'; }
    when (/^gene$/i)              { $proteins = 'gene'; }
    when (/^(transcript|mrna)$/i) { $proteins = 'transcript'; }
    ## default is set here, when value is empty
    when ('')                     { $proteins = 'accession' }
    default                       { die "Invalid identifier '$_[1]' for protein files."; }
  }
}

=item B<--pseudo>, B<--nopseudo>

By default, sequences of pseudo genes will be saved. B<--nopseudo> can be used to ignore those genes.
=cut
my $get_pseudo    = 1;

=item B<--transcripts>, B<--mrna>

Specifies the name for transcripts file. By default, they are not is not saved. If no value is given
defaults to its accession. Possible values are 'accession', 'description', 'gene' (the corresponding
gene ID. Note that one gene may have more than one transcript so it may overwritten) and 'protein'
(the corresponding protein accesion).
=cut
my $transcripts   = '';
sub transcripts_option_parsing {
  given ($_[1]) {
    when (/^acc(ession)?$/i)    { $transcripts = 'accession'; }
    when (/^desc(ription)?$/i)  { $transcripts = 'description'; }
    when (/^gene$/i)            { $transcripts = 'gene'; }
    when (/^protein$/i)         { $transcripts = 'protein'; }
    ## default is set here, when value is empty
    when ('')                   { $transcripts = 'accession' }
    default                     { die "Invalid identifier '$_[1]' for transcript files."; }
  }
}

=item B<--upstream>, B<--up>

Specifies the number of extra base pairs to be extracted upstream of the gene.
This extra base pairs only affect the sequence, not the transcript or proteins.
=cut
my $upstream      = 0;

=item B<--verbose>

If set, program becomes verbose. For an extremely verbose program, use B<--very-verbose> instead.
=cut
my $verbose       = '';

=item B<--very-verbose>

If set, program becomes extremely verbose. Setting this option, automatically sets B<--verbose> as well.
=cut
my $very_verbose  = '';

=back

=head1 BUGS

If you find any bug, or have a feature request, please report these at

=over

=item *

When supplying options, it's possible to not supply a value and use their default. However,
when the expected value is a string, the next argument may be confused as value for the
option. For example, when using the following command:

C<sequence_extractor --transcripts 'H2A AND homo sapiens'>

we mean to search for 'H2A AND homo sapiens' saving only the transcripts and using the default
as base for the filename. However, the search terms will be interpreted as the base for the
filenames (but since it's not a valid identifier, it will return an error). To prevent
this, you can either specify the values:

C<sequence_extractor --transcripts 'accession' 'H2A AND homo sapiens'>

C<sequence_extractor --transcripts='accession' 'H2A AND homo sapiens'>

or you can use the double hash to stop processing options. Note that this should only be used
after the last option. All arguments supplied after the double dash will be interpreted as search terms

C<sequence_extractor --transcripts -- 'H2A AND homo sapiens'>

=back

=head1 EXAMPLES

=over

=back

=head1 NOTES ON USAGE

=over

=cut

################################################################################
## Parse options, check and create files and directories needed
################################################################################

GetOptions(
            'database|db:s'       => \&database_option_parsing,
            'debug'               => \$debug,
            'diagram'             => \$diagram,
            'down|downstream=i'   => \$downstream,
            'format=s'            => \$format,
            'limit=i'             => \$limit,
            'proteins:s'          => \&proteins_option_parsing,
            'pseudo!'             => \$get_pseudo,
            'save=s'              => \$save,
            'seq|sequences:s'     => \&sequences_option_parsing,
            'mrna|transcripts:s'  => \&transcripts_option_parsing,
            'up|upstream=i'       => \$upstream,
            'verbose'             => \$verbose,
            'very-verbose'        => \$very_verbose,
          ) or die "Error processing options";
## It is necessary to check success of GetOptions since:
## ''GetOptions returns true to indicate success. It returns false when the function
## detected one or more errors during option parsing. These errors are signalled
## using warn() and can be trapped with $SIG{__WARN__}''

my $seq_dir   = File::Spec->catfile ($save, 'sequences');
my $mrna_dir  = File::Spec->catfile ($save, 'transcripts');
my $prot_dir  = File::Spec->catfile ($save, 'proteins');
check_dir($_) foreach ($save, $seq_dir, $mrna_dir, $prot_dir);
my $log_file    = File::Spec->catfile ($save, 'extractor.log');
open (LOG, ">", $log_file) or die "Couldn't open file $log_file for writing: $!";

## set verbosity level
my $verbosity;
if ($very_verbose) {
  $verbosity = 3;
} elsif ($verbose) {
  $verbosity = 2;
} else {
  $verbosity = 1;
}

log_it (1, "This is sequence extractor version $version on ". &get_time);

################################################################################
## Everything is ready. Start accessing the databases
################################################################################

my $data = Structure->new;
given ($database) {
  when ('genbank') {
    use Bio::DB::EUtilities; # Retrieve entries from Entrez
    say "Searching on genbank...";
    my @uids;
    push (@uids, gb_search ($_)) foreach (@ARGV);
    {
      my $start = scalar(@uids);
      clean_array(\@uids);
      my $diff  = $start - scalar(@uids);
      log_it (2, "Removed $diff UIDs from the search results for being repeated.");
      log_it (3, "List of retrieved ids is: @uids");
    }
    say "Fetching gene info...";
    analyze_entrez_genes ($data, \@uids);

    if ($sequences)   { say "Fetching gene sequences...";       get_genes($data); }
    if ($transcripts) { say "Fetching transcript sequences..."; get_products('transcript', $data); }
    if ($proteins)    { say "Fetching protein sequences...";    get_products('protein', $data); }
  }
  when ('ensembl') {
    say "Searching on ensembl";
    die "Search on ensembl has not been implemented yet.";
  }
  default {
    die "Non valid database '$database' to search.";
  }
}
if ($debug) { use Data::Dumper; print Dumper $data; }
exit;

################################################################################
## genbank search subs
################################################################################

sub gb_search {
  log_it (3, "Searching with '$_[0]' on the 'gene' database");
  my $searcher  = Bio::DB::EUtilities->new(
                                            -eutil   => 'esearch',
                                            -db      => 'gene',
                                            -term    => $_[0],
                                            -tool    => $tool,
                                            -email   => $email,
                                            -retmax  => $limit,
                                            );
  log_it (3, "Query $_[0] translated into '" . $searcher->get_query_translation . "'");
  log_it (2, "Found " . $searcher->get_count . " UIDS with query '$_[0]'");
  log_it (2, "Search returned more ids than the set limit of $limit. Retrieving only the first '$limit' genes.") if ($searcher->get_count > $limit);
  return $searcher->get_ids;
}

{
  ## it's possible that when analyzing genes, if a gene has a RefSeq status of secondary, it will point
  ## to another gene, also in the list. To prevent analyzing the same gene twice here, this persistent
  ## hash private to this function is created and keeps track of the UID of genes analyzed
  ## TODO check new varianle type state instead of this hack
  my %analyzed_genes;
  sub analyze_entrez_genes {
    ## XXX we are not using esummary because it doesn't tell us the products of the gene thus forcing us
    ## to download the sequence and analyze it from that. We are also not using elink because it fails
    ## too frequently. Also, entrezgene gives currently the most information so it'll be easier to implement
    ## new stuff at a later time
    my $struct  = shift;
    my $uid_ref = shift;  # a reference for the array containing the list of gene UID

    ## TODO may be a good idea to limit this and download only a few sequences rather
    ## than what can possibly be thousands of them.
    my $fetcher = Bio::DB::EUtilities->new(
                                            -eutil   => 'efetch',
                                            -db      => 'gene',
                                            -id      => $uid_ref,
                                            -retmode => 'text',
                                            -rettype => 'asn1',
                                           );
    my $response = $fetcher->get_Response->content;

    ## TODO when this bug is fixed https://redmine.open-bio.org/issues/3261
    ## this should be fixed to use Bio::SeqIO with format=> 'entrezgene'
    ## then we could use methods to access the data
    use Bio::ASN1::EntrezGene;

    ## This use of the open function requires perl 5.8.0 or later
    open(my $seq_fh, "<", \$response) or die "Could not open sequences string for reading: $!";

    my $parser = Bio::ASN1::EntrezGene->new(
                                            -fh => $seq_fh,
                                            );

    ## when analyzing the genes, if it has another current id, it's possible that
    my %analyzed;
    SEQ: while(my $result = $parser->next_seq){
      $result = $result->[0] if(ref($result) eq 'ARRAY');
      ## Data::Dumper can be used to look into the structure and find where things are
#      use Data::Dumper;
#      print Dumper ($result);

      my $uid = $result->{'track-info'}->[0]->{'geneid'};
      next SEQ if $analyzed_genes{$uid};
      $analyzed_genes{$uid} = 1;

      my ($symbol, $name);
      foreach my $p (@{$result->{'properties'}}){
        $p = $p->[0] if(ref($p) eq 'ARRAY');
        next unless ($p->{'label'} && $p->{'label'} eq 'Nomenclature');
        foreach my $pp (@{$p->{'properties'}}){
          $pp     = $pp->[0] if(ref($pp) eq 'ARRAY');
          $name   = $pp->{'text'} if ($pp->{'label'} && $pp->{'label'} eq 'Official Full Name');
          $symbol = $pp->{'text'} if ($pp->{'label'} && $pp->{'label'} eq 'Official Symbol');
        }
      }
      ## values for the gene-status (different from RefSeq status)
      ## live           good??
      ## secondary      synonym with merged
      ## discontinued   'deleted', still index and display to public
      ## newentry-      for GeneRif submission
      my $status = $result->{'track-info'}->[0]->{'status'};

      if ($status eq 'discontinued') {
        log_it (3, "Discontinued gene: UID='$uid', symbol='$symbol', name='$name'. Forgetting about it...");
        next SEQ;
      } elsif ($status eq 'secondary') {
        ## recursivity! UUUUUUUUUU!
        log_it (3, "Secondary gene: UID='$uid', symbol='$symbol', name='$name'. Attempting to find its current UID...");
        my $current_id;
        foreach my $c (@{$result->{'track-info'}->[0]->{'current-id'}}) {
          next unless $c->{'db'} eq 'GeneID';
          $current_id = $c->{'tag'}->[0]->{'id'};
          next unless $current_id;
          log_it (3, "Update: found current UID '$current_id' of secondary gene with UID='$uid'");
          analyze_entrez_genes ($struct, [$current_id]);
        }
        log_it (3, "Update: could not find current UID of secondary gene with UID='$uid'") unless $current_id;
        next SEQ;
      } else {
        $struct->add_gene(
                          uid     => $uid,
                          status  => $status,
                          name    => $name,
                          symbol  => $symbol,
                          );
        log_it (3, "New gene: UID='$uid', gene status='$status', symbol='$symbol', name='$name'.");
      }
      ## value is 'pseudo' for pseudo genes, should be 'protein-coding' otherwise
      if ($result->{'type'} eq 'pseudo') {
        $struct->add_gene(uid => $uid, pseudo  => 1);
        log_it (3, "Update: gene with UID='$uid' is ". $result->{'type'} ." gene. Marking as pseudo...");
        unless ($get_pseudo) {
          log_it (3, "Removing gene with UID='$uid' for being pseudo...");
          $struct->remove_gene($uid);
          next SEQ;
        }
      } else {
        $struct->add_gene(uid => $uid, pseudo  => 0);
        log_it (3, "Update: gene with UID='$uid' is ". $result->{'type'} ." gene. Marking as protein-coding...");
      }

      foreach my $l (@{$result->{'locus'}}){
        $l = $l->[0] if(ref($l) eq 'ARRAY');
        next unless ($l->{'heading'} && $l->{'heading'} =~ m/Reference primary assembly/i);
        my $ChrAccVer = $l->{'accession'};
        my $ChrStart  = $l->{'seqs'}->[0]->{'int'}->[0]->{'from'};
        my $ChrStop   = $l->{'seqs'}->[0]->{'int'}->[0]->{'to'};
        my $ChrStrand = $l->{'seqs'}->[0]->{'int'}->[0]->{'strand'};
        if ($ChrStrand eq 'minus') {
          $ChrStrand   = 1;
          $ChrStart   += 1 - $upstream;
          $ChrStop    += 1 + $downstream;
        } else {
          $ChrStrand   = 2;
          $ChrStart   += 1 - (-$upstream);
          $ChrStop    += 1 + (-$downstream);
        }
        $struct->add_gene(
                          uid       => $uid,
                          ChrAccVer => $ChrAccVer,
                          ChrStart  => $ChrStart,
                          ChrStop   => $ChrStop,
                          ChrStrand => $ChrStrand,
                          );
        log_it (3, "Update: gene with UID='$uid' has Accesion id $ChrAccVer between coordinates $ChrStart ... $ChrStop on strand $ChrStrand.");
        last; # if we got here once, no point in looking on the others
      }

      ## we will look for products accessions on the comments section instead of the
      ## locus section because locus doesn't say the RefSeq status of the products
      foreach my $c (@{$result->{'comments'}}){
        $c = $c->[0] if(ref($c) eq 'ARRAY');

        ## get RefSeq status
        if ($c->{'heading'} && $c->{'heading'} eq 'RefSeq Status') {
          my $refseq_status = $c->{'label'};
          $struct->add_gene(uid => $uid, 'RefSeq status' => $refseq_status);
          log_it (3, "Update: gene with UID='$uid' has RefSeq status='$refseq_status'");
        }

        ## the rest of the block only makes sense if it's not a pseudo gene
        if ($struct->get_info('gene', $uid, 'pseudo') ) {
          say "Finished analyzing gene with UID $uid earlier since it's pseudo gene " . by_caller_and_location('here') if $debug;
          next SEQ;
        }
        next unless ($c->{'heading'} && $c->{heading} eq 'NCBI Reference Sequences (RefSeq)');
        foreach my $cc (@{$c->{'comment'}}){
          $cc = $cc->[0] if(ref($cc) eq 'ARRAY');
          next unless $cc->{'heading'} eq 'RefSeqs maintained independently of Annotated Genomes';
          foreach my $ccp (@{$cc->{'products'}}){
            $ccp = $ccp->[0] if(ref($ccp) eq 'ARRAY');
            my $mRNA_acc  = $ccp->{'accession'};
            my $prot_acc  = $ccp->{'products'}->[0]->{'accession'};
            ## for the RefSeq status needs to be on a loop since there's a list of comments
            ## About RefSeq status of products:
            ## http://www.ncbi.nlm.nih.gov/entrez/query/static/help/genefaq.html#faq_g7.2
            ## XXX what can we do with this product RefSeq status?
            my $ref_stat;
            foreach my $ccpc (@{$ccp->{'comment'}}){
              next unless ($ccpc->{'label'} && $ccpc->{'label'} eq 'RefSeq Status');
              $ref_stat = $ccpc->{'text'};
            }
            $struct->add_product(
                                  type       => 'transcript',
                                  accession  => $mRNA_acc,
                                  gene       => $uid,
                                  protein    => $prot_acc,
                                  );
            $struct->add_product(
                                  type       => 'protein',
                                  accession  => $prot_acc,
                                  gene       => $uid,
                                  transcript => $mRNA_acc,
                                  );
            log_it (3, "Update: gene with UID='$uid' found to encode transcrip='$mRNA_acc' and protein='$prot_acc' with product RefStatus of $ref_stat.");
          }
        }
      }
      unless (scalar($struct->get_product_list('transcript', $uid)) >= 1 && scalar($struct->get_product_list('protein', $uid)) >= 1) {
        warn "WARNING: non-pseudo gene with UID='$uid' returned no protein and transcript.";
      }
    }
  }
}

sub create_fetcher {
  my $searched  = shift;
  my $rettype;
  given ($format) {
    when ('genbank') {$rettype = 'gb';}
    when ('fasta')   {$rettype = 'fasta';}
    default          {$rettype = 'native'; warn "WARNING: couldn't convert format '$format' to rettype. Using native as default."}
  }
  my $fetcher = Bio::DB::EUtilities->new(
                                          -eutil    => 'efetch',
                                          -db       => $searched,
                                          -retmode  => 'text',
                                          -rettype  => $rettype,
                                          );
  return $fetcher;
}

sub get_filehandle {
  my $type        = shift;
  my $product_key = shift;
  my $name_key    = shift;
  my $struct      = shift;
  my $base_dir;
  given ($type) {
    when ('gene')       { $base_dir = $seq_dir;  }
    when ('transcript') { $base_dir = $mrna_dir; }
    when ('protein')    { $base_dir = $prot_dir; }
    default { die "Found a bug. Unknow type provided '$type' to generate filename ". by_caller_and_location('before') ." Please report."; }
  }
  my $filepath = File::Spec->catfile ($base_dir, $struct->get_info($type, $product_key, $name_key) . file_extension_for($format));
  $filepath    = fix_filename ($filepath);
  open (my $filehandle, ">", $filepath) or die "Couldn't open '$filepath' for writing: $!";
  return $filehandle;
}

sub get_genes {
  my $struct    = shift;
  my @gene_uids = $struct->get_list('gene');
  my $fetcher   = create_fetcher('nucleotide');
  foreach my $gene_uid (@gene_uids) {
    log_it (2, "Getting gene: trying to get gene with UID='$gene_uid'...");
    unless ($struct->get_info('gene', $gene_uid, 'ChrAccVer')) {
      log_it (2, "Update: found no genomic info for gene with UID='$gene_uid'...");
      next;
    }
    my $filehandle = get_filehandle('gene', $gene_uid, $sequences, $struct);
    $fetcher->set_parameters (
                              -id         => $struct->get_info('gene', $gene_uid, 'ChrAccVer'),
                              -seq_start  => $struct->get_info('gene', $gene_uid, 'ChrStart'),
                              -seq_stop   => $struct->get_info('gene', $gene_uid, 'ChrStop'),
                              -strand     => $struct->get_info('gene', $gene_uid, 'ChrStrand'),
                             );
    log_it (3, "Update: fetching gene sequence for gene with UID='$gene_uid'...");
    print $filehandle $fetcher->get_Response->content;
    close $filehandle or warn $! ? "WARNING: error closing filehandle: $!" : "WARNING: exit status $? from filehandle";
  }
}

sub get_products {
  my $product       = shift;
  my $struct        = shift;
  my @product_acc   = $struct->get_list($product);

  my ($fetcher, $base_name);
  if ($product eq 'transcript') {
    $fetcher    = create_fetcher ('nuccore');
    $base_name  = $transcripts;
  } elsif ($product eq 'protein') {
    $fetcher    = create_fetcher ('protein');
    $base_name  = $proteins;
  } else {
    die "Bug found. Invalid product $product argument given ". by_caller_and_location('before') .". Please report it.";
  }

  ## ideally, there would be no loop, and we'd use $fetcher to get all the sequences in
  ## one go. However, that would force us to get the sequences in Bio::Seq files which
  ## can be different from the actually downloaded file. Better to not take the chance
  foreach my $product_acc (@product_acc) {
    log_it (2, "Getting $product: trying to get $product with accession='$product_acc'...");
    $fetcher->set_parameters (
                              -id => $product_acc,
                               );
    my $response = $fetcher->get_Response->content;
    open(my $seq_fh, "<", \$response) or die "Could not open sequence string for reading: $!";
    my $parser = Bio::SeqIO->new(
                                 -fh      => $seq_fh,
                                 -format  => $format,
                                 );
    while (my $seq = $parser->next_seq) {
      my $product_desc = $seq->description;
      $struct->add_product(
                            type        => $product,
                            accession   => $product_acc,
                            description => $product_desc,
                            );
    }
    my $filehandle = get_filehandle($product, $product_acc, $base_name, $struct);
    print $filehandle $response;
    close $filehandle;
  }
}

################################################################################
## other small useful subs
################################################################################

=item *
When creating the directories to save the files, if the directory already exists it will be used and no error
or warning will be issued unless B<--debug> as been set. If a non-directory file already exists with that name
sequence_extractor exits with an error.
=cut

## checks if directories exist and creates them as needed
sub check_dir {
  # TODO must check for permissions as well
  if (!-e $_[0]) {
    mkdir $_[0] or die "Could not mkdir '$_[0]': $!";
    say "Directory '$_[0]' created." if $debug;
  } elsif (!-d $_[0]) {
    die "Directory '$_[0]' to save output already exists as non-directory.";
  } else {
    say "Directory '$_[0]' NOT created since it already exists." if $debug;
  }
}

=item *
On the subject of program verbosity, all messages are saved on the log file. The
options B<--verbose> and B<--very-verbose> only affect their printing to standard
output. Messages from B<--debug> are independent and only printed if requested (and
then, still only for STDOUT).
=cut

sub log_it {
  my $level   = shift;
  my $message = shift;
  say LOG $message;
  say STDOUT $message unless $level > $verbosity;
}

## Removes repeated elements from an array. Does not respect original order
sub clean_array {
  my %hash;
  foreach (@{$_[0]}) {
    if ($hash{$_}) {
      say "Value $_ removed from array " . by_caller_and_location('here') . " called " . by_caller_and_location('before') if $debug;
    } else {
      $hash{$_} = 1;
    }
  }
  @{$_[0]} = keys %hash;
}

## Returns a pretty string about current time
sub get_time {
  my ($second, $minute, $hour, $day, $month, $year) = (localtime)[0,1,2,3,4,5];
  return sprintf ("[%04d-%02d-%02d %02d:%02d:%02d]", $year+1900, $month+1, $day, $hour, $minute, $second);
}

=item *
When saving a file, to avoid problems with limited filesystems such as NTFS or FAT, only some
characters are allowed. All other characters will be replaced by an underscore. Allowed characters
are:

=over

B<a-z 0-9 - +  . , () {} []'>

=back
=cut

## Tries to sanitize a filename
sub fix_filename {
  my ($volume, $directories, $file) = File::Spec->splitpath( $_[0] );
  $file =~ s/[^a-z0-9\-\+ \.,\(\){}\[\]']/_/ig;
  my $filepath  = File::Spec->catpath($volume, $directories, $file);
  say "Filepath '$_[0]' was converted to '$filepath' " . by_caller_and_location('here') . " called " . by_caller_and_location('before') if $debug;
  return $filepath;
}

sub by_caller_and_location {
  my $level;
  if (!@_ || $_[0] eq 'here') {
    $level = 1;
  } elsif ($_[0] eq 'before'){
    $level = 2;
  } elsif ($_[0] =~ /^[0-9]+$/){
    $level = 1 + $_[0];
  } else {
    die "Bug found when calculating level for caller function. Please report.";
  }
  my $deeper = shift;
  return "by " . (caller($level))[3] . " at line " . (caller($level))[2];
}

=item *
B<sequence extractor> tries to use the same file extensions that bioperl would expect when saving the file. It's,
possible that bioperl adds support for a new sequence format before B<sequence extractor>. In that case, the
file extension used defaults to '.seq'. Please report if you find a format that is not yet supported.
=cut

sub file_extension_for {
  ## TODO in some cases, extension changes whether it's protein or DNA or whatever
  ## should support this

  ## to update this list, look in the _guess_format method, inside SeqIO.pm of bioperl
  given ($_[0]) {
    when (/abi/i)        {return '.abi';}
    when (/ace/i)        {return '.ace';}
    when (/alf/i)        {return '.alf';}
    when (/bsml/i)       {return '.bsm';}
    when (/ctf/i)        {return '.ctf';}
    when (/embl/i)       {return '.embl';}
    when (/entrezgene/i) {return '.asn';}
    when (/exp/i)        {return '.exp';}
    when (/fasta/i)      {return '.fasta';} # fasta|fast|fas|seq|fa|fsa|nt|aa|fna|faa
    when (/fastq/i)      {return '.fastq';}
    when (/gcg/i)        {return '.gcg';}
    when (/genbank/i)    {return '.gb';} # gb|gbank|genbank|gbk|gbs
    when (/phd/i)        {return '.phd';} # phd|phred
    when (/pir/i)        {return '.pir';}
    when (/pln/i)        {return '.pln';}
    when (/qual/i)       {return '.qual';}
    when (/raw/i)        {return '.txt';}
    when (/scf/i)        {return '.scf';}
    when (/swiss/i)      {return '.swiss';} # swiss|sp
    when (/strider/i)    {return '.xdna';}  # ".xdna", ".xdgn", ".xrna" and ".xprt" for DNA, DNA Degenerate, RNA and Protein Sequence Files, respectively
    when (/ztr/i)        {return '.ztr';}
    default {
      say "Couldn't find the right extension for the requested format. Using '.seq' as default" if $debug;
      return ".seq";
    }
  }
}

################################################################################
## Structure methods
################################################################################
package Structure;

## creates a new instance of the object
sub new {
  my $class = shift;
  my $self  = {};
  bless ($self, $class);
  return $self;
}

## adds information to a specific gene and adds the gene to the structure if it doesn't exist
## $object->add_gene (
##                    uid        => gene_uid_of_the_gene,
##                    gene_name  => gene_name,
##                    other_info => corresponding value
##                    );
sub add_gene {
  my $self  = shift;
  my %data  = @_;
  ## remove the value from the hash so it doesn't show up in the loop
  my $gene_uid = delete $data{'uid'};
  unless ($gene_uid) {
    warn "WARNING: no gene UID supplied when adding new gene". main::by_caller_and_location('before');
    return;
  }
  ## when adding a gene (instead of updating, this goes first. Can't just hope
  ## to happen by itself on the loop later because it's possible to add a gene
  ## with no info on it
  if ( !exists($self->{'gene'}->{$gene_uid}) ) {
    say "Creating new gene with UID=$gene_uid." if $debug;
    $self->{'gene'}->{$gene_uid} = {};
    $self->{'gene'}->{$gene_uid}->{'uid'} = $gene_uid;  # this is not stupid. Makes it easier to have a general function to create the filename
    $self->{'gene'}->{$gene_uid}->{'transcript'} = [];
    $self->{'gene'}->{$gene_uid}->{'protein'}    = [];
  }
  ## fill it with all the data
  foreach my $key (keys %data) {
    $self->{'gene'}->{$gene_uid}->{$key} = $data{$key};
    say "Added $key=$data{$key} to gene with UID=$gene_uid." if $debug;
  }
}

## remove genes from the structure given their uid
## $object->remove_gene($uid)
## $object->remove_gene(@uids)
sub remove_gene {
  my $self  = shift;
  foreach my $uid (@_) {
    delete $self->{'gene'}->{$uid};
    say "Removed gene with UID=$uid" if $debug;
  }
}

sub add_product {
  my $self  = shift;
  my %data  = @_;
  ## remove these values from the hash so they don't show up in the loop later
  my $product     = delete $data{'type'};
  die "Bug found. Please report this. Product requested was $product ". main::by_caller_and_location('before') unless ($product eq 'protein' || $product eq 'transcript');
  my $product_acc = delete $data{'accession'};
  unless ($product_acc) {
    warn "WARNING: no $product accession supplied when adding new product". main::by_caller_and_location('before') . " Adding nothing.";
    return;
  }
  ## Since it's possible that a record for a product be generated automatically when
  ## creating it's related product, the only way to know if it's the first time, it's
  ## to check it's accession (it's the same as the key. It looks kinda of redundant
  ## but allows to have a simpler function to generate filename that uses get_info)
  if ( !exists($self->{$product}->{$product_acc}->{'accession'}) ) {
    say "Creating new $product with accession=$product_acc." if $debug;
    $self->{$product}->{$product_acc}->{'accession'} = $product_acc;
  }
  ## fill it with all the data
  foreach my $key (keys %data) {
    $self->{$product}->{$product_acc}->{$key} = $data{$key};
    ## if we're adding info about gene and related products, the array on their
    ## part of structure  needs to be updated
    if ($key eq 'gene') {
      my $products_on_gene = \@{$self->{'gene'}->{$data{$key}}->{$product}};
      push (@$products_on_gene, $product_acc);
      main::clean_array($products_on_gene);
    } elsif ($key eq 'transcript' && $product eq 'protein') {
      my $transcript_acc  = $data{$key};
      my $current         = $self->{'transcript'}->{$transcript_acc}->{'protein'};
      if ($current && $current ne $product_acc) {
        warn "WARNING: replacing accession $current with $product_acc as product of $transcript_acc. Please report this bug.";
        $self->{'transcript'}->{'protein'} = $product_acc;
      }
    } elsif ($key eq 'protein' && $product eq 'transcript') {
      my $protein_acc  = $data{$key};
      my $current      = $self->{'protein'}->{$protein_acc}->{'transcript'};
      if ($current && $current ne $product_acc) {
        warn "WARNING: replacing accession $current with $product_acc as ''template'' of $product_acc. Please report this bug.";
        $self->{'protein'}->{'transcript'} = $product_acc;
      }
    }
  }
}
## get information from the structure
## $value = $object->get_info('gene', $gene_id, 'wanted info');
## $value = $object->get_info('protein', $protein_acc, 'wanted info');
sub get_info {
  my $self  = shift;
  my $type  = shift;
  my $key   = shift;
  my $req   = shift;
  ## can't check the request here and return the key if the request is accession
  ## or id because even though we're using product accessions and gene UID as keys,
  ## gene also have accessions and products have id and if we had support for them
  ## later, it would create a lot of confusion and bugs
  return $self->{$type}->{$key}->{$req};
}

## returns a list of all gene UIDs or product accessions
##   @gene_uids = $structure->get_list('gene')
##   @mRNA_acc  = $structure->get_list('transcript')
sub get_list {
  my $self  = shift;
  my $type  = shift;
  return keys %{$self->{$type}};
}

## for the specified genes UIDs returns list of the requested products accessions
## If no gene_id is specified, returns a list of all accessions of the product requested
## @mRNA_acc  = structure->get_product_list('transcript', $gene_uid)
## @prot_acc  = structure->get_product_list('protein', @gene_uid)
## @prot_acc  = structure->get_product_list('protein')
sub get_product_list {
  my $self    = shift;
  my $product = shift;
  given ( scalar(@_) ) {
    when ([1]) { return @{$self->{'gene'}->{$_[0]}->{$product}}; }
    when ([0]) { return $self->get_list($product); }
    default    { return map { $self->get_product_list($product, $_) } @_; }
  }
}