Advertisement
Guest User

carandraug

a guest
Jul 17th, 2011
69
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 37.43 KB | None | 0 0
  1. #!/usr/bin/perl
  2. ## Copyright (C) 2011 Carnë Draug <carandraug+dev@gmail.com>
  3. ##
  4. ## This program is free software; you can redistribute it and/or modify
  5. ## it under the terms of the GNU General Public License as published by
  6. ## the Free Software Foundation; either version 3 of the License, or
  7. ## (at your option) any later version.
  8. ##
  9. ## This program is distributed in the hope that it will be useful,
  10. ## but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. ## GNU General Public License for more details.
  13. ##
  14. ## You should have received a copy of the GNU General Public License
  15. ## along with this program; if not, see <http://www.gnu.org/licenses/>.
  16.  
  17. use 5.010;                      # Use Perl 5.10
  18. use warnings;                   # Replacement for the -w flag, but lexically scoped
  19. use strict;                     # Enforce some good programming rules
  20. use Getopt::Long;               # Parse program arguments
  21. use Cwd;                        # Determines current working directory
  22. use File::Spec;                 # Perform operation on file names
  23. use Bio::SeqIO;                 # Handler for SeqIO Formats
  24.  
  25. ## List of useful variables related to project
  26. my $version       = 0.01;                         # program version. Will be saved on log file
  27. my $email         = 'carandraug+dev@gmail.com';   # mail to be used by the database sysadmins
  28. my $tool          = 'bioperl-sequence_extractor'; # project name to be given to database sysadmin
  29.  
  30. =head1 NAME
  31.  
  32. sequence_extractor - for a specific list of searches, extracts all related sequences
  33.  
  34. =head1 SYNOPSIS
  35.  
  36. B<sequence_extractor> [] [] SEARCH
  37.  
  38. =head1 DESCRIPTION
  39.  
  40. This script does blah blah blah
  41.  
  42. =head1 OPTIONS
  43.  
  44. See the section bugs for problems when using default values of options.
  45.  
  46. =over
  47. =cut
  48.  
  49. =item B<--database>, B<--db>
  50.  
  51. Specifies the database to search on. By default, it will search on genbank.
  52. =cut
  53. my $database      = 'genbank';
  54. sub database_option_parsing {
  55.   given ($_[1]) {
  56.     when (/^(genbank|gb)$/i) { $database = 'genbank'; }
  57.     when (/^(ensembl|e)$/i)  { $database = 'ensembl'; }
  58.     when ('')                 { } ## Do nothing. If not set, use default
  59.     default                  { die "Specified database '$database' to search is not valid."; }
  60.   }
  61. }
  62.  
  63. =item B<--debug>
  64.  
  65. If set, some warnings are printed (but not logged) that can may help on debugging. Might be a good idea
  66. to also set the option B<--very-verbose> when debugging (or read the log file).
  67. =cut
  68. my $debug         = 0;
  69.  
  70. =item B<--diagram>
  71.  
  72. If set, a diagram showing the relationship between the extracted sequences is saved
  73. together with the log files.
  74. =cut
  75. my $diagram       = 0;
  76.  
  77. =item B<--downstream>, B<--down>
  78.  
  79. Specifies the number of extra base pairs to be extracted downstream of the gene.
  80. This extra base pairs only affect the sequence, not the transcript or proteins.
  81. =cut
  82. my $downstream    = 0;
  83.  
  84. =item B<--format>
  85.  
  86. Specifies the format that the sequences are to be saved. All bioperl supported sequence
  87. format are valid. Currently, there's no way to obtain sequences for each thing in a
  88. different formats. Some databases may have the requested only in certain formats.
  89.  
  90. Due to limitations of the BIO::Seq object, it's safer to just save the raw sequence
  91. returned by the database queried rather than use Bio::SeqIO to avoid losing information,
  92. specially sequence Features and Annotatoins.
  93. =cut
  94. my $format        = 'genbank';
  95.  
  96. =item B<--limit>
  97.  
  98. When making a query to a database, limit the result to the first specific results. This helps
  99. prevent the use of specially unspecific queries. The default value is 200. Note that this limit
  100. is for each search.
  101. =cut
  102. my $limit         = 200;
  103.  
  104. =item B<--save>
  105.  
  106. Specifies the path for the directory where the sequence and log files will be saved. If the
  107. directory does not exist it will be created but the directory below must exist. Files on the
  108. directory may be rewritten if necessary. If unspecified, a directory named F<extracted sequences>
  109. on the current directory will be used.
  110. =cut
  111. my $save          = File::Spec->catfile (getcwd, 'extracted sequences');
  112.  
  113. =item B<--sequences>, B<--seq>
  114.  
  115. Specifies the name for gene file. By default, they are not saved. If no value is given
  116. defaults to its UID. Possible values are 'uid', 'name', 'symbol' (the officyal symbol or
  117. nomenclature).
  118. =cut
  119. my $sequences     = '';
  120. sub sequences_option_parsing {
  121.   given ($_[1]) {
  122.     when (/^(u)?id$/i)    { $sequences = 'uid'; }
  123.     when (/^sym(bol)?$/i) { $sequences = 'symbol'; }
  124.     when (/^name$/i)      { $sequences = 'name'; }
  125.     ## default is set here, when value is empty
  126.     when ('')             { $sequences = 'uid' }
  127.     default               { die "Invalid identifier '$_[1]' for gene files."; }
  128.   }
  129. }
  130.  
  131.  
  132.  
  133. =item B<--proteins>
  134.  
  135. Specifies the name for proteins file. By default, they are not not saved. If no value is given
  136. defaults to its accession. Possible values are 'accession', 'description', 'gene' (the corresponding
  137. gene ID. Note that one gene may have more than one protein so it may overwritten) and 'transcript'
  138. (the corresponding transcript accesion).
  139. =cut
  140. my $proteins      = '';
  141. sub proteins_option_parsing {
  142.   given ($_[1]) {
  143.     when (/^acc(ession)?$/i)      { $proteins = 'accession'; }
  144.     when (/^desc(ription)?$/i)    { $proteins = 'description'; }
  145.     when (/^gene$/i)              { $proteins = 'gene'; }
  146.     when (/^(transcript|mrna)$/i) { $proteins = 'transcript'; }
  147.     ## default is set here, when value is empty
  148.     when ('')                     { $proteins = 'accession' }
  149.     default                       { die "Invalid identifier '$_[1]' for protein files."; }
  150.   }
  151. }
  152.  
  153. =item B<--pseudo>, B<--nopseudo>
  154.  
  155. By default, sequences of pseudo genes will be saved. B<--nopseudo> can be used to ignore those genes.
  156. =cut
  157. my $get_pseudo    = 1;
  158.  
  159. =item B<--transcripts>, B<--mrna>
  160.  
  161. Specifies the name for transcripts file. By default, they are not is not saved. If no value is given
  162. defaults to its accession. Possible values are 'accession', 'description', 'gene' (the corresponding
  163. gene ID. Note that one gene may have more than one transcript so it may overwritten) and 'protein'
  164. (the corresponding protein accesion).
  165. =cut
  166. my $transcripts   = '';
  167. sub transcripts_option_parsing {
  168.   given ($_[1]) {
  169.     when (/^acc(ession)?$/i)    { $transcripts = 'accession'; }
  170.     when (/^desc(ription)?$/i)  { $transcripts = 'description'; }
  171.     when (/^gene$/i)            { $transcripts = 'gene'; }
  172.     when (/^protein$/i)         { $transcripts = 'protein'; }
  173.     ## default is set here, when value is empty
  174.     when ('')                   { $transcripts = 'accession' }
  175.     default                     { die "Invalid identifier '$_[1]' for transcript files."; }
  176.   }
  177. }
  178.  
  179. =item B<--upstream>, B<--up>
  180.  
  181. Specifies the number of extra base pairs to be extracted upstream of the gene.
  182. This extra base pairs only affect the sequence, not the transcript or proteins.
  183. =cut
  184. my $upstream      = 0;
  185.  
  186. =item B<--verbose>
  187.  
  188. If set, program becomes verbose. For an extremely verbose program, use B<--very-verbose> instead.
  189. =cut
  190. my $verbose       = '';
  191.  
  192. =item B<--very-verbose>
  193.  
  194. If set, program becomes extremely verbose. Setting this option, automatically sets B<--verbose> as well.
  195. =cut
  196. my $very_verbose  = '';
  197.  
  198. =back
  199.  
  200. =head1 BUGS
  201.  
  202. If you find any bug, or have a feature request, please report these at
  203.  
  204. =over
  205.  
  206. =item *
  207.  
  208. When supplying options, it's possible to not supply a value and use their default. However,
  209. when the expected value is a string, the next argument may be confused as value for the
  210. option. For example, when using the following command:
  211.  
  212. C<sequence_extractor --transcripts 'H2A AND homo sapiens'>
  213.  
  214. we mean to search for 'H2A AND homo sapiens' saving only the transcripts and using the default
  215. as base for the filename. However, the search terms will be interpreted as the base for the
  216. filenames (but since it's not a valid identifier, it will return an error). To prevent
  217. this, you can either specify the values:
  218.  
  219. C<sequence_extractor --transcripts 'accession' 'H2A AND homo sapiens'>
  220.  
  221. C<sequence_extractor --transcripts='accession' 'H2A AND homo sapiens'>
  222.  
  223. or you can use the double hash to stop processing options. Note that this should only be used
  224. after the last option. All arguments supplied after the double dash will be interpreted as search terms
  225.  
  226. C<sequence_extractor --transcripts -- 'H2A AND homo sapiens'>
  227.  
  228. =back
  229.  
  230. =head1 EXAMPLES
  231.  
  232. =over
  233.  
  234. =back
  235.  
  236. =head1 NOTES ON USAGE
  237.  
  238. =over
  239.  
  240. =cut
  241.  
  242. ################################################################################
  243. ## Parse options, check and create files and directories needed
  244. ################################################################################
  245.  
  246. GetOptions(
  247.             'database|db:s'       => \&database_option_parsing,
  248.             'debug'               => \$debug,
  249.             'diagram'             => \$diagram,
  250.             'down|downstream=i'   => \$downstream,
  251.             'format=s'            => \$format,
  252.             'limit=i'             => \$limit,
  253.             'proteins:s'          => \&proteins_option_parsing,
  254.             'pseudo!'             => \$get_pseudo,
  255.             'save=s'              => \$save,
  256.             'seq|sequences:s'     => \&sequences_option_parsing,
  257.             'mrna|transcripts:s'  => \&transcripts_option_parsing,
  258.             'up|upstream=i'       => \$upstream,
  259.             'verbose'             => \$verbose,
  260.             'very-verbose'        => \$very_verbose,
  261.           ) or die "Error processing options";
  262. ## It is necessary to check success of GetOptions since:
  263. ## ''GetOptions returns true to indicate success. It returns false when the function
  264. ## detected one or more errors during option parsing. These errors are signalled
  265. ## using warn() and can be trapped with $SIG{__WARN__}''
  266.  
  267. my $seq_dir   = File::Spec->catfile ($save, 'sequences');
  268. my $mrna_dir  = File::Spec->catfile ($save, 'transcripts');
  269. my $prot_dir  = File::Spec->catfile ($save, 'proteins');
  270. check_dir($_) foreach ($save, $seq_dir, $mrna_dir, $prot_dir);
  271. my $log_file    = File::Spec->catfile ($save, 'extractor.log');
  272. open (LOG, ">", $log_file) or die "Couldn't open file $log_file for writing: $!";
  273.  
  274. ## set verbosity level
  275. my $verbosity;
  276. if ($very_verbose) {
  277.   $verbosity = 3;
  278. } elsif ($verbose) {
  279.   $verbosity = 2;
  280. } else {
  281.   $verbosity = 1;
  282. }
  283.  
  284. log_it (1, "This is sequence extractor version $version on ". &get_time);
  285.  
  286. ################################################################################
  287. ## Everything is ready. Start accessing the databases
  288. ################################################################################
  289.  
  290. my $data = Structure->new;
  291. given ($database) {
  292.   when ('genbank') {
  293.     use Bio::DB::EUtilities; # Retrieve entries from Entrez
  294.     say "Searching on genbank...";
  295.     my @uids;
  296.     push (@uids, gb_search ($_)) foreach (@ARGV);
  297.     {
  298.       my $start = scalar(@uids);
  299.       clean_array(\@uids);
  300.       my $diff  = $start - scalar(@uids);
  301.       log_it (2, "Removed $diff UIDs from the search results for being repeated.");
  302.       log_it (3, "List of retrieved ids is: @uids");
  303.     }
  304.     say "Fetching gene info...";
  305.     analyze_entrez_genes ($data, \@uids);
  306.  
  307.     if ($sequences)   { say "Fetching gene sequences...";       get_genes($data); }
  308.     if ($transcripts) { say "Fetching transcript sequences..."; get_products('transcript', $data); }
  309.     if ($proteins)    { say "Fetching protein sequences...";    get_products('protein', $data); }
  310.   }
  311.   when ('ensembl') {
  312.     say "Searching on ensembl";
  313.     die "Search on ensembl has not been implemented yet.";
  314.   }
  315.   default {
  316.     die "Non valid database '$database' to search.";
  317.   }
  318. }
  319. if ($debug) { use Data::Dumper; print Dumper $data; }
  320. exit;
  321.  
  322. ################################################################################
  323. ## genbank search subs
  324. ################################################################################
  325.  
  326. sub gb_search {
  327.   log_it (3, "Searching with '$_[0]' on the 'gene' database");
  328.   my $searcher  = Bio::DB::EUtilities->new(
  329.                                             -eutil   => 'esearch',
  330.                                             -db      => 'gene',
  331.                                             -term    => $_[0],
  332.                                             -tool    => $tool,
  333.                                             -email   => $email,
  334.                                             -retmax  => $limit,
  335.                                             );
  336.   log_it (3, "Query $_[0] translated into '" . $searcher->get_query_translation . "'");
  337.   log_it (2, "Found " . $searcher->get_count . " UIDS with query '$_[0]'");
  338.   log_it (2, "Search returned more ids than the set limit of $limit. Retrieving only the first '$limit' genes.") if ($searcher->get_count > $limit);
  339.   return $searcher->get_ids;
  340. }
  341.  
  342. {
  343.   ## it's possible that when analyzing genes, if a gene has a RefSeq status of secondary, it will point
  344.   ## to another gene, also in the list. To prevent analyzing the same gene twice here, this persistent
  345.   ## hash private to this function is created and keeps track of the UID of genes analyzed
  346.   ## TODO check new varianle type state instead of this hack
  347.   my %analyzed_genes;
  348.   sub analyze_entrez_genes {
  349.     ## XXX we are not using esummary because it doesn't tell us the products of the gene thus forcing us
  350.     ## to download the sequence and analyze it from that. We are also not using elink because it fails
  351.     ## too frequently. Also, entrezgene gives currently the most information so it'll be easier to implement
  352.     ## new stuff at a later time
  353.     my $struct  = shift;
  354.     my $uid_ref = shift;  # a reference for the array containing the list of gene UID
  355.  
  356.     ## TODO may be a good idea to limit this and download only a few sequences rather
  357.     ## than what can possibly be thousands of them.
  358.     my $fetcher = Bio::DB::EUtilities->new(
  359.                                             -eutil   => 'efetch',
  360.                                             -db      => 'gene',
  361.                                             -id      => $uid_ref,
  362.                                             -retmode => 'text',
  363.                                             -rettype => 'asn1',
  364.                                            );
  365.     my $response = $fetcher->get_Response->content;
  366.  
  367.     ## TODO when this bug is fixed https://redmine.open-bio.org/issues/3261
  368.     ## this should be fixed to use Bio::SeqIO with format=> 'entrezgene'
  369.     ## then we could use methods to access the data
  370.     use Bio::ASN1::EntrezGene;
  371.  
  372.     ## This use of the open function requires perl 5.8.0 or later
  373.     open(my $seq_fh, "<", \$response) or die "Could not open sequences string for reading: $!";
  374.  
  375.     my $parser = Bio::ASN1::EntrezGene->new(
  376.                                             -fh => $seq_fh,
  377.                                             );
  378.  
  379.     ## when analyzing the genes, if it has another current id, it's possible that
  380.     my %analyzed;
  381.     SEQ: while(my $result = $parser->next_seq){
  382.       $result = $result->[0] if(ref($result) eq 'ARRAY');
  383.       ## Data::Dumper can be used to look into the structure and find where things are
  384. #      use Data::Dumper;
  385. #      print Dumper ($result);
  386.  
  387.       my $uid = $result->{'track-info'}->[0]->{'geneid'};
  388.       next SEQ if $analyzed_genes{$uid};
  389.       $analyzed_genes{$uid} = 1;
  390.  
  391.       my ($symbol, $name);
  392.       foreach my $p (@{$result->{'properties'}}){
  393.         $p = $p->[0] if(ref($p) eq 'ARRAY');
  394.         next unless ($p->{'label'} && $p->{'label'} eq 'Nomenclature');
  395.         foreach my $pp (@{$p->{'properties'}}){
  396.           $pp     = $pp->[0] if(ref($pp) eq 'ARRAY');
  397.           $name   = $pp->{'text'} if ($pp->{'label'} && $pp->{'label'} eq 'Official Full Name');
  398.           $symbol = $pp->{'text'} if ($pp->{'label'} && $pp->{'label'} eq 'Official Symbol');
  399.         }
  400.       }
  401.       ## values for the gene-status (different from RefSeq status)
  402.       ## live           good??
  403.       ## secondary      synonym with merged
  404.       ## discontinued   'deleted', still index and display to public
  405.       ## newentry-      for GeneRif submission
  406.       my $status = $result->{'track-info'}->[0]->{'status'};
  407.  
  408.       if ($status eq 'discontinued') {
  409.         log_it (3, "Discontinued gene: UID='$uid', symbol='$symbol', name='$name'. Forgetting about it...");
  410.         next SEQ;
  411.       } elsif ($status eq 'secondary') {
  412.         ## recursivity! UUUUUUUUUU!
  413.         log_it (3, "Secondary gene: UID='$uid', symbol='$symbol', name='$name'. Attempting to find its current UID...");
  414.         my $current_id;
  415.         foreach my $c (@{$result->{'track-info'}->[0]->{'current-id'}}) {
  416.           next unless $c->{'db'} eq 'GeneID';
  417.           $current_id = $c->{'tag'}->[0]->{'id'};
  418.           next unless $current_id;
  419.           log_it (3, "Update: found current UID '$current_id' of secondary gene with UID='$uid'");
  420.           analyze_entrez_genes ($struct, [$current_id]);
  421.         }
  422.         log_it (3, "Update: could not find current UID of secondary gene with UID='$uid'") unless $current_id;
  423.         next SEQ;
  424.       } else {
  425.         $struct->add_gene(
  426.                           uid     => $uid,
  427.                           status  => $status,
  428.                           name    => $name,
  429.                           symbol  => $symbol,
  430.                           );
  431.         log_it (3, "New gene: UID='$uid', gene status='$status', symbol='$symbol', name='$name'.");
  432.       }
  433.       ## value is 'pseudo' for pseudo genes, should be 'protein-coding' otherwise
  434.       if ($result->{'type'} eq 'pseudo') {
  435.         $struct->add_gene(uid => $uid, pseudo  => 1);
  436.         log_it (3, "Update: gene with UID='$uid' is ". $result->{'type'} ." gene. Marking as pseudo...");
  437.         unless ($get_pseudo) {
  438.           log_it (3, "Removing gene with UID='$uid' for being pseudo...");
  439.           $struct->remove_gene($uid);
  440.           next SEQ;
  441.         }
  442.       } else {
  443.         $struct->add_gene(uid => $uid, pseudo  => 0);
  444.         log_it (3, "Update: gene with UID='$uid' is ". $result->{'type'} ." gene. Marking as protein-coding...");
  445.       }
  446.  
  447.       foreach my $l (@{$result->{'locus'}}){
  448.         $l = $l->[0] if(ref($l) eq 'ARRAY');
  449.         next unless ($l->{'heading'} && $l->{'heading'} =~ m/Reference primary assembly/i);
  450.         my $ChrAccVer = $l->{'accession'};
  451.         my $ChrStart  = $l->{'seqs'}->[0]->{'int'}->[0]->{'from'};
  452.         my $ChrStop   = $l->{'seqs'}->[0]->{'int'}->[0]->{'to'};
  453.         my $ChrStrand = $l->{'seqs'}->[0]->{'int'}->[0]->{'strand'};
  454.         if ($ChrStrand eq 'minus') {
  455.           $ChrStrand   = 1;
  456.           $ChrStart   += 1 - $upstream;
  457.           $ChrStop    += 1 + $downstream;
  458.         } else {
  459.           $ChrStrand   = 2;
  460.           $ChrStart   += 1 - (-$upstream);
  461.           $ChrStop    += 1 + (-$downstream);
  462.         }
  463.         $struct->add_gene(
  464.                           uid       => $uid,
  465.                           ChrAccVer => $ChrAccVer,
  466.                           ChrStart  => $ChrStart,
  467.                           ChrStop   => $ChrStop,
  468.                           ChrStrand => $ChrStrand,
  469.                           );
  470.         log_it (3, "Update: gene with UID='$uid' has Accesion id $ChrAccVer between coordinates $ChrStart ... $ChrStop on strand $ChrStrand.");
  471.         last; # if we got here once, no point in looking on the others
  472.       }
  473.      
  474.       ## we will look for products accessions on the comments section instead of the
  475.       ## locus section because locus doesn't say the RefSeq status of the products
  476.       foreach my $c (@{$result->{'comments'}}){
  477.         $c = $c->[0] if(ref($c) eq 'ARRAY');
  478.        
  479.         ## get RefSeq status
  480.         if ($c->{'heading'} && $c->{'heading'} eq 'RefSeq Status') {
  481.           my $refseq_status = $c->{'label'};
  482.           $struct->add_gene(uid => $uid, 'RefSeq status' => $refseq_status);
  483.           log_it (3, "Update: gene with UID='$uid' has RefSeq status='$refseq_status'");
  484.         }
  485.        
  486.         ## the rest of the block only makes sense if it's not a pseudo gene
  487.         if ($struct->get_info('gene', $uid, 'pseudo') ) {
  488.           say "Finished analyzing gene with UID $uid earlier since it's pseudo gene " . by_caller_and_location('here') if $debug;
  489.           next SEQ;
  490.         }
  491.         next unless ($c->{'heading'} && $c->{heading} eq 'NCBI Reference Sequences (RefSeq)');
  492.         foreach my $cc (@{$c->{'comment'}}){
  493.           $cc = $cc->[0] if(ref($cc) eq 'ARRAY');
  494.           next unless $cc->{'heading'} eq 'RefSeqs maintained independently of Annotated Genomes';
  495.           foreach my $ccp (@{$cc->{'products'}}){
  496.             $ccp = $ccp->[0] if(ref($ccp) eq 'ARRAY');
  497.             my $mRNA_acc  = $ccp->{'accession'};
  498.             my $prot_acc  = $ccp->{'products'}->[0]->{'accession'};
  499.             ## for the RefSeq status needs to be on a loop since there's a list of comments
  500.             ## About RefSeq status of products:
  501.             ## http://www.ncbi.nlm.nih.gov/entrez/query/static/help/genefaq.html#faq_g7.2
  502.             ## XXX what can we do with this product RefSeq status?
  503.             my $ref_stat;
  504.             foreach my $ccpc (@{$ccp->{'comment'}}){
  505.               next unless ($ccpc->{'label'} && $ccpc->{'label'} eq 'RefSeq Status');
  506.               $ref_stat = $ccpc->{'text'};
  507.             }
  508.             $struct->add_product(
  509.                                   type       => 'transcript',
  510.                                   accession  => $mRNA_acc,
  511.                                   gene       => $uid,
  512.                                   protein    => $prot_acc,
  513.                                   );
  514.             $struct->add_product(
  515.                                   type       => 'protein',
  516.                                   accession  => $prot_acc,
  517.                                   gene       => $uid,
  518.                                   transcript => $mRNA_acc,
  519.                                   );
  520.             log_it (3, "Update: gene with UID='$uid' found to encode transcrip='$mRNA_acc' and protein='$prot_acc' with product RefStatus of $ref_stat.");
  521.           }
  522.         }
  523.       }
  524.       unless (scalar($struct->get_product_list('transcript', $uid)) >= 1 && scalar($struct->get_product_list('protein', $uid)) >= 1) {
  525.         warn "WARNING: non-pseudo gene with UID='$uid' returned no protein and transcript.";
  526.       }
  527.     }
  528.   }
  529. }
  530.  
  531. sub create_fetcher {
  532.   my $searched  = shift;
  533.   my $rettype;
  534.   given ($format) {
  535.     when ('genbank') {$rettype = 'gb';}
  536.     when ('fasta')   {$rettype = 'fasta';}
  537.     default          {$rettype = 'native'; warn "WARNING: couldn't convert format '$format' to rettype. Using native as default."}
  538.   }
  539.   my $fetcher = Bio::DB::EUtilities->new(
  540.                                           -eutil    => 'efetch',
  541.                                           -db       => $searched,
  542.                                           -retmode  => 'text',
  543.                                           -rettype  => $rettype,
  544.                                           );
  545.   return $fetcher;
  546. }
  547.  
  548. sub get_filehandle {
  549.   my $type        = shift;
  550.   my $product_key = shift;
  551.   my $name_key    = shift;
  552.   my $struct      = shift;
  553.   my $base_dir;
  554.   given ($type) {
  555.     when ('gene')       { $base_dir = $seq_dir;  }
  556.     when ('transcript') { $base_dir = $mrna_dir; }
  557.     when ('protein')    { $base_dir = $prot_dir; }
  558.     default { die "Found a bug. Unknow type provided '$type' to generate filename ". by_caller_and_location('before') ." Please report."; }
  559.   }
  560.   my $filepath = File::Spec->catfile ($base_dir, $struct->get_info($type, $product_key, $name_key) . file_extension_for($format));
  561.   $filepath    = fix_filename ($filepath);
  562.   open (my $filehandle, ">", $filepath) or die "Couldn't open '$filepath' for writing: $!";
  563.   return $filehandle;
  564. }
  565.  
  566. sub get_genes {
  567.   my $struct    = shift;
  568.   my @gene_uids = $struct->get_list('gene');
  569.   my $fetcher   = create_fetcher('nucleotide');
  570.   foreach my $gene_uid (@gene_uids) {
  571.     log_it (2, "Getting gene: trying to get gene with UID='$gene_uid'...");
  572.     unless ($struct->get_info('gene', $gene_uid, 'ChrAccVer')) {
  573.       log_it (2, "Update: found no genomic info for gene with UID='$gene_uid'...");
  574.       next;
  575.     }
  576.     my $filehandle = get_filehandle('gene', $gene_uid, $sequences, $struct);
  577.     $fetcher->set_parameters (
  578.                               -id         => $struct->get_info('gene', $gene_uid, 'ChrAccVer'),
  579.                               -seq_start  => $struct->get_info('gene', $gene_uid, 'ChrStart'),
  580.                               -seq_stop   => $struct->get_info('gene', $gene_uid, 'ChrStop'),
  581.                               -strand     => $struct->get_info('gene', $gene_uid, 'ChrStrand'),
  582.                              );
  583.     log_it (3, "Update: fetching gene sequence for gene with UID='$gene_uid'...");
  584.     print $filehandle $fetcher->get_Response->content;
  585.     close $filehandle or warn $! ? "WARNING: error closing filehandle: $!" : "WARNING: exit status $? from filehandle";
  586.   }
  587. }
  588.  
  589. sub get_products {
  590.   my $product       = shift;
  591.   my $struct        = shift;
  592.   my @product_acc   = $struct->get_list($product);
  593.  
  594.   my ($fetcher, $base_name);
  595.   if ($product eq 'transcript') {
  596.     $fetcher    = create_fetcher ('nuccore');
  597.     $base_name  = $transcripts;
  598.   } elsif ($product eq 'protein') {
  599.     $fetcher    = create_fetcher ('protein');
  600.     $base_name  = $proteins;
  601.   } else {
  602.     die "Bug found. Invalid product $product argument given ". by_caller_and_location('before') .". Please report it.";
  603.   }
  604.  
  605.   ## ideally, there would be no loop, and we'd use $fetcher to get all the sequences in
  606.   ## one go. However, that would force us to get the sequences in Bio::Seq files which
  607.   ## can be different from the actually downloaded file. Better to not take the chance
  608.   foreach my $product_acc (@product_acc) {
  609.     log_it (2, "Getting $product: trying to get $product with accession='$product_acc'...");
  610.     $fetcher->set_parameters (
  611.                               -id => $product_acc,
  612.                                );
  613.     my $response = $fetcher->get_Response->content;
  614.     open(my $seq_fh, "<", \$response) or die "Could not open sequence string for reading: $!";
  615.     my $parser = Bio::SeqIO->new(
  616.                                  -fh      => $seq_fh,
  617.                                  -format  => $format,
  618.                                  );
  619.     while (my $seq = $parser->next_seq) {
  620.       my $product_desc = $seq->description;
  621.       $struct->add_product(
  622.                             type        => $product,
  623.                             accession   => $product_acc,
  624.                             description => $product_desc,
  625.                             );
  626.     }
  627.     my $filehandle = get_filehandle($product, $product_acc, $base_name, $struct);
  628.     print $filehandle $response;
  629.     close $filehandle;
  630.   }
  631. }
  632.  
  633. ################################################################################
  634. ## other small useful subs
  635. ################################################################################
  636.  
  637. =item *
  638. When creating the directories to save the files, if the directory already exists it will be used and no error
  639. or warning will be issued unless B<--debug> as been set. If a non-directory file already exists with that name
  640. sequence_extractor exits with an error.
  641. =cut
  642.  
  643. ## checks if directories exist and creates them as needed
  644. sub check_dir {
  645.   # TODO must check for permissions as well
  646.   if (!-e $_[0]) {
  647.     mkdir $_[0] or die "Could not mkdir '$_[0]': $!";
  648.     say "Directory '$_[0]' created." if $debug;
  649.   } elsif (!-d $_[0]) {
  650.     die "Directory '$_[0]' to save output already exists as non-directory.";
  651.   } else {
  652.     say "Directory '$_[0]' NOT created since it already exists." if $debug;
  653.   }
  654. }
  655.  
  656. =item *
  657. On the subject of program verbosity, all messages are saved on the log file. The
  658. options B<--verbose> and B<--very-verbose> only affect their printing to standard
  659. output. Messages from B<--debug> are independent and only printed if requested (and
  660. then, still only for STDOUT).
  661. =cut
  662.  
  663. sub log_it {
  664.   my $level   = shift;
  665.   my $message = shift;
  666.   say LOG $message;
  667.   say STDOUT $message unless $level > $verbosity;
  668. }
  669.  
  670. ## Removes repeated elements from an array. Does not respect original order
  671. sub clean_array {
  672.   my %hash;
  673.   foreach (@{$_[0]}) {
  674.     if ($hash{$_}) {
  675.       say "Value $_ removed from array " . by_caller_and_location('here') . " called " . by_caller_and_location('before') if $debug;
  676.     } else {
  677.       $hash{$_} = 1;
  678.     }
  679.   }
  680.   @{$_[0]} = keys %hash;
  681. }
  682.  
  683. ## Returns a pretty string about current time
  684. sub get_time {
  685.   my ($second, $minute, $hour, $day, $month, $year) = (localtime)[0,1,2,3,4,5];
  686.   return sprintf ("[%04d-%02d-%02d %02d:%02d:%02d]", $year+1900, $month+1, $day, $hour, $minute, $second);
  687. }
  688.  
  689. =item *
  690. When saving a file, to avoid problems with limited filesystems such as NTFS or FAT, only some
  691. characters are allowed. All other characters will be replaced by an underscore. Allowed characters
  692. are:
  693.  
  694. =over
  695.  
  696. B<a-z 0-9 - +  . , () {} []'>
  697.  
  698. =back
  699. =cut
  700.  
  701. ## Tries to sanitize a filename
  702. sub fix_filename {
  703.   my ($volume, $directories, $file) = File::Spec->splitpath( $_[0] );
  704.   $file =~ s/[^a-z0-9\-\+ \.,\(\){}\[\]']/_/ig;
  705.   my $filepath  = File::Spec->catpath($volume, $directories, $file);
  706.   say "Filepath '$_[0]' was converted to '$filepath' " . by_caller_and_location('here') . " called " . by_caller_and_location('before') if $debug;
  707.   return $filepath;
  708. }
  709.  
  710. sub by_caller_and_location {
  711.   my $level;
  712.   if (!@_ || $_[0] eq 'here') {
  713.     $level = 1;
  714.   } elsif ($_[0] eq 'before'){
  715.     $level = 2;
  716.   } elsif ($_[0] =~ /^[0-9]+$/){
  717.     $level = 1 + $_[0];
  718.   } else {
  719.     die "Bug found when calculating level for caller function. Please report.";
  720.   }
  721.   my $deeper = shift;
  722.   return "by " . (caller($level))[3] . " at line " . (caller($level))[2];
  723. }
  724.  
  725. =item *
  726. B<sequence extractor> tries to use the same file extensions that bioperl would expect when saving the file. It's,
  727. possible that bioperl adds support for a new sequence format before B<sequence extractor>. In that case, the
  728. file extension used defaults to '.seq'. Please report if you find a format that is not yet supported.
  729. =cut
  730.  
  731. sub file_extension_for {
  732.   ## TODO in some cases, extension changes whether it's protein or DNA or whatever
  733.   ## should support this
  734.  
  735.   ## to update this list, look in the _guess_format method, inside SeqIO.pm of bioperl
  736.   given ($_[0]) {
  737.     when (/abi/i)        {return '.abi';}
  738.     when (/ace/i)        {return '.ace';}
  739.     when (/alf/i)        {return '.alf';}
  740.     when (/bsml/i)       {return '.bsm';}
  741.     when (/ctf/i)        {return '.ctf';}
  742.     when (/embl/i)       {return '.embl';}
  743.     when (/entrezgene/i) {return '.asn';}
  744.     when (/exp/i)        {return '.exp';}
  745.     when (/fasta/i)      {return '.fasta';} # fasta|fast|fas|seq|fa|fsa|nt|aa|fna|faa
  746.     when (/fastq/i)      {return '.fastq';}
  747.     when (/gcg/i)        {return '.gcg';}
  748.     when (/genbank/i)    {return '.gb';} # gb|gbank|genbank|gbk|gbs
  749.     when (/phd/i)        {return '.phd';} # phd|phred
  750.     when (/pir/i)        {return '.pir';}
  751.     when (/pln/i)        {return '.pln';}
  752.     when (/qual/i)       {return '.qual';}
  753.     when (/raw/i)        {return '.txt';}
  754.     when (/scf/i)        {return '.scf';}
  755.     when (/swiss/i)      {return '.swiss';} # swiss|sp
  756.     when (/strider/i)    {return '.xdna';}  # ".xdna", ".xdgn", ".xrna" and ".xprt" for DNA, DNA Degenerate, RNA and Protein Sequence Files, respectively
  757.     when (/ztr/i)        {return '.ztr';}
  758.     default {
  759.       say "Couldn't find the right extension for the requested format. Using '.seq' as default" if $debug;
  760.       return ".seq";
  761.     }
  762.   }
  763. }
  764.  
  765. ################################################################################
  766. ## Structure methods
  767. ################################################################################
  768. package Structure;
  769.  
  770. ## creates a new instance of the object
  771. sub new {
  772.   my $class = shift;
  773.   my $self  = {};
  774.   bless ($self, $class);
  775.   return $self;
  776. }
  777.  
  778. ## adds information to a specific gene and adds the gene to the structure if it doesn't exist
  779. ## $object->add_gene (
  780. ##                    uid        => gene_uid_of_the_gene,
  781. ##                    gene_name  => gene_name,
  782. ##                    other_info => corresponding value
  783. ##                    );
  784. sub add_gene {
  785.   my $self  = shift;
  786.   my %data  = @_;
  787.   ## remove the value from the hash so it doesn't show up in the loop
  788.   my $gene_uid = delete $data{'uid'};
  789.   unless ($gene_uid) {
  790.     warn "WARNING: no gene UID supplied when adding new gene". main::by_caller_and_location('before');
  791.     return;
  792.   }
  793.   ## when adding a gene (instead of updating, this goes first. Can't just hope
  794.   ## to happen by itself on the loop later because it's possible to add a gene
  795.   ## with no info on it
  796.   if ( !exists($self->{'gene'}->{$gene_uid}) ) {
  797.     say "Creating new gene with UID=$gene_uid." if $debug;
  798.     $self->{'gene'}->{$gene_uid} = {};
  799.     $self->{'gene'}->{$gene_uid}->{'uid'} = $gene_uid;  # this is not stupid. Makes it easier to have a general function to create the filename
  800.     $self->{'gene'}->{$gene_uid}->{'transcript'} = [];
  801.     $self->{'gene'}->{$gene_uid}->{'protein'}    = [];
  802.   }
  803.   ## fill it with all the data
  804.   foreach my $key (keys %data) {
  805.     $self->{'gene'}->{$gene_uid}->{$key} = $data{$key};
  806.     say "Added $key=$data{$key} to gene with UID=$gene_uid." if $debug;
  807.   }
  808. }
  809.  
  810. ## remove genes from the structure given their uid
  811. ## $object->remove_gene($uid)
  812. ## $object->remove_gene(@uids)
  813. sub remove_gene {
  814.   my $self  = shift;
  815.   foreach my $uid (@_) {
  816.     delete $self->{'gene'}->{$uid};
  817.     say "Removed gene with UID=$uid" if $debug;
  818.   }
  819. }
  820.  
  821. sub add_product {
  822.   my $self  = shift;
  823.   my %data  = @_;
  824.   ## remove these values from the hash so they don't show up in the loop later
  825.   my $product     = delete $data{'type'};
  826.   die "Bug found. Please report this. Product requested was $product ". main::by_caller_and_location('before') unless ($product eq 'protein' || $product eq 'transcript');
  827.   my $product_acc = delete $data{'accession'};
  828.   unless ($product_acc) {
  829.     warn "WARNING: no $product accession supplied when adding new product". main::by_caller_and_location('before') . " Adding nothing.";
  830.     return;
  831.   }
  832.   ## Since it's possible that a record for a product be generated automatically when
  833.   ## creating it's related product, the only way to know if it's the first time, it's
  834.   ## to check it's accession (it's the same as the key. It looks kinda of redundant
  835.   ## but allows to have a simpler function to generate filename that uses get_info)
  836.   if ( !exists($self->{$product}->{$product_acc}->{'accession'}) ) {
  837.     say "Creating new $product with accession=$product_acc." if $debug;
  838.     $self->{$product}->{$product_acc}->{'accession'} = $product_acc;
  839.   }
  840.   ## fill it with all the data
  841.   foreach my $key (keys %data) {
  842.     $self->{$product}->{$product_acc}->{$key} = $data{$key};
  843.     ## if we're adding info about gene and related products, the array on their
  844.     ## part of structure  needs to be updated
  845.     if ($key eq 'gene') {
  846.       my $products_on_gene = \@{$self->{'gene'}->{$data{$key}}->{$product}};
  847.       push (@$products_on_gene, $product_acc);
  848.       main::clean_array($products_on_gene);
  849.     } elsif ($key eq 'transcript' && $product eq 'protein') {
  850.       my $transcript_acc  = $data{$key};
  851.       my $current         = $self->{'transcript'}->{$transcript_acc}->{'protein'};
  852.       if ($current && $current ne $product_acc) {
  853.         warn "WARNING: replacing accession $current with $product_acc as product of $transcript_acc. Please report this bug.";
  854.         $self->{'transcript'}->{'protein'} = $product_acc;
  855.       }
  856.     } elsif ($key eq 'protein' && $product eq 'transcript') {
  857.       my $protein_acc  = $data{$key};
  858.       my $current      = $self->{'protein'}->{$protein_acc}->{'transcript'};
  859.       if ($current && $current ne $product_acc) {
  860.         warn "WARNING: replacing accession $current with $product_acc as ''template'' of $product_acc. Please report this bug.";
  861.         $self->{'protein'}->{'transcript'} = $product_acc;
  862.       }
  863.     }
  864.   }
  865. }
  866. ## get information from the structure
  867. ## $value = $object->get_info('gene', $gene_id, 'wanted info');
  868. ## $value = $object->get_info('protein', $protein_acc, 'wanted info');
  869. sub get_info {
  870.   my $self  = shift;
  871.   my $type  = shift;
  872.   my $key   = shift;
  873.   my $req   = shift;
  874.   ## can't check the request here and return the key if the request is accession
  875.   ## or id because even though we're using product accessions and gene UID as keys,
  876.   ## gene also have accessions and products have id and if we had support for them
  877.   ## later, it would create a lot of confusion and bugs
  878.   return $self->{$type}->{$key}->{$req};
  879. }
  880.  
  881. ## returns a list of all gene UIDs or product accessions
  882. ##   @gene_uids = $structure->get_list('gene')
  883. ##   @mRNA_acc  = $structure->get_list('transcript')
  884. sub get_list {
  885.   my $self  = shift;
  886.   my $type  = shift;
  887.   return keys %{$self->{$type}};
  888. }
  889.  
  890. ## for the specified genes UIDs returns list of the requested products accessions
  891. ## If no gene_id is specified, returns a list of all accessions of the product requested
  892. ## @mRNA_acc  = structure->get_product_list('transcript', $gene_uid)
  893. ## @prot_acc  = structure->get_product_list('protein', @gene_uid)
  894. ## @prot_acc  = structure->get_product_list('protein')
  895. sub get_product_list {
  896.   my $self    = shift;
  897.   my $product = shift;
  898.   given ( scalar(@_) ) {
  899.     when ([1]) { return @{$self->{'gene'}->{$_[0]}->{$product}}; }
  900.     when ([0]) { return $self->get_list($product); }
  901.     default    { return map { $self->get_product_list($product, $_) } @_; }
  902.   }
  903. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement