Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/perl
- ## Copyright (C) 2011 Carnë Draug <carandraug+dev@gmail.com>
- ##
- ## This program is free software; you can redistribute it and/or modify
- ## it under the terms of the GNU General Public License as published by
- ## the Free Software Foundation; either version 3 of the License, or
- ## (at your option) any later version.
- ##
- ## This program is distributed in the hope that it will be useful,
- ## but WITHOUT ANY WARRANTY; without even the implied warranty of
- ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- ## GNU General Public License for more details.
- ##
- ## You should have received a copy of the GNU General Public License
- ## along with this program; if not, see <http://www.gnu.org/licenses/>.
- use 5.010; # Use Perl 5.10
- use warnings; # Replacement for the -w flag, but lexically scoped
- use strict; # Enforce some good programming rules
- use Getopt::Long; # Parse program arguments
- use Cwd; # Determines current working directory
- use File::Spec; # Perform operation on file names
- use Bio::SeqIO; # Handler for SeqIO Formats
- ## List of useful variables related to project
- my $version = 0.01; # program version. Will be saved on log file
- my $email = 'carandraug+dev@gmail.com'; # mail to be used by the database sysadmins
- my $tool = 'bioperl-sequence_extractor'; # project name to be given to database sysadmin
- =head1 NAME
- sequence_extractor - for a specific list of searches, extracts all related sequences
- =head1 SYNOPSIS
- B<sequence_extractor> [] [] SEARCH
- =head1 DESCRIPTION
- This script does blah blah blah
- =head1 OPTIONS
- See the section bugs for problems when using default values of options.
- =over
- =cut
- =item B<--database>, B<--db>
- Specifies the database to search on. By default, it will search on genbank.
- =cut
- my $database = 'genbank';
- sub database_option_parsing {
- given ($_[1]) {
- when (/^(genbank|gb)$/i) { $database = 'genbank'; }
- when (/^(ensembl|e)$/i) { $database = 'ensembl'; }
- when ('') { } ## Do nothing. If not set, use default
- default { die "Specified database '$database' to search is not valid."; }
- }
- }
- =item B<--debug>
- If set, some warnings are printed (but not logged) that can may help on debugging. Might be a good idea
- to also set the option B<--very-verbose> when debugging (or read the log file).
- =cut
- my $debug = 0;
- =item B<--diagram>
- If set, a diagram showing the relationship between the extracted sequences is saved
- together with the log files.
- =cut
- my $diagram = 0;
- =item B<--downstream>, B<--down>
- Specifies the number of extra base pairs to be extracted downstream of the gene.
- This extra base pairs only affect the sequence, not the transcript or proteins.
- =cut
- my $downstream = 0;
- =item B<--format>
- Specifies the format that the sequences are to be saved. All bioperl supported sequence
- format are valid. Currently, there's no way to obtain sequences for each thing in a
- different formats. Some databases may have the requested only in certain formats.
- Due to limitations of the BIO::Seq object, it's safer to just save the raw sequence
- returned by the database queried rather than use Bio::SeqIO to avoid losing information,
- specially sequence Features and Annotatoins.
- =cut
- my $format = 'genbank';
- =item B<--limit>
- When making a query to a database, limit the result to the first specific results. This helps
- prevent the use of specially unspecific queries. The default value is 200. Note that this limit
- is for each search.
- =cut
- my $limit = 200;
- =item B<--save>
- Specifies the path for the directory where the sequence and log files will be saved. If the
- directory does not exist it will be created but the directory below must exist. Files on the
- directory may be rewritten if necessary. If unspecified, a directory named F<extracted sequences>
- on the current directory will be used.
- =cut
- my $save = File::Spec->catfile (getcwd, 'extracted sequences');
- =item B<--sequences>, B<--seq>
- Specifies the name for gene file. By default, they are not saved. If no value is given
- defaults to its UID. Possible values are 'uid', 'name', 'symbol' (the officyal symbol or
- nomenclature).
- =cut
- my $sequences = '';
- sub sequences_option_parsing {
- given ($_[1]) {
- when (/^(u)?id$/i) { $sequences = 'uid'; }
- when (/^sym(bol)?$/i) { $sequences = 'symbol'; }
- when (/^name$/i) { $sequences = 'name'; }
- ## default is set here, when value is empty
- when ('') { $sequences = 'uid' }
- default { die "Invalid identifier '$_[1]' for gene files."; }
- }
- }
- =item B<--proteins>
- Specifies the name for proteins file. By default, they are not not saved. If no value is given
- defaults to its accession. Possible values are 'accession', 'description', 'gene' (the corresponding
- gene ID. Note that one gene may have more than one protein so it may overwritten) and 'transcript'
- (the corresponding transcript accesion).
- =cut
- my $proteins = '';
- sub proteins_option_parsing {
- given ($_[1]) {
- when (/^acc(ession)?$/i) { $proteins = 'accession'; }
- when (/^desc(ription)?$/i) { $proteins = 'description'; }
- when (/^gene$/i) { $proteins = 'gene'; }
- when (/^(transcript|mrna)$/i) { $proteins = 'transcript'; }
- ## default is set here, when value is empty
- when ('') { $proteins = 'accession' }
- default { die "Invalid identifier '$_[1]' for protein files."; }
- }
- }
- =item B<--pseudo>, B<--nopseudo>
- By default, sequences of pseudo genes will be saved. B<--nopseudo> can be used to ignore those genes.
- =cut
- my $get_pseudo = 1;
- =item B<--transcripts>, B<--mrna>
- Specifies the name for transcripts file. By default, they are not is not saved. If no value is given
- defaults to its accession. Possible values are 'accession', 'description', 'gene' (the corresponding
- gene ID. Note that one gene may have more than one transcript so it may overwritten) and 'protein'
- (the corresponding protein accesion).
- =cut
- my $transcripts = '';
- sub transcripts_option_parsing {
- given ($_[1]) {
- when (/^acc(ession)?$/i) { $transcripts = 'accession'; }
- when (/^desc(ription)?$/i) { $transcripts = 'description'; }
- when (/^gene$/i) { $transcripts = 'gene'; }
- when (/^protein$/i) { $transcripts = 'protein'; }
- ## default is set here, when value is empty
- when ('') { $transcripts = 'accession' }
- default { die "Invalid identifier '$_[1]' for transcript files."; }
- }
- }
- =item B<--upstream>, B<--up>
- Specifies the number of extra base pairs to be extracted upstream of the gene.
- This extra base pairs only affect the sequence, not the transcript or proteins.
- =cut
- my $upstream = 0;
- =item B<--verbose>
- If set, program becomes verbose. For an extremely verbose program, use B<--very-verbose> instead.
- =cut
- my $verbose = '';
- =item B<--very-verbose>
- If set, program becomes extremely verbose. Setting this option, automatically sets B<--verbose> as well.
- =cut
- my $very_verbose = '';
- =back
- =head1 BUGS
- If you find any bug, or have a feature request, please report these at
- =over
- =item *
- When supplying options, it's possible to not supply a value and use their default. However,
- when the expected value is a string, the next argument may be confused as value for the
- option. For example, when using the following command:
- C<sequence_extractor --transcripts 'H2A AND homo sapiens'>
- we mean to search for 'H2A AND homo sapiens' saving only the transcripts and using the default
- as base for the filename. However, the search terms will be interpreted as the base for the
- filenames (but since it's not a valid identifier, it will return an error). To prevent
- this, you can either specify the values:
- C<sequence_extractor --transcripts 'accession' 'H2A AND homo sapiens'>
- C<sequence_extractor --transcripts='accession' 'H2A AND homo sapiens'>
- or you can use the double hash to stop processing options. Note that this should only be used
- after the last option. All arguments supplied after the double dash will be interpreted as search terms
- C<sequence_extractor --transcripts -- 'H2A AND homo sapiens'>
- =back
- =head1 EXAMPLES
- =over
- =back
- =head1 NOTES ON USAGE
- =over
- =cut
- ################################################################################
- ## Parse options, check and create files and directories needed
- ################################################################################
- GetOptions(
- 'database|db:s' => \&database_option_parsing,
- 'debug' => \$debug,
- 'diagram' => \$diagram,
- 'down|downstream=i' => \$downstream,
- 'format=s' => \$format,
- 'limit=i' => \$limit,
- 'proteins:s' => \&proteins_option_parsing,
- 'pseudo!' => \$get_pseudo,
- 'save=s' => \$save,
- 'seq|sequences:s' => \&sequences_option_parsing,
- 'mrna|transcripts:s' => \&transcripts_option_parsing,
- 'up|upstream=i' => \$upstream,
- 'verbose' => \$verbose,
- 'very-verbose' => \$very_verbose,
- ) or die "Error processing options";
- ## It is necessary to check success of GetOptions since:
- ## ''GetOptions returns true to indicate success. It returns false when the function
- ## detected one or more errors during option parsing. These errors are signalled
- ## using warn() and can be trapped with $SIG{__WARN__}''
- my $seq_dir = File::Spec->catfile ($save, 'sequences');
- my $mrna_dir = File::Spec->catfile ($save, 'transcripts');
- my $prot_dir = File::Spec->catfile ($save, 'proteins');
- check_dir($_) foreach ($save, $seq_dir, $mrna_dir, $prot_dir);
- my $log_file = File::Spec->catfile ($save, 'extractor.log');
- open (LOG, ">", $log_file) or die "Couldn't open file $log_file for writing: $!";
- ## set verbosity level
- my $verbosity;
- if ($very_verbose) {
- $verbosity = 3;
- } elsif ($verbose) {
- $verbosity = 2;
- } else {
- $verbosity = 1;
- }
- log_it (1, "This is sequence extractor version $version on ". &get_time);
- ################################################################################
- ## Everything is ready. Start accessing the databases
- ################################################################################
- my $data = Structure->new;
- given ($database) {
- when ('genbank') {
- use Bio::DB::EUtilities; # Retrieve entries from Entrez
- say "Searching on genbank...";
- my @uids;
- push (@uids, gb_search ($_)) foreach (@ARGV);
- {
- my $start = scalar(@uids);
- clean_array(\@uids);
- my $diff = $start - scalar(@uids);
- log_it (2, "Removed $diff UIDs from the search results for being repeated.");
- log_it (3, "List of retrieved ids is: @uids");
- }
- say "Fetching gene info...";
- analyze_entrez_genes ($data, \@uids);
- if ($sequences) { say "Fetching gene sequences..."; get_genes($data); }
- if ($transcripts) { say "Fetching transcript sequences..."; get_products('transcript', $data); }
- if ($proteins) { say "Fetching protein sequences..."; get_products('protein', $data); }
- }
- when ('ensembl') {
- say "Searching on ensembl";
- die "Search on ensembl has not been implemented yet.";
- }
- default {
- die "Non valid database '$database' to search.";
- }
- }
- if ($debug) { use Data::Dumper; print Dumper $data; }
- exit;
- ################################################################################
- ## genbank search subs
- ################################################################################
- sub gb_search {
- log_it (3, "Searching with '$_[0]' on the 'gene' database");
- my $searcher = Bio::DB::EUtilities->new(
- -eutil => 'esearch',
- -db => 'gene',
- -term => $_[0],
- -tool => $tool,
- -email => $email,
- -retmax => $limit,
- );
- log_it (3, "Query $_[0] translated into '" . $searcher->get_query_translation . "'");
- log_it (2, "Found " . $searcher->get_count . " UIDS with query '$_[0]'");
- log_it (2, "Search returned more ids than the set limit of $limit. Retrieving only the first '$limit' genes.") if ($searcher->get_count > $limit);
- return $searcher->get_ids;
- }
- {
- ## it's possible that when analyzing genes, if a gene has a RefSeq status of secondary, it will point
- ## to another gene, also in the list. To prevent analyzing the same gene twice here, this persistent
- ## hash private to this function is created and keeps track of the UID of genes analyzed
- ## TODO check new varianle type state instead of this hack
- my %analyzed_genes;
- sub analyze_entrez_genes {
- ## XXX we are not using esummary because it doesn't tell us the products of the gene thus forcing us
- ## to download the sequence and analyze it from that. We are also not using elink because it fails
- ## too frequently. Also, entrezgene gives currently the most information so it'll be easier to implement
- ## new stuff at a later time
- my $struct = shift;
- my $uid_ref = shift; # a reference for the array containing the list of gene UID
- ## TODO may be a good idea to limit this and download only a few sequences rather
- ## than what can possibly be thousands of them.
- my $fetcher = Bio::DB::EUtilities->new(
- -eutil => 'efetch',
- -db => 'gene',
- -id => $uid_ref,
- -retmode => 'text',
- -rettype => 'asn1',
- );
- my $response = $fetcher->get_Response->content;
- ## TODO when this bug is fixed https://redmine.open-bio.org/issues/3261
- ## this should be fixed to use Bio::SeqIO with format=> 'entrezgene'
- ## then we could use methods to access the data
- use Bio::ASN1::EntrezGene;
- ## This use of the open function requires perl 5.8.0 or later
- open(my $seq_fh, "<", \$response) or die "Could not open sequences string for reading: $!";
- my $parser = Bio::ASN1::EntrezGene->new(
- -fh => $seq_fh,
- );
- ## when analyzing the genes, if it has another current id, it's possible that
- my %analyzed;
- SEQ: while(my $result = $parser->next_seq){
- $result = $result->[0] if(ref($result) eq 'ARRAY');
- ## Data::Dumper can be used to look into the structure and find where things are
- # use Data::Dumper;
- # print Dumper ($result);
- my $uid = $result->{'track-info'}->[0]->{'geneid'};
- next SEQ if $analyzed_genes{$uid};
- $analyzed_genes{$uid} = 1;
- my ($symbol, $name);
- foreach my $p (@{$result->{'properties'}}){
- $p = $p->[0] if(ref($p) eq 'ARRAY');
- next unless ($p->{'label'} && $p->{'label'} eq 'Nomenclature');
- foreach my $pp (@{$p->{'properties'}}){
- $pp = $pp->[0] if(ref($pp) eq 'ARRAY');
- $name = $pp->{'text'} if ($pp->{'label'} && $pp->{'label'} eq 'Official Full Name');
- $symbol = $pp->{'text'} if ($pp->{'label'} && $pp->{'label'} eq 'Official Symbol');
- }
- }
- ## values for the gene-status (different from RefSeq status)
- ## live good??
- ## secondary synonym with merged
- ## discontinued 'deleted', still index and display to public
- ## newentry- for GeneRif submission
- my $status = $result->{'track-info'}->[0]->{'status'};
- if ($status eq 'discontinued') {
- log_it (3, "Discontinued gene: UID='$uid', symbol='$symbol', name='$name'. Forgetting about it...");
- next SEQ;
- } elsif ($status eq 'secondary') {
- ## recursivity! UUUUUUUUUU!
- log_it (3, "Secondary gene: UID='$uid', symbol='$symbol', name='$name'. Attempting to find its current UID...");
- my $current_id;
- foreach my $c (@{$result->{'track-info'}->[0]->{'current-id'}}) {
- next unless $c->{'db'} eq 'GeneID';
- $current_id = $c->{'tag'}->[0]->{'id'};
- next unless $current_id;
- log_it (3, "Update: found current UID '$current_id' of secondary gene with UID='$uid'");
- analyze_entrez_genes ($struct, [$current_id]);
- }
- log_it (3, "Update: could not find current UID of secondary gene with UID='$uid'") unless $current_id;
- next SEQ;
- } else {
- $struct->add_gene(
- uid => $uid,
- status => $status,
- name => $name,
- symbol => $symbol,
- );
- log_it (3, "New gene: UID='$uid', gene status='$status', symbol='$symbol', name='$name'.");
- }
- ## value is 'pseudo' for pseudo genes, should be 'protein-coding' otherwise
- if ($result->{'type'} eq 'pseudo') {
- $struct->add_gene(uid => $uid, pseudo => 1);
- log_it (3, "Update: gene with UID='$uid' is ". $result->{'type'} ." gene. Marking as pseudo...");
- unless ($get_pseudo) {
- log_it (3, "Removing gene with UID='$uid' for being pseudo...");
- $struct->remove_gene($uid);
- next SEQ;
- }
- } else {
- $struct->add_gene(uid => $uid, pseudo => 0);
- log_it (3, "Update: gene with UID='$uid' is ". $result->{'type'} ." gene. Marking as protein-coding...");
- }
- foreach my $l (@{$result->{'locus'}}){
- $l = $l->[0] if(ref($l) eq 'ARRAY');
- next unless ($l->{'heading'} && $l->{'heading'} =~ m/Reference primary assembly/i);
- my $ChrAccVer = $l->{'accession'};
- my $ChrStart = $l->{'seqs'}->[0]->{'int'}->[0]->{'from'};
- my $ChrStop = $l->{'seqs'}->[0]->{'int'}->[0]->{'to'};
- my $ChrStrand = $l->{'seqs'}->[0]->{'int'}->[0]->{'strand'};
- if ($ChrStrand eq 'minus') {
- $ChrStrand = 1;
- $ChrStart += 1 - $upstream;
- $ChrStop += 1 + $downstream;
- } else {
- $ChrStrand = 2;
- $ChrStart += 1 - (-$upstream);
- $ChrStop += 1 + (-$downstream);
- }
- $struct->add_gene(
- uid => $uid,
- ChrAccVer => $ChrAccVer,
- ChrStart => $ChrStart,
- ChrStop => $ChrStop,
- ChrStrand => $ChrStrand,
- );
- log_it (3, "Update: gene with UID='$uid' has Accesion id $ChrAccVer between coordinates $ChrStart ... $ChrStop on strand $ChrStrand.");
- last; # if we got here once, no point in looking on the others
- }
- ## we will look for products accessions on the comments section instead of the
- ## locus section because locus doesn't say the RefSeq status of the products
- foreach my $c (@{$result->{'comments'}}){
- $c = $c->[0] if(ref($c) eq 'ARRAY');
- ## get RefSeq status
- if ($c->{'heading'} && $c->{'heading'} eq 'RefSeq Status') {
- my $refseq_status = $c->{'label'};
- $struct->add_gene(uid => $uid, 'RefSeq status' => $refseq_status);
- log_it (3, "Update: gene with UID='$uid' has RefSeq status='$refseq_status'");
- }
- ## the rest of the block only makes sense if it's not a pseudo gene
- if ($struct->get_info('gene', $uid, 'pseudo') ) {
- say "Finished analyzing gene with UID $uid earlier since it's pseudo gene " . by_caller_and_location('here') if $debug;
- next SEQ;
- }
- next unless ($c->{'heading'} && $c->{heading} eq 'NCBI Reference Sequences (RefSeq)');
- foreach my $cc (@{$c->{'comment'}}){
- $cc = $cc->[0] if(ref($cc) eq 'ARRAY');
- next unless $cc->{'heading'} eq 'RefSeqs maintained independently of Annotated Genomes';
- foreach my $ccp (@{$cc->{'products'}}){
- $ccp = $ccp->[0] if(ref($ccp) eq 'ARRAY');
- my $mRNA_acc = $ccp->{'accession'};
- my $prot_acc = $ccp->{'products'}->[0]->{'accession'};
- ## for the RefSeq status needs to be on a loop since there's a list of comments
- ## About RefSeq status of products:
- ## http://www.ncbi.nlm.nih.gov/entrez/query/static/help/genefaq.html#faq_g7.2
- ## XXX what can we do with this product RefSeq status?
- my $ref_stat;
- foreach my $ccpc (@{$ccp->{'comment'}}){
- next unless ($ccpc->{'label'} && $ccpc->{'label'} eq 'RefSeq Status');
- $ref_stat = $ccpc->{'text'};
- }
- $struct->add_product(
- type => 'transcript',
- accession => $mRNA_acc,
- gene => $uid,
- protein => $prot_acc,
- );
- $struct->add_product(
- type => 'protein',
- accession => $prot_acc,
- gene => $uid,
- transcript => $mRNA_acc,
- );
- log_it (3, "Update: gene with UID='$uid' found to encode transcrip='$mRNA_acc' and protein='$prot_acc' with product RefStatus of $ref_stat.");
- }
- }
- }
- unless (scalar($struct->get_product_list('transcript', $uid)) >= 1 && scalar($struct->get_product_list('protein', $uid)) >= 1) {
- warn "WARNING: non-pseudo gene with UID='$uid' returned no protein and transcript.";
- }
- }
- }
- }
- sub create_fetcher {
- my $searched = shift;
- my $rettype;
- given ($format) {
- when ('genbank') {$rettype = 'gb';}
- when ('fasta') {$rettype = 'fasta';}
- default {$rettype = 'native'; warn "WARNING: couldn't convert format '$format' to rettype. Using native as default."}
- }
- my $fetcher = Bio::DB::EUtilities->new(
- -eutil => 'efetch',
- -db => $searched,
- -retmode => 'text',
- -rettype => $rettype,
- );
- return $fetcher;
- }
- sub get_filehandle {
- my $type = shift;
- my $product_key = shift;
- my $name_key = shift;
- my $struct = shift;
- my $base_dir;
- given ($type) {
- when ('gene') { $base_dir = $seq_dir; }
- when ('transcript') { $base_dir = $mrna_dir; }
- when ('protein') { $base_dir = $prot_dir; }
- default { die "Found a bug. Unknow type provided '$type' to generate filename ". by_caller_and_location('before') ." Please report."; }
- }
- my $filepath = File::Spec->catfile ($base_dir, $struct->get_info($type, $product_key, $name_key) . file_extension_for($format));
- $filepath = fix_filename ($filepath);
- open (my $filehandle, ">", $filepath) or die "Couldn't open '$filepath' for writing: $!";
- return $filehandle;
- }
- sub get_genes {
- my $struct = shift;
- my @gene_uids = $struct->get_list('gene');
- my $fetcher = create_fetcher('nucleotide');
- foreach my $gene_uid (@gene_uids) {
- log_it (2, "Getting gene: trying to get gene with UID='$gene_uid'...");
- unless ($struct->get_info('gene', $gene_uid, 'ChrAccVer')) {
- log_it (2, "Update: found no genomic info for gene with UID='$gene_uid'...");
- next;
- }
- my $filehandle = get_filehandle('gene', $gene_uid, $sequences, $struct);
- $fetcher->set_parameters (
- -id => $struct->get_info('gene', $gene_uid, 'ChrAccVer'),
- -seq_start => $struct->get_info('gene', $gene_uid, 'ChrStart'),
- -seq_stop => $struct->get_info('gene', $gene_uid, 'ChrStop'),
- -strand => $struct->get_info('gene', $gene_uid, 'ChrStrand'),
- );
- log_it (3, "Update: fetching gene sequence for gene with UID='$gene_uid'...");
- print $filehandle $fetcher->get_Response->content;
- close $filehandle or warn $! ? "WARNING: error closing filehandle: $!" : "WARNING: exit status $? from filehandle";
- }
- }
- sub get_products {
- my $product = shift;
- my $struct = shift;
- my @product_acc = $struct->get_list($product);
- my ($fetcher, $base_name);
- if ($product eq 'transcript') {
- $fetcher = create_fetcher ('nuccore');
- $base_name = $transcripts;
- } elsif ($product eq 'protein') {
- $fetcher = create_fetcher ('protein');
- $base_name = $proteins;
- } else {
- die "Bug found. Invalid product $product argument given ". by_caller_and_location('before') .". Please report it.";
- }
- ## ideally, there would be no loop, and we'd use $fetcher to get all the sequences in
- ## one go. However, that would force us to get the sequences in Bio::Seq files which
- ## can be different from the actually downloaded file. Better to not take the chance
- foreach my $product_acc (@product_acc) {
- log_it (2, "Getting $product: trying to get $product with accession='$product_acc'...");
- $fetcher->set_parameters (
- -id => $product_acc,
- );
- my $response = $fetcher->get_Response->content;
- open(my $seq_fh, "<", \$response) or die "Could not open sequence string for reading: $!";
- my $parser = Bio::SeqIO->new(
- -fh => $seq_fh,
- -format => $format,
- );
- while (my $seq = $parser->next_seq) {
- my $product_desc = $seq->description;
- $struct->add_product(
- type => $product,
- accession => $product_acc,
- description => $product_desc,
- );
- }
- my $filehandle = get_filehandle($product, $product_acc, $base_name, $struct);
- print $filehandle $response;
- close $filehandle;
- }
- }
- ################################################################################
- ## other small useful subs
- ################################################################################
- =item *
- When creating the directories to save the files, if the directory already exists it will be used and no error
- or warning will be issued unless B<--debug> as been set. If a non-directory file already exists with that name
- sequence_extractor exits with an error.
- =cut
- ## checks if directories exist and creates them as needed
- sub check_dir {
- # TODO must check for permissions as well
- if (!-e $_[0]) {
- mkdir $_[0] or die "Could not mkdir '$_[0]': $!";
- say "Directory '$_[0]' created." if $debug;
- } elsif (!-d $_[0]) {
- die "Directory '$_[0]' to save output already exists as non-directory.";
- } else {
- say "Directory '$_[0]' NOT created since it already exists." if $debug;
- }
- }
- =item *
- On the subject of program verbosity, all messages are saved on the log file. The
- options B<--verbose> and B<--very-verbose> only affect their printing to standard
- output. Messages from B<--debug> are independent and only printed if requested (and
- then, still only for STDOUT).
- =cut
- sub log_it {
- my $level = shift;
- my $message = shift;
- say LOG $message;
- say STDOUT $message unless $level > $verbosity;
- }
- ## Removes repeated elements from an array. Does not respect original order
- sub clean_array {
- my %hash;
- foreach (@{$_[0]}) {
- if ($hash{$_}) {
- say "Value $_ removed from array " . by_caller_and_location('here') . " called " . by_caller_and_location('before') if $debug;
- } else {
- $hash{$_} = 1;
- }
- }
- @{$_[0]} = keys %hash;
- }
- ## Returns a pretty string about current time
- sub get_time {
- my ($second, $minute, $hour, $day, $month, $year) = (localtime)[0,1,2,3,4,5];
- return sprintf ("[%04d-%02d-%02d %02d:%02d:%02d]", $year+1900, $month+1, $day, $hour, $minute, $second);
- }
- =item *
- When saving a file, to avoid problems with limited filesystems such as NTFS or FAT, only some
- characters are allowed. All other characters will be replaced by an underscore. Allowed characters
- are:
- =over
- B<a-z 0-9 - + . , () {} []'>
- =back
- =cut
- ## Tries to sanitize a filename
- sub fix_filename {
- my ($volume, $directories, $file) = File::Spec->splitpath( $_[0] );
- $file =~ s/[^a-z0-9\-\+ \.,\(\){}\[\]']/_/ig;
- my $filepath = File::Spec->catpath($volume, $directories, $file);
- say "Filepath '$_[0]' was converted to '$filepath' " . by_caller_and_location('here') . " called " . by_caller_and_location('before') if $debug;
- return $filepath;
- }
- sub by_caller_and_location {
- my $level;
- if (!@_ || $_[0] eq 'here') {
- $level = 1;
- } elsif ($_[0] eq 'before'){
- $level = 2;
- } elsif ($_[0] =~ /^[0-9]+$/){
- $level = 1 + $_[0];
- } else {
- die "Bug found when calculating level for caller function. Please report.";
- }
- my $deeper = shift;
- return "by " . (caller($level))[3] . " at line " . (caller($level))[2];
- }
- =item *
- B<sequence extractor> tries to use the same file extensions that bioperl would expect when saving the file. It's,
- possible that bioperl adds support for a new sequence format before B<sequence extractor>. In that case, the
- file extension used defaults to '.seq'. Please report if you find a format that is not yet supported.
- =cut
- sub file_extension_for {
- ## TODO in some cases, extension changes whether it's protein or DNA or whatever
- ## should support this
- ## to update this list, look in the _guess_format method, inside SeqIO.pm of bioperl
- given ($_[0]) {
- when (/abi/i) {return '.abi';}
- when (/ace/i) {return '.ace';}
- when (/alf/i) {return '.alf';}
- when (/bsml/i) {return '.bsm';}
- when (/ctf/i) {return '.ctf';}
- when (/embl/i) {return '.embl';}
- when (/entrezgene/i) {return '.asn';}
- when (/exp/i) {return '.exp';}
- when (/fasta/i) {return '.fasta';} # fasta|fast|fas|seq|fa|fsa|nt|aa|fna|faa
- when (/fastq/i) {return '.fastq';}
- when (/gcg/i) {return '.gcg';}
- when (/genbank/i) {return '.gb';} # gb|gbank|genbank|gbk|gbs
- when (/phd/i) {return '.phd';} # phd|phred
- when (/pir/i) {return '.pir';}
- when (/pln/i) {return '.pln';}
- when (/qual/i) {return '.qual';}
- when (/raw/i) {return '.txt';}
- when (/scf/i) {return '.scf';}
- when (/swiss/i) {return '.swiss';} # swiss|sp
- when (/strider/i) {return '.xdna';} # ".xdna", ".xdgn", ".xrna" and ".xprt" for DNA, DNA Degenerate, RNA and Protein Sequence Files, respectively
- when (/ztr/i) {return '.ztr';}
- default {
- say "Couldn't find the right extension for the requested format. Using '.seq' as default" if $debug;
- return ".seq";
- }
- }
- }
- ################################################################################
- ## Structure methods
- ################################################################################
- package Structure;
- ## creates a new instance of the object
- sub new {
- my $class = shift;
- my $self = {};
- bless ($self, $class);
- return $self;
- }
- ## adds information to a specific gene and adds the gene to the structure if it doesn't exist
- ## $object->add_gene (
- ## uid => gene_uid_of_the_gene,
- ## gene_name => gene_name,
- ## other_info => corresponding value
- ## );
- sub add_gene {
- my $self = shift;
- my %data = @_;
- ## remove the value from the hash so it doesn't show up in the loop
- my $gene_uid = delete $data{'uid'};
- unless ($gene_uid) {
- warn "WARNING: no gene UID supplied when adding new gene". main::by_caller_and_location('before');
- return;
- }
- ## when adding a gene (instead of updating, this goes first. Can't just hope
- ## to happen by itself on the loop later because it's possible to add a gene
- ## with no info on it
- if ( !exists($self->{'gene'}->{$gene_uid}) ) {
- say "Creating new gene with UID=$gene_uid." if $debug;
- $self->{'gene'}->{$gene_uid} = {};
- $self->{'gene'}->{$gene_uid}->{'uid'} = $gene_uid; # this is not stupid. Makes it easier to have a general function to create the filename
- $self->{'gene'}->{$gene_uid}->{'transcript'} = [];
- $self->{'gene'}->{$gene_uid}->{'protein'} = [];
- }
- ## fill it with all the data
- foreach my $key (keys %data) {
- $self->{'gene'}->{$gene_uid}->{$key} = $data{$key};
- say "Added $key=$data{$key} to gene with UID=$gene_uid." if $debug;
- }
- }
- ## remove genes from the structure given their uid
- ## $object->remove_gene($uid)
- ## $object->remove_gene(@uids)
- sub remove_gene {
- my $self = shift;
- foreach my $uid (@_) {
- delete $self->{'gene'}->{$uid};
- say "Removed gene with UID=$uid" if $debug;
- }
- }
- sub add_product {
- my $self = shift;
- my %data = @_;
- ## remove these values from the hash so they don't show up in the loop later
- my $product = delete $data{'type'};
- die "Bug found. Please report this. Product requested was $product ". main::by_caller_and_location('before') unless ($product eq 'protein' || $product eq 'transcript');
- my $product_acc = delete $data{'accession'};
- unless ($product_acc) {
- warn "WARNING: no $product accession supplied when adding new product". main::by_caller_and_location('before') . " Adding nothing.";
- return;
- }
- ## Since it's possible that a record for a product be generated automatically when
- ## creating it's related product, the only way to know if it's the first time, it's
- ## to check it's accession (it's the same as the key. It looks kinda of redundant
- ## but allows to have a simpler function to generate filename that uses get_info)
- if ( !exists($self->{$product}->{$product_acc}->{'accession'}) ) {
- say "Creating new $product with accession=$product_acc." if $debug;
- $self->{$product}->{$product_acc}->{'accession'} = $product_acc;
- }
- ## fill it with all the data
- foreach my $key (keys %data) {
- $self->{$product}->{$product_acc}->{$key} = $data{$key};
- ## if we're adding info about gene and related products, the array on their
- ## part of structure needs to be updated
- if ($key eq 'gene') {
- my $products_on_gene = \@{$self->{'gene'}->{$data{$key}}->{$product}};
- push (@$products_on_gene, $product_acc);
- main::clean_array($products_on_gene);
- } elsif ($key eq 'transcript' && $product eq 'protein') {
- my $transcript_acc = $data{$key};
- my $current = $self->{'transcript'}->{$transcript_acc}->{'protein'};
- if ($current && $current ne $product_acc) {
- warn "WARNING: replacing accession $current with $product_acc as product of $transcript_acc. Please report this bug.";
- $self->{'transcript'}->{'protein'} = $product_acc;
- }
- } elsif ($key eq 'protein' && $product eq 'transcript') {
- my $protein_acc = $data{$key};
- my $current = $self->{'protein'}->{$protein_acc}->{'transcript'};
- if ($current && $current ne $product_acc) {
- warn "WARNING: replacing accession $current with $product_acc as ''template'' of $product_acc. Please report this bug.";
- $self->{'protein'}->{'transcript'} = $product_acc;
- }
- }
- }
- }
- ## get information from the structure
- ## $value = $object->get_info('gene', $gene_id, 'wanted info');
- ## $value = $object->get_info('protein', $protein_acc, 'wanted info');
- sub get_info {
- my $self = shift;
- my $type = shift;
- my $key = shift;
- my $req = shift;
- ## can't check the request here and return the key if the request is accession
- ## or id because even though we're using product accessions and gene UID as keys,
- ## gene also have accessions and products have id and if we had support for them
- ## later, it would create a lot of confusion and bugs
- return $self->{$type}->{$key}->{$req};
- }
- ## returns a list of all gene UIDs or product accessions
- ## @gene_uids = $structure->get_list('gene')
- ## @mRNA_acc = $structure->get_list('transcript')
- sub get_list {
- my $self = shift;
- my $type = shift;
- return keys %{$self->{$type}};
- }
- ## for the specified genes UIDs returns list of the requested products accessions
- ## If no gene_id is specified, returns a list of all accessions of the product requested
- ## @mRNA_acc = structure->get_product_list('transcript', $gene_uid)
- ## @prot_acc = structure->get_product_list('protein', @gene_uid)
- ## @prot_acc = structure->get_product_list('protein')
- sub get_product_list {
- my $self = shift;
- my $product = shift;
- given ( scalar(@_) ) {
- when ([1]) { return @{$self->{'gene'}->{$_[0]}->{$product}}; }
- when ([0]) { return $self->get_list($product); }
- default { return map { $self->get_product_list($product, $_) } @_; }
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement