Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/local/bin/perl
- =head1 NAME
- run_genewisedb.pl
- =head1 SYNOPSIS
- run_genewisedb.pl input_fasta input_hmms output outputdir spliceflat parameterfile
- where input_fasta is the input fasta file of scaffolds,
- input_hmms is the input file of HMMs,
- output is the genewise output file,
- outputdir is the output directory for writing output files,
- spliceflat says whether to use the -splice flat option (yes/no),
- parameterfile is the name of the intron parameter file (none if there is none).
- =head1 DESCRIPTION
- This script takes an input fasta file of scaffolds (<input_fasta>) and an
- input file of HMMs (<input_hmms>) and runs genewise on the scaffolds, using
- one HMM at a time. The output is then written in the output file <output>.
- Before running this perl script you need to run:
- setenv WISECONFIGDIR /nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/wisecfg/
- =head1 VERSION
- Perl script last edited 31-Oct-2012.
- =head1 CONTACT
- alc@sanger.ac.uk (Avril Coghlan)
- =cut
- #
- # Perl script run_genewisedb.pl
- # Written by Avril Coghlan (alc@sanger.ac.uk)
- # 31-Oct-12.
- # Last edited 31-Oct-2012.
- # SCRIPT SYNOPSIS: run_genewisedb.pl: runs GeneWise using TreeFam HMMs, on an input fasta file of scaffolds.
- #
- #------------------------------------------------------------------#
- # CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
- use strict;
- use warnings;
- my $num_args = $#ARGV + 1;
- if ($num_args != 6)
- {
- print "Usage of run_genewisedb.pl\n\n";
- print "perl run_genewisedb.pl <input_fasta> <input_hmms> <output> <outputdir> <spliceflat> <parameterfile>\n";
- print "where <input_fasta> is the input fasta file,\n";
- print " <input_hmms> is the input file of HMMs,\n";
- print " <output> is the genewise output file,\n";
- print " <outputdir> is the output directory for writing output files,\n";
- print " <spliceflat> says whether to use the -splice flat option (yes/no),\n";
- print " <parameterfile> is the name of the intron parameter file (none if there is none)\n";
- print "For example, >perl run_genewisedb.pl /lustre/scratch108/parasites/es9/50HGgenebuild/brpah/genome_small.fa\n";
- print "/lustre/scratch108/parasites/es9/GeneWise/TreeFam8.hmm genewise_output\n";
- print "/nfs/users/nfs_a/alc/Documents/GeneWise50Helminths yes mygenestat.stat\n";
- print "NOTE: Before running this perl script, you need to run:\n";
- print "setenv WISECONFIGDIR /nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/wisecfg/\n";
- exit;
- }
- # FIND THE PATH TO THE INPUT FASTA FILE:
- my $input_fasta = $ARGV[0];
- # FIND THE INPUT FILE OF HMMs:
- my $input_hmms = $ARGV[1];
- # FIND THE GENEWISE OUTPUT FILE:
- my $output = $ARGV[2];
- # FIND THE DIRECTORY TO USE FOR OUTPUT FILES:
- my $outputdir = $ARGV[3];
- # FIND OUT WHETHER WE WANT TO USE THE -splice flat OPTION IN GENEWISE:
- my $spliceflat = $ARGV[4];
- # FIND THE NAME OF THE INTRON PARAMETER FILE:
- my $parameterfile = $ARGV[5];
- #------------------------------------------------------------------#
- # TEST SUBROUTINES:
- my $PRINT_TEST_DATA = 0; # SAYS WHETHER TO PRINT DATA USED DURING TESTING.
- &test_print_error;
- &test_write_fasta_file($outputdir);
- #------------------------------------------------------------------#
- # RUN THE MAIN PART OF THE CODE:
- &run_main_program($outputdir,$input_fasta,$input_hmms,$output,$spliceflat,$parameterfile);
- print STDERR "FINISHED.\n";
- #------------------------------------------------------------------#
- # RUN THE MAIN PART OF THE CODE:
- sub run_main_program
- {
- my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES IN.
- my $input_fasta = $_[1]; # THE INPUT FASTA FILE
- my $input_hmms = $_[2]; # THE INPUT FILE OF HMMs
- my $output = $_[3]; # THE OUTPUT FILE NAME
- my $spliceflat = $_[4]; # SAYS WHETHER WE WANT TO USE THE -splice flat OPTION IN GENEWISE
- my $parameterfile = $_[5]; # THE NAME OF THE INTRON PARAMETER FILE
- my $errorcode; # RETURNED AS 0 IF THERE IS NO ERROR.
- my $errormsg; # RETURNED AS 'none' IF THERE IS NO ERROR.
- # CHECK THAT THE WISECONFIGDIR IS SET CORRECTLY:
- ($errorcode,$errormsg) = &check_wiseconfigdir;
- if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
- # RUN GENEWISE WITH ONE SCAFFOLD/CONTIG AT A TIME, ON ONE HMM AT A TIME:
- ($errorcode,$errormsg) = &run_genewise_on_scaffolds($outputdir,$input_fasta,$input_hmms,$output,$spliceflat,$parameterfile);
- if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
- }
- #------------------------------------------------------------------#
- # RUN GENEWISE WITH ONE SCAFFOLD/CONTIG AT A TIME, ON ONE HMM AT A TIME:
- sub run_genewise_on_scaffolds
- {
- my $outputdir = $_[0]; # THE DIRECTORY FOR WRITING OUTPUT FILES
- my $input_fasta = $_[1]; # THE INPUT FASTA FILE OF SCAFFOLDS
- my $input_hmms = $_[2]; # THE INPUT FILE OF HMMs
- my $output = $_[3]; # THE OUTPUT FILE
- my $spliceflat = $_[4]; # SAYS WHETHER TO USE THE SPLICE FLAT OPTION
- my $parameterfile = $_[5]; # THE INTRON PARAMETER FILE NAME
- my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
- my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
- my $line; #
- my @temp; #
- my $name; # NAME OF A SCAFFOLD
- my $seq = ""; # SEQUENCE OF A SCAFFOLD
- my $seq_fasta; # FASTA FILE FOR SEQUENCE OF A SCAFFOLD
- # OPEN THE OUTPUT FILE:
- $output = $outputdir."/".$output;
- open(OUTPUT,">$output") || die "ERROR: run_genewise: cannot open $output\n";
- close(OUTPUT);
- # READ IN THE INPUT FASTA FILE AND TAKE ONE SCAFFOLD AT A TIME:
- open(INPUTFASTA,"$input_fasta") || die "ERROR: run_genewise_on_scaffolds: cannot open $input_fasta\n";
- while(<INPUTFASTA>)
- {
- $line = $_;
- chomp $line;
- if (substr($line,0,1) eq '>')
- {
- # WRITE THE SEQUENCE TO A TEMPORARY FASTA FILE:
- if ($seq ne '')
- {
- ($seq_fasta,$errorcode,$errormsg) = &write_fasta_file($seq,$name,$outputdir);
- if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
- # RUN GENEWISE ON $seq_fasta, ONE HMM AT A TIME:
- ($errorcode,$errormsg) = &run_genewise($outputdir,$seq_fasta,$input_hmms,$output,$spliceflat,$parameterfile,$name);
- if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
- # DELETE TEMPORARY FILE:
- system "rm -f $seq_fasta";
- }
- @temp = split(/\s+/,$line);
- $name = $temp[0];
- $name = substr($name,1,length($name)-1);
- }
- else
- {
- $seq = $seq.$line;
- }
- }
- close(INPUTFASTA);
- # WRITE THE SEQUENCE TO A TEMPORARY FASTA FILE:
- if ($seq ne '')
- {
- ($seq_fasta,$errorcode,$errormsg) = &write_fasta_file($seq,$name,$outputdir);
- if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
- # RUN GENEWISE ON $seq_fasta:
- ($errorcode,$errormsg) = &run_genewise($outputdir,$seq_fasta,$input_hmms,$output,$spliceflat,$parameterfile,$name);
- if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
- # DELETE TEMPORARY FILE:
- system "rm -f $seq_fasta";
- }
- return($errorcode,$errormsg);
- }
- #------------------------------------------------------------------#
- # TEST &write_fasta_file
- sub test_write_fasta_file
- {
- my $outputdir = $_[0]; # DIRECTORY TO WRITE OUTPUT FILES TO
- my $errorcode; # RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR
- my $errormsg; # RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR
- my $fasta; # FASTA FILE
- my $expected_fasta; # EXPECTED CONTENTS OF FASTA FILE
- my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES
- my $differences; # DIFFERENCES BETWEEN $fasta AND $expected_fasta
- my $length_differences; # LENGTH OF $differences
- my $line; #
- ($fasta,$errorcode,$errormsg) = &write_fasta_file("AGCTAGCT","myseq",$outputdir);
- if ($errorcode != 0) { print STDERR "ERROR: test_write_fasta_file: failed test1\n"; exit;}
- $random_number = rand();
- $expected_fasta = $outputdir."/tmp".$random_number;
- open(EXPECTED,">$expected_fasta") || die "ERROR: write_fasta_file: cannot open $expected_fasta\n";
- print EXPECTED ">myseq\n";
- print EXPECTED "AGCTAGCT\n";
- close(EXPECTED);
- $differences = "";
- open(TEMP,"diff $fasta $expected_fasta |");
- while(<TEMP>)
- {
- $line = $_;
- $differences = $differences.$line;
- }
- close(TEMP);
- $length_differences = length($differences);
- if ($length_differences != 0) { print STDERR "ERROR: test_write_fasta_file: failed test1 (files $fasta $expected_fasta)\n"; exit;}
- system "rm -f $fasta";
- system "rm -f $expected_fasta";
- }
- #------------------------------------------------------------------#
- # WRITE THE SEQUENCE TO A TEMPORARY FASTA FILE:
- sub write_fasta_file
- {
- my $seq = $_[0]; # SEQUENCE
- my $name = $_[1]; # NAME OF THE SEQUENCE
- my $outputdir = $_[2]; # DIRECTORY FOR WRITING OUTPUT FILES TO
- my $fasta; # FASTA FILE
- my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES
- my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
- my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
- $random_number = rand();
- $fasta = $outputdir."/tmp".$random_number;
- open(FASTA,">$fasta") || die "ERROR: write_fasta_file: cannot open $fasta\n";
- print FASTA ">$name\n";
- print FASTA "$seq\n";
- close(FASTA);
- return($fasta,$errorcode,$errormsg);
- }
- #------------------------------------------------------------------#
- # CHECK THAT THE WISECONFIGDIR IS SET CORRECTLY:
- sub check_wiseconfigdir
- {
- my $wiseconfigdir = "none";# THE WISECONFIGDIR
- my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
- my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
- open(TMP,"echo \$WISECONFIGDIR |");
- while(<TMP>)
- {
- $wiseconfigdir = $_;
- chomp($wiseconfigdir);
- }
- close(TMP);
- if ($wiseconfigdir ne '/nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/wisecfg/')
- {
- $errormsg = "ERROR: check_wiseconfigdir: wiseconfigdir $wiseconfigdir\nBefore running this perl script, you must run:\nsetenv WISECONFIGDIR /nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/wisecfg/\n";
- $errorcode = 6; # ERRORCODE=6
- return($errorcode,$errormsg);
- }
- return($errorcode,$errormsg);
- }
- #------------------------------------------------------------------#
- # RUN GENEWISE ON ONE HMM AT A TIME:
- sub run_genewise
- {
- my $outputdir = $_[0]; # DIRECTORY FOR WRITING OUTPUT FILES TO
- my $input_fasta = $_[1]; # THE INPUT FASTA FILE
- my $input_hmms = $_[2]; # THE INPUT FILE OF HMMs
- my $output = $_[3]; # THE OUTPUT FILE NAME
- my $spliceflat = $_[4]; # SAYS WHETHER WE WANT TO USE THE -splice flat OPTION IN GENEWISE
- my $parameterfile = $_[5]; # THE NAME OF THE INTRON PARAMETER FILE
- my $scaffoldname = $_[6]; # THE NAME OF THE SCAFFOLD
- my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR.
- my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR.
- my $line; #
- my $hmm = ""; # A HMM
- my $hmmname = "none";# HMM NAME
- my @temp; #
- # READ IN ONE HMM AT A TIME FROM THE INPUT FILE OF HMMs:
- open(HMMS,"$input_hmms") || die "ERROR: run_genewise: cannot open $input_hmms\n";
- while(<HMMS>)
- {
- $line = $_;
- $hmm = $hmm.$line;
- if (substr($line,0,2) eq '//')
- {
- # CHECK WE HAVE A NAME FOR THE HMM:
- if ($hmmname eq 'none')
- {
- $errormsg = "ERROR: run_genewise: do not have name for hmm $hmm\n";
- $errorcode = 3; # ERRORCODE=3
- return($errorcode,$errormsg);
- }
- # RUN GENEWISE FOR THIS HMM:
- ($errorcode,$errormsg) = &run_genewise_on_hmm($hmm,$outputdir,$input_fasta,$output,$hmmname,$spliceflat,$parameterfile,$scaffoldname);
- if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
- $hmm = "";
- $hmmname = "none";
- }
- if (substr($line,0,4) eq 'NAME')
- {
- @temp = split(/\s+/,$line);
- $hmmname = $temp[1];
- chomp($hmmname);
- }
- }
- close(HMMS);
- if ($hmm ne '')
- {
- $errormsg = "ERROR: run_genewise: at end of file hmm should be empty\n";
- $errorcode = 1; # ERRORCODE=1
- return($errorcode,$errormsg);
- }
- return($errorcode,$errormsg);
- }
- #------------------------------------------------------------------#
- # READ IN ONE HMM AT A TIME FROM THE INPUT FILE OF HMMs:
- sub run_genewise_on_hmm
- {
- my $hmm = $_[0]; # THE HMM
- my $outputdir = $_[1]; # DIRECTORY TO WRITE OUTPUT FILES TO
- my $input_fasta = $_[2]; # INPUT FASTA FILE
- my $output = $_[3]; # FILE FOR GENEWISE OUTPUT
- my $hmmname = $_[4]; # NAME OF THE HMM
- my $spliceflat = $_[5]; # SAYS WHETHER WE WANT TO USE THE -splice flat OPTION IN GENEWISE
- my $parameterfile = $_[6]; # THE NAME OF THE INTRON PARAMETER FILE
- my $scaffoldname = $_[7]; # NAME OF THE SCAFFOLD
- my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR.
- my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR.
- my $random_number; # RANDOM NUMBER FOR TEMPORARY FILE NAME
- my $hmmfile; # FILE CONTAINING THE HMM
- my $genewise; # TEMPORARY FILE FOR THE GENEWISE OUTPUT FOR THIS HMM
- my $cmd; # THE GENEWISE COMMAND TO RUN
- my $genewise_exe = "/nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/src/bin/genewise"; # wise 2.4.1, can use -genestats
- # CHECK THE HMM LOOKS OK:
- if (substr($hmm,0,8) ne 'HMMER2.0' && substr($hmm,length($hmm)-2,2) ne '//')
- {
- $errormsg = "ERROR: run_genewise_on_hmm: hmm $hmm\n";
- $errorcode = 2; # ERRORCODE=2
- return($errorcode,$errormsg);
- }
- # CHECK THE SPLICEFLAT OPTION IS 'yes' OR 'no':
- if ($spliceflat ne 'yes' && $spliceflat ne 'no')
- {
- $errormsg = "ERROR: run_genewise_on_hmm: spliceflat is $spliceflat (should be yes or no)\n";
- $errorcode = 4; # ERRORCODE=4
- return($errorcode,$errormsg);
- }
- # WRITE THE HMM TO A FILE:
- $random_number = rand();
- $hmmfile = $outputdir."/tmp".$random_number;
- open(HMMFILE,">$hmmfile") || die "ERROR: run_genewise_on_hmm: cannot open $hmmfile\n";
- print HMMFILE "$hmm";
- close(HMMFILE);
- # WRITE THE FAMILY NAME IN THE OUTPUT FILE:
- system "echo $hmmname >> $output";
- # RUN GENEWISE:
- $random_number = rand();
- $genewise = $outputdir."/tmp".$random_number;
- print STDERR "Scaffold $scaffoldname: running genewise for HMM $hmmname ...\n";
- if ($parameterfile ne 'none' && $spliceflat eq 'no')
- {
- $cmd = "$genewise_exe $hmmfile $input_fasta -gff -hmmer -genestats $parameterfile -nosplice_gtag > $genewise";
- }
- elsif ($parameterfile eq 'none' && $spliceflat eq 'yes')
- {
- $cmd = "$genewise_exe $hmmfile $input_fasta -gff -hmmer -splice_gtag > $genewise";
- }
- elsif ($parameterfile eq 'none' && $spliceflat eq 'no')
- {
- $cmd = "$genewise_exe $hmmfile $input_fasta -gff -hmmer -nosplice_gtag > $genewise"; # USES THE gene.stats FILE THAT COMES WITH GENEWISE, /nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/wisecfg/gene.stat
- }
- elsif ($parameterfile ne 'none' && $spliceflat eq 'yes')
- {
- $errormsg = "ERROR: run_genewise_on_hmm: cannot have spliceflat $spliceflat and parameterfile $parameterfile\n";
- $errorcode = 5; # ERRORCODE=5
- return($errorcode,$errormsg);
- }
- system "$cmd";
- # CONCATENATE THIS GENEWISE OUTPUT ONTO THE MAIN OUTPUT FILE OF GENEWISE OUTPUT:
- sleep(1);
- system "cat $genewise";
- system "cat $genewise >> $output";
- # DELETE TEMPORARY FILES:
- system "rm -f $hmmfile";
- system "rm -f $genewise";
- return($errorcode,$errormsg);
- }
- #------------------------------------------------------------------#
- # TEST &print_error
- sub test_print_error
- {
- my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE WAS NO ERROR
- my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE WAS NO ERROR
- ($errormsg,$errorcode) = &print_error(45,45,1);
- if ($errorcode != 12) { print STDERR "ERROR: test_print_error: failed test1\n"; exit;}
- ($errormsg,$errorcode) = &print_error('My error message','My error message',1);
- if ($errorcode != 11) { print STDERR "ERROR: test_print_error: failed test2\n"; exit;}
- ($errormsg,$errorcode) = &print_error('none',45,1);
- if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test3\n"; exit;}
- ($errormsg,$errorcode) = &print_error('My error message', 0, 1);
- if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test4\n"; exit;}
- }
- #------------------------------------------------------------------#
- # PRINT OUT AN ERROR MESSAGE AND EXIT.
- sub print_error
- {
- my $errormsg = $_[0]; # THIS SHOULD BE NOT 'none' IF AN ERROR OCCURRED.
- my $errorcode = $_[1]; # THIS SHOULD NOT BE 0 IF AN ERROR OCCURRED.
- my $called_from_test = $_[2]; # SAYS WHETHER THIS WAS CALLED FROM test_print_error OR NOT
- if ($errorcode =~ /[A-Z]/ || $errorcode =~ /[a-z]/)
- {
- if ($called_from_test == 1)
- {
- $errorcode = 11; $errormsg = "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; # ERRORCODE=11
- return($errormsg,$errorcode);
- }
- else
- {
- print STDERR "ERROR: print_error: the errorcode is $errorcode, should be a number.\n";
- exit;
- }
- }
- if (!($errormsg =~ /[A-Z]/ || $errormsg =~ /[a-z]/))
- {
- if ($called_from_test == 1)
- {
- $errorcode = 12; $errormsg = "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; # ERRORCODE=12
- return($errormsg,$errorcode);
- }
- else
- {
- print STDERR "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n";
- exit;
- }
- }
- if ($errormsg eq 'none' || $errorcode == 0)
- {
- if ($called_from_test == 1)
- {
- $errorcode = 13; $errormsg = "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; # ERRORCODE=13
- return($errormsg,$errorcode);
- }
- else
- {
- print STDERR "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n";
- exit;
- }
- }
- else
- {
- print STDERR "$errormsg";
- exit;
- }
- return($errormsg,$errorcode);
- }
- #------------------------------------------------------------------#
Add Comment
Please, Sign In to add comment