Advertisement
Guest User

Perl Genome Unpacking Module

a guest
Dec 3rd, 2012
284
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 1.86 KB | None | 0 0
  1. #!/usr/bin/perl
  2.  
  3. #Package to support chromosomal sequence extraction without reading in the whole chromosome file. The genome file should contain only one sequence; format_genome strips the FASTA header and the line breaks and writes the sequence out as one continuous line of text. get_sequence can then be used to look up a subset of the sequence by byte offset without loading the entire chromosome sequence.
  4. #written by Russell Durrett
  5.  
  6. package Extract;
  7.  
  # is_fasta( FILE )
  #
  # Reports whether FILE looks like a FASTA file by checking that its very
  # first character is '>'.
  #
  # Parameters:
  #   $self - invocant (class name or object); not used.
  #   $file - path of the file to inspect.
  #
  # Returns the STRING 'true' or 'false'. Both strings are truthy in Perl,
  # so callers must compare with eq rather than test the result as a
  # boolean.
  #
  # Dies if no file name is given or the file cannot be opened.
  sub is_fasta {
      my ($self, $file) = @_;

      die "is_fasta: no file name given" unless defined $file;

      # Three-argument open on a lexical handle: the file name can never be
      # interpreted as an open mode, and the handle cannot clash with other
      # subs in this package.
      open(my $in, '<', $file)
          or die "is_fasta: cannot open '$file' for reading: $!";
      my $first_character = getc($in);
      close $in;

      # getc returns undef for an empty file; treat that as "no character"
      # so the message and comparison below never operate on undef.
      $first_character = '' unless defined $first_character;

      print "Running IS_FASTA? Routine - First character of file is $first_character";

      return $first_character eq '>' ? 'true' : 'false';
  }
  18.  
  # format_genome( INFILE, OUTFILE )
  #
  # Converts a single-sequence FASTA file into the flat layout that
  # get_sequence expects: every line starting with '>' is dropped and all
  # remaining lines are concatenated, without line endings, into OUTFILE.
  #
  # Parameters:
  #   $self           - invocant (class name or object); not used.
  #   $genome_infile  - path of the FASTA file to read.
  #   $genome_outfile - path of the flattened file to write (may be the
  #                     same path as the input).
  #
  # Returns 1 on success. Dies if a file cannot be opened, written, closed
  # or moved into place.
  sub format_genome {
      my ($self, $genome_infile, $genome_outfile) = @_;

      die "format_genome: input and output file names are required"
          unless defined $genome_infile && defined $genome_outfile;

      print "Reformatting $genome_infile to use with seek-unpack method. Renaming it $genome_outfile\n\n";

      open(my $in, '<', $genome_infile)
          or die "format_genome: cannot open '$genome_infile' for reading: $!";

      # Write to a sibling temp file and rename it at the end. This keeps
      # in-place conversion (INFILE eq OUTFILE) working even though the input
      # is streamed, and never leaves a half-written OUTFILE behind.
      my $tmpfile = "$genome_outfile.tmp.$$";
      open(my $out, '>', $tmpfile)
          or die "format_genome: cannot open '$tmpfile' for writing: $!";

      # Stream line by line with lexical state only: nothing is accumulated
      # in memory and nothing survives from one call to the next.
      while (my $line = <$in>) {
          next if $line =~ /^>/;      # do nothing with header

          # Remove the line ending, including the \r of CRLF files, so that
          # every byte of the output is a sequence character and byte
          # offsets equal sequence positions.
          $line =~ s/[\r\n]+\z//;

          unless (print {$out} $line) {
              my $err = $!;
              close $out;
              unlink $tmpfile;
              die "format_genome: write to '$tmpfile' failed: $err";
          }
      }
      close $in;

      # Buffered write errors only surface at close, so it must be checked.
      unless (close $out) {
          my $err = $!;
          unlink $tmpfile;
          die "format_genome: cannot close '$tmpfile': $err";
      }

      unless (rename($tmpfile, $genome_outfile)) {
          my $err = $!;
          unlink $tmpfile;
          die "format_genome: cannot rename '$tmpfile' to '$genome_outfile': $err";
      }

      print "\n\nPRINTED ALL OF $genome_infile CONCATENATED TO ONE LINE NOW IN $genome_outfile \n\n\n";

      return 1;
  }
  44.  
  45.  
  46.  
  47.  
  # get_sequence( INFILE, START POSITION, LENGTH )
  #
  # Returns up to LENGTH characters of INFILE beginning at byte offset START
  # (0-based), reading only the bytes that are needed.
  #
  # Parameters:
  #   $name   - invocant (class name or object); not used.
  #   $infile - path of the sequence file, normally one produced by
  #             format_genome (a single line with no header).
  #   $start  - non-negative integer byte offset to start reading at.
  #   $length - non-negative integer number of characters to return.
  #
  # Returns the extracted string. It is shorter than LENGTH when the end of
  # the file or a newline is reached first, and trailing whitespace is
  # stripped by the 'A' unpack format. Beware newlines: they count as bytes
  # when seeking, and extraction stops at the first one.
  #
  # Dies on a missing or non-integer START/LENGTH, or when the file cannot
  # be opened, positioned or read.
  sub get_sequence {
      my ($name, $infile, $start, $length) = @_;

      die "get_sequence: no input file given" unless defined $infile;
      for my $arg ($start, $length) {
          die "get_sequence: start and length must be non-negative integers"
              unless defined $arg && $arg =~ /\A[0-9]+\z/;
      }

      open(my $in, '<', $infile)
          or die "get_sequence: cannot open '$infile' for reading: $!";

      # seek to the position in the file you want to start at
      seek($in, $start, 0)
          or die "get_sequence: cannot seek to $start in '$infile': $!";

      # Read at most $length bytes. A readline here would pull everything
      # from $start to the end of the line, which for a flattened chromosome
      # file is the rest of the chromosome.
      my $chunk = '';
      my $got   = read($in, $chunk, $length);
      die "get_sequence: read from '$infile' failed: $!" unless defined $got;
      close $in;

      # A line-based read never returned data past a newline; keep that
      # contract for files that were not flattened first.
      my $newline_at = index($chunk, "\n");
      $chunk = substr($chunk, 0, $newline_at) if $newline_at >= 0;

      # format length & input as ASCII text (just 'A' and then the number of
      # text characters); 'A' also strips trailing whitespace and nulls.
      my $output = unpack("A$length", $chunk);

      return $output;
  }
  70.  
  71.  
  72.  
  73.  
  74.  
  75.  
  76.  
  77.  
  78.  
  79. 1;
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement