Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/perl
### ----------------------------------------------------
### Script for sorting and comparing multi-sample report
### values from Avadis NGS
###
### Written by Niklas Malmqvist 2012
### ----------------------------------------------------
###
### Options (all mandatory except -r):
###   -i   <file>  tab-separated report; the first line is a header
###   -col <int>   column to sort on (numeric sort, handed to sort(1) as -k)
###   -r           sort in descending order
###   -s   <int>   number of samples in the report
###   -m   <int>   minimum score every sample must reach
###   -c   <int>   minimum "Sample count" (column 7) for a row to be kept
###
### Output files:
###   <file>.sorted      header + sorted rows passing the sample-count filter
###   <file>-highscores  header + rows where every sample reaches the min score
###
### Requires sort(1) and awk(1) on the PATH.
use strict;
use warnings;
use Getopt::Long;
use File::Temp qw(tempfile);
use Scalar::Util qw(looks_like_number);

## Report layout: the score of the first sample is in column 14, the next
## sample's score 6 columns further on, and so on. Columns are 1-based.
use constant FIRST_SCORE_COLUMN  => 14;
use constant SCORE_COLUMN_STRIDE => 6;
use constant SAMPLE_COUNT_COLUMN => 7;

## Command line options
my $sortColumn;
my $sortDescending = 0;
my $INPUT;
my $minScore;
my $noSamples;
my $minSampleCount;

GetOptions(
    "col=i" => \$sortColumn,
    "r"     => \$sortDescending,
    "s=n"   => \$noSamples,    ## TODO: Figure out how to automatically read nbr of samples
    "m=n"   => \$minScore,
    "c=n"   => \$minSampleCount,
    "i=s"   => \$INPUT,
) or exit 1;

## Options input check. The two thresholds are tested with defined() so that
## an explicit 0 is accepted instead of being mistaken for a missing argument.
unless ($sortColumn
    && $noSamples
    && defined $INPUT
    && length $INPUT
    && defined $minScore
    && defined $minSampleCount)
{
    print "$0: Missing mandatory input arguments!\n";
    exit 1;
}
if ($sortColumn < 1 || $noSamples < 1) {
    print "$0: -col and -s must be positive integers!\n";
    exit 1;
}

my $sortKey = "-k" . $sortColumn . "n" . ($sortDescending ? "r" : "");

## Open the input file and split off the header line
open my $inFh, "<", $INPUT or die "Can't open the input file!";
my $header = <$inFh>;

## Copy the data rows into a private temporary file. It is created with a
## unique name and removed automatically when the script exits, so parallel
## runs cannot trample each other's intermediate data.
my ($tmpFh, $tmpPath) = tempfile("avadis-sort-XXXXXX", TMPDIR => 1, UNLINK => 1);
while (my $row = <$inFh>) {
    print {$tmpFh} $row;
}
close $inFh;
close $tmpFh or die "Can't write the temporary file: $!";

## Sort the rows in place. The list form of system() bypasses the shell, so
## the file name is never interpreted as shell syntax.
system("sort", $sortKey, "-o", $tmpPath, $tmpPath) == 0
    or die "sort failed (status $?)";

## Open the output files
open my $sortedFh, ">", $INPUT . ".sorted"
    or die "Can't write the sorted file: $!";
open my $outFh, ">", $INPUT . "-highscores"
    or die "Can't open the output file!";

## The header goes first in both output files
if (defined $header) {
    print {$sortedFh} $header;
    print {$outFh} $header;
}

## Filter out rows with "Sample count" below threshold, again without a shell
my $awkProgram = '$' . SAMPLE_COUNT_COLUMN . ' >= ' . $minSampleCount;
open my $awkFh, "-|", "awk", $awkProgram, $tmpPath
    or die "Can't start awk: $!";

## Go through the sorted, filtered rows one by one and check the scores
my $noTotalHighScores = 0;
while (my $line = <$awkFh>) {
    print {$sortedFh} $line;

    ## Split a copy without the line ending so that the last column is clean
    (my $row = $line) =~ s/\r?\n\z//;
    my @fields = split /\t/, $row;

    ## Count the samples whose score reaches the threshold. Missing or
    ## non-numeric cells never count as a pass.
    my $noSampleHighScores = 0;
    for my $sample (0 .. $noSamples - 1) {
        my $score = $fields[FIRST_SCORE_COLUMN - 1 + SCORE_COLUMN_STRIDE * $sample];
        next unless defined $score && looks_like_number($score);
        $noSampleHighScores++ if $score >= $minScore;
    }

    ## Keep the row only when every sample passed
    if ($noSampleHighScores == $noSamples) {
        $noTotalHighScores++;
        print {$outFh} $line;
    }
}

close $awkFh    or die "awk failed (status $?)";
close $sortedFh or die "Can't write the sorted file: $!";
close $outFh    or die "Can't write the output file: $!";

print "Done!\nSamples with score > threshold: $noTotalHighScores\n";
Add Comment
Please, Sign In to add comment