Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- E00489:44:HNNVYCCXX:3:1101:24890:5616 99 22 16052014 150M
- E00489:44:HNNVYCCXX:1:1110:21704:27345 99 22 16052044 150M
- E00489:44:HNNVYCCXX:1:1217:2372:69519 163 22 16052044 150M
- E00489:44:HNNVYCCXX:3:2123:8948:16779 99 22 16052044 150M
- E00489:44:HNNVYCCXX:2:2213:2920:25534 147 22 16052054 146M4S
- E00489:44:HNNVYCCXX:2:2206:5020:71717 83 22 16052055 145M5S
- E00489:44:HNNVYCCXX:4:2206:12642:40829 99 22 16052056 144M6S
- #!/bin/perl
- #initialize hashes
- my %softhash;
- my %IDhash;
- my $file1Name = $ARGV[0] . 'all_sc_positions.txt';
- my $file2Name = $ARGV[0] . 'edge_sc_positions.txt';
- open(my $fh_all, '>', $file1Name);
- open(my $fh_edge, '>', $file2Name);
- #for each line
- while (my $line = <STDIN>) {
- #read in line and parse
- chomp($line);
- my @a = split("t", $line);
- my $start = $a[3];
- my $cigar = $a[4];
- my @b = split(/[A-Z]/, $cigar);# keeps track of numbers
- my @c = split(/[0-9]*/, $cigar); #keeps track of letters
- my $loc = $start; #distance from start of read
- my $var_start;
- my $var_end;
- #loop over type of alignment in cigar string, build hashes of candidate locations
- for (my $i = 0; $i <= $#c; $i++) {
- #if there is softclipping or an indel, find the location and store in hash
- if ($c[$i] eq "S" || $c[$i] eq "I" || $c[$i] eq "D") {
- #find softclipping location
- if ($i == 1) {
- $var_start = $start - $b[0];
- $var_end = $start;
- if ($c[$i] eq "S") {
- for (my $pos = $var_start; $pos < $var_end; $pos++) {
- $softhash{$a[2]}{$pos}++;
- }
- $var_start = $var_end - 1;
- $var_end = $var_start;
- }
- } else {
- for (my $j = $i-2; $j >= 0; $j--) {# subtract 2 from i, because the first value from c will always be empty
- $loc = $loc + $b[$j];
- }
- $var_start = $loc;
- $var_end = $loc + $b[($i-1)];
- if ($c[$i] eq "S") {
- for (my $pos=$var_start; $pos<$var_end; $pos++) {
- $softhash{$a[2]}{$pos}++;
- }
- $var_end = $var_start;
- }
- }
- $IDhash{$a[2]}{$var_start}{$var_end}{$c[$i]}++;
- }
- }
- }
- #write out edge features
- foreach my $key_chr (sort(keys %IDhash)) {
- foreach my $key_start (sort { $a <=> $b } (keys %{ $IDhash{$key_chr} })) {
- foreach my $key_end (sort { $a <=> $b } (keys %{ $IDhash{$key_chr}{$key_start} })){
- print $fh_edge "$key_chrt$key_startt$key_endt";
- my $sum = $IDhash{$key_chr}{$key_start}{$key_end}{I} + $IDhash{$key_chr}{$key_start}{$key_end}{D} + $IDhash{$key_chr}{$key_start}{$key_end}{S};
- print $fh_edge "$sum,";
- for my $l ('S','I','D') {
- if (defined($IDhash{$key_chr}{$key_start}{$key_end}{$l})) {
- print $fh_edge "$IDhash{$key_chr}{$key_start}{$key_end}{$l},";
- } else {
- print $fh_edge "0,";
- }
- }
- print $fh_edge "n";
- }
- }
- }
- #write out "all" features
- foreach my $key_chr (sort(keys %softhash)) {
- foreach my $key_pos (sort { $a <=> $b } (keys %{ $softhash{$key_chr} })) {
- print $fh_all "$key_chrt$key_post";
- print $fh_all "$softhash{$key_chr}{$key_pos}n";
- }
- }
Add Comment
Please, Sign In to add comment