Guest User

Untitled

a guest
May 25th, 2018
90
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.39 KB | None | 0 0
  1. #!/usr/bin/env perl
  2. use warnings;
  3. use strict;
  4.  
  5. local $SIG{__WARN__} = sub {
  6. my $message = shift;
  7. logger('warning', $message);
  8. };
  9. sub logger {
  10. my ($level, $msg) = @_;
  11. if (open my $out, '>>', 'convert2gn.log') {
  12. chomp $msg;
  13. print $out "$level - $msg\n"; #add newline after $msg
  14. }
  15. }
  16.  
  17.  
  18. my @files = glob( "\.\/IN" . '/*' );
  19. print "select file for converting to gene names\n";
  20. my $i = 0;
  21. for (@files) {print "$i\t$_\n"; $i++};
  22. chomp (my $will = <STDIN>);
  23. my $fl = $files[$will];
  24. print "$fl\n";
  25.  
  26. my (@out, $header) = ();
  27. my $fasta = get_fasta();
  28. my %fas = %{$fasta};
  29. my (%t, @h, $idx);
  30. open my $fh, "<", $fl or die "ERROR. Can not open $fl $!";
  31. while (<$fh>) {
  32. chomp;
  33. if (m/.*ID\t.+/) {
  34. $header = $_;
  35. @h = split /\t/, $_;
  36. }
  37. else {
  38. my @aa = split /\t/, $_;
  39. @t{@h} = @aa;
  40. my $bb = rm_sht($t{ID});
  41. my @idx2 = grep {$h[$_] eq 'ID' } 0..$#h;
  42. $idx = $idx2[0];
  43. for (@{$bb}) {
  44. if ($fas{$_}){
  45. $aa[$idx] = "$fas{$_}\t$_";
  46. my $bb = join "\t", @aa;
  47. push @out, $bb
  48. }
  49. }
  50. }
  51. }
  52. close $fh or die "ERROR. can not close fh\n$!\n";
  53. $h[$idx] = "GN\tID";
  54. @out = uniq(@out);
  55. writeO(\@out, "\.\/IN\/conv2gn_out.txt", join("\t", @h));
  56.  
  57.  
  58. sub rm_sht {
  59. #receive a mq formatted string (multiple accession tremble and shit) and convert it to SINGLE accession
  60. my ($string) = @_;
  61. if ($string =~ m/(.+);(.+)/g){ #if multiple accession
  62. my %out;
  63. my @arr = split /;/, $string;
  64. #remove swissprot or tremble id
  65. for my $single (@arr) {
  66.  
  67. #$single =~ s/sp\|.+\||tr\|.+\|//g;
  68. $single =~ s/sp\|(.+)\|[\w]+/$1/g;
  69. $out{$single} = 1; #so removing the duplicates by default
  70. }
  71. my @lol = keys %out;
  72. return (\@lol);
  73. }
  74. else {
  75. $string =~ s/sp\|(.+)\|[\w]+/$1/g;
  76. my @arr;
  77. push @arr, $string;
  78. return (\@arr);
  79. }
  80. }
  81.  
  82. sub read_table {
  83. #read a table and return a hash of an array with everything in that
  84. my ($path, $firstcolumname, $sep) = @_;
  85. open my $fh, "<", $path or die "ERROR. can not open fh$:";
  86. my (@h, %t, %out);
  87. while (<$fh>) {
  88. chomp;
  89. if (/^$firstcolumname/) {
  90. @h = split /$sep/, $_;
  91. next;
  92. }
  93. else {
  94. my $string = $_;
  95. my @d;
  96. if ($sep eq "\t") {
  97. @d = split /\t/, $string;
  98. }
  99. elsif ($sep eq ",") {
  100. @d = $string =~ m/("[^"]+"|[^,]*)(?:,s*)?/g;
  101. }
  102. @t{@h} = @d;
  103. }
  104. if (@h) {
  105. for my $headers (@h) {
  106. push @{$out{$headers}}, $t{$headers};
  107. }
  108. }
  109. }
  110. eval {
  111. close $fh or die "ERROR. can not close fh$!";
  112. };
  113. if ($@) {
  114. return ('error with close fh')
  115. }
  116. my $out_ref = \%out;
  117. return ($out_ref);
  118. }
  119.  
  120. sub writeO {
  121. #print out to file giving an array ref, path to file and the headers
  122. #use writeO($uni_sec, "./$lst{second_flnm},uni.txt", "unique");
  123. my ($ref, $outfile, $headers) = @_;
  124. my @out = @{$ref};
  125. unshift @out, $headers; #headers
  126. @out = map {$_ . "\n"}@out; #map newline
  127. open my $out_fh, ">", $outfile or die "$! can not open $outfile";
  128. print $out_fh @out;
  129. close $out_fh or die "can not close $outfile $!";
  130. }
  131.  
  132. sub get_fasta {
  133. my ($wanted) = @_;
  134. my @res = glob( "\.\/FASTA" . '/*' );
  135. print "fasta files available are:\n";
  136. my $i = 0;
  137. for (@res) {print "$i\t$_\n"; $i++};
  138. print "select fasta file to use for the conversion\t";
  139. chomp (my $will = <STDIN>);
  140. my $fasta = $res[$will];
  141. print "$fasta\n";
  142. open my $fh, "<", $fasta or die "ERROR. Can not open FASTA file $!";
  143. my (@h, %t);
  144. my $dupl = 1;
  145. while (<$fh>) {
  146. chomp;
  147. #used before ^<sp|.*\|([\w]+) .*GN=([\w]+) then added the part for matching the hyphen and if anything after the hyphen
  148. if (/sp\|(.+)\|[\w]+ .*GN=([\w]+[\-]?[\w]+).*/) {
  149. my $gene = $2;
  150. my $prot = $1;
  151. if (grep {$_ eq $gene} values %t){
  152. $dupl++;
  153. $gene = "$gene\_$dupl";
  154. $t{$prot} = uc($gene);
  155. }
  156. else {
  157. $t{$prot} = uc($gene);
  158. }
  159. }
  160. }
  161. close $fh or die "ERROR. can not close fh$!";
  162. return \%t;
  163. }
  164.  
  165. sub uniq {
  166. my %seen;
  167. grep !$seen{$_}++, @_;
  168. }
Add Comment
Please, Sign In to add comment