Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env perl
- use warnings;
- use strict;
- local $SIG{__WARN__} = sub {
- my $message = shift;
- logger('warning', $message);
- };
- sub logger {
- my ($level, $msg) = @_;
- if (open my $out, '>>', 'convert2gn.log') {
- chomp $msg;
- print $out "$level - $msg\n"; #add newline after $msg
- }
- }
- my @files = glob( "\.\/IN" . '/*' );
- print "select file for converting to gene names\n";
- my $i = 0;
- for (@files) {print "$i\t$_\n"; $i++};
- chomp (my $will = <STDIN>);
- my $fl = $files[$will];
- print "$fl\n";
- my (@out, $header) = ();
- my $fasta = get_fasta();
- my %fas = %{$fasta};
- my (%t, @h, $idx);
- open my $fh, "<", $fl or die "ERROR. Can not open $fl $!";
- while (<$fh>) {
- chomp;
- if (m/.*ID\t.+/) {
- $header = $_;
- @h = split /\t/, $_;
- }
- else {
- my @aa = split /\t/, $_;
- @t{@h} = @aa;
- my $bb = rm_sht($t{ID});
- my @idx2 = grep {$h[$_] eq 'ID' } 0..$#h;
- $idx = $idx2[0];
- for (@{$bb}) {
- if ($fas{$_}){
- $aa[$idx] = "$fas{$_}\t$_";
- my $bb = join "\t", @aa;
- push @out, $bb
- }
- }
- }
- }
- close $fh or die "ERROR. can not close fh\n$!\n";
- $h[$idx] = "GN\tID";
- @out = uniq(@out);
- writeO(\@out, "\.\/IN\/conv2gn_out.txt", join("\t", @h));
- sub rm_sht {
- #receive a mq formatted string (multiple accession tremble and shit) and convert it to SINGLE accession
- my ($string) = @_;
- if ($string =~ m/(.+);(.+)/g){ #if multiple accession
- my %out;
- my @arr = split /;/, $string;
- #remove swissprot or tremble id
- for my $single (@arr) {
- #$single =~ s/sp\|.+\||tr\|.+\|//g;
- $single =~ s/sp\|(.+)\|[\w]+/$1/g;
- $out{$single} = 1; #so removing the duplicates by default
- }
- my @lol = keys %out;
- return (\@lol);
- }
- else {
- $string =~ s/sp\|(.+)\|[\w]+/$1/g;
- my @arr;
- push @arr, $string;
- return (\@arr);
- }
- }
- sub read_table {
- #read a table and return a hash of an array with everything in that
- my ($path, $firstcolumname, $sep) = @_;
- open my $fh, "<", $path or die "ERROR. can not open fh$:";
- my (@h, %t, %out);
- while (<$fh>) {
- chomp;
- if (/^$firstcolumname/) {
- @h = split /$sep/, $_;
- next;
- }
- else {
- my $string = $_;
- my @d;
- if ($sep eq "\t") {
- @d = split /\t/, $string;
- }
- elsif ($sep eq ",") {
- @d = $string =~ m/("[^"]+"|[^,]*)(?:,s*)?/g;
- }
- @t{@h} = @d;
- }
- if (@h) {
- for my $headers (@h) {
- push @{$out{$headers}}, $t{$headers};
- }
- }
- }
- eval {
- close $fh or die "ERROR. can not close fh$!";
- };
- if ($@) {
- return ('error with close fh')
- }
- my $out_ref = \%out;
- return ($out_ref);
- }
- sub writeO {
- #print out to file giving an array ref, path to file and the headers
- #use writeO($uni_sec, "./$lst{second_flnm},uni.txt", "unique");
- my ($ref, $outfile, $headers) = @_;
- my @out = @{$ref};
- unshift @out, $headers; #headers
- @out = map {$_ . "\n"}@out; #map newline
- open my $out_fh, ">", $outfile or die "$! can not open $outfile";
- print $out_fh @out;
- close $out_fh or die "can not close $outfile $!";
- }
- sub get_fasta {
- my ($wanted) = @_;
- my @res = glob( "\.\/FASTA" . '/*' );
- print "fasta files available are:\n";
- my $i = 0;
- for (@res) {print "$i\t$_\n"; $i++};
- print "select fasta file to use for the conversion\t";
- chomp (my $will = <STDIN>);
- my $fasta = $res[$will];
- print "$fasta\n";
- open my $fh, "<", $fasta or die "ERROR. Can not open FASTA file $!";
- my (@h, %t);
- my $dupl = 1;
- while (<$fh>) {
- chomp;
- #used before ^<sp|.*\|([\w]+) .*GN=([\w]+) then added the part for matching the hyphen and if anything after the hyphen
- if (/sp\|(.+)\|[\w]+ .*GN=([\w]+[\-]?[\w]+).*/) {
- my $gene = $2;
- my $prot = $1;
- if (grep {$_ eq $gene} values %t){
- $dupl++;
- $gene = "$gene\_$dupl";
- $t{$prot} = uc($gene);
- }
- else {
- $t{$prot} = uc($gene);
- }
- }
- }
- close $fh or die "ERROR. can not close fh$!";
- return \%t;
- }
- sub uniq {
- my %seen;
- grep !$seen{$_}++, @_;
- }
Add Comment
Please, Sign In to add comment