Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/perl
use strict;
use warnings;
use File::Copy;

# This script reads all provisional TCGA studies on datahub and applies the
# following fixes:
#   1. Checks that each SAMPLE_TYPE value corresponds to the sample code of the
#      TCGA barcode, and fixes it in place otherwise.
#   2. Corrects the SAMPLE_TYPE_ID column in the same way.
#   3. Renames the PRIMARY_SITE, METASTATIC_SITE and TUMOR_TISSUE_SITE patient
#      attributes (column header and display name) to PRIMARY_SITE_PATIENT,
#      METASTATIC_SITE_PATIENT and SITE_OF_TUMOR_TISSUE.
#
# Usage: script.pl [DATAHUB_PUBLIC_DIR]

# SAMPLE_TYPE value expected for each two-digit barcode sample code.
my %bar_codes = (
    '01' => 'Primary',
    '02' => 'Recurrent',
    '03' => 'Primary',
    '05' => 'Primary',
    '06' => 'Metastatic',
    '07' => 'Metastatic',
    '11' => 'Normal',
    '12' => 'Normal',
);

# SAMPLE_TYPE_ID value expected for each sample code (the code without its
# leading zero).
my %sampletypeids = (
    '01' => '1',
    '02' => '2',
    '03' => '3',
    '05' => '5',
    '06' => '6',
    '07' => '7',
    '11' => '11',
    '12' => '12',
);

# Directory that holds one sub-directory per study. The first command line
# argument overrides the built-in default.
my $path = @ARGV ? $ARGV[0] : "/Users/rmadupuri/GitHub/datahub/public";

# Collect the provisional studies: entries whose name ends in "_tcga" (with at
# least one character before it). Hidden entries are skipped.
my @dirs;
opendir(my $root_dh, $path) or die("can't open the directory $path: $!");
while (defined(my $dir = readdir($root_dh))) {
    next if $dir =~ /^\./;
    push(@dirs, $dir) if $dir =~ /._tcga$/;    # For provisional studies
}
closedir($root_dh);
# correct the sample types
#
# For every study directory in @dirs, each file whose name matches
# /.data.sample./ is rewritten so that SAMPLE_TYPE and SAMPLE_TYPE_ID agree
# with the sample code, i.e. the last dash-separated component of SAMPLE_ID.
# Comment lines ("#...") and the header line are copied through unchanged.
# Rows whose sample code is not listed in %bar_codes / %sampletypeids, and
# columns that are absent from the header, are left untouched.
for my $dirname (@dirs) {
    my $dirpath = $path."/".$dirname;
    next unless -d $dirpath;

    # Take the listing up front: files are replaced below, and the temporary
    # files created for that must not show up in the iteration.
    opendir(my $study_dh, $dirpath) or die("can't open the directory $dirpath: $!");
    my @sample_files = grep { $_ =~ /.data.sample./ } readdir($study_dh);
    closedir($study_dh);

    for my $file (@sample_files) {
        my $filepath = $dirpath."/".$file;
        next unless -f $filepath;

        open(my $in, '<', $filepath) or die("can't open the file $filepath: $!");
        my $file_cont = "";
        my $sam_id_col;        # index of SAMPLE_ID
        my $sam_col;           # index of SAMPLE_TYPE
        my $sam_typeid_col;    # index of SAMPLE_TYPE_ID
        while (my $line = <$in>) {
            if ($line =~ /^#/) {
                $file_cont .= $line;
                next;
            }

            # Detach the line terminator so the last column compares cleanly,
            # and keep trailing empty fields (limit -1) so rows round-trip.
            my $eol = $line =~ s/(\r?\n)\z// ? $1 : "";
            my @fields = split(/\t/, $line, -1);

            if ($line =~ /^PATIENT_ID/ || $line =~ /^SAMPLE_ID/) {
                # Header line: remember where the relevant columns are.
                for my $i (0 .. $#fields) {
                    if ($fields[$i] eq "SAMPLE_ID")      { $sam_id_col     = $i }
                    if ($fields[$i] eq "SAMPLE_TYPE")    { $sam_col        = $i }
                    if ($fields[$i] eq "SAMPLE_TYPE_ID") { $sam_typeid_col = $i }
                }
            }
            elsif (defined $sam_id_col && defined $fields[$sam_id_col]) {
                my $sam_code = (split(/-/, $fields[$sam_id_col]))[-1];
                if (defined $sam_code) {
                    # Only overwrite when the code is known; an unknown code
                    # must not blank out the existing value.
                    if (defined $sam_col && exists $bar_codes{$sam_code}) {
                        $fields[$sam_col] = $bar_codes{$sam_code};
                    }
                    if (defined $sam_typeid_col && exists $sampletypeids{$sam_code}) {
                        $fields[$sam_typeid_col] = $sampletypeids{$sam_code};
                    }
                }
            }

            # A short row may have been extended above; emit the gaps as
            # empty cells.
            $file_cont .= join("\t", map { defined($_) ? $_ : "" } @fields).$eol;
        }
        close($in);

        # Write the new content next to the original and rename it into place,
        # so a failed write cannot destroy the existing file.
        my $tmppath = $filepath.".tmp.".$$;
        open(my $out, '>', $tmppath) or die("can't open the file $tmppath: $!");
        my $written = print {$out} $file_cont;
        my $closed  = close($out);
        unless ($written && $closed) {
            my $err = $!;
            unlink($tmppath);
            die("can't write the file $tmppath: $err");
        }
        unless (rename($tmppath, $filepath)) {
            my $err = $!;
            unlink($tmppath);
            die("can't replace the file $filepath: $err");
        }
    }
}
# change the patient attribute names
#
# For every study directory in @dirs, each file whose name matches
# /.data.patient./ gets three attributes renamed, both in the column header
# (the first line starting with OTHER_PATIENT_ID) and in the first "#" comment
# line, which carries the display names. All other lines, including the
# remaining "#" lines, are written back unchanged and in their original order.
# Files without such a header, or without any of the three attributes, are
# not rewritten.
my %patient_attr_rename = (
    "PRIMARY_SITE" => {
        header  => "PRIMARY_SITE_PATIENT",
        display => "Patient Primary Tumor Site",
    },
    "METASTATIC_SITE" => {
        header  => "METASTATIC_SITE_PATIENT",
        display => "Patient Metastatic Sites",
    },
    "TUMOR_TISSUE_SITE" => {
        header  => "SITE_OF_TUMOR_TISSUE",
        display => "Tumor Tissue Site",
    },
);

for my $dirname (@dirs) {
    my $dirpath = $path."/".$dirname;
    next unless -d $dirpath;

    # Take the listing up front: files are replaced below, and the temporary
    # files created for that must not show up in the iteration.
    opendir(my $study_dh, $dirpath) or die("can't open the directory $dirpath: $!");
    my @patient_files = grep { $_ =~ /.data.patient./ } readdir($study_dh);
    closedir($study_dh);

    for my $file (@patient_files) {
        my $filepath = $dirpath."/".$file;
        next unless -f $filepath;

        open(my $in, '<', $filepath) or die("can't open the file $filepath: $!");
        my @lines = <$in>;
        close($in);

        # Locate the display-name line (first comment) and the header line.
        my $display_idx;
        my $header_idx;
        for my $i (0 .. $#lines) {
            if ($lines[$i] =~ /^#/) {
                $display_idx = $i unless defined $display_idx;
            }
            elsif ($lines[$i] =~ /^OTHER_PATIENT_ID/) {
                $header_idx = $i unless defined $header_idx;
            }
        }
        next unless defined $header_idx;

        # Detach line terminators so the last column compares cleanly and
        # keeps its newline when renamed; limit -1 keeps trailing empty fields.
        my $header     = $lines[$header_idx];
        my $header_eol = $header =~ s/(\r?\n)\z// ? $1 : "";
        my @he_line    = split(/\t/, $header, -1);

        my @com_line;
        my $display_eol = "";
        if (defined $display_idx) {
            my $display = $lines[$display_idx];
            $display_eol = $display =~ s/(\r?\n)\z// ? $1 : "";
            @com_line = split(/\t/, $display, -1);
        }

        my $changed = 0;
        for my $i (0 .. $#he_line) {
            my $rename = $patient_attr_rename{$he_line[$i]};
            next unless $rename;
            $he_line[$i] = $rename->{header};
            # Only touch a display cell that actually exists; a missing or
            # short display line must not be padded with made-up cells.
            $com_line[$i] = $rename->{display} if $i <= $#com_line;
            $changed = 1;
        }
        next unless $changed;

        $lines[$header_idx] = join("\t", @he_line).$header_eol;
        if (defined $display_idx) {
            $lines[$display_idx] = join("\t", @com_line).$display_eol;
        }

        # Write the new content next to the original and rename it into place,
        # so a failed write cannot destroy the existing file.
        my $tmppath = $filepath.".tmp.".$$;
        open(my $out, '>', $tmppath) or die("can't open the file $tmppath: $!");
        my $written = print {$out} join("", @lines);
        my $closed  = close($out);
        unless ($written && $closed) {
            my $err = $!;
            unlink($tmppath);
            die("can't write the file $tmppath: $err");
        }
        unless (rename($tmppath, $filepath)) {
            my $err = $!;
            unlink($tmppath);
            die("can't replace the file $filepath: $err");
        }
    }
}
Add Comment
Please, Sign In to add comment