Guest User

Untitled

a guest
Nov 16th, 2018
111
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.61 KB | None | 0 0
  1. #!/usr/bin/perl
  2.  
  3. use strict;
  4. use warnings;
  5. use File::Copy;
  6.  
  7. # This script reads all provisional (*_tcga) studies on datahub and applies the following fixes:
  8. # 1. Checks that the SAMPLE_TYPE value corresponds to the sample code at the end of the TCGA barcode (SAMPLE_ID); if not, fixes it in place.
  9. # 2. Corrects the SAMPLE_TYPE_ID column so that it matches the same barcode sample code.
  10. # 3. Renames the PRIMARY_SITE, METASTATIC_SITE and TUMOR_TISSUE_SITE attribute headers, and their display names on the first comment line, to the patient-level attributes PRIMARY_SITE_PATIENT, METASTATIC_SITE_PATIENT and SITE_OF_TUMOR_TISSUE.
  11.  
  # Sample code (last "-"-separated field of SAMPLE_ID) => SAMPLE_TYPE value
  # written by this script.
  my %bar_codes = (
      '01' => 'Primary',
      '02' => 'Recurrent',
      '03' => 'Primary',
      '05' => 'Primary',
      '06' => 'Metastatic',
      '07' => 'Metastatic',
      '11' => 'Normal',
      '12' => 'Normal',
  );

  # Same sample code => SAMPLE_TYPE_ID value (the code without its leading zero).
  my %sampletypeids = (
      '01' => '1',
      '02' => '2',
      '03' => '3',
      '05' => '5',
      '06' => '6',
      '07' => '7',
      '11' => '11',
      '12' => '12',
  );

  # Root folder whose sub-directories are scanned for studies.
  my $path = "/Users/rmadupuri/GitHub/datahub/public";

  # Names (not full paths) of the study directories to process.
  my @dirs;

  # Fail loudly if the root cannot be read; otherwise the script would
  # silently process nothing.
  opendir(my $root_dh, $path) or die("can't open directory $path: $!");
  while (defined(my $dir = readdir($root_dh))) {
      next if $dir =~ /^\./;    # skip hidden entries, "." and ".."

      # For provisional studies
      push(@dirs, $dir) if $dir =~ /._tcga$/;
  }
  closedir($root_dh);
  26.  
  27.  
  # correct the sample types
  #
  # For every study directory in @dirs, each file whose name matches
  # /.data.sample./ is rewritten so that SAMPLE_TYPE and SAMPLE_TYPE_ID agree
  # with the sample code at the end of SAMPLE_ID (see %bar_codes and
  # %sampletypeids).
  foreach my $dirname (@dirs) {
      my $dirpath = $path."/".$dirname;
      next unless -d $dirpath;

      # Collect the names up front: files are replaced through a temporary
      # file in the same directory, which must not be picked up by the scan.
      opendir(my $study_dh, $dirpath) or die("can't open directory $dirpath: $!");
      my @sample_files = grep { /.data.sample./ } readdir($study_dh);
      closedir($study_dh);

      foreach my $file (@sample_files) {
          my $filepath = $dirpath."/".$file;

          open(my $in, '<', $filepath) or die("can't open the file $filepath: $!");
          my @lines = <$in>;
          close($in);

          my $fixed = _fix_sample_lines(\@lines, \%bar_codes, \%sampletypeids);
          if (!defined $fixed) {
              # Data rows without a usable header: leave the file untouched
              # instead of overwriting arbitrary columns.
              warn("skipping $filepath: no SAMPLE_ID column found in header\n");
              next;
          }
          _write_sample_file($filepath, $fixed);
      }
  }

  # Splits a line into (text, line ending); the ending is "\n", "\r\n" or ""
  # so it can be re-appended unchanged after the fields are edited.
  sub _split_sample_eol {
      my ($line) = @_;
      if ($line =~ /\A(.*?)(\r?\n)\z/s) {
          return ($1, $2);
      }
      return ($line, "");
  }

  # Builds the corrected content of a sample file.
  #
  # Parameters:
  #   $lines       - array ref of the file's lines, line endings included
  #   $type_for    - hash ref: sample code => SAMPLE_TYPE value
  #   $type_id_for - hash ref: sample code => SAMPLE_TYPE_ID value
  #
  # Returns the new content as a single string, or undef when a data row is
  # reached before a header containing SAMPLE_ID has been seen.
  #
  # Lines starting with "#" and the header line (starting with PATIENT_ID or
  # SAMPLE_ID) are copied verbatim. Rows whose sample code is not in
  # $type_for, or that are too short to hold SAMPLE_ID, are also copied
  # verbatim rather than having their values blanked.
  sub _fix_sample_lines {
      my ($lines, $type_for, $type_id_for) = @_;

      my ($id_col, $type_col, $type_id_col);
      my $content = "";

      foreach my $line (@$lines) {
          if ($line =~ /^#/) {
              $content .= $line;
              next;
          }

          # Strip the line ending before splitting so the last column compares
          # equal to its name/value; limit -1 keeps trailing empty fields.
          my ($text, $eol) = _split_sample_eol($line);
          my @fields = split(/\t/, $text, -1);

          if ($line =~ /^PATIENT_ID/ || $line =~ /^SAMPLE_ID/) {
              $content .= $line;
              for my $i (0 .. $#fields) {
                  if ($fields[$i] eq "SAMPLE_ID")      { $id_col      = $i }
                  if ($fields[$i] eq "SAMPLE_TYPE")    { $type_col    = $i }
                  if ($fields[$i] eq "SAMPLE_TYPE_ID") { $type_id_col = $i }
              }
              next;
          }

          return undef unless defined $id_col;

          my $code;
          if ($id_col <= $#fields) {
              # The sample code is the last "-"-separated part of SAMPLE_ID.
              $code = (split(/-/, $fields[$id_col]))[-1];
          }
          if (!defined $code || !exists $type_for->{$code}) {
              $content .= $line;
              next;
          }

          if (defined $type_col && $type_col <= $#fields) {
              $fields[$type_col] = $type_for->{$code};
          }
          if (defined $type_id_col && $type_id_col <= $#fields
              && exists $type_id_for->{$code}) {
              $fields[$type_id_col] = $type_id_for->{$code};
          }
          $content .= join("\t", @fields).$eol;
      }

      return $content;
  }

  # Replaces $filepath with $content. The data is written to "$filepath.tmp"
  # first and renamed over the original only after a successful close, so a
  # failed write never leaves the study without its file.
  sub _write_sample_file {
      my ($filepath, $content) = @_;
      my $tmp = $filepath.".tmp";

      open(my $out, '>', $tmp) or die("can't write $tmp: $!");
      print {$out} $content or die("can't write $tmp: $!");
      close($out) or die("can't close $tmp: $!");
      rename($tmp, $filepath) or die("can't replace $filepath: $!");
      return;
  }
  108.  
  109.  
  110.  
  # change the patient attribute names
  #
  # For every study directory in @dirs, each file whose name matches
  # /.data.patient./ gets its PRIMARY_SITE, METASTATIC_SITE and
  # TUMOR_TISSUE_SITE columns renamed, both in the header row and in the
  # first "#" line (display names).
  foreach my $dirname (@dirs) {
      my $dirpath = $path."/".$dirname;
      next unless -d $dirpath;

      # Collect the names up front: files are replaced through a temporary
      # file in the same directory, which must not be picked up by the scan.
      opendir(my $study_dh, $dirpath) or die("can't open directory $dirpath: $!");
      my @patient_files = grep { /.data.patient./ } readdir($study_dh);
      closedir($study_dh);

      foreach my $file (@patient_files) {
          my $filepath = $dirpath."/".$file;

          open(my $in, '<', $filepath) or die("can't open the file $filepath: $!");
          my @lines = <$in>;
          close($in);

          my $renamed = _rename_patient_site_columns(\@lines);
          next unless defined $renamed;    # nothing to rename: keep the file as is
          _write_patient_file($filepath, $renamed);
      }
  }

  # Splits a line into (text, line ending); the ending is "\n", "\r\n" or ""
  # so it can be re-appended unchanged after the fields are edited.
  sub _split_patient_eol {
      my ($line) = @_;
      if ($line =~ /\A(.*?)(\r?\n)\z/s) {
          return ($1, $2);
      }
      return ($line, "");
  }

  # Builds the content of a patient file with the site columns renamed.
  #
  # Parameter:
  #   $lines - array ref of the file's lines, line endings included
  #
  # Returns the new content as a single string, or undef when there is no
  # header line starting with OTHER_PATIENT_ID or it holds none of the
  # target columns.
  #
  # Output order: first "#" line, remaining "#" lines, header, all other lines.
  sub _rename_patient_site_columns {
      my ($lines) = @_;

      my %new_header_for = (
          "PRIMARY_SITE"      => "PRIMARY_SITE_PATIENT",
          "METASTATIC_SITE"   => "METASTATIC_SITE_PATIENT",
          "TUMOR_TISSUE_SITE" => "SITE_OF_TUMOR_TISSUE",
      );
      my %new_display_for = (
          "PRIMARY_SITE"      => "Patient Primary Tumor Site",
          "METASTATIC_SITE"   => "Patient Metastatic Sites",
          "TUMOR_TISSUE_SITE" => "Tumor Tissue Site",
      );

      my $display_line;     # first "#" line
      my $comments = "";    # every later "#" line
      my $header;           # line starting with OTHER_PATIENT_ID
      my $file_cont = "";   # everything else

      foreach my $line (@$lines) {
          if ($line =~ /^#/) {
              if (!defined $display_line) {
                  $display_line = $line;
              }
              else {
                  $comments .= $line;
              }
          }
          elsif ($line =~ /^OTHER_PATIENT_ID/) {
              $header = $line;
          }
          else {
              $file_cont .= $line;
          }
      }

      return undef unless defined $header;

      # Strip line endings before splitting so a target column in last
      # position still matches; limit -1 keeps trailing empty fields.
      my ($head_text, $head_eol) = _split_patient_eol($header);
      my @he_line = split(/\t/, $head_text, -1);

      my @com_line;
      my $disp_eol = "";
      if (defined $display_line) {
          my ($disp_text, $eol) = _split_patient_eol($display_line);
          @com_line = split(/\t/, $disp_text, -1);
          $disp_eol = $eol;
      }

      my $changed = 0;
      for my $i (0 .. $#he_line) {
          my $old_name = $he_line[$i];
          next unless exists $new_header_for{$old_name};

          $he_line[$i] = $new_header_for{$old_name};
          # Only touch the display line where it actually has that column;
          # never pad a short or missing display line with empty fields.
          $com_line[$i] = $new_display_for{$old_name} if $i <= $#com_line;
          $changed = 1;
      }
      return undef unless $changed;

      my $new_disp_line = defined $display_line
          ? join("\t", @com_line).$disp_eol
          : "";
      my $new_head_line = join("\t", @he_line).$head_eol;

      return $new_disp_line.$comments.$new_head_line.$file_cont;
  }

  # Replaces $filepath with $content. The data is written to "$filepath.tmp"
  # first and renamed over the original only after a successful close, so a
  # failed write never leaves the study without its file.
  sub _write_patient_file {
      my ($filepath, $content) = @_;
      my $tmp = $filepath.".tmp";

      open(my $out, '>', $tmp) or die("can't write $tmp: $!");
      print {$out} $content or die("can't write $tmp: $!");
      close($out) or die("can't close $tmp: $!");
      rename($tmp, $filepath) or die("can't replace $filepath: $!");
      return;
  }
Add Comment
Please, Sign In to add comment