Guest User

Untitled

a guest
Feb 24th, 2018
93
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.76 KB | None | 0 0
  1. #! /usr/bin/perl
  2. use strict; use warnings;
  3. # This script will parce methylation table obtained
  4. # using bedtools unionbedg and will keep only the rows
  5. # the contain methylation data in minimum N samples.
  6. # The table has to contain only 2 experimental groups
  7. # and the samples belonging to this group have to placed in the table sequentially
  8. # i.e.
  9. # chr start end g1 g1 g1 g1 g2 g2 g2 g2
  10.  
  11. # Command line arguments in this exact sequence, since I'm lazy
  12. # to use the proper command line options
  13. my $meth_table = shift or die "Please provide methylation table\n"; # name of the file
  14. my $num_g1 = shift or die "Please provide number of samples in group 1\n"; # number of samples in group 1 <int>
  15. my $num_g2 = shift or die "Please provide number of samples in group 2\n"; # number of samples in group 2 <int>
  16. my $N = shift or die "Please provide minimum number of valid samples per group\n"; # minimum samples per group <int>
  17.  
  18. # Specify global variables
  19. my @data_cols = ();
  20.  
  21. # Read methylation table, return columns with more then N valid data
  22. # points per group
  23. open( my $meth_fh, "<", $meth_table ) or die "Cannot open methylation data file\n";
  24. # print header (first line of the methylation table)
  25. print scalar(<$meth_fh>);
  26. while( <$meth_fh> )
  27. {
  28. chomp;
  29. if ( m/^chr/ )
  30. {
  31. next;
  32. }
  33. else
  34. {
  35. @data_cols = split("\t");
  36. my $filtered_col_ref = analyze_data_cols(\@data_cols, $num_g1, $num_g2, $N);
  37. if ( defined($filtered_col_ref) )
  38. {
  39. print join("\t", @$filtered_col_ref), "\n";
  40. }
  41.  
  42. }
  43. }
  44.  
  45. ######################### Subroutines ##############################
  46. # This sub will take each row of the methylation data table and return rows
  47. # where each of the experimental groups have valid methylation data in the minimum
  48. # of N samples
  49. sub analyze_data_cols
  50. {
  51. # Get methylation data array, number of samples in group1, group2 and
  52. # the minimum number of valid samples per group
  53. my $data_cols_ref = shift;
  54. my $num_g1 = shift;
  55. my $num_g2 = shift;
  56. my $N = shift;
  57.  
  58. # Dereference methylation data array
  59. my @data_cols = @$data_cols_ref;
  60. my @group1_cols = @data_cols[3 .. 3 + ($num_g1 - 1)]; # select columns methylation data from group 1
  61. my @group2_cols = @data_cols[(3 + $num_g1) .. ($num_g1 + 1 + $num_g2 + 1)]; # select methylation data from group 2
  62. # Check that the data from each group contains at least N valid data points
  63. my @group1_valid = grep { $_ =~ m/^\d+/ } @group1_cols; # get array of valid data points from group1
  64. my @group2_valid = grep { $_ =~ m/^\d+/ } @group2_cols; # get array of valid data points from group2
  65. if ( (scalar(@group1_valid) >= $N) and (scalar(@group2_valid) >= $N) )
  66. {
  67. return \@data_cols;
  68. }
  69. else
  70. {
  71. return;
  72. }
  73. }
Add Comment
Please, Sign In to add comment