Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #! /usr/bin/perl
- use strict; use warnings;
- # This script will parce methylation table obtained
- # using bedtools unionbedg and will keep only the rows
- # the contain methylation data in minimum N samples.
- # The table has to contain only 2 experimental groups
- # and the samples belonging to this group have to placed in the table sequentially
- # i.e.
- # chr start end g1 g1 g1 g1 g2 g2 g2 g2
- # Command line arguments in this exact sequence, since I'm lazy
- # to use the proper command line options
- my $meth_table = shift or die "Please provide methylation table\n"; # name of the file
- my $num_g1 = shift or die "Please provide number of samples in group 1\n"; # number of samples in group 1 <int>
- my $num_g2 = shift or die "Please provide number of samples in group 2\n"; # number of samples in group 2 <int>
- my $N = shift or die "Please provide minimum number of valid samples per group\n"; # minimum samples per group <int>
- # Specify global variables
- my @data_cols = ();
- # Read methylation table, return columns with more then N valid data
- # points per group
- open( my $meth_fh, "<", $meth_table ) or die "Cannot open methylation data file\n";
- # print header (first line of the methylation table)
- print scalar(<$meth_fh>);
- while( <$meth_fh> )
- {
- chomp;
- if ( m/^chr/ )
- {
- next;
- }
- else
- {
- @data_cols = split("\t");
- my $filtered_col_ref = analyze_data_cols(\@data_cols, $num_g1, $num_g2, $N);
- if ( defined($filtered_col_ref) )
- {
- print join("\t", @$filtered_col_ref), "\n";
- }
- }
- }
- ######################### Subroutines ##############################
- # This sub will take each row of the methylation data table and return rows
- # where each of the experimental groups have valid methylation data in the minimum
- # of N samples
- sub analyze_data_cols
- {
- # Get methylation data array, number of samples in group1, group2 and
- # the minimum number of valid samples per group
- my $data_cols_ref = shift;
- my $num_g1 = shift;
- my $num_g2 = shift;
- my $N = shift;
- # Dereference methylation data array
- my @data_cols = @$data_cols_ref;
- my @group1_cols = @data_cols[3 .. 3 + ($num_g1 - 1)]; # select columns methylation data from group 1
- my @group2_cols = @data_cols[(3 + $num_g1) .. ($num_g1 + 1 + $num_g2 + 1)]; # select methylation data from group 2
- # Check that the data from each group contains at least N valid data points
- my @group1_valid = grep { $_ =~ m/^\d+/ } @group1_cols; # get array of valid data points from group1
- my @group2_valid = grep { $_ =~ m/^\d+/ } @group2_cols; # get array of valid data points from group2
- if ( (scalar(@group1_valid) >= $N) and (scalar(@group2_valid) >= $N) )
- {
- return \@data_cols;
- }
- else
- {
- return;
- }
- }
Add Comment
Please, Sign In to add comment