Untitled

#! /usr/bin/perl
use strict; use warnings;
# This script will parce methylation table obtained
# using bedtools unionbedg and will keep only the rows
# the contain methylation data in minimum N samples.
# The table has to contain only 2 experimental groups
# and the samples belonging to this group have to placed in the table sequentially
# i.e.
# chr start end g1 g1 g1 g1 g2 g2 g2 g2

# Command line arguments in this exact sequence, since I'm lazy
# to use the proper command line options
my $meth_table = shift or die "Please provide methylation table\n"; # name of the file
my $num_g1     = shift or die "Please provide number of samples in group 1\n"; # number of samples in group 1 <int>
my $num_g2     = shift or die "Please provide number of samples in group 2\n"; # number of samples in group 2 <int>
my $N          = shift or die "Please provide minimum number of valid samples per group\n"; # minimum samples per group <int>

# Specify global variables
my @data_cols   = ();

# Read methylation table, return columns with more then N valid data
# points per group
open( my $meth_fh, "<", $meth_table ) or die "Cannot open methylation data file\n";
# print header (first line of the methylation table)
print scalar(<$meth_fh>);
while( <$meth_fh> )
{
   chomp;
   if ( m/^chr/ )
   {
        next;
   }
   else
   {
        @data_cols = split("\t");
        my $filtered_col_ref = analyze_data_cols(\@data_cols, $num_g1, $num_g2, $N);
        if ( defined($filtered_col_ref) )
        {
            print join("\t", @$filtered_col_ref), "\n";
        }

   }
}

######################### Subroutines ##############################
# This sub will take each row of the methylation data table and return rows
# where each of the experimental groups have valid methylation data in the minimum
# of N samples
sub analyze_data_cols
{
    # Get methylation data array, number of samples in group1, group2 and
    # the minimum number of valid samples per group
    my $data_cols_ref = shift;
    my $num_g1 = shift;
    my $num_g2 = shift;
    my $N = shift;

    # Dereference methylation data array
    my @data_cols = @$data_cols_ref;
    my @group1_cols = @data_cols[3 .. 3 + ($num_g1 - 1)]; # select columns methylation data from group 1
    my @group2_cols = @data_cols[(3 + $num_g1) .. ($num_g1 + 1 + $num_g2 + 1)]; # select methylation data from group 2
    # Check that the data from each group contains at least N valid data points
    my @group1_valid = grep { $_ =~ m/^\d+/ } @group1_cols; # get array of valid data points from group1
    my @group2_valid = grep { $_ =~ m/^\d+/ } @group2_cols; # get array of valid data points from group2
    if ( (scalar(@group1_valid) >= $N) and (scalar(@group2_valid) >= $N) )
    {
        return \@data_cols;
    }
    else
    {
        return;
    }
}