HiCpy_2

#!/usr/bin/env python
# HiCpy_2 was written by Samantha Taffner on May 26th, 2013

import sys
import os
import datetime


def run(first_file, second_file, folder_location):

#time ________________________________________________________________
    #get start time
    start_time = datetime.datetime.now()
    Y_M_D = start_time.strftime('%Y_%m_%d')

#find format of first_file and store as variable______________________________
    #call find_format(make sure first line is stored!)
    file_format = find_format(first_file)

    #raise format error and abort HiCpy if file_format not equal to 5 or 7
    if file_format != 5 and file_format != 7:
        return print('''File format ERROR: Please refer to the HiCpy user manual about query name
formats accepted by HiCpy''')

#get file names________________________________________________________________
    #get first file name
    first_file_dir = first_file.split('/')
    first_file_name = first_file_dir[-1]

    #get second file name
    second_file_dir = second_file.split('/')
    second_file_name = second_file_dir[-1]

#make folders________________________________________________________________
    #make first_file folder
    folder_1_location = folder_location + '/' + first_file_name + '_' + Y_M_D
    if not os.path.exists(folder_1_location): os.makedirs(folder_1_location)

    #make second_file folder
    folder_2_location = folder_location + '/' + second_file_name + '_' + Y_M_D
    if not os.path.exists(folder_2_location): os.makedirs(folder_2_location)

    #make cis folder
    cis_folder = folder_location + '/cis_paired_results_' + Y_M_D
    if not os.path.exists(cis_folder): os.makedirs(cis_folder)

#make out_files
    #make cis file
    cis_file = cis_folder + '/cis_results'  + Y_M_D + '.bed'
    make_file = open(cis_file,'w')
    make_file.close()


#call find_mapped twice for each input file________________________________
    find_mapped(first_file, folder_1_location, first_file_name)
    find_mapped(second_file, folder_2_location, second_file_name)

#call appropriate pairing function or raise a format error
    #if 5 call function five
    if file_format == 5:
        end_time = five(folder_1_location, folder_2_location, cis_file)
    #elif 7 call function seven
    elif file_format == 7:
        end_time = seven(folder_1_location, folder_2_location, cis_file)
#return total time
    elapsed_time = str(end_time - start_time)
    print('The amount of time that has elapsed since HiCpy started is:\n', elapsed_time)

def find_format(first_file):
    '''
    This function will find if format is:

    len(line[0]) = 7
    D99KWTL1:845:C214UACXX:6:1101:1404:2170 3:N:0:  -   chr15   33279215

    or

    len(line[0]) = 5
    HW-ST997_0116:3:1101:1213:2167#0/3  -   chr8    36287971

    '''

    #open
    with open((first_file), 'r') as f:

        #read one line
        line = f.readline()

        #extract length of query name
        split_line = line.strip(' ').split('\t')
        split_line = split_line[0].split(' ')
        len_query_name = len(split_line[0].split(':'))


    return len_query_name

def find_mapped(start_file, file_path, file_name):
    '''
    This function will seperate each seqencing into their corresponding the chr
    it was mapped to
    '''
    count = 0

    f_1 = open((file_path + '/1.sam'), 'w')
    f_2 = open((file_path + '/2.sam'), 'w')
    f_3 = open((file_path + '/3.sam'), 'w')
    f_4 = open((file_path + '/4.sam'), 'w')
    f_5 = open((file_path + '/5.sam'), 'w')
    f_6 = open((file_path + '/6.sam'), 'w')
    f_7 = open((file_path + '/7.sam'), 'w')
    f_8 = open((file_path + '/8.sam'), 'w')
    f_9 = open((file_path + '/9.sam'), 'w')
    f_10 = open((file_path + '/10.sam'), 'w')
    f_11 = open((file_path + '/11.sam'), 'w')
    f_12 = open((file_path + '/12.sam'), 'w')
    f_13 = open((file_path + '/13.sam'), 'w')
    f_14 = open((file_path + '/14.sam'), 'w')
    f_15 = open((file_path + '/15.sam'), 'w')
    f_16 = open((file_path + '/16.sam'), 'w')
    f_17 = open((file_path + '/17.sam'), 'w')
    f_18 = open((file_path + '/18.sam'), 'w')
    f_19 = open((file_path + '/19.sam'), 'w')
    f_Y = open((file_path + '/Y.sam'), 'w')
    f_X = open((file_path + '/X.sam'), 'w')
    f_M = open((file_path + '/M.sam'), 'w')

    file_dict = {'1':f_1, '2':f_2, '3':f_3, '4':f_4, '5':f_5, '6':f_6,
                 '7':f_7, '8':f_8, '9':f_9, '10':f_10, '11':f_11,
                 '12':f_12, '13':f_13, '14':f_14, '15':f_15, '16':f_16,
                 '17':f_17, '18':f_18, '19':f_19, 'Y':f_Y, 'X':f_X, 'M':f_M}

    print('Program has started on ' + file_name + ' and it will update you every time it reaches an interval of 1,000,000')

    with open(start_file, 'r') as sf:

        line = sf.readline()
        while line != "":
            count += 1
            split_line = line.strip().split('\t')

#ex.bad ['D99KWTL1:845:C214UACXX:6:1101:7094:2225', '1:N:0:', '77', '*'
#ex.good ['D99KWTL1:845:C214UACXX:6:1101:7166:2131', '83', 'chr7'
            if split_line[2].startswith('chr'):
                chro = split_line[2]
                chro_num = chro.strip('chr')
                file_dict[chro_num].write(line)


            if count % 1000000 == 0:
                print('current line = ', count)

            line = sf.readline()
        print('All done')


def seven(folder_1, folder_2, cis_file):
    '''
    Read file one store to dict then find pair in second file.  If pair not
    found write to not found file

    '''

    print("I'm now working on making the bed file for you")

    with open((cis_file), 'w') as wf:

#make a list from 1-x and M. This list is used to to iterate over so all
    #files are accessed

        file_list = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17', '18', '19','M','X','Y']

    #access all files in folder_1 = store_file

        for ch in file_list:
            store_dict = {}
            store_file_counter = 0
            read_file_counter = 0
            print("I'm working on chromsome ", ch)
            store_file = open((folder_1 +'/'+ ch + '.sam'), 'r')
            line = store_file.readline()

    #read threw store_file  and store in dict
            while line != "":
                store_file_counter += 1

                #counter for to keep user updated on progress
                if store_file_counter % 1000000 == 0:
                    print('current line store_file_counter is = ', store_file_counter)
                list_line = line.strip().split('\t')

    #work on getting .bed format for store_file
                chro = list_line[2]
                position = list_line[3]
                sign = list_line[1]

                if sign == '+':
                    ending_pos = int(position) + 35
                    bed = chro + '\t' + position + '\t' + str(ending_pos)
                elif sign == '-':
                    ending_pos = int(position) - 35
                    bed = chro + '\t' + str(ending_pos) + '\t' + position

    #get xy on CHIP location
                query_name_list = list_line[0].split(' ')
                query_name_list = query_name_list[0].split(':')

                xy = query_name_list[-2] + query_name_list[-1]


    #add to store_dict chromosome number, sign, and bed format
                store_dict[xy] = [position, sign, bed, xy]
                line = store_file.readline()


    #access file '1' = read_file from folder_2

            read_file = open((folder_2 +'/'+ ch + '.sam'), 'r')
            line_2 = read_file.readline()

    #read threw read_file find if it is in stored dict

            while line_2 != "":
                read_file_counter += 1

               #counter for to keep user updated on progress
                if read_file_counter % 1000000 == 0:
                    print('current line read_file_counter is = ', read_file_counter)

                list_line = line_2.strip().split('\t')

    #work on getting xy_location
                query_name_list = list_line[0].split(' ')
                query_name_list = query_name_list[0].split(':')

                xy_read = query_name_list[-2] + query_name_list[-1]


    #find matching xy_location in store dictionary
                if xy_read in store_dict.keys():

    #work on getting .bed format for read_file
                    chro = list_line[2]
                    position = list_line[3]

    #find out if absolute value of read_file position - store_position
    # is > 1000
                    if abs(int(store_dict[xy_read][0]) - int(position)) > 1000:
                        sign = list_line[1]

    #write + sign next to - sign frag
                        if sign == '+':
                            ending_pos = int(position) + 35
                            bed = chro + '\t' + position + '\t' + str(ending_pos)
                            to_write = bed + '\t' + store_dict[xy_read][2] + '\t' + xy_read + '\t' + store_dict[xy_read][3] +'\n'
                            wf.write(to_write)
                        elif sign == '-':
                            ending_pos = int(position) - 35
                            bed = chro + '\t' + str(ending_pos) + '\t' + position
                            to_write = store_dict[xy_read][2] + '\t' +  bed + '\t' + xy_read + '\t' + store_dict[xy_read][3] +'\n'
                            wf.write(to_write)


                line_2 = read_file.readline()

    print("I'done with the pairing")
    end_time = datetime.datetime.now()
    return end_time

def five(folder_1, folder_2, cis_file):
    print("I'm now working on making the bed file for you")

    with open((cis_file), 'w') as wf:

    #make a list from 1-x and M. This list is used to to iterate over so all
    #files are accessed

        file_list = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17', '18', '19','M','X','Y']

    #access all files in folder_1 = store_file

        for ch in file_list:
            store_dict = {}
            store_file_counter = 0
            read_file_counter = 0
            print("I'm working on chromsome ", ch)
            store_file = open((folder_1 +'/'+ ch + '.sam'), 'r')
            line = store_file.readline()

    #read threw store_file  and store in dict
            while line != "":
                store_file_counter += 1

                #counter for to keep user updated on progress
                if store_file_counter % 1000000 == 0:
                    print('current line store_file_counter is = ', store_file_counter)
                list_line = line.strip().split('\t')

    #work on getting .bed format for store_file
                chro = list_line[2]
                position = list_line[3]
                sign = list_line[1]
                if sign == '+':
                    ending_pos = int(position) + 35
                    bed = chro + '\t' + position + '\t' + str(ending_pos)
                elif sign == '-':
                    ending_pos = int(position) - 35
                    bed = chro + '\t' + str(ending_pos) + '\t' + position

    #get xy on CHIP location
                query_name_list = list_line[0].split(':')
                x = query_name_list[3]
                y = query_name_list[4].split('#')
                y = y[0]
                xy = x + y

    #add to store_dict chromosome number, sign, and bed format
                store_dict[xy] = [position, sign, bed, xy]
                line = store_file.readline()


    #access file '1' = read_file from folder_2

            read_file = open((folder_2 +'/'+ ch + '.sam'), 'r')
            line_2 = read_file.readline()

    #read threw read_file find if it is in stored dict

            while line_2 != "":
                read_file_counter += 1

               #counter for to keep user updated on progress
                if read_file_counter % 1000000 == 0:
                    print('current line read_file_counter is = ', read_file_counter)

                list_line = line_2.strip().split('\t')

    #work on getting xy_location
                query_name_list = list_line[0].split(':')
                x = query_name_list[3]
                y = query_name_list[4].split('#')
                y = y[0]
                xy_read = x + y

    #find matching xy_location in store dictionary
                if xy_read in store_dict.keys():

    #work on getting .bed format for read_file
                    chro = list_line[2]
                    position = list_line[3]

    #find out if absolute value of read_file position - store_position
    # is > 1000
                    if abs(int(store_dict[xy_read][0]) - int(position)) > 1000:
                        sign = list_line[1]

    #write + sign next to - sign frag
                        if sign == '+':
                            ending_pos = int(position) + 35
                            bed = chro + '\t' + position + '\t' + str(ending_pos)
                            to_write = bed + '\t' + store_dict[xy_read][2] + '\t' + xy_read + '\t' + store_dict[xy_read][3] +'\n'
                            wf.write(to_write)
                        elif sign == '-':
                            ending_pos = int(position) - 35
                            bed = chro + '\t' + str(ending_pos) + '\t' + position
                            to_write = store_dict[xy_read][2] + '\t' +  bed + '\t' + xy_read + '\t' + store_dict[xy_read][3] +'\n'
                            wf.write(to_write)


                line_2 = read_file.readline()

    print("I'done with the pairing")
    end_time = datetime.datetime.now()
    return end_time


if __name__ == '__main__':
    run('First_SAM_file', 'Second_SAM_file', 'Folder_to_save_data_to')