HiCpy

#!/usr/bin/env python
# this function was written in python 3 by Samantha Taffner

import sys
import os

def run(first_file, second_file, folder_location):
    '''
    |   HiCpy is designed to bridge the speed gap between the very fast short
    |   single read alignment and the considerably slower paired-end alignment.
    |   HiCpy works in conjuction with Bowtie to accomplish an efficient Hi-C
    |   analysis pipeline. Bowtie is called an ultrafast memory-efficient
    |   short read aligner and there is not any denying that for single
    |   read sequencing it is very fast.  However, it is lacking tremendously
    |   in speed for paired-end alignment.  After 2 weeks it finally finished
    |   while using 7 processors. HiCpy shortens the time to 4 hours on a
    |   2 processor machine.
    |
    |   Go to HiCpy manual for more information.
    '''
    first_file_dir = first_file.split('/')
    first_file_name = first_file_dir[-1]

    inside_folder_1 = folder_location + '/first_file'

    if not os.path.exists(inside_folder_1): os.makedirs(inside_folder_1)

    find_mapped(first_file, inside_folder_1, first_file_name)

    inside_folder_2 = folder_location + '/second_file'
    if not os.path.exists(inside_folder_2): os.makedirs(inside_folder_2)

    second_file_dir = second_file.split('/')
    second_file_name = second_file_dir[-1]

    find_mapped(first_file, inside_folder_2, second_file_name)

    out_folder = folder_location + '/your_results'
    if not os.path.exists(out_folder): os.makedirs(out_folder)

    out_file = out_folder + '/your_results.bed'
    make_file = open(out_file,'w')
    make_file.close()

    pairing(inside_folder_1, inside_folder_2, out_file)


def find_mapped(start_file, file_path, file_name):


    count = 0

    f_1 = open((file_path + '/1.sam'), 'w')
    f_2 = open((file_path + '/2.sam'), 'w')
    f_3 = open((file_path + '/3.sam'), 'w')
    f_4 = open((file_path + '/4.sam'), 'w')
    f_5 = open((file_path + '/5.sam'), 'w')
    f_6 = open((file_path + '/6.sam'), 'w')
    f_7 = open((file_path + '/7.sam'), 'w')
    f_8 = open((file_path + '/8.sam'), 'w')
    f_9 = open((file_path + '/9.sam'), 'w')
    f_10 = open((file_path + '/10.sam'), 'w')
    f_11 = open((file_path + '/11.sam'), 'w')
    f_12 = open((file_path + '/12.sam'), 'w')
    f_13 = open((file_path + '/13.sam'), 'w')
    f_14 = open((file_path + '/14.sam'), 'w')
    f_15 = open((file_path + '/15.sam'), 'w')
    f_16 = open((file_path + '/16.sam'), 'w')
    f_17 = open((file_path + '/17.sam'), 'w')
    f_18 = open((file_path + '/18.sam'), 'w')
    f_19 = open((file_path + '/19.sam'), 'w')
    f_Y = open((file_path + '/Y.sam'), 'w')
    f_X = open((file_path + '/X.sam'), 'w')
    f_M = open((file_path + '/M.sam'), 'w')

    file_dict = {'1':f_1, '2':f_2, '3':f_3, '4':f_4, '5':f_5, '6':f_6, '7':f_7, '8':f_8, '9':f_9, '10':f_10, '11':f_11,
                 '12':f_12, '13':f_13, '14':f_14, '15':f_15, '16':f_16, '17':f_17, '18':f_18, '19':f_19, 'Y':f_Y, 'X':f_X, 'M':f_M}

    print('Program has started on ' + file_name + ' and it will update you every time it reaches an interval of 1,000,000')

    with open(start_file, 'r') as sf:

        line = sf.readline()
        while line != "":
            count += 1
            split_line = line.strip().split('\t')

#ex.bad ['D99KWTL1:845:C214UACXX:6:1101:7094:2225', '1:N:0:', '77', '*'
#ex.good ['D99KWTL1:845:C214UACXX:6:1101:7166:2131', '83', 'chr7'
            if split_line[2].startswith('chr'):
                chro = split_line[2]
                chro_num = chro.strip('chr')
                file_dict[chro_num].write(line)


            if count % 1000000 == 0:
                print('current line = ', count)

            line = sf.readline()
        print('All done')


def pairing(folder_1, folder_2, out_file):

    print("I'm now working on making the bed file for you")

    with open((out_file), 'w') as wf:

    #make a list from 1-x and M. This list is used to to iterate over so all
    #files are accessed

        file_list = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17', '18', '19','M','X','Y']

    #access all files in folder_1 = store_file

        for ch in file_list:
            store_dict = {}
            store_file_counter = 0
            read_file_counter = 0
            print("I'm working on chromsome ", ch)
            store_file = open((folder_1 +'/'+ ch + '.sam'), 'r')

            line = store_file.readline()

    #read threw store_file  and store in dict
            while line != "":
                store_file_counter += 1

                #counter for to keep user updated on progress
                if store_file_counter % 1000000 == 0:
                    print('current line store_file_counter is = ', store_file_counter)
                list_line = line.strip().split('\t')

    #work on getting .bed format for store_file
                chro = list_line[2]
                position = list_line[3]
                sign = list_line[1]
                if sign == '+':
                    ending_pos = int(position) + 35
                    bed = chro + '\t' + position + '\t' + str(ending_pos)
                elif sign == '-':
                    ending_pos = int(position) - 35
                    bed = chro + '\t' + str(ending_pos) + '\t' + position


    #work on getting xy_location
                s_0 = list_line[0]
                s_0_split = s_0.split(':')
                s_6 = s_0_split[6].split(' ')

                xy_location = s_0_split[5] + s_6[0]

    #add to store_dict chromosome number, sign, and bed format
                store_dict[xy_location] = [position, sign, bed]
                line = store_file.readline()


    #access file '1' = read_file from folder_2

            read_file = open((folder_2 +'/'+ ch + '.sam'), 'r')
            line_2 = read_file.readline()

    #read threw read_file find if it is in stored dict

            while line_2 != "":
               read_file_counter += 1

               #counter for to keep user updated on progress
               if read_file_counter % 1000000 == 0:
                    print('current line read_file_counter is = ', read_file_counter)

               list_line = line_2.strip().split('\t')

    #work on getting xy_location
               s_0 = list_line[0]
               s_0_split = s_0.split(':')
               s_6 = s_0_split[6].split(' ')
               xy_location_read_file = s_0_split[5] + s_6[0]

    #find matching xy_location in store dictionary
               if xy_location_read_file in store_dict.keys():


    #work on getting .bed format for read_file
                   chro = list_line[2]
                   position = list_line[3]

    #find out if absolute value of read_file position - store_position
    # is > 1000
                   if abs(int(store_dict[xy_location_read_file][0]) - int(position)) > 1000:
                       sign = list_line[1]

    #write + sign next to - sign frag
                       if sign == '+':
                           ending_pos = int(position) + 35
                           bed = chro + '\t' + position + '\t' + str(ending_pos)
                           to_write = bed + '\t' + store_dict[xy_location_read_file][2] + '\n'
                           wf.write(to_write)
                       elif sign == '-':
                           ending_pos = int(position) - 35
                           bed = chro + '\t' + str(ending_pos) + '\t' + position
                           to_write = store_dict[xy_location_read_file][2] + '\t' +  bed + '\n'
                           wf.write(to_write)


               line_2 = read_file.readline()

    print("I'done with the pairing")

if __name__ == '__main__':

    run("/Users/Bea/Desktop/samantha_folder/sam/5_11_13/R1_cDNA_35bp.sam", "/Users/Bea/Desktop/samantha_folder/sam/5_11_13/R3_cDNA_35bp.sam", "/Users/Bea/Desktop/5_15_13")
    #started 9:32 AM ended at 9:58 AM