Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- # HiCpy_2 was written by Samantha Taffner on May 26th, 2013
- import sys
- import os
- import datetime
- def run(first_file, second_file, folder_location):
- #time ________________________________________________________________
- #get start time
- start_time = datetime.datetime.now()
- Y_M_D = start_time.strftime('%Y_%m_%d')
- #find format of first_file and store as variable______________________________
- #call find_format(make sure first line is stored!)
- file_format = find_format(first_file)
- #raise format error and abort HiCpy if file_format not equal to 5 or 7
- if file_format != 5 and file_format != 7:
- return print('''File format ERROR: Please refer to the HiCpy user manual about query name
- formats accepted by HiCpy''')
- #get file names________________________________________________________________
- #get first file name
- first_file_dir = first_file.split('/')
- first_file_name = first_file_dir[-1]
- #get second file name
- second_file_dir = second_file.split('/')
- second_file_name = second_file_dir[-1]
- #make folders________________________________________________________________
- #make first_file folder
- folder_1_location = folder_location + '/' + first_file_name + '_' + Y_M_D
- if not os.path.exists(folder_1_location): os.makedirs(folder_1_location)
- #make second_file folder
- folder_2_location = folder_location + '/' + second_file_name + '_' + Y_M_D
- if not os.path.exists(folder_2_location): os.makedirs(folder_2_location)
- #make cis folder
- cis_folder = folder_location + '/cis_paired_results_' + Y_M_D
- if not os.path.exists(cis_folder): os.makedirs(cis_folder)
- #make out_files
- #make cis file
- cis_file = cis_folder + '/cis_results' + Y_M_D + '.bed'
- make_file = open(cis_file,'w')
- make_file.close()
- #call find_mapped twice for each input file________________________________
- find_mapped(first_file, folder_1_location, first_file_name)
- find_mapped(second_file, folder_2_location, second_file_name)
- #call appropriate pairing function or raise a format error
- #if 5 call function five
- if file_format == 5:
- end_time = five(folder_1_location, folder_2_location, cis_file)
- #elif 7 call function seven
- elif file_format == 7:
- end_time = seven(folder_1_location, folder_2_location, cis_file)
- #return total time
- elapsed_time = str(end_time - start_time)
- print('The amount of time that has elapsed since HiCpy started is:\n', elapsed_time)
- def find_format(first_file):
- '''
- This function will find if format is:
- len(line[0]) = 7
- D99KWTL1:845:C214UACXX:6:1101:1404:2170 3:N:0: - chr15 33279215
- or
- len(line[0]) = 5
- HW-ST997_0116:3:1101:1213:2167#0/3 - chr8 36287971
- '''
- #open
- with open((first_file), 'r') as f:
- #read one line
- line = f.readline()
- #extract length of query name
- split_line = line.strip(' ').split('\t')
- split_line = split_line[0].split(' ')
- len_query_name = len(split_line[0].split(':'))
- return len_query_name
- def find_mapped(start_file, file_path, file_name):
- '''
- This function will seperate each seqencing into their corresponding the chr
- it was mapped to
- '''
- count = 0
- f_1 = open((file_path + '/1.sam'), 'w')
- f_2 = open((file_path + '/2.sam'), 'w')
- f_3 = open((file_path + '/3.sam'), 'w')
- f_4 = open((file_path + '/4.sam'), 'w')
- f_5 = open((file_path + '/5.sam'), 'w')
- f_6 = open((file_path + '/6.sam'), 'w')
- f_7 = open((file_path + '/7.sam'), 'w')
- f_8 = open((file_path + '/8.sam'), 'w')
- f_9 = open((file_path + '/9.sam'), 'w')
- f_10 = open((file_path + '/10.sam'), 'w')
- f_11 = open((file_path + '/11.sam'), 'w')
- f_12 = open((file_path + '/12.sam'), 'w')
- f_13 = open((file_path + '/13.sam'), 'w')
- f_14 = open((file_path + '/14.sam'), 'w')
- f_15 = open((file_path + '/15.sam'), 'w')
- f_16 = open((file_path + '/16.sam'), 'w')
- f_17 = open((file_path + '/17.sam'), 'w')
- f_18 = open((file_path + '/18.sam'), 'w')
- f_19 = open((file_path + '/19.sam'), 'w')
- f_Y = open((file_path + '/Y.sam'), 'w')
- f_X = open((file_path + '/X.sam'), 'w')
- f_M = open((file_path + '/M.sam'), 'w')
- file_dict = {'1':f_1, '2':f_2, '3':f_3, '4':f_4, '5':f_5, '6':f_6,
- '7':f_7, '8':f_8, '9':f_9, '10':f_10, '11':f_11,
- '12':f_12, '13':f_13, '14':f_14, '15':f_15, '16':f_16,
- '17':f_17, '18':f_18, '19':f_19, 'Y':f_Y, 'X':f_X, 'M':f_M}
- print('Program has started on ' + file_name + ' and it will update you every time it reaches an interval of 1,000,000')
- with open(start_file, 'r') as sf:
- line = sf.readline()
- while line != "":
- count += 1
- split_line = line.strip().split('\t')
- #ex.bad ['D99KWTL1:845:C214UACXX:6:1101:7094:2225', '1:N:0:', '77', '*'
- #ex.good ['D99KWTL1:845:C214UACXX:6:1101:7166:2131', '83', 'chr7'
- if split_line[2].startswith('chr'):
- chro = split_line[2]
- chro_num = chro.strip('chr')
- file_dict[chro_num].write(line)
- if count % 1000000 == 0:
- print('current line = ', count)
- line = sf.readline()
- print('All done')
- def seven(folder_1, folder_2, cis_file):
- '''
- Read file one store to dict then find pair in second file. If pair not
- found write to not found file
- '''
- print("I'm now working on making the bed file for you")
- with open((cis_file), 'w') as wf:
- #make a list from 1-x and M. This list is used to to iterate over so all
- #files are accessed
- file_list = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17', '18', '19','M','X','Y']
- #access all files in folder_1 = store_file
- for ch in file_list:
- store_dict = {}
- store_file_counter = 0
- read_file_counter = 0
- print("I'm working on chromsome ", ch)
- store_file = open((folder_1 +'/'+ ch + '.sam'), 'r')
- line = store_file.readline()
- #read threw store_file and store in dict
- while line != "":
- store_file_counter += 1
- #counter for to keep user updated on progress
- if store_file_counter % 1000000 == 0:
- print('current line store_file_counter is = ', store_file_counter)
- list_line = line.strip().split('\t')
- #work on getting .bed format for store_file
- chro = list_line[2]
- position = list_line[3]
- sign = list_line[1]
- if sign == '+':
- ending_pos = int(position) + 35
- bed = chro + '\t' + position + '\t' + str(ending_pos)
- elif sign == '-':
- ending_pos = int(position) - 35
- bed = chro + '\t' + str(ending_pos) + '\t' + position
- #get xy on CHIP location
- query_name_list = list_line[0].split(' ')
- query_name_list = query_name_list[0].split(':')
- xy = query_name_list[-2] + query_name_list[-1]
- #add to store_dict chromosome number, sign, and bed format
- store_dict[xy] = [position, sign, bed, xy]
- line = store_file.readline()
- #access file '1' = read_file from folder_2
- read_file = open((folder_2 +'/'+ ch + '.sam'), 'r')
- line_2 = read_file.readline()
- #read threw read_file find if it is in stored dict
- while line_2 != "":
- read_file_counter += 1
- #counter for to keep user updated on progress
- if read_file_counter % 1000000 == 0:
- print('current line read_file_counter is = ', read_file_counter)
- list_line = line_2.strip().split('\t')
- #work on getting xy_location
- query_name_list = list_line[0].split(' ')
- query_name_list = query_name_list[0].split(':')
- xy_read = query_name_list[-2] + query_name_list[-1]
- #find matching xy_location in store dictionary
- if xy_read in store_dict.keys():
- #work on getting .bed format for read_file
- chro = list_line[2]
- position = list_line[3]
- #find out if absolute value of read_file position - store_position
- # is > 1000
- if abs(int(store_dict[xy_read][0]) - int(position)) > 1000:
- sign = list_line[1]
- #write + sign next to - sign frag
- if sign == '+':
- ending_pos = int(position) + 35
- bed = chro + '\t' + position + '\t' + str(ending_pos)
- to_write = bed + '\t' + store_dict[xy_read][2] + '\t' + xy_read + '\t' + store_dict[xy_read][3] +'\n'
- wf.write(to_write)
- elif sign == '-':
- ending_pos = int(position) - 35
- bed = chro + '\t' + str(ending_pos) + '\t' + position
- to_write = store_dict[xy_read][2] + '\t' + bed + '\t' + xy_read + '\t' + store_dict[xy_read][3] +'\n'
- wf.write(to_write)
- line_2 = read_file.readline()
- print("I'done with the pairing")
- end_time = datetime.datetime.now()
- return end_time
- def five(folder_1, folder_2, cis_file):
- print("I'm now working on making the bed file for you")
- with open((cis_file), 'w') as wf:
- #make a list from 1-x and M. This list is used to to iterate over so all
- #files are accessed
- file_list = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17', '18', '19','M','X','Y']
- #access all files in folder_1 = store_file
- for ch in file_list:
- store_dict = {}
- store_file_counter = 0
- read_file_counter = 0
- print("I'm working on chromsome ", ch)
- store_file = open((folder_1 +'/'+ ch + '.sam'), 'r')
- line = store_file.readline()
- #read threw store_file and store in dict
- while line != "":
- store_file_counter += 1
- #counter for to keep user updated on progress
- if store_file_counter % 1000000 == 0:
- print('current line store_file_counter is = ', store_file_counter)
- list_line = line.strip().split('\t')
- #work on getting .bed format for store_file
- chro = list_line[2]
- position = list_line[3]
- sign = list_line[1]
- if sign == '+':
- ending_pos = int(position) + 35
- bed = chro + '\t' + position + '\t' + str(ending_pos)
- elif sign == '-':
- ending_pos = int(position) - 35
- bed = chro + '\t' + str(ending_pos) + '\t' + position
- #get xy on CHIP location
- query_name_list = list_line[0].split(':')
- x = query_name_list[3]
- y = query_name_list[4].split('#')
- y = y[0]
- xy = x + y
- #add to store_dict chromosome number, sign, and bed format
- store_dict[xy] = [position, sign, bed, xy]
- line = store_file.readline()
- #access file '1' = read_file from folder_2
- read_file = open((folder_2 +'/'+ ch + '.sam'), 'r')
- line_2 = read_file.readline()
- #read threw read_file find if it is in stored dict
- while line_2 != "":
- read_file_counter += 1
- #counter for to keep user updated on progress
- if read_file_counter % 1000000 == 0:
- print('current line read_file_counter is = ', read_file_counter)
- list_line = line_2.strip().split('\t')
- #work on getting xy_location
- query_name_list = list_line[0].split(':')
- x = query_name_list[3]
- y = query_name_list[4].split('#')
- y = y[0]
- xy_read = x + y
- #find matching xy_location in store dictionary
- if xy_read in store_dict.keys():
- #work on getting .bed format for read_file
- chro = list_line[2]
- position = list_line[3]
- #find out if absolute value of read_file position - store_position
- # is > 1000
- if abs(int(store_dict[xy_read][0]) - int(position)) > 1000:
- sign = list_line[1]
- #write + sign next to - sign frag
- if sign == '+':
- ending_pos = int(position) + 35
- bed = chro + '\t' + position + '\t' + str(ending_pos)
- to_write = bed + '\t' + store_dict[xy_read][2] + '\t' + xy_read + '\t' + store_dict[xy_read][3] +'\n'
- wf.write(to_write)
- elif sign == '-':
- ending_pos = int(position) - 35
- bed = chro + '\t' + str(ending_pos) + '\t' + position
- to_write = store_dict[xy_read][2] + '\t' + bed + '\t' + xy_read + '\t' + store_dict[xy_read][3] +'\n'
- wf.write(to_write)
- line_2 = read_file.readline()
- print("I'done with the pairing")
- end_time = datetime.datetime.now()
- return end_time
- if __name__ == '__main__':
- run('First_SAM_file', 'Second_SAM_file', 'Folder_to_save_data_to')
Advertisement
Add Comment
Please, Sign In to add comment