Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- # this function was written in python 3 by Samantha Taffner
- import sys
- import os
- def run(first_file, second_file, folder_location):
- '''
- | HiCpy is designed to bridge the speed gap between the very fast short
- | single read alignment and the considerably slower paired-end alignment.
- | HiCpy works in conjuction with Bowtie to accomplish an efficient Hi-C
- | analysis pipeline. Bowtie is called an ultrafast memory-efficient
- | short read aligner and there is not any denying that for single
- | read sequencing it is very fast. However, it is lacking tremendously
- | in speed for paired-end alignment. After 2 weeks it finally finished
- | while using 7 processors. HiCpy shortens the time to 4 hours on a
- | 2 processor machine.
- |
- | Go to HiCpy manual for more information.
- '''
- first_file_dir = first_file.split('/')
- first_file_name = first_file_dir[-1]
- inside_folder_1 = folder_location + '/first_file'
- if not os.path.exists(inside_folder_1): os.makedirs(inside_folder_1)
- find_mapped(first_file, inside_folder_1, first_file_name)
- inside_folder_2 = folder_location + '/second_file'
- if not os.path.exists(inside_folder_2): os.makedirs(inside_folder_2)
- second_file_dir = second_file.split('/')
- second_file_name = second_file_dir[-1]
- find_mapped(first_file, inside_folder_2, second_file_name)
- out_folder = folder_location + '/your_results'
- if not os.path.exists(out_folder): os.makedirs(out_folder)
- out_file = out_folder + '/your_results.bed'
- make_file = open(out_file,'w')
- make_file.close()
- pairing(inside_folder_1, inside_folder_2, out_file)
- def find_mapped(start_file, file_path, file_name):
- count = 0
- f_1 = open((file_path + '/1.sam'), 'w')
- f_2 = open((file_path + '/2.sam'), 'w')
- f_3 = open((file_path + '/3.sam'), 'w')
- f_4 = open((file_path + '/4.sam'), 'w')
- f_5 = open((file_path + '/5.sam'), 'w')
- f_6 = open((file_path + '/6.sam'), 'w')
- f_7 = open((file_path + '/7.sam'), 'w')
- f_8 = open((file_path + '/8.sam'), 'w')
- f_9 = open((file_path + '/9.sam'), 'w')
- f_10 = open((file_path + '/10.sam'), 'w')
- f_11 = open((file_path + '/11.sam'), 'w')
- f_12 = open((file_path + '/12.sam'), 'w')
- f_13 = open((file_path + '/13.sam'), 'w')
- f_14 = open((file_path + '/14.sam'), 'w')
- f_15 = open((file_path + '/15.sam'), 'w')
- f_16 = open((file_path + '/16.sam'), 'w')
- f_17 = open((file_path + '/17.sam'), 'w')
- f_18 = open((file_path + '/18.sam'), 'w')
- f_19 = open((file_path + '/19.sam'), 'w')
- f_Y = open((file_path + '/Y.sam'), 'w')
- f_X = open((file_path + '/X.sam'), 'w')
- f_M = open((file_path + '/M.sam'), 'w')
- file_dict = {'1':f_1, '2':f_2, '3':f_3, '4':f_4, '5':f_5, '6':f_6, '7':f_7, '8':f_8, '9':f_9, '10':f_10, '11':f_11,
- '12':f_12, '13':f_13, '14':f_14, '15':f_15, '16':f_16, '17':f_17, '18':f_18, '19':f_19, 'Y':f_Y, 'X':f_X, 'M':f_M}
- print('Program has started on ' + file_name + ' and it will update you every time it reaches an interval of 1,000,000')
- with open(start_file, 'r') as sf:
- line = sf.readline()
- while line != "":
- count += 1
- split_line = line.strip().split('\t')
- #ex.bad ['D99KWTL1:845:C214UACXX:6:1101:7094:2225', '1:N:0:', '77', '*'
- #ex.good ['D99KWTL1:845:C214UACXX:6:1101:7166:2131', '83', 'chr7'
- if split_line[2].startswith('chr'):
- chro = split_line[2]
- chro_num = chro.strip('chr')
- file_dict[chro_num].write(line)
- if count % 1000000 == 0:
- print('current line = ', count)
- line = sf.readline()
- print('All done')
- def pairing(folder_1, folder_2, out_file):
- print("I'm now working on making the bed file for you")
- with open((out_file), 'w') as wf:
- #make a list from 1-x and M. This list is used to to iterate over so all
- #files are accessed
- file_list = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17', '18', '19','M','X','Y']
- #access all files in folder_1 = store_file
- for ch in file_list:
- store_dict = {}
- store_file_counter = 0
- read_file_counter = 0
- print("I'm working on chromsome ", ch)
- store_file = open((folder_1 +'/'+ ch + '.sam'), 'r')
- line = store_file.readline()
- #read threw store_file and store in dict
- while line != "":
- store_file_counter += 1
- #counter for to keep user updated on progress
- if store_file_counter % 1000000 == 0:
- print('current line store_file_counter is = ', store_file_counter)
- list_line = line.strip().split('\t')
- #work on getting .bed format for store_file
- chro = list_line[2]
- position = list_line[3]
- sign = list_line[1]
- if sign == '+':
- ending_pos = int(position) + 35
- bed = chro + '\t' + position + '\t' + str(ending_pos)
- elif sign == '-':
- ending_pos = int(position) - 35
- bed = chro + '\t' + str(ending_pos) + '\t' + position
- #work on getting xy_location
- s_0 = list_line[0]
- s_0_split = s_0.split(':')
- s_6 = s_0_split[6].split(' ')
- xy_location = s_0_split[5] + s_6[0]
- #add to store_dict chromosome number, sign, and bed format
- store_dict[xy_location] = [position, sign, bed]
- line = store_file.readline()
- #access file '1' = read_file from folder_2
- read_file = open((folder_2 +'/'+ ch + '.sam'), 'r')
- line_2 = read_file.readline()
- #read threw read_file find if it is in stored dict
- while line_2 != "":
- read_file_counter += 1
- #counter for to keep user updated on progress
- if read_file_counter % 1000000 == 0:
- print('current line read_file_counter is = ', read_file_counter)
- list_line = line_2.strip().split('\t')
- #work on getting xy_location
- s_0 = list_line[0]
- s_0_split = s_0.split(':')
- s_6 = s_0_split[6].split(' ')
- xy_location_read_file = s_0_split[5] + s_6[0]
- #find matching xy_location in store dictionary
- if xy_location_read_file in store_dict.keys():
- #work on getting .bed format for read_file
- chro = list_line[2]
- position = list_line[3]
- #find out if absolute value of read_file position - store_position
- # is > 1000
- if abs(int(store_dict[xy_location_read_file][0]) - int(position)) > 1000:
- sign = list_line[1]
- #write + sign next to - sign frag
- if sign == '+':
- ending_pos = int(position) + 35
- bed = chro + '\t' + position + '\t' + str(ending_pos)
- to_write = bed + '\t' + store_dict[xy_location_read_file][2] + '\n'
- wf.write(to_write)
- elif sign == '-':
- ending_pos = int(position) - 35
- bed = chro + '\t' + str(ending_pos) + '\t' + position
- to_write = store_dict[xy_location_read_file][2] + '\t' + bed + '\n'
- wf.write(to_write)
- line_2 = read_file.readline()
- print("I'done with the pairing")
- if __name__ == '__main__':
- run("/Users/Bea/Desktop/samantha_folder/sam/5_11_13/R1_cDNA_35bp.sam", "/Users/Bea/Desktop/samantha_folder/sam/5_11_13/R3_cDNA_35bp.sam", "/Users/Bea/Desktop/5_15_13")
- #started 9:32 AM ended at 9:58 AM
Advertisement
Add Comment
Please, Sign In to add comment