SHOW:
|
|
- or go back to the newest paste.
1 | #!/usr/bin/env python | |
2 | # this function was written in python 3 by Samantha Taffner | |
3 | ||
4 | import sys | |
5 | import os | |
6 | ||
7 | def run(first_file, second_file, folder_location): | |
8 | ''' | |
9 | | HiCpy is designed to bridge the speed gap between the very fast short | |
10 | | single read alignment and the considerably slower paired-end alignment. | |
11 | | HiCpy works in conjuction with Bowtie to accomplish an efficient Hi-C | |
12 | | analysis pipeline. Bowtie is called an ultrafast memory-efficient | |
13 | | short read aligner and there is not any denying that for single | |
14 | | read sequencing it is very fast. However, it is lacking tremendously | |
15 | | in speed for paired-end alignment. After 2 weeks it finally finished | |
16 | | while using 7 processors. HiCpy shortens the time to 4 hours on a | |
17 | | 2 processor machine. | |
18 | | | |
19 | | Go to HiCpy manual for more information. | |
20 | ''' | |
21 | first_file_dir = first_file.split('/') | |
22 | first_file_name = first_file_dir[-1] | |
23 | ||
24 | inside_folder_1 = folder_location + '/first_file' | |
25 | ||
26 | if not os.path.exists(inside_folder_1): os.makedirs(inside_folder_1) | |
27 | ||
28 | find_mapped(first_file, inside_folder_1, first_file_name) | |
29 | ||
30 | inside_folder_2 = folder_location + '/second_file' | |
31 | if not os.path.exists(inside_folder_2): os.makedirs(inside_folder_2) | |
32 | ||
33 | second_file_dir = second_file.split('/') | |
34 | second_file_name = second_file_dir[-1] | |
35 | ||
36 | find_mapped(first_file, inside_folder_2, second_file_name) | |
37 | ||
38 | out_folder = folder_location + '/your_results' | |
39 | if not os.path.exists(out_folder): os.makedirs(out_folder) | |
40 | ||
41 | out_file = out_folder + '/your_results.bed' | |
42 | make_file = open(out_file,'w') | |
43 | make_file.close() | |
44 | ||
45 | pairing(inside_folder_1, inside_folder_2, out_file) | |
46 | ||
47 | ||
48 | ||
49 | def find_mapped(start_file, file_path, file_name): | |
50 | ||
51 | ||
52 | count = 0 | |
53 | ||
54 | f_1 = open((file_path + '/1.sam'), 'w') | |
55 | f_2 = open((file_path + '/2.sam'), 'w') | |
56 | f_3 = open((file_path + '/3.sam'), 'w') | |
57 | f_4 = open((file_path + '/4.sam'), 'w') | |
58 | f_5 = open((file_path + '/5.sam'), 'w') | |
59 | f_6 = open((file_path + '/6.sam'), 'w') | |
60 | f_7 = open((file_path + '/7.sam'), 'w') | |
61 | f_8 = open((file_path + '/8.sam'), 'w') | |
62 | f_9 = open((file_path + '/9.sam'), 'w') | |
63 | f_10 = open((file_path + '/10.sam'), 'w') | |
64 | f_11 = open((file_path + '/11.sam'), 'w') | |
65 | f_12 = open((file_path + '/12.sam'), 'w') | |
66 | f_13 = open((file_path + '/13.sam'), 'w') | |
67 | f_14 = open((file_path + '/14.sam'), 'w') | |
68 | f_15 = open((file_path + '/15.sam'), 'w') | |
69 | f_16 = open((file_path + '/16.sam'), 'w') | |
70 | f_17 = open((file_path + '/17.sam'), 'w') | |
71 | f_18 = open((file_path + '/18.sam'), 'w') | |
72 | f_19 = open((file_path + '/19.sam'), 'w') | |
73 | f_Y = open((file_path + '/Y.sam'), 'w') | |
74 | f_X = open((file_path + '/X.sam'), 'w') | |
75 | f_M = open((file_path + '/M.sam'), 'w') | |
76 | ||
77 | file_dict = {'1':f_1, '2':f_2, '3':f_3, '4':f_4, '5':f_5, '6':f_6, '7':f_7, '8':f_8, '9':f_9, '10':f_10, '11':f_11, | |
78 | '12':f_12, '13':f_13, '14':f_14, '15':f_15, '16':f_16, '17':f_17, '18':f_18, '19':f_19, 'Y':f_Y, 'X':f_X, 'M':f_M} | |
79 | ||
80 | print('Program has started on ' + file_name + ' and it will update you every time it reaches an interval of 1,000,000') | |
81 | ||
82 | with open(start_file, 'r') as sf: | |
83 | ||
84 | line = sf.readline() | |
85 | while line != "": | |
86 | count += 1 | |
87 | split_line = line.strip().split('\t') | |
88 | ||
89 | #ex.bad ['D99KWTL1:845:C214UACXX:6:1101:7094:2225', '1:N:0:', '77', '*' | |
90 | #ex.good ['D99KWTL1:845:C214UACXX:6:1101:7166:2131', '83', 'chr7' | |
91 | if split_line[2].startswith('chr'): | |
92 | chro = split_line[2] | |
93 | chro_num = chro.strip('chr') | |
94 | file_dict[chro_num].write(line) | |
95 | ||
96 | ||
97 | if count % 1000000 == 0: | |
98 | print('current line = ', count) | |
99 | ||
100 | line = sf.readline() | |
101 | print('All done') | |
102 | ||
103 | ||
104 | def pairing(folder_1, folder_2, out_file): | |
105 | ||
106 | print("I'm now working on making the bed file for you") | |
107 | ||
108 | with open((out_file), 'w') as wf: | |
109 | ||
110 | #make a list from 1-x and M. This list is used to to iterate over so all | |
111 | #files are accessed | |
112 | ||
113 | file_list = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17', '18', '19','M','X','Y'] | |
114 | ||
115 | #access all files in folder_1 = store_file | |
116 | ||
117 | for ch in file_list: | |
118 | store_dict = {} | |
119 | store_file_counter = 0 | |
120 | read_file_counter = 0 | |
121 | print("I'm working on chromsome ", ch) | |
122 | store_file = open((folder_1 +'/'+ ch + '.sam'), 'r') | |
123 | ||
124 | line = store_file.readline() | |
125 | ||
126 | #read threw store_file and store in dict | |
127 | while line != "": | |
128 | store_file_counter += 1 | |
129 | ||
130 | #counter for to keep user updated on progress | |
131 | if store_file_counter % 1000000 == 0: | |
132 | print('current line store_file_counter is = ', store_file_counter) | |
133 | list_line = line.strip().split('\t') | |
134 | ||
135 | #work on getting .bed format for store_file | |
136 | chro = list_line[2] | |
137 | position = list_line[3] | |
138 | sign = list_line[1] | |
139 | if sign == '+': | |
140 | ending_pos = int(position) + 35 | |
141 | bed = chro + '\t' + position + '\t' + str(ending_pos) | |
142 | elif sign == '-': | |
143 | ending_pos = int(position) - 35 | |
144 | bed = chro + '\t' + str(ending_pos) + '\t' + position | |
145 | ||
146 | ||
147 | #work on getting xy_location | |
148 | s_0 = list_line[0] | |
149 | s_0_split = s_0.split(':') | |
150 | s_6 = s_0_split[6].split(' ') | |
151 | ||
152 | xy_location = s_0_split[5] + s_6[0] | |
153 | ||
154 | #add to store_dict chromosome number, sign, and bed format | |
155 | store_dict[xy_location] = [position, sign, bed] | |
156 | line = store_file.readline() | |
157 | ||
158 | ||
159 | #access file '1' = read_file from folder_2 | |
160 | ||
161 | read_file = open((folder_2 +'/'+ ch + '.sam'), 'r') | |
162 | line_2 = read_file.readline() | |
163 | ||
164 | #read threw read_file find if it is in stored dict | |
165 | ||
166 | while line_2 != "": | |
167 | read_file_counter += 1 | |
168 | ||
169 | #counter for to keep user updated on progress | |
170 | if read_file_counter % 1000000 == 0: | |
171 | print('current line read_file_counter is = ', read_file_counter) | |
172 | ||
173 | list_line = line_2.strip().split('\t') | |
174 | ||
175 | #work on getting xy_location | |
176 | s_0 = list_line[0] | |
177 | s_0_split = s_0.split(':') | |
178 | s_6 = s_0_split[6].split(' ') | |
179 | xy_location_read_file = s_0_split[5] + s_6[0] | |
180 | ||
181 | #find matching xy_location in store dictionary | |
182 | if xy_location_read_file in store_dict.keys(): | |
183 | ||
184 | ||
185 | #work on getting .bed format for read_file | |
186 | chro = list_line[2] | |
187 | position = list_line[3] | |
188 | ||
189 | #find out if absolute value of read_file position - store_position | |
190 | # is > 1000 | |
191 | if abs(int(store_dict[xy_location_read_file][0]) - int(position)) > 1000: | |
192 | sign = list_line[1] | |
193 | ||
194 | #write + sign next to - sign frag | |
195 | if sign == '+': | |
196 | ending_pos = int(position) + 35 | |
197 | bed = chro + '\t' + position + '\t' + str(ending_pos) | |
198 | to_write = bed + '\t' + store_dict[xy_location_read_file][2] + '\n' | |
199 | wf.write(to_write) | |
200 | elif sign == '-': | |
201 | ending_pos = int(position) - 35 | |
202 | bed = chro + '\t' + str(ending_pos) + '\t' + position | |
203 | to_write = store_dict[xy_location_read_file][2] + '\t' + bed + '\n' | |
204 | wf.write(to_write) | |
205 | ||
206 | ||
207 | line_2 = read_file.readline() | |
208 | ||
209 | print("I'done with the pairing") | |
210 | ||
211 | if __name__ == '__main__': | |
212 | ||
213 | run("/Users/Bea/Desktop/samantha_folder/sam/5_11_13/R1_cDNA_35bp.sam", "/Users/Bea/Desktop/samantha_folder/sam/5_11_13/R3_cDNA_35bp.sam", "/Users/Bea/Desktop/5_15_13") | |
214 | #started 9:32 AM ended at 9:58 AM |