View difference between Paste ID: <a href="/W39U8BEh">W39U8BEh</a> and <a href="/Cw2YjGHY">Cw2YjGHY</a>

#!/usr/bin/env python
1		#!/usr/bin/env python
2		# this function was written in python 3 by Samantha Taffner
3
4		import sys
5		import os
6
7		def run(first_file, second_file, folder_location):
8		'''
9		\| HiCpy is designed to bridge the speed gap between the very fast short
10		\| single read alignment and the considerably slower paired-end alignment.
11		\| HiCpy works in conjuction with Bowtie to accomplish an efficient Hi-C
12		\| analysis pipeline. Bowtie is called an ultrafast memory-efficient
13		\| short read aligner and there is not any denying that for single
14		\| read sequencing it is very fast. However, it is lacking tremendously
15		\| in speed for paired-end alignment. After 2 weeks it finally finished
16		\| while using 7 processors. HiCpy shortens the time to 4 hours on a
17		\| 2 processor machine.
18		\|
19		\| Go to HiCpy manual for more information.
20		'''
21		first_file_dir = first_file.split('/')
22		first_file_name = first_file_dir[-1]
23
24		inside_folder_1 = folder_location + '/first_file'
25
26		if not os.path.exists(inside_folder_1): os.makedirs(inside_folder_1)
27
28		find_mapped(first_file, inside_folder_1, first_file_name)
29
30		inside_folder_2 = folder_location + '/second_file'
31		if not os.path.exists(inside_folder_2): os.makedirs(inside_folder_2)
32
33		second_file_dir = second_file.split('/')
34		second_file_name = second_file_dir[-1]
35
36		find_mapped(first_file, inside_folder_2, second_file_name)
37
38		out_folder = folder_location + '/your_results'
39		if not os.path.exists(out_folder): os.makedirs(out_folder)
40
41		out_file = out_folder + '/your_results.bed'
42		make_file = open(out_file,'w')
43		make_file.close()
44
45		pairing(inside_folder_1, inside_folder_2, out_file)
46
47
48
49		def find_mapped(start_file, file_path, file_name):
50
51
52		count = 0
53
54		f_1 = open((file_path + '/1.sam'), 'w')
55		f_2 = open((file_path + '/2.sam'), 'w')
56		f_3 = open((file_path + '/3.sam'), 'w')
57		f_4 = open((file_path + '/4.sam'), 'w')
58		f_5 = open((file_path + '/5.sam'), 'w')
59		f_6 = open((file_path + '/6.sam'), 'w')
60		f_7 = open((file_path + '/7.sam'), 'w')
61		f_8 = open((file_path + '/8.sam'), 'w')
62		f_9 = open((file_path + '/9.sam'), 'w')
63		f_10 = open((file_path + '/10.sam'), 'w')
64		f_11 = open((file_path + '/11.sam'), 'w')
65		f_12 = open((file_path + '/12.sam'), 'w')
66		f_13 = open((file_path + '/13.sam'), 'w')
67		f_14 = open((file_path + '/14.sam'), 'w')
68		f_15 = open((file_path + '/15.sam'), 'w')
69		f_16 = open((file_path + '/16.sam'), 'w')
70		f_17 = open((file_path + '/17.sam'), 'w')
71		f_18 = open((file_path + '/18.sam'), 'w')
72		f_19 = open((file_path + '/19.sam'), 'w')
73		f_Y = open((file_path + '/Y.sam'), 'w')
74		f_X = open((file_path + '/X.sam'), 'w')
75		f_M = open((file_path + '/M.sam'), 'w')
76
77		file_dict = {'1':f_1, '2':f_2, '3':f_3, '4':f_4, '5':f_5, '6':f_6, '7':f_7, '8':f_8, '9':f_9, '10':f_10, '11':f_11,
78		'12':f_12, '13':f_13, '14':f_14, '15':f_15, '16':f_16, '17':f_17, '18':f_18, '19':f_19, 'Y':f_Y, 'X':f_X, 'M':f_M}
79
80		print('Program has started on ' + file_name + ' and it will update you every time it reaches an interval of 1,000,000')
81
82		with open(start_file, 'r') as sf:
83
84		line = sf.readline()
85		while line != "":
86		count += 1
87		split_line = line.strip().split('\t')
88
89		#ex.bad ['D99KWTL1:845:C214UACXX:6:1101:7094:2225', '1:N:0:', '77', '*'
90		#ex.good ['D99KWTL1:845:C214UACXX:6:1101:7166:2131', '83', 'chr7'
91		if split_line[2].startswith('chr'):
92		chro = split_line[2]
93		chro_num = chro.strip('chr')
94		file_dict[chro_num].write(line)
95
96
97		if count % 1000000 == 0:
98		print('current line = ', count)
99
100		line = sf.readline()
101		print('All done')
102
103
104		def pairing(folder_1, folder_2, out_file):
105
106		print("I'm now working on making the bed file for you")
107
108		with open((out_file), 'w') as wf:
109
110		#make a list from 1-x and M. This list is used to to iterate over so all
111		#files are accessed
112
113		file_list = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17', '18', '19','M','X','Y']
114
115		#access all files in folder_1 = store_file
116
117		for ch in file_list:
118		store_dict = {}
119		store_file_counter = 0
120		read_file_counter = 0
121		print("I'm working on chromsome ", ch)
122		store_file = open((folder_1 +'/'+ ch + '.sam'), 'r')
123
124		line = store_file.readline()
125
126		#read threw store_file and store in dict
127		while line != "":
128		store_file_counter += 1
129
130		#counter for to keep user updated on progress
131		if store_file_counter % 1000000 == 0:
132		print('current line store_file_counter is = ', store_file_counter)
133		list_line = line.strip().split('\t')
134
135		#work on getting .bed format for store_file
136		chro = list_line[2]
137		position = list_line[3]
138		sign = list_line[1]
139		if sign == '+':
140		ending_pos = int(position) + 35
141		bed = chro + '\t' + position + '\t' + str(ending_pos)
142		elif sign == '-':
143		ending_pos = int(position) - 35
144		bed = chro + '\t' + str(ending_pos) + '\t' + position
145
146
147		#work on getting xy_location
148		s_0 = list_line[0]
149		s_0_split = s_0.split(':')
150		s_6 = s_0_split[6].split(' ')
151
152		xy_location = s_0_split[5] + s_6[0]
153
154		#add to store_dict chromosome number, sign, and bed format
155		store_dict[xy_location] = [position, sign, bed]
156		line = store_file.readline()
157
158
159		#access file '1' = read_file from folder_2
160
161		read_file = open((folder_2 +'/'+ ch + '.sam'), 'r')
162		line_2 = read_file.readline()
163
164		#read threw read_file find if it is in stored dict
165
166		while line_2 != "":
167		read_file_counter += 1
168
169		#counter for to keep user updated on progress
170		if read_file_counter % 1000000 == 0:
171		print('current line read_file_counter is = ', read_file_counter)
172
173		list_line = line_2.strip().split('\t')
174
175		#work on getting xy_location
176		s_0 = list_line[0]
177		s_0_split = s_0.split(':')
178		s_6 = s_0_split[6].split(' ')
179		xy_location_read_file = s_0_split[5] + s_6[0]
180
181		#find matching xy_location in store dictionary
182		if xy_location_read_file in store_dict.keys():
183
184
185		#work on getting .bed format for read_file
186		chro = list_line[2]
187		position = list_line[3]
188
189		#find out if absolute value of read_file position - store_position
190		# is > 1000
191		if abs(int(store_dict[xy_location_read_file][0]) - int(position)) > 1000:
192		sign = list_line[1]
193
194		#write + sign next to - sign frag
195		if sign == '+':
196		ending_pos = int(position) + 35
197		bed = chro + '\t' + position + '\t' + str(ending_pos)
198		to_write = bed + '\t' + store_dict[xy_location_read_file][2] + '\n'
199		wf.write(to_write)
200		elif sign == '-':
201		ending_pos = int(position) - 35
202		bed = chro + '\t' + str(ending_pos) + '\t' + position
203		to_write = store_dict[xy_location_read_file][2] + '\t' + bed + '\n'
204		wf.write(to_write)
205
206
207		line_2 = read_file.readline()
208
209		print("I'done with the pairing")
210
211		if __name__ == '__main__':
212
213		run("/Users/Bea/Desktop/samantha_folder/sam/5_11_13/R1_cDNA_35bp.sam", "/Users/Bea/Desktop/samantha_folder/sam/5_11_13/R3_cDNA_35bp.sam", "/Users/Bea/Desktop/5_15_13")
214		#started 9:32 AM ended at 9:58 AM