View difference between Paste ID: W39U8BEh and Cw2YjGHY
SHOW: | | - or go back to the newest paste.
1
#!/usr/bin/env python
2
# this function was written in python 3 by Samantha Taffner
3
4
import sys
5
import os
6
7
def run(first_file, second_file, folder_location):
8
    '''
9
    |   HiCpy is designed to bridge the speed gap between the very fast short
10
    |   single read alignment and the considerably slower paired-end alignment.
11
    |   HiCpy works in conjuction with Bowtie to accomplish an efficient Hi-C
12
    |   analysis pipeline. Bowtie is called an ultrafast memory-efficient
13
    |   short read aligner and there is not any denying that for single
14
    |   read sequencing it is very fast.  However, it is lacking tremendously
15
    |   in speed for paired-end alignment.  After 2 weeks it finally finished
16
    |   while using 7 processors. HiCpy shortens the time to 4 hours on a
17
    |   2 processor machine.
18
    |
19
    |   Go to HiCpy manual for more information. 
20
    '''
21
    first_file_dir = first_file.split('/')
22
    first_file_name = first_file_dir[-1] 
23
24
    inside_folder_1 = folder_location + '/first_file'
25
    
26
    if not os.path.exists(inside_folder_1): os.makedirs(inside_folder_1)
27
28
    find_mapped(first_file, inside_folder_1, first_file_name)
29
30
    inside_folder_2 = folder_location + '/second_file'
31
    if not os.path.exists(inside_folder_2): os.makedirs(inside_folder_2)
32
33
    second_file_dir = second_file.split('/')
34
    second_file_name = second_file_dir[-1]
35
36
    find_mapped(first_file, inside_folder_2, second_file_name)
37
38
    out_folder = folder_location + '/your_results'
39
    if not os.path.exists(out_folder): os.makedirs(out_folder)
40
41
    out_file = out_folder + '/your_results.bed'
42
    make_file = open(out_file,'w')
43
    make_file.close()
44
45
    pairing(inside_folder_1, inside_folder_2, out_file)
46
47
    
48
49
def find_mapped(start_file, file_path, file_name):
50
   
51
    
52
    count = 0
53
54
    f_1 = open((file_path + '/1.sam'), 'w')
55
    f_2 = open((file_path + '/2.sam'), 'w')
56
    f_3 = open((file_path + '/3.sam'), 'w')
57
    f_4 = open((file_path + '/4.sam'), 'w')
58
    f_5 = open((file_path + '/5.sam'), 'w')
59
    f_6 = open((file_path + '/6.sam'), 'w')
60
    f_7 = open((file_path + '/7.sam'), 'w')
61
    f_8 = open((file_path + '/8.sam'), 'w')
62
    f_9 = open((file_path + '/9.sam'), 'w')
63
    f_10 = open((file_path + '/10.sam'), 'w')
64
    f_11 = open((file_path + '/11.sam'), 'w')
65
    f_12 = open((file_path + '/12.sam'), 'w')
66
    f_13 = open((file_path + '/13.sam'), 'w')
67
    f_14 = open((file_path + '/14.sam'), 'w')
68
    f_15 = open((file_path + '/15.sam'), 'w')
69
    f_16 = open((file_path + '/16.sam'), 'w')
70
    f_17 = open((file_path + '/17.sam'), 'w')
71
    f_18 = open((file_path + '/18.sam'), 'w')
72
    f_19 = open((file_path + '/19.sam'), 'w')
73
    f_Y = open((file_path + '/Y.sam'), 'w')
74
    f_X = open((file_path + '/X.sam'), 'w')
75
    f_M = open((file_path + '/M.sam'), 'w') 
76
    
77
    file_dict = {'1':f_1, '2':f_2, '3':f_3, '4':f_4, '5':f_5, '6':f_6, '7':f_7, '8':f_8, '9':f_9, '10':f_10, '11':f_11,
78
                 '12':f_12, '13':f_13, '14':f_14, '15':f_15, '16':f_16, '17':f_17, '18':f_18, '19':f_19, 'Y':f_Y, 'X':f_X, 'M':f_M}
79
    
80
    print('Program has started on ' + file_name + ' and it will update you every time it reaches an interval of 1,000,000')
81
82
    with open(start_file, 'r') as sf:
83
        
84
        line = sf.readline()
85
        while line != "":
86
            count += 1
87
            split_line = line.strip().split('\t')
88
                                
89
#ex.bad ['D99KWTL1:845:C214UACXX:6:1101:7094:2225', '1:N:0:', '77', '*'
90
#ex.good ['D99KWTL1:845:C214UACXX:6:1101:7166:2131', '83', 'chr7'
91
            if split_line[2].startswith('chr'):
92
                chro = split_line[2]
93
                chro_num = chro.strip('chr')
94
                file_dict[chro_num].write(line)
95
96
                    
97
            if count % 1000000 == 0:
98
                print('current line = ', count) 
99
                    
100
            line = sf.readline()
101
        print('All done')
102
103
   
104
def pairing(folder_1, folder_2, out_file):
105
106
    print("I'm now working on making the bed file for you")
107
   
108
    with open((out_file), 'w') as wf:
109
 
110
    #make a list from 1-x and M. This list is used to to iterate over so all
111
    #files are accessed
112
 
113
        file_list = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17', '18', '19','M','X','Y']
114
   
115
    #access all files in folder_1 = store_file
116
       
117
        for ch in file_list:
118
            store_dict = {}
119
            store_file_counter = 0
120
            read_file_counter = 0
121
            print("I'm working on chromsome ", ch)        
122
            store_file = open((folder_1 +'/'+ ch + '.sam'), 'r')
123
       
124
            line = store_file.readline()
125
       
126
    #read threw store_file  and store in dict
127
            while line != "":
128
                store_file_counter += 1
129
130
                #counter for to keep user updated on progress
131
                if store_file_counter % 1000000 == 0: 
132
                    print('current line store_file_counter is = ', store_file_counter)
133
                list_line = line.strip().split('\t')
134
135
    #work on getting .bed format for store_file
136
                chro = list_line[2]
137
                position = list_line[3]
138
                sign = list_line[1]
139
                if sign == '+':
140
                    ending_pos = int(position) + 35
141
                    bed = chro + '\t' + position + '\t' + str(ending_pos)
142
                elif sign == '-':
143
                    ending_pos = int(position) - 35
144
                    bed = chro + '\t' + str(ending_pos) + '\t' + position             
145
                
146
147
    #work on getting xy_location                   
148
                s_0 = list_line[0]
149
                s_0_split = s_0.split(':')
150
                s_6 = s_0_split[6].split(' ')
151
           
152
                xy_location = s_0_split[5] + s_6[0]
153
                
154
    #add to store_dict chromosome number, sign, and bed format
155
                store_dict[xy_location] = [position, sign, bed]
156
                line = store_file.readline()
157
       
158
 
159
    #access file '1' = read_file from folder_2
160
   
161
            read_file = open((folder_2 +'/'+ ch + '.sam'), 'r')
162
            line_2 = read_file.readline()
163
       
164
    #read threw read_file find if it is in stored dict
165
 
166
            while line_2 != "":
167
               read_file_counter += 1
168
169
               #counter for to keep user updated on progress
170
               if read_file_counter % 1000000 == 0:
171
                    print('current line read_file_counter is = ', read_file_counter)
172
                
173
               list_line = line_2.strip().split('\t')
174
                        
175
    #work on getting xy_location               
176
               s_0 = list_line[0]
177
               s_0_split = s_0.split(':')
178
               s_6 = s_0_split[6].split(' ')
179
               xy_location_read_file = s_0_split[5] + s_6[0]
180
181
    #find matching xy_location in store dictionary
182
               if xy_location_read_file in store_dict.keys():
183
184
    
185
    #work on getting .bed format for read_file
186
                   chro = list_line[2]
187
                   position = list_line[3]
188
189
    #find out if absolute value of read_file position - store_position
190
    # is > 1000
191
                   if abs(int(store_dict[xy_location_read_file][0]) - int(position)) > 1000:
192
                       sign = list_line[1]
193
194
    #write + sign next to - sign frag
195
                       if sign == '+':
196
                           ending_pos = int(position) + 35
197
                           bed = chro + '\t' + position + '\t' + str(ending_pos)
198
                           to_write = bed + '\t' + store_dict[xy_location_read_file][2] + '\n'
199
                           wf.write(to_write)
200
                       elif sign == '-':
201
                           ending_pos = int(position) - 35
202
                           bed = chro + '\t' + str(ending_pos) + '\t' + position 
203
                           to_write = store_dict[xy_location_read_file][2] + '\t' +  bed + '\n'
204
                           wf.write(to_write)
205
206
                   
207
               line_2 = read_file.readline()
208
           
209
    print("I'done with the pairing")
210
211
if __name__ == '__main__':
212
   
213
    run("/Users/Bea/Desktop/samantha_folder/sam/5_11_13/R1_cDNA_35bp.sam", "/Users/Bea/Desktop/samantha_folder/sam/5_11_13/R3_cDNA_35bp.sam", "/Users/Bea/Desktop/5_15_13")
214
    #started 9:32 AM ended at 9:58 AM