taffners

HiCpy

May 17th, 2013
61
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/env python
  2. # this function was written in python 3 by Samantha Taffner
  3.  
  4. import sys
  5. import os
  6.  
  7. def run(first_file, second_file, folder_location):
  8.     '''
  9.    |   HiCpy is designed to bridge the speed gap between the very fast short
  10.    |   single read alignment and the considerably slower paired-end alignment.
  11.    |   HiCpy works in conjuction with Bowtie to accomplish an efficient Hi-C
  12.    |   analysis pipeline. Bowtie is called an ultrafast memory-efficient
  13.    |   short read aligner and there is not any denying that for single
  14.    |   read sequencing it is very fast.  However, it is lacking tremendously
  15.    |   in speed for paired-end alignment.  After 2 weeks it finally finished
  16.    |   while using 7 processors. HiCpy shortens the time to 4 hours on a
  17.    |   2 processor machine.
  18.    |
  19.    |   Go to HiCpy manual for more information.
  20.    '''
  21.     first_file_dir = first_file.split('/')
  22.     first_file_name = first_file_dir[-1]
  23.  
  24.     inside_folder_1 = folder_location + '/first_file'
  25.    
  26.     if not os.path.exists(inside_folder_1): os.makedirs(inside_folder_1)
  27.  
  28.     find_mapped(first_file, inside_folder_1, first_file_name)
  29.  
  30.     inside_folder_2 = folder_location + '/second_file'
  31.     if not os.path.exists(inside_folder_2): os.makedirs(inside_folder_2)
  32.  
  33.     second_file_dir = second_file.split('/')
  34.     second_file_name = second_file_dir[-1]
  35.  
  36.     find_mapped(first_file, inside_folder_2, second_file_name)
  37.  
  38.     out_folder = folder_location + '/your_results'
  39.     if not os.path.exists(out_folder): os.makedirs(out_folder)
  40.  
  41.     out_file = out_folder + '/your_results.bed'
  42.     make_file = open(out_file,'w')
  43.     make_file.close()
  44.  
  45.     pairing(inside_folder_1, inside_folder_2, out_file)
  46.  
  47.    
  48.  
  49. def find_mapped(start_file, file_path, file_name):
  50.    
  51.    
  52.     count = 0
  53.  
  54.     f_1 = open((file_path + '/1.sam'), 'w')
  55.     f_2 = open((file_path + '/2.sam'), 'w')
  56.     f_3 = open((file_path + '/3.sam'), 'w')
  57.     f_4 = open((file_path + '/4.sam'), 'w')
  58.     f_5 = open((file_path + '/5.sam'), 'w')
  59.     f_6 = open((file_path + '/6.sam'), 'w')
  60.     f_7 = open((file_path + '/7.sam'), 'w')
  61.     f_8 = open((file_path + '/8.sam'), 'w')
  62.     f_9 = open((file_path + '/9.sam'), 'w')
  63.     f_10 = open((file_path + '/10.sam'), 'w')
  64.     f_11 = open((file_path + '/11.sam'), 'w')
  65.     f_12 = open((file_path + '/12.sam'), 'w')
  66.     f_13 = open((file_path + '/13.sam'), 'w')
  67.     f_14 = open((file_path + '/14.sam'), 'w')
  68.     f_15 = open((file_path + '/15.sam'), 'w')
  69.     f_16 = open((file_path + '/16.sam'), 'w')
  70.     f_17 = open((file_path + '/17.sam'), 'w')
  71.     f_18 = open((file_path + '/18.sam'), 'w')
  72.     f_19 = open((file_path + '/19.sam'), 'w')
  73.     f_Y = open((file_path + '/Y.sam'), 'w')
  74.     f_X = open((file_path + '/X.sam'), 'w')
  75.     f_M = open((file_path + '/M.sam'), 'w')
  76.    
  77.     file_dict = {'1':f_1, '2':f_2, '3':f_3, '4':f_4, '5':f_5, '6':f_6, '7':f_7, '8':f_8, '9':f_9, '10':f_10, '11':f_11,
  78.                  '12':f_12, '13':f_13, '14':f_14, '15':f_15, '16':f_16, '17':f_17, '18':f_18, '19':f_19, 'Y':f_Y, 'X':f_X, 'M':f_M}
  79.    
  80.     print('Program has started on ' + file_name + ' and it will update you every time it reaches an interval of 1,000,000')
  81.  
  82.     with open(start_file, 'r') as sf:
  83.        
  84.         line = sf.readline()
  85.         while line != "":
  86.             count += 1
  87.             split_line = line.strip().split('\t')
  88.                                
  89. #ex.bad ['D99KWTL1:845:C214UACXX:6:1101:7094:2225', '1:N:0:', '77', '*'
  90. #ex.good ['D99KWTL1:845:C214UACXX:6:1101:7166:2131', '83', 'chr7'
  91.             if split_line[2].startswith('chr'):
  92.                 chro = split_line[2]
  93.                 chro_num = chro.strip('chr')
  94.                 file_dict[chro_num].write(line)
  95.  
  96.                    
  97.             if count % 1000000 == 0:
  98.                 print('current line = ', count)
  99.                    
  100.             line = sf.readline()
  101.         print('All done')
  102.  
  103.    
  104. def pairing(folder_1, folder_2, out_file):
  105.  
  106.     print("I'm now working on making the bed file for you")
  107.    
  108.     with open((out_file), 'w') as wf:
  109.  
  110.     #make a list from 1-x and M. This list is used to to iterate over so all
  111.     #files are accessed
  112.  
  113.         file_list = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17', '18', '19','M','X','Y']
  114.    
  115.     #access all files in folder_1 = store_file
  116.        
  117.         for ch in file_list:
  118.             store_dict = {}
  119.             store_file_counter = 0
  120.             read_file_counter = 0
  121.             print("I'm working on chromsome ", ch)        
  122.             store_file = open((folder_1 +'/'+ ch + '.sam'), 'r')
  123.        
  124.             line = store_file.readline()
  125.        
  126.     #read threw store_file  and store in dict
  127.             while line != "":
  128.                 store_file_counter += 1
  129.  
  130.                 #counter for to keep user updated on progress
  131.                 if store_file_counter % 1000000 == 0:
  132.                     print('current line store_file_counter is = ', store_file_counter)
  133.                 list_line = line.strip().split('\t')
  134.  
  135.     #work on getting .bed format for store_file
  136.                 chro = list_line[2]
  137.                 position = list_line[3]
  138.                 sign = list_line[1]
  139.                 if sign == '+':
  140.                     ending_pos = int(position) + 35
  141.                     bed = chro + '\t' + position + '\t' + str(ending_pos)
  142.                 elif sign == '-':
  143.                     ending_pos = int(position) - 35
  144.                     bed = chro + '\t' + str(ending_pos) + '\t' + position            
  145.                
  146.  
  147.     #work on getting xy_location                  
  148.                 s_0 = list_line[0]
  149.                 s_0_split = s_0.split(':')
  150.                 s_6 = s_0_split[6].split(' ')
  151.            
  152.                 xy_location = s_0_split[5] + s_6[0]
  153.                
  154.     #add to store_dict chromosome number, sign, and bed format
  155.                 store_dict[xy_location] = [position, sign, bed]
  156.                 line = store_file.readline()
  157.        
  158.  
  159.     #access file '1' = read_file from folder_2
  160.    
  161.             read_file = open((folder_2 +'/'+ ch + '.sam'), 'r')
  162.             line_2 = read_file.readline()
  163.        
  164.     #read threw read_file find if it is in stored dict
  165.  
  166.             while line_2 != "":
  167.                read_file_counter += 1
  168.  
  169.                #counter for to keep user updated on progress
  170.                if read_file_counter % 1000000 == 0:
  171.                     print('current line read_file_counter is = ', read_file_counter)
  172.                
  173.                list_line = line_2.strip().split('\t')
  174.                        
  175.     #work on getting xy_location              
  176.                s_0 = list_line[0]
  177.                s_0_split = s_0.split(':')
  178.                s_6 = s_0_split[6].split(' ')
  179.                xy_location_read_file = s_0_split[5] + s_6[0]
  180.  
  181.     #find matching xy_location in store dictionary
  182.                if xy_location_read_file in store_dict.keys():
  183.  
  184.    
  185.     #work on getting .bed format for read_file
  186.                    chro = list_line[2]
  187.                    position = list_line[3]
  188.  
  189.     #find out if absolute value of read_file position - store_position
  190.     # is > 1000
  191.                    if abs(int(store_dict[xy_location_read_file][0]) - int(position)) > 1000:
  192.                        sign = list_line[1]
  193.  
  194.     #write + sign next to - sign frag
  195.                        if sign == '+':
  196.                            ending_pos = int(position) + 35
  197.                            bed = chro + '\t' + position + '\t' + str(ending_pos)
  198.                            to_write = bed + '\t' + store_dict[xy_location_read_file][2] + '\n'
  199.                            wf.write(to_write)
  200.                        elif sign == '-':
  201.                            ending_pos = int(position) - 35
  202.                            bed = chro + '\t' + str(ending_pos) + '\t' + position
  203.                            to_write = store_dict[xy_location_read_file][2] + '\t' +  bed + '\n'
  204.                            wf.write(to_write)
  205.  
  206.                    
  207.                line_2 = read_file.readline()
  208.            
  209.     print("I'done with the pairing")
  210.  
  211. if __name__ == '__main__':
  212.    
  213.     run("/Users/Bea/Desktop/samantha_folder/sam/5_11_13/R1_cDNA_35bp.sam", "/Users/Bea/Desktop/samantha_folder/sam/5_11_13/R3_cDNA_35bp.sam", "/Users/Bea/Desktop/5_15_13")
  214.     #started 9:32 AM ended at 9:58 AM
Advertisement
Add Comment
Please, Sign In to add comment