taffners

HiCpy_2

May 28th, 2013
62
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 14.53 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # HiCpy_2 was written by Samantha Taffner on May 26th, 2013
  3.  
  4. import sys
  5. import os
  6. import datetime
  7.  
  8.  
  9.  
  10.  
  11. def run(first_file, second_file, folder_location):
  12.  
  13. #time ________________________________________________________________
  14.     #get start time
  15.     start_time = datetime.datetime.now()
  16.     Y_M_D = start_time.strftime('%Y_%m_%d')
  17.  
  18. #find format of first_file and store as variable______________________________
  19.     #call find_format(make sure first line is stored!)
  20.     file_format = find_format(first_file)
  21.    
  22.     #raise format error and abort HiCpy if file_format not equal to 5 or 7
  23.     if file_format != 5 and file_format != 7:
  24.         return print('''File format ERROR: Please refer to the HiCpy user manual about query name
  25. formats accepted by HiCpy''')
  26.    
  27. #get file names________________________________________________________________
  28.     #get first file name
  29.     first_file_dir = first_file.split('/')
  30.     first_file_name = first_file_dir[-1]
  31.  
  32.     #get second file name
  33.     second_file_dir = second_file.split('/')
  34.     second_file_name = second_file_dir[-1]
  35.  
  36. #make folders________________________________________________________________  
  37.     #make first_file folder
  38.     folder_1_location = folder_location + '/' + first_file_name + '_' + Y_M_D
  39.     if not os.path.exists(folder_1_location): os.makedirs(folder_1_location)      
  40.    
  41.     #make second_file folder
  42.     folder_2_location = folder_location + '/' + second_file_name + '_' + Y_M_D
  43.     if not os.path.exists(folder_2_location): os.makedirs(folder_2_location)
  44.  
  45.     #make cis folder
  46.     cis_folder = folder_location + '/cis_paired_results_' + Y_M_D
  47.     if not os.path.exists(cis_folder): os.makedirs(cis_folder)    
  48.  
  49. #make out_files
  50.     #make cis file
  51.     cis_file = cis_folder + '/cis_results'  + Y_M_D + '.bed'
  52.     make_file = open(cis_file,'w')
  53.     make_file.close()
  54.  
  55.    
  56. #call find_mapped twice for each input file________________________________
  57.     find_mapped(first_file, folder_1_location, first_file_name)
  58.     find_mapped(second_file, folder_2_location, second_file_name)
  59.  
  60. #call appropriate pairing function or raise a format error
  61.     #if 5 call function five
  62.     if file_format == 5:
  63.         end_time = five(folder_1_location, folder_2_location, cis_file)
  64.     #elif 7 call function seven
  65.     elif file_format == 7:
  66.         end_time = seven(folder_1_location, folder_2_location, cis_file)
  67. #return total time
  68.     elapsed_time = str(end_time - start_time)
  69.     print('The amount of time that has elapsed since HiCpy started is:\n', elapsed_time)
  70.        
  71. def find_format(first_file):
  72.     '''
  73.    This function will find if format is:
  74.    
  75.    len(line[0]) = 7
  76.     D99KWTL1:845:C214UACXX:6:1101:1404:2170 3:N:0:  -   chr15   33279215
  77.  
  78.    or
  79.  
  80.    len(line[0]) = 5
  81.     HW-ST997_0116:3:1101:1213:2167#0/3  -   chr8    36287971
  82.    
  83.    '''
  84.  
  85.     #open
  86.     with open((first_file), 'r') as f:
  87.  
  88.         #read one line
  89.         line = f.readline()
  90.    
  91.         #extract length of query name
  92.         split_line = line.strip(' ').split('\t')
  93.         split_line = split_line[0].split(' ')
  94.         len_query_name = len(split_line[0].split(':'))
  95.                  
  96.  
  97.     return len_query_name
  98.  
  99. def find_mapped(start_file, file_path, file_name):
  100.     '''
  101.    This function will seperate each seqencing into their corresponding the chr
  102.    it was mapped to
  103.    '''
  104.     count = 0
  105.  
  106.     f_1 = open((file_path + '/1.sam'), 'w')
  107.     f_2 = open((file_path + '/2.sam'), 'w')
  108.     f_3 = open((file_path + '/3.sam'), 'w')
  109.     f_4 = open((file_path + '/4.sam'), 'w')
  110.     f_5 = open((file_path + '/5.sam'), 'w')
  111.     f_6 = open((file_path + '/6.sam'), 'w')
  112.     f_7 = open((file_path + '/7.sam'), 'w')
  113.     f_8 = open((file_path + '/8.sam'), 'w')
  114.     f_9 = open((file_path + '/9.sam'), 'w')
  115.     f_10 = open((file_path + '/10.sam'), 'w')
  116.     f_11 = open((file_path + '/11.sam'), 'w')
  117.     f_12 = open((file_path + '/12.sam'), 'w')
  118.     f_13 = open((file_path + '/13.sam'), 'w')
  119.     f_14 = open((file_path + '/14.sam'), 'w')
  120.     f_15 = open((file_path + '/15.sam'), 'w')
  121.     f_16 = open((file_path + '/16.sam'), 'w')
  122.     f_17 = open((file_path + '/17.sam'), 'w')
  123.     f_18 = open((file_path + '/18.sam'), 'w')
  124.     f_19 = open((file_path + '/19.sam'), 'w')
  125.     f_Y = open((file_path + '/Y.sam'), 'w')
  126.     f_X = open((file_path + '/X.sam'), 'w')
  127.     f_M = open((file_path + '/M.sam'), 'w')
  128.    
  129.     file_dict = {'1':f_1, '2':f_2, '3':f_3, '4':f_4, '5':f_5, '6':f_6,
  130.                  '7':f_7, '8':f_8, '9':f_9, '10':f_10, '11':f_11,
  131.                  '12':f_12, '13':f_13, '14':f_14, '15':f_15, '16':f_16,
  132.                  '17':f_17, '18':f_18, '19':f_19, 'Y':f_Y, 'X':f_X, 'M':f_M}
  133.    
  134.     print('Program has started on ' + file_name + ' and it will update you every time it reaches an interval of 1,000,000')
  135.  
  136.     with open(start_file, 'r') as sf:
  137.        
  138.         line = sf.readline()
  139.         while line != "":
  140.             count += 1
  141.             split_line = line.strip().split('\t')
  142.                                
  143. #ex.bad ['D99KWTL1:845:C214UACXX:6:1101:7094:2225', '1:N:0:', '77', '*'
  144. #ex.good ['D99KWTL1:845:C214UACXX:6:1101:7166:2131', '83', 'chr7'
  145.             if split_line[2].startswith('chr'):
  146.                 chro = split_line[2]
  147.                 chro_num = chro.strip('chr')
  148.                 file_dict[chro_num].write(line)
  149.  
  150.                    
  151.             if count % 1000000 == 0:
  152.                 print('current line = ', count)
  153.                    
  154.             line = sf.readline()
  155.         print('All done')
  156.  
  157.  
  158.  
  159. def seven(folder_1, folder_2, cis_file):
  160.     '''
  161.    Read file one store to dict then find pair in second file.  If pair not
  162.    found write to not found file
  163.  
  164.    '''
  165.  
  166.     print("I'm now working on making the bed file for you")
  167.    
  168.     with open((cis_file), 'w') as wf:
  169.          
  170. #make a list from 1-x and M. This list is used to to iterate over so all
  171.     #files are accessed
  172.  
  173.         file_list = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17', '18', '19','M','X','Y']
  174.    
  175.     #access all files in folder_1 = store_file
  176.        
  177.         for ch in file_list:
  178.             store_dict = {}
  179.             store_file_counter = 0
  180.             read_file_counter = 0
  181.             print("I'm working on chromsome ", ch)        
  182.             store_file = open((folder_1 +'/'+ ch + '.sam'), 'r')
  183.             line = store_file.readline()
  184.        
  185.     #read threw store_file  and store in dict
  186.             while line != "":
  187.                 store_file_counter += 1
  188.  
  189.                 #counter for to keep user updated on progress
  190.                 if store_file_counter % 1000000 == 0:
  191.                     print('current line store_file_counter is = ', store_file_counter)
  192.                 list_line = line.strip().split('\t')
  193.                
  194.     #work on getting .bed format for store_file
  195.                 chro = list_line[2]
  196.                 position = list_line[3]
  197.                 sign = list_line[1]
  198.                
  199.                 if sign == '+':
  200.                     ending_pos = int(position) + 35
  201.                     bed = chro + '\t' + position + '\t' + str(ending_pos)
  202.                 elif sign == '-':
  203.                     ending_pos = int(position) - 35
  204.                     bed = chro + '\t' + str(ending_pos) + '\t' + position            
  205.                              
  206.     #get xy on CHIP location              
  207.                 query_name_list = list_line[0].split(' ')
  208.                 query_name_list = query_name_list[0].split(':')
  209.                
  210.                 xy = query_name_list[-2] + query_name_list[-1]
  211.                
  212.                
  213.                
  214.     #add to store_dict chromosome number, sign, and bed format
  215.                 store_dict[xy] = [position, sign, bed, xy]
  216.                 line = store_file.readline()
  217.        
  218.  
  219.     #access file '1' = read_file from folder_2
  220.    
  221.             read_file = open((folder_2 +'/'+ ch + '.sam'), 'r')
  222.             line_2 = read_file.readline()
  223.            
  224.     #read threw read_file find if it is in stored dict
  225.  
  226.             while line_2 != "":
  227.                 read_file_counter += 1
  228.  
  229.                #counter for to keep user updated on progress
  230.                 if read_file_counter % 1000000 == 0:
  231.                     print('current line read_file_counter is = ', read_file_counter)
  232.                
  233.                 list_line = line_2.strip().split('\t')
  234.                                                      
  235.     #work on getting xy_location              
  236.                 query_name_list = list_line[0].split(' ')
  237.                 query_name_list = query_name_list[0].split(':')
  238.                
  239.                 xy_read = query_name_list[-2] + query_name_list[-1]
  240.                
  241.  
  242.  
  243.     #find matching xy_location in store dictionary
  244.                 if xy_read in store_dict.keys():
  245.                      
  246.     #work on getting .bed format for read_file
  247.                     chro = list_line[2]
  248.                     position = list_line[3]
  249.                    
  250.     #find out if absolute value of read_file position - store_position
  251.     # is > 1000
  252.                     if abs(int(store_dict[xy_read][0]) - int(position)) > 1000:
  253.                         sign = list_line[1]
  254.                        
  255.     #write + sign next to - sign frag
  256.                         if sign == '+':
  257.                             ending_pos = int(position) + 35
  258.                             bed = chro + '\t' + position + '\t' + str(ending_pos)
  259.                             to_write = bed + '\t' + store_dict[xy_read][2] + '\t' + xy_read + '\t' + store_dict[xy_read][3] +'\n'
  260.                             wf.write(to_write)
  261.                         elif sign == '-':
  262.                             ending_pos = int(position) - 35
  263.                             bed = chro + '\t' + str(ending_pos) + '\t' + position
  264.                             to_write = store_dict[xy_read][2] + '\t' +  bed + '\t' + xy_read + '\t' + store_dict[xy_read][3] +'\n'
  265.                             wf.write(to_write)
  266.  
  267.                    
  268.                 line_2 = read_file.readline()
  269.            
  270.     print("I'done with the pairing")
  271.     end_time = datetime.datetime.now()
  272.     return end_time
  273.  
  274. def five(folder_1, folder_2, cis_file):
  275.     print("I'm now working on making the bed file for you")
  276.    
  277.     with open((cis_file), 'w') as wf:
  278.          
  279.     #make a list from 1-x and M. This list is used to to iterate over so all
  280.     #files are accessed
  281.  
  282.         file_list = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17', '18', '19','M','X','Y']
  283.    
  284.     #access all files in folder_1 = store_file
  285.        
  286.         for ch in file_list:
  287.             store_dict = {}
  288.             store_file_counter = 0
  289.             read_file_counter = 0
  290.             print("I'm working on chromsome ", ch)        
  291.             store_file = open((folder_1 +'/'+ ch + '.sam'), 'r')
  292.             line = store_file.readline()
  293.        
  294.     #read threw store_file  and store in dict
  295.             while line != "":
  296.                 store_file_counter += 1
  297.  
  298.                 #counter for to keep user updated on progress
  299.                 if store_file_counter % 1000000 == 0:
  300.                     print('current line store_file_counter is = ', store_file_counter)
  301.                 list_line = line.strip().split('\t')
  302.  
  303.     #work on getting .bed format for store_file
  304.                 chro = list_line[2]
  305.                 position = list_line[3]
  306.                 sign = list_line[1]
  307.                 if sign == '+':
  308.                     ending_pos = int(position) + 35
  309.                     bed = chro + '\t' + position + '\t' + str(ending_pos)
  310.                 elif sign == '-':
  311.                     ending_pos = int(position) - 35
  312.                     bed = chro + '\t' + str(ending_pos) + '\t' + position            
  313.                
  314.     #get xy on CHIP location              
  315.                 query_name_list = list_line[0].split(':')
  316.                 x = query_name_list[3]
  317.                 y = query_name_list[4].split('#')
  318.                 y = y[0]
  319.                 xy = x + y
  320.                
  321.     #add to store_dict chromosome number, sign, and bed format
  322.                 store_dict[xy] = [position, sign, bed, xy]
  323.                 line = store_file.readline()
  324.        
  325.  
  326.     #access file '1' = read_file from folder_2
  327.    
  328.             read_file = open((folder_2 +'/'+ ch + '.sam'), 'r')
  329.             line_2 = read_file.readline()
  330.            
  331.     #read threw read_file find if it is in stored dict
  332.  
  333.             while line_2 != "":
  334.                 read_file_counter += 1
  335.  
  336.                #counter for to keep user updated on progress
  337.                 if read_file_counter % 1000000 == 0:
  338.                     print('current line read_file_counter is = ', read_file_counter)
  339.                
  340.                 list_line = line_2.strip().split('\t')
  341.                                      
  342.     #work on getting xy_location              
  343.                 query_name_list = list_line[0].split(':')
  344.                 x = query_name_list[3]
  345.                 y = query_name_list[4].split('#')
  346.                 y = y[0]
  347.                 xy_read = x + y
  348.  
  349.     #find matching xy_location in store dictionary
  350.                 if xy_read in store_dict.keys():
  351.                      
  352.     #work on getting .bed format for read_file
  353.                     chro = list_line[2]
  354.                     position = list_line[3]
  355.  
  356.     #find out if absolute value of read_file position - store_position
  357.     # is > 1000
  358.                     if abs(int(store_dict[xy_read][0]) - int(position)) > 1000:
  359.                         sign = list_line[1]
  360.  
  361.     #write + sign next to - sign frag
  362.                         if sign == '+':
  363.                             ending_pos = int(position) + 35
  364.                             bed = chro + '\t' + position + '\t' + str(ending_pos)
  365.                             to_write = bed + '\t' + store_dict[xy_read][2] + '\t' + xy_read + '\t' + store_dict[xy_read][3] +'\n'
  366.                             wf.write(to_write)
  367.                         elif sign == '-':
  368.                             ending_pos = int(position) - 35
  369.                             bed = chro + '\t' + str(ending_pos) + '\t' + position
  370.                             to_write = store_dict[xy_read][2] + '\t' +  bed + '\t' + xy_read + '\t' + store_dict[xy_read][3] +'\n'
  371.                             wf.write(to_write)
  372.  
  373.                    
  374.                 line_2 = read_file.readline()
  375.            
  376.     print("I'done with the pairing")
  377.     end_time = datetime.datetime.now()
  378.     return end_time
  379.  
  380.  
  381. if __name__ == '__main__':
  382.     run('First_SAM_file', 'Second_SAM_file', 'Folder_to_save_data_to')
Advertisement
Add Comment
Please, Sign In to add comment