Guest User

Program

a guest
Jul 30th, 2013
130
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.10 KB | None | 0 0
  1. #!/usr/bin/python
  2.  
  3. import os
  4. import sys
  5. #import argparse
  6.  
  7. #arguments:
  8. # program.py enzyme_name motif break_pos fasta_file output_file
  9.  
  10. #no argparse in python 2.6
  11. '''
  12. parser = argparse.ArgumentParser(description='This program performs in silico restriction digestion\
  13. of a fasta file.')
  14.  
  15. parser.add_argument('enzyme_name',type=str,
  16.                    help='Restriction enzyme name, e.g. AluI')
  17. parser.add_argument('enzyme_motif',type=str,
  18.                    help='Restriction enzyme motif with any IUPAC nucleotide, e.g. AGCT')
  19. parser.add_argument('cut_position',type=int,
  20.                    help='0-based position of cutsite within motif, e.g. 2')
  21. parser.add_argument('genome_input',type=str,
  22.                    help='Enter path of fasta file to be digested in silico.\
  23.                    There can be more than one sequence in the file, denoted by \'>\', e.g. genome.fasta')
  24. parser.add_argument('output_file',type=str,
  25.                    help='Enter path of output file, e.g. RE_digest_genome1_AluI.out')
  26.  
  27. args = parser.parse_args()
  28. '''                    
  29. enzyme_name=sys.argv[1]
  30. motif=sys.argv[2]
  31. break_pos=int(sys.argv[3])
  32.  
  33. len_motif=len(motif)
  34.  
  35.    
  36. #set up input and output files
  37. path=sys.argv[4]
  38.  
  39. output=sys.argv[5]
  40.  
  41.  
  42.  
  43.  
  44. ## This dictionary will store all possible recognized motifs. This is necessary in case
  45. ## the motif has ambiguity. For example, the motif AARA will result is sequences of
  46. ## AAAA and AAGA to be accepted as possible motifs, because R is ambiguous for A and G.
  47.  
  48. IUPAC={}
  49. IUPAC['A'] = ['A']
  50. IUPAC['T'] = ['T']
  51. IUPAC['G'] = ['G']
  52. IUPAC['C'] = ['C']
  53.  
  54. IUPAC['R'] = ['R','A','G']
  55. IUPAC['Y']=['Y','C','T']
  56. IUPAC['S']=['S','G','C']
  57. IUPAC['W']=['W','A','T']
  58. IUPAC['K']=['K','G','T']
  59. IUPAC['M']=['M','A','C']
  60. IUPAC['B']=['B','C','G','T']
  61. IUPAC['D']=['D','A','G','T']
  62. IUPAC['H']=['H','A','C','T']
  63. IUPAC['V']=['V','A','C','G']
  64. IUPAC['N']=['A','T','C','G']
  65.  
  66. #excluding 'N' because of masking
  67. #IUPAC['N']=['N','A','T','C','G']
  68.  
  69.  
  70.  
  71. #preprocessing enzymes for ambiguity
  72.  
  73. #initialize
  74. possible_motifs=IUPAC[motif[0]]
  75.  
  76. #go through all possibilities semi-recursively
  77. for char in motif[1:]:
  78.     temp=[]
  79.     for char2 in IUPAC[char]:
  80.         for item in possible_motifs:
  81.             temp+=[item+char2]
  82.     #remove old, unnecessary motifs
  83.     possible_motifs=[]
  84.    
  85.     possible_motifs+=temp
  86.  
  87.  
  88. #save the copies with the same length as the motifs, i.e. send all possible motifs into a dictionary
  89. motif_amb={}
  90. for item in possible_motifs:
  91.     if len(item) == len(motif):
  92.         motif_amb[item]=''
  93.  
  94.  
  95. #open the file,identify a contig, analyze the contig
  96.  
  97. f=open(path)
  98.  
  99. c=f.readlines()
  100.  
  101. f.close()
  102.  
  103. out=open(output,'w')
  104. out.write('#enzyme_name\tchr_file\tstart_position\tend_site\tfragment_length\n')
  105.  
  106. print 'fasta file uploaded, now parsing (may take some time)'
  107.  
  108. #it makes it easier if we initialize the first name
  109. #so that subsequent name lines can be used as analysis terminators
  110. j=0
  111. name=c[0]
  112. s=''
  113. j+=1
  114.  
  115. while j < len(c):
  116.     if c[j][0] != '>':
  117.         s+=c[j]
  118.     if c[j][0] == '>':
  119.         s=s.replace('\n','')
  120.  
  121.         #begin analysis and output
  122.  
  123.         #technically the first position is a fragment site
  124.         sites=[0]
  125.         i=0
  126.         while i <= len(s)-(len_motif):
  127.             try:
  128.                 motif_amb[s[i:i+len_motif]]
  129.  
  130.             except KeyError:
  131.                 i+=1
  132.                 pass
  133.  
  134.             else:
  135.                 sites.append(i+break_pos)
  136.                 i+=break_pos
  137.  
  138.            
  139.  
  140.         #technically the last position is a fragment site as well
  141.         sites.append(len(s)-1)
  142.  
  143.         i=0
  144.  
  145.         #tab file header, 0 based counting system for both start and stop sites
  146.         #name[1:-1] removes > and \n
  147.         while i < len(sites)-1:
  148.             out.write(enzyme_name+'\t'+name[1:-1]+'\t'+str(sites[i])+'\t'+str(sites[i+1])+'\t'+str(sites[i+1]-sites[i])+'\n')
  149.             i+=1
  150.  
  151.         #reset name and sequence for next contig
  152.         print name[1:-1]+' finished'
  153.         name=c[j]
  154.         s=''
  155.  
  156.        
  157.     #We need to pretend that the last line has a '>', because it doesn't
  158.     #but we still need to terminate the analysis of the last fasta.
  159.     if j == len(c)-1:
  160.         s=s.replace('\n','')
  161.  
  162.         #begin analysis and output
  163.  
  164.         #technically the first position is a fragment site
  165.         sites=[0]
  166.         i=0
  167.     while i <= len(s)-(len_motif):
  168.         try:
  169.                 motif_amb[s[i:i+len_motif]]
  170.        
  171.         except KeyError:
  172.         i+=1
  173.         pass
  174.        
  175.         else:
  176.         sites.append(i+break_pos)
  177.         i+=break_pos
  178.  
  179.         #technically the last position is a fragment site as well
  180.         sites.append(len(s)-1)
  181.  
  182.         i=0
  183.  
  184.         #tab file header, 0 based counting system for both start and stop sites
  185.         #name[1:-1] removes > and \n
  186.         while i < len(sites)-1:
  187.             out.write(enzyme_name+'\t'+name[1:-1]+'\t'+str(sites[i])+'\t'+str(sites[i+1])+'\t'+str(sites[i+1]-sites[i])+'\n')
  188.             i+=1
  189.  
  190.         #reset name and sequence for next contig (unnecessary for last contig)
  191.         #name=c[j]
  192.         #s=''
  193.        
  194.     j+=1
  195.        
  196.  
  197.  
  198. out.close()
Advertisement
Add Comment
Please, Sign In to add comment