Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python
- import os
- import sys
- #import argparse
- #arguments:
- # program.py enzyme_name motif break_pos fasta_file output_file
- #no argparse in python 2.6
- '''
- parser = argparse.ArgumentParser(description='This program performs in silico restriction digestion\
- of a fasta file.')
- parser.add_argument('enzyme_name',type=str,
- help='Restriction enzyme name, e.g. AluI')
- parser.add_argument('enzyme_motif',type=str,
- help='Restriction enzyme motif with any IUPAC nucleotide, e.g. AGCT')
- parser.add_argument('cut_position',type=int,
- help='0-based position of cutsite within motif, e.g. 2')
- parser.add_argument('genome_input',type=str,
- help='Enter path of fasta file to be digested in silico.\
- There can be more than one sequence in the file, denoted by \'>\', e.g. genome.fasta')
- parser.add_argument('output_file',type=str,
- help='Enter path of output file, e.g. RE_digest_genome1_AluI.out')
- args = parser.parse_args()
- '''
- enzyme_name=sys.argv[1]
- motif=sys.argv[2]
- break_pos=int(sys.argv[3])
- len_motif=len(motif)
- #set up input and output files
- path=sys.argv[4]
- output=sys.argv[5]
- ## This dictionary will store all possible recognized motifs. This is necessary in case
- ## the motif has ambiguity. For example, the motif AARA will result is sequences of
- ## AAAA and AAGA to be accepted as possible motifs, because R is ambiguous for A and G.
- IUPAC={}
- IUPAC['A'] = ['A']
- IUPAC['T'] = ['T']
- IUPAC['G'] = ['G']
- IUPAC['C'] = ['C']
- IUPAC['R'] = ['R','A','G']
- IUPAC['Y']=['Y','C','T']
- IUPAC['S']=['S','G','C']
- IUPAC['W']=['W','A','T']
- IUPAC['K']=['K','G','T']
- IUPAC['M']=['M','A','C']
- IUPAC['B']=['B','C','G','T']
- IUPAC['D']=['D','A','G','T']
- IUPAC['H']=['H','A','C','T']
- IUPAC['V']=['V','A','C','G']
- IUPAC['N']=['A','T','C','G']
- #excluding 'N' because of masking
- #IUPAC['N']=['N','A','T','C','G']
- #preprocessing enzymes for ambiguity
- #initialize
- possible_motifs=IUPAC[motif[0]]
- #go through all possibilities semi-recursively
- for char in motif[1:]:
- temp=[]
- for char2 in IUPAC[char]:
- for item in possible_motifs:
- temp+=[item+char2]
- #remove old, unnecessary motifs
- possible_motifs=[]
- possible_motifs+=temp
- #save the copies with the same length as the motifs, i.e. send all possible motifs into a dictionary
- motif_amb={}
- for item in possible_motifs:
- if len(item) == len(motif):
- motif_amb[item]=''
- #open the file,identify a contig, analyze the contig
- f=open(path)
- c=f.readlines()
- f.close()
- out=open(output,'w')
- out.write('#enzyme_name\tchr_file\tstart_position\tend_site\tfragment_length\n')
- print 'fasta file uploaded, now parsing (may take some time)'
- #it makes it easier if we initialize the first name
- #so that subsequent name lines can be used as analysis terminators
- j=0
- name=c[0]
- s=''
- j+=1
- while j < len(c):
- if c[j][0] != '>':
- s+=c[j]
- if c[j][0] == '>':
- s=s.replace('\n','')
- #begin analysis and output
- #technically the first position is a fragment site
- sites=[0]
- i=0
- while i <= len(s)-(len_motif):
- try:
- motif_amb[s[i:i+len_motif]]
- except KeyError:
- i+=1
- pass
- else:
- sites.append(i+break_pos)
- i+=break_pos
- #technically the last position is a fragment site as well
- sites.append(len(s)-1)
- i=0
- #tab file header, 0 based counting system for both start and stop sites
- #name[1:-1] removes > and \n
- while i < len(sites)-1:
- out.write(enzyme_name+'\t'+name[1:-1]+'\t'+str(sites[i])+'\t'+str(sites[i+1])+'\t'+str(sites[i+1]-sites[i])+'\n')
- i+=1
- #reset name and sequence for next contig
- print name[1:-1]+' finished'
- name=c[j]
- s=''
- #We need to pretend that the last line has a '>', because it doesn't
- #but we still need to terminate the analysis of the last fasta.
- if j == len(c)-1:
- s=s.replace('\n','')
- #begin analysis and output
- #technically the first position is a fragment site
- sites=[0]
- i=0
- while i <= len(s)-(len_motif):
- try:
- motif_amb[s[i:i+len_motif]]
- except KeyError:
- i+=1
- pass
- else:
- sites.append(i+break_pos)
- i+=break_pos
- #technically the last position is a fragment site as well
- sites.append(len(s)-1)
- i=0
- #tab file header, 0 based counting system for both start and stop sites
- #name[1:-1] removes > and \n
- while i < len(sites)-1:
- out.write(enzyme_name+'\t'+name[1:-1]+'\t'+str(sites[i])+'\t'+str(sites[i+1])+'\t'+str(sites[i+1]-sites[i])+'\n')
- i+=1
- #reset name and sequence for next contig (unnecessary for last contig)
- #name=c[j]
- #s=''
- j+=1
- out.close()
Advertisement
Add Comment
Please, Sign In to add comment