Program

#!/usr/bin/python

import os
import sys
#import argparse

#arguments:
# program.py enzyme_name motif break_pos fasta_file output_file

#no argparse in python 2.6
'''
parser = argparse.ArgumentParser(description='This program performs in silico restriction digestion\
 of a fasta file.')

parser.add_argument('enzyme_name',type=str,
                    help='Restriction enzyme name, e.g. AluI')
parser.add_argument('enzyme_motif',type=str,
                    help='Restriction enzyme motif with any IUPAC nucleotide, e.g. AGCT')
parser.add_argument('cut_position',type=int,
                    help='0-based position of cutsite within motif, e.g. 2')
parser.add_argument('genome_input',type=str,
                    help='Enter path of fasta file to be digested in silico.\
                    There can be more than one sequence in the file, denoted by \'>\', e.g. genome.fasta')
parser.add_argument('output_file',type=str,
                    help='Enter path of output file, e.g. RE_digest_genome1_AluI.out')

args = parser.parse_args()
'''
enzyme_name=sys.argv[1]
motif=sys.argv[2]
break_pos=int(sys.argv[3])

len_motif=len(motif)


#set up input and output files
path=sys.argv[4]

output=sys.argv[5]


## This dictionary will store all possible recognized motifs. This is necessary in case
## the motif has ambiguity. For example, the motif AARA will result is sequences of
## AAAA and AAGA to be accepted as possible motifs, because R is ambiguous for A and G.

IUPAC={}
IUPAC['A'] = ['A']
IUPAC['T'] = ['T']
IUPAC['G'] = ['G']
IUPAC['C'] = ['C']

IUPAC['R'] = ['R','A','G']
IUPAC['Y']=['Y','C','T']
IUPAC['S']=['S','G','C']
IUPAC['W']=['W','A','T']
IUPAC['K']=['K','G','T']
IUPAC['M']=['M','A','C']
IUPAC['B']=['B','C','G','T']
IUPAC['D']=['D','A','G','T']
IUPAC['H']=['H','A','C','T']
IUPAC['V']=['V','A','C','G']
IUPAC['N']=['A','T','C','G']

#excluding 'N' because of masking
#IUPAC['N']=['N','A','T','C','G']


#preprocessing enzymes for ambiguity

#initialize
possible_motifs=IUPAC[motif[0]]

#go through all possibilities semi-recursively
for char in motif[1:]:
    temp=[]
    for char2 in IUPAC[char]:
        for item in possible_motifs:
            temp+=[item+char2]
    #remove old, unnecessary motifs
    possible_motifs=[]

    possible_motifs+=temp


#save the copies with the same length as the motifs, i.e. send all possible motifs into a dictionary
motif_amb={}
for item in possible_motifs:
    if len(item) == len(motif):
        motif_amb[item]=''


#open the file,identify a contig, analyze the contig

f=open(path)

c=f.readlines()

f.close()

out=open(output,'w')
out.write('#enzyme_name\tchr_file\tstart_position\tend_site\tfragment_length\n')

print 'fasta file uploaded, now parsing (may take some time)'

#it makes it easier if we initialize the first name
#so that subsequent name lines can be used as analysis terminators
j=0
name=c[0]
s=''
j+=1

while j < len(c):
    if c[j][0] != '>':
        s+=c[j]
    if c[j][0] == '>':
        s=s.replace('\n','')

        #begin analysis and output

        #technically the first position is a fragment site
        sites=[0]
        i=0
        while i <= len(s)-(len_motif):
            try:
                motif_amb[s[i:i+len_motif]]

            except KeyError:
                i+=1
                pass

            else:
                sites.append(i+break_pos)
                i+=break_pos


        #technically the last position is a fragment site as well
        sites.append(len(s)-1)

        i=0

        #tab file header, 0 based counting system for both start and stop sites
        #name[1:-1] removes > and \n
        while i < len(sites)-1:
            out.write(enzyme_name+'\t'+name[1:-1]+'\t'+str(sites[i])+'\t'+str(sites[i+1])+'\t'+str(sites[i+1]-sites[i])+'\n')
            i+=1

        #reset name and sequence for next contig
        print name[1:-1]+' finished'
        name=c[j]
        s=''


    #We need to pretend that the last line has a '>', because it doesn't
    #but we still need to terminate the analysis of the last fasta.
    if j == len(c)-1:
        s=s.replace('\n','')

        #begin analysis and output

        #technically the first position is a fragment site
        sites=[0]
        i=0
    while i <= len(s)-(len_motif):
        try:
                motif_amb[s[i:i+len_motif]]

        except KeyError:
        i+=1
        pass

        else:
        sites.append(i+break_pos)
        i+=break_pos

        #technically the last position is a fragment site as well
        sites.append(len(s)-1)

        i=0

        #tab file header, 0 based counting system for both start and stop sites
        #name[1:-1] removes > and \n
        while i < len(sites)-1:
            out.write(enzyme_name+'\t'+name[1:-1]+'\t'+str(sites[i])+'\t'+str(sites[i+1])+'\t'+str(sites[i+1]-sites[i])+'\n')
            i+=1

        #reset name and sequence for next contig (unnecessary for last contig)
        #name=c[j]
        #s=''

    j+=1


out.close()