# Untitled

import pandas as pd
import csv

# ----------------------------------- Input variables -----------------------------------

# The input .csv is expected to hold two columns: PrEST ID and PrEST sequence.
# `data_file` is the file name without its '.csv' extension.
data_file = 'Master Thesis 14 PrESTs'
# Protease used for the in-silico digest; the alternative below is kept for
# quick switching.
protease_name = 'Trypsin'
#protease_name = 'Lys-C'

# ---------------------------------------------------------------------------------------

# File holding the reference peptides for the selected protease.
proteasome = 'Non-Unique Reference ({}) (UniProt, canonical).csv'.format(protease_name)
# PrEST table, read as a ';'-separated file.
data = pd.read_csv('{}.csv'.format(data_file), sep=';')

# Create the peptide reference set: one entry per row of the reference file.
# The cells of a row are concatenated and the characters ' , [ ] and space are
# removed, which mirrors the clean-up the script has always applied.
_ROW_NOISE = str.maketrans('', '', "',[] ")
# newline='' is what the csv module asks for; the old 'rU' mode is no longer
# accepted by current Python 3 releases.
with open(proteasome, newline='') as in_file:
    ref_set = {''.join(cells).translate(_ROW_NOISE) for cells in csv.reader(in_file)}
print(str(len(ref_set)) + ' peptides in reference')

def protease(PrEST_seq, a, type):
    """Report whether the configured protease cleaves at a sequence position.

    The protease is selected by the module-level ``protease_name``.

    * ``'Trypsin'``: cleaves after ``R`` or ``K`` unless the following residue
      is ``P``. With ``type == 1`` the residue at index ``a`` is tested; with
      any other ``type`` the residue at index ``a + 1`` is tested.
    * ``'Lys-C'``: cleaves after ``K`` at index ``a``; ``type`` is ignored.

    Slices (not indexing) are used, so positions beyond the end of the
    sequence compare as the empty string instead of raising.

    Args:
        PrEST_seq: Amino-acid sequence as a string.
        a: Zero-based position in ``PrEST_seq``.
        type: ``1`` to test position ``a``; anything else shifts the Trypsin
            test by one residue.

    Returns:
        ``1`` if the position is a cleavage site, otherwise ``0``.

    Raises:
        ValueError: If ``protease_name`` is not a supported protease
            (previously this silently returned ``None``).
    """
    if protease_name == 'Trypsin':
        site = a if type == 1 else a + 1
        is_basic = PrEST_seq[site:site + 1] in ('R', 'K')
        proline_follows = PrEST_seq[site + 1:site + 2] == 'P'
        return 1 if is_basic and not proline_follows else 0
    if protease_name == 'Lys-C':
        return 1 if PrEST_seq[a:a + 1] == 'K' else 0
    raise ValueError('Unsupported protease_name: %r' % (protease_name,))

# Accumulators. The four lists are filled across ALL PrESTs (they are never
# reset) and are written to the result workbook at the end.
Peptide_list = []  # Every recorded peptide, unique or not
ID_list = []  # PrEST ID for each entry of Peptide_list (' (Not unique)' suffix when in ref_set)
Non_Uniques = []  # Recorded peptides that are present in ref_set
Non_Uniques_ID = []  # PrEST ID for each entry of Non_Uniques
Peptide = ''  # Current peptide (no missed cleavages)
Peptide_MC1 = ''  # Current peptide extended over the next cleavage site
Peptide_MC2 = ''  # Peptide_MC1 extended over one more cleavage site


def _record_peptide(peptide, prest_id):
    """Store ``peptide`` in the output lists; return True if it is not in ``ref_set``."""
    Peptide_list.append(peptide)
    if peptide not in ref_set:
        ID_list.append(prest_id)
        return True
    Non_Uniques_ID.append(prest_id)
    Non_Uniques.append(peptide)
    ID_list.append(prest_id + ' (Not unique)')
    return False


summary_columns = ['PrEST ID', '# Peptides', '# MC Peptides', '# Non-Uniques', '# K', 'R']
summary_rows = []  # One row per PrEST; turned into a DataFrame once, after the loop

# ------------------------------------------------ Main PrEST for-loop ------------------------------------------------
for _, prest in data.iterrows():  # For every PrEST (row)
    # Columns are addressed by position (ID first, sequence second); .iloc
    # avoids the deprecated positional fallback of Series[int].
    prest_id = prest.iloc[0]
    PrEST_seq = prest.iloc[1]
    seq_len = len(PrEST_seq)
    First = 'Y'  # 'Y' until the first cleavage site has been passed
    Peptide = ''
    Pep_Count = 0
    MC_Pep_Count = 0
    Non_Unique_Count = 0

    # ----------------------------------------- No missed cleavages for-loop ------------------------------------------
    for n in range(seq_len):  # For every AA in the PrEST
        if protease(PrEST_seq, n, 1) != 1:
            # Non-cleavable AA - the current peptide grows
            Peptide += PrEST_seq[n:n+1]
            continue

        if First == 'Y':
            # The fragment before the first cleavage site is discarded
            # (original author's note: it contains the ABP).
            Peptide = ''
            First = 'N'
            continue

        Peptide += PrEST_seq[n:n+1]
        if len(Peptide) >= 6:  # Only peptides of at least 6 AA are recorded
            if _record_peptide(Peptide, prest_id):
                Pep_Count += 1
            else:
                Non_Unique_Count += 1

        # ----------------------------------- One missed cleavage loop ----------------------------------
        # NOTE(review): the scan only stops at the first cleavage site that
        # yields a unique peptide of >= 6 AA. Short or non-unique candidates do
        # not stop it, so the recorded peptide can span more than one missed
        # cleavage. Behaviour is kept as-is; confirm it is intended.
        Peptide_MC1 = Peptide
        for m in range(n + 1, seq_len + 1):
            Peptide_MC1 += PrEST_seq[m:m+1]
            if protease(PrEST_seq, m, 1) == 1 and len(Peptide_MC1) >= 6:
                if _record_peptide(Peptide_MC1, prest_id):
                    MC_Pep_Count += 1
                    break
                Non_Unique_Count += 1

        # ---------------------------------- Two missed cleavages loop ----------------------------------
        # Continues from wherever the previous scan stopped (m); the same
        # stopping rule applies.
        Peptide_MC2 = Peptide_MC1
        for k in range(m + 1, seq_len + 1):
            Peptide_MC2 += PrEST_seq[k:k+1]
            if protease(PrEST_seq, k, 1) == 1 and len(Peptide_MC2) >= 6:
                if _record_peptide(Peptide_MC2, prest_id):
                    MC_Pep_Count += 1
                    break
                Non_Unique_Count += 1

        # Start the next peptide after this cleavage site
        Peptide = ''
        Peptide_MC1 = ''
        Peptide_MC2 = ''

    # Per-PrEST summary
    summary_rows.append([prest_id, Pep_Count, MC_Pep_Count, Non_Unique_Count,
                         PrEST_seq.count('K'), PrEST_seq.count('R')])

# Built in one go: DataFrame.append no longer exists in current pandas and
# copied the whole frame on every call.
PrEST_data = pd.DataFrame(summary_rows, columns=summary_columns)


def _peptide_table(sequences, prest_ids):
    """Return a Peptides / PrEST ID table sorted by PrEST ID, longest peptide first."""
    table = pd.DataFrame({'Peptides': sequences, 'PrEST ID': prest_ids},
                         columns=['Peptides', 'PrEST ID'])
    table['temp'] = table['Peptides'].str.len()
    # sort_values replaces the removed DataFrame.sort
    return table.sort_values(['PrEST ID', 'temp'], ascending=[True, False]).drop('temp', axis=1)


# Peptide list for Perseus
peptides = _peptide_table(Peptide_list, ID_list)
# List for non-unique peptides
NU_peptides = _peptide_table(Non_Uniques, Non_Uniques_ID)

# Writes all three sheets; the context manager saves and closes the workbook
# (ExcelWriter.save() and the 'encoding' keyword are gone from current pandas).
with pd.ExcelWriter(data_file + ' Results.xlsx') as ew:
    PrEST_data.to_excel(ew, sheet_name='PrESTs (' + protease_name + ')', index=False)
    peptides.to_excel(ew, sheet_name='Peptide list for Perseus', index=False)
    NU_peptides.to_excel(ew, sheet_name='Non-unique peptides', index=False)

def protease(PrEST_seq, a, type):
    """Report whether the configured protease cleaves at a sequence position.

    The protease is selected by the module-level ``protease_name``.

    * ``'Trypsin'``: cleaves after ``R`` or ``K`` unless the following residue
      is ``P``. With ``type == 1`` the residue at index ``a`` is tested; with
      any other ``type`` the residue at index ``a + 1`` is tested.
    * ``'Lys-C'``: cleaves after ``K`` at index ``a``; ``type`` is ignored.

    Slices (not indexing) are used, so positions beyond the end of the
    sequence compare as the empty string instead of raising.

    Args:
        PrEST_seq: Amino-acid sequence as a string.
        a: Zero-based position in ``PrEST_seq``.
        type: ``1`` to test position ``a``; anything else shifts the Trypsin
            test by one residue.

    Returns:
        ``1`` if the position is a cleavage site, otherwise ``0``.

    Raises:
        ValueError: If ``protease_name`` is not a supported protease
            (previously this silently returned ``None``).
    """
    if protease_name == 'Trypsin':
        site = a if type == 1 else a + 1
        is_basic = PrEST_seq[site:site + 1] in ('R', 'K')
        proline_follows = PrEST_seq[site + 1:site + 2] == 'P'
        return 1 if is_basic and not proline_follows else 0
    if protease_name == 'Lys-C':
        return 1 if PrEST_seq[a:a + 1] == 'K' else 0
    raise ValueError('Unsupported protease_name: %r' % (protease_name,))

def protease(PrEST_seq, a, type):
    """Report whether the configured protease cleaves at a sequence position.

    Compact variant; the protease is selected by the module-level
    ``protease_name``.

    * ``'Trypsin'``: cleaves after ``R`` or ``K`` unless the following residue
      is ``P``. When ``type`` is not ``1`` the tested position is ``a + 1``.
    * ``'Lys-C'``: cleaves after ``K`` at index ``a``; ``type`` is ignored.

    Args:
        PrEST_seq: Amino-acid sequence as a string.
        a: Zero-based position in ``PrEST_seq``.
        type: ``1`` to test position ``a``; anything else shifts the Trypsin
            test by one residue.

    Returns:
        ``True`` if the position is a cleavage site, otherwise ``False``
        (these compare equal to ``1`` / ``0``).

    Raises:
        ValueError: If ``protease_name`` is not a supported protease.
    """
    if protease_name == 'Trypsin':
        if type != 1:
            a += 1
        # Slices instead of indexing: at the last residue there is no a+1, and
        # a slice yields '' there rather than raising IndexError.
        return PrEST_seq[a:a + 1] in ('R', 'K') and PrEST_seq[a + 1:a + 2] != 'P'
    # Explicit raise rather than assert, which is stripped under `python -O`.
    if protease_name != 'Lys-C':
        raise ValueError('Unsupported protease_name: %r' % (protease_name,))
    return PrEST_seq[a:a + 1] == 'K'

# Manual-counter form: visits m = n+1 .. len(PrEST_seq) (inclusive) and leaves
# m at the last visited value.
m = n
while m+1 <= len(PrEST_seq):
    m += 1
    stuff_about(m)

# Range-based form: visits m = n+1 .. len(PrEST_seq) (inclusive).
for m in range(n+1, len(PrEST_seq)+1):
    stuff_about(m)

# Record the peptide when it has at least 6 AA. Peptides found in ref_set are
# additionally logged as non-unique and tagged in ID_list.
if len(Peptide) >= 6:
    if Peptide not in ref_set:
        ID_list.append(row[1][0])
        Peptide_list.append(Peptide)
        Pep_Count += 1
    else:
        Non_Uniques_ID.append(row[1][0])
        Non_Uniques.append(Peptide)
        Non_Unique_Count += 1
        ID_list.append(row[1][0] + ' (Not unique)')
        Peptide_list.append(Peptide)

# Record the peptide when it has at least 6 AA. The append to Peptide_list is
# common to both branches, so it is done once up front.
if len(Peptide) >= 6:
    Peptide_list.append(Peptide)

    if Peptide not in ref_set:
        ID_list.append(row[1][0])
        Pep_Count += 1
    else:
        Non_Uniques_ID.append(row[1][0])
        Non_Uniques.append(Peptide)
        # Keep the non-unique tally in step with the Non_Uniques list
        Non_Unique_Count += 1
        ID_list.append(row[1][0] + ' (Not unique)')