Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import csv
# ----------------------------------- Input variables -----------------------------------
# Input should be a ';'-separated .csv file with 2 columns (PrEST ID, PrEST Sequence).
# data_file is given WITHOUT the '.csv' extension; it is also the stem of the results file.
data_file = 'Master Thesis 14 PrESTs'
protease_name = 'Trypsin'
#protease_name = 'Lys-C'
# ---------------------------------------------------------------------------------------
data = pd.read_csv(data_file + '.csv', sep=';')
# Path of the reference peptide list for the selected protease.
proteasome = 'Non-Unique Reference (' + protease_name + ') (UniProt, canonical).csv'

# Create peptide reference set: one entry per CSV row, built by concatenating
# the row's cells and dropping quotes, commas, brackets and spaces.
_REF_STRIP = str.maketrans('', '', "',[] ")
ref_set = set()
# newline='' is what the csv module expects; the old 'rU' mode was removed in
# Python 3.11 and raises ValueError there.
with open(proteasome, newline='') as in_file:
    for ref_row in csv.reader(in_file):
        ref_set.add(''.join(ref_row).translate(_REF_STRIP))
print(str(len(ref_set)) + ' peptides in reference')
def protease(PrEST_seq, a, type):
    """Report whether the selected protease has a cleavage site at a position.

    The rule is chosen by the module-level ``protease_name``:

    * ``'Trypsin'``: the tested residue is ``R`` or ``K`` and the residue
      right after it is not ``P``.
    * ``'Lys-C'``: the residue at ``a`` is ``K``.

    Args:
        PrEST_seq: Amino-acid sequence (string).
        a: Zero-based position in ``PrEST_seq``.
        type: ``1`` tests position ``a``; any other value makes the Trypsin
            rule test position ``a + 1`` instead. The Lys-C rule ignores it.

    Returns:
        ``1`` when the tested position is a cleavage site, otherwise ``0``.
        Positions at or beyond the end of the sequence yield ``0``.

    Raises:
        ValueError: If ``protease_name`` is neither ``'Trypsin'`` nor
            ``'Lys-C'`` (previously this silently returned ``None``).
    """
    if protease_name == 'Trypsin':
        site = a if type == 1 else a + 1
        # Slices (not indexing) so that looking past the end yields '' instead
        # of raising IndexError.
        residue = PrEST_seq[site:site + 1]
        following = PrEST_seq[site + 1:site + 2]
        return 1 if residue in ('R', 'K') and following != 'P' else 0
    if protease_name == 'Lys-C':
        return 1 if PrEST_seq[a:a + 1] == 'K' else 0
    raise ValueError('Unsupported protease: ' + repr(protease_name))
# Initiate variables
# NOTE: the four lists below accumulate over ALL PrESTs (nothing clears them
# between rows); the output sheets are built from them after the loop.
Peptide_list = []    # Every recorded peptide, unique or not
ID_list = []         # PrEST ID for each entry of Peptide_list (' (Not unique)' suffix if in ref_set)
Non_Uniques = []     # Recorded peptides that are present in ref_set
Non_Uniques_ID = []  # PrEST ID for each entry of Non_Uniques
Peptide = ''         # Current peptide (no missed cleavages)
Peptide_MC1 = ''     # Current peptide with 1 missed cleavage
Peptide_MC2 = ''     # Current peptide with 2 missed cleavages
PrEST_columns = ['PrEST ID', '# Peptides', '# MC Peptides', '# Non-Uniques', '# K', 'R']
PrEST_rows = []      # One summary row per PrEST, turned into PrEST_data after the loop


def _record_peptide(prest_id, peptide):
    """Append ``peptide`` to the result lists; return True if it is unique.

    A peptide is unique when it is not in ``ref_set``. Non-unique peptides are
    additionally stored in ``Non_Uniques`` / ``Non_Uniques_ID`` and their entry
    in ``ID_list`` gets the suffix ``' (Not unique)'``.
    """
    Peptide_list.append(peptide)
    if peptide not in ref_set:
        ID_list.append(prest_id)
        return True
    Non_Uniques_ID.append(prest_id)
    Non_Uniques.append(peptide)
    ID_list.append(prest_id + ' (Not unique)')
    return False


# ------------------------------------------------ Main PrEST for-loop ------------------------------------------------
for _, PrEST_row in data.iterrows():  # For every PrEST (row)
    # Explicit positional access: column 0 is the PrEST ID, column 1 the sequence.
    PrEST_id = PrEST_row.iloc[0]
    PrEST_seq = PrEST_row.iloc[1]
    First = 'Y'
    Pep_Count = 0
    MC_Pep_Count = 0
    Non_Unique_Count = 0
    # ----------------------------------------- No missed cleavages for-loop ------------------------------------------
    for n, residue in enumerate(PrEST_seq):  # For every AA in every PrEST
        if protease(PrEST_seq, n, 1) != 1:
            # Non-cleavable AAs - Peptide grows
            Peptide += residue
            continue
        if First == 'Y':
            # Doesn't count first cleavage (contains ABP): drop what was
            # collected so far and start counting from the next residue.
            Peptide = ''
            First = 'N'
            continue

        Peptide += residue
        if len(Peptide) >= 6:  # Only records peptides of at least 6 AA
            if _record_peptide(PrEST_id, Peptide):
                Pep_Count += 1
            else:
                Non_Unique_Count += 1

        # ----------------------------------- One missed cleavage while-loop ----------------------------------
        # Keep extending past position n; m may reach len(PrEST_seq), where the
        # slice is '' and nothing is added.
        # NOTE(review): the scan stops only after a UNIQUE peptide of >= 6 AA.
        # A non-unique (or too short) hit is passed over and the peptide keeps
        # growing across further cleavage sites - confirm this is intended.
        Peptide_MC1 = Peptide
        m = n
        while m + 1 <= len(PrEST_seq):
            m += 1
            Peptide_MC1 += PrEST_seq[m:m + 1]
            if protease(PrEST_seq, m, 1) == 1 and len(Peptide_MC1) >= 6:
                if _record_peptide(PrEST_id, Peptide_MC1):
                    MC_Pep_Count += 1
                    break
                Non_Unique_Count += 1

        # ---------------------------------- Two missed cleavages while-loop ----------------------------------
        # Same scan, continuing from where the one-missed-cleavage scan ended.
        Peptide_MC2 = Peptide_MC1
        k = m
        while k + 1 <= len(PrEST_seq):
            k += 1
            Peptide_MC2 += PrEST_seq[k:k + 1]
            if protease(PrEST_seq, k, 1) == 1 and len(Peptide_MC2) >= 6:
                if _record_peptide(PrEST_id, Peptide_MC2):
                    MC_Pep_Count += 1
                    break
                Non_Unique_Count += 1
        # -------------------------------------------------------------------------------------------------
        # Resets variables
        Peptide = ''
        Peptide_MC1 = ''
        Peptide_MC2 = ''

    # Appends PrEST data
    PrEST_rows.append([PrEST_id, Pep_Count, MC_Pep_Count, Non_Unique_Count,
                       PrEST_seq.count('K'), PrEST_seq.count('R')])

# Build the summary table once; DataFrame.append was removed in pandas 2.0 and
# re-copied the whole frame on every row.
PrEST_data = pd.DataFrame(PrEST_rows, columns=PrEST_columns)
def _peptide_sheet(peptide_values, prest_ids):
    """Return a Peptides / PrEST ID table sorted by ID, longest peptide first."""
    sheet = pd.DataFrame({'Peptides': peptide_values, 'PrEST ID': prest_ids})
    sheet['temp'] = sheet['Peptides'].str.len()
    # DataFrame.sort was removed from pandas; sort_values is its replacement.
    return sheet.sort_values(['PrEST ID', 'temp'], ascending=[True, False]).drop('temp', axis=1)


# Writes PrEST data to file
PrEST_data = PrEST_data[['PrEST ID', '# Peptides', '# MC Peptides', '# Non-Uniques', '# K', 'R']]
# The context manager saves and closes the workbook even if a sheet fails to
# write; ExcelWriter.save() and the 'encoding' argument no longer exist in
# current pandas.
with pd.ExcelWriter(data_file + ' Results.xlsx') as ew:
    PrEST_data.to_excel(ew, sheet_name='PrESTs (' + protease_name + ')', index=False)
    # Creates peptide list for Perseus and writes to file
    peptides = _peptide_sheet(Peptide_list, ID_list)
    peptides.to_excel(ew, sheet_name='Peptide list for Perseus', index=False)
    # List for non-unique peptides
    NU_peptides = _peptide_sheet(Non_Uniques, Non_Uniques_ID)
    NU_peptides.to_excel(ew, sheet_name='Non-unique peptides', index=False)
def protease(PrEST_seq, a, type):
    """Report whether the selected protease has a cleavage site at a position.

    The rule is chosen by the module-level ``protease_name``:

    * ``'Trypsin'``: the tested residue is ``R`` or ``K`` and the residue
      right after it is not ``P``.
    * ``'Lys-C'``: the residue at ``a`` is ``K``.

    Args:
        PrEST_seq: Amino-acid sequence (string).
        a: Zero-based position in ``PrEST_seq``.
        type: ``1`` tests position ``a``; any other value makes the Trypsin
            rule test position ``a + 1`` instead. The Lys-C rule ignores it.

    Returns:
        ``1`` when the tested position is a cleavage site, otherwise ``0``.
        Positions at or beyond the end of the sequence yield ``0``.

    Raises:
        ValueError: If ``protease_name`` is neither ``'Trypsin'`` nor
            ``'Lys-C'`` (previously this silently returned ``None``).
    """
    if protease_name == 'Trypsin':
        site = a if type == 1 else a + 1
        # Slices (not indexing) so that looking past the end yields '' instead
        # of raising IndexError.
        residue = PrEST_seq[site:site + 1]
        following = PrEST_seq[site + 1:site + 2]
        return 1 if residue in ('R', 'K') and following != 'P' else 0
    if protease_name == 'Lys-C':
        return 1 if PrEST_seq[a:a + 1] == 'K' else 0
    raise ValueError('Unsupported protease: ' + repr(protease_name))
def protease(PrEST_seq, a, type):
    """Report whether the selected protease has a cleavage site at a position.

    The rule is chosen by the module-level ``protease_name``:

    * ``'Trypsin'``: the tested residue is ``R`` or ``K`` and the residue
      right after it is not ``P``.
    * ``'Lys-C'``: the residue at ``a`` is ``K``.

    Args:
        PrEST_seq: Amino-acid sequence (string).
        a: Zero-based position in ``PrEST_seq``.
        type: ``1`` tests position ``a``; any other value makes the Trypsin
            rule test position ``a + 1`` instead. The Lys-C rule ignores it.

    Returns:
        ``True`` when the tested position is a cleavage site, else ``False``
        (these compare equal to ``1`` / ``0``). Positions at or beyond the end
        of the sequence yield ``False``.

    Raises:
        ValueError: If ``protease_name`` is neither ``'Trypsin'`` nor
            ``'Lys-C'``.
    """
    if protease_name == 'Trypsin':
        if type != 1:
            a += 1
        # Slices instead of plain indexing: PrEST_seq[a + 1] would raise
        # IndexError on the last residue, where '' (no following residue)
        # must simply count as "not P".
        return PrEST_seq[a:a + 1] in ('R', 'K') and PrEST_seq[a + 1:a + 2] != 'P'
    # Explicit check rather than assert, which is stripped under ``python -O``.
    if protease_name != 'Lys-C':
        raise ValueError('Unsupported protease: ' + repr(protease_name))
    return PrEST_seq[a:a + 1] == 'K'
# Manual-counter form: m takes the values n + 1 .. len(PrEST_seq) inclusive.
m = n
while m+1 <= len(PrEST_seq):
    m += 1
    stuff_about(m)
# range() form: m takes the values n + 1 .. len(PrEST_seq) inclusive.
for m in range(n+1, len(PrEST_seq)+1):
    stuff_about(m)
if len(Peptide) >= 6:  # Only appends peptides of at least 6 AA
    if Peptide not in ref_set:
        # Unique peptide: stored under the plain PrEST ID.
        ID_list.append(row[1][0])
        Peptide_list.append(Peptide)
        Pep_Count += 1
    else:
        # Peptide is in the reference set: stored in both the non-unique
        # lists and the main lists (with a marked ID).
        Non_Uniques_ID.append(row[1][0])
        Non_Uniques.append(Peptide)
        Non_Unique_Count += 1
        ID_list.append(row[1][0] + ' (Not unique)')
        Peptide_list.append(Peptide)
if len(Peptide) >= 6:  # Only appends peptides of at least 6 AA
    # Both branches store the peptide, so the append is hoisted out of them.
    Peptide_list.append(Peptide)
    if Peptide not in ref_set:
        ID_list.append(row[1][0])
        Pep_Count += 1
    else:
        Non_Uniques_ID.append(row[1][0])
        Non_Uniques.append(Peptide)
        # Keep the non-unique tally; dropping it would leave the
        # '# Non-Uniques' figure at zero.
        Non_Unique_Count += 1
        ID_list.append(row[1][0] + ' (Not unique)')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement