Advertisement
Guest User

Untitled

a guest
Feb 28th, 2015
171
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.97 KB | None | 0 0
  1. import pandas as pd
  2. import csv
  3.  
  4. # ----------------------------------- Input variables -----------------------------------
  5.  
  6. # Input should be a .csv file with 2 columns (PrEST ID, PrEST Sequence)
  7. data_file = 'Master Thesis 14 PrESTs'
  8. protease_name = 'Trypsin'
  9. #protease_name = 'Lys-C'
  10.  
  11. # ---------------------------------------------------------------------------------------
  12.  
  13. data = pd.read_csv(data_file + '.csv', sep=';')
  14. proteasome = 'Non-Unique Reference (' + protease_name + ') (UniProt, canonical).csv'
  15.  
  16. # Create peptide reference Set
  17. ref_set = set()
  18. with open(proteasome, 'rU') as in_file:
  19. reader_2 = csv.reader(in_file)
  20. for row in reader_2:
  21. (ref_set.add(str(row).replace("'","").replace(",","")
  22. .replace("[","").replace("]","").replace(" ","")))
  23. print(str(len(ref_set)) + ' peptides in reference')
  24.  
  25. def protease(PrEST_seq, a, type):
  26. if protease_name == 'Trypsin':
  27. if type == 1:
  28. if PrEST_seq[a+1:a+2] != 'P' and (PrEST_seq[a:a+1] == 'R' or PrEST_seq[a:a+1] == 'K'):
  29. return 1
  30. else:
  31. return 0
  32. else:
  33. if PrEST_seq[a+2:a+3] != 'P' and (PrEST_seq[a+1:a+2] == 'R' or PrEST_seq[a+1:a+2] == 'K'):
  34. return 1
  35. else:
  36. return 0
  37. if protease_name == 'Lys-C':
  38. if PrEST_seq[a:a+1] == 'K':
  39. return 1
  40. else:
  41. return 0
  42.  
  43. # Initiate variables
  44. Peptide_list = [] # List for Peptides (resets for each PrEST)
  45. ID_list = [] # List for PrEST IDs (resets for each PrEST)
  46. Non_Uniques = [] # List for non-unique peptides
  47. Non_Uniques_ID = [] # List for non-unique PrEST IDs
  48. Peptide = '' # Current peptide (no missed cleavages)
  49. Peptide_MC1 = '' # Current peptide with 1 missed cleavage
  50. Peptide_MC2 = '' # Current peptide with 2 missed cleavages
  51.  
  52. PrEST_data = pd.DataFrame()
  53. # ------------------------------------------------ Main PrEST for-loop ------------------------------------------------
  54. for row in data.iterrows(): # For every PrEST (row)
  55. First = 'Y'
  56. PrEST_seq = row[1][1]
  57. Pep_Count = 0
  58. MC_Pep_Count = 0
  59. Non_Unique_Count = 0
  60.  
  61. # ----------------------------------------- No missed cleavages for-loop ------------------------------------------
  62. for n in range(len(PrEST_seq)): # For every AA in every PrEST
  63.  
  64. if protease(PrEST_seq, n, 1) == 1:
  65. if First != 'Y': # Does not count first peptide + MCs (part of ABP)
  66. Peptide += PrEST_seq[n:n+1]
  67. if len(Peptide) >= 6: # Only appends peptide if longer than 6 AA
  68. if Peptide not in ref_set:
  69. ID_list.append(row[1][0])
  70. Peptide_list.append(Peptide)
  71. Pep_Count += 1
  72. else:
  73. Non_Uniques_ID.append(row[1][0])
  74. Non_Uniques.append(Peptide)
  75. Non_Unique_Count += 1
  76. ID_list.append(row[1][0] + ' (Not unique)')
  77. Peptide_list.append(Peptide)
  78.  
  79. # ----------------------------------- One missed cleavage while-loop ----------------------------------
  80. Peptide_MC1 = Peptide
  81. m = n
  82. while m+1 <= len(PrEST_seq):
  83. m += 1
  84. if protease(PrEST_seq, m, 1) == 1:
  85. Peptide_MC1 += PrEST_seq[m:m+1]
  86. if len(Peptide_MC1) >= 6:
  87. if Peptide_MC1 not in ref_set:
  88. ID_list.append(row[1][0])
  89. Peptide_list.append(Peptide_MC1)
  90. MC_Pep_Count += 1
  91. break
  92. else:
  93. Non_Uniques_ID.append(row[1][0])
  94. Non_Uniques.append(Peptide_MC1)
  95. Non_Unique_Count += 1
  96. ID_list.append(row[1][0] + ' (Not unique)')
  97. Peptide_list.append(Peptide_MC1)
  98. else:
  99. Peptide_MC1 += PrEST_seq[m:m+1]
  100.  
  101. # ---------------------------------- Two missed cleavages while-loop ----------------------------------
  102. Peptide_MC2 = Peptide_MC1
  103. k = m
  104. while k+1 <= len(PrEST_seq):
  105. k += 1
  106. if protease(PrEST_seq, k, 1) == 1:
  107. Peptide_MC2 += PrEST_seq[k:k+1]
  108. if len(Peptide_MC2) >= 6:
  109. if Peptide_MC2 not in ref_set:
  110. ID_list.append(row[1][0])
  111. Peptide_list.append(Peptide_MC2)
  112. MC_Pep_Count += 1
  113. break
  114. else:
  115. Non_Uniques_ID.append(row[1][0])
  116. Non_Uniques.append(Peptide_MC2)
  117. Non_Unique_Count += 1
  118. ID_list.append(row[1][0] + ' (Not unique)')
  119. Peptide_list.append(Peptide_MC2)
  120. else:
  121. Peptide_MC2 += PrEST_seq[k:k+1]
  122. # -------------------------------------------------------------------------------------------------
  123.  
  124. # Resets variables
  125. Peptide = ''
  126. Peptide_MC1 = ''
  127. Peptide_MC2 = ''
  128. elif First == 'Y': # Doesn't count first cleavage (contains ABP)
  129. Peptide = ''
  130. First = 'N'
  131. else: # Non-cleavable AAs - Peptide grows
  132. Peptide += PrEST_seq[n:n+1]
  133.  
  134. # Appends PrEST data
  135. K = row[1][1].count('K')
  136. R = row[1][1].count('R')
  137. PrEST_data = PrEST_data.append(pd.DataFrame(data=[[row[1][0], Pep_Count, MC_Pep_Count, Non_Unique_Count, K, R]],
  138. columns=['PrEST ID', '# Peptides', '# MC Peptides', '# Non-Uniques',
  139. '# K', 'R']))
  140. # Writes PrEST data to file
  141. PrEST_data = PrEST_data[['PrEST ID', '# Peptides', '# MC Peptides', '# Non-Uniques', '# K', 'R']]
  142. ew = pd.ExcelWriter(data_file + ' Results.xlsx', encoding='iso-8859-1')
  143. PrEST_data.to_excel(ew, sheet_name='PrESTs (' + protease_name + ')', index=False)
  144.  
  145. # Creates peptide list for Perseus and writes to file
  146. peptides = pd.DataFrame(Peptide_list, columns=['Peptides']).join(pd.DataFrame(ID_list, columns=['PrEST ID']))
  147. peptides['temp'] = peptides['Peptides'].str.len()
  148. peptides = peptides.sort(['PrEST ID', 'temp'], ascending=[True, False]).drop('temp', axis=1)
  149. peptides.to_excel(ew, sheet_name='Peptide list for Perseus', index=False)
  150.  
  151. # List for non-unique peptides
  152. NU_peptides = pd.DataFrame(Non_Uniques, columns=['Peptides']).join(pd.DataFrame(Non_Uniques_ID, columns=['PrEST ID']))
  153. NU_peptides['temp'] = NU_peptides['Peptides'].str.len()
  154. NU_peptides = NU_peptides.sort(['PrEST ID', 'temp'], ascending=[True, False]).drop('temp', axis=1)
  155. NU_peptides.to_excel(ew, sheet_name='Non-unique peptides', index=False)
  156.  
  157. ew.save()
  158.  
  159. def protease(PrEST_seq, a, type):
  160. if protease_name == 'Trypsin':
  161. if type == 1:
  162. if PrEST_seq[a+1:a+2] != 'P' and (PrEST_seq[a:a+1] == 'R' or PrEST_seq[a:a+1] == 'K'):
  163. return 1
  164. else:
  165. return 0
  166. else:
  167. if PrEST_seq[a+2:a+3] != 'P' and (PrEST_seq[a+1:a+2] == 'R' or PrEST_seq[a+1:a+2] == 'K'):
  168. return 1
  169. else:
  170. return 0
  171. if protease_name == 'Lys-C':
  172. if PrEST_seq[a:a+1] == 'K':
  173. return 1
  174. else:
  175. return 0
  176.  
  177. def protease(PrEST_seq, a, type):
  178. if protease_name == 'Trypsin':
  179. if type != 1:
  180. a+=1
  181. return PrEST_seq[a+1] != 'P' and (PrEST_seq[a] in ['R','K']):
  182. assert(protease_name == 'Lys-C')
  183. return PrEST_seq[a] == 'K'
  184.  
  185. m = n
  186. while m+1 <= len(PrEST_seq):
  187. m += 1
  188. stuff_about(m)
  189.  
  190. for m in range(n+1, len(PrEST_seq)+1):
  191. stuff_about(m)
  192.  
  193. if len(Peptide) >= 6: # Only appends peptide if longer than 6 AA
  194. if Peptide not in ref_set:
  195. ID_list.append(row[1][0])
  196. Peptide_list.append(Peptide)
  197. Pep_Count += 1
  198. else:
  199. Non_Uniques_ID.append(row[1][0])
  200. Non_Uniques.append(Peptide)
  201. Non_Unique_Count += 1
  202. ID_list.append(row[1][0] + ' (Not unique)')
  203. Peptide_list.append(Peptide)
  204.  
  205. if len(Peptide) >= 6: # Only appends peptide if longer than 6 AA
  206. Peptide_list.append(Peptide)
  207.  
  208. if Peptide not in ref_set:
  209. ID_list.append(row[1][0])
  210. Pep_Count += 1
  211. else:
  212. Non_Uniques_ID.append(row[1][0])
  213. Non_Uniques.append(Peptide)
  214. ID_list.append(row[1][0] + ' (Not unique)')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement