Advertisement
Guest User

Untitled

a guest
Mar 26th, 2017
80
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.32 KB | None | 0 0
  1. # This file will give you a lot of information about fasta file
  2. # You just have to enter the file name with path
  3. # It will give us some statistics and one pie chart and two histogram
  4. # Save and close the pie chart to see the histogram, do the same for histogram
  5. # The original algorithm was from Ben Lengmead gist page
  6.  
  7. def naive_with_rc(p, t):
  8. occurrences = []
  9. p_rev = reverseComplement(p);
  10. for i in range(len(t) - len(p) + 1): # loop over alignments
  11. match, match_rev = True, True
  12. for j in range(len(p)): # loop over characters
  13. if t[i+j] != p[j]: # compare characters of p
  14. match = False
  15. if t[i+j] != p_rev[j]: # compare characters reverse complement if p
  16. match_rev = False
  17. if(not (match or match_rev)):
  18. break
  19. if match or match_rev:
  20. occurrences.append(i) # all chars matched and added
  21. return occurrences
  22.  
  23. def reverseComplement(s):
  24. complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
  25. t = ''
  26. for base in s:
  27. t = complement[base] + t
  28. return t
  29.  
  30. def readGenome(filename):
  31. genome = ''
  32. with open(filename, 'r') as f:
  33. for line in f:
  34. # ignore header line with genome information
  35. if not line[0] == '>':
  36. genome += line.rstrip()
  37. return genome
  38.  
  39. genome = readGenome(input("Enter your fasta file or provide the full path:"))#'/home/zillur/Desktop/zillur/phd/Apecomplexan/B_bigemina/PiroplasmaDB-28_BbigeminaBOND_Genome.fasta')
  40. genome=genome.upper()
  41. counts={'A':0,'C':0,'G':0,'T':0,'N':0}
  42. for base in genome:
  43. counts[base] +=1
  44. print(counts)
  45. print(len(naive_with_rc('TT', genome)),"Number of thymidine dimer in our genome") # Q1
  46. print(len(naive_with_rc('TTT', genome)),"Number of thymidine triplet in our genome") # Q1
  47.  
  48. A=counts['A']#len(naive_with_rc('A', genome))
  49. T=counts['T']
  50. C=counts['C']
  51. G=counts['G']
  52. N=counts['N']
  53. TT=len(naive_with_rc('TT', genome))
  54. TTT=len(naive_with_rc('TTT', genome))
  55. TTTT=len(naive_with_rc('TTTT', genome))
  56. GG=len(naive_with_rc('GG', genome))
  57. GGG=len(naive_with_rc('GGG', genome))
  58. GGGG=len(naive_with_rc('GGGG', genome))
  59. NN=len(naive_with_rc('NN', genome))
  60. NNN=len(naive_with_rc('NNN', genome))
  61. wholegenome=A+T+C+G+N
  62. print("A:",A,"T:",T,"C:",C,"G:",G,"TT:",TT,"TTT:",TTT,"TTTT:",TTTT,"GG:",GG,"GGG:",GGG,"GGGG:",GGGG, "NN:",NN,"NNN:",NNN,"Whole genome:",wholegenome)
  63. A_g=wholegenome/A
  64. G_g=wholegenome/G
  65. T_g=wholegenome/T
  66. C_g=wholegenome/C
  67. #N_g=wholegenome/N
  68. TT_g=wholegenome/TT
  69. TTT_g=wholegenome/TTT
  70. TTTT_g=wholegenome/TTTT
  71. GG_g=wholegenome/GG
  72. GGG_g=wholegenome/GGG
  73. GGGG_g=wholegenome/GGGG
  74. #NN_g=wholegenome/NN
  75. #NNN_g=wholegenome/NNN
  76. print("bases/A:",A_g,'bases/A',G_g,'bases/T:',T_g,'bases/C:',C_g,'bases/TT:',TT_g,'bases/TTT:',TTT_g,'bases/TTTT:',TTTT_g,'bases/GG:',GG_g,'bases/GGG:',GGG_g,'bases/GGGG:',GGGG_g)
  77.  
  78. from pylab import *
  79.  
  80. # make a square figure and axes
  81. figure(1, figsize=(6,6))
  82. ax = axes([0.1, 0.1, 0.8, 0.8])
  83.  
  84. # The slices will be ordered and plotted counter-clockwise.
  85. labels = 'Adenine', 'Cytosine', 'Guanine', 'Thymine', 'N'
  86. fracs = [A, C, G, T, N]
  87. explode=(0, 0.05, 0, 0, 0)
  88.  
  89. pie(fracs, explode=explode, labels=labels,
  90. autopct='%1.1f%%', shadow=True, startangle=90)
  91. # The default startangle is 0, which would start
  92. # the Frogs slice on the x-axis. With startangle=90,
  93. # everything is rotated counter-clockwise by 90 degrees,
  94. # so the plotting starts on the positive y-axis.
  95.  
  96. title('Presence of Nucleotide', bbox={'facecolor':'0.8', 'pad':5})
  97. show()
  98.  
  99. import numpy as np
  100. import matplotlib.pyplot as plt
  101.  
  102. name=['Adenine', 'Cytosine', 'Guanine', 'Thymine', 'N','TT','TTT','GG','GGG','NN','NNN']
  103. nucleotides=(A, C, G,T,N,TT,TTT,GG,GGG,NN,NNN)
  104. bars=len(name)
  105. ind = np.arange(bars)
  106. width=0.44
  107. p1 = plt.bar(ind, nucleotides, width, color='b')
  108. plt.ylabel('Number of bases')
  109. plt.title('Nucleotides with doublets and triplets')
  110. plt.xticks(ind + width/2., ('A', 'C', 'G', 'T', 'N','TT','TTT','GG','GGG'))
  111. plt.show()
  112.  
  113. dou_trp=['A', 'C', 'G', 'T','TT','TTT','TTTT','GG','GGG','GGGG']
  114. datas=(A_g, C_g, G_g,T_g,TT_g,TTT_g,TTTT_g,GG_g,GGG_g,GGGG_g)
  115. bar=len(dou_trp)
  116. i=np.arange(bar)
  117. p2=plt.bar(i,datas,width,color='r')
  118. plt.ylabel('Nucleotides per bases')
  119. plt.title('Density of nucleotides')
  120. plt.xticks(ind + width/2., ('A', 'C', 'G', 'T','TT','TTT','TTTT','GG','GGG','GGGG'))
  121. plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement