Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # This file will give you a lot of information about fasta file
- # You just have to enter the file name with path
- # It will give us some statistics and one pie chart and two histogram
- # Save and close the pie chart to see the histogram, do the same for histogram
- # The original algorithm was from Ben Lengmead gist page
- def naive_with_rc(p, t):
- occurrences = []
- p_rev = reverseComplement(p);
- for i in range(len(t) - len(p) + 1): # loop over alignments
- match, match_rev = True, True
- for j in range(len(p)): # loop over characters
- if t[i+j] != p[j]: # compare characters of p
- match = False
- if t[i+j] != p_rev[j]: # compare characters reverse complement if p
- match_rev = False
- if(not (match or match_rev)):
- break
- if match or match_rev:
- occurrences.append(i) # all chars matched and added
- return occurrences
- def reverseComplement(s):
- complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
- t = ''
- for base in s:
- t = complement[base] + t
- return t
- def readGenome(filename):
- genome = ''
- with open(filename, 'r') as f:
- for line in f:
- # ignore header line with genome information
- if not line[0] == '>':
- genome += line.rstrip()
- return genome
- genome = readGenome(input("Enter your fasta file or provide the full path:"))#'/home/zillur/Desktop/zillur/phd/Apecomplexan/B_bigemina/PiroplasmaDB-28_BbigeminaBOND_Genome.fasta')
- genome=genome.upper()
- counts={'A':0,'C':0,'G':0,'T':0,'N':0}
- for base in genome:
- counts[base] +=1
- print(counts)
- print(len(naive_with_rc('TT', genome)),"Number of thymidine dimer in our genome") # Q1
- print(len(naive_with_rc('TTT', genome)),"Number of thymidine triplet in our genome") # Q1
- A=counts['A']#len(naive_with_rc('A', genome))
- T=counts['T']
- C=counts['C']
- G=counts['G']
- N=counts['N']
- TT=len(naive_with_rc('TT', genome))
- TTT=len(naive_with_rc('TTT', genome))
- TTTT=len(naive_with_rc('TTTT', genome))
- GG=len(naive_with_rc('GG', genome))
- GGG=len(naive_with_rc('GGG', genome))
- GGGG=len(naive_with_rc('GGGG', genome))
- NN=len(naive_with_rc('NN', genome))
- NNN=len(naive_with_rc('NNN', genome))
- wholegenome=A+T+C+G+N
- print("A:",A,"T:",T,"C:",C,"G:",G,"TT:",TT,"TTT:",TTT,"TTTT:",TTTT,"GG:",GG,"GGG:",GGG,"GGGG:",GGGG, "NN:",NN,"NNN:",NNN,"Whole genome:",wholegenome)
- A_g=wholegenome/A
- G_g=wholegenome/G
- T_g=wholegenome/T
- C_g=wholegenome/C
- #N_g=wholegenome/N
- TT_g=wholegenome/TT
- TTT_g=wholegenome/TTT
- TTTT_g=wholegenome/TTTT
- GG_g=wholegenome/GG
- GGG_g=wholegenome/GGG
- GGGG_g=wholegenome/GGGG
- #NN_g=wholegenome/NN
- #NNN_g=wholegenome/NNN
- print("bases/A:",A_g,'bases/A',G_g,'bases/T:',T_g,'bases/C:',C_g,'bases/TT:',TT_g,'bases/TTT:',TTT_g,'bases/TTTT:',TTTT_g,'bases/GG:',GG_g,'bases/GGG:',GGG_g,'bases/GGGG:',GGGG_g)
- from pylab import *
- # make a square figure and axes
- figure(1, figsize=(6,6))
- ax = axes([0.1, 0.1, 0.8, 0.8])
- # The slices will be ordered and plotted counter-clockwise.
- labels = 'Adenine', 'Cytosine', 'Guanine', 'Thymine', 'N'
- fracs = [A, C, G, T, N]
- explode=(0, 0.05, 0, 0, 0)
- pie(fracs, explode=explode, labels=labels,
- autopct='%1.1f%%', shadow=True, startangle=90)
- # The default startangle is 0, which would start
- # the Frogs slice on the x-axis. With startangle=90,
- # everything is rotated counter-clockwise by 90 degrees,
- # so the plotting starts on the positive y-axis.
- title('Presence of Nucleotide', bbox={'facecolor':'0.8', 'pad':5})
- show()
- import numpy as np
- import matplotlib.pyplot as plt
- name=['Adenine', 'Cytosine', 'Guanine', 'Thymine', 'N','TT','TTT','GG','GGG','NN','NNN']
- nucleotides=(A, C, G,T,N,TT,TTT,GG,GGG,NN,NNN)
- bars=len(name)
- ind = np.arange(bars)
- width=0.44
- p1 = plt.bar(ind, nucleotides, width, color='b')
- plt.ylabel('Number of bases')
- plt.title('Nucleotides with doublets and triplets')
- plt.xticks(ind + width/2., ('A', 'C', 'G', 'T', 'N','TT','TTT','GG','GGG'))
- plt.show()
- dou_trp=['A', 'C', 'G', 'T','TT','TTT','TTTT','GG','GGG','GGGG']
- datas=(A_g, C_g, G_g,T_g,TT_g,TTT_g,TTTT_g,GG_g,GGG_g,GGGG_g)
- bar=len(dou_trp)
- i=np.arange(bar)
- p2=plt.bar(i,datas,width,color='r')
- plt.ylabel('Nucleotides per bases')
- plt.title('Density of nucleotides')
- plt.xticks(ind + width/2., ('A', 'C', 'G', 'T','TT','TTT','TTTT','GG','GGG','GGGG'))
- plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement