Untitled

# This file will give you a lot of information about fasta file
# You just have to enter the file name with path
# It will give us some statistics and one pie chart and two histogram
# Save and close the pie chart to see the histogram, do the same for histogram
# The original algorithm was from Ben Lengmead gist page

def naive_with_rc(p, t):
    occurrences = []
    p_rev = reverseComplement(p);
    for i in range(len(t) - len(p) + 1):  # loop over alignments
        match, match_rev = True, True
        for j in range(len(p)): # loop over characters
            if t[i+j] != p[j]: # compare characters of p
                match = False
            if t[i+j] != p_rev[j]:  # compare characters reverse complement if p
                match_rev = False
            if(not (match or match_rev)):
                break
        if match or match_rev:
            occurrences.append(i)  # all chars matched and added
    return occurrences

def reverseComplement(s):
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
    t = ''
    for base in s:
        t = complement[base] + t
    return t

def readGenome(filename):
    genome = ''
    with open(filename, 'r') as f:
        for line in f:
            # ignore header line with genome information
            if not line[0] == '>':
                genome += line.rstrip()
    return genome

genome = readGenome(input("Enter your fasta file or provide the full path:"))#'/home/zillur/Desktop/zillur/phd/Apecomplexan/B_bigemina/PiroplasmaDB-28_BbigeminaBOND_Genome.fasta')
genome=genome.upper()
counts={'A':0,'C':0,'G':0,'T':0,'N':0}
for base in genome:
    counts[base] +=1
print(counts)
print(len(naive_with_rc('TT', genome)),"Number of thymidine dimer in our genome") # Q1
print(len(naive_with_rc('TTT', genome)),"Number of thymidine triplet in our genome") # Q1

A=counts['A']#len(naive_with_rc('A', genome))
T=counts['T']
C=counts['C']
G=counts['G']
N=counts['N']
TT=len(naive_with_rc('TT', genome))
TTT=len(naive_with_rc('TTT', genome))
TTTT=len(naive_with_rc('TTTT', genome))
GG=len(naive_with_rc('GG', genome))
GGG=len(naive_with_rc('GGG', genome))
GGGG=len(naive_with_rc('GGGG', genome))
NN=len(naive_with_rc('NN', genome))
NNN=len(naive_with_rc('NNN', genome))
wholegenome=A+T+C+G+N
print("A:",A,"T:",T,"C:",C,"G:",G,"TT:",TT,"TTT:",TTT,"TTTT:",TTTT,"GG:",GG,"GGG:",GGG,"GGGG:",GGGG, "NN:",NN,"NNN:",NNN,"Whole genome:",wholegenome)
A_g=wholegenome/A
G_g=wholegenome/G
T_g=wholegenome/T
C_g=wholegenome/C
#N_g=wholegenome/N
TT_g=wholegenome/TT
TTT_g=wholegenome/TTT
TTTT_g=wholegenome/TTTT
GG_g=wholegenome/GG
GGG_g=wholegenome/GGG
GGGG_g=wholegenome/GGGG
#NN_g=wholegenome/NN
#NNN_g=wholegenome/NNN
print("bases/A:",A_g,'bases/A',G_g,'bases/T:',T_g,'bases/C:',C_g,'bases/TT:',TT_g,'bases/TTT:',TTT_g,'bases/TTTT:',TTTT_g,'bases/GG:',GG_g,'bases/GGG:',GGG_g,'bases/GGGG:',GGGG_g)

from pylab import *

# make a square figure and axes
figure(1, figsize=(6,6))
ax = axes([0.1, 0.1, 0.8, 0.8])

# The slices will be ordered and plotted counter-clockwise.
labels = 'Adenine', 'Cytosine', 'Guanine', 'Thymine', 'N'
fracs = [A, C, G, T, N]
explode=(0, 0.05, 0, 0, 0)

pie(fracs, explode=explode, labels=labels,
                autopct='%1.1f%%', shadow=True, startangle=90)
                # The default startangle is 0, which would start
                # the Frogs slice on the x-axis.  With startangle=90,
                # everything is rotated counter-clockwise by 90 degrees,
                # so the plotting starts on the positive y-axis.

title('Presence of Nucleotide', bbox={'facecolor':'0.8', 'pad':5})
show()

import numpy as np
import matplotlib.pyplot as plt

name=['Adenine', 'Cytosine', 'Guanine', 'Thymine', 'N','TT','TTT','GG','GGG','NN','NNN']
nucleotides=(A, C, G,T,N,TT,TTT,GG,GGG,NN,NNN)
bars=len(name)
ind = np.arange(bars)
width=0.44
p1 = plt.bar(ind, nucleotides, width, color='b')
plt.ylabel('Number of bases')
plt.title('Nucleotides with doublets and triplets')
plt.xticks(ind + width/2., ('A', 'C', 'G', 'T', 'N','TT','TTT','GG','GGG'))
plt.show()

dou_trp=['A', 'C', 'G', 'T','TT','TTT','TTTT','GG','GGG','GGGG']
datas=(A_g, C_g, G_g,T_g,TT_g,TTT_g,TTTT_g,GG_g,GGG_g,GGGG_g)
bar=len(dou_trp)
i=np.arange(bar)
p2=plt.bar(i,datas,width,color='r')
plt.ylabel('Nucleotides per bases')
plt.title('Density of nucleotides')
plt.xticks(ind + width/2., ('A', 'C', 'G', 'T','TT','TTT','TTTT','GG','GGG','GGGG'))
plt.show()