View difference between Paste ID: <a href="/f5e457c64">f5e457c64</a> and <a href="/post/view"></a>

View difference between Paste ID: f5e457c64 and

SHOW: | | - or go back to the newest paste.


#!/usr/bin/python

#calculates the entropy of a 13-STR CODIS (FBI) DNA profile 
#from observed allele frequencies

import math

def shannonEntropy(freqs):
    """plain old Shannon entropy of a probability distribution"""
    return -sum(p * math.log(p, 2) for p in freqs)

def locusEntropy(allelefreqs):
    """given the set of individual allele frequencies at a locus
    compute the genotype frequencies assuming Hardy-Weinberg equilibrium
    then calculate the Shannon entropy of the observed genotype frequencies"""
    genotypes = [2 * p * q  for (i, p) in enumerate(allelefreqs) 
                            for (j, q) in enumerate(allelefreqs)
                            if i < j] + \
                [p**2 for p in allelefreqs]
    return shannonEntropy(genotypes)

# download raw XLS data from http://www.cstl.nist.gov/strbase/NISTpop.htm
# save as 'allelefreqs.csv'
def main():
    # read in data
    data = [line.split(',') for line in open('allelefreqs.csv')]
    
    # ignore first 2 header rows
    data = data[2:]
    
    # use caucasian data because sample size is highest
    caucasian_colids = range(1, 13*3, 3)
    
    # list of allele frequencies at each locus
    allelefreqs = [[float(row[colid]) for row in data
                    if len(row[colid]) > 0] 
                    for colid in caucasian_colids]

    entropies = [locusEntropy(x) for x in allelefreqs]

    print "locus entropies:\n\t", '\n\t'.join('%.2f' % s for s in entropies)
    print "total entropy: %.2f" % sum(entropies)

if __name__ == "__main__":
    main()

1	-
1	+	#!/usr/bin/python
2
3		#calculates the entropy of a 13-STR CODIS (FBI) DNA profile
4		#from observed allele frequencies
5
6		import math
7
8		def shannonEntropy(freqs):
9		"""plain old Shannon entropy of a probability distribution"""
10		return -sum(p * math.log(p, 2) for p in freqs)
11
12		def locusEntropy(allelefreqs):
13		"""given the set of individual allele frequencies at a locus
14		compute the genotype frequencies assuming Hardy-Weinberg equilibrium
15		then calculate the Shannon entropy of the observed genotype frequencies"""
16		genotypes = [2 * p * q for (i, p) in enumerate(allelefreqs)
17		for (j, q) in enumerate(allelefreqs)
18		if i < j] + \
19		[p**2 for p in allelefreqs]
20		return shannonEntropy(genotypes)
21
22		# download raw XLS data from http://www.cstl.nist.gov/strbase/NISTpop.htm
23		# save as 'allelefreqs.csv'
24		def main():
25		# read in data
26		data = [line.split(',') for line in open('allelefreqs.csv')]
27
28		# ignore first 2 header rows
29		data = data[2:]
30
31		# use caucasian data because sample size is highest
32		caucasian_colids = range(1, 13*3, 3)
33
34		# list of allele frequencies at each locus
35		allelefreqs = [[float(row[colid]) for row in data
36		if len(row[colid]) > 0]
37		for colid in caucasian_colids]
38
39		entropies = [locusEntropy(x) for x in allelefreqs]
40
41		print "locus entropies:\n\t", '\n\t'.join('%.2f' % s for s in entropies)
42		print "total entropy: %.2f" % sum(entropies)
43
44		if __name__ == "__main__":
45		main()