#!/usr/bin/python
#calculates the entropy of a 13-STR CODIS (FBI) DNA profile
#from observed allele frequencies
import math
def shannonEntropy(freqs):
"""plain old Shannon entropy of a probability distribution"""
return -sum(p * math.log(p, 2) for p in freqs)
def locusEntropy(allelefreqs):
"""given the set of individual allele frequencies at a locus
compute the genotype frequencies assuming Hardy-Weinberg equilibrium
then calculate the Shannon entropy of the observed genotype frequencies"""
genotypes = [2 * p * q for (i, p) in enumerate(allelefreqs)
for (j, q) in enumerate(allelefreqs)
if i < j] + \
[p**2 for p in allelefreqs]
return shannonEntropy(genotypes)
# download raw XLS data from http://www.cstl.nist.gov/strbase/NISTpop.htm
# save as 'allelefreqs.csv'
def main():
# read in data
data = [line.split(',') for line in open('allelefreqs.csv')]
# ignore first 2 header rows
data = data[2:]
# use caucasian data because sample size is highest
caucasian_colids = range(1, 13*3, 3)
# list of allele frequencies at each locus
allelefreqs = [[float(row[colid]) for row in data
if len(row[colid]) > 0]
for colid in caucasian_colids]
entropies = [locusEntropy(x) for x in allelefreqs]
print "locus entropies:\n\t", '\n\t'.join('%.2f' % s for s in entropies)
print "total entropy: %.2f" % sum(entropies)
if __name__ == "__main__":
main()