Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python
- #calculates the entropy of a 13-STR CODIS (FBI) DNA profile
- #from observed allele frequencies
- import math
def shannonEntropy(freqs):
    """Return the Shannon entropy (in bits) of a probability distribution.

    freqs -- iterable of probabilities (assumed to sum to ~1).

    Zero-probability terms are skipped, following the standard convention
    that 0 * log(0) == 0; previously any p == 0 in the input raised
    a math domain ValueError.
    """
    return -sum(p * math.log(p, 2) for p in freqs if p > 0)
def locusEntropy(allelefreqs):
    """Shannon entropy of the genotype distribution at a single locus.

    Genotype frequencies are derived from the individual allele
    frequencies under Hardy-Weinberg equilibrium: 2*p*q for each
    unordered pair of distinct alleles (heterozygotes), p**2 for
    each single allele (homozygotes).
    """
    genotypes = []
    count = len(allelefreqs)
    # heterozygotes: every unordered pair of distinct alleles
    for a in range(count):
        for b in range(a + 1, count):
            genotypes.append(2 * allelefreqs[a] * allelefreqs[b])
    # homozygotes: one term per allele
    for freq in allelefreqs:
        genotypes.append(freq ** 2)
    return shannonEntropy(genotypes)
- # download raw XLS data from http://www.cstl.nist.gov/strbase/NISTpop.htm
- # save as 'allelefreqs.csv'
def main():
    """Compute and print the per-locus and total entropy of a CODIS profile.

    Expects 'allelefreqs.csv' (the NIST STR allele-frequency table,
    saved as CSV) in the current working directory.
    """
    # read in data; 'with' closes the file deterministically
    # (the original leaked the file handle)
    with open('allelefreqs.csv') as f:
        data = [line.split(',') for line in f]
    # ignore first 2 header rows
    data = data[2:]
    # use caucasian data because sample size is highest;
    # the table interleaves 3 columns per locus, 13 loci
    caucasian_colids = range(1, 13 * 3, 3)
    # list of allele frequencies at each locus (skip blank cells)
    allelefreqs = [[float(row[colid]) for row in data
                    if len(row[colid]) > 0]
                   for colid in caucasian_colids]
    entropies = [locusEntropy(x) for x in allelefreqs]
    # print() calls so the script runs under Python 3 as well
    # (the original Python-2 print statements are a SyntaxError there)
    print("locus entropies:\n\t" + '\n\t'.join('%.2f' % s for s in entropies))
    print("total entropy: %.2f" % sum(entropies))

if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement