View difference between Paste ID: f5e457c64 and
SHOW:
|
|
- or go back to the newest paste.
1 | - | |
1 | + | #!/usr/bin/python |
2 | ||
3 | #calculates the entropy of a 13-STR CODIS (FBI) DNA profile | |
4 | #from observed allele frequencies | |
5 | ||
6 | import math | |
7 | ||
8 | def shannonEntropy(freqs): | |
9 | """plain old Shannon entropy of a probability distribution""" | |
10 | return -sum(p * math.log(p, 2) for p in freqs) | |
11 | ||
12 | def locusEntropy(allelefreqs): | |
13 | """given the set of individual allele frequencies at a locus | |
14 | compute the genotype frequencies assuming Hardy-Weinberg equilibrium | |
15 | then calculate the Shannon entropy of the observed genotype frequencies""" | |
16 | genotypes = [2 * p * q for (i, p) in enumerate(allelefreqs) | |
17 | for (j, q) in enumerate(allelefreqs) | |
18 | if i < j] + \ | |
19 | [p**2 for p in allelefreqs] | |
20 | return shannonEntropy(genotypes) | |
21 | ||
22 | # download raw XLS data from http://www.cstl.nist.gov/strbase/NISTpop.htm | |
23 | # save as 'allelefreqs.csv' | |
24 | def main(): | |
25 | # read in data | |
26 | data = [line.split(',') for line in open('allelefreqs.csv')] | |
27 | ||
28 | # ignore first 2 header rows | |
29 | data = data[2:] | |
30 | ||
31 | # use caucasian data because sample size is highest | |
32 | caucasian_colids = range(1, 13*3, 3) | |
33 | ||
34 | # list of allele frequencies at each locus | |
35 | allelefreqs = [[float(row[colid]) for row in data | |
36 | if len(row[colid]) > 0] | |
37 | for colid in caucasian_colids] | |
38 | ||
39 | entropies = [locusEntropy(x) for x in allelefreqs] | |
40 | ||
41 | print "locus entropies:\n\t", '\n\t'.join('%.2f' % s for s in entropies) | |
42 | print "total entropy: %.2f" % sum(entropies) | |
43 | ||
44 | if __name__ == "__main__": | |
45 | main() |