Share Pastebin
Guest
Public paste!

randomwalker

By: a guest | Nov 28th, 2009 | Syntax: Python | Size: 1.46 KB | Hits: 174 | Expires: Never
This paste has a previous version, view the difference. Copy text to clipboard
  1. #!/usr/bin/python
  2. import math
  3.  
  4. def shannonEntropy(freqs):
  5.     """plain old Shannon entropy of a probability distribution"""
  6.     return -sum(p * math.log(p, 2) for p in freqs)
  7.  
  8. def locusEntropy(allelefreqs):
  9.     """given the set of individual allele frequencies at a locus
  10.    compute the genotype frequencies assuming Hardy-Weinberg equilibrium
  11.    then calculate the Shannon entropy of the observed genotype frequencies"""
  12.     genotypes = [2 * p * q  for (i, p) in enumerate(allelefreqs)
  13.                             for (j, q) in enumerate(allelefreqs)
  14.                             if i < j] + \
  15.                 [p**2 for p in allelefreqs]
  16.     return shannonEntropy(genotypes)
  17.  
  18. # download raw XLS data from http://www.cstl.nist.gov/strbase/NISTpop.htm
  19. # save as 'allelefreqs.csv'
  20. def main():
  21.     # read in data
  22.     data = [line.split(',') for line in open('allelefreqs.csv')]
  23.    
  24.     # ignore first 2 header rows
  25.     data = data[2:]
  26.    
  27.     # use caucasian data because sample size is highest
  28.     caucasian_colids = range(1, 13*3, 3)
  29.    
  30.     # list of allele frequencies at each locus
  31.     allelefreqs = [[float(row[colid]) for row in data
  32.                     if len(row[colid]) > 0]
  33.                     for colid in caucasian_colids]
  34.  
  35.     entropies = [locusEntropy(x) for x in allelefreqs]
  36.  
  37.     print "locus entropies:\n\t", '\n\t'.join('%.2f' % s for s in entropies)
  38.     print "total entropy: %.2f" % sum(entropies)
  39.  
  40. if __name__ == "__main__":
  41.     main()