Advertisement
dvdjaco

10.3

Feb 18th, 2012
265
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.10 KB | None | 0 0
  1. # #!/usr/bin/python
  2. #
  3. # By dvdjaco
  4. # Exercise 10.3
  5. #
  6.  
  7. # most_frequent takes a string and prints the letters in decreasing order of frequency
  8.  
  9. import string
  10.  
  11. def most_frequent(s):
  12.     # first remove all punctuation and spaces, and lowercase the string:
  13.     s = s.translate(None,string.punctuation)
  14.     s = s.translate(None,' ')
  15.     s = s.lower()
  16. #    print s
  17.    
  18.     # build a dictionary where keys are letters and values are counts
  19.     d = dict()
  20.     totalchars = 0
  21.     for char in s:
  22.         d[char] = d.get(char,0) + 1
  23.         totalchars = totalchars + 1
  24.    
  25.     # build a list of tuples to sort the dictionary by descending value
  26.     l = list()
  27.     for char,count in d.items(): l.append( (count,char) )
  28.     l.sort(reverse=True)
  29.    
  30.     # print the list including the relative frequency of each letter
  31.     for count,char in l:
  32.         r = 100*float(count)/float(totalchars)
  33.         print char, count, (str("%.2f" % r)) + "%"
  34.    
  35. f = raw_input("Enter a filename: ")
  36.  
  37. try:
  38.     fhand = open(f)
  39. except err:
  40.     print "Error opening file ", f, err
  41.  
  42. s = ''
  43. for line in fhand:
  44.     s = s + line.rstrip()
  45. most_frequent(s)
  46.  
  47. #
  48. # Below are the results for a few books. In each list, "<--" marks the first letter whose rank differs from the order given at http://en.wikipedia.org/wiki/Letter_frequency
  49. #
  50. # Hitchhiker's Guide to the Galaxy (EN)
  51. # $ python exercise-10.3.py
  52. # Enter a filename: hh.txt
  53. # e 29391 10.26%
  54. # t 23494 8.20%
  55. # a 23143 8.08%
  56. # s 18725 6.54% <--
  57. # i 18246 6.37%
  58. # o 16918 5.91%
  59. # l 15218 5.31%
  60. # r 15006 5.24%
  61. # h 14849 5.18%
  62. # n 14720 5.14%
  63. #
  64. #
  65. # The Selfish Gene (EN)
  66. # $ python exercise-10.3.py
  67. # Enter a filename: sg.txt
  68. # e 72639 12.56%
  69. # t 52166 9.02%
  70. # a 46869 8.10%
  71. # i 43425 7.51% <--
  72. # o 41231 7.13%
  73. # s 41077 7.10%
  74. # n 37575 6.49%
  75. # r 32962 5.70%
  76. # h 28264 4.89%
  77. # l 28116 4.86%
  78. #
  79. # Don Quijote de la Mancha (ES)
  80. # $ python exercise-10.3.py
  81. # Enter a filename: pg2000.txt
  82. # e 223827 13.15%
  83. # a 194365 11.42%
  84. # o 154709 9.09%
  85. # s 126489 7.43%
  86. # n 109490 6.43% <--
  87. # r 102124 6.00%
  88. # l 89630 5.27%
  89. # d 87776 5.16%
  90. # i 78739 4.63%
  91. # u 78717 4.63%
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement