#!/usr/bin/env python3

import argparse
import collections
import sys


def train_hmm(filename):
    """ Trains a Hidden Markov Model with data from a text file. Returns a markov
    dictionary (see `markov_dict`) and a dictionary of emission probabilities.
    """

    def words_and_tags_from_file(filename):
        """ Reads words and POS tags from a text file. The file must contain a word
        and its POS tag in each line, separated by '\t'. Returns two lists of the
        same length: one containing the words and one containing the tags.
        """
        words, tags = [''], ['']
        # Start with an empty word/tag so all words that begin sentences follow a
        # '' word/tag and starting probabilities can be read from `markov['']`.
        with open(filename) as f:
            for line in f:
                if line != '\n':
                    split = line.strip().split('\t')
                    words.append(split[0])
                    tags.append(split[1])
                else:
                    # Blank lines separate sentences; mark the boundary with ''.
                    words.append('')
                    tags.append('')
        return words, tags

    def markov_dict(words):
        """ Generates a Markov chain dictionary from a list of states. The dictionary
        maps a state to a dictionary of its neighbors and their probabilities, e.g.
        {'GWV': {'lecture': 0.5, 'tutorial': 0.3, ...}, ...}
        """
        neighbors = collections.defaultdict(list)
        for i in range(len(words) - 1):
            word, neighbor = words[i], words[i + 1]
            neighbors[word].append(neighbor)
        return {word: probabilities_dict(successors)
                for word, successors in neighbors.items()}

    def emission_probabilities(pairs):
        """ Generates a dictionary of emission probabilities from a list of pairs
        (state, emission) of HMM states and their emissions.
        """
        emissions = collections.defaultdict(list)
        for state, emission in pairs:
            emissions[state].append(emission)
        return {state: probabilities_dict(emitted)
                for state, emitted in emissions.items()}

    def probabilities_dict(values):
        """ Helper function that generates a dictionary of probabilities from a list,
        e.g. ['a', 'a', 'b', 'c'] -> {'a': 0.5, 'b': 0.25, 'c': 0.25}
        """
        counts = collections.defaultdict(int)
        for value in values:
            counts[value] += 1
        return {key: count / len(values) for key, count in counts.items()}

    words, tags = words_and_tags_from_file(filename)
    hidden_markov = markov_dict(tags)
    emissions = emission_probabilities(zip(tags, words))
    return hidden_markov, emissions
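

# A minimal sketch of the expected training-file format: one tab-separated
# word/tag pair per line, with blank lines separating sentences. The tags
# below are hypothetical STTS examples, not taken from the actual file:
#
#     Dell\tNE
#     zeigte\tVVFIN
#     .\t$.
#
# For this toy corpus, train_hmm would return transitions such as
# {'': {'NE': 1.0}, 'NE': {'VVFIN': 1.0}, 'VVFIN': {'$.': 1.0}} and
# emissions such as {'NE': {'Dell': 1.0}, '$.': {'.': 1.0}, ...}.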


def hmm_viterbi(sentence, hidden_markov, emissions):
    """ Returns a list of states generated by the Viterbi algorithm. The list is the
    most probable sequence of HMM states (POS tags) for the sentence (emissions).
    """
    a = [{'': (1.0, None)}]  # maps each state to (probability, most probable predecessor)

    for k, word in enumerate(sentence):
        a.append({})
        for state in hidden_markov:
            paths = []
            for previous_state in a[k]:
                transition_probability = hidden_markov[previous_state].get(state, 0.0)
                # Handle unknown words: fall back to an emission probability
                # of 0.0001 for all states.
                emission_probability = emissions[state].get(word, 0.0001)
                paths.append((a[k][previous_state][0]
                              * transition_probability
                              * emission_probability,
                              previous_state))
            # Only save the probability and predecessor of the most probable path.
            a[k + 1][state] = max(paths, key=lambda path: path[0])

    # Backtrack the path, inserting each predecessor at the beginning of the tag
    # list. Assumes every sentence ends with sentence-final punctuation, which the
    # STTS tagset tags as '$.'.
    end = '$.'
    tags = [end]

    k = len(sentence)
    while k > 1:
        predecessor = a[k][tags[0]][1]
        tags.insert(0, predecessor)
        k -= 1

    return tags
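

# Usage sketch with a hypothetical toy model (not derived from any training
# data). At each step the algorithm maximizes
# a[k][previous] * P(state | previous) * P(word | state):
#
#     hmm = {'': {'NE': 1.0}, 'NE': {'$.': 1.0}, '$.': {'': 1.0}}
#     emissions = {'': {'': 1.0}, 'NE': {'Dell': 1.0}, '$.': {'.': 1.0}}
#     hmm_viterbi(['Dell', '.'], hmm, emissions)  # -> ['NE', '$.']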


def print_pos_tags(pairs):
    """ Helper function that prints words and their tags separated by a backslash,
    as required by this exercise.
    """
    output = ''
    for word, tag in pairs:
        output += word + '\\' + tag + ' '
    print(output)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        add_help=False,
        description='example: `python tagger.py Dell und Cisco zeigten sich '
                    'hingegen davon relativ unbeeindruckt .`')
    # --filename takes a single string; without nargs=1 the parsed value has
    # the same type as the default.
    parser.add_argument('--filename', '-f',
                        default='hdt-1-10000-train.tags',
                        help='The training file. Default: hdt-1-10000-train.tags')
    parser.add_argument('sentence', nargs='*', help='The sentence to be tagged.')
    args = parser.parse_args()

    if not args.sentence:
        parser.print_help()
        sys.exit(0)

    hidden_markov, emissions = train_hmm(args.filename)
    viterbi_tags = hmm_viterbi(args.sentence, hidden_markov, emissions)
    print_pos_tags(zip(args.sentence, viterbi_tags))

    sys.exit(0)
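
# Example invocation, assuming the default training file is present:
#
#     python tagger.py Dell und Cisco zeigten sich hingegen davon relativ unbeeindruckt .
#
# This trains on hdt-1-10000-train.tags and prints each word with its
# Viterbi tag, separated by a backslash.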