#!/usr/bin/env python3
import argparse
import collections
import sys


def train_hmm(filename):
    """ Trains a Hidden Markov Model with data from a text file. Returns a Markov
    dictionary (see `markov_dict`) and a dictionary of emission probabilities.
    """

    def words_and_tags_from_file(filename):
        """ Reads words and POS tags from a text file. The file must contain a word
        and its POS tag on each line, separated by '\t'. Returns two lists of the
        same length: one containing the words and one containing the tags.
        """
        # Start with an empty word/tag so every word that begins a sentence
        # follows a '' word/tag; starting probabilities are then `markov['']`.
        words, tags = [''], ['']
        with open(filename) as file:
            for line in file:
                if line != '\n':
                    split = line.strip().split('\t')
                    words.append(split[0])
                    tags.append(split[1])
                else:
                    words.append('')
                    tags.append('')
        return words, tags
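
    # A sketch of the expected input (hypothetical file contents):
    #     Der<TAB>ART
    #     Hund<TAB>NN
    #     bellt<TAB>VVFIN
    #     .<TAB>$.
    #     <blank line>
    # would yield words = ['', 'Der', 'Hund', 'bellt', '.', ''] and
    # tags = ['', 'ART', 'NN', 'VVFIN', '$.', ''].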

    def markov_dict(words):
        """ Generates a Markov chain dictionary from a list of states. The dictionary
        maps a state to a dictionary of its neighbors and their probabilities, e.g.
        {'GWV': {'lecture': 0.5, 'tutorial': 0.3, ...}, ...}
        """
        neighbors = collections.defaultdict(list)
        for i in range(len(words) - 1):
            word, neighbor = words[i], words[i + 1]
            neighbors[word].append(neighbor)
        return {word: probabilities_dict(successors)
                for word, successors in neighbors.items()}
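
    # Worked example (hypothetical tag sequence): for
    #     tags = ['', 'ART', 'NN', '$.', '']
    # markov_dict(tags) returns
    #     {'': {'ART': 1.0}, 'ART': {'NN': 1.0}, 'NN': {'$.': 1.0}, '$.': {'': 1.0}}
    # i.e. every observed transition here occurs exactly once, so each
    # probability is 1.0.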

    def emission_probabilities(pairs):
        """ Generates a dictionary of emission probabilities from a list of pairs
        (state, emission) of HMM states and their emissions.
        """
        emissions = collections.defaultdict(list)
        for state, emission in pairs:
            emissions[state].append(emission)
        return {state: probabilities_dict(values)
                for state, values in emissions.items()}
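
    # Worked example (hypothetical pairs): for
    #     pairs = [('ART', 'Der'), ('ART', 'Die'), ('NN', 'Hund')]
    # emission_probabilities(pairs) returns
    #     {'ART': {'Der': 0.5, 'Die': 0.5}, 'NN': {'Hund': 1.0}}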

    def probabilities_dict(values):
        """ Helper function that generates a dictionary of probabilities from a list,
        e.g. ['a', 'a', 'b', 'c'] -> {'a': 0.5, 'b': 0.25, 'c': 0.25}
        """
        counts = collections.defaultdict(int)
        for value in values:
            counts[value] += 1
        return {key: count / len(values) for key, count in counts.items()}

    words, tags = words_and_tags_from_file(filename)
    hidden_markov = markov_dict(tags)
    emissions = emission_probabilities(zip(tags, words))
    return hidden_markov, emissions
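
# Example usage (with the exercise's default training file):
#     hidden_markov, emissions = train_hmm('hdt-1-10000-train.tags')
#     hidden_markov['']  # tag probabilities at sentence starts
#     emissions['NN']    # word probabilities for an (assumed) 'NN' tag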


def hmm_viterbi(sentence, hidden_markov, emissions):
    """ Returns a list of states generated by the Viterbi algorithm. The list is
    the most probable sequence of HMM states (POS tags) for the sentence
    (emissions).
    """
    # a[k] maps each state to a tuple of the probability of the best path that
    # ends in that state after k words, and the predecessor state on that path.
    a = [{'': (1.0, None)}]
    for k, word in enumerate(sentence):
        a.append({})
        for state in hidden_markov:
            paths = []
            for previous_state in a[k]:
                transition_probability = hidden_markov[previous_state].get(state, 0.0)
                # Handle unknown words: fall back to an emission probability
                # of 0.0001 for all states.
                emission_probability = emissions[state].get(word, 0.0001)
                paths.append((a[k][previous_state][0]
                              * transition_probability
                              * emission_probability,
                              previous_state))
            # Only keep the probability and predecessor of the most probable path.
            a[k + 1][state] = max(paths, key=lambda path: path[0])
    # Backtrack the path: insert each predecessor at the front of the tag list.
    end = '$.'  # assumes every training sentence ends with the '$.' tag
    tags = [end]
    k = len(sentence)
    while k > 1:
        predecessor = a[k][tags[0]][1]
        tags.insert(0, predecessor)
        k -= 1
    return tags
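
# Worked example (hypothetical toy model): with
#     hidden_markov = {'': {'ART': 1.0}, 'ART': {'NN': 1.0},
#                      'NN': {'$.': 1.0}, '$.': {'': 1.0}}
#     emissions = {'': {'': 1.0}, 'ART': {'Der': 1.0},
#                  'NN': {'Hund': 1.0}, '$.': {'.': 1.0}}
# the call hmm_viterbi(['Der', 'Hund', '.'], hidden_markov, emissions)
# returns ['ART', 'NN', '$.'].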


def print_pos_tags(pairs):
    """ Helper function that prints words and their tags separated by a backslash,
    as required by this exercise.
    """
    output = ''
    for word, tag in pairs:
        output += word + '\\' + tag + ' '
    print(output)
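
# Example: print_pos_tags([('Der', 'ART'), ('Hund', 'NN')])
# prints 'Der\ART Hund\NN '.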


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        add_help=False,
        description='example: `python tagger.py Dell und Cisco zeigten sich '
                    'hingegen davon relativ unbeeindruckt .`')
    # No `nargs=1` here: it would make args.filename a one-element list while
    # the default stays a plain string, so the option takes a single value.
    parser.add_argument('--filename', '-f', default='hdt-1-10000-train.tags',
                        help='The training file. Default: hdt-1-10000-train.tags')
    parser.add_argument('sentence', nargs='*', help='The sentence to be tagged.')
    args = parser.parse_args()
    if not args.sentence:
        parser.print_help()
        sys.exit(0)
    hidden_markov, emissions = train_hmm(args.filename)
    viterbi_tags = hmm_viterbi(args.sentence, hidden_markov, emissions)
    print_pos_tags(zip(args.sentence, viterbi_tags))
    sys.exit(0)
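
# Usage examples (assuming the default training file is present):
#     python tagger.py Dell und Cisco zeigten sich hingegen davon relativ unbeeindruckt .
#     python tagger.py -f hdt-1-10000-train.tags Der Hund bellt .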