Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- """vocabulary class for an image-to-text model"""
- from __future__ import division
- from __future__ import absolute_import
- from __future__ import print_function
- import tensorflow as tf
class Vocabulary(object):
    """Vocabulary for an image-to-text model.

    Holds a two-way mapping between word strings and integer word ids,
    plus the ids of the special start, end and unknown words.
    """

    def __init__(self,
                 vocab_file,
                 start_word="<S>",
                 end_word="</S>",
                 unk_word="<UNK>"):
        """Initializes the vocabulary from a file.

        Args:
            vocab_file: File containing the vocabulary, where the words are
                the first whitespace-separated token on each line (other
                tokens are ignored) and the word ids are the corresponding
                line numbers (0-based, in file order).
            start_word: Special word denoting sentence start. Must be present
                in the file.
            end_word: Special word denoting sentence end. Must be present in
                the file.
            unk_word: Special word denoting unknown words. Appended with the
                next free id when the file does not contain it.

        Raises:
            AssertionError: If start_word or end_word is not in the file.
        """
        if not tf.gfile.Exists(vocab_file):
            # NOTE(review): whether a fatal-level log stops the program
            # depends on the logging backend -- confirm. If it does not, the
            # GFile read below is what fails for a missing file.
            tf.logging.fatal("vocab file %s not found", vocab_file)
        tf.logging.info("initializing vocabulary from file %s", vocab_file)

        with tf.gfile.GFile(vocab_file, mode="r") as f:
            lines = f.readlines()

        # A trailing blank line would make `split()[0]` raise IndexError.
        # Only trailing blanks are dropped, so the id of every real word
        # still equals its line number in the file.
        while lines and not lines[-1].strip():
            lines.pop()

        reverse_vocab = [line.split()[0] for line in lines]

        # Kept as asserts so the exception type seen by callers is unchanged.
        assert start_word in reverse_vocab, (
            "start word %r not found in vocab file" % (start_word,))
        assert end_word in reverse_vocab, (
            "end word %r not found in vocab file" % (end_word,))
        if unk_word not in reverse_vocab:
            reverse_vocab.append(unk_word)

        vocab = {word: word_id for word_id, word in enumerate(reverse_vocab)}
        tf.logging.info("created vocabulary with %d words", len(vocab))

        self.vocab = vocab  # vocab[word] = id
        self.reverse_vocab = reverse_vocab  # reverse_vocab[id] = word

        # Save special word ids.
        self.start_id = vocab[start_word]
        self.end_id = vocab[end_word]
        self.unk_id = vocab[unk_word]

    def word_to_id(self, word):
        """Returns the integer word id of a word string.

        Words that are not in the vocabulary map to the unknown-word id.
        """
        return self.vocab.get(word, self.unk_id)

    def id_to_word(self, word_id):
        """Returns the word string of an integer word id.

        Ids outside [0, vocabulary size) map to the unknown word. Negative
        ids are rejected explicitly; otherwise Python's negative indexing
        would silently return a word counted from the end of the list.
        """
        if word_id < 0 or word_id >= len(self.reverse_vocab):
            return self.reverse_vocab[self.unk_id]
        return self.reverse_vocab[word_id]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement