'''
Tools to take a directory of txt files and convert them to TF records
'''
from collections import defaultdict, Counter

import numpy as np
import tensorflow as tf

PAD = "<PAD>"
EOS = "<EOS>"


class Preppy:
    '''
    Converts text inputs to numpy arrays of token ids.
    Ids are assigned to tokens sequentially, on the fly.
    '''

    def __init__(self, tokenizer_fn):
        # Unseen tokens get the next free id the first time they are looked up
        self.vocab = defaultdict(lambda: self.next_value())
        self.word_count = Counter()
        self.vocab[PAD] = 0
        self.vocab[EOS] = 1
        self.next = 1  # last id handed out; the first real token gets id 2
        self.tokenizer = tokenizer_fn
        self.reverse_vocab = {}

    def next_value(self):
        self.next += 1
        return self.next

    def convert_token_to_id(self, token):
        self.word_count[token] += 1
        return self.vocab[token]
    def sentance_to_tokens(self, sent):
        return self.tokenizer(sent)

    def tokens_to_id_list(self, tokens):
        return list(map(self.convert_token_to_id, tokens))

    def sentance_to_id_list(self, sent):
        tokens = self.sentance_to_tokens(sent)
        id_list = self.tokens_to_id_list(tokens)
        return id_list

    def sentance_to_numpy_array(self, sent):
        id_list = self.sentance_to_id_list(sent)
        return np.array(id_list)

    def update_reverse_vocab(self):
        self.reverse_vocab = {id_: token for token, id_ in self.vocab.items()}

    def id_list_to_text(self, id_list):
        tokens = ''.join(self.reverse_vocab[id_] for id_ in id_list)
        return tokens

    def sequence_to_tf_example(self, sequence):
        id_list = self.sentance_to_id_list(sequence)
        ex = tf.train.SequenceExample()
        # A non-sequential (context) feature of our example: the number of tokens
        sequence_length = len(id_list)
        ex.context.feature["length"].int64_list.value.append(sequence_length)
        # Feature list for the single sequential feature of our example: the token ids
        fl_tokens = ex.feature_lists.feature_list["tokens"]
        for token in id_list:
            fl_tokens.feature.add().int64_list.value.append(token)
        return ex

    @staticmethod
    def parse(ex):
        '''
        Explain to TF how to go from a serialized example back to tensors
        :param ex: a serialized tf.train.SequenceExample
        :return: a dict of tensors, {"seq": token ids, "length": number of tokens}
        '''
        context_features = {
            "length": tf.FixedLenFeature([], dtype=tf.int64)
        }
        sequence_features = {
            "tokens": tf.FixedLenSequenceFeature([], dtype=tf.int64),
        }

        # Parse the example (returns a dictionary of tensors)
        context_parsed, sequence_parsed = tf.parse_single_sequence_example(
            serialized=ex,
            context_features=context_features,
            sequence_features=sequence_features
        )
        return {"seq": sequence_parsed["tokens"], "length": context_parsed["length"]}
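

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original paste): one way to turn a directory
# of txt files into a TFRecord file with Preppy and read the records back
# through tf.data. The "data/*.txt" path, the output name "sentences.tfrecord",
# the character-level tokenizer (list) and the batch size are illustrative
# assumptions; adapt them to your own setup. The TF 1.x API is used to match
# the code above.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import glob

    preppy = Preppy(tokenizer_fn=list)  # character-level tokenization (assumed)

    # Write one serialized SequenceExample per line of every txt file
    with tf.python_io.TFRecordWriter("sentences.tfrecord") as writer:
        for path in glob.glob("data/*.txt"):  # hypothetical input directory
            with open(path) as f:
                for line in f:
                    ex = preppy.sequence_to_tf_example(line.strip())
                    writer.write(ex.SerializeToString())

    # Read the records back, parse them with Preppy.parse, and pad each batch
    # to the longest sequence in that batch
    dataset = (tf.data.TFRecordDataset("sentences.tfrecord")
               .map(Preppy.parse)
               .padded_batch(8, padded_shapes={"seq": tf.TensorShape([None]),
                                               "length": tf.TensorShape([])}))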