Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import numpy as np
- import tensorflow as tf
- from tensorflow.contrib import rnn
- import random
- #"abc" => "abc"
- #"aabbac" => "abc"
- #"abacd" => "abcd"
MAX_LENGTH = 6  # Every generated input string has exactly this many characters
chars = ["a", "b", "c", "d", "e", "f"]  # Alphabet the input strings are sampled from
all_chars = chars + [' ']  # Space is appended so padded solutions can be encoded
NUM_EXAMPLES = 50000  # Default number of examples produced by generate_data


def generate_data(n=NUM_EXAMPLES):
    """Generate random strings together with their duplicate-free versions.

    Each input string consists of MAX_LENGTH characters drawn from ``chars``.
    Its solution keeps only the first occurrence of every character, in
    order of appearance (e.g. "aabbac" => "abc"), and is right-padded with
    spaces up to MAX_LENGTH.

    Args:
        n: number of examples to generate (defaults to NUM_EXAMPLES).

    Returns:
        A tuple ``(strings, solutions, strings_v, solutions_v)``:
        strings: list of n strings that may contain duplicate characters.
        solutions: list of n space-padded strings without duplicates.
        strings_v: one-hot encoding of ``strings`` with shape
            (n, MAX_LENGTH, len(all_chars)); the padding slot is never set
            because inputs always have full length.
        solutions_v: one-hot encoding of ``solutions`` (padding included),
            same shape as ``strings_v``.
    """
    all_chars_to_idx = {c: i for i, c in enumerate(all_chars)}
    # Size everything from `n`; the original ignored the argument and always
    # used NUM_EXAMPLES, so callers could not request a smaller data set.
    strings_v = np.zeros((n, MAX_LENGTH, len(all_chars)))
    solutions_v = np.zeros((n, MAX_LENGTH, len(all_chars)))
    strings = []
    solutions = []
    for x in range(n):
        sampled = []
        unique = []
        for _ in range(MAX_LENGTH):
            char = random.choice(chars)  # only sample from valid characters
            sampled.append(char)
            if char not in unique:
                unique.append(char)
        string = ''.join(sampled)
        # Pad the solution with spaces so it lines up with the input length.
        solution = ''.join(unique).ljust(MAX_LENGTH)
        strings.append(string)
        solutions.append(solution)

        for y in range(MAX_LENGTH):
            strings_v[x, y, all_chars_to_idx[string[y]]] = 1
            solutions_v[x, y, all_chars_to_idx[solution[y]]] = 1
    return strings, solutions, strings_v, solutions_v
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement