Untitled

import numpy as np
import tensorflow as tf
from tensorflow.contrib import rnn
import random
#"abc" => "abc"
#"aabbac" => "abc"
#"abacd" => "abcd"
MAX_LENGTH = 6 # Max length of 6
chars = ["a", "b", "c", "d", "e", "f"]
all_chars = chars + [' '] # Space for padding
NUM_EXAMPLES = 50000
# Args:
#   n: number of examples to generate
# Returns:
#   strings: list of strings that may contain duplicates
#   solutions: strings without duplicates
#   strings_v: One hot encoding of strings with duplicates (without padding)
#   solutions_v: One hot encoding of solutions (with padding)
def generate_data(n=NUM_EXAMPLES):
    all_chars_to_idx = { c:i for i, c in enumerate(all_chars) }
    strings_v = np.zeros((NUM_EXAMPLES, MAX_LENGTH, len(all_chars)))
    solutions_v = np.zeros((NUM_EXAMPLES, MAX_LENGTH, len(all_chars)))

    strings = [''] * NUM_EXAMPLES
    solutions = [''] * NUM_EXAMPLES

    for i in range(NUM_EXAMPLES):
        for l in range(MAX_LENGTH):
            char = random.choice(chars) # only sample from valid characters
            strings[i] += char
            if char not in solutions[i]:
                solutions[i] += char

        # Pad solutions strings
        num_missing = MAX_LENGTH - len(solutions[i])
        solutions[i] += ' ' * num_missing

    for x in range(len(strings)):
        for y in range(MAX_LENGTH):
            string_char = strings[x][y]
            strings_v[x][y][all_chars_to_idx[string_char]] = 1

            solution_char = solutions[x][y]
            solutions_v[x][y][all_chars_to_idx[solution_char]] = 1

    return strings, solutions, strings_v, solutions_v