from random import choice
from nltk import word_tokenize
import argparse
import sys

def get_counts(context_length, training_text):
    """Map each context_length-token context to a dict of next-token frequencies."""
    counts = {}
    tokens = word_tokenize(training_text)
    for i in range(len(tokens) - context_length):
        context = tuple(tokens[i:i + context_length])
        next_token = tokens[i + context_length]
        # Add 1 to the frequency, or create a new entry for this context tuple
        if context in counts:
            counts[context][next_token] = counts[context].get(next_token, 0) + 1
        else:
            counts[context] = {next_token: 1}
    return counts
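
# A quick illustration of the counts structure (hypothetical input): with
# context_length=2 and training_text "the cat sat on the mat", get_counts
# returns
#   {('the', 'cat'): {'sat': 1}, ('cat', 'sat'): {'on': 1},
#    ('sat', 'on'): {'the': 1}, ('on', 'the'): {'mat': 1}}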

def generate_from_file(context_length, training_file, my_starter, output_length=60):
    """Generate output_length tokens, always taking the most frequent
    continuation of the current context (greedy, so output is deterministic)."""
    # Read the training file
    with open(training_file, 'r') as f:
        training_data = f.read()
    counts = get_counts(context_length, training_data)
    prefix = ""
    if my_starter is not None:
        first_tokens = tuple(my_starter)
    else:
        first_tokens = choice(list(counts.keys()))  # Choose a random first context
    if first_tokens not in counts:
        # Unseen starter: emit it as its own sentence, then restart from a random context
        prefix = " ".join(first_tokens) + ". "
        first_tokens = choice(list(counts.keys()))
    output_list = list(first_tokens)
    current_context = first_tokens
    for _ in range(output_length):
        # Pick the most frequent next token for this context
        next_token = max(counts[current_context], key=counts[current_context].get)
        output_list.append(next_token)
        # Slide the context window forward by one token
        current_context = current_context[1:] + (next_token,)
    print(prefix + " ".join(output_list))
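
# Direct-call sketch (hypothetical corpus path "corpus.txt"): seed with a
# tokenized two-token starter, or pass None to start from a random context.
#   generate_from_file(2, "corpus.txt", word_tokenize("the cat"))
#   generate_from_file(2, "corpus.txt", None)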

parser = argparse.ArgumentParser()
parser.add_argument("file")
parser.add_argument("-n", action="store", default="2")
parser.add_argument("--starter", action="store")
options = parser.parse_args()
number = int(options.n)
starter = options.starter
if starter is not None:
    starter = word_tokenize(starter)
    if len(starter) != number:
        # The starter must supply exactly one full context of n tokens
        print("Starter must be exactly {} tokens long.".format(number))
        sys.exit(1)
generate_from_file(number, options.file, starter)
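
# Usage sketch (assuming this paste is saved as markov.py and NLTK's 'punkt'
# tokenizer data is installed):
#   python markov.py corpus.txt -n 2 --starter "the cat"
#   python markov.py corpus.txt -n 3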