Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ### PASSES SOMETIMES, AND NOT AT OTHER TIMES.
- # TODO: REDO LOGIC FOR SELECTING END OF SENTENCES
- import nltk
- from collections import defaultdict
- from collections import Counter
- import random
- import re
def initialise():
    """Fetch the NLTK 'punkt' resource by calling ``nltk.download``."""
    resource_name = 'punkt'
    nltk.download(resource_name)
def main():
    """Run the generator: set up NLTK, load a corpus, print sentences.

    The corpus file path is read as a single line from standard input.
    """
    initialise()
    corpus_path = input()
    model = Corpus(corpus_path)
    # Other reports available on the model: report(), report_bigrams(),
    # report_clean_bigrams() and print_10_tokens().
    model.print_pseudorandom_tokens_smarter()
class Corpus:
    """Bigram (Markov-chain) model built from a whitespace-tokenized text file.

    Everything is computed eagerly in ``__init__``:

    * ``raw_text`` -- the full contents of the file at ``file_path``.
    * ``corpus`` -- list of whitespace-separated tokens, in file order.
    * ``unique_corpus`` -- set of the distinct tokens.
    * ``bigrams`` -- list of ``[head, tail]`` pairs of adjacent tokens.
    * ``clean_bigrams`` -- mapping ``head -> {tail: count}``; each inner dict
      is ordered by descending count, ties broken alphabetically.
    """

    # A sentence printed by print_pseudorandom_tokens_smarter() has at least
    # this many tokens.
    _MIN_SENTENCE_TOKENS = 5
    # Safety bounds so a corpus without usable sentence endings cannot make
    # sentence generation loop forever.
    _MAX_SENTENCE_TOKENS = 1000
    _MAX_ATTEMPTS = 1000

    # Starts with an ASCII capital and does not end with ., ! or ?.
    # A single capital letter on its own ("I", "A") qualifies.
    _STARTING_WORD = re.compile(r"^[A-Z](?:.*[^.!?])?$")
    # Ends with ., ! or ?.
    _ENDING_WORD = re.compile(r".*[.!?]$")

    def __init__(self, file_path="corpus.txt"):
        """Load ``file_path`` and build the token list and bigram tables."""
        self.file_path = file_path
        self.raw_text = self.load_file()
        self.corpus = self.tokenize_whitespace()
        self.unique_corpus = set(self.corpus)
        self.bigrams = self.generate_bigrams()
        self.clean_bigrams = self.generate_clean_bigrams()

    def tokenize_whitespace(self):
        """Return the list of maximal non-whitespace runs in ``raw_text``."""
        return nltk.tokenize.regexp_tokenize(self.raw_text, r"\S+")

    def generate_bigrams(self):
        """Return ``[head, tail]`` for every pair of adjacent tokens."""
        return [[head, tail] for head, tail in zip(self.corpus, self.corpus[1:])]

    def load_file(self):
        """Return the whole file at ``file_path`` decoded as UTF-8."""
        with open(self.file_path, "r", encoding="utf-8") as file:
            return file.read()

    def generate_clean_bigrams(self):
        """Group ``bigrams`` by head and count how often each tail follows it.

        Returns:
            A ``defaultdict(dict)`` mapping each head to ``{tail: count}``,
            with tails ordered by descending count and then alphabetically.
        """
        tails_by_head = defaultdict(list)
        for head, tail in self.bigrams:
            tails_by_head[head].append(tail)

        clean_bigrams = defaultdict(dict)
        for head, tails in tails_by_head.items():
            ranked = sorted(Counter(tails).items(),
                            key=lambda item: (-item[1], item[0]))
            clean_bigrams[head] = dict(ranked)
        return clean_bigrams

    def report(self):
        """Print corpus statistics, then echo tokens by index until "exit".

        Each input line is parsed as an integer index into ``corpus``.
        Invalid input prints an error message and the loop continues.
        """
        print("Corpus statistics: ")
        print(f"All tokens: {len(self.corpus)}")
        print(f"Unique tokens: {len(self.unique_corpus)}")
        while True:
            query = input()
            if query == "exit":
                return
            try:
                print(self.corpus[int(query)])
            except ValueError:
                print("Type Error. Please input an integer.")
            except IndexError:
                print("Index Error. Please input an integer that is in the range of the corpus.")

    def report_bigrams(self):
        """Print the bigram count, then show bigrams by index until "exit".

        Raises:
            SystemExit: when the user enters "exit" (terminates the program).
        """
        print(f"Number of bigrams: {len(self.bigrams)}")
        while True:
            query = input()
            if query == "exit":
                # Same effect as the interactive-only exit() helper, without
                # depending on the site module having injected it.
                raise SystemExit
            try:
                head, tail = self.bigrams[int(query)]
            except (TypeError, ValueError):
                # Test module expecting "TypError" not "Type Error"
                print("TypError. Please input an integer.")
            except IndexError:
                print("Index Error. Please input a value that is not greater than the number of all bigrams.")
            else:
                print(f"Head: {head} Tail: {tail}")

    def report_clean_bigrams(self):
        """Print the tails and counts for each head entered, until "exit".

        Raises:
            SystemExit: when the user enters "exit" (terminates the program).
        """
        while True:
            head = input()
            if head == "exit":
                raise SystemExit
            print(f"Head: {head}")
            # .get() rather than indexing: indexing the defaultdict would
            # insert an empty entry for every unknown word that is queried.
            tails = self.clean_bigrams.get(head)
            if tails:
                for tail, count in tails.items():
                    print(f"Tail: {tail} Count: {count}")
            else:
                print("The requested word is not in the model. Please input another word.")

    def print_10_tokens(self):
        """Print 10 lines of 10 tokens each, chained through the bigram model.

        Each line starts from a uniformly random token; every following token
        is drawn from the previous token's tails, weighted by count. When a
        token has no tails (e.g. the final token of the corpus) the chain
        continues from a fresh uniformly random token.
        """
        # random.sample()/choice() need a sequence, not a set; sorting also
        # makes the output reproducible for a given random seed.
        vocabulary = sorted(self.unique_corpus)
        if not vocabulary:
            return
        for _ in range(10):
            word = random.choice(vocabulary)
            sentence = [word]
            for _ in range(9):
                word = self._pick_tail(word)
                if word is None:
                    word = random.choice(vocabulary)
                sentence.append(word)
            print(" ".join(sentence))

    def print_pseudorandom_tokens_smarter(self):
        """Print 10 generated sentences.

        Every sentence starts with a word accepted by ``is_starting_word``,
        has at least five tokens, has no sentence-ending word among its first
        four tokens, and stops at the first sentence-ending word from the
        fifth token on.

        Raises:
            ValueError: if the corpus contains no starting word.
            RuntimeError: if no complete sentence can be generated.
        """
        starting_words = sorted(
            word for word in self.unique_corpus if self.is_starting_word(word)
        )
        if not starting_words:
            raise ValueError("The corpus contains no word that can start a sentence.")
        for _ in range(10):
            print(" ".join(self._generate_sentence(starting_words)))

    def _generate_sentence(self, starting_words):
        """Return one sentence (list of tokens) beginning at a random start word."""
        for _ in range(self._MAX_ATTEMPTS):
            sentence = [random.choice(starting_words)]
            while len(sentence) < self._MAX_SENTENCE_TOKENS:
                # The next token may end the sentence only if it brings the
                # sentence up to the minimum length.
                may_end = len(sentence) + 1 >= self._MIN_SENTENCE_TOKENS
                word = self._pick_tail(sentence[-1], allow_ending=may_end)
                if word is None:
                    # Dead end: no admissible tail, so start over.
                    break
                sentence.append(word)
                if may_end and self.is_ending_word(word):
                    return sentence
        raise RuntimeError("Could not generate a complete sentence from this corpus.")

    def _pick_tail(self, head, allow_ending=True):
        """Return a count-weighted random tail of ``head``, or None if there is none.

        With ``allow_ending=False`` sentence-ending tails are excluded.
        """
        tails = self.clean_bigrams.get(head) or {}
        if not allow_ending:
            tails = {tail: count for tail, count in tails.items()
                     if not self.is_ending_word(tail)}
        if not tails:
            return None
        return random.choices(list(tails), weights=list(tails.values()))[0]

    def is_starting_word(self, word):
        """Return True if ``word`` starts with A-Z and does not end with . ! or ?."""
        return bool(self._STARTING_WORD.match(word))

    def is_ending_word(self, word):
        """Return True if ``word`` ends with . ! or ?."""
        return bool(self._ENDING_WORD.match(word))
# Run the generator only when executed as a script, so importing this module
# does not trigger the NLTK download or block on input().
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment