# Hyperskill Text Generation Pseudorandom

### PASSES SOMETIMES, AND NOT AT OTHER TIMES.
# TODO: REDO LOGIC FOR SELECTING END OF SENTENCES

import nltk
from collections import defaultdict
from collections import Counter
import random
import re


def initialise():
    """Download the NLTK ``punkt`` resource via ``nltk.download``."""
    nltk.download("punkt")


def main():
    """Read a corpus path from stdin, build the model and print generated text."""
    initialise()

    # The corpus file path is supplied on standard input.
    corpus_path = input()
    model = Corpus(corpus_path)

    # Alternative entry points, left disabled as in the original script:
    # model.report()
    # model.report_bigrams()
    # model.report_clean_bigrams()
    # model.print_10_tokens()
    model.print_pseudorandom_tokens_smarter()


class Corpus:
    """Bigram model of a text file, with reporting and text-generation helpers.

    The file is read as UTF-8, split into whitespace-delimited tokens, and
    turned into a ``head -> {tail: count}`` mapping that drives the generators.

    Attributes (all populated by ``__init__``):
        file_path: path the corpus was loaded from.
        raw_text: full contents of the file.
        corpus: list of tokens in file order.
        unique_corpus: set of distinct tokens.
        bigrams: list of ``[head, tail]`` pairs of adjacent tokens.
        clean_bigrams: ``head -> {tail: count}``; tails are ordered by
            descending count, ties broken by the token itself.
    """

    # A token that starts with an ASCII capital and does not end in sentence
    # punctuation.  The optional group lets one-letter words ("I", "A") match.
    _STARTING_WORD = re.compile(r"^[A-Z](?:.*[^.!?])?$")
    # A token whose last character is sentence-ending punctuation.
    _ENDING_WORD = re.compile(r".*[.!?]$")
    # Upper bound on sentence restarts, so an unusable corpus raises instead
    # of spinning forever.
    _MAX_RESTARTS = 1000

    def __init__(self, file_path="corpus.txt"):
        """Load ``file_path`` and build the token list and bigram tables."""
        self.file_path = file_path
        self.raw_text = self.load_file()
        self.corpus = self.tokenize_whitespace()
        self.unique_corpus = set(self.corpus)
        self.bigrams = self.generate_bigrams()
        self.clean_bigrams = self.generate_clean_bigrams()

    def tokenize_whitespace(self):
        """Return the runs of non-whitespace characters in ``raw_text``."""
        # Raw string: "\S" in a normal literal is an invalid escape sequence.
        return nltk.tokenize.regexp_tokenize(self.raw_text, r"\S+")

    def generate_bigrams(self):
        """Return every pair of adjacent tokens as a ``[head, tail]`` list."""
        return [[head, tail] for head, tail in zip(self.corpus, self.corpus[1:])]

    def load_file(self):
        """Return the whole content of ``file_path`` decoded as UTF-8."""
        with open(self.file_path, "r", encoding="utf-8") as file:
            return file.read()

    def generate_clean_bigrams(self):
        """Return ``head -> {tail: count}`` built from ``self.bigrams``.

        Heads appear in order of first occurrence.  Each inner dict is ordered
        by descending count, then by tail.  The result is a ``defaultdict`` so
        that indexing an unknown head keeps yielding an empty dict, as before.
        """
        follower_counts = defaultdict(Counter)
        for head, tail in self.bigrams:
            follower_counts[head][tail] += 1

        clean_bigrams = defaultdict(dict)
        for head, counts in follower_counts.items():
            clean_bigrams[head] = dict(
                sorted(counts.items(), key=lambda item: (-item[1], item[0]))
            )
        return clean_bigrams

    def report(self):
        """Print corpus statistics, then echo tokens by index until "exit".

        Each input line is parsed as an integer index into ``self.corpus``.
        Non-integer input and out-of-range indexes print an error message.
        Any other exception (for example ``EOFError``) propagates instead of
        being swallowed.
        """
        print("Corpus statistics: ")
        print(f"All tokens: {len(self.corpus)}")
        print(f"Unique tokens: {len(self.unique_corpus)}")

        while True:
            entry = input()
            if entry == "exit":
                return
            try:
                print(self.corpus[int(entry)])
            except ValueError:
                print("Type Error. Please input an integer.")
            except IndexError:
                print("Index Error. Please input an integer that is in the range of the corpus.")

    def report_bigrams(self):
        """Print the bigram count, then show bigrams by index until "exit".

        Raises:
            SystemExit: when the user enters "exit".
        """
        print(f"Number of bigrams: {len(self.bigrams)}")

        while True:
            entry = input()
            if entry == "exit":
                # Same effect as the site-provided exit(), without relying on
                # the interactive helper being installed.
                raise SystemExit
            try:
                head, tail = self.bigrams[int(entry)]
            except (TypeError, ValueError):
                # Test module expecting "TypError" not "Type Error"
                print("TypError. Please input an integer.")
            except IndexError:
                print("Index Error. Please input a value that is not greater than the number of all bigrams.")
            else:
                print(f"Head: {head}     Tail: {tail}")

    def report_clean_bigrams(self):
        """Print the tails and counts for each head typed in, until "exit".

        Raises:
            SystemExit: when the user enters "exit".
        """
        while True:
            head = input()
            if head == "exit":
                raise SystemExit

            print(f"Head: {head}")
            # .get() rather than indexing: indexing the defaultdict would add
            # an empty entry to the model for every unknown word queried.
            tails = self.clean_bigrams.get(head)
            if tails:
                for tail, count in tails.items():
                    print(f"Tail: {tail}    Count: {count}")
            else:
                print("The requested word is not in the model. Please input another word.")

    def print_10_tokens(self):
        """Print 10 lines of 10 tokens produced by a weighted random walk.

        Each line starts from a uniformly chosen token.  Every following token
        is drawn from the tails of the previous one, weighted by count.  When
        a token has no tails the walk continues from a fresh random token so
        that each line still has 10 tokens.

        Raises:
            ValueError: if the corpus has no tokens.
        """
        # random.sample() no longer accepts a set; a sorted list also makes
        # the output reproducible under a fixed random seed.
        vocabulary = sorted(self.unique_corpus)
        if not vocabulary:
            raise ValueError("cannot generate text from an empty corpus")

        for _ in range(10):
            word = random.choice(vocabulary)
            sentence = [word]

            for _ in range(9):
                tails = self.clean_bigrams.get(word)
                if tails:
                    word = random.choices(list(tails), list(tails.values()))[0]
                else:
                    word = random.choice(vocabulary)
                sentence.append(word)

            print(" ".join(sentence))

    def print_pseudorandom_tokens_smarter(self, sentence_count=10, min_length=5):
        """Print generated sentences that look like complete sentences.

        Each sentence starts with a token accepted by ``is_starting_word``,
        ends with the first token accepted by ``is_ending_word``, and has at
        least ``min_length`` tokens.

        Args:
            sentence_count: number of sentences to print.
            min_length: minimum number of tokens per sentence.

        Raises:
            ValueError: if the corpus has no starting word.
            RuntimeError: if no sentence could be completed after many restarts.

        Note:
            A walk only stops on an ending token, so the corpus is assumed to
            contain sentence-ending punctuation reachable from its starting
            words.
        """
        starting_words = sorted(
            word for word in self.unique_corpus if self.is_starting_word(word)
        )
        if not starting_words:
            raise ValueError("the corpus contains no valid starting word")

        for _ in range(sentence_count):
            print(" ".join(self._generate_sentence(starting_words, min_length)))

    def _generate_sentence(self, starting_words, min_length):
        """Return one sentence (list of tokens) by walking the bigram model."""
        for _ in range(self._MAX_RESTARTS):
            sentence = [random.choice(starting_words)]

            while True:
                tails = self.clean_bigrams.get(sentence[-1])
                if not tails:
                    break  # dead end: nothing ever follows this token

                # An ending token is acceptable only if appending it yields a
                # sentence of at least min_length tokens.  Filtering up front
                # gives the same distribution as re-rolling rejected draws,
                # without the risk of re-rolling forever.
                may_end = len(sentence) + 1 >= min_length
                candidates = [
                    (tail, count)
                    for tail, count in tails.items()
                    if may_end or not self.is_ending_word(tail)
                ]
                if not candidates:
                    break  # every continuation would end the sentence too early

                words, weights = zip(*candidates)
                next_word = random.choices(words, weights)[0]
                sentence.append(next_word)
                if self.is_ending_word(next_word):
                    return sentence
            # The walk hit a dead end: start over with a new starting word.

        raise RuntimeError(
            f"could not generate a sentence after {self._MAX_RESTARTS} attempts"
        )

    def is_starting_word(self, word):
        """Return True if ``word`` starts with A-Z and does not end in . ! or ?"""
        return bool(self._STARTING_WORD.match(word))

    def is_ending_word(self, word):
        """Return True if ``word`` ends with ``.``, ``!`` or ``?``."""
        return bool(self._ENDING_WORD.match(word))


# Run the program only when executed as a script, not when imported.
if __name__ == "__main__":
    main()