Guest User

Hyperskill Text Generation Pseudorandom

a guest
Mar 27th, 2021
383
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.04 KB | None | 0 0
  1. ### PASSES SOMETIMES, AND NOT OTHERTIMES.
  2. #TODO REDO LOGIC FOR SELECTING END OF SENTENCES
  3.  
  4. import nltk
  5. from collections import defaultdict
  6. from collections import Counter
  7. import random
  8. import re
  9.  
  10.  
  11. def initialise():
  12.     nltk.download('punkt')
  13.  
  14.  
  15. def main():
  16.     initialise()
  17.     corpus = Corpus(input())
  18.     # corpus.report()
  19.     # corpus.report_bigrams()
  20.     # corpus.report_clean_bigrams()
  21.     # corpus.print_10_tokens()
  22.     corpus.print_pseudorandom_tokens_smarter()
  23.  
  24.  
  25. class Corpus:
  26.     def __init__(self, file_path="corpus.txt"):
  27.         self.file_path = file_path
  28.         self.raw_text = self.load_file()
  29.         self.corpus = self.tokenize_whitespace()
  30.         self.unique_corpus = set(self.corpus)
  31.         self.bigrams = self.generate_bigrams()
  32.         self.clean_bigrams = self.generate_clean_bigrams()
  33.  
  34.     def tokenize_whitespace(self):
  35.         return nltk.tokenize.regexp_tokenize(self.raw_text, "\S+")
  36.  
  37.     def generate_bigrams(self):
  38.         bigrams = []
  39.  
  40.         for i, token in enumerate(self.corpus):
  41.             if i != 0:
  42.                 bigrams.append([self.corpus[i - 1], token])
  43.  
  44.         return bigrams
  45.  
  46.     def load_file(self):
  47.  
  48.         text = ""
  49.  
  50.         with open(self.file_path, "r", encoding="utf-8") as file:
  51.             for line in file:
  52.                 text += line
  53.  
  54.         return text
  55.  
  56.     def generate_clean_bigrams(self):
  57.         clean_bigrams_setup = defaultdict(list)
  58.         clean_bigrams = defaultdict(dict)
  59.  
  60.         for bigram in self.bigrams:
  61.             clean_bigrams_setup[bigram[0]].append(bigram[1])
  62.  
  63.         for key, value in clean_bigrams_setup.items():
  64.             clean_bigrams[key] = Counter(value)
  65.             clean_bigrams[key] = {key2: value2 for (key2, value2) in
  66.                                   sorted(clean_bigrams[key].items(), key=lambda x: (-x[1], x[0]))}
  67.  
  68.         return clean_bigrams
  69.  
  70.     def report(self):
  71.         print("Corpus statistics: ")
  72.         print(f"All tokens: {len(self.corpus)}")
  73.         print(f"Unique tokens: {len(self.unique_corpus)}")
  74.  
  75.         i = 0
  76.         while i != "exit":
  77.             try:
  78.                 i = input()
  79.                 index = int(i)
  80.                 print(self.corpus[index])
  81.             except Exception as e:
  82.                 if i != 'exit':
  83.                     if type(e).__name__ == "ValueError":
  84.                         print("Type Error. Please input an integer.")
  85.                     elif type(e).__name__ == "IndexError":
  86.                         print("Index Error. Please input an integer that is in the range of the corpus.")
  87.  
  88.     def report_bigrams(self):
  89.         print(f"Number of bigrams: {len(self.bigrams)}")
  90.         head = ""
  91.         tail = ""
  92.  
  93.         while True:
  94.             try:
  95.                 i = input()
  96.                 if i == "exit":
  97.                     exit()
  98.                 head, tail = self.bigrams[int(i)]
  99.             except (TypeError, ValueError):
  100.                 # Test module expecting "TypError" not "Type Error"
  101.                 print("TypError. Please input an integer.")
  102.             except IndexError:
  103.                 print("Index Error. Please input a value that is not greater than the number of all bigrams.")
  104.             else:
  105.                 print(f"Head: {head}     Tail: {tail}")
  106.  
  107.     def report_clean_bigrams(self):
  108.         while True:
  109.  
  110.             head = input()
  111.             if head == "exit":
  112.                 exit()
  113.             else:
  114.                 print(f"Head: {head}")
  115.                 if len(self.clean_bigrams[head].items()):
  116.                     for key, value in self.clean_bigrams[head].items():
  117.                         print(f"Tail: {key}    Count: {value}")
  118.                 else:
  119.                     print("The requested word is not in the model. Please input another word.")
  120.  
  121.     def print_10_tokens(self):
  122.  
  123.         for i in range(10):
  124.             word = random.sample(self.unique_corpus, 1)[0]
  125.             sentence = [word]
  126.  
  127.             for i in range(9):
  128.  
  129.                 tails_of_word = self.clean_bigrams.get(word)
  130.  
  131.                 tails = []
  132.                 weights = []
  133.  
  134.                 for key, value in tails_of_word.items():
  135.                     tails.append(key)
  136.                     weights.append(value)
  137.  
  138.                 word = random.choices(tails, weights)
  139.                 word = word[0]
  140.  
  141.                 sentence.append(word)
  142.  
  143.             print(" ".join(sentence))
  144.  
  145.     def print_pseudorandom_tokens_smarter(self):
  146.  
  147.         starting_words = [word for word in self.unique_corpus if self.is_starting_word(word)]
  148.  
  149.         for i in range(10):
  150.             # print(f"Generating sentence {i}")
  151.  
  152.             sentence = [random.sample(starting_words, 1)[0]]
  153.  
  154.             # print("sentence =", sentence)
  155.  
  156.             while True:
  157.                 tails_of_word = self.clean_bigrams.get(sentence[-1])
  158.  
  159.                 tails = []
  160.                 weights = []
  161.  
  162.                 for key, value in tails_of_word.items():
  163.                     tails.append(key)
  164.                     weights.append(value)
  165.  
  166.                 tmp_word = random.choices(tails, weights)[0]
  167.                 # print("tmp word =", tmp_word)
  168.  
  169.                 sentence.append(tmp_word)
  170.                 # print(sentence[-1])
  171.                 # print('tails = ', tails)
  172.                 if self.is_ending_word(sentence[-1]) and len(sentence) >= 5:
  173.                     break
  174.                 elif self.is_ending_word(sentence[-1]):
  175.                     tails.remove(tmp_word)
  176.                     sentence.pop(-1)
  177.                     if len(tails) == 0:
  178.                         if len(sentence) <= 1:
  179.                             sentence = [random.sample(starting_words, 1)[0]]
  180.                         else:
  181.                             sentence.pop(-1)
  182.  
  183.             print(" ".join(sentence))
  184.  
  185.     def is_starting_word(self, word):
  186.         starting_word_template = "^[A-Z]+[^.!?]$"
  187.         return bool(re.match(starting_word_template, word))
  188.  
  189.     def is_ending_word(self, word):
  190.         last_word_template = ".*[.!?]$"
  191.         return bool(re.match(last_word_template, word))
  192.  
  193.  
# Script entry point: this call runs unconditionally, including on import.
main()
Advertisement
Add Comment
Please, Sign In to add comment