Advertisement
jbozhich

NGrams

Oct 29th, 2017
84
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.41 KB | None | 0 0
  1. from random import choice, random
  2. from nltk import word_tokenize
  3. import argparse
  4. import sys
  5.  
  6.  
  7. def get_counts(context_length, training_text):
  8.  
  9. counts = {}
  10.  
  11. tokens = word_tokenize(training_text)
  12. for i in range(len(tokens) - context_length):
  13. context = []
  14. next_token = tokens[i + context_length]
  15. for j in range(context_length):
  16. context.append(tokens[i + j])
  17.  
  18. # Add 1 to frequency or create new dictionary item for this tuple
  19. if tuple(context) in counts:
  20. if next_token in counts[tuple(context)]:
  21. counts[tuple(context)][next_token] += 1
  22. else:
  23. counts[tuple(context)][next_token] = 1
  24. else:
  25. counts[tuple(context)] = {next_token: 1}
  26.  
  27. return counts
  28.  
  29.  
  30. def generate_from_file(context_length, training_file, my_starter, output_length=60):
  31.  
  32. # Open the training file
  33. with open(training_file, 'r') as f:
  34. training_data = f.read()
  35.  
  36. counts = get_counts(context_length, training_data)
  37. if my_starter is not None:
  38. first_tokens = tuple(my_starter)
  39. else:
  40. first_tokens = choice(list(counts.keys())) # Choose a random first context
  41.  
  42.  
  43. if first_tokens in counts:
  44. pass
  45. else:
  46. q = " ".join(first_tokens) + ". "
  47. first_tokens = choice(list(counts.keys()))
  48. output_list = list(first_tokens)
  49. current_context = first_tokens
  50.  
  51. for i in range(output_length):
  52. a = counts[current_context]
  53. b=(counts[current_context].get)
  54. next_context = max(a, key=b)
  55. temp = list(current_context)
  56. temp.pop(0) # Remove first token in previous context
  57. temp.append(next_context) # Add new token for the next context
  58. next_token = temp[-1]
  59. next_context = tuple(temp)
  60.  
  61. current_context = next_context
  62.  
  63. output_list.append(next_token)
  64.  
  65. x = (" ".join(output_list))
  66. print(q + x)
  67.  
  68.  
  69. parser = argparse.ArgumentParser()
  70. parser.add_argument("file")
  71. parser.add_argument("-n", action="store", default="2")
  72. parser.add_argument("--starter", action="store")
  73.  
  74. options = parser.parse_args()
  75. number = int(options.n)
  76. starter = options.starter
  77. starter = word_tokenize(starter)
  78. # starter = None
  79.  
  80. if len(starter) == number:
  81. generate_from_file(number,options.file,starter)
  82. else:
  83. print("Input is too long.")
  84. sys.exit()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement