mskf

Untitled

Sep 28th, 2020
76
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.31 KB | None | 0 0
  1. # Given an existing complete statement or question, generates variations of the sentences using synonym matching.
  2. # Synonym matching code from nickloewen (https://github.com/nickloewen/thesaurus)
  3. # Can either run script and pass arguments in Terminal / Command Prompt, or imported as library.
  4.  
  5. import sys
  6. import nltk
  7. from nltk.corpus import wordnet
  8. from nltk.tokenize import RegexpTokenizer
  9. import re
  10.  
class TextRegenerator(object):
    """Generates variations of a sentence via WordNet synonym matching."""

    # Stop (preserved) words that are never replaced by synonyms.
    # Starts empty; addStopWords() repopulates it, merging in the defaults.
    words_to_preserve = []
    words_to_preserve_default = ["a", "the", "in", "of", "at", "does"]
    # NOTE(review): words_to_preserve is empty at this point, so this extend
    # is a no-op — presumably it was meant to add Title-cased variants of
    # populated stop words. Confirm the intent before changing it.
    words_to_preserve.extend([w.title() for w in words_to_preserve])
    # Punctuation tokens; defined here but not referenced by any method
    # visible in this file.
    punctuation = [".", ",", ":", ";", "?", "!"]
  18.  
  19. def __init__(self):
  20.  
  21. self.str_base = ""
  22. self.new_str = ""
  23. self.new_str_list = []
  24.  
  25.  
  26. def createWordSynonyms(self, word):
  27. synsets = wordnet.synsets(word)
  28. synonyms = [word]
  29.  
  30. if word not in TextRegenerator.words_to_preserve:
  31. for s in synsets:
  32. for l in s.lemmas():
  33. synonyms.append(l.name())
  34.  
  35. # if there are no synonyms, put the original word in
  36. synonyms.append(word)
  37. return self.uniq(synonyms)
  38.  
  39.  
  40. def createPhraseSynonyms(self, _str_base):
  41. """Finds synonyms for every word in the input. Returns a list, containing a
  42. list of synonyms for every word in the input."""
  43.  
  44. tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
  45. tokens = tokenizer.tokenize(_str_base)
  46.  
  47. # synonyms for all words: each word is a list of synonyms inside this one
  48. synonyms = []
  49. for t in tokens:
  50. synonyms.append(self.createWordSynonyms(t))
  51. return synonyms
  52.  
  53.  
  54. def stripUnderscores(self, word):
  55. return re.sub("_", " ", word)
  56.  
  57. def tidyPunctuation(self, word):
  58. return re.sub(r'\s([?.!"](?:\s|$))', r'\1', word)
  59.  
  60. def uniq(self, seq):
  61. seen = set()
  62. seen_add = seen.add
  63. return [ x for x in seq if x not in seen and not seen_add(x)]
  64.  
  65.  
  66. def permuteAllSynonyms(self, phrase_synonyms):
  67.  
  68. output = []
  69.  
  70. """Determine which token has the most phrase_synonyms."""
  71. longest = ""
  72. for item in phrase_synonyms:
  73. if len(item) > len(longest):
  74. longest = item
  75.  
  76. # Loop for each synonym in 'longest' list.
  77. for i in range(len(longest)):
  78. """Build a new phrase using the first word of each list, then remove
  79. that word, unless it is the last one."""
  80.  
  81. phrase = ""
  82. for s in phrase_synonyms:
  83. phrase = phrase + " " + str(s[0])
  84. if len(s) > 1:
  85. s.pop(0)
  86. output.append(phrase.strip())
  87.  
  88. return output
  89.  
  90.  
  91. def generateStrVariations(self, _str_base):
  92.  
  93. """Generates variations (through synonym matching) of an inputted string, ignoring
  94. list of stop words."""
  95.  
  96. print('\n\tNow generating variations of: "' + _str_base + '"..')
  97.  
  98. # Use the code block below jf you want to make a list of variations using synonym matching (many of the variations don't make sense)
  99. output = self.createPhraseSynonyms(_str_base)
  100. output = self.permuteAllSynonyms(output)
  101. for phrase in output[:10]:
  102. print ("\t\t" + str(self.tidyPunctuation(self.stripUnderscores(phrase))))
  103. print("> > >\n")
  104. return output
  105.  
  106.  
  107. def addStopWords(self, l_param):
  108.  
  109. """Takes in a 'string' (enclosed in quotes, and meant to be typed as a list of words separated
  110. by commas and spaces) to parse and append to the stop (ignored) words."""
  111. try:
  112. l_param = l_param.lower().split(', ')
  113. except:
  114. pass
  115. # print("\n\t" + "Default list of stop words: " + str(TextRegenerator.words_to_preserve_default))
  116. TextRegenerator.words_to_preserve = list(l_param)
  117. TextRegenerator.words_to_preserve.extend(x for x in TextRegenerator.words_to_preserve_default if x not in TextRegenerator.words_to_preserve)
  118. print("\n< < <\n\t")
  119. print("Final list of stop words: " + str(TextRegenerator.words_to_preserve))
  120.  
  121.  
  122. if __name__ == '__main__':
  123. try:
  124. l_param = str(sys.argv[2])
  125. except:
  126. l_param = []
  127. pass
  128. try:
  129. t= "This topic covers service virtualization as a method to emulate the behavior of specific components such as cloud-based applications and service-oriented architecture."
  130. str_base = str(t).lower()
  131. except IndexError as err:
  132. print("\n\n\tError initializing TextRegenerator: {0}".format(err))
  133. print("\n\tPossible invalid or no parameter supplied.\n\n\tRun the script again, with either the complete string"
  134. "you want to generate mass variations of as the only argument, or the second argument of a list of words to ignore (not" +
  135. "replace), separated by commas and spaces and enclosed in quotes.")
  136. print("\n\tExiting..")
  137. sys.exit(0)
  138. except:
  139. print("\n\n\tUnexpected error:", sys.exc_info()[0])
  140. print("\n\tExiting..")
  141. sys.exit(0)
  142. trgnr = TextRegenerator()
  143. trgnr.addStopWords(l_param)
  144. trgnr.generateStrVariations(str_base)
  145. else:
  146. trgnr = TextRegenerator()
  147. l_param = []
  148. trgnr.addStopWords(l_param)
Add Comment
Please, Sign In to add comment