Guest User

Untitled

a guest
Dec 12th, 2018
91
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.37 KB | None | 0 0
  1. def preprocess_text(text):
  2.  
  3. lower_text = text.lower()
  4. cleaned_text = re.sub("\\(.*?\\)", "", lower_text)
  5.  
  6. tokens = stemmer.analyze(cleaned_text)
  7.  
  8. lemmas = []
  9.  
  10. for token in tokens:
  11.  
  12. if "analysis" in token:
  13. if len(token["analysis"]):
  14. if "lex" in token["analysis"][0]:
  15. lemmas.append(token["analysis"][0]["lex"])
  16. else:
  17. if "text" not in token:
  18. continue
  19.  
  20. # english
  21. if re.match("[a-z]", token["text"].lower()):
  22. if not len(lemmas) or lemmas[-1] != "_ENG_":
  23. lemmas.append("_ENG_")
  24. else:
  25. if re.sub("\s", "", token["text"]):
  26. if not len(lemmas) or lemmas[-1] != "_NAME_":
  27. lemmas.append("_NAME_")
  28. else:
  29. if "text" not in token:
  30. continue
  31.  
  32. if token["text"].isdigit():
  33. if not len(lemmas) or lemmas[-1] != "_DIGITS_":
  34. lemmas.append("_DIGITS_")
  35. else:
  36. clean_text = re.sub(" ", "", token["text"])
  37. if clean_text in [".", ",", "?", "!", ":", "(", ")"]:
  38. lemmas.append(clean_text)
  39.  
  40.  
  41. return " ".join(lemmas)
Add Comment
Please, Sign In to add comment