Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def _append_unless_repeated(lemmas, placeholder):
    """Append *placeholder* to *lemmas* unless it is already the last item."""
    if not lemmas or lemmas[-1] != placeholder:
        lemmas.append(placeholder)


def preprocess_text(text):
    """Normalise *text* into a space-separated string of lemmas and markers.

    The text is lower-cased, parenthesised fragments are removed, and the
    result is passed to ``stemmer.analyze`` (``stemmer`` is a module-level
    object defined outside this function). Each returned token is mapped as
    follows:

    * token with a non-empty ``"analysis"`` list: the ``"lex"`` value of the
      first analysis is emitted (nothing is emitted if ``"lex"`` is absent);
    * token with an empty ``"analysis"`` list: ``"_ENG_"`` if its text starts
      with a Latin letter a-z, otherwise ``"_NAME_"`` if the text contains
      any non-whitespace character;
    * token without ``"analysis"``: ``"_DIGITS_"`` if its text is all digits,
      otherwise the text with spaces removed if that is one of
      ``. , ? ! : ( )``.

    Consecutive identical placeholders (``_ENG_``, ``_NAME_``, ``_DIGITS_``)
    are collapsed into one. Tokens lacking a ``"text"`` key are skipped in
    the branches that need it.

    NOTE(review): the token layout (dicts with "text" / "analysis" / "lex")
    looks like the output of pymystem3's ``Mystem.analyze`` - confirm against
    where ``stemmer`` is created.
    NOTE(review): the original paste had lost its indentation; the branch
    nesting here is reconstructed - confirm against the author's copy.

    :param text: input string.
    :return: lemmas, placeholders and kept punctuation joined by single spaces.
    """
    lowered = text.lower()
    # Non-greedy so that each "(...)" group is removed separately; "." does
    # not match newlines, so only single-line groups are dropped.
    without_parens = re.sub(r"\(.*?\)", "", lowered)

    lemmas = []
    for token in stemmer.analyze(without_parens):
        if "analysis" in token:
            analysis = token["analysis"]
            if analysis:
                # Only the first analysis variant is used.
                if "lex" in analysis[0]:
                    lemmas.append(analysis[0]["lex"])
                continue

            # Word-like token the analyzer could not analyse.
            if "text" not in token:
                continue
            surface = token["text"]
            # english
            if re.match("[a-z]", surface.lower()):
                _append_unless_repeated(lemmas, "_ENG_")
            elif re.sub(r"\s", "", surface):
                _append_unless_repeated(lemmas, "_NAME_")
            continue

        # Token without any "analysis" entry: digits or punctuation.
        if "text" not in token:
            continue
        surface = token["text"]
        if surface.isdigit():
            _append_unless_repeated(lemmas, "_DIGITS_")
        else:
            # Only plain spaces are stripped here (not other whitespace).
            compact = surface.replace(" ", "")
            if compact in (".", ",", "?", "!", ":", "(", ")"):
                lemmas.append(compact)

    return " ".join(lemmas)
Add Comment
Please, Sign In to add comment