Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def processData(data):
    """Normalize a piece of raw text for downstream processing.

    The steps run in this order:

    1. Lowercase the text.
    2. Replace HTML-like tags (``<...>``) with a single space.
    3. Turn ``#word`` into ``word``.
    4. Delete every ASCII punctuation character except the apostrophe.
    5. Collapse each run of whitespace into one space.
    6. Shorten any run of three or more identical characters to two
       (``"looooove"`` -> ``"loove"``).

    Args:
        data: The text to clean (a ``str``).

    Returns:
        The cleaned text. Leading/trailing spaces are not stripped.
    """
    data = data.lower()

    # Tags become a space (not "") so words on either side stay separated.
    data = re.sub(r'<[^>]*>', ' ', data)

    # Keep the hashtag's word, drop only the leading '#'.
    data = re.sub(r'#([^\s]+)', r'\1', data)

    # Build a character class of all punctuation except the apostrophe, so
    # contractions such as "don't" survive. re.escape keeps ']', '\\', '^'
    # and '-' from being interpreted as character-class syntax.
    remove = string.punctuation.replace("'", "")
    p = r"[{}]".format(re.escape(remove))
    data = re.sub(p, "", data)

    # Collapse runs of whitespace (spaces, tabs, newlines) into one space.
    data = re.sub(r'\s+', ' ', data)

    # A character followed by one or more copies of itself is cut to exactly
    # two copies; doubled letters ("good") are therefore left untouched.
    pp = re.compile(r"(.)\1{1,}", re.DOTALL)
    data = pp.sub(r"\1\1", data)
    return data
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement