Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import sys, os
- import pymongo
- import nltk
- import twokenize
- import re
- import string
- import unicodedata
- from nltk.corpus import stopwords
def process_hashtags(entities):
    """Return the tweet's hashtags, lowercased and stripped of accents.

    `entities` is the tweet's 'entities' mapping; each entry of
    entities['hashtags'] is expected to carry a 'text' field.
    """
    return [remove_accents(tag['text'].lower()) for tag in entities['hashtags']]
- def process_text(text, language, emoticons_df):
- #print 'ORIGINAL: ' , text
- try:
- # put text in lower case
- text = text.lower()
- except Exception as exp:
- print ':::::::::::::::::ERROR IN LOWER CASE::::::::::::::::: -> ', exp
- pass
- try:
- # remove repeated characters
- text = max_reps(text, 3)
- except Exception as exp:
- print ':::::::::::::::::ERROR IN REMOVE REPETEAD CHARACTERS::::::::::::::::: -> ', exp
- pass
- try:
- # remove user mentions, urls, hashtag symbols
- text = remove_user_mentions_url_hashtag_symbol(text)
- except Exception as exp:
- print ':::::::::::::::::ERROR IN REMOVE USER M AND HASHTAGS SYMBOL::::::::::::::::: -> ', exp
- pass
- #try:
- # remove punctuation
- # text = remove_puntuaction(text)
- #except Exception as exp:
- # print ':::::::::::::::::ERROR IN REMOVE PUNCTUATION::::::::::::::::: -> ', exp
- # pass
- try:
- # tokenize raw text of tweet
- text = twokenize.simpleTokenize(text)
- except Exception as exp:
- print ':::::::::::::::::ERROR IN TOKENIZATION::::::::::::::::: -> ', exp
- pass
- #print 'AFTER TOKEN: ' , ' '.join(text)
- try:
- # remove stop words
- text = remove_stop_word(text, language)
- except Exception as exp:
- print ':::::::::::::::::ERROR IN REMOVE STOP WORDS::::::::::::::::: -> ', exp
- pass
- text = ' '.join(text)
- #try:
- #remove accents from text
- # text = remove_accents(text)
- #except Exception as exp:
- # print ':::::::::::::::::ERROR IN REMOVE ACCENTS::::::::::::::::: -> ', exp
- # remove non-alphanumeric
- re.sub(r'\W+', '', text)
- #print 'AFTER PREPROCESS: ' , text
- return text
def remove_puntuaction(text):
    """Strip every ASCII punctuation character (string.punctuation) from *text*."""
    punct = set(string.punctuation)
    return ''.join(ch for ch in text if ch not in punct)
def remove_user_mentions_url_hashtag_symbol(text):
    """Delete @mentions and URLs, drop '#' symbols, and collapse runs of whitespace."""
    cleaned = re.sub('(@[A-Za-z0-9_]+)|(\w+:\/\/\S+)|(#)', '', text)
    return ' '.join(cleaned.split())
def remove_stop_word(text, language):
    """Filter stop words out of a token list.

    Parameters:
        text: list of token strings.
        language: 'en' or 'pt' selects the NLTK stop-word list; any
            other code disables filtering.

    Returns the tokens with the selected stop words removed.

    Fix: an unsupported language code previously left `list_sw` unbound
    and raised UnboundLocalError; it now returns the tokens unchanged.
    """
    if language == 'en':
        list_sw = stopwords.words('english')
    elif language == 'pt':
        list_sw = stopwords.words('portuguese')
    else:
        # unknown language: no stop-word list available, filter nothing
        list_sw = []
    return [word for word in text if word not in list_sw]
def remove_unrecognizable_emoticons(text):
    """Drop every character outside [0-9A-Za-z space tab], then normalize whitespace."""
    stripped = re.sub('([^0-9A-Za-z \t])', '', text)
    return ' '.join(stripped.split())
def remove_accents(txt):
    """Lowercase *txt* and strip accents and non-ASCII-alphanumeric characters.

    Each space-separated token is NFKD-normalized so accented letters
    decompose into a base letter plus combining marks, and every
    character outside [A-Za-z0-9] (including those marks) is dropped.

    Fixes:
    - tokens reduced to the empty string are now discarded; the original
      test `t != ' '` could never trigger (t is joined with no spaces),
      so empty tokens were re-joined and produced doubled spaces;
    - the Python-2-only `unicode` builtin is resolved at call time so the
      function also runs on Python 3.

    Returns the cleaned tokens joined by single spaces.
    """
    try:
        text_type = unicode   # Python 2
    except NameError:
        text_type = str       # Python 3
    allowed = string.ascii_letters + string.digits
    cleaned = []
    for token in txt.split(' '):
        t = ''.join(
            c for c in unicodedata.normalize('NFKD', text_type(token))
            if c in allowed
        ).lower()
        if t:  # skip tokens emptied by the filtering
            cleaned.append(t)
    return ' '.join(cleaned)
def max_reps(sentence, n=3):
    """Cap runs of a repeated character at *n* occurrences.

    e.g. for n=3, "helllloooooo" -> "helllooo".
    Adapted from Silvio Amir's sma_toolkit:
    https://github.com/samiroid/sma_toolkit/
    """
    out = []
    prev = None
    run = 0
    for ch in sentence:
        run = run + 1 if ch == prev else 1
        prev = ch
        if run <= n:
            out.append(ch)
    return ''.join(out)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement