Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import sys, os
- import pymongo
- import nltk
- import twokenize
- import re
- import string
- import unicodedata
- from nltk.corpus import stopwords
def process_hashtags(entities):
    """Return the tweet's hashtags, lowercased and stripped of accents.

    `entities` is the tweet's 'entities' mapping; each entry of
    entities['hashtags'] is expected to carry a 'text' field.
    """
    return [remove_accents(tag['text'].lower()) for tag in entities['hashtags']]
- def process_text(text, language, emoticons_df):
- #print 'ORIGINAL: ' , text
- try:
- # put text in lower case
- text = text.lower()
- except Exception as exp:
- print ':::::::::::::::::ERROR IN LOWER CASE::::::::::::::::: -> ', exp
- pass
- try:
- # remove repeated characters
- text = max_reps(text, 3)
- except Exception as exp:
- print ':::::::::::::::::ERROR IN REMOVE REPETEAD CHARACTERS::::::::::::::::: -> ', exp
- pass
- try:
- # remove user mentions, urls, hashtag symbols
- text = remove_user_mentions_url_hashtag_symbol(text)
- except Exception as exp:
- print ':::::::::::::::::ERROR IN REMOVE USER M AND HASHTAGS SYMBOL::::::::::::::::: -> ', exp
- pass
- #try:
- # remove punctuation
- # text = remove_puntuaction(text)
- #except Exception as exp:
- # print ':::::::::::::::::ERROR IN REMOVE PUNCTUATION::::::::::::::::: -> ', exp
- # pass
- try:
- # tokenize raw text of tweet
- text = twokenize.simpleTokenize(text)
- except Exception as exp:
- print ':::::::::::::::::ERROR IN TOKENIZATION::::::::::::::::: -> ', exp
- pass
- #print 'AFTER TOKEN: ' , ' '.join(text)
- try:
- # remove stop words
- text = remove_stop_word(text, language)
- except Exception as exp:
- print ':::::::::::::::::ERROR IN REMOVE STOP WORDS::::::::::::::::: -> ', exp
- pass
- text = ' '.join(text)
- #try:
- #remove accents from text
- # text = remove_accents(text)
- #except Exception as exp:
- # print ':::::::::::::::::ERROR IN REMOVE ACCENTS::::::::::::::::: -> ', exp
- # remove non-alphanumeric
- re.sub(r'\W+', '', text)
- #print 'AFTER PREPROCESS: ' , text
- return text
def remove_puntuaction(text):
    """Strip every ASCII punctuation character (string.punctuation) from *text*."""
    punct = set(string.punctuation)
    return ''.join(ch for ch in text if ch not in punct)
def remove_user_mentions_url_hashtag_symbol(text):
    """Delete @mentions and URLs, drop '#' symbols, and collapse runs of whitespace."""
    cleaned = re.sub('(@[A-Za-z0-9_]+)|(\w+:\/\/\S+)|(#)', '', text)
    return ' '.join(cleaned.split())
def remove_stop_word(text, language):
    """Filter stop words out of a token list.

    Parameters:
        text: list of token strings.
        language: 'en' or 'pt' selects the NLTK stop-word list; any
            other code disables filtering.

    Returns the tokens with the selected stop words removed.

    Fix: an unsupported language code previously left `list_sw` unbound
    and raised UnboundLocalError; it now returns the tokens unchanged.
    """
    if language == 'en':
        list_sw = stopwords.words('english')
    elif language == 'pt':
        list_sw = stopwords.words('portuguese')
    else:
        # unknown language: no stop-word list available, filter nothing
        list_sw = []
    return [word for word in text if word not in list_sw]
def remove_unrecognizable_emoticons(text):
    """Drop every character outside [0-9A-Za-z space tab], then normalize whitespace."""
    stripped = re.sub('([^0-9A-Za-z \t])', '', text)
    return ' '.join(stripped.split())
def remove_accents(txt):
    """Lowercase *txt* and strip accents and non-ASCII-alphanumeric characters.

    Each space-separated token is NFKD-normalized so accented letters
    decompose into a base letter plus combining marks, and every
    character outside [A-Za-z0-9] (including those marks) is dropped.

    Fixes:
    - tokens reduced to the empty string are now discarded; the original
      test `t != ' '` could never trigger (t is joined with no spaces),
      so empty tokens were re-joined and produced doubled spaces;
    - the Python-2-only `unicode` builtin is resolved at call time so the
      function also runs on Python 3.

    Returns the cleaned tokens joined by single spaces.
    """
    try:
        text_type = unicode   # Python 2
    except NameError:
        text_type = str       # Python 3
    allowed = string.ascii_letters + string.digits
    cleaned = []
    for token in txt.split(' '):
        t = ''.join(
            c for c in unicodedata.normalize('NFKD', text_type(token))
            if c in allowed
        ).lower()
        if t:  # skip tokens emptied by the filtering
            cleaned.append(t)
    return ' '.join(cleaned)
def max_reps(sentence, n=3):
    """Cap runs of a repeated character at *n* occurrences.

    e.g. for n=3, "helllloooooo" -> "helllooo".
    Adapted from Silvio Amir's sma_toolkit:
    https://github.com/samiroid/sma_toolkit/
    """
    out = []
    prev = None
    run = 0
    for ch in sentence:
        run = run + 1 if ch == prev else 1
        prev = ch
        if run <= n:
            out.append(ch)
    return ''.join(out)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement