Advertisement
Guest User

Untitled

a guest
Sep 25th, 2017
46
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.28 KB | None | 0 0
  1. import sys, os
  2. import pymongo
  3. import nltk
  4. import twokenize
  5. import re
  6. import string
  7. import unicodedata
  8.  
  9. from nltk.corpus import stopwords
  10.  
  11. def process_hashtags(entities):
  12. hashtags = []
  13.  
  14. for hashtag in entities['hashtags']:
  15. h = hashtag['text'].lower()
  16. h = remove_accents(h)
  17. hashtags.append(h)
  18.  
  19. return hashtags
  20.  
  21. def process_text(text, language, emoticons_df):
  22. #print 'ORIGINAL: ' , text
  23.  
  24. try:
  25. # put text in lower case
  26. text = text.lower()
  27. except Exception as exp:
  28. print ':::::::::::::::::ERROR IN LOWER CASE::::::::::::::::: -> ', exp
  29. pass
  30.  
  31. try:
  32. # remove repeated characters
  33. text = max_reps(text, 3)
  34. except Exception as exp:
  35. print ':::::::::::::::::ERROR IN REMOVE REPETEAD CHARACTERS::::::::::::::::: -> ', exp
  36. pass
  37.  
  38. try:
  39. # remove user mentions, urls, hashtag symbols
  40. text = remove_user_mentions_url_hashtag_symbol(text)
  41. except Exception as exp:
  42. print ':::::::::::::::::ERROR IN REMOVE USER M AND HASHTAGS SYMBOL::::::::::::::::: -> ', exp
  43. pass
  44.  
  45. #try:
  46. # remove punctuation
  47. # text = remove_puntuaction(text)
  48. #except Exception as exp:
  49. # print ':::::::::::::::::ERROR IN REMOVE PUNCTUATION::::::::::::::::: -> ', exp
  50. # pass
  51.  
  52. try:
  53. # tokenize raw text of tweet
  54. text = twokenize.simpleTokenize(text)
  55. except Exception as exp:
  56. print ':::::::::::::::::ERROR IN TOKENIZATION::::::::::::::::: -> ', exp
  57. pass
  58.  
  59. #print 'AFTER TOKEN: ' , ' '.join(text)
  60.  
  61. try:
  62. # remove stop words
  63. text = remove_stop_word(text, language)
  64. except Exception as exp:
  65. print ':::::::::::::::::ERROR IN REMOVE STOP WORDS::::::::::::::::: -> ', exp
  66. pass
  67.  
  68. text = ' '.join(text)
  69.  
  70. #try:
  71. #remove accents from text
  72. # text = remove_accents(text)
  73. #except Exception as exp:
  74. # print ':::::::::::::::::ERROR IN REMOVE ACCENTS::::::::::::::::: -> ', exp
  75.  
  76. # remove non-alphanumeric
  77. re.sub(r'\W+', '', text)
  78.  
  79. #print 'AFTER PREPROCESS: ' , text
  80.  
  81. return text
  82.  
  83. def remove_puntuaction(text):
  84. regex = re.compile('[%s]' % re.escape(string.punctuation))
  85. return regex.sub('', text)
  86.  
  87. def remove_user_mentions_url_hashtag_symbol(text):
  88. return ' '.join(re.sub('(@[A-Za-z0-9_]+)|(\w+:\/\/\S+)|(#)', '', text).split())
  89.  
  90. def remove_stop_word(text, language):
  91. if language == 'en':
  92. list_sw = stopwords.words('english')
  93. elif language == 'pt':
  94. list_sw = stopwords.words('portuguese')
  95.  
  96. return [word for word in text if word not in list_sw]
  97.  
  98. def remove_unrecognizable_emoticons(text):
  99. return ' '.join(re.sub('([^0-9A-Za-z \t])', '', text).split())
  100.  
  101. def remove_accents(txt):
  102. new_txt = []
  103. ascii = string.ascii_letters + string.digits
  104. for data in txt.split(' '):
  105. t = ''.join(x for x in unicodedata.normalize(
  106. 'NFKD', unicode(data)) if x in ascii).lower()
  107. if t != ' ':
  108. new_txt.append(t)
  109. return ' '.join(new_txt)
  110.  
  111. def max_reps(sentence, n=3):
  112. """
  113. Normalizes a string to at most n repetitions of the same character
  114. e.g, for n=3 and "helllloooooo" -> "helllooo"
  115. Function from Silvio Amir available at: https://github.com/samiroid/sma_toolkit/
  116. """
  117. new_sentence = ''
  118. last_c = ''
  119. max_counter = n
  120. for c in sentence:
  121. if c != last_c:
  122. new_sentence+=c
  123. last_c = c
  124. max_counter = n
  125. else:
  126. if max_counter > 1:
  127. new_sentence+=c
  128. max_counter-=1
  129. else:
  130. pass
  131. return new_sentence
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement