Advertisement
jakc13

Untitled

Nov 21st, 2012
118
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.53 KB | None | 0 0
  1. import tweetstream, cld, traceback, re
  2. global lang
  3.  
  4. def language(some_text):
  5.     try:
  6.         global lang
  7.         lang = cld.detect(some_text, pickSummaryLanguage=True, removeWeakMatches=True)
  8.         # check the reliable property
  9.         if lang[2] is True:
  10.             return lang[0]
  11.         else:
  12.             return "unreliable"
  13.     except cld.error as e:
  14.         print "Error on ", some_text
  15.         print type(lang)
  16.         print type(language[0])
  17.  
  18. def clean_text(dirty_text):
  19.     try:
  20.         # strip out hashtags, URLs and special characters as these mislead CLD.
  21.         # cant use a-z, as need to accomodate non ascii fonts (e.g. asia).
  22.         just_text = re.sub(r'(?:@\S*|#\S*|http(?=.*://)\S*|[\'{}\(\)\^$&._%#!@=<>:;,~`"\?\*\?\/\+\|\[\\\\])', r'', dirty_text)
  23.  
  24.         # Convert to UTF8
  25.         text_UTF8 = just_text.encode('utf-8')
  26.         return text_UTF8
  27.     except:
  28.         print "Exception"
  29.         traceback.print_exception
  30.  
  31. def parse_tweet(in_t):
  32.     raw = in_t["text"]
  33.     clean = clean_text(raw)
  34.     lan = language(clean)
  35.     if lan != 'ENGLISH':
  36.         print "Raw:", raw
  37.         print "----"
  38.         print "clean:", clean
  39.         print type(lan)
  40.         print "----"
  41.         print "Full Lan:", lang
  42.         print "--------------"
  43.  
  44. uname = 'xxx'
  45. passwd = 'xxx'
  46. extent =["144.0,-39.0,146.0,-37.0"]
  47.  
  48. with tweetstream.FilterStream(uname, passwd, locations=extent) as stream:
  49.     for tweet in stream:
  50.         if "coordinates" in tweet and tweet["coordinates"]:
  51.                     parse_tweet(tweet)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement