Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import tweetstream, cld, traceback, re
# Most recent raw result of cld.detect(). language() rebinds this name and
# parse_tweet() prints it. A module-level ``global`` statement binds nothing,
# so the name is initialised explicitly to avoid a NameError when it is read
# before the first successful detection.
lang = None
def language(some_text):
    """Detect the language of *some_text* with CLD.

    The raw ``cld.detect`` result is also stored in the module-level
    ``lang`` so that callers can inspect it after the call.

    Returns:
        Element 0 of the detection result when CLD marks the result as
        reliable, the string ``"unreliable"`` when it does not, and
        ``None`` when CLD raises ``cld.error``.
    """
    global lang
    # Forget the previous call's result first: a failed detection must not
    # leave a stale value behind, and ``lang`` must always be bound.
    lang = None
    try:
        lang = cld.detect(some_text, pickSummaryLanguage=True, removeWeakMatches=True)
    except cld.error as e:
        # The original handler evaluated ``language[0]`` (subscripting this
        # function), which raised TypeError inside the handler. Report the
        # offending text and the CLD error instead.
        print("%s %s" % ("Error on ", some_text))
        print(e)
        return None
    # Element 2 of the result is the "reliable" property.
    if lang[2] is True:
        return lang[0]
    return "unreliable"
# Everything stripped before language detection: @mentions, #hashtags, URLs
# and assorted punctuation, all of which mislead CLD. A whitelist such as a-z
# cannot be used because non-ASCII scripts (e.g. Asian text) must survive.
# Compiled once here rather than on every call.
_NOISE_PATTERN = re.compile(r'(?:@\S*|#\S*|http(?=.*://)\S*|[\'{}\(\)\^$&._%#!@=<>:;,~`"\?\*\?\/\+\|\[\\\\])')


def clean_text(dirty_text):
    """Strip mentions, hashtags, URLs and punctuation, then UTF-8 encode.

    Args:
        dirty_text: The raw text to clean.

    Returns:
        The cleaned text encoded as UTF-8, or ``None`` if cleaning or
        encoding failed (the traceback is printed in that case).
    """
    try:
        just_text = _NOISE_PATTERN.sub('', dirty_text)
        return just_text.encode('utf-8')
    except Exception:
        # Best-effort: report the failure and carry on. ``Exception`` (not a
        # bare ``except``) lets KeyboardInterrupt/SystemExit propagate, and
        # print_exc() is actually called so the traceback is shown.
        print("Exception")
        traceback.print_exc()
        return None
def parse_tweet(in_t):
    """Print diagnostics for a tweet whose text is not detected as English.

    The tweet's ``"text"`` entry is cleaned with ``clean_text`` and passed
    to ``language``. Unless the result is ``'ENGLISH'``, the raw text, the
    cleaned text, the type of the result and the module-level ``lang`` value
    are printed.

    Args:
        in_t: A mapping representing one stream message.

    Returns:
        None. Messages without a ``"text"`` entry, and texts that could not
        be cleaned, are skipped silently.
    """
    try:
        raw = in_t["text"]
    except KeyError:
        # A message without text would otherwise abort the caller's loop.
        return
    clean = clean_text(raw)
    if clean is None:
        # clean_text() already reported the failure; there is nothing to
        # hand to the detector.
        return
    lan = language(clean)
    if lan == 'ENGLISH':
        return
    print("Raw: %s" % (raw,))
    print("----")
    print("clean: %s" % (clean,))
    print(type(lan))
    print("----")
    # ``lang`` is the module-level value that language() stores.
    print("Full Lan: %s" % (lang,))
    print("--------------")
# Credentials handed to the filter stream (placeholder values).
uname, passwd = 'xxx', 'xxx'
# Value for the stream's ``locations`` filter.
# NOTE(review): presumably a lon/lat bounding box (SW corner, then NE corner)
# -- confirm against the streaming API documentation.
extent = ["144.0,-39.0,146.0,-37.0"]

with tweetstream.FilterStream(uname, passwd, locations=extent) as stream:
    for tweet in stream:
        # Only messages carrying a non-empty "coordinates" entry are parsed.
        if "coordinates" not in tweet or not tweet["coordinates"]:
            continue
        parse_tweet(tweet)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement