# Untitled

import tweetstream, cld, traceback, re
# Full result of the most recent cld.detect() call. language() rebinds this
# name and parse_tweet() reads it. A module-level ``global`` statement is a
# no-op, so the name is bound explicitly here to keep reads made before the
# first successful detection from raising NameError.
lang = None

def language(some_text):
    """Detect the language of ``some_text`` with CLD.

    The raw result of ``cld.detect`` is also stored in the module-level
    ``lang`` so callers can inspect the full detection result afterwards.

    Args:
        some_text: Text to classify; passed unchanged to ``cld.detect``.

    Returns:
        Element 0 of the detection result when element 2 (the reliability
        flag) is truthy, the string ``"unreliable"`` when it is not, or
        ``None`` when ``cld.detect`` raised ``cld.error``.
    """
    global lang
    # Reset first so a failed detection never leaves the result of an
    # earlier call visible through the module-level name.
    lang = None
    try:
        lang = cld.detect(some_text, pickSummaryLanguage=True, removeWeakMatches=True)
    except cld.error as e:
        # The previous handler evaluated ``language[0]`` -- subscripting this
        # function object -- which raised TypeError and hid the CLD error.
        print("Error on  %s" % (some_text,))
        print("cld.error: %s" % (e,))
        return None
    # check the reliable property (truthiness rather than ``is True`` so a
    # non-bool truthy flag is still honoured)
    if lang[2]:
        return lang[0]
    return "unreliable"

def clean_text(dirty_text):
    """Strip tweet noise from ``dirty_text`` and return it UTF-8 encoded.

    Removes @mentions, #hashtags, ``http...://`` URLs and a fixed set of
    punctuation characters, because these mislead CLD.

    Args:
        dirty_text: The raw tweet text.

    Returns:
        The cleaned text encoded as UTF-8, or ``None`` when cleaning or
        encoding raised (the traceback is printed in that case).
    """
    try:
        # Strip out hashtags, URLs and special characters as these mislead CLD.
        # Can't simply keep a-z, as we need to accommodate non-ASCII scripts
        # (e.g. Asian languages), so unwanted characters are listed instead.
        # NOTE(review): the character class lists "[" but not "]" or "-" --
        # confirm whether that omission is intentional.
        just_text = re.sub(r'(?:@\S*|#\S*|http(?=.*://)\S*|[\'{}\(\)\^$&._%#!@=<>:;,~`"\?\*\?\/\+\|\[\\\\])', r'', dirty_text)

        # Convert to UTF-8
        return just_text.encode('utf-8')
    except Exception:
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt and
        # SystemExit still propagate.
        print("Exception")
        # The previous code referenced ``traceback.print_exception`` without
        # calling it, so no traceback was ever shown.
        traceback.print_exc()
        return None

def parse_tweet(in_t):
    """Report a tweet whose text is not detected as English.

    Cleans ``in_t["text"]``, runs language detection on the cleaned text and,
    unless the result is ``'ENGLISH'``, prints the raw text, the cleaned
    text, the type of the detection result and the full detection result.

    Args:
        in_t: Mapping with a ``"text"`` entry holding the tweet text.
    """
    raw = in_t["text"]
    clean = clean_text(raw)
    if clean is None:
        # clean_text() yields None after reporting a failure; there is
        # nothing meaningful to hand to the detector, so skip this tweet
        # instead of passing None on.
        return
    lan = language(clean)
    if lan == 'ENGLISH':
        return
    print("Raw: %s" % (raw,))
    print("----")
    print("clean: %s" % (clean,))
    print(type(lan))
    print("----")
    # language() publishes the full detection result in the module-level
    # ``lang``; look it up defensively so a missing binding prints None
    # rather than raising NameError.
    print("Full Lan: %s" % (globals().get("lang"),))
    print("--------------")
# Credentials passed to the streaming filter (placeholder values).
uname = 'xxx'
passwd = 'xxx'
# Passed as ``locations``; presumably a lon/lat bounding box -- TODO confirm
# the coordinate order against the tweetstream / Twitter documentation.
extent = ["144.0,-39.0,146.0,-37.0"]

# Consume the filtered stream and hand every tweet that carries a non-empty
# "coordinates" entry to parse_tweet(); everything else is skipped.
with tweetstream.FilterStream(uname, passwd, locations=extent) as stream:
    for tweet in stream:
        if "coordinates" not in tweet or not tweet["coordinates"]:
            continue
        parse_tweet(tweet)