import twitter
import nltk
import cPickle
import re
import networkx as nx
import sys
import json
# Source Code: O'Reilly Mining The Social Web
# 7/18/2011
# Your Query
# The search query comes from the command line, e.g.  python script.py "#NCD"
Q = sys.argv[1]

# Page through the (old) Twitter search API: up to 14 pages of 100 tweets each.
twitter_search = twitter.Twitter(domain="search.twitter.com")
search_results = []
for page in range(1, 15):
    search_results.append(twitter_search.search(q=Q, rpp=100, page=page))

# Persist the raw JSON responses for later inspection.  The original opened
# the file and never closed it; `with` guarantees the handle is released.
with open('NCD_tweets.txt', 'w') as f:
    print >>f, json.dumps(search_results, sort_keys=True, indent=1)

# Flatten the per-page result sets into a single list of tweet texts.
tweets = [r['text']
          for result in search_results
          for r in result['results']]
# Tokenise every tweet on whitespace and pool all tokens into one list.
words = [w for t in tweets for w in t.split()]

# Handy corpus statistics (kept as notes, as in the original):
#   total words:        len(words)
#   unique words:       len(set(words))
#   lexical diversity:  1.0 * len(set(words)) / len(words)
#   avg words/tweet:    1.0 * sum([len(t.split()) for t in tweets]) / len(tweets)

# Frequency distribution of every token in the corpus.
freq_dist = nltk.FreqDist(words)
# most/least frequent tokens: freq_dist.keys()[:50] and freq_dist.keys()[-50:]
# Directed graph of retweet credits: edges run source -> retweeter.
g = nx.DiGraph()

# Flatten the paged API responses into one list of tweet dicts.
all_tweets = []
for page in search_results:
    all_tweets.extend(page["results"])
def get_rt_sources(tweet):
    """Return the @usernames a tweet credits as retweet sources.

    Scans *tweet* for "RT @user" / "via @user" patterns, case-insensitively,
    and returns the stripped @-mention text following each marker.

    Fixes two issues in the original: the loop variable shadowed the builtin
    ``tuple``, and the exclusion test was case-sensitive ("RT"/"via" only)
    while the regex matched case-insensitively, so a lowercase "rt @user"
    leaked the marker string "rt" into the result list.
    """
    rt_patterns = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)
    return [source.strip()
            for match in rt_patterns.findall(tweet)
            for source in match
            if source.lower() not in ("rt", "via")]
for tweet in all_tweets:
rt_sources = get_rt_sources(tweet["text"])
if not rt_sources: continue
for rt_source in rt_sources:
g.add_edge(rt_source, tweet["from_user"], {"tweet_id" : tweet["id"]})
print \'Number of Nodes: \',g.number_of_nodes()
print \'Number of Edges: \', g.number_of_edges()
print len(nx.connected_components(g.to_undirected()))
print list(nx.degree(g).values())