import twitter
import nltk
import cPickle
import re
import networkx as nx
import sys
import json
# Source Code: O'Reilly Mining The Social Web
# 7/18/2011
# Your Query
# The search query comes from the command line, e.g.  python script.py "#NCD"
Q = sys.argv[1]

# Page through the (old) Twitter search API: up to 14 pages of 100 tweets each.
twitter_search = twitter.Twitter(domain="search.twitter.com")
search_results = []
for page in range(1, 15):
    search_results.append(twitter_search.search(q=Q, rpp=100, page=page))

# Persist the raw JSON responses for later inspection.  The original opened
# the file and never closed it; `with` guarantees the handle is released.
with open('NCD_tweets.txt', 'w') as f:
    print >>f, json.dumps(search_results, sort_keys=True, indent=1)

# Flatten the per-page result sets into a single list of tweet texts.
tweets = [r['text']
          for result in search_results
          for r in result['results']]
# Tokenise every tweet on whitespace and pool all tokens into one list.
words = [w for t in tweets for w in t.split()]

# Handy corpus statistics (kept as notes, as in the original):
#   total words:        len(words)
#   unique words:       len(set(words))
#   lexical diversity:  1.0 * len(set(words)) / len(words)
#   avg words/tweet:    1.0 * sum([len(t.split()) for t in tweets]) / len(tweets)

# Frequency distribution of every token in the corpus.
freq_dist = nltk.FreqDist(words)
# most/least frequent tokens: freq_dist.keys()[:50] and freq_dist.keys()[-50:]
# Directed graph of retweet credits: edges run source -> retweeter.
g = nx.DiGraph()

# Flatten the paged API responses into one list of tweet dicts.
all_tweets = []
for page in search_results:
    all_tweets.extend(page["results"])
def get_rt_sources(tweet):
    """Return the @usernames a tweet credits as retweet sources.

    Scans *tweet* for "RT @user" / "via @user" patterns, case-insensitively,
    and returns the stripped @-mention text following each marker.

    Fixes two issues in the original: the loop variable shadowed the builtin
    ``tuple``, and the exclusion test was case-sensitive ("RT"/"via" only)
    while the regex matched case-insensitively, so a lowercase "rt @user"
    leaked the marker string "rt" into the result list.
    """
    rt_patterns = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)
    return [source.strip()
            for match in rt_patterns.findall(tweet)
            for source in match
            if source.lower() not in ("rt", "via")]
for tweet in all_tweets:
rt_sources = get_rt_sources(tweet["text"])
if not rt_sources: continue
for rt_source in rt_sources:
g.add_edge(rt_source, tweet["from_user"], {"tweet_id" : tweet["id"]})
print \'Number of Nodes: \',g.number_of_nodes()
print \'Number of Edges: \', g.number_of_edges()
print len(nx.connected_components(g.to_undirected()))
print list(nx.degree(g).values())