import twitter
import nltk
import re
import networkx as nx
import sys
import json

# Source Code: O'Reilly Mining The Social Web
# 7/18/2011

# Your query, taken from the first command-line argument
Q = sys.argv[1]

twitter_search = twitter.Twitter(domain="search.twitter.com")
search_results = []
# Pull up to 14 pages of 100 results each from the Search API
for page in range(1, 15):
    search_results.append(twitter_search.search(q=Q, rpp=100, page=page))

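# Each element of search_results is one page of the (long-retired)
# search.twitter.com API response. Judging from the keys this script
# reads, a page is shaped roughly like:
#   {"results": [{"text": "...", "from_user": "...", "id": 123, ...}, ...]}
# (shape is illustrative and abbreviated, not an exhaustive schema)
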
# Save the raw search results as pretty-printed JSON
f = open('NCD_tweets.txt', 'w')
print >>f, json.dumps(search_results, sort_keys=True, indent=1)
f.close()

# Flatten the pages into a list of tweet texts, then into a word list
tweets = [ r['text']
           for result in search_results
               for r in result['results'] ]
words = []
for t in tweets:
    words += t.split()

# print 'Total Words: ', len(words)
# print 'Unique Words: ', len(set(words))
# print 'Lexical Diversity: ', 1.0*len(set(words))/len(words)
# print 'Avg Words Per Tweet: ', 1.0*sum([len(t.split()) for t in tweets])/len(tweets)

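# As a made-up illustration of the diversity metric: 2,000 total words
# containing 500 distinct tokens would give 500/2000 = 0.25.
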
freq_dist = nltk.FreqDist(words)
# print 'Fifty most frequent tokens: ', freq_dist.keys()[:50]
# print 'Fifty least frequent tokens: ', freq_dist.keys()[-50:]

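# Note: freq_dist.keys() is frequency-sorted only in the NLTK releases
# contemporary with this script; on NLTK 3+ the equivalent would be
# freq_dist.most_common(50).
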
# Directed retweet graph: edges will point from the retweeted user to the retweeter
g = nx.DiGraph()

all_tweets = [ tweet
               for page in search_results
                   for tweet in page["results"] ]

def get_rt_sources(tweet):
    # Match "RT @user" / "via @user" conventions and capture the @-handles.
    # The filter is case-insensitive because the regex also matches "rt"/"Via".
    rt_patterns = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)
    return [ source.strip()
             for match in rt_patterns.findall(tweet)
                 for source in match
                     if source.lower() not in ("rt", "via") ]

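# Quick sanity check on hypothetical inputs:
#   get_rt_sources("RT @jdoe: check this out")      ->  ["@jdoe"]
#   get_rt_sources("interesting read via @asmith")  ->  ["@asmith"]
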
for tweet in all_tweets:
    rt_sources = get_rt_sources(tweet["text"])
    if not rt_sources: continue
    for rt_source in rt_sources:
        # networkx 1.x accepts an edge-attribute dict positionally;
        # on networkx 2+ this would be tweet_id=tweet["id"]
        g.add_edge(rt_source, tweet["from_user"], {"tweet_id": tweet["id"]})

print 'Number of Nodes: ', g.number_of_nodes()
print 'Number of Edges: ', g.number_of_edges()
print 'Number of Connected Components: ', len(nx.connected_components(g.to_undirected()))
print 'Degrees: ', list(nx.degree(g).values())
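# Note: the two lines above assume networkx 1.x, where connected_components
# returns a list and degree returns a dict. On networkx 2+ the equivalents
# would be, e.g.:
#   len(list(nx.connected_components(g.to_undirected())))
#   [d for _, d in g.degree()]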