Advertisement
Guest User

Untitled

a guest
Feb 12th, 2016
51
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.57 KB | None | 0 0
  1. import sys
  2.  
  3. from pymongo import MongoClient
  4. import json, io, datetime, time
  5. import networkx as nx
  6. import matplotlib.pyplot as plt
  7. from datetime import timedelta
  8. import gVar
  9.  
  10. import logging
  11.  
  12. total_aware_user_list=set()
  13. # Function query_tweets(DiGraph, database, startDate, endDate, hashtag):
  14. # A follow B:
  15. # A -> B
  16. # A is B's follower
  17. # B is A's friend
  18. #
  19. # update node information
  20. # create_date: tweet create date
  21. # timestamp: date's timestamp format
  22. # hash_list: which hashtags it include
  23. # screen_name: user name
  24. # firsttime: 1. first-time adopter, 0. non first-time adopter
  25. def query_tweets(G, db, startDate='9/24/2014', endDate='1/1/2015', hashtag=''):
  26. global total_aware_user_list
  27.  
  28. logging.debug('\nQuery Tweets Begin')
  29.  
  30. count = 0
  31. tweets_list = {}
  32. query_string = {}
  33.  
  34. start_timestamp = str(long(time.mktime(datetime.datetime.strptime(startDate, "%m/%d/%Y").timetuple()))*1000)
  35. end_timestamp = str(long(time.mktime(datetime.datetime.strptime(endDate, "%m/%d/%Y").timetuple()))*1000)
  36. logging.debug('Start date is '+startDate+', timestamp is ' + start_timestamp + '. End date is '+ endDate +', timestamp is ' + end_timestamp)
  37.  
  38. query_string['timestamp_ms'] = {"$gte": start_timestamp, "$lte": end_timestamp}
  39. if hashtag:
  40. query_string['entities.hashtags.text'] = hashtag
  41. logging.debug('Query string is ' + str(query_string))
  42.  
  43. # Get tweets author list
  44. #for tweet in db.tweets.find({"$where": "this.entities.hashtags.length > 1"}).limit(10):
  45. #for tweet in db.tweets.find({"timestamp_ms": {"$lte": "1414850272000"}}, timeout=False):
  46. for tweet in db.tweets.find(query_string):
  47. count += 1
  48. if count % 10000 == 0:
  49. logging.debug('Tweet: processed ' + str(count) + ' tweets')
  50.  
  51. user_id = tweet['user']['id']
  52. # Only keep the earliest timestamp
  53. if user_id in tweets_list and tweets_list[user_id]['timestamp'] < long(tweet['timestamp_ms']):
  54. continue
  55. tweets_list[user_id] = {'timestamp': long(tweet['timestamp_ms'])}
  56. # Add user name is user's follower is larger than 500
  57. if long(tweet['user']['followers_count']) > 500:
  58. tweets_list[user_id]['name'] = tweet['user']['name']
  59.  
  60. logging.debug('Finished querying tweets: Processed {0} tweets.'.format(str(count)))
  61.  
  62. count = 0
  63. for user in tweets_list:
  64. first_time = 1
  65. count += 1
  66. if count % 10000 == 0:
  67. logging.debug('Graph: Added ' + str(count) + ' nodes')
  68. # find first-time adopter
  69. if user in total_aware_user_list:
  70. first_time = 0
  71. node_name = "";
  72. if 'name' in tweets_list[user]:
  73. node_name = tweets_list[user]['name']
  74. G.add_node(user, timestamp=tweets_list[user]['timestamp'], aware=0, firsttime=first_time, name=node_name)
  75. total_aware_user_list = total_aware_user_list.union(set(tweets_list.keys()))
  76. logging.debug('Finished adding nodes: Added {0} nodes.'.format(str(count)))
  77.  
  78. def draw_and_save(G, filename='draw.gml.gz'):
  79. nx.write_graphml(G, filename)
  80.  
  81. def add_relations(G, db):
  82. logging.debug('Query relations Begin')
  83. start = datetime.datetime.now()
  84. count = 0
  85. friends_wait_count = 0
  86. following_wait_count = 0
  87. node_count =0
  88.  
  89. nodes_list = set(nx.nodes(G))
  90. logging.debug('Have {0} nodes'.format(len(nodes_list)))
  91. for node in nodes_list:
  92. node_count += 1
  93. result = db.social_network.find_one({"userid": long(node)})
  94. if result == None:
  95. logging.debug('Node {0} is not in the database.'.format(long(node)))
  96. continue
  97. if result['friends_status'] == 2:
  98. for friend in result['friends']:
  99. # add edge if friend is in the graph
  100. if friend in nodes_list:
  101. count += 1
  102. G.add_edge(result['userid'], friend)
  103. if result['following_status'] == 2:
  104. for follower in result['followers']:
  105. # add edge if follower is in the graph
  106. if follower in nodes_list:
  107. count += 1
  108. G.add_edge(follower, result['userid'])
  109. if result['friends_status'] == 0:
  110. friends_wait_count += 1
  111. if result['following_status'] == 0:
  112. following_wait_count += 1
  113. if node_count % 100 == 0:
  114. logging.debug(str(node_count) + ' nodes Done. '+ str(len(nodes_list)-node_count) + ' nodes remain')
  115. # end = datetime.datetime.now()
  116. #logging.debug('Add relations: processed ' + str(count) + ' users, cost ' + str(float((end-start).seconds)/3600) + ' hours.')
  117. logging.debug('Query relations End, processed ' + str(count) + ' edges. Wait for friends count is ' + str(friends_wait_count) + '. Wait for followers count is ' + str(following_wait_count))
  118.  
  119.  
  120. if __name__ == "__main__":
  121.  
  122. if len(sys.argv) != 5:
  123. print 'Arguments error!'
  124. print 'Usage: python draw_network.py [hashtag] [startday(%m/%d/%y)] [steplength(day)] [stepnumber]'
  125. sys.exit()
  126. logging.basicConfig(filename='draw_network.log', level=logging.DEBUG, format='%(asctime)s %(message)s')
  127. gVar.init()
  128. #query_social_network(gVar.G, gVar.db)
  129. tv_show = sys.argv[1]
  130. start = datetime.datetime.strptime(sys.argv[2], "%m/%d/%Y")
  131. duration = datetime.timedelta(days=int(sys.argv[3]))
  132. logging.debug('Query begin\n hashtag:{0}, startday:{1}, step length:{2}, step number:{3}'.format(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4]))
  133.  
  134. for i in xrange(0,int(sys.argv[4])):
  135. end = start + duration
  136. query_tweets(gVar.G, gVar.db, startDate=start.strftime("%m/%d/%Y"), endDate=end.strftime("%m/%d/%Y"), hashtag=tv_show)
  137. add_relations(gVar.G, gVar.db)
  138. draw_and_save(gVar.G, filename=tv_show+'_'+start.strftime("%Y_%m_%d")+'-'+end.strftime("%Y_%m_%d")+'.graphml')
  139. gVar.clean()
  140. start = end
  141.  
  142. #draw_and_save(gVar.G)
  143. #query_tweets(gVar.G, gVar.db, startDate='9/24/2014', endDate='9/29/2015', hashtag='atoz')
  144. #draw_and_save(gVar.G)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement