Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import sys
- from pymongo import MongoClient
- import json, io, datetime, time
- import networkx as nx
- import matplotlib.pyplot as plt
- from datetime import timedelta
- import gVar
- import logging
# Ids of every user added as a node by any earlier query_tweets() call in this
# process. A user already in this set is flagged firsttime=0 in later calls.
total_aware_user_list = set()


def query_tweets(G, db, startDate='9/24/2014', endDate='1/1/2015', hashtag=''):
    """Add one node to ``G`` for every user who tweeted inside a date window.

    Edge convention used elsewhere for this graph ("A follows B"): ``A -> B``,
    i.e. A is B's follower and B is A's friend.

    Every node is keyed by the tweet author's user id and receives these
    attributes:

    * ``timestamp``: the smallest ``timestamp_ms`` (as an integer) among the
      user's tweets returned by the query.
    * ``aware``: always 0 here.
    * ``firsttime``: 1 when the user was not added by any previous call
      (first-time adopter), 0 otherwise.
    * ``name``: the author's ``name`` taken from the tweet that supplied
      ``timestamp`` when that tweet reports more than 500 followers,
      otherwise the empty string.

    Args:
        G: Graph-like object exposing ``add_node(node, **attrs)``.
        db: Database handle exposing ``db.tweets.find(query)``.
        startDate: Inclusive window start, formatted ``%m/%d/%Y``.
        endDate: Inclusive window end, formatted ``%m/%d/%Y``.
        hashtag: When non-empty, only tweets whose
            ``entities.hashtags.text`` matches are queried.

    Raises:
        ValueError: If either date does not match ``%m/%d/%Y``.
    """
    logging.debug('\nQuery Tweets Begin')

    # The bounds are sent as *strings* because the query compares them with
    # the stored ``timestamp_ms`` field.
    # NOTE(review): time.mktime() interprets the date in the local timezone;
    # confirm that this matches how timestamp_ms was recorded.
    start_timestamp = str(int(time.mktime(
        datetime.datetime.strptime(startDate, "%m/%d/%Y").timetuple())) * 1000)
    end_timestamp = str(int(time.mktime(
        datetime.datetime.strptime(endDate, "%m/%d/%Y").timetuple())) * 1000)
    logging.debug(
        'Start date is %s, timestamp is %s. End date is %s, timestamp is %s',
        startDate, start_timestamp, endDate, end_timestamp)

    query_string = {
        'timestamp_ms': {"$gte": start_timestamp, "$lte": end_timestamp},
    }
    if hashtag:
        query_string['entities.hashtags.text'] = hashtag
    logging.debug('Query string is %s', query_string)

    # user id -> {'timestamp': int[, 'name': str]} for the earliest tweet seen.
    earliest = {}
    count = 0
    for count, tweet in enumerate(db.tweets.find(query_string), 1):
        if count % 10000 == 0:
            logging.debug('Tweet: processed %d tweets', count)
        author = tweet['user']
        user_id = author['id']
        timestamp = int(tweet['timestamp_ms'])

        # Only keep the earliest timestamp per user.
        known = earliest.get(user_id)
        if known is not None and known['timestamp'] < timestamp:
            continue

        entry = {'timestamp': timestamp}
        # Only users with more than 500 followers get a name on their node.
        if int(author['followers_count']) > 500:
            entry['name'] = author['name']
        earliest[user_id] = entry
    logging.debug('Finished querying tweets: Processed %d tweets.', count)

    for added, (user, info) in enumerate(earliest.items(), 1):
        if added % 10000 == 0:
            logging.debug('Graph: Added %d nodes', added)
        # A user seen in an earlier call is not a first-time adopter.
        first_time = 0 if user in total_aware_user_list else 1
        G.add_node(user, timestamp=info['timestamp'], aware=0,
                   firsttime=first_time, name=info.get('name', ''))

    # Mutate in place instead of rebuilding the whole set on every call.
    total_aware_user_list.update(earliest)
    logging.debug('Finished adding nodes: Added %d nodes.', len(earliest))
def draw_and_save(G, filename='draw.gml.gz'):
    """Serialize graph ``G`` to ``filename`` in GraphML format.

    Despite the name, nothing is drawn: the graph is only written to disk.
    Note that the default file name carries a ``.gml.gz`` suffix although the
    content written is GraphML.

    Args:
        G: The networkx graph to save.
        filename: Destination path handed to ``nx.write_graphml``.
    """
    nx.write_graphml(G, filename)
def add_relations(G, db):
    """Add follow edges between the nodes already present in ``G``.

    For every node, the matching ``social_network`` record is looked up by
    ``userid``. Edges follow the "A follows B" convention ``A -> B``:

    * when ``friends_status`` is 2, ``userid -> friend`` is added for each
      entry of ``friends`` that is itself a node of ``G``;
    * when ``following_status`` is 2, ``follower -> userid`` is added for
      each entry of ``followers`` that is itself a node of ``G``.

    Records whose status is 0 are only tallied and reported in the final log
    line (presumably their lists are still waiting to be fetched; confirm
    against the crawler that fills ``social_network``). Nodes with no record
    are logged and skipped.

    The logged edge count is the number of ``add_edge`` calls, so an edge
    discovered from both of its endpoints is counted twice.

    Args:
        G: Graph-like object exposing ``nodes()`` and ``add_edge(u, v)``.
        db: Database handle exposing ``db.social_network.find_one(query)``.
    """
    logging.debug('Query relations Begin')
    edge_count = 0
    friends_wait_count = 0
    following_wait_count = 0

    # Snapshot the node ids once: gives O(1) membership tests and keeps the
    # iteration independent of the graph being modified by add_edge().
    nodes_list = set(G.nodes())
    total_nodes = len(nodes_list)
    logging.debug('Have %d nodes', total_nodes)

    for node_count, node in enumerate(nodes_list, 1):
        result = db.social_network.find_one({"userid": int(node)})
        if result is None:
            logging.debug('Node %s is not in the database.', int(node))
            continue

        user_id = result['userid']
        if result['friends_status'] == 2:
            for friend in result['friends']:
                # Add the edge only if the friend is in the graph.
                if friend in nodes_list:
                    edge_count += 1
                    G.add_edge(user_id, friend)
        if result['following_status'] == 2:
            for follower in result['followers']:
                # Add the edge only if the follower is in the graph.
                if follower in nodes_list:
                    edge_count += 1
                    G.add_edge(follower, user_id)

        if result['friends_status'] == 0:
            friends_wait_count += 1
        if result['following_status'] == 0:
            following_wait_count += 1

        if node_count % 100 == 0:
            logging.debug('%d nodes Done. %d nodes remain',
                          node_count, total_nodes - node_count)

    logging.debug(
        'Query relations End, processed %d edges. Wait for friends count is '
        '%d. Wait for followers count is %d',
        edge_count, friends_wait_count, following_wait_count)
if __name__ == "__main__":
    # Command line: [hashtag] [startday] [steplength in days] [stepnumber].
    # One GraphML file is written per step, each step covering
    # [start, start + steplength].
    if len(sys.argv) != 5:
        # sys.exit() with a string prints it to stderr and exits with
        # status 1, so callers can detect the usage error.
        sys.exit('Arguments error!\n'
                 'Usage: python draw_network.py [hashtag] '
                 '[startday(%m/%d/%Y)] [steplength(day)] [stepnumber]')

    logging.basicConfig(filename='draw_network.log', level=logging.DEBUG,
                        format='%(asctime)s %(message)s')
    gVar.init()

    tv_show = sys.argv[1]
    start = datetime.datetime.strptime(sys.argv[2], "%m/%d/%Y")
    duration = datetime.timedelta(days=int(sys.argv[3]))
    logging.debug('Query begin\n hashtag:{0}, startday:{1}, step length:{2}, step number:{3}'.format(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4]))

    for _ in range(int(sys.argv[4])):
        end = start + duration
        query_tweets(gVar.G, gVar.db,
                     startDate=start.strftime("%m/%d/%Y"),
                     endDate=end.strftime("%m/%d/%Y"),
                     hashtag=tv_show)
        add_relations(gVar.G, gVar.db)
        draw_and_save(gVar.G,
                      filename=tv_show + '_' + start.strftime("%Y_%m_%d")
                      + '-' + end.strftime("%Y_%m_%d") + '.graphml')
        # gVar.clean() is called between steps, presumably to reset the
        # shared graph before the next window; confirm in gVar.
        gVar.clean()
        start = end
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement