Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import csv
import operator
import os.path
import sys
from collections import Counter
from itertools import combinations

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
# Candidate information (as described in the assignment writeup).
# CANDIDATE_NAMES maps each candidate's short key to the full display name.
CANDIDATE_NAMES = {
    "bush": "Jeb Bush",
    "carson": "Ben Carson",
    "christie": "Chris Christie",
    "cruz": "Ted Cruz",
    "fiorina": "Carly Fiorina",
    "gilmore": "Jim Gilmore",
    "graham": "Lindsey Graham",
    "huckabee": "Mike Huckabee",
    "jindal": "Bobby Jindal",
    "kasich": "John Kasich",
    "pataki": "George Pataki",
    "paul": "Rand Paul",
    "perry": "Rick Perry",
    "rubio": "Marco Rubio",
    "santorum": "Rick Santorum",
    "trump": "Donald Trump",
    "walker": "Scott Walker",
    "chafee": "Lincoln Chafee",
    "clinton": "Hillary Clinton",
    "omalley": "Martin O'Malley",
    "sanders": "Bernie Sanders",
    "webb": "Jim Webb",
}

# Short keys split into the GOP and DEM groups; ALL_CANDIDATES is the GOP
# list followed by the DEM list.
GOP_CANDIDATES = [
    "bush", "carson", "christie", "cruz", "fiorina", "gilmore",
    "graham", "huckabee", "jindal", "kasich", "pataki", "paul",
    "perry", "rubio", "santorum", "trump", "walker",
]
DEM_CANDIDATES = ["chafee", "clinton", "omalley", "sanders", "webb"]
ALL_CANDIDATES = GOP_CANDIDATES + DEM_CANDIDATES
# Figure dimensions: pass (FIGWIDTH, FIGHEIGHT) as the "figsize" parameter
# of matplotlib's "figure" function.
# Note: For task 4, use FIGWIDTH*2
FIGWIDTH, FIGHEIGHT = 12, 8

# Start and end time of the debate, in seconds (86400 and 97200).
DEBATE_START = 24 * 60 * 60
DEBATE_END = 27 * 60 * 60

# Maximum time in the dataset, in seconds (183600).
MAX_TIME = 51 * 60 * 60
def get_nice_colors(n_colors):
    """Generate colors for matplotlib functions that accept a list of colors.

    n_colors: the number of colors to generate.

    The Accent colormap is sampled at 1, 1 - 1/n_colors, ..., 1/n_colors.
    Using this function should result in the same colors shown in the
    assignment writeup.
    """
    sample_points = []
    for step in range(n_colors):
        sample_points.append(1 - (step / n_colors))
    return cm.Accent(sample_points)
- ################################################
- #
- # Your functions go here
- #
- # Call your functions from the __main__ block
- #
- ################################################
class Tweet:
    """Plain record holding the fields of a single tweet.

    Each constructor argument is stored, unchanged, on the attribute of the
    same name.
    """

    def __init__(self, seconds, length, candidates, retweet, when,
                 polarity, subjectivity, longitude, lat):
        # Grouped assignments, kept in the same order as the parameters.
        self.seconds, self.length, self.candidates = seconds, length, candidates
        self.retweet, self.when = retweet, when
        self.polarity, self.subjectivity = polarity, subjectivity
        self.longitude, self.lat = longitude, lat
def read_csv(filename):
    """Load tweets from a CSV file into a list of Tweet objects.

    The file needs a header row containing the columns 'seconds', 'length',
    'candidates', 'retweet', 'when', 'polarity', 'subjectivity', 'long' and
    'lat'. Every field is stored on the Tweet as the raw string read from
    the file, except 'candidates', which is split on '|' into a list of
    unique names.

    filename: path of the CSV file to read.

    Returns: a list with one Tweet per data row, in file order.
    Raises: KeyError if one of the columns above is missing from the header.
    """
    tweets = []
    # newline='' is what the csv module asks for so that newlines embedded
    # in quoted fields are handled by the reader itself.
    with open(filename, newline='') as f:
        for row in csv.DictReader(f):
            # Drop empty names: ''.split('|') yields [''], which would
            # otherwise count as one mentioned candidate.
            names = [name for name in row['candidates'].split('|') if name]
            # dict.fromkeys removes duplicates while keeping first-mention
            # order; set() ordering of strings varies between runs.
            candidates = list(dict.fromkeys(names))
            tweets.append(Tweet(row['seconds'], row['length'], candidates,
                                row['retweet'], row['when'], row['polarity'],
                                row['subjectivity'], row['long'], row['lat']))
    return tweets
def count_candidates_per_tweet(tweets_list):
    """Tally tweets by how many candidates each one lists.

    tweets_list: iterable of objects with a 'candidates' attribute that
        supports len().

    Returns: dict mapping a candidate count to the number of tweets whose
    'candidates' has that length, keyed in order of first appearance.
    """
    histogram = {}
    for tweet in tweets_list:
        n_mentioned = len(tweet.candidates)
        histogram[n_mentioned] = histogram.get(n_mentioned, 0) + 1
    return histogram
def count_candidate_pairs(tweets_list, top_n=10):
    """Count how often two candidates are mentioned in the same tweet.

    tweets_list: iterable of objects with a 'candidates' attribute holding
        candidate names.
    top_n: how many of the most frequent pairs to return (default 10);
        None returns every pair.

    Returns: dict mapping (name_a, name_b) tuples, with name_a < name_b
    alphabetically, to the number of tweets mentioning both. Entries are
    ordered from most to least frequent.
    """
    pair_counts = Counter()
    for tweet in tweets_list:
        # sorted(set(...)) gives every pair one canonical, alphabetically
        # ordered form and ensures a pair is counted once per tweet.
        unique_names = sorted(set(tweet.candidates))
        pair_counts.update(combinations(unique_names, 2))
    # most_common keeps pairs with tied counts, unlike inverting the
    # dictionary to count -> pair.
    return dict(pair_counts.most_common(top_n))
def count_candidate_mentions(tweets_list, threshold=0.03, other_label=None):
    """Compute each candidate's share of all candidate mentions.

    tweets_list: iterable of objects with a 'candidates' attribute holding
        candidate names.
    threshold: minimum share (a fraction between 0 and 1) a candidate needs
        in order to get an entry of their own; defaults to 0.03.
    other_label: when given, the combined share of the candidates below the
        threshold is added under this key (if there are any). With the
        default of None those candidates are simply left out.

    Returns: dict mapping candidate name to mentions / total mentions, in
    order of first mention. Empty when there are no mentions at all.
    """
    mention_counts = Counter(
        name for tweet in tweets_list for name in tweet.candidates
    )
    total = sum(mention_counts.values())

    shares = {}
    other_share = 0
    # With no mentions the loop body never runs, so total == 0 cannot
    # cause a division by zero.
    for name, count in mention_counts.items():
        share = count / total
        if share < threshold:
            other_share += share
        else:
            shares[name] = share

    if other_label is not None and other_share > 0:
        shares[other_label] = other_share
    return shares
def count_candidate_mentions_per_min(tweets_list):
    """Count candidate mentions in consecutive 60-second bins.

    NOTE(review): the original body was an unfinished draft (it indexed the
    list as a 2-D array and never filled the bins). This completes it along
    the draft's own outline: bins start at the earliest tweet and each key
    is the lower bound of its bin. Confirm against the assignment writeup.

    tweets_list: iterable of objects with a 'seconds' attribute (a number or
        numeric string; fractional seconds are truncated) and a 'candidates'
        attribute holding candidate names.

    Returns: dict mapping each bin's starting second (int) to a dict of
    candidate name -> number of mentions inside that bin. Bins without any
    tweet map to an empty dict. Empty when tweets_list is empty.
    Raises: ValueError if a 'seconds' value is not numeric.
    """
    tweets = list(tweets_list)
    if not tweets:
        return {}

    # 'seconds' may arrive as text (e.g. straight from a CSV reader), so
    # coerce it before doing arithmetic on it.
    timestamps = [int(float(tweet.seconds)) for tweet in tweets]
    first_sec = min(timestamps)
    last_sec = max(timestamps)

    # Create every bin up front so that minutes without tweets still appear.
    mentions_per_min = {
        start: {} for start in range(first_sec, last_sec + 1, 60)
    }
    for tweet, sec in zip(tweets, timestamps):
        # Integer division finds the bin directly; no search is needed.
        bin_start = first_sec + 60 * ((sec - first_sec) // 60)
        bin_counts = mentions_per_min[bin_start]
        for name in tweet.candidates:
            bin_counts[name] = bin_counts.get(name, 0) + 1
    return mentions_per_min
def convert_dict_to_lists(dictionary):
    """Split a dictionary into parallel lists of its keys and its values.

    Returns: a (keys, values) tuple of two lists, both in the dictionary's
    iteration order.
    """
    return list(dictionary), list(dictionary.values())
def plot_bar_graph(xs, ys, filename, plot_title=None, x_label=None,
                   y_label=None, x_label_rot=None):
    """Draw a blue bar chart with one bar per entry of xs and display it.

    xs: tick labels, one per bar.
    ys: bar heights, in the same order as xs.
    filename: currently unused; the figure is shown with plt.show() and is
        not written to disk.
        NOTE(review): presumably the path the PNG should be saved to --
        confirm and wire up savefig if so.
    plot_title: title of the plot, or None for no title.
    x_label: label of the x axis, or None for no label.
    y_label: label of the y axis, or None for no label.
    x_label_rot: rotation passed to plt.xticks for the tick labels, or None
        to leave the rotation at matplotlib's default.
    """
    positions = np.arange(len(xs))
    plt.figure()
    # Bars are explicitly centered on their positions so the tick labels
    # line up with them whatever matplotlib's default alignment is.
    plt.bar(positions, ys, color='blue', align='center')
    if plot_title is not None:
        plt.title('{}'.format(plot_title))
    if x_label is not None:
        plt.xlabel('{}'.format(x_label))
    if y_label is not None:
        plt.ylabel('{}'.format(y_label))
    tick_options = {} if x_label_rot is None else {'rotation': x_label_rot}
    plt.xticks(positions, xs, **tick_options)
    plt.show()
if __name__ == "__main__":
    # Command-line handling. There is one required parameter (the CSV file)
    # and one optional parameter (the directory where the PNG files will be
    # created; "./output" when it is not given).
    #
    # Once this section has run, two variables are set:
    #
    # - csv_file: The data file to read
    # - output_dir: The directory where the images should be generated
    arg_count = len(sys.argv)
    if arg_count < 2 or arg_count > 3:
        print("Usage: python3 {} <data file> [<output directory>]".format(sys.argv[0]))
        sys.exit(1)

    csv_file = sys.argv[1]
    # isfile() is already False for a path that does not exist.
    if not os.path.isfile(csv_file):
        print("{} does not exist or is not a file.".format(csv_file))
        sys.exit(1)

    if arg_count == 3:
        output_dir = sys.argv[2]
        # isdir() is already False for a path that does not exist.
        if not os.path.isdir(output_dir):
            print("{} does not exist or is not a directory.".format(output_dir))
            sys.exit(1)
    else:
        output_dir = "./output"

    # Use the following file names to generate the plots
    TASK1_FILE = "{}/bar_num_mentions.png".format(output_dir)
    TASK2_GOP_FILE = "{}/bar_candidates_together_gop.png".format(output_dir)
    TASK2_ALL_FILE = "{}/bar_candidates_together_all.png".format(output_dir)
    TASK3_GOP_FILE = "{}/candidates_gop.png".format(output_dir)
    TASK3_ALL_FILE = "{}/candidates_all.png".format(output_dir)
    TASK4A_DURING_FILE = "{}/mentions_over_time_during.png".format(output_dir)
    TASK4A_FULL_FILE = "{}/mentions_over_time.png".format(output_dir)
    TASK4B_FILE = "{}/stackplot.png".format(output_dir)

    # Your code goes here, BUT NOT **ALL** YOUR CODE.
    #
    # You should write functions that do all the work, and then
    # call them from here.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement