Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- '''Skeleton file with all strings for Mimir testing'''
- import string, calendar, pylab,csv
- from operator import itemgetter
# Month names "January".."December"; index 0 of calendar.month_name is an
# empty string, so the slice starts at 1.
MONTH_NAMES = calendar.month_name[1:13]
def open_file():
    '''Prompt for a file name until one can be opened; return the file object.

    Each time the named file does not exist, an error message is printed
    and the prompt is repeated.
    '''
    file_object = None
    while file_object is None:
        filename = input("Input a filename: ")
        try:
            file_object = open(filename)
        except FileNotFoundError:
            print("Error in input filename. Please try again.")
    return file_object
def validate_hashtag(s):
    '''Return True if s is a usable hashtag, False otherwise.

    A string of two characters or fewer is rejected. Otherwise every
    character after the first must not appear in string.punctuation.
    The first character itself is not inspected here.
    '''
    if len(s) <= 2:
        return False
    return not any(ch in string.punctuation for ch in s[1:])
def get_hashtags(s):
    '''Return the valid hashtags found in s, in order of appearance.

    s is split on whitespace; a word is kept when it begins with '#' and
    validate_hashtag accepts it.
    '''
    return [
        word
        for word in s.split()
        if word.startswith('#') and validate_hashtag(word)
    ]
def read_data(fp):
    '''Parse the comma-separated rows of fp into a list of records.

    Each record is [column 0, int(column 1), hashtags], where hashtags is
    the list get_hashtags extracts from column 2.
    '''
    def to_record(fields):
        # Extract the hashtags before converting column 1, as a row that
        # is both too short and non-numeric should fail on the lookup.
        tags = get_hashtags(fields[2])
        return [fields[0], int(fields[1]), tags]

    return [to_record(fields) for fields in csv.reader(fp, delimiter=',')]
def get_histogram_tag_count_for_users(data,usernames):
    '''Count hashtag occurrences across the records of the given users.

    data is a sequence of records whose element 0 is a user name and
    element 2 is a list of hashtags. Returns a dict mapping each hashtag
    to the number of times it occurs in records whose user name is in
    usernames.
    '''
    counts = {}
    for record in data:
        if record[0] not in usernames:
            continue
        for tag in record[2]:
            counts[tag] = counts.get(tag, 0) + 1
    return counts
def get_tags_by_month_for_users(data,usernames):
    '''Collect the distinct hashtags used in each month by the given users.

    data is a sequence of records whose element 0 is a user name, element
    1 a month number and element 2 a list of hashtags. Returns a list of
    twelve (month, set_of_hashtags) tuples for months 1 through 12; only
    records whose user name is in usernames contribute.
    '''
    monthly = [(month, set()) for month in range(1, 13)]
    for record in data:
        if record[0] not in usernames:
            continue
        month = record[1]
        for tag in record[2]:
            # Month numbers are 1-based, list positions 0-based.
            monthly[month - 1][1].add(tag)
    return monthly
def get_user_names(L):
    '''Return the distinct user names in L as a sorted list.

    The user name is element 0 of each record in L.
    '''
    return sorted({record[0] for record in L})
def three_most_common_hashtags_combined(L,usernames):
    '''Return up to three (count, hashtag) tuples with the highest counts.

    Counts are taken over all records in L belonging to any user in
    usernames. The result is ordered by descending count.
    '''
    tag_counts = get_histogram_tag_count_for_users(L, usernames)
    ranked = [(count, tag) for tag, count in tag_counts.items()]
    # Only the count is used as the sort key.
    ranked.sort(key=itemgetter(0), reverse=True)
    return ranked[:3]
def three_most_common_hashtags_individuals(data_lst,usernames):
    '''Return up to three (count, hashtag, username) tuples with the highest counts.

    Each user's hashtags are counted separately, and the per-user counts
    are then ranked together. The result is ordered by descending count.
    '''
    ranked = []
    for user in usernames:
        per_user = get_histogram_tag_count_for_users(data_lst, [user])
        ranked.extend((count, tag, user) for tag, count in per_user.items())
    # Only the count is used as the sort key.
    ranked.sort(key=itemgetter(0), reverse=True)
    return ranked[:3]
def similarity(data_lst,user1,user2):
    '''Return, for each month, the hashtags used by both user1 and user2.

    The result is a list of twelve (month, shared_hashtags) tuples for
    months 1 through 12, where shared_hashtags is a set.
    '''
    first = get_tags_by_month_for_users(data_lst, [user1])
    second = get_tags_by_month_for_users(data_lst, [user2])
    return [
        (month, tags_first & tags_second)
        for (month, tags_first), (_, tags_second) in zip(first, second)
    ]
def plot_similarity(x_list,y_list,name1,name2):
    '''Draw y_list against x_list and show the figure.

    The x ticks are labelled with the month names and the title names
    both users (name1 and name2).
    '''
    pylab.plot(x_list, y_list)
    pylab.xticks(x_list, MONTH_NAMES, rotation=45, ha='right')
    pylab.ylabel('Hashtag Similarity')
    chart_title = 'Twitter Similarity Between '+name1+' and '+name2
    pylab.title(chart_title)
    pylab.tight_layout()
    pylab.show()
    # pylab.savefig("plot.png") would write the figure to a file; it stays
    # disabled in the submitted version.
def main():
    '''Run the interactive hashtag analysis.

    Steps:
      1. Prompt for a data file and read its records.
      2. Print the three most common hashtags overall and per individual.
      3. Prompt for two user names from the data until both are valid.
      4. Print how many hashtags the two users share in each month.
      5. Ask whether to plot those monthly counts, and plot on 'yes'.
    '''
    # Close the data file as soon as its records have been read.
    with open_file() as fp:
        data = read_data(fp)

    usernames = get_user_names(data)
    top_combined = three_most_common_hashtags_combined(data, usernames)
    top_individual = three_most_common_hashtags_individuals(data, usernames)

    print("Top Three Hashtags Combined")
    print("{:>6s} {:<20s}".format("Count","Hashtag"))
    for count, hashtag in top_combined:
        print("{:>6d} {:<20s}".format(count,hashtag))
    print()

    print("Top Three Hashtags by Individual")
    print("{:>6s} {:<20s} {:<20s}".format("Count","Hashtag","User"))
    for count, hashtag, user in top_individual:
        print("{:>6d} {:<20s} {:<20s}".format(count, hashtag, user))
    print()

    print("Usernames: ", usernames)
    while True:  # prompt for and validate user names
        user_str = input("Input two user names from the list, comma separated: ")
        # Strip every name so "a, b" is accepted as well as "a,b".
        user_lst = [name.strip() for name in user_str.split(",")]
        if len(user_lst) >= 2 and user_lst[0] in usernames and user_lst[1] in usernames:
            break
        print("Error in user names. Please try again")
    user1, user2 = user_lst[0], user_lst[1]

    # The result must not be bound to the name `similarity`: assigning to
    # that name would make it local to main() and the call itself would
    # raise UnboundLocalError.
    monthly_shared = similarity(data, user1, user2)

    print()
    print("Similarities for "+user1+" and "+user2)
    print("{:12s}{:6s}".format("Month","Count"))
    shared_counts = []
    for month, tags in monthly_shared:
        shared_counts.append(len(tags))
        # Month numbers are 1-based; MONTH_NAMES is 0-based.
        print("{:12s}{:<6d}".format(MONTH_NAMES[month - 1], len(tags)))
    print()

    # Prompt for a plot
    choice = input("Do you want to plot (yes/no)?: ")
    if choice.lower() == 'yes':
        x_list = [month for month, _ in monthly_shared]
        plot_similarity(x_list, shared_counts, user1, user2)
# Start the interactive program only when this file is run as a script,
# not when it is imported.
if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement