Advertisement
Guest User

Untitled

a guest
Nov 19th, 2017
53
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.16 KB | None | 0 0
  1. '''Skeleton file with all strings for Mimir testing'''
  2.  
  3. import string, calendar, pylab,csv
  4. from operator import itemgetter
  5.  
  6. MONTH_NAMES = [calendar.month_name[month] for month in range(1,13)]
  7.  
  8. def open_file():
  9. '''docstring'''
  10. '''prompt for file name, open file, return file pointer'''
  11. while True: # Infinite loop
  12. # Check for correct file name
  13. try:
  14. # Open file here -- done with filename
  15. filename = input("Input a filename: ")
  16. file_object = open(filename)
  17. return file_object # File pointer to return
  18. except FileNotFoundError:
  19. print("Error in input filename. Please try again.")
  20.  
  21. def validate_hashtag(s):
  22. '''docstring'''
  23. if len(s) <=2:
  24. return False
  25. else :
  26. inp = s[1:]
  27. for ch in inp:
  28. if ch in string.punctuation:
  29. return False
  30. return True
  31.  
  32. def get_hashtags(s):
  33. '''docstring'''
  34. words = s.split()
  35. hash_tags = []
  36. for word in words:
  37. if word[0] == '#':
  38. is_valid = validate_hashtag(word)
  39. if is_valid:
  40. hash_tags.append(word)
  41. return hash_tags
  42.  
  43. def read_data(fp):
  44. '''docstring'''
  45. rows = csv.reader(fp, delimiter=',')
  46. result = []
  47. for row in rows:
  48. hashtags = get_hashtags(row[2])
  49. result.append([row[0], int(row[1]), hashtags])
  50. return result
  51.  
  52.  
  53. def get_histogram_tag_count_for_users(data,usernames):
  54. '''docstring'''
  55. histogram = {}
  56. for row in data:
  57. if row[0] in usernames:
  58. hashtags = row[2]
  59. for hashtag in hashtags:
  60. if hashtag in histogram:
  61. histogram[hashtag] = histogram[hashtag]+1
  62. else:
  63. histogram[hashtag] = 1
  64.  
  65. return histogram
  66.  
  67. def get_tags_by_month_for_users(data,usernames):
  68. '''docstring'''
  69. result = []
  70. for i in range(1, 13):
  71. result.append((i, set()))
  72. for row in data:
  73. if row[0] in usernames:
  74. month = row[1]
  75. for hashtag in row[2]:
  76. (result[month-1][1]).add(hashtag)
  77.  
  78. return result
  79.  
  80. def get_user_names(L):
  81. '''docstring'''
  82. names = set()
  83. for data in L:
  84. names.add(data[0])
  85. names = list(names)
  86. names.sort()
  87. return names
  88.  
  89. def three_most_common_hashtags_combined(L,usernames):
  90. '''docstring'''
  91. histogram = get_histogram_tag_count_for_users (L, usernames)
  92. data = []
  93. for (hastag, count) in histogram.items():
  94. data.append((count, hastag))
  95. sorted_data = sorted(data, key = itemgetter(0), reverse = True)
  96. return sorted_data[0:3]
  97.  
  98. def three_most_common_hashtags_individuals(data_lst,usernames):
  99. '''docstring'''
  100. data = []
  101. for username in usernames:
  102. user_dict = get_histogram_tag_count_for_users(data_lst, [username])
  103. for (hashtag, count) in user_dict.items():
  104. data.append((count, hashtag, username))
  105. sorted_data = sorted(data, key = itemgetter(0), reverse = True)
  106. return sorted_data[0:3]
  107.  
  108.  
  109. def similarity(data_lst,user1,user2):
  110. '''docstring'''
  111. user1_data = get_tags_by_month_for_users(data_lst, [user1])
  112. user2_data = get_tags_by_month_for_users(data_lst, [user2])
  113. data = []
  114. for i in range(0, 12):
  115. user1_tags = user1_data[i][1]
  116. user2_tags = user2_data[i][1]
  117. tags = user1_tags.intersection(user2_tags)
  118. data.append((user1_data[i][0], tags))
  119. return data
  120.  
  121.  
  122. def plot_similarity(x_list,y_list,name1,name2):
  123. '''Plot y vs. x with name1 and name2 in the title.'''
  124.  
  125. pylab.plot(x_list,y_list)
  126. pylab.xticks(x_list,MONTH_NAMES,rotation=45,ha='right')
  127. pylab.ylabel('Hashtag Similarity')
  128. pylab.title('Twitter Similarity Between '+name1+' and '+name2)
  129. pylab.tight_layout()
  130. pylab.show()
  131. # the next line is simply to illustrate how to save the plot
  132. # leave it commented out in the version you submit
  133. #pylab.savefig("plot.png")
  134.  
  135.  
  136. def main():
  137. # Open the file
  138. file = open_file()
  139. # Read the data from the file
  140. data = read_data(file)
  141. # Create username list from data
  142. usernames_str = get_user_names(data)
  143. # Calculate the top three hashtags combined for all users
  144. top_hashtags = three_most_common_hashtags_combined(data,usernames_str)
  145. # Print them
  146. print(top_hashtags)
  147. # Calculate the top three hashtags individually for all users
  148. top3_individual_hashtags = three_most_common_hashtags_individuals(data, usernames_str)
  149. # Print them
  150. print(top3_individual_hashtags)
  151. # Prompt for two user names from username list
  152. # Calculate similarity for the two users
  153. # Print them
  154. # Prompt to plot or not and plot if 'yes'
  155.  
  156.  
  157. print("Top Three Hashtags Combined")
  158. print("{:>6s} {:<20s}".format("Count","Hashtag"))
  159. # your printing loop goes here
  160. for (count,hashtag) in top_hashtags:
  161. print("{:>6d} {:<20s}".format(count,hashtag))
  162. print()
  163.  
  164. print("Top Three Hashtags by Individual")
  165. print("{:>6s} {:<20s} {:<20s}".format("Count","Hashtag","User"))
  166. # your printing loop goes here
  167. print()
  168.  
  169. print("Usernames: ", usernames_str)
  170. while True: # prompt for and validate user names
  171. user_str = input("Input two user names from the list, comma separated: ")
  172. user_lst = user_str.strip().split(",")
  173. if len(user_lst) >= 2 and user_lst[0] in usernames_str and user_lst[1] in usernames_str:
  174. break
  175. else:
  176. print("Error in user names. Please try again")
  177. similarity = similarity(data, user_lst[0], user_lst[1])
  178.  
  179. # calculate similarity here
  180. print()
  181. #print("Similarities for "+users[0]+" and "+users[1])
  182. print("{:12s}{:6s}".format("Month","Count"))
  183. # your printing loop goes here
  184. print()
  185.  
  186. # Prompt for a plot
  187. #choice = input("Do you want to plot (yes/no)?: ")
  188. #if choice.lower() == 'yes':
  189. # create x_list and y_list
  190. #plot_similarity(x_list,y_list,users[0],users[1])
  191.  
  192. if __name__ == '__main__':
  193. main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement