# Untitled

'''Skeleton file with all strings for Mimir testing'''

import string, calendar, pylab,csv
from operator import itemgetter

# Full month names, January..December; index 0 of calendar.month_name is an
# empty placeholder, so it is sliced off.
MONTH_NAMES = calendar.month_name[1:]

def open_file():
    '''Prompt for a file name until one can be opened; return the file object.

    The prompt is repeated whenever the entered name raises
    FileNotFoundError. Any other error raised by open() propagates.

    Returns:
        The file object returned by open(filename) in its default mode.
    '''
    file_object = None
    while file_object is None:
        filename = input("Input a filename: ")
        try:
            file_object = open(filename)
        except FileNotFoundError:
            print("Error in input filename. Please try again.")
    return file_object

def validate_hashtag(s):
    '''Return True if s passes the hashtag length and punctuation checks.

    s is rejected when it is two characters or shorter, or when any
    character after the first one appears in string.punctuation. The
    first character itself is not inspected here.

    Args:
        s: The candidate hashtag string.

    Returns:
        True if s passes both checks, otherwise False.
    '''
    return len(s) > 2 and not any(
        character in string.punctuation for character in s[1:]
    )

def get_hashtags(s):
    '''Return the valid hashtags found in s, in order of appearance.

    s is split on whitespace; a token is kept when it starts with '#'
    and validate_hashtag() accepts it. Duplicates are kept.

    Args:
        s: The text to scan.

    Returns:
        A list of the accepted tokens, each including its leading '#'.
    '''
    return [
        token
        for token in s.split()
        if token.startswith('#') and validate_hashtag(token)
    ]

def read_data(fp):
    '''Parse comma-separated records from fp into a list of rows.

    For every non-blank row, column 0 is kept as-is, column 1 is
    converted with int(), and column 2 is reduced to the hashtags that
    get_hashtags() finds in it.

    Args:
        fp: An iterable of text lines, such as an open CSV file.

    Returns:
        A list of [column 0, int(column 1), hashtag list] lists, in the
        order the rows were read.

    Raises:
        IndexError: If a non-blank row has fewer than three columns.
        ValueError: If column 1 of a row is not an integer.
    '''
    result = []
    for row in csv.reader(fp, delimiter=','):
        # csv.reader yields an empty list for a blank line (for example a
        # stray empty line at the end of the file); skip it instead of
        # failing on row[2].
        if not row:
            continue
        hashtags = get_hashtags(row[2])
        result.append([row[0], int(row[1]), hashtags])
    return result


def get_histogram_tag_count_for_users(data,usernames):
    '''Count hashtag occurrences across the rows of the given users.

    Args:
        data: Rows indexable as row[0] (user name) and row[2] (iterable
            of hashtags).
        usernames: Container of user names whose rows are counted.

    Returns:
        A dict mapping each hashtag to the number of times it occurs in
        the selected rows; keys appear in first-seen order.
    '''
    tag_counts = {}
    for record in data:
        if record[0] not in usernames:
            continue
        for tag in record[2]:
            tag_counts[tag] = tag_counts.get(tag, 0) + 1
    return tag_counts

def get_tags_by_month_for_users(data,usernames):
    '''Collect the distinct hashtags used in each month by the given users.

    Args:
        data: Rows indexable as row[0] (user name), row[1] (month
            number) and row[2] (iterable of hashtags).
        usernames: Container of user names whose rows are included.

    Returns:
        A list of twelve (month_number, set_of_hashtags) tuples for
        months 1 through 12; months with no hashtags have an empty set.
    '''
    monthly_tags = [(month_number, set()) for month_number in range(1, 13)]
    for record in data:
        if record[0] not in usernames:
            continue
        month = record[1]
        for tag in record[2]:
            # month - 1 is used directly as the list index, so month
            # numbers are expected to run from 1 to 12.
            monthly_tags[month - 1][1].add(tag)
    return monthly_tags

def get_user_names(L):
    '''Return the distinct user names in L, sorted in ascending order.

    Args:
        L: Rows whose first element (row[0]) is the user name.

    Returns:
        A sorted list containing each user name once.
    '''
    return sorted({record[0] for record in L})

def three_most_common_hashtags_combined(L,usernames):
    '''Return up to three most-used hashtags across all given users.

    Counts come from get_histogram_tag_count_for_users(). Entries are
    ordered by count, highest first; entries with equal counts keep the
    order in which the histogram produced them.

    Args:
        L: The data rows passed through to the histogram helper.
        usernames: The user names whose rows are counted together.

    Returns:
        A list of at most three (count, hashtag) tuples.
    '''
    tag_counts = get_histogram_tag_count_for_users(L, usernames)
    ranked = [(count, tag) for tag, count in tag_counts.items()]
    ranked.sort(key=itemgetter(0), reverse=True)
    return ranked[:3]

def three_most_common_hashtags_individuals(data_lst,usernames):
    '''Return up to three highest per-user hashtag counts.

    A separate histogram is built for each user with
    get_histogram_tag_count_for_users(), and all (count, hashtag, user)
    entries are ranked together by count, highest first. Entries with
    equal counts keep the order in which they were collected.

    Args:
        data_lst: The data rows passed through to the histogram helper.
        usernames: Iterable of user names to evaluate one at a time.

    Returns:
        A list of at most three (count, hashtag, username) tuples.
    '''
    candidates = []
    for user in usernames:
        per_user_counts = get_histogram_tag_count_for_users(data_lst, [user])
        candidates.extend(
            (count, tag, user) for tag, count in per_user_counts.items()
        )
    candidates.sort(key=itemgetter(0), reverse=True)
    return candidates[:3]


def similarity(data_lst,user1,user2):
    '''Return, per month, the hashtags that both users used.

    Each user's monthly hashtag sets come from
    get_tags_by_month_for_users(); the two sets for the same month are
    intersected.

    Args:
        data_lst: The data rows passed through to the monthly helper.
        user1: First user name.
        user2: Second user name.

    Returns:
        A list of (month_number, set_of_shared_hashtags) tuples, one per
        month, in the order the monthly helper returns them.
    '''
    first_by_month = get_tags_by_month_for_users(data_lst, [user1])
    second_by_month = get_tags_by_month_for_users(data_lst, [user2])
    return [
        (month, first_tags & second_tags)
        for (month, first_tags), (_, second_tags)
        in zip(first_by_month, second_by_month)
    ]


def plot_similarity(x_list,y_list,name1,name2):
    '''Display a line plot of y_list against x_list, titled with both names.

    The x-axis ticks are placed at the values of x_list and labelled
    with MONTH_NAMES, rotated 45 degrees and right-aligned.

    Args:
        x_list: x positions; also used as the tick locations.
        y_list: y values plotted against x_list.
        name1: First name shown in the title (a string).
        name2: Second name shown in the title (a string).
    '''
    title = ''.join(('Twitter Similarity Between ', name1, ' and ', name2))

    pylab.plot(x_list, y_list)
    pylab.xticks(x_list, MONTH_NAMES, rotation=45, ha='right')
    pylab.ylabel('Hashtag Similarity')
    pylab.title(title)
    pylab.tight_layout()
    pylab.show()
    # Saving the figure is intentionally left disabled; the line below
    # only illustrates how it would be done.
    #pylab.savefig("plot.png")


def main():
    '''Run the interactive hashtag report.

    Steps, in order:
      1. Prompt for a data file and read its rows.
      2. Print the top three hashtags over all users combined.
      3. Print the top three (count, hashtag, user) entries by individual.
      4. Prompt for two user names from the list until both are valid.
      5. Print, per month, how many hashtags the two users share.
      6. Ask whether to plot the monthly counts and plot on 'yes'.
    '''
    # Read everything up front so the file can be closed immediately.
    with open_file() as fp:
        data = read_data(fp)
    usernames_str = get_user_names(data)

    top_hashtags = three_most_common_hashtags_combined(data, usernames_str)
    top3_individual_hashtags = three_most_common_hashtags_individuals(
        data, usernames_str)

    print("Top Three Hashtags Combined")
    print("{:>6s} {:<20s}".format("Count","Hashtag"))
    for count, hashtag in top_hashtags:
        print("{:>6d} {:<20s}".format(count, hashtag))
    print()

    print("Top Three Hashtags by Individual")
    print("{:>6s} {:<20s} {:<20s}".format("Count","Hashtag","User"))
    for count, hashtag, user in top3_individual_hashtags:
        print("{:>6d} {:<20s} {:<20s}".format(count, hashtag, user))
    print()

    print("Usernames: ", usernames_str)
    while True:  # prompt for and validate user names
        user_str = input("Input two user names from the list, comma separated: ")
        # Strip each name separately so input such as "a, b" is accepted.
        user_lst = [name.strip() for name in user_str.split(",")]
        if (len(user_lst) >= 2
                and user_lst[0] in usernames_str
                and user_lst[1] in usernames_str):
            break
        print("Error in user names.  Please try again")
    user1, user2 = user_lst[0], user_lst[1]

    # The result must not be bound to the name `similarity`: assigning to
    # that name anywhere in this function would make it a local variable
    # and the call below would raise UnboundLocalError.
    monthly_common = similarity(data, user1, user2)

    print()
    print("Similarities for "+user1+" and "+user2)
    print("{:12s}{:6s}".format("Month","Count"))
    for month, common_tags in monthly_common:
        print("{:12s}{:<6d}".format(MONTH_NAMES[month-1], len(common_tags)))
    print()

    choice = input("Do you want to plot (yes/no)?: ")
    if choice.lower() == 'yes':
        # x is the month number, y is the number of shared hashtags.
        x_list = [month for month, _ in monthly_common]
        y_list = [len(common_tags) for _, common_tags in monthly_common]
        plot_similarity(x_list, y_list, user1, user2)

# Run the interactive report only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()