Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import math
import os.path
import re
def clean_up(s):
    """Return a version of string s in which all letters have been
    converted to lowercase and punctuation characters have been stripped
    from both ends. Inner punctuation is left untouched.

    (The parameter was renamed from `str`, which shadowed the builtin.)
    """
    # Note: \n, \t, \r are escape sequences here, so actual newline/tab/CR
    # characters are stripped as well.
    punctuation = '''!"',;:.-?)([]<>*#\n\t\r'''
    return s.lower().strip(punctuation)
def average_word_length(text):
    """Return the average length of all words in text. Do not
    include surrounding punctuation in words.
    text is a non-empty list of strings each ending in \\n.
    At least one line in text contains a word.

    Fix: the original removed at most ONE empty string from the word list
    (`list.remove` deletes a single occurrence), so several all-punctuation
    tokens would be counted as zero-length words and skew the average.
    """
    words = []
    for line in text:
        # Clean the whole line first, then clean each token so that
        # punctuation surrounding individual words is stripped too.
        for token in clean_up(line).split():
            word = clean_up(token)
            if word:  # drop ALL tokens that were pure punctuation
                words.append(word)
    total_chars = sum(len(word) for word in words)
    return float(total_chars) / len(words)
def type_token_ratio(text):
    """Return the type token ratio (TTR) for this text.
    TTR is the number of different words divided by the total number of words.
    text is a non-empty list of strings each ending in \\n.
    At least one line in text contains a word.

    Fixes: the original removed at most one empty-string token, and used an
    O(n^2) list-membership scan to find distinct words; a set is O(n).
    """
    words = []
    for line in text:
        for token in clean_up(line).split():
            word = clean_up(token)
            if word:  # drop ALL all-punctuation tokens, not just one
                words.append(word)
    # set(words) gives the distinct word types in a single pass.
    return float(len(set(words))) / len(words)
def hapax_legomana_ratio(text):
    """Return the hapax legomana ratio for this text.
    This ratio is the number of words that occur exactly once divided
    by the total number of words.
    text is a list of strings each ending in \\n.
    At least one line in text contains a word.

    Fixes: the original removed at most one empty-string token, and used
    two O(n^2) list scans plus list.remove to find once-only words; a
    frequency dictionary does it in linear time.
    """
    words = []
    for line in text:
        for token in clean_up(line).split():
            word = clean_up(token)
            if word:  # drop ALL all-punctuation tokens, not just one
                words.append(word)
    # Count occurrences of each word in one pass.
    frequency = {}
    for word in words:
        frequency[word] = frequency.get(word, 0) + 1
    hapaxes = sum(1 for count in frequency.values() if count == 1)
    return float(hapaxes) / len(words)
def split_on_separators(original, separators):
    """Return a list of non-empty, non-blank strings from the original string
    determined by splitting the string on any of the separators.
    separators is a string of single-character separators.

    Fixes: the original dropped the final segment (text after the last
    separator was accumulated in `temp` but never appended), and kept
    whitespace-only segments despite promising "non-blank" results.
    """
    results = []
    current = ''
    for ch in original:
        if ch in separators:
            if current.strip():  # keep only segments containing non-whitespace
                results.append(current)
            current = ''
        else:
            current += ch
    # Flush the trailing segment (the original lost this one).
    if current.strip():
        results.append(current)
    return results
def average_sentence_length(text):
    """Return the average number of words per sentence in text.
    text is a list of strings each ending in \\n (as produced by readlines),
    guaranteed to contain at least one sentence.
    Terminating punctuation defined as !?.
    A sentence is defined as a non-empty string of non-terminating
    punctuation surrounded by terminating punctuation
    or beginning or end of file.

    Fixes: the original treated `text` as a FILENAME and re-opened it,
    although every caller (see __main__) passes a list of lines; it also
    left in a debug print, and its blank-sentence cleanup mutated the list
    while iterating and only matched the exact string ' '.
    """
    full_text = ''.join(text)
    # Split on terminating punctuation; keep only fragments that contain
    # at least one word (split() is truthy only for non-blank strings).
    sentences = [s for s in re.split(r'[!?.]', full_text) if s.split()]
    total_words = sum(len(s.split()) for s in sentences)
    return float(total_words) / len(sentences)
def avg_sentence_complexity(text):
    """Return the average number of phrases per sentence.
    text is a list of strings each ending in \\n.
    Terminating punctuation defined as !?.
    A sentence is defined as a non-empty string of non-terminating
    punctuation surrounded by terminating punctuation
    or beginning or end of file.
    Phrases are substrings of a sentence separated by
    one or more of the following delimiters ,;:

    Implemented to replace the original placeholder that always
    returned 1.0.
    """
    full_text = ''.join(text)
    # Sentences: fragments between terminators that contain at least a word.
    sentences = [s for s in re.split(r'[!?.]', full_text) if s.split()]
    total_phrases = 0
    for sentence in sentences:
        # Phrases: non-blank fragments between phrase delimiters.
        total_phrases += sum(1 for p in re.split(r'[,;:]', sentence) if p.split())
    return float(total_phrases) / len(sentences)
def get_valid_filename(prompt):
    """Use prompt (a string) to ask the user to type the name of a file. If
    the file does not exist, keep asking until they give a valid filename.
    Return the name of that file.

    Updated from Python 2 (`raw_input`, print statement) to Python 3.
    """
    filename = input(prompt)
    while not os.path.isfile(filename):
        print("That file does not exist.")
        filename = input(prompt)
    return filename
def read_directory_name(prompt):
    """Use prompt (a string) to ask the user to type the name of a directory.
    If the directory does not exist, keep asking until they give a valid
    directory. Return the name of that directory.

    Updated from Python 2 (`raw_input`, print statement) to Python 3.
    """
    directory = input(prompt)
    while not os.path.isdir(directory):
        print("That directory does not exist.")
        directory = input(prompt)
    return directory
def compare_signatures(sig1, sig2, weight):
    """Return a non-negative real number indicating the similarity of two
    linguistic signatures. The smaller the number the more similar the
    signatures. Zero indicates identical signatures.
    sig1 and sig2 are 6 element lists with the following elements
    0 : author name (a string)
    1 : average word length (float)
    2 : TTR (float)
    3 : Hapax Legomana Ratio (float)
    4 : average sentence length (float)
    5 : average sentence complexity (float)
    weight is a list of multiplicative weights to apply to each
    linguistic feature. weight[0] is ignored.

    Fix: the original did `result += ...` without ever initializing
    `result`, raising NameError on the first iteration.
    """
    # Element 0 is the author name, so compare features 1..5 only.
    return sum(abs(sig1[i] - sig2[i]) * weight[i] for i in range(1, 6))
def read_signature(filename):
    """Read a linguistic signature from filename and return it as
    a list of features: [author name (str), then five floats].

    Fixes: the original shadowed the builtin `file` and never closed
    the file handle; `with` guarantees closure.
    """
    with open(filename, 'r') as f:
        # The first feature is a string so it doesn't need casting to float.
        # NOTE(review): the trailing newline is kept on the author name, as
        # in the original — callers compare/print it unchanged.
        result = [f.readline()]
        # All remaining features are real numbers, one per line.
        for line in f:
            result.append(float(line.strip()))
    return result
if __name__ == '__main__':
    # Build the linguistic signature of the mystery text, then compare it
    # against every stored signature and report the closest author.
    prompt = 'enter the name of the file with unknown author:'
    mystery_filename = get_valid_filename(prompt)

    # readlines gives us a list of strings, one for each line of the file.
    with open(mystery_filename, 'r') as mystery_file:
        text = mystery_file.readlines()

    # Calculate the signature for the mystery file.
    mystery_signature = [mystery_filename]
    mystery_signature.append(average_word_length(text))
    mystery_signature.append(type_token_ratio(text))
    mystery_signature.append(hapax_legomana_ratio(text))
    mystery_signature.append(average_sentence_length(text))
    mystery_signature.append(avg_sentence_complexity(text))

    # weights[0] is ignored (it lines up with the author-name slot).
    weights = [0, 11, 33, 50, 0.4, 4]

    prompt = 'enter the path to the directory of signature files: '
    signature_dir = read_directory_name(prompt)  # renamed: `dir` shadows a builtin

    # Every file in this directory must be a linguistic signature;
    # we assume there is at least one.
    files = os.listdir(signature_dir)
    this_file = files[0]
    signature = read_signature(os.path.join(signature_dir, this_file))
    best_score = compare_signatures(mystery_signature, signature, weights)
    best_author = signature[0]
    for this_file in files[1:]:
        signature = read_signature(os.path.join(signature_dir, this_file))
        score = compare_signatures(mystery_signature, signature, weights)
        if score < best_score:
            best_score = score
            best_author = signature[0]
    print("best author match: %s with score %s" % (best_author, best_score))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement