Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import math
import os.path
import re
def clean_up(s):
    """Return a version of string s in which all letters have been
    converted to lowercase and punctuation characters have been stripped
    from both ends. Inner punctuation is left untouched.

    (The parameter was renamed from `str`, which shadowed the builtin.)
    """
    # Note: \n, \t, \r are escape sequences here, so actual newline/tab/CR
    # characters are stripped as well.
    punctuation = '''!"',;:.-?)([]<>*#\n\t\r'''
    return s.lower().strip(punctuation)
def average_word_length(text):
    """Return the average length of all words in text. Do not
    include surrounding punctuation in words.
    text is a non-empty list of strings each ending in \\n.
    At least one line in text contains a word.

    Fix: the original removed at most ONE empty string from the word list
    (`list.remove` deletes a single occurrence), so several all-punctuation
    tokens would be counted as zero-length words and skew the average.
    """
    words = []
    for line in text:
        # Clean the whole line first, then clean each token so that
        # punctuation surrounding individual words is stripped too.
        for token in clean_up(line).split():
            word = clean_up(token)
            if word:  # drop ALL tokens that were pure punctuation
                words.append(word)
    total_chars = sum(len(word) for word in words)
    return float(total_chars) / len(words)
def type_token_ratio(text):
    """Return the type token ratio (TTR) for this text.
    TTR is the number of different words divided by the total number of words.
    text is a non-empty list of strings each ending in \\n.
    At least one line in text contains a word.

    Fixes: the original removed at most one empty-string token, and used an
    O(n^2) list-membership scan to find distinct words; a set is O(n).
    """
    words = []
    for line in text:
        for token in clean_up(line).split():
            word = clean_up(token)
            if word:  # drop ALL all-punctuation tokens, not just one
                words.append(word)
    # set(words) gives the distinct word types in a single pass.
    return float(len(set(words))) / len(words)
def hapax_legomana_ratio(text):
    """Return the hapax legomana ratio for this text.
    This ratio is the number of words that occur exactly once divided
    by the total number of words.
    text is a list of strings each ending in \\n.
    At least one line in text contains a word.

    Fixes: the original removed at most one empty-string token, and used
    two O(n^2) list scans plus list.remove to find once-only words; a
    frequency dictionary does it in linear time.
    """
    words = []
    for line in text:
        for token in clean_up(line).split():
            word = clean_up(token)
            if word:  # drop ALL all-punctuation tokens, not just one
                words.append(word)
    # Count occurrences of each word in one pass.
    frequency = {}
    for word in words:
        frequency[word] = frequency.get(word, 0) + 1
    hapaxes = sum(1 for count in frequency.values() if count == 1)
    return float(hapaxes) / len(words)
def split_on_separators(original, separators):
    """Return a list of non-empty, non-blank strings from the original string
    determined by splitting the string on any of the separators.
    separators is a string of single-character separators.

    Fixes: the original dropped the final segment (text after the last
    separator was accumulated in `temp` but never appended), and kept
    whitespace-only segments despite promising "non-blank" results.
    """
    results = []
    current = ''
    for ch in original:
        if ch in separators:
            if current.strip():  # keep only segments containing non-whitespace
                results.append(current)
            current = ''
        else:
            current += ch
    # Flush the trailing segment (the original lost this one).
    if current.strip():
        results.append(current)
    return results
def average_sentence_length(text):
    """Return the average number of words per sentence in text.
    text is a list of strings each ending in \\n (as produced by readlines),
    guaranteed to contain at least one sentence.
    Terminating punctuation defined as !?.
    A sentence is defined as a non-empty string of non-terminating
    punctuation surrounded by terminating punctuation
    or beginning or end of file.

    Fixes: the original treated `text` as a FILENAME and re-opened it,
    although every caller (see __main__) passes a list of lines; it also
    left in a debug print, and its blank-sentence cleanup mutated the list
    while iterating and only matched the exact string ' '.
    """
    full_text = ''.join(text)
    # Split on terminating punctuation; keep only fragments that contain
    # at least one word (split() is truthy only for non-blank strings).
    sentences = [s for s in re.split(r'[!?.]', full_text) if s.split()]
    total_words = sum(len(s.split()) for s in sentences)
    return float(total_words) / len(sentences)
def avg_sentence_complexity(text):
    """Return the average number of phrases per sentence.
    text is a list of strings each ending in \\n.
    Terminating punctuation defined as !?.
    A sentence is defined as a non-empty string of non-terminating
    punctuation surrounded by terminating punctuation
    or beginning or end of file.
    Phrases are substrings of a sentence separated by
    one or more of the following delimiters ,;:

    Implemented to replace the original placeholder that always
    returned 1.0.
    """
    full_text = ''.join(text)
    # Sentences: fragments between terminators that contain at least a word.
    sentences = [s for s in re.split(r'[!?.]', full_text) if s.split()]
    total_phrases = 0
    for sentence in sentences:
        # Phrases: non-blank fragments between phrase delimiters.
        total_phrases += sum(1 for p in re.split(r'[,;:]', sentence) if p.split())
    return float(total_phrases) / len(sentences)
def get_valid_filename(prompt):
    """Use prompt (a string) to ask the user to type the name of a file. If
    the file does not exist, keep asking until they give a valid filename.
    Return the name of that file.

    Updated from Python 2 (`raw_input`, print statement) to Python 3.
    """
    filename = input(prompt)
    while not os.path.isfile(filename):
        print("That file does not exist.")
        filename = input(prompt)
    return filename
def read_directory_name(prompt):
    """Use prompt (a string) to ask the user to type the name of a directory.
    If the directory does not exist, keep asking until they give a valid
    directory. Return the name of that directory.

    Updated from Python 2 (`raw_input`, print statement) to Python 3.
    """
    directory = input(prompt)
    while not os.path.isdir(directory):
        print("That directory does not exist.")
        directory = input(prompt)
    return directory
def compare_signatures(sig1, sig2, weight):
    """Return a non-negative real number indicating the similarity of two
    linguistic signatures. The smaller the number the more similar the
    signatures. Zero indicates identical signatures.
    sig1 and sig2 are 6 element lists with the following elements
    0 : author name (a string)
    1 : average word length (float)
    2 : TTR (float)
    3 : Hapax Legomana Ratio (float)
    4 : average sentence length (float)
    5 : average sentence complexity (float)
    weight is a list of multiplicative weights to apply to each
    linguistic feature. weight[0] is ignored.

    Fix: the original did `result += ...` without ever initializing
    `result`, raising NameError on the first iteration.
    """
    # Element 0 is the author name, so compare features 1..5 only.
    return sum(abs(sig1[i] - sig2[i]) * weight[i] for i in range(1, 6))
def read_signature(filename):
    """Read a linguistic signature from filename and return it as
    a list of features: [author name (str), then five floats].

    Fixes: the original shadowed the builtin `file` and never closed
    the file handle; `with` guarantees closure.
    """
    with open(filename, 'r') as f:
        # The first feature is a string so it doesn't need casting to float.
        # NOTE(review): the trailing newline is kept on the author name, as
        # in the original — callers compare/print it unchanged.
        result = [f.readline()]
        # All remaining features are real numbers, one per line.
        for line in f:
            result.append(float(line.strip()))
    return result
if __name__ == '__main__':
    # Build the linguistic signature of the mystery text, then compare it
    # against every stored signature and report the closest author.
    prompt = 'enter the name of the file with unknown author:'
    mystery_filename = get_valid_filename(prompt)

    # readlines gives us a list of strings, one for each line of the file.
    with open(mystery_filename, 'r') as mystery_file:
        text = mystery_file.readlines()

    # Calculate the signature for the mystery file.
    mystery_signature = [mystery_filename]
    mystery_signature.append(average_word_length(text))
    mystery_signature.append(type_token_ratio(text))
    mystery_signature.append(hapax_legomana_ratio(text))
    mystery_signature.append(average_sentence_length(text))
    mystery_signature.append(avg_sentence_complexity(text))

    # weights[0] is ignored (it lines up with the author-name slot).
    weights = [0, 11, 33, 50, 0.4, 4]

    prompt = 'enter the path to the directory of signature files: '
    signature_dir = read_directory_name(prompt)  # renamed: `dir` shadows a builtin

    # Every file in this directory must be a linguistic signature;
    # we assume there is at least one.
    files = os.listdir(signature_dir)
    this_file = files[0]
    signature = read_signature(os.path.join(signature_dir, this_file))
    best_score = compare_signatures(mystery_signature, signature, weights)
    best_author = signature[0]
    for this_file in files[1:]:
        signature = read_signature(os.path.join(signature_dir, this_file))
        score = compare_signatures(mystery_signature, signature, weights)
        if score < best_score:
            best_score = score
            best_author = signature[0]
    print("best author match: %s with score %s" % (best_author, best_score))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement