Advertisement
Guest User

Untitled

a guest
Jun 24th, 2017
44
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 9.24 KB | None | 0 0
  1. import os.path, math
  2.  
  3. def clean_up(str):
  4.     ''' Return a version of string str in which all letters have been
  5.    converted to lowercase and punctuation characters have been stripped
  6.    from both ends. Inner punctuation is left untouched. '''
  7.    
  8.     punctuation = '''!"',;:.-?)([]<>*#\n\t\r'''
  9.     result = str.lower().strip(punctuation)
  10.     return result
  11.  
  12. def average_word_length(text):
  13.     ''' Return the average length of all words in text. Do not
  14.    include surrounding punctuation in words.
  15.    text is a non-empty list of strings each ending in \n.
  16.    At least one line in text contains a word.'''
  17.    
  18.     lst_of_words = []
  19.     clean_words = []
  20.     total_words = 0
  21.     total_chars = 0
  22.     for lines in text:
  23.        
  24.         line = clean_up(lines)
  25.        
  26.         lst_of_words += line.split()
  27.        
  28.    
  29.     for everything in lst_of_words:
  30.         word = clean_up(everything)
  31.         clean_words.append(word)
  32.        
  33.         if '' in clean_words:
  34.             clean_words.remove('')
  35.     for words in clean_words:
  36.            
  37.         total_chars += len(words)
  38.    
  39.     total_words = len(clean_words)
  40.    
  41.     return float(total_chars)/total_words
  42.        
  43.  
  44. def type_token_ratio(text):
  45.     ''' Return the type token ratio (TTR) for this text.
  46.    TTR is the number of different words divided by the total number of words.
  47.    text is a non-empty list of strings each ending in \n.
  48.    At least one line in text contains a word. '''
  49.  
  50.    
  51.     lst_of_duplicates = []
  52.     lst_of_unique = []
  53.     lst_of_words = []
  54.     clean_words = []
  55.    
  56.     for lines in text:
  57.        
  58.         line = clean_up(lines)
  59.        
  60.         lst_of_words += line.split()
  61.        
  62.    
  63.     for everything in lst_of_words:
  64.         word = clean_up(everything)
  65.         clean_words.append(word)
  66.        
  67.         if '' in clean_words:
  68.             clean_words.remove('')
  69.    
  70.            
  71.     for words in clean_words:
  72.         if words not in lst_of_unique:
  73.             lst_of_unique.append(words)
  74.         else:
  75.             lst_of_duplicates.append(words)
  76.  
  77.     number_of_duplicates = len(lst_of_duplicates)
  78.     number_of_uniques = len(lst_of_unique)
  79.    
  80.     total_words = number_of_uniques + number_of_duplicates
  81.    
  82.    
  83.     ratio = float(number_of_uniques) / total_words
  84.    
  85.     return ratio
  86.    
  87.  
  88.                    
  89. def hapax_legomana_ratio(text):
  90.     ''' Return the hapax_legomana ratio for this text.
  91.    This ratio is the number of words that occur exactly once divided
  92.    by the total number of words.
  93.    text is a list of strings each ending in \n.
  94.    At least one line in text contains a word.'''
  95.  
  96.    
  97.     lst_of_duplicates = []
  98.     lst_of_unique = []
  99.     lst_of_words = []
  100.     clean_words = []
  101.     only_words = []
  102.    
  103.     for lines in text:
  104.        
  105.         line = clean_up(lines)
  106.        
  107.         lst_of_words += line.split()
  108.        
  109.    
  110.     for everything in lst_of_words:
  111.         word = clean_up(everything)
  112.         clean_words.append(word)
  113.        
  114.         if '' in clean_words:
  115.             clean_words.remove('')
  116.            
  117.     total_words = len(clean_words)
  118.            
  119.     for words in clean_words:
  120.         if words  not in lst_of_unique:
  121.             lst_of_unique.append(words)
  122.         else:
  123.             lst_of_duplicates.append(words)
  124.     for commons in lst_of_duplicates:
  125.         if commons in lst_of_unique:
  126.             lst_of_unique.remove(commons)
  127.    
  128.        
  129.     number_of_uniques = len(lst_of_unique)
  130.    
  131.     ratio = float(number_of_uniques) / total_words
  132.    
  133.     return ratio
  134.    
  135.  
  136. def split_on_separators(original, separators):
  137.     ''' Return a list of non-empty, non-blank strings from the original string
  138.    determined by splitting the string on any of the separators.
  139.    separators is a string of single-character separators.'''
  140.    
  141.     results = []
  142.     temp = ''
  143.    
  144.    
  145.     for chars in original:
  146.         if chars in separators:
  147.             results.append(temp)
  148.             temp = ''
  149.             if '' in results:
  150.                 results.remove('')
  151.            
  152.         else:
  153.             temp += chars
  154.    
  155.     return results
  156.                
  157.    
  158. def average_sentence_length(text):
  159.     ''' Return the average number of words per sentence in text.
  160.    text is guaranteed to have at least one sentence.
  161.    Terminating punctuation defined as !?.
  162.    A sentence is defined as a non-empty string of non-terminating
  163.    punctuation surrounded by terminating punctuation
  164.    or beginning or end of file. '''
  165.    
  166.     txt = open(text,'r').readlines()
  167.    
  168.    
  169.     number_of_words = 0
  170.     big_list = []
  171.     big_str = ''
  172.     list_of_sentances = []
  173.     temp = []
  174.    
  175.    
  176.     for lines in txt:
  177.         big_str += lines
  178.    
  179.    
  180.    
  181.     list_of_sentances = split_on_separators(big_str, '?!.')
  182.     print list_of_sentances
  183.    
  184.    
  185.     for sentance in list_of_sentances:
  186.         temp = sentance.split()
  187.        
  188.         number_of_words += len(temp)
  189.    
  190.    
  191.     for strings in list_of_sentances:
  192.         if ' ' in list_of_sentances:
  193.             list_of_sentances.remove(' ')
  194.            
  195.     number_of_sentances = len(list_of_sentances)
  196.    
  197.    
  198.     result = float(number_of_words) / number_of_sentances
  199.    
  200.     return result
  201.    
  202.  
  203. def avg_sentence_complexity(text):
  204.     '''Return the average number of phrases per sentence.
  205.    Terminating punctuation defined as !?.
  206.    A sentence is defined as a non-empty string of non-terminating
  207.    punctuation surrounded by terminating punctuation
  208.    or beginning or end of file.
  209.    Phrases are substrings of a sentences separated by
  210.    one or more of the following delimiters ,;: '''
  211.    
  212.     #To do: replace the body of this function with something meaningful
  213.     return 1.0
  214.    
  215.    
  216. def get_valid_filename(prompt):
  217.     '''Use prompt (a string) to ask the user to type the name of a file. If
  218.    the file does not exist, keep asking until they give a valid filename.
  219.    Return the name of that file.'''
  220.    
  221.     filename = raw_input(prompt)
  222.     while not os.path.isfile(filename):
  223.         print "That file does not exist."
  224.         filename = raw_input(prompt)
  225.     return filename
  226.  
  227. def read_directory_name(prompt):
  228.     '''Use prompt (a string) to ask the user to type the name of a directory. If
  229.    the directory does not exist, keep asking until they give a valid directory.
  230.    '''
  231.    
  232.     directory = raw_input(prompt)    
  233.     while not os.path.isdir(directory):
  234.         print "That directory does not exist."
  235.         directory = raw_input(prompt)
  236.     return directory
  237.    
  238.    
  239. def compare_signatures(sig1, sig2, weight):
  240.     '''Return a non-negative real number indicating the similarity of two
  241.    linguistic signatures. The smaller the number the more similar the
  242.    signatures. Zero indicates identical signatures.
  243.    sig1 and sig2 are 6 element lists with the following elements
  244.    0  : author name (a string)
  245.    1  : average word length (float)
  246.    2  : TTR (float)
  247.    3  : Hapax Legomana Ratio (float)
  248.    4  : average sentence length (float)
  249.    5  : average sentence complexity (float)
  250.    weight is a list of multiplicative weights to apply to each
  251.    linguistic feature. weight[0] is ignored.
  252.    '''
  253.    
  254.     i = 1
  255.     while i <= 5:
  256.         result += (abs(sig1[i]-sig2[i]))*weight[i]
  257.         i += 1
  258.         return result
  259.    
  260. def read_signature(filename):
  261.     '''Read a linguistic signature from filename and return it as
  262.    list of features. '''
  263.    
  264.     file = open(filename,'r')
  265.     # the first feature is a string so it doesn't need casting to float
  266.     result = [file.readline()]
  267.     # all remaining features are real numbers
  268.     for line in file:
  269.         result.append(float(line.strip()))
  270.     return result
  271.        
  272. if __name__ == '__main__':
  273.    
  274.     prompt = 'enter the name of the file with unknown author:'
  275.     mystery_filename = get_valid_filename(prompt)
  276.  
  277.     # readlines gives us a list of strings one for each line of the file
  278.     text = open(mystery_filename,'r').readlines()
  279.    
  280.     # calculate the signature for the mystery file
  281.     mystery_signature = [mystery_filename]
  282.     mystery_signature.append(average_word_length(text))
  283.     mystery_signature.append(type_token_ratio(text))
  284.     mystery_signature.append(hapax_legomana_ratio(text))
  285.     mystery_signature.append(average_sentence_length(text))
  286.     mystery_signature.append(avg_sentence_complexity(text))
  287.    
  288.     weights = [0, 11, 33, 50, 0.4, 4]
  289.    
  290.     prompt = 'enter the path to the directory of signature files: '
  291.     dir = read_directory_name(prompt)
  292.     # every file in this directory must be a linguistic signature
  293.     files = os.listdir(dir)
  294.  
  295.     # we will assume that there is at least one signature in that directory
  296.     this_file = files[0]
  297.     signature = read_signature('%s/%s'%(dir,this_file))
  298.     best_score = compare_signatures(mystery_signature,signature,weights)
  299.     best_author = signature[0]
  300.     for this_file in files[1:]:
  301.         signature = read_signature('%s/%s'%(dir,this_file))
  302.         score = compare_signatures(mystery_signature,signature,weights)
  303.         if score < best_score:
  304.             best_score = score
  305.             best_author = signature[0]
  306.     print "best author match: %s with score %s"%(best_author,best_score)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement