This week only. Pastebin PRO Accounts Christmas Special! Don't miss out! Want more features on Pastebin? Sign Up, it's FREE!
Guest

Untitled

By: a guest on May 13th, 2013  |  syntax: Python  |  size: 5.29 KB  |  views: 61  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. punctuation = '!.,?";:'
  2. letters = 'etaoinshrdlcumwfgypbvkjxqz'
  3. numbers = '0123456789'
  4.  
  5. def number_of_words(text):
  6.     # returns number of words in a string
  7.     text = ' '.join(text.split())
  8.     text = text.split(' ')
  9.     count = 0
  10.     for word in text:
  11.         word = remove_punctuation(word)
  12.         if is_word(word):
  13.             count += 1
  14.     return count
  15.  
  16. def remove_punctuation(word):
  17.     #returns a string without punctation at the end as well as 's
  18.     if word[-2:] == "'s":
  19.             word = word[:-2]
  20.     else:
  21.         #TODO stuff like cat?.;! still get counted even if not a word per se
  22.         while word[-1] in punctuation:
  23.             word = word[:-1]
  24.     return word
  25.  
  26. def is_word(word):
  27.     #checks to see if string is only letters
  28.     if word == '':
  29.         return False
  30.     for letter in word:
  31.         if letter not in letters:
  32.             return False
  33.     return True
  34.  
  35. def number_of_letters(text):
  36.     #counts number of letters in a string
  37.     count = 0
  38.     for letter in text:
  39.         if letter in letters:
  40.             count +=1
  41.     return count
  42.  
  43. def number_of_symbols(text):
  44.     #number of symbols in text excluding whitespace
  45.     text = ' '.join(text.split())
  46.     text = ' '.join(text.split(' '))
  47.     count = 0
  48.     for letter in text:
  49.         if (letter in letters) or (letter in numbers):
  50.             continue
  51.         else:
  52.             count += 1
  53.     return count
  54.  
  55. def most_common_words(text):
  56.     #returns list of 3 most common words
  57.     word_list = {}
  58.     text = ' '.join(text.split())
  59.     text = text.split(' ')
  60.     for word in text:
  61.         word = remove_punctuation(word)
  62.         if word == '':
  63.             continue
  64.         if word in word_list:
  65.             word_list[word] += 1
  66.         else:
  67.             word_list[word] = 1
  68.     word_list2 = []
  69.     #flips keys/values into wordlist2
  70.     for k, v in word_list.items():
  71.         word_list2.append((v,k))
  72.     #adds the top 3 words into answer
  73.     answer = []
  74.     for unused in range(3):
  75.         current_max = max(word_list2)
  76.         word_list2.remove(current_max)
  77.         answer.append(current_max[1])
  78.  
  79.     return answer
  80.  
  81. def most_common_letters(text):
  82.     #returns list of 3 most common letters
  83.     letter_list = {}
  84.     for letter in letters:
  85.         letter_list[letter] = 0
  86.        
  87.     text = ' '.join(text.split())
  88.     text = ' '.join(text.split(' '))
  89.    
  90.     for letter in text:
  91.         if letter not in letters:
  92.             continue
  93.         letter_list[letter] += 1
  94.        
  95.     letter_list2 = []
  96.     #flips keys/values into wordlist2
  97.     for k, v in letter_list.items():
  98.         letter_list2.append((v,k))
  99.     #adds the top 3 words into answer
  100.     answer = []
  101.     for unused in range(3):
  102.         current_max = max(letter_list2)
  103.         letter_list2.remove(current_max)
  104.         answer.append(current_max[1])
  105.  
  106.     return answer
  107.  
  108. def common_first_word(text):
  109.     # returns most common first word in a paragraph
  110.     text = text.split('\n')
  111.     while ' ' in text:
  112.         text.remove(' ')
  113.        
  114.     for paragraph_number in range(len(text)):    
  115.         text[paragraph_number] = text[paragraph_number].split(' ')
  116.    
  117.     word_list = {}
  118.     for line in text:
  119.         for word in line:
  120.             if word == '':
  121.                 continue
  122.             word = remove_punctuation(word)
  123.             if is_word(word):
  124.                 if word in word_list:
  125.                     word_list[word] += 1
  126.                 else:
  127.                     word_list[word] = 1
  128.                 break
  129.            
  130.     word_list2 = []
  131.     #flips keys/values into wordlist2
  132.     for k, v in word_list.items():
  133.         word_list2.append((v,k))
  134.  
  135.     return max(word_list2)[1]
  136.  
  137. def words_used_once(text):
  138.     # returns a list of all words used once
  139.     word_list = {}
  140.     text = ' '.join(text.split())
  141.     text = text.split(' ')
  142.     for word in text:
  143.         word = remove_punctuation(word)
  144.         if word == '':
  145.             continue
  146.         if word in word_list:
  147.             word_list[word] += 1
  148.         else:
  149.             word_list[word] = 1
  150.            
  151.     answer = []
  152.     for word in word_list:
  153.         if word_list[word] == 1:
  154.             answer.append(word)
  155.            
  156.     return answer
  157.  
  158. def letters_used_once(text):
  159.     # returns letters not used
  160.     answer = []
  161.     for letter in letters:
  162.         if letter not in text:
  163.             answer.append(letter)
  164.            
  165.     return answer
  166.            
  167. #TODO check to see if input is valid
  168. text_location = input('Please type your text file location')
  169. file = open(text_location,'r')
  170. text_file = ' '.join(file.readlines()[:])
  171. text_file = text_file.lower()
  172. file.close()
  173. print('{0} words'.format(number_of_words(text_file)))
  174. print('{0} letters'.format(number_of_letters(text_file)))
  175. print('{0} symbols'.format(number_of_symbols(text_file)))
  176. top_3 = most_common_words(text_file)
  177. print('Top three most common words: {0},{1},{2}'.format(top_3[0],top_3[1],top_3[2]))
  178. top_3 = most_common_letters(text_file)
  179. print('Top three most common letters: {0},{1},{2}'.format(top_3[0],top_3[1],top_3[2]))
  180. print('{0} is the most common first word of all paragraphs'.format(common_first_word(text_file)))
  181. print('Words only used once: {0}'.format(words_used_once(text_file)))
  182. print('Letters not used in the document: {0}'.format(letters_used_once(text_file)))
clone this paste RAW Paste Data