"""Print simple statistics (word, letter and symbol counts) for a text file.

All helpers expect text that has already been lower-cased: only the
characters listed in ``letters`` are treated as letters.
"""

from collections import Counter

punctuation = '!.,?";:'
# Lower-case ASCII letters, ordered roughly by English frequency.  This order
# is also the order in which letters_used_once() reports unused letters.
letters = 'etaoinshrdlcumwfgypbvkjxqz'
numbers = '0123456789'


def remove_punctuation(word: str) -> str:
    """Return *word* without trailing punctuation and without a final ``'s``.

    Trailing punctuation is stripped first so that ``"cat's."`` becomes
    ``"cat"``.  Empty and all-punctuation tokens yield ``""`` instead of
    raising ``IndexError``.
    """
    # TODO stuff like cat?.;! still gets counted even if not a word per se
    word = word.rstrip(punctuation)
    if word.endswith("'s"):
        word = word[:-2]
    return word


def is_word(word: str) -> bool:
    """Return True if *word* is non-empty and made only of ``letters``."""
    return bool(word) and all(char in letters for char in word)


def _count_words(text: str) -> Counter:
    """Count every non-empty whitespace-separated token after cleaning it."""
    cleaned = (remove_punctuation(token) for token in text.split())
    return Counter(word for word in cleaned if word)


def _top(counts, n: int) -> list:
    """Return up to *n* keys of *counts*, highest count first.

    Ties are broken in favour of the lexicographically larger key, which is
    the order repeated ``max()`` over ``(count, key)`` tuples produces.
    """
    ranked = sorted(counts.items(), key=lambda item: (item[1], item[0]),
                    reverse=True)
    return [key for key, _ in ranked[:max(n, 0)]]


def number_of_words(text: str) -> int:
    """Return how many tokens of *text* are words once cleaned."""
    return sum(1 for token in text.split()
               if is_word(remove_punctuation(token)))


def number_of_letters(text: str) -> int:
    """Return how many characters of *text* are in ``letters``."""
    return sum(1 for char in text if char in letters)


def number_of_symbols(text: str) -> int:
    """Return how many characters are neither letters, digits nor whitespace."""
    # Drop all whitespace first; otherwise the spaces between words would be
    # counted as symbols.
    visible = ''.join(text.split())
    return sum(1 for char in visible
               if char not in letters and char not in numbers)


def most_common_words(text: str, n: int = 3) -> list:
    """Return up to *n* most frequent tokens in *text*, most frequent first.

    Tokens are cleaned with remove_punctuation() but are not required to pass
    is_word().  Fewer than *n* items are returned when the text has fewer
    distinct tokens.
    """
    return _top(_count_words(text), n)


def most_common_letters(text: str, n: int = 3) -> list:
    """Return the *n* most frequent letters in *text*, most frequent first.

    Every letter in ``letters`` takes part with a count of at least zero, so
    up to 26 results are always available, even for empty text.
    """
    counts = Counter(char for char in text if char in letters)
    return _top({letter: counts[letter] for letter in letters}, n)


def common_first_word(text: str):
    """Return the most common first word over the lines of *text*.

    Each newline-separated line is treated as a paragraph.  Its first token
    that qualifies as a word (after punctuation removal) is counted.  Returns
    None when no line contains a word.
    """
    counts = Counter()
    for paragraph in text.split('\n'):
        # split() without arguments also copes with tab-indented paragraphs.
        for token in paragraph.split():
            word = remove_punctuation(token)
            if is_word(word):
                counts[word] += 1
                break
    if not counts:
        return None
    return _top(counts, 1)[0]


def words_used_once(text: str) -> list:
    """Return the tokens that occur exactly once, in order of appearance."""
    return [word for word, count in _count_words(text).items() if count == 1]


def letters_used_once(text: str) -> list:
    """Return the letters that do not appear in *text* at all.

    Despite its name, this reports unused letters rather than letters used
    exactly once.  The name is kept for existing callers.
    """
    return [letter for letter in letters if letter not in text]


def main():
    """Prompt for a text file path and print statistics about its contents."""
    text_location = input('Please type your text file location')
    try:
        with open(text_location, 'r') as file:
            text = file.read().lower()
    except OSError as error:
        raise SystemExit(
            'Could not read {0}: {1}'.format(text_location, error))

    print('{0} words'.format(number_of_words(text)))
    print('{0} letters'.format(number_of_letters(text)))
    print('{0} symbols'.format(number_of_symbols(text)))
    # join() instead of fixed indices: short texts may give fewer than three.
    print('Top three most common words: {0}'.format(
        ','.join(most_common_words(text))))
    print('Top three most common letters: {0}'.format(
        ','.join(most_common_letters(text))))
    first_word = common_first_word(text)
    if first_word is None:
        print('No paragraph starts with a word')
    else:
        print('{0} is the most common first word of all paragraphs'.format(
            first_word))
    print('Words only used once: {0}'.format(words_used_once(text)))
    print('Letters not used in the document: {0}'.format(
        letters_used_once(text)))


if __name__ == '__main__':
    main()