# Characters that remove_punctuation strips from the end of a word.
punctuation = '!.,?";:'
# The 26 lowercase ASCII letters; only these characters are treated as letters.
letters = 'etaoinshrdlcumwfgypbvkjxqz'
# Decimal digits; number_of_symbols does not count these as symbols.
numbers = '0123456789'
def number_of_words(text):
    """Return the number of words in ``text``.

    The text is split on runs of whitespace, every token is passed through
    ``remove_punctuation``, and a token is counted when ``is_word`` accepts
    what remains.
    """
    # str.split() with no argument never yields empty tokens, so empty or
    # whitespace-only text counts as zero words instead of handing '' to
    # remove_punctuation.
    return sum(
        1 for token in text.split()
        if is_word(remove_punctuation(token))
    )
def remove_punctuation(word):
    """Return ``word`` without a trailing ``'s`` or trailing punctuation.

    A word ending in ``'s`` only has that suffix removed. Any other word has
    every trailing character found in ``punctuation`` removed. An empty
    string, or a string made up entirely of punctuation, yields ``''``.
    """
    if word.endswith("'s"):
        return word[:-2]
    # TODO stuff like cat?.;! still get counted even if not a word per se
    # rstrip simply stops when the string runs out; indexing word[-1] in a
    # loop raised IndexError once the word became empty.
    return word.rstrip(punctuation)
def is_word(word):
    """Return True when ``word`` is non-empty and made only of ``letters``."""
    return word != '' and all(character in letters for character in word)
def number_of_letters(text):
    """Return how many characters of ``text`` are found in ``letters``."""
    return sum(1 for character in text if character in letters)
def number_of_symbols(text):
    """Return the number of symbols in ``text``.

    A symbol is any character that is not whitespace and is found in neither
    ``letters`` nor ``numbers``.
    """
    # Joining the whitespace-separated pieces with '' (rather than ' ') drops
    # every whitespace character, so the gaps between words are not counted.
    squeezed = ''.join(text.split())
    return sum(
        1 for character in squeezed
        if character not in letters and character not in numbers
    )
def most_common_words(text):
    """Return up to three of the most frequent words in ``text``.

    Tokens are split on whitespace and cleaned with ``remove_punctuation``;
    tokens that end up empty are ignored. The result is ordered from most to
    least frequent, with ties going to the word that sorts last. Fewer than
    three words are returned when the text holds fewer than three distinct
    words.
    """
    occurrences = {}
    for token in text.split():
        cleaned = remove_punctuation(token)
        if cleaned:
            occurrences[cleaned] = occurrences.get(cleaned, 0) + 1
    # Sorting (count, word) pairs in descending order gives the same picks as
    # taking the maximum three times, but does not fail when fewer than three
    # pairs exist.
    ranked = sorted(
        ((count, word) for word, count in occurrences.items()),
        reverse=True,
    )
    return [word for count, word in ranked[:3]]
def most_common_letters(text):
    """Return the three most frequent letters in ``text`` as a list.

    Every character of ``letters`` starts with a count of zero, so a letter
    that never occurs can still be returned when fewer than three different
    letters are present. Ties go to the letter that sorts last.
    """
    tally = dict.fromkeys(letters, 0)
    for character in text:
        if character in tally:
            tally[character] += 1
    # Highest (count, letter) pairs first; the leading three are the answer.
    ranked = sorted(
        ((count, letter) for letter, count in tally.items()),
        reverse=True,
    )
    return [pair[1] for pair in ranked[:3]]
def common_first_word(text):
    """Return the most common first word among the paragraphs of ``text``.

    Each newline-separated line is treated as a paragraph. Within a
    paragraph the space-separated tokens are cleaned with
    ``remove_punctuation`` and the first one accepted by ``is_word`` is
    counted. Ties go to the word that sorts last.

    Returns ``None`` when no paragraph contains a word.
    """
    # NOTE(review): the indentation of the original loop was lost, so the
    # position of its ``break`` is an assumption: scanning a paragraph stops
    # at the first token that is a word. Confirm this is the intended rule.
    openers = {}
    for paragraph in text.split('\n'):
        for token in paragraph.split(' '):
            # Splitting on a single space leaves '' for leading or repeated
            # spaces (and for blank paragraphs); those are not tokens.
            if token == '':
                continue
            cleaned = remove_punctuation(token)
            if is_word(cleaned):
                openers[cleaned] = openers.get(cleaned, 0) + 1
                break
    if not openers:
        # max() of an empty sequence would raise ValueError.
        return None
    best_count, best_word = max(
        (count, word) for word, count in openers.items()
    )
    return best_word
def words_used_once(text):
    """Return a list of the words that occur exactly once in ``text``.

    Tokens are split on whitespace and cleaned with ``remove_punctuation``;
    tokens that end up empty are ignored. Empty or whitespace-only text
    yields an empty list.
    """
    occurrences = {}
    # str.split() with no argument never yields '', so empty text produces no
    # tokens at all rather than a single empty one.
    for token in text.split():
        cleaned = remove_punctuation(token)
        if cleaned:
            occurrences[cleaned] = occurrences.get(cleaned, 0) + 1
    return [word for word, count in occurrences.items() if count == 1]
def letters_used_once(text):
    """Return the letters that never occur in ``text``.

    Despite the function's name, the result is the list of characters from
    ``letters`` that are absent from ``text``, in the order they appear in
    ``letters``.
    """
    return [letter for letter in letters if letter not in text]
# Script body: ask for a text file, read it in lowercase, and print a set of
# statistics about its words, letters and symbols.
text_location = input('Please type your text file location')
try:
    # The with-block closes the file even if reading fails part-way.
    with open(text_location, 'r') as source:
        # Every line keeps its own newline; the lines are additionally joined
        # with a space, and the text is lowercased because ``letters`` only
        # holds lowercase characters.
        text_file = ' '.join(source.readlines()).lower()
except OSError as error:
    # A missing or unreadable file ends the script with a short message
    # instead of a traceback.
    raise SystemExit('Could not read {0}: {1}'.format(text_location, error))

print('{0} words'.format(number_of_words(text_file)))
print('{0} letters'.format(number_of_letters(text_file)))
print('{0} symbols'.format(number_of_symbols(text_file)))
# join formats however many entries the list holds, comma-separated.
top_3 = most_common_words(text_file)
print('Top three most common words: {0}'.format(','.join(top_3)))
top_3 = most_common_letters(text_file)
print('Top three most common letters: {0}'.format(','.join(top_3)))
print('{0} is the most common first word of all paragraphs'.format(common_first_word(text_file)))
print('Words only used once: {0}'.format(words_used_once(text_file)))
print('Letters not used in the document: {0}'.format(letters_used_once(text_file)))