Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Characters that remove_punctuation() strips from the end of a token.
punctuation = '!.,?";:'
# The 26 lowercase ASCII letters. Only lowercase is listed, so callers must
# lowercase their text before using the letter/word helpers. The order only
# matters to letters_used_once(), which reports its result in this order.
letters = 'etaoinshrdlcumwfgypbvkjxqz'
# Decimal digits; number_of_symbols() does not count these as symbols.
numbers = '0123456789'
def number_of_words(text):
    """Return the number of words in ``text``.

    The text is split on runs of whitespace. Each token is passed through
    ``remove_punctuation`` and counted only if ``is_word`` accepts the
    result, so tokens such as ``"123"`` or ``"e-mail"`` are not counted.
    """
    # str.split() with no argument never yields empty tokens, so empty or
    # whitespace-only text counts as zero words instead of handing '' to
    # remove_punctuation.
    return sum(1 for token in text.split() if is_word(remove_punctuation(token)))
def remove_punctuation(word):
    """Return ``word`` without trailing punctuation and without a final ``'s``.

    Trailing characters listed in ``punctuation`` are stripped first, then a
    possessive ``'s`` suffix is dropped, so ``"dog's."`` becomes ``"dog"``.
    Leading punctuation is left untouched. An empty or all-punctuation token
    yields ``''`` rather than raising IndexError.
    """
    # TODO: tokens like "cat?.;!" are reduced to "cat" and therefore still
    # count as a word, even though they are not one per se.
    word = word.rstrip(punctuation)
    if word.endswith("'s"):
        word = word[:-2]
    return word
def is_word(word):
    """Return True if ``word`` is non-empty and made only of ``letters``.

    ``letters`` holds lowercase characters only, so any uppercase letter,
    digit, apostrophe or hyphen makes the result False.
    """
    return word != '' and all(char in letters for char in word)
def number_of_letters(text):
    """Return how many characters of ``text`` appear in ``letters``.

    Only the lowercase letters listed in ``letters`` are counted.
    """
    return sum(1 for char in text if char in letters)
def number_of_symbols(text):
    """Return the number of symbols in ``text``, excluding whitespace.

    A symbol is any character that is not whitespace, not in ``letters`` and
    not in ``numbers``. Because ``letters`` is lowercase only, an uppercase
    letter is counted as a symbol unless the caller lowercases the text first.
    """
    # The previous version collapsed whitespace to single spaces but then
    # still counted each remaining space as a symbol; skip whitespace outright.
    return sum(
        1
        for char in text
        if not char.isspace() and char not in letters and char not in numbers
    )
def most_common_words(text):
    """Return the most frequent words in ``text``, most frequent first.

    Tokens are split on whitespace and cleaned with ``remove_punctuation``;
    tokens that become empty are ignored. No ``is_word`` check is applied, so
    tokens such as ``"123"`` are counted too.

    Returns:
        A list of at most three words. Fewer are returned when the text has
        fewer than three distinct words (an empty list for empty text).
        Ties are broken in favour of the alphabetically later word.
    """
    counts = {}
    for token in text.split():
        word = remove_punctuation(token)
        if word:
            counts[word] = counts.get(word, 0) + 1
    # Sorting (count, word) pairs in descending order reproduces the result of
    # repeatedly taking max() of the pairs, without failing once the list
    # runs out of entries.
    ranked = sorted(((count, word) for word, count in counts.items()), reverse=True)
    return [word for _, word in ranked[:3]]
def most_common_letters(text):
    """Return the three most frequent letters in ``text``, most frequent first.

    Only characters present in ``letters`` are counted. Every letter starts
    with a count of zero, so exactly three letters are always returned; for
    text with fewer than three distinct letters the remainder are letters
    that never occur. Ties are broken in favour of the alphabetically later
    letter.
    """
    counts = {letter: 0 for letter in letters}
    for char in text:
        if char in counts:
            counts[char] += 1
    # Descending (count, letter) order matches repeatedly taking max().
    ranked = sorted(((count, letter) for letter, count in counts.items()), reverse=True)
    return [letter for _, letter in ranked[:3]]
def common_first_word(text):
    """Return the word that most often starts a paragraph of ``text``.

    Paragraphs are the pieces of ``text`` between newline characters. For each
    paragraph, the first token that ``is_word`` accepts after
    ``remove_punctuation`` is tallied; leading tokens that are not words
    (for example numbers) are skipped. Ties are broken in favour of the
    alphabetically later word.

    Raises:
        ValueError: if no paragraph contains a word.
    """
    counts = {}
    for paragraph in text.split('\n'):
        # split() with no argument skips blank paragraphs and empty tokens.
        for token in paragraph.split():
            word = remove_punctuation(token)
            if is_word(word):
                counts[word] = counts.get(word, 0) + 1
                break  # only the first word of each paragraph is tallied
    if not counts:
        raise ValueError('text contains no words')
    return max(counts, key=lambda word: (counts[word], word))
def words_used_once(text):
    """Return the words that occur exactly once in ``text``.

    Tokens are split on whitespace and cleaned with ``remove_punctuation``;
    tokens that become empty are ignored. The result lists words in order of
    first appearance and is empty for empty text.
    """
    counts = {}
    for token in text.split():
        word = remove_punctuation(token)
        if word:
            counts[word] = counts.get(word, 0) + 1
    return [word for word, count in counts.items() if count == 1]
def letters_used_once(text):
    """Return the letters from ``letters`` that do not occur in ``text``.

    NOTE: despite the name, this reports letters that are *never* used, not
    letters used exactly once. The name is kept so existing callers keep
    working. The result follows the order of ``letters``.
    """
    return [letter for letter in letters if letter not in text]
def main():
    """Prompt for a text file path and print statistics about its contents.

    The file is read in full and lowercased before analysis, because the
    letter and word helpers only recognise lowercase letters.
    """
    text_location = input('Please type your text file location')
    try:
        # The context manager closes the file even if reading fails.
        with open(text_location, 'r') as source:
            text_file = source.read().lower()
    except OSError as error:
        # Covers missing files, directories and permission problems.
        raise SystemExit('Could not read {0!r}: {1}'.format(text_location, error))

    print('{0} words'.format(number_of_words(text_file)))
    print('{0} letters'.format(number_of_letters(text_file)))
    print('{0} symbols'.format(number_of_symbols(text_file)))
    # Joining avoids assuming that exactly three entries were returned.
    print('Top three most common words: {0}'.format(','.join(most_common_words(text_file))))
    print('Top three most common letters: {0}'.format(','.join(most_common_letters(text_file))))
    try:
        first_word = common_first_word(text_file)
    except ValueError:
        # Raised when the text contains no words at all.
        print('No paragraph starts with a word')
    else:
        print('{0} is the most common first word of all paragraphs'.format(first_word))
    print('Words only used once: {0}'.format(words_used_once(text_file)))
    print('Letters not used in the document: {0}'.format(letters_used_once(text_file)))


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement