Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Positions inside the two-element tuples that this script passes around.
KEY, VALUE = 0, 1  # KEY -> the word, VALUE -> its paragraph list or its count
- import string
def get_file_object(filename):
    '''Open filename for reading in text mode.

    Returns the open file object, or None when the file does not exist.
    The caller owns the returned object and is responsible for closing it.
    '''
    try:
        handle = open(filename, 'r')
    except FileNotFoundError:
        handle = None
    return handle
def process_lines(file_object):
    '''Split the text into paragraphs, each paragraph being a list of words.

    A line that consists of only a newline closes the current paragraph
    (so consecutive blank lines produce empty paragraphs). Every other line
    is split on whitespace; each word is lower-cased and stripped of leading
    and trailing punctuation. Tokens that are nothing but punctuation, such
    as a lone "-", are skipped instead of being stored as empty strings.

    file_object: any iterable that yields lines of text, e.g. an open file.

    Returns a list of paragraphs (lists of words) in text order.
    '''
    paragraph_list = []
    current_paragraph = []
    for line in file_object:
        if line == "\n":
            # Blank line: the paragraph collected so far is complete.
            paragraph_list.append(current_paragraph)
            current_paragraph = []
            continue
        for raw_word in line.split():
            word = raw_word.lower().strip(string.punctuation)
            if word:
                current_paragraph.append(word)
    # Whatever follows the last blank line is the final paragraph. Test for
    # "non-empty" rather than "not already in the list": a membership test
    # would drop a final paragraph whose words equal an earlier paragraph's.
    if current_paragraph:
        paragraph_list.append(current_paragraph)
    return paragraph_list
def find_words(paragraph_list):
    '''Map every word to the paragraphs it occurs in.

    paragraph_list: list of paragraphs, each a list of words.

    Returns a list of (word, paragraph_numbers) tuples, where
    paragraph_numbers is a sorted list of unique 1-based paragraph indexes.
    The tuples appear in the order the words are first seen in the text.
    '''
    placements = {}
    for index, paragraph in enumerate(paragraph_list, start=1):
        for word in paragraph:
            # Keep the index as an int in a set: converting it with
            # list(str(index)) would split e.g. 12 into the digits 1 and 2.
            placements.setdefault(word, set()).add(index)
    return [(word, sorted(indexes)) for word, indexes in placements.items()]
def get_second(x):
    '''Sort-key helper: return the item at index 1 of x (the value of a pair).'''
    second_item = x[1]
    return second_item
def count_words(paragraph_list):
    '''Count how often each word occurs across all paragraphs.

    Returns a pair (top_10, top_20): the ten and the twenty most frequent
    (word, count) tuples, ordered by descending count with ties broken
    alphabetically by word.
    '''
    tally = {}
    for paragraph in paragraph_list:
        for word in paragraph:
            tally[word] = tally.get(word, 0) + 1
    # A single sort on (-count, word) gives the same order as sorting
    # alphabetically first and then stably by count in reverse.
    ranked = sorted(tally.items(), key=lambda pair: (-pair[1], pair[0]))
    return ranked[:10], ranked[:20]
def print_lines(word_placement, top_10, top_20):
    '''Print the paragraph index followed by the two frequency rankings.

    word_placement: (word, paragraph_numbers) tuples; printed in sorted order
        with the paragraph numbers joined by ", ".
    top_10, top_20: (word, count) tuples; printed in the order given.
    '''
    print()
    print("The paragraph index:")
    for entry in sorted(word_placement):
        numbers = list(map(str, entry[VALUE]))
        # An empty list is left unjoined, so it is printed as "[]".
        shown = ", ".join(numbers) if numbers else numbers
        print("{} {}".format(entry[KEY], shown))

    rankings = (
        ("The highest 10 counts: ", top_10),
        ("The highest 20 counts: ", top_20),
    )
    for heading, ranking in rankings:
        print()
        print(heading)
        for entry in ranking:
            print("{}: {}".format(entry[KEY], entry[VALUE]))
def main():
    '''Ask for a filename and print the word statistics of that file.

    Prints the paragraph index and the top 10 / top 20 word counts, or an
    error message when the file does not exist.
    '''
    filename = input("Enter filename: ")
    file_object = get_file_object(filename)
    if file_object is None:
        print("Filename {} not found!".format(filename))
        return
    # get_file_object returns an open handle; the with-block makes sure it
    # is closed again once all lines are read, even if processing fails.
    with file_object:
        paragraph_list = process_lines(file_object)
    word_placement = find_words(paragraph_list)
    top_10, top_20 = count_words(paragraph_list)
    print_lines(word_placement, top_10, top_20)
# Start the interactive program only when the file is run as a script, so
# that importing the module does not prompt for input.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement