Advertisement
Guest User

Untitled

a guest
Oct 21st, 2019
94
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.21 KB | None | 0 0
  1. KEY = 0
  2. VALUE = 1
  3. import string
  4.  
  5.  
  6.  
  7. def get_file_object(filename):
  8. try:
  9. file_object = open(filename, 'r')
  10. return file_object
  11. except FileNotFoundError:
  12. return None
  13.  
  14. def process_lines(file_object):
  15. '''Splits text up in lists of lists with paragraphs in seperate lists'''
  16. paragraph_list = []
  17. line_list = []
  18. for line in file_object:
  19. word_list = line.split()
  20. if line != "\n":
  21. for word in word_list:
  22. line_list.append(word.lower().strip(string.punctuation))
  23. else:
  24. paragraph_list.append(line_list)
  25. line_list = []
  26. if line_list not in paragraph_list: #Takes the rest of the text, before it was missing the 3 paragraph
  27. paragraph_list.append(line_list)
  28. return paragraph_list
  29.  
  30. def find_words(paragraph_list):
  31. '''assigns each word with the paragraph it is located in, returned in tuples.'''
  32. a_dict = {}
  33. for index, paragraph in enumerate(paragraph_list, start=1):
  34. for word in paragraph:
  35. if word not in a_dict:
  36. a_dict[word] = list(str(index))
  37. else:
  38. a_dict[word].append(str(index))
  39. a_list= []
  40. for key, value in a_dict.items():
  41. value = [int(x) for x in value]
  42. remove_doubles = list(set(value))
  43. a_list.append((key, sorted(remove_doubles))) #Returns the dict in list of tuples
  44. return a_list
  45.  
  46. def get_second(x):
  47. '''this function is to sort the list by the value'''
  48. return x[1]
  49.  
  50. def count_words(paragraph_list):
  51. '''counts words to find the top 10 and top 20 counts'''
  52. count_dict = {}
  53. for paragraph in paragraph_list:
  54. for word in paragraph:
  55. if word not in count_dict:
  56. count_dict[word] = 1
  57. else:
  58. count_dict[word] += 1
  59. count_list = []
  60. for word, value in count_dict.items():
  61. count_list.append((word, value),)
  62. count_list = sorted(count_list)
  63. count_list = sorted(count_list, key = get_second, reverse=True) # get second sorts after the value.
  64. return count_list[:10], count_list[:20]
  65.  
  66. def print_lines(word_placement, top_10, top_20):
  67. '''prints lines and formats the text'''
  68. print()
  69. print("The paragraph index:")
  70. for a_tuple in sorted(word_placement):
  71. value = [str(x) for x in a_tuple[VALUE]]
  72. if len(value) >= 1:
  73. value = ", ".join(value)
  74. key = a_tuple[KEY]
  75. print("{} {}".format(key, value))
  76. print()
  77. print("The highest 10 counts: ")
  78. for a_tuple in top_10:
  79. print("{}: {}".format(a_tuple[KEY],a_tuple[VALUE]))
  80. print()
  81. print("The highest 20 counts: ")
  82. for a_tuple in top_20:
  83. print("{}: {}".format(a_tuple[KEY],a_tuple[VALUE]))
  84.  
  85. def main():
  86. filename = input("Enter filename: ")
  87. file_object = get_file_object(filename)
  88. if file_object:
  89. paragraph_list = process_lines(file_object)
  90. word_placement = find_words(paragraph_list)
  91. top_10, top_20 = count_words(paragraph_list)
  92. print_lines(word_placement, top_10, top_20)
  93. else:
  94. print("Filename {} not found!".format(filename))
  95.  
  96. main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement