SHARE
TWEET

Untitled

a guest Oct 21st, 2019 79 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. KEY = 0
  2. VALUE = 1
  3. import string
  4.  
  5.  
  6.  
  7. def get_file_object(filename):
  8.     try:
  9.         file_object = open(filename, 'r')
  10.         return file_object
  11.     except FileNotFoundError:
  12.         return None
  13.  
  14. def process_lines(file_object):
  15.     '''Splits text up in lists of lists with paragraphs in seperate lists'''
  16.     paragraph_list = []
  17.     line_list = []
  18.     for line in file_object:
  19.         word_list = line.split()
  20.         if line != "\n":
  21.             for word in word_list:
  22.                 line_list.append(word.lower().strip(string.punctuation))
  23.         else:
  24.             paragraph_list.append(line_list)
  25.             line_list = []
  26.     if line_list not in paragraph_list:         #Takes the rest of the text, before it was missing the 3 paragraph
  27.         paragraph_list.append(line_list)
  28.     return paragraph_list
  29.  
  30. def find_words(paragraph_list):
  31.     '''assigns each word with the paragraph it is located in, returned in tuples.'''
  32.     a_dict = {}
  33.     for index, paragraph in enumerate(paragraph_list, start=1):
  34.         for word in paragraph:
  35.             if word not in a_dict:
  36.                 a_dict[word] = list(str(index))
  37.             else:
  38.                 a_dict[word].append(str(index))
  39.     a_list= []
  40.     for key, value in a_dict.items():
  41.         value = [int(x) for x in value]
  42.         remove_doubles = list(set(value))
  43.         a_list.append((key, sorted(remove_doubles)))       #Returns the dict in list of tuples
  44.     return a_list
  45.  
  46. def get_second(x):
  47.     '''this function is to sort the list by the value'''
  48.     return x[1]
  49.  
  50. def count_words(paragraph_list):
  51.     '''counts words to find the top 10 and top 20 counts'''
  52.     count_dict = {}
  53.     for paragraph in paragraph_list:
  54.         for word in paragraph:
  55.             if word not in count_dict:
  56.                 count_dict[word] = 1
  57.             else:
  58.                 count_dict[word] += 1
  59.     count_list = []
  60.     for word, value in count_dict.items():
  61.         count_list.append((word, value),)
  62.     count_list = sorted(count_list)  
  63.     count_list = sorted(count_list, key = get_second, reverse=True)  # get second sorts after the value.
  64.     return count_list[:10], count_list[:20]
  65.  
  66. def print_lines(word_placement, top_10, top_20):
  67.     '''prints lines and formats the text'''
  68.     print()
  69.     print("The paragraph index:")
  70.     for a_tuple in sorted(word_placement):
  71.         value = [str(x) for x in a_tuple[VALUE]]
  72.         if len(value) >= 1:
  73.             value = ", ".join(value)
  74.         key = a_tuple[KEY]
  75.         print("{} {}".format(key, value))
  76.     print()
  77.     print("The highest 10 counts: ")
  78.     for a_tuple in top_10:
  79.         print("{}: {}".format(a_tuple[KEY],a_tuple[VALUE]))
  80.     print()
  81.     print("The highest 20 counts: ")
  82.     for a_tuple in top_20:
  83.         print("{}: {}".format(a_tuple[KEY],a_tuple[VALUE]))
  84.  
  85. def main():
  86.     filename = input("Enter filename: ")
  87.     file_object = get_file_object(filename)
  88.     if file_object:
  89.         paragraph_list = process_lines(file_object)
  90.         word_placement = find_words(paragraph_list)
  91.         top_10, top_20 = count_words(paragraph_list)
  92.         print_lines(word_placement, top_10, top_20)
  93.     else:
  94.         print("Filename¬†{} not found!".format(filename))
  95.    
  96. main()
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top