Advertisement
Guest User

Untitled

a guest
Apr 5th, 2020
273
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.94 KB | None | 0 0
  1. from operator import itemgetter
  2.  
  3.  
  4. def get_filename():
  5.     filename = str(input("Enter a file name: "))
  6.     # print(filename)
  7.     return filename
  8.  
  9.  
  10. def process_file(filename):
  11.     weekly_tokens = [[], [], [], [], [], [], [], [], [], [], []]
  12.     words_count = [{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}]
  13.     filename1 = open(filename, "r+")
  14.     # part 2 - finding the start of each article
  15.     file = filename1.readline()
  16.     count = 1
  17.     while not file == "":
  18.         if file.startswith("URL:"):
  19.             # print("Article", count, "\n", file)
  20.             url = file.split("URL: ")[1]
  21.             curr_week = get_week(url)
  22.             count += 1
  23.         file = filename1.readline()
  24.         # get string
  25.  
  26.         # rest of weeks append here
  27.  
  28.         # weekly_tokens = #something goes here
  29.         # return weekly_tokens
  30.  
  31.         # tokenize data
  32.         file1 = file.split(" ")
  33.         for file1words in file1:
  34.             file2words = file1words.strip(",?!()/—“”.;:\n")
  35.             file3words = file2words.lower()
  36.             if file3words == '' or file3words[0:4] == "http":
  37.                 continue
  38.             weekly_tokens[curr_week].append(file3words)
  39.             if file3words not in words_count[curr_week].keys():
  40.                 words_count[curr_week][file3words] = 1
  41.             else:
  42.                 words_count[curr_week][file3words] += 1
  43.  
  44.             # print(file3words)
  45.     print("A total number of", count - 1, "articles were found!")
  46.     filename1.close()
  47.     return words_count
  48.  
  49.  
  50. def get_week(url):
  51.     s = url[28:33]  # Gives string in mm/dd
  52.     month = int(s[0:2])  # Gives month
  53.     day = int(s[3:])  # Gives day
  54.     if month == 4:
  55.         if 15 <= day <= 21:
  56.             return 0
  57.         if 22 <= day <= 28:
  58.             return 1
  59.         else:
  60.             return 2
  61.     if month == 5:
  62.         if 1 <= day <= 5:
  63.             return 2
  64.         if 6 <= day <= 12:
  65.             return 3
  66.         if 13 <= day <= 19:
  67.             return 4
  68.         if 20 <= day <= 26:
  69.             return 5
  70.         else:
  71.             return 6
  72.     if month == 6:
  73.         if day == 1 or day == 2:
  74.             return 6
  75.         if 3 <= day <= 9:
  76.             return 7
  77.         if 10 <= day <= 16:
  78.             return 8
  79.         if 17 <= day <= 23:
  80.             return 9
  81.         else:
  82.             return 10
  83.  
  84.  
  85. def top5(token):
  86.     final_list = [{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}]
  87.     for i in range(11):
  88.         sortedDict = dict(sorted(token[i].items(), key=itemgetter(1), reverse=True))
  89.         final_list[i] = {key: value for key, value in list(sortedDict.items())[0:5]}
  90.     return final_list
  91.  
  92.  
  93. # main function
  94. def main():
  95.     fname = get_filename()
  96.     weekly_tokens = process_file(fname)
  97.     weekly_tokens = top5(weekly_tokens)
  98.     for i in range(11):
  99.         print("Top five words for Week", i)
  100.         for key, value in weekly_tokens[i].items():
  101.             print(key, ": ", value)
  102.         print("\n")
  103.  
  104. main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement