Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from operator import itemgetter
def get_filename():
    """Prompt the user for the name of the file to analyse.

    Returns:
        The text typed at the prompt, exactly as entered.
    """
    # input() already hands back a str, so the answer is returned directly.
    return input("Enter a file name: ")
def process_file(filename):
    """Count word frequencies per week across the articles in a text file.

    The file is scanned line by line. A line starting with ``URL:`` marks the
    start of an article; the week index for that article is obtained by
    passing the text after ``"URL: "`` to ``get_week``. Each following line is
    split on single spaces, every piece is stripped of surrounding
    punctuation and lower-cased, and the result is tallied under the current
    article's week. Empty tokens and tokens beginning with ``http`` are
    ignored.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of 11 dicts (one per week index 0-10), each mapping a token to
        the number of times it occurred in that week's articles.

    Side effects:
        Prints the number of articles (``URL:`` lines) that were found.
    """
    words_count = [{} for _ in range(11)]
    article_count = 0
    curr_week = None

    # Read-only access is all that is needed, and the context manager closes
    # the file even if get_week() or the indexing below raises.
    with open(filename, "r") as source:
        for line in source:
            if line.startswith("URL:"):
                # Header line: switch to the new article's week, but do not
                # count the header's own words against any week.
                curr_week = get_week(line.split("URL: ")[1])
                article_count += 1
                continue
            if article_count == 0:
                # Text ahead of the first URL line belongs to no article.
                continue

            week_counts = words_count[curr_week]
            for raw_word in line.split(" "):
                word = raw_word.strip(",?!()/—“”.;:\n").lower()
                if not word or word.startswith("http"):
                    continue
                week_counts[word] = week_counts.get(word, 0) + 1

    print("A total number of", article_count, "articles were found!")
    return words_count
def get_week(url):
    """Return the 0-based week index for the date embedded in an article URL.

    Characters 28-32 of ``url`` are read as ``mm/dd``. Week 0 is April 15-21
    and each following 7-day span is the next week, up to week 10
    (June 24-30).

    Args:
        url: Article URL with the two-digit month at index 28 and the
            two-digit day at index 31.

    Returns:
        An int from 0 to 10.

    Raises:
        ValueError: If the month or day field is not numeric, or if the date
            falls outside April 15 - June 30.
    """
    stamp = url[28:33]  # "mm/dd"
    month = int(stamp[0:2])
    day = int(stamp[3:])

    # month -> (days elapsed since April 1 when the month starts, month length)
    calendar = {4: (0, 30), 5: (30, 31), 6: (61, 30)}
    if month not in calendar:
        raise ValueError(f"month {month} is outside the April-June range")

    days_before_month, month_length = calendar[month]
    if not 1 <= day <= month_length:
        raise ValueError(f"day {day} is not valid for month {month}")

    # Whole days since April 15, the first day of week 0.
    offset = days_before_month + day - 15
    if offset < 0:
        raise ValueError(f"date {stamp!r} is before April 15")
    return offset // 7
def top5(token):
    """Return the five most frequent words for each week.

    Args:
        token: Sequence of dicts mapping word -> count, one dict per week.

    Returns:
        A list the same length as ``token``. Each entry is a new dict holding
        at most five ``word: count`` pairs in descending count order; words
        with equal counts keep the relative order they had in the input dict
        (``sorted`` is stable). The input dicts are not modified.
    """
    # Iterating over the input itself, instead of a fixed range(11), keeps
    # this function independent of how many weeks the caller tracks.
    return [
        dict(sorted(week.items(), key=itemgetter(1), reverse=True)[:5])
        for week in token
    ]
# main function
def main():
    """Prompt for a file, count words per week and print each week's top five.

    Reads the file name via ``get_filename``, builds the per-week word counts
    with ``process_file``, reduces them with ``top5`` and prints the result
    for weeks 0 through 10.
    """
    fname = get_filename()
    weekly_top = top5(process_file(fname))
    for week in range(11):
        print("Top five words for Week", week)
        for word, total in weekly_top[week].items():
            print(word, ": ", total)
        print("\n")


# Only run the interactive program when executed as a script, so that
# importing this module does not block on a prompt.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement