Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import operator
import re
import sys
def count_combos(input_string):
    """Count how often each pair of adjacent words occurs in a string.

    The string is split on runs of whitespace; every word is paired with
    the word that immediately follows it.

    Args:
        input_string: Text to analyse.

    Returns:
        A dict mapping ``(word, next_word)`` tuples to their number of
        occurrences, keyed in order of first appearance.  The dict is
        empty when the input holds fewer than two words.
    """
    words = input_string.split()
    freq = {}
    # zip() stops at the shorter argument, so the final word (which has no
    # successor) never starts a pair and 0- or 1-word input yields nothing.
    for word_pair in zip(words, words[1:]):
        freq[word_pair] = freq.get(word_pair, 0) + 1
    return freq
def main():
    """Report the five most frequent adjacent word pairs in a story file.

    Command line: ``<script> STORY_FILE SKIP_WORDS_FILE``

    STORY_FILE is read, lower-cased, stripped of a fixed set of punctuation
    characters and split into words.  SKIP_WORDS_FILE holds a
    comma-separated list of words that are removed before pairs are formed.
    The file names, the skip words and up to five ``('w1 w2', count)`` lines
    are printed, most frequent pair first.

    Raises:
        SystemExit: If fewer than two file arguments are supplied.
        OSError: If either file cannot be opened or read.
    """
    if len(sys.argv) < 3:
        sys.exit(f"usage: {sys.argv[0]} STORY_FILE SKIP_WORDS_FILE")
    story_path = sys.argv[1]
    skip_path = sys.argv[2]

    with open(story_path, 'r') as content:
        story_content = content.read().lower()
    with open(skip_path, 'r') as content:
        raw_skip_words = content.read().lower().split(',')

    # Strip surrounding whitespace (e.g. the file's trailing newline or a
    # space after a comma) so entries can match story words; drop blank
    # entries left by stray commas.
    skip_words = [word.strip() for word in raw_skip_words if word.strip()]
    skip_lookup = set(skip_words)  # O(1) membership test per story word

    story_content = re.sub('[\n\t.?,;:\'\"]', ' ', story_content)
    # split() with no argument breaks on any whitespace run, so the words
    # filtered here are the same words count_combos() pairs up afterwards.
    story_list = [
        word for word in story_content.split() if word not in skip_lookup
    ]

    freq_dict = count_combos(" ".join(story_list))
    ranked = sorted(freq_dict.items(), key=operator.itemgetter(1))
    # ranked[-5:] clamps to the whole list when fewer than five pairs exist;
    # reversing the ascending slice puts the most frequent pair first.
    top_5 = ranked[-5:][::-1]

    print(f"Story file name: {story_path}")
    print(f"Skip words file name: {skip_path}")
    print(f"Skip words: {skip_words}")
    # NOTE(review): "occuring" is misspelled; left unchanged in case the
    # output text is compared verbatim - confirm before correcting.
    print("The five most frequently occuring word pairs are:")
    for pair, count in top_5:
        print(f"('{pair[0]} {pair[1]}', {count})")
# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement