Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import re

# Tokenizer shared by the corpus builder and the query parser: a word is a
# maximal run of word characters (letters, digits, underscore). Everything
# else, including apostrophes, acts as a separator, so "don't" yields the
# two tokens "don" and "t".
p_words = re.compile(r'\w+')
def parse_line(line):
    """Return the word tokens of *line*, lowercased, in order of appearance.

    Tokens are the matches of the module-level ``p_words`` pattern; each
    match is lowercased individually after extraction.
    """
    return list(map(str.lower, p_words.findall(line)))
def generate_corpus(text):
    """Count unigrams and bigrams in *text*.

    The text is tokenized with ``parse_line``. Tokens longer than two
    characters are counted as unigrams under their string key. Every pair
    of adjacent tokens (regardless of token length) is counted as a bigram
    under a ``(previous, current)`` tuple key in the same dictionary.

    Returns a tuple ``(total_words, total_words_bi, corpus)`` where
    ``total_words`` is the number of unigram occurrences counted,
    ``total_words_bi`` is ``len(tokens) - 1`` (so it is -1 for text with no
    tokens), and ``corpus`` is the combined count dictionary.
    """
    tokens = parse_line(text)
    counts = {}
    unigram_total = 0

    for position, token in enumerate(tokens):
        # Short tokens (one or two characters) are skipped as unigrams.
        if len(token) > 2:
            counts[token] = counts.get(token, 0) + 1
            unigram_total += 1
        # The first token has no predecessor, hence no bigram.
        if position:
            pair = (tokens[position - 1], token)
            counts[pair] = counts.get(pair, 0) + 1

    return unigram_total, len(tokens) - 1, counts
def process_test_case(line):
    """Print the count and relative frequency of a one- or two-word query.

    Reads the module-level ``corpus``, ``total_words`` and
    ``total_words_bi`` produced by ``generate_corpus``.

    A one-word query is looked up as a unigram and divided by
    ``total_words``; a two-word query is looked up as a bigram and divided
    by ``total_words_bi``. The output is the original query text, the raw
    count and the frequency rounded to five decimals. Any other number of
    words prints ``invalid input``.
    """
    # Tokenize the query the same way the corpus text was tokenized
    # (lowercased); corpus keys are lowercase, so a query containing
    # capitals would otherwise never match.
    words = parse_line(line)

    if len(words) == 1:
        key, total = words[0], total_words
    elif len(words) == 2:
        key, total = (words[0], words[1]), total_words_bi
    else:
        print('invalid input')
        return

    cnt = corpus.get(key, 0)
    # An empty or single-word corpus yields a non-positive total; report a
    # frequency of 0.0 rather than dividing by zero (or by -1).
    frequency = round(cnt / total, 5) if total > 0 else 0.0
    print(line, cnt, frequency)
if __name__ == '__main__':
    # These three names are module-level globals read by process_test_case.
    # NOTE(review): sample_text is not defined in the visible part of this
    # file; it is presumably provided elsewhere in the module - confirm.
    total_words, total_words_bi, corpus = generate_corpus(sample_text)
    # Answer a single query read from standard input.
    process_test_case(input())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement