Advertisement
AlisaK

Untitled

Oct 23rd, 2017
121
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.38 KB | None | 0 0
  1. p_words = re.compile(r'\w+')
  2.  
  3.  
  4. # p_words = re.compile(r'[^\n\r\t\.\:\,\d\!\?; ]+')
  5. # p_words = re.compile(r'[\w\']+')
  6.  
  7.  
  8. def parse_line(line):
  9.     words = p_words.findall(line)
  10.     return [w.lower() for w in words]
  11.  
  12.  
  13. def generate_corpus(text):
  14.     corpus = {}
  15.     words = parse_line(text)
  16.     total_words = 0
  17.     total_words_bi = len(words) - 1
  18.     if len(words) > 0:
  19.         if len(words[0]) > 2:
  20.             corpus[words[0]] = 1
  21.             total_words += 1
  22.         for i in range(1, len(words)):
  23.             word = words[i]
  24.             prev_word = words[i - 1]
  25.             if len(word) > 2:
  26.                 corpus.setdefault(word, 0)
  27.                 corpus[word] += 1
  28.                 total_words += 1
  29.             corpus.setdefault((prev_word, word), 0)
  30.             corpus[(prev_word, word)] += 1
  31.  
  32.     return total_words, total_words_bi, corpus
  33.  
  34.  
  35. def process_test_case(line):
  36.     words = p_words.findall(line)
  37.     if len(words) == 1:
  38.         cnt = corpus.get(words[0], 0)
  39.         print(line, cnt, round(cnt * 1.0 / total_words, 5))
  40.     elif len(words) == 2:
  41.         cnt = corpus.get((words[0], words[1]), 0)
  42.         print(line, cnt, round(cnt * 1.0 / (total_words_bi), 5))
  43.     else:
  44.         print('invalid input')
  45.  
  46.  
  47. if __name__ == '__main__':
  48.     total_words, total_words_bi, corpus = generate_corpus(sample_text)
  49.     line = input()
  50.     process_test_case(line)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement