Advertisement
Guest User

Untitled

a guest
Feb 17th, 2019
85
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.75 KB | None | 0 0
  # coding=utf-8
"""Train a naive-Bayes-style news classifier and label a test file.

Training lines in ``news_train.txt`` are tab-separated: a category name
followed by two text fields.  Per-category word statistics are built from
them, then one predicted category per line of ``news_test.txt`` is written
to ``output.txt``.
"""
import codecs  # noqa: F401 -- no longer used below; kept from the original import list
import string
import re
import math
import time

CATEGORIES = ('business', 'culture', 'science', 'economics', 'forces', 'life', 'media', 'sport', 'style', 'travel')
TRAIN_PATH = 'news_train.txt'
TEST_PATH = 'news_test.txt'
OUTPUT_PATH = 'output.txt'
UNKNOWN_WORD_PROBABILITY = 0.1

# re.escape keeps ']', '\\', '^' and '-' literal inside the character class.
# Without it the "\]" pair in string.punctuation is read as an escaped ']',
# so backslashes were never stripped.
_NOISE_RE = re.compile('[-.»«0-9' + re.escape(string.punctuation) + ']')
_SHORT_WORD_RE = re.compile(r'\b\w{1,2}\b')


def clean_text(text):
    """Return *text* lower-cased with noise replaced by spaces.

    Punctuation, digits and the « » quotes become spaces first, then every
    word of one or two characters is blanked out.  Tabs are left alone, so
    the result can still be split into tab-separated fields.
    """
    without_noise = _NOISE_RE.sub(' ', text)
    return _SHORT_WORD_RE.sub(' ', without_noise).lower()


def count_training_data(lines, categories=CATEGORIES):
    """Count words and documents per category.

    *lines* is an iterable of ``category<TAB>text<TAB>text`` strings; any
    further fields are ignored.  Blank lines are skipped.

    Returns ``(words, data)`` where ``words[word][cat]`` is an add-one
    smoothed occurrence count (every category starts at 1 the first time a
    word is seen) and ``data[cat]`` holds the ``'words'`` and ``'docs'``
    totals for the category.

    Raises ValueError for a non-blank line that has fewer than three
    fields or an unknown category.
    """
    data = {cat: {'words': 0, 'docs': 0} for cat in categories}
    words = {}
    for line_number, raw_line in enumerate(lines, 1):
        if not raw_line.strip():
            continue  # tolerate blank lines, e.g. a trailing one at EOF
        fields = [field.strip() for field in clean_text(raw_line).split('\t')]
        if len(fields) < 3 or fields[0] not in data:
            raise ValueError(
                'line {}: expected "<category>\\t<text>\\t<text>" with a known '
                'category, got {!r}'.format(line_number, raw_line))
        category = fields[0]
        file_words = fields[1].split() + fields[2].split()
        totals = data[category]
        totals['docs'] += 1
        # NOTE(review): the word total also grows by one per document, as in
        # the original script.  It keeps the total non-zero for every
        # category that has a document, which the later divisions rely on;
        # confirm whether it was intended.
        totals['words'] += 1
        for word in file_words:
            if word not in words:
                words[word] = {cat: 1 for cat in categories}
            words[word][category] += 1
            totals['words'] += 1
    return words, data


def trained_categories(data, categories=CATEGORIES):
    """Return the categories (in their given order) that have documents.

    Categories without training documents are left out of scoring because
    their zero totals cannot be divided by or passed to ``math.log``.

    Raises ValueError when no category has any document.
    """
    seen = tuple(cat for cat in categories if data[cat]['docs'])
    if not seen:
        raise ValueError('no training documents found')
    return seen


def word_probabilities(words, categories):
    """Return ``{word: {cat: share}}``: each category's share of a word's smoothed count."""
    word_probability = {}
    for word, counts in words.items():
        word_count = sum(counts[cat] for cat in categories)
        word_probability[word] = {cat: counts[cat] / word_count for cat in categories}
    return word_probability


def normalized_log_likelihoods(word_probability, data, categories):
    """Return ``{word: {cat: ln(weight)}}``.

    Each share is divided by the category's word total and the results are
    rescaled to sum to 1 across *categories* before taking the logarithm.
    """
    word_probability_ln = {}
    for word, probability in word_probability.items():
        weights = {cat: probability[cat] / data[cat]['words'] for cat in categories}
        denominator = sum(weights[cat] for cat in categories)
        word_probability_ln[word] = {cat: math.log(weights[cat] / denominator) for cat in categories}
    return word_probability_ln


def unknown_log_likelihood(data, categories, unknown_word_probability=UNKNOWN_WORD_PROBABILITY):
    """Return ``{cat: ln(weight)}`` used for words never seen in training."""
    weights = {cat: unknown_word_probability / data[cat]['words'] for cat in categories}
    denominator = sum(weights[cat] for cat in categories)
    return {cat: math.log(weights[cat] / denominator) for cat in categories}


def category_log_priors(data, categories):
    """Return ``{cat: ln(share of training documents in cat)}``."""
    documents_count = sum(data[cat]['docs'] for cat in categories)
    return {cat: math.log(data[cat]['docs'] / documents_count) for cat in categories}


def classify(text, categories, word_probability_ln, unknown_prob_ln, category_prob_ln):
    """Return the category with the highest log score for *text*.

    The score is the category's log prior plus the log weight of every
    word left after ``clean_text``.  On an exact tie the category that
    comes last in *categories* wins, matching the original script.
    """
    file_words = clean_text(text).split()

    def score(cat):
        return sum((word_probability_ln.get(word, unknown_prob_ln)[cat] for word in file_words),
                   category_prob_ln[cat])

    # max() keeps the first maximal item, so walk the categories backwards
    # to let the last one win ties.
    return max(reversed(categories), key=score)


def main():
    """Train on TRAIN_PATH, classify TEST_PATH line by line into OUTPUT_PATH."""
    start = time.time()
    print('Training...')
    # Built-in open() splits lines only on newline characters; codecs.open()
    # also splits on Unicode separators that may occur inside article text.
    # 'utf-8-sig' additionally drops a leading BOM if one is present.
    with open(TRAIN_PATH, encoding='utf-8-sig') as train:
        words, data = count_training_data(train)
    categories = trained_categories(data)

    print('Text reading completed in', round(time.time() - start, 3), 'sec.')
    print('Counting probabilities...')
    word_probability = word_probabilities(words, categories)

    print('Probability counting completed in', round(time.time() - start, 3), 'sec.')
    print('Normalization...')
    word_probability_ln = normalized_log_likelihoods(word_probability, data, categories)

    print('Normalization completed in', round(time.time() - start, 3), 'sec.')
    category_prob_ln = category_log_priors(data, categories)

    print('Training completed in', round(time.time() - start, 3), 'sec.')
    print('Recognition...')
    unknown_prob_ln = unknown_log_likelihood(data, categories)

    with open(TEST_PATH, encoding='utf-8-sig') as test, \
            open(OUTPUT_PATH, 'w', encoding='utf-8') as output:
        for line in test:
            output.write(classify(line, categories, word_probability_ln,
                                  unknown_prob_ln, category_prob_ln) + '\n')

    print('Recognition completed in', round(time.time() - start, 3), 'sec.')


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement