Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# coding=utf-8
"""Naive Bayes news-category classifier.

Trains on tab-separated lines of ``news_train.txt``
(``category<TAB>title<TAB>text``), then labels every line of
``news_test.txt`` and writes one predicted category per line to
``output.txt``.
"""
import codecs
import string
import re
import math
import time

start = time.time()
print('Training...')

categories = ('business', 'culture', 'science', 'economics', 'forces',
              'life', 'media', 'sport', 'style', 'travel')

# Compile the cleaning patterns once instead of rebuilding them per line.
# re.escape keeps every punctuation character literal inside the class;
# the original inlined string.punctuation raw and only parsed correctly
# because it happens to contain the escaping sequence '[\]'.
_PUNCT_RE = re.compile('[-.»«0-9' + re.escape(string.punctuation) + ']')
_SHORT_RE = re.compile(r'\b\w{1,2}\b')


def _clean(text):
    """Lower-case *text*, replacing punctuation/digits and 1-2 letter words
    with spaces (tabs are untouched, so field boundaries survive)."""
    return _SHORT_RE.sub(' ', _PUNCT_RE.sub(' ', text)).lower()


# Per-category counters: 'words' is the smoothed token count used as a
# normalization denominator, 'docs' the number of training documents.
data = {cat: {'words': 0, 'docs': 0} for cat in categories}
# words[w][cat] starts at 1 for every category: add-one (Laplace) smoothing.
words = {}

with codecs.open('news_train.txt', 'r', 'utf8') as train:
    for line in train:
        line_split = [s.strip() for s in _clean(line).split('\t')]
        category = line_split[0]
        file_words = line_split[1].split() + line_split[2].split()
        # NOTE(review): this extra +1 per document (on top of +1 per token
        # below) is kept from the original; it slightly inflates the
        # denominator — confirm whether it was intentional smoothing.
        data[category]['words'] += 1
        data[category]['docs'] += 1
        for word in file_words:
            if word not in words:
                words[word] = {cat: 1 for cat in categories}
            words[word][category] += 1
            data[category]['words'] += 1
print('Text reading completed in', round(time.time() - start, 3), 'sec.')

print('Counting probabilities...')
# P(cat | word): each word's raw distribution over categories.
word_probability = {}
for word in words:
    word_count = sum(words[word].values())
    word_probability[word] = {cat: words[word][cat] / word_count
                              for cat in categories}
print('Probability counting completed in', round(time.time() - start, 3), 'sec.')

print('Normalization...')
# Correct each word's distribution for category size and renormalize, then
# take the log so per-word scores can be summed during recognition.
word_probability_ln = {}
for word in words:
    denominator = sum(word_probability[word][cat] / data[cat]['words']
                      for cat in categories)
    word_probability_ln[word] = {
        cat: math.log((word_probability[word][cat] / data[cat]['words'])
                      / denominator)
        for cat in categories}
print('Normalization completed in', round(time.time() - start, 3), 'sec.')

documents_count = sum(data[cat]['docs'] for cat in categories)
# Log prior P(cat), estimated from document frequencies.
category_prob_ln = {cat: math.log(data[cat]['docs'] / documents_count)
                    for cat in categories}
print('Training completed in', round(time.time() - start, 3), 'sec.')

print('Recognition...')
# Fallback score for words never seen during training, normalized the same
# way as the known-word probabilities.
unknown_word_probability = 0.1
denominator = sum(unknown_word_probability / data[cat]['words']
                  for cat in categories)
unknown_prob_ln = {cat: math.log((unknown_word_probability / data[cat]['words'])
                                 / denominator)
                   for cat in categories}

with codecs.open('news_test.txt', 'r', 'utf8') as test, \
        open('output.txt', 'w+') as output:
    for line in test:
        file_words = _clean(line).split()
        # Bug fix: the original built a dict keyed by the float score and
        # looked up p[max(p)], so two categories with an identical score
        # would silently collide. Take the argmax over categories directly.
        best = max(categories,
                   key=lambda cat: category_prob_ln[cat] + sum(
                       word_probability_ln.get(word, unknown_prob_ln)[cat]
                       for word in file_words))
        output.write(best + '\n')
print('Recognition completed in', round(time.time() - start, 3), 'sec.')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement