Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Created by Panchenko Anton. 15.01.2017.
- import numpy as np
- import sklearn as sk
- import codecs
- from sklearn.datasets import base
- from sklearn.feature_extraction.text import CountVectorizer
- from sklearn.feature_extraction.text import TfidfTransformer
- from sklearn.linear_model import SGDClassifier
- from sklearn.pipeline import Pipeline
# Category names in label-id order: the position of a name in this list is
# the integer class id used for training and prediction.
heading = [
    'science', 'style', 'culture', 'life', 'economics',
    'business', 'travel', 'forces', 'media', 'sport',
]
# Category name -> integer class id.
ident = {name: index for index, name in enumerate(heading)}
# Integer class id -> category name (inverse of `ident`).
revident = dict(enumerate(heading))
# Load the labelled training corpus. The first whitespace-delimited token of
# each line is the category name; the remainder of the line is the document.
data = []
labels = []
with codecs.open('news_train.txt', 'r', 'UTF-8') as f:
    for line in f:
        fields = line.split(None, 1)
        if not fields:
            # Blank line: there is no label to read, so skip it.
            continue
        # Keep the label token out of the document text so the target does
        # not leak into the features.
        # NOTE(review): assumes news_test.txt lines carry no leading label
        # column -- confirm against the data files.
        data.append(fields[1] if len(fields) > 1 else '')
        # An unknown category name raises KeyError here.
        labels.append(ident[fields[0]])
# Number of training samples actually read.
itrtr = len(data)
# Size the label vector from the data instead of a hard-coded row count, so
# data and target always have the same length.
target = np.array(labels, dtype=np.int64)
training = sk.datasets.base.Bunch(data=data, target=target, target_names=heading)
# Load the unlabelled test corpus: one document per line, kept verbatim
# (including the trailing newline).
with codecs.open('news_test.txt', 'r', 'UTF-8') as f:
    data = list(f)
# The test bunch has no target; only the documents and category names.
test = sk.datasets.base.Bunch(data=data, target_names=heading)
# Classification pipeline: token counts -> tf-idf weighting -> linear model
# trained with SGD on the hinge loss, L2-regularised, with a fixed seed.
steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-4, n_iter=5, random_state=42)),
]
# Fit on the training documents, then predict a class id for each test document.
clf = Pipeline(steps).fit(training.data, training.target)
result = clf.predict(test.data)
# Write one predicted category name per line, in test-document order.
# The context manager guarantees the file is flushed and closed; iterating
# over `result` itself writes exactly as many lines as there are predictions.
with codecs.open('news_answer.txt', 'w', 'UTF-8') as output:
    for label_id in result:
        output.write(revident[int(label_id)] + '\n')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement