Advertisement
Guest User

Untitled

a guest
Jan 23rd, 2017
72
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.80 KB | None | 0 0
  1. # Created by Panchenko Anton. 15.01.2017.
  2.  
  3. import numpy as np
  4. import sklearn as sk
  5. import codecs
  6. from sklearn.datasets import base
  7. from sklearn.feature_extraction.text import CountVectorizer
  8. from sklearn.feature_extraction.text import TfidfTransformer
  9. from sklearn.linear_model import SGDClassifier
  10. from sklearn.pipeline import Pipeline
  11.  
  12. ident = {'science': 0, 'style': 1, 'culture': 2, 'life': 3, 'economics': 4,
  13. 'business': 5, 'travel': 6, 'forces': 7, 'media': 8, 'sport': 9}
  14.  
  15. revident = {0: 'science', 1: 'style', 2: 'culture', 3: 'life', 4: 'economics',
  16. 5: 'business', 6: 'travel', 7: 'forces', 8: 'media', 9: 'sport'}
  17.  
  18. heading = ['science', 'style', 'culture', 'life', 'economics',
  19. 'business', 'travel', 'forces', 'media', 'sport']
  20.  
  21. itrtr = 0
  22. target = np.zeros((60000,), dtype=np.int64)
  23. data = []
  24.  
  25. with codecs.open('news_train.txt', 'r','UTF-8') as f:
  26. for line in f:
  27. cur_line = line.split()
  28. data.append(line)
  29. target[itrtr] = ident[cur_line[0]]
  30. itrtr += 1
  31.  
  32. training = sk.datasets.base.Bunch(data=data, target=target, target_names=heading)
  33. target = np.zeros((15000,), dtype=np.int64)
  34. data = []
  35.  
  36. with codecs.open('news_test.txt', 'r','UTF-8') as input:
  37. for line in input:
  38. data.append(line)
  39.  
  40. test = sk.datasets.base.Bunch(data=data, target_names=heading)
  41.  
  42. clf = Pipeline([('vect', CountVectorizer()),
  43. ('tfidf', TfidfTransformer()),
  44. ('clf', SGDClassifier(loss='hinge', penalty='l2',
  45. alpha=1e-4, n_iter=5, random_state=42)), ])
  46.  
  47. clf = clf.fit(training.data, training.target)
  48.  
  49. result = clf.predict(test.data)
  50.  
  51. output = codecs.open('news_answer.txt', 'w','UTF-8')
  52. for i in range(15000):
  53. output.write(revident[result[i]] + '\n')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement