Advertisement
Guest User

Untitled

a guest
Mar 21st, 2019
464
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.65 KB | None | 0 0
  1. train_contents=[]
  2. train_labels=[]
  3. test_contents=[]
  4. test_labels=[]
  5. #  导入文件
  6. import os
  7. import io
  8. start=os.listdir(r'text classification/train')
  9. for item in start:
  10.     test_path='text classification/test/'+item+'/'
  11.     train_path='text classification/train/'+item+'/'
  12.     for file in os.listdir(test_path):
  13.         with open(test_path+file,encoding="GBK") as f:
  14.             test_contents.append(f.readline())
  15.             #print(test_contents)
  16.             test_labels.append(item)
  17.     for file in os.listdir(train_path):
  18.         with open(train_path+file,encoding='gb18030', errors='ignore') as f:
  19.             train_contents.append(f.readline())
  20.             train_labels.append(item)
  21. print(len(train_contents),len(test_contents))
  22.  
  23. # 导入stop word
  24. import jieba
  25. from sklearn import metrics
  26. from sklearn.naive_bayes import MultinomialNB  
  27. stop_words = [line.strip() for line in io.open('text classification/stop/stopword.txt').readlines()]
  28.  
  29. # 分词方式使用jieba,计算单词的权重
  30. tf = TfidfVectorizer(tokenizer=jieba.cut,stop_words=stop_words, max_df=0.5)
  31. train_features = tf.fit_transform(train_contents)
  32. print(train_features.shape)
  33.  
  34. 模块 4:生成朴素贝叶斯分类器
  35. # 多项式贝叶斯分类器
  36. clf = MultinomialNB(alpha=0.001).fit(train_features, train_labels)
  37.  
  38. 模块 5:使用生成的分类器做预测
  39. test_tf = TfidfVectorizer(tokenizer=jieba.cut,stop_words=stop_words, max_df=0.5, vocabulary=tf.vocabulary_)
  40. test_features=test_tf.fit_transform(test_contents)
  41.  
  42. print(test_features.shape)
  43. predicted_labels=clf.predict(test_features)
  44. print(metrics.accuracy_score(test_labels, predicted_labels))
  45.  
  46. # 最终结果0.925
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement