daily pastebin goal
31%
SHARE
TWEET

Untitled

a guest Dec 11th, 2018 48 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. from sklearn_crfsuite import scorers,metrics
  2. from sklearn.metrics import make_scorer
  3. from sklearn.model_selection import cross_validate,train_test_split
  4. import sklearn_crfsuite
  5. def doc2features(doc, i):
  6.     word = doc[i][0]
  7.     postag = doc[i][1]
  8.     # Features from current word
  9.     features={
  10.         'word.word': word,
  11.         'word.isspace':word.isspace(),
  12.         'postag':postag,
  13.         'word.isdigit()': word.isdigit()
  14.     }
  15.     if i > 0:
  16.         prevword = doc[i-1][0]
  17.         postag1 = doc[i-1][1]
  18.         features['word.prevword'] = prevword
  19.         features['word.previsspace']=prevword.isspace()
  20.         features['word.prepostag'] = postag1
  21.         features['word.prevwordisdigit'] = prevword.isdigit()
  22.     else:
  23.         features['BOS'] = True # Special "Beginning of Sequence" tag
  24.     # Features from next word
  25.     if i < len(doc)-1:
  26.         nextword = doc[i+1][0]
  27.         postag1 = doc[i+1][1]
  28.         features['word.nextword'] = nextword
  29.         features['word.nextisspace']=nextword.isspace()
  30.         features['word.nextpostag'] = postag1
  31.         features['word.nextwordisdigit'] = nextword.isdigit()
  32.     else:
  33.         features['EOS'] = True # Special "End of Sequence" tag
  34.     return features
  35.  
  36. def extract_features(doc):
  37.     return [doc2features(doc, i) for i in range(len(doc))]
  38.  
  39. def get_labels(doc):
  40.     return [tag for (token,postag,tag) in doc]
  41.  
  42.  
  43. X_data = [extract_features(doc) for doc in datatofile] # เอา คำ แยกออกมา
  44. y_data = [get_labels(doc) for doc in datatofile] # เอา tag แยกออกมา
  45.  
  46. X, X_test, y, y_test = train_test_split(X_data, y_data, test_size=0.1) # แบ่ง 0.1 หรือ 10%
  47. crf = sklearn_crfsuite.CRF(
  48.     algorithm='lbfgs',
  49.     c1=0.1,
  50.     c2=0.1,
  51.     max_iterations=500,
  52.     all_possible_transitions=True,
  53.     model_filename=file_name+"-pos.model0" # ตั้งชื่อโมเดล
  54. )
  55. crf.fit(X, y); # train
  56.  
  57. labels = list(crf.classes_)
  58. labels.remove('O')
  59. y_pred = crf.predict(X_test)
  60. e=metrics.flat_f1_score(y_test, y_pred,
  61.                       average='weighted', labels=labels)
  62. print(e) # โชว์ประสิทธิภาพ
  63. sorted_labels = sorted(
  64.     labels,
  65.     key=lambda name: (name[1:], name[0])
  66. )
  67. print(metrics.flat_classification_report(
  68.     y_test, y_pred, labels=sorted_labels, digits=3
  69. ))
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand