Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import sklearn_crfsuite
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate, train_test_split
from sklearn_crfsuite import metrics, scorers
def doc2features(doc, i):
    """Build the feature dictionary for the token at position ``i`` of ``doc``.

    Each record in ``doc`` is read by index: ``record[0]`` is the word and
    ``record[1]`` is its POS tag. Any further fields are ignored here.

    Args:
        doc: Indexable sequence of token records.
        i: Position of the token to describe.

    Returns:
        A dict with features of the current token, plus features of the
        previous token (or ``'BOS'`` when ``i`` is 0) and of the next token
        (or ``'EOS'`` when ``i`` is the last position).

    Raises:
        IndexError: If ``i`` is past the end of ``doc``.
    """
    word = doc[i][0]
    postag = doc[i][1]

    # Features from the current word.
    # NOTE: the key 'word.isdigit()' is spelled differently from its
    # siblings; it is kept as-is because feature names are part of the
    # data handed to the model.
    features = {
        'word.word': word,
        'word.isspace': word.isspace(),
        'postag': postag,
        'word.isdigit()': word.isdigit(),
    }

    # Features from the previous word.
    if i > 0:
        prevword = doc[i - 1][0]
        prev_postag = doc[i - 1][1]
        features['word.prevword'] = prevword
        features['word.previsspace'] = prevword.isspace()
        features['word.prepostag'] = prev_postag
        features['word.prevwordisdigit'] = prevword.isdigit()
    else:
        features['BOS'] = True  # Special "Beginning of Sequence" tag

    # Features from the next word.
    if i < len(doc) - 1:
        nextword = doc[i + 1][0]
        next_postag = doc[i + 1][1]
        features['word.nextword'] = nextword
        features['word.nextisspace'] = nextword.isspace()
        features['word.nextpostag'] = next_postag
        features['word.nextwordisdigit'] = nextword.isdigit()
    else:
        features['EOS'] = True  # Special "End of Sequence" tag

    return features
def extract_features(doc):
    """Return one feature dictionary per token of ``doc``, in token order.

    Args:
        doc: Sequence of token records, as accepted by ``doc2features``.

    Returns:
        A list with ``len(doc)`` entries; entry ``i`` is
        ``doc2features(doc, i)``. An empty ``doc`` gives an empty list.
    """
    # The index (not just the record) is needed because doc2features looks
    # at the neighbouring positions as well.
    return [doc2features(doc, i) for i in range(len(doc))]
def get_labels(doc):
    """Return the label of every record in ``doc``, in order.

    Args:
        doc: Iterable of 3-item records ``(token, postag, tag)``.

    Returns:
        A list holding the third item (``tag``) of each record.

    Raises:
        ValueError: If a record does not unpack into exactly three items.
    """
    # Only the tag is used; the underscore names mark the ignored fields.
    return [tag for (_token, _postag, tag) in doc]
# Train a CRF tagger on `datatofile` and print evaluation metrics.
# `datatofile` and `file_name` are not defined in this part of the file;
# they must already exist when this code runs.

# Split each document into its per-token features and its per-token tags.
X_data = [extract_features(doc) for doc in datatofile]  # pull out the words
y_data = [get_labels(doc) for doc in datatofile]  # pull out the tags

# Hold out 0.1 (10%) of the documents for testing.
# NOTE(review): no random_state is passed, so the split differs per run.
X, X_test, y, y_test = train_test_split(X_data, y_data, test_size=0.1)

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=500,
    all_possible_transitions=True,
    model_filename=file_name + "-pos.model0",  # name of the model file
)
crf.fit(X, y)  # train

# Score every label except 'O'. list.remove() raises ValueError when the
# value is absent, so only remove it if the trained model actually has it.
labels = list(crf.classes_)
if 'O' in labels:
    labels.remove('O')

y_pred = crf.predict(X_test)
e = metrics.flat_f1_score(
    y_test, y_pred, average='weighted', labels=labels
)
print(e)  # show the performance

# Order labels by everything after their first character, then by the
# first character (presumably to group B-/I- style variants of one class
# together -- confirm against the label scheme in use).
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0]),
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))
Add Comment
Please, Sign In to add comment