import re
import string
import xml.etree.ElementTree as ET

import nltk
import pymorphy2
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

nltk.download('punkt')
nltk.download('stopwords')

TEST_FILENAME = 'news_eval_test.xml'
TRAIN_FILENAME = 'news_eval_train.xml'
class Citation:
    """A tokenized speech fragment paired with its sentiment label ('+', '-' or '0')."""

    def __init__(self, words, evaluation: str):
        self.words = words
        self.eval = evaluation

    def __str__(self):
        return f'{self.eval}: {self.words}'

    def __repr__(self):
        return self.__str__()
morph = pymorphy2.MorphAnalyzer()
# print(stopwords.words('russian'))
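# A quick, hedged sanity check of what MorphAnalyzer does before it is used in
# the tokenizer below (the sample words are illustrative, not corpus data):
# parse() returns candidate parses ranked by score, and .normal_form is the
# lemma of the top-ranked parse.
for sample in ('люди', 'говорили', 'хорошего'):
    print(sample, '->', morph.parse(sample)[0].normal_form)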
def tokenize_me_please(file_text):
    tokens = nltk.word_tokenize(file_text)
    tokens = [i for i in tokens if i not in string.punctuation]
    # Keep sentiment-bearing words that NLTK's Russian stopword list would drop.
    stop_words = stopwords.words('russian')
    words_to_remove = ['все', 'нет', 'ни', 'ничего', 'без', 'никогда', 'наконец', 'больше', 'хорошо', 'лучше', 'нельзя',
                       'более', 'всегда', 'конечно', 'всю', 'такой', 'впрочем', 'так', 'вот', 'можно', 'даже', 'разве']
    for word in words_to_remove:
        stop_words.remove(word)
    # Lowercase before the stopword check (the list is lowercase), strip
    # punctuation characters, then lemmatize with pymorphy2.
    tokens = [morph.parse(re.sub(r'[^\w\s]', '', i).lower())[0].normal_form
              for i in tokens if i.lower() not in stop_words]
    tokens = [i.replace('«', '').replace('»', '') for i in tokens]
    # Drop empty and whitespace-only tokens. The original removed items from
    # the list while iterating over it, which silently skips elements.
    tokens = [i for i in tokens if i and not i.isspace()]
    return tokens
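# Usage sketch on a made-up sentence (illustrative only, not corpus data);
# expect a list of lowercase lemmas with punctuation and most stopwords removed.
print(tokenize_me_please('Это была очень хорошая новость, говорили люди.'))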
def parse_xml(file: str) -> list:
    tree = ET.parse(file)
    root = tree.getroot()
    corpus = []
    citations = []
    for elem in root.iter('speech'):
        corpus.append(tokenize_me_please(elem.text))
    i = 0
    for elem in root.iter('evaluation'):
        pair_eval = elem.text.replace('\n', '')
        pair_eval = ''.join(pair_eval.split())
        # Keep only unambiguous labels, but advance i for every <evaluation>
        # element so the speech/evaluation pairing stays aligned when an
        # ambiguous label is skipped.
        if pair_eval in ['0', '+', '-']:
            citations.append(Citation(corpus[i], pair_eval))
        i += 1
    return citations
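# The parser only assumes that each quote carries a <speech> text and an
# <evaluation> label in matching document order. A minimal file it would
# accept (hypothetical structure, not the real news_eval markup) looks like:
#
# <document>
#   <citation>
#     <speech>Текст цитаты...</speech>
#     <evaluation>+</evaluation>
#   </citation>
# </document>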
citations_train = parse_xml(TRAIN_FILENAME)
citations_test = parse_xml(TEST_FILENAME)
def vectorized_corpus(parsed_list_train, parsed_list_test):
    out_list_train = []
    out_list_test = []
    y_train = []
    y_test = []
    # Rebuild each citation as a single space-joined string for the vectorizer.
    for cite in parsed_list_train:
        out_list_train.append(' '.join(cite.words).strip())
        y_train.append(cite.eval)
    for cite in parsed_list_test:
        out_list_test.append(' '.join(cite.words).strip())
        y_test.append(cite.eval)
    # vectorizer = CountVectorizer()  # raw term counts
    vectorizer = HashingVectorizer(n_features=2**17)  # computes very slowly and gives worse results
    # vectorizer = TfidfVectorizer()  # tf-idf weights
    # Fit on the training texts only, then reuse the same transformation on the
    # test texts. The matrices are kept sparse: densifying 2**17 features with
    # .toarray(), as the original did, needs gigabytes of memory, and
    # LogisticRegression accepts sparse input directly.
    X_train = vectorizer.fit_transform(out_list_train)
    X_test = vectorizer.transform(out_list_test)
    return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = vectorized_corpus(citations_train, citations_test)
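# A quick look at the label distribution is worth having before relying on
# class_weight='balanced' below; sentiment corpora are often skewed toward '0'.
from collections import Counter
print('train labels:', Counter(y_train))
print('test labels:', Counter(y_test))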
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

# Only non-default settings are spelled out; the original also listed the
# sklearn defaults (C=1.0, penalty='l2', tol=0.0001, ...) explicitly.
model = LogisticRegression(class_weight='balanced', max_iter=1000,
                           random_state=0, solver='saga', multi_class='ovr')
model.fit(X_train, y_train)
print(model)

# make predictions
expected = y_test
predicted = model.predict(X_test)

# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(model.score(X_test, y_test))
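# A small follow-up sketch: the per-class report above can be complemented with
# a confusion matrix to see which sentiment pairs get mixed up. The label order
# is fixed explicitly so rows and columns are readable.
labels = ['+', '-', '0']
print(labels)
print(metrics.confusion_matrix(expected, predicted, labels=labels))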