HW_ tonal_analysis
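# Tonal (sentiment) analysis of quotations: parse <speech>/<evaluation> pairs
# from XML, lemmatize the Russian text with pymorphy2, vectorize it, and train
# a LogisticRegression classifier over the labels '+', '-' and '0'.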
import re
import string
import xml.etree.ElementTree as ET

import nltk
import pymorphy2
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

nltk.download('punkt')
nltk.download('stopwords')


TEST_FILENAME = 'news_eval_test.xml'
TRAIN_FILENAME = 'news_eval_train.xml'


class Citation:
    """A quotation (speech fragment) paired with its tonality label."""

    def __init__(self, words, evaluation: str):
        self.words = words
        self.eval = evaluation  # '+', '-' or '0'

    def __str__(self):
        return f'{self.eval}: {self.words}'

    def __repr__(self):
        return self.__str__()


morph = pymorphy2.MorphAnalyzer()


def tokenize_me_please(file_text):
    """Tokenize, drop stop words, strip punctuation and lemmatize with pymorphy2."""
    tokens = nltk.word_tokenize(file_text)
    tokens = [i for i in tokens if i not in string.punctuation]
    # Keep negation and intensity words that matter for tonality, even though
    # NLTK lists them among the Russian stop words.
    stop_words = stopwords.words('russian')
    words_to_remove = ['все', 'нет', 'ни', 'ничего', 'без', 'никогда', 'наконец', 'больше', 'хорошо', 'лучше', 'нельзя',
                       'более', 'всегда', 'конечно', 'всю', 'такой', 'впрочем', 'так', 'вот', 'можно', 'даже', 'разве']
    stop_words = [w for w in stop_words if w not in words_to_remove]
    # Compare in lowercase so that capitalized stop words are filtered too,
    # then reduce each surviving token to its normal form.
    tokens = [morph.parse(re.sub(r'[^\w\s]', '', i).lower())[0].normal_form
              for i in tokens if i.lower() not in stop_words]
    # Drop tokens that became empty after punctuation stripping.
    return [i for i in tokens if i and not i.isspace()]
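
# A rough illustration of the tokenizer (exact output depends on the installed
# pymorphy2 dictionaries and NLTK data):
#   tokenize_me_please('Это «плохая» новость!')  ->  ['плохой', 'новость']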


def parse_xml(file: str) -> list:
    """Read <speech> quotations and their <evaluation> labels from an XML file."""
    tree = ET.parse(file)
    root = tree.getroot()
    corpus = []
    citations = []
    for elem in root.iter('speech'):
        corpus.append(tokenize_me_please(elem.text))
    i = 0
    for elem in root.iter('evaluation'):
        # Labels come surrounded by whitespace and line breaks.
        pair_eval = ''.join(elem.text.split())
        # Keep only unambiguous labels: positive, negative or neutral.
        if pair_eval in ['0', '+', '-']:
            citations.append(Citation(corpus[i], pair_eval))
        i += 1
    return citations
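
# The parser implies roughly this input layout (inferred from the tag names
# used above; the real files may wrap these elements differently):
#   <document>
#       <speech>текст цитаты ...</speech>
#       <evaluation>+</evaluation>
#   </document>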

citations_train = parse_xml(TRAIN_FILENAME)
citations_test = parse_xml(TEST_FILENAME)

def vectorized_corpus(parsed_list_train, parsed_list_test):
    """Join the token lists back into strings and vectorize the train/test sets."""
    out_list_train = []
    out_list_test = []
    y_train = []
    y_test = []

    for cite in parsed_list_train:
        out_list_train.append(' '.join(cite.words).strip())
        y_train.append(cite.eval)

    for cite in parsed_list_test:
        out_list_test.append(' '.join(cite.words).strip())
        y_test.append(cite.eval)

    # vectorizer = CountVectorizer()  # raw term counts
    vectorizer = HashingVectorizer(n_features=2**17)  # very slow and gives worse results
    # vectorizer = TfidfVectorizer()  # tf-idf weights
    X_train = vectorizer.fit_transform(out_list_train).toarray()
    X_test = vectorizer.transform(out_list_test).toarray()
    return X_train, y_train, X_test, y_test


X_train, y_train, X_test, y_test = vectorized_corpus(citations_train, citations_test)
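
# Note: the vectorizers return scipy.sparse matrices and LogisticRegression
# accepts them directly, so the .toarray() calls above (which allocate dense
# n_samples x 2**17 arrays) could be dropped. A standalone sketch with
# made-up toy data, assuming tf-idf features instead of hashing:
#
#   from sklearn.feature_extraction.text import TfidfVectorizer
#   from sklearn.linear_model import LogisticRegression
#
#   docs = ['хороший новость', 'плохой новость', 'обычный сообщение']
#   tfidf = TfidfVectorizer()
#   X = tfidf.fit_transform(docs)  # sparse CSR matrix, no .toarray()
#   clf = LogisticRegression(max_iter=1000).fit(X, ['+', '-', '0'])
#   print(clf.predict(tfidf.transform(['хороший сообщение'])))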


from sklearn import metrics
from sklearn.linear_model import LogisticRegression


model = LogisticRegression(C=1.0, class_weight='balanced', dual=False, fit_intercept=True, tol=0.0001,
                           intercept_scaling=1, max_iter=1000, penalty='l2', random_state=0,
                           solver='saga', multi_class='ovr', warm_start=True)

model.fit(X_train, y_train)
print(model)

# make predictions
expected = y_test
predicted = model.predict(X_test)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(model.score(X_test, y_test))
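
# Optional extra: per-class confusion matrix over the three labels.
print(metrics.confusion_matrix(expected, predicted, labels=['+', '-', '0']))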