daily pastebin goal
6%
SHARE
TWEET

CorpusManager

a guest Apr 16th, 2018 59 in 19 days
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import re
  2. import pymorphy2
  3.  
  4.  
  5. class CorpusManager:
  6.     morph = pymorphy2.MorphAnalyzer()
  7.  
  8.     def __init__(self, fileNames):
  9.         self.words = []
  10.         for fileName in fileNames:
  11.             with open(fileName, 'r', encoding='utf-8') as f:
  12.                 self.words += re.findall(r'\w+', f.read())
  13.         self.words = list(map(str.lower, self.words))
  14.         self.lexemes = [self.morph.parse(word)[0].normal_form for word in self.words]
  15.  
  16.     def findFrequency(self, lexeme):
  17.         count = self.lexemes.count(lexeme)
  18.         return 'Слово: {}\nДлина корпуса: {}\nЧисло слововхождений: {}\nЧастота встречаемости: {}'\
  19.             .format(lexeme, len(self.words), count, round(count / len(self.lexemes), 5))
  20.  
  21.     def getMorphInfo(self, word):
  22.         return self.morph.parse(word)[0].tag.cyr_repr
  23.  
  24.     def getWordforms(self, lexeme):
  25.         wordforms = set()
  26.         for word in self.words:
  27.             if self.morph.parse(word)[0].normal_form == lexeme:
  28.                 wordforms.add(word)
  29.         return wordforms
  30.  
  31.     def __isPhrases(self, firstTag, secondTag):
  32.         if 'Apro' in secondTag:
  33.             return False
  34.         if firstTag.number != secondTag.number:
  35.             return False
  36.         if secondTag.POS in ['VERB', 'ADJF', 'ADJS']:
  37.             return True
  38.  
  39.     def getPhrases(self, lexeme):
  40.         collocations = set()
  41.         tag_lexeme = self.morph.parse(lexeme)[0].tag
  42.  
  43.         for i in range(1, len(self.lexemes)-1):
  44.             if self.lexemes[i] == lexeme:
  45.                 tag_leftWord = self.morph.parse(self.words[i-1])[0].tag
  46.                 tag_rightWord = self.morph.parse(self.words[i+1])[0].tag
  47.  
  48.                 if self.__isPhrases(tag_lexeme, tag_leftWord):
  49.                     collocations.add(self.words[i-1] + ' ' + self.words[i])
  50.                 if self.__isPhrases(tag_lexeme, tag_rightWord):
  51.                     collocations.add(self.words[i] + ' ' + self.words[i+1])
  52.         return collocations
  53.  
  54.  
  55. fileNames = ['Essay_Winter_1', 'Essay_Winter_2', 'Essay_Winter_3']
  56. cm = CorpusManager(fileNames)
  57. wordToCheck = 'зима'
  58. print(cm.words)
  59. print(cm.lexemes)
  60. print(cm.findFrequency(wordToCheck))
  61. print('Морфологическая информация:', cm.getMorphInfo(wordToCheck))
  62. print('Словоформы:', cm.getWordforms(wordToCheck))
  63. print(cm.getPhrases(wordToCheck))
RAW Paste Data
Top