SHARE
TWEET

stopwords_utils.py

a guest Sep 20th, 2019 80 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. # import morfeusz2
  2. # from stop_words import get_stop_words
  3.  
  4. word_class_name = {'noun': set(['subst', 'depr'])
  5.                    }
  6.  
  7.  
  8. def from_txt_file_to_list(path):
  9.     file = open(path, "r")
  10.     lines = list(map(lambda x: x.rstrip(), list(file.readlines())))
  11.     print(lines)
  12.     return lines
  13.  
  14.  
  15. def get_stopwords_from_db():
  16.     return set()  # set made of stop words from database
  17.  
  18.  
  19. def filter_word_form(word_form, morphologic_tag):
  20.     if len(morphologic_tag.intersection(word_class_name.get(word_form))) > 0:
  21.         return True
  22.     return False
  23.  
  24.  
  25. class SentenceFilter:
  26.     def __init__(self):
  27.         self.database = []  # db connection here
  28.         # self.stop_words = get_stop_words('pl')  # get_stop_words_from_db()
  29.         # self.analyser = morfeusz2.Morfeusz()
  30.  
  31.     def extract_lemma_and_morphologic_tag(self, word):
  32.         analysis_result = self.analyser.analyse(word)
  33.         # print(analysis_result)
  34.         for element in analysis_result:
  35.             try:
  36.                 morphologic_tag = element[2][2]
  37.                 lemat = element[2][1]
  38.             except IndexError:
  39.                 print('No word class avaliable after analysis in: ``extract_lemat_and_morphologic_tag``')
  40.             morphologic_tag_set = set(morphologic_tag.split(':'))
  41.         return lemat, morphologic_tag_set
  42.  
  43.     def filter_sentence(self, sentence):
  44.         words = list(filter(lambda y: y not in self.stop_words, sentence.split()))
  45.         # print(words)
  46.         sentence_filtered = list(
  47.             filter(lambda x_y: filter_word_form('noun', x_y[1]),  # python3 does not support tuple unpacking, that's why
  48.                    map(lambda z: self.extract_lemma_and_morphologic_tag(z), words)))
  49.         return sentence_filtered
  50.  
  51.  
  52. # input = "kto jest rektorem uczelni"
  53. # print('input: ' + input)
  54. # sentence_filtered = SentenceFilter().filter_sentence(input)
  55. # print('output: ')
  56. # for sentence in sentence_filtered:
  57. #     print("    " + sentence[0])
  58. path = "polish_stopwords.txt"
  59. from_txt_file_to_list(path)
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
Not a member of Pastebin yet?
Sign Up, it unlocks many cool features!
 
Top