Advertisement
Guest User

stopwords_utils.py

a guest
Sep 20th, 2019
103
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.01 KB | None | 0 0
# import morfeusz2
# from stop_words import get_stop_words
  3.  
  4. word_class_name = {'noun': set(['subst', 'depr'])
  5. }
  6.  
  7.  
  8. def from_txt_file_to_list(path):
  9. file = open(path, "r")
  10. lines = list(map(lambda x: x.rstrip(), list(file.readlines())))
  11. print(lines)
  12. return lines
  13.  
  14.  
  15. def get_stopwords_from_db():
  16. return set() # set made of stop words from database
  17.  
  18.  
  19. def filter_word_form(word_form, morphologic_tag):
  20. if len(morphologic_tag.intersection(word_class_name.get(word_form))) > 0:
  21. return True
  22. return False
  23.  
  24.  
  25. class SentenceFilter:
  26. def __init__(self):
  27. self.database = [] # db connection here
  28. # self.stop_words = get_stop_words('pl') # get_stop_words_from_db()
  29. # self.analyser = morfeusz2.Morfeusz()
  30.  
  31. def extract_lemma_and_morphologic_tag(self, word):
  32. analysis_result = self.analyser.analyse(word)
  33. # print(analysis_result)
  34. for element in analysis_result:
  35. try:
  36. morphologic_tag = element[2][2]
  37. lemat = element[2][1]
  38. except IndexError:
  39. print('No word class avaliable after analysis in: ``extract_lemat_and_morphologic_tag``')
  40. morphologic_tag_set = set(morphologic_tag.split(':'))
  41. return lemat, morphologic_tag_set
  42.  
  43. def filter_sentence(self, sentence):
  44. words = list(filter(lambda y: y not in self.stop_words, sentence.split()))
  45. # print(words)
  46. sentence_filtered = list(
  47. filter(lambda x_y: filter_word_form('noun', x_y[1]), # python3 does not support tuple unpacking, that's why
  48. map(lambda z: self.extract_lemma_and_morphologic_tag(z), words)))
  49. return sentence_filtered
  50.  
  51.  
# input = "kto jest rektorem uczelni"
# print('input: ' + input)
# sentence_filtered = SentenceFilter().filter_sentence(input)
# print('output: ')
# for sentence in sentence_filtered:
#     print(" " + sentence[0])
  58. path = "polish_stopwords.txt"
  59. from_txt_file_to_list(path)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement