Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from POS.POSTagger import POSTagger as PPOSTagger
# Characters removed by cleanup_string: ASCII letters and digits, backslash,
# slash, newline/tab/carriage return, common ASCII punctuation, guillemets,
# and the plain space. Written without the invalid "\/" escape sequence that
# triggers a SyntaxWarning on recent Python versions.
_CLEANUP_CHARS = (
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "0123456789"
    "\\/"
    "\n\t\r"
    ".,:-<>!@#$%^&*+=?\"')([]«»"
    " "
)

# Built once at import time; maps every character above to "delete".
_CLEANUP_TABLE = str.maketrans("", "", _CLEANUP_CHARS)


def cleanup_string(text):
    """Return *text* with ASCII letters, digits, punctuation and spaces removed.

    Every character listed in ``_CLEANUP_CHARS`` is deleted, including the
    plain space, so words that were separated only by spaces end up
    concatenated. Any remaining trailing whitespace (characters that
    ``str.rstrip`` treats as whitespace but that are not in the deletion set)
    is stripped from the right-hand side.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; may be empty.
    """
    # One translation pass deletes the characters directly, instead of first
    # mapping them to spaces and then removing every space.
    return text.translate(_CLEANUP_TABLE).rstrip()
def process_text_with_perpos_tagger(input_string):
    """Tag *input_string* and return its cleaned, tag-filtered tokens.

    The input is whitespace-split and passed to the module-level
    ``perpos_tagger.parse``. Each returned entry is indexed as
    ``entry[0]`` (token text) and ``entry[1]`` (tag). Tokens whose tag is in
    the elimination list are dropped; the rest are joined with spaces and
    passed through ``cleanup_string``.

    Progress is printed to stdout (input, tagger output, reduced string).

    Args:
        input_string: The text to process.

    Returns:
        The cleaned string built from the kept tokens, or ``None`` when no
        token survives the filter or when any step raises an exception.
    """
    # Tags whose tokens are discarded. The original list contained 'PUNC '
    # with a trailing space, which would not match a plain 'PUNC' tag; both
    # spellings are listed so previous matches are preserved.
    # NOTE(review): confirm the exact tag strings emitted by perpos_tagger.
    elim_tags = (
        'N', 'Ne', 'P', 'Pe', 'RES', 'RESe', 'DET', 'DETe', 'CONJ', 'CONJe',
        'CL', 'POSTP', 'PRO', 'PROe', 'NUM', 'NUMe', 'PUNC ', 'PUNC',
    )
    try:
        # str.strip() returns a new string; the result must be kept.
        input_string = input_string.strip()
        print("input_string")
        print(input_string)

        tokenized_data = perpos_tagger.parse(input_string.split())
        print(tokenized_data)

        # Examine every entry. The previous loop shrank its own upper bound
        # each time a token was kept, so trailing tokens were never visited.
        reduced_list = [
            entry[0] for entry in tokenized_data if entry[1] not in elim_tags
        ]
        if not reduced_list:
            return None

        reduced_string = cleanup_string(" ".join(reduced_list))
        print("reduced string")
        print(reduced_string)
        return reduced_string
    except Exception as exc:
        # Best-effort contract: callers receive None on failure. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate, and the failure is reported instead of vanishing.
        print("process_text_with_perpos_tagger failed:", repr(exc))
        return None
Add Comment
Please, Sign In to add comment