Guest User

split text into sentences

a guest
Jan 11th, 2022
76
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.17 KB | None | 0 0
  1. # from nltk import tokenize
  2. import ast
  3. import json
  4. import os
  5. import pyperclip
  6. from binaryornot.check import is_binary
  7.  
  8.  
  9. def read_txt_file_basic(file_loc):
  10.     check_file_type = is_binary(file_loc)
  11.     if not check_file_type:
  12.         file_list = []
  13.         try:
  14.             fh = open(file_loc, 'r', encoding='utf-8')
  15.             lines = fh.readlines()
  16.             for line in range(len(lines)):
  17.                 file_list.append(lines[line].strip())
  18.             return file_list
  19.         except IOError as e:
  20.             print("unable to read file ({})".format(e))
  21.  
  22.  
  23. script_name = os.path.splitext(os.path.basename(__file__))[0]
  24. config_read = read_txt_file_basic(os.getcwd() + os.sep + script_name + '_config.txt')
  25.  
  26. abbreviations_string = str(config_read[0].split('=', 1)[1].strip()).replace("'", '"')
  27. terminators_string = str(config_read[1].split('=', 1)[1].strip())
  28. wrappers_string = str(config_read[2].split('=', 1)[1].strip())
  29.  
  30. abbreviations_input = json.loads('{' + abbreviations_string + '}')
  31. abbreviations_low = dict((str(j).lower(), str(k).lower()) for i, (j, k) in enumerate(abbreviations_input.items()))
  32. abbreviations_cap = dict((str(j).capitalize(), str(k).capitalize()) for i, (j, k) in enumerate(abbreviations_input.items()))
  33. abbreviations_up = dict((str(j).upper(), str(k).upper()) for i, (j, k) in enumerate(abbreviations_input.items()))
  34. abbreviations = {**abbreviations_low, **abbreviations_cap, **abbreviations_up}
  35.  
  36. terminators = ast.literal_eval('[' + terminators_string + ']')
  37. wrappers = ast.literal_eval('[' + wrappers_string + ']')
  38.  
  39.  
  40. def find_sentences(paragraph):
  41.     end = True
  42.     sentences = []
  43.     while end > -1:
  44.         end = find_sentence_end(paragraph)
  45.         if end > -1:
  46.             sentences.append(paragraph[end:].strip())
  47.             paragraph = paragraph[:end]
  48.     sentences.append(paragraph)
  49.     sentences.reverse()
  50.     return sentences
  51.  
  52.  
  53. def find_sentence_end(paragraph):
  54.     [possible_endings, contraction_locations] = [[], []]
  55.     contractions = abbreviations.keys()
  56.     sentence_terminators = terminators + [terminator + wrapper for wrapper in wrappers for terminator in terminators]
  57.     for sentence_terminator in sentence_terminators:
  58.         t_indices = list(find_all(paragraph, sentence_terminator))
  59.         possible_endings.extend(([] if not len(t_indices) else [[i, len(sentence_terminator)] for i in t_indices]))
  60.     for contraction in contractions:
  61.         c_indices = list(find_all(paragraph, contraction))
  62.         contraction_locations.extend(([] if not len(c_indices) else [i + len(contraction) for i in c_indices]))
  63.     possible_endings = [pe for pe in possible_endings if pe[0] + pe[1] not in contraction_locations]
  64.     if len(paragraph) in [pe[0] + pe[1] for pe in possible_endings]:
  65.         max_end_start = max([pe[0] for pe in possible_endings])
  66.         possible_endings = [pe for pe in possible_endings if pe[0] != max_end_start]
  67.     possible_endings = [pe[0] + pe[1] for pe in possible_endings if
  68.                         sum(pe) > len(paragraph) or (sum(pe) < len(paragraph) and paragraph[sum(pe)] == ' ')]
  69.     end = (-1 if not len(possible_endings) else max(possible_endings))
  70.     return end
  71.  
  72.  
  73. def find_all(a_str, sub):
  74.     start = 0
  75.     while True:
  76.         start = a_str.find(sub, start)
  77.         if start == -1:
  78.             return
  79.         yield start
  80.         start += len(sub)
  81.  
  82.  
  83. if __name__ == '__main__':
  84.     tmp_sentences = find_sentences(pyperclip.paste())
  85.     result = ''
  86.     for sentence in tmp_sentences:
  87.         result += sentence + '\r\n'
  88.     pyperclip.copy(result)
  89.     # too big compile solution
  90.     # result = ''
  91.     # clipboard = pyperclip.paste()
  92.     # tmp_sentences = tokenize.sent_tokenize(clipboard)
  93.     # for sentence in tmp_sentences:
  94.     #     result += sentence + '\r\n'
  95.     # pyperclip.copy(result)
  96.  
  97.  
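# Config file structure. The file must be named <script name>_config.txt (e.g. sentences_config.txt)
# and is read from the current working directory. Keep the three lines in this order (they are read
# by position) and omit the leading '# ' on each line: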
# abbreviations = 'dr.': 'doctor', 'mr.': 'mister', 'bro.': 'brother', 'bro': 'brother', 'mrs.': 'mistress', 'ms.': 'miss', 'jr.': 'junior', 'sr.': 'senior', 'i.e.': 'that is', 'e.g.': 'for example', 'vs.': 'versus'
  100. # terminators = '.', '!', '?'
  101. # wrappers = '"', "'", ')', ']', '}'
Advertisement
Add Comment
Please, Sign In to add comment