Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # from nltk import tokenize
- import ast
- import json
- import os
- import pyperclip
- from binaryornot.check import is_binary
def read_txt_file_basic(file_loc):
    """Read a UTF-8 text file and return its lines, whitespace-stripped.

    Args:
        file_loc: Path of the file to read.

    Returns:
        A list with one entry per line of the file, each with leading and
        trailing whitespace (including the line ending) removed.
        ``None`` if ``is_binary`` reports the file as binary, or if the file
        cannot be read (in which case a message is printed).
    """
    # Binary files are skipped entirely; callers get None.
    if is_binary(file_loc):
        return None
    try:
        # The context manager closes the handle even if reading fails part-way.
        with open(file_loc, 'r', encoding='utf-8') as fh:
            return [line.strip() for line in fh]
    except IOError as e:
        # Best-effort read: report the problem and fall back to None.
        print("unable to read file ({})".format(e))
        return None
# Load the splitter settings from "<script name>_config.txt" in the current
# working directory. The file is read by position: line 0 holds the
# abbreviations, line 1 the terminators, line 2 the wrappers.
script_name = os.path.splitext(os.path.basename(__file__))[0]
config_read = read_txt_file_basic(os.getcwd() + os.sep + script_name + '_config.txt')


def _config_value(line):
    """Return the text after the first '=' on a config line, stripped."""
    return line.split('=', 1)[1].strip()


# The abbreviations are written with single quotes in the config file; JSON
# needs double quotes, so they are swapped before parsing.
abbreviations_string = _config_value(config_read[0]).replace("'", '"')
terminators_string = _config_value(config_read[1])
wrappers_string = _config_value(config_read[2])

abbreviations_input = json.loads('{' + abbreviations_string + '}')

# Keep lower-case, Capitalized and UPPER-CASE variants of every entry.
abbreviations_low = {
    str(short).lower(): str(full).lower()
    for short, full in abbreviations_input.items()
}
abbreviations_cap = {
    str(short).capitalize(): str(full).capitalize()
    for short, full in abbreviations_input.items()
}
abbreviations_up = {
    str(short).upper(): str(full).upper()
    for short, full in abbreviations_input.items()
}

# Later variants win on key collisions: low, then cap, then up.
abbreviations = dict(abbreviations_low)
abbreviations.update(abbreviations_cap)
abbreviations.update(abbreviations_up)

# Terminators and wrappers are comma-separated Python string literals.
terminators = ast.literal_eval('[' + terminators_string + ']')
wrappers = ast.literal_eval('[' + wrappers_string + ']')
def find_sentences(paragraph):
    """Split *paragraph* into a list of sentences, in reading order.

    The text is consumed from the back: ``find_sentence_end`` reports where
    the last sentence boundary lies, everything after it is cut off as one
    sentence, and the search repeats on what is left until no boundary
    remains.

    Args:
        paragraph: The text to split.

    Returns:
        A list of sentence strings. Every sentence except the first is
        stripped of surrounding whitespace; the first (the final remainder)
        is returned as-is.
    """
    collected = []  # sentences gathered last-to-first
    remaining = paragraph
    while True:
        cut = find_sentence_end(remaining)
        if cut <= -1:
            break
        collected.append(remaining[cut:].strip())
        remaining = remaining[:cut]
    # Whatever is left has no further boundary: it is the first sentence.
    collected.append(remaining)
    return collected[::-1]
def find_sentence_end(paragraph):
    """Return the index just past the last inner sentence boundary, or -1.

    A boundary is an occurrence of a terminator (optionally followed by one
    closing wrapper) that
      * does not end exactly where a known abbreviation ends,
      * is not at the very end of the paragraph, and
      * is immediately followed by a space.

    Uses the module-level ``terminators``, ``wrappers`` and ``abbreviations``.

    Args:
        paragraph: The text to search.

    Returns:
        The largest qualifying end index (start + length of the terminator),
        or -1 when there is none.
    """
    total = len(paragraph)

    # Bare terminators first, then every terminator + wrapper combination.
    candidates = terminators + [mark + closer for closer in wrappers for mark in terminators]

    # (start, length) of every occurrence of every candidate.
    hits = [
        (position, len(token))
        for token in candidates
        for position in find_all(paragraph, token)
    ]

    # End offsets of every abbreviation occurrence; a terminator ending there
    # belongs to the abbreviation, not to a sentence.
    abbreviation_ends = {
        position + len(short)
        for short in abbreviations.keys()
        for position in find_all(paragraph, short)
    }
    hits = [hit for hit in hits if hit[0] + hit[1] not in abbreviation_ends]

    # When some candidate closes the paragraph itself, discard every candidate
    # sharing the latest start position so the final terminator is ignored.
    if any(begin + size == total for begin, size in hits):
        latest_begin = max(begin for begin, _ in hits)
        hits = [hit for hit in hits if hit[0] != latest_begin]

    # Keep only ends that are followed by a space inside the paragraph.
    ends = [
        begin + size
        for begin, size in hits
        if begin + size > total
        or (begin + size < total and paragraph[begin + size] == ' ')
    ]
    if not ends:
        return -1
    return max(ends)
def find_all(a_str, sub):
    """Yield the start index of each non-overlapping occurrence of *sub*.

    Matches are reported left to right. After a match the search resumes
    just past it, so overlapping occurrences are skipped
    (``find_all("aaaa", "aa")`` yields 0 and 2).

    Args:
        a_str: The string to search in.
        sub: The substring to look for.

    Yields:
        Integer start offsets into *a_str*.

    An empty *sub* yields nothing: ``str.find`` matches the empty string at
    every position and the cursor would never advance, so without this guard
    the generator would never terminate.
    """
    if not sub:
        return
    step = len(sub)
    start = a_str.find(sub)
    while start != -1:
        yield start
        start = a_str.find(sub, start + step)
if __name__ == '__main__':
    # Take the clipboard text, split it into sentences and put the result
    # back on the clipboard with every sentence followed by CRLF.
    tmp_sentences = find_sentences(pyperclip.paste())
    result = ''.join(sentence + '\r\n' for sentence in tmp_sentences)
    pyperclip.copy(result)
- # Alternative solution using nltk (not used: it makes the compiled executable too big)
- # result = ''
- # clipboard = pyperclip.paste()
- # tmp_sentences = tokenize.sent_tokenize(clipboard)
- # for sentence in tmp_sentences:
- # result += sentence + '\r\n'
- # pyperclip.copy(result)
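- # sentences_config.txt file structure: the file is named <script name>_config.txt and is read from the current working directory, so create it in the folder the script/compiled file is run from. Keep the three lines below in this order, without the leading # of course: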
- # abbreviations = 'dr.': 'doctor', 'mr.': 'mister', 'bro.': 'brother', 'bro': 'brother', 'mrs.': 'mistress', 'ms.': 'miss', 'jr.': 'junior', 'sr.': 'senior', 'i.e.': 'that is', 'e.g.': 'for example', 'vs.': 'versus'
- # terminators = '.', '!', '?'
- # wrappers = '"', "'", ')', ']', '}'
Advertisement
Add Comment
Please, Sign In to add comment