Advertisement
Guest User

Untitled

a guest
Oct 13th, 2019
132
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.84 KB | None | 0 0
  1. import itertools
  2. from fuzzywuzzy import fuzz, process
  3. import numpy as np
  4. import sys
  5.  
  6. # This script preprocesses the raw text and lets you query a phrase to find the target value next to it.
  7. # NOTE: The match score is NOT fully indicative of the actual accuracy; it depends on the search query and on any additional preprocessing (not yet implemented) applied to the text output.
  8. # NOTE: IMPORTANT! Make sure the query is both unique and LONG (several words). If you query only a few words, there will be many matches.
  9. # Test terminal command: python textract.py 'maximum amount of credit we may lend you is' AAoutput.txt 95 0
  10.  
  11. # preprocesses_text(path_to_text_file)
  12. def preprocesses_text(textfile):
  13.     with open(textfile, 'r') as f:
  14.         text_output = [line.split() for line in f.readlines()]
  15.     text_output = list(filter(None, text_output))
  16.     text_output = list(itertools.chain.from_iterable(text_output))
  17.     text_output = [item.lower() for item in text_output]
  18.     return text_output
  19.  
  20.  
  21. # get_value(search_string, text_output, [threshold], [position])
  22. def get_value(search_string, text_output, threshold=90, position=0):
  23.     search_string_list = search_string.split()
  24.     # Get list of substrings, to iterate fuzz
  25.     sub_string_list = []
  26.     for index, _ in enumerate(text_output):
  27.         sub_string = text_output[index: index+len(search_string_list)]
  28.         sub_string = ' '.join(sub_string)
  29.         sub_string_list.append(sub_string)
  30.  
  31.     # Get list of scores with fuzz
  32.     scores = []
  33.     for string in sub_string_list:
  34.         score = fuzz.ratio(string, search_string)
  35.         scores.append(score)
  36.  
  37.     # Get index of max score
  38.     max_score = np.amax(scores)
  39.     max_scores_indexes = [index for index,
  40.                           score in enumerate(scores) if score == max_score]
  41.     if len(max_scores_indexes) > 2:
  42.         raise Exception(
  43.             'Returned multiple best matches. Please try another query')
  44.  
  45.     if threshold is not None:
  46.         if max_score < threshold:
  47.             raise Exception('Best match does not exceed threshold')
  48.  
  49.     # Return value
  50.     if position == 0:
  51.         target_value_index = max_scores_indexes[0]+len(search_string_list)
  52.         return(text_output[target_value_index], max_score)
  53.     elif position == 1:
  54.         target_value_index = max_scores_indexes[0]-1
  55.         return(text_output[target_value_index], max_score)
  56.     else:
  57.         raise Exception('Please enter correct position')
  58.  
  59. if __name__ == '__main__':
  60.     if len(sys.argv) == 1:
  61.         text_output = preprocesses_text('AAoutput.txt')
  62.         search_string1 = 'maximum amount of credit we may lend you is'
  63.         search_string2 = 'total amount of principal'
  64.         search_string3 = 'draw down on the settlement date'
  65.         search_string4 = 'this is the credit limit'
  66.         value1, max_score1 = get_value(search_string1, text_output, threshold=95, position=0)
  67.         value2, max_score2 = get_value(search_string2, text_output, threshold=95, position=0)
  68.         value3, max_score3 = get_value(search_string3, text_output, threshold=95, position=0)
  69.         value4, max_score4 = get_value(search_string4, text_output, threshold=90, position=1)
  70.         print(f'Maximum Credit Amount: {value1}, Match Score: {max_score1}')
  71.         print(f'Total amount of principal: {value2}, Match Score: {max_score2}')
  72.         print(
  73.             f'Drawdown Fee: {value3}, Match Score: {max_score3}')
  74.         print(f'Maximum Credit Amount (reverse): {value4}, Match Score: {max_score4}')
  75.  
  76.  
  77.  
  78.        
  79.     else:
  80.         search_string = sys.argv[1]
  81.         text_output = preprocesses_text(sys.argv[2])
  82.         threshold = int(sys.argv[3])
  83.         position = int(sys.argv[4])
  84.         value, max_score = get_value(search_string, text_output, threshold=threshold, position=position)
  85.         print(f'{search_string}: {value}, Max Score: {max_score}')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement