Advertisement
Guest User

ocr-estimating blanks

a guest
Apr 19th, 2019
133
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.58 KB | None | 0 0
  1. import numpy as np
  2. import re
  3.  
# Sample input for parse_table_data: five whitespace-separated table rows.
# The first two rows contain no '%' values while the last three contain two
# each, so the rows do not all split into the same number of tokens.
a = """1     Cold Pursuit                                FILMONE        TIMES               1      44    10,820,287                  251,635      7,765                      10,820,287       7,765
2      Escape Room                                  SILVERBIRD      SONY                  1       38     7,429,412                    200,795       5,962                         8,255,212         6,411
3        Upside, The                                               FILMONE           TIMES                      4         46       3,512,149          -50%          97,560          2,678             -50%           37,883,002         36,093
4       Glass                                                 CRIMSON         DISNEY                  4        37      3,265,321         -56%        108,844        2,506            -54%          51,076,046        44,259
5       Aquaman                                            FILMONE          WARNER BROS        8        47      2,935,951         -55%         91,748        2,497            -51%         447,605,241       305,609"""
  9.  
  10.  
  11. def group_known_terms(data):
  12.     """This is not a good way of doing this but you can add any other multi word terms here (or delete it and come up with something better)"""
  13.     known_terms  = ['WARNER BROS']
  14.     for term in known_terms:
  15.         for i,row in enumerate(data):
  16.             if term in " ".join(row):
  17.                 rng = (data[i].index(term.split(' ')[0]),data[i].index(term.split(' ')[-1]))
  18.                 data[i][rng[0]:rng[-1] + 1] = [" ".join(data[i][rng[0]:rng[-1] + 1])]
  19.     return data
  20.  
  21. def group_titles(data):
  22.     """Assumes all words in title are capitalized"""
  23.     inds = [[i for i,x in enumerate(row) if x.istitle()] for row in data]
  24.     for i,indexes in enumerate(inds):
  25.         assert all(x==indexes[i-1]+1 if i is not 0 is not None else True for i,x in enumerate(indexes)), \
  26.             "Detected Title-like String in wrong position in row {} of data:\n{}".format(i,data[i])
  27.         data[i][indexes[0]:indexes[-1]+1] = [' '.join(data[i][indexes[0]:indexes[-1]+1])]
  28.     return data
  29.  
  30. def validate_data_size(data, column_size):
  31.     for i,row in enumerate(data):
  32.         assert len(row) <= column_size, \
  33.             '{} columns were detected when {} were expected in row {} \n {}\n Consider adding terms with more than one word to list of known terms'.format(
  34.                 len(row), column_size, i, data[i]
  35.             )
  36.  
  37. def get_type(val):
  38.     """We can use datatypes to estimate missing data. this is not an exhaustive method but should work for this"""
  39.     if '%' in val:
  40.         return 'percen'
  41.     try:
  42.         int(val)
  43.         return 'int'
  44.     except:
  45.         return 'up_string' if val.isupper() else 'string'
  46.  
  47. def apply_blanks(row, invalid_schema, valid_schema):
  48.     """Attempt to match valid and invalid schemas and return indexes for where blanks were inserted"""
  49.     if len(invalid_schema)==len(valid_schema) and all([x==y for x,y in zip(invalid_schema,valid_schema)]):
  50.         return row
  51.     for i, item in enumerate(invalid_schema):
  52.         if item != valid_schema[i]:
  53.             row.insert(i,'')
  54.             invalid_schema.insert(i, valid_schema[i])
  55.             return apply_blanks(row, invalid_schema, valid_schema)
  56.  
  57. def estimate_missing_data(data, avg_data_size):
  58.     """We create a schema of sorts to define how the data should look and then compare each line of the invalid data to
  59.    the schema and try to match"""
  60.     p_valid_data = [x for x in data if len(x) == len(avg_data_size)]
  61.     invalid_data = [x for x in data if len(x) != len(avg_data_size)]
  62.     expected_types = [[get_type(x) for x in row] for row in p_valid_data]
  63.     assert all([all(x == types[0] for x in types) for types in np.transpose(expected_types)]),\
  64.         "Could not match types for all columns in data of correct shape"
  65.     expected_types = expected_types[0]
  66.     invalid_data_types = [[get_type(x) for x in row] for row in invalid_data]
  67.     for invalid, types in zip(invalid_data, invalid_data_types):
  68.         p_valid_data.append(apply_blanks(invalid, types, expected_types))
  69.     return data
  70.  
  71. def parse_table_data(raw_text):
  72.     rows = raw_text.split('\n')
  73.     data = [re.sub(' +',' ',x).split(' ') for x in rows]
  74.     data = group_titles(data)
  75.     data = group_known_terms(data)
  76.     column_size = max([len(x) for x in data])
  77.     validate_data_size(data, column_size)
  78.     average_data_group_size = np.mean([[len(i) for i in x] for x in data if len(x) == column_size], axis=0)
  79.     data = estimate_missing_data(data, average_data_group_size)
  80.     return data
  81.  
  82.  
# Run the parser on the sample table `a`; the returned rows are not stored or printed.
parse_table_data(a)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement