# OCR: estimating blank (missing) cells in whitespace-separated table text

import numpy as np
import re

# Sample OCR text used as input for parse_table_data: one table row per line,
# with cells separated by runs of spaces. The first two rows contain no
# percentage cells, while the last three rows contain two each.
a = """1     Cold Pursuit                                FILMONE        TIMES               1      44    10,820,287                  251,635      7,765                      10,820,287       7,765
2      Escape Room                                  SILVERBIRD      SONY                  1       38     7,429,412                    200,795       5,962                         8,255,212         6,411
3        Upside, The                                               FILMONE           TIMES                      4         46       3,512,149          -50%          97,560          2,678             -50%           37,883,002         36,093
4       Glass                                                 CRIMSON         DISNEY                  4        37      3,265,321         -56%        108,844        2,506            -54%          51,076,046        44,259
5       Aquaman                                            FILMONE          WARNER BROS        8        47      2,935,951         -55%         91,748        2,497            -51%         447,605,241       305,609"""


def group_known_terms(data, known_terms=('WARNER BROS',)):
    """Merge the tokens of known multi-word terms into single cells.

    Each row is a list of string tokens. Wherever the words of a known term
    appear as consecutive, exactly-equal tokens, they are replaced by one
    token holding the words joined with a single space.

    Matching is done token by token rather than by substring search on the
    joined row, so a partial overlap such as ``['WARNER', 'BROSNAN']`` is left
    untouched, and an earlier stray occurrence of one of the words cannot
    produce a wrong slice.

    Args:
        data: list of rows (lists of str). Rows are modified in place.
        known_terms: iterable of space-separated multi-word terms to merge.
            Add further terms here when a row ends up with too many columns.

    Returns:
        The same ``data`` object, after merging.
    """
    for term in known_terms:
        words = term.split()
        width = len(words)
        if width < 2:
            # A one-word term already occupies a single cell.
            continue
        merged = ' '.join(words)
        for row in data:
            start = 0
            # Re-read len(row) on every pass: a merge shortens the row.
            while start + width <= len(row):
                if row[start:start + width] == words:
                    row[start:start + width] = [merged]
                start += 1
    return data

def group_titles(data):
    """Merge the consecutive title-case tokens of each row into one cell.

    A token counts as part of the title when ``str.istitle()`` is true for it,
    so every word of a title must be capitalised (a lowercase word inside a
    title breaks the run and is reported as an error). Rows that contain no
    title-case token are left unchanged.

    Args:
        data: list of rows (lists of str). Rows are modified in place.

    Returns:
        The same ``data`` object, after merging.

    Raises:
        AssertionError: if the title-case tokens of a row are not adjacent.
            Raised explicitly so the check also runs under ``python -O``.
    """
    for row_number, row in enumerate(data):
        title_positions = [pos for pos, cell in enumerate(row) if cell.istitle()]
        if not title_positions:
            # Nothing that looks like a title in this row; nothing to merge.
            continue
        first, last = title_positions[0], title_positions[-1]
        # The positions must form one unbroken run first..last.
        if title_positions != list(range(first, last + 1)):
            raise AssertionError(
                "Detected Title-like String in wrong position in row {} of data:\n{}".format(
                    row_number, row
                )
            )
        row[first:last + 1] = [' '.join(row[first:last + 1])]
    return data

def validate_data_size(data, column_size):
    """Check that no row of ``data`` has more than ``column_size`` cells.

    Returns nothing; fails with an AssertionError describing the first
    offending row (its width, the expected width, its index and contents).
    """
    template = (
        '{} columns were detected when {} were expected in row {} \n {}\n'
        ' Consider adding terms with more than one word to list of known terms'
    )
    for row_number, cells in enumerate(data):
        width = len(cells)
        assert not width > column_size, template.format(
            width, column_size, row_number, cells
        )

def get_type(val):
    """Classify a cell string into a coarse type label.

    The labels are used to compare the layout of rows with missing cells
    against complete rows. This is not exhaustive, only good enough for that.

    Args:
        val: the cell text (str).

    Returns:
        ``'percen'`` if the text contains ``%``; otherwise ``'int'`` if
        ``int(val)`` accepts it; otherwise ``'up_string'`` if
        ``val.isupper()`` and ``'string'`` for everything else. Numbers with
        thousands separators (e.g. ``'10,820,287'``) are rejected by ``int``
        and therefore come out as ``'string'``.
    """
    if '%' in val:
        return 'percen'
    try:
        int(val)
    except ValueError:
        # Not a plain integer literal: fall back to the two string labels.
        return 'up_string' if val.isupper() else 'string'
    return 'int'

def apply_blanks(row, invalid_schema, valid_schema):
    """Insert empty cells into ``row`` until its type layout matches the schema.

    Walks the valid schema left to right. Wherever the row's type at a
    position differs from the expected one, or the row has run out of cells,
    an empty string is inserted into ``row`` and the expected type into
    ``invalid_schema`` at that position. Missing trailing columns are padded
    as well.

    Args:
        row: list of cell strings; modified in place.
        invalid_schema: type labels of ``row``, one per cell; modified in
            place so that it mirrors ``row``.
        valid_schema: type labels of a complete row; not modified.

    Returns:
        ``row`` (the same list object), now as long as ``valid_schema``.

    Raises:
        IndexError: if the row cannot be aligned by insertions alone, i.e. it
            still has cells left over after the whole schema was consumed.
            ``row`` and ``invalid_schema`` may already have been modified.
    """
    position = 0
    while position < len(valid_schema):
        expected = valid_schema[position]
        exhausted = position >= len(invalid_schema)
        if exhausted or invalid_schema[position] != expected:
            # The cell expected here is missing: keep both lists in step.
            row.insert(position, '')
            invalid_schema.insert(position, expected)
        position += 1
    if len(invalid_schema) != len(valid_schema):
        raise IndexError(
            'row has {} cells after alignment but the schema expects {}: {}'.format(
                len(invalid_schema), len(valid_schema), row
            )
        )
    return row

def estimate_missing_data(data, avg_data_size):
    """Fill in blank cells for rows that are narrower or wider than expected.

    Rows whose length equals ``len(avg_data_size)`` are treated as complete.
    Their per-column type labels must agree with each other and serve as the
    schema. Every other row is handed to ``apply_blanks`` together with its
    own type labels; that call edits the row in place, which is why the
    original ``data`` list can simply be returned afterwards.
    """
    expected_width = len(avg_data_size)
    complete_rows, incomplete_rows = [], []
    for row in data:
        bucket = complete_rows if len(row) == expected_width else incomplete_rows
        bucket.append(row)

    complete_types = [[get_type(cell) for cell in row] for row in complete_rows]
    # Column by column, every complete row must carry the same type label.
    assert all(
        all(label == column[0] for label in column) for column in zip(*complete_types)
    ), "Could not match types for all columns in data of correct shape"
    schema = complete_types[0]

    # Type labels are computed for all incomplete rows before any row is edited.
    incomplete_types = [[get_type(cell) for cell in row] for row in incomplete_rows]
    for row, row_types in zip(incomplete_rows, incomplete_types):
        apply_blanks(row, row_types, schema)
    return data

def parse_table_data(raw_text):
    """Parse whitespace-separated table text into rows of equal-length cells.

    Steps: split each non-blank line into tokens, merge title words and known
    multi-word terms into single cells, take the widest row as the expected
    column count, then let ``estimate_missing_data`` insert empty cells into
    the narrower rows.

    Args:
        raw_text: the table text, one row per line.

    Returns:
        A list of rows (lists of str). An empty list if ``raw_text`` contains
        no non-blank line.
    """
    # str.split() with no argument splits on any run of whitespace (spaces or
    # tabs) and never yields empty tokens, so leading/trailing whitespace does
    # not create spurious cells. Blank lines produce no row at all.
    tokenized = (line.split() for line in raw_text.splitlines())
    data = [tokens for tokens in tokenized if tokens]
    if not data:
        return []

    data = group_titles(data)
    data = group_known_terms(data)

    column_size = max(len(row) for row in data)
    # column_size is the maximum row length, so this check cannot fail as
    # written; it is kept as a guard should column_size ever come from elsewhere.
    validate_data_size(data, column_size)

    # Mean cell length per column over the full-width rows; its length tells
    # estimate_missing_data how many columns a complete row has.
    average_data_group_size = np.mean(
        [[len(cell) for cell in row] for row in data if len(row) == column_size],
        axis=0,
    )
    return estimate_missing_data(data, average_data_group_size)


# Run the parser on the sample text `a` at import/run time. The returned rows
# are not stored or printed here.
parse_table_data(a)