Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import numpy as np
- import re
- a = """1 Cold Pursuit FILMONE TIMES 1 44 10,820,287 251,635 7,765 10,820,287 7,765
- 2 Escape Room SILVERBIRD SONY 1 38 7,429,412 200,795 5,962 8,255,212 6,411
- 3 Upside, The FILMONE TIMES 4 46 3,512,149 -50% 97,560 2,678 -50% 37,883,002 36,093
- 4 Glass CRIMSON DISNEY 4 37 3,265,321 -56% 108,844 2,506 -54% 51,076,046 44,259
- 5 Aquaman FILMONE WARNER BROS 8 47 2,935,951 -55% 91,748 2,497 -51% 447,605,241 305,609"""
def group_known_terms(data, known_terms=None):
    """Merge known multi-word terms into a single cell in each row, in place.

    Each row of ``data`` is a list of whitespace-split tokens; wherever the
    tokens of a known term appear consecutively they are replaced by one
    joined cell (e.g. ['WARNER', 'BROS'] -> ['WARNER BROS']).

    Args:
        data: list of token lists; mutated in place.
        known_terms: optional list of multi-word strings to merge.
            Defaults to ['WARNER BROS'] (the original hard-coded list).

    Returns:
        The same ``data`` list, for chaining.
    """
    if known_terms is None:
        # Add any other multi-word terms here (or replace this mechanism
        # with something better, as the original author suggested).
        known_terms = ['WARNER BROS']
    for term in known_terms:
        words = term.split(' ')
        width = len(words)
        for row in data:
            # Scan for the exact consecutive token sequence. The original
            # used list.index() on the first and last word independently,
            # which could pick non-adjacent (or wrong) occurrences, and a
            # substring test on " ".join(row), which could match across
            # cell boundaries (e.g. "WARNER BROSNAN").
            for start in range(len(row) - width + 1):
                if row[start:start + width] == words:
                    row[start:start + width] = [term]
                    break  # at most one merge per row, like the original
    return data
def group_titles(data):
    """Join the run of Title-cased tokens in each row into one title cell, in place.

    Assumes all words in a title are capitalized (str.istitle), so e.g.
    ['1', 'Cold', 'Pursuit', 'FILMONE'] becomes ['1', 'Cold Pursuit', 'FILMONE'].
    ALL-CAPS tokens (distributors) are not istitle() and are left alone.

    Args:
        data: list of token lists; mutated in place.

    Returns:
        The same ``data`` list, for chaining.

    Raises:
        AssertionError: if title-like tokens in a row are not consecutive.
    """
    for i, row in enumerate(data):
        indexes = [j for j, token in enumerate(row) if token.istitle()]
        if not indexes:
            # Rows with no title-like tokens are left untouched.
            # (The original indexed indexes[0] unconditionally and crashed.)
            continue
        # Title tokens must form one contiguous run. The original wrote
        # `i is not 0 is not None`, which abuses `is` on an int literal
        # (implementation-defined, SyntaxWarning on modern CPython) and
        # reduces to `i != 0` anyway.
        assert all(b == a + 1 for a, b in zip(indexes, indexes[1:])), \
            "Detected Title-like String in wrong position in row {} of data:\n{}".format(i, data[i])
        row[indexes[0]:indexes[-1] + 1] = [' '.join(row[indexes[0]:indexes[-1] + 1])]
    return data
def validate_data_size(data, column_size):
    """Assert that no row is wider than the expected column count.

    NOTE(review): when ``column_size`` is computed as max(len(row)) by the
    caller, this check can never fire — it only triggers for a genuinely
    smaller expected width. Kept as-is to preserve behavior.

    Args:
        data: list of rows (token lists).
        column_size: maximum number of cells any row may have.

    Raises:
        AssertionError: naming the offending row, if one is too wide.
    """
    for row_num, cells in enumerate(data):
        width = len(cells)
        assert width <= column_size, \
            '{} columns were detected when {} were expected in row {} \n {}\n Consider adding terms with more than one word to list of known terms'.format(
                width, column_size, row_num, data[row_num]
            )
def get_type(val):
    """Classify a table cell so rows can be matched against a column schema.

    Not an exhaustive typing scheme, but sufficient for this table. Labels
    are only ever compared against each other by the schema-matching code,
    so the (misspelled) 'percen' label is kept byte-for-byte.

    Args:
        val: a single cell string.

    Returns:
        One of 'percen' (contains '%'), 'int' (parses as int),
        'up_string' (all-uppercase), or 'string' (anything else).
        Note comma-grouped numbers like '10,820,287' do not parse as
        int and therefore classify as 'string'.
    """
    if '%' in val:
        return 'percen'
    try:
        int(val)
        return 'int'
    except ValueError:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt etc.
        # int() on a non-numeric string raises exactly ValueError.
        return 'up_string' if val.isupper() else 'string'
def apply_blanks(row, invalid_schema, valid_schema):
    """Insert blank cells into ``row`` until its schema matches ``valid_schema``.

    Compares the row's type schema position by position against the valid
    schema; at each mismatch a '' cell is inserted (the row presumably lost
    that column) and matching resumes. Mutates ``row`` and
    ``invalid_schema`` in place.

    Args:
        row: the token list to pad; mutated in place.
        invalid_schema: type labels for ``row`` (see get_type); mutated.
        valid_schema: the expected type labels.

    Returns:
        The padded ``row``. The original implicitly returned None when
        ``invalid_schema`` was a strict *prefix* of ``valid_schema``
        (i.e. only trailing columns missing) — that case is now padded
        at the tail and returned correctly.

    Raises:
        IndexError: if ``row`` has more columns than ``valid_schema``
        (same as the original).
    """
    if invalid_schema == valid_schema:
        return row
    for i, item in enumerate(invalid_schema):
        if item != valid_schema[i]:
            row.insert(i, '')
            invalid_schema.insert(i, valid_schema[i])
            return apply_blanks(row, invalid_schema, valid_schema)
    # No internal mismatch: the row is missing trailing columns; pad them.
    while len(invalid_schema) < len(valid_schema):
        invalid_schema.append(valid_schema[len(invalid_schema)])
        row.append('')
    return row
def estimate_missing_data(data, avg_data_size):
    """We create a schema of sorts to define how the data should look and then compare each line of the invalid data to
    the schema and try to match"""
    # Only len(avg_data_size) is used: the expected number of columns.
    # (presumably a numpy array of per-column means from the caller — TODO confirm)
    p_valid_data = [x for x in data if len(x) == len(avg_data_size)]
    invalid_data = [x for x in data if len(x) != len(avg_data_size)]
    # Build the type schema from every full-width row and require them all
    # to agree column by column before trusting the first one as canonical.
    expected_types = [[get_type(x) for x in row] for row in p_valid_data]
    assert all([all(x == types[0] for x in types) for types in np.transpose(expected_types)]),\
        "Could not match types for all columns in data of correct shape"
    # NOTE(review): raises IndexError here if no row has the expected width.
    expected_types = expected_types[0]
    invalid_data_types = [[get_type(x) for x in row] for row in invalid_data]
    for invalid, types in zip(invalid_data, invalid_data_types):
        # apply_blanks mutates `invalid` in place; those row lists are the
        # same objects held by `data`, which is why returning `data` below
        # reflects the fixes. The p_valid_data.append(...) result itself is
        # never read again — it exists only to drive the mutation.
        p_valid_data.append(apply_blanks(invalid, types, expected_types))
    return data
def parse_table_data(raw_text):
    """Parse space-aligned table text into a list of rows (lists of cells).

    Pipeline: split lines into tokens, merge Title-cased runs and known
    multi-word terms into single cells, sanity-check the row widths, then
    pad rows that are missing columns with blanks.

    Args:
        raw_text: the raw table, one record per line.

    Returns:
        A list of equal-width row lists (rows are mutated in place).
    """
    lines = raw_text.split('\n')
    table = [re.sub(' +', ' ', line).split(' ') for line in lines]
    table = group_titles(table)
    table = group_known_terms(table)
    width = max(len(row) for row in table)
    validate_data_size(table, width)
    # Mean cell length per column over the full-width rows; only its length
    # (the expected column count) is consumed downstream.
    mean_cell_sizes = np.mean(
        [[len(cell) for cell in row] for row in table if len(row) == width],
        axis=0,
    )
    return estimate_missing_data(table, mean_cell_sizes)
- parse_table_data(a)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement