Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import pandas as pd

# Input CSV (must contain a 'CleansedQuery' column) and the CSV to produce.
ip_filename = '/home/wolfram/Downloads/DataParserData.csv'
op_filename = 'datafile.csv'

# Number of rows pandas loads per chunk while streaming the input.
chunksize = 1000

# Every generated per-key column is named prefix + key.
prefix = 'cleansedquery_'

# Shared state: the set of keys discovered while scanning the input, and a
# flag that is True until the first chunk (with its header) has been written.
all_keys = set()
first_write = True
def update_values(key_value, all_keys=all_keys):
    """Pad ``key_value`` in place with a 0 for every key it does not have.

    ``all_keys`` defaults to the module-level set, bound once when this
    function is defined. That set is filled in place afterwards, so calls
    made later still see every key that has been collected.

    Returns the same (mutated) ``key_value`` dict.
    """
    absent = all_keys - set(key_value)
    for name in absent:
        key_value[name] = 0
    return key_value
# First pass: stream only the CleansedQuery column and record the key part
# (the text before the first ':') of every space-separated token.
for chunk in pd.read_csv(ip_filename, chunksize=chunksize, usecols=['CleansedQuery']):
    for query in chunk['CleansedQuery']:
        all_keys.update(token.split(':')[0] for token in query.split(' '))
def _parse_query(query):
    """Turn a ``'key:count key:count'`` string into a ``{key: int(count)}`` dict."""
    fields_per_token = (token.split(':') for token in query.split(' '))
    return {fields[0]: int(fields[1]) for fields in fields_per_token}


# Second pass: widen every chunk with one integer column per known key
# (0 where the row does not mention that key) and stream it to op_filename.
for df in pd.read_csv(ip_filename, chunksize=chunksize):
    df['key_value'] = df.CleansedQuery.apply(_parse_query).apply(update_values)
    for key in all_keys:
        df[prefix + key] = [row[key] for row in df['key_value']]
    df.drop('key_value', inplace=True, axis=1)
    if first_write:
        # Passing the path (not an open handle) lets pandas open, flush and
        # close the file itself; 'w' truncates any previous output.
        df.to_csv(op_filename, mode='w')
        # Remember the header order so later chunks are written to match it.
        columns = df.columns
        first_write = False
    else:
        # Select columns (not rows) in the order of the header already on disk,
        # then append without repeating the header.
        df.loc[:, columns].to_csv(op_filename, mode='a', header=False)
Add Comment
Please, Sign In to add comment