Guest User

Untitled

a guest
Mar 8th, 2018
88
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.21 KB | None | 0 0
  1. import pandas as pd
  2.  
  3.  
  4. chunksize = 1000
  5. ip_filename = '/home/wolfram/Downloads/DataParserData.csv'
  6. op_filename = 'datafile.csv'
  7. prefix = 'cleansedquery_'
  8.  
  9. all_keys = set()
  10. first_write = True
  11.  
  12. def update_values(key_value, all_keys=all_keys):
  13. missing_keys = all_keys - set(key_value.keys())
  14. key_value.update({key:0 for key in missing_keys})
  15. return key_value
  16.  
  17. for df in pd.read_csv(ip_filename, chunksize=chunksize, usecols=['CleansedQuery']):
  18. df['keys'] = df.CleansedQuery.apply(lambda x: [y.split(':')[0] for y in x.split(' ')])
  19. all_keys.update({value for row in df['keys'].values for value in row})
  20. for df in pd.read_csv(ip_filename, chunksize=chunksize):
  21. df['key_value'] = df.CleansedQuery.apply(lambda x: {y.split(':')[0]: int(y.split(':')[1]) for y in x.split(' ')})
  22. df['key_value'] = df['key_value'].apply(update_values)
  23. for key in all_keys:
  24. df[prefix+key] = df['key_value'].apply(lambda x: x[key])
  25.  
  26. df.drop('key_value', inplace=True, axis=1)
  27. # df.sort_index(axis=1, inplace=True)
  28. if first_write:
  29. df.to_csv(open(op_filename, 'w'))
  30. columns = df.columns
  31. first_write = False
  32. else:
  33. df.loc[: columns].to_csv(open(op_filename, 'a'), header=False)
Add Comment
Please, Sign In to add comment