Guest User

Untitled

a guest
Sep 14th, 2018
95
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.43 KB | None | 0 0
  1. from pathlib import Path
  2.  
  3. import pandas as pd
  4. import dask.dataframe as dd
  5.  
  6. ordinal_columns = pd.Index([
  7. 'category_0',
  8. 'category_1',
  9. 'category_2',
  10. 'category_3',
  11. 'category_4',
  12. 'category_6',
  13. 'category_7',
  14. 'category_9',
  15. 'category_10',
  16. 'category_11',
  17. 'category_13',
  18. 'category_14',
  19. 'category_17',
  20. 'category_19',
  21. 'category_20',
  22. 'category_21',
  23. 'category_22',
  24. 'category_23',
  25. ])
  26.  
  27. onehot_columns = pd.Index([
  28. 'category_5',
  29. 'category_8',
  30. 'category_12',
  31. 'category_15',
  32. 'category_16',
  33. 'category_18',
  34. 'category_24',
  35. 'category_25',
  36. ])
  37.  
  38.  
  39. def main():
  40. categories = ['category_%d' % i for i in range(26)]
  41. columns = ['click'] + ['numeric_%d' % i for i in range(13)] + categories
  42. encoding = {c: 'bytes' for c in categories}
  43. fixed = {c: 8 for c in categories}
  44.  
  45. chunker = pd.read_csv('data/day_0', sep='\t',
  46. names=columns, header=None,
  47. chunksize=100000,
  48. dtype={col: 'category' for col in onehot_columns})
  49.  
  50. Path('data/split').mkdir(exist_ok=True)
  51.  
  52. for i, df in enumerate(chunker):
  53. print(f"Writing, {i:0>6}")
  54. df.to_parquet(f'data/split/{i:0>6}.parquet',
  55. object_encoding=encoding,
  56. engine='fastparquet',
  57. fixed_text=fixed,
  58. compression='SNAPPY')
  59.  
  60.  
  61. if __name__ == '__main__':
  62. main()
Add Comment
Please, Sign In to add comment