daily pastebin goal
59%
SHARE
TWEET

Untitled

a guest Sep 14th, 2018 57 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. from pathlib import Path
  2.  
  3. import pandas as pd
  4. import dask.dataframe as dd
  5.  
  6. ordinal_columns = pd.Index([
  7.     'category_0',
  8.     'category_1',
  9.     'category_2',
  10.     'category_3',
  11.     'category_4',
  12.     'category_6',
  13.     'category_7',
  14.     'category_9',
  15.     'category_10',
  16.     'category_11',
  17.     'category_13',
  18.     'category_14',
  19.     'category_17',
  20.     'category_19',
  21.     'category_20',
  22.     'category_21',
  23.     'category_22',
  24.     'category_23',
  25. ])
  26.  
  27. onehot_columns = pd.Index([
  28.     'category_5',
  29.     'category_8',
  30.     'category_12',
  31.     'category_15',
  32.     'category_16',
  33.     'category_18',
  34.     'category_24',
  35.     'category_25',
  36. ])
  37.  
  38.  
  39. def main():
  40.     categories = ['category_%d' % i for i in range(26)]
  41.     columns = ['click'] + ['numeric_%d' % i for i in range(13)] + categories
  42.     encoding = {c: 'bytes' for c in categories}
  43.     fixed = {c: 8 for c in categories}
  44.  
  45.     chunker = pd.read_csv('data/day_0', sep='\t',
  46.                           names=columns, header=None,
  47.                           chunksize=100000,
  48.                           dtype={col: 'category' for col in onehot_columns})
  49.  
  50.     Path('data/split').mkdir(exist_ok=True)
  51.  
  52.     for i, df in enumerate(chunker):
  53.         print(f"Writing, {i:0>6}")
  54.         df.to_parquet(f'data/split/{i:0>6}.parquet',
  55.                       object_encoding=encoding,
  56.                       engine='fastparquet',
  57.                       fixed_text=fixed,
  58.                       compression='SNAPPY')
  59.  
  60.  
  61. if __name__ == '__main__':
  62.     main()
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top