Advertisement
Guest User

Untitled

a guest
Aug 24th, 2019
98
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.55 KB | None | 0 0
  1. from multiprocessing import cpu_count
  2. from multiprocessing import Pool
  3. from dask import dataframe as dd
  4. from dask.multiprocessing import get
  5.  
  6.  
  7.  
  8. def parallelize_df_transform(df, transform_f, n_partitions=cpu_count()):
  9. df_split = np.array_split(df, n_partitions)
  10. pool = Pool(n_partitions)
  11. df = pd.concat(pool.map(transform_f, df_split))
  12. pool.close()
  13. pool.join()
  14. return df
  15.  
  16. def featurise(df):
  17. df['username_email_sim'] = df.apply(lambda x : some_func(x), axis=1)
  18. return df
  19.  
  20. a_data_frame = parallelize_df_transform(a_data_frame, featurise)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement