Advertisement
Not a member of Pastebin yet? Sign Up — it unlocks many cool features!
from multiprocessing import cpu_count
from multiprocessing import Pool

import numpy as np
import pandas as pd

from dask import dataframe as dd
from dask.multiprocessing import get
def parallelize_df_transform(df, transform_f, n_partitions=None):
    """Apply *transform_f* to *df* in parallel across worker processes.

    The frame is split row-wise into *n_partitions* chunks, each chunk is
    transformed in its own worker process, and the transformed chunks are
    concatenated back into a single DataFrame in their original order.

    Args:
        df: pandas DataFrame to transform.
        transform_f: picklable callable taking a DataFrame chunk and
            returning a DataFrame (must be importable by worker processes —
            a lambda or nested function will fail to pickle).
        n_partitions: number of chunks / worker processes. Defaults to the
            machine's CPU count, resolved at call time — the original
            ``n_partitions=cpu_count()`` default was evaluated once at
            import time and baked into the signature.

    Returns:
        A new DataFrame built from the transformed chunks.
    """
    if n_partitions is None:
        n_partitions = cpu_count()
    chunks = np.array_split(df, n_partitions)
    # Context manager guarantees the pool is torn down even if transform_f
    # raises in a worker; the original leaked the pool on any error because
    # close()/join() were only reached on success.
    with Pool(n_partitions) as pool:
        transformed = pool.map(transform_f, chunks)
    return pd.concat(transformed)
def featurise(df):
    """Add a ``username_email_sim`` feature column to *df*.

    Scores every row with ``some_func`` (defined elsewhere in this module —
    presumably a username/email similarity metric; confirm at its
    definition) and stores the result in a new ``username_email_sim``
    column.

    NOTE: mutates *df* in place and also returns it, so it works both as a
    direct call and as a ``parallelize_df_transform`` worker.
    """
    # `some_func` already takes a single row argument; the original wrapped
    # it in a redundant `lambda x: some_func(x)` indirection.
    df['username_email_sim'] = df.apply(some_func, axis=1)
    return df
# Featurise the whole frame using all CPU cores. NOTE(review): runs at
# module import time; assumes `a_data_frame` was already defined earlier in
# the file — confirm, it is not visible in this chunk.
a_data_frame = parallelize_df_transform(a_data_frame, featurise)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement