Not a member of Pastebin yet?
Sign up: it unlocks many cool features!
import multiprocessing
from multiprocessing import Pool

import numpy as np
import pandas as pd
# Default number of row-wise chunks the DataFrame is cut into.
num_partitions = 5
# Default worker-pool size: the CPU count reported by multiprocessing.
num_cores = multiprocessing.cpu_count()
def parallelize_dataframe(df, func, *, partitions=None, processes=None):
    """Apply ``func`` to row-wise chunks of ``df`` in worker processes.

    ``df`` is cut into consecutive row slices, each slice is handed to a
    ``multiprocessing.Pool`` worker that calls ``func`` on it, and the
    per-chunk results are concatenated in their original order.

    Args:
        df: DataFrame to process.
        func: Callable that takes one DataFrame chunk and returns a pandas
            object. ``Pool.map`` pickles it, so it must be importable by
            the workers (a module-level function rather than a lambda).
        partitions: Number of chunks to create. Defaults to the
            module-level ``num_partitions``.
        processes: Number of worker processes. Defaults to the
            module-level ``num_cores``.

    Returns:
        ``pd.concat`` of ``func(chunk)`` over all chunks.

    Raises:
        ValueError: If the number of chunks is smaller than 1.
    """
    n_chunks = num_partitions if partitions is None else partitions
    n_workers = num_cores if processes is None else processes
    if n_chunks < 1:
        raise ValueError("number of partitions must be at least 1")

    # Same sizing rule as np.array_split: the first `extra` chunks receive
    # one additional row. Slicing with iloc guarantees every chunk is still
    # a DataFrame and works for any chunk count, not just five.
    base, extra = divmod(len(df), n_chunks)
    chunks = []
    start = 0
    for i in range(n_chunks):
        stop = start + base + (1 if i < extra else 0)
        chunks.append(df.iloc[start:stop])
        start = stop

    # The with-block shuts the pool down even when func raises in a worker.
    with Pool(n_workers) as pool:
        results = pool.map(func, chunks)
    return pd.concat(results)
def square(x):
    """Return ``x`` raised to the power of two."""
    return x ** 2
def test_func(data):
    """Return ``data`` with a ``square`` column derived from ``col``.

    Args:
        data: DataFrame chunk that contains a ``col`` column.

    Returns:
        A new DataFrame holding the original columns plus ``square``,
        where each value is ``square`` applied to the matching ``col``
        entry. The frame passed in is left unmodified.
    """
    print("Process working on: ", data)
    # assign() builds a new frame instead of writing into the argument,
    # so a caller's DataFrame (or a slice of it) is never altered.
    return data.assign(square=data["col"].apply(square))
# Demo input: a single integer column with the values 0 through 9.
df = pd.DataFrame({'col': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]})

# Only the process that runs this file as a script starts the pool; the
# guard keeps worker processes that import this module from doing so too.
if __name__ == '__main__':
    test = parallelize_dataframe(df, test_func)
    print(test)
Add Comment
Please sign in to add a comment.