import multiprocessing
import datetime
import os
import time
from random import randint

import numpy
import pandas as pd

def printProgressBar(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='X'):
    """
    Call in a loop to create a terminal progress bar.
    @params:
        iteration - Required : current iteration (Int)
        total     - Required : total iterations (Int)
        prefix    - Optional : prefix string (Str)
        suffix    - Optional : suffix string (Str)
        decimals  - Optional : positive number of decimals in percent complete (Int)
        length    - Optional : character length of bar (Int)
        fill      - Optional : bar fill character (Str)
    Adapted from https://gist.github.com/snakers4/91fa21b9dda9d055a02ecd23f24fbc3d
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    # end='\r' keeps the cursor on the same line so each call redraws the bar in place
    print('\r%s: %s |%s| %s%% %s' % (datetime.datetime.now(), prefix, bar, percent, suffix), end='\r')
    # Print a newline on completion so later output starts on a fresh line
    if iteration == total:
        print()
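
# Illustrative call with hypothetical values; renders something like:
#   2024-01-01 12:00:00.000000: Progress: |XXXXX-----| 50.000% Complete
# printProgressBar(50, 100, prefix='Progress:', decimals=3, suffix='Complete', length=10)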

def process(df):
    """Process partitioned data and do something to generate your results."""
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # Simulate work; obviously remove this
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    time.sleep(randint(0, 2))
    result = []
    for item in df['IP']:
        result.append("Done processing %s" % item)
    return result

def handleOutput(result):
    """Handle results from process().

    Pool callbacks run one at a time in the parent process's result-handler
    thread, so appending to the results file here is safe without a lock.
    """
    with open('results.txt', 'a') as f:
        for item in result:
            f.write(item + '\n')
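
# Note (general multiprocessing.Pool behaviour, not specific to this script):
# callbacks block the result-handler thread, so keep handleOutput() quick or
# finished results will queue up behind it.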

if __name__ == '__main__':
    # Clean up the output from a previous run, if any
    if os.path.exists("results.txt"):
        os.remove("results.txt")

    # Create a fake workload; you'll probably want to supply your own list
    with open('ips.csv', 'w') as f:
        f.write('IP\n')
        for i in range(100000):
            f.write(str(i) + '\n')

    df = pd.read_csv("ips.csv", header=0)

    # Number of worker processes to use (multiprocessing.Pool forks processes, not threads)
    workers = 16

    # Partition the work so each task handles around 1,000 rows before it
    # finishes and its results are flushed via handleOutput()
    partition_size_target = 1000
    partitions = max(1, len(df) // partition_size_target)
    df_split = numpy.array_split(df, partitions)
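
    # Rough arithmetic for the defaults above: 100,000 rows / 1,000 rows per
    # partition = 100 tasks, spread across 16 workers (roughly 6-7 tasks each).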
    task_count = len(df_split)
    pool = multiprocessing.Pool(workers)
    result_objs = []
    for d in df_split:
        result = pool.apply_async(process, (d,), callback=handleOutput)
        result_objs.append(result)
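
    # apply_async() returns an AsyncResult immediately; each handleOutput()
    # callback fires in the parent process as its task completes, while the
    # loop below just polls the AsyncResults to report overall progress.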

    # Start at -1 so the first pass through the loop prints the initial progress
    previous_incomplete_count = -1
    while True:
        incomplete_count = sum(1 for x in result_objs if not x.ready())
        # Drop completed jobs from result_objs to save memory. I was processing
        # images, so the AsyncResults held bitmap data; you may not need this.
        result_objs = [x for x in result_objs if not x.ready()]
        if incomplete_count == 0:
            print('Complete')
            break
        if incomplete_count != previous_incomplete_count:
            printProgressBar(task_count - incomplete_count, task_count, prefix='Progress:', decimals=3, suffix='Complete', length=50)
            previous_incomplete_count = incomplete_count
        time.sleep(.5)

    pool.close()
    pool.join()
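
    # When run as-is, this should redraw a timestamped progress bar in place,
    # print 'Complete', and leave one "Done processing N" line per input row
    # in results.txt.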