import multiprocessing
import datetime
import os
import time
from random import randint

import numpy
import pandas as pd

def printProgressBar(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='X'):
    """
    Call in a loop to create a terminal progress bar.
    @params:
        iteration - Required : current iteration (Int)
        total     - Required : total iterations (Int)
        prefix    - Optional : prefix string (Str)
        suffix    - Optional : suffix string (Str)
        decimals  - Optional : positive number of decimals in percent complete (Int)
        length    - Optional : character length of bar (Int)
        fill      - Optional : bar fill character (Str)
    Adapted from https://gist.github.com/snakers4/91fa21b9dda9d055a02ecd23f24fbc3d
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    # end='\r' keeps the cursor on the same line so each call redraws the bar in place
    print('\r%s: %s |%s| %s%% %s' % (datetime.datetime.now(), prefix, bar, percent, suffix), end='\r')
    # Print a newline on completion so later output starts on a fresh line
    if iteration == total:
        print()
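
# Illustrative call with hypothetical values; renders something like:
#   2024-01-01 12:00:00.000000: Progress: |XXXXX-----| 50.000% Complete
# printProgressBar(50, 100, prefix='Progress:', decimals=3, suffix='Complete', length=10)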

def process(df):
    """Process partitioned data and do something to generate your results."""
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # Simulate work; obviously remove this
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    time.sleep(randint(0, 2))
    result = []
    for item in df['IP']:
        result.append("Done processing %s" % item)
    return result

def handleOutput(result):
    """Handle results from process().

    Pool callbacks run one at a time in the parent process's result-handler
    thread, so appending to the results file here is safe without a lock.
    """
    with open('results.txt', 'a') as f:
        for item in result:
            f.write(item + '\n')
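
# Note (general multiprocessing.Pool behaviour, not specific to this script):
# callbacks block the result-handler thread, so keep handleOutput() quick or
# finished results will queue up behind it.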

if __name__ == '__main__':
    # Clean up the output from a previous run, if any
    if os.path.exists("results.txt"):
        os.remove("results.txt")

    # Create a fake workload; you'll probably want to supply your own list
    with open('ips.csv', 'w') as f:
        f.write('IP\n')
        for i in range(100000):
            f.write(str(i) + '\n')

    df = pd.read_csv("ips.csv", header=0)

    # Number of worker processes to use (multiprocessing.Pool forks processes, not threads)
    workers = 16

    # Partition the work so each task handles around 1,000 rows before it
    # finishes and its results are flushed via handleOutput()
    partition_size_target = 1000
    partitions = max(1, len(df) // partition_size_target)
    df_split = numpy.array_split(df, partitions)
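
    # Rough arithmetic for the defaults above: 100,000 rows / 1,000 rows per
    # partition = 100 tasks, spread across 16 workers (roughly 6-7 tasks each).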
    task_count = len(df_split)
    pool = multiprocessing.Pool(workers)
    result_objs = []
    for d in df_split:
        result = pool.apply_async(process, (d,), callback=handleOutput)
        result_objs.append(result)
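
    # apply_async() returns an AsyncResult immediately; each handleOutput()
    # callback fires in the parent process as its task completes, while the
    # loop below just polls the AsyncResults to report overall progress.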

    # Start at -1 so the first pass through the loop prints the initial progress
    previous_incomplete_count = -1
    while True:
        incomplete_count = sum(1 for x in result_objs if not x.ready())
        # Drop completed jobs from result_objs to save memory. I was processing
        # images, so the AsyncResults held bitmap data; you may not need this.
        result_objs = [x for x in result_objs if not x.ready()]
        if incomplete_count == 0:
            print('Complete')
            break
        if incomplete_count != previous_incomplete_count:
            printProgressBar(task_count - incomplete_count, task_count, prefix='Progress:', decimals=3, suffix='Complete', length=50)
            previous_incomplete_count = incomplete_count
        time.sleep(.5)

    pool.close()
    pool.join()
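
    # When run as-is, this should redraw a timestamped progress bar in place,
    # print 'Complete', and leave one "Done processing N" line per input row
    # in results.txt.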