#!/usr/bin/env python
import contextlib
import hashlib
import os
import random
import string
import tempfile
import timeit
@contextlib.contextmanager
def createdummyfiles():
    """
    Yield a generator of dummy files filled with random ASCII letters.

    One file is created per size class: file N holds 2**N chunks of
    128 random letters, for N in range(5, 25).
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        def _generate():
            for sizectr in range(5, 25):
                filename = "file" + str(sizectr) + ".txt"
                fullfilename = os.path.join(tmpdir, filename)
                with open(fullfilename, "w") as f:
                    for ctr in range(2 ** sizectr):
                        randomstring = "".join([random.choice(string.ascii_letters) for i in range(128)])
                        f.write(randomstring)
                print("File created: " + filename + " Size: " + str(os.path.getsize(fullfilename)))
                yield filename, fullfilename
        # Hand the generator back to the caller; the temporary directory
        # (and every file in it) is removed when the with-block exits.
        yield _generate
def hashchunks(testfile, blk_size):
    """Hash the file by reading and feeding it in blk_size chunks."""
    filehash = hashlib.md5()
    with open(testfile, "rb") as f:
        while True:
            read_data = f.read(blk_size)
            if not read_data:  # EOF reached
                break
            filehash.update(read_data)
    filehash.digest()
def hashcomplete(testfile):
    """Hash the file by reading it into memory in one go."""
    filehash = hashlib.md5()
    with open(testfile, "rb") as f:
        read_data = f.read()
        filehash.update(read_data)
    filehash.digest()
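# A quick sanity check (a minimal sketch, not part of the timing experiment):
# confirm that chunked and whole-file hashing produce the same MD5 digest.
# The helper name check_digests_match is illustrative, not from the original.
def check_digests_match(testfile, blk_size=4096):
    """Return True if chunked and whole-file MD5 digests agree."""
    chunked = hashlib.md5()
    with open(testfile, "rb") as f:
        while True:
            block = f.read(blk_size)
            if not block:
                break
            chunked.update(block)
    with open(testfile, "rb") as f:
        whole = hashlib.md5(f.read())
    return chunked.digest() == whole.digest()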
if __name__ == "__main__":
    result_list = []  # list (of lists) to record per-file stats
    with createdummyfiles() as dummy_files:
        for filename, fullfilename in dummy_files():
            result = []  # stats for this file
            filesize = os.path.getsize(fullfilename)
            # initialize trackers for the fastest chunked run
            least_time = float("inf")
            least_blk_size = 0
            num_iter = 100
            print(
                "File: {} Size: {} Number of iterations for timing: {}".format(
                    filename, filesize, num_iter
                )
            )
            result.append(filename)
            result.append(filesize)
            result.append(num_iter)
            # first, hash the file by breaking it up into smaller chunks,
            # trying block sizes from 2**6 to 2**20 bytes
            for ctr in range(6, 21):
                blk_size = 2 ** ctr
                # time via a callable so file paths need no quoting in a statement string
                exec_time = timeit.timeit(
                    lambda: hashchunks(fullfilename, blk_size), number=num_iter
                )
                if exec_time < least_time:
                    least_time = exec_time
                    least_blk_size = blk_size
            print("+++ Most efficient Chunk Size: {} Time taken: {}".format(least_blk_size, least_time))
            result.append(least_blk_size)
            result.append(least_time)
            # now hash the file all in one go
            timetaken_complete = timeit.timeit(
                lambda: hashcomplete(fullfilename), number=num_iter
            )
            print("+++ Time taken for hashing complete file: {}".format(timetaken_complete))
            result.append(timetaken_complete)
            print("====================================================================")
            result_list.append(result)
    for res in result_list:
        print(res)
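    # Hypothetical summary (not in the original paste): each result row is
    # [filename, filesize, num_iter, least_blk_size, least_time, timetaken_complete];
    # a ratio above 1.0 means chunked hashing beat reading the whole file at once.
    for fname, size, iters, blk, t_chunk, t_whole in result_list:
        print("{}: best chunk {} B, chunked is x{:.2f} faster".format(fname, blk, t_whole / t_chunk))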