# Untitled

#!/usr/bin/env python

"""Hash files, like md5sum or shasum, only faster. Works with all
hash functions Python's hashlib supports."""

from __future__ import print_function
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from hashlib import algorithms_available
from hashlib import new as new_hash
from io import open
from multiprocessing import Pool, cpu_count
from multiprocessing.dummy import Pool as ThreadPool
from sys import stderr


class HashFile(object):
    """Callable that hashes a single file with a hashlib algorithm.

    Instances hold only plain attributes, so they can be pickled and handed
    to a worker pool as the mapped function.

    :param algo: name of the hash algorithm, as accepted by ``hashlib.new``.
    :param xof_digest_size: number of digest bytes to request from
        variable-length algorithms (e.g. ``shake_128``), whose
        ``hexdigest`` requires an explicit length. Ignored by fixed-length
        algorithms.
    """

    def __init__(self, algo='sha1', xof_digest_size=32):
        self.algo = algo
        self.xof_digest_size = xof_digest_size

    def __call__(self, file_path):
        """Hash ``file_path``; see :meth:`hash_file`."""
        return self.hash_file(file_path)

    def hash_file(self, file_path):
        """Return ``(hex_digest, file_path)`` for the file at ``file_path``.

        If the file cannot be opened or read, the error is printed to
        stderr and ``(None, file_path)`` is returned instead of raising.
        """
        file_hash = new_hash(self.algo)
        one_mb = 1024 * 1024
        # Protect against zero or no block_size
        block_size = max(file_hash.block_size, 1)
        # Feed ~1 MiB at a time to the hash algo, as a multiple of
        # block_size. Never let the buffer size collapse to 0 (which would
        # make read() return nothing and hash an empty stream).
        buf_size = block_size * max(one_mb // block_size, 1)
        try:
            with open(file_path, 'rb') as f:
                # No seek here: a freshly opened file is already at offset
                # 0, and seeking fails on pipes/FIFOs that can be read fine.
                for buf in iter(lambda: f.read(buf_size), b''):
                    file_hash.update(buf)
        except IOError as e:
            print(e, file=stderr)
            return None, file_path

        try:
            digest = file_hash.hexdigest()
        except TypeError:
            # Variable-length digests need an explicit output length.
            digest = file_hash.hexdigest(self.xof_digest_size)

        return digest, file_path


class FileHashes(object):
    """Hash a list of files concurrently using a pool of workers.

    :param files: iterable of file paths to hash; defaults to an empty list.
    :param thread_count: number of workers; when falsy, half the CPU count
        is used (at least 1).
    :param use_real_procs: use a ``multiprocessing`` process pool instead of
        a thread pool.
    :param use_openssl: stored on the instance but not read by this class.
    :param algo: name of the hash algorithm passed on to ``HashFile``.
    """

    def __init__(
        self, files=None, thread_count=None, use_real_procs=False,
        use_openssl=False, algo='sha1'
    ):
        self.files = files if files else []
        self.thread_count = thread_count if thread_count else max(cpu_count() // 2, 1)
        self.use_real_procs = use_real_procs
        self.use_openssl = use_openssl
        self.algo = algo

    def hashes(self):
        """Yield ``(hex_digest, file_path)`` for each file that was hashed.

        Files whose hash came back falsy (i.e. the hasher reported a
        failure) are skipped. The worker pool is always shut down when the
        generator finishes or is closed early.
        """
        pool_factory = Pool if self.use_real_procs else ThreadPool
        pool = pool_factory(self.thread_count)

        # A real process pool has to pickle the mapped callable. Plain
        # instances of a module-level class pickle fine, which is why the
        # hash function is implemented as a callable class instead of a
        # method of this class.
        hasher = HashFile(self.algo)

        try:
            for file_hash, file_path in pool.imap(hasher, self.files):
                if file_hash:
                    yield file_hash, file_path
        finally:
            # Release the workers even if the consumer stops iterating
            # early; terminate() avoids waiting on work nobody will read.
            pool.terminate()
            pool.join()


class CommandLine(object):
    """Command-line front end: parse arguments, hash files, print results."""

    def __call__(self):
        """Parse the command line and print ``<path> = <digest>`` per file.

        The parsed arguments are kept on ``self.args``.
        """
        self.args = options = self._args()
        hasher = FileHashes(
            algo=options.algo,
            use_real_procs=options.use_real_procs,
            thread_count=options.thread_count,
            files=options.files,
        )

        for digest, path in hasher.hashes():
            print(path, '=', digest)

    def _args(self):
        """Build the argument parser and return the parsed namespace.

        Provides ``files``, ``algo``, ``thread_count`` and
        ``use_real_procs``.
        """
        default_workers = max(cpu_count() // 2, 1)
        known_algorithms = sorted(algorithms_available)

        cli = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
        cli.add_argument('files', nargs='+')
        cli.add_argument(
            '-a', '--algorithm',
            dest='algo',
            choices=known_algorithms,
            default='sha1',
            help='Hash algorithm to use',
        )
        cli.add_argument(
            '-t', '--threads',
            dest='thread_count',
            type=int,
            default=default_workers,
            help='Number of threads or processes to use',
        )
        cli.add_argument(
            '-p', '--real-processes',
            dest='use_real_procs',
            action='store_true',
            help='Use full processes instead of threads',
        )
        return cli.parse_args()


if __name__ == '__main__':
    # Executed as a script: build the command-line front end and run it.
    CommandLine()()