Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- """Hash files, like md5sum or shasum, only faster. Works with all
- hash functions Python's hashlib supports."""
- from __future__ import print_function
- from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
- from hashlib import algorithms_available
- from hashlib import new as new_hash
- from io import open
- from multiprocessing import Pool, cpu_count
- from multiprocessing.dummy import Pool as ThreadPool
- from sys import stderr
class HashFile(object):
    """Callable object that hashes a single file with one hashlib algorithm.

    Instances can be used anywhere a one-argument function taking a file
    path is expected: calling the instance delegates to ``hash_file``.
    """

    def __init__(self, algo='sha1'):
        """Remember the name of the hash algorithm to use.

        ``algo`` is passed unchanged to ``hashlib.new`` for every file.
        """
        self.algo = algo

    def __call__(self, file_path):
        """Hash ``file_path``; see ``hash_file`` for the return value."""
        return self.hash_file(file_path)

    def hash_file(self, file_path):
        """Return ``(hex_digest, file_path)`` for the file at ``file_path``.

        The file is read in binary mode in chunks of roughly 1 MiB. If an
        ``IOError`` occurs while opening or reading, the error is printed
        to stderr and ``(None, file_path)`` is returned instead.

        NOTE(review): ``hexdigest()`` is called without a length, so
        variable-length digests (e.g. ``shake_128``) are presumably not
        supported here -- confirm before offering them to users.
        """
        file_hash = new_hash(self.algo)
        one_mb = 1024 * 1024
        # Protect against zero or no block_size
        block_size = max(file_hash.block_size, 1)
        # Feed ~1 MiB at a time to the hash algo, as a multiple of block_size.
        # Never let the read size drop to zero: read(0) returns b'' and
        # would end the loop before any data was hashed.
        buf_size = max(block_size * (one_mb // block_size), block_size)
        try:
            with open(file_path, 'rb') as f:
                # A freshly opened file is already at offset 0, so no seek
                # is needed; seeking would also fail on non-seekable inputs
                # such as FIFOs and prevent them from being hashed.
                for buf in iter(lambda: f.read(buf_size), b''):
                    file_hash.update(buf)
        except IOError as e:
            print(e, file=stderr)
            return None, file_path
        return file_hash.hexdigest(), file_path
class FileHashes(object):
    """Hash a collection of files concurrently with a worker pool."""

    def __init__(
        self, files=None, thread_count=None, use_real_procs=False,
        use_openssl=False, algo='sha1'
    ):
        """Store the hashing configuration.

        files          -- iterable of file paths; a falsy value becomes [].
        thread_count   -- number of pool workers; a falsy value falls back
                          to half the CPU count, but at least 1.
        use_real_procs -- use a multiprocessing process pool instead of a
                          thread pool.
        use_openssl    -- stored on the instance; not otherwise used by
                          this class.
        algo           -- hash algorithm name handed to ``HashFile``.
        """
        self.files = files if files else []
        self.thread_count = thread_count if thread_count else max(cpu_count() // 2, 1)
        self.use_real_procs = use_real_procs
        self.use_openssl = use_openssl
        self.algo = algo

    def hashes(self):
        """Yield ``(file_hash, file_path)`` for each successfully hashed file.

        Results are produced in the order of ``self.files``. Files for
        which the hasher returns a falsy hash (i.e. it reported an error)
        are skipped. The worker pool is shut down when the generator is
        exhausted, closed early, or fails.
        """
        pool_class = Pool if self.use_real_procs else ThreadPool
        pool = pool_class(self.thread_count)
        # Pool.imap can't pickle functions with non-dummy subprocesses,
        # but it can pickle instances of classes.
        # That is why the hash function is implemented as a callable class
        # instead of class methods.
        hasher = HashFile(self.algo)
        try:
            for file_hash, file_path in pool.imap(hasher, self.files):
                if file_hash:
                    yield file_hash, file_path
        finally:
            # Always release the workers. terminate() (rather than close())
            # so that abandoning the generator early does not wait for the
            # remaining files to be hashed.
            pool.terminate()
            pool.join()
class CommandLine(object):
    """Command-line front end: parse arguments, hash files, print results."""

    def __call__(self):
        """Parse the command line and print one line per hashed file.

        The parsed arguments are kept on ``self.args``. Each output line
        has the form ``<path> = <hex digest>``.
        """
        self.args = self._args()
        options = self.args
        hasher = FileHashes(
            files=options.files,
            thread_count=options.thread_count,
            use_real_procs=options.use_real_procs,
            algo=options.algo,
        )
        for digest, path in hasher.hashes():
            print(path, '=', digest)

    def _args(self):
        """Build the argument parser and return the parsed arguments."""
        # Default worker count: half the CPUs, but never fewer than one.
        default_workers = max(cpu_count() // 2, 1)
        known_algorithms = sorted(algorithms_available)

        parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
        parser.add_argument('files', nargs='+')
        parser.add_argument(
            '-a', '--algorithm',
            dest='algo',
            default='sha1',
            choices=known_algorithms,
            help='Hash algorithm to use',
        )
        parser.add_argument(
            '-t', '--threads',
            dest='thread_count',
            type=int,
            default=default_workers,
            help='Number of threads or processes to use',
        )
        parser.add_argument(
            '-p', '--real-processes',
            dest='use_real_procs',
            action='store_true',
            help='Use full processes instead of threads',
        )
        return parser.parse_args()
if __name__ == '__main__':
    # Script entry point: build the command-line front end and run it once.
    CommandLine()()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement