#!/usr/bin/env python
import contextlib
import hashlib
import os
import random
import string
import tempfile
import timeit
@contextlib.contextmanager
def createdummyfiles():
    """
    Yield a generator of dummy files filled with random ASCII letters.

    One file is created per size class: file N holds 2**N chunks of
    128 random letters, for N in range(5, 25).
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        def _generate():
            for sizectr in range(5, 25):
                filename = "file" + str(sizectr) + ".txt"
                fullfilename = os.path.join(tmpdir, filename)
                with open(fullfilename, "w") as f:
                    for ctr in range(2 ** sizectr):
                        randomstring = "".join([random.choice(string.ascii_letters) for i in range(128)])
                        f.write(randomstring)
                print("File created: " + filename + " Size: " + str(os.path.getsize(fullfilename)))
                yield filename, fullfilename
        # Hand the generator back to the caller; the temporary directory
        # (and every file in it) is removed when the with-block exits.
        yield _generate
def hashchunks(testfile, blk_size):
    """Hash the file by reading and feeding it in blk_size chunks."""
    filehash = hashlib.md5()
    with open(testfile, "rb") as f:
        while True:
            read_data = f.read(blk_size)
            if not read_data:  # EOF reached
                break
            filehash.update(read_data)
    filehash.digest()
def hashcomplete(testfile):
    """Hash the file by reading it into memory in one go."""
    filehash = hashlib.md5()
    with open(testfile, "rb") as f:
        read_data = f.read()
        filehash.update(read_data)
    filehash.digest()
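# A quick sanity check (a minimal sketch, not part of the timing experiment):
# confirm that chunked and whole-file hashing produce the same MD5 digest.
# The helper name check_digests_match is illustrative, not from the original.
def check_digests_match(testfile, blk_size=4096):
    """Return True if chunked and whole-file MD5 digests agree."""
    chunked = hashlib.md5()
    with open(testfile, "rb") as f:
        while True:
            block = f.read(blk_size)
            if not block:
                break
            chunked.update(block)
    with open(testfile, "rb") as f:
        whole = hashlib.md5(f.read())
    return chunked.digest() == whole.digest()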
if __name__ == "__main__":
    result_list = []  # list (of lists) to record per-file stats
    with createdummyfiles() as dummy_files:
        for filename, fullfilename in dummy_files():
            result = []  # stats for this file
            filesize = os.path.getsize(fullfilename)
            # initialize trackers for the fastest chunked run
            least_time = float("inf")
            least_blk_size = 0
            num_iter = 100
            print(
                "File: {} Size: {} Number of iterations for timing: {}".format(
                    filename, filesize, num_iter
                )
            )
            result.append(filename)
            result.append(filesize)
            result.append(num_iter)
            # first, hash the file by breaking it up into smaller chunks,
            # trying block sizes from 2**6 to 2**20 bytes
            for ctr in range(6, 21):
                blk_size = 2 ** ctr
                # time via a callable so file paths need no quoting in a statement string
                exec_time = timeit.timeit(
                    lambda: hashchunks(fullfilename, blk_size), number=num_iter
                )
                if exec_time < least_time:
                    least_time = exec_time
                    least_blk_size = blk_size
            print("+++ Most efficient Chunk Size: {} Time taken: {}".format(least_blk_size, least_time))
            result.append(least_blk_size)
            result.append(least_time)
            # now hash the file all in one go
            timetaken_complete = timeit.timeit(
                lambda: hashcomplete(fullfilename), number=num_iter
            )
            print("+++ Time taken for hashing complete file: {}".format(timetaken_complete))
            result.append(timetaken_complete)
            print("====================================================================")
            result_list.append(result)
    for res in result_list:
        print(res)
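    # Hypothetical summary (not in the original paste): each result row is
    # [filename, filesize, num_iter, least_blk_size, least_time, timetaken_complete];
    # a ratio above 1.0 means chunked hashing beat reading the whole file at once.
    for fname, size, iters, blk, t_chunk, t_whole in result_list:
        print("{}: best chunk {} B, chunked is x{:.2f} faster".format(fname, blk, t_whole / t_chunk))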