#!/usr/bin/env python3
import timeit
import hashlib
import os
import random
import string

targetpath = "/path/to/test/files/"
def createdummyfiles():
    '''
    Create a set of files at targetpath filled with random strings.
    The outer loop decides the number of files, and its counter also
    sets each file's size: 2**sizectr chunks of 128 characters.
    '''
    for sizectr in range(5, 25):
        filename = "file" + str(sizectr) + ".txt"
        fullfilename = os.path.join(targetpath, filename)
        with open(fullfilename, 'w') as f:
            for ctr in range(2 ** sizectr):
                # string.letters is Python 2 only; ascii_letters is the Python 3 name
                randomstring = "".join(random.choice(string.ascii_letters) for i in range(128))
                f.write(randomstring)
        print("File created: {} Size: {}".format(filename, os.path.getsize(fullfilename)))
def hashchunks(testfile, blk_size):
    '''Hash testfile with MD5 by reading it in blk_size-byte chunks.'''
    filehash = hashlib.md5()
    with open(testfile, 'rb') as f:
        while True:
            read_data = f.read(blk_size)
            if not read_data:
                break
            filehash.update(read_data)
    filehash.digest()  # digest is discarded; only the timing matters here
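# A minimal alternative sketch: on Python 3.11+ the standard library can do the
# chunked read itself via hashlib.file_digest(). The helper name below is ours;
# only the Python version is an assumption.
def hashchunks_stdlib(testfile):
    with open(testfile, 'rb') as f:
        return hashlib.file_digest(f, 'md5').digest()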
def hashcomplete(testfile):
    '''Hash testfile with MD5 by reading the whole file into memory at once.'''
    filehash = hashlib.md5()
    with open(testfile, 'rb') as f:
        read_data = f.read()
        filehash.update(read_data)
    filehash.digest()  # digest is discarded; only the timing matters here
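# A hypothetical third strategy for comparison: memory-map the file and hand the
# whole mapping to MD5, avoiding an explicit read loop. This is our sketch, not
# part of the original benchmark; to time it, add a matching timeit call below.
import mmap

def hashmmap(testfile):
    filehash = hashlib.md5()
    with open(testfile, 'rb') as f:
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            filehash.update(mm)
    filehash.digest()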
if __name__ == '__main__':
    # createdummyfiles()  # uncomment to (re)generate the test files
    result_list = []  # list (of lists) to record file stats
    for filename in os.listdir(targetpath):
        result = []  # list to record stats of the file
        fullfilename = os.path.join(targetpath, filename)
        filesize = os.path.getsize(fullfilename)
        # initialize counters; inf guarantees the first measurement wins
        least_time = float("inf")
        least_blk_size = 0
        num_iter = 100
        print("File: {} Size: {} Number of iterations for timing: {}".format(filename, filesize, num_iter))
        result.append(filename)
        result.append(filesize)
        result.append(num_iter)
        # first try hashing the file by breaking it up into smaller chunks
        for ctr in range(6, 21):
            blk_size = 2 ** ctr
            # passing a callable avoids the string/setup form of timeit and
            # keeps paths with quotes from breaking the statement
            exec_time = timeit.timeit(lambda: hashchunks(fullfilename, blk_size), number=num_iter)
            if exec_time < least_time:
                least_time = exec_time
                least_blk_size = blk_size
        print("+++ Most efficient chunk size: {} Time taken: {}".format(least_blk_size, least_time))
        result.append(least_blk_size)
        result.append(least_time)
        # now try hashing the file all in one go
        timetaken_complete = timeit.timeit(lambda: hashcomplete(fullfilename), number=num_iter)
        print("+++ Time taken for hashing complete file: {}".format(timetaken_complete))
        result.append(timetaken_complete)
        print("====================================================================")
        result_list.append(result)
    for res in result_list:
        print(res)
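# Example run (assuming the script is saved as, say, hash_timing.py and
# targetpath already contains the generated files):
#     $ python3 hash_timing.py
# Each row of result_list prints as:
#     [filename, filesize, num_iter, best_chunk_size, best_chunk_time, whole_file_time]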