hashlib: find the optimal read buffer size

#!/usr/bin/env python3

import hashlib
import os
import random
import string
import timeit

targetpath = "/path/to/test/files/"

def createdummyfiles():
    '''
    Create a set of test files at targetpath filled with random strings.
    The loop bound controls both the number of files and their sizes:
    file N holds 2**N chunks of 128 random letters.
    '''
    for sizectr in range(5, 25):
        filename = "file" + str(sizectr) + ".txt"
        fullfilename = os.path.join(targetpath, filename)
        with open(fullfilename, 'w') as f:
            for ctr in range(2**sizectr):
                randomstring = "".join(random.choice(string.ascii_letters) for i in range(128))
                f.write(randomstring)
        print("File created: " + filename + " Size: " + str(os.path.getsize(fullfilename)))

def hashchunks(testfile, blk_size):
    '''MD5-hash testfile by reading it in blk_size chunks.'''
    filehash = hashlib.md5()
    with open(testfile, 'rb') as f:
        while True:
            read_data = f.read(blk_size)
            if not read_data:
                break
            filehash.update(read_data)
    filehash.digest()  # finalize; the value is discarded, only the timing matters

def hashcomplete(testfile):
    '''MD5-hash testfile by reading the whole file into memory at once.'''
    filehash = hashlib.md5()
    with open(testfile, 'rb') as f:
        read_data = f.read()
        filehash.update(read_data)
    filehash.digest()

if __name__ == '__main__':

    #createdummyfiles()  # uncomment to generate the test files once

    result_list = []  # list (of lists) recording the stats of every file

    for filename in os.listdir(targetpath):
        result = []  # stats for this file
        fullfilename = os.path.join(targetpath, filename)
        filesize = os.path.getsize(fullfilename)

        # initialize the running minimum
        least_time = float('inf')
        least_blk_size = 0

        num_iter = 100

        print("File: {} Size: {} Number of iterations for timing: {}".format(filename, filesize, num_iter))
        result.append(filename)
        result.append(filesize)
        result.append(num_iter)

        # first hash the file by reading it in chunks, trying block sizes
        # from 2**6 (64 B) up to 2**20 (1 MiB)
        for ctr in range(6, 21):
            blk_size = 2**ctr
            exec_time = timeit.timeit(lambda: hashchunks(fullfilename, blk_size),
                                      number=num_iter)
            if exec_time < least_time:
                least_time = exec_time
                least_blk_size = blk_size
        print("+++ Most efficient chunk size: {} Time taken: {}".format(least_blk_size, least_time))
        result.append(least_blk_size)
        result.append(least_time)

        # now hash the file all in one go
        timetaken_complete = timeit.timeit(lambda: hashcomplete(fullfilename),
                                           number=num_iter)
        print("+++ Time taken for hashing complete file: {}".format(timetaken_complete))
        result.append(timetaken_complete)
        print("====================================================================")
        result_list.append(result)

    for res in result_list:
        print(res)
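Once the timings point at a good block size, it can be baked into a small reusable helper. Below is a minimal sketch of that idea; the name hash_file and the 64 KiB default are illustrative assumptions, not values the benchmark above prescribes.

import hashlib

def hash_file(path, blk_size=65536):
    '''Return the hex MD5 digest of path, reading blk_size bytes at a time.
    blk_size defaults to an assumed "winning" size of 64 KiB; substitute
    whatever the benchmark reports as fastest on your machine.'''
    filehash = hashlib.md5()
    with open(path, 'rb') as f:
        while True:
            chunk = f.read(blk_size)
            if not chunk:
                break
            filehash.update(chunk)
    return filehash.hexdigest()

# Example usage (file20.txt is one of the dummy files generated above):
# print(hash_file("/path/to/test/files/file20.txt"))

On Python 3.11 and later, hashlib.file_digest(f, "md5") does this chunked reading internally, so the read loop can be replaced by a single call on an open binary file object.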