Advertisement
Guest User

matching-blocks.py

a guest
Oct 20th, 2022
102
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.73 KB | Software | 0 0
  1. #!/usr/bin/env python
  2. # FILE : matching-blocks.py
  3. # AUTHOR : Mike Fleetwood <mike.fleetwood@googlemail.com>
  4.  
  5. """
  6. Reads standard input in blocks of 1 MiB and reports, for each block,
  7. which other blocks (if any) have identical content.
  8.  
  9. It works by reading standard input once, calculating the MD5 hash of
  10. each 1 MiB block.  It then generates a list of every block with the same
  11. MD5 hash and reports the results.
  12.  
  13. Very hacky Python program!
  14. """
  15.  
  16. import os
  17. import hashlib
  18. import sys
  19.  
  20. BUFSIZE = 1024*1024
  21.  
  22. def read_md5_hashes(fd):
  23.     """Return list of md5 hashes of each block read from file
  24.    descriptor"""
  25.     #fd = os.open(fname, os.O_RDONLY)
  26.     md5_hash_list = []
  27.     while True:
  28.         data = os.read(fd, BUFSIZE)
  29.         if not data:
  30.             break
  31.         md5_hash = hashlib.md5(data).digest()
  32.         md5_hash_list.append(md5_hash)
  33.     #os.close(fd)
  34.     return md5_hash_list
  35.  
  36. # List of MD5 hashes.  List index is the block number, 0 upwards.
  37. md5_hash_list = read_md5_hashes(sys.stdin.fileno())
  38.  
  39. # Create dictionary keyed by MD5 hash of a list of all the block numbers
  40. # with the same hash.
  41. matching_blocks_dict = {}
  42. i = 0
  43. for md5_hash in md5_hash_list:
  44.     if md5_hash in matching_blocks_dict:
  45.         matching_blocks_dict[md5_hash].append(i)
  46.     else:
  47.         matching_blocks_dict[md5_hash] = [i]
  48.     i += 1
  49.  
  50. def blocks_delta_list(block_num):
  51.     """Return list of detlas to other identical blocks.
  52.    Limits results to first 12 matches."""
  53.     LIMIT = 12
  54.     matching_blocks = matching_blocks_dict[md5_hash_list[block_num]]
  55.     delta_list = []
  56.     i = 0
  57.     for bn in matching_blocks:
  58.         if i >= LIMIT:
  59.             break
  60.         if bn != block_num:
  61.             # Don't include own block number in the delta list.
  62.             delta_list.append(bn - block_num)
  63.             i += 1
  64.     return delta_list
  65.  
  66. def ellipsis_list(l):
  67.     """Return string representation of a list, except using ellipsis for
  68.    elements after 10"""
  69.     if len(l) <= 10:
  70.         return str(l)
  71.     else:
  72.         s = '[' + ', '.join(map(lambda x: str(x), l[:10])) + ', ...]'
  73.         return s
  74.  
  75. print ("Block   deltas to matching blocks")
  76. print ("------- -------------------------")
  77. prev_delta_list = None
  78. i = 0
  79. last_printed = 0
  80. for md5_hash in md5_hash_list:
  81.     delta_list = blocks_delta_list(i)
  82.     if delta_list != prev_delta_list:
  83.         if last_printed < i-2:
  84.             print('...')
  85.         if last_printed < i-1:
  86.             print("% 7d %s" % (i-1, ellipsis_list(prev_delta_list)))
  87.         print("% 7d %s" % (i, ellipsis_list(delta_list)))
  88.         last_printed = i
  89.     prev_delta_list = delta_list
  90.     i += 1
  91. if last_printed < i-2:
  92.     print('...')
  93. if last_printed < i-1:
  94.     print("% 7d %s" % (i-1, ellipsis_list(delta_list)))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement