Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- # FILE : matching-blocks.py
- # AUTHOR : Mike Fleetwood <mike.fleetwood@googlemail.com>
- """
- Reads standard input in blocks of 1 MiB and reports, for each block,
- which other blocks (if any) have identical content.
- It works by reading standard input once, calculating the MD5 hash of
- each 1 MiB block. It then generates a list of every block with the same
- MD5 hash and reports the results.
- Very hacky Python program!
- """
- import os
- import hashlib
- import sys
# Size in bytes of each block that is hashed (1 MiB).
BUFSIZE = 1024*1024


def read_md5_hashes(fd):
    """Return a list of the MD5 digests of each block read from a file
    descriptor.

    fd -- an open, readable OS-level file descriptor.  It is read until
          end of file and is not closed; the caller owns it.

    Each block is BUFSIZE bytes long, except possibly the last one, which
    holds whatever remained before end of file.  The list index is the
    block number, counting from 0.  An empty input gives an empty list.
    """
    md5_hash_list = []
    while True:
        md5 = hashlib.md5()
        remaining = BUFSIZE
        # A single os.read() may return fewer bytes than requested (pipes,
        # terminals, sockets), so keep reading until the block is full or
        # end of file is reached.  Otherwise block boundaries would depend
        # on how the data happened to arrive rather than on BUFSIZE.
        while remaining > 0:
            chunk = os.read(fd, remaining)
            if not chunk:
                break
            md5.update(chunk)
            remaining -= len(chunk)
        if remaining == BUFSIZE:
            # Nothing was read: end of file fell on a block boundary.
            break
        md5_hash_list.append(md5.digest())
        if remaining > 0:
            # A short block means end of file was seen; don't read again.
            break
    return md5_hash_list
# MD5 digest of every block read from standard input.  The position in
# the list is the block number, counting from 0.
md5_hash_list = read_md5_hashes(sys.stdin.fileno())

# Group the block numbers by digest: each key is an MD5 digest and its
# value is the list of block numbers that produced that digest, in the
# order the blocks were read.
matching_blocks_dict = {}
for block_num, digest in enumerate(md5_hash_list):
    matching_blocks_dict.setdefault(digest, []).append(block_num)
def blocks_delta_list(block_num):
    """Return the offsets from block_num to the other blocks that have the
    same MD5 digest.

    Each entry is another block's number minus block_num, so earlier
    blocks give negative values and later blocks give positive ones.  The
    block itself is left out, and no more than 12 offsets are returned.
    """
    LIMIT = 12
    same_digest_blocks = matching_blocks_dict[md5_hash_list[block_num]]
    deltas = []
    for other in same_digest_blocks:
        # Stop once enough matches have been collected.
        if len(deltas) >= LIMIT:
            break
        # A block is not reported as a match for itself.
        if other == block_num:
            continue
        deltas.append(other - block_num)
    return deltas
def ellipsis_list(l):
    """Return the string representation of a list, abbreviated after the
    first 10 elements.

    l -- the list to render.

    A list of 10 or fewer elements is rendered exactly as str(l).  A
    longer list shows its first 10 elements followed by ', ...' in place
    of the rest.
    """
    MAX_SHOWN = 10
    if len(l) <= MAX_SHOWN:
        return str(l)
    # str() of a list formats every element with repr(); do the same here
    # so an element looks the same whether or not the list is abbreviated.
    return '[' + ', '.join(map(repr, l[:MAX_SHOWN])) + ', ...]'
print("Block deltas to matching blocks")
print("------- -------------------------")

# One row per block: the block number followed by its delta list.  Runs
# of consecutive blocks with the same delta list are collapsed: only the
# first and last block of a run are printed, and '...' replaces any
# blocks skipped between them.
ROW_FORMAT = "% 7d %s"
block_count = len(md5_hash_list)
prev_delta_list = None
last_printed = 0
for i in range(block_count):
    delta_list = blocks_delta_list(i)
    if delta_list != prev_delta_list:
        # Block i starts a new run.  Finish the previous run first: mark
        # the skipped blocks, then show its final block if that has not
        # been printed already.
        if i - last_printed > 2:
            print('...')
        if i - last_printed > 1:
            print(ROW_FORMAT % (i - 1, ellipsis_list(prev_delta_list)))
        print(ROW_FORMAT % (i, ellipsis_list(delta_list)))
        last_printed = i
    prev_delta_list = delta_list

# Finish the run that was still open when the input ended.
if block_count - last_printed > 2:
    print('...')
if block_count - last_printed > 1:
    print(ROW_FORMAT % (block_count - 1, ellipsis_list(prev_delta_list)))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement