DeaD_EyE

find_duplicates.py

May 2nd, 2017
from collections import defaultdict
import hashlib  # needed for md5(); this import was missing in the original paste
import os


def find_duplicates_normal(path):
    """Group files under path by MD5 hash, reading each file in 2 MiB chunks."""
    CHUNKSIZE = 2 * 1024**2
    duplicates = defaultdict(list)
    for root, dirs, files in os.walk(path):
        for file in files:
            file = os.path.join(root, file)
            with open(file, 'rb') as fd:
                h = hashlib.md5()
                # Read until fd.read() returns b'' (end of file).
                for chunk in iter(lambda: fd.read(CHUNKSIZE), b''):
                    h.update(chunk)
                duplicates[h.hexdigest()].append(file)
    # Keep only hash groups that contain more than one file.
    return [d for d in duplicates.values() if len(d) > 1]


def find_duplicates_bytearray(path):
    """Same idea, but reuse one preallocated buffer via readinto()."""
    CHUNKSIZE = 1 * 1024**2
    chunk = bytearray(CHUNKSIZE)
    view = memoryview(chunk)  # allows zero-copy slicing of the buffer
    duplicates = defaultdict(list)
    for root, dirs, files in os.walk(path):
        for file in files:
            file = os.path.join(root, file)
            with open(file, 'rb') as fd:
                h = hashlib.md5()
                while True:
                    size = fd.readinto(view)  # bytes actually read; 0 at EOF
                    if not size:
                        break
                    h.update(view[:size])
                duplicates[h.hexdigest()].append(file)
    return [d for d in duplicates.values() if len(d) > 1]
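A quick usage sketch, not part of the original paste; the directory path is only a placeholder. Either function returns a list of lists, where each inner list holds the paths of files that share a hash:

if __name__ == '__main__':
    # '/tmp/testdir' is a hypothetical example path; point this at any directory.
    for group in find_duplicates_bytearray('/tmp/testdir'):
        print('duplicates:')
        for path in group:
            print('   ', path)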