DeaD_EyE

find_duplicates.py

May 2nd, 2017
from collections import defaultdict
import hashlib  # needed for md5(); this import was missing in the original paste
import os


def find_duplicates_normal(path):
    """Group files under path by MD5 hash, reading each file in 2 MiB chunks."""
    CHUNKSIZE = 2 * 1024**2
    duplicates = defaultdict(list)
    for root, dirs, files in os.walk(path):
        for file in files:
            file = os.path.join(root, file)
            with open(file, 'rb') as fd:
                h = hashlib.md5()
                # Read until fd.read() returns b'' (end of file).
                for chunk in iter(lambda: fd.read(CHUNKSIZE), b''):
                    h.update(chunk)
                duplicates[h.hexdigest()].append(file)
    # Keep only hash groups that contain more than one file.
    return [d for d in duplicates.values() if len(d) > 1]


def find_duplicates_bytearray(path):
    """Same idea, but reuse one preallocated buffer via readinto()."""
    CHUNKSIZE = 1 * 1024**2
    chunk = bytearray(CHUNKSIZE)
    view = memoryview(chunk)  # allows zero-copy slicing of the buffer
    duplicates = defaultdict(list)
    for root, dirs, files in os.walk(path):
        for file in files:
            file = os.path.join(root, file)
            with open(file, 'rb') as fd:
                h = hashlib.md5()
                while True:
                    size = fd.readinto(view)  # bytes actually read; 0 at EOF
                    if not size:
                        break
                    h.update(view[:size])
                duplicates[h.hexdigest()].append(file)
    return [d for d in duplicates.values() if len(d) > 1]
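A quick usage sketch, not part of the original paste; the directory path is only a placeholder. Either function returns a list of lists, where each inner list holds the paths of files that share a hash:

if __name__ == '__main__':
    # '/tmp/testdir' is a hypothetical example path; point this at any directory.
    for group in find_duplicates_bytearray('/tmp/testdir'):
        print('duplicates:')
        for path in group:
            print('   ', path)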