Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- """
- Used to find duplicate files in a given folder.
- @author: Thomas Butz
- """
- import functools
- import hashlib
- import os
- import pprint
- import sys
- from collections import defaultdict
def find_duplicates(folder='.') -> dict:
    """
    Return all duplicate files in the given folder.

    Candidates are narrowed in three passes of increasing cost: by file size,
    by the checksum of the first chunk and finally by the checksum of the
    whole file. Only regular files (including symlinks that resolve to one)
    are considered; broken symlinks, FIFOs, sockets and device nodes are
    skipped.

    @param folder: the path of the folder to be scanned recursively for duplicates
    @return: a dict which maps each full-file checksum to the list of absolute
             paths sharing it; every list holds at least two paths
    """
    folder = os.path.abspath(folder)
    file_paths = defaultdict(list)
    for dir_path, _, file_names in os.walk(folder):
        for name in file_names:
            path = os.path.join(dir_path, name)
            # os.walk also lists broken symlinks and special files: getsize()
            # raises on the former and open() may block forever on a FIFO.
            if os.path.isfile(path):
                file_paths[dir_path].append(path)

    # Ordered from cheapest to most expensive so that costly hashing is only
    # performed on files that survived the previous pass.
    filters = (
        os.path.getsize,
        functools.partial(_hash_file, partial=True),
        functools.partial(_hash_file, partial=False),
    )
    for filter_func in filters:
        file_paths = _remove_unique(file_paths, filter_func)
    return file_paths
def _hash_file(path, partial=False, *, algorithm='sha1', chunk_size=2 ** 20) -> str:
    """
    Compute the checksum of a file using the given algorithm.

    The file is read in binary mode, chunk by chunk, so that large files do
    not have to be held in memory at once. An empty file yields the digest of
    empty input.

    @param path: the path of the file to be hashed
    @param partial: flag that indicates that just the first chunk should be hashed
    @param algorithm: name of a fixed-length hashlib algorithm (default: 'sha1')
    @param chunk_size: number of bytes read per chunk (default: 1 MiB)
    @return: the checksum in its hex representation
    @raise ValueError: if chunk_size is not positive or the algorithm is unknown
    """
    if chunk_size <= 0:
        # read(0) returns b'' immediately and read(-1) slurps the whole file;
        # neither matches the documented chunked behaviour.
        raise ValueError('chunk_size must be a positive number of bytes')
    checksum = hashlib.new(algorithm)
    with open(path, 'rb') as f:
        # iter() with a sentinel keeps calling f.read until it returns b'' (EOF).
        for chunk in iter(functools.partial(f.read, chunk_size), b''):
            checksum.update(chunk)
            if partial:
                break
    return checksum.hexdigest()
def _remove_unique(dictionary: dict, func: callable) -> dict:
    """
    Regroup all values by the result of the given callable and drop the unique ones.

    The keys of the incoming dict are ignored; every value of every list is
    re-keyed by func(value), and only groups with more than one member are kept.

    @param dictionary: a dict mapping a key to a list of values
    @param func: the callable used to determine the new key of each value
    @return: a plain dict mapping each new key to its list of values
    """
    grouped = defaultdict(list)
    for values in dictionary.values():
        for value in values:
            new_key = func(value)
            grouped[new_key].append(value)
    return _reduce(grouped)
def _reduce(dictionary: dict) -> dict:
    """
    Keep only the entries whose list holds more than one value.

    @param dictionary: a dict mapping a key to a list of values
    @return: a new plain dict with the single-valued and empty entries removed
    """
    kept = {}
    for key, values in dictionary.items():
        if len(values) <= 1:
            continue
        kept[key] = values
    return kept
if __name__ == '__main__':
    # Forward the first command line argument, if any; with no argument the
    # call is made without parameters so the function's own default applies.
    cli_args = sys.argv[1:2]
    pprint.pprint(find_duplicates(*cli_args))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement