Advertisement
bt90

FindDuplicates

Oct 5th, 2013
159
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.56 KB | None | 0 0
"""
Find duplicate files in a given folder.
@author: Thomas Butz
"""
  5. import functools
  6. import hashlib
  7. import os
  8. import pprint
  9. import sys
  10. from collections import defaultdict
  11.  
  12.  
  13. def find_duplicates(folder='.') -> dict:
  14.     """
  15.    Return all duplicate files in the given folder.
  16.    @param folder: the path of the folder to be scanned recursively for duplicates
  17.    @return: a dict which maps lists of duplicates to their checksum
  18.    """
  19.     folder = os.path.abspath(folder)
  20.     file_paths = defaultdict(list)
  21.     for dir_path, folder_names, file_names in os.walk(folder):
  22.         for name in file_names:
  23.             file_paths[dir_path].append(os.path.join(dir_path, name))
  24.  
  25.     filters = (os.path.getsize,
  26.                functools.partial(_hash_file, partial=True),
  27.                functools.partial(_hash_file, partial=False))
  28.  
  29.     for filter_func in filters:
  30.         file_paths = _remove_unique(file_paths, filter_func)
  31.  
  32.     return file_paths
  33.  
  34.  
  35. def _hash_file(path, partial=False) -> str:
  36.     """
  37.    Compute the checksum using the given algorithm.
  38.    @param path: the path of the file to be hashed
  39.    @param partial: flag that indicates that just the first chunk should be hashed
  40.    @return: the checksum in its hex representation
  41.    """
  42.     chunk_size = 2 ** 20
  43.     with open(path, 'rb') as f:
  44.         checksum = hashlib.sha1()
  45.         for chunk in iter(functools.partial(f.read, chunk_size), b''):
  46.             checksum.update(chunk)
  47.             if partial:
  48.                 break
  49.     return checksum.hexdigest()
  50.  
  51.  
  52. def _remove_unique(dictionary: dict, func: callable) -> dict:
  53.     """
  54.    Remove all values of the dict that return a unique value from the given callable.
  55.    @param dictionary: a defaultdict mapping a key to a list of values
  56.    @param func: the callable used to determine the key
  57.    @return: a defaultdict mapping a key to a list of values
  58.    """
  59.     stripped_dict = defaultdict(list)
  60.     for key, paths in dictionary.items():
  61.         for path in paths:
  62.             stripped_dict[func(path)].append(path)
  63.     return _reduce(stripped_dict)
  64.  
  65.  
  66. def _reduce(dictionary: dict) -> dict:
  67.     """
  68.    Remove all keys that just belong to a single value.
  69.    @param dictionary: a defaultdict mapping a key to a list of values
  70.    @return: the stripped version of the entered dict
  71.    """
  72.     return {key: value for key, value in dictionary.items() if len(value) > 1}
  73.  
  74.  
  75. if __name__ == '__main__':
  76.     if len(sys.argv) > 1:
  77.         pprint.pprint(find_duplicates(sys.argv[1]))
  78.     else:
  79.         pprint.pprint(find_duplicates())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement