Advertisement
FocusedWolf

Python: Duplicate File Finder

Apr 16th, 2022 (edited)
1,451
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.89 KB | None | 0 0
  1. import os
  2. import sys
  3. import hashlib
  4.  
  5. def findDups(folders):
  6.     dups = {} # This will be a dictionary of lists with this layout: {hash:[names]}
  7.  
  8.     if args.redact and args.target == None:
  9.         print('WARNING: Search cancelled. Using --redact without --target means all output should be redacted.')
  10.         return dups
  11.  
  12.     for path in folders:
  13.         if os.path.exists(path):
  14.             print('Searching "%s"' % path)
  15.  
  16.             # Find the duplicated files and append them to the dups.
  17.             mergeDups(dups, findDup(path))
  18.  
  19.         else:
  20.             print('%s does not exist.' % path)
  21.  
  22.     # For all dictionary keys that contain a single file.
  23.     for key in [key for key, value in dups.items() if len(value) == 1]:
  24.         # Delete the item.
  25.         del dups[key]
  26.  
  27.     return dups
  28.  
  29. def findDup(parentFolder):
  30.     dups = {} # This will be a dictionary of lists with this layout: {hash:[names]}
  31.  
  32.     for dirName, subdirs, fileList in os.walk(parentFolder):
  33.         #  print('Searching %s' % dirName)
  34.  
  35.         for filename in fileList:
  36.             # Get the path to the file.
  37.             path = os.path.join(dirName, filename)
  38.             # Calculate hash
  39.             file_hash = hashfile(path)
  40.  
  41.             # Add or append the file path.
  42.             if file_hash in dups:
  43.                 dups[file_hash].append(path)
  44.             else:
  45.                 dups[file_hash] = [path]
  46.  
  47.     return dups
  48.  
  49. def mergeDups(a, b):
  50.     for key in b.keys():
  51.         if key in a:
  52.             # Merge lists.
  53.             a[key] += b[key]
  54.         else:
  55.             # Add list as a new entry.
  56.             a[key] = b[key]
  57.  
  58. def hashfile(path, blocksize = 65536):
  59.     f = open(path, 'rb')
  60.     hasher = hashlib.sha256()
  61.     buf = f.read(blocksize)
  62.     while len(buf) > 0:
  63.         hasher.update(buf)
  64.         buf = f.read(blocksize)
  65.     f.close()
  66.     return hasher.hexdigest()
  67.  
def filterUnique(dups):
    """Apply --unique: drop one 'keeper' path from each duplicate group.

    Mutates *dups* in place so that what remains is the set of paths the
    user could delete. Reads the module-global ``args``. With --target,
    the kept path is the first non-target path when one exists, so
    target-matching paths stay visible.
    """
    if len(dups) == 0:
        return

    if not args.unique:
        return

    # If no targets: simply keep (hide) the first path of every group.
    if args.target == None:
        for key, value in dups.items():
            # Remove first item.
            value[:] = value[1:]
        return

    for key, value in dups.items():
        # If list is too small (nothing left to filter).
        if len(value) <= 1:
            continue

        # If all items are targets, there is no non-target to keep:
        # fall back to keeping the first target path.
        if all([any([target in path for target in args.target]) for path in value]):
            # Remove first item.
            value[:] = value[1:]
            continue

        # Remove the first non-target item from every result.
        # The walrus counter increments only for non-target paths, so
        # exactly the first non-target path fails both conditions.
        total = 0
        value[:] = [path for path in value if any([target in path for target in args.target]) or (total := total + 1) > 1]
  96.  
  97. def filterRedact(dups):
  98.     if len(dups) == 0:
  99.         return
  100.  
  101.     if not args.redact:
  102.         return
  103.  
  104.     if args.target == None:
  105.         # Remove every result.
  106.         dups.clear()
  107.  
  108.     else:
  109.         # Remove non-targets from every result.
  110.         for key, value in dups.items():
  111.             value[:] = [path for path in value if any([target in path for target in args.target])]
  112.  
  113. def filterClean(dups):
  114.     if len(dups) == 0:
  115.         return
  116.  
  117.     # For all dictionary keys that contain no files (made empty by other filters).
  118.     for key in [key for key, value in dups.items() if len(value) == 0]:
  119.         # Delete the item.
  120.         del dups[key]
  121.  
  122. def printResults(dups):
  123.     print()
  124.  
  125.     if len(dups) == 0:
  126.         print('No duplicates found.')
  127.         return
  128.  
  129.     print('The following files are identical:')
  130.  
  131.     for key, value in dups.items():
  132.         print()
  133.         for path in sorted(value):
  134.             output = ''
  135.  
  136.             if args.showHash:
  137.                 output += key
  138.                 output += ' '
  139.  
  140.             if args.prepend != '':
  141.                 output += args.prepend
  142.  
  143.             output += '"%s"' % path
  144.  
  145.             if args.append != '':
  146.                 output += args.append
  147.  
  148.             print(output)
  149.  
  150. def wait_for_any_keypress():
  151.     import sys
  152.     if sys.platform == 'win32':
  153.         import os
  154.         os.system('pause')
  155.     elif sys.platform.startswith('linux') or sys.platform == 'darwin':
  156.         print('Press any key to continue . . .')
  157.         import termios
  158.         import tty
  159.         stdin_file_desc = sys.stdin.fileno()
  160.         old_stdin_tty_attr = termios.tcgetattr(stdin_file_desc)
  161.         try:
  162.             tty.setraw(stdin_file_desc)
  163.             sys.stdin.read(1)
  164.         finally:
  165.             termios.tcsetattr(stdin_file_desc, termios.TCSADRAIN, old_stdin_tty_attr)
  166.  
  167. def main():
  168.     import argparse
  169.     parser = argparse.ArgumentParser()
  170.     parser.add_argument('-r', '--redact', action='store_true', help='only display paths which contain a "target" word', required=False)
  171.     parser.add_argument('-a', '--append', help='append this text after every path', type=str, default='', required=False)
  172.     parser.add_argument('-p', '--prepend', help='prepend this text before every path', type=str, default='', required=False)
  173.     parser.add_argument('-t', '--target', action='append', help='only display duplicate groups if one of the paths contains a "target" word', required=False)
  174.     parser.add_argument('-u', '--unique', action='store_true', help='do not display one of the paths in a duplicate group so you can delete the duplicates', required=False)
  175.     parser.add_argument('-s', '--showHash', action='store_true', help='print the SHA-256 hash for each file', required=False)
  176.     parser.add_argument('folders', help='the directory paths to compare', type=str, nargs='+')
  177.  
  178.     try:
  179.         global args
  180.         args = parser.parse_args()
  181.     except SystemExit:
  182.         wait_for_any_keypress()
  183.         return
  184.  
  185.     # Print args.
  186.     import sys
  187.     print(sys.argv)
  188.  
  189.     dups = findDups(args.folders)
  190.     filterUnique(dups)
  191.     filterRedact(dups)
  192.     filterClean(dups)
  193.     printResults(dups)
  194.  
  195. if __name__ == '__main__':
  196.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement