Guest User

Untitled

a guest
Mar 17th, 2018
68
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.71 KB | None | 0 0
  1. # dupFinder.py
  2. import os, sys, stat
  3. import hashlib
  4.  
  5. def findDup(parentFolder):
  6. # Dups in format {hash:[names]}
  7. dups = {}
  8. for dirName, subdirs, fileList in os.walk(parentFolder):
  9. print('Scanning %s...' % dirName)
  10. for filename in fileList:
  11. # Get the path to the file
  12. path = os.path.join(dirName, filename)
  13. # Calculate hash
  14. file_hash = hashfile(path)
  15. # Add or append the file path
  16. if file_hash in dups:
  17. dups[file_hash]['path'].append(path)
  18. else:
  19. dups[file_hash] = {}
  20. dups[file_hash]['path'] = [path]
  21. if os.path.isfile(path):
  22. dups[file_hash]['size'] = os.stat(path)[stat.ST_SIZE]
  23. return dups
  24.  
  25.  
  26. # Joins two dictionaries
  27. def joinDicts(dict1, dict2):
  28. for key in dict2.keys():
  29. if key in dict1:
  30. dict1[key] = dict1[key] + dict2[key]
  31. else:
  32. dict1[key] = dict2[key]
  33.  
  34.  
  35. def hashfile(path, blocksize = 65536):
  36. afile = open(path, 'rb')
  37. hasher = hashlib.md5()
  38. buf = afile.read(blocksize)
  39. while len(buf) > 0:
  40. hasher.update(buf)
  41. buf = afile.read(blocksize)
  42. afile.close()
  43. return hasher.hexdigest()
  44.  
  45. def sizeof_fmt(num, suffix):
  46. for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
  47. if abs(num) < 1024.0:
  48. return "%3.1f%s%s" % (num, unit, suffix)
  49. num /= 1024.0
  50. return "%.1f%s%s" % (num, 'Yi', suffix)
  51.  
  52. def printResults(dict1):
  53. results = list(filter(lambda x: len(x['path']) > 1, dict1.values()))
  54. summ = 0
  55. if len(results) > 0:
  56. results = sorted(results, key=lambda res: res['size'])
  57. print('Duplicates:')
  58. print('___________________')
  59. for result in results:
  60. coef = len(result['path']) - 1
  61. summ += result['size'] * coef
  62. print('\t%s' % sizeof_fmt(result['size'], 'B'))
  63. for subresult in result['path']:
  64. print('\t\t%s' % subresult)
  65. print('___________________')
  66. print('TOTAL SIZE %s' % sizeof_fmt(summ, 'B'))
  67.  
  68. else:
  69. print('No duplicate files found.')
  70.  
  71.  
  72. if __name__ == '__main__':
  73. if len(sys.argv) > 1:
  74. dups = {}
  75. folders = sys.argv[1:]
  76. for i in folders:
  77. # Iterate the folders given
  78. if os.path.exists(i):
  79. # Find the duplicated files and append them to the dups
  80. joinDicts(dups, findDup(i))
  81. else:
  82. print('%s is not a valid path, please verify' % i)
  83. sys.exit()
  84. printResults(dups)
  85. else:
  86. print('Usage: python duplicates_finder.py folder or python duplicates_finder.py folder1 folder2 folder3')
Add Comment
Please, Sign In to add comment