import sys
import os
import hashlib


def chunk_reader(fobj, chunk_size=1024):
    """Generator that reads a file in chunks of bytes."""
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk


def check_for_duplicates(paths, hash=hashlib.sha1):
    hashes = {}
    for path in paths:
        for dirpath, dirnames, filenames in os.walk(path):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                hashobj = hash()
                # Hash in chunks so large files never have to fit in memory.
                with open(full_path, 'rb') as fobj:
                    for chunk in chunk_reader(fobj):
                        hashobj.update(chunk)
                # Key on (digest, size): a false positive needs both to match.
                file_id = (hashobj.digest(), os.path.getsize(full_path))
                duplicate = hashes.get(file_id, None)
                if duplicate:
                    print("Duplicate found: %s and %s" % (full_path, duplicate))
                else:
                    hashes[file_id] = full_path


if sys.argv[1:]:
    check_for_duplicates(sys.argv[1:])
else:
    print("Please pass the paths to check as parameters to the script")
import sys
import os
import hashlib

# The same scan compressed into lambda expressions, updated to run on Python 3:
# file() and tuple-unpacking lambdas are gone, and the lost \n escapes restored.
check_path = (lambda filepath, hashes, p=sys.stdout.write:
    (lambda hash=hashlib.sha1(open(filepath, 'rb').read()).hexdigest():
        ((hash in hashes) and p('DUPLICATE FILE\n'
                                '   %s\n'
                                'of %s\n' % (filepath, hashes[hash])))
        or hashes.setdefault(hash, filepath))())

scan = (lambda dirpath, hashes={}:
    [[check_path(os.path.join(root, filename), hashes) for filename in files]
     for root, dirs, files in os.walk(dirpath)])

(len(sys.argv) > 1) and scan(sys.argv[1])
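
Worth noting about the lambda version: it emulates if/else with the short-circuit pattern cond and f() or g(). That only works here because sys.stdout.write returns the number of characters written (truthy for a non-empty message) and dict.setdefault both records the first path seen for a digest and returns it. Unlike the first script, this variant reads each file fully into memory and compares by hash alone, without the file-size tiebreaker.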
import os, hashlib

# Simpler variant: de-duplicate one directory (non-recursive), keyed on MD5.
def remove_duplicates(dir):
    unique = set()  # digests of files seen so far
    for filename in os.listdir(dir):
        path = os.path.join(dir, filename)  # listdir yields bare names
        if os.path.isfile(path):
            with open(path, 'rb') as f:
                filehash = hashlib.md5(f.read()).hexdigest()
            if filehash not in unique:
                unique.add(filehash)
            else:
                os.remove(path)
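
And a matching hypothetical demo for remove_duplicates (illustrative file names only): after the call, one file from each identical pair is gone.

import os, tempfile

tmp = tempfile.mkdtemp()
for name, data in (('a.txt', b'x'), ('b.txt', b'x'), ('c.txt', b'y')):
    with open(os.path.join(tmp, name), 'wb') as f:
        f.write(data)
remove_duplicates(tmp)
print(len(os.listdir(tmp)))  # 2 -- one of a.txt/b.txt was removed
# (which of the two identical files survives depends on listing order)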