#!/bin/bash
# find_duplicates.sh
# Updated: 2012-02-24
# Version: 1.0
# By: Jayson Kempinger < GlowingApple (at) Gmail (dot) com >
# http://www.kempinger.us
# License: GPLv3 (http://www.gnu.org/licenses/gpl-3.0.txt)
# Isolate duplicate files in a directory by recursively computing the MD5 checksums of every file.
# Usage: find_duplicates.sh [ DIR ]
# Known bugs:
# - some Finder aliases have the same md5 sum, so are misconstrued as duplicates (I assume the link is stored in the resource fork, but md5 only sums the data fork)
# - certain formats of webloc files are actually empty files (I assume the link is stored in the resource fork, but md5 only sums the data fork)
# --- Configuration ---
# VERBOSE: set to 0 to hide output; set to 1 to show some simple output.
VERBOSE=1
# CHECKSUM: location of the md5 command. The script runs it with the -r flag
# and parses each output line as "<checksum> <path>".
# `command -v` is a shell builtin; unlike `which` it prints nothing on stdout
# when the command is missing, so CHECKSUM is reliably empty in that case.
CHECKSUM=$(command -v md5)
# Pick the directory to scan: the single argument if one was given, otherwise
# the current working directory. Any other argument count is a usage error.
case $# in
  0) DIR="${PWD}" ;;
  1) DIR="${1}" ;;
  *)
    echo "Please specify a single directory, or leave blank to use the current directory." >&2
    exit 1
    ;;
esac
# Refuse anything that is not an existing directory. Without this check a
# mistyped path would later be created by the `mkdir -p "${DIR}/duplicates"`
# step instead of being reported.
if [[ ! -d "${DIR}" ]]; then
  echo "${DIR} is not a directory." >&2
  exit 1
fi
# --- Checksum every file under ${DIR} and move redundant copies away ---
# For each set of files sharing a checksum, the first file found stays in
# place and every other member is moved to ${DIR}/duplicates.
# Reads:  DIR, CHECKSUM, VERBOSE
# Writes: FILES (raw "<checksum> <path>" lines), DUPS (checksums seen more
#         than once); creates ${DIR}/duplicates.
# Limitation: paths containing newline characters are not supported, because
# the checksum tool's output is processed line by line.

# Fail early if the checksum tool is unavailable, rather than letting find
# report an error for every single file.
if ! command -v "${CHECKSUM}" >/dev/null 2>&1; then
  echo "Checksum command not found; set CHECKSUM to the location of md5." >&2
  exit 1
fi

DUP_DIR="${DIR}/duplicates"

if (( VERBOSE == 1 )); then echo "Checksumming files in ${DIR}..."; fi

# Skip a duplicates folder left by an earlier run; otherwise a previously
# moved copy could be chosen as the "kept" file and the only remaining
# original moved away. Both spellings are pruned because find implementations
# differ in how they print paths when DIR ends in a slash.
# `{} +` hands many files to each checksum invocation instead of one each.
FILES=$(find "${DIR}" \
  \( -path "${DUP_DIR}" -o -path "${DIR%/}/duplicates" \) -prune \
  -o -type f -exec "${CHECKSUM}" -r '{}' +)

# Group identical checksums together. The sort is stable (-s) and keyed on the
# checksum only, so within a group the files keep the order find produced and
# "first file found" is the one that stays.
SORTED=$(printf '%s\n' "${FILES}" | LC_ALL=C sort -s -k 1,1)

# uniq -d only detects adjacent repeats, hence the sorted input.
DUPS=$(printf '%s\n' "${SORTED}" | cut -d ' ' -f 1 | uniq -d)

mkdir -p "${DUP_DIR}" || exit 1

if [[ "${VERBOSE}" == 1 && -n "${DUPS}" ]]; then
  echo "Moving duplicate files to ${DUP_DIR}..."
fi

PREV_SUM=""
while IFS= read -r LINE; do
  SUM=${LINE%% *}      # text before the first space: the checksum
  FILEPATH=${LINE#* }  # text after the first space: the path (may contain spaces)

  # Ignore blank lines and lines that are not "<checksum> <path>".
  if [[ -z "${SUM}" || "${SUM}" == "${LINE}" ]]; then
    continue
  fi

  # First file of a checksum group: leave it where it is.
  if [[ "${SUM}" != "${PREV_SUM}" ]]; then
    PREV_SUM=${SUM}
    continue
  fi

  # Every later file of the group is a duplicate: move it.
  FILENAME=${FILEPATH##*/}
  TARGET="${DUP_DIR}/${FILENAME}"

  # Does another file with the same name exist? If so, append a random number
  # to the filename. Existence is tested directly rather than by parsing the
  # wording of mv's diagnostics, which differs between implementations.
  if [[ -e "${TARGET}" || -L "${TARGET}" ]]; then
    if [[ "${FILENAME}" == ?*.* ]]; then
      STEM=${FILENAME%.*}
      EXT=".${FILENAME##*.}"
    else
      # No extension (or a dotfile): just append the number to the name.
      STEM=${FILENAME}
      EXT=""
    fi
    while [[ -e "${TARGET}" || -L "${TARGET}" ]]; do
      TARGET="${DUP_DIR}/${STEM}-${RANDOM}${EXT}"
    done
  fi

  # -n is kept as a last line of defence against overwriting any data.
  mv -n -- "${FILEPATH}" "${TARGET}"
done <<< "${SORTED}"
# --- Clean up and report ---
# Counts the entries in ${DIR}/duplicates, removes that folder when it is
# empty, and (when VERBOSE is 1) prints a summary followed by a listing.
# Reads: DIR, VERBOSE. Always exits with status 0.

# Count with a glob rather than by parsing `ls`: dotglob makes hidden files
# count too, and nullglob makes an empty folder yield zero entries.
shopt -s nullglob dotglob
MOVED=("${DIR}/duplicates"/*)
shopt -u nullglob dotglob
COUNT=${#MOVED[@]}

# Remove the empty folder regardless of VERBOSE, so a quiet run does not
# leave a stray directory behind.
if (( COUNT == 0 )) && [[ -d "${DIR}/duplicates" ]]; then
  rmdir "${DIR}/duplicates"
fi

if (( VERBOSE == 1 )); then
  echo ""
  if (( COUNT == 0 )); then
    echo "No duplicate files were found."
  else
    if (( COUNT > 1 )); then
      PLURAL="s"
      WHICH="each"
    else
      PLURAL=""
      WHICH="the"
    fi
    printf '%s duplicate file%s found and moved to %s; one copy of %s duplicate set was left in %s.\n' \
      "${COUNT}" "${PLURAL}" "${DIR}/duplicates" "${WHICH}" "${DIR}"
    echo ""
    # The listing below shows the duplicates folder, so label it as such.
    echo "${DIR}/duplicates:"
    # -A includes hidden files so the listing agrees with COUNT.
    ls -1A -- "${DIR}/duplicates/"
  fi
fi
exit 0