#!/usr/bin/env bash
# vim:ts=4:sts=4:sw=4:et
#
# Author: Hari Sekhon
# Date: 2020-07-23 13:05:24 +0100 (Thu, 23 Jul 2020)
#
# https://github.com/harisekhon/bash-tools
#
# License: see accompanying Hari Sekhon LICENSE file
#
# If you're using my code you're welcome to connect with me on LinkedIn and optionally send me feedback to help steer this or other code I publish
#
# https://www.linkedin.com/in/harisekhon
#

# Strict mode: exit on an unhandled command failure (-e), on expansion of an
# unset variable (-u), and make a pipeline fail if any of its stages fails.
set -euo pipefail

# Trace every command to stderr when DEBUG is set to a non-empty value.
# Written as an && list so that an unset DEBUG does not trip 'set -e'.
[ -n "${DEBUG:-}" ] && set -x

# Absolute path of the directory containing this script; used below to locate
# the adjacent helper script and lib/utils.sh regardless of the caller's cwd.
srcdir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Free-form help text for this script. It is not referenced in this file
# (hence the SC2034 suppression); presumably consumed by usage() in
# lib/utils.sh in the same way as usage_args below.
# shellcheck disable=SC2034
usage_description="
Finds duplicate files by file checksum

Only compares files with the same byte counts for efficiency, using the adjacent find_duplicate_files_by_size.sh
as a pre-filter to speed up the process

Output format:

<md5_checksum> <filename>

For a much more sophisticated duplicate file finder utilizing size, checksums, basenames and
even partial basenames via regex match see

find_duplicate_files.py

in the DevOps Python tools repo:

https://github.com/harisekhon/devops-python-tools
"

# Synopsis of the accepted positional arguments.
# used by usage() in lib/utils.sh
# shellcheck disable=SC2034
usage_args="[<dir1> <dir2> ...]"

# Load the shared helper library that lives next to this script. The path is
# built from $srcdir at runtime, so ShellCheck cannot follow it (SC1090).
# shellcheck disable=SC1090
. "$srcdir/lib/utils.sh"

# help_usage is not defined in this file - presumably provided by
# lib/utils.sh to print usage and exit when help is requested; confirm there.
# It receives the script's full argument list.
help_usage "$@"

# Reads "<checksum> <filename>" lines from stdin, which must already be sorted
# so that identical checksums are adjacent, and prints every member of each
# group of two or more identical checksums as "<checksum><TAB><filename>".
# Files whose checksum is unique produce no output.
print_duplicate_checksums() {
    local checksum filename
    local last_checksum=""
    local last_filename=""
    # 1 once the first member of the current checksum group has been printed
    local last_printed=0
    while read -r checksum filename; do
        # skip blank lines - an empty checksum would otherwise match the
        # initial empty last_checksum and print a bogus duplicate row
        [ -n "$checksum" ] || continue
        if [ "$checksum" = "$last_checksum" ]; then
            # the first member of a group is held back until a second file
            # with the same checksum shows up, so emit it now exactly once
            if [ "$last_printed" = 0 ]; then
                printf '%s\t%s\n' "$last_checksum" "$last_filename"
            fi
            printf '%s\t%s\n' "$checksum" "$filename"
            last_printed=1
        else
            last_printed=0
        fi
        last_checksum="$checksum"
        last_filename="$filename"
    done
}

# Pipeline:
#   1. the adjacent find_duplicate_files_by_size.sh emits "<size> <filename>"
#      lines for the given directories
#   2. discard the size and use the checksum as the next level filter
#   3. sort so that identical checksums become adjacent
#   4. print only the groups that contain more than one file
"$srcdir/find_duplicate_files_by_size.sh" "$@" |
while read -r _ filename; do
    # '--' stops option parsing so a name starting with '-' is treated as a file
    md5sum -- "$filename"  # outputs <checksum>  <filename>
done |
# checksums are hexadecimal strings, not numbers: compare the first field
# bytewise (locale independent) instead of numerically
LC_ALL=C sort -k1,1 |
print_duplicate_checksums