You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
DevOps-Bash-tools/find_duplicate_files_by_siz...

75 lines
1.9 KiB
Bash

#!/usr/bin/env bash
# vim:ts=4:sts=4:sw=4:et
#
# Author: Hari Sekhon
# Date: 2020-07-23 12:31:21 +0100 (Thu, 23 Jul 2020)
#
# https://github.com/HariSekhon/DevOps-Bash-tools
#
# License: see accompanying Hari Sekhon LICENSE file
#
# If you're using my code you're welcome to connect with me on LinkedIn and optionally send me feedback to help steer this or other code I publish
#
# https://www.linkedin.com/in/HariSekhon
#
# Strict mode: abort on unhandled errors, on unset variables and on a failure
# in any stage of a pipeline
set -o errexit -o nounset -o pipefail

# Trace every command when DEBUG is set to a non-empty value
if [ -n "${DEBUG:-}" ]; then
    set -x
fi

# Absolute path of the directory containing this script, used to find lib/
srcdir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# The two usage_* variables are not referenced anywhere else in this file,
# hence the SC2034 suppressions - usage() in lib/utils.sh reads them

# shellcheck disable=SC2034
usage_args="[<dir1> <dir2> ...]"

# shellcheck disable=SC2034
usage_description="
Finds duplicate files by file size in bytes
Output format:
<size_in_bytes> <filename>
For a much more sophisticated duplicate file finder utilizing size, checksums, basenames and
even partial basenames via regex match see
find_duplicate_files.py
in the DevOps Python tools repo:
https://github.com/HariSekhon/DevOps-Python-tools
"

# shellcheck disable=SC1090
source "$srcdir/lib/utils.sh"

# help_usage comes from lib/utils.sh - presumably prints the usage and exits
# when a help switch is among the arguments (verify against the library)
help_usage "$@"

# State carried between iterations of the duplicate detection loop:
#   last_size / last_filename - the previous record read
#   last_printed              - 1 once the previous record has been output
last_size=""; last_filename=""; last_printed=0
# du_files <path> ...
#
# Prints "<size_in_bytes><TAB><path>" for each path given.
#
# Byte granularity needs the GNU coreutils du (-b). Mac's own du only goes down
# to blocks, which is less accurate, so on a Mac the definition is swapped for
# gdu (presumably GNU du installed under its g-prefixed name - needs to be
# present on the Mac).
du_files(){
    du -ab "$@"
}
if is_mac; then
    du_files(){
        gdu -ab "$@"
    }
fi

# Exported so that the child bash started by xargs can call it. Deliberately
# not named 'du' so that it stands out when debugging.
export -f du_files
# list_file_sizes [<dir> ...]
#
# Prints one "<size_in_bytes><TAB><path>" line for every regular file found
# under the given directories (default: $PWD), sorted numerically by size.
#
# du_files must already be exported, because it is called from a child bash
# started by xargs.
#
# Two details of the 'bash -c' invocation matter:
#   - the trailing '_' fills $0. Without it bash assigns the first filename of
#     every xargs batch to $0 instead of "$@" and that file is never sized
#   - the argument count guard makes empty input a no-op. GNU xargs still runs
#     the command once when it receives nothing, and du without operands would
#     list the current directory instead
list_file_sizes(){
    local dir
    for dir in "${@:-$PWD}"; do
        find "$dir" -type f -print0
    done |
    xargs -0 bash -c '[ "$#" -gt 0 ] || exit 0; du_files "$@"' _ |
    sort -k1n
}

# Walk the size-sorted records and print every file whose size equals that of
# the record before it. The first member of each group is printed lazily, once
# a second file of the same size shows up, so unique sizes produce no output.
#
# Splitting on tab only (the separator du emits) keeps trailing spaces in
# filenames intact, the default IFS would strip them.
while IFS=$'\t' read -r size filename; do
    if [ "$size" = "$last_size" ]; then
        # last_printed = 0 means the previous record opened this group and has
        # not been output yet
        if [ "$last_printed" = 0 ]; then
            printf '%s\t%s\n' "$last_size" "$last_filename"
        fi
        printf '%s\t%s\n' "$size" "$filename"
        last_printed=1
    else
        last_printed=0
    fi
    last_size="$size"
    last_filename="$filename"
done < <(list_file_sizes "$@")