#!/usr/bin/env bash # vim:ts=4:sts=4:sw=4:et # # Author: Hari Sekhon # Date: 2020-07-23 12:31:21 +0100 (Thu, 23 Jul 2020) # # https://github.com/HariSekhon/DevOps-Bash-tools # # License: see accompanying Hari Sekhon LICENSE file # # If you're using my code you're welcome to connect with me on LinkedIn and optionally send me feedback to help steer this or other code I publish # # https://www.linkedin.com/in/HariSekhon # set -euo pipefail [ -n "${DEBUG:-}" ] && set -x srcdir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # shellcheck disable=SC2034 usage_description=" Finds duplicate files by file size in bytes Output format: For a much more sophisticated duplicate file finder utilizing size, checksums, basenames and even partial basenames via regex match see find_duplicate_files.py in the DevOps Python tools repo: https://github.com/HariSekhon/DevOps-Python-tools " # used by usage() in lib/utils.sh # shellcheck disable=SC2034 usage_args="[ ...]" # shellcheck disable=SC1090,SC1091 . "$srcdir/lib/utils.sh" help_usage "$@" last_size="" last_filename="" last_printed=0 # GNU coreutils du has bytes, whereas Mac's du only goes to the less granular blocks which is less accurate if is_mac; then du_files(){ gdu -ab "$@" } else du_files(){ du -ab "$@"; } fi # export function to be used in subshell with xargs, intentionally name it differently than du to make debugging easier export -f du_files while read -r size filename; do if [ "$size" = "$last_size" ]; then if [ "$last_printed" = 0 ]; then printf '%s\t%s\n' "$last_size" "$last_filename" fi printf '%s\t%s\n' "$size" "$filename" last_printed=1 else last_printed=0 fi last_size="$size" last_filename="$filename" done < <(for dir in "${@:-$PWD}"; do find "$dir" -type f -print0; done | xargs -0 bash -c 'du_files "$@"' | sort -k1n)