#!/usr/bin/env bash
#  vim:ts=4:sts=4:sw=4:et
#
#  Author: Hari Sekhon
#  Date: 2020-07-23 13:05:24 +0100 (Thu, 23 Jul 2020)
#
#  https://github.com/harisekhon/bash-tools
#
#  License: see accompanying Hari Sekhon LICENSE file
#
#  If you're using my code you're welcome to connect with me on LinkedIn and optionally send me feedback to help steer this or other code I publish
#
#  https://www.linkedin.com/in/harisekhon
#

# Prints every file whose MD5 checksum matches that of at least one other
# candidate file, one "<checksum><tab><filename>" line per file, with files
# sharing a checksum on adjacent lines.
#
# Candidates come from the adjacent find_duplicate_files_by_size.sh, which is
# invoked with this script's own arguments; only the filename column of its
# output is used here.

set -euo pipefail
[ -n "${DEBUG:-}" ] && set -x
srcdir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# shellcheck disable=SC2034
usage_description="
Finds duplicate files by file checksum

Only compares files with the same byte counts for efficiency, using the adjacent find_duplicate_files_by_size.sh as a pre-filter to speed up the process

Output format:

<checksum><tab><filename>

For a much more sophisticated duplicate file finder utilizing size, checksums, basenames and even partial basenames via regex match see find_duplicate_files.py in the DevOps Python tools repo:

    https://github.com/harisekhon/devops-python-tools
"

# NOTE(review): the value below looks truncated (placeholder names appear to
# be missing) - confirm the intended argument synopsis against
# find_duplicate_files_by_size.sh, which receives "$@" unchanged
#
# used by usage() in lib/utils.sh
# shellcheck disable=SC2034
usage_args="[ ...]"

# shellcheck disable=SC1090
. "$srcdir/lib/utils.sh"

help_usage "$@"

# State carried between iterations of the final loop: the previous line's
# checksum and filename, and whether that previous line has already been
# printed as part of the current group of duplicates
last_checksum=""
last_filename=""
last_printed=0

# discard size and use checksum as next level filter
# shellcheck disable=SC2034
"$srcdir/find_duplicate_files_by_size.sh" "$@" |
while read -r size filename; do
    # '--' stops a filename beginning with '-' being parsed as an option
    md5sum -- "$filename"  # outputs <checksum>  <filename>
done |
# Checksums are hex strings, so sort bytewise on the checksum field rather
# than numerically; LC_ALL=C keeps the ordering independent of the locale
LC_ALL=C sort -k1,1 |
while IFS= read -r line; do
    # Split manually instead of 'read -r checksum filename' so that leading
    # and trailing whitespace in the filename survives intact
    checksum="${line%% *}"
    filename="${line#* }"
    # drop the one-character mode marker (' ' for text, '*' for binary)
    filename="${filename#?}"
    if [ "$checksum" = "$last_checksum" ]; then
        # First repeat of this checksum: the previous line opened the group
        # and has not been printed yet, so emit it before the current one
        if [ "$last_printed" = 0 ]; then
            printf '%s\t%s\n' "$last_checksum" "$last_filename"
        fi
        printf '%s\t%s\n' "$checksum" "$filename"
        last_printed=1
    else
        last_printed=0
    fi
    last_checksum="$checksum"
    last_filename="$filename"
done