You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
DevOps-Bash-tools/find_broken_links.sh

80 lines
1.9 KiB
Bash

#!/usr/bin/env bash
# vim:ts=4:sts=4:sw=4:et
#
# Author: Hari Sekhon
# Date: 2020-04-01 13:03:12 +0100 (Wed, 01 Apr 2020)
#
# https://github.com/harisekhon/bash-tools
#
# License: see accompanying Hari Sekhon LICENSE file
#
# If you're using my code you're welcome to connect with me on LinkedIn and optionally send me feedback to help steer this or other code I publish
#
# https://www.linkedin.com/in/harisekhon
#
# Crawls a URL argument and finds broken links, throttling to 1 link every 2 seconds
# Strict mode: abort on command failure, on unset variables and on any failing pipeline stage
set -euo pipefail
# Trace every command when DEBUG is set to a non-empty value
if [ -n "${DEBUG:-}" ]; then
    set -x
fi
# Absolute path of the directory holding this script, used to locate the bundled library
srcdir="$(dirname "${BASH_SOURCE[0]}")"
srcdir="$(cd "$srcdir" && pwd)"
# shellcheck disable=SC1090
source "$srcdir/lib/utils.sh"
# Prints an optional error message followed by the usage synopsis, then exits.
#
# Arguments: $* - optional message, printed on its own line before the synopsis
# Outputs:   message (if any) and synopsis on stdout
# Exits:     always, with status 3
usage(){
    if [ -n "$*" ]; then
        # printf rather than echo so that a message such as '-n' or '-e' is printed
        # verbatim instead of being interpreted as an echo option
        printf '%s\n' "$*"
    fi
    printf 'usage: %s <url>\n' "${0##*/}"
    exit 3
}
# Exactly one argument is accepted: the URL to start crawling from
if [ $# -ne 1 ]; then
    usage "no url argument given"
fi
url="$1"
# The regex is anchored so the http(s) scheme must be at the very start of the argument;
# unanchored, strings like 'ftp://host/?next=http://x' would be accepted
if ! [[ "$url" =~ ^https?:// ]]; then
    usage "invalid url argument, must match https?://"
fi
# Temp file that captures the crawl log so it can be searched once wget has finished
tmp="$(mktemp)"
# Remove the temp file on every signal listed in TRAP_SIGNALS (not defined in this file,
# presumably provided by lib/utils.sh - verify).
# rm -f keeps the handler silent and successful if it fires more than once and the file
# is already gone; -- guards against a temp path starting with a dash
# want splitting
# shellcheck disable=SC2086
trap 'rm -f -- "$tmp"' $TRAP_SIGNALS
# Crawl the URL in spider mode, show the log live and keep a copy in $tmp for the report below
#
# Options used:
# --spider = don't download
# -r = recursive
# -nd / --no-directories = don't create local dirs representing structure
# -nv / --no-verbose = give concise 1 liner information
# -l 1 = crawl 1 level deep (may need to tune this), set to 'inf' for infinite
# -w 2 = wait for 2 secs between requests to avoid tripping defenses
# -H / --span-hosts = follows subdomains + external sites
#
# Related options, not used here:
# -o "$tmp" = output to tmp, now replaced with tee
# -N = --timestamping = don't download unless newer than local copy, use with mirroring not spidering
# -nH = --no-host-directories
# -P = --directory-prefix (use instead of host directories)
# -m = --mirror (-r -N -l inf no-remove-listing)
#
# wget exits non-zero when a server returns an error response, which is exactly the case
# when broken links exist. With 'set -e -o pipefail' in effect that would abort the script
# right here, before the summary is printed, so the pipeline status is captured instead
crawl_status=0
wget \
    --spider \
    -r \
    -nd \
    -nv \
    -l 1 \
    -w 2 --random-wait \
    -H \
    "$url" 2>&1 |
    tee "$tmp" || crawl_status=$?

if ! grep -q 'Found no broken links' "$tmp"; then
    echo
    echo "Broken links:"
    # -B1 also prints the line preceding each match;
    # a log without any matching line must not abort the script before the exit code is set
    grep -B1 'broken link!' "$tmp" || :
    # never report success unless wget confirmed a clean crawl
    if [ "$crawl_status" -eq 0 ]; then
        crawl_status=1
    fi
fi
# Propagate wget's own status (or 1 from above) so callers can still detect failure
exit "$crawl_status"