#!/usr/bin/env bash
#  vim:ts=4:sts=4:sw=4:et
#
#  Author: Hari Sekhon
#  Date: 2020-04-01 13:03:12 +0100 (Wed, 01 Apr 2020)
#
#  https://github.com/harisekhon/bash-tools
#
#  License: see accompanying Hari Sekhon LICENSE file
#
#  If you're using my code you're welcome to connect with me on LinkedIn and optionally send me feedback to help steer this or other code I publish
#
#  https://www.linkedin.com/in/harisekhon
#

# Crawls a URL argument and finds broken links, throttling to 1 link every 2 seconds

set -euo pipefail
[ -n "${DEBUG:-}" ] && set -x
srcdir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# shellcheck disable=SC1090
. "$srcdir/lib/utils.sh"

usage(){
    if [ -n "$*" ]; then
        echo "$*"
    fi
    cat <<EOF
Crawls a URL argument and finds broken links

usage: ${0##*/} <url>

EOF
    exit 3
}

if [ $# != 1 ]; then
    usage "no url argument given"
fi

url="$1"

if ! [[ "$url" =~ https?:// ]]; then
    usage "invalid url argument, must match https?://"
fi

tmp="$(mktemp)"

# want splitting
# shellcheck disable=SC2086
trap 'rm "$tmp"' $TRAP_SIGNALS

# --spider = don't download
# -r = recursive
# -nd / --no-directories = don't create local dirs representing structure
# -nv / --no-verbose = give concise 1 liner information
# -l 1 = crawl 1 level deep (may need to tune this), set to 'inf' for infinite
# -w 2 = wait for 2 secs between requests to avoid tripping defenses
# -H / --span-hosts = follows subdomains + external sites
# -o "$tmp" = output to tmp, now replaced with tee
# -N = --timestamping = don't download unless newer than local copy, use with mirroring not spidering
# -nH = --no-host-directories
# -P = --directory-prefix (use instead of host directories)
# -m = --mirror (-r -N -l inf --no-remove-listing)
#
# wget exits non-zero when it finds broken links, so '|| :' stops 'set -euo pipefail'
# from killing the script before the summary report below
wget \
    --spider \
    -r \
    -nd \
    -nv \
    -l 1 \
    -w 2 --random-wait \
    -H \
    "$url" 2>&1 | tee "$tmp" || :

if ! grep -q 'Found no broken links' "$tmp"; then
    echo
    echo "Broken links:"
    grep -B1 'broken link!' "$tmp"
fi
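
# Example invocation (a sketch only - the filename and URL below are placeholders,
# substitute whatever this file is saved as and the site you want to crawl):
#
#   ./spider_broken_links.sh https://example.com
#
# The wget output streams to the terminal via tee and is also saved to the temp
# file; if wget didn't report 'Found no broken links', the broken links are
# summarised at the end, each preceded by the referring line via grep -B1.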