#!/usr/bin/env bash
#  vim:ts=4:sts=4:sw=4:et
#
#  Author: Hari Sekhon
#  Date: 2022-01-17 18:11:09 +0000 (Mon, 17 Jan 2022)
#
#  https://github.com/HariSekhon/DevOps-Bash-tools
#
#  License: see accompanying Hari Sekhon LICENSE file
#
#  If you're using my code you're welcome to connect with me on LinkedIn and optionally send me feedback to help steer this or other code I publish
#
#  https://www.linkedin.com/in/HariSekhon
#

set -euo pipefail
[ -n "${DEBUG:-}" ] && set -x
srcdir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# shellcheck disable=SC1090
. "$srcdir/lib/utils.sh"

# shellcheck disable=SC2034,SC2154
usage_description="
Checks for broken URL links in a given file or directory tree

Sends HEAD requests and follows redirects - as long as the link redirects and ultimately succeeds it still passes,
since that is what matters most to users and READMEs

Accepts HTTP 2xx/3xx status codes as well as the following to avoid false positives:

- HTTP 400 (bad request)  - eg. a valid API URL may complain we're not sending the required parameters/headers/post body
- HTTP 401 (unauthorized)
- HTTP 403 (forbidden)
- HTTP 405 (method not allowed, ie. HEAD not supported)
- HTTP 429 (rate limiting)

Ignores:

- Private addresses (localhost, .local, .svc, .cluster.local)
- Loopback IP (127.0.0.1)
- Private IPs (10.x.x.x, 172.16.x.x, 192.168.x.x)
- APIPA IPs (169.254.x.x)

To ignore links created with variables or otherwise composed in a way that can't be tested directly,
set URL_LINKS_IGNORED to a list of URLs, one per line

To ignore links without dots in them, ie. not public URLs such as domains or IP addresses,
which are most likely internal shortname services, set IGNORE_URLS_WITHOUT_DOTS to any value

If run in CI, runs 'git ls-files' to avoid scanning other local checkouts or git submodules

If you want to filter to specific files only, such as README.md, you can set URL_LINKS_FILE_FILTER='README.md'
to match by name, path or ERE regex

Examples:

    # Scan all URLs in all files under your \$PWD, or in CI all committed files under your \$PWD

        ${0##*/}

    # Scan URLs in all files found under the 'src' directory

        ${0##*/} src

    # Scan URLs in all files called README.md under your local directory (local mode only, not CI)

        ${0##*/} . -name README.md

    # Ignore URLs we know won't work because they're samples / fake or constructed with variables we can't determine etc:

        export URL_LINKS_IGNORED='
        http://myplaceholder
        nonexistent.com
        https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK
        https://some.website.com/downloads/v\$version/some.tar.gz
        '
        ${0##*/}
"

# used by usage() in lib/utils.sh
# shellcheck disable=SC2034
usage_args="[<starting_path> <find_options>]"

help_usage "$@"

#min_args 1 "$@"

section "URL Link Checks"

start_time="$(start_timer)"

startpath="${1:-.}"
shift || :

trap_cmd 'echo >&2'

check_bin curl

check_url_link(){
    local url="$1"
    if [ -n "${VERBOSE:-}" ] || [ -n "${DEBUG:-}" ]; then
        echo -n "$url => " >&2
    fi
    # -I sends a HEAD request, -L follows redirects, -f makes curl return non-zero on HTTP errors
    # (swallowed by '|| :' so the status code captured via -w can be inspected below)
    status_code="$(command curl -sSILf --retry 3 --retry-delay 2 "$url" -o /dev/null -w "%{http_code}" 2>/dev/null || :)"
    if [ -n "${VERBOSE:-}" ] || [ -n "${DEBUG:-}" ]; then
        echo "$status_code" >&2
    else
        echo -n '.' >&2
    fi
    # DockerHub https://registry.hub.docker.com/v2 returns 401
    # GitHub returns HTTP 429 for too many requests
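    # accept any 2xx / 3xx, plus 400 / 401 / 403 / 405 / 429, which usually mean the
    # endpoint exists but rejects anonymous, HEAD or unparameterized requests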
    if ! [[ "$status_code" =~ ^([23][[:digit:]]{2}|400|401|403|405|429)$ ]]; then
        echo >&2
        echo "Broken Link: $url" >&2
        echo >&2
        echo 1  # emitted to stdout so the awk sum further down can tally broken links
        return 1
    fi
}

# Mac's BSD grep has a bug around -f ignore files, so use GNU grep (ggrep) instead
if is_mac; then
    grep(){
        command ggrep "$@"
    }
fi

timestamp "Aggregating unique URLs from files under '$startpath'"

# filtering out LinkedIn.com, which blocks crawling and returns an HTTP/2 999 code
# GitHub returns HTTP 429 for too many requests
#-e 'https://github\.com/marketplace' \
urls="$(
    if is_CI; then
        git ls-files "$startpath" "$@"
    else
        find -L "$startpath" -type f "$@" |
        { grep -v -e '/\.git/' -e '/\.svn/' -e '/\.hg/' || : ; }
    fi |
    if [ -n "${URL_LINKS_FILE_FILTER:-}" ]; then
        grep -E "$URL_LINKS_FILE_FILTER" || :
    else
        cat
    fi |
    while read -r filename; do
        [ -f "$filename" ] || continue  # protects against symlinks to dirs returned by 'git ls-files'
        # $url_regex defined in lib/utils.sh
        # shellcheck disable=SC2154
        { grep -E "$url_regex" "$filename" || : ; } |
        #sed 's/#.*//; /^[[:space:]]*$/d' |
        { grep -Eiv \
            -e '\$' \
            -e 'localhost' \
            -e '\.svc$' \
            -e '\.local$' \
            -e '\.cluster\.local' \
            -e 'domain\.com' \
            -e 'acmecorp\.com' \
            -e 'example\.com' \
            -e 'linkedin\.com' \
            -e '(169\.254\.)' \
            -e '(172\.16\.)' \
            -e '(192\.168\.)' \
            -e '10\.[[:digit:]]+\.[[:digit:]]+\.[[:digit:]]+' \
            -e '127\.0\.0\.1' \
            -e '\.\.\.' \
            -e 'x\.x\.x\.x' || : ; } |
        { grep -Eo "$url_regex" || : ; } |
        if [ -n "${URL_LINKS_IGNORED:-}" ]; then
            grep -Eivf <(
                tr '[:space:]' '\n' <<< "$URL_LINKS_IGNORED" |
                sed 's/^[[:space:]]*//; s/[[:space:]]*$//; /^[[:space:]]*$/d'
            ) || :
        else
            cat
        fi |
        if [ -n "${IGNORE_URLS_WITHOUT_DOTS:-}" ]; then
            grep -E 'https?://[^/]+\.[^/]+' || :
        else
            cat
        fi
    done |
    sort -uf
)"
urls="${urls##[[:space:]]}"
urls="${urls%%[[:space:]]}"
echo >&2

if is_blank "$urls"; then
    echo "No URLs found" >&2
    exit 0
fi

url_count="$(wc -l <<< "$urls" | sed 's/^[[:space:]]*//; s/[[:space:]]*$//')"

timestamp "Checking $url_count unique URLs"
echo >&2

tests=$(
    while read -r url; do
        echo "check_url_link '$url'"
    done <<< "$urls"
)

# export the function so the parallel subshells can call it
export -f check_url_link
export SHELL=/bin/bash  # Debian docker container doesn't set this and defaults to sh, failing to find the exported function

set +eo pipefail
tally="$(parallel -j 10 <<< "$tests")"
exit_code=$?
set -eo pipefail

broken_count="$(awk '{sum+=$1} END{print sum}' <<< "$tally")"

echo >&2
time_taken "$start_time"
echo >&2

if [ "$exit_code" -eq 0 ]; then
    section2 "URL links passed"
else
    echo "ERROR: $broken_count/$url_count broken links detected!" >&2
    echo >&2
    section2 "URL Links FAILED"
    exit 1
fi