#!/usr/bin/env bash
# vim:ts=4:sts=4:sw=4:et
#
# Author: Hari Sekhon
# Date: 2022-01-17 18:11:09 +0000 (Mon, 17 Jan 2022)
#
# https://github.com/HariSekhon/DevOps-Bash-tools
#
# License: see accompanying Hari Sekhon LICENSE file
#
# If you're using my code you're welcome to connect with me on LinkedIn and optionally send me feedback to help steer this or other code I publish
#
# https://www.linkedin.com/in/HariSekhon
#

set -euo pipefail
[ -n "${DEBUG:-}" ] && set -x
srcdir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# shellcheck disable=SC1090
. "$srcdir/lib/utils.sh"

# shellcheck disable=SC2034,SC2154
usage_description="
Checks for broken URL links in a given file or directory tree

Sends HEAD requests and follows redirects - as long as the link redirects and succeeds it'll still pass, as this is most relevant to users and READMEs

Accepts HTTP 2xx/3xx status codes as well as the following to avoid false positives:
- HTTP 400 (bad request) - eg. a valid API URL may complain we're not sending the required parameters/headers/post body
- HTTP 401 (unauthorized)
- HTTP 403 (forbidden)
- HTTP 405 (method not allowed, ie. HEAD)
- HTTP 429 (rate limiting)

Ignores:
- Private addresses (localhost, .local, .svc, .cluster.local)
- Loopback IP (127.0.0.1)
- Private IPs (10.x.x.x, 172.16.x.x, 192.168.x.x)
- APIPA IPs (169.254.x.x)

To ignore links created with variables or otherwise composed in a way we can't test directly, set URL_LINKS_IGNORED to a list of URLs or ERE regexes, one per line
To ignore links without dots in them, ie. not public domains or IP addresses but most likely internal shortname services, set IGNORE_URLS_WITHOUT_DOTS to any value

If run in CI, runs 'git ls-files' to avoid scanning other local checkouts or git submodules
If you want to filter to specific files only, such as README.md, set URL_LINKS_FILE_FILTER='README.md' - matched by name, path or ERE regex

Examples:

# Scan all URLs in all files under your \$PWD, or in CI all committed files under your \$PWD

    ${0##*/}

# Scan URLs in all files found under the 'src' directory

    ${0##*/} src

# Scan URLs in all files called README.md under your local directory (local mode only, not CI)

    ${0##*/} . -name README.md

# Ignore URLs we know won't work because they're samples / fake or constructed with variables we can't determine etc:

    export URL_LINKS_IGNORED='
    http://myplaceholder
    nonexistent.com
    https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK
    https://some.website.com/downloads/v\$version/some.tar.gz
    '

    ${0##*/}
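
# Scan only README files and skip URLs without dots (eg. internal shortname services) - an
# example combining the two env vars documented above:

    export URL_LINKS_FILE_FILTER='README\.md'
    export IGNORE_URLS_WITHOUT_DOTS=1

    ${0##*/}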
"
|
|
|
|
# used by usage() in lib/utils.sh
# shellcheck disable=SC2034
usage_args="[<file_or_directory> <find_or_git_options>]"

help_usage "$@"

#min_args 1 "$@"

section "URL Link Checks"

start_time="$(start_timer)"

startpath="${1:-.}"
shift || :

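# trap_cmd comes from the sourced lib/utils.sh and, as used here, registers 'echo >&2' to
# run on exit so the progress dots printed below don't leave the cursor mid-line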
trap_cmd 'echo >&2'

check_bin curl

check_url_link(){
    local url="$1"
    if [ -n "${VERBOSE:-}" ] || [ -n "${DEBUG:-}" ]; then
        echo -n "$url => " >&2
    fi
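    # 'command' bypasses any curl() shell function or alias, -I sends a HEAD request,
    # -L follows redirects, -f makes curl exit non-zero on HTTP errors (swallowed by
    # '|| :'), and -w '%{http_code}' still prints the final status code for the regex
    # test below to judge for itself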
    status_code="$(command curl -sSILf --retry 3 --retry-delay 2 "$url" -o /dev/null -w "%{http_code}" 2>/dev/null || :)"
    if [ -n "${VERBOSE:-}" ] || [ -n "${DEBUG:-}" ]; then
        echo "$status_code" >&2
    else
        echo -n '.' >&2
    fi
    # DockerHub https://registry.hub.docker.com/v2 returns 401
    # GitHub returns HTTP 429 for too many requests
    if ! [[ "$status_code" =~ ^([23][[:digit:]]{2}|400|401|403|405|429)$ ]]; then
        echo >&2
        echo "Broken Link: $url" >&2
        echo >&2
        echo 1
        return 1
    fi
}

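# eg. check_url_link 'https://github.com' prints a progress dot and returns 0, while a dead
# link prints 'Broken Link: <url>' to stderr and echoes a 1 to stdout, which the tally at
# the bottom of this script sums to count failures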
# Mac's BSD grep has a bug around -f ignores
if is_mac; then
    grep(){
        command ggrep "$@"
    }
fi

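# ggrep is GNU grep, typically installed on macOS via Homebrew's 'grep' formula
# ('brew install grep'), which prefixes the binary with a 'g'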
timestamp "Aggregating unique URLs from files under '$startpath'"
|
|
# filtering out LinkedIn.com which prevents crawling with HTTP/2 999 code
|
|
# GitHub returns HTTP 429 for too many requests
|
|
#-e 'https://github\.com/marketplace' \
|
|
urls="$(
|
|
if is_CI; then
|
|
git ls-files "$startpath" "$@"
|
|
else
|
|
find -L "$startpath" -type f "$@" |
|
|
{ grep -v -e '/\.git/' -e '/\.svn/' -e '/\.hg/' || : ; }
|
|
fi |
|
|
if [ -n "${URL_LINKS_FILE_FILTER:-}" ]; then
|
|
grep -E "$URL_LINKS_FILE_FILTER" || :
|
|
else
|
|
cat
|
|
fi |
|
|
while read -r filename; do
|
|
[ -f "$filename" ] || continue # protects against symlinks to dirs returned by 'git ls-files'
|
|
# $url_regex defined in lib/utils.sh
|
|
# shellcheck disable=SC2154
|
|
{ grep -E "$url_regex" "$filename" || : ; } |
|
|
#sed 's/#.*//; /^[[:space:]]*$/d' |
|
|
{ grep -Eiv \
|
|
-e '\$' \
|
|
-e 'localhost' \
|
|
-e '\.svc$' \
|
|
-e '.local$' \
|
|
-e '\.cluster\.local' \
|
|
-e 'domain\.com' \
|
|
-e 'acmecorp\.com' \
|
|
-e 'example\.com' \
|
|
-e 'linkedin\.com' \
|
|
-e '(169\.254\.)' \
|
|
-e '(172\.16\.)' \
|
|
-e '(192\.168\.)' \
|
|
-e '10\.[[:digit:]]+\.[[:digit:]]+\.[[:digit:]]+' \
|
|
-e '127.0.0.1' \
|
|
-e '\.\.\.' \
|
|
-e 'x\.x\.x\.x' || : ; } |
|
|
{ grep -Eo "$url_regex" || : ; } |
|
|
if [ -n "${URL_LINKS_IGNORED:-}" ]; then
|
|
grep -Eivf <(
|
|
tr '[:space:]' '\n' <<< "$URL_LINKS_IGNORED" |
|
|
sed 's/^[[:space:]]*//;
|
|
s/[[:space:]]*$//;
|
|
/^[[:space:]]*$/d'
|
|
)
|
|
else
|
|
cat
|
|
fi |
|
|
if [ -n "${IGNORE_URLS_WITHOUT_DOTS:-}" ]; then
|
|
grep -E 'https?://[^/]+\.[^/]+' || :
|
|
else
|
|
cat
|
|
fi
|
|
done |
|
|
sort -uf
|
|
)"
|
|
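# sort -uf deduplicates case-insensitively, so the same URL differing only in case is
# only tested once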
urls="${urls##[[:space:]]}"
|
|
urls="${urls%%[[:space:]]}"
|
|
echo >&2
|
|
|
|
if is_blank "$urls"; then
    echo "No URLs found" >&2
    exit 0
fi

url_count="$(wc -l <<< "$urls" | sed 's/^[[:space:]]*//; s/[[:space:]]*$//')"

timestamp "Checking $url_count unique URLs"
echo >&2

tests=$(
    while read -r url; do
        echo "check_url_link '$url'"
    done <<< "$urls"
)

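# $tests now holds one shell command per line, eg. check_url_link 'https://github.com',
# forming a ready-made job list to feed to GNU parallel below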
# export function to use in parallel
export -f check_url_link
export SHELL=/bin/bash  # Debian docker container doesn't set this and defaults to sh, failing to find exported function

set +eo pipefail
tally="$(parallel -j 10 <<< "$tests")"
exit_code=$?
set -eo pipefail

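# GNU parallel exits 0 if every job succeeded, otherwise with the number of failed jobs
# (capped at 101 for very large failure counts), and each failed check_url_link echoed a
# '1' into $tally for the sum below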
broken_count="$(awk '{sum+=$1} END{print sum}' <<< "$tally")"

echo >&2
time_taken "$start_time"
echo >&2

if [ "$exit_code" -eq 0 ]; then
    section2 "URL links passed"
else
    echo "ERROR: $broken_count/$url_count broken links detected!" >&2
    echo >&2
    section2 "URL Links FAILED"
    exit 1
fi