You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
166 lines
5.8 KiB
Bash
166 lines
5.8 KiB
Bash
#!/usr/bin/env bash
|
|
# vim:ts=4:sts=4:sw=4:et
|
|
#
|
|
# Author: Hari Sekhon
|
|
# Date: 2020-03-06 12:03:19 +0000 (Fri, 06 Mar 2020)
|
|
#
|
|
# https://github.com/harisekhon/bash-tools
|
|
#
|
|
# License: see accompanying Hari Sekhon LICENSE file
|
|
#
|
|
# If you're using my code you're welcome to connect with me on LinkedIn and optionally send me feedback to help steer this or other code I publish
|
|
#
|
|
# https://www.linkedin.com/in/harisekhon
|
|
#
|
|
|
|
# Script to download all historical audit logs from Cloudera Navigator from 2009 to present
|
|
#
|
|
# 2009 was Cloudera's founding year so we don't search for history past that since it can never exist
|
|
#
|
|
# Uses adjacent cloudera_manager_audit.sh, see comments there for more details
|
|
#
|
|
# Tested on Cloudera Enterprise 5.10
|
|
|
|
set -euo pipefail
|
|
[ -n "${DEBUG:-}" ] && set -x
|
|
srcdir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
|
|
# shellcheck disable=SC1090
|
|
source "$srcdir/lib/utils.sh"
|
|
|
|
trap 'tstamp ERROR' exit
|
|
|
|
services="
|
|
hive
|
|
impala
|
|
hdfs
|
|
hbase
|
|
scm
|
|
"
|
|
|
|
# slighty better compression but takes forever, even slow to decompress
|
|
#compress_cmd="bzip2 -9 -c"
|
|
#compress_cmd="gzip -9 -c"
|
|
#ext="gz"
|
|
|
|
if [[ "${1:-}" =~ ^service== ]]; then
|
|
single_service="${1##service==}"
|
|
shift
|
|
fi
|
|
|
|
download_audit_logs(){
|
|
local year="$1"
|
|
local month="${2#0}" # because maths ops + 1 won't work on zero prefixed string, so re-add it later
|
|
local service="$3"
|
|
shift; shift; shift
|
|
if [ "${#month}" = 1 ]; then
|
|
month="0$month"
|
|
fi
|
|
local log="navigator_audit_${year}-${month}_${service}.csv"
|
|
local log_bytes
|
|
# expand now
|
|
# shellcheck disable=SC2064
|
|
trap "tstamp ERROR; tstamp 'Removing partial log file for restartability without audit gaps: '; rm -fv '$log'" exit
|
|
if validate_log "$log"; then
|
|
tstamp "Skipping previously completed log $log..."
|
|
echo >&2
|
|
return
|
|
else
|
|
tstamp "Querying Cloudera Navigator for $year logs for $service"
|
|
month="${month#0}" # because maths ops + 1 won't work on zero prefixed string, so re-add it later
|
|
if [ "$month" = 12 ]; then
|
|
((end_year=year+1))
|
|
end_month=01
|
|
else
|
|
((end_month=month+1))
|
|
end_year="$year"
|
|
fi
|
|
if [ "${#month}" = 1 ]; then
|
|
month="0$month"
|
|
fi
|
|
if [ "${#end_month}" = 1 ]; then
|
|
end_month="0$end_month"
|
|
fi
|
|
# won't output a newline so the contents of next command will be timestamp prefixed
|
|
tstamp
|
|
#time {
|
|
# don't let a random 401 stop from downloading other logs, can go back and fill in the gaps later by re-running
|
|
# Navigator returns zero byte logs without headers without error so this || : is not the cause of not catching zero byte logs, which we have to check for separately anyway
|
|
"$srcdir/cloudera_navigator_audit_logs.sh" "$year-$month-01T00:00:00" "$end_year-$end_month-01T00:00:00" "service==$service" "$@" | "$srcdir/progress_dots.sh" > "$log" || :
|
|
log_bytes="$(stat_bytes "$log")"
|
|
tstamp "$log = $log_bytes bytes"
|
|
if [ "$log_bytes" = 0 ]; then
|
|
tstamp "ERROR: Navigator returned zero byte audit log for $log, not even containing the headers row!"
|
|
return
|
|
fi
|
|
#}
|
|
fi
|
|
#local compressed_log="$log.$ext"
|
|
#if [ -s "$log" ]; then
|
|
if validate_log "$log"; then
|
|
#tstamp "Compressing audit log: $log > $compressed_log"
|
|
# want splitting
|
|
# shellcheck disable=SC2086
|
|
#$compress_cmd "$log" > "$compressed_log" &
|
|
:
|
|
else
|
|
tstamp "WARNING: $log doesn't look complete, must check"
|
|
fi
|
|
echo >&2
|
|
}
|
|
|
|
validate_log(){
|
|
local log="$1"
|
|
# a single newline in the log file trips this so dive in to deeper checks to make sure we have what looks like enough data
|
|
if [ -s "$log" ]; then
|
|
local log_bytes
|
|
log_bytes="$(stat_bytes "$log")"
|
|
tstamp "$log = $log_bytes bytes"
|
|
if [ "$log_bytes" = 558 ]; then
|
|
tstamp "$log has only headers - inferring there are no logs for that date range"
|
|
return 0
|
|
#elif [ "$log_bytes" -gt 10240 ]; then
|
|
# tstamp "Skipping $log since it already exists and is > 10MB"
|
|
# return 0
|
|
#fi
|
|
# audit logs start at $year-12-* at the top, and end at the bottom in $year-01-* - partial logs often get cut off
|
|
# in between, so if we've gotten all the way to January the log is likely complete - tempted to do January 01 but
|
|
# there will probably be some edge case where a service isn't used on New Year's day or the first few days
|
|
# because a lot of people take time off around then, so this is more generic to just check for January
|
|
# can't check for December also being in the log because this would always fail for the current year
|
|
elif grep -q "^\"$year-$month-0" "$log"; then
|
|
tstamp "$log contains logs for $year-$month-0*, looks complete"
|
|
return 0
|
|
fi
|
|
fi
|
|
return 1
|
|
}
|
|
|
|
current_year="$(date +%Y)"
|
|
current_month="$(date +%m)"
|
|
|
|
# works on Mac but seq on Linux doesn't do reverse, outputs nothing
|
|
#for year in $(seq "$current_year" 2009); do
|
|
|
|
# On Mac tac requires gnu coreutils to be installed via Homebrew
|
|
for year in $(seq 2009 "$current_year" | tac); do
|
|
for month in {12..1}; do
|
|
if [ "$year" -eq "$current_year" ] && [ "$month" -gt "$current_month" ]; then
|
|
# Navigator returns forbidden if querying in the future
|
|
continue
|
|
fi
|
|
if [ -n "${single_service:-}" ]; then
|
|
download_audit_logs "$year" "$month" "$single_service" "$@"
|
|
else
|
|
for service in $services; do
|
|
download_audit_logs "$year" "$month" "$service" "$@"
|
|
done
|
|
fi
|
|
done
|
|
done
|
|
tstamp "Finished querying Cloudera Navigator API"
|
|
tstamp "Waiting for log compression to finish"
|
|
wait
|
|
tstamp "DONE"
|
|
trap - exit
|