#!/usr/bin/env bash
# vim:ts=4:sts=4:sw=4:et
#
# Author: Hari Sekhon
# Date: 2020-03-06 12:03:19 +0000 (Fri, 06 Mar 2020)
#
# https://github.com/harisekhon/bash-tools
#
# License: see accompanying Hari Sekhon LICENSE file
#
# If you're using my code you're welcome to connect with me on LinkedIn and optionally send me feedback to help steer this or other code I publish
#
# https://www.linkedin.com/in/harisekhon
#
# Script to download all historical audit logs from Cloudera Navigator from 2009 to present
#
# 2009 was Cloudera's founding year so we don't search for history past that since it can never exist
#
# Uses adjacent cloudera_manager_audit.sh, see comments there for more details
#
# Tested on Cloudera Enterprise 5.10
# Strict mode: abort on unhandled errors, unset variables and failures anywhere in a pipeline
set -euo pipefail

# Setting DEBUG to any non-empty value turns on command tracing
if [ -n "${DEBUG:-}" ]; then
    set -x
fi

# Absolute path of the directory containing this script, used to locate the adjacent helper scripts
srcdir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# shellcheck disable=SC1090
. "$srcdir/lib/utils.sh"

# Default exit handler - logs ERROR on any exit until the end of the script clears it
trap 'tstamp ERROR' exit

# Whitespace separated list of services to download audit logs for.
# Kept as a plain string because the download loop relies on word splitting to iterate it.
services="
hive
impala
hdfs
hbase
scm
"

# slightly better compression but takes forever, even slow to decompress
#compress_cmd="bzip2 -9 -c"
#compress_cmd="gzip -9 -c"
#ext="gz"

# An optional leading service==<name> argument restricts the run to that one service,
# it is consumed here so that the remaining args can be passed through untouched
case "${1:-}" in
    service==*)
        single_service="${1#service==}"
        shift
        ;;
esac
# Downloads one calendar month of Cloudera Navigator audit logs for one service into
# navigator_audit_<year>-<month>_<service>.csv in the current working directory
#
# Logs which already pass validate_log are skipped, so the script can be re-run to fill in gaps
#
# Arguments:
#   $1  - year, eg. 2020
#   $2  - month, with or without a leading zero, eg. 3 or 03
#   $3  - service name, sent to Navigator as the filter service==<service>
#   $@  - any remaining args are passed through to cloudera_navigator_audit_logs.sh
# Globals:
#   srcdir (read) - directory containing cloudera_navigator_audit_logs.sh and progress_dots.sh
# Returns:
#   0 in every handled case - download failures are deliberately tolerated (see below)
download_audit_logs(){
    # NOTE: validate_log may read 'year' and 'month' from this function's scope,
    #       so these two locals must keep their names and month must stay zero padded
    local year="$1"
    local month
    local service="$3"
    local end_year
    local end_month
    local log
    local log_bytes

    # 10# forces base 10 so that 08 and 09 aren't rejected as invalid octal, %02d re-adds the zero padding
    printf -v month '%02d' "$((10#$2))"
    shift 3

    log="navigator_audit_${year}-${month}_${service}.csv"

    if validate_log "$log" "$year" "$month"; then
        tstamp "Skipping previously completed log $log..."
        echo >&2
        return
    fi

    tstamp "Querying Cloudera Navigator for $year-$month logs for $service"

    # the query range ends at the first day of the following month, rolling over the year after December
    if (( 10#$month == 12 )); then
        end_year=$((year + 1))
        end_month=01
    else
        end_year="$year"
        printf -v end_month '%02d' "$((10#$month + 1))"
    fi

    # Only arm the partial file removal while the download is actually in flight, otherwise an
    # interrupt or error after a skip or a finished download would delete a complete log
    # expand $log now
    # shellcheck disable=SC2064
    trap "tstamp ERROR; tstamp 'Removing partial log file for restartability without audit gaps: '; rm -fv '$log'" exit

    # won't output a newline so the contents of next command will be timestamp prefixed
    tstamp
    # don't let a random 401 stop from downloading other logs, can go back and fill in the gaps later by re-running
    # Navigator returns zero byte logs without headers without error so this || : is not the cause of not catching zero byte logs, which we have to check for separately anyway
    "$srcdir/cloudera_navigator_audit_logs.sh" "$year-$month-01T00:00:00" "$end_year-$end_month-01T00:00:00" "service==$service" "$@" | "$srcdir/progress_dots.sh" > "$log" || :

    # the download is no longer in flight, so go back to the script's default exit handler
    trap 'tstamp ERROR' exit

    log_bytes="$(stat_bytes "$log")"
    tstamp "$log = $log_bytes bytes"
    if [ "$log_bytes" = 0 ]; then
        tstamp "ERROR: Navigator returned zero byte audit log for $log, not even containing the headers row!"
        return
    fi

    #local compressed_log="$log.$ext"
    if validate_log "$log" "$year" "$month"; then
        # compression of completed logs is currently disabled, kept for reference
        #tstamp "Compressing audit log: $log > $compressed_log"
        # want splitting
        #$compress_cmd "$log" > "$compressed_log" &
        :
    else
        tstamp "WARNING: $log doesn't look complete, must check"
    fi
    echo >&2
}
# Checks whether an audit log file on disk looks like a complete download for its month
#
# Arguments:
#   $1 - path to the log file
#   $2 - year the log covers              (optional, falls back to the caller's $year)
#   $3 - zero padded month the log covers (optional, falls back to the caller's $month)
# Returns:
#   0 - the file is exactly the headers-only size, or contains a line starting "<year>-<month>-0
#   1 - the file is missing, empty, or neither of the above checks passes
validate_log(){
    local log="$1"
    # explicit args are preferred, the fallback keeps existing callers working which rely on
    # 'year' and 'month' being visible from their own scope
    local log_year="${2:-${year:-}}"
    local log_month="${3:-${month:-}}"
    # size at which the file is treated as containing nothing but the headers row
    local -r headers_only_bytes=558
    local log_bytes

    # a single newline in the log file passes -s, so dive in to deeper checks below
    # to make sure we have what looks like enough data
    if ! [ -s "$log" ]; then
        return 1
    fi

    log_bytes="$(stat_bytes "$log")"
    tstamp "$log = $log_bytes bytes"

    if [ "$log_bytes" = "$headers_only_bytes" ]; then
        tstamp "$log has only headers - inferring there are no logs for that date range"
        return 0
    fi
    #if [ "$log_bytes" -gt 10240 ]; then
    #    tstamp "Skipping $log since it already exists and is > 10MB"
    #    return 0
    #fi

    # without a year and month there is nothing to match against, so can't call it complete
    if [ -z "$log_year" ] || [ -z "$log_month" ]; then
        return 1
    fi

    # Per the original author's notes the logs are ordered newest first and partial downloads get
    # cut off part way through, so an entry from the first 9 days of the month suggests we got all
    # the way to the bottom. Not checking for the 1st specifically because a service may well not
    # have been used on any one given day
    if grep -q -- "^\"$log_year-$log_month-0" "$log"; then
        tstamp "$log contains logs for $log_year-$log_month-0*, looks complete"
        return 0
    fi
    return 1
}
current_year="$(date +%Y)"
current_month="$(date +%m)"

# Walk backwards from the current year to 2009, and from December to January within each year
#
# Uses an arithmetic for loop rather than seq piped to tac, because tac isn't available on Mac
# without GNU coreutils and a failing command substitution in a for list doesn't trip set -e,
# which would leave the loop silently iterating over nothing
for ((year = current_year; year >= 2009; year--)); do
    for month in {12..1}; do
        # Navigator returns forbidden if querying in the future
        # 10# forces base 10 because date +%m is zero padded and 08 / 09 are not valid octal
        if (( year == current_year && month > 10#$current_month )); then
            continue
        fi
        if [ -n "${single_service:-}" ]; then
            download_audit_logs "$year" "$month" "$single_service" "$@"
        else
            # word splitting of the whitespace separated services list is intentional
            for service in $services; do
                download_audit_logs "$year" "$month" "$service" "$@"
            done
        fi
    done
done
tstamp "Finished querying Cloudera Navigator API"
tstamp "Waiting for log compression to finish"
# blocks until any background jobs have finished
wait
tstamp "DONE"
# clear the exit trap so that a successful run doesn't log ERROR on the way out
trap - exit