#!/usr/bin/env bash
# vim:ts=4:sts=4:sw=4:et
#
# Author: Hari Sekhon
# Date: 2020-03-16 14:28:43 +0000 (Mon, 16 Mar 2020)
#
# License: see accompanying Hari Sekhon LICENSE file
#
# If you're using my code you're welcome to connect with me on LinkedIn and optionally send me feedback to help steer this or other code I publish
#

# Exports Cloudera Navigator logs from the underlying PostgreSQL database to files in the local directory
#
# Each matching table is exported to its own CSV file under $logdir, then gzip-compressed.
# Tables that export as empty or header-only are removed instead of being kept.
#
# All command line arguments are passed through to the helper script that lists the tables.
#
# FILTER environment variable will restrict to matching fully qualified tables (<db>.<schema>.<table>)
#
# CSV Output Format is dependent on database columns and can change, but at time of writing was:
#
# HDFS logs:
#
# id,service_name,username,ip_addr,event_time,operation,src,dest,permissions,allowed,impersonator,delegation_token_id
#
# Tested on AWS RDS PostgreSQL 9.5.15
#
# For individual table export timings set \timing in ~/.psqlrc

set -eu  # -o pipefail
[ -n "${DEBUG:-}" ] && set -x
srcdir="$(dirname "$0")"

# Provides tstamp(), which is used for all progress messages below.
# NOTE(review): the file name after lib/ was missing in the copy under review (it sourced the
# bare directory, which fails); lib/utils.sh is assumed - confirm against the repository.
# shellcheck disable=SC1090
. "$srcdir/lib/utils.sh"

# only export tables matching this regex
export FILTER='\.[[:alnum:]]+_audit_events_'

# if you only want Hive + Impala logs to determine table access patterns
#export FILTER='\.(hive|impala)_audit_events_'

# NOTE(review): logdir was never assigned in the copy under review (fatal under set -u).
# The directory name is taken from the commented-out \copy commands below - confirm.
logdir="$PWD/cloudera_navigator_logs"

tstamp "Exporting Cloudera Navigator logs from PostgreSQL database:"
echo >&2

# doesn't seem to like \copy no matter how many backslashes
#"$srcdir/" "
#select replace('exporting {table}', '\"', '');
#\\copy (SELECT * FROM {db}.{schema}.{table}) TO replace('cloudera_navigator_logs/{db}.{schema}.{table}.csv', '\"', '') WITH (FORMAT CSV, HEADER);
#" "$@"

tstamp "logdir = $logdir"
echo >&2

mkdir -pv "$logdir"

time {
    # NOTE(review): the helper script names (table lister and psql wrapper) were missing in the
    # copy under review; postgres_list_tables.sh and psql.sh are assumed - confirm.
    # The lister is expected to print one "<db> <schema> <table>" triple per line.
    "$srcdir/postgres_list_tables.sh" "$@" |
    {
        while read -r db schema table; do
            # skip blank or malformed lines rather than building a broken query
            [ -n "$table" ] || continue

            # echo "SELECT 'Exporting $db.$schema.$table' AS progress;"
            # echo "\\copy (SELECT * FROM \"$db\".\"$schema\".\"$table\") TO 'cloudera_navigator_logs/$db.$schema.$table.csv' WITH (FORMAT CSV, HEADER);"
            #done |
            #"$srcdir/" "$@"

            filename="$logdir/$db.$schema.$table.csv"

            tstamp "Exporting $db.$schema.$table: "

            # would get overwritten anyway but removing to detect when psql errors out without non-zero exit code
            rm -fv -- "$filename"

            # errexit is ignored inside this loop because it runs on the left-hand side of
            # '|| exit', so the exit status is checked explicitly.
            # stdin is redirected so the command cannot consume the table list feeding the loop.
            if ! "$srcdir/psql.sh" -c "\\copy (SELECT * FROM \"$db\".\"$schema\".\"$table\") TO '$filename' WITH (FORMAT CSV, HEADER);" </dev/null; then
                echo "ERROR: export of $db.$schema.$table failed" >&2
                exit 1
            fi

            if ! [ -f "$filename" ]; then
                echo "ERROR: export of $db.$schema.$table did not create $filename" >&2
                exit 1
            fi

            # empty
            if ! [ -s "$filename" ]; then
                tstamp "${filename##*/} is empty, removing..."
                rm -f -- "$filename"
                echo >&2
                continue
            fi

            # only a header line
            # reading wc from stdin gives just the count, avoiding the leading padding / file name
            # that differs between GNU and BSD wc
            if [ "$(wc -l < "$filename")" -eq 1 ]; then
                tstamp "${filename##*/} has only header line, removing..."
                rm -f -- "$filename"
                echo >&2
                continue
            fi

            # we run out of space without this as logs can easily be dozens of GB per day per service
            tstamp "compressing $filename"
            # don't background if short on space as big new log will fill faster than old log can be gzipped and is only removed after gzip completes
            # --force overwrite of existing gzip logs
            gzip -9 --force "$filename" &
            echo >&2
        done

        tstamp "waiting for background log compression to finish..."
        # the gzip jobs are children of this pipeline subshell, so the wait has to happen here -
        # a wait in the parent shell would return immediately
        wait
        echo >&2
    } || exit $?
}

tstamp "Cloudera Navigator PostgreSQL exports finished"
echo >&2