#!/usr/bin/env bash
#  vim:ts=4:sts=4:sw=4:et
#
#  Author: Hari Sekhon
#  Date: 2020-03-16 14:28:43 +0000 (Mon, 16 Mar 2020)
#
#  https://github.com/HariSekhon/DevOps-Bash-tools
#
#  License: see accompanying Hari Sekhon LICENSE file
#
#  If you're using my code you're welcome to connect with me on LinkedIn and optionally send me feedback to help steer this or other code I publish
#
#  https://www.linkedin.com/in/HariSekhon
#

# Exports Cloudera Navigator logs from the underlying PostgreSQL database to files in the local directory
#
# Each exported table is written to ./cloudera_navigator_logs/<db>.<schema>.<table>.csv and then gzipped.
# Tables that export as empty or header-only files are removed instead of compressed.
#
# All command line arguments are passed through to postgres_list_tables.sh
#
# FILTER environment variable will restrict to matching fully qualified tables (<db>.<schema>.<table>)
# - note this script sets and exports FILTER itself further down
#
# CSV Output Format is dependent on database columns and can change, but at time of writing was:
#
# HDFS logs:
#
# id,service_name,username,ip_addr,event_time,operation,src,dest,permissions,allowed,impersonator,delegation_token_id
#
# Hive logs:
#
# id,event_time,allowed,service_name,username,ip_addr,operation,database_name,object_type,table_name,operation_text,impersonator,resource_path,object_usage_type
#
# Impala logs:
#
# id,event_time,allowed,service_name,username,impersonator,ip_addr,operation,query_id,session_id,status,database_name,object_type,table_name,privilege,operation_text
#
# Tested on AWS RDS PostgreSQL 9.5.15

# For individual table export timings set \timing in ~/.psqlrc

# NOTE(review): pipefail was left disabled by the original author - reason not visible here, so kept as is
set -eu  # -o pipefail
[ -n "${DEBUG:-}" ] && set -x
srcdir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# shellcheck disable=SC1090,SC1091
. "$srcdir/lib/utils.sh"

logdir="$PWD/cloudera_navigator_logs"

# only export tables matching this regex
export FILTER='\.[[:alnum:]]+_audit_events_'
# if you only want Hive + Impala logs to determine table access patterns
#export FILTER='\.(hive|impala)_audit_events_'

# don't background gzip's if filesystem < 30GB free as filesystem will fill up faster than gzip can complete and remove original files to free space
MIN_FILESYSTEM_MB=30000

tstamp "Exporting Cloudera Navigator logs from PostgreSQL database:"
echo >&2

# doesn't seem to like \copy no matter how many backslashes
#"$srcdir/../postgres/postgres_foreach_table.sh" "
#select replace('exporting {table}', '\"', '');
#\\copy (SELECT * FROM {db}.{schema}.{table}) TO replace('cloudera_navigator_logs/{db}.{schema}.{table}.csv', '\"', '') WITH (FORMAT CSV, HEADER);
#" "$@"

tstamp "logdir = $logdir"
echo >&2
mkdir -pv "$logdir"

time {
    # The right-hand side of the pipe runs in a subshell, so the background gzip jobs are children
    # of that subshell - the 'wait' must therefore live inside the same group as the loop,
    # a 'wait' in the parent shell would return immediately without waiting for any of them
    "$srcdir/../postgres/postgres_list_tables.sh" "$@" |
    {
        while read -r db schema table; do
        #    echo "SELECT 'Exporting $db.$schema.$table' AS progress;"
        #    echo "\\copy (SELECT * FROM \"$db\".\"$schema\".\"$table\") TO 'cloudera_navigator_logs/$db.$schema.$table.csv' WITH (FORMAT CSV, HEADER);"
        #done |
        #"$srcdir/../postgres/psql.sh" "$@"
            filename="$logdir/$db.$schema.$table.csv"
            tstamp "Exporting $db.$schema.$table: "
            rm -fv -- "$filename"  # would get overwritten anyway but removing to detect when psql errors out without non-zero exit code
            # 'set -e' is not in effect here because this group is on the left of '|| exit',
            # so the exit code must be checked explicitly or a partial export would be compressed as if valid
            if ! psql.sh -c "\\copy (SELECT * FROM \"$db\".\"$schema\".\"$table\") TO '$filename' WITH (FORMAT CSV, HEADER);" ||
               ! [ -f "$filename" ]; then
                tstamp "ERROR: EXPORT FAILED"
                exit 1
            fi
            # empty
            if ! [ -s "$filename" ]; then
                tstamp "${filename##*/} is empty, removing..."
                rm -f -- "$filename"
                echo >&2
                continue
            fi
            # only a header line
            # read via stdin + arithmetic expansion because BSD/macOS wc left-pads the count with spaces
            line_count=$(( $(wc -l < "$filename") ))
            if [ "$line_count" -eq 1 ]; then
                tstamp "${filename##*/} has only header line, removing..."
                rm -f -- "$filename"
                echo >&2
                continue
            fi
            # we run out of space without this as logs can easily be dozens of GB per day per service
            tstamp "compressing $filename"
            # -P keeps each filesystem on a single line so that column 4 is always the available space,
            # and check the filesystem actually being written to rather than the current directory
            filesystem_free_mb="$(df -Pm "$logdir" | awk 'NR == 2 {print $4}')"
            # if free space could not be determined, take the safe path and compress in the foreground
            if ! [[ "$filesystem_free_mb" =~ ^[0-9]+$ ]] ||
               [ "$filesystem_free_mb" -lt "$MIN_FILESYSTEM_MB" ]; then
                # --force overwrite of existing gzip logs
                gzip -9 --force "$filename"
            else
                gzip -9 --force "$filename" &
            fi
            echo >&2
        done
        tstamp "waiting for background log compression to finish..."
        wait
    } || exit $?
    echo >&2
    tstamp "Cloudera Navigator PostgreSQL exports finished"
    echo >&2
}