You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
115 lines
3.9 KiB
Bash
115 lines
3.9 KiB
Bash
#!/usr/bin/env bash
# vim:ts=4:sts=4:sw=4:et
#
# Author: Hari Sekhon
# Date: 2020-03-16 14:28:43 +0000 (Mon, 16 Mar 2020)
#
# https://github.com/harisekhon/bash-tools
#
# License: see accompanying Hari Sekhon LICENSE file
#
# If you're using my code you're welcome to connect with me on LinkedIn and optionally send me feedback to help steer this or other code I publish
#
# https://www.linkedin.com/in/harisekhon
#

# Exports Cloudera Navigator logs from the underlying PostgreSQL database to files in the local directory
#
# FILTER environment variable will restrict to matching fully qualified tables (<db>.<schema>.<table>)
#
# CSV Output Format is dependent on database columns and can change, but at time of writing was:
#
# HDFS logs:
#
# id,service_name,username,ip_addr,event_time,operation,src,dest,permissions,allowed,impersonator,delegation_token_id
#
# Hive logs:
#
# id,event_time,allowed,service_name,username,ip_addr,operation,database_name,object_type,table_name,operation_text,impersonator,resource_path,object_usage_type
#
# Impala logs:
#
# id,event_time,allowed,service_name,username,impersonator,ip_addr,operation,query_id,session_id,status,database_name,object_type,table_name,privilege,operation_text
#
# Tested on AWS RDS PostgreSQL 9.5.15

# For individual table export timings set \timing in ~/.psqlrc

# Abort on any unhandled command failure (-e) and on expansion of unset variables (-u)
# pipefail is left switched off, it is only kept here as a commented-out option
set -e -u # -o pipefail

# Setting DEBUG to any non-empty value in the environment turns on command tracing
if [ -n "${DEBUG:-}" ]; then
    set -x
fi

# Directory this script was invoked from, used to locate the library and sibling scripts
srcdir="$(dirname "$0")"

# Load the shared helper library that lives alongside this script
# shellcheck disable=SC1090
source "$srcdir/lib/utils.sh"

# Directory the CSV exports are written to, created under the current working directory
logdir="$PWD/cloudera_navigator_logs"

# Only export tables matching this regex
#
# A non-empty FILTER already set in the caller's environment is respected rather than overwritten,
# the audit events regex below is only the default
#
# Exported so that child processes can see it
# NOTE(review): presumably consumed by postgres_list_tables.sh - confirm against that script
#
# If you only want Hive + Impala logs to determine table access patterns, run with:
#
#   FILTER='\.(hive|impala)_audit_events_'
#
if [ -z "${FILTER:-}" ]; then
    FILTER='\.[[:alnum:]]+_audit_events_'
fi
export FILTER

# don't background gzip's if filesystem < 30GB free as filesystem will fill up faster than gzip can complete and remove original files to free space
MIN_FILESYSTEM_MB=30000

# Announce the export
# NOTE(review): tstamp is not defined in this file, presumably provided by lib/utils.sh - confirm
tstamp "Exporting Cloudera Navigator logs from PostgreSQL database:"
printf '\n' >&2

# Earlier attempt at a single-pass export, kept for reference:
# doesn't seem to like \copy no matter how many backslashes
#"$srcdir/postgres_foreach_table.sh" "
#select replace('exporting {table}', '\"', '');
#\\copy (SELECT * FROM {db}.{schema}.{table}) TO replace('cloudera_navigator_logs/{db}.{schema}.{table}.csv', '\"', '') WITH (FORMAT CSV, HEADER);
#" "$@"

# Report the output directory and make sure it exists before any export is attempted
tstamp "logdir = $logdir"
printf '\n' >&2
mkdir -p -v "$logdir"

# Succeeds if the given file contains exactly one line, ie. only the CSV header
#
#   $1 - path of the file to check
#
# The file is read via stdin so that wc prints just the count, and the count is normalized through
# arithmetic expansion because BSD / macOS wc left-pads it with spaces - matching the output of
# 'wc -l file' against '^1' never succeeds there and header-only files would be kept and compressed
has_only_header_line(){
    local line_count
    line_count="$(wc -l < "$1")"
    [ "$((line_count))" -eq 1 ]
}

# Exports every table listed by postgres_list_tables.sh to $logdir/<db>.<schema>.<table>.csv,
# discards exports with no data rows and gzips the rest, timing the whole run
#
# Reads:  srcdir, logdir, MIN_FILESYSTEM_MB and the script arguments, which are passed to both
#         postgres_list_tables.sh and psql.sh
# Exits:  1 if an export fails or produces no file, or if any compression fails
time {
    # PIDs of the background gzip jobs, reaped one by one further down so that a failed compression is detected
    gzip_pids=()

    # The loop is fed by process substitution instead of a pipe so that it runs in this shell.
    # As the last stage of a pipeline it would run in a subshell, the background gzip jobs would be
    # children of that subshell and the 'wait' below would return immediately without waiting for them.
    # The exit status of postgres_list_tables.sh itself is not checked here
    while read -r db schema table; do
        #    echo "SELECT 'Exporting $db.$schema.$table' AS progress;"
        #    echo "\\copy (SELECT * FROM \"$db\".\"$schema\".\"$table\") TO 'cloudera_navigator_logs/$db.$schema.$table.csv' WITH (FORMAT CSV, HEADER);"
        #done |
        #"$srcdir/psql.sh" "$@"
        filename="$logdir/$db.$schema.$table.csv"
        tstamp "Exporting $db.$schema.$table: "
        rm -fv "$filename"  # would get overwritten anyway but removing to detect when psql errors out without non-zero exit code
        # psql.sh is called from $srcdir with the script arguments, the same way postgres_list_tables.sh is,
        # so the export uses the same arguments as the table listing
        # stdin comes from /dev/null so that it cannot swallow the table list the loop is still reading
        if ! "$srcdir/psql.sh" -c "\\copy (SELECT * FROM \"$db\".\"$schema\".\"$table\") TO '$filename' WITH (FORMAT CSV, HEADER);" "$@" < /dev/null ||
           ! [ -f "$filename" ]; then
            tstamp "ERROR: EXPORT FAILED"
            exit 1
        fi
        # empty
        if ! [ -s "$filename" ]; then
            tstamp "${filename##*/} is empty, removing..."
            rm -f "$filename"
            echo >&2
            continue
        fi
        # only a header line
        if has_only_header_line "$filename"; then
            tstamp "${filename##*/} has only header line, removing..."
            rm -f "$filename"
            echo >&2
            continue
        fi
        # we run out of space without this as logs can easily be dozens of GB per day per service
        tstamp "compressing $filename"
        # free space of the filesystem the logs are actually written to
        # -P forces one output line per filesystem so that column 4 is always the available space
        filesystem_free_mb="$(df -Pm "$logdir" | awk 'NR == 2 {print $4}')"
        if [ "$filesystem_free_mb" -lt "$MIN_FILESYSTEM_MB" ]; then
            # low on space - compress in the foreground so the uncompressed file is gone before the next export
            # --force overwrite of existing gzip logs
            if ! gzip -9 --force "$filename"; then
                tstamp "ERROR: failed to compress $filename"
                exit 1
            fi
        else
            gzip -9 --force "$filename" &
            gzip_pids+=("$!")
        fi
        echo >&2
    done < <("$srcdir/postgres_list_tables.sh" "$@")

    tstamp "waiting for background log compression to finish..."
    failed_compressions=0
    # the ${array[@]+...} form keeps 'set -u' happy on older bash when no job was started
    for gzip_pid in ${gzip_pids[@]+"${gzip_pids[@]}"}; do
        if ! wait "$gzip_pid"; then
            failed_compressions=$((failed_compressions + 1))
        fi
    done
    if [ "$failed_compressions" -gt 0 ]; then
        tstamp "ERROR: $failed_compressions background compression(s) failed"
        exit 1
    fi
    echo >&2
    tstamp "Cloudera Navigator PostgreSQL exports finished"
    echo >&2
}