#!/usr/bin/env bash
#  vim:ts=4:sts=4:sw=4:et
#
#  Author: Hari Sekhon
#  Date: 2024-09-16 13:47:47 +0200 (Mon, 16 Sep 2024)
#  (ported from Knowledge Base parquet page)
#
#  https://github.com/HariSekhon/DevOps-Bash-tools
#
#  License: see accompanying Hari Sekhon LICENSE file
#
#  If you're using my code you're welcome to connect with me on LinkedIn and optionally send me feedback to help steer this or other code I publish
#
#  https://www.linkedin.com/in/HariSekhon
#

# Downloads the Apache Parquet Tools jar from Maven Central.
#
# Arguments:
#   $1 - optional version (eg. 1.11.2). Defaults to 'latest', in which case the
#        Maven directory listing is scraped and the highest X.Y.Z version is used
#
# Environment:
#   DEBUG - if set to a non-empty value, enables shell tracing (set -x)

set -euo pipefail
[ -n "${DEBUG:-}" ] && set -x
srcdir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# shellcheck disable=SC1090,SC1091
. "$srcdir/../lib/utils.sh"

# shellcheck disable=SC2034,SC2154
usage_description="
Quickly determines and downloads latest Apache Parquet Tools jar or an explicitly given version
"

# used by usage() in lib/utils.sh
# shellcheck disable=SC2034
usage_args="[<version>]"

# eg. version="1.11.2"
version="${1:-latest}"

downloads_url='https://repo1.maven.org/maven2/org/apache/parquet/parquet-tools'

# ERE format for grep -E
# dots are escaped so that only literal X.Y.Z/ directory names match
version_regex='([[:digit:]]+\.[[:digit:]]+\.[[:digit:]]+)/'

# Should match these:
#
#   1.10.0/
#   1.10.1/
#   1.11.0/
#   1.11.1/
#   1.11.2/
#   1.7.0/
#   1.8.0/
#   1.8.1/
#   1.8.2/
#   1.8.3/
#   1.9.0/

if [ "$version" = "latest" ]; then
    timestamp "Determining latest Parquet Tools version from $downloads_url"
    # curl -f returns non-zero on HTTP errors instead of feeding an error page to grep
    # '|| :' prevents set -e / pipefail from aborting silently here, the empty
    # result is detected and reported explicitly below
    versions="$(
        curl -sSf "$downloads_url/" |
        grep -Eo "$version_regex" |
        sed 's|/$||'
    )" || :
    # sort ascending and take the last line - this avoids the SIGPIPE that
    # 'sort | head' can raise under pipefail
    version="$(sort -uV <<< "$versions" | tail -n 1)"
    if [ -z "$version" ]; then
        echo "ERROR: failed to determine latest Parquet Tools version from $downloads_url" >&2
        exit 1
    fi
    timestamp "Determined latest Parquet Tools version to be $version"
fi

download_url="$downloads_url/$version/parquet-tools-$version.jar"

"$srcdir/../bin/download_url_file.sh" "$download_url"