I’m working on a “basic” Snowplow GCP implementation with BigQuery as destination. As I’m learning Snowplow I’m trying to keep things as simple as possible. I was able to have a working proof of concept running the collector as single instance VM. As next step, I want to run the collector behind a group of instance. I’m struggling a bit with the startup-script of the instance template (which I’m using for the Instance groups).
When I’m re-using the start-script from the quick start guide, things are working well
# startup-script from quick start guide
#!/bin/bash
set -e -x
# -----------------------------------------------------------------------------
# BASE INSTALL
# -----------------------------------------------------------------------------
readonly CONFIG_DIR=/opt/snowplow/config
function install_base_packages() {
sudo apt install wget curl unzip -y
}
function install_docker_ce() {
sudo apt install docker.io -y
sudo systemctl enable --now docker
}
sudo apt update -y
install_base_packages
install_docker_ce
sudo mkdir -p ${CONFIG_DIR}
sudo cat << EOF > ${CONFIG_DIR}/collector.hocon
collector {
interface = "0.0.0.0"
port = 8080
ssl {
enable = false
redirect = false
port = 8443
}
paths {}
p3p {
policyRef = "/w3c/p3p.xml"
CP = "NOI DSP COR NID PSA OUR IND COM NAV STA"
}
crossDomain {
enabled = false
domains = [ "*" ]
secure = true
}
cookie {
enabled = true
expiration = "365 days"
name = sp
domains = []
fallbackDomain = ""
secure = true
httpOnly = false
sameSite = "None"
}
doNotTrackCookie {
enabled = false
name = ""
value = ""
}
cookieBounce {
enabled = false
name = "n3pc"
fallbackNetworkUserId = "00000000-0000-4000-A000-000000000000"
forwardedProtocolHeader = "X-Forwarded-Proto"
}
enableDefaultRedirect = false
redirectMacro {
enabled = false
placeholder = "[TOKEN]"
}
rootResponse {
enabled = false
statusCode = 302
headers = {}
body = "302, redirecting"
}
cors {
accessControlMaxAge = "5 seconds"
}
prometheusMetrics {
enabled = false
}
streams {
good = sp-raw-topic
bad = sp-bad-1-topic
useIpAddressAsPartitionKey = false
sink {
enabled = google-pub-sub
googleProjectId = "myprojectid"
backoffPolicy {
minBackoff = 1000
maxBackoff = 1000
totalBackoff = 10000
multiplier = 1
}
}
buffer {
byteLimit = 1000000
recordLimit = 500
timeLimit = 500
}
}
telemetry {
disable = false
url = "telemetry-g.snowplowanalytics.com"
userProvidedId = ""
moduleName = "collector-pubsub-ce"
moduleVersion = "0.2.2"
autoGeneratedId = "16111e245575f273568f468984503688"
}
}
akka {
loglevel = WARNING
loggers = ["akka.event.slf4j.Slf4jLogger"]
http.server {
remote-address-header = on
raw-request-uri-header = on
parsing {
max-uri-length = 32768
uri-parsing-mode = relaxed
}
max-connections = 2048
}
}
EOF
sudo docker run \
-d \
--name collector \
--restart always \
--network host \
--log-driver gcplogs \
-v ${CONFIG_DIR}:/snowplow/config \
-p 8080:8080 \
-e 'JAVA_OPTS=-Dorg.slf4j.simpleLogger.defaultLogLevel=info' \
snowplow/scala-stream-collector-pubsub:2.4.5 \
--config /snowplow/config/collector.hocon
# -----------------------------------------------------------------------------
# SNOWPLOW TELEMETRY INSTALLATION
# -----------------------------------------------------------------------------
# Install script
sudo cat > /usr/local/bin/snowplow-track-vm-telemetry << \EOF
#!/bin/bash
set -e
readonly COMPUTE_METADATA_URL="http://metadata.google.internal/computeMetadata"
readonly SNOWPLOW_TRACKING_CLI_BIN="/usr/local/bin/snowplow-tracking-cli"
function log() {
echo "$@"
}
function die() {
local __die_msg="$1"
local __die_code="${2:-1}"
echo "$@" >&2 ; exit $__die_code;
}
function install_snowplow_tracking_cli() {
log "Installing Snowplow Tracking CLI ..."
sudo wget https://github.com/snowplow/snowplow-tracking-cli/releases/download/0.4.0/snowplow_tracking_cli_0.4.0_linux_amd64.zip -P /tmp/
sudo unzip /tmp/snowplow_tracking_cli_0.4.0_linux_amd64.zip -d /usr/local/bin/
}
function lookup_path_in_compute_metadata() {
[ "$#" -eq 1 ] || die "func:lookup_path_in_compute_metadata > 1 argument required, $# provided"
local __path="$1"
curl --silent --location "$COMPUTE_METADATA_URL/$__path" -H "Metadata-Flavor:Google"
}
function get_instance_id() {
lookup_path_in_compute_metadata "v1/instance/id"
}
function snowplow_telemetry_track_vm_event() {
[ "$#" -eq 1 ] || die "func:snowplow_telemetry_track_vm_event > 1 argument required, $# provided"
local __event_schema="$1"
log "Tracking VM Telemetry event '$__event_schema' ..."
if ! command -v ${SNOWPLOW_TRACKING_CLI_BIN} &> /dev/null; then
install_snowplow_tracking_cli
fi
if [ "" == "" ]; then
local user_provided_id=null
else
local user_provided_id="\"\""
fi
local __oss_context_json="{
\"schema\": \"iglu:com.snowplowanalytics.oss/oss_context/jsonschema/1-0-0\",
\"data\": {
\"userProvidedId\": ${user_provided_id},
\"autoGeneratedId\": \"16111e245575f273568f468984503688\",
\"instanceId\": \"$(get_instance_id)\",
\"cloud\": \"GCP\",
\"region\": \"europe-west2\",
\"moduleName\": \"collector-pubsub-ce\",
\"moduleVersion\": \"0.2.2\",
\"applicationName\": \"stream-collector\",
\"applicationVersion\": \"2.4.5\"
}
}"
${SNOWPLOW_TRACKING_CLI_BIN} \
--collector "telemetry-g.snowplowanalytics.com" \
--appid "terraform-oss-modules" \
--method "POST" \
--schema "${__event_schema}" \
--json "{}" \
--contexts "[${__oss_context_json}]"
log "Tracking completed successfully!"
}
[ "$#" -eq 1 ] || die "Snowplow VM Telemetry expected 1 argument, $# provided"
event_schema="${1}"
[[ $event_schema =~ ^iglu:com.snowplowanalytics.oss/vm_[a-z]+/jsonschema/[0-9]+-[0-9]+-[0-9]+$ ]] || \
die "Snowplow VM Telemetry event schema '${event_schema}' failed regex check, must match '^iglu:com.snowplowanalytics.oss/vm_[a-z]+/jsonschema/[0-9]+-[0-9]+-[0-9]+$'!"
snowplow_telemetry_track_vm_event "${event_schema}"
EOF
sudo chmod +x /usr/local/bin/snowplow-track-vm-telemetry
# Manually track 'vm_start' event in user-data
/usr/local/bin/snowplow-track-vm-telemetry "iglu:com.snowplowanalytics.oss/vm_start/jsonschema/1-0-0"
# Add service to capture VM startup
sudo cat > /etc/systemd/system/snowplow-telemetry-start.service << \EOF
[Unit]
Description=Snowplow Telemetry for VM Start
After=network-online.target
Wants=network-online.target
[Service]
Type=oneshot
RemainAfterExit=true
ExecStart=/usr/local/bin/snowplow-track-vm-telemetry "iglu:com.snowplowanalytics.oss/vm_start/jsonschema/1-0-0"
[Install]
WantedBy=multi-user.target
EOF
# Add service to capture VM shutdown
sudo cat > /etc/systemd/system/snowplow-telemetry-stop.service << \EOF
[Unit]
Description=Snowplow Telemetry for VM Stop
DefaultDependencies=no
Before=shutdown.target reboot.target halt.target
Wants=network-online.target
[Service]
Type=oneshot
ExecStart=/usr/local/bin/snowplow-track-vm-telemetry "iglu:com.snowplowanalytics.oss/vm_stop/jsonschema/1-0-0"
[Install]
WantedBy=halt.target reboot.target shutdown.target
EOF
# Add service + timer to capture VM heartbeat
sudo cat > /etc/systemd/system/snowplow-telemetry-heartbeat.service << \EOF
[Unit]
Description=Snowplow Telemetry for VM Heartbeat
Wants=snowplow-telemetry-heartbeat.timer
[Service]
Type=oneshot
ExecStart=/usr/local/bin/snowplow-track-vm-telemetry "iglu:com.snowplowanalytics.oss/vm_heartbeat/jsonschema/1-0-0"
[Install]
WantedBy=multi-user.target
EOF
sudo cat > /etc/systemd/system/snowplow-telemetry-heartbeat.timer << \EOF
[Unit]
Description=Snowplow Telemetry for VM Heartbeat scheduler
Requires=snowplow-telemetry-heartbeat.service
[Timer]
OnUnitActiveSec=60min
[Install]
WantedBy=timers.target
EOF
# Enable systemd telemetry services
sudo systemctl enable snowplow-telemetry-start.service
sudo systemctl enable snowplow-telemetry-stop.service
sudo systemctl start snowplow-telemetry-heartbeat.timer
sudo systemctl enable snowplow-telemetry-heartbeat.timer
However since there are a lot of things I don’t understand and that I maybe don’t need immediately, I’d like to be able to use my own startup-script which I will probably improve over time. I have this but I doesn’t work:
# Simplified version
#!/bin/bash
set -e -x
# -----------------------------------------------------------------------------
# BASE INSTALL
# -----------------------------------------------------------------------------
readonly CONFIG_DIR=/opt/snowplow/config
function install_base_packages() {
sudo apt install wget curl unzip -y
}
function install_docker_ce() {
sudo apt install docker.io -y
sudo systemctl enable --now docker
}
sudo apt update -y
install_base_packages
install_docker_ce
sudo mkdir -p ${CONFIG_DIR}
sudo cat << EOF > ${CONFIG_DIR}/collector.hocon
collector collector {
interface = "0.0.0.0"
port = 8080
streams {
good = "good"
bad = "bad"
sink {
googleProjectId = "myprojectid"
}
}
}
EOF
sudo docker run --rm \
-v ${CONFIG_DIR}:/snowplow/config \
-p 8080:8080 \
snowplow/scala-stream-collector-pubsub:2.7.0 \
--config /snowplow/config/collector.hocon
So my question is, between these two startup scripts, which elements of the first startup script (the one from the quick start guide) are mandatory and are missing from the second startup script (simplified version).
I hope that my question make sense.
Thanks.