Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions bob-common/mkosi.build
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,11 @@ build_rust_package \
"input-only-proxy" \
"v0.0.2" \
"https://github.com/flashbots/input-only-proxy"

# Build gomplate (template engine for Prometheus config).
# The installed /usr/bin/gomplate is what prometheus.service runs in its
# ExecStartPre step to render /etc/prometheus/prometheus.yml from the .tmpl file.
# Arguments as passed here: package name, git tag, upstream repository, the
# build command, and a "built-artifact:install-path" mapping.
# NOTE(review): make_git_package is defined outside this view; confirm the
# argument semantics against its definition.
make_git_package \
"gomplate" \
"v4.3.3" \
"https://github.com/hairyhenderson/gomplate" \
'go build -trimpath -ldflags "-s -w -buildid=" -o ./build/gomplate ./cmd/gomplate' \
"build/gomplate:/usr/bin/gomplate"
3 changes: 3 additions & 0 deletions bob-common/mkosi.conf
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ Packages=podman
openssh-sftp-server
udev
libsnappy1v5
prometheus
prometheus-node-exporter
prometheus-process-exporter

BuildPackages=build-essential
git
Expand Down
5 changes: 5 additions & 0 deletions bob-common/mkosi.extra/etc/prometheus/process-exporter.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
process_names:
# Monitor the searcher container: the group is anchored on its conmon process,
# and child processes are counted in the same group because process-exporter
# is started with the --children flag.
# The recording rules select this group via groupname=~".*searcher-container.*".
- name: "searcher-container"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should we also monitor lighthouse in bob-l1?

# Pattern(s) selecting the process for this group; presumably matched as a
# regular expression against the full command line (verify against the
# process-exporter documentation).
cmdline:
- 'conmon.*searcher-container'
43 changes: 43 additions & 0 deletions bob-common/mkosi.extra/etc/prometheus/prometheus.yml.tmpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Prometheus configuration template, rendered by gomplate with the "config"
# datasource (see the ExecStartPre step of prometheus.service).
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Recording rules that derive the aggregated flashbox:* series
rule_files:
  - /etc/prometheus/recording_rules.yml

# Scrape configurations (both exporters are bound to loopback)
scrape_configs:
  # Node exporter on localhost
  - job_name: 'node'
    static_configs:
      - targets: ['localhost:9100']
    metric_relabel_configs:
      # Keep only the raw node_* families listed below; every other series
      # from this target is dropped at scrape time. Aggregation into
      # flashbox:* series happens in the recording rules, not here.
      - source_labels: [__name__]
        regex: 'node_(cpu|memory|disk|filesystem|network)_.*'
        action: keep

  # Process exporter for container monitoring
  - job_name: 'process'
    static_configs:
      - targets: ['localhost:9256']

{{- $config := (datasource "config") }}
{{- if $config.remote_write_flashbots_url }}

# Remote write configuration (rendered only when a URL is configured)
remote_write:
  # Flashbots endpoint
  # Templated values go through toJSON so they are emitted as quoted, escaped
  # scalars; a raw value containing ": ", " #", a leading YAML indicator, or
  # something that looks like a number/boolean would otherwise corrupt or
  # retype the rendered YAML.
  - url: {{ $config.remote_write_flashbots_url | toJSON }}
    write_relabel_configs:
      # Only send aggregated metrics
      - source_labels: [__name__]
        regex: 'flashbox:.*'
        action: keep
    {{- if $config.remote_write_flashbots_auth }}
    basic_auth:
      username: {{ $config.remote_write_flashbots_username | toJSON }}
      password: {{ $config.remote_write_flashbots_password | toJSON }}
    {{- end }}
{{- end }}
49 changes: 49 additions & 0 deletions bob-common/mkosi.extra/etc/prometheus/recording_rules.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
groups:
  - name: flashbox_aggregated_metrics
    interval: 30s  # evaluation period for every rule in this group
    rules:
      # --- CPU ---
      # Overall busy percentage: 100 minus the averaged idle rate.
      - record: flashbox:cpu_usage_percent
        expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)

      # Same rate, averaged per CPU mode.
      - record: flashbox:cpu_usage_percent_by_mode
        expr: avg(rate(node_cpu_seconds_total[5m])) by (mode) * 100

      # --- Memory ---
      - record: flashbox:memory_usage_percent
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100

      # Divides bytes by 1024 three times.
      - record: flashbox:memory_available_gb
        expr: node_memory_MemAvailable_bytes / 1024 / 1024 / 1024

      # --- Disk: root and persistent filesystems ---
      # Root filesystem (always available)
      - record: flashbox:disk_usage_percent_root
        expr: 100 - (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100)

      # /persistent yields no samples until it is mounted
      - record: flashbox:disk_usage_percent_persistent
        expr: 100 - (node_filesystem_avail_bytes{mountpoint="/persistent"} / node_filesystem_size_bytes{mountpoint="/persistent"} * 100)

      - record: flashbox:disk_io_read_mb_per_sec
        expr: rate(node_disk_read_bytes_total[5m]) / 1024 / 1024

      - record: flashbox:disk_io_write_mb_per_sec
        expr: rate(node_disk_written_bytes_total[5m]) / 1024 / 1024

      # --- Container health (series come from the process exporter) ---
      - record: flashbox:container_alive
        expr: up{job="process"} * on(instance) group_left(cgroup) namedprocess_namegroup_num_procs{groupname=~".*searcher-container.*"}

      - record: flashbox:container_cpu_percent
        expr: rate(namedprocess_namegroup_cpu_seconds_total{groupname=~".*searcher-container.*"}[5m]) * 100

      - record: flashbox:container_memory_mb
        expr: namedprocess_namegroup_memory_bytes{groupname=~".*searcher-container.*"} / 1024 / 1024

      # --- Network (summed byte counters only, no per-interface detail) ---
      - record: flashbox:network_receive_mb_total
        expr: sum(node_network_receive_bytes_total) / 1024 / 1024

      - record: flashbox:network_transmit_mb_total
        expr: sum(node_network_transmit_bytes_total) / 1024 / 1024
56 changes: 56 additions & 0 deletions bob-common/mkosi.extra/etc/systemd/system/node-exporter.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
[Unit]
Description=Prometheus Node Exporter
Documentation=https://github.com/prometheus/node_exporter
Wants=network-online.target
After=network-online.target

[Service]
# Runs as the prometheus user and listens on loopback port 9100 only, which is
# the target of the 'node' scrape job in the Prometheus config template.
# The cpu, meminfo, diskstats, filesystem, netdev and loadavg collectors are
# switched on explicitly; the collectors listed with --no-collector.* are
# switched off. Pseudo filesystems and /var/lib/docker are excluded from the
# filesystem collector by the final mount-points-exclude pattern.
Type=simple
User=prometheus
Group=prometheus
Restart=on-failure
RestartSec=5s
ExecStart=/usr/bin/prometheus-node-exporter \
    --web.listen-address=127.0.0.1:9100 \
    --collector.cpu \
    --collector.meminfo \
    --collector.diskstats \
    --collector.filesystem \
    --collector.netdev \
    --collector.loadavg \
    --no-collector.arp \
    --no-collector.bcache \
    --no-collector.bonding \
    --no-collector.conntrack \
    --no-collector.cpufreq \
    --no-collector.edac \
    --no-collector.entropy \
    --no-collector.filefd \
    --no-collector.hwmon \
    --no-collector.infiniband \
    --no-collector.ipvs \
    --no-collector.mdadm \
    --no-collector.netclass \
    --no-collector.netstat \
    --no-collector.nfs \
    --no-collector.nfsd \
    --no-collector.pressure \
    --no-collector.rapl \
    --no-collector.schedstat \
    --no-collector.sockstat \
    --no-collector.softnet \
    --no-collector.stat \
    --no-collector.textfile \
    --no-collector.thermal_zone \
    --no-collector.time \
    --no-collector.timex \
    --no-collector.udp_queues \
    --no-collector.uname \
    --no-collector.vmstat \
    --no-collector.xfs \
    --no-collector.zfs \
    --no-collector.systemd \
    --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run|var/lib/docker)($|/)

[Install]
WantedBy=multi-user.target
19 changes: 19 additions & 0 deletions bob-common/mkosi.extra/etc/systemd/system/process-exporter.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
[Unit]
Description=Prometheus Process Exporter
Documentation=https://github.com/ncabatoff/process-exporter
Wants=network-online.target
# Ordered after searcher-container.service (ordering only; no Requires/Wants on it)
After=network-online.target searcher-container.service

[Service]
# Runs as the prometheus user on loopback port 9256, the target of the
# 'process' scrape job. Process groups are defined in process-exporter.yml;
# --children is passed so the single searcher-container group covers the
# conmon process together with its children.
Type=simple
User=prometheus
Group=prometheus
Restart=on-failure
RestartSec=5s
ExecStart=/usr/bin/prometheus-process-exporter \
    --web.listen-address=127.0.0.1:9256 \
    --config.path=/etc/prometheus/process-exporter.yml \
    --children

[Install]
WantedBy=multi-user.target
25 changes: 25 additions & 0 deletions bob-common/mkosi.extra/etc/systemd/system/prometheus.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
[Unit]
Description=Prometheus Monitoring System
Documentation=https://prometheus.io/docs/introduction/overview/
# fetch-config.service writes /etc/flashbox/observability-config.json, which
# the templating step below reads, so it is both required and ordered first.
Requires=fetch-config.service
Wants=network-online.target
After=network-online.target fetch-config.service

[Service]
Type=simple
User=prometheus
Group=prometheus
Restart=on-failure
RestartSec=5s
# Render prometheus.yml from its template before every start. The leading '+'
# is systemd's prefix for running this one command with full privileges
# instead of as User=prometheus.
ExecStartPre=+/usr/bin/gomplate -f /etc/prometheus/prometheus.yml.tmpl -o /etc/prometheus/prometheus.yml -d config=/etc/flashbox/observability-config.json
# Web interface on loopback only; local TSDB retention is 24h.
ExecStart=/usr/bin/prometheus \
    --config.file=/etc/prometheus/prometheus.yml \
    --storage.tsdb.path=/var/lib/prometheus/ \
    --storage.tsdb.retention.time=24h \
    --web.console.templates=/usr/share/prometheus/consoles \
    --web.console.libraries=/usr/share/prometheus/console_libraries \
    --web.listen-address=127.0.0.1:9090
# Sends SIGHUP to the main process on reload; the template is not re-rendered
# by this step.
ExecReload=/bin/kill -HUP $MAINPID

[Install]
WantedBy=multi-user.target
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[Unit]
Description=Searcher Network and Firewall Rules
After=network.target network-setup.service
Requires=network-setup.service
After=network.target network-setup.service fetch-config.service
Requires=network-setup.service fetch-config.service

[Service]
Type=oneshot
Expand Down
157 changes: 157 additions & 0 deletions bob-common/mkosi.extra/usr/bin/fetch-config.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
#!/bin/bash
# bash is required: this script uses `set -o pipefail`, `local`, and
# `export -f` (exporting functions to the project-specific child script),
# none of which are guaranteed by a POSIX /bin/sh such as dash.
set -eu -o pipefail

# Common configuration fetching script for FlashBox (bob-l1 and bob-l2)
# This script provides shared functionality for configuration management
# Project-specific configuration should be done via /etc/bob/dynamic-config.sh

# Main config file; the observability settings are appended to it as well.
CONFIG_PATH=/etc/bob/config.env
# JSON consumed by gomplate when rendering the Prometheus configuration.
OBSERVABILITY_CONFIG_PATH=/etc/flashbox/observability-config.json

# Don't override if config already exists
if [ -f "$CONFIG_PATH" ]; then
    echo "Config already exists at $CONFIG_PATH, skipping"
    exit 0
fi

# Helper functions
# Print the value of one custom instance attribute from the metadata server.
#   $1 - attribute name (appended to .../instance/attributes/)
# Returns non-zero when the request fails or the server answers with an HTTP
# error (--fail), so callers running under `set -e` abort instead of using an
# error page as the attribute value. --show-error keeps the failure reason
# visible on stderr despite --silent.
fetch_metadata_value() {
    curl --silent --show-error --fail \
        --header "Metadata-Flavor: Google" \
        "http://metadata/computeMetadata/v1/instance/attributes/$1"
}

# Print every dotted-quad (IPv4-looking) substring found in $1, one per line.
# When nothing matches, a single empty line is printed and the function still
# returns success.
get_ips_from_uris() {
    if ! echo "$1" | grep -oE '[0-9]{1,3}(\.[0-9]{1,3}){3}'; then
        echo ""
    fi
}

# Persist the metrics (remote write) settings.
#   $1 - remote write URL (may be empty)
#   $2 - basic-auth username (may be empty)
#   $3 - basic-auth password (may be empty)
# Appends four variables to $CONFIG_PATH and (re)writes the JSON document at
# $OBSERVABILITY_CONFIG_PATH that gomplate uses to render the Prometheus config.
write_observability_config() {
    local metrics_flashbots_url="$1"
    local metrics_flashbots_username="$2"
    local metrics_flashbots_password="$3"

    # Extract IP for firewall rules (first IPv4 literal in the URL, if any)
    local metrics_endpoint=""

    if [ -n "$metrics_flashbots_url" ]; then
        metrics_endpoint=$(get_ips_from_uris "$metrics_flashbots_url" | head -1)
    fi

    # Append observability config to main config
    # NOTE(review): values are wrapped in single quotes without escaping, so a
    # value containing ' would corrupt this file; confirm inputs never do.
    cat <<EOF >> "$CONFIG_PATH"
CONFIG_METRICS_FLASHBOTS_URL='${metrics_flashbots_url}'
CONFIG_METRICS_FLASHBOTS_USERNAME='${metrics_flashbots_username}'
CONFIG_METRICS_FLASHBOTS_PASSWORD='${metrics_flashbots_password}'
METRICS_ENDPOINT='${metrics_endpoint}'
EOF

    # Create observability config for Prometheus (always needed for gomplate templating).
    # The document is built by jq rather than by string interpolation so that
    # quotes, backslashes or control characters in a credential are escaped
    # and the result is always valid JSON.
    # remote_write_flashbots_auth is "true" when a username is set, else "".
    mkdir -p /etc/flashbox
    jq -n \
        --arg url "$metrics_flashbots_url" \
        --arg username "$metrics_flashbots_username" \
        --arg password "$metrics_flashbots_password" \
        '{
            remote_write_flashbots_url: $url,
            remote_write_flashbots_username: $username,
            remote_write_flashbots_password: $password,
            remote_write_flashbots_auth: (if $username != "" then "true" else "" end)
        }' > "$OBSERVABILITY_CONFIG_PATH"
    echo "Observability configuration written to $OBSERVABILITY_CONFIG_PATH"
}

# Local development: a QEMU guest that also ships the serial-console unit is
# treated as a dev image and gets test defaults instead of Vault data.
if dmidecode -s system-manufacturer 2>/dev/null | grep -q "QEMU" && \
    [ -f /etc/systemd/system/serial-console.service ]; then
    echo "Running in local QEMU dev image, using default test values"

    # The default route's gateway is the host under QEMU user-mode networking
    GATEWAY=$(ip route | awk '/default/ {print $3}')
    [ -n "$GATEWAY" ] || {
        echo "Warning: Could not detect gateway, falling back to 10.0.2.2"
        GATEWAY="10.0.2.2"
    }

    # Made available to the project-specific script through the environment
    export GATEWAY

    # The project-specific script is optional in the dev image
    if [ ! -x /etc/bob/dynamic-config.sh ]; then
        echo "Warning: No project-specific configuration found at /etc/bob/dynamic-config.sh"
    else
        echo "Running project-specific configuration..."
        /etc/bob/dynamic-config.sh qemu "$CONFIG_PATH"
    fi

    # No remote write target in local dev: write an all-empty observability config
    write_observability_config "" "" ""

    exit 0
fi

# Production configuration using Vault
echo "Fetching configuration from Vault..."

# Get instance metadata
instance_name=$(fetch_metadata_value "name")
vault_addr=$(fetch_metadata_value "vault_addr")
vault_auth_mount=$(fetch_metadata_value "vault_auth_mount_gcp")
vault_kv_path=$(fetch_metadata_value "vault_kv_path")
vault_kv_common_suffix=$(fetch_metadata_value "vault_kv_common_suffix")

# Authenticate with Vault using GCP identity.
# --get makes curl send the --data-urlencode parameters as a query string on a
# GET request; without it curl switches to POST with a form body.
# --fail turns HTTP error responses into a non-zero exit so `set -e` stops the
# script instead of continuing with an error body as the token.
gcp_token=$(curl --silent --show-error --fail --get \
    --header "Metadata-Flavor: Google" \
    --data-urlencode "audience=http://vault/$instance_name" \
    --data-urlencode "format=full" \
    "http://metadata/computeMetadata/v1/instance/service-accounts/default/identity")

# `// empty` maps a missing/null client_token to an empty string rather than
# the literal text "null".
vault_token=$(curl --silent --show-error --fail \
    --data "$(printf '{"role":"%s","jwt":"%s"}' "$instance_name" "$gcp_token")" \
    "${vault_addr}/v1/${vault_auth_mount}/login" | \
    jq -r '.auth.client_token // empty')

if [ -z "$vault_token" ]; then
    echo "Error: Vault login did not return a client token" >&2
    exit 1
fi

# Fetch common and instance-specific data.
# HTTP errors are deliberately not fatal per request: a path that yields no
# .data.data becomes null and is absorbed by the merge below, as before.
common_data=$(curl --silent --show-error \
    --header "X-Vault-Token: ${vault_token}" \
    "${vault_addr}/v1/${vault_kv_path}/node/${vault_kv_common_suffix}" |
    jq -c .data.data)

secret_data=$(curl --silent --show-error \
    --header "X-Vault-Token: ${vault_token}" \
    "${vault_addr}/v1/${vault_kv_path}/node/${instance_name}" |
    jq -c .data.data)

# Merge data objects (instance-specific keys override common ones)
data=$(echo "$common_data $secret_data" | jq -s 'add')

# If neither read produced an object the merge result is null and every later
# lookup would silently return "". Abort here instead: once an (empty) config
# file has been written, later runs skip fetching because the file exists.
if [ "$(echo "$data" | jq -r 'type')" != "object" ]; then
    echo "Error: no configuration data returned from Vault for ${instance_name}" >&2
    exit 1
fi

# Look up one top-level key in the merged Vault data held in $data.
#   $1 - key name
# Prints the value in jq's raw, compact form; a missing, null or false value
# is printed as an empty string.
get_data_value() {
    local lookup_key="$1"
    echo "$data" | jq -rc --arg key "$lookup_key" '.[$key] // ""'
}

# Hand the merged data and the helper functions to the project-specific script
export VAULT_DATA="$data"
export -f get_data_value
export -f get_ips_from_uris

# In production the project-specific script is mandatory
if [ ! -x /etc/bob/dynamic-config.sh ]; then
    echo "Error: No project-specific configuration found at /etc/bob/dynamic-config.sh"
    exit 1
fi
echo "Running project-specific configuration..."
/etc/bob/dynamic-config.sh vault "$CONFIG_PATH"

# Read the observability settings from the merged data ...
metrics_flashbots_url=$(get_data_value metrics_flashbots_url)
metrics_flashbots_username=$(get_data_value metrics_flashbots_username)
metrics_flashbots_password=$(get_data_value metrics_flashbots_password)

# ... and persist them for the firewall rules and Prometheus templating
write_observability_config "$metrics_flashbots_url" "$metrics_flashbots_username" "$metrics_flashbots_password"

echo "Configuration successfully fetched and written to $CONFIG_PATH"
Loading