Overview
Prometheus exporters are agents that expose metrics from third-party systems in a format that Prometheus can scrape. They bridge the gap between systems that don't natively expose Prometheus metrics and your monitoring infrastructure.
Key Concepts:
- Pull-based Model: Prometheus scrapes metrics from exporters via HTTP endpoints
- Metric Types: Counter, Gauge, Histogram, Summary
- Target Discovery: Static configs, service discovery, or Kubernetes annotations
- Performance: Exporters should be lightweight and have minimal impact on monitored systems
Node Exporter
Node Exporter exposes hardware and OS metrics from *NIX kernels.
Linux Installation
#!/bin/bash
# Install Node Exporter on Linux: download the release binary, register a
# dedicated service account, and run it under systemd.
set -euo pipefail

VERSION="1.8.0"
ARCH="amd64"

# Create the service account first so the unit below can reference it;
# `|| true` keeps the script idempotent if the user already exists.
sudo useradd --no-create-home --shell /bin/false node_exporter || true

# Directory for the textfile collector referenced by the unit below
sudo mkdir -p /var/lib/node_exporter/textfile_collector
sudo chown -R node_exporter:node_exporter /var/lib/node_exporter

# Download and extract
cd /tmp
wget "https://github.com/prometheus/node_exporter/releases/download/v${VERSION}/node_exporter-${VERSION}.linux-${ARCH}.tar.gz"
tar xvfz "node_exporter-${VERSION}.linux-${ARCH}.tar.gz"
sudo cp "node_exporter-${VERSION}.linux-${ARCH}/node_exporter" /usr/local/bin/
sudo chown root:root /usr/local/bin/node_exporter

# Create systemd service (heredoc is unquoted on purpose: ${VERSION}-style
# variables are not used below, and `$|` inside the regex is not a valid
# shell expansion, so the exclude patterns reach systemd verbatim)
sudo tee /etc/systemd/system/node_exporter.service > /dev/null <<EOF
[Unit]
Description=Node Exporter
After=network.target

[Service]
Type=simple
User=node_exporter
Group=node_exporter
ExecStart=/usr/local/bin/node_exporter \\
    --collector.filesystem.mount-points-exclude='^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/.+)($|/)' \\
    --collector.filesystem.fs-types-exclude='^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$' \\
    --collector.textfile.directory=/var/lib/node_exporter/textfile_collector \\
    --web.listen-address=:9100
Restart=always
RestartSec=10s

[Install]
WantedBy=multi-user.target
EOF

# Enable and start service
sudo systemctl daemon-reload
sudo systemctl enable --now node_exporter
Windows Installation
# Install Windows Exporter on Windows
# NOTE: node_exporter only targets *NIX kernels and publishes no Windows
# builds. The Windows equivalent is windows_exporter (prometheus-community),
# which provides the cpu/cs/logical_disk/net/os/system/memory collectors
# used below and ships as an MSI that registers its own service.
$Version = "0.25.1"
$Url = "https://github.com/prometheus-community/windows_exporter/releases/download/v$Version/windows_exporter-$Version-amd64.msi"

# Download the MSI installer
Invoke-WebRequest -Uri $Url -OutFile "$env:TEMP\windows_exporter.msi"

# Install silently as a Windows service with a selected set of collectors.
# Default listen port for windows_exporter is 9182.
msiexec /i "$env:TEMP\windows_exporter.msi" `
    ENABLED_COLLECTORS="cpu,cs,logical_disk,net,os,system,memory" `
    LISTEN_PORT=9182 /qn

# The installer registers and starts the service; start it explicitly in
# case the install was run with autostart disabled
Start-Service windows_exporter
Docker Setup
# docker-compose.yml
version: '3.8'

services:
  node_exporter:
    image: prom/node-exporter:v1.8.0
    container_name: node_exporter
    restart: unless-stopped
    # Share the host PID namespace (recommended in the node_exporter docs)
    # so process-related collectors see host processes
    pid: host
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--path.rootfs=/rootfs'
      # $$ escapes the $ from Compose variable interpolation
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    volumes:
      # Host filesystems mounted read-only for metric collection
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    ports:
      - "9100:9100"
    networks:
      - monitoring

networks:
  monitoring:
    driver: bridge
Custom Textfile Collector
#!/bin/bash
# Example: custom metrics collection script for the node_exporter textfile
# collector. Schedule from cron (e.g. every 5 minutes via a file in
# /etc/cron.d/custom-metrics that invokes this script).

TEXTFILE_DIR="/var/lib/node_exporter/textfile_collector"
TEMP_FILE="${TEXTFILE_DIR}/custom_metrics.prom.$$"   # $$ = PID, unique per run
PROM_FILE="${TEXTFILE_DIR}/custom_metrics.prom"

# Remove the partial temp file if the script aborts midway; after the
# final mv the file no longer exists and rm -f is a harmless no-op
trap 'rm -f "${TEMP_FILE}"' EXIT

# Collect custom metrics
{
    echo "# HELP custom_backup_age_seconds Age of last backup in seconds"
    echo "# TYPE custom_backup_age_seconds gauge"
    BACKUP_AGE=$(( $(date +%s) - $(stat -c %Y /backup/latest.tar.gz 2>/dev/null || echo 0) ))
    echo "custom_backup_age_seconds ${BACKUP_AGE}"

    echo "# HELP custom_database_size_bytes Database size in bytes"
    echo "# TYPE custom_database_size_bytes gauge"
    # Fall back to 0 so a psql failure never emits an invalid sample line
    DB_SIZE=$(sudo -u postgres psql -t -c "SELECT pg_database_size('mydb')" 2>/dev/null | tr -d ' ')
    echo "custom_database_size_bytes ${DB_SIZE:-0}"

    echo "# HELP custom_certificate_expiry_seconds Certificate expiry time in seconds"
    echo "# TYPE custom_certificate_expiry_seconds gauge"
    # Parses the notAfter date out of the cert and converts to Unix time
    CERT_EXPIRY=$(date -d "$(openssl x509 -in /etc/ssl/certs/myapp.crt -noout -enddate | cut -d= -f2)" +%s)
    echo "custom_certificate_expiry_seconds ${CERT_EXPIRY}"
} > "${TEMP_FILE}"

# Atomic move so node_exporter never scrapes a half-written file
mv "${TEMP_FILE}" "${PROM_FILE}"
cAdvisor
Container Advisor (cAdvisor) provides resource usage and performance characteristics of running containers.
cAdvisor Docker Setup
# docker-compose.yml
version: '3.8'

services:
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.47.2
    container_name: cadvisor
    restart: unless-stopped
    # cAdvisor needs broad host access to read per-container stats
    privileged: true
    ports:
      - "8080:8080"
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /dev/disk/:/dev/disk:ro
    devices:
      - /dev/kmsg
    command:
      # Lower housekeeping frequency and drop expensive metric families
      # to reduce cAdvisor's own overhead
      - '--housekeeping_interval=30s'
      - '--docker_only=true'
      - '--disable_metrics=disk,network,tcp,udp,percpu,sched,process'
    networks:
      - monitoring

networks:
  monitoring:
    driver: bridge
Kubernetes Deployment
# cadvisor-daemonset.yaml
# One cAdvisor pod per node, scraped via prometheus.io/* annotations.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: cadvisor
  namespace: monitoring
spec:
  selector:
    matchLabels:
      app: cadvisor
  template:
    metadata:
      labels:
        app: cadvisor
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics"
    spec:
      # cAdvisor does not need the Kubernetes API
      automountServiceAccountToken: false
      containers:
        - name: cadvisor
          image: gcr.io/cadvisor/cadvisor:v0.47.2
          ports:
            - name: metrics
              containerPort: 8080
              protocol: TCP
          resources:
            requests:
              memory: 200Mi
              cpu: 100m
            limits:
              memory: 400Mi
              cpu: 300m
          # Read-only host mounts cAdvisor uses to gather stats
          volumeMounts:
            - name: rootfs
              mountPath: /rootfs
              readOnly: true
            - name: var-run
              mountPath: /var/run
              readOnly: true
            - name: sys
              mountPath: /sys
              readOnly: true
            - name: docker
              mountPath: /var/lib/docker
              readOnly: true
            - name: disk
              mountPath: /dev/disk
              readOnly: true
      volumes:
        - name: rootfs
          hostPath:
            path: /
        - name: var-run
          hostPath:
            path: /var/run
        - name: sys
          hostPath:
            path: /sys
        - name: docker
          hostPath:
            path: /var/lib/docker
        - name: disk
          hostPath:
            path: /dev/disk
Blackbox Exporter
Blackbox Exporter probes endpoints over HTTP, HTTPS, DNS, TCP, ICMP, and gRPC.
Configuration Examples
# blackbox.yml
modules:
  # HTTP 2xx probe
  http_2xx:
    prober: http
    timeout: 5s
    http:
      valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
      valid_status_codes: [200, 201, 202, 204]
      method: GET
      preferred_ip_protocol: "ip4"
      follow_redirects: true
      fail_if_ssl: false
      fail_if_not_ssl: false
      tls_config:
        insecure_skip_verify: false

  # HTTP POST probe with authentication
  http_post_auth:
    prober: http
    timeout: 5s
    http:
      method: POST
      headers:
        Content-Type: application/json
      # blackbox_exporter does not expand env-var templates in its config
      # file; read the bearer token from a mounted secret file instead
      authorization:
        type: Bearer
        credentials_file: /etc/blackbox_exporter/api_token
      body: '{"health_check": true}'
      valid_status_codes: [200]
      fail_if_body_not_matches_regexp:
        - "status.*ok"

  # HTTP probe with SSL verification and content check
  http_ssl_check:
    prober: http
    timeout: 10s
    http:
      method: GET
      preferred_ip_protocol: "ip4"
      tls_config:
        insecure_skip_verify: false
      fail_if_not_ssl: true
      fail_if_body_not_matches_regexp:
        - "Copyright 2025"

  # TCP probe
  tcp_connect:
    prober: tcp
    timeout: 5s
    tcp:
      preferred_ip_protocol: "ip4"
      tls: false

  # TCP with TLS probe
  tcp_connect_tls:
    prober: tcp
    timeout: 5s
    tcp:
      preferred_ip_protocol: "ip4"
      tls: true
      tls_config:
        insecure_skip_verify: false

  # ICMP probe
  icmp_check:
    prober: icmp
    timeout: 5s
    icmp:
      preferred_ip_protocol: "ip4"
      source_ip_address: "0.0.0.0"

  # DNS probe
  dns_check:
    prober: dns
    timeout: 5s
    dns:
      preferred_ip_protocol: "ip4"
      query_name: "example.com"
      query_type: "A"
      valid_rcodes:
        - NOERROR
      validate_answer_rrs:
        fail_if_not_matches_regexp:
          - ".*"

  # DNS probe with specific record validation
  dns_mx_check:
    prober: dns
    timeout: 5s
    dns:
      query_name: "example.com"
      query_type: "MX"
      valid_rcodes:
        - NOERROR
      validate_answer_rrs:
        fail_if_not_matches_regexp:
          - "mail.*example.com"
Docker Deployment
# docker-compose.yml
version: '3.8'

services:
  blackbox_exporter:
    image: prom/blackbox-exporter:v0.24.0
    container_name: blackbox_exporter
    restart: unless-stopped
    ports:
      - "9115:9115"
    volumes:
      # Module definitions (blackbox.yml) mounted read-only
      - ./blackbox.yml:/etc/blackbox_exporter/config.yml:ro
    command:
      - '--config.file=/etc/blackbox_exporter/config.yml'
    networks:
      - monitoring

networks:
  monitoring:
    driver: bridge
Prometheus Configuration for Blackbox
# prometheus.yml
scrape_configs:
  # Probe HTTP targets through the blackbox exporter. The relabeling moves
  # the listed target into the ?target= query parameter, keeps it as the
  # instance label, and points the actual scrape at the exporter itself.
  - job_name: 'blackbox_http'
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
      - targets:
          - https://example.com
          - https://api.example.com/health
          - https://app.example.com/status
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: blackbox_exporter:9115

  - job_name: 'blackbox_icmp'
    metrics_path: /probe
    params:
      module: [icmp_check]
    static_configs:
      - targets:
          - 8.8.8.8
          - 1.1.1.1
          - 192.168.1.1
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: blackbox_exporter:9115
Unpoller (UniFi)
Unpoller collects metrics from the UniFi Network Controller (the UniFi Network Application) for network monitoring.
Unpoller Docker Setup
# docker-compose.yml
version: '3.8'

services:
  unpoller:
    image: golift/unifi-poller:v2.10.0
    container_name: unpoller
    restart: unless-stopped
    ports:
      - "9130:9130"
    # All values are strings on purpose: container env vars are text, and
    # quoting avoids YAML retyping "false"/"true" into booleans
    environment:
      # Prometheus Settings
      UP_PROMETHEUS_DISABLE: "false"
      UP_PROMETHEUS_NAMESPACE: "unifi"
      # UniFi Controller Settings
      UP_UNIFI_CONTROLLER_0_URL: "https://unifi.example.com:8443"
      UP_UNIFI_CONTROLLER_0_USER: "prometheus"
      # Password injected from the host environment / .env file
      UP_UNIFI_CONTROLLER_0_PASS: "${UNIFI_PASSWORD}"
      UP_UNIFI_CONTROLLER_0_VERIFY_SSL: "false"
      # Polling Settings
      UP_UNIFI_DEFAULT_SAVE_SITES: "true"
      UP_UNIFI_DEFAULT_SAVE_IDS: "true"
      UP_UNIFI_DEFAULT_SAVE_DPI: "true"
      UP_UNIFI_DEFAULT_SAVE_ALARMS: "true"
      UP_UNIFI_DEFAULT_SAVE_ANOMALIES: "true"
      UP_UNIFI_DEFAULT_SAVE_EVENTS: "false"
      # Interval Settings
      UP_POLLER_DEBUG: "false"
      UP_INTERVAL: "30s"
      # InfluxDB (optional)
      UP_INFLUXDB_DISABLE: "true"
    networks:
      - monitoring

networks:
  monitoring:
    driver: bridge
Configuration File Example
# unpoller.conf
# Prometheus exporter output settings
[prometheus]
disable = false
namespace = "unifi"
http_listen = "0.0.0.0:9130"
# Global UniFi settings; dynamic = false means controllers are listed
# statically in [[unifi.controller]] tables rather than discovered
[unifi]
dynamic = false
# Defaults applied to every controller unless overridden per controller
[unifi.defaults]
role = "main-controller"
verify_ssl = false
save_sites = true
save_ids = true
save_dpi = true
save_alarms = true
save_anomalies = true
save_events = false
# One [[unifi.controller]] table per controller instance
[[unifi.controller]]
url = "https://unifi.example.com:8443"
user = "prometheus"
# NOTE(review): avoid committing real credentials; prefer an env override
pass = "secure_password_here"
sites = ["all"]
# Poller runtime behavior
[poller]
debug = false
quiet = false
Key Metrics Overview
# Example series exposed by unpoller (the "unifi" prefix follows the
# configured Prometheus namespace; site label comes from the controller)
# Device status
unifi_device_uptime_seconds{site="default"}
# Client statistics
unifi_client_received_bytes_total{site="default"}
unifi_client_transmitted_bytes_total{site="default"}
# Access Point metrics
unifi_access_point_station_count{site="default"}
unifi_access_point_user_experience_score{site="default"}
# Network statistics
unifi_network_received_bytes_total{site="default"}
unifi_network_transmitted_bytes_total{site="default"}
# Alarm count
unifi_alarms_total{site="default"}
Database Exporters
PostgreSQL Exporter
# docker-compose.yml
version: '3.8'

services:
  postgres_exporter:
    image: prometheuscommunity/postgres-exporter:v0.15.0
    container_name: postgres_exporter
    restart: unless-stopped
    ports:
      - "9187:9187"
    environment:
      # sslmode=disable is only acceptable on a trusted internal network
      DATA_SOURCE_NAME: "postgresql://postgres_exporter:${DB_PASSWORD}@postgres:5432/postgres?sslmode=disable"
      # Extra metric queries defined in the mounted file below
      PG_EXPORTER_EXTEND_QUERY_PATH: "/etc/postgres_exporter/queries.yaml"
    volumes:
      - ./postgres_queries.yaml:/etc/postgres_exporter/queries.yaml:ro
    networks:
      - monitoring

networks:
  monitoring:
    driver: bridge
Custom queries file:
# postgres_queries.yaml
# Custom queries for postgres_exporter's extend-query mechanism. Each
# top-level key becomes a metric namespace; columns marked LABEL become
# labels, columns marked GAUGE become gauge values.
pg_database_size:
  query: |
    SELECT
      datname as database,
      pg_database_size(datname) as size_bytes
    FROM pg_database
    WHERE datname NOT IN ('template0', 'template1')
  metrics:
    - database:
        usage: "LABEL"
        description: "Database name"
    - size_bytes:
        usage: "GAUGE"
        description: "Database size in bytes"

pg_table_bloat:
  query: |
    SELECT
      schemaname,
      tablename,
      pg_total_relation_size(schemaname||'.'||tablename) AS total_bytes,
      pg_relation_size(schemaname||'.'||tablename) AS table_bytes
    FROM pg_tables
    WHERE schemaname NOT IN ('pg_catalog', 'information_schema')
  metrics:
    - schemaname:
        usage: "LABEL"
        description: "Schema name"
    - tablename:
        usage: "LABEL"
        description: "Table name"
    - total_bytes:
        usage: "GAUGE"
        description: "Total size including indexes"
    - table_bytes:
        usage: "GAUGE"
        description: "Table size"
MySQL Exporter
# docker-compose.yml
version: '3.8'

services:
  mysql_exporter:
    image: prom/mysqld-exporter:v0.15.1
    container_name: mysql_exporter
    restart: unless-stopped
    ports:
      - "9104:9104"
    # NOTE: mysqld_exporter v0.15+ reads credentials from the .my.cnf file
    # mounted below (the old DATA_SOURCE_NAME env var was removed), and the
    # collect.* options are command-line flags, not environment variables —
    # the original listed them under `environment:` where they had no effect.
    command:
      - "--config.my-cnf=/etc/mysqld-exporter/.my.cnf"
      - "--collect.global_status"
      - "--collect.info_schema.innodb_metrics"
      - "--collect.info_schema.tables"
      - "--collect.info_schema.tablestats"
      - "--collect.auto_increment.columns"
      - "--collect.perf_schema.tableiowaits"
      - "--collect.perf_schema.indexiowaits"
      - "--collect.perf_schema.file_events"
    volumes:
      - ./my.cnf:/etc/mysqld-exporter/.my.cnf:ro
    networks:
      - monitoring

networks:
  monitoring:
    driver: bridge
MongoDB Exporter
# docker-compose.yml
version: '3.8'

services:
  mongodb_exporter:
    image: percona/mongodb_exporter:0.40.0
    container_name: mongodb_exporter
    restart: unless-stopped
    ports:
      - "9216:9216"
    environment:
      # Credentials injected from the host environment / .env file
      MONGODB_URI: "mongodb://exporter:${MONGO_PASSWORD}@mongodb:27017"
      MONGODB_ENABLE_DIAGNOSTIC_DATA: "true"
      MONGODB_ENABLE_REPLICASET: "true"
    command:
      - "--mongodb.direct-connect=false"
      - "--mongodb.global-conn-pool=false"
      - "--collector.diagnosticdata"
      - "--collector.replicasetstatus"
      - "--collector.dbstats"
      - "--collector.topmetrics"
    networks:
      - monitoring

networks:
  monitoring:
    driver: bridge
Redis Exporter
# docker-compose.yml
version: '3.8'

services:
  redis_exporter:
    image: oliver006/redis_exporter:v1.58.0
    container_name: redis_exporter
    restart: unless-stopped
    ports:
      - "9121:9121"
    environment:
      REDIS_ADDR: "redis://redis:6379"
      REDIS_PASSWORD: "${REDIS_PASSWORD}"
      # Track memory usage of keys matching these patterns / exact names
      REDIS_EXPORTER_CHECK_KEYS: "myapp:user:*,myapp:session:*"
      REDIS_EXPORTER_CHECK_SINGLE_KEYS: "myapp:config,myapp:feature_flags"
    command:
      - "--include-system-metrics"
      - "--is-tile38=false"
    networks:
      - monitoring

networks:
  monitoring:
    driver: bridge
SNMP Exporter
Generator Usage
#!/bin/bash
# Generate an SNMP exporter configuration (snmp.yml) from MIB definitions.
set -euo pipefail

# Clone generator repository
git clone https://github.com/prometheus/snmp_exporter.git
cd snmp_exporter/generator

# Install build dependencies (net-snmp headers are required)
sudo apt-get install -y unzip build-essential libsnmp-dev
# `go build` produces ./generator in this directory; the original used
# `go install`, which puts the binary in GOPATH/bin instead
go build

# Create generator configuration. The quoted 'EOF' delimiter prevents the
# shell from expanding anything inside the YAML.
cat > generator.yml <<'EOF'
modules:
  cisco_switch:
    walk:
      - sysUpTime
      - interfaces
      - ifXTable
      - 1.3.6.1.2.1.31.1.1.1.6   # ifHCInOctets
      - 1.3.6.1.2.1.31.1.1.1.10  # ifHCOutOctets
      - 1.3.6.1.2.1.2.2.1.8      # ifOperStatus
    lookups:
      - source_indexes: [ifIndex]
        lookup: ifAlias
      - source_indexes: [ifIndex]
        lookup: ifDescr
    overrides:
      ifAlias:
        ignore: true
      ifDescr:
        ignore: true
      ifName:
        ignore: true
      ifType:
        type: EnumAsInfo
  pdu:
    walk:
      - sysUpTime
      - 1.3.6.1.4.1.318.1.1.12.2.3.1.1.2  # rPDUOutletStatusOutletName
      - 1.3.6.1.4.1.318.1.1.12.2.3.1.1.4  # rPDUOutletStatusOutletState
      - 1.3.6.1.4.1.318.1.1.12.2.3.1.1.6  # rPDUOutletStatusLoad
    lookups:
      - source_indexes: [rPDUOutletStatusIndex]
        lookup: rPDUOutletStatusOutletName
EOF

# Generate snmp.yml from the config above and the system MIBs
export MIBDIRS=/usr/share/snmp/mibs
./generator generate
Configuration Example
# snmp.yml (example output from generator)
# NOTE(review): module-level `version`/`auth` is the pre-v0.23 layout;
# snmp_exporter v0.23+ moved credentials into a top-level `auths:` section
# referenced via the scrape `auth` parameter — confirm against the
# exporter version actually deployed (the compose file uses v0.25.0).
modules:
  cisco_switch:
    walk:
      - 1.3.6.1.2.1.1.3.0     # sysUpTime
      - 1.3.6.1.2.1.2.2.1     # interfaces
      - 1.3.6.1.2.1.31.1.1.1  # ifXTable
    get:
      - 1.3.6.1.2.1.1.3.0
    metrics:
      - name: sysUpTime
        oid: 1.3.6.1.2.1.1.3.0
        type: gauge
      - name: ifInOctets
        oid: 1.3.6.1.2.1.2.2.1.10
        type: counter
        indexes:
          - labelname: ifIndex
            type: gauge
    version: 2
    auth:
      community: public
SNMP Docker Deployment
# docker-compose.yml
version: '3.8'

services:
  snmp_exporter:
    image: prom/snmp-exporter:v0.25.0
    container_name: snmp_exporter
    restart: unless-stopped
    ports:
      - "9116:9116"
    volumes:
      # Generated module definitions mounted read-only
      - ./snmp.yml:/etc/snmp_exporter/snmp.yml:ro
    command:
      - '--config.file=/etc/snmp_exporter/snmp.yml'
    networks:
      - monitoring

networks:
  monitoring:
    driver: bridge
Prometheus Configuration
# prometheus.yml
scrape_configs:
  # Same relabeling pattern as blackbox: the device address becomes the
  # ?target= parameter and the scrape goes to the SNMP exporter itself.
  - job_name: 'snmp_cisco'
    static_configs:
      - targets:
          - 192.168.1.10  # Cisco switch
          - 192.168.1.11  # Another switch
    metrics_path: /snmp
    params:
      module: [cisco_switch]
      auth: [public_v2]
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: snmp_exporter:9116
Custom Exporters
Writing Custom Exporter in Python
#!/usr/bin/env python3
"""
Custom Prometheus Exporter Example

Exposes simulated application metrics on port 8000 using prometheus_client.
"""
from prometheus_client import start_http_server, Gauge, Counter, Histogram
import time
import random

# Counter: monotonically increasing request count, partitioned by labels
app_requests = Counter(
    'app_requests_total',
    'Total application requests',
    ['method', 'endpoint', 'status']
)

# Histogram: request latency distribution (default buckets)
app_request_duration = Histogram(
    'app_request_duration_seconds',
    'Application request duration',
    ['method', 'endpoint']
)

# Gauges: point-in-time values that can go up or down
app_active_connections = Gauge(
    'app_active_connections',
    'Number of active connections'
)

app_queue_size = Gauge(
    'app_queue_size',
    'Size of processing queue'
)

app_last_success_timestamp = Gauge(
    'app_last_success_timestamp_seconds',
    'Timestamp of last successful operation'
)


def collect_metrics() -> None:
    """Simulate metric collection in an endless once-per-second loop."""
    while True:
        # Simulate request metrics (status skewed 3:1:1 toward 200)
        method = random.choice(['GET', 'POST', 'PUT'])
        endpoint = random.choice(['/api/users', '/api/products', '/api/orders'])
        status = random.choice(['200', '200', '200', '404', '500'])
        app_requests.labels(method=method, endpoint=endpoint, status=status).inc()

        # Simulate request duration
        duration = random.uniform(0.01, 2.0)
        app_request_duration.labels(method=method, endpoint=endpoint).observe(duration)

        # Simulate gauge metrics
        app_active_connections.set(random.randint(10, 100))
        app_queue_size.set(random.randint(0, 50))
        app_last_success_timestamp.set(time.time())

        time.sleep(1)


if __name__ == '__main__':
    # Start metrics server (serves /metrics from a background thread)
    start_http_server(8000)
    print("Metrics server started on port 8000")
    # Collect metrics (blocks forever)
    collect_metrics()
Custom Exporter in Go
package main
import (
"log"
"math/rand"
"net/http"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
)
var (
// Define metrics
appRequests = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "app_requests_total",
Help: "Total application requests",
},
[]string{"method", "endpoint", "status"},
)
appRequestDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "app_request_duration_seconds",
Help: "Application request duration",
Buckets: prometheus.DefBuckets,
},
[]string{"method", "endpoint"},
)
appActiveConnections = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "app_active_connections",
Help: "Number of active connections",
},
)
appQueueSize = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "app_queue_size",
Help: "Size of processing queue",
},
)
)
func init() {
// Register metrics
prometheus.MustRegister(appRequests)
prometheus.MustRegister(appRequestDuration)
prometheus.MustRegister(appActiveConnections)
prometheus.MustRegister(appQueueSize)
}
func collectMetrics() {
methods := []string{"GET", "POST", "PUT"}
endpoints := []string{"/api/users", "/api/products", "/api/orders"}
statuses := []string{"200", "200", "200", "404", "500"}
for {
// Simulate request metrics
method := methods[rand.Intn(len(methods))]
endpoint := endpoints[rand.Intn(len(endpoints))]
status := statuses[rand.Intn(len(statuses))]
appRequests.WithLabelValues(method, endpoint, status).Inc()
// Simulate request duration
duration := rand.Float64() * 2.0
appRequestDuration.WithLabelValues(method, endpoint).Observe(duration)
// Simulate gauge metrics
appActiveConnections.Set(float64(rand.Intn(90) + 10))
appQueueSize.Set(float64(rand.Intn(51)))
time.Sleep(time.Second)
}
}
func main() {
// Start metric collection
go collectMetrics()
// Expose metrics endpoint
http.Handle("/metrics", promhttp.Handler())
log.Println("Metrics server started on :8000")
log.Fatal(http.ListenAndServe(":8000", nil))
}
Pushgateway Usage
#!/bin/bash
# Example: Push metrics from batch job to Pushgateway

JOB_NAME="backup_job"
INSTANCE="db-server-01"
PUSHGATEWAY_URL="http://pushgateway:9091"

# Capture job start time
START_TIME=$(date +%s)

# Run backup job (example)
if /usr/local/bin/backup-database.sh; then
    STATUS="success"
    EXIT_CODE=0
else
    # Capture $? FIRST: any intervening command (even an assignment)
    # resets it — the original set STATUS before reading $?, so the
    # recorded exit code was always 0
    EXIT_CODE=$?
    STATUS="failure"
fi

# Calculate duration
END_TIME=$(date +%s)
DURATION=$((END_TIME - START_TIME))

# Push metrics to Pushgateway. job/instance are grouping labels taken from
# the URL path; they must not be repeated as labels on the pushed samples
# (Pushgateway rejects pushes whose body labels clash with grouping keys).
cat <<EOF | curl --data-binary @- "${PUSHGATEWAY_URL}/metrics/job/${JOB_NAME}/instance/${INSTANCE}"
# TYPE backup_job_success gauge
# HELP backup_job_success Backup job success status (1=success, 0=failure)
backup_job_success $([[ $STATUS == "success" ]] && echo 1 || echo 0)
# TYPE backup_job_duration_seconds gauge
# HELP backup_job_duration_seconds Backup job duration in seconds
backup_job_duration_seconds ${DURATION}
# TYPE backup_job_last_completion_timestamp gauge
# HELP backup_job_last_completion_timestamp Timestamp of last job completion
backup_job_last_completion_timestamp ${END_TIME}
# TYPE backup_job_exit_code gauge
# HELP backup_job_exit_code Exit code of backup job
backup_job_exit_code ${EXIT_CODE}
EOF

echo "Metrics pushed to Pushgateway"
Best Practices
Deployment Considerations
- Resource Isolation: Run exporters in dedicated containers or services
- Network Security: Restrict exporter access using firewalls or network policies
- Authentication: Use authentication where supported (basic auth, TLS client certs)
- Minimal Privileges: Run exporters with minimal required permissions
- Health Checks: Implement liveness/readiness probes for containerized exporters
Performance Optimization
# Optimize scrape intervals based on metric importance
scrape_configs:
  - job_name: 'critical_metrics'
    scrape_interval: 15s
    static_configs:
      - targets: ['api-server:9090']

  - job_name: 'standard_metrics'
    scrape_interval: 30s
    static_configs:
      - targets: ['app-server:9090']

  - job_name: 'low_priority_metrics'
    scrape_interval: 60s
    static_configs:
      - targets: ['batch-server:9090']
Labeling Standards
# Use consistent labels across exporters.
# external_labels are attached to every series leaving this Prometheus
# (remote write, federation, Alertmanager); per-target labels are attached
# at scrape time.
global:
  external_labels:
    environment: 'production'
    datacenter: 'us-east-1'
    cluster: 'main'

scrape_configs:
  - job_name: 'node'
    static_configs:
      - targets: ['node1:9100']
        labels:
          role: 'webserver'
          tier: 'frontend'
      - targets: ['node2:9100']
        labels:
          role: 'database'
          tier: 'backend'
Security Best Practices
- TLS Encryption: Enable TLS for exporter endpoints in production
- Authentication: Implement basic auth or mutual TLS
- Network Segmentation: Isolate exporters in monitoring network
- Least Privilege: Run exporters with minimal required permissions
- Regular Updates: Keep exporters updated to latest stable versions
Monitoring Exporters
# prometheus.yml - Monitor exporter health
scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

# Alert on exporter down.
# NOTE: alerting rules live in a separate rules file referenced from
# prometheus.yml via `rule_files:`; shown inline here for brevity.
groups:
  - name: exporter_health
    rules:
      - alert: ExporterDown
        # `up` is 0 for any scrape target that failed its last scrape
        expr: up{job!~"prometheus|alertmanager"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Exporter {{ $labels.job }} is down"
          description: "Exporter {{ $labels.job }} on {{ $labels.instance }} has been down for more than 5 minutes"
Troubleshooting
Common Issues
Exporter Not Reachable:
# Test exporter endpoint
curl -v http://localhost:9100/metrics
# Check if port is listening
# (ss -tlnp is the modern replacement on distros without net-tools)
sudo netstat -tlnp | grep 9100
# Check firewall rules
sudo iptables -L -n | grep 9100
High Cardinality Metrics:
# Find metrics with high cardinality
# (counts series per metric name; run in the Prometheus expression browser)
topk(10, count by (__name__)({__name__=~".+"}))
# Check label cardinality
# (series count per job/instance pair)
count by (job, instance) (up)
Permission Issues:
# Node exporter can't read system metrics
# (grants the read-bypass capability so the binary need not run as root)
sudo setcap cap_dac_read_search+ep /usr/local/bin/node_exporter
# Grant access to textfile collector directory
sudo chown -R node_exporter:node_exporter /var/lib/node_exporter
Memory Issues:
# Limit exporter memory usage
# Add to systemd service file
[Service]
# MemoryLimit= is the deprecated cgroup-v1 setting; MemoryMax= is its
# cgroup-v2 replacement (the unit is killed above this threshold)
MemoryMax=500M
MemoryAccounting=true