Monitoring

This guide covers setting up comprehensive monitoring for Geode, including Prometheus metrics collection, Grafana dashboards, alerting, and log aggregation.

Overview

Geode provides extensive observability capabilities:

  Component   Purpose                     Endpoint
  Metrics     Prometheus-format metrics   :8080/metrics
  Health      Health check endpoint       :8080/health
  Ready       Readiness probe             :8080/ready
  Live        Liveness probe              :8080/live
  Logs        Structured JSON logs        stdout/file

Golden Signals to monitor:

  • Latency: Request duration (p50, p95, p99)
  • Traffic: Requests per second
  • Errors: Error rate and types
  • Saturation: Resource utilization
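
As a concrete illustration (not part of Geode itself), the same four signals can be computed offline from raw request records; the field names `duration_ms`, `ok`, and `cpu` are assumptions for the sketch, not a Geode schema:

```python
def golden_signals(requests, window_seconds):
    """Summarize the four golden signals from raw request records.

    Each record is a dict like {"duration_ms": 23.5, "ok": True, "cpu": 0.42};
    these field names are illustrative, not a Geode schema.
    """
    durations = sorted(r["duration_ms"] for r in requests)
    n = len(durations)

    def pct(p):
        # nearest-rank percentile over the sorted durations
        return durations[min(n - 1, int(p * n))]

    errors = sum(1 for r in requests if not r["ok"])
    return {
        "latency_p50_ms": pct(0.50),                        # latency
        "latency_p95_ms": pct(0.95),
        "latency_p99_ms": pct(0.99),
        "traffic_rps": n / window_seconds,                  # traffic
        "error_rate": errors / n,                           # errors
        "saturation_cpu": max(r["cpu"] for r in requests),  # saturation
    }
```

In production you would read these from the Prometheus queries shown below rather than computing them in application code.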

Prometheus Integration

Enabling Metrics

# geode.yaml
monitoring:
  enabled: true
  metrics:
    enabled: true
    endpoint: '/metrics'
    port: 8080
    include_go_metrics: true
    include_process_metrics: true

Prometheus Configuration

# prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  - job_name: 'geode'
    static_configs:
      - targets: ['geode-server:8080']
    metrics_path: /metrics
    scheme: http

    # Optional: authentication
    basic_auth:
      username: prometheus
      password_file: /etc/prometheus/password

    # Optional: TLS
    tls_config:
      ca_file: /etc/prometheus/ca.pem

Key Metrics

Query Performance:

# Query latency (p50, p95, p99)
histogram_quantile(0.50, rate(geode_query_duration_seconds_bucket[5m]))
histogram_quantile(0.95, rate(geode_query_duration_seconds_bucket[5m]))
histogram_quantile(0.99, rate(geode_query_duration_seconds_bucket[5m]))

# Queries per second
rate(geode_query_total[5m])

# Query error rate
rate(geode_query_total{status="error"}[5m]) /
rate(geode_query_total[5m])

Connection Metrics:

# Active connections
geode_connections_active

# Connection rate
rate(geode_connections_total[5m])

# Connection errors
rate(geode_connection_errors_total[5m])

Storage Metrics:

# Page cache hit ratio
geode_storage_cache_hits_total /
(geode_storage_cache_hits_total + geode_storage_cache_misses_total)

# WAL write rate
rate(geode_storage_wal_bytes_written_total[5m])

# Disk usage
geode_storage_disk_usage_bytes
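
Recomputing the cache hit ratio expression in every panel and alert is easy to get out of sync; a Prometheus recording rule can precompute it once. This is a sketch: the rule name `geode:cache_hit_ratio` and file path are our own conventions.

```yaml
# /etc/prometheus/rules/geode-recording.yml
groups:
  - name: geode_recording
    rules:
      - record: geode:cache_hit_ratio
        expr: |
          geode_storage_cache_hits_total /
          (geode_storage_cache_hits_total + geode_storage_cache_misses_total)
```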

Resource Utilization:

# Memory usage
geode_process_memory_bytes

# CPU usage
rate(geode_process_cpu_seconds_total[5m])

# Goroutines (if Go-based metrics enabled)
geode_runtime_goroutines

Complete Metrics Reference

  Metric                                  Type        Description
  geode_query_duration_seconds            histogram   Query execution time
  geode_query_total                       counter     Total queries by status
  geode_query_rows_returned               histogram   Rows returned per query
  geode_connections_active                gauge       Current active connections
  geode_connections_total                 counter     Total connections
  geode_connection_errors_total           counter     Connection errors
  geode_storage_cache_hits_total          counter     Page cache hits
  geode_storage_cache_misses_total        counter     Page cache misses
  geode_storage_wal_bytes_written_total   counter     WAL bytes written
  geode_storage_disk_usage_bytes          gauge       Disk space used
  geode_index_lookups_total               counter     Index lookups by type
  geode_index_size_bytes                  gauge       Index size
  geode_auth_attempts_total               counter     Auth attempts by result
  geode_backup_last_success_timestamp     gauge       Last backup time
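
For ad-hoc inspection or custom tooling, the /metrics output can be parsed directly. This minimal parser is our own sketch of the Prometheus text exposition format and handles only the common cases (no escaping inside label values, no exemplars); the sample text mirrors the metric names above.

```python
def parse_metrics(text):
    """Parse Prometheus text exposition format.

    Returns {metric_name: [(labels_dict, value), ...]}.
    """
    out = {}
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue  # skip HELP/TYPE comments and blank lines
        name_part, _, value = line.rpartition(" ")
        labels = {}
        if "{" in name_part:
            name, _, rest = name_part.partition("{")
            for pair in rest.rstrip("}").split(","):
                key, _, val = pair.partition("=")
                labels[key] = val.strip('"')
        else:
            name = name_part
        out.setdefault(name, []).append((labels, float(value)))
    return out

sample = """\
# HELP geode_query_total Total queries by status
# TYPE geode_query_total counter
geode_query_total{status="ok"} 1500
geode_query_total{status="error"} 12
geode_connections_active 42
"""
```

For anything beyond quick scripts, prefer an established exposition-format parser over hand-rolling one.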

Grafana Dashboards

Dashboard Configuration

Create a comprehensive Geode dashboard:

{
  "dashboard": {
    "title": "Geode Overview",
    "tags": ["geode", "database"],
    "panels": [
      {
        "title": "Query Latency",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.50, rate(geode_query_duration_seconds_bucket[5m]))",
            "legendFormat": "p50"
          },
          {
            "expr": "histogram_quantile(0.95, rate(geode_query_duration_seconds_bucket[5m]))",
            "legendFormat": "p95"
          },
          {
            "expr": "histogram_quantile(0.99, rate(geode_query_duration_seconds_bucket[5m]))",
            "legendFormat": "p99"
          }
        ]
      },
      {
        "title": "Queries per Second",
        "type": "stat",
        "targets": [
          {
            "expr": "rate(geode_query_total[5m])",
            "legendFormat": "QPS"
          }
        ]
      },
      {
        "title": "Error Rate",
        "type": "gauge",
        "targets": [
          {
            "expr": "rate(geode_query_total{status='error'}[5m]) / rate(geode_query_total[5m]) * 100",
            "legendFormat": "Error %"
          }
        ]
      },
      {
        "title": "Active Connections",
        "type": "graph",
        "targets": [
          {
            "expr": "geode_connections_active",
            "legendFormat": "Connections"
          }
        ]
      },
      {
        "title": "Cache Hit Ratio",
        "type": "gauge",
        "targets": [
          {
            "expr": "geode_storage_cache_hits_total / (geode_storage_cache_hits_total + geode_storage_cache_misses_total) * 100",
            "legendFormat": "Hit %"
          }
        ]
      }
    ]
  }
}
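
Rather than importing the JSON above by hand, it can be provisioned from disk so dashboards live in version control. This sketch uses Grafana's file provisioning provider; the folder name and paths are assumptions:

```yaml
# /etc/grafana/provisioning/dashboards/geode.yml
apiVersion: 1
providers:
  - name: geode
    folder: Geode
    type: file
    allowUiUpdates: false
    options:
      path: /var/lib/grafana/dashboards/geode
```

Place the dashboard JSON files in the `path` directory and Grafana loads them at startup.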

Pre-built Dashboards

Import pre-built dashboards from Grafana.com:

  • Dashboard ID: XXXXX - Geode Overview
  • Dashboard ID: XXXXX - Geode Query Performance
  • Dashboard ID: XXXXX - Geode Storage

Dashboard Panels

System Health Panel:

{
  "title": "System Health",
  "type": "stat",
  "fieldConfig": {
    "defaults": {
      "mappings": [
        {"type": "value", "options": {"1": {"text": "Healthy", "color": "green"}}},
        {"type": "value", "options": {"0": {"text": "Unhealthy", "color": "red"}}}
      ]
    }
  },
  "targets": [
    {"expr": "up{job='geode'}"}
  ]
}

Query Performance Panel:

{
  "title": "Query Performance",
  "type": "timeseries",
  "fieldConfig": {
    "defaults": {"unit": "s"}
  },
  "targets": [
    {"expr": "histogram_quantile(0.50, rate(geode_query_duration_seconds_bucket[5m]))", "legendFormat": "p50"},
    {"expr": "histogram_quantile(0.95, rate(geode_query_duration_seconds_bucket[5m]))", "legendFormat": "p95"},
    {"expr": "histogram_quantile(0.99, rate(geode_query_duration_seconds_bucket[5m]))", "legendFormat": "p99"}
  ]
}

Alerting

Prometheus Alert Rules

# /etc/prometheus/rules/geode.yml
groups:
  - name: geode_availability
    rules:
      - alert: GeodeDown
        expr: up{job="geode"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Geode server is down"
          description: "{{ $labels.instance }} has been down for more than 1 minute"

      - alert: GeodeHighLatency
        expr: histogram_quantile(0.99, rate(geode_query_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High query latency"
          description: "p99 latency is {{ $value }}s (threshold: 1s)"

      - alert: GeodeHighErrorRate
        expr: rate(geode_query_total{status="error"}[5m]) / rate(geode_query_total[5m]) > 0.01
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate"
          description: "Error rate is {{ $value | humanizePercentage }}"

  - name: geode_resources
    rules:
      - alert: GeodeHighMemory
        expr: geode_process_memory_bytes / geode_process_memory_limit_bytes > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Memory usage is {{ $value | humanizePercentage }}"

      - alert: GeodeLowCacheHitRatio
        expr: geode_storage_cache_hits_total / (geode_storage_cache_hits_total + geode_storage_cache_misses_total) < 0.8
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Low cache hit ratio"
          description: "Cache hit ratio is {{ $value | humanizePercentage }}"

      - alert: GeodeHighDiskUsage
        expr: geode_storage_disk_usage_bytes / geode_storage_disk_total_bytes > 0.85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High disk usage"
          description: "Disk usage is {{ $value | humanizePercentage }}"

  - name: geode_backups
    rules:
      - alert: GeodeBackupOld
        expr: time() - geode_backup_last_success_timestamp > 26 * 3600
        labels:
          severity: critical
        annotations:
          summary: "Backup is too old"
          description: "Last backup was {{ $value | humanizeDuration }} ago"

      - alert: GeodeBackupFailed
        expr: increase(geode_backup_total{status="failure"}[1h]) > 0
        labels:
          severity: critical
        annotations:
          summary: "Backup failed"
          description: "Backup failure detected in the last hour"

  - name: geode_security
    rules:
      - alert: GeodeHighAuthFailures
        expr: rate(geode_auth_attempts_total{result="failure"}[5m]) > 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High authentication failure rate"
          description: "{{ $value }} auth failures per second"

Alertmanager Configuration

# alertmanager.yml
global:
  smtp_smarthost: 'smtp.example.com:587'
  smtp_from: '[email protected]'

route:
  group_by: ['alertname', 'severity']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: 'default'

  routes:
    - match:
        severity: critical
      receiver: 'pagerduty'
      continue: true

    - match:
        severity: critical
      receiver: 'slack-critical'

    - match:
        severity: warning
      receiver: 'slack-warnings'

receivers:
  - name: 'default'
    email_configs:
      - to: '[email protected]'

  - name: 'pagerduty'
    pagerduty_configs:
      - service_key: '<PAGERDUTY_KEY>'
        severity: critical

  - name: 'slack-critical'
    slack_configs:
      - api_url: '<SLACK_WEBHOOK_URL>'
        channel: '#alerts-critical'
        title: '{{ .Status | toUpper }}: {{ .CommonLabels.alertname }}'
        text: '{{ .CommonAnnotations.description }}'

  - name: 'slack-warnings'
    slack_configs:
      - api_url: '<SLACK_WEBHOOK_URL>'
        channel: '#alerts-warnings'

Health Checks

Endpoints

# Health check (detailed status)
curl http://localhost:8080/health

# Output:
{
  "status": "healthy",
  "version": "0.1.3",
  "uptime": "72h15m30s",
  "checks": {
    "storage": "healthy",
    "query_engine": "healthy",
    "connections": "healthy"
  }
}

# Readiness probe (for load balancers)
curl http://localhost:8080/ready

# Output:
{
  "ready": true
}

# Liveness probe (for orchestrators)
curl http://localhost:8080/live

# Output:
{
  "alive": true
}
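
In automation, the detailed /health body is usually reduced to a single verdict. This sketch assumes the response shape shown above (`status` plus a `checks` map):

```python
import json

def interpret_health(body):
    """Return (healthy, failing_components) from a /health response body."""
    doc = json.loads(body)
    failing = [name for name, state in doc.get("checks", {}).items()
               if state != "healthy"]
    return doc.get("status") == "healthy" and not failing, failing

body = '{"status": "healthy", "checks": {"storage": "healthy", "query_engine": "degraded"}}'
```

Treating any non-"healthy" component as a failure keeps the check conservative even if the top-level status lags behind.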

Kubernetes Probes

# kubernetes deployment
spec:
  containers:
    - name: geode
      livenessProbe:
        httpGet:
          path: /live
          port: 8080
        initialDelaySeconds: 30
        periodSeconds: 10
        failureThreshold: 3

      readinessProbe:
        httpGet:
          path: /ready
          port: 8080
        initialDelaySeconds: 5
        periodSeconds: 5
        failureThreshold: 3

      startupProbe:
        httpGet:
          path: /health
          port: 8080
        initialDelaySeconds: 10
        periodSeconds: 10
        failureThreshold: 30

Docker Health Check

HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
  CMD curl -f http://localhost:8080/health || exit 1

Log Aggregation

Structured Logging

Geode outputs structured JSON logs:

{
  "timestamp": "2026-01-28T14:30:00.123Z",
  "level": "info",
  "message": "Query executed",
  "query_id": "abc123",
  "user": "alice",
  "duration_ms": 23.5,
  "rows_returned": 150,
  "trace_id": "xyz789"
}
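
Downstream tooling only needs each log line to be one JSON object. A minimal formatter producing the same shape (the field names mirror the sample above; the formatter itself is our own sketch, not Geode code):

```python
import json
import logging
import time

class JsonFormatter(logging.Formatter):
    """Render each log record as a single JSON object per line."""

    def format(self, record):
        entry = {
            "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ",
                                       time.gmtime(record.created)),
            "level": record.levelname.lower(),
            "message": record.getMessage(),
        }
        # merge structured extras passed via extra={"fields": {...}}
        entry.update(getattr(record, "fields", {}))
        return json.dumps(entry)

logger = logging.getLogger("geode-demo")
handler = logging.StreamHandler()
handler.setFormatter(JsonFormatter())
logger.addHandler(handler)
logger.setLevel(logging.INFO)
logger.info("Query executed",
            extra={"fields": {"query_id": "abc123", "duration_ms": 23.5}})
```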

Log Configuration

# geode.yaml
logging:
  level: info              # debug, info, warn, error
  format: json             # json or text
  output: stdout           # stdout, file, or both

  file:
    path: /var/log/geode/geode.log
    max_size_mb: 100
    max_backups: 5
    max_age_days: 30
    compress: true

  # Log specific components
  components:
    query: info
    storage: warn
    network: info
    security: info

Loki Integration

# promtail.yml
server:
  http_listen_port: 9080

positions:
  filename: /tmp/positions.yaml

clients:
  - url: http://loki:3100/loki/api/v1/push

scrape_configs:
  - job_name: geode
    static_configs:
      - targets:
          - localhost
        labels:
          job: geode
          __path__: /var/log/geode/*.log

    pipeline_stages:
      - json:
          expressions:
            level: level
            trace_id: trace_id
            user: user

      - labels:
          level:
          user:

      - timestamp:
          source: timestamp
          format: RFC3339Nano

Grafana Loki Queries

# All error logs
{job="geode"} |= "error"

# Slow queries (> 1s)
{job="geode"} | json | duration_ms > 1000

# Failed authentication
{job="geode"} | json | message = "authentication_failed"

# Queries by user
{job="geode"} | json | user = "alice"

# Error rate over time
rate({job="geode"} |= "error" [5m])

Distributed Tracing

OpenTelemetry Configuration

# geode.yaml
tracing:
  enabled: true
  exporter: otlp          # otlp, jaeger, or zipkin
  otlp:
    endpoint: http://otel-collector:4317
    insecure: true

  sampling:
    type: probabilistic
    param: 0.1            # Sample 10% of traces

  propagation:
    - tracecontext        # W3C Trace Context
    - baggage             # W3C Baggage

Jaeger Integration

# geode.yaml
tracing:
  enabled: true
  exporter: jaeger
  jaeger:
    agent_host: jaeger-agent
    agent_port: 6831
    service_name: geode

Trace Context

# Query with trace context
curl -X POST http://localhost:3141/query \
  -H "traceparent: 00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01" \
  -d '{"query": "MATCH (n) RETURN n LIMIT 10"}'

SLO Monitoring

Service Level Objectives

# slo.yaml
slos:
  - name: geode_availability
    target: 99.9
    window: 30d
    indicator:
      expr: avg_over_time(up{job="geode"}[5m])

  - name: geode_latency_p99
    target: 99
    window: 30d
    indicator:
      expr: |
        histogram_quantile(0.99,
          rate(geode_query_duration_seconds_bucket[5m])
        ) < 1        

  - name: geode_error_rate
    target: 99.9
    window: 30d
    indicator:
      expr: |
        1 - (
          rate(geode_query_total{status="error"}[5m]) /
          rate(geode_query_total[5m])
        )        

Error Budget

# Error budget remaining (30 day window)
1 - (
  sum(increase(geode_query_total{status="error"}[30d])) /
  sum(increase(geode_query_total[30d]))
) / (1 - 0.999)  # 99.9% SLO
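
The same arithmetic in plain code, given the counter increases over the window (a sketch, using the 99.9% SLO from above):

```python
def error_budget_remaining(errors, total, slo=0.999):
    """Fraction of the error budget left: 1 - observed_rate / allowed_rate."""
    allowed = 1 - slo          # e.g. 0.1% of requests may fail at 99.9%
    observed = errors / total
    return 1 - observed / allowed
```

At a 99.9% SLO, 500 errors out of 1,000,000 requests consumes half the budget; 1,000 errors exhausts it.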

Best Practices

Monitoring Best Practices

  1. Monitor the four golden signals: Latency, traffic, errors, saturation
  2. Set meaningful thresholds: Based on SLOs, not arbitrary values
  3. Alert on symptoms, not causes: User-facing impact
  4. Use dashboards for investigation: Not alerting
  5. Document runbooks: Link alerts to remediation steps

Alert Best Practices

  1. Actionable alerts only: Every alert should require action
  2. Include runbook links: In alert annotations
  3. Set appropriate severity: Critical for pages, warning for tickets
  4. Use routing wisely: Right alert to right team
  5. Regular alert review: Tune or remove noisy alerts

Dashboard Best Practices

  1. Consistent layout: Similar structure across dashboards
  2. Link related dashboards: Drill-down navigation
  3. Include context: Time ranges, annotations
  4. Version control dashboards: Infrastructure as code
  5. Regular review: Update as system evolves

Troubleshooting

Metrics Not Appearing

# Verify metrics endpoint
curl http://localhost:8080/metrics

# Check Prometheus targets
curl http://prometheus:9090/api/v1/targets

# Verify network connectivity
curl -v http://geode-server:8080/metrics

High Cardinality Issues

# Check label cardinality
curl http://prometheus:9090/api/v1/label/__name__/values | jq '. | length'

# Find high cardinality metrics
promtool tsdb analyze /prometheus/data

Missing Alerts

# Check alert rules loaded
curl http://prometheus:9090/api/v1/rules

# Check Alertmanager status
curl http://alertmanager:9093/api/v2/status

# Test alert routing
amtool alert add alertname=test severity=critical