kapott

kube-prometheus-stack_servicecluster

Jan 22nd, 2026
1,184
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
YAML 3.59 KB | None | 0 0
  # values-service-cluster.yaml
  # kube-prometheus-stack Helm values for the SERVICE cluster

  # --------------------------------------------------------------
  # PROMETHEUS (optional) - metrics of the service cluster itself
  # --------------------------------------------------------------
  prometheus:
    prometheusSpec:
      # Keep only one day of data locally.
      retention: 24h

      resources:
        limits:
          memory: 2Gi
        requests:
          memory: 512Mi
          cpu: 200m

      # Identifies this cluster on everything sent to external systems.
      externalLabels:
        cluster: service-cluster

      # Ship samples to the in-cluster Thanos Receive as well, so that all
      # metrics travel through the same path.
      remoteWrite:
        - url: 'http://thanos-receive.monitoring.svc:19291/api/v1/receive'

  # ============================================
  # ALERTMANAGER - central instance
  # ============================================
  alertmanager:
    enabled: true

    # Expose via MetalLB so site clusters can reach it.
    # NOTE(review): with more than one replica behind a single load-balanced
    # IP, each incoming alert reaches only one replica. Prometheus' HA
    # guidance is to point senders at every Alertmanager instance rather
    # than at a load balancer - confirm this setup is acceptable.
    service:
      type: LoadBalancer
      annotations:
        # Placeholder: replace the "x" octets with a real address from the
        # MetalLB pool. Quoted so the value is always read as a string.
        metallb.universe.tf/loadBalancerIPs: '10.x.x.51'

    alertmanagerSpec:
      replicas: 2
      retention: 120h
      storage:
        volumeClaimTemplate:
          spec:
            accessModes: ['ReadWriteOnce']
            resources:
              requests:
                storage: 10Gi
      resources:
        requests:
          cpu: 100m
          memory: 256Mi

    config:
      global:
        resolve_timeout: 5m

      route:
        receiver: 'default'
        # Group by cluster as well, so alerts from different clusters are
        # never merged into one notification.
        group_by: ['alertname', 'cluster']
        group_wait: 30s
        group_interval: 5m
        repeat_interval: 4h
        routes:
          # `matchers` replaces the deprecated `match` field
          # (requires Alertmanager 0.22 or newer).
          - matchers:
              - 'severity = "critical"'
            receiver: 'critical'

      receivers:
        # Both receivers below have no integration configured yet, so alerts
        # routed to them are not delivered anywhere. Uncomment and fill in
        # one of the examples (Slack, email, PagerDuty, ...) before relying
        # on notifications.
        - name: 'default'
          # slack_configs:
          #   - channel: '#alerts'
          #     api_url: 'https://hooks.slack.com/...'
        - name: 'critical'
          # pagerduty_configs:
          #   - service_key: '<key>'

  # ============================================
  # GRAFANA - central with SSO
  # ============================================
  grafana:
    enabled: true

    replicas: 1

    # Persistence for dashboards
    persistence:
      enabled: true
      size: 5Gi

    # The chart provisions its own Prometheus datasource and marks it as
    # default. Thanos (see additionalDataSources below) is the intended
    # default, and Grafana refuses to provision two default datasources,
    # so the built-in one is demoted here.
    # NOTE(review): on chart versions without `isDefaultDatasource`, use
    # `defaultDatasourceEnabled: false` instead - confirm against the
    # chart version in use.
    sidecar:
      datasources:
        isDefaultDatasource: false

    # Keycloak OIDC
    grafana.ini:
      server:
        root_url: https://grafana.example.com

      auth:
        disable_login_form: false  # Set true after SSO works

      auth.generic_oauth:
        enabled: true
        name: Keycloak
        allow_sign_up: true
        client_id: grafana
        # Resolved by Grafana from its environment at startup; presumably
        # supplied by the Secret named in `envFromSecret` below - verify the
        # Secret contains a GRAFANA_OAUTH_SECRET key.
        client_secret: '${GRAFANA_OAUTH_SECRET}'
        scopes: openid profile email groups
        auth_url: https://keycloak.example.com/realms/master/protocol/openid-connect/auth
        token_url: https://keycloak.example.com/realms/master/protocol/openid-connect/token
        api_url: https://keycloak.example.com/realms/master/protocol/openid-connect/userinfo
        # Map Keycloak groups to Grafana roles; anyone in neither group
        # becomes a Viewer. Folded scalar: parsed as one single-line string.
        role_attribute_path: >-
          contains(groups[*], 'grafana-admin') && 'Admin'
          || contains(groups[*], 'grafana-editor') && 'Editor'
          || 'Viewer'

    envFromSecret: grafana-oauth-secret

    # Thanos Query as datasource (added automatically via additionalDataSources)
    additionalDataSources:
      - name: Thanos
        type: prometheus
        url: http://thanos-query.monitoring.svc:9090
        access: proxy
        isDefault: true

    resources:
      requests:
        cpu: 100m
        memory: 256Mi

  # --------------------------------------------
  # EXPORTERS - kube-state-metrics and node exporter
  # --------------------------------------------
  kubeStateMetrics:
    enabled: true

  nodeExporter:
    enabled: true

  # --------------------------------------------
  # DEFAULT RULES - have the chart create its default rules
  # --------------------------------------------
  defaultRules:
    create: true
Advertisement