alert-management

安装量: 125
排名: #6862

安装

npx skills add https://github.com/aj-geddes/useful-ai-prompts --skill alert-management

Alert Management Overview

Design and implement sophisticated alert management systems with PagerDuty integration, escalation policies, alert routing, and incident coordination.

When to Use

- Setting up alert routing
- Managing on-call schedules
- Coordinating incident response
- Creating escalation policies
- Integrating alerting systems

Instructions

1. PagerDuty Client Integration

// pagerduty-client.js
const axios = require('axios');

/**
 * Small client for PagerDuty.
 *
 * Uses two endpoints:
 *  - the REST API (`https://api.pagerduty.com`) through a pre-configured
 *    axios instance that carries the API token, and
 *  - the Events API v2 enqueue URL, called with the bare `axios` module
 *    because events are authenticated by the routing key in the body.
 *
 * Every method logs failures with `console.error` and rethrows the original
 * error, except `getServices` / `getEscalationPolicies`, which simply let the
 * axios rejection propagate.
 */
class PagerDutyClient {
  /**
   * @param {string} apiToken - REST API token, sent as `Token token=<apiToken>`.
   */
  constructor(apiToken) {
    this.apiToken = apiToken;
    this.baseUrl = 'https://api.pagerduty.com';
    this.eventUrl = 'https://events.pagerduty.com/v2/enqueue';

    this.client = axios.create({
      baseURL: this.baseUrl,
      headers: {
        'Authorization': `Token token=${apiToken}`,
        'Accept': 'application/vnd.pagerduty+json;version=2'
      }
    });
  }

  /**
   * Send an event to the Events API v2.
   *
   * @param {object} config
   * @param {string} config.routingKey - Integration routing key.
   * @param {string} [config.eventAction='trigger']
   * @param {string} [config.dedupKey] - Defaults to `event-<epoch ms>`.
   * @param {string} config.summary
   * @param {string} [config.severity='error']
   * @param {string} [config.source='Monitoring System']
   * @param {string} [config.component]
   * @param {object} [config.customDetails={}]
   * @returns {Promise<object>} The response body returned by the Events API.
   * @throws Rethrows the axios error after logging it.
   */
  async triggerEvent(config) {
    const event = {
      routing_key: config.routingKey,
      event_action: config.eventAction || 'trigger',
      // The template literal had lost its backticks in the original text,
      // which made the whole class a syntax error.
      dedup_key: config.dedupKey || `event-${Date.now()}`,
      payload: {
        summary: config.summary,
        timestamp: new Date().toISOString(),
        severity: config.severity || 'error',
        source: config.source || 'Monitoring System',
        component: config.component,
        custom_details: config.customDetails || {}
      }
    };

    try {
      const response = await axios.post(this.eventUrl, event);
      return response.data;
    } catch (error) {
      console.error('Failed to trigger PagerDuty event:', error);
      throw error;
    }
  }

  /**
   * Send a `resolve` event for a previously triggered dedup key.
   *
   * @param {string} dedupKey - Dedup key of the event to resolve.
   * @param {string} [routingKey=process.env.PAGERDUTY_ROUTING_KEY] - Routing
   *   key of the integration that received the trigger. Optional so existing
   *   single-argument callers keep reading it from the environment.
   * @returns {Promise<object>} The full axios response (not just `.data`);
   *   kept as-is so existing callers are unaffected.
   * @throws Rethrows the axios error after logging it.
   */
  async resolveEvent(dedupKey, routingKey = process.env.PAGERDUTY_ROUTING_KEY) {
    const event = {
      routing_key: routingKey,
      event_action: 'resolve',
      dedup_key: dedupKey
    };

    try {
      return await axios.post(this.eventUrl, event);
    } catch (error) {
      console.error('Failed to resolve event:', error);
      throw error;
    }
  }

  /**
   * @returns {Promise<Array>} `services` array from `GET /services`.
   */
  async getServices() {
    const response = await this.client.get('/services');
    return response.data.services;
  }

  /**
   * @returns {Promise<Array>} `escalation_policies` array from
   *   `GET /escalation_policies`.
   */
  async getEscalationPolicies() {
    const response = await this.client.get('/escalation_policies');
    return response.data.escalation_policies;
  }

  /**
   * Create an incident through the REST API.
   *
   * The `From` header is taken from `process.env.PAGERDUTY_EMAIL`.
   *
   * @param {object} config
   * @param {string} config.title
   * @param {string} config.serviceId
   * @param {string} config.escalationPolicyId
   * @param {string} [config.details='']
   * @returns {Promise<object>} The `incident` object from the response.
   * @throws Rethrows the axios error after logging it.
   */
  async createIncident(config) {
    const incident = {
      type: 'incident',
      title: config.title,
      service: { id: config.serviceId, type: 'service_reference' },
      escalation_policy: {
        id: config.escalationPolicyId,
        type: 'escalation_policy_reference'
      },
      body: { type: 'incident_body', details: config.details || '' }
    };

    try {
      // POST /incidents expects the payload wrapped in an `incident` property;
      // the response is unwrapped from the same key below.
      const response = await this.client.post('/incidents', { incident }, {
        headers: { 'From': process.env.PAGERDUTY_EMAIL }
      });
      return response.data.incident;
    } catch (error) {
      console.error('Failed to create incident:', error);
      throw error;
    }
  }

  /**
   * Mark an incident as acknowledged.
   *
   * @param {string} incidentId
   * @param {string} [userId] - Accepted for interface compatibility; not used.
   *   The request is attributed via the `From` header
   *   (`process.env.PAGERDUTY_EMAIL`).
   * @returns {Promise<object>} The updated incident.
   * @throws Rethrows the axios error after logging it.
   */
  async acknowledgeIncident(incidentId, userId) {
    return this.#setIncidentStatus(incidentId, 'acknowledged', 'Failed to acknowledge:');
  }

  /**
   * Mark an incident as resolved.
   *
   * @param {string} incidentId
   * @returns {Promise<object>} The updated incident.
   * @throws Rethrows the axios error after logging it.
   */
  async resolveIncident(incidentId) {
    return this.#setIncidentStatus(incidentId, 'resolved', 'Failed to resolve:');
  }

  /**
   * Shared implementation of the acknowledge/resolve status change.
   *
   * The `{ incidents: [...] }` body and the `incidents[0]` response shape
   * belong to the bulk "manage incidents" endpoint (`PUT /incidents`), so the
   * request goes there rather than to `/incidents/<id>`, which takes a single
   * `incident` object instead.
   */
  async #setIncidentStatus(incidentId, status, failureLabel) {
    try {
      const response = await this.client.put(
        '/incidents',
        { incidents: [{ id: incidentId, type: 'incident_reference', status }] },
        { headers: { 'From': process.env.PAGERDUTY_EMAIL } }
      );
      return response.data.incidents[0];
    } catch (error) {
      console.error(failureLabel, error);
      throw error;
    }
  }
}

module.exports = PagerDutyClient;

2. Alertmanager Configuration

# /etc/alertmanager/alertmanager.yml
#
# Routing summary:
#   - critical alerts page via PagerDuty and, because of `continue: true`,
#     keep being evaluated against the routes that follow;
#   - warning alerts go to Slack;
#   - anything labelled service=payment-service goes to the payment team;
#   - everything else falls through to the 'default' receiver.
#
# NOTE(review): the ${...} placeholders below are assumed to be substituted
# before Alertmanager reads this file (e.g. by a templating step at deploy
# time) — confirm, since the values are passed as literal strings otherwise.
global:
  resolve_timeout: 5m
  slack_api_url: '${SLACK_WEBHOOK_URL}'

templates:
  - '/etc/alertmanager/templates/*.tmpl'

route:
  receiver: 'default'
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 4h

  routes:
    - match:
        severity: critical
      receiver: pagerduty
      continue: true
      # Page immediately instead of waiting to batch the group.
      group_wait: 0s

    - match:
        severity: warning
      receiver: slack

    - match:
        service: payment-service
      receiver: payment-team
      group_wait: 30s

receivers:
  - name: 'default'
    slack_configs:
      - channel: '#alerts'
        title: 'Alert: {{ .GroupLabels.alertname }}'

  - name: 'pagerduty'
    pagerduty_configs:
      - service_key: '${PAGERDUTY_SERVICE_KEY}'
        description: '{{ .GroupLabels.alertname }}'

  - name: 'slack'
    slack_configs:
      - channel: '#alerts'
        title: 'Warning: {{ .GroupLabels.alertname }}'

  - name: 'payment-team'
    pagerduty_configs:
      - service_key: '${PAYMENT_PAGERDUTY_KEY}'
    slack_configs:
      - channel: '#payment-alerts'

# Suppress a warning while a critical alert with the same alertname and
# service is firing.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'service']

3. Alert Handler Middleware

// alert-handler.js
const PagerDutyClient = require('./pagerduty-client');

const pdClient = new PagerDutyClient(process.env.PAGERDUTY_API_TOKEN);

class AlertHandler { constructor() { this.alertCache = new Map(); this.deduplicationWindow = 300000; // 5 minutes }

shouldSendAlert(dedupKey) { const cacheEntry = this.alertCache.get(dedupKey);

if (!cacheEntry) return true;

const timeSinceLastAlert = Date.now() - cacheEntry.timestamp;
return timeSinceLastAlert >= this.deduplicationWindow;

}

recordAlert(dedupKey) { this.alertCache.set(dedupKey, { timestamp: Date.now() }); }

determineSeverity(value, thresholds) { if (value >= thresholds.critical) return 'critical'; if (value >= thresholds.warning) return 'warning'; return 'info'; }

async sendAlert(config) { const dedupKey = config.dedupKey || alert-${config.alertName}-${Date.now()};

try {
  if (!this.shouldSendAlert(dedupKey)) {
    console.log('Alert recently sent, skipping');
    return;
  }

  const event = {
    routingKey: config.routingKey,
    eventAction: config.eventAction || 'trigger',
    dedupKey: dedupKey,
    summary: config.summary,
    severity: config.severity,
    source: config.source || 'Monitoring System',
    component: config.component,
    customDetails: {
      ...config.customDetails,
      alertName: config.alertName,
      timestamp: new Date().toISOString()
    }
  };

  const result = await pdClient.triggerEvent(event);
  this.recordAlert(dedupKey);

  console.log('Alert sent', {
    alertName: config.alertName,
    severity: config.severity
  });

  return result;
} catch (error) {
  console.error('Failed to send alert:', error);
  await this.sendSlackAlert(config);
}

}

async sendSlackAlert(config) { const axios = require('axios'); const webhookUrl = process.env.SLACK_WEBHOOK_URL;

const message = {
  color: config.severity === 'critical' ? 'danger' : 'warning',
  title: config.summary,
  text: config.customDetails?.description || '',
  fields: [
    { title: 'Severity', value: config.severity, short: true },
    { title: 'Component', value: config.component, short: true }
  ]
};

try {
  await axios.post(webhookUrl, { attachments: [message] });
} catch (error) {
  console.error('Failed to send Slack alert:', error);
}

}

async resolveAlert(dedupKey) { try { await pdClient.resolveEvent(dedupKey); console.log('Alert resolved'); } catch (error) { console.error('Failed to resolve alert:', error); } } }

module.exports = new AlertHandler();

  // 4. Alert Routing Engine
  // alert-router.js

/**
 * Priority-ordered alert router.
 *
 * Routes are kept sorted by descending priority; `route` runs the handler of
 * the first route whose condition returns a truthy value for the alert and
 * falls back to `defaultHandler` when none matches.
 */
class AlertRouter {
  constructor() {
    this.routes = [];
  }

  /**
   * Register a routing rule.
   *
   * @param {object} rule
   * @param {number} [rule.priority=0] - Higher values are evaluated first.
   * @param {function(object): boolean} rule.condition - Predicate on the alert.
   * @param {function(object, *): Promise<*>} rule.handler - Called with the
   *   alert and the rule's `escalation` value.
   * @param {*} [rule.escalation] - Passed through to the handler unchanged.
   * @throws {TypeError} When `condition` or `handler` is not a function, so a
   *   malformed rule fails at registration instead of while routing an alert.
   */
  addRoute(rule) {
    if (typeof rule.condition !== 'function') {
      throw new TypeError('AlertRouter.addRoute: rule.condition must be a function');
    }
    if (typeof rule.handler !== 'function') {
      throw new TypeError('AlertRouter.addRoute: rule.handler must be a function');
    }

    this.routes.push({
      priority: rule.priority || 0,
      condition: rule.condition,
      handler: rule.handler,
      escalation: rule.escalation
    });
    // Highest priority first.
    this.routes.sort((a, b) => b.priority - a.priority);
  }

  /**
   * Dispatch an alert to the first matching route.
   *
   * @param {object} alert
   * @returns {Promise<*>} Whatever the matching handler (or the default
   *   handler) resolves to.
   */
  async route(alert) {
    for (const route of this.routes) {
      if (route.condition(alert)) {
        return await route.handler(alert, route.escalation);
      }
    }
    return this.defaultHandler(alert);
  }

  /**
   * Fallback used when no route matches; logs the alert name.
   *
   * @param {object} alert
   * @returns {Promise<{routed: boolean, handler: string}>}
   */
  async defaultHandler(alert) {
    console.log('Routing to default handler:', alert.name);
    return { routed: true, handler: 'default' };
  }
}

// Usage: build the shared router instance and register its rules.
// The declaration below used to sit on the same line as the `// Usage`
// comment, which commented it out and left `router` undefined.
const router = new AlertRouter();

// Critical database alerts are checked first (highest priority).
router.addRoute({
  priority: 100,
  condition: (alert) => alert.severity === 'critical' && alert.component === 'database',
  handler: async (alert) => {
    console.log('Routing critical database alert to DBA team');
    return { team: 'dba', escalation: 'immediate' };
  }
});

// Any alert from the payment service, regardless of severity.
router.addRoute({
  priority: 90,
  condition: (alert) => alert.component === 'payment-service',
  handler: async (alert) => {
    console.log('Routing to payment team');
    return { team: 'payment', escalation: 'payment-policy' };
  }
});

// Remaining warnings are checked last.
router.addRoute({
  priority: 10,
  condition: (alert) => alert.severity === 'warning',
  handler: async (alert) => {
    console.log('Routing warning to Slack');
    return { handler: 'slack-only' };
  }
});

module.exports = router;

5. Docker Compose Alert Stack

# docker-compose.yml
#
# Three services: Prometheus, Alertmanager, and the alert-handler built from
# the local directory. All ${...} values are read from the environment (or an
# .env file) of the shell running docker compose.
version: '3.8'
services:
  prometheus:
    image: prom/prometheus:latest
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml

  alertmanager:
    image: prom/alertmanager:latest
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
    environment:
      SLACK_WEBHOOK_URL: ${SLACK_WEBHOOK_URL}
      PAGERDUTY_SERVICE_KEY: ${PAGERDUTY_SERVICE_KEY}
      # Referenced by the 'payment-team' receiver in alertmanager.yml.
      PAYMENT_PAGERDUTY_KEY: ${PAYMENT_PAGERDUTY_KEY}
    depends_on:
      - prometheus

  alert-handler:
    build: .
    environment:
      PAGERDUTY_API_TOKEN: ${PAGERDUTY_API_TOKEN}
      # Read by PagerDutyClient.resolveEvent as the default routing key.
      PAGERDUTY_ROUTING_KEY: ${PAGERDUTY_ROUTING_KEY}
      # Sent as the `From` header on incident create/acknowledge/resolve.
      PAGERDUTY_EMAIL: ${PAGERDUTY_EMAIL}
      SLACK_WEBHOOK_URL: ${SLACK_WEBHOOK_URL}
    ports:
      - "3000:3000"
    depends_on:
      - alertmanager

Best Practices

✅ DO

- Set appropriate thresholds
- Implement alert deduplication
- Use clear alert names
- Include runbook links
- Configure escalation properly
- Test alert rules
- Monitor alert quality
- Set repeat intervals
- Track alert metrics
- Document alert meanings

❌ DON'T

- Alert on every anomaly
- Ignore alert fatigue
- Set thresholds arbitrarily
- Skip runbooks
- Alert without action
- Disable alerts in production
- Use vague alert names
- Forget escalation policies
- Re-alert too frequently

Alert Severity Levels

- Critical: Immediate action required, customer impact
- Warning: Investigation needed, potential issues
- Info: Informational, no action required

Key Metrics

- Alert volume
- Resolution time
- False positive rate
- Escalation frequency
- MTTD (Mean Time to Detection)
- MTTR (Mean Time to Resolution)

返回排行榜