Site Reliability Engineering Expert
Expert guidance for SRE practices, reliability engineering, SLOs/SLIs, incident management, and operational excellence.
Core Concepts SRE Fundamentals Service Level Objectives (SLOs) Service Level Indicators (SLIs) Error budgets Toil reduction Monitoring and alerting Capacity planning Reliability Practices Incident management Post-incident reviews (PIRs) On-call rotations Chaos engineering Disaster recovery Change management Automation Infrastructure as Code Configuration management Deployment automation Self-healing systems Runbook automation Automated remediation SLO/SLI Management from dataclasses import dataclass from datetime import datetime, timedelta from typing import List, Dict import numpy as np
@dataclass
class SLI:
    """Service Level Indicator: a measurable signal of service health."""
    name: str
    description: str
    query: str  # metric query string (e.g. PromQL) that produces the indicator
    unit: str  # 'percentage', 'milliseconds', etc.
@dataclass
class SLO:
    """Service Level Objective: a target for an SLI over a rolling window."""
    name: str
    sli: SLI  # the indicator this objective is measured against
    target: float  # target value in the SLI's unit (e.g. 99.9 for percentage)
    window_days: int  # length of the trailing evaluation window
class SLOTracker:
    """Track SLO definitions, record SLI measurements, and compute compliance.

    Measurements are held in memory per SLO name; compliance is evaluated
    over a trailing window of ``slo.window_days`` days ending now.
    """

    def __init__(self):
        # Registered SLOs keyed by SLO name.
        self.slos: Dict[str, SLO] = {}
        # Raw SLI samples keyed by SLO name; each entry is
        # {'value': float, 'timestamp': datetime}.
        self.measurements: Dict[str, List[Dict]] = {}

    def define_slo(self, slo: SLO):
        """Register a new SLO and reset its measurement history."""
        self.slos[slo.name] = slo
        self.measurements[slo.name] = []

    def record_measurement(self, slo_name: str, value: float, timestamp: datetime):
        """Record an SLI sample; measurements for unknown SLO names are dropped."""
        if slo_name in self.slos:
            self.measurements[slo_name].append({
                'value': value,
                'timestamp': timestamp
            })

    def calculate_slo_compliance(self, slo_name: str) -> Dict:
        """Compute the mean SLI over the window and compare against the target.

        Returns {} for an unknown SLO, {'status': 'no_data'} when the window
        holds no samples, otherwise a compliance summary dict.

        NOTE(review): ``actual >= target`` assumes a higher-is-better SLI
        (e.g. availability %). A lower-is-better SLO such as p95 latency
        would need the comparison inverted — confirm with callers.
        """
        slo = self.slos.get(slo_name)
        if not slo:
            return {}
        measurements = self.measurements.get(slo_name, [])
        # Only samples inside the trailing window count toward compliance.
        window_start = datetime.now() - timedelta(days=slo.window_days)
        recent_measurements = [
            m for m in measurements
            if m['timestamp'] > window_start
        ]
        if not recent_measurements:
            return {'status': 'no_data'}
        values = [m['value'] for m in recent_measurements]
        actual = np.mean(values)
        return {
            'slo_name': slo_name,
            'target': slo.target,
            'actual': actual,
            'compliant': actual >= slo.target,
            'window_days': slo.window_days,
            'sample_count': len(recent_measurements)
        }

    def calculate_error_budget(self, slo_name: str) -> Dict:
        """Compute the remaining error budget (allowed errors minus actual errors).

        Assumes a percentage SLI on a 0-100 scale. Returns
        {'status': 'no_data'} for an unknown SLO or an empty window.
        """
        compliance = self.calculate_slo_compliance(slo_name)
        # BUG FIX: an unknown SLO yields {} from calculate_slo_compliance;
        # without the emptiness check, compliance['target'] below raised KeyError.
        if not compliance or compliance.get('status') == 'no_data':
            return {'status': 'no_data'}
        target = compliance['target']
        actual = compliance['actual']
        error_budget_target = 100 - target
        errors_actual = 100 - actual
        remaining = error_budget_target - errors_actual
        # Guard against a 100% target, which leaves a zero-size budget.
        remaining_pct = (remaining / error_budget_target) * 100 if error_budget_target > 0 else 100
        return {
            'slo_name': slo_name,
            'error_budget_target': error_budget_target,
            'errors_actual': errors_actual,
            'remaining': remaining,
            'remaining_percentage': remaining_pct,
            'exhausted': remaining < 0
        }
Example SLOs
def define_standard_slos() -> List[SLO]:
    """Define standard SLOs for a web service.

    Returns:
        Two SLOs over a 30-day window: 99.9% request availability and a
        0.5-second p95 latency target.
    """
    availability = SLO(
        name="api_availability",
        sli=SLI(
            name="availability",
            description="Percentage of successful requests",
            query="sum(rate(http_requests_total{code!~'5..'}[5m])) / sum(rate(http_requests_total[5m])) * 100",
            unit="percentage"
        ),
        target=99.9,
        window_days=30
    )
    latency = SLO(
        name="api_latency",
        sli=SLI(
            name="latency_p95",
            description="95th percentile latency",
            query="histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))",
            unit="seconds"
        ),
        target=0.5,  # 500ms
        window_days=30
    )
    return [availability, latency]
Incident Management from enum import Enum from datetime import datetime from typing import List, Optional
class Severity(Enum):
    """Incident severity levels; SEV1 is most severe, SEV4 least."""
    SEV1 = "sev1"  # Critical
    SEV2 = "sev2"  # High
    SEV3 = "sev3"  # Medium
    SEV4 = "sev4"  # Low
class IncidentStatus(Enum):
    """Incident lifecycle states, in typical order of progression."""
    INVESTIGATING = "investigating"
    IDENTIFIED = "identified"
    MONITORING = "monitoring"
    RESOLVED = "resolved"
@dataclass
class Incident:
    """A tracked production incident and its response metadata."""
    incident_id: str
    title: str
    severity: Severity
    status: IncidentStatus
    started_at: datetime  # when impact began
    detected_at: datetime  # when the incident was first noticed
    resolved_at: Optional[datetime]  # None while the incident is still open
    incident_commander: str
    responders: List[str]
    affected_services: List[str]
    timeline: List[Dict]  # chronological {'timestamp', 'event'} entries
    root_cause: Optional[str] = None  # filled in during/after the post-incident review
class IncidentManager:
    """Manage incidents following SRE best practices"""

    def __init__(self):
        # Every incident ever created, keyed by incident_id.
        self.incidents: Dict[str, Incident] = {}

    def create_incident(self, incident: Incident) -> str:
        """Register a new incident, page on-call, and open its timeline."""
        self.incidents[incident.incident_id] = incident
        # Notify on-call
        self.notify_oncall(incident)
        # Start incident timeline
        self.add_timeline_event(
            incident.incident_id, "Incident created", datetime.now()
        )
        return incident.incident_id

    def update_status(self, incident_id: str, new_status: IncidentStatus,
                      note: str):
        """Transition an incident's status, logging the change to its timeline.

        Silently does nothing for an unknown incident_id. Stamps
        ``resolved_at`` when the new status is RESOLVED.
        """
        incident = self.incidents.get(incident_id)
        if incident is None:
            return
        incident.status = new_status
        self.add_timeline_event(
            incident_id,
            f"Status changed to {new_status.value}: {note}",
            datetime.now()
        )
        if new_status == IncidentStatus.RESOLVED:
            incident.resolved_at = datetime.now()

    def add_timeline_event(self, incident_id: str, event: str,
                           timestamp: datetime):
        """Append a timestamped event to the incident's timeline (no-op if unknown)."""
        incident = self.incidents.get(incident_id)
        if incident is None:
            return
        incident.timeline.append({'timestamp': timestamp, 'event': event})

    def calculate_mttr(self, incident_id: str) -> Optional[float]:
        """Return minutes from detection to resolution, or None if unresolved/unknown."""
        incident = self.incidents.get(incident_id)
        if incident is None or incident.resolved_at is None:
            return None
        elapsed = incident.resolved_at - incident.detected_at
        return elapsed.total_seconds() / 60  # minutes

    def generate_incident_report(self, incident_id: str) -> Dict:
        """Summarize an incident as a plain dict; {} for an unknown id."""
        incident = self.incidents.get(incident_id)
        if incident is None:
            return {}
        return {
            'incident_id': incident.incident_id,
            'title': incident.title,
            'severity': incident.severity.value,
            'status': incident.status.value,
            'duration_minutes': self.calculate_mttr(incident_id),
            'affected_services': incident.affected_services,
            'incident_commander': incident.incident_commander,
            'responders': incident.responders,
            'timeline': incident.timeline,
            'root_cause': incident.root_cause,
        }

    def notify_oncall(self, incident: Incident):
        """Notify on-call engineer (integrate with PagerDuty, etc.)"""
        # Implementation would integrate with alerting system
        pass
Monitoring and Alerting from prometheus_client import Counter, Histogram, Gauge import time
Metrics
# Prometheus metric handles (module-level, shared by MonitoringSystem).
request_count = Counter('http_requests_total', 'Total HTTP requests',
                        ['method', 'endpoint', 'status'])
request_duration = Histogram('http_request_duration_seconds', 'HTTP request duration')
active_connections = Gauge('active_connections', 'Number of active connections')
class MonitoringSystem:
    """Implement monitoring best practices"""

    def __init__(self):
        # Alerting rules registered via define_alert.
        self.alerts = []

    def record_request(self, method: str, endpoint: str, status: int, duration: float):
        """Record HTTP request metrics"""
        request_count.labels(method=method, endpoint=endpoint, status=status).inc()
        request_duration.observe(duration)

    def define_alert(self, name: str, expression: str, threshold: float,
                     duration: str, severity: str) -> Dict:
        """Build an alerting rule, register it, and return the rule dict."""
        rule = {
            'name': name,
            'expression': expression,
            'threshold': threshold,
            'duration': duration,
            'severity': severity,
            'annotations': {
                'summary': f'{name} alert triggered',
                'runbook_url': f'https://runbooks.example.com/{name}'
            },
        }
        self.alerts.append(rule)
        return rule

    def check_golden_signals(self, metrics: Dict) -> Dict:
        """Evaluate the four golden signals from a metrics snapshot dict."""
        latencies = metrics.get('latency', [])
        return {
            'latency': self._check_latency(latencies),
            'traffic': self._check_traffic(metrics.get('traffic', 0)),
            'errors': self._check_errors(metrics.get('error_rate', 0)),
            'saturation': self._check_saturation(metrics.get('cpu_usage', 0)),
        }

    def _check_latency(self, latencies: List[float]) -> Dict:
        # No samples means we cannot judge latency at all.
        if not latencies:
            return {'status': 'unknown'}
        p95 = np.percentile(latencies, 95)
        # p95 above 1000 (presumably ms — key says p95_ms) is critical.
        status = 'critical' if p95 > 1000 else 'ok'
        return {'status': status, 'p95_ms': p95}

    def _check_traffic(self, requests_per_second: float) -> Dict:
        # Traffic is only reported here; no threshold is applied.
        return {'status': 'ok', 'rps': requests_per_second}

    def _check_errors(self, error_rate: float) -> Dict:
        status = 'critical' if error_rate > 1.0 else 'ok'
        return {'status': status, 'error_rate': error_rate}

    def _check_saturation(self, cpu_usage: float) -> Dict:
        status = 'warning' if cpu_usage > 80 else 'ok'
        return {'status': status, 'cpu_usage': cpu_usage}
Chaos Engineering import random from typing import Callable
class ChaosExperiment:
    """Run chaos engineering experiments"""

    def __init__(self, name: str, hypothesis: str):
        self.name = name
        self.hypothesis = hypothesis
        self.results = []  # history of experiment result dicts

    def inject_latency(self, service_call: Callable, delay_ms: int):
        """Sleep for delay_ms milliseconds, then invoke service_call."""
        time.sleep(delay_ms / 1000)
        return service_call()

    def inject_failure(self, service_call: Callable, failure_rate: float):
        """Fail with probability failure_rate; otherwise pass the call through."""
        if random.random() < failure_rate:
            raise Exception("Chaos: Simulated failure")
        return service_call()

    def kill_random_instance(self, instances: List[str]) -> str:
        """Pick a random victim instance and return its name."""
        # Implementation would actually kill the instance
        return random.choice(instances)

    def run_experiment(self, experiment_func: Callable) -> Dict:
        """Execute experiment_func, capture outcome and duration, record and return it."""
        began = datetime.now()
        outcome, state, failure = None, "success", None
        try:
            outcome = experiment_func()
        except Exception as exc:  # broad on purpose: any crash is a finding
            state = "failed"
            failure = str(exc)
        finished = datetime.now()
        record = {
            'name': self.name,
            'hypothesis': self.hypothesis,
            'status': state,
            'result': outcome,
            'error': failure,
            'duration': (finished - began).total_seconds(),
            'timestamp': began,
        }
        self.results.append(record)
        return record
Best Practices SRE Principles Embrace risk management Set SLOs based on user experience Use error budgets for decision making Automate toil away Monitor the four golden signals Practice blameless post-mortems Gradual rollouts and canary deployments Incident Management Clear incident severity definitions Defined incident commander role Communicate proactively Document timeline during incident Conduct post-incident reviews Track action items to completion Share learnings across teams On-Call Reasonable on-call rotations Comprehensive runbooks Alert on symptoms, not causes Actionable alerts only Escalation policies Support on-call engineers Measure and reduce alert fatigue Anti-Patterns
❌ No SLOs defined ❌ Alerts without runbooks ❌ Blame culture for incidents ❌ No post-incident reviews ❌ 100% uptime expectations ❌ Toil not tracked or reduced ❌ Manual processes for common tasks
Resources Google SRE Book: https://sre.google/sre-book/table-of-contents/ Site Reliability Engineering: https://sre.google/ SLO Workshop: https://github.com/google/slo-workshop Chaos Engineering: https://principlesofchaos.org/ Prometheus: https://prometheus.io/