Site Reliability Engineering Expert
Expert guidance for SRE practices, reliability engineering, SLOs/SLIs, incident management, and operational excellence.
Core Concepts SRE Fundamentals Service Level Objectives (SLOs) Service Level Indicators (SLIs) Error budgets Toil reduction Monitoring and alerting Capacity planning Reliability Practices Incident management Post-incident reviews (PIRs) On-call rotations Chaos engineering Disaster recovery Change management Automation Infrastructure as Code Configuration management Deployment automation Self-healing systems Runbook automation Automated remediation SLO/SLI Management from dataclasses import dataclass from datetime import datetime, timedelta from typing import List, Dict import numpy as np
@dataclass
class SLI:
    """Service Level Indicator: a measurable signal of service health."""
    name: str
    description: str
    query: str  # metric query string (e.g. PromQL) that produces the indicator
    unit: str  # 'percentage', 'milliseconds', etc.
@dataclass
class SLO:
    """Service Level Objective: a target for an SLI over a rolling window."""
    name: str
    sli: SLI  # the indicator this objective is measured against
    target: float  # target value in the SLI's unit (e.g. 99.9 for percentage)
    window_days: int  # length of the trailing evaluation window
class SLOTracker:
    """Track SLO definitions, record SLI measurements, and compute compliance.

    Measurements are held in memory per SLO name; compliance is evaluated
    over a trailing window of ``slo.window_days`` days ending now.
    """

    def __init__(self):
        # Registered SLOs keyed by SLO name.
        self.slos: Dict[str, SLO] = {}
        # Raw SLI samples keyed by SLO name; each entry is
        # {'value': float, 'timestamp': datetime}.
        self.measurements: Dict[str, List[Dict]] = {}

    def define_slo(self, slo: SLO):
        """Register a new SLO and reset its measurement history."""
        self.slos[slo.name] = slo
        self.measurements[slo.name] = []

    def record_measurement(self, slo_name: str, value: float, timestamp: datetime):
        """Record an SLI sample; measurements for unknown SLO names are dropped."""
        if slo_name in self.slos:
            self.measurements[slo_name].append({
                'value': value,
                'timestamp': timestamp
            })

    def calculate_slo_compliance(self, slo_name: str) -> Dict:
        """Compute the mean SLI over the window and compare against the target.

        Returns {} for an unknown SLO, {'status': 'no_data'} when the window
        holds no samples, otherwise a compliance summary dict.

        NOTE(review): ``actual >= target`` assumes a higher-is-better SLI
        (e.g. availability %). A lower-is-better SLO such as p95 latency
        would need the comparison inverted — confirm with callers.
        """
        slo = self.slos.get(slo_name)
        if not slo:
            return {}
        measurements = self.measurements.get(slo_name, [])
        # Only samples inside the trailing window count toward compliance.
        window_start = datetime.now() - timedelta(days=slo.window_days)
        recent_measurements = [
            m for m in measurements
            if m['timestamp'] > window_start
        ]
        if not recent_measurements:
            return {'status': 'no_data'}
        values = [m['value'] for m in recent_measurements]
        actual = np.mean(values)
        return {
            'slo_name': slo_name,
            'target': slo.target,
            'actual': actual,
            'compliant': actual >= slo.target,
            'window_days': slo.window_days,
            'sample_count': len(recent_measurements)
        }

    def calculate_error_budget(self, slo_name: str) -> Dict:
        """Compute the remaining error budget (allowed errors minus actual errors).

        Assumes a percentage SLI on a 0-100 scale. Returns
        {'status': 'no_data'} for an unknown SLO or an empty window.
        """
        compliance = self.calculate_slo_compliance(slo_name)
        # BUG FIX: an unknown SLO yields {} from calculate_slo_compliance;
        # without the emptiness check, compliance['target'] below raised KeyError.
        if not compliance or compliance.get('status') == 'no_data':
            return {'status': 'no_data'}
        target = compliance['target']
        actual = compliance['actual']
        error_budget_target = 100 - target
        errors_actual = 100 - actual
        remaining = error_budget_target - errors_actual
        # Guard against a 100% target, which leaves a zero-size budget.
        remaining_pct = (remaining / error_budget_target) * 100 if error_budget_target > 0 else 100
        return {
            'slo_name': slo_name,
            'error_budget_target': error_budget_target,
            'errors_actual': errors_actual,
            'remaining': remaining,
            'remaining_percentage': remaining_pct,
            'exhausted': remaining < 0
        }
Example SLOs
def define_standard_slos() -> List[SLO]:
    """Define standard SLOs for a web service.

    Returns:
        Two SLOs over a 30-day window: 99.9% request availability and a
        0.5-second p95 latency target.
    """
    availability = SLO(
        name="api_availability",
        sli=SLI(
            name="availability",
            description="Percentage of successful requests",
            query="sum(rate(http_requests_total{code!~'5..'}[5m])) / sum(rate(http_requests_total[5m])) * 100",
            unit="percentage"
        ),
        target=99.9,
        window_days=30
    )
    latency = SLO(
        name="api_latency",
        sli=SLI(
            name="latency_p95",
            description="95th percentile latency",
            query="histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))",
            unit="seconds"
        ),
        target=0.5,  # 500ms
        window_days=30
    )
    return [availability, latency]
Incident Management from enum import Enum from datetime import datetime from typing import List, Optional
class Severity(Enum):
    """Incident severity levels; SEV1 is most severe, SEV4 least."""
    SEV1 = "sev1"  # Critical
    SEV2 = "sev2"  # High
    SEV3 = "sev3"  # Medium
    SEV4 = "sev4"  # Low
class IncidentStatus(Enum):
    """Incident lifecycle states, in typical order of progression."""
    INVESTIGATING = "investigating"
    IDENTIFIED = "identified"
    MONITORING = "monitoring"
    RESOLVED = "resolved"
@dataclass
class Incident:
    """A tracked production incident and its response metadata."""
    incident_id: str
    title: str
    severity: Severity
    status: IncidentStatus
    started_at: datetime  # when impact began
    detected_at: datetime  # when the incident was first noticed
    resolved_at: Optional[datetime]  # None while the incident is still open
    incident_commander: str
    responders: List[str]
    affected_services: List[str]
    timeline: List[Dict]  # chronological {'timestamp', 'event'} entries
    root_cause: Optional[str] = None  # filled in during/after the post-incident review
class IncidentManager:
    """Manage incidents following SRE best practices"""

    def __init__(self):
        # Every incident ever created, keyed by incident_id.
        self.incidents: Dict[str, Incident] = {}

    def create_incident(self, incident: Incident) -> str:
        """Register a new incident, page on-call, and open its timeline."""
        self.incidents[incident.incident_id] = incident
        # Notify on-call
        self.notify_oncall(incident)
        # Start incident timeline
        self.add_timeline_event(
            incident.incident_id, "Incident created", datetime.now()
        )
        return incident.incident_id

    def update_status(self, incident_id: str, new_status: IncidentStatus,
                      note: str):
        """Transition an incident's status, logging the change to its timeline.

        Silently does nothing for an unknown incident_id. Stamps
        ``resolved_at`` when the new status is RESOLVED.
        """
        incident = self.incidents.get(incident_id)
        if incident is None:
            return
        incident.status = new_status
        self.add_timeline_event(
            incident_id,
            f"Status changed to {new_status.value}: {note}",
            datetime.now()
        )
        if new_status == IncidentStatus.RESOLVED:
            incident.resolved_at = datetime.now()

    def add_timeline_event(self, incident_id: str, event: str,
                           timestamp: datetime):
        """Append a timestamped event to the incident's timeline (no-op if unknown)."""
        incident = self.incidents.get(incident_id)
        if incident is None:
            return
        incident.timeline.append({'timestamp': timestamp, 'event': event})

    def calculate_mttr(self, incident_id: str) -> Optional[float]:
        """Return minutes from detection to resolution, or None if unresolved/unknown."""
        incident = self.incidents.get(incident_id)
        if incident is None or incident.resolved_at is None:
            return None
        elapsed = incident.resolved_at - incident.detected_at
        return elapsed.total_seconds() / 60  # minutes

    def generate_incident_report(self, incident_id: str) -> Dict:
        """Summarize an incident as a plain dict; {} for an unknown id."""
        incident = self.incidents.get(incident_id)
        if incident is None:
            return {}
        return {
            'incident_id': incident.incident_id,
            'title': incident.title,
            'severity': incident.severity.value,
            'status': incident.status.value,
            'duration_minutes': self.calculate_mttr(incident_id),
            'affected_services': incident.affected_services,
            'incident_commander': incident.incident_commander,
            'responders': incident.responders,
            'timeline': incident.timeline,
            'root_cause': incident.root_cause,
        }

    def notify_oncall(self, incident: Incident):
        """Notify on-call engineer (integrate with PagerDuty, etc.)"""
        # Implementation would integrate with alerting system
        pass
Monitoring and Alerting from prometheus_client import Counter, Histogram, Gauge import time
Metrics
# Prometheus metric handles (module-level, shared by MonitoringSystem).
request_count = Counter('http_requests_total', 'Total HTTP requests',
                        ['method', 'endpoint', 'status'])
request_duration = Histogram('http_request_duration_seconds', 'HTTP request duration')
active_connections = Gauge('active_connections', 'Number of active connections')
class MonitoringSystem:
    """Implement monitoring best practices"""

    def __init__(self):
        # Alerting rules registered via define_alert.
        self.alerts = []

    def record_request(self, method: str, endpoint: str, status: int, duration: float):
        """Record HTTP request metrics"""
        request_count.labels(method=method, endpoint=endpoint, status=status).inc()
        request_duration.observe(duration)

    def define_alert(self, name: str, expression: str, threshold: float,
                     duration: str, severity: str) -> Dict:
        """Build an alerting rule, register it, and return the rule dict."""
        rule = {
            'name': name,
            'expression': expression,
            'threshold': threshold,
            'duration': duration,
            'severity': severity,
            'annotations': {
                'summary': f'{name} alert triggered',
                'runbook_url': f'https://runbooks.example.com/{name}'
            },
        }
        self.alerts.append(rule)
        return rule

    def check_golden_signals(self, metrics: Dict) -> Dict:
        """Evaluate the four golden signals from a metrics snapshot dict."""
        latencies = metrics.get('latency', [])
        return {
            'latency': self._check_latency(latencies),
            'traffic': self._check_traffic(metrics.get('traffic', 0)),
            'errors': self._check_errors(metrics.get('error_rate', 0)),
            'saturation': self._check_saturation(metrics.get('cpu_usage', 0)),
        }

    def _check_latency(self, latencies: List[float]) -> Dict:
        # No samples means we cannot judge latency at all.
        if not latencies:
            return {'status': 'unknown'}
        p95 = np.percentile(latencies, 95)
        # p95 above 1000 (presumably ms — key says p95_ms) is critical.
        status = 'critical' if p95 > 1000 else 'ok'
        return {'status': status, 'p95_ms': p95}

    def _check_traffic(self, requests_per_second: float) -> Dict:
        # Traffic is only reported here; no threshold is applied.
        return {'status': 'ok', 'rps': requests_per_second}

    def _check_errors(self, error_rate: float) -> Dict:
        status = 'critical' if error_rate > 1.0 else 'ok'
        return {'status': status, 'error_rate': error_rate}

    def _check_saturation(self, cpu_usage: float) -> Dict:
        status = 'warning' if cpu_usage > 80 else 'ok'
        return {'status': status, 'cpu_usage': cpu_usage}
Chaos Engineering import random from typing import Callable
class ChaosExperiment:
    """Run chaos engineering experiments"""

    def __init__(self, name: str, hypothesis: str):
        self.name = name
        self.hypothesis = hypothesis
        self.results = []  # history of experiment result dicts

    def inject_latency(self, service_call: Callable, delay_ms: int):
        """Sleep for delay_ms milliseconds, then invoke service_call."""
        time.sleep(delay_ms / 1000)
        return service_call()

    def inject_failure(self, service_call: Callable, failure_rate: float):
        """Fail with probability failure_rate; otherwise pass the call through."""
        if random.random() < failure_rate:
            raise Exception("Chaos: Simulated failure")
        return service_call()

    def kill_random_instance(self, instances: List[str]) -> str:
        """Pick a random victim instance and return its name."""
        # Implementation would actually kill the instance
        return random.choice(instances)

    def run_experiment(self, experiment_func: Callable) -> Dict:
        """Execute experiment_func, capture outcome and duration, record and return it."""
        began = datetime.now()
        outcome, state, failure = None, "success", None
        try:
            outcome = experiment_func()
        except Exception as exc:  # broad on purpose: any crash is a finding
            state = "failed"
            failure = str(exc)
        finished = datetime.now()
        record = {
            'name': self.name,
            'hypothesis': self.hypothesis,
            'status': state,
            'result': outcome,
            'error': failure,
            'duration': (finished - began).total_seconds(),
            'timestamp': began,
        }
        self.results.append(record)
        return record
Best Practices SRE Principles Embrace risk management Set SLOs based on user experience Use error budgets for decision making Automate toil away Monitor the four golden signals Practice blameless post-mortems Gradual rollouts and canary deployments Incident Management Clear incident severity definitions Defined incident commander role Communicate proactively Document timeline during incident Conduct post-incident reviews Track action items to completion Share learnings across teams On-Call Reasonable on-call rotations Comprehensive runbooks Alert on symptoms, not causes Actionable alerts only Escalation policies Support on-call engineers Measure and reduce alert fatigue Anti-Patterns
❌ No SLOs defined ❌ Alerts without runbooks ❌ Blame culture for incidents ❌ No post-incident reviews ❌ 100% uptime expectations ❌ Toil not tracked or reduced ❌ Manual processes for common tasks
Resources Google SRE Book: https://sre.google/sre-book/table-of-contents/ Site Reliability Engineering: https://sre.google/ SLO Workshop: https://github.com/google/slo-workshop Chaos Engineering: https://principlesofchaos.org/ Prometheus: https://prometheus.io/