Uptime Monitoring Overview

Set up comprehensive uptime monitoring with health checks, status pages, and incident tracking to ensure visibility into service availability.

When to Use

- Service availability tracking
- Health check implementation
- Status page creation
- Incident management
- SLA monitoring

Instructions

1. Health Check Endpoints

// Node.js health check
const express = require('express');
const app = express();

/**
 * GET /health — shallow health check.
 *
 * Replies with a fixed "ok" status, the current time as an ISO-8601 string
 * and the value of process.uptime(). No dependency is contacted.
 */
app.get('/health', (req, res) => {
  const timestamp = new Date().toISOString();
  const uptime = process.uptime();

  res.json({ status: 'ok', timestamp, uptime });
});

/**
 * GET /health/deep — dependency-aware health check.
 *
 * Probes the database, the cache and an external API and reports each result
 * under `checks` ("ok" or "error"). Only a database failure downgrades the
 * overall status to "degraded", which is answered with HTTP 503; cache and
 * external API failures are reported but leave the status code at 200.
 */
app.get('/health/deep', async (req, res) => {
  const health = {
    status: 'ok',
    checks: { database: 'unknown', cache: 'unknown', externalApi: 'unknown' },
  };

  try {
    const dbResult = await db.query('SELECT 1');
    if (dbResult) {
      health.checks.database = 'ok';
    } else {
      // An empty result is a database failure as well, so it degrades the
      // overall status exactly like a thrown error does.
      health.checks.database = 'error';
      health.status = 'degraded';
    }
  } catch {
    health.checks.database = 'error';
    health.status = 'degraded';
  }

  try {
    const cacheResult = await redis.ping();
    health.checks.cache = cacheResult === 'PONG' ? 'ok' : 'error';
  } catch {
    health.checks.cache = 'error';
  }

  try {
    // Bound the outbound call so a hung upstream cannot stall the probe; an
    // aborted request rejects and is reported as "error" below.
    const response = await fetch('https://api.example.com/health', {
      signal: AbortSignal.timeout(5000),
    });
    health.checks.externalApi = response.ok ? 'ok' : 'error';
  } catch {
    health.checks.externalApi = 'error';
  }

  const statusCode = health.status === 'ok' ? 200 : 503;
  res.status(statusCode).json(health);
});

/**
 * GET /readiness — reports whether the service can take traffic.
 *
 * Ready means the database query returned a truthy result and the cache
 * answered "PONG". Anything else, including a thrown error from either
 * dependency, is answered with HTTP 503 and `{ ready: false }`.
 */
app.get('/readiness', async (req, res) => {
  try {
    const dbCheck = await db.query('SELECT 1');
    const cacheCheck = await redis.ping();
    const isReady = dbCheck && cacheCheck === 'PONG';

    if (!isReady) {
      res.status(503).json({ ready: false });
      return;
    }

    res.json({ ready: true });
  } catch {
    res.status(503).json({ ready: false });
  }
});

/**
 * GET /liveness — process-level liveness signal.
 *
 * Always answers `{ alive: true }`; no dependency is consulted.
 */
app.get('/liveness', (req, res) => {
  const body = { alive: true };
  res.json(body);
});

Python Health Checks

from flask import Flask, jsonify
import time

# Flask takes the importing module's name as its first argument; `__name__`
# is the conventional value for a single-module application.
app = Flask(__name__)

# Wall-clock time captured at import; the baseline for uptime reporting.
startup_time = time.time()

def get_uptime():
    """Return the seconds elapsed since `startup_time`, truncated to an int."""
    elapsed = time.time() - startup_time
    return int(elapsed)

@app.route('/health')
def health():
    """Shallow health check: always reports 'ok' plus the uptime in seconds."""
    payload = {
        'status': 'ok',
        'uptime_seconds': get_uptime(),
    }
    return jsonify(payload), 200

@app.route('/health/deep')
def health_deep():
    """Dependency-aware health check.

    Probes the database and the cache and reports each as 'ok' or 'error'
    under 'checks'. Only a database failure changes the overall status to
    'degraded', which is returned with HTTP 503; a cache failure is reported
    but the response stays 200.
    """
    health_status = {
        'status': 'ok',
        'checks': {
            'database': 'unknown',
            'cache': 'unknown',
        },
    }

    try:
        # NOTE(review): `db` looks like a Flask-SQLAlchemy handle; SQLAlchemy 2.x
        # rejects raw string SQL and needs text('SELECT 1') — confirm the
        # version in use.
        db.session.execute('SELECT 1')
        health_status['checks']['database'] = 'ok'
    except Exception:
        # `Exception` rather than a bare `except` so SystemExit and
        # KeyboardInterrupt still propagate.
        health_status['checks']['database'] = 'error'
        health_status['status'] = 'degraded'

    try:
        # Any call that does not raise counts as a healthy cache; the value
        # stored under '_health' is not inspected.
        cache.get('_health')
        health_status['checks']['cache'] = 'ok'
    except Exception:
        health_status['checks']['cache'] = 'error'

    status_code = 200 if health_status['status'] == 'ok' else 503
    return jsonify(health_status), status_code

@app.route('/readiness')
def readiness():
    """Readiness check: ready (200) when a trivial database query succeeds,
    otherwise not ready (503)."""
    try:
        db.session.execute('SELECT 1')
        return jsonify({'ready': True}), 200
    except Exception:
        # `Exception` rather than a bare `except` so SystemExit and
        # KeyboardInterrupt still propagate.
        return jsonify({'ready': False}), 503

Uptime Monitor with Heartbeat

// heartbeat.js
const axios = require('axios');

/**
 * Periodically probes a list of HTTP endpoints and records every result.
 *
 * Uses `axios` for the requests and a `db` handle exposing
 * `query(sql, params)`; both must be in scope where the class is defined.
 */
class UptimeMonitor {
  /**
   * @param {object} [config]
   * @param {number} [config.checkInterval=60000] Milliseconds between check rounds.
   * @param {number} [config.timeout=5000] Per-request timeout in milliseconds.
   * @param {Array<{name: string, url: string}>} [config.endpoints=[]] Endpoints to probe.
   */
  constructor(config = {}) {
    this.checkInterval = config.checkInterval || 60000;
    this.timeout = config.timeout || 5000;
    this.endpoints = config.endpoints || [];
    // Timer handle while the monitor is running, null otherwise.
    this.interval = null;
  }

  /**
   * Probes one endpoint, stores the outcome and returns it.
   *
   * A 2xx response counts as "up"; any other status, a timeout or a network
   * error counts as "down" and the error message is attached.
   *
   * @param {{name: string, url: string}} endpoint
   * @returns {Promise<object>} The recorded check.
   */
  async checkEndpoint(endpoint) {
    const startTime = Date.now();
    let isDown = false;
    let errorMessage;

    try {
      await axios.get(endpoint.url, {
        timeout: this.timeout,
        validateStatus: (s) => s >= 200 && s < 300,
      });
    } catch (error) {
      isDown = true;
      errorMessage = error.message;
    }

    const check = {
      endpoint: endpoint.name,
      status: isDown ? 'down' : 'up',
      responseTime: Date.now() - startTime,
      timestamp: new Date(),
    };
    if (isDown) {
      check.error = errorMessage;
    }

    await this.saveCheck(check);
    return check;
  }

  /**
   * Persists a check. Storage failures are logged and never thrown, so a
   * database outage does not interrupt monitoring.
   *
   * @param {object} check
   * @returns {Promise<void>}
   */
  async saveCheck(check) {
    try {
      await db.query(
        'INSERT INTO uptime_checks (endpoint, status, response_time, timestamp) VALUES (?, ?, ?, ?)',
        [check.endpoint, check.status, check.responseTime, check.timestamp]
      );
    } catch (error) {
      console.error('Failed to save check:', error);
    }
  }

  /**
   * Probes every configured endpoint concurrently.
   *
   * @returns {Promise<object[]>} One check per endpoint, in configuration order.
   */
  async runChecks() {
    return Promise.all(this.endpoints.map((e) => this.checkEndpoint(e)));
  }

  /**
   * Runs one round immediately, then one every `checkInterval` milliseconds.
   * Calling it while already running is a no-op, so no timer is leaked.
   */
  start() {
    if (this.interval) {
      return;
    }

    // The rounds are not awaited; log a rejected round instead of letting it
    // surface as an unhandled rejection.
    const runRound = () => {
      this.runChecks().catch((error) => {
        console.error('Uptime check round failed:', error);
      });
    };

    runRound();
    this.interval = setInterval(runRound, this.checkInterval);
  }

  /** Stops the periodic checks; safe to call when not running. */
  stop() {
    if (this.interval) {
      clearInterval(this.interval);
      this.interval = null;
    }
  }

  /**
   * Aggregates the stored checks of one endpoint over a trailing window.
   *
   * @param {string} endpoint Endpoint name as stored in `uptime_checks`.
   * @param {number} [hours=24] Size of the window in hours.
   * @returns {Promise<object>} Row with total_checks, uptime_checks and avg_response_time.
   */
  async getStats(endpoint, hours = 24) {
    // NOTE(review): the destructuring assumes db.query resolves to an array
    // whose first element is the row list — confirm against the driver in use.
    const [stats] = await db.query(
      `SELECT
         COUNT(*) as total_checks,
         SUM(CASE WHEN status = 'up' THEN 1 ELSE 0 END) as uptime_checks,
         AVG(response_time) as avg_response_time
       FROM uptime_checks
       WHERE endpoint = ? AND timestamp > DATE_SUB(NOW(), INTERVAL ? HOUR)`,
      [endpoint, hours]
    );
    return stats[0];
  }
}

module.exports = UptimeMonitor;

Public Status Page API

// status-page-api.js
const express = require('express');
const router = express.Router();

/**
 * GET /api/status — public status summary.
 *
 * Lists every endpoint found in `uptime_checks` as a component whose status
 * comes from its most recent check ("operational" when that check was "up",
 * otherwise "major_outage"). The overall status is "all_operational" only
 * when every component is operational. Any failure is answered with HTTP 500.
 */
router.get('/api/status', async (req, res) => {
  try {
    // NOTE(review): the result is iterated as rows here and `[lastCheck]`
    // below takes the first row, so db.query presumably resolves to a plain
    // row array — confirm against the driver in use.
    const endpoints = await db.query('SELECT DISTINCT endpoint FROM uptime_checks');

    const status = {
      page: { name: 'My Service Status', updated_at: new Date().toISOString() },
      components: [],
    };

    for (const { endpoint } of endpoints) {
      const [lastCheck] = await db.query(`
        SELECT status FROM uptime_checks
        WHERE endpoint = ? ORDER BY timestamp DESC LIMIT 1
      `, [endpoint]);

      status.components.push({
        id: endpoint,
        name: endpoint,
        status: lastCheck?.status === 'up' ? 'operational' : 'major_outage',
      });
    }

    const allUp = status.components.every((c) => c.status === 'operational');
    status.status = {
      overall: allUp ? 'all_operational' : 'major_outage',
    };

    res.json(status);
  } catch (error) {
    // Keep the cause in the server log; the client only gets a generic message.
    console.error('Failed to fetch status:', error);
    res.status(500).json({ error: 'Failed to fetch status' });
  }
});

/**
 * GET /api/status/uptime/:endpoint — daily uptime history.
 *
 * Responds with one row per calendar day over the last 30 days for the named
 * endpoint, newest first; each row carries the day (`date`), the number of
 * checks (`total`) and how many of them were "up" (`uptime`). Any failure is
 * answered with HTTP 500.
 */
router.get('/api/status/uptime/:endpoint', async (req, res) => {
  try {
    // The endpoint name is bound as a query parameter, never interpolated.
    const stats = await db.query(
      `SELECT
         DATE(timestamp) as date,
         COUNT(*) as total,
         SUM(CASE WHEN status = 'up' THEN 1 ELSE 0 END) as uptime
       FROM uptime_checks
       WHERE endpoint = ? AND timestamp > DATE_SUB(NOW(), INTERVAL 30 DAY)
       GROUP BY DATE(timestamp)
       ORDER BY date DESC`,
      [req.params.endpoint]
    );

    res.json(stats);
  } catch (error) {
    // Keep the cause in the server log; the client only gets a generic message.
    console.error('Failed to fetch statistics:', error);
    res.status(500).json({ error: 'Failed to fetch statistics' });
  }
});

module.exports = router;

# Kubernetes Health Probes
#
# Excerpt of a Deployment: only the probe-related part of the pod template is
# shown (no metadata, selector or replica settings).
apiVersion: apps/v1
kind: Deployment
spec:
  template:
    spec:
      containers:
        - name: api-service
          image: api-service:latest

          # Startup probe: polls /health every 10s and tolerates up to 30
          # failures, i.e. roughly 300s for the application to come up.
          startupProbe:
            httpGet:
              path: /health
              port: 3000
            initialDelaySeconds: 0
            periodSeconds: 10
            failureThreshold: 30

          # Readiness probe: polls /readiness every 5s after a 5s delay;
          # 3 consecutive failures mark the pod as not ready.
          readinessProbe:
            httpGet:
              path: /readiness
              port: 3000
            initialDelaySeconds: 5
            periodSeconds: 5
            failureThreshold: 3

          # Liveness probe: polls /liveness every 20s after a 15s delay;
          # 3 consecutive failures count as a failed liveness check.
          livenessProbe:
            httpGet:
              path: /liveness
              port: 3000
            initialDelaySeconds: 15
            periodSeconds: 20
            failureThreshold: 3

/*
 * Best Practices
 *
 * ✅ DO
 * - Implement comprehensive health checks
 * - Check all critical dependencies
 * - Use appropriate timeout values
 * - Track response times
 * - Store check history
 * - Monitor uptime trends
 * - Alert on status changes
 * - Use standard HTTP status codes
 *
 * ❌ DON'T
 * - Check only application process
 * - Ignore external dependencies
 * - Set timeouts too low
 * - Alert on every failure
 * - Use health checks for load balancing
 * - Expose sensitive information
 *
 * SLA Compliance Calculation
 */

/**
 * Computes the uptime percentage and whether it meets the 99.9% and 99.99%
 * SLA tiers.
 *
 * @param {number|string|null} upChecks Number of successful checks.
 * @param {number|string|null} totalChecks Number of checks performed.
 * @returns {{uptime_percentage: (string|null), meets_99_9: boolean, meets_99_99: boolean}}
 *   `uptime_percentage` is a string with four decimals, or null when there
 *   are no checks to base a percentage on (both tier flags are then false).
 */
function calculateSLA(upChecks, totalChecks) {
  // Coerce first: null and numeric strings are accepted and become numbers.
  const up = Number(upChecks);
  const total = Number(totalChecks);

  // No checks (or unusable counts) means the uptime is unknown, not "NaN".
  if (!Number.isFinite(up) || !Number.isFinite(total) || total <= 0) {
    return { uptime_percentage: null, meets_99_9: false, meets_99_99: false };
  }

  const uptime = (up / total) * 100;

  return {
    uptime_percentage: uptime.toFixed(4),
    // Cross-multiplied so the tier thresholds are compared on the counts
    // themselves rather than on a rounded floating-point percentage.
    meets_99_9: up * 1000 >= total * 999,
    meets_99_99: up * 10000 >= total * 9999,
  };
}

uptime-monitoring

安装