Skip to content

Node.js 监控和日志

更新: 8/8/2025 字数: 0 字 时长: 0 分钟

本章将介绍 Node.js 应用的监控策略、日志管理和运维最佳实践。

应用监控

1. 健康检查

javascript
// 健康检查服务
const os = require('os');
const mongoose = require('mongoose');
const redis = require('redis');

/**
 * Registry and runner for named health checks, plus Express endpoints that
 * expose the results.
 *
 * A check is an async function that resolves to a details object (optionally
 * carrying its own `status` field) or throws/rejects to signal failure.
 */
class HealthCheckService {
  constructor() {
    // name -> { checkFunction, timeout }
    this.checks = new Map();
    this.setupDefaultChecks();
  }

  /**
   * Register a health check, replacing any existing check with the same name.
   *
   * @param {string} name - Identifier used in results and in `/health/:checkName`.
   * @param {() => Promise<object>} checkFunction - Performs the check.
   * @param {number} [timeout=5000] - Milliseconds before the check is reported as timed out.
   */
  registerCheck(name, checkFunction, timeout = 5000) {
    this.checks.set(name, { checkFunction, timeout });
  }

  /**
   * Register the built-in checks: mongodb, redis, memory, cpu and disk.
   */
  setupDefaultChecks() {
    // MongoDB: connection state plus a timed ping.
    this.registerCheck('mongodb', async () => {
      if (mongoose.connection.readyState !== 1) {
        throw new Error('MongoDB not connected');
      }

      // Measure the round trip; the latency is a duration, not a timestamp.
      const start = Date.now();
      await mongoose.connection.db.admin().ping();
      return { status: 'healthy', latency: Date.now() - start };
    });

    // Redis: open a short-lived client, time a ping, always release the client.
    this.registerCheck('redis', async () => {
      const client = redis.createClient();

      try {
        await client.connect();

        const start = Date.now();
        await client.ping();
        const latency = Date.now() - start;

        return { status: 'healthy', latency };
      } finally {
        try {
          await client.disconnect();
        } catch {
          // Best-effort cleanup: the connect/ping error (if any) is the one
          // that must reach the caller, not a failure to close.
        }
      }
    });

    // Memory: system-wide usage and V8 heap usage, unhealthy above 90%.
    this.registerCheck('memory', async () => {
      const usage = process.memoryUsage();
      const totalMemory = os.totalmem();
      const freeMemory = os.freemem();
      const usedMemory = totalMemory - freeMemory;

      const memoryUsagePercent = (usedMemory / totalMemory) * 100;
      const heapUsagePercent = (usage.heapUsed / usage.heapTotal) * 100;

      const status = memoryUsagePercent > 90 || heapUsagePercent > 90 ? 'unhealthy' : 'healthy';

      // All sizes below are reported in whole megabytes.
      return {
        status,
        systemMemory: {
          total: Math.round(totalMemory / 1024 / 1024),
          used: Math.round(usedMemory / 1024 / 1024),
          free: Math.round(freeMemory / 1024 / 1024),
          usagePercent: Math.round(memoryUsagePercent)
        },
        heapMemory: {
          total: Math.round(usage.heapTotal / 1024 / 1024),
          used: Math.round(usage.heapUsed / 1024 / 1024),
          usagePercent: Math.round(heapUsagePercent)
        }
      };
    });

    // CPU: unhealthy when the 1-minute load average exceeds 80% of the core count.
    this.registerCheck('cpu', async () => {
      const cpus = os.cpus();
      const loadAvg = os.loadavg();

      const cpuUsage = await this.getCPUUsage();

      const status = loadAvg[0] > cpus.length * 0.8 ? 'unhealthy' : 'healthy';

      return {
        status,
        cores: cpus.length,
        loadAverage: {
          '1min': loadAvg[0].toFixed(2),
          '5min': loadAvg[1].toFixed(2),
          '15min': loadAvg[2].toFixed(2)
        },
        usage: `${cpuUsage.toFixed(2)}%`
      };
    });

    // Disk: unhealthy above 90% usage of the filesystem holding the cwd.
    this.registerCheck('disk', async () => {
      const stats = await this.getDiskUsage();
      const usagePercent = ((stats.total - stats.free) / stats.total) * 100;

      const status = usagePercent > 90 ? 'unhealthy' : 'healthy';

      return {
        status,
        total: `${Math.round(stats.total / 1024 / 1024 / 1024)}GB`,
        used: `${Math.round((stats.total - stats.free) / 1024 / 1024 / 1024)}GB`,
        free: `${Math.round(stats.free / 1024 / 1024 / 1024)}GB`,
        usagePercent: Math.round(usagePercent)
      };
    });
  }

  /**
   * Sample this process's CPU usage over a 100 ms window.
   *
   * @returns {Promise<number>} Percentage of one core, capped at 100.
   */
  async getCPUUsage() {
    return new Promise((resolve) => {
      const startUsage = process.cpuUsage();
      const startTime = Date.now();

      setTimeout(() => {
        // Passing the earlier reading yields the delta since that reading.
        const currentUsage = process.cpuUsage(startUsage);
        const currentTime = Date.now();

        const elapsedTime = (currentTime - startTime) * 1000; // ms -> microseconds
        const totalUsage = currentUsage.user + currentUsage.system;
        const cpuPercent = (totalUsage / elapsedTime) * 100;

        resolve(Math.min(cpuPercent, 100));
      }, 100);
    });
  }

  /**
   * Read total and available bytes for the filesystem containing the cwd.
   *
   * @returns {Promise<{total: number, free: number}>} Sizes in bytes.
   */
  async getDiskUsage() {
    const fs = require('fs').promises;

    try {
      const stats = await fs.statfs('.');
      return {
        total: stats.blocks * stats.bsize,
        free: stats.bavail * stats.bsize
      };
    } catch (error) {
      // Fallback when statfs is unavailable or fails. NOTE: these are
      // placeholder figures (100 GB total, 50 GB free), not measurements.
      return {
        total: 100 * 1024 * 1024 * 1024,
        free: 50 * 1024 * 1024 * 1024
      };
    }
  }

  /**
   * Run one registered check, bounded by its timeout.
   *
   * @param {string} name - Name of a registered check.
   * @returns {Promise<object>} `{ name, status, timestamp, details }` on
   *   success or `{ name, status: 'unhealthy', timestamp, error }` on failure.
   * @throws {Error} When no check is registered under `name`.
   */
  async runCheck(name) {
    const check = this.checks.get(name);
    if (!check) {
      throw new Error(`Health check '${name}' not found`);
    }

    const { checkFunction, timeout } = check;
    let timer;

    try {
      const result = await Promise.race([
        checkFunction(),
        new Promise((_, reject) => {
          timer = setTimeout(() => reject(new Error('Health check timeout')), timeout);
        })
      ]);

      return {
        name,
        // A check that resolves without a status (or without any value) passed.
        status: result?.status || 'healthy',
        timestamp: new Date().toISOString(),
        details: result
      };
    } catch (error) {
      return {
        name,
        status: 'unhealthy',
        timestamp: new Date().toISOString(),
        error: error.message
      };
    } finally {
      // Without this the losing timer stays armed for the full timeout after
      // every check and keeps the event loop alive.
      clearTimeout(timer);
    }
  }

  /**
   * Run every registered check concurrently and aggregate the outcome.
   *
   * @returns {Promise<object>} `{ status, timestamp, checks, uptime, version }`;
   *   `status` is 'healthy' only when every check is 'healthy'.
   */
  async runAllChecks() {
    const names = Array.from(this.checks.keys());
    const results = await Promise.allSettled(names.map((name) => this.runCheck(name)));

    const healthChecks = results.map((result, index) =>
      result.status === 'fulfilled' ? result.value : {
        name: names[index],
        status: 'error',
        timestamp: new Date().toISOString(),
        error: result.reason?.message ?? String(result.reason)
      }
    );

    const overallStatus = healthChecks.every((check) => check.status === 'healthy')
      ? 'healthy'
      : 'unhealthy';

    return {
      status: overallStatus,
      timestamp: new Date().toISOString(),
      checks: healthChecks,
      uptime: process.uptime(),
      version: process.version
    };
  }

  /**
   * Register the health routes on an Express-style app:
   * `GET /health` (200 or 503), `GET /health/detailed` (always 200 on success)
   * and `GET /health/:checkName` (200, 503, or 404 for an unknown check).
   *
   * @param {object} app - Object exposing `get(path, handler)`.
   */
  createHealthEndpoint(app) {
    const sendError = (res, statusCode, error) => {
      res.status(statusCode).json({
        status: 'error',
        timestamp: new Date().toISOString(),
        error: error.message
      });
    };

    // Summary endpoint: the HTTP status mirrors the overall health.
    app.get('/health', async (req, res) => {
      try {
        const health = await this.runAllChecks();
        const statusCode = health.status === 'healthy' ? 200 : 503;
        res.status(statusCode).json(health);
      } catch (error) {
        sendError(res, 500, error);
      }
    });

    // Detailed endpoint: same payload, default status code.
    // Must be registered before the parameterized route below.
    app.get('/health/detailed', async (req, res) => {
      try {
        const health = await this.runAllChecks();
        res.json(health);
      } catch (error) {
        sendError(res, 500, error);
      }
    });

    // Single check: runCheck only throws for an unknown name, hence 404.
    app.get('/health/:checkName', async (req, res) => {
      try {
        const result = await this.runCheck(req.params.checkName);
        const statusCode = result.status === 'healthy' ? 200 : 503;
        res.status(statusCode).json(result);
      } catch (error) {
        sendError(res, 404, error);
      }
    });
  }
}

// Usage example: the constructor registers the default checks
// (mongodb, redis, memory, cpu, disk).
const healthCheck = new HealthCheckService();

// Register a custom check. Throwing marks the check as unhealthy; the
// returned object becomes the `details` of the check result.
healthCheck.registerCheck('external-api', async () => {
  const response = await fetch('https://api.example.com/health');
  if (!response.ok) {
    throw new Error(`API returned ${response.status}`);
  }
  
  return {
    status: 'healthy',
    // Reported as-is from the remote service's response header.
    latency: response.headers.get('x-response-time')
  };
});

// Use in an Express application
const express = require('express');
const app = express();

// Registers GET /health, /health/detailed and /health/:checkName on the app.
healthCheck.createHealthEndpoint(app);

module.exports = { HealthCheckService, healthCheck };

2. 性能指标收集

javascript
// 性能指标收集器
const EventEmitter = require('events');

/**
 * In-process metrics registry with counters, gauges and histograms.
 *
 * Metrics are stored per key, where a key is the metric name plus its sorted
 * tags (see `getMetricKey`). Every update is also emitted as an event
 * ('counter', 'gauge', 'histogram'), and 'high_eventloop_delay' is emitted
 * when the measured event-loop delay exceeds 10 ms.
 *
 * Constructing an instance starts two periodic collection timers; call
 * `stop()` to clear them.
 */
class MetricsCollector extends EventEmitter {
  constructor() {
    super();
    this.metrics = new Map();
    this.counters = new Map();   // key -> running total
    this.histograms = new Map(); // key -> [{ value, timestamp }]
    this.gauges = new Map();     // key -> { value, timestamp, tags }

    this.startCollection();
  }

  /**
   * Start the periodic collectors: process metrics every second and
   * system-level metrics every minute. Any timers already running are
   * cleared first, so calling this twice does not leak intervals.
   */
  startCollection() {
    this.stop();

    this.collectionInterval = setInterval(() => {
      this.collectSystemMetrics();
    }, 1000);

    this.detailedInterval = setInterval(() => {
      this.collectDetailedMetrics();
    }, 60000);
  }

  /**
   * Record per-process gauges: memory, CPU time, event-loop delay and the
   * number of active handles/requests.
   */
  collectSystemMetrics() {
    const timestamp = Date.now();

    // Memory (bytes)
    const memUsage = process.memoryUsage();
    this.setGauge('memory.rss', memUsage.rss, timestamp);
    this.setGauge('memory.heap_total', memUsage.heapTotal, timestamp);
    this.setGauge('memory.heap_used', memUsage.heapUsed, timestamp);
    this.setGauge('memory.external', memUsage.external, timestamp);

    // CPU time consumed by this process (values as returned by process.cpuUsage()).
    const cpuUsage = process.cpuUsage();
    this.setGauge('cpu.user', cpuUsage.user, timestamp);
    this.setGauge('cpu.system', cpuUsage.system, timestamp);

    // Event-loop delay (recorded asynchronously)
    this.measureEventLoopDelay();

    // Active handles and requests.
    // NOTE(review): underscore-prefixed process methods are not part of the
    // documented API — confirm availability on the target Node.js version.
    this.setGauge('handles.active', process._getActiveHandles().length, timestamp);
    this.setGauge('requests.active', process._getActiveRequests().length, timestamp);
  }

  /**
   * Record host-level gauges: load averages, system memory and process uptime.
   */
  collectDetailedMetrics() {
    const timestamp = Date.now();

    // System load averages
    const os = require('os');
    const loadAvg = os.loadavg();
    this.setGauge('system.load_1m', loadAvg[0], timestamp);
    this.setGauge('system.load_5m', loadAvg[1], timestamp);
    this.setGauge('system.load_15m', loadAvg[2], timestamp);

    // System memory (bytes)
    const totalMem = os.totalmem();
    const freeMem = os.freemem();
    this.setGauge('system.memory_total', totalMem, timestamp);
    this.setGauge('system.memory_free', freeMem, timestamp);
    this.setGauge('system.memory_used', totalMem - freeMem, timestamp);

    // Uptime (seconds)
    this.setGauge('process.uptime', process.uptime(), timestamp);
  }

  /**
   * Measure how long a `setImmediate` callback takes to run, record it as the
   * 'eventloop.delay' gauge (ms) and emit 'high_eventloop_delay' above 10 ms.
   */
  measureEventLoopDelay() {
    const start = process.hrtime.bigint();

    setImmediate(() => {
      const delay = Number(process.hrtime.bigint() - start) / 1000000; // ns -> ms
      this.setGauge('eventloop.delay', delay, Date.now());

      if (delay > 10) {
        this.emit('high_eventloop_delay', { delay });
      }
    });
  }

  /**
   * Add `value` to a counter.
   *
   * @param {string} name - Metric name.
   * @param {number} [value=1] - Increment.
   * @param {object} [tags={}] - Tags distinguishing this series.
   */
  incrementCounter(name, value = 1, tags = {}) {
    const key = this.getMetricKey(name, tags);
    const current = this.counters.get(key) || 0;
    this.counters.set(key, current + value);

    this.emit('counter', { name, value, tags, total: current + value });
  }

  /**
   * Append an observation to a histogram. Only the most recent 1000
   * observations per key are retained.
   *
   * @param {string} name - Metric name.
   * @param {number} value - Observed value.
   * @param {object} [tags={}] - Tags distinguishing this series.
   */
  recordHistogram(name, value, tags = {}) {
    const key = this.getMetricKey(name, tags);

    if (!this.histograms.has(key)) {
      this.histograms.set(key, []);
    }

    const values = this.histograms.get(key);
    values.push({ value, timestamp: Date.now() });

    // Keep a sliding window of the latest 1000 values.
    if (values.length > 1000) {
      values.shift();
    }

    this.emit('histogram', { name, value, tags });
  }

  /**
   * Set a gauge to an instantaneous value.
   *
   * @param {string} name - Metric name.
   * @param {number} value - Current value.
   * @param {number} [timestamp=Date.now()] - Epoch milliseconds of the reading.
   * @param {object} [tags={}] - Tags distinguishing this series.
   */
  setGauge(name, value, timestamp = Date.now(), tags = {}) {
    const key = this.getMetricKey(name, tags);
    this.gauges.set(key, { value, timestamp, tags });

    this.emit('gauge', { name, value, timestamp, tags });
  }

  /**
   * Build the storage key for a metric: `name` alone when there are no tags,
   * otherwise `name{k1=v1,k2=v2}` with tag names sorted.
   *
   * @param {string} name - Metric name.
   * @param {object} tags - Tag name/value pairs.
   * @returns {string} Storage key.
   */
  getMetricKey(name, tags) {
    const tagString = Object.entries(tags)
      .sort(([a], [b]) => a.localeCompare(b))
      .map(([k, v]) => `${k}=${v}`)
      .join(',');

    return tagString ? `${name}{${tagString}}` : name;
  }

  /**
   * Summarize the retained observations of one histogram series.
   *
   * @param {string} name - Metric name.
   * @param {object} [tags={}] - Tags identifying the series.
   * @returns {object|null} `{ count, sum, min, max, mean, p50, p90, p95, p99 }`
   *   or null when the series has no observations.
   */
  getHistogramStats(name, tags = {}) {
    const key = this.getMetricKey(name, tags);
    return this.summarizeHistogram(this.histograms.get(key));
  }

  /**
   * Compute summary statistics for a list of `{ value }` observations.
   *
   * @param {Array<{value: number}>|undefined} values - Stored observations.
   * @returns {object|null} Summary, or null for a missing/empty list.
   */
  summarizeHistogram(values) {
    if (!values || values.length === 0) {
      return null;
    }

    const sortedValues = values.map((v) => v.value).sort((a, b) => a - b);
    const count = sortedValues.length;
    const sum = sortedValues.reduce((a, b) => a + b, 0);

    return {
      count,
      sum,
      min: sortedValues[0],
      max: sortedValues[count - 1],
      mean: sum / count,
      p50: this.percentile(sortedValues, 0.5),
      p90: this.percentile(sortedValues, 0.9),
      p95: this.percentile(sortedValues, 0.95),
      p99: this.percentile(sortedValues, 0.99)
    };
  }

  /**
   * Nearest-rank percentile of an ascending array.
   *
   * @param {number[]} sortedValues - Values sorted ascending.
   * @param {number} p - Fraction in [0, 1].
   * @returns {number} The selected element.
   */
  percentile(sortedValues, p) {
    const index = Math.ceil(sortedValues.length * p) - 1;
    return sortedValues[Math.max(0, index)];
  }

  /**
   * Snapshot every metric.
   *
   * Histograms are summarized per stored key, so tagged series are reported
   * under their full `name{tags}` key rather than being looked up by bare name.
   *
   * @returns {{timestamp: number, counters: object, gauges: object, histograms: object}}
   */
  getAllMetrics() {
    const timestamp = Date.now();

    return {
      timestamp,
      counters: Object.fromEntries(this.counters),
      gauges: Object.fromEntries(this.gauges),
      histograms: Object.fromEntries(
        Array.from(this.histograms, ([key, values]) => [
          key,
          this.summarizeHistogram(values)
        ])
      )
    };
  }

  /**
   * Drop all recorded counters, histograms and gauges.
   */
  reset() {
    this.counters.clear();
    this.histograms.clear();
    this.gauges.clear();
  }

  /**
   * Stop the periodic collectors. Safe to call more than once.
   */
  stop() {
    if (this.collectionInterval) {
      clearInterval(this.collectionInterval);
      this.collectionInterval = undefined;
    }

    if (this.detailedInterval) {
      clearInterval(this.detailedInterval);
      this.detailedInterval = undefined;
    }
  }
}

/**
 * Build an Express middleware that records HTTP metrics on a collector.
 *
 * At request start it increments `http.requests.total`. When the response
 * finishes it records `http.request.duration` (ms), increments
 * `http.responses.total`, and records `http.response.size` when the response
 * carries a numeric Content-Length header.
 *
 * @param {object} metricsCollector - Object exposing `incrementCounter` and `recordHistogram`.
 * @returns {(req: object, res: object, next: Function) => void} Middleware.
 */
const createMetricsMiddleware = (metricsCollector) => {
  return (req, res, next) => {
    const startTime = Date.now();

    // Count the request as soon as it arrives.
    // NOTE(review): this runs before routing, so `req.route` is normally not
    // set yet and the raw path becomes the `route` tag; with parameterized
    // URLs that can create one counter series per distinct path — confirm
    // this cardinality is acceptable.
    metricsCollector.incrementCounter('http.requests.total', 1, {
      method: req.method,
      route: req.route?.path || req.path
    });

    res.on('finish', () => {
      const duration = Date.now() - startTime;
      const statusClass = `${Math.floor(res.statusCode / 100)}xx`;
      const responseTags = {
        method: req.method,
        status_class: statusClass,
        status_code: res.statusCode.toString()
      };

      // Response time
      metricsCollector.recordHistogram('http.request.duration', duration, { ...responseTags });

      // Response status
      metricsCollector.incrementCounter('http.responses.total', 1, { ...responseTags });

      // Response size: parse as base 10 and skip values that are not numbers
      // so NaN never enters the histogram.
      const contentLength = Number.parseInt(res.get('Content-Length'), 10);
      if (!Number.isNaN(contentLength)) {
        metricsCollector.recordHistogram('http.response.size', contentLength, {
          method: req.method
        });
      }
    });

    next();
  };
};

/**
 * Build helpers that report database activity to a metrics collector.
 *
 * @param {object} metricsCollector - Object exposing `recordHistogram`,
 *   `incrementCounter` and `setGauge`.
 * @returns {{recordQuery: Function, recordConnectionPool: Function}} Recording helpers.
 */
const createDatabaseMetrics = (metricsCollector) => {
  /**
   * Record one query: its duration in `db.query.duration` and its outcome in
   * `db.queries.total` (tagged 'success' or 'error').
   */
  const recordQuery = (operation, collection, duration, success = true) => {
    metricsCollector.recordHistogram('db.query.duration', duration, { operation, collection });

    const status = success ? 'success' : 'error';
    metricsCollector.incrementCounter('db.queries.total', 1, { operation, collection, status });
  };

  /**
   * Publish the connection pool's active, idle and total counts as gauges.
   */
  const recordConnectionPool = (active, idle, total) => {
    const poolReadings = [
      ['db.connections.active', active],
      ['db.connections.idle', idle],
      ['db.connections.total', total]
    ];

    for (const [gaugeName, reading] of poolReadings) {
      metricsCollector.setGauge(gaugeName, reading);
    }
  };

  return { recordQuery, recordConnectionPool };
};

// Usage example: constructing the collector starts its periodic collection.
const metricsCollector = new MetricsCollector();

// Warn whenever the collector reports a high event-loop delay
// (the event carries the measured delay in milliseconds).
metricsCollector.on('high_eventloop_delay', ({ delay }) => {
  console.warn(`High event loop delay detected: ${delay.toFixed(2)}ms`);
});

// Use in an Express application
const express = require('express');
const app = express();

// Record request/response metrics for every request.
app.use(createMetricsMiddleware(metricsCollector));

// Metrics endpoint: full snapshot as JSON
app.get('/metrics', (req, res) => {
  const metrics = metricsCollector.getAllMetrics();
  res.json(metrics);
});

// Metrics endpoint: counters and gauges rendered as Prometheus-style text
app.get('/metrics/prometheus', (req, res) => {
  const metrics = metricsCollector.getAllMetrics();
  const prometheus = convertToPrometheusFormat(metrics);
  res.set('Content-Type', 'text/plain');
  res.send(prometheus);
});

/**
 * Render counters and gauges in the Prometheus text exposition format.
 *
 * Keys follow the `name` or `name{k=v,k2=v2}` convention. The metric name is
 * sanitized (characters outside `[a-zA-Z0-9_:]`, such as dots, become `_`),
 * tags become quoted labels, each metric gets a single `# TYPE` line, and all
 * series of one metric are emitted together. Histograms are not rendered.
 *
 * Tag parsing is best-effort: a comma-separated segment without `=` is treated
 * as the continuation of the previous tag's value.
 *
 * @param {{counters: object, gauges: object}} metrics - Snapshot holding
 *   counter totals by key and gauge records (`{ value }`) by key.
 * @returns {string} Exposition text; empty string when there is nothing to report.
 */
function convertToPrometheusFormat(metrics) {
  const sanitizeMetricName = (name) => name.replace(/[^a-zA-Z0-9_:]/g, '_');
  const sanitizeLabelName = (name) => name.replace(/[^a-zA-Z0-9_]/g, '_');
  const escapeLabelValue = (value) =>
    value.replace(/\\/g, '\\\\').replace(/"/g, '\\"').replace(/\n/g, '\\n');

  // Split a storage key into a sanitized name and a rendered label block.
  const parseKey = (key) => {
    const braceIndex = key.indexOf('{');
    if (braceIndex === -1 || !key.endsWith('}')) {
      return { name: sanitizeMetricName(key), labels: '' };
    }

    const pairs = [];
    for (const segment of key.slice(braceIndex + 1, -1).split(',')) {
      const eqIndex = segment.indexOf('=');
      if (eqIndex === -1) {
        // No '=': the previous value contained a comma.
        if (pairs.length > 0) {
          pairs[pairs.length - 1][1] += `,${segment}`;
        }
        continue;
      }
      pairs.push([segment.slice(0, eqIndex), segment.slice(eqIndex + 1)]);
    }

    const rendered = pairs
      .map(([labelName, labelValue]) => `${sanitizeLabelName(labelName)}="${escapeLabelValue(labelValue)}"`)
      .join(',');

    return {
      name: sanitizeMetricName(key.slice(0, braceIndex)),
      labels: rendered ? `{${rendered}}` : ''
    };
  };

  // Group samples by metric name so each metric is one contiguous block.
  const renderFamily = (type, samples) => {
    const groups = new Map();
    for (const [key, value] of samples) {
      const { name, labels } = parseKey(key);
      if (!groups.has(name)) {
        groups.set(name, []);
      }
      groups.get(name).push(`${name}${labels} ${value}`);
    }

    const lines = [];
    for (const [name, sampleLines] of groups) {
      lines.push(`# TYPE ${name} ${type}`, ...sampleLines);
    }
    return lines;
  };

  const lines = [
    ...renderFamily('counter', Object.entries(metrics.counters)),
    ...renderFamily('gauge', Object.entries(metrics.gauges).map(([key, data]) => [key, data.value]))
  ];

  return lines.length > 0 ? `${lines.join('\n')}\n` : '';
}

module.exports = {
  MetricsCollector,
  createMetricsMiddleware,
  createDatabaseMetrics
};

日志管理

1. 结构化日志

javascript
// 高级日志管理系统
const winston = require('winston');
const path = require('path');
const fs = require('fs');

/**
 * Structured logging facade built on winston.
 *
 * Creates five file-backed loggers (app, access, security, performance,
 * audit) that write one JSON object per line, installs process-level error
 * handlers, and offers helpers for log rotation and cleanup.
 */
class LogManager {
  /**
   * @param {object} [config={}] - Overrides for the defaults below.
   * @param {string} [config.level] - App logger level (default: LOG_LEVEL env or 'info').
   * @param {number} [config.maxFileSize] - Size threshold in bytes (default 10 MB).
   * @param {number} [config.maxFiles] - Files kept per transport (default 10).
   * @param {string} [config.logDir] - Log directory (default `<cwd>/logs`).
   */
  constructor(config = {}) {
    this.config = {
      level: process.env.LOG_LEVEL || 'info',
      format: 'json',
      maxFileSize: 10 * 1024 * 1024, // 10MB
      maxFiles: 10,
      logDir: path.join(process.cwd(), 'logs'),
      ...config
    };

    this.ensureLogDirectory();
    this.createLoggers();
    this.setupProcessHandlers();
  }

  /**
   * Create the log directory (including parents) when it does not exist.
   */
  ensureLogDirectory() {
    if (!fs.existsSync(this.config.logDir)) {
      fs.mkdirSync(this.config.logDir, { recursive: true });
    }
  }

  /**
   * Build a size-limited file transport inside the log directory.
   *
   * @param {string} filename - File name relative to the log directory.
   * @param {object} [options={}] - Extra transport options (e.g. `level`).
   * @returns {object} winston File transport.
   */
  createFileTransport(filename, options = {}) {
    return new winston.transports.File({
      filename: path.join(this.config.logDir, filename),
      maxsize: this.config.maxFileSize,
      maxFiles: this.config.maxFiles,
      ...options
    });
  }

  /**
   * Build an 'info'-level logger that writes to a single file.
   *
   * @param {string} loggerType - Value of the `logger` field in each entry.
   * @param {string} filename - Target file name.
   * @returns {object} winston logger.
   */
  createChannelLogger(loggerType, filename) {
    return winston.createLogger({
      level: 'info',
      format: this.createFormat(loggerType),
      transports: [this.createFileTransport(filename)]
    });
  }

  /**
   * Create all loggers. Outside production the app logger also writes a
   * colorized, simple-format line to the console.
   */
  createLoggers() {
    // Main application log: errors to their own file, everything to combined.
    this.appLogger = winston.createLogger({
      level: this.config.level,
      format: this.createFormat('app'),
      transports: [
        this.createFileTransport('app-error.log', { level: 'error' }),
        this.createFileTransport('app-combined.log')
      ]
    });

    this.accessLogger = this.createChannelLogger('access', 'access.log');
    this.securityLogger = this.createChannelLogger('security', 'security.log');
    this.performanceLogger = this.createChannelLogger('performance', 'performance.log');
    this.auditLogger = this.createChannelLogger('audit', 'audit.log');

    if (process.env.NODE_ENV !== 'production') {
      const consoleTransport = new winston.transports.Console({
        format: winston.format.combine(
          winston.format.colorize(),
          winston.format.simple()
        )
      });

      this.appLogger.add(consoleTransport);
    }
  }

  /**
   * Build the JSON-line format shared by all loggers.
   *
   * Each line contains `timestamp`, `level`, `logger`, `message`, every
   * metadata field passed to the log call, the contents of a `meta` object
   * (flattened to the top level), and `stack` when present. Keeping the
   * top-level metadata is what lets the access/security/performance/audit
   * entries, which pass their fields directly, reach the file.
   *
   * @param {string} loggerType - Value written to the `logger` field.
   * @returns {object} winston format.
   */
  createFormat(loggerType) {
    return winston.format.combine(
      winston.format.timestamp(),
      winston.format.errors({ stack: true }),
      // printf produces the final line, so no separate json() step is needed.
      winston.format.printf((info) => {
        const { timestamp, level, message, meta, stack, ...fields } = info;

        return JSON.stringify({
          timestamp,
          level,
          logger: loggerType,
          message,
          ...fields,
          ...meta,
          ...(stack && { stack })
        });
      })
    );
  }

  /**
   * Install process-level handlers that log uncaught exceptions, unhandled
   * promise rejections and process exit through the app logger.
   */
  setupProcessHandlers() {
    // Uncaught exception: log, then exit with code 1.
    process.on('uncaughtException', (error) => {
      this.appLogger.error('Uncaught Exception', {
        error: {
          message: error.message,
          stack: error.stack,
          code: error.code
        },
        process: {
          pid: process.pid,
          uptime: process.uptime(),
          memoryUsage: process.memoryUsage()
        }
      });

      // Give the log entry a moment to be written before exiting.
      setTimeout(() => {
        process.exit(1);
      }, 1000);
    });

    // Unhandled promise rejection: log only.
    process.on('unhandledRejection', (reason, promise) => {
      this.appLogger.error('Unhandled Promise Rejection', {
        reason: reason instanceof Error ? {
          message: reason.message,
          stack: reason.stack
        } : reason,
        promise: promise.toString()
      });
    });

    // Process exit
    process.on('exit', (code) => {
      this.appLogger.info('Process Exit', {
        exitCode: code,
        uptime: process.uptime()
      });
    });
  }

  /**
   * Log an informational application message.
   * @param {string} message
   * @param {object} [meta={}] - Extra fields for the entry.
   */
  info(message, meta = {}) {
    this.appLogger.info(message, { meta });
  }

  /**
   * Log an application error.
   * @param {string} message
   * @param {Error|null} [error=null] - Error whose message/stack/code are recorded under `error`.
   * @param {object} [meta={}] - Extra fields for the entry.
   */
  error(message, error = null, meta = {}) {
    const errorMeta = error ? {
      error: {
        message: error.message,
        stack: error.stack,
        code: error.code
      }
    } : {};

    this.appLogger.error(message, { meta: { ...meta, ...errorMeta } });
  }

  /**
   * Log an application warning.
   * @param {string} message
   * @param {object} [meta={}] - Extra fields for the entry.
   */
  warn(message, meta = {}) {
    this.appLogger.warn(message, { meta });
  }

  /**
   * Log an application debug message.
   * @param {string} message
   * @param {object} [meta={}] - Extra fields for the entry.
   */
  debug(message, meta = {}) {
    this.appLogger.debug(message, { meta });
  }

  /**
   * Write one HTTP access entry.
   * @param {object} req - Express request.
   * @param {object} res - Express response.
   * @param {number} responseTime - Handling time in milliseconds.
   */
  logAccess(req, res, responseTime) {
    this.accessLogger.info('HTTP Request', {
      method: req.method,
      url: req.url,
      statusCode: res.statusCode,
      responseTime: `${responseTime}ms`,
      contentLength: res.get('Content-Length') || 0,
      userAgent: req.get('User-Agent'),
      ip: req.ip,
      userId: req.user?.id,
      sessionId: req.sessionID
    });
  }

  /**
   * Write a security event at 'warn' level.
   * @param {string} event - Event name.
   * @param {object} [details={}] - Extra fields for the entry.
   */
  logSecurity(event, details = {}) {
    this.securityLogger.warn('Security Event', {
      event,
      ...details,
      timestamp: new Date().toISOString()
    });
  }

  /**
   * Write a performance entry; durations above 1000 ms are logged as 'warn'.
   * @param {string} operation - Operation name.
   * @param {number} duration - Duration in milliseconds.
   * @param {object} [details={}] - Extra fields for the entry.
   */
  logPerformance(operation, duration, details = {}) {
    const level = duration > 1000 ? 'warn' : 'info';

    this.performanceLogger.log(level, 'Performance Metric', {
      operation,
      duration: `${duration}ms`,
      ...details
    });
  }

  /**
   * Write an audit entry.
   * @param {string} action - Action name.
   * @param {object} [details={}] - Extra fields for the entry.
   */
  logAudit(action, details = {}) {
    this.auditLogger.info('Audit Event', {
      action,
      ...details,
      timestamp: new Date().toISOString()
    });
  }

  /**
   * Create a lightweight logger that stamps every entry with a module name
   * and optional fixed metadata, delegating to the app logger methods.
   *
   * @param {string} module - Module name added to each entry.
   * @param {object} [additionalMeta={}] - Fields added to each entry.
   * @returns {{info: Function, error: Function, warn: Function, debug: Function}}
   */
  createChildLogger(module, additionalMeta = {}) {
    const withContext = (meta) => ({ module, ...additionalMeta, ...meta });

    return {
      info: (message, meta = {}) => {
        this.info(message, withContext(meta));
      },
      error: (message, error = null, meta = {}) => {
        this.error(message, error, withContext(meta));
      },
      warn: (message, meta = {}) => {
        this.warn(message, withContext(meta));
      },
      debug: (message, meta = {}) => {
        this.debug(message, withContext(meta));
      }
    };
  }

  /**
   * Rename every `.log` file larger than `maxFileSize` to
   * `<file>.<ISO timestamp>`.
   *
   * NOTE(review): the file transports are already configured with `maxsize`,
   * and this renames files those transports may still have open — confirm
   * that manual rotation is needed in addition.
   *
   * @returns {Promise<void>}
   */
  async rotateLogs() {
    const logFiles = fs.readdirSync(this.config.logDir)
      .filter((file) => file.endsWith('.log'))
      .map((file) => path.join(this.config.logDir, file));

    for (const logFile of logFiles) {
      const stats = fs.statSync(logFile);

      if (stats.size > this.config.maxFileSize) {
        const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
        const rotatedFile = `${logFile}.${timestamp}`;

        fs.renameSync(logFile, rotatedFile);
        this.info('Log file rotated', {
          originalFile: logFile,
          rotatedFile,
          size: stats.size
        });
      }
    }
  }

  /**
   * Delete log files whose last modification is older than `maxAge`.
   *
   * Only regular files named `*.log` or `*.log.<suffix>` (the shape produced
   * by `rotateLogs`) are considered, so unrelated files and subdirectories in
   * the log directory are left alone.
   *
   * @param {number} [maxAge=2592000000] - Maximum age in milliseconds (default 30 days).
   * @returns {Promise<void>}
   */
  async cleanupOldLogs(maxAge = 30 * 24 * 60 * 60 * 1000) {
    const now = Date.now();

    const expiredFiles = fs.readdirSync(this.config.logDir)
      .filter((file) => /\.log(\.|$)/.test(file))
      .map((file) => path.join(this.config.logDir, file))
      .filter((file) => {
        const stats = fs.statSync(file);
        return stats.isFile() && now - stats.mtime.getTime() > maxAge;
      });

    for (const logFile of expiredFiles) {
      fs.unlinkSync(logFile);
      this.info('Old log file deleted', { file: logFile });
    }
  }
}

/**
 * Build an Express middleware that writes an access-log entry once the
 * response has finished, including the elapsed handling time in milliseconds.
 *
 * @param {object} logManager - Object exposing `logAccess(req, res, responseTime)`.
 * @returns {(req: object, res: object, next: Function) => void} Middleware.
 */
const createLoggingMiddleware = (logManager) => (req, res, next) => {
  const requestStartedAt = Date.now();

  const writeAccessEntry = () => {
    logManager.logAccess(req, res, Date.now() - requestStartedAt);
  };

  res.on('finish', writeAccessEntry);
  next();
};

/**
 * Build an Express error-handling middleware that logs the error together
 * with request context, then forwards the error to the next handler.
 *
 * @param {object} logManager - Object exposing `error(message, error, meta)`.
 * @returns {(error: Error, req: object, res: object, next: Function) => void}
 *   Four-argument (error-handling) middleware.
 */
const createErrorLoggingMiddleware = (logManager) => (error, req, res, next) => {
  const { method, url, ip } = req;
  const requestContext = {
    method,
    url,
    ip,
    userAgent: req.get('User-Agent'),
    userId: req.user?.id
  };

  logManager.error('HTTP Error', error, requestContext);

  // Logging only: the error is passed on untouched.
  next(error);
};

// 使用示例
const logManager = new LogManager({
  level: 'debug',
  logDir: path.join(__dirname, '../logs')
});

// 创建模块特定的日志器
const dbLogger = logManager.createChildLogger('database');
const authLogger = logManager.createChildLogger('authentication');

// 在 Express 应用中使用
const express = require('express');
const app = express();

app.use(createLoggingMiddleware(logManager));
app.use(createErrorLoggingMiddleware(logManager));

// 定期清理日志
setInterval(() => {
  logManager.rotateLogs();
  logManager.cleanupOldLogs();
}, 24 * 60 * 60 * 1000); // 每天执行一次

module.exports = {
  LogManager,
  createLoggingMiddleware,
  createErrorLoggingMiddleware
};

2. 日志聚合和分析

javascript
// 日志聚合和分析工具
const fs = require('fs');
const readline = require('readline');
const path = require('path');

/**
 * Offline analysis and live monitoring of JSON-lines log files
 * (`access.log`, `app-error.log`, `performance.log`) in one directory.
 */
class LogAnalyzer {
  /**
   * @param {string} logDir - Directory containing the log files.
   */
  constructor(logDir) {
    this.logDir = logDir;
  }

  /**
   * Stream a log file and yield each parseable entry at or after `cutoffTime`.
   *
   * Lines that are not JSON objects, or whose `timestamp` cannot be parsed,
   * are skipped. The underlying stream is released even when the consumer
   * stops early or throws.
   *
   * @param {string} logFile - Path of the file to read.
   * @param {number} cutoffTime - Epoch milliseconds; older entries are skipped.
   * @yields {{logEntry: object, timestamp: number}}
   */
  async *readEntriesSince(logFile, cutoffTime) {
    const fileStream = fs.createReadStream(logFile);
    const rl = readline.createInterface({
      input: fileStream,
      crlfDelay: Infinity
    });

    try {
      for await (const line of rl) {
        let logEntry;
        try {
          logEntry = JSON.parse(line);
        } catch {
          // Not JSON: ignore the line.
          continue;
        }

        if (logEntry === null || typeof logEntry !== 'object') continue;

        const timestamp = new Date(logEntry.timestamp).getTime();
        if (Number.isNaN(timestamp) || timestamp < cutoffTime) continue;

        yield { logEntry, timestamp };
      }
    } finally {
      rl.close();
      fileStream.destroy();
    }
  }

  /**
   * Nearest-rank percentile of an ascending array.
   *
   * @param {number[]} sortedValues - Values sorted ascending (non-empty).
   * @param {number} p - Fraction in [0, 1].
   * @returns {number} The selected element.
   */
  percentile(sortedValues, p) {
    const index = Math.ceil(sortedValues.length * p) - 1;
    return sortedValues[Math.max(0, index)];
  }

  /**
   * Summarize `access.log` over the trailing time window.
   *
   * @param {number} [timeRange=86400000] - Window length in ms (default 24 hours).
   * @returns {Promise<object>} Request totals, per-status/method counts, top
   *   URLs/IPs/user agents, sorted response times with summary statistics
   *   (`responseTime`, present only when samples exist) and the list of
   *   requests with status >= 400.
   */
  async analyzeAccessLogs(timeRange = 24 * 60 * 60 * 1000) {
    const logFile = path.join(this.logDir, 'access.log');
    const cutoffTime = Date.now() - timeRange;

    const stats = {
      totalRequests: 0,
      statusCodes: {},
      methods: {},
      topUrls: {},
      topIPs: {},
      topUserAgents: {},
      responseTimeStats: [],
      errorRequests: []
    };

    if (!fs.existsSync(logFile)) {
      return stats;
    }

    const bump = (bucket, key) => {
      bucket[key] = (bucket[key] || 0) + 1;
    };

    for await (const { logEntry } of this.readEntriesSince(logFile, cutoffTime)) {
      stats.totalRequests++;

      const { statusCode, method, url, ip, userAgent } = logEntry;
      bump(stats.statusCodes, statusCode);
      bump(stats.methods, method);
      bump(stats.topUrls, url);
      bump(stats.topIPs, ip);
      if (userAgent) {
        bump(stats.topUserAgents, userAgent);
      }

      // Response times are logged like "12ms"; parseInt drops the unit.
      const responseTime = Number.parseInt(logEntry.responseTime, 10);
      if (!Number.isNaN(responseTime)) {
        stats.responseTimeStats.push(responseTime);
      }

      if (statusCode >= 400) {
        stats.errorRequests.push({
          timestamp: logEntry.timestamp,
          method,
          url,
          statusCode,
          ip,
          userAgent
        });
      }
    }

    if (stats.responseTimeStats.length > 0) {
      stats.responseTimeStats.sort((a, b) => a - b);
      const sorted = stats.responseTimeStats;
      const count = sorted.length;

      stats.responseTime = {
        min: sorted[0],
        max: sorted[count - 1],
        mean: sorted.reduce((a, b) => a + b, 0) / count,
        median: this.percentile(sorted, 0.5),
        p95: this.percentile(sorted, 0.95),
        p99: this.percentile(sorted, 0.99)
      };
    }

    // Replace the count maps with sorted top-N arrays.
    stats.topUrls = this.sortObject(stats.topUrls, 10);
    stats.topIPs = this.sortObject(stats.topIPs, 10);
    stats.topUserAgents = this.sortObject(stats.topUserAgents, 5);

    return stats;
  }

  /**
   * Summarize `app-error.log` over the trailing time window.
   *
   * Error details are read from the entry's top-level `error`/`module` fields
   * and, as a fallback, from a nested `meta` object, so both the flattened
   * and the nested entry layouts are understood.
   *
   * @param {number} [timeRange=86400000] - Window length in ms (default 24 hours).
   * @returns {Promise<object>} Totals, top error codes/messages, counts by
   *   local hour of day, and error-level entries that carry a stack trace.
   */
  async analyzeErrorLogs(timeRange = 24 * 60 * 60 * 1000) {
    const logFile = path.join(this.logDir, 'app-error.log');
    const cutoffTime = Date.now() - timeRange;

    const stats = {
      totalErrors: 0,
      errorTypes: {},
      errorMessages: {},
      errorsByHour: {},
      criticalErrors: []
    };

    if (!fs.existsSync(logFile)) {
      return stats;
    }

    for await (const { logEntry, timestamp } of this.readEntriesSince(logFile, cutoffTime)) {
      stats.totalErrors++;

      // Group by hour of day (local time).
      const hour = new Date(timestamp).getHours();
      stats.errorsByHour[hour] = (stats.errorsByHour[hour] || 0) + 1;

      const errorInfo = logEntry.error ?? logEntry.meta?.error;

      const errorType = errorInfo?.code || 'UNKNOWN';
      stats.errorTypes[errorType] = (stats.errorTypes[errorType] || 0) + 1;

      const errorMessage = errorInfo?.message || logEntry.message;
      stats.errorMessages[errorMessage] = (stats.errorMessages[errorMessage] || 0) + 1;

      // Error-level entries with a stack trace are reported individually.
      if (logEntry.level === 'error' && errorInfo?.stack) {
        stats.criticalErrors.push({
          timestamp: logEntry.timestamp,
          message: errorMessage,
          stack: errorInfo.stack,
          module: logEntry.module ?? logEntry.meta?.module
        });
      }
    }

    stats.errorTypes = this.sortObject(stats.errorTypes, 10);
    stats.errorMessages = this.sortObject(stats.errorMessages, 10);

    return stats;
  }

  /**
   * Summarize `performance.log` over the trailing time window.
   *
   * @param {number} [timeRange=86400000] - Window length in ms (default 24 hours).
   * @returns {Promise<object>} Operation totals, per-operation statistics
   *   (count, total/min/max/average duration, p95, p99), operations slower
   *   than 1000 ms, and the overall average duration.
   */
  async analyzePerformanceLogs(timeRange = 24 * 60 * 60 * 1000) {
    const logFile = path.join(this.logDir, 'performance.log');
    const cutoffTime = Date.now() - timeRange;

    const stats = {
      totalOperations: 0,
      operationStats: {},
      slowOperations: [],
      averageResponseTime: 0
    };

    if (!fs.existsSync(logFile)) {
      return stats;
    }

    const allDurations = [];

    for await (const { logEntry } of this.readEntriesSince(logFile, cutoffTime)) {
      stats.totalOperations++;

      // Everything besides the standard fields counts as operation details.
      const {
        timestamp, level, logger, message,
        operation, duration: rawDuration, meta,
        ...extraFields
      } = logEntry;

      // Durations are logged like "250ms"; parseInt drops the unit.
      const duration = Number.parseInt(rawDuration, 10);
      if (Number.isNaN(duration)) continue;

      allDurations.push(duration);

      if (!stats.operationStats[operation]) {
        stats.operationStats[operation] = {
          count: 0,
          totalDuration: 0,
          minDuration: Infinity,
          maxDuration: 0,
          durations: []
        };
      }

      const opStats = stats.operationStats[operation];
      opStats.count++;
      opStats.totalDuration += duration;
      opStats.minDuration = Math.min(opStats.minDuration, duration);
      opStats.maxDuration = Math.max(opStats.maxDuration, duration);
      opStats.durations.push(duration);

      // Slow operation: longer than one second.
      if (duration > 1000) {
        stats.slowOperations.push({
          timestamp,
          operation,
          duration,
          details: meta ?? extraFields
        });
      }
    }

    if (allDurations.length > 0) {
      stats.averageResponseTime = allDurations.reduce((a, b) => a + b, 0) / allDurations.length;
    }

    for (const opStats of Object.values(stats.operationStats)) {
      opStats.averageDuration = opStats.totalDuration / opStats.count;

      opStats.durations.sort((a, b) => a - b);
      opStats.p95 = this.percentile(opStats.durations, 0.95);
      opStats.p99 = this.percentile(opStats.durations, 0.99);

      // The raw samples are not part of the report.
      delete opStats.durations;
    }

    return stats;
  }

  /**
   * Run all three analyses concurrently and combine them with a summary.
   *
   * @param {number} [timeRange=86400000] - Window length in ms (default 24 hours).
   * @returns {Promise<object>} `{ timestamp, timeRange, access, errors, performance, summary }`.
   */
  async generateReport(timeRange = 24 * 60 * 60 * 1000) {
    const [accessStats, errorStats, performanceStats] = await Promise.all([
      this.analyzeAccessLogs(timeRange),
      this.analyzeErrorLogs(timeRange),
      this.analyzePerformanceLogs(timeRange)
    ]);

    return {
      timestamp: new Date().toISOString(),
      timeRange: `${timeRange / (60 * 60 * 1000)} hours`,
      access: accessStats,
      errors: errorStats,
      performance: performanceStats,
      summary: {
        totalRequests: accessStats.totalRequests,
        totalErrors: errorStats.totalErrors,
        errorRate: accessStats.totalRequests > 0
          ? (errorStats.totalErrors / accessStats.totalRequests * 100).toFixed(2) + '%'
          : '0%',
        averageResponseTime: performanceStats.averageResponseTime
          ? `${performanceStats.averageResponseTime.toFixed(2)}ms`
          : 'N/A'
      }
    };
  }

  /**
   * Turn a `{ key: count }` map into an array of `{ key, value }` sorted by
   * descending count and truncated to `limit` items.
   *
   * @param {object} obj - Count map.
   * @param {number} [limit=10] - Maximum number of items.
   * @returns {Array<{key: string, value: number}>}
   */
  sortObject(obj, limit = 10) {
    return Object.entries(obj)
      .sort(([, a], [, b]) => b - a)
      .slice(0, limit)
      .map(([key, value]) => ({ key, value }));
  }

  /**
   * Generate a report and write it as pretty-printed JSON, creating the
   * target directory when needed.
   *
   * @param {string} outputPath - Destination file.
   * @param {number} [timeRange=86400000] - Window length in ms (default 24 hours).
   * @returns {Promise<object>} The generated report.
   */
  async exportReport(outputPath, timeRange = 24 * 60 * 60 * 1000) {
    const report = await this.generateReport(timeRange);
    fs.mkdirSync(path.dirname(outputPath), { recursive: true });
    fs.writeFileSync(outputPath, JSON.stringify(report, null, 2));
    return report;
  }

  /**
   * Watch `app-error.log` and `access.log` (when they exist) and invoke
   * `callback(logEntry, fileName)` for every entry appended after monitoring
   * starts. Each entry is delivered once; content already in the files is
   * not replayed.
   *
   * @param {(logEntry: object, fileName: string) => void} callback - Entry handler.
   * @returns {() => void} Function that stops all watchers.
   */
  startRealTimeMonitoring(callback) {
    const logFiles = [
      path.join(this.logDir, 'app-error.log'),
      path.join(this.logDir, 'access.log')
    ];

    const watchers = [];

    for (const logFile of logFiles) {
      if (!fs.existsSync(logFile)) continue;

      // Start at the current end of file; `pending` holds a trailing partial line.
      const state = { offset: fs.statSync(logFile).size, pending: Buffer.alloc(0) };

      watchers.push(fs.watch(logFile, (eventType) => {
        if (eventType === 'change') {
          this.readAppendedLogEntries(logFile, state, callback);
        }
      }));
    }

    return () => {
      watchers.forEach((watcher) => watcher.close());
    };
  }

  /**
   * Read the bytes appended to `logFile` since `state.offset`, advance the
   * offset, and pass each complete JSON line to `callback`.
   *
   * @param {string} logFile - Path of the watched file.
   * @param {{offset: number, pending: Buffer}} state - Per-file read position, updated in place.
   * @param {(logEntry: object, fileName: string) => void} callback - Entry handler.
   */
  readAppendedLogEntries(logFile, state, callback) {
    let fd;

    try {
      fd = fs.openSync(logFile, 'r');
      const { size } = fs.fstatSync(fd);

      // A shrinking file was truncated or replaced: start over from the top.
      if (size < state.offset) {
        state.offset = 0;
        state.pending = Buffer.alloc(0);
      }
      if (size === state.offset) return;

      const chunk = Buffer.alloc(size - state.offset);
      const bytesRead = fs.readSync(fd, chunk, 0, chunk.length, state.offset);
      state.offset += bytesRead;

      // Split on bytes, not characters, so a multi-byte character cut in half
      // by the read boundary stays intact in `pending`.
      const data = Buffer.concat([state.pending, chunk.subarray(0, bytesRead)]);
      const lastNewline = data.lastIndexOf(0x0a);
      if (lastNewline === -1) {
        state.pending = data;
        return;
      }
      state.pending = Buffer.from(data.subarray(lastNewline + 1));

      const fileName = path.basename(logFile);
      for (const line of data.subarray(0, lastNewline).toString('utf8').split('\n')) {
        if (!line.trim()) continue;

        let logEntry;
        try {
          logEntry = JSON.parse(line);
        } catch {
          // Not JSON: ignore the line.
          continue;
        }

        try {
          callback(logEntry, fileName);
        } catch (error) {
          console.error('Error in log monitoring callback:', error.message);
        }
      }
    } catch (error) {
      console.error('Error reading log file:', error.message);
    } finally {
      if (fd !== undefined) {
        fs.closeSync(fd);
      }
    }
  }

  /**
   * Read the last `lines` lines of a log file and pass each parseable entry
   * to `callback(logEntry, fileName)`. Unparseable lines are ignored; read
   * failures are reported on the console.
   *
   * @param {string} logFile - Path of the file to read.
   * @param {(logEntry: object, fileName: string) => void} callback - Entry handler.
   * @param {number} [lines=10] - Number of trailing lines to read.
   * @returns {Promise<void>}
   */
  async readLatestLogEntries(logFile, callback, lines = 10) {
    try {
      const data = fs.readFileSync(logFile, 'utf8');
      const logLines = data.trim().split('\n').slice(-lines);

      for (const line of logLines) {
        try {
          const logEntry = JSON.parse(line);
          callback(logEntry, path.basename(logFile));
        } catch (error) {
          // Ignore lines that cannot be parsed.
        }
      }
    } catch (error) {
      console.error('Error reading log file:', error.message);
    }
  }
}

// 使用示例
const logAnalyzer = new LogAnalyzer(path.join(__dirname, '../logs'));

// 生成日报
const generateDailyReport = async () => {
  try {
    const report = await logAnalyzer.generateReport(24 * 60 * 60 * 1000);
    const outputPath = path.join(__dirname, '../reports', `daily-report-${new Date().toISOString().split('T')[0]}.json`);
    
    await logAnalyzer.exportReport(outputPath);
    console.log('Daily report generated:', outputPath);
    
    return report;
  } catch (error) {
    console.error('Failed to generate daily report:', error.message);
  }
};

// 启动实时监控
const stopMonitoring = logAnalyzer.startRealTimeMonitoring((logEntry, logFile) => {
  if (logEntry.level === 'error') {
    console.warn(`[ALERT] Error detected in ${logFile}:`, logEntry.message);
    
    // 可以在这里发送告警通知
    // sendAlert(logEntry);
  }
});

// 定期生成报告
setInterval(generateDailyReport, 24 * 60 * 60 * 1000); // 每天生成一次

module.exports = {
  LogAnalyzer
};

告警系统

1. 告警规则和通知

javascript
// 告警系统
const EventEmitter = require('events');
const nodemailer = require('nodemailer');
const axios = require('axios');

/**
 * Evaluates alert rules against metric snapshots and dispatches notifications
 * by email, Slack and/or a generic webhook.
 *
 * Extends EventEmitter: an `'alert'` event carrying the alert object is
 * emitted every time a rule fires, so other components can subscribe with
 * `alertManager.on('alert', listener)`.
 */
class AlertManager extends EventEmitter {
  /**
   * @param {object} [config] - Partial configuration. It is merged per
   *   channel (and per nested `email.smtp` / `email.smtp.auth` object) on top
   *   of the environment-derived defaults, so passing only
   *   `{ email: { enabled: true } }` keeps the default SMTP settings.
   */
  constructor(config = {}) {
    super();

    const overrides = config ?? {};
    const defaults = {
      email: {
        enabled: false,
        smtp: {
          host: process.env.SMTP_HOST,
          port: process.env.SMTP_PORT || 587,
          secure: false,
          auth: {
            user: process.env.SMTP_USER,
            pass: process.env.SMTP_PASS
          }
        },
        from: process.env.ALERT_FROM_EMAIL,
        to: process.env.ALERT_TO_EMAIL?.split(',') || []
      },
      slack: {
        enabled: false,
        webhookUrl: process.env.SLACK_WEBHOOK_URL
      },
      webhook: {
        enabled: false,
        url: process.env.ALERT_WEBHOOK_URL
      }
    };

    // A plain `{ ...defaults, ...config }` would replace whole channel objects
    // and silently drop defaults such as `email.smtp`.
    this.config = {
      ...defaults,
      ...overrides,
      email: {
        ...defaults.email,
        ...overrides.email,
        smtp: {
          ...defaults.email.smtp,
          ...overrides.email?.smtp,
          auth: {
            ...defaults.email.smtp.auth,
            ...overrides.email?.smtp?.auth
          }
        }
      },
      slack: { ...defaults.slack, ...overrides.slack },
      webhook: { ...defaults.webhook, ...overrides.webhook }
    };

    this.rules = new Map();
    this.alertHistory = [];
    this.setupDefaultRules();
    this.setupEmailTransporter();
  }

  /**
   * Create the nodemailer transport when email alerts are enabled.
   */
  setupEmailTransporter() {
    if (this.config.email.enabled) {
      // nodemailer's factory is `createTransport`; `createTransporter` does not exist.
      this.emailTransporter = nodemailer.createTransport(this.config.email.smtp);
    }
  }

  /**
   * Register the built-in rules: error rate, heap usage, response time,
   * event loop delay and database connection pool usage.
   */
  setupDefaultRules() {
    // High error rate
    this.addRule('high_error_rate', {
      condition: (metrics) => {
        const totalRequests = metrics.counters['http.requests.total'] || 0;
        const errorRequests = metrics.counters['http.responses.5xx'] || 0;

        // Only evaluated once more than 100 requests have been counted.
        if (totalRequests > 100) {
          const errorRate = (errorRequests / totalRequests) * 100;
          return errorRate > 5; // more than 5% of requests failed
        }

        return false;
      },
      severity: 'high',
      message: 'High error rate detected',
      cooldown: 5 * 60 * 1000 // 5 minute cooldown
    });

    // High memory usage
    this.addRule('high_memory_usage', {
      condition: (metrics) => {
        const heapUsed = metrics.gauges['memory.heap_used']?.value || 0;
        // Falls back to 1 so a missing gauge cannot cause a division by zero.
        const heapTotal = metrics.gauges['memory.heap_total']?.value || 1;

        const memoryUsage = (heapUsed / heapTotal) * 100;
        return memoryUsage > 90; // heap usage above 90%
      },
      severity: 'medium',
      message: 'High memory usage detected',
      cooldown: 10 * 60 * 1000 // 10 minute cooldown
    });

    // High response time
    this.addRule('high_response_time', {
      condition: (metrics) => {
        const responseTimeStats = metrics.histograms['http.request.duration'];
        if (responseTimeStats && responseTimeStats.p95) {
          return responseTimeStats.p95 > 2000; // P95 above 2 seconds
        }
        return false;
      },
      severity: 'medium',
      message: 'High response time detected',
      cooldown: 5 * 60 * 1000
    });

    // Event loop delay
    this.addRule('high_eventloop_delay', {
      condition: (metrics) => {
        const eventLoopDelay = metrics.gauges['eventloop.delay']?.value || 0;
        return eventLoopDelay > 100; // delay above 100ms
      },
      severity: 'high',
      message: 'High event loop delay detected',
      cooldown: 2 * 60 * 1000
    });

    // Database connection pool
    this.addRule('database_connection_issues', {
      condition: (metrics) => {
        const activeConnections = metrics.gauges['db.connections.active']?.value || 0;
        const totalConnections = metrics.gauges['db.connections.total']?.value || 1;

        const connectionUsage = (activeConnections / totalConnections) * 100;
        return connectionUsage > 95; // pool usage above 95%
      },
      severity: 'high',
      message: 'Database connection pool nearly exhausted',
      cooldown: 5 * 60 * 1000
    });
  }

  /**
   * Add or replace an alert rule.
   *
   * @param {string} name - Unique rule name.
   * @param {{condition: Function, severity: string, message: string, cooldown: number}} rule -
   *   `condition(metrics)` returns truthy when the alert should fire;
   *   `cooldown` is the minimum number of milliseconds between two firings.
   */
  addRule(name, rule) {
    this.rules.set(name, {
      ...rule,
      lastTriggered: 0
    });
  }

  /**
   * Remove an alert rule by name.
   *
   * @param {string} name - Name of the rule to delete.
   */
  removeRule(name) {
    this.rules.delete(name);
  }

  /**
   * Evaluate every rule against a metrics snapshot.
   *
   * For each rule that fires (and is not in its cooldown window) the alert is
   * recorded in the history, emitted as an `'alert'` event and sent through
   * the enabled channels. An error in one rule is logged and does not stop
   * the remaining rules from being evaluated.
   *
   * @param {{counters: object, gauges: object, histograms: object}} metrics -
   *   Metrics snapshot read by the rule conditions.
   * @returns {Promise<object[]>} Alerts triggered by this call.
   */
  async checkAlerts(metrics) {
    const currentTime = Date.now();
    const triggeredAlerts = [];

    for (const [name, rule] of this.rules) {
      try {
        // Skip rules that are still cooling down.
        if (currentTime - rule.lastTriggered < rule.cooldown) {
          continue;
        }

        if (rule.condition(metrics)) {
          rule.lastTriggered = currentTime;

          const alert = {
            name,
            severity: rule.severity,
            message: rule.message,
            timestamp: new Date().toISOString(),
            metrics: this.extractRelevantMetrics(metrics, name)
          };

          triggeredAlerts.push(alert);
          this.alertHistory.push(alert);

          // A failing listener must not prevent the notifications from going out.
          try {
            this.emit('alert', alert);
          } catch (error) {
            console.error(`Alert listener failed for '${name}':`, error.message);
          }

          await this.sendAlert(alert);
        }
      } catch (error) {
        console.error(`Error checking alert rule '${name}':`, error.message);
      }
    }

    // Bound the in-memory history: past 1000 entries keep the newest 500.
    if (this.alertHistory.length > 1000) {
      this.alertHistory = this.alertHistory.slice(-500);
    }

    return triggeredAlerts;
  }

  /**
   * Pick the metric values that belong to a built-in rule.
   *
   * @param {object} metrics - Metrics snapshot.
   * @param {string} ruleName - Name of the rule that fired.
   * @returns {object} Metrics relevant to the rule; empty for unknown rule names.
   */
  extractRelevantMetrics(metrics, ruleName) {
    const relevantMetrics = {};

    switch (ruleName) {
      case 'high_error_rate':
        relevantMetrics.totalRequests = metrics.counters['http.requests.total'];
        relevantMetrics.errorRequests = metrics.counters['http.responses.5xx'];
        break;

      case 'high_memory_usage':
        relevantMetrics.heapUsed = metrics.gauges['memory.heap_used'];
        relevantMetrics.heapTotal = metrics.gauges['memory.heap_total'];
        break;

      case 'high_response_time':
        relevantMetrics.responseTime = metrics.histograms['http.request.duration'];
        break;

      case 'high_eventloop_delay':
        relevantMetrics.eventLoopDelay = metrics.gauges['eventloop.delay'];
        break;

      case 'database_connection_issues':
        relevantMetrics.activeConnections = metrics.gauges['db.connections.active'];
        relevantMetrics.totalConnections = metrics.gauges['db.connections.total'];
        break;
    }

    return relevantMetrics;
  }

  /**
   * Send an alert through every enabled channel in parallel.
   *
   * Never rejects: a failing channel is logged and does not affect the others.
   *
   * @param {object} alert - Alert produced by `checkAlerts`.
   * @returns {Promise<void>}
   */
  async sendAlert(alert) {
    const deliveries = [];

    if (this.config.email.enabled) {
      deliveries.push({ channel: 'email', promise: this.sendEmailAlert(alert) });
    }

    if (this.config.slack.enabled) {
      deliveries.push({ channel: 'slack', promise: this.sendSlackAlert(alert) });
    }

    if (this.config.webhook.enabled) {
      deliveries.push({ channel: 'webhook', promise: this.sendWebhookAlert(alert) });
    }

    const outcomes = await Promise.allSettled(deliveries.map(({ promise }) => promise));

    outcomes.forEach((outcome, index) => {
      if (outcome.status === 'rejected') {
        console.error(
          `Failed to send ${deliveries[index].channel} alert '${alert.name}':`,
          outcome.reason?.message ?? outcome.reason
        );
      }
    });
  }

  /**
   * Send the alert as an HTML email.
   *
   * @param {object} alert - Alert to send.
   * @throws {Error} When no email transporter has been configured.
   */
  async sendEmailAlert(alert) {
    if (!this.emailTransporter) {
      throw new Error('Email transporter not configured');
    }

    const subject = `[${alert.severity.toUpperCase()}] ${alert.message}`;
    const html = this.generateEmailTemplate(alert);

    const mailOptions = {
      from: this.config.email.from,
      to: this.config.email.to,
      subject,
      html
    };

    await this.emailTransporter.sendMail(mailOptions);
  }

  /**
   * Post the alert to the configured Slack webhook as a colored attachment.
   *
   * @param {object} alert - Alert to send.
   */
  async sendSlackAlert(alert) {
    // Unknown severities fall back to grey.
    const color = {
      low: '#36a64f',
      medium: '#ff9500',
      high: '#ff0000'
    }[alert.severity] || '#808080';

    const payload = {
      attachments: [{
        color,
        title: `${alert.severity.toUpperCase()} Alert: ${alert.message}`,
        fields: [
          {
            title: 'Alert Name',
            value: alert.name,
            short: true
          },
          {
            title: 'Timestamp',
            value: alert.timestamp,
            short: true
          },
          {
            title: 'Metrics',
            value: JSON.stringify(alert.metrics, null, 2),
            short: false
          }
        ],
        footer: 'Node.js Monitoring System',
        ts: Math.floor(Date.now() / 1000)
      }]
    };

    await axios.post(this.config.slack.webhookUrl, payload);
  }

  /**
   * Post the raw alert object as JSON to the configured webhook URL.
   *
   * @param {object} alert - Alert to send.
   */
  async sendWebhookAlert(alert) {
    await axios.post(this.config.webhook.url, alert, {
      headers: {
        'Content-Type': 'application/json'
      }
    });
  }

  /**
   * Escape the characters that are significant in HTML text and attributes.
   *
   * @param {*} value - Value to embed in HTML; converted with `String()`.
   * @returns {string} Escaped text.
   */
  escapeHtml(value) {
    return String(value)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');
  }

  /**
   * Render the HTML body of an alert email.
   *
   * Alert text and metrics are HTML-escaped because custom rules may carry
   * arbitrary names and messages.
   *
   * @param {object} alert - Alert to render.
   * @returns {string} Complete HTML document.
   */
  generateEmailTemplate(alert) {
    const severity = this.escapeHtml(alert.severity);
    const message = this.escapeHtml(alert.message);

    return `
      <!DOCTYPE html>
      <html>
      <head>
        <style>
          body { font-family: Arial, sans-serif; margin: 0; padding: 20px; }
          .alert-header { background-color: ${alert.severity === 'high' ? '#ff4444' : alert.severity === 'medium' ? '#ff9500' : '#36a64f'}; color: white; padding: 15px; border-radius: 5px; }
          .alert-content { background-color: #f9f9f9; padding: 15px; border-radius: 5px; margin-top: 10px; }
          .metrics { background-color: #e9e9e9; padding: 10px; border-radius: 3px; font-family: monospace; }
        </style>
      </head>
      <body>
        <div class="alert-header">
          <h2>${this.escapeHtml(alert.severity.toUpperCase())} Alert: ${message}</h2>
        </div>
        <div class="alert-content">
          <p><strong>Alert Name:</strong> ${this.escapeHtml(alert.name)}</p>
          <p><strong>Timestamp:</strong> ${this.escapeHtml(alert.timestamp)}</p>
          <p><strong>Severity:</strong> ${severity}</p>
          <h3>Relevant Metrics:</h3>
          <div class="metrics">
            <pre>${this.escapeHtml(JSON.stringify(alert.metrics, null, 2))}</pre>
          </div>
        </div>
      </body>
      </html>
    `;
  }

  /**
   * Return the most recent alerts, oldest first.
   *
   * @param {number} [limit=50] - Maximum number of alerts to return.
   * @returns {object[]} Up to `limit` alerts; empty when `limit` is not positive.
   */
  getAlertHistory(limit = 50) {
    // slice(-0) would return the whole history instead of nothing.
    return limit > 0 ? this.alertHistory.slice(-limit) : [];
  }

  /**
   * Aggregate the alerts raised within a recent time window.
   *
   * @param {number} [timeRange=86400000] - Look-back window in milliseconds.
   * @returns {{total: number, bySeverity: object, byRule: object, timeline: object[]}}
   *   Counts per severity and per rule, plus a chronological timeline.
   */
  getAlertStats(timeRange = 24 * 60 * 60 * 1000) {
    const cutoffTime = Date.now() - timeRange;
    const recentAlerts = this.alertHistory.filter(alert =>
      new Date(alert.timestamp).getTime() > cutoffTime
    );

    const stats = {
      total: recentAlerts.length,
      bySeverity: {},
      byRule: {},
      timeline: []
    };

    recentAlerts.forEach(alert => {
      // Count per severity.
      stats.bySeverity[alert.severity] = (stats.bySeverity[alert.severity] || 0) + 1;

      // Count per rule.
      stats.byRule[alert.name] = (stats.byRule[alert.name] || 0) + 1;

      // Timeline entry.
      stats.timeline.push({
        timestamp: alert.timestamp,
        name: alert.name,
        severity: alert.severity
      });
    });

    return stats;
  }
}

// Usage example: enable the email and Slack channels.
const alertManager = new AlertManager({
  email: {
    enabled: true,
    from: 'alerts@example.com',
    to: ['admin@example.com', 'dev@example.com']
  },
  slack: {
    enabled: true,
    webhookUrl: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
  }
});

// Custom rule: alert when the order counter is below 10.
// The counter is assumed to already represent the orders of the last hour.
alertManager.addRule('custom_business_metric', {
  severity: 'medium',
  message: 'Low order volume detected',
  cooldown: 30 * 60 * 1000, // 30 minute cooldown
  condition: ({ counters }) => {
    const lastHourOrders = counters['business.orders.total'] || 0;
    return lastHourOrders < 10;
  }
});

module.exports = { AlertManager };

2. 监控仪表板

javascript
// 监控仪表板 API
const express = require('express');
const path = require('path');

/**
 * Express + socket.io application that exposes metrics, log analysis, alerts
 * and a health summary, together with a single-page HTML dashboard.
 */
class MonitoringDashboard {
  /**
   * @param {object} metricsCollector - Must provide `getAllMetrics()`.
   * @param {object} logAnalyzer - Must provide `generateReport(timeRange)`.
   * @param {object} alertManager - Must provide `getAlertHistory(limit)` and
   *   `getAlertStats(timeRange)`; if it also exposes `on()`, its `'alert'`
   *   events are forwarded to connected dashboard clients.
   */
  constructor(metricsCollector, logAnalyzer, alertManager) {
    this.metricsCollector = metricsCollector;
    this.logAnalyzer = logAnalyzer;
    this.alertManager = alertManager;
    this.app = express();

    this.setupRoutes();
    this.setupWebSocket();
  }

  /**
   * Parse a query-string value as a strictly positive base-10 integer.
   *
   * @param {*} value - Raw query value.
   * @param {number} fallback - Returned when the value is missing, not a number, or <= 0.
   * @returns {number} Parsed integer or `fallback`.
   */
  parsePositiveInt(value, fallback) {
    const parsed = Number.parseInt(value, 10);
    return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback;
  }

  /**
   * Wrap an async route handler so a rejected promise reaches Express'
   * error handling through `next(err)` instead of leaving the request hanging.
   *
   * @param {Function} handler - Async `(req, res, next)` handler.
   * @returns {Function} Express-compatible handler.
   */
  asyncRoute(handler) {
    return (req, res, next) => {
      Promise.resolve(handler(req, res, next)).catch(next);
    };
  }

  /**
   * Register the static assets, the dashboard page and the JSON API routes.
   */
  setupRoutes() {
    const ONE_DAY_MS = 24 * 60 * 60 * 1000;

    // Static assets
    this.app.use('/static', express.static(path.join(__dirname, 'dashboard-static')));

    // Main dashboard page
    this.app.get('/', (req, res) => {
      res.send(this.generateDashboardHTML());
    });

    // Current metrics
    this.app.get('/api/metrics', (req, res) => {
      const metrics = this.metricsCollector.getAllMetrics();
      res.json(metrics);
    });

    // Metrics history
    this.app.get('/api/metrics/history', this.asyncRoute(async (req, res) => {
      const timeRange = this.parsePositiveInt(req.query.timeRange, ONE_DAY_MS);
      const interval = this.parsePositiveInt(req.query.interval, 60 * 1000); // 1 minute

      const history = await this.getMetricsHistory(timeRange, interval);
      res.json(history);
    }));

    // Log analysis
    this.app.get('/api/logs/analysis', this.asyncRoute(async (req, res) => {
      const timeRange = this.parsePositiveInt(req.query.timeRange, ONE_DAY_MS);
      const analysis = await this.logAnalyzer.generateReport(timeRange);
      res.json(analysis);
    }));

    // Alert history
    this.app.get('/api/alerts', (req, res) => {
      const limit = this.parsePositiveInt(req.query.limit, 50);
      const alerts = this.alertManager.getAlertHistory(limit);
      res.json(alerts);
    });

    // Alert statistics
    this.app.get('/api/alerts/stats', (req, res) => {
      const timeRange = this.parsePositiveInt(req.query.timeRange, ONE_DAY_MS);
      const stats = this.alertManager.getAlertStats(timeRange);
      res.json(stats);
    });

    // System health
    this.app.get('/api/health', this.asyncRoute(async (req, res) => {
      const health = await this.getSystemHealth();
      res.json(health);
    }));

    // Performance overview
    this.app.get('/api/performance', (req, res) => {
      const performance = this.getPerformanceOverview();
      res.json(performance);
    });
  }

  /**
   * Create the HTTP server and the socket.io endpoint used for live updates.
   *
   * Each connected client receives a metrics snapshot immediately and then
   * every 5 seconds until it disconnects.
   */
  setupWebSocket() {
    const http = require('http');
    const socketIo = require('socket.io');

    this.server = http.createServer(this.app);
    this.io = socketIo(this.server);

    this.io.on('connection', (socket) => {
      console.log('Dashboard client connected');

      // Initial snapshot
      socket.emit('metrics', this.metricsCollector.getAllMetrics());

      // Periodic updates, every 5 seconds
      const interval = setInterval(() => {
        socket.emit('metrics', this.metricsCollector.getAllMetrics());
      }, 5000);

      socket.on('disconnect', () => {
        clearInterval(interval);
        console.log('Dashboard client disconnected');
      });
    });

    // Forward alert events when the alert manager is an event emitter;
    // calling `.on` unconditionally would throw for a plain object.
    if (typeof this.alertManager?.on === 'function') {
      this.alertManager.on('alert', (alert) => {
        this.io.emit('alert', alert);
      });
    }
  }

  /**
   * Build a metrics time series.
   *
   * NOTE: this is demo data. Every point carries the current metrics
   * snapshot; real history should come from persistent storage.
   *
   * @param {number} timeRange - Length of the series in milliseconds.
   * @param {number} interval - Spacing between points in milliseconds.
   * @param {number} [maxPoints=10000] - Upper bound on the number of
   *   intervals, so that request-supplied values (or `interval = 0`) cannot
   *   cause an unbounded loop.
   * @returns {Promise<object[]>} Points ordered from oldest to newest.
   */
  async getMetricsHistory(timeRange, interval, maxPoints = 10000) {
    const history = [];
    const now = Date.now();
    const points = Math.min(Math.floor(timeRange / interval), maxPoints);

    for (let i = points; i >= 0; i--) {
      const timestamp = now - (i * interval);
      const metrics = this.metricsCollector.getAllMetrics();

      history.push({
        timestamp,
        ...metrics
      });
    }

    return history;
  }

  /**
   * Compute a 0-100 health score from recent alerts, the 5xx error rate and
   * the P95 response time.
   *
   * @returns {Promise<object>} Score, status (`healthy` above 80, `warning`
   *   above 60, otherwise `critical`) and process information.
   */
  async getSystemHealth() {
    const metrics = this.metricsCollector.getAllMetrics();
    const alerts = this.alertManager.getAlertStats(60 * 60 * 1000); // last hour

    let healthScore = 100;

    // 5 points per alert in the last hour.
    healthScore -= alerts.total * 5;

    // 10 points per percentage point of 5xx responses.
    const totalRequests = metrics.counters['http.requests.total'] || 0;
    const errorRequests = metrics.counters['http.responses.5xx'] || 0;
    if (totalRequests > 0) {
      const errorRate = (errorRequests / totalRequests) * 100;
      healthScore -= errorRate * 10;
    }

    // 1 point per 100ms of P95 latency above one second.
    const responseTime = metrics.histograms['http.request.duration'];
    if (responseTime && responseTime.p95 > 1000) {
      healthScore -= (responseTime.p95 - 1000) / 100;
    }

    healthScore = Math.max(0, Math.min(100, healthScore));

    return {
      score: Math.round(healthScore),
      status: healthScore > 80 ? 'healthy' : healthScore > 60 ? 'warning' : 'critical',
      uptime: process.uptime(),
      version: process.version,
      environment: process.env.NODE_ENV || 'development',
      lastUpdate: new Date().toISOString()
    };
  }

  /**
   * Summarize memory, CPU, event loop, HTTP and database metrics.
   *
   * @returns {object} Overview in which missing gauges and counters default to 0.
   */
  getPerformanceOverview() {
    const metrics = this.metricsCollector.getAllMetrics();

    return {
      memory: {
        heapUsed: metrics.gauges['memory.heap_used']?.value || 0,
        heapTotal: metrics.gauges['memory.heap_total']?.value || 0,
        rss: metrics.gauges['memory.rss']?.value || 0
      },
      cpu: {
        usage: metrics.gauges['cpu.user']?.value || 0,
        loadAverage: {
          '1m': metrics.gauges['system.load_1m']?.value || 0,
          '5m': metrics.gauges['system.load_5m']?.value || 0,
          '15m': metrics.gauges['system.load_15m']?.value || 0
        }
      },
      eventLoop: {
        delay: metrics.gauges['eventloop.delay']?.value || 0
      },
      http: {
        totalRequests: metrics.counters['http.requests.total'] || 0,
        responseTime: metrics.histograms['http.request.duration'] || {},
        errorRate: this.calculateErrorRate(metrics)
      },
      database: {
        activeConnections: metrics.gauges['db.connections.active']?.value || 0,
        totalConnections: metrics.gauges['db.connections.total']?.value || 0
      }
    };
  }

  /**
   * Percentage of requests answered with a 4xx or 5xx status.
   *
   * @param {object} metrics - Metrics snapshot with a `counters` map.
   * @returns {number} Error rate in percent; 0 when no requests were counted.
   */
  calculateErrorRate(metrics) {
    const totalRequests = metrics.counters['http.requests.total'] || 0;
    const errorRequests = (metrics.counters['http.responses.4xx'] || 0) +
                         (metrics.counters['http.responses.5xx'] || 0);

    return totalRequests > 0 ? (errorRequests / totalRequests) * 100 : 0;
  }

  /**
   * Render the dashboard page.
   *
   * The embedded client script uses template literals of its own. Their
   * backticks and `${` sequences are escaped so they are emitted verbatim
   * to the browser instead of terminating, or being interpolated by, this
   * server-side template literal.
   *
   * @returns {string} Complete HTML document.
   */
  generateDashboardHTML() {
    return `
      <!DOCTYPE html>
      <html lang="en">
      <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Node.js Monitoring Dashboard</title>
        <script src="https://cdn.socket.io/4.0.0/socket.io.min.js"></script>
        <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
        <style>
          * { margin: 0; padding: 0; box-sizing: border-box; }
          body { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; background: #f5f5f5; }
          .header { background: #2c3e50; color: white; padding: 1rem; text-align: center; }
          .container { max-width: 1200px; margin: 0 auto; padding: 2rem; }
          .grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 1rem; }
          .card { background: white; border-radius: 8px; padding: 1.5rem; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
          .metric { display: flex; justify-content: space-between; align-items: center; margin-bottom: 1rem; }
          .metric-label { font-weight: 600; color: #555; }
          .metric-value { font-size: 1.2rem; font-weight: bold; }
          .status-healthy { color: #27ae60; }
          .status-warning { color: #f39c12; }
          .status-critical { color: #e74c3c; }
          .chart-container { position: relative; height: 300px; }
          .alert { padding: 0.75rem; margin-bottom: 0.5rem; border-radius: 4px; }
          .alert-high { background: #ffebee; border-left: 4px solid #f44336; }
          .alert-medium { background: #fff3e0; border-left: 4px solid #ff9800; }
          .alert-low { background: #e8f5e8; border-left: 4px solid #4caf50; }
        </style>
      </head>
      <body>
        <div class="header">
          <h1>Node.js Monitoring Dashboard</h1>
          <p>Real-time application monitoring and alerting</p>
        </div>
        
        <div class="container">
          <div class="grid">
            <!-- 系统健康 -->
            <div class="card">
              <h3>System Health</h3>
              <div class="metric">
                <span class="metric-label">Health Score</span>
                <span class="metric-value" id="health-score">--</span>
              </div>
              <div class="metric">
                <span class="metric-label">Status</span>
                <span class="metric-value" id="health-status">--</span>
              </div>
              <div class="metric">
                <span class="metric-label">Uptime</span>
                <span class="metric-value" id="uptime">--</span>
              </div>
            </div>
            
            <!-- 内存使用 -->
            <div class="card">
              <h3>Memory Usage</h3>
              <div class="chart-container">
                <canvas id="memory-chart"></canvas>
              </div>
            </div>
            
            <!-- HTTP 指标 -->
            <div class="card">
              <h3>HTTP Metrics</h3>
              <div class="metric">
                <span class="metric-label">Total Requests</span>
                <span class="metric-value" id="total-requests">--</span>
              </div>
              <div class="metric">
                <span class="metric-label">Error Rate</span>
                <span class="metric-value" id="error-rate">--</span>
              </div>
              <div class="metric">
                <span class="metric-label">Avg Response Time</span>
                <span class="metric-value" id="avg-response-time">--</span>
              </div>
            </div>
            
            <!-- 最近告警 -->
            <div class="card">
              <h3>Recent Alerts</h3>
              <div id="recent-alerts">
                <p>No recent alerts</p>
              </div>
            </div>
          </div>
        </div>
        
        <script>
          const socket = io();
          let memoryChart;
          
          // 初始化图表
          function initCharts() {
            const ctx = document.getElementById('memory-chart').getContext('2d');
            memoryChart = new Chart(ctx, {
              type: 'line',
              data: {
                labels: [],
                datasets: [{
                  label: 'Heap Used (MB)',
                  data: [],
                  borderColor: '#3498db',
                  backgroundColor: 'rgba(52, 152, 219, 0.1)',
                  tension: 0.4
                }]
              },
              options: {
                responsive: true,
                maintainAspectRatio: false,
                scales: {
                  y: {
                    beginAtZero: true
                  }
                }
              }
            });
          }
          
          // 更新指标显示
          function updateMetrics(metrics) {
            // 更新内存图表
            const heapUsed = metrics.gauges['memory.heap_used']?.value || 0;
            const heapUsedMB = Math.round(heapUsed / 1024 / 1024);
            
            const now = new Date().toLocaleTimeString();
            memoryChart.data.labels.push(now);
            memoryChart.data.datasets[0].data.push(heapUsedMB);
            
            // 保持最近 20 个数据点
            if (memoryChart.data.labels.length > 20) {
              memoryChart.data.labels.shift();
              memoryChart.data.datasets[0].data.shift();
            }
            
            memoryChart.update('none');
            
            // 更新 HTTP 指标
            const totalRequests = metrics.counters['http.requests.total'] || 0;
            document.getElementById('total-requests').textContent = totalRequests.toLocaleString();
            
            const errorRate = calculateErrorRate(metrics);
            document.getElementById('error-rate').textContent = errorRate.toFixed(2) + '%';
            
            const responseTime = metrics.histograms['http.request.duration'];
            const avgResponseTime = responseTime?.mean || 0;
            document.getElementById('avg-response-time').textContent = Math.round(avgResponseTime) + 'ms';
          }
          
          // 计算错误率
          function calculateErrorRate(metrics) {
            const totalRequests = metrics.counters['http.requests.total'] || 0;
            const errorRequests = (metrics.counters['http.responses.4xx'] || 0) + 
                                 (metrics.counters['http.responses.5xx'] || 0);
            
            return totalRequests > 0 ? (errorRequests / totalRequests) * 100 : 0;
          }
          
          // 更新系统健康状态
          async function updateHealth() {
            try {
              const response = await fetch('/api/health');
              const health = await response.json();
              
              document.getElementById('health-score').textContent = health.score;
              
              const statusElement = document.getElementById('health-status');
              statusElement.textContent = health.status.toUpperCase();
              statusElement.className = \`metric-value status-\${health.status}\`;
              
              const uptimeHours = Math.floor(health.uptime / 3600);
              const uptimeMinutes = Math.floor((health.uptime % 3600) / 60);
              document.getElementById('uptime').textContent = \`\${uptimeHours}h \${uptimeMinutes}m\`;
            } catch (error) {
              console.error('Failed to fetch health data:', error);
            }
          }
          
          // 更新告警显示
          async function updateAlerts() {
            try {
              const response = await fetch('/api/alerts?limit=5');
              const alerts = await response.json();
              
              const alertsContainer = document.getElementById('recent-alerts');
              
              if (alerts.length === 0) {
                alertsContainer.innerHTML = '<p>No recent alerts</p>';
                return;
              }
              
              alertsContainer.innerHTML = alerts.map(alert => \`
                <div class="alert alert-\${alert.severity}">
                  <strong>\${alert.message}</strong><br>
                  <small>\${new Date(alert.timestamp).toLocaleString()}</small>
                </div>
              \`).join('');
            } catch (error) {
              console.error('Failed to fetch alerts:', error);
            }
          }
          
          // Socket 事件监听
          socket.on('metrics', updateMetrics);
          
          socket.on('alert', (alert) => {
            // 显示新告警通知
            console.log('New alert:', alert);
            updateAlerts();
          });
          
          // 初始化
          document.addEventListener('DOMContentLoaded', () => {
            initCharts();
            updateHealth();
            updateAlerts();
            
            // 定期更新健康状态和告警
            setInterval(updateHealth, 30000); // 每 30 秒
            setInterval(updateAlerts, 60000);  // 每分钟
          });
        </script>
      </body>
      </html>
    `;
  }

  /**
   * Start listening for HTTP and WebSocket connections.
   *
   * @param {number} [port=3001] - TCP port to bind.
   */
  start(port = 3001) {
    this.server.listen(port, () => {
      console.log(`Monitoring dashboard running on http://localhost:${port}`);
    });
  }
}

module.exports = { MonitoringDashboard };

最佳实践

1. 监控策略

  • 分层监控:应用层、系统层、业务层
  • 关键指标:响应时间、错误率、吞吐量、资源使用
  • 告警阈值:根据历史数据设置合理阈值
  • 监控覆盖:覆盖所有关键路径和组件

2. 日志管理

  • 结构化日志:使用 JSON 格式便于分析
  • 日志级别:合理使用不同级别
  • 日志轮转:防止日志文件过大
  • 敏感信息:避免记录敏感数据

3. 告警设计

  • 告警分级:区分不同严重程度
  • 告警去重:避免重复告警
  • 告警恢复:问题解决后及时发送恢复通知
  • 告警测试:定期测试告警机制

4. 性能优化

  • 异步处理:避免阻塞主线程
  • 批量操作:减少 I/O 操作次数
  • 缓存策略:合理使用缓存
  • 资源清理:及时释放资源

总结

本章介绍了 Node.js 应用的监控和日志管理,包括:

  • 健康检查:系统状态监控
  • 性能指标:关键指标收集和分析
  • 日志管理:结构化日志和分析
  • 告警系统:智能告警和通知
  • 监控仪表板:可视化监控界面

通过完善的监控体系,可以及时发现和解决问题,确保应用的稳定运行。