333 lines
12 KiB
Python
333 lines
12 KiB
Python
#!/usr/bin/env python
|
||
# -*- coding: utf-8 -*-
|
||
|
||
"""
|
||
工作线程管理模块
|
||
负责工作线程的创建、监控和调整
|
||
"""
|
||
|
||
import asyncio
|
||
import logging
|
||
import time
|
||
from typing import Dict, List, Any, Optional, Set, Callable, Coroutine
|
||
from datetime import datetime, timedelta
|
||
import psutil
|
||
|
||
from utils.logger import get_logger
|
||
|
||
# 获取日志记录器
|
||
logger = get_logger("services.enhanced_scheduler.worker_manager")
|
||
|
||
class WorkerManager:
|
||
"""
|
||
工作线程管理器
|
||
负责管理和监控工作线程池
|
||
"""
|
||
|
||
def __init__(self,
|
||
min_workers: int = 5,
|
||
max_workers: int = 20,
|
||
worker_heartbeat_interval: int = 30,
|
||
auto_scale_interval: int = 60,
|
||
cpu_threshold: float = 80.0,
|
||
memory_threshold: float = 80.0):
|
||
"""
|
||
初始化工作线程管理器
|
||
|
||
Args:
|
||
min_workers: 最小工作线程数
|
||
max_workers: 最大工作线程数
|
||
worker_heartbeat_interval: 工作线程心跳间隔(秒)
|
||
auto_scale_interval: 自动扩缩容间隔(秒)
|
||
cpu_threshold: CPU使用率阈值(百分比)
|
||
memory_threshold: 内存使用率阈值(百分比)
|
||
"""
|
||
self.min_workers = min_workers
|
||
self.max_workers = max_workers
|
||
self.worker_heartbeat_interval = worker_heartbeat_interval
|
||
self.auto_scale_interval = auto_scale_interval
|
||
self.cpu_threshold = cpu_threshold
|
||
self.memory_threshold = memory_threshold
|
||
|
||
self.workers = {} # 工作线程字典 {worker_id: worker_task}
|
||
self.worker_status = {} # 工作线程状态 {worker_id: status_dict}
|
||
self.worker_heartbeats = {} # 工作线程心跳 {worker_id: last_heartbeat_time}
|
||
|
||
self.worker_factory = None # 工作线程工厂函数
|
||
self.is_running = False
|
||
|
||
self.monitor_task = None # 监控任务
|
||
self.last_auto_scale_time = datetime.now()
|
||
|
||
logger.info(f"初始化工作线程管理器: min={min_workers}, max={max_workers}, "
|
||
f"心跳间隔={worker_heartbeat_interval}秒, 自动扩缩容间隔={auto_scale_interval}秒")
|
||
|
||
def set_worker_factory(self, factory: Callable[[int], Coroutine]):
|
||
"""
|
||
设置工作线程工厂函数
|
||
|
||
Args:
|
||
factory: 工作线程工厂函数,接受worker_id参数,返回协程对象
|
||
"""
|
||
self.worker_factory = factory
|
||
|
||
async def start(self, initial_workers: int = None) -> None:
|
||
"""
|
||
启动工作线程管理器
|
||
|
||
Args:
|
||
initial_workers: 初始工作线程数,默认为min_workers
|
||
"""
|
||
if self.is_running:
|
||
logger.warning("工作线程管理器已经在运行中")
|
||
return
|
||
|
||
if not self.worker_factory:
|
||
raise ValueError("未设置工作线程工厂函数")
|
||
|
||
self.is_running = True
|
||
|
||
# 启动初始工作线程
|
||
initial_count = initial_workers if initial_workers is not None else self.min_workers
|
||
initial_count = max(self.min_workers, min(initial_count, self.max_workers))
|
||
|
||
for i in range(initial_count):
|
||
await self.add_worker()
|
||
|
||
# 启动监控任务
|
||
# self.monitor_task = asyncio.create_task(self._monitor_workers())
|
||
|
||
logger.info(f"工作线程管理器启动成功,初始工作线程数: {initial_count}")
|
||
|
||
async def stop(self) -> None:
|
||
"""
|
||
停止工作线程管理器
|
||
"""
|
||
if not self.is_running:
|
||
logger.warning("工作线程管理器未在运行")
|
||
return
|
||
|
||
self.is_running = False
|
||
|
||
# 取消监控任务
|
||
if self.monitor_task:
|
||
self.monitor_task.cancel()
|
||
try:
|
||
await self.monitor_task
|
||
except asyncio.CancelledError:
|
||
pass
|
||
self.monitor_task = None
|
||
|
||
# 取消所有工作线程
|
||
for worker_id, worker in list(self.workers.items()):
|
||
await self.remove_worker(worker_id)
|
||
|
||
logger.info("工作线程管理器已停止")
|
||
|
||
async def add_worker(self) -> int:
|
||
"""
|
||
添加工作线程
|
||
|
||
Returns:
|
||
int: 工作线程ID
|
||
"""
|
||
if len(self.workers) >= self.max_workers:
|
||
logger.warning(f"已达到最大工作线程数 {self.max_workers}")
|
||
return -1
|
||
|
||
# 生成新的工作线程ID
|
||
worker_id = 0
|
||
while worker_id in self.workers:
|
||
worker_id += 1
|
||
|
||
# 创建工作线程
|
||
worker = asyncio.create_task(self.worker_factory(worker_id))
|
||
|
||
# 记录工作线程
|
||
self.workers[worker_id] = worker
|
||
self.worker_status[worker_id] = {
|
||
"state": "running",
|
||
"start_time": datetime.now(),
|
||
"last_activity": datetime.now(),
|
||
"task_count": 0
|
||
}
|
||
self.worker_heartbeats[worker_id] = datetime.now()
|
||
|
||
logger.info(f"添加工作线程 {worker_id}, 当前工作线程数: {len(self.workers)}")
|
||
|
||
return worker_id
|
||
|
||
async def remove_worker(self, worker_id: int) -> bool:
|
||
"""
|
||
移除工作线程
|
||
|
||
Args:
|
||
worker_id: 工作线程ID
|
||
|
||
Returns:
|
||
bool: 是否成功移除
|
||
"""
|
||
if worker_id not in self.workers:
|
||
logger.warning(f"工作线程 {worker_id} 不存在")
|
||
return False
|
||
|
||
# 取消工作线程
|
||
worker = self.workers.pop(worker_id)
|
||
worker.cancel()
|
||
|
||
try:
|
||
await worker
|
||
except asyncio.CancelledError:
|
||
pass
|
||
|
||
# 移除状态记录
|
||
self.worker_status.pop(worker_id, None)
|
||
self.worker_heartbeats.pop(worker_id, None)
|
||
|
||
logger.info(f"移除工作线程 {worker_id}, 当前工作线程数: {len(self.workers)}")
|
||
|
||
return True
|
||
|
||
def update_worker_heartbeat(self, worker_id: int) -> None:
|
||
"""
|
||
更新工作线程心跳
|
||
|
||
Args:
|
||
worker_id: 工作线程ID
|
||
"""
|
||
if worker_id in self.worker_heartbeats:
|
||
self.worker_heartbeats[worker_id] = datetime.now()
|
||
self.worker_status[worker_id]["last_activity"] = datetime.now()
|
||
|
||
def update_worker_status(self, worker_id: int, status_update: Dict[str, Any]) -> None:
|
||
"""
|
||
更新工作线程状态
|
||
|
||
Args:
|
||
worker_id: 工作线程ID
|
||
status_update: 状态更新字典
|
||
"""
|
||
if worker_id in self.worker_status:
|
||
self.worker_status[worker_id].update(status_update)
|
||
self.update_worker_heartbeat(worker_id)
|
||
|
||
async def _monitor_workers(self) -> None:
|
||
"""
|
||
监控工作线程
|
||
检查心跳、资源使用情况,自动扩缩容
|
||
"""
|
||
logger.info("工作线程监控任务启动")
|
||
while self.is_running:
|
||
try:
|
||
# 检查工作线程心跳
|
||
now = datetime.now()
|
||
print("self.worker_heartbeats:::::::::::::::::",self.worker_heartbeats)
|
||
print("self.worker_status:::::::::::::::::",self.worker_status)
|
||
for worker_id, last_heartbeat in list(self.worker_heartbeats.items()):
|
||
if (now - last_heartbeat).total_seconds() > self.worker_heartbeat_interval * 2:
|
||
logger.warning(f"工作线程 {worker_id} 心跳超时,重启中...")
|
||
|
||
# 重启工作线程
|
||
await self.remove_worker(worker_id)
|
||
await self.add_worker()
|
||
|
||
# 自动扩缩容
|
||
if (now - self.last_auto_scale_time).total_seconds() > self.auto_scale_interval:
|
||
await self._auto_scale()
|
||
self.last_auto_scale_time = now
|
||
|
||
# 休眠一段时间
|
||
await asyncio.sleep(self.worker_heartbeat_interval / 2)
|
||
|
||
except asyncio.CancelledError:
|
||
# 取消异常,退出循环
|
||
logger.info("工作线程监控任务被取消")
|
||
break
|
||
except Exception as e:
|
||
logger.error(f"工作线程监控任务异常: {str(e)}")
|
||
# 出现异常时短暂休眠,避免频繁错误
|
||
await asyncio.sleep(5.0)
|
||
|
||
logger.info("工作线程监控任务结束")
|
||
|
||
async def _auto_scale(self) -> None:
|
||
"""
|
||
自动扩缩容
|
||
根据系统资源使用情况和任务队列长度自动调整工作线程数量
|
||
"""
|
||
try:
|
||
current_workers = len(self.workers)
|
||
|
||
# 获取系统资源使用情况
|
||
cpu_percent = psutil.cpu_percent()
|
||
memory_percent = psutil.virtual_memory().percent
|
||
|
||
# 获取任务队列长度(需要外部传入或通过其他方式获取)
|
||
queue_size = self.get_queue_size()
|
||
|
||
# 根据资源使用情况和队列长度决定是否调整工作线程数量
|
||
if cpu_percent > self.cpu_threshold or memory_percent > self.memory_threshold:
|
||
# 资源使用率过高,减少工作线程
|
||
if current_workers > self.min_workers:
|
||
# 移除最后添加的工作线程
|
||
worker_id = max(self.workers.keys())
|
||
await self.remove_worker(worker_id)
|
||
logger.info(f"资源使用率过高(CPU:{cpu_percent}%, MEM:{memory_percent}%), 减少工作线程至 {current_workers-1}")
|
||
elif queue_size > current_workers * 2:
|
||
# 队列任务较多,增加工作线程
|
||
if current_workers < self.max_workers:
|
||
await self.add_worker()
|
||
logger.info(f"队列任务较多(size:{queue_size}), 增加工作线程至 {current_workers+1}")
|
||
elif queue_size == 0 and current_workers > self.min_workers:
|
||
# 队列为空,减少工作线程
|
||
worker_id = max(self.workers.keys())
|
||
await self.remove_worker(worker_id)
|
||
logger.info(f"队列为空,减少工作线程至 {current_workers-1}")
|
||
|
||
except Exception as e:
|
||
logger.error(f"自动扩缩容异常: {str(e)}")
|
||
|
||
def get_queue_size(self) -> int:
|
||
"""
|
||
获取任务队列长度
|
||
默认实现,需要在子类中重写或设置外部获取方法
|
||
|
||
Returns:
|
||
int: 队列长度
|
||
"""
|
||
# 此方法应由子类重写或设置外部方法
|
||
return 0
|
||
|
||
def set_queue_size_getter(self, getter: Callable[[], int]) -> None:
|
||
"""
|
||
设置队列长度获取方法
|
||
|
||
Args:
|
||
getter: 队列长度获取函数
|
||
"""
|
||
self.get_queue_size = getter
|
||
|
||
def get_worker_count(self) -> int:
|
||
"""
|
||
获取当前工作线程数量
|
||
|
||
Returns:
|
||
int: 工作线程数量
|
||
"""
|
||
return len(self.workers)
|
||
|
||
def get_worker_status(self) -> Dict[str, Any]:
|
||
"""
|
||
获取工作线程状态信息
|
||
|
||
Returns:
|
||
Dict[str, Any]: 工作线程状态
|
||
"""
|
||
return {
|
||
"worker_count": len(self.workers),
|
||
"min_workers": self.min_workers,
|
||
"max_workers": self.max_workers,
|
||
"workers": self.worker_status,
|
||
"cpu_usage": psutil.cpu_percent(),
|
||
"memory_usage": psutil.virtual_memory().percent
|
||
} |