VWED_server/services/execution/handlers/storage_queue_manager.py

462 lines
18 KiB
Python
Raw Normal View History

2025-07-30 15:11:59 +08:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
库位请求队列管理器
提供稳定的库位分配队列机制避免高并发冲突
"""
import asyncio
import time
import json
import uuid
from typing import Dict, Any, List, Optional, Callable
from dataclasses import dataclass, field
from enum import Enum
from collections import defaultdict
import heapq
from utils.logger import get_logger
from config.settings import settings
logger = get_logger("services.execution.handlers.storage_queue_manager")
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
class RequestPriority(Enum):
"""请求优先级"""
LOW = 1
NORMAL = 2
HIGH = 3
URGENT = 4
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
class RequestStatus(Enum):
"""请求状态"""
PENDING = "pending"
PROCESSING = "processing"
COMPLETED = "completed"
FAILED = "failed"
TIMEOUT = "timeout"
CANCELLED = "cancelled"
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
@dataclass
class StorageRequest:
"""库位请求"""
request_id: str
handler_type: str # "idle_site" 或 "idle_crowded_site"
input_params: Dict[str, Any]
context_data: Dict[str, Any]
map_id: str
task_record_id: str
priority: RequestPriority = RequestPriority.NORMAL
created_at: float = field(default_factory=time.time)
2025-10-19 15:33:54 +08:00
timeout: float = field(
default_factory=lambda: settings.STORAGE_QUEUE_DEFAULT_TIMEOUT if settings.STORAGE_QUEUE_ENABLE_TIMEOUT else float(
'inf'))
2025-07-30 15:11:59 +08:00
retry_count: int = 0
max_retries: int = 3
status: RequestStatus = RequestStatus.PENDING
result: Optional[Dict[str, Any]] = None
error_message: Optional[str] = None
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
def __lt__(self, other):
"""用于优先级队列排序"""
if self.priority.value != other.priority.value:
return self.priority.value > other.priority.value # 高优先级优先
return self.created_at < other.created_at # 时间早的优先
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
class StorageQueueManager:
"""库位请求队列管理器"""
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
def __init__(self, max_workers: int = settings.STORAGE_QUEUE_MAX_WORKERS, max_queue_size: int = None):
self.max_workers = max_workers or settings.STORAGE_QUEUE_MAX_WORKERS
self.max_queue_size = max_queue_size or settings.STORAGE_QUEUE_MAX_SIZE
self.enable_timeout = settings.STORAGE_QUEUE_ENABLE_TIMEOUT
self.cleanup_interval = settings.STORAGE_QUEUE_CLEANUP_INTERVAL
self.completed_request_ttl = settings.STORAGE_QUEUE_COMPLETED_REQUEST_TTL
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
# 优先级队列
self.request_queue: List[StorageRequest] = []
self.queue_lock = asyncio.Lock()
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
# 请求跟踪
self.pending_requests: Dict[str, StorageRequest] = {}
self.processing_requests: Dict[str, StorageRequest] = {}
self.completed_requests: Dict[str, StorageRequest] = {}
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
# 工作者管理
self.workers: List[asyncio.Task] = []
self.worker_semaphore = asyncio.Semaphore(max_workers)
self.shutdown_event = asyncio.Event()
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
# 处理器注册
self.handlers: Dict[str, Callable] = {}
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
# 统计信息
self.stats = {
'requests_total': 0,
'requests_completed': 0,
'requests_failed': 0,
'requests_timeout': 0,
'avg_processing_time': 0,
'queue_size': 0,
'active_workers': 0
}
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
# 启动清理任务
self.cleanup_task = None
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
async def start(self):
"""启动队列管理器"""
logger.info("启动库位请求队列管理器")
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
# 启动工作者
for i in range(self.max_workers):
worker = asyncio.create_task(self._worker(f"worker-{i}"))
self.workers.append(worker)
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
# 启动清理任务
self.cleanup_task = asyncio.create_task(self._cleanup_completed_requests())
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
logger.info(f"队列管理器已启动,工作者数量: {self.max_workers}")
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
async def stop(self):
"""停止队列管理器"""
logger.info("停止库位请求队列管理器")
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
# 设置停止信号
self.shutdown_event.set()
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
# 等待所有工作者完成
if self.workers:
await asyncio.gather(*self.workers, return_exceptions=True)
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
# 停止清理任务
if self.cleanup_task:
self.cleanup_task.cancel()
try:
await self.cleanup_task
except asyncio.CancelledError:
pass
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
logger.info("队列管理器已停止")
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
def register_handler(self, handler_type: str, handler_func: Callable):
"""注册处理器"""
self.handlers[handler_type] = handler_func
logger.info(f"注册处理器: {handler_type}")
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
async def submit_request(self, request: StorageRequest) -> str:
"""提交请求到队列"""
if len(self.request_queue) >= self.max_queue_size:
raise Exception("队列已满,无法提交新请求")
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
async with self.queue_lock:
# 添加到优先级队列
heapq.heappush(self.request_queue, request)
self.pending_requests[request.request_id] = request
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
# 更新统计
self.stats['requests_total'] += 1
self.stats['queue_size'] = len(self.request_queue)
2025-10-19 15:33:54 +08:00
2025-09-09 10:41:27 +08:00
logger.debug(f"提交请求到队列: {request.request_id}, 优先级: {request.priority.name}")
2025-07-30 15:11:59 +08:00
return request.request_id
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
async def get_request_status(self, request_id: str) -> Optional[Dict[str, Any]]:
"""获取请求状态"""
# 检查各个状态的请求
if request_id in self.pending_requests:
request = self.pending_requests[request_id]
return {
"request_id": request_id,
"status": request.status.value,
"created_at": request.created_at,
"queue_position": self._get_queue_position(request_id)
}
elif request_id in self.processing_requests:
request = self.processing_requests[request_id]
return {
"request_id": request_id,
"status": request.status.value,
"created_at": request.created_at,
"processing_time": time.time() - request.created_at
}
elif request_id in self.completed_requests:
request = self.completed_requests[request_id]
return {
"request_id": request_id,
"status": request.status.value,
"created_at": request.created_at,
"result": request.result,
"error_message": request.error_message,
"processing_time": time.time() - request.created_at
}
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
return None
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
async def wait_for_result(self, request_id: str, timeout: float = None) -> Dict[str, Any]:
"""等待请求结果"""
start_time = time.time()
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
# 根据配置决定是否使用超时
if not self.enable_timeout:
# 禁用超时,无限等待
check_timeout = float('inf')
2025-09-09 10:41:27 +08:00
logger.debug(f"等待请求结果(无超时限制): {request_id}")
2025-07-30 15:11:59 +08:00
else:
# 启用超时,使用提供的超时时间或默认值
check_timeout = timeout or settings.STORAGE_QUEUE_DEFAULT_TIMEOUT
2025-09-09 10:41:27 +08:00
logger.debug(f"等待请求结果(超时 {check_timeout}s: {request_id}")
2025-10-19 15:33:54 +08:00
try:
while True:
# 检查是否完成
if request_id in self.completed_requests:
request = self.completed_requests[request_id]
if request.status == RequestStatus.COMPLETED:
result = request.result
# 立即清理已完成的请求,避免内存积累
del self.completed_requests[request_id]
logger.debug(f"请求结果已取走并清理: {request_id}")
return result
else:
error_msg = request.error_message
# 失败的请求也立即清理
del self.completed_requests[request_id]
logger.debug(f"请求失败结果已取走并清理: {request_id}")
raise Exception(f"请求失败: {error_msg}")
# 只有在启用超时时才检查超时
if self.enable_timeout:
# 检查等待超时
if time.time() - start_time >= check_timeout:
2025-07-30 15:11:59 +08:00
await self._mark_request_timeout(request_id)
2025-10-19 15:33:54 +08:00
raise Exception("等待结果超时")
# 检查请求本身是否超时
if request_id in self.pending_requests:
request = self.pending_requests[request_id]
if request.timeout != float('inf') and time.time() - request.created_at > request.timeout:
await self._mark_request_timeout(request_id)
raise Exception("请求超时")
await asyncio.sleep(0.1) # 避免忙等待
except Exception:
# 发生异常时,也要尝试清理可能存在的已完成请求
if request_id in self.completed_requests:
del self.completed_requests[request_id]
logger.debug(f"异常时清理请求: {request_id}")
raise
2025-07-30 15:11:59 +08:00
async def cancel_request(self, request_id: str) -> bool:
"""取消请求"""
async with self.queue_lock:
if request_id in self.pending_requests:
request = self.pending_requests[request_id]
request.status = RequestStatus.CANCELLED
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
# 从队列中移除
self.request_queue = [r for r in self.request_queue if r.request_id != request_id]
heapq.heapify(self.request_queue)
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
# 移动到完成队列
del self.pending_requests[request_id]
self.completed_requests[request_id] = request
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
self.stats['queue_size'] = len(self.request_queue)
logger.info(f"取消请求: {request_id}")
return True
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
return False
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
def get_queue_stats(self) -> Dict[str, Any]:
"""获取队列统计信息"""
self.stats['queue_size'] = len(self.request_queue)
self.stats['active_workers'] = sum(1 for w in self.workers if not w.done())
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
return self.stats.copy()
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
async def _worker(self, worker_name: str):
"""工作者协程"""
logger.info(f"工作者 {worker_name} 启动")
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
while not self.shutdown_event.is_set():
try:
# 获取请求
request = await self._get_next_request()
if not request:
await asyncio.sleep(0.1)
continue
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
# 处理请求
async with self.worker_semaphore:
await self._process_request(request, worker_name)
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
except Exception as e:
logger.error(f"工作者 {worker_name} 处理请求异常: {str(e)}")
await asyncio.sleep(1)
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
logger.info(f"工作者 {worker_name} 停止")
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
async def _get_next_request(self) -> Optional[StorageRequest]:
"""获取下一个请求"""
async with self.queue_lock:
while self.request_queue:
request = heapq.heappop(self.request_queue)
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
# 检查请求是否仍然有效
if request.request_id in self.pending_requests:
# 只有在启用超时且请求设置了有限超时时才检查超时
2025-10-19 15:33:54 +08:00
if (self.enable_timeout and
request.timeout != float('inf') and
time.time() - request.created_at > request.timeout):
2025-07-30 15:11:59 +08:00
await self._mark_request_timeout(request.request_id)
continue
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
# 移动到处理队列
del self.pending_requests[request.request_id]
self.processing_requests[request.request_id] = request
request.status = RequestStatus.PROCESSING
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
self.stats['queue_size'] = len(self.request_queue)
return request
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
return None
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
async def _process_request(self, request: StorageRequest, worker_name: str):
"""处理单个请求"""
start_time = time.time()
2025-09-09 10:41:27 +08:00
logger.debug(f"工作者 {worker_name} 开始处理请求: {request.request_id}")
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
try:
# 获取处理器
handler = self.handlers.get(request.handler_type)
if not handler:
raise Exception(f"未找到处理器: {request.handler_type}")
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
# 执行处理
result = await handler(request.input_params, request.context_data, request.map_id, request.task_record_id)
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
# 标记完成
await self._mark_request_completed(request.request_id, result)
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
processing_time = time.time() - start_time
2025-09-09 10:41:27 +08:00
logger.debug(f"工作者 {worker_name} 完成请求: {request.request_id}, 用时: {processing_time:.3f}s")
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
except Exception as e:
# 处理失败
error_msg = str(e)
logger.error(f"工作者 {worker_name} 处理请求失败: {request.request_id}, 错误: {error_msg}")
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
# 检查是否需要重试
if request.retry_count < request.max_retries:
await self._retry_request(request)
else:
await self._mark_request_failed(request.request_id, error_msg)
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
finally:
2025-10-19 15:33:54 +08:00
# 更新平均处理时间(只在这里统计,移除 _mark_request_completed 中的重复计数)
2025-07-30 15:11:59 +08:00
processing_time = time.time() - start_time
2025-10-19 15:33:54 +08:00
if self.stats['requests_completed'] > 0:
total_time = self.stats['avg_processing_time'] * self.stats['requests_completed']
self.stats['avg_processing_time'] = (total_time + processing_time) / (
self.stats['requests_completed'] + 1)
else:
self.stats['avg_processing_time'] = processing_time
2025-07-30 15:11:59 +08:00
async def _mark_request_completed(self, request_id: str, result: Dict[str, Any]):
"""标记请求完成"""
if request_id in self.processing_requests:
request = self.processing_requests[request_id]
request.status = RequestStatus.COMPLETED
request.result = result
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
del self.processing_requests[request_id]
self.completed_requests[request_id] = request
2025-10-19 15:33:54 +08:00
# 更新完成计数(移除了重复计数)
2025-07-30 15:11:59 +08:00
self.stats['requests_completed'] += 1
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
async def _mark_request_failed(self, request_id: str, error_message: str):
"""标记请求失败"""
if request_id in self.processing_requests:
request = self.processing_requests[request_id]
request.status = RequestStatus.FAILED
request.error_message = error_message
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
del self.processing_requests[request_id]
self.completed_requests[request_id] = request
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
self.stats['requests_failed'] += 1
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
async def _mark_request_timeout(self, request_id: str):
"""标记请求超时"""
request = None
if request_id in self.pending_requests:
request = self.pending_requests[request_id]
del self.pending_requests[request_id]
elif request_id in self.processing_requests:
request = self.processing_requests[request_id]
del self.processing_requests[request_id]
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
if request:
request.status = RequestStatus.TIMEOUT
request.error_message = "请求超时"
self.completed_requests[request_id] = request
self.stats['requests_timeout'] += 1
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
async def _retry_request(self, request: StorageRequest):
"""重试请求"""
request.retry_count += 1
request.status = RequestStatus.PENDING
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
# 重新加入队列
async with self.queue_lock:
heapq.heappush(self.request_queue, request)
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
del self.processing_requests[request.request_id]
self.pending_requests[request.request_id] = request
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
self.stats['queue_size'] = len(self.request_queue)
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
logger.info(f"重试请求: {request.request_id}, 第 {request.retry_count} 次重试")
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
def _get_queue_position(self, request_id: str) -> int:
"""获取请求在队列中的位置"""
for i, request in enumerate(self.request_queue):
if request.request_id == request_id:
return i + 1
return -1
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
async def _cleanup_completed_requests(self):
2025-10-19 15:33:54 +08:00
"""
清理已完成的请求兜底机制
注意采用立即清理模式后此方法主要作为兜底保护防止异常情况下的内存泄漏
"""
2025-07-30 15:11:59 +08:00
while not self.shutdown_event.is_set():
try:
await asyncio.sleep(self.cleanup_interval) # 使用配置的清理间隔
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
current_time = time.time()
cleanup_threshold = self.completed_request_ttl # 使用配置的保留时间
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
to_remove = []
for request_id, request in self.completed_requests.items():
if current_time - request.created_at > cleanup_threshold:
to_remove.append(request_id)
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
for request_id in to_remove:
del self.completed_requests[request_id]
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
if to_remove:
2025-10-19 15:33:54 +08:00
logger.info(
f"兜底清理了 {len(to_remove)} 个未被取走的已完成请求(可能存在未调用 wait_for_result 的情况)")
2025-07-30 15:11:59 +08:00
except Exception as e:
logger.error(f"清理已完成请求异常: {str(e)}")
2025-10-19 15:33:54 +08:00
2025-07-30 15:11:59 +08:00
# 全局队列管理器实例
storage_queue_manager = StorageQueueManager()