修复内存溢出问题

This commit is contained in:
靳中伟 2025-10-19 15:33:54 +08:00
parent 23b7b99dc6
commit 3f578c03e8
21 changed files with 15003 additions and 1005 deletions

View File

@ -245,10 +245,10 @@ class BaseConfig(BaseSettings):
DB_NAME: str = Field(default=_db_config['database'], env="DB_NAME")
DB_CHARSET: str = Field(default=_db_config['charset'], env="DB_CHARSET")
DB_ECHO: bool = False # 是否输出SQL语句
DB_POOL_SIZE: int = 50 # 增加连接池基础大小
DB_MAX_OVERFLOW: int = 100 # 增加溢出连接数
DB_POOL_RECYCLE: int = 1800 # 减少连接回收时间,防止连接过期
DB_POOL_TIMEOUT: int = 60 # 获取连接的超时时间
DB_POOL_SIZE: int = 10 # 连接池基础大小优化从50降到10节省内存
DB_MAX_OVERFLOW: int = 20 # 溢出连接数优化从100降到20节省内存
DB_POOL_RECYCLE: int = 1800 # 连接回收时间30分钟,防止连接过期
DB_POOL_TIMEOUT: int = 30 # 获取连接的超时时间优化从60降到30秒
DB_POOL_PRE_PING: bool = True # 连接前检测连接可用性
# Redis配置

View File

@ -62,7 +62,7 @@ TASK_TYPE_AREA={
"LG": "AGW/PL"
}
# 从环境变量读取配置,或使用默认值
TF_API_BASE_URL = os.getenv("TF_API_BASE_URL", "http://192.168.189.206:8080/jeecg-boot")
TF_API_BASE_URL = os.getenv("TF_API_BASE_URL", "http://192.168.189.187:8080/jeecg-boot")
# TF_API_BASE_URL = os.getenv("TF_API_BASE_URL", "http://111.231.146.230:4080/jeecg-boot")
TF_API_TIMEOUT = int(os.getenv("TF_API_TIMEOUT", "10")) # 减少超时时间从60秒到10秒
TF_API_RETRY_TIMES = int(os.getenv("TF_API_RETRY_TIMES", "3"))

View File

@ -0,0 +1,89 @@
# 选择机器人块输出参数立即更新说明
## 修改背景
选择机器人块CSelectAgvBp和机器人执行动作块CAgvOperationBp是嵌套关系。之前的实现中虽然选择机器人块在选择完成后状态就显示为成功但是输出内容选择出的机器人结果要等到整个嵌套的动作块执行完成后才会更新到表中在任务记录详情里才能显示出来。这不符合实际的显示需求。
## 修改内容
### 1. 修改 `RobotBlockHandler.update_block_record` 方法
**文件**: `services/execution/handlers/robot_scheduling.py`
**修改点**:
- 增加 `block_name``output_data` 参数
- 在选择机器人成功后,立即更新块记录的输出参数到数据库
- 构建完整的输出结构:`{"blocks": {"块名称": {"selectedAgvId": "机器人名称"}}}`
- 同时更新 `output_params``block_out_params_value` 字段
### 2. 修改选择机器人块的调用逻辑
**文件**: `services/execution/handlers/robot_scheduling.py`
**修改点**:
- 在选择机器人成功后第897-902行立即调用 `update_block_record` 方法
- 传入块名称和输出数据(包含选择的机器人名称)
- 这样用户可以立即在任务记录详情中看到选择的机器人
## 执行流程
### 修改前的流程
1. 选择机器人块开始执行
2. 调用天风系统接口选择机器人
3. 等待机器人选择完成
4. 更新块记录状态为成功(但不更新输出参数)
5. 执行嵌套的动作块
6. **等待所有子块执行完成**
7. **最后才更新输出参数到数据库**
8. 用户才能看到选择的机器人
### 修改后的流程
1. 选择机器人块开始执行
2. 调用天风系统接口选择机器人
3. 等待机器人选择完成
4. **立即更新块记录状态为成功,并更新输出参数到数据库**
5. **用户立即可以在任务记录详情中看到选择的机器人**
6. 执行嵌套的动作块
7. 所有子块执行完成后,再次更新输出参数(内容相同,不影响显示)
## 技术细节
### 输出参数格式
```json
{
"blocks": {
"块名称": {
"selectedAgvId": "机器人名称"
}
}
}
```
### 数据库字段
- `output_params`: 完整的输出结构(包含 blocks 层级)
- `block_out_params_value`: 输出参数值(只包含选择的机器人信息)
## 注意事项
1. **输出参数会被更新两次**:第一次在选择机器人完成时立即更新,第二次在所有子块执行完成后再次更新。两次更新的内容相同,不会造成问题。
2. **不影响子块执行**:立即更新输出参数不会影响子块的执行逻辑,子块仍然可以正常访问 context 中的机器人信息。
3. **兼容性**这个修改不会影响其他类型的块只针对选择机器人块CSelectAgvBp
## 测试建议
1. 启动一个包含选择机器人块和嵌套动作块的任务
2. 在选择机器人完成后,立即查询任务记录详情
3. 验证能否看到选择的机器人信息
4. 等待动作块执行完成后,再次查询任务记录详情
5. 验证机器人信息保持一致
## 相关文件
- `services/execution/handlers/robot_scheduling.py` - 选择机器人块处理器
- `services/execution/block_executor.py` - 块执行器
- `data/models/blockrecord.py` - 块记录数据模型
## 修改日期
2025-10-17

File diff suppressed because it is too large Load Diff

6247
logs/app.log.2025-10-16 Normal file

File diff suppressed because it is too large Load Diff

View File

@ -9,9 +9,10 @@
import json
import asyncio
import aiohttp
from typing import Dict, Any
from typing import Dict, Any, Set, Optional
from fastapi import APIRouter, Body, Request, Path
from routes.model.external_task_model import ExternalTaskRequest, ExternalTaskResponse, TaskTypeEnum, GenAgvSchedulingTaskRequest, CancelTaskRequest
from routes.model.external_task_model import ExternalTaskRequest, ExternalTaskResponse, TaskTypeEnum, \
GenAgvSchedulingTaskRequest, CancelTaskRequest
from routes.model.task_edit_model import TaskEditRunRequest, TaskInputParamNew, InputParamType
from services.task_edit_service import TaskEditService
from services.external_task_record_service import ExternalTaskRecordService
@ -19,10 +20,10 @@ from services.task_record_service import TaskRecordService
from services.sync_service import set_task_terminated, get_login_token, refresh_token_if_needed
from routes.common_api import format_response, error_response
from utils.logger import get_logger
from utils.background_task_manager import create_background_task
from data.enum.task_record_enum import SourceType, TaskStatus
from data.models.external_task_record import ExternalTaskStatusEnum
from config.tf_api_config import TF_API_TOKEN, TF_API_BASE_URL, CM_ID, DG_ID, TASK_TYPE_PRIORITY, TASK_TYPE_AREA, TF_WEB_POST, sync_disabled_label
from config.tf_api_config import TF_API_TOKEN, TF_API_BASE_URL, CM_ID, DG_ID, TASK_TYPE_PRIORITY, TASK_TYPE_AREA, \
TF_WEB_POST, sync_disabled_label
# 创建路由
router = APIRouter(
@ -33,6 +34,121 @@ router = APIRouter(
# 设置日志
logger = get_logger("app.external_task_api")
# Set of background monitor tasks, tracked so they can be inspected and cleaned up.
_background_monitor_tasks: Set[asyncio.Task] = set()
# Shared global HTTP session so all callers reuse one connection pool.
_global_http_session: Optional[aiohttp.ClientSession] = None
_session_lock = asyncio.Lock()  # guards lazy creation/replacement of the session
async def get_http_session() -> aiohttp.ClientSession:
    """
    Return the shared global HTTP session, lazily creating it when missing.

    Reusing a single session means one shared connection pool, which keeps
    memory usage and the number of open TCP connections down.

    Returns:
        aiohttp.ClientSession: the shared session object
    """
    global _global_http_session
    async with _session_lock:
        must_create = _global_http_session is None or _global_http_session.closed
        if must_create:
            # Connection pool: bounded totals, DNS caching, keep-alive reuse.
            pooled_connector = aiohttp.TCPConnector(
                limit=100,
                limit_per_host=30,
                ttl_dns_cache=300,
                force_close=False,
                enable_cleanup_closed=True,
            )
            # Per-request timeout budget (overall / connect / socket read).
            request_timeout = aiohttp.ClientTimeout(
                total=60,
                connect=10,
                sock_read=30,
            )
            _global_http_session = aiohttp.ClientSession(
                connector=pooled_connector,
                timeout=request_timeout,
                raise_for_status=False,  # callers inspect status themselves
            )
            logger.info("创建全局HTTP会话启用连接池复用")
    return _global_http_session
async def close_http_session() -> None:
    """
    Close the shared global HTTP session (call this on application shutdown).

    Fix: the global reference is now cleared in a ``finally`` block, so even
    if ``close()`` raises, a later call to ``get_http_session`` will create a
    fresh session instead of reusing a half-closed one.
    """
    global _global_http_session
    if _global_http_session and not _global_http_session.closed:
        try:
            await _global_http_session.close()
            logger.info("已关闭全局HTTP会话")
        finally:
            # Always drop the reference so the session can be recreated.
            _global_http_session = None
        # Give aiohttp's underlying connections time to finish closing.
        await asyncio.sleep(0.25)
def _cleanup_background_task(task: asyncio.Task) -> None:
    """
    Done-callback for background monitor tasks.

    Removes the finished task from the tracking set and logs how it ended
    (cancelled, failed with an exception, or completed normally). Invoked
    automatically by asyncio when the task finishes.
    """
    _background_monitor_tasks.discard(task)
    # Log the final state of the task; guard against logging failures.
    try:
        if task.cancelled():
            logger.debug(f"后台监控任务已取消,已清理。当前活跃任务数: {len(_background_monitor_tasks)}")
            return
        if task.exception():
            logger.error(
                f"后台监控任务异常结束: {task.exception()},已清理。当前活跃任务数: {len(_background_monitor_tasks)}")
            return
        logger.debug(f"后台监控任务正常完成,已清理。当前活跃任务数: {len(_background_monitor_tasks)}")
    except Exception as e:
        logger.error(f"清理后台任务时出错: {str(e)}")
def get_active_monitor_tasks_count() -> int:
    """
    Return the number of currently active background monitor tasks.

    Intended for monitoring and debugging.

    Returns:
        int: count of tasks still present in the tracking set
    """
    return len(_background_monitor_tasks)
async def cancel_all_monitor_tasks() -> int:
    """
    Cancel every tracked monitor task (intended for application shutdown only).

    Returns:
        int: number of tasks that were tracked when cancellation started
    """
    count = len(_background_monitor_tasks)
    if count > 0:
        logger.info(f"正在取消 {count} 个活跃的监控任务...")
        # Snapshot the set first: done-callbacks mutate it while we iterate.
        pending = list(_background_monitor_tasks)
        for monitor in pending:
            if not monitor.done():
                monitor.cancel()
        if pending:
            # Wait for every task to acknowledge cancellation; swallow results.
            await asyncio.gather(*pending, return_exceptions=True)
        logger.info(f"已取消所有监控任务,共 {count} 个")
    return count
# 外部回调接口URL
EXTERNAL_CALLBACK_URL = "http://roh.vwfawedl.mobi:9001/AGVService/ContainerSendBackRequest" # 生产线到毛坯库任务
AGV_GOODS_MOVE_URL = "http://roh.vwfawedl.mobi:9001/AGVService/HUGoodsMoveRequest" # 毛坯库到产线任务
@ -61,7 +177,7 @@ async def get_tf_api_token() -> str:
async def call_external_callback(arrival_no: str, arrival_user: str = "000307") -> bool:
"""
调用外部回调接口
调用外部回调接口使用全局Session复用连接池
Args:
arrival_no: 到货编号ReqCode
@ -80,7 +196,8 @@ async def call_external_callback(arrival_no: str, arrival_user: str = "000307")
while retry_count < max_retries:
try:
async with aiohttp.ClientSession() as session:
# 使用全局HTTP会话复用连接池
session = await get_http_session()
async with session.post(EXTERNAL_CALLBACK_URL, json=payload) as response:
result = await response.json()
logger.info(f"外部接口调用响应: {result}, arrival_no={arrival_no}, 重试次数={retry_count}")
@ -106,9 +223,10 @@ async def call_external_callback(arrival_no: str, arrival_user: str = "000307")
logger.error(f"外部接口调用失败,已达到最大重试次数: arrival_no={arrival_no}, 最大重试次数={max_retries}")
return False
async def call_agv_goods_move_callback(pid: str, user_id: str = "000307") -> bool:
"""
调用AGV货物移动回调接口
调用AGV货物移动回调接口使用全局Session复用连接池
Args:
pid: 对应的req_code
@ -127,7 +245,8 @@ async def call_agv_goods_move_callback(pid: str, user_id: str = "000307") -> boo
while retry_count < max_retries:
try:
async with aiohttp.ClientSession() as session:
# 使用全局HTTP会话复用连接池
session = await get_http_session()
async with session.post(AGV_GOODS_MOVE_URL, json=payload) as response:
result = await response.json()
logger.info(f"AGV货物移动接口调用响应: {result}, PID={pid}, 重试次数={retry_count}")
@ -153,6 +272,7 @@ async def call_agv_goods_move_callback(pid: str, user_id: str = "000307") -> boo
logger.error(f"AGV货物移动接口调用失败已达到最大重试次数: PID={pid}, 最大重试次数={max_retries}")
return False
async def monitor_task_and_callback(task_record_id: str, req_code: str):
"""
异步监控任务状态并在成功时调用外部回调接口
@ -163,9 +283,7 @@ async def monitor_task_and_callback(task_record_id: str, req_code: str):
"""
logger.info(f"开始监控任务状态: task_record_id={task_record_id}, req_code={req_code}")
# max_wait_time = 1800 # 最大等待时间30分钟
# wait_count = 0
try:
while True:
try:
task_detail_result = await TaskRecordService.get_task_record_detail(task_record_id)
@ -178,7 +296,8 @@ async def monitor_task_and_callback(task_record_id: str, req_code: str):
# 如果任务已完成(成功)
if task_status == TaskStatus.COMPLETED:
logger.info(f"任务执行成功,开始调用外部回调接口: task_record_id={task_record_id}, req_code={req_code}")
logger.info(
f"任务执行成功,开始调用外部回调接口: task_record_id={task_record_id}, req_code={req_code}")
# 调用外部回调接口
success = await call_external_callback(req_code)
if success:
@ -189,24 +308,29 @@ async def monitor_task_and_callback(task_record_id: str, req_code: str):
# 如果任务已失败或取消
elif task_status in [TaskStatus.FAILED, TaskStatus.CANCELED]:
logger.info(f"任务执行失败或取消,不调用外部回调接口: task_record_id={task_record_id}, status={task_status}")
logger.info(
f"任务执行失败或取消,不调用外部回调接口: task_record_id={task_record_id}, status={task_status}")
break
# 任务还在运行中,继续等待
else:
logger.debug(f"任务仍在执行中,继续等待: task_record_id={task_record_id}, status={task_status}")
await asyncio.sleep(2) # 等待10秒
# wait_count += 10
await asyncio.sleep(2)
else:
logger.warning(f"无法获取任务详情,继续等待: task_record_id={task_record_id}")
await asyncio.sleep(2) # 等待10秒
# wait_count += 10
await asyncio.sleep(2)
except asyncio.CancelledError:
logger.info(f"监控任务被取消: task_record_id={task_record_id}, req_code={req_code}")
raise # 重新抛出以便正确处理取消
except Exception as e:
logger.error(f"监控任务状态时出现异常: {str(e)}, task_record_id={task_record_id}")
await asyncio.sleep(2) # 等待10秒
# wait_count += 10
await asyncio.sleep(2)
finally:
# 确保释放所有资源
logger.info(f"监控任务结束,清理资源: task_record_id={task_record_id}, req_code={req_code}")
async def monitor_agv_task_and_callback(task_record_id: str, req_code: str):
"""
@ -218,6 +342,7 @@ async def monitor_agv_task_and_callback(task_record_id: str, req_code: str):
"""
logger.info(f"开始监控AGV调度任务状态: task_record_id={task_record_id}, req_code={req_code}")
try:
while True:
try:
task_detail_result = await TaskRecordService.get_task_record_detail(task_record_id)
@ -230,36 +355,48 @@ async def monitor_agv_task_and_callback(task_record_id: str, req_code: str):
# 如果任务已完成(成功)
if task_status == TaskStatus.COMPLETED:
logger.info(f"AGV调度任务执行成功开始调用AGV货物移动回调接口: task_record_id={task_record_id}, req_code={req_code}")
logger.info(
f"AGV调度任务执行成功开始调用AGV货物移动回调接口: task_record_id={task_record_id}, req_code={req_code}")
# 调用AGV货物移动回调接口
success = await call_agv_goods_move_callback(req_code)
if success:
logger.info(f"AGV货物移动回调接口调用成功: task_record_id={task_record_id}, req_code={req_code}")
logger.info(
f"AGV货物移动回调接口调用成功: task_record_id={task_record_id}, req_code={req_code}")
else:
logger.error(f"AGV货物移动回调接口调用失败: task_record_id={task_record_id}, req_code={req_code}")
logger.error(
f"AGV货物移动回调接口调用失败: task_record_id={task_record_id}, req_code={req_code}")
break
# 如果任务已失败或取消
elif task_status in [TaskStatus.FAILED, TaskStatus.CANCELED]:
logger.info(f"AGV调度任务执行失败或取消不调用AGV货物移动回调接口: task_record_id={task_record_id}, status={task_status}")
logger.info(
f"AGV调度任务执行失败或取消不调用AGV货物移动回调接口: task_record_id={task_record_id}, status={task_status}")
break
# 任务还在运行中,继续等待
else:
logger.debug(f"AGV调度任务仍在执行中继续等待: task_record_id={task_record_id}, status={task_status}")
await asyncio.sleep(2) # 等待2秒
logger.debug(
f"AGV调度任务仍在执行中继续等待: task_record_id={task_record_id}, status={task_status}")
await asyncio.sleep(2)
else:
logger.warning(f"无法获取AGV调度任务详情继续等待: task_record_id={task_record_id}")
await asyncio.sleep(2) # 等待2秒
await asyncio.sleep(2)
except asyncio.CancelledError:
logger.info(f"AGV监控任务被取消: task_record_id={task_record_id}, req_code={req_code}")
raise # 重新抛出以便正确处理取消
except Exception as e:
logger.error(f"监控AGV调度任务状态时出现异常: {str(e)}, task_record_id={task_record_id}")
await asyncio.sleep(2) # 等待2秒
await asyncio.sleep(2)
finally:
# 确保释放所有资源
logger.info(f"AGV监控任务结束清理资源: task_record_id={task_record_id}, req_code={req_code}")
async def check_task_permission(tf_api_token: str, tf_api_base_url: str, module_name: str = "其他") -> bool:
"""
检查是否允许处理任务
检查是否允许处理任务使用全局Session复用连接池
调用参数配置-三方接口调用接口检查系统限制
Args:
@ -279,7 +416,8 @@ async def check_task_permission(tf_api_token: str, tf_api_base_url: str, module_
api_url = f"{tf_api_base_url}/parameter/getByModule"
try:
async with aiohttp.ClientSession() as session:
# 使用全局HTTP会话复用连接池
session = await get_http_session()
async with session.get(api_url, data=module_name, headers=headers) as response:
if response.status == 200:
result = await response.json()
@ -312,9 +450,10 @@ async def check_task_permission(tf_api_token: str, tf_api_base_url: str, module_
# 如果出现异常,默认允许处理任务
return False
async def get_amr_loading_state(task_record_id: str, tf_api_token: str) -> Dict[str, Any]:
"""
获取任务中小车负载状态
获取任务中小车负载状态使用全局Session复用连接池
Args:
task_record_id: 天风任务ID
@ -332,7 +471,8 @@ async def get_amr_loading_state(task_record_id: str, tf_api_token: str) -> Dict[
api_url = f"{TF_API_BASE_URL}/task/vwedtask/{task_record_id}/getAmrState"
try:
async with aiohttp.ClientSession() as session:
# 使用全局HTTP会话复用连接池
session = await get_http_session()
async with session.get(api_url, headers=headers) as response:
if response.status == 200:
result = await response.json()
@ -355,6 +495,7 @@ async def get_amr_loading_state(task_record_id: str, tf_api_token: str) -> Dict[
"code": 500
}
# # 任务类型到任务优先级
TASK_TYPE_TEMPLATE_MAPPING = {
TaskTypeEnum.GG2MP: "GG",
@ -393,6 +534,8 @@ TASK_TYPE_REMARK = {
TaskTypeEnum.MP2LG: "毛坯库:{0}-连杆:{1}",
TaskTypeEnum.MP2PHZ: "毛坯库:{0}-平衡轴:{1}",
}
@router.post("/newTask")
async def create_new_task(request: Request, task_request: ExternalTaskRequest = Body(...)):
"""
@ -539,7 +682,7 @@ async def create_new_task(request: Request, task_request: ExternalTaskRequest =
source_device=request.client.host if request.client else "unknown", # 使用客户端IP作为设备标识
use_modbus=False,
modbus_timeout=5000,
priority = priority
priority=priority
)
# 更新外部任务记录状态为运行中
@ -636,6 +779,7 @@ async def create_new_task(request: Request, task_request: ExternalTaskRequest =
rowCount=0
)
@router.post("/GenAgvSchedulingTask")
async def gen_agv_scheduling_task(request: Request, task_request: GenAgvSchedulingTaskRequest = Body(...)):
"""
@ -657,7 +801,6 @@ async def gen_agv_scheduling_task(request: Request, task_request: GenAgvScheduli
priority = TASK_TYPE_PRIORITY.get(TASK_TYPE_TEMPLATE_MAPPING.get(task_request.TaskTyp, "OR"))
remark = TASK_TYPE_REMARK.get(task_request.TaskTyp)
external_record = None
try:
logger.info(f"收到AGV调度任务请求:{task_request}")
@ -698,16 +841,15 @@ async def gen_agv_scheduling_task(request: Request, task_request: GenAgvScheduli
}
client_info_str = json.dumps(client_info, ensure_ascii=False)
# 根据任务类型获取对应的模板ID
template_id = DG_ID
# 创建外部任务记录
external_record = await ExternalTaskRecordService.create_agv_scheduling_task_record(
req_code=task_request.ReqCode,
task_code=task_request.TaskCode,
business_task_type=task_request.TaskTyp.value if hasattr(task_request.TaskTyp, 'value') else str(task_request.TaskTyp),
business_task_type=task_request.TaskTyp.value if hasattr(task_request.TaskTyp, 'value') else str(
task_request.TaskTyp),
security_key=task_request.SecurityKey or "",
type_field=task_request.Type,
sub_type=task_request.SubType,
@ -816,7 +958,7 @@ async def gen_agv_scheduling_task(request: Request, task_request: GenAgvScheduli
source_device=request.client.host if request.client else "unknown", # 使用客户端IP作为设备标识
use_modbus=False,
modbus_timeout=5000,
priority = priority
priority=priority
)
# 更新外部任务记录状态为运行中
@ -889,17 +1031,20 @@ async def gen_agv_scheduling_task(request: Request, task_request: GenAgvScheduli
# 定义需要监控的任务类型
agv_callback_task_types = ["MP2GG", "MP2GT", "MP2ZG", "MP2QZ", "MP2LG", "MP2PHZ"]
# 启动异步任务监控,不阻塞当前接口 - 使用后台任务管理器
# 启动异步任务监控,不阻塞当前接口
if task_record_id and task_request.TaskTyp in agv_callback_task_types and TF_WEB_POST:
create_background_task(
monitor_agv_task_and_callback(
# 创建后台监控任务并添加到管理集合
monitor_task = asyncio.create_task(monitor_agv_task_and_callback(
task_record_id=task_record_id,
req_code=task_request.TaskCode
),
name=f"monitor_agv_task_{task_record_id}",
context=f"TaskType={task_request.TaskTyp}, ReqCode={task_request.TaskCode}"
)
logger.info(f"已启动AGV调度任务监控: TaskType={task_request.TaskTyp}, TaskRecordId={task_record_id}")
))
# 将任务添加到集合中进行追踪
_background_monitor_tasks.add(monitor_task)
# 添加完成回调,自动清理
monitor_task.add_done_callback(_cleanup_background_task)
logger.info(
f"已启动AGV调度任务监控: TaskType={task_request.TaskTyp}, TaskRecordId={task_record_id}, 当前活跃监控任务数: {len(_background_monitor_tasks)}")
return ExternalTaskResponse(
code=0,
@ -931,6 +1076,7 @@ async def gen_agv_scheduling_task(request: Request, task_request: GenAgvScheduli
rowCount=0
)
@router.post("/cancelTask")
async def cancel_task(request: Request, cancel_request: CancelTaskRequest = Body(...)):
"""
@ -1014,7 +1160,8 @@ async def cancel_task(request: Request, cancel_request: CancelTaskRequest = Body
amr_loading = amr_state_data.get("amr_loading", False)
amr_name = amr_state_data.get("amr_name", "")
logger.info(f"小车负载状态: task_record_id={task_record_id}, amr_loading={amr_loading}, amr_name={amr_name}")
logger.info(
f"小车负载状态: task_record_id={task_record_id}, amr_loading={amr_loading}, amr_name={amr_name}")
# 如果小车处于负载状态,不允许取消任务
if amr_loading:
@ -1145,3 +1292,30 @@ async def get_external_task_record_by_req_code(
message=f"查询外部任务记录失败: {str(e)}",
code=500
)
@router.get("/api/monitor-tasks/status")
async def get_monitor_tasks_status():
    """
    Report the current background monitor task status (debugging/monitoring).

    Returns:
        A formatted API response containing the number of active monitor
        tasks, or an error response (code 500) if the lookup fails.
    """
    try:
        active_count = get_active_monitor_tasks_count()
        return format_response(
            data={
                "activeMonitorTasks": active_count,
                "message": f"当前有 {active_count} 个活跃的后台监控任务"
            },
            message="成功获取监控任务状态"
        )
    except Exception as e:
        logger.error(f"获取监控任务状态异常: {str(e)}")
        return error_response(
            message=f"获取监控任务状态失败: {str(e)}",
            code=500
        )

View File

@ -975,6 +975,22 @@ class EnhancedTaskScheduler:
source_client_info=source_client_info
)
def _cleanup_executor(self, executor: Optional['TaskExecutor'], context: str = "") -> None:
    """
    Release an executor's resources to prevent memory leaks.

    Args:
        executor: task executor to clean up; None is a no-op
        context: caller context string, included in log messages
    """
    if executor:
        try:
            # Delegate to the executor's own cleanup hook when it has one.
            if hasattr(executor, 'cleanup'):
                executor.cleanup()
                logger.debug(f"{context} executor已清理")
        except Exception as e:
            # Cleanup failures are logged but never propagated to the worker loop.
            logger.error(f"{context} 清理executor异常: {str(e)}")
async def _worker(self, worker_id: int) -> None:
"""
工作线程
@ -986,6 +1002,10 @@ class EnhancedTaskScheduler:
logger.info(f"工作线程 {worker_id} 启动")
while self.is_running:
executor = None # 在外层定义,确保异常处理时可访问
task_record_id = None
queue_index = -1
cancel_checker_task = None
try:
# 从队列获取任务
queue_index, item = await self.queue_manager.dequeue(worker_id, self.worker_manager.get_worker_count())
@ -1014,8 +1034,6 @@ class EnhancedTaskScheduler:
logger.info(f"工作线程 {worker_id} 获取到任务: {task_record_id}, 优先级: {priority}")
# 执行任务
try:
# 创建任务执行器
executor = TaskExecutor(task_record_id)
@ -1031,16 +1049,10 @@ class EnhancedTaskScheduler:
# 创建一个取消任务检查器,定期检查数据库中任务是否被标记为取消
cancel_checker_task = asyncio.create_task(self._check_task_cancel(task_record_id, executor))
try:
# 执行任务
result = await executor.execute()
# 取消检查器任务
cancel_checker_task.cancel()
try:
await cancel_checker_task
except asyncio.CancelledError:
pass
# 更新工作线程状态
self.worker_manager.update_worker_status(worker_id, {
"current_task": None,
@ -1049,62 +1061,12 @@ class EnhancedTaskScheduler:
"task_count": self.worker_manager.worker_status[worker_id].get("task_count", 0) + 1
})
# 显式清理executor对象释放内存
try:
if executor:
# 清理TaskContext
if hasattr(executor, 'task_context') and executor.task_context:
if hasattr(executor.task_context, 'cleanup'):
executor.task_context.cleanup()
executor.task_context = None
# 清理BlockExecutor
if hasattr(executor, 'block_executor') and executor.block_executor:
executor.block_executor.task_context = None
executor.block_executor = None
# 清空其他引用
executor.task_record = None
executor.task_def = None
logger.debug(f"任务 {task_record_id} 的executor已清理")
except Exception as cleanup_error:
logger.error(f"清理executor失败: {str(cleanup_error)}")
# 移除正在执行的任务
self.running_tasks.pop(task_record_id, None)
logger.info(f"工作线程 {worker_id} 完成任务: {task_record_id}, 结果: {result.get('success')}")
except Exception as e:
logger.error(f"工作线程 {worker_id} 执行任务异常: {str(e)}")
logger.error(traceback.format_exc())
# 显式清理executor对象释放内存
try:
if executor:
# 清理TaskContext
if hasattr(executor, 'task_context') and executor.task_context:
if hasattr(executor.task_context, 'cleanup'):
executor.task_context.cleanup()
executor.task_context = None
# 清理BlockExecutor
if hasattr(executor, 'block_executor') and executor.block_executor:
executor.block_executor.task_context = None
executor.block_executor = None
# 清空其他引用
executor.task_record = None
executor.task_def = None
logger.debug(f"任务 {task_record_id} 的executor已清理(异常分支)")
except Exception as cleanup_error:
logger.error(f"清理executor失败(异常分支): {str(cleanup_error)}")
# 移除正在执行的任务
self.running_tasks.pop(task_record_id, None)
# 检查是否需要重试
await self._handle_task_error(task_record_id, str(e))
@ -1117,7 +1079,26 @@ class EnhancedTaskScheduler:
"task_count": self.worker_manager.worker_status[worker_id].get("task_count", 0) + 1
})
finally:
# 统一清理逻辑,无论成功或失败都会执行
# 取消检查器任务
if cancel_checker_task:
cancel_checker_task.cancel()
try:
await cancel_checker_task
except asyncio.CancelledError:
pass
# 移除正在执行的任务
if task_record_id:
self.running_tasks.pop(task_record_id, None)
# 清理executor对象防止内存泄漏
self._cleanup_executor(executor, f"工作线程 {worker_id}")
executor = None
# 标记任务完成
if queue_index != -1:
self.queue_manager.task_done(queue_index)
# 更新工作线程心跳
@ -1129,11 +1110,16 @@ class EnhancedTaskScheduler:
except asyncio.CancelledError:
# 取消异常,退出循环
logger.info(f"工作线程 {worker_id} 被取消")
# 清理可能存在的executor
self._cleanup_executor(executor, f"工作线程 {worker_id} 取消时")
break
except Exception as e:
logger.error(f"工作线程 {worker_id} 异常: {str(e)}")
logger.error(traceback.format_exc())
# 清理可能存在的executor
self._cleanup_executor(executor, f"工作线程 {worker_id} 异常时")
# 更新工作线程状态
self.worker_manager.update_worker_status(worker_id, {
"error": str(e),

View File

@ -43,6 +43,7 @@ class BlockExecutor:
self.task_context = task_context
self.is_canceled = False
self.is_error = False
self._is_cleaned = False # 防止重复清理标志
def cancel(self) -> None:
"""
@ -167,13 +168,25 @@ class BlockExecutor:
)
return result
else:
# 检查是否是"自身成功但子块失败"的特殊情况
if result.get("self_success", False):
# 本块执行成功,更新状态为成功
success_msg = result.get("message", "执行成功")
await self._update_block_record(
block_record_id,
TaskBlockRecordStatus.SUCCESS,
success_msg,
result.get("output", {})
)
logger.info(f"{block_name} 自身执行成功,但子块失败")
else:
# 更新块状态为失败
error_msg = result.get("message", "执行失败")
await self._update_block_record(block_record_id, TaskBlockRecordStatus.FAILED, error_msg)
# 设置错误信息
self.task_context.set_error(error_msg, block_id)
# 设置错误信息(无论哪种情况,任务都要标记为失败)
self.task_context.set_error(result.get("message", "执行失败"), block_id)
return result
except Exception as e:
@ -448,6 +461,19 @@ class BlockExecutor:
}
else:
logger.error(f"子块 {child_id} 执行失败: {result.get('message', '未知错误')}")
# 检查是否是"自身成功但子块失败"的特殊情况
if result.get("self_success", False):
# 本块执行成功,更新状态为成功
success_msg = result.get("message", "执行成功")
await self._update_block_record(
block_record_id,
TaskBlockRecordStatus.SUCCESS,
success_msg,
result.get("output", {})
)
logger.info(f"子块 {child_name} 自身执行成功,但其子块失败")
else:
# 更新块记录状态为失败
await self._update_block_record(
block_record_id,
@ -455,6 +481,7 @@ class BlockExecutor:
result.get("message", "执行失败"),
)
# 无论哪种情况,都需要终止后续兄弟块的执行
# 为剩余未执行的兄弟块创建终止记录
await self._create_terminated_records_for_remaining_siblings(
children, i + 1, f"因前序块 {child_name} 执行失败而终止"
@ -1834,3 +1861,30 @@ class BlockExecutor:
return params
def cleanup(self) -> None:
    """
    Release resources held by this BlockExecutor.

    Drops the reference to ``task_context`` — which is owned and cleaned up
    by the TaskExecutor, not here — so circular references cannot keep the
    object graph alive and leak memory. Idempotent: repeat calls are no-ops.
    """
    if self._is_cleaned:
        # Already cleaned up earlier; nothing left to release.
        return
    try:
        # Only drop our reference; do NOT call task_context.cleanup() here.
        if self.task_context:
            self.task_context = None
        # Restore execution flags to their initial state.
        self.is_canceled = False
        self.is_error = False
        # Mark as cleaned so a second call exits immediately.
        self._is_cleaned = True
        logger.debug("BlockExecutor 已清理")
    except Exception as e:
        logger.error(f"清理 BlockExecutor 失败: {str(e)}")

View File

@ -341,14 +341,15 @@ class RobotBlockHandler(BlockHandler):
logger.info(f"成功更新任务记录 {task_record_id} 的agv_id字段: {final_agv_id}")
except Exception as e:
logger.error(f"更新任务记录 {task_record_id} 的agv_id字段时发生错误: {str(e)}")
async def update_block_record(self, block_record_id: str, agv_id: str = None) -> None:
async def update_block_record(self, block_record_id: str, agv_id: str = None, block_name: str = None, output_data: Dict[str, Any] = None) -> None:
"""
更新块记录的通用方法
更新块记录的通用方法立即更新输出参数
Args:
block_record_id: 块记录ID
status: 状态码
message: 消息
agv_id: 选择的机器人名称
block_name: 块名称
output_data: 输出数据
"""
try:
from sqlalchemy.ext.asyncio import AsyncSession
@ -356,18 +357,54 @@ class RobotBlockHandler(BlockHandler):
from data.models.blockrecord import VWEDBlockRecord
from sqlalchemy import select, update
from data.enum.task_block_record_enum import TaskBlockRecordStatus
from data.enum.task_input_param_enum import TaskInputParamVariables
from datetime import datetime
if not block_record_id:
logger.warning(f"未提供块记录ID无法更新块记录")
return
stmt = update(VWEDBlockRecord).where(VWEDBlockRecord.id == block_record_id).values(
status=TaskBlockRecordStatus.SUCCESS,
ended_reason="执行成功",
remark="执行成功"
)
async with get_async_session() as session:
session: AsyncSession = session
# 获取块记录以获取块名称
stmt_select = select(VWEDBlockRecord).where(VWEDBlockRecord.id == block_record_id)
result = await session.execute(stmt_select)
block_record = result.scalar_one_or_none()
if not block_record:
logger.warning(f"未找到块记录: {block_record_id}")
return
# 使用传入的块名称或从记录中获取
actual_block_name = block_name or block_record.block_name
# 构建输出参数
update_values = {
"status": TaskBlockRecordStatus.SUCCESS,
"ended_reason": "选择机器人成功",
"remark": "选择机器人成功"
}
# 如果提供了输出数据,立即更新输出参数
if output_data:
# 构建完整输出结构 {"blocks": {"块名称": 输出内容}}
full_output = {TaskInputParamVariables.BLOCKS: {actual_block_name: output_data}}
output_full_json = json.dumps(full_output, ensure_ascii=False)
output_value_json = json.dumps(output_data, ensure_ascii=False)
update_values["output_params"] = output_full_json
update_values["block_out_params_value"] = output_value_json
logger.info(f"立即更新块 {actual_block_name} 的输出参数: {output_data}")
# 更新块记录
stmt = update(VWEDBlockRecord).where(VWEDBlockRecord.id == block_record_id).values(**update_values)
await session.execute(stmt)
await session.commit()
logger.info(f"成功更新块记录 {block_record_id} 的状态和输出参数")
except Exception as e:
logger.error(f"更新块记录 {block_record_id} 时发生错误: {str(e)}")
@ -840,20 +877,29 @@ class SelectAgvBlockHandler(RobotBlockHandler):
}
await self._record_task_log(block, result, context)
return result
# 获取当前块ID和名称提前获取以便更新块记录
current_block_id = block.get("id", "unknown")
current_block_name = block.get("name", f"b{current_block_id}")
# 构建输出数据
output_data = {
"selectedAgvId": amr_name,
}
results = {
"success": True,
"message": f"选择机器人块成功, 块id{current_block_name}",
"output": {
"selectedAgvId": amr_name,
}
"output": output_data
}
await self._record_task_log(block, results, context)
# 更新块记录状态为成功
await self.update_block_record(context.block_record_id, amr_name)
# 获取当前块ID和名称
current_block_id = block.get("id", "unknown")
current_block_name = block.get("name", f"b{current_block_id}")
# 立即更新块记录状态为成功,并更新输出参数
await self.update_block_record(
block_record_id=context.block_record_id,
agv_id=amr_name,
block_name=current_block_name,
output_data=output_data
)
# 更新任务记录中的agv_id字段
await self._update_task_record_agv_id(context.task_record_id, amr_name)
@ -926,7 +972,7 @@ class SelectAgvBlockHandler(RobotBlockHandler):
if "子块" not in result["message"]:
result["message"] = f"{result['message']},子块执行成功, 块id{current_block_name}"
else:
# 子块执行失败,根据失败的子块更新消息
# 子块执行失败,但选择机器人本身是成功的
logger.error(f"选择机器人块 {current_block_name} 的子块执行失败: {loop_result.get('message')}")
# 创建包含子块失败信息的结果
@ -935,6 +981,8 @@ class SelectAgvBlockHandler(RobotBlockHandler):
result = {
"success": False,
"self_success": True, # 标记:自身执行成功,但子块失败
"children_failed": True, # 标记:子块执行失败
"message": f"选择执行机器人成功 选择小车:{amr_name},但子块执行失败: {error_msg}失败块ID: {failed_block_id}",
"output": {
"selectedAgvId": amr_name,

View File

@ -20,6 +20,7 @@ from config.settings import settings
logger = get_logger("services.execution.handlers.storage_queue_manager")
class RequestPriority(Enum):
"""请求优先级"""
LOW = 1
@ -27,6 +28,7 @@ class RequestPriority(Enum):
HIGH = 3
URGENT = 4
class RequestStatus(Enum):
"""请求状态"""
PENDING = "pending"
@ -36,6 +38,7 @@ class RequestStatus(Enum):
TIMEOUT = "timeout"
CANCELLED = "cancelled"
@dataclass
class StorageRequest:
"""库位请求"""
@ -47,7 +50,9 @@ class StorageRequest:
task_record_id: str
priority: RequestPriority = RequestPriority.NORMAL
created_at: float = field(default_factory=time.time)
timeout: float = field(default_factory=lambda: settings.STORAGE_QUEUE_DEFAULT_TIMEOUT if settings.STORAGE_QUEUE_ENABLE_TIMEOUT else float('inf'))
timeout: float = field(
default_factory=lambda: settings.STORAGE_QUEUE_DEFAULT_TIMEOUT if settings.STORAGE_QUEUE_ENABLE_TIMEOUT else float(
'inf'))
retry_count: int = 0
max_retries: int = 3
status: RequestStatus = RequestStatus.PENDING
@ -60,6 +65,7 @@ class StorageRequest:
return self.priority.value > other.priority.value # 高优先级优先
return self.created_at < other.created_at # 时间早的优先
class StorageQueueManager:
"""库位请求队列管理器"""
@ -204,14 +210,23 @@ class StorageQueueManager:
check_timeout = timeout or settings.STORAGE_QUEUE_DEFAULT_TIMEOUT
logger.debug(f"等待请求结果(超时 {check_timeout}s: {request_id}")
try:
while True:
# 检查是否完成
if request_id in self.completed_requests:
request = self.completed_requests[request_id]
if request.status == RequestStatus.COMPLETED:
return request.result
result = request.result
# 立即清理已完成的请求,避免内存积累
del self.completed_requests[request_id]
logger.debug(f"请求结果已取走并清理: {request_id}")
return result
else:
raise Exception(f"请求失败: {request.error_message}")
error_msg = request.error_message
# 失败的请求也立即清理
del self.completed_requests[request_id]
logger.debug(f"请求失败结果已取走并清理: {request_id}")
raise Exception(f"请求失败: {error_msg}")
# 只有在启用超时时才检查超时
if self.enable_timeout:
@ -228,6 +243,12 @@ class StorageQueueManager:
raise Exception("请求超时")
await asyncio.sleep(0.1) # 避免忙等待
except Exception:
# 发生异常时,也要尝试清理可能存在的已完成请求
if request_id in self.completed_requests:
del self.completed_requests[request_id]
logger.debug(f"异常时清理请求: {request_id}")
raise
async def cancel_request(self, request_id: str) -> bool:
"""取消请求"""
@ -336,11 +357,14 @@ class StorageQueueManager:
await self._mark_request_failed(request.request_id, error_msg)
finally:
# 更新平均处理时间
# 更新平均处理时间(只在这里统计,移除 _mark_request_completed 中的重复计数)
processing_time = time.time() - start_time
if self.stats['requests_completed'] > 0:
total_time = self.stats['avg_processing_time'] * self.stats['requests_completed']
self.stats['requests_completed'] += 1
self.stats['avg_processing_time'] = (total_time + processing_time) / self.stats['requests_completed']
self.stats['avg_processing_time'] = (total_time + processing_time) / (
self.stats['requests_completed'] + 1)
else:
self.stats['avg_processing_time'] = processing_time
async def _mark_request_completed(self, request_id: str, result: Dict[str, Any]):
"""标记请求完成"""
@ -352,6 +376,7 @@ class StorageQueueManager:
del self.processing_requests[request_id]
self.completed_requests[request_id] = request
# 更新完成计数(移除了重复计数)
self.stats['requests_completed'] += 1
async def _mark_request_failed(self, request_id: str, error_message: str):
@ -406,7 +431,10 @@ class StorageQueueManager:
return -1
async def _cleanup_completed_requests(self):
"""清理已完成的请求"""
"""
清理已完成的请求兜底机制
注意采用立即清理模式后此方法主要作为兜底保护防止异常情况下的内存泄漏
"""
while not self.shutdown_event.is_set():
try:
await asyncio.sleep(self.cleanup_interval) # 使用配置的清理间隔
@ -423,10 +451,12 @@ class StorageQueueManager:
del self.completed_requests[request_id]
if to_remove:
logger.info(f"清理了 {len(to_remove)} 个已完成的请求")
logger.info(
f"兜底清理了 {len(to_remove)} 个未被取走的已完成请求(可能存在未调用 wait_for_result 的情况)")
except Exception as e:
logger.error(f"清理已完成请求异常: {str(e)}")
# 全局队列管理器实例
storage_queue_manager = StorageQueueManager()

View File

@ -61,6 +61,7 @@ class TaskContext:
self.map_id = map_id # 地图ID
self.parent_log_id = None # 当前父日志ID(用于建立层级关系)
self.current_iteration_index = None # 当前迭代索引
self._is_cleaned = False # 标记是否已清理,防止重复清理
def set_current_block(self, block_id: str, block_name: str):
"""
@ -396,28 +397,36 @@ class TaskContext:
def cleanup(self) -> None:
"""
清理上下文数据释放内存
用于任务执行完成后释放大型数据结构防止内存泄漏
修复内存泄漏: 任务完成后及时清理大型数据结构
优化彻底置空所有引用而不是仅清空容器
"""
# 防止重复清理
if self._is_cleaned:
return
try:
# 清理大型字典
if self.variables:
self.variables.clear()
if self.variable_sources:
self.variable_sources.clear()
if self.block_outputs:
self.block_outputs.clear()
if self.outputs:
self.outputs.clear()
# 清理列表
if self.execution_path:
self.execution_path.clear()
# 清空引用
self.input_params = {}
# 彻底清空所有字典和列表(直接置 None而不是 clear()
self.variables = None
self.variable_sources = None
self.block_outputs = None
self.outputs = None
self.execution_path = None
self.input_params = None
self.error = None
logger.debug(f"任务上下文 {self.task_record_id} 已清理")
# 清理其他可能的大对象引用
self.token = None
self.map_id = None
self.block_record_id = None
self.skip_to_component_id = None
self.failure_reason = None
# 清理 ID 和名称引用
self.current_block_id = None
self.current_block_name = None
self._is_cleaned = True
logger.debug(f"任务上下文 {self.task_record_id} 已彻底清理")
except Exception as e:
logger.error(f"清理任务上下文失败: {str(e)}")

View File

@ -52,6 +52,7 @@ class TaskExecutor:
self.timeout = 3600*10 # 默认超时时间10小时
self.is_canceled = False
self.is_error = False
self._is_cleaned = False # 防止重复清理标志
def set_timeout(self, timeout_seconds: int) -> None:
"""
@ -215,7 +216,7 @@ class TaskExecutor:
task_detail = json.loads(task_detail_str)
root_block = task_detail.get("rootBlock", {})
release_sites = self.task_def.release_sites
print("root_block:::::::::::", root_block)
# print("root_block:::::::::::", root_block)
# 更新任务状态为执行中
async with get_async_session() as session:
await self._update_task_status(session, TaskStatus.RUNNING, "任务执行中", task_detail=task_detail_str)
@ -384,22 +385,9 @@ class TaskExecutor:
finally:
self.is_running = False
# 清理任务执行器,释放内存
# 统一调用 cleanup() 方法清理所有资源
try:
if self.task_context:
if hasattr(self.task_context, 'cleanup'):
self.task_context.cleanup()
self.task_context = None
if self.block_executor:
self.block_executor.task_context = None
self.block_executor = None
# 清空任务记录引用
self.task_record = None
self.task_def = None
logger.debug(f"任务执行器 {self.task_record_id} 已清理")
self.cleanup()
except Exception as cleanup_error:
logger.error(f"清理任务执行器失败: {str(cleanup_error)}")
@ -625,3 +613,50 @@ class TaskExecutor:
except Exception as e:
logger.error(f"外部API同步失败: {str(e)}")
# 外部API失败不应该影响主任务流程只记录日志
def cleanup(self) -> None:
    """
    Release all resources held by this TaskExecutor.

    Tears down child components in dependency order and drops every large
    object reference so the executor (and the task data it points at) can be
    garbage-collected — part of the memory-leak fix this change belongs to.
    Safe to call multiple times: a guard flag makes repeated calls no-ops.
    """
    # Guard against double cleanup (may be invoked from both the run-path
    # finally block and an external shutdown path).
    if self._is_cleaned:
        return
    try:
        # Clean block_executor first (dependency order:
        # block_executor -> task_context).
        if self.block_executor:
            try:
                if hasattr(self.block_executor, 'cleanup'):
                    self.block_executor.cleanup()
            except Exception as e:
                logger.error(f"清理 block_executor 失败: {str(e)}")
            finally:
                # Drop the reference even if its cleanup raised.
                self.block_executor = None
        # Then clean task_context.
        if self.task_context:
            try:
                if hasattr(self.task_context, 'cleanup'):
                    self.task_context.cleanup()
            except Exception as e:
                logger.error(f"清理 task_context 失败: {str(e)}")
            finally:
                # Drop the reference even if its cleanup raised.
                self.task_context = None
        # Clear remaining object references.
        self.task_record = None
        self.task_def = None
        self.error_message = None
        # Reset execution-state flags.
        self.is_running = False
        self.is_canceled = False
        self.is_error = False
        # Mark as cleaned so subsequent calls return immediately.
        self._is_cleaned = True
        logger.debug(f"TaskExecutor {self.task_record_id} 已彻底清理")
    except Exception as e:
        logger.error(f"清理 TaskExecutor 失败: {str(e)}")

View File

@ -727,10 +727,9 @@ async def set_task_failed(task_id: str, token: str = None) -> Optional[ApiRespon
# 构建请求头
headers = {}
headers[TFApiConfig.TOKEN_HEADER] = token
headers["x-tenant-id"] = "1000"
# headers["x-tenant-id"] = "1000"
try:
logger.info(f"正在设置任务状态为已失败: {task_id}")
timeout = aiohttp.ClientTimeout(total=TFApiConfig.TIMEOUT, connect=5)
async with aiohttp.ClientSession(timeout=timeout, trust_env=False) as session:
async with session.put(