Files
TexasPoker-AI/monitor.sh
2026-05-13 17:48:45 +08:00

63 lines
2.1 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# Supervisor settings, followed by the start-up banner.

# Script that is handed to `python` by the restart loop.
SCRIPT_NAME="mccfr_trainer.py"
# File that receives a copy of every supervisor status line.
LOG_FILE="trainer_monitor.log"

# --- Healthcheck settings ---
# Ping URL obtained after creating a check on healthchecks.io.
HEALTHCHECK_URL="https://hc-ping.com/e44e0daa-5384-4cf6-8566-039f74c4610c"

# --- Background tournament settings ---
# Seconds between tournament runs (3600 s = 1 hour).
TOURNAMENT_INTERVAL=3600

# Announce start-up on stdout and append the same line to the log file.
printf '%s\n' "=== 训练守护进程已启动 ===" | tee -a "$LOG_FILE"
# Heartbeat: a background loop that pings the healthcheck URL, sleeps 60
# seconds, and repeats for as long as the loop itself is alive.
{
  while :; do
    # --fail/--silent/--show-error are the long spellings of -fsS, and
    # --max-time 10 limits each attempt to 10 seconds. stdout and stderr are
    # both discarded, so a failed ping leaves no trace.
    curl --fail --silent --show-error --max-time 10 --retry 3 \
      "$HEALTHCHECK_URL" >/dev/null 2>&1
    sleep 60
  done
} &
# PID of the background loop, kept so that it can be killed later.
HEARTBEAT_PID=$!
# Tournament loop: runs in the background, sleeping TOURNAMENT_INTERVAL
# seconds and then playing one tournament, forever.
# The sleep comes first so the tournament does not compete with the trainer
# for resources during start-up; tournament output is appended to
# tournament_output.log instead of the terminal.
(
  # PID of whichever child (sleep or tournament) is currently running.
  active_pid=""

  # Killing only this subshell would leave the in-flight child running as an
  # orphan, so forward SIGTERM to it before exiting.
  trap 'if [[ -n "$active_pid" ]]; then kill "$active_pid" 2>/dev/null; fi; exit 0' TERM

  while true; do
    # Children are started in the background and waited on, because `wait`
    # returns as soon as a trapped signal arrives, whereas a foreground
    # command would delay the handler until it finished.
    sleep "$TOURNAMENT_INTERVAL" &
    active_pid=$!
    wait "$active_pid"

    python run_tournament.py --num_games 50000 --num_workers 2 >> tournament_output.log 2>&1 &
    active_pid=$!
    wait "$active_pid"
    active_pid=""
  done
) &
# PID of the background loop; sending it SIGTERM now also stops its child.
TOURNEY_PID=$!
# On SIGINT or SIGTERM: kill the background heartbeat and tournament loops,
# record the stop in the log, then exit.
# The handler is single-quoted on purpose: $(date) and the variables are then
# expanded when the signal is handled. A double-quoted handler is expanded
# once, when `trap` itself runs, and would log that earlier time instead.
# Note: bash runs a trap handler only after the foreground command it is
# waiting on has returned.
trap 'kill "$HEARTBEAT_PID" "$TOURNEY_PID"; echo "[$(date)] 监控已停止" | tee -a "$LOG_FILE"; exit' SIGINT SIGTERM
# Restart loop: run the trainer in the foreground and start it again after
# every non-zero exit; a zero exit ends the loop.

# Print one timestamped status line and append it to the log file.
log_event() {
  echo "[$(date)] $1" | tee -a "$LOG_FILE"
}

while :; do
  log_event "正在启动训练程序..."

  # Run the trainer and remember its exit status.
  python "$SCRIPT_NAME"
  EXIT_CODE=$?

  if (( EXIT_CODE != 0 )); then
    log_event "警告:训练程序崩溃 (退出码: $EXIT_CODE)。5秒后自动重启..."
    # Report the crash through the /fail endpoint; the background heartbeat
    # keeps pinging regardless.
    curl -fsS -m 10 "${HEALTHCHECK_URL}/fail" >/dev/null 2>&1
    sleep 5
    continue
  fi

  log_event "训练程序正常完成退出。"
  # Tell the healthcheck service that the job finished normally.
  # NOTE(review): confirm that the ping service accepts a /complete suffix;
  # curl's -f flag and the redirection hide any HTTP error here.
  curl -fsS -m 10 "${HEALTHCHECK_URL}/complete" >/dev/null 2>&1
  # Stop the background heartbeat and tournament loops before leaving.
  kill "$HEARTBEAT_PID" "$TOURNEY_PID"
  break
done