63 lines
2.1 KiB
Bash
Executable File
63 lines
2.1 KiB
Bash
Executable File
#!/bin/bash
#
# Supervisor for the training script.
#
# Runs $SCRIPT_NAME in the foreground and restarts it 5 seconds after any
# non-zero exit. While the supervisor is alive, two background loops run:
#   * a heartbeat that pings $HEALTHCHECK_URL, sleeping 60 seconds between
#     pings, and
#   * a tournament loop that sleeps $TOURNAMENT_INTERVAL seconds and then runs
#     run_tournament.py.
# Both loops are stopped when the trainer exits with status 0 or when the
# supervisor receives SIGINT/SIGTERM.

# Deliberately no `set -e`: a non-zero exit from the trainer is an expected
# event that triggers a restart, not a reason to abort the supervisor.
set -u

# --- Basic configuration ---
readonly SCRIPT_NAME="mccfr_trainer.py"
readonly LOG_FILE="trainer_monitor.log"

# --- Healthcheck configuration ---
# Ping URL obtained after creating a check on healthchecks.io.
readonly HEALTHCHECK_URL="https://hc-ping.com/e44e0daa-5384-4cf6-8566-039f74c4610c"

# --- Background tournament configuration ---
# Seconds between tournament runs (3600 s = 1 hour).
readonly TOURNAMENT_INTERVAL=3600

# Prints a message to stdout and appends it to $LOG_FILE.
log() {
  printf '%s\n' "$*" | tee -a "$LOG_FILE"
}

# Sends a best-effort ping; every argument is passed through to curl.
# -f: fail on HTTP errors; -sS: silent but still report errors;
# -m 10: 10-second timeout. All output is discarded on purpose so that a
# monitoring outage never disturbs the training output.
ping_healthcheck() {
  curl -fsS -m 10 "$@" > /dev/null 2>&1
}

# Stops the heartbeat and tournament loops.
# NOTE(review): this signals only the two background subshells. A command a
# subshell is running at that moment (sleep, curl, or a tournament run) is not
# signalled here - confirm whether an in-flight tournament should be stopped
# as well.
stop_background_jobs() {
  kill "$HEARTBEAT_PID" "$TOURNEY_PID"
}

# SIGINT/SIGTERM handler: stop the background loops, log, and exit.
# Implemented as a function so that $(date) is evaluated when the signal
# arrives rather than when the trap is installed.
on_signal() {
  stop_background_jobs
  log "[$(date)] 监控已停止"
  exit
}

log "=== 训练守护进程已启动 ==="

# Heartbeat loop in the background: as long as this script is running, a ping
# is sent roughly once a minute.
(
  while true; do
    ping_healthcheck --retry 3 "$HEALTHCHECK_URL"
    sleep 60
  done
) &
HEARTBEAT_PID=$!

# Tournament loop in the background. It sleeps first and runs afterwards so it
# does not compete with trainer start-up for resources; its output goes to a
# separate log file so it does not interleave with the main training output.
(
  while true; do
    sleep "$TOURNAMENT_INTERVAL"
    python run_tournament.py --num_games 50000 --num_workers 2 >> tournament_output.log 2>&1
  done
) &
TOURNEY_PID=$!

# Stop the heartbeat and tournament loops when the script is interrupted.
trap on_signal SIGINT SIGTERM

while true; do
  log "[$(date)] 正在启动训练程序..."

  # Run the trainer in the foreground and capture its exit status.
  python "$SCRIPT_NAME"
  exit_code=$?

  if (( exit_code == 0 )); then
    log "[$(date)] 训练程序正常完成退出。"
    # Tell the healthcheck service that the job finished normally (optional;
    # per the original author, without it the check reports the job as down
    # once the heartbeat stops).
    # NOTE(review): confirm that the service accepts a "/complete" suffix.
    ping_healthcheck "${HEALTHCHECK_URL}/complete"
    stop_background_jobs
    break
  fi

  log "[$(date)] 警告:训练程序崩溃 (退出码: $exit_code)。5秒后自动重启..."
  # Report the crash via /fail; the heartbeat loop itself keeps running.
  ping_healthcheck "${HEALTHCHECK_URL}/fail"
  sleep 5
done