rubenifrah's picture
Chess Challenge submission by rubenifrah
481ffb7 verified
#!/bin/bash
#SBATCH -p mesonet
#SBATCH -N 1
#SBATCH -c 8 # Increased CPUs for data loading
#SBATCH --gres=gpu:1 # 1 GPU per job
#SBATCH --time=12:00:00 # Sufficient for 3 epochs
#SBATCH --mem=32G
#SBATCH --account=m25146
#SBATCH --job-name=chess_train
#SBATCH --output=logs/%x_%j.out
#SBATCH --error=logs/%x_%j.err
export PYTHONUNBUFFERED=1
export HF_DATASETS_OFFLINE=1
export TRANSFORMERS_OFFLINE=1
export CUDA_LAUNCH_BLOCKING=1
# Exit immediately if a command exits with a non-zero status
set -e
# Load modules (adjust based on your cluster, e.g., module load cuda/11.8)
# module load cuda/12.1 <-- Uncomment if needed
# Activate venv
if [ ! -d "venv" ]; then
echo "Creating venv..."
python3 -m venv venv
source venv/bin/activate
pip install --upgrade pip
pip install -e .
else
source venv/bin/activate
fi
# Define paths
MODEL_DIR="./output_chess_model"
mkdir -p logs
echo "============================================"
echo "Starting Chess Training Pipeline"
echo "Job ID: $SLURM_JOB_ID"
echo "Output Dir: $MODEL_DIR"
echo "============================================"
# 1. Training
echo "[1/2] Starting Training..."
python -m src.train \
--output_dir $MODEL_DIR \
--num_train_epochs 3 \
--per_device_train_batch_size 64 \
--save_steps 2000
# 2. Evaluation
echo "--------------------------------------------"
echo "[2/2] Starting Evaluation..."
# Legal moves check
echo "Running legal move evaluation..."
python -m src.evaluate \
--model_path "${MODEL_DIR}/final_model" \
--mode legal \
--n_positions 1000
# Win rate check
echo "Running win rate check vs Stockfish..."
python -m src.evaluate \
--model_path "${MODEL_DIR}/final_model" \
--mode winrate \
--n_games 50 \
--stockfish_level 1
echo "============================================"
echo "Pipeline Completed!"