#!/usr/bin/env bash
# Launch m1 language-model training sweeps on an Arnold multi-node cluster.
#
# Required env (provided by the Arnold scheduler):
#   ARNOLD_ID, ARNOLD_WORKER_GPU, ARNOLD_WORKER_0_PORT,
#   ARNOLD_WORKER_0_HOST, ARNOLD_WORKER_NUM
# Required env (user-supplied):
#   WANDB_API_KEY - Weights & Biases credential.
set -euo pipefail

# Show the cluster topology in the job log for debugging.
echo "ARNOLD_ID=${ARNOLD_ID}"
echo "ARNOLD_WORKER_GPU=${ARNOLD_WORKER_GPU}"
echo "ARNOLD_WORKER_0_PORT=${ARNOLD_WORKER_0_PORT}"
echo "ARNOLD_WORKER_0_HOST=${ARNOLD_WORKER_0_HOST}"
echo "ARNOLD_WORKER_NUM=${ARNOLD_WORKER_NUM}"

# --- Data staging -----------------------------------------------------------
# Copy training chunks from the HDFS mounts into the local data layout the
# training configs expect. Under `set -e` a failed copy aborts the job instead
# of silently training on missing data.
mkdir -p data/m1_python data/m1_full
cp -- /mnt/hdfs/linzheng/data/opencoder_python/opencoder_python.chunk.1.jsonl \
  data/m1_python/m1.chunk.0.jsonl
cp -- /mnt/hdfs/linzheng/data/opencoder_python/opencoder_python.chunk.2.jsonl \
  data/m1_python/m1.chunk.1.jsonl
# Only the first 20M lines of the full corpus chunk are used.
head -n 20000000 /mnt/hdfs/user/linzheng/data/opencoder/chunk.1.jsonl \
  > data/m1_full/m1.chunk.0.jsonl

# SECURITY NOTE(review): the original script hardcoded a W&B API key here; that
# key is compromised and should be rotated. The credential must now be supplied
# via the environment.
: "${WANDB_API_KEY:?WANDB_API_KEY must be set in the environment}"
export WANDB_API_KEY

readonly PY_CFG=apps/main/configs/m1_200M_python.yaml
readonly FULL_CFG=apps/main/configs/m1_200M_full.yaml

#######################################
# Run one distributed training job across all Arnold workers.
# Globals:
#   ARNOLD_* (read) - cluster topology from the scheduler.
# Arguments:
#   $1 - experiment name; also used as dump dir and wandb run name.
#   $2 - YAML config path.
#   $@ - remaining args are config overrides forwarded to apps.main.train.
#######################################
run_experiment() {
  local exp_name=$1 config=$2
  shift 2
  torchrun --nnodes="$ARNOLD_WORKER_NUM" \
    --node_rank="$ARNOLD_ID" \
    --nproc_per_node="$ARNOLD_WORKER_GPU" \
    --master_addr="$ARNOLD_WORKER_0_HOST" \
    --master_port="$(( ARNOLD_WORKER_0_PORT + 1 ))" \
    -m apps.main.train config="$config" \
    dump_dir="$exp_name" \
    logging.wandb.name="$exp_name" name="$exp_name" \
    "$@"
}

# --- Active sweep: 85M params (dim=768, 12 layers, 12 heads), 100k steps ----
run_experiment checkpoints/m1_85M_lr1e-3_steps100k_bs8_seqlen2048_python "$PY_CFG" \
  model.dim=768 model.n_layers=12 model.n_heads=12 \
  optim.lr=1e-3 steps=100000 data.batch_size=8 data.seq_len=2048
run_experiment checkpoints/m1_85M_lr1e-3_steps100k_bs32_seqlen512_python "$PY_CFG" \
  model.dim=768 model.n_layers=12 model.n_heads=12 \
  optim.lr=1e-3 steps=100000 data.batch_size=32 data.seq_len=512
run_experiment checkpoints/m1_85M_lr1e-3_steps100k_bs8_seqlen2048_full "$FULL_CFG" \
  model.dim=768 model.n_layers=12 model.n_heads=12 \
  optim.lr=1e-3 steps=100000 data.batch_size=8 data.seq_len=2048

# --- Active sweep: 40M params (dim=512, 12 layers, 8 heads), 400k steps -----
run_experiment checkpoints/m1_40M_lr1e-3_steps400k_bs16_seqlen2048_python "$PY_CFG" \
  model.dim=512 model.n_layers=12 model.n_heads=8 \
  optim.lr=1e-3 steps=400000 data.batch_size=16 data.seq_len=2048
run_experiment checkpoints/m1_40M_lr1e-3_steps400k_bs64_seqlen512_python "$PY_CFG" \
  model.dim=512 model.n_layers=12 model.n_heads=8 \
  optim.lr=1e-3 steps=400000 data.batch_size=64 data.seq_len=512
run_experiment checkpoints/m1_40M_lr1e-3_steps400k_bs16_seqlen2048_full "$FULL_CFG" \
  model.dim=512 model.n_layers=12 model.n_heads=8 \
  optim.lr=1e-3 steps=400000 data.batch_size=16 data.seq_len=2048
run_experiment checkpoints/m1_40M_lr1e-3_steps400k_bs64_seqlen512_full "$FULL_CFG" \
  model.dim=512 model.n_layers=12 model.n_heads=8 \
  optim.lr=1e-3 steps=400000 data.batch_size=64 data.seq_len=512

# --- Disabled sweeps (kept for reference, expressed as helper calls) --------
# 20M params (dim=512, 6 layers, 8 heads), 400k steps:
# run_experiment checkpoints/m1_20M_lr1e-3_steps400k_bs8_seqlen2048_python "$PY_CFG" \
#   model.dim=512 model.n_layers=6 model.n_heads=8 optim.lr=1e-3 steps=400000 data.batch_size=8 data.seq_len=2048
# run_experiment checkpoints/m1_20M_lr1e-3_steps400k_bs32_seqlen512_python "$PY_CFG" \
#   model.dim=512 model.n_layers=6 model.n_heads=8 optim.lr=1e-3 steps=400000 data.batch_size=32 data.seq_len=512
# run_experiment checkpoints/m1_20M_lr1e-3_steps400k_bs8_seqlen2048_full "$FULL_CFG" \
#   model.dim=512 model.n_layers=6 model.n_heads=8 optim.lr=1e-3 steps=400000 data.batch_size=8 data.seq_len=2048
# run_experiment checkpoints/m1_20M_lr1e-3_steps400k_bs32_seqlen512_full "$FULL_CFG" \
#   model.dim=512 model.n_layers=6 model.n_heads=8 optim.lr=1e-3 steps=400000 data.batch_size=32 data.seq_len=512
#
# 85M params, remaining disabled variant:
# run_experiment checkpoints/m1_85M_lr1e-3_steps100k_bs32_seqlen512_full "$FULL_CFG" \
#   model.dim=768 model.n_layers=12 model.n_heads=12 optim.lr=1e-3 steps=100000 data.batch_size=32 data.seq_len=512
#
# 40M params, 200k-step variants:
# run_experiment checkpoints/m1_40M_lr1e-3_steps200k_bs8_seqlen2048_python "$PY_CFG" \
#   model.dim=512 model.n_layers=12 model.n_heads=8 optim.lr=1e-3 steps=200000 data.batch_size=8 data.seq_len=2048
# run_experiment checkpoints/m1_40M_lr1e-3_steps200k_bs32_seqlen512_python "$PY_CFG" \
#   model.dim=512 model.n_layers=12 model.n_heads=8 optim.lr=1e-3 steps=200000 data.batch_size=32 data.seq_len=512
# run_experiment checkpoints/m1_40M_lr1e-3_steps200k_bs8_seqlen2048_full "$FULL_CFG" \
#   model.dim=512 model.n_layers=12 model.n_heads=8 optim.lr=1e-3 steps=200000 data.batch_size=8 data.seq_len=2048
# run_experiment checkpoints/m1_40M_lr1e-3_steps200k_bs32_seqlen512_full "$FULL_CFG" \
#   model.dim=512 model.n_layers=12 model.n_heads=8 optim.lr=1e-3 steps=200000 data.batch_size=32 data.seq_len=512
#
# 200M params (model size taken from the config defaults), 100k steps:
# run_experiment checkpoints/m1_200M_lr1e-3_steps100k_bs8_seqlen2048_python "$PY_CFG" \
#   optim.lr=1e-3 steps=100000 data.batch_size=8 data.seq_len=2048
# run_experiment checkpoints/m1_200M_lr1e-3_steps100k_bs32_seqlen512_python "$PY_CFG" \
#   optim.lr=1e-3 steps=100000 data.batch_size=32 data.seq_len=512
# run_experiment checkpoints/m1_200M_lr1e-3_steps100k_bs8_seqlen2048_full "$FULL_CFG" \
#   optim.lr=1e-3 steps=100000 data.batch_size=8 data.seq_len=2048
# run_experiment checkpoints/m1_200M_lr1e-3_steps100k_bs32_seqlen512_full "$FULL_CFG" \
#   optim.lr=1e-3 steps=100000 data.batch_size=32 data.seq_len=512
#
# Earlier 6M-param runs used config=apps/main/configs/m1_6M.yaml with dump dirs
# under /mnt/bn/tiktok-mm-5/aiic/users/{xinyu,linzheng}/... (not the local
# checkpoints/ layout above), lr in {1e-2, 3e-2, 5e-2}, steps=50000, and
# (batch_size, seq_len) in {(8,1024), (32,2048), (512,128), (128,512)};
# see version-control history for the exact disabled command lines.