# Patch summary: adds three shell scripts that drive a distributed pypolygames
# minishogi training run through Slurm (sbatch / squeue).
#
#   launch_minishogi_clients.sh
#     Looks up the server node by filtering `squeue -u $me` output (drops lines
#     matching "pd", keeps lines matching "pg" and "ser", then strips everything
#     before "learnfair" and all spaces). Submits five background array jobs
#     (indices 0-279, at most 20 at a time) to partition "learnfair" and one
#     more to partition "uninterrupted"; each job runs `pypolygames train` with
#     --server_connect_hostname tcp://$host:10023.
#
#   launch_minishogi_server.sh
#     Runs `pypolygames train` once per batch size in 4096 2048 1024 512 256
#     128 64, listening on tcp://*:10023 and checkpointing to exps/minishogi.
#
#   meta_minishogi.sh
#     Submits the server script as job "pgserver", sleeps 200 s, resolves the
#     node name from squeue, submits four more server jobs (pgser2..pgser5)
#     pinned to that node with `-w $host` at 60 s intervals, then runs the
#     client launcher three times with 86400 s sleeps in between.
#
# NOTE(review): the host lookup assumes node names begin with "learnfair". If
#   no matching job is listed, $host is empty and clients are submitted with
#   tcp://:10023; if several lines match, $host holds several lines.
# NOTE(review): --num_game, --nnks and --bn are each passed more than once on
#   the pypolygames command lines (e.g. --num_game 120 ... --num_game 20);
#   which value wins depends on the pypolygames argument parser -- confirm.
# NOTE(review): the scripts have no shebang line and expand $me / $host
#   unquoted; confirm they are always started from a Bourne-compatible shell.
diff --git a/launch_minishogi_clients.sh b/launch_minishogi_clients.sh
new file mode 100644
index 00000000..b7d85290
--- /dev/null
+++ b/launch_minishogi_clients.sh
@@ -0,0 +1,11 @@
+# Launching clients.
+
+export me=`whoami`
+export host="`squeue -u $me | grep -iv pd | grep -i pg | grep ser | sed 's/.*learnfair/learnfair/g' | sed 's/ //g'`"
+echo "host=<${host}>"
+for k in `seq 5`
+do
+sbatch --array=0-279%20 --comment=notenough --partition=learnfair --time=72:00:00 --mem=150Go --job-name=polytrain --gres=gpu:8 --cpus-per-task=80 --wrap="python -u -m pypolygames train --max_time=259200 --saving_period=4 --num_game 120 --per_thread_batchsize 192 --device cuda:0 cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 --nnks 3 --epoch_len 256 --batchsize 396 --sync_period 256 --num_rollouts 600 --replay_capacity 20000 --replay_warmup 2000 --do_not_save_replay_buffer --ddp true --checkpoint_dir \"exps/yaclient_\$SLURM_JOB_ID\" --out_feature --game_name minishogi --model_name ResConvConvLogitPoolModel --turn_features --bn --nnks 3 --nnsize 8 --history 2 --nb_layers_per_net 6 --nb_nets 31 --bn --server_connect_hostname tcp://$host:10023 --num_game 20 " &
+done
+sbatch --array=0-279%20 --comment=notenough --partition=uninterrupted --time=72:00:00 --mem=150Go --job-name=polytrain --gres=gpu:8 --cpus-per-task=80 --wrap="python -u -m pypolygames train --max_time=259200 --saving_period=4 --num_game 120 --per_thread_batchsize 192 --device cuda:0 cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 --nnks 3 --epoch_len 256 --batchsize 396 --sync_period 256 --num_rollouts 600 --replay_capacity 20000 --replay_warmup 2000 --do_not_save_replay_buffer --ddp true --checkpoint_dir \"exps/yaclient_\$SLURM_JOB_ID\" --out_feature --game_name minishogi --model_name ResConvConvLogitPoolModel --turn_features --bn --nnks 3 --nnsize 8 --history 2 --nb_layers_per_net 6 --nb_nets 31 --bn --server_connect_hostname tcp://$host:10023 --num_game 20 " &
+
diff --git a/launch_minishogi_server.sh b/launch_minishogi_server.sh
new file mode 100644
index 00000000..1d9e1f3b
--- /dev/null
+++ b/launch_minishogi_server.sh
@@ -0,0 +1,5 @@
+for i in 4096 2048 1024 512 256 128 64
+do
+# LD_PRELOAD=/private/home/vegardmella/libjemalloc.so.1 \ # uncomment on H2
+python -u -m pypolygames train --max_time=259200 --saving_period=4 --num_game 40 --per_thread_batchsize 12 --device cuda:0 cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 --epoch_len 256 --batchsize $i --sync_period 32 --num_rollouts 600 --replay_capacity 100000 --replay_warmup 9000 --do_not_save_replay_buffer --ddp true --checkpoint_dir exps/minishogi --out_feature --game_name minishogi --model_name ResConvConvLogitPoolModel --turn_features --bn --nnks 3 --history 2 --nnsize 8 --nb_nets 31 --nb_layers_per_net 6 --nnks 3 --server_listen_endpoint tcp://*:10023 --num_game 0 --lr 1e-5
+done
diff --git a/meta_minishogi.sh b/meta_minishogi.sh
new file mode 100644
index 00000000..fcaf8b7a
--- /dev/null
+++ b/meta_minishogi.sh
@@ -0,0 +1,19 @@
+# Launch this script, wait 10 days, and you should get an excellent minishogi model.
+
+export me=`whoami`
+sbatch --partition=dev --time=72:00:00 --mem=150Go --job-name=pgserver --gres=gpu:8 --cpus-per-task=80 --wrap="./launch_minishogi_server.sh"
+sleep 200
+export host=`squeue -u $me| grep -i pgserver | sed 's/.*learnfair/learnfair/g' | sed 's/ //g'`
+echo "host=<${host}>"
+sbatch -w $host --partition=dev --time=72:00:00 --mem=150Go --job-name=pgser2 --gres=gpu:8 --cpus-per-task=80 --wrap="./launch_minishogi_server.sh"
+sleep 60
+sbatch -w $host --partition=dev --time=72:00:00 --mem=150Go --job-name=pgser3 --gres=gpu:8 --cpus-per-task=80 --wrap="./launch_minishogi_server.sh"
+sleep 60
+sbatch -w $host --partition=dev --time=72:00:00 --mem=150Go --job-name=pgser4 --gres=gpu:8 --cpus-per-task=80 --wrap="./launch_minishogi_server.sh"
+sleep 60
+sbatch -w $host --partition=dev --time=72:00:00 --mem=150Go --job-name=pgser5 --gres=gpu:8 --cpus-per-task=80 --wrap="./launch_minishogi_server.sh"
+./launch_minishogi_clients.sh
+sleep 86400
+./launch_minishogi_clients.sh
+sleep 86400
+./launch_minishogi_clients.sh