From 1f80958a8e3e1610b3d25e5e32c66d97c51867ae Mon Sep 17 00:00:00 2001 From: Teytaud Date: Sat, 13 Jun 2020 12:04:19 +0200 Subject: [PATCH 1/8] Launcher for minishogi --- launch_minishogi_clients.sh | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 launch_minishogi_clients.sh diff --git a/launch_minishogi_clients.sh b/launch_minishogi_clients.sh new file mode 100644 index 00000000..0f272118 --- /dev/null +++ b/launch_minishogi_clients.sh @@ -0,0 +1,11 @@ +# Launching clients. + +me=`whoami` +export host="`squeue -u $me | grep -iv pd | grep -i pg | grep ser | sed 's/.*learnfair/learnfair/g' | sed 's/ //g'`" +echo "host=<${host}>" +for k in `seq 5` +do +sbatch --array=0-279%20 --comment=notenough --partition=learnfair --time=72:00:00 --mem=150Go --job-name=polytrain --gres=gpu:8 --cpus-per-task=80 --wrap="python -u -m pypolygames train --max_time=259200 --saving_period=4 --num_game 120 --per_thread_batchsize 192 --device cuda:0 cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 --nnks 3 --epoch_len 256 --batchsize 396 --sync_period 256 --num_rollouts 600 --replay_capacity 20000 --replay_warmup 2000 --do_not_save_replay_buffer --ddp true --checkpoint_dir \"exps/yaclient_\$SLURM_JOB_ID\" --out_feature --game_name minishogi --model_name ResConvConvLogitPoolModel --turn_features --bn --nnks 3 --nnsize 5 --history 2 --nb_layers_per_net 6 --nb_nets 31 --bn --server_connect_hostname tcp://$host:10023 --num_game 20 " & +done +sbatch --array=0-279%20 --comment=notenough --partition=uninterrupted --time=72:00:00 --mem=150Go --job-name=polytrain --gres=gpu:8 --cpus-per-task=80 --wrap="python -u -m pypolygames train --max_time=259200 --saving_period=4 --num_game 120 --per_thread_batchsize 192 --device cuda:0 cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 --nnks 3 --epoch_len 256 --batchsize 396 --sync_period 256 --num_rollouts 600 --replay_capacity 20000 --replay_warmup 2000 --do_not_save_replay_buffer --ddp true --checkpoint_dir \"exps/yaclient_\$SLURM_JOB_ID\" --out_feature --game_name minishogi --model_name ResConvConvLogitPoolModel --turn_features --bn --nnks 3 --nnsize 5 --history 2 --nb_layers_per_net 6 --nb_nets 31 --bn --server_connect_hostname tcp://$host:10023 --num_game 20 " & + From 0e22cf2f8248befa41bce0545f1c418badbd41f8 Mon Sep 17 00:00:00 2001 From: Teytaud Date: Sat, 13 Jun 2020 12:09:15 +0200 Subject: [PATCH 2/8] add the server Servers are useful for doing distributed training. --- launch_minishogi_server.sh | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 launch_minishogi_server.sh diff --git a/launch_minishogi_server.sh b/launch_minishogi_server.sh new file mode 100644 index 00000000..671e0520 --- /dev/null +++ b/launch_minishogi_server.sh @@ -0,0 +1,5 @@ +for i in 4096 2048 1024 512 256 128 64 +do | +LD_PRELOAD=/private/home/vegardmella/libjemalloc.so.1 \ +python -u -m pypolygames train --max_time=259200 --saving_period=4 --num_game 40 --per_thread_batchsize 12 --device cuda:0 cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 --epoch_len 256 --batchsize $i --sync_period 32 --num_rollouts 600 --replay_capacity 100000 --replay_warmup 9000 --do_not_save_replay_buffer --ddp true --checkpoint_dir exps/minishogi --out_feature --game_name minishogi --model_name ResConvConvLogitPoolModel --turn_features --bn --nnks 3 --history 2 --nnsize 5 --nb_nets 31 --nb_layers_per_net 6 --nnks 3 --server_listen_endpoint tcp://*:10023 --num_game 0 --lr 1e-5 +done From d2bebecb2d085d4236840a52b2dff6e614f46621 Mon Sep 17 00:00:00 2001 From: Teytaud Date: Sat, 13 Jun 2020 12:10:15 +0200 Subject: [PATCH 3/8] Create meta_minishogi.sh --- meta_minishogi.sh | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 meta_minishogi.sh diff --git a/meta_minishogi.sh b/meta_minishogi.sh new file mode 100644 index 00000000..c091b85f --- /dev/null +++ b/meta_minishogi.sh @@ -0,0 +1,19 @@ +# Launch this script, wait 10 days, and you should get an excellent minishogi model. + +me=`whoami` | +sbatch --partition=dev --time=72:00:00 --mem=150Go --job-name=pgserver --gres=gpu:8 --cpus-per-task=80 --wrap="./launch_minishogi_server.sh" | +sleep 200 +export host=`squeue -u $me| grep -i pgserver | sed 's/.*learnfair/learnfair/g' | sed 's/ //g'` +echo "host=<${host}>" +sbatch -w $host --partition=dev --time=72:00:00 --mem=150Go --job-name=pgser2 --gres=gpu:8 --cpus-per-task=80 --wrap="./launch_minishogi_server.sh" +sleep 60 +sbatch -w $host --partition=dev --time=72:00:00 --mem=150Go --job-name=pgser3 --gres=gpu:8 --cpus-per-task=80 --wrap="./launch_minishogi_server.sh" +sleep 60 +sbatch -w $host --partition=dev --time=72:00:00 --mem=150Go --job-name=pgser4 --gres=gpu:8 --cpus-per-task=80 --wrap="./launch_minishogi_server.sh" +sleep 60 +sbatch -w $host --partition=dev --time=72:00:00 --mem=150Go --job-name=pgser5 --gres=gpu:8 --cpus-per-task=80 --wrap="./launch_minishogi_server.sh" +./launch_minishogi_clients.sh +sleep 86400 +./launch_minishogi_clients.sh +sleep 86400 +./launch_minishogi_clients.sh From d50684dbbed2b56eeaa7c403fa102a41724ce359 Mon Sep 17 00:00:00 2001 From: Teytaud Date: Sat, 13 Jun 2020 15:12:28 +0200 Subject: [PATCH 4/8] Update launch_minishogi_server.sh --- launch_minishogi_server.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launch_minishogi_server.sh b/launch_minishogi_server.sh index 671e0520..81c1f5b5 100644 --- a/launch_minishogi_server.sh +++ b/launch_minishogi_server.sh @@ -1,5 +1,5 @@ for i in 4096 2048 1024 512 256 128 64 do | -LD_PRELOAD=/private/home/vegardmella/libjemalloc.so.1 \ +# LD_PRELOAD=/private/home/vegardmella/libjemalloc.so.1 \ # uncomment on H2 python -u -m pypolygames train --max_time=259200 --saving_period=4 --num_game 40 --per_thread_batchsize 12 --device cuda:0 cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 --epoch_len 256 --batchsize $i --sync_period 32 --num_rollouts 600 --replay_capacity 100000 --replay_warmup 9000 --do_not_save_replay_buffer --ddp true --checkpoint_dir exps/minishogi --out_feature --game_name minishogi --model_name ResConvConvLogitPoolModel --turn_features --bn --nnks 3 --history 2 --nnsize 5 --nb_nets 31 --nb_layers_per_net 6 --nnks 3 --server_listen_endpoint tcp://*:10023 --num_game 0 --lr 1e-5 done From 9910f587397c20eb6c5e4885665ddcb82cef56dc Mon Sep 17 00:00:00 2001 From: Teytaud Date: Tue, 16 Jun 2020 16:54:03 +0200 Subject: [PATCH 5/8] Update launch_minishogi_server.sh --- launch_minishogi_server.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launch_minishogi_server.sh b/launch_minishogi_server.sh index 81c1f5b5..1d9e1f3b 100644 --- a/launch_minishogi_server.sh +++ b/launch_minishogi_server.sh @@ -1,5 +1,5 @@ for i in 4096 2048 1024 512 256 128 64 do | # LD_PRELOAD=/private/home/vegardmella/libjemalloc.so.1 \ # uncomment on H2 -python -u -m pypolygames train --max_time=259200 --saving_period=4 --num_game 40 --per_thread_batchsize 12 --device cuda:0 cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 --epoch_len 256 --batchsize $i --sync_period 32 --num_rollouts 600 --replay_capacity 100000 --replay_warmup 9000 --do_not_save_replay_buffer --ddp true --checkpoint_dir exps/minishogi --out_feature --game_name minishogi --model_name ResConvConvLogitPoolModel --turn_features --bn --nnks 3 --history 2 --nnsize 5 --nb_nets 31 --nb_layers_per_net 6 --nnks 3 --server_listen_endpoint tcp://*:10023 --num_game 0 --lr 1e-5 +python -u -m pypolygames train --max_time=259200 --saving_period=4 --num_game 40 --per_thread_batchsize 12 --device cuda:0 cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 --epoch_len 256 --batchsize $i --sync_period 32 --num_rollouts 600 --replay_capacity 100000 --replay_warmup 9000 --do_not_save_replay_buffer --ddp true --checkpoint_dir exps/minishogi --out_feature --game_name minishogi --model_name ResConvConvLogitPoolModel --turn_features --bn --nnks 3 --history 2 --nnsize 8 --nb_nets 31 --nb_layers_per_net 6 --nnks 3 --server_listen_endpoint tcp://*:10023 --num_game 0 --lr 1e-5 done From eb18d5ae0e877f7ff4826f83be30562b83730d49 Mon Sep 17 00:00:00 2001 From: Teytaud Date: Tue, 16 Jun 2020 16:54:53 +0200 Subject: [PATCH 6/8] Update launch_minishogi_clients.sh --- launch_minishogi_clients.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/launch_minishogi_clients.sh b/launch_minishogi_clients.sh index 0f272118..ad13cd90 100644 --- a/launch_minishogi_clients.sh +++ b/launch_minishogi_clients.sh @@ -5,7 +5,7 @@ export host="`squeue -u $me | grep -iv pd | grep -i pg | grep ser | sed 's/.*lea echo "host=<${host}>" for k in `seq 5` do -sbatch --array=0-279%20 --comment=notenough --partition=learnfair --time=72:00:00 --mem=150Go --job-name=polytrain --gres=gpu:8 --cpus-per-task=80 --wrap="python -u -m pypolygames train --max_time=259200 --saving_period=4 --num_game 120 --per_thread_batchsize 192 --device cuda:0 cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 --nnks 3 --epoch_len 256 --batchsize 396 --sync_period 256 --num_rollouts 600 --replay_capacity 20000 --replay_warmup 2000 --do_not_save_replay_buffer --ddp true --checkpoint_dir \"exps/yaclient_\$SLURM_JOB_ID\" --out_feature --game_name minishogi --model_name ResConvConvLogitPoolModel --turn_features --bn --nnks 3 --nnsize 5 --history 2 --nb_layers_per_net 6 --nb_nets 31 --bn --server_connect_hostname tcp://$host:10023 --num_game 20 " & +sbatch --array=0-279%20 --comment=notenough --partition=learnfair --time=72:00:00 --mem=150Go --job-name=polytrain --gres=gpu:8 --cpus-per-task=80 --wrap="python -u -m pypolygames train --max_time=259200 --saving_period=4 --num_game 120 --per_thread_batchsize 192 --device cuda:0 cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 --nnks 3 --epoch_len 256 --batchsize 396 --sync_period 256 --num_rollouts 600 --replay_capacity 20000 --replay_warmup 2000 --do_not_save_replay_buffer --ddp true --checkpoint_dir \"exps/yaclient_\$SLURM_JOB_ID\" --out_feature --game_name minishogi --model_name ResConvConvLogitPoolModel --turn_features --bn --nnks 3 --nnsize 8 --history 2 --nb_layers_per_net 6 --nb_nets 31 --bn --server_connect_hostname tcp://$host:10023 --num_game 20 " & done -sbatch --array=0-279%20 --comment=notenough --partition=uninterrupted --time=72:00:00 --mem=150Go --job-name=polytrain --gres=gpu:8 --cpus-per-task=80 --wrap="python -u -m pypolygames train --max_time=259200 --saving_period=4 --num_game 120 --per_thread_batchsize 192 --device cuda:0 cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 --nnks 3 --epoch_len 256 --batchsize 396 --sync_period 256 --num_rollouts 600 --replay_capacity 20000 --replay_warmup 2000 --do_not_save_replay_buffer --ddp true --checkpoint_dir \"exps/yaclient_\$SLURM_JOB_ID\" --out_feature --game_name minishogi --model_name ResConvConvLogitPoolModel --turn_features --bn --nnks 3 --nnsize 5 --history 2 --nb_layers_per_net 6 --nb_nets 31 --bn --server_connect_hostname tcp://$host:10023 --num_game 20 " & +sbatch --array=0-279%20 --comment=notenough --partition=uninterrupted --time=72:00:00 --mem=150Go --job-name=polytrain --gres=gpu:8 --cpus-per-task=80 --wrap="python -u -m pypolygames train --max_time=259200 --saving_period=4 --num_game 120 --per_thread_batchsize 192 --device cuda:0 cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 --nnks 3 --epoch_len 256 --batchsize 396 --sync_period 256 --num_rollouts 600 --replay_capacity 20000 --replay_warmup 2000 --do_not_save_replay_buffer --ddp true --checkpoint_dir \"exps/yaclient_\$SLURM_JOB_ID\" --out_feature --game_name minishogi --model_name ResConvConvLogitPoolModel --turn_features --bn --nnks 3 --nnsize 8 --history 2 --nb_layers_per_net 6 --nb_nets 31 --bn --server_connect_hostname tcp://$host:10023 --num_game 20 " & From 8bf2fb3bb7de40cdeedf848ca585b22663faf54b Mon Sep 17 00:00:00 2001 From: Teytaud Date: Wed, 17 Jun 2020 15:53:28 +0200 Subject: [PATCH 7/8] Update launch_minishogi_clients.sh --- launch_minishogi_clients.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launch_minishogi_clients.sh b/launch_minishogi_clients.sh index ad13cd90..b7d85290 100644 --- a/launch_minishogi_clients.sh +++ b/launch_minishogi_clients.sh @@ -1,6 +1,6 @@ # Launching clients. -me=`whoami` +export me=`whoami` export host="`squeue -u $me | grep -iv pd | grep -i pg | grep ser | sed 's/.*learnfair/learnfair/g' | sed 's/ //g'`" echo "host=<${host}>" for k in `seq 5` From 4ccec043349b1de75b247fed665a4aaeae155d62 Mon Sep 17 00:00:00 2001 From: Teytaud Date: Wed, 17 Jun 2020 15:53:44 +0200 Subject: [PATCH 8/8] Update meta_minishogi.sh --- meta_minishogi.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meta_minishogi.sh b/meta_minishogi.sh index c091b85f..fcaf8b7a 100644 --- a/meta_minishogi.sh +++ b/meta_minishogi.sh @@ -1,6 +1,6 @@ # Launch this script, wait 10 days, and you should get an excellent minishogi model. -me=`whoami` | +export me=`whoami` | sbatch --partition=dev --time=72:00:00 --mem=150Go --job-name=pgserver --gres=gpu:8 --cpus-per-task=80 --wrap="./launch_minishogi_server.sh" | sleep 200 export host=`squeue -u $me| grep -i pgserver | sed 's/.*learnfair/learnfair/g' | sed 's/ //g'`