From bba3f9a3b38aa6ac736684e97a86bce9e89df8dc Mon Sep 17 00:00:00 2001 From: Teytaud Date: Sat, 16 May 2020 08:52:48 +0200 Subject: [PATCH 1/8] Create launch_havannah10_clients.sh --- launch_havannah10_clients.sh | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 launch_havannah10_clients.sh diff --git a/launch_havannah10_clients.sh b/launch_havannah10_clients.sh new file mode 100644 index 00000000..ef3a3c9e --- /dev/null +++ b/launch_havannah10_clients.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +# Extracts the machine name for the server: +export host="`squeue -u oteytaud | grep -iv pd | grep -i pg | grep ser | sed 's/.*learnfair/learnfair/g' | sed 's/ //g'`" + +# Checked: this is ok. +echo "host=<${host}>" + + +# Launching three arrays. +for k in `seq 3` +do +sbatch --array=0-279%20 --comment=notenough --partition=learnfair --time=72:00:00 --mem=150Go --job-name=polytrain --gres=gpu:8 --cpus-per-task=80 -- +wrap="python -u -m pypolygames train --max_time=259200 --saving_period=4 --num_game 120 --per_thread_batchsize 192 --device cuda:0 cuda:0 cuda:1 +cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 --nnks 3 --epoch_len 256 --batchsize 396 --sync_period 256 --num_rollouts 999 --replay_capacity 20000 --r +eplay_warmup 2000 --do_not_save_replay_buffer --ddp true --checkpoint_dir \"exps/yaclient_\$SLURM_JOB_ID\" --out_feature --game_name havannah10pie +--model_name ResConvConvLogitPoolModel --turn_features --bn --nnks 3 --nnsize 5 --history 2 --nb_layers_per_net 6 --nb_nets 19 --bn --server_conne +ct_hostname tcp://$host:10023 --num_game 20 " & +done From 9f43afc13cbd2a0e0820ffde6dc62ae1387909a0 Mon Sep 17 00:00:00 2001 From: Teytaud Date: Sat, 16 May 2020 08:54:07 +0200 Subject: [PATCH 2/8] Create launch_havannah10_server.sh --- launch_havannah10_server.sh | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 launch_havannah10_server.sh diff --git a/launch_havannah10_server.sh b/launch_havannah10_server.sh new file mode 100644 index 00000000..79a37dbc --- /dev/null +++ b/launch_havannah10_server.sh @@ -0,0 +1,11 @@ +# The loop is just here in case of crash... + +for i in `seq 50` +do +LD_PRELOAD=/private/home/vegardmella/libjemalloc.so.1 \ +python -u -m pypolygames train --max_time=259200 --saving_period=4 --num_game 40 --per_thread_batchsize 12 --device cuda:0 cuda:0 cuda:1 cuda:2 cuda: +3 cuda:4 cuda:5 cuda:6 cuda:7 --epoch_len 256 --batchsize 3072 --sync_period 32 --num_rollouts 600 --replay_capacity 100000 --replay_warmup 9000 --do +_not_save_replay_buffer --ddp true --checkpoint_dir exps/totoro10 --out_feature --game_name havannah10pie --model_name ResConvConvLogitPoolModel --tu +rn_features --bn --nnks 3 --history 2 --nnsize 5 --nb_nets 19 --nb_layers_per_net 6 --nnks 3 --server_listen_endpoint tcp://*:10023 --num_game 0 --lr + 1e-5 +done From 126e36bf3fcb928d2f28bf30b60f61b87deda77c Mon Sep 17 00:00:00 2001 From: Teytaud Date: Sat, 16 May 2020 08:56:43 +0200 Subject: [PATCH 3/8] Create havannah10_alllaunch.sh --- havannah10_alllaunch.sh | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 havannah10_alllaunch.sh diff --git a/havannah10_alllaunch.sh b/havannah10_alllaunch.sh new file mode 100644 index 00000000..1a7ce380 --- /dev/null +++ b/havannah10_alllaunch.sh @@ -0,0 +1,23 @@ +# Launch a server on dev. +sbatch --partition=dev --time=72:00:00 --mem=150Go --job-name=pgserver --gres=gpu:8 --cpus-per-task=80 --wrap="./launch_havannah10_server.sh" + +# Wait a bit for launching the clients +sleep 200 +./launch_havannah10_clients.sh +sleep 86400 + +# We want the same machine for the next server. +export host=`squeue -u oteytaud | grep -i pgserver | sed 's/.*learnfair/learnfair/g' | sed 's/ //g'` +echo "host=<${host}>" +sbatch -w $host --partition=dev --time=72:00:00 --mem=150Go --job-name=pgser2 --gres=gpu:8 --cpus-per-task=80 --wrap="./launch_havannah10_server.sh" +sbatch -w $host --partition=dev --time=72:00:00 --mem=150Go --job-name=pgser3 --gres=gpu:8 --cpus-per-task=80 --wrap="./launch_havannah10_server.sh" + +# And more clients. +./launch_havannah10_clients.sh +sleep 86400 + +sleep 86400 +./launch_havannah10_clients.sh + +sleep 86400 +./launch_havannah10_clients.sh From c6a56eb28b6d6a9dae784156c1165568008c968d Mon Sep 17 00:00:00 2001 From: Teytaud Date: Sun, 7 Jun 2020 11:31:13 +0200 Subject: [PATCH 4/8] Update havannah10_alllaunch.sh --- havannah10_alllaunch.sh | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/havannah10_alllaunch.sh b/havannah10_alllaunch.sh index 1a7ce380..861e231d 100644 --- a/havannah10_alllaunch.sh +++ b/havannah10_alllaunch.sh @@ -1,23 +1,17 @@ -# Launch a server on dev. sbatch --partition=dev --time=72:00:00 --mem=150Go --job-name=pgserver --gres=gpu:8 --cpus-per-task=80 --wrap="./launch_havannah10_server.sh" - -# Wait a bit for launching the clients sleep 200 -./launch_havannah10_clients.sh -sleep 86400 - -# We want the same machine for the next server. export host=`squeue -u oteytaud | grep -i pgserver | sed 's/.*learnfair/learnfair/g' | sed 's/ //g'` echo "host=<${host}>" -sbatch -w $host --partition=dev --time=72:00:00 --mem=150Go --job-name=pgser2 --gres=gpu:8 --cpus-per-task=80 --wrap="./launch_havannah10_server.sh" -sbatch -w $host --partition=dev --time=72:00:00 --mem=150Go --job-name=pgser3 --gres=gpu:8 --cpus-per-task=80 --wrap="./launch_havannah10_server.sh" - -# And more clients. +sbatch -w $host --partition=dev --time=72:00:00 --mem=150Go --job-name=pgser2 --gres=gpu:8 --cpus-per-task=80 --wrap="./launch_havannah10_server.sh +" +sbatch -w $host --partition=dev --time=72:00:00 --mem=150Go --job-name=pgser3 --gres=gpu:8 --cpus-per-task=80 --wrap="./launch_havannah10_server.sh +" +sbatch -w $host --partition=dev --time=72:00:00 --mem=150Go --job-name=pgser3 --gres=gpu:8 --cpus-per-task=80 --wrap="./launch_havannah10_server.sh +" ./launch_havannah10_clients.sh sleep 86400 - +./launch_havannah10_clients.sh sleep 86400 ./launch_havannah10_clients.sh - sleep 86400 ./launch_havannah10_clients.sh From e13ac56df676e1ce1fefbfac9742ba56ca05c180 Mon Sep 17 00:00:00 2001 From: Teytaud Date: Sun, 7 Jun 2020 11:31:51 +0200 Subject: [PATCH 5/8] Update launch_havannah10_clients.sh --- launch_havannah10_clients.sh | 52 ++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/launch_havannah10_clients.sh b/launch_havannah10_clients.sh index ef3a3c9e..606cb0c3 100644 --- a/launch_havannah10_clients.sh +++ b/launch_havannah10_clients.sh @@ -1,19 +1,43 @@ -#!/bin/bash - -# Extracts the machine name for the server: +export MASTER_ADDR=127.0.0.1 +export MASTER_PORT=10022 +export MASTER_PORT=10023 +export MASTER_PORT=10022 +export RANK=0 +export WORLD_SIZE=1 +#for k in `seq 12` +#do export host="`squeue -u oteytaud | grep -iv pd | grep -i pg | grep ser | sed 's/.*learnfair/learnfair/g' | sed 's/ //g'`" - -# Checked: this is ok. echo "host=<${host}>" - - -# Launching three arrays. for k in `seq 3` do -sbatch --array=0-279%20 --comment=notenough --partition=learnfair --time=72:00:00 --mem=150Go --job-name=polytrain --gres=gpu:8 --cpus-per-task=80 -- -wrap="python -u -m pypolygames train --max_time=259200 --saving_period=4 --num_game 120 --per_thread_batchsize 192 --device cuda:0 cuda:0 cuda:1 -cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 --nnks 3 --epoch_len 256 --batchsize 396 --sync_period 256 --num_rollouts 999 --replay_capacity 20000 --r -eplay_warmup 2000 --do_not_save_replay_buffer --ddp true --checkpoint_dir \"exps/yaclient_\$SLURM_JOB_ID\" --out_feature --game_name havannah10pie ---model_name ResConvConvLogitPoolModel --turn_features --bn --nnks 3 --nnsize 5 --history 2 --nb_layers_per_net 6 --nb_nets 19 --bn --server_conne -ct_hostname tcp://$host:10023 --num_game 20 " & +sbatch --array=0-279%20 --comment=notenough --partition=learnfair --time=72:00:00 --mem=150Go --job-name=polytrain --gres=gpu:8 --cpus-per-task=80 --wrap="python -u -m + pypolygames train --max_time=259200 --saving_period=4 --num_game 120 --per_thread_batchsize 192 --device cuda:0 cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 c +uda:7 --nnks 3 --epoch_len 256 --batchsize 396 --sync_period 256 --num_rollouts 100 --replay_capacity 20000 --replay_warmup 2000 --do_not_save_replay_buffer --ddp tr +ue --checkpoint_dir \"exps/yaclient_\$SLURM_JOB_ID\" --out_feature --game_name havannah10pie --model_name ResConvConvLogitPoolModel --turn_features --bn --nnks 3 -- +nnsize 5 --history 2 --nb_layers_per_net 6 --nb_nets 31 --bn --server_connect_hostname tcp://$host:10023 --num_game 20 " & done +sbatch --array=0-279%20 --comment=notenough --partition=uninterrupted --time=72:00:00 --mem=150Go --job-name=polytrain --gres=gpu:8 --cpus-per-task=80 --wrap="python - +u -m pypolygames train --max_time=259200 --saving_period=4 --num_game 120 --per_thread_batchsize 192 --device cuda:0 cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda +:6 cuda:7 --nnks 3 --epoch_len 256 --batchsize 396 --sync_period 256 --num_rollouts 100 --replay_capacity 20000 --replay_warmup 2000 --do_not_save_replay_buffer --dd +p true --checkpoint_dir \"exps/yaclient_\$SLURM_JOB_ID\" --out_feature --game_name havannah10pie --model_name ResConvConvLogitPoolModel --turn_features --bn --nnks +3 --nnsize 5 --history 2 --nb_layers_per_net 6 --nb_nets 31 --bn --server_connect_hostname tcp://$host:10023 --num_game 20 " & +#export host=learnfair0932; sbatch --array=0-179%25 --comment=icganote_dec19 --partition=priority --time=72:00:00 --mem=100Go --job-name=polytrain --gres=gpu:8 --cpus +-per-task=50 --wrap="singularity exec --nv --overlay overlay.img /checkpoint/polygames/polygames_190927.simg python -u -m pypolygames train --max_time=259200 --savin +g_period=4 --num_game 120 --per_thread_batchsize 192 --device cuda:0 cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 --nnks 3 --epoch_len 256 --batchsize 396 + --sync_period 256 --num_rollouts 600 --replay_capacity 2000000 --replay_warmup 200000 --do_not_save_replay_buffer --ddp true --checkpoint_dir "exps/yo`date | sed +'s/ /_/g'`_$i" --out_feature --game_name havannah10pie --model_name ResConvConvLogitModel --turn_features --bn --nnks 3 --nnsize 8 --nb_layers_per_net 6 --nb_nets 19 + --bn --server_connect_hostname tcp://$host:10023 --num_game 32 " +#export host=learnfair0932; sbatch --array=0-179%30 --comment=havperf --partition=learnfair --time=72:00:00 --mem=100Go --job-name=polytrain --gres=gpu:8 --cpus-per-t +ask=50 --wrap="singularity exec --nv --overlay overlay.img /checkpoint/polygames/polygames_190927.simg python -u -m pypolygames train --max_time=259200 --saving_peri +od=4 --num_game 120 --per_thread_batchsize 192 --device cuda:0 cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 --nnks 3 --epoch_len 256 --batchsize 396 --syn +c_period 256 --num_rollouts 600 --replay_capacity 2000000 --replay_warmup 200000 --do_not_save_replay_buffer --ddp true --checkpoint_dir "exps/yo`date | sed 's/ /_ +/g'`_$i" --out_feature --game_name havannah10pie --model_name ResConvConvLogitModel --turn_features --bn --nnks 3 --nnsize 8 --nb_layers_per_net 6 --nb_nets 19 --bn + --server_connect_hostname tcp://$host:10023 --num_game 32 " +#export host=learnfair0932; sbatch --array=0-40%1 --comment=havperf --partition=dev --time=72:00:00 --mem=100Go --job-name=polytrain --gres=gpu:8 --cpus-per-task=50 +--wrap="singularity exec --nv --overlay overlay.img /checkpoint/polygames/polygames_190927.simg python -u -m pypolygames train --max_time=259200 --saving_period=4 -- +num_game 120 --per_thread_batchsize 192 --device cuda:0 cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 --nnks 3 --epoch_len 256 --batchsize 396 --sync_perio +d 256 --num_rollouts 600 --replay_capacity 2000000 --replay_warmup 200000 --do_not_save_replay_buffer --ddp true --checkpoint_dir "exps/yo`date | sed 's/ /_/g'`_$i +" --out_feature --game_name havannah10pie --model_name ResConvConvLogitModel --turn_features --bn --nnks 3 --nnsize 8 --nb_layers_per_net 6 --nb_nets 19 --bn --serv +er_connect_hostname tcp://$host:10023 --num_game 32 " +#sleep 7200 +#done From 8a174f92f22045a6dcbe21b43cbe27dd566c69f9 Mon Sep 17 00:00:00 2001 From: Teytaud Date: Sun, 7 Jun 2020 11:32:56 +0200 Subject: [PATCH 6/8] Update launch_havannah10_server.sh --- launch_havannah10_server.sh | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/launch_havannah10_server.sh b/launch_havannah10_server.sh index 79a37dbc..83a53962 100644 --- a/launch_havannah10_server.sh +++ b/launch_havannah10_server.sh @@ -1,11 +1,9 @@ # The loop is just here in case of crash... - for i in `seq 50` do LD_PRELOAD=/private/home/vegardmella/libjemalloc.so.1 \ -python -u -m pypolygames train --max_time=259200 --saving_period=4 --num_game 40 --per_thread_batchsize 12 --device cuda:0 cuda:0 cuda:1 cuda:2 cuda: -3 cuda:4 cuda:5 cuda:6 cuda:7 --epoch_len 256 --batchsize 3072 --sync_period 32 --num_rollouts 600 --replay_capacity 100000 --replay_warmup 9000 --do -_not_save_replay_buffer --ddp true --checkpoint_dir exps/totoro10 --out_feature --game_name havannah10pie --model_name ResConvConvLogitPoolModel --tu -rn_features --bn --nnks 3 --history 2 --nnsize 5 --nb_nets 19 --nb_layers_per_net 6 --nnks 3 --server_listen_endpoint tcp://*:10023 --num_game 0 --lr - 1e-5 +python -u -m pypolygames train --max_time=259200 --saving_period=4 --num_game 40 --per_thread_batchsize 12 --device cuda:0 cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cu +da:6 cuda:7 --epoch_len 256 --batchsize 512 --sync_period 32 --num_rollouts 600 --replay_capacity 100000 --replay_warmup 9000 --do_not_save_replay_buffer --ddp true -- +checkpoint_dir exps/bighavannah10 --out_feature --game_name havannah10pie --model_name ResConvConvLogitPoolModel --turn_features --bn --nnks 3 --history 2 --nnsize 5 - +-nb_nets 31 --nb_layers_per_net 6 --nnks 3 --server_listen_endpoint tcp://*:10023 --num_game 0 --lr 1e-5 done From e0120cba97403dc6fe21f3172d26514e27f9ded4 Mon Sep 17 00:00:00 2001 From: Teytaud Date: Tue, 9 Jun 2020 20:56:55 +0200 Subject: [PATCH 7/8] Update launch_havannah10_server.sh --- launch_havannah10_server.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/launch_havannah10_server.sh b/launch_havannah10_server.sh index 83a53962..f92858f9 100644 --- a/launch_havannah10_server.sh +++ b/launch_havannah10_server.sh @@ -1,9 +1,9 @@ # The loop is just here in case of crash... -for i in `seq 50` +for i in 4096 2048 1024 512 256 128 64 do LD_PRELOAD=/private/home/vegardmella/libjemalloc.so.1 \ python -u -m pypolygames train --max_time=259200 --saving_period=4 --num_game 40 --per_thread_batchsize 12 --device cuda:0 cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cu -da:6 cuda:7 --epoch_len 256 --batchsize 512 --sync_period 32 --num_rollouts 600 --replay_capacity 100000 --replay_warmup 9000 --do_not_save_replay_buffer --ddp true -- -checkpoint_dir exps/bighavannah10 --out_feature --game_name havannah10pie --model_name ResConvConvLogitPoolModel --turn_features --bn --nnks 3 --history 2 --nnsize 5 - --nb_nets 31 --nb_layers_per_net 6 --nnks 3 --server_listen_endpoint tcp://*:10023 --num_game 0 --lr 1e-5 +da:6 cuda:7 --epoch_len 256 --batchsize $i --sync_period 32 --num_rollouts 600 --replay_capacity 100000 --replay_warmup 9000 --do_not_save_replay_buffer --ddp true --c +heckpoint_dir exps/bighavannah10 --out_feature --game_name havannah10pie --model_name ResConvConvLogitPoolModel --turn_features --bn --nnks 3 --history 2 --nnsize 5 -- +nb_nets 31 --nb_layers_per_net 6 --nnks 3 --server_listen_endpoint tcp://*:10023 --num_game 0 --lr 1e-5 done From 47508617c0a5558c5f0b80cf1b36a203726f443b Mon Sep 17 00:00:00 2001 From: Teytaud Date: Tue, 9 Jun 2020 20:57:50 +0200 Subject: [PATCH 8/8] Update launch_havannah10_clients.sh --- launch_havannah10_clients.sh | 34 ++++------------------------------ 1 file changed, 4 insertions(+), 30 deletions(-) diff --git a/launch_havannah10_clients.sh b/launch_havannah10_clients.sh index 606cb0c3..ee7b5d0f 100644 --- a/launch_havannah10_clients.sh +++ b/launch_havannah10_clients.sh @@ -1,43 +1,17 @@ -export MASTER_ADDR=127.0.0.1 -export MASTER_PORT=10022 -export MASTER_PORT=10023 -export MASTER_PORT=10022 -export RANK=0 -export WORLD_SIZE=1 -#for k in `seq 12` -#do + +# Detecting the server export host="`squeue -u oteytaud | grep -iv pd | grep -i pg | grep ser | sed 's/.*learnfair/learnfair/g' | sed 's/ //g'`" echo "host=<${host}>" for k in `seq 3` do sbatch --array=0-279%20 --comment=notenough --partition=learnfair --time=72:00:00 --mem=150Go --job-name=polytrain --gres=gpu:8 --cpus-per-task=80 --wrap="python -u -m pypolygames train --max_time=259200 --saving_period=4 --num_game 120 --per_thread_batchsize 192 --device cuda:0 cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 c -uda:7 --nnks 3 --epoch_len 256 --batchsize 396 --sync_period 256 --num_rollouts 100 --replay_capacity 20000 --replay_warmup 2000 --do_not_save_replay_buffer --ddp tr +uda:7 --nnks 3 --epoch_len 256 --batchsize 396 --sync_period 256 --num_rollouts 600 --replay_capacity 20000 --replay_warmup 2000 --do_not_save_replay_buffer --ddp tr ue --checkpoint_dir \"exps/yaclient_\$SLURM_JOB_ID\" --out_feature --game_name havannah10pie --model_name ResConvConvLogitPoolModel --turn_features --bn --nnks 3 -- nnsize 5 --history 2 --nb_layers_per_net 6 --nb_nets 31 --bn --server_connect_hostname tcp://$host:10023 --num_game 20 " & done sbatch --array=0-279%20 --comment=notenough --partition=uninterrupted --time=72:00:00 --mem=150Go --job-name=polytrain --gres=gpu:8 --cpus-per-task=80 --wrap="python - u -m pypolygames train --max_time=259200 --saving_period=4 --num_game 120 --per_thread_batchsize 192 --device cuda:0 cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda -:6 cuda:7 --nnks 3 --epoch_len 256 --batchsize 396 --sync_period 256 --num_rollouts 100 --replay_capacity 20000 --replay_warmup 2000 --do_not_save_replay_buffer --dd +:6 cuda:7 --nnks 3 --epoch_len 256 --batchsize 396 --sync_period 256 --num_rollouts 600 --replay_capacity 20000 --replay_warmup 2000 --do_not_save_replay_buffer --dd p true --checkpoint_dir \"exps/yaclient_\$SLURM_JOB_ID\" --out_feature --game_name havannah10pie --model_name ResConvConvLogitPoolModel --turn_features --bn --nnks 3 --nnsize 5 --history 2 --nb_layers_per_net 6 --nb_nets 31 --bn --server_connect_hostname tcp://$host:10023 --num_game 20 " & -#export host=learnfair0932; sbatch --array=0-179%25 --comment=icganote_dec19 --partition=priority --time=72:00:00 --mem=100Go --job-name=polytrain --gres=gpu:8 --cpus --per-task=50 --wrap="singularity exec --nv --overlay overlay.img /checkpoint/polygames/polygames_190927.simg python -u -m pypolygames train --max_time=259200 --savin -g_period=4 --num_game 120 --per_thread_batchsize 192 --device cuda:0 cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 --nnks 3 --epoch_len 256 --batchsize 396 - --sync_period 256 --num_rollouts 600 --replay_capacity 2000000 --replay_warmup 200000 --do_not_save_replay_buffer --ddp true --checkpoint_dir "exps/yo`date | sed -'s/ /_/g'`_$i" --out_feature --game_name havannah10pie --model_name ResConvConvLogitModel --turn_features --bn --nnks 3 --nnsize 8 --nb_layers_per_net 6 --nb_nets 19 - --bn --server_connect_hostname tcp://$host:10023 --num_game 32 " -#export host=learnfair0932; sbatch --array=0-179%30 --comment=havperf --partition=learnfair --time=72:00:00 --mem=100Go --job-name=polytrain --gres=gpu:8 --cpus-per-t -ask=50 --wrap="singularity exec --nv --overlay overlay.img /checkpoint/polygames/polygames_190927.simg python -u -m pypolygames train --max_time=259200 --saving_peri -od=4 --num_game 120 --per_thread_batchsize 192 --device cuda:0 cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 --nnks 3 --epoch_len 256 --batchsize 396 --syn -c_period 256 --num_rollouts 600 --replay_capacity 2000000 --replay_warmup 200000 --do_not_save_replay_buffer --ddp true --checkpoint_dir "exps/yo`date | sed 's/ /_ -/g'`_$i" --out_feature --game_name havannah10pie --model_name ResConvConvLogitModel --turn_features --bn --nnks 3 --nnsize 8 --nb_layers_per_net 6 --nb_nets 19 --bn - --server_connect_hostname tcp://$host:10023 --num_game 32 " -#export host=learnfair0932; sbatch --array=0-40%1 --comment=havperf --partition=dev --time=72:00:00 --mem=100Go --job-name=polytrain --gres=gpu:8 --cpus-per-task=50 ---wrap="singularity exec --nv --overlay overlay.img /checkpoint/polygames/polygames_190927.simg python -u -m pypolygames train --max_time=259200 --saving_period=4 -- -num_game 120 --per_thread_batchsize 192 --device cuda:0 cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 --nnks 3 --epoch_len 256 --batchsize 396 --sync_perio -d 256 --num_rollouts 600 --replay_capacity 2000000 --replay_warmup 200000 --do_not_save_replay_buffer --ddp true --checkpoint_dir "exps/yo`date | sed 's/ /_/g'`_$i -" --out_feature --game_name havannah10pie --model_name ResConvConvLogitModel --turn_features --bn --nnks 3 --nnsize 8 --nb_layers_per_net 6 --nb_nets 19 --bn --serv -er_connect_hostname tcp://$host:10023 --num_game 32 " -#sleep 7200 -#done