diff --git a/havannah10_alllaunch.sh b/havannah10_alllaunch.sh
new file mode 100755
--- /dev/null
+++ b/havannah10_alllaunch.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+# Launches the whole Havannah-10 run on Slurm: one training server job, three
+# more server jobs pinned to the node of the first one, then a batch of
+# clients immediately and again after each of three 24-hour sleeps.
+
+# Resource request shared by every server job.
+server_opts=(--partition=dev --time=72:00:00 --mem=150Go --gres=gpu:8 --cpus-per-task=80)
+server_cmd="./launch_havannah10_server.sh"
+
+sbatch "${server_opts[@]}" --job-name=pgserver --wrap="${server_cmd}"
+
+# Wait before asking squeue where the job runs (presumably long enough for
+# it to be placed on a node -- TODO confirm 200 s is enough on a busy queue).
+sleep 200
+
+# Keep only the trailing "learnfair..." node name of the pgserver line.
+export host="$(squeue -u oteytaud | grep -i pgserver | sed 's/.*learnfair/learnfair/g' | sed 's/ //g')"
+echo "host=<${host}>"
+if [ -z "${host}" ]; then
+  echo "no pgserver job found in squeue; not submitting follow-up jobs" >&2
+  exit 1
+fi
+
+# Follow-up servers go to the same node; each gets its own job name.
+for name in pgser2 pgser3 pgser4; do
+  sbatch -w "${host}" "${server_opts[@]}" --job-name="${name}" --wrap="${server_cmd}"
+done
+
+./launch_havannah10_clients.sh
+for _ in 1 2 3; do
+  sleep 86400
+  ./launch_havannah10_clients.sh
+done
diff --git a/launch_havannah10_clients.sh b/launch_havannah10_clients.sh
new file mode 100755
--- /dev/null
+++ b/launch_havannah10_clients.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+# Submits the Havannah-10 training clients as Slurm array jobs: three arrays
+# on the learnfair partition and one on the uninterrupted partition, all
+# pointed at the training server found through squeue.
+
+# Detecting the server: drop squeue lines containing "pd" (presumably pending
+# jobs), keep the one matching "pg" and "ser", and reduce it to its trailing
+# "learnfair..." node name.
+export host="$(squeue -u oteytaud | grep -iv pd | grep -i pg | grep ser | sed 's/.*learnfair/learnfair/g' | sed 's/ //g')"
+echo "host=<${host}>"
+if [ -z "${host}" ]; then
+  echo "no running server job found in squeue; not submitting clients" >&2
+  exit 1
+fi
+
+# Command run inside every array task. ${host} is expanded now, at submission
+# time; \$SLURM_JOB_ID is left for the job's own shell to expand.
+train_cmd="python -u -m pypolygames train \
+--max_time=259200 --saving_period=4 --num_game 120 --per_thread_batchsize 192 \
+--device cuda:0 cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 \
+--nnks 3 --epoch_len 256 --batchsize 396 --sync_period 256 --num_rollouts 600 \
+--replay_capacity 20000 --replay_warmup 2000 --do_not_save_replay_buffer \
+--ddp true --checkpoint_dir \"exps/yaclient_\$SLURM_JOB_ID\" --out_feature \
+--game_name havannah10pie --model_name ResConvConvLogitPoolModel \
+--turn_features --bn --nnks 3 --nnsize 5 --history 2 --nb_layers_per_net 6 \
+--nb_nets 31 --bn --server_connect_hostname tcp://${host}:10023 --num_game 20"
+
+# Submits one 280-task array (at most 20 tasks running at once) to partition $1.
+# sbatch is backgrounded so the submissions do not wait on each other.
+submit_clients() {
+  sbatch --array=0-279%20 --comment=notenough --partition="$1" \
+    --time=72:00:00 --mem=150Go --job-name=polytrain --gres=gpu:8 \
+    --cpus-per-task=80 --wrap="${train_cmd}" &
+}
+
+for _ in 1 2 3; do
+  submit_clients learnfair
+done
+submit_clients uninterrupted
diff --git a/launch_havannah10_server.sh b/launch_havannah10_server.sh
new file mode 100755
--- /dev/null
+++ b/launch_havannah10_server.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# Runs the Havannah-10 training server, listening on TCP port 10023.
+#
+# The loop is just here in case of crash: every time the python process exits,
+# training is started again with the next, smaller batch size. Do not add
+# "set -e" here, it would defeat the restart.
+for i in 4096 2048 1024 512 256 128 64; do
+  LD_PRELOAD=/private/home/vegardmella/libjemalloc.so.1 \
+    python -u -m pypolygames train \
+    --max_time=259200 --saving_period=4 --num_game 40 --per_thread_batchsize 12 \
+    --device cuda:0 cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 \
+    --epoch_len 256 --batchsize "${i}" --sync_period 32 --num_rollouts 600 \
+    --replay_capacity 100000 --replay_warmup 9000 --do_not_save_replay_buffer \
+    --ddp true --checkpoint_dir exps/bighavannah10 --out_feature \
+    --game_name havannah10pie --model_name ResConvConvLogitPoolModel \
+    --turn_features --bn --nnks 3 --history 2 --nnsize 5 --nb_nets 31 \
+    --nb_layers_per_net 6 --nnks 3 --server_listen_endpoint 'tcp://*:10023' \
+    --num_game 0 --lr 1e-5
+done