Skip to content
This repository was archived by the owner on Mar 2, 2022. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions havannah10_alllaunch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/usr/bin/env bash
# havannah10_alllaunch.sh - start the Havannah-10 training servers and
# periodically (re)submit the training clients.
#
# Steps:
#   1. Submit the primary server job ("pgserver") on the "dev" partition.
#   2. Wait until squeue shows a learnfair* node for that job.
#   3. Submit three more server jobs pinned (-w) to that same node.
#   4. Run ./launch_havannah10_clients.sh four times, 24 hours apart.
#
# Run from the directory that contains the launch_havannah10_*.sh scripts.

set -u

# Seconds between two looks at the queue (the first look happens after one
# full interval, as in the original fixed "sleep 200").
readonly POLL_SECONDS=200
# Give up if the server has not been placed on a node after this many looks.
readonly MAX_POLLS=30
# Delay between two client submissions (24 hours).
readonly CLIENT_PERIOD_SECONDS=86400

die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Submit one server job.
# Arguments: $1 - Slurm job name; remaining arguments are extra sbatch
#            options placed first on the command line (e.g. -w NODE).
# NOTE(review): "--mem=150Go" is kept exactly as it was; confirm that the
# local Slurm accepts this unit suffix.
submit_server() {
  local job_name=$1
  shift
  sbatch "$@" --partition=dev --time=72:00:00 --mem=150Go \
    --job-name="$job_name" --gres=gpu:8 --cpus-per-task=80 \
    --wrap="./launch_havannah10_server.sh"
}

# Print the node of the "pgserver" job, or nothing if squeue does not list a
# learnfair* node for it yet (for instance while the job is still pending).
# The trailing grep keeps only the first line that really is a node name, so
# an unplaced job can no longer leak a whole squeue line into $host.
find_server_host() {
  squeue -u oteytaud \
    | grep -i pgserver \
    | sed 's/.*learnfair/learnfair/g' \
    | sed 's/ //g' \
    | grep -m 1 '^learnfair'
}

submit_server pgserver || die "could not submit the pgserver job"

host=""
for ((poll = 1; poll <= MAX_POLLS; poll++)); do
  sleep "$POLL_SECONDS"
  host=$(find_server_host)
  if [[ -n "$host" ]]; then
    break
  fi
done
export host
echo "host=<${host}>"

# Without a node name, "-w" would swallow the next option as its argument.
[[ -n "$host" ]] || die "pgserver was not running on a learnfair node in time"

# Extra server jobs on the same node. The third one used to reuse the name
# "pgser3"; it is now "pgser4" so the jobs can be told apart in squeue.
for extra_name in pgser2 pgser3 pgser4; do
  submit_server "$extra_name" -w "$host" \
    || printf 'warning: could not submit %s\n' "$extra_name" >&2
done

# Submit the clients now, then again after each of three 24-hour waits.
./launch_havannah10_clients.sh
for _ in 1 2 3; do
  sleep "$CLIENT_PERIOD_SECONDS"
  ./launch_havannah10_clients.sh
done
17 changes: 17 additions & 0 deletions launch_havannah10_clients.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@

#!/usr/bin/env bash
# launch_havannah10_clients.sh - submit the Havannah-10 training clients.
#
# Looks up the node of a non-pending server job (name containing "pg" and
# "ser") in the queue of user oteytaud, then submits four array jobs that
# connect to tcp://<node>:10023: three on the "learnfair" partition and one
# on the "uninterrupted" partition.

set -u

# Detecting the server.
# The trailing grep keeps only the first line that really is a learnfair*
# node name, so $host is either one clean host name or empty.
host=$(
  squeue -u oteytaud \
    | grep -iv pd \
    | grep -i pg \
    | grep ser \
    | sed 's/.*learnfair/learnfair/g' \
    | sed 's/ //g' \
    | grep -m 1 '^learnfair'
)
export host
echo "host=<${host}>"

if [[ -z "$host" ]]; then
  printf '%s\n' "no running server job found; not submitting clients" >&2
  exit 1
fi

# Command executed inside every array task. It is assembled from an array so
# that no option can be cut in half by a line break; the previous version had
# hard line breaks inside the --wrap string that split several tokens.
# The option order, including the repeated --num_game/--nnks/--bn flags, is
# kept exactly as before.
# The checkpoint directory is single-quoted on purpose: $SLURM_JOB_ID must
# reach the batch script unexpanded and be resolved when the task runs.
# shellcheck disable=SC2016
train_cmd=(
  python -u -m pypolygames train
  --max_time=259200 --saving_period=4
  --num_game 120 --per_thread_batchsize 192
  --device cuda:0 cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7
  --nnks 3 --epoch_len 256 --batchsize 396 --sync_period 256
  --num_rollouts 600 --replay_capacity 20000 --replay_warmup 2000
  --do_not_save_replay_buffer --ddp true
  --checkpoint_dir '"exps/yaclient_$SLURM_JOB_ID"'
  --out_feature --game_name havannah10pie
  --model_name ResConvConvLogitPoolModel
  --turn_features --bn --nnks 3 --nnsize 5 --history 2
  --nb_layers_per_net 6 --nb_nets 31 --bn
  --server_connect_hostname "tcp://${host}:10023"
  --num_game 20
)
# "${array[*]}" joins the words with single spaces into one --wrap string.
wrap_cmd="${train_cmd[*]}"

# Submit one client array job.
# Arguments: $1 - Slurm partition name.
submit_clients() {
  local partition=$1
  sbatch --array=0-279%20 --comment=notenough --partition="$partition" \
    --time=72:00:00 --mem=150Go --job-name=polytrain --gres=gpu:8 \
    --cpus-per-task=80 --wrap="$wrap_cmd"
}

# Submissions run in the background, as before.
for _ in 1 2 3; do
  submit_clients learnfair &
done
submit_clients uninterrupted &

# Let every submission finish before returning to the caller.
wait
9 changes: 9 additions & 0 deletions launch_havannah10_server.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/usr/bin/env bash
# launch_havannah10_server.sh - run the Havannah-10 training server, which
# listens on tcp://*:10023.
#
# The loop is just here in case of crash: whenever the python process ends,
# the next iteration starts it again with the next (smaller) batch size.
#
# The command is written with explicit line continuations. The previous
# version had hard line breaks in the middle of option names, so python was
# started with a truncated argument list and the leftover fragments were run
# as separate commands.
for batch_size in 4096 2048 1024 512 256 128 64; do
  # The endpoint is quoted so the shell never treats "*" as a glob.
  LD_PRELOAD=/private/home/vegardmella/libjemalloc.so.1 \
    python -u -m pypolygames train \
    --max_time=259200 --saving_period=4 \
    --num_game 40 --per_thread_batchsize 12 \
    --device cuda:0 cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 \
    --epoch_len 256 --batchsize "$batch_size" --sync_period 32 \
    --num_rollouts 600 --replay_capacity 100000 --replay_warmup 9000 \
    --do_not_save_replay_buffer --ddp true \
    --checkpoint_dir exps/bighavannah10 \
    --out_feature --game_name havannah10pie \
    --model_name ResConvConvLogitPoolModel \
    --turn_features --bn --nnks 3 --history 2 --nnsize 5 \
    --nb_nets 31 --nb_layers_per_net 6 --nnks 3 \
    --server_listen_endpoint 'tcp://*:10023' \
    --num_game 0 --lr 1e-5
done