-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsimulator.py
More file actions
87 lines (65 loc) · 4.79 KB
/
simulator.py
File metadata and controls
87 lines (65 loc) · 4.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import time
import os
import argparse
import utils
import cluster
def main(args):
    """Run one end-to-end cluster scheduling simulation.

    Sets up a file logger under ``<log_dir>/<experiment_name>/logfile``, loads
    and parses the job trace through ``utils``, builds a ``cluster.Cluster``
    from the node/GPU/CPU counts, resets the trace against that cluster, runs
    ``utils.simulate`` and finally logs the wall-clock execution time.

    Args:
        args: Parsed command-line namespace. This function reads ``log_dir``,
            ``experiment_name``, ``scheduler``, ``num_node``,
            ``interarrival_time``, ``trace_dir``, ``range``,
            ``num_gpus_per_node`` and ``num_cpus_per_node``; the whole
            namespace is also forwarded to the ``utils`` helpers.
    """
    code_start = time.perf_counter()

    # Logger setting.
    log_dir = f"{args.log_dir}/{args.experiment_name}"
    # Always ensure the "logfile" subdirectory exists. Checking only whether
    # log_dir exists skipped creation when log_dir was already present but
    # its "logfile" subdirectory was not.
    # NOTE(review): presumably utils.logger_init opens the file below and
    # needs its parent directory to exist -- confirm.
    os.makedirs(f"{log_dir}/logfile", exist_ok=True)
    logger = utils.logger_init(
        file=f"{log_dir}/logfile/{args.scheduler}_{args.num_node}_{args.interarrival_time}"
    )
    logger.info(args)

    # Load the trace and turn it into the simulator's trace object.
    trace_df, start_ts, trace_typical = utils.get_trace(
        args.experiment_name,
        args.trace_dir,
        read_full=True,
        range=args.range,
        args=args,
    )
    logger.info(f"Total Job Number in Cluster Training: {len(trace_df)}")
    trace = utils.trace_parser(trace_df, args.experiment_name, trace_typical, args)

    # Build the cluster and run the simulation with the chosen scheduler.
    CLUSTER = cluster.Cluster(args.num_node, args.num_gpus_per_node, args.num_cpus_per_node)
    trace.reset_trace(CLUSTER)
    utils.simulate(trace, CLUSTER, log_dir, args.scheduler, logger, start_ts, args)

    logger.info(f"Execution Time: {round(time.perf_counter() - code_start, 2)}s")
def _parse_range(text):
    """Convert a ``START,END`` command-line string into a tuple of two ints.

    ``type=tuple`` would split the raw string into single characters
    (``"0,100"`` -> ``('0', ',', '1', '0', '0')``), so the value is parsed
    explicitly. Surrounding parentheses or brackets are tolerated.

    Raises:
        argparse.ArgumentTypeError: if the text is not two comma-separated
            integers.
    """
    parts = text.strip().strip("()[]").split(",")
    if len(parts) != 2:
        raise argparse.ArgumentTypeError(
            f"expected two comma-separated integers 'START,END', got {text!r}"
        )
    try:
        start, end = (int(part) for part in parts)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"expected two comma-separated integers 'START,END', got {text!r}"
        ) from None
    return start, end


def _str2bool(text):
    """Convert a command-line string into a real boolean.

    ``type=bool`` treats every non-empty string as ``True`` (so
    ``--use_attn False`` enabled the feature); this accepts the usual
    spellings case-insensitively instead.

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognised boolean.
    """
    value = text.strip().lower()
    if value in ("true", "t", "yes", "y", "1", "on"):
        return True
    # The empty string is kept as False: it was the only value that
    # produced False under the previous ``type=bool`` behaviour.
    if value in ("false", "f", "no", "n", "0", "off", ""):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {text!r}")


def build_parser():
    """Build the command-line parser for the simulator.

    Returns:
        argparse.ArgumentParser: parser declaring the general, DRL and
        rescheduler options with their defaults.
    """
    parser = argparse.ArgumentParser(description="Simulator")
    parser.add_argument("--experiment-name", default="PAI", type=str, help="Experiment Name")
    parser.add_argument("--log-dir", default="./log", type=str, help="Log Directory")
    parser.add_argument("--trace-dir", default="./clusterdata", type=str, help="Trace File Directory")
    parser.add_argument("--trace-file", default="sampled_traces.csv", type=str, help="Trace File name")
    parser.add_argument(
        "--range", type=_parse_range, default=(0, 50000), metavar="START,END",
        help="Range of Jobs in Trace File"
    )
    parser.add_argument("--interarrival-time", type=float, default=8, help="control job arrival rate")
    parser.add_argument("--num-node", type=int, default=64, choices=[32, 64, 128], help="Number of nodes")
    parser.add_argument("--num-gpus-per-node", type=int, default=8, help="Number of GPUs per node")
    parser.add_argument("--num-cpus-per-node", type=int, default=96, help="Number of CPU cores per node")
    parser.add_argument(
        "-s", "--scheduler", default="DRR", choices=["ElasticFlow", "R&P", "FGD", "Hops", "DRR"], type=str,
        help="Scheduler Algorithm"
    )
    parser.add_argument("--fgd_scaling", type=float, default=0.2, help="FGD Scaling Factor")

    # DRL
    parser.add_argument("--maxcount", type=int, default=128, help="Max Count of Jobs in One Episode")
    parser.add_argument("--batch_size", type=int, default=128, help="Batch Size for PPO Training")
    parser.add_argument(
        "--init_dim", type=int, default=3584, choices=[1792, 3584, 7168],
        help="Input Dimension of Full Connection Layer"
    )
    parser.add_argument("--hidden_size", type=int, default=512, help="Hidden Size of Full Connection Layer")
    parser.add_argument("--action_space", type=int, default=64, choices=[32, 64, 128], help="Action Space Size")
    parser.add_argument("--lr_actor", type=float, default=0.04, help="Learning Rate of Actor Network")
    parser.add_argument("--lr_critic", type=float, default=0.02, help="Learning Rate of Critic Network")
    parser.add_argument("--gamma", type=float, default=0.9, help="Discount Factor for Reward Calculation")
    parser.add_argument("--k_epochs", type=int, default=10, help="Optimize Policy for K Epochs")
    parser.add_argument("--node_feature", type=int, default=14, help="Node Feature Dimension")
    parser.add_argument("--job_feature", type=int, default=2, help="Job Feature Dimension")
    parser.add_argument("--eps_clip", type=float, default=0.2, help="Clip Parameter")
    parser.add_argument("--action_std_init", type=float, default=0.6)
    parser.add_argument(
        "--use_imitation", type=_str2bool, default=True,
        help="Whether to Use Imitation in Action Selection"
    )
    parser.add_argument("--imitation_loss_weight", type=float, default=0.1, help="Imitation Learning Loss Weight")
    parser.add_argument(
        "--use_dynamic_entropy", type=_str2bool, default=True,
        help="Whether to Use Dynamic Entropy"
    )
    parser.add_argument("--beta0", type=float, default=0.04)
    parser.add_argument("--beta1", type=float, default=0)
    parser.add_argument(
        "--use_attn", type=_str2bool, default=True,
        help="Whether to Use Attention Mechanism in Actor Network"
    )
    parser.add_argument(
        "--use_advantage_adjustment", type=float, default=0.6,
        help="Advantage Adjustment Coefficient, 0 means not using"
    )

    # Rescheduler
    parser.add_argument("--use_rescheduling", type=_str2bool, default=True, help="Whether to Use Rescheduler")
    parser.add_argument("--util_threshold", type=float, default=0.5, help="Utilization Threshold for nodes partition")
    parser.add_argument("--re_time", type=int, default=3600, help="Rescheduling Time Interval (seconds)")
    parser.add_argument("--re_num", type=int, default=5, help="Max Number of Nodes to Reschedule Each Time")
    return parser


if __name__ == "__main__":
    parser = build_parser()
    args = parser.parse_args()
    main(args)