From 10d630e1486e7b5c36e2df82bc2b159b404e7172 Mon Sep 17 00:00:00 2001
From: Aditya Gudimella
Date: Wed, 19 Aug 2020 14:41:17 -0700
Subject: [PATCH 1/2] Updated params for Pendulum

---
 rllib/tests/agents/parameters.py    | 237 ++++++++++++++++------------
 rllib/tests/agents/test_learning.py |   1 +
 2 files changed, 129 insertions(+), 109 deletions(-)

diff --git a/rllib/tests/agents/parameters.py b/rllib/tests/agents/parameters.py
index 1deaac960d2a..83bffed590b8 100644
--- a/rllib/tests/agents/parameters.py
+++ b/rllib/tests/agents/parameters.py
@@ -212,124 +212,143 @@ def astuple(self):
 ] = [
     x.astuple()
     for x in chain(
-        TestAgentParams.for_cart_pole(
-            algorithm=DiscreteActionSpaceAlgorithm.PPO,
-            config_updates={
-                "num_gpus": 2,
-                "_fake_gpus": True,
-                "num_workers": 1,
-                "lr": 0.0003,
-                "observation_filter": "MeanStdFilter",
-                "num_sgd_iter": 6,
-                "vf_share_layers": True,
-                "vf_loss_coeff": 0.01,
-                "model": {"fcnet_hiddens": [32], "fcnet_activation": "linear"},
-            },
-            n_iter=200,
-            threshold=150.0,
-        ),
+        # TestAgentParams.for_cart_pole(
+        #     algorithm=DiscreteActionSpaceAlgorithm.PPO,
+        #     config_updates={
+        #         "num_gpus": 2,
+        #         "_fake_gpus": True,
+        #         "num_workers": 1,
+        #         "lr": 0.0003,
+        #         "observation_filter": "MeanStdFilter",
+        #         "num_sgd_iter": 6,
+        #         "vf_share_layers": True,
+        #         "vf_loss_coeff": 0.01,
+        #         "model": {"fcnet_hiddens": [32], "fcnet_activation": "linear"},
+        #     },
+        #     n_iter=200,
+        #     threshold=150.0,
+        # ),
+        # TestAgentParams.for_pendulum(
+        #     algorithm=ContinuousActionSpaceAlgorithm.APEX_DDPG,
+        #     config_updates={
+        #         "use_huber": True,
+        #         "clip_rewards": False,
+        #         "num_workers": 4,
+        #         "n_step": 1,
+        #         "target_network_update_freq": 50000,
+        #         "tau": 1.0,
+        #     },
+        #     n_iter=200,
+        #     threshold=-750.0,
+        # ),
+        # TestAgentParams.for_cart_pole(
+        #     algorithm=DiscreteActionSpaceAlgorithm.APEX_DQN,
+        #     config_updates={
+        #         "target_network_update_freq": 20000,
+        #         "num_workers": 4,
+        #         "num_envs_per_worker": 8,
+        #         "train_batch_size": 64,
+        #         "gamma": 0.95,
+        #     },
+        #     n_iter=200,
+        #     threshold=150.0,
+        # ),
+        # TestAgentParams.for_cart_pole(
+        #     algorithm=DiscreteActionSpaceAlgorithm.SAC,
+        #     config_updates={
+        #         "num_workers": 4,
+        #         "twin_q": True,
+        #         "soft_horizon": True,
+        #         "clip_actions": False,
+        #         "normalize_actions": True,
+        #         "learning_starts": 0,
+        #         "prioritized_replay": True,
+        #         "Q_model": {"fcnet_hiddens": [64, 64]},
+        #         "policy_model": {"fcnet_hiddens": [64, 64],},
+        #     },
+        #     n_iter=200,
+        #     threshold=100.0,
+        # ),
+        # TestAgentParams.for_cart_pole(
+        #     algorithm=ContinuousActionSpaceAlgorithm.APEX_SAC,
+        #     config_updates={
+        #         "num_workers": 8,
+        #         "prioritized_replay": True,
+        #         "timesteps_per_iteration": 2500,
+        #         "min_iter_time_s": 1,
+        #         "optimizer": {"max_weight_sync_delay": 100},
+        #         "learning_starts": 6000,
+        #         # "exploration_config": {"type": "StochasticSampling"},
+        #         "worker_side_prioritization": True,
+        #         "rollout_fragment_length": 50,
+        #         # "no_done_at_end": True
+        #     },
+        #     # config_updates={
+        #     #     "seed": 42,
+        #     #     "num_workers": 8,
+        #     #     "buffer_size": 200000,
+        #     #     "learning_starts": 6000,
+        #     #     "train_batch_size": 64,
+        #     #     "target_network_update_freq": 0,
+        #     #     "timesteps_per_iteration": 2500,
+        #     #     "min_iter_time_s": 3,
+        #     # },
+        #     n_iter=200,
+        #     threshold=175.0,
+        #     frameworks=[Framework.TensorFlow],
+        # ),
         TestAgentParams.for_pendulum(
-            algorithm=ContinuousActionSpaceAlgorithm.APEX_DDPG,
-            config_updates={
-                "use_huber": True,
-                "clip_rewards": False,
-                "num_workers": 4,
-                "n_step": 1,
-                "target_network_update_freq": 50000,
-                "tau": 1.0,
-            },
-            n_iter=200,
-            threshold=-750.0,
-        ),
-        TestAgentParams.for_cart_pole(
-            algorithm=DiscreteActionSpaceAlgorithm.APEX_DQN,
-            config_updates={
-                "target_network_update_freq": 20000,
-                "num_workers": 4,
-                "num_envs_per_worker": 8,
-                "train_batch_size": 64,
-                "gamma": 0.95,
-            },
-            n_iter=200,
-            threshold=150.0,
-        ),
-        TestAgentParams.for_cart_pole(
-            algorithm=DiscreteActionSpaceAlgorithm.SAC,
-            config_updates={
-                "num_workers": 4,
-                "twin_q": True,
-                "soft_horizon": True,
-                "clip_actions": False,
-                "normalize_actions": True,
-                "learning_starts": 0,
-                "prioritized_replay": True,
-                "Q_model": {"fcnet_hiddens": [64, 64]},
-                "policy_model": {"fcnet_hiddens": [64, 64],},
-            },
-            n_iter=200,
-            threshold=100.0,
-        ),
-        TestAgentParams.for_cart_pole(
             algorithm=ContinuousActionSpaceAlgorithm.APEX_SAC,
             config_updates={
-                "seed": 42,
                 "num_workers": 8,
-                "buffer_size": 200000,
-                "learning_starts": 6000,
-                "train_batch_size": 64,
-                "target_network_update_freq": 0,
-                "timesteps_per_iteration": 2500,
-                "min_iter_time_s": 3,
-            },
-            n_iter=200,
-            threshold=175.0,
-        ),
-        TestAgentParams.for_pendulum(
-            algorithm=ContinuousActionSpaceAlgorithm.APEX_SAC,
-            config_updates={
-                "num_workers": 4,
                 "prioritized_replay": True,
-                "timesteps_per_iteration": 100,
+                "timesteps_per_iteration": 2500,
                 "min_iter_time_s": 1,
-                "optimizer": {"num_replay_buffer_shards": 1},
-                "learning_starts": 0,
-            },
-            n_iter=200,
-        ),
-        TestAgentParams.for_pendulum(
-            algorithm=DiscreteActionSpaceAlgorithm.SAC,
-            config_updates={
-                "horizon": 200,
-                "soft_horizon": True,
-                "Q_model": {"fcnet_activation": "relu", "fcnet_hiddens": [256, 256]},
-                "policy_model": {
-                    "fcnet_activation": "relu",
-                    "fcnet_hiddens": [256, 256],
-                },
-                "tau": 0.005,
-                "target_entropy": "auto",
-                "no_done_at_end": True,
-                "n_step": 1,
+                "optimizer": {"max_weight_sync_delay": 100},
+                "learning_starts": 6000,
+                "exploration_config": {"type": "StochasticSampling"},
+                "worker_side_prioritization": True,
                 "rollout_fragment_length": 1,
-                "prioritized_replay": True,
-                "train_batch_size": 256,
-                "target_network_update_freq": 1,
-                "timesteps_per_iteration": 1000,
-                "learning_starts": 256,
-                "optimization": {
-                    "actor_learning_rate": 0.0003,
-                    "critic_learning_rate": 0.0003,
-                    "entropy_learning_rate": 0.0003,
-                },
-                "num_workers": 4,
-                "num_gpus": 0,
-                "clip_actions": False,
-                "normalize_actions": True,
-                "metrics_smoothing_episodes": 5,
+                "no_done_at_end": True
             },
-            n_iter=200,
-            threshold=-750.0,
+            n_iter=1000,
+            threshold=-350.0,
+            frameworks=[Framework.TensorFlow],
         ),
+        # TestAgentParams.for_pendulum(
+        #     algorithm=DiscreteActionSpaceAlgorithm.SAC,
+        #     config_updates={
+        #         "horizon": 200,
+        #         "soft_horizon": True,
+        #         "Q_model": {"fcnet_activation": "relu", "fcnet_hiddens": [256, 256]},
+        #         "policy_model": {
+        #             "fcnet_activation": "relu",
+        #             "fcnet_hiddens": [256, 256],
+        #         },
+        #         "tau": 0.005,
+        #         "target_entropy": "auto",
+        #         "no_done_at_end": True,
+        #         "n_step": 1,
+        #         "rollout_fragment_length": 1,
+        #         "prioritized_replay": True,
+        #         "train_batch_size": 256,
+        #         "target_network_update_freq": 1,
+        #         "timesteps_per_iteration": 1000,
+        #         "learning_starts": 256,
+        #         "optimization": {
+        #             "actor_learning_rate": 0.0003,
+        #             "critic_learning_rate": 0.0003,
+        #             "entropy_learning_rate": 0.0003,
+        #         },
+        #         "num_workers": 4,
+        #         "num_gpus": 0,
+        #         "clip_actions": False,
+        #         "normalize_actions": True,
+        #         "metrics_smoothing_episodes": 5,
+        #     },
+        #     n_iter=200,
+        #     threshold=-750.0,
+        # ),
     )
 ]
diff --git a/rllib/tests/agents/test_learning.py b/rllib/tests/agents/test_learning.py
index 3e55f79bf583..99d905053240 100644
--- a/rllib/tests/agents/test_learning.py
+++ b/rllib/tests/agents/test_learning.py
@@ -80,6 +80,7 @@ def test_monotonically_improving_algorithms_can_converge_with_different_framewor
     for i in range(n_iter):
         results = trainer.train()
         episode_reward_mean = results["episode_reward_mean"]
+        print(episode_reward_mean)
         if episode_reward_mean >= threshold:
             learnt = True
             break

From 150f9ea81af05aa7bb2d61056ccd4314d6450696 Mon Sep 17 00:00:00 2001
From: Aditya Gudimella
Date: Mon, 24 Aug 2020 12:51:50 -0700
Subject: [PATCH 2/2] Tuned hyperparams for ApexSAC

---
 rllib/agents/sac/apex.py                   |   8 +-
 rllib/optimizers/async_replay_optimizer.py |  31 ++-
 rllib/tests/agents/parameters.py           | 241 +++++++++++-----------
 rllib/tests/agents/test_learning.py        |   1 -
 4 files changed, 135 insertions(+), 146 deletions(-)

diff --git a/rllib/agents/sac/apex.py b/rllib/agents/sac/apex.py
index f60c4287431a..78618ffc0d93 100644
--- a/rllib/agents/sac/apex.py
+++ b/rllib/agents/sac/apex.py
@@ -17,15 +17,15 @@
         "n_step": 1,
         "num_gpus": 0,
         "num_workers": 32,
-        "buffer_size": 2000000,
-        "learning_starts": 50000,
+        "buffer_size": 200000,
+        "learning_starts": 5000,
         "train_batch_size": 512,
         "rollout_fragment_length": 50,
         "target_network_update_freq": 0,
-        "timesteps_per_iteration": 25000,
+        "timesteps_per_iteration": 1000,
         "exploration_config": {"type": "PerWorkerEpsilonGreedy"},
         "worker_side_prioritization": True,
-        "min_iter_time_s": 30,
+        "min_iter_time_s": 10,
         "prioritized_replay": True,
     },
 )
diff --git a/rllib/optimizers/async_replay_optimizer.py b/rllib/optimizers/async_replay_optimizer.py
index eec759d13e65..056beac37275 100644
--- a/rllib/optimizers/async_replay_optimizer.py
+++ b/rllib/optimizers/async_replay_optimizer.py
@@ -474,20 +474,27 @@ def step(self):
             except KeyError:
                 pass
             else:
-                is_box_action_space = all((
-                    hasattr(self.local_worker, "policy_map"),
-                    self.local_worker.policy_map.get(
-                        "default_policy", None
-                    ) is not None,
-                    isinstance(
-                        self.local_worker.policy_map[
-                            "default_policy"
-                        ].action_space,
-                        gym.spaces.Box
+                is_box_action_space = all(
+                    (
+                        hasattr(self.local_worker, "policy_map"),
+                        self.local_worker.policy_map.get("default_policy", None)
+                        is not None,
+                        isinstance(
+                            self.local_worker.policy_map[
+                                "default_policy"
+                            ].action_space,
+                            gym.spaces.Box,
+                        ),
                     )
-                ))
+                )
                 if is_box_action_space:
-                    batch["actions"] = batch["actions"].reshape((-1, 1))
+                    # Reshape to (batch_size, action_space_dim)
+                    action_space = self.local_worker.policy_map[
+                        "default_policy"
+                    ].action_space
+                    batch["actions"] = batch["actions"].reshape(
+                        (-1, action_space.shape[0])
+                    )
                 grad_out = self.local_worker.learn_on_batch(replay)
                 for pid, info in grad_out.items():
                     td_error = info.get(
diff --git a/rllib/tests/agents/parameters.py b/rllib/tests/agents/parameters.py
index 83bffed590b8..94bf9722d09e 100644
--- a/rllib/tests/agents/parameters.py
+++ b/rllib/tests/agents/parameters.py
@@ -212,143 +212,126 @@ def astuple(self):
 ] = [
     x.astuple()
     for x in chain(
-        # TestAgentParams.for_cart_pole(
-        #     algorithm=DiscreteActionSpaceAlgorithm.PPO,
-        #     config_updates={
-        #         "num_gpus": 2,
-        #         "_fake_gpus": True,
-        #         "num_workers": 1,
-        #         "lr": 0.0003,
-        #         "observation_filter": "MeanStdFilter",
-        #         "num_sgd_iter": 6,
-        #         "vf_share_layers": True,
-        #         "vf_loss_coeff": 0.01,
-        #         "model": {"fcnet_hiddens": [32], "fcnet_activation": "linear"},
-        #     },
-        #     n_iter=200,
-        #     threshold=150.0,
-        # ),
-        # TestAgentParams.for_pendulum(
-        #     algorithm=ContinuousActionSpaceAlgorithm.APEX_DDPG,
-        #     config_updates={
-        #         "use_huber": True,
-        #         "clip_rewards": False,
-        #         "num_workers": 4,
-        #         "n_step": 1,
-        #         "target_network_update_freq": 50000,
-        #         "tau": 1.0,
-        #     },
-        #     n_iter=200,
-        #     threshold=-750.0,
-        # ),
-        # TestAgentParams.for_cart_pole(
-        #     algorithm=DiscreteActionSpaceAlgorithm.APEX_DQN,
-        #     config_updates={
-        #         "target_network_update_freq": 20000,
-        #         "num_workers": 4,
-        #         "num_envs_per_worker": 8,
-        #         "train_batch_size": 64,
-        #         "gamma": 0.95,
-        #     },
-        #     n_iter=200,
-        #     threshold=150.0,
-        # ),
-        # TestAgentParams.for_cart_pole(
-        #     algorithm=DiscreteActionSpaceAlgorithm.SAC,
-        #     config_updates={
-        #         "num_workers": 4,
-        #         "twin_q": True,
-        #         "soft_horizon": True,
-        #         "clip_actions": False,
-        #         "normalize_actions": True,
-        #         "learning_starts": 0,
-        #         "prioritized_replay": True,
-        #         "Q_model": {"fcnet_hiddens": [64, 64]},
-        #         "policy_model": {"fcnet_hiddens": [64, 64],},
-        #     },
-        #     n_iter=200,
-        #     threshold=100.0,
-        # ),
-        # TestAgentParams.for_cart_pole(
-        #     algorithm=ContinuousActionSpaceAlgorithm.APEX_SAC,
-        #     config_updates={
-        #         "num_workers": 8,
-        #         "prioritized_replay": True,
-        #         "timesteps_per_iteration": 2500,
-        #         "min_iter_time_s": 1,
-        #         "optimizer": {"max_weight_sync_delay": 100},
-        #         "learning_starts": 6000,
-        #         # "exploration_config": {"type": "StochasticSampling"},
-        #         "worker_side_prioritization": True,
-        #         "rollout_fragment_length": 50,
-        #         # "no_done_at_end": True
-        #     },
-        #     # config_updates={
-        #     #     "seed": 42,
-        #     #     "num_workers": 8,
-        #     #     "buffer_size": 200000,
-        #     #     "learning_starts": 6000,
-        #     #     "train_batch_size": 64,
-        #     #     "target_network_update_freq": 0,
-        #     #     "timesteps_per_iteration": 2500,
-        #     #     "min_iter_time_s": 3,
-        #     # },
-        #     n_iter=200,
-        #     threshold=175.0,
-        #     frameworks=[Framework.TensorFlow],
-        # ),
+        TestAgentParams.for_cart_pole(
+            algorithm=DiscreteActionSpaceAlgorithm.PPO,
+            config_updates={
+                "num_gpus": 2,
+                "_fake_gpus": True,
+                "num_workers": 1,
+                "lr": 0.0003,
+                "observation_filter": "MeanStdFilter",
+                "num_sgd_iter": 6,
+                "vf_share_layers": True,
+                "vf_loss_coeff": 0.01,
+                "model": {"fcnet_hiddens": [32], "fcnet_activation": "linear"},
+            },
+            n_iter=200,
+            threshold=150.0,
+        ),
+        TestAgentParams.for_pendulum(
+            algorithm=ContinuousActionSpaceAlgorithm.APEX_DDPG,
+            config_updates={
+                "use_huber": True,
+                "clip_rewards": False,
+                "num_workers": 4,
+                "n_step": 1,
+                "target_network_update_freq": 50000,
+                "tau": 1.0,
+            },
+            n_iter=200,
+            threshold=-750.0,
+        ),
+        TestAgentParams.for_cart_pole(
+            algorithm=DiscreteActionSpaceAlgorithm.APEX_DQN,
+            config_updates={
+                "target_network_update_freq": 20000,
+                "num_workers": 4,
+                "num_envs_per_worker": 8,
+                "train_batch_size": 64,
+                "gamma": 0.95,
+            },
+            n_iter=200,
+            threshold=150.0,
+        ),
+        TestAgentParams.for_cart_pole(
+            algorithm=DiscreteActionSpaceAlgorithm.SAC,
+            config_updates={
+                "num_workers": 4,
+                "twin_q": True,
+                "soft_horizon": True,
+                "clip_actions": False,
+                "normalize_actions": True,
+                "learning_starts": 0,
+                "prioritized_replay": True,
+                "Q_model": {"fcnet_hiddens": [64, 64]},
+                "policy_model": {"fcnet_hiddens": [64, 64],},
+            },
+            n_iter=200,
+            threshold=100.0,
+        ),
+        TestAgentParams.for_cart_pole(
+            algorithm=ContinuousActionSpaceAlgorithm.APEX_SAC,
+            config_updates={
+                "num_workers": 8,
+            },
+            n_iter=100,
+            threshold=175.0,
+        ),
         TestAgentParams.for_pendulum(
             algorithm=ContinuousActionSpaceAlgorithm.APEX_SAC,
             config_updates={
                 "num_workers": 8,
-                "prioritized_replay": True,
-                "timesteps_per_iteration": 2500,
-                "min_iter_time_s": 1,
-                "optimizer": {"max_weight_sync_delay": 100},
-                "learning_starts": 6000,
                 "exploration_config": {"type": "StochasticSampling"},
-                "worker_side_prioritization": True,
-                "rollout_fragment_length": 1,
-                "no_done_at_end": True
+                "no_done_at_end": True,
             },
-            n_iter=1000,
+            n_iter=100,
             threshold=-350.0,
-            frameworks=[Framework.TensorFlow],
         ),
-        # TestAgentParams.for_pendulum(
-        #     algorithm=DiscreteActionSpaceAlgorithm.SAC,
-        #     config_updates={
-        #         "horizon": 200,
-        #         "soft_horizon": True,
-        #         "Q_model": {"fcnet_activation": "relu", "fcnet_hiddens": [256, 256]},
-        #         "policy_model": {
-        #             "fcnet_activation": "relu",
-        #             "fcnet_hiddens": [256, 256],
-        #         },
-        #         "tau": 0.005,
-        #         "target_entropy": "auto",
-        #         "no_done_at_end": True,
-        #         "n_step": 1,
-        #         "rollout_fragment_length": 1,
-        #         "prioritized_replay": True,
-        #         "train_batch_size": 256,
-        #         "target_network_update_freq": 1,
-        #         "timesteps_per_iteration": 1000,
-        #         "learning_starts": 256,
-        #         "optimization": {
-        #             "actor_learning_rate": 0.0003,
-        #             "critic_learning_rate": 0.0003,
-        #             "entropy_learning_rate": 0.0003,
-        #         },
-        #         "num_workers": 4,
-        #         "num_gpus": 0,
-        #         "clip_actions": False,
-        #         "normalize_actions": True,
-        #         "metrics_smoothing_episodes": 5,
-        #     },
-        #     n_iter=200,
-        #     threshold=-750.0,
-        # ),
+        TestAgentParams.for_frameworks(
+            algorithm=ContinuousActionSpaceAlgorithm.APEX_SAC,
+            config_updates={
+                "num_workers": 8,
+                "exploration_config": {"type": "StochasticSampling"},
+            },
+            env="MountainCarContinuous-v0",
+            # TensorFlow returns Nan mean ep reward for the first few epoch
+            n_iter=100,
+            threshold=100.0,
+        ),
+        TestAgentParams.for_pendulum(
+            algorithm=DiscreteActionSpaceAlgorithm.SAC,
+            config_updates={
+                "horizon": 200,
+                "soft_horizon": True,
+                "Q_model": {"fcnet_activation": "relu", "fcnet_hiddens": [256, 256]},
+                "policy_model": {
+                    "fcnet_activation": "relu",
+                    "fcnet_hiddens": [256, 256],
+                },
+                "tau": 0.005,
+                "target_entropy": "auto",
+                "no_done_at_end": True,
+                "n_step": 1,
+                "rollout_fragment_length": 1,
+                "prioritized_replay": True,
+                "train_batch_size": 256,
+                "target_network_update_freq": 1,
+                "timesteps_per_iteration": 1000,
+                "learning_starts": 256,
+                "optimization": {
+                    "actor_learning_rate": 0.0003,
+                    "critic_learning_rate": 0.0003,
+                    "entropy_learning_rate": 0.0003,
+                },
+                "num_workers": 4,
+                "num_gpus": 0,
+                "clip_actions": False,
+                "normalize_actions": True,
+                "metrics_smoothing_episodes": 5,
+            },
+            n_iter=200,
+            threshold=-750.0,
+        ),
     )
 ]
diff --git a/rllib/tests/agents/test_learning.py b/rllib/tests/agents/test_learning.py
index 99d905053240..3e55f79bf583 100644
--- a/rllib/tests/agents/test_learning.py
+++ b/rllib/tests/agents/test_learning.py
@@ -80,7 +80,6 @@ def test_monotonically_improving_algorithms_can_converge_with_different_framewor
     for i in range(n_iter):
         results = trainer.train()
         episode_reward_mean = results["episode_reward_mean"]
-        print(episode_reward_mean)
         if episode_reward_mean >= threshold:
             learnt = True
             break