From 10d630e1486e7b5c36e2df82bc2b159b404e7172 Mon Sep 17 00:00:00 2001
From: Aditya Gudimella
Date: Wed, 19 Aug 2020 14:41:17 -0700
Subject: [PATCH 1/2] Updated params for Pendulum

---
 rllib/tests/agents/parameters.py    | 237 ++++++++++++++++------------
 rllib/tests/agents/test_learning.py |   1 +
 2 files changed, 129 insertions(+), 109 deletions(-)

diff --git a/rllib/tests/agents/parameters.py b/rllib/tests/agents/parameters.py
index 1deaac960d2a..83bffed590b8 100644
--- a/rllib/tests/agents/parameters.py
+++ b/rllib/tests/agents/parameters.py
@@ -212,124 +212,143 @@ def astuple(self):
 ] = [
     x.astuple()
     for x in chain(
-        TestAgentParams.for_cart_pole(
-            algorithm=DiscreteActionSpaceAlgorithm.PPO,
-            config_updates={
-                "num_gpus": 2,
-                "_fake_gpus": True,
-                "num_workers": 1,
-                "lr": 0.0003,
-                "observation_filter": "MeanStdFilter",
-                "num_sgd_iter": 6,
-                "vf_share_layers": True,
-                "vf_loss_coeff": 0.01,
-                "model": {"fcnet_hiddens": [32], "fcnet_activation": "linear"},
-            },
-            n_iter=200,
-            threshold=150.0,
-        ),
+        # TestAgentParams.for_cart_pole(
+        #     algorithm=DiscreteActionSpaceAlgorithm.PPO,
+        #     config_updates={
+        #         "num_gpus": 2,
+        #         "_fake_gpus": True,
+        #         "num_workers": 1,
+        #         "lr": 0.0003,
+        #         "observation_filter": "MeanStdFilter",
+        #         "num_sgd_iter": 6,
+        #         "vf_share_layers": True,
+        #         "vf_loss_coeff": 0.01,
+        #         "model": {"fcnet_hiddens": [32], "fcnet_activation": "linear"},
+        #     },
+        #     n_iter=200,
+        #     threshold=150.0,
+        # ),
+        # TestAgentParams.for_pendulum(
+        #     algorithm=ContinuousActionSpaceAlgorithm.APEX_DDPG,
+        #     config_updates={
+        #         "use_huber": True,
+        #         "clip_rewards": False,
+        #         "num_workers": 4,
+        #         "n_step": 1,
+        #         "target_network_update_freq": 50000,
+        #         "tau": 1.0,
+        #     },
+        #     n_iter=200,
+        #     threshold=-750.0,
+        # ),
+        # TestAgentParams.for_cart_pole(
+        #     algorithm=DiscreteActionSpaceAlgorithm.APEX_DQN,
+        #     config_updates={
+        #         "target_network_update_freq": 20000,
+        #         "num_workers": 4,
+        #         "num_envs_per_worker": 8,
+        #         "train_batch_size": 64,
+        #         "gamma": 0.95,
+        #     },
+        #     n_iter=200,
+        #     threshold=150.0,
+        # ),
+        # TestAgentParams.for_cart_pole(
+        #     algorithm=DiscreteActionSpaceAlgorithm.SAC,
+        #     config_updates={
+        #         "num_workers": 4,
+        #         "twin_q": True,
+        #         "soft_horizon": True,
+        #         "clip_actions": False,
+        #         "normalize_actions": True,
+        #         "learning_starts": 0,
+        #         "prioritized_replay": True,
+        #         "Q_model": {"fcnet_hiddens": [64, 64]},
+        #         "policy_model": {"fcnet_hiddens": [64, 64],},
+        #     },
+        #     n_iter=200,
+        #     threshold=100.0,
+        # ),
+        # TestAgentParams.for_cart_pole(
+        #     algorithm=ContinuousActionSpaceAlgorithm.APEX_SAC,
+        #     config_updates={
+        #         "num_workers": 8,
+        #         "prioritized_replay": True,
+        #         "timesteps_per_iteration": 2500,
+        #         "min_iter_time_s": 1,
+        #         "optimizer": {"max_weight_sync_delay": 100},
+        #         "learning_starts": 6000,
+        #         # "exploration_config": {"type": "StochasticSampling"},
+        #         "worker_side_prioritization": True,
+        #         "rollout_fragment_length": 50,
+        #         # "no_done_at_end": True
+        #     },
+        #     # config_updates={
+        #     #     "seed": 42,
+        #     #     "num_workers": 8,
+        #     #     "buffer_size": 200000,
+        #     #     "learning_starts": 6000,
+        #     #     "train_batch_size": 64,
+        #     #     "target_network_update_freq": 0,
+        #     #     "timesteps_per_iteration": 2500,
+        #     #     "min_iter_time_s": 3,
+        #     # },
+        #     n_iter=200,
+        #     threshold=175.0,
+        #     frameworks=[Framework.TensorFlow],
+        # ),
         TestAgentParams.for_pendulum(
-            algorithm=ContinuousActionSpaceAlgorithm.APEX_DDPG,
-            config_updates={
-                "use_huber": True,
-                "clip_rewards": False,
-                "num_workers": 4,
-                "n_step": 1,
-                "target_network_update_freq": 50000,
-                "tau": 1.0,
-            },
-            n_iter=200,
-            threshold=-750.0,
-        ),
-        TestAgentParams.for_cart_pole(
-            algorithm=DiscreteActionSpaceAlgorithm.APEX_DQN,
-            config_updates={
-                "target_network_update_freq": 20000,
-                "num_workers": 4,
-                "num_envs_per_worker": 8,
-                "train_batch_size": 64,
-                "gamma": 0.95,
-            },
-            n_iter=200,
-            threshold=150.0,
-        ),
-        TestAgentParams.for_cart_pole(
-            algorithm=DiscreteActionSpaceAlgorithm.SAC,
-            config_updates={
-                "num_workers": 4,
-                "twin_q": True,
-                "soft_horizon": True,
-                "clip_actions": False,
-                "normalize_actions": True,
-                "learning_starts": 0,
-                "prioritized_replay": True,
-                "Q_model": {"fcnet_hiddens": [64, 64]},
-                "policy_model": {"fcnet_hiddens": [64, 64],},
-            },
-            n_iter=200,
-            threshold=100.0,
-        ),
-        TestAgentParams.for_cart_pole(
             algorithm=ContinuousActionSpaceAlgorithm.APEX_SAC,
             config_updates={
-                "seed": 42,
                 "num_workers": 8,
-                "buffer_size": 200000,
-                "learning_starts": 6000,
-                "train_batch_size": 64,
-                "target_network_update_freq": 0,
-                "timesteps_per_iteration": 2500,
-                "min_iter_time_s": 3,
-            },
-            n_iter=200,
-            threshold=175.0,
-        ),
-        TestAgentParams.for_pendulum(
-            algorithm=ContinuousActionSpaceAlgorithm.APEX_SAC,
-            config_updates={
-                "num_workers": 4,
                 "prioritized_replay": True,
-                "timesteps_per_iteration": 100,
+                "timesteps_per_iteration": 2500,
                 "min_iter_time_s": 1,
-                "optimizer": {"num_replay_buffer_shards": 1},
-                "learning_starts": 0,
-            },
-            n_iter=200,
-        ),
-        TestAgentParams.for_pendulum(
-            algorithm=DiscreteActionSpaceAlgorithm.SAC,
-            config_updates={
-                "horizon": 200,
-                "soft_horizon": True,
-                "Q_model": {"fcnet_activation": "relu", "fcnet_hiddens": [256, 256]},
-                "policy_model": {
-                    "fcnet_activation": "relu",
-                    "fcnet_hiddens": [256, 256],
-                },
-                "tau": 0.005,
-                "target_entropy": "auto",
-                "no_done_at_end": True,
-                "n_step": 1,
+                "optimizer": {"max_weight_sync_delay": 100},
+                "learning_starts": 6000,
+                "exploration_config": {"type": "StochasticSampling"},
+                "worker_side_prioritization": True,
                 "rollout_fragment_length": 1,
-                "prioritized_replay": True,
-                "train_batch_size": 256,
-                "target_network_update_freq": 1,
-                "timesteps_per_iteration": 1000,
-                "learning_starts": 256,
-                "optimization": {
-                    "actor_learning_rate": 0.0003,
-                    "critic_learning_rate": 0.0003,
-                    "entropy_learning_rate": 0.0003,
-                },
-                "num_workers": 4,
-                "num_gpus": 0,
-                "clip_actions": False,
-                "normalize_actions": True,
-                "metrics_smoothing_episodes": 5,
+                "no_done_at_end": True
             },
-            n_iter=200,
-            threshold=-750.0,
+            n_iter=1000,
+            threshold=-350.0,
+            frameworks=[Framework.TensorFlow],
         ),
+        # TestAgentParams.for_pendulum(
+        #     algorithm=DiscreteActionSpaceAlgorithm.SAC,
+        #     config_updates={
+        #         "horizon": 200,
+        #         "soft_horizon": True,
+        #         "Q_model": {"fcnet_activation": "relu", "fcnet_hiddens": [256, 256]},
+        #         "policy_model": {
+        #             "fcnet_activation": "relu",
+        #             "fcnet_hiddens": [256, 256],
+        #         },
+        #         "tau": 0.005,
+        #         "target_entropy": "auto",
+        #         "no_done_at_end": True,
+        #         "n_step": 1,
+        #         "rollout_fragment_length": 1,
+        #         "prioritized_replay": True,
+        #         "train_batch_size": 256,
+        #         "target_network_update_freq": 1,
+        #         "timesteps_per_iteration": 1000,
+        #         "learning_starts": 256,
+        #         "optimization": {
+        #             "actor_learning_rate": 0.0003,
+        #             "critic_learning_rate": 0.0003,
+        #             "entropy_learning_rate": 0.0003,
+        #         },
+        #         "num_workers": 4,
+        #         "num_gpus": 0,
+        #         "clip_actions": False,
+        #         "normalize_actions": True,
+        #         "metrics_smoothing_episodes": 5,
+        #     },
+        #     n_iter=200,
+        #     threshold=-750.0,
+        # ),
     )
 ]
diff --git a/rllib/tests/agents/test_learning.py b/rllib/tests/agents/test_learning.py
index 3e55f79bf583..99d905053240 100644
--- a/rllib/tests/agents/test_learning.py
+++ b/rllib/tests/agents/test_learning.py
@@ -80,6 +80,7 @@ def test_monotonically_improving_algorithms_can_converge_with_different_framewor
     for i in range(n_iter):
         results = trainer.train()
         episode_reward_mean = results["episode_reward_mean"]
+        print(episode_reward_mean)
         if episode_reward_mean >= threshold:
             learnt = True
             break

From 150f9ea81af05aa7bb2d61056ccd4314d6450696 Mon Sep 17 00:00:00 2001
From: Aditya Gudimella
Date: Mon, 24 Aug 2020 12:51:50 -0700
Subject: [PATCH 2/2] Tuned hyperparams for ApexSAC

---
 rllib/agents/sac/apex.py                   |   8 +-
 rllib/optimizers/async_replay_optimizer.py |  31 ++-
 rllib/tests/agents/parameters.py           | 241 +++++++++++-----------
 rllib/tests/agents/test_learning.py        |   1 -
 4 files changed, 135 insertions(+), 146 deletions(-)

diff --git a/rllib/agents/sac/apex.py b/rllib/agents/sac/apex.py
index f60c4287431a..78618ffc0d93 100644
--- a/rllib/agents/sac/apex.py
+++ b/rllib/agents/sac/apex.py
@@ -17,15 +17,15 @@
         "n_step": 1,
         "num_gpus": 0,
         "num_workers": 32,
-        "buffer_size": 2000000,
-        "learning_starts": 50000,
+        "buffer_size": 200000,
+        "learning_starts": 5000,
         "train_batch_size": 512,
         "rollout_fragment_length": 50,
         "target_network_update_freq": 0,
-        "timesteps_per_iteration": 25000,
+        "timesteps_per_iteration": 1000,
         "exploration_config": {"type": "PerWorkerEpsilonGreedy"},
         "worker_side_prioritization": True,
-        "min_iter_time_s": 30,
+        "min_iter_time_s": 10,
         "prioritized_replay": True,
     },
 )
diff --git a/rllib/optimizers/async_replay_optimizer.py b/rllib/optimizers/async_replay_optimizer.py
index eec759d13e65..056beac37275 100644
--- a/rllib/optimizers/async_replay_optimizer.py
+++ b/rllib/optimizers/async_replay_optimizer.py
@@ -474,20 +474,27 @@ def step(self):
             except KeyError:
                 pass
             else:
-                is_box_action_space = all((
-                    hasattr(self.local_worker, "policy_map"),
-                    self.local_worker.policy_map.get(
-                        "default_policy", None
-                    ) is not None,
-                    isinstance(
-                        self.local_worker.policy_map[
-                            "default_policy"
-                        ].action_space,
-                        gym.spaces.Box
+                is_box_action_space = all(
+                    (
+                        hasattr(self.local_worker, "policy_map"),
+                        self.local_worker.policy_map.get("default_policy", None)
+                        is not None,
+                        isinstance(
+                            self.local_worker.policy_map[
+                                "default_policy"
+                            ].action_space,
+                            gym.spaces.Box,
+                        ),
                     )
-                ))
+                )
                 if is_box_action_space:
-                    batch["actions"] = batch["actions"].reshape((-1, 1))
+                    # Reshape to (batch_size, action_space_dim)
+                    action_space = self.local_worker.policy_map[
+                        "default_policy"
+                    ].action_space
+                    batch["actions"] = batch["actions"].reshape(
+                        (-1, action_space.shape[0])
+                    )
                 grad_out = self.local_worker.learn_on_batch(replay)
                 for pid, info in grad_out.items():
                     td_error = info.get(
diff --git a/rllib/tests/agents/parameters.py b/rllib/tests/agents/parameters.py
index 83bffed590b8..94bf9722d09e 100644
--- a/rllib/tests/agents/parameters.py
+++ b/rllib/tests/agents/parameters.py
@@ -212,143 +212,126 @@ def astuple(self):
 ] = [
     x.astuple()
     for x in chain(
-        # TestAgentParams.for_cart_pole(
-        #     algorithm=DiscreteActionSpaceAlgorithm.PPO,
-        #     config_updates={
-        #         "num_gpus": 2,
-        #         "_fake_gpus": True,
-        #         "num_workers": 1,
-        #         "lr": 0.0003,
-        #         "observation_filter": "MeanStdFilter",
-        #         "num_sgd_iter": 6,
-        #         "vf_share_layers": True,
-        #         "vf_loss_coeff": 0.01,
-        #         "model": {"fcnet_hiddens": [32], "fcnet_activation": "linear"},
-        #     },
-        #     n_iter=200,
-        #     threshold=150.0,
-        # ),
-        # TestAgentParams.for_pendulum(
-        #     algorithm=ContinuousActionSpaceAlgorithm.APEX_DDPG,
-        #     config_updates={
-        #         "use_huber": True,
-        #         "clip_rewards": False,
-        #         "num_workers": 4,
-        #         "n_step": 1,
-        #         "target_network_update_freq": 50000,
-        #         "tau": 1.0,
-        #     },
-        #     n_iter=200,
-        #     threshold=-750.0,
-        # ),
-        # TestAgentParams.for_cart_pole(
-        #     algorithm=DiscreteActionSpaceAlgorithm.APEX_DQN,
-        #     config_updates={
-        #         "target_network_update_freq": 20000,
-        #         "num_workers": 4,
-        #         "num_envs_per_worker": 8,
-        #         "train_batch_size": 64,
-        #         "gamma": 0.95,
-        #     },
-        #     n_iter=200,
-        #     threshold=150.0,
-        # ),
-        # TestAgentParams.for_cart_pole(
-        #     algorithm=DiscreteActionSpaceAlgorithm.SAC,
-        #     config_updates={
-        #         "num_workers": 4,
-        #         "twin_q": True,
-        #         "soft_horizon": True,
-        #         "clip_actions": False,
-        #         "normalize_actions": True,
-        #         "learning_starts": 0,
-        #         "prioritized_replay": True,
-        #         "Q_model": {"fcnet_hiddens": [64, 64]},
-        #         "policy_model": {"fcnet_hiddens": [64, 64],},
-        #     },
-        #     n_iter=200,
-        #     threshold=100.0,
-        # ),
-        # TestAgentParams.for_cart_pole(
-        #     algorithm=ContinuousActionSpaceAlgorithm.APEX_SAC,
-        #     config_updates={
-        #         "num_workers": 8,
-        #         "prioritized_replay": True,
-        #         "timesteps_per_iteration": 2500,
-        #         "min_iter_time_s": 1,
-        #         "optimizer": {"max_weight_sync_delay": 100},
-        #         "learning_starts": 6000,
-        #         # "exploration_config": {"type": "StochasticSampling"},
-        #         "worker_side_prioritization": True,
-        #         "rollout_fragment_length": 50,
-        #         # "no_done_at_end": True
-        #     },
-        #     # config_updates={
-        #     #     "seed": 42,
-        #     #     "num_workers": 8,
-        #     #     "buffer_size": 200000,
-        #     #     "learning_starts": 6000,
-        #     #     "train_batch_size": 64,
-        #     #     "target_network_update_freq": 0,
-        #     #     "timesteps_per_iteration": 2500,
-        #     #     "min_iter_time_s": 3,
-        #     # },
-        #     n_iter=200,
-        #     threshold=175.0,
-        #     frameworks=[Framework.TensorFlow],
-        # ),
+        TestAgentParams.for_cart_pole(
+            algorithm=DiscreteActionSpaceAlgorithm.PPO,
+            config_updates={
+                "num_gpus": 2,
+                "_fake_gpus": True,
+                "num_workers": 1,
+                "lr": 0.0003,
+                "observation_filter": "MeanStdFilter",
+                "num_sgd_iter": 6,
+                "vf_share_layers": True,
+                "vf_loss_coeff": 0.01,
+                "model": {"fcnet_hiddens": [32], "fcnet_activation": "linear"},
+            },
+            n_iter=200,
+            threshold=150.0,
+        ),
+        TestAgentParams.for_pendulum(
+            algorithm=ContinuousActionSpaceAlgorithm.APEX_DDPG,
+            config_updates={
+                "use_huber": True,
+                "clip_rewards": False,
+                "num_workers": 4,
+                "n_step": 1,
+                "target_network_update_freq": 50000,
+                "tau": 1.0,
+            },
+            n_iter=200,
+            threshold=-750.0,
+        ),
+        TestAgentParams.for_cart_pole(
+            algorithm=DiscreteActionSpaceAlgorithm.APEX_DQN,
+            config_updates={
+                "target_network_update_freq": 20000,
+                "num_workers": 4,
+                "num_envs_per_worker": 8,
+                "train_batch_size": 64,
+                "gamma": 0.95,
+            },
+            n_iter=200,
+            threshold=150.0,
+        ),
+        TestAgentParams.for_cart_pole(
+            algorithm=DiscreteActionSpaceAlgorithm.SAC,
+            config_updates={
+                "num_workers": 4,
+                "twin_q": True,
+                "soft_horizon": True,
+                "clip_actions": False,
+                "normalize_actions": True,
+                "learning_starts": 0,
+                "prioritized_replay": True,
+                "Q_model": {"fcnet_hiddens": [64, 64]},
+                "policy_model": {"fcnet_hiddens": [64, 64],},
+            },
+            n_iter=200,
+            threshold=100.0,
+        ),
+        TestAgentParams.for_cart_pole(
+            algorithm=ContinuousActionSpaceAlgorithm.APEX_SAC,
+            config_updates={
+                "num_workers": 8,
+            },
+            n_iter=100,
+            threshold=175.0,
+        ),
         TestAgentParams.for_pendulum(
             algorithm=ContinuousActionSpaceAlgorithm.APEX_SAC,
             config_updates={
                 "num_workers": 8,
-                "prioritized_replay": True,
-                "timesteps_per_iteration": 2500,
-                "min_iter_time_s": 1,
-                "optimizer": {"max_weight_sync_delay": 100},
-                "learning_starts": 6000,
                 "exploration_config": {"type": "StochasticSampling"},
-                "worker_side_prioritization": True,
-                "rollout_fragment_length": 1,
-                "no_done_at_end": True
+                "no_done_at_end": True,
             },
-            n_iter=1000,
+            n_iter=100,
             threshold=-350.0,
-            frameworks=[Framework.TensorFlow],
         ),
-        # TestAgentParams.for_pendulum(
-        #     algorithm=DiscreteActionSpaceAlgorithm.SAC,
-        #     config_updates={
-        #         "horizon": 200,
-        #         "soft_horizon": True,
-        #         "Q_model": {"fcnet_activation": "relu", "fcnet_hiddens": [256, 256]},
-        #         "policy_model": {
-        #             "fcnet_activation": "relu",
-        #             "fcnet_hiddens": [256, 256],
-        #         },
-        #         "tau": 0.005,
-        #         "target_entropy": "auto",
-        #         "no_done_at_end": True,
-        #         "n_step": 1,
-        #         "rollout_fragment_length": 1,
-        #         "prioritized_replay": True,
-        #         "train_batch_size": 256,
-        #         "target_network_update_freq": 1,
-        #         "timesteps_per_iteration": 1000,
-        #         "learning_starts": 256,
-        #         "optimization": {
-        #             "actor_learning_rate": 0.0003,
-        #             "critic_learning_rate": 0.0003,
-        #             "entropy_learning_rate": 0.0003,
-        #         },
-        #         "num_workers": 4,
-        #         "num_gpus": 0,
-        #         "clip_actions": False,
-        #         "normalize_actions": True,
-        #         "metrics_smoothing_episodes": 5,
-        #     },
-        #     n_iter=200,
-        #     threshold=-750.0,
-        # ),
+        TestAgentParams.for_frameworks(
+            algorithm=ContinuousActionSpaceAlgorithm.APEX_SAC,
+            config_updates={
+                "num_workers": 8,
+                "exploration_config": {"type": "StochasticSampling"},
+            },
+            env="MountainCarContinuous-v0",
+            # TensorFlow returns Nan mean ep reward for the first few epoch
+            n_iter=100,
+            threshold=100.0,
+        ),
+        TestAgentParams.for_pendulum(
+            algorithm=DiscreteActionSpaceAlgorithm.SAC,
+            config_updates={
+                "horizon": 200,
+                "soft_horizon": True,
+                "Q_model": {"fcnet_activation": "relu", "fcnet_hiddens": [256, 256]},
+                "policy_model": {
+                    "fcnet_activation": "relu",
+                    "fcnet_hiddens": [256, 256],
+                },
+                "tau": 0.005,
+                "target_entropy": "auto",
+                "no_done_at_end": True,
+                "n_step": 1,
+                "rollout_fragment_length": 1,
+                "prioritized_replay": True,
+                "train_batch_size": 256,
+                "target_network_update_freq": 1,
+                "timesteps_per_iteration": 1000,
+                "learning_starts": 256,
+                "optimization": {
+                    "actor_learning_rate": 0.0003,
+                    "critic_learning_rate": 0.0003,
+                    "entropy_learning_rate": 0.0003,
+                },
+                "num_workers": 4,
+                "num_gpus": 0,
+                "clip_actions": False,
+                "normalize_actions": True,
+                "metrics_smoothing_episodes": 5,
+            },
+            n_iter=200,
+            threshold=-750.0,
+        ),
     )
 ]
diff --git a/rllib/tests/agents/test_learning.py b/rllib/tests/agents/test_learning.py
index 99d905053240..3e55f79bf583 100644
--- a/rllib/tests/agents/test_learning.py
+++ b/rllib/tests/agents/test_learning.py
@@ -80,7 +80,6 @@ def test_monotonically_improving_algorithms_can_converge_with_different_framewor
     for i in range(n_iter):
         results = trainer.train()
         episode_reward_mean = results["episode_reward_mean"]
-        print(episode_reward_mean)
         if episode_reward_mean >= threshold:
             learnt = True
             break