BonsaiAI · AdityaGudimella · Aug 19, 2020 · Aug 24, 2020 · Aug 24, 2020
@@ -17,15 +17,15 @@
         "n_step": 1,
         "num_gpus": 0,
         "num_workers": 32,
-        "buffer_size": 2000000,
-        "learning_starts": 50000,
+        "buffer_size": 200000,
+        "learning_starts": 5000,
         "train_batch_size": 512,
         "rollout_fragment_length": 50,
         "target_network_update_freq": 0,
-        "timesteps_per_iteration": 25000,
+        "timesteps_per_iteration": 1000,
         "exploration_config": {"type": "PerWorkerEpsilonGreedy"},
         "worker_side_prioritization": True,
-        "min_iter_time_s": 30,
+        "min_iter_time_s": 10,
         "prioritized_replay": True,
     },
 )

@@ -474,20 +474,27 @@ def step(self):
                     except KeyError:
                         pass
                     else:
-                        is_box_action_space = all((
-                            hasattr(self.local_worker, "policy_map"),
-                            self.local_worker.policy_map.get(
-                                "default_policy", None
-                            ) is not None,
-                            isinstance(
-                                self.local_worker.policy_map[
-                                    "default_policy"
-                                ].action_space,
-                                gym.spaces.Box
+                        is_box_action_space = all(
+                            (
+                                hasattr(self.local_worker, "policy_map"),
+                                self.local_worker.policy_map.get("default_policy", None)
+                                is not None,
+                                isinstance(
+                                    self.local_worker.policy_map[
+                                        "default_policy"
+                                    ].action_space,
+                                    gym.spaces.Box,
+                                ),
                             )
-                        ))
+                        )
                         if is_box_action_space:
-                            batch["actions"] = batch["actions"].reshape((-1, 1))
+                            # Reshape to (batch_size, action_space_dim)
+                            action_space = self.local_worker.policy_map[
+                                "default_policy"
+                            ].action_space
+                            batch["actions"] = batch["actions"].reshape(
+                                (-1, action_space.shape[0])
+                            )
                     grad_out = self.local_worker.learn_on_batch(replay)
                     for pid, info in grad_out.items():
                         td_error = info.get(

@@ -272,29 +272,31 @@ def astuple(self):
         TestAgentParams.for_cart_pole(
             algorithm=ContinuousActionSpaceAlgorithm.APEX_SAC,
             config_updates={
-                "seed": 42,
                 "num_workers": 8,
-                "buffer_size": 200000,
-                "learning_starts": 6000,
-                "train_batch_size": 64,
-                "target_network_update_freq": 0,
-                "timesteps_per_iteration": 2500,
-                "min_iter_time_s": 3,
             },
-            n_iter=200,
+            n_iter=100,
             threshold=175.0,
         ),
         TestAgentParams.for_pendulum(
             algorithm=ContinuousActionSpaceAlgorithm.APEX_SAC,
             config_updates={
-                "num_workers": 4,
-                "prioritized_replay": True,
-                "timesteps_per_iteration": 100,
-                "min_iter_time_s": 1,
-                "optimizer": {"num_replay_buffer_shards": 1},
-                "learning_starts": 0,
+                "num_workers": 8,
+                "exploration_config": {"type": "StochasticSampling"},
+                "no_done_at_end": True,
             },
-            n_iter=200,
+            n_iter=100,
+            threshold=-350.0,
+        ),
+        TestAgentParams.for_frameworks(
+            algorithm=ContinuousActionSpaceAlgorithm.APEX_SAC,
+            config_updates={
+                "num_workers": 8,
+                "exploration_config": {"type": "StochasticSampling"},
+            },
+            env="MountainCarContinuous-v0",
+            # TensorFlow returns NaN mean episode reward for the first few epochs
+            n_iter=100,
+            threshold=100.0,
         ),
         TestAgentParams.for_pendulum(
             algorithm=DiscreteActionSpaceAlgorithm.SAC,