diff --git a/rllib/agents/sac/apex.py b/rllib/agents/sac/apex.py
index f60c4287431a..78618ffc0d93 100644
--- a/rllib/agents/sac/apex.py
+++ b/rllib/agents/sac/apex.py
@@ -17,15 +17,15 @@
         "n_step": 1,
         "num_gpus": 0,
         "num_workers": 32,
-        "buffer_size": 2000000,
-        "learning_starts": 50000,
+        "buffer_size": 200000,
+        "learning_starts": 5000,
         "train_batch_size": 512,
         "rollout_fragment_length": 50,
         "target_network_update_freq": 0,
-        "timesteps_per_iteration": 25000,
+        "timesteps_per_iteration": 1000,
         "exploration_config": {"type": "PerWorkerEpsilonGreedy"},
         "worker_side_prioritization": True,
-        "min_iter_time_s": 30,
+        "min_iter_time_s": 10,
         "prioritized_replay": True,
     },
 )
diff --git a/rllib/optimizers/async_replay_optimizer.py b/rllib/optimizers/async_replay_optimizer.py
index eec759d13e65..056beac37275 100644
--- a/rllib/optimizers/async_replay_optimizer.py
+++ b/rllib/optimizers/async_replay_optimizer.py
@@ -474,20 +474,27 @@ def step(self):
         except KeyError:
             pass
         else:
-            is_box_action_space = all((
-                hasattr(self.local_worker, "policy_map"),
-                self.local_worker.policy_map.get(
-                    "default_policy", None
-                ) is not None,
-                isinstance(
-                    self.local_worker.policy_map[
-                        "default_policy"
-                    ].action_space,
-                    gym.spaces.Box
+            is_box_action_space = all(
+                (
+                    hasattr(self.local_worker, "policy_map"),
+                    self.local_worker.policy_map.get("default_policy", None)
+                    is not None,
+                    isinstance(
+                        self.local_worker.policy_map[
+                            "default_policy"
+                        ].action_space,
+                        gym.spaces.Box,
+                    ),
                 )
-            ))
+            )
             if is_box_action_space:
-                batch["actions"] = batch["actions"].reshape((-1, 1))
+                # Reshape to (batch_size, action_space_dim)
+                action_space = self.local_worker.policy_map[
+                    "default_policy"
+                ].action_space
+                batch["actions"] = batch["actions"].reshape(
+                    (-1, action_space.shape[0])
+                )
             grad_out = self.local_worker.learn_on_batch(replay)
             for pid, info in grad_out.items():
                 td_error = info.get(
diff --git a/rllib/tests/agents/parameters.py b/rllib/tests/agents/parameters.py
index 1deaac960d2a..94bf9722d09e 100644
--- a/rllib/tests/agents/parameters.py
+++ b/rllib/tests/agents/parameters.py
@@ -272,29 +272,31 @@ def astuple(self):
     TestAgentParams.for_cart_pole(
         algorithm=ContinuousActionSpaceAlgorithm.APEX_SAC,
         config_updates={
-            "seed": 42,
             "num_workers": 8,
-            "buffer_size": 200000,
-            "learning_starts": 6000,
-            "train_batch_size": 64,
-            "target_network_update_freq": 0,
-            "timesteps_per_iteration": 2500,
-            "min_iter_time_s": 3,
         },
-        n_iter=200,
+        n_iter=100,
         threshold=175.0,
     ),
     TestAgentParams.for_pendulum(
         algorithm=ContinuousActionSpaceAlgorithm.APEX_SAC,
         config_updates={
-            "num_workers": 4,
-            "prioritized_replay": True,
-            "timesteps_per_iteration": 100,
-            "min_iter_time_s": 1,
-            "optimizer": {"num_replay_buffer_shards": 1},
-            "learning_starts": 0,
+            "num_workers": 8,
+            "exploration_config": {"type": "StochasticSampling"},
+            "no_done_at_end": True,
         },
-        n_iter=200,
+        n_iter=100,
+        threshold=-350.0,
+    ),
+    TestAgentParams.for_frameworks(
+        algorithm=ContinuousActionSpaceAlgorithm.APEX_SAC,
+        config_updates={
+            "num_workers": 8,
+            "exploration_config": {"type": "StochasticSampling"},
+        },
+        env="MountainCarContinuous-v0",
+        # TensorFlow returns NaN mean episode reward for the first few epochs
+        n_iter=100,
+        threshold=100.0,
     ),
     TestAgentParams.for_pendulum(
         algorithm=DiscreteActionSpaceAlgorithm.SAC,
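
The substantive change above is in async_replay_optimizer.py: replayed actions for continuous (gym.spaces.Box) action spaces are now reshaped to (batch_size, action_dim) from the default policy's action space, rather than the previous hard-coded (-1, 1), which only produced the right shape for 1-D action spaces. Below is a minimal standalone sketch of that reshape logic, assuming a gym.spaces.Box space; the helper name restore_action_shape is illustrative and not part of the patch, which performs the equivalent reshape inline on batch["actions"] in the learner step.

```python
import numpy as np
from gym.spaces import Box


def restore_action_shape(actions: np.ndarray, action_space: Box) -> np.ndarray:
    """Reshape flat replayed actions back to (batch_size, action_dim).

    Illustrative helper only, mirroring the inline reshape in the patch.
    """
    return actions.reshape((-1, action_space.shape[0]))


if __name__ == "__main__":
    # A 3-dimensional continuous action space (e.g. torque controls).
    space = Box(low=-1.0, high=1.0, shape=(3,), dtype=np.float32)
    flat_actions = np.zeros(12, dtype=np.float32)  # 4 actions, flattened
    print(restore_action_shape(flat_actions, space).shape)  # -> (4, 3)
    # The old hard-coded reshape((-1, 1)) would have produced (12, 1) here.
```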