
Commit 75afd65

Better hyperparams for A2C/PPO on Pendulum
1 parent e62769b commit 75afd65

File tree

8 files changed: +773, -769 lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@
 ### Other
 - Updated docker image
 - constrained gym version: gym>=0.17,<0.20
+- Better hyperparameters for A2C/PPO on Pendulum
 
 ## Release 1.2.0 (2021-09-08)

benchmark.md

Lines changed: 1 addition & 1 deletion
@@ -77,7 +77,7 @@ and also allow users to have access to pretrained agents.*
 |ppo |LunarLanderContinuous-v2 | 270.863| 32.072|1M | 149956| 526|
 |ppo |MountainCar-v0 | -110.423| 19.473|1M | 149954| 1358|
 |ppo |MountainCarContinuous-v0 | 88.343| 2.572|20k | 149983| 633|
-|ppo |Pendulum-v0 | -169.887| 104.904|2M | 150000| 750|
+|ppo |Pendulum-v0 | -172.225| 104.159|100k | 150000| 750|
 |ppo |PongNoFrameskip-v4 | 20.989| 0.105|10M | 599902| 90|
 |ppo |QbertNoFrameskip-v4 | 15627.108| 3313.538|10M | 600248| 83|
 |ppo |ReacherBulletEnv-v0 | 17.091| 11.048|1M | 150000| 1000|

hyperparams/a2c.yml

Lines changed: 2 additions & 2 deletions
@@ -38,7 +38,7 @@ Acrobot-v1:
   policy: 'MlpPolicy'
   ent_coef: .0
 
-# Almost tuned
+# Tuned
 Pendulum-v0:
   normalize: True
   n_envs: 8
@@ -49,7 +49,7 @@ Pendulum-v0:
   n_steps: 8
   gae_lambda: 0.9
   vf_coef: 0.4
-  gamma: 0.99
+  gamma: 0.9
   use_rms_prop: True
   normalize_advantage: False
   learning_rate: lin_7e-4
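
For context, a minimal sketch (not part of this commit) of how the tuned A2C entry above would look as a direct stable-baselines3 call. It assumes the zoo's usual reading of the YAML keys: `normalize: True` wraps the vectorized env in `VecNormalize`, and `lin_7e-4` is a learning rate annealed linearly from 7e-4 to 0; the training budget below is illustrative, since `n_timesteps` is not part of this diff.

```python
# Hedged sketch: the tuned A2C Pendulum-v0 hyperparameters passed directly
# to stable-baselines3, outside the zoo's YAML-driven training scripts.
from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecNormalize


def linear_schedule(initial_value: float):
    """Anneal linearly from initial_value to 0 (assumed meaning of `lin_7e-4`)."""
    def schedule(progress_remaining: float) -> float:
        return progress_remaining * initial_value
    return schedule


# `normalize: True` is assumed to mean observation/reward normalization.
env = VecNormalize(make_vec_env("Pendulum-v0", n_envs=8))

model = A2C(
    "MlpPolicy",
    env,
    n_steps=8,
    gamma=0.9,                  # lowered from 0.99 in this commit
    gae_lambda=0.9,
    vf_coef=0.4,
    use_rms_prop=True,
    normalize_advantage=False,
    learning_rate=linear_schedule(7e-4),
    verbose=1,
)
model.learn(total_timesteps=100_000)  # budget illustrative; not shown in this diff
```

In the zoo itself these values are picked up from `hyperparams/a2c.yml` by the training script, e.g. `python train.py --algo a2c --env Pendulum-v0`.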

hyperparams/ppo.yml

Lines changed: 9 additions & 7 deletions
@@ -13,18 +13,20 @@ atari:
   vf_coef: 0.5
   ent_coef: 0.01
 
+# Tuned
 Pendulum-v0:
-  n_envs: 8
-  n_timesteps: !!float 2e6
+  n_envs: 4
+  n_timesteps: !!float 1e5
   policy: 'MlpPolicy'
-  n_steps: 2048
-  batch_size: 64
+  n_steps: 1024
   gae_lambda: 0.95
-  gamma: 0.99
+  gamma: 0.9
   n_epochs: 10
   ent_coef: 0.0
-  learning_rate: !!float 3e-4
+  learning_rate: !!float 1e-3
   clip_range: 0.2
+  use_sde: True
+  sde_sample_freq: 4
 
 # Tuned
 CartPole-v1:
@@ -524,4 +526,4 @@ Walker2d-v2:
   n_epochs: 20
   gae_lambda: 0.95
   max_grad_norm: 1
-  vf_coef: 0.871923
+  vf_coef: 0.871923
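
Similarly, a hedged sketch (again not from the commit) of the new PPO Pendulum-v0 entry as a direct stable-baselines3 call: `batch_size` is left at the library default now that the YAML no longer sets it, and generalized state-dependent exploration (gSDE) is switched on via `use_sde` / `sde_sample_freq`.

```python
# Hedged sketch of the tuned PPO Pendulum-v0 entry as a direct stable-baselines3 call.
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

env = make_vec_env("Pendulum-v0", n_envs=4)  # n_envs lowered from 8 to 4

model = PPO(
    "MlpPolicy",
    env,
    n_steps=1024,              # was 2048; batch_size now left at the library default
    gamma=0.9,                 # was 0.99
    gae_lambda=0.95,
    n_epochs=10,
    ent_coef=0.0,
    learning_rate=1e-3,        # was 3e-4
    clip_range=0.2,
    use_sde=True,              # new: generalized state-dependent exploration
    sde_sample_freq=4,         # resample the exploration noise every 4 steps
    verbose=1,
)
model.learn(total_timesteps=100_000)  # n_timesteps: !!float 1e5 (was 2e6)
```

The retuned entry trades the old 2M-step budget for 100k steps with a shorter-horizon discount (gamma 0.9), which matches the updated Pendulum-v0 rows in the benchmark tables above.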

logs/benchmark/benchmark.md

Lines changed: 1 addition & 1 deletion
@@ -77,7 +77,7 @@ and also allow users to have access to pretrained agents.*
 |ppo |LunarLanderContinuous-v2 | 270.863| 32.072|1M | 149956| 526|
 |ppo |MountainCar-v0 | -110.423| 19.473|1M | 149954| 1358|
 |ppo |MountainCarContinuous-v0 | 88.343| 2.572|20k | 149983| 633|
-|ppo |Pendulum-v0 | -169.887| 104.904|2M | 150000| 750|
+|ppo |Pendulum-v0 | -172.225| 104.159|100k | 150000| 750|
 |ppo |PongNoFrameskip-v4 | 20.989| 0.105|10M | 599902| 90|
 |ppo |QbertNoFrameskip-v4 | 15627.108| 3313.538|10M | 600248| 83|
 |ppo |ReacherBulletEnv-v0 | 17.091| 11.048|1M | 150000| 1000|
