Add an option to the CEM planner to keep exploring after the variance…

… has converged to a small value. Currently, as CEM finds a good solution, the variance of the samples it produces goes down toward zero. As a result, when a large external change happens, there is not enough exploration noise to find a new solution. This commit adds a new option, explore_fraction, which makes a fraction of the rollout trajectories use the initial exploration noise instead of the noise derived from CEM. Also switch the Shadow task to use CEM. PiperOrigin-RevId: 696509896 Change-Id: Ib8ad7cda058da1dc7a82c7dff3fcf33a8dcdab2f
google-deepmind · Nov 14, 2024 · dff75d8 · dff75d8
1 parent 80129e6
commit dff75d8
Show file tree

Hide file tree

Showing 3 changed files with 23 additions and 6 deletions.
diff --git a/mjpc/planners/cross_entropy/planner.cc b/mjpc/planners/cross_entropy/planner.cc
@@ -55,8 +55,11 @@ void CrossEntropyPlanner::Initialize(mjModel* model, const Task& task) {
   // sampling noise
   std_initial_ =
       GetNumberOrDefault(0.1, model,
-                         "sampling_exploration");        // initial variance
-  std_min_ = GetNumberOrDefault(0.1, model, "std_min");  // minimum variance
+                         "sampling_exploration");         // initial variance
+  std_min_ = GetNumberOrDefault(0.01, model, "std_min");  // minimum variance
+  // fraction of the trajectories that will use full exploration noise
+  explore_fraction_ =
+      GetNumberOrDefault(0.0, model, "explore_fraction");
 
   // set number of trajectories to rollout
   num_trajectory_ = GetNumberOrDefault(10, model, "sampling_trajectories");
@@ -389,14 +392,21 @@ void CrossEntropyPlanner::Rollouts(int num_trajectory, int horizon,
 
   // lock std_min
   double std_min = std_min_;
+  double std_initial = std_initial_;
 
   // random search
   int count_before = pool.GetCount();
   for (int i = 0; i < num_trajectory; i++) {
+    double std;
+    if (i < num_trajectory * explore_fraction_) {
+      std = std_initial;
+    } else {
+      std = std_min;
+    }
     pool.Schedule([&s = *this, &model = this->model, &task = this->task,
                    &state = this->state, &time = this->time,
                    &mocap = this->mocap, &userdata = this->userdata, horizon,
-                   std_min, i]() {
+                   std, i]() {
       // copy nominal policy and sample noise
       {
         const std::shared_lock<std::shared_mutex> lock(s.mtx_);
@@ -406,7 +416,7 @@ void CrossEntropyPlanner::Rollouts(int num_trajectory, int horizon,
             s.resampled_policy.plan.Interpolation());
 
         // sample noise
-        s.AddNoiseToPolicy(i, std_min);
+        s.AddNoiseToPolicy(i, std);
       }
 
       // ----- rollout sample policy ----- //
@@ -491,6 +501,7 @@ void CrossEntropyPlanner::GUI(mjUI& ui) {
       {mjITEM_SLIDERINT, "Spline Pts", 2, &policy.num_spline_points, "0 1"},
       {mjITEM_SLIDERNUM, "Init. Std", 2, &std_initial_, "0 1"},
       {mjITEM_SLIDERNUM, "Min. Std", 2, &std_min_, "0.01 0.5"},
+      {mjITEM_SLIDERNUM, "Explore", 2, &explore_fraction_, "0.0 1.0"},
       {mjITEM_SLIDERINT, "Elite", 2, &n_elite_, "2 128"},
       {mjITEM_END}};
 

diff --git a/mjpc/planners/cross_entropy/planner.h b/mjpc/planners/cross_entropy/planner.h
@@ -122,6 +122,8 @@ class CrossEntropyPlanner : public Planner {
   double std_initial_;  // standard deviation for sampling normal: N(0,
                         // std)
   double std_min_;      // the minimum allowable std
+  double explore_fraction_ = 0;  // fraction of trajectories that will use
+                                 // std_initial instead of the variance from CEM
   std::vector<double> noise;
   std::vector<double> variance;
 

diff --git a/mjpc/tasks/shadow_reorient/task.xml b/mjpc/tasks/shadow_reorient/task.xml
@@ -4,13 +4,17 @@
   <size memory="1M"/>
 
   <custom>
-    <numeric name="agent_planner" data="0" />
+    <numeric name="agent_planner" data="5" />
     <numeric name="agent_horizon" data="0.25" />
     <numeric name="agent_timestep" data="0.01" />
     <numeric name="agent_policy_width" data="0.0035" />
     <numeric name="sampling_spline_points" data="5" />
-    <numeric name="sampling_exploration" data="0.1" />
+    <numeric name="sampling_exploration" data="0.2" />
     <numeric name="sampling_representation" data="0" />
+    <numeric name="sampling_trajectories" data="60" />
+    <numeric name="n_elite" data="8" />
+    <numeric name="explore_fraction" data="0.5" />
+
     <numeric name="robust_xfrc" data="0.004" />
   </custom>